├── src
    ├── test
    │   ├── resources
    │   │   ├── credentials.json
    │   │   └── logback-test.xml
    │   └── java
    │   │   └── io
    │   │       └── cdap
    │   │           └── plugin
    │   │               └── cdc
    │   │                   ├── integration
    │   │                       ├── StructuredRecordRepresentation.java
    │   │                       └── CDCPluginIntegrationTestBase.java
    │   │                   ├── common
    │   │                       └── BigtableOperations.java
    │   │                   ├── performance
    │   │                       ├── CDCPluginPerfTestBase.java
    │   │                       └── CDCPipelinePerfTest.java
    │   │                   ├── sink
    │   │                       └── CDCBigTableConfigUnitTest.java
    │   │                   └── source
    │   │                       └── oracle
    │   │                           └── GoldenGateKafkaConfigUnitTest.java
    └── main
    │   └── java
    │       └── io
    │           └── cdap
    │               └── plugin
    │                   └── cdc
    │                       ├── sink
    │                           ├── TypeConversionException.java
    │                           ├── CDCHBaseConfig.java
    │                           ├── CDCBigTableConfig.java
    │                           ├── CDCHBase.java
    │                           ├── CDCBigTable.java
    │                           ├── CDCKuduConfig.java
    │                           └── CDCTableUtil.java
    │                       ├── common
    │                           ├── OperationType.java
    │                           ├── DriverCleanup.java
    │                           ├── CDCReferencePluginConfig.java
    │                           ├── SparkConfigs.java
    │                           ├── JDBCDriverShim.java
    │                           ├── Schemas.java
    │                           └── DBUtils.java
    │                       ├── source
    │                           ├── oracle
    │                           │   ├── BinaryMessages.java
    │                           │   ├── GoldenGateKafkaConfig.java
    │                           │   ├── Normalizer.java
    │                           │   └── GoldenGateKafka.java
    │                           └── sqlserver
    │                           │   ├── SQLServerConnectionFactory.java
    │                           │   ├── ResultSetToDDLRecord.java
    │                           │   ├── TableInformation.java
    │                           │   ├── PluginConnectionFactory.java
    │                           │   ├── ResultSetToDMLRecord.java
    │                           │   ├── CTSQLServerConfig.java
    │                           │   └── CTSQLServer.java
    │                       └── DMLFlattener.java
├── icons
    └── CDCBigTable-sparksink.png
├── docker-compose
    └── cdc-env
    │   ├── GoldenGate
    │       ├── dirprm
    │       │   └── ext1.prm
    │       └── Dockerfile
    │   ├── Oracle
    │       ├── Dockerfile
    │       ├── init.sh
    │       ├── createDB.sh
    │       └── dbca.rsp.tmpl
    │   ├── GoldenGate-Bigdata
    │       ├── dirprm
    │       │   ├── dependencies
    │       │   │   └── kafka
    │       │   │   │   ├── lz4-java-1.4.1.jar
    │       │   │   │   ├── slf4j-api-1.7.25.jar
    │       │   │   │   ├── kafka-clients-2.1.1.jar
    │       │   │   │   └── snappy-java-1.1.7.2.jar
    │       │   ├── rconf.prm
    │       │   ├── kafka-producer.properties
    │       │   └── rconf.properties
    │       └── Dockerfile
    │   └── docker-compose.yml
├── widgets
    ├── DMLFlattener-transform.json
    ├── CDCHBase-sparksink.json
    ├── CDCBigTable-sparksink.json
    ├── CDCDatabase-streamingsource.json
    ├── CDCKudu-sparksink.json
    └── CTSQLServer-streamingsource.json
├── .gitignore
├── docs
    ├── CDCKudu-sparksink.md
    ├── CDCDatabase-streamingsource.md
    ├── CDCHBase-sparksink.md
    ├── CTSQLServer-streamingsource.md
    └── CDCBigTable-sparksink.md
├── suppressions.xml
└── README.md


/src/test/resources/credentials.json:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/icons/CDCBigTable-sparksink.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-integrations/change-data-capture/HEAD/icons/CDCBigTable-sparksink.png


--------------------------------------------------------------------------------
/docker-compose/cdc-env/GoldenGate/dirprm/ext1.prm:
--------------------------------------------------------------------------------
1 | EXTRACT ext1
2 | USERIDALIAS oggadmin
3 | EXTTRAIL /u01/app/ogg/dirdat/in
4 | TABLE trans_user.*;


--------------------------------------------------------------------------------
/docker-compose/cdc-env/Oracle/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM oracle/database:18.3.0-se2
2 | 
3 | COPY "dbca.rsp.tmpl" "createDB.sh" $ORACLE_BASE/
4 | COPY "init.sh" $ORACLE_BASE/scripts/setup/


--------------------------------------------------------------------------------
/docker-compose/cdc-env/GoldenGate-Bigdata/dirprm/dependencies/kafka/lz4-java-1.4.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-integrations/change-data-capture/HEAD/docker-compose/cdc-env/GoldenGate-Bigdata/dirprm/dependencies/kafka/lz4-java-1.4.1.jar


--------------------------------------------------------------------------------
/docker-compose/cdc-env/GoldenGate-Bigdata/dirprm/dependencies/kafka/slf4j-api-1.7.25.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-integrations/change-data-capture/HEAD/docker-compose/cdc-env/GoldenGate-Bigdata/dirprm/dependencies/kafka/slf4j-api-1.7.25.jar


--------------------------------------------------------------------------------
/docker-compose/cdc-env/GoldenGate-Bigdata/dirprm/dependencies/kafka/kafka-clients-2.1.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-integrations/change-data-capture/HEAD/docker-compose/cdc-env/GoldenGate-Bigdata/dirprm/dependencies/kafka/kafka-clients-2.1.1.jar


--------------------------------------------------------------------------------
/docker-compose/cdc-env/GoldenGate-Bigdata/dirprm/dependencies/kafka/snappy-java-1.1.7.2.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-integrations/change-data-capture/HEAD/docker-compose/cdc-env/GoldenGate-Bigdata/dirprm/dependencies/kafka/snappy-java-1.1.7.2.jar


--------------------------------------------------------------------------------
/docker-compose/cdc-env/GoldenGate-Bigdata/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM oracle/goldengate-standard:12.3.0.1.2
2 | 
3 | COPY ./dirprm /u01/app/ogg/dirprm
4 | RUN chmod -R 777 /u01/app/ogg/dirprm \
5 |   && mkdir /u01/app/ogg/dirdat && chown -R oracle:oinstall /u01/app/ogg/dirdat


--------------------------------------------------------------------------------
/docker-compose/cdc-env/GoldenGate/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM oracle/goldengate-standard:18.1.0.0.0
2 | 
3 | COPY ./dirprm /u01/app/ogg/dirprm
4 | RUN chmod -R 777 /u01/app/ogg/dirprm \
5 |   && mkdir /u01/app/ogg/dirdat && chown -R oracle:oinstall /u01/app/ogg/dirdat
6 | 


--------------------------------------------------------------------------------
/docker-compose/cdc-env/GoldenGate-Bigdata/dirprm/rconf.prm:
--------------------------------------------------------------------------------
1 | REPLICAT rconf
2 | TARGETDB LIBFILE libggjava.so SET property=dirprm/rconf.properties
3 | discardfile ./dirrpt/kafkax.dsc, purge
4 | GETTRUNCATES
5 | GETUPDATEBEFORES
6 | ReportCount Every 1000 Records, Rate
7 | MAP *.*, TARGET *.*;


--------------------------------------------------------------------------------
/widgets/DMLFlattener-transform.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "metadata": {
 3 |     "spec-version": "1.5"
 4 |   },
 5 |   "display-name": "DML Flattener",
 6 |   "configuration-groups": [
 7 |     {
 8 |       "label": "Basic",
 9 |       "properties": [ ]
10 |     }
11 |   ],
12 |   "outputs": [
13 |     {
14 |       "name": "schema",
15 |       "widget-type": "schema",
16 |       "widget-attributes": {
17 |         "schema-default-type": "string"
18 |       }
19 |     }
20 |   ]
21 | }
22 | 
23 | 


--------------------------------------------------------------------------------
/widgets/CDCHBase-sparksink.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "metadata": {
 3 |     "spec-version": "1.5"
 4 |   },
 5 |   "configuration-groups": [
 6 |     {
 7 |       "label": "HBase Sink Configuration",
 8 |       "properties": [
 9 |         {
10 |           "widget-type" : "textbox",
11 |           "label" : "Reference Name",
12 |           "name" : "referenceName",
13 |           "description" : "Reference specifies the name to be used to track this external source"
14 |         }
15 |       ]
16 |     }
17 |   ]
18 | }


--------------------------------------------------------------------------------
/docker-compose/cdc-env/Oracle/init.sh:
--------------------------------------------------------------------------------
 1 | set -ex
 2 | 
 3 | sqlplus / as sysdba << EOF
 4 |   alter system set enable_goldengate_replication=TRUE;
 5 |   alter database add supplemental log data;
 6 |   alter database force logging;
 7 |   alter system switch logfile;
 8 | 
 9 |   shutdown immediate;
10 |   Startup mount;
11 |   Alter database archivelog;
12 |   alter database open;
13 | 
14 |   CREATE USER gg_extract IDENTIFIED BY gg_extract;
15 |   GRANT CREATE SESSION, CONNECT, RESOURCE, ALTER ANY TABLE, ALTER SYSTEM, DBA, SELECT ANY TRANSACTION TO gg_extract;
16 |   CREATE USER trans_user IDENTIFIED BY trans_user;
17 |   GRANT CREATE SESSION, CONNECT, RESOURCE TO trans_user;
18 |   ALTER USER trans_user QUOTA UNLIMITED ON USERS;
19 | 
20 |   exit;
21 | EOF


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.class
 2 | .*.swp
 3 | .beamer
 4 | # Package Files #
 5 | *.jar
 6 | *.war
 7 | *.ear
 8 | 
 9 | # Intellij Files & Dir #
10 | *.iml
11 | *.ipr
12 | *.iws
13 | atlassian-ide-plugin.xml
14 | out/
15 | .DS_Store
16 | ./lib/
17 | .idea
18 | 
19 | # Gradle Files & Dir #
20 | build/
21 | .gradle/
22 | .stickyStorage
23 | .build/
24 | target/
25 | 
26 | # Node log
27 | npm-*.log
28 | logs/
29 | .nux_enabled
30 | .nux_dashboard
31 | 
32 | # Singlenode and test data files.
33 | /templates/
34 | /artifacts/
35 | /data/
36 | /data-fabric-tests/data/
37 | 
38 | # gateway test leftover
39 | /gateway/data/
40 | /watchdog/data/
41 | 
42 | # Checkstyle report
43 | examples/checkstyle_report.xml
44 | 
45 | # Examples Stuff
46 | dependency-reduced-pom.xml
47 | 
48 | # Hive db Stuff
49 | derby.log
50 | 
51 | # generated config files
52 | /cdap-web-app/conf/generated
53 | /cdap-client-tests/conf/generated
54 | 
55 | # generated by docs build
56 | *.pyc
57 | !/docker-compose/cdc-env/**/*.jar
58 | 


--------------------------------------------------------------------------------
/src/main/java/io/cdap/plugin/cdc/sink/TypeConversionException.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright © 2019 Cask Data, Inc.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 5 |  * use this file except in compliance with the License. You may obtain a copy of
 6 |  * the License at
 7 |  *
 8 |  * http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 |  * License for the specific language governing permissions and limitations under
14 |  * the License.
15 |  */
16 | 
17 | package io.cdap.plugin.cdc.sink;
18 | 
19 | /**
20 |  * Exception thrown when there is issue with type conversion from CDAP pipeline schema to Kudu.
21 |  */
22 | public class TypeConversionException extends Exception {
23 |   public TypeConversionException(String s) {
24 |     super(s);
25 |   }
26 | }
27 | 


--------------------------------------------------------------------------------
/src/main/java/io/cdap/plugin/cdc/sink/CDCHBaseConfig.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright © 2019 Cask Data, Inc.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 5 |  * use this file except in compliance with the License. You may obtain a copy of
 6 |  * the License at
 7 |  *
 8 |  * http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 |  * License for the specific language governing permissions and limitations under
14 |  * the License.
15 |  */
16 | 
17 | package io.cdap.plugin.cdc.sink;
18 | 
19 | import io.cdap.cdap.api.plugin.PluginConfig;
20 | import io.cdap.plugin.cdc.common.CDCReferencePluginConfig;
21 | 
22 | /**
23 |  * Defines the {@link PluginConfig} for the {@link CDCHBase}.
24 |  */
25 | public class CDCHBaseConfig extends CDCReferencePluginConfig {
26 |   public CDCHBaseConfig(String referenceName) {
27 |     super(referenceName);
28 |   }
29 | }
30 | 


--------------------------------------------------------------------------------
/docker-compose/cdc-env/GoldenGate-Bigdata/dirprm/kafka-producer.properties:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright © 2019 Cask Data, Inc.
 3 | #  
 4 | # Licensed under the Apache License, Version 2.0 (the "License"); you may not
 5 | # use this file except in compliance with the License. You may obtain a copy of
 6 | # the License at
 7 | #  
 8 | # http://www.apache.org/licenses/LICENSE-2.0
 9 | #  
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 | # License for the specific language governing permissions and limitations under
14 | # the License.
15 | #
16 | 
17 | bootstrap.servers=localhost:9092
18 | acks = 1
19 | compression.type = gzip
20 | reconnect.backoff.ms = 1000
21 | 
22 | value.serializer = org.apache.kafka.common.serialization.ByteArraySerializer
23 | key.serializer = org.apache.kafka.common.serialization.ByteArraySerializer
24 | # 100KB per partition
25 | batch.size = 102400
26 | linger.ms = 0
27 | max.request.size = 1048576
28 | send.buffer.bytes = 131072


--------------------------------------------------------------------------------
/src/main/java/io/cdap/plugin/cdc/common/OperationType.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright © 2019 Cask Data, Inc.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 5 |  * use this file except in compliance with the License. You may obtain a copy of
 6 |  * the License at
 7 |  *
 8 |  * http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 |  * License for the specific language governing permissions and limitations under
14 |  * the License.
15 |  */
16 | 
17 | package io.cdap.plugin.cdc.common;
18 | 
19 | /**
20 |  * Represents change operation type.
21 |  */
22 | public enum OperationType {
23 |   INSERT, UPDATE, DELETE;
24 | 
25 |   public static OperationType fromShortName(String name) {
26 |     switch (name.toUpperCase()) {
27 |       case "I":
28 |         return INSERT;
29 |       case "U":
30 |         return UPDATE;
31 |       case "D":
32 |         return DELETE;
33 |       default:
34 |         throw new IllegalArgumentException(String.format("Unknown change operation '%s'", name));
35 |     }
36 |   }
37 | }
38 | 


--------------------------------------------------------------------------------
/docs/CDCKudu-sparksink.md:
--------------------------------------------------------------------------------
 1 | # CDC SQL Server Streaming Source
 2 | 
 3 | Description
 4 | -----------
 5 | This plugin takes input from a CDC source and writes the changes to Kudu.
 6 | 
 7 | All CDC sink plugins are normally used in conjunction with CDC source plugins. 
 8 | CDC sink expects messages in CDC format as an input.  
 9 | 
10 | Properties
11 | ----------
12 | **Reference Name**: Name used to uniquely identify this source for lineage, annotating metadata, etc.
13 | 
14 | **Master Addresses**: Comma separated list of hostname:port of Apache Kudu Masters.
15 | 
16 | **Replicas**: Specifies the number of buckets to split the table into.
17 | 
18 | **Seed**: Seed to randomize the mapping of rows to hash buckets.
19 | 
20 | **Compression Algorithm**: Compression algorithm to be applied on the columns.
21 | 
22 | **Encoding Type**: Specifies the encoding to be applied on the schema.
23 | 
24 | **User Operations Timeout**: Timeout for Kudu operations in milliseconds.
25 | 
26 | **Administration Operations Timeout**: Administration operation timeout.
27 | 
28 | **Number of copies**: Specifies the number of replicas for the Kudu tables.
29 | 
30 | **Rows Buffer**: Number of rows that are buffered before flushing to the tablet server.
31 | 
32 | **Boss Threads**: Specifies the number of boss threads to be used by the client.


--------------------------------------------------------------------------------
/src/main/java/io/cdap/plugin/cdc/source/oracle/BinaryMessages.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright © 2019 Cask Data, Inc.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 5 |  * use this file except in compliance with the License. You may obtain a copy of
 6 |  * the License at
 7 |  *
 8 |  * http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 |  * License for the specific language governing permissions and limitations under
14 |  * the License.
15 |  */
16 | 
17 | package io.cdap.plugin.cdc.source.oracle;
18 | 
19 | import io.cdap.cdap.api.common.Bytes;
20 | 
21 | import java.nio.ByteBuffer;
22 | import javax.annotation.Nonnull;
23 | 
24 | /**
25 |  * Utility methods for dealing with binary messages.
26 |  */
27 | public class BinaryMessages {
28 |   private BinaryMessages() {
29 |     // utility class
30 |   }
31 | 
32 |   @Nonnull
33 |   static byte[] getBytesFromBinaryMessage(Object message) {
34 |     if (message instanceof ByteBuffer) {
35 |       ByteBuffer bb = (ByteBuffer) message;
36 |       return Bytes.toBytes(bb);
37 |     } else {
38 |       return (byte[]) message;
39 |     }
40 |   }
41 | }
42 | 


--------------------------------------------------------------------------------
/docker-compose/cdc-env/GoldenGate-Bigdata/dirprm/rconf.properties:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright © 2019 Cask Data, Inc.
 3 | #  
 4 | # Licensed under the Apache License, Version 2.0 (the "License"); you may not
 5 | # use this file except in compliance with the License. You may obtain a copy of
 6 | # the License at
 7 | #  
 8 | # http://www.apache.org/licenses/LICENSE-2.0
 9 | #  
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 | # License for the specific language governing permissions and limitations under
14 | # the License.
15 | #
16 | 
17 | gg.handlerlist = kafkahandler
18 | 
19 | # Kafka handler properties
20 | gg.handler.kafkahandler.type = kafka
21 | gg.handler.kafkahandler.KafkaProducerConfigFile = kafka-producer.properties
22 | gg.handler.kafkahandler.topicMappingTemplate=oggtopic
23 | gg.handler.kafkahandler.keyMappingTemplate=${position}
24 | gg.handler.kafkahandler.format = avro_op
25 | gg.handler.kafkahandler.schemaTopicName=oggtopic
26 | gg.handler.kafkahandler.format.wrapMessageInGenericAvroMessage=true
27 | gg.handler.kafkahandler.mode = op
28 | gg.handler.kafkahandler.BlockingSend = true
29 | 
30 | # Logging settings
31 | gg.log=log4j
32 | gg.log.level=INFO
33 | gg.report.time=30sec
34 | 
35 | #Set the classpath here
36 | gg.classpath=dirprm/dependencies/kafka/*


--------------------------------------------------------------------------------
/widgets/CDCBigTable-sparksink.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "metadata": {
 3 |     "spec-version": "1.5"
 4 |   },
 5 |   "display-name": "CDC Google Cloud Bigtable Sink",
 6 |   "configuration-groups": [
 7 |     {
 8 |       "label": "Cloud Bigtable Sink Configuration",
 9 |       "properties": [
10 |         {
11 |           "widget-type": "textbox",
12 |           "label": "Reference Name",
13 |           "name": "referenceName",
14 |           "description": "Reference specifies the name to be used to track this external source"
15 |         },
16 |         {
17 |           "widget-type": "textbox",
18 |           "label": "Instance Id",
19 |           "name": "instance",
20 |           "description": "The Instance Id the Cloud Bigtable is in."
21 |         },
22 |         {
23 |           "widget-type": "textbox",
24 |           "label": "Project Id",
25 |           "name": "project",
26 |           "description": "The Project Id the Cloud Bigtable table is in.",
27 |           "widget-attributes": {
28 |             "default": "auto-detect"
29 |           }
30 |         },
31 |         {
32 |           "widget-type": "textbox",
33 |           "label": "Service Account File Path",
34 |           "name": "serviceFilePath",
35 |           "description": "Path to service account file (local to host running on).",
36 |           "widget-attributes": {
37 |             "default": "auto-detect"
38 |           }
39 |         }
40 |       ]
41 |     }
42 |   ]
43 | }
44 | 
45 | 


--------------------------------------------------------------------------------
/src/test/java/io/cdap/plugin/cdc/integration/StructuredRecordRepresentation.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright © 2019 Cask Data, Inc.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 5 |  * use this file except in compliance with the License. You may obtain a copy of
 6 |  * the License at
 7 |  *
 8 |  * http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 |  * License for the specific language governing permissions and limitations under
14 |  * the License.
15 |  */
16 | 
17 | package io.cdap.plugin.cdc.integration;
18 | 
19 | import io.cdap.cdap.api.data.format.StructuredRecord;
20 | import io.cdap.cdap.format.StructuredRecordStringConverter;
21 | import org.assertj.core.presentation.StandardRepresentation;
22 | 
23 | import java.io.IOException;
24 | 
25 | public class StructuredRecordRepresentation extends StandardRepresentation {
26 |   @Override
27 |   public String toStringOf(Object object) {
28 |     try {
29 |       if (object instanceof StructuredRecord) {
30 |         return StructuredRecordStringConverter.toJsonString((StructuredRecord) object);
31 |       }
32 |       return super.toStringOf(object);
33 |     } catch (IOException e) {
34 |       throw new IllegalArgumentException(e);
35 |     }
36 |   }
37 | }
38 | 


--------------------------------------------------------------------------------
/src/main/java/io/cdap/plugin/cdc/common/DriverCleanup.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright © 2019 Cask Data, Inc.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 5 |  * use this file except in compliance with the License. You may obtain a copy of
 6 |  * the License at
 7 |  *
 8 |  * http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 |  * License for the specific language governing permissions and limitations under
14 |  * the License.
15 |  */
16 | package io.cdap.plugin.cdc.common;
17 | 
18 | import com.google.common.base.Throwables;
19 | import io.cdap.cdap.etl.api.Destroyable;
20 | 
21 | import java.sql.DriverManager;
22 | import java.sql.SQLException;
23 | import javax.annotation.Nullable;
24 | 
25 | /**
26 |  * class to de-register driver
27 |  */
28 | public class DriverCleanup implements Destroyable {
29 |   private final JDBCDriverShim driverShim;
30 | 
31 |   DriverCleanup(@Nullable JDBCDriverShim driverShim) {
32 |     this.driverShim = driverShim;
33 |   }
34 | 
35 |   public void destroy() {
36 |     if (driverShim != null) {
37 |       try {
38 |         DriverManager.deregisterDriver(driverShim);
39 |       } catch (SQLException e) {
40 |         throw Throwables.propagate(e);
41 |       }
42 |     }
43 |   }
44 | }
45 | 
46 | 


--------------------------------------------------------------------------------
/widgets/CDCDatabase-streamingsource.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "metadata": {
 3 |     "spec-version": "1.5"
 4 |   },
 5 |   "configuration-groups": [
 6 |     {
 7 |       "label": "Kafka Configuration",
 8 |       "properties": [
 9 |         {
10 |           "widget-type": "textbox",
11 |           "label": "Reference Name",
12 |           "name": "referenceName",
13 |           "description": "Reference specifies the name to be used to track this external source"
14 |         },
15 |         {
16 |           "widget-type": "textbox",
17 |           "label": "Kafka Broker",
18 |           "name": "broker"
19 |         },
20 |         {
21 |           "widget-type": "textbox",
22 |           "label": "Kafka Topic",
23 |           "name": "topic"
24 |         },
25 |         {
26 |           "widget-type": "textbox",
27 |           "label": "Default Initial Offset",
28 |           "name": "defaultInitialOffset"
29 |         },
30 |         {
31 |           "widget-type": "textbox",
32 |           "label": "Max Rate Per Partition",
33 |           "name": "maxRatePerPartition",
34 |           "widget-attributes": {
35 |             "default": "1000"
36 |           }
37 |         }
38 |       ]
39 |     }
40 |   ],
41 |   "outputs": [
42 |     {
43 |       "widget-type": "non-editable-schema-editor",
44 |       "schema": {
45 |         "name": "CDCRecord",
46 |         "type": "record",
47 |         "fields": [
48 |           {
49 |             "name": "cdcMessage",
50 |             "type": "bytes"
51 |           }
52 |         ]
53 |       }
54 |     }
55 |   ]
56 | }
57 | 


--------------------------------------------------------------------------------
/src/main/java/io/cdap/plugin/cdc/common/CDCReferencePluginConfig.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright © 2019 Cask Data, Inc.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 5 |  * use this file except in compliance with the License. You may obtain a copy of
 6 |  * the License at
 7 |  *
 8 |  * http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 |  * License for the specific language governing permissions and limitations under
14 |  * the License.
15 |  */
16 | 
17 | package io.cdap.plugin.cdc.common;
18 | 
19 | import io.cdap.cdap.etl.api.validation.InvalidConfigPropertyException;
20 | import io.cdap.plugin.common.Constants;
21 | import io.cdap.plugin.common.IdUtils;
22 | import io.cdap.plugin.common.ReferencePluginConfig;
23 | 
24 | /**
25 |  * {@link ReferencePluginConfig} with reference name validation.
26 |  */
27 | public class CDCReferencePluginConfig extends ReferencePluginConfig {
28 |   public CDCReferencePluginConfig(String referenceName) {
29 |     super(referenceName);
30 |   }
31 | 
32 |   public void validate() {
33 |     if (!containsMacro(Constants.Reference.REFERENCE_NAME)) {
34 |       try {
35 |         IdUtils.validateId(referenceName);
36 |       } catch (IllegalArgumentException e) {
37 |         throw new InvalidConfigPropertyException(e.getMessage(), Constants.Reference.REFERENCE_NAME);
38 |       }
39 |     }
40 |   }
41 | }
42 | 


--------------------------------------------------------------------------------
/docker-compose/cdc-env/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | version: '3'
 2 | services:
 3 |   mssql:
 4 |     image: mcr.microsoft.com/mssql/server:2017-latest
 5 |     ports:
 6 |       - 1433:1433
 7 |     environment:
 8 |       - SA_PASSWORD=123Qwe123
 9 |       - ACCEPT_EULA=Y
10 | 
11 |   zookeeper:
12 |     image: zookeeper:3.4.13
13 |     network_mode: host
14 |     ports:
15 |       - "2181:2181"
16 |     environment:
17 |       - ZOOKEEPER_TICK_TIME=2000
18 | 
19 |   kafka:
20 |     image: wurstmeister/kafka:2.12-2.2.0
21 |     network_mode: host
22 |     depends_on:
23 |       - zookeeper
24 |     environment:
25 |       - JVM_OPTS=-Xmx2g -XX:+UseG1GC
26 |       - KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS=0
27 |       - KAFKA_UNCLEAN_LEADER_ELECTION_ENABLE=true
28 |       - KAFKA_ADVERTISED_LISTENERS=PLAINTEXT://localhost:9092
29 |       - KAFKA_LISTENERS=PLAINTEXT://0.0.0.0:9092
30 |       - KAFKA_ZOOKEEPER_CONNECT=localhost:2181
31 |       - KAFKA_CREATE_TOPICS=oggtopic:1:1
32 | 
33 |   oracledb:
34 |     build: Oracle
35 |     ports:
36 |       - 1521:1521
37 |       - 8080:8080
38 |       - 5500:5500
39 |     environment:
40 |       - ORACLE_SID=XE
41 |       - ORACLE_PWD=123Qwe123
42 |     volumes:
43 |       - "oracleData:/opt/oracle/oradata"
44 | 
45 |   goldengate_oracle:
46 |     build: GoldenGate
47 |     cap_add:
48 |       - SYS_RESOURCE
49 |     volumes:
50 |       - "oracleData:/opt/oracle/oradata"
51 |       - "goldengateDirdat:/u01/app/ogg/dirdat"
52 | 
53 |   goldengate_bigdata:
54 |     build: GoldenGate-Bigdata
55 |     network_mode: host
56 |     volumes:
57 |       - "goldengateDirdat:/u01/app/ogg/dirdat"
58 | 
59 | 
60 | volumes:
61 |   oracleData:
62 |   goldengateDirdat:
63 | 


--------------------------------------------------------------------------------
/src/main/java/io/cdap/plugin/cdc/common/SparkConfigs.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright © 2019 Cask Data, Inc.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 5 |  * use this file except in compliance with the License. You may obtain a copy of
 6 |  * the License at
 7 |  *
 8 |  * http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 |  * License for the specific language governing permissions and limitations under
14 |  * the License.
15 |  */
16 | 
17 | package io.cdap.plugin.cdc.common;
18 | 
19 | import org.apache.spark.api.java.JavaRDD;
20 | 
21 | import java.util.HashMap;
22 | import java.util.Iterator;
23 | import java.util.Map;
24 | 
25 | /**
26 |  * Utility methods for dealing with Spark configuration and data.
27 |  */
28 | public class SparkConfigs {
29 |   private SparkConfigs() {
30 |     // utility class
31 |   }
32 | 
33 |   /**
34 |    * Get the hadoop configurations and passed it as a Map to the closure
35 |    *
36 |    * @param javaRDD Spark RDD object
37 |    * @return configuration Map
38 |    */
39 |   public static Map<String, String> getHadoopConfigs(JavaRDD<?> javaRDD) {
40 |     Iterator<Map.Entry<String, String>> iterator = javaRDD.context().hadoopConfiguration().iterator();
41 |     Map<String, String> configs = new HashMap<>();
42 |     while (iterator.hasNext()) {
43 |       Map.Entry<String, String> next = iterator.next();
44 |       configs.put(next.getKey(), next.getValue());
45 |     }
46 |     return configs;
47 |   }
48 | }
49 | 


--------------------------------------------------------------------------------
/src/test/resources/logback-test.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | 
 3 | <!--
 4 |   ~ Copyright © 2019 Cask Data, Inc.
 5 |   ~
 6 |   ~ Licensed under the Apache License, Version 2.0 (the "License"); you may not
 7 |   ~ use this file except in compliance with the License. You may obtain a copy of
 8 |   ~ the License at
 9 |   ~
10 |   ~ http://www.apache.org/licenses/LICENSE-2.0
11 |   ~
12 |   ~ Unless required by applicable law or agreed to in writing, software
13 |   ~ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
14 |   ~ WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
15 |   ~ License for the specific language governing permissions and limitations under
16 |   ~ the License.
17 |   -->
18 | 
19 | <configuration>
20 | 
21 |     <!--Suppressing some chatty loggers -->
22 |     <logger name="com.ning" level="WARN"/>
23 |     <logger name="io.netty.util.internal" level="OFF"/>
24 |     <logger name="org.apache.commons.beanutils" level="ERROR"/>
25 |     <logger name="org.apache.directory" level="WARN"/>
26 |     <logger name="org.apache.spark" level="WARN"/>
27 |     <logger name="org.apache.hadoop" level="WARN"/>
28 |     <logger name="org.apache.http" level="ERROR"/>
29 |     <logger name="org.apache.hive" level="WARN"/>
30 |     <logger name="org.apache.zookeeper" level="ERROR"/>
31 |     <logger name="org.mortbay.log" level="WARN"/>
32 |     <logger name="org.spark-project" level="WARN"/>
33 |     <logger name="org.quartz.core" level="WARN"/>
34 | 
35 |     <appender name="Console" class="ch.qos.logback.core.ConsoleAppender">
36 |         <encoder>
37 |             <pattern>%d{ISO8601} - %-5p [%t:%C{1}@%L] - %m%n</pattern>
38 |         </encoder>
39 |     </appender>
40 | 
41 |     <root level="INFO">
42 |         <appender-ref ref="Console"/>
43 |     </root>
44 | 
45 | </configuration>
46 | 


--------------------------------------------------------------------------------
/src/main/java/io/cdap/plugin/cdc/source/sqlserver/SQLServerConnectionFactory.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright © 2019 Cask Data, Inc.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 5 |  * use this file except in compliance with the License. You may obtain a copy of
 6 |  * the License at
 7 |  *
 8 |  * http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 |  * License for the specific language governing permissions and limitations under
14 |  * the License.
15 |  */
16 | 
17 | package io.cdap.plugin.cdc.source.sqlserver;
18 | 
19 | import com.microsoft.sqlserver.jdbc.SQLServerDriver;
20 | import org.apache.spark.rdd.JdbcRDD;
21 | 
22 | import java.sql.Connection;
23 | import java.sql.DriverManager;
24 | 
25 | /**
26 |  * A class which can provide a {@link Connection} using {@link SQLServerDriver} which is
27 |  * serializable.
28 |  * Note: This class does not do any connection management. Its the responsibility of the client
29 |  * to manage/close the connection.
30 |  */
31 | class SQLServerConnectionFactory implements JdbcRDD.ConnectionFactory {
32 |   private final String connectionUrl;
33 |   private final String userName;
34 |   private final String password;
35 | 
36 |   SQLServerConnectionFactory(String connectionUrl, String userName, String password) {
37 |     this.connectionUrl = connectionUrl;
38 |     this.userName = userName;
39 |     this.password = password;
40 |   }
41 | 
42 |   @Override
43 |   public Connection getConnection() throws Exception {
44 |     Class.forName(SQLServerDriver.class.getName());
45 |     return DriverManager.getConnection(connectionUrl, userName, password);
46 |   }
47 | }
48 | 


--------------------------------------------------------------------------------
/suppressions.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0"?>
 2 | 
 3 | <!--
 4 |   Copyright © 2014-2015 Cask Data, Inc.
 5 | 
 6 |   Licensed under the Apache License, Version 2.0 (the "License"); you may not
 7 |   use this file except in compliance with the License. You may obtain a copy of
 8 |   the License at
 9 | 
10 |   http://www.apache.org/licenses/LICENSE-2.0
11 | 
12 |   Unless required by applicable law or agreed to in writing, software
13 |   distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
14 |   WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
15 |   License for the specific language governing permissions and limitations under
16 |   the License.
17 |   -->
18 | 
19 | <!DOCTYPE suppressions PUBLIC
20 |         "-//Puppy Crawl//DTD Suppressions 1.1//EN"
21 |         "http://www.puppycrawl.com/dtds/suppressions_1_1.dtd">
22 | 
23 | <suppressions>
24 | 
25 |   <suppress checks="Javadoc.*" files=".*[/\\]src[/\\]test[/\\]java[/\\].*" />
26 | 
27 |   <suppress checks="JavadocPackage" files=".*[/\\]src[/\\](main|integration)[/\\]java[/\\].*" />
28 |   <suppress checks="JavadocPackage" files=".*[/\\]src[/\\].*[/\\]internal[/\\].*" />
29 | 
30 |   <suppress checks="JavadocStyle" files=".*[/\\]src[/\\](main|integration)[/\\]java[/\\].*" />
31 |   <suppress checks="JavadocStyle" files=".*[/\\]src[/\\].*[/\\]internal[/\\].*" />
32 | 
33 |   <suppress checks="RedundantModifier" files=".*[/\\]src[/\\]test[/\\]java[/\\].*" />
34 | 
35 | 
36 |   <!-- copied from apache hadoop, won't fix style to keep diff minimal -->
37 |   <suppress checks=".*" files=".*[/\\]LocalJobRunnerWithFix.java" />
38 |   <suppress checks=".*" files=".*[/\\]org[/\\]apache[/\\]hadoop[/\\]util[/\\]Shell.java" />
39 | 
40 |   <!-- do not check thrift generated files -->
41 |   <suppress checks=".*" files=".*[/\\]transaction[/\\]distributed[/\\]thrift[/\\].*" />
42 | 
43 |   <suppress checks=".*" files=".*[/\\]src[/\\](main)[/\\](thrift)[/\\].*" />
44 |   <suppress checks=".*" files=".*[/\\]src[/\\](main)[/\\](java)[/\\](co|org)[/\\](cask|apache)[/\\](tephra|thrift)[/\\].*" />
45 | 
46 | </suppressions>
47 | 


--------------------------------------------------------------------------------
/src/main/java/io/cdap/plugin/cdc/source/sqlserver/ResultSetToDDLRecord.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright © 2019 Cask Data, Inc.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 5 |  * use this file except in compliance with the License. You may obtain a copy of
 6 |  * the License at
 7 |  *
 8 |  * http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 |  * License for the specific language governing permissions and limitations under
14 |  * the License.
15 |  */
16 | 
17 | package io.cdap.plugin.cdc.source.sqlserver;
18 | 
19 | import com.google.common.base.Joiner;
20 | import io.cdap.cdap.api.data.format.StructuredRecord;
21 | import io.cdap.cdap.api.data.schema.Schema;
22 | import io.cdap.plugin.cdc.common.DBUtils;
23 | import io.cdap.plugin.cdc.common.Schemas;
24 | import org.apache.spark.api.java.function.Function;
25 | 
26 | import java.sql.ResultSet;
27 | import java.sql.SQLException;
28 | 
29 | /**
30 |  * A serializable class to allow invoking {@link scala.Function1} from Java. The function converts {@link ResultSet}
31 |  * to {@link StructuredRecord} for DDL i.e. schema changes
32 |  */
33 | public class ResultSetToDDLRecord implements Function<ResultSet, StructuredRecord> {
34 | 
35 |   private final String schemaName;
36 |   private final String tableName;
37 | 
38 |   ResultSetToDDLRecord(String schemaName, String tableName) {
39 |     this.schemaName = schemaName;
40 |     this.tableName = tableName;
41 |   }
42 | 
43 |   @Override
44 |   public StructuredRecord call(ResultSet row) throws SQLException {
45 |     Schema tableSchema = Schema.recordOf(Schemas.SCHEMA_RECORD, DBUtils.getSchemaFields(row));
46 |     return StructuredRecord.builder(Schemas.DDL_SCHEMA)
47 |       .set(Schemas.TABLE_FIELD, Joiner.on(".").join(schemaName, tableName))
48 |       .set(Schemas.SCHEMA_FIELD, tableSchema.toString())
49 |       .build();
50 |   }
51 | }
52 | 


--------------------------------------------------------------------------------
/src/test/java/io/cdap/plugin/cdc/common/BigtableOperations.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright © 2019 Cask Data, Inc.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 5 |  * use this file except in compliance with the License. You may obtain a copy of
 6 |  * the License at
 7 |  *
 8 |  * http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 |  * License for the specific language governing permissions and limitations under
14 |  * the License.
15 |  */
16 | 
17 | package io.cdap.plugin.cdc.common;
18 | 
19 | import com.google.cloud.bigtable.hbase.BigtableConfiguration;
20 | import com.google.cloud.bigtable.hbase.BigtableOptionsFactory;
21 | import org.apache.hadoop.conf.Configuration;
22 | import org.apache.hadoop.hbase.TableName;
23 | import org.apache.hadoop.hbase.client.Connection;
24 | 
25 | import java.io.IOException;
26 | import javax.annotation.Nullable;
27 | 
28 | /**
29 |  * Utility methods for common Bigtable operations.
30 |  */
31 | public class BigtableOperations {
32 |   private BigtableOperations() {
33 |     // utility class
34 |   }
35 | 
36 |   public static Connection connect(String projectId, String instanceId, @Nullable String serviceAccountFilepath) {
37 |     Configuration configuration = BigtableConfiguration.configure(projectId, instanceId);
38 |     if (serviceAccountFilepath != null) {
39 |       configuration.set(BigtableOptionsFactory.BIGTABLE_SERVICE_ACCOUNT_JSON_KEYFILE_LOCATION_KEY,
40 |                         serviceAccountFilepath);
41 |     }
42 |     return BigtableConfiguration.connect(configuration);
43 |   }
44 | 
45 |   public static void dropTableIfExists(Connection connection, String dbTableName) throws IOException {
46 |     TableName tableName = TableName.valueOf(dbTableName);
47 |     if (connection.getAdmin().tableExists(tableName)) {
48 |       connection.getAdmin().disableTable(tableName);
49 |       connection.getAdmin().deleteTable(tableName);
50 |     }
51 |   }
52 | }
53 | 


--------------------------------------------------------------------------------
/docs/CDCDatabase-streamingsource.md:
--------------------------------------------------------------------------------
 1 | # CDC Golden Gate Kafka Streaming Source
 2 | 
 3 | Description
 4 | -----------
 5 | This plugin reads Change Data Capture (CDC) events from a Golden Gate Kafka topic.
 6 | 
 7 | All CDC source plugins are normally used in conjunction with CDC sink plugins. 
 8 | CDC source produces messages in CDC format.  
 9 | 
10 | Useful links:
11 | * [Goldengate site](https://www.oracle.com/middleware/technologies/goldengate.html)
12 | * [Installing Oracle GoldenGate](https://docs.oracle.com/goldengate/1212/gg-winux/GIORA/install.htm#GIORA162).
13 | * [Using Oracle GoldenGate for Oracle Database](https://www.oracle.com/pls/topic/lookup?ctx=en/middleware/goldengate/core/18.1&id=GGODB-GUID-110CD372-2F7E-4262-B8D2-DC0A80422806).
14 | * [Using Oracle GoldenGate for BigData](https://docs.oracle.com/goldengate/bd123210/gg-bd/GADBD/introduction-oracle-goldengate-big-data.htm#GADBD114).
15 | 
16 | Properties
17 | ----------
18 | **Reference Name**: Name used to uniquely identify this source for lineage, annotating metadata, etc.
19 | 
20 | **Kafka Broker**: Kafka broker specified in host:port form. For example, example.com:9092.
21 | 
22 | **Kafka Topic**: Name of the topic to which Golden Gate publishes the DDL and DML changes.
23 | 
24 | **Default Initial Offset**: The default initial offset to read from. 
25 | An offset of -2 means the smallest offset (the beginning of the topic). 
26 | An offset of -1 means the latest offset (the end of the topic). 
27 | Defaults to -1. Offsets are inclusive. 
28 | If an offset of 5 is used, the message at offset 5 will be read.
29 | 
30 | **Max Rate Per Partition**: Max number of records to read per second per partition. 0 means there is no limit.
31 |  Defaults to 1000.
32 | 
33 | Required GoldenGate Settings
34 | ----------
35 | * GoldenGate should push data using Kafka handler
36 | * Generic Wrapper Functionality should be enabled ("gg.handler.kafkahandler.format.wrapMessageInGenericAvroMessage"). 
37 | * Schema topic ("gg.handler.kafkahandler.schemaTopicName") should be equal to DML changes topic. 
38 | * Handler should send events in "OP" mode ("gg.handler.kafkahandler.mode"). 
39 | * Handler should send events in "avro_op" format ("gg.handler.kafkahandler.format"). 


--------------------------------------------------------------------------------
/src/main/java/io/cdap/plugin/cdc/source/sqlserver/TableInformation.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright © 2019 Cask Data, Inc.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 5 |  * use this file except in compliance with the License. You may obtain a copy of
 6 |  * the License at
 7 |  *
 8 |  * http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 |  * License for the specific language governing permissions and limitations under
14 |  * the License.
15 |  */
16 | 
17 | package io.cdap.plugin.cdc.source.sqlserver;
18 | 
19 | import com.google.common.collect.ImmutableSet;
20 | import com.google.common.collect.Sets;
21 | import scala.Serializable;
22 | 
23 | import java.util.Set;
24 | 
25 | /**
26 |  * Represents SQL Server Table information
27 |  */
28 | class TableInformation implements Serializable {
29 |   private final String schemaName;
30 |   private final String name;
31 |   private final Set<String> columnSchema;
32 |   private final Set<String> primaryKeys;
33 |   private final Set<String> valueColumns;
34 | 
35 |   TableInformation(String schemaName, String name, Set<String> columnSchema, Set<String> primaryKeys) {
36 |     this.schemaName = schemaName;
37 |     this.name = name;
38 |     this.columnSchema = columnSchema;
39 |     this.primaryKeys = primaryKeys;
40 |     this.valueColumns = ImmutableSet.copyOf(Sets.difference(columnSchema, primaryKeys));
41 |   }
42 | 
43 |   @Override
44 |   public String toString() {
45 |     return "TableInformation{" +
46 |       "schemaName='" + schemaName + '\'' +
47 |       ", name='" + name + '\'' +
48 |       ", columnSchema=" + columnSchema +
49 |       ", primaryKeys=" + primaryKeys +
50 |       ", valueColumns=" + valueColumns +
51 |       '}';
52 |   }
53 | 
54 |   String getSchemaName() {
55 |     return schemaName;
56 |   }
57 | 
58 |   String getName() {
59 |     return name;
60 |   }
61 | 
62 |   Set<String> getColumnSchema() {
63 |     return columnSchema;
64 |   }
65 | 
66 |   Set<String> getPrimaryKeys() {
67 |     return primaryKeys;
68 |   }
69 | 
70 |   Set<String> getValueColumnNames() {
71 |     return valueColumns;
72 |   }
73 | 
74 | }
75 | 


--------------------------------------------------------------------------------
/src/main/java/io/cdap/plugin/cdc/source/sqlserver/PluginConnectionFactory.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright © 2019 Cask Data, Inc.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 5 |  * use this file except in compliance with the License. You may obtain a copy of
 6 |  * the License at
 7 |  *
 8 |  * http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 |  * License for the specific language governing permissions and limitations under
14 |  * the License.
15 |  */
16 | 
17 | 
18 | package io.cdap.plugin.cdc.source.sqlserver;
19 | 
20 | import io.cdap.cdap.api.plugin.PluginContext;
21 | import io.cdap.plugin.cdc.common.DBUtils;
22 | import org.apache.spark.rdd.JdbcRDD;
23 | 
24 | import java.io.Serializable;
25 | import java.sql.Connection;
26 | import java.sql.Driver;
27 | import java.sql.DriverManager;
28 | import java.util.Map;
29 | 
30 | /**
31 |  * Serializable jdbc connection factory that uses CDAP plugin context to instantiate the jdbc driver.
32 |  */
33 | public class PluginConnectionFactory implements JdbcRDD.ConnectionFactory, Serializable {
34 |   private static final long serialVersionUID = -7897960584858589314L;
35 |   private final String stageName;
36 |   private final String connectionString;
37 |   private final PluginContext pluginContext;
38 |   private transient String user;
39 |   private transient String password;
40 |   private transient boolean initialized;
41 | 
42 |   PluginConnectionFactory(PluginContext pluginContext, String stageName, String connectionString) {
43 |     this.stageName = stageName;
44 |     this.connectionString = connectionString;
45 |     this.pluginContext = pluginContext;
46 |   }
47 | 
48 |   @Override
49 |   public Connection getConnection() throws Exception {
50 |     if (!initialized) {
51 |       Class<? extends Driver> driverClass = pluginContext.loadPluginClass(stageName + ":" + CTSQLServer.JDBC_PLUGIN_ID);
52 |       DBUtils.ensureJDBCDriverIsAvailable(driverClass, connectionString);
53 |       Map<String, String> stageProperties = pluginContext.getPluginProperties(stageName).getProperties();
54 |       user = stageProperties.get(CTSQLServerConfig.USERNAME);
55 |       password = stageProperties.get(CTSQLServerConfig.PASSWORD);
56 |       initialized = true;
57 |     }
58 |     return DriverManager.getConnection(connectionString, user, password);
59 |   }
60 | 
61 |   private void initialize() {
62 | 
63 |   }
64 | }
65 | 


--------------------------------------------------------------------------------
/src/test/java/io/cdap/plugin/cdc/performance/CDCPluginPerfTestBase.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright © 2019 Cask Data, Inc.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 5 |  * use this file except in compliance with the License. You may obtain a copy of
 6 |  * the License at
 7 |  *
 8 |  * http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 |  * License for the specific language governing permissions and limitations under
14 |  * the License.
15 |  */
16 | 
17 | package io.cdap.plugin.cdc.performance;
18 | 
19 | import io.cdap.cdap.api.artifact.ArtifactScope;
20 | import io.cdap.cdap.api.artifact.ArtifactSummary;
21 | import io.cdap.cdap.common.UnauthenticatedException;
22 | import io.cdap.cdap.datastreams.DataStreamsSparkLauncher;
23 | import io.cdap.cdap.etl.proto.v2.DataStreamsConfig;
24 | import io.cdap.cdap.etl.proto.v2.ETLPlugin;
25 | import io.cdap.cdap.etl.proto.v2.ETLStage;
26 | import io.cdap.cdap.proto.artifact.AppRequest;
27 | import io.cdap.cdap.proto.id.ApplicationId;
28 | import io.cdap.cdap.proto.id.NamespaceId;
29 | import io.cdap.cdap.test.ApplicationManager;
30 | import io.cdap.cdap.test.IntegrationTestBase;
31 | import io.cdap.cdap.test.SparkManager;
32 | 
33 | import java.io.IOException;
34 | 
35 | public abstract class CDCPluginPerfTestBase extends IntegrationTestBase {
36 |   protected SparkManager deployETL(ETLPlugin sourcePlugin, ETLPlugin sinkPlugin, String appName) throws Exception {
37 |     ETLStage source = new ETLStage("source", sourcePlugin);
38 |     ETLStage sink = new ETLStage("sink", sinkPlugin);
39 |     DataStreamsConfig etlConfig = DataStreamsConfig.builder()
40 |       .addStage(source)
41 |       .addStage(sink)
42 |       .addConnection(source.getName(), sink.getName())
43 |       .setBatchInterval("1s")
44 |       .build();
45 | 
46 |     AppRequest<DataStreamsConfig> appRequest = getStreamingAppRequest(etlConfig);
47 |     ApplicationId appId = NamespaceId.DEFAULT.app(appName);
48 |     ApplicationManager applicationManager = deployApplication(appId, appRequest);
49 |     return applicationManager.getSparkManager(DataStreamsSparkLauncher.NAME);
50 |   }
51 | 
52 |   private AppRequest<DataStreamsConfig> getStreamingAppRequest(DataStreamsConfig config)
53 |     throws IOException, UnauthenticatedException {
54 |     String version = getMetaClient().getVersion().getVersion();
55 |     return new AppRequest<>(new ArtifactSummary("cdap-data-streams", version, ArtifactScope.SYSTEM), config);
56 |   }
57 | }
58 | 


--------------------------------------------------------------------------------
/src/main/java/io/cdap/plugin/cdc/common/JDBCDriverShim.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright © 2019 Cask Data, Inc.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 5 |  * use this file except in compliance with the License. You may obtain a copy of
 6 |  * the License at
 7 |  *
 8 |  * http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 |  * License for the specific language governing permissions and limitations under
14 |  * the License.
15 |  */
16 | 
17 | package io.cdap.plugin.cdc.common;
18 | 
19 | import java.sql.Connection;
20 | import java.sql.Driver;
21 | import java.sql.DriverPropertyInfo;
22 | import java.sql.SQLException;
23 | import java.sql.SQLFeatureNotSupportedException;
24 | import java.util.Properties;
25 | import java.util.logging.Logger;
26 | 
27 | /**
28 |  * Shim for JDBC driver as a better alternative to mere Class.forName to load the JDBC Driver class.
29 |  *
30 |  * From http://www.kfu.com/~nsayer/Java/dyn-jdbc.html
31 |  * One problem with using <pre>{@code Class.forName()}</pre> to find and load the JDBC Driver class is that it
32 |  * presumes that your driver is in the classpath. This means either packaging the driver in your jar, or having to
33 |  * stick the driver somewhere (probably unpacking it too), or modifying your classpath.
34 |  * But why not use something like URLClassLoader and the overload of Class.forName() that lets you specify the
35 |  * ClassLoader?" Because the DriverManager will refuse to use a driver not loaded by the system ClassLoader.
36 |  * The workaround for this is to create a shim class that implements java.sql.Driver.
37 |  * This shim class will do nothing but call the methods of an instance of a JDBC driver that we loaded dynamically.
38 |  */
39 | public class JDBCDriverShim implements Driver {
40 | 
41 |   private final Driver delegate;
42 | 
43 |   public JDBCDriverShim(Driver delegate) {
44 |     this.delegate = delegate;
45 |   }
46 | 
47 |   @Override
48 |   public boolean acceptsURL(String url) throws SQLException {
49 |     return delegate.acceptsURL(url);
50 |   }
51 | 
52 |   @Override
53 |   public Connection connect(String url, Properties info) throws SQLException {
54 |     return delegate.connect(url, info);
55 |   }
56 | 
57 |   @Override
58 |   public int getMajorVersion() {
59 |     return delegate.getMajorVersion();
60 |   }
61 | 
62 |   @Override
63 |   public int getMinorVersion() {
64 |     return delegate.getMinorVersion();
65 |   }
66 | 
67 |   @Override
68 |   public DriverPropertyInfo[] getPropertyInfo(String url, Properties info) throws SQLException {
69 |     return delegate.getPropertyInfo(url, info);
70 |   }
71 | 
72 |   @Override
73 |   public boolean jdbcCompliant() {
74 |     return delegate.jdbcCompliant();
75 |   }
76 | 
77 |   @Override
78 |   public Logger getParentLogger() throws SQLFeatureNotSupportedException {
79 |     return delegate.getParentLogger();
80 |   }
81 | }
82 | 


--------------------------------------------------------------------------------
/docker-compose/cdc-env/Oracle/createDB.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # LICENSE UPL 1.0
 3 | #
 4 | # Copyright (c) 1982-2018 Oracle and/or its affiliates. All rights reserved.
 5 | # 
 6 | # Since: November, 2016
 7 | # Author: gerald.venzl@oracle.com
 8 | # Description: Creates an Oracle Database based on following parameters:
 9 | #              $ORACLE_SID: The Oracle SID and CDB name
10 | #              $ORACLE_PDB: The PDB name
11 | #              $ORACLE_PWD: The Oracle password
12 | # 
13 | # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
14 | # 
15 | 
16 | set -e
17 | 
18 | # Check whether ORACLE_SID is passed on
19 | export ORACLE_SID=${1:-ORCLCDB}
20 | 
21 | # Check whether ORACLE_PDB is passed on
22 | export ORACLE_PDB=${2:-ORCLPDB1}
23 | 
24 | # Auto generate ORACLE PWD if not passed on
25 | export ORACLE_PWD=${3:-"`openssl rand -base64 8`1"}
26 | echo "ORACLE PASSWORD FOR SYS, SYSTEM AND PDBADMIN: $ORACLE_PWD";
27 | 
28 | # Replace place holders in response file
29 | cp $ORACLE_BASE/$CONFIG_RSP $ORACLE_BASE/dbca.rsp
30 | sed -i -e "s|###ORACLE_SID###|$ORACLE_SID|g" $ORACLE_BASE/dbca.rsp
31 | sed -i -e "s|###ORACLE_PDB###|$ORACLE_PDB|g" $ORACLE_BASE/dbca.rsp
32 | sed -i -e "s|###ORACLE_PWD###|$ORACLE_PWD|g" $ORACLE_BASE/dbca.rsp
33 | sed -i -e "s|###ORACLE_CHARACTERSET###|$ORACLE_CHARACTERSET|g" $ORACLE_BASE/dbca.rsp
34 | 
35 | # If there is greater than 8 CPUs default back to dbca memory calculations
36 | # dbca will automatically pick 40% of available memory for Oracle DB
37 | # The minimum of 2G is for small environments to guarantee that Oracle has enough memory to function
38 | # However, bigger environment can and should use more of the available memory
39 | # This is due to Github Issue #307
40 | if [ `nproc` -gt 8 ]; then
41 |    sed -i -e "s|totalMemory=2048||g" $ORACLE_BASE/dbca.rsp
42 | fi;
43 | 
44 | # Create network related config files (sqlnet.ora, tnsnames.ora, listener.ora)
45 | mkdir -p $ORACLE_HOME/network/admin
46 | echo "NAME.DIRECTORY_PATH= (TNSNAMES, EZCONNECT, HOSTNAME)" > $ORACLE_HOME/network/admin/sqlnet.ora
47 | 
48 | # Listener.ora
49 | echo "LISTENER = 
50 | (DESCRIPTION_LIST = 
51 |   (DESCRIPTION = 
52 |     (ADDRESS = (PROTOCOL = IPC)(KEY = EXTPROC1)) 
53 |     (ADDRESS = (PROTOCOL = TCP)(HOST = 0.0.0.0)(PORT = 1521)) 
54 |   ) 
55 | ) 
56 | 
57 | DEDICATED_THROUGH_BROKER_LISTENER=ON
58 | DIAG_ADR_ENABLED = off
59 | " > $ORACLE_HOME/network/admin/listener.ora
60 | 
61 | # Start LISTENER and run DBCA
62 | lsnrctl start &&
63 | dbca -silent -createDatabase -responseFile $ORACLE_BASE/dbca.rsp ||
64 |  cat /opt/oracle/cfgtoollogs/dbca/$ORACLE_SID/$ORACLE_SID.log ||
65 |  cat /opt/oracle/cfgtoollogs/dbca/$ORACLE_SID.log
66 | 
67 | echo "$ORACLE_SID=localhost:1521/$ORACLE_SID" > $ORACLE_HOME/network/admin/tnsnames.ora
68 | echo "$ORACLE_PDB= 
69 | (DESCRIPTION = 
70 |   (ADDRESS = (PROTOCOL = TCP)(HOST = 0.0.0.0)(PORT = 1521))
71 |   (CONNECT_DATA =
72 |     (SERVER = DEDICATED)
73 |     (SERVICE_NAME = $ORACLE_PDB)
74 |   )
75 | )" >> $ORACLE_HOME/network/admin/tnsnames.ora
76 | 
77 | # Remove second control file, fix local_listener, make PDB auto open
78 | sqlplus / as sysdba << EOF
79 |    ALTER SYSTEM SET control_files='$ORACLE_BASE/oradata/$ORACLE_SID/control01.ctl' scope=spfile;
80 |    ALTER SYSTEM SET local_listener='';
81 |    exit;
82 | EOF
83 | 
84 | # Remove temporary response file
85 | rm $ORACLE_BASE/dbca.rsp
86 | 


--------------------------------------------------------------------------------
/docs/CDCHBase-sparksink.md:
--------------------------------------------------------------------------------
 1 | # CDC HBase Sink
 2 | 
 3 | Description
 4 | -----------
 5 | This plugin takes input from a CDC source and writes the changes to HBase. 
 6 | It will write to the HBase instance running on the cluster.
 7 | 
 8 | All CDC sink plugins are normally used in conjunction with CDC source plugins. 
 9 | CDC sink expects messages in CDC format as an input.  
10 | 
11 | Properties
12 | ----------
13 | **Reference Name**: Name used to uniquely identify this source for lineage, annotating metadata, etc.
14 | 
15 | Usage Notes
16 | -----------
17 | This plugin supports table creation and table modification on an HBase server. We recommend placing a normalizer transformation plugin before this plugin. It converts inputs into standard Data Definition Language (DDL) and Data Manipulation Language (DML) records that can be parsed by this plugin.
18 | 
19 | Table Creation
20 | --------------
21 | When the plugin receives a DDL record, it creates a table in the target HBase database. The name of the table is specified in the DDL record. Below is a sample DDL Record that creates a table in namespace `GGTEST` with name `TESTANOTHER`.
22 | ```{
23 |   "schema": {
24 |     "type": "RECORD",
25 |     "recordName": "DDLRecord",
26 |     "fieldMap": {
27 |       "table": {
28 |         "name": "table",
29 |         "schema": {
30 |           "type": "STRING",
31 |           "unionSchemas": []
32 |         }
33 |       },
34 |       "schema": {
35 |         "name": "schema",
36 |         "schema": {
37 |           "type": "STRING",
38 |           "unionSchemas": []
39 |         }
40 |       }
41 |     },
42 |     "fields": [
43 |       {
44 |         "name": "table",
45 |         "schema": {
46 |           "type": "STRING",
47 |           "unionSchemas": []
48 |         }
49 |       },
50 |       {
51 |         "name": "schema",
52 |         "schema": {
53 |           "type": "STRING",
54 |           "unionSchemas": []
55 |         }
56 |       }
57 |     ],
58 |     "unionSchemas": []
59 |   },
60 |   "fields": {
61 |     "schema": "{\"type\":\"record\",\"name\":\"columns\",\"fields\":[{\"name\":\"CID\",\"type\":[\"null\",\"long\"]},{\"name\":\"CNAME\",\"type\":[\"null\",\"string\"]}]}",
62 |     "table": "GGTEST.TESTANOTHER"
63 |   }
64 | }
65 | ```
66 | 
67 | Table Modification
68 | --------------
69 | When the plugin receives a DML record, it modifies the corresponding table according to the operation specified in `op_type`. 
70 | 
71 | | op\_type | Operation |
72 | | :--------------: | :--------------: |
73 | | I | Insert |
74 | | U | Update | 
75 | | D | Delete |
76 | 
77 | The content of the changes is listed in the `change` field. The `primary_keys` field specifies the fields in `change` that will be used to name a row in the table. Below is a sample DML record that creates a row for `Scott` and inserts his information into the row.
78 | ```
79 | {
80 |   "table": "GGTEST_EMPLOYEE",
81 |   "schema": "{\"type\":\"record\",\"name\":\"columns\",\"fields\":[{\"name\":\"EMPNO\",\"type\":[\"null\",\"long\"]},{\"name\":\"ENAME\",\"type\":[\"null\",\"string\"]},{\"name\":\"JOB\",\"type\":[\"null\",\"string\"]},{\"name\":\"MGR\",\"type\":[\"null\",\"long\"]},{\"name\":\"HIREDATE\",\"type\":[\"null\",\"string\"]},{\"name\":\"SAL\",\"type\":[\"null\",\"long\"]},{\"name\":\"COMM\",\"type\":[\"null\",\"long\"]},{\"name\":\"DEPTNO\",\"type\":[\"null\",\"long\"]},{\"name\":\"EMP_ADDRESS\",\"type\":[\"null\",\"string\"]}]}",
82 |   "op_type": "I",
83 |   "primary_keys": [
84 |     "ENAME"
85 |   ],
86 |   "change": {
87 |     "HIREDATE": "03-DEC-2015",
88 |     "JOB": "Software Engineer",
89 |     "MGR": 991,
90 |     "SAL": 1234,
91 |     "DEPTNO": 1,
92 |     "EMP_ADDRESS": "San Jose",
93 |     "ENAME": "Scott",
94 |     "EMPNO": 1,
95 |     "COMM": 1
96 |   }
97 | }
98 | ```


--------------------------------------------------------------------------------
/widgets/CDCKudu-sparksink.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "metadata": {
  3 |     "spec-version": "1.5"
  4 |   },
  5 |   "configuration-groups": [
  6 |     {
  7 |       "label": "Kudu Configuration",
  8 |       "properties": [
  9 |         {
 10 |           "widget-type" : "textbox",
 11 |           "label" : "Reference Name",
 12 |           "name" : "referenceName",
 13 |           "description" : "Reference specifies the name to be used to track this external source"
 14 |         },
 15 |         {
 16 |           "widget-type": "textbox",
 17 |           "label": "Master Addresses",
 18 |           "name": "master",
 19 |           "description": "Comma separated list of hostname:port of Apache Kudu Masters"
 20 |         }
 21 |       ]
 22 |     },
 23 |     {
 24 |       "label": "Kudu Advanced Options",
 25 |       "properties" : [
 26 |         {
 27 |           "widget-type": "number",
 28 |           "label": "No of buckets",
 29 |           "name": "buckets",
 30 |           "widget-attributes" : {
 31 |             "default" : 16
 32 |           }
 33 |         },
 34 |         {
 35 |           "widget-type": "number",
 36 |           "label": "Seed",
 37 |           "name": "seed",
 38 |           "description": "Seed to randomize the mapping of rows to hash buckets",
 39 |           "widget-attributes" : {
 40 |             "default" : 0
 41 |           }
 42 |         },
 43 |         {
 44 |           "widget-type": "select",
 45 |           "label": "Compression Algorithm",
 46 |           "name": "compression-algo",
 47 |           "description": "Compression Algorithm. All fields will be applied same compression",
 48 |           "widget-attributes" : {
 49 |             "values" : [
 50 |               "Snappy",
 51 |               "LZ4",
 52 |               "ZLib",
 53 |               "Backend configured",
 54 |               "No Compression"
 55 |             ],
 56 |             "default": "Snappy"
 57 |           }
 58 |         },
 59 |         {
 60 |           "widget-type": "select",
 61 |           "label": "Encoding Type",
 62 |           "name": "encoding",
 63 |           "description": "Encoding Type. All fields will be applied same encoding",
 64 |           "widget-attributes" : {
 65 |             "values": [
 66 |               "Auto",
 67 |               "Plain",
 68 |               "Prefix",
 69 |               "Group Variant",
 70 |               "RLE",
 71 |               "Dictionary",
 72 |               "Bit Shuffle"
 73 |             ],
 74 |             "default": "Auto"
 75 |           }
 76 |         },
 77 |         {
 78 |           "widget-type": "number",
 79 |           "label": "User Operations Timeout",
 80 |           "name": "opt-timeout",
 81 |           "description": "User operations timeout in milliseconds",
 82 |           "widget-attributes" : {
 83 |             "default" : 30000
 84 |           }
 85 |         },
 86 |         {
 87 |           "widget-type": "number",
 88 |           "label": "Administration Operations Timeout",
 89 |           "name": "admin-timeout",
 90 |           "description": "Administration operation timeout in milliseconds",
 91 |           "widget-attributes" : {
 92 |             "default" : 30000
 93 |           }
 94 |         },
 95 |         {
 96 |           "widget-type": "number",
 97 |           "label": "Replicas",
 98 |           "name": "replicas",
 99 |           "description": "Specifies the number of replicas for the Kudu tables",
100 |           "widget-attributes" : {
101 |             "default" : 1
102 |           }
103 |         },
104 |         {
105 |           "widget-type": "number",
106 |           "label": "Rows Buffer",
107 |           "name" : "row-flush",
108 |           "description": "Number of rows that are buffered before flushing to the tablet server",
109 |           "widget-attributes" : {
110 |             "default" : 1000
111 |           }
112 |         },
113 |         {
114 |           "widget-type": "number",
115 |           "label" : "Boss Threads",
116 |           "name": "boss-threads",
117 |           "description" : "Specifies the number of boss threads to be used by the client",
118 |           "widget-attributes" : {
119 |             "default" : 1
120 |           }
121 |         }
122 |       ]
123 |     }
124 |   ]
125 | }
126 | 


--------------------------------------------------------------------------------
/docs/CTSQLServer-streamingsource.md:
--------------------------------------------------------------------------------
 1 | # CDC SQL Server Streaming Source
 2 | 
 3 | Description
 4 | -----------
 5 | This plugin reads Change Data Capture (CDC) events from SQL Server through Change Tracking.
 6 | 
 7 | All CDC source plugins are normally used in conjunction with CDC sink plugins. 
 8 | CDC source produces messages in CDC format.  
 9 | 
10 | Properties
11 | ----------
12 | **Reference Name**: Name used to uniquely identify this source for lineage, annotating metadata, etc.
13 | 
14 | **Hostname**: Hostname of the SQL Server from which the data needs to be offloaded. 
15 | Ex: mysqlserver.net or 12.123.12.123.
16 | 
17 | **Port**: SQL Server Port.
18 | 
19 | **Username**:  Username to use to connect to the specified database. Required for databases that need authentication. 
20 | Optional for databases that do not require authentication.
21 | 
22 | **Password**:  Password to use to connect to the specified database. Required for databases that need authentication.
23 | Optional for databases that do not require authentication.
24 | 
25 | **Database name**:  SQL Server database name which needs to be tracked. 
26 | Note: Change Tracking must be enabled on the database for the source to read the chage data.
27 | 
28 | **Max Retry Seconds**: Maximum number of seconds to retry failures when reading change events.
29 | If no retries should be done, this should be set to 0.
30 | If there should not be a retry limit, this should be set to a negative number or left empty.
31 | 
32 | SQL Server Change Tracking
33 | --------------------------
34 | Change Tracking allows to identify the rows which have changed. Change Tracking SQL Server Streaming Source leverage 
35 | this to retrieve just the minimum information to keep a SQL server database in sync with a downstream sink. You can 
36 | read more about SQL Server Change Tracking 
37 | [here](https://docs.microsoft.com/en-us/sql/relational-databases/track-changes/about-change-tracking-sql-server)
38 | 
39 | ### Change Tracking and Change Data Capture
40 | SQL Server also allow capturing the changed data through Change Data Capture. Change Data Capture provides historical 
41 | information about the changes. This plugin uses Change Tracking over Change Data Capture because of the following 
42 | reasons:
43 | 
44 | 1. Historical Information: From a pipeline whose purose is to offload data from a database and/or to keep a database 
45 | in sync with some external storage, historical information is not critical.
46 | 2. Schema Changes: Change Data Capture has very limited suport for schema changes in the table being tracked. 
47 | New columns added to a tracked table are not automatically tracked. For more details please refer 
48 | [Handling Changes to Source Data](https://docs.microsoft.com/en-us/sql/relational-databases/track-changes/about-change-data-capture-sql-server#handling-changes-to-source-tables)
49 | 3. Supported Editions: Change Data Capture is only avaliable in DataCenter and Enterprise editions whereas 
50 | Change Tracking is supported in Express, Workgroup, Web Standard, Enterprise and DataCenter.
51 | You can read more about differences between SQL Server CT and CDC 
52 | [here](https://technet.microsoft.com/en-us/library/cc280519(v=sql.105).aspx)
53 | 
54 | ### Enable Change Tracking for a Database
55 | Before you start using the Change Tracking SQL Server Source to track changes in your database you will need to 
56 | enable Change Tracking on the database. Change Tracking can be enabled on database by:
57 | 
58 | > ALTER DATABASE dbName SET CHANGE_TRACKING = ON (CHANGE_RETENTION = 2 DAYS, AUTO_CLEANUP = ON)
59 | 
60 | Refer [Enable Change Tracking for a Database](https://docs.microsoft.com/en-us/sql/relational-databases/track-changes/enable-and-disable-change-tracking-sql-server#enable-change-tracking-for-a-database) for more details.
61 | 
62 | ### Enable Change Tracking for a Table
63 | Change Tracking SQL Server Streaming Source will sync all the tables in a database which has change tracking enabled. 
64 | Change Tracking can ge enabled for a table by:
65 | 
66 | > ALTER TABLE tableName ENABLE CHANGE_TRACKING WITH (TRACK_COLUMNS_UPDATED = OFF)
67 | 
68 | Refer [Enable Change Tracking for a Table](https://docs.microsoft.com/en-us/sql/relational-databases/track-changes/enable-and-disable-change-tracking-sql-server#enable-change-tracking-for-a-table) for more details.
69 | 
70 | 


--------------------------------------------------------------------------------
/src/test/java/io/cdap/plugin/cdc/integration/CDCPluginIntegrationTestBase.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright © 2019 Cask Data, Inc.
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 5 |  * use this file except in compliance with the License. You may obtain a copy of
 6 |  * the License at
 7 |  *
 8 |  * http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 |  * License for the specific language governing permissions and limitations under
14 |  * the License.
15 |  */
16 | 
17 | package io.cdap.plugin.cdc.integration;
18 | 
19 | import com.codahale.metrics.MetricRegistry;
20 | import io.cdap.cdap.api.artifact.ArtifactSummary;
21 | import io.cdap.cdap.common.conf.Constants;
22 | import io.cdap.cdap.datastreams.DataStreamsApp;
23 | import io.cdap.cdap.datastreams.DataStreamsSparkLauncher;
24 | import io.cdap.cdap.etl.mock.test.HydratorTestBase;
25 | import io.cdap.cdap.etl.proto.v2.DataStreamsConfig;
26 | import io.cdap.cdap.etl.proto.v2.ETLPlugin;
27 | import io.cdap.cdap.etl.proto.v2.ETLStage;
28 | import io.cdap.cdap.etl.spark.Compat;
29 | import io.cdap.cdap.proto.artifact.AppRequest;
30 | import io.cdap.cdap.proto.id.ApplicationId;
31 | import io.cdap.cdap.proto.id.ArtifactId;
32 | import io.cdap.cdap.proto.id.NamespaceId;
33 | import io.cdap.cdap.test.ApplicationManager;
34 | import io.cdap.cdap.test.SparkManager;
35 | import io.cdap.cdap.test.TestConfiguration;
36 | import io.cdap.plugin.cdc.sink.CDCBigTable;
37 | import io.cdap.plugin.cdc.sink.CDCHBase;
38 | import io.cdap.plugin.cdc.sink.CDCKudu;
39 | import io.cdap.plugin.cdc.source.oracle.GoldenGateKafka;
40 | import io.cdap.plugin.cdc.source.sqlserver.CTSQLServer;
41 | import kafka.serializer.DefaultDecoder;
42 | import org.apache.spark.streaming.kafka.KafkaUtils;
43 | import org.junit.BeforeClass;
44 | import org.junit.ClassRule;
45 | import org.slf4j.Logger;
46 | import org.slf4j.LoggerFactory;
47 | 
48 | public abstract class CDCPluginIntegrationTestBase extends HydratorTestBase {
49 |   private static final Logger LOG = LoggerFactory.getLogger(CDCPluginIntegrationTestBase.class);
50 |   private static final ArtifactId APP_ARTIFACT_ID = NamespaceId.DEFAULT.artifact("data-streams", "1.0.0");
51 |   private static final ArtifactSummary APP_ARTIFACT = new ArtifactSummary("data-streams", "1.0.0");
52 | 
53 |   @ClassRule
54 |   public static final TestConfiguration CONFIG =
55 |     new TestConfiguration(Constants.Explore.EXPLORE_ENABLED, false,
56 |                           Constants.AppFabric.SPARK_COMPAT, Compat.SPARK_COMPAT);
57 | 
58 |   @BeforeClass
59 |   public static void setupTest() throws Exception {
60 |     LOG.info("Setting up application");
61 | 
62 |     setupStreamingArtifacts(APP_ARTIFACT_ID, DataStreamsApp.class);
63 | 
64 |     LOG.info("Setting up plugins");
65 | 
66 |     addPluginArtifact(NamespaceId.DEFAULT.artifact("cdc-plugins", "1.0.0"),
67 |                       APP_ARTIFACT_ID,
68 |                       GoldenGateKafka.class, CTSQLServer.class,
69 |                       CDCBigTable.class, CDCHBase.class, CDCKudu.class,
70 |                       // Bigtable plugin dependencies
71 |                       MetricRegistry.class,
72 |                       // GoldenGateKafka plugin dependencies
73 |                       KafkaUtils.class, DefaultDecoder.class);
74 |   }
75 | 
76 |   protected SparkManager deployETL(ETLPlugin sourcePlugin, ETLPlugin sinkPlugin, String appName) throws Exception {
77 |     ETLStage source = new ETLStage("source", sourcePlugin);
78 |     ETLStage sink = new ETLStage("sink", sinkPlugin);
79 |     DataStreamsConfig etlConfig = DataStreamsConfig.builder()
80 |       .addStage(source)
81 |       .addStage(sink)
82 |       .addConnection(source.getName(), sink.getName())
83 |       .setBatchInterval("1s")
84 |       .build();
85 | 
86 |     AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
87 |     ApplicationId appId = NamespaceId.DEFAULT.app(appName);
88 |     ApplicationManager applicationManager = deployApplication(appId, appRequest);
89 |     return getProgramManager(applicationManager);
90 |   }
91 | 
92 |   private SparkManager getProgramManager(ApplicationManager appManager) {
93 |     return appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
94 |   }
95 | }
96 | 


--------------------------------------------------------------------------------
/src/main/java/io/cdap/plugin/cdc/sink/CDCBigTableConfig.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright © 2019 Cask Data, Inc.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  5 |  * use this file except in compliance with the License. You may obtain a copy of
  6 |  * the License at
  7 |  *
  8 |  * http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 13 |  * License for the specific language governing permissions and limitations under
 14 |  * the License.
 15 |  */
 16 | 
 17 | package io.cdap.plugin.cdc.sink;
 18 | 
 19 | import com.google.bigtable.repackaged.com.google.cloud.ServiceOptions;
 20 | import com.google.common.base.Strings;
 21 | import io.cdap.cdap.api.annotation.Description;
 22 | import io.cdap.cdap.api.annotation.Macro;
 23 | import io.cdap.cdap.api.annotation.Name;
 24 | import io.cdap.cdap.api.plugin.PluginConfig;
 25 | import io.cdap.cdap.etl.api.validation.InvalidConfigPropertyException;
 26 | import io.cdap.plugin.cdc.common.CDCReferencePluginConfig;
 27 | 
 28 | import java.io.File;
 29 | import javax.annotation.Nullable;
 30 | 
 31 | /**
 32 |  * Defines the {@link PluginConfig} for the {@link CDCBigTable}.
 33 |  */
 34 | public class CDCBigTableConfig extends CDCReferencePluginConfig {
 35 |   public static final String AUTO_DETECT = "auto-detect";
 36 | 
 37 |   public static final String INSTANCE = "instance";
 38 |   public static final String PROJECT = "project";
 39 |   public static final String SERVICE_ACCOUNT_FILE_PATH = "serviceFilePath";
 40 | 
 41 |   @Name(INSTANCE)
 42 |   @Description("BigTable instance id. " +
 43 |     "Uniquely identifies BigTable instance within your Google Cloud Platform project.")
 44 |   @Macro
 45 |   public final String instance;
 46 | 
 47 |   @Name(PROJECT)
 48 |   @Description("Google Cloud Project ID, which uniquely identifies a project. "
 49 |     + "It can be found on the Dashboard in the Google Cloud Platform Console.")
 50 |   @Macro
 51 |   @Nullable
 52 |   public final String project;
 53 | 
 54 |   @Name(SERVICE_ACCOUNT_FILE_PATH)
 55 |   @Description("Path on the local file system of the service account key used "
 56 |     + "for authorization. Can be set to 'auto-detect' when running on a Dataproc cluster. "
 57 |     + "When running on other clusters, the file must be present on every node in the cluster.")
 58 |   @Macro
 59 |   @Nullable
 60 |   public final String serviceAccountFilePath;
 61 | 
 62 |   public CDCBigTableConfig(String referenceName, String instance, @Nullable String project,
 63 |                            @Nullable String serviceAccountFilePath) {
 64 |     super(referenceName);
 65 |     this.instance = instance;
 66 |     this.project = project;
 67 |     this.serviceAccountFilePath = serviceAccountFilePath;
 68 |   }
 69 | 
 70 |   @Nullable
 71 |   public String resolveProject() {
 72 |     if (project == null || project.isEmpty() || AUTO_DETECT.equals(project)) {
 73 |       return ServiceOptions.getDefaultProjectId();
 74 |     }
 75 |     return project;
 76 |   }
 77 | 
 78 |   @Nullable
 79 |   public String resolveServiceAccountFilePath() {
 80 |     if (serviceAccountFilePath == null || serviceAccountFilePath.isEmpty()
 81 |       || AUTO_DETECT.equals(serviceAccountFilePath)) {
 82 |       return null;
 83 |     }
 84 |     return serviceAccountFilePath;
 85 |   }
 86 | 
 87 |   @Override
 88 |   public void validate() {
 89 |     super.validate();
 90 |     if (!containsMacro(PROJECT) && resolveProject() == null) {
 91 |       throw new InvalidConfigPropertyException("Could not detect Google Cloud project id from the environment. " +
 92 |                                                  "Please specify a project id.", PROJECT);
 93 |     }
 94 |     if (!containsMacro(INSTANCE) && Strings.isNullOrEmpty(instance)) {
 95 |       throw new InvalidConfigPropertyException("Instance ID cannot be null or empty", INSTANCE);
 96 |     }
 97 |     String serviceAccountFilePath = resolveServiceAccountFilePath();
 98 |     if (!containsMacro(SERVICE_ACCOUNT_FILE_PATH) && serviceAccountFilePath != null) {
 99 |       File serviceAccountFile = new File(serviceAccountFilePath);
100 |       if (!serviceAccountFile.exists()) {
101 |         throw new InvalidConfigPropertyException(String.format("File '%s' does not exist", serviceAccountFilePath),
102 |                                                  SERVICE_ACCOUNT_FILE_PATH);
103 |       }
104 |     }
105 |   }
106 | }
107 | 


--------------------------------------------------------------------------------
/src/main/java/io/cdap/plugin/cdc/common/Schemas.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright © 2019 Cask Data, Inc.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  5 |  * use this file except in compliance with the License. You may obtain a copy of
  6 |  * the License at
  7 |  *
  8 |  * http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 13 |  * License for the specific language governing permissions and limitations under
 14 |  * the License.
 15 |  */
 16 | 
 17 | package io.cdap.plugin.cdc.common;
 18 | 
 19 | import io.cdap.cdap.api.data.format.StructuredRecord;
 20 | import io.cdap.cdap.api.data.schema.Schema;
 21 | import io.cdap.cdap.api.data.schema.Schema.Field;
 22 | import io.cdap.cdap.api.data.schema.Schema.Type;
 23 | 
 24 | import java.util.Arrays;
 25 | import java.util.Objects;
 26 | import java.util.stream.Collectors;
 27 | 
 28 | /**
 29 |  * Helper class with common cdc schemes definitions.
 30 |  */
 31 | public class Schemas {
 32 | 
 33 |   private static final Schema SIMPLE_TYPES = Schema.unionOf(Arrays.stream(Type.values())
 34 |                                                               .filter(Type::isSimpleType)
 35 |                                                               .map(Schema::of)
 36 |                                                               .collect(Collectors.toList()));
 37 | 
 38 |   public static final String SCHEMA_RECORD = "schema";
 39 |   public static final String TABLE_FIELD = "table";
 40 |   public static final String SCHEMA_FIELD = "schema";
 41 |   public static final String OP_TYPE_FIELD = "op_type";
 42 |   public static final String PRIMARY_KEYS_FIELD = "primary_keys";
 43 |   public static final String DDL_FIELD = "ddl";
 44 |   public static final String DML_FIELD = "dml";
 45 |   public static final String UPDATE_SCHEMA_FIELD = "rows_schema";
 46 |   public static final String UPDATE_VALUES_FIELD = "rows_values";
 47 |   public static final String CHANGE_TRACKING_VERSION = "change_tracking_version";
 48 |   public static final String CDC_CURRENT_TIMESTAMP = "cdc_current_timestamp";
 49 | 
 50 |   public static final Schema DDL_SCHEMA = Schema.recordOf(
 51 |     "DDLRecord",
 52 |     Field.of(TABLE_FIELD, Schema.of(Type.STRING)),
 53 |     Field.of(SCHEMA_FIELD, Schema.of(Type.STRING))
 54 |   );
 55 | 
 56 |   public static final Schema DML_SCHEMA = Schema.recordOf(
 57 |     "DMLRecord",
 58 |     Field.of(OP_TYPE_FIELD, enumWith(OperationType.class)),
 59 |     Field.of(TABLE_FIELD, Schema.of(Type.STRING)),
 60 |     Field.of(PRIMARY_KEYS_FIELD, Schema.arrayOf(Schema.of(Type.STRING))),
 61 |     Field.of(UPDATE_SCHEMA_FIELD, Schema.of(Type.STRING)),
 62 |     Field.of(UPDATE_VALUES_FIELD, Schema.mapOf(Schema.of(Type.STRING), SIMPLE_TYPES)),
 63 |     Field.of(CHANGE_TRACKING_VERSION, Schema.of(Type.STRING)),
 64 |     Field.of(CDC_CURRENT_TIMESTAMP, Schema.of(Schema.LogicalType.TIME_MICROS))
 65 |   );
 66 | 
 67 |   public static final Schema CHANGE_SCHEMA = Schema.recordOf(
 68 |     "changeRecord",
 69 |     Field.of(DDL_FIELD, Schema.nullableOf(DDL_SCHEMA)),
 70 |     Field.of(DML_FIELD, Schema.nullableOf(DML_SCHEMA))
 71 |   );
 72 | 
 73 |   public static StructuredRecord toCDCRecord(StructuredRecord changeRecord) {
 74 |     String recordName = changeRecord.getSchema().getRecordName();
 75 |     if (Objects.equals(recordName, DDL_SCHEMA.getRecordName())) {
 76 |       return StructuredRecord.builder(CHANGE_SCHEMA)
 77 |         .set(DDL_FIELD, changeRecord)
 78 |         .build();
 79 |     } else if (Objects.equals(recordName, DML_SCHEMA.getRecordName())) {
 80 |       return StructuredRecord.builder(CHANGE_SCHEMA)
 81 |         .set(DML_FIELD, changeRecord)
 82 |         .build();
 83 |     }
 84 |     throw new IllegalArgumentException(String.format("Wrong schema name '%s' for record", recordName));
 85 |   }
 86 | 
 87 |   public static String getTableName(String namespacedTableName) {
 88 |     return namespacedTableName.split("\\.")[1];
 89 |   }
 90 | 
 91 |   private static Schema enumWith(Class<? extends Enum<?>> enumClass) {
 92 |     // this method may be removed when Schema.enumWith() method signature fixed
 93 |     Enum<?>[] enumConstants = enumClass.getEnumConstants();
 94 |     String[] names = new String[enumConstants.length];
 95 |     for (int i = 0; i < enumConstants.length; i++) {
 96 |       names[i] = enumConstants[i].name();
 97 |     }
 98 |     return Schema.enumWith(names);
 99 |   }
100 | 
101 |   private Schemas() {
102 |     // utility class
103 |   }
104 | }
105 | 


--------------------------------------------------------------------------------
/src/main/java/io/cdap/plugin/cdc/sink/CDCHBase.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright © 2019 Cask Data, Inc.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  5 |  * use this file except in compliance with the License. You may obtain a copy of
  6 |  * the License at
  7 |  *
  8 |  * http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 13 |  * License for the specific language governing permissions and limitations under
 14 |  * the License.
 15 |  */
 16 | 
 17 | package io.cdap.plugin.cdc.sink;
 18 | 
 19 | import io.cdap.cdap.api.annotation.Name;
 20 | import io.cdap.cdap.api.annotation.Plugin;
 21 | import io.cdap.cdap.api.data.format.StructuredRecord;
 22 | import io.cdap.cdap.etl.api.PipelineConfigurer;
 23 | import io.cdap.cdap.etl.api.batch.SparkExecutionPluginContext;
 24 | import io.cdap.cdap.etl.api.batch.SparkPluginContext;
 25 | import io.cdap.cdap.etl.api.batch.SparkSink;
 26 | import io.cdap.cdap.etl.api.validation.InvalidStageException;
 27 | import io.cdap.plugin.cdc.common.Schemas;
 28 | import io.cdap.plugin.cdc.common.SparkConfigs;
 29 | import io.cdap.plugin.common.batch.JobUtils;
 30 | import org.apache.hadoop.conf.Configuration;
 31 | import org.apache.hadoop.hbase.TableName;
 32 | import org.apache.hadoop.hbase.client.Admin;
 33 | import org.apache.hadoop.hbase.client.Connection;
 34 | import org.apache.hadoop.hbase.client.ConnectionFactory;
 35 | import org.apache.hadoop.hbase.client.Table;
 36 | import org.apache.hadoop.mapreduce.Job;
 37 | import org.apache.spark.api.java.JavaRDD;
 38 | 
 39 | import java.io.IOException;
 40 | import java.util.Map;
 41 | 
 42 | /**
 43 |  * HBase sink for CDC
 44 |  */
 45 | @Plugin(type = SparkSink.PLUGIN_TYPE)
 46 | @Name("CDCHBase")
 47 | public class CDCHBase extends SparkSink<StructuredRecord> {
 48 |   private final CDCHBaseConfig config;
 49 | 
 50 |   public CDCHBase(CDCHBaseConfig config) {
 51 |     this.config = config;
 52 |   }
 53 | 
 54 |   @Override
 55 |   public void prepareRun(SparkPluginContext context) throws Exception {
 56 |   }
 57 | 
 58 |   @Override
 59 |   public void configurePipeline(PipelineConfigurer pipelineConfigurer) {
 60 |     config.validate();
 61 |     if (!Schemas.CHANGE_SCHEMA.isCompatible(pipelineConfigurer.getStageConfigurer().getInputSchema())) {
 62 |       throw new InvalidStageException("Input schema is incompatible with change record schema");
 63 |     }
 64 |   }
 65 | 
 66 |   @Override
 67 |   public void run(SparkExecutionPluginContext context, JavaRDD<StructuredRecord> javaRDD) throws Exception {
 68 |     Map<String, String> hadoopConfigs = SparkConfigs.getHadoopConfigs(javaRDD);
 69 |     // maps data sets to each block of computing resources
 70 |     javaRDD.foreachPartition(structuredRecordIterator -> {
 71 |       try (Connection conn = getConnection(hadoopConfigs);
 72 |            Admin hBaseAdmin = conn.getAdmin()) {
 73 |         while (structuredRecordIterator.hasNext()) {
 74 |           StructuredRecord input = structuredRecordIterator.next();
 75 |           StructuredRecord ddlRecord = input.get(Schemas.DDL_FIELD);
 76 |           if (ddlRecord != null) {
 77 |             String tableName = Schemas.getTableName(ddlRecord.get(Schemas.TABLE_FIELD));
 78 |             CDCTableUtil.createHBaseTable(hBaseAdmin, tableName);
 79 |           }
 80 |           StructuredRecord dmlRecord = input.get(Schemas.DML_FIELD);
 81 |           if (dmlRecord != null) {
 82 |             String tableName = Schemas.getTableName(dmlRecord.get(Schemas.TABLE_FIELD));
 83 |             Table table = hBaseAdmin.getConnection().getTable(TableName.valueOf(tableName));
 84 |             CDCTableUtil.updateHBaseTable(table, dmlRecord);
 85 |           }
 86 |         }
 87 |       }
 88 |     });
 89 |   }
 90 | 
 91 |   private Connection getConnection(Map<String, String> hadoopConfigs) throws IOException {
 92 |     ClassLoader oldClassLoader = Thread.currentThread().getContextClassLoader();
 93 |     // Switch the context classloader to plugin class' classloader (PluginClassLoader) so that
 94 |     // when Job/Configuration is created, it uses PluginClassLoader to load resources (hbase-default.xml)
 95 |     // which is present in the plugin jar and is not visible in the CombineClassLoader (which is what oldClassLoader
 96 |     // points to).
 97 |     Thread.currentThread().setContextClassLoader(getClass().getClassLoader());
 98 |     Job job;
 99 |     try {
100 |       job = JobUtils.createInstance();
101 |     } finally {
102 |       // Switch back to the original
103 |       Thread.currentThread().setContextClassLoader(oldClassLoader);
104 |     }
105 |     Configuration conf = job.getConfiguration();
106 | 
107 |     for (Map.Entry<String, String> configEntry : hadoopConfigs.entrySet()) {
108 |       conf.set(configEntry.getKey(), configEntry.getValue());
109 |     }
110 | 
111 |     return ConnectionFactory.createConnection(conf);
112 |   }
113 | }
114 | 


--------------------------------------------------------------------------------
/docs/CDCBigTable-sparksink.md:
--------------------------------------------------------------------------------
  1 | # CDC Google Cloud Bigtable Sink
  2 | 
  3 | Description
  4 | -----------
  5 | This plugin takes input from a CDC source and writes the changes to Cloud Bigtable.
  6 | 
  7 | All CDC sink plugins are normally used in conjunction with CDC source plugins. 
  8 | CDC sink expects messages in CDC format as an input.  
  9 | 
 10 | Credentials
 11 | -----------
 12 | If the plugin is run on a Google Cloud Dataproc cluster, the service account key does not need to be
 13 | provided and can be set to 'auto-detect'.
 14 | Credentials will be automatically read from the cluster environment.
 15 | 
 16 | If the plugin is not run on a Dataproc cluster, the path to a service account key must be provided.
 17 | The service account key can be found on the Dashboard in the Cloud Platform Console.
 18 | Make sure the account key has permission to access BigQuery and Google Cloud Storage.
 19 | The service account key file needs to be available on every node in your cluster and
 20 | must be readable by all users running the job.
 21 | 
 22 | Properties
 23 | ----------
 24 | **Reference Name**: Name used to uniquely identify this source for lineage, annotating metadata, etc.
 25 | 
 26 | **Instance ID**: The Instance Id the Cloud Bigtable is in.
 27 | 
 28 | **Project ID**: Google Cloud Project ID, which uniquely identifies a project.
 29 | It can be found on the Dashboard in the Google Cloud Platform Console. This is the project
 30 | that the BigQuery job will run in. If a temporary bucket needs to be created, the service account
 31 | must have permission in this project to create buckets.
 32 | 
 33 | **Service Account File Path**: Path on the local file system of the service account key used for
 34 | authorization. Can be set to 'auto-detect' when running on a Dataproc cluster.
 35 | When running on other clusters, the file must be present on every node in the cluster.
 36 | 
 37 | Usage Notes
 38 | -----------
 39 | This plugin supports table creation and table modification on a Cloud Bigtable project. 
 40 | We recommend placing a normalizer transformation plugin before this plugin. 
 41 | It converts inputs into standard Data Definition Language (DDL) and Data Manipulation Language (DML) records that 
 42 | can be parsed by this plugin.
 43 | 
 44 | Table Creation
 45 | --------------
 46 | When the plugin receives a DDL record, it creates a table in the target Cloud Bigtable project. The name of the table 
 47 | is specified in the DDL record. Below is a sample DDL Record that creates a table with name `TESTANOTHER`.
 48 | ```{
 49 |   "schema": {
 50 |     "type": "RECORD",
 51 |     "recordName": "DDLRecord",
 52 |     "fieldMap": {
 53 |       "table": {
 54 |         "name": "table",
 55 |         "schema": {
 56 |           "type": "STRING",
 57 |           "unionSchemas": []
 58 |         }
 59 |       },
 60 |       "schema": {
 61 |         "name": "schema",
 62 |         "schema": {
 63 |           "type": "STRING",
 64 |           "unionSchemas": []
 65 |         }
 66 |       }
 67 |     },
 68 |     "fields": [
 69 |       {
 70 |         "name": "table",
 71 |         "schema": {
 72 |           "type": "STRING",
 73 |           "unionSchemas": []
 74 |         }
 75 |       },
 76 |       {
 77 |         "name": "schema",
 78 |         "schema": {
 79 |           "type": "STRING",
 80 |           "unionSchemas": []
 81 |         }
 82 |       }
 83 |     ],
 84 |     "unionSchemas": []
 85 |   },
 86 |   "fields": {
 87 |     "schema": "{\"type\":\"record\",\"name\":\"columns\",\"fields\":[{\"name\":\"CID\",\"type\":[\"null\",\"long\"]},{\"name\":\"CNAME\",\"type\":[\"null\",\"string\"]}]}",
 88 |     "table": "TESTANOTHER"
 89 |   }
 90 | }
 91 | ```
 92 | 
 93 | Table Modification
 94 | --------------
 95 | When the plugin receives a DML record, it modifies the corresponding table according to the operation specified in 
 96 | `op_type`. 
 97 | 
 98 | | op\_type | Operation |
 99 | | :--------------: | :--------------: |
100 | | I | Insert |
101 | | U | Update | 
102 | | D | Delete |
103 | 
104 | The content of the changes is listed in the `change` field. The `primary_keys` field specifies the fields in `change` 
105 | that will be used to name a row in the table. Below is a sample DML record that creates a row for `Scott` and inserts 
106 | his information into the row.
107 | ```
108 | {
109 |   "table": "EMPLOYEE",
110 |   "schema": "{\"type\":\"record\",\"name\":\"columns\",\"fields\":[{\"name\":\"EMPNO\",\"type\":[\"null\",\"long\"]},{\"name\":\"ENAME\",\"type\":[\"null\",\"string\"]},{\"name\":\"JOB\",\"type\":[\"null\",\"string\"]},{\"name\":\"MGR\",\"type\":[\"null\",\"long\"]},{\"name\":\"HIREDATE\",\"type\":[\"null\",\"string\"]},{\"name\":\"SAL\",\"type\":[\"null\",\"long\"]},{\"name\":\"COMM\",\"type\":[\"null\",\"long\"]},{\"name\":\"DEPTNO\",\"type\":[\"null\",\"long\"]},{\"name\":\"EMP_ADDRESS\",\"type\":[\"null\",\"string\"]}]}",
111 |   "op_type": "I",
112 |   "primary_keys": [
113 |     "ENAME"
114 |   ],
115 |   "change": {
116 |     "HIREDATE": "03-DEC-2015",
117 |     "JOB": "Software Engineer",
118 |     "MGR": 991,
119 |     "SAL": 1234,
120 |     "DEPTNO": 1,
121 |     "EMP_ADDRESS": "San Jose",
122 |     "ENAME": "Scott",
123 |     "EMPNO": 1,
124 |     "COMM": 1
125 |   }
126 | }
127 | ```


--------------------------------------------------------------------------------
/src/main/java/io/cdap/plugin/cdc/source/oracle/GoldenGateKafkaConfig.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright © 2019 Cask Data, Inc.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  5 |  * use this file except in compliance with the License. You may obtain a copy of
  6 |  * the License at
  7 |  *
  8 |  * http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 13 |  * License for the specific language governing permissions and limitations under
 14 |  * the License.
 15 |  */
 16 | package io.cdap.plugin.cdc.source.oracle;
 17 | 
 18 | import com.google.common.base.Strings;
 19 | import io.cdap.cdap.api.annotation.Description;
 20 | import io.cdap.cdap.api.annotation.Macro;
 21 | import io.cdap.cdap.api.annotation.Name;
 22 | import io.cdap.cdap.etl.api.validation.InvalidConfigPropertyException;
 23 | import io.cdap.plugin.cdc.common.CDCReferencePluginConfig;
 24 | import org.apache.commons.lang3.ObjectUtils;
 25 | 
 26 | import javax.annotation.Nullable;
 27 | 
 28 | /**
 29 |  * Configurations to be used for Golden Gate Kafka source.
 30 |  */
 31 | public class GoldenGateKafkaConfig extends CDCReferencePluginConfig {
 32 | 
 33 |   private static final long serialVersionUID = 8069169417140954175L;
 34 | 
 35 |   public static final String BROKER = "broker";
 36 |   public static final String TOPIC = "topic";
 37 |   public static final String DEFAULT_INITIAL_OFFSET = "defaultInitialOffset";
 38 |   public static final String MAX_RATE_PER_PARTITION = "maxRatePerPartition";
 39 | 
 40 |   @Name(BROKER)
 41 |   @Description("Kafka broker specified in host:port form. For example, example.com:9092")
 42 |   @Macro
 43 |   private final String broker;
 44 | 
 45 |   @Name(TOPIC)
 46 |   @Description("Name of the topic to which Golden Gate publishes the DDL and DML changes.")
 47 |   @Macro
 48 |   private final String topic;
 49 | 
 50 |   @Name(DEFAULT_INITIAL_OFFSET)
 51 |   @Description("The default initial offset to read from. " +
 52 |     "An offset of -2 means the smallest offset. An offset of -1 means the latest offset. Defaults to -1. " +
 53 |     "Offsets are inclusive. If an offset of 5 is used, the message at offset 5 will be read. ")
 54 |   @Macro
 55 |   @Nullable
 56 |   private final Long defaultInitialOffset;
 57 | 
 58 |   @Name(MAX_RATE_PER_PARTITION)
 59 |   @Description("Max number of records to read per second per partition. 0 means there is no limit. Defaults to 1000.")
 60 |   @Macro
 61 |   @Nullable
 62 |   private final Integer maxRatePerPartition;
 63 | 
 64 |   public GoldenGateKafkaConfig(String referenceName, @Nullable String broker, @Nullable String topic,
 65 |                                @Nullable Long defaultInitialOffset, @Nullable Integer maxRatePerPartition) {
 66 |     super(referenceName);
 67 |     this.broker = broker;
 68 |     this.topic = topic;
 69 |     this.defaultInitialOffset = defaultInitialOffset;
 70 |     this.maxRatePerPartition = maxRatePerPartition;
 71 |   }
 72 | 
 73 |   @Nullable
 74 |   public String getBroker() {
 75 |     return broker;
 76 |   }
 77 | 
 78 |   public String getHost() {
 79 |     return broker.split(":")[0];
 80 |   }
 81 | 
 82 |   public int getPort() {
 83 |     return Integer.valueOf(broker.split(":")[1]);
 84 |   }
 85 | 
 86 |   @Nullable
 87 |   public String getTopic() {
 88 |     return topic;
 89 |   }
 90 | 
 91 |   public Long getDefaultInitialOffset() {
 92 |     return ObjectUtils.defaultIfNull(defaultInitialOffset, -1L);
 93 |   }
 94 | 
 95 |   public Integer getMaxRatePerPartition() {
 96 |     return ObjectUtils.defaultIfNull(maxRatePerPartition, 1000);
 97 |   }
 98 | 
 99 |   /**
100 |    * Method to validate the broker address which should be in the form 'host:port'.
101 |    * throws IllegalArgumentException if validation fails
102 |    */
103 |   @Override
104 |   public void validate() {
105 |     super.validate();
106 |     if (!containsMacro(BROKER)) {
107 |       if (Strings.isNullOrEmpty(broker)) {
108 |         throw new InvalidConfigPropertyException("Broker address cannot be null or empty", BROKER);
109 |       }
110 |       try {
111 |         getHost();
112 |         getPort();
113 |       } catch (Exception e) {
114 |         throw new InvalidConfigPropertyException(
115 |           String.format("Broker address '%s' should be in the form of 'host:port'.", broker), e, BROKER);
116 |       }
117 |     }
118 |     if (!containsMacro(TOPIC) && Strings.isNullOrEmpty(topic)) {
119 |       throw new InvalidConfigPropertyException("Topic cannot be null or empty", TOPIC);
120 |     }
121 |     if (!containsMacro(DEFAULT_INITIAL_OFFSET) && defaultInitialOffset != null && defaultInitialOffset < -2) {
122 |       throw new InvalidConfigPropertyException("'defaultInitialOffset' should be equal to -2, -1, 0 or positive number",
123 |                                                DEFAULT_INITIAL_OFFSET);
124 |     }
125 |     if (!containsMacro(MAX_RATE_PER_PARTITION) && maxRatePerPartition != null && maxRatePerPartition < 0) {
126 |       throw new InvalidConfigPropertyException("'maxRatePerPartition' should be equal to 0 or positive number",
127 |                                                MAX_RATE_PER_PARTITION);
128 |     }
129 |   }
130 | }
131 | 


--------------------------------------------------------------------------------
/src/test/java/io/cdap/plugin/cdc/sink/CDCBigTableConfigUnitTest.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright © 2019 Cask Data, Inc.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  5 |  * use this file except in compliance with the License. You may obtain a copy of
  6 |  * the License at
  7 |  *
  8 |  * http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 13 |  * License for the specific language governing permissions and limitations under
 14 |  * the License.
 15 |  */
 16 | 
 17 | package io.cdap.plugin.cdc.sink;
 18 | 
 19 | import com.google.bigtable.repackaged.com.google.cloud.ServiceOptions;
 20 | import io.cdap.cdap.etl.api.validation.InvalidConfigPropertyException;
 21 | import io.cdap.plugin.common.Constants;
 22 | import org.junit.Assert;
 23 | import org.junit.Assume;
 24 | import org.junit.Test;
 25 | 
 26 | public class CDCBigTableConfigUnitTest {
 27 |   private static final String VALID_REF = "test-ref";
 28 |   private static final String VALID_PROJECT = "test-project";
 29 |   private static final String VALID_INSTANCE = "test-instance";
 30 |   private static final String VALID_ACCOUNT_FILE_PATH
 31 |     = CDCBigTableConfigUnitTest.class.getResource("/credentials.json").getPath();
 32 | 
 33 |   @Test
 34 |   public void testValidateValidConfig() {
 35 |     CDCBigTableConfig config = new CDCBigTableConfig(
 36 |       VALID_REF,
 37 |       VALID_INSTANCE,
 38 |       VALID_PROJECT,
 39 |       VALID_ACCOUNT_FILE_PATH
 40 |     );
 41 | 
 42 |     config.validate();
 43 |   }
 44 | 
 45 |   @Test
 46 |   public void testValidateReference() {
 47 |     CDCBigTableConfig config = new CDCBigTableConfig(
 48 |       "",
 49 |       VALID_INSTANCE,
 50 |       VALID_PROJECT,
 51 |       VALID_ACCOUNT_FILE_PATH
 52 |     );
 53 | 
 54 |     try {
 55 |       config.validate();
 56 |       Assert.fail(String.format("Expected to throw %s", InvalidConfigPropertyException.class.getName()));
 57 |     } catch (InvalidConfigPropertyException e) {
 58 |       Assert.assertEquals(Constants.Reference.REFERENCE_NAME, e.getProperty());
 59 |     }
 60 |   }
 61 | 
 62 |   @Test
 63 |   public void testValidateMissingCredentialsFile() {
 64 |     CDCBigTableConfig config = new CDCBigTableConfig(
 65 |       VALID_REF,
 66 |       VALID_INSTANCE,
 67 |       VALID_PROJECT,
 68 |       "/tmp/non_existing_file"
 69 |     );
 70 | 
 71 |     try {
 72 |       config.validate();
 73 |       Assert.fail(String.format("Expected to throw %s", InvalidConfigPropertyException.class.getName()));
 74 |     } catch (InvalidConfigPropertyException e) {
 75 |       Assert.assertEquals(CDCBigTableConfig.SERVICE_ACCOUNT_FILE_PATH, e.getProperty());
 76 |     }
 77 |   }
 78 | 
 79 |   @Test
 80 |   public void testValidateMissingProjectId() {
 81 |     Assume.assumeTrue(ServiceOptions.getDefaultProjectId() == null);
 82 | 
 83 |     CDCBigTableConfig config = new CDCBigTableConfig(
 84 |       VALID_REF,
 85 |       VALID_INSTANCE,
 86 |       null,
 87 |       VALID_ACCOUNT_FILE_PATH
 88 |     );
 89 | 
 90 |     try {
 91 |       config.validate();
 92 |       Assert.fail(String.format("Expected to throw %s", InvalidConfigPropertyException.class.getName()));
 93 |     } catch (InvalidConfigPropertyException e) {
 94 |       Assert.assertEquals(CDCBigTableConfig.PROJECT, e.getProperty());
 95 |     }
 96 |   }
 97 | 
 98 |   @Test
 99 |   public void testValidateMissingInstanceId() {
100 |     CDCBigTableConfig config = new CDCBigTableConfig(
101 |       VALID_REF,
102 |       null,
103 |       VALID_PROJECT,
104 |       VALID_ACCOUNT_FILE_PATH
105 |     );
106 | 
107 |     try {
108 |       config.validate();
109 |       Assert.fail(String.format("Expected to throw %s", InvalidConfigPropertyException.class.getName()));
110 |     } catch (InvalidConfigPropertyException e) {
111 |       Assert.assertEquals(CDCBigTableConfig.INSTANCE, e.getProperty());
112 |     }
113 |   }
114 | 
115 |   @Test
116 |   public void testResolveProjectId() {
117 |     CDCBigTableConfig config = new CDCBigTableConfig(
118 |       VALID_REF,
119 |       VALID_INSTANCE,
120 |       null,
121 |       VALID_ACCOUNT_FILE_PATH
122 |     );
123 | 
124 |     Assert.assertEquals(ServiceOptions.getDefaultProjectId(), config.resolveProject());
125 |   }
126 | 
127 |   @Test
128 |   public void testResolveProjectIdAutoDetect() {
129 |     CDCBigTableConfig config = new CDCBigTableConfig(
130 |       VALID_REF,
131 |       VALID_INSTANCE,
132 |       CDCBigTableConfig.AUTO_DETECT,
133 |       VALID_ACCOUNT_FILE_PATH
134 |     );
135 | 
136 |     Assert.assertEquals(ServiceOptions.getDefaultProjectId(), config.resolveProject());
137 |   }
138 | 
139 |   @Test
140 |   public void testServiceAccountFilePath() {
141 |     CDCBigTableConfig config = new CDCBigTableConfig(
142 |       VALID_REF,
143 |       VALID_INSTANCE,
144 |       VALID_PROJECT,
145 |       null
146 |     );
147 | 
148 |     Assert.assertNull(config.resolveServiceAccountFilePath());
149 |   }
150 | 
151 |   @Test
152 |   public void testServiceAccountFilePathAutoDetect() {
153 |     CDCBigTableConfig config = new CDCBigTableConfig(
154 |       VALID_REF,
155 |       VALID_INSTANCE,
156 |       VALID_PROJECT,
157 |       CDCBigTableConfig.AUTO_DETECT
158 |     );
159 | 
160 |     Assert.assertNull(config.resolveServiceAccountFilePath());
161 |   }
162 | }
163 | 


--------------------------------------------------------------------------------
/src/main/java/io/cdap/plugin/cdc/source/sqlserver/ResultSetToDMLRecord.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright © 2019 Cask Data, Inc.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  5 |  * use this file except in compliance with the License. You may obtain a copy of
  6 |  * the License at
  7 |  *
  8 |  * http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 13 |  * License for the specific language governing permissions and limitations under
 14 |  * the License.
 15 |  */
 16 | 
 17 | package io.cdap.plugin.cdc.source.sqlserver;
 18 | 
 19 | import com.google.common.base.Joiner;
 20 | import com.google.common.collect.Lists;
 21 | import io.cdap.cdap.api.data.format.StructuredRecord;
 22 | import io.cdap.cdap.api.data.schema.Schema;
 23 | import io.cdap.plugin.cdc.common.DBUtils;
 24 | import io.cdap.plugin.cdc.common.OperationType;
 25 | import io.cdap.plugin.cdc.common.Schemas;
 26 | import org.apache.spark.api.java.function.Function;
 27 | import org.slf4j.Logger;
 28 | import org.slf4j.LoggerFactory;
 29 | 
 30 | import java.sql.Date;
 31 | import java.sql.ResultSet;
 32 | import java.sql.ResultSetMetaData;
 33 | import java.sql.SQLException;
 34 | import java.sql.Time;
 35 | import java.sql.Timestamp;
 36 | import java.time.Instant;
 37 | import java.util.HashMap;
 38 | import java.util.List;
 39 | import java.util.Map;
 40 | import java.util.concurrent.TimeUnit;
 41 | 
 42 | /**
 43 |  * A serializable class to allow invoking {@link scala.Function1} from Java. The function converts {@link ResultSet}
 44 |  * to {@link StructuredRecord} for dml records
 45 |  */
 46 | public class ResultSetToDMLRecord implements Function<ResultSet, StructuredRecord> {
 47 |   private static final Logger LOG = LoggerFactory.getLogger(ResultSetToDMLRecord.class);
 48 |   private static final int CHANGE_TABLE_COLUMNS_SIZE = 4;
 49 |   private final TableInformation tableInformation;
 50 | 
 51 |   ResultSetToDMLRecord(TableInformation tableInformation) {
 52 |     this.tableInformation = tableInformation;
 53 |   }
 54 | 
 55 |   @Override
 56 |   public StructuredRecord call(ResultSet row) throws SQLException {
 57 |     Schema changeSchema = getChangeSchema(row);
 58 |     String operation = row.getString("SYS_CHANGE_OPERATION");
 59 |     OperationType operationType = OperationType.fromShortName(operation);
 60 |     return StructuredRecord.builder(Schemas.DML_SCHEMA)
 61 |       .set(Schemas.TABLE_FIELD, Joiner.on(".").join(tableInformation.getSchemaName(), tableInformation.getName()))
 62 |       .set(Schemas.PRIMARY_KEYS_FIELD, Lists.newArrayList(tableInformation.getPrimaryKeys()))
 63 |       .set(Schemas.OP_TYPE_FIELD, operationType.name())
 64 |       .set(Schemas.UPDATE_SCHEMA_FIELD, changeSchema.toString())
 65 |       .set(Schemas.UPDATE_VALUES_FIELD, getChangeData(row, changeSchema))
 66 |       .set(Schemas.CHANGE_TRACKING_VERSION, row.getString("CHANGE_TRACKING_VERSION"))
 67 |       .set(Schemas.CDC_CURRENT_TIMESTAMP, row.getTimestamp("CDC_CURRENT_TIMESTAMP").getTime() * 1000)
 68 |       .build();
 69 |   }
 70 | 
 71 |   private static Map<String, Object> getChangeData(ResultSet resultSet, Schema changeSchema) throws SQLException {
 72 |     ResultSetMetaData metadata = resultSet.getMetaData();
 73 |     Map<String, Object> changes = new HashMap<>();
 74 |     for (int i = 0; i < changeSchema.getFields().size(); i++) {
 75 |       Schema.Field field = changeSchema.getFields().get(i);
 76 |       // Ignore the first CHANGE_TABLE_COLUMN_SIZE columns since those are change tracking data and not the
 77 |       // actual row data. Add 1 because ResultSetMetaData starts from 1, not 0.
 78 |       int column = 1 + i + CHANGE_TABLE_COLUMNS_SIZE;
 79 |       int sqlType = metadata.getColumnType(column);
 80 |       int sqlPrecision = metadata.getPrecision(column);
 81 |       int sqlScale = metadata.getScale(column);
 82 |       Object sqlValue = DBUtils.transformValue(sqlType, sqlPrecision, sqlScale, resultSet, field.getName());
 83 |       Object javaValue = transformSQLToJavaType(sqlValue);
 84 |       changes.put(field.getName(), javaValue);
 85 |     }
 86 |     return changes;
 87 |   }
 88 | 
 89 |   private static Schema getChangeSchema(ResultSet resultSet) throws SQLException {
 90 |     List<Schema.Field> schemaFields = DBUtils.getSchemaFields(resultSet);
 91 |     // drop first four columns as they are from change tracking tables and does not represent the change data
 92 |     return Schema.recordOf(Schemas.SCHEMA_RECORD,
 93 |                            schemaFields.subList(CHANGE_TABLE_COLUMNS_SIZE, schemaFields.size()));
 94 |   }
 95 | 
 96 |   private static Object transformSQLToJavaType(Object sqlValue) {
 97 |     if (sqlValue instanceof Date) {
 98 |       Date d = (Date) sqlValue;
 99 |       // dates are number of days since the epoch
100 |       return (int) d.toLocalDate().toEpochDay();
101 |     } else if (sqlValue instanceof Time) {
102 |       // times are microseconds since midnight
103 |       Time t = (Time) sqlValue;
104 |       return TimeUnit.NANOSECONDS.toMicros(t.toLocalTime().toNanoOfDay());
105 |     } else if (sqlValue instanceof Timestamp) {
106 |       // timestamps are in microseconds
107 |       Instant instant = ((Timestamp) sqlValue).toInstant();
108 |       long micros = TimeUnit.SECONDS.toMicros(instant.getEpochSecond());
109 |       return micros + TimeUnit.NANOSECONDS.toMicros(instant.getNano());
110 |     } else {
111 |       return sqlValue;
112 |     }
113 |   }
114 | }
115 | 


--------------------------------------------------------------------------------
/widgets/CTSQLServer-streamingsource.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "metadata": {
  3 |     "spec-version": "1.5"
  4 |   },
  5 |   "configuration-groups": [
  6 |     {
  7 |       "label": "Basic",
  8 |       "properties": [
  9 |         {
 10 |           "widget-type": "textbox",
 11 |           "label": "Reference Name",
 12 |           "name": "referenceName",
 13 |           "description": "Reference specifies the name to be used to track this external source"
 14 |         },
 15 |         {
 16 |           "widget-type": "textbox",
 17 |           "label": "Username",
 18 |           "name": "username",
 19 |           "description": "Username to use to connect to the specified database. Required for databases that need authentication. Optional for databases that do not require authentication"
 20 |         },
 21 |         {
 22 |           "widget-type": "password",
 23 |           "label": "Password",
 24 |           "name": "password",
 25 |           "description": "Password to use to connect to the specified database. Required for databases that need authentication. Optional for databases that do not require authentication"
 26 |         },
 27 |         {
 28 |           "widget-type": "textbox",
 29 |           "label": "Database name",
 30 |           "name": "dbname",
 31 |           "description": "SQL Server database name which needs to be tracked. Note: Change Tracking must be enabled on the database for the source to read the chage data"
 32 |         },
 33 |         {
 34 |           "widget-type": "csv",
 35 |           "label": "Table Whitelist",
 36 |           "name": "tableWhitelist"
 37 |         }
 38 |       ]
 39 |     },
 40 |     {
 41 |       "label": "Connection",
 42 |       "properties": [
 43 |         {
 44 |           "widget-type": "textbox",
 45 |           "label": "Hostname",
 46 |           "name": "hostname",
 47 |           "widget-attributes": {
 48 |             "placeholder": "SQL Server hostname"
 49 |           }
 50 |         },
 51 |         {
 52 |           "widget-type": "textbox",
 53 |           "label": "Port",
 54 |           "name": "port",
 55 |           "widget-attributes": {
 56 |             "placeholder": "SQL Server Port. Ex: 1433"
 57 |           }
 58 |         }
 59 |       ]
 60 |     },
 61 |     {
 62 |       "label": "Custom JDBC Connection",
 63 |       "properties": [
 64 |         {
 65 |           "widget-type": "textbox",
 66 |           "label": "JDBC Plugin Name",
 67 |           "name": "jdbcPluginName"
 68 |         },
 69 |         {
 70 |           "widget-type": "textbox",
 71 |           "label": "Connection String",
 72 |           "name": "connectionString"
 73 |         }
 74 |       ]
 75 |     },
 76 |     {
 77 |       "label": "Advanced",
 78 |       "properties": [
 79 |         {
 80 |           "widget-type": "textbox",
 81 |           "label": "Max Retry Seconds",
 82 |           "name": "maxRetrySeconds"
 83 |         },
 84 |         {
 85 |           "widget-type": "textbox",
 86 |           "label": "Max Batch Size",
 87 |           "name": "maxBatchSize",
 88 |           "widget-attributes": {
 89 |             "default": "100000"
 90 |           }
 91 |         },
 92 |         {
 93 |           "widget-type": "textbox",
 94 |           "label": "Starting Sequence Number",
 95 |           "name": "sequenceStartNum",
 96 |           "widget-attributes": {
 97 |             "default": "0"
 98 |           }
 99 |         }
100 |       ]
101 |     }
102 |   ],
103 |   "outputs": [
104 |     {
105 |       "widget-type": "non-editable-schema-editor",
106 |       "schema": {
107 |         "name": "changeRecord",
108 |         "type": "record",
109 |         "fields": [
110 |           {
111 |             "name": "ddl",
112 |             "type": [
113 |               {
114 |                 "type": "record",
115 |                 "name": "DDLRecord",
116 |                 "fields": [
117 |                   { "name": "table", "type":  "string" },
118 |                   { "name":  "schema", "type":  "string" }
119 |                 ]
120 |               },
121 |               "null"
122 |             ]
123 |           },
124 |           {
125 |             "name": "dml",
126 |             "type": [
127 |               {
128 |                 "type": "record",
129 |                 "name": "DMLRecord",
130 |                 "fields": [
131 |                   {
132 |                     "name": "op_type",
133 |                     "type": {
134 |                       "symbols": [ "INSERT", "UPDATE", "DELETE" ],
135 |                       "type": "enum"
136 |                     }
137 |                   },
138 |                   { "name": "table", "type": "string" },
139 |                   { "name": "primary_keys", "type": { "type": "array", "items": "string" } },
140 |                   { "name": "rows_schema", "type": "string" },
141 |                   {
142 |                     "name": "rows_values",
143 |                     "type": {
144 |                       "type": "map",
145 |                       "keys": "string",
146 |                       "values": [
147 |                         "null",
148 |                         "boolean",
149 |                         "int",
150 |                         "long",
151 |                         "float",
152 |                         "double",
153 |                         "bytes",
154 |                         "string"
155 |                       ]
156 |                     }
157 |                   },
158 |                   { "name": "change_tracking_version", "type": "string" }
159 |                 ]
160 |               },
161 |               "null"
162 |             ]
163 |           }
164 |         ]
165 |       }
166 |     }
167 |   ]
168 | }
169 | 


--------------------------------------------------------------------------------
/src/main/java/io/cdap/plugin/cdc/DMLFlattener.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright © 2019 Cask Data, Inc.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  5 |  * use this file except in compliance with the License. You may obtain a copy of
  6 |  * the License at
  7 |  *
  8 |  * http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 13 |  * License for the specific language governing permissions and limitations under
 14 |  * the License.
 15 |  */
 16 | 
 17 | package io.cdap.plugin.cdc;
 18 | 
 19 | import io.cdap.cdap.api.annotation.Description;
 20 | import io.cdap.cdap.api.annotation.Name;
 21 | import io.cdap.cdap.api.annotation.Plugin;
 22 | import io.cdap.cdap.api.data.format.StructuredRecord;
 23 | import io.cdap.cdap.api.data.schema.Schema;
 24 | import io.cdap.cdap.api.plugin.PluginConfig;
 25 | import io.cdap.cdap.etl.api.Emitter;
 26 | import io.cdap.cdap.etl.api.PipelineConfigurer;
 27 | import io.cdap.cdap.etl.api.Transform;
 28 | import io.cdap.cdap.etl.api.TransformContext;
 29 | 
 30 | import java.io.IOException;
 31 | import java.util.ArrayList;
 32 | import java.util.HashMap;
 33 | import java.util.List;
 34 | import java.util.Map;
 35 | import javax.annotation.Nullable;
 36 | 
 37 | /**
 38 |  * Extracts the DML record from the output of a cdc source for direct manipulation.
 39 |  */
 40 | @Plugin(type = Transform.PLUGIN_TYPE)
 41 | @Name("DMLFlattener")
 42 | @Description("Flattens DML records output by a CDC source.")
 43 | public class DMLFlattener extends Transform<StructuredRecord, StructuredRecord> {
 44 |   private static final String OP_TYPE = "CDC_OP_TYPE";
 45 |   private static final String CHANGE_TRACKING_VERSION = "CHANGE_TRACKING_VERSION";
 46 |   private static final String CDC_TIMESTAMP = "CDC_CURRENT_TIMESTAMP";
 47 |   private final Conf conf;
 48 |   private Map<Schema, Schema> schemaCache;
 49 |   private Schema configuredOutputSchema;
 50 |   private boolean addOpType = false;
 51 |   private boolean addTrackingVersion = false;
 52 |   private boolean addTimestamp = false;
 53 | 
 54 |   public DMLFlattener(Conf conf) {
 55 |     this.conf = conf;
 56 |   }
 57 | 
 58 |   @Override
 59 |   public void configurePipeline(PipelineConfigurer pipelineConfigurer) {
 60 |     if (conf.schema != null) {
 61 |       try {
 62 |         pipelineConfigurer.getStageConfigurer().setOutputSchema(Schema.parseJson(conf.schema));
 63 |       } catch (IOException e) {
 64 |         throw new IllegalArgumentException("Unable to parse configured schema: " + e.getMessage(), e);
 65 |       }
 66 |     }
 67 |   }
 68 | 
 69 |   @Override
 70 |   public void initialize(TransformContext context) throws IOException {
 71 |     configuredOutputSchema = conf.schema == null ? null : Schema.parseJson(conf.schema);
 72 |     addOpType = configuredOutputSchema.getField(OP_TYPE) != null;
 73 |     addTrackingVersion = configuredOutputSchema.getField(CHANGE_TRACKING_VERSION) != null;
 74 |     addTimestamp = configuredOutputSchema.getField(CDC_TIMESTAMP) != null;
 75 |     schemaCache = new HashMap<>();
 76 |   }
 77 | 
 78 |   @Override
 79 |   public void transform(StructuredRecord record, Emitter<StructuredRecord> emitter) throws Exception {
 80 |     StructuredRecord dml = record.get("dml");
 81 |     if (dml == null) {
 82 |       return;
 83 |     }
 84 | 
 85 |     Schema rowSchema = Schema.parseJson((String) dml.get("rows_schema"));
 86 |     Schema outputSchema = schemaCache.computeIfAbsent(rowSchema, this::createOutputSchema);
 87 | 
 88 |     StructuredRecord.Builder output = StructuredRecord.builder(outputSchema);
 89 |     if (addOpType) {
 90 |       output.set(OP_TYPE, dml.get("op_type").toString());
 91 |     }
 92 |     if (addTrackingVersion) {
 93 |       output.set(CHANGE_TRACKING_VERSION, dml.get("change_tracking_version"));
 94 |     }
 95 |     if (addTimestamp) {
 96 |       output.set(CDC_TIMESTAMP, dml.get("cdc_current_timestamp"));
 97 |     }
 98 |     Map<String, Object> valueMap = dml.get("rows_values");
 99 |     if (valueMap == null) {
100 |       valueMap = new HashMap<>();
101 |     }
102 |     for (Map.Entry<String, Object> entry : valueMap.entrySet()) {
103 |       output.set(entry.getKey(), entry.getValue());
104 |     }
105 |     emitter.emit(output.build());
106 |   }
107 | 
108 |   private Schema createOutputSchema(Schema rowSchema) {
109 |     // the transform optionally adds a OP_TYPE field and CHANGE_TRACKING_VERSION field that do not come from the
110 |     // actual row data, but from general change tracking information.
111 |     int numFields = rowSchema.getFields().size() + (addOpType ? 1 : 0) +
112 |       (addTrackingVersion ? 1 : 0) + (addTimestamp ? 1 : 0);
113 |     List<Schema.Field> fields = new ArrayList<>(numFields);
114 |     fields.addAll(rowSchema.getFields());
115 |     if (addOpType) {
116 |       fields.add(Schema.Field.of(OP_TYPE, Schema.of(Schema.Type.STRING)));
117 |     }
118 |     if (addTrackingVersion) {
119 |       fields.add(Schema.Field.of(CHANGE_TRACKING_VERSION, Schema.of(Schema.Type.STRING)));
120 |     }
121 |     if (addTimestamp) {
122 |       fields.add(Schema.Field.of(CDC_TIMESTAMP, Schema.of(Schema.LogicalType.TIMESTAMP_MICROS)));
123 |     }
124 |     return Schema.recordOf(rowSchema + ".added", fields);
125 |   }
126 | 
127 |   /**
128 |    * plugin config.
129 |    */
130 |   public static class Conf extends PluginConfig {
131 | 
132 |     @Nullable
133 |     @Description("The output schema of DML records. This should only be set if the source has been configured to read "
134 |       + "from a single table whose schema will never change.")
135 |     private String schema;
136 |   }
137 | }
138 | 


--------------------------------------------------------------------------------
/src/main/java/io/cdap/plugin/cdc/sink/CDCBigTable.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright © 2019 Cask Data, Inc.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  5 |  * use this file except in compliance with the License. You may obtain a copy of
  6 |  * the License at
  7 |  *
  8 |  * http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 13 |  * License for the specific language governing permissions and limitations under
 14 |  * the License.
 15 |  */
 16 | 
 17 | package io.cdap.plugin.cdc.sink;
 18 | 
 19 | import com.google.cloud.bigtable.hbase.BigtableConfiguration;
 20 | import com.google.cloud.bigtable.hbase.BigtableOptionsFactory;
 21 | import io.cdap.cdap.api.annotation.Name;
 22 | import io.cdap.cdap.api.annotation.Plugin;
 23 | import io.cdap.cdap.api.data.format.StructuredRecord;
 24 | import io.cdap.cdap.etl.api.PipelineConfigurer;
 25 | import io.cdap.cdap.etl.api.batch.SparkExecutionPluginContext;
 26 | import io.cdap.cdap.etl.api.batch.SparkPluginContext;
 27 | import io.cdap.cdap.etl.api.batch.SparkSink;
 28 | import io.cdap.cdap.etl.api.validation.InvalidStageException;
 29 | import io.cdap.plugin.cdc.common.Schemas;
 30 | import io.cdap.plugin.cdc.common.SparkConfigs;
 31 | import io.cdap.plugin.common.batch.JobUtils;
 32 | import org.apache.hadoop.conf.Configuration;
 33 | import org.apache.hadoop.hbase.TableName;
 34 | import org.apache.hadoop.hbase.client.Admin;
 35 | import org.apache.hadoop.hbase.client.Connection;
 36 | import org.apache.hadoop.hbase.client.Table;
 37 | import org.apache.hadoop.mapreduce.Job;
 38 | import org.apache.spark.api.java.JavaRDD;
 39 | 
 40 | import java.io.IOException;
 41 | import java.util.Map;
 42 | 
 43 | /**
 44 |  * BigTable sink for CDC
 45 |  */
 46 | @Plugin(type = SparkSink.PLUGIN_TYPE)
 47 | @Name("CDCBigTable")
 48 | public class CDCBigTable extends SparkSink<StructuredRecord> {
 49 |   private final CDCBigTableConfig config;
 50 | 
 51 |   public CDCBigTable(CDCBigTableConfig config) {
 52 |     this.config = config;
 53 |   }
 54 | 
 55 |   @Override
 56 |   public void prepareRun(SparkPluginContext context) throws Exception {
 57 |   }
 58 | 
 59 |   @Override
 60 |   public void configurePipeline(PipelineConfigurer pipelineConfigurer) {
 61 |     config.validate();
 62 |     if (!Schemas.CHANGE_SCHEMA.isCompatible(pipelineConfigurer.getStageConfigurer().getInputSchema())) {
 63 |       throw new InvalidStageException("Input schema is incompatible with change record schema");
 64 |     }
 65 |   }
 66 | 
 67 |   @Override
 68 |   public void run(SparkExecutionPluginContext context, JavaRDD<StructuredRecord> javaRDD) throws Exception {
 69 |     Map<String, String> hadoopConfigs = SparkConfigs.getHadoopConfigs(javaRDD);
 70 |     // maps data sets to each block of computing resources
 71 |     javaRDD.foreachPartition(structuredRecordIterator -> {
 72 |       try (Connection conn = getConnection(hadoopConfigs);
 73 |            Admin hBaseAdmin = conn.getAdmin()) {
 74 |         while (structuredRecordIterator.hasNext()) {
 75 |           StructuredRecord input = structuredRecordIterator.next();
 76 |           StructuredRecord ddlRecord = input.get(Schemas.DDL_FIELD);
 77 |           if (ddlRecord != null) {
 78 |             // Notes: In BigTable, there no such thing as namespace.
 79 |             // Dots are allowed in table names, but colons are not.
 80 |             // If you try a table name with a colon in it, you will get:
 81 |             // io.grpc.StatusRuntimeException: INVALID_ARGUMENT: Invalid id for collection tables : \
 82 |             // Should match [_a-zA-Z0-9][-_.a-zA-Z0-9]* but found 'ns:abcd'
 83 |             String tableName = Schemas.getTableName(ddlRecord.get(Schemas.TABLE_FIELD));
 84 |             CDCTableUtil.createHBaseTable(hBaseAdmin, tableName);
 85 |           }
 86 |           StructuredRecord dmlRecord = input.get(Schemas.DML_FIELD);
 87 |           if (dmlRecord != null) {
 88 |             String tableName = Schemas.getTableName(dmlRecord.get(Schemas.TABLE_FIELD));
 89 |             Table table = hBaseAdmin.getConnection().getTable(TableName.valueOf(tableName));
 90 |             CDCTableUtil.updateHBaseTable(table, dmlRecord);
 91 |           }
 92 |         }
 93 |       }
 94 |     });
 95 |   }
 96 | 
 97 |   private Connection getConnection(Map<String, String> hadoopConfigs) throws IOException {
 98 |     ClassLoader oldClassLoader = Thread.currentThread().getContextClassLoader();
 99 |     // Switch the context classloader to plugin class' classloader (PluginClassLoader) so that
100 |     // when Job/Configuration is created, it uses PluginClassLoader to load resources (hbase-default.xml)
101 |     // which is present in the plugin jar and is not visible in the CombineClassLoader (which is what oldClassLoader
102 |     // points to).
103 |     Thread.currentThread().setContextClassLoader(getClass().getClassLoader());
104 |     try {
105 |       Job job = JobUtils.createInstance();
106 | 
107 |       Configuration conf = job.getConfiguration();
108 | 
109 |       for (Map.Entry<String, String> configEntry : hadoopConfigs.entrySet()) {
110 |         conf.set(configEntry.getKey(), configEntry.getValue());
111 |       }
112 | 
113 |       String projectId = config.resolveProject();
114 |       String serviceAccountFilePath = config.resolveServiceAccountFilePath();
115 |       BigtableConfiguration.configure(conf, projectId, config.instance);
116 |       if (serviceAccountFilePath != null) {
117 |         conf.set(BigtableOptionsFactory.BIGTABLE_SERVICE_ACCOUNT_JSON_KEYFILE_LOCATION_KEY, serviceAccountFilePath);
118 |       }
119 | 
120 |       return BigtableConfiguration.connect(conf);
121 |     } finally {
122 |       // Switch back to the original
123 |       Thread.currentThread().setContextClassLoader(oldClassLoader);
124 |     }
125 |   }
126 | }
127 | 


--------------------------------------------------------------------------------
/src/test/java/io/cdap/plugin/cdc/source/oracle/GoldenGateKafkaConfigUnitTest.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright © 2019 Cask Data, Inc.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  5 |  * use this file except in compliance with the License. You may obtain a copy of
  6 |  * the License at
  7 |  *
  8 |  * http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 13 |  * License for the specific language governing permissions and limitations under
 14 |  * the License.
 15 |  */
 16 | 
 17 | package io.cdap.plugin.cdc.source.oracle;
 18 | 
 19 | import io.cdap.cdap.etl.api.validation.InvalidConfigPropertyException;
 20 | import io.cdap.plugin.common.Constants;
 21 | import org.junit.Assert;
 22 | import org.junit.Test;
 23 | 
 24 | public class GoldenGateKafkaConfigUnitTest {
 25 |   private static final String VALID_REF = "test-ref";
 26 |   private static final String VALID_BROKER = "localhost:9092";
 27 |   private static final String VALID_TOPIC = "topic1";
 28 |   private static final Long VALID_DEFAULT_INITIAL_OFFSET = 0L;
 29 |   private static final Integer VALID_MAX_RATE_PER_PARTITION = 0;
 30 | 
 31 |   @Test
 32 |   public void testValidateValidConfig() {
 33 |     GoldenGateKafkaConfig config = new GoldenGateKafkaConfig(
 34 |       VALID_REF,
 35 |       VALID_BROKER,
 36 |       VALID_TOPIC,
 37 |       VALID_DEFAULT_INITIAL_OFFSET,
 38 |       VALID_MAX_RATE_PER_PARTITION
 39 |     );
 40 | 
 41 |     config.validate();
 42 |   }
 43 | 
 44 |   @Test
 45 |   public void testValidateReference() {
 46 |     GoldenGateKafkaConfig config = new GoldenGateKafkaConfig(
 47 |       "",
 48 |       VALID_BROKER,
 49 |       VALID_TOPIC,
 50 |       VALID_DEFAULT_INITIAL_OFFSET,
 51 |       VALID_MAX_RATE_PER_PARTITION
 52 |     );
 53 | 
 54 |     try {
 55 |       config.validate();
 56 |       Assert.fail(String.format("Expected to throw %s", InvalidConfigPropertyException.class.getName()));
 57 |     } catch (InvalidConfigPropertyException e) {
 58 |       Assert.assertEquals(Constants.Reference.REFERENCE_NAME, e.getProperty());
 59 |     }
 60 |   }
 61 | 
 62 |   @Test
 63 |   public void testValidateMissingBroker() {
 64 |     GoldenGateKafkaConfig config = new GoldenGateKafkaConfig(
 65 |       VALID_REF,
 66 |       null,
 67 |       VALID_TOPIC,
 68 |       VALID_DEFAULT_INITIAL_OFFSET,
 69 |       VALID_MAX_RATE_PER_PARTITION
 70 |     );
 71 | 
 72 |     try {
 73 |       config.validate();
 74 |       Assert.fail(String.format("Expected to throw %s", InvalidConfigPropertyException.class.getName()));
 75 |     } catch (InvalidConfigPropertyException e) {
 76 |       Assert.assertEquals(GoldenGateKafkaConfig.BROKER, e.getProperty());
 77 |     }
 78 |   }
 79 | 
 80 |   @Test
 81 |   public void testValidateEmptyBroker() {
 82 |     GoldenGateKafkaConfig config = new GoldenGateKafkaConfig(
 83 |       VALID_REF,
 84 |       "",
 85 |       VALID_TOPIC,
 86 |       VALID_DEFAULT_INITIAL_OFFSET,
 87 |       VALID_MAX_RATE_PER_PARTITION
 88 |     );
 89 | 
 90 |     try {
 91 |       config.validate();
 92 |       Assert.fail(String.format("Expected to throw %s", InvalidConfigPropertyException.class.getName()));
 93 |     } catch (InvalidConfigPropertyException e) {
 94 |       Assert.assertEquals(GoldenGateKafkaConfig.BROKER, e.getProperty());
 95 |     }
 96 |   }
 97 | 
 98 |   @Test
 99 |   public void testValidateWronglyFormattedBroker() {
100 |     GoldenGateKafkaConfig config = new GoldenGateKafkaConfig(
101 |       VALID_REF,
102 |       "localhost",
103 |       VALID_TOPIC,
104 |       VALID_DEFAULT_INITIAL_OFFSET,
105 |       VALID_MAX_RATE_PER_PARTITION
106 |     );
107 | 
108 |     try {
109 |       config.validate();
110 |       Assert.fail(String.format("Expected to throw %s", InvalidConfigPropertyException.class.getName()));
111 |     } catch (InvalidConfigPropertyException e) {
112 |       Assert.assertEquals(GoldenGateKafkaConfig.BROKER, e.getProperty());
113 |     }
114 |   }
115 | 
116 |   @Test
117 |   public void testValidateMissingTopic() {
118 |     GoldenGateKafkaConfig config = new GoldenGateKafkaConfig(
119 |       VALID_REF,
120 |       VALID_BROKER,
121 |       null,
122 |       VALID_DEFAULT_INITIAL_OFFSET,
123 |       VALID_MAX_RATE_PER_PARTITION
124 |     );
125 | 
126 |     try {
127 |       config.validate();
128 |       Assert.fail(String.format("Expected to throw %s", InvalidConfigPropertyException.class.getName()));
129 |     } catch (InvalidConfigPropertyException e) {
130 |       Assert.assertEquals(GoldenGateKafkaConfig.TOPIC, e.getProperty());
131 |     }
132 |   }
133 | 
134 |   @Test
135 |   public void testValidateEmptyTopic() {
136 |     GoldenGateKafkaConfig config = new GoldenGateKafkaConfig(
137 |       VALID_REF,
138 |       VALID_BROKER,
139 |       "",
140 |       VALID_DEFAULT_INITIAL_OFFSET,
141 |       VALID_MAX_RATE_PER_PARTITION
142 |     );
143 | 
144 |     try {
145 |       config.validate();
146 |       Assert.fail(String.format("Expected to throw %s", InvalidConfigPropertyException.class.getName()));
147 |     } catch (InvalidConfigPropertyException e) {
148 |       Assert.assertEquals(GoldenGateKafkaConfig.TOPIC, e.getProperty());
149 |     }
150 |   }
151 | 
152 |   @Test
153 |   public void testValidateDefaultInitialOffset() {
154 |     GoldenGateKafkaConfig config = new GoldenGateKafkaConfig(
155 |       VALID_REF,
156 |       VALID_BROKER,
157 |       VALID_TOPIC,
158 |       -3L,
159 |       VALID_MAX_RATE_PER_PARTITION
160 |     );
161 | 
162 |     try {
163 |       config.validate();
164 |       Assert.fail(String.format("Expected to throw %s", InvalidConfigPropertyException.class.getName()));
165 |     } catch (InvalidConfigPropertyException e) {
166 |       Assert.assertEquals(GoldenGateKafkaConfig.DEFAULT_INITIAL_OFFSET, e.getProperty());
167 |     }
168 |   }
169 | 
170 |   @Test
171 |   public void testValidateMaxRatePerPartition() {
172 |     GoldenGateKafkaConfig config = new GoldenGateKafkaConfig(
173 |       VALID_REF,
174 |       VALID_BROKER,
175 |       VALID_TOPIC,
176 |       VALID_DEFAULT_INITIAL_OFFSET,
177 |       -1
178 |     );
179 | 
180 |     try {
181 |       config.validate();
182 |       Assert.fail(String.format("Expected to throw %s", InvalidConfigPropertyException.class.getName()));
183 |     } catch (InvalidConfigPropertyException e) {
184 |       Assert.assertEquals(GoldenGateKafkaConfig.MAX_RATE_PER_PARTITION, e.getProperty());
185 |     }
186 |   }
187 | }
188 | 


--------------------------------------------------------------------------------
/src/main/java/io/cdap/plugin/cdc/sink/CDCKuduConfig.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright © 2019 Cask Data, Inc.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  5 |  * use this file except in compliance with the License. You may obtain a copy of
  6 |  * the License at
  7 |  *
  8 |  * http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 13 |  * License for the specific language governing permissions and limitations under
 14 |  * the License.
 15 |  */
 16 | 
 17 | package io.cdap.plugin.cdc.sink;
 18 | 
 19 | import io.cdap.cdap.api.annotation.Description;
 20 | import io.cdap.cdap.api.annotation.Macro;
 21 | import io.cdap.cdap.api.annotation.Name;
 22 | import io.cdap.plugin.cdc.common.CDCReferencePluginConfig;
 23 | import org.apache.kudu.ColumnSchema;
 24 | 
 25 | import javax.annotation.Nullable;
 26 | 
 27 | /**
 28 |  * Configurations for the Kudu.
 29 |  */
 30 | public class CDCKuduConfig extends CDCReferencePluginConfig {
 31 | 
 32 |   // Required Fields.
 33 | 
 34 |   @Name("master")
 35 |   @Description("Comma-separated list of hostname:port for Kudu masters")
 36 |   @Macro
 37 |   public String optMasterAddresses;
 38 | 
 39 |   // Options Fields
 40 |   @Name("opt-timeout")
 41 |   @Description("Timeout for Kudu operations in milliseconds. Defaults is  '30000 ms'.")
 42 |   @Nullable
 43 |   public String optOperationTimeoutMs;
 44 | 
 45 |   @Name("admin-timeout")
 46 |   @Description("Administration operation time out. Default is '30000 ms'.")
 47 |   @Nullable
 48 |   public String optAdminTimeoutMs;
 49 | 
 50 |   @Name("seed")
 51 |   @Description("Seed to be used for hashing. Default is 0")
 52 |   @Nullable
 53 |   public String optSeed;
 54 | 
 55 |   @Name("replicas")
 56 |   @Description("Specifies the number of replicas for the Kudu tables")
 57 |   @Nullable
 58 |   public String optReplicas;
 59 | 
 60 |   @Name("compression-algo")
 61 |   @Description("Compression algorithm to be applied on the columns. Default is 'snappy'")
 62 |   @Nullable
 63 |   public String optCompressionAlgorithm;
 64 | 
 65 |   @Name("encoding")
 66 |   @Description("Specifies the encoding to be applied on the schema. Default is 'auto'")
 67 |   @Nullable
 68 |   public String optEncoding;
 69 | 
 70 |   @Name("row-flush")
 71 |   @Description("Number of rows that are buffered before flushing to the tablet server")
 72 |   @Nullable
 73 |   public String optFlushRows;
 74 | 
 75 |   @Name("buckets")
 76 |   @Description("Specifies the number of buckets to split the table into.")
 77 |   @Nullable
 78 |   public String optBucketsCounts;
 79 | 
 80 |   @Name("boss-threads")
 81 |   @Description("Specifies the number of boss threads to be used by the client.")
 82 |   @Nullable
 83 |   private String optBossThreads;
 84 | 
 85 |   public CDCKuduConfig(ColumnSchema.CompressionAlgorithm compression) {
 86 |     this("kudu");
 87 |   }
 88 | 
 89 |   public CDCKuduConfig(String referenceName) {
 90 |     super(referenceName);
 91 |   }
 92 | 
 93 |   /**
 94 |    * @return cleaned up master address.
 95 |    */
 96 |   public String getMasterAddress() {
 97 |     return optMasterAddresses.trim();
 98 |   }
 99 | 
100 |   /**
101 |    * @return Compression algorithm to be associated with all the fields.
102 |    */
103 |   public ColumnSchema.CompressionAlgorithm getCompression() {
104 |     ColumnSchema.CompressionAlgorithm algorithm = ColumnSchema.CompressionAlgorithm.SNAPPY;
105 | 
106 |     switch(optCompressionAlgorithm.toLowerCase()) {
107 |       case "snappy":
108 |         algorithm = ColumnSchema.CompressionAlgorithm.SNAPPY;
109 |         break;
110 | 
111 |       case "lz4":
112 |         algorithm = ColumnSchema.CompressionAlgorithm.LZ4;
113 |         break;
114 | 
115 |       case "zlib":
116 |         algorithm = ColumnSchema.CompressionAlgorithm.ZLIB;
117 |         break;
118 | 
119 |       case "backend configured":
120 |         algorithm = ColumnSchema.CompressionAlgorithm.DEFAULT_COMPRESSION;
121 |         break;
122 | 
123 |       case "No Compression":
124 |         algorithm = ColumnSchema.CompressionAlgorithm.NO_COMPRESSION;
125 |         break;
126 |     }
127 |     return algorithm;
128 |   }
129 | 
130 |   /**
131 |    * @return Encoding to be applied to all the columns.
132 |    */
133 |   public ColumnSchema.Encoding getEncoding() {
134 |     ColumnSchema.Encoding encoding = ColumnSchema.Encoding.AUTO_ENCODING;
135 |     switch(optEncoding.toLowerCase()) {
136 |       case "auto":
137 |         encoding = ColumnSchema.Encoding.AUTO_ENCODING;
138 |         break;
139 | 
140 |       case "plain":
141 |         encoding = ColumnSchema.Encoding.PLAIN_ENCODING;
142 |         break;
143 | 
144 |       case "prefix":
145 |         encoding = ColumnSchema.Encoding.PREFIX_ENCODING;
146 |         break;
147 | 
148 |       case "group variant":
149 |         encoding = ColumnSchema.Encoding.GROUP_VARINT;
150 |         break;
151 | 
152 |       case "rle":
153 |         encoding = ColumnSchema.Encoding.RLE;
154 |         break;
155 | 
156 |       case "dictionary":
157 |         encoding = ColumnSchema.Encoding.DICT_ENCODING;
158 |         break;
159 | 
160 |       case "bit shuffle":
161 |         encoding = ColumnSchema.Encoding.BIT_SHUFFLE;
162 |         break;
163 |     }
164 |     return encoding;
165 |   }
166 | 
167 |   /**
168 |    * @return Number of replicas of a table on tablet servers.
169 |    */
170 |   public int getReplicas() {
171 |     return (optReplicas != null) ? Integer.parseInt(optReplicas) : 1;
172 |   }
173 | 
174 |   /**
175 |    * @return Timeout for user operations.
176 |    */
177 |   public int getOperationTimeout() {
178 |     return (optOperationTimeoutMs != null) ? Integer.parseInt(optOperationTimeoutMs) : 30000;
179 |   }
180 | 
181 |   /**
182 |    * @return Number of rows to be cached before being flushed.
183 |    */
184 |   public int getCacheRowCount() {
185 |     return (optFlushRows != null) ? Integer.parseInt(optFlushRows) : 30000;
186 |   }
187 | 
188 |   /**
189 |    * @return Timeout for admin operations.
190 |    */
191 |   public int getAdministrationTimeout() {
192 |     return (optAdminTimeoutMs != null) ? Integer.parseInt(optAdminTimeoutMs) : 30000;
193 |   }
194 | 
195 |   /**
196 |    * @return Number of buckets to be used for storing the rows.
197 |    */
198 |   public int getBuckets() {
199 |     return (optBucketsCounts != null) ? Integer.parseInt(optBucketsCounts) : 16;
200 |   }
201 | 
202 |   /**
203 |    * @return Seed to be used for randomizing rows into hashed buckets.
204 |    */
205 |   public int getSeed() {
206 |     return (optSeed != null) ? Integer.parseInt(optSeed) : 0;
207 |   }
208 | 
209 |   /**
210 |    * @return Number of boss threads to be used.
211 |    */
212 |   public int getThreads() {
213 |     return (optBossThreads != null) ? Integer.parseInt(optBossThreads) : 1;
214 |   }
215 | }
216 | 


--------------------------------------------------------------------------------
/src/main/java/io/cdap/plugin/cdc/sink/CDCTableUtil.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright © 2019 Cask Data, Inc.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  5 |  * use this file except in compliance with the License. You may obtain a copy of
  6 |  * the License at
  7 |  *
  8 |  * http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 13 |  * License for the specific language governing permissions and limitations under
 14 |  * the License.
 15 |  */
 16 | 
 17 | package io.cdap.plugin.cdc.sink;
 18 | 
 19 | import com.google.common.base.Preconditions;
 20 | import io.cdap.cdap.api.common.Bytes;
 21 | import io.cdap.cdap.api.data.format.StructuredRecord;
 22 | import io.cdap.cdap.api.data.schema.Schema;
 23 | import io.cdap.plugin.cdc.common.OperationType;
 24 | import io.cdap.plugin.cdc.common.Schemas;
 25 | import org.apache.hadoop.hbase.HColumnDescriptor;
 26 | import org.apache.hadoop.hbase.HTableDescriptor;
 27 | import org.apache.hadoop.hbase.TableName;
 28 | import org.apache.hadoop.hbase.client.Admin;
 29 | import org.apache.hadoop.hbase.client.Delete;
 30 | import org.apache.hadoop.hbase.client.Put;
 31 | import org.apache.hadoop.hbase.client.Table;
 32 | import org.slf4j.Logger;
 33 | import org.slf4j.LoggerFactory;
 34 | 
 35 | import java.io.IOException;
 36 | import java.nio.ByteBuffer;
 37 | import java.util.List;
 38 | import java.util.Map;
 39 | import java.util.stream.Collectors;
 40 | import javax.annotation.Nullable;
 41 | 
 42 | /**
 43 |  * Utility methods for dealing with Tables, for CDC use cases.
 44 |  */
 45 | public class CDCTableUtil {
 46 | 
 47 |   private static final Logger LOG = LoggerFactory.getLogger(CDCTableUtil.class);
 48 | 
 49 |   public static final String CDC_COLUMN_FAMILY = "cdc";
 50 | 
 51 |   /**
 52 |    * Creates a table using the HBase Admin API.
 53 |    *
 54 |    * @param admin     the HBase Admin to use to create the table
 55 |    * @param tableName the name of the table
 56 |    */
 57 |   public static void createHBaseTable(Admin admin, String tableName) throws IOException {
 58 |     if (!admin.tableExists(TableName.valueOf(tableName))) {
 59 |       HTableDescriptor descriptor = new HTableDescriptor(TableName.valueOf(tableName));
 60 |       descriptor.addFamily(new HColumnDescriptor(CDC_COLUMN_FAMILY));
 61 |       LOG.debug("Creating HBase table {}.", tableName);
 62 |       admin.createTable(descriptor);
 63 |     }
 64 |   }
 65 | 
 66 |   /**
 67 |    * Updates an HBase API Table with a CDC record.
 68 |    *
 69 |    * @param table        the HBase API Table to update
 70 |    * @param dmlRecord the StructuredRecord containing the CDC data
 71 |    */
 72 |   public static void updateHBaseTable(Table table, StructuredRecord dmlRecord) throws Exception {
 73 |     OperationType operationType = OperationType.valueOf(dmlRecord.get(Schemas.OP_TYPE_FIELD));
 74 |     List<String> primaryKeys = dmlRecord.get(Schemas.PRIMARY_KEYS_FIELD);
 75 |     Schema updateSchema = Schema.parseJson((String) dmlRecord.get(Schemas.UPDATE_SCHEMA_FIELD));
 76 |     Map<String, Object> changes = dmlRecord.get(Schemas.UPDATE_VALUES_FIELD);
 77 | 
 78 |     switch (operationType) {
 79 |       case INSERT:
 80 |       case UPDATE:
 81 |         Put put = new Put(getRowKey(primaryKeys, changes));
 82 |         for (Schema.Field field : updateSchema.getFields()) {
 83 |           setPutField(put, field, changes.get(field.getName()));
 84 |         }
 85 |         table.put(put);
 86 |         LOG.debug("Putting row {}", Bytes.toString(getRowKey(primaryKeys, changes)));
 87 |         break;
 88 |       case DELETE:
 89 |         Delete delete = new Delete(getRowKey(primaryKeys, changes));
 90 |         table.delete(delete);
 91 |         LOG.debug("Deleting row {}", Bytes.toString(getRowKey(primaryKeys, changes)));
 92 |         break;
 93 |       default:
 94 |         LOG.warn("Operation of type '{}' will be ignored.", operationType);
 95 |     }
 96 |   }
 97 | 
 98 |   private static byte[] getRowKey(List<String> primaryKeys, Map<String, Object> changes) {
 99 |     // the primary keys are always in sorted order
100 |     String joinedValue = primaryKeys.stream()
101 |       .sorted()
102 |       .map(primaryKey -> changes.get(primaryKey).toString())
103 |       .collect(Collectors.joining(":"));
104 |     return Bytes.toBytes(joinedValue);
105 |   }
106 | 
107 |   // get the non-nullable type of the field and check that it's a simple type.
108 |   private static Schema.Type validateAndGetType(Schema.Field field) {
109 |     Schema.Type type;
110 |     if (field.getSchema().isNullable()) {
111 |       type = field.getSchema().getNonNullable().getType();
112 |     } else {
113 |       type = field.getSchema().getType();
114 |     }
115 |     Preconditions.checkArgument(type.isSimpleType(),
116 |                                 "only simple types are supported (boolean, int, long, float, double, bytes).");
117 |     return type;
118 |   }
119 | 
120 |   private static void setPutField(Put put, Schema.Field field, @Nullable Object val) {
121 |     Schema.Type type = validateAndGetType(field);
122 |     String column = field.getName();
123 |     if (val == null) {
124 |       put.addColumn(Bytes.toBytes(CDC_COLUMN_FAMILY), Bytes.toBytes(column), null);
125 |       return;
126 |     }
127 | 
128 |     switch (type) {
129 |       case BOOLEAN:
130 |         put.addColumn(Bytes.toBytes(CDC_COLUMN_FAMILY), Bytes.toBytes(column), Bytes.toBytes((Boolean) val));
131 |         break;
132 |       case INT:
133 |         put.addColumn(Bytes.toBytes(CDC_COLUMN_FAMILY), Bytes.toBytes(column),
134 |                       Bytes.toBytes(((Number) val).intValue()));
135 |         break;
136 |       case LONG:
137 |         put.addColumn(Bytes.toBytes(CDC_COLUMN_FAMILY), Bytes.toBytes(column),
138 |                       Bytes.toBytes(((Number) val).longValue()));
139 |         break;
140 |       case FLOAT:
141 |         put.addColumn(Bytes.toBytes(CDC_COLUMN_FAMILY), Bytes.toBytes(column),
142 |                       Bytes.toBytes(((Number) val).floatValue()));
143 |         break;
144 |       case DOUBLE:
145 |         put.addColumn(Bytes.toBytes(CDC_COLUMN_FAMILY), Bytes.toBytes(column),
146 |                       Bytes.toBytes(((Number) val).doubleValue()));
147 |         break;
148 |       case BYTES:
149 |         if (val instanceof ByteBuffer) {
150 |           put.addColumn(Bytes.toBytes(CDC_COLUMN_FAMILY), Bytes.toBytes(column), Bytes.toBytes((ByteBuffer) val));
151 |         } else {
152 |           put.addColumn(Bytes.toBytes(CDC_COLUMN_FAMILY), Bytes.toBytes(column), (byte[]) val);
153 |         }
154 |         break;
155 |       case STRING:
156 |         put.addColumn(Bytes.toBytes(CDC_COLUMN_FAMILY), Bytes.toBytes(column), Bytes.toBytes((String) val));
157 |         break;
158 |       default:
159 |         throw new IllegalArgumentException("Field " + field.getName() + " is of unsupported type " + type);
160 |     }
161 |   }
162 | }
163 | 


--------------------------------------------------------------------------------
/src/main/java/io/cdap/plugin/cdc/source/sqlserver/CTSQLServerConfig.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright © 2019 Cask Data, Inc.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  5 |  * use this file except in compliance with the License. You may obtain a copy of
  6 |  * the License at
  7 |  *
  8 |  * http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 13 |  * License for the specific language governing permissions and limitations under
 14 |  * the License.
 15 |  */
 16 | 
 17 | package io.cdap.plugin.cdc.source.sqlserver;
 18 | 
 19 | import io.cdap.cdap.api.annotation.Description;
 20 | import io.cdap.cdap.api.annotation.Name;
 21 | import io.cdap.cdap.api.plugin.PluginConfig;
 22 | import io.cdap.cdap.etl.api.validation.InvalidConfigPropertyException;
 23 | import io.cdap.plugin.cdc.common.CDCReferencePluginConfig;
 24 | 
 25 | import java.util.Arrays;
 26 | import java.util.Collections;
 27 | import java.util.Set;
 28 | import java.util.stream.Collectors;
 29 | import javax.annotation.Nullable;
 30 | 
 31 | /**
 32 |  * Defines the {@link PluginConfig} for the {@link CTSQLServer}.
 33 |  */
 34 | public class CTSQLServerConfig extends CDCReferencePluginConfig {
 35 | 
 36 |   public static final String HOST_NAME = "hostname";
 37 |   public static final String PORT = "port";
 38 |   public static final String USERNAME = "username";
 39 |   public static final String PASSWORD = "password";
 40 |   public static final String DATABASE_NAME = "dbname";
 41 |   public static final String SEQUENCE_START_NUM = "sequenceStartNum";
 42 |   public static final String MAX_RETRY_SECONDS = "maxRetrySeconds";
 43 |   public static final String MAX_BATCH_SIZE = "maxBatchSize";
 44 |   public static final String TABLE_WHITELIST = "tableWhitelist";
 45 |   public static final String JDBC_PLUGIN_NAME = "jdbcPluginName";
 46 |   public static final String CONNECTION_STRING = "connectionString";
 47 | 
 48 |   @Name(HOST_NAME)
 49 |   @Description("SQL Server hostname. This is not required if a connection string was specified.")
 50 |   @Nullable
 51 |   private final String hostname;
 52 | 
 53 |   @Name(PORT)
 54 |   @Description("SQL Server port. This is not required if a connection string was specified.")
 55 |   @Nullable
 56 |   private final Integer port;
 57 | 
 58 |   @Name(DATABASE_NAME)
 59 |   @Description("SQL Server database name. Note: CT must be enabled on the database for change tracking.")
 60 |   @Nullable
 61 |   private String dbName;
 62 | 
 63 |   @Name(USERNAME)
 64 |   @Description("User to use to connect to the specified database. Required for databases that " +
 65 |     "need authentication. Optional for databases that do not require authentication.")
 66 |   @Nullable
 67 |   private final String username;
 68 | 
 69 |   @Name(PASSWORD)
 70 |   @Description("Password to use to connect to the specified database. Required for databases that " +
 71 |     "need authentication. Optional for databases that do not require authentication.")
 72 |   @Nullable
 73 |   private final String password;
 74 | 
 75 |   @Name(MAX_RETRY_SECONDS)
 76 |   @Description("Maximum amount of time to retry reading change events if there is an error. "
 77 |     + "If no retries should be done, this should be set to 0. "
 78 |     + "If there should not be a retry limit, this should be set to a negative number or left empty.")
 79 |   @Nullable
 80 |   private final Long maxRetrySeconds;
 81 | 
 82 |   @Name(SEQUENCE_START_NUM)
 83 |   @Description("The Change Tracking sequence number to start from.")
 84 |   @Nullable
 85 |   private final Long sequenceStartNum;
 86 | 
 87 |   @Name(MAX_BATCH_SIZE)
 88 |   @Description("Maximum number of changes to consume in a single batch interval.")
 89 |   @Nullable
 90 |   private final Integer maxBatchSize;
 91 | 
 92 |   @Name(TABLE_WHITELIST)
 93 |   @Description("A whitelist of tables to consume changes from. "
 94 |     + "If none is specified, changes from all tables will be consumed.")
 95 |   @Nullable
 96 |   private final String tableWhitelist;
 97 | 
 98 |   @Description("Name of the JDBC plugin to use if something different than the built-in sql server driver is required.")
 99 |   @Nullable
100 |   private final String jdbcPluginName;
101 | 
102 |   @Description("Connection string to use when connecting to the database through JDBC. "
103 |     + "This is required if a JDBC plugin was specified.")
104 |   @Nullable
105 |   private final String connectionString;
106 | 
107 |   public CTSQLServerConfig() {
108 |     super("");
109 |     this.hostname = null;
110 |     this.port = 1433;
111 |     this.dbName = null;
112 |     this.username = null;
113 |     this.password = null;
114 |     this.sequenceStartNum = 0L;
115 |     this.maxRetrySeconds = -1L;
116 |     this.maxBatchSize = 100000;
117 |     this.tableWhitelist = null;
118 |     this.jdbcPluginName = null;
119 |     this.connectionString = null;
120 |   }
121 | 
122 |   public String getHostname() {
123 |     return hostname;
124 |   }
125 | 
126 |   public int getPort() {
127 |     return port;
128 |   }
129 | 
130 |   public String getDbName() {
131 |     return dbName;
132 |   }
133 | 
134 |   @Nullable
135 |   public String getUsername() {
136 |     return username;
137 |   }
138 | 
139 |   @Nullable
140 |   public String getPassword() {
141 |     return password;
142 |   }
143 | 
144 |   public long getSequenceStartNum() {
145 |     return sequenceStartNum == null ? 0L : sequenceStartNum;
146 |   }
147 | 
148 |   public long getMaxRetrySeconds() {
149 |     return maxRetrySeconds == null ? -1L : maxRetrySeconds;
150 |   }
151 | 
152 |   public int getMaxBatchSize() {
153 |     return maxBatchSize == null ? 100000 : maxBatchSize;
154 |   }
155 | 
156 |   public Set<String> getTableWhitelist() {
157 |     return tableWhitelist == null ? Collections.emptySet() :
158 |       Arrays.stream(tableWhitelist.split(",")).map(String::trim).collect(Collectors.toSet());
159 |   }
160 | 
161 |   @Nullable
162 |   public String getJdbcPluginName() {
163 |     return jdbcPluginName;
164 |   }
165 | 
166 |   public String getConnectionString() {
167 |     if (connectionString != null) {
168 |       return connectionString;
169 |     }
170 |     return String.format("jdbc:sqlserver://%s:%s;DatabaseName=%s", hostname, port, dbName);
171 |   }
172 | 
173 |   @Override
174 |   public void validate() {
175 |     super.validate();
176 |     if (jdbcPluginName != null && connectionString == null) {
177 |       throw new InvalidConfigPropertyException(
178 |         "A connection string must be specified when a custom jdbc driver is used.", CONNECTION_STRING);
179 |     }
180 | 
181 |     if (dbName == null) {
182 |       throw new InvalidConfigPropertyException("A database name must be specified", DATABASE_NAME);
183 |     }
184 | 
185 |     if (connectionString == null) {
186 |       if (hostname == null) {
187 |         throw new InvalidConfigPropertyException("A hostname must be specified", HOST_NAME);
188 |       }
189 |       if (port == null) {
190 |         throw new InvalidConfigPropertyException("A port must be specified", PORT);
191 |       }
192 |     }
193 | 
194 |     if (port != null && (port < 0 || port > 65535)) {
195 |       throw new InvalidConfigPropertyException("Port number should be in range 0-65535", PORT);
196 |     }
197 |   }
198 | }
199 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
  2 | [![Join CDAP community](https://cdap-users.herokuapp.com/badge.svg?t=wrangler)](https://cdap-users.herokuapp.com?t=1)
  3 | 
  4 | Change Data Capture (Alpha)
  5 | ===========================
  6 | 
  7 | In databases, Change Data Capture(CDC) is used to determine and track the data that has changed so that
  8 | action can be taken using the changed data. This repository contains CDAP plugins which allows to capture
  9 | the changes from databases such as Oracle and Microsoft SQL Server and to push those changes in realtime
 10 | to sinks such as Kudu, HBase, and Database.
 11 | 
 12 | * [Oracle](docs/oracle/Oracle.md)
 13 | * [Microsoft SQL Server](docs/CTSQLServer.md)
 14 | 
 15 | # Overview
 16 | 
 17 | Following plugins are available in this repository. 
 18 | 
 19 |   * [Google Cloud BigTable Sink](docs/CDCBigTable-sparksink.md)
 20 |   * [HBase Sink](docs/CDCHBase-sparksink.md)
 21 |   * Kudu Sink 
 22 |   * [Golden Gate Kafka Source](docs/oracle/Oracle.md)
 23 |   * [SQL Server Change Tracking Streaming Source](docs/CTSQLServer.md)
 24 |   
 25 | # Development
 26 | 
 27 | ## Run Integration Tests
 28 | It is possible to run integration tests against **local** (see [Setup Local Environment](#setup-local-environment)) 
 29 | or **remote environment**.
 30 | 
 31 | Run tests against **local** environment:
 32 | ```bash
 33 | mvn clean test
 34 | ```
 35 | 
 36 | Run tests against **remote** environment:
 37 | ```bash
 38 | mvn clean test -DinstanceUri=<HostAndPort>
 39 | ```
 40 | 
 41 | To use **remote environment** you may configure the following system properties:
 42 | * **test.sql-server.host** - SQL Server host. Default: localhost.
 43 | * **test.sql-server.port** - SQL Server port. Default: 1433.
 44 | * **test.sql-server.username** - SQL Server username. This user should have permissions to create databases.
 45 |  Default: SA.
 46 | * **test.sql-server.password** - SQL Server password. Default: 123Qwe123.
 47 | * **test.sql-server.namespace** - SQL Server namespace for test databases. Default: dbo.
 48 | * **test.bigtable.project** - Google Cloud Project ID. Default: lookup from local environment.
 49 | * **test.bigtable.instance** - Bigtable Instance ID. Default: null.
 50 | * **test.bigtable.serviceFilePath** - Path on the local file system of the service account key used for
 51 |   authorization. Default: lookup from local environment.
 52 | * **test.oracle-db.host** - Oracle DB host. Default: localhost.
 53 | * **test.oracle-db.port** - Oracle DB port. Default: 1521.
 54 | * **test.oracle-db.service** - Oracle DB service name. Default: XE.
 55 | * **test.oracle-db.username** - Oracle DB username. Default: trans_user.
 56 | * **test.oracle-db.password** - Oracle DB password. Default: trans_user.
 57 | * **test.oracle-db.driver.jar** - Path to Oracle Java Driver jar file. Default: null.
 58 | * **test.oracle-db.driver.class** - Oracle Java Driver class name. Default: oracle.jdbc.OracleDriver.
 59 | * **test.goldengate.broker** - Kafka broker specified in host:port form. Default: localhost:9092.
 60 | * **test.goldengate.topic** - Name of the topic to which Golden Gate publishes the DDL and DML changes. 
 61 | Default: oggtopic.
 62 |   
 63 | **NOTE:** Bigtable Sink tests will be skipped without provided properties.
 64 | **NOTE:** Golden Gate Kafka Source tests will be skipped without provided properties.
 65 | 
 66 | ## Run Performance Tests
 67 | It is possible to run performance tests against **local** (see [Setup Local Environment](#setup-local-environment)) 
 68 | or **remote environment**.
 69 | 
 70 | Run tests against **local** environment:
 71 | ```bash
 72 | mvn clean test -P perf-tests
 73 | ```
 74 | 
 75 | Run tests against **remote** environment:
 76 | ```bash
 77 | mvn clean test -P perf-tests -DinstanceUri=<HostAndPort>
 78 | ```
 79 | 
 80 | Common system properties for tests:
 81 | * **ptest.test-data.load** - Prepare and load test data to source storage. Default: true.
 82 | * **ptest.test-data.inserts** - Number of records to prepare. Default: 5000.
 83 | * **ptest.target-table-created-timeout.seconds** - Timeout for table creation in sink storage. Default: 300.
 84 | * **ptest.data-transferred-timeout.seconds** - Timeout for data transfer to target storage. Default: 600.
 85 | 
 86 | To use **remote environment** you may configure the following system properties:
 87 | * **ptest.sql-server.host** - SQL Server host. Default: localhost.
 88 | * **ptest.sql-server.port** - SQL Server port. Default: 1433.
 89 | * **ptest.sql-server.username** - SQL Server username. This user should have permissions to create databases.
 90 |  Default: SA.
 91 | * **ptest.sql-server.password** - SQL Server password. Default: 123Qwe123.
 92 | * **ptest.bigtable.project** - Google Cloud Project ID. Default: lookup from local environment.
 93 | * **ptest.bigtable.instance** - Bigtable Instance ID. Default: null.
 94 | * **ptest.bigtable.serviceFilePath** - Path on the local file system of the service account key used for
 95 |   authorization. Default: lookup from local environment.
 96 |   
 97 | **NOTE:** Bigtable Sink tests will be skipped without provided properties.
 98 | 
 99 | ## Setup Local Environment
100 | To start local environment you should:
101 | * [Install Docker Compose](https://docs.docker.com/compose/install/)
102 | * Build local docker images
103 |   * [Build Oracle DB docker image](https://github.com/oracle/docker-images/tree/master/OracleDatabase/SingleInstance)
104 |   * [Build Oracle GoldenGate docker image](https://github.com/oracle/docker-images/tree/master/OracleGoldenGate)
105 | * Start environment by running commands:
106 |   ```bash
107 |   cd docker-compose/cdc-env/
108 |   docker-compose up -d
109 |   ```
110 | * Configure GoldenGate for Oracle:
111 |   * Start ggsci:
112 |     ```bash
113 |     docker-compose exec --user oracle goldengate_oracle ggsci
114 |     ```
115 |   * Configure user credentials:
116 |     ```bash
117 |     ADD credentialstore  
118 |     alter credentialstore add user gg_extract@oracledb:1521/xe password gg_extract alias oggadmin 
119 |     ```
120 |   * Change source schema configuration:
121 |     ```bash
122 |     DBLOGIN USERIDALIAS oggadmin
123 |     add schematrandata trans_user ALLCOLS
124 |     ```
125 |   * Define the Extract and start it 
126 |   (all EXTRACT params are defined in docker-compose/cdc-env/GoldenGate/dirprm/ext1.prm):
127 |     ```bash
128 |     ADD EXTRACT ext1, TRANLOG, BEGIN NOW
129 |     ADD EXTTRAIL /u01/app/ogg/dirdat/in, EXTRACT ext1
130 |     START ext1
131 |     ```
132 |   * Check its status:
133 |     ```bash
134 |     INFO ext1
135 |     ```
136 | * Configure GoldenGate for BigData:
137 |   * Start ggsci:
138 |     ```bash
139 |     docker-compose exec --user oracle goldengate_bigdata ggsci
140 |     ```
141 |   * Define the Replicat and start it
142 |   (all REPLICAT params are defined in docker-compose/cdc-env/GoldenGate-Bigdata/dirprm/rconf.prm):
143 |     ```bash
144 |     ADD REPLICAT rconf, EXTTRAIL /u01/app/ogg/dirdat/in
145 |     START rconf
146 |     ```
147 |   * Check its status:
148 |     ```bash
149 |     INFO RCONF
150 |     ```
151 | NOTE: More info about *.prm files - https://docs.oracle.com/goldengate/1212/gg-winux/GWURF/gg_parameters.htm#GWURF394
152 |  
153 | # Contact
154 | 
155 | ## Mailing Lists
156 | 
157 | CDAP User Group and Development Discussions:
158 | 
159 | * [cdap-user@googlegroups.com](https://groups.google.com/d/forum/cdap-user)
160 | 
161 | The *cdap-user* mailing list is primarily for users using the product to develop
162 | applications or building plugins for appplications. You can expect questions from
163 | users, release announcements, and any other discussions that we think will be helpful
164 | to the users.
165 | 
166 | # License and Trademarks
167 | 
168 | Copyright © 2016-2019 Cask Data, Inc.
169 | 
170 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
171 | in compliance with the License. You may obtain a copy of the License at
172 | 
173 | http://www.apache.org/licenses/LICENSE-2.0
174 | 
175 | Unless required by applicable law or agreed to in writing, software distributed under the
176 | License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
177 | either express or implied. See the License for the specific language governing permissions
178 | and limitations under the License.
179 | 
180 | Cask is a trademark of Cask Data, Inc. All rights reserved.
181 | 
182 | Apache, Apache HBase, and HBase are trademarks of The Apache Software Foundation. Used with
183 | permission. No endorsement by The Apache Software Foundation is implied by the use of these marks.
184 | 
185 | 
186 | 
187 | 
188 | 


--------------------------------------------------------------------------------
/src/main/java/io/cdap/plugin/cdc/source/sqlserver/CTSQLServer.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright © 2019 Cask Data, Inc.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  5 |  * use this file except in compliance with the License. You may obtain a copy of
  6 |  * the License at
  7 |  *
  8 |  * http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 13 |  * License for the specific language governing permissions and limitations under
 14 |  * the License.
 15 |  */
 16 | 
 17 | package io.cdap.plugin.cdc.source.sqlserver;
 18 | 
 19 | import io.cdap.cdap.api.annotation.Description;
 20 | import io.cdap.cdap.api.annotation.Name;
 21 | import io.cdap.cdap.api.annotation.Plugin;
 22 | import io.cdap.cdap.api.data.format.StructuredRecord;
 23 | import io.cdap.cdap.api.dataset.DatasetProperties;
 24 | import io.cdap.cdap.api.plugin.PluginProperties;
 25 | import io.cdap.cdap.etl.api.PipelineConfigurer;
 26 | import io.cdap.cdap.etl.api.streaming.StreamingContext;
 27 | import io.cdap.cdap.etl.api.streaming.StreamingSource;
 28 | import io.cdap.cdap.etl.api.validation.InvalidConfigPropertyException;
 29 | import io.cdap.cdap.etl.api.validation.InvalidStageException;
 30 | import io.cdap.plugin.cdc.common.DBUtils;
 31 | import io.cdap.plugin.cdc.common.DriverCleanup;
 32 | import io.cdap.plugin.cdc.common.Schemas;
 33 | import io.cdap.plugin.common.Constants;
 34 | import org.apache.spark.api.java.Optional;
 35 | import org.apache.spark.api.java.function.Function4;
 36 | import org.apache.spark.rdd.JdbcRDD;
 37 | import org.apache.spark.streaming.State;
 38 | import org.apache.spark.streaming.StateSpec;
 39 | import org.apache.spark.streaming.Time;
 40 | import org.apache.spark.streaming.api.java.JavaDStream;
 41 | import org.slf4j.Logger;
 42 | import org.slf4j.LoggerFactory;
 43 | import scala.Tuple2;
 44 | import scala.reflect.ClassTag;
 45 | import scala.reflect.ClassTag$;
 46 | 
 47 | import java.sql.Connection;
 48 | import java.sql.Driver;
 49 | import java.sql.DriverManager;
 50 | import java.sql.PreparedStatement;
 51 | import java.sql.ResultSet;
 52 | import java.sql.SQLException;
 53 | import java.util.HashMap;
 54 | import java.util.Map;
 55 | 
 56 | /**
 57 |  * Streaming source for reading changes from SQL Server.
 58 |  */
 59 | @Plugin(type = StreamingSource.PLUGIN_TYPE)
 60 | @Name("CTSQLServer")
 61 | @Description("SQL Server Change Tracking Streaming Source")
 62 | public class CTSQLServer extends StreamingSource<StructuredRecord> {
 63 |   private static final Logger LOG = LoggerFactory.getLogger(CTSQLServer.class);
 64 |   static final String JDBC_PLUGIN_ID = "jdbc";
 65 |   private final CTSQLServerConfig conf;
 66 | 
 67 |   public CTSQLServer(CTSQLServerConfig conf) {
 68 |     this.conf = conf;
 69 |   }
 70 | 
 71 |   @Override
 72 |   public void configurePipeline(PipelineConfigurer pipelineConfigurer) {
 73 |     conf.validate();
 74 |     pipelineConfigurer.createDataset(conf.referenceName, Constants.EXTERNAL_DATASET_TYPE, DatasetProperties.EMPTY);
 75 |     pipelineConfigurer.getStageConfigurer().setOutputSchema(Schemas.CHANGE_SCHEMA);
 76 | 
 77 |     DriverCleanup driverCleanup = null;
 78 |     JdbcRDD.ConnectionFactory connectionFactory;
 79 |     if (conf.getJdbcPluginName() != null) {
 80 |       Class<? extends Driver> driverClass = pipelineConfigurer.usePluginClass("jdbc", conf.getJdbcPluginName(),
 81 |                                                                               JDBC_PLUGIN_ID,
 82 |                                                                               PluginProperties.builder().build());
 83 |       if (driverClass == null) {
 84 |         throw new InvalidConfigPropertyException("Unable to find jdbc driver plugin",
 85 |                                                  CTSQLServerConfig.JDBC_PLUGIN_NAME);
 86 |       }
 87 |       try {
 88 |         driverCleanup = DBUtils.ensureJDBCDriverIsAvailable(driverClass, conf.getConnectionString());
 89 |       } catch (IllegalAccessException | InstantiationException | SQLException e) {
 90 |         throw new IllegalArgumentException("Unable to instantiate jdbc driver plugin: " + e.getMessage(), e);
 91 |       }
 92 |       connectionFactory = (JdbcRDD.ConnectionFactory) () -> DriverManager.getConnection(conf.getConnectionString(),
 93 |                                                                                         conf.getUsername(),
 94 |                                                                                         conf.getPassword());
 95 |     } else {
 96 |       connectionFactory = new SQLServerConnectionFactory(conf.getConnectionString(),
 97 |                                                          conf.getUsername(), conf.getPassword());
 98 |     }
 99 | 
100 |     if (conf.getUsername() != null && conf.getPassword() != null) {
101 |       LOG.info("Creating connection with url {}, username {}, password *****",
102 |                getConnectionString(), conf.getUsername());
103 |     } else {
104 |       LOG.info("Creating connection with url {}", getConnectionString());
105 |     }
106 | 
107 |     try (Connection connection = connectionFactory.getConnection()) {
108 |       // check that CDC is enabled on the database
109 |       checkDBCTEnabled(connection, conf.getDbName());
110 |     } catch (InvalidStageException e) {
111 |       // rethrow validation exception
112 |       throw e;
113 |     } catch (Exception e) {
114 |       throw new InvalidStageException(String.format("Failed to check tracking status. Error: %s", e.getMessage()), e);
115 |     } finally {
116 |       if (driverCleanup != null) {
117 |         driverCleanup.destroy();
118 |       }
119 |     }
120 |   }
121 | 
122 |   @Override
123 |   public JavaDStream<StructuredRecord> getStream(StreamingContext context) throws Exception {
124 |     context.registerLineage(conf.referenceName);
125 | 
126 | 
127 |     JdbcRDD.ConnectionFactory connectionFactory;
128 |     if (conf.getJdbcPluginName() != null) {
129 |       connectionFactory = new PluginConnectionFactory(context.getSparkExecutionContext().getPluginContext(),
130 |                                                       context.getStageName(), conf.getConnectionString());
131 |     } else {
132 |       connectionFactory = new SQLServerConnectionFactory(conf.getConnectionString(),
133 |                                                          conf.getUsername(), conf.getPassword());
134 |     }
135 | 
136 |     // get change information dtream. This dstream has both schema and data changes
137 |     LOG.info("Creating change information dstream");
138 |     ClassTag<StructuredRecord> tag = ClassTag$.MODULE$.apply(StructuredRecord.class);
139 |     CTInputDStream dstream = new CTInputDStream(context.getSparkStreamingContext().ssc(), connectionFactory,
140 |                                                 conf.getTableWhitelist(), conf.getSequenceStartNum(),
141 |                                                 conf.getMaxRetrySeconds(), conf.getMaxBatchSize());
142 |     return JavaDStream.fromDStream(dstream, tag)
143 |       .mapToPair(structuredRecord -> new Tuple2<>("", structuredRecord))
144 |       // map the dstream with schema state store to detect changes in schema
145 |       // filter out the ddl record whose schema hasn't changed and then drop all the keys
146 |       .mapWithState(StateSpec.function(schemaStateFunction()))
147 |       .map(Schemas::toCDCRecord);
148 |   }
149 | 
150 |   private void checkDBCTEnabled(Connection connection, String dbName) throws SQLException {
151 |     String query = "SELECT * FROM sys.change_tracking_databases WHERE database_id=DB_ID(?)";
152 |     try (PreparedStatement preparedStatement = connection.prepareStatement(query)) {
153 |       preparedStatement.setString(1, dbName);
154 |       try (ResultSet resultSet = preparedStatement.executeQuery()) {
155 |         if (resultSet.next()) {
156 |           // if resultset is not empty it means that our select with where clause returned data meaning ct is enabled.
157 |           return;
158 |         }
159 |       }
160 |     }
161 |     throw new InvalidStageException(String.format("Change Tracking is not enabled on the specified database '%s'." +
162 |       " Please enable it first.", dbName));
163 |   }
164 | 
165 |   private String getConnectionString() {
166 |     return String.format("jdbc:sqlserver://%s:%s;DatabaseName=%s", conf.getHostname(), conf.getPort(),
167 |                          conf.getDbName());
168 |   }
169 | 
170 |   private static Function4<Time, String, Optional<StructuredRecord>, State<Map<String, String>>,
171 |     Optional<StructuredRecord>> schemaStateFunction() {
172 |     return (time, key, value, state) -> {
173 |       if (!value.isPresent()) {
174 |         return Optional.empty();
175 |       }
176 |       StructuredRecord input = value.get();
177 |       // for dml record we don't need to maintain any state so skip it
178 |       if (Schemas.DML_SCHEMA.getRecordName().equals(input.getSchema().getRecordName())) {
179 |         return Optional.of(input);
180 |       }
181 | 
182 |       // we know now that its a ddl record so process it
183 |       String tableName = input.get(Schemas.TABLE_FIELD);
184 |       String tableSchemaStructure = input.get(Schemas.SCHEMA_FIELD);
185 |       Map<String, String> newState;
186 |       if (state.exists()) {
187 |         newState = state.get();
188 |         if (newState.containsKey(tableName) && newState.get(tableName).equals(tableSchemaStructure)) {
189 |           // schema hasn't changed so emit with false so that we can later filter this record out
190 |           return Optional.empty();
191 |         }
192 |       } else {
193 |         newState = new HashMap<>();
194 |       }
195 |       // update the state
196 |       newState.put(tableName, tableSchemaStructure);
197 |       state.update(newState);
198 |       LOG.debug("Update schema state store for table {}. New schema will be emitted.", tableName);
199 |       return Optional.of(input);
200 |     };
201 |   }
202 | }
203 | 


--------------------------------------------------------------------------------
/docker-compose/cdc-env/Oracle/dbca.rsp.tmpl:
--------------------------------------------------------------------------------
  1 | ##############################################################################
  2 | ##                                                                          ##
  3 | ##                            DBCA response file                            ##
  4 | ##                            ------------------                            ##
  5 | ## Copyright(c) Oracle Corporation 1998,2017. All rights reserved.          ##
  6 | ##                                                                          ##
  7 | ## Specify values for the variables listed below to customize 			    ##
  8 | ## your installation.                                         			    ##
  9 | ##                                                            			    ##
 10 | ## Each variable is associated with a comment. The comment    			    ##
 11 | ## can help to populate the variables with the appropriate   			    ##
 12 | ## values.                                                  			    ##
 13 | ##                                                               			##
 14 | ## IMPORTANT NOTE: This file contains plain text passwords and   			##
 15 | ## should be secured to have read permission only by oracle user 			##
 16 | ## or db administrator who owns this installation.               			##
 17 | ##############################################################################
 18 | #-------------------------------------------------------------------------------
 19 | # Do not change the following system generated value. 
 20 | #-------------------------------------------------------------------------------
 21 | responseFileVersion=/oracle/assistants/rspfmt_dbca_response_schema_v18.0.0
 22 | 
 23 | #-----------------------------------------------------------------------------
 24 | # Name          : gdbName
 25 | # Datatype      : String
 26 | # Description   : Global database name of the database
 27 | # Valid values  : <db_name>.<db_domain> - when database domain isn't NULL
 28 | #                 <db_name>             - when database domain is NULL
 29 | # Default value : None
 30 | # Mandatory     : Yes
 31 | #-----------------------------------------------------------------------------
 32 | gdbName=###ORACLE_SID###
 33 | 
 34 | #-----------------------------------------------------------------------------
 35 | # Name          : sid
 36 | # Datatype      : String
 37 | # Description   : System identifier (SID) of the database
 38 | # Valid values  : Check Oracle12c Administrator's Guide
 39 | # Default value : <db_name> specified in GDBNAME
 40 | # Mandatory     : No
 41 | #-----------------------------------------------------------------------------
 42 | sid=###ORACLE_SID###
 43 | 
 44 | #-----------------------------------------------------------------------------
 45 | # Name          : createAsContainerDatabase 
 46 | # Datatype      : boolean
 47 | # Description   : flag to create database as container database 
 48 | # Valid values  : Check Oracle12c Administrator's Guide
 49 | # Default value : false
 50 | # Mandatory     : No
 51 | #-----------------------------------------------------------------------------
 52 | createAsContainerDatabase=false
 53 | 
 54 | #-----------------------------------------------------------------------------
 55 | # Name          : numberOfPDBs
 56 | # Datatype      : Number
 57 | # Description   : Specify the number of pdb to be created
 58 | # Valid values  : 0 to 4094
 59 | # Default value : 0
 60 | # Mandatory     : No
 61 | #-----------------------------------------------------------------------------
 62 | numberOfPDBs=0
 63 | 
 64 | #-----------------------------------------------------------------------------
 65 | # Name          : pdbName 
 66 | # Datatype      : String
 67 | # Description   : Specify the pdbname/pdbanme prefix if one or more pdb need to be created
 68 | # Valid values  : Check Oracle12c Administrator's Guide
 69 | # Default value : None
 70 | # Mandatory     : No
 71 | #-----------------------------------------------------------------------------
 72 | pdbName=None
 73 | 
 74 | #-----------------------------------------------------------------------------
 75 | # Name          : pdbAdminPassword
 76 | # Datatype      : String
 77 | # Description   : PDB Administrator user password
 78 | # Valid values  : Check Oracle12c Administrator's Guide
 79 | # Default value : None
 80 | # Mandatory     : No
 81 | #-----------------------------------------------------------------------------
 82 | pdbAdminPassword=None
 83 | 
 84 | #-----------------------------------------------------------------------------
 85 | # Name          : templateName
 86 | # Datatype      : String
 87 | # Description   : Name of the template
 88 | # Valid values  : Template file name
 89 | # Default value : None
 90 | # Mandatory     : Yes
 91 | #-----------------------------------------------------------------------------
 92 | templateName=General_Purpose.dbc
 93 | 
 94 | #-----------------------------------------------------------------------------
 95 | # Name          : sysPassword
 96 | # Datatype      : String
 97 | # Description   : Password for SYS user
 98 | # Valid values  : Check Oracle12c Administrator's Guide
 99 | # Default value : None
100 | # Mandatory     : Yes
101 | #-----------------------------------------------------------------------------
102 | sysPassword=###ORACLE_PWD###
103 | 
104 | #-----------------------------------------------------------------------------
105 | # Name          : systemPassword
106 | # Datatype      : String
107 | # Description   : Password for SYSTEM user
108 | # Valid values  : Check Oracle12c Administrator's Guide
109 | # Default value : None
110 | # Mandatory     : Yes
111 | #-----------------------------------------------------------------------------
112 | systemPassword=###ORACLE_PWD###
113 | 
114 | #-----------------------------------------------------------------------------
115 | # Name          : emConfiguration
116 | # Datatype      : String
117 | # Description   : Enterprise Manager Configuration Type
118 | # Valid values  : CENTRAL|DBEXPRESS|BOTH|NONE
119 | # Default value : NONE
120 | # Mandatory     : No
121 | #-----------------------------------------------------------------------------
122 | emConfiguration=NONE
123 | 
124 | #-----------------------------------------------------------------------------
125 | # Name          : emExpressPort
126 | # Datatype      : Number
127 | # Description   : Enterprise Manager Configuration Type
128 | # Valid values  : Check Oracle12c Administrator's Guide
129 | # Default value : NONE
130 | # Mandatory     : No, will be picked up from DBEXPRESS_HTTPS_PORT env variable
131 | #                 or auto generates a free port between 5500 and 5599
132 | #-----------------------------------------------------------------------------
133 | #emExpressPort=NONE
134 | 
135 | #-----------------------------------------------------------------------------
136 | # Name          : dbsnmpPassword
137 | # Datatype      : String
138 | # Description   : Password for DBSNMP user
139 | # Valid values  : Check Oracle12c Administrator's Guide
140 | # Default value : None
141 | # Mandatory     : Yes, if emConfiguration is specified or
142 | #                 the value of runCVUChecks is TRUE
143 | #-----------------------------------------------------------------------------
144 | dbsnmpPassword=###ORACLE_PWD###
145 | 
146 | #-----------------------------------------------------------------------------
147 | # Name          : characterSet
148 | # Datatype      : String
149 | # Description   : Character set of the database
150 | # Valid values  : Check Oracle12c National Language Support Guide
151 | # Default value : "US7ASCII"
152 | # Mandatory     : NO
153 | #-----------------------------------------------------------------------------
154 | characterSet=###ORACLE_CHARACTERSET###
155 | 
156 | #-----------------------------------------------------------------------------
157 | # Name          : nationalCharacterSet
158 | # Datatype      : String
159 | # Description   : National Character set of the database
160 | # Valid values  : "UTF8" or "AL16UTF16". For details, check Oracle12c National Language Support Guide
161 | # Default value : "AL16UTF16"
162 | # Mandatory     : No
163 | #-----------------------------------------------------------------------------
164 | nationalCharacterSet=AL16UTF16
165 | 
166 | #-----------------------------------------------------------------------------
167 | # Name          : initParams
168 | # Datatype      : String
169 | # Description   : comma separated list of name=value pairs. Overrides initialization parameters defined in templates
170 | # Default value : None
171 | # Mandatory     : NO
172 | #-----------------------------------------------------------------------------
173 | initParams=audit_trail=none,audit_sys_operations=false
174 | 
175 | #-----------------------------------------------------------------------------
176 | # Name          : listeners
177 | # Datatype      : String
178 | # Description   : Specifies list of listeners to register the database with.
179 | #		  By default the database is configured for all the listeners specified in the 
180 | #		  $ORACLE_HOME/network/admin/listener.ora 	
181 | # Valid values  : The list should be comma separated like "listener1,listener2".
182 | # Mandatory     : NO
183 | #-----------------------------------------------------------------------------
184 | #listeners=LISTENER
185 | 
186 | #-----------------------------------------------------------------------------
187 | # Name          : automaticMemoryManagement
188 | # Datatype      : Boolean
189 | # Description   : flag to indicate Automatic Memory Management is used
190 | # Valid values  : TRUE/FALSE
191 | # Default value : TRUE
192 | # Mandatory     : NO
193 | #-----------------------------------------------------------------------------
194 | automaticMemoryManagement=FALSE
195 | 
196 | #-----------------------------------------------------------------------------
197 | # Name          : totalMemory
198 | # Datatype      : String
199 | # Description   : total memory in MB to allocate to Oracle
200 | # Valid values  : 
201 | # Default value : 
202 | # Mandatory     : NO
203 | #-----------------------------------------------------------------------------
204 | totalMemory=2048


--------------------------------------------------------------------------------
/src/main/java/io/cdap/plugin/cdc/common/DBUtils.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright © 2019 Cask Data, Inc.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  5 |  * use this file except in compliance with the License. You may obtain a copy of
  6 |  * the License at
  7 |  *
  8 |  * http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 13 |  * License for the specific language governing permissions and limitations under
 14 |  * the License.
 15 |  */
 16 | 
 17 | package io.cdap.plugin.cdc.common;
 18 | 
 19 | import com.google.common.collect.Lists;
 20 | import io.cdap.cdap.api.data.schema.Schema;
 21 | import io.cdap.cdap.api.data.schema.UnsupportedTypeException;
 22 | import org.slf4j.Logger;
 23 | import org.slf4j.LoggerFactory;
 24 | 
 25 | import java.lang.reflect.Field;
 26 | import java.math.BigDecimal;
 27 | import java.sql.Blob;
 28 | import java.sql.Clob;
 29 | import java.sql.Driver;
 30 | import java.sql.DriverManager;
 31 | import java.sql.ResultSet;
 32 | import java.sql.ResultSetMetaData;
 33 | import java.sql.SQLException;
 34 | import java.sql.Types;
 35 | import java.util.List;
 36 | import javax.annotation.Nullable;
 37 | 
 38 | /**
 39 |  * Utility methods for Database plugins shared by Database plugins.
 40 |  */
 41 | public final class DBUtils {
 42 |   private static final Logger LOG = LoggerFactory.getLogger(DBUtils.class);
 43 | 
 44 |   /**
 45 |    * Ensures that the JDBC Driver specified in configuration is available and can be loaded. Also registers it with
 46 |    * {@link DriverManager} if it is not already registered.
 47 |    */
 48 |   public static DriverCleanup ensureJDBCDriverIsAvailable(Class<? extends Driver> jdbcDriverClass,
 49 |                                                           String connectionString)
 50 |     throws IllegalAccessException, InstantiationException, SQLException {
 51 | 
 52 |     try {
 53 |       DriverManager.getDriver(connectionString);
 54 |       return new DriverCleanup(null);
 55 |     } catch (SQLException e) {
 56 |       // Driver not found. We will try to register it with the DriverManager.
 57 |       final JDBCDriverShim driverShim = new JDBCDriverShim(jdbcDriverClass.newInstance());
 58 |       try {
 59 |         DBUtils.deregisterAllDrivers(jdbcDriverClass);
 60 |       } catch (NoSuchFieldException | ClassNotFoundException e1) {
 61 |         LOG.error("Unable to deregister JDBC Driver class {}", jdbcDriverClass);
 62 |       }
 63 |       DriverManager.registerDriver(driverShim);
 64 |       return new DriverCleanup(driverShim);
 65 |     }
 66 |   }
 67 | 
 68 |   /**
 69 |    * Given the result set, get the metadata of the result set and return
 70 |    * list of {@link io.cdap.cdap.api.data.schema.Schema.Field}.
 71 |    *
 72 |    * @param resultSet result set of executed query
 73 |    * @return list of schema fields
 74 |    * @throws SQLException
 75 |    */
 76 |   public static List<Schema.Field> getSchemaFields(ResultSet resultSet) throws SQLException {
 77 |     List<Schema.Field> schemaFields = Lists.newArrayList();
 78 |     ResultSetMetaData metadata = resultSet.getMetaData();
 79 |     // ResultSetMetadata columns are numbered starting with 1
 80 |     for (int i = 1; i <= metadata.getColumnCount(); i++) {
 81 |       String columnName = metadata.getColumnName(i);
 82 |       int columnSqlType = metadata.getColumnType(i);
 83 |       int columnSqlPrecision = metadata.getPrecision(i); // total number of digits
 84 |       int columnSqlScale = metadata.getScale(i); // digits after the decimal point
 85 |       String columnTypeName = metadata.getColumnTypeName(i);
 86 |       Schema columnSchema = getSchema(columnTypeName, columnSqlType, columnSqlPrecision, columnSqlScale);
 87 |       if (ResultSetMetaData.columnNullable == metadata.isNullable(i)) {
 88 |         columnSchema = Schema.nullableOf(columnSchema);
 89 |       }
 90 |       Schema.Field field = Schema.Field.of(columnName, columnSchema);
 91 |       schemaFields.add(field);
 92 |     }
 93 |     return schemaFields;
 94 |   }
 95 | 
 96 |   // given a sql type return schema type
 97 |   private static Schema getSchema(String typeName, int sqlType, int precision, int scale) throws SQLException {
 98 |     // Type.STRING covers sql types - VARCHAR,CHAR,CLOB,LONGNVARCHAR,LONGVARCHAR,NCHAR,NCLOB,NVARCHAR
 99 |     Schema.Type type = Schema.Type.STRING;
100 |     switch (sqlType) {
101 |       case Types.NULL:
102 |         type = Schema.Type.NULL;
103 |         break;
104 | 
105 |       case Types.ROWID:
106 |         break;
107 | 
108 |       case Types.BOOLEAN:
109 |       case Types.BIT:
110 |         type = Schema.Type.BOOLEAN;
111 |         break;
112 | 
113 |       case Types.TINYINT:
114 |       case Types.SMALLINT:
115 |         type = Schema.Type.INT;
116 |         break;
117 |       case Types.INTEGER:
118 |         // CDAP-12211 - handling unsigned integers in mysql
119 |         type = "int unsigned".equalsIgnoreCase(typeName) ? Schema.Type.LONG : Schema.Type.INT;
120 |         break;
121 | 
122 |       case Types.BIGINT:
123 |         type = Schema.Type.LONG;
124 |         break;
125 | 
126 |       case Types.REAL:
127 |       case Types.FLOAT:
128 |         type = Schema.Type.FLOAT;
129 |         break;
130 | 
131 |       case Types.NUMERIC:
132 |       case Types.DECIMAL:
133 |         // if there are no digits after the point, use integer types
134 |         type = scale != 0 ? Schema.Type.DOUBLE :
135 |           // with 10 digits we can represent 2^32 and LONG is required
136 |           precision > 9 ? Schema.Type.LONG : Schema.Type.INT;
137 |         break;
138 | 
139 |       case Types.DOUBLE:
140 |         type = Schema.Type.DOUBLE;
141 |         break;
142 | 
143 |       case Types.DATE:
144 |         return Schema.of(Schema.LogicalType.DATE);
145 |       case Types.TIME:
146 |         return Schema.of(Schema.LogicalType.TIME_MICROS);
147 |       case Types.TIMESTAMP:
148 |         return Schema.of(Schema.LogicalType.TIMESTAMP_MICROS);
149 | 
150 |       case Types.BINARY:
151 |       case Types.VARBINARY:
152 |       case Types.LONGVARBINARY:
153 |       case Types.BLOB:
154 |         type = Schema.Type.BYTES;
155 |         break;
156 | 
157 |       case Types.ARRAY:
158 |       case Types.DATALINK:
159 |       case Types.DISTINCT:
160 |       case Types.JAVA_OBJECT:
161 |       case Types.OTHER:
162 |       case Types.REF:
163 |       case Types.SQLXML:
164 |       case Types.STRUCT:
165 |         throw new SQLException(new UnsupportedTypeException("Unsupported SQL Type: " + sqlType));
166 |     }
167 | 
168 |     return Schema.of(type);
169 |   }
170 | 
171 |   @Nullable
172 |   public static Object transformValue(int sqlType, int precision, int scale,
173 |                                       ResultSet resultSet, String fieldName) throws SQLException {
174 |     Object original = resultSet.getObject(fieldName);
175 |     if (original != null) {
176 |       switch (sqlType) {
177 |         case Types.SMALLINT:
178 |         case Types.TINYINT:
179 |           return ((Number) original).intValue();
180 |         case Types.NUMERIC:
181 |         case Types.DECIMAL:
182 |           BigDecimal decimal = (BigDecimal) original;
183 |           if (scale != 0) {
184 |             // if there are digits after the point, use double types
185 |             return decimal.doubleValue();
186 |           } else if (precision > 9) {
187 |             // with 10 digits we can represent 2^32 and LONG is required
188 |             return decimal.longValue();
189 |           } else {
190 |             return decimal.intValue();
191 |           }
192 |         case Types.DATE:
193 |           return resultSet.getDate(fieldName);
194 |         case Types.TIME:
195 |           return resultSet.getTime(fieldName);
196 |         case Types.TIMESTAMP:
197 |           return resultSet.getTimestamp(fieldName);
198 |         case Types.ROWID:
199 |           return resultSet.getString(fieldName);
200 |         case Types.BLOB:
201 |           Blob blob = (Blob) original;
202 |           return blob.getBytes(1, (int) blob.length());
203 |         case Types.CLOB:
204 |           Clob clob = (Clob) original;
205 |           return clob.getSubString(1, (int) clob.length());
206 |       }
207 |     }
208 |     return original;
209 |   }
210 | 
211 |   /**
212 |    * De-register all SQL drivers that are associated with the class
213 |    */
214 |   public static void deregisterAllDrivers(Class<? extends Driver> driverClass)
215 |     throws NoSuchFieldException, IllegalAccessException, ClassNotFoundException {
216 |     Field field = DriverManager.class.getDeclaredField("registeredDrivers");
217 |     field.setAccessible(true);
218 |     List<?> list = (List<?>) field.get(null);
219 |     for (Object driverInfo : list) {
220 |       Class<?> driverInfoClass = DBUtils.class.getClassLoader().loadClass("java.sql.DriverInfo");
221 |       Field driverField = driverInfoClass.getDeclaredField("driver");
222 |       driverField.setAccessible(true);
223 |       Driver d = (Driver) driverField.get(driverInfo);
224 |       if (d == null) {
225 |         LOG.trace("Could not find driver %s", driverInfo);
226 |         continue;
227 |       }
228 |       LOG.trace("Removing non-null driver object from drivers list.");
229 |       ClassLoader registeredDriverClassLoader = d.getClass().getClassLoader();
230 |       if (registeredDriverClassLoader == null) {
231 |         LOG.trace("Found null classloader for default driver {}. Ignoring since this may be using system classloader.",
232 |                   d.getClass().getName());
233 |         continue;
234 |       }
235 |       // Remove all objects in this list that were created using the classloader of the caller.
236 |       if (d.getClass().getClassLoader().equals(driverClass.getClassLoader())) {
237 |         LOG.trace("Removing default driver {} from registeredDrivers", d.getClass().getName());
238 |         list.remove(driverInfo);
239 |       }
240 |     }
241 |   }
242 | 
243 |   private DBUtils() {
244 |     throw new AssertionError("Should not instantiate static utility class.");
245 |   }
246 | }
247 | 
248 | 


--------------------------------------------------------------------------------
/src/main/java/io/cdap/plugin/cdc/source/oracle/Normalizer.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright © 2019 Cask Data, Inc.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  5 |  * use this file except in compliance with the License. You may obtain a copy of
  6 |  * the License at
  7 |  *
  8 |  * http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 13 |  * License for the specific language governing permissions and limitations under
 14 |  * the License.
 15 |  */
 16 | 
 17 | package io.cdap.plugin.cdc.source.oracle;
 18 | 
 19 | import com.google.gson.Gson;
 20 | import com.google.gson.JsonObject;
 21 | import io.cdap.cdap.api.data.format.StructuredRecord;
 22 | import io.cdap.cdap.api.data.schema.Schema;
 23 | import io.cdap.plugin.cdc.common.AvroConverter;
 24 | import io.cdap.plugin.cdc.common.OperationType;
 25 | import io.cdap.plugin.cdc.common.Schemas;
 26 | import org.apache.avro.generic.GenericDatumReader;
 27 | import org.apache.avro.generic.GenericRecord;
 28 | import org.apache.avro.io.DecoderFactory;
 29 | import org.slf4j.Logger;
 30 | import org.slf4j.LoggerFactory;
 31 | 
 32 | import java.io.IOException;
 33 | import java.nio.charset.StandardCharsets;
 34 | import java.util.ArrayList;
 35 | import java.util.Collections;
 36 | import java.util.LinkedHashMap;
 37 | import java.util.List;
 38 | import java.util.Map;
 39 | import java.util.Objects;
 40 | 
 41 | /**
 42 |  * Class responsible for normalizing the StructuredRecords to be sent to the CDC sinks
 43 |  */
 44 | public class Normalizer {
 45 |   private static final Logger LOG = LoggerFactory.getLogger(Normalizer.class);
 46 |   private static final Gson GSON = new Gson();
 47 | 
 48 |   private static final String INPUT_FIELD = "message";
 49 | 
 50 |   /**
 51 |    * Normalize the input StructuredRecord containing byte array into the DDL or DML records.
 52 |    * One input record can result into multiple output records. For example in case of primary key
 53 |    * updates, the output record consist of two StructuredRcords, one represents delete and another represnents
 54 |    * insert.
 55 |    *
 56 |    * @param input record containing message as byte array to be normalized
 57 |    * @return {@link List} of normalized records
 58 |    */
 59 |   public List<StructuredRecord> transform(StructuredRecord input) throws Exception {
 60 |     Object message = input.get(INPUT_FIELD);
 61 |     if (message == null) {
 62 |       throw new IllegalStateException(String.format("Input record does not contain the field '%s'.", INPUT_FIELD));
 63 |     }
 64 | 
 65 |     if ("GenericWrapperSchema".equals(input.getSchema().getRecordName())) {
 66 |       // Do nothing for the generic wrapper schema message
 67 |       // Return empty list
 68 |       return Collections.emptyList();
 69 |     }
 70 | 
 71 |     byte[] messageBytes = BinaryMessages.getBytesFromBinaryMessage(message);
 72 | 
 73 |     if (input.getSchema().getRecordName().equals(Schemas.DDL_SCHEMA.getRecordName())) {
 74 |       String messageBody = new String(messageBytes, StandardCharsets.UTF_8);
 75 |       JsonObject schemaObj = GSON.fromJson(messageBody, JsonObject.class);
 76 |       String namespaceName = schemaObj.get("namespace").getAsString();
 77 |       String tableName = schemaObj.get("name").getAsString();
 78 |       StructuredRecord ddlRecord = StructuredRecord.builder(Schemas.DDL_SCHEMA)
 79 |         .set(Schemas.TABLE_FIELD, namespaceName + "." + tableName)
 80 |         .set(Schemas.SCHEMA_FIELD, getNormalizedDDLSchema(messageBody))
 81 |         .build();
 82 |       return Collections.singletonList(ddlRecord);
 83 |     }
 84 | 
 85 |     // Current message is the Wrapped Avro binary message
 86 |     // Get the state map
 87 |     StructuredRecord stateRecord = input.get("staterecord");
 88 |     Map<Long, String> schemaCacheMap = stateRecord.get("data");
 89 |     org.apache.avro.Schema avroGenericWrapperSchema = getGenericWrapperMessageSchema();
 90 | 
 91 |     GenericRecord genericRecord = getRecord(messageBytes, avroGenericWrapperSchema);
 92 |     String tableName = genericRecord.get("table_name").toString();
 93 |     long schameHashId = (Long) genericRecord.get("schema_fingerprint");
 94 | 
 95 |     byte[] payload = BinaryMessages.getBytesFromBinaryMessage(genericRecord.get("payload"));
 96 | 
 97 |     org.apache.avro.Schema avroSchema = new org.apache.avro.Schema.Parser().parse(schemaCacheMap.get(schameHashId));
 98 |     LOG.debug("Avro schema {} for table {} with fingerprint {}", avroSchema, tableName, schameHashId);
 99 | 
100 |     StructuredRecord structuredRecord = AvroConverter.fromAvroRecord(getRecord(payload, avroSchema),
101 |                                                                      AvroConverter.fromAvroSchema(avroSchema));
102 | 
103 |     return getNormalizedDMLRecord(structuredRecord);
104 |   }
105 | 
106 |   private String getNormalizedDDLSchema(String jsonSchema) {
107 |     org.apache.avro.Schema avroSchema = new org.apache.avro.Schema.Parser().parse(jsonSchema);
108 |     Schema schema = AvroConverter.fromAvroSchema(avroSchema);
109 |     Schema column = schema.getField("before").getSchema().getNonNullable();
110 | 
111 |     List<Schema.Field> columnFields = new ArrayList<>();
112 |     for (Schema.Field field : column.getFields()) {
113 |       if (!field.getName().endsWith("_isMissing")) {
114 |         columnFields.add(field);
115 |       }
116 |     }
117 | 
118 |     Schema ddlSchema = Schema.recordOf(Schemas.SCHEMA_RECORD, columnFields);
119 |     LOG.debug("Schema for DDL {}", ddlSchema);
120 |     return ddlSchema.toString();
121 |   }
122 | 
123 |   private org.apache.avro.Schema getGenericWrapperMessageSchema() {
124 |     String avroGenericWrapperSchema = "{\n" +
125 |       "          \"type\" : \"record\",\n" +
126 |       "          \"name\" : \"generic_wrapper\",\n" +
127 |       "          \"namespace\" : \"oracle.goldengate\",\n" +
128 |       "          \"fields\" : [ {\n" +
129 |       "            \"name\" : \"table_name\",\n" +
130 |       "            \"type\" : \"string\"\n" +
131 |       "          }, {\n" +
132 |       "            \"name\" : \"schema_fingerprint\",\n" +
133 |       "            \"type\" : \"long\"\n" +
134 |       "          }, {\n" +
135 |       "            \"name\" : \"payload\",\n" +
136 |       "            \"type\" : \"bytes\"\n" +
137 |       "          } ]\n" +
138 |       "        }";
139 |     return new org.apache.avro.Schema.Parser().parse(avroGenericWrapperSchema);
140 |   }
141 | 
142 |   private GenericRecord getRecord(byte[] message, org.apache.avro.Schema schema) throws IOException {
143 |     GenericDatumReader<GenericRecord> datumReader = new GenericDatumReader<>(schema);
144 |     return datumReader.read(null, DecoderFactory.get().binaryDecoder(message, null));
145 |   }
146 | 
147 |   private List<StructuredRecord> getNormalizedDMLRecord(StructuredRecord record) throws IOException {
148 |     List<StructuredRecord> normalizedRecords = new ArrayList<>();
149 |     // This table name contains "." in it already
150 |     String tableName = record.get("table");
151 |     List<String> primaryKeys = record.get("primary_keys");
152 |     OperationType opType = OperationType.fromShortName(record.get("op_type"));
153 |     Map<Schema.Field, Object> suppliedFieldValues = new LinkedHashMap<>();
154 |     switch (opType) {
155 |       case INSERT:
156 |         StructuredRecord insertRecord = record.get("after");
157 |         for (Schema.Field field : insertRecord.getSchema().getFields()) {
158 |           if (!field.getName().endsWith("_isMissing")) {
159 |             suppliedFieldValues.put(field, insertRecord.get(field.getName()));
160 |           }
161 |         }
162 |         break;
163 |       case UPDATE:
164 |         StructuredRecord afterUpdateRecord = record.get("after");
165 |         StructuredRecord beforeUpdateRecord = record.get("before");
166 |         boolean pkChanged = primaryKeyChanged(primaryKeys, beforeUpdateRecord, afterUpdateRecord);
167 | 
168 |         if (pkChanged) {
169 |           // We need to emit two records
170 |           // One for DELETE and one for INSERT
171 |           suppliedFieldValues = addDeleteFields(record);
172 |           normalizedRecords.add(createDMLRecord(tableName, OperationType.DELETE, primaryKeys, suppliedFieldValues));
173 |         }
174 | 
175 |         suppliedFieldValues.clear();
176 |         for (Schema.Field field : afterUpdateRecord.getSchema().getFields()) {
177 |           if (!field.getName().endsWith("_isMissing")) {
178 |             String fieldName = field.getName();
179 |             if (!((boolean) afterUpdateRecord.get(fieldName + "_isMissing"))) {
180 |               suppliedFieldValues.put(field, afterUpdateRecord.get(field.getName()));
181 |             } else {
182 |               // Field is not updated, use the field value from the before record
183 |               suppliedFieldValues.put(field, beforeUpdateRecord.get(field.getName()));
184 |             }
185 |           }
186 |         }
187 |         if (pkChanged) {
188 |           // Change the operation type to Insert if the primary key is changed
189 |           opType = OperationType.INSERT;
190 |         }
191 |         break;
192 |       case DELETE:
193 |         suppliedFieldValues = addDeleteFields(record);
194 |         break;
195 |       default:
196 |         break;
197 |     }
198 | 
199 |     normalizedRecords.add(createDMLRecord(tableName, opType, primaryKeys, suppliedFieldValues));
200 |     return normalizedRecords;
201 |   }
202 | 
203 |   private boolean primaryKeyChanged(List<String> primaryKeys, StructuredRecord before, StructuredRecord after) {
204 |     for (String key : primaryKeys) {
205 |       if (!Objects.equals(before.get(key), after.get(key))) {
206 |         return true;
207 |       }
208 |     }
209 |     return false;
210 |   }
211 | 
212 |   private Map<Schema.Field, Object> addDeleteFields(StructuredRecord record) {
213 |     Map<Schema.Field, Object> fieldValues = new LinkedHashMap<>();
214 |     StructuredRecord deleteRecord = record.get("before");
215 |     for (Schema.Field field : deleteRecord.getSchema().getFields()) {
216 |       if (!field.getName().endsWith("_isMissing")) {
217 |         fieldValues.put(field, deleteRecord.get(field.getName()));
218 |       }
219 |     }
220 |     return fieldValues;
221 |   }
222 | 
223 |   private StructuredRecord createDMLRecord(String tableName, OperationType opType, List<String> primaryKeys,
224 |                                            Map<Schema.Field, Object> changedFields) throws IOException {
225 |     Schema changeSchema = Schema.recordOf(Schemas.SCHEMA_RECORD, changedFields.keySet());
226 |     Map<String, Object> changes = new LinkedHashMap<>();
227 |     for (Map.Entry<Schema.Field, Object> entry : changedFields.entrySet()) {
228 |       changes.put(entry.getKey().getName(), entry.getValue());
229 |     }
230 |     return StructuredRecord.builder(Schemas.DML_SCHEMA)
231 |       .set(Schemas.TABLE_FIELD, tableName)
232 |       .set(Schemas.OP_TYPE_FIELD, opType.name())
233 |       .set(Schemas.PRIMARY_KEYS_FIELD, primaryKeys)
234 |       .set(Schemas.UPDATE_SCHEMA_FIELD, changeSchema.toString())
235 |       .set(Schemas.UPDATE_VALUES_FIELD, changes)
236 |       .build();
237 |   }
238 | }
239 | 


--------------------------------------------------------------------------------
/src/main/java/io/cdap/plugin/cdc/source/oracle/GoldenGateKafka.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright © 2019 Cask Data, Inc.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  5 |  * use this file except in compliance with the License. You may obtain a copy of
  6 |  * the License at
  7 |  *
  8 |  * http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 13 |  * License for the specific language governing permissions and limitations under
 14 |  * the License.
 15 |  */
 16 | 
 17 | package io.cdap.plugin.cdc.source.oracle;
 18 | 
 19 | import io.cdap.cdap.api.annotation.Description;
 20 | import io.cdap.cdap.api.annotation.Name;
 21 | import io.cdap.cdap.api.annotation.Plugin;
 22 | import io.cdap.cdap.api.data.format.StructuredRecord;
 23 | import io.cdap.cdap.api.data.schema.Schema;
 24 | import io.cdap.cdap.api.dataset.DatasetProperties;
 25 | import io.cdap.cdap.etl.api.PipelineConfigurer;
 26 | import io.cdap.cdap.etl.api.streaming.StreamingContext;
 27 | import io.cdap.cdap.etl.api.streaming.StreamingSource;
 28 | import io.cdap.plugin.cdc.common.Schemas;
 29 | import io.cdap.plugin.common.Constants;
 30 | import kafka.api.OffsetRequest;
 31 | import kafka.api.PartitionOffsetRequestInfo;
 32 | import kafka.common.TopicAndPartition;
 33 | import kafka.javaapi.OffsetResponse;
 34 | import kafka.javaapi.PartitionMetadata;
 35 | import kafka.javaapi.TopicMetadata;
 36 | import kafka.javaapi.TopicMetadataRequest;
 37 | import kafka.javaapi.TopicMetadataResponse;
 38 | import kafka.javaapi.consumer.SimpleConsumer;
 39 | import kafka.message.MessageAndMetadata;
 40 | import kafka.serializer.DefaultDecoder;
 41 | import org.apache.avro.SchemaNormalization;
 42 | import org.apache.spark.api.java.Optional;
 43 | import org.apache.spark.api.java.function.Function3;
 44 | import org.apache.spark.streaming.State;
 45 | import org.apache.spark.streaming.StateSpec;
 46 | import org.apache.spark.streaming.api.java.JavaDStream;
 47 | import org.apache.spark.streaming.api.java.JavaInputDStream;
 48 | import org.apache.spark.streaming.kafka.KafkaUtils;
 49 | import org.slf4j.Logger;
 50 | import org.slf4j.LoggerFactory;
 51 | import scala.Tuple2;
 52 | 
 53 | import java.nio.charset.StandardCharsets;
 54 | import java.util.Collections;
 55 | import java.util.HashMap;
 56 | import java.util.HashSet;
 57 | import java.util.Map;
 58 | import java.util.Set;
 59 | 
 60 | /**
 61 |  * Streaming source for reading from Golden Gate Kafka topic.
 62 |  */
 63 | @Plugin(type = StreamingSource.PLUGIN_TYPE)
 64 | @Name("CDCDatabase")
 65 | @Description("Streaming source for reading through Golden Gate Kafka topic")
 66 | public class GoldenGateKafka extends StreamingSource<StructuredRecord> {
 67 |   private static final Logger LOG = LoggerFactory.getLogger(GoldenGateKafka.class);
 68 |   private static final Schema GENERIC_WRAPPER_SCHEMA_MESSAGE
 69 |     = Schema.recordOf("GenericWrapperSchema", Schema.Field.of("message", Schema.of(Schema.Type.BYTES)));
 70 |   private static final Schema DDL_SCHEMA_MESSAGE
 71 |     = Schema.recordOf("DDLRecord", Schema.Field.of("message", Schema.of(Schema.Type.BYTES)));
 72 |   private static final Schema TRANSFORMED_MESSAGE
 73 |     = Schema.recordOf("Message", Schema.Field.of("message", Schema.of(Schema.Type.BYTES)));
 74 | 
 75 |   private static final Schema STATE_SCHEMA
 76 |     = Schema.recordOf("state",
 77 |                       Schema.Field.of("data",
 78 |                                       Schema.mapOf(Schema.of(Schema.Type.LONG),
 79 |                                                    Schema.of(Schema.Type.STRING))));
 80 | 
 81 |   private static final Schema DML_SCHEMA = Schema.recordOf("DMLRecord",
 82 |                                                            Schema.Field.of("message", Schema.of(Schema.Type.BYTES)),
 83 |                                                            Schema.Field.of("staterecord", STATE_SCHEMA));
 84 | 
 85 |   private static final Normalizer NORMALIZER = new Normalizer();
 86 | 
 87 |   private final GoldenGateKafkaConfig conf;
 88 | 
 89 | 
 90 |   public GoldenGateKafka(GoldenGateKafkaConfig conf) {
 91 |     this.conf = conf;
 92 |   }
 93 | 
 94 |   @Override
 95 |   public void configurePipeline(PipelineConfigurer pipelineConfigurer) throws IllegalArgumentException {
 96 |     conf.validate();
 97 |     pipelineConfigurer.createDataset(conf.referenceName, Constants.EXTERNAL_DATASET_TYPE, DatasetProperties.EMPTY);
 98 |     pipelineConfigurer.getStageConfigurer().setOutputSchema(Schemas.CHANGE_SCHEMA);
 99 | 
100 |     // Make sure that Golden Gate kafka topic only have single partition
101 |     SimpleConsumer consumer = new SimpleConsumer(conf.getHost(), conf.getPort(), 20 * 1000, 128 * 1024,
102 |                                                  "partitionLookup");
103 |     try {
104 |       getPartitionId(consumer);
105 |     } finally {
106 |       consumer.close();
107 |     }
108 | 
109 |     if (conf.getMaxRatePerPartition() > 0) {
110 |       Map<String, String> pipelineProperties = new HashMap<>();
111 |       pipelineProperties.put("spark.streaming.kafka.maxRatePerPartition", conf.getMaxRatePerPartition().toString());
112 |       pipelineConfigurer.setPipelineProperties(pipelineProperties);
113 |     }
114 |   }
115 | 
116 |   @Override
117 |   public JavaDStream<StructuredRecord> getStream(StreamingContext context) throws Exception {
118 |     context.registerLineage(conf.referenceName);
119 | 
120 |     SimpleConsumer consumer = new SimpleConsumer(conf.getHost(), conf.getPort(), 20 * 1000, 128 * 1024,
121 |                                                  "partitionLookup");
122 |     Map<TopicAndPartition, Long> offsets;
123 |     try {
124 |       offsets = loadOffsets(consumer);
125 |     } finally {
126 |       consumer.close();
127 |     }
128 | 
129 |     LOG.info("Using initial offsets {}", offsets);
130 |     Map<String, String> kafkaParams = new HashMap<>();
131 |     kafkaParams.put("metadata.broker.list", conf.getBroker());
132 |     JavaInputDStream<StructuredRecord> directStream = KafkaUtils.createDirectStream(
133 |       context.getSparkStreamingContext(), byte[].class, byte[].class, DefaultDecoder.class, DefaultDecoder.class,
134 |       StructuredRecord.class, kafkaParams, offsets, this::kafkaMessageToRecord);
135 |     return directStream
136 |       .mapToPair(record -> new Tuple2<>("", record))
137 |       .mapWithState(StateSpec.function(schemaStateFunction()))
138 |       .flatMap(record -> NORMALIZER.transform(record).iterator())
139 |       .map(Schemas::toCDCRecord);
140 |   }
141 | 
142 |   private Map<TopicAndPartition, Long> loadOffsets(SimpleConsumer consumer) {
143 |     // KafkaUtils doesn't understand -1 and -2 as latest offset and smallest offset.
144 |     // so we have to replace them with the actual smallest and latest
145 |     String topicName = conf.getTopic();
146 |     int partitionId = getPartitionId(consumer);
147 |     long initialOffset = conf.getDefaultInitialOffset();
148 | 
149 |     TopicAndPartition topicAndPartition = new TopicAndPartition(topicName, partitionId);
150 | 
151 |     Map<TopicAndPartition, PartitionOffsetRequestInfo> offsetsToRequest = new HashMap<>();
152 |     if (initialOffset == OffsetRequest.EarliestTime() || initialOffset == OffsetRequest.LatestTime()) {
153 |       offsetsToRequest.put(topicAndPartition, new PartitionOffsetRequestInfo(initialOffset, 1));
154 |     }
155 | 
156 |     kafka.javaapi.OffsetRequest offsetRequest =
157 |       new kafka.javaapi.OffsetRequest(offsetsToRequest, OffsetRequest.CurrentVersion(), "offsetLookup");
158 |     OffsetResponse response = consumer.getOffsetsBefore(offsetRequest);
159 | 
160 |     if (response.errorCode(topicName, partitionId) != 0) {
161 |       throw new IllegalStateException(String.format(
162 |         "Could not find offset for topic '%s' and partition '%s'. Please check all brokers were included in the " +
163 |           "broker list.", topicName, partitionId));
164 |     }
165 | 
166 |     Map<TopicAndPartition, Long> offsets = new HashMap<>();
167 |     offsets.put(topicAndPartition, response.offsets(topicName, partitionId)[0]);
168 |     return offsets;
169 |   }
170 | 
171 |   private StructuredRecord kafkaMessageToRecord(MessageAndMetadata<byte[], byte[]> messageAndMetadata) {
172 |     return StructuredRecord.builder(TRANSFORMED_MESSAGE)
173 |       .set("message", messageAndMetadata.message())
174 |       .build();
175 |   }
176 | 
177 |   private int getPartitionId(SimpleConsumer consumer) {
178 |     Set<Integer> partitions = new HashSet<>();
179 |     TopicMetadataRequest topicMetadataRequest = new TopicMetadataRequest(Collections.singletonList(conf.getTopic()));
180 |     TopicMetadataResponse response = consumer.send(topicMetadataRequest);
181 | 
182 |     for (TopicMetadata topicMetadata : response.topicsMetadata()) {
183 |       for (PartitionMetadata partitionMetadata : topicMetadata.partitionsMetadata()) {
184 |         partitions.add(partitionMetadata.partitionId());
185 |       }
186 |     }
187 | 
188 |     if (partitions.size() != 1) {
189 |       throw new IllegalArgumentException(
190 |         String.format("Topic '%s' should only have one partition. Found '%s' partitions.",
191 |                       conf.getTopic(), partitions.size()));
192 |     }
193 |     return partitions.iterator().next();
194 |   }
195 | 
196 |   private static Function3<String, Optional<StructuredRecord>, State<Map<Long, String>>, StructuredRecord>
197 |   schemaStateFunction() {
198 |     return (key, value, state) -> {
199 |       StructuredRecord input = value.get();
200 |       Object message = input.get("message");
201 | 
202 |       byte[] messageBytes = BinaryMessages.getBytesFromBinaryMessage(message);
203 |       String messageBody = new String(messageBytes, StandardCharsets.UTF_8);
204 | 
205 |       if (messageBody.contains("generic_wrapper") && messageBody.contains("oracle.goldengate")) {
206 |         StructuredRecord.Builder builder = StructuredRecord.builder(GENERIC_WRAPPER_SCHEMA_MESSAGE);
207 |         builder.set("message", message);
208 |         return builder.build();
209 |       }
210 | 
211 |       if (messageBody.contains("\"type\" : \"record\"")) {
212 |         org.apache.avro.Schema avroSchema = new org.apache.avro.Schema.Parser().parse(messageBody);
213 |         long schemaFingerPrint = SchemaNormalization.parsingFingerprint64(avroSchema);
214 |         Map<Long, String> newState;
215 |         if (state.exists()) {
216 |           newState = state.get();
217 |         } else {
218 |           newState = new HashMap<>();
219 |         }
220 |         newState.put(schemaFingerPrint, messageBody);
221 |         state.update(newState);
222 |         LOG.debug("Schema mapping updated to {}", state.get());
223 | 
224 |         StructuredRecord.Builder builder = StructuredRecord.builder(DDL_SCHEMA_MESSAGE);
225 |         builder.set("message", message);
226 |         return builder.build();
227 |       }
228 | 
229 |       StructuredRecord.Builder stateBuilder = StructuredRecord.builder(STATE_SCHEMA);
230 |       stateBuilder.set("data", state.get());
231 | 
232 |       StructuredRecord.Builder builder = StructuredRecord.builder(DML_SCHEMA);
233 |       builder.set("message", message);
234 |       builder.set("staterecord", stateBuilder.build());
235 |       return builder.build();
236 |     };
237 |   }
238 | }
239 | 


--------------------------------------------------------------------------------
/src/test/java/io/cdap/plugin/cdc/performance/CDCPipelinePerfTest.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright © 2019 Cask Data, Inc.
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  5 |  * use this file except in compliance with the License. You may obtain a copy of
  6 |  * the License at
  7 |  *
  8 |  * http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 12 |  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 13 |  * License for the specific language governing permissions and limitations under
 14 |  * the License.
 15 |  */
 16 | 
 17 | package io.cdap.plugin.cdc.performance;
 18 | 
 19 | import com.google.bigtable.repackaged.com.google.cloud.ServiceOptions;
 20 | import com.google.bigtable.repackaged.com.google.cloud.bigtable.grpc.io.IOExceptionWithStatus;
 21 | import com.google.bigtable.repackaged.io.grpc.StatusRuntimeException;
 22 | import com.google.common.collect.ImmutableMap;
 23 | import io.cdap.cdap.etl.api.batch.SparkSink;
 24 | import io.cdap.cdap.etl.api.streaming.StreamingSource;
 25 | import io.cdap.cdap.etl.proto.v2.ETLPlugin;
 26 | import io.cdap.cdap.proto.ProgramRunStatus;
 27 | import io.cdap.cdap.test.SparkManager;
 28 | import io.cdap.plugin.cdc.common.BigtableOperations;
 29 | import io.cdap.plugin.cdc.sink.CDCBigTableConfig;
 30 | import io.cdap.plugin.cdc.source.sqlserver.CTSQLServerConfig;
 31 | import io.cdap.plugin.common.Constants;
 32 | import org.apache.hadoop.hbase.TableName;
 33 | import org.apache.hadoop.hbase.client.Connection;
 34 | import org.apache.hadoop.hbase.client.Result;
 35 | import org.apache.hadoop.hbase.client.ResultScanner;
 36 | import org.apache.hadoop.hbase.client.Scan;
 37 | import org.apache.hadoop.hbase.client.Table;
 38 | import org.awaitility.Awaitility;
 39 | import org.awaitility.Duration;
 40 | import org.junit.After;
 41 | import org.junit.Assert;
 42 | import org.junit.Assume;
 43 | import org.junit.Before;
 44 | import org.junit.Rule;
 45 | import org.junit.Test;
 46 | import org.junit.rules.TestName;
 47 | import org.slf4j.Logger;
 48 | import org.slf4j.LoggerFactory;
 49 | 
 50 | import java.io.IOException;
 51 | import java.sql.DriverManager;
 52 | import java.sql.SQLException;
 53 | import java.sql.Statement;
 54 | import java.util.Map;
 55 | import java.util.concurrent.TimeUnit;
 56 | 
 57 | public class CDCPipelinePerfTest extends CDCPluginPerfTestBase {
 58 |   private static final Logger LOG = LoggerFactory.getLogger(CDCPipelinePerfTest.class);
 59 |   private static final String APP_NAME = CDCPipelinePerfTest.class.getSimpleName();
 60 | 
 61 |   // Common properties
 62 |   private static final boolean TEST_DATA_LOAD =
 63 |     Boolean.parseBoolean(System.getProperty("ptest.test-data.load", "true"));
 64 |   private static final int TEST_DATA_INSERTS =
 65 |     Integer.parseInt(System.getProperty("ptest.test-data.inserts", "5000"));
 66 |   private static final int TEST_TARGET_TABLE_CREATED_TIMEOUT_SECONDS =
 67 |     Integer.parseInt(System.getProperty("ptest.target-table-created-timeout.seconds", "300"));
 68 |   private static final int TEST_DATA_TRANSFERED_TIMEOUT_SECONDS =
 69 |     Integer.parseInt(System.getProperty("ptest.data-transferred-timeout.seconds", "600"));
 70 | 
 71 |   // Bigtable properties
 72 |   private static final String BIGTABLE_PROJECT
 73 |     = System.getProperty("ptest.bigtable.project", ServiceOptions.getDefaultProjectId());
 74 |   private static final String BIGTABLE_INSTANCE = System.getProperty("ptest.bigtable.instance");
 75 |   private static final String BIGTABLE_SERVICE_ACCOUNT_FILE_PATH
 76 |     = System.getProperty("ptest.bigtable.serviceFilePath", System.getenv("CREDENTIAL_ENV_NAME"));
 77 | 
 78 |   // SQL Server properties
 79 |   private static final String SQL_HOST = System.getProperty("ptest.sql-server.host", "localhost");
 80 |   private static final String SQL_PORT = System.getProperty("ptest.sql-server.port", "1433");
 81 |   private static final String SQL_USERNAME = System.getProperty("ptest.sql-server.username", "SA");
 82 |   private static final String SQL_PASSWORD = System.getProperty("ptest.sql-server.password", "123Qwe123");
 83 | 
 84 |   @Rule
 85 |   public TestName testName = new TestName();
 86 | 
 87 |   private String dbName;
 88 |   private String dbTableName;
 89 |   private SparkManager programManager;
 90 |   private Connection connection;
 91 | 
 92 |   @Before
 93 |   @Override
 94 |   public void setUp() throws Exception {
 95 |     Assume.assumeNotNull(BIGTABLE_PROJECT);
 96 |     Assume.assumeNotNull(BIGTABLE_INSTANCE);
 97 | 
 98 |     super.setUp();
 99 | 
100 |     dbName = CDCPipelinePerfTest.class.getSimpleName() + '_' + testName.getMethodName();
101 |     dbTableName = testName.getMethodName();
102 | 
103 |     connection = BigtableOperations.connect(BIGTABLE_PROJECT, BIGTABLE_INSTANCE, BIGTABLE_SERVICE_ACCOUNT_FILE_PATH);
104 | 
105 |     // cleanup Bigtable
106 |     LOG.info("Cleaning up Bigtable");
107 |     BigtableOperations.dropTableIfExists(connection, dbTableName);
108 | 
109 |     if (TEST_DATA_LOAD) {
110 |       LOG.info("Preparing test data");
111 |       // cleanup SQL Server
112 |       LOG.info("Cleaning up SQL Server");
113 |       dropDatabaseIfExists(dbName);
114 |       // prepare data
115 |       LOG.info("Inserting test data ({} records)", TEST_DATA_INSERTS);
116 |       createDatabaseWithTracking(dbName);
117 |       createTableWithTracking(dbTableName);
118 |       try (java.sql.Connection connection = getConnectionToDb();
119 |            Statement statement = connection.createStatement()) {
120 |         for (int i = 0; i < TEST_DATA_INSERTS; i++) {
121 |           statement.executeUpdate(String.format("insert into %s(value) values ('initial value')", dbTableName));
122 |         }
123 |       }
124 |     }
125 | 
126 |     LOG.info("Deploying application");
127 | 
128 |     ImmutableMap<String, String> sourceProps = ImmutableMap.<String, String>builder()
129 |       .put(CTSQLServerConfig.HOST_NAME, SQL_HOST)
130 |       .put(CTSQLServerConfig.PORT, SQL_PORT)
131 |       .put(CTSQLServerConfig.USERNAME, SQL_USERNAME)
132 |       .put(CTSQLServerConfig.PASSWORD, SQL_PASSWORD)
133 |       .put(CTSQLServerConfig.DATABASE_NAME, dbName)
134 |       .put(Constants.Reference.REFERENCE_NAME, "CTSQLServerSource")
135 |       .build();
136 |     ETLPlugin sourceConfig = new ETLPlugin("CTSQLServer", StreamingSource.PLUGIN_TYPE, sourceProps);
137 | 
138 |     Map<String, String> sinkProps = ImmutableMap.<String, String>builder()
139 |       .put(CDCBigTableConfig.PROJECT, BIGTABLE_PROJECT)
140 |       .put(CDCBigTableConfig.INSTANCE, BIGTABLE_INSTANCE)
141 |       .put(CDCBigTableConfig.SERVICE_ACCOUNT_FILE_PATH, BIGTABLE_SERVICE_ACCOUNT_FILE_PATH)
142 |       .put(Constants.Reference.REFERENCE_NAME, "CDCBigTableSink")
143 |       .build();
144 |     ETLPlugin sinkConfig = new ETLPlugin("CDCBigTable", SparkSink.PLUGIN_TYPE, sinkProps);
145 | 
146 |     programManager = deployETL(sourceConfig, sinkConfig, APP_NAME);
147 |   }
148 | 
149 |   @After
150 |   @Override
151 |   public void tearDown() throws Exception {
152 |     if (programManager != null) {
153 |       programManager.stop();
154 |       programManager.waitForStopped(10, TimeUnit.SECONDS);
155 |       programManager.waitForRun(ProgramRunStatus.KILLED, 10, TimeUnit.SECONDS);
156 |     }
157 |     super.tearDown();
158 |     if (connection != null) {
159 |       connection.close();
160 |     }
161 |   }
162 | 
163 |   @Test
164 |   public void testSqlServerToBigtablePipeline() throws Exception {
165 |     long testStart = System.currentTimeMillis();
166 |     LOG.info("Starting pipeline");
167 |     programManager.startAndWaitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);
168 | 
169 |     LOG.info("Waiting until {} records are present in Bigtable", TEST_DATA_INSERTS);
170 | 
171 |     Awaitility.await()
172 |       .atMost(TEST_TARGET_TABLE_CREATED_TIMEOUT_SECONDS, TimeUnit.SECONDS)
173 |       .pollInterval(Duration.TEN_SECONDS)
174 |       .ignoreException(StatusRuntimeException.class)
175 |       .untilAsserted(() -> {
176 |         TableName tableName = TableName.valueOf(dbTableName);
177 |         Assert.assertTrue(String.format("Table '%s' was not created", tableName),
178 |                           connection.getAdmin().tableExists(tableName));
179 |         Assert.assertTrue(connection.getAdmin().isTableAvailable(tableName));
180 |         Assert.assertTrue(connection.getAdmin().isTableEnabled(tableName));
181 |     });
182 | 
183 |     Awaitility.await()
184 |       .atMost(TEST_DATA_TRANSFERED_TIMEOUT_SECONDS, TimeUnit.SECONDS)
185 |       .pollInterval(Duration.TEN_SECONDS)
186 |       .ignoreException(IOExceptionWithStatus.class)
187 |       .untilAsserted(() -> {
188 |         TableName tableName = TableName.valueOf(dbTableName);
189 |         Table table = connection.getTable(tableName);
190 |         int rowCount = getRowCount(table);
191 |         LOG.info("Currently {} records are present in Bigtable", rowCount);
192 |         Assert.assertEquals(TEST_DATA_INSERTS, rowCount);
193 |       });
194 | 
195 |     long testEnd = System.currentTimeMillis();
196 |     long elapsedSeconds = (testEnd - testStart) / 1000;
197 |     long recordsPerSecond = TEST_DATA_INSERTS / elapsedSeconds;
198 |     LOG.info("Test finished. Transferred '{}' records. Elapsed time is '{} seconds' ({} records/second)",
199 |              TEST_DATA_INSERTS, elapsedSeconds, recordsPerSecond);
200 |   }
201 | 
202 |   private static int getRowCount(Table table) throws IOException {
203 |     int rowCount = 0;
204 |     try (ResultScanner scanner = table.getScanner(new Scan())) {
205 |       for (Result rs = scanner.next(); rs != null; rs = scanner.next()) {
206 |         rowCount++;
207 |       }
208 |     }
209 |     return rowCount;
210 |   }
211 | 
212 |   private static void dropDatabaseIfExists(String dbName) throws SQLException {
213 |     try (java.sql.Connection connection = getConnectionToRoot();
214 |          Statement statement = connection.createStatement()) {
215 |       statement.executeUpdate(String.format("drop database if exists %s", dbName));
216 |     }
217 |   }
218 | 
219 |   private static java.sql.Connection getConnectionToRoot() throws SQLException {
220 |     String connectionString = String.format("jdbc:sqlserver://%s:%s", SQL_HOST, SQL_PORT);
221 |     return DriverManager.getConnection(connectionString, SQL_USERNAME, SQL_PASSWORD);
222 |   }
223 | 
224 |   private static void createDatabaseWithTracking(String dbName) throws SQLException {
225 |     try (java.sql.Connection connection = getConnectionToRoot();
226 |          Statement statement = connection.createStatement()) {
227 |       statement.executeUpdate(String.format("create database %s", dbName));
228 |       statement.executeUpdate(String.format("alter database %s set change_tracking = ON", dbName));
229 |     }
230 |   }
231 | 
232 |   private void createTableWithTracking(String tableName) throws SQLException {
233 |     try (java.sql.Connection connection = getConnectionToDb();
234 |          Statement statement = connection.createStatement()) {
235 |       statement.executeUpdate(String.format("create table %s (id bigint identity primary key, value text)",
236 |                                             tableName));
237 |       statement.executeUpdate(String.format("alter table %s enable change_tracking", tableName));
238 |     }
239 |   }
240 | 
241 |   private java.sql.Connection getConnectionToDb() throws SQLException {
242 |     String connectionString = String.format("jdbc:sqlserver://%s:%s;DatabaseName=%s", SQL_HOST, SQL_PORT, dbName);
243 |     return DriverManager.getConnection(connectionString, SQL_USERNAME, SQL_PASSWORD);
244 |   }
245 | }
246 | 


--------------------------------------------------------------------------------