├── .gitignore ├── LICENSE ├── README.md ├── contribution-guide └── README.md ├── datacollector ├── README.md ├── best-practices │ └── README.md ├── sample-pipelines │ ├── README.md │ └── pipelines │ │ ├── Citi Bike real-time system data (Basic) │ │ ├── Citi_Bike_real_time_system_data.zip │ │ ├── README.md │ │ └── images │ │ │ ├── import_from_archive.png │ │ │ ├── pipeline.png │ │ │ ├── preview.png │ │ │ ├── preview_data.png │ │ │ ├── running_pipeline.png │ │ │ ├── select_downloaded_archive.png │ │ │ └── start_pipeline.png │ │ ├── Date Conversions │ │ ├── DateConvef79157e5-c817-423a-a072-0c01e5d01745.json │ │ ├── README.md │ │ └── images │ │ │ ├── img1.png │ │ │ ├── img2a.png │ │ │ ├── img2b.png │ │ │ ├── img3a.png │ │ │ └── img3b.png │ │ ├── Drift Synchronization for Hive │ │ ├── Drift Synchronization for Hive.json │ │ ├── README.md │ │ ├── change_stage_library.png │ │ ├── pipeline.png │ │ └── running.png │ │ ├── HDFS to ADLS Gen2 │ │ ├── README.md │ │ ├── dataset │ │ │ └── tennis.csv │ │ ├── hdfs_to_adls_gen2_processors.json │ │ └── images │ │ │ └── ADLS_gen2_data_lake.jpeg │ │ ├── ML - TensorFlow Binary Classification │ │ ├── README.md │ │ ├── TensorFlowBreastCancerBinaryclassificationc6bb44b7-bf28-4b3a-8c8f-e419625b3096.json │ │ ├── dataset │ │ │ └── BreastCancer.csv │ │ └── model │ │ │ └── BreastCancer_TF_1.14 │ │ │ ├── saved_model.pb │ │ │ └── variables │ │ │ ├── variables.data-00000-of-00001 │ │ │ └── variables.index │ │ ├── MySQL CDC to Delta Lake │ │ ├── MySQL CDC (Binary Log) to DeltaLake.json │ │ ├── MySQL CDC to Delta Lake.png │ │ └── README.md │ │ ├── MySQL CDC to S3 to Snowflake │ │ ├── README.md │ │ ├── images │ │ │ ├── MySQLCDCtoS3toSnowflake_step2.png │ │ │ ├── MySQLCDCtoS3toSnowflake_step2a.png │ │ │ ├── MySQLCDCtoS3toSnowflake_step3.png │ │ │ ├── MySQLCDCtoS3toSnowflake_step4.png │ │ │ ├── MySQLCDCtoS3toSnowflake_step5.png │ │ │ ├── MySQLCDCtoS3toSnowflake_step5a1.png │ │ │ ├── MySQLCDCtoS3toSnowflake_step5a2.png │ │ │ ├── MySQLCDCtoS3toSnowflake_step6a1.png │ │ │ ├── MySQLCDCtoS3toSnowflake_step6a2.png │ │ │ ├── pipeline1.png │ │ │ └── pipeline2.png │ │ └── pipelines.zip │ │ ├── MySQL CDC to Snowflake │ │ ├── MySQL_CDC_to_Snowflake.zip │ │ ├── README.md │ │ └── images │ │ │ ├── MySQLtoSnowflake_step2.png │ │ │ ├── MySQLtoSnowflake_step2a.png │ │ │ ├── MySQLtoSnowflake_step3.png │ │ │ ├── MySQLtoSnowflake_step4.png │ │ │ ├── MySQLtoSnowflake_step4a.png │ │ │ ├── MySQLtoSnowflake_step5.png │ │ │ └── pipeline.png │ │ ├── MySQL Schema replication to Azure Synapse SQL │ │ ├── MySQL_Schema_Replication_to_Azure_Synapse_SQL.zip │ │ ├── README.md │ │ └── images │ │ │ ├── MySQL_Schema_Replication_to_Azure_Synapse_SQL.png │ │ │ ├── mysql_configs.png │ │ │ ├── parameters.png │ │ │ └── synapse_auto_create.png │ │ ├── MySQL Schema replication to Delta Lake │ │ ├── MySQL Schema Replication to Delta Lake.json │ │ ├── MySQL_Schema_replication_to_DeltaLake.png │ │ └── README.md │ │ ├── MySQL binlog to DeltaLake │ │ ├── MySQL_CDC_to_DeltaLake.zip │ │ ├── README.md │ │ └── images │ │ │ ├── MySQLtoDBDeltaLake_step2.png │ │ │ ├── MySQLtoDBDeltaLake_step2a.png │ │ │ ├── MySQLtoDBDeltaLake_step3.png │ │ │ ├── MySQLtoDBDeltaLake_step4.png │ │ │ ├── MySQLtoDBDeltaLake_step4a.png │ │ │ ├── MySQLtoDBDeltaLake_step5.png │ │ │ └── pipeline.png │ │ ├── NYC Taxi Ride Payment Type (Basic) │ │ ├── NYC_Taxi_Ride_Payment_Type_Basic.zip │ │ ├── README.md │ │ └── images │ │ │ ├── import_from_archive.png │ │ │ ├── parameter.png │ │ │ ├── pipeline.png │ │ │ ├── running_pipeline.png │ │ │ ├── 
select_downloaded_archive.png │ │ │ └── start_pipeline.png │ │ ├── NYC Taxi Ride Payment Type (with Jython) │ │ ├── NYC_Taxi_Ride_Payment_Type_with_Jython.zip │ │ ├── README.md │ │ └── images │ │ │ ├── import_from_archive.png │ │ │ ├── parameter.png │ │ │ ├── pipeline.png │ │ │ ├── running_pipeline.png │ │ │ ├── select_downloaded_archive.png │ │ │ └── start_pipeline.png │ │ ├── Oracle 19c Bulk Ingest and CDC to Databricks Delta Lake │ │ ├── Oracle19CDCToDatabricksDeltaLaked30dd1ff-7029-41b6-943d-86771ee2e8b6.json │ │ ├── Oracle19ToDatabricksDeltaLake87a09a35-d201-497f-82ea-c7c03de13517.json │ │ └── README.md │ │ ├── Oracle CDC to Delta Lake │ │ ├── OracleCDC_to_DeltaLake.zip │ │ ├── README.md │ │ └── images │ │ │ ├── OracletoDBDeltaLake_step2.png │ │ │ ├── OracletoDBDeltaLake_step2a.png │ │ │ ├── OracletoDBDeltaLake_step3.png │ │ │ ├── OracletoDBDeltaLake_step4.png │ │ │ ├── OracletoDBDeltaLake_step4a.png │ │ │ ├── OracletoDBDeltaLake_step5.png │ │ │ └── pipeline.png │ │ ├── Oracle CDC to Snowflake │ │ ├── OracleCDC_to_Snowflake.zip │ │ ├── README.md │ │ └── images │ │ │ ├── OracletoSnowflake_step2.png │ │ │ ├── OracletoSnowflake_step2a.png │ │ │ ├── OracletoSnowflake_step3.png │ │ │ ├── OracletoSnowflake_step4.png │ │ │ ├── OracletoSnowflake_step4a.png │ │ │ ├── OracletoSnowflake_step5.png │ │ │ └── pipeline.png │ │ ├── Parse Twitter Data To JSON │ │ ├── ParseTwit08ef5e13-c53c-4664-8acf-4b393ec7782f.json │ │ ├── README.md │ │ ├── images │ │ │ ├── img1.png │ │ │ ├── img2.png │ │ │ ├── img3.png │ │ │ ├── img4.png │ │ │ ├── img5.png │ │ │ └── img6.png │ │ └── output │ │ │ └── tweets-curated-6a29b1bb-da8e-11ea-8b72-417388e3a72a_335bc746-b9c2-4ef0-a5ef-5a049d35985d.json │ │ ├── Parse Web Logs To JSON And Avro │ │ ├── Parse Web Logs To JSON & Avro.json │ │ ├── README.md │ │ ├── images │ │ │ ├── img1.png │ │ │ ├── img2.png │ │ │ ├── img3.png │ │ │ ├── img4.png │ │ │ ├── img5.png │ │ │ └── img6.png │ │ └── output │ │ │ ├── added-to-cart-a8a11b6c-d8fc-11ea-9149-abc78c1550f2_19aa3278-119c-4820-b11b-d58637a7b275.avro │ │ │ └── page-views-a8a11b6c-d8fc-11ea-9149-abc78c1550f2_69b93e73-ac07-45ab-b89b-90550dc14ad9.json │ │ ├── PostgreSQL CDC to Delta Lake │ │ ├── PostgreSQL_CDC_to_DeltaLake.zip │ │ ├── README.md │ │ └── images │ │ │ ├── PostgreSQLtoDBDeltaLake_step2.png │ │ │ ├── PostgreSQLtoDBDeltaLake_step2a.png │ │ │ ├── PostgreSQLtoDBDeltaLake_step3.png │ │ │ ├── PostgreSQLtoDBDeltaLake_step4.png │ │ │ ├── PostgreSQLtoDBDeltaLake_step4a.png │ │ │ ├── PostgreSQLtoDBDeltaLake_step5.png │ │ │ └── pipeline.png │ │ ├── PostgreSQL CDC to Snowflake │ │ ├── PostgreSQL_CDC_to_Snowflake.zip │ │ ├── README.md │ │ └── images │ │ │ ├── PostgreSQLtoSnowflake_step2.png │ │ │ ├── PostgreSQLtoSnowflake_step2a.png │ │ │ ├── PostgreSQLtoSnowflake_step3.png │ │ │ ├── PostgreSQLtoSnowflake_step4.png │ │ │ ├── PostgreSQLtoSnowflake_step4a.png │ │ │ ├── PostgreSQLtoSnowflake_step5.png │ │ │ └── pipeline.png │ │ ├── SQLServer CDC to Delta Lake │ │ ├── README.md │ │ ├── SQL_Server_CDC_to_DeltaLake.zip │ │ └── images │ │ │ ├── import_from_archive.png │ │ │ ├── key_columns.png │ │ │ ├── parameters.png │ │ │ ├── pipeline.png │ │ │ ├── running_pipeline.png │ │ │ ├── select_downloaded_archive.png │ │ │ └── start_pipeline.png │ │ ├── SQLServer CDC to Snowflake │ │ ├── README.md │ │ ├── SQL_Server_CDC_to_Snowflake.zip │ │ └── images │ │ │ ├── get_key_columns.png │ │ │ ├── import_from_archive.png │ │ │ ├── key_columns.png │ │ │ ├── parameters.png │ │ │ ├── pipeline.png │ │ │ ├── running_pipeline.png │ │ │ ├── sample_preview.png │ │ 
│ ├── select_downloaded_archive.png │ │ │ ├── select_pipeline_to_import.png │ │ │ └── start_pipeline.png │ │ ├── Salesforce CDC to Delta Lake │ │ ├── README.md │ │ ├── Salesforce_CDC_to_DeltaLake.zip │ │ └── images │ │ │ ├── SalesforcetoDBDeltaLake_step2.png │ │ │ ├── SalesforcetoDBDeltaLake_step2a.png │ │ │ ├── SalesforcetoDBDeltaLake_step3.png │ │ │ ├── SalesforcetoDBDeltaLake_step4.png │ │ │ ├── SalesforcetoDBDeltaLake_step4a.png │ │ │ ├── SalesforcetoDBDeltaLake_step5.png │ │ │ └── pipeline.png │ │ ├── Salesforce CDC to Snowflake │ │ ├── README.md │ │ ├── Salesforce_CDC_to_Snowflake.zip │ │ └── images │ │ │ ├── SalesforcetoSnowflake_step2.png │ │ │ ├── SalesforcetoSnowflake_step2a.png │ │ │ ├── SalesforcetoSnowflake_step3.png │ │ │ ├── SalesforcetoSnowflake_step4.png │ │ │ ├── SalesforcetoSnowflake_step4a.png │ │ │ ├── SalesforcetoSnowflake_step5.png │ │ │ └── pipeline.png │ │ ├── Salesforce to Delta Lake │ │ ├── README.md │ │ ├── Salesforce Accounts to Delta Lake.json │ │ └── Salesforce_to_DeltaLake.png │ │ ├── Working with XML (Basic) │ │ ├── README.md │ │ ├── Working_with_XML.zip │ │ └── images │ │ │ ├── import_from_archive.png │ │ │ ├── pipeline.png │ │ │ ├── preview.png │ │ │ ├── preview_data.png │ │ │ ├── select_downloaded_archive.png │ │ │ └── start_pipeline.png │ │ └── aws-marketplace-reports │ │ ├── AWS Marketplace Data - Disbursed.json │ │ ├── AWS Marketplace Data.json │ │ └── create_tables.sql └── sample-scripts │ └── README.md ├── images └── Full Color Transparent.png └── transformer ├── README.md ├── best-practices └── README.md ├── sample-pipelines ├── README.md └── pipelines │ ├── Calculate Geographic Distance - UDF │ ├── Calculate_distance_between_airports.zip │ ├── README.md │ ├── airport.sql │ └── images │ │ ├── Transformer_Pipeline_Monitoring.png │ │ ├── Transformer_distance_table.png │ │ ├── Transformer_parameters.png │ │ └── pipeline.png │ ├── Clickstream Analysis on Amazon EMR, Amazon Redshift and Elasticsearch │ ├── ClickstreamLogsToESRedshiftEMRfe856fed-ca84-4689-88d1-432f6ae8e6cd.json │ ├── README.md │ └── Schematic_Log.csv │ ├── ML - Train NLP Model in PySpark │ ├── README.md │ ├── TrainNLPModelPySparkDB787ba4f1-dcb1-4d53-ab61-d80569daac14.json │ └── dataset │ │ ├── negative_tweets.json │ │ └── positive_tweets.json │ ├── ML - Train Random Forest Regression Model in Scala │ ├── README.md │ ├── TrainRandomForestRegressionModelScalaDB28582ef8-fecf-4fa8-94d1-1a58d803153d.json │ └── dataset │ │ └── Advertising_training.csv │ ├── Slowly Changing Dimensions - Type 2 │ ├── README.md │ ├── SCDType2588a6d29-c8b9-439e-8bec-8b1f7b9c0e99.json │ ├── images │ │ ├── img1a.png │ │ ├── img1b.png │ │ ├── img2.png │ │ ├── img2a.png │ │ ├── img2b.png │ │ ├── img2c.png │ │ ├── img3.png │ │ ├── img4.png │ │ └── img5.png │ └── output │ │ └── part-00000-d21cc8cc-75b9-4e69-aa56-55e5abe93bac-c000.csv │ ├── Spark ETL To Derive Sales Insights on Azure HDInsight And Power BI │ ├── README.md │ ├── SalesInsightsOnAzureSQLHDInsight37efd28a-9c98-494b-85bd-c6fd8a85af10.json │ └── dataset │ │ └── sales │ │ ├── sales_data0.csv │ │ ├── sales_data1.csv │ │ ├── sales_data10.csv │ │ ├── sales_data11.csv │ │ ├── sales_data12.csv │ │ ├── sales_data13.csv │ │ ├── sales_data14.csv │ │ ├── sales_data15.csv │ │ ├── sales_data16.csv │ │ ├── sales_data17.csv │ │ ├── sales_data18.csv │ │ ├── sales_data19.csv │ │ ├── sales_data2.csv │ │ ├── sales_data20.csv │ │ ├── sales_data21.csv │ │ ├── sales_data22.csv │ │ ├── sales_data23.csv │ │ ├── sales_data24.csv │ │ ├── sales_data25.csv │ │ ├── sales_data26.csv │ │ 
├── sales_data27.csv │ │ ├── sales_data28.csv │ │ ├── sales_data29.csv │ │ ├── sales_data3.csv │ │ ├── sales_data30.csv │ │ ├── sales_data31.csv │ │ ├── sales_data32.csv │ │ ├── sales_data33.csv │ │ ├── sales_data34.csv │ │ ├── sales_data35.csv │ │ ├── sales_data36.csv │ │ ├── sales_data37.csv │ │ ├── sales_data38.csv │ │ ├── sales_data39.csv │ │ ├── sales_data4.csv │ │ ├── sales_data40.csv │ │ ├── sales_data41.csv │ │ ├── sales_data42.csv │ │ ├── sales_data43.csv │ │ ├── sales_data44.csv │ │ ├── sales_data45.csv │ │ ├── sales_data46.csv │ │ ├── sales_data47.csv │ │ ├── sales_data48.csv │ │ ├── sales_data49.csv │ │ ├── sales_data5.csv │ │ ├── sales_data50.csv │ │ ├── sales_data51.csv │ │ ├── sales_data52.csv │ │ ├── sales_data53.csv │ │ ├── sales_data54.csv │ │ ├── sales_data55.csv │ │ ├── sales_data56.csv │ │ ├── sales_data57.csv │ │ ├── sales_data58.csv │ │ ├── sales_data59.csv │ │ ├── sales_data6.csv │ │ ├── sales_data60.csv │ │ ├── sales_data61.csv │ │ ├── sales_data62.csv │ │ ├── sales_data63.csv │ │ ├── sales_data64.csv │ │ ├── sales_data65.csv │ │ ├── sales_data66.csv │ │ ├── sales_data67.csv │ │ ├── sales_data68.csv │ │ ├── sales_data69.csv │ │ ├── sales_data7.csv │ │ ├── sales_data70.csv │ │ ├── sales_data71.csv │ │ ├── sales_data72.csv │ │ ├── sales_data73.csv │ │ ├── sales_data74.csv │ │ ├── sales_data75.csv │ │ ├── sales_data76.csv │ │ ├── sales_data77.csv │ │ ├── sales_data78.csv │ │ ├── sales_data79.csv │ │ ├── sales_data8.csv │ │ ├── sales_data80.csv │ │ ├── sales_data81.csv │ │ ├── sales_data82.csv │ │ ├── sales_data83.csv │ │ ├── sales_data84.csv │ │ ├── sales_data85.csv │ │ ├── sales_data86.csv │ │ ├── sales_data87.csv │ │ ├── sales_data88.csv │ │ ├── sales_data89.csv │ │ ├── sales_data9.csv │ │ ├── sales_data90.csv │ │ ├── sales_data91.csv │ │ ├── sales_data92.csv │ │ ├── sales_data93.csv │ │ ├── sales_data94.csv │ │ ├── sales_data95.csv │ │ ├── sales_data96.csv │ │ ├── sales_data97.csv │ │ ├── sales_data98.csv │ │ └── sales_data99.csv │ ├── Tx Retail Inventory - Join Agg Repartition │ ├── Data.zip │ ├── README.md │ ├── Tx_Retail_Inventory_join_agg_repartition.zip │ └── images │ │ ├── TxRetailInventory_step3.png │ │ ├── TxRetailInventory_step3a.png │ │ ├── TxRetailInventory_step4.png │ │ ├── TxRetailInventory_step5.png │ │ ├── TxRetailInventory_step5a.png │ │ └── pipeline.png │ ├── Tx Scala UDF │ ├── README.md │ ├── Tx_Scala_UDF.zip │ └── images │ │ ├── TxScalaUDF_step2.png │ │ ├── TxScalaUDF_step2a.png │ │ ├── TxScalaUDF_step3.png │ │ ├── TxScalaUDF_step4.png │ │ ├── TxScalaUDF_step4a.png │ │ └── pipeline.png │ └── Tx Slowly Changing Dimension - Type 1 │ ├── README.md │ ├── Tx_SCD_Type1.zip │ └── images │ ├── TxSCDType1_step2.png │ ├── TxSCDType1_step2a.png │ ├── TxSCDType1_step3.png │ ├── TxSCDType1_step4.png │ ├── TxSCDType1_step4a.png │ └── pipeline.png └── sample-scripts └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |  2 | 3 |
Pipeline Library
4 | 5 | This repository contains assets that will help you get started with StreamSets DataOps Platform. 6 | 7 | The following asset categories are currently available: 8 | 9 | ## Data Collector 3.x 10 | 11 | | Asset | Description | 12 | | --------------- | --------------- | 13 | | [Sample Pipelines](./datacollector/sample-pipelines) | Contains sample pipelines for Data Collector | 14 | | [Sample Scripts](./datacollector/sample-scripts) | Contain sample scripts | 15 | | [Best Practices](./datacollector/best-practices) | Contains best practices and configurations | 16 | 17 | ## Transformer 3.x 18 | 19 | | Asset | Description | 20 | | --------------- | --------------- | 21 | | [Sample Pipelines](./transformer/sample-pipelines) | Contains sample pipelines for Transformer | 22 | | [Sample Scripts](./transformer/sample-scripts) | Contain sample scripts | 23 | | [Best Practices](./transformer/best-practices) | Contains best practices and configurations | 24 | 25 | # Help 26 | 27 | For any queries, questions, comments related to these pipelines reach out to us on [Slack](https://streamsetters-slack.herokuapp.com/) 28 | -------------------------------------------------------------------------------- /contribution-guide/README.md: -------------------------------------------------------------------------------- 1 |  2 | 3 |Data Collector: Contribution Guide
4 | 5 | # Contributing to Streamsets 6 | 7 | **First off, thanks for taking the time to contribute!** 8 | 9 | The following is a set of guidelines for contributing to StreamSets, which are hosted in the [StreamSets Organization](https://github.com/streamsets/pipeline-library) on GitHub. These are mostly guidelines, not rules. Use your best judgment, and feel free to propose changes to this document in a pull request. 10 | 11 | #### Table Of Contents 12 | 13 | **COMING SOON** 14 | 15 | # Help 16 | 17 | For any queries, questions, comments related to these pipelines reach out on any of these channels: 18 | 19 | [Chat on Slack](https://streamsetters-slack.herokuapp.com/) 20 | 21 | [User Group](https://groups.google.com/a/streamsets.com/d/forum/sdc-user) 22 | 23 | [Ask StreamSets](https://ask.streamsets.com/questions/) 24 | -------------------------------------------------------------------------------- /datacollector/README.md: -------------------------------------------------------------------------------- 1 |  2 | 3 |Data Collector
4 | 5 | This repository contains assets that will help you get started with StreamSets Data Collector. 6 | 7 | The following folders are currently available: 8 | 9 | ## Data Collector 10 | 11 | | Asset | Description | 12 | | --------------- | --------------- | 13 | | [Best Practices](./best-practices) | Contains best practices and configurations | 14 | | [Sample Pipelines](./sample-pipelines) | Contains sample pipelines for Data Collector | 15 | | [Sample Scripts](./sample-scripts) | Contain sample scripts | 16 | 17 | # Help 18 | 19 | For any queries, questions, comments related to these pipelines reach out on any of these channels: 20 | 21 | [Chat on Slack](https://streamsetters-slack.herokuapp.com/) 22 | 23 | [User Group](https://groups.google.com/a/streamsets.com/d/forum/sdc-user) 24 | 25 | [Ask StreamSets](https://ask.streamsets.com/questions/) 26 | -------------------------------------------------------------------------------- /datacollector/best-practices/README.md: -------------------------------------------------------------------------------- 1 |  2 | 3 |Data Collector: Best Practices
4 | 5 | This folder contains Best Practices and Configurations for StreamSets Data Collector. 6 | 7 | The following best practices/configurations are currently available: 8 | | Name | Description | 9 | | --------------- | --------------- | 10 | 11 | **COMING SOON** 12 | 13 | # Help 14 | 15 | For any queries, questions, comments related to these pipelines reach out on any of these channels: 16 | 17 | [Chat on Slack](https://streamsetters-slack.herokuapp.com/) 18 | 19 | [User Group](https://groups.google.com/a/streamsets.com/d/forum/sdc-user) 20 | 21 | [Ask StreamSets](https://ask.streamsets.com/questions/) 22 | -------------------------------------------------------------------------------- /datacollector/sample-pipelines/README.md: -------------------------------------------------------------------------------- 1 |  2 | 3 |Data Collector: Sample Pipelines
4 | 5 | This folder contains pipeline templates and samples for StreamSets Data Collector. 6 | 7 | The following templates/samples are currently available: 8 | | Name | Description | 9 | | --------------- | --------------- | 10 | | [Citi Bike real-time system data (Basic)](./pipelines/Citi%20Bike%20real-time%20system%20data%20(Basic)) | Reads from Rest API with unstructured and hierarchical data and convert to relational format | 11 | | [Date Conversions](./pipelines/Date%20Conversions) | Convert dates from string to various datetime formats and timezones using Field Type Converter and Expression Evaluator processors | 12 | | [Drift Synchronization for Hive](./pipelines/Drift%20Synchronization%20for%20Hive) | Drift Synchronization from MySQL to the Cloudera distribution of Apache Hive and Apache Impala | 13 | | [Hadoop FS to ADLS Gen2](./pipelines/HDFS%20to%20ADLS%20Gen2) | Load data from Hadoop FS to ADLS Gen 2 by performing some transformations | 14 | | [ML - TensorFlow Binary Classification](./pipelines/ML%20-%20TensorFlow%20Binary%20Classification) | Load a pre-trained TensorFlow model to classify cancer condition as either benign or malignant | 15 | | [MySQL CDC to Delta Lake](./pipelines/MySQL%20CDC%20to%20Delta%20Lake) | Reads MySQL change data capture (CDC) data and writes to Databricks Delta Lake | 16 | | [MySQL CDC to S3 to Snowflake](./pipelines/MySQL%20CDC%20to%20S3%20to%20Snowflake) | Reads MySQL change data capture (CDC) data, writes to S3 then reads from S3 and writes to Snowflake | 17 | | [MySQL CDC to Snowflake](./pipelines/MySQL%20CDC%20to%20Snowflake) | Reads MySQL change data capture (CDC) data and writes to Snowflake | 18 | | [MySQL Schema Replication to Azure Synapse SQL](./pipelines/MySQL%20Schema%20replication%20to%20Azure%20Synapse%20SQL)| Bulk load data from MySQL into Azure Synapse SQL | 19 | | [MySQL Schema replication to Delta Lake](./pipelines/MySQL%20Schema%20replication%20to%20Delta%20Lake) | Bulk load data from MySQL into Databricks Delta Lake | 20 | | [MySQL binlog to DeltaLake](./pipelines/MySQL%20binlog%20to%20DeltaLake) | Reads MySQL binlog changed data and writes to Databricks Delta Lake | 21 | | [NYC Taxi Ride Payment Type (Basic)](./pipelines/NYC%20Taxi%20Ride%20Payment%20Type%20(Basic)) | Reads data from a directory, process it, route it, mask sensitive data and write into another file system with a different data format | 22 | | [NYC Taxi Ride Payment Type (with Jython)](./pipelines/NYC%20Taxi%20Ride%20Payment%20Type%20(with%20Jython)) | Reads data from a directory, process it using Jython, route it, mask sensitive data and write into another file system with a different data format | 23 | | [Oracle 19c Bulk Ingest and CDC to Databricks Delta Lake](./pipelines/Oracle%2019c%20Bulk%20Ingest%20and%20CDC%20to%20Databricks%20Delta%20Lake) | Bulk ingest data from Oracle 19c and process Change Data Capture (CDC) into Databricks Delta Lake | 24 | | [Oracle CDC to Delta Lake](./pipelines/Oracle%20CDC%20to%20Delta%20Lake) | Reads change data capture (CDC) data Oracle and writes to Databricks Delta Lake | 25 | | [Oracle CDC to Snowflake](./pipelines/Oracle%20CDC%20to%20Snowflake) | Reads change data capture (CDC) data Oracle and writes to Snowflake | 26 | | [Parse Twitter Data to JSON](./pipelines/Parse%20Twitter%20Data%20to%20JSON) | Parse raw Twitter data and store curated data in JSON format | 27 | | [Parse Web Logs to JSON and Avro](./pipelines/Parse%20Web%20Logs%20to%20JSON%20and%20Avro) | Parse raw web logs ingested in Common Log Format and store 
curated data in JSON and Avro formats | 28 | | [PostgreSQL CDC to Delta Lake](./pipelines/PostgreSQL%20CDC%20to%20Delta%20Lake) | Reads change data capture (CDC) data from PostgreSQL and writes to Databricks Delta Lake | 29 | | [PostgreSQL CDC to Snowflake](./pipelines/PostgreSQL%20CDC%20to%20Snowflake) | Reads change data capture (CDC) data from PostgreSQL and writes to Snowflake | 30 | | [SQLServer CDC to Delta Lake](./pipelines/SQLServer%20CDC%20to%20Delta%20Lake) | Reads change data capture (CDC) data from SQL Server and writes to Databricks Delta Lake | 31 | | [SQLServer CDC to Snowflake](./pipelines/SQLServer%20CDC%20to%20Snowflake) | Reads change data capture (CDC) data from SQL Server and writes to Snowflake | 32 | | [Salesforce CDC to Delta Lake](./pipelines/Salesforce%20CDC%20to%20Delta%20Lake) | Reads change data capture (CDC) data from Salesforce and writes to Databricks Delta Lake | 33 | | [Salesforce CDC to Snowflake](./pipelines/Salesforce%20CDC%20to%20Snowflake) | Reads change data capture (CDC) data from Salesforce and writes to Snowflake | 34 | | [Salesforce to Delta Lake](./pipelines/Salesforce%20to%20Delta%20Lake) | Bulk load data from Salesforce accounts into Databricks Delta Lake | 35 | | [Working with XML (Basic)](./pipelines/Working%20with%20XML%20(Basic)) | Read and process XML data in Data Collector 36 | | [aws-marketplace-reports](./pipelines/aws-marketplace-reports) | Bulk load data from Salesforce accounts into Databricks Delta Lake | 37 | 38 | # Help 39 | 40 | For any queries, questions, comments related to these pipelines reach out on any of these channels: 41 | 42 | [Chat on Slack](https://streamsetters-slack.herokuapp.com/) 43 | 44 | [User Group](https://groups.google.com/a/streamsets.com/d/forum/sdc-user) 45 | 46 | [Ask StreamSets](https://ask.streamsets.com/questions/) 47 | -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Citi Bike real-time system data (Basic)/Citi_Bike_real_time_system_data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Citi Bike real-time system data (Basic)/Citi_Bike_real_time_system_data.zip -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Citi Bike real-time system data (Basic)/README.md: -------------------------------------------------------------------------------- 1 | 2 |Citi Bike real-time system data (Basic)
3 | 4 | **Important:** *These instructions assume you have access to StreamSets Data Collector (v3.16+) and have performed all the prerequisites* 5 | 6 | - For help installing [StreamSets Data Collector](https://streamsets.com/products/dataops-platform/data-collector/), see [StreamSets Data Collector Installation](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Installation/Install_title.html). 7 | 8 | ## OVERVIEW 9 | 10 | This pipeline demonstrates how to: 11 | - Read from Rest APIs to get ‘real time’ data 12 | - Work easily with unstructured and hierarchical data 13 | - Easily convert hierarchical data into rows for relational stores 14 | 15 | ## PIPELINE 16 | 17 | ") 18 | 19 | ## DOCUMENTATION 20 | 21 | [HTTP Client Origin](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Origins/HTTPClient.html) 22 | 23 | ## STEP-BY-STEP 24 | 25 | ### Step 1: Download the pipeline 26 | 27 | [Click Here](./Citi_Bike_real_time_system_data.zip?raw=true) to download the pipeline and save it to your drive. 28 | 29 | ### Step 2: Import the pipeline 30 | 31 | Click the down arrow next to the "Create New Pipeline" and select "Import Pipeline". 32 | 33 |  34 | 35 | Click "Browse" and locate the pipeline file you just downloaded, then click "Import" 36 | 37 | ### Step 3: Preview the pipeline 38 | 39 | Click on the pipeline you just imported to open it and select the Preview icon 40 | 41 |  42 | 43 | Leave all default options in the preview configuration and click "Run Preview" 44 | 45 |  46 | 47 | While in the Preview, click on each processor to see how data is being parsed. 48 | Close the Preview to be able to run the pipeline 49 | 50 | ### Step 4: Run the pipeline 51 | 52 | Start the pipeline 53 | 54 |  55 | -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Citi Bike real-time system data (Basic)/images/import_from_archive.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Citi Bike real-time system data (Basic)/images/import_from_archive.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Citi Bike real-time system data (Basic)/images/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Citi Bike real-time system data (Basic)/images/pipeline.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Citi Bike real-time system data (Basic)/images/preview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Citi Bike real-time system data (Basic)/images/preview.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Citi Bike real-time system data (Basic)/images/preview_data.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Citi Bike real-time system data (Basic)/images/preview_data.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Citi Bike real-time system data (Basic)/images/running_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Citi Bike real-time system data (Basic)/images/running_pipeline.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Citi Bike real-time system data (Basic)/images/select_downloaded_archive.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Citi Bike real-time system data (Basic)/images/select_downloaded_archive.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Citi Bike real-time system data (Basic)/images/start_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Citi Bike real-time system data (Basic)/images/start_pipeline.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Date Conversions/README.md: -------------------------------------------------------------------------------- 1 | Date Conversions 2 | ================ 3 | 4 | This pipeline demonstrates how to convert dates from string to various datetime formats and timezones using Field Type Converter and Expression Evaluator processors. 5 | 6 | Setup And Technical Details 7 | --------------------------- 8 | 9 | * Download the [pipeline](DateConvef79157e5-c817-423a-a072-0c01e5d01745.json) and import it into your Data Collector 10 | 11 | Pipeline Overview 12 | ---------------- 13 | 14 | The pipeline has been prepopulated with sample dates in string format. 15 | 16 |  17 | 18 | Click on **Preview** icon to see how the data is being transformed as it is flowing through various stages in the pipeline. For details on data preview, refer to the [documentation](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Data_Preview/DataPreview_Title.html#concept_jjk_23z_sq). 19 | 20 | 21 | **Field Type Converter** 22 | 23 |  24 | 25 | Using [Field Type Converter](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Processors/FieldTypeConverter.html#concept_is3_zkp_wq) dates in columns *date1*, *date2*, and *date3* are converted to datetime using formats *EEE MMM dd HH:mm:ss Z yyyy*, *EEE MMM dd HH:mm:ss*, and *yyyy-MM-dd HH:mm:ss.SSSSSS* respectively. 
26 | 27 |  28 | 29 | 30 | **Expression Evaluator** 31 | 32 |  33 | 34 | Using [Expression Evaluator](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Processors/Expression.html#concept_zm2_pp3_wq) dates in columns *date3*, *date4*, *date5* and *date6* are converted to datetime in various timezones using expressions *${time:extractStringFromDateTZ(record:value('/date3'), 'GMT','MMM d, yyyy HH:mm:ss')}*, *${time:createDateFromStringTZ(record:value('/date4'), 'CST','yyyy-MM-dd HH:mm:ss.SSSSSS')}*, *${time:createDateFromStringTZ(record:value('/date5'), 'CST','dd-MMM-yy HH.mm.ss.SSSSSS a z')}*, and *${time:createDateFromStringTZ(record:value('/date6'), 'EST','dd-MMM-yy HH.mm.ss.SSSSSSSSS a z')}* respectively. 35 | 36 |  37 | -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Date Conversions/images/img1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Date Conversions/images/img1.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Date Conversions/images/img2a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Date Conversions/images/img2a.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Date Conversions/images/img2b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Date Conversions/images/img2b.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Date Conversions/images/img3a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Date Conversions/images/img3a.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Date Conversions/images/img3b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Date Conversions/images/img3b.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Drift Synchronization for Hive/change_stage_library.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Drift Synchronization for Hive/change_stage_library.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Drift Synchronization for Hive/pipeline.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Drift Synchronization for Hive/pipeline.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Drift Synchronization for Hive/running.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Drift Synchronization for Hive/running.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/HDFS to ADLS Gen2/README.md: -------------------------------------------------------------------------------- 1 | How to migrate to a Cloud Data Lake in Hours!! 2 | ============================================== 3 | 4 | This StreamSets Data Collector pipeline is designed to load data from HDFS to ADLS Gen 2. 5 | 6 | Prerequisites 7 | --------------------- 8 | 9 | * StreamSets Data Collector [installed, up and running](https://streamsets.com/getting-started/download-install-data-collector/) 10 | * Hadoop FS installed and accessible from above SDC (StreamSets Data Collector). More details are available in [documentation](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Origins/HDFSStandalone.html#concept_djz_pdm_hdb). 11 | * Microsoft Azure account. At the time of writing, you can [create a free Azure account](https://azure.microsoft.com/en-us/free/). Configure it according to [documentation](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Destinations/ADLS-G2-D.html#concept_vvn_21l_vhb) (One sample way is this [ADLS destination tutorial](https://github.com/streamsets/tutorials/tree/master/tutorial-adls-destination)). 12 | 13 | 14 | Setup 15 | --------------------- 16 | 17 | * [Download and import the pipeline](hdfs_to_adls_gen2_processors.json) into your instance of Data Collector 18 | * [Download the sample dataset](dataset) 19 | 20 | * After importing the pipeline into your environment and before running the pipeline, enter the following data lake configurations. 21 | : 22 | 23 |  24 | 25 | Also, take a look into Hadoop FS Destination stage configuration to see if you need to make any changes. 26 | 27 | Technical Details 28 | ------------------------------ 29 | 30 | For technical information and detailed explanation of this use case, read this [blog](https://streamsets.com/blog/how-to-migrate-to-a-cloud-data-lake-in-hours/). 
31 | -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/HDFS to ADLS Gen2/dataset/tennis.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/HDFS to ADLS Gen2/dataset/tennis.csv -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/HDFS to ADLS Gen2/images/ADLS_gen2_data_lake.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/HDFS to ADLS Gen2/images/ADLS_gen2_data_lake.jpeg -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/ML - TensorFlow Binary Classification/README.md: -------------------------------------------------------------------------------- 1 | TensorFlow Binary Classification Of Breast Cancer Condition 2 | ======================================================================= 3 | 4 | This StreamSets Data Collector pipeline is designed to load a pre-trained TensorFlow model to classify cancer condition as either *benign* or *malignant*. 5 | 6 | Prerequisites 7 | --------------------- 8 | 9 | * [StreamSets Data Collector](https://streamsets.com/products/dataops-platform/data-collector/). You can [deploy Data Collector](https://streamsets.com/products/dataops-platform/data-collector/download/) on your choice of cloud provider or you can download it for local development. 10 | 11 | Setup 12 | --------------------- 13 | 14 | * [Download and import the pipeline](TensorFlowBreastCancerBinaryclassificationc6bb44b7-bf28-4b3a-8c8f-e419625b3096.json) into your instance of Data Colelctor 15 | * [Download the sample dataset](dataset) 16 | * [Download the TensorFlow model](model) 17 | * After importing the pipeline into your environment and before running the pipeline, update the following pipeline parameters: 18 | 19 | ``` 20 | [ 21 | { 22 | "key": "INPUT_DATA_LOCATION", 23 | "value": "" 24 | }, 25 | { 26 | "key": "INPUT_DATA_FILE", 27 | "value": "" 28 | }, 29 | { 30 | "key": "KAFKA_TOPIC_BENIGN", 31 | "value": "" 32 | }, 33 | { 34 | "key": "KAFKA_TOPIC_MALIGNANT", 35 | "value": "" 36 | }, 37 | { 38 | "key": "KAFKA_BROKER_URI", 39 | "value": "" 40 | }, 41 | { 42 | "key": "TF_MODEL_LOCATION", 43 | "value": "" 44 | } 45 | ] 46 | ``` 47 | 48 | These pipeline parameters refer to the locations of source dataset, the TensorFlow model, Kafka topics as well as Kafka broker URI. 49 | 50 | Technical Details 51 | ------------------------------ 52 | 53 | For techincal information and detailed explanation of this use case, read this [blog](https://bit.ly/TFInSDC). 
54 | -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/ML - TensorFlow Binary Classification/dataset/BreastCancer.csv: -------------------------------------------------------------------------------- 1 | patient_id,first_name,last_name,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,mean_fractal_dimension,radius_error,texture_error,perimeter_error,area_error,smoothness_error,compactness_error,concavity_error,concave_points_error,symmetry_error,fractal_dimension_error,worst_radius,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension 2 | 123123123,John,Doe,-1.701923139999999890e+00,-5.777106560000000446e-01,-1.675466200000000017e+00,-1.314600989999999969e+00,-7.604850690000000135e-01,-8.636586240000000414e-01,-9.222962630000000051e-01,-1.120535480000000028e+00,-1.453964389999999884e-01,3.351444900000000171e-01,-9.526204900000000420e-01,-4.823731359999999802e-01,-9.226508960000000537e-01,-7.780981559999999853e-01,6.297042630000000418e-01,-4.942025270000000026e-01,-5.050396369999999857e-01,-9.288121129999999948e-01,6.187914509999999924e-01,-4.338434660000000109e-01,-1.548702070000000042e+00,-6.112336219999999765e-01,-1.522000030000000059e+00,-1.168588240000000056e+00,-1.196387559999999989e-01,-7.568081700000000023e-01,-9.674689220000000089e-01,-1.356760559999999893e+00,3.559366130000000128e-01,-5.230637270000000338e-01 3 | 456464645,Jane,Doe,1.051007989999999948e-01,8.968739380000000372e-01,5.552952149999999809e-02,1.995401539999999956e-03,-7.003895949999999759e-01,-7.287803379999999720e-01,-7.496175339999999743e-02,8.483616619999999331e-02,-6.572880359999999644e-01,-1.188578179999999929e+00,2.174647570000000016e-02,7.015755299999999473e-01,7.855029260000000180e-03,-1.809711440000000005e-01,4.133246259999999728e-01,-4.401056369999999940e-01,4.878417530000000307e-02,5.977213169999999742e-01,-3.568774900000000194e-01,-3.307869079999999906e-01,-1.249840740000000006e-01,9.487371599999999683e-01,-1.736646229999999902e-01,-2.390185999999999977e-01,-5.357940540000000196e-02,-6.091419070000000380e-01,-3.048685760000000022e-02,3.161804049999999977e-01,-6.500217230000000512e-01,-8.395523980000000330e-01 4 | 345789045,John,Smith,-1.058936599999999936e-02,1.015755179999999980e+00,1.047284159999999943e-02,-1.516264760000000100e-01,-1.280168479999999886e-01,5.215199590000000329e-01,2.688552479999999911e-01,3.916336850000000092e-01,1.534473170000000053e+00,2.370206749999999862e-01,4.052764310000000342e-02,1.086628209999999983e+00,3.248405639999999983e-01,-2.293575700000000106e-01,1.260371740000000074e+00,6.688806139999999845e-01,1.407293830000000134e-01,6.768163130000000027e-01,2.870785109999999918e+00,3.456777659999999974e-01,9.818883259999999780e-02,1.451127749999999939e+00,2.103739430000000077e-01,-1.433593989999999985e-01,1.034197899999999892e+00,1.024416130000000091e+00,4.736066629999999833e-01,9.428125619999999651e-01,2.949322829999999840e+00,1.051646929999999980e+00 5 | 
34567123,Jane,Smith,-7.548872680000000279e-02,-5.297009249999999891e-01,-4.851044840000000302e-02,-1.752387280000000103e-01,9.690919909999999859e-01,4.689699780000000096e-01,9.931360370000000271e-02,7.839625179999999693e-02,3.556805459999999863e-01,-1.877152669999999912e-01,5.667944710000000219e-02,-7.418421179999999948e-01,-1.715996850000000018e-01,-1.152522670000000055e-01,-2.335053559999999973e-01,8.790255240000000025e-02,-3.424327020000000193e-02,2.348148630000000125e-01,1.981963130000000128e-02,-3.935479949999999838e-01,1.596120559999999956e-04,-5.936339520000000203e-01,3.353153639999999821e-03,-1.168774729999999956e-01,8.272119310000000114e-01,4.990142940000000249e-01,2.213241250000000104e-01,3.389394419999999797e-01,2.966683669999999884e-01,-2.203834610000000027e-01 6 | 34450989,John&Jane,Doe&Smith,6.340902029999999703e-03,6.362496840000000375e-01,1.558830360000000026e-01,-5.205674059999999825e-02,6.246423229999999993e-01,1.887819469999999944e+00,1.539483300000000110e+00,9.560277780000000503e-01,5.034802320000000275e-01,1.441140059999999945e+00,1.103166099999999927e+00,6.747502099999999614e-02,1.265060560000000001e+00,4.875318650000000087e-01,7.745071000000000039e-01,2.554382640000000038e+00,1.128140970000000021e+00,1.019561299999999893e+00,-4.036721629999999728e-01,8.253008480000000313e-01,2.671328079999999994e-01,5.967437530000000434e-01,3.333862969999999981e-01,1.236216480000000006e-01,1.390918389999999949e+00,2.466799099999999800e+00,1.972212099999999912e+00,9.625370610000000271e-01,-6.214479760000000269e-02,1.637123349999999977e+00 -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/ML - TensorFlow Binary Classification/model/BreastCancer_TF_1.14/saved_model.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/ML - TensorFlow Binary Classification/model/BreastCancer_TF_1.14/saved_model.pb -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/ML - TensorFlow Binary Classification/model/BreastCancer_TF_1.14/variables/variables.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/ML - TensorFlow Binary Classification/model/BreastCancer_TF_1.14/variables/variables.data-00000-of-00001 -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/ML - TensorFlow Binary Classification/model/BreastCancer_TF_1.14/variables/variables.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/ML - TensorFlow Binary Classification/model/BreastCancer_TF_1.14/variables/variables.index -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/MySQL CDC to Delta Lake/MySQL CDC to Delta Lake.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/MySQL CDC to Delta Lake/MySQL CDC to Delta Lake.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/MySQL CDC to Delta Lake/README.md: -------------------------------------------------------------------------------- 1 | MySQL CDC to Delta Lake 2 | ============================== 3 | 4 | This pipeline demonstrates how to read change data capture (CDC) data from a MySQL database and replicate the changes to Delta Lake table(s) on Databricks. 5 | 6 | For more information, see [Loading Data into Databricks Delta Lake](https://streamsets.com/documentation/datacollector/latest/help/index.html?contextID=concept_a5b_wvk_ckb) in [StreamSets Data Collector documentation](https://streamsets.com/documentation/datacollector/latest/help/). 7 | 8 | Prerequisites 9 | ------------- 10 | 11 | * [StreamSets Data Collector](https://streamsets.com/products/dataops-platform/data-collector/) 3.15.0 or higher. You can [run Data Collector on your cloud provider of choice](https://streamsets.com/products/cloud/), or [download it for local use](https://streamsets.com/products/dataops-platform/data-collector/download/). 12 | * Ensure the [pre-requisites](https://streamsets.com/documentation/datacollector/latest/help/index.html?contextID=concept_xnp_y5f_dlb "pre-requisites") for Databricks Delta Lake are complete 13 | * [MySQL Server](https://www.mysql.com/) with Binary log enabled 14 | * [MySQL Connector/J](https://dev.mysql.com/downloads/connector/j/) JDBC Driver 15 | 16 | Setup 17 | ----- 18 | 19 | * [Download the pipeline](MySQL%20CDC%20(Binary%20Log)%20to%20DeltaLake.json) and import it into Data Collector or Control Hub 20 | * Configure all the pipeline parameters for your MySQL Database and Databricks connections 21 | * If necessary, update the MySQL binlog origin to replicate only specific tables 22 | * By default, the Databricks Delta Lake destination is configured to auto create each table that is replicated from MySQL and write the data in DBFS. If you'd like, update the configurations in the destination per your needs. 23 | * Configure Databricks Delta Lake destination to add a key column for each Delta Lake table being replicated. This is required for ensure the Merge command is run with the right conditional logic for Inserts, Updates and Deletes. 24 | * Start your Databricks cluster. 25 | 26 | Running the Pipeline 27 | -------------------- 28 | 29 | Start the pipeline. It takes a couple of seconds to create a connection to Databricks. Once the connection is established, you should see records replicated from MySQL and sent to Delta Lake. Insert, update and delete records in MySQL to see how they are being replicated in Delta Lake. 
30 | 31 |  32 | 33 | -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/MySQL CDC to S3 to Snowflake/images/MySQLCDCtoS3toSnowflake_step2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/MySQL CDC to S3 to Snowflake/images/MySQLCDCtoS3toSnowflake_step2.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/MySQL CDC to S3 to Snowflake/images/MySQLCDCtoS3toSnowflake_step2a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/MySQL CDC to S3 to Snowflake/images/MySQLCDCtoS3toSnowflake_step2a.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/MySQL CDC to S3 to Snowflake/images/MySQLCDCtoS3toSnowflake_step3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/MySQL CDC to S3 to Snowflake/images/MySQLCDCtoS3toSnowflake_step3.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/MySQL CDC to S3 to Snowflake/images/MySQLCDCtoS3toSnowflake_step4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/MySQL CDC to S3 to Snowflake/images/MySQLCDCtoS3toSnowflake_step4.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/MySQL CDC to S3 to Snowflake/images/MySQLCDCtoS3toSnowflake_step5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/MySQL CDC to S3 to Snowflake/images/MySQLCDCtoS3toSnowflake_step5.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/MySQL CDC to S3 to Snowflake/images/MySQLCDCtoS3toSnowflake_step5a1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/MySQL CDC to S3 to Snowflake/images/MySQLCDCtoS3toSnowflake_step5a1.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/MySQL CDC to S3 to Snowflake/images/MySQLCDCtoS3toSnowflake_step5a2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/MySQL CDC to S3 to Snowflake/images/MySQLCDCtoS3toSnowflake_step5a2.png -------------------------------------------------------------------------------- 
/datacollector/sample-pipelines/pipelines/MySQL CDC to S3 to Snowflake/images/MySQLCDCtoS3toSnowflake_step6a1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/MySQL CDC to S3 to Snowflake/images/MySQLCDCtoS3toSnowflake_step6a1.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/MySQL CDC to S3 to Snowflake/images/MySQLCDCtoS3toSnowflake_step6a2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/MySQL CDC to S3 to Snowflake/images/MySQLCDCtoS3toSnowflake_step6a2.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/MySQL CDC to S3 to Snowflake/images/pipeline1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/MySQL CDC to S3 to Snowflake/images/pipeline1.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/MySQL CDC to S3 to Snowflake/images/pipeline2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/MySQL CDC to S3 to Snowflake/images/pipeline2.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/MySQL CDC to S3 to Snowflake/pipelines.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/MySQL CDC to S3 to Snowflake/pipelines.zip -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/MySQL CDC to Snowflake/MySQL_CDC_to_Snowflake.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/MySQL CDC to Snowflake/MySQL_CDC_to_Snowflake.zip -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/MySQL CDC to Snowflake/README.md: -------------------------------------------------------------------------------- 1 |  2 | 3 |MySQL CDC to Snowflake
4 | 5 | # MySQL CDC to Snowflake 6 | 7 | **Important:** *These instructions assume you have access to StreamSets Data Collector (v3.15+) and have performed all the prerequisites for MySQL and Snowflake* 8 | 9 | - For help installing [StreamSets Data Collector](https://streamsets.com/products/dataops-platform/data-collector/), see [StreamSets Data Collector Installation](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Installation/Install_title.html). 10 | - For help with MySQL Binary Log prerequisites, see [MySQL Binary Log](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Origins/MySQLBinaryLog.html). 11 | - For help with Snowflake prerequisites, see [Snowflake](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Destinations/Snowflake.html). 12 | 13 | Here is a link to a short video on using this pipeline template: [Video Link](https://www.youtube.com/channel/UC_4K-__dngOCEmoZs7PVZAg) 14 | 15 | ## OVERVIEW 16 | 17 | This pipeline demonstrates how to read change data capture (CDC) data from a MySQL database and replicate the changes to Snowflake. 18 | 19 | **Disclaimer:** *This pipeline is meant to serve as a template for performing MySQL binlog CDC to Snowflake. Some of the parameters, tables and fields may be different for your environment and may need additional customizations. Please consult the StreamSets documentation (linked below) for full information on configuration of each stage used below.* 20 | 21 | ## USING THE TEMPLATE 22 | 23 | NOTE: [Templates](https://streamsets.com/documentation/controlhub/latest/onpremhelp/controlhub/UserGuide/Pipelines/PipelineTemplates.html) are supported in [StreamSets Control Hub](https://streamsets.com/products/dataops-platform/control-hub/). If you do not have Control Hub, you can import the template pipeline in Data Collector but will need to do that each time you want to use the template. 24 | 25 | ## PIPELINE 26 | 27 |  28 | 29 | ## DOCUMENTATION 30 | 31 | [MySQL Binlog Origin](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Origins/MySQLBinaryLog.html) 32 | 33 | [Expression Evaluator](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Processors/Expression.html) 34 | 35 | [StreamSelector](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Processors/StreamSelector.html) 36 | 37 | [Field Renamer](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Processors/FieldRenamer.html) 38 | 39 | [Snowflake Destination](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Destinations/Snowflake.html) 40 | 41 | ## STEP-BY-STEP 42 | 43 | ### Step 1: Download the pipeline 44 | 45 | [Click Here](./MySQL_CDC_to_Snowflake.zip?raw=true) to download the pipeline and save it to your drive. 46 | 47 | ### Step 2: Import the pipeline 48 | 49 | Click the down arrow next to the "Create New Pipeline" and select "Import Pipeline From Archive". 50 | 51 |  52 | 53 | Click "Browse" and locate the pipeline file you just downloaded, click "OK", then click "Import" 54 | 55 |  56 | 57 | ### Step 3: Configure the parameters 58 | 59 | Click on the pipeline you just imported to open it and click on the "Parameters" tab and fill in the appropriate information for your environment. 
60 | 61 | **Important:** *The pipeline template uses the most common default settings for things like the Snowflake region, staging location, etc. All of these are configurable and if you need to change those, you can opt to not use the built-in parameters and choose the appropriate settings yourself. Please refer to the documentation listed in this document for all the available options.* 62 | 63 |  64 | 65 | The following parameters are set up for this pipeline:

| Parameter | Description |
| --- | --- |
| `mysql_hostname` | MySQL server hostname. |
| `mysql_port` | MySQL server port. |
| `mysql_serverid` | Replication server ID that the origin uses to connect to the master MySQL server. Must be unique from the server ID of the replication master and of all the other replication slaves. When the MySQL server database is enabled for GTID, the server ID is optional. |
| `mysql_username` | MySQL username. The user must have the MySQL privileges required by the MySQL Binary Log origin (see the documentation linked above). |
| `mysql_password` | MySQL password. Tip: To secure sensitive information such as user names and passwords, you can use runtime resources or credential stores. |
| `snowflake_account` | Snowflake account name. |
| `snowflake_user` | Snowflake user name. |
| `snowflake_password` | Snowflake password. |
| `snowflake_warehouse` | Snowflake warehouse. |
| `snowflake_database` | Snowflake database. |
| `snowflake_schema` | Snowflake schema. |
| `snowflake_stage_name` | Name of the Snowflake stage used to stage the data. Unless using a Snowflake internal user stage, you create this stage as part of the Snowflake prerequisite tasks. To use a Snowflake internal user stage, enter a tilde (`~`). |
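Before importing the pipeline, it can save time to confirm that the MySQL side is actually ready for binlog-based CDC (binary logging enabled in row-based format, and a user with the replication privileges described in the MySQL Binary Log documentation linked above). Below is a minimal, optional pre-flight sketch, assuming the `mysql-connector-python` package is installed; host, port and credentials are placeholders matching the parameters above:

```python
# Optional pre-flight check for the MySQL Binary Log origin prerequisites.
# Assumes: `pip install mysql-connector-python`; connection values are placeholders.
import mysql.connector

conn = mysql.connector.connect(
    host="mysql.example.com",   # placeholder: same value as mysql_hostname
    port=3306,                  # placeholder: same value as mysql_port
    user="sdc",                 # placeholder: same value as mysql_username
    password="REPLACE_ME",      # placeholder: same value as mysql_password
)
cur = conn.cursor()

# Binary logging must be enabled and row-based for binlog CDC to work.
for stmt in (
    "SHOW VARIABLES LIKE 'log_bin'",
    "SHOW VARIABLES LIKE 'binlog_format'",
    "SHOW VARIABLES LIKE 'server_id'",
):
    cur.execute(stmt)
    print(cur.fetchone())

# The connecting user's grants should include the privileges listed in the
# MySQL Binary Log origin documentation.
cur.execute("SHOW GRANTS FOR CURRENT_USER()")
for (grant,) in cur:
    print(grant)

cur.close()
conn.close()
```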
NYC Taxi Ride Payment Type (Basic)
3 | 4 | **Important:** *These instructions assume you have access to StreamSets Data Collector (v3.16+) and have performed all the prerequisites* 5 | 6 | - For help installing [StreamSets Data Collector](https://streamsets.com/products/dataops-platform/data-collector/), see [StreamSets Data Collector Installation](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Installation/Install_title.html). 7 | - Download a sample data set from the following [location](https://www.streamsets.com/documentation/datacollector/sample_data/tutorial/nyc_taxi_data.csv). Place the file on the same host where Data Collector is running and provide read permissions to it. 8 | 9 | ## OVERVIEW 10 | 11 | This pipeline demonstrates how to read data from a directory, process it, route it, mask sensitive data and write into another file system with a different data format. The pipeline also shows an example of how to use & configure a Data Rule. 12 | 13 | ## PIPELINE 14 | 15 | ") 16 | 17 | ## DOCUMENTATION 18 | 19 | [Tutorial](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Tutorial/BasicTutorial.html) 20 | 21 | ## STEP-BY-STEP 22 | 23 | ### Step 1: Download the pipeline 24 | 25 | [Click Here](./NYC_Taxi_Ride_Payment_Type_Basic.zip?raw=true) to download the pipeline and save it to your drive. 26 | 27 | ### Step 2: Import the pipeline 28 | 29 | Click the down arrow next to the "Create New Pipeline" and select "Import Pipeline". 30 | 31 |  32 | 33 | Click "Browse" and locate the pipeline file you just downloaded, then click "Import" 34 | 35 | ### Step 3: Configure the parameters 36 | 37 | Click on the pipeline you just imported to open it and click on the "Parameters" tab and fill in the appropriate folder path pointing to the absolute path to the directory where the sample file was downloaded. 38 | 39 |  40 | 41 | ### Step 4: Run the pipeline 42 | 43 | Click the "START" button to run the pipeline. 
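If you have not yet staged the sample data set mentioned in the prerequisites, the sketch below fetches it with only the Python standard library; the target directory is a placeholder and must match the directory you configure in the Parameters tab:

```python
# Fetch the NYC taxi sample data set referenced in the prerequisites and make it
# readable by the Data Collector process. The target directory is a placeholder.
import os
import urllib.request

url = "https://www.streamsets.com/documentation/datacollector/sample_data/tutorial/nyc_taxi_data.csv"
target_dir = "/data/nyc_taxi"          # placeholder; any directory on the SDC host
os.makedirs(target_dir, exist_ok=True)

target_file = os.path.join(target_dir, "nyc_taxi_data.csv")
urllib.request.urlretrieve(url, target_file)
os.chmod(target_file, 0o644)           # grant read permissions

print("Downloaded", target_file, os.path.getsize(target_file), "bytes")
```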
44 | 45 |  46 | 47 |  48 | -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/NYC Taxi Ride Payment Type (Basic)/images/import_from_archive.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/NYC Taxi Ride Payment Type (Basic)/images/import_from_archive.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/NYC Taxi Ride Payment Type (Basic)/images/parameter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/NYC Taxi Ride Payment Type (Basic)/images/parameter.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/NYC Taxi Ride Payment Type (Basic)/images/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/NYC Taxi Ride Payment Type (Basic)/images/pipeline.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/NYC Taxi Ride Payment Type (Basic)/images/running_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/NYC Taxi Ride Payment Type (Basic)/images/running_pipeline.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/NYC Taxi Ride Payment Type (Basic)/images/select_downloaded_archive.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/NYC Taxi Ride Payment Type (Basic)/images/select_downloaded_archive.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/NYC Taxi Ride Payment Type (Basic)/images/start_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/NYC Taxi Ride Payment Type (Basic)/images/start_pipeline.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/NYC Taxi Ride Payment Type (with Jython)/NYC_Taxi_Ride_Payment_Type_with_Jython.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/NYC Taxi Ride Payment Type (with Jython)/NYC_Taxi_Ride_Payment_Type_with_Jython.zip -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/NYC Taxi Ride Payment Type (with 
Jython)/README.md: -------------------------------------------------------------------------------- 1 | 2 |NYC Taxi Ride Payment Type (with Jython processor)
3 | 4 | **Important:** *These instructions assume you have access to StreamSets Data Collector (v3.16+) and have performed all the prerequisites* 5 | 6 | - For help installing [StreamSets Data Collector](https://streamsets.com/products/dataops-platform/data-collector/), see [StreamSets Data Collector Installation](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Installation/Install_title.html). 7 | - Ensure Jython stage library is installed. For instruction on how to install a stage library, refer to the [docs](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Installation/AddtionalStageLibs.html) 8 | - Download a sample data set from the following [location](https://www.streamsets.com/documentation/datacollector/sample_data/tutorial/nyc_taxi_data.csv). Place the file on the same host where Data Collector is running and provide read permissions to it. 9 | 10 | ## OVERVIEW 11 | 12 | This pipeline demonstrates how to read data from a directory, process it, route it, mask sensitive data and write into another file system with a different data format. The pipeline also shows an example of how to use & configure a Data Rule. 13 | 14 | ## PIPELINE 15 | 16 | ") 17 | 18 | ## DOCUMENTATION 19 | 20 | [Tutorial](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Tutorial/BasicTutorial.html) 21 | 22 | ## STEP-BY-STEP 23 | 24 | ### Step 1: Download the pipeline 25 | 26 | [Click Here](./NYC_Taxi_Ride_Payment_Type_with_Jython.zip?raw=true) to download the pipeline and save it to your drive. 27 | 28 | ### Step 2: Import the pipeline 29 | 30 | Click the down arrow next to the "Create New Pipeline" and select "Import Pipeline". 31 | 32 |  33 | 34 | Click "Browse" and locate the pipeline file you just downloaded, then click "Import" 35 | 36 | ### Step 3: Configure the parameters 37 | 38 | Click on the pipeline you just imported to open it and click on the "Parameters" tab and fill in the appropriate folder path pointing to the absolute path to the directory where the sample file was downloaded. 39 | 40 |  41 | 42 | ### Step 4: Run the pipeline 43 | 44 | Click the "START" button to run the pipeline. 
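This template differs from the Basic version mainly in that part of the record processing is done in a Jython Evaluator stage. As an illustration only (this is not the exact script shipped in the archive; the `/credit_card` field name and the prefix-to-type mapping are assumptions), a Jython Evaluator script in SDC 3.x typically follows this pattern:

```python
# Illustrative Jython Evaluator script pattern (SDC 3.x bindings: records, output, error).
# The /credit_card field and the card-type mapping are assumptions for this sketch.
CARD_TYPES = {'4': 'Visa', '5': 'MasterCard', '3': 'AMEX', '6': 'Other'}

for record in records:
    try:
        card_number = record.value['credit_card']
        if card_number:
            # Derive a card type from the leading digit, defaulting to 'Unknown'.
            record.value['credit_card_type'] = CARD_TYPES.get(str(card_number)[0], 'Unknown')
        output.write(record)
    except Exception as e:
        # Send problem records to the stage's error handling.
        error.write(record, str(e))
```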
45 | 46 |  47 | 48 |  49 | -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/NYC Taxi Ride Payment Type (with Jython)/images/import_from_archive.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/NYC Taxi Ride Payment Type (with Jython)/images/import_from_archive.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/NYC Taxi Ride Payment Type (with Jython)/images/parameter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/NYC Taxi Ride Payment Type (with Jython)/images/parameter.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/NYC Taxi Ride Payment Type (with Jython)/images/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/NYC Taxi Ride Payment Type (with Jython)/images/pipeline.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/NYC Taxi Ride Payment Type (with Jython)/images/running_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/NYC Taxi Ride Payment Type (with Jython)/images/running_pipeline.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/NYC Taxi Ride Payment Type (with Jython)/images/select_downloaded_archive.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/NYC Taxi Ride Payment Type (with Jython)/images/select_downloaded_archive.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/NYC Taxi Ride Payment Type (with Jython)/images/start_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/NYC Taxi Ride Payment Type (with Jython)/images/start_pipeline.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Oracle 19c Bulk Ingest and CDC to Databricks Delta Lake/README.md: -------------------------------------------------------------------------------- 1 | Oracle 19c Bulk Ingest And Change Data Capture To Databricks Delta Lake 2 | ======================================================================= 3 | 4 | These pipelines demonstrate how to bulk ingest data from Oracle 19c and process Change Data Capture (CDC) into Databricks Delta Lake. 
5 | 6 | Prerequisites 7 | --------------------- 8 | 9 | * [StreamSets Data Collector](https://streamsets.com/products/dataops-platform/data-collector/) 3.16.0 or higher. You can [deploy Data Collector on the cloud provider of your choice](https://streamsets.com/products/cloud/), or [you can download it for local use](https://streamsets.com/products/dataops-platform/data-collector/download/). 10 | * Access to a Databricks cluster with Databricks Runtime 6.3 or higher 11 | * Ensure the [prerequisites](https://streamsets.com/documentation/datacollector/latest/help/index.html?contextID=concept_xnp_y5f_dlb "pre-requisites") for Databricks Delta Lake are satisfied 12 | * Access to an Oracle 19c database 13 | 14 | Setup 15 | --------------------- 16 | 17 | * Download the pipelines and import them into your Data Collector or Control Hub 18 | * After importing the pipelines into your environment and before running the pipelines, update the pipeline parameters with your Oracle 19c JDBC URL, Databricks cluster JDBC URL, staging information on the Databricks Delta Lake destination >> Staging tab and Table/Key columns information on the Databricks Delta Lake destination >> Data tab. 19 | * Start your Databricks cluster 20 | 21 | Technical Details 22 | --------------------- 23 | 24 | For technical info and a detailed explanation, refer to this [blog](https://bit.ly/2ZMAWDk). 25 | -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Oracle CDC to Delta Lake/OracleCDC_to_DeltaLake.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Oracle CDC to Delta Lake/OracleCDC_to_DeltaLake.zip -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Oracle CDC to Delta Lake/images/OracletoDBDeltaLake_step2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Oracle CDC to Delta Lake/images/OracletoDBDeltaLake_step2.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Oracle CDC to Delta Lake/images/OracletoDBDeltaLake_step2a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Oracle CDC to Delta Lake/images/OracletoDBDeltaLake_step2a.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Oracle CDC to Delta Lake/images/OracletoDBDeltaLake_step3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Oracle CDC to Delta Lake/images/OracletoDBDeltaLake_step3.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Oracle CDC to Delta Lake/images/OracletoDBDeltaLake_step4.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Oracle CDC to Delta Lake/images/OracletoDBDeltaLake_step4.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Oracle CDC to Delta Lake/images/OracletoDBDeltaLake_step4a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Oracle CDC to Delta Lake/images/OracletoDBDeltaLake_step4a.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Oracle CDC to Delta Lake/images/OracletoDBDeltaLake_step5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Oracle CDC to Delta Lake/images/OracletoDBDeltaLake_step5.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Oracle CDC to Delta Lake/images/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Oracle CDC to Delta Lake/images/pipeline.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Oracle CDC to Snowflake/OracleCDC_to_Snowflake.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Oracle CDC to Snowflake/OracleCDC_to_Snowflake.zip -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Oracle CDC to Snowflake/images/OracletoSnowflake_step2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Oracle CDC to Snowflake/images/OracletoSnowflake_step2.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Oracle CDC to Snowflake/images/OracletoSnowflake_step2a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Oracle CDC to Snowflake/images/OracletoSnowflake_step2a.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Oracle CDC to Snowflake/images/OracletoSnowflake_step3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Oracle CDC to Snowflake/images/OracletoSnowflake_step3.png -------------------------------------------------------------------------------- 
/datacollector/sample-pipelines/pipelines/Oracle CDC to Snowflake/images/OracletoSnowflake_step4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Oracle CDC to Snowflake/images/OracletoSnowflake_step4.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Oracle CDC to Snowflake/images/OracletoSnowflake_step4a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Oracle CDC to Snowflake/images/OracletoSnowflake_step4a.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Oracle CDC to Snowflake/images/OracletoSnowflake_step5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Oracle CDC to Snowflake/images/OracletoSnowflake_step5.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Oracle CDC to Snowflake/images/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Oracle CDC to Snowflake/images/pipeline.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Parse Twitter Data To JSON/README.md: -------------------------------------------------------------------------------- 1 | Parse Twitter Data To JSON 2 | ========================== 3 | 4 | This pipeline demonstrates how to parse raw Twitter data and store curated data in JSON format. 5 | 6 | Setup And Technical Details 7 | --------------------------- 8 | 9 | * Download the [pipeline](ParseTwit08ef5e13-c53c-4664-8acf-4b393ec7782f.json) and import it into your Data Collector 10 | * Update the following pipeline parameters 11 | * OUTPUT_FOLDER 12 | * This is the path to the folder where you'd like JSON file to be created 13 | * OUTPUT_FILE_PREFIX 14 | * This is the prefix you'd like to add to the JSON output file 15 | 16 | Pipeline Overview 17 | ----------------- 18 | 19 | The pipeline has been prepopulated with sample Twitter data using Dev Raw Data Source origin so you don't have to worry about loading it using HTTP Client origin. 20 | 21 | Note that Twitter's API returns tweets in a nested structure within ***statuses*** list. 22 | 23 |  24 | 25 | Pipeline Preview 26 | ---------------- 27 | 28 | Once you have updated the pipeline parameters, click on **Preview** icon to see how the data is being transformed as it is flowing through various stages in the pipeline. For details on data preview, refer to the [documentation](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Data_Preview/DataPreview_Title.html#concept_jjk_23z_sq). 
29 | 30 | **Field Pivoter** 31 | 32 |  33 | 34 | Using [Field Pivoter](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Processors/ListPivoter.html#concept_ekg_313_qw), a tweet record is created for each nested item within the *statuses* column. The configuration attribute of interest here is **Field To Pivot** set to ***/statuses*** on the **Field To Pivot** tab. 35 | 36 | 37 | **Field Remover** 38 | 39 |  40 | 41 | Using [Field Remover](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Processors/FieldRemover.html#concept_jdd_blr_wq), only the columns that will be part of the output record are kept. 42 | 43 | 44 | **Field Flattener** 45 | 46 |  47 | 48 | Using [Field Flattener](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Processors/FieldFlattener.html#concept_njn_3kk_fx), the nested structure of the *user* map field is flattened. The configuration attribute of interest here is **Name separator** set to ***_*** on the **Flatten** tab. 49 | 50 | 51 | **Field Renamer** 52 | 53 |  54 | 55 | Using [Field Renamer](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Processors/FieldRenamer.html#concept_vyv_zsg_ht), the column *user_screen_name* is renamed to *user_screenname*. 56 | 57 | 58 | **Local FS** 59 | 60 |  61 | 62 | The curated Twitter data is stored in the local filesystem using the [Local FS](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Destinations/LocalFS.html#concept_zvc_bv5_1r) destination in JSON format. The configuration attribute of interest here is **Data Format** set to ***JSON*** on the **Data Format** tab. 63 | 64 | 65 | Pipeline Run 66 | ------------ 67 | 68 | Provided you've updated the pipeline parameters and there aren't any validation errors, running the pipeline should create one JSON output file. 69 | 70 | **JSON** -- [Sample output file](output/tweets-curated-6a29b1bb-da8e-11ea-8b72-417388e3a72a_335bc746-b9c2-4ef0-a5ef-5a049d35985d.json).
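The same reshaping can be written out in a few lines of plain Python, which may help when reasoning about what each processor contributes. This is an illustrative sketch on a toy record, not part of the pipeline:

```python
# Plain-Python equivalent of the processor chain described above:
# pivot /statuses, keep a subset of fields, flatten the user map with "_",
# rename user_screen_name, and emit JSON.
import json

raw = {
    "statuses": [
        {"id": 1, "text": "hello", "user": {"screen_name": "alice", "followers_count": 10}},
        {"id": 2, "text": "world", "user": {"screen_name": "bob", "followers_count": 20}},
    ]
}

curated = []
for status in raw["statuses"]:                                   # Field Pivoter: one record per status
    kept = {"id": status["id"], "text": status["text"], "user": status["user"]}  # Field Remover
    flat = {k: v for k, v in kept.items() if k != "user"}
    for k, v in kept["user"].items():                            # Field Flattener with "_" separator
        flat["user_" + k] = v
    flat["user_screenname"] = flat.pop("user_screen_name")       # Field Renamer
    curated.append(flat)

print(json.dumps(curated, indent=2))                             # Local FS destination writes JSON
```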
71 | -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Parse Twitter Data To JSON/images/img1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Parse Twitter Data To JSON/images/img1.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Parse Twitter Data To JSON/images/img2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Parse Twitter Data To JSON/images/img2.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Parse Twitter Data To JSON/images/img3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Parse Twitter Data To JSON/images/img3.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Parse Twitter Data To JSON/images/img4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Parse Twitter Data To JSON/images/img4.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Parse Twitter Data To JSON/images/img5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Parse Twitter Data To JSON/images/img5.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Parse Twitter Data To JSON/images/img6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Parse Twitter Data To JSON/images/img6.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Parse Web Logs To JSON And Avro/README.md: -------------------------------------------------------------------------------- 1 | Parse Web Logs To JSON And Avro 2 | =============================== 3 | 4 | This pipeline demonstrates how to parse raw web logs ingested in Common Log Format and store curated data in JSON and Avro formats. 
5 | 6 | Setup And Technical Details 7 | --------------------------- 8 | 9 | * Download the [pipeline](Parse%20Web%20Logs%20To%20JSON%20%26%20Avro.json) and import it into your Data Collector 10 | * Update the following pipeline parameters 11 | * ADDED_TO_CART_FS_FOLDER 12 | * This is the path to the folder where you'd like Avro output file to be created 13 | * PAGE_VIEWS_FS_FOLDER 14 | * This is the path to the folder where you'd like JSON output file to be created 15 | * ADDED_TO_CART_FILE_PREFIX 16 | * This is the prefix you'd like to add to the Avro output file 17 | * PAGE_VIEWS_FILE_PREFIX 18 | * This is the prefix you'd like to add to the JSON output file 19 | 20 | 21 | Pipeline Overview 22 | ----------------- 23 | 24 | The pipeline has been prepopulated with sample web logs using Dev Raw Data Source origin so you don't have to worry about loading it. 25 | 26 |  27 | 28 | Note that the origin's Data Format has been configured to **Log** >> **Common Log Format**. This makes it really easy to have the web logs automatically parsed. For a complete list of origins that support this and other formats, refer to the [documentation](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Apx-DataFormats/DataFormat_Title.html#concept_kgd_11c_kv). 29 | 30 |  31 | 32 | 33 | Pipeline Preview 34 | ---------------- 35 | 36 | Once you have updated the pipeline parameters, click on **Preview** icon to see how the data is being transformed as it is flowing through various stages in the pipeline. For details on data preview, refer to the [documentation](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Data_Preview/DataPreview_Title.html#concept_jjk_23z_sq). 37 | 38 | 39 | **Origin** 40 | 41 |  42 | 43 | As shown above, the logs in Common Log Format are being read by the origin and automatically transformed into a *column/field: value* pairs of records. 44 | 45 | 46 | **Field Type Converter** 47 | 48 |  49 | 50 | Using [Field Type Converter](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Processors/FieldTypeConverter.html#concept_is3_zkp_wq) columns *response* and *timestamp* are being converted from string to integer and datetime data types respectively. 51 | 52 | 53 | **Stream Selector** 54 | 55 |  56 | 57 | Using [Stream Selector](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Processors/StreamSelector.html#concept_tqv_t5r_wq) the records are conditionally routed to different paths based on condition *${str:contains(record:value("/request"), "add_to_cart")}* -- which implies that if the record column *request* (which is the HTTP request URL) contains *add_to_cart*, then we'd like to store those record in Avro format. All other records such as regular page views will be stored in JSON format. 58 | 59 | 60 | **Schema Generator** 61 | 62 |  63 | 64 | For the records that need to be stored in Avro format, [Schema Generator](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Processors/SchemaGenerator.html#concept_rfz_ks3_x1b) will automatically generate schema for those records. 65 | 66 | *Note:* You can also configure Schema Generator to cache schema so it's not calculated for every single record. For details, refer to the [documentation](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Processors/SchemaGenerator.html#concept_rjk_y1q_1bb). 
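Before the Pipeline Run section below, it may help to see the parsing, type-conversion and routing logic written out in plain Python. This is an illustrative sketch on one toy Common Log Format line, not part of the pipeline:

```python
# Plain-Python illustration of the Log parsing, Field Type Converter and Stream
# Selector routing described above, using one toy Common Log Format line.
import re
from datetime import datetime

CLF = re.compile(
    r'(?P<host>\S+) \S+ \S+ \[(?P<timestamp>[^\]]+)\] "(?P<request>[^"]+)" (?P<response>\d{3}) (?P<size>\S+)'
)

line = '127.0.0.1 - - [10/Aug/2020:13:55:36 -0700] "GET /cart/add_to_cart?item=42 HTTP/1.1" 200 2326'
record = CLF.match(line).groupdict()          # origin: parse Common Log Format into fields

# Field Type Converter: string -> int / datetime
record["response"] = int(record["response"])
record["timestamp"] = datetime.strptime(record["timestamp"], "%d/%b/%Y:%H:%M:%S %z")

# Stream Selector: ${str:contains(record:value("/request"), "add_to_cart")}
if "add_to_cart" in record["request"]:
    route = "added-to-cart (written as Avro)"
else:
    route = "page-views (written as JSON)"
print(route, record)
```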
67 | 68 | 69 | Pipeline Run 70 | ------------ 71 | 72 | Provided you've updated the pipeline parameters and there aren't any validation errors, running the pipleline should create two output files. 73 | 74 | **Avro** -- for requests going to carts: [Sample output file](output/added-to-cart-a8a11b6c-d8fc-11ea-9149-abc78c1550f2_19aa3278-119c-4820-b11b-d58637a7b275.avro). 75 | 76 | **JSON** -- for regular page views: [Sample output file](output/page-views-a8a11b6c-d8fc-11ea-9149-abc78c1550f2_69b93e73-ac07-45ab-b89b-90550dc14ad9.json). 77 | 78 | 79 | 80 | -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Parse Web Logs To JSON And Avro/images/img1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Parse Web Logs To JSON And Avro/images/img1.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Parse Web Logs To JSON And Avro/images/img2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Parse Web Logs To JSON And Avro/images/img2.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Parse Web Logs To JSON And Avro/images/img3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Parse Web Logs To JSON And Avro/images/img3.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Parse Web Logs To JSON And Avro/images/img4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Parse Web Logs To JSON And Avro/images/img4.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Parse Web Logs To JSON And Avro/images/img5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Parse Web Logs To JSON And Avro/images/img5.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Parse Web Logs To JSON And Avro/images/img6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Parse Web Logs To JSON And Avro/images/img6.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Parse Web Logs To JSON And Avro/output/added-to-cart-a8a11b6c-d8fc-11ea-9149-abc78c1550f2_19aa3278-119c-4820-b11b-d58637a7b275.avro: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Parse Web Logs To JSON And Avro/output/added-to-cart-a8a11b6c-d8fc-11ea-9149-abc78c1550f2_19aa3278-119c-4820-b11b-d58637a7b275.avro -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/PostgreSQL CDC to Delta Lake/PostgreSQL_CDC_to_DeltaLake.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/PostgreSQL CDC to Delta Lake/PostgreSQL_CDC_to_DeltaLake.zip -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/PostgreSQL CDC to Delta Lake/images/PostgreSQLtoDBDeltaLake_step2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/PostgreSQL CDC to Delta Lake/images/PostgreSQLtoDBDeltaLake_step2.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/PostgreSQL CDC to Delta Lake/images/PostgreSQLtoDBDeltaLake_step2a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/PostgreSQL CDC to Delta Lake/images/PostgreSQLtoDBDeltaLake_step2a.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/PostgreSQL CDC to Delta Lake/images/PostgreSQLtoDBDeltaLake_step3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/PostgreSQL CDC to Delta Lake/images/PostgreSQLtoDBDeltaLake_step3.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/PostgreSQL CDC to Delta Lake/images/PostgreSQLtoDBDeltaLake_step4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/PostgreSQL CDC to Delta Lake/images/PostgreSQLtoDBDeltaLake_step4.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/PostgreSQL CDC to Delta Lake/images/PostgreSQLtoDBDeltaLake_step4a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/PostgreSQL CDC to Delta Lake/images/PostgreSQLtoDBDeltaLake_step4a.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/PostgreSQL CDC to Delta Lake/images/PostgreSQLtoDBDeltaLake_step5.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/PostgreSQL CDC to Delta Lake/images/PostgreSQLtoDBDeltaLake_step5.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/PostgreSQL CDC to Delta Lake/images/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/PostgreSQL CDC to Delta Lake/images/pipeline.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/PostgreSQL CDC to Snowflake/PostgreSQL_CDC_to_Snowflake.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/PostgreSQL CDC to Snowflake/PostgreSQL_CDC_to_Snowflake.zip -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/PostgreSQL CDC to Snowflake/README.md: -------------------------------------------------------------------------------- 1 |  2 | 3 |PostgreSQL CDC to Snowflake
4 | 5 | # PostgreSQL CDC to Snowflake 6 | 7 | **Important:** *These instructions assume you have access to StreamSets Data Collector (v3.15+) and have performed all the prerequisites for PostgreSQL and Snowflake* 8 | 9 | - For help installing [StreamSets Data Collector](https://streamsets.com/products/dataops-platform/data-collector/), see [StreamSets Data Collector Installation](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Installation/Install_title.html). 10 | - For help with PostgreSQL CDC Client prerequisites, see [PostgreSQL CDC Client](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Origins/PostgreSQL.html). 11 | - For help with Snowflake prerequisites, see [Snowflake](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Destinations/Snowflake.html). 12 | 13 | Here is a link to a short video on using this pipeline template: [Video Link](https://www.youtube.com/channel/UC_4K-__dngOCEmoZs7PVZAg) 14 | 15 | ## OVERVIEW 16 | 17 | This pipeline demonstrates how to read change data capture (CDC) data from a PostgreSQL database and replicate the changes to Snowflake. 18 | 19 | **Disclaimer:** *This pipeline is meant to serve as a template for performing PostgreSQL CDC to Snowflake. Some of the parameters, tables and fields may be different for your environment and may need additional customizations. Please consult the StreamSets documentation (linked below) for full information on configuration of each stage used below.* 20 | 21 | ## USING THE TEMPLATE 22 | 23 | NOTE: [Templates](https://streamsets.com/documentation/controlhub/latest/onpremhelp/controlhub/UserGuide/Pipelines/PipelineTemplates.html) are supported in [StreamSets Control Hub](https://streamsets.com/products/dataops-platform/control-hub/). If you do not have Control Hub, you can import the template pipeline in Data Collector but will need to do that each time you want to use the template. 24 | 25 | ## PIPELINE 26 | 27 |  28 | 29 | ## DOCUMENTATION 30 | 31 | [PostgreSQL CDC Client](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Origins/PostgreSQL.html) 32 | 33 | [Jython Evaluator](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Processors/Jython.html) 34 | 35 | [Expression Evaluator](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Processors/Expression.html) 36 | 37 | [Snowflake Destination](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Destinations/Snowflake.html) 38 | 39 | ## STEP-BY-STEP 40 | 41 | ### Step 1: Download the pipeline 42 | 43 | [Click Here](./PostgreSQL_CDC_to_Snowflake.zip?raw=true) to download the pipeline and save it to your drive. 44 | 45 | ### Step 2: Import the pipeline 46 | 47 | Click the down arrow next to the "Create New Pipeline" and select "Import Pipeline From Archive". 48 | 49 |  50 | 51 | Click "Browse" and locate the pipeline file you just downloaded, click "OK", then click "Import" 52 | 53 |  54 | 55 | ### Step 3: Configure the parameters 56 | 57 | Click on the pipeline you just imported to open it and click on the "Parameters" tab and fill in the appropriate information for your environment. 58 | 59 | **Important:** *The pipeline template uses the most common default settings for things like the Snowflake region, staging location, etc. 
All of these are configurable and if you need to change those, you can opt to not use the built-in parameters and choose the appropriate settings yourself. Please refer to the documentation listed in this document for all the available options.* 60 | 61 |  62 | 63 | The following parameters are set up for this pipeline:

| Parameter | Description |
| --- | --- |
| `postgres_schema` | Schema to use. You can enter a schema name or use a regular expression to specify a set of schemas. |
| `postgres_tablename_pattern` | A table name pattern that specifies the tables to track. You can enter a table name or use a regular expression to specify a set of tables. |
| `postgres_jdbc_conn_string` | Connection string to use to connect to the database, using the syntax `jdbc:postgresql://<host>:<port>/<database>`. Note: If you include the JDBC credentials in the connection string, use the user account created for the origin. That user must have the superuser or replication role. |
| `postgres_username` | User name for the JDBC connection. The specified user must have the superuser or replication role. |
| `postgres_password` | Password for the JDBC account. Tip: To secure sensitive information such as user names and passwords, you can use runtime resources or credential stores. |
| `snowflake_account` | Snowflake account name. |
| `snowflake_user` | Snowflake user name. |
| `snowflake_password` | Snowflake password. |
| `snowflake_warehouse` | Snowflake warehouse. |
| `snowflake_database` | Snowflake database. |
| `snowflake_schema` | Snowflake schema. |
| `snowflake_stage_name` | Name of the Snowflake stage used to stage the data. Unless using a Snowflake internal user stage, you create this stage as part of the Snowflake prerequisite tasks. To use a Snowflake internal user stage, enter a tilde (`~`). |
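The PostgreSQL CDC Client depends on logical replication being enabled on the source database (see the PostgreSQL CDC Client prerequisites linked above). Below is a minimal, optional pre-flight check, assuming the `psycopg2` package is installed; connection details are placeholders:

```python
# Optional pre-flight check of the logical-replication settings the
# PostgreSQL CDC Client depends on. Connection details are placeholders.
import psycopg2

conn = psycopg2.connect(
    host="postgres.example.com",  # placeholder
    port=5432,
    dbname="sourcedb",            # placeholder
    user="sdc",                   # placeholder: same account as postgres_username
    password="REPLACE_ME",
)
cur = conn.cursor()

for setting in ("wal_level", "max_replication_slots", "max_wal_senders"):
    cur.execute("SHOW %s" % setting)   # setting names are hard-coded above, not user input
    print(setting, "=", cur.fetchone()[0])

# The connecting role needs the replication (or superuser) role, as noted above.
cur.execute("SELECT rolreplication, rolsuper FROM pg_roles WHERE rolname = current_user")
print("replication, superuser =", cur.fetchone())

cur.close()
conn.close()
```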
SQL Server CDC to Databricks Delta Lake
4 | 5 | # SQL Server CDC to Databricks Delta Lake 6 | 7 | **Important:** *These instructions assume you have access to StreamSets Data Collector (v3.15+) and have performed all the prerequisites for SQL Server and Databricks Delta Lake* 8 | 9 | - For help installing [StreamSets Data Collector](https://streamsets.com/products/dataops-platform/data-collector/), see [StreamSets Data Collector Installation](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Installation/Install_title.html). 10 | - Your SQL Server database is enabled for Change Data Capture (CDC). For help with enabling CDC, see [About CDC - SQL Server](https://docs.microsoft.com/en-us/sql/relational-databases/track-changes/about-change-data-capture-sql-server?view=sql-server-ver15). 11 | - For help with Databricks Delta Lake, see [Delta Lake Prerequisites](https://streamsets.com/documentation/datacollector/latest/help/index.html?contextID=concept_xnp_y5f_dlb). 12 | 13 | ## OVERVIEW 14 | 15 | This pipeline demonstrates how to read change data capture (CDC) data from a SQL Server database and replicate the changes to Databricks Delta Lake. The pipeline assumes the following: 16 | * SQL Server database is enabled for Change Data Capture. 17 | * All tables in the database will be tracked for ingesting changes (Inserts, Updates and Deletes). If you need to track only certain tables, configure the [Capture Instance Name](https://streamsets.com/documentation/datacollector/latest/help/index.html?contextID=concept_sx3_d11_s1b) accordingly in the origin. 18 | * Each source table will be mapped to its corresponding table in Delta Lake. If the table doesn't exist in Delta Lake, it will be auto-created. 19 | * The SQL Server CDC client generates records for multiple transaction types. The transaction type is captured in the record header as an attribute called [sdc.operation.type](https://streamsets.com/documentation/datacollector/latest/help/index.html?contextID=concept_yqg_sts_r1b): 20 | * 1 for Insert 21 | * 2 for Delete 22 | * 3 for Update, including updates captured after the update operation 23 | * 5 for unsupported operations, including updates captured before the update operation. 24 |

The following parameters are set up for this pipeline:

| Parameter | Description |
| --- | --- |
| `sqlserver_jdbc_url` | JDBC URL to connect to the SQL Server database. |
| `sqlserver_username` | SQL Server username. |
| `sqlserver_password` | SQL Server password. Tip: To secure sensitive information such as user tokens and passwords, you can use runtime resources or credential stores. |
| `databricks_jdbc_url` | JDBC URL used to connect to the Databricks cluster. |
| `databricks_token` | Personal access token used to connect to the Databricks cluster. |
| `deltalake_database` | Databricks database name for all tables. |
| `deltalake_s3_bucket` | S3 bucket for staging data before invoking the COPY/MERGE command. |
| `s3_access_key` | AWS access key. Leave empty if you enable IAM Roles. |
| `s3_secret_key` | AWS secret key. Leave empty if you enable IAM Roles. |
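Enabling CDC on the source database and tables is a SQL Server administration step (see the Microsoft documentation linked above); the pipeline does not do it for you. As a rough sketch only, here is what that step can look like when driven from Python with `pyodbc` (driver name, connection string, schema and table are placeholders; a DBA will often run the equivalent T-SQL directly):

```python
# Rough illustration of enabling SQL Server CDC on a database and one table,
# driven through pyodbc. All names and connection details are placeholders.
import pyodbc

conn = pyodbc.connect(
    "DRIVER={ODBC Driver 17 for SQL Server};"              # placeholder driver name
    "SERVER=sqlserver.example.com;DATABASE=sourcedb;UID=sa;PWD=REPLACE_ME"
)
conn.autocommit = True
cur = conn.cursor()

# Enable CDC at the database level, then for a specific table.
cur.execute("EXEC sys.sp_cdc_enable_db")
cur.execute(
    "EXEC sys.sp_cdc_enable_table "
    "@source_schema = N'dbo', "
    "@source_name   = N'customers', "   # placeholder table
    "@role_name     = NULL"
)

# Verify which tables are now tracked by CDC.
cur.execute("SELECT name, is_tracked_by_cdc FROM sys.tables WHERE is_tracked_by_cdc = 1")
for row in cur.fetchall():
    print(row.name)
```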
SQL Server CDC to Snowflake
4 | 5 | # SQL Server CDC to Snowflake 6 | 7 | **Important:** *These instructions assume you have access to StreamSets Data Collector (v3.15+) and have performed all the prerequisites for SQL Server and Snowflake* 8 | 9 | - For help installing [StreamSets Data Collector](https://streamsets.com/products/dataops-platform/data-collector/), see [StreamSets Data Collector Installation](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Installation/Install_title.html). 10 | - Your SQL Server database is enabled for Change Data Capture (CDC). For help with enabling CDC, see [About CDC - SQL Server](https://docs.microsoft.com/en-us/sql/relational-databases/track-changes/about-change-data-capture-sql-server?view=sql-server-ver15). 11 | - For help with Snowflake prerequisites, see [Snowflake](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Destinations/Snowflake.html). 12 | 13 | ## OVERVIEW 14 | 15 | This pipeline demonstrates how to read change data capture (CDC) data from a SQL Server database and replicate the changes to Snowflake. The pipeline assumes the following: 16 | * SQL Server database is enabled for Change Data Capture. 17 | * All tables in the database will be tracked for ingesting changes (Inserts, Updates and Deletes). If you need to track only certain tables, configure the [Capture Instance Name](https://streamsets.com/documentation/datacollector/latest/help/index.html?contextID=concept_sx3_d11_s1b) accordingly in the origin. 18 | * Each source table will be mapped to its corresponding table in Snowflake. If the table doesn't exist in Snowflake, it will be auto-created. 19 | * The SQL Server CDC client generates records for multiple transaction types. The transaction type is captured in the record header as an attribute called [sdc.operation.type](https://streamsets.com/documentation/datacollector/latest/help/index.html?contextID=concept_yqg_sts_r1b): 20 | * 1 for Insert 21 | * 2 for Delete 22 | * 3 for Update, including updates captured after the update operation 23 | * 5 for unsupported operations, including updates captured before the update operation. 24 |

The following parameters are set up for this pipeline:

| Parameter | Description |
| --- | --- |
| `sqlserver_jdbc_url` | JDBC URL to connect to the SQL Server database. |
| `sqlserver_username` | SQL Server username. |
| `sqlserver_password` | SQL Server password. Tip: To secure sensitive information such as user tokens and passwords, you can use runtime resources or credential stores. |
| `snowflake_account` | Snowflake account name. |
| `snowflake_user` | Snowflake user name. |
| `snowflake_pwd` | Snowflake password. |
| `snowflake_wh` | Snowflake warehouse. |
| `snowflake_db` | Snowflake database. |
| `snowflake_schema` | Snowflake schema. |
| `snowflake_stage` | Name of the Snowflake stage used to stage the data. Unless using a Snowflake internal user stage, you create this stage as part of the Snowflake prerequisite tasks. To use a Snowflake internal user stage, enter a tilde (`~`). |
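The Snowflake warehouse, database, schema and stage referenced by the parameters above must already exist on the Snowflake side. If you want to verify them outside of Data Collector, a minimal sketch using the `snowflake-connector-python` package is shown below (all values are placeholders):

```python
# Optional check that the Snowflake objects referenced by the pipeline parameters
# exist. Assumes `pip install snowflake-connector-python`; values are placeholders.
import snowflake.connector

conn = snowflake.connector.connect(
    account="xy12345",            # placeholder: snowflake_account
    user="SDC_USER",              # placeholder: snowflake_user
    password="REPLACE_ME",        # placeholder: snowflake_pwd
    warehouse="SDC_WH",           # placeholder: snowflake_wh
    database="SDC_DB",            # placeholder: snowflake_db
    schema="PUBLIC",              # placeholder: snowflake_schema
)
cur = conn.cursor()

# List the stages visible in the configured database/schema; the name used for
# the snowflake_stage parameter should appear here (unless you use the internal
# user stage, i.e. the tilde).
cur.execute("SHOW STAGES")
for row in cur.fetchall():
    print(row[1])                 # stage name column

cur.close()
conn.close()
```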
Salesforce CDC to Snowflake
4 | 5 | # Salesforce CDC to Snowflake 6 | 7 | **Important:** *These instructions assume you have access to StreamSets Data Collector (v3.15+) and have performed all the prerequisites for Salesforce and Snowflake* 8 | 9 | - For help installing [StreamSets Data Collector](https://streamsets.com/products/dataops-platform/data-collector/), see [StreamSets Data Collector Installation](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Installation/Install_title.html). 10 | - For help with Salesforce prerequisites, see [Salesforce](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Origins/Salesforce.html). 11 | - For help with Snowflake prerequisites, see [Snowflake](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Destinations/Snowflake.html). 12 | 13 | For more information, see [Loading Data into Databricks Delta Lake](https://streamsets.com/documentation/datacollector/latest/help/index.html?contextID=concept_a5b_wvk_ckb) in [StreamSets Data Collector documentation](https://streamsets.com/documentation/datacollector/latest/help/). 14 | 15 | Here is a link to a short video on using this pipeline template: [Video Link](https://www.youtube.com/channel/UC_4K-__dngOCEmoZs7PVZAg) 16 | 17 | ## OVERVIEW 18 | 19 | This pipeline demonstrates how to read change data capture (CDC) data from Salesforce and replicate the changes to Snowflake. 20 | 21 | **Disclaimer:** *This pipeline is meant to serve as a template for performing Salesforce CDC to Snowflake. Some of the parameters, tables and fields may be different for your environment and may need additional customizations. Please consult the StreamSets documentation (linked below) for full information on configuration of each stage used below. For example, this pipeline was used on the 'Opportunity' table from Salesforce and writes to a Snowflake table named 'Opportunity'. Using other tables may require additional configurations.* 22 | 23 | ## USING THE TEMPLATE 24 | 25 | NOTE: [Templates](https://streamsets.com/documentation/controlhub/latest/onpremhelp/controlhub/UserGuide/Pipelines/PipelineTemplates.html) are supported in [StreamSets Control Hub](https://streamsets.com/products/dataops-platform/control-hub/). If you do not have Control Hub, you can import the template pipeline in Data Collector but will need to do that each time you want to use the template.
26 | 27 | ## PIPELINE 28 | 29 |  30 | 31 | ## DOCUMENTATION 32 | 33 | [Salesforce Origin](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Origins/Salesforce.html) 34 | 35 | [Expression Evaluator](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Processors/Expression.html) 36 | 37 | [Field Pivoter](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Processors/ListPivoter.html) 38 | 39 | [StreamSelector](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Processors/StreamSelector.html) 40 | 41 | [Field Type Converter](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Processors/FieldTypeConverter.html) 42 | 43 | [Salesforce Lookup](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Processors/SalesforceLookup.html) 44 | 45 | [Snowflake Destination](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Destinations/Snowflake.html) 46 | 47 | ## STEP-BY-STEP 48 | 49 | ### Step 1: Download the pipeline 50 | 51 | [Click Here](./Salesforce_CDC_to_Snowflake.zip?raw=true) to download the pipeline and save it to your drive. 52 | 53 | ### Step 2: Import the pipeline 54 | 55 | Click the down arrow next to the "Create New Pipeline" and select "Import Pipeline From Archive". 56 | 57 |  58 | 59 | Click "Browse" and locate the pipeline file you just downloaded, click "OK", then click "Import" 60 | 61 |  62 | 63 | ### Step 3: Configure the parameters 64 | 65 | Click on the pipeline you just imported to open it and click on the "Parameters" tab and fill in the appropriate information for your environment. 66 | 67 | **Important:** *The pipeline template uses the most common default settings for things like the region, staging location, etc. All of these are configurable and if you need to change those, you can opt to not use the built-in parameters and choose the appropriate settings yourself. Please refer to the documentation listed in this document for all the available options.* 68 | 69 |  70 | 71 | The following parameters are set up for this pipeline: 72 |salesforce_username
| Parameter | Description |
| --------------- | --------------- |
| salesforce_username | Salesforce username in the following email format: `<text>@<text>.com`. |
| salesforce_password | Salesforce password. If the machine running Data Collector is outside the trusted IP range configured in your Salesforce environment, you must generate a security token and then set this property to the password followed by the security token. Tip: to secure sensitive information such as user names and passwords, you can use runtime resources or credential stores. |
| salesforce_auth_endpoint | Salesforce SOAP API authentication endpoint: `login.salesforce.com` for the production/Developer Edition environment, or `test.salesforce.com` for a sandbox. Default is `login.salesforce.com`. |
| snowflake_account | Snowflake account name. |
| snowflake_user | Snowflake user name. |
| snowflake_password | Snowflake password. |
| snowflake_warehouse | Snowflake warehouse. |
| snowflake_database | Snowflake database. |
| snowflake_schema | Snowflake schema. |
| snowflake_stage_name | Name of the Snowflake stage used to stage the data. Unless using a Snowflake internal user stage, you create this stage as part of the Snowflake prerequisite tasks. To use a Snowflake internal user stage, enter a tilde (`~`). |
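As an illustration only, a filled-in set of these parameters might look like the following. Every value below is a placeholder for your own environment, not a default shipped with the template; the key/value form simply mirrors the parameter examples used elsewhere in this repository.

```
[
  { "key": "salesforce_username", "value": "integration.user@example.com" },
  { "key": "salesforce_password", "value": "<password><security token>" },
  { "key": "salesforce_auth_endpoint", "value": "login.salesforce.com" },
  { "key": "snowflake_account", "value": "<your_snowflake_account>" },
  { "key": "snowflake_user", "value": "<your_snowflake_user>" },
  { "key": "snowflake_password", "value": "<your_snowflake_password>" },
  { "key": "snowflake_warehouse", "value": "<your_warehouse>" },
  { "key": "snowflake_database", "value": "<your_database>" },
  { "key": "snowflake_schema", "value": "<your_schema>" },
  { "key": "snowflake_stage_name", "value": "<your_stage_or_~>" }
]
```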
Working with XML (Basic)
4 | 5 | **Important:** *These instructions assume you have access to StreamSets Data Collector (v3.16+) and have performed all the prerequisites* 6 | 7 | - For help installing [StreamSets Data Collector](https://streamsets.com/products/dataops-platform/data-collector/), see [StreamSets Data Collector Installation](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Installation/Install_title.html). 8 | 9 | ## OVERVIEW 10 | This pipeline demonstrates how to read and process XML data in Data Collector. XML is a very common data format in verticals like Healthcare and Finance, which have to follow industry-standard data formats that are typically XML-based. 11 | - This shows an example of an XML source that can be split into records at the source itself by specifying the delimiter (see the sample XML sketch at the end of this README). 12 | - The Field Mapper processor is a ‘swiss army knife’. It allows operating on multiple fields by name, value or data type in any hierarchical structure with a single expression. 13 | - The example shows the most common requirement for XML in SDC, i.e. removing ‘attr’ and the prefix value from all field names. 14 | 15 | ## PIPELINE 16 | 17 |  18 | 19 | ## DOCUMENTATION 20 | 21 | [Field Mapper](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Processors/FieldMapper.html) 22 | 23 | ## STEP-BY-STEP 24 | 25 | ### Step 1: Download the pipeline 26 | 27 | [Click Here](./Working_with_XML.zip?raw=true) to download the pipeline and save it to your drive. 28 | 29 | ### Step 2: Import the pipeline 30 | 31 | Click the down arrow next to the "Create New Pipeline" button and select "Import Pipeline". 32 | 33 |  34 | 35 | Click "Browse" and locate the pipeline file you just downloaded, then click "Import". 36 | 37 | ### Step 3: Preview the pipeline 38 | 39 | Click on the pipeline you just imported to open it and select the Preview icon. 40 | 41 |  42 | 43 | Leave all default options in the preview configuration and click "Run Preview". 44 | 45 |  46 | 47 | Play around with the [Field Mapper](https://streamsets.com/documentation/datacollector/latest/help/datacollector/UserGuide/Processors/FieldMapper.html) processor to see all the different data processing operations it can handle.
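To make the idea of splitting XML into records concrete, here is a minimal, illustrative sketch of the kind of XML this pipeline can process. The element names and the `b:` namespace prefix are assumptions for illustration, not the exact sample data bundled in the archive: with the origin's delimiter element set to `book`, each `<book>` element becomes one record, and attributes such as `id` surface as `attr|`-prefixed fields that the Field Mapper expression can then rename.

```
<!-- Illustrative input only (not the bundled sample data). With the delimiter
     element set to "book", each <book> element below becomes one record, and
     the id attribute appears as an attr|id field until the Field Mapper renames it. -->
<books xmlns:b="http://example.com/books">
  <book id="101">
    <b:title>Data Pipelines in Practice</b:title>
    <b:price>42.50</b:price>
  </book>
  <book id="102">
    <b:title>Streaming Basics</b:title>
    <b:price>18.00</b:price>
  </book>
</books>
```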
-------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Working with XML (Basic)/Working_with_XML.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Working with XML (Basic)/Working_with_XML.zip -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Working with XML (Basic)/images/import_from_archive.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Working with XML (Basic)/images/import_from_archive.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Working with XML (Basic)/images/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Working with XML (Basic)/images/pipeline.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Working with XML (Basic)/images/preview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Working with XML (Basic)/images/preview.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Working with XML (Basic)/images/preview_data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Working with XML (Basic)/images/preview_data.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Working with XML (Basic)/images/select_downloaded_archive.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Working with XML (Basic)/images/select_downloaded_archive.png -------------------------------------------------------------------------------- /datacollector/sample-pipelines/pipelines/Working with XML (Basic)/images/start_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/datacollector/sample-pipelines/pipelines/Working with XML (Basic)/images/start_pipeline.png -------------------------------------------------------------------------------- /datacollector/sample-scripts/README.md: -------------------------------------------------------------------------------- 1 |  2 | 3 |Data Collector: Sample Scripts
4 | 5 | This folder contains sample scripts for StreamSets Data Collector. 6 | 7 | The following best sample scripts are currently available: 8 | | Name | Description | 9 | | --------------- | --------------- | 10 | 11 | **COMING SOON** 12 | 13 | # Help 14 | 15 | For any queries, questions, comments related to these pipelines reach out on any of these channels: 16 | 17 | [Chat on Slack](https://streamsetters-slack.herokuapp.com/) 18 | 19 | [User Group](https://groups.google.com/a/streamsets.com/d/forum/sdc-user) 20 | 21 | [Ask StreamSets](https://ask.streamsets.com/questions/) 22 | -------------------------------------------------------------------------------- /images/Full Color Transparent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/images/Full Color Transparent.png -------------------------------------------------------------------------------- /transformer/README.md: -------------------------------------------------------------------------------- 1 |  2 | 3 |Transformer
4 | 5 | This repository contains assets that will help you get started with StreamSets Transformer. 6 | 7 | The following folders are currently available: 8 | 9 | | Asset | Description | 10 | | --------------- | --------------- | 11 | | [Best Practices](./best-practices) | Contains best practices and configurations | 12 | | [Sample Pipelines](./sample-pipelines) | Contains sample pipelines for Transformer | 13 | | [Sample Scripts](./sample-scripts) | Contain sample scripts | 14 | 15 | 16 | # Help 17 | 18 | For any queries, questions, comments related to these pipelines reach out on any of these channels: 19 | 20 | [Chat on Slack](https://streamsetters-slack.herokuapp.com/) 21 | 22 | [User Group](https://groups.google.com/a/streamsets.com/d/forum/sdc-user) 23 | 24 | [Ask StreamSets](https://ask.streamsets.com/questions/) 25 | -------------------------------------------------------------------------------- /transformer/best-practices/README.md: -------------------------------------------------------------------------------- 1 |  2 | 3 |Transformer: Best Practices
4 | 5 | This folder contains Best Practices and Configurations for StreamSets Transformer. 6 | 7 | The following best practices/configurations are currently available: 8 | | Name | Description | 9 | | --------------- | --------------- | 10 | 11 | **COMING SOON** 12 | 13 | # Help 14 | 15 | For any queries, questions, comments related to these pipelines reach out on any of these channels: 16 | 17 | [Chat on Slack](https://streamsetters-slack.herokuapp.com/) 18 | 19 | [User Group](https://groups.google.com/a/streamsets.com/d/forum/sdc-user) 20 | 21 | [Ask StreamSets](https://ask.streamsets.com/questions/) 22 | -------------------------------------------------------------------------------- /transformer/sample-pipelines/README.md: -------------------------------------------------------------------------------- 1 |  2 | 3 |Transformer: Sample Pipelines
4 | 5 | This folder contains pipeline templates and samples for Transformer. 6 | 7 | The following templates/samples are currently available: 8 | | Name | Description | 9 | | --------------- | --------------- | 10 | | [Clickstream Analysis on Amazon EMR, Amazon Redshift and Elasticsearch](./pipelines/Clickstream%20Analysis%20on%20Amazon%20EMR,%20Amazon%20Redshift%20and%20Elasticsearch) | Ingest raw clickstream logs from Amazon S3, perform aggregations and store those on Amazon Redshift and ElasticSearch for analysis | 11 | | [ML - Train NLP Model in PySpark](./pipelines/ML%20-%20Train%20NLP%20Model%20in%20PySpark) | Train a Spark MLlib Logistic Regression model for Natural Language Processing (NLP) using PySpark processor | 12 | | [ML - Train Random Forest Regression Model in Scala](./pipelines/ML%20-%20Train%20Random%20Forest%20Regression%20Model%20in%20Scala) | Train a Spark MLlib Random Forest Regression model using Scala processor | 13 | | [Slowly Changing Dimension - Type 2](./pipelines/Slowly%20Changing%20Dimensions%20-%20Type%202) | Slowly Changing Dimension - Type 2 | 14 | | [Spark ETL To Derive Sales Insights on Azure HDInsight And Power BI](./pipelines/Spark%20ETL%20To%20Derive%20Sales%20Insights%20on%20Azure%20HDInsight%20And%20Power%20BI) | Extract raw data and transform it (cleanse and curate) before storing it in multiple destinations for efficient downstream analysis 15 | | [Tx Retail Inventory - Join Agg Repartition](./pipelines/Tx%20Retail%20Inventory%20-%20Join%20Agg%20Repartition) | Example using Join, Aggregation and Repartition | 16 | | [Tx Scala UDF](./pipelines/Tx%20Scala%20UDF) | Example using Scala to create, register and use a User-Defined Function | 17 | | [Tx Slowly Changing Dimensions - Type 1](./pipelines/Tx%20Slowly%20Changing%20Dimension%20-%20Type%201) | Slowly Changing Dimension (SCD) - Type 1 | 18 | 19 | 20 | # Help 21 | 22 | For any queries, questions, comments related to these pipelines reach out on any of these channels: 23 | 24 | [Chat on Slack](https://streamsetters-slack.herokuapp.com/) 25 | 26 | [User Group](https://groups.google.com/a/streamsets.com/d/forum/sdc-user) 27 | 28 | [Ask StreamSets](https://ask.streamsets.com/questions/) 29 | -------------------------------------------------------------------------------- /transformer/sample-pipelines/pipelines/Calculate Geographic Distance - UDF/Calculate_distance_between_airports.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/transformer/sample-pipelines/pipelines/Calculate Geographic Distance - UDF/Calculate_distance_between_airports.zip -------------------------------------------------------------------------------- /transformer/sample-pipelines/pipelines/Calculate Geographic Distance - UDF/README.md: -------------------------------------------------------------------------------- 1 |  2 | 3 |Calculate Geographic Distance: UDF
4 | 5 | **Important:** *These instructions assume you have access to StreamSets Transformer* 6 | 7 | - For help installing [StreamSets Transformer](https://streamsets.com/products/dataops-platform/transformer-etl/), see [StreamSets Transformer Installation](https://streamsets.com/documentation/transformer/latest/help/transformer/Installation/Installation-Title.html). 8 | 9 | ## OVERVIEW 10 | 11 | This pipeline demonstrates how to create and register a User-Defined Function in Scala using [StreamSets Transformer](https://streamsets.com/products/dataops-platform/transformer-etl/). The UDF created in this example is a Haversine function that will return the geographic distance between 2 locations. 12 | 13 | The source data for this pipeline is a table in SQL Server. Use this [SQL File](./airport.sql?raw=true "Airport Data") to create a copy of the table. 14 | 15 | **Disclaimer:** *This pipeline is meant to serve as a template for creating, registering and using a User-Defined Function in Scala* 16 | 17 | ## PIPELINE 18 | 19 |  20 | 21 | 22 | ## STEP-BY-STEP 23 | 24 | ### Step 1: Download the pipeline 25 | 26 | [Click Here](./Calculate_distance_between_airports.zip?raw=true) to download the pipeline and save it to your drive. 27 | 28 | ### Step 2: Import the pipeline 29 | 30 | Click the down arrow next to the "Create New Pipeline" and select "Import Pipeline From Archive". 31 | 32 | 33 | Click "Browse" and locate the pipeline file you just downloaded, click "OK", then click "Import" 34 | 35 | ### Step 3: Configure the parameters 36 | 37 | Click on the pipeline you just imported to open it and click on the "Parameters" tab and fill in the appropriate information for your environment. 38 |  39 | 40 | #### Note: The User-Defined function is created under Pipeline -> Advanced tab. You'll see how the function is defined and registered. 41 | 42 | ### Step 4: Run the pipeline 43 | 44 | Click the "START" button to run the pipeline. 45 | 46 |  47 | 48 | The pipeline will automatically complete and you'll notice a new table created in your database with the output results. 
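The registered function itself lives under the pipeline's Pipeline -> Advanced tab (see the note in Step 3). As a rough, hedged sketch of what such a Haversine UDF can look like in Scala -- assuming kilometres as the unit and using illustrative names throughout, not the exact code bundled with the pipeline:

```
// Sketch only -- shows the shape of a Haversine UDF registered so that
// downstream SQL expressions can call it by name.
import org.apache.spark.sql.SparkSession

object HaversineSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("haversine-sketch").getOrCreate()

  // Great-circle distance in kilometres between two latitude/longitude pairs.
  def haversineKm(lat1: Double, lon1: Double, lat2: Double, lon2: Double): Double = {
    val earthRadiusKm = 6371.0
    val dLat = math.toRadians(lat2 - lat1)
    val dLon = math.toRadians(lon2 - lon1)
    val a = math.pow(math.sin(dLat / 2), 2) +
      math.cos(math.toRadians(lat1)) * math.cos(math.toRadians(lat2)) *
        math.pow(math.sin(dLon / 2), 2)
    2 * earthRadiusKm * math.asin(math.sqrt(a))
  }

  // Register under an illustrative name so SQL can use haversine_km(lat1, lon1, lat2, lon2).
  spark.udf.register("haversine_km", haversineKm _)
}
```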
49 | 50 |  -------------------------------------------------------------------------------- /transformer/sample-pipelines/pipelines/Calculate Geographic Distance - UDF/images/Transformer_Pipeline_Monitoring.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/transformer/sample-pipelines/pipelines/Calculate Geographic Distance - UDF/images/Transformer_Pipeline_Monitoring.png -------------------------------------------------------------------------------- /transformer/sample-pipelines/pipelines/Calculate Geographic Distance - UDF/images/Transformer_distance_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/transformer/sample-pipelines/pipelines/Calculate Geographic Distance - UDF/images/Transformer_distance_table.png -------------------------------------------------------------------------------- /transformer/sample-pipelines/pipelines/Calculate Geographic Distance - UDF/images/Transformer_parameters.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/transformer/sample-pipelines/pipelines/Calculate Geographic Distance - UDF/images/Transformer_parameters.png -------------------------------------------------------------------------------- /transformer/sample-pipelines/pipelines/Calculate Geographic Distance - UDF/images/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/transformer/sample-pipelines/pipelines/Calculate Geographic Distance - UDF/images/pipeline.png -------------------------------------------------------------------------------- /transformer/sample-pipelines/pipelines/Clickstream Analysis on Amazon EMR, Amazon Redshift and Elasticsearch/README.md: -------------------------------------------------------------------------------- 1 | Clickstream Analysis on Amazon EMR, Amazon Redshift and Elasticsearch 2 | ======================================================================= 3 | 4 | This StreamSets Transformer pipeline runs on Apache Spark deployed on an Amazon EMR cluster and it's designed to perform clickstream analysis. It ingests raw clickstream logs from Amazon S3, perform aggregations and store those on Amazon Redshift for analysis and the pipeline also sends raw logs to Elasticsearch for querying and quick visualizations. 5 | 6 | Prerequisites 7 | --------------------- 8 | 9 | * [StreamSets Transformer](https://streamsets.com/products/dataops-platform/transformer-etl/) 3.14.0 or higher. You can [deploy Transformer](https://streamsets.com/products/dataops-platform/transformer-etl/download/) on your choice of **cloud provider** or **download** it for local development. 
10 | * Access to Amazon EMR with Spark cluster 11 | * Ensure the [prerequisites](https://streamsets.com/documentation/transformer/latest/help/transformer/Clusters/EMR.html#concept_yjs_gzt_vkb) for Amazon EMR are satisfied 12 | * Access to Amazon S3 13 | * Access to Amazon Redshift cluster 14 | 15 | Setup 16 | --------------------- 17 | 18 | * [Download and import the pipeline](ClickstreamLogsToESRedshiftEMRfe856fed-ca84-4689-88d1-432f6ae8e6cd.json) into your instance of Transformer 19 | * [Download the sample dataset](Schematic_Log.csv) and upload it to your Amazon S3 bucket 20 | * After importing the pipeline into your environment and before running the pipeline, update the following pipeline parameters: 21 | 22 | ``` 23 | [ 24 | { 25 | "key": "EMR_STAGING", 26 | "value": "" 27 | }, 28 | { 29 | "key": "EMR_CLUSTER_ID", 30 | "value": "" 31 | }, 32 | { 33 | "key": "AWS_DATA_BUCKET", 34 | "value": "" 35 | }, 36 | { 37 | "key": "ES_URL", 38 | "value": "" 39 | }, 40 | { 41 | "key": "REDSHIFT_ENDPOINT", 42 | "value": "" 43 | }, 44 | { 45 | "key": "AWS_TEMP_BUCKET", 46 | "value": "" 47 | }, 48 | { 49 | "key": "ES_INDEX", 50 | "value": "" 51 | }, 52 | { 53 | "key": "REDSHIFT_USER", 54 | "value": "" 55 | }, 56 | { 57 | "key": "REDSHIFT_SCHEMA", 58 | "value": "" 59 | } 60 | ] 61 | 62 | ``` 63 | 64 | These pipeline parameter are used by various stages in the pipleine, such as, Amazon S3 buckets, Amazon Redshift endpoint and credentials, Elasticsearch URL and index name, etc. 65 | 66 | Technical Details & Demo Video 67 | ------------------------------ 68 | 69 | For techincal info, detailed explanation of this use case and to watch demo video, read this [blog](https://bit.ly/EMRRedshiftES). 70 | -------------------------------------------------------------------------------- /transformer/sample-pipelines/pipelines/ML - Train NLP Model in PySpark/README.md: -------------------------------------------------------------------------------- 1 | Train Natural Language Processing Model using PySpark 2 | ======================================================================= 3 | 4 | This StreamSets Transformer pipeline runs on Databricks cluster. It is designed to train a Spark MLlib Logistic Regression model for Natural Language Processing (NLP) using PySpark processor. The model is trained to classify given tweet as a *positive* or *negative* sentiment. 5 | 6 | Prerequisites 7 | --------------------- 8 | 9 | * [StreamSets Transformer](https://streamsets.com/products/dataops-platform/transformer-etl/) 3.14.0 or higher. You can [deploy Transformer](https://streamsets.com/products/dataops-platform/transformer-etl/download/) on your choice of **cloud provider** or **download** it for local development. 
10 | * Access to Databricks cluster 11 | * Ensure the PySpark [prerequisites](https://streamsets.com/documentation/transformer/latest/help/transformer/Processors/PySpark.html#concept_ok3_bd2_qkb) are satisfied 12 | 13 | Setup 14 | --------------------- 15 | 16 | * [Download and import the pipeline](TrainNLPModelPySparkDB787ba4f1-dcb1-4d53-ab61-d80569daac14.json) into your instance of Transformer 17 | * [Download the sample datasets](dataset) 18 | * After importing the pipeline into your environment and before running the pipeline, update the following pipeline parameters: 19 | 20 | ``` 21 | [ 22 | { 23 | "key": "DB_URL", 24 | "value": "" 25 | }, 26 | { 27 | "key": "DB_TOKEN", 28 | "value": "" 29 | }, 30 | { 31 | "key": "DB_CLUSTER_ID", 32 | "value": "" 33 | }, 34 | { 35 | "key": "POSITIVE_TWEETS_LOCATION", 36 | "value": "" 37 | }, 38 | { 39 | "key": "NEGATIVE_TWEETS_LOCATION", 40 | "value": "" 41 | }, 42 | { 43 | "key": "OUTPUT_FILE_LOCATION", 44 | "value": "" 45 | } 46 | ] 47 | ``` 48 | 49 | These pipeline parameters refer to the Databricks URL, Databricks token, Databricks cluster Id and locations for source datasets as well as output file location. 50 | 51 | Technical Details 52 | ------------------------------ 53 | 54 | For techincal information and detailed explanation of this use case, read this [blog](https://bit.ly/TrainNLPModel). 55 | -------------------------------------------------------------------------------- /transformer/sample-pipelines/pipelines/ML - Train Random Forest Regression Model in Scala/README.md: -------------------------------------------------------------------------------- 1 | Train Random Forest Regression Model in Scala 2 | ======================================================================= 3 | 4 | This StreamSets Transformer pipeline runs on Databricks cluster. It is designed to train a Spark MLlib Random Forest Regression model using Scala processor. The model is trained to predict sales (== number of units sold) based on advertising budgets allocated to TV, Radio and Newspapers media channels. 5 | 6 | Prerequisites 7 | --------------------- 8 | 9 | * [StreamSets Transformer](https://streamsets.com/products/dataops-platform/transformer-etl/) 3.14.0 or higher. You can [deploy Transformer](https://streamsets.com/products/dataops-platform/transformer-etl/download/) on your choice of **cloud provider** or **download** it for local development. 10 | * Access to Databricks cluster 11 | 12 | Setup 13 | --------------------- 14 | 15 | * [Download and import the pipeline](TrainRandomForestRegressionModelScalaDB28582ef8-fecf-4fa8-94d1-1a58d803153d.json) into your instance of Transformer 16 | * [Download the sample dataset](dataset) 17 | * After importing the pipeline into your environment and before running the pipeline, update the following pipeline parameters: 18 | 19 | ``` 20 | [ 21 | { 22 | "key": "DB_URL", 23 | "value": "" 24 | }, 25 | { 26 | "key": "DB_TOKEN", 27 | "value": "" 28 | }, 29 | { 30 | "key": "DB_CLUSTER_ID", 31 | "value": "" 32 | }, 33 | { 34 | "key": "STAGING", 35 | "value": "" 36 | }, 37 | { 38 | "key": "SOURCE_DATA_LOCATION", 39 | "value": "" 40 | } 41 | { 42 | "key": "OUTPUT_DATA_LOCATION", 43 | "value": "" 44 | } 45 | ] 46 | ``` 47 | 48 | These pipeline parameters refer to the Databricks URL, Databricks token, Databricks cluster Id and location of source dataset as well as output file location. 
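The Scala processor's training logic is not reproduced in this README. As a hedged sketch of the kind of Spark MLlib code it runs -- the column positions follow the bundled Advertising_training.csv header (`_c1`..`_c3` for the TV, Radio and Newspaper budgets, `_c4` for sales), while the paths, hyperparameters and object name are purely illustrative assumptions:

```
// Hedged sketch, not the pipeline's actual Scala processor code.
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.regression.RandomForestRegressor

object TrainRfSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("rf-sketch").getOrCreate()

  // Read the sample training data shipped with this pipeline.
  val training = spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("dataset/Advertising_training.csv")

  val assembler = new VectorAssembler()
    .setInputCols(Array("_c1", "_c2", "_c3"))   // TV, Radio, Newspaper budgets
    .setOutputCol("features")

  val rf = new RandomForestRegressor()
    .setLabelCol("_c4")                          // units sold
    .setFeaturesCol("features")
    .setNumTrees(20)                             // illustrative hyperparameter

  val model = new Pipeline().setStages(Array(assembler, rf)).fit(training)
  model.write.overwrite().save("/tmp/rf_sales_model") // illustrative output location
}
```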
49 | 50 | Technical Details 51 | ------------------------------ 52 | 53 | For techincal information and detailed explanation of this use case, read this [blog](https://bit.ly/TrainRFModel). 54 | -------------------------------------------------------------------------------- /transformer/sample-pipelines/pipelines/ML - Train Random Forest Regression Model in Scala/dataset/Advertising_training.csv: -------------------------------------------------------------------------------- 1 | "_c0","_c1","_c2","_c3","_c4" 2 | "1",230.1,37.8,69.2,22.1 3 | "2",44.5,39.3,45.1,10.4 4 | "3",17.2,45.9,69.3,9.3 5 | "4",151.5,41.3,58.5,18.5 6 | "5",180.8,10.8,58.4,12.9 7 | "6",8.7,48.9,75,7.2 8 | "7",57.5,32.8,23.5,11.8 9 | "8",120.2,19.6,11.6,13.2 10 | "9",8.6,2.1,1,4.8 11 | "10",199.8,2.6,21.2,10.6 12 | "11",66.1,5.8,24.2,8.6 13 | "12",214.7,24,4,17.4 14 | "13",23.8,35.1,65.9,9.2 15 | "14",97.5,7.6,7.2,9.7 16 | "15",204.1,32.9,46,19 17 | "16",195.4,47.7,52.9,22.4 18 | "17",67.8,36.6,114,12.5 19 | "18",281.4,39.6,55.8,24.4 20 | "19",69.2,20.5,18.3,11.3 21 | "20",147.3,23.9,19.1,14.6 22 | "21",218.4,27.7,53.4,18 23 | "22",237.4,5.1,23.5,12.5 24 | "23",13.2,15.9,49.6,5.6 25 | "24",228.3,16.9,26.2,15.5 26 | "25",62.3,12.6,18.3,9.7 27 | "26",262.9,3.5,19.5,12 28 | "27",142.9,29.3,12.6,15 29 | "28",240.1,16.7,22.9,15.9 30 | "29",248.8,27.1,22.9,18.9 31 | "30",70.6,16,40.8,10.5 32 | "31",292.9,28.3,43.2,21.4 33 | "32",112.9,17.4,38.6,11.9 34 | "33",97.2,1.5,30,9.6 35 | "34",265.6,20,0.3,17.4 36 | "35",95.7,1.4,7.4,9.5 37 | "36",290.7,4.1,8.5,12.8 38 | "37",266.9,43.8,5,25.4 39 | "38",74.7,49.4,45.7,14.7 40 | "39",43.1,26.7,35.1,10.1 41 | "40",228,37.7,32,21.5 42 | "41",202.5,22.3,31.6,16.6 43 | "42",177,33.4,38.7,17.1 44 | "43",293.6,27.7,1.8,20.7 45 | "44",206.9,8.4,26.4,12.9 46 | "45",25.1,25.7,43.3,8.5 47 | "46",175.1,22.5,31.5,14.9 48 | "47",89.7,9.9,35.7,10.6 49 | "48",239.9,41.5,18.5,23.2 50 | "49",227.2,15.8,49.9,14.8 51 | "50",66.9,11.7,36.8,9.7 52 | "51",199.8,3.1,34.6,11.4 53 | "52",100.4,9.6,3.6,10.7 54 | "53",216.4,41.7,39.6,22.6 55 | "54",182.6,46.2,58.7,21.2 56 | "55",262.7,28.8,15.9,20.2 57 | "56",198.9,49.4,60,23.7 58 | "57",7.3,28.1,41.4,5.5 59 | "58",136.2,19.2,16.6,13.2 60 | "59",210.8,49.6,37.7,23.8 61 | "60",210.7,29.5,9.3,18.4 62 | "61",53.5,2,21.4,8.1 63 | "62",261.3,42.7,54.7,24.2 64 | "63",239.3,15.5,27.3,15.7 65 | "64",102.7,29.6,8.4,14 66 | "65",131.1,42.8,28.9,18 67 | "66",69,9.3,0.9,9.3 68 | "67",31.5,24.6,2.2,9.5 69 | "68",139.3,14.5,10.2,13.4 70 | "69",237.4,27.5,11,18.9 71 | "70",216.8,43.9,27.2,22.3 72 | "71",199.1,30.6,38.7,18.3 73 | "72",109.8,14.3,31.7,12.4 74 | "73",26.8,33,19.3,8.8 75 | "74",129.4,5.7,31.3,11 76 | "75",213.4,24.6,13.1,17 77 | "76",16.9,43.7,89.4,8.7 78 | "77",27.5,1.6,20.7,6.9 79 | "78",120.5,28.5,14.2,14.2 80 | "79",5.4,29.9,9.4,5.3 81 | "80",116,7.7,23.1,11 82 | "81",76.4,26.7,22.3,11.8 83 | "82",239.8,4.1,36.9,12.3 84 | "83",75.3,20.3,32.5,11.3 85 | "84",68.4,44.5,35.6,13.6 86 | "85",213.5,43,33.8,21.7 87 | "86",193.2,18.4,65.7,15.2 88 | "87",76.3,27.5,16,12 89 | "88",110.7,40.6,63.2,16 90 | "89",88.3,25.5,73.4,12.9 91 | "90",109.8,47.8,51.4,16.7 92 | "91",134.3,4.9,9.3,11.2 93 | "92",28.6,1.5,33,7.3 94 | "93",217.7,33.5,59,19.4 95 | "94",250.9,36.5,72.3,22.2 96 | "95",107.4,14,10.9,11.5 97 | "96",163.3,31.6,52.9,16.9 98 | "97",197.6,3.5,5.9,11.7 99 | "98",184.9,21,22,15.5 100 | "99",289.7,42.3,51.2,25.4 101 | "100",135.2,41.7,45.9,17.2 102 | "101",222.4,4.3,49.8,11.7 103 | "102",296.4,36.3,100.9,23.8 104 | "103",280.2,10.1,21.4,14.8 105 | 
"104",187.9,17.2,17.9,14.7 106 | "105",238.2,34.3,5.3,20.7 107 | "106",137.9,46.4,59,19.2 108 | "107",25,11,29.7,7.2 109 | "108",90.4,0.3,23.2,8.7 110 | "109",13.1,0.4,25.6,5.3 111 | "110",255.4,26.9,5.5,19.8 112 | "111",225.8,8.2,56.5,13.4 113 | "112",241.7,38,23.2,21.8 114 | "113",175.7,15.4,2.4,14.1 115 | "114",209.6,20.6,10.7,15.9 116 | "115",78.2,46.8,34.5,14.6 117 | "116",75.1,35,52.7,12.6 118 | "117",139.2,14.3,25.6,12.2 119 | "118",76.4,0.8,14.8,9.4 120 | "119",125.7,36.9,79.2,15.9 121 | "120",19.4,16,22.3,6.6 122 | "121",141.3,26.8,46.2,15.5 123 | "122",18.8,21.7,50.4,7 124 | "123",224,2.4,15.6,11.6 125 | "124",123.1,34.6,12.4,15.2 126 | "125",229.5,32.3,74.2,19.7 127 | "126",87.2,11.8,25.9,10.6 128 | "127",7.8,38.9,50.6,6.6 129 | "128",80.2,0,9.2,8.8 130 | "129",220.3,49,3.2,24.7 131 | "130",59.6,12,43.1,9.7 132 | "131",0.7,39.6,8.7,1.6 133 | "132",265.2,2.9,43,12.7 134 | "133",8.4,27.2,2.1,5.7 135 | "134",219.8,33.5,45.1,19.6 136 | "135",36.9,38.6,65.6,10.8 137 | "136",48.3,47,8.5,11.6 138 | "137",25.6,39,9.3,9.5 139 | "138",273.7,28.9,59.7,20.8 140 | "139",43,25.9,20.5,9.6 141 | "140",184.9,43.9,1.7,20.7 142 | "141",73.4,17,12.9,10.9 143 | "142",193.7,35.4,75.6,19.2 144 | "143",220.5,33.2,37.9,20.1 145 | "144",104.6,5.7,34.4,10.4 146 | "145",96.2,14.8,38.9,11.4 147 | "146",140.3,1.9,9,10.3 148 | "147",240.1,7.3,8.7,13.2 149 | "148",243.2,49,44.3,25.4 150 | "149",38,40.3,11.9,10.9 151 | "150",44.7,25.8,20.6,10.1 152 | "151",280.7,13.9,37,16.1 153 | "152",121,8.4,48.7,11.6 154 | "153",197.6,23.3,14.2,16.6 155 | "154",171.3,39.7,37.7,19 156 | "155",187.8,21.1,9.5,15.6 157 | "156",4.1,11.6,5.7,3.2 158 | "157",93.9,43.5,50.5,15.3 159 | "158",149.8,1.3,24.3,10.1 160 | "159",11.7,36.9,45.2,7.3 161 | "160",131.7,18.4,34.6,12.9 162 | "161",172.5,18.1,30.7,14.4 163 | "162",85.7,35.8,49.3,13.3 164 | "163",188.4,18.1,25.6,14.9 165 | "164",163.5,36.8,7.4,18 166 | "165",117.2,14.7,5.4,11.9 167 | "166",234.5,3.4,84.8,11.9 168 | "167",17.9,37.6,21.6,8 169 | "168",206.8,5.2,19.4,12.2 170 | "169",215.4,23.6,57.6,17.1 171 | "170",284.3,10.6,6.4,15 172 | "171",50,11.6,18.4,8.4 173 | "172",164.5,20.9,47.4,14.5 174 | "173",19.6,20.1,17,7.6 175 | "174",168.4,7.1,12.8,11.7 176 | "175",222.4,3.4,13.1,11.5 177 | "176",276.9,48.9,41.8,27 178 | "177",248.4,30.2,20.3,20.2 179 | "178",170.2,7.8,35.2,11.7 180 | "179",276.7,2.3,23.7,11.8 181 | "180",165.6,10,17.6,12.6 182 | "181",156.6,2.6,8.3,10.5 183 | "182",218.5,5.4,27.4,12.2 184 | "183",56.2,5.7,29.7,8.7 185 | "184",287.6,43,71.8,26.2 186 | "185",253.8,21.3,30,17.6 187 | "186",205,45.1,19.6,22.6 188 | "187",139.5,2.1,26.6,10.3 189 | "188",191.1,28.7,18.2,17.3 190 | "189",286,13.9,3.7,15.9 191 | "190",18.7,12.1,23.4,6.7 192 | "191",39.5,41.1,5.8,10.8 193 | "192",75.5,10.8,6,9.9 194 | "193",17.2,4.1,31.6,5.9 195 | "194",166.8,42,3.6,19.6 196 | "195",149.7,35.6,6,17.3 197 | "196",38.2,3.7,13.8,7.6 198 | "197",94.2,4.9,8.1,9.7 199 | "198",177,9.3,6.4,12.8 200 | "199",283.6,42,66.2,25.5 201 | "200",232.1,8.6,8.7,13.4 202 | -------------------------------------------------------------------------------- /transformer/sample-pipelines/pipelines/Slowly Changing Dimensions - Type 2/README.md: -------------------------------------------------------------------------------- 1 | Slowly Changing Dimensions - Type 2 2 | =================================== 3 | 4 | This pipeline demonstrates how to handle Slowly Changing Dimension - Type 2 operations. 
5 | 6 | Setup And Technical Details 7 | --------------------------- 8 | 9 | * Download the [pipeline](SCDType2588a6d29-c8b9-439e-8bec-8b1f7b9c0e99.json) and import it into your Data Collector 10 | * Update the following pipeline parameters 11 | * OUTPUT_FILE_FOLDER 12 | * This is the path to the folder where you'd like output file to be created 13 | * MASTER_PRIMARY_KEY 14 | * This is the primary key of the dimension table. **Note**: If using the prepopulated customer data "as-is", leave the default value of *customer_id* 15 | * MASTER_VERSION_TRACKING_FIELD 16 | * This is the tracking field of the dimension table. **Note**: If using the prepopulated customer data "as-is", leave the default value of *version* 17 | 18 | 19 | Pipeline Overview 20 | ----------------- 21 | 22 | Let’s take a very simple yet concrete example of managing customer records (with updates to addresses) for existing and new customers. In this case, the assumption is that the destination is empty so it’s a one-time migration scenario for ingesting "master" and "change" records from respective origins to a new file destination. 23 | 24 | The pipeline has been prepopulated with sample customers "master" and "change" data using Dev Raw Data Source origins so you don't have to worry about loading it. 25 | 26 | "Master" data 27 | 28 |  29 | 30 | ``` 31 | customer_id,customer_fname,customer_lname,customer_email,customer_password,customer_street,customer_city,customer_state,customer_zipcode,version 32 | 1,Richard,Hernandez,XXXXXXXXX,XXXXXXXXX,6303 Heather Plaza,Brownsville,TX,78521,1 33 | 2,Mary,Barrett,XXXXXXXXX,XXXXXXXXX,9526 Noble Embers Ridge,Littleton,CO,80126,1 34 | 3,Ann,Smith,XXXXXXXXX,XXXXXXXXX,3422 Blue Pioneer Bend,Caguas,PR,00725,1 35 | ``` 36 | 37 | "Change" data 38 | 39 |  40 | 41 | ``` 42 | customer_id,customer_fname,customer_lname,customer_email,customer_password,customer_street,customer_city,customer_state,customer_zipcode 43 | 2,Mary,Barrett,XXXXXXXXX,XXXXXXXXX,4963 Ponderosa Ct,Park City,UT,80126 44 | 3,Ann,Smith,XXXXXXXXX,XXXXXXXXX,1991 Margo Pl,San Fran,CA,00725 45 | 3,Ann,Smith,XXXXXXXXX,XXXXXXXXX,1991 Little Canyon,Salt Lake City,UT,84098 46 | 4,Mark,Barrett,XXXXXXXXX,XXXXXXXXX,4963 Ponderosa Ct,Park City,UT,80126 47 | ``` 48 | 49 | *Note*: The "change" data does **not** include *version* tracking field. 50 | 51 | 52 | Pipeline Preview 53 | ---------------- 54 | 55 | Once you have updated the pipeline parameters, click on **Preview** icon to see how the data is being transformed as it is flowing through various stages in the pipeline. 56 | 57 | **Slowly Changing Dimension (SCD)** 58 | 59 |  60 | 61 | Using [Slowly Changing Dimension](https://streamsets.com/documentation/transformer/latest/help/transformer/Processors/SCDimension.html#concept_ixk_bbr_j3b) the pipeline is desgined to handle Slowly Changing Dimension - Type 2 operations. 62 | 63 | The attributes of interest on **Dimension** tab are: 64 | 65 | * **SCD Type**: Type 2 66 | * **Output Full Master Data**: Enabled (So all the records from "master" origin will be included in the output.) 67 | * **Key Fields**: *${MASTER_PRIMARY_KEY}* which is defaulted to *customer_id* for this particular dataset. It is used to join "master" and "change" data. 68 | * **Tracking Fields**: *${MASTER_VERSION_TRACKING_FIELD}* which is defaulted to *version* for this particular dataset. The SCD processor will set the value for it. 
69 | 70 | Given the datasets and above configuration, all of the "change" data records will have *__STREAMSETS_TRANSFORMER_METADATA_ATTRIBUTE_ChangeType__* set to "Insert". 71 | 72 |  73 | 74 | *Note*: The SCD processor sets *version* column to 1 where *customer_id* doesn’t exist in "master" dataset. Otherwise it will be incremented. 75 | 76 |  77 | 78 | Given the datasets and above configuration, all of the "master" data records will have *__STREAMSETS_TRANSFORMER_METADATA_ATTRIBUTE_ChangeType__* set to "PassThrough". 79 | 80 |  81 | 82 | *Note*: If **Output Full Master Data** was not enabled, these records would **not** be included in the output. 83 | 84 | 85 | **Sort** 86 | 87 |  88 | 89 | Using [Sort](https://streamsets.com/documentation/transformer/latest/help/transformer/Processors/Sort.html#concept_jw2_pq5_rgb) the records are sorted by *${MASTER_PRIMARY_KEY}* and *${MASTER_VERSION_TRACKING_FIELD}* so in this case the output records will ordered by *customer_id* and *version*. *Note*: This stage is optional and depends on your use case. 90 | 91 | 92 | **Repartition** 93 | 94 |  95 | 96 | Using [Repartition](https://streamsets.com/documentation/transformer/latest/help/transformer/Processors/Repartition.html#concept_cm5_lfg_wgb) will force the pipeline to generate output in a single file. *Note*: This stage is optional and depends on your use case. 97 | 98 | 99 | **Output** 100 | 101 |  102 | 103 | Given the two datasets above the resulting output will look like this. Notice that the total number of output records is **7**; **3** records from "master" origin for existing customers and **4** records from the "change" origin–-where **3** records are for existing customers (1 for Mary and 2 for Ann) with their updated address and version incremented, and one record for new customer Mark with version set to 1. 104 | 105 | ``` 106 | customer_id,customer_fname,customer_lname,customer_email,customer_password,customer_street,customer_city,customer_state,customer_zipcode,version 107 | 1,Richard,Hernandez,XXXXXXXXX,XXXXXXXXX,6303 Heather Plaza,Brownsville,TX,78521,1 108 | 2,Mary,Barrett,XXXXXXXXX,XXXXXXXXX,9526 Noble Embers Ridge,Littleton,CO,80126,1 109 | 2,Mary,Barrett,XXXXXXXXX,XXXXXXXXX,4963 Ponderosa Ct,Park City,UT,80126,2 110 | 3,Ann,Smith,XXXXXXXXX,XXXXXXXXX,3422 Blue Pioneer Bend,Caguas,PR,725,1 111 | 3,Ann,Smith,XXXXXXXXX,XXXXXXXXX,1991 Margo Pl,San Fran,CA,725,2 112 | 3,Ann,Smith,XXXXXXXXX,XXXXXXXXX,1991 Little Canyon,Salt Lake City,UT,84098,3 113 | 4,Mark,Barrett,XXXXXXXXX,XXXXXXXXX,4963 Ponderosa Ct,Park City,UT,80126,1 114 | ``` 115 | 116 | 117 | Pipeline Run 118 | ------------ 119 | 120 | Provided you've updated the pipeline parameters and there aren't any validation errors, running the pipleline should create one output file in CSV format. 121 | 122 | **CSV** -- [Sample output file](output/part-00000-d21cc8cc-75b9-4e69-aa56-55e5abe93bac-c000.csv). 123 | 124 | 125 | Design Patterns 126 | --------------- 127 | 128 | To learn more about a few design patterns for Slowly Changing Dimensions - Type 2, [read this blog](https://streamsets.com/blog/streamsets-transformer-design-patterns-for-slowly-changing-dimensions/). 
129 | -------------------------------------------------------------------------------- /transformer/sample-pipelines/pipelines/Slowly Changing Dimensions - Type 2/images/img1a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/transformer/sample-pipelines/pipelines/Slowly Changing Dimensions - Type 2/images/img1a.png -------------------------------------------------------------------------------- /transformer/sample-pipelines/pipelines/Slowly Changing Dimensions - Type 2/images/img1b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/transformer/sample-pipelines/pipelines/Slowly Changing Dimensions - Type 2/images/img1b.png -------------------------------------------------------------------------------- /transformer/sample-pipelines/pipelines/Slowly Changing Dimensions - Type 2/images/img2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/transformer/sample-pipelines/pipelines/Slowly Changing Dimensions - Type 2/images/img2.png -------------------------------------------------------------------------------- /transformer/sample-pipelines/pipelines/Slowly Changing Dimensions - Type 2/images/img2a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/transformer/sample-pipelines/pipelines/Slowly Changing Dimensions - Type 2/images/img2a.png -------------------------------------------------------------------------------- /transformer/sample-pipelines/pipelines/Slowly Changing Dimensions - Type 2/images/img2b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/transformer/sample-pipelines/pipelines/Slowly Changing Dimensions - Type 2/images/img2b.png -------------------------------------------------------------------------------- /transformer/sample-pipelines/pipelines/Slowly Changing Dimensions - Type 2/images/img2c.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/transformer/sample-pipelines/pipelines/Slowly Changing Dimensions - Type 2/images/img2c.png -------------------------------------------------------------------------------- /transformer/sample-pipelines/pipelines/Slowly Changing Dimensions - Type 2/images/img3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/transformer/sample-pipelines/pipelines/Slowly Changing Dimensions - Type 2/images/img3.png -------------------------------------------------------------------------------- /transformer/sample-pipelines/pipelines/Slowly Changing Dimensions - Type 2/images/img4.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/transformer/sample-pipelines/pipelines/Slowly Changing Dimensions - Type 2/images/img4.png -------------------------------------------------------------------------------- /transformer/sample-pipelines/pipelines/Slowly Changing Dimensions - Type 2/images/img5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/transformer/sample-pipelines/pipelines/Slowly Changing Dimensions - Type 2/images/img5.png -------------------------------------------------------------------------------- /transformer/sample-pipelines/pipelines/Slowly Changing Dimensions - Type 2/output/part-00000-d21cc8cc-75b9-4e69-aa56-55e5abe93bac-c000.csv: -------------------------------------------------------------------------------- 1 | customer_id,customer_fname,customer_lname,customer_email,customer_password,customer_street,customer_city,customer_state,customer_zipcode,version 2 | 1,Richard,Hernandez,XXXXXXXXX,XXXXXXXXX,6303 Heather Plaza,Brownsville,TX,78521,1 3 | 2,Mary,Barrett,XXXXXXXXX,XXXXXXXXX,9526 Noble Embers Ridge,Littleton,CO,80126,1 4 | 2,Mary,Barrett,XXXXXXXXX,XXXXXXXXX,4963 Ponderosa Ct,Park City,UT,80126,2 5 | 3,Ann,Smith,XXXXXXXXX,XXXXXXXXX,3422 Blue Pioneer Bend,Caguas,PR,725,1 6 | 3,Ann,Smith,XXXXXXXXX,XXXXXXXXX,1991 Margo Pl,San Fran,CA,725,2 7 | 3,Ann,Smith,XXXXXXXXX,XXXXXXXXX,1991 Little Canyon,Salt Lake City,UT,84098,3 8 | 4,Mark,Barrett,XXXXXXXXX,XXXXXXXXX,4963 Ponderosa Ct,Park City,UT,80126,1 9 | -------------------------------------------------------------------------------- /transformer/sample-pipelines/pipelines/Spark ETL To Derive Sales Insights on Azure HDInsight And Power BI/README.md: -------------------------------------------------------------------------------- 1 | Spark ETL To Derive Sales Insights on Azure HDInsight And Power BI 2 | ================================================================== 3 | 4 | This StreamSets Transformer pipeline runs on Apache Spark for Azure HDInsight cluster to extract raw data and transform it (cleanse and curate) before storing it in multiple destinations for efficient downstream analysis. The pipeline also uses technologies like Azure Data Lake Storage (ADLS) Gen2 and Azure SQL database, and the curated data is queried and visualized in Power BI. 5 | 6 | Prerequisites 7 | --------------------- 8 | 9 | * [StreamSets Transformer](https://streamsets.com/products/dataops-platform/transformer-etl/) 3.14.0 or higher. You can [deploy Transformer](https://streamsets.com/products/dataops-platform/transformer-etl/download/) on your choice of **cloud provider** or **download** it for local development. 
10 | * Access to Apache Spark for Azure HDInsight cluster 11 | * Access to ADLS Gen2 storage 12 | * Access to Azure SQL database 13 | 14 | Setup 15 | --------------------- 16 | 17 | * [Download and import the pipeline](SalesInsightsOnAzureSQLHDInsight37efd28a-9c98-494b-85bd-c6fd8a85af10.json) into your instance of Transformer 18 | * [Download the sample sales dataset](dataset/sales) and upload it to your ADLS Gen2 storage 19 | * After importing the pipeline into your environment and before running the pipeline, update the following pipeline parameters: 20 | 21 | ``` 22 | [ 23 | { 24 | "key": "HD_LIVY_ENDPOINT", 25 | "value": "" 26 | }, 27 | { 28 | "key": "HD_STAGING", 29 | "value": "" 30 | }, 31 | { 32 | "key": "HD_USER", 33 | "value": "" 34 | }, 35 | { 36 | "key": "HD_PASSWORD", 37 | "value": "" 38 | }, 39 | { 40 | "key": "ADLS_STORAGE_ACCOUNT", 41 | "value": "" 42 | }, 43 | { 44 | "key": "ADLS_FILESYSTEM_CONTAINER", 45 | "value": "" 46 | }, 47 | { 48 | "key": "ADLS_SHARED_KEY", 49 | "value": "" 50 | }, 51 | { 52 | "key": "AZURE_SQL_URL", 53 | "value": "" 54 | }, 55 | { 56 | "key": "ADLS_GEN2_SOURCE_PATH", 57 | "value": "" 58 | }, 59 | { 60 | "key": "AZURE_SQL_DB", 61 | "value": "" 62 | }, 63 | { 64 | "key": "AZURE_SQL_DB_USER", 65 | "value": "" 66 | }, 67 | { 68 | "key": "AZURE_SQL_DB_PWD", 69 | "value": "" 70 | }, 71 | { 72 | "key": "AZURE_SQL_DB_TABLE", 73 | "value": "" 74 | }, 75 | { 76 | "key": "ADLS_GEN2_DESTINATION_PATH", 77 | "value": "" 78 | }, 79 | { 80 | "key": "ADLS_GEN2_DESTINATION_PARTITION", 81 | "value": "" 82 | } 83 | ] 84 | 85 | ``` 86 | 87 | These pipeline parameter are used by various stages in the pipleine, such as, HDInsight cluster details, ADLS Gen2 information for loading raw/source data and also to store clean data, Azure SQL database and credentials for storing curated data, etc. 88 | 89 | Technical Details 90 | ------------------------------ 91 | 92 | For techincal information, detailed explanation of this use case and to see how the curated data in Azure SQL is analyzed in Power BI, read this [blog](https://bit.ly/SparkETLonHDInsight). 93 | 94 | -------------------------------------------------------------------------------- /transformer/sample-pipelines/pipelines/Tx Retail Inventory - Join Agg Repartition/Data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/streamsets/pipeline-library/5ac5793fa473d479b7208f7d6d52a8f339c035b3/transformer/sample-pipelines/pipelines/Tx Retail Inventory - Join Agg Repartition/Data.zip -------------------------------------------------------------------------------- /transformer/sample-pipelines/pipelines/Tx Retail Inventory - Join Agg Repartition/README.md: -------------------------------------------------------------------------------- 1 |  2 | 3 |Tx Retail Inventory - Join Agg Repartition
4 | 5 | # Tx Retail Inventory - Join Agg Repartition 6 | 7 | **Important:** *These instructions assume you have access to StreamSets Transformer* 8 | 9 | - For help installing [StreamSets Transformer](https://streamsets.com/products/dataops-platform/transformer-etl/), see [StreamSets Transformer Installation](https://streamsets.com/documentation/transformer/latest/help/transformer/Installation/Installation-Title.html). 10 | 11 | Here is a link to a short video on using this pipeline template: [Video Link](https://www.youtube.com/channel/UC_4K-__dngOCEmoZs7PVZAg) 12 | 13 | ## OVERVIEW 14 | 15 | This pipeline demonstrates how to use [StreamSets Transformer](https://streamsets.com/products/dataops-platform/transformer-etl/) to perform a Join, Aggregations and Repartitions. 16 | 17 | The source data for this pipeline is attached to this folder and requires you to download them and put them in a folder on your Transformer instance. This pipeline writes data to a file on the local file system. You could change the origins and destination to any supported types, but you would need to configure those manually. This pipeline uses the built-in spark node for demonstration only. You would want to execute this on your spark cluster. See [this link](https://streamsets.com/documentation/transformer/latest/help/transformer/Clusters/Clusters-Title.html) for more information. 18 | 19 | **Disclaimer:** *This pipeline is meant to serve as a template for using Joins, Aggregations and Repartitions.* 20 | 21 | ## USING THE TEMPLATE 22 | 23 | NOTE: [Templates](https://streamsets.com/documentation/controlhub/latest/help/controlhub/UserGuide/Pipelines/PipelineTemplates.html) are supported in [StreamSets Control Hub](https://streamsets.com/products/dataops-platform/control-hub/). If you do not have Control Hub, you can import the template pipeline in Data Collector but will need to do that each time you want to use the template. 
24 | 25 | ## PIPELINE 26 | 27 |  28 | 29 | ### Pipeline Description with links to documentation 30 | 31 | Stage | Description 32 | --- | --- 33 | [Retail and Store Details](https://streamsets.com/documentation/transformer/latest/help/transformer/Origins/File.html?contextID=concept_jcx_f2d_qgb) | Reads the source data from files on the local file system 34 | [Join by store_zip](https://streamsets.com/documentation/transformer/latest/help/transformer/Processors/Join.html?contextID=concept_xdr_slq_sgb) | Joins the data sources by store_zip 35 | [Generate IDs](https://streamsets.com/documentation/transformer/latest/help/transformer/Processors/SparkSQLExp.html?contextID=concept_akj_gsz_mhb) | Uses SparkSQL to generate an ID 36 | [Route Data](https://streamsets.com/documentation/transformer/latest/help/transformer/Processors/StreamSelector.html?contextID=concept_wv3_k4j_zgb) | Routes data based on lead time 37 | [Consolidate to 3/1](https://streamsets.com/documentation/transformer/latest/help/transformer/Processors/Repartition.html?contextID=concept_cm5_lfg_wgb) | Repartition - change the number of partitions that are written to file systems to 1 38 | [Get Avg Lead Time/Total Unit Price & Weight](https://streamsets.com/documentation/transformer/latest/help/transformer/Processors/Aggregate.html?contextID=concept_eby_fb4_wgb) | Aggregator to perform AVG and SUMs 39 | [Sort by City/Sort by Zipcode](https://streamsets.com/documentation/transformer/latest/help/transformer/Processors/Sort.html?contextID=concept_jw2_pq5_rgb) | Sorts the data 40 | [IDs/Avg Lead Time/Total Unit Price](https://streamsets.com/documentation/transformer/latest/help/transformer/Destinations/File-D.html?contextID=concept_akw_2r3_xgb) | Writes data to a local file system 41 | 42 | ## PREREQUISITES 43 | 44 | You need to download the source files from links below and put these on your Transformer machine. By default, the pipeline uses ```/data/store_retail``` as the source directory. This can be changed with the parameters below. Please create the source directories with proper permissions for the Transformer user to read/write files. The destination directories will be created automatically but please ensure the user has permissions to create and write to those directories. 45 | 46 | ## STEP-BY-STEP 47 | 48 | ### Step 1: Download the source data 49 | 50 | [Click Here](./Data.zip?raw=true) to download the source data 51 | 52 | * Move the file (Data.zip) to the Transformer machine (using SCP/FTP, etc...) 53 | * On the Transformer machine, create the directories for the source files 54 | * ```mkdir -p /data/store_retail``` 55 | * Ensure proper permissions are set 56 | * ```chmod 777 /data/store_retail``` 57 | * Unzip Data.zip to the directory created above 58 | * ```unzip -d /data/store_retail Data.zip``` 59 | 60 | ### Step 2: Download the pipeline 61 | 62 | [Click Here](./Tx_Retail_Inventory_join_agg_repartition.zip?raw=true) to download the pipeline and save it to your drive. 63 | 64 | ### Step 3: Import the pipeline 65 | 66 | Click the down arrow next to the "Create New Pipeline" and select "Import Pipeline From Archive". 67 | 68 |  69 | 70 | Click "Browse" and locate the pipeline file you just downloaded, click "OK", then click "Import" 71 | 72 |  73 | 74 | ### Step 4: Configure the parameters 75 | 76 | Click on the pipeline you just imported to open it and click on the "Parameters" tab and fill in the appropriate information for your environment. 
77 | 78 | **Important:** *For this pipeline, you only need to specify the input and output directories for the source and destination files. This is on the local file system where Transformer is installed.* 79 | 80 |  81 | 82 | The following parameters are set up for this pipeline:
| Parameter | Description |
| --------------- | --------------- |
| origin_directory | Path to the directory on the local file system that contains the source files. The default used by this template is `/data/store_retail`. |
| destination_directory | Path to the directory on the local file system for the output files. |
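In plain Spark terms, the "Join by store_zip", aggregation and "Consolidate to 1" stages described above boil down to roughly the following sketch. Only `store_zip` and the `/data/store_retail` source directory come from this README; every other column name, file name and path is an illustrative assumption, not the template's exact configuration.

```
// Rough Spark equivalent of the pipeline's Join / Aggregate / Repartition stages.
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{avg, sum}

object RetailSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("retail-sketch").getOrCreate()

  val retail = spark.read.option("header", "true").option("inferSchema", "true")
    .csv("/data/store_retail/retail_*.csv")     // assumed file naming
  val stores = spark.read.option("header", "true").option("inferSchema", "true")
    .csv("/data/store_retail/store_*.csv")      // assumed file naming

  // "Join by store_zip"
  val joined = retail.join(stores, Seq("store_zip"))

  // "Get Avg Lead Time / Total Unit Price" style aggregations (assumed column names)
  val aggregated = joined
    .groupBy("store_city")
    .agg(avg("lead_time").as("avg_lead_time"), sum("unit_price").as("total_unit_price"))

  // "Consolidate to 1": a single partition so a single output file is written
  aggregated.repartition(1).write.mode("overwrite").json("/data/output/retail_aggregates")
}
```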
Tx Scala UDF
4 | 5 | # Tx Scala UDF 6 | 7 | **Important:** *These instructions assume you have access to StreamSets Transformer* 8 | 9 | - For help installing [StreamSets Transformer](https://streamsets.com/products/dataops-platform/transformer-etl/), see [StreamSets Transformer Installation](https://streamsets.com/documentation/transformer/latest/help/transformer/Installation/Installation-Title.html). 10 | 11 | Here is a link to a short video on using this pipeline template: [Video Link](https://www.youtube.com/channel/UC_4K-__dngOCEmoZs7PVZAg) 12 | 13 | ## OVERVIEW 14 | 15 | This pipeline demonstrates how to create, register, and use a User-Defined Function in Scala using [StreamSets Transformer](https://streamsets.com/products/dataops-platform/transformer-etl/). 16 | 17 | The source data for this pipeline is included in the ```Dev Raw Data Source``` as an example. Typically, you would replace these with your actual source data (JDBC/Files/etc...). This template writes data to a file on the local file system, but you would typically replace this with your actual destination. 18 | 19 | **Disclaimer:** *This pipeline is meant to serve as a template for creating, registering and using a User-Defined Function in Scala* 20 | 21 | ## USING THE TEMPLATE 22 | 23 | NOTE: [Templates](https://streamsets.com/documentation/controlhub/latest/help/controlhub/UserGuide/Pipelines/PipelineTemplates.html) are supported in [StreamSets Control Hub](https://streamsets.com/products/dataops-platform/control-hub/). If you do not have Control Hub, you can import the template pipeline in Data Collector but will need to do that each time you want to use the template. 24 | 25 | ## PIPELINE 26 | 27 |  28 | 29 | ### Pipeline Description with links to documentation 30 | 31 | Stage | Description 32 | --- | --- 33 | `Dev Raw Data Source` | Generates records based on user-supplied data 34 | [Create UDFs](https://streamsets.com/documentation/transformer/latest/help/transformer/Processors/Scala.html?contextID=concept_vhf_jlj_x3b) | Creates a small example function and registers it with SparkSQL as a column function 35 | [Use UDF](https://streamsets.com/documentation/transformer/latest/help/transformer/Processors/SparkSQLExp.html?contextID=concept_akj_gsz_mhb) | Leverages created UDF as a SparkSQL Expression Function 36 | [Write udf](https://streamsets.com/documentation/transformer/latest/help/transformer/Destinations/File-D.html?contextID=concept_akw_2r3_xgb) | Writes data to a local file system 37 | 38 | 39 | ## STEP-BY-STEP 40 | 41 | ### Step 1: Download the pipeline 42 | 43 | [Click Here](./Tx_Scala_UDF.zip?raw=true) to download the pipeline and save it to your drive. 44 | 45 | ### Step 2: Import the pipeline 46 | 47 | Click the down arrow next to the "Create New Pipeline" and select "Import Pipeline From Archive". 48 | 49 |  50 | 51 | Click "Browse" and locate the pipeline file you just downloaded, click "OK", then click "Import" 52 | 53 |  54 | 55 | ### Step 3: Configure the parameters 56 | 57 | Click on the pipeline you just imported to open it and click on the "Parameters" tab and fill in the appropriate information for your environment. 58 | 59 | **Important:** *For this pipeline, you only need to specify the output directory for the file. This is on the local file system where Transformer is installed. Make sure the directory is created and proper permissions are set so that the transformer user can create files. By default, the directory ```/data/udf``` is used. 
You can change it to anything you want.* 60 | 61 |  62 | 63 | The following parameters are set up for this pipeline:
| Parameter | Description |
| --------------- | --------------- |
| destination_directory | Path to the directory on the local file system for the output files. The default used by this template is `/data/udf`. |
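Outside of Transformer, the create-then-use pattern implemented by the "Create UDFs" and "Use UDF" stages looks roughly like this sketch; the function name and logic are illustrative only, not the template's actual code.

```
// Minimal sketch of the pattern: define a Scala function, register it with Spark SQL,
// then call it by name from a SQL expression.
import org.apache.spark.sql.SparkSession

object UdfSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("udf-sketch").getOrCreate()

  // Register a trivial column function under an illustrative name.
  spark.udf.register("initials", (fullName: String) =>
    fullName.trim.split("\\s+").map(_.take(1).toUpperCase).mkString)

  // A downstream Spark SQL Expression stage could now use initials(<column>).
  spark.sql("SELECT initials('ada lovelace') AS example").show()
}
```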
Tx Slowly Changing Dimension - Type 1
4 | 5 | # Tx Slowly Changing Dimension - Type 1 6 | 7 | **Important:** *These instructions assume you have access to StreamSets Transformer* 8 | 9 | - For help installing [StreamSets Transformer](https://streamsets.com/products/dataops-platform/transformer-etl/), see [StreamSets Transformer Installation](https://streamsets.com/documentation/transformer/latest/help/transformer/Installation/Installation-Title.html). 10 | 11 | Here is a link to a short video on using this pipeline template: [Video Link](https://www.youtube.com/channel/UC_4K-__dngOCEmoZs7PVZAg) 12 | 13 | ## OVERVIEW 14 | 15 | This pipeline demonstrates how to perform a Slowly Changing Dimension - Type 1 using [StreamSets Transformer](https://streamsets.com/products/dataops-platform/transformer-etl/). 16 | 17 | The source data for this pipeline is included in the ```Dev Raw Data Source``` as an example. Typically, you would replace these with your actual source data (JDBC/Files). This template writes data to a file on the local file system, but you would typically replace this with your actual destination. 18 | 19 | **Disclaimer:** *This pipeline is meant to serve as a template for performing a Slowly Changing Dimension - Type 1.* 20 | 21 | ## USING THE TEMPLATE 22 | 23 | NOTE: [Templates](https://streamsets.com/documentation/controlhub/latest/help/controlhub/UserGuide/Pipelines/PipelineTemplates.html) are supported in [StreamSets Control Hub](https://streamsets.com/products/dataops-platform/control-hub/). If you do not have Control Hub, you can import the template pipeline in Data Collector but will need to do that each time you want to use the template. 24 | 25 | ## PIPELINE 26 | 27 |  28 | 29 | ### Pipeline Description with links to documentation 30 | 31 | Stage | Description 32 | --- | --- 33 | `Dev Raw Data Source` | Generates records based on user-supplied data 34 | [Slowly Changing Dimension](https://streamsets.com/documentation/transformer/latest/help/transformer/Processors/SCDimension.html?contextID=concept_ixk_bbr_j3b) | Generates updates to a Type 1 or Type 2 slowly changing dimension by evaluating change data against master dimension data 35 | [Consolidate to 1 file](https://streamsets.com/documentation/transformer/latest/help/transformer/Processors/Repartition.html?contextID=concept_cm5_lfg_wgb) | Repartition - change the number of partitions that are written to file systems to 1 36 | [Master](https://streamsets.com/documentation/transformer/latest/help/transformer/Destinations/File-D.html?contextID=concept_akw_2r3_xgb) | Writes data to a local file system 37 | 38 | 39 | ## STEP-BY-STEP 40 | 41 | ### Step 1: Download the pipeline 42 | 43 | [Click Here](./Tx_SCD_Type1.zip?raw=true) to download the pipeline and save it to your drive. 44 | 45 | ### Step 2: Import the pipeline 46 | 47 | Click the down arrow next to the "Create New Pipeline" and select "Import Pipeline From Archive". 48 | 49 |  50 | 51 | Click "Browse" and locate the pipeline file you just downloaded, click "OK", then click "Import" 52 | 53 |  54 | 55 | ### Step 3: Configure the parameters 56 | 57 | Click on the pipeline you just imported to open it and click on the "Parameters" tab and fill in the appropriate information for your environment. 58 | 59 | **Important:** *For this pipeline, you only need to specify the output directory for the file. This is on the local file system where Transformer is installed. Make sure the directory is created and proper permissions are set so that the transformer user can create files. 
By default, the directory ```/data/master``` is used. You can change it to anything you want.* 60 | 61 |  62 | 63 | The following parameters are set up for this pipeline:
| Parameter | Description |
| --------------- | --------------- |
| destination_directory | Path to the directory on the local file system for the output files. The default used by this template is `/data/master`. |
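For a quick intuition of what Type 1 means, here is a hedged, illustrative example (not the data embedded in the template's Dev Raw Data Source), borrowing two customers from the Type 2 sample elsewhere in this repository: the change record simply overwrites the matching master record in place, and no version history is kept.

```
master (before):
customer_id,customer_fname,customer_street,customer_city
1,Richard,6303 Heather Plaza,Brownsville
2,Mary,9526 Noble Embers Ridge,Littleton

change:
customer_id,customer_fname,customer_street,customer_city
2,Mary,4963 Ponderosa Ct,Park City

master (after Type 1 update):
customer_id,customer_fname,customer_street,customer_city
1,Richard,6303 Heather Plaza,Brownsville
2,Mary,4963 Ponderosa Ct,Park City
```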
Transformer: Sample Scripts
4 | 5 | This folder contains sample scripts for StreamSets Transformer. 6 | 7 | The following sample scripts are currently available: 8 | | Name | Description | 9 | | --------------- | --------------- | 10 | 11 | **COMING SOON** 12 | 13 | # Help 14 | 15 | For any queries, questions, comments related to these pipelines reach out on any of these channels: 16 | 17 | [Chat on Slack](https://streamsetters-slack.herokuapp.com/) 18 | 19 | [User Group](https://groups.google.com/a/streamsets.com/d/forum/sdc-user) 20 | 21 | [Ask StreamSets](https://ask.streamsets.com/questions/) 22 | --------------------------------------------------------------------------------