├── .gitignore ├── data_modelling ├── analyses │ └── .gitkeep ├── macros │ └── .gitkeep ├── seeds │ └── .gitkeep ├── snapshots │ └── .gitkeep ├── tests │ └── .gitkeep ├── .gitignore ├── models │ ├── dimensions │ │ ├── dim_forex_rates.sql │ │ ├── dim_outcome_details.sql │ │ ├── dim_billing_details.sql │ │ ├── dim_source_details.sql │ │ ├── dim_payment_method_details.sql │ │ └── schema.yml │ ├── core │ │ ├── facts_transactions.sql │ │ └── schema.yml │ └── staging │ │ ├── stg_transactions.sql │ │ └── schema.yml ├── README.md └── dbt_project.yml ├── forex-rates ├── requirements.txt ├── Dockerfile └── forex_rates.py ├── postgres ├── Dockerfile └── create_table.sql ├── kafka-producer ├── requirements.txt ├── Dockerfile ├── python-producer.py └── wait-for-it.sh ├── kafka-consumer ├── requirements.txt ├── Dockerfile ├── wait-for-it.sh ├── spark_schema.py └── python-consumer.py ├── project-png ├── transactions_stream_data_model.png └── transactions_stream_project_diagram.png ├── grafana └── dashboards │ └── grafana_datasources.yaml ├── sample.env ├── Makefile ├── docker-compose.yml └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | .env 2 | .DS_Store -------------------------------------------------------------------------------- /data_modelling/analyses/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data_modelling/macros/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data_modelling/seeds/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data_modelling/snapshots/.gitkeep: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data_modelling/tests/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data_modelling/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | target/ 3 | dbt_packages/ 4 | logs/ 5 | -------------------------------------------------------------------------------- /forex-rates/requirements.txt: -------------------------------------------------------------------------------- 1 | psycopg2-binary 2 | requests 3 | schedule 4 | -------------------------------------------------------------------------------- /postgres/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM postgres:latest 2 | 3 | COPY create_table.sql /docker-entrypoint-initdb.d/ -------------------------------------------------------------------------------- /kafka-producer/requirements.txt: -------------------------------------------------------------------------------- 1 | kafka-python==2.0.2 2 | schedule==1.1.0 3 | aiokafka==0.7.2 4 | stripe 5 | -------------------------------------------------------------------------------- /kafka-consumer/requirements.txt: -------------------------------------------------------------------------------- 1 | kafka-python==2.0.2 2 | schedule==1.1.0 3 | aiokafka==0.7.2 4 | psycopg2-binary 5 | pyspark 6 | -------------------------------------------------------------------------------- /project-png/transactions_stream_data_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/divakaivan/transaction-stream-data-pipeline/HEAD/project-png/transactions_stream_data_model.png 
# Image for the forex-rates service: installs the Python dependencies listed in
# requirements.txt and runs forex_rates.py.
FROM python:3.11-slim

WORKDIR /

# Copy the dependency list on its own so the pip layer below is only rebuilt
# when requirements.txt changes.
COPY requirements.txt .

RUN pip install --no-cache-dir -r requirements.txt

# COPY instead of ADD: this is a plain local file, so none of ADD's extra
# behaviour (remote URLs, archive extraction) is wanted.
COPY forex_rates.py .

# Exec form starts python directly instead of through `/bin/sh -c`.
# -u keeps stdout unbuffered so print() output shows up in `docker logs`.
CMD ["python", "-u", "forex_rates.py"]
# Template for the .env file read by every service in docker-compose.yml.
# Copy to `.env` and fill in the values.

# Stripe (kafka-producer)
STRIPE_API_KEY=

# PostgreSQL / pgAdmin
POSTGRES_HOST=
POSTGRES_USER=
POSTGRES_PASSWORD=
POSTGRES_DB=
PGADMIN_DEFAULT_EMAIL=
PGADMIN_DEFAULT_PASSWORD=
POSTGRES_TABLE=

# Kafka producer / consumer
KAFKA_TOPIC=
KAFKA_SERVER=
ZOOKEEPER_SERVER=
PRODUCER_INTERVAL=

# ZooKeeper container
ZOOKEEPER_CLIENT_PORT=
# Spelling fixed (was ZOOKEPER_TICK_TIME) to match the other ZOOKEEPER_* keys.
ZOOKEEPER_TICK_TIME=

# Kafka broker container
KAFKA_BROKER_ID=
KAFKA_ADVERTISED_HOST_NAME=
KAFKA_ZOOKEEPER_CONNECT=
KAFKA_ADVERTISED_LISTENERS=PLAINTEXT://$KAFKA_SERVER
KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR=
JMX_PORT=
{#
  Outcome dimension.
  Takes every row of stg_transactions and exposes its id together with the
  outcome_* columns, dropping the "outcome_" prefix (outcome_type keeps its name).
#}
{{ config(materialized='table') }}

with outcome as (

    select
        stg.id,
        stg.outcome_network_status as network_status,
        stg.outcome_reason         as reason,
        stg.outcome_risk_level     as risk_level,
        stg.outcome_risk_score     as risk_score,
        stg.outcome_seller_message as seller_message,
        stg.outcome_type
    from {{ ref('stg_transactions') }} as stg

)

select
    id,
    network_status,
    reason,
    risk_level,
    risk_score,
    seller_message,
    outcome_type
from outcome

-- Development runs (is_dev_run, true by default) keep only 100 rows.
{% if var("is_dev_run", default=true) %} limit 100 {% endif %}
# Convenience targets for the docker-compose stack and the dbt project.
# Run `make` (or `make help`) to list them; the help text is taken from the
# `## ...` comment that follows each target name.

# Compose file used by the docker targets
DOCKER_COMPOSE_FILE=docker-compose.yml

# Default target when `make` is run without arguments
.DEFAULT_GOAL := help

.PHONY: help
help: ## Show this help message
	@echo ""
	@echo "Usage: make [option]"
	@echo ""
	@echo "Options:"
	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[36m%-20s\033[0m %s\n", $$1, $$2}'
	@echo ""

.PHONY: build
build: ## Build docker services
	docker-compose -f $(DOCKER_COMPOSE_FILE) build

.PHONY: start
start: ## Start docker services (detached mode)
	docker-compose -f $(DOCKER_COMPOSE_FILE) up -d

.PHONY: stop
stop: ## Stop docker services
	docker-compose -f $(DOCKER_COMPOSE_FILE) stop

# .PHONY has to name the actual target (`dbt-test`); otherwise a file or
# directory called `dbt-test` would make this rule look up to date.
# `dbt build` without --vars uses is_dev_run's default (true), so models are
# built with LIMIT 100.
.PHONY: dbt-test
dbt-test: ## Run dbt with LIMIT 100
	cd data_modelling && dbt build

.PHONY: dbt-full
dbt-full: ## Run dbt with full data
	cd data_modelling && dbt build --vars '{"is_dev_run": false}'
14 | model-paths: ["models"] 15 | analysis-paths: ["analyses"] 16 | test-paths: ["tests"] 17 | seed-paths: ["seeds"] 18 | macro-paths: ["macros"] 19 | snapshot-paths: ["snapshots"] 20 | 21 | clean-targets: # directories to be removed by `dbt clean` 22 | - "target" 23 | - "dbt_packages" 24 | 25 | 26 | # Configuring models 27 | # Full documentation: https://docs.getdbt.com/docs/configuring-models 28 | 29 | # In this example config, we tell dbt to build all models in the example/ 30 | # directory as views. These settings can be overridden in the individual model 31 | # files using the `{{ config(...) }}` macro. 32 | models: 33 | data_modelling: 34 | 35 | -------------------------------------------------------------------------------- /data_modelling/models/dimensions/dim_source_details.sql: -------------------------------------------------------------------------------- 1 | {{ config( 2 | materialized='table' 3 | ) }} 4 | 5 | with dim_source_details_data as ( 6 | select 7 | id, 8 | source_address_city as address_city, 9 | source_address_country as address_country, 10 | source_address_line1 as address_line1, 11 | source_address_line1_check as address_line1_check, 12 | source_address_line2 as address_line2, 13 | source_address_state as address_state, 14 | source_address_zip as address_zip, 15 | source_address_zip_check as address_zip_check, 16 | source_brand as brand, 17 | source_country as country, 18 | source_customer as customer, 19 | source_cvc_check as cvc_check, 20 | source_dynamic_last4 as dynamic_last4, 21 | source_exp_month as exp_month, 22 | source_exp_year as exp_year, 23 | source_fingerprint as fingerprint, 24 | source_funding as funding, 25 | source_id as source_id, 26 | source_last4 as last4, 27 | source_name as "name", 28 | source_object as object_type, 29 | source_tokenization_method as tokenization_method, 30 | source_wallet as wallet, 31 | source_transfer as transfer 32 | from {{ ref('stg_transactions') }} 33 | ) 34 | 35 | select * from 
# Kafka connection settings, read once at start-up from the environment.
kafka_nodes = os.getenv('KAFKA_SERVER')
my_topic = os.getenv('KAFKA_TOPIC')

# Number of test charges created per batch (original note: "stripe create limit").
NUM_CHARGES = 25
# Seconds between batches when PRODUCER_INTERVAL is unset, empty or invalid.
DEFAULT_INTERVAL_SECONDS = 3


def get_interval_seconds():
    """Return the number of seconds to wait between batches.

    Reads the PRODUCER_INTERVAL environment variable and falls back to
    DEFAULT_INTERVAL_SECONDS when it is missing, empty, not an integer or
    not positive.

    NOTE(review): sample.env declares PRODUCER_INTERVAL; it is interpreted
    as whole seconds here -- confirm that is the intended unit.
    """
    raw = (os.getenv('PRODUCER_INTERVAL') or '').strip()
    if not raw:
        return DEFAULT_INTERVAL_SECONDS
    try:
        seconds = int(raw)
    except ValueError:
        seconds = 0
    if seconds <= 0:
        print(f"Ignoring invalid PRODUCER_INTERVAL={raw!r}; using {DEFAULT_INTERVAL_SECONDS}s")
        return DEFAULT_INTERVAL_SECONDS
    return seconds


def create_test_charge():
    """Create one Stripe test charge in GBP with a random amount.

    Returns:
        The created charge object, or None when Stripe rejects the request
        (the error is printed, not raised).
    """
    try:
        # Amount is in pence: 100 .. 1,000,000 is GBP 1 .. GBP 10,000.
        amount = random.randint(100, 1000000)
        stripe.api_key = os.getenv('STRIPE_API_KEY')
        charge = stripe.Charge.create(
            amount=amount,
            currency='gbp',
            source='tok_visa',
        )
        return charge
    except stripe.error.CardError as e:
        print(f"Card declined: {e.error.message}")
        return None
    except stripe.error.StripeError as e:
        print(f"Stripe error: {e}")
        return None


def send_to_kafka(charges):
    """Publish a batch of charges to the configured Kafka topic.

    The batch is sent as one JSON message of the form
    ``{"transactions": [...]}``. Errors are printed and swallowed so the
    scheduler loop keeps running.

    Args:
        charges: list of JSON-serialisable charge objects.
    """
    prod = None
    try:
        prod = KafkaProducer(bootstrap_servers=kafka_nodes, api_version=(2, 0, 2),
                             value_serializer=lambda x: dumps(x).encode('utf-8'))

        my_data = {'transactions': charges}
        prod.send(my_topic, value=my_data)
        prod.flush()

        print(f"Sent {len(charges)} transactions to Kafka")

    except Exception as e:
        print(f"Error sending to Kafka: {e}")
    finally:
        # A producer is created for every batch; close it so its connections
        # are released instead of accumulating for the life of the process.
        if prod is not None:
            try:
                prod.close()
            except Exception as e:
                print(f"Error closing Kafka producer: {e}")


def gen_data():
    """Create NUM_CHARGES test charges and publish the successful ones.

    Charges that failed (create_test_charge returned None) are dropped so the
    message never contains null transactions; nothing is sent when every
    charge failed.
    """
    attempts = [create_test_charge() for _ in range(NUM_CHARGES)]
    charges = [charge for charge in attempts if charge is not None]

    failed = len(attempts) - len(charges)
    if failed:
        print(f"Skipped {failed} failed charge(s)")
    if not charges:
        print("No charges created; nothing sent to Kafka")
        return

    send_to_kafka(charges)


if __name__ == '__main__':
    schedule.every(get_interval_seconds()).seconds.do(gen_data)
    try:
        while True:
            schedule.run_pending()
            time.sleep(0.5)
    except KeyboardInterrupt:
        print("Stopping...")
# Image for the Kafka consumer: Python 3.8 + Java 11 + Spark 3.5.1, started
# through spark-submit once ZooKeeper and Kafka are reachable.
FROM python:3.8-slim-buster

# Install necessary dependencies
RUN apt-get update && \
    apt-get install -y --no-install-recommends openjdk-11-jre-headless ca-certificates-java procps wget && \
    apt-get clean && \
    update-ca-certificates -f && \
    rm -rf /var/lib/apt/lists/*

# Verify the Java installation
RUN java -version

# Install Spark
# Downloaded from archive.apache.org, which keeps every release; the
# dlcdn.apache.org mirror only carries current releases, so a pinned version
# disappears from it once a newer one is published.
# Download, unpack and delete the tarball in one layer so it is not kept in the image.
RUN wget -q https://archive.apache.org/dist/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3.tgz && \
    tar -xzf spark-3.5.1-bin-hadoop3.tgz -C /opt && \
    rm spark-3.5.1-bin-hadoop3.tgz
ENV SPARK_HOME=/opt/spark-3.5.1-bin-hadoop3
ENV PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin

# Install postgres class
# NOTE(review): this directory is not under SPARK_HOME; presumably the consumer
# script points at this jar explicitly -- confirm against python-consumer.py.
RUN mkdir -p /opt/spark/jars/
RUN wget -O /opt/spark/jars/postgresql-42.2.20.jar https://jdbc.postgresql.org/download/postgresql-42.2.20.jar

# Find the Java installation path from the resolved `java` binary and symlink it
# to a fixed path, so the ENV JAVA_HOME below is valid.
RUN export JAVA_HOME=$(dirname $(dirname $(readlink -f $(which java)))) && \
    echo $JAVA_HOME && \
    ln -s $JAVA_HOME /usr/lib/jvm/java-11-openjdk-amd64 && \
    echo "export JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64" >> /etc/profile && \
    echo "export PATH=\$PATH:\$JAVA_HOME/bin" >> /etc/profile

# Set environment variables for Java in Dockerfile scope
ENV JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64
ENV PATH=$PATH:$JAVA_HOME/bin

# Verify the JAVA_HOME path
RUN echo $JAVA_HOME && ls -l $JAVA_HOME/bin/java

# Copy requirements.txt and install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy resources
WORKDIR /
COPY wait-for-it.sh wait-for-it.sh

# NOTE(review): spark_schema.py sits next to python-consumer.py in the repo and
# was not copied into the image; it is copied here on the assumption that the
# consumer imports it -- confirm.
COPY spark_schema.py .
COPY python-consumer.py .

# Wait (30 s each) for ZooKeeper and Kafka, then submit the consumer with the
# Kafka source and PostgreSQL JDBC packages.
CMD ["/bin/bash", "-c", "/wait-for-it.sh -s -t 30 $ZOOKEEPER_SERVER -- /wait-for-it.sh -s -t 30 $KAFKA_SERVER -- ${SPARK_HOME}/bin/spark-submit --packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.1,org.postgresql:postgresql:42.2.20 python-consumer.py"]
as card_overcapture_maximum_amount, 27 | payment_method_details_card_overcapture_status as card_overcapture_status, 28 | payment_method_details_card_three_d_secure as card_three_d_secure, 29 | payment_method_details_card_wallet as card_wallet, 30 | payment_method_details_type as "type" 31 | from {{ ref('stg_transactions') }} 32 | ) 33 | 34 | select * from dim_payment_method_details_data 35 | 36 | {% if var("is_dev_run", default=true) %} limit 100 {% endif %} -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | networks: 2 | bridge: 3 | driver: bridge 4 | 5 | services: 6 | zookeeper: 7 | image: confluentinc/cp-zookeeper:latest 8 | env_file: 9 | - .env 10 | networks: 11 | bridge: 12 | aliases: 13 | - zookeeper 14 | 15 | kafka: 16 | image: confluentinc/cp-kafka 17 | depends_on: 18 | - zookeeper 19 | env_file: 20 | - .env 21 | networks: 22 | bridge: 23 | aliases: 24 | - kafka 25 | 26 | kafka-producer: 27 | build: 28 | context: ./kafka-producer 29 | container_name: kafka-producer 30 | depends_on: 31 | - kafka 32 | - postgres 33 | - kafka-consumer 34 | env_file: 35 | - .env 36 | networks: 37 | - bridge 38 | 39 | kafka-consumer: 40 | build: 41 | context: ./kafka-consumer 42 | container_name: kafka-consumer 43 | depends_on: 44 | - kafka 45 | - postgres 46 | env_file: 47 | - .env 48 | networks: 49 | - bridge 50 | 51 | postgres: 52 | build: 53 | context: ./postgres 54 | container_name: postgres 55 | restart: always 56 | env_file: 57 | - .env 58 | ports: 59 | - "5432:5432" 60 | networks: 61 | - bridge 62 | 63 | pgadmin: 64 | image: dpage/pgadmin4 65 | restart: always 66 | env_file: 67 | - .env 68 | ports: 69 | - "8080:80" 70 | depends_on: 71 | - postgres 72 | networks: 73 | - bridge 74 | 75 | forex-rates: 76 | build: 77 | context: ./forex-rates 78 | container_name: forex-rates 79 | restart: always 80 | env_file: 81 | - .env 82 | networks: 83 | 
# URL template of the currency API; {version} is a date (YYYY-MM-DD) or "latest".
FOREX_API_URL = "https://cdn.jsdelivr.net/npm/@fawazahmed0/currency-api@{version}/v1/currencies/gbp.json"
# top currencies only
TOP_CURRENCIES = ["usd", "eur", "jpy", "cad", "aud", "chf", "cny", "sek", "nzd", "mxn"]
# Upper bound for one HTTP request so a stalled connection cannot block the scheduler forever.
REQUEST_TIMEOUT_SECONDS = 10


def get_forex_rates():
    """Fetch today's GBP exchange rates for the top currencies.

    Requests the package version named after today's date first and falls
    back to the "latest" version when that request fails (for example when
    today's package has not been published yet).

    Returns:
        dict with key 'date' (as reported by the API) plus one key per
        currency in TOP_CURRENCIES holding the rate from the API's 'gbp' map.

    Raises:
        RuntimeError: when neither URL could be fetched and decoded.
        KeyError: when the response lacks one of the expected currencies.
    """
    today = datetime.now().strftime("%Y-%m-%d")
    data = None
    last_error = None
    for version in (today, "latest"):
        try:
            response = requests.get(FOREX_API_URL.format(version=version),
                                    timeout=REQUEST_TIMEOUT_SECONDS)
            response.raise_for_status()
            data = response.json()
            break
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a body that is not valid JSON.
            last_error = e
            print(f"Could not fetch forex rates for version '{version}': {e}")
    if data is None:
        raise RuntimeError(f"Could not fetch forex rates: {last_error}") from last_error

    rates = data.get('gbp') or {}
    missing = [currency for currency in TOP_CURRENCIES if currency not in rates]
    if missing:
        raise KeyError(f"Forex response is missing currencies: {missing}")

    top_currency_rates = {'date': data.get('date'), **{currency: rates[currency] for currency in TOP_CURRENCIES}}
    return top_currency_rates


def insert_into_db(data):
    """Store one day of rates in the forex_rates table.

    Creates the table when it does not exist and inserts the row only when no
    row with the same date is present. The connection is closed even when a
    statement fails.

    Args:
        data: dict as returned by get_forex_rates().
    """
    conn = psycopg2.connect(
        host=os.getenv("POSTGRES_HOST"),
        database=os.getenv("POSTGRES_DB"),
        user=os.getenv("POSTGRES_USER"),
        password=os.getenv("POSTGRES_PASSWORD")
    )
    try:
        with conn.cursor() as cursor:
            # doing it here so I do not lose the already loaded data into postgres
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS forex_rates (
                    date DATE PRIMARY KEY,
                    usd FLOAT,
                    eur FLOAT,
                    jpy FLOAT,
                    cad FLOAT,
                    aud FLOAT,
                    chf FLOAT,
                    cny FLOAT,
                    sek FLOAT,
                    nzd FLOAT,
                    mxn FLOAT
                )
            """)

            cursor.execute("SELECT date FROM forex_rates WHERE date = %s", (data['date'],))
            if cursor.fetchone() is None:
                cursor.execute("""
                    INSERT INTO forex_rates (date, usd, eur, jpy, cad, aud, chf, cny, sek, nzd, mxn)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                """, (
                    data['date'],
                    data['usd'],
                    data['eur'],
                    data['jpy'],
                    data['cad'],
                    data['aud'],
                    data['chf'],
                    data['cny'],
                    data['sek'],
                    data['nzd'],
                    data['mxn']
                ))

        conn.commit()
    finally:
        # Closing without commit discards any partial work of a failed run.
        conn.close()


def job():
    """Fetch the rates and store them; a failure is logged, not raised.

    Swallowing the error keeps the scheduler loop alive so the next scheduled
    run can retry, instead of the process exiting on a transient API or
    database problem.
    """
    try:
        data = get_forex_rates()
        insert_into_db(data)
    except Exception as e:
        print(f"Forex rates job failed: {e}")
        return
    print('New forex rates fetched and inserted into the database.')


if __name__ == "__main__":
    job()
    schedule.every(24).hours.do(job)

    try:
        while True:
            schedule.run_pending()
            time.sleep(1)
    except KeyboardInterrupt:
        print("Stopping...")
16 | 17 | * **dbt (Data Build Tool)**: dbt is used to manage and transform data within PostgreSQL. Data is split into dimension tables and a fact table. 18 | 19 | * **Grafana**: Finally, the data stored in PostgreSQL is visualized using Grafana. 20 | 21 | 22 | # Data model 23 | 24 | ![transactions_stream_data_model](/project-png/transactions_stream_data_model.png) 25 | 26 | # dbt documentation 27 | 28 | [Link to the docs](https://transaction-stream-data-docs.netlify.app/) 29 | 30 | #### dbt lineage 31 | 32 | image 33 | 34 | # Visualisation 35 | 36 | image 37 | 38 | # Considerations for improvements 39 | 40 | * add PySpark tests 41 | * use an orchestrator 42 | * use more data 43 | * Spark might be overkill given the limited amount of data, but I wanted to learn how to set up Spark Streaming in case the data volume grows 44 | * for a better Grafana visualisation 45 | * maybe find an alternative transactions data source because the Stripe API has a rate limit of 25 requests 46 | * also many of the generated values in a transaction from the Stripe API are null 47 | 48 | # Setup 49 | 50 | 1. `git clone https://github.com/divakaivan/transaction-stream-data-pipeline.git` 51 | 2. Rename `sample.env` to `.env` and fill in the necessary environment variables 52 | 3. 
# Column documentation and data tests for the core (fact) models.
# All tests use severity "warn", so a failing test is reported without
# failing the dbt build.
version: 2

models:
  - name: facts_transactions
    description: "Transaction facts"
    columns:
      - name: transaction_id
        description: "Transaction ID"
        data_type: string
        data_tests:
          - not_null:
              severity: warn
          - unique:
              severity: warn
      - name: transaction_date
        description: "Timestamp of when the transaction was created"
        data_type: datetime
        data_tests:
          - not_null:
              severity: warn
      - name: amount
        description: "Amount of the transaction - 100 = £1"
        data_type: numeric
        data_tests:
          - not_null:
              severity: warn
      # Amounts use the same unit as `amount` (100 = £1), i.e. pence.
      - name: amount_captured
        description: "Amount in pence captured (can be less than the amount attribute on the charge if a partial capture was made)"
        data_type: numeric
      - name: amount_refunded
        description: "Amount in pence refunded (can be less than the amount attribute on the charge if a partial refund was issued)"
        data_type: numeric
      - name: currency
        description: "Three-letter ISO currency code, in lowercase. Must be a supported currency"
        data_type: string
        data_tests:
          - not_null:
              severity: warn
      - name: customer
        description: "The ID of the customer this charge is for if one exists"
        data_type: string
      - name: payment_intent
        description: "The ID of the PaymentIntent associated with this charge, if one exists"
        data_type: string
      - name: payment_method
        description: "ID of the payment method used in this charge"
        data_type: string
      - name: status
        description: "The status of the payment is either succeeded, pending, or failed"
        data_type: string
      # The forex_* columns come from the GBP-based rates table (forex_rates),
      # so each value is the amount of the named currency per 1 GBP.
      - name: forex_usd
        data_type: numeric
        description: "GBP to USD exchange rate (USD per 1 GBP)"
        data_tests:
          - not_null:
              severity: warn
      - name: forex_eur
        data_type: numeric
        description: "GBP to EUR exchange rate (EUR per 1 GBP)"
        data_tests:
          - not_null:
              severity: warn
      - name: forex_jpy
        data_type: numeric
        description: "GBP to JPY exchange rate (JPY per 1 GBP)"
        data_tests:
          - not_null:
              severity: warn
      - name: forex_cad
        data_type: numeric
        description: "GBP to CAD exchange rate (CAD per 1 GBP)"
        data_tests:
          - not_null:
              severity: warn
      - name: forex_aud
        data_type: numeric
        description: "GBP to AUD exchange rate (AUD per 1 GBP)"
        data_tests:
          - not_null:
              severity: warn
      - name: forex_chf
        data_type: numeric
        description: "GBP to CHF exchange rate (CHF per 1 GBP)"
        data_tests:
          - not_null:
              severity: warn
      - name: forex_cny
        data_type: numeric
        description: "GBP to CNY exchange rate (CNY per 1 GBP)"
        data_tests:
          - not_null:
              severity: warn
      - name: forex_sek
        data_type: numeric
        description: "GBP to SEK exchange rate (SEK per 1 GBP)"
        data_tests:
          - not_null:
              severity: warn
      - name: forex_nzd
        data_type: numeric
        description: "GBP to NZD exchange rate (NZD per 1 GBP)"
        data_tests:
          - not_null:
              severity: warn
      - name: forex_mxn
        data_type: numeric
        description: "GBP to MXN exchange rate (MXN per 1 GBP)"
        data_tests:
          - not_null:
              severity: warn
11 | application_fee, 12 | application_fee_amount, 13 | balance_transaction, 14 | billing_details_address_city, 15 | billing_details_address_country, 16 | billing_details_address_line1, 17 | billing_details_address_line2, 18 | billing_details_address_postal_code, 19 | billing_details_address_state, 20 | billing_details_email, 21 | billing_details_name, 22 | billing_details_phone, 23 | calculated_statement_descriptor, 24 | captured, 25 | created, 26 | currency, 27 | customer, 28 | "description", 29 | destination, 30 | dispute, 31 | disputed, 32 | failure_balance_transaction, 33 | failure_code, 34 | failure_message, 35 | id, 36 | invoice, 37 | livemode, 38 | "object", 39 | on_behalf_of, 40 | "order", 41 | outcome_network_status, 42 | outcome_reason, 43 | outcome_risk_level, 44 | outcome_risk_score, 45 | outcome_seller_message, 46 | outcome_type, 47 | paid, 48 | payment_intent, 49 | payment_method, 50 | payment_method_details_card_amount_authorized, 51 | payment_method_details_card_brand, 52 | payment_method_details_card_checks_address_line1_check, 53 | payment_method_details_card_checks_address_postal_code_check, 54 | payment_method_details_card_checks_cvc_check, 55 | payment_method_details_card_country, 56 | payment_method_details_card_exp_month, 57 | payment_method_details_card_exp_year, 58 | payment_method_details_card_extended_authorization_status, 59 | payment_method_details_card_fingerprint, 60 | payment_method_details_card_funding, 61 | payment_method_details_card_incremental_authorization_status, 62 | payment_method_details_card_installments, 63 | payment_method_details_card_last4, 64 | payment_method_details_card_mandate, 65 | payment_method_details_card_multicapture_status, 66 | payment_method_details_card_network, 67 | payment_method_details_card_network_token_used, 68 | payment_method_details_card_overcapture_maximum_amount, 69 | payment_method_details_card_overcapture_status, 70 | payment_method_details_card_three_d_secure, 71 | 
payment_method_details_card_wallet, 72 | payment_method_details_type, 73 | receipt_email, 74 | receipt_number, 75 | receipt_url, 76 | refunded, 77 | review, 78 | shipping, 79 | source_address_city, 80 | source_address_country, 81 | source_address_line1, 82 | source_address_line1_check, 83 | source_address_line2, 84 | source_address_state, 85 | source_address_zip, 86 | source_address_zip_check, 87 | source_brand, 88 | source_country, 89 | source_customer, 90 | source_cvc_check, 91 | source_dynamic_last4, 92 | source_exp_month, 93 | source_exp_year, 94 | source_fingerprint, 95 | source_funding, 96 | source_id, 97 | source_last4, 98 | source_name, 99 | source_object, 100 | source_tokenization_method, 101 | source_wallet, 102 | source_transfer, 103 | statement_descriptor, 104 | statement_descriptor_suffix, 105 | "status", 106 | transfer_data, 107 | transfer_group 108 | FROM {{ source('postgres', 'transactions') }} 109 | ) 110 | 111 | select * from source_data 112 | 113 | {% if var("is_dev_run", default=true) %} limit 100 {% endif %} -------------------------------------------------------------------------------- /postgres/create_table.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS transactions ( 2 | amount BIGINT, 3 | amount_captured BIGINT, 4 | amount_refunded BIGINT, 5 | "application" VARCHAR(255), 6 | application_fee VARCHAR(255), 7 | application_fee_amount VARCHAR(255), 8 | balance_transaction VARCHAR(255), 9 | billing_details_address_city VARCHAR(255), 10 | billing_details_address_country VARCHAR(255), 11 | billing_details_address_line1 VARCHAR(255), 12 | billing_details_address_line2 VARCHAR(255), 13 | billing_details_address_postal_code VARCHAR(255), 14 | billing_details_address_state VARCHAR(255), 15 | billing_details_email VARCHAR(255), 16 | billing_details_name VARCHAR(255), 17 | billing_details_phone VARCHAR(255), 18 | calculated_statement_descriptor VARCHAR(255), 19 | captured BOOLEAN, 20 
| created TIMESTAMP, 21 | currency VARCHAR(3), 22 | customer VARCHAR(255), 23 | "description" TEXT, 24 | destination VARCHAR(255), 25 | dispute VARCHAR(255), 26 | disputed BOOLEAN, 27 | failure_balance_transaction VARCHAR(255), 28 | failure_code VARCHAR(255), 29 | failure_message TEXT, 30 | id VARCHAR(255), 31 | invoice VARCHAR(255), 32 | livemode BOOLEAN, 33 | "object" VARCHAR(255), 34 | on_behalf_of VARCHAR(255), 35 | "order" VARCHAR(255), 36 | outcome_network_status VARCHAR(255), 37 | outcome_reason VARCHAR(255), 38 | outcome_risk_level VARCHAR(255), 39 | outcome_risk_score BIGINT, 40 | outcome_seller_message VARCHAR(255), 41 | outcome_type VARCHAR(255), 42 | paid BOOLEAN, 43 | payment_intent VARCHAR(255), 44 | payment_method VARCHAR(255), 45 | payment_method_details_card_amount_authorized BIGINT, 46 | payment_method_details_card_brand VARCHAR(255), 47 | payment_method_details_card_checks_address_line1_check VARCHAR(255), 48 | payment_method_details_card_checks_address_postal_code_check VARCHAR(255), 49 | payment_method_details_card_checks_cvc_check VARCHAR(255), 50 | payment_method_details_card_country VARCHAR(2), 51 | payment_method_details_card_exp_month INT, 52 | payment_method_details_card_exp_year INT, 53 | payment_method_details_card_extended_authorization_status VARCHAR(255), 54 | payment_method_details_card_fingerprint VARCHAR(255), 55 | payment_method_details_card_funding VARCHAR(255), 56 | payment_method_details_card_incremental_authorization_status VARCHAR(255), 57 | payment_method_details_card_installments VARCHAR(255), 58 | payment_method_details_card_last4 VARCHAR(4), 59 | payment_method_details_card_mandate VARCHAR(255), 60 | payment_method_details_card_multicapture_status VARCHAR(255), 61 | payment_method_details_card_network VARCHAR(255), 62 | payment_method_details_card_network_token_used BOOLEAN, 63 | payment_method_details_card_overcapture_maximum_amount BIGINT, 64 | payment_method_details_card_overcapture_status VARCHAR(255), 65 | 
payment_method_details_card_three_d_secure VARCHAR(255), 66 | payment_method_details_card_wallet VARCHAR(255), 67 | payment_method_details_type VARCHAR(255), 68 | receipt_email VARCHAR(255), 69 | receipt_number VARCHAR(255), 70 | receipt_url TEXT, 71 | refunded BOOLEAN, 72 | review VARCHAR(255), 73 | shipping VARCHAR(255), 74 | source_address_city VARCHAR(255), 75 | source_address_country VARCHAR(255), 76 | source_address_line1 VARCHAR(255), 77 | source_address_line1_check VARCHAR(255), 78 | source_address_line2 VARCHAR(255), 79 | source_address_state VARCHAR(255), 80 | source_address_zip VARCHAR(255), 81 | source_address_zip_check VARCHAR(255), 82 | source_brand VARCHAR(255), 83 | source_country VARCHAR(2), 84 | source_customer VARCHAR(255), 85 | source_cvc_check VARCHAR(255), 86 | source_dynamic_last4 VARCHAR(4), 87 | source_exp_month INT, 88 | source_exp_year INT, 89 | source_fingerprint VARCHAR(255), 90 | source_funding VARCHAR(255), 91 | source_id VARCHAR(255), 92 | source_last4 VARCHAR(4), 93 | source_name VARCHAR(255), 94 | source_object VARCHAR(255), 95 | source_tokenization_method VARCHAR(255), 96 | source_wallet VARCHAR(255), 97 | source_transfer VARCHAR(255), 98 | statement_descriptor VARCHAR(255), 99 | statement_descriptor_suffix VARCHAR(255), 100 | "status" VARCHAR(255), 101 | transfer_data VARCHAR(255), 102 | transfer_group VARCHAR(255) 103 | ); 104 | -------------------------------------------------------------------------------- /kafka-consumer/wait-for-it.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Use this script to test if a given TCP host/port are available 3 | 4 | cmdname=$(basename $0) 5 | 6 | echoerr() { if [[ $QUIET -ne 1 ]]; then echo "$@" 1>&2; fi } 7 | 8 | usage() 9 | { 10 | cat << USAGE >&2 11 | Usage: 12 | $cmdname host:port [-s] [-t timeout] [-- command args] 13 | -h HOST | --host=HOST Host or IP under test 14 | -p PORT | --port=PORT TCP port under test 15 | 
Alternatively, you specify the host and port as host:port 16 | -s | --strict Only execute subcommand if the test succeeds 17 | -q | --quiet Don't output any status messages 18 | -t TIMEOUT | --timeout=TIMEOUT 19 | Timeout in seconds, zero for no timeout 20 | -- COMMAND ARGS Execute command with args after the test finishes 21 | USAGE 22 | exit 1 23 | } 24 | 25 | wait_for() 26 | { 27 | if [[ $TIMEOUT -gt 0 ]]; then 28 | echoerr "$cmdname: waiting $TIMEOUT seconds for $HOST:$PORT" 29 | else 30 | echoerr "$cmdname: waiting for $HOST:$PORT without a timeout" 31 | fi 32 | start_ts=$(date +%s) 33 | while : 34 | do 35 | if [[ $ISBUSY -eq 1 ]]; then 36 | nc -z $HOST $PORT 37 | result=$? 38 | else 39 | (echo > /dev/tcp/$HOST/$PORT) >/dev/null 2>&1 40 | result=$? 41 | fi 42 | if [[ $result -eq 0 ]]; then 43 | end_ts=$(date +%s) 44 | echoerr "$cmdname: $HOST:$PORT is available after $((end_ts - start_ts)) seconds" 45 | break 46 | fi 47 | sleep 1 48 | done 49 | return $result 50 | } 51 | 52 | wait_for_wrapper() 53 | { 54 | # In order to support SIGINT during timeout: http://unix.stackexchange.com/a/57692 55 | if [[ $QUIET -eq 1 ]]; then 56 | timeout $BUSYTIMEFLAG $TIMEOUT $0 --quiet --child --host=$HOST --port=$PORT --timeout=$TIMEOUT & 57 | else 58 | timeout $BUSYTIMEFLAG $TIMEOUT $0 --child --host=$HOST --port=$PORT --timeout=$TIMEOUT & 59 | fi 60 | PID=$! 61 | trap "kill -INT -$PID" INT 62 | wait $PID 63 | RESULT=$? 
64 | if [[ $RESULT -ne 0 ]]; then 65 | echoerr "$cmdname: timeout occurred after waiting $TIMEOUT seconds for $HOST:$PORT" 66 | fi 67 | return $RESULT 68 | } 69 | 70 | # process arguments 71 | while [[ $# -gt 0 ]] 72 | do 73 | case "$1" in 74 | *:* ) 75 | hostport=(${1//:/ }) 76 | HOST=${hostport[0]} 77 | PORT=${hostport[1]} 78 | shift 1 79 | ;; 80 | --child) 81 | CHILD=1 82 | shift 1 83 | ;; 84 | -q | --quiet) 85 | QUIET=1 86 | shift 1 87 | ;; 88 | -s | --strict) 89 | STRICT=1 90 | shift 1 91 | ;; 92 | -h) 93 | HOST="$2" 94 | if [[ $HOST == "" ]]; then break; fi 95 | shift 2 96 | ;; 97 | --host=*) 98 | HOST="${1#*=}" 99 | shift 1 100 | ;; 101 | -p) 102 | PORT="$2" 103 | if [[ $PORT == "" ]]; then break; fi 104 | shift 2 105 | ;; 106 | --port=*) 107 | PORT="${1#*=}" 108 | shift 1 109 | ;; 110 | -t) 111 | TIMEOUT="$2" 112 | if [[ $TIMEOUT == "" ]]; then break; fi 113 | shift 2 114 | ;; 115 | --timeout=*) 116 | TIMEOUT="${1#*=}" 117 | shift 1 118 | ;; 119 | --) 120 | shift 121 | CLI=("$@") 122 | break 123 | ;; 124 | --help) 125 | usage 126 | ;; 127 | *) 128 | echoerr "Unknown argument: $1" 129 | usage 130 | ;; 131 | esac 132 | done 133 | 134 | if [[ "$HOST" == "" || "$PORT" == "" ]]; then 135 | echoerr "Error: you need to provide a host and port to test." 136 | usage 137 | fi 138 | 139 | TIMEOUT=${TIMEOUT:-15} 140 | STRICT=${STRICT:-0} 141 | CHILD=${CHILD:-0} 142 | QUIET=${QUIET:-0} 143 | 144 | # check to see if timeout is from busybox? 145 | # busybox builds pass the timeout via -t and probe the port with nc instead of /dev/tcp 146 | TIMEOUT_PATH=$(realpath $(which timeout)) 147 | if [[ $TIMEOUT_PATH =~ "busybox" ]]; then 148 | ISBUSY=1 149 | BUSYTIMEFLAG="-t" 150 | else 151 | ISBUSY=0 152 | BUSYTIMEFLAG="" 153 | fi 154 | 155 | if [[ $CHILD -gt 0 ]]; then 156 | wait_for 157 | RESULT=$? 158 | exit $RESULT 159 | else 160 | if [[ $TIMEOUT -gt 0 ]]; then 161 | wait_for_wrapper 162 | RESULT=$? 163 | else 164 | wait_for 165 | RESULT=$? 
166 | fi 167 | fi 168 | 169 | if [[ $CLI != "" ]]; then 170 | if [[ $RESULT -ne 0 && $STRICT -eq 1 ]]; then 171 | echoerr "$cmdname: strict mode, refusing to execute subprocess" 172 | exit $RESULT 173 | fi 174 | exec "${CLI[@]}" 175 | else 176 | exit $RESULT 177 | fi -------------------------------------------------------------------------------- /kafka-producer/wait-for-it.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Use this script to test if a given TCP host/port are available 3 | 4 | cmdname=$(basename $0) 5 | 6 | echoerr() { if [[ $QUIET -ne 1 ]]; then echo "$@" 1>&2; fi } 7 | 8 | usage() 9 | { 10 | cat << USAGE >&2 11 | Usage: 12 | $cmdname host:port [-s] [-t timeout] [-- command args] 13 | -h HOST | --host=HOST Host or IP under test 14 | -p PORT | --port=PORT TCP port under test 15 | Alternatively, you specify the host and port as host:port 16 | -s | --strict Only execute subcommand if the test succeeds 17 | -q | --quiet Don't output any status messages 18 | -t TIMEOUT | --timeout=TIMEOUT 19 | Timeout in seconds, zero for no timeout 20 | -- COMMAND ARGS Execute command with args after the test finishes 21 | USAGE 22 | exit 1 23 | } 24 | 25 | wait_for() 26 | { 27 | if [[ $TIMEOUT -gt 0 ]]; then 28 | echoerr "$cmdname: waiting $TIMEOUT seconds for $HOST:$PORT" 29 | else 30 | echoerr "$cmdname: waiting for $HOST:$PORT without a timeout" 31 | fi 32 | start_ts=$(date +%s) 33 | while : 34 | do 35 | if [[ $ISBUSY -eq 1 ]]; then 36 | nc -z $HOST $PORT 37 | result=$? 38 | else 39 | (echo > /dev/tcp/$HOST/$PORT) >/dev/null 2>&1 40 | result=$? 
41 | fi 42 | if [[ $result -eq 0 ]]; then 43 | end_ts=$(date +%s) 44 | echoerr "$cmdname: $HOST:$PORT is available after $((end_ts - start_ts)) seconds" 45 | break 46 | fi 47 | sleep 1 48 | done 49 | return $result 50 | } 51 | 52 | wait_for_wrapper() 53 | { 54 | # In order to support SIGINT during timeout: http://unix.stackexchange.com/a/57692 55 | if [[ $QUIET -eq 1 ]]; then 56 | timeout $BUSYTIMEFLAG $TIMEOUT $0 --quiet --child --host=$HOST --port=$PORT --timeout=$TIMEOUT & 57 | else 58 | timeout $BUSYTIMEFLAG $TIMEOUT $0 --child --host=$HOST --port=$PORT --timeout=$TIMEOUT & 59 | fi 60 | PID=$! 61 | trap "kill -INT -$PID" INT 62 | wait $PID 63 | RESULT=$? 64 | if [[ $RESULT -ne 0 ]]; then 65 | echoerr "$cmdname: timeout occurred after waiting $TIMEOUT seconds for $HOST:$PORT" 66 | fi 67 | return $RESULT 68 | } 69 | 70 | # process arguments 71 | while [[ $# -gt 0 ]] 72 | do 73 | case "$1" in 74 | *:* ) 75 | hostport=(${1//:/ }) 76 | HOST=${hostport[0]} 77 | PORT=${hostport[1]} 78 | shift 1 79 | ;; 80 | --child) 81 | CHILD=1 82 | shift 1 83 | ;; 84 | -q | --quiet) 85 | QUIET=1 86 | shift 1 87 | ;; 88 | -s | --strict) 89 | STRICT=1 90 | shift 1 91 | ;; 92 | -h) 93 | HOST="$2" 94 | if [[ $HOST == "" ]]; then break; fi 95 | shift 2 96 | ;; 97 | --host=*) 98 | HOST="${1#*=}" 99 | shift 1 100 | ;; 101 | -p) 102 | PORT="$2" 103 | if [[ $PORT == "" ]]; then break; fi 104 | shift 2 105 | ;; 106 | --port=*) 107 | PORT="${1#*=}" 108 | shift 1 109 | ;; 110 | -t) 111 | TIMEOUT="$2" 112 | if [[ $TIMEOUT == "" ]]; then break; fi 113 | shift 2 114 | ;; 115 | --timeout=*) 116 | TIMEOUT="${1#*=}" 117 | shift 1 118 | ;; 119 | --) 120 | shift 121 | CLI=("$@") 122 | break 123 | ;; 124 | --help) 125 | usage 126 | ;; 127 | *) 128 | echoerr "Unknown argument: $1" 129 | usage 130 | ;; 131 | esac 132 | done 133 | 134 | if [[ "$HOST" == "" || "$PORT" == "" ]]; then 135 | echoerr "Error: you need to provide a host and port to test." 
136 | usage 137 | fi 138 | 139 | TIMEOUT=${TIMEOUT:-15} 140 | STRICT=${STRICT:-0} 141 | CHILD=${CHILD:-0} 142 | QUIET=${QUIET:-0} 143 | 144 | # check to see if timeout is from busybox? 145 | # busybox builds pass the timeout via -t and probe the port with nc instead of /dev/tcp 146 | TIMEOUT_PATH=$(realpath $(which timeout)) 147 | if [[ $TIMEOUT_PATH =~ "busybox" ]]; then 148 | ISBUSY=1 149 | BUSYTIMEFLAG="-t" 150 | else 151 | ISBUSY=0 152 | BUSYTIMEFLAG="" 153 | fi 154 | 155 | if [[ $CHILD -gt 0 ]]; then 156 | wait_for 157 | RESULT=$? 158 | exit $RESULT 159 | else 160 | if [[ $TIMEOUT -gt 0 ]]; then 161 | wait_for_wrapper 162 | RESULT=$? 163 | else 164 | wait_for 165 | RESULT=$? 166 | fi 167 | fi 168 | 169 | if [[ $CLI != "" ]]; then 170 | if [[ $RESULT -ne 0 && $STRICT -eq 1 ]]; then 171 | echoerr "$cmdname: strict mode, refusing to execute subprocess" 172 | exit $RESULT 173 | fi 174 | exec "${CLI[@]}" 175 | else 176 | exit $RESULT 177 | fi -------------------------------------------------------------------------------- /kafka-consumer/spark_schema.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql.types import StructType, StructField, StringType, LongType, BooleanType, ArrayType, MapType 2 | 3 | schema = StructType([ 4 | StructField("transactions", ArrayType( 5 | StructType([ 6 | StructField("amount", LongType(), True), 7 | StructField("amount_captured", LongType(), True), 8 | StructField("amount_refunded", LongType(), True), 9 | StructField("application", StringType(), True), 10 | StructField("application_fee", StringType(), True), 11 | StructField("application_fee_amount", StringType(), True), 12 | StructField("balance_transaction", StringType(), True), 13 | StructField("billing_details", StructType([ 14 | StructField("address", StructType([ 15 | StructField("city", StringType(), True), 16 | StructField("country", StringType(), True), 17 | StructField("line1", StringType(), True), 18 | StructField("line2", StringType(), True), 19 | 
StructField("postal_code", StringType(), True), 20 | StructField("state", StringType(), True) 21 | ]), True), 22 | StructField("email", StringType(), True), 23 | StructField("name", StringType(), True), 24 | StructField("phone", StringType(), True) 25 | ]), True), 26 | StructField("calculated_statement_descriptor", StringType(), True), 27 | StructField("captured", BooleanType(), True), 28 | StructField("created", LongType(), True), 29 | StructField("currency", StringType(), True), 30 | StructField("customer", StringType(), True), 31 | StructField("description", StringType(), True), 32 | StructField("destination", StringType(), True), 33 | StructField("dispute", StringType(), True), 34 | StructField("disputed", BooleanType(), True), 35 | StructField("failure_balance_transaction", StringType(), True), 36 | StructField("failure_code", StringType(), True), 37 | StructField("failure_message", StringType(), True), 38 | StructField("fraud_details", MapType(StringType(), StringType()), True), 39 | StructField("id", StringType(), True), 40 | StructField("invoice", StringType(), True), 41 | StructField("livemode", BooleanType(), True), 42 | StructField("metadata", MapType(StringType(), StringType()), True), 43 | StructField("object", StringType(), True), 44 | StructField("on_behalf_of", StringType(), True), 45 | StructField("order", StringType(), True), 46 | StructField("outcome", StructType([ 47 | StructField("network_status", StringType(), True), 48 | StructField("reason", StringType(), True), 49 | StructField("risk_level", StringType(), True), 50 | StructField("risk_score", LongType(), True), 51 | StructField("seller_message", StringType(), True), 52 | StructField("type", StringType(), True) 53 | ]), True), 54 | StructField("paid", BooleanType(), True), 55 | StructField("payment_intent", StringType(), True), 56 | StructField("payment_method", StringType(), True), 57 | StructField("payment_method_details", StructType([ 58 | StructField("card", StructType([ 59 | 
StructField("amount_authorized", LongType(), True), 60 | StructField("brand", StringType(), True), 61 | StructField("checks", StructType([ 62 | StructField("address_line1_check", StringType(), True), 63 | StructField("address_postal_code_check", StringType(), True), 64 | StructField("cvc_check", StringType(), True) 65 | ]), True), 66 | StructField("country", StringType(), True), 67 | StructField("exp_month", LongType(), True), 68 | StructField("exp_year", LongType(), True), 69 | StructField("extended_authorization", StructType([ 70 | StructField("status", StringType(), True) 71 | ]), True), 72 | StructField("fingerprint", StringType(), True), 73 | StructField("funding", StringType(), True), 74 | StructField("incremental_authorization", StructType([ 75 | StructField("status", StringType(), True) 76 | ]), True), 77 | StructField("installments", StringType(), True), 78 | StructField("last4", StringType(), True), 79 | StructField("mandate", StringType(), True), 80 | StructField("multicapture", StructType([ 81 | StructField("status", StringType(), True) 82 | ]), True), 83 | StructField("network", StringType(), True), 84 | StructField("network_token", StructType([ 85 | StructField("used", BooleanType(), True) 86 | ]), True), 87 | StructField("overcapture", StructType([ 88 | StructField("maximum_amount_capturable", LongType(), True), 89 | StructField("status", StringType(), True) 90 | ]), True), 91 | StructField("three_d_secure", StringType(), True), 92 | StructField("wallet", StringType(), True) 93 | ]), True), 94 | StructField("type", StringType(), True) 95 | ]), True), 96 | StructField("receipt_email", StringType(), True), 97 | StructField("receipt_number", StringType(), True), 98 | StructField("receipt_url", StringType(), True), 99 | StructField("refunded", BooleanType(), True), 100 | StructField("review", StringType(), True), 101 | StructField("shipping", StringType(), True), 102 | StructField("source", StructType([ 103 | StructField("address_city", StringType(), 
True), 104 | StructField("address_country", StringType(), True), 105 | StructField("address_line1", StringType(), True), 106 | StructField("address_line1_check", StringType(), True), 107 | StructField("address_line2", StringType(), True), 108 | StructField("address_state", StringType(), True), 109 | StructField("address_zip", StringType(), True), 110 | StructField("address_zip_check", StringType(), True), 111 | StructField("brand", StringType(), True), 112 | StructField("country", StringType(), True), 113 | StructField("customer", StringType(), True), 114 | StructField("cvc_check", StringType(), True), 115 | StructField("dynamic_last4", StringType(), True), 116 | StructField("exp_month", LongType(), True), 117 | StructField("exp_year", LongType(), True), 118 | StructField("fingerprint", StringType(), True), 119 | StructField("funding", StringType(), True), 120 | StructField("id", StringType(), True), 121 | StructField("last4", StringType(), True), 122 | StructField("metadata", MapType(StringType(), StringType()), True), 123 | StructField("name", StringType(), True), 124 | StructField("object", StringType(), True), 125 | StructField("tokenization_method", StringType(), True), 126 | StructField("wallet", StringType(), True) 127 | ]), True), 128 | StructField("source_transfer", StringType(), True), 129 | StructField("statement_descriptor", StringType(), True), 130 | StructField("statement_descriptor_suffix", StringType(), True), 131 | StructField("status", StringType(), True), 132 | StructField("transfer_data", StringType(), True), 133 | StructField("transfer_group", StringType(), True) 134 | ]) 135 | )) 136 | ]) 137 | -------------------------------------------------------------------------------- /kafka-consumer/python-consumer.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyspark.sql import SparkSession 4 | from pyspark.sql.types import LongType 5 | from pyspark.sql.functions import from_json, col, 
explode, from_unixtime 6 | 7 | from spark_schema import schema 8 | 9 | kafka_nodes = os.getenv('KAFKA_SERVER') 10 | my_topic = os.getenv('KAFKA_TOPIC') 11 | 12 | spark = SparkSession.builder \ 13 | .appName("KafkaConsumer") \ 14 | .getOrCreate() 15 | 16 | df = spark \ 17 | .readStream \ 18 | .format("kafka") \ 19 | .option("kafka.bootstrap.servers", kafka_nodes) \ 20 | .option("subscribe", my_topic) \ 21 | .option("startingOffsets", "latest") \ 22 | .load() 23 | 24 | df = df.withColumn("value", col("value").cast("string")) 25 | 26 | df_parsed = df.withColumn("parsed_value", from_json(col("value"), schema)) \ 27 | .select("parsed_value.*") 28 | 29 | df_exploded = df_parsed.select(explode(col("transactions")).alias("transaction")) 30 | 31 | df_final = df_exploded.selectExpr( 32 | "transaction.amount as amount", 33 | "transaction.amount_captured as amount_captured", 34 | "transaction.amount_refunded as amount_refunded", 35 | "transaction.application as application", 36 | "transaction.application_fee application_fee", 37 | "transaction.application_fee_amount as application_fee_amount", 38 | "transaction.balance_transaction as balance_transaction", 39 | "transaction.billing_details.address.city as billing_details_address_city", 40 | "transaction.billing_details.address.country as billing_details_address_country", 41 | "transaction.billing_details.address.line1 as billing_details_address_line1", 42 | "transaction.billing_details.address.line2 as billing_details_address_line2", 43 | "transaction.billing_details.address.postal_code as billing_details_address_postal_code", 44 | "transaction.billing_details.address.state as billing_details_address_state", 45 | "transaction.billing_details.email as billing_details_email", 46 | "transaction.billing_details.name as billing_details_name", 47 | "transaction.billing_details.phone as billing_details_phone", 48 | "transaction.calculated_statement_descriptor as calculated_statement_descriptor", 49 | "transaction.captured as 
captured", 50 | "transaction.created as created", 51 | "transaction.currency as currency", 52 | "transaction.customer as customer", 53 | "transaction.description as description", 54 | "transaction.destination as destination", 55 | "transaction.dispute as dispute", 56 | "transaction.disputed as disputed", 57 | "transaction.failure_balance_transaction as failure_balance_transaction", 58 | "transaction.failure_code as failure_code", 59 | "transaction.failure_message as failure_message", 60 | "transaction.id as id", 61 | "transaction.invoice as invoice", 62 | "transaction.livemode as livemode", 63 | "transaction.object as object", 64 | "transaction.on_behalf_of as on_behalf_of", 65 | "transaction.order as order", 66 | "transaction.outcome.network_status as outcome_network_status", 67 | "transaction.outcome.reason as outcome_reason", 68 | "transaction.outcome.risk_level as outcome_risk_level", 69 | "transaction.outcome.risk_score as outcome_risk_score", 70 | "transaction.outcome.seller_message as outcome_seller_message", 71 | "transaction.outcome.type as outcome_type", 72 | "transaction.paid as paid", 73 | "transaction.payment_intent as payment_intent", 74 | "transaction.payment_method as payment_method", 75 | "transaction.payment_method_details.card.amount_authorized as payment_method_details_card_amount_authorized", 76 | "transaction.payment_method_details.card.brand as payment_method_details_card_brand", 77 | "transaction.payment_method_details.card.checks.address_line1_check as payment_method_details_card_checks_address_line1_check", 78 | "transaction.payment_method_details.card.checks.address_postal_code_check as payment_method_details_card_checks_address_postal_code_check", 79 | "transaction.payment_method_details.card.checks.cvc_check as payment_method_details_card_checks_cvc_check", 80 | "transaction.payment_method_details.card.country as payment_method_details_card_country", 81 | "transaction.payment_method_details.card.exp_month as 
payment_method_details_card_exp_month", 82 | "transaction.payment_method_details.card.exp_year as payment_method_details_card_exp_year", 83 | "transaction.payment_method_details.card.extended_authorization.status as payment_method_details_card_extended_authorization_status", 84 | "transaction.payment_method_details.card.fingerprint as payment_method_details_card_fingerprint", 85 | "transaction.payment_method_details.card.funding as payment_method_details_card_funding", 86 | "transaction.payment_method_details.card.incremental_authorization.status as payment_method_details_card_incremental_authorization_status", 87 | "transaction.payment_method_details.card.installments as payment_method_details_card_installments", 88 | "transaction.payment_method_details.card.last4 as payment_method_details_card_last4", 89 | "transaction.payment_method_details.card.mandate as payment_method_details_card_mandate", 90 | "transaction.payment_method_details.card.multicapture.status as payment_method_details_card_multicapture_status", 91 | "transaction.payment_method_details.card.network as payment_method_details_card_network", 92 | "transaction.payment_method_details.card.network_token.used as payment_method_details_card_network_token_used", 93 | "transaction.payment_method_details.card.overcapture.maximum_amount_capturable as payment_method_details_card_overcapture_maximum_amount", 94 | "transaction.payment_method_details.card.overcapture.status as payment_method_details_card_overcapture_status", 95 | "transaction.payment_method_details.card.three_d_secure as payment_method_details_card_three_d_secure", 96 | "transaction.payment_method_details.card.wallet as payment_method_details_card_wallet", 97 | "transaction.payment_method_details.type as payment_method_details_type", 98 | "transaction.receipt_email as receipt_email", 99 | "transaction.receipt_number as receipt_number", 100 | "transaction.receipt_url as receipt_url", 101 | "transaction.refunded as refunded", 102 | 
"transaction.review as review", 103 | "transaction.shipping as shipping", 104 | "transaction.source.address_city as source_address_city", 105 | "transaction.source.address_country as source_address_country", 106 | "transaction.source.address_line1 as source_address_line1", 107 | "transaction.source.address_line1_check as source_address_line1_check", 108 | "transaction.source.address_line2 as source_address_line2", 109 | "transaction.source.address_state as source_address_state", 110 | "transaction.source.address_zip as source_address_zip", 111 | "transaction.source.address_zip_check as source_address_zip_check", 112 | "transaction.source.brand as source_brand", 113 | "transaction.source.country as source_country", 114 | "transaction.source.customer as source_customer", 115 | "transaction.source.cvc_check as source_cvc_check", 116 | "transaction.source.dynamic_last4 as source_dynamic_last4", 117 | "transaction.source.exp_month as source_exp_month", 118 | "transaction.source.exp_year as source_exp_year", 119 | "transaction.source.fingerprint as source_fingerprint", 120 | "transaction.source.funding as source_funding", 121 | "transaction.source.id as source_id", 122 | "transaction.source.last4 as source_last4", 123 | "transaction.source.name as source_name", 124 | "transaction.source.object as source_object", 125 | "transaction.source.tokenization_method as source_tokenization_method", 126 | "transaction.source.wallet as source_wallet", 127 | "transaction.source_transfer as source_transfer", 128 | "transaction.statement_descriptor as statement_descriptor", 129 | "transaction.statement_descriptor_suffix as statement_descriptor_suffix", 130 | "transaction.status as status", 131 | "transaction.transfer_data as transfer_data", 132 | "transaction.transfer_group as transfer_group" 133 | ) 134 | 135 | df_final.printSchema() 136 | df_final = df_final.withColumn("created", from_unixtime(col("created").cast(LongType())).cast("timestamp")) 137 | 138 | POSTGRES_DB = 
os.getenv("POSTGRES_DB") 139 | POSTGRES_USER = os.getenv("POSTGRES_USER") 140 | POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD") 141 | POSTGRES_HOST = os.getenv("POSTGRES_HOST") 142 | 143 | pg_url = f"jdbc:postgresql://{POSTGRES_HOST}:5432/{POSTGRES_DB}" 144 | 145 | pg_properties = { 146 | "user": POSTGRES_USER, 147 | "password": POSTGRES_PASSWORD, 148 | "driver": "org.postgresql.Driver" 149 | } 150 | 151 | def write_to_postgres(df, epoch_id): 152 | df.write \ 153 | .jdbc(url=pg_url, table="transactions", mode="append", properties=pg_properties) 154 | 155 | query = df_final \ 156 | .writeStream \ 157 | .foreachBatch(write_to_postgres) \ 158 | .outputMode("append") \ 159 | .start() 160 | 161 | query.awaitTermination() 162 | -------------------------------------------------------------------------------- /data_modelling/models/dimensions/schema.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sources: 4 | - name: postgres 5 | database: postgres 6 | schema: public 7 | tables: 8 | - name: forex_rates 9 | 10 | models: 11 | - name: dim_forex_rates 12 | description: "Daily Forex rates for top currencies" 13 | columns: 14 | - name: date_rates 15 | data_type: date 16 | description: "Date of the forex rate" 17 | data_tests: 18 | - not_null: 19 | severity: warn 20 | - name: usd 21 | data_type: float 22 | description: "USD to GBP rate" 23 | data_tests: 24 | - not_null: 25 | severity: warn 26 | - name: eur 27 | data_type: float 28 | description: "EUR to GBP rate" 29 | data_tests: 30 | - not_null: 31 | severity: warn 32 | - name: jpy 33 | data_type: float 34 | description: "JPY to GBP rate" 35 | data_tests: 36 | - not_null: 37 | severity: warn 38 | - name: cad 39 | data_type: float 40 | description: "CAD to GBP rate" 41 | data_tests: 42 | - not_null: 43 | severity: warn 44 | - name: aud 45 | data_type: float 46 | description: "AUD to GBP rate" 47 | data_tests: 48 | - not_null: 49 | severity: warn 50 | - name: chf 51 | 
data_type: float 52 | description: "CHF to GBP rate" 53 | data_tests: 54 | - not_null: 55 | severity: warn 56 | - name: cny 57 | data_type: float 58 | description: "CNY to GBP rate" 59 | data_tests: 60 | - not_null: 61 | severity: warn 62 | - name: sek 63 | data_type: float 64 | description: "SEK to GBP rate" 65 | data_tests: 66 | - not_null: 67 | severity: warn 68 | - name: nzd 69 | data_type: float 70 | description: "NZD to GBP rate" 71 | data_tests: 72 | - not_null: 73 | severity: warn 74 | - name: mxn 75 | data_type: float 76 | description: "MXN to GBP rate" 77 | data_tests: 78 | - not_null: 79 | severity: warn 80 | 81 | - name: dim_billing_details 82 | description: "Billing details for a charge" 83 | columns: 84 | - name: id 85 | data_type: string 86 | description: "ID of the Charge" 87 | data_tests: 88 | - not_null: 89 | severity: warn 90 | - unique: 91 | severity: warn 92 | - name: address_city 93 | data_type: string 94 | description: "City of the address" 95 | - name: address_country 96 | data_type: string 97 | description: "Country of the address" 98 | - name: address_line1 99 | data_type: string 100 | description: "Address line 1" 101 | - name: address_line2 102 | data_type: string 103 | description: "Address line 2" 104 | - name: address_postal_code 105 | data_type: string 106 | description: "Postal code of the address" 107 | - name: address_state 108 | data_type: string 109 | description: "State of the address" 110 | - name: email 111 | data_type: string 112 | description: "Email of the customer" 113 | - name: name 114 | data_type: string 115 | description: "Name of the customer" 116 | - name: phone 117 | data_type: string 118 | description: "Phone number of the customer" 119 | 120 | - name: dim_outcome_details 121 | description: "Outcome details for a charge" 122 | columns: 123 | - name: id 124 | data_type: string 125 | description: "ID of the Charge" 126 | data_tests: 127 | - not_null: 128 | severity: warn 129 | - unique: 130 | severity: warn 131 | - 
name: network_status 132 | data_type: string 133 | description: "Network status of the outcome" 134 | - name: reason 135 | data_type: string 136 | description: "Reason for the outcome" 137 | - name: risk_level 138 | data_type: string 139 | description: "Risk level of the outcome (low, normal, high)" 140 | - name: risk_score 141 | data_type: float 142 | description: "Risk score of the outcome" 143 | - name: seller_message 144 | data_type: string 145 | description: "Seller message of the outcome" 146 | - name: outcome_type 147 | data_type: string 148 | description: "Type of the outcome" 149 | 150 | 151 | - name: dim_payment_method_details 152 | description: "Payment method details for a charge" 153 | columns: 154 | - name: id 155 | data_type: string 156 | description: "ID of the Charge" 157 | data_tests: 158 | - not_null: 159 | severity: warn 160 | - unique: 161 | severity: warn 162 | - name: card_amount_authorized 163 | data_type: float 164 | description: "Authorized amount on the card" 165 | - name: card_brand 166 | data_type: string 167 | description: "Brand of the card" 168 | - name: card_checks_address_line1_check 169 | data_type: string 170 | description: "Address line 1 check result" 171 | - name: card_checks_address_postal_code_check 172 | data_type: string 173 | description: "Postal code check result" 174 | - name: card_checks_cvc_check 175 | data_type: string 176 | description: "CVC check result" 177 | - name: card_country 178 | data_type: string 179 | description: "Country of the card" 180 | - name: card_exp_month 181 | data_type: integer 182 | description: "Expiration month of the card" 183 | - name: card_exp_year 184 | data_type: integer 185 | description: "Expiration year of the card" 186 | - name: card_extended_authorization_status 187 | data_type: string 188 | description: "Extended authorization status of the card" 189 | - name: card_fingerprint 190 | data_type: string 191 | description: "Fingerprint of the card" 192 | - name: card_funding 193 | 
data_type: string 194 | description: "Funding source of the card" 195 | - name: card_incremental_authorization_status 196 | data_type: string 197 | description: "Incremental authorization status of the card" 198 | - name: card_installments 199 | data_type: integer 200 | description: "Number of installments for the card" 201 | - name: card_last4 202 | data_type: string 203 | description: "Last 4 digits of the card" 204 | - name: card_mandate 205 | data_type: string 206 | description: "Mandate of the card" 207 | - name: card_multicapture_status 208 | data_type: string 209 | description: "Multicapture status of the card" 210 | - name: card_network 211 | data_type: string 212 | description: "Network of the card" 213 | - name: card_network_token_used 214 | data_type: string 215 | description: "Whether a network token was used for the card" 216 | - name: card_overcapture_maximum_amount 217 | data_type: float 218 | description: "Maximum amount for overcapture on the card" 219 | - name: card_overcapture_status 220 | data_type: string 221 | description: "Overcapture status of the card" 222 | - name: card_three_d_secure 223 | data_type: string 224 | description: "3D Secure status of the card" 225 | - name: card_wallet 226 | data_type: string 227 | description: "Wallet used for the card" 228 | - name: type 229 | data_type: string 230 | description: "Type of the payment method" 231 | 232 | - name: dim_source_details 233 | description: "Source details for a charge" 234 | columns: 235 | - name: id 236 | data_type: string 237 | description: "ID of the Charge" 238 | data_tests: 239 | - not_null: 240 | severity: warn 241 | - unique: 242 | severity: warn 243 | - name: address_city 244 | data_type: string 245 | description: "City of the address" 246 | - name: address_country 247 | data_type: string 248 | description: "Country of the address" 249 | - name: address_line1 250 | data_type: string 251 | description: "Address line 1" 252 | - name: address_line1_check 253 | data_type: 
string 254 | description: "Address line 1 check result" 255 | - name: address_line2 256 | data_type: string 257 | description: "Address line 2" 258 | - name: address_state 259 | data_type: string 260 | description: "State of the address" 261 | - name: address_zip 262 | data_type: string 263 | description: "ZIP code of the address" 264 | - name: address_zip_check 265 | data_type: string 266 | description: "ZIP code check result" 267 | - name: brand 268 | data_type: string 269 | description: "Brand of the source" 270 | - name: country 271 | data_type: string 272 | description: "Country of the source" 273 | - name: customer 274 | data_type: string 275 | description: "Customer associated with the source" 276 | - name: cvc_check 277 | data_type: string 278 | description: "CVC check result" 279 | - name: dynamic_last4 280 | data_type: string 281 | description: "Dynamic last 4 digits of the source" 282 | - name: exp_month 283 | data_type: integer 284 | description: "Expiration month of the source" 285 | - name: exp_year 286 | data_type: integer 287 | description: "Expiration year of the source" 288 | - name: fingerprint 289 | data_type: string 290 | description: "Fingerprint of the source" 291 | - name: funding 292 | data_type: string 293 | description: "Funding source of the source" 294 | - name: source_id 295 | data_type: string 296 | description: "ID of the source" 297 | - name: last4 298 | data_type: string 299 | description: "Last 4 digits of the source" 300 | - name: name 301 | data_type: string 302 | description: "Name of the source" 303 | - name: object_type 304 | data_type: string 305 | description: "Type of the source object" 306 | - name: tokenization_method 307 | data_type: string 308 | description: "Tokenization method of the source" 309 | - name: wallet 310 | data_type: string 311 | description: "Wallet used for the source" 312 | - name: transfer 313 | data_type: string 314 | description: "Transfer associated with the source" 315 | 
-------------------------------------------------------------------------------- /data_modelling/models/staging/schema.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sources: 4 | - name: postgres 5 | database: postgres 6 | schema: public 7 | tables: 8 | - name: transactions 9 | 10 | models: 11 | - name: stg_transactions 12 | description: "Staging table for transactions data" 13 | columns: 14 | - name: amount 15 | data_type: numeric 16 | description: "Amount of the transaction - 100 = £1" 17 | data_tests: 18 | - not_null: 19 | severity: warn 20 | - name: amount_captured 21 | data_type: numeric 22 | description: "Amount captured in the smallest currency unit, e.g. pence (can be less than the amount attribute on the charge if a partial capture was made)" 23 | - name: amount_refunded 24 | data_type: numeric 25 | description: "Amount refunded in the smallest currency unit, e.g. pence (can be less than the amount attribute on the charge if a partial refund was issued)" 26 | - name: application 27 | data_type: string 28 | description: "ID of the Connect application that created the charge" 29 | - name: application_fee 30 | data_type: numeric 31 | description: "The application fee (if any) for the charge. See the Connect documentation for details" 32 | - name: application_fee_amount 33 | data_type: numeric 34 | description: "The amount of the application fee (if any) requested for the charge. See the Connect documentation for details" 35 | - name: balance_transaction 36 | data_type: string 37 | description: "ID of the balance transaction that describes the impact of this charge on your account balance (not including refunds or disputes)" 38 | - name: billing_details_address_city 39 | data_type: string 40 | description: "City in the billing address" 41 | - name: billing_details_address_country 42 | data_type: string 43 | description: "Country in the billing address" 44 | - name: billing_details_address_line1 45 | data_type: string 46 | description: "Line 1 of the billing address" 
47 | - name: billing_details_address_line2 48 | data_type: string 49 | description: "Line 2 of the billing address" 50 | - name: billing_details_address_postal_code 51 | data_type: string 52 | description: "Postal code in the billing address" 53 | - name: billing_details_address_state 54 | data_type: string 55 | description: "State in the billing address" 56 | - name: billing_details_email 57 | data_type: string 58 | description: "Email in the billing details" 59 | - name: billing_details_name 60 | data_type: string 61 | description: "Name in the billing details" 62 | - name: billing_details_phone 63 | data_type: string 64 | description: "Phone number in the billing details" 65 | - name: calculated_statement_descriptor 66 | data_type: string 67 | description: "The full statement descriptor that is passed to card networks, and that is displayed on your customers’ credit card and bank statements. Allows you to see what the statement descriptor looks like after the static and dynamic portions are combined" 68 | - name: captured 69 | data_type: boolean 70 | description: "If the charge was created without capturing, this Boolean represents whether it is still uncaptured or has since been captured" 71 | - name: created 72 | data_type: datetime 73 | description: "Timestamp of when the transaction was created" 74 | data_tests: 75 | - not_null: 76 | severity: warn 77 | - name: currency 78 | data_type: string 79 | description: "Three-letter ISO currency code, in lowercase. Must be a supported currency" 80 | - name: customer 81 | data_type: string 82 | description: "The ID of the customer this charge is for if one exists" 83 | - name: description 84 | data_type: string 85 | description: "An arbitrary string attached to the object. Often useful for displaying to users" 86 | - name: destination 87 | data_type: string 88 | description: "Destination of the transaction" 89 | - name: dispute 90 | data_type: string 91 | description: "Dispute associated with the transaction" 92 | - 
name: disputed 93 | data_type: boolean 94 | description: "Whether the charge has been disputed" 95 | - name: failure_balance_transaction 96 | data_type: string 97 | description: "Balance transaction associated with the failure" 98 | - name: failure_code 99 | data_type: string 100 | description: "Error code explaining reason for charge failure if available (see the errors section for a list of codes)" 101 | - name: failure_message 102 | data_type: string 103 | description: "Message to user further explaining reason for charge failure if available" 104 | - name: id 105 | data_type: string 106 | description: "ID of the transaction" 107 | data_tests: 108 | - not_null: 109 | severity: warn 110 | - unique: 111 | severity: warn 112 | - name: invoice 113 | data_type: string 114 | description: "The ID of the invoice this charge is for if one exists" 115 | - name: livemode 116 | data_type: boolean 117 | description: "Has the value true if the object exists in live mode or the value false if the object exists in test mode" 118 | - name: object 119 | data_type: string 120 | description: "String representing the object’s type. Objects of the same type share the same value" 121 | - name: on_behalf_of 122 | data_type: string 123 | description: "The account (if any) the charge was made on behalf of without triggering an automatic transfer. See the Connect documentation for details" 124 | - name: order 125 | data_type: string 126 | description: "ID of the order this charge is for if one exists" 127 | - name: outcome_network_status 128 | data_type: string 129 | description: "Network status of the transaction outcome" 130 | - name: outcome_reason 131 | data_type: string 132 | description: "Reason for the transaction outcome" 133 | - name: outcome_risk_level 134 | data_type: string 135 | description: "Risk level of the transaction outcome" 136 | - name: outcome_risk_score 137 | data_type: numeric 138 | description: "Risk score of the transaction outcome" 139 | - name: 
outcome_seller_message 140 | data_type: string 141 | description: "Seller message of the transaction outcome" 142 | - name: outcome_type 143 | data_type: string 144 | description: "Type of the transaction outcome" 145 | - name: paid 146 | data_type: boolean 147 | description: "True if the charge succeeded, or was successfully authorized for later capture" 148 | - name: payment_intent 149 | data_type: string 150 | description: "The ID of the PaymentIntent associated with this charge, if one exists" 151 | - name: payment_method 152 | data_type: string 153 | description: "ID of the payment method used in this charge" 154 | - name: payment_method_details_card_amount_authorized 155 | data_type: numeric 156 | description: "Amount authorized by the card payment method" 157 | - name: payment_method_details_card_brand 158 | data_type: string 159 | description: "Brand of the card payment method" 160 | - name: payment_method_details_card_checks_address_line1_check 161 | data_type: string 162 | description: "Address line 1 check result of the card payment method" 163 | - name: payment_method_details_card_checks_address_postal_code_check 164 | data_type: string 165 | description: "Postal code check result of the card payment method" 166 | - name: payment_method_details_card_checks_cvc_check 167 | data_type: string 168 | description: "CVC check result of the card payment method" 169 | - name: payment_method_details_card_country 170 | data_type: string 171 | description: "Country of the card payment method" 172 | - name: payment_method_details_card_exp_month 173 | data_type: integer 174 | description: "Expiration month of the card payment method" 175 | - name: payment_method_details_card_exp_year 176 | data_type: integer 177 | description: "Expiration year of the card payment method" 178 | - name: payment_method_details_card_extended_authorization_status 179 | data_type: string 180 | description: "Extended authorization status of the card payment method" 181 | - name: 
payment_method_details_card_fingerprint 182 | data_type: string 183 | description: "Fingerprint of the card payment method" 184 | - name: payment_method_details_card_funding 185 | data_type: string 186 | description: "Funding type of the card payment method" 187 | - name: payment_method_details_card_incremental_authorization_status 188 | data_type: string 189 | description: "Incremental authorization status of the card payment method" 190 | - name: payment_method_details_card_installments 191 | data_type: integer 192 | description: "Number of installments for the card payment method" 193 | - name: payment_method_details_card_last4 194 | data_type: string 195 | description: "Last 4 digits of the card payment method" 196 | - name: payment_method_details_card_mandate 197 | data_type: string 198 | description: "Mandate of the card payment method" 199 | - name: payment_method_details_card_multicapture_status 200 | data_type: string 201 | description: "Multicapture status of the card payment method" 202 | - name: payment_method_details_card_network 203 | data_type: string 204 | description: "Network of the card payment method" 205 | - name: payment_method_details_card_network_token_used 206 | data_type: boolean 207 | description: "Flag indicating if a network token was used with the card payment method" 208 | - name: payment_method_details_card_overcapture_maximum_amount 209 | data_type: numeric 210 | description: "Maximum amount that can be overcaptured with the card payment method" 211 | - name: payment_method_details_card_overcapture_status 212 | data_type: string 213 | description: "Overcapture status of the card payment method" 214 | - name: payment_method_details_card_three_d_secure 215 | data_type: string 216 | description: "3D Secure status of the card payment method" 217 | - name: payment_method_details_card_wallet 218 | data_type: string 219 | description: "Wallet of the card payment method" 220 | - name: payment_method_details_type 221 | data_type: string 222 
| description: "Type of the payment method details" 223 | - name: receipt_email 224 | data_type: string 225 | description: "This is the email address that the receipt for this charge was sent to" 226 | - name: receipt_number 227 | data_type: string 228 | description: "This is the transaction number that appears on email receipts sent for this charge. This attribute will be null until a receipt has been sent" 229 | - name: receipt_url 230 | data_type: string 231 | description: "This is the URL to view the receipt for this charge. The receipt is kept up-to-date to the latest state of the charge, including any refunds. If the charge is for an Invoice, the receipt will be stylized as an Invoice receipt" 232 | - name: refunded 233 | data_type: boolean 234 | description: "Whether the charge has been fully refunded. If the charge is only partially refunded, this attribute will still be false" 235 | - name: review 236 | data_type: string 237 | description: "ID of the review associated with this charge if one exists" 238 | - name: shipping 239 | data_type: string 240 | description: "Shipping information for the charge" 241 | - name: source_address_city 242 | data_type: string 243 | description: "City in the source address" 244 | - name: source_address_country 245 | data_type: string 246 | description: "Country in the source address" 247 | - name: source_address_line1 248 | data_type: string 249 | description: "Line 1 of the source address" 250 | - name: source_address_line1_check 251 | data_type: string 252 | description: "Address line 1 check result of the source" 253 | - name: source_address_line2 254 | data_type: string 255 | description: "Line 2 of the source address" 256 | - name: source_address_state 257 | data_type: string 258 | description: "State in the source address" 259 | - name: source_address_zip 260 | data_type: string 261 | description: "Zip code in the source address" 262 | - name: source_address_zip_check 263 | data_type: string 264 | description: "Zip code 
check result of the source" 265 | - name: source_brand 266 | data_type: string 267 | description: "Brand of the source" 268 | - name: source_country 269 | data_type: string 270 | description: "Country of the source" 271 | - name: source_customer 272 | data_type: string 273 | description: "Customer associated with the source" 274 | - name: source_cvc_check 275 | data_type: string 276 | description: "CVC check result of the source" 277 | - name: source_dynamic_last4 278 | data_type: string 279 | description: "Dynamic last 4 digits of the source" 280 | - name: source_exp_month 281 | data_type: integer 282 | description: "Expiration month of the source" 283 | - name: source_exp_year 284 | data_type: integer 285 | description: "Expiration year of the source" 286 | - name: source_fingerprint 287 | data_type: string 288 | description: "Fingerprint of the source" 289 | - name: source_funding 290 | data_type: string 291 | description: "Funding type of the source" 292 | - name: source_id 293 | data_type: string 294 | description: "ID of the source" 295 | - name: source_last4 296 | data_type: string 297 | description: "Last 4 digits of the source" 298 | - name: source_name 299 | data_type: string 300 | description: "Name in the source" 301 | - name: source_object 302 | data_type: string 303 | description: "Object type of the source" 304 | - name: source_tokenization_method 305 | data_type: string 306 | description: "Tokenization method of the source" 307 | - name: source_wallet 308 | data_type: string 309 | description: "Wallet of the source" 310 | - name: source_transfer 311 | data_type: string 312 | description: "Transfer associated with the source" 313 | - name: statement_descriptor 314 | data_type: string 315 | description: "For card charges, use statement_descriptor_suffix instead. Otherwise, you can use this value as the complete description of a charge on your customers’ statements. Must contain at least one letter, maximum 22 characters" 316 | - name: 
statement_descriptor_suffix 317 | data_type: string 318 | description: "Provides information about the charge that customers see on their statements. Concatenated with the prefix (shortened descriptor) or statement descriptor that’s set on the account to form the complete statement descriptor. Maximum 22 characters for the concatenated descriptor" 319 | - name: status 320 | data_type: string 321 | description: "The status of the payment is either succeeded, pending, or failed" 322 | - name: transfer_data 323 | data_type: string 324 | description: "An optional dictionary including the account to automatically transfer to as part of a destination charge" 325 | - name: transfer_group 326 | data_type: string 327 | description: "A string that identifies this transaction as part of a group" 328 | --------------------------------------------------------------------------------