├── .gitignore
├── data_modelling
├── analyses
│ └── .gitkeep
├── macros
│ └── .gitkeep
├── seeds
│ └── .gitkeep
├── snapshots
│ └── .gitkeep
├── tests
│ └── .gitkeep
├── .gitignore
├── models
│ ├── dimensions
│ │ ├── dim_forex_rates.sql
│ │ ├── dim_outcome_details.sql
│ │ ├── dim_billing_details.sql
│ │ ├── dim_source_details.sql
│ │ ├── dim_payment_method_details.sql
│ │ └── schema.yml
│ ├── core
│ │ ├── facts_transactions.sql
│ │ └── schema.yml
│ └── staging
│ │ ├── stg_transactions.sql
│ │ └── schema.yml
├── README.md
└── dbt_project.yml
├── forex-rates
├── requirements.txt
├── Dockerfile
└── forex_rates.py
├── postgres
├── Dockerfile
└── create_table.sql
├── kafka-producer
├── requirements.txt
├── Dockerfile
├── python-producer.py
└── wait-for-it.sh
├── kafka-consumer
├── requirements.txt
├── Dockerfile
├── wait-for-it.sh
├── spark_schema.py
└── python-consumer.py
├── project-png
├── transactions_stream_data_model.png
└── transactions_stream_project_diagram.png
├── grafana
└── dashboards
│ └── grafana_datasources.yaml
├── sample.env
├── Makefile
├── docker-compose.yml
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 | .env
2 | .DS_Store
--------------------------------------------------------------------------------
/data_modelling/analyses/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/data_modelling/macros/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/data_modelling/seeds/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/data_modelling/snapshots/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/data_modelling/tests/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/data_modelling/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | target/
3 | dbt_packages/
4 | logs/
5 |
--------------------------------------------------------------------------------
/forex-rates/requirements.txt:
--------------------------------------------------------------------------------
1 | psycopg2-binary
2 | requests
3 | schedule
4 |
--------------------------------------------------------------------------------
/postgres/Dockerfile:
--------------------------------------------------------------------------------
# PostgreSQL image with the project's table-creation script baked in.
# NOTE(review): `latest` is a moving tag; consider pinning a major version once
# the version the project targets is confirmed.
FROM postgres:latest

# The script is placed in the image's init directory; per the official postgres
# image documentation, scripts there run on first start of an empty data
# directory — TODO confirm against the image version in use.
COPY create_table.sql /docker-entrypoint-initdb.d/
--------------------------------------------------------------------------------
/kafka-producer/requirements.txt:
--------------------------------------------------------------------------------
1 | kafka-python==2.0.2
2 | schedule==1.1.0
3 | aiokafka==0.7.2
4 | stripe
5 |
--------------------------------------------------------------------------------
/kafka-consumer/requirements.txt:
--------------------------------------------------------------------------------
1 | kafka-python==2.0.2
2 | schedule==1.1.0
3 | aiokafka==0.7.2
4 | psycopg2-binary
5 | pyspark
6 |
--------------------------------------------------------------------------------
/project-png/transactions_stream_data_model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/divakaivan/transaction-stream-data-pipeline/HEAD/project-png/transactions_stream_data_model.png
--------------------------------------------------------------------------------
/project-png/transactions_stream_project_diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/divakaivan/transaction-stream-data-pipeline/HEAD/project-png/transactions_stream_project_diagram.png
--------------------------------------------------------------------------------
/forex-rates/Dockerfile:
--------------------------------------------------------------------------------
# Image for the forex-rates service: installs the Python dependencies and runs
# forex_rates.py unbuffered so its log lines show up immediately.
FROM python:3.11-slim

WORKDIR /

# Install dependencies first so this layer is cached across source-only changes.
COPY requirements.txt .

RUN pip install --no-cache-dir -r requirements.txt

# COPY (not ADD) for a plain local file: ADD's URL/archive handling is not needed.
COPY forex_rates.py .

# Exec form runs python directly instead of wrapping it in `/bin/sh -c`.
CMD ["python", "-u", "forex_rates.py"]
--------------------------------------------------------------------------------
/kafka-producer/Dockerfile:
--------------------------------------------------------------------------------
# Image for the Kafka producer: waits for ZooKeeper and Kafka to accept
# connections, then starts python-producer.py.
FROM python:3.8-slim

WORKDIR /

# Install dependencies first so this layer is cached across source-only changes.
COPY requirements.txt .

RUN set -ex; \
    pip install --no-cache-dir -r requirements.txt

# Copy resources. COPY (not ADD) for plain local files.
COPY wait-for-it.sh wait-for-it.sh
# The CMD below executes the script directly, so make sure the executable bit is
# set even if the checkout did not preserve file modes.
RUN chmod +x wait-for-it.sh

COPY python-producer.py .

# Shell form is kept on purpose: $ZOOKEEPER_SERVER and $KAFKA_SERVER must be
# expanded from the container environment at start-up.
CMD ./wait-for-it.sh -s -t 30 $ZOOKEEPER_SERVER -- ./wait-for-it.sh -s -t 30 $KAFKA_SERVER -- python -u python-producer.py
--------------------------------------------------------------------------------
/grafana/dashboards/grafana_datasources.yaml:
--------------------------------------------------------------------------------
# Grafana datasource provisioning file.
# docker-compose.yml mounts it at
# /etc/grafana/provisioning/datasources/datasource.yaml (read-only).
# The $POSTGRES_* placeholders match the variable names declared in sample.env.

# config file version
apiVersion: 1

datasources:
  - name: PostgreSQL
    type: postgres
    access: proxy
    url: $POSTGRES_HOST:5432
    database: $POSTGRES_DB
    user: $POSTGRES_USER
    # The password is supplied under secureJsonData rather than as a plain field.
    secureJsonData:
      password: $POSTGRES_PASSWORD
    jsonData:
      sslmode: 'disable'
      # NOTE(review): the database name is set both here and at the top level —
      # confirm which one the Grafana version in use reads.
      database: $POSTGRES_DB
--------------------------------------------------------------------------------
/sample.env:
--------------------------------------------------------------------------------
# Template for the .env file that docker-compose.yml loads into every service.
# Copy to `.env` and fill in the values.

# Stripe API key used by the producer to create test charges.
STRIPE_API_KEY=

# PostgreSQL / pgAdmin
POSTGRES_HOST=
POSTGRES_USER=
POSTGRES_PASSWORD=
POSTGRES_DB=
PGADMIN_DEFAULT_EMAIL=
PGADMIN_DEFAULT_PASSWORD=
POSTGRES_TABLE=

# Kafka client settings (the Dockerfiles pass KAFKA_SERVER and ZOOKEEPER_SERVER
# to wait-for-it.sh, so they are host:port addresses)
KAFKA_TOPIC=
KAFKA_SERVER=
ZOOKEEPER_SERVER=
PRODUCER_INTERVAL=

# ZooKeeper container settings
ZOOKEEPER_CLIENT_PORT=
# Was misspelled ZOOKEPER_TICK_TIME, a name the ZooKeeper image does not read.
ZOOKEEPER_TICK_TIME=

# Kafka broker container settings
KAFKA_BROKER_ID=
KAFKA_ADVERTISED_HOST_NAME=
KAFKA_ZOOKEEPER_CONNECT=
KAFKA_ADVERTISED_LISTENERS=PLAINTEXT://$KAFKA_SERVER
KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR=
JMX_PORT=
--------------------------------------------------------------------------------
/data_modelling/models/dimensions/dim_forex_rates.sql:
--------------------------------------------------------------------------------
{{ config(materialized='table') }}

-- Exchange-rate dimension: one row per rate date, taken from the raw
-- `forex_rates` source table. The source column "date" is exposed as date_rates.
WITH forex_rates_source AS (

    SELECT
        "date" AS date_rates,
        usd,
        eur,
        jpy,
        cad,
        aud,
        chf,
        cny,
        sek,
        nzd,
        mxn
    FROM {{ source('postgres', 'forex_rates') }}

)

SELECT *
FROM forex_rates_source

{# Dev runs (is_dev_run defaults to true) are capped at 100 rows. #}
{% if var("is_dev_run", default=true) %} limit 100 {% endif %}
--------------------------------------------------------------------------------
/data_modelling/models/dimensions/dim_outcome_details.sql:
--------------------------------------------------------------------------------
{{ config(materialized='table') }}

-- Outcome dimension: the outcome_* columns of each staged transaction, keyed by
-- the transaction id. The `outcome_` prefix is dropped except for outcome_type.
WITH outcome_columns AS (

    SELECT
        id,
        outcome_network_status AS network_status,
        outcome_reason         AS reason,
        outcome_risk_level     AS risk_level,
        outcome_risk_score     AS risk_score,
        outcome_seller_message AS seller_message,
        outcome_type
    FROM {{ ref('stg_transactions') }}

)

SELECT *
FROM outcome_columns

{# Dev runs (is_dev_run defaults to true) are capped at 100 rows. #}
{% if var("is_dev_run", default=true) %} limit 100 {% endif %}
--------------------------------------------------------------------------------
/data_modelling/README.md:
--------------------------------------------------------------------------------
1 | Welcome to your new dbt project!
2 |
3 | ### Using the starter project
4 |
5 | Try running the following commands:
6 | - dbt run
7 | - dbt test
8 |
9 |
10 | ### Resources:
11 | - Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction)
12 | - Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers
13 | - Join the [chat](https://community.getdbt.com/) on Slack for live discussions and support
14 | - Find [dbt events](https://events.getdbt.com) near you
15 | - Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices
16 |
--------------------------------------------------------------------------------
/data_modelling/models/dimensions/dim_billing_details.sql:
--------------------------------------------------------------------------------
{{ config(materialized='table') }}

-- Billing dimension: the billing_details_* columns of each staged transaction,
-- keyed by the transaction id, with the `billing_details_` prefix dropped.
WITH billing_columns AS (

    SELECT
        id,
        billing_details_address_city        AS address_city,
        billing_details_address_country     AS address_country,
        billing_details_address_line1       AS address_line1,
        billing_details_address_line2       AS address_line2,
        billing_details_address_postal_code AS address_postal_code,
        billing_details_address_state       AS address_state,
        billing_details_email               AS email,
        billing_details_name                AS "name",
        billing_details_phone               AS phone
    FROM {{ ref('stg_transactions') }}

)

SELECT *
FROM billing_columns

{# Dev runs (is_dev_run defaults to true) are capped at 100 rows. #}
{% if var("is_dev_run", default=true) %} limit 100 {% endif %}
--------------------------------------------------------------------------------
/data_modelling/models/core/facts_transactions.sql:
--------------------------------------------------------------------------------
{{ config(materialized='table') }}

-- Transaction fact table: core charge attributes from staging, enriched with
-- the exchange rates whose date matches the transaction's creation date.
-- The LEFT JOIN keeps transactions that have no matching rate row; their
-- forex_* columns are then NULL.
WITH transactions_with_rates AS (

    SELECT
        txn.id      AS transaction_id,
        txn.created AS transaction_date,
        txn.amount,
        txn.amount_captured,
        txn.amount_refunded,
        txn.currency,
        txn.customer,
        txn.payment_intent,
        txn.payment_method,
        txn.status,
        rates.usd AS forex_usd,
        rates.eur AS forex_eur,
        rates.jpy AS forex_jpy,
        rates.cad AS forex_cad,
        rates.aud AS forex_aud,
        rates.chf AS forex_chf,
        rates.cny AS forex_cny,
        rates.sek AS forex_sek,
        rates.nzd AS forex_nzd,
        rates.mxn AS forex_mxn
    FROM {{ ref('stg_transactions') }} txn
    LEFT JOIN {{ ref('dim_forex_rates') }} rates
        ON cast(txn.created AS date) = cast(rates.date_rates AS date)

)

SELECT *
FROM transactions_with_rates

{# Dev runs (is_dev_run defaults to true) are capped at 100 rows. #}
{% if var("is_dev_run", default=true) %} limit 100 {% endif %}
34 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
# Convenience targets for building/running the docker services and the dbt project.
# Every target annotated with `## text` is listed by `make help`.

# Define variables
DOCKER_COMPOSE_FILE=docker-compose.yml

# Default target when `make` is run without arguments
.DEFAULT_GOAL := help

.PHONY: help
help: ## Show this help message
	@echo ""
	@echo "Usage: make [option]"
	@echo ""
	@echo "Options:"
	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[36m%-20s\033[0m %s\n", $$1, $$2}'
	@echo ""

.PHONY: build
build: ## Build docker services
	docker-compose -f $(DOCKER_COMPOSE_FILE) build

.PHONY: start
start: ## Start docker services (detached mode)
	docker-compose -f $(DOCKER_COMPOSE_FILE) up -d

.PHONY: stop
stop: ## Stop docker services
	docker-compose -f $(DOCKER_COMPOSE_FILE) stop

# The .PHONY declaration previously named `dbt-limit`, which is not a target;
# it now matches the actual target so a file called `dbt-test` cannot mask it.
.PHONY: dbt-test
dbt-test: ## Run dbt with LIMIT 100
	cd data_modelling && dbt build

.PHONY: dbt-full
dbt-full: ## Run dbt with full data
	cd data_modelling && dbt build --vars '{"is_dev_run": false}'
35 |
36 |
--------------------------------------------------------------------------------
/data_modelling/dbt_project.yml:
--------------------------------------------------------------------------------
1 |
2 | # Name your project! Project names should contain only lowercase characters
3 | # and underscores. A good package name should reflect your organization's
4 | # name or the intended use of these models
5 | name: 'data_modelling'
6 | version: '1.0.0'
7 |
8 | # This setting configures which "profile" dbt uses for this project.
9 | profile: 'data_modelling'
10 |
11 | # These configurations specify where dbt should look for different types of files.
12 | # The `model-paths` config, for example, states that models in this project can be
13 | # found in the "models/" directory. You probably won't need to change these!
14 | model-paths: ["models"]
15 | analysis-paths: ["analyses"]
16 | test-paths: ["tests"]
17 | seed-paths: ["seeds"]
18 | macro-paths: ["macros"]
19 | snapshot-paths: ["snapshots"]
20 |
21 | clean-targets: # directories to be removed by `dbt clean`
22 | - "target"
23 | - "dbt_packages"
24 |
25 |
26 | # Configuring models
27 | # Full documentation: https://docs.getdbt.com/docs/configuring-models
28 |
29 | # Project-wide model defaults can be set under the `data_modelling:` key below.
30 | # None are configured here; each model file sets its own materialization
31 | # using the `{{ config(...) }}` macro.
32 | models:
33 | data_modelling:
34 |
35 |
--------------------------------------------------------------------------------
/data_modelling/models/dimensions/dim_source_details.sql:
--------------------------------------------------------------------------------
{{ config(materialized='table') }}

-- Payment-source dimension: the source_* columns of each staged transaction,
-- keyed by the transaction id. The `source_` prefix is dropped, except that
-- source_id keeps its name and source_object becomes object_type.
WITH source_columns AS (

    SELECT
        id,
        source_address_city        AS address_city,
        source_address_country     AS address_country,
        source_address_line1       AS address_line1,
        source_address_line1_check AS address_line1_check,
        source_address_line2       AS address_line2,
        source_address_state       AS address_state,
        source_address_zip         AS address_zip,
        source_address_zip_check   AS address_zip_check,
        source_brand               AS brand,
        source_country             AS country,
        source_customer            AS customer,
        source_cvc_check           AS cvc_check,
        source_dynamic_last4       AS dynamic_last4,
        source_exp_month           AS exp_month,
        source_exp_year            AS exp_year,
        source_fingerprint         AS fingerprint,
        source_funding             AS funding,
        source_id,
        source_last4               AS last4,
        source_name                AS "name",
        source_object              AS object_type,
        source_tokenization_method AS tokenization_method,
        source_wallet              AS wallet,
        source_transfer            AS transfer
    FROM {{ ref('stg_transactions') }}

)

SELECT *
FROM source_columns

{# Dev runs (is_dev_run defaults to true) are capped at 100 rows. #}
{% if var("is_dev_run", default=true) %} limit 100 {% endif %}
--------------------------------------------------------------------------------
/kafka-producer/python-producer.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import random
4 | import schedule
5 | from json import dumps
6 |
7 | from kafka import KafkaProducer
8 |
9 | import stripe
10 |
# Kafka connection settings, read once from the environment at import time.
# Either value is None when its variable is not set.
kafka_nodes = os.environ.get('KAFKA_SERVER')
my_topic = os.environ.get('KAFKA_TOPIC')
13 |
def create_test_charge():
    """Create one Stripe test charge in GBP for a random amount.

    The API key is read from the STRIPE_API_KEY environment variable on every
    call and the charge is made against the ``tok_visa`` test token.

    Returns:
        Whatever ``stripe.Charge.create`` returns, or ``None`` when Stripe
        raises a ``CardError`` or any other ``StripeError`` (the error is
        printed, not re-raised).
    """
    # The project treats 100 as £1, so 100..1,000,000 is £1 to £10,000.
    amount_minor_units = random.randint(100, 1000000)
    stripe.api_key = os.getenv('STRIPE_API_KEY')
    try:
        return stripe.Charge.create(
            amount=amount_minor_units,
            currency='gbp',
            source='tok_visa',
        )
    except stripe.error.CardError as card_err:
        print(f"Card declined: {card_err.error.message}")
    except stripe.error.StripeError as stripe_err:
        print(f"Stripe error: {stripe_err}")
    return None
30 |
def send_to_kafka(charges):
    """Publish a batch of charges to Kafka as a single JSON message.

    The message value is ``{'transactions': charges}``, JSON-encoded as UTF-8,
    sent to the topic named by ``my_topic`` on the brokers in ``kafka_nodes``.

    Args:
        charges: JSON-serialisable list of charge objects.

    Errors while connecting, sending or flushing are printed rather than
    raised, so one failed batch does not stop the caller's loop.
    """
    prod = None
    try:
        prod = KafkaProducer(bootstrap_servers=kafka_nodes, api_version=(2, 0, 2),
                             value_serializer=lambda x: dumps(x).encode('utf-8'))

        my_data = {'transactions': charges}
        prod.send(my_topic, value=my_data)
        prod.flush()

        print(f"Sent {len(charges)} transactions to Kafka")

    except Exception as e:
        print(f"Error sending to Kafka: {e}")

    finally:
        # A producer is created per call, so it must be closed per call;
        # otherwise every batch leaves a client and its connections behind.
        if prod is not None:
            try:
                prod.close()
            except Exception as e:
                print(f"Error closing Kafka producer: {e}")
44 |
def gen_data():
    """Create a batch of Stripe test charges and publish the successful ones.

    ``create_test_charge`` returns ``None`` for a failed charge. Those entries
    are dropped so that null records are not sent downstream and the count
    printed by ``send_to_kafka`` is the number of real transactions. If every
    charge in the batch failed, nothing is sent.
    """
    num_charges = 25  # stripe create limit
    attempts = (create_test_charge() for _ in range(num_charges))
    charges = [charge for charge in attempts if charge is not None]
    if not charges:
        print("No charges were created; nothing sent to Kafka")
        return
    send_to_kafka(charges)
49 |
def get_producer_interval(default=3):
    """Return the number of seconds between producer runs.

    Reads PRODUCER_INTERVAL from the environment. Falls back to ``default``
    (the previously hard-coded 3) when the variable is unset, blank, not an
    integer, or not positive.

    NOTE(review): sample.env declares PRODUCER_INTERVAL without stating a unit;
    seconds is assumed because the schedule below is expressed in seconds.
    """
    raw = os.getenv('PRODUCER_INTERVAL', '').strip()
    if not raw:
        return default
    try:
        interval = int(raw)
    except ValueError:
        print(f"Invalid PRODUCER_INTERVAL {raw!r}; using {default}")
        return default
    return interval if interval > 0 else default


if __name__ == '__main__':
    # Generate and publish a batch on a fixed schedule until interrupted.
    schedule.every(get_producer_interval()).seconds.do(gen_data)
    try:
        while True:
            schedule.run_pending()
            # Poll more often than the job interval so runs start promptly.
            time.sleep(0.5)
    except KeyboardInterrupt:
        print("Stopping...")
58 |
--------------------------------------------------------------------------------
/kafka-consumer/Dockerfile:
--------------------------------------------------------------------------------
# Image for the Spark Structured Streaming consumer: Python 3.8 + Java 11 +
# Spark 3.5.1. At start-up it waits for ZooKeeper and Kafka, then spark-submits
# python-consumer.py.
# NOTE(review): Debian buster is end-of-life, so `apt-get update` may fail on a
# fresh build; consider a newer base image — confirm Java 11 availability first.
FROM python:3.8-slim-buster

# Install necessary dependencies
RUN apt-get update && \
    apt-get install -y --no-install-recommends openjdk-11-jre-headless ca-certificates-java procps wget && \
    apt-get clean && \
    update-ca-certificates -f && \
    rm -rf /var/lib/apt/lists/*

# Verify the Java installation
RUN java -version

# Install Spark.
# The release is fetched from archive.apache.org, which keeps every version;
# the dlcdn mirror used before only carries current releases. Download,
# extract and clean up in one layer so the tarball is not kept in the image.
RUN wget -q https://archive.apache.org/dist/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3.tgz && \
    tar -xzf spark-3.5.1-bin-hadoop3.tgz -C /opt && \
    rm spark-3.5.1-bin-hadoop3.tgz
ENV SPARK_HOME=/opt/spark-3.5.1-bin-hadoop3
ENV PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin

# Install postgres class
# NOTE(review): /opt/spark/jars is not under SPARK_HOME; the CMD below also
# requests the same driver via --packages — confirm this copy is still needed.
RUN mkdir -p /opt/spark/jars/
RUN wget -O /opt/spark/jars/postgresql-42.2.20.jar https://jdbc.postgresql.org/download/postgresql-42.2.20.jar

# Resolve the real Java installation directory from the `java` binary and link
# it to a fixed path, so JAVA_HOME below does not depend on where it resolves
RUN export JAVA_HOME=$(dirname $(dirname $(readlink -f $(which java)))) && \
    echo $JAVA_HOME && \
    ln -s $JAVA_HOME /usr/lib/jvm/java-11-openjdk-amd64 && \
    echo "export JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64" >> /etc/profile && \
    echo "export PATH=\$PATH:\$JAVA_HOME/bin" >> /etc/profile

# Set environment variables for Java in Dockerfile scope
ENV JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64
ENV PATH=$PATH:$JAVA_HOME/bin

# Verify the JAVA_HOME path
RUN echo $JAVA_HOME && ls -l $JAVA_HOME/bin/java

# Copy requirements.txt and install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy resources
WORKDIR /
COPY wait-for-it.sh wait-for-it.sh

# COPY (not ADD) for plain local files. spark_schema.py sits next to the
# consumer script in the build context and is copied with it; presumably the
# consumer imports it — verify against python-consumer.py.
COPY python-consumer.py spark_schema.py ./

CMD ["/bin/bash", "-c", "/wait-for-it.sh -s -t 30 $ZOOKEEPER_SERVER -- /wait-for-it.sh -s -t 30 $KAFKA_SERVER -- ${SPARK_HOME}/bin/spark-submit --packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.1,org.postgresql:postgresql:42.2.20 python-consumer.py"]
--------------------------------------------------------------------------------
/data_modelling/models/dimensions/dim_payment_method_details.sql:
--------------------------------------------------------------------------------
{{ config(materialized='table') }}

-- Payment-method dimension: the payment_method_details_* columns of each staged
-- transaction, keyed by the transaction id, with the `payment_method_details_`
-- prefix dropped.
WITH payment_method_columns AS (

    SELECT
        id,
        payment_method_details_card_amount_authorized                AS card_amount_authorized,
        payment_method_details_card_brand                            AS card_brand,
        payment_method_details_card_checks_address_line1_check       AS card_checks_address_line1_check,
        payment_method_details_card_checks_address_postal_code_check AS card_checks_address_postal_code_check,
        payment_method_details_card_checks_cvc_check                 AS card_checks_cvc_check,
        payment_method_details_card_country                          AS card_country,
        payment_method_details_card_exp_month                        AS card_exp_month,
        payment_method_details_card_exp_year                         AS card_exp_year,
        payment_method_details_card_extended_authorization_status    AS card_extended_authorization_status,
        payment_method_details_card_fingerprint                      AS card_fingerprint,
        payment_method_details_card_funding                          AS card_funding,
        payment_method_details_card_incremental_authorization_status AS card_incremental_authorization_status,
        payment_method_details_card_installments                     AS card_installments,
        payment_method_details_card_last4                            AS card_last4,
        payment_method_details_card_mandate                          AS card_mandate,
        payment_method_details_card_multicapture_status              AS card_multicapture_status,
        payment_method_details_card_network                          AS card_network,
        payment_method_details_card_network_token_used               AS card_network_token_used,
        payment_method_details_card_overcapture_maximum_amount       AS card_overcapture_maximum_amount,
        payment_method_details_card_overcapture_status               AS card_overcapture_status,
        payment_method_details_card_three_d_secure                   AS card_three_d_secure,
        payment_method_details_card_wallet                           AS card_wallet,
        payment_method_details_type                                  AS "type"
    FROM {{ ref('stg_transactions') }}

)

SELECT *
FROM payment_method_columns

{# Dev runs (is_dev_run defaults to true) are capped at 100 rows. #}
{% if var("is_dev_run", default=true) %} limit 100 {% endif %}
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
# Compose stack: ZooKeeper + Kafka, the Stripe producer, the Spark consumer,
# PostgreSQL (+ pgAdmin), the forex-rates loader and Grafana.
# Every service reads its configuration from the shared .env file.

networks:
  bridge:
    driver: bridge

services:
  zookeeper:
    # Pinned instead of `latest` so rebuilds are reproducible and stay in step
    # with the Kafka image below.
    image: confluentinc/cp-zookeeper:7.6.1
    env_file:
      - .env
    networks:
      bridge:
        aliases:
          - zookeeper

  kafka:
    # Pinned to a 7.x release: this stack runs Kafka in ZooKeeper mode, which an
    # untagged (latest) image is not guaranteed to keep supporting.
    image: confluentinc/cp-kafka:7.6.1
    depends_on:
      - zookeeper
    env_file:
      - .env
    networks:
      bridge:
        aliases:
          - kafka

  kafka-producer:
    build:
      context: ./kafka-producer
    container_name: kafka-producer
    depends_on:
      - kafka
      - postgres
      - kafka-consumer
    env_file:
      - .env
    networks:
      - bridge

  kafka-consumer:
    build:
      context: ./kafka-consumer
    container_name: kafka-consumer
    depends_on:
      - kafka
      - postgres
    env_file:
      - .env
    networks:
      - bridge

  postgres:
    build:
      context: ./postgres
    container_name: postgres
    restart: always
    env_file:
      - .env
    ports:
      - "5432:5432"
    networks:
      - bridge

  pgadmin:
    image: dpage/pgadmin4
    restart: always
    env_file:
      - .env
    ports:
      - "8080:80"
    depends_on:
      - postgres
    networks:
      - bridge

  forex-rates:
    build:
      context: ./forex-rates
    container_name: forex-rates
    restart: always
    # The loader writes to PostgreSQL as soon as it starts, so start it after
    # the database container. (depends_on orders start-up only; `restart:
    # always` still covers the case where the database is not yet accepting
    # connections.)
    depends_on:
      - postgres
    env_file:
      - .env
    networks:
      - bridge

  grafana:
    image: grafana/grafana
    user: "472"
    env_file:
      - .env
    ports:
      - "3000:3000"
    volumes:
      - ./grafana/dashboards/grafana_datasources.yaml:/etc/grafana/provisioning/datasources/datasource.yaml:ro
      - ./grafana/dashboards:/opt/grafana/dashboards
    networks:
      - bridge
    restart: always

# NOTE(review): none of these named volumes is mounted by a service above —
# confirm whether they are still needed.
volumes:
  settings:
  data:
  grafana_data: {}
103 |
--------------------------------------------------------------------------------
/forex-rates/forex_rates.py:
--------------------------------------------------------------------------------
1 | import os
2 | import requests
3 | import schedule
4 | import psycopg2
5 | from datetime import datetime
6 | import time
7 |
def get_forex_rates():
    """Fetch GBP-based exchange rates for a fixed set of currencies.

    Today's dated snapshot is requested first. If that request fails (network
    error, non-2xx status, or a body that is not JSON), the ``latest`` snapshot
    is requested instead, so a snapshot that is not yet available does not
    fail the job.

    Returns:
        dict with key ``'date'`` (the date reported by the API) plus one key
        per currency code in ``currencies`` mapped to its rate.

    Raises:
        requests.RequestException or ValueError: when both requests fail.
        KeyError: when the payload lacks ``'date'``, ``'gbp'`` or a currency.
    """
    # top currencies only
    currencies = ["usd", "eur", "jpy", "cad", "aud", "chf", "cny", "sek", "nzd", "mxn"]
    today = datetime.now().strftime("%Y-%m-%d")

    last_error = None
    for version in (today, "latest"):
        api_url = f"https://cdn.jsdelivr.net/npm/@fawazahmed0/currency-api@{version}/v1/currencies/gbp.json"
        try:
            # A timeout keeps a stalled connection from blocking the scheduler loop.
            response = requests.get(api_url, timeout=30)
            response.raise_for_status()
            data = response.json()
            break
        except (requests.RequestException, ValueError) as exc:
            print(f"Could not fetch forex rates for '{version}': {exc}")
            last_error = exc
    else:
        # Both the dated and the latest snapshot failed.
        raise last_error

    rates = data['gbp']
    return {'date': data['date'], **{currency: rates[currency] for currency in currencies}}
17 |
def insert_into_db(data):
    """Store one day's exchange rates in the ``forex_rates`` table.

    Connects with the POSTGRES_HOST / POSTGRES_DB / POSTGRES_USER /
    POSTGRES_PASSWORD environment variables, creates the table if it does not
    exist, and inserts the row only when no row for ``data['date']`` is present.

    Args:
        data: dict with key ``'date'`` and one key per currency column
            (usd, eur, jpy, cad, aud, chf, cny, sek, nzd, mxn).

    Raises:
        Whatever the database driver raises. The cursor and connection are
        closed in every case, so a failed statement no longer leaks them.
    """
    conn = psycopg2.connect(
        host=os.getenv("POSTGRES_HOST"),
        database=os.getenv("POSTGRES_DB"),
        user=os.getenv("POSTGRES_USER"),
        password=os.getenv("POSTGRES_PASSWORD")
    )
    try:
        cursor = conn.cursor()
        try:
            # doing it here so I do not lose the already loaded data into postgres
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS forex_rates (
                    date DATE PRIMARY KEY,
                    usd FLOAT,
                    eur FLOAT,
                    jpy FLOAT,
                    cad FLOAT,
                    aud FLOAT,
                    chf FLOAT,
                    cny FLOAT,
                    sek FLOAT,
                    nzd FLOAT,
                    mxn FLOAT
                )
            """)

            # Skip the insert when this date is already loaded.
            cursor.execute("SELECT date FROM forex_rates WHERE date = %s", (data['date'],))
            if cursor.fetchone() is None:
                cursor.execute("""
                    INSERT INTO forex_rates (date, usd, eur, jpy, cad, aud, chf, cny, sek, nzd, mxn)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                """, (
                    data['date'],
                    data['usd'],
                    data['eur'],
                    data['jpy'],
                    data['cad'],
                    data['aud'],
                    data['chf'],
                    data['cny'],
                    data['sek'],
                    data['nzd'],
                    data['mxn']
                ))

            conn.commit()
        finally:
            cursor.close()
    finally:
        conn.close()
65 |
def job():
    """Run one load cycle: fetch the rates, store them, then log completion.

    Exceptions from either step propagate to the caller.
    """
    insert_into_db(get_forex_rates())
    print('New forex rates fetched and inserted into the database.')
70 |
def _run_forever():
    """Load the rates once immediately, then once every 24 hours until interrupted."""
    job()
    schedule.every(24).hours.do(job)

    try:
        while True:
            schedule.run_pending()
            time.sleep(1)
    except KeyboardInterrupt:
        print("Stopping...")


if __name__ == "__main__":
    _run_forever()
81 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Project overview
2 |
3 | 
4 |
5 | * **Stripe**: Using Stripe's API as the source of generating realistic transaction data.
6 |
7 | * **Apache Kafka**: Stripe transaction data is streamed into Apache Kafka. It handles the data streams and ensures they are processed in real-time.
8 |
9 | * **Apache ZooKeeper**: ZooKeeper is used alongside Kafka to manage and coordinate the Kafka brokers. ZooKeeper helps maintain configuration information, naming, synchronization, and group services.
10 |
11 | * **PySpark**: The data from Kafka is then processed using PySpark Structured Streaming. This involves transforming individual transaction data into rows fit for a database.
12 |
13 | * **Forex API**: GBP/x exchange rates are taken from an online API and updated every 24 hours.
14 |
15 | * **PostgreSQL**: After processing, the data is stored in PostgreSQL.
16 |
17 | * **dbt (Data Build Tool)**: dbt is used to manage and transform data within PostgreSQL. Data is split into dimension tables and a fact table.
18 |
19 | * **Grafana**: Finally, the data stored in PostgreSQL is visualized using Grafana.
20 |
21 |
22 | # Data model
23 |
24 | 
25 |
26 | # dbt documentation
27 |
28 | [Link to the docs](https://transaction-stream-data-docs.netlify.app/)
29 |
30 | #### dbt lineage
31 |
32 |
33 |
34 | # Visualisation
35 |
36 |
37 |
38 | # Considerations for improvements
39 |
40 | * add PySpark tests
41 | * use an orchestrator
42 | * use more data
43 | * Spark might be overkill given the limited amount of data, but I wanted to learn how to set up Spark Streaming in case the data volume grows
44 | * for a better Grafana visualisation
45 | * maybe find an alternative transactions data source because the Stripe API has a 25 rate limit
46 | * also many of the generated values in a transaction from the Stripe API are null
47 |
48 | # Setup
49 |
50 | 1. `git clone https://github.com/divakaivan/transaction-stream-data-pipeline.git`
51 | 2. Rename `sample.env` to `.env` and fill in the necessary environment variables
52 | 3. Type `make` in the terminal to see the setup options
53 | ```bash
54 | Usage: make [option]
55 |
56 | Options:
57 | help Show this help message
58 | build Build docker services
59 | start Start docker services (detached mode)
60 | stop Stop docker services
61 | dbt-test Run dbt with LIMIT 100
62 | dbt-full Run dbt with full data
63 | ```
64 |
--------------------------------------------------------------------------------
/data_modelling/models/core/schema.yml:
--------------------------------------------------------------------------------
version: 2

# Documentation and tests for the core fact model.
# All tests use severity `warn`, so failures are reported without failing the build.
# The forex_* rates are GBP-based: the loader reads them from the API's `gbp` document.
models:
  - name: facts_transactions
    description: "Transaction facts"
    columns:
      - name: transaction_id
        description: "Transaction ID"
        data_type: string
        data_tests:
          - not_null:
              severity: warn
          - unique:
              severity: warn
      - name: transaction_date
        description: "Timestamp of when the transaction was created"
        data_type: datetime
        data_tests:
          - not_null:
              severity: warn
      - name: amount
        description: "Amount of the transaction - 100 = £1"
        data_type: numeric
        data_tests:
          - not_null:
              severity: warn
      - name: amount_captured
        description: "Amount in pence captured (can be less than the amount attribute on the charge if a partial capture was made)"
        data_type: numeric
      - name: amount_refunded
        description: "Amount in pence refunded (can be less than the amount attribute on the charge if a partial refund was issued)"
        data_type: numeric
      - name: currency
        description: "Three-letter ISO currency code, in lowercase. Must be a supported currency"
        data_type: string
        data_tests:
          - not_null:
              severity: warn
      - name: customer
        description: "The ID of the customer this charge is for if one exists"
        data_type: string
      - name: payment_intent
        description: "The ID of the PaymentIntent associated with this charge, if one exists"
        data_type: string
      - name: payment_method
        description: "ID of the payment method used in this charge"
        data_type: string
      - name: status
        description: "The status of the payment is either succeeded, pending, or failed"
        data_type: string
      - name: forex_usd
        data_type: numeric
        description: "GBP to USD rate (base currency GBP)"
        data_tests:
          - not_null:
              severity: warn
      - name: forex_eur
        data_type: numeric
        description: "GBP to EUR rate (base currency GBP)"
        data_tests:
          - not_null:
              severity: warn
      - name: forex_jpy
        data_type: numeric
        description: "GBP to JPY rate (base currency GBP)"
        data_tests:
          - not_null:
              severity: warn
      - name: forex_cad
        data_type: numeric
        description: "GBP to CAD rate (base currency GBP)"
        data_tests:
          - not_null:
              severity: warn
      - name: forex_aud
        data_type: numeric
        description: "GBP to AUD rate (base currency GBP)"
        data_tests:
          - not_null:
              severity: warn
      - name: forex_chf
        data_type: numeric
        description: "GBP to CHF rate (base currency GBP)"
        data_tests:
          - not_null:
              severity: warn
      - name: forex_cny
        data_type: numeric
        description: "GBP to CNY rate (base currency GBP)"
        data_tests:
          - not_null:
              severity: warn
      - name: forex_sek
        data_type: numeric
        description: "GBP to SEK rate (base currency GBP)"
        data_tests:
          - not_null:
              severity: warn
      - name: forex_nzd
        data_type: numeric
        description: "GBP to NZD rate (base currency GBP)"
        data_tests:
          - not_null:
              severity: warn
      - name: forex_mxn
        data_type: numeric
        description: "GBP to MXN rate (base currency GBP)"
        data_tests:
          - not_null:
              severity: warn
111 |
--------------------------------------------------------------------------------
/data_modelling/models/staging/stg_transactions.sql:
--------------------------------------------------------------------------------
{#
    Staging model for raw payment transactions.

    Reads every column explicitly from the `postgres.transactions` source and
    materializes the result as a table. While the `is_dev_run` variable is true
    (its default) only 100 rows are kept; pass `--vars '{"is_dev_run": false}'`
    to build the full table.
#}
{{ config(materialized='table') }}

with raw_transactions as (

    select
        amount,
        amount_captured,
        amount_refunded,
        "application",
        application_fee,
        application_fee_amount,
        balance_transaction,
        billing_details_address_city,
        billing_details_address_country,
        billing_details_address_line1,
        billing_details_address_line2,
        billing_details_address_postal_code,
        billing_details_address_state,
        billing_details_email,
        billing_details_name,
        billing_details_phone,
        calculated_statement_descriptor,
        captured,
        created,
        currency,
        customer,
        "description",
        destination,
        dispute,
        disputed,
        failure_balance_transaction,
        failure_code,
        failure_message,
        id,
        invoice,
        livemode,
        "object",
        on_behalf_of,
        "order",
        outcome_network_status,
        outcome_reason,
        outcome_risk_level,
        outcome_risk_score,
        outcome_seller_message,
        outcome_type,
        paid,
        payment_intent,
        payment_method,
        payment_method_details_card_amount_authorized,
        payment_method_details_card_brand,
        payment_method_details_card_checks_address_line1_check,
        payment_method_details_card_checks_address_postal_code_check,
        payment_method_details_card_checks_cvc_check,
        payment_method_details_card_country,
        payment_method_details_card_exp_month,
        payment_method_details_card_exp_year,
        payment_method_details_card_extended_authorization_status,
        payment_method_details_card_fingerprint,
        payment_method_details_card_funding,
        payment_method_details_card_incremental_authorization_status,
        payment_method_details_card_installments,
        payment_method_details_card_last4,
        payment_method_details_card_mandate,
        payment_method_details_card_multicapture_status,
        payment_method_details_card_network,
        payment_method_details_card_network_token_used,
        payment_method_details_card_overcapture_maximum_amount,
        payment_method_details_card_overcapture_status,
        payment_method_details_card_three_d_secure,
        payment_method_details_card_wallet,
        payment_method_details_type,
        receipt_email,
        receipt_number,
        receipt_url,
        refunded,
        review,
        shipping,
        source_address_city,
        source_address_country,
        source_address_line1,
        source_address_line1_check,
        source_address_line2,
        source_address_state,
        source_address_zip,
        source_address_zip_check,
        source_brand,
        source_country,
        source_customer,
        source_cvc_check,
        source_dynamic_last4,
        source_exp_month,
        source_exp_year,
        source_fingerprint,
        source_funding,
        source_id,
        source_last4,
        source_name,
        source_object,
        source_tokenization_method,
        source_wallet,
        source_transfer,
        statement_descriptor,
        statement_descriptor_suffix,
        "status",
        transfer_data,
        transfer_group
    from {{ source('postgres', 'transactions') }}

)

select *
from raw_transactions

{% if var("is_dev_run", default=true) %} limit 100 {% endif %}
--------------------------------------------------------------------------------
/postgres/create_table.sql:
--------------------------------------------------------------------------------
-- Landing table for flattened charge records.
-- Column names mirror the aliases produced by kafka-consumer/python-consumer.py,
-- which appends to this table over JDBC. No keys or constraints are declared.
create table if not exists transactions (
    -- amounts and application
    amount                                                        bigint,
    amount_captured                                               bigint,
    amount_refunded                                               bigint,
    "application"                                                 varchar(255),
    application_fee                                               varchar(255),
    application_fee_amount                                        varchar(255),
    balance_transaction                                           varchar(255),

    -- billing details
    billing_details_address_city                                  varchar(255),
    billing_details_address_country                               varchar(255),
    billing_details_address_line1                                 varchar(255),
    billing_details_address_line2                                 varchar(255),
    billing_details_address_postal_code                           varchar(255),
    billing_details_address_state                                 varchar(255),
    billing_details_email                                         varchar(255),
    billing_details_name                                          varchar(255),
    billing_details_phone                                         varchar(255),

    -- charge attributes
    calculated_statement_descriptor                               varchar(255),
    captured                                                      boolean,
    created                                                       timestamp,
    currency                                                      varchar(3),
    customer                                                      varchar(255),
    "description"                                                 text,
    destination                                                   varchar(255),
    dispute                                                       varchar(255),
    disputed                                                      boolean,
    failure_balance_transaction                                   varchar(255),
    failure_code                                                  varchar(255),
    failure_message                                               text,
    id                                                            varchar(255),
    invoice                                                       varchar(255),
    livemode                                                      boolean,
    "object"                                                      varchar(255),
    on_behalf_of                                                  varchar(255),
    "order"                                                       varchar(255),

    -- outcome
    outcome_network_status                                        varchar(255),
    outcome_reason                                                varchar(255),
    outcome_risk_level                                            varchar(255),
    outcome_risk_score                                            bigint,
    outcome_seller_message                                        varchar(255),
    outcome_type                                                  varchar(255),

    -- payment
    paid                                                          boolean,
    payment_intent                                                varchar(255),
    payment_method                                                varchar(255),

    -- payment method details (card)
    payment_method_details_card_amount_authorized                 bigint,
    payment_method_details_card_brand                             varchar(255),
    payment_method_details_card_checks_address_line1_check        varchar(255),
    payment_method_details_card_checks_address_postal_code_check  varchar(255),
    payment_method_details_card_checks_cvc_check                  varchar(255),
    payment_method_details_card_country                           varchar(2),
    payment_method_details_card_exp_month                         integer,
    payment_method_details_card_exp_year                          integer,
    payment_method_details_card_extended_authorization_status     varchar(255),
    payment_method_details_card_fingerprint                       varchar(255),
    payment_method_details_card_funding                           varchar(255),
    payment_method_details_card_incremental_authorization_status  varchar(255),
    payment_method_details_card_installments                      varchar(255),
    payment_method_details_card_last4                             varchar(4),
    payment_method_details_card_mandate                           varchar(255),
    payment_method_details_card_multicapture_status               varchar(255),
    payment_method_details_card_network                           varchar(255),
    payment_method_details_card_network_token_used                boolean,
    payment_method_details_card_overcapture_maximum_amount        bigint,
    payment_method_details_card_overcapture_status                varchar(255),
    payment_method_details_card_three_d_secure                    varchar(255),
    payment_method_details_card_wallet                            varchar(255),
    payment_method_details_type                                   varchar(255),

    -- receipt, refund, review
    receipt_email                                                 varchar(255),
    receipt_number                                                varchar(255),
    receipt_url                                                   text,
    refunded                                                      boolean,
    review                                                        varchar(255),
    shipping                                                      varchar(255),

    -- source
    source_address_city                                           varchar(255),
    source_address_country                                        varchar(255),
    source_address_line1                                          varchar(255),
    source_address_line1_check                                    varchar(255),
    source_address_line2                                          varchar(255),
    source_address_state                                          varchar(255),
    source_address_zip                                            varchar(255),
    source_address_zip_check                                      varchar(255),
    source_brand                                                  varchar(255),
    source_country                                                varchar(2),
    source_customer                                               varchar(255),
    source_cvc_check                                              varchar(255),
    source_dynamic_last4                                          varchar(4),
    source_exp_month                                              integer,
    source_exp_year                                               integer,
    source_fingerprint                                            varchar(255),
    source_funding                                                varchar(255),
    source_id                                                     varchar(255),
    source_last4                                                  varchar(4),
    source_name                                                   varchar(255),
    source_object                                                 varchar(255),
    source_tokenization_method                                    varchar(255),
    source_wallet                                                 varchar(255),

    -- statement and transfer
    source_transfer                                               varchar(255),
    statement_descriptor                                          varchar(255),
    statement_descriptor_suffix                                   varchar(255),
    "status"                                                      varchar(255),
    transfer_data                                                 varchar(255),
    transfer_group                                                varchar(255)
);
104 |
--------------------------------------------------------------------------------
/kafka-consumer/wait-for-it.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Use this script to test if a given TCP host/port are available
3 |
# Script name used as the prefix of every status message.
# "$0" is quoted so a script path containing whitespace is not word-split.
cmdname=$(basename "$0")
5 |
# Print all arguments to stderr, unless quiet mode (QUIET=1) is active.
echoerr() {
    if [[ $QUIET -eq 1 ]]; then
        return 0
    fi
    echo "$@" >&2
}
7 |
# Write the command-line help to stderr and terminate with exit status 1.
usage() {
    cat >&2 << USAGE
Usage:
$cmdname host:port [-s] [-t timeout] [-- command args]
-h HOST | --host=HOST Host or IP under test
-p PORT | --port=PORT TCP port under test
Alternatively, you specify the host and port as host:port
-s | --strict Only execute subcommand if the test succeeds
-q | --quiet Don't output any status messages
-t TIMEOUT | --timeout=TIMEOUT
Timeout in seconds, zero for no timeout
-- COMMAND ARGS Execute command with args after the test finishes
USAGE
    exit 1
}
24 |
# Poll $HOST:$PORT once per second until a TCP connection succeeds.
# Uses `nc -z` when ISBUSY=1 and bash's /dev/tcp pseudo-device otherwise.
# The loop has no deadline of its own; it only ends on a successful probe.
# Returns the status of the last probe (0 once the port is reachable).
wait_for() {
    if [[ $TIMEOUT -gt 0 ]]; then
        echoerr "$cmdname: waiting $TIMEOUT seconds for $HOST:$PORT"
    else
        echoerr "$cmdname: waiting for $HOST:$PORT without a timeout"
    fi
    start_ts=$(date +%s)
    while true; do
        if [[ $ISBUSY -eq 1 ]]; then
            nc -z $HOST $PORT
        else
            (echo > /dev/tcp/$HOST/$PORT) >/dev/null 2>&1
        fi
        # $? here is the status of whichever probe ran in the branch above.
        result=$?
        if [[ $result -ne 0 ]]; then
            sleep 1
            continue
        fi
        end_ts=$(date +%s)
        echoerr "$cmdname: $HOST:$PORT is available after $((end_ts - start_ts)) seconds"
        break
    done
    return $result
}
51 |
# Re-run this script as a --child under `timeout` so the wait is bounded by
# $TIMEOUT seconds, forwarding SIGINT to the child while waiting.
# Sets PID and RESULT; returns the child's exit status (non-zero is reported
# as a timeout).
wait_for_wrapper() {
    # In order to support SIGINT during timeout: http://unix.stackexchange.com/a/57692
    local child_args=(--child "--host=$HOST" "--port=$PORT" "--timeout=$TIMEOUT")
    if [[ $QUIET -eq 1 ]]; then
        child_args=(--quiet "${child_args[@]}")
    fi
    # $BUSYTIMEFLAG stays unquoted on purpose: when empty it must expand to
    # nothing rather than an empty argument. Everything else is quoted so a
    # script path or host containing whitespace/glob characters stays intact.
    timeout $BUSYTIMEFLAG "$TIMEOUT" "$0" "${child_args[@]}" &
    PID=$!
    trap "kill -INT -$PID" INT
    wait $PID
    RESULT=$?
    if [[ $RESULT -ne 0 ]]; then
        echoerr "$cmdname: timeout occurred after waiting $TIMEOUT seconds for $HOST:$PORT"
    fi
    return $RESULT
}
69 |
# Process arguments: fill HOST, PORT, TIMEOUT, STRICT, QUIET, CHILD and CLI.
# Pattern order matters: anything containing ':' is treated as host:port first.
while (( $# > 0 )); do
    case "$1" in
        *:*)
            hostport=(${1//:/ })
            HOST=${hostport[0]}
            PORT=${hostport[1]}
            shift
            ;;
        --child)
            CHILD=1
            shift
            ;;
        -q|--quiet)
            QUIET=1
            shift
            ;;
        -s|--strict)
            STRICT=1
            shift
            ;;
        -h)
            HOST="$2"
            # A missing value ends parsing; the host/port check below reports it.
            [[ $HOST == "" ]] && break
            shift 2
            ;;
        --host=*)
            HOST="${1#*=}"
            shift
            ;;
        -p)
            PORT="$2"
            [[ $PORT == "" ]] && break
            shift 2
            ;;
        --port=*)
            PORT="${1#*=}"
            shift
            ;;
        -t)
            TIMEOUT="$2"
            [[ $TIMEOUT == "" ]] && break
            shift 2
            ;;
        --timeout=*)
            TIMEOUT="${1#*=}"
            shift
            ;;
        --)
            # Everything after `--` is the command to run once the wait ends.
            shift
            CLI=("$@")
            break
            ;;
        --help)
            usage
            ;;
        *)
            echoerr "Unknown argument: $1"
            usage
            ;;
    esac
done
133 |
# Both a host and a port are mandatory; usage() exits with status 1.
if [[ -z "$HOST" || -z "$PORT" ]]; then
    echoerr "Error: you need to provide a host and port to test."
    usage
fi

# Apply defaults to any option left unset or empty by the argument loop.
: "${TIMEOUT:=15}"
: "${STRICT:=0}"
: "${CHILD:=0}"
: "${QUIET:=0}"
143 |
# Check whether `timeout` is provided by BusyBox. If so, probe with `nc`
# (ISBUSY=1) and pass `-t` only when that build's usage text still mentions it
# (older BusyBox required `-t SECS`; newer releases take the duration
# positionally, like GNU timeout).
TIMEOUT_PATH=$(type -p timeout)
# Resolve symlinks (BusyBox applets are usually links to the busybox binary);
# stay quiet and fall back to readlink when realpath is unavailable.
TIMEOUT_PATH=$(realpath "$TIMEOUT_PATH" 2>/dev/null || readlink -f "$TIMEOUT_PATH" 2>/dev/null)
BUSYTIMEFLAG=""
if [[ $TIMEOUT_PATH =~ "busybox" ]]; then
    ISBUSY=1
    if timeout 2>&1 | grep -q -e '-t '; then
        BUSYTIMEFLAG="-t"
    fi
else
    ISBUSY=0
fi
154 |
# Child invocations (spawned by wait_for_wrapper) only poll and report.
if [[ $CHILD -gt 0 ]]; then
    wait_for
    RESULT=$?
    exit $RESULT
fi

# Parent: bound the wait with `timeout` when one was requested.
if [[ $TIMEOUT -gt 0 ]]; then
    wait_for_wrapper
else
    wait_for
fi
RESULT=$?

# Without a trailing command the probe result is the exit status.
if [[ ${CLI[0]} == "" ]]; then
    exit $RESULT
fi

# Strict mode refuses to start the command after a failed wait.
if [[ $RESULT -ne 0 && $STRICT -eq 1 ]]; then
    echoerr "$cmdname: strict mode, refusing to execute subprocess"
    exit $RESULT
fi
exec "${CLI[@]}"
--------------------------------------------------------------------------------
/kafka-producer/wait-for-it.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Use this script to test if a given TCP host/port are available
3 |
# Script name used as the prefix of every status message.
# "$0" is quoted so a script path containing whitespace is not word-split.
cmdname=$(basename "$0")
5 |
# Print all arguments to stderr, unless quiet mode (QUIET=1) is active.
echoerr() {
    if [[ $QUIET -eq 1 ]]; then
        return 0
    fi
    echo "$@" >&2
}
7 |
# Write the command-line help to stderr and terminate with exit status 1.
usage() {
    cat >&2 << USAGE
Usage:
$cmdname host:port [-s] [-t timeout] [-- command args]
-h HOST | --host=HOST Host or IP under test
-p PORT | --port=PORT TCP port under test
Alternatively, you specify the host and port as host:port
-s | --strict Only execute subcommand if the test succeeds
-q | --quiet Don't output any status messages
-t TIMEOUT | --timeout=TIMEOUT
Timeout in seconds, zero for no timeout
-- COMMAND ARGS Execute command with args after the test finishes
USAGE
    exit 1
}
24 |
# Poll $HOST:$PORT once per second until a TCP connection succeeds.
# Uses `nc -z` when ISBUSY=1 and bash's /dev/tcp pseudo-device otherwise.
# The loop has no deadline of its own; it only ends on a successful probe.
# Returns the status of the last probe (0 once the port is reachable).
wait_for() {
    if [[ $TIMEOUT -gt 0 ]]; then
        echoerr "$cmdname: waiting $TIMEOUT seconds for $HOST:$PORT"
    else
        echoerr "$cmdname: waiting for $HOST:$PORT without a timeout"
    fi
    start_ts=$(date +%s)
    while true; do
        if [[ $ISBUSY -eq 1 ]]; then
            nc -z $HOST $PORT
        else
            (echo > /dev/tcp/$HOST/$PORT) >/dev/null 2>&1
        fi
        # $? here is the status of whichever probe ran in the branch above.
        result=$?
        if [[ $result -ne 0 ]]; then
            sleep 1
            continue
        fi
        end_ts=$(date +%s)
        echoerr "$cmdname: $HOST:$PORT is available after $((end_ts - start_ts)) seconds"
        break
    done
    return $result
}
51 |
# Re-run this script as a --child under `timeout` so the wait is bounded by
# $TIMEOUT seconds, forwarding SIGINT to the child while waiting.
# Sets PID and RESULT; returns the child's exit status (non-zero is reported
# as a timeout).
wait_for_wrapper() {
    # In order to support SIGINT during timeout: http://unix.stackexchange.com/a/57692
    local child_args=(--child "--host=$HOST" "--port=$PORT" "--timeout=$TIMEOUT")
    if [[ $QUIET -eq 1 ]]; then
        child_args=(--quiet "${child_args[@]}")
    fi
    # $BUSYTIMEFLAG stays unquoted on purpose: when empty it must expand to
    # nothing rather than an empty argument. Everything else is quoted so a
    # script path or host containing whitespace/glob characters stays intact.
    timeout $BUSYTIMEFLAG "$TIMEOUT" "$0" "${child_args[@]}" &
    PID=$!
    trap "kill -INT -$PID" INT
    wait $PID
    RESULT=$?
    if [[ $RESULT -ne 0 ]]; then
        echoerr "$cmdname: timeout occurred after waiting $TIMEOUT seconds for $HOST:$PORT"
    fi
    return $RESULT
}
69 |
# Process arguments: fill HOST, PORT, TIMEOUT, STRICT, QUIET, CHILD and CLI.
# Pattern order matters: anything containing ':' is treated as host:port first.
while (( $# > 0 )); do
    case "$1" in
        *:*)
            hostport=(${1//:/ })
            HOST=${hostport[0]}
            PORT=${hostport[1]}
            shift
            ;;
        --child)
            CHILD=1
            shift
            ;;
        -q|--quiet)
            QUIET=1
            shift
            ;;
        -s|--strict)
            STRICT=1
            shift
            ;;
        -h)
            HOST="$2"
            # A missing value ends parsing; the host/port check below reports it.
            [[ $HOST == "" ]] && break
            shift 2
            ;;
        --host=*)
            HOST="${1#*=}"
            shift
            ;;
        -p)
            PORT="$2"
            [[ $PORT == "" ]] && break
            shift 2
            ;;
        --port=*)
            PORT="${1#*=}"
            shift
            ;;
        -t)
            TIMEOUT="$2"
            [[ $TIMEOUT == "" ]] && break
            shift 2
            ;;
        --timeout=*)
            TIMEOUT="${1#*=}"
            shift
            ;;
        --)
            # Everything after `--` is the command to run once the wait ends.
            shift
            CLI=("$@")
            break
            ;;
        --help)
            usage
            ;;
        *)
            echoerr "Unknown argument: $1"
            usage
            ;;
    esac
done
133 |
# Both a host and a port are mandatory; usage() exits with status 1.
if [[ -z "$HOST" || -z "$PORT" ]]; then
    echoerr "Error: you need to provide a host and port to test."
    usage
fi

# Apply defaults to any option left unset or empty by the argument loop.
: "${TIMEOUT:=15}"
: "${STRICT:=0}"
: "${CHILD:=0}"
: "${QUIET:=0}"
143 |
# Check whether `timeout` is provided by BusyBox. If so, probe with `nc`
# (ISBUSY=1) and pass `-t` only when that build's usage text still mentions it
# (older BusyBox required `-t SECS`; newer releases take the duration
# positionally, like GNU timeout).
TIMEOUT_PATH=$(type -p timeout)
# Resolve symlinks (BusyBox applets are usually links to the busybox binary);
# stay quiet and fall back to readlink when realpath is unavailable.
TIMEOUT_PATH=$(realpath "$TIMEOUT_PATH" 2>/dev/null || readlink -f "$TIMEOUT_PATH" 2>/dev/null)
BUSYTIMEFLAG=""
if [[ $TIMEOUT_PATH =~ "busybox" ]]; then
    ISBUSY=1
    if timeout 2>&1 | grep -q -e '-t '; then
        BUSYTIMEFLAG="-t"
    fi
else
    ISBUSY=0
fi
154 |
# Child invocations (spawned by wait_for_wrapper) only poll and report.
if [[ $CHILD -gt 0 ]]; then
    wait_for
    RESULT=$?
    exit $RESULT
fi

# Parent: bound the wait with `timeout` when one was requested.
if [[ $TIMEOUT -gt 0 ]]; then
    wait_for_wrapper
else
    wait_for
fi
RESULT=$?

# Without a trailing command the probe result is the exit status.
if [[ ${CLI[0]} == "" ]]; then
    exit $RESULT
fi

# Strict mode refuses to start the command after a failed wait.
if [[ $RESULT -ne 0 && $STRICT -eq 1 ]]; then
    echoerr "$cmdname: strict mode, refusing to execute subprocess"
    exit $RESULT
fi
exec "${CLI[@]}"
--------------------------------------------------------------------------------
/kafka-consumer/spark_schema.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql.types import StructType, StructField, StringType, LongType, BooleanType, ArrayType, MapType
2 |
# Spark schema for the JSON payload read from Kafka: one top-level
# "transactions" array whose elements are nested charge objects.
# Every field is declared nullable.


def _string(name):
    """Nullable string field."""
    return StructField(name, StringType(), True)


def _strings(*names):
    """Consecutive nullable string fields, in the order given."""
    return [_string(name) for name in names]


def _long(name):
    """Nullable 64-bit integer field."""
    return StructField(name, LongType(), True)


def _boolean(name):
    """Nullable boolean field."""
    return StructField(name, BooleanType(), True)


def _string_map(name):
    """Nullable map<string, string> field."""
    return StructField(name, MapType(StringType(), StringType()), True)


def _struct(name, fields):
    """Nullable nested struct field built from a list of StructFields."""
    return StructField(name, StructType(fields), True)


def _status_struct(name):
    """Nullable struct holding a single string `status` field."""
    return _struct(name, [_string("status")])


_billing_details = _struct("billing_details", [
    _struct("address", _strings(
        "city", "country", "line1", "line2", "postal_code", "state",
    )),
    *_strings("email", "name", "phone"),
])

_outcome = _struct("outcome", [
    *_strings("network_status", "reason", "risk_level"),
    _long("risk_score"),
    *_strings("seller_message", "type"),
])

_card = _struct("card", [
    _long("amount_authorized"),
    _string("brand"),
    _struct("checks", _strings(
        "address_line1_check", "address_postal_code_check", "cvc_check",
    )),
    _string("country"),
    _long("exp_month"),
    _long("exp_year"),
    _status_struct("extended_authorization"),
    *_strings("fingerprint", "funding"),
    _status_struct("incremental_authorization"),
    *_strings("installments", "last4", "mandate"),
    _status_struct("multicapture"),
    _string("network"),
    _struct("network_token", [_boolean("used")]),
    _struct("overcapture", [
        _long("maximum_amount_capturable"),
        _string("status"),
    ]),
    *_strings("three_d_secure", "wallet"),
])

_payment_method_details = _struct("payment_method_details", [
    _card,
    _string("type"),
])

_source = _struct("source", [
    *_strings(
        "address_city", "address_country", "address_line1",
        "address_line1_check", "address_line2", "address_state",
        "address_zip", "address_zip_check", "brand", "country",
        "customer", "cvc_check", "dynamic_last4",
    ),
    _long("exp_month"),
    _long("exp_year"),
    *_strings("fingerprint", "funding", "id", "last4"),
    _string_map("metadata"),
    *_strings("name", "object", "tokenization_method", "wallet"),
])

_transaction = StructType([
    _long("amount"),
    _long("amount_captured"),
    _long("amount_refunded"),
    *_strings(
        "application", "application_fee", "application_fee_amount",
        "balance_transaction",
    ),
    _billing_details,
    _string("calculated_statement_descriptor"),
    _boolean("captured"),
    _long("created"),
    *_strings("currency", "customer", "description", "destination", "dispute"),
    _boolean("disputed"),
    *_strings("failure_balance_transaction", "failure_code", "failure_message"),
    _string_map("fraud_details"),
    *_strings("id", "invoice"),
    _boolean("livemode"),
    _string_map("metadata"),
    *_strings("object", "on_behalf_of", "order"),
    _outcome,
    _boolean("paid"),
    *_strings("payment_intent", "payment_method"),
    _payment_method_details,
    *_strings("receipt_email", "receipt_number", "receipt_url"),
    _boolean("refunded"),
    *_strings("review", "shipping"),
    _source,
    *_strings(
        "source_transfer", "statement_descriptor",
        "statement_descriptor_suffix", "status", "transfer_data",
        "transfer_group",
    ),
])

schema = StructType([
    StructField("transactions", ArrayType(_transaction))
])
137 |
--------------------------------------------------------------------------------
/kafka-consumer/python-consumer.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from pyspark.sql import SparkSession
4 | from pyspark.sql.types import LongType
5 | from pyspark.sql.functions import from_json, col, explode, from_unixtime
6 |
7 | from spark_schema import schema
8 |
# Kafka connection settings come from the process environment.
kafka_nodes = os.getenv('KAFKA_SERVER')
my_topic = os.getenv('KAFKA_TOPIC')

# Fail fast with a readable message instead of handing None to the Kafka
# source options, where the resulting error is much harder to trace.
_missing_kafka_env = [
    name
    for name, value in (('KAFKA_SERVER', kafka_nodes), ('KAFKA_TOPIC', my_topic))
    if not value
]
if _missing_kafka_env:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_kafka_env)
    )
11 |
# Obtain (or reuse) the Spark session for this consumer.
spark = SparkSession.builder.appName("KafkaConsumer").getOrCreate()

# Subscribe to the configured topic, starting from the newest offsets.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_nodes)
    .option("subscribe", my_topic)
    .option("startingOffsets", "latest")
    .load()
)

# Cast the Kafka `value` column to a string so it can be parsed as JSON.
df = df.withColumn("value", col("value").cast("string"))

# Parse the JSON against the declared schema and promote its top-level
# fields (the `transactions` array) to columns.
parsed_payload = from_json(col("value"), schema)
df_parsed = df.withColumn("parsed_value", parsed_payload).select("parsed_value.*")
28 |
# One output row per element of the `transactions` array.
df_exploded = df_parsed.select(explode(col("transactions")).alias("transaction"))

# Dotted paths (relative to `transaction`) to project, in output column order.
# Each column is named after its path with dots replaced by underscores, unless
# listed in _ALIAS_OVERRIDES. Generating the expressions keeps every alias in
# the same "<path> as <alias>" form (the hand-written list had one entry
# without `as`).
_TRANSACTION_PATHS = (
    "amount",
    "amount_captured",
    "amount_refunded",
    "application",
    "application_fee",
    "application_fee_amount",
    "balance_transaction",
    "billing_details.address.city",
    "billing_details.address.country",
    "billing_details.address.line1",
    "billing_details.address.line2",
    "billing_details.address.postal_code",
    "billing_details.address.state",
    "billing_details.email",
    "billing_details.name",
    "billing_details.phone",
    "calculated_statement_descriptor",
    "captured",
    "created",
    "currency",
    "customer",
    "description",
    "destination",
    "dispute",
    "disputed",
    "failure_balance_transaction",
    "failure_code",
    "failure_message",
    "id",
    "invoice",
    "livemode",
    "object",
    "on_behalf_of",
    "order",
    "outcome.network_status",
    "outcome.reason",
    "outcome.risk_level",
    "outcome.risk_score",
    "outcome.seller_message",
    "outcome.type",
    "paid",
    "payment_intent",
    "payment_method",
    "payment_method_details.card.amount_authorized",
    "payment_method_details.card.brand",
    "payment_method_details.card.checks.address_line1_check",
    "payment_method_details.card.checks.address_postal_code_check",
    "payment_method_details.card.checks.cvc_check",
    "payment_method_details.card.country",
    "payment_method_details.card.exp_month",
    "payment_method_details.card.exp_year",
    "payment_method_details.card.extended_authorization.status",
    "payment_method_details.card.fingerprint",
    "payment_method_details.card.funding",
    "payment_method_details.card.incremental_authorization.status",
    "payment_method_details.card.installments",
    "payment_method_details.card.last4",
    "payment_method_details.card.mandate",
    "payment_method_details.card.multicapture.status",
    "payment_method_details.card.network",
    "payment_method_details.card.network_token.used",
    "payment_method_details.card.overcapture.maximum_amount_capturable",
    "payment_method_details.card.overcapture.status",
    "payment_method_details.card.three_d_secure",
    "payment_method_details.card.wallet",
    "payment_method_details.type",
    "receipt_email",
    "receipt_number",
    "receipt_url",
    "refunded",
    "review",
    "shipping",
    "source.address_city",
    "source.address_country",
    "source.address_line1",
    "source.address_line1_check",
    "source.address_line2",
    "source.address_state",
    "source.address_zip",
    "source.address_zip_check",
    "source.brand",
    "source.country",
    "source.customer",
    "source.cvc_check",
    "source.dynamic_last4",
    "source.exp_month",
    "source.exp_year",
    "source.fingerprint",
    "source.funding",
    "source.id",
    "source.last4",
    "source.name",
    "source.object",
    "source.tokenization_method",
    "source.wallet",
    "source_transfer",
    "statement_descriptor",
    "statement_descriptor_suffix",
    "status",
    "transfer_data",
    "transfer_group",
)

# Output names that differ from the dotted path with underscores.
_ALIAS_OVERRIDES = {
    "payment_method_details.card.overcapture.maximum_amount_capturable":
        "payment_method_details_card_overcapture_maximum_amount",
}


def _projection(path):
    """Return the `transaction.<path> as <alias>` expression for one field."""
    alias = _ALIAS_OVERRIDES.get(path, path.replace(".", "_"))
    return f"transaction.{path} as {alias}"


df_final = df_exploded.selectExpr(*[_projection(path) for path in _TRANSACTION_PATHS])
134 |
# Log the flattened schema; at this point `created` is still the long value
# declared in the parsing schema.
df_final.printSchema()

# Convert `created` (treated as Unix epoch seconds by from_unixtime) into a
# timestamp column, replacing the original value.
_created_epoch = col("created").cast(LongType())
_created_ts = from_unixtime(_created_epoch).cast("timestamp")
df_final = df_final.withColumn("created", _created_ts)
137 |
# PostgreSQL connection settings come from the process environment.
POSTGRES_DB = os.getenv("POSTGRES_DB")
POSTGRES_USER = os.getenv("POSTGRES_USER")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD")
POSTGRES_HOST = os.getenv("POSTGRES_HOST")

# Fail at startup rather than at the first micro-batch: an unset variable
# would otherwise be formatted into the URL as the literal text "None".
# An empty password is tolerated; an unset one is not.
_missing_pg_env = [
    name
    for name, is_missing in (
        ("POSTGRES_DB", not POSTGRES_DB),
        ("POSTGRES_USER", not POSTGRES_USER),
        ("POSTGRES_PASSWORD", POSTGRES_PASSWORD is None),
        ("POSTGRES_HOST", not POSTGRES_HOST),
    )
    if is_missing
]
if _missing_pg_env:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_pg_env)
    )

# JDBC URL; the port is fixed at 5432.
pg_url = f"jdbc:postgresql://{POSTGRES_HOST}:5432/{POSTGRES_DB}"

# Connection properties passed to DataFrameWriter.jdbc.
pg_properties = {
    "user": POSTGRES_USER,
    "password": POSTGRES_PASSWORD,
    "driver": "org.postgresql.Driver"
}
150 |
def write_to_postgres(df, epoch_id):
    """Append one micro-batch to the `transactions` table over JDBC.

    Args:
        df: the micro-batch DataFrame handed over by `foreachBatch`.
        epoch_id: batch identifier supplied by `foreachBatch`; not used.
    """
    batch_writer = df.write
    batch_writer.jdbc(
        url=pg_url,
        table="transactions",
        mode="append",
        properties=pg_properties,
    )
154 |
# Write every micro-batch to PostgreSQL via write_to_postgres.
_stream_writer = (
    df_final.writeStream
    .foreachBatch(write_to_postgres)
    .outputMode("append")
)

# Optional durable checkpoint directory. When SPARK_CHECKPOINT_LOCATION is
# unset the writer is started exactly as before, with no explicit checkpoint.
_checkpoint_location = os.getenv("SPARK_CHECKPOINT_LOCATION")
if _checkpoint_location:
    _stream_writer = _stream_writer.option("checkpointLocation", _checkpoint_location)

query = _stream_writer.start()

# Block until the streaming query stops or fails.
query.awaitTermination()
162 |
--------------------------------------------------------------------------------
/data_modelling/models/dimensions/schema.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 |
3 | sources:
4 | - name: postgres
5 | database: postgres
6 | schema: public
7 | tables:
8 | - name: forex_rates
9 |
10 | models:
11 | - name: dim_forex_rates
12 | description: "Daily Forex rates for top currencies"
13 | columns:
14 | - name: date_rates
15 | data_type: date
16 | description: "Date of the forex rate"
17 | data_tests:
18 | - not_null:
19 | severity: warn
20 | - name: usd
21 | data_type: float
22 | description: "USD to GBP rate"
23 | data_tests:
24 | - not_null:
25 | severity: warn
26 | - name: eur
27 | data_type: float
28 | description: "EUR to GBP rate"
29 | data_tests:
30 | - not_null:
31 | severity: warn
32 | - name: jpy
33 | data_type: float
34 | description: "JPY to GBP rate"
35 | data_tests:
36 | - not_null:
37 | severity: warn
38 | - name: cad
39 | data_type: float
40 | description: "CAD to GBP rate"
41 | data_tests:
42 | - not_null:
43 | severity: warn
44 | - name: aud
45 | data_type: float
46 | description: "AUD to GBP rate"
47 | data_tests:
48 | - not_null:
49 | severity: warn
50 | - name: chf
51 | data_type: float
52 | description: "CHF to GBP rate"
53 | data_tests:
54 | - not_null:
55 | severity: warn
56 | - name: cny
57 | data_type: float
58 | description: "CNY to GBP rate"
59 | data_tests:
60 | - not_null:
61 | severity: warn
62 | - name: sek
63 | data_type: float
64 | description: "SEK to GBP rate"
65 | data_tests:
66 | - not_null:
67 | severity: warn
68 | - name: nzd
69 | data_type: float
70 | description: "NZD to GBP rate"
71 | data_tests:
72 | - not_null:
73 | severity: warn
74 | - name: mxn
75 | data_type: float
76 | description: "MXN to GBP rate"
77 | data_tests:
78 | - not_null:
79 | severity: warn
80 |
81 | - name: dim_billing_details
82 | description: "Billing details for a charge"
83 | columns:
84 | - name: id
85 | data_type: string
86 | description: "ID of the Charge"
87 | data_tests:
88 | - not_null:
89 | severity: warn
90 | - unique:
91 | severity: warn
92 | - name: address_city
93 | data_type: string
94 | description: "City of the address"
95 | - name: address_country
96 | data_type: string
97 | description: "Country of the address"
98 | - name: address_line1
99 | data_type: string
100 | description: "Address line 1"
101 | - name: address_line2
102 | data_type: string
103 | description: "Address line 2"
104 | - name: address_postal_code
105 | data_type: string
106 | description: "Postal code of the address"
107 | - name: address_state
108 | data_type: string
109 | description: "State of the address"
110 | - name: email
111 | data_type: string
112 | description: "Email of the customer"
113 | - name: name
114 | data_type: string
115 | description: "Name of the customer"
116 | - name: phone
117 | data_type: string
118 | description: "Phone number of the customer"
119 |
120 | - name: dim_outcome_details
121 | description: "Outcome details for a charge"
122 | columns:
123 | - name: id
124 | data_type: string
125 | description: "ID of the Charge"
126 | data_tests:
127 | - not_null:
128 | severity: warn
129 | - unique:
130 | severity: warn
131 | - name: network_status
132 | data_type: string
133 | description: "Network status of the outcome"
134 | - name: reason
135 | data_type: string
136 | description: "Reason for the outcome"
137 | - name: risk_level
138 | data_type: string
139 | description: "Risk level of the outcome (low, normal, high)"
140 | - name: risk_score
141 | data_type: float
142 | description: "Risk score of the outcome"
143 | - name: seller_message
144 | data_type: string
145 | description: "Seller message of the outcome"
146 | - name: outcome_type
147 | data_type: string
148 | description: "Type of the outcome"
149 |
150 |
151 | - name: dim_payment_method_details
152 | description: "Payment method details for a charge"
153 | columns:
154 | - name: id
155 | data_type: string
156 | description: "ID of the Charge"
157 | data_tests:
158 | - not_null:
159 | severity: warn
160 | - unique:
161 | severity: warn
162 | - name: card_amount_authorized
163 | data_type: float
164 | description: "Authorized amount on the card"
165 | - name: card_brand
166 | data_type: string
167 | description: "Brand of the card"
168 | - name: card_checks_address_line1_check
169 | data_type: string
170 | description: "Address line 1 check result"
171 | - name: card_checks_address_postal_code_check
172 | data_type: string
173 | description: "Postal code check result"
174 | - name: card_checks_cvc_check
175 | data_type: string
176 | description: "CVC check result"
177 | - name: card_country
178 | data_type: string
179 | description: "Country of the card"
180 | - name: card_exp_month
181 | data_type: integer
182 | description: "Expiration month of the card"
183 | - name: card_exp_year
184 | data_type: integer
185 | description: "Expiration year of the card"
186 | - name: card_extended_authorization_status
187 | data_type: string
188 | description: "Extended authorization status of the card"
189 | - name: card_fingerprint
190 | data_type: string
191 | description: "Fingerprint of the card"
192 | - name: card_funding
193 | data_type: string
194 | description: "Funding source of the card"
195 | - name: card_incremental_authorization_status
196 | data_type: string
197 | description: "Incremental authorization status of the card"
198 | - name: card_installments
199 | data_type: integer
200 | description: "Number of installments for the card"
201 | - name: card_last4
202 | data_type: string
203 | description: "Last 4 digits of the card"
204 | - name: card_mandate
205 | data_type: string
206 | description: "Mandate of the card"
207 | - name: card_multicapture_status
208 | data_type: string
209 | description: "Multicapture status of the card"
210 | - name: card_network
211 | data_type: string
212 | description: "Network of the card"
213 | - name: card_network_token_used
214 | data_type: string
215 | description: "Whether a network token was used for the card"
216 | - name: card_overcapture_maximum_amount
217 | data_type: float
218 | description: "Maximum amount for overcapture on the card"
219 | - name: card_overcapture_status
220 | data_type: string
221 | description: "Overcapture status of the card"
222 | - name: card_three_d_secure
223 | data_type: string
224 | description: "3D Secure status of the card"
225 | - name: card_wallet
226 | data_type: string
227 | description: "Wallet used for the card"
228 | - name: type
229 | data_type: string
230 | description: "Type of the payment method"
231 |
232 | - name: dim_source_details
233 | description: "Source details for a charge"
234 | columns:
235 | - name: id
236 | data_type: string
237 | description: "ID of the Charge"
238 | data_tests:
239 | - not_null:
240 | severity: warn
241 | - unique:
242 | severity: warn
243 | - name: address_city
244 | data_type: string
245 | description: "City of the address"
246 | - name: address_country
247 | data_type: string
248 | description: "Country of the address"
249 | - name: address_line1
250 | data_type: string
251 | description: "Address line 1"
252 | - name: address_line1_check
253 | data_type: string
254 | description: "Address line 1 check result"
255 | - name: address_line2
256 | data_type: string
257 | description: "Address line 2"
258 | - name: address_state
259 | data_type: string
260 | description: "State of the address"
261 | - name: address_zip
262 | data_type: string
263 | description: "ZIP code of the address"
264 | - name: address_zip_check
265 | data_type: string
266 | description: "ZIP code check result"
267 | - name: brand
268 | data_type: string
269 | description: "Brand of the source"
270 | - name: country
271 | data_type: string
272 | description: "Country of the source"
273 | - name: customer
274 | data_type: string
275 | description: "Customer associated with the source"
276 | - name: cvc_check
277 | data_type: string
278 | description: "CVC check result"
279 | - name: dynamic_last4
280 | data_type: string
281 | description: "Dynamic last 4 digits of the source"
282 | - name: exp_month
283 | data_type: integer
284 | description: "Expiration month of the source"
285 | - name: exp_year
286 | data_type: integer
287 | description: "Expiration year of the source"
288 | - name: fingerprint
289 | data_type: string
290 | description: "Fingerprint of the source"
291 | - name: funding
292 | data_type: string
293 | description: "Funding type of the source"
294 | - name: source_id
295 | data_type: string
296 | description: "ID of the source"
297 | - name: last4
298 | data_type: string
299 | description: "Last 4 digits of the source"
300 | - name: name
301 | data_type: string
302 | description: "Name of the source"
303 | - name: object_type
304 | data_type: string
305 | description: "Type of the source object"
306 | - name: tokenization_method
307 | data_type: string
308 | description: "Tokenization method of the source"
309 | - name: wallet
310 | data_type: string
311 | description: "Wallet used for the source"
312 | - name: transfer
313 | data_type: string
314 | description: "Transfer associated with the source"
315 |
--------------------------------------------------------------------------------
/data_modelling/models/staging/schema.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 |
3 | sources:
4 | - name: postgres
5 | database: postgres
6 | schema: public
7 | tables:
8 | - name: transactions
9 |
10 | models:
11 | - name: stg_transactions
12 | description: "Staging table for transactions data"
13 | columns:
14 | - name: amount
15 | data_type: numeric
16 | description: "Amount of the transaction - 100 = £1"
17 | data_tests:
18 | - not_null:
19 | severity: warn
20 | - name: amount_captured
21 | data_type: numeric
22 | description: "Amount captured, in the smallest currency unit (100 = £1); can be less than the amount attribute on the charge if a partial capture was made"
23 | - name: amount_refunded
24 | data_type: numeric
25 | description: "Amount refunded, in the smallest currency unit (100 = £1); can be less than the amount attribute on the charge if a partial refund was issued"
26 | - name: application
27 | data_type: string
28 | description: "ID of the Connect application that created the charge"
29 | - name: application_fee
30 | data_type: numeric
31 | description: "The application fee (if any) for the charge. See the Connect documentation for details."
32 | - name: application_fee_amount
33 | data_type: numeric
34 | description: "The amount of the application fee (if any) requested for the charge. See the Connect documentation for details."
35 | - name: balance_transaction
36 | data_type: string
37 | description: "ID of the balance transaction that describes the impact of this charge on your account balance (not including refunds or disputes)"
38 | - name: billing_details_address_city
39 | data_type: string
40 | description: "City in the billing address"
41 | - name: billing_details_address_country
42 | data_type: string
43 | description: "Country in the billing address"
44 | - name: billing_details_address_line1
45 | data_type: string
46 | description: "Line 1 of the billing address"
47 | - name: billing_details_address_line2
48 | data_type: string
49 | description: "Line 2 of the billing address"
50 | - name: billing_details_address_postal_code
51 | data_type: string
52 | description: "Postal code in the billing address"
53 | - name: billing_details_address_state
54 | data_type: string
55 | description: "State in the billing address"
56 | - name: billing_details_email
57 | data_type: string
58 | description: "Email in the billing details"
59 | - name: billing_details_name
60 | data_type: string
61 | description: "Name in the billing details"
62 | - name: billing_details_phone
63 | data_type: string
64 | description: "Phone number in the billing details"
65 | - name: calculated_statement_descriptor
66 | data_type: string
67 | description: "The full statement descriptor that is passed to card networks, and that is displayed on your customers’ credit card and bank statements. Allows you to see what the statement descriptor looks like after the static and dynamic portions are combined."
68 | - name: captured
69 | data_type: boolean
70 | description: "If the charge was created without capturing, this Boolean represents whether it is still uncaptured or has since been captured"
71 | - name: created
72 | data_type: datetime
73 | description: "Timestamp of when the transaction was created"
74 | data_tests:
75 | - not_null:
76 | severity: warn
77 | - name: currency
78 | data_type: string
79 | description: "Three-letter ISO currency code, in lowercase. Must be a supported currency."
80 | - name: customer
81 | data_type: string
82 | description: "The ID of the customer this charge is for if one exists"
83 | - name: description
84 | data_type: string
85 | description: "An arbitrary string attached to the object. Often useful for displaying to users."
86 | - name: destination
87 | data_type: string
88 | description: "Destination of the transaction"
89 | - name: dispute
90 | data_type: string
91 | description: "Dispute associated with the transaction"
92 | - name: disputed
93 | data_type: boolean
94 | description: "Whether the charge has been disputed"
95 | - name: failure_balance_transaction
96 | data_type: string
97 | description: "Balance transaction associated with the failure"
98 | - name: failure_code
99 | data_type: string
100 | description: "Error code explaining reason for charge failure if available (see the errors section for a list of codes)"
101 | - name: failure_message
102 | data_type: string
103 | description: "Message to user further explaining reason for charge failure if available"
104 | - name: id
105 | data_type: string
106 | description: "ID of the transaction"
107 | data_tests:
108 | - not_null:
109 | severity: warn
110 | - unique:
111 | severity: warn
112 | - name: invoice
113 | data_type: string
114 | description: "The ID of the invoice this charge is for if one exists"
115 | - name: livemode
116 | data_type: boolean
117 | description: "Has the value true if the object exists in live mode or the value false if the object exists in test mode"
118 | - name: object
119 | data_type: string
120 | description: "String representing the object’s type. Objects of the same type share the same value."
121 | - name: on_behalf_of
122 | data_type: string
123 | description: "The account (if any) the charge was made on behalf of without triggering an automatic transfer. See the Connect documentation for details."
124 | - name: order
125 | data_type: string
126 | description: "ID of the order this charge is for if one exists"
127 | - name: outcome_network_status
128 | data_type: string
129 | description: "Network status of the transaction outcome"
130 | - name: outcome_reason
131 | data_type: string
132 | description: "Reason for the transaction outcome"
133 | - name: outcome_risk_level
134 | data_type: string
135 | description: "Risk level of the transaction outcome"
136 | - name: outcome_risk_score
137 | data_type: numeric
138 | description: "Risk score of the transaction outcome"
139 | - name: outcome_seller_message
140 | data_type: string
141 | description: "Seller message of the transaction outcome"
142 | - name: outcome_type
143 | data_type: string
144 | description: "Type of the transaction outcome"
145 | - name: paid
146 | data_type: boolean
147 | description: "True if the charge succeeded, or was successfully authorized for later capture"
148 | - name: payment_intent
149 | data_type: string
150 | description: "The ID of the PaymentIntent associated with this charge, if one exists"
151 | - name: payment_method
152 | data_type: string
153 | description: "ID of the payment method used in this charge"
154 | - name: payment_method_details_card_amount_authorized
155 | data_type: numeric
156 | description: "Amount authorized by the card payment method"
157 | - name: payment_method_details_card_brand
158 | data_type: string
159 | description: "Brand of the card payment method"
160 | - name: payment_method_details_card_checks_address_line1_check
161 | data_type: string
162 | description: "Address line 1 check result of the card payment method"
163 | - name: payment_method_details_card_checks_address_postal_code_check
164 | data_type: string
165 | description: "Postal code check result of the card payment method"
166 | - name: payment_method_details_card_checks_cvc_check
167 | data_type: string
168 | description: "CVC check result of the card payment method"
169 | - name: payment_method_details_card_country
170 | data_type: string
171 | description: "Country of the card payment method"
172 | - name: payment_method_details_card_exp_month
173 | data_type: integer
174 | description: "Expiration month of the card payment method"
175 | - name: payment_method_details_card_exp_year
176 | data_type: integer
177 | description: "Expiration year of the card payment method"
178 | - name: payment_method_details_card_extended_authorization_status
179 | data_type: string
180 | description: "Extended authorization status of the card payment method"
181 | - name: payment_method_details_card_fingerprint
182 | data_type: string
183 | description: "Fingerprint of the card payment method"
184 | - name: payment_method_details_card_funding
185 | data_type: string
186 | description: "Funding type of the card payment method"
187 | - name: payment_method_details_card_incremental_authorization_status
188 | data_type: string
189 | description: "Incremental authorization status of the card payment method"
190 | - name: payment_method_details_card_installments
191 | data_type: integer
192 | description: "Number of installments for the card payment method"
193 | - name: payment_method_details_card_last4
194 | data_type: string
195 | description: "Last 4 digits of the card payment method"
196 | - name: payment_method_details_card_mandate
197 | data_type: string
198 | description: "Mandate of the card payment method"
199 | - name: payment_method_details_card_multicapture_status
200 | data_type: string
201 | description: "Multicapture status of the card payment method"
202 | - name: payment_method_details_card_network
203 | data_type: string
204 | description: "Network of the card payment method"
205 | - name: payment_method_details_card_network_token_used
206 | data_type: boolean
207 | description: "Flag indicating if a network token was used with the card payment method"
208 | - name: payment_method_details_card_overcapture_maximum_amount
209 | data_type: numeric
210 | description: "Maximum amount that can be overcaptured with the card payment method"
211 | - name: payment_method_details_card_overcapture_status
212 | data_type: string
213 | description: "Overcapture status of the card payment method"
214 | - name: payment_method_details_card_three_d_secure
215 | data_type: string
216 | description: "3D Secure status of the card payment method"
217 | - name: payment_method_details_card_wallet
218 | data_type: string
219 | description: "Wallet of the card payment method"
220 | - name: payment_method_details_type
221 | data_type: string
222 | description: "Type of the payment method details"
223 | - name: receipt_email
224 | data_type: string
225 | description: "This is the email address that the receipt for this charge was sent to"
226 | - name: receipt_number
227 | data_type: string
228 | description: "This is the transaction number that appears on email receipts sent for this charge. This attribute will be null until a receipt has been sent."
229 | - name: receipt_url
230 | data_type: string
231 | description: "This is the URL to view the receipt for this charge. The receipt is kept up-to-date to the latest state of the charge, including any refunds. If the charge is for an Invoice, the receipt will be stylized as an Invoice receipt."
232 | - name: refunded
233 | data_type: boolean
234 | description: "Whether the charge has been fully refunded. If the charge is only partially refunded, this attribute will still be false."
235 | - name: review
236 | data_type: string
237 | description: "ID of the review associated with this charge if one exists"
238 | - name: shipping
239 | data_type: string
240 | description: "Shipping information for the charge"
241 | - name: source_address_city
242 | data_type: string
243 | description: "City in the source address"
244 | - name: source_address_country
245 | data_type: string
246 | description: "Country in the source address"
247 | - name: source_address_line1
248 | data_type: string
249 | description: "Line 1 of the source address"
250 | - name: source_address_line1_check
251 | data_type: string
252 | description: "Address line 1 check result of the source"
253 | - name: source_address_line2
254 | data_type: string
255 | description: "Line 2 of the source address"
256 | - name: source_address_state
257 | data_type: string
258 | description: "State in the source address"
259 | - name: source_address_zip
260 | data_type: string
261 | description: "Zip code in the source address"
262 | - name: source_address_zip_check
263 | data_type: string
264 | description: "Zip code check result of the source"
265 | - name: source_brand
266 | data_type: string
267 | description: "Brand of the source"
268 | - name: source_country
269 | data_type: string
270 | description: "Country of the source"
271 | - name: source_customer
272 | data_type: string
273 | description: "Customer associated with the source"
274 | - name: source_cvc_check
275 | data_type: string
276 | description: "CVC check result of the source"
277 | - name: source_dynamic_last4
278 | data_type: string
279 | description: "Dynamic last 4 digits of the source"
280 | - name: source_exp_month
281 | data_type: integer
282 | description: "Expiration month of the source"
283 | - name: source_exp_year
284 | data_type: integer
285 | description: "Expiration year of the source"
286 | - name: source_fingerprint
287 | data_type: string
288 | description: "Fingerprint of the source"
289 | - name: source_funding
290 | data_type: string
291 | description: "Funding type of the source"
292 | - name: source_id
293 | data_type: string
294 | description: "ID of the source"
295 | - name: source_last4
296 | data_type: string
297 | description: "Last 4 digits of the source"
298 | - name: source_name
299 | data_type: string
300 | description: "Name in the source"
301 | - name: source_object
302 | data_type: string
303 | description: "Object type of the source"
304 | - name: source_tokenization_method
305 | data_type: string
306 | description: "Tokenization method of the source"
307 | - name: source_wallet
308 | data_type: string
309 | description: "Wallet of the source"
310 | - name: source_transfer
311 | data_type: string
312 | description: "Transfer associated with the source"
313 | - name: statement_descriptor
314 | data_type: string
315 | description: "For card charges, use statement_descriptor_suffix instead. Otherwise, you can use this value as the complete description of a charge on your customers’ statements. Must contain at least one letter, maximum 22 characters."
316 | - name: statement_descriptor_suffix
317 | data_type: string
318 | description: "Provides information about the charge that customers see on their statements. Concatenated with the prefix (shortened descriptor) or statement descriptor that’s set on the account to form the complete statement descriptor. Maximum 22 characters for the concatenated descriptor."
319 | - name: status
320 | data_type: string
321 | description: "The status of the payment is either succeeded, pending, or failed"
322 | - name: transfer_data
323 | data_type: string
324 | description: "An optional dictionary including the account to automatically transfer to as part of a destination charge"
325 | - name: transfer_group
326 | data_type: string
327 | description: "A string that identifies this transaction as part of a group"
328 |
--------------------------------------------------------------------------------