├── const.py
├── creditcard.csv
├── docker-compose.yml
├── grafana
    ├── __init__.py
    ├── grafana_db
    │   └── grafana.db
    ├── login_config
    └── provisioning
    │   └── dashboards
    │       ├── Node Exporter.json
    │       ├── Spark Metrics.json
    │       └── dashboard.yml
├── inference
    ├── __init__.py
    ├── base.py
    ├── batch_inference.py
    └── stream_inference.py
├── model
    ├── metadata
    │   ├── ._SUCCESS.crc
    │   ├── .part-00000.crc
    │   ├── _SUCCESS
    │   └── part-00000
    └── stages
    │   ├── 0_StringIndexer_d065dab91d9f
    │       ├── data
    │       │   ├── ._SUCCESS.crc
    │       │   ├── .part-00000-7abf6b25-05d4-4465-bc28-f85b21eda3b2-c000.snappy.parquet.crc
    │       │   ├── _SUCCESS
    │       │   └── part-00000-7abf6b25-05d4-4465-bc28-f85b21eda3b2-c000.snappy.parquet
    │       └── metadata
    │       │   ├── ._SUCCESS.crc
    │       │   ├── .part-00000.crc
    │       │   ├── _SUCCESS
    │       │   └── part-00000
    │   ├── 1_OneHotEncoder_de68f7eb6b38
    │       ├── data
    │       │   ├── ._SUCCESS.crc
    │       │   ├── .part-00000-8acdc128-241b-4b22-a551-aeba6a6ca387-c000.snappy.parquet.crc
    │       │   ├── _SUCCESS
    │       │   └── part-00000-8acdc128-241b-4b22-a551-aeba6a6ca387-c000.snappy.parquet
    │       └── metadata
    │       │   ├── ._SUCCESS.crc
    │       │   ├── .part-00000.crc
    │       │   ├── _SUCCESS
    │       │   └── part-00000
    │   ├── 2_Imputer_7b5e57f74a2b
    │       ├── data
    │       │   ├── ._SUCCESS.crc
    │       │   ├── .part-00000-3ba2ba2d-997b-482f-a2a7-d39eedb93fad-c000.snappy.parquet.crc
    │       │   ├── _SUCCESS
    │       │   └── part-00000-3ba2ba2d-997b-482f-a2a7-d39eedb93fad-c000.snappy.parquet
    │       └── metadata
    │       │   ├── ._SUCCESS.crc
    │       │   ├── .part-00000.crc
    │       │   ├── _SUCCESS
    │       │   └── part-00000
    │   ├── 3_VectorAssembler_441d7d1bea9d
    │       └── metadata
    │       │   ├── ._SUCCESS.crc
    │       │   ├── .part-00000.crc
    │       │   ├── _SUCCESS
    │       │   └── part-00000
    │   ├── 4_StandardScaler_5dbb14844ce4
    │       ├── data
    │       │   ├── ._SUCCESS.crc
    │       │   ├── .part-00000-27e8b4bc-d589-45cb-9991-def4d3b88a50-c000.snappy.parquet.crc
    │       │   ├── _SUCCESS
    │       │   └── part-00000-27e8b4bc-d589-45cb-9991-def4d3b88a50-c000.snappy.parquet
    │       └── metadata
    │       │   ├── ._SUCCESS.crc
    │       │   ├── .part-00000.crc
    │       │   ├── _SUCCESS
    │       │   └── part-00000
    │   ├── 5_VectorAssembler_a9434a6047bc
    │       └── metadata
    │       │   ├── ._SUCCESS.crc
    │       │   ├── .part-00000.crc
    │       │   ├── _SUCCESS
    │       │   └── part-00000
    │   └── 6_LightGBMClassifier_788cfead8c52
    │       ├── complexParams
    │           ├── .lightGBMBooster.crc
    │           └── lightGBMBooster
    │       └── metadata
    │           ├── ._SUCCESS.crc
    │           ├── .part-00000.crc
    │           ├── _SUCCESS
    │           └── part-00000
├── producer.py
├── prometheus
    ├── prometheus.yml
    └── prometheus_db
    │   ├── 01JM1D7YTFGV5Q46F581Y59V6N
    │       ├── chunks
    │       │   └── 000001
    │       ├── index
    │       ├── meta.json
    │       └── tombstones
    │   ├── 01JM4N0Q121PZKCKNVYCSJCVSZ
    │       ├── chunks
    │       │   └── 000001
    │       ├── index
    │       ├── meta.json
    │       └── tombstones
    │   ├── 01JM4V3JR55NWKNGPRKH0JB4D5
    │       ├── chunks
    │       │   └── 000001
    │       ├── index
    │       ├── meta.json
    │       └── tombstones
    │   ├── 01JM4V3K06JFK35VSGTMKRYC65
    │       ├── chunks
    │       │   └── 000001
    │       ├── index
    │       ├── meta.json
    │       └── tombstones
    │   ├── chunks_head
    │       ├── 000002
    │       └── 000003
    │   ├── lock
    │   ├── queries.active
    │   └── wal
    │       ├── 00000034
    │       ├── 00000035
    │       ├── 00000036
    │       └── checkpoint.00000033
    │           └── 00000000
├── requirements.txt
├── spec.py
├── tests
    ├── conftest.py
    └── test_inference.py
├── train.py
└── udfs.py


/const.py:
--------------------------------------------------------------------------------
1 | ROOT_PATH = "replace with your path"


--------------------------------------------------------------------------------
/creditcard.csv:
--------------------------------------------------------------------------------
 1 | "Time","V1","V2","V3","V4","V5","V6","V7","V8","V9","V10","V11","V12","V13","V14","V15","V16","V17","V18","V19","V20","V21","V22","V23","V24","V25","V26","V27","V28","Amount","Class"
 2 | 0,-1.3598071336738,-0.0727811733098497,2.53634673796914,1.37815522427443,-0.338320769942518,0.462387777762292,0.239598554061257,0.0986979012610507,0.363786969611213,0.0907941719789316,-0.551599533260813,-0.617800855762348,-0.991389847235408,-0.311169353699879,1.46817697209427,-0.470400525259478,0.207971241929242,0.0257905801985591,0.403992960255733,0.251412098239705,-0.018306777944153,0.277837575558899,-0.110473910188767,0.0669280749146731,0.128539358273528,-0.189114843888824,0.133558376740387,-0.0210530534538215,149.62,"0"
 3 | 0,1.19185711131486,0.26615071205963,0.16648011335321,0.448154078460911,0.0600176492822243,-0.0823608088155687,-0.0788029833323113,0.0851016549148104,-0.255425128109186,-0.166974414004614,1.61272666105479,1.06523531137287,0.48909501589608,-0.143772296441519,0.635558093258208,0.463917041022171,-0.114804663102346,-0.183361270123994,-0.145783041325259,-0.0690831352230203,-0.225775248033138,-0.638671952771851,0.101288021253234,-0.339846475529127,0.167170404418143,0.125894532368176,-0.00898309914322813,0.0147241691924927,2.69,"0"
 4 | 1,-1.35835406159823,-1.34016307473609,1.77320934263119,0.379779593034328,-0.503198133318193,1.80049938079263,0.791460956450422,0.247675786588991,-1.51465432260583,0.207642865216696,0.624501459424895,0.066083685268831,0.717292731410831,-0.165945922763554,2.34586494901581,-2.89008319444231,1.10996937869599,-0.121359313195888,-2.26185709530414,0.524979725224404,0.247998153469754,0.771679401917229,0.909412262347719,-0.689280956490685,-0.327641833735251,-0.139096571514147,-0.0553527940384261,-0.0597518405929204,378.66,"0"
 5 | 1,-0.966271711572087,-0.185226008082898,1.79299333957872,-0.863291275036453,-0.0103088796030823,1.24720316752486,0.23760893977178,0.377435874652262,-1.38702406270197,-0.0549519224713749,-0.226487263835401,0.178228225877303,0.507756869957169,-0.28792374549456,-0.631418117709045,-1.0596472454325,-0.684092786345479,1.96577500349538,-1.2326219700892,-0.208037781160366,-0.108300452035545,0.00527359678253453,-0.190320518742841,-1.17557533186321,0.647376034602038,-0.221928844458407,0.0627228487293033,0.0614576285006353,123.5,"0"
 6 | 2,-1.15823309349523,0.877736754848451,1.548717846511,0.403033933955121,-0.407193377311653,0.0959214624684256,0.592940745385545,-0.270532677192282,0.817739308235294,0.753074431976354,-0.822842877946363,0.53819555014995,1.3458515932154,-1.11966983471731,0.175121130008994,-0.451449182813529,-0.237033239362776,-0.0381947870352842,0.803486924960175,0.408542360392758,-0.00943069713232919,0.79827849458971,-0.137458079619063,0.141266983824769,-0.206009587619756,0.502292224181569,0.219422229513348,0.215153147499206,69.99,"0"
 7 | 2,-0.425965884412454,0.960523044882985,1.14110934232219,-0.168252079760302,0.42098688077219,-0.0297275516639742,0.476200948720027,0.260314333074874,-0.56867137571251,-0.371407196834471,1.34126198001957,0.359893837038039,-0.358090652573631,-0.137133700217612,0.517616806555742,0.401725895589603,-0.0581328233640131,0.0686531494425432,-0.0331937877876282,0.0849676720682049,-0.208253514656728,-0.559824796253248,-0.0263976679795373,-0.371426583174346,-0.232793816737034,0.105914779097957,0.253844224739337,0.0810802569229443,3.67,"0"
 8 | 4,1.22965763450793,0.141003507049326,0.0453707735899449,1.20261273673594,0.191880988597645,0.272708122899098,-0.00515900288250983,0.0812129398830894,0.464959994783886,-0.0992543211289237,-1.41690724314928,-0.153825826253651,-0.75106271556262,0.16737196252175,0.0501435942254188,-0.443586797916727,0.00282051247234708,-0.61198733994012,-0.0455750446637976,-0.21963255278686,-0.167716265815783,-0.270709726172363,-0.154103786809305,-0.780055415004671,0.75013693580659,-0.257236845917139,0.0345074297438413,0.00516776890624916,4.99,"0"
 9 | 7,-0.644269442348146,1.41796354547385,1.0743803763556,-0.492199018495015,0.948934094764157,0.428118462833089,1.12063135838353,-3.80786423873589,0.615374730667027,1.24937617815176,-0.619467796121913,0.291474353088705,1.75796421396042,-1.32386521970526,0.686132504394383,-0.0761269994382006,-1.2221273453247,-0.358221569869078,0.324504731321494,-0.156741852488285,1.94346533978412,-1.01545470979971,0.057503529867291,-0.649709005559993,-0.415266566234811,-0.0516342969262494,-1.20692108094258,-1.08533918832377,40.8,"0"
10 | 7,-0.89428608220282,0.286157196276544,-0.113192212729871,-0.271526130088604,2.6695986595986,3.72181806112751,0.370145127676916,0.851084443200905,-0.392047586798604,-0.410430432848439,-0.705116586646536,-0.110452261733098,-0.286253632470583,0.0743553603016731,-0.328783050303565,-0.210077268148783,-0.499767968800267,0.118764861004217,0.57032816746536,0.0527356691149697,-0.0734251001059225,-0.268091632235551,-0.204232669947878,1.0115918018785,0.373204680146282,-0.384157307702294,0.0117473564581996,0.14240432992147,93.2,"0"
11 | 9,-0.33826175242575,1.11959337641566,1.04436655157316,-0.222187276738296,0.49936080649727,-0.24676110061991,0.651583206489972,0.0695385865186387,-0.736727316364109,-0.366845639206541,1.01761446783262,0.836389570307029,1.00684351373408,-0.443522816876142,0.150219101422635,0.739452777052119,-0.540979921943059,0.47667726004282,0.451772964394125,0.203711454727929,-0.246913936910008,-0.633752642406113,-0.12079408408185,-0.385049925313426,-0.0697330460416923,0.0941988339514961,0.246219304619926,0.0830756493473326,3.68,"0"
12 | 


--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
  1 | ---
  2 | version: '3.8'
  3 | services:
  4 |   zookeeper:
  5 |     image: confluentinc/cp-zookeeper:7.5.0
  6 |     container_name: zookeeper
  7 |     ports:
  8 |       - "2181:2181"
  9 |     healthcheck:
 10 |       test: echo srvr | nc zookeeper 2181 || exit 1
 11 |       start_period: 10s
 12 |       retries: 20
 13 |       interval: 10s
 14 |     environment:
 15 |       ZOOKEEPER_CLIENT_PORT: 2181
 16 |       ZOOKEEPER_TICK_TIME: 2000
 17 | 
 18 |   broker:
 19 |     image: confluentinc/cp-server:7.5.0
 20 |     container_name: kafka-broker
 21 |     depends_on:
 22 |       zookeeper:
 23 |         condition: service_healthy
 24 |     ports:
 25 |       - "9092:9092"
 26 |       - "9101:9101"
 27 |     healthcheck:
 28 |       test: nc -z localhost 9092 || exit -1
 29 |       start_period: 15s
 30 |       interval: 5s
 31 |       timeout: 10s
 32 |       retries: 10
 33 |     environment:
 34 |       # ID of the broker in a cluster
 35 |       KAFKA_BROKER_ID: 1
 36 |        # Connect to Zoo Keeper for distributed coordination and leader election
 37 |       KAFKA_ZOOKEEPER_CONNECT: 'zookeeper:2181'
 38 |       KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT
 39 |       # Define how clients connect to brokers
 40 |       KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://broker:29092,PLAINTEXT_HOST://localhost:9092
 41 |       # How many copies are maintained for fault tolerance
 42 |       KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
 43 |       # Confluent Metrics Reporter for Control Center Cluster Monitoring
 44 |       KAFKA_METRIC_REPORTERS: io.confluent.metrics.reporter.ConfluentMetricsReporter
 45 |       CONFLUENT_METRICS_REPORTER_BOOTSTRAP_SERVERS: 'broker:9092'
 46 |       CONFLUENT_METRICS_REPORTER_TOPIC_REPLICAS: 1
 47 |       CONFLUENT_METRICS_ENABLE: 'true'
 48 |       # For fixing the bug replication factor 3 > the number of node
 49 |       KAFKA_CONFLUENT_BALANCER_TOPIC_REPLICATION_FACTOR: 1
 50 |       KAFKA_TRANSACTION_STATE_LOG_MIN_ISR: 1
 51 |       KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 1
 52 |       CLUSTER_ID: 'MkU3OEVBNTcwNTJENDM2Qk'
 53 | 
 54 | 
 55 |   # Confluent control center to manage Kafka
 56 |   control-center:
 57 |     image: confluentinc/cp-enterprise-control-center:7.5.0
 58 |     container_name: control-center
 59 |     depends_on:
 60 |       - broker
 61 |     ports:
 62 |       - "9021:9021"
 63 |     healthcheck:
 64 |       test: ["CMD", "curl", "-f", "http://localhost:9021/healthcheck"] # Adjust the URL and options as needed
 65 |       interval: 30s
 66 |       timeout: 10s
 67 |       retries: 3
 68 |     environment:
 69 |       CONTROL_CENTER_BOOTSTRAP_SERVERS: 'broker:29092'
 70 |       CONTROL_CENTER_REPLICATION_FACTOR: 1
 71 |       CONTROL_CENTER_INTERNAL_TOPICS_PARTITIONS: 1
 72 |       CONTROL_CENTER_CONNECT_HEALTHCHECK_ENDPOINT: '/connectors'
 73 |       CONFLUENT_METRICS_TOPIC_REPLICATION: 1
 74 |       CONTROL_CENTER_MONITORING_INTERCEPTOR_TOPIC_PARTITIONS: 1
 75 | 
 76 | 
 77 |   grafana:
 78 |     image: grafana/grafana:6.5.0
 79 |     container_name: grafana
 80 |     restart: unless-stopped
 81 |     ports:
 82 |       - 3000:3000
 83 |     env_file:
 84 |       - ./grafana/login_config
 85 |     volumes:
 86 |       - ./grafana/grafana_db:/var/lib/grafana:rw
 87 |       - ./grafana/provisioning/datasource:/etc/grafana/provisioning/datasources
 88 |       - ./grafana/provisioning/dashboards:/etc/grafana/provisioning/dashboards
 89 |     depends_on:
 90 |       - prometheus
 91 | 
 92 |   prometheus:
 93 |     image: prom/prometheus:v2.20.1
 94 |     container_name: prometheus
 95 |     restart: unless-stopped
 96 |     ports:
 97 |       - 9090:9090
 98 |     command:
 99 |       - '--config.file=/etc/prometheus/prometheus.yml'
100 |     volumes:
101 |       - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
102 |       - ./prometheus/prometheus_db:/prometheus/data:rw #needs command 'sudo
103 |     network_mode: host
104 |     pid: host
105 | 
106 |   node-exporter:
107 |     image: prom/node-exporter:v1.0.1
108 |     container_name: node-exporter
109 |     command:
110 |       - '--path.rootfs=/host'
111 |     restart: unless-stopped
112 |     network_mode: host
113 |     pid: host
114 | 


--------------------------------------------------------------------------------
/grafana/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/grafana/__init__.py


--------------------------------------------------------------------------------
/grafana/grafana_db/grafana.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/grafana/grafana_db/grafana.db


--------------------------------------------------------------------------------
/grafana/login_config:
--------------------------------------------------------------------------------
1 | GF_SECURITY_ADMIN_USER=admin
2 | GF_SECURITY_ADMIN_PASSWORD=admin
3 | GF_USERS_ALLOW_SIGN_UP=false
4 | 


--------------------------------------------------------------------------------
/grafana/provisioning/dashboards/Spark Metrics.json:
--------------------------------------------------------------------------------
   1 | {
   2 |   "annotations": {
   3 |     "list": [
   4 |       {
   5 |         "builtIn": 1,
   6 |         "datasource": "-- Grafana --",
   7 |         "enable": true,
   8 |         "hide": true,
   9 |         "iconColor": "rgba(0, 211, 255, 1)",
  10 |         "name": "Annotations & Alerts",
  11 |         "type": "dashboard"
  12 |       }
  13 |     ]
  14 |   },
  15 |   "editable": true,
  16 |   "gnetId": null,
  17 |   "graphTooltip": 0,
  18 |   "id": 9,
  19 |   "links": [],
  20 |   "panels": [
  21 |     {
  22 |       "cacheTimeout": null,
  23 |       "colorBackground": false,
  24 |       "colorValue": false,
  25 |       "colors": [
  26 |         "#299c46",
  27 |         "rgba(237, 129, 40, 0.89)",
  28 |         "#d44a3a"
  29 |       ],
  30 |       "datasource": null,
  31 |       "format": "none",
  32 |       "gauge": {
  33 |         "maxValue": 100,
  34 |         "minValue": 0,
  35 |         "show": false,
  36 |         "thresholdLabels": false,
  37 |         "thresholdMarkers": true
  38 |       },
  39 |       "gridPos": {
  40 |         "h": 5,
  41 |         "w": 4,
  42 |         "x": 0,
  43 |         "y": 0
  44 |       },
  45 |       "id": 7,
  46 |       "interval": null,
  47 |       "links": [],
  48 |       "mappingType": 1,
  49 |       "mappingTypes": [
  50 |         {
  51 |           "name": "value to text",
  52 |           "value": 1
  53 |         },
  54 |         {
  55 |           "name": "range to text",
  56 |           "value": 2
  57 |         }
  58 |       ],
  59 |       "maxDataPoints": 100,
  60 |       "nullPointMode": "connected",
  61 |       "nullText": null,
  62 |       "options": {},
  63 |       "pluginVersion": "6.5.0",
  64 |       "postfix": "",
  65 |       "postfixFontSize": "50%",
  66 |       "prefix": "",
  67 |       "prefixFontSize": "50%",
  68 |       "rangeMaps": [
  69 |         {
  70 |           "from": "null",
  71 |           "text": "N/A",
  72 |           "to": "null"
  73 |         }
  74 |       ],
  75 |       "sparkline": {
  76 |         "fillColor": "rgba(31, 118, 189, 0.18)",
  77 |         "full": false,
  78 |         "lineColor": "rgb(31, 120, 193)",
  79 |         "show": false,
  80 |         "ymax": null,
  81 |         "ymin": null
  82 |       },
  83 |       "tableColumn": "",
  84 |       "targets": [
  85 |         {
  86 |           "expr": "metrics_spark_app_driver_DAGScheduler_job_allJobs_Number{group=\"spark\", instance=\"driver\", job=\"spark-driver\"}",
  87 |           "refId": "A"
  88 |         }
  89 |       ],
  90 |       "thresholds": "",
  91 |       "timeFrom": null,
  92 |       "timeShift": null,
  93 |       "title": "All Jobs",
  94 |       "type": "singlestat",
  95 |       "valueFontSize": "80%",
  96 |       "valueMaps": [
  97 |         {
  98 |           "op": "=",
  99 |           "text": "N/A",
 100 |           "value": "null"
 101 |         }
 102 |       ],
 103 |       "valueName": "current"
 104 |     },
 105 |     {
 106 |       "cacheTimeout": null,
 107 |       "colorBackground": false,
 108 |       "colorValue": false,
 109 |       "colors": [
 110 |         "#299c46",
 111 |         "rgba(237, 129, 40, 0.89)",
 112 |         "#d44a3a"
 113 |       ],
 114 |       "datasource": null,
 115 |       "format": "none",
 116 |       "gauge": {
 117 |         "maxValue": 100,
 118 |         "minValue": 0,
 119 |         "show": false,
 120 |         "thresholdLabels": false,
 121 |         "thresholdMarkers": true
 122 |       },
 123 |       "gridPos": {
 124 |         "h": 5,
 125 |         "w": 4,
 126 |         "x": 4,
 127 |         "y": 0
 128 |       },
 129 |       "id": 6,
 130 |       "interval": null,
 131 |       "links": [],
 132 |       "mappingType": 1,
 133 |       "mappingTypes": [
 134 |         {
 135 |           "name": "value to text",
 136 |           "value": 1
 137 |         },
 138 |         {
 139 |           "name": "range to text",
 140 |           "value": 2
 141 |         }
 142 |       ],
 143 |       "maxDataPoints": 100,
 144 |       "nullPointMode": "connected",
 145 |       "nullText": null,
 146 |       "options": {},
 147 |       "pluginVersion": "6.5.0",
 148 |       "postfix": "",
 149 |       "postfixFontSize": "50%",
 150 |       "prefix": "",
 151 |       "prefixFontSize": "50%",
 152 |       "rangeMaps": [
 153 |         {
 154 |           "from": "null",
 155 |           "text": "N/A",
 156 |           "to": "null"
 157 |         }
 158 |       ],
 159 |       "sparkline": {
 160 |         "fillColor": "rgba(31, 118, 189, 0.18)",
 161 |         "full": false,
 162 |         "lineColor": "rgb(31, 120, 193)",
 163 |         "show": false,
 164 |         "ymax": null,
 165 |         "ymin": null
 166 |       },
 167 |       "tableColumn": "",
 168 |       "targets": [
 169 |         {
 170 |           "expr": "metrics_spark_app_driver_DAGScheduler_job_activeJobs_Number{group=\"spark\", instance=\"driver\", job=\"spark-driver\"}",
 171 |           "refId": "A"
 172 |         }
 173 |       ],
 174 |       "thresholds": "",
 175 |       "timeFrom": null,
 176 |       "timeShift": null,
 177 |       "title": "Active Jobs",
 178 |       "type": "singlestat",
 179 |       "valueFontSize": "80%",
 180 |       "valueMaps": [
 181 |         {
 182 |           "op": "=",
 183 |           "text": "N/A",
 184 |           "value": "null"
 185 |         }
 186 |       ],
 187 |       "valueName": "current"
 188 |     },
 189 |     {
 190 |       "cacheTimeout": null,
 191 |       "colorBackground": false,
 192 |       "colorValue": false,
 193 |       "colors": [
 194 |         "#299c46",
 195 |         "rgba(237, 129, 40, 0.89)",
 196 |         "#d44a3a"
 197 |       ],
 198 |       "datasource": null,
 199 |       "description": "No of currently running stages",
 200 |       "format": "none",
 201 |       "gauge": {
 202 |         "maxValue": 100,
 203 |         "minValue": 0,
 204 |         "show": false,
 205 |         "thresholdLabels": false,
 206 |         "thresholdMarkers": true
 207 |       },
 208 |       "gridPos": {
 209 |         "h": 5,
 210 |         "w": 4,
 211 |         "x": 8,
 212 |         "y": 0
 213 |       },
 214 |       "id": 11,
 215 |       "interval": null,
 216 |       "links": [],
 217 |       "mappingType": 1,
 218 |       "mappingTypes": [
 219 |         {
 220 |           "name": "value to text",
 221 |           "value": 1
 222 |         },
 223 |         {
 224 |           "name": "range to text",
 225 |           "value": 2
 226 |         }
 227 |       ],
 228 |       "maxDataPoints": 100,
 229 |       "nullPointMode": "connected",
 230 |       "nullText": null,
 231 |       "options": {},
 232 |       "postfix": "",
 233 |       "postfixFontSize": "50%",
 234 |       "prefix": "",
 235 |       "prefixFontSize": "50%",
 236 |       "rangeMaps": [
 237 |         {
 238 |           "from": "null",
 239 |           "text": "N/A",
 240 |           "to": "null"
 241 |         }
 242 |       ],
 243 |       "sparkline": {
 244 |         "fillColor": "rgba(31, 118, 189, 0.18)",
 245 |         "full": false,
 246 |         "lineColor": "rgb(31, 120, 193)",
 247 |         "show": false,
 248 |         "ymax": null,
 249 |         "ymin": null
 250 |       },
 251 |       "tableColumn": "",
 252 |       "targets": [
 253 |         {
 254 |           "expr": "metrics_master_workers_Number{group=\"spark\", instance=\"master\", job=\"spark-master\"} ",
 255 |           "legendFormat": "maxMem_MB",
 256 |           "refId": "A"
 257 |         }
 258 |       ],
 259 |       "thresholds": "",
 260 |       "timeFrom": null,
 261 |       "timeShift": null,
 262 |       "title": "No Of Workers",
 263 |       "type": "singlestat",
 264 |       "valueFontSize": "80%",
 265 |       "valueMaps": [
 266 |         {
 267 |           "op": "=",
 268 |           "text": "N/A",
 269 |           "value": "null"
 270 |         }
 271 |       ],
 272 |       "valueName": "current"
 273 |     },
 274 |     {
 275 |       "cacheTimeout": null,
 276 |       "colorBackground": false,
 277 |       "colorValue": false,
 278 |       "colors": [
 279 |         "#299c46",
 280 |         "rgba(237, 129, 40, 0.89)",
 281 |         "#d44a3a"
 282 |       ],
 283 |       "datasource": null,
 284 |       "description": "No of currently running stages",
 285 |       "format": "none",
 286 |       "gauge": {
 287 |         "maxValue": 100,
 288 |         "minValue": 0,
 289 |         "show": false,
 290 |         "thresholdLabels": false,
 291 |         "thresholdMarkers": true
 292 |       },
 293 |       "gridPos": {
 294 |         "h": 5,
 295 |         "w": 4,
 296 |         "x": 12,
 297 |         "y": 0
 298 |       },
 299 |       "id": 10,
 300 |       "interval": null,
 301 |       "links": [],
 302 |       "mappingType": 1,
 303 |       "mappingTypes": [
 304 |         {
 305 |           "name": "value to text",
 306 |           "value": 1
 307 |         },
 308 |         {
 309 |           "name": "range to text",
 310 |           "value": 2
 311 |         }
 312 |       ],
 313 |       "maxDataPoints": 100,
 314 |       "nullPointMode": "connected",
 315 |       "nullText": null,
 316 |       "options": {},
 317 |       "postfix": "",
 318 |       "postfixFontSize": "50%",
 319 |       "prefix": "",
 320 |       "prefixFontSize": "50%",
 321 |       "rangeMaps": [
 322 |         {
 323 |           "from": "null",
 324 |           "text": "N/A",
 325 |           "to": "null"
 326 |         }
 327 |       ],
 328 |       "sparkline": {
 329 |         "fillColor": "rgba(31, 118, 189, 0.18)",
 330 |         "full": false,
 331 |         "lineColor": "rgb(31, 120, 193)",
 332 |         "show": false,
 333 |         "ymax": null,
 334 |         "ymin": null
 335 |       },
 336 |       "tableColumn": "",
 337 |       "targets": [
 338 |         {
 339 |           "expr": "metrics_master_workers_Number{group=\"spark\", instance=\"master\", job=\"spark-master\"}  ",
 340 |           "legendFormat": "maxMem_MB",
 341 |           "refId": "A"
 342 |         }
 343 |       ],
 344 |       "thresholds": "",
 345 |       "timeFrom": null,
 346 |       "timeShift": null,
 347 |       "title": "No Of Alive Workers",
 348 |       "type": "singlestat",
 349 |       "valueFontSize": "80%",
 350 |       "valueMaps": [
 351 |         {
 352 |           "op": "=",
 353 |           "text": "N/A",
 354 |           "value": "null"
 355 |         }
 356 |       ],
 357 |       "valueName": "current"
 358 |     },
 359 |     {
 360 |       "cacheTimeout": null,
 361 |       "colorBackground": false,
 362 |       "colorValue": false,
 363 |       "colors": [
 364 |         "#299c46",
 365 |         "rgba(237, 129, 40, 0.89)",
 366 |         "#d44a3a"
 367 |       ],
 368 |       "datasource": null,
 369 |       "description": "No of currently running stages",
 370 |       "format": "none",
 371 |       "gauge": {
 372 |         "maxValue": 100,
 373 |         "minValue": 0,
 374 |         "show": false,
 375 |         "thresholdLabels": false,
 376 |         "thresholdMarkers": true
 377 |       },
 378 |       "gridPos": {
 379 |         "h": 5,
 380 |         "w": 4,
 381 |         "x": 16,
 382 |         "y": 0
 383 |       },
 384 |       "id": 12,
 385 |       "interval": null,
 386 |       "links": [],
 387 |       "mappingType": 1,
 388 |       "mappingTypes": [
 389 |         {
 390 |           "name": "value to text",
 391 |           "value": 1
 392 |         },
 393 |         {
 394 |           "name": "range to text",
 395 |           "value": 2
 396 |         }
 397 |       ],
 398 |       "maxDataPoints": 100,
 399 |       "nullPointMode": "connected",
 400 |       "nullText": null,
 401 |       "options": {},
 402 |       "pluginVersion": "6.5.0",
 403 |       "postfix": "",
 404 |       "postfixFontSize": "50%",
 405 |       "prefix": "",
 406 |       "prefixFontSize": "50%",
 407 |       "rangeMaps": [
 408 |         {
 409 |           "from": "null",
 410 |           "text": "N/A",
 411 |           "to": "null"
 412 |         }
 413 |       ],
 414 |       "sparkline": {
 415 |         "fillColor": "rgba(31, 118, 189, 0.18)",
 416 |         "full": false,
 417 |         "lineColor": "rgb(31, 120, 193)",
 418 |         "show": false,
 419 |         "ymax": null,
 420 |         "ymin": null
 421 |       },
 422 |       "tableColumn": "",
 423 |       "targets": [
 424 |         {
 425 |           "expr": "metrics_worker_coresFree_Number{ group=\"spark\", instance=\"worker\", job=\"spark-worker\"}  ",
 426 |           "legendFormat": "maxMem_MB",
 427 |           "refId": "A"
 428 |         }
 429 |       ],
 430 |       "thresholds": "",
 431 |       "timeFrom": null,
 432 |       "timeShift": null,
 433 |       "title": "No Of Cores Free By Worker",
 434 |       "type": "singlestat",
 435 |       "valueFontSize": "80%",
 436 |       "valueMaps": [
 437 |         {
 438 |           "op": "=",
 439 |           "text": "N/A",
 440 |           "value": "null"
 441 |         }
 442 |       ],
 443 |       "valueName": "current"
 444 |     },
 445 |     {
 446 |       "cacheTimeout": null,
 447 |       "colorBackground": false,
 448 |       "colorValue": false,
 449 |       "colors": [
 450 |         "#299c46",
 451 |         "rgba(237, 129, 40, 0.89)",
 452 |         "#d44a3a"
 453 |       ],
 454 |       "datasource": null,
 455 |       "description": "No of currently running stages",
 456 |       "format": "none",
 457 |       "gauge": {
 458 |         "maxValue": 100,
 459 |         "minValue": 0,
 460 |         "show": false,
 461 |         "thresholdLabels": false,
 462 |         "thresholdMarkers": true
 463 |       },
 464 |       "gridPos": {
 465 |         "h": 5,
 466 |         "w": 4,
 467 |         "x": 20,
 468 |         "y": 0
 469 |       },
 470 |       "id": 13,
 471 |       "interval": null,
 472 |       "links": [],
 473 |       "mappingType": 1,
 474 |       "mappingTypes": [
 475 |         {
 476 |           "name": "value to text",
 477 |           "value": 1
 478 |         },
 479 |         {
 480 |           "name": "range to text",
 481 |           "value": 2
 482 |         }
 483 |       ],
 484 |       "maxDataPoints": 100,
 485 |       "nullPointMode": "connected",
 486 |       "nullText": null,
 487 |       "options": {},
 488 |       "pluginVersion": "6.5.0",
 489 |       "postfix": "",
 490 |       "postfixFontSize": "50%",
 491 |       "prefix": "",
 492 |       "prefixFontSize": "50%",
 493 |       "rangeMaps": [
 494 |         {
 495 |           "from": "null",
 496 |           "text": "N/A",
 497 |           "to": "null"
 498 |         }
 499 |       ],
 500 |       "sparkline": {
 501 |         "fillColor": "rgba(31, 118, 189, 0.18)",
 502 |         "full": false,
 503 |         "lineColor": "rgb(31, 120, 193)",
 504 |         "show": false,
 505 |         "ymax": null,
 506 |         "ymin": null
 507 |       },
 508 |       "tableColumn": "",
 509 |       "targets": [
 510 |         {
 511 |           "expr": "metrics_worker_coresUsed_Number{ group=\"spark\", instance=\"worker\", job=\"spark-worker\"}  ",
 512 |           "legendFormat": "",
 513 |           "refId": "A"
 514 |         }
 515 |       ],
 516 |       "thresholds": "",
 517 |       "timeFrom": null,
 518 |       "timeShift": null,
 519 |       "title": "No Of Cores Used By Worker",
 520 |       "type": "singlestat",
 521 |       "valueFontSize": "80%",
 522 |       "valueMaps": [
 523 |         {
 524 |           "op": "=",
 525 |           "text": "N/A",
 526 |           "value": "null"
 527 |         }
 528 |       ],
 529 |       "valueName": "current"
 530 |     },
 531 |     {
 532 |       "aliasColors": {},
 533 |       "bars": false,
 534 |       "dashLength": 10,
 535 |       "dashes": false,
 536 |       "datasource": null,
 537 |       "description": "No of currently running stages",
 538 |       "fill": 1,
 539 |       "fillGradient": 0,
 540 |       "gridPos": {
 541 |         "h": 8,
 542 |         "w": 8,
 543 |         "x": 0,
 544 |         "y": 5
 545 |       },
 546 |       "hiddenSeries": false,
 547 |       "id": 2,
 548 |       "legend": {
 549 |         "avg": false,
 550 |         "current": false,
 551 |         "max": false,
 552 |         "min": false,
 553 |         "show": true,
 554 |         "total": false,
 555 |         "values": false
 556 |       },
 557 |       "lines": true,
 558 |       "linewidth": 1,
 559 |       "nullPointMode": "null",
 560 |       "options": {
 561 |         "dataLinks": []
 562 |       },
 563 |       "percentage": false,
 564 |       "pointradius": 2,
 565 |       "points": false,
 566 |       "renderer": "flot",
 567 |       "seriesOverrides": [],
 568 |       "spaceLength": 10,
 569 |       "stack": false,
 570 |       "steppedLine": false,
 571 |       "targets": [
 572 |         {
 573 |           "expr": "metrics_spark_app_driver_DAGScheduler_stage_runningStages_Number{group=\"spark\", instance=\"driver\", job=\"spark-driver\"} ",
 574 |           "legendFormat": "Running Stages",
 575 |           "refId": "A"
 576 |         },
 577 |         {
 578 |           "expr": "metrics_spark_app_driver_DAGScheduler_stage_waitingStages_Number{group=\"spark\", instance=\"driver\", job=\"spark-driver\"} ",
 579 |           "legendFormat": "Waiting Stages",
 580 |           "refId": "B"
 581 |         },
 582 |         {
 583 |           "expr": "metrics_spark_app_driver_DAGScheduler_stage_failedStages_Number{group=\"spark\", instance=\"driver\", job=\"spark-driver\"} ",
 584 |           "legendFormat": "Failed Stages",
 585 |           "refId": "C"
 586 |         }
 587 |       ],
 588 |       "thresholds": [],
 589 |       "timeFrom": null,
 590 |       "timeRegions": [],
 591 |       "timeShift": null,
 592 |       "title": "Spark Driver Running Stages",
 593 |       "tooltip": {
 594 |         "shared": true,
 595 |         "sort": 0,
 596 |         "value_type": "individual"
 597 |       },
 598 |       "type": "graph",
 599 |       "xaxis": {
 600 |         "buckets": null,
 601 |         "mode": "time",
 602 |         "name": null,
 603 |         "show": true,
 604 |         "values": []
 605 |       },
 606 |       "yaxes": [
 607 |         {
 608 |           "format": "short",
 609 |           "label": null,
 610 |           "logBase": 1,
 611 |           "max": null,
 612 |           "min": null,
 613 |           "show": true
 614 |         },
 615 |         {
 616 |           "format": "short",
 617 |           "label": null,
 618 |           "logBase": 1,
 619 |           "max": null,
 620 |           "min": null,
 621 |           "show": true
 622 |         }
 623 |       ],
 624 |       "yaxis": {
 625 |         "align": false,
 626 |         "alignLevel": null
 627 |       }
 628 |     },
 629 |     {
 630 |       "aliasColors": {},
 631 |       "bars": false,
 632 |       "dashLength": 10,
 633 |       "dashes": false,
 634 |       "datasource": null,
 635 |       "description": "No of currently running stages",
 636 |       "fill": 1,
 637 |       "fillGradient": 0,
 638 |       "gridPos": {
 639 |         "h": 8,
 640 |         "w": 8,
 641 |         "x": 8,
 642 |         "y": 5
 643 |       },
 644 |       "hiddenSeries": false,
 645 |       "id": 8,
 646 |       "legend": {
 647 |         "avg": false,
 648 |         "current": false,
 649 |         "max": false,
 650 |         "min": false,
 651 |         "show": true,
 652 |         "total": false,
 653 |         "values": false
 654 |       },
 655 |       "lines": true,
 656 |       "linewidth": 1,
 657 |       "nullPointMode": "null",
 658 |       "options": {
 659 |         "dataLinks": []
 660 |       },
 661 |       "percentage": false,
 662 |       "pointradius": 2,
 663 |       "points": false,
 664 |       "renderer": "flot",
 665 |       "seriesOverrides": [],
 666 |       "spaceLength": 10,
 667 |       "stack": false,
 668 |       "steppedLine": false,
 669 |       "targets": [
 670 |         {
 671 |           "expr": "metrics_spark_app_driver_DAGScheduler_messageProcessingTime_Max{group=\"spark\", instance=\"driver\", job=\"spark-driver\"} ",
 672 |           "legendFormat": "Max",
 673 |           "refId": "A"
 674 |         },
 675 |         {
 676 |           "expr": "metrics_spark_app_driver_DAGScheduler_messageProcessingTime_Min{group=\"spark\", instance=\"driver\", job=\"spark-driver\"} ",
 677 |           "legendFormat": "Min",
 678 |           "refId": "B"
 679 |         },
 680 |         {
 681 |           "expr": "metrics_spark_app_driver_DAGScheduler_messageProcessingTime_Mean{group=\"spark\", instance=\"driver\", job=\"spark-driver\"} ",
 682 |           "legendFormat": "Mean",
 683 |           "refId": "C"
 684 |         },
 685 |         {
 686 |           "expr": "metrics_spark_app_driver_DAGScheduler_messageProcessingTime_StdDev{group=\"spark\", instance=\"driver\", job=\"spark-driver\"} ",
 687 |           "legendFormat": "StdDev",
 688 |           "refId": "D"
 689 |         }
 690 |       ],
 691 |       "thresholds": [],
 692 |       "timeFrom": null,
 693 |       "timeRegions": [],
 694 |       "timeShift": null,
 695 |       "title": "Message Processing Time",
 696 |       "tooltip": {
 697 |         "shared": true,
 698 |         "sort": 0,
 699 |         "value_type": "individual"
 700 |       },
 701 |       "type": "graph",
 702 |       "xaxis": {
 703 |         "buckets": null,
 704 |         "mode": "time",
 705 |         "name": null,
 706 |         "show": true,
 707 |         "values": []
 708 |       },
 709 |       "yaxes": [
 710 |         {
 711 |           "format": "short",
 712 |           "label": null,
 713 |           "logBase": 1,
 714 |           "max": null,
 715 |           "min": null,
 716 |           "show": true
 717 |         },
 718 |         {
 719 |           "format": "short",
 720 |           "label": null,
 721 |           "logBase": 1,
 722 |           "max": null,
 723 |           "min": null,
 724 |           "show": true
 725 |         }
 726 |       ],
 727 |       "yaxis": {
 728 |         "align": false,
 729 |         "alignLevel": null
 730 |       }
 731 |     },
 732 |     {
 733 |       "aliasColors": {},
 734 |       "bars": false,
 735 |       "dashLength": 10,
 736 |       "dashes": false,
 737 |       "datasource": null,
 738 |       "description": "No of currently running stages",
 739 |       "fill": 1,
 740 |       "fillGradient": 0,
 741 |       "gridPos": {
 742 |         "h": 8,
 743 |         "w": 8,
 744 |         "x": 16,
 745 |         "y": 5
 746 |       },
 747 |       "hiddenSeries": false,
 748 |       "id": 9,
 749 |       "legend": {
 750 |         "avg": false,
 751 |         "current": false,
 752 |         "max": false,
 753 |         "min": false,
 754 |         "show": true,
 755 |         "total": false,
 756 |         "values": false
 757 |       },
 758 |       "lines": true,
 759 |       "linewidth": 1,
 760 |       "nullPointMode": "null",
 761 |       "options": {
 762 |         "dataLinks": []
 763 |       },
 764 |       "percentage": false,
 765 |       "pointradius": 2,
 766 |       "points": false,
 767 |       "renderer": "flot",
 768 |       "seriesOverrides": [],
 769 |       "spaceLength": 10,
 770 |       "stack": false,
 771 |       "steppedLine": false,
 772 |       "targets": [
 773 |         {
 774 |           "expr": "metrics_spark_app_driver_BlockManager_memory_maxMem_MB_Number{group=\"spark\", instance=\"driver\", job=\"spark-driver\"} ",
 775 |           "legendFormat": "maxMem_MB",
 776 |           "refId": "A"
 777 |         },
 778 |         {
 779 |           "expr": "metrics_spark_app_driver_BlockManager_disk_diskSpaceUsed_MB_Number{group=\"spark\", instance=\"driver\", job=\"spark-driver\"} ",
 780 |           "legendFormat": "diskSpaceUsed_MB",
 781 |           "refId": "D"
 782 |         },
 783 |         {
 784 |           "expr": "metrics_spark_app_driver_BlockManager_memory_maxOnHeapMem_MB_Number{group=\"spark\", instance=\"driver\", job=\"spark-driver\"} ",
 785 |           "legendFormat": "maxOnHeapMem_MB",
 786 |           "refId": "B"
 787 |         },
 788 |         {
 789 |           "expr": "metrics_spark_app_driver_BlockManager_memory_maxOffHeapMem_MB_Number{group=\"spark\", instance=\"driver\", job=\"spark-driver\"} ",
 790 |           "legendFormat": "maxOffHeapMem_MB",
 791 |           "refId": "C"
 792 |         }
 793 |       ],
 794 |       "thresholds": [],
 795 |       "timeFrom": null,
 796 |       "timeRegions": [],
 797 |       "timeShift": null,
 798 |       "title": "BlockManager",
 799 |       "tooltip": {
 800 |         "shared": true,
 801 |         "sort": 0,
 802 |         "value_type": "individual"
 803 |       },
 804 |       "type": "graph",
 805 |       "xaxis": {
 806 |         "buckets": null,
 807 |         "mode": "time",
 808 |         "name": null,
 809 |         "show": true,
 810 |         "values": []
 811 |       },
 812 |       "yaxes": [
 813 |         {
 814 |           "format": "short",
 815 |           "label": null,
 816 |           "logBase": 1,
 817 |           "max": null,
 818 |           "min": null,
 819 |           "show": true
 820 |         },
 821 |         {
 822 |           "format": "short",
 823 |           "label": null,
 824 |           "logBase": 1,
 825 |           "max": null,
 826 |           "min": null,
 827 |           "show": true
 828 |         }
 829 |       ],
 830 |       "yaxis": {
 831 |         "align": false,
 832 |         "alignLevel": null
 833 |       }
 834 |     },
 835 |     {
 836 |       "aliasColors": {},
 837 |       "bars": false,
 838 |       "cacheTimeout": null,
 839 |       "dashLength": 10,
 840 |       "dashes": false,
 841 |       "datasource": null,
 842 |       "description": "No of currently running stages",
 843 |       "fill": 1,
 844 |       "fillGradient": 0,
 845 |       "gridPos": {
 846 |         "h": 6,
 847 |         "w": 12,
 848 |         "x": 0,
 849 |         "y": 13
 850 |       },
 851 |       "hiddenSeries": false,
 852 |       "id": 14,
 853 |       "legend": {
 854 |         "avg": false,
 855 |         "current": false,
 856 |         "max": false,
 857 |         "min": false,
 858 |         "show": true,
 859 |         "total": false,
 860 |         "values": false
 861 |       },
 862 |       "lines": true,
 863 |       "linewidth": 1,
 864 |       "links": [],
 865 |       "nullPointMode": "null",
 866 |       "options": {
 867 |         "dataLinks": []
 868 |       },
 869 |       "percentage": false,
 870 |       "pluginVersion": "6.5.0",
 871 |       "pointradius": 2,
 872 |       "points": false,
 873 |       "renderer": "flot",
 874 |       "seriesOverrides": [],
 875 |       "spaceLength": 10,
 876 |       "stack": false,
 877 |       "steppedLine": false,
 878 |       "targets": [
 879 |         {
 880 |           "expr": "metrics_worker_memFree_MB_Number{ group=\"spark\", instance=\"worker\", job=\"spark-worker\"}  ",
 881 |           "legendFormat": "memFree_MB",
 882 |           "refId": "A"
 883 |         },
 884 |         {
 885 |           "expr": "metrics_worker_memUsed_MB_Number{ group=\"spark\", instance=\"worker\", job=\"spark-worker\"}  ",
 886 |           "legendFormat": "memUsed_MB",
 887 |           "refId": "B"
 888 |         }
 889 |       ],
 890 |       "thresholds": [],
 891 |       "timeFrom": null,
 892 |       "timeRegions": [],
 893 |       "timeShift": null,
 894 |       "title": "Memory Free/Used by Worker",
 895 |       "tooltip": {
 896 |         "shared": true,
 897 |         "sort": 0,
 898 |         "value_type": "individual"
 899 |       },
 900 |       "type": "graph",
 901 |       "xaxis": {
 902 |         "buckets": null,
 903 |         "mode": "time",
 904 |         "name": null,
 905 |         "show": true,
 906 |         "values": []
 907 |       },
 908 |       "yaxes": [
 909 |         {
 910 |           "format": "short",
 911 |           "label": null,
 912 |           "logBase": 1,
 913 |           "max": null,
 914 |           "min": null,
 915 |           "show": true
 916 |         },
 917 |         {
 918 |           "format": "short",
 919 |           "label": null,
 920 |           "logBase": 1,
 921 |           "max": null,
 922 |           "min": null,
 923 |           "show": true
 924 |         }
 925 |       ],
 926 |       "yaxis": {
 927 |         "align": false,
 928 |         "alignLevel": null
 929 |       }
 930 |     },
 931 |     {
 932 |       "aliasColors": {},
 933 |       "bars": false,
 934 |       "cacheTimeout": null,
 935 |       "dashLength": 10,
 936 |       "dashes": false,
 937 |       "datasource": null,
 938 |       "description": "No of currently running stages",
 939 |       "fill": 1,
 940 |       "fillGradient": 0,
 941 |       "gridPos": {
 942 |         "h": 6,
 943 |         "w": 12,
 944 |         "x": 12,
 945 |         "y": 13
 946 |       },
 947 |       "hiddenSeries": false,
 948 |       "id": 15,
 949 |       "legend": {
 950 |         "avg": false,
 951 |         "current": false,
 952 |         "max": false,
 953 |         "min": false,
 954 |         "show": true,
 955 |         "total": false,
 956 |         "values": false
 957 |       },
 958 |       "lines": true,
 959 |       "linewidth": 1,
 960 |       "links": [],
 961 |       "nullPointMode": "null",
 962 |       "options": {
 963 |         "dataLinks": []
 964 |       },
 965 |       "percentage": false,
 966 |       "pluginVersion": "6.5.0",
 967 |       "pointradius": 2,
 968 |       "points": false,
 969 |       "renderer": "flot",
 970 |       "seriesOverrides": [],
 971 |       "spaceLength": 10,
 972 |       "stack": false,
 973 |       "steppedLine": false,
 974 |       "targets": [
 975 |         {
 976 |           "expr": "metrics_spark_app_driver_LiveListenerBus_listenerProcessingTime_org_apache_spark_HeartbeatReceiver_Count{type=\"timers\", group=\"spark\", instance=\"driver\", job=\"spark-driver\"}  ",
 977 |           "legendFormat": "Count",
 978 |           "refId": "A"
 979 |         },
 980 |         {
 981 |           "expr": "metrics_spark_app_driver_LiveListenerBus_listenerProcessingTime_org_apache_spark_HeartbeatReceiver_Max{type=\"timers\", group=\"spark\", instance=\"driver\", job=\"spark-driver\"}  ",
 982 |           "legendFormat": "Max",
 983 |           "refId": "B"
 984 |         },
 985 |         {
 986 |           "expr": "metrics_spark_app_driver_LiveListenerBus_listenerProcessingTime_org_apache_spark_HeartbeatReceiver_Mean{type=\"timers\", group=\"spark\", instance=\"driver\", job=\"spark-driver\"}  ",
 987 |           "legendFormat": "Mean",
 988 |           "refId": "C"
 989 |         },
 990 |         {
 991 |           "expr": "metrics_spark_app_driver_LiveListenerBus_listenerProcessingTime_org_apache_spark_HeartbeatReceiver_Min{type=\"timers\", group=\"spark\", instance=\"driver\", job=\"spark-driver\"}  ",
 992 |           "legendFormat": "Min",
 993 |           "refId": "D"
 994 |         }
 995 |       ],
 996 |       "thresholds": [],
 997 |       "timeFrom": null,
 998 |       "timeRegions": [],
 999 |       "timeShift": null,
1000 |       "title": "LiveListenerBus listenerProcessingTime HeartbeatReceiver",
1001 |       "tooltip": {
1002 |         "shared": true,
1003 |         "sort": 0,
1004 |         "value_type": "individual"
1005 |       },
1006 |       "type": "graph",
1007 |       "xaxis": {
1008 |         "buckets": null,
1009 |         "mode": "time",
1010 |         "name": null,
1011 |         "show": true,
1012 |         "values": []
1013 |       },
1014 |       "yaxes": [
1015 |         {
1016 |           "format": "short",
1017 |           "label": null,
1018 |           "logBase": 1,
1019 |           "max": null,
1020 |           "min": null,
1021 |           "show": true
1022 |         },
1023 |         {
1024 |           "format": "short",
1025 |           "label": null,
1026 |           "logBase": 1,
1027 |           "max": null,
1028 |           "min": null,
1029 |           "show": true
1030 |         }
1031 |       ],
1032 |       "yaxis": {
1033 |         "align": false,
1034 |         "alignLevel": null
1035 |       }
1036 |     }
1037 |   ],
1038 |   "refresh": false,
1039 |   "schemaVersion": 21,
1040 |   "style": "dark",
1041 |   "tags": [],
1042 |   "templating": {
1043 |     "list": []
1044 |   },
1045 |   "time": {
1046 |     "from": "now-15m",
1047 |     "to": "now"
1048 |   },
1049 |   "timepicker": {
1050 |     "refresh_intervals": [
1051 |       "5s",
1052 |       "10s",
1053 |       "30s",
1054 |       "1m",
1055 |       "5m",
1056 |       "15m",
1057 |       "30m",
1058 |       "1h",
1059 |       "2h",
1060 |       "1d"
1061 |     ]
1062 |   },
1063 |   "timezone": "",
1064 |   "title": "Apache Spark Metrics",
1065 |   "uid": "EWIYh-OMz",
1066 |   "version": 10
1067 | }
1068 | 


--------------------------------------------------------------------------------
/grafana/provisioning/dashboards/dashboard.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | apiVersion: 1
 3 | 
 4 | providers:
 5 | - name: 'Prometheus'
 6 |   orgId: 1
 7 |   folder: ''
 8 |   type: file
 9 |   disableDeletion: false
10 |   editable: true
11 |   options:
12 |     path: /etc/grafana/provisioning/dashboards
13 | 


--------------------------------------------------------------------------------
/inference/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/inference/__init__.py


--------------------------------------------------------------------------------
/inference/base.py:
--------------------------------------------------------------------------------
 1 | import abc
 2 | 
 3 | from synapse.ml.lightgbm import LightGBMClassifier
 4 | import pyspark.sql.types as T
 5 | import pyspark.sql.functions as F
 6 | from pyspark.ml import PipelineModel
 7 | from pyspark.sql import SparkSession
 8 | from pyspark.sql.dataframe import DataFrame
 9 | 
10 | from const import ROOT_PATH
11 | from udfs import extract_predicted_prob
12 | 
13 | 
14 | class Inference(abc.ABC):
15 |     def __init__(self, model_path, config: dict = None):
16 |         self.spark = SparkSession \
17 |             .builder \
18 |             .master('local[*]') \
19 |             .appName("inference") \
20 |             .config("spark.jars.packages",
21 |                     "org.apache.kafka:kafka-clients:3.2.1,"
22 |                     "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0,"
23 |                     "com.microsoft.azure:synapseml_2.12:1.0.2") \
24 |             .getOrCreate()
25 | 
26 |         self.config = config or {}
27 |         self.model = self.load_pipeline_model(model_path)
28 | 
29 |     @staticmethod
30 |     def load_pipeline_model(model_path):
31 |         pipeline_model = PipelineModel.load(model_path)
32 |         return pipeline_model
33 | 
34 |     @staticmethod
35 |     def preprocess_data(raw_data: DataFrame) -> DataFrame:
36 |         return raw_data
37 | 
38 |     @staticmethod
39 |     def make_prediction(pipeline: PipelineModel,
40 |                         reprocessed_data: DataFrame) -> DataFrame:
41 |         predictions = pipeline.transform(reprocessed_data)
42 |         predictions = predictions \
43 |             .withColumn("predicted_prob",
44 |                         extract_predicted_prob(F.col("probability"))) \
45 |             .withColumn("predicted_prob", F.col("predicted_prob")
46 |                         .cast(T.DoubleType()))
47 |         return predictions
48 | 
49 |     @abc.abstractmethod
50 |     def read(self) -> DataFrame:
51 |         pass
52 | 
53 |     @abc.abstractmethod
54 |     def write(self, predictions_df: DataFrame):
55 |         pass
56 | 
57 |     def run(self) -> None:
58 |         df = self.read()
59 |         preprocessed_df = self.preprocess_data(df)
60 |         predictions_df = self.make_prediction(self.model, preprocessed_df)
61 |         self.write(predictions_df)
62 | 


--------------------------------------------------------------------------------
/inference/batch_inference.py:
--------------------------------------------------------------------------------
 1 | from synapse.ml.lightgbm import LightGBMClassifier
 2 | from pyspark.sql.dataframe import DataFrame
 3 | 
 4 | from const import ROOT_PATH
 5 | from spec import output_schema
 6 | from inference.base import Inference
 7 | 
 8 | 
 9 | class BatchInference(Inference):
10 | 
11 |     def read(self) -> DataFrame:
12 |         return self.spark.read.csv(self.config["dataset"], header=True,
13 |                                    inferSchema=True).limit(10000)
14 | 
15 |     def write(self, df: DataFrame) -> None:
16 |         df.select(output_schema.names).show()
17 | 
18 | 
19 | if __name__ == "__main__":
20 |     batch_config = {
21 |         "dataset": f"{ROOT_PATH}/creditcard.csv"
22 |     }
23 | 
24 |     inference = BatchInference(f"{ROOT_PATH}/model", batch_config)
25 |     inference.run()
26 | 


--------------------------------------------------------------------------------
/inference/stream_inference.py:
--------------------------------------------------------------------------------
 1 | from synapse.ml.lightgbm import LightGBMClassifier
 2 | 
 3 | import pyspark.sql.types as T
 4 | import pyspark.sql.functions as F
 5 | from pyspark.sql.dataframe import DataFrame
 6 | 
 7 | from const import ROOT_PATH
 8 | from inference.base import Inference
 9 | from spec import input_schema, output_schema
10 | 
11 | 
12 | class StreamInference(Inference):
13 |     def read(self) -> DataFrame:
14 |         return self.spark \
15 |             .readStream \
16 |             .format("kafka") \
17 |             .option("subscribe", self.config['source']['kafka_topics']) \
18 |             .options(**self.config["source"]["kafka_options"]) \
19 |             .load()\
20 |             .select(F.from_json(F.col("value").cast("string"),
21 |                                 input_schema).alias("value"))\
22 |             .select(F.col("value.*"))
23 | 
24 |     def write(self, df: DataFrame) -> None:
25 |         (df
26 |          .select(output_schema.names)
27 |          .withColumn("value", F.to_json(F.struct(*[F.col(c) for c in
28 |                                                    output_schema.names])))
29 |          .withColumn("key", F.rand(seed=42).astype(T.StringType()))
30 |          .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
31 |          .writeStream
32 |          .trigger(processingTime="0 seconds")
33 |          .format("kafka")
34 |          .options(**self.config["sink"]["kafka_options"])
35 |          .option("topic", self.config["sink"]["sink_topic"])
36 |          .option("checkpointLocation",
37 |                  self.config["sink"]["checkpoint_location"])
38 |          .start()
39 |          .awaitTermination()
40 |          )
41 | 
42 | 
43 | if __name__ == "__main__":
44 |     stream_config = {
45 |         "source": {
46 |             "kafka_topics": "raw",
47 |             "kafka_options": {
48 |                 "kafka.bootstrap.servers": "localhost:9092",
49 |                 "startingOffsets": "latest",
50 |             }
51 |         },
52 |         "sink": {
53 |             "sink_topic": "predictions",
54 |             "checkpoint_location": f"{ROOT_PATH}/spark-streaming-checkpoint",
55 |             # Change this
56 |             # to your desired path.
57 |             "write_format": "console",
58 |             "write_output_mode": "append",  # "complete", "append" or "update",
59 |             "kafka_options": {
60 |                 "kafka.bootstrap.servers": "localhost:9092"
61 |             }
62 |         }
63 |     }
64 | 
65 |     inference = StreamInference(f"{ROOT_PATH}/model", stream_config)
66 |     inference.run()
67 | 


--------------------------------------------------------------------------------
/model/metadata/._SUCCESS.crc:
--------------------------------------------------------------------------------
1 | crc    


--------------------------------------------------------------------------------
/model/metadata/.part-00000.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/metadata/.part-00000.crc


--------------------------------------------------------------------------------
/model/metadata/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/metadata/_SUCCESS


--------------------------------------------------------------------------------
/model/metadata/part-00000:
--------------------------------------------------------------------------------
1 | {"class":"org.apache.spark.ml.PipelineModel","timestamp":1739608795458,"sparkVersion":"3.5.0","uid":"PipelineModel_5b77bd046814","paramMap":{"stageUids":["StringIndexer_d065dab91d9f","OneHotEncoder_de68f7eb6b38","Imputer_7b5e57f74a2b","VectorAssembler_441d7d1bea9d","StandardScaler_5dbb14844ce4","VectorAssembler_a9434a6047bc","LightGBMClassifier_788cfead8c52"]},"defaultParamMap":{}}
2 | 


--------------------------------------------------------------------------------
/model/stages/0_StringIndexer_d065dab91d9f/data/._SUCCESS.crc:
--------------------------------------------------------------------------------
1 | crc    


--------------------------------------------------------------------------------
/model/stages/0_StringIndexer_d065dab91d9f/data/.part-00000-7abf6b25-05d4-4465-bc28-f85b21eda3b2-c000.snappy.parquet.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/0_StringIndexer_d065dab91d9f/data/.part-00000-7abf6b25-05d4-4465-bc28-f85b21eda3b2-c000.snappy.parquet.crc


--------------------------------------------------------------------------------
/model/stages/0_StringIndexer_d065dab91d9f/data/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/0_StringIndexer_d065dab91d9f/data/_SUCCESS


--------------------------------------------------------------------------------
/model/stages/0_StringIndexer_d065dab91d9f/data/part-00000-7abf6b25-05d4-4465-bc28-f85b21eda3b2-c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/0_StringIndexer_d065dab91d9f/data/part-00000-7abf6b25-05d4-4465-bc28-f85b21eda3b2-c000.snappy.parquet


--------------------------------------------------------------------------------
/model/stages/0_StringIndexer_d065dab91d9f/metadata/._SUCCESS.crc:
--------------------------------------------------------------------------------
1 | crc    


--------------------------------------------------------------------------------
/model/stages/0_StringIndexer_d065dab91d9f/metadata/.part-00000.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/0_StringIndexer_d065dab91d9f/metadata/.part-00000.crc


--------------------------------------------------------------------------------
/model/stages/0_StringIndexer_d065dab91d9f/metadata/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/0_StringIndexer_d065dab91d9f/metadata/_SUCCESS


--------------------------------------------------------------------------------
/model/stages/0_StringIndexer_d065dab91d9f/metadata/part-00000:
--------------------------------------------------------------------------------
1 | {"class":"org.apache.spark.ml.feature.StringIndexerModel","timestamp":1739608796112,"sparkVersion":"3.5.0","uid":"StringIndexer_d065dab91d9f","paramMap":{"stringOrderType":"alphabetAsc","outputCols":["TimeIndex"],"handleInvalid":"keep","inputCols":["Time"]},"defaultParamMap":{"stringOrderType":"frequencyDesc","outputCol":"StringIndexer_d065dab91d9f__output","handleInvalid":"error"}}
2 | 


--------------------------------------------------------------------------------
/model/stages/1_OneHotEncoder_de68f7eb6b38/data/._SUCCESS.crc:
--------------------------------------------------------------------------------
1 | crc    


--------------------------------------------------------------------------------
/model/stages/1_OneHotEncoder_de68f7eb6b38/data/.part-00000-8acdc128-241b-4b22-a551-aeba6a6ca387-c000.snappy.parquet.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/1_OneHotEncoder_de68f7eb6b38/data/.part-00000-8acdc128-241b-4b22-a551-aeba6a6ca387-c000.snappy.parquet.crc


--------------------------------------------------------------------------------
/model/stages/1_OneHotEncoder_de68f7eb6b38/data/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/1_OneHotEncoder_de68f7eb6b38/data/_SUCCESS


--------------------------------------------------------------------------------
/model/stages/1_OneHotEncoder_de68f7eb6b38/data/part-00000-8acdc128-241b-4b22-a551-aeba6a6ca387-c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/1_OneHotEncoder_de68f7eb6b38/data/part-00000-8acdc128-241b-4b22-a551-aeba6a6ca387-c000.snappy.parquet


--------------------------------------------------------------------------------
/model/stages/1_OneHotEncoder_de68f7eb6b38/metadata/._SUCCESS.crc:
--------------------------------------------------------------------------------
1 | crc    


--------------------------------------------------------------------------------
/model/stages/1_OneHotEncoder_de68f7eb6b38/metadata/.part-00000.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/1_OneHotEncoder_de68f7eb6b38/metadata/.part-00000.crc


--------------------------------------------------------------------------------
/model/stages/1_OneHotEncoder_de68f7eb6b38/metadata/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/1_OneHotEncoder_de68f7eb6b38/metadata/_SUCCESS


--------------------------------------------------------------------------------
/model/stages/1_OneHotEncoder_de68f7eb6b38/metadata/part-00000:
--------------------------------------------------------------------------------
1 | {"class":"org.apache.spark.ml.feature.OneHotEncoderModel","timestamp":1739608798510,"sparkVersion":"3.5.0","uid":"OneHotEncoder_de68f7eb6b38","paramMap":{"handleInvalid":"error","outputCols":["TimeOHE"],"inputCols":["TimeIndex"],"dropLast":false},"defaultParamMap":{"outputCol":"OneHotEncoder_de68f7eb6b38__output","handleInvalid":"error","dropLast":true}}
2 | 


--------------------------------------------------------------------------------
/model/stages/2_Imputer_7b5e57f74a2b/data/._SUCCESS.crc:
--------------------------------------------------------------------------------
1 | crc    


--------------------------------------------------------------------------------
/model/stages/2_Imputer_7b5e57f74a2b/data/.part-00000-3ba2ba2d-997b-482f-a2a7-d39eedb93fad-c000.snappy.parquet.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/2_Imputer_7b5e57f74a2b/data/.part-00000-3ba2ba2d-997b-482f-a2a7-d39eedb93fad-c000.snappy.parquet.crc


--------------------------------------------------------------------------------
/model/stages/2_Imputer_7b5e57f74a2b/data/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/2_Imputer_7b5e57f74a2b/data/_SUCCESS


--------------------------------------------------------------------------------
/model/stages/2_Imputer_7b5e57f74a2b/data/part-00000-3ba2ba2d-997b-482f-a2a7-d39eedb93fad-c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/2_Imputer_7b5e57f74a2b/data/part-00000-3ba2ba2d-997b-482f-a2a7-d39eedb93fad-c000.snappy.parquet


--------------------------------------------------------------------------------
/model/stages/2_Imputer_7b5e57f74a2b/metadata/._SUCCESS.crc:
--------------------------------------------------------------------------------
1 | crc    


--------------------------------------------------------------------------------
/model/stages/2_Imputer_7b5e57f74a2b/metadata/.part-00000.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/2_Imputer_7b5e57f74a2b/metadata/.part-00000.crc


--------------------------------------------------------------------------------
/model/stages/2_Imputer_7b5e57f74a2b/metadata/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/2_Imputer_7b5e57f74a2b/metadata/_SUCCESS


--------------------------------------------------------------------------------
/model/stages/2_Imputer_7b5e57f74a2b/metadata/part-00000:
--------------------------------------------------------------------------------
1 | {"class":"org.apache.spark.ml.feature.ImputerModel","timestamp":1739608799419,"sparkVersion":"3.5.0","uid":"Imputer_7b5e57f74a2b","paramMap":{"strategy":"median","inputCols":["V1","V2","V3","V4","V5","V6","V7","V8","V9","V10","V11","V12","V13","V14","V15","V16","V17","V18","V19","V20","V21","V22","V23","V24","V25","V26","V27"],"outputCols":["V1_imputed","V2_imputed","V3_imputed","V4_imputed","V5_imputed","V6_imputed","V7_imputed","V8_imputed","V9_imputed","V10_imputed","V11_imputed","V12_imputed","V13_imputed","V14_imputed","V15_imputed","V16_imputed","V17_imputed","V18_imputed","V19_imputed","V20_imputed","V21_imputed","V22_imputed","V23_imputed","V24_imputed","V25_imputed","V26_imputed","V27_imputed"]},"defaultParamMap":{"strategy":"mean","relativeError":0.001,"missingValue":"NaN","outputCol":"Imputer_7b5e57f74a2b__output"}}
2 | 


--------------------------------------------------------------------------------
/model/stages/3_VectorAssembler_441d7d1bea9d/metadata/._SUCCESS.crc:
--------------------------------------------------------------------------------
1 | crc    


--------------------------------------------------------------------------------
/model/stages/3_VectorAssembler_441d7d1bea9d/metadata/.part-00000.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/3_VectorAssembler_441d7d1bea9d/metadata/.part-00000.crc


--------------------------------------------------------------------------------
/model/stages/3_VectorAssembler_441d7d1bea9d/metadata/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/3_VectorAssembler_441d7d1bea9d/metadata/_SUCCESS


--------------------------------------------------------------------------------
/model/stages/3_VectorAssembler_441d7d1bea9d/metadata/part-00000:
--------------------------------------------------------------------------------
1 | {"class":"org.apache.spark.ml.feature.VectorAssembler","timestamp":1739608800637,"sparkVersion":"3.5.0","uid":"VectorAssembler_441d7d1bea9d","paramMap":{"outputCol":"numerical_features","inputCols":["V1_imputed","V2_imputed","V3_imputed","V4_imputed","V5_imputed","V6_imputed","V7_imputed","V8_imputed","V9_imputed","V10_imputed","V11_imputed","V12_imputed","V13_imputed","V14_imputed","V15_imputed","V16_imputed","V17_imputed","V18_imputed","V19_imputed","V20_imputed","V21_imputed","V22_imputed","V23_imputed","V24_imputed","V25_imputed","V26_imputed","V27_imputed"]},"defaultParamMap":{"outputCol":"VectorAssembler_441d7d1bea9d__output","handleInvalid":"error"}}
2 | 


--------------------------------------------------------------------------------
/model/stages/4_StandardScaler_5dbb14844ce4/data/._SUCCESS.crc:
--------------------------------------------------------------------------------
1 | crc    


--------------------------------------------------------------------------------
/model/stages/4_StandardScaler_5dbb14844ce4/data/.part-00000-27e8b4bc-d589-45cb-9991-def4d3b88a50-c000.snappy.parquet.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/4_StandardScaler_5dbb14844ce4/data/.part-00000-27e8b4bc-d589-45cb-9991-def4d3b88a50-c000.snappy.parquet.crc


--------------------------------------------------------------------------------
/model/stages/4_StandardScaler_5dbb14844ce4/data/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/4_StandardScaler_5dbb14844ce4/data/_SUCCESS


--------------------------------------------------------------------------------
/model/stages/4_StandardScaler_5dbb14844ce4/data/part-00000-27e8b4bc-d589-45cb-9991-def4d3b88a50-c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/4_StandardScaler_5dbb14844ce4/data/part-00000-27e8b4bc-d589-45cb-9991-def4d3b88a50-c000.snappy.parquet


--------------------------------------------------------------------------------
/model/stages/4_StandardScaler_5dbb14844ce4/metadata/._SUCCESS.crc:
--------------------------------------------------------------------------------
1 | crc    


--------------------------------------------------------------------------------
/model/stages/4_StandardScaler_5dbb14844ce4/metadata/.part-00000.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/4_StandardScaler_5dbb14844ce4/metadata/.part-00000.crc


--------------------------------------------------------------------------------
/model/stages/4_StandardScaler_5dbb14844ce4/metadata/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/4_StandardScaler_5dbb14844ce4/metadata/_SUCCESS


--------------------------------------------------------------------------------
/model/stages/4_StandardScaler_5dbb14844ce4/metadata/part-00000:
--------------------------------------------------------------------------------
1 | {"class":"org.apache.spark.ml.feature.StandardScalerModel","timestamp":1739608801209,"sparkVersion":"3.5.0","uid":"StandardScaler_5dbb14844ce4","paramMap":{"inputCol":"numerical_features","outputCol":"numerical_features_scaled"},"defaultParamMap":{"withMean":false,"outputCol":"StandardScaler_5dbb14844ce4__output","withStd":true}}
2 | 


--------------------------------------------------------------------------------
/model/stages/5_VectorAssembler_a9434a6047bc/metadata/._SUCCESS.crc:
--------------------------------------------------------------------------------
1 | crc    


--------------------------------------------------------------------------------
/model/stages/5_VectorAssembler_a9434a6047bc/metadata/.part-00000.crc:
--------------------------------------------------------------------------------
1 | crc    V)F�


--------------------------------------------------------------------------------
/model/stages/5_VectorAssembler_a9434a6047bc/metadata/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/5_VectorAssembler_a9434a6047bc/metadata/_SUCCESS


--------------------------------------------------------------------------------
/model/stages/5_VectorAssembler_a9434a6047bc/metadata/part-00000:
--------------------------------------------------------------------------------
1 | {"class":"org.apache.spark.ml.feature.VectorAssembler","timestamp":1739608802396,"sparkVersion":"3.5.0","uid":"VectorAssembler_a9434a6047bc","paramMap":{"inputCols":["TimeOHE","numerical_features"],"outputCol":"features"},"defaultParamMap":{"outputCol":"VectorAssembler_a9434a6047bc__output","handleInvalid":"error"}}
2 | 


--------------------------------------------------------------------------------
/model/stages/6_LightGBMClassifier_788cfead8c52/complexParams/.lightGBMBooster.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/6_LightGBMClassifier_788cfead8c52/complexParams/.lightGBMBooster.crc


--------------------------------------------------------------------------------
/model/stages/6_LightGBMClassifier_788cfead8c52/complexParams/lightGBMBooster:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/6_LightGBMClassifier_788cfead8c52/complexParams/lightGBMBooster


--------------------------------------------------------------------------------
/model/stages/6_LightGBMClassifier_788cfead8c52/metadata/._SUCCESS.crc:
--------------------------------------------------------------------------------
1 | crc    


--------------------------------------------------------------------------------
/model/stages/6_LightGBMClassifier_788cfead8c52/metadata/.part-00000.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/6_LightGBMClassifier_788cfead8c52/metadata/.part-00000.crc


--------------------------------------------------------------------------------
/model/stages/6_LightGBMClassifier_788cfead8c52/metadata/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/6_LightGBMClassifier_788cfead8c52/metadata/_SUCCESS


--------------------------------------------------------------------------------
/model/stages/6_LightGBMClassifier_788cfead8c52/metadata/part-00000:
--------------------------------------------------------------------------------
1 | {"class":"com.microsoft.azure.synapse.ml.lightgbm.LightGBMClassificationModel","timestamp":1739608802925,"sparkVersion":"3.5.0","uid":"LightGBMClassifier_788cfead8c52","paramMap":{"featuresCol":"features","labelCol":"Class","numIterations":-1,"actualNumClasses":1,"featuresShapCol":"","startIteration":0,"probabilityCol":"probability","rawPredictionCol":"rawPrediction","predictionCol":"prediction","leafPredictionCol":"","predictDisableShapeCheck":false},"defaultParamMap":{"featuresCol":"features","labelCol":"label","numIterations":-1,"featuresShapCol":"","startIteration":0,"probabilityCol":"probability","rawPredictionCol":"rawPrediction","predictionCol":"prediction","leafPredictionCol":"","predictDisableShapeCheck":false},"complexParamLocs":{"lightGBMBooster":"complexParams/lightGBMBooster"}}
2 | 


--------------------------------------------------------------------------------
/producer.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import time
 3 | 
 4 | import pandas as pd
 5 | from datetime import datetime
 6 | 
 7 | from kafka import KafkaAdminClient, KafkaProducer
 8 | from kafka.admin import NewTopic
 9 | 
10 | from const import ROOT_PATH
11 | 
12 | 
13 | def create_topic(admin, topic_name):
14 |     # Create topic if not exists
15 |     try:
16 |         # Create Kafka topic
17 |         topic = NewTopic(name=topic_name, num_partitions=1,
18 |                          replication_factor=1)
19 |         admin.create_topics([topic])
20 |         print(f"A new topic {topic_name} has been created!")
21 |     except:
22 |         print(f"Topic {topic_name} already exists. Skipping creation!")
23 |         pass
24 | 
25 | 
26 | def create_streams(topic_name: str, servers):
27 |     producer = None
28 |     admin = None
29 |     for _ in range(10):
30 |         try:
31 |             producer = KafkaProducer(bootstrap_servers=servers)
32 |             admin = KafkaAdminClient(bootstrap_servers=servers)
33 |             print("SUCCESS: instantiated Kafka admin and producer")
34 |             break
35 |         except Exception as e:
36 |             print(
37 |                 f"Trying to instantiate admin and producer with bootstrap servers {servers} with error {e}"
38 |             )
39 |             time.sleep(10)
40 |             pass
41 | 
42 |     df = pd.read_csv(f"{ROOT_PATH}/creditcard.csv")
43 |     df = df.drop(columns="Class")
44 |     records = df.to_dict(orient="records")
45 | 
46 |     for record in records:
47 |         producer.send(
48 |             topic_name,
49 |             json.dumps(record).encode("utf-8")
50 |         )
51 |         print(record)
52 |         time.sleep(5)
53 | 
54 | 
55 | if __name__ == '__main__':
56 |     create_streams(topic_name="raw", servers="localhost:9092", )
57 | 


--------------------------------------------------------------------------------
/prometheus/prometheus.yml:
--------------------------------------------------------------------------------
 1 | # my global config
 2 | global:
 3 |   scrape_interval:     10s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
 4 |   evaluation_interval: 10s # Evaluate rules every 15 seconds. The default is every 1 minute.
 5 |   # scrape_timeout is set to the global default (10s).
 6 |   external_labels:
 7 |     monitor:  'Spark-monitoring'
 8 | 
 9 | # Alertmanager configuration
10 | alerting:
11 |   alertmanagers:
12 |   - static_configs:
13 |     - targets:
14 |       # - alertmanager:9093
15 | 
16 | # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
17 | rule_files:
18 |   # - "first_rules.yml"
19 |   # - "second_rules.yml"
20 |   # - 'prometheus.yml'
21 | # A scrape configuration containing exactly one endpoint to scrape:
22 | # Here it's Prometheus itself.
23 | scrape_configs:
24 |   # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
25 | 
26 | ##############
27 | # Prometheus #
28 | ##############
29 |   - job_name: 'prometheus'
30 |     scrape_interval: 10s
31 |     # metrics_path defaults to '/metrics'
32 |     # scheme defaults to 'http'.
33 |     static_configs:
34 |       - targets: ['localhost:9090']
35 | 
36 | ###############
37 | # Node-exporter #
38 | ###############
39 |   - job_name: 'node-exporter'
40 |     scrape_interval: 10s
41 |     static_configs:
42 |       - targets: ['node-exporter:9100']
43 |         labels:
44 |           group:  'spark'
45 |           instance: 'node-exporter'
46 | 
47 | ###############
48 | # Spark Master #
49 | ###############
50 |   - job_name: 'spark-master'
51 |     scrape_interval: 10s
52 |     metrics_path: '/metrics/master/prometheus'
53 |     static_configs:
54 |       - targets: ['spark:8080']
55 |         labels:
56 |           group:  'spark'
57 |           instance: 'master'
58 | 
59 | ###############
60 | # Spark Worker #
61 | ###############
62 |   - job_name: 'spark-worker'
63 |     scrape_interval: 10s
64 |     metrics_path: '/metrics/prometheus'
65 |     static_configs:
66 |       - targets: ['spark:8081']
67 |         labels:
68 |           group:  'spark'
69 |           instance: 'worker'
70 | 
71 | ###############
72 | # Spark Applications #
73 | ###############
74 |   - job_name: 'spark-applications'
75 |     scrape_interval: 10s
76 |     metrics_path: '/metrics/applications/prometheus'
77 |     static_configs:
78 |       - targets: ['spark:8080']
79 |         labels:
80 |           group:  'spark'
81 |           instance: 'applications'
82 | 
83 | ###############
84 | # Spark Driver #
85 | ###############
86 |   - job_name: 'spark-driver'
87 |     scrape_interval: 10s
88 |     metrics_path: '/metrics/prometheus/'
89 |     static_configs:
90 |       - targets: ['localhost:4040']
91 |         labels:
92 |           group:  'spark'
93 |           instance: 'driver'


--------------------------------------------------------------------------------
/prometheus/prometheus_db/01JM1D7YTFGV5Q46F581Y59V6N/chunks/000001:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/prometheus/prometheus_db/01JM1D7YTFGV5Q46F581Y59V6N/chunks/000001


--------------------------------------------------------------------------------
/prometheus/prometheus_db/01JM1D7YTFGV5Q46F581Y59V6N/index:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/prometheus/prometheus_db/01JM1D7YTFGV5Q46F581Y59V6N/index


--------------------------------------------------------------------------------
/prometheus/prometheus_db/01JM1D7YTFGV5Q46F581Y59V6N/meta.json:
--------------------------------------------------------------------------------
 1 | {
 2 | 	"ulid": "01JM1D7YTFGV5Q46F581Y59V6N",
 3 | 	"minTime": 1739451882655,
 4 | 	"maxTime": 1739491200000,
 5 | 	"stats": {
 6 | 		"numSamples": 627172,
 7 | 		"numSeries": 1853,
 8 | 		"numChunks": 7346
 9 | 	},
10 | 	"compaction": {
11 | 		"level": 3,
12 | 		"sources": [
13 | 			"01JM01E7GVDS62213E6J88CZH6",
14 | 			"01JM04FNYWFKPAQFK4W1HWE3DR",
15 | 			"01JM0BE48DXCMSRDTPW5ZM4Z64",
16 | 			"01JM0JFQAJDEGC8W1GQ3QSBJ9B",
17 | 			"01JM0SAGNRWVTCP5R0ETHGJC93",
18 | 			"01JM10CCCRWNY4NH1XXF230WC2"
19 | 		],
20 | 		"parents": [
21 | 			{
22 | 				"ulid": "01JM0SAGY1409W3PD84G1R08KH",
23 | 				"minTime": 1739451882655,
24 | 				"maxTime": 1739469600000
25 | 			},
26 | 			{
27 | 				"ulid": "01JM16C7KHYRAPEYR3SRTMVNZ9",
28 | 				"minTime": 1739469600000,
29 | 				"maxTime": 1739491200000
30 | 			}
31 | 		]
32 | 	},
33 | 	"version": 1
34 | }


--------------------------------------------------------------------------------
/prometheus/prometheus_db/01JM1D7YTFGV5Q46F581Y59V6N/tombstones:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/prometheus/prometheus_db/01JM1D7YTFGV5Q46F581Y59V6N/tombstones


--------------------------------------------------------------------------------
/prometheus/prometheus_db/01JM4N0Q121PZKCKNVYCSJCVSZ/chunks/000001:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/prometheus/prometheus_db/01JM4N0Q121PZKCKNVYCSJCVSZ/chunks/000001


--------------------------------------------------------------------------------
/prometheus/prometheus_db/01JM4N0Q121PZKCKNVYCSJCVSZ/index:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/prometheus/prometheus_db/01JM4N0Q121PZKCKNVYCSJCVSZ/index


--------------------------------------------------------------------------------
/prometheus/prometheus_db/01JM4N0Q121PZKCKNVYCSJCVSZ/meta.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/prometheus/prometheus_db/01JM4N0Q121PZKCKNVYCSJCVSZ/meta.json


--------------------------------------------------------------------------------
/prometheus/prometheus_db/01JM4N0Q121PZKCKNVYCSJCVSZ/tombstones:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/prometheus/prometheus_db/01JM4N0Q121PZKCKNVYCSJCVSZ/tombstones


--------------------------------------------------------------------------------
/prometheus/prometheus_db/01JM4V3JR55NWKNGPRKH0JB4D5/chunks/000001:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/prometheus/prometheus_db/01JM4V3JR55NWKNGPRKH0JB4D5/chunks/000001


--------------------------------------------------------------------------------
/prometheus/prometheus_db/01JM4V3JR55NWKNGPRKH0JB4D5/index:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/prometheus/prometheus_db/01JM4V3JR55NWKNGPRKH0JB4D5/index


--------------------------------------------------------------------------------
/prometheus/prometheus_db/01JM4V3JR55NWKNGPRKH0JB4D5/meta.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/prometheus/prometheus_db/01JM4V3JR55NWKNGPRKH0JB4D5/meta.json


--------------------------------------------------------------------------------
/prometheus/prometheus_db/01JM4V3JR55NWKNGPRKH0JB4D5/tombstones:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/prometheus/prometheus_db/01JM4V3JR55NWKNGPRKH0JB4D5/tombstones


--------------------------------------------------------------------------------
/prometheus/prometheus_db/01JM4V3K06JFK35VSGTMKRYC65/chunks/000001:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/prometheus/prometheus_db/01JM4V3K06JFK35VSGTMKRYC65/chunks/000001


--------------------------------------------------------------------------------
/prometheus/prometheus_db/01JM4V3K06JFK35VSGTMKRYC65/index:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/prometheus/prometheus_db/01JM4V3K06JFK35VSGTMKRYC65/index


--------------------------------------------------------------------------------
/prometheus/prometheus_db/01JM4V3K06JFK35VSGTMKRYC65/meta.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/prometheus/prometheus_db/01JM4V3K06JFK35VSGTMKRYC65/meta.json


--------------------------------------------------------------------------------
/prometheus/prometheus_db/01JM4V3K06JFK35VSGTMKRYC65/tombstones:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/prometheus/prometheus_db/01JM4V3K06JFK35VSGTMKRYC65/tombstones


--------------------------------------------------------------------------------
/prometheus/prometheus_db/chunks_head/000002:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/prometheus/prometheus_db/chunks_head/000002


--------------------------------------------------------------------------------
/prometheus/prometheus_db/chunks_head/000003:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/prometheus/prometheus_db/chunks_head/000003


--------------------------------------------------------------------------------
/prometheus/prometheus_db/lock:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/prometheus/prometheus_db/lock


--------------------------------------------------------------------------------
/prometheus/prometheus_db/queries.active:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/prometheus/prometheus_db/queries.active


--------------------------------------------------------------------------------
/prometheus/prometheus_db/wal/00000034:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/prometheus/prometheus_db/wal/00000034


--------------------------------------------------------------------------------
/prometheus/prometheus_db/wal/00000035:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/prometheus/prometheus_db/wal/00000035


--------------------------------------------------------------------------------
/prometheus/prometheus_db/wal/00000036:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/prometheus/prometheus_db/wal/00000036


--------------------------------------------------------------------------------
/prometheus/prometheus_db/wal/checkpoint.00000033/00000000:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/prometheus/prometheus_db/wal/checkpoint.00000033/00000000


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pyspark==3.5.0
2 | pydeequ==1.0.1
3 | python-dotenv==1.0.0
4 | nltk==3.8.1
5 | synapseml
6 | pytest


--------------------------------------------------------------------------------
/spec.py:
--------------------------------------------------------------------------------
 1 | import pyspark.sql.types as T
 2 | 
 3 | 
 4 | input_schema = T.StructType([
 5 |     T.StructField('Time', T.StringType(), True),
 6 |     T.StructField('V1', T.DoubleType(), True),
 7 |     T.StructField('V2', T.DoubleType(), True),
 8 |     T.StructField('V3', T.DoubleType(), True),
 9 |     T.StructField('V4', T.DoubleType(), True),
10 |     T.StructField('V5', T.DoubleType(), True),
11 |     T.StructField('V6', T.DoubleType(), True),
12 |     T.StructField('V7', T.DoubleType(), True),
13 |     T.StructField('V8', T.DoubleType(), True),
14 |     T.StructField('V9', T.DoubleType(), True),
15 |     T.StructField('V10', T.DoubleType(), True),
16 |     T.StructField('V11', T.DoubleType(), True),
17 |     T.StructField('V12', T.DoubleType(), True),
18 |     T.StructField('V13', T.DoubleType(), True),
19 |     T.StructField('V14', T.DoubleType(), True),
20 |     T.StructField('V15', T.DoubleType(), True),
21 |     T.StructField('V16', T.DoubleType(), True),
22 |     T.StructField('V17', T.DoubleType(), True),
23 |     T.StructField('V18', T.DoubleType(), True),
24 |     T.StructField('V19', T.DoubleType(), True),
25 |     T.StructField('V20', T.DoubleType(), True),
26 |     T.StructField('V21', T.DoubleType(), True),
27 |     T.StructField('V22', T.DoubleType(), True),
28 |     T.StructField('V23', T.DoubleType(), True),
29 |     T.StructField('V24', T.DoubleType(), True),
30 |     T.StructField('V25', T.DoubleType(), True),
31 |     T.StructField('V26', T.DoubleType(), True),
32 |     T.StructField('V27', T.DoubleType(), True),
33 | ])
34 | 
35 | output_schema = T.StructType([
36 |     T.StructField('Time', T.StringType(), True),
37 |     T.StructField('predicted_prob', T.DoubleType(), True)
38 | ])
39 | 


--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
 1 | from datetime import datetime
 2 | import pytest as pytest
 3 | 
 4 | from unittest.mock import Mock, MagicMock
 5 | from pyspark.sql.session import SparkSession
 6 | from pyspark.sql.dataframe import DataFrame
 7 | import pyspark.sql.types as T
 8 | 
 9 | 
10 | @pytest.fixture
11 | def spark() -> SparkSession:
12 |     spark = (
13 |         SparkSession.builder.master("local[1]")
14 |         .appName("local-tests")
15 |         .config("spark.executor.cores", "1")
16 |         .config("spark.executor.instances", "1")
17 |         .config("spark.sql.shuffle.partitions", "1")
18 |         .config("spark.driver.bindAddress", "127.0.0.1")
19 |         .getOrCreate()
20 |     )
21 |     yield spark
22 |     spark.stop()
23 | 
24 | 
25 | @pytest.fixture
26 | def sample_df(spark) -> DataFrame:
27 |     data = [
28 |         ("Walter", 32, "Germany", 10000.0, datetime(2023, 1, 1, 0, 1), 'segment1', '20230101', 0),
29 |         ("Nic", 12, "England", 2.0, datetime(2023, 1, 1, 0, 2), 'segment1', '20230101', 0)
30 |     ]
31 | 
32 |     schema = T.StructType(
33 |         [
34 |             T.StructType("name", T.StructType(), False),
35 |             T.StructType("age", T.StructType(), False),
36 |             T.StructType("country", T.StructType(), False),
37 |             T.StructType("salary", T.StructType(), False),
38 |             T.StructType("time", T.StructType(), False),
39 |             T.StructType("segment", T.StructType(), False),
40 |             T.StructType("date", T.StructType(), False),
41 |             T.StructType("hour", T.StructType(), False),
42 |         ]
43 |     )
44 |     return spark.createDataFrame(data, schema)
45 | 
46 | 
47 | @pytest.fixture
48 | def spark_mock(sample_df):
49 |     spark_mock = Mock()
50 |     type(spark_mock).write = spark_mock
51 |     type(spark_mock).read = spark_mock
52 |     spark_mock.table.return_value = spark_mock
53 |     spark_mock.format.return_value = spark_mock
54 |     spark_mock.option.return_value = spark_mock
55 |     spark_mock.options.return_value = spark_mock
56 |     spark_mock.mode.return_value = spark_mock
57 |     spark_mock.save.return_value = None
58 |     spark_mock.load.return_value = sample_df
59 |     return spark_mock
60 | 
61 | 
62 | @pytest.fixture
63 | def schema_mock():
64 |     return MagicMock()
65 | 


--------------------------------------------------------------------------------
/tests/test_inference.py:
--------------------------------------------------------------------------------
 1 | from chispa.dataframe_comparer import assert_df_equality
 2 | from inference.base import Inference
 3 | 
 4 | 
 5 | def test_remove_non_word_characters_short(spark):
 6 |     input_data = [
 7 |         ("jo&&se", "jose"),
 8 |         ("**li**", "li"),
 9 |         ("#::luisa", "luisa"),
10 |         (None, None)
11 |     ]
12 |     expected_data = [
13 |         ("jo&&se", "jose"),
14 |         ("**li**", "li"),
15 |         ("#::luisa", "luisa"),
16 |         (None, None)
17 |     ]
18 | 
19 |     df = spark.createDataFrame(input_data, ["name", "name"])
20 |     actual_df = Inference.preprocess_data(df)
21 | 
22 |     expected_df = spark.createDataFrame(expected_data, ["name", "name"])
23 |     assert_df_equality(actual_df, expected_df)
24 | 


--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import pyspark.sql.types as T
  3 | import pyspark.sql.functions as F
  4 | from synapse.ml.lightgbm import LightGBMClassifier
  5 | from pyspark.sql import SparkSession
  6 | from pyspark.ml import Pipeline, PipelineModel
  7 | from pyspark.ml.feature import (StringIndexer, OneHotEncoder, VectorAssembler,
  8 |                                 Imputer, StandardScaler)
  9 | 
 10 | # Local modules
 11 | from const import ROOT_PATH
 12 | 
 13 | 
 14 | def train(dataset):
 15 |     spark = SparkSession\
 16 |         .builder\
 17 |         .master('local[*]')\
 18 |         .appName("train_model") \
 19 |         .config("spark.jars.packages", "com.microsoft.azure:synapseml_2.12:1.0.2")\
 20 |         .getOrCreate()
 21 | 
 22 |     df = spark.read.csv(dataset, header=True, inferSchema=True)
 23 |     df = df.withColumn("Class", F.col("Class").astype(T.IntegerType()))
 24 | 
 25 |     fraud_df = df.filter(F.col("Class") == 1).limit(100)
 26 |     non_fraud_df = df.filter(F.col("Class") == 0).limit(100)
 27 |     train_df = fraud_df.union(non_fraud_df)
 28 | 
 29 |     numerical_cols = ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9',
 30 |                       'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17',
 31 |                       'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25',
 32 |                       'V26', 'V27']
 33 |     categorical_cols = ['Time']
 34 |     label_column_name = 'Class'
 35 | 
 36 |     # Convert categorical string columns in a DataFrame into numerical indices
 37 |     index_cols = [c + 'Index' for c in categorical_cols]
 38 |     string_indexer = StringIndexer(
 39 |         inputCols=categorical_cols,
 40 |         outputCols=index_cols,
 41 |         stringOrderType='alphabetAsc',
 42 |         handleInvalid='keep'
 43 |     )
 44 | 
 45 |     # Apply one-hot encoding to integer indexes
 46 |     ohe_cols = [c + 'OHE' for c in categorical_cols]
 47 |     one_hot_encoder = OneHotEncoder(
 48 |         inputCols=index_cols,
 49 |         outputCols=ohe_cols,
 50 |         handleInvalid='error',
 51 |         dropLast=False
 52 |     )
 53 | 
 54 |     # Imputation transformer for completing missing values.
 55 |     imputer = Imputer(
 56 |         strategy='median',
 57 |         inputCols=numerical_cols,
 58 |         outputCols=[c + "_imputed" for c in numerical_cols]
 59 |     )
 60 | 
 61 |     # Combine numerical columns into a single vector
 62 |     vec_assembler = VectorAssembler(
 63 |         inputCols=[c + "_imputed" for c in numerical_cols],
 64 |         outputCol='numerical_features'
 65 |     )
 66 | 
 67 |     # Scale numerical features
 68 |     standard_scaler = StandardScaler(
 69 |         inputCol='numerical_features',
 70 |         outputCol='numerical_features_scaled'
 71 |     )
 72 | 
 73 |     # Combine one-hot encoded and scaled numerical features
 74 |     assembler_cols = ohe_cols + ['numerical_features']
 75 |     vec_assembler2 = VectorAssembler(
 76 |         inputCols=assembler_cols,
 77 |         outputCol='features'
 78 |     )
 79 | 
 80 |     lgb_classifier = LightGBMClassifier(
 81 |         featuresCol="features",
 82 |         labelCol=label_column_name
 83 |     )
 84 | 
 85 |     pipeline = Pipeline(
 86 |         stages=[
 87 |             string_indexer,
 88 |             one_hot_encoder,
 89 |             imputer,
 90 |             vec_assembler,
 91 |             standard_scaler,
 92 |             vec_assembler2,
 93 |             lgb_classifier
 94 |         ]
 95 |     )
 96 | 
 97 |     model = pipeline.fit(train_df)
 98 | 
 99 |     # Save the trained model to a file
100 |     model.write().overwrite().save(f"{ROOT_PATH}/model")
101 | 
102 |     # Test the trained model
103 |     pipelineModel = PipelineModel.load(f"{ROOT_PATH}/model")
104 |     df = pipelineModel.transform(train_df)
105 |     df.show()
106 | 
107 | 
108 | if __name__ == '__main__':
109 |     train(f"{ROOT_PATH}/creditcard.csv")
110 | 


--------------------------------------------------------------------------------
/udfs.py:
--------------------------------------------------------------------------------
1 | import pyspark.sql.types as T
2 | import pyspark.sql.functions as F
3 | from pyspark.ml.linalg import DenseVector
4 | 
5 | 
6 | @F.udf(T.FloatType())
7 | def extract_predicted_prob(v: DenseVector) -> float:
8 |     return float(v[1])
9 | 


--------------------------------------------------------------------------------