├── const.py ├── creditcard.csv ├── docker-compose.yml ├── grafana ├── __init__.py ├── grafana_db │ └── grafana.db ├── login_config └── provisioning │ └── dashboards │ ├── Node Exporter.json │ ├── Spark Metrics.json │ └── dashboard.yml ├── inference ├── __init__.py ├── base.py ├── batch_inference.py └── stream_inference.py ├── model ├── metadata │ ├── ._SUCCESS.crc │ ├── .part-00000.crc │ ├── _SUCCESS │ └── part-00000 └── stages │ ├── 0_StringIndexer_d065dab91d9f │ ├── data │ │ ├── ._SUCCESS.crc │ │ ├── .part-00000-7abf6b25-05d4-4465-bc28-f85b21eda3b2-c000.snappy.parquet.crc │ │ ├── _SUCCESS │ │ └── part-00000-7abf6b25-05d4-4465-bc28-f85b21eda3b2-c000.snappy.parquet │ └── metadata │ │ ├── ._SUCCESS.crc │ │ ├── .part-00000.crc │ │ ├── _SUCCESS │ │ └── part-00000 │ ├── 1_OneHotEncoder_de68f7eb6b38 │ ├── data │ │ ├── ._SUCCESS.crc │ │ ├── .part-00000-8acdc128-241b-4b22-a551-aeba6a6ca387-c000.snappy.parquet.crc │ │ ├── _SUCCESS │ │ └── part-00000-8acdc128-241b-4b22-a551-aeba6a6ca387-c000.snappy.parquet │ └── metadata │ │ ├── ._SUCCESS.crc │ │ ├── .part-00000.crc │ │ ├── _SUCCESS │ │ └── part-00000 │ ├── 2_Imputer_7b5e57f74a2b │ ├── data │ │ ├── ._SUCCESS.crc │ │ ├── .part-00000-3ba2ba2d-997b-482f-a2a7-d39eedb93fad-c000.snappy.parquet.crc │ │ ├── _SUCCESS │ │ └── part-00000-3ba2ba2d-997b-482f-a2a7-d39eedb93fad-c000.snappy.parquet │ └── metadata │ │ ├── ._SUCCESS.crc │ │ ├── .part-00000.crc │ │ ├── _SUCCESS │ │ └── part-00000 │ ├── 3_VectorAssembler_441d7d1bea9d │ └── metadata │ │ ├── ._SUCCESS.crc │ │ ├── .part-00000.crc │ │ ├── _SUCCESS │ │ └── part-00000 │ ├── 4_StandardScaler_5dbb14844ce4 │ ├── data │ │ ├── ._SUCCESS.crc │ │ ├── .part-00000-27e8b4bc-d589-45cb-9991-def4d3b88a50-c000.snappy.parquet.crc │ │ ├── _SUCCESS │ │ └── part-00000-27e8b4bc-d589-45cb-9991-def4d3b88a50-c000.snappy.parquet │ └── metadata │ │ ├── ._SUCCESS.crc │ │ ├── .part-00000.crc │ │ ├── _SUCCESS │ │ └── part-00000 │ ├── 5_VectorAssembler_a9434a6047bc │ └── metadata │ │ ├── ._SUCCESS.crc │ │ ├── .part-00000.crc │ │ ├── _SUCCESS │ │ └── part-00000 │ └── 6_LightGBMClassifier_788cfead8c52 │ ├── complexParams │ ├── .lightGBMBooster.crc │ └── lightGBMBooster │ └── metadata │ ├── ._SUCCESS.crc │ ├── .part-00000.crc │ ├── _SUCCESS │ └── part-00000 ├── producer.py ├── prometheus ├── prometheus.yml └── prometheus_db │ ├── 01JM1D7YTFGV5Q46F581Y59V6N │ ├── chunks │ │ └── 000001 │ ├── index │ ├── meta.json │ └── tombstones │ ├── 01JM4N0Q121PZKCKNVYCSJCVSZ │ ├── chunks │ │ └── 000001 │ ├── index │ ├── meta.json │ └── tombstones │ ├── 01JM4V3JR55NWKNGPRKH0JB4D5 │ ├── chunks │ │ └── 000001 │ ├── index │ ├── meta.json │ └── tombstones │ ├── 01JM4V3K06JFK35VSGTMKRYC65 │ ├── chunks │ │ └── 000001 │ ├── index │ ├── meta.json │ └── tombstones │ ├── chunks_head │ ├── 000002 │ └── 000003 │ ├── lock │ ├── queries.active │ └── wal │ ├── 00000034 │ ├── 00000035 │ ├── 00000036 │ └── checkpoint.00000033 │ └── 00000000 ├── requirements.txt ├── spec.py ├── tests ├── conftest.py └── test_inference.py ├── train.py └── udfs.py /const.py: -------------------------------------------------------------------------------- 1 | ROOT_PATH = "replace with your path" -------------------------------------------------------------------------------- /creditcard.csv: -------------------------------------------------------------------------------- 1 | "Time","V1","V2","V3","V4","V5","V6","V7","V8","V9","V10","V11","V12","V13","V14","V15","V16","V17","V18","V19","V20","V21","V22","V23","V24","V25","V26","V27","V28","Amount","Class" 2 | 
0,-1.3598071336738,-0.0727811733098497,2.53634673796914,1.37815522427443,-0.338320769942518,0.462387777762292,0.239598554061257,0.0986979012610507,0.363786969611213,0.0907941719789316,-0.551599533260813,-0.617800855762348,-0.991389847235408,-0.311169353699879,1.46817697209427,-0.470400525259478,0.207971241929242,0.0257905801985591,0.403992960255733,0.251412098239705,-0.018306777944153,0.277837575558899,-0.110473910188767,0.0669280749146731,0.128539358273528,-0.189114843888824,0.133558376740387,-0.0210530534538215,149.62,"0" 3 | 0,1.19185711131486,0.26615071205963,0.16648011335321,0.448154078460911,0.0600176492822243,-0.0823608088155687,-0.0788029833323113,0.0851016549148104,-0.255425128109186,-0.166974414004614,1.61272666105479,1.06523531137287,0.48909501589608,-0.143772296441519,0.635558093258208,0.463917041022171,-0.114804663102346,-0.183361270123994,-0.145783041325259,-0.0690831352230203,-0.225775248033138,-0.638671952771851,0.101288021253234,-0.339846475529127,0.167170404418143,0.125894532368176,-0.00898309914322813,0.0147241691924927,2.69,"0" 4 | 1,-1.35835406159823,-1.34016307473609,1.77320934263119,0.379779593034328,-0.503198133318193,1.80049938079263,0.791460956450422,0.247675786588991,-1.51465432260583,0.207642865216696,0.624501459424895,0.066083685268831,0.717292731410831,-0.165945922763554,2.34586494901581,-2.89008319444231,1.10996937869599,-0.121359313195888,-2.26185709530414,0.524979725224404,0.247998153469754,0.771679401917229,0.909412262347719,-0.689280956490685,-0.327641833735251,-0.139096571514147,-0.0553527940384261,-0.0597518405929204,378.66,"0" 5 | 1,-0.966271711572087,-0.185226008082898,1.79299333957872,-0.863291275036453,-0.0103088796030823,1.24720316752486,0.23760893977178,0.377435874652262,-1.38702406270197,-0.0549519224713749,-0.226487263835401,0.178228225877303,0.507756869957169,-0.28792374549456,-0.631418117709045,-1.0596472454325,-0.684092786345479,1.96577500349538,-1.2326219700892,-0.208037781160366,-0.108300452035545,0.00527359678253453,-0.190320518742841,-1.17557533186321,0.647376034602038,-0.221928844458407,0.0627228487293033,0.0614576285006353,123.5,"0" 6 | 2,-1.15823309349523,0.877736754848451,1.548717846511,0.403033933955121,-0.407193377311653,0.0959214624684256,0.592940745385545,-0.270532677192282,0.817739308235294,0.753074431976354,-0.822842877946363,0.53819555014995,1.3458515932154,-1.11966983471731,0.175121130008994,-0.451449182813529,-0.237033239362776,-0.0381947870352842,0.803486924960175,0.408542360392758,-0.00943069713232919,0.79827849458971,-0.137458079619063,0.141266983824769,-0.206009587619756,0.502292224181569,0.219422229513348,0.215153147499206,69.99,"0" 7 | 2,-0.425965884412454,0.960523044882985,1.14110934232219,-0.168252079760302,0.42098688077219,-0.0297275516639742,0.476200948720027,0.260314333074874,-0.56867137571251,-0.371407196834471,1.34126198001957,0.359893837038039,-0.358090652573631,-0.137133700217612,0.517616806555742,0.401725895589603,-0.0581328233640131,0.0686531494425432,-0.0331937877876282,0.0849676720682049,-0.208253514656728,-0.559824796253248,-0.0263976679795373,-0.371426583174346,-0.232793816737034,0.105914779097957,0.253844224739337,0.0810802569229443,3.67,"0" 8 | 
4,1.22965763450793,0.141003507049326,0.0453707735899449,1.20261273673594,0.191880988597645,0.272708122899098,-0.00515900288250983,0.0812129398830894,0.464959994783886,-0.0992543211289237,-1.41690724314928,-0.153825826253651,-0.75106271556262,0.16737196252175,0.0501435942254188,-0.443586797916727,0.00282051247234708,-0.61198733994012,-0.0455750446637976,-0.21963255278686,-0.167716265815783,-0.270709726172363,-0.154103786809305,-0.780055415004671,0.75013693580659,-0.257236845917139,0.0345074297438413,0.00516776890624916,4.99,"0" 9 | 7,-0.644269442348146,1.41796354547385,1.0743803763556,-0.492199018495015,0.948934094764157,0.428118462833089,1.12063135838353,-3.80786423873589,0.615374730667027,1.24937617815176,-0.619467796121913,0.291474353088705,1.75796421396042,-1.32386521970526,0.686132504394383,-0.0761269994382006,-1.2221273453247,-0.358221569869078,0.324504731321494,-0.156741852488285,1.94346533978412,-1.01545470979971,0.057503529867291,-0.649709005559993,-0.415266566234811,-0.0516342969262494,-1.20692108094258,-1.08533918832377,40.8,"0" 10 | 7,-0.89428608220282,0.286157196276544,-0.113192212729871,-0.271526130088604,2.6695986595986,3.72181806112751,0.370145127676916,0.851084443200905,-0.392047586798604,-0.410430432848439,-0.705116586646536,-0.110452261733098,-0.286253632470583,0.0743553603016731,-0.328783050303565,-0.210077268148783,-0.499767968800267,0.118764861004217,0.57032816746536,0.0527356691149697,-0.0734251001059225,-0.268091632235551,-0.204232669947878,1.0115918018785,0.373204680146282,-0.384157307702294,0.0117473564581996,0.14240432992147,93.2,"0" 11 | 9,-0.33826175242575,1.11959337641566,1.04436655157316,-0.222187276738296,0.49936080649727,-0.24676110061991,0.651583206489972,0.0695385865186387,-0.736727316364109,-0.366845639206541,1.01761446783262,0.836389570307029,1.00684351373408,-0.443522816876142,0.150219101422635,0.739452777052119,-0.540979921943059,0.47667726004282,0.451772964394125,0.203711454727929,-0.246913936910008,-0.633752642406113,-0.12079408408185,-0.385049925313426,-0.0697330460416923,0.0941988339514961,0.246219304619926,0.0830756493473326,3.68,"0" 12 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | --- 2 | version: '3.8' 3 | services: 4 | zookeeper: 5 | image: confluentinc/cp-zookeeper:7.5.0 6 | container_name: zookeeper 7 | ports: 8 | - "2181:2181" 9 | healthcheck: 10 | test: echo srvr | nc zookeeper 2181 || exit 1 11 | start_period: 10s 12 | retries: 20 13 | interval: 10s 14 | environment: 15 | ZOOKEEPER_CLIENT_PORT: 2181 16 | ZOOKEEPER_TICK_TIME: 2000 17 | 18 | broker: 19 | image: confluentinc/cp-server:7.5.0 20 | container_name: kafka-broker 21 | depends_on: 22 | zookeeper: 23 | condition: service_healthy 24 | ports: 25 | - "9092:9092" 26 | - "9101:9101" 27 | healthcheck: 28 | test: nc -z localhost 9092 || exit -1 29 | start_period: 15s 30 | interval: 5s 31 | timeout: 10s 32 | retries: 10 33 | environment: 34 | # ID of the broker in a cluster 35 | KAFKA_BROKER_ID: 1 36 | # Connect to Zoo Keeper for distributed coordination and leader election 37 | KAFKA_ZOOKEEPER_CONNECT: 'zookeeper:2181' 38 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT 39 | # Define how clients connect to brokers 40 | KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://broker:29092,PLAINTEXT_HOST://localhost:9092 41 | # How many copies are maintained for fault tolerance 42 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 43 | # 
Confluent Metrics Reporter for Control Center Cluster Monitoring 44 | KAFKA_METRIC_REPORTERS: io.confluent.metrics.reporter.ConfluentMetricsReporter 45 | CONFLUENT_METRICS_REPORTER_BOOTSTRAP_SERVERS: 'broker:9092' 46 | CONFLUENT_METRICS_REPORTER_TOPIC_REPLICAS: 1 47 | CONFLUENT_METRICS_ENABLE: 'true' 48 | # Work around internal topics defaulting to replication factor 3, which exceeds the number of broker nodes here 49 | KAFKA_CONFLUENT_BALANCER_TOPIC_REPLICATION_FACTOR: 1 50 | KAFKA_TRANSACTION_STATE_LOG_MIN_ISR: 1 51 | KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 1 52 | CLUSTER_ID: 'MkU3OEVBNTcwNTJENDM2Qk' 53 | 54 | 55 | # Confluent Control Center to manage Kafka 56 | control-center: 57 | image: confluentinc/cp-enterprise-control-center:7.5.0 58 | container_name: control-center 59 | depends_on: 60 | - broker 61 | ports: 62 | - "9021:9021" 63 | healthcheck: 64 | test: ["CMD", "curl", "-f", "http://localhost:9021/healthcheck"] # Adjust the URL and options as needed 65 | interval: 30s 66 | timeout: 10s 67 | retries: 3 68 | environment: 69 | CONTROL_CENTER_BOOTSTRAP_SERVERS: 'broker:29092' 70 | CONTROL_CENTER_REPLICATION_FACTOR: 1 71 | CONTROL_CENTER_INTERNAL_TOPICS_PARTITIONS: 1 72 | CONTROL_CENTER_CONNECT_HEALTHCHECK_ENDPOINT: '/connectors' 73 | CONFLUENT_METRICS_TOPIC_REPLICATION: 1 74 | CONTROL_CENTER_MONITORING_INTERCEPTOR_TOPIC_PARTITIONS: 1 75 | 76 | 77 | grafana: 78 | image: grafana/grafana:6.5.0 79 | container_name: grafana 80 | restart: unless-stopped 81 | ports: 82 | - 3000:3000 83 | env_file: 84 | - ./grafana/login_config 85 | volumes: 86 | - ./grafana/grafana_db:/var/lib/grafana:rw 87 | - ./grafana/provisioning/datasource:/etc/grafana/provisioning/datasources 88 | - ./grafana/provisioning/dashboards:/etc/grafana/provisioning/dashboards 89 | depends_on: 90 | - prometheus 91 | 92 | prometheus: 93 | image: prom/prometheus:v2.20.1 94 | container_name: prometheus 95 | restart: unless-stopped 96 | ports: 97 | - 9090:9090 98 | command: 99 | - '--config.file=/etc/prometheus/prometheus.yml' 100 | volumes: 101 | - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro 102 | - ./prometheus/prometheus_db:/prometheus/data:rw #needs command 'sudo 103 | network_mode: host 104 | pid: host 105 | 106 | node-exporter: 107 | image: prom/node-exporter:v1.0.1 108 | container_name: node-exporter 109 | command: 110 | - '--path.rootfs=/host' 111 | restart: unless-stopped 112 | network_mode: host 113 | pid: host 114 | -------------------------------------------------------------------------------- /grafana/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/grafana/__init__.py -------------------------------------------------------------------------------- /grafana/grafana_db/grafana.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/grafana/grafana_db/grafana.db -------------------------------------------------------------------------------- /grafana/login_config: -------------------------------------------------------------------------------- 1 | GF_SECURITY_ADMIN_USER=admin 2 | GF_SECURITY_ADMIN_PASSWORD=admin 3 | GF_USERS_ALLOW_SIGN_UP=false 4 | -------------------------------------------------------------------------------- /grafana/provisioning/dashboards/Spark
Metrics.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "builtIn": 1, 6 | "datasource": "-- Grafana --", 7 | "enable": true, 8 | "hide": true, 9 | "iconColor": "rgba(0, 211, 255, 1)", 10 | "name": "Annotations & Alerts", 11 | "type": "dashboard" 12 | } 13 | ] 14 | }, 15 | "editable": true, 16 | "gnetId": null, 17 | "graphTooltip": 0, 18 | "id": 9, 19 | "links": [], 20 | "panels": [ 21 | { 22 | "cacheTimeout": null, 23 | "colorBackground": false, 24 | "colorValue": false, 25 | "colors": [ 26 | "#299c46", 27 | "rgba(237, 129, 40, 0.89)", 28 | "#d44a3a" 29 | ], 30 | "datasource": null, 31 | "format": "none", 32 | "gauge": { 33 | "maxValue": 100, 34 | "minValue": 0, 35 | "show": false, 36 | "thresholdLabels": false, 37 | "thresholdMarkers": true 38 | }, 39 | "gridPos": { 40 | "h": 5, 41 | "w": 4, 42 | "x": 0, 43 | "y": 0 44 | }, 45 | "id": 7, 46 | "interval": null, 47 | "links": [], 48 | "mappingType": 1, 49 | "mappingTypes": [ 50 | { 51 | "name": "value to text", 52 | "value": 1 53 | }, 54 | { 55 | "name": "range to text", 56 | "value": 2 57 | } 58 | ], 59 | "maxDataPoints": 100, 60 | "nullPointMode": "connected", 61 | "nullText": null, 62 | "options": {}, 63 | "pluginVersion": "6.5.0", 64 | "postfix": "", 65 | "postfixFontSize": "50%", 66 | "prefix": "", 67 | "prefixFontSize": "50%", 68 | "rangeMaps": [ 69 | { 70 | "from": "null", 71 | "text": "N/A", 72 | "to": "null" 73 | } 74 | ], 75 | "sparkline": { 76 | "fillColor": "rgba(31, 118, 189, 0.18)", 77 | "full": false, 78 | "lineColor": "rgb(31, 120, 193)", 79 | "show": false, 80 | "ymax": null, 81 | "ymin": null 82 | }, 83 | "tableColumn": "", 84 | "targets": [ 85 | { 86 | "expr": "metrics_spark_app_driver_DAGScheduler_job_allJobs_Number{group=\"spark\", instance=\"driver\", job=\"spark-driver\"}", 87 | "refId": "A" 88 | } 89 | ], 90 | "thresholds": "", 91 | "timeFrom": null, 92 | "timeShift": null, 93 | "title": "All Jobs", 94 | "type": "singlestat", 95 | "valueFontSize": "80%", 96 | "valueMaps": [ 97 | { 98 | "op": "=", 99 | "text": "N/A", 100 | "value": "null" 101 | } 102 | ], 103 | "valueName": "current" 104 | }, 105 | { 106 | "cacheTimeout": null, 107 | "colorBackground": false, 108 | "colorValue": false, 109 | "colors": [ 110 | "#299c46", 111 | "rgba(237, 129, 40, 0.89)", 112 | "#d44a3a" 113 | ], 114 | "datasource": null, 115 | "format": "none", 116 | "gauge": { 117 | "maxValue": 100, 118 | "minValue": 0, 119 | "show": false, 120 | "thresholdLabels": false, 121 | "thresholdMarkers": true 122 | }, 123 | "gridPos": { 124 | "h": 5, 125 | "w": 4, 126 | "x": 4, 127 | "y": 0 128 | }, 129 | "id": 6, 130 | "interval": null, 131 | "links": [], 132 | "mappingType": 1, 133 | "mappingTypes": [ 134 | { 135 | "name": "value to text", 136 | "value": 1 137 | }, 138 | { 139 | "name": "range to text", 140 | "value": 2 141 | } 142 | ], 143 | "maxDataPoints": 100, 144 | "nullPointMode": "connected", 145 | "nullText": null, 146 | "options": {}, 147 | "pluginVersion": "6.5.0", 148 | "postfix": "", 149 | "postfixFontSize": "50%", 150 | "prefix": "", 151 | "prefixFontSize": "50%", 152 | "rangeMaps": [ 153 | { 154 | "from": "null", 155 | "text": "N/A", 156 | "to": "null" 157 | } 158 | ], 159 | "sparkline": { 160 | "fillColor": "rgba(31, 118, 189, 0.18)", 161 | "full": false, 162 | "lineColor": "rgb(31, 120, 193)", 163 | "show": false, 164 | "ymax": null, 165 | "ymin": null 166 | }, 167 | "tableColumn": "", 168 | "targets": [ 169 | { 170 | "expr": 
"metrics_spark_app_driver_DAGScheduler_job_activeJobs_Number{group=\"spark\", instance=\"driver\", job=\"spark-driver\"}", 171 | "refId": "A" 172 | } 173 | ], 174 | "thresholds": "", 175 | "timeFrom": null, 176 | "timeShift": null, 177 | "title": "Active Jobs", 178 | "type": "singlestat", 179 | "valueFontSize": "80%", 180 | "valueMaps": [ 181 | { 182 | "op": "=", 183 | "text": "N/A", 184 | "value": "null" 185 | } 186 | ], 187 | "valueName": "current" 188 | }, 189 | { 190 | "cacheTimeout": null, 191 | "colorBackground": false, 192 | "colorValue": false, 193 | "colors": [ 194 | "#299c46", 195 | "rgba(237, 129, 40, 0.89)", 196 | "#d44a3a" 197 | ], 198 | "datasource": null, 199 | "description": "No of currently running stages", 200 | "format": "none", 201 | "gauge": { 202 | "maxValue": 100, 203 | "minValue": 0, 204 | "show": false, 205 | "thresholdLabels": false, 206 | "thresholdMarkers": true 207 | }, 208 | "gridPos": { 209 | "h": 5, 210 | "w": 4, 211 | "x": 8, 212 | "y": 0 213 | }, 214 | "id": 11, 215 | "interval": null, 216 | "links": [], 217 | "mappingType": 1, 218 | "mappingTypes": [ 219 | { 220 | "name": "value to text", 221 | "value": 1 222 | }, 223 | { 224 | "name": "range to text", 225 | "value": 2 226 | } 227 | ], 228 | "maxDataPoints": 100, 229 | "nullPointMode": "connected", 230 | "nullText": null, 231 | "options": {}, 232 | "postfix": "", 233 | "postfixFontSize": "50%", 234 | "prefix": "", 235 | "prefixFontSize": "50%", 236 | "rangeMaps": [ 237 | { 238 | "from": "null", 239 | "text": "N/A", 240 | "to": "null" 241 | } 242 | ], 243 | "sparkline": { 244 | "fillColor": "rgba(31, 118, 189, 0.18)", 245 | "full": false, 246 | "lineColor": "rgb(31, 120, 193)", 247 | "show": false, 248 | "ymax": null, 249 | "ymin": null 250 | }, 251 | "tableColumn": "", 252 | "targets": [ 253 | { 254 | "expr": "metrics_master_workers_Number{group=\"spark\", instance=\"master\", job=\"spark-master\"} ", 255 | "legendFormat": "maxMem_MB", 256 | "refId": "A" 257 | } 258 | ], 259 | "thresholds": "", 260 | "timeFrom": null, 261 | "timeShift": null, 262 | "title": "No Of Workers", 263 | "type": "singlestat", 264 | "valueFontSize": "80%", 265 | "valueMaps": [ 266 | { 267 | "op": "=", 268 | "text": "N/A", 269 | "value": "null" 270 | } 271 | ], 272 | "valueName": "current" 273 | }, 274 | { 275 | "cacheTimeout": null, 276 | "colorBackground": false, 277 | "colorValue": false, 278 | "colors": [ 279 | "#299c46", 280 | "rgba(237, 129, 40, 0.89)", 281 | "#d44a3a" 282 | ], 283 | "datasource": null, 284 | "description": "No of currently running stages", 285 | "format": "none", 286 | "gauge": { 287 | "maxValue": 100, 288 | "minValue": 0, 289 | "show": false, 290 | "thresholdLabels": false, 291 | "thresholdMarkers": true 292 | }, 293 | "gridPos": { 294 | "h": 5, 295 | "w": 4, 296 | "x": 12, 297 | "y": 0 298 | }, 299 | "id": 10, 300 | "interval": null, 301 | "links": [], 302 | "mappingType": 1, 303 | "mappingTypes": [ 304 | { 305 | "name": "value to text", 306 | "value": 1 307 | }, 308 | { 309 | "name": "range to text", 310 | "value": 2 311 | } 312 | ], 313 | "maxDataPoints": 100, 314 | "nullPointMode": "connected", 315 | "nullText": null, 316 | "options": {}, 317 | "postfix": "", 318 | "postfixFontSize": "50%", 319 | "prefix": "", 320 | "prefixFontSize": "50%", 321 | "rangeMaps": [ 322 | { 323 | "from": "null", 324 | "text": "N/A", 325 | "to": "null" 326 | } 327 | ], 328 | "sparkline": { 329 | "fillColor": "rgba(31, 118, 189, 0.18)", 330 | "full": false, 331 | "lineColor": "rgb(31, 120, 193)", 332 | "show": false, 333 | 
"ymax": null, 334 | "ymin": null 335 | }, 336 | "tableColumn": "", 337 | "targets": [ 338 | { 339 | "expr": "metrics_master_workers_Number{group=\"spark\", instance=\"master\", job=\"spark-master\"} ", 340 | "legendFormat": "maxMem_MB", 341 | "refId": "A" 342 | } 343 | ], 344 | "thresholds": "", 345 | "timeFrom": null, 346 | "timeShift": null, 347 | "title": "No Of Alive Workers", 348 | "type": "singlestat", 349 | "valueFontSize": "80%", 350 | "valueMaps": [ 351 | { 352 | "op": "=", 353 | "text": "N/A", 354 | "value": "null" 355 | } 356 | ], 357 | "valueName": "current" 358 | }, 359 | { 360 | "cacheTimeout": null, 361 | "colorBackground": false, 362 | "colorValue": false, 363 | "colors": [ 364 | "#299c46", 365 | "rgba(237, 129, 40, 0.89)", 366 | "#d44a3a" 367 | ], 368 | "datasource": null, 369 | "description": "No of currently running stages", 370 | "format": "none", 371 | "gauge": { 372 | "maxValue": 100, 373 | "minValue": 0, 374 | "show": false, 375 | "thresholdLabels": false, 376 | "thresholdMarkers": true 377 | }, 378 | "gridPos": { 379 | "h": 5, 380 | "w": 4, 381 | "x": 16, 382 | "y": 0 383 | }, 384 | "id": 12, 385 | "interval": null, 386 | "links": [], 387 | "mappingType": 1, 388 | "mappingTypes": [ 389 | { 390 | "name": "value to text", 391 | "value": 1 392 | }, 393 | { 394 | "name": "range to text", 395 | "value": 2 396 | } 397 | ], 398 | "maxDataPoints": 100, 399 | "nullPointMode": "connected", 400 | "nullText": null, 401 | "options": {}, 402 | "pluginVersion": "6.5.0", 403 | "postfix": "", 404 | "postfixFontSize": "50%", 405 | "prefix": "", 406 | "prefixFontSize": "50%", 407 | "rangeMaps": [ 408 | { 409 | "from": "null", 410 | "text": "N/A", 411 | "to": "null" 412 | } 413 | ], 414 | "sparkline": { 415 | "fillColor": "rgba(31, 118, 189, 0.18)", 416 | "full": false, 417 | "lineColor": "rgb(31, 120, 193)", 418 | "show": false, 419 | "ymax": null, 420 | "ymin": null 421 | }, 422 | "tableColumn": "", 423 | "targets": [ 424 | { 425 | "expr": "metrics_worker_coresFree_Number{ group=\"spark\", instance=\"worker\", job=\"spark-worker\"} ", 426 | "legendFormat": "maxMem_MB", 427 | "refId": "A" 428 | } 429 | ], 430 | "thresholds": "", 431 | "timeFrom": null, 432 | "timeShift": null, 433 | "title": "No Of Cores Free By Worker", 434 | "type": "singlestat", 435 | "valueFontSize": "80%", 436 | "valueMaps": [ 437 | { 438 | "op": "=", 439 | "text": "N/A", 440 | "value": "null" 441 | } 442 | ], 443 | "valueName": "current" 444 | }, 445 | { 446 | "cacheTimeout": null, 447 | "colorBackground": false, 448 | "colorValue": false, 449 | "colors": [ 450 | "#299c46", 451 | "rgba(237, 129, 40, 0.89)", 452 | "#d44a3a" 453 | ], 454 | "datasource": null, 455 | "description": "No of currently running stages", 456 | "format": "none", 457 | "gauge": { 458 | "maxValue": 100, 459 | "minValue": 0, 460 | "show": false, 461 | "thresholdLabels": false, 462 | "thresholdMarkers": true 463 | }, 464 | "gridPos": { 465 | "h": 5, 466 | "w": 4, 467 | "x": 20, 468 | "y": 0 469 | }, 470 | "id": 13, 471 | "interval": null, 472 | "links": [], 473 | "mappingType": 1, 474 | "mappingTypes": [ 475 | { 476 | "name": "value to text", 477 | "value": 1 478 | }, 479 | { 480 | "name": "range to text", 481 | "value": 2 482 | } 483 | ], 484 | "maxDataPoints": 100, 485 | "nullPointMode": "connected", 486 | "nullText": null, 487 | "options": {}, 488 | "pluginVersion": "6.5.0", 489 | "postfix": "", 490 | "postfixFontSize": "50%", 491 | "prefix": "", 492 | "prefixFontSize": "50%", 493 | "rangeMaps": [ 494 | { 495 | "from": "null", 496 | "text": 
"N/A", 497 | "to": "null" 498 | } 499 | ], 500 | "sparkline": { 501 | "fillColor": "rgba(31, 118, 189, 0.18)", 502 | "full": false, 503 | "lineColor": "rgb(31, 120, 193)", 504 | "show": false, 505 | "ymax": null, 506 | "ymin": null 507 | }, 508 | "tableColumn": "", 509 | "targets": [ 510 | { 511 | "expr": "metrics_worker_coresUsed_Number{ group=\"spark\", instance=\"worker\", job=\"spark-worker\"} ", 512 | "legendFormat": "", 513 | "refId": "A" 514 | } 515 | ], 516 | "thresholds": "", 517 | "timeFrom": null, 518 | "timeShift": null, 519 | "title": "No Of Cores Used By Worker", 520 | "type": "singlestat", 521 | "valueFontSize": "80%", 522 | "valueMaps": [ 523 | { 524 | "op": "=", 525 | "text": "N/A", 526 | "value": "null" 527 | } 528 | ], 529 | "valueName": "current" 530 | }, 531 | { 532 | "aliasColors": {}, 533 | "bars": false, 534 | "dashLength": 10, 535 | "dashes": false, 536 | "datasource": null, 537 | "description": "No of currently running stages", 538 | "fill": 1, 539 | "fillGradient": 0, 540 | "gridPos": { 541 | "h": 8, 542 | "w": 8, 543 | "x": 0, 544 | "y": 5 545 | }, 546 | "hiddenSeries": false, 547 | "id": 2, 548 | "legend": { 549 | "avg": false, 550 | "current": false, 551 | "max": false, 552 | "min": false, 553 | "show": true, 554 | "total": false, 555 | "values": false 556 | }, 557 | "lines": true, 558 | "linewidth": 1, 559 | "nullPointMode": "null", 560 | "options": { 561 | "dataLinks": [] 562 | }, 563 | "percentage": false, 564 | "pointradius": 2, 565 | "points": false, 566 | "renderer": "flot", 567 | "seriesOverrides": [], 568 | "spaceLength": 10, 569 | "stack": false, 570 | "steppedLine": false, 571 | "targets": [ 572 | { 573 | "expr": "metrics_spark_app_driver_DAGScheduler_stage_runningStages_Number{group=\"spark\", instance=\"driver\", job=\"spark-driver\"} ", 574 | "legendFormat": "Running Stages", 575 | "refId": "A" 576 | }, 577 | { 578 | "expr": "metrics_spark_app_driver_DAGScheduler_stage_waitingStages_Number{group=\"spark\", instance=\"driver\", job=\"spark-driver\"} ", 579 | "legendFormat": "Waiting Stages", 580 | "refId": "B" 581 | }, 582 | { 583 | "expr": "metrics_spark_app_driver_DAGScheduler_stage_failedStages_Number{group=\"spark\", instance=\"driver\", job=\"spark-driver\"} ", 584 | "legendFormat": "Failed Stages", 585 | "refId": "C" 586 | } 587 | ], 588 | "thresholds": [], 589 | "timeFrom": null, 590 | "timeRegions": [], 591 | "timeShift": null, 592 | "title": "Spark Driver Running Stages", 593 | "tooltip": { 594 | "shared": true, 595 | "sort": 0, 596 | "value_type": "individual" 597 | }, 598 | "type": "graph", 599 | "xaxis": { 600 | "buckets": null, 601 | "mode": "time", 602 | "name": null, 603 | "show": true, 604 | "values": [] 605 | }, 606 | "yaxes": [ 607 | { 608 | "format": "short", 609 | "label": null, 610 | "logBase": 1, 611 | "max": null, 612 | "min": null, 613 | "show": true 614 | }, 615 | { 616 | "format": "short", 617 | "label": null, 618 | "logBase": 1, 619 | "max": null, 620 | "min": null, 621 | "show": true 622 | } 623 | ], 624 | "yaxis": { 625 | "align": false, 626 | "alignLevel": null 627 | } 628 | }, 629 | { 630 | "aliasColors": {}, 631 | "bars": false, 632 | "dashLength": 10, 633 | "dashes": false, 634 | "datasource": null, 635 | "description": "No of currently running stages", 636 | "fill": 1, 637 | "fillGradient": 0, 638 | "gridPos": { 639 | "h": 8, 640 | "w": 8, 641 | "x": 8, 642 | "y": 5 643 | }, 644 | "hiddenSeries": false, 645 | "id": 8, 646 | "legend": { 647 | "avg": false, 648 | "current": false, 649 | "max": false, 650 | "min": 
false, 651 | "show": true, 652 | "total": false, 653 | "values": false 654 | }, 655 | "lines": true, 656 | "linewidth": 1, 657 | "nullPointMode": "null", 658 | "options": { 659 | "dataLinks": [] 660 | }, 661 | "percentage": false, 662 | "pointradius": 2, 663 | "points": false, 664 | "renderer": "flot", 665 | "seriesOverrides": [], 666 | "spaceLength": 10, 667 | "stack": false, 668 | "steppedLine": false, 669 | "targets": [ 670 | { 671 | "expr": "metrics_spark_app_driver_DAGScheduler_messageProcessingTime_Max{group=\"spark\", instance=\"driver\", job=\"spark-driver\"} ", 672 | "legendFormat": "Max", 673 | "refId": "A" 674 | }, 675 | { 676 | "expr": "metrics_spark_app_driver_DAGScheduler_messageProcessingTime_Min{group=\"spark\", instance=\"driver\", job=\"spark-driver\"} ", 677 | "legendFormat": "Min", 678 | "refId": "B" 679 | }, 680 | { 681 | "expr": "metrics_spark_app_driver_DAGScheduler_messageProcessingTime_Mean{group=\"spark\", instance=\"driver\", job=\"spark-driver\"} ", 682 | "legendFormat": "Mean", 683 | "refId": "C" 684 | }, 685 | { 686 | "expr": "metrics_spark_app_driver_DAGScheduler_messageProcessingTime_StdDev{group=\"spark\", instance=\"driver\", job=\"spark-driver\"} ", 687 | "legendFormat": "StdDev", 688 | "refId": "D" 689 | } 690 | ], 691 | "thresholds": [], 692 | "timeFrom": null, 693 | "timeRegions": [], 694 | "timeShift": null, 695 | "title": "Message Processing Time", 696 | "tooltip": { 697 | "shared": true, 698 | "sort": 0, 699 | "value_type": "individual" 700 | }, 701 | "type": "graph", 702 | "xaxis": { 703 | "buckets": null, 704 | "mode": "time", 705 | "name": null, 706 | "show": true, 707 | "values": [] 708 | }, 709 | "yaxes": [ 710 | { 711 | "format": "short", 712 | "label": null, 713 | "logBase": 1, 714 | "max": null, 715 | "min": null, 716 | "show": true 717 | }, 718 | { 719 | "format": "short", 720 | "label": null, 721 | "logBase": 1, 722 | "max": null, 723 | "min": null, 724 | "show": true 725 | } 726 | ], 727 | "yaxis": { 728 | "align": false, 729 | "alignLevel": null 730 | } 731 | }, 732 | { 733 | "aliasColors": {}, 734 | "bars": false, 735 | "dashLength": 10, 736 | "dashes": false, 737 | "datasource": null, 738 | "description": "BlockManager memory and disk usage", 739 | "fill": 1, 740 | "fillGradient": 0, 741 | "gridPos": { 742 | "h": 8, 743 | "w": 8, 744 | "x": 16, 745 | "y": 5 746 | }, 747 | "hiddenSeries": false, 748 | "id": 9, 749 | "legend": { 750 | "avg": false, 751 | "current": false, 752 | "max": false, 753 | "min": false, 754 | "show": true, 755 | "total": false, 756 | "values": false 757 | }, 758 | "lines": true, 759 | "linewidth": 1, 760 | "nullPointMode": "null", 761 | "options": { 762 | "dataLinks": [] 763 | }, 764 | "percentage": false, 765 | "pointradius": 2, 766 | "points": false, 767 | "renderer": "flot", 768 | "seriesOverrides": [], 769 | "spaceLength": 10, 770 | "stack": false, 771 | "steppedLine": false, 772 | "targets": [ 773 | { 774 | "expr": "metrics_spark_app_driver_BlockManager_memory_maxMem_MB_Number{group=\"spark\", instance=\"driver\", job=\"spark-driver\"} ", 775 | "legendFormat": "maxMem_MB", 776 | "refId": "A" 777 | }, 778 | { 779 | "expr": "metrics_spark_app_driver_BlockManager_disk_diskSpaceUsed_MB_Number{group=\"spark\", instance=\"driver\", job=\"spark-driver\"} ", 780 | "legendFormat": "diskSpaceUsed_MB", 781 | "refId": "D" 782 | }, 783 | { 784 | "expr": "metrics_spark_app_driver_BlockManager_memory_maxOnHeapMem_MB_Number{group=\"spark\", instance=\"driver\", job=\"spark-driver\"} ", 785 | "legendFormat": "maxOnHeapMem_MB",
786 | "refId": "B" 787 | }, 788 | { 789 | "expr": "metrics_spark_app_driver_BlockManager_memory_maxOffHeapMem_MB_Number{group=\"spark\", instance=\"driver\", job=\"spark-driver\"} ", 790 | "legendFormat": "maxOffHeapMem_MB", 791 | "refId": "C" 792 | } 793 | ], 794 | "thresholds": [], 795 | "timeFrom": null, 796 | "timeRegions": [], 797 | "timeShift": null, 798 | "title": "BlockManager", 799 | "tooltip": { 800 | "shared": true, 801 | "sort": 0, 802 | "value_type": "individual" 803 | }, 804 | "type": "graph", 805 | "xaxis": { 806 | "buckets": null, 807 | "mode": "time", 808 | "name": null, 809 | "show": true, 810 | "values": [] 811 | }, 812 | "yaxes": [ 813 | { 814 | "format": "short", 815 | "label": null, 816 | "logBase": 1, 817 | "max": null, 818 | "min": null, 819 | "show": true 820 | }, 821 | { 822 | "format": "short", 823 | "label": null, 824 | "logBase": 1, 825 | "max": null, 826 | "min": null, 827 | "show": true 828 | } 829 | ], 830 | "yaxis": { 831 | "align": false, 832 | "alignLevel": null 833 | } 834 | }, 835 | { 836 | "aliasColors": {}, 837 | "bars": false, 838 | "cacheTimeout": null, 839 | "dashLength": 10, 840 | "dashes": false, 841 | "datasource": null, 842 | "description": "No of currently running stages", 843 | "fill": 1, 844 | "fillGradient": 0, 845 | "gridPos": { 846 | "h": 6, 847 | "w": 12, 848 | "x": 0, 849 | "y": 13 850 | }, 851 | "hiddenSeries": false, 852 | "id": 14, 853 | "legend": { 854 | "avg": false, 855 | "current": false, 856 | "max": false, 857 | "min": false, 858 | "show": true, 859 | "total": false, 860 | "values": false 861 | }, 862 | "lines": true, 863 | "linewidth": 1, 864 | "links": [], 865 | "nullPointMode": "null", 866 | "options": { 867 | "dataLinks": [] 868 | }, 869 | "percentage": false, 870 | "pluginVersion": "6.5.0", 871 | "pointradius": 2, 872 | "points": false, 873 | "renderer": "flot", 874 | "seriesOverrides": [], 875 | "spaceLength": 10, 876 | "stack": false, 877 | "steppedLine": false, 878 | "targets": [ 879 | { 880 | "expr": "metrics_worker_memFree_MB_Number{ group=\"spark\", instance=\"worker\", job=\"spark-worker\"} ", 881 | "legendFormat": "memFree_MB", 882 | "refId": "A" 883 | }, 884 | { 885 | "expr": "metrics_worker_memUsed_MB_Number{ group=\"spark\", instance=\"worker\", job=\"spark-worker\"} ", 886 | "legendFormat": "memUsed_MB", 887 | "refId": "B" 888 | } 889 | ], 890 | "thresholds": [], 891 | "timeFrom": null, 892 | "timeRegions": [], 893 | "timeShift": null, 894 | "title": "Memory Free/Used by Worker", 895 | "tooltip": { 896 | "shared": true, 897 | "sort": 0, 898 | "value_type": "individual" 899 | }, 900 | "type": "graph", 901 | "xaxis": { 902 | "buckets": null, 903 | "mode": "time", 904 | "name": null, 905 | "show": true, 906 | "values": [] 907 | }, 908 | "yaxes": [ 909 | { 910 | "format": "short", 911 | "label": null, 912 | "logBase": 1, 913 | "max": null, 914 | "min": null, 915 | "show": true 916 | }, 917 | { 918 | "format": "short", 919 | "label": null, 920 | "logBase": 1, 921 | "max": null, 922 | "min": null, 923 | "show": true 924 | } 925 | ], 926 | "yaxis": { 927 | "align": false, 928 | "alignLevel": null 929 | } 930 | }, 931 | { 932 | "aliasColors": {}, 933 | "bars": false, 934 | "cacheTimeout": null, 935 | "dashLength": 10, 936 | "dashes": false, 937 | "datasource": null, 938 | "description": "No of currently running stages", 939 | "fill": 1, 940 | "fillGradient": 0, 941 | "gridPos": { 942 | "h": 6, 943 | "w": 12, 944 | "x": 12, 945 | "y": 13 946 | }, 947 | "hiddenSeries": false, 948 | "id": 15, 949 | "legend": { 950 | 
"avg": false, 951 | "current": false, 952 | "max": false, 953 | "min": false, 954 | "show": true, 955 | "total": false, 956 | "values": false 957 | }, 958 | "lines": true, 959 | "linewidth": 1, 960 | "links": [], 961 | "nullPointMode": "null", 962 | "options": { 963 | "dataLinks": [] 964 | }, 965 | "percentage": false, 966 | "pluginVersion": "6.5.0", 967 | "pointradius": 2, 968 | "points": false, 969 | "renderer": "flot", 970 | "seriesOverrides": [], 971 | "spaceLength": 10, 972 | "stack": false, 973 | "steppedLine": false, 974 | "targets": [ 975 | { 976 | "expr": "metrics_spark_app_driver_LiveListenerBus_listenerProcessingTime_org_apache_spark_HeartbeatReceiver_Count{type=\"timers\", group=\"spark\", instance=\"driver\", job=\"spark-driver\"} ", 977 | "legendFormat": "Count", 978 | "refId": "A" 979 | }, 980 | { 981 | "expr": "metrics_spark_app_driver_LiveListenerBus_listenerProcessingTime_org_apache_spark_HeartbeatReceiver_Max{type=\"timers\", group=\"spark\", instance=\"driver\", job=\"spark-driver\"} ", 982 | "legendFormat": "Max", 983 | "refId": "B" 984 | }, 985 | { 986 | "expr": "metrics_spark_app_driver_LiveListenerBus_listenerProcessingTime_org_apache_spark_HeartbeatReceiver_Mean{type=\"timers\", group=\"spark\", instance=\"driver\", job=\"spark-driver\"} ", 987 | "legendFormat": "Mean", 988 | "refId": "C" 989 | }, 990 | { 991 | "expr": "metrics_spark_app_driver_LiveListenerBus_listenerProcessingTime_org_apache_spark_HeartbeatReceiver_Min{type=\"timers\", group=\"spark\", instance=\"driver\", job=\"spark-driver\"} ", 992 | "legendFormat": "Min", 993 | "refId": "D" 994 | } 995 | ], 996 | "thresholds": [], 997 | "timeFrom": null, 998 | "timeRegions": [], 999 | "timeShift": null, 1000 | "title": "LiveListenerBus listenerProcessingTime HeartbeatReceiver", 1001 | "tooltip": { 1002 | "shared": true, 1003 | "sort": 0, 1004 | "value_type": "individual" 1005 | }, 1006 | "type": "graph", 1007 | "xaxis": { 1008 | "buckets": null, 1009 | "mode": "time", 1010 | "name": null, 1011 | "show": true, 1012 | "values": [] 1013 | }, 1014 | "yaxes": [ 1015 | { 1016 | "format": "short", 1017 | "label": null, 1018 | "logBase": 1, 1019 | "max": null, 1020 | "min": null, 1021 | "show": true 1022 | }, 1023 | { 1024 | "format": "short", 1025 | "label": null, 1026 | "logBase": 1, 1027 | "max": null, 1028 | "min": null, 1029 | "show": true 1030 | } 1031 | ], 1032 | "yaxis": { 1033 | "align": false, 1034 | "alignLevel": null 1035 | } 1036 | } 1037 | ], 1038 | "refresh": false, 1039 | "schemaVersion": 21, 1040 | "style": "dark", 1041 | "tags": [], 1042 | "templating": { 1043 | "list": [] 1044 | }, 1045 | "time": { 1046 | "from": "now-15m", 1047 | "to": "now" 1048 | }, 1049 | "timepicker": { 1050 | "refresh_intervals": [ 1051 | "5s", 1052 | "10s", 1053 | "30s", 1054 | "1m", 1055 | "5m", 1056 | "15m", 1057 | "30m", 1058 | "1h", 1059 | "2h", 1060 | "1d" 1061 | ] 1062 | }, 1063 | "timezone": "", 1064 | "title": "Apache Spark Metrics", 1065 | "uid": "EWIYh-OMz", 1066 | "version": 10 1067 | } 1068 | -------------------------------------------------------------------------------- /grafana/provisioning/dashboards/dashboard.yml: -------------------------------------------------------------------------------- 1 | 2 | apiVersion: 1 3 | 4 | providers: 5 | - name: 'Prometheus' 6 | orgId: 1 7 | folder: '' 8 | type: file 9 | disableDeletion: false 10 | editable: true 11 | options: 12 | path: /etc/grafana/provisioning/dashboards 13 | -------------------------------------------------------------------------------- 
/inference/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/inference/__init__.py -------------------------------------------------------------------------------- /inference/base.py: -------------------------------------------------------------------------------- 1 | import abc 2 | 3 | from synapse.ml.lightgbm import LightGBMClassifier 4 | import pyspark.sql.types as T 5 | import pyspark.sql.functions as F 6 | from pyspark.ml import PipelineModel 7 | from pyspark.sql import SparkSession 8 | from pyspark.sql.dataframe import DataFrame 9 | 10 | from const import ROOT_PATH 11 | from udfs import extract_predicted_prob 12 | 13 | 14 | class Inference(abc.ABC): 15 | def __init__(self, model_path, config: dict = None): 16 | self.spark = SparkSession \ 17 | .builder \ 18 | .master('local[*]') \ 19 | .appName("inference") \ 20 | .config("spark.jars.packages", 21 | "org.apache.kafka:kafka-clients:3.2.1," 22 | "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0," 23 | "com.microsoft.azure:synapseml_2.12:1.0.2") \ 24 | .getOrCreate() 25 | 26 | self.config = config or {} 27 | self.model = self.load_pipeline_model(model_path) 28 | 29 | @staticmethod 30 | def load_pipeline_model(model_path): 31 | pipeline_model = PipelineModel.load(model_path) 32 | return pipeline_model 33 | 34 | @staticmethod 35 | def preprocess_data(raw_data: DataFrame) -> DataFrame: 36 | return raw_data 37 | 38 | @staticmethod 39 | def make_prediction(pipeline: PipelineModel, 40 | preprocessed_data: DataFrame) -> DataFrame: 41 | predictions = pipeline.transform(preprocessed_data) 42 | predictions = predictions \ 43 | .withColumn("predicted_prob", 44 | extract_predicted_prob(F.col("probability"))) \ 45 | .withColumn("predicted_prob", F.col("predicted_prob") 46 | .cast(T.DoubleType())) 47 | return predictions 48 | 49 | @abc.abstractmethod 50 | def read(self) -> DataFrame: 51 | pass 52 | 53 | @abc.abstractmethod 54 | def write(self, predictions_df: DataFrame): 55 | pass 56 | 57 | def run(self) -> None: 58 | df = self.read() 59 | preprocessed_df = self.preprocess_data(df) 60 | predictions_df = self.make_prediction(self.model, preprocessed_df) 61 | self.write(predictions_df) 62 | -------------------------------------------------------------------------------- /inference/batch_inference.py: -------------------------------------------------------------------------------- 1 | from synapse.ml.lightgbm import LightGBMClassifier 2 | from pyspark.sql.dataframe import DataFrame 3 | 4 | from const import ROOT_PATH 5 | from spec import output_schema 6 | from inference.base import Inference 7 | 8 | 9 | class BatchInference(Inference): 10 | 11 | def read(self) -> DataFrame: 12 | return self.spark.read.csv(self.config["dataset"], header=True, 13 | inferSchema=True).limit(10000) 14 | 15 | def write(self, df: DataFrame) -> None: 16 | df.select(output_schema.names).show() 17 | 18 | 19 | if __name__ == "__main__": 20 | batch_config = { 21 | "dataset": f"{ROOT_PATH}/creditcard.csv" 22 | } 23 | 24 | inference = BatchInference(f"{ROOT_PATH}/model", batch_config) 25 | inference.run() 26 | -------------------------------------------------------------------------------- /inference/stream_inference.py: -------------------------------------------------------------------------------- 1 | from synapse.ml.lightgbm import LightGBMClassifier 2 | 3 | import pyspark.sql.types as T
4 | import pyspark.sql.functions as F 5 | from pyspark.sql.dataframe import DataFrame 6 | 7 | from const import ROOT_PATH 8 | from inference.base import Inference 9 | from spec import input_schema, output_schema 10 | 11 | 12 | class StreamInference(Inference): 13 | def read(self) -> DataFrame: 14 | return self.spark \ 15 | .readStream \ 16 | .format("kafka") \ 17 | .option("subscribe", self.config['source']['kafka_topics']) \ 18 | .options(**self.config["source"]["kafka_options"]) \ 19 | .load()\ 20 | .select(F.from_json(F.col("value").cast("string"), 21 | input_schema).alias("value"))\ 22 | .select(F.col("value.*")) 23 | 24 | def write(self, df: DataFrame) -> None: 25 | (df 26 | .select(output_schema.names) 27 | .withColumn("value", F.to_json(F.struct(*[F.col(c) for c in 28 | output_schema.names]))) 29 | .withColumn("key", F.rand(seed=42).astype(T.StringType())) 30 | .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") 31 | .writeStream 32 | .trigger(processingTime="0 seconds") 33 | .format("kafka") 34 | .options(**self.config["sink"]["kafka_options"]) 35 | .option("topic", self.config["sink"]["sink_topic"]) 36 | .option("checkpointLocation", 37 | self.config["sink"]["checkpoint_location"]) 38 | .start() 39 | .awaitTermination() 40 | ) 41 | 42 | 43 | if __name__ == "__main__": 44 | stream_config = { 45 | "source": { 46 | "kafka_topics": "raw", 47 | "kafka_options": { 48 | "kafka.bootstrap.servers": "localhost:9092", 49 | "startingOffsets": "latest", 50 | } 51 | }, 52 | "sink": { 53 | "sink_topic": "predictions", 54 | "checkpoint_location": f"{ROOT_PATH}/spark-streaming-checkpoint", 55 | # Change this 56 | # to your desired path. 57 | "write_format": "console", 58 | "write_output_mode": "append", # "complete", "append" or "update", 59 | "kafka_options": { 60 | "kafka.bootstrap.servers": "localhost:9092" 61 | } 62 | } 63 | } 64 | 65 | inference = StreamInference(f"{ROOT_PATH}/model", stream_config) 66 | inference.run() 67 | -------------------------------------------------------------------------------- /model/metadata/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /model/metadata/.part-00000.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/metadata/.part-00000.crc -------------------------------------------------------------------------------- /model/metadata/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/metadata/_SUCCESS -------------------------------------------------------------------------------- /model/metadata/part-00000: -------------------------------------------------------------------------------- 1 | {"class":"org.apache.spark.ml.PipelineModel","timestamp":1739608795458,"sparkVersion":"3.5.0","uid":"PipelineModel_5b77bd046814","paramMap":{"stageUids":["StringIndexer_d065dab91d9f","OneHotEncoder_de68f7eb6b38","Imputer_7b5e57f74a2b","VectorAssembler_441d7d1bea9d","StandardScaler_5dbb14844ce4","VectorAssembler_a9434a6047bc","LightGBMClassifier_788cfead8c52"]},"defaultParamMap":{}} 2 | 
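Note: the PipelineModel metadata above enumerates seven stage UIDs, which map one-to-one onto the model/stages/ directories that follow. A small inspection sketch, not a file in this repository, that loads the saved model the same way inference/base.py does; SynapseML has to be on the Spark classpath because stage 6 is a LightGBM model:

from pyspark.sql import SparkSession
from pyspark.ml import PipelineModel

from const import ROOT_PATH

spark = (SparkSession.builder
         .master("local[*]")
         .appName("inspect-model")
         .config("spark.jars.packages", "com.microsoft.azure:synapseml_2.12:1.0.2")
         .getOrCreate())

model = PipelineModel.load(f"{ROOT_PATH}/model")
for i, stage in enumerate(model.stages):
    # Prints one line per stage directory, e.g. "0 StringIndexer_d065dab91d9f".
    print(i, stage.uid)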
-------------------------------------------------------------------------------- /model/stages/0_StringIndexer_d065dab91d9f/data/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /model/stages/0_StringIndexer_d065dab91d9f/data/.part-00000-7abf6b25-05d4-4465-bc28-f85b21eda3b2-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/0_StringIndexer_d065dab91d9f/data/.part-00000-7abf6b25-05d4-4465-bc28-f85b21eda3b2-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /model/stages/0_StringIndexer_d065dab91d9f/data/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/0_StringIndexer_d065dab91d9f/data/_SUCCESS -------------------------------------------------------------------------------- /model/stages/0_StringIndexer_d065dab91d9f/data/part-00000-7abf6b25-05d4-4465-bc28-f85b21eda3b2-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/0_StringIndexer_d065dab91d9f/data/part-00000-7abf6b25-05d4-4465-bc28-f85b21eda3b2-c000.snappy.parquet -------------------------------------------------------------------------------- /model/stages/0_StringIndexer_d065dab91d9f/metadata/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /model/stages/0_StringIndexer_d065dab91d9f/metadata/.part-00000.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/0_StringIndexer_d065dab91d9f/metadata/.part-00000.crc -------------------------------------------------------------------------------- /model/stages/0_StringIndexer_d065dab91d9f/metadata/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/0_StringIndexer_d065dab91d9f/metadata/_SUCCESS -------------------------------------------------------------------------------- /model/stages/0_StringIndexer_d065dab91d9f/metadata/part-00000: -------------------------------------------------------------------------------- 1 | {"class":"org.apache.spark.ml.feature.StringIndexerModel","timestamp":1739608796112,"sparkVersion":"3.5.0","uid":"StringIndexer_d065dab91d9f","paramMap":{"stringOrderType":"alphabetAsc","outputCols":["TimeIndex"],"handleInvalid":"keep","inputCols":["Time"]},"defaultParamMap":{"stringOrderType":"frequencyDesc","outputCol":"StringIndexer_d065dab91d9f__output","handleInvalid":"error"}} 2 | 
-------------------------------------------------------------------------------- /model/stages/1_OneHotEncoder_de68f7eb6b38/data/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /model/stages/1_OneHotEncoder_de68f7eb6b38/data/.part-00000-8acdc128-241b-4b22-a551-aeba6a6ca387-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/1_OneHotEncoder_de68f7eb6b38/data/.part-00000-8acdc128-241b-4b22-a551-aeba6a6ca387-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /model/stages/1_OneHotEncoder_de68f7eb6b38/data/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/1_OneHotEncoder_de68f7eb6b38/data/_SUCCESS -------------------------------------------------------------------------------- /model/stages/1_OneHotEncoder_de68f7eb6b38/data/part-00000-8acdc128-241b-4b22-a551-aeba6a6ca387-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/1_OneHotEncoder_de68f7eb6b38/data/part-00000-8acdc128-241b-4b22-a551-aeba6a6ca387-c000.snappy.parquet -------------------------------------------------------------------------------- /model/stages/1_OneHotEncoder_de68f7eb6b38/metadata/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /model/stages/1_OneHotEncoder_de68f7eb6b38/metadata/.part-00000.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/1_OneHotEncoder_de68f7eb6b38/metadata/.part-00000.crc -------------------------------------------------------------------------------- /model/stages/1_OneHotEncoder_de68f7eb6b38/metadata/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/1_OneHotEncoder_de68f7eb6b38/metadata/_SUCCESS -------------------------------------------------------------------------------- /model/stages/1_OneHotEncoder_de68f7eb6b38/metadata/part-00000: -------------------------------------------------------------------------------- 1 | {"class":"org.apache.spark.ml.feature.OneHotEncoderModel","timestamp":1739608798510,"sparkVersion":"3.5.0","uid":"OneHotEncoder_de68f7eb6b38","paramMap":{"handleInvalid":"error","outputCols":["TimeOHE"],"inputCols":["TimeIndex"],"dropLast":false},"defaultParamMap":{"outputCol":"OneHotEncoder_de68f7eb6b38__output","handleInvalid":"error","dropLast":true}} 2 | 
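Note: stages 0 and 1 above work as a pair. The StringIndexer maps each raw Time value to a numeric index (handleInvalid="keep" reserves an extra bucket, so values unseen during training are still indexed at inference time instead of raising an error), and the OneHotEncoder expands that index into a one-hot vector (dropLast=false keeps a slot for every category). A self-contained sketch of the same two stages on toy data, illustrative only and not a repository file:

from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, OneHotEncoder

spark = SparkSession.builder.master("local[*]").appName("encoding-demo").getOrCreate()

train = spark.createDataFrame([("0",), ("1",), ("2",), ("1",)], ["Time"])

# Same parameters as the saved stages: alphabetical ordering, keep unseen values.
indexer = StringIndexer(inputCols=["Time"], outputCols=["TimeIndex"],
                        handleInvalid="keep", stringOrderType="alphabetAsc")
indexed = indexer.fit(train).transform(train)

# dropLast=False keeps one vector slot per category instead of dropping the last.
encoder = OneHotEncoder(inputCols=["TimeIndex"], outputCols=["TimeOHE"], dropLast=False)
encoder.fit(indexed).transform(indexed).show(truncate=False)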
-------------------------------------------------------------------------------- /model/stages/2_Imputer_7b5e57f74a2b/data/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /model/stages/2_Imputer_7b5e57f74a2b/data/.part-00000-3ba2ba2d-997b-482f-a2a7-d39eedb93fad-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/2_Imputer_7b5e57f74a2b/data/.part-00000-3ba2ba2d-997b-482f-a2a7-d39eedb93fad-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /model/stages/2_Imputer_7b5e57f74a2b/data/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/2_Imputer_7b5e57f74a2b/data/_SUCCESS -------------------------------------------------------------------------------- /model/stages/2_Imputer_7b5e57f74a2b/data/part-00000-3ba2ba2d-997b-482f-a2a7-d39eedb93fad-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/2_Imputer_7b5e57f74a2b/data/part-00000-3ba2ba2d-997b-482f-a2a7-d39eedb93fad-c000.snappy.parquet -------------------------------------------------------------------------------- /model/stages/2_Imputer_7b5e57f74a2b/metadata/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /model/stages/2_Imputer_7b5e57f74a2b/metadata/.part-00000.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/2_Imputer_7b5e57f74a2b/metadata/.part-00000.crc -------------------------------------------------------------------------------- /model/stages/2_Imputer_7b5e57f74a2b/metadata/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/2_Imputer_7b5e57f74a2b/metadata/_SUCCESS -------------------------------------------------------------------------------- /model/stages/2_Imputer_7b5e57f74a2b/metadata/part-00000: -------------------------------------------------------------------------------- 1 | 
{"class":"org.apache.spark.ml.feature.ImputerModel","timestamp":1739608799419,"sparkVersion":"3.5.0","uid":"Imputer_7b5e57f74a2b","paramMap":{"strategy":"median","inputCols":["V1","V2","V3","V4","V5","V6","V7","V8","V9","V10","V11","V12","V13","V14","V15","V16","V17","V18","V19","V20","V21","V22","V23","V24","V25","V26","V27"],"outputCols":["V1_imputed","V2_imputed","V3_imputed","V4_imputed","V5_imputed","V6_imputed","V7_imputed","V8_imputed","V9_imputed","V10_imputed","V11_imputed","V12_imputed","V13_imputed","V14_imputed","V15_imputed","V16_imputed","V17_imputed","V18_imputed","V19_imputed","V20_imputed","V21_imputed","V22_imputed","V23_imputed","V24_imputed","V25_imputed","V26_imputed","V27_imputed"]},"defaultParamMap":{"strategy":"mean","relativeError":0.001,"missingValue":"NaN","outputCol":"Imputer_7b5e57f74a2b__output"}} 2 | -------------------------------------------------------------------------------- /model/stages/3_VectorAssembler_441d7d1bea9d/metadata/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /model/stages/3_VectorAssembler_441d7d1bea9d/metadata/.part-00000.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/3_VectorAssembler_441d7d1bea9d/metadata/.part-00000.crc -------------------------------------------------------------------------------- /model/stages/3_VectorAssembler_441d7d1bea9d/metadata/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/3_VectorAssembler_441d7d1bea9d/metadata/_SUCCESS -------------------------------------------------------------------------------- /model/stages/3_VectorAssembler_441d7d1bea9d/metadata/part-00000: -------------------------------------------------------------------------------- 1 | {"class":"org.apache.spark.ml.feature.VectorAssembler","timestamp":1739608800637,"sparkVersion":"3.5.0","uid":"VectorAssembler_441d7d1bea9d","paramMap":{"outputCol":"numerical_features","inputCols":["V1_imputed","V2_imputed","V3_imputed","V4_imputed","V5_imputed","V6_imputed","V7_imputed","V8_imputed","V9_imputed","V10_imputed","V11_imputed","V12_imputed","V13_imputed","V14_imputed","V15_imputed","V16_imputed","V17_imputed","V18_imputed","V19_imputed","V20_imputed","V21_imputed","V22_imputed","V23_imputed","V24_imputed","V25_imputed","V26_imputed","V27_imputed"]},"defaultParamMap":{"outputCol":"VectorAssembler_441d7d1bea9d__output","handleInvalid":"error"}} 2 | -------------------------------------------------------------------------------- /model/stages/4_StandardScaler_5dbb14844ce4/data/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /model/stages/4_StandardScaler_5dbb14844ce4/data/.part-00000-27e8b4bc-d589-45cb-9991-def4d3b88a50-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/4_StandardScaler_5dbb14844ce4/data/.part-00000-27e8b4bc-d589-45cb-9991-def4d3b88a50-c000.snappy.parquet.crc
--------------------------------------------------------------------------------
/model/stages/4_StandardScaler_5dbb14844ce4/data/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/4_StandardScaler_5dbb14844ce4/data/_SUCCESS
--------------------------------------------------------------------------------
/model/stages/4_StandardScaler_5dbb14844ce4/data/part-00000-27e8b4bc-d589-45cb-9991-def4d3b88a50-c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/4_StandardScaler_5dbb14844ce4/data/part-00000-27e8b4bc-d589-45cb-9991-def4d3b88a50-c000.snappy.parquet
--------------------------------------------------------------------------------
/model/stages/4_StandardScaler_5dbb14844ce4/metadata/._SUCCESS.crc:
--------------------------------------------------------------------------------
crc
--------------------------------------------------------------------------------
/model/stages/4_StandardScaler_5dbb14844ce4/metadata/.part-00000.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/4_StandardScaler_5dbb14844ce4/metadata/.part-00000.crc
--------------------------------------------------------------------------------
/model/stages/4_StandardScaler_5dbb14844ce4/metadata/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/4_StandardScaler_5dbb14844ce4/metadata/_SUCCESS
--------------------------------------------------------------------------------
/model/stages/4_StandardScaler_5dbb14844ce4/metadata/part-00000:
--------------------------------------------------------------------------------
{"class":"org.apache.spark.ml.feature.StandardScalerModel","timestamp":1739608801209,"sparkVersion":"3.5.0","uid":"StandardScaler_5dbb14844ce4","paramMap":{"inputCol":"numerical_features","outputCol":"numerical_features_scaled"},"defaultParamMap":{"withMean":false,"outputCol":"StandardScaler_5dbb14844ce4__output","withStd":true}}
--------------------------------------------------------------------------------
/model/stages/5_VectorAssembler_a9434a6047bc/metadata/._SUCCESS.crc:
--------------------------------------------------------------------------------
crc
--------------------------------------------------------------------------------
/model/stages/5_VectorAssembler_a9434a6047bc/metadata/.part-00000.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/5_VectorAssembler_a9434a6047bc/metadata/.part-00000.crc
--------------------------------------------------------------------------------
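The metadata part-00000 files above are what Spark reads back when the pipeline is restored: each stage records its class, uid, and paramMap. A minimal sketch of inspecting them from Python (assumes a SparkSession with the SynapseML package on the classpath, as configured in train.py below, since stage 6 is a LightGBM model):

from pyspark.ml import PipelineModel

from const import ROOT_PATH

# Restore the persisted pipeline and list its stages
# (0_StringIndexer ... 6_LightGBMClassifier).
model = PipelineModel.load(f"{ROOT_PATH}/model")
for stage in model.stages:
    print(stage.uid, stage.extractParamMap())
--------------------------------------------------------------------------------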
/model/stages/5_VectorAssembler_a9434a6047bc/metadata/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/5_VectorAssembler_a9434a6047bc/metadata/_SUCCESS -------------------------------------------------------------------------------- /model/stages/5_VectorAssembler_a9434a6047bc/metadata/part-00000: -------------------------------------------------------------------------------- 1 | {"class":"org.apache.spark.ml.feature.VectorAssembler","timestamp":1739608802396,"sparkVersion":"3.5.0","uid":"VectorAssembler_a9434a6047bc","paramMap":{"inputCols":["TimeOHE","numerical_features"],"outputCol":"features"},"defaultParamMap":{"outputCol":"VectorAssembler_a9434a6047bc__output","handleInvalid":"error"}} 2 | -------------------------------------------------------------------------------- /model/stages/6_LightGBMClassifier_788cfead8c52/complexParams/.lightGBMBooster.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/6_LightGBMClassifier_788cfead8c52/complexParams/.lightGBMBooster.crc -------------------------------------------------------------------------------- /model/stages/6_LightGBMClassifier_788cfead8c52/complexParams/lightGBMBooster: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/6_LightGBMClassifier_788cfead8c52/complexParams/lightGBMBooster -------------------------------------------------------------------------------- /model/stages/6_LightGBMClassifier_788cfead8c52/metadata/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /model/stages/6_LightGBMClassifier_788cfead8c52/metadata/.part-00000.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/6_LightGBMClassifier_788cfead8c52/metadata/.part-00000.crc -------------------------------------------------------------------------------- /model/stages/6_LightGBMClassifier_788cfead8c52/metadata/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/model/stages/6_LightGBMClassifier_788cfead8c52/metadata/_SUCCESS -------------------------------------------------------------------------------- /model/stages/6_LightGBMClassifier_788cfead8c52/metadata/part-00000: -------------------------------------------------------------------------------- 1 | 
{"class":"com.microsoft.azure.synapse.ml.lightgbm.LightGBMClassificationModel","timestamp":1739608802925,"sparkVersion":"3.5.0","uid":"LightGBMClassifier_788cfead8c52","paramMap":{"featuresCol":"features","labelCol":"Class","numIterations":-1,"actualNumClasses":1,"featuresShapCol":"","startIteration":0,"probabilityCol":"probability","rawPredictionCol":"rawPrediction","predictionCol":"prediction","leafPredictionCol":"","predictDisableShapeCheck":false},"defaultParamMap":{"featuresCol":"features","labelCol":"label","numIterations":-1,"featuresShapCol":"","startIteration":0,"probabilityCol":"probability","rawPredictionCol":"rawPrediction","predictionCol":"prediction","leafPredictionCol":"","predictDisableShapeCheck":false},"complexParamLocs":{"lightGBMBooster":"complexParams/lightGBMBooster"}} 2 | -------------------------------------------------------------------------------- /producer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | 4 | import pandas as pd 5 | from datetime import datetime 6 | 7 | from kafka import KafkaAdminClient, KafkaProducer 8 | from kafka.admin import NewTopic 9 | 10 | from const import ROOT_PATH 11 | 12 | 13 | def create_topic(admin, topic_name): 14 | # Create topic if not exists 15 | try: 16 | # Create Kafka topic 17 | topic = NewTopic(name=topic_name, num_partitions=1, 18 | replication_factor=1) 19 | admin.create_topics([topic]) 20 | print(f"A new topic {topic_name} has been created!") 21 | except: 22 | print(f"Topic {topic_name} already exists. Skipping creation!") 23 | pass 24 | 25 | 26 | def create_streams(topic_name: str, servers): 27 | producer = None 28 | admin = None 29 | for _ in range(10): 30 | try: 31 | producer = KafkaProducer(bootstrap_servers=servers) 32 | admin = KafkaAdminClient(bootstrap_servers=servers) 33 | print("SUCCESS: instantiated Kafka admin and producer") 34 | break 35 | except Exception as e: 36 | print( 37 | f"Trying to instantiate admin and producer with bootstrap servers {servers} with error {e}" 38 | ) 39 | time.sleep(10) 40 | pass 41 | 42 | df = pd.read_csv(f"{ROOT_PATH}/creditcard.csv") 43 | df = df.drop(columns="Class") 44 | records = df.to_dict(orient="records") 45 | 46 | for record in records: 47 | producer.send( 48 | topic_name, 49 | json.dumps(record).encode("utf-8") 50 | ) 51 | print(record) 52 | time.sleep(5) 53 | 54 | 55 | if __name__ == '__main__': 56 | create_streams(topic_name="raw", servers="localhost:9092", ) 57 | -------------------------------------------------------------------------------- /prometheus/prometheus.yml: -------------------------------------------------------------------------------- 1 | # my global config 2 | global: 3 | scrape_interval: 10s # Set the scrape interval to every 15 seconds. Default is every 1 minute. 4 | evaluation_interval: 10s # Evaluate rules every 15 seconds. The default is every 1 minute. 5 | # scrape_timeout is set to the global default (10s). 6 | external_labels: 7 | monitor: 'Spark-monitoring' 8 | 9 | # Alertmanager configuration 10 | alerting: 11 | alertmanagers: 12 | - static_configs: 13 | - targets: 14 | # - alertmanager:9093 15 | 16 | # Load rules once and periodically evaluate them according to the global 'evaluation_interval'. 17 | rule_files: 18 | # - "first_rules.yml" 19 | # - "second_rules.yml" 20 | # - 'prometheus.yml' 21 | # A scrape configuration containing exactly one endpoint to scrape: 22 | # Here it's Prometheus itself. 
/prometheus/prometheus.yml:
--------------------------------------------------------------------------------
# my global config
global:
  scrape_interval: 10s # Scrape targets every 10 seconds; the default is every 1 minute.
  evaluation_interval: 10s # Evaluate rules every 10 seconds; the default is every 1 minute.
  # scrape_timeout is set to the global default (10s).
  external_labels:
    monitor: 'Spark-monitoring'

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
          # - alertmanager:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"

# A scrape configuration containing the endpoints to scrape.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.

  ##############
  # Prometheus #
  ##############
  - job_name: 'prometheus'
    scrape_interval: 10s
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ['localhost:9090']

  #################
  # Node-exporter #
  #################
  - job_name: 'node-exporter'
    scrape_interval: 10s
    static_configs:
      - targets: ['node-exporter:9100']
        labels:
          group: 'spark'
          instance: 'node-exporter'

  ################
  # Spark Master #
  ################
  - job_name: 'spark-master'
    scrape_interval: 10s
    metrics_path: '/metrics/master/prometheus'
    static_configs:
      - targets: ['spark:8080']
        labels:
          group: 'spark'
          instance: 'master'

  ################
  # Spark Worker #
  ################
  - job_name: 'spark-worker'
    scrape_interval: 10s
    metrics_path: '/metrics/prometheus'
    static_configs:
      - targets: ['spark:8081']
        labels:
          group: 'spark'
          instance: 'worker'

  ######################
  # Spark Applications #
  ######################
  - job_name: 'spark-applications'
    scrape_interval: 10s
    metrics_path: '/metrics/applications/prometheus'
    static_configs:
      - targets: ['spark:8080']
        labels:
          group: 'spark'
          instance: 'applications'

  ################
  # Spark Driver #
  ################
  - job_name: 'spark-driver'
    scrape_interval: 10s
    metrics_path: '/metrics/prometheus/'
    static_configs:
      - targets: ['localhost:4040']
        labels:
          group: 'spark'
          instance: 'driver'
--------------------------------------------------------------------------------
/prometheus/prometheus_db/01JM1D7YTFGV5Q46F581Y59V6N/chunks/000001:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/prometheus/prometheus_db/01JM1D7YTFGV5Q46F581Y59V6N/chunks/000001
--------------------------------------------------------------------------------
/prometheus/prometheus_db/01JM1D7YTFGV5Q46F581Y59V6N/index:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/prometheus/prometheus_db/01JM1D7YTFGV5Q46F581Y59V6N/index
--------------------------------------------------------------------------------
/prometheus/prometheus_db/01JM1D7YTFGV5Q46F581Y59V6N/meta.json:
--------------------------------------------------------------------------------
1 | {
2 |   "ulid": "01JM1D7YTFGV5Q46F581Y59V6N",
3 |   "minTime": 1739451882655,
4 |   "maxTime": 1739491200000,
5 |   "stats": {
6 |     "numSamples": 627172,
7 |     "numSeries": 1853,
8 |     "numChunks": 7346
9 |   },
10 |   "compaction": {
11 |     "level": 3,
12 |     "sources": [
13 |       "01JM01E7GVDS62213E6J88CZH6",
14 |       "01JM04FNYWFKPAQFK4W1HWE3DR",
15 |       "01JM0BE48DXCMSRDTPW5ZM4Z64",
16 |       "01JM0JFQAJDEGC8W1GQ3QSBJ9B",
17 |       "01JM0SAGNRWVTCP5R0ETHGJC93",
18 |       "01JM10CCCRWNY4NH1XXF230WC2"
19 |     ],
20 |     "parents": [
21 |       {
22 |         "ulid": "01JM0SAGY1409W3PD84G1R08KH",
23 |         "minTime": 1739451882655,
24 |         "maxTime": 1739469600000
25 |       },
26 |       {
27 |         "ulid": "01JM16C7KHYRAPEYR3SRTMVNZ9",
28 |         "minTime":
1739469600000, 29 | "maxTime": 1739491200000 30 | } 31 | ] 32 | }, 33 | "version": 1 34 | } -------------------------------------------------------------------------------- /prometheus/prometheus_db/01JM1D7YTFGV5Q46F581Y59V6N/tombstones: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/prometheus/prometheus_db/01JM1D7YTFGV5Q46F581Y59V6N/tombstones -------------------------------------------------------------------------------- /prometheus/prometheus_db/01JM4N0Q121PZKCKNVYCSJCVSZ/chunks/000001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/prometheus/prometheus_db/01JM4N0Q121PZKCKNVYCSJCVSZ/chunks/000001 -------------------------------------------------------------------------------- /prometheus/prometheus_db/01JM4N0Q121PZKCKNVYCSJCVSZ/index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/prometheus/prometheus_db/01JM4N0Q121PZKCKNVYCSJCVSZ/index -------------------------------------------------------------------------------- /prometheus/prometheus_db/01JM4N0Q121PZKCKNVYCSJCVSZ/meta.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/prometheus/prometheus_db/01JM4N0Q121PZKCKNVYCSJCVSZ/meta.json -------------------------------------------------------------------------------- /prometheus/prometheus_db/01JM4N0Q121PZKCKNVYCSJCVSZ/tombstones: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/prometheus/prometheus_db/01JM4N0Q121PZKCKNVYCSJCVSZ/tombstones -------------------------------------------------------------------------------- /prometheus/prometheus_db/01JM4V3JR55NWKNGPRKH0JB4D5/chunks/000001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/prometheus/prometheus_db/01JM4V3JR55NWKNGPRKH0JB4D5/chunks/000001 -------------------------------------------------------------------------------- /prometheus/prometheus_db/01JM4V3JR55NWKNGPRKH0JB4D5/index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/prometheus/prometheus_db/01JM4V3JR55NWKNGPRKH0JB4D5/index -------------------------------------------------------------------------------- /prometheus/prometheus_db/01JM4V3JR55NWKNGPRKH0JB4D5/meta.json: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/prometheus/prometheus_db/01JM4V3JR55NWKNGPRKH0JB4D5/meta.json -------------------------------------------------------------------------------- /prometheus/prometheus_db/01JM4V3JR55NWKNGPRKH0JB4D5/tombstones: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/prometheus/prometheus_db/01JM4V3JR55NWKNGPRKH0JB4D5/tombstones -------------------------------------------------------------------------------- /prometheus/prometheus_db/01JM4V3K06JFK35VSGTMKRYC65/chunks/000001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/prometheus/prometheus_db/01JM4V3K06JFK35VSGTMKRYC65/chunks/000001 -------------------------------------------------------------------------------- /prometheus/prometheus_db/01JM4V3K06JFK35VSGTMKRYC65/index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/prometheus/prometheus_db/01JM4V3K06JFK35VSGTMKRYC65/index -------------------------------------------------------------------------------- /prometheus/prometheus_db/01JM4V3K06JFK35VSGTMKRYC65/meta.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/prometheus/prometheus_db/01JM4V3K06JFK35VSGTMKRYC65/meta.json -------------------------------------------------------------------------------- /prometheus/prometheus_db/01JM4V3K06JFK35VSGTMKRYC65/tombstones: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/prometheus/prometheus_db/01JM4V3K06JFK35VSGTMKRYC65/tombstones -------------------------------------------------------------------------------- /prometheus/prometheus_db/chunks_head/000002: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/prometheus/prometheus_db/chunks_head/000002 -------------------------------------------------------------------------------- /prometheus/prometheus_db/chunks_head/000003: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/prometheus/prometheus_db/chunks_head/000003 -------------------------------------------------------------------------------- /prometheus/prometheus_db/lock: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/prometheus/prometheus_db/lock
--------------------------------------------------------------------------------
/prometheus/prometheus_db/queries.active:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/prometheus/prometheus_db/queries.active
--------------------------------------------------------------------------------
/prometheus/prometheus_db/wal/00000034:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/prometheus/prometheus_db/wal/00000034
--------------------------------------------------------------------------------
/prometheus/prometheus_db/wal/00000035:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/prometheus/prometheus_db/wal/00000035
--------------------------------------------------------------------------------
/prometheus/prometheus_db/wal/00000036:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/prometheus/prometheus_db/wal/00000036
--------------------------------------------------------------------------------
/prometheus/prometheus_db/wal/checkpoint.00000033/00000000:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Full-Stack-Data-Science/real-time-ml-inference-with-spark-streaming-and-kafka/5e9628600e55eae1e99e45fee52dbc937c5468ad/prometheus/prometheus_db/wal/checkpoint.00000033/00000000
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
pyspark==3.5.0
pydeequ==1.0.1
python-dotenv==1.0.0
nltk==3.8.1
# pinned to the SynapseML version loaded in train.py via spark.jars.packages
synapseml==1.0.2
# imported by producer.py
pandas
kafka-python
# used by the test suite
chispa
pytest
--------------------------------------------------------------------------------
/spec.py:
--------------------------------------------------------------------------------
import pyspark.sql.types as T


# Schema of the raw Kafka messages. The producer also sends V28 and Amount,
# but the model only consumes Time and V1-V27, so those fields are omitted.
input_schema = T.StructType([
    T.StructField('Time', T.StringType(), True),
    T.StructField('V1', T.DoubleType(), True),
    T.StructField('V2', T.DoubleType(), True),
    T.StructField('V3', T.DoubleType(), True),
    T.StructField('V4', T.DoubleType(), True),
    T.StructField('V5', T.DoubleType(), True),
    T.StructField('V6', T.DoubleType(), True),
    T.StructField('V7', T.DoubleType(), True),
    T.StructField('V8', T.DoubleType(), True),
    T.StructField('V9', T.DoubleType(), True),
    T.StructField('V10', T.DoubleType(), True),
    T.StructField('V11', T.DoubleType(), True),
    T.StructField('V12', T.DoubleType(), True),
    T.StructField('V13', T.DoubleType(), True),
    T.StructField('V14', T.DoubleType(), True),
    T.StructField('V15', T.DoubleType(), True),
    T.StructField('V16', T.DoubleType(), True),
    T.StructField('V17', T.DoubleType(), True),
    T.StructField('V18', T.DoubleType(), True),
    T.StructField('V19', T.DoubleType(), True),
    T.StructField('V20', T.DoubleType(), True),
    T.StructField('V21', T.DoubleType(), True),
    T.StructField('V22', T.DoubleType(), True),
    T.StructField('V23', T.DoubleType(), True),
    T.StructField('V24', T.DoubleType(), True),
    T.StructField('V25', T.DoubleType(), True),
    T.StructField('V26', T.DoubleType(), True),
    T.StructField('V27', T.DoubleType(), True),
])

# Schema of the scored records written back out by the inference jobs.
output_schema = T.StructType([
    T.StructField('Time', T.StringType(), True),
    T.StructField('predicted_prob', T.DoubleType(), True)
])
--------------------------------------------------------------------------------
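input_schema is what the streaming job uses to turn raw Kafka bytes into typed columns. A minimal sketch of that parsing step (assumes the spark-sql-kafka connector is on the classpath; the topic and broker names match producer.py, and the full job lives in inference/stream_inference.py, not shown here):

import pyspark.sql.functions as F
from pyspark.sql import SparkSession

from spec import input_schema

spark = SparkSession.builder.appName("parse-raw-events").getOrCreate()

# Each Kafka record's value is a JSON blob; unpack it into typed columns.
raw = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "raw")
    .load()
)
events = (
    raw.select(F.from_json(F.col("value").cast("string"), input_schema).alias("event"))
    .select("event.*")
)

# Print parsed events to the console as a quick smoke test.
query = events.writeStream.format("console").start()
--------------------------------------------------------------------------------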
/tests/conftest.py:
--------------------------------------------------------------------------------
from datetime import datetime

import pytest
from unittest.mock import Mock, MagicMock
from pyspark.sql.session import SparkSession
from pyspark.sql.dataframe import DataFrame
import pyspark.sql.types as T


@pytest.fixture
def spark() -> SparkSession:
    spark = (
        SparkSession.builder.master("local[1]")
        .appName("local-tests")
        .config("spark.executor.cores", "1")
        .config("spark.executor.instances", "1")
        .config("spark.sql.shuffle.partitions", "1")
        .config("spark.driver.bindAddress", "127.0.0.1")
        .getOrCreate()
    )
    yield spark
    spark.stop()


@pytest.fixture
def sample_df(spark) -> DataFrame:
    data = [
        ("Walter", 32, "Germany", 10000.0, datetime(2023, 1, 1, 0, 1), 'segment1', '20230101', 0),
        ("Nic", 12, "England", 2.0, datetime(2023, 1, 1, 0, 2), 'segment1', '20230101', 0)
    ]

    # A StructType is a list of StructFields, each carrying a concrete data
    # type that matches the tuples above.
    schema = T.StructType(
        [
            T.StructField("name", T.StringType(), False),
            T.StructField("age", T.IntegerType(), False),
            T.StructField("country", T.StringType(), False),
            T.StructField("salary", T.DoubleType(), False),
            T.StructField("time", T.TimestampType(), False),
            T.StructField("segment", T.StringType(), False),
            T.StructField("date", T.StringType(), False),
            T.StructField("hour", T.IntegerType(), False),
        ]
    )
    return spark.createDataFrame(data, schema)


@pytest.fixture
def spark_mock(sample_df):
    # One Mock stands in for the whole fluent chain: spark.read/spark.write,
    # .format().option().mode() etc. all return the mock itself, and .load()
    # finally yields the sample DataFrame.
    spark_mock = Mock()
    type(spark_mock).write = spark_mock
    type(spark_mock).read = spark_mock
    spark_mock.table.return_value = spark_mock
    spark_mock.format.return_value = spark_mock
    spark_mock.option.return_value = spark_mock
    spark_mock.options.return_value = spark_mock
    spark_mock.mode.return_value = spark_mock
    spark_mock.save.return_value = None
    spark_mock.load.return_value = sample_df
    return spark_mock


@pytest.fixture
def schema_mock():
    return MagicMock()
--------------------------------------------------------------------------------
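The spark_mock fixture above collapses Spark's fluent reader/writer chain into a single mock, so I/O-heavy code can be tested without touching real storage. A hypothetical sketch of how a test might use it (the scenario and names here are illustrative, not code from this repo):

def test_writer_uses_parquet(spark_mock):
    # Code under test would receive spark_mock in place of a SparkSession
    # and call the usual fluent chain; every call is recorded by the mock.
    spark_mock.write.format("parquet").mode("overwrite").save("/tmp/scores")

    spark_mock.format.assert_called_once_with("parquet")
    spark_mock.save.assert_called_once()
--------------------------------------------------------------------------------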
/tests/test_inference.py:
--------------------------------------------------------------------------------
from chispa.dataframe_comparer import assert_df_equality

from inference.base import Inference


def test_preprocess_data_passthrough(spark):
    # Each row pairs a raw name with its cleaned counterpart;
    # preprocess_data is expected to leave both columns unchanged.
    input_data = [
        ("jo&&se", "jose"),
        ("**li**", "li"),
        ("#::luisa", "luisa"),
        (None, None)
    ]
    expected_data = [
        ("jo&&se", "jose"),
        ("**li**", "li"),
        ("#::luisa", "luisa"),
        (None, None)
    ]

    df = spark.createDataFrame(input_data, ["name", "expected_name"])
    actual_df = Inference.preprocess_data(df)

    expected_df = spark.createDataFrame(expected_data, ["name", "expected_name"])
    assert_df_equality(actual_df, expected_df)
--------------------------------------------------------------------------------
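With chispa and pytest installed (both listed in requirements.txt above), the suite runs against the local[1] session defined in conftest.py:

python -m pytest tests/
--------------------------------------------------------------------------------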
/train.py:
--------------------------------------------------------------------------------
import pyspark.sql.types as T
import pyspark.sql.functions as F
from synapse.ml.lightgbm import LightGBMClassifier
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import (StringIndexer, OneHotEncoder, VectorAssembler,
                                Imputer, StandardScaler)

# Local modules
from const import ROOT_PATH


def train(dataset):
    spark = SparkSession\
        .builder\
        .master('local[*]')\
        .appName("train_model") \
        .config("spark.jars.packages", "com.microsoft.azure:synapseml_2.12:1.0.2")\
        .getOrCreate()

    df = spark.read.csv(dataset, header=True, inferSchema=True)
    df = df.withColumn("Class", F.col("Class").astype(T.IntegerType()))

    # Downsample to a small, balanced training set (100 fraud / 100 non-fraud)
    fraud_df = df.filter(F.col("Class") == 1).limit(100)
    non_fraud_df = df.filter(F.col("Class") == 0).limit(100)
    train_df = fraud_df.union(non_fraud_df)

    numerical_cols = ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9',
                      'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17',
                      'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25',
                      'V26', 'V27']
    categorical_cols = ['Time']
    label_column_name = 'Class'

    # Convert categorical string columns into numerical indices
    index_cols = [c + 'Index' for c in categorical_cols]
    string_indexer = StringIndexer(
        inputCols=categorical_cols,
        outputCols=index_cols,
        stringOrderType='alphabetAsc',
        handleInvalid='keep'
    )

    # Apply one-hot encoding to the integer indices
    ohe_cols = [c + 'OHE' for c in categorical_cols]
    one_hot_encoder = OneHotEncoder(
        inputCols=index_cols,
        outputCols=ohe_cols,
        handleInvalid='error',
        dropLast=False
    )

    # Imputation transformer for completing missing values
    imputer = Imputer(
        strategy='median',
        inputCols=numerical_cols,
        outputCols=[c + "_imputed" for c in numerical_cols]
    )

    # Combine numerical columns into a single vector
    vec_assembler = VectorAssembler(
        inputCols=[c + "_imputed" for c in numerical_cols],
        outputCol='numerical_features'
    )

    # Scale numerical features
    standard_scaler = StandardScaler(
        inputCol='numerical_features',
        outputCol='numerical_features_scaled'
    )

    # Combine one-hot encoded and scaled numerical features. The scaler's
    # output column is fed forward here; the raw 'numerical_features' vector
    # exists only as the scaler's input.
    assembler_cols = ohe_cols + ['numerical_features_scaled']
    vec_assembler2 = VectorAssembler(
        inputCols=assembler_cols,
        outputCol='features'
    )

    lgb_classifier = LightGBMClassifier(
        featuresCol="features",
        labelCol=label_column_name
    )

    pipeline = Pipeline(
        stages=[
            string_indexer,
            one_hot_encoder,
            imputer,
            vec_assembler,
            standard_scaler,
            vec_assembler2,
            lgb_classifier
        ]
    )

    model = pipeline.fit(train_df)

    # Save the trained model to a file
    model.write().overwrite().save(f"{ROOT_PATH}/model")

    # Reload the saved model and smoke-test it on the training data
    pipelineModel = PipelineModel.load(f"{ROOT_PATH}/model")
    df = pipelineModel.transform(train_df)
    df.show()


if __name__ == '__main__':
    train(f"{ROOT_PATH}/creditcard.csv")
--------------------------------------------------------------------------------
/udfs.py:
--------------------------------------------------------------------------------
import pyspark.sql.types as T
import pyspark.sql.functions as F
from pyspark.ml.linalg import DenseVector


@F.udf(T.FloatType())
def extract_predicted_prob(v: DenseVector) -> float:
    # The classifier's probability column is a [P(class 0), P(class 1)]
    # vector; return the positive-class (fraud) probability as a plain float.
    if v is None:
        return None
    return float(v[1])
--------------------------------------------------------------------------------
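Tying the pieces together, the UDF above is how a scoring job can project the LightGBM probability vector into the two columns of output_schema. A minimal sketch (assumes the model saved by train.py and a DataFrame parsed with input_schema; the actual wiring lives in inference/stream_inference.py and inference/batch_inference.py, not shown here):

from pyspark.ml import PipelineModel

from const import ROOT_PATH
from udfs import extract_predicted_prob

model = PipelineModel.load(f"{ROOT_PATH}/model")

def score(batch_df):
    # Run the full pipeline, then keep only the output_schema columns.
    scored = model.transform(batch_df)
    return scored.select(
        "Time",
        extract_predicted_prob("probability").alias("predicted_prob"),
    )
--------------------------------------------------------------------------------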