├── .gitignore ├── API-Ingest ├── .vscode │ └── settings.json ├── Kafka consumer │ └── consumer.py ├── Postman │ └── IngestAPI-Test.postman_collection.json ├── app │ └── main.py ├── build command.txt ├── dockerfile └── requirements.txt ├── ApacheSpark ├── .ipynb_checkpoints │ ├── 01-streaming-kafka-src-dst-checkpoint.ipynb │ └── 02-streaming-kafka-src-dst-mongodb-checkpoint.ipynb ├── 01-streaming-kafka-src-dst.ipynb └── 02-streaming-kafka-src-dst-mongodb.ipynb ├── Kafka Commands.txt ├── README.md ├── Streamlit ├── .vscode │ └── settings.json └── streamlitapp.py ├── archive ├── docker-compose-kafka-spark-confluent.yml └── docker-compose-kafka_old.yml ├── client ├── .vscode │ └── settings.json ├── api-client.py └── transformer.py ├── docker helpful commands.txt ├── docker-compose-kafka-spark-mongodb.yml ├── docker-compose-kafka-spark.yml └── docker-compose-kafka.yml /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.rsuser 8 | *.suo 9 | *.user 10 | *.userosscache 11 | *.sln.docstates 12 | 13 | # User-specific files (MonoDevelop/Xamarin Studio) 14 | *.userprefs 15 | 16 | # Mono auto generated files 17 | mono_crash.* 18 | 19 | # Build results 20 | [Dd]ebug/ 21 | [Dd]ebugPublic/ 22 | [Rr]elease/ 23 | [Rr]eleases/ 24 | x64/ 25 | x86/ 26 | [Aa][Rr][Mm]/ 27 | [Aa][Rr][Mm]64/ 28 | bld/ 29 | [Bb]in/ 30 | [Oo]bj/ 31 | [Ll]og/ 32 | [Ll]ogs/ 33 | 34 | # Visual Studio 2015/2017 cache/options directory 35 | .vs/ 36 | # Uncomment if you have tasks that create the project's static files in wwwroot 37 | #wwwroot/ 38 | 39 | # Visual Studio 2017 auto generated files 40 | Generated\ Files/ 41 | 42 | # MSTest test Results 43 | [Tt]est[Rr]esult*/ 44 | [Bb]uild[Ll]og.* 45 | 46 | # NUnit 47 | *.VisualState.xml 48 | TestResult.xml 49 | nunit-*.xml 50 | 51 | # Build Results of an ATL Project 52 | [Dd]ebugPS/ 53 | [Rr]eleasePS/ 54 | dlldata.c 55 | 56 | # Benchmark Results 57 | BenchmarkDotNet.Artifacts/ 58 | 59 | # .NET Core 60 | project.lock.json 61 | project.fragment.lock.json 62 | artifacts/ 63 | 64 | # StyleCop 65 | StyleCopReport.xml 66 | 67 | # Files built by Visual Studio 68 | *_i.c 69 | *_p.c 70 | *_h.h 71 | *.ilk 72 | *.meta 73 | *.obj 74 | *.iobj 75 | *.pch 76 | *.pdb 77 | *.ipdb 78 | *.pgc 79 | *.pgd 80 | *.rsp 81 | *.sbr 82 | *.tlb 83 | *.tli 84 | *.tlh 85 | *.tmp 86 | *.tmp_proj 87 | *_wpftmp.csproj 88 | *.log 89 | *.vspscc 90 | *.vssscc 91 | .builds 92 | *.pidb 93 | *.svclog 94 | *.scc 95 | 96 | # Chutzpah Test files 97 | _Chutzpah* 98 | 99 | # Visual C++ cache files 100 | ipch/ 101 | *.aps 102 | *.ncb 103 | *.opendb 104 | *.opensdf 105 | *.sdf 106 | *.cachefile 107 | *.VC.db 108 | *.VC.VC.opendb 109 | 110 | # Visual Studio profiler 111 | *.psess 112 | *.vsp 113 | *.vspx 114 | *.sap 115 | 116 | # Visual Studio Trace Files 117 | *.e2e 118 | 119 | # TFS 2012 Local Workspace 120 | $tf/ 121 | 122 | # Guidance Automation Toolkit 123 | *.gpState 124 | 125 | # ReSharper is a .NET coding add-in 126 | _ReSharper*/ 127 | *.[Rr]e[Ss]harper 128 | *.DotSettings.user 129 | 130 | # TeamCity is a build add-in 131 | _TeamCity* 132 | 133 | # DotCover is a Code Coverage Tool 134 | *.dotCover 135 | 136 | # AxoCover is a Code Coverage Tool 137 | .axoCover/* 138 | !.axoCover/settings.json 139 | 140 | # Visual Studio code coverage 
results 141 | *.coverage 142 | *.coveragexml 143 | 144 | # NCrunch 145 | _NCrunch_* 146 | .*crunch*.local.xml 147 | nCrunchTemp_* 148 | 149 | # MightyMoose 150 | *.mm.* 151 | AutoTest.Net/ 152 | 153 | # Web workbench (sass) 154 | .sass-cache/ 155 | 156 | # Installshield output folder 157 | [Ee]xpress/ 158 | 159 | # DocProject is a documentation generator add-in 160 | DocProject/buildhelp/ 161 | DocProject/Help/*.HxT 162 | DocProject/Help/*.HxC 163 | DocProject/Help/*.hhc 164 | DocProject/Help/*.hhk 165 | DocProject/Help/*.hhp 166 | DocProject/Help/Html2 167 | DocProject/Help/html 168 | 169 | # Click-Once directory 170 | publish/ 171 | 172 | # Publish Web Output 173 | *.[Pp]ublish.xml 174 | *.azurePubxml 175 | # Note: Comment the next line if you want to checkin your web deploy settings, 176 | # but database connection strings (with potential passwords) will be unencrypted 177 | *.pubxml 178 | *.publishproj 179 | 180 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 181 | # checkin your Azure Web App publish settings, but sensitive information contained 182 | # in these scripts will be unencrypted 183 | PublishScripts/ 184 | 185 | # NuGet Packages 186 | *.nupkg 187 | # NuGet Symbol Packages 188 | *.snupkg 189 | # The packages folder can be ignored because of Package Restore 190 | **/[Pp]ackages/* 191 | # except build/, which is used as an MSBuild target. 192 | !**/[Pp]ackages/build/ 193 | # Uncomment if necessary however generally it will be regenerated when needed 194 | #!**/[Pp]ackages/repositories.config 195 | # NuGet v3's project.json files produces more ignorable files 196 | *.nuget.props 197 | *.nuget.targets 198 | 199 | # Microsoft Azure Build Output 200 | csx/ 201 | *.build.csdef 202 | 203 | # Microsoft Azure Emulator 204 | ecf/ 205 | rcf/ 206 | 207 | # Windows Store app package directories and files 208 | AppPackages/ 209 | BundleArtifacts/ 210 | Package.StoreAssociation.xml 211 | _pkginfo.txt 212 | *.appx 213 | *.appxbundle 214 | *.appxupload 215 | 216 | # Visual Studio cache files 217 | # files ending in .cache can be ignored 218 | *.[Cc]ache 219 | # but keep track of directories ending in .cache 220 | !?*.[Cc]ache/ 221 | 222 | # Others 223 | ClientBin/ 224 | ~$* 225 | *~ 226 | *.dbmdl 227 | *.dbproj.schemaview 228 | *.jfm 229 | *.pfx 230 | *.publishsettings 231 | orleans.codegen.cs 232 | 233 | # Including strong name files can present a security risk 234 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 235 | #*.snk 236 | 237 | # Since there are multiple workflows, uncomment next line to ignore bower_components 238 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 239 | #bower_components/ 240 | 241 | # RIA/Silverlight projects 242 | Generated_Code/ 243 | 244 | # Backup & report files from converting an old project file 245 | # to a newer Visual Studio version. 
Backup files are not needed, 246 | # because we have git ;-) 247 | _UpgradeReport_Files/ 248 | Backup*/ 249 | UpgradeLog*.XML 250 | UpgradeLog*.htm 251 | ServiceFabricBackup/ 252 | *.rptproj.bak 253 | 254 | # SQL Server files 255 | *.mdf 256 | *.ldf 257 | *.ndf 258 | 259 | # Business Intelligence projects 260 | *.rdl.data 261 | *.bim.layout 262 | *.bim_*.settings 263 | *.rptproj.rsuser 264 | *- [Bb]ackup.rdl 265 | *- [Bb]ackup ([0-9]).rdl 266 | *- [Bb]ackup ([0-9][0-9]).rdl 267 | 268 | # Microsoft Fakes 269 | FakesAssemblies/ 270 | 271 | # GhostDoc plugin setting file 272 | *.GhostDoc.xml 273 | 274 | # Node.js Tools for Visual Studio 275 | .ntvs_analysis.dat 276 | node_modules/ 277 | 278 | # Visual Studio 6 build log 279 | *.plg 280 | 281 | # Visual Studio 6 workspace options file 282 | *.opt 283 | 284 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 285 | *.vbw 286 | 287 | # Visual Studio LightSwitch build output 288 | **/*.HTMLClient/GeneratedArtifacts 289 | **/*.DesktopClient/GeneratedArtifacts 290 | **/*.DesktopClient/ModelManifest.xml 291 | **/*.Server/GeneratedArtifacts 292 | **/*.Server/ModelManifest.xml 293 | _Pvt_Extensions 294 | 295 | # Paket dependency manager 296 | .paket/paket.exe 297 | paket-files/ 298 | 299 | # FAKE - F# Make 300 | .fake/ 301 | 302 | # CodeRush personal settings 303 | .cr/personal 304 | 305 | # Python Tools for Visual Studio (PTVS) 306 | __pycache__/ 307 | *.pyc 308 | 309 | # Cake - Uncomment if you are using it 310 | # tools/** 311 | # !tools/packages.config 312 | 313 | # Tabs Studio 314 | *.tss 315 | 316 | # Telerik's JustMock configuration file 317 | *.jmconfig 318 | 319 | # BizTalk build output 320 | *.btp.cs 321 | *.btm.cs 322 | *.odx.cs 323 | *.xsd.cs 324 | 325 | # OpenCover UI analysis results 326 | OpenCover/ 327 | 328 | # Azure Stream Analytics local run output 329 | ASALocalRun/ 330 | 331 | # MSBuild Binary and Structured Log 332 | *.binlog 333 | 334 | # NVidia Nsight GPU debugger configuration file 335 | *.nvuser 336 | 337 | # MFractors (Xamarin productivity tool) working folder 338 | .mfractor/ 339 | 340 | # Local History for Visual Studio 341 | .localhistory/ 342 | 343 | # BeatPulse healthcheck temp database 344 | healthchecksdb 345 | 346 | # Backup folder for Package Reference Convert tool in Visual Studio 2017 347 | MigrationBackup/ 348 | 349 | # Ionide (cross platform F# VS Code tools) working folder 350 | .ionide/ 351 | client/data.csv 352 | client/output.txt 353 | -------------------------------------------------------------------------------- /API-Ingest/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.pythonPath": "/bin/python3" 3 | } -------------------------------------------------------------------------------- /API-Ingest/Kafka consumer/consumer.py: -------------------------------------------------------------------------------- 1 | from kafka import KafkaConsumer 2 | 3 | #consumer = KafkaConsumer() 4 | 5 | # define a consumer that waits for new messages 6 | def kafka_python_consumer(): 7 | 8 | # Consumer using the topic name and setting a group id 9 | consumer = KafkaConsumer('ingestion-topic', group_id='mypythonconsumer',bootstrap_servers='localhost:9092',) 10 | for msg in consumer: 11 | print(msg) 12 | 13 | print("start consuming") 14 | 15 | # start the consumer 16 | kafka_python_consumer() 17 | 18 | print("done") 19 | -------------------------------------------------------------------------------- 
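A minimal kafka-python producer sketch that can be used to exercise the consumer above without going through the ingest API. This is a sketch, not part of the project code: it assumes the broker is reachable from the host at localhost:9092 and that the ingestion-topic already exists (see Kafka Commands.txt). The sample payload simply mirrors the InvoiceItem schema used in API-Ingest/app/main.py.

from kafka import KafkaProducer
import json

# Hypothetical test record; field names follow the InvoiceItem schema of the ingest API
test_message = {
    "InvoiceNo": 536365,
    "StockCode": "85123A",
    "Description": "WHITE HANGING HEART T-LIGHT HOLDER",
    "Quantity": 6,
    "InvoiceDate": "12-02-2010 08:26:00",
    "UnitPrice": 2.55,
    "CustomerID": 17850,
    "Country": "United Kingdom",
}

# Connect to the broker the consumer above listens to (localhost:9092 from the host)
producer = KafkaProducer(bootstrap_servers='localhost:9092', acks=1)

# Kafka expects bytes, so serialize the dict to a JSON string and encode it
producer.send('ingestion-topic', json.dumps(test_message).encode('utf-8'))
producer.flush()
print("test message sent to ingestion-topic")

Running this script while consumer.py is active should print the record in the consumer's terminal, which is the same check described step by step in Kafka Commands.txt.
--------------------------------------------------------------------------------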
/API-Ingest/Postman/IngestAPI-Test.postman_collection.json: -------------------------------------------------------------------------------- 1 | { 2 | "info": { 3 | "_postman_id": "793bf228-6eb4-497b-9859-355a2251ab4a", 4 | "name": "IngestAPI-Test", 5 | "schema": "https://schema.getpostman.com/json/collection/v2.1.0/collection.json" 6 | }, 7 | "item": [ 8 | { 9 | "name": "Test Date Validation - Good modified", 10 | "request": { 11 | "method": "POST", 12 | "header": [], 13 | "body": { 14 | "mode": "raw", 15 | "raw": "{\"InvoiceNo\":536365,\"StockCode\":\"85123A\",\"Description\":\"WHITE HANGING HEART T-LIGHT HOLDER\",\"Quantity\":6,\"InvoiceDate\":\"12/02/2010 8:26\",\"UnitPrice\":2.55,\"CustomerID\":17850,\"Country\":\"United Kingdom\"}\r\n", 16 | "options": { 17 | "raw": { 18 | "language": "json" 19 | } 20 | } 21 | }, 22 | "url": { 23 | "raw": "localhost:8000/invoiceitem", 24 | "host": [ 25 | "localhost" 26 | ], 27 | "port": "8000", 28 | "path": [ 29 | "invoiceitem" 30 | ] 31 | } 32 | }, 33 | "response": [] 34 | }, 35 | { 36 | "name": "Test Date Validation - Good original", 37 | "request": { 38 | "method": "POST", 39 | "header": [], 40 | "body": { 41 | "mode": "raw", 42 | "raw": "{\"InvoiceNo\":536365,\"StockCode\":\"85123A\",\"Description\":\"WHITE HANGING HEART T-LIGHT HOLDER\",\"Quantity\":6,\"InvoiceDate\":\"12/2/2010 8:26\",\"UnitPrice\":2.55,\"CustomerID\":17850,\"Country\":\"United Kingdom\"}\r\n", 43 | "options": { 44 | "raw": { 45 | "language": "json" 46 | } 47 | } 48 | }, 49 | "url": { 50 | "raw": "localhost:8000/invoiceitem", 51 | "host": [ 52 | "localhost" 53 | ], 54 | "port": "8000", 55 | "path": [ 56 | "invoiceitem" 57 | ] 58 | } 59 | }, 60 | "response": [] 61 | }, 62 | { 63 | "name": "Test Date Validation - Bad", 64 | "request": { 65 | "method": "POST", 66 | "header": [], 67 | "body": { 68 | "mode": "raw", 69 | "raw": "{\"InvoiceNo\":536365,\"StockCode\":\"85123A\",\"Description\":\"WHITE HANGING HEART T-LIGHT HOLDER\",\"Quantity\":6,\"InvoiceDate\":\"12/2/2010 8:26232sa\",\"UnitPrice\":2.55,\"CustomerID\":17850,\"Country\":\"United Kingdom\"}\r\n", 70 | "options": { 71 | "raw": { 72 | "language": "json" 73 | } 74 | } 75 | }, 76 | "url": { 77 | "raw": "localhost:8000/invoiceitem", 78 | "host": [ 79 | "localhost" 80 | ], 81 | "port": "8000", 82 | "path": [ 83 | "invoiceitem" 84 | ] 85 | } 86 | }, 87 | "response": [] 88 | }, 89 | { 90 | "name": "Test invoiceno Validation - Bad Copy", 91 | "request": { 92 | "method": "POST", 93 | "header": [], 94 | "body": { 95 | "mode": "raw", 96 | "raw": "{\"InvoiceNo\":\"536365abc\",\"StockCode\":\"85123A\",\"Description\":\"WHITE HANGING HEART T-LIGHT HOLDER\",\"Quantity\":6,\"InvoiceDate\":\"12/2/2010 8:26\",\"UnitPrice\":2.55,\"CustomerID\":17850,\"Country\":\"United Kingdom\"}\r\n", 97 | "options": { 98 | "raw": { 99 | "language": "json" 100 | } 101 | } 102 | }, 103 | "url": { 104 | "raw": "localhost:8000/invoiceitem", 105 | "host": [ 106 | "localhost" 107 | ], 108 | "port": "8000", 109 | "path": [ 110 | "invoiceitem" 111 | ] 112 | } 113 | }, 114 | "response": [] 115 | }, 116 | { 117 | "name": "Docker container test 1", 118 | "request": { 119 | "method": "POST", 120 | "header": [], 121 | "body": { 122 | "mode": "raw", 123 | "raw": "{\"InvoiceNo\":536365,\"StockCode\":\"85123A\",\"Description\":\"WHITE HANGING HEART T-LIGHT HOLDER\",\"Quantity\":6,\"InvoiceDate\":\"12/02/2010 8:26\",\"UnitPrice\":2.55,\"CustomerID\":17850,\"Country\":\"United Kingdom\"}\r\n", 124 | "options": { 125 | "raw": { 126 | "language": "json" 127 | } 128 | } 129 
| }, 130 | "url": { 131 | "raw": "localhost:8000/invoiceitem", 132 | "host": [ 133 | "localhost" 134 | ], 135 | "port": "8000", 136 | "path": [ 137 | "invoiceitem" 138 | ] 139 | } 140 | }, 141 | "response": [] 142 | }, 143 | { 144 | "name": "Docker container test 2", 145 | "request": { 146 | "method": "POST", 147 | "header": [], 148 | "body": { 149 | "mode": "raw", 150 | "raw": "{\"InvoiceNo\":536365,\"StockCode\":\"85123A\",\"Description\":\"WHITE HANGING HEART T-LIGHT HOLDER\",\"Quantity\":6,\"InvoiceDate\":\"12/02/2010 8:26\",\"UnitPrice\":2.55,\"CustomerID\":17850,\"Country\":\"United Kingdom\"}\r\n", 151 | "options": { 152 | "raw": { 153 | "language": "json" 154 | } 155 | } 156 | }, 157 | "url": { 158 | "raw": "localhost:8000/invoiceitem", 159 | "host": [ 160 | "localhost" 161 | ], 162 | "port": "8000", 163 | "path": [ 164 | "invoiceitem" 165 | ] 166 | } 167 | }, 168 | "response": [] 169 | } 170 | ] 171 | } -------------------------------------------------------------------------------- /API-Ingest/app/main.py: -------------------------------------------------------------------------------- 1 | # You need this to use FastAPI, work with statuses and be able to end HTTPExceptions 2 | from fastapi import FastAPI, status, HTTPException 3 | 4 | # You need this to be able to turn classes into JSONs and return 5 | from fastapi.encoders import jsonable_encoder 6 | from fastapi.responses import JSONResponse 7 | 8 | # Needed for json.dumps 9 | import json 10 | 11 | # Both used for BaseModel 12 | from pydantic import BaseModel 13 | 14 | from datetime import datetime 15 | from kafka import KafkaProducer, producer 16 | 17 | 18 | 19 | # Create class (schema) for the JSON 20 | # Date get's ingested as string and then before writing validated 21 | class InvoiceItem(BaseModel): 22 | InvoiceNo: int 23 | StockCode: str 24 | Description: str 25 | Quantity: int 26 | InvoiceDate: str 27 | UnitPrice: float 28 | CustomerID: int 29 | Country: str 30 | 31 | # This is important for general execution and the docker later 32 | app = FastAPI() 33 | 34 | # Base URL 35 | @app.get("/") 36 | async def root(): 37 | return {"message": "Hello World"} 38 | 39 | # Add a new invoice 40 | @app.post("/invoiceitem") 41 | async def post_invoice_item(item: InvoiceItem): #body awaits a json with invoice item information 42 | print("Message received") 43 | try: 44 | # Evaluate the timestamp and parse it to datetime object you can work with 45 | date = datetime.strptime(item.InvoiceDate, "%d/%m/%Y %H:%M") 46 | 47 | print('Found a timestamp: ', date) 48 | 49 | # Replace strange date with new datetime 50 | # Use strftime to parse the string in the right format (replace / with - and add seconds) 51 | item.InvoiceDate = date.strftime("%d-%m-%Y %H:%M:%S") 52 | print("New item date:", item.InvoiceDate) 53 | 54 | # Parse item back to json 55 | json_of_item = jsonable_encoder(item) 56 | 57 | # Dump the json out as string 58 | json_as_string = json.dumps(json_of_item) 59 | print(json_as_string) 60 | 61 | # Produce the string 62 | produce_kafka_string(json_as_string) 63 | 64 | # Encode the created customer item if successful into a JSON and return it to the client with 201 65 | return JSONResponse(content=json_of_item, status_code=201) 66 | 67 | # Will be thrown by datetime if the date does not fit 68 | # All other value errors are automatically taken care of because of the InvoiceItem Class 69 | except ValueError: 70 | return JSONResponse(content=jsonable_encoder(item), status_code=400) 71 | 72 | 73 | def 
produce_kafka_string(json_as_string): 74 | # Create producer 75 | producer = KafkaProducer(bootstrap_servers='kafka:9092',acks=1) 76 | 77 | # Write the string as bytes because Kafka needs it this way 78 | producer.send('ingestion-topic', bytes(json_as_string, 'utf-8')) 79 | producer.flush() -------------------------------------------------------------------------------- /API-Ingest/build command.txt: -------------------------------------------------------------------------------- 1 | docker build -t api-ingest . -------------------------------------------------------------------------------- /API-Ingest/dockerfile: -------------------------------------------------------------------------------- 1 | FROM tiangolo/uvicorn-gunicorn-fastapi:python3.7 2 | 3 | COPY requirements.txt /tmp/ 4 | 5 | RUN pip install --no-cache-dir --upgrade pip && \ 6 | pip install --requirement /tmp/requirements.txt 7 | 8 | COPY ./app /app -------------------------------------------------------------------------------- /API-Ingest/requirements.txt: -------------------------------------------------------------------------------- 1 | kafka-python -------------------------------------------------------------------------------- /ApacheSpark/.ipynb_checkpoints/01-streaming-kafka-src-dst-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from pyspark.sql import SparkSession\n", 10 | "\n", 11 | "# Spark session & context\n", 12 | "spark = (SparkSession\n", 13 | " .builder\n", 14 | " .master('local')\n", 15 | " .appName('kafka-streaming')\n", 16 | " # Add kafka package\n", 17 | " .config(\"spark.jars.packages\", \"org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5\")\n", 18 | " .getOrCreate())\n", 19 | "sc = spark.sparkContext\n" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "data": { 29 | "text/plain": [ 30 | "DataFrame[key: string, value: string]" 31 | ] 32 | }, 33 | "execution_count": 2, 34 | "metadata": {}, 35 | "output_type": "execute_result" 36 | } 37 | ], 38 | "source": [ 39 | "df = spark \\\n", 40 | " .readStream \\\n", 41 | " .format(\"kafka\") \\\n", 42 | " .option(\"kafka.bootstrap.servers\", \"kafka:9092\") \\\n", 43 | " .option(\"subscribe\", \"ingestion-topic\") \\\n", 44 | " .load()\n", 45 | "df.selectExpr(\"CAST(key AS STRING)\", \"CAST(value AS STRING)\")" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 3, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "df.createOrReplaceTempView(\"message\")" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 4, 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "data": { 64 | "text/plain": [ 65 | "" 66 | ] 67 | }, 68 | "execution_count": 4, 69 | "metadata": {}, 70 | "output_type": "execute_result" 71 | } 72 | ], 73 | "source": [ 74 | "res = spark.sql(\"SELECT * from message\")\n", 75 | "res.writeStream.format(\"console\") \\\n", 76 | " .outputMode(\"append\")" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 5, 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "ename": "AnalysisException", 86 | "evalue": "'checkpointLocation must be specified either through option(\"checkpointLocation\", ...) 
or SparkSession.conf.set(\"spark.sql.streaming.checkpointLocation\", ...);'", 87 | "output_type": "error", 88 | "traceback": [ 89 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 90 | "\u001b[0;31mPy4JJavaError\u001b[0m Traceback (most recent call last)", 91 | "\u001b[0;32m/usr/local/spark/python/pyspark/sql/utils.py\u001b[0m in \u001b[0;36mdeco\u001b[0;34m(*a, **kw)\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 63\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 64\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mpy4j\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprotocol\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPy4JJavaError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 92 | "\u001b[0;32m/usr/local/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py\u001b[0m in \u001b[0;36mget_return_value\u001b[0;34m(answer, gateway_client, target_id, name)\u001b[0m\n\u001b[1;32m 327\u001b[0m \u001b[0;34m\"An error occurred while calling {0}{1}{2}.\\n\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 328\u001b[0;31m format(target_id, \".\", name), value)\n\u001b[0m\u001b[1;32m 329\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 93 | "\u001b[0;31mPy4JJavaError\u001b[0m: An error occurred while calling o65.start.\n: org.apache.spark.sql.AnalysisException: checkpointLocation must be specified either through option(\"checkpointLocation\", ...) 
or SparkSession.conf.set(\"spark.sql.streaming.checkpointLocation\", ...);\n\tat org.apache.spark.sql.streaming.StreamingQueryManager$$anonfun$4.apply(StreamingQueryManager.scala:234)\n\tat org.apache.spark.sql.streaming.StreamingQueryManager$$anonfun$4.apply(StreamingQueryManager.scala:229)\n\tat scala.Option.getOrElse(Option.scala:121)\n\tat org.apache.spark.sql.streaming.StreamingQueryManager.createQuery(StreamingQueryManager.scala:228)\n\tat org.apache.spark.sql.streaming.StreamingQueryManager.startQuery(StreamingQueryManager.scala:322)\n\tat org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:325)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat py4j.Gateway.invoke(Gateway.java:282)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.GatewayConnection.run(GatewayConnection.java:238)\n\tat java.lang.Thread.run(Thread.java:748)\n", 94 | "\nDuring handling of the above exception, another exception occurred:\n", 95 | "\u001b[0;31mAnalysisException\u001b[0m Traceback (most recent call last)", 96 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"kafka\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0moption\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"kafka.bootstrap.servers\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"host1:port1,host2:port2\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0;34m.\u001b[0m\u001b[0moption\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"topic\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"topic1\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 7\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mstart\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mawaitTermination\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 97 | "\u001b[0;32m/usr/local/spark/python/pyspark/sql/streaming.py\u001b[0m in \u001b[0;36mstart\u001b[0;34m(self, path, format, outputMode, partitionBy, queryName, **options)\u001b[0m\n\u001b[1;32m 1106\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mqueryName\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mqueryName\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1107\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mpath\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1108\u001b[0;31m \u001b[0;32mreturn\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sq\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_jwrite\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstart\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1109\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1110\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sq\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_jwrite\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstart\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 98 | "\u001b[0;32m/usr/local/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 1255\u001b[0m \u001b[0manswer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgateway_client\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1256\u001b[0m return_value = get_return_value(\n\u001b[0;32m-> 1257\u001b[0;31m answer, self.gateway_client, self.target_id, self.name)\n\u001b[0m\u001b[1;32m 1258\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1259\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mtemp_arg\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtemp_args\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 99 | "\u001b[0;32m/usr/local/spark/python/pyspark/sql/utils.py\u001b[0m in \u001b[0;36mdeco\u001b[0;34m(*a, **kw)\u001b[0m\n\u001b[1;32m 67\u001b[0m e.java_exception.getStackTrace()))\n\u001b[1;32m 68\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0ms\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstartswith\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'org.apache.spark.sql.AnalysisException: '\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 69\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mAnalysisException\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m': '\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstackTrace\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 70\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0ms\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstartswith\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'org.apache.spark.sql.catalyst.analysis'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 71\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mAnalysisException\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m': '\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstackTrace\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 100 | 
"\u001b[0;31mAnalysisException\u001b[0m: 'checkpointLocation must be specified either through option(\"checkpointLocation\", ...) or SparkSession.conf.set(\"spark.sql.streaming.checkpointLocation\", ...);'" 101 | ] 102 | } 103 | ], 104 | "source": [ 105 | "ds = df \\\n", 106 | " .selectExpr(\"CAST(key AS STRING)\", \"CAST(value AS STRING)\") \\\n", 107 | " .writeStream \\\n", 108 | " .format(\"kafka\") \\\n", 109 | " .option(\"kafka.bootstrap.servers\", \"kafka:9092\") \\\n", 110 | " .option(\"topic\", \"spark-output\") \\\n", 111 | " .start() \\\n", 112 | " .awaitTermination()" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "#df.writeStream \\\n", 122 | " # .format(\"kafka\") \\\n", 123 | " # .option(\"kafka.bootstrap.servers\", \"kafka:9092\") \\\n", 124 | " # .option(\"topic\", \"spark-output\") \\\n", 125 | " # .start()\n", 126 | " " 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [] 135 | } 136 | ], 137 | "metadata": { 138 | "kernelspec": { 139 | "display_name": "Python 3", 140 | "language": "python", 141 | "name": "python3" 142 | }, 143 | "language_info": { 144 | "codemirror_mode": { 145 | "name": "ipython", 146 | "version": 3 147 | }, 148 | "file_extension": ".py", 149 | "mimetype": "text/x-python", 150 | "name": "python", 151 | "nbconvert_exporter": "python", 152 | "pygments_lexer": "ipython3", 153 | "version": "3.7.6" 154 | } 155 | }, 156 | "nbformat": 4, 157 | "nbformat_minor": 4 158 | } 159 | -------------------------------------------------------------------------------- /ApacheSpark/.ipynb_checkpoints/02-streaming-kafka-src-dst-mongodb-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from pyspark.sql import SparkSession\n", 10 | "\n", 11 | "# Spark session & context\n", 12 | "spark = (SparkSession\n", 13 | " .builder\n", 14 | " .master('local')\n", 15 | " .appName('kafka-mongo-streaming') \n", 16 | " # Add kafka package and mongodb package. 
Make sure to to this as one string!\n", 17 | " # Versions need to match the Spark version (trial & error)\n", 18 | " .config(\"spark.jars.packages\", \"org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5,org.mongodb.spark:mongo-spark-connector_2.11:2.4.0\")\n", 19 | " # Mongo config including the username and password from compose file\n", 20 | " .config(\"spark.mongodb.input.uri\",\"mongodb://root:example@mongo:27017/docstreaming.invoices?authSource=admin\")\n", 21 | " .config(\"spark.mongodb.output.uri\",\"mongodb://root:example@mongo:27017/docstreaming.invoices?authSource=admin\")\n", 22 | " .getOrCreate())\n", 23 | "sc = spark.sparkContext\n" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 4, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "# Read the message from the kafka stream\n", 33 | "df = spark \\\n", 34 | " .readStream \\\n", 35 | " .format(\"kafka\") \\\n", 36 | " .option(\"kafka.bootstrap.servers\", \"kafka:9092\") \\\n", 37 | " .option(\"subscribe\", \"ingestion-topic\") \\\n", 38 | " .load()\n", 39 | "\n", 40 | "# convert the binary values to string\n", 41 | "df1 = df.selectExpr(\"CAST(key AS STRING)\", \"CAST(value AS STRING)\")" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 5, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "#Create a temporary view for SparkSQL\n", 51 | "df1.createOrReplaceTempView(\"message\")" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 6, 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "data": { 61 | "text/plain": [ 62 | "" 63 | ] 64 | }, 65 | "execution_count": 6, 66 | "metadata": {}, 67 | "output_type": "execute_result" 68 | } 69 | ], 70 | "source": [ 71 | "# Write out the message to the console of the environment\n", 72 | "res = spark.sql(\"SELECT * from message\")\n", 73 | "res.writeStream.format(\"console\") \\\n", 74 | " .outputMode(\"append\") \\\n", 75 | " .start()" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 7, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "# Write the unvonverted dataframe (no strings)\n", 85 | "# message back into Kafka in another topic#\n", 86 | "# listen to it with a local consumer\n", 87 | "ds = df \\\n", 88 | " .writeStream \\\n", 89 | " .format(\"kafka\") \\\n", 90 | " .option(\"kafka.bootstrap.servers\", \"kafka:9092\") \\\n", 91 | " .option(\"topic\", \"spark-output\") \\\n", 92 | " .option(\"checkpointLocation\", \"/tmp\") \\\n", 93 | " .start() " 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 22, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "# Write the message into MongoDB\n", 103 | "\n", 104 | "def f(row):\n", 105 | " #dfjson = row.value\n", 106 | " #write that dataframe to mongodb\n", 107 | " #df = spark.read.json(row.value)\n", 108 | " \n", 109 | " dataframe = spark.read.json(row.value)\n", 110 | " \n", 111 | " dataframe.write.format(\"com.mongodb.spark.sql.DefaultSource\").mode(\"append\").save()\n", 112 | "\n", 113 | "def foreach_batch_function(df, epoch_id):\n", 114 | " # Transform and write batchDF\n", 115 | "\n", 116 | " #writes dataframe with complete kafka message\n", 117 | " #df.write.format(\"com.mongodb.spark.sql.DefaultSource\").mode(\"append\").save()\n", 118 | " \n", 119 | " #only get json sring from dataframe\n", 120 | " #value = df.select(\"value\")\n", 121 | " \n", 122 | " # write each row to mongodb (there is only one)\n", 123 | " #df.foreach(f)\n", 124 | "\n", 125 | " from pyspark.sql.types import 
MapType,StringType\n", 126 | " from pyspark.sql.functions import from_json\n", 127 | " \n", 128 | " df2=df.withColumn(\"value\",from_json(df.value,MapType(StringType(),StringType())))\n", 129 | " \n", 130 | " df3= df2.select(\"Quantity\",\"UnitPrice\",\"Country\",\"CustomerID\",\"StockCode\",\"Description\",\"InvoiceDate\",\"InvoiceNo\")\n", 131 | " \n", 132 | " df3.write.format(\"com.mongodb.spark.sql.DefaultSource\").mode(\"append\").save()\n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " pass" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "df1.writeStream.foreachBatch(foreach_batch_function).start().awaitTermination()" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [] 162 | } 163 | ], 164 | "metadata": { 165 | "kernelspec": { 166 | "display_name": "Python 3", 167 | "language": "python", 168 | "name": "python3" 169 | }, 170 | "language_info": { 171 | "codemirror_mode": { 172 | "name": "ipython", 173 | "version": 3 174 | }, 175 | "file_extension": ".py", 176 | "mimetype": "text/x-python", 177 | "name": "python", 178 | "nbconvert_exporter": "python", 179 | "pygments_lexer": "ipython3", 180 | "version": "3.7.6" 181 | } 182 | }, 183 | "nbformat": 4, 184 | "nbformat_minor": 4 185 | } 186 | -------------------------------------------------------------------------------- /ApacheSpark/01-streaming-kafka-src-dst.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from pyspark.sql import SparkSession\n", 10 | "\n", 11 | "# Spark session & context\n", 12 | "spark = (SparkSession\n", 13 | " .builder\n", 14 | " .master('local')\n", 15 | " .appName('kafka-streaming')\n", 16 | " # Add kafka package\n", 17 | " .config(\"spark.jars.packages\", \"org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5\")\n", 18 | " .getOrCreate())\n", 19 | "sc = spark.sparkContext\n" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "data": { 29 | "text/plain": [ 30 | "DataFrame[key: string, value: string]" 31 | ] 32 | }, 33 | "execution_count": 2, 34 | "metadata": {}, 35 | "output_type": "execute_result" 36 | } 37 | ], 38 | "source": [ 39 | "# Read the message from the kafka stream\n", 40 | "df = spark \\\n", 41 | " .readStream \\\n", 42 | " .format(\"kafka\") \\\n", 43 | " .option(\"kafka.bootstrap.servers\", \"kafka:9092\") \\\n", 44 | " .option(\"subscribe\", \"ingestion-topic\") \\\n", 45 | " .load()\n", 46 | "df.selectExpr(\"CAST(key AS STRING)\", \"CAST(value AS STRING)\")" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 3, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "#Create a small temporary view for SparkSQL\n", 56 | "df.createOrReplaceTempView(\"message\")" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 4, 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "data": { 66 | "text/plain": [ 67 | "" 68 | ] 69 | }, 70 | "execution_count": 4, 71 | "metadata": {}, 72 | "output_type": "execute_result" 73 | } 74 | ], 75 | "source": [ 76 | "# Write out the message to the 
console of the environment\n", 77 | "res = spark.sql(\"SELECT * from message\")\n", 78 | "res.writeStream.format(\"console\") \\\n", 79 | " .outputMode(\"append\") \\\n", 80 | " .start() " 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 5, 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "ename": "KeyboardInterrupt", 90 | "evalue": "", 91 | "output_type": "error", 92 | "traceback": [ 93 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 94 | "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", 95 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0moption\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"kafka.bootstrap.servers\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"kafka:9092\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0moption\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"topic\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"spark-output\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0;34m.\u001b[0m\u001b[0moption\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"checkpointLocation\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"/tmp\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 8\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mstart\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mawaitTermination\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 96 | "\u001b[0;32m/usr/local/spark/python/pyspark/sql/streaming.py\u001b[0m in \u001b[0;36mawaitTermination\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 101\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_jsq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mawaitTermination\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0;36m1000\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 102\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 103\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_jsq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mawaitTermination\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 104\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 105\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mproperty\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 97 | "\u001b[0;32m/usr/local/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 1253\u001b[0m \u001b[0mproto\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mEND_COMMAND_PART\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1254\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 
1255\u001b[0;31m \u001b[0manswer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgateway_client\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1256\u001b[0m return_value = get_return_value(\n\u001b[1;32m 1257\u001b[0m answer, self.gateway_client, self.target_id, self.name)\n", 98 | "\u001b[0;32m/usr/local/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py\u001b[0m in \u001b[0;36msend_command\u001b[0;34m(self, command, retry, binary)\u001b[0m\n\u001b[1;32m 983\u001b[0m \u001b[0mconnection\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_connection\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 984\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 985\u001b[0;31m \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mconnection\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 986\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mbinary\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 987\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_create_connection_guard\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconnection\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 99 | "\u001b[0;32m/usr/local/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py\u001b[0m in \u001b[0;36msend_command\u001b[0;34m(self, command)\u001b[0m\n\u001b[1;32m 1150\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1151\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1152\u001b[0;31m \u001b[0manswer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msmart_decode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstream\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreadline\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1153\u001b[0m \u001b[0mlogger\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdebug\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Answer received: {0}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0manswer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1154\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0manswer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstartswith\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mproto\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mRETURN_MESSAGE\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 100 | "\u001b[0;32m/opt/conda/lib/python3.7/socket.py\u001b[0m in \u001b[0;36mreadinto\u001b[0;34m(self, b)\u001b[0m\n\u001b[1;32m 587\u001b[0m \u001b[0;32mwhile\u001b[0m 
\u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 588\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 589\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv_into\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 590\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 591\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_timeout_occurred\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 101 | "\u001b[0;31mKeyboardInterrupt\u001b[0m: " 102 | ] 103 | } 104 | ], 105 | "source": [ 106 | "# Write the message back into Kafka in another topic that you are going to listen to with a local consumer\n", 107 | "ds = df \\\n", 108 | " .writeStream \\\n", 109 | " .format(\"kafka\") \\\n", 110 | " .option(\"kafka.bootstrap.servers\", \"kafka:9092\") \\\n", 111 | " .option(\"topic\", \"spark-output\") \\\n", 112 | " .option(\"checkpointLocation\", \"/tmp\") \\\n", 113 | " .start() \\\n", 114 | " .awaitTermination()" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [] 123 | } 124 | ], 125 | "metadata": { 126 | "kernelspec": { 127 | "display_name": "Python 3", 128 | "language": "python", 129 | "name": "python3" 130 | }, 131 | "language_info": { 132 | "codemirror_mode": { 133 | "name": "ipython", 134 | "version": 3 135 | }, 136 | "file_extension": ".py", 137 | "mimetype": "text/x-python", 138 | "name": "python", 139 | "nbconvert_exporter": "python", 140 | "pygments_lexer": "ipython3", 141 | "version": "3.7.6" 142 | } 143 | }, 144 | "nbformat": 4, 145 | "nbformat_minor": 4 146 | } 147 | -------------------------------------------------------------------------------- /ApacheSpark/02-streaming-kafka-src-dst-mongodb.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from pyspark.sql import SparkSession\n", 10 | "\n", 11 | "# you need these two to transform the json strings to dataframes\n", 12 | "from pyspark.sql.types import MapType,StringType\n", 13 | "from pyspark.sql.functions import from_json\n", 14 | "\n", 15 | "# Spark session & context\n", 16 | "spark = (SparkSession\n", 17 | " .builder\n", 18 | " .master('local')\n", 19 | " .appName('kafka-mongo-streaming') \n", 20 | " # Add kafka package and mongodb package. 
Make sure to to this as one string!\n", 21 | " # Versions need to match the Spark version (trial & error)\n", 22 | " .config(\"spark.jars.packages\", \"org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5,org.mongodb.spark:mongo-spark-connector_2.11:2.4.0\")\n", 23 | " # Mongo config including the username and password from compose file\n", 24 | " .config(\"spark.mongodb.input.uri\",\"mongodb://root:example@mongo:27017/docstreaming.invoices?authSource=admin\")\n", 25 | " .config(\"spark.mongodb.output.uri\",\"mongodb://root:example@mongo:27017/docstreaming.invoices?authSource=admin\")\n", 26 | " .getOrCreate())\n", 27 | "sc = spark.sparkContext\n" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "# Read the message from the kafka stream\n", 37 | "df = spark \\\n", 38 | " .readStream \\\n", 39 | " .format(\"kafka\") \\\n", 40 | " .option(\"kafka.bootstrap.servers\", \"kafka:9092\") \\\n", 41 | " .option(\"subscribe\", \"ingestion-topic\") \\\n", 42 | " .load()\n", 43 | "\n", 44 | "# convert the binary values to string\n", 45 | "df1 = df.selectExpr(\"CAST(key AS STRING)\", \"CAST(value AS STRING)\")" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 3, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "#Create a temporary view for SparkSQL\n", 55 | "df1.createOrReplaceTempView(\"message\")" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 4, 61 | "metadata": {}, 62 | "outputs": [ 63 | { 64 | "data": { 65 | "text/plain": [ 66 | "" 67 | ] 68 | }, 69 | "execution_count": 4, 70 | "metadata": {}, 71 | "output_type": "execute_result" 72 | } 73 | ], 74 | "source": [ 75 | "# Write out the message to the console of the environment\n", 76 | "res = spark.sql(\"SELECT * from message\")\n", 77 | "res.writeStream.format(\"console\") \\\n", 78 | " .outputMode(\"append\") \\\n", 79 | " .start()" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 5, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "# Write the unvonverted dataframe (no strings)\n", 89 | "# message back into Kafka in another topic#\n", 90 | "# listen to it with a local consumer\n", 91 | "ds = df \\\n", 92 | " .writeStream \\\n", 93 | " .format(\"kafka\") \\\n", 94 | " .option(\"kafka.bootstrap.servers\", \"kafka:9092\") \\\n", 95 | " .option(\"topic\", \"spark-output\") \\\n", 96 | " .option(\"checkpointLocation\", \"/tmp\") \\\n", 97 | " .start() " 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 6, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | " \n", 107 | "\n", 108 | "# Write the message into MongoDB\n", 109 | "def foreach_batch_function(df, epoch_id):\n", 110 | " # Transform and write batchDF in this foreach\n", 111 | "\n", 112 | " # writes the dataframe with complete kafka message into mongodb\n", 113 | " #df.write.format(\"com.mongodb.spark.sql.DefaultSource\").mode(\"append\").save()\n", 114 | " \n", 115 | " #Transform the values of all rows in column value and create a dataframe out of it (will also only have one row)\n", 116 | " df2=df.withColumn(\"value\",from_json(df.value,MapType(StringType(),StringType()))) \n", 117 | " \n", 118 | " # Transform the dataframe so that it will have individual columns \n", 119 | " df3= df2.select([\"value.Quantity\",\"value.UnitPrice\",\"value.Country\",\"value.CustomerID\",\"value.StockCode\",\"value.Description\",\"value.InvoiceDate\",\"value.InvoiceNo\"])\n", 120 | " \n", 121 
| " # Send the dataframe into MongoDB which will create a BSON document out of it\n", 122 | " df3.write.format(\"com.mongodb.spark.sql.DefaultSource\").mode(\"append\").save()\n", 123 | " \n", 124 | " pass" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "# Start the MongoDB stream and wait for termination\n", 134 | "df1.writeStream.foreachBatch(foreach_batch_function).start().awaitTermination()" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [] 150 | } 151 | ], 152 | "metadata": { 153 | "kernelspec": { 154 | "display_name": "Python 3", 155 | "language": "python", 156 | "name": "python3" 157 | }, 158 | "language_info": { 159 | "codemirror_mode": { 160 | "name": "ipython", 161 | "version": 3 162 | }, 163 | "file_extension": ".py", 164 | "mimetype": "text/x-python", 165 | "name": "python", 166 | "nbconvert_exporter": "python", 167 | "pygments_lexer": "ipython3", 168 | "version": "3.7.6" 169 | } 170 | }, 171 | "nbformat": 4, 172 | "nbformat_minor": 4 173 | } 174 | -------------------------------------------------------------------------------- /Kafka Commands.txt: -------------------------------------------------------------------------------- 1 | 2 | ./kafka-topics.sh --list --bootstrap-server localhost:9092 3 | 4 | ## Create Topic 5 | ./kafka-topics.sh --create --topic ingestion-topic --bootstrap-server localhost:9092 6 | ./kafka-topics.sh --create --topic spark-output --bootstrap-server localhost:9092 7 | 8 | 9 | # Local consumer 10 | ./kafka-console-consumer.sh --topic ingestion-topic --bootstrap-server localhost:9092 11 | ./kafka-console-consumer.sh --topic spark-output --bootstrap-server localhost:9092 12 | 13 | 14 | # Local producer 15 | ./kafka-console-producer.sh --topic ingestion-topic --bootstrap-server localhost:9092 16 | 17 | # To test if your Kafka is running correctly: 18 | 1. Connect to the container cli and go to the Kafka directory 19 | 2. Start a local consumer 20 | 3. Connect with a second cli to the container 21 | 4. Start in the second cli a local producer 22 | 5. Type in to the producer cli a message and hit enter 23 | 6. 
Check if you can see the message in the consumer cli -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # document-streaming 2 | Repository for the Document streaming capstone projects 3 | 4 | Outlook: 5 | 6 | - Deploy the streamlit app as docker (build and add in dockerfile) 7 | - Deploy the whole platform with all containers on a cloud platform of your choice 8 | - Add an API between Streamlit and MongoDB so that Streamlit doesnt have to be directly connected with MongoDB (User& Password) 9 | 10 | # Links to deploy the streamlit app as docker container 11 | #https://maelfabien.github.io/project/Streamlit/#the-application 12 | #https://towardsdatascience.com/how-to-deploy-a-semantic-search-engine-with-streamlit-and-docker-on-aws-elastic-beanstalk-42ddce0422f3 -------------------------------------------------------------------------------- /Streamlit/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.pythonPath": "/bin/python3" 3 | } -------------------------------------------------------------------------------- /Streamlit/streamlitapp.py: -------------------------------------------------------------------------------- 1 | from numpy import double 2 | import streamlit as st 3 | from pandas import DataFrame 4 | 5 | import numpy as np 6 | 7 | import pymongo 8 | 9 | 10 | #data = pd.read_csv("data.csv") 11 | myclient = pymongo.MongoClient("mongodb://localhost:27017/",username='root',password='example') 12 | mydb = myclient["docstreaming"] 13 | mycol = mydb["invoices"] 14 | 15 | 16 | # Below the fist chart add a input field for the invoice number 17 | cust_id = st.sidebar.text_input("CustomerID:") 18 | #st.text(inv_no) # Use this to print out the content of the input field 19 | 20 | # if enter has been used on the input field 21 | if cust_id: 22 | 23 | myquery = {"CustomerID": cust_id} 24 | # only includes or excludes 25 | mydoc = mycol.find( myquery , { "_id": 0, "StockCode": 0, "Description": 0, "Quantity": 0, "Country": 0, "UnitPrice": 0}) 26 | 27 | # create dataframe from resulting documents to use drop_duplicates 28 | df = DataFrame(mydoc) 29 | 30 | # drop duplicates, but keep the first one 31 | df.drop_duplicates(subset ="InvoiceNo", keep = 'first', inplace = True) 32 | 33 | # Add the table with a headline 34 | st.header("Output Customer Invoices") 35 | table2 = st.dataframe(data=df) 36 | 37 | 38 | # Below the fist chart add a input field for the invoice number 39 | inv_no = st.sidebar.text_input("InvoiceNo:") 40 | #st.text(inv_no) # Use this to print out the content of the input field 41 | 42 | # if enter has been used on the input field 43 | if inv_no: 44 | 45 | myquery = {"InvoiceNo": inv_no} 46 | mydoc = mycol.find( myquery, { "_id": 0, "InvoiceDate": 0, "Country": 0, "CustomerID": 0 }) 47 | 48 | # create the dataframe 49 | df = DataFrame(mydoc) 50 | 51 | # reindex it so that the columns are order lexicographically 52 | reindexed = df.reindex(sorted(df.columns), axis=1) 53 | 54 | # Add the table with a headline 55 | st.header("Output by Invoice ID") 56 | table2 = st.dataframe(data=reindexed) 57 | 58 | 59 | -------------------------------------------------------------------------------- /archive/docker-compose-kafka-spark-confluent.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | 4 | zookeeper: 5 | image: 
confluentinc/cp-zookeeper:6.2.0 6 | hostname: zookeeper 7 | container_name: zookeeper 8 | ports: 9 | - "2181:2181" 10 | environment: 11 | ZOOKEEPER_CLIENT_PORT: 2181 12 | ZOOKEEPER_TICK_TIME: 2000 13 | networks: 14 | - document-streaming 15 | 16 | broker: 17 | image: confluentinc/cp-server:6.2.0 18 | hostname: broker 19 | container_name: broker 20 | depends_on: 21 | - zookeeper 22 | ports: 23 | - "9092:9092" 24 | - "9101:9101" 25 | environment: 26 | KAFKA_BROKER_ID: 1 27 | KAFKA_ZOOKEEPER_CONNECT: 'zookeeper:2181' 28 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT 29 | KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://broker:29092,PLAINTEXT_HOST://localhost:9092 30 | KAFKA_METRIC_REPORTERS: io.confluent.metrics.reporter.ConfluentMetricsReporter 31 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 32 | KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS: 0 33 | KAFKA_CONFLUENT_LICENSE_TOPIC_REPLICATION_FACTOR: 1 34 | KAFKA_CONFLUENT_BALANCER_TOPIC_REPLICATION_FACTOR: 1 35 | KAFKA_TRANSACTION_STATE_LOG_MIN_ISR: 1 36 | KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 1 37 | KAFKA_JMX_PORT: 9101 38 | KAFKA_JMX_HOSTNAME: localhost 39 | KAFKA_CONFLUENT_SCHEMA_REGISTRY_URL: http://schema-registry:8081 40 | CONFLUENT_METRICS_REPORTER_BOOTSTRAP_SERVERS: broker:29092 41 | CONFLUENT_METRICS_REPORTER_TOPIC_REPLICAS: 1 42 | CONFLUENT_METRICS_ENABLE: 'true' 43 | CONFLUENT_SUPPORT_CUSTOMER_ID: 'anonymous' 44 | networks: 45 | - document-streaming 46 | 47 | spark: 48 | image: 'jupyter/pyspark-notebook:spark-2' 49 | ports: 50 | - '8888:8888' 51 | - "4040-4080:4040-4080" 52 | volumes: 53 | - ./ApacheSpark/:/home/jovyan/work 54 | networks: 55 | - document-streaming 56 | 57 | api-ingest: 58 | image: 'api-ingest' 59 | ports: 60 | - '80:80' 61 | networks: 62 | - document-streaming 63 | 64 | networks: 65 | document-streaming: 66 | driver: bridge 67 | -------------------------------------------------------------------------------- /archive/docker-compose-kafka_old.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | zookeeper: 4 | image: 'bitnami/zookeeper:latest' 5 | ports: 6 | - '2181:2181' 7 | environment: 8 | - ALLOW_ANONYMOUS_LOGIN=yes 9 | kafka: 10 | image: 'bitnami/kafka:latest' 11 | ports: 12 | - '9092:9092' 13 | environment: 14 | - KAFKA_BROKER_ID=1 15 | - KAFKA_CFG_LISTENERS=PLAINTEXT://:9092 16 | - KAFKA_CFG_ADVERTISED_LISTENERS=PLAINTEXT://127.0.0.1:9092 17 | - KAFKA_CFG_ZOOKEEPER_CONNECT=zookeeper:2181 18 | - ALLOW_PLAINTEXT_LISTENER=yes 19 | depends_on: 20 | - zookeeper -------------------------------------------------------------------------------- /client/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.pythonPath": "/bin/python3" 3 | } -------------------------------------------------------------------------------- /client/api-client.py: -------------------------------------------------------------------------------- 1 | import linecache 2 | import json 3 | 4 | # Make sure that requests is installed in your WSL 5 | import requests 6 | 7 | # We could just read the entire file, but if it's really big you could go line by line 8 | # If you want make this an excercise and replace the process below by reading the whole file at once and going line by line 9 | 10 | #set starting id and ending id 11 | start = 1 12 | end = 50 13 | 14 | # Loop over the JSON file 15 | i=start 16 | 17 | while i <= end: 18 | 19 | # read a specific line 20 | line = 
linecache.getline('./output.txt', i) 21 | #print(line) 22 | # write the line to the API 23 | myjson = json.loads(line) 24 | 25 | print(myjson) 26 | 27 | response = requests.post('http://localhost:80/invoiceitem', json=myjson) 28 | 29 | # Use this for dedbugging 30 | #print("Status code: ", response.status_code) 31 | #print("Printing Entire Post Request") 32 | print(response.json()) 33 | 34 | # increase i 35 | i+=1 36 | -------------------------------------------------------------------------------- /client/transformer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy import add 3 | import pandas as pd 4 | 5 | 6 | df = pd.read_csv ('data.csv') 7 | #print(df) 8 | 9 | # add a json column to the dataframe 10 | # splitlines will split the json into multiple rows not a single one 11 | df['json'] = df.to_json(orient='records', lines=True).splitlines() 12 | #print(df) 13 | 14 | # just take the json column of the dataframe 15 | dfjson = df['json'] 16 | print(dfjson) 17 | 18 | # print out the dataframe to a file 19 | # Note that the timestamp forward slash will be escaped to stay true to JSON schema 20 | np.savetxt(r'./output.txt', dfjson.values, fmt='%s') 21 | -------------------------------------------------------------------------------- /docker helpful commands.txt: -------------------------------------------------------------------------------- 1 | docker inspect document-streaming_spark_1 2 | 3 | 4 | links: 5 | - "kafka:kafka-server" #allows API to discover kafka service by name "kafka-server" -------------------------------------------------------------------------------- /docker-compose-kafka-spark-mongodb.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | 4 | zookeeper: 5 | image: 'bitnami/zookeeper:3.7.0-debian-10-r70' 6 | ports: 7 | - '2181:2181' 8 | environment: 9 | - ALLOW_ANONYMOUS_LOGIN=yes 10 | networks: 11 | - document-streaming 12 | 13 | kafka: 14 | image: 'bitnami/kafka:2.8.0-debian-10-r42' 15 | ports: 16 | - '9093:9093' #change to 9093 to access external from your windows host 17 | environment: 18 | - KAFKA_BROKER_ID=1 19 | - KAFKA_CFG_ZOOKEEPER_CONNECT=zookeeper:2181 20 | - ALLOW_PLAINTEXT_LISTENER=yes 21 | - KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=CLIENT:PLAINTEXT,EXTERNAL:PLAINTEXT #add aditional listener for external 22 | - KAFKA_CFG_LISTENERS=CLIENT://:9092,EXTERNAL://:9093 #9092 will be for other containers, 9093 for your windows client 23 | - KAFKA_CFG_ADVERTISED_LISTENERS=CLIENT://kafka:9092,EXTERNAL://localhost:9093 #9092 will be for other containers, 9093 for your windows client 24 | - KAFKA_INTER_BROKER_LISTENER_NAME=CLIENT 25 | depends_on: 26 | - zookeeper 27 | networks: 28 | - document-streaming 29 | 30 | spark: 31 | image: 'jupyter/pyspark-notebook:spark-2' 32 | ports: 33 | - '8888:8888' 34 | - "4040-4080:4040-4080" 35 | volumes: 36 | - ./ApacheSpark/:/home/jovyan/work 37 | networks: 38 | - document-streaming 39 | 40 | api-ingest: 41 | image: 'api-ingest' 42 | ports: 43 | - '80:80' 44 | networks: 45 | - document-streaming 46 | 47 | mongo: 48 | container_name: mongo-dev 49 | image: mongo 50 | volumes: 51 | - ~/dockerdata/mongodb:/data/db 52 | restart: on-failure 53 | ports: 54 | - "27017:27017" 55 | environment: 56 | MONGO_INITDB_ROOT_USERNAME: root 57 | MONGO_INITDB_ROOT_PASSWORD: example 58 | MONGO_INITDB_DATABASE: auth 59 | networks: 60 | - document-streaming 61 | 62 | mongo-express: 63 | image: mongo-express 64 | restart: 
on-failure 65 | ports: 66 | - "8081:8081" 67 | environment: 68 | ME_CONFIG_MONGODB_SERVER: mongo-dev 69 | ME_CONFIG_MONGODB_ADMINUSERNAME: root 70 | ME_CONFIG_MONGODB_ADMINPASSWORD: example 71 | ME_CONFIG_BASICAUTH_USERNAME: admin 72 | ME_CONFIG_BASICAUTH_PASSWORD: tribes 73 | networks: 74 | - document-streaming 75 | depends_on: 76 | - mongo 77 | 78 | 79 | 80 | 81 | networks: 82 | document-streaming: 83 | driver: bridge 84 | -------------------------------------------------------------------------------- /docker-compose-kafka-spark.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | 4 | zookeeper: 5 | image: 'bitnami/zookeeper:3.7.0-debian-10-r70' 6 | ports: 7 | - '2181:2181' 8 | environment: 9 | - ALLOW_ANONYMOUS_LOGIN=yes 10 | networks: 11 | - document-streaming 12 | 13 | kafka: 14 | image: 'bitnami/kafka:2.8.0-debian-10-r42' 15 | ports: 16 | - '9093:9093' #change to 9093 to access external from your windows host 17 | environment: 18 | - KAFKA_BROKER_ID=1 19 | - KAFKA_CFG_ZOOKEEPER_CONNECT=zookeeper:2181 20 | - ALLOW_PLAINTEXT_LISTENER=yes 21 | - KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=CLIENT:PLAINTEXT,EXTERNAL:PLAINTEXT #add aditional listener for external 22 | - KAFKA_CFG_LISTENERS=CLIENT://:9092,EXTERNAL://:9093 #9092 will be for other containers, 9093 for your windows client 23 | - KAFKA_CFG_ADVERTISED_LISTENERS=CLIENT://kafka:9092,EXTERNAL://localhost:9093 #9092 will be for other containers, 9093 for your windows client 24 | - KAFKA_INTER_BROKER_LISTENER_NAME=CLIENT 25 | depends_on: 26 | - zookeeper 27 | networks: 28 | - document-streaming 29 | 30 | spark: 31 | image: 'jupyter/pyspark-notebook:spark-2' 32 | ports: 33 | - '8888:8888' 34 | - "4040-4080:4040-4080" 35 | volumes: 36 | - ./ApacheSpark/:/home/jovyan/work 37 | networks: 38 | - document-streaming 39 | 40 | api-ingest: 41 | image: 'api-ingest' 42 | ports: 43 | - '80:80' 44 | networks: 45 | - document-streaming 46 | 47 | networks: 48 | document-streaming: 49 | driver: bridge 50 | -------------------------------------------------------------------------------- /docker-compose-kafka.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | zookeeper: 4 | image: 'bitnami/zookeeper:3.7.0-debian-10-r70' 5 | ports: 6 | - '2181:2181' 7 | environment: 8 | - ALLOW_ANONYMOUS_LOGIN=yes 9 | kafka: 10 | image: 'bitnami/kafka:2.8.0-debian-10-r42' 11 | ports: 12 | - '9093:9093' #change to 9093 to access external from your windows host 13 | environment: 14 | - KAFKA_BROKER_ID=1 15 | - KAFKA_CFG_ZOOKEEPER_CONNECT=zookeeper:2181 16 | - ALLOW_PLAINTEXT_LISTENER=yes 17 | - KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=CLIENT:PLAINTEXT,EXTERNAL:PLAINTEXT #add aditional listener for external 18 | - KAFKA_CFG_LISTENERS=CLIENT://:9092,EXTERNAL://:9093 #9092 will be for other containers, 9093 for your windows client 19 | - KAFKA_CFG_ADVERTISED_LISTENERS=CLIENT://kafka:9092,EXTERNAL://localhost:9093 #9092 will be for other containers, 9093 for your windows client 20 | - KAFKA_INTER_BROKER_LISTENER_NAME=CLIENT 21 | depends_on: 22 | - zookeeper 23 | --------------------------------------------------------------------------------
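Note: once the docker-compose-kafka-spark-mongodb.yml stack is running, a quick end-to-end check is to query MongoDB from the host. The snippet below is an illustrative sketch, not part of the repository; it reuses the values defined in that compose file (port 27017, user root, password example) and the docstreaming database that the Spark notebook and Streamlit app use.

# Hypothetical MongoDB connectivity check for the compose stack (assumed values
# taken from docker-compose-kafka-spark-mongodb.yml and the Streamlit app).
import pymongo

client = pymongo.MongoClient("mongodb://localhost:27017/", username="root", password="example")

# "ping" only succeeds if the server is reachable and the credentials are accepted
print(client.admin.command("ping"))

# list what has been written so far (empty until the streaming pipeline has run)
db = client["docstreaming"]
print(db.list_collection_names())

# peek at a few invoice documents, if any exist yet
for doc in db["invoices"].find().limit(5):
    print(doc)

Mongo Express offers the same view in the browser at http://localhost:8081 (basic auth admin/tribes, as configured in the compose file).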