├── .gitignore ├── API-Ingest ├── .vscode │ └── settings.json ├── Kafka consumer │ └── consumer.py ├── Postman │ └── IngestAPI-Test.postman_collection.json ├── app │ └── main.py ├── build command.txt ├── dockerfile └── requirements.txt ├── ApacheSpark ├── .ipynb_checkpoints │ ├── 01-streaming-kafka-src-dst-checkpoint.ipynb │ └── 02-streaming-kafka-src-dst-mongodb-checkpoint.ipynb ├── 01-streaming-kafka-src-dst.ipynb └── 02-streaming-kafka-src-dst-mongodb.ipynb ├── Kafka Commands.txt ├── README.md ├── Streamlit ├── .vscode │ └── settings.json └── streamlitapp.py ├── archive ├── docker-compose-kafka-spark-confluent.yml └── docker-compose-kafka_old.yml ├── client ├── .vscode │ └── settings.json ├── api-client.py └── transformer.py ├── docker helpful commands.txt ├── docker-compose-kafka-spark-mongodb.yml ├── docker-compose-kafka-spark.yml └── docker-compose-kafka.yml /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.rsuser 8 | *.suo 9 | *.user 10 | *.userosscache 11 | *.sln.docstates 12 | 13 | # User-specific files (MonoDevelop/Xamarin Studio) 14 | *.userprefs 15 | 16 | # Mono auto generated files 17 | mono_crash.* 18 | 19 | # Build results 20 | [Dd]ebug/ 21 | [Dd]ebugPublic/ 22 | [Rr]elease/ 23 | [Rr]eleases/ 24 | x64/ 25 | x86/ 26 | [Aa][Rr][Mm]/ 27 | [Aa][Rr][Mm]64/ 28 | bld/ 29 | [Bb]in/ 30 | [Oo]bj/ 31 | [Ll]og/ 32 | [Ll]ogs/ 33 | 34 | # Visual Studio 2015/2017 cache/options directory 35 | .vs/ 36 | # Uncomment if you have tasks that create the project's static files in wwwroot 37 | #wwwroot/ 38 | 39 | # Visual Studio 2017 auto generated files 40 | Generated\ Files/ 41 | 42 | # MSTest test Results 43 | [Tt]est[Rr]esult*/ 44 | [Bb]uild[Ll]og.* 45 | 46 | # NUnit 47 | *.VisualState.xml 48 | TestResult.xml 49 | nunit-*.xml 50 | 51 | # Build Results of an ATL Project 52 | [Dd]ebugPS/ 53 | [Rr]eleasePS/ 54 | dlldata.c 55 | 56 | # Benchmark Results 57 | BenchmarkDotNet.Artifacts/ 58 | 59 | # .NET Core 60 | project.lock.json 61 | project.fragment.lock.json 62 | artifacts/ 63 | 64 | # StyleCop 65 | StyleCopReport.xml 66 | 67 | # Files built by Visual Studio 68 | *_i.c 69 | *_p.c 70 | *_h.h 71 | *.ilk 72 | *.meta 73 | *.obj 74 | *.iobj 75 | *.pch 76 | *.pdb 77 | *.ipdb 78 | *.pgc 79 | *.pgd 80 | *.rsp 81 | *.sbr 82 | *.tlb 83 | *.tli 84 | *.tlh 85 | *.tmp 86 | *.tmp_proj 87 | *_wpftmp.csproj 88 | *.log 89 | *.vspscc 90 | *.vssscc 91 | .builds 92 | *.pidb 93 | *.svclog 94 | *.scc 95 | 96 | # Chutzpah Test files 97 | _Chutzpah* 98 | 99 | # Visual C++ cache files 100 | ipch/ 101 | *.aps 102 | *.ncb 103 | *.opendb 104 | *.opensdf 105 | *.sdf 106 | *.cachefile 107 | *.VC.db 108 | *.VC.VC.opendb 109 | 110 | # Visual Studio profiler 111 | *.psess 112 | *.vsp 113 | *.vspx 114 | *.sap 115 | 116 | # Visual Studio Trace Files 117 | *.e2e 118 | 119 | # TFS 2012 Local Workspace 120 | $tf/ 121 | 122 | # Guidance Automation Toolkit 123 | *.gpState 124 | 125 | # ReSharper is a .NET coding add-in 126 | _ReSharper*/ 127 | *.[Rr]e[Ss]harper 128 | *.DotSettings.user 129 | 130 | # TeamCity is a build add-in 131 | _TeamCity* 132 | 133 | # DotCover is a Code Coverage Tool 134 | *.dotCover 135 | 136 | # AxoCover is a Code Coverage Tool 137 | .axoCover/* 138 | !.axoCover/settings.json 139 | 140 | # Visual Studio code coverage 
results 141 | *.coverage 142 | *.coveragexml 143 | 144 | # NCrunch 145 | _NCrunch_* 146 | .*crunch*.local.xml 147 | nCrunchTemp_* 148 | 149 | # MightyMoose 150 | *.mm.* 151 | AutoTest.Net/ 152 | 153 | # Web workbench (sass) 154 | .sass-cache/ 155 | 156 | # Installshield output folder 157 | [Ee]xpress/ 158 | 159 | # DocProject is a documentation generator add-in 160 | DocProject/buildhelp/ 161 | DocProject/Help/*.HxT 162 | DocProject/Help/*.HxC 163 | DocProject/Help/*.hhc 164 | DocProject/Help/*.hhk 165 | DocProject/Help/*.hhp 166 | DocProject/Help/Html2 167 | DocProject/Help/html 168 | 169 | # Click-Once directory 170 | publish/ 171 | 172 | # Publish Web Output 173 | *.[Pp]ublish.xml 174 | *.azurePubxml 175 | # Note: Comment the next line if you want to checkin your web deploy settings, 176 | # but database connection strings (with potential passwords) will be unencrypted 177 | *.pubxml 178 | *.publishproj 179 | 180 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 181 | # checkin your Azure Web App publish settings, but sensitive information contained 182 | # in these scripts will be unencrypted 183 | PublishScripts/ 184 | 185 | # NuGet Packages 186 | *.nupkg 187 | # NuGet Symbol Packages 188 | *.snupkg 189 | # The packages folder can be ignored because of Package Restore 190 | **/[Pp]ackages/* 191 | # except build/, which is used as an MSBuild target. 192 | !**/[Pp]ackages/build/ 193 | # Uncomment if necessary however generally it will be regenerated when needed 194 | #!**/[Pp]ackages/repositories.config 195 | # NuGet v3's project.json files produces more ignorable files 196 | *.nuget.props 197 | *.nuget.targets 198 | 199 | # Microsoft Azure Build Output 200 | csx/ 201 | *.build.csdef 202 | 203 | # Microsoft Azure Emulator 204 | ecf/ 205 | rcf/ 206 | 207 | # Windows Store app package directories and files 208 | AppPackages/ 209 | BundleArtifacts/ 210 | Package.StoreAssociation.xml 211 | _pkginfo.txt 212 | *.appx 213 | *.appxbundle 214 | *.appxupload 215 | 216 | # Visual Studio cache files 217 | # files ending in .cache can be ignored 218 | *.[Cc]ache 219 | # but keep track of directories ending in .cache 220 | !?*.[Cc]ache/ 221 | 222 | # Others 223 | ClientBin/ 224 | ~$* 225 | *~ 226 | *.dbmdl 227 | *.dbproj.schemaview 228 | *.jfm 229 | *.pfx 230 | *.publishsettings 231 | orleans.codegen.cs 232 | 233 | # Including strong name files can present a security risk 234 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 235 | #*.snk 236 | 237 | # Since there are multiple workflows, uncomment next line to ignore bower_components 238 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 239 | #bower_components/ 240 | 241 | # RIA/Silverlight projects 242 | Generated_Code/ 243 | 244 | # Backup & report files from converting an old project file 245 | # to a newer Visual Studio version. 
Backup files are not needed, 246 | # because we have git ;-) 247 | _UpgradeReport_Files/ 248 | Backup*/ 249 | UpgradeLog*.XML 250 | UpgradeLog*.htm 251 | ServiceFabricBackup/ 252 | *.rptproj.bak 253 | 254 | # SQL Server files 255 | *.mdf 256 | *.ldf 257 | *.ndf 258 | 259 | # Business Intelligence projects 260 | *.rdl.data 261 | *.bim.layout 262 | *.bim_*.settings 263 | *.rptproj.rsuser 264 | *- [Bb]ackup.rdl 265 | *- [Bb]ackup ([0-9]).rdl 266 | *- [Bb]ackup ([0-9][0-9]).rdl 267 | 268 | # Microsoft Fakes 269 | FakesAssemblies/ 270 | 271 | # GhostDoc plugin setting file 272 | *.GhostDoc.xml 273 | 274 | # Node.js Tools for Visual Studio 275 | .ntvs_analysis.dat 276 | node_modules/ 277 | 278 | # Visual Studio 6 build log 279 | *.plg 280 | 281 | # Visual Studio 6 workspace options file 282 | *.opt 283 | 284 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 285 | *.vbw 286 | 287 | # Visual Studio LightSwitch build output 288 | **/*.HTMLClient/GeneratedArtifacts 289 | **/*.DesktopClient/GeneratedArtifacts 290 | **/*.DesktopClient/ModelManifest.xml 291 | **/*.Server/GeneratedArtifacts 292 | **/*.Server/ModelManifest.xml 293 | _Pvt_Extensions 294 | 295 | # Paket dependency manager 296 | .paket/paket.exe 297 | paket-files/ 298 | 299 | # FAKE - F# Make 300 | .fake/ 301 | 302 | # CodeRush personal settings 303 | .cr/personal 304 | 305 | # Python Tools for Visual Studio (PTVS) 306 | __pycache__/ 307 | *.pyc 308 | 309 | # Cake - Uncomment if you are using it 310 | # tools/** 311 | # !tools/packages.config 312 | 313 | # Tabs Studio 314 | *.tss 315 | 316 | # Telerik's JustMock configuration file 317 | *.jmconfig 318 | 319 | # BizTalk build output 320 | *.btp.cs 321 | *.btm.cs 322 | *.odx.cs 323 | *.xsd.cs 324 | 325 | # OpenCover UI analysis results 326 | OpenCover/ 327 | 328 | # Azure Stream Analytics local run output 329 | ASALocalRun/ 330 | 331 | # MSBuild Binary and Structured Log 332 | *.binlog 333 | 334 | # NVidia Nsight GPU debugger configuration file 335 | *.nvuser 336 | 337 | # MFractors (Xamarin productivity tool) working folder 338 | .mfractor/ 339 | 340 | # Local History for Visual Studio 341 | .localhistory/ 342 | 343 | # BeatPulse healthcheck temp database 344 | healthchecksdb 345 | 346 | # Backup folder for Package Reference Convert tool in Visual Studio 2017 347 | MigrationBackup/ 348 | 349 | # Ionide (cross platform F# VS Code tools) working folder 350 | .ionide/ 351 | client/data.csv 352 | client/output.txt 353 | -------------------------------------------------------------------------------- /API-Ingest/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.pythonPath": "/bin/python3" 3 | } -------------------------------------------------------------------------------- /API-Ingest/Kafka consumer/consumer.py: -------------------------------------------------------------------------------- 1 | from kafka import KafkaConsumer 2 | 3 | #consumer = KafkaConsumer() 4 | 5 | # define a consumer that waits for new messages 6 | def kafka_python_consumer(): 7 | 8 | # Consumer using the topic name and setting a group id 9 | consumer = KafkaConsumer('ingestion-topic', group_id='mypythonconsumer',bootstrap_servers='localhost:9092',) 10 | for msg in consumer: 11 | print(msg) 12 | 13 | print("start consuming") 14 | 15 | # start the consumer 16 | kafka_python_consumer() 17 | 18 | print("done") 19 | -------------------------------------------------------------------------------- 
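A minimal kafka-python producer sketch that can be used to exercise the consumer above without going through the ingest API. This is a sketch, not part of the project code: it assumes the broker is reachable from the host at localhost:9092 and that the ingestion-topic already exists (see Kafka Commands.txt). The sample payload simply mirrors the InvoiceItem schema used in API-Ingest/app/main.py.

from kafka import KafkaProducer
import json

# Hypothetical test record; field names follow the InvoiceItem schema of the ingest API
test_message = {
    "InvoiceNo": 536365,
    "StockCode": "85123A",
    "Description": "WHITE HANGING HEART T-LIGHT HOLDER",
    "Quantity": 6,
    "InvoiceDate": "12-02-2010 08:26:00",
    "UnitPrice": 2.55,
    "CustomerID": 17850,
    "Country": "United Kingdom",
}

# Connect to the broker the consumer above listens to (localhost:9092 from the host)
producer = KafkaProducer(bootstrap_servers='localhost:9092', acks=1)

# Kafka expects bytes, so serialize the dict to a JSON string and encode it
producer.send('ingestion-topic', json.dumps(test_message).encode('utf-8'))
producer.flush()
print("test message sent to ingestion-topic")

Running this script while consumer.py is active should print the record in the consumer's terminal, which is the same check described step by step in Kafka Commands.txt.
--------------------------------------------------------------------------------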
/API-Ingest/Postman/IngestAPI-Test.postman_collection.json: -------------------------------------------------------------------------------- 1 | { 2 | "info": { 3 | "_postman_id": "793bf228-6eb4-497b-9859-355a2251ab4a", 4 | "name": "IngestAPI-Test", 5 | "schema": "https://schema.getpostman.com/json/collection/v2.1.0/collection.json" 6 | }, 7 | "item": [ 8 | { 9 | "name": "Test Date Validation - Good modified", 10 | "request": { 11 | "method": "POST", 12 | "header": [], 13 | "body": { 14 | "mode": "raw", 15 | "raw": "{\"InvoiceNo\":536365,\"StockCode\":\"85123A\",\"Description\":\"WHITE HANGING HEART T-LIGHT HOLDER\",\"Quantity\":6,\"InvoiceDate\":\"12/02/2010 8:26\",\"UnitPrice\":2.55,\"CustomerID\":17850,\"Country\":\"United Kingdom\"}\r\n", 16 | "options": { 17 | "raw": { 18 | "language": "json" 19 | } 20 | } 21 | }, 22 | "url": { 23 | "raw": "localhost:8000/invoiceitem", 24 | "host": [ 25 | "localhost" 26 | ], 27 | "port": "8000", 28 | "path": [ 29 | "invoiceitem" 30 | ] 31 | } 32 | }, 33 | "response": [] 34 | }, 35 | { 36 | "name": "Test Date Validation - Good original", 37 | "request": { 38 | "method": "POST", 39 | "header": [], 40 | "body": { 41 | "mode": "raw", 42 | "raw": "{\"InvoiceNo\":536365,\"StockCode\":\"85123A\",\"Description\":\"WHITE HANGING HEART T-LIGHT HOLDER\",\"Quantity\":6,\"InvoiceDate\":\"12/2/2010 8:26\",\"UnitPrice\":2.55,\"CustomerID\":17850,\"Country\":\"United Kingdom\"}\r\n", 43 | "options": { 44 | "raw": { 45 | "language": "json" 46 | } 47 | } 48 | }, 49 | "url": { 50 | "raw": "localhost:8000/invoiceitem", 51 | "host": [ 52 | "localhost" 53 | ], 54 | "port": "8000", 55 | "path": [ 56 | "invoiceitem" 57 | ] 58 | } 59 | }, 60 | "response": [] 61 | }, 62 | { 63 | "name": "Test Date Validation - Bad", 64 | "request": { 65 | "method": "POST", 66 | "header": [], 67 | "body": { 68 | "mode": "raw", 69 | "raw": "{\"InvoiceNo\":536365,\"StockCode\":\"85123A\",\"Description\":\"WHITE HANGING HEART T-LIGHT HOLDER\",\"Quantity\":6,\"InvoiceDate\":\"12/2/2010 8:26232sa\",\"UnitPrice\":2.55,\"CustomerID\":17850,\"Country\":\"United Kingdom\"}\r\n", 70 | "options": { 71 | "raw": { 72 | "language": "json" 73 | } 74 | } 75 | }, 76 | "url": { 77 | "raw": "localhost:8000/invoiceitem", 78 | "host": [ 79 | "localhost" 80 | ], 81 | "port": "8000", 82 | "path": [ 83 | "invoiceitem" 84 | ] 85 | } 86 | }, 87 | "response": [] 88 | }, 89 | { 90 | "name": "Test invoiceno Validation - Bad Copy", 91 | "request": { 92 | "method": "POST", 93 | "header": [], 94 | "body": { 95 | "mode": "raw", 96 | "raw": "{\"InvoiceNo\":\"536365abc\",\"StockCode\":\"85123A\",\"Description\":\"WHITE HANGING HEART T-LIGHT HOLDER\",\"Quantity\":6,\"InvoiceDate\":\"12/2/2010 8:26\",\"UnitPrice\":2.55,\"CustomerID\":17850,\"Country\":\"United Kingdom\"}\r\n", 97 | "options": { 98 | "raw": { 99 | "language": "json" 100 | } 101 | } 102 | }, 103 | "url": { 104 | "raw": "localhost:8000/invoiceitem", 105 | "host": [ 106 | "localhost" 107 | ], 108 | "port": "8000", 109 | "path": [ 110 | "invoiceitem" 111 | ] 112 | } 113 | }, 114 | "response": [] 115 | }, 116 | { 117 | "name": "Docker container test 1", 118 | "request": { 119 | "method": "POST", 120 | "header": [], 121 | "body": { 122 | "mode": "raw", 123 | "raw": "{\"InvoiceNo\":536365,\"StockCode\":\"85123A\",\"Description\":\"WHITE HANGING HEART T-LIGHT HOLDER\",\"Quantity\":6,\"InvoiceDate\":\"12/02/2010 8:26\",\"UnitPrice\":2.55,\"CustomerID\":17850,\"Country\":\"United Kingdom\"}\r\n", 124 | "options": { 125 | "raw": { 126 | "language": "json" 127 | } 128 | } 129 
| }, 130 | "url": { 131 | "raw": "localhost:8000/invoiceitem", 132 | "host": [ 133 | "localhost" 134 | ], 135 | "port": "8000", 136 | "path": [ 137 | "invoiceitem" 138 | ] 139 | } 140 | }, 141 | "response": [] 142 | }, 143 | { 144 | "name": "Docker container test 2", 145 | "request": { 146 | "method": "POST", 147 | "header": [], 148 | "body": { 149 | "mode": "raw", 150 | "raw": "{\"InvoiceNo\":536365,\"StockCode\":\"85123A\",\"Description\":\"WHITE HANGING HEART T-LIGHT HOLDER\",\"Quantity\":6,\"InvoiceDate\":\"12/02/2010 8:26\",\"UnitPrice\":2.55,\"CustomerID\":17850,\"Country\":\"United Kingdom\"}\r\n", 151 | "options": { 152 | "raw": { 153 | "language": "json" 154 | } 155 | } 156 | }, 157 | "url": { 158 | "raw": "localhost:8000/invoiceitem", 159 | "host": [ 160 | "localhost" 161 | ], 162 | "port": "8000", 163 | "path": [ 164 | "invoiceitem" 165 | ] 166 | } 167 | }, 168 | "response": [] 169 | } 170 | ] 171 | } -------------------------------------------------------------------------------- /API-Ingest/app/main.py: -------------------------------------------------------------------------------- 1 | # You need this to use FastAPI, work with statuses and be able to end HTTPExceptions 2 | from fastapi import FastAPI, status, HTTPException 3 | 4 | # You need this to be able to turn classes into JSONs and return 5 | from fastapi.encoders import jsonable_encoder 6 | from fastapi.responses import JSONResponse 7 | 8 | # Needed for json.dumps 9 | import json 10 | 11 | # Both used for BaseModel 12 | from pydantic import BaseModel 13 | 14 | from datetime import datetime 15 | from kafka import KafkaProducer, producer 16 | 17 | 18 | 19 | # Create class (schema) for the JSON 20 | # Date get's ingested as string and then before writing validated 21 | class InvoiceItem(BaseModel): 22 | InvoiceNo: int 23 | StockCode: str 24 | Description: str 25 | Quantity: int 26 | InvoiceDate: str 27 | UnitPrice: float 28 | CustomerID: int 29 | Country: str 30 | 31 | # This is important for general execution and the docker later 32 | app = FastAPI() 33 | 34 | # Base URL 35 | @app.get("/") 36 | async def root(): 37 | return {"message": "Hello World"} 38 | 39 | # Add a new invoice 40 | @app.post("/invoiceitem") 41 | async def post_invoice_item(item: InvoiceItem): #body awaits a json with invoice item information 42 | print("Message received") 43 | try: 44 | # Evaluate the timestamp and parse it to datetime object you can work with 45 | date = datetime.strptime(item.InvoiceDate, "%d/%m/%Y %H:%M") 46 | 47 | print('Found a timestamp: ', date) 48 | 49 | # Replace strange date with new datetime 50 | # Use strftime to parse the string in the right format (replace / with - and add seconds) 51 | item.InvoiceDate = date.strftime("%d-%m-%Y %H:%M:%S") 52 | print("New item date:", item.InvoiceDate) 53 | 54 | # Parse item back to json 55 | json_of_item = jsonable_encoder(item) 56 | 57 | # Dump the json out as string 58 | json_as_string = json.dumps(json_of_item) 59 | print(json_as_string) 60 | 61 | # Produce the string 62 | produce_kafka_string(json_as_string) 63 | 64 | # Encode the created customer item if successful into a JSON and return it to the client with 201 65 | return JSONResponse(content=json_of_item, status_code=201) 66 | 67 | # Will be thrown by datetime if the date does not fit 68 | # All other value errors are automatically taken care of because of the InvoiceItem Class 69 | except ValueError: 70 | return JSONResponse(content=jsonable_encoder(item), status_code=400) 71 | 72 | 73 | def 
produce_kafka_string(json_as_string): 74 | # Create producer 75 | producer = KafkaProducer(bootstrap_servers='kafka:9092',acks=1) 76 | 77 | # Write the string as bytes because Kafka needs it this way 78 | producer.send('ingestion-topic', bytes(json_as_string, 'utf-8')) 79 | producer.flush() -------------------------------------------------------------------------------- /API-Ingest/build command.txt: -------------------------------------------------------------------------------- 1 | docker build -t api-ingest . -------------------------------------------------------------------------------- /API-Ingest/dockerfile: -------------------------------------------------------------------------------- 1 | FROM tiangolo/uvicorn-gunicorn-fastapi:python3.7 2 | 3 | COPY requirements.txt /tmp/ 4 | 5 | RUN pip install --no-cache-dir --upgrade pip && \ 6 | pip install --requirement /tmp/requirements.txt 7 | 8 | COPY ./app /app -------------------------------------------------------------------------------- /API-Ingest/requirements.txt: -------------------------------------------------------------------------------- 1 | kafka-python -------------------------------------------------------------------------------- /ApacheSpark/.ipynb_checkpoints/01-streaming-kafka-src-dst-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from pyspark.sql import SparkSession\n", 10 | "\n", 11 | "# Spark session & context\n", 12 | "spark = (SparkSession\n", 13 | " .builder\n", 14 | " .master('local')\n", 15 | " .appName('kafka-streaming')\n", 16 | " # Add kafka package\n", 17 | " .config(\"spark.jars.packages\", \"org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5\")\n", 18 | " .getOrCreate())\n", 19 | "sc = spark.sparkContext\n" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "data": { 29 | "text/plain": [ 30 | "DataFrame[key: string, value: string]" 31 | ] 32 | }, 33 | "execution_count": 2, 34 | "metadata": {}, 35 | "output_type": "execute_result" 36 | } 37 | ], 38 | "source": [ 39 | "df = spark \\\n", 40 | " .readStream \\\n", 41 | " .format(\"kafka\") \\\n", 42 | " .option(\"kafka.bootstrap.servers\", \"kafka:9092\") \\\n", 43 | " .option(\"subscribe\", \"ingestion-topic\") \\\n", 44 | " .load()\n", 45 | "df.selectExpr(\"CAST(key AS STRING)\", \"CAST(value AS STRING)\")" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 3, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "df.createOrReplaceTempView(\"message\")" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 4, 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "data": { 64 | "text/plain": [ 65 | "" 66 | ] 67 | }, 68 | "execution_count": 4, 69 | "metadata": {}, 70 | "output_type": "execute_result" 71 | } 72 | ], 73 | "source": [ 74 | "res = spark.sql(\"SELECT * from message\")\n", 75 | "res.writeStream.format(\"console\") \\\n", 76 | " .outputMode(\"append\")" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 5, 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "ename": "AnalysisException", 86 | "evalue": "'checkpointLocation must be specified either through option(\"checkpointLocation\", ...) 
or SparkSession.conf.set(\"spark.sql.streaming.checkpointLocation\", ...);'", 87 | "output_type": "error", 88 | "traceback": [ 89 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 90 | "\u001b[0;31mPy4JJavaError\u001b[0m Traceback (most recent call last)", 91 | "\u001b[0;32m/usr/local/spark/python/pyspark/sql/utils.py\u001b[0m in \u001b[0;36mdeco\u001b[0;34m(*a, **kw)\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 63\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 64\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mpy4j\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprotocol\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPy4JJavaError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 92 | "\u001b[0;32m/usr/local/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py\u001b[0m in \u001b[0;36mget_return_value\u001b[0;34m(answer, gateway_client, target_id, name)\u001b[0m\n\u001b[1;32m 327\u001b[0m \u001b[0;34m\"An error occurred while calling {0}{1}{2}.\\n\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 328\u001b[0;31m format(target_id, \".\", name), value)\n\u001b[0m\u001b[1;32m 329\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 93 | "\u001b[0;31mPy4JJavaError\u001b[0m: An error occurred while calling o65.start.\n: org.apache.spark.sql.AnalysisException: checkpointLocation must be specified either through option(\"checkpointLocation\", ...) 
or SparkSession.conf.set(\"spark.sql.streaming.checkpointLocation\", ...);\n\tat org.apache.spark.sql.streaming.StreamingQueryManager$$anonfun$4.apply(StreamingQueryManager.scala:234)\n\tat org.apache.spark.sql.streaming.StreamingQueryManager$$anonfun$4.apply(StreamingQueryManager.scala:229)\n\tat scala.Option.getOrElse(Option.scala:121)\n\tat org.apache.spark.sql.streaming.StreamingQueryManager.createQuery(StreamingQueryManager.scala:228)\n\tat org.apache.spark.sql.streaming.StreamingQueryManager.startQuery(StreamingQueryManager.scala:322)\n\tat org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:325)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat py4j.Gateway.invoke(Gateway.java:282)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.GatewayConnection.run(GatewayConnection.java:238)\n\tat java.lang.Thread.run(Thread.java:748)\n", 94 | "\nDuring handling of the above exception, another exception occurred:\n", 95 | "\u001b[0;31mAnalysisException\u001b[0m Traceback (most recent call last)", 96 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"kafka\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0moption\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"kafka.bootstrap.servers\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"host1:port1,host2:port2\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0;34m.\u001b[0m\u001b[0moption\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"topic\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"topic1\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 7\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mstart\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mawaitTermination\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 97 | "\u001b[0;32m/usr/local/spark/python/pyspark/sql/streaming.py\u001b[0m in \u001b[0;36mstart\u001b[0;34m(self, path, format, outputMode, partitionBy, queryName, **options)\u001b[0m\n\u001b[1;32m 1106\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mqueryName\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mqueryName\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1107\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mpath\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1108\u001b[0;31m \u001b[0;32mreturn\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sq\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_jwrite\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstart\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1109\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1110\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sq\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_jwrite\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstart\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 98 | "\u001b[0;32m/usr/local/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 1255\u001b[0m \u001b[0manswer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgateway_client\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1256\u001b[0m return_value = get_return_value(\n\u001b[0;32m-> 1257\u001b[0;31m answer, self.gateway_client, self.target_id, self.name)\n\u001b[0m\u001b[1;32m 1258\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1259\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mtemp_arg\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtemp_args\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 99 | "\u001b[0;32m/usr/local/spark/python/pyspark/sql/utils.py\u001b[0m in \u001b[0;36mdeco\u001b[0;34m(*a, **kw)\u001b[0m\n\u001b[1;32m 67\u001b[0m e.java_exception.getStackTrace()))\n\u001b[1;32m 68\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0ms\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstartswith\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'org.apache.spark.sql.AnalysisException: '\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 69\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mAnalysisException\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m': '\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstackTrace\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 70\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0ms\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstartswith\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'org.apache.spark.sql.catalyst.analysis'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 71\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mAnalysisException\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m': '\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstackTrace\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 100 | 
"\u001b[0;31mAnalysisException\u001b[0m: 'checkpointLocation must be specified either through option(\"checkpointLocation\", ...) or SparkSession.conf.set(\"spark.sql.streaming.checkpointLocation\", ...);'" 101 | ] 102 | } 103 | ], 104 | "source": [ 105 | "ds = df \\\n", 106 | " .selectExpr(\"CAST(key AS STRING)\", \"CAST(value AS STRING)\") \\\n", 107 | " .writeStream \\\n", 108 | " .format(\"kafka\") \\\n", 109 | " .option(\"kafka.bootstrap.servers\", \"kafka:9092\") \\\n", 110 | " .option(\"topic\", \"spark-output\") \\\n", 111 | " .start() \\\n", 112 | " .awaitTermination()" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "#df.writeStream \\\n", 122 | " # .format(\"kafka\") \\\n", 123 | " # .option(\"kafka.bootstrap.servers\", \"kafka:9092\") \\\n", 124 | " # .option(\"topic\", \"spark-output\") \\\n", 125 | " # .start()\n", 126 | " " 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [] 135 | } 136 | ], 137 | "metadata": { 138 | "kernelspec": { 139 | "display_name": "Python 3", 140 | "language": "python", 141 | "name": "python3" 142 | }, 143 | "language_info": { 144 | "codemirror_mode": { 145 | "name": "ipython", 146 | "version": 3 147 | }, 148 | "file_extension": ".py", 149 | "mimetype": "text/x-python", 150 | "name": "python", 151 | "nbconvert_exporter": "python", 152 | "pygments_lexer": "ipython3", 153 | "version": "3.7.6" 154 | } 155 | }, 156 | "nbformat": 4, 157 | "nbformat_minor": 4 158 | } 159 | -------------------------------------------------------------------------------- /ApacheSpark/.ipynb_checkpoints/02-streaming-kafka-src-dst-mongodb-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from pyspark.sql import SparkSession\n", 10 | "\n", 11 | "# Spark session & context\n", 12 | "spark = (SparkSession\n", 13 | " .builder\n", 14 | " .master('local')\n", 15 | " .appName('kafka-mongo-streaming') \n", 16 | " # Add kafka package and mongodb package. 
Make sure to to this as one string!\n", 17 | " # Versions need to match the Spark version (trial & error)\n", 18 | " .config(\"spark.jars.packages\", \"org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5,org.mongodb.spark:mongo-spark-connector_2.11:2.4.0\")\n", 19 | " # Mongo config including the username and password from compose file\n", 20 | " .config(\"spark.mongodb.input.uri\",\"mongodb://root:example@mongo:27017/docstreaming.invoices?authSource=admin\")\n", 21 | " .config(\"spark.mongodb.output.uri\",\"mongodb://root:example@mongo:27017/docstreaming.invoices?authSource=admin\")\n", 22 | " .getOrCreate())\n", 23 | "sc = spark.sparkContext\n" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 4, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "# Read the message from the kafka stream\n", 33 | "df = spark \\\n", 34 | " .readStream \\\n", 35 | " .format(\"kafka\") \\\n", 36 | " .option(\"kafka.bootstrap.servers\", \"kafka:9092\") \\\n", 37 | " .option(\"subscribe\", \"ingestion-topic\") \\\n", 38 | " .load()\n", 39 | "\n", 40 | "# convert the binary values to string\n", 41 | "df1 = df.selectExpr(\"CAST(key AS STRING)\", \"CAST(value AS STRING)\")" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 5, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "#Create a temporary view for SparkSQL\n", 51 | "df1.createOrReplaceTempView(\"message\")" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 6, 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "data": { 61 | "text/plain": [ 62 | "" 63 | ] 64 | }, 65 | "execution_count": 6, 66 | "metadata": {}, 67 | "output_type": "execute_result" 68 | } 69 | ], 70 | "source": [ 71 | "# Write out the message to the console of the environment\n", 72 | "res = spark.sql(\"SELECT * from message\")\n", 73 | "res.writeStream.format(\"console\") \\\n", 74 | " .outputMode(\"append\") \\\n", 75 | " .start()" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 7, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "# Write the unvonverted dataframe (no strings)\n", 85 | "# message back into Kafka in another topic#\n", 86 | "# listen to it with a local consumer\n", 87 | "ds = df \\\n", 88 | " .writeStream \\\n", 89 | " .format(\"kafka\") \\\n", 90 | " .option(\"kafka.bootstrap.servers\", \"kafka:9092\") \\\n", 91 | " .option(\"topic\", \"spark-output\") \\\n", 92 | " .option(\"checkpointLocation\", \"/tmp\") \\\n", 93 | " .start() " 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 22, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "# Write the message into MongoDB\n", 103 | "\n", 104 | "def f(row):\n", 105 | " #dfjson = row.value\n", 106 | " #write that dataframe to mongodb\n", 107 | " #df = spark.read.json(row.value)\n", 108 | " \n", 109 | " dataframe = spark.read.json(row.value)\n", 110 | " \n", 111 | " dataframe.write.format(\"com.mongodb.spark.sql.DefaultSource\").mode(\"append\").save()\n", 112 | "\n", 113 | "def foreach_batch_function(df, epoch_id):\n", 114 | " # Transform and write batchDF\n", 115 | "\n", 116 | " #writes dataframe with complete kafka message\n", 117 | " #df.write.format(\"com.mongodb.spark.sql.DefaultSource\").mode(\"append\").save()\n", 118 | " \n", 119 | " #only get json sring from dataframe\n", 120 | " #value = df.select(\"value\")\n", 121 | " \n", 122 | " # write each row to mongodb (there is only one)\n", 123 | " #df.foreach(f)\n", 124 | "\n", 125 | " from pyspark.sql.types import 
MapType,StringType\n", 126 | " from pyspark.sql.functions import from_json\n", 127 | " \n", 128 | " df2=df.withColumn(\"value\",from_json(df.value,MapType(StringType(),StringType())))\n", 129 | " \n", 130 | " df3= df2.select(\"Quantity\",\"UnitPrice\",\"Country\",\"CustomerID\",\"StockCode\",\"Description\",\"InvoiceDate\",\"InvoiceNo\")\n", 131 | " \n", 132 | " df3.write.format(\"com.mongodb.spark.sql.DefaultSource\").mode(\"append\").save()\n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " pass" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "df1.writeStream.foreachBatch(foreach_batch_function).start().awaitTermination()" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [] 162 | } 163 | ], 164 | "metadata": { 165 | "kernelspec": { 166 | "display_name": "Python 3", 167 | "language": "python", 168 | "name": "python3" 169 | }, 170 | "language_info": { 171 | "codemirror_mode": { 172 | "name": "ipython", 173 | "version": 3 174 | }, 175 | "file_extension": ".py", 176 | "mimetype": "text/x-python", 177 | "name": "python", 178 | "nbconvert_exporter": "python", 179 | "pygments_lexer": "ipython3", 180 | "version": "3.7.6" 181 | } 182 | }, 183 | "nbformat": 4, 184 | "nbformat_minor": 4 185 | } 186 | -------------------------------------------------------------------------------- /ApacheSpark/01-streaming-kafka-src-dst.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from pyspark.sql import SparkSession\n", 10 | "\n", 11 | "# Spark session & context\n", 12 | "spark = (SparkSession\n", 13 | " .builder\n", 14 | " .master('local')\n", 15 | " .appName('kafka-streaming')\n", 16 | " # Add kafka package\n", 17 | " .config(\"spark.jars.packages\", \"org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5\")\n", 18 | " .getOrCreate())\n", 19 | "sc = spark.sparkContext\n" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "data": { 29 | "text/plain": [ 30 | "DataFrame[key: string, value: string]" 31 | ] 32 | }, 33 | "execution_count": 2, 34 | "metadata": {}, 35 | "output_type": "execute_result" 36 | } 37 | ], 38 | "source": [ 39 | "# Read the message from the kafka stream\n", 40 | "df = spark \\\n", 41 | " .readStream \\\n", 42 | " .format(\"kafka\") \\\n", 43 | " .option(\"kafka.bootstrap.servers\", \"kafka:9092\") \\\n", 44 | " .option(\"subscribe\", \"ingestion-topic\") \\\n", 45 | " .load()\n", 46 | "df.selectExpr(\"CAST(key AS STRING)\", \"CAST(value AS STRING)\")" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 3, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "#Create a small temporary view for SparkSQL\n", 56 | "df.createOrReplaceTempView(\"message\")" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 4, 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "data": { 66 | "text/plain": [ 67 | "" 68 | ] 69 | }, 70 | "execution_count": 4, 71 | "metadata": {}, 72 | "output_type": "execute_result" 73 | } 74 | ], 75 | "source": [ 76 | "# Write out the message to the 
console of the environment\n", 77 | "res = spark.sql(\"SELECT * from message\")\n", 78 | "res.writeStream.format(\"console\") \\\n", 79 | " .outputMode(\"append\") \\\n", 80 | " .start() " 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 5, 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "ename": "KeyboardInterrupt", 90 | "evalue": "", 91 | "output_type": "error", 92 | "traceback": [ 93 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 94 | "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", 95 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0moption\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"kafka.bootstrap.servers\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"kafka:9092\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0moption\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"topic\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"spark-output\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0;34m.\u001b[0m\u001b[0moption\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"checkpointLocation\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"/tmp\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 8\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mstart\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mawaitTermination\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 96 | "\u001b[0;32m/usr/local/spark/python/pyspark/sql/streaming.py\u001b[0m in \u001b[0;36mawaitTermination\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 101\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_jsq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mawaitTermination\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0;36m1000\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 102\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 103\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_jsq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mawaitTermination\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 104\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 105\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mproperty\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 97 | "\u001b[0;32m/usr/local/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 1253\u001b[0m \u001b[0mproto\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mEND_COMMAND_PART\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1254\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 
1255\u001b[0;31m \u001b[0manswer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgateway_client\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1256\u001b[0m return_value = get_return_value(\n\u001b[1;32m 1257\u001b[0m answer, self.gateway_client, self.target_id, self.name)\n", 98 | "\u001b[0;32m/usr/local/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py\u001b[0m in \u001b[0;36msend_command\u001b[0;34m(self, command, retry, binary)\u001b[0m\n\u001b[1;32m 983\u001b[0m \u001b[0mconnection\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_connection\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 984\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 985\u001b[0;31m \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mconnection\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 986\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mbinary\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 987\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresponse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_create_connection_guard\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconnection\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 99 | "\u001b[0;32m/usr/local/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py\u001b[0m in \u001b[0;36msend_command\u001b[0;34m(self, command)\u001b[0m\n\u001b[1;32m 1150\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1151\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1152\u001b[0;31m \u001b[0manswer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msmart_decode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstream\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreadline\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1153\u001b[0m \u001b[0mlogger\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdebug\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Answer received: {0}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0manswer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1154\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0manswer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstartswith\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mproto\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mRETURN_MESSAGE\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 100 | "\u001b[0;32m/opt/conda/lib/python3.7/socket.py\u001b[0m in \u001b[0;36mreadinto\u001b[0;34m(self, b)\u001b[0m\n\u001b[1;32m 587\u001b[0m \u001b[0;32mwhile\u001b[0m 
\u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 588\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 589\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv_into\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 590\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 591\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_timeout_occurred\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 101 | "\u001b[0;31mKeyboardInterrupt\u001b[0m: " 102 | ] 103 | } 104 | ], 105 | "source": [ 106 | "# Write the message back into Kafka in another topic that you are going to listen to with a local consumer\n", 107 | "ds = df \\\n", 108 | " .writeStream \\\n", 109 | " .format(\"kafka\") \\\n", 110 | " .option(\"kafka.bootstrap.servers\", \"kafka:9092\") \\\n", 111 | " .option(\"topic\", \"spark-output\") \\\n", 112 | " .option(\"checkpointLocation\", \"/tmp\") \\\n", 113 | " .start() \\\n", 114 | " .awaitTermination()" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [] 123 | } 124 | ], 125 | "metadata": { 126 | "kernelspec": { 127 | "display_name": "Python 3", 128 | "language": "python", 129 | "name": "python3" 130 | }, 131 | "language_info": { 132 | "codemirror_mode": { 133 | "name": "ipython", 134 | "version": 3 135 | }, 136 | "file_extension": ".py", 137 | "mimetype": "text/x-python", 138 | "name": "python", 139 | "nbconvert_exporter": "python", 140 | "pygments_lexer": "ipython3", 141 | "version": "3.7.6" 142 | } 143 | }, 144 | "nbformat": 4, 145 | "nbformat_minor": 4 146 | } 147 | -------------------------------------------------------------------------------- /ApacheSpark/02-streaming-kafka-src-dst-mongodb.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from pyspark.sql import SparkSession\n", 10 | "\n", 11 | "# you need these two to transform the json strings to dataframes\n", 12 | "from pyspark.sql.types import MapType,StringType\n", 13 | "from pyspark.sql.functions import from_json\n", 14 | "\n", 15 | "# Spark session & context\n", 16 | "spark = (SparkSession\n", 17 | " .builder\n", 18 | " .master('local')\n", 19 | " .appName('kafka-mongo-streaming') \n", 20 | " # Add kafka package and mongodb package. 
Make sure to to this as one string!\n", 21 | " # Versions need to match the Spark version (trial & error)\n", 22 | " .config(\"spark.jars.packages\", \"org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5,org.mongodb.spark:mongo-spark-connector_2.11:2.4.0\")\n", 23 | " # Mongo config including the username and password from compose file\n", 24 | " .config(\"spark.mongodb.input.uri\",\"mongodb://root:example@mongo:27017/docstreaming.invoices?authSource=admin\")\n", 25 | " .config(\"spark.mongodb.output.uri\",\"mongodb://root:example@mongo:27017/docstreaming.invoices?authSource=admin\")\n", 26 | " .getOrCreate())\n", 27 | "sc = spark.sparkContext\n" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "# Read the message from the kafka stream\n", 37 | "df = spark \\\n", 38 | " .readStream \\\n", 39 | " .format(\"kafka\") \\\n", 40 | " .option(\"kafka.bootstrap.servers\", \"kafka:9092\") \\\n", 41 | " .option(\"subscribe\", \"ingestion-topic\") \\\n", 42 | " .load()\n", 43 | "\n", 44 | "# convert the binary values to string\n", 45 | "df1 = df.selectExpr(\"CAST(key AS STRING)\", \"CAST(value AS STRING)\")" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 3, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "#Create a temporary view for SparkSQL\n", 55 | "df1.createOrReplaceTempView(\"message\")" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 4, 61 | "metadata": {}, 62 | "outputs": [ 63 | { 64 | "data": { 65 | "text/plain": [ 66 | "" 67 | ] 68 | }, 69 | "execution_count": 4, 70 | "metadata": {}, 71 | "output_type": "execute_result" 72 | } 73 | ], 74 | "source": [ 75 | "# Write out the message to the console of the environment\n", 76 | "res = spark.sql(\"SELECT * from message\")\n", 77 | "res.writeStream.format(\"console\") \\\n", 78 | " .outputMode(\"append\") \\\n", 79 | " .start()" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 5, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "# Write the unvonverted dataframe (no strings)\n", 89 | "# message back into Kafka in another topic#\n", 90 | "# listen to it with a local consumer\n", 91 | "ds = df \\\n", 92 | " .writeStream \\\n", 93 | " .format(\"kafka\") \\\n", 94 | " .option(\"kafka.bootstrap.servers\", \"kafka:9092\") \\\n", 95 | " .option(\"topic\", \"spark-output\") \\\n", 96 | " .option(\"checkpointLocation\", \"/tmp\") \\\n", 97 | " .start() " 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 6, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | " \n", 107 | "\n", 108 | "# Write the message into MongoDB\n", 109 | "def foreach_batch_function(df, epoch_id):\n", 110 | " # Transform and write batchDF in this foreach\n", 111 | "\n", 112 | " # writes the dataframe with complete kafka message into mongodb\n", 113 | " #df.write.format(\"com.mongodb.spark.sql.DefaultSource\").mode(\"append\").save()\n", 114 | " \n", 115 | " #Transform the values of all rows in column value and create a dataframe out of it (will also only have one row)\n", 116 | " df2=df.withColumn(\"value\",from_json(df.value,MapType(StringType(),StringType()))) \n", 117 | " \n", 118 | " # Transform the dataframe so that it will have individual columns \n", 119 | " df3= df2.select([\"value.Quantity\",\"value.UnitPrice\",\"value.Country\",\"value.CustomerID\",\"value.StockCode\",\"value.Description\",\"value.InvoiceDate\",\"value.InvoiceNo\"])\n", 120 | " \n", 121 
| " # Send the dataframe into MongoDB which will create a BSON document out of it\n", 122 | " df3.write.format(\"com.mongodb.spark.sql.DefaultSource\").mode(\"append\").save()\n", 123 | " \n", 124 | " pass" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "# Start the MongoDB stream and wait for termination\n", 134 | "df1.writeStream.foreachBatch(foreach_batch_function).start().awaitTermination()" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [] 150 | } 151 | ], 152 | "metadata": { 153 | "kernelspec": { 154 | "display_name": "Python 3", 155 | "language": "python", 156 | "name": "python3" 157 | }, 158 | "language_info": { 159 | "codemirror_mode": { 160 | "name": "ipython", 161 | "version": 3 162 | }, 163 | "file_extension": ".py", 164 | "mimetype": "text/x-python", 165 | "name": "python", 166 | "nbconvert_exporter": "python", 167 | "pygments_lexer": "ipython3", 168 | "version": "3.7.6" 169 | } 170 | }, 171 | "nbformat": 4, 172 | "nbformat_minor": 4 173 | } 174 | -------------------------------------------------------------------------------- /Kafka Commands.txt: -------------------------------------------------------------------------------- 1 | 2 | ./kafka-topics.sh --list --bootstrap-server localhost:9092 3 | 4 | ## Create Topic 5 | ./kafka-topics.sh --create --topic ingestion-topic --bootstrap-server localhost:9092 6 | ./kafka-topics.sh --create --topic spark-output --bootstrap-server localhost:9092 7 | 8 | 9 | # Local consumer 10 | ./kafka-console-consumer.sh --topic ingestion-topic --bootstrap-server localhost:9092 11 | ./kafka-console-consumer.sh --topic spark-output --bootstrap-server localhost:9092 12 | 13 | 14 | # Local producer 15 | ./kafka-console-producer.sh --topic ingestion-topic --bootstrap-server localhost:9092 16 | 17 | # To test if your Kafka is running correctly: 18 | 1. Connect to the container cli and go to the Kafka directory 19 | 2. Start a local consumer 20 | 3. Connect with a second cli to the container 21 | 4. Start in the second cli a local producer 22 | 5. Type in to the producer cli a message and hit enter 23 | 6. 
Check if you can see the message in the consumer cli -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # document-streaming 2 | Repository for the Document streaming capstone projects 3 | 4 | Outlook: 5 | 6 | - Deploy the streamlit app as docker (build and add in dockerfile) 7 | - Deploy the whole platform with all containers on a cloud platform of your choice 8 | - Add an API between Streamlit and MongoDB so that Streamlit doesnt have to be directly connected with MongoDB (User& Password) 9 | 10 | # Links to deploy the streamlit app as docker container 11 | #https://maelfabien.github.io/project/Streamlit/#the-application 12 | #https://towardsdatascience.com/how-to-deploy-a-semantic-search-engine-with-streamlit-and-docker-on-aws-elastic-beanstalk-42ddce0422f3 -------------------------------------------------------------------------------- /Streamlit/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.pythonPath": "/bin/python3" 3 | } -------------------------------------------------------------------------------- /Streamlit/streamlitapp.py: -------------------------------------------------------------------------------- 1 | from numpy import double 2 | import streamlit as st 3 | from pandas import DataFrame 4 | 5 | import numpy as np 6 | 7 | import pymongo 8 | 9 | 10 | #data = pd.read_csv("data.csv") 11 | myclient = pymongo.MongoClient("mongodb://localhost:27017/",username='root',password='example') 12 | mydb = myclient["docstreaming"] 13 | mycol = mydb["invoices"] 14 | 15 | 16 | # Below the fist chart add a input field for the invoice number 17 | cust_id = st.sidebar.text_input("CustomerID:") 18 | #st.text(inv_no) # Use this to print out the content of the input field 19 | 20 | # if enter has been used on the input field 21 | if cust_id: 22 | 23 | myquery = {"CustomerID": cust_id} 24 | # only includes or excludes 25 | mydoc = mycol.find( myquery , { "_id": 0, "StockCode": 0, "Description": 0, "Quantity": 0, "Country": 0, "UnitPrice": 0}) 26 | 27 | # create dataframe from resulting documents to use drop_duplicates 28 | df = DataFrame(mydoc) 29 | 30 | # drop duplicates, but keep the first one 31 | df.drop_duplicates(subset ="InvoiceNo", keep = 'first', inplace = True) 32 | 33 | # Add the table with a headline 34 | st.header("Output Customer Invoices") 35 | table2 = st.dataframe(data=df) 36 | 37 | 38 | # Below the fist chart add a input field for the invoice number 39 | inv_no = st.sidebar.text_input("InvoiceNo:") 40 | #st.text(inv_no) # Use this to print out the content of the input field 41 | 42 | # if enter has been used on the input field 43 | if inv_no: 44 | 45 | myquery = {"InvoiceNo": inv_no} 46 | mydoc = mycol.find( myquery, { "_id": 0, "InvoiceDate": 0, "Country": 0, "CustomerID": 0 }) 47 | 48 | # create the dataframe 49 | df = DataFrame(mydoc) 50 | 51 | # reindex it so that the columns are order lexicographically 52 | reindexed = df.reindex(sorted(df.columns), axis=1) 53 | 54 | # Add the table with a headline 55 | st.header("Output by Invoice ID") 56 | table2 = st.dataframe(data=reindexed) 57 | 58 | 59 | -------------------------------------------------------------------------------- /archive/docker-compose-kafka-spark-confluent.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | 4 | zookeeper: 5 | image: 
confluentinc/cp-zookeeper:6.2.0 6 | hostname: zookeeper 7 | container_name: zookeeper 8 | ports: 9 | - "2181:2181" 10 | environment: 11 | ZOOKEEPER_CLIENT_PORT: 2181 12 | ZOOKEEPER_TICK_TIME: 2000 13 | networks: 14 | - document-streaming 15 | 16 | broker: 17 | image: confluentinc/cp-server:6.2.0 18 | hostname: broker 19 | container_name: broker 20 | depends_on: 21 | - zookeeper 22 | ports: 23 | - "9092:9092" 24 | - "9101:9101" 25 | environment: 26 | KAFKA_BROKER_ID: 1 27 | KAFKA_ZOOKEEPER_CONNECT: 'zookeeper:2181' 28 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT 29 | KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://broker:29092,PLAINTEXT_HOST://localhost:9092 30 | KAFKA_METRIC_REPORTERS: io.confluent.metrics.reporter.ConfluentMetricsReporter 31 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 32 | KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS: 0 33 | KAFKA_CONFLUENT_LICENSE_TOPIC_REPLICATION_FACTOR: 1 34 | KAFKA_CONFLUENT_BALANCER_TOPIC_REPLICATION_FACTOR: 1 35 | KAFKA_TRANSACTION_STATE_LOG_MIN_ISR: 1 36 | KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 1 37 | KAFKA_JMX_PORT: 9101 38 | KAFKA_JMX_HOSTNAME: localhost 39 | KAFKA_CONFLUENT_SCHEMA_REGISTRY_URL: http://schema-registry:8081 40 | CONFLUENT_METRICS_REPORTER_BOOTSTRAP_SERVERS: broker:29092 41 | CONFLUENT_METRICS_REPORTER_TOPIC_REPLICAS: 1 42 | CONFLUENT_METRICS_ENABLE: 'true' 43 | CONFLUENT_SUPPORT_CUSTOMER_ID: 'anonymous' 44 | networks: 45 | - document-streaming 46 | 47 | spark: 48 | image: 'jupyter/pyspark-notebook:spark-2' 49 | ports: 50 | - '8888:8888' 51 | - "4040-4080:4040-4080" 52 | volumes: 53 | - ./ApacheSpark/:/home/jovyan/work 54 | networks: 55 | - document-streaming 56 | 57 | api-ingest: 58 | image: 'api-ingest' 59 | ports: 60 | - '80:80' 61 | networks: 62 | - document-streaming 63 | 64 | networks: 65 | document-streaming: 66 | driver: bridge 67 | -------------------------------------------------------------------------------- /archive/docker-compose-kafka_old.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | zookeeper: 4 | image: 'bitnami/zookeeper:latest' 5 | ports: 6 | - '2181:2181' 7 | environment: 8 | - ALLOW_ANONYMOUS_LOGIN=yes 9 | kafka: 10 | image: 'bitnami/kafka:latest' 11 | ports: 12 | - '9092:9092' 13 | environment: 14 | - KAFKA_BROKER_ID=1 15 | - KAFKA_CFG_LISTENERS=PLAINTEXT://:9092 16 | - KAFKA_CFG_ADVERTISED_LISTENERS=PLAINTEXT://127.0.0.1:9092 17 | - KAFKA_CFG_ZOOKEEPER_CONNECT=zookeeper:2181 18 | - ALLOW_PLAINTEXT_LISTENER=yes 19 | depends_on: 20 | - zookeeper -------------------------------------------------------------------------------- /client/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.pythonPath": "/bin/python3" 3 | } -------------------------------------------------------------------------------- /client/api-client.py: -------------------------------------------------------------------------------- 1 | import linecache 2 | import json 3 | 4 | # Make sure that requests is installed in your WSL 5 | import requests 6 | 7 | # We could just read the entire file, but if it's really big you could go line by line 8 | # If you want make this an excercise and replace the process below by reading the whole file at once and going line by line 9 | 10 | #set starting id and ending id 11 | start = 1 12 | end = 50 13 | 14 | # Loop over the JSON file 15 | i=start 16 | 17 | while i <= end: 18 | 19 | # read a specific line 20 | line = 
linecache.getline('./output.txt', i) 21 | #print(line) 22 | # write the line to the API 23 | myjson = json.loads(line) 24 | 25 | print(myjson) 26 | 27 | response = requests.post('http://localhost:80/invoiceitem', json=myjson) 28 | 29 | # Use this for dedbugging 30 | #print("Status code: ", response.status_code) 31 | #print("Printing Entire Post Request") 32 | print(response.json()) 33 | 34 | # increase i 35 | i+=1 36 | -------------------------------------------------------------------------------- /client/transformer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy import add 3 | import pandas as pd 4 | 5 | 6 | df = pd.read_csv ('data.csv') 7 | #print(df) 8 | 9 | # add a json column to the dataframe 10 | # splitlines will split the json into multiple rows not a single one 11 | df['json'] = df.to_json(orient='records', lines=True).splitlines() 12 | #print(df) 13 | 14 | # just take the json column of the dataframe 15 | dfjson = df['json'] 16 | print(dfjson) 17 | 18 | # print out the dataframe to a file 19 | # Note that the timestamp forward slash will be escaped to stay true to JSON schema 20 | np.savetxt(r'./output.txt', dfjson.values, fmt='%s') 21 | -------------------------------------------------------------------------------- /docker helpful commands.txt: -------------------------------------------------------------------------------- 1 | docker inspect document-streaming_spark_1 2 | 3 | 4 | links: 5 | - "kafka:kafka-server" #allows API to discover kafka service by name "kafka-server" -------------------------------------------------------------------------------- /docker-compose-kafka-spark-mongodb.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | 4 | zookeeper: 5 | image: 'bitnami/zookeeper:3.7.0-debian-10-r70' 6 | ports: 7 | - '2181:2181' 8 | environment: 9 | - ALLOW_ANONYMOUS_LOGIN=yes 10 | networks: 11 | - document-streaming 12 | 13 | kafka: 14 | image: 'bitnami/kafka:2.8.0-debian-10-r42' 15 | ports: 16 | - '9093:9093' #change to 9093 to access external from your windows host 17 | environment: 18 | - KAFKA_BROKER_ID=1 19 | - KAFKA_CFG_ZOOKEEPER_CONNECT=zookeeper:2181 20 | - ALLOW_PLAINTEXT_LISTENER=yes 21 | - KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=CLIENT:PLAINTEXT,EXTERNAL:PLAINTEXT #add aditional listener for external 22 | - KAFKA_CFG_LISTENERS=CLIENT://:9092,EXTERNAL://:9093 #9092 will be for other containers, 9093 for your windows client 23 | - KAFKA_CFG_ADVERTISED_LISTENERS=CLIENT://kafka:9092,EXTERNAL://localhost:9093 #9092 will be for other containers, 9093 for your windows client 24 | - KAFKA_INTER_BROKER_LISTENER_NAME=CLIENT 25 | depends_on: 26 | - zookeeper 27 | networks: 28 | - document-streaming 29 | 30 | spark: 31 | image: 'jupyter/pyspark-notebook:spark-2' 32 | ports: 33 | - '8888:8888' 34 | - "4040-4080:4040-4080" 35 | volumes: 36 | - ./ApacheSpark/:/home/jovyan/work 37 | networks: 38 | - document-streaming 39 | 40 | api-ingest: 41 | image: 'api-ingest' 42 | ports: 43 | - '80:80' 44 | networks: 45 | - document-streaming 46 | 47 | mongo: 48 | container_name: mongo-dev 49 | image: mongo 50 | volumes: 51 | - ~/dockerdata/mongodb:/data/db 52 | restart: on-failure 53 | ports: 54 | - "27017:27017" 55 | environment: 56 | MONGO_INITDB_ROOT_USERNAME: root 57 | MONGO_INITDB_ROOT_PASSWORD: example 58 | MONGO_INITDB_DATABASE: auth 59 | networks: 60 | - document-streaming 61 | 62 | mongo-express: 63 | image: mongo-express 64 | restart: 
on-failure 65 | ports: 66 | - "8081:8081" 67 | environment: 68 | ME_CONFIG_MONGODB_SERVER: mongo-dev 69 | ME_CONFIG_MONGODB_ADMINUSERNAME: root 70 | ME_CONFIG_MONGODB_ADMINPASSWORD: example 71 | ME_CONFIG_BASICAUTH_USERNAME: admin 72 | ME_CONFIG_BASICAUTH_PASSWORD: tribes 73 | networks: 74 | - document-streaming 75 | depends_on: 76 | - mongo 77 | 78 | 79 | 80 | 81 | networks: 82 | document-streaming: 83 | driver: bridge 84 | -------------------------------------------------------------------------------- /docker-compose-kafka-spark.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | 4 | zookeeper: 5 | image: 'bitnami/zookeeper:3.7.0-debian-10-r70' 6 | ports: 7 | - '2181:2181' 8 | environment: 9 | - ALLOW_ANONYMOUS_LOGIN=yes 10 | networks: 11 | - document-streaming 12 | 13 | kafka: 14 | image: 'bitnami/kafka:2.8.0-debian-10-r42' 15 | ports: 16 | - '9093:9093' #change to 9093 to access external from your windows host 17 | environment: 18 | - KAFKA_BROKER_ID=1 19 | - KAFKA_CFG_ZOOKEEPER_CONNECT=zookeeper:2181 20 | - ALLOW_PLAINTEXT_LISTENER=yes 21 | - KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=CLIENT:PLAINTEXT,EXTERNAL:PLAINTEXT #add aditional listener for external 22 | - KAFKA_CFG_LISTENERS=CLIENT://:9092,EXTERNAL://:9093 #9092 will be for other containers, 9093 for your windows client 23 | - KAFKA_CFG_ADVERTISED_LISTENERS=CLIENT://kafka:9092,EXTERNAL://localhost:9093 #9092 will be for other containers, 9093 for your windows client 24 | - KAFKA_INTER_BROKER_LISTENER_NAME=CLIENT 25 | depends_on: 26 | - zookeeper 27 | networks: 28 | - document-streaming 29 | 30 | spark: 31 | image: 'jupyter/pyspark-notebook:spark-2' 32 | ports: 33 | - '8888:8888' 34 | - "4040-4080:4040-4080" 35 | volumes: 36 | - ./ApacheSpark/:/home/jovyan/work 37 | networks: 38 | - document-streaming 39 | 40 | api-ingest: 41 | image: 'api-ingest' 42 | ports: 43 | - '80:80' 44 | networks: 45 | - document-streaming 46 | 47 | networks: 48 | document-streaming: 49 | driver: bridge 50 | -------------------------------------------------------------------------------- /docker-compose-kafka.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | zookeeper: 4 | image: 'bitnami/zookeeper:3.7.0-debian-10-r70' 5 | ports: 6 | - '2181:2181' 7 | environment: 8 | - ALLOW_ANONYMOUS_LOGIN=yes 9 | kafka: 10 | image: 'bitnami/kafka:2.8.0-debian-10-r42' 11 | ports: 12 | - '9093:9093' #change to 9093 to access external from your windows host 13 | environment: 14 | - KAFKA_BROKER_ID=1 15 | - KAFKA_CFG_ZOOKEEPER_CONNECT=zookeeper:2181 16 | - ALLOW_PLAINTEXT_LISTENER=yes 17 | - KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=CLIENT:PLAINTEXT,EXTERNAL:PLAINTEXT #add aditional listener for external 18 | - KAFKA_CFG_LISTENERS=CLIENT://:9092,EXTERNAL://:9093 #9092 will be for other containers, 9093 for your windows client 19 | - KAFKA_CFG_ADVERTISED_LISTENERS=CLIENT://kafka:9092,EXTERNAL://localhost:9093 #9092 will be for other containers, 9093 for your windows client 20 | - KAFKA_INTER_BROKER_LISTENER_NAME=CLIENT 21 | depends_on: 22 | - zookeeper 23 | --------------------------------------------------------------------------------
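Note: once the docker-compose-kafka-spark-mongodb.yml stack is running, a quick end-to-end check is to query MongoDB from the host. The snippet below is an illustrative sketch, not part of the repository; it reuses the values defined in that compose file (port 27017, user root, password example) and the docstreaming database that the Spark notebook and Streamlit app use.

# Hypothetical MongoDB connectivity check for the compose stack (assumed values
# taken from docker-compose-kafka-spark-mongodb.yml and the Streamlit app).
import pymongo

client = pymongo.MongoClient("mongodb://localhost:27017/", username="root", password="example")

# "ping" only succeeds if the server is reachable and the credentials are accepted
print(client.admin.command("ping"))

# list what has been written so far (empty until the streaming pipeline has run)
db = client["docstreaming"]
print(db.list_collection_names())

# peek at a few invoice documents, if any exist yet
for doc in db["invoices"].find().limit(5):
    print(doc)

Mongo Express offers the same view in the browser at http://localhost:8081 (basic auth admin/tribes, as configured in the compose file).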