├── BDS-adding-intelligence-to-stream-processing.pdf ├── LICENSE ├── README.md ├── assets ├── notebook-architecture.svg └── services-architecture.svg ├── image-frontend ├── README.md ├── app.py └── requirements.txt ├── image-processor-base └── Dockerfile ├── image-processor ├── .s2i │ └── bin │ │ ├── assemble │ │ └── run ├── Dockerfile ├── app.py ├── darkflow-1.0.0-cp36-cp36m-linux_x86_64.whl ├── requirements.txt └── root │ └── darkflow-1.0.0-cp36-cp36m-linux_x86_64.whl ├── notebooks ├── Dockerfile ├── austen.txt ├── darkflow-1.0.0-cp36-cp36m-linux_x86_64.whl ├── generate.ipynb ├── images │ ├── 0001eeaf4aed83f9.jpg │ ├── 0004886b7d043cfd.jpg │ ├── 000595fe6fee6369.jpg │ ├── 00075905539074f2.jpg │ ├── 0007cebe1b2ba653.jpg │ ├── 0007d6cf88afaa4a.jpg │ ├── 0008e425fb49a2bf.jpg │ ├── 0009bad4d8539bb4.jpg │ ├── 000a045a0715d64d.jpg │ ├── 000a1249af2bc5f0.jpg │ ├── 000ada55d36b4bcb.jpg │ ├── 000c4d66ce89aa69.jpg │ ├── 111029deeea453f5.jpg │ ├── 111147418c6aca65.jpg │ ├── 11124dd4875ecf8d.jpg │ ├── 1114b632f4336821.jpg │ ├── 111b8f9f49f2c6d2.jpg │ ├── 111c6bd9ac7173b7.jpg │ ├── 2222e793ebf6cc0e.jpg │ ├── 2224582bc4a8ec9a.jpg │ ├── 22249f8dc1f94a8d.jpg │ ├── 2225364f47d276d1.jpg │ ├── 2226c86e9f50fd72.jpg │ ├── 2227f2fd5d49cb15.jpg │ ├── 222a266190060531.jpg │ ├── 222a383653c65184.jpg │ ├── 222acfb0975199b1.jpg │ ├── 222c38d9991e97e2.jpg │ ├── 222ce6a4d930b047.jpg │ ├── 222e8ad382d2bb98.jpg │ ├── 222fd997433b59ce.jpg │ ├── 3332995b230f5680.jpg │ ├── 333310bdebc23b93.jpg │ ├── 3333210d97865136.jpg │ ├── 33334fc1df5b2536.jpg │ ├── 333452abf04d1b2b.jpg │ ├── 33369f252faf0b2d.jpg │ ├── 33377099dedaefa9.jpg │ ├── 33398fd76df994db.jpg │ ├── 3339a5c598981879.jpg │ ├── 3339cba64cb1927e.jpg │ ├── 3339ed0aad663343.jpg │ ├── 333ed5e609f26e46.jpg │ ├── 333fc945c316344f.jpg │ ├── aaa028a23c5052fa.jpg │ ├── aaa0b7ebb3e4affe.jpg │ ├── aaa12411c2d6378d.jpg │ ├── aaa1ace50eb16015.jpg │ ├── aaa2a076ae7feb33.jpg │ ├── aaa3d54ae3f0c1ae.jpg │ ├── aaa517fbf112c358.jpg │ ├── aaa55aa836a17d0c.jpg 
│ ├── aaa576abbfdd9a8e.jpg │ ├── aaa5a9c49681685a.jpg │ ├── aaa8bd02f557fc5c.jpg │ ├── aaaba578031e2017.jpg │ ├── aaaf093e2c2a45f1.jpg │ ├── bbb06ba600c9b472.jpg │ ├── bbb1eb4d12a66c6b.jpg │ ├── bbb1ff6486a289be.jpg │ ├── bbb4a398207d7237.jpg │ ├── bbb4a411c275f953.jpg │ ├── bbb4d54888e5fc73.jpg │ ├── bbb60c5612dec65e.jpg │ ├── bbb74721560555e3.jpg │ ├── bbb7a4d1abe795da.jpg │ ├── bbbb419a2c107d5a.jpg │ ├── bbbe9fd021044d50.jpg │ ├── bbbfe6b58a3ac009.jpg │ ├── ccc1057a1160aba1.jpg │ ├── ccc135df5430520b.jpg │ ├── ccc18570ba287be1.jpg │ ├── ccc1a2d44a290368.jpg │ ├── ccc1ffd2d3d5d2c9.jpg │ ├── ccc3f0bc6500081c.jpg │ ├── ccc47704bcce60ef.jpg │ ├── ccc6cc13dd83bfe7.jpg │ ├── ccca79f1e7646f88.jpg │ ├── cccd58f60624eaa2.jpg │ ├── ccce1c730c1696db.jpg │ ├── ccce275285f1d8c0.jpg │ ├── ddd38a507dec8dde.jpg │ ├── ddd3f96ff7f0bc78.jpg │ ├── ddd4549df01b95e0.jpg │ ├── ddd56ec6489b89e3.jpg │ ├── ddd78d7007d17a75.jpg │ ├── ddd8b588942ceded.jpg │ ├── dddd6279b633d7ab.jpg │ ├── eee10577be06c0bb.jpg │ ├── eee273732a3b2608.jpg │ ├── eee297394eb90e24.jpg │ ├── eee3d8fffe8a9cfd.jpg │ ├── eee47025e4848d0a.jpg │ ├── eee4de386df902e5.jpg │ ├── eee4fa0981f728c7.jpg │ ├── eee560c9e2411d91.jpg │ ├── eee6b7c34d84c9ca.jpg │ ├── eee707f7382991dc.jpg │ ├── eee762f22867b8d4.jpg │ ├── eee8820b315c4c3b.jpg │ ├── eeebfdcbce12a2d9.jpg │ ├── eeed1e944c331791.jpg │ ├── eeef28bbe1dd1d18.jpg │ ├── fff0debd2911bfbd.jpg │ ├── fff149d613bacda0.jpg │ ├── fff2268a1b921e8e.jpg │ ├── fff277539bd8a2be.jpg │ ├── fff3ce694bc02a09.jpg │ ├── fff50186c03c8474.jpg │ ├── fff5d10dd5ad119d.jpg │ ├── fff820866f567015.jpg │ ├── fffc2f36b181a4fb.jpg │ └── ffff21932da3ed01.jpg ├── opencv-basics.ipynb ├── otto.jpg ├── preprocess-reviews.py ├── reviews-1.txt.gz ├── reviews-5-100k.txt.gz ├── sentiment.ipynb ├── social-firehose.ipynb └── tensorflow-1.6.0-cp36-cp36m-linux_x86_64.whl ├── resources.yaml ├── update-generator ├── .s2i │ └── bin │ │ └── assemble ├── README.md ├── app.py ├── austen.txt ├── requirements.txt ├── 
reviews-1.txt.gz └── reviews-5-100k.txt.gz ├── update-transformer ├── README.md ├── app.py └── requirements.txt └── update-visualizer ├── README.md ├── app.py └── requirements.txt /BDS-adding-intelligence-to-stream-processing.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/BDS-adding-intelligence-to-stream-processing.pdf -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # radanalytics.io streaming and event processing lab 2 | 3 | This repository contains artifacts and resources to support the streaming and 4 | event processing labs for radanalytics.io. [Our slides from the workshop at Big Data Spain 2018 are here](./BDS-adding-intelligence-to-stream-processing.pdf). 5 | 6 | ## Description 7 | 8 | For many applications, it’s not enough to be able to process big data at 9 | rest—you also need to be able to process streams of data in motion. 10 | 11 | In this lab, you’ll learn how to use open source tools and frameworks from 12 | radanalytics.io to develop and deploy intelligent event-processing 13 | applications on Red Hat OpenShift. We’ll start by explaining some of the 14 | concepts behind stream processing. 
Next, we’ll show you how to develop a 15 | basic log-processing application and refine it by adding summarization, 16 | queries, and features that take advantage of artificial intelligence and 17 | machine learning. 18 | 19 | ## Prerequisites 20 | 21 | * An OpenShift cluster available. For instructions on installing OpenShift 22 | (including ad-hoc single node test clusters), please see the 23 | [OpenShift Getting Started](https://docs.openshift.org/latest/getting_started/administrators.html#getting-started-administrators) 24 | documentation. 25 | 26 | * An Apache Kafka broker available. For a basic Apache Kafka installation on 27 | OpenShift, we recommend these 28 | [instruction from Strimzi](http://strimzi.io/docs/0.1.0/#kafka-in-memory) 29 | as a starting point. Be sure to record the broker addresses for future use. 30 | 31 | * A terminal with the OpenShift client `oc` available with an active login 32 | session. 33 | 34 | * An OpenShift project with the 35 | [`resources.yaml`](https://raw.githubusercontent.com/radanalyticsio/streaming-lab/master/resources.yaml) 36 | manifest from this repository installed. To install this file, enter the 37 | following command, replacing the `` with your project: 38 | ``` 39 | oc create -n -f https://raw.githubusercontent.com/radanalyticsio/streaming-lab/master/resources.yaml 40 | ``` 41 | 42 | ### Synthetic social media update service 43 | 44 | As the core of this lab is about processing and analyzing social media 45 | updates, there is a service application that will produce these updates. The 46 | `update-generator` directory contains the source and related files for 47 | deploying this service. 48 | 49 | To deploy the generator run the following command using the `oc` command line 50 | tool. You must replace `` with the values you recorded 51 | earlier for the Kafka brokers. 
52 | 53 | ``` 54 | oc new-app centos/python-36-centos7~https://github.com/radanalyticsio/streaming-lab/ \ 55 | --context-dir=update-generator \ 56 | -e KAFKA_BROKERS= \ 57 | -e KAFKA_TOPIC=social-firehose \ 58 | --name=emitter 59 | ``` 60 | 61 | ## Jupyter 62 | 63 | [Jupyter](https://jupyter.org/) is an open source project born out of the 64 | [IPython Project](https://ipython.org/) which delivers an in-browser 65 | experience for interactive data science and scientific computing with support 66 | for several programming languages. In this lab we will utilize Python, Apache 67 | Spark, and a few natural language processing libraries. 68 | 69 | The first portion of this lab is conducted through the lessons available in 70 | the Jupyter notebooks contained in this repository. 71 | 72 | This diagram shows an overview of the architecture for this portion of the 73 | lab: 74 | 75 | ![notebook architecture](assets/notebook-architecture.svg) 76 | 77 | ### Launching a notebook 78 | 79 | WIP 80 | 81 | ## Analytics services on OpenShift 82 | 83 | The second portion of this lab focuses on building and deploying an analytics 84 | service based on the techniques learned in the notebooks. 85 | 86 | There are two services which will be deployed, the `update-transformer`, and 87 | the `update-visualizer`. The transformer will utilize Apache Spark to process 88 | the synthetic social media updates and apply sentiment scores to each update. 89 | The visualizer gives the user an interface to examine some of the work that 90 | is being done by the transformer, it does this by displaying updates along with 91 | the sentiment scores they have received. 92 | 93 | This diagram shows an overview of the architecture for these services: 94 | 95 | ![services architecture](assets/services-architecture.svg) 96 | 97 | ### Procedure 98 | 99 | 1. Deploy the update-transformer application. You will need the Kafka broker 100 | information for this command. 
To build and deploy the transformer use the 101 | following command: 102 | ``` 103 | oc new-app --template=oshinko-python-spark-build-dc \ 104 | -p APPLICATION_NAME=transformer \ 105 | -p GIT_URI=https://github.com/radanalyticsio/streaming-lab \ 106 | -p CONTEXT_DIR=update-transformer \ 107 | -e KAFKA_BROKERS= \ 108 | -e KAFKA_IN_TOPIC=social-firehose \ 109 | -e KAFKA_OUT_TOPIC=sentiments \ 110 | ``` 111 | 1. Deploy the update-visualizer application. You will again need the Kafka 112 | broker information for this command. To build and deploy the visualizer 113 | use the following command: 114 | ``` 115 | oc new-app centos/python-36-centos7~https://github.com/radanalyticsio/streaming-lab \ 116 | --context-dir=update-visualizer \ 117 | -e KAFKA_BROKERS= \ 118 | -e KAFKA_TOPIC=sentiments \ 119 | --name=visualizer 120 | ``` 121 | 1. Expose a route to the visualizer. This command will expose an external URL 122 | to the visualizer which you will use to communicate with the application. 123 | ``` 124 | oc expose svc/visualizer 125 | ``` 126 | 1. Request the latest data from the visualizer. The `curl` utility provides a 127 | convenient method for accessing the current data in the visualizer. The 128 | following command will get that data: 129 | ``` 130 | curl http://`oc get routes/visualizer --template='{{.spec.host}}'` 131 | ``` 132 | 133 | ## Advanced details 134 | 135 | The following sections provide an in-depth look at individual components of 136 | this lab. They are here to help you build a deeper understanding of how the 137 | pieces of this lab fit together. 138 | 139 | ### Source data 140 | 141 | The source data for this lab is imagined as a series of synthetic social media 142 | updates. The text from these updates will be used in conjunction with sentiment 143 | analysis to help demonstrate the use of machine learning to investigate data. 
144 | The data used for this lab is randomly generated using 145 | [Markov chains](https://en.wikipedia.org/wiki/Markov_chain). None of this data 146 | is from live accounts and it contains no personally identifiable information. 147 | 148 | The format used for transmitting the update data on the wire is defined by 149 | this [JSON Schema](http://json-schema.org) notation: 150 | 151 | ``` 152 | { 153 | "title": "Social Media Update", 154 | "type": "object", 155 | "properties": { 156 | "user_id": { 157 | "type": "string" 158 | }, 159 | "update_id": { 160 | "type": "string" 161 | }, 162 | "text": { 163 | "type": "string" 164 | } 165 | }, 166 | "required": ["user_id", "update_id", "text"] 167 | } 168 | ``` 169 | 170 | -------------------------------------------------------------------------------- /assets/notebook-architecture.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 19 | 21 | 29 | 35 | 36 | 44 | 50 | 51 | 59 | 65 | 66 | 74 | 80 | 81 | 93 | 97 | 101 | 105 | 109 | 113 | 117 | 121 | 125 | 129 | 133 | 137 | 141 | 142 | 167 | 169 | 170 | 172 | image/svg+xml 173 | 175 | 176 | 177 | 178 | 179 | 184 | 192 | 200 | 205 | 210 | 215 | 220 | 225 | 230 | 235 | 240 | 245 | 250 | 255 | 260 | 265 | 270 | 275 | 280 | 285 | 286 | 291 | 296 | 300 | 304 | 308 | 312 | 316 | 320 | 321 | 324 | 341 | 349 | 350 | 353 | 370 | 378 | 379 | 382 | 387 | 392 | 397 | 402 | 407 | 412 | 413 | 418 | 422 | 426 | 430 | 434 | 438 | 442 | 446 | 450 | 454 | 458 | 462 | 466 | 470 | 474 | 478 | 479 | 480 | 481 | -------------------------------------------------------------------------------- /image-frontend/README.md: -------------------------------------------------------------------------------- 1 | # image-frontend 2 | 3 | This is a very basic Flask application that accepts image uploads and serializes the image data on to a Kafka topic. 
4 | 5 | ## usage 6 | 7 | ``` 8 | oc new-app centos/python-36-centos7~https://github.com/radanalyticsio/streaming-lab/ \ 9 | --context-dir=image-frontend \ 10 | -e KAFKA_BROKERS=my-kafka-host:9092 \ 11 | -e KAFKA_TOPIC=raw-images \ 12 | --name=image-frontend 13 | ``` -------------------------------------------------------------------------------- /image-frontend/app.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # some file uploading code adapted from Flask documentation 4 | # some kafka/flask integration code adapted from https://github.com/bones-brigade/flask-kafka-openshift-python-listener/ 5 | 6 | from flask import Flask, redirect, request, url_for 7 | import base64 8 | import argparse 9 | import os 10 | import json 11 | 12 | from kafka import KafkaProducer 13 | 14 | from werkzeug.utils import secure_filename 15 | 16 | ALLOWED_EXTENSIONS = set(['png', 'jpg', 'jpeg', 'gif']) 17 | 18 | app = Flask(__name__) 19 | topic = None 20 | producer = None 21 | 22 | def allowed_file(filename): 23 | return '.' in filename and \ 24 | filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS 25 | 26 | @app.route('/', methods=['GET', 'POST']) 27 | def upload_file(): 28 | if request.method == 'POST': 29 | # check if the post request has the file part 30 | if 'file' not in request.files: 31 | flash('No file part') 32 | return redirect(request.url) 33 | f = request.files['file'] 34 | # if user does not select file, browser also 35 | # submit a empty part without filename 36 | if f.filename == '': 37 | flash('No selected file') 38 | return redirect(request.url) 39 | if f and allowed_file(f.filename): 40 | filename = secure_filename(f.filename) 41 | if producer is not None: 42 | producer.send(topic, bytes(json.dumps({"filename" : filename, "contents" : base64.b64encode(f.stream.read()).decode('ascii')}), "ascii")) 43 | f.close() 44 | return "got your file

received file %s

" % filename 45 | return ''' 46 | 47 | Upload an image 48 |

Upload an image

49 |
50 |

51 | 52 |

53 | ''' 54 | 55 | def get_arg(env, default): 56 | return os.getenv(env) if os.getenv(env, '') is not '' else default 57 | 58 | def parse_args(parser): 59 | args = parser.parse_args() 60 | args.brokers = get_arg('KAFKA_BROKERS', args.brokers) 61 | args.topic = get_arg('KAFKA_TOPIC', args.topic) 62 | return args 63 | 64 | if __name__ == '__main__': 65 | app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024 66 | app.logger.setLevel(0) 67 | parser = argparse.ArgumentParser( 68 | description='listen for some stuff on kafka') 69 | parser.add_argument( 70 | '--brokers', 71 | help='The bootstrap servers, env variable KAFKA_BROKERS', 72 | default='localhost:9092') 73 | parser.add_argument( 74 | '--topic', 75 | help='Topic to publish to, env variable KAFKA_TOPIC', 76 | default='raw-images') 77 | 78 | args = parse_args(parser) 79 | 80 | topic = args.topic 81 | producer = KafkaProducer(bootstrap_servers=args.brokers) 82 | 83 | app.run(host='0.0.0.0', port=8080) -------------------------------------------------------------------------------- /image-frontend/requirements.txt: -------------------------------------------------------------------------------- 1 | flask 2 | kafka 3 | -------------------------------------------------------------------------------- /image-processor-base/Dockerfile: -------------------------------------------------------------------------------- 1 | # image-processor-base 2 | FROM centos/python-36-centos7 3 | 4 | # TODO: Put the maintainer name in the image metadata 5 | # LABEL maintainer="Your Name " 6 | 7 | # TODO: Rename the builder environment variable to inform users about application you provide them 8 | # ENV BUILDER_VERSION 1.0 9 | 10 | # TODO: Set labels used in OpenShift to describe the builder image 11 | #LABEL io.k8s.description="Platform for building xyz" \ 12 | # io.k8s.display-name="builder x.y.z" \ 13 | # io.openshift.expose-services="8080:http" \ 14 | # io.openshift.tags="builder,x.y.z,etc." 
15 | 16 | # TODO: Install required packages here: 17 | # RUN yum install -y ... && yum clean all -y 18 | RUN yum install -y libstdc++ libSM libXrender libXext && yum clean all -y 19 | 20 | # TODO (optional): Copy the builder files into /opt/app-root 21 | # COPY .// /opt/app-root/ 22 | 23 | # TODO: Copy the S2I scripts to /usr/libexec/s2i, since openshift/base-centos7 image 24 | # sets io.openshift.s2i.scripts-url label that way, or update that label 25 | COPY ./s2i/bin/ /usr/libexec/s2i 26 | 27 | # TODO: Drop the root user and make the content of /opt/app-root owned by user 1001 28 | # RUN chown -R 1001:1001 /opt/app-root 29 | 30 | # This default user is created in the openshift/base-centos7 image 31 | USER 1001 32 | 33 | # TODO: Set the default port for applications built using this image 34 | # EXPOSE 8080 35 | 36 | # TODO: Set the default CMD for the image 37 | # CMD ["/usr/libexec/s2i/usage"] 38 | -------------------------------------------------------------------------------- /image-processor/.s2i/bin/assemble: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | # Execute the default S2I script 4 | source scl_source enable rh-python36 5 | source ${STI_SCRIPTS_PATH}/assemble 6 | 7 | pip install $(find / -name darkflow-1.0.0-cp36-cp36m-linux_x86_64.whl) 8 | curl https://pjreddie.com/media/files/yolov2.weights -o /opt/app-root/yolo.weights 9 | curl https://raw.githubusercontent.com/pjreddie/darknet/master/cfg/yolov2.cfg -o /opt/app-root/yolo.cfg -------------------------------------------------------------------------------- /image-processor/.s2i/bin/run: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | # Execute the default S2I script 4 | source scl_source enable rh-python36 5 | exec ${STI_SCRIPTS_PATH}/run -------------------------------------------------------------------------------- /image-processor/Dockerfile: 
"""Kafka image-processor service.

Consumes base64-encoded images from one Kafka topic, runs YOLO (darkflow)
object detection on each, draws bounding boxes, and publishes the annotated,
resized image plus the raw predictions to another topic as JSON.
"""

import argparse
import base64
import json
import logging
import os

import numpy as np


def get_arg(env, default):
    """Return the value of environment variable *env*, falling back to
    *default* when the variable is unset or empty.

    BUG FIX: the original used ``os.getenv(env, '') is not ''`` — an identity
    comparison against a string literal, which is a SyntaxWarning in
    Python >= 3.8 and only works by accident of interning.
    """
    value = os.getenv(env, '')
    return value if value != '' else default


def parse_args(parser):
    """Parse CLI arguments, letting environment variables override defaults.

    :param parser: an ``argparse.ArgumentParser`` with ``--brokers``,
        ``--topic-in`` and ``--topic-out`` options defined.
    :returns: the populated namespace.
    """
    args = parser.parse_args()
    args.brokers = get_arg('KAFKA_BROKERS', args.brokers)
    args.topic_in = get_arg('KAFKA_TOPIC_IN', args.topic_in)
    args.topic_out = get_arg('KAFKA_TOPIC_OUT', args.topic_out)
    return args


def main(args):
    """Run the consume -> detect -> annotate -> publish loop forever.

    :param args: namespace with ``brokers``, ``topic_in`` and ``topic_out``.
    """
    # Heavy third-party imports are deferred (like darkflow already was in the
    # original) so argument parsing and unit tests work without cv2/kafka/TF.
    import cv2
    from darkflow.net.build import TFNet
    from kafka import KafkaConsumer, KafkaProducer

    consumer = KafkaConsumer(args.topic_in, bootstrap_servers=args.brokers)
    producer = KafkaProducer(bootstrap_servers=args.brokers)
    # model/weights paths are relative to the container WORKDIR /opt/app-root
    options = {"model": "yolo.cfg", "load": "yolo.weights", "threshold": 0.1}
    yolo = TFNet(options)

    for msg in consumer:
        try:
            # BUG FIX: decoding moved inside the try so a single malformed
            # message no longer kills the consumer loop.
            value = json.loads(str(msg.value, "utf-8"))
            image = base64.b64decode(value["contents"])
            imgcv = cv2.imdecode(
                np.asarray(bytearray(image), dtype=np.uint8), cv2.IMREAD_COLOR)
            predictions = yolo.return_predict(imgcv)

            # annotate image with bounding boxes
            rows, cols, _ = imgcv.shape
            thickness = int(max(rows, cols) / 100)
            white = (255, 255, 255)
            for prediction in predictions:
                tl = prediction["topleft"]
                br = prediction["bottomright"]
                # draw a white rectangle around the identified object
                cv2.rectangle(imgcv, (tl["x"], tl["y"]), (br["x"], br["y"]),
                              color=white, thickness=thickness)

            # resize long edge to 256 pixels.
            # BUG FIX: cv2.resize requires the dsize positional argument;
            # pass None so the fx/fy scale factors are used.
            factor = 256.0 / max(rows, cols)
            _, outimg = cv2.imencode(
                ".jpg", cv2.resize(imgcv, None, fx=factor, fy=factor))
            outimg_enc = base64.b64encode(outimg.tobytes()).decode("ascii")

            payload = json.dumps({"predictions": predictions,
                                  "image": outimg_enc})
            # BUG FIX: bytes(str) without an encoding raises TypeError on
            # Python 3; encode explicitly.
            producer.send(args.topic_out, payload.encode("utf-8"))
        except Exception:
            # BUG FIX: Exception has no .message attribute in Python 3, and
            # logging.warn is deprecated; logging.exception records the
            # message plus the full traceback.
            logging.exception('error processing image data')

    logging.info('exiting')


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    logging.info('starting kafka-openshift-python-listener')
    parser = argparse.ArgumentParser(
        description='listen for some stuff on kafka')
    parser.add_argument(
        '--brokers',
        help='The bootstrap servers, env variable KAFKA_BROKERS',
        default='localhost:9092')
    parser.add_argument(
        '--topic-in',
        help='Topic to listen to, env variable KAFKA_TOPIC_IN',
        default='raw-images')
    parser.add_argument(
        '--topic-out',
        help='Topic to publish to, env variable KAFKA_TOPIC_OUT',
        default='processed-images')
    args = parse_args(parser)
    main(args)
https://raw.githubusercontent.com/pjreddie/darknet/master/cfg/yolov2.cfg /notebooks/cfg/yolo.cfg 11 | 12 | ADD *.whl / 13 | 14 | ENV NB_USER=nbuser 15 | ENV NB_UID=1011 16 | 17 | EXPOSE 8888 18 | 19 | USER $NB_UID 20 | 21 | USER root 22 | 23 | RUN chown -R $NB_USER:root /home/$NB_USER /data \ 24 | && find /home/$NB_USER -type d -exec chmod g+rwx,o+rx {} \; \ 25 | && find /home/$NB_USER -type f -exec chmod g+rw {} \; \ 26 | && find /data -type d -exec chmod g+rwx,o+rx {} \; \ 27 | && find /data -type f -exec chmod g+rw {} \; \ 28 | && /opt/conda/bin/conda install --quiet --yes -c conda-forge spacy \ 29 | && /opt/conda/bin/conda install --quiet --yes terminado \ 30 | && /opt/conda/bin/conda install --quiet --yes opencv \ 31 | && /opt/conda/bin/pip install vaderSentiment markovify fileupload \ 32 | && /opt/conda/bin/python -m spacy download en \ 33 | && ( /opt/conda/bin/conda clean -qtipsy || echo "conda clean FAILED" ) \ 34 | && /opt/conda/bin/pip install /darkflow*.whl tensorflow \ 35 | && /opt/conda/bin/jupyter nbextension install --py fileupload \ 36 | && /opt/conda/bin/jupyter nbextension enable --py fileupload \ 37 | && chmod -f g+rw /notebooks $(find /notebooks) 38 | 39 | ADD *.txt *.txt.gz /notebooks/ 40 | ADD *.ipynb /notebooks/ 41 | ADD otto.jpg /notebooks 42 | 43 | RUN chmod -f g+rw /notebooks $(find /notebooks) 44 | 45 | USER $NB_UID 46 | ENV HOME /home/$NB_USER 47 | 48 | LABEL io.k8s.description="PySpark Jupyter Notebook." \ 49 | io.k8s.display-name="PySpark Jupyter Notebook." 
\ 50 | io.openshift.expose-services="8888:http" 51 | 52 | CMD ["/entrypoint", "/usr/local/bin/start.sh"] -------------------------------------------------------------------------------- /notebooks/austen.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/austen.txt -------------------------------------------------------------------------------- /notebooks/darkflow-1.0.0-cp36-cp36m-linux_x86_64.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/darkflow-1.0.0-cp36-cp36m-linux_x86_64.whl -------------------------------------------------------------------------------- /notebooks/generate.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "We'll start by using the [markovify](https://github.com/jsvine/markovify/) library to make some social-media-sized utterances in the style of Jane Austen. This will be the basis for generating a synthetic social media stream." 
8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "name": "stdout", 17 | "output_type": "stream", 18 | "text": [ 19 | "He began by speaking of her with so much dignified impertinence.\n", 20 | "You should not have been.\n", 21 | "Emma was sadly fearful that this second disappointment would be more becoming in her not to neglect it on any account.\n", 22 | "Although I had always despised her from the evil of a restless, officious companion, too apt to look.\n", 23 | "He has a fine dignified manner, which suits the head of the sinner; for when poor Lady Elliot died herself, no letter of condolence had been sent to Ireland.\n", 24 | "On the morrow the Crawfords were engaged to me ever since my marriage, I had seen Marianne's sweet face as white as my gown.\n", 25 | "Ought he not to have attempted more.\n", 26 | "His aunt worried him by her manner this morning, and cannot get the better of those false ideas of the necessity of reading aloud, which had fallen within his observation, that he had _cause_ to sigh.\n", 27 | "It was, indeed, a highly prized letter.\n", 28 | "For you alone, I think and plan.\n" 29 | ] 30 | } 31 | ], 32 | "source": [ 33 | "import markovify\n", 34 | "import codecs\n", 35 | "\n", 36 | "with codecs.open(\"austen.txt\", \"r\", \"cp1252\") as f:\n", 37 | " text = f.read()\n", 38 | "\n", 39 | "austen_model = markovify.Text(text, retain_original=False, state_size=3)\n", 40 | "\n", 41 | "for i in range(10):\n", 42 | " print(austen_model.make_short_sentence(200))" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "We'll use spaCy to identify entities (mostly proper nouns and noun phrases) in these synthetic status updates and turn them into hashtags:" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 2, 55 | "metadata": {}, 56 | "outputs": [ 57 | { 58 | "name": "stdout", 59 | "output_type": "stream", 60 | "text": 
[ 61 | "I hope you will do him such ample justice, that I am an advocate for marriage without love?\n", 62 | "I hope you will do him such ample justice, that I am an advocate for marriage without love?\n", 63 | "Chapter 14 Though Charles and Mary still talked on in the same place, was bad for each, for all three.\n", 64 | "Chapter 14 Though #Charles and #Mary still talked on in the same place, was bad for each, for all #three.\n", 65 | "Fanny Price was at this time of year.\n", 66 | "#FannyPrice was at #thistimeofyear.\n", 67 | "It was a happy woman, and a very few weeks would be sufficient for such arrangements as must precede the wedding.\n", 68 | "It was a happy woman, and #averyfewweeks would be sufficient for such arrangements as must precede the wedding.\n", 69 | "Mary had no feelings to make her resolve on remaining at Norland no longer than was unavoidable, it had not afforded.\n", 70 | "Mary had no feelings to make her resolve on remaining at #Norland no longer than was unavoidable, it had not afforded.\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "import spacy\n", 76 | "nlp = spacy.load('en')\n", 77 | "\n", 78 | "def make_sentence(model, length=200):\n", 79 | " return model.make_short_sentence(length)\n", 80 | " \n", 81 | "def hashtagify_full(sentence):\n", 82 | " doc = nlp(sentence)\n", 83 | " for ent in doc.ents:\n", 84 | " sentence = sentence.replace(str(ent), \"#%s\" % str(ent).replace(\" \", \"\"))\n", 85 | " return (sentence, [\"#%s\" % str(ent).replace(\" \", \"\") for ent in doc.ents])\n", 86 | "\n", 87 | "def hashtagify(sentence):\n", 88 | " result,_ = hashtagify_full(sentence)\n", 89 | " return result\n", 90 | "\n", 91 | "for i in range(5):\n", 92 | " sentence = make_sentence(austen_model)\n", 93 | " print(sentence)\n", 94 | " print(hashtagify(sentence))" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "We'll now train two Markov models on positive and negative product reviews (taken from the 
[public-domain Amazon fine foods reviews dataset on Kaggle](https://www.kaggle.com/snap/amazon-fine-food-reviews/)). We'll incorporate the results of these models into our synthetic social media stream." 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 3, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "import gzip\n", 111 | "\n", 112 | "def train_markov_gz(fn):\n", 113 | " \"\"\" trains a Markov model on gzipped text data \"\"\"\n", 114 | " with gzip.open(fn, \"rt\", encoding=\"utf-8\") as f:\n", 115 | " text = f.read()\n", 116 | " return markovify.Text(text, retain_original=False, state_size=3)\n", 117 | "\n", 118 | "negative_model = train_markov_gz(\"reviews-1.txt.gz\")\n", 119 | "positive_model = train_markov_gz(\"reviews-5-100k.txt.gz\")" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 4, 125 | "metadata": {}, 126 | "outputs": [ 127 | { 128 | "data": { 129 | "text/plain": [ 130 | "'Ick.'" 131 | ] 132 | }, 133 | "execution_count": 4, 134 | "metadata": {}, 135 | "output_type": "execute_result" 136 | } 137 | ], 138 | "source": [ 139 | "make_sentence(negative_model)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 5, 145 | "metadata": {}, 146 | "outputs": [ 147 | { 148 | "data": { 149 | "text/plain": [ 150 | "'Great stuff.'" 151 | ] 152 | }, 153 | "execution_count": 5, 154 | "metadata": {}, 155 | "output_type": "execute_result" 156 | } 157 | ], 158 | "source": [ 159 | "make_sentence(positive_model)" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "We can combine these models with relative weights, but this yields somewhat unusual results:" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 6, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "compound_model = markovify.combine([austen_model, negative_model, positive_model], [14, 3, 3])" 176 | ] 177 | }, 178 | { 179 
| "cell_type": "code", 180 | "execution_count": 7, 181 | "metadata": {}, 182 | "outputs": [ 183 | { 184 | "name": "stdout", 185 | "output_type": "stream", 186 | "text": [ 187 | "Anyway, the whole family and they all came in sugar free!!!But they are wonderful you will not go.\n", 188 | "There was also a good idea to use very much to have enjoyed their unique taste.\n", 189 | "This is one I fix when I don't have any scent that other ingredients might individually offer.\n", 190 | "Not astonishingly strong, but pretty rich and balanced.\n", 191 | "Several of them broke open in the pot and could not touch it; and #Emma, never loth to be #first, was obliged to repeat again and again, as if nothing were a security for matrimonial comfort.\n", 192 | "#MapleGrove will probably be ordering more.\n", 193 | "I bought these as a child . If you like soft licorice - then don't buy this.\n", 194 | "This tea is so horrible. i cannot believe all the hype on this product and the best of both worlds: the vitamins/minerals of a fortified cereal without all of the grocery stores #today.\n", 195 | "However what I received #today...\n", 196 | "She had been bounced around from house to house and eating whatever was cheap.\n", 197 | "I love to make cake pops.\n", 198 | "It's really too bad because its too expensive.\n", 199 | "I am grown neither humble nor penitent by what has passed.\n", 200 | "But whether I should continue to take it away for fear that somebody it just getting rid of the other house, and on the present occasion, as far at least as recently as #theearly70s.\n", 201 | "$#27.99 for #sixounces?\n", 202 | "It's like I remember it brings back wonderful memories.\n", 203 | "Sort of a smooth, silky feel in the mouth, and seemed to do fine but then began getting very loose stools.\n", 204 | "To expose a friend, such a friend as #Isabella had been to hear papa invite him to say what gave no one any pain but herself.\n", 205 | "My whole family enjoys them.\n", 206 | "I hav not 
tried #Truvia but probably won't buy anymore nuts from #Amazon.\n" 207 | ] 208 | } 209 | ], 210 | "source": [ 211 | "for i in range(20):\n", 212 | " print(hashtagify(make_sentence(compound_model)))" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "As is more or less the case in the real world, we'll assume that a small percentage of users are responsible for the bulk of social media activity, and that the bulk of users are responsible for relatively few posts. We'll model this with a table of random user IDs that has a collection of relatively few talkative users and relatively many moderate users; the proportion of utterances from talkative users to utterances from moderate users is the inverse of the proportion of talkative users to moderate users." 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 8, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "import numpy\n", 229 | "import math\n", 230 | "import collections\n", 231 | "\n", 232 | "class UserTable(object):\n", 233 | " def __init__(self, size, weights=[8, 2]):\n", 234 | " self._talkative = collections.deque()\n", 235 | " self._moderate = collections.deque()\n", 236 | " self._size = size\n", 237 | " self._cutoff = float(weights[0]) / sum(weights)\n", 238 | " \n", 239 | " for i in range(size):\n", 240 | " new_uid = math.floor(numpy.random.uniform(10 ** 10))\n", 241 | " if numpy.random.uniform() >= self._cutoff:\n", 242 | " self._moderate.append(new_uid)\n", 243 | " else:\n", 244 | " self._talkative.append(new_uid)\n", 245 | " \n", 246 | " def random_uid(self):\n", 247 | " def choose_from(c):\n", 248 | " return c[math.floor(numpy.random.uniform() * len(c))]\n", 249 | " \n", 250 | " if numpy.random.uniform() >= self._cutoff:\n", 251 | " return choose_from(self._talkative)\n", 252 | " else:\n", 253 | " return choose_from(self._moderate)\n", 254 | " " 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | 
"execution_count": 9, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "import seaborn" 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "metadata": {}, 269 | "source": [ 270 | "We can see the number of times each user ID appears if we ask the `UserTable` for 1000 random user IDs:" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 10, 276 | "metadata": {}, 277 | "outputs": [ 278 | { 279 | "data": { 280 | "text/plain": [ 281 | "" 282 | ] 283 | }, 284 | "execution_count": 10, 285 | "metadata": {}, 286 | "output_type": "execute_result" 287 | }, 288 | { 289 | "data": { 290 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZwAAAD8CAYAAABDwhLXAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAF3tJREFUeJzt3XuUHGWZx/Hfk5sEAwcwkxCJGFZzFPbsgmcj62U9uwdXCUlIQhKiuLgR0Sg3RWFd9ewRcVfXCxcxJGi4RrnGmYQEEggIuLgrK4abQNANoGAkySQESEJikpl59o+q6n6n0t3TmUy9PZfv55w+/VbVW1VPVVfXU29VdbW5uwAAKNqgRgcAABgYSDgAgChIOACAKEg4AIAoSDgAgChIOACAKEg4AIAoSDgAgChIOACAKIY0OoB6jBw50seNG9foMACgT3nkkUc2u3tTo+PI9ImEM27cOK1evbrRYQBAn2JmLzQ6hhCn1AAAUZBwAABRkHAAAFGQcAAAUZBwAABRkHAAAFGQcAAAUZBwAABRkHAAAFH0iScNoBgLbjyxVD779FUNjATAQEALBwAQBQkHABAFCQcAEAUJBwAQBQkHABAFCQcAEAUJBwAQBQkHABAFCQcAEAUJBwAQReEJx8wGm9ljZnZn2n2Umf3KzNaa2W1mNqzoGAAAjRejhfN5Sc8E3d+RdLm7j5f0iqQzI8QAAGiwQhOOmY2VNFnSNWm3STpBUnNaZZGk6UXGAADoHYpu4Xxf0pckdaTdb5L0qru3pd3rJB1RaUQzm2tmq81s9aZNmwoOEwBQtMISjplNkdTq7o+EvStU9Urju/tCd5/g7hOampoKiREAEE+R/4fzfklTzWySpAMkHaykxXOImQ1JWzljJb1UYAwAgF6isBaOu3/F3ce6+zhJH5V0v7v/k6QHJM1Kq82RtKyoGAAAvUcjfofzr5K+aGbPKrmmc20DYgAARBblL6bd/eeSfp6Wn5d0fIz5AgB6D540AACIgoQDAIiChAMAiIKEAwCIIspNA42y8apLSuXRZ13YwEiAvuWMJS+WytfPOLKBkaA/oYUDAIiChAMAiIKEAwCIgoQDAIiiX980AKBxrl/SWiqfMWNUAyPp+1qvXFkqjzp3UgMj2T+0cAAAUZBwAABRkHAAAFGQcAAAUZBwAABRkHAAAFGQcAAAUZBwAABR8MNPANG1NG8ulWfOGtnASHrexssfK5VHf+FdDYyk96GFAwCIgoQDAIiChAMAiIKEAwCIgpsG0DAnLZslSbprWnODIynWlJ+Wl+/OU2fVPd605rtL5WWzJvZoTJI0u2VNqbx45jE9Pn
00RuuCxZKkUWfPbnAke6OFAwCIgoQDAIiChAMAiIKEAwCIgoQDAIiChAMAiIKEAwCIgoQDAIiChAMAiIInDUj605Vnl8pHnLuggZEAGKha591fKo8674QGRlIcWjgAgChIOACAKEg4AIAoSDgAgChIOACAKApLOGZ2gJk9bGZPmNnTZnZx2v8oM/uVma01s9vMbFhRMQAAeo8iWzi7JJ3g7sdKOk7SRDN7j6TvSLrc3cdLekXSmQXGAADoJQpLOJ7YnnYOTV8u6QRJ2V8gLpI0vagYAAC9R6E//DSzwZIekfR2SfMlPSfpVXdvS6usk3RElXHnSporSUceeWSRYaKCS245sVS+8LRVDYyk/zm5eakk6Y5ZpzQ4kv7riatbS+VjPz2qgZEgVOhNA+7e7u7HSRor6XhJR1eqVmXche4+wd0nNDU1FRkmACCCKHepufurkn4u6T2SDjGzrGU1VtJLMWIAADRWkXepNZnZIWl5uKR/lPSMpAckzUqrzZG0rKgYAAC9R5HXcMZIWpRexxkkabG732lmayTdamb/IekxSdcWGAMAoJcoLOG4+28kvatC/+eVXM8BAAwgPGkAABAFCQcAEAUJBwAQBQkHABAFfzHdiy2/7qRSeeon72pgJACw/2jhAACiIOEAAKIg4QAAoiDhAACi6BM3DbRt2qJNV90oSWo66/QGR1OfhxZOKZXfO/fOBkaCfTG55ZpSecXMTzUwEqD/oYUDAIiChAMAiIKEAwCIgoQDAIiChAMAiIKEAwCIgoQDAIiChAMAiKKuhGNm99XTDwDQM1rn3aPWefc0OoweVfNJA2Z2gKQDJY00s0MlWTroYElvLjg2AEA/0tWjbT4j6XwlyeURlRPOVknzC4wLANDP1Ew47n6FpCvM7Dx3nxcpJgBAP1TXwzvdfZ6ZvU/SuHAcd/9xQXEBAPqZuhKOmf1E0tskPS6pPe3tkkg4gdU/OlmSNOEzdzQ4EgDofer9e4IJko5xdy8yGABA/1Xv73CeknR4kYEAAPq3els4IyWtMbOHJe3Kerr71EKiAgD0O/UmnK8XGQQAoP+r9y61/yo6EAB9w8VLXyqVLzqF33+jfvXepbZNyV1pkjRM0lBJr7v7wUUFBgDoX+pt4RwUdpvZdEnHFxIRAKBf6tbTot39dkkn9HAsAIB+rN5TajOCzkFKfpfDb3IAAHWr9y61k4Nym6Q/SJrW49EAAPqteq/hnFF0IACA/q3eP2Aba2ZLzazVzDaaWYuZjS06OABA/1HvTQPXS1qu5H9xjpB0R9oPAIC61Jtwmtz9endvS183SGoqMC4AQD9Tb8LZbGanm9ng9HW6pJeLDAwA0L/Um3A+KWm2pA2S1kuaJanmjQRm9hYze8DMnjGzp83s82n/w8zsXjNbm74fuj8LAADoG+pNOP8uaY67N7n7KCUJ6OtdjNMm6QJ3P1rSeySdY2bHSPqypPvcfbyk+9JuAEA/V2/C+Wt3fyXrcPctkt5VawR3X+/uj6blbZKeUXLDwTRJi9JqiyRN39egAQB9T70JZ1B46svMDlP9PxqVmY1TkqB+JWm0u6+XkqQkaVSVceaa2WozW/3y9q31zgoA0EvVmzQulfRLM2tW8kib2ZK+Wc+IZjZCUouk8919q5nVNUN3XyhpoSQd99a/4DE6ANDH1fukgR+b2WolD+w0STPcfU1X45nZUCXJ5iZ3X5L23mhmY9x9vZmNkdTazdgBAH1I3afF0gTTZZLJWNKUuVbSM+5+WTBouaQ5kr6dvi+rd5oAgL6r7oTTDe+X9HFJT5rZ42m/rypJNIvN7ExJL0o6tcAYAAC9RGEJx93/W8npt0o+WNR80f9Mur185/zK6d9uYCQA9ke3/oANAIB9RcIBAERBwgEAREHCAQBEQcIBAERBwgEAREHCAQBEQcIBAERBwgEAREHCAQBEQcIBAERBwgEAREHCAQBEQcIBAERBwgEAREHCAQBEQcIBAERBwgEAREHCAQBEQc
IBAERBwgEAREHCAQBEQcIBAERBwgEAREHCAQBEQcIBAERBwgEAREHCAQBEQcIBAERBwgEAREHCAQBEQcIBAERBwgEAREHCAQBEQcIBAERBwgEAREHCAQBEQcIBAERBwgEARFFYwjGz68ys1cyeCvodZmb3mtna9P3QouYPAOhdimzh3CBpYq7flyXd5+7jJd2XdgMABoDCEo67PyhpS673NEmL0vIiSdOLmj8AoHeJfQ1ntLuvl6T0fVTk+QMAGqTX3jRgZnPNbLWZrX55+9ZGhwMA2E+xE85GMxsjSel7a7WK7r7Q3Se4+4Q3jTg4WoAAgGLETjjLJc1Jy3MkLYs8fwBAgxR5W/Qtkh6S9A4zW2dmZ0r6tqQPmdlaSR9KuwEAA8CQoibs7qdVGfTBouYJAOi9eu1NAwCA/oWEAwCIgoQDAIiChAMAiIKEAwCIgoQDAIiChAMAiIKEAwCIgoQDAIiChAMAiIKEAwCIgoQDAIiChAMAiIKEAwCIorC/J0B97rl2Uqn84TNXNjAS7I8pzTeXynfO+lgDIwF6L1o4AIAoSDgAgChIOACAKEg4AIAo+vxNA5t+uLBUbvrsXLX+cJ4kadRnz+tUb8NV3yiVDz/ra3GCA7owtXl5qbx81lRNbV6RlifXHG96ywOlsgVf46UzP9DDEWJ/bfz+w6Xy6POPb2AkjUcLBwAQBQkHABAFCQcAEAUJBwAQRZ+8aWDTVddLkprOOiP6vNcsmCpJOubs5V3U7HuuuOnEckeNQ5GLF5frXTR7lb7604mSpG+dere+0DKxNOzymXd3K46Tbj+/VL5r+vfrHm/S0u+WyitP+ZImL7lMkrRixhc1eckPSsNWzPhct+Lqi2a2/LpUbpn5bs1qeVyS1DzzuOix3NyyqVR+g6zTsDsWb5YknTx75F7j3XdzMt4HP9akB39SnsbQjnKd985p6pEYX/ru+lL5zV8aU/d4Gy57slQ2q1GxAK3zl5TKo86ZEXfm+4gWDgAgChIOACAKEg4AIIo+eQ2naC/+YHapfOTnFjcwkvrdfENyXeVjn1hVyPT/89b6ru9010nLPh10vbHu8SYt/WbQNbRb857c8qOga3DVelNaFpXKd86coynNNwZDB+ax2yVLN5TKF55yeI9M8+5bN5fKEz+69zWdnvbC5eVlqLUFbbjkuVL58Avf1iPz3viDB0vl2Nd+GmFgfksAANGRcAAAUZBwAABRkHAAAFEM2JsG1i/4csPmff81wZOAI18oXPiT4OJ/5HnPXlb+Uejiad37UeikpReVyitPuXi/YyrKlObbgq7qNyL0Jh9Z8qwk6bYZb29wJN2z+rrWUrnWxf+1V24slcefO7rmNNd/70VJ0ph/OXKvYRsu/a0k6fAL3ll3jBuv+GWpPPrz76t7vLzW+ckPz0edM7Xb02gEWjgAgChIOACAKEg4AIAoSDgAgCgG7E0D9Xr2ymml8tvPXdZp2BNXlS/YHXtW9adH/+Lq8k0CH/j0iprzW3ntpHJH7qL+kuvLF93bgv6zz+h8AX7RDR8uled84h5dtyjp/uSce2rOuwhzlyYxLzylezcJFGXykvmSpBUzzqlZb0rLdUFX4y7+T2++t1S+fdaHatad0fJQqWw99BW/YOk6SdKlp4ytWW/B0vIF+eGx70qpYc1V5bi69zwK9ISGtHDMbKKZ/c7MnjWzxt0uBgCIJnrCMbPBkuZLOknSMZJOM7NjYscBAIirES2c4yU96+7Pu/tuSbdKmtbFOACAPq4RCecISX8Mutel/QAA/Zi5e9wZmp0q6UR3/1Ta/XFJx7v7ebl6cyXNTTvfIel3kkZKyp5dXq3cm4YN1HkTV9+ZN3H1nXl3J663unvP/P92T3D3qC9J75W0Kuj+iqSv1Dnu6q7KvWnYQJ03cfWdeRNX35l3d+PqTa9GnFL7taTxZnaUmQ2T9FFJ1e8pBgD0C9F/h+PubWZ2rqRVSn7YcJ27Px07DgBAXA354ae7r5S0sh
ujLqyj3JuGDdR5E1ffmTdx9Z15dzeuXiP6TQMAgIGJZ6kBAKLYr1NqZnadpCmSWpU8OeDHkv5GyeOKXpbUJKlD0qtpvyGSDpbkkjZJOiAtd0g6LJj065Le2I2QXNH/VgwAerVsHzs4189VbnS0B8PblOyr24LyS5L2KNl/v5aO265kX31QOuxhSZ9x9z3VAtnfFs4NkrInSrZJusDdD5Y0StIOSYuVPElgTRrkRkkPKblhYJikAyVtU3I9x5Ukph1p8C3pwvyzpJ1Kfiyanf/bk85vdzqsPe2/NajTFvTbE3R7UH+nyis+q5sJn4+ZjdeR6xdOM+veE3SHwzwdlpXDOn/MTasjfWXx7A7qbgvib9fesunmY/1Thdh3q2v5c6752KvVk6QNFYZXGj8c1lGlf7gR/7nCvMJ1sTs3j3BYOP0dQXmXOn/m23MxVIt3T4V++XVfq38WR7h8O4Py93LjhcviFbqlZPlfy/UL61faPiuVfx9052OvulPJaa8SYy31bF+Vtqf8d7ZdnbeV7AmetT6L/Lzy6zg/3s4aw/LTyttZoU6Hku3hlbQ7/K63BXXDmH6bjpcN25WWX1PyW5wOJfvg7DM7T8m+xZTsY7+W1t8h6X5J6yW9092HKmlADJF0jqQFShoOh6TDj5Z0qaR3SvorScMlfarG8u5fwnH3ByVtScvr3f3RtLxN0jOSTpQ0WtJlaaCjJV2kJNEcKOkFJT9Quild+OXpgg9Xcvu0lGTWYZLenHZ3KFkBf0ynOUzlVs3wtBxuUIOUrPisNZclIEmaF0xTktYGi/dibnFNe6+vzbnuwWmdSq0sk/S89v5SSNIISc8F9dqVtAAPCvopV96hvY9YwvcduWEbg+7waCYv3Ph3q/qy5OtXSl7DK4xTKeFk6yT/hQ3nE+78nszV71Dnz6bWzjGsd2AQyxAl22Mm+2wrJbdsG9um8nYVrv/8Osv67aowLSnZlsOzDeF6+00u5nA9mjp/juE2PyTXL0s0+XUfxhouqys5WMzKg9T5s852eOG6rXSwYEGdDpW373y9elRLLtmOO79NP6XO6/XhIKZK22s2/TDG/Pc+H284T9Pe38NK88jeKyWcbDvJvvvZNjMofe0K6mUHt6bO290gJWedhqm8Txyh8rqYr/JDsw9Mh21VcsZpp5J9d3bGabLKZ6kmK9mv7Hb3Dkly91s8pWT91n6c+P7+kEfSOElPVei3UdKj6QJkLZEtSr6kHZLuTfvtUHkD3pi+b1OS4bMjyNeUnKLbrXL2Dlsn2evmCv06cuXtKn/5/rdCfQ/mW21YT7zae3BabT04rZ58besFMezLq6PB8y9yObJ++7KtVPqOsX7jvvKfwe4q9fLrtj3o36HkADars03J/i2c1maVt489kv5P0l8qaRn9WVKzktbO9vS1WtJdksan+/yhSvb3H4j6w08zG6HkdNhjSlouz0n6WbqQWbPuT5L+TkkGXq9yy6I5XQkjlDQTX0/HG6GkGbdT5SOK19P3V5WsKEk6OX3PVrLU+WjGlWTx7Ejgb4P+Uvk0yh51biFInZvO+6LSEXIWV6bSqbFKR2Bhs7mr+uE0dyg52gvrVjoazc+r2rBQpVOPmRFdjFttnnmu2vF2FWM12Rc4E34mXZ0yqrXO9lV7N8btUHIUW0mlFmj2nQiPyLs6pTpMlZfzyaCcDa+2vVaKq9K6rfUZZjvQUFuVcjidZ3PdXS1vuK66qtPdbS6cTn495OffpuQzCIeH8/9z+v5SUD/f6pHKp852KFmPw5VsO4PTcbaofB3mSEkrlJyNulvJY8VelNTq7o8oaSW5u0+QdLWk7A+jFkh60N1/UXupe7CFoyTLrZJ0oZLWytg06JuUfNjbVf9Rzs/S+ruCfvnWSrjS869q43X3dUmN6e3ogennX/cH88l2Fm1KvkRh/3xM2Xt49NKq5HpKePTzcjfXT60jqew6SE+vi5ivLP4t+7ge6nm9XmC8XcWYj7ejQr/8a13uc2
4Pyt2JtdJ2m732dHOd1nq15tbPH7qovz9nHXr6jEWldb2rxjx3psOzszjr0u7sex6u2z2Srkn7Pabk8WKPSupI9+OrJD2eTv9/0mmsS9dfdi3pRiWJ7DUll0lulzQo2qNtzMwkXavk/N9TSlo229NAXlDSqnki7feqkibbC0qaZUPTBblayZcyuzNiiMoXzr8l6XSVN6Dn0vLvVb5x4PcqfwjrgvDCI6Bsw1b63q5yayas93JuET8SLm46j0zW2gr7SclO//ygO4wpPH+bbWRhDH+fm152vn6k9r5GkO3sFQwLz12/SZ2PlEyd7wqsdmdffnk8rVvt4uiw3LB8KzGrtzUoh7Kkmhe2ANqCcji+597DOuFBSXiUm53qlco3OGTr4pBgWq+rujCGSi3V0BtqDMvf9CAln2m4zXiu3hZVvw7rKl8wDvtltqq8rjdXGC4lN/+0B8OyeYWt9mx4lkxD+SP4bN2GZxwy2c6ykmxZMvntZE/wHk7zoFy9Mbn55WMIpx9uT5WuOeXfX1P5Ro08V+X57MzVyd5btffpaCnZt1ba3rLr3K0q7ycOVbLsYSsoS2S3S3prOu6VSi5vHCvpZTM7StLR6csl/ZuSa8+zJB2l5HvyorufrmQftUXJtfrTsus6tezXDz/N7BZJ/6BkJ/iKktugn1TSstmtcjPuLWl3doeDKzmVlq2YtUr+jC3bqPMXgZVOt0nS4d0OGAAGruyAMXvP9slSkoyyGxPaVb4t+mklp+AGp+OsUfKfZq1KGg/vVtJw2JZOZ4m7f6NaADxpAAAQBU8aAABEQcIBAERBwgEAREHCAQBEQcIBAERBwgEAREHCAQBEQcIBAETx/0HrmcpcUwA1AAAAAElFTkSuQmCC\n", 291 | "text/plain": [ 292 | "
" 293 | ] 294 | }, 295 | "metadata": {}, 296 | "output_type": "display_data" 297 | } 298 | ], 299 | "source": [ 300 | "ut = UserTable(100)\n", 301 | "uids = [ut.random_uid() for i in range(1000)]\n", 302 | "seaborn.countplot(uids)" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 32, 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "def generate_tweets(models, weights=None, hashtag_weights=[8, 2], ut=None, seed_hashtags=[]):\n", 312 | " if weights is None:\n", 313 | " weights = [1] * len(models)\n", 314 | " \n", 315 | " if ut is None:\n", 316 | " ut = UserTable(10000)\n", 317 | " \n", 318 | " choices = []\n", 319 | " \n", 320 | " total_weight = float(sum(weights))\n", 321 | " \n", 322 | " for i in range(len(weights)):\n", 323 | " choices.append((float(sum(weights[0:i+1])) / total_weight, models[i]))\n", 324 | " \n", 325 | " def choose_model():\n", 326 | " r = numpy.random.uniform()\n", 327 | " for (p, m) in choices:\n", 328 | " if r <= p:\n", 329 | " return m\n", 330 | " return choices[-1][1]\n", 331 | " \n", 332 | " seen_hashtags = set()\n", 333 | " hashtags = []\n", 334 | " total_hashtag_weight = float(sum(hashtag_weights))\n", 335 | " for i in range(len(hashtag_weights)): \n", 336 | " hashtags.append((float(sum(hashtag_weights[0:i+1])) / total_hashtag_weight, collections.deque()))\n", 337 | " \n", 338 | " iws = [1.0 - w for (w, _) in hashtags]\n", 339 | " inverse_weights = [(sum(iws[0:i+1]), i) for _, i in zip(iws, range(len(iws)))] \n", 340 | "\n", 341 | " def choose_from(c):\n", 342 | " idx = math.floor(numpy.random.uniform() * len(c))\n", 343 | " return c[idx]\n", 344 | " \n", 345 | " def store_hashtag(tag):\n", 346 | " if tag not in seen_hashtags:\n", 347 | " seen_hashtags.add(str(tag))\n", 348 | " r = numpy.random.uniform()\n", 349 | " for(p, deq) in hashtags:\n", 350 | " if r <= p:\n", 351 | " deq.append(tag)\n", 352 | " \n", 353 | " def choose_hashtag():\n", 354 | " r = numpy.random.uniform()\n", 355 | " 
for(p, i) in hashtags:\n", 356 | " if r <= - p and len(hashtags[i][1]) > 0:\n", 357 | " return choose_from(hashtags[i][1])\n", 358 | " return len(hashtags[0][1]) > 0 and choose_from(hashtags[0][1]) or choose_from(hashtags[1][1])\n", 359 | " \n", 360 | " for tag in seed_hashtags:\n", 361 | " seen_hashtags.add(str(tag))\n", 362 | " hashtags[-1][1].append(str(tag))\n", 363 | " \n", 364 | " while True:\n", 365 | " tweet, tags = hashtagify_full(make_sentence(choose_model()))\n", 366 | " for tag in tags:\n", 367 | " store_hashtag(str(tag))\n", 368 | " \n", 369 | " this_tweet_tags = set([str(t) for t in tags])\n", 370 | " \n", 371 | " if len(seen_hashtags) > 0:\n", 372 | " for i in range(min(numpy.random.poisson(3), len(seen_hashtags))):\n", 373 | " tag = choose_hashtag()\n", 374 | " if str(tag) not in this_tweet_tags:\n", 375 | " this_tweet_tags.add(str(tag))\n", 376 | " tweet += \" %s\" % str(tag)\n", 377 | " \n", 378 | " yield (ut.random_uid(), tweet)" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": 33, 384 | "metadata": {}, 385 | "outputs": [], 386 | "source": [ 387 | "seed_hashtags=[\"#ff\", \"#marketing\", \"#fail\", \"#followfriday\", \"#yolo\", \"#retweet\", \"#tbt\", \"#socialmedia\", \"#startup\", \"#blogpost\", \"#news\", \"#health\"]\n", 388 | "\n", 389 | "t = generate_tweets([austen_model, positive_model, negative_model, compound_model], [22, 4, 4, 2], seed_hashtags=seed_hashtags)" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": 34, 395 | "metadata": {}, 396 | "outputs": [ 397 | { 398 | "data": { 399 | "text/plain": [ 400 | "" 401 | ] 402 | }, 403 | "execution_count": 34, 404 | "metadata": {}, 405 | "output_type": "execute_result" 406 | } 407 | ], 408 | "source": [ 409 | "t" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": 35, 415 | "metadata": { 416 | "scrolled": true 417 | }, 418 | "outputs": [ 419 | { 420 | "data": { 421 | "text/plain": [ 422 | "[(4919108956,\n", 423 | " 
'If a woman conceals her affection with the same intent, just then the stoutest of the #three; for the very exertion to which he was to proceed on his journey to Town. #health'),\n", 424 | " (2321890080,\n", 425 | " 'It has protein as well, which I appreciate as a mom. #fail #retweet #tbt'),\n", 426 | " (8288438770,\n", 427 | " \"Yes, that's how good these blueberries are! #startup #health #yolo #socialmedia\"),\n", 428 | " (3873645654,\n", 429 | " \"Mr. #Elton's engagement in the conversation of Mr #Cleveland. #three #news #ff\"),\n", 430 | " (2866175761,\n", 431 | " 'It was impossible for her not to be supposed that he would still love, and still hope! #health #three'),\n", 432 | " (9553742504,\n", 433 | " 'In such a frame of mind as she was resettled, was looking forward with eagerness. #yolo #news'),\n", 434 | " (7212767217,\n", 435 | " 'Depend upon it, that is a place, indeed, and we had a rehearsal. #Cleveland #ff'),\n", 436 | " (4953926703,\n", 437 | " \"I wouldn't stop taking this tea for menstral cramps. #followfriday #marketing #socialmedia #yolo\"),\n", 438 | " (9760872753, 'Clearly adictive #Haribo product. #news #fail #yolo'),\n", 439 | " (9399882181,\n", 440 | " 'The light they give off is pink and doesnt taste like plastic!!!KEEP AWAY FROM ZICO, until they change to GMO free items. #tbt'),\n", 441 | " (3342614033,\n", 442 | " 'No; he was talking to Miss #Fairfax. #marketing #fail #health #Cleveland'),\n", 443 | " (3886111789,\n", 444 | " 'This was the season of happiness to #Marianne. #retweet #tbt #fail'),\n", 445 | " (2224935228, 'But ah! united, what reverse we have! #marketing #Marianne'),\n", 446 | " (593851227,\n", 447 | " 'After securing accommodations, and ordering a dinner at #Hartfield for the #Eltons.'),\n", 448 | " (5916981711, 'I must go away. #blogpost #Fairfax'),\n", 449 | " (9618905701,\n", 450 | " 'He is not a bit better than the obsolete survival of the sentimental novel represented by #MarianneDashwood. 
#retweet #health #Fairfax'),\n", 451 | " (3703665075, 'Oh, did I mention fresh? #news #three #blogpost #tbt'),\n", 452 | " (8700765252,\n", 453 | " 'His coming to visit them. #three #Fairfax #Marianne #marketing #Hartfield'),\n", 454 | " (2222522121, 'This is a total waste of money.'),\n", 455 | " (1251878805,\n", 456 | " \"#Halfanhour passed away, and #Catherine, having spent the best part of an #Englishman's constitution. #fail\"),\n", 457 | " (4333999649, \"It didn't taste like coconut! #Cleveland #health #yolo #fail\"),\n", 458 | " (6354044852,\n", 459 | " 'It was so long in company with #two men, striking instances of what I had ever done before. #fail #Marianne #Cleveland'),\n", 460 | " (394410389,\n", 461 | " \"In no countenance was attentive curiosity so strongly marked as in Miss #Bingley's, in spite of all the privacy and propriety which was talked about at #first. #Hartfield #Catherine\"),\n", 462 | " (9973510655,\n", 463 | " 'My poor mother is really ill, and keeps her room. #news #retweet'),\n", 464 | " (6488301657, 'As he said this, each looked towards their mother. #retweet'),\n", 465 | " (6832681644,\n", 466 | " 'We would giggle at the fact that the Mi-Del cookies are low-fat and use some organic ingredients. #followfriday #retweet #news #first #Englishman'),\n", 467 | " (8442727273, 'So I took a sip. #Cleveland #Hartfield #news #Halfanhour'),\n", 468 | " (399057138, 'We returned it. #Hartfield #health #three'),\n", 469 | " (6438108847,\n", 470 | " 'Mr. #Phillips visited them all, and especially to her friend. #news #followfriday #Bingley #two #Eltons #Marianne #first #Fairfax #Cleveland'),\n", 471 | " (9985258002,\n", 472 | " 'It was so long since #Fanny had had any letter from #JaneFairfax. #yolo #followfriday'),\n", 473 | " (2388950590,\n", 474 | " 'Too soon did she find herself prevented by a settled rain from going out again after dinner.'),\n", 475 | " (8130810245, '#Frederica makes me very unhappy! 
#socialmedia #three'),\n", 476 | " (152927674, 'And she did not talk of that odious man. #socialmedia #ff #tbt'),\n", 477 | " (7639367133,\n", 478 | " '#Anne could only feel that #CharlesHayter was not well inclined towards Captain #Wentworth. #socialmedia #Marianne #Eltons'),\n", 479 | " (4291665030,\n", 480 | " 'Most earnestly did she despise her daughter-in-law for it, that, on the arrival of #Saturday. #Marianne'),\n", 481 | " (7915444217,\n", 482 | " 'Indeed I do you justice, my good friend. #socialmedia #Bingley #Englishman #Catherine #news #JaneFairfax #followfriday'),\n", 483 | " (8833516911,\n", 484 | " 'This is great food and all his friends #LOVE the Switch Black Cherry is a healthy breakfast for you Unlike some other brands of shredded wheat. #ff #tbt #Englishman #Frederica'),\n", 485 | " (9469380334,\n", 486 | " 'My dogs are finicky about what she eats, she usually behaves much more enthusiastic about a product. #yolo #first #followfriday'),\n", 487 | " (3235807434,\n", 488 | " 'Thrush in all his notions and behaviour; not likely to produce evil to her. #Marianne #Cleveland #Halfanhour #marketing'),\n", 489 | " (6759680239,\n", 490 | " 'A variety of occupations, of objects, and of company, which could not be mistaken,--it _was_ #Edward. #yolo #fail'),\n", 491 | " (3769769087,\n", 492 | " 'His behaviour to her sister should perceive that she experienced nothing but grief and disappointment in the course of #threeorfourdays could produce any. #fail'),\n", 493 | " (6449278268,\n", 494 | " 'She was sure they must be talking of #Bath, when I am here. #Edward #two #Fairfax #Anne'),\n", 495 | " (461661988,\n", 496 | " 'She had thought her wretchedly altered, and in the #first and the handsomest; and after paying his compliments en passant to #MissBates and her niece, who had accidentally met. 
#LOVE #Hartfield'),\n", 497 | " (9704213137,\n", 498 | " 'As Mr #Shepherd perceived that this connexion of the Crofts did them no service with Sir #Walter, he mentioned it no more; returning, with all his heart. #retweet #Catherine #followfriday #Saturday'),\n", 499 | " (9379994399,\n", 500 | " '#Elizabeth longed to observe that she had been wont to suppose. #startup #followfriday'),\n", 501 | " (9438789668,\n", 502 | " 'He had ruined for a while would be a #ninth--and #Emma apprehended that it would have been enough for #Anne; but #LadyRussell saw either less or more than her young friend. #health #Englishman'),\n", 503 | " (7087291541, 'They separated too much into parties.'),\n", 504 | " (4755154804,\n", 505 | " 'Her astonishment and confusion were very great on his so sudden appearance. #threeorfourdays #Eltons #startup #followfriday'),\n", 506 | " (8168063914, 'Have you had any flirting? #first #Elizabeth'),\n", 507 | " (4618630492, 'You are quite enough alike. #fail #tbt'),\n", 508 | " (5971801089,\n", 509 | " 'I used to think it is fair traded and price-competitive.The Millstone #KonaBlend tastes essentially the same since it dissolves in cold tea. #Bath #retweet #yolo'),\n", 510 | " (450922284,\n", 511 | " \"It's the best tea on the smallest setting on the bread machine, and added raisins cinnamon and sugar without being overpoweringly so. #followfriday #LOVE #Marianne #three #ff\"),\n", 512 | " (952893218,\n", 513 | " \"I don't like the taste, but it's a dark color, slimy and off tasting. #two #Fairfax #Frederica #Cleveland #news\"),\n", 514 | " (5496778563,\n", 515 | " 'Not so #theMissSteeles. #KonaBlend #Hartfield #yolo #threeorfourdays'),\n", 516 | " (7152123632,\n", 517 | " \"#Anne's object was, not to be ready in time; for she may have a natural talent for--thinks strongly and clearly--and when he takes a pen in hand, his thoughts naturally find proper words. 
#Bath #Cleveland\"),\n", 518 | " (9326183240,\n", 519 | " '#Marianne looked very grave and said nothing. #Bingley #Bath #JaneFairfax'),\n", 520 | " (5742785569,\n", 521 | " 'I suppose I shall not easily forget its having stopped #twohours at #PettyFrance. #news'),\n", 522 | " (1788658840,\n", 523 | " 'When he questioned, Sir #Walter and #Miss. They are gone off together from #Brighton. #Shepherd #Emma'),\n", 524 | " (5854714591,\n", 525 | " '#Elizabeth immediately began playing again. #followfriday #Edward #Elton'),\n", 526 | " (2062223248,\n", 527 | " 'It is not for us to be going to tell me that we never were to meet #MissTilney again continued in full force at the end of the gallery, stood before her! #Englishman'),\n", 528 | " (6314739995,\n", 529 | " 'My wife and I drank this coffee black which I rarely do but any creamer would have just thrown out the remaining jars. #yolo #Fairfax #startup'),\n", 530 | " (900814851,\n", 531 | " 'Every time I have share the chips with selected friends that pass for energy drinks. #Eltons #Anne'),\n", 532 | " (5278058847,\n", 533 | " 'His admiration was certainly very warm; yet she thought, but for Mrs. #Weston, it would not have had poor #James think himself slighted upon any account; and I am not afraid. #retweet'),\n", 534 | " (7801351645,\n", 535 | " \"#Bingley had never met before, and those who met too often; a commonplace business, too numerous for intimacy, too small for anybody's comfort. #ff #Frederica #KonaBlend\"),\n", 536 | " (684973748, 'I read multiple reviews on the site?'),\n", 537 | " (3104873366,\n", 538 | " 'She feared that under this persuasion she had been so long concealed? #PettyFrance'),\n", 539 | " (296132541, '#Afewhours were to have on the #morrow. #two #KonaBlend'),\n", 540 | " (1211660671,\n", 541 | " 'He was too diffident to do justice to the character of her own disinclination for going to #London. 
#yolo'),\n", 542 | " (2382997434,\n", 543 | " 'How often have I wished that I possessed as little Personal Beauty as you do; who have not, at least, been given a taste for those sort of visits conveyed, might shortly be over. #Cleveland #Halfanhour'),\n", 544 | " (5213456829, 'This coffee is a winner. #Englishman #yolo #Bath #Shepherd'),\n", 545 | " (3285590114, 'To retreat was impossible. #KonaBlend #morrow'),\n", 546 | " (2747213011, 'Did you think me already. #PettyFrance #morrow'),\n", 547 | " (5994597262,\n", 548 | " 'I was thoroughly unwilling to let her be an active, useful sort of person, not brought up high, but able to make my brother like her. #theMissSteeles #morrow #yolo #Anne'),\n", 549 | " (9048591904, \"Can't beat the convenience! #James #Emma\"),\n", 550 | " (9450941761,\n", 551 | " 'But--good Lord! how unlucky! #first #threeorfourdays #followfriday #Hartfield #news'),\n", 552 | " (4538204684, 'Later I went back to #three. #Englishman'),\n", 553 | " (303574565,\n", 554 | " 'It was a stab, in spite of all the dialogue which ensued of surprize, and inquiry, and congratulations on her side, and I can _answer_ for its being returned. #Saturday #followfriday'),\n", 555 | " (4291665030,\n", 556 | " 'I am all impatience to see the #Thrush before she went off to see #Edward. #ff'),\n", 557 | " (1346041157, 'It would be a valuable neighbour to Mrs. #Bennet. #Saturday'),\n", 558 | " (7374097662,\n", 559 | " 'It is too ridiculous! #Emma #marketing #first #Elizabeth #health'),\n", 560 | " (1505084229,\n", 561 | " \"This was so material an amendment of his late father's steward, to be his home, was pondering with downcast eyes and contracted brow, she felt secure from all possibility of wronging him. #London\"),\n", 562 | " (2924402958,\n", 563 | " 'Who could be happier than ever; and were we at liberty, I doubt if I could see she was altered; but, however, she seemed to _try_ to be very much in love with #ReginaldDeCourcy! 
#Emma #Elton'),\n", 564 | " (4214961777,\n", 565 | " \"Packaging is awesome and I love this candy and I couldn't do it! #Thrush #first #ff\"),\n", 566 | " (1812911863,\n", 567 | " 'However, #Amazon has easy return process. #KonaBlend #MissTilney #Englishman'),\n", 568 | " (8526601574, 'They were obliged to move. #Emma #PettyFrance #Thrush'),\n", 569 | " (6875955234, 'Mr. #Crawford has been kept waiting too long already. #Edward'),\n", 570 | " (691433228, 'Not. #morrow #startup #Weston'),\n", 571 | " (4078752725,\n", 572 | " 'The ##first solid consolation which #Fanny received for the evils of home, the ##first which her judgment could entirely approve, and which gave any promise of durability, was in a better way, I am sure.'),\n", 573 | " (3782650374,\n", 574 | " 'And they think she will have more to say. #health #Marianne #Eltons #Englishman #Anne'),\n", 575 | " (4818074797,\n", 576 | " \"I have read your feelings, as I think you and I must say that, to be making love, by breaking his mistress's head, is not it, for a cold to hang upon her? #fail #startup #LOVE #Halfanhour\"),\n", 577 | " (4110704046,\n", 578 | " '#JohnKnightley only was in mute astonishment.--That a man who has not a bad face, but there is something wanting--his figure is not striking; but I think there may be peace. #marketing'),\n", 579 | " (5907860263,\n", 580 | " 'This however was more than satisfied, for she came perfectly aware that none would be felt, and if I do not indeed. #retweet #Bingley #health #Elton #Englishman #Halfanhour #Amazon'),\n", 581 | " (9164208785,\n", 582 | " 'This product has bad reviews because it is considered perishable item. #Fairfax #theMissSteeles #PettyFrance #Elizabeth'),\n", 583 | " (4923548050,\n", 584 | " 'But I will pawn to you an undirected Letter that I received from #Chloe. 
#Bath #health #Saturday #followfriday'),\n", 585 | " (6469661309,\n", 586 | " 'The stiffness of the meeting soon gave way before their popular manners and more diffused intimacies: little groups were formed, and everybody grew comfortable. #fail #marketing'),\n", 587 | " (2546862783,\n", 588 | " \"How can this sugar-free, taste-free, light and fluffy as #Redenbacher's. #Shepherd #fail\"),\n", 589 | " (6218402611,\n", 590 | " 'Maria desired no greater pleasure than ever in her opinion, of all things in the world than her real #eight-and-thirty. #followfriday #health #MissTilney #Marianne #Catherine'),\n", 591 | " (6030506410,\n", 592 | " 'Though #Julia fancies she prefers tragedy, I would not trust her in it. #ReginaldDeCourcy #Eltons #Cleveland'),\n", 593 | " (8862611867,\n", 594 | " 'You might, some time or other, to look at--or my tour to read--or my poem. #retweet'),\n", 595 | " (8266791414,\n", 596 | " 'But his pride, his abominable pride--his shameless avowal of what he had done which satisfied her; his style was not penitent, but haughty. 
#Brighton #Eltons #fail')]" 597 | ] 598 | }, 599 | "execution_count": 35, 600 | "metadata": {}, 601 | "output_type": "execute_result" 602 | } 603 | ], 604 | "source": [ 605 | "[next(t) for i in range(100)]" 606 | ] 607 | }, 608 | { 609 | "cell_type": "code", 610 | "execution_count": null, 611 | "metadata": {}, 612 | "outputs": [], 613 | "source": [ 614 | "import cProfile\n", 615 | "\n", 616 | "def timing(c):\n", 617 | " for _ in range(c):\n", 618 | " next(t)\n", 619 | "\n", 620 | "cProfile.run('timing(2000)', 'generatestats')" 621 | ] 622 | }, 623 | { 624 | "cell_type": "code", 625 | "execution_count": null, 626 | "metadata": {}, 627 | "outputs": [], 628 | "source": [ 629 | "import pstats\n", 630 | "p = pstats.Stats('generatestats')\n", 631 | "p.strip_dirs().sort_stats(-1).print_stats()" 632 | ] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "execution_count": null, 637 | "metadata": {}, 638 | "outputs": [], 639 | "source": [ 640 | "import pstats\n", 641 | "p = pstats.Stats('generatestats-old')\n", 642 | "p.strip_dirs().sort_stats(-1).print_stats()" 643 | ] 644 | }, 645 | { 646 | "cell_type": "code", 647 | "execution_count": null, 648 | "metadata": {}, 649 | "outputs": [], 650 | "source": [] 651 | } 652 | ], 653 | "metadata": { 654 | "kernelspec": { 655 | "display_name": "Python 3.6", 656 | "language": "python", 657 | "name": "jupyter" 658 | }, 659 | "language_info": { 660 | "codemirror_mode": { 661 | "name": "ipython", 662 | "version": 3 663 | }, 664 | "file_extension": ".py", 665 | "mimetype": "text/x-python", 666 | "name": "python", 667 | "nbconvert_exporter": "python", 668 | "pygments_lexer": "ipython3", 669 | "version": "3.6.5" 670 | } 671 | }, 672 | "nbformat": 4, 673 | "nbformat_minor": 2 674 | } 675 | -------------------------------------------------------------------------------- /notebooks/images/0001eeaf4aed83f9.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/0001eeaf4aed83f9.jpg -------------------------------------------------------------------------------- /notebooks/images/0004886b7d043cfd.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/0004886b7d043cfd.jpg -------------------------------------------------------------------------------- /notebooks/images/000595fe6fee6369.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/000595fe6fee6369.jpg -------------------------------------------------------------------------------- /notebooks/images/00075905539074f2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/00075905539074f2.jpg -------------------------------------------------------------------------------- /notebooks/images/0007cebe1b2ba653.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/0007cebe1b2ba653.jpg -------------------------------------------------------------------------------- /notebooks/images/0007d6cf88afaa4a.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/0007d6cf88afaa4a.jpg -------------------------------------------------------------------------------- /notebooks/images/0008e425fb49a2bf.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/0008e425fb49a2bf.jpg -------------------------------------------------------------------------------- /notebooks/images/0009bad4d8539bb4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/0009bad4d8539bb4.jpg -------------------------------------------------------------------------------- /notebooks/images/000a045a0715d64d.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/000a045a0715d64d.jpg -------------------------------------------------------------------------------- /notebooks/images/000a1249af2bc5f0.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/000a1249af2bc5f0.jpg -------------------------------------------------------------------------------- /notebooks/images/000ada55d36b4bcb.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/000ada55d36b4bcb.jpg -------------------------------------------------------------------------------- /notebooks/images/000c4d66ce89aa69.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/000c4d66ce89aa69.jpg 
-------------------------------------------------------------------------------- /notebooks/images/111029deeea453f5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/111029deeea453f5.jpg -------------------------------------------------------------------------------- /notebooks/images/111147418c6aca65.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/111147418c6aca65.jpg -------------------------------------------------------------------------------- /notebooks/images/11124dd4875ecf8d.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/11124dd4875ecf8d.jpg -------------------------------------------------------------------------------- /notebooks/images/1114b632f4336821.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/1114b632f4336821.jpg -------------------------------------------------------------------------------- /notebooks/images/111b8f9f49f2c6d2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/111b8f9f49f2c6d2.jpg -------------------------------------------------------------------------------- /notebooks/images/111c6bd9ac7173b7.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/111c6bd9ac7173b7.jpg -------------------------------------------------------------------------------- /notebooks/images/2222e793ebf6cc0e.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/2222e793ebf6cc0e.jpg -------------------------------------------------------------------------------- /notebooks/images/2224582bc4a8ec9a.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/2224582bc4a8ec9a.jpg -------------------------------------------------------------------------------- /notebooks/images/22249f8dc1f94a8d.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/22249f8dc1f94a8d.jpg -------------------------------------------------------------------------------- /notebooks/images/2225364f47d276d1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/2225364f47d276d1.jpg -------------------------------------------------------------------------------- /notebooks/images/2226c86e9f50fd72.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/2226c86e9f50fd72.jpg -------------------------------------------------------------------------------- /notebooks/images/2227f2fd5d49cb15.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/2227f2fd5d49cb15.jpg -------------------------------------------------------------------------------- /notebooks/images/222a266190060531.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/222a266190060531.jpg -------------------------------------------------------------------------------- /notebooks/images/222a383653c65184.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/222a383653c65184.jpg -------------------------------------------------------------------------------- /notebooks/images/222acfb0975199b1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/222acfb0975199b1.jpg -------------------------------------------------------------------------------- /notebooks/images/222c38d9991e97e2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/222c38d9991e97e2.jpg -------------------------------------------------------------------------------- /notebooks/images/222ce6a4d930b047.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/222ce6a4d930b047.jpg 
-------------------------------------------------------------------------------- /notebooks/images/222e8ad382d2bb98.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/222e8ad382d2bb98.jpg -------------------------------------------------------------------------------- /notebooks/images/222fd997433b59ce.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/222fd997433b59ce.jpg -------------------------------------------------------------------------------- /notebooks/images/3332995b230f5680.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/3332995b230f5680.jpg -------------------------------------------------------------------------------- /notebooks/images/333310bdebc23b93.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/333310bdebc23b93.jpg -------------------------------------------------------------------------------- /notebooks/images/3333210d97865136.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/3333210d97865136.jpg -------------------------------------------------------------------------------- /notebooks/images/33334fc1df5b2536.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/33334fc1df5b2536.jpg -------------------------------------------------------------------------------- /notebooks/images/333452abf04d1b2b.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/333452abf04d1b2b.jpg -------------------------------------------------------------------------------- /notebooks/images/33369f252faf0b2d.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/33369f252faf0b2d.jpg -------------------------------------------------------------------------------- /notebooks/images/33377099dedaefa9.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/33377099dedaefa9.jpg -------------------------------------------------------------------------------- /notebooks/images/33398fd76df994db.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/33398fd76df994db.jpg -------------------------------------------------------------------------------- /notebooks/images/3339a5c598981879.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/3339a5c598981879.jpg -------------------------------------------------------------------------------- /notebooks/images/3339cba64cb1927e.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/3339cba64cb1927e.jpg -------------------------------------------------------------------------------- /notebooks/images/3339ed0aad663343.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/3339ed0aad663343.jpg -------------------------------------------------------------------------------- /notebooks/images/333ed5e609f26e46.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/333ed5e609f26e46.jpg -------------------------------------------------------------------------------- /notebooks/images/333fc945c316344f.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/333fc945c316344f.jpg -------------------------------------------------------------------------------- /notebooks/images/aaa028a23c5052fa.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/aaa028a23c5052fa.jpg -------------------------------------------------------------------------------- /notebooks/images/aaa0b7ebb3e4affe.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/aaa0b7ebb3e4affe.jpg 
-------------------------------------------------------------------------------- /notebooks/images/aaa12411c2d6378d.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/aaa12411c2d6378d.jpg -------------------------------------------------------------------------------- /notebooks/images/aaa1ace50eb16015.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/aaa1ace50eb16015.jpg -------------------------------------------------------------------------------- /notebooks/images/aaa2a076ae7feb33.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/aaa2a076ae7feb33.jpg -------------------------------------------------------------------------------- /notebooks/images/aaa3d54ae3f0c1ae.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/aaa3d54ae3f0c1ae.jpg -------------------------------------------------------------------------------- /notebooks/images/aaa517fbf112c358.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/aaa517fbf112c358.jpg -------------------------------------------------------------------------------- /notebooks/images/aaa55aa836a17d0c.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/aaa55aa836a17d0c.jpg -------------------------------------------------------------------------------- /notebooks/images/aaa576abbfdd9a8e.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/aaa576abbfdd9a8e.jpg -------------------------------------------------------------------------------- /notebooks/images/aaa5a9c49681685a.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/aaa5a9c49681685a.jpg -------------------------------------------------------------------------------- /notebooks/images/aaa8bd02f557fc5c.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/aaa8bd02f557fc5c.jpg -------------------------------------------------------------------------------- /notebooks/images/aaaba578031e2017.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/aaaba578031e2017.jpg -------------------------------------------------------------------------------- /notebooks/images/aaaf093e2c2a45f1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/aaaf093e2c2a45f1.jpg -------------------------------------------------------------------------------- /notebooks/images/bbb06ba600c9b472.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/bbb06ba600c9b472.jpg -------------------------------------------------------------------------------- /notebooks/images/bbb1eb4d12a66c6b.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/bbb1eb4d12a66c6b.jpg -------------------------------------------------------------------------------- /notebooks/images/bbb1ff6486a289be.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/bbb1ff6486a289be.jpg -------------------------------------------------------------------------------- /notebooks/images/bbb4a398207d7237.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/bbb4a398207d7237.jpg -------------------------------------------------------------------------------- /notebooks/images/bbb4a411c275f953.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/bbb4a411c275f953.jpg -------------------------------------------------------------------------------- /notebooks/images/bbb4d54888e5fc73.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/bbb4d54888e5fc73.jpg 
-------------------------------------------------------------------------------- /notebooks/images/bbb60c5612dec65e.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/bbb60c5612dec65e.jpg -------------------------------------------------------------------------------- /notebooks/images/bbb74721560555e3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/bbb74721560555e3.jpg -------------------------------------------------------------------------------- /notebooks/images/bbb7a4d1abe795da.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/bbb7a4d1abe795da.jpg -------------------------------------------------------------------------------- /notebooks/images/bbbb419a2c107d5a.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/bbbb419a2c107d5a.jpg -------------------------------------------------------------------------------- /notebooks/images/bbbe9fd021044d50.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/bbbe9fd021044d50.jpg -------------------------------------------------------------------------------- /notebooks/images/bbbfe6b58a3ac009.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/bbbfe6b58a3ac009.jpg -------------------------------------------------------------------------------- /notebooks/images/ccc1057a1160aba1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/ccc1057a1160aba1.jpg -------------------------------------------------------------------------------- /notebooks/images/ccc135df5430520b.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/ccc135df5430520b.jpg -------------------------------------------------------------------------------- /notebooks/images/ccc18570ba287be1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/ccc18570ba287be1.jpg -------------------------------------------------------------------------------- /notebooks/images/ccc1a2d44a290368.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/ccc1a2d44a290368.jpg -------------------------------------------------------------------------------- /notebooks/images/ccc1ffd2d3d5d2c9.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/ccc1ffd2d3d5d2c9.jpg -------------------------------------------------------------------------------- /notebooks/images/ccc3f0bc6500081c.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/ccc3f0bc6500081c.jpg -------------------------------------------------------------------------------- /notebooks/images/ccc47704bcce60ef.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/ccc47704bcce60ef.jpg -------------------------------------------------------------------------------- /notebooks/images/ccc6cc13dd83bfe7.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/ccc6cc13dd83bfe7.jpg -------------------------------------------------------------------------------- /notebooks/images/ccca79f1e7646f88.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/ccca79f1e7646f88.jpg -------------------------------------------------------------------------------- /notebooks/images/cccd58f60624eaa2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/cccd58f60624eaa2.jpg -------------------------------------------------------------------------------- /notebooks/images/ccce1c730c1696db.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/ccce1c730c1696db.jpg 
-------------------------------------------------------------------------------- /notebooks/images/ccce275285f1d8c0.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/ccce275285f1d8c0.jpg -------------------------------------------------------------------------------- /notebooks/images/ddd38a507dec8dde.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/ddd38a507dec8dde.jpg -------------------------------------------------------------------------------- /notebooks/images/ddd3f96ff7f0bc78.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/ddd3f96ff7f0bc78.jpg -------------------------------------------------------------------------------- /notebooks/images/ddd4549df01b95e0.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/ddd4549df01b95e0.jpg -------------------------------------------------------------------------------- /notebooks/images/ddd56ec6489b89e3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/ddd56ec6489b89e3.jpg -------------------------------------------------------------------------------- /notebooks/images/ddd78d7007d17a75.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/ddd78d7007d17a75.jpg -------------------------------------------------------------------------------- /notebooks/images/ddd8b588942ceded.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/ddd8b588942ceded.jpg -------------------------------------------------------------------------------- /notebooks/images/dddd6279b633d7ab.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/dddd6279b633d7ab.jpg -------------------------------------------------------------------------------- /notebooks/images/eee10577be06c0bb.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/eee10577be06c0bb.jpg -------------------------------------------------------------------------------- /notebooks/images/eee273732a3b2608.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/eee273732a3b2608.jpg -------------------------------------------------------------------------------- /notebooks/images/eee297394eb90e24.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/eee297394eb90e24.jpg -------------------------------------------------------------------------------- /notebooks/images/eee3d8fffe8a9cfd.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/eee3d8fffe8a9cfd.jpg -------------------------------------------------------------------------------- /notebooks/images/eee47025e4848d0a.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/eee47025e4848d0a.jpg -------------------------------------------------------------------------------- /notebooks/images/eee4de386df902e5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/eee4de386df902e5.jpg -------------------------------------------------------------------------------- /notebooks/images/eee4fa0981f728c7.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/eee4fa0981f728c7.jpg -------------------------------------------------------------------------------- /notebooks/images/eee560c9e2411d91.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/eee560c9e2411d91.jpg -------------------------------------------------------------------------------- /notebooks/images/eee6b7c34d84c9ca.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/eee6b7c34d84c9ca.jpg 
-------------------------------------------------------------------------------- /notebooks/images/eee707f7382991dc.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/eee707f7382991dc.jpg -------------------------------------------------------------------------------- /notebooks/images/eee762f22867b8d4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/eee762f22867b8d4.jpg -------------------------------------------------------------------------------- /notebooks/images/eee8820b315c4c3b.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/eee8820b315c4c3b.jpg -------------------------------------------------------------------------------- /notebooks/images/eeebfdcbce12a2d9.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/eeebfdcbce12a2d9.jpg -------------------------------------------------------------------------------- /notebooks/images/eeed1e944c331791.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/eeed1e944c331791.jpg -------------------------------------------------------------------------------- /notebooks/images/eeef28bbe1dd1d18.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/eeef28bbe1dd1d18.jpg -------------------------------------------------------------------------------- /notebooks/images/fff0debd2911bfbd.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/fff0debd2911bfbd.jpg -------------------------------------------------------------------------------- /notebooks/images/fff149d613bacda0.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/fff149d613bacda0.jpg -------------------------------------------------------------------------------- /notebooks/images/fff2268a1b921e8e.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/fff2268a1b921e8e.jpg -------------------------------------------------------------------------------- /notebooks/images/fff277539bd8a2be.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/fff277539bd8a2be.jpg -------------------------------------------------------------------------------- /notebooks/images/fff3ce694bc02a09.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/fff3ce694bc02a09.jpg -------------------------------------------------------------------------------- /notebooks/images/fff50186c03c8474.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/fff50186c03c8474.jpg -------------------------------------------------------------------------------- /notebooks/images/fff5d10dd5ad119d.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/fff5d10dd5ad119d.jpg -------------------------------------------------------------------------------- /notebooks/images/fff820866f567015.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/fff820866f567015.jpg -------------------------------------------------------------------------------- /notebooks/images/fffc2f36b181a4fb.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/fffc2f36b181a4fb.jpg -------------------------------------------------------------------------------- /notebooks/images/ffff21932da3ed01.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/images/ffff21932da3ed01.jpg -------------------------------------------------------------------------------- /notebooks/opencv-basics.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data prep for image recognition\n", 8 | "\n", 9 | "The purpose of this short notebook is to introduce the 
most basic features of the OpenCV library, focusing on features that will make it possible to use intelligent APIs on image data. We'll then see how to use a pretrained object detection model to find real-world objects in images." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import cv2\n", 19 | "import numpy as np" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "The first thing we'll try is reading an image from a file. OpenCV makes it easy to decode popular image formats, and this notebook has access to an image file we can read." 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "def loadImage(f):\n", 36 | " \"\"\" Load an image and convert it from BGR color space \n", 37 | " (which OpenCV uses) to RGB color space (which pyplot expects) \"\"\"\n", 38 | " \n", 39 | " return cv2.cvtColor(cv2.imread(f, cv2.IMREAD_COLOR), cv2.COLOR_BGR2RGB)\n", 40 | " \n", 41 | "img = loadImage(\"otto.jpg\")" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "## Working with images as arrays\n", 49 | "\n", 50 | "This will get us a `numpy` array containing the pixels from a picture of a confused schnauzer who did not expect to wind up unable to get out of the clothes basket. \n", 51 | "\n", 52 | "We can look at the size of the array:" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "img.shape" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "We can examine the image itself by plotting it." 
69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "%matplotlib inline\n", 78 | "import matplotlib.pyplot as plt\n", 79 | "plt.imshow(img)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "While our focus is on using pretrained models, if we were training a model, it may be useful to transform, blur, or resize images in order to generate more training data from a few images. Since our images are `numpy` arrays, this is relatively straightforward in general, but OpenCV provides functions to make these tasks even easier. We'll see how to\n", 87 | "\n", 88 | "- blur an input image with a 15x15 box blur,\n", 89 | "- resize an image and interpolate between pixels in the source data, and\n", 90 | "- rotate an image without calculating a transformation matrix\n", 91 | "\n", 92 | "First, let's look at box blur:" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "plt.imshow(cv2.blur(img, (15,15)))" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "We can also scale the image by a factor of 3 on both axes (notice the difference in the axes on the plotted image, even though the size doesn't change)." 
109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "plt.imshow(cv2.resize(img, None, fx=3, fy=3, interpolation=cv2.INTER_CUBIC))" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "It's also possible to stretch the image by scaling along axes differently:" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "plt.imshow(cv2.resize(img, None, fx=2.5, fy=3, interpolation=cv2.INTER_CUBIC))" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "We can also rotate the image. Recall that rotation is an affine tranformation on image matrices. OpenCV provides a function to calculate the transformation matrix, given a point to rotate around, an angle of rotation, and a scaling factor. Here we'll rotate the image around its center by 15 degrees while scaling by 1.3x." 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "rows, cols, _ = img.shape\n", 150 | "center = (cols / 2, rows / 2)\n", 151 | "angle = 15 # degrees\n", 152 | "scale = 1.3\n", 153 | "rotationMatrix = cv2.getRotationMatrix2D(center, angle, scale)\n", 154 | "plt.imshow(cv2.warpAffine(img, rotationMatrix, (cols, rows)))" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "## Working with image data in byte arrays\n", 162 | "\n", 163 | "In many non-batch applications, we won't be actually processing _files_; instead, we'll be dealing with binary data, whether passed as a base64-encoded string to a HTTP request or stored in a blob as part of structured data on a stream. 
OpenCV is able to decode this raw binary data just as it is able to decode files; this last part of the notebook will show you how to do it.\n", 164 | "\n", 165 | "We'll start by getting a Python `bytearray` with the contents of a file. Notice that, while we have a JPEG file, we aren't storing the file type anywhere." 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "with open(\"otto.jpg\", \"rb\") as f:\n", 175 | " img_bytes = bytearray(f.read())" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "Now that we have a `bytearray` of the file's contents, we'll convert that into a flat NumPy array:" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "imgarr = np.asarray(img_bytes, dtype=np.uint8)\n", 192 | "imgarr" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": {}, 198 | "source": [ 199 | "The OpenCV `imdecode` function will inspect this flat array and parse it as an image, inferring the right type and dimensions and returning a multidimensional array with an appropriate shape." 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "# decode byte array as image\n", 209 | "img2 = cv2.imdecode(imgarr, cv2.IMREAD_COLOR)\n", 210 | "\n", 211 | "# convert BGR to RGB\n", 212 | "img2 = cv2.cvtColor(img2, cv2.COLOR_BGR2RGB)" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "We then have a multidimensional array that we can use just as we did the image we read from a file." 
220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "plt.imshow(img2)" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "## Image intensities\n", 236 | "\n", 237 | "We can also plot histograms for each channel of the image. (This example code is taken from the [OpenCV documentation](https://docs.opencv.org/3.1.0/d1/db7/tutorial_py_histogram_begins.html).) You can see that the image of the dog is underexposed." 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [ 246 | "for i, color in enumerate([\"r\", \"g\", \"b\"]):\n", 247 | " histogram = cv2.calcHist([img], [i], None, [256], [0, 256])\n", 248 | " plt.plot(histogram, color=color)\n", 249 | " plt.xlim([0, 256])\n", 250 | "plt.show()" 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": {}, 256 | "source": [ 257 | "# Object detection with pretrained models\n", 258 | "\n", 259 | "Now that we've seen how to use some of the basic capabilities of OpenCV to parse image data into a matrix of pixels -- and then to perform useful image transformations and analyses on this matrix -- we're ready to see how to use a pretrained model to identify objects in real images.\n", 260 | "\n", 261 | "We'll use a pretrained [YOLO](https://pjreddie.com/darknet/yolo/) (\"you only look once\") model and we'll load and score that model with the [darkflow](https://github.com/thtrieu/darkflow/) library, which is built on TensorFlow.\n", 262 | "\n", 263 | "One of the key themes of this workshop is that you don't need a deep understanding of the techniques behind off-the-shelf models for language processing or image recognition in order to make use of them in your applications, but YOLO is a cool technique, so if you want to learn more about it, here's where to get started:\n", 264 | 
"\n", 265 | "- [this paper](https://pjreddie.com/media/files/papers/yolo_1.pdf) explains the first version of YOLO and the basic technique,\n", 266 | "- [this presentation](https://www.youtube.com/watch?v=NM6lrxy0bxs) presents the basics of the paper in a thirteen-minute video, and\n", 267 | "- [this paper](http://homepages.inf.ed.ac.uk/ckiw/postscript/ijcv_voc09.pdf) provides a deeper dive into object detection (including some details on the mAP metric for evaluating classifier quality)\n", 268 | "\n", 269 | "YOLO is so-called because previous object-detection techniques repeatedly ran image classifiers on multiple overlapping windows of an image; by contrast, YOLO \"only looks once,\" identifying image regions that might contain an interesting object and then identifying which objects those regions might contain in a single pass. It can be much faster than classic approaches; indeed, it can run in real time or faster with GPU acceleration.\n", 270 | "\n", 271 | "## Loading our model\n", 272 | "\n", 273 | "We'll start by loading a pretrained model architecture and model weights from files:" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [ 282 | "from darkflow.net.build import TFNet\n", 283 | "options = {\"model\": \"cfg/yolo.cfg\", \"load\": \"/data/yolo.weights\", \"threshold\" : 0.1}\n", 284 | "yolo = TFNet(options)\n" 285 | ] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "metadata": {}, 290 | "source": [ 291 | "Our next step is to use the model to identify some objects in an image. We'll start with the dog image. The `return_predict` method will return a list of predictions, each with a visual object class, a confidence score, and a bounding box." 
292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": null, 297 | "metadata": {}, 298 | "outputs": [], 299 | "source": [ 300 | "predictions = yolo.return_predict(img)\n", 301 | "predictions" 302 | ] 303 | }, 304 | { 305 | "cell_type": "markdown", 306 | "metadata": {}, 307 | "source": [ 308 | "To be fair, most dogs spend a lot of time on sofas.\n", 309 | "\n", 310 | "It is often useful to visualize what parts of the image were identified as objects. We can use OpenCV to annotate the bounding boxes of each identified object in the image with the `cv2.rectangle` function. Since this is destructive, we'll work on a copy of the image." 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": null, 316 | "metadata": {}, 317 | "outputs": [], 318 | "source": [ 319 | "def annotate(img, predictions, thickness=None):\n", 320 | " \"\"\" Copies the supplied image and annotates it with the bounding\n", 321 | " boxes of each identified object \"\"\"\n", 322 | " annotated_img = np.copy(img)\n", 323 | " \n", 324 | " if thickness is None:\n", 325 | " thickness = int(max(img.shape[0], img.shape[1]) / 100)\n", 326 | " \n", 327 | " for prediction in predictions:\n", 328 | " tl = prediction[\"topleft\"]\n", 329 | " topleft = (tl[\"x\"], tl[\"y\"])\n", 330 | " br = prediction[\"bottomright\"]\n", 331 | " bottomright = (br[\"x\"], br[\"y\"])\n", 332 | " # draw a white rectangle around the identified object\n", 333 | " white = (255,255,255)\n", 334 | " cv2.rectangle(annotated_img, topleft, bottomright, color=white, thickness=thickness)\n", 335 | "\n", 336 | " return annotated_img" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": null, 342 | "metadata": { 343 | "scrolled": true 344 | }, 345 | "outputs": [], 346 | "source": [ 347 | "plt.imshow(annotate(img, predictions))" 348 | ] 349 | }, 350 | { 351 | "cell_type": "markdown", 352 | "metadata": {}, 353 | "source": [ 354 | "## Trying it out with other images\n", 355 | 
"\n", 356 | "We can try this technique out with other images as well. The test images we have are from the [Open Images Dataset](https://storage.googleapis.com/openimages/web/index.html) and are licensed under CC-BY-SA. Some of these results are impressive and some are unintentionally hilarious! Try it out and see if you can figure out why certain false positives show up." 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": null, 362 | "metadata": {}, 363 | "outputs": [], 364 | "source": [ 365 | "from ipywidgets import interact\n", 366 | "from os import listdir\n", 367 | "\n", 368 | "def predict(imageFile):\n", 369 | " image = loadImage(\"/data/images/\" + imageFile)\n", 370 | " predictions = yolo.return_predict(image)\n", 371 | " plt.imshow(annotate(image, predictions, thickness=5))\n", 372 | " return predictions\n", 373 | "\n", 374 | "interact(predict, imageFile = listdir(\"/data/images/\"))" 375 | ] 376 | }, 377 | { 378 | "cell_type": "markdown", 379 | "metadata": {}, 380 | "source": [ 381 | "# Training custom models\n", 382 | "\n", 383 | "It's outside the scope of this workshop (both in terms of time and content), but you've actually learned a lot of skills in this notebook that are applicable to training custom object detection models (e.g., to identify new kinds of objects). Here's how you can get started.\n", 384 | "\n", 385 | "1. You'll need some labeled data; for object detection, this is going to be image files annotated with the bounding boxes and object classes of real-world object pictured in those images. Good places to start are the [Pascal VOC 2012 dataset](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/index.html#data) or the [COCO dataset](http://cocodataset.org/).\n", 386 | "2. If you want to identify new object classes (for example, a corporate logo), you'll need to add labeled images that contain these object classes. 
Since you may not have many example images for the new object classes, you may want to generate synthetic images to augment your training set; there are [many approaches ranging from rotating and scaling input data to using neural networks to generate new examples](https://arxiv.org/pdf/1712.04621.pdf). You already know how to transform and rotate images, of course!\n", 387 | "3. Actually training the model will depend on what framework you ultimately want to use for the project; [here are the instructions for Darkflow](https://github.com/thtrieu/darkflow#training-on-your-own-dataset)." 388 | ] 389 | } 390 | ], 391 | "metadata": { 392 | "kernelspec": { 393 | "display_name": "Python 3", 394 | "language": "python", 395 | "name": "python3" 396 | }, 397 | "language_info": { 398 | "codemirror_mode": { 399 | "name": "ipython", 400 | "version": 3 401 | }, 402 | "file_extension": ".py", 403 | "mimetype": "text/x-python", 404 | "name": "python", 405 | "nbconvert_exporter": "python", 406 | "pygments_lexer": "ipython3", 407 | "version": "3.6.3" 408 | } 409 | }, 410 | "nbformat": 4, 411 | "nbformat_minor": 2 412 | } 413 | -------------------------------------------------------------------------------- /notebooks/otto.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/otto.jpg -------------------------------------------------------------------------------- /notebooks/preprocess-reviews.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sqlite3 4 | import json 5 | import sys 6 | import re 7 | 8 | usage = """ 9 | USAGE: ./preprocess-reviews.py database.sqlite [rating] [limit] 10 | Assumes that database.sqlite is a SQLite file with a table called "reviews" that has fields called "score" and "text". 
11 | An example such database is available from the public domain Amazon fine foods review dataset: 12 | 13 | https://www.kaggle.com/snap/amazon-fine-food-reviews/ 14 | 15 | This script will extract all reviews with the specified rating, or 5 if no rating is supplied. 16 | """ 17 | 18 | connection = sqlite3.connect(sys.argv[1]) 19 | if len(sys.argv) < 2: 20 | rating = 5 21 | limit = None 22 | elif len(sys.argv) < 3: 23 | rating = int(sys.argv[2]) 24 | limit = None 25 | else: 26 | rating = int(sys.argv[2]) 27 | limit = int(sys.argv[3]) 28 | 29 | cursor = connection.execute("SELECT score, text FROM reviews WHERE score = ?", str(rating)) 30 | if limit is None: 31 | limit = cursor.rowcount 32 | 33 | for row in cursor: 34 | print(re.sub('<[^<]+?>', '', row[1])) 35 | limit = limit - 1 36 | if limit == 0: 37 | break 38 | -------------------------------------------------------------------------------- /notebooks/reviews-1.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/reviews-1.txt.gz -------------------------------------------------------------------------------- /notebooks/reviews-5-100k.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/reviews-5-100k.txt.gz -------------------------------------------------------------------------------- /notebooks/social-firehose.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Getting started processing Kafka with Spark\n", 8 | "\n", 9 | "The first thing we'll need to do is tell Spark where to find the Kafka driver before we set Spark up. Currently, our notebook images are built against Spark 2.2. 
If you're using this with a different version of Spark, be sure to change `SPARK_VERSION` in the cell below before executing it." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import os\n", 19 | "SPARK_VERSION=\"2.2.0\"\n", 20 | "os.environ[\"PYSPARK_SUBMIT_ARGS\"] = \"--packages org.apache.spark:spark-sql-kafka-0-10_2.11:%s pyspark-shell\" % SPARK_VERSION" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "Next up, we'll connect to Spark by establishing a `SparkSession`." 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "import pyspark\n", 37 | "\n", 38 | "from pyspark.sql import SparkSession\n", 39 | "\n", 40 | "spark = SparkSession \\\n", 41 | " .builder \\\n", 42 | " .master(\"local[2]\") \\\n", 43 | " .appName(\"Social Firehose\") \\\n", 44 | " .getOrCreate()" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "We're going to begin by loading the contents of a Kafka topic into a data frame. Because Spark data frames are _lazy_, or recomputed when accessed, this data frame will always have the most recent collection of messages in it." 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 3, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "df = spark \\\n", 61 | " .read \\\n", 62 | " .format(\"kafka\") \\\n", 63 | " .option(\"kafka.bootstrap.servers\", \"kafka.kafka.svc:9092\") \\\n", 64 | " .option(\"subscribe\", \"social-firehose\") \\\n", 65 | " .load()" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "We can see that this data frame always has the most recent collection of messages by running the `count()` action on it twice with a short delay in the middle. 
Note how many messages are generated in ten seconds:" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 4, 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "data": { 82 | "text/plain": [ 83 | "(1026698, 1026716)" 84 | ] 85 | }, 86 | "execution_count": 4, 87 | "metadata": {}, 88 | "output_type": "execute_result" 89 | } 90 | ], 91 | "source": [ 92 | "import time\n", 93 | "a = df.count()\n", 94 | "time.sleep(10)\n", 95 | "b = df.count()\n", 96 | "(a, b)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "We can inspect the first few messages, but they'll be in a pretty raw format." 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 5, 109 | "metadata": {}, 110 | "outputs": [ 111 | { 112 | "data": { 113 | "text/plain": [ 114 | "[Row(key=None, value=bytearray(b'{\"text\": \"The furniture was in all probability have gained some news of them; and till we know that she ever should receive another so perfectly gratifying in the occasion and the style. #retweet #yolo #ff\", \"user_id\": \"9086078734\", \"update_id\": \"00000000000000000000\"}'), topic='social-firehose', partition=0, offset=0, timestamp=datetime.datetime(2018, 5, 1, 13, 55, 35, 78000), timestampType=0),\n", 115 | " Row(key=None, value=bytearray(b'{\"text\": \"After this period every appearance of equal permanency. #health\", \"user_id\": \"3082369400\", \"update_id\": \"00000000000000000001\"}'), topic='social-firehose', partition=0, offset=1, timestamp=datetime.datetime(2018, 5, 1, 13, 55, 35, 273000), timestampType=0),\n", 116 | " Row(key=None, value=bytearray(b'{\"text\": \"Worse than all! 
#health #news\", \"user_id\": \"7761320665\", \"update_id\": \"00000000000000000002\"}'), topic='social-firehose', partition=0, offset=2, timestamp=datetime.datetime(2018, 5, 1, 13, 55, 35, 343000), timestampType=0)]" 117 | ] 118 | }, 119 | "execution_count": 5, 120 | "metadata": {}, 121 | "output_type": "execute_result" 122 | } 123 | ], 124 | "source": [ 125 | "df.take(3)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "Now we'll import some functions and types from the Spark library so we can do something more useful with our data set." 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 6, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "from pyspark.sql.functions import explode\n", 142 | "from pyspark.sql.functions import split\n", 143 | "from pyspark.sql.functions import from_json\n", 144 | "from pyspark.sql.functions import column\n", 145 | "from pyspark.sql.types import StringType\n", 146 | "from pyspark.sql.types import StructType\n", 147 | "from pyspark.sql.types import StructField" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "The first thing we'll do is extract the JSON payloads of the messages; we'll inspect the first ten as a sanity check." 
155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 7, 160 | "metadata": {}, 161 | "outputs": [ 162 | { 163 | "name": "stdout", 164 | "output_type": "stream", 165 | "text": [ 166 | "+--------------------+\n", 167 | "| value|\n", 168 | "+--------------------+\n", 169 | "|{\"text\": \"The fur...|\n", 170 | "|{\"text\": \"After t...|\n", 171 | "|{\"text\": \"Worse t...|\n", 172 | "|{\"text\": \"She is ...|\n", 173 | "|{\"text\": \"Miss #C...|\n", 174 | "|{\"text\": \"Where p...|\n", 175 | "|{\"text\": \"Miss #H...|\n", 176 | "|{\"text\": \"Why the...|\n", 177 | "|{\"text\": \"The ent...|\n", 178 | "|{\"text\": \"Emma co...|\n", 179 | "+--------------------+\n", 180 | "only showing top 10 rows\n", 181 | "\n" 182 | ] 183 | } 184 | ], 185 | "source": [ 186 | "values = df.select(df[\"value\"].cast(StringType()).alias(\"value\"))\n", 187 | "values.show(10)" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": {}, 193 | "source": [ 194 | "The next thing we'll do is impose some structure on the messages by converting the serialized JSON objects into actual records:\n", 195 | "\n", 196 | "1. First, we'll declare a `StructType` for the structure of our messages (three strings, named `text`, `user_id`, and `update_id`),\n", 197 | "2. Next, we'll convert the JSON strings to structures using the `from_json` dataframe function, and\n", 198 | "3. Finally, we'll `SELECT` the fields of the object so we have something that looks like a flat database tuple." 
199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 8, 204 | "metadata": {}, 205 | "outputs": [ 206 | { 207 | "name": "stdout", 208 | "output_type": "stream", 209 | "text": [ 210 | "+--------------------+----------+--------------------+\n", 211 | "| update_id| user_id| text|\n", 212 | "+--------------------+----------+--------------------+\n", 213 | "|00000000000000000000|9086078734|The furniture was...|\n", 214 | "|00000000000000000001|3082369400|After this period...|\n", 215 | "|00000000000000000002|7761320665|Worse than all! #...|\n", 216 | "|00000000000000000003|2529702535|She is netting he...|\n", 217 | "|00000000000000000004|5580232535|Miss #Crawford wa...|\n", 218 | "|00000000000000000005|2143036217|Where pride and s...|\n", 219 | "|00000000000000000006|1605193990|Miss #Hamilton, n...|\n", 220 | "|00000000000000000007|1250771648|Why they are your...|\n", 221 | "|00000000000000000008|5606455308|The entrance of t...|\n", 222 | "|00000000000000000009|1658432974|Emma could not ma...|\n", 223 | "+--------------------+----------+--------------------+\n", 224 | "only showing top 10 rows\n", 225 | "\n" 226 | ] 227 | } 228 | ], 229 | "source": [ 230 | "structure = StructType([StructField(fn, StringType(), True) for fn in \"text user_id update_id\".split()])\n", 231 | "records = values.select(from_json(values[\"value\"], structure).alias(\"json\")) \\\n", 232 | " .select(column(\"json.update_id\"), column(\"json.user_id\").alias(\"user_id\"), column(\"json.text\"))\n", 233 | "records.show(10)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": { 239 | "collapsed": true 240 | }, 241 | "source": [ 242 | "We can perform database-style aggregations on this data frame, like identifying the users responsible for the most status updates:" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 9, 248 | "metadata": {}, 249 | "outputs": [ 250 | { 251 | "name": "stdout", 252 | "output_type": "stream", 253 | 
"text": [ 254 | "+----------+-----+\n", 255 | "| user_id|count|\n", 256 | "+----------+-----+\n", 257 | "|8804573903| 1088|\n", 258 | "|3380222115| 1075|\n", 259 | "|0505683347| 1073|\n", 260 | "|3000579014| 1069|\n", 261 | "|9587912416| 1068|\n", 262 | "|4327469857| 1068|\n", 263 | "|3108294958| 1066|\n", 264 | "|7334906826| 1064|\n", 265 | "|0842976912| 1061|\n", 266 | "|8396961232| 1060|\n", 267 | "|4089612749| 1059|\n", 268 | "|9865588092| 1057|\n", 269 | "|4612190570| 1055|\n", 270 | "|0604401354| 1054|\n", 271 | "|3526538619| 1053|\n", 272 | "|7029874273| 1053|\n", 273 | "|5104535196| 1052|\n", 274 | "|8598111212| 1051|\n", 275 | "|4165442352| 1051|\n", 276 | "|6896171244| 1051|\n", 277 | "+----------+-----+\n", 278 | "only showing top 20 rows\n", 279 | "\n" 280 | ] 281 | } 282 | ], 283 | "source": [ 284 | "user_counts = records.groupBy(\"user_id\").count().orderBy(\"count\", ascending=False)\n", 285 | "user_counts.show()" 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "metadata": {}, 291 | "source": [ 292 | "If you run that query several times with a short delay in between, you may get different results since the data frame will reflect newly-arriving messages. Try it out!\n", 293 | "\n", 294 | "We can also count the number of users who have issued status updates (because of how we're generating the synthetic stream of updates, there is an upper bound on this number):" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 10, 300 | "metadata": {}, 301 | "outputs": [ 302 | { 303 | "data": { 304 | "text/plain": [ 305 | "19340" 306 | ] 307 | }, 308 | "execution_count": 10, 309 | "metadata": {}, 310 | "output_type": "execute_result" 311 | } 312 | ], 313 | "source": [ 314 | "records.select(\"user_id\").distinct().count()" 315 | ] 316 | }, 317 | { 318 | "cell_type": "markdown", 319 | "metadata": {}, 320 | "source": [ 321 | "We can also identify the most prolix users. 
You probably have some social media connections who take advantage of every extra bit of character limit; a query like this will show you who they are!" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 11, 327 | "metadata": {}, 328 | "outputs": [ 329 | { 330 | "name": "stdout", 331 | "output_type": "stream", 332 | "text": [ 333 | "+----------+---------------+\n", 334 | "| user_id|avg(update_len)|\n", 335 | "+----------+---------------+\n", 336 | "|0630541675| 330.0|\n", 337 | "|0391736751| 316.0|\n", 338 | "|2506022037| 307.0|\n", 339 | "|5821398050| 302.0|\n", 340 | "|1410371328| 299.0|\n", 341 | "|1412562247| 284.0|\n", 342 | "|7790898195| 277.0|\n", 343 | "|0223724881| 276.0|\n", 344 | "|7578073215| 275.0|\n", 345 | "|6407600570| 274.0|\n", 346 | "|3076549109| 273.0|\n", 347 | "|7999575942| 273.0|\n", 348 | "|4408278586| 272.0|\n", 349 | "|8725814325| 272.0|\n", 350 | "|4764843224| 269.0|\n", 351 | "|7652850466| 269.0|\n", 352 | "|1036217429| 268.0|\n", 353 | "|1432169939| 268.0|\n", 354 | "|5449920443| 268.0|\n", 355 | "|2160446304| 267.5|\n", 356 | "+----------+---------------+\n", 357 | "only showing top 20 rows\n", 358 | "\n" 359 | ] 360 | } 361 | ], 362 | "source": [ 363 | "from pyspark.sql.functions import length\n", 364 | "user_loquacity = records.select(column(\"user_id\"), length(\"text\").alias(\"update_len\")) \\\n", 365 | " .groupBy(\"user_id\") \\\n", 366 | " .avg() \\\n", 367 | " .orderBy(\"avg(update_len)\", ascending=False)\n", 368 | "user_loquacity.show()" 369 | ] 370 | }, 371 | { 372 | "cell_type": "markdown", 373 | "metadata": {}, 374 | "source": [ 375 | "We can also identify the most popular hashtags in users' updates. We'll start by turning each update into an array of words. 
Then we'll explode each array into multiple rows, so that each row has a separate, single element, i.e.\n", 376 | "\n", 377 | "```\n", 378 | "1, 2, \"foo bar blah\"\n", 379 | "```\n", 380 | "\n", 381 | "would become\n", 382 | "\n", 383 | "```\n", 384 | "1, 2, [foo, bar, blah]\n", 385 | "```\n", 386 | "\n", 387 | "which would become\n", 388 | "\n", 389 | "```\n", 390 | "1, 2, foo\n", 391 | "1, 2, bar\n", 392 | "1, 2, blah\n", 393 | "```\n", 394 | "\n", 395 | "We'll then filter for hashtags (keeping only words starting with `#`) so we can find the most popular!\n" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 12, 401 | "metadata": {}, 402 | "outputs": [], 403 | "source": [ 404 | "words = records.select(explode(split(\"text\", \" \")).alias(\"word\"))\n", 405 | "hashtags = words.filter(column(\"word\").startswith(\"#\"))" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": 13, 411 | "metadata": {}, 412 | "outputs": [ 413 | { 414 | "name": "stdout", 415 | "output_type": "stream", 416 | "text": [ 417 | "+-----------+\n", 418 | "| word|\n", 419 | "+-----------+\n", 420 | "| The|\n", 421 | "| furniture|\n", 422 | "| was|\n", 423 | "| in|\n", 424 | "| all|\n", 425 | "|probability|\n", 426 | "| have|\n", 427 | "| gained|\n", 428 | "| some|\n", 429 | "| news|\n", 430 | "| of|\n", 431 | "| them;|\n", 432 | "| and|\n", 433 | "| till|\n", 434 | "| we|\n", 435 | "| know|\n", 436 | "| that|\n", 437 | "| she|\n", 438 | "| ever|\n", 439 | "| should|\n", 440 | "+-----------+\n", 441 | "only showing top 20 rows\n", 442 | "\n" 443 | ] 444 | } 445 | ], 446 | "source": [ 447 | "words.show()" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": 14, 453 | "metadata": {}, 454 | "outputs": [ 455 | { 456 | "name": "stdout", 457 | "output_type": "stream", 458 | "text": [ 459 | "+-------------+\n", 460 | "| word|\n", 461 | "+-------------+\n", 462 | "| #retweet|\n", 463 | "| #yolo|\n", 464 | "| #ff|\n", 465 | "| 
#health|\n", 466 | "| #health|\n", 467 | "| #news|\n", 468 | "| #ff|\n", 469 | "| #Crawford|\n", 470 | "| #news|\n", 471 | "| #fail|\n", 472 | "| #retweet|\n", 473 | "|#followfriday|\n", 474 | "| #news|\n", 475 | "| #Hamilton,|\n", 476 | "| #MrsSmith,|\n", 477 | "| #MrsSmith|\n", 478 | "| #MrsSmith|\n", 479 | "| #MrsSmith|\n", 480 | "| #Bath,|\n", 481 | "| #Anne|\n", 482 | "+-------------+\n", 483 | "only showing top 20 rows\n", 484 | "\n" 485 | ] 486 | } 487 | ], 488 | "source": [ 489 | "hashtags.show()" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": 15, 495 | "metadata": {}, 496 | "outputs": [ 497 | { 498 | "name": "stdout", 499 | "output_type": "stream", 500 | "text": [ 501 | "+----------+-----+\n", 502 | "| word|count|\n", 503 | "+----------+-----+\n", 504 | "| #first|15356|\n", 505 | "| #one|13781|\n", 506 | "| #two|11384|\n", 507 | "|#Elizabeth|10312|\n", 508 | "| #Fanny| 8513|\n", 509 | "| #Anne| 5927|\n", 510 | "|#Catherine| 5719|\n", 511 | "| #Marianne| 4842|\n", 512 | "| #Amazon| 4779|\n", 513 | "| #Crawford| 4468|\n", 514 | "| #Emma| 4178|\n", 515 | "| #Elinor| 3973|\n", 516 | "| #Jane| 3838|\n", 517 | "| #Weston| 3834|\n", 518 | "| #Darcy| 3793|\n", 519 | "| #Bennet| 3620|\n", 520 | "| #half| 3521|\n", 521 | "| #three| 3477|\n", 522 | "| #second| 3459|\n", 523 | "| #Thomas| 3289|\n", 524 | "+----------+-----+\n", 525 | "only showing top 20 rows\n", 526 | "\n" 527 | ] 528 | } 529 | ], 530 | "source": [ 531 | "hashtag_counts = hashtags.groupBy(\"word\").count().orderBy(\"count\", ascending=False)\n", 532 | "hashtag_counts.show()" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": null, 538 | "metadata": {}, 539 | "outputs": [], 540 | "source": [] 541 | } 542 | ], 543 | "metadata": { 544 | "kernelspec": { 545 | "display_name": "Python 3", 546 | "language": "python", 547 | "name": "python3" 548 | }, 549 | "language_info": { 550 | "codemirror_mode": { 551 | "name": "ipython", 552 | "version": 3 553 | }, 
554 | "file_extension": ".py", 555 | "mimetype": "text/x-python", 556 | "name": "python", 557 | "nbconvert_exporter": "python", 558 | "pygments_lexer": "ipython3", 559 | "version": "3.6.3" 560 | } 561 | }, 562 | "nbformat": 4, 563 | "nbformat_minor": 2 564 | } 565 | -------------------------------------------------------------------------------- /notebooks/tensorflow-1.6.0-cp36-cp36m-linux_x86_64.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/notebooks/tensorflow-1.6.0-cp36-cp36m-linux_x86_64.whl -------------------------------------------------------------------------------- /resources.yaml: -------------------------------------------------------------------------------- 1 | kind: List 2 | apiVersion: v1 3 | metadata: 4 | name: streaming-lab-list 5 | description: a custom object list for the radanalytics.io streaming lab 6 | 7 | items: 8 | 9 | - apiVersion: v1 10 | kind: ServiceAccount 11 | metadata: 12 | name: oshinko 13 | annotations: 14 | serviceaccounts.openshift.io/oauth-redirectreference.first: '{"kind":"OAuthRedirectReference","apiVersion":"v1","reference":{"kind":"Route","name":"oshinko-web-oaproxy"}}' 15 | 16 | - apiVersion: v1 17 | kind: RoleBinding 18 | metadata: 19 | name: oshinko-edit 20 | roleRef: 21 | name: edit 22 | subjects: 23 | - kind: ServiceAccount 24 | name: oshinko 25 | 26 | - apiVersion: v1 27 | kind: ConfigMap 28 | metadata: 29 | name: streaming-lab 30 | data: 31 | sparkimage: docker.io/elmiko/openshift-spark:2.2.1-streaming-lab 32 | spark-defaults.conf: | 33 | spark.jars.packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.2.1 34 | 35 | - apiVersion: v1 36 | kind: Template 37 | labels: 38 | application: oshinko-python-spark 39 | createdBy: template-oshinko-python-spark-build-dc 40 | metadata: 41 | annotations: 42 | description: Create a buildconfig, imagestream and deploymentconfig using 
source-to-image and Python Spark source files hosted in git 43 | openshift.io/display-name: Apache Spark Python 44 | name: oshinko-python-spark-build-dc 45 | objects: 46 | - apiVersion: v1 47 | kind: ImageStream 48 | metadata: 49 | name: ${APPLICATION_NAME} 50 | labels: 51 | app: ${APPLICATION_NAME} 52 | spec: 53 | dockerImageRepository: ${APPLICATION_NAME} 54 | tags: 55 | - name: latest 56 | - apiVersion: v1 57 | kind: BuildConfig 58 | metadata: 59 | name: ${APPLICATION_NAME} 60 | labels: 61 | app: ${APPLICATION_NAME} 62 | spec: 63 | output: 64 | to: 65 | kind: ImageStreamTag 66 | name: ${APPLICATION_NAME}:latest 67 | source: 68 | contextDir: ${CONTEXT_DIR} 69 | git: 70 | ref: ${GIT_REF} 71 | uri: ${GIT_URI} 72 | type: Git 73 | strategy: 74 | sourceStrategy: 75 | env: 76 | - name: APP_FILE 77 | value: ${APP_FILE} 78 | forcePull: true 79 | from: 80 | kind: DockerImage 81 | name: docker.io/elmiko/radanalytics-pyspark:2.2.1-streaming-lab 82 | type: Source 83 | triggers: 84 | - imageChange: {} 85 | type: ImageChange 86 | - type: ConfigChange 87 | - github: 88 | secret: ${APPLICATION_NAME} 89 | type: GitHub 90 | - generic: 91 | secret: ${APPLICATION_NAME} 92 | type: Generic 93 | - apiVersion: v1 94 | kind: DeploymentConfig 95 | metadata: 96 | name: ${APPLICATION_NAME} 97 | labels: 98 | deploymentConfig: ${APPLICATION_NAME} 99 | app: ${APPLICATION_NAME} 100 | spec: 101 | replicas: 1 102 | selector: 103 | deploymentConfig: ${APPLICATION_NAME} 104 | strategy: 105 | type: Rolling 106 | template: 107 | metadata: 108 | labels: 109 | deploymentConfig: ${APPLICATION_NAME} 110 | app: ${APPLICATION_NAME} 111 | spec: 112 | containers: 113 | - env: 114 | - name: OSHINKO_CLUSTER_NAME 115 | value: ${OSHINKO_CLUSTER_NAME} 116 | - name: APP_ARGS 117 | value: ${APP_ARGS} 118 | - name: SPARK_OPTIONS 119 | value: ${SPARK_OPTIONS} 120 | - name: OSHINKO_DEL_CLUSTER 121 | value: ${OSHINKO_DEL_CLUSTER} 122 | - name: APP_EXIT 123 | value: "true" 124 | - name: OSHINKO_NAMED_CONFIG 125 | value: 
${OSHINKO_NAMED_CONFIG} 126 | - name: OSHINKO_SPARK_DRIVER_CONFIG 127 | value: ${OSHINKO_SPARK_DRIVER_CONFIG} 128 | - name: POD_NAME 129 | valueFrom: 130 | fieldRef: 131 | fieldPath: metadata.name 132 | image: ${APPLICATION_NAME} 133 | imagePullPolicy: IfNotPresent 134 | name: ${APPLICATION_NAME} 135 | resources: {} 136 | terminationMessagePath: /dev/termination-log 137 | volumeMounts: 138 | - mountPath: /etc/podinfo 139 | name: podinfo 140 | readOnly: false 141 | dnsPolicy: ClusterFirst 142 | restartPolicy: Always 143 | serviceAccount: oshinko 144 | volumes: 145 | - downwardAPI: 146 | items: 147 | - fieldRef: 148 | fieldPath: metadata.labels 149 | path: labels 150 | name: podinfo 151 | triggers: 152 | - imageChangeParams: 153 | automatic: true 154 | containerNames: 155 | - ${APPLICATION_NAME} 156 | from: 157 | kind: ImageStreamTag 158 | name: ${APPLICATION_NAME}:latest 159 | type: ImageChange 160 | - type: ConfigChange 161 | - apiVersion: v1 162 | kind: Service 163 | metadata: 164 | name: ${APPLICATION_NAME} 165 | labels: 166 | app: ${APPLICATION_NAME} 167 | spec: 168 | ports: 169 | - name: 8080-tcp 170 | port: 8080 171 | protocol: TCP 172 | targetPort: 8080 173 | selector: 174 | deploymentConfig: ${APPLICATION_NAME} 175 | parameters: 176 | - description: 'The name to use for the buildconfig, imagestream and deployment components' 177 | from: 'python-spark-[a-z0-9]{4}' 178 | generate: expression 179 | name: APPLICATION_NAME 180 | required: true 181 | - description: The URL of the repository with your application source code 182 | displayName: Git Repository URL 183 | name: GIT_URI 184 | - description: Optional branch, tag or commit 185 | displayName: Git Reference 186 | name: GIT_REF 187 | - description: Git sub-directory path 188 | name: CONTEXT_DIR 189 | - description: The name of the main py file to run. If this is not specified and there is a single py file at top level of the git respository, that file will be chosen. 
190 | name: APP_FILE 191 | - description: Command line arguments to pass to the Spark application 192 | name: APP_ARGS 193 | - description: List of additional Spark options to pass to spark-submit (for example --conf property=value --conf property=value). Note, --master and --class are set by the launcher and should not be set here 194 | name: SPARK_OPTIONS 195 | - description: The name of the Spark cluster to run against. The cluster will be created if it does not exist, and a random cluster name will be chosen if this value is left blank. 196 | name: OSHINKO_CLUSTER_NAME 197 | - description: The name of a stored cluster configuration to use if a cluster is created, default is 'default'. 198 | name: OSHINKO_NAMED_CONFIG 199 | value: streaming-lab 200 | - description: The name of a configmap to use for the Spark configuration of the driver. If this configmap is empty the default Spark configuration will be used. 201 | name: OSHINKO_SPARK_DRIVER_CONFIG 202 | value: streaming-lab 203 | - description: If a cluster is created on-demand, delete the cluster when the application finishes if this option is set to 'true' 204 | name: OSHINKO_DEL_CLUSTER 205 | required: true 206 | value: 'true' 207 | 208 | - apiVersion: v1 209 | kind: Template 210 | template: oshinko-webui 211 | metadata: 212 | name: oshinko-webui 213 | objects: 214 | - kind: Service 215 | apiVersion: v1 216 | metadata: 217 | name: ${OSHINKO_WEB_NAME}-proxy 218 | labels: 219 | name: ${OSHINKO_WEB_NAME}-proxy 220 | spec: 221 | ports: 222 | - name: oc-proxy-port 223 | protocol: TCP 224 | port: 8001 225 | targetPort: 8001 226 | selector: 227 | name: ${OSHINKO_WEB_NAME} 228 | - kind: Service 229 | apiVersion: v1 230 | metadata: 231 | name: ${OSHINKO_WEB_NAME} 232 | labels: 233 | name: ${OSHINKO_WEB_NAME} 234 | spec: 235 | ports: 236 | - name: o-web-port 237 | protocol: TCP 238 | port: 8080 239 | targetPort: 8080 240 | selector: 241 | name: ${OSHINKO_WEB_NAME} 242 | - kind: Route 243 | apiVersion: v1 244 | 
metadata: 245 | name: ${OSHINKO_WEB_NAME} 246 | spec: 247 | host: ${OSHINKO_WEB_ROUTE_HOSTNAME} 248 | path: /webui 249 | to: 250 | kind: Service 251 | name: ${OSHINKO_WEB_NAME} 252 | alternateBackends: 253 | - kind: Service 254 | name: ${OSHINKO_WEB_NAME} 255 | - kind: DeploymentConfig 256 | apiVersion: v1 257 | metadata: 258 | name: ${OSHINKO_WEB_NAME} 259 | spec: 260 | strategy: 261 | type: Rolling 262 | triggers: 263 | - type: ConfigChange 264 | replicas: 1 265 | selector: 266 | name: ${OSHINKO_WEB_NAME} 267 | template: 268 | metadata: 269 | labels: 270 | name: ${OSHINKO_WEB_NAME} 271 | spec: 272 | containers: 273 | - name: ${OSHINKO_WEB_NAME} 274 | image: ${OSHINKO_WEB_IMAGE} 275 | imagePullPolicy: Always 276 | ports: 277 | - name: o-web-port 278 | containerPort: 8080 279 | protocol: TCP 280 | env: 281 | - name: SPARK_DEFAULT 282 | value: docker.io/elmiko/openshift-spark:2.2.1-streaming-lab 283 | - name: OSHINKO_REFRESH_INTERVAL 284 | value: ${OSHINKO_REFRESH_INTERVAL} 285 | - name: WEB_ROUTE_NAME 286 | value: ${OSHINKO_WEB_NAME} 287 | - name: INSECURE_WEBUI 288 | value: "true" 289 | - name: CURRENT_NAMESPACE 290 | valueFrom: 291 | fieldRef: 292 | fieldPath: metadata.namespace 293 | readinessProbe: 294 | failureThreshold: 3 295 | httpGet: 296 | path: /webui 297 | port: 8080 298 | scheme: HTTP 299 | periodSeconds: 10 300 | successThreshold: 1 301 | timeoutSeconds: 1 302 | initialDelaySeconds: 20 303 | livenessProbe: 304 | failureThreshold: 3 305 | httpGet: 306 | path: /webui 307 | port: 8080 308 | scheme: HTTP 309 | periodSeconds: 10 310 | successThreshold: 1 311 | timeoutSeconds: 1 312 | initialDelaySeconds: 20 313 | - name: oc-proxy 314 | image: radanalyticsio/oc-proxy:stable 315 | imagePullPolicy: IfNotPresent 316 | args: 317 | - proxy 318 | - "-p" 319 | - '8001' 320 | - "--address=0.0.0.0" 321 | - "--disable-filter=true" 322 | - "--api-prefix=/proxy" 323 | ports: 324 | - name: oc-proxy-port 325 | containerPort: 8001 326 | protocol: TCP 327 | serviceAccount: 
oshinko 328 | parameters: 329 | - name: OSHINKO_WEB_NAME 330 | description: Name of the oshinko web service 331 | value: "oshinko-web" 332 | - name: OSHINKO_WEB_IMAGE 333 | description: Full name of the oshinko web image 334 | required: true 335 | value: radanalyticsio/oshinko-webui:stable 336 | - name: OSHINKO_WEB_ROUTE_HOSTNAME 337 | description: The hostname used to create the external route for the webui 338 | - name: OSHINKO_REFRESH_INTERVAL 339 | value: "5" 340 | description: Refresh interval for updating cluster list in seconds 341 | 342 | - apiVersion: v1 343 | kind: Template 344 | template: streaming-lab-notebook 345 | metadata: 346 | name: streaming-lab-notebook 347 | objects: 348 | - kind: Service 349 | apiVersion: v1 350 | metadata: 351 | name: ${APPLICATION_NAME} 352 | labels: 353 | name: ${APPLICATION_NAME} 354 | spec: 355 | ports: 356 | - protocol: TCP 357 | port: 8888 358 | targetPort: 8888 359 | selector: 360 | name: ${APPLICATION_NAME} 361 | - kind: Route 362 | apiVersion: v1 363 | metadata: 364 | name: ${APPLICATION_NAME} 365 | spec: 366 | host: ${ROUTE_HOSTNAME} 367 | to: 368 | kind: Service 369 | name: ${APPLICATION_NAME} 370 | - kind: DeploymentConfig 371 | apiVersion: v1 372 | metadata: 373 | name: ${APPLICATION_NAME} 374 | spec: 375 | strategy: 376 | type: Rolling 377 | triggers: 378 | - type: ConfigChange 379 | replicas: 1 380 | selector: 381 | name: ${APPLICATION_NAME} 382 | template: 383 | metadata: 384 | labels: 385 | name: ${APPLICATION_NAME} 386 | spec: 387 | containers: 388 | - name: ${APPLICATION_NAME} 389 | image: quay.io/willbenton/streaming-lab-notebooks:madrid-lab 390 | env: 391 | - name: JUPYTER_NOTEBOOK_PASSWORD 392 | value: developer 393 | ports: 394 | - containerPort: 8888 395 | protocol: TCP 396 | parameters: 397 | - name: APPLICATION_NAME 398 | description: the application name 399 | value: streaming-lab-notebook 400 | - name: ROUTE_HOSTNAME 401 | description: a hostname for the route 402 | 
--------------------------------------------------------------------------------
/update-generator/.s2i/bin/assemble:
--------------------------------------------------------------------------------
#!/bin/bash -e

# Execute the default S2I script
source ${STI_SCRIPTS_PATH}/assemble

pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz
--------------------------------------------------------------------------------
/update-generator/README.md:
--------------------------------------------------------------------------------
# update-generator

A Python application that emits synthetic social media updates to an Apache Kafka topic.

## Launching on OpenShift

```
oc new-app centos/python-36-centos7~https://github.com/radanalyticsio/streaming-lab/ \
  --context-dir=update-generator \
  -e KAFKA_BROKERS=my-cluster-kafka:9092 \
  -e KAFKA_TOPIC=social-firehose \
  --name=emitter
```

You will need to adjust the `KAFKA_BROKERS` and `KAFKA_TOPIC` variables to match your configured
Kafka deployment and desired topic.

For now, text corpora and weights are hardcoded to data files bundled in the image.
19 | -------------------------------------------------------------------------------- /update-generator/app.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import os 4 | import time 5 | import urllib 6 | 7 | from kafka import KafkaProducer 8 | 9 | import markovify 10 | import spacy 11 | import numpy 12 | import math 13 | import collections 14 | 15 | import gzip 16 | 17 | import codecs 18 | import json 19 | 20 | def train_markov_gutenberg_txt(fn): 21 | """ trains a Markov model on text data from Project Gutenberg """ 22 | with codecs.open(fn, "r", "cp1252") as f: 23 | text = f.read() 24 | 25 | return markovify.Text(text, retain_original=False, state_size=3) 26 | 27 | def train_markov_gz(fn): 28 | """ trains a Markov model on gzipped text data """ 29 | with gzip.open(fn, "rt", encoding="utf-8") as f: 30 | text = f.read() 31 | return markovify.Text(text, retain_original=False, state_size=3) 32 | 33 | 34 | class UserTable(object): 35 | """ 36 | Table of random user IDs; models two kinds of user: talkative and moderate. 37 | The assumption is that talkative users represent a proportion p of all users 38 | but 1 - p of all utterances. 
39 | """ 40 | def __init__(self, size, weights=[92, 8]): 41 | self._talkative = collections.deque() 42 | self._moderate = collections.deque() 43 | self._size = size 44 | self._cutoff = float(weights[0]) / sum(weights) 45 | 46 | for i in range(size): 47 | new_uid = math.floor(numpy.random.uniform(10 ** 10)) 48 | if numpy.random.uniform() >= self._cutoff: 49 | self._moderate.append(new_uid) 50 | else: 51 | self._talkative.append(new_uid) 52 | 53 | def random_uid(self): 54 | def choose_from(c): 55 | return c[math.floor(numpy.random.uniform() * len(c))] 56 | 57 | if numpy.random.uniform() >= self._cutoff: 58 | return choose_from(self._talkative) 59 | else: 60 | return choose_from(self._moderate) 61 | 62 | import spacy 63 | nlp = spacy.load('en_core_web_sm') 64 | 65 | def make_sentence(model, length=200): 66 | return model.make_short_sentence(length) 67 | 68 | def hashtagify_full(sentence): 69 | doc = nlp(sentence) 70 | for ent in doc.ents: 71 | sentence = sentence.replace(str(ent), "#%s" % str(ent).replace(" ", "")) 72 | return (sentence, ["#%s" % str(ent).replace(" ", "") for ent in doc.ents]) 73 | 74 | def hashtagify(sentence): 75 | result,_ = hashtagify_full(sentence) 76 | return result 77 | 78 | def update_generator(models, weights=None, hashtag_weights=[8, 2], ut=None, seed_hashtags=[]): 79 | if weights is None: 80 | weights = [1] * len(models) 81 | 82 | if ut is None: 83 | ut = UserTable(10000) 84 | 85 | choices = [] 86 | 87 | total_weight = float(sum(weights)) 88 | 89 | for i in range(len(weights)): 90 | choices.append((float(sum(weights[0:i+1])) / total_weight, models[i])) 91 | 92 | def choose_model(): 93 | r = numpy.random.uniform() 94 | for (p, m) in choices: 95 | if r <= p: 96 | return m 97 | return choices[-1][1] 98 | 99 | seen_hashtags = set() 100 | hashtags = [] 101 | total_hashtag_weight = float(sum(hashtag_weights)) 102 | for i in range(len(hashtag_weights)): 103 | hashtags.append((float(sum(hashtag_weights[0:i+1])) / total_hashtag_weight, 
collections.deque())) 104 | 105 | iws = [1.0 - w for (w, _) in hashtags] 106 | inverse_weights = [(sum(iws[0:i+1]), i) for _, i in zip(iws, range(len(iws)))] 107 | 108 | def choose_from(c): 109 | idx = math.floor(numpy.random.uniform() * len(c)) 110 | return c[idx] 111 | 112 | def store_hashtag(tag): 113 | if tag not in seen_hashtags: 114 | seen_hashtags.add(str(tag)) 115 | r = numpy.random.uniform() 116 | for(p, deq) in hashtags: 117 | if r <= p: 118 | deq.append(tag) 119 | 120 | def choose_hashtag(): 121 | r = numpy.random.uniform() 122 | for(p, i) in hashtags: 123 | if r <= - p and len(hashtags[i][1]) > 0: 124 | return choose_from(hashtags[i][1]) 125 | return len(hashtags[0][1]) > 0 and choose_from(hashtags[0][1]) or choose_from(hashtags[1][1]) 126 | 127 | for tag in seed_hashtags: 128 | seen_hashtags.add(str(tag)) 129 | hashtags[-1][1].append(str(tag)) 130 | 131 | while True: 132 | tweet, tags = hashtagify_full(make_sentence(choose_model())) 133 | for tag in tags: 134 | store_hashtag(str(tag)) 135 | 136 | this_tweet_tags = set([str(t) for t in tags]) 137 | 138 | if len(seen_hashtags) > 0: 139 | for i in range(min(numpy.random.poisson(3), len(seen_hashtags))): 140 | tag = choose_hashtag() 141 | if str(tag) not in this_tweet_tags: 142 | this_tweet_tags.add(str(tag)) 143 | tweet += " %s" % str(tag) 144 | 145 | yield (ut.random_uid(), tweet) 146 | 147 | 148 | def main(args): 149 | logging.info('brokers={}'.format(args.brokers)) 150 | logging.info('topic={}'.format(args.topic)) 151 | logging.info('rate={}'.format(args.rate)) 152 | logging.info('source={}'.format(args.source)) 153 | 154 | logging.info('creating Markov chains') 155 | 156 | austen_model = train_markov_gutenberg_txt("austen.txt") 157 | negative_model = train_markov_gz("reviews-1.txt.gz") 158 | positive_model = train_markov_gz("reviews-5-100k.txt.gz") 159 | 160 | logging.info('creating update generator') 161 | 162 | seed_hashtags=["#ff", "#marketing", "#fail", "#followfriday", "#yolo", "#retweet", 
"#tbt", "#socialmedia", "#startup", "#blogpost", "#news", "#health"] 163 | ug = update_generator([austen_model, positive_model, negative_model], [22, 4, 4], seed_hashtags=seed_hashtags) 164 | 165 | update_id = 0 166 | 167 | logging.info('creating kafka producer') 168 | producer = KafkaProducer(bootstrap_servers=args.brokers) 169 | 170 | logging.info('sending lines') 171 | while True: 172 | update = {"update_id" : "%020d" % update_id} 173 | update_id += 1 174 | userid, text = next(ug) 175 | update["user_id"] = "%010d" % userid 176 | update["text"] = text 177 | 178 | producer.send(args.topic, bytes(json.dumps(update), "utf-8")) 179 | time.sleep(1.0 / float(args.rate)) 180 | logging.info('finished sending source') 181 | 182 | 183 | def get_arg(env, default): 184 | return os.getenv(env) if os.getenv(env, '') is not '' else default 185 | 186 | 187 | def parse_args(parser): 188 | args = parser.parse_args() 189 | args.brokers = get_arg('KAFKA_BROKERS', args.brokers) 190 | args.topic = get_arg('KAFKA_TOPIC', args.topic) 191 | args.rate = get_arg('RATE', args.rate) 192 | args.source = get_arg('SOURCE_URI', args.source) 193 | return args 194 | 195 | 196 | if __name__ == '__main__': 197 | logging.basicConfig(level=logging.INFO) 198 | logging.info('starting update-generator') 199 | parser = argparse.ArgumentParser(description='emit synthetic social media updates on kafka') 200 | parser.add_argument( 201 | '--brokers', 202 | help='The bootstrap servers, env variable KAFKA_BROKERS', 203 | default='localhost:9092') 204 | parser.add_argument( 205 | '--topic', 206 | help='Topic to publish to, env variable KAFKA_TOPIC', 207 | default='social-firehose') 208 | parser.add_argument( 209 | '--rate', 210 | type=int, 211 | help='Lines per second, env variable RATE', 212 | default=40) 213 | parser.add_argument( 214 | '--source', 215 | help='The source URI for data to emit, env variable SOURCE_URI') 216 | args = parse_args(parser) 217 | main(args) 218 | logging.info('exiting') 219 | 
-------------------------------------------------------------------------------- /update-generator/austen.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/update-generator/austen.txt -------------------------------------------------------------------------------- /update-generator/requirements.txt: -------------------------------------------------------------------------------- 1 | kafka 2 | numpy 3 | spacy 4 | markovify -------------------------------------------------------------------------------- /update-generator/reviews-1.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/update-generator/reviews-1.txt.gz -------------------------------------------------------------------------------- /update-generator/reviews-5-100k.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/radanalyticsio/streaming-lab/29f69bcb835a4b180f6bbabad66a9183aad73c3a/update-generator/reviews-5-100k.txt.gz -------------------------------------------------------------------------------- /update-transformer/README.md: -------------------------------------------------------------------------------- 1 | # update-transformer 2 | 3 | A Python based microservice using Apache Spark to process social media 4 | updates by consuming them from an Apache Kafka topic, applying sentiment 5 | analysis to the text, and then broadcasting the updated message with metadata 6 | on a second topic. 
7 | 8 | ## Quickstart 9 | 10 | ``` 11 | oc new-app --template=oshinko-python-spark-build-dc \ 12 | -p APPLICATION_NAME=transformer \ 13 | -p GIT_URI=https://github.com/radanalyticsio/streaming-lab \ 14 | -p CONTEXT_DIR=update-transformer \ 15 | -p APP_ARGS='--brokers=summit-kafka.kafka.svc:9092 --in-topic=social-firehose --out-topic=sentiments' \ 16 | -p SPARK_OPTIONS='--packages=org.apache.spark:spark-sql-kafka-0-10_2.11:2.2.1' \ 17 | -p OSHINKO_CLUSTER_NAME= 18 | ``` 19 | -------------------------------------------------------------------------------- /update-transformer/app.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import logging 4 | import os 5 | 6 | import pyspark.sql as sql 7 | import pyspark.sql.types as types 8 | import pyspark.sql.functions as functions 9 | import vaderSentiment.vaderSentiment as vader 10 | 11 | 12 | # This code is borrowed from Sparkling Pandas; see here: 13 | # https://github.com/sparklingpandas/sparklingml/blob/627c8f23688397a53e2e9e805e92a54c2be1cf3d/sparklingml/transformation_functions.py#L53 14 | class SpacyMagic(object): 15 | """ 16 | Simple Spacy Magic to minimize loading time. 
17 | >>> SpacyMagic.get("en") 18 | 0: 28 | compounds = [s.get('compound', 0) for s in sentiments] 29 | avg = math.fsum(compounds) / len(sentiments) 30 | else: 31 | avg = 0 32 | return avg 33 | 34 | def update(self, newdata): 35 | self._access_lock.acquire() 36 | self._data['last-seen'] = copy.deepcopy(newdata) 37 | new_cavg = LastData.get_compound_average( 38 | self._data['last-seen'].get('sentiments', [])) 39 | pos_cavg = LastData.get_compound_average( 40 | self._data['most-positive'].get('sentiments', [])) 41 | neg_cavg = LastData.get_compound_average( 42 | self._data['most-negative'].get('sentiments', [])) 43 | if new_cavg >= pos_cavg: 44 | self._data['most-positive'] = self._data['last-seen'] 45 | if new_cavg <= neg_cavg: 46 | self._data['most-negative'] = self._data['last-seen'] 47 | self._access_lock.release() 48 | 49 | def copy(self): 50 | self._access_lock.acquire() 51 | retval = copy.deepcopy(self._data) 52 | self._access_lock.release() 53 | return retval 54 | 55 | 56 | exit_event = threading.Event() 57 | _last_data = LastData() 58 | 59 | 60 | def last_data(update=None): 61 | if update is not None: 62 | _last_data.update(update) 63 | return _last_data.copy() 64 | 65 | 66 | class RootView(views.MethodView): 67 | def get(self): 68 | return json.jsonify(last_data()) 69 | 70 | 71 | def consumer(args): 72 | logging.info('starting kafka consumer') 73 | consumer = kafka.KafkaConsumer(args.topic, bootstrap_servers=args.brokers) 74 | for msg in consumer: 75 | if exit_event.is_set(): 76 | break 77 | try: 78 | last_data(json.loads(str(msg.value, 'utf-8'))) 79 | except Exception as e: 80 | logging.error(e.message) 81 | logging.info('exiting kafka consumer') 82 | 83 | 84 | def get_arg(env, default): 85 | return os.getenv(env) if os.getenv(env, '') is not '' else default 86 | 87 | 88 | def parse_args(parser): 89 | args = parser.parse_args() 90 | args.brokers = get_arg('KAFKA_BROKERS', args.brokers) 91 | args.topic = get_arg('KAFKA_TOPIC', args.topic) 92 | return args 93 
| 94 | 95 | def main(args): 96 | exit_event.clear() 97 | # setup consumer thread 98 | cons = threading.Thread(group=None, target=consumer, args=(args,)) 99 | cons.start() 100 | 101 | # create the flask app object 102 | app = flask.Flask(__name__) 103 | # change this value for production environments 104 | app.config['SECRET_KEY'] = 'secret!' 105 | app.add_url_rule('/', view_func=RootView.as_view('index')) 106 | app.run(host='0.0.0.0', port=8080) 107 | 108 | exit_event.set() 109 | cons.join() 110 | logging.info('exiting flask-kafka-listener') 111 | 112 | 113 | if __name__ == '__main__': 114 | lconfig.dictConfig({ 115 | 'version': 1, 116 | 'formatters': {'default': { 117 | 'format': '[%(asctime)s] %(levelname)s in %(module)s: %(message)s', 118 | }}, 119 | 'handlers': {'wsgi': { 120 | 'class': 'logging.StreamHandler', 121 | 'stream': 'ext://flask.logging.wsgi_errors_stream', 122 | 'formatter': 'default' 123 | }}, 124 | 'root': { 125 | 'level': 'INFO', 126 | 'handlers': ['wsgi'] 127 | } 128 | }) 129 | logging.basicConfig(level=logging.INFO) 130 | logging.info('starting update-visualizer') 131 | parser = argparse.ArgumentParser( 132 | description='listen for some stuff on kafka') 133 | parser.add_argument( 134 | '--brokers', 135 | help='The bootstrap servers, env variable KAFKA_BROKERS', 136 | default='kafka.kafka.svc:9092') 137 | parser.add_argument( 138 | '--topic', 139 | help='Topic to publish to, env variable KAFKA_TOPIC', 140 | default='social-firehose') 141 | args = parse_args(parser) 142 | main(args) 143 | logging.info('exiting') 144 | -------------------------------------------------------------------------------- /update-visualizer/requirements.txt: -------------------------------------------------------------------------------- 1 | flask 2 | kafka 3 | --------------------------------------------------------------------------------