├── .gitignore ├── LICENSE ├── NOTICE.txt ├── README.md ├── data ├── dummyLogFile.txt ├── kibana.jpg └── nycTaxiData.gz ├── pom.xml ├── src └── main │ ├── scala │ └── com │ │ └── dataartisans │ │ └── flink_demo │ │ ├── datatypes │ │ └── TaxiRide.scala │ │ ├── examples │ │ ├── EarlyArrivalCount.scala │ │ ├── SlidingArrivalCount.scala │ │ └── TotalArrivalCount.scala │ │ ├── sinks │ │ └── ElasticsearchUpsertSink.scala │ │ ├── sources │ │ └── TaxiRideSource.scala │ │ └── utils │ │ ├── DemoStreamEnvironment.scala │ │ └── NycGeoUtils.scala │ └── scripts │ └── convertTrips.sh └── tools └── maven └── checkstyle.xml /.gitignore: -------------------------------------------------------------------------------- 1 | .cache 2 | scalastyle-output.xml 3 | .classpath 4 | .idea 5 | .metadata 6 | .settings 7 | .project 8 | .version.properties 9 | filter.properties 10 | target 11 | tmp 12 | *.class 13 | *.iml 14 | *.swp 15 | *.jar 16 | *.log 17 | .DS_Store 18 | _site 19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
203 |
--------------------------------------------------------------------------------
/NOTICE.txt:
--------------------------------------------------------------------------------
1 | Flink Streaming Demo
2 | Copyright 2015 data Artisans GmbH
3 |
4 | This product includes software developed at
5 | data Artisans GmbH, Berlin, Germany (http://www.data-artisans.com).
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Demo Applications for the Apache Flink™ DataStream API
2 |
3 | This repository contains demo applications for [Apache Flink](https://flink.apache.org)'s
4 | [DataStream API](https://ci.apache.org/projects/flink/flink-docs-release-0.10/apis/streaming_guide.html).
5 |
6 | Apache Flink is a scalable open-source streaming dataflow engine with many competitive features.
7 | You can find a list of Flink's features at the bottom of this page. 8 | 9 | ### Run a demo application in your IDE 10 | 11 | You can run all examples in this repository from your IDE and play around with the code.
12 | Requirements:
13 |
14 | - Java JDK 7 (or 8)
15 | - Apache Maven 3.x
16 | - Git
17 | - an IDE with Scala support (we recommend IntelliJ IDEA)
18 |
19 | To run a demo application in your IDE, follow these steps:
20 |
21 | 1. **Clone the repository:** Open a terminal and clone the repository:
22 | `git clone https://github.com/dataArtisans/flink-streaming-demo.git`. Please note that the
23 | repository is about 100MB in size because it includes the input data of our demo applications.
24 |
25 | 2. **Import the project into your IDE:** The repository is a Maven project. Open your IDE and
26 | import the repository as an existing Maven project. This is usually done by selecting the folder that
27 | contains the `pom.xml` file or selecting the `pom.xml` file itself.
28 |
29 | 3. **Start a demo application:** Execute the `main()` method of one of the demo applications, for example
30 | `com.dataartisans.flink_demo.examples.TotalArrivalCount.scala`.
31 | Running an application will start a local Flink instance in the JVM process of your IDE.
32 | You will see Flink's log messages and the output produced by the program printed to standard output.
33 |
34 | 4. **Explore the web dashboard:** The local Flink instance starts a webserver that serves Flink's
35 | dashboard. Open [http://localhost:8081](http://localhost:8081) to access and explore the dashboard.
36 |
37 | ### Demo applications
38 |
39 | #### Taxi event stream
40 |
41 | All demo applications in this repository process a stream of taxi ride events that
42 | originate from a [public data set](http://www.nyc.gov/html/tlc/html/about/trip_record_data.shtml)
43 | of the [New York City Taxi and Limousine Commission](http://www.nyc.gov/html/tlc/html/home/home.shtml)
44 | (TLC). The data set consists of records about taxi trips in New York City from 2009 to 2015.
45 |
46 | We took some of this data and converted it into a data set of taxi ride events by splitting each
47 | trip record into a ride start and a ride end event. The events have the following schema:
48 |
49 | ```
50 | rideId: Long // unique id for each ride
51 | time: DateTime // timestamp of the start/end event
52 | isStart: Boolean // true = ride start, false = ride end
53 | location: GeoPoint // lon/lat of pick-up/drop-off location
54 | passengerCnt: short // number of passengers
55 | travelDist: float // total travel distance, -1 on start events
56 | ```
57 |
58 | A custom `SourceFunction` serves a `DataStream[TaxiRide]` from this data set.
59 | In order to generate the stream as realistically as possible, events are emitted according to their
60 | timestamp. Two events that occurred ten minutes after each other in reality are served ten minutes apart.
61 | A speed-up factor can be specified to "fast-forward" the stream, i.e., with a speed-up factor of 2,
62 | the events would be served five minutes apart. Moreover, you can specify a maximum serving delay
63 | which causes each event to be randomly delayed within the bound to simulate an out-of-order stream
64 | (a delay of 0 seconds results in an ordered stream). All examples operate in event-time mode.
65 | This guarantees consistent results even for historic data or data that is delivered out-of-order.
66 |
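For illustration, this is how the demo programs in this repository create the event-time stream (condensed from `TotalArrivalCount.scala`; all examples use the same parameter values):

```
import com.dataartisans.flink_demo.datatypes.TaxiRide
import com.dataartisans.flink_demo.sources.TaxiRideSource
import com.dataartisans.flink_demo.utils.DemoStreamEnvironment
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._

// local execution environment, operating in event-time mode
val env = DemoStreamEnvironment.env
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

// serve the historic data at 600x speed with at most 60s of artificial delay
val rides: DataStream[TaxiRide] = env.addSource(
  new TaxiRideSource("./data/nycTaxiData.gz", maxDelaySecs = 60, servingSpeed = 600f))
```
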
67 | #### Identify popular locations
68 |
69 | The [`TotalArrivalCount.scala`](/src/main/scala/com/dataartisans/flink_demo/examples/TotalArrivalCount.scala)
70 | program identifies popular locations in New York City.
71 | It ingests the stream of taxi ride events and counts for each location the number of persons that
72 | arrive by taxi.
73 |
74 | #### Identify the popular locations of the last 15 minutes
75 |
76 | The [`SlidingArrivalCount.scala`](/src/main/scala/com/dataartisans/flink_demo/examples/SlidingArrivalCount.scala)
77 | program identifies popular locations of the last 15 minutes.
78 | It ingests the stream of taxi ride records and computes every five minutes the number of
79 | persons that arrived at each location within the last 15 minutes.
80 | This type of computation is known as a sliding window.
81 |
82 |
83 | #### Compute early arrival counts for popular locations
84 |
85 | Some stream processing use cases depend on timely event aggregation, for example to send out notifications or alerts.
86 | The [`EarlyArrivalCount.scala`](/src/main/scala/com/dataartisans/flink_demo/examples/EarlyArrivalCount.scala)
87 | program extends our previous sliding window application. As before, it computes every five minutes
88 | the number of persons that arrived at each location within the last 15 minutes.
89 | In addition, it emits an early partial count whenever a multiple of 50 persons has arrived at a
90 | location, i.e., it emits an updated count when more than 50, 100, 150 (and so on) persons have arrived at a location.
91 |
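The early updates are produced by a custom window `Trigger` (`EarlyCountTrigger` in the source code): it keeps a running passenger count in per-window state and fires an early partial result each time another 50 persons have arrived, while an event-time timer fires and purges the final result at the end of the window. Attaching the trigger to the sliding window looks like this (from `EarlyArrivalCount.scala`):

```
cellIds
  // key stream by cell Id
  .keyBy(_._1)
  // sliding window of 15 minutes, evaluated every 5 minutes
  .timeWindow(Time.minutes(15), Time.minutes(5))
  // emit an early partial count whenever another 50 persons have arrived
  .trigger(new EarlyCountTrigger(50))
```
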
92 | ### Setting up Elasticsearch and Kibana
93 |
94 | The demo applications in this repository are prepared to write their output to [Elasticsearch](https://www.elastic.co/products/elasticsearch).
95 | Data in Elasticsearch can be easily visualized using [Kibana](https://www.elastic.co/products/kibana)
96 | for real-time monitoring and interactive analysis.
97 |
98 | Our demo applications depend on Elasticsearch 1.7.3 and Kibana 4.1.3. Both systems work well
99 | out of the box, and their default configurations are sufficient for our purpose.
100 |
101 | Follow these instructions to set up Elasticsearch and Kibana.
102 |
103 | #### Setup Elasticsearch
104 |
105 | 1. Download Elasticsearch 1.7.3 [here](https://www.elastic.co/downloads/past-releases/elasticsearch-1-7-3).
106 |
107 | 1. Extract the downloaded archive file and enter the extracted directory.
108 |
109 | 1. Start Elasticsearch using the start script: `./bin/elasticsearch`.
110 |
111 | 1. Create an index (here called `nyc-idx`): `curl -XPUT "http://localhost:9200/nyc-idx"`
112 |
113 | 1. Create a schema mapping for the index (here called `popular-locations`):
114 | ```
115 | curl -XPUT "http://localhost:9200/nyc-idx/_mapping/popular-locations" -d'
116 | {
117 | "popular-locations" : {
118 | "properties" : {
119 | "cnt": {"type": "integer"},
120 | "location": {"type": "geo_point"},
121 | "time": {"type": "date"}
122 | }
123 | }
124 | }'
125 | ```
126 | **Note:** This mapping can be used for all demo applications.
127 | 1. Configure a demo application to write its results to Elasticsearch. For that, you have to change the corresponding parameters in the demo application's source code (see the example at the end of this section):
128 | - set `writeToElasticsearch = true`
129 | - set `elasticsearchHost` to the correct host name (see Elasticsearch's log output)
130 |
131 | 1. Run the Flink program to write its result to Elasticsearch.
132 |
133 | To clear the `nyc-idx` index in Elasticsearch, simply drop the mapping with
134 | `curl -XDELETE 'http://localhost:9200/nyc-idx/popular-locations'` and create it again with the previous
135 | command.
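For example, in `TotalArrivalCount.scala` the relevant parameters look as follows (the host name below is a placeholder; copy the name printed in your Elasticsearch log):

```
// Elasticsearch parameters
val writeToElasticsearch = true       // write results to Elasticsearch
val elasticsearchHost = "my-hostname" // placeholder: take the host name from the Elasticsearch log
val elasticsearchPort = 9300          // transport port (not the HTTP port 9200)
```
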
136 |
137 | #### Setup Kibana
138 |
139 | Setting up Kibana and visualizing data that is stored in Elasticsearch is also easy.
140 |
141 | 1. Download Kibana 4.1.3 [here](https://www.elastic.co/downloads/past-releases/kibana-4-1-3).
142 |
143 | 1. Extract the downloaded archive and enter the extracted directory.
144 |
145 | 1. Start Kibana using the start script: `./bin/kibana`.
146 |
147 | 1. Access Kibana by opening [http://localhost:5601](http://localhost:5601) in your browser.
148 |
149 | 1. Configure an index pattern by entering the index name "nyc-idx" and clicking on "Create".
150 | Do not uncheck the "Index contains time-based events" option.
151 |
152 | 1. Click on the "Discover" button at the top of the page. Kibana will tell you "No results found"
153 | because we have to configure the time range of the data to visualize in Kibana. Click on the
154 | "Last 15 minutes" label in the top right corner and enter an absolute time range from 2013-01-01
155 | to 2013-01-06, which is the time range of our taxi ride data stream. You can also configure a
156 | refresh interval to reload the page for updates.
157 |
158 | 1. Click on the “Visualize” button at the top of the page, select "Tile map", and click on "From a
159 | new search".
160 |
161 | 1. Next you need to configure the tile map visualization:
162 |
163 | - Top-left: Configure the displayed value to be a “Sum” aggregation over the "cnt" field.
164 | - Top-left: Select "Geo Coordinates" as bucket type and make sure that "location" is
165 | configured as field.
166 | - Top-left: You can change the visualization type by clicking on “Options” and selecting,
167 | for example, a “Shaded Geohash Grid” visualization.
168 | - The visualization is started by clicking on the green play button.
169 |
170 | The following screenshot shows how Kibana visualizes the result of `TotalArrivalCount.scala`.
171 |
172 | ![Kibana Screenshot](/data/kibana.jpg?raw=true "Kibana Screenshot")
173 |
174 | ### Apache Flink's Feature Set
175 |
176 | - **Support for out-of-order streams and event-time processing**: In practice, streams of events rarely
177 | arrive in the order that they are produced, especially streams from distributed systems, devices, and sensors.
178 | Flink 0.10 is the first open source engine that supports out-of-order streams and event
179 | time, which is a hard requirement for many applications that aim for consistent and meaningful results.
180 |
181 | - **Expressive and easy-to-use APIs in Scala and Java**: Flink's DataStream API provides many
182 | operators which are well known from batch processing APIs such as `map`, `reduce`, and `join` as
183 | well as stream-specific operations such as `window`, `split`, and `connect`.
184 | First-class support for user-defined functions eases the implementation of custom application
185 | behavior. The DataStream API is available in Scala and Java.
186 |
187 | - **Support for sessions and unaligned windows**: Most streaming systems have some concept of windowing,
188 | i.e., a temporal grouping of events based on some function of their timestamps. Unfortunately, in
189 | many systems these windows are hard-coded and connected with the system’s internal checkpointing
190 | mechanism. Flink is the first open source streaming engine that completely decouples windowing from
191 | fault tolerance, allowing for richer forms of windows, such as sessions (see the code sketch after this list).
192 |
193 | - **Consistency, fault tolerance, and high availability**: Flink guarantees consistent operator state
194 | in the presence of failures (often called "exactly-once processing"), and consistent data movement
195 | between selected sources and sinks (e.g., consistent data movement between Kafka and HDFS). Flink
196 | also supports master fail-over, eliminating any single point of failure.
197 |
198 | - **High throughput and low-latency processing**: We have clocked Flink at 1.5 million events per second per core,
199 | and have also observed latencies in the 25 millisecond range in jobs that include network data
200 | shuffling. Using a tuning knob, Flink users can control the latency-throughput trade-off, making
201 | the system suitable for both high-throughput data ingestion and transformations, as well as ultra
202 | low latency (millisecond range) applications.
203 |
204 | - **Integration with many systems for data input and output**: Flink integrates with a wide variety of
205 | open source systems for data input and output (e.g., HDFS, Kafka, Elasticsearch, HBase, and others),
206 | deployment (e.g., YARN), as well as acting as an execution engine for other frameworks (e.g.,
207 | Cascading, Google Cloud Dataflow). The Flink project itself comes bundled with a Hadoop MapReduce
208 | compatibility layer, a Storm compatibility layer, as well as libraries for Machine Learning and
209 | graph processing.
210 |
211 | - **Support for batch processing**: In Flink, batch processing is a special case of stream processing,
212 | as finite data sources are just streams that happen to end. Flink offers a dedicated execution mode
213 | for batch processing with a specialized DataSet API and libraries for Machine Learning and graph processing. In
214 | addition, Flink contains several batch-specific optimizations (e.g., for scheduling, memory
215 | management, and query optimization), matching and even outperforming dedicated batch processing
216 | engines in batch use cases.
217 |
218 | - **Developer productivity and operational simplicity**: Flink runs in a variety of environments. Local
219 | execution within an IDE significantly eases development and debugging of Flink applications.
220 | In distributed setups, Flink runs at massive scale-out. The YARN mode
221 | allows users to bring up Flink clusters in a matter of seconds. Flink serves monitoring metrics of
222 | jobs and the system as a whole via a well-defined REST interface. A built-in web dashboard
223 | displays these metrics and makes monitoring of Flink very convenient.
224 |
225 |
226 |
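As a concrete example of the windowing API, the following sketch (taken from `SlidingArrivalCount.scala` in this repository) counts the passengers arriving per location within a sliding window of 15 minutes that is evaluated every 5 minutes:

```
val passengerCnts: DataStream[(Int, Long, Int)] = cellIds
  // key stream by cell Id
  .keyBy(_._1)
  // define sliding window on the keyed stream
  .timeWindow(Time.minutes(15), Time.minutes(5))
  // sum the passenger counts per cell and window
  .apply { (cell: Int, window: TimeWindow, events: Iterable[(Int, Short)],
      out: Collector[(Int, Long, Int)]) =>
    out.collect((cell, window.getEnd, events.map(_._2).sum))
  }
```
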
227 | 228 |
229 | Copyright © 2015 dataArtisans. All Rights Reserved. 230 | 231 | Apache Flink, Apache, and the Apache feather logo are trademarks of The Apache Software Foundation. 232 |
-------------------------------------------------------------------------------- /data/dummyLogFile.txt: -------------------------------------------------------------------------------- 1 | JobManager log files are not available in local execution mode. -------------------------------------------------------------------------------- /data/kibana.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataArtisans/flink-streaming-demo/c73063dd9a9c009bc6538c77a40f0e7d4c300054/data/kibana.jpg -------------------------------------------------------------------------------- /data/nycTaxiData.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dataArtisans/flink-streaming-demo/c73063dd9a9c009bc6538c77a40f0e7d4c300054/data/nycTaxiData.gz -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 17 | 18 | 4.0.0 19 | 20 | com.dataArtisans 21 | flink-streaming-demo 22 | 0.1 23 | jar 24 | 25 | Flink Streaming Demo 26 | http://www.data-artisans.com 27 | 28 | 29 | UTF-8 30 | 1.7.12 31 | 0.10.0 32 | 33 | 34 | 35 | 36 | 37 | org.apache.flink 38 | flink-streaming-java 39 | ${flink.version} 40 | 41 | 42 | 43 | org.apache.flink 44 | flink-streaming-scala 45 | ${flink.version} 46 | 47 | 48 | 49 | org.apache.flink 50 | flink-runtime-web 51 | ${flink.version} 52 | 53 | 54 | 55 | org.elasticsearch 56 | elasticsearch 57 | 1.7.3 58 | compile 59 | 60 | 61 | 62 | joda-time 63 | joda-time 64 | 2.7 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | net.alchim31.maven 75 | scala-maven-plugin 76 | 3.1.4 77 | 78 | 80 | 81 | scala-compile-first 82 | process-resources 83 | 84 | compile 85 | 86 | 87 | 88 | 90 | 91 | scala-test-compile 92 | process-test-resources 93 | 94 | testCompile 95 | 96 | 97 | 98 | 100 | 101 | scala-add-source 102 | 103 | add-source 104 | 105 | 106 | 107 | 108 | 109 | -Xms128m 110 | -Xmx512m 111 | 112 | 113 | 114 | 115 | 116 | org.apache.maven.plugins 117 | maven-source-plugin 118 | 2.2.1 119 | 120 | 121 | attach-sources 122 | 123 | jar 124 | 125 | 126 | 127 | 128 | 129 | 130 | org.apache.maven.plugins 131 | maven-jar-plugin 132 | 133 | 134 | 135 | 136 | MBoxParser 137 | package 138 | 139 | jar 140 | 141 | 142 | 143 | MBoxParser 144 | 145 | 146 | 147 | com.dataartisans.flinkTraining.dataSetPreparation.MBoxParser 148 | 149 | 150 | 151 | 152 | **/MBoxParser.class 153 | **/MBoxParser$*.class 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | org.apache.maven.plugins 163 | maven-compiler-plugin 164 | 3.1 165 | 166 | 1.6 167 | 1.6 168 | 169 | 170 | 171 | 172 | org.apache.rat 173 | apache-rat-plugin 174 | 0.10 175 | false 176 | 177 | 178 | verify 179 | 180 | check 181 | 182 | 183 | 184 | 185 | false 186 | 0 187 | 188 | 203 | 204 | AL2 205 | Apache License 2.0 206 | 207 | 208 | Copyright 2015 data Artisans GmbH 209 | Licensed under the Apache License, Version 2.0 (the "License"); 210 | 211 | 212 | 213 | 214 | 215 | Apache License 2.0 216 | 217 | 218 | 219 | 220 | **/.* 221 | **/*.prefs 222 | **/*.properties 223 | **/*.log 224 | *.txt/** 225 | 226 | **/README.md 227 | CHANGELOG 228 | 229 | **/*.iml 230 | 231 | **/target/** 232 | **/build/** 233 | 234 | 235 | 236 | 237 | 238 | org.apache.maven.plugins 239 | maven-checkstyle-plugin 240 | 2.12.1 241 | 242 | 243 | validate 244 | validate 245 | 246 | check 247 | 248 | 249 | 250 | 251 | /tools/maven/checkstyle.xml 
252 | true
253 |
254 |
255 |
256 |
257 |
258 |
259 |
260 |
--------------------------------------------------------------------------------
/src/main/scala/com/dataartisans/flink_demo/datatypes/TaxiRide.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2015 data Artisans GmbH
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.dataartisans.flink_demo.datatypes
18 |
19 | import java.util.Locale
20 |
21 | import org.joda.time.DateTime
22 | import org.joda.time.format.{DateTimeFormat, DateTimeFormatter}
23 |
24 | /**
25 | * A TaxiRide describes a taxi ride event.
26 | * There are two types of events: a taxi ride start event and a taxi ride end event.
27 | * The isStart flag specifies the type of the event.
28 | *
29 | * @param rideId The id of the ride. There are two events for each id: a start and an end event.
30 | * @param time The time at which the event occurred.
31 | * @param isStart Flag indicating the type of the event (start or end).
32 | * @param location The location at which the event occurred. Either pick-up or drop-off location.
33 | * @param passengerCnt The number of passengers on the taxi ride.
34 | * @param travelDist The total traveled distance for end events, -1 for start events.
35 | */
36 | class TaxiRide(
37 | var rideId: Long,
38 | var time: DateTime,
39 | var isStart: Boolean,
40 | var location: GeoPoint,
41 | var passengerCnt: Short,
42 | var travelDist: Float) {
43 |
44 | def this() {
45 | this(0, new DateTime(0), false, new GeoPoint(0.0, 0.0), 0, 0.0f)
46 | }
47 |
48 | override def toString: String = {
49 | val sb: StringBuilder = new StringBuilder
50 | sb.append(rideId).append(",")
51 | sb.append(time.toString(TaxiRide.TimeFormatter)).append(",")
52 | sb.append(if (isStart) "START" else "END").append(",")
53 | sb.append(location.lon).append(",")
54 | sb.append(location.lat).append(",")
55 | sb.append(passengerCnt).append(",")
56 | sb.append(travelDist)
57 | sb.toString()
58 | }
59 |
60 | }
61 |
62 | object TaxiRide {
63 |
64 | @transient
65 | private final val TimeFormatter: DateTimeFormatter =
66 | DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss").withLocale(Locale.US).withZoneUTC
67 |
68 | def fromString(line: String): TaxiRide = {
69 |
70 | val tokens: Array[String] = line.split(",")
71 | if (tokens.length != 7) {
72 | throw new RuntimeException("Invalid record: " + line)
73 | }
74 |
75 | try {
76 | val rideId = tokens(0).toLong
77 | val time = DateTime.parse(tokens(1), TimeFormatter)
78 | val isStart = tokens(2) == "START"
79 | val lon = if (tokens(3).length > 0) tokens(3).toDouble else 0.0
80 | val lat = if (tokens(4).length > 0) tokens(4).toDouble else 0.0
81 | val passengerCnt = tokens(5).toShort
82 | val travelDistance = if (tokens(6).length > 0) tokens(6).toFloat else 0.0f
83 |
84 | new TaxiRide(rideId, time, isStart, new GeoPoint(lon, lat), passengerCnt, travelDistance)
85 | }
86 | catch {
87 | case nfe: NumberFormatException =>
88 | throw new RuntimeException("Invalid record: " + line, nfe)
89 | }
90 | }
91 | }
92 |
93 | /**
94 | * A geo point defined by a longitude and a latitude value.
95 | *
96 | * @param lon The longitude of the point.
97 | * @param lat The latitude of the point.
98 | */
99 | case class GeoPoint(lon: Double, lat: Double)
100 |
--------------------------------------------------------------------------------
/src/main/scala/com/dataartisans/flink_demo/examples/EarlyArrivalCount.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2015 data Artisans GmbH
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.dataartisans.flink_demo.examples
18 |
19 | import com.dataartisans.flink_demo.datatypes.{TaxiRide, GeoPoint}
20 | import com.dataartisans.flink_demo.sinks.ElasticsearchUpsertSink
21 | import com.dataartisans.flink_demo.sources.TaxiRideSource
22 | import com.dataartisans.flink_demo.utils.{DemoStreamEnvironment, NycGeoUtils}
23 | import org.apache.flink.streaming.api.TimeCharacteristic
24 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment
25 | import org.apache.flink.streaming.api.scala._
26 | import org.apache.flink.streaming.api.windowing.time.Time
27 | import org.apache.flink.streaming.api.windowing.triggers.Trigger
28 | import org.apache.flink.streaming.api.windowing.triggers.Trigger.{TriggerResult, TriggerContext}
29 | import org.apache.flink.streaming.api.windowing.windows.TimeWindow
30 | import org.apache.flink.util.Collector
31 |
32 | /**
33 | * Apache Flink DataStream API demo application.
34 | *
35 | * The program processes a stream of taxi ride events from the New York City Taxi and Limousine
36 | * Commission (TLC).
37 | * It computes every five minutes for each location the total number of persons that arrived
38 | * within the last 15 minutes by taxi. The program emits early partial count results whenever more
39 | * than 50 persons (or a multiple of 50 persons) arrive at a location within 15 minutes.
40 | *
41 | * See
42 | * http://github.com/dataartisans/flink-streaming-demo
43 | * for more detail.
44 | *
45 | */
46 | object EarlyArrivalCount {
47 |
48 | def main(args: Array[String]) {
49 |
50 | // input parameters
51 | val data = "./data/nycTaxiData.gz"
52 | val maxServingDelay = 60
53 | val servingSpeedFactor = 600f
54 |
55 | // window parameters
56 | val countWindowLength = 15 // window size in min
57 | val countWindowFrequency = 5 // window trigger interval in min
58 | val earlyCountThreshold = 50
59 |
60 | // Elasticsearch parameters
61 | val writeToElasticsearch = false // set to true to write results to Elasticsearch
62 | val elasticsearchHost = "" // look-up hostname in Elasticsearch log output
63 | val elasticsearchPort = 9300
64 |
65 |
66 | // set up streaming execution environment
67 | val env: StreamExecutionEnvironment = DemoStreamEnvironment.env
68 | env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
69 |
70 | // Define the data source
71 | val rides: DataStream[TaxiRide] = env.addSource(new TaxiRideSource(
72 | data, maxServingDelay, servingSpeedFactor))
73 |
74 | val cleansedRides = rides
75 | // filter for trip end events
76 | .filter( !_.isStart )
77 | // filter for events in NYC
78 | .filter( r => NycGeoUtils.isInNYC(r.location) )
79 |
80 | // map location coordinates to cell Id, timestamp, and passenger count
81 | val cellIds: DataStream[(Int, Short)] = cleansedRides
82 | .map( r => ( NycGeoUtils.mapToGridCell(r.location), r.passengerCnt ) )
83 |
84 | val passengerCnts: DataStream[(Int, Long, Int)] = cellIds
85 | // key stream by cell Id
86 | .keyBy(_._1)
87 | // define sliding window on keyed streams
88 | .timeWindow(Time.minutes(countWindowLength), Time.minutes(countWindowFrequency))
89 | .trigger(new EarlyCountTrigger(earlyCountThreshold))
90 | // count events in window
91 | .apply { (
92 | cell: Int,
93 | window: TimeWindow,
94 | events: Iterable[(Int, Short)],
95 | out: Collector[(Int, Long, Int)]) =>
96 | out.collect( ( cell, window.getEnd, events.map( _._2 ).sum ) )
97 | }
98 |
99 | val cntByLocation: DataStream[(Int, Long, GeoPoint, Int)] = passengerCnts
100 | // map cell Id back
to GeoPoint 101 | .map( r => ( r._1, r._2, NycGeoUtils.getGridCellCenter(r._1), r._3 ) ) 102 | 103 | // print to console 104 | cntByLocation 105 | .print() 106 | 107 | if (writeToElasticsearch) { 108 | // write to Elasticsearch 109 | cntByLocation 110 | .addSink(new CntByLocTimeUpsert(elasticsearchHost, elasticsearchPort)) 111 | } 112 | 113 | env.execute("Early arrival counts per location") 114 | 115 | } 116 | 117 | class EarlyCountTrigger(triggerCnt: Int) extends Trigger[(Int, Short), TimeWindow] { 118 | 119 | override def onElement( 120 | event: (Int, Short), 121 | timestamp: Long, 122 | window: TimeWindow, 123 | ctx: TriggerContext): TriggerResult = { 124 | 125 | // register event time timer for end of window 126 | ctx.registerEventTimeTimer(window.getEnd) 127 | 128 | // get current count 129 | val personCnt = ctx.getKeyValueState[Integer]("personCnt", 0) 130 | // update count by passenger cnt of new event 131 | personCnt.update(personCnt.value() + event._2) 132 | // check if count is high enough for early notification 133 | if (personCnt.value() < triggerCnt) { 134 | // not yet 135 | TriggerResult.CONTINUE 136 | } 137 | else { 138 | // trigger count is reached 139 | personCnt.update(0) 140 | TriggerResult.FIRE 141 | } 142 | } 143 | 144 | override def onEventTime( 145 | time: Long, 146 | window: TimeWindow, 147 | ctx: TriggerContext): TriggerResult = { 148 | 149 | // trigger final computation 150 | TriggerResult.FIRE_AND_PURGE 151 | } 152 | 153 | override def onProcessingTime( 154 | time: Long, 155 | window: TimeWindow, 156 | ctx: TriggerContext): TriggerResult = { 157 | 158 | throw new UnsupportedOperationException("I am not a processing time trigger") 159 | } 160 | } 161 | 162 | 163 | class CntByLocTimeUpsert(host: String, port: Int) 164 | extends ElasticsearchUpsertSink[(Int, Long, GeoPoint, Int)]( 165 | host, 166 | port, 167 | "elasticsearch", 168 | "nyc-idx", 169 | "popular-locations") { 170 | 171 | override def insertJson(r: (Int, Long, GeoPoint, Int)): Map[String, AnyRef] = { 172 | Map( 173 | "location" -> (r._3.lat+","+r._3.lon).asInstanceOf[AnyRef], 174 | "time" -> r._2.asInstanceOf[AnyRef], 175 | "cnt" -> r._4.asInstanceOf[AnyRef] 176 | ) 177 | } 178 | 179 | override def updateJson(r: (Int, Long, GeoPoint, Int)): Map[String, AnyRef] = { 180 | Map[String, AnyRef] ( 181 | "cnt" -> r._4.asInstanceOf[AnyRef] 182 | ) 183 | } 184 | 185 | override def indexKey(r: (Int, Long, GeoPoint, Int)): String = { 186 | // index by location and time 187 | r._1.toString + "/" + r._2.toString 188 | } 189 | } 190 | 191 | } 192 | 193 | -------------------------------------------------------------------------------- /src/main/scala/com/dataartisans/flink_demo/examples/SlidingArrivalCount.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 data Artisans GmbH 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.dataartisans.flink_demo.examples 18 | 19 | import com.dataartisans.flink_demo.datatypes.{TaxiRide, GeoPoint} 20 | import com.dataartisans.flink_demo.sinks.ElasticsearchUpsertSink 21 | import com.dataartisans.flink_demo.sources.TaxiRideSource 22 | import com.dataartisans.flink_demo.utils.{DemoStreamEnvironment, NycGeoUtils} 23 | import org.apache.flink.streaming.api.TimeCharacteristic 24 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment 25 | import org.apache.flink.streaming.api.scala._ 26 | import org.apache.flink.streaming.api.windowing.time.Time 27 | import org.apache.flink.streaming.api.windowing.windows.TimeWindow 28 | import org.apache.flink.util.Collector 29 | 30 | /** 31 | * Apache Flink DataStream API demo application. 32 | * 33 | * The program processes a stream of taxi ride events from the New York City Taxi and Limousine 34 | * Commission (TLC). 35 | * It computes every five minutes for each location the total number of persons that arrived 36 | * within the last 15 minutes by taxi. 37 | * 38 | * See 39 | * http://github.com/dataartisans/flink-streaming-demo 40 | * for more detail. 41 | * 42 | */ 43 | object SlidingArrivalCount { 44 | 45 | def main(args: Array[String]) { 46 | 47 | // input parameters 48 | val data = "./data/nycTaxiData.gz" 49 | val maxServingDelay = 60 50 | val servingSpeedFactor = 600f 51 | 52 | // window parameters 53 | val countWindowLength = 15 // window size in min 54 | val countWindowFrequency = 5 // window trigger interval in min 55 | val earlyCountThreshold = 50 56 | 57 | // Elasticsearch parameters 58 | val writeToElasticsearch = false // set to true to write results to Elasticsearch 59 | val elasticsearchHost = "" // look-up hostname in Elasticsearch log output 60 | val elasticsearchPort = 9300 61 | 62 | 63 | // set up streaming execution environment 64 | val env: StreamExecutionEnvironment = DemoStreamEnvironment.env 65 | env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime) 66 | 67 | // Define the data source 68 | val rides: DataStream[TaxiRide] = env.addSource(new TaxiRideSource( 69 | data, maxServingDelay, servingSpeedFactor)) 70 | 71 | val cleansedRides = rides 72 | // filter for trip end events 73 | .filter( !_.isStart ) 74 | // filter for events in NYC 75 | .filter( r => NycGeoUtils.isInNYC(r.location) ) 76 | 77 | // map location coordinates to cell Id, timestamp, and passenger count 78 | val cellIds: DataStream[(Int, Short)] = cleansedRides 79 | .map( r => ( NycGeoUtils.mapToGridCell(r.location), r.passengerCnt ) ) 80 | 81 | val passengerCnts: DataStream[(Int, Long, Int)] = cellIds 82 | // key stream by cell Id 83 | .keyBy(_._1) 84 | // define sliding window on keyed streams 85 | .timeWindow(Time.minutes(countWindowLength), Time.minutes(countWindowFrequency)) 86 | // count events in window 87 | .apply { ( 88 | cell: Int, 89 | window: TimeWindow, 90 | events: Iterable[(Int, Short)], 91 | out: Collector[(Int, Long, Int)]) => 92 | out.collect( ( cell, window.getEnd, events.map( _._2 ).sum ) ) 93 | } 94 | 95 | // map cell Id back to GeoPoint 96 | val cntByLocation: DataStream[(Int, Long, GeoPoint, Int)] = passengerCnts 97 | .map( r => ( r._1, r._2, NycGeoUtils.getGridCellCenter(r._1), r._3 ) ) 98 | 99 | // print to console 100 | cntByLocation 101 | .print() 102 | 103 | if (writeToElasticsearch) { 104 | // write to Elasticsearch 105 | cntByLocation 106 | .addSink(new CntByLocTimeUpsert(elasticsearchHost, elasticsearchPort)) 107 | } 108 | 109 | env.execute("Sliding passenger count per 
location") 110 | 111 | } 112 | 113 | class CntByLocTimeUpsert(host: String, port: Int) 114 | extends ElasticsearchUpsertSink[(Int, Long, GeoPoint, Int)]( 115 | host, 116 | port, 117 | "elasticsearch", 118 | "nyc-idx", 119 | "popular-locations") { 120 | 121 | override def insertJson(r: (Int, Long, GeoPoint, Int)): Map[String, AnyRef] = { 122 | Map( 123 | "location" -> (r._3.lat+","+r._3.lon).asInstanceOf[AnyRef], 124 | "time" -> r._2.asInstanceOf[AnyRef], 125 | "cnt" -> r._4.asInstanceOf[AnyRef] 126 | ) 127 | } 128 | 129 | override def updateJson(r: (Int, Long, GeoPoint, Int)): Map[String, AnyRef] = { 130 | Map[String, AnyRef] ( 131 | "cnt" -> r._4.asInstanceOf[AnyRef] 132 | ) 133 | } 134 | 135 | override def indexKey(r: (Int, Long, GeoPoint, Int)): String = { 136 | // index by location and time 137 | r._1.toString + "/" + r._2.toString 138 | } 139 | } 140 | 141 | } 142 | 143 | -------------------------------------------------------------------------------- /src/main/scala/com/dataartisans/flink_demo/examples/TotalArrivalCount.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 data Artisans GmbH 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.dataartisans.flink_demo.examples 18 | 19 | import com.dataartisans.flink_demo.datatypes.{TaxiRide, GeoPoint} 20 | import com.dataartisans.flink_demo.sinks.ElasticsearchUpsertSink 21 | import com.dataartisans.flink_demo.sources.TaxiRideSource 22 | import com.dataartisans.flink_demo.utils.{DemoStreamEnvironment, NycGeoUtils} 23 | import org.apache.flink.streaming.api.TimeCharacteristic 24 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment 25 | 26 | import org.apache.flink.streaming.api.scala._ 27 | 28 | /** 29 | * Apache Flink DataStream API demo application. 30 | * 31 | * The program processes a stream of taxi ride events from the New York City Taxi and Limousine 32 | * Commission (TLC). 33 | * It computes for each location the total number of persons that arrived by taxi. 34 | * 35 | * See 36 | * http://github.com/dataartisans/flink-streaming-demo 37 | * for more detail. 
38 | * 39 | */ 40 | object TotalArrivalCount { 41 | 42 | def main(args: Array[String]) { 43 | 44 | // input parameters 45 | val data = "./data/nycTaxiData.gz" 46 | val maxServingDelay = 60 47 | val servingSpeedFactor = 600f 48 | 49 | // Elasticsearch parameters 50 | val writeToElasticsearch = false // set to true to write results to Elasticsearch 51 | val elasticsearchHost = "" // look-up hostname in Elasticsearch log output 52 | val elasticsearchPort = 9300 53 | 54 | 55 | // set up streaming execution environment 56 | val env: StreamExecutionEnvironment = DemoStreamEnvironment.env 57 | env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime) 58 | 59 | // Define the data source 60 | val rides: DataStream[TaxiRide] = env.addSource(new TaxiRideSource( 61 | data, maxServingDelay, servingSpeedFactor)) 62 | 63 | val cleansedRides = rides 64 | // filter for trip end events 65 | .filter( !_.isStart ) 66 | // filter for events in NYC 67 | .filter( r => NycGeoUtils.isInNYC(r.location) ) 68 | 69 | // map location coordinates to cell Id, timestamp, and passenger count 70 | val cellIds: DataStream[(Int, Long, Short)] = cleansedRides 71 | .map { r => 72 | ( NycGeoUtils.mapToGridCell(r.location), r.time.getMillis, r.passengerCnt ) 73 | } 74 | 75 | val passengerCnts: DataStream[(Int, Long, Int)] = cellIds 76 | // key stream by cell Id 77 | .keyBy(_._1) 78 | // sum passengers per cell Id and update time 79 | .fold((0, 0L, 0), (s: (Int, Long, Int), r: (Int, Long, Short)) => 80 | { (r._1, s._2.max(r._2), s._3 + r._3) } ) 81 | 82 | // map cell Id back to GeoPoint 83 | val cntByLocation: DataStream[(Int, Long, GeoPoint, Int)] = passengerCnts 84 | .map( r => (r._1, r._2, NycGeoUtils.getGridCellCenter(r._1), r._3 ) ) 85 | 86 | // print to console 87 | cntByLocation 88 | .print() 89 | 90 | if (writeToElasticsearch) { 91 | // write to Elasticsearch 92 | cntByLocation 93 | .addSink(new CntTimeByLocUpsert(elasticsearchHost, elasticsearchPort)) 94 | } 95 | 96 | env.execute("Total passenger count per location") 97 | 98 | } 99 | 100 | class CntTimeByLocUpsert(host: String, port: Int) 101 | extends ElasticsearchUpsertSink[(Int, Long, GeoPoint, Int)]( 102 | host, 103 | port, 104 | "elasticsearch", 105 | "nyc-idx", 106 | "popular-locations") { 107 | 108 | override def insertJson(r: (Int, Long, GeoPoint, Int)): Map[String, AnyRef] = { 109 | Map( 110 | "location" -> (r._3.lat+","+r._3.lon).asInstanceOf[AnyRef], 111 | "time" -> r._2.asInstanceOf[AnyRef], 112 | "cnt" -> r._4.asInstanceOf[AnyRef] 113 | ) 114 | } 115 | 116 | override def updateJson(r: (Int, Long, GeoPoint, Int)): Map[String, AnyRef] = { 117 | Map[String, AnyRef] ( 118 | "time" -> r._2.asInstanceOf[AnyRef], 119 | "cnt" -> r._4.asInstanceOf[AnyRef] 120 | ) 121 | } 122 | 123 | override def indexKey(r: (Int, Long, GeoPoint, Int)): String = { 124 | // index by location 125 | r._1.toString 126 | } 127 | } 128 | 129 | } 130 | -------------------------------------------------------------------------------- /src/main/scala/com/dataartisans/flink_demo/sinks/ElasticsearchUpsertSink.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 data Artisans GmbH 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.dataartisans.flink_demo.sinks 18 | 19 | import java.util 20 | 21 | import org.apache.flink.configuration.Configuration 22 | import org.apache.flink.streaming.api.functions.sink.RichSinkFunction 23 | import org.elasticsearch.action.index.IndexRequest 24 | import org.elasticsearch.action.update.UpdateRequest 25 | import org.elasticsearch.client.transport.TransportClient 26 | import org.elasticsearch.common.settings.ImmutableSettings 27 | import org.elasticsearch.common.transport.InetSocketTransportAddress 28 | 29 | import scala.collection.JavaConversions._ 30 | 31 | /** 32 | * SinkFunction to either insert or update an entry in an Elasticsearch index. 33 | * 34 | * @param host Hostname of the Elasticsearch instance. 35 | * @param port Port of the Elasticsearch instance. 36 | * @param cluster Name of the Elasticsearch cluster. 37 | * @param index Name of the Elasticsearch index. 38 | * @param mapping Name of the index mapping. 39 | * 40 | * @tparam T Record type to write to Elasticsearch. 41 | */ 42 | abstract class ElasticsearchUpsertSink[T](host: String, port: Int, cluster: String, index: String, mapping: String) 43 | extends RichSinkFunction[T] { 44 | 45 | private var client: TransportClient = null 46 | 47 | def insertJson(record: T): Map[String, AnyRef] 48 | 49 | def updateJson(record: T): Map[String, AnyRef] 50 | 51 | def indexKey(record: T): String 52 | 53 | @throws[Exception] 54 | override def open(parameters: Configuration) { 55 | 56 | val config = new util.HashMap[String, String] 57 | config.put("bulk.flush.max.actions", "1") 58 | config.put("cluster.name", cluster) 59 | 60 | val settings = ImmutableSettings.settingsBuilder() 61 | .put(config) 62 | .build() 63 | client = new TransportClient(settings) 64 | .addTransportAddress(new InetSocketTransportAddress(host, port)) 65 | } 66 | 67 | @throws[Exception] 68 | override def invoke(r: T) { 69 | // do an upsert request to elastic search 70 | 71 | // index document if it does not exist 72 | val indexRequest = new IndexRequest(index, mapping, indexKey(r)) 73 | .source(mapAsJavaMap(insertJson(r))) 74 | 75 | // update document if it exists 76 | val updateRequest = new UpdateRequest(index, mapping, indexKey(r)) 77 | .doc(mapAsJavaMap(updateJson(r))) 78 | .upsert(indexRequest) 79 | 80 | client.update(updateRequest).get() 81 | } 82 | 83 | } 84 | -------------------------------------------------------------------------------- /src/main/scala/com/dataartisans/flink_demo/sources/TaxiRideSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 data Artisans GmbH 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.dataartisans.flink_demo.sources
18 |
19 | import java.io._
20 | import java.util.{Calendar, Random}
21 | import java.util.zip.GZIPInputStream
22 |
23 | import com.dataartisans.flink_demo.datatypes.TaxiRide
24 | import org.apache.flink.streaming.api.functions.source.EventTimeSourceFunction
25 | import org.apache.flink.streaming.api.functions.source.SourceFunction.SourceContext
26 | import org.apache.flink.streaming.api.watermark.Watermark
27 | import org.joda.time.DateTime
28 |
29 | import scala.collection.mutable
30 |
31 | /**
32 | * This SourceFunction generates a data stream of TaxiRide records that are
33 | * read from a gzipped input file. Each record has a time stamp and the input file must be
34 | * ordered by this time stamp.
35 | *
36 | * In order to simulate a realistic stream source, the SourceFunction serves events in proportion to
37 | * their timestamps. In addition, the serving of events can be delayed by a bounded random delay,
38 | * which causes the events to be served out of order with respect to their timestamps.
39 | *
40 | * The serving speed of the SourceFunction can be adjusted by a serving speed factor.
41 | * A factor of 60.0 increases the logical serving time by a factor of 60, i.e., events of one
42 | * minute (60 seconds) are served in 1 second.
43 | *
44 | * This SourceFunction is an EventTimeSourceFunction and continuously emits watermarks.
45 | * Hence, it can only operate in event-time mode, which is configured as follows:
46 | *
47 | * StreamExecutionEnvironment.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
48 | *
49 | * @param dataFilePath The path to the gzipped input file.
50 | * @param maxDelaySecs The maximum serving delay. Defines how far elements may be served out-of-order.
51 | * @param servingSpeed The relative serving speed. Can be used to fast-forward the stream.
52 |  */
53 | class TaxiRideSource(dataFilePath: String, maxDelaySecs: Int, servingSpeed: Float)
54 |   extends EventTimeSourceFunction[TaxiRide] {
55 | 
56 |   private val maxDelayMsecs = maxDelaySecs * 1000
57 |   private val watermarkDelayMSecs = if (maxDelayMsecs < 10000) 10000 else maxDelayMsecs
58 | 
59 |   @transient
60 |   private var reader: BufferedReader = null
61 |   @transient
62 |   private var gzipStream: InputStream = null
63 | 
64 |   override def run(sourceContext: SourceContext[TaxiRide]): Unit = {
65 |     gzipStream = new GZIPInputStream(new FileInputStream(dataFilePath))
66 |     reader = new BufferedReader(new InputStreamReader(gzipStream, "UTF-8"))
67 | 
68 |     if (this.maxDelayMsecs == 0) {
69 |       generateOrderedStream(sourceContext)
70 |     }
71 |     else {
72 |       generateUnorderedStream(sourceContext)
73 |     }
74 | 
75 |     this.reader.close()
76 |     this.reader = null
77 |     this.gzipStream.close()
78 |     this.gzipStream = null
79 |   }
80 | 
81 |   @throws(classOf[IOException])
82 |   override def cancel(): Unit = {
83 |     try {
84 |       if (this.reader != null) {
85 |         this.reader.close()
86 |       }
87 |       if (this.gzipStream != null) {
88 |         this.gzipStream.close()
89 |       }
90 |     } finally {
91 |       this.reader = null
92 |       this.gzipStream = null
93 |     }
94 |   }
95 | 
96 |   @throws(classOf[Exception])
97 |   private def generateOrderedStream(sourceContext: SourceContext[TaxiRide]) {
98 | 
99 |     val servingStartTime = Calendar.getInstance.getTimeInMillis
100 |     var dataStartTime = 0L
101 |     var nextWatermark = 0L
102 |     var nextWatermarkServingTime = 0L
103 | 
104 |     // read the first ride event
105 |     if (reader.ready) {
106 |       val line = reader.readLine
107 |       if (line != null) {
108 |         val ride = TaxiRide.fromString(line)
109 | 
110 |         // set time of first event
111 |         dataStartTime = ride.time.getMillis
112 |         // initialize watermarks
113 |         nextWatermark = dataStartTime + watermarkDelayMSecs
114 |         nextWatermarkServingTime = toServingTime(servingStartTime, dataStartTime, nextWatermark)
115 |         // emit first event
116 |         sourceContext.collectWithTimestamp(ride, ride.time.getMillis)
117 |       }
118 |     }
119 |     else {
120 |       return
121 |     }
122 | 
123 |     // read all following ride events
124 |     while (reader.ready) {
125 |       val line = reader.readLine
126 |       if (line != null) {
127 | 
128 |         // read event
129 |         val ride = TaxiRide.fromString(line)
130 | 
131 |         val eventTime = ride.time.getMillis
132 |         val now = Calendar.getInstance.getTimeInMillis
133 |         val eventServingTime = toServingTime(servingStartTime, dataStartTime, eventTime)
134 | 
135 |         // compute how long to wait before the event and the next watermark must be emitted
136 |         val eventWait = eventServingTime - now
137 |         val watermarkWait = nextWatermarkServingTime - now
138 | 
139 |         if (eventWait < watermarkWait) {
140 |           // wait to emit next event
141 |           Thread.sleep(if (eventWait > 0) eventWait else 0)
142 |         }
143 |         else if (eventWait > watermarkWait) {
144 |           // wait to emit watermark
145 |           Thread.sleep(if (watermarkWait > 0) watermarkWait else 0)
146 |           // emit watermark
147 |           sourceContext.emitWatermark(new Watermark(nextWatermark))
148 |           // schedule next watermark
149 |           nextWatermark = nextWatermark + watermarkDelayMSecs
150 |           nextWatermarkServingTime = toServingTime(servingStartTime, dataStartTime, nextWatermark)
151 |           // wait to emit event
152 |           val remainWait: Long = eventWait - watermarkWait
153 |           Thread.sleep(if (remainWait > 0) remainWait else 0)
154 |         }
155 |         else if (eventWait == watermarkWait) {
156 |           // wait to emit watermark
157 |           Thread.sleep(if (watermarkWait > 0) watermarkWait else 0)
158 |           // emit watermark
159 |           sourceContext.emitWatermark(new Watermark(nextWatermark - 1))
160 |           // schedule next watermark
161 |           nextWatermark = nextWatermark + watermarkDelayMSecs
162 |           nextWatermarkServingTime = toServingTime(servingStartTime, dataStartTime, nextWatermark)
163 |         }
164 |         // emit event
165 |         sourceContext.collectWithTimestamp(ride, ride.time.getMillis)
166 |       }
167 |     }
168 |   }
169 | 
170 |   @throws(classOf[Exception])
171 |   private def generateUnorderedStream(sourceContext: SourceContext[TaxiRide]) {
172 | 
173 |     val servingStartTime = Calendar.getInstance.getTimeInMillis
174 |     var dataStartTime = 0L
175 |     val rand: Random = new Random(7452)
176 | 
177 |     // schedule of delayed events and watermarks, ordered by emission time
178 |     val emitSchedule = mutable.PriorityQueue.empty[(Long, Either[TaxiRide, Watermark])](
179 |       Ordering.by( (_: (Long, Either[TaxiRide, Watermark]))._1 ).reverse
180 |     )
181 | 
182 |     var ride: TaxiRide = null
183 |     if (reader.ready) {
184 | 
185 |       val line = reader.readLine
186 |       if (line != null) {
187 | 
188 |         ride = TaxiRide.fromString(line)
189 |         dataStartTime = ride.time.getMillis
190 | 
191 |         // schedule first event
192 |         val delayedEventTime: Long = dataStartTime + getNormalDelayMsecs(rand)
193 |         emitSchedule += ((delayedEventTime, Left(ride)))
194 |         // schedule first watermark
195 |         val watermarkTime = dataStartTime + watermarkDelayMSecs
196 |         val nextWatermark = new Watermark(watermarkTime - maxDelayMsecs - 1)
197 |         emitSchedule += ((watermarkTime, Right(nextWatermark)))
198 |       }
199 |     }
200 |     else {
201 |       return
202 |     }
203 | 
204 |     // read the second ride ahead of the scheduling loop
205 |     if (reader.ready) {
206 |       val line = reader.readLine
207 |       if (line != null) {
208 |         ride = TaxiRide.fromString(line)
209 |       }
210 |     }
211 | 
212 |     while (emitSchedule.nonEmpty || reader.ready) {
213 | 
214 |       // insert all events that might be emitted next into the schedule
215 |       val curNextDelayedEventTime = if (emitSchedule.nonEmpty) emitSchedule.head._1 else -1
216 |       var rideEventTime = if (ride != null) ride.time.getMillis else -1
217 |       while (ride != null &&
218 |         (emitSchedule.isEmpty || (rideEventTime < curNextDelayedEventTime + maxDelayMsecs))) {
219 | 
220 |         // insert event into schedule
221 |         val delayedEventTime = rideEventTime + getNormalDelayMsecs(rand)
222 |         emitSchedule += ((delayedEventTime, Left(ride)))
223 | 
224 |         // read next ride from input
225 |         if (reader.ready) {
226 |           val line = reader.readLine
227 |           if (line != null) {
228 |             ride = TaxiRide.fromString(line)
229 |             rideEventTime = ride.time.getMillis
230 |           } else {
231 |             ride = null
232 |             rideEventTime = -1
233 |           }
234 |         } else {
235 |           ride = null
236 |           rideEventTime = -1
237 |         }
238 |       }
239 | 
240 |       // emit next element
241 |       val head = emitSchedule.dequeue()
242 |       val delayedEventTime = head._1
243 | 
244 |       val now = Calendar.getInstance.getTimeInMillis
245 |       val servingTime = toServingTime(servingStartTime, dataStartTime, delayedEventTime)
246 |       val waitTime = servingTime - now
247 | 
248 |       Thread.sleep(if (waitTime > 0) waitTime else 0)
249 | 
250 |       head._2 match {
251 |         case Left(emitRide) =>
252 |           // emit event
253 |           sourceContext.collectWithTimestamp(emitRide, emitRide.time.getMillis)
254 |         case Right(emitWatermark) =>
255 |           // emit watermark
256 |           sourceContext.emitWatermark(emitWatermark)
257 |           // schedule next watermark
258 |           val watermarkTime = delayedEventTime + watermarkDelayMSecs
259 |           val nextWatermark = new Watermark(watermarkTime - maxDelayMsecs - 1)
260 |           emitSchedule += ((watermarkTime, Right(nextWatermark)))
261 |       }
262 |     }
263 |   }
264 | 
265 |   // maps an event timestamp to the wall-clock time at which it is served,
266 |   // scaled by the serving speed factor
267 |   def toServingTime(servingStartTime: Long, dataStartTime: Long, eventTime: Long): Long = {
268 |     val dataDiff = eventTime - dataStartTime
269 |     servingStartTime + (dataDiff / this.servingSpeed).toLong
270 |   }
271 | 
272 |   // draws a random delay from a Gaussian centered at maxDelayMsecs / 2 and
273 |   // redraws until it falls into [0, maxDelayMsecs]
274 |   def getNormalDelayMsecs(rand: Random): Long = {
275 |     var delay = -1L
276 |     val x = maxDelayMsecs / 2
277 |     while (delay < 0 || delay > maxDelayMsecs) {
278 |       delay = (rand.nextGaussian * x).toLong + x
279 |     }
280 |     delay
281 |   }
282 | 
283 | }
284 | 
--------------------------------------------------------------------------------
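The class comment above prescribes event time mode. As a usage sketch (this is not one of the repository's example jobs; the delay, the speed-up factor, and the use of the bundled data file are illustrative), a job would wire the source up roughly like this, with DemoStreamEnvironment being the helper defined below:

  import org.apache.flink.streaming.api.TimeCharacteristic

  // event time is mandatory because the source emits its own watermarks
  val env = DemoStreamEnvironment.env
  env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

  // serve events up to 60 seconds out of order, fast-forwarded 600x;
  // at that speed, ten minutes of data are replayed per wall-clock second
  val rides = env.addSource(new TaxiRideSource("./data/nycTaxiData.gz", 60, 600.0f))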
/src/main/scala/com/dataartisans/flink_demo/utils/DemoStreamEnvironment.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright 2015 data Artisans GmbH
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  * http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package com.dataartisans.flink_demo.utils
18 | 
19 | import org.apache.flink.configuration.{ConfigConstants, Configuration}
20 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment
21 | import org.apache.flink.streaming.api.environment.LocalStreamEnvironment
22 | 
23 | object DemoStreamEnvironment {
24 | 
25 |   def env: StreamExecutionEnvironment = {
26 |     val config = new Configuration()
27 |     // start the web dashboard
28 |     config.setBoolean(ConfigConstants.LOCAL_START_WEBSERVER, true)
29 |     // the dashboard requires a log file path to be configured
30 |     config.setString(ConfigConstants.JOB_MANAGER_WEB_LOG_PATH_KEY, "./data/dummyLogFile.txt")
31 | 
32 |     // create a local stream execution environment
33 |     new LocalStreamEnvironment(config)
34 |   }
35 | 
36 | }
37 | 
--------------------------------------------------------------------------------
/src/main/scala/com/dataartisans/flink_demo/utils/NycGeoUtils.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright 2015 data Artisans GmbH
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  * http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package com.dataartisans.flink_demo.utils
18 | 
19 | import com.dataartisans.flink_demo.datatypes.GeoPoint
20 | 
21 | /**
22 |  * NycGeoUtils provides utility methods to work with GeoPoint locations in New York City.
23 |  */
24 | object NycGeoUtils {
25 | 
26 |   val LonEast: Double = -73.7
27 |   val LonWest: Double = -74.05
28 |   val LatNorth: Double = 41.0
29 |   val LatSouth: Double = 40.5
30 | 
31 |   val LonWidth: Double = LonEast - LonWest    // 0.35 degrees
32 |   val LatHeight: Double = LatNorth - LatSouth // 0.5 degrees
33 | 
34 |   val DeltaLon: Double = 0.0014
35 |   val DeltaLat: Double = 0.00125
36 | 
37 |   val CellCntX: Int = 250
38 |   val CellCntY: Int = 400
39 | 
40 |   /**
41 |    * Checks if a location specified by longitude and latitude values is
42 |    * within the geo boundaries of New York City.
43 |    *
44 |    * @param point the geo point to check
45 |    *
46 |    * @return true if the location is within NYC boundaries, otherwise false.
47 |    */
48 |   def isInNYC(point: GeoPoint): Boolean = {
49 |     if (point.lon > LonEast || point.lon < LonWest)
50 |       false
51 |     else if (point.lat > LatNorth || point.lat < LatSouth)
52 |       false
53 |     else true
54 |   }
55 | 
56 |   /**
57 |    * Maps a location specified as a GeoPoint to a cell of a grid covering the area of NYC.
58 |    * The grid cells are roughly 100 m x 100 m and are numbered sequentially from north-west
59 |    * to south-east, starting at zero.
60 |    *
61 |    * @param point the geo point to map
62 |    *
63 |    * @return id of mapped grid cell.
64 |    */
65 |   def mapToGridCell(point: GeoPoint): Int = {
66 |     val xIndex: Int = Math.floor((Math.abs(LonWest) - Math.abs(point.lon)) / DeltaLon).toInt
67 |     val yIndex: Int = Math.floor((LatNorth - point.lat) / DeltaLat).toInt
68 |     xIndex + (yIndex * CellCntX)
69 |   }
70 | 
71 |   /**
72 |    * Returns the center of a grid cell as a GeoPoint.
73 |    *
74 |    * @param gridCellId The grid cell.
75 |    *
76 |    * @return The cell's center as a GeoPoint.
77 |    */
78 |   def getGridCellCenter(gridCellId: Int): GeoPoint = {
79 |     val xIndex: Int = gridCellId % CellCntX
80 |     val lon = (Math.abs(LonWest) - (xIndex * DeltaLon) - (DeltaLon / 2)).toFloat * -1.0f
81 | 
82 |     val yIndex: Int = (gridCellId - xIndex) / CellCntX
83 |     val lat = (LatNorth - (yIndex * DeltaLat) - (DeltaLat / 2)).toFloat
84 | 
85 |     new GeoPoint(lon, lat)
86 |   }
87 | 
88 | }
89 | 
--------------------------------------------------------------------------------
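As a quick sanity check of the grid arithmetic (the coordinates are picked for illustration and are not part of the repository; GeoPoint(lon, lat) is assumed as in getGridCellCenter above):

  // a point near Times Square: lon = -73.9857, lat = 40.758
  val cell = NycGeoUtils.mapToGridCell(new GeoPoint(-73.9857f, 40.758f))
  // xIndex = floor((74.05 - 73.9857) / 0.0014)  = 45
  // yIndex = floor((41.0  - 40.758)  / 0.00125) = 193
  // cell   = 45 + 193 * 250 = 48295
  val center = NycGeoUtils.getGridCellCenter(cell)  // center of that roughly 100 m x 100 m cell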
/src/main/scripts/convertTrips.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Copyright 2015 data Artisans GmbH
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | # This script converts a public data set about taxi rides in New York City into a data set of
18 | # taxi ride events. The original data is published by the NYC Taxi and Limousine Commission (TLC) at
19 | #
20 | # http://www.nyc.gov/html/tlc/html/about/trip_record_data.shtml
21 | #
22 | # The script parses CSV input and converts each line (which represents a taxi trip) into two
23 | # taxi ride events: a start event and an end event.
24 | # The resulting events have the following schema:
25 | #   rideId: Long        // unique id for each ride
26 | #   time: DateTime      // timestamp of the start/end event
27 | #   isStart: Boolean    // true = ride start, false = ride end
28 | #   lon: Float          // longitude of pick-up/drop-off location
29 | #   lat: Float          // latitude of pick-up/drop-off location
30 | #   passengerCnt: Int   // number of passengers
31 | #   travelDist: Float   // total travel distance, -1 on start events
32 | #
33 | # Usage: pipe the raw CSV data into this script.
34 | 
35 | tr -d '\r' | awk -F "," '{print NR "," $2 ",START," $6 "," $7 "," $4 ",-1"}{print NR "," $3 ",END," $10 "," $11 "," $4 "," $5}' | sort -t ',' -k 2 -S "2G" < /dev/stdin
--------------------------------------------------------------------------------
/tools/maven/checkstyle.xml:
--------------------------------------------------------------------------------