├── .gitignore
├── Dockerfile
├── LICENSE
├── README.md
├── VERSION
├── config
│   ├── drivers
│   │   ├── RedshiftJDBC42-1.2.7.1003.jar
│   │   ├── mysql-connector-java-5.1.44-bin.jar
│   │   └── postgresql-42.1.4.jar
│   └── pipeline
│       ├── sample-advanced-filter-csv.conf
│       ├── sample-csv-api-header.conf
│       ├── sample-csv-api-noheader.conf
│       ├── sample-csv-csv-noheader.conf
│       ├── sample-multi-csv-csv-noheader.conf
│       └── sample-tab-api-header.conf
├── convert.py
├── datastash-sftpscp.png
├── datastash.png
├── docker-entrypoint.sh
└── logstash.yml
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Application specific files
2 | /approot
3 | .sass-cache
4 | *.log
5 | *.pwd
6 | *.salt.txt
7 | *.userhash.txt
8 | prod*
9 |
10 | # Folder view configuration files
11 | *.DS_Store
12 | .AppleDouble
13 | .LSOverride
14 | Desktop.ini
15 | *cipher*
16 |
17 | # Icon must end with two \r
18 | Icon
19 |
20 | # Compiled Python files
21 | *.pyc
22 |
23 | # Compiled C++ files
24 | *.out
25 | # Thumbnails
26 | ._*
27 | Thumbs.db
28 |
29 | # Files that might appear in the root of a volume
30 | .DocumentRevisions-V100
31 | .fseventsd
32 | .Spotlight-V100
33 | .TemporaryItems
34 | .Trashes
35 | .VolumeIcon.icns
36 | .com.apple.timemachine.donotpresent
37 |
38 | # Directories potentially created on remote AFP share
39 | .AppleDB
40 | .AppleDesktop
41 | Network Trash Folder
42 | Temporary Items
43 | .apdisk
44 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM openjdk:8-jre-alpine
2 |
3 | RUN addgroup -S logstash && adduser -S -G logstash logstash
4 |
5 | RUN apk add --no-cache \
6 | bash \
7 | curl \
8 | libc6-compat \
9 | libzmq
10 |
11 | RUN apk add --no-cache 'su-exec>=0.2'
12 |
13 | ENV LOGSTASH_PATH /usr/share/logstash/bin
14 | ENV PATH $LOGSTASH_PATH:$PATH
15 |
16 | ENV LOGSTASH_VERSION 6.2.3
17 | ENV LOGSTASH_TARBALL="https://artifacts.elastic.co/downloads/logstash/logstash-${LOGSTASH_VERSION}.tar.gz" \
18 | LOGSTASH_TARBALL_ASC="https://artifacts.elastic.co/downloads/logstash/logstash-${LOGSTASH_VERSION}.tar.gz.asc" \
19 | LOGSTASH_TARBALL_SHA512="a553e800665b7ccc1a6f30b49fa0d336526c8f01144751dbe617b33b38595f121f6d8b4c43e8b2f5b648bc283fc839f035c816c696d8ecccc3a93a4bb2a329c7" \
20 | GPG_KEY="46095ACC8548582C1A2699A9D27D666CD88E42B4"
21 |
22 | RUN set -ex; \
23 | \
24 | \
25 | apk add --no-cache --virtual .fetch-deps \
26 | ca-certificates \
27 | gnupg \
28 | openssl \
29 | libc6-compat \
30 | tar \
31 | ; \
32 | \
33 | wget -O logstash.tar.gz "$LOGSTASH_TARBALL"; \
34 | \
35 | if [ "$LOGSTASH_TARBALL_SHA512" ]; then \
36 | echo "$LOGSTASH_TARBALL_SHA512 *logstash.tar.gz" | sha512sum -c -; \
37 | fi; \
38 | \
39 | if [ "$LOGSTASH_TARBALL_ASC" ]; then \
40 | wget --progress=bar:force -O logstash.tar.gz.asc "$LOGSTASH_TARBALL_ASC"; \
41 | export GNUPGHOME="$(mktemp -d)"; \
42 | ( gpg --keyserver ha.pool.sks-keyservers.net --recv-keys "$GPG_KEY" \
43 | || gpg --keyserver pgp.mit.edu --recv-keys "$GPG_KEY" \
44 | || gpg --keyserver keyserver.pgp.com --recv-keys "$GPG_KEY" ); \
45 | gpg --batch --verify logstash.tar.gz.asc logstash.tar.gz; \
46 | rm -rf "$GNUPGHOME" logstash.tar.gz.asc || true; \
47 | fi; \
48 | \
49 | dir="$(dirname "$LOGSTASH_PATH")"; \
50 | \
51 | mkdir -p "$dir"; \
52 | tar -xf logstash.tar.gz --strip-components=1 -C "$dir"; \
53 | rm logstash.tar.gz; \
54 | \
55 | apk del .fetch-deps; \
56 | \
57 | export LS_SETTINGS_DIR="$dir/config"; \
58 | if [ -f
"$LS_SETTINGS_DIR/log4j2.properties" ]; then \ 59 | cp "$LS_SETTINGS_DIR/log4j2.properties" "$LS_SETTINGS_DIR/log4j2.properties.dist"; \ 60 | truncate -s 0 "$LS_SETTINGS_DIR/log4j2.properties"; \ 61 | fi; \ 62 | \ 63 | for userDir in \ 64 | "$dir/config" \ 65 | "$dir/data" \ 66 | ; do \ 67 | if [ -d "$userDir" ]; then \ 68 | chown -R logstash:logstash "$userDir"; \ 69 | fi; \ 70 | done; \ 71 | \ 72 | /usr/share/logstash/bin/logstash-plugin install logstash-filter-i18n; \ 73 | logstash --version 74 | 75 | COPY docker-entrypoint.sh / 76 | COPY logstash.yml /usr/share/logstash/config 77 | COPY config/pipeline /usr/share/logstash/pipeline 78 | COPY config/drivers /drivers 79 | 80 | ENTRYPOINT ["/docker-entrypoint.sh"] 81 | CMD ["-e", ""] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Openbridge, Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data Stash - Event API Client 2 | 3 | Data Stash is a `logstash` service than can ingest data from different data sources, transform them, and then send JSON output via HTTP to the Openbridge Events API. You can also store the outputs into other formats such as CSV. 4 | 5 | ![Data Stash](https://raw.githubusercontent.com/openbridge/ob_datastash/master/datastash.png "How It Works") 6 | 7 | # Why Data Stash? 8 | 9 | Data Stash can perform some magic by automatically processing, cleaning, encoding and streaming contents of one or more CSVs directly to our API. Once it arrives at our API we automatically route all the data to a destination table in your data warehouse. Since CSV files can be a bit messy we have pre-packaged processing configurations that turn those old files into first class data sources. Here are a few of the standard operations we have defined: 10 | 11 | - Exclude columns resident in a CSV (e.g., remove/drop the userID, email address and social security columns) from the output 12 | - Replace non-ASCII characters with an ASCII approximation, or if none exists, a replacement character which defaults to ? 
13 | - Remove extraneous white space from records in target columns
14 | - Strip backslashes, question marks, equals signs, hashes, minuses, or other characters from the target columns
15 | - Set a desired data type of a given column and have it transform records to meet that type
16 | - Set everything to lowercase
17 | - Proper UTF-8 encoding of the data
18 | - Mask sensitive data with security "hashes" for one or more fields
19 | - Add new fields, such as IDs or concatenations of other columns, which can replace the contents of a column or store the results in a new field that is appended to the CSV
20 |
21 | ## Quick Start Sample Config Files
22 | For reference, sample configs can be found in the [`/config/pipeline`](config/pipeline) folder of this repo.
23 |
24 | - **CSV to API**: For CSV files with header rows, use [`sample-csv-api-header.conf`](config/pipeline/sample-csv-api-header.conf)
25 | - **CSV to API**: For CSV files without header rows, use [`sample-csv-api-noheader.conf`](config/pipeline/sample-csv-api-noheader.conf)
26 | - **CSV to CSV**: To process one CSV to generate a clean processed CSV, use [`sample-csv-csv-noheader.conf`](config/pipeline/sample-csv-csv-noheader.conf)
27 | - **Multiple CSV Inputs to Multiple CSV Outputs**: To process multiple CSV files to generate multiple clean CSV files, use [`sample-multi-csv-csv-noheader.conf`](config/pipeline/sample-multi-csv-csv-noheader.conf)
28 |
29 |
30 | # Install
31 |
32 | Data Stash is neatly packaged into a Docker image so you can run it on your local laptop or deploy it to a server. The first step is to build or pull the image:
33 |
34 | ```docker
35 | docker build -t openbridge/ob_datastash .
36 | ```
37 |
38 | or simply pull it from Docker Hub:
39 |
40 | ```docker
41 | docker pull openbridge/ob_datastash:latest
42 | ```
43 |
44 | Once you have your image you are ready to get started!
45 |
46 | # Getting Started: How To Stream CSV Files
47 | Data Stash is built on the premise of inputs, filters, and outputs:
48 |
49 | - **Inputs**: Your data sources. Primarily this will be a CSV file, but it can be many other sources.
50 | - **Filters**: Pre-processing applied to your data prior to delivery to an output location
51 | - **Outputs**: There are a few output options, but the principal one is the Openbridge Webhook API
52 |
53 | Data Stash can take a CSV file and break each row into a streamed JSON "event". These JSON events are delivered to an Openbridge API for import into your target warehouse.
54 |
55 | There are a couple of CSV file use cases:
56 |
57 | - **Static Files**: You have exports from a system that you want to load to your data warehouse. Data Stash will process the exported source file and stream the content of the file until it reaches the end.
58 | - **Dynamic Files**: You have a file that continually has new rows added. Data Stash will process changing files and stream new events as they are appended to a file.
59 |
60 | For our example walk-through we use a static CSV file called `sales.csv`.
61 |
62 | ## `sales.csv` Needs A Data Stash Configuration File
63 |
64 | To run Data Stash for `sales.csv` you need to define a config file. Each config file is made up of three parts: input, filter, and output. A config file describes how Data Stash should process your `sales.csv` file.
65 |
66 | ### Step 1: Define Your Input
67 |
68 | Let's dig into your example `sales.csv`. The principal part of the input is setting the `path =>` to your file(s).
You will need to specify the path to the file you want to process, like this: `path => "/the/path/to/your/sales.csv"`. We are going to assume the file is located in a folder on your laptop here: `/Users/bob/csv/mysalesdata`.
69 |
70 | However, Data Stash has its own location where it references your data: a default directory called `/data`. What does this mean? In the Data Stash config you use `/data` in the file path. When you run Data Stash you tell it to map your laptop directory `/Users/bob/csv/mysalesdata` to `/data`. This means anything in your laptop directory will appear exactly the same way inside `/data`.
71 |
72 | See the "How To Run" section for more details on this mapping.
73 |
74 | ```bash
75 | input {
76 | file {
77 | path => "/data/sales.csv"
78 | start_position => "beginning"
79 | sincedb_path => "/dev/null"
80 | }
81 | }
82 | ```
83 |
84 | ### Step 2: Define Your Filter
85 |
86 | This is where you define a CSV filter. A basic filter is focused on setting the schema and removing system-generated columns.
87 |
88 | - The `separator => ","` defines the delimiter. Do not change this.
89 | - The removal of system-generated columns is done via `remove_field => [ "message", "host", "@timestamp", "@version", "path" ]`. Do not change this unless you want to remove other columns from your CSV file. For example, let's say you had a column called `userid`. You can add it like this: `remove_field => [ "message", "host", "@timestamp", "@version", "path", "userid" ]`. Now `userid` will be suppressed and not sent to Openbridge.
90 | - If your CSV file has a header row, then you can set `autodetect_column_names => "true"` and `autogenerate_column_names => "true"` to leverage those values when processing the file.
91 |
92 | ```bash
93 | filter {
94 | csv {
95 | separator => ","
96 | remove_field => [ "message", "host", "@timestamp", "@version", "path" ]
97 | autodetect_column_names => "true"
98 | autogenerate_column_names => "true"
99 | }
100 | }
101 | ```
102 |
103 | If your CSV does **not** have a header row, you need to provide that context about the source file yourself by supplying the column layout: `columns => ["Sku","Name","SearchKeywords","Main","Price","ID","Brands"]`. This list should align with the layout of the CSV file.
104 |
105 | ```bash
106 | filter {
107 | csv {
108 | separator => ","
109 | remove_field => [ "message", "host", "@timestamp", "@version", "path" ]
110 | columns => ["Sku","Name","SearchKeywords","Main","Price","ID","Brands"]
111 | }
112 | }
113 | ```
114 |
115 | #### Advanced Filtering
116 |
117 | Here is a more advanced filter. This performs pre-processing cleanup on the CSV file. For example, it will strip whitespace from columns, remove bad characters, convert a column to a different data type, and so forth.
118 |
119 | ```bash
120 |
121 | filter {
122 |
123 | # The CSV filter takes an event field containing CSV data,
124 | # parses it, and stores it as individual fields (can optionally specify the names).
125 | # This filter can also parse data with any separator, not just commas.
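# (Illustration, not part of the original sample: a pipe-delimited export could
# instead set separator => "|", and a tab-delimited export could set separator => "\t",
# as in config/pipeline/sample-tab-api-header.conf.)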
126 |
127 | csv {
128 | # Set the comma delimiter
129 | separator => ","
130 |
131 | # We want to exclude these system columns
132 | remove_field => [
133 | "message",
134 | "host",
135 | "@timestamp",
136 | "@version",
137 | "path"
138 | ]
139 |
140 | # Define the layout of the input file
141 | columns => [
142 | "Sku","Name","SearchKeywords","Main","Price","ID","Brands"
143 | ]
144 | }
145 |
146 | # The mutate filter allows you to perform general
147 | # mutations on fields. You can rename, remove, replace
148 | # and modify fields in your events
149 |
150 | # We need to set the target column to "string" to allow for find and replace
151 | mutate {
152 | convert => [ "Sku", "string" ]
153 | }
154 |
155 | # Strip backslashes, question marks, equals, hashes, and minuses from the target column
156 | mutate {
157 | gsub => [ "Sku", "[\\?#=]", "" ]
158 | }
159 |
160 | # Strip extraneous white space from records
161 | mutate {
162 | strip => [ "Sku","Name","SearchKeywords","Main","Price","ID","Brands"
163 | ]
164 | }
165 |
166 | # Set everything to lowercase
167 | mutate {
168 | lowercase => [ "Sku","Name","SearchKeywords","Main","Price","ID","Brands"
169 | ]
170 | }
171 | }
172 | ```
173 |
174 | ### Step 3: Define Your Output Destination
175 |
176 | The output defines the delivery location for all the records in your CSV(s). Openbridge generates a private API endpoint which you use in the `url => ""` setting. The delivery URL would look like this: `url => "https://myapi.foo-api.us-east-1.amazonaws.com/dev/events/teststash?token=774f77b389154fd2ae7cb5131201777&sign=ujguuuljNjBkFGHyNTNmZTIxYjEzMWE5MjgyNzM1ODQ="`
177 |
178 | You would take the Openbridge-provided endpoint and put it into the config:
179 |
180 | ```bash
181 | output {
182 | http {
183 | url => "https://myapi.foo-api.us-east-1.amazonaws.com/dev/events/teststash?token=774f77b389154fd2ae7cb5131201777&sign=ujguuuljNjBkFGHyNTNmZTIxYjEzMWE5MjgyNzM1ODQ="
184 | http_method => "post"
185 | format => "json"
186 | pool_max => "10"
187 | pool_max_per_route => "5"
188 | }
189 | }
190 | ```
191 |
192 | **Note**: Do not change `http_method => "post"`, `format => "json"`, `pool_max => "10"`, `pool_max_per_route => "5"` from the defaults listed in the config.
193 |
194 | You can also store the data to a CSV file (instead of sending it to an API). This might be useful to test or validate your data prior to using the API. It also might be useful if you want to create a CSV for upload to Openbridge via SFTP or SCP.
195 |
196 | ```bash
197 | output {
198 |
199 | # Saving output to CSV so we define the layout of the file
200 | csv {
201 | fields => [ "Sku","Name","SearchKeywords","Main","Price","ID","Brands" ]
202 |
203 | # Where do you want to export the file
204 | path => "/data/foo.csv"
205 | }
206 | }
207 | ```
208 |
209 | You need to reach out to your Openbridge team so they can provision your private API for you.
210 |
211 | ### Step 4: Save Your Config
212 |
213 | You will want to store your configs in an easy-to-remember location. You should also name the config in a manner that reflects the data resident in the CSV file. Since we are using `sales.csv` we saved our config like this: `/Users/bob/datastash/configs/sales.conf`. We will need to reference this config location in the next section.
214 |
215 | The final config will look something like this:
216 |
217 | ```bash
218 | ####################################
219 | # An input enables a specific source of
220 | # events to be read by Logstash.
221 | ####################################
222 |
223 | input {
224 | file {
225 | # Set the path to the source file(s)
226 | path => "/data/sales.csv"
227 | start_position => "beginning"
228 | sincedb_path => "/dev/null"
229 | }
230 | }
231 |
232 | ####################################
233 | # A filter performs intermediary processing on an event.
234 | # Filters are often applied conditionally depending on the
235 | # characteristics of the event.
236 | ####################################
237 |
238 | filter {
239 |
240 | csv {
241 |
242 | # The CSV filter takes an event field containing CSV data,
243 | # parses it, and stores it as individual fields (can optionally specify the names).
244 | # This filter can also parse data with any separator, not just commas.
245 |
246 | # Set the comma delimiter
247 | separator => ","
248 |
249 | # We want to exclude these system columns
250 | remove_field => [
251 | "message", "host", "@timestamp", "@version", "path"
252 | ]
253 |
254 | # Define the layout of the input file
255 | columns => [
256 | "Sku","Name","SearchKeywords","Main","Price","ID","Brands"
257 | ]
258 | }
259 |
260 | # The mutate filter allows you to perform general
261 | # mutations on fields. You can rename, remove, replace
262 | # and modify fields in your events
263 |
264 | # We need to set the target column to "string" to allow for find and replace
265 | mutate {
266 | convert => [ "Sku", "string" ]
267 | }
268 |
269 | # Find and remove backslashes, question marks, equals and hashes from the target column. These are characters we do not want in our column
270 | mutate {
271 | gsub => [ "Sku", "[\\?#=]", "" ]
272 | }
273 |
274 | # Strip extraneous white space from records
275 | mutate {
276 | strip => [ "Sku","Name","SearchKeywords","Main","Price","ID","Brands"
277 | ]
278 | }
279 |
280 | # Set everything to lowercase
281 | mutate {
282 | lowercase => [ "Sku","Name","SearchKeywords","Main","Price","ID","Brands"
283 | ]
284 | }
285 | }
286 |
287 | ####################################
288 | # An output sends event data to a particular
289 | # destination. Outputs are the final stage in the
290 | # event pipeline.
291 | ####################################
292 |
293 | output
294 | {
295 | # Sending the contents of the file to the event API
296 | http
297 | {
298 | # Put the URL for your HTTP endpoint to deliver events to
299 | url => "https://myapi.foo-api.us-east-1.amazonaws.com/dev/events/teststash?token=774f77b389154fd2ae7cb5131201777&sign=ujguuuljNjBkFGHyNTNmZTIxYjEzMWE5MjgyNzM1ODQ="
300 | # Leave the settings below untouched.
301 | http_method => "post"
302 | format => "json"
303 | pool_max => "10"
304 | pool_max_per_route => "5"
305 | }
306 | }
307 | ```
308 |
309 | # How To Run
310 |
311 | With your `sales.csv` config file saved to `/Users/bob/datastash/configs/sales.conf` you are ready to stream your data!
312 |
313 | There are two things that Data Stash needs to be told in order to run:
314 |
315 | 1. Where to find your source CSV file (`/Users/bob/csv/mysalesdata`)
316 | 2. The location of the config file (`/Users/bob/datastash/configs`)
317 |
318 | You tell Data Stash where the file and config are via the `-v` (`--volume`) flag in Docker. In our example your CSV is located on your laptop in this folder: `/Users/bob/csv/mysalesdata`. This means we put that path into the first `-v` command. Internally Data Stash defaults to `/data` so you can leave that untouched.
It should look like this:
319 |
320 | ```bash
321 | -v /Users/bob/csv/mysalesdata:/data
322 | ```
323 |
324 | In our example you also saved your config file on your laptop here: `/Users/bob/datastash/configs`. Data Stash defaults to looking for configs in `/config/pipeline`, so you can leave that untouched:
325 |
326 | ```bash
327 | -v /Users/bob/datastash/configs:/config/pipeline
328 | ```
329 |
330 | Lastly, we put it all together so we can tell Data Stash to stream the file. Here is the command to run our Docker-based Data Stash image:
331 |
332 | ```bash
333 | docker run -it --rm \
334 | -v /Users/bob/csv/mysalesdata:/data \
335 | -v /Users/bob/datastash/configs:/config/pipeline \
336 | openbridge/ob_datastash \
337 | datastash -f /config/pipeline/sales.conf
338 | ```
339 | # Performance
340 | If you are processing very large CSV files that have millions of records, this approach can take a while to complete. Depending on the complexity of the filters, you can expect about 1,000 to 3,000 events (i.e., rows) processed per minute. At those rates a CSV with 1,000,000 rows might take anywhere from roughly 5.5 hours (at 3,000 rows per minute) to nearly 17 hours (at 1,000 rows per minute) to complete.
341 |
342 | We limit requests to 100 per second, so the maximum number of events possible in a minute is 6,000. Even at that ceiling, processing a 1M-record CSV file would take close to 3 hours (1,000,000 rows at 6,000 rows per minute is about 167 minutes).
343 |
344 | You might want to explore using the Openbridge SFTP or SCP options for processing larger files.
345 |
346 | # Notes
347 |
348 | ## Processing A Folder Of CSV Files
349 |
350 | In the example below we use a wildcard `*.csv` to specify processing all sales CSV files in the directory.
351 |
352 | `path => "/the/path/to/your/*.csv"`
353 |
354 | For example, if you had files called `sales.csv`, `sales002.csv`, and `sales-allyear.csv`, using the wildcard `*.csv` will process all of them.
355 |
356 | Please note that using `*.csv` assumes all files have the same structure/layout. If they do not, you may be streaming disjointed data sets, which will likely fail when it comes time to load data into your warehouse.
357 |
358 |
359 |
360 | # Versioning
361 |
362 | Docker Tag | GitHub Release | Logstash | Alpine Version
363 | ---------- | --------------- | -------- | --------------
364 | latest | Master | 6.2.3 | latest
365 |
366 | # Reference
367 |
368 | At the heart of Data Stash is [Logstash](https://www.elastic.co/products/logstash). For a deeper dive into the capabilities of Logstash, check out their [documentation](https://www.elastic.co/guide/en/logstash/current/index.html). Logstash is pretty cool and can do a lot more than just process CSV files.
369 |
370 | CSV files should follow RFC 4180 standards/guidance to ensure success with processing.
371 |
372 |
373 |
374 |
375 | This image is used to run your data streaming inside Docker. If you don't know what Docker is, read "[What is Docker?](https://www.docker.com/what-docker)". Once you have a sense of what Docker is, you can then install the software. It is free: "[Get Docker](https://www.docker.com/products/docker)". Select the Docker package that aligns with your environment (i.e., OS X, Linux or Windows). If you have not used Docker before, take a look at the guides:
376 |
377 | - [Engine: Get Started](https://docs.docker.com/engine/getstarted/)
378 | - [Docker Mac](https://docs.docker.com/docker-for-mac/)
379 | - [Docker Windows](https://docs.docker.com/docker-for-windows/)
380 |
381 | # TODO
382 |
383 | - Create more sample configs, including complex wrangling examples.
384 | 385 | # Issues 386 | 387 | If you have any problems with or questions about this image, please contact us through a GitHub issue. 388 | 389 | # Contributing 390 | 391 | You are invited to contribute new features, fixes, or updates, large or small; we are always thrilled to receive pull requests, and do our best to process them as fast as we can. 392 | 393 | Before you start to code, we recommend discussing your plans through a GitHub issue, especially for more ambitious contributions. This gives other contributors a chance to point you in the right direction, give you feedback on your design, and help you find out if someone else is working on the same thing. 394 | 395 | # License 396 | 397 | This project is licensed under the MIT License 398 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | 5.5.2 2 | -------------------------------------------------------------------------------- /config/drivers/RedshiftJDBC42-1.2.7.1003.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openbridge/ob_datastash/76d53980045dfdc8f2696bc8d8da38c10b4ef091/config/drivers/RedshiftJDBC42-1.2.7.1003.jar -------------------------------------------------------------------------------- /config/drivers/mysql-connector-java-5.1.44-bin.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openbridge/ob_datastash/76d53980045dfdc8f2696bc8d8da38c10b4ef091/config/drivers/mysql-connector-java-5.1.44-bin.jar -------------------------------------------------------------------------------- /config/drivers/postgresql-42.1.4.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openbridge/ob_datastash/76d53980045dfdc8f2696bc8d8da38c10b4ef091/config/drivers/postgresql-42.1.4.jar -------------------------------------------------------------------------------- /config/pipeline/sample-advanced-filter-csv.conf: -------------------------------------------------------------------------------- 1 | ####################### INPUT ############################ 2 | # An input enables a specific source of 3 | # events to be read by Logstash. 4 | ########################################################## 5 | 6 | input { 7 | file { 8 | type => "set-type-as-your-table-name" 9 | path => "/data/*.csv" 10 | start_position => "beginning" 11 | sincedb_path => "/dev/null" 12 | } 13 | } 14 | 15 | ####################### FILTER ########################### 16 | # A filter performs intermediary processing on an event. 17 | # Filters are often applied conditionally depending on the 18 | # characteristics of the event. 19 | ########################################################## 20 | 21 | filter { 22 | 23 | # The CSV filter takes an event field containing CSV data, 24 | # parses it, and stores it as individual fields (can optionally specify the names). 25 | # This filter can also parse data with any separator, not just commas. 
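# (Note, added for clarity: autodetect_column_names below treats the first row of each
# CSV as the header, and the add_field entries use the sprintf reference %{entryType},
# so this sample assumes the source files contain an entryType column.)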
26 | 27 | csv { 28 | separator => "," 29 | quote_char => '"' 30 | autodetect_column_names => "true" 31 | autogenerate_column_names => "true" 32 | 33 | add_field => { "entryname" => "%{entryType}" } 34 | add_field => { "revised_entry_type" => "" } 35 | } 36 | 37 | ############################## 38 | 39 | # Replaces non-ASCII characters with an ASCII approximation, or if none exists, a replacement character which defaults to ? 40 | i18n { transliterate => [ "RowType", "Status", "Syncerrors", "From", "To", "Advertiser", "Engine", "Account", "Campaign"," DMARegionname", "DMAregionID", "Clicks", "Cost" ] } 41 | 42 | ############################# 43 | 44 | # Create consistent hashes (fingerprints) of one or more fields and store the result in a new field. 45 | fingerprint { method => "SHA1" key => "MySecretKeyForMyEyesOnlyOk?" source => "DMAregionID" target => "DMAregionID" } 46 | 47 | ############################# 48 | 49 | mutate { gsub => [ "landingURL", "[\\?#=]", "" ] } 50 | mutate { gsub => [ "landingURL", "\"", "" ] } 51 | mutate { 52 | gsub => [ "entryname", "1", "Direct Traffic" ] 53 | gsub => [ "entryname", "2", "Clickthrough" ] 54 | gsub => [ "entryname", "3", "OLA Impression" ] 55 | gsub => [ "entryname", "4", "OLA Visit" ] 56 | gsub => [ "entryname", "5", "Referral Traffic" ] 57 | gsub => [ "entryname", "6", "Email" ] 58 | gsub => [ "entryname", "7", "Organic Search" ] 59 | gsub => [ "entryname", "8", "Paid Search" ] 60 | gsub => [ "entryname", "9", "Offline Driver" ] 61 | gsub => [ "entryname", "10", "Social" ] 62 | gsub => [ "entryname", "11", "Key Action" ] 63 | gsub => [ "entryname", "12", "Coreg" ] 64 | gsub => [ "entryname", "13", "Customer Center" ] 65 | gsub => [ "entryname", "14", "Video" ] 66 | gsub => [ "entryname", "15", "Social Impression" ] 67 | gsub => [ "entryname", "16", "Offsite Key Action" ] 68 | gsub => [ "entryname", "17", "Offsite Clickthrough" ] 69 | } 70 | 71 | if ("Email" in [utm_source] or "Email" in [utm_medium] or "Email" in [utm_campaign] or "Email" in [utm_term] or "Email" in [utm_content] or "Email" in [utm_brand]) or ("Enewsletter" in [utm_source] or "Enewsletter" in [utm_medium] or "Enewsletter" in [utm_campaign] or "Enewsletter" in [utm_term] or "Enewsletter" in [utm_content] or "Enewsletter" in [utm_brand]) or "eNL" in [utm_source] or "eNL" in [utm_medium] or "eNL" in [utm_campaign] or "eNL" in [utm_term] or "eNL" in [utm_content] or "eNL" in [utm_brand] { 72 | mutate { replace => {"revised_entry_type" => "Email"} } 73 | } 74 | 75 | mutate { lowercase => [ "id","createdDate","landingURL","landingURLNoQuery" ] } 76 | } 77 | 78 | ####################### OUTPUT ########################### 79 | # An output sends event data to a particular 80 | # destination. Outputs are the final stage in the 81 | # event pipeline. 82 | ########################################################## 83 | 84 | output { 85 | csv { 86 | fields => [ "id","createdDate","landingURL","landingURLNoQuery", "entryname", "revised_entry_type", "utm_source" ] 87 | path => "/data/foo2.csv" 88 | codec => plain { charset => 'UTF-8' } 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /config/pipeline/sample-csv-api-header.conf: -------------------------------------------------------------------------------- 1 | ####################### INPUT ############################ 2 | # An input enables a specific source of 3 | # events to be read by Logstash. 
4 | ########################################################## 5 | 6 | input { 7 | file { 8 | # Set the path to the source file(s) 9 | type => "set-type-as-your-table-name" 10 | path => "/data/sales.csv" 11 | start_position => "beginning" 12 | sincedb_path => "/dev/null" 13 | } 14 | } 15 | 16 | 17 | ####################### FILTER ########################### 18 | # A filter performs intermediary processing on an event. 19 | # Filters are often applied conditionally depending on the 20 | # characteristics of the event. 21 | ########################################################## 22 | 23 | filter { 24 | 25 | # The CSV filter takes an event field containing CSV data, 26 | # parses it, and stores it as individual fields (can optionally specify the names). 27 | # This filter can also parse data with any separator, not just commas. 28 | 29 | csv { 30 | 31 | # Set the tab delimiter 32 | separator => "\t" 33 | 34 | # We want to exclude these system columns 35 | remove_field => [ 36 | "message", 37 | "host", 38 | "@timestamp", 39 | "@version", 40 | "path" 41 | ] 42 | 43 | # We infer the layout based on the header 44 | autodetect_column_names => "true" 45 | autogenerate_column_names => "true" 46 | 47 | ############################## 48 | 49 | # Replaces non-ASCII characters with an ASCII approximation, or if none exists, a replacement character which defaults to ? 50 | i18n { transliterate => [ "Sku","Name","SearchKeywords","Main","Price","ID","Brands" ] } 51 | 52 | ############################# 53 | 54 | # Create consistent hashes (fingerprints) of one or more fields and store the result in a new field. 55 | fingerprint { method => "SHA1" key => "MySecretKeyForMyEyesOnlyOk?" source => "Brands" target => "Brands" } 56 | 57 | ############################# 58 | 59 | # The mutate filter allows you to perform general 60 | # mutations on fields. You can rename, remove, replace 61 | # and modify fields in your events 62 | 63 | # We need to set the target column to "string" to allow for find and replace 64 | mutate { 65 | convert => [ "Sku", "string" ] 66 | } 67 | 68 | # Strip backslashes, question marks, equals, hashes, and minuses from the target column 69 | mutate { 70 | gsub => [ "Sku", "[\\?#=]", "" ] 71 | } 72 | 73 | # Strip extraneous white space from records 74 | mutate { 75 | strip => [ 76 | "Sku","Name","SearchKeywords","Main","Price","ID","Brands" 77 | ] 78 | } 79 | 80 | # Set everything to lowercase 81 | mutate { 82 | lowercase => [ "Sku","Name","SearchKeywords","Main","Price","ID","Brands" 83 | ] 84 | } 85 | 86 | ############################# 87 | 88 | # Set rate limits. Do not change unless you know what you are doing 89 | sleep { time => "1" every => 100 } 90 | 91 | } 92 | 93 | 94 | ####################### OUTPUT ########################### 95 | # An output sends event data to a particular 96 | # destination. Outputs are the final stage in the 97 | # event pipeline. 98 | ########################################################## 99 | 100 | output 101 | { 102 | # Sending the contents of the file to the event API 103 | http 104 | { 105 | # Put the URL for your HTTP endpoint to deliver events to 106 | url => "https://myapi.foo-api.us-east-1.amazonaws.com/dev/events/teststash?token=774f77b389154fd2ae7cb5131201777&sign=ujguuuljNjBkFGHyNTNmZTIxYjEzMWE5MjgyNzM1ODQ=" 107 | 108 | # Leave the settings below untouched. 
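# (For reference: pool_max and pool_max_per_route cap the HTTP connection pool,
# automatic_retries re-sends requests that fail at the connection level, and
# validate_after_inactivity re-checks idle connections before they are reused.)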
109 | http_method => "post" 110 | format => "json" 111 | keepalive => "true" 112 | automatic_retries => 1 113 | validate_after_inactivity => "45" 114 | pool_max => "10" 115 | pool_max_per_route => "5" 116 | codec => plain { charset => 'UTF-8' } 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /config/pipeline/sample-csv-api-noheader.conf: -------------------------------------------------------------------------------- 1 | ####################### INPUT ############################ 2 | # An input enables a specific source of 3 | # events to be read by Logstash. 4 | ########################################################## 5 | 6 | input { 7 | file { 8 | # Set the path to the source file(s) 9 | type => "set-type-as-your-table-name" 10 | path => "/data/sales.csv" 11 | start_position => "beginning" 12 | sincedb_path => "/dev/null" 13 | } 14 | } 15 | 16 | ####################### FILTER ########################### 17 | # A filter performs intermediary processing on an event. 18 | # Filters are often applied conditionally depending on the 19 | # characteristics of the event. 20 | ########################################################## 21 | 22 | filter { 23 | 24 | # The CSV filter takes an event field containing CSV data, 25 | # parses it, and stores it as individual fields (can optionally specify the names). 26 | # This filter can also parse data with any separator, not just commas. 27 | 28 | csv { 29 | 30 | # Set the comma delimiter 31 | separator => "," 32 | 33 | # We want to exclude these system columns 34 | remove_field => [ 35 | "message", 36 | "host", 37 | "@timestamp", 38 | "@version", 39 | "path" 40 | ] 41 | 42 | # Define the layout of the input file 43 | columns => [ "Sku","Name","SearchKeywords","Main","Price","ID","Brands" ] 44 | } 45 | 46 | ############################## 47 | 48 | # Replaces non-ASCII characters with an ASCII approximation, or if none exists, a replacement character which defaults to ? 49 | i18n { transliterate => [ "Sku","Name","SearchKeywords","Main","Price","ID","Brands" ] } 50 | 51 | ############################# 52 | 53 | # Create consistent hashes (fingerprints) of one or more fields and store the result in a new field. 54 | fingerprint { method => "SHA1" key => "MySecretKeyForMyEyesOnlyOk?" source => "Brands" target => "Brands" } 55 | 56 | ############################# 57 | 58 | # The mutate filter allows you to perform general 59 | # mutations on fields. You can rename, remove, replace 60 | # and modify fields in your events 61 | 62 | # We need to set the target column to "string" to allow for find and replace 63 | mutate { 64 | convert => [ "Sku", "string" ] 65 | } 66 | 67 | # Strip backslashes, question marks, equals, hashes, and minuses from the target column 68 | mutate { 69 | gsub => [ "Sku", "[\\?#=]", "" ] 70 | } 71 | 72 | # Strip extraneous white space from records 73 | mutate { 74 | strip => [ [ "Sku","Name","SearchKeywords","Main","Price","ID","Brands" 75 | ] 76 | } 77 | 78 | # Set everything to lowercase 79 | mutate { 80 | lowercase => [ [ "Sku","Name","SearchKeywords","Main","Price","ID","Brands" 81 | ] 82 | } 83 | 84 | ############################# 85 | 86 | # Set rate limits. Do not change unless you know what you are doing 87 | sleep { time => "1" every => 100 } 88 | 89 | } 90 | 91 | ####################### OUTPUT ########################### 92 | # An output sends event data to a particular 93 | # destination. Outputs are the final stage in the 94 | # event pipeline. 
95 | ########################################################## 96 | 97 | output 98 | { 99 | http 100 | { 101 | # Put the URL for your HTTP endpoint to deliver events to 102 | url => "https://myapi.foo-api.us-east-1.amazonaws.com/dev/events/teststash?token=774f77b389154fd2ae7cb5131201777&sign=ujguuuljNjBkFGHyNTNmZTIxYjEzMWE5MjgyNzM1ODQ=" 103 | 104 | # Leave the settings below untouched. 105 | http_method => "post" 106 | format => "json" 107 | keepalive => "true" 108 | automatic_retries => 1 109 | validate_after_inactivity => "45" 110 | pool_max => "10" 111 | pool_max_per_route => "5" 112 | codec => plain { charset => 'UTF-8' } 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /config/pipeline/sample-csv-csv-noheader.conf: -------------------------------------------------------------------------------- 1 | ####################### INPUT ############################ 2 | # An input enables a specific source of 3 | # events to be read by Logstash. 4 | ########################################################## 5 | 6 | input { 7 | file { 8 | # Set the path to the source file(s) 9 | type => "set-type-as-your-table-name" 10 | path => "/data/sales.csv" 11 | start_position => "beginning" 12 | sincedb_path => "/dev/null" 13 | } 14 | } 15 | 16 | ####################### FILTER ########################### 17 | # A filter performs intermediary processing on an event. 18 | # Filters are often applied conditionally depending on the 19 | # characteristics of the event. 20 | ########################################################## 21 | 22 | filter { 23 | 24 | # The CSV filter takes an event field containing CSV data, 25 | # parses it, and stores it as individual fields (can optionally specify the names). 26 | # This filter can also parse data with any separator, not just commas. 27 | 28 | csv { 29 | # Set the comma delimiter 30 | separator => "," 31 | 32 | # We want to exclude these system columns 33 | remove_field => [ 34 | "message", 35 | "host", 36 | "@timestamp", 37 | "@version", 38 | "path" 39 | ] 40 | 41 | # Define the layout of the input file 42 | columns => [ "Sku","Name","SearchKeywords","Main","Price","ID","Brands" 43 | ] 44 | } 45 | } 46 | 47 | ############################## 48 | 49 | # Replaces non-ASCII characters with an ASCII approximation, or if none exists, a replacement character which defaults to ? 50 | i18n { transliterate => [ "Sku","Name","SearchKeywords","Main","Price","ID","Brands" ] } 51 | 52 | ############################# 53 | 54 | # Create consistent hashes (fingerprints) of one or more fields and store the result in a new field. 55 | fingerprint { method => "SHA1" key => "MySecretKeyForMyEyesOnlyOk?" source => "Brands" target => "Brands" } 56 | 57 | ############################# 58 | 59 | # The mutate filter allows you to perform general 60 | # mutations on fields. 
You can rename, remove, replace 61 | # and modify fields in your events 62 | 63 | # We need to set the target column to "string" to allow for find and replace 64 | mutate { 65 | convert => [ "Sku", "string" ] 66 | } 67 | 68 | # Strip backslashes, question marks, equals, hashes, and minuses from the target column 69 | mutate { 70 | gsub => [ "Sku", "[\\?#=]", "" ] 71 | } 72 | 73 | # Strip extraneous white space from records 74 | mutate { 75 | strip => [ 76 | "Sku","Name","SearchKeywords","Main","Price","ID","Brands" 77 | ] 78 | } 79 | 80 | # Set everything to lowercase 81 | mutate { 82 | lowercase => [ "Sku","Name","SearchKeywords","Main","Price","ID","Brands" 83 | ] 84 | } 85 | } 86 | 87 | 88 | ####################### OUTPUT ########################### 89 | # An output sends event data to a particular 90 | # destination. Outputs are the final stage in the 91 | # event pipeline. 92 | ########################################################## 93 | 94 | output { 95 | 96 | # Saving output to CSV so we define the layout of the file 97 | csv { 98 | fields => [ "Sku","Name","SearchKeywords","Main","Price","ID","Brands" 99 | ] 100 | 101 | # Where do you want to export the file 102 | path => "/data/foo2.csv" 103 | codec => plain { charset => 'UTF-8' } 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /config/pipeline/sample-multi-csv-csv-noheader.conf: -------------------------------------------------------------------------------- 1 | ####################### INPUT ############################ 2 | # An input enables a specific source of 3 | # events to be read by Logstash. 4 | ########################################################## 5 | 6 | input { 7 | file { 8 | type => "sales" 9 | path => "/data/sales/sales.csv" 10 | start_position => "beginning" 11 | sincedb_path => "/dev/null" 12 | } 13 | file { 14 | type => "orders" 15 | path => "/data/orders/orders.csv" 16 | start_position => "beginning" 17 | sincedb_path => "/dev/null" 18 | } 19 | 20 | } 21 | 22 | ####################### FILTER ########################### 23 | # A filter performs intermediary processing on an event. 24 | # Filters are often applied conditionally depending on the 25 | # characteristics of the event. 26 | ########################################################## 27 | 28 | filter { 29 | 30 | # The CSV filter takes an event field containing CSV data, 31 | # parses it, and stores it as individual fields (can optionally specify the names). 32 | # This filter can also parse data with any separator, not just commas. 33 | 34 | csv { 35 | 36 | # Set the comma delimiter 37 | separator => "," 38 | 39 | # We want to exclude these system columns 40 | remove_field => [ 41 | "message", 42 | "host", 43 | "@timestamp", 44 | "@version", 45 | "path" 46 | ] 47 | 48 | # Define the layout of the input file 49 | if [type] == "sales" { columns => [ "Sku","Name","SearchKeywords","Main","Price","ID","Brands" ] } 50 | if [type] == "orders" { columns => [ "Sku","Name","SearchKeywords","Main","Price","ID","Brands" ] } 51 | } 52 | 53 | ############################## 54 | 55 | # Replaces non-ASCII characters with an ASCII approximation, or if none exists, a replacement character which defaults to ? 56 | i18n { transliterate => [ "Sku","Name","SearchKeywords","Main","Price","ID","Brands" ] } 57 | 58 | ############################# 59 | 60 | # Create consistent hashes (fingerprints) of one or more fields and store the result in a new field. 
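# (Because source and target are both "Brands", the original value is overwritten
# with its SHA1 hash, which is how the masking works here. Point target at a new
# field name if you want to keep the original value as well.)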
61 | fingerprint { method => "SHA1" key => "MySecretKeyForMyEyesOnlyOk?" source => "Brands" target => "Brands" } 62 | 63 | ############################# 64 | 65 | # The mutate filter allows you to perform general 66 | # mutations on fields. You can rename, remove, replace 67 | # and modify fields in your events 68 | 69 | # We need to set the target column to "string" to allow for find and replace 70 | mutate { 71 | convert => [ "Sku", "string" ] 72 | } 73 | 74 | # Strip backslashes, question marks, equals, hashes, and minuses from the target column 75 | mutate { 76 | gsub => [ "Sku", "[\\?#=]", "" ] 77 | } 78 | 79 | # Strip extraneous white space from records 80 | mutate { 81 | strip => [ 82 | "Sku","Name","SearchKeywords","Main","Price","ID","Brands" 83 | ] 84 | } 85 | 86 | # Set everything to lowercase 87 | mutate { 88 | lowercase => [ "Sku","Name","SearchKeywords","Main","Price","ID","Brands" 89 | ] 90 | } 91 | } 92 | 93 | ####################### OUTPUT ########################### 94 | # An output sends event data to a particular 95 | # destination. Outputs are the final stage in the 96 | # event pipeline. 97 | ########################################################## 98 | 99 | output { 100 | 101 | # Saving output to CSV so we define the layout of the file 102 | 103 | if [type] == "sales" { 104 | csv { 105 | fields => [ "Sku","Name","SearchKeywords","Main","Price","ID","Brands" ] 106 | path => "/data/clean-sales.csv" 107 | codec => plain { charset => 'UTF-8' } 108 | } 109 | } 110 | if [type] == "orders" { 111 | csv { 112 | fields => [ "Sku","Name","SearchKeywords","Main","Price","ID","Brands" ] 113 | path => "/data/clean-orders.csv" 114 | codec => plain { charset => 'UTF-8' } 115 | } 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /config/pipeline/sample-tab-api-header.conf: -------------------------------------------------------------------------------- 1 | ####################### INPUT ############################ 2 | # An input enables a specific source of 3 | # events to be read by Logstash. 4 | ########################################################## 5 | 6 | input { 7 | file { 8 | # Set the path to the source file(s) 9 | type => "sales" 10 | path => "/data/sales.txt" 11 | start_position => "beginning" 12 | sincedb_path => "/dev/null" 13 | } 14 | } 15 | 16 | 17 | ####################### FILTER ########################### 18 | # A filter performs intermediary processing on an event. 19 | # Filters are often applied conditionally depending on the 20 | # characteristics of the event. 21 | ########################################################## 22 | 23 | filter { 24 | 25 | # The CSV filter takes an event field containing CSV data, 26 | # parses it, and stores it as individual fields (can optionally specify the names). 27 | # This filter can also parse data with any separator, not just commas. 28 | 29 | csv { 30 | # Set the comma delimiter 31 | separator => "\t" 32 | 33 | # We want to exclude these system columns 34 | remove_field => [ 35 | "message", 36 | "host", 37 | "@timestamp", 38 | "@version", 39 | "path" 40 | ] 41 | 42 | # We infer the layout based on the header 43 | autodetect_column_names => "true" 44 | autogenerate_column_names => "true" 45 | 46 | } 47 | 48 | ############################## 49 | 50 | # Replaces non-ASCII characters with an ASCII approximation, or if none exists, a replacement character which defaults to ? 
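# (For example, a value like "Crème brûlée" would be transliterated to "Creme brulee".)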
51 | i18n { transliterate => [ "Sku","Name","SearchKeywords","Main","Price","ID","Brands" ] } 52 | 53 | ############################# 54 | 55 | # Create consistent hashes (fingerprints) of one or more fields and store the result in a new field. 56 | fingerprint { method => "SHA1" key => "MySecretKeyForMyEyesOnlyOk?" source => "Brands" target => "Brands" } 57 | 58 | ############################# 59 | 60 | # The mutate filter allows you to perform general 61 | # mutations on fields. You can rename, remove, replace 62 | # and modify fields in your events 63 | 64 | 65 | # We need to set the target column to "string" to allow for find and replace 66 | mutate { 67 | convert => [ "Sku", "string" ] 68 | } 69 | 70 | # Strip backslashes, question marks, equals, hashes, and minuses from the target column 71 | mutate { 72 | gsub => [ "Sku", "[\\?#=]", "" ] 73 | } 74 | 75 | # Strip extraneous white space from records 76 | mutate { 77 | strip => [ 78 | "Sku","Name","SearchKeywords","Main","Price","ID","Brands" 79 | ] 80 | } 81 | 82 | # Set everything to lowercase 83 | mutate { 84 | lowercase => [ "Sku","Name","SearchKeywords","Main","Price","ID","Brands" 85 | ] 86 | } 87 | ############################# 88 | 89 | # Set rate limits. Do not change unless you know what you are doing 90 | sleep { time => "1" every => 100 } 91 | 92 | } 93 | 94 | 95 | ####################### OUTPUT ########################### 96 | # An output sends event data to a particular 97 | # destination. Outputs are the final stage in the 98 | # event pipeline. 99 | ########################################################## 100 | 101 | output 102 | { 103 | # Sending the contents of the file to the event API 104 | http 105 | { 106 | # Put the URL for your HTTP endpoint to deliver events to 107 | url => "https://myapi.foo-api.us-east-1.amazonaws.com/dev/events/teststash?token=774f77b389154fd2ae7cb5131201777&sign=ujguuuljNjBkFGHyNTNmZTIxYjEzMWE5MjgyNzM1ODQ=" 108 | 109 | # Leave the settings below untouched. 
110 | http_method => "post" 111 | format => "json" 112 | keepalive => "true" 113 | automatic_retries => 1 114 | validate_after_inactivity => "45" 115 | pool_max => "10" 116 | pool_max_per_route => "5" 117 | codec => plain { charset => 'UTF-8' } 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /convert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import pandas as pd 6 | 7 | args = sys.argv 8 | SOURCE_FILE = args[1] 9 | SOURCE_DELIMITER = args[2] if len(args) >= 2 else 'þ' 10 | DESTINATION_FILE = args[3] if len(args) >= 3 else args[1] + '.csv' 11 | 12 | df = pd.read_csv(SOURCE_FILE, delimiter=SOURCE_DELIMITER); 13 | df.to_csv(DESTINATION_FILE, index=False, quoting=1) 14 | -------------------------------------------------------------------------------- /datastash-sftpscp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openbridge/ob_datastash/76d53980045dfdc8f2696bc8d8da38c10b4ef091/datastash-sftpscp.png -------------------------------------------------------------------------------- /datastash.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openbridge/ob_datastash/76d53980045dfdc8f2696bc8d8da38c10b4ef091/datastash.png -------------------------------------------------------------------------------- /docker-entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # first arg is `-f` or `--some-option` 5 | if [ "${1#-}" != "$1" ]; then 6 | set -- logstash "$@" 7 | fi 8 | 9 | mkdir -p /data/logstash-temp-queue 10 | mkdir -p /data/logstash-temp-sincedb 11 | # Set a link to datastash as a command. 12 | ln -s /usr/share/logstash/bin/logstash datastash 13 | 14 | # Run as user "logstash" if the command is "logstash" 15 | # allow the container to be started with `--user` 16 | if [ "$1" = 'datastash' -a "$(id -u)" = '0' ]; then 17 | set -- su-exec datastash "$@" 18 | fi 19 | 20 | exec "$@" 21 | -------------------------------------------------------------------------------- /logstash.yml: -------------------------------------------------------------------------------- 1 | # Settings file in YAML 2 | # 3 | # Settings can be specified either in hierarchical form, e.g.: 4 | # 5 | # pipeline: 6 | # batch: 7 | # size: 125 8 | # delay: 5 9 | # 10 | # Or as flat keys: 11 | # 12 | # pipeline.batch.size: 125 13 | # pipeline.batch.delay: 5 14 | # 15 | # ------------ Node identity ------------ 16 | # 17 | # Use a descriptive name for the node: 18 | # 19 | # node.name: datastash 20 | # xpack.monitoring.elasticsearch.url: http://0.0.0.0:9200 21 | # 22 | # If omitted the node name will default to the machine's host name 23 | # 24 | # ------------ Data path ------------------ 25 | # 26 | # Which directory should be used by logstash and its plugins 27 | # for any persistent needs. Defaults to LOGSTASH_HOME/data 28 | # 29 | # path.data: 30 | # 31 | # ------------ Pipeline Settings -------------- 32 | # 33 | # Set the number of workers that will, in parallel, execute the filters+outputs 34 | # stage of the pipeline. 35 | # 36 | # This defaults to the number of the host's CPU cores. 
37 | # 38 | # pipeline.workers: 2 39 | # 40 | # How many workers should be used per output plugin instance 41 | # 42 | # pipeline.output.workers: 1 43 | # 44 | # How many events to retrieve from inputs before sending to filters+workers 45 | # 46 | # pipeline.batch.size: 125 47 | # 48 | # How long to wait before dispatching an undersized batch to filters+workers 49 | # Value is in milliseconds. 50 | # 51 | # pipeline.batch.delay: 5 52 | # 53 | # Force Logstash to exit during shutdown even if there are still inflight 54 | # events in memory. By default, logstash will refuse to quit until all 55 | # received events have been pushed to the outputs. 56 | # 57 | # WARNING: enabling this can lead to data loss during shutdown 58 | # 59 | # pipeline.unsafe_shutdown: false 60 | # 61 | # ------------ Pipeline Configuration Settings -------------- 62 | # 63 | # Where to fetch the pipeline configuration for the main pipeline 64 | # 65 | # path.config: 66 | # 67 | # Pipeline configuration string for the main pipeline 68 | # 69 | # config.string: 70 | # 71 | # At startup, test if the configuration is valid and exit (dry run) 72 | # 73 | # config.test_and_exit: false 74 | # 75 | # Periodically check if the configuration has changed and reload the pipeline 76 | # This can also be triggered manually through the SIGHUP signal 77 | # 78 | # config.reload.automatic: true 79 | # 80 | # How often to check if the pipeline configuration has changed (in seconds) 81 | # 82 | # config.reload.interval: 10s 83 | # 84 | # Show fully compiled configuration as debug log message 85 | # NOTE: --log.level must be 'debug' 86 | # 87 | # config.debug: false 88 | # 89 | # When enabled, process escaped characters such as \n and \" in strings in the 90 | # pipeline configuration files. 91 | # 92 | # config.support_escapes: false 93 | # 94 | # ------------ Module Settings --------------- 95 | # Define modules here. Modules definitions must be defined as an array. 96 | # The simple way to see this is to prepend each `name` with a `-`, and keep 97 | # all associated variables under the `name` they are associated with, and 98 | # above the next, like this: 99 | # 100 | # modules: 101 | # - name: MODULE_NAME 102 | # var.PLUGINTYPE1.PLUGINNAME1.KEY1: VALUE 103 | # var.PLUGINTYPE1.PLUGINNAME1.KEY2: VALUE 104 | # var.PLUGINTYPE2.PLUGINNAME1.KEY1: VALUE 105 | # var.PLUGINTYPE3.PLUGINNAME3.KEY1: VALUE 106 | # 107 | # Module variable names must be in the format of 108 | # 109 | # var.PLUGIN_TYPE.PLUGIN_NAME.KEY 110 | # 111 | # modules: 112 | # 113 | # ------------ Cloud Settings --------------- 114 | # Define Elastic Cloud settings here. 115 | # Format of cloud.id is a base64 value e.g. dXMtZWFzdC0xLmF3cy5mb3VuZC5pbyRub3RhcmVhbCRpZGVudGlmaWVy 116 | # and it may have an label prefix e.g. staging:dXMtZ... 117 | # This will overwrite 'var.elasticsearch.hosts' and 'var.kibana.host' 118 | # cloud.id: 119 | # 120 | # Format of cloud.auth is: : 121 | # This is optional 122 | # If supplied this will overwrite 'var.elasticsearch.username' and 'var.elasticsearch.password' 123 | # If supplied this will overwrite 'var.kibana.username' and 'var.kibana.password' 124 | # cloud.auth: elastic: 125 | # 126 | # ------------ Queuing Settings -------------- 127 | # 128 | # Internal queuing model, "memory" for legacy in-memory based queuing and 129 | # "persisted" for disk-based acked queueing. Defaults is memory 130 | # 131 | queue.type: persisted 132 | # 133 | # If using queue.type: persisted, the directory path where the data files will be stored. 
134 | # Default is path.data/queue 135 | # 136 | path.queue: "/data/logstash-temp-queue" 137 | # 138 | # If using queue.type: persisted, the page data files size. The queue data consists of 139 | # append-only data files separated into pages. Default is 250mb 140 | # 141 | # queue.page_capacity: 250mb 142 | # 143 | # If using queue.type: persisted, the maximum number of unread events in the queue. 144 | # Default is 0 (unlimited) 145 | # 146 | # queue.max_events: 0 147 | # 148 | # If using queue.type: persisted, the total capacity of the queue in number of bytes. 149 | # If you would like more unacked events to be buffered in Logstash, you can increase the 150 | # capacity using this setting. Please make sure your disk drive has capacity greater than 151 | # the size specified here. If both max_bytes and max_events are specified, Logstash will pick 152 | # whichever criteria is reached first 153 | # Default is 1024mb or 1gb 154 | # 155 | #queue.max_bytes: 1024mb 156 | # 157 | # If using queue.type: persisted, the maximum number of acked events before forcing a checkpoint 158 | # Default is 1024, 0 for unlimited 159 | # 160 | #queue.checkpoint.acks: 1024 161 | # 162 | # If using queue.type: persisted, the maximum number of written events before forcing a checkpoint 163 | # Default is 1024, 0 for unlimited 164 | # 165 | #queue.checkpoint.writes: 1024 166 | # 167 | # If using queue.type: persisted, the interval in milliseconds when a checkpoint is forced on the head page 168 | # Default is 1000, 0 for no periodic checkpoint. 169 | # 170 | # queue.checkpoint.interval: 1000 171 | # 172 | # ------------ Dead-Letter Queue Settings -------------- 173 | # Flag to turn on dead-letter queue. 174 | # 175 | # dead_letter_queue.enable: false 176 | 177 | # If using dead_letter_queue.enable: true, the maximum size of each dead letter queue. Entries 178 | # will be dropped if they would increase the size of the dead letter queue beyond this setting. 179 | # Default is 1024mb 180 | # dead_letter_queue.max_bytes: 1024mb 181 | 182 | # If using dead_letter_queue.enable: true, the directory path where the data files will be stored. 183 | # Default is path.data/dead_letter_queue 184 | # 185 | # path.dead_letter_queue: 186 | # 187 | # ------------ Metrics Settings -------------- 188 | # 189 | # Bind address for the metrics REST endpoint 190 | # 191 | #http.host: "127.0.0.1" 192 | # 193 | # Bind port for the metrics REST endpoint, this option also accept a range 194 | # (9600-9700) and logstash will pick up the first available ports. 195 | # 196 | #http.port: 9600-9700 197 | # 198 | # ------------ Debugging Settings -------------- 199 | # 200 | # Options for log.level: 201 | # * fatal 202 | # * error 203 | # * warn 204 | # * info (default) 205 | # * debug 206 | # * trace 207 | # 208 | # log.level: info 209 | # path.logs: 210 | # 211 | # ------------ Other Settings -------------- 212 | # 213 | # Where to find custom plugins 214 | # path.plugins: [] 215 | --------------------------------------------------------------------------------