├── .gitignore
├── Dockerfile
├── LICENSE
├── README.md
├── VERSION
├── config
│   ├── drivers
│   │   ├── RedshiftJDBC42-1.2.7.1003.jar
│   │   ├── mysql-connector-java-5.1.44-bin.jar
│   │   └── postgresql-42.1.4.jar
│   └── pipeline
│       ├── sample-advanced-filter-csv.conf
│       ├── sample-csv-api-header.conf
│       ├── sample-csv-api-noheader.conf
│       ├── sample-csv-csv-noheader.conf
│       ├── sample-multi-csv-csv-noheader.conf
│       └── sample-tab-api-header.conf
├── convert.py
├── datastash-sftpscp.png
├── datastash.png
├── docker-entrypoint.sh
└── logstash.yml
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Application specific files
2 | /approot
3 | .sass-cache
4 | *.log
5 | *.pwd
6 | *.salt.txt
7 | *.userhash.txt
8 | prod*
9 |
10 | # Folder view configuration files
11 | *.DS_Store
12 | .AppleDouble
13 | .LSOverride
14 | Desktop.ini
15 | *cipher*
16 |
17 | # Icon must end with two \r
18 | Icon
19 |
20 | # Compiled Python files
21 | *.pyc
22 |
23 | # Compiled C++ files
24 | *.out
25 | # Thumbnails
26 | ._*
27 | Thumbs.db
28 |
29 | # Files that might appear in the root of a volume
30 | .DocumentRevisions-V100
31 | .fseventsd
32 | .Spotlight-V100
33 | .TemporaryItems
34 | .Trashes
35 | .VolumeIcon.icns
36 | .com.apple.timemachine.donotpresent
37 |
38 | # Directories potentially created on remote AFP share
39 | .AppleDB
40 | .AppleDesktop
41 | Network Trash Folder
42 | Temporary Items
43 | .apdisk
44 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM openjdk:8-jre-alpine
2 |
3 | RUN addgroup -S logstash && adduser -S -G logstash logstash
4 |
5 | RUN apk add --no-cache \
6 | bash \
7 | curl \
8 | libc6-compat \
9 | libzmq
10 |
11 | RUN apk add --no-cache 'su-exec>=0.2'
12 |
13 | ENV LOGSTASH_PATH /usr/share/logstash/bin
14 | ENV PATH $LOGSTASH_PATH:$PATH
15 |
16 | ENV LOGSTASH_VERSION 6.2.3
17 | ENV LOGSTASH_TARBALL="https://artifacts.elastic.co/downloads/logstash/logstash-${LOGSTASH_VERSION}.tar.gz" \
18 | LOGSTASH_TARBALL_ASC="https://artifacts.elastic.co/downloads/logstash/logstash-${LOGSTASH_VERSION}.tar.gz.asc" \
19 | LOGSTASH_TARBALL_SHA512="a553e800665b7ccc1a6f30b49fa0d336526c8f01144751dbe617b33b38595f121f6d8b4c43e8b2f5b648bc283fc839f035c816c696d8ecccc3a93a4bb2a329c7" \
20 | GPG_KEY="46095ACC8548582C1A2699A9D27D666CD88E42B4"
21 |
22 | RUN set -ex; \
23 | \
24 | \
25 | apk add --no-cache --virtual .fetch-deps \
26 | ca-certificates \
27 | gnupg \
28 | openssl \
29 | libc6-compat \
30 | tar \
31 | ; \
32 | \
33 | wget -O logstash.tar.gz "$LOGSTASH_TARBALL"; \
34 | \
35 | if [ "$LOGSTASH_TARBALL_SHA512" ]; then \
36 | echo "$LOGSTASH_TARBALL_SHA512 *logstash.tar.gz" | sha512sum -c -; \
37 | fi; \
38 | \
39 | if [ "$LOGSTASH_TARBALL_ASC" ]; then \
40 | wget --progress=bar:force -O logstash.tar.gz.asc "$LOGSTASH_TARBALL_ASC"; \
41 | export GNUPGHOME="$(mktemp -d)"; \
42 | ( gpg --keyserver ha.pool.sks-keyservers.net --recv-keys "$GPG_KEY" \
43 | || gpg --keyserver pgp.mit.edu --recv-keys "$GPG_KEY" \
44 | || gpg --keyserver keyserver.pgp.com --recv-keys "$GPG_KEY" ); \
45 | gpg --batch --verify logstash.tar.gz.asc logstash.tar.gz; \
46 | rm -rf "$GNUPGHOME" logstash.tar.gz.asc || true; \
47 | fi; \
48 | \
49 | dir="$(dirname "$LOGSTASH_PATH")"; \
50 | \
51 | mkdir -p "$dir"; \
52 | tar -xf logstash.tar.gz --strip-components=1 -C "$dir"; \
53 | rm logstash.tar.gz; \
54 | \
55 | apk del .fetch-deps; \
56 | \
57 | export LS_SETTINGS_DIR="$dir/config"; \
58 | if [ -f
"$LS_SETTINGS_DIR/log4j2.properties" ]; then \ 59 | cp "$LS_SETTINGS_DIR/log4j2.properties" "$LS_SETTINGS_DIR/log4j2.properties.dist"; \ 60 | truncate -s 0 "$LS_SETTINGS_DIR/log4j2.properties"; \ 61 | fi; \ 62 | \ 63 | for userDir in \ 64 | "$dir/config" \ 65 | "$dir/data" \ 66 | ; do \ 67 | if [ -d "$userDir" ]; then \ 68 | chown -R logstash:logstash "$userDir"; \ 69 | fi; \ 70 | done; \ 71 | \ 72 | /usr/share/logstash/bin/logstash-plugin install logstash-filter-i18n; \ 73 | logstash --version 74 | 75 | COPY docker-entrypoint.sh / 76 | COPY logstash.yml /usr/share/logstash/config 77 | COPY config/pipeline /usr/share/logstash/pipeline 78 | COPY config/drivers /drivers 79 | 80 | ENTRYPOINT ["/docker-entrypoint.sh"] 81 | CMD ["-e", ""] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Openbridge, Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data Stash - Event API Client 2 | 3 | Data Stash is a `logstash` service than can ingest data from different data sources, transform them, and then send JSON output via HTTP to the Openbridge Events API. You can also store the outputs into other formats such as CSV. 4 | 5 | ![Data Stash](https://raw.githubusercontent.com/openbridge/ob_datastash/master/datastash.png "How It Works") 6 | 7 | # Why Data Stash? 8 | 9 | Data Stash can perform some magic by automatically processing, cleaning, encoding and streaming contents of one or more CSVs directly to our API. Once it arrives at our API we automatically route all the data to a destination table in your data warehouse. Since CSV files can be a bit messy we have pre-packaged processing configurations that turn those old files into first class data sources. Here are a few of the standard operations we have defined: 10 | 11 | - Exclude columns resident in a CSV (e.g., remove/drop the userID, email address and social security columns) from the output 12 | - Replace non-ASCII characters with an ASCII approximation, or if none exists, a replacement character which defaults to ? 
13 | - Remove extraneous white space from records in target columns
14 | - Strip backslashes, question marks, equals signs, hashes, minuses, or other characters from the target columns
15 | - Set a desired data type of a given column and have it transform records to meet that type
16 | - Set everything to lowercase
17 | - Proper UTF-8 encoding of the data
18 | - Mask sensitive data with security "hashes" for one or more fields
19 | - Add new fields, such as IDs or concatenations of other columns, which can replace the contents of a column or store the results in a new field that is appended to the CSV
20 |
21 | ## Quick Start Sample Config Files
22 | For reference, sample configs can be found in the [`/config/pipeline`](config/pipeline) folder of this repo.
23 |
24 | - **CSV to API**: For CSV files with header rows, use [`sample-csv-api-header.conf`](config/pipeline/sample-csv-api-header.conf)
25 | - **CSV to API**: For CSV files without header rows, use [`sample-csv-api-noheader.conf`](config/pipeline/sample-csv-api-noheader.conf)
26 | - **CSV to CSV**: To process one CSV to generate a clean processed CSV, use [`sample-csv-csv-noheader.conf`](config/pipeline/sample-csv-csv-noheader.conf)
27 | - **Multiple CSV Inputs to Multiple CSV Outputs**: To process multiple CSV files to generate multiple clean CSV files, use [`sample-multi-csv-csv-noheader.conf`](config/pipeline/sample-multi-csv-csv-noheader.conf)
28 |
29 |
30 | # Install
31 |
32 | Data Stash is neatly packaged into a Docker image so you can run it on your local laptop or deploy it to a server. The first step is to build or pull the image:
33 |
34 | ```docker
35 | docker build -t openbridge/ob_datastash .
36 | ```
37 |
38 | or simply pull it from Docker Hub:
39 |
40 | ```docker
41 | docker pull openbridge/ob_datastash:latest
42 | ```
43 |
44 | Once you have your image you are ready to get started!
45 |
46 | # Getting Started: How To Stream CSV Files
47 | Data Stash is built on the premise of inputs, filters, and outputs:
48 |
49 | - **Inputs**: Your data sources. Primarily this will be a CSV file, but it can be many other sources.
50 | - **Filters**: Pre-processing applied to your data prior to delivery to an output location
51 | - **Outputs**: There are a few output options, but the principal one is the Openbridge Webhook API
52 |
53 | Data Stash can take a CSV file and break each row into a streamed JSON "event". These JSON events are delivered to an Openbridge API for import into your target warehouse.
54 |
55 | There are a couple of CSV file use cases:
56 |
57 | - **Static Files**: You have exports from a system that you want to load to your data warehouse. Data Stash will process the exported source file and stream the content of the file until it reaches the end.
58 | - **Dynamic Files**: You have a file that continually has new rows added. Data Stash will process changing files and stream new events as they are appended to a file.
59 |
60 | For our example walk-through we use a static CSV file called `sales.csv`.
61 |
62 | ## `sales.csv` Needs A Data Stash Configuration File
63 |
64 | To run Data Stash for `sales.csv` you need to define a config file. Each config file is made up of three parts: input, filter, and output. A config file describes how Data Stash should process your `sales.csv` file.
65 |
66 | ### Step 1: Define Your Input
67 |
68 | Let's dig into your example `sales.csv`. The principal part of the input is setting the `path =>` to your file(s).
You will need to specify the path to the file you want to process, like this: `path => "/the/path/to/your/sales.csv"`. We are going to assume the file is located in a folder on your laptop here: `/Users/bob/csv/mysalesdata`.
69 |
70 | However, Data Stash has its own location where it references your data: a default directory called `/data`. What does this mean? In the Data Stash config you use `/data` in the file path. When you run Data Stash you tell it to map your laptop directory `/Users/bob/csv/mysalesdata` to `/data`. This means anything in your laptop directory will appear exactly the same way inside `/data`.
71 |
72 | See the "How To Run" section for more details on this mapping.
73 |
74 | ```bash
75 | input {
76 | file {
77 | path => "/data/sales.csv"
78 | start_position => "beginning"
79 | sincedb_path => "/dev/null"
80 | }
81 | }
82 | ```
83 |
84 | ### Step 2: Define Your Filter
85 |
86 | This is where you define a CSV filter. A basic filter is focused on setting the schema and removing system-generated columns.
87 |
88 | - The `separator => ","` defines the delimiter. Do not change this.
89 | - The removal of system-generated columns is done via `remove_field => [ "message", "host", "@timestamp", "@version", "path" ]`. Do not change this unless you want to remove other columns from your CSV file. For example, let's say you had a column called `userid`. You can add it like this: `remove_field => [ "message", "host", "@timestamp", "@version", "path", "userid" ]`. Now `userid` will be suppressed and not sent to Openbridge.
90 | - If your CSV file has a header row, then you can set `autodetect_column_names => "true"` and `autogenerate_column_names => "true"` to leverage those values when processing the file.
91 |
92 | ```bash
93 | filter {
94 | csv {
95 | separator => ","
96 | remove_field => [ "message", "host", "@timestamp", "@version", "path" ]
97 | autodetect_column_names => "true"
98 | autogenerate_column_names => "true"
99 | }
100 | }
101 | ```
102 |
103 | If your CSV does **not** have a header row, you need to provide that context about the source file yourself by supplying the column layout: `columns => ["Sku","Name","SearchKeywords","Main","Price","ID","Brands"]`. This list should align with the layout of the CSV file.
104 |
105 | ```bash
106 | filter {
107 | csv {
108 | separator => ","
109 | remove_field => [ "message", "host", "@timestamp", "@version", "path" ]
110 | columns => ["Sku","Name","SearchKeywords","Main","Price","ID","Brands"]
111 | }
112 | }
113 | ```
114 |
115 | #### Advanced Filtering
116 |
117 | Here is a more advanced filter. This performs pre-processing cleanup on the CSV file. For example, it will strip whitespace from columns, remove bad characters, convert a column to a different data type, and so forth.
118 |
119 | ```bash
120 |
121 | filter {
122 |
123 | # The CSV filter takes an event field containing CSV data,
124 | # parses it, and stores it as individual fields (can optionally specify the names).
125 | # This filter can also parse data with any separator, not just commas.
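# (Illustration, not part of the original sample: a pipe-delimited export could
# instead set separator => "|", and a tab-delimited export could set separator => "\t",
# as in config/pipeline/sample-tab-api-header.conf.)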
126 |
127 | csv {
128 | # Set the comma delimiter
129 | separator => ","
130 |
131 | # We want to exclude these system columns
132 | remove_field => [
133 | "message",
134 | "host",
135 | "@timestamp",
136 | "@version",
137 | "path"
138 | ]
139 |
140 | # Define the layout of the input file
141 | columns => [
142 | "Sku","Name","SearchKeywords","Main","Price","ID","Brands"
143 | ]
144 | }
145 |
146 | # The mutate filter allows you to perform general
147 | # mutations on fields. You can rename, remove, replace
148 | # and modify fields in your events
149 |
150 | # We need to set the target column to "string" to allow for find and replace
151 | mutate {
152 | convert => [ "Sku", "string" ]
153 | }
154 |
155 | # Strip backslashes, question marks, equals, hashes, and minuses from the target column
156 | mutate {
157 | gsub => [ "Sku", "[\\?#=]", "" ]
158 | }
159 |
160 | # Strip extraneous white space from records
161 | mutate {
162 | strip => [ "Sku","Name","SearchKeywords","Main","Price","ID","Brands"
163 | ]
164 | }
165 |
166 | # Set everything to lowercase
167 | mutate {
168 | lowercase => [ "Sku","Name","SearchKeywords","Main","Price","ID","Brands"
169 | ]
170 | }
171 | }
172 | ```
173 |
174 | ### Step 3: Define Your Output Destination
175 |
176 | The output defines the delivery location for all the records in your CSV(s). Openbridge generates a private API endpoint which you use in the `url => ""` setting. The delivery URL would look like this: `url => "https://myapi.foo-api.us-east-1.amazonaws.com/dev/events/teststash?token=774f77b389154fd2ae7cb5131201777&sign=ujguuuljNjBkFGHyNTNmZTIxYjEzMWE5MjgyNzM1ODQ="`
177 |
178 | You would take the Openbridge-provided endpoint and put it into the config:
179 |
180 | ```bash
181 | output {
182 | http {
183 | url => "https://myapi.foo-api.us-east-1.amazonaws.com/dev/events/teststash?token=774f77b389154fd2ae7cb5131201777&sign=ujguuuljNjBkFGHyNTNmZTIxYjEzMWE5MjgyNzM1ODQ="
184 | http_method => "post"
185 | format => "json"
186 | pool_max => "10"
187 | pool_max_per_route => "5"
188 | }
189 | }
190 | ```
191 |
192 | **Note**: Do not change `http_method => "post"`, `format => "json"`, `pool_max => "10"`, `pool_max_per_route => "5"` from the defaults listed in the config.
193 |
194 | You can also store the data to a CSV file (instead of sending it to an API). This might be useful to test or validate your data prior to using the API. It also might be useful if you want to create a CSV for upload to Openbridge via SFTP or SCP.
195 |
196 | ```bash
197 | output {
198 |
199 | # Saving output to CSV so we define the layout of the file
200 | csv {
201 | fields => [ "Sku","Name","SearchKeywords","Main","Price","ID","Brands" ]
202 |
203 | # Where do you want to export the file
204 | path => "/data/foo.csv"
205 | }
206 | }
207 | ```
208 |
209 | You need to reach out to your Openbridge team so they can provision your private API for you.
210 |
211 | ### Step 4: Save Your Config
212 |
213 | You will want to store your configs in an easy-to-remember location. You should also name the config in a manner that reflects the data resident in the CSV file. Since we are using `sales.csv` we saved our config like this: `/Users/bob/datastash/configs/sales.conf`. We will need to reference this config location in the next section.
214 |
215 | The final config will look something like this:
216 |
217 | ```bash
218 | ####################################
219 | # An input enables a specific source of
220 | # events to be read by Logstash.
221 | ####################################
222 |
223 | input {
224 | file {
225 | # Set the path to the source file(s)
226 | path => "/data/sales.csv"
227 | start_position => "beginning"
228 | sincedb_path => "/dev/null"
229 | }
230 | }
231 |
232 | ####################################
233 | # A filter performs intermediary processing on an event.
234 | # Filters are often applied conditionally depending on the
235 | # characteristics of the event.
236 | ####################################
237 |
238 | filter {
239 |
240 | csv {
241 |
242 | # The CSV filter takes an event field containing CSV data,
243 | # parses it, and stores it as individual fields (can optionally specify the names).
244 | # This filter can also parse data with any separator, not just commas.
245 |
246 | # Set the comma delimiter
247 | separator => ","
248 |
249 | # We want to exclude these system columns
250 | remove_field => [
251 | "message", "host", "@timestamp", "@version", "path"
252 | ]
253 |
254 | # Define the layout of the input file
255 | columns => [
256 | "Sku","Name","SearchKeywords","Main","Price","ID","Brands"
257 | ]
258 | }
259 |
260 | # The mutate filter allows you to perform general
261 | # mutations on fields. You can rename, remove, replace
262 | # and modify fields in your events
263 |
264 | # We need to set the target column to "string" to allow for find and replace
265 | mutate {
266 | convert => [ "Sku", "string" ]
267 | }
268 |
269 | # Find and remove backslashes, question marks, equals and hashes from the target column. These are characters we do not want in our column
270 | mutate {
271 | gsub => [ "Sku", "[\\?#=]", "" ]
272 | }
273 |
274 | # Strip extraneous white space from records
275 | mutate {
276 | strip => [ "Sku","Name","SearchKeywords","Main","Price","ID","Brands"
277 | ]
278 | }
279 |
280 | # Set everything to lowercase
281 | mutate {
282 | lowercase => [ "Sku","Name","SearchKeywords","Main","Price","ID","Brands"
283 | ]
284 | }
285 | }
286 |
287 | ####################################
288 | # An output sends event data to a particular
289 | # destination. Outputs are the final stage in the
290 | # event pipeline.
291 | ####################################
292 |
293 | output
294 | {
295 | # Sending the contents of the file to the event API
296 | http
297 | {
298 | # Put the URL for your HTTP endpoint to deliver events to
299 | url => "https://myapi.foo-api.us-east-1.amazonaws.com/dev/events/teststash?token=774f77b389154fd2ae7cb5131201777&sign=ujguuuljNjBkFGHyNTNmZTIxYjEzMWE5MjgyNzM1ODQ="
300 | # Leave the settings below untouched.
301 | http_method => "post"
302 | format => "json"
303 | pool_max => "10"
304 | pool_max_per_route => "5"
305 | }
306 | }
307 | ```
308 |
309 | # How To Run
310 |
311 | With your `sales.csv` config file saved to `/Users/bob/datastash/configs/sales.conf` you are ready to stream your data!
312 |
313 | There are two things that Data Stash needs to be told in order to run:
314 |
315 | 1. Where to find your source CSV file (`/Users/bob/csv/mysalesdata`)
316 | 2. The location of the config file (`/Users/bob/datastash/configs`)
317 |
318 | You tell Data Stash where the file and config are via the `-v` (`--volume`) flag in Docker. In our example your CSV is located on your laptop in this folder: `/Users/bob/csv/mysalesdata`. This means we put that path into the first `-v` command. Internally Data Stash defaults to `/data` so you can leave that untouched.
It should look like this:
319 |
320 | ```bash
321 | -v /Users/bob/csv/mysalesdata:/data
322 | ```
323 |
324 | In our example you also saved your config file on your laptop here: `/Users/bob/datastash/configs`. Data Stash defaults to looking for configs in `/config/pipeline`, so you can leave that untouched:
325 |
326 | ```bash
327 | -v /Users/bob/datastash/configs:/config/pipeline
328 | ```
329 |
330 | Lastly, we put it all together so we can tell Data Stash to stream the file. Here is the command to run our Docker-based Data Stash image:
331 |
332 | ```bash
333 | docker run -it --rm \
334 | -v /Users/bob/csv/mysalesdata:/data \
335 | -v /Users/bob/datastash/configs:/config/pipeline \
336 | openbridge/ob_datastash \
337 | datastash -f /config/pipeline/sales.conf
338 | ```
339 | # Performance
340 | If you are processing very large CSV files that have millions of records, this approach can take a while to complete. Depending on the complexity of the filters, you can expect about 1,000 to 3,000 events (i.e., rows) processed per minute. At those rates a CSV with 1,000,000 rows might take anywhere from roughly 5.5 hours (at 3,000 rows per minute) to nearly 17 hours (at 1,000 rows per minute) to complete.
341 |
342 | We limit requests to 100 per second, so the maximum number of events possible in a minute is 6,000. Even at that ceiling, processing a 1M-record CSV file would take close to 3 hours (1,000,000 rows at 6,000 rows per minute is about 167 minutes).
343 |
344 | You might want to explore using the Openbridge SFTP or SCP options for processing larger files.
345 |
346 | # Notes
347 |
348 | ## Processing A Folder Of CSV Files
349 |
350 | In the example below we use a wildcard `*.csv` to specify processing all sales CSV files in the directory.
351 |
352 | `path => "/the/path/to/your/*.csv"`
353 |
354 | For example, if you had files called `sales.csv`, `sales002.csv`, and `sales-allyear.csv`, using the wildcard `*.csv` will process all of them.
355 |
356 | Please note that using `*.csv` assumes all files have the same structure/layout. If they do not, you may be streaming disjointed data sets, which will likely fail when it comes time to load data into your warehouse.
357 |
358 |
359 |
360 | # Versioning
361 |
362 | Docker Tag | GitHub Release | Logstash | Alpine Version
363 | ---------- | --------------- | -------- | --------------
364 | latest | Master | 6.2.3 | latest
365 |
366 | # Reference
367 |
368 | At the heart of Data Stash is [Logstash](https://www.elastic.co/products/logstash). For a deeper dive into the capabilities of Logstash, check out their [documentation](https://www.elastic.co/guide/en/logstash/current/index.html). Logstash is pretty cool and can do a lot more than just process CSV files.
369 |
370 | CSV files should follow RFC 4180 standards/guidance to ensure success with processing.
371 |
372 |
373 |
374 |
375 | This image is used to run your data streaming inside Docker. If you don't know what Docker is, read "[What is Docker?](https://www.docker.com/what-docker)". Once you have a sense of what Docker is, you can then install the software. It is free: "[Get Docker](https://www.docker.com/products/docker)". Select the Docker package that aligns with your environment (i.e., OS X, Linux or Windows). If you have not used Docker before, take a look at the guides:
376 |
377 | - [Engine: Get Started](https://docs.docker.com/engine/getstarted/)
378 | - [Docker Mac](https://docs.docker.com/docker-for-mac/)
379 | - [Docker Windows](https://docs.docker.com/docker-for-windows/)
380 |
381 | # TODO
382 |
383 | - Create more sample configs, including complex wrangling examples.
384 | 385 | # Issues 386 | 387 | If you have any problems with or questions about this image, please contact us through a GitHub issue. 388 | 389 | # Contributing 390 | 391 | You are invited to contribute new features, fixes, or updates, large or small; we are always thrilled to receive pull requests, and do our best to process them as fast as we can. 392 | 393 | Before you start to code, we recommend discussing your plans through a GitHub issue, especially for more ambitious contributions. This gives other contributors a chance to point you in the right direction, give you feedback on your design, and help you find out if someone else is working on the same thing. 394 | 395 | # License 396 | 397 | This project is licensed under the MIT License 398 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | 5.5.2 2 | -------------------------------------------------------------------------------- /config/drivers/RedshiftJDBC42-1.2.7.1003.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openbridge/ob_datastash/76d53980045dfdc8f2696bc8d8da38c10b4ef091/config/drivers/RedshiftJDBC42-1.2.7.1003.jar -------------------------------------------------------------------------------- /config/drivers/mysql-connector-java-5.1.44-bin.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openbridge/ob_datastash/76d53980045dfdc8f2696bc8d8da38c10b4ef091/config/drivers/mysql-connector-java-5.1.44-bin.jar -------------------------------------------------------------------------------- /config/drivers/postgresql-42.1.4.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openbridge/ob_datastash/76d53980045dfdc8f2696bc8d8da38c10b4ef091/config/drivers/postgresql-42.1.4.jar -------------------------------------------------------------------------------- /config/pipeline/sample-advanced-filter-csv.conf: -------------------------------------------------------------------------------- 1 | ####################### INPUT ############################ 2 | # An input enables a specific source of 3 | # events to be read by Logstash. 4 | ########################################################## 5 | 6 | input { 7 | file { 8 | type => "set-type-as-your-table-name" 9 | path => "/data/*.csv" 10 | start_position => "beginning" 11 | sincedb_path => "/dev/null" 12 | } 13 | } 14 | 15 | ####################### FILTER ########################### 16 | # A filter performs intermediary processing on an event. 17 | # Filters are often applied conditionally depending on the 18 | # characteristics of the event. 19 | ########################################################## 20 | 21 | filter { 22 | 23 | # The CSV filter takes an event field containing CSV data, 24 | # parses it, and stores it as individual fields (can optionally specify the names). 25 | # This filter can also parse data with any separator, not just commas. 
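# (Note, added for clarity: autodetect_column_names below treats the first row of each
# CSV as the header, and the add_field entries use the sprintf reference %{entryType},
# so this sample assumes the source files contain an entryType column.)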
26 | 27 | csv { 28 | separator => "," 29 | quote_char => '"' 30 | autodetect_column_names => "true" 31 | autogenerate_column_names => "true" 32 | 33 | add_field => { "entryname" => "%{entryType}" } 34 | add_field => { "revised_entry_type" => "" } 35 | } 36 | 37 | ############################## 38 | 39 | # Replaces non-ASCII characters with an ASCII approximation, or if none exists, a replacement character which defaults to ? 40 | i18n { transliterate => [ "RowType", "Status", "Syncerrors", "From", "To", "Advertiser", "Engine", "Account", "Campaign"," DMARegionname", "DMAregionID", "Clicks", "Cost" ] } 41 | 42 | ############################# 43 | 44 | # Create consistent hashes (fingerprints) of one or more fields and store the result in a new field. 45 | fingerprint { method => "SHA1" key => "MySecretKeyForMyEyesOnlyOk?" source => "DMAregionID" target => "DMAregionID" } 46 | 47 | ############################# 48 | 49 | mutate { gsub => [ "landingURL", "[\\?#=]", "" ] } 50 | mutate { gsub => [ "landingURL", "\"", "" ] } 51 | mutate { 52 | gsub => [ "entryname", "1", "Direct Traffic" ] 53 | gsub => [ "entryname", "2", "Clickthrough" ] 54 | gsub => [ "entryname", "3", "OLA Impression" ] 55 | gsub => [ "entryname", "4", "OLA Visit" ] 56 | gsub => [ "entryname", "5", "Referral Traffic" ] 57 | gsub => [ "entryname", "6", "Email" ] 58 | gsub => [ "entryname", "7", "Organic Search" ] 59 | gsub => [ "entryname", "8", "Paid Search" ] 60 | gsub => [ "entryname", "9", "Offline Driver" ] 61 | gsub => [ "entryname", "10", "Social" ] 62 | gsub => [ "entryname", "11", "Key Action" ] 63 | gsub => [ "entryname", "12", "Coreg" ] 64 | gsub => [ "entryname", "13", "Customer Center" ] 65 | gsub => [ "entryname", "14", "Video" ] 66 | gsub => [ "entryname", "15", "Social Impression" ] 67 | gsub => [ "entryname", "16", "Offsite Key Action" ] 68 | gsub => [ "entryname", "17", "Offsite Clickthrough" ] 69 | } 70 | 71 | if ("Email" in [utm_source] or "Email" in [utm_medium] or "Email" in [utm_campaign] or "Email" in [utm_term] or "Email" in [utm_content] or "Email" in [utm_brand]) or ("Enewsletter" in [utm_source] or "Enewsletter" in [utm_medium] or "Enewsletter" in [utm_campaign] or "Enewsletter" in [utm_term] or "Enewsletter" in [utm_content] or "Enewsletter" in [utm_brand]) or "eNL" in [utm_source] or "eNL" in [utm_medium] or "eNL" in [utm_campaign] or "eNL" in [utm_term] or "eNL" in [utm_content] or "eNL" in [utm_brand] { 72 | mutate { replace => {"revised_entry_type" => "Email"} } 73 | } 74 | 75 | mutate { lowercase => [ "id","createdDate","landingURL","landingURLNoQuery" ] } 76 | } 77 | 78 | ####################### OUTPUT ########################### 79 | # An output sends event data to a particular 80 | # destination. Outputs are the final stage in the 81 | # event pipeline. 82 | ########################################################## 83 | 84 | output { 85 | csv { 86 | fields => [ "id","createdDate","landingURL","landingURLNoQuery", "entryname", "revised_entry_type", "utm_source" ] 87 | path => "/data/foo2.csv" 88 | codec => plain { charset => 'UTF-8' } 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /config/pipeline/sample-csv-api-header.conf: -------------------------------------------------------------------------------- 1 | ####################### INPUT ############################ 2 | # An input enables a specific source of 3 | # events to be read by Logstash. 
4 | ########################################################## 5 | 6 | input { 7 | file { 8 | # Set the path to the source file(s) 9 | type => "set-type-as-your-table-name" 10 | path => "/data/sales.csv" 11 | start_position => "beginning" 12 | sincedb_path => "/dev/null" 13 | } 14 | } 15 | 16 | 17 | ####################### FILTER ########################### 18 | # A filter performs intermediary processing on an event. 19 | # Filters are often applied conditionally depending on the 20 | # characteristics of the event. 21 | ########################################################## 22 | 23 | filter { 24 | 25 | # The CSV filter takes an event field containing CSV data, 26 | # parses it, and stores it as individual fields (can optionally specify the names). 27 | # This filter can also parse data with any separator, not just commas. 28 | 29 | csv { 30 | 31 | # Set the tab delimiter 32 | separator => "\t" 33 | 34 | # We want to exclude these system columns 35 | remove_field => [ 36 | "message", 37 | "host", 38 | "@timestamp", 39 | "@version", 40 | "path" 41 | ] 42 | 43 | # We infer the layout based on the header 44 | autodetect_column_names => "true" 45 | autogenerate_column_names => "true" 46 | 47 | ############################## 48 | 49 | # Replaces non-ASCII characters with an ASCII approximation, or if none exists, a replacement character which defaults to ? 50 | i18n { transliterate => [ "Sku","Name","SearchKeywords","Main","Price","ID","Brands" ] } 51 | 52 | ############################# 53 | 54 | # Create consistent hashes (fingerprints) of one or more fields and store the result in a new field. 55 | fingerprint { method => "SHA1" key => "MySecretKeyForMyEyesOnlyOk?" source => "Brands" target => "Brands" } 56 | 57 | ############################# 58 | 59 | # The mutate filter allows you to perform general 60 | # mutations on fields. You can rename, remove, replace 61 | # and modify fields in your events 62 | 63 | # We need to set the target column to "string" to allow for find and replace 64 | mutate { 65 | convert => [ "Sku", "string" ] 66 | } 67 | 68 | # Strip backslashes, question marks, equals, hashes, and minuses from the target column 69 | mutate { 70 | gsub => [ "Sku", "[\\?#=]", "" ] 71 | } 72 | 73 | # Strip extraneous white space from records 74 | mutate { 75 | strip => [ 76 | "Sku","Name","SearchKeywords","Main","Price","ID","Brands" 77 | ] 78 | } 79 | 80 | # Set everything to lowercase 81 | mutate { 82 | lowercase => [ "Sku","Name","SearchKeywords","Main","Price","ID","Brands" 83 | ] 84 | } 85 | 86 | ############################# 87 | 88 | # Set rate limits. Do not change unless you know what you are doing 89 | sleep { time => "1" every => 100 } 90 | 91 | } 92 | 93 | 94 | ####################### OUTPUT ########################### 95 | # An output sends event data to a particular 96 | # destination. Outputs are the final stage in the 97 | # event pipeline. 98 | ########################################################## 99 | 100 | output 101 | { 102 | # Sending the contents of the file to the event API 103 | http 104 | { 105 | # Put the URL for your HTTP endpoint to deliver events to 106 | url => "https://myapi.foo-api.us-east-1.amazonaws.com/dev/events/teststash?token=774f77b389154fd2ae7cb5131201777&sign=ujguuuljNjBkFGHyNTNmZTIxYjEzMWE5MjgyNzM1ODQ=" 107 | 108 | # Leave the settings below untouched. 
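# (For reference: pool_max and pool_max_per_route cap the HTTP connection pool,
# automatic_retries re-sends requests that fail at the connection level, and
# validate_after_inactivity re-checks idle connections before they are reused.)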
109 | http_method => "post" 110 | format => "json" 111 | keepalive => "true" 112 | automatic_retries => 1 113 | validate_after_inactivity => "45" 114 | pool_max => "10" 115 | pool_max_per_route => "5" 116 | codec => plain { charset => 'UTF-8' } 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /config/pipeline/sample-csv-api-noheader.conf: -------------------------------------------------------------------------------- 1 | ####################### INPUT ############################ 2 | # An input enables a specific source of 3 | # events to be read by Logstash. 4 | ########################################################## 5 | 6 | input { 7 | file { 8 | # Set the path to the source file(s) 9 | type => "set-type-as-your-table-name" 10 | path => "/data/sales.csv" 11 | start_position => "beginning" 12 | sincedb_path => "/dev/null" 13 | } 14 | } 15 | 16 | ####################### FILTER ########################### 17 | # A filter performs intermediary processing on an event. 18 | # Filters are often applied conditionally depending on the 19 | # characteristics of the event. 20 | ########################################################## 21 | 22 | filter { 23 | 24 | # The CSV filter takes an event field containing CSV data, 25 | # parses it, and stores it as individual fields (can optionally specify the names). 26 | # This filter can also parse data with any separator, not just commas. 27 | 28 | csv { 29 | 30 | # Set the comma delimiter 31 | separator => "," 32 | 33 | # We want to exclude these system columns 34 | remove_field => [ 35 | "message", 36 | "host", 37 | "@timestamp", 38 | "@version", 39 | "path" 40 | ] 41 | 42 | # Define the layout of the input file 43 | columns => [ "Sku","Name","SearchKeywords","Main","Price","ID","Brands" ] 44 | } 45 | 46 | ############################## 47 | 48 | # Replaces non-ASCII characters with an ASCII approximation, or if none exists, a replacement character which defaults to ? 49 | i18n { transliterate => [ "Sku","Name","SearchKeywords","Main","Price","ID","Brands" ] } 50 | 51 | ############################# 52 | 53 | # Create consistent hashes (fingerprints) of one or more fields and store the result in a new field. 54 | fingerprint { method => "SHA1" key => "MySecretKeyForMyEyesOnlyOk?" source => "Brands" target => "Brands" } 55 | 56 | ############################# 57 | 58 | # The mutate filter allows you to perform general 59 | # mutations on fields. You can rename, remove, replace 60 | # and modify fields in your events 61 | 62 | # We need to set the target column to "string" to allow for find and replace 63 | mutate { 64 | convert => [ "Sku", "string" ] 65 | } 66 | 67 | # Strip backslashes, question marks, equals, hashes, and minuses from the target column 68 | mutate { 69 | gsub => [ "Sku", "[\\?#=]", "" ] 70 | } 71 | 72 | # Strip extraneous white space from records 73 | mutate { 74 | strip => [ [ "Sku","Name","SearchKeywords","Main","Price","ID","Brands" 75 | ] 76 | } 77 | 78 | # Set everything to lowercase 79 | mutate { 80 | lowercase => [ [ "Sku","Name","SearchKeywords","Main","Price","ID","Brands" 81 | ] 82 | } 83 | 84 | ############################# 85 | 86 | # Set rate limits. Do not change unless you know what you are doing 87 | sleep { time => "1" every => 100 } 88 | 89 | } 90 | 91 | ####################### OUTPUT ########################### 92 | # An output sends event data to a particular 93 | # destination. Outputs are the final stage in the 94 | # event pipeline. 
95 | ########################################################## 96 | 97 | output 98 | { 99 | http 100 | { 101 | # Put the URL for your HTTP endpoint to deliver events to 102 | url => "https://myapi.foo-api.us-east-1.amazonaws.com/dev/events/teststash?token=774f77b389154fd2ae7cb5131201777&sign=ujguuuljNjBkFGHyNTNmZTIxYjEzMWE5MjgyNzM1ODQ=" 103 | 104 | # Leave the settings below untouched. 105 | http_method => "post" 106 | format => "json" 107 | keepalive => "true" 108 | automatic_retries => 1 109 | validate_after_inactivity => "45" 110 | pool_max => "10" 111 | pool_max_per_route => "5" 112 | codec => plain { charset => 'UTF-8' } 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /config/pipeline/sample-csv-csv-noheader.conf: -------------------------------------------------------------------------------- 1 | ####################### INPUT ############################ 2 | # An input enables a specific source of 3 | # events to be read by Logstash. 4 | ########################################################## 5 | 6 | input { 7 | file { 8 | # Set the path to the source file(s) 9 | type => "set-type-as-your-table-name" 10 | path => "/data/sales.csv" 11 | start_position => "beginning" 12 | sincedb_path => "/dev/null" 13 | } 14 | } 15 | 16 | ####################### FILTER ########################### 17 | # A filter performs intermediary processing on an event. 18 | # Filters are often applied conditionally depending on the 19 | # characteristics of the event. 20 | ########################################################## 21 | 22 | filter { 23 | 24 | # The CSV filter takes an event field containing CSV data, 25 | # parses it, and stores it as individual fields (can optionally specify the names). 26 | # This filter can also parse data with any separator, not just commas. 27 | 28 | csv { 29 | # Set the comma delimiter 30 | separator => "," 31 | 32 | # We want to exclude these system columns 33 | remove_field => [ 34 | "message", 35 | "host", 36 | "@timestamp", 37 | "@version", 38 | "path" 39 | ] 40 | 41 | # Define the layout of the input file 42 | columns => [ "Sku","Name","SearchKeywords","Main","Price","ID","Brands" 43 | ] 44 | } 45 | } 46 | 47 | ############################## 48 | 49 | # Replaces non-ASCII characters with an ASCII approximation, or if none exists, a replacement character which defaults to ? 50 | i18n { transliterate => [ "Sku","Name","SearchKeywords","Main","Price","ID","Brands" ] } 51 | 52 | ############################# 53 | 54 | # Create consistent hashes (fingerprints) of one or more fields and store the result in a new field. 55 | fingerprint { method => "SHA1" key => "MySecretKeyForMyEyesOnlyOk?" source => "Brands" target => "Brands" } 56 | 57 | ############################# 58 | 59 | # The mutate filter allows you to perform general 60 | # mutations on fields. 
You can rename, remove, replace 61 | # and modify fields in your events 62 | 63 | # We need to set the target column to "string" to allow for find and replace 64 | mutate { 65 | convert => [ "Sku", "string" ] 66 | } 67 | 68 | # Strip backslashes, question marks, equals, hashes, and minuses from the target column 69 | mutate { 70 | gsub => [ "Sku", "[\\?#=]", "" ] 71 | } 72 | 73 | # Strip extraneous white space from records 74 | mutate { 75 | strip => [ 76 | "Sku","Name","SearchKeywords","Main","Price","ID","Brands" 77 | ] 78 | } 79 | 80 | # Set everything to lowercase 81 | mutate { 82 | lowercase => [ "Sku","Name","SearchKeywords","Main","Price","ID","Brands" 83 | ] 84 | } 85 | } 86 | 87 | 88 | ####################### OUTPUT ########################### 89 | # An output sends event data to a particular 90 | # destination. Outputs are the final stage in the 91 | # event pipeline. 92 | ########################################################## 93 | 94 | output { 95 | 96 | # Saving output to CSV so we define the layout of the file 97 | csv { 98 | fields => [ "Sku","Name","SearchKeywords","Main","Price","ID","Brands" 99 | ] 100 | 101 | # Where do you want to export the file 102 | path => "/data/foo2.csv" 103 | codec => plain { charset => 'UTF-8' } 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /config/pipeline/sample-multi-csv-csv-noheader.conf: -------------------------------------------------------------------------------- 1 | ####################### INPUT ############################ 2 | # An input enables a specific source of 3 | # events to be read by Logstash. 4 | ########################################################## 5 | 6 | input { 7 | file { 8 | type => "sales" 9 | path => "/data/sales/sales.csv" 10 | start_position => "beginning" 11 | sincedb_path => "/dev/null" 12 | } 13 | file { 14 | type => "orders" 15 | path => "/data/orders/orders.csv" 16 | start_position => "beginning" 17 | sincedb_path => "/dev/null" 18 | } 19 | 20 | } 21 | 22 | ####################### FILTER ########################### 23 | # A filter performs intermediary processing on an event. 24 | # Filters are often applied conditionally depending on the 25 | # characteristics of the event. 26 | ########################################################## 27 | 28 | filter { 29 | 30 | # The CSV filter takes an event field containing CSV data, 31 | # parses it, and stores it as individual fields (can optionally specify the names). 32 | # This filter can also parse data with any separator, not just commas. 33 | 34 | csv { 35 | 36 | # Set the comma delimiter 37 | separator => "," 38 | 39 | # We want to exclude these system columns 40 | remove_field => [ 41 | "message", 42 | "host", 43 | "@timestamp", 44 | "@version", 45 | "path" 46 | ] 47 | 48 | # Define the layout of the input file 49 | if [type] == "sales" { columns => [ "Sku","Name","SearchKeywords","Main","Price","ID","Brands" ] } 50 | if [type] == "orders" { columns => [ "Sku","Name","SearchKeywords","Main","Price","ID","Brands" ] } 51 | } 52 | 53 | ############################## 54 | 55 | # Replaces non-ASCII characters with an ASCII approximation, or if none exists, a replacement character which defaults to ? 56 | i18n { transliterate => [ "Sku","Name","SearchKeywords","Main","Price","ID","Brands" ] } 57 | 58 | ############################# 59 | 60 | # Create consistent hashes (fingerprints) of one or more fields and store the result in a new field. 
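# (Because source and target are both "Brands", the original value is overwritten
# with its SHA1 hash, which is how the masking works here. Point target at a new
# field name if you want to keep the original value as well.)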
61 | fingerprint { method => "SHA1" key => "MySecretKeyForMyEyesOnlyOk?" source => "Brands" target => "Brands" } 62 | 63 | ############################# 64 | 65 | # The mutate filter allows you to perform general 66 | # mutations on fields. You can rename, remove, replace 67 | # and modify fields in your events 68 | 69 | # We need to set the target column to "string" to allow for find and replace 70 | mutate { 71 | convert => [ "Sku", "string" ] 72 | } 73 | 74 | # Strip backslashes, question marks, equals, hashes, and minuses from the target column 75 | mutate { 76 | gsub => [ "Sku", "[\\?#=]", "" ] 77 | } 78 | 79 | # Strip extraneous white space from records 80 | mutate { 81 | strip => [ 82 | "Sku","Name","SearchKeywords","Main","Price","ID","Brands" 83 | ] 84 | } 85 | 86 | # Set everything to lowercase 87 | mutate { 88 | lowercase => [ "Sku","Name","SearchKeywords","Main","Price","ID","Brands" 89 | ] 90 | } 91 | } 92 | 93 | ####################### OUTPUT ########################### 94 | # An output sends event data to a particular 95 | # destination. Outputs are the final stage in the 96 | # event pipeline. 97 | ########################################################## 98 | 99 | output { 100 | 101 | # Saving output to CSV so we define the layout of the file 102 | 103 | if [type] == "sales" { 104 | csv { 105 | fields => [ "Sku","Name","SearchKeywords","Main","Price","ID","Brands" ] 106 | path => "/data/clean-sales.csv" 107 | codec => plain { charset => 'UTF-8' } 108 | } 109 | } 110 | if [type] == "orders" { 111 | csv { 112 | fields => [ "Sku","Name","SearchKeywords","Main","Price","ID","Brands" ] 113 | path => "/data/clean-orders.csv" 114 | codec => plain { charset => 'UTF-8' } 115 | } 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /config/pipeline/sample-tab-api-header.conf: -------------------------------------------------------------------------------- 1 | ####################### INPUT ############################ 2 | # An input enables a specific source of 3 | # events to be read by Logstash. 4 | ########################################################## 5 | 6 | input { 7 | file { 8 | # Set the path to the source file(s) 9 | type => "sales" 10 | path => "/data/sales.txt" 11 | start_position => "beginning" 12 | sincedb_path => "/dev/null" 13 | } 14 | } 15 | 16 | 17 | ####################### FILTER ########################### 18 | # A filter performs intermediary processing on an event. 19 | # Filters are often applied conditionally depending on the 20 | # characteristics of the event. 21 | ########################################################## 22 | 23 | filter { 24 | 25 | # The CSV filter takes an event field containing CSV data, 26 | # parses it, and stores it as individual fields (can optionally specify the names). 27 | # This filter can also parse data with any separator, not just commas. 28 | 29 | csv { 30 | # Set the comma delimiter 31 | separator => "\t" 32 | 33 | # We want to exclude these system columns 34 | remove_field => [ 35 | "message", 36 | "host", 37 | "@timestamp", 38 | "@version", 39 | "path" 40 | ] 41 | 42 | # We infer the layout based on the header 43 | autodetect_column_names => "true" 44 | autogenerate_column_names => "true" 45 | 46 | } 47 | 48 | ############################## 49 | 50 | # Replaces non-ASCII characters with an ASCII approximation, or if none exists, a replacement character which defaults to ? 
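# (For example, a value like "Crème brûlée" would be transliterated to "Creme brulee".)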
51 | i18n { transliterate => [ "Sku","Name","SearchKeywords","Main","Price","ID","Brands" ] } 52 | 53 | ############################# 54 | 55 | # Create consistent hashes (fingerprints) of one or more fields and store the result in a new field. 56 | fingerprint { method => "SHA1" key => "MySecretKeyForMyEyesOnlyOk?" source => "Brands" target => "Brands" } 57 | 58 | ############################# 59 | 60 | # The mutate filter allows you to perform general 61 | # mutations on fields. You can rename, remove, replace 62 | # and modify fields in your events 63 | 64 | 65 | # We need to set the target column to "string" to allow for find and replace 66 | mutate { 67 | convert => [ "Sku", "string" ] 68 | } 69 | 70 | # Strip backslashes, question marks, equals, hashes, and minuses from the target column 71 | mutate { 72 | gsub => [ "Sku", "[\\?#=]", "" ] 73 | } 74 | 75 | # Strip extraneous white space from records 76 | mutate { 77 | strip => [ 78 | "Sku","Name","SearchKeywords","Main","Price","ID","Brands" 79 | ] 80 | } 81 | 82 | # Set everything to lowercase 83 | mutate { 84 | lowercase => [ "Sku","Name","SearchKeywords","Main","Price","ID","Brands" 85 | ] 86 | } 87 | ############################# 88 | 89 | # Set rate limits. Do not change unless you know what you are doing 90 | sleep { time => "1" every => 100 } 91 | 92 | } 93 | 94 | 95 | ####################### OUTPUT ########################### 96 | # An output sends event data to a particular 97 | # destination. Outputs are the final stage in the 98 | # event pipeline. 99 | ########################################################## 100 | 101 | output 102 | { 103 | # Sending the contents of the file to the event API 104 | http 105 | { 106 | # Put the URL for your HTTP endpoint to deliver events to 107 | url => "https://myapi.foo-api.us-east-1.amazonaws.com/dev/events/teststash?token=774f77b389154fd2ae7cb5131201777&sign=ujguuuljNjBkFGHyNTNmZTIxYjEzMWE5MjgyNzM1ODQ=" 108 | 109 | # Leave the settings below untouched. 
110 | http_method => "post" 111 | format => "json" 112 | keepalive => "true" 113 | automatic_retries => 1 114 | validate_after_inactivity => "45" 115 | pool_max => "10" 116 | pool_max_per_route => "5" 117 | codec => plain { charset => 'UTF-8' } 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /convert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import pandas as pd 6 | 7 | args = sys.argv 8 | SOURCE_FILE = args[1] 9 | SOURCE_DELIMITER = args[2] if len(args) >= 2 else 'þ' 10 | DESTINATION_FILE = args[3] if len(args) >= 3 else args[1] + '.csv' 11 | 12 | df = pd.read_csv(SOURCE_FILE, delimiter=SOURCE_DELIMITER); 13 | df.to_csv(DESTINATION_FILE, index=False, quoting=1) 14 | -------------------------------------------------------------------------------- /datastash-sftpscp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openbridge/ob_datastash/76d53980045dfdc8f2696bc8d8da38c10b4ef091/datastash-sftpscp.png -------------------------------------------------------------------------------- /datastash.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openbridge/ob_datastash/76d53980045dfdc8f2696bc8d8da38c10b4ef091/datastash.png -------------------------------------------------------------------------------- /docker-entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # first arg is `-f` or `--some-option` 5 | if [ "${1#-}" != "$1" ]; then 6 | set -- logstash "$@" 7 | fi 8 | 9 | mkdir -p /data/logstash-temp-queue 10 | mkdir -p /data/logstash-temp-sincedb 11 | # Set a link to datastash as a command. 12 | ln -s /usr/share/logstash/bin/logstash datastash 13 | 14 | # Run as user "logstash" if the command is "logstash" 15 | # allow the container to be started with `--user` 16 | if [ "$1" = 'datastash' -a "$(id -u)" = '0' ]; then 17 | set -- su-exec datastash "$@" 18 | fi 19 | 20 | exec "$@" 21 | -------------------------------------------------------------------------------- /logstash.yml: -------------------------------------------------------------------------------- 1 | # Settings file in YAML 2 | # 3 | # Settings can be specified either in hierarchical form, e.g.: 4 | # 5 | # pipeline: 6 | # batch: 7 | # size: 125 8 | # delay: 5 9 | # 10 | # Or as flat keys: 11 | # 12 | # pipeline.batch.size: 125 13 | # pipeline.batch.delay: 5 14 | # 15 | # ------------ Node identity ------------ 16 | # 17 | # Use a descriptive name for the node: 18 | # 19 | # node.name: datastash 20 | # xpack.monitoring.elasticsearch.url: http://0.0.0.0:9200 21 | # 22 | # If omitted the node name will default to the machine's host name 23 | # 24 | # ------------ Data path ------------------ 25 | # 26 | # Which directory should be used by logstash and its plugins 27 | # for any persistent needs. Defaults to LOGSTASH_HOME/data 28 | # 29 | # path.data: 30 | # 31 | # ------------ Pipeline Settings -------------- 32 | # 33 | # Set the number of workers that will, in parallel, execute the filters+outputs 34 | # stage of the pipeline. 35 | # 36 | # This defaults to the number of the host's CPU cores. 
37 | # 38 | # pipeline.workers: 2 39 | # 40 | # How many workers should be used per output plugin instance 41 | # 42 | # pipeline.output.workers: 1 43 | # 44 | # How many events to retrieve from inputs before sending to filters+workers 45 | # 46 | # pipeline.batch.size: 125 47 | # 48 | # How long to wait before dispatching an undersized batch to filters+workers 49 | # Value is in milliseconds. 50 | # 51 | # pipeline.batch.delay: 5 52 | # 53 | # Force Logstash to exit during shutdown even if there are still inflight 54 | # events in memory. By default, logstash will refuse to quit until all 55 | # received events have been pushed to the outputs. 56 | # 57 | # WARNING: enabling this can lead to data loss during shutdown 58 | # 59 | # pipeline.unsafe_shutdown: false 60 | # 61 | # ------------ Pipeline Configuration Settings -------------- 62 | # 63 | # Where to fetch the pipeline configuration for the main pipeline 64 | # 65 | # path.config: 66 | # 67 | # Pipeline configuration string for the main pipeline 68 | # 69 | # config.string: 70 | # 71 | # At startup, test if the configuration is valid and exit (dry run) 72 | # 73 | # config.test_and_exit: false 74 | # 75 | # Periodically check if the configuration has changed and reload the pipeline 76 | # This can also be triggered manually through the SIGHUP signal 77 | # 78 | # config.reload.automatic: true 79 | # 80 | # How often to check if the pipeline configuration has changed (in seconds) 81 | # 82 | # config.reload.interval: 10s 83 | # 84 | # Show fully compiled configuration as debug log message 85 | # NOTE: --log.level must be 'debug' 86 | # 87 | # config.debug: false 88 | # 89 | # When enabled, process escaped characters such as \n and \" in strings in the 90 | # pipeline configuration files. 91 | # 92 | # config.support_escapes: false 93 | # 94 | # ------------ Module Settings --------------- 95 | # Define modules here. Modules definitions must be defined as an array. 96 | # The simple way to see this is to prepend each `name` with a `-`, and keep 97 | # all associated variables under the `name` they are associated with, and 98 | # above the next, like this: 99 | # 100 | # modules: 101 | # - name: MODULE_NAME 102 | # var.PLUGINTYPE1.PLUGINNAME1.KEY1: VALUE 103 | # var.PLUGINTYPE1.PLUGINNAME1.KEY2: VALUE 104 | # var.PLUGINTYPE2.PLUGINNAME1.KEY1: VALUE 105 | # var.PLUGINTYPE3.PLUGINNAME3.KEY1: VALUE 106 | # 107 | # Module variable names must be in the format of 108 | # 109 | # var.PLUGIN_TYPE.PLUGIN_NAME.KEY 110 | # 111 | # modules: 112 | # 113 | # ------------ Cloud Settings --------------- 114 | # Define Elastic Cloud settings here. 115 | # Format of cloud.id is a base64 value e.g. dXMtZWFzdC0xLmF3cy5mb3VuZC5pbyRub3RhcmVhbCRpZGVudGlmaWVy 116 | # and it may have an label prefix e.g. staging:dXMtZ... 117 | # This will overwrite 'var.elasticsearch.hosts' and 'var.kibana.host' 118 | # cloud.id: 119 | # 120 | # Format of cloud.auth is: : 121 | # This is optional 122 | # If supplied this will overwrite 'var.elasticsearch.username' and 'var.elasticsearch.password' 123 | # If supplied this will overwrite 'var.kibana.username' and 'var.kibana.password' 124 | # cloud.auth: elastic: 125 | # 126 | # ------------ Queuing Settings -------------- 127 | # 128 | # Internal queuing model, "memory" for legacy in-memory based queuing and 129 | # "persisted" for disk-based acked queueing. Defaults is memory 130 | # 131 | queue.type: persisted 132 | # 133 | # If using queue.type: persisted, the directory path where the data files will be stored. 
134 | # Default is path.data/queue 135 | # 136 | path.queue: "/data/logstash-temp-queue" 137 | # 138 | # If using queue.type: persisted, the page data files size. The queue data consists of 139 | # append-only data files separated into pages. Default is 250mb 140 | # 141 | # queue.page_capacity: 250mb 142 | # 143 | # If using queue.type: persisted, the maximum number of unread events in the queue. 144 | # Default is 0 (unlimited) 145 | # 146 | # queue.max_events: 0 147 | # 148 | # If using queue.type: persisted, the total capacity of the queue in number of bytes. 149 | # If you would like more unacked events to be buffered in Logstash, you can increase the 150 | # capacity using this setting. Please make sure your disk drive has capacity greater than 151 | # the size specified here. If both max_bytes and max_events are specified, Logstash will pick 152 | # whichever criteria is reached first 153 | # Default is 1024mb or 1gb 154 | # 155 | #queue.max_bytes: 1024mb 156 | # 157 | # If using queue.type: persisted, the maximum number of acked events before forcing a checkpoint 158 | # Default is 1024, 0 for unlimited 159 | # 160 | #queue.checkpoint.acks: 1024 161 | # 162 | # If using queue.type: persisted, the maximum number of written events before forcing a checkpoint 163 | # Default is 1024, 0 for unlimited 164 | # 165 | #queue.checkpoint.writes: 1024 166 | # 167 | # If using queue.type: persisted, the interval in milliseconds when a checkpoint is forced on the head page 168 | # Default is 1000, 0 for no periodic checkpoint. 169 | # 170 | # queue.checkpoint.interval: 1000 171 | # 172 | # ------------ Dead-Letter Queue Settings -------------- 173 | # Flag to turn on dead-letter queue. 174 | # 175 | # dead_letter_queue.enable: false 176 | 177 | # If using dead_letter_queue.enable: true, the maximum size of each dead letter queue. Entries 178 | # will be dropped if they would increase the size of the dead letter queue beyond this setting. 179 | # Default is 1024mb 180 | # dead_letter_queue.max_bytes: 1024mb 181 | 182 | # If using dead_letter_queue.enable: true, the directory path where the data files will be stored. 183 | # Default is path.data/dead_letter_queue 184 | # 185 | # path.dead_letter_queue: 186 | # 187 | # ------------ Metrics Settings -------------- 188 | # 189 | # Bind address for the metrics REST endpoint 190 | # 191 | #http.host: "127.0.0.1" 192 | # 193 | # Bind port for the metrics REST endpoint, this option also accept a range 194 | # (9600-9700) and logstash will pick up the first available ports. 195 | # 196 | #http.port: 9600-9700 197 | # 198 | # ------------ Debugging Settings -------------- 199 | # 200 | # Options for log.level: 201 | # * fatal 202 | # * error 203 | # * warn 204 | # * info (default) 205 | # * debug 206 | # * trace 207 | # 208 | # log.level: info 209 | # path.logs: 210 | # 211 | # ------------ Other Settings -------------- 212 | # 213 | # Where to find custom plugins 214 | # path.plugins: [] 215 | --------------------------------------------------------------------------------