├── .github
    └── workflows
    │   └── ci.yml
├── .gitignore
├── .gitmodules
├── .travis.yml
├── Dockerfile
├── LICENSE
├── Makefile
├── README.md
├── snap
    └── snapcraft.yaml
└── src
    ├── argh.h
    └── blaze.cpp


/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
 1 | name: CI
 2 | on:
 3 |   pull_request:
 4 |   push:
 5 | 
 6 | jobs:
 7 |   build:
 8 |     name: Build
 9 |     runs-on: ${{ matrix.os }}
10 |     strategy:
11 |       matrix:
12 |         os: [ macos-10.15, ubuntu-20.04 ]
13 | 
14 |     steps:
15 |       - uses: actions/checkout@v2
16 | 
17 |       - name: Setup Git
18 |         run: git submodule update --init --recursive
19 |       # Refresh the package index first and pass -y so the install never waits for confirmation.
20 |       - name: Install dependencies (Ubuntu 20.04)
21 |         run: sudo apt-get update && sudo apt-get install -y libcurl4-openssl-dev
22 |         if: matrix.os == 'ubuntu-20.04'
23 | 
24 |       - name: Make
25 |         run: make
26 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | blaze
2 | *.o
3 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "vendor/rapidjson"]
2 | 	path = vendor/rapidjson
3 | 	url = https://github.com/Tencent/RapidJSON
4 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: cpp
 2 | 
 3 | compiler:
 4 |   - clang
 5 |   - gcc
 6 | 
 7 | os:
 8 |   - linux
 9 |   - osx
10 | 
11 | script: make
12 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | # -- build context
 2 | 
 3 | FROM alpine AS build-env
 4 | 
 5 | LABEL name="blaze in docker"
 6 | LABEL version="1.0.0"
 7 | LABEL maintainer="norman@khine.net"
 8 | 
 9 | WORKDIR /tmp
10 | 
11 | RUN apk update
12 | 
13 | RUN apk add --no-cache g++ gcc automake make autoconf libtool curl-dev git
14 | 
15 | COPY vendor/ ./vendor
16 | COPY src/ ./src
17 | COPY Makefile .
18 | 
19 | RUN make
20 | 
21 | # -- runtime context
22 | 
23 | FROM alpine
24 | 
25 | RUN apk update \
26 |   && apk add --no-cache ca-certificates openssl \
27 |   && update-ca-certificates
28 | 
29 | COPY --from=build-env /lib /lib
30 | COPY --from=build-env /usr/lib /usr/lib
31 | COPY --from=build-env /usr/local/share /usr/local/share
32 | COPY --from=build-env /usr/local/lib /usr/local/lib
33 | COPY --from=build-env /tmp/blaze /usr/local/bin
34 | 
35 | CMD ["/usr/local/bin/blaze"]
36 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright 2018 Viktor Elofsson and contributors.
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of
 4 | this software and associated documentation files (the "Software"), to deal in
 5 | the Software without restriction, including without limitation the rights to
 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
 7 | of the Software, and to permit persons to whom the Software is furnished to do
 8 | so, subject to the following conditions:
 9 | 
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | SOFTWARE.
20 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | CPPFLAGS=--std=c++11 -mtune=native -O3 -DNDEBUG=1
 2 | CXX=g++
 3 | RM=rm -f
 4 | 
 5 | all: blaze
 6 | 
 7 | blaze: src/blaze.o
 8 | 	$(CXX) -o blaze src/blaze.o -lcurl -lpthread
 9 | # The target is src/blaze.o (not blaze.o) so it matches the prerequisite of `blaze`.
10 | src/blaze.o: src/blaze.cpp
11 | 	$(CXX) $(CPPFLAGS) -c src/blaze.cpp -o src/blaze.o
12 | 
13 | .PHONY: clean
14 | clean:
15 | 	$(RM) src/blaze.o
16 | 
17 | .PHONY: distclean
18 | distclean: clean
19 | 	$(RM) blaze
20 | # Installed as a regular executable (0755); the tool needs no setuid-root bit.
21 | .PHONY: install
22 | install: blaze
23 | 	mkdir -p $(DESTDIR)/usr/local/bin
24 | 	install -m 0755 -o root blaze $(DESTDIR)/usr/local/bin
25 | 
26 | .PHONY: uninstall
27 | uninstall:
28 | 	rm -f $(DESTDIR)/usr/local/bin/blaze
29 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Blaze
  2 | 
  3 | Are you running Elasticsearch? Want to take your data and get the heck outta
  4 | Dodge? **Blaze** provides everything you need in a neat, blazing fast package!
  5 | 
  6 | | **Linux / OSX** |
  7 | | --------------- |
  8 | | [![Build Status](https://github.com/unidentifieddeveloper/blaze/workflows/CI/badge.svg?branch=master)](https://github.com/unidentifieddeveloper/blaze/actions?query=branch%3Amaster) |
  9 | 
 10 | 
 11 | ## Features
 12 | 
 13 |  - Uses the [Elasticsearch sliced scroll API](https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-scroll.html) to get your data hella fast.
 14 |  - Written in modern C++ using [libcurl](https://github.com/curl/curl) and [RapidJSON](https://github.com/Tencent/RapidJSON).
 15 |  - Distributed as a single, tiny binary.
 16 | 
 17 | 
  18 | ### Performance
 19 | 
 20 | Blaze compared to other Elasticsearch dump tools. The index has ~3.5M rows and
 21 | is ~5GB in size. Each tool is timed with `time` and measures the time to write
 22 | a simple JSON dump file.
 23 | 
 24 | | **Tool**    | **Time** |
 25 | | ----------- | -------- |
 26 | | Blaze       | 00m40s   |
 27 | | elasticdump | 04m38s   |
 28 | 
 29 | 
 30 | ## Usage
 31 | 
 32 | Get the binary for your platform from the Releases page or compile it yourself.
 33 | If you use it often it might make sense to put it in your `PATH` somewhere.
 34 | 
 35 | ```sh
 36 | $ blaze --host=http://localhost:9200 --index=massive_1 > dump.ndjson
 37 | ```
 38 | 
 39 | This will connect to Elasticsearch on the specified host and start downloading
 40 | the `massive_1` index to *stdout*. Make sure to redirect this somewhere, such as
 41 | a JSON file.
 42 | 
 43 | 
 44 | ### Output format
 45 | 
 46 | Blaze will dump everything to *stdout* in a format compatible with the
 47 | Elasticsearch Bulk API, meaning you can use `curl` to put the data back.
 48 | 
 49 | ```sh
 50 | curl -H "Content-Type: application/x-ndjson" -XPOST localhost:9200/other_data/_bulk --data-binary "@dump.ndjson"
 51 | ```
 52 | 
 53 | One issue when working with large datasets is that Elasticsearch has an upper
 54 | limit on the size of HTTP requests (2GB). The solution is to split the file
 55 | with something like `parallel`. The split should be done on even line numbers
 56 | since each command is actually two lines in the file.
 57 | 
 58 | ```sh
 59 | cat dump.ndjson | parallel --pipe -l 50000 curl -s -H "Content-Type: application/x-ndjson" -XPOST localhost:9200/other_data/_bulk --data-binary "@-"
 60 | ```
 61 | 
 62 | 
 63 | ### Command line options
 64 | 
 65 |  - `--host=<value>` - the host where Elasticsearch is running.
 66 |  - `--index=<value>` - the index to dump.
 67 |  - `--slices=<value>` - *(optional)* the number of slices to split the scroll. Should be set to the
 68 |    number of shards for the index (as seen on `/_cat/indices`). Defaults to *5*.
  69 |  - `--size=<value>` - *(optional)* the size of the response (i.e., length of the `hits` array).
 70 |    Defaults to *5000*.
 71 |  - `--dump-mappings` - specify this flag to dump the index mappings instead of the source.
 72 |  - `--dump-index-info` - specify this flag to dump the full index information (settings and mappings) instead of the source.
 73 | 
 74 | #### Authentication
 75 | 
 76 | To use HTTP Basic authentication you need to pass the following options. *Note*
 77 | that passing a password on the command line will put it in your terminal
 78 | history, so please use with care.
 79 | 
 80 |  - `--auth=basic` - enable HTTP Basic authentication.
 81 |  - `--basic-username=foo` - the username.
 82 |  - `--basic-password=bar` - the password.
  83 |  - `--insecure` - *(optional, works with or without Basic authentication)* for HTTPS connections, specify this flag to skip server certificate validation.
 84 | 
 85 | ## Building from source
 86 | 
 87 | Building Blaze is easy. It requires `libcurl`.
 88 | 
 89 | ### On Linux (and OSX)
 90 | 
 91 | ```sh
 92 | $ git submodule update --init
 93 | $ make
 94 | ```
 95 | 
 96 | ### Run it from docker
 97 | 
 98 | ```terminal
 99 | docker build -t blaze .
100 | docker run -it blaze blaze
101 | ```
102 | 
103 | ## License
104 | 
105 | Copyright © Viktor Elofsson and contributors.
106 | 
107 | Blaze is provided as-is under the MIT license. For more information see
108 | [LICENSE](https://github.com/vktr/blaze/blob/master/LICENSE).
109 | 
110 |  - For libcurl, see https://curl.haxx.se/docs/copyright.html
111 |  - For RapidJSON, see https://github.com/Tencent/rapidjson/blob/master/license.txt
112 | 


--------------------------------------------------------------------------------
/snap/snapcraft.yaml:
--------------------------------------------------------------------------------
 1 | name: blaze
 2 | version: 'stable'
 3 | summary: Blazing fast ElasticSearch data exporter.
 4 | description: |
 5 |   Blaze exports your ElasticSearch data really, really fast.
 6 | grade: devel
 7 | confinement: devmode
 8 | 
 9 | parts:
10 |   blaze:
11 |     source: .
12 |     plugin: make
13 | 


--------------------------------------------------------------------------------
/src/argh.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <algorithm>
  4 | #include <sstream>
  5 | #include <string>
  6 | #include <vector>
  7 | #include <set>
  8 | #include <map>
  9 | #include <cassert>
 10 | 
 11 | namespace argh
 12 | {
 13 |    // Terminology:
 14 |    // A command line is composed of 2 types of args:
 15 |    // 1. Positional args, i.e. free standing values
 16 |    // 2. Options: args beginning with '-'. We identify two kinds:
 17 |    //    2.1: Flags: boolean options =>  (exist ? true : false)
 18 |    //    2.2: Parameters: a name followed by a non-option value
 19 | 
 20 | #if !defined(__GNUC__) || (__GNUC__ >= 5)
 21 |    using string_stream = std::istringstream;
 22 | #else
  23 |     // Until GCC 5, istringstream did not have a move constructor, while the accessors
  24 |     // below return string_stream by value. stringstream_proxy is a copyable wrapper used instead.
  25 |    class stringstream_proxy
  26 |    {
  27 |    public:
  28 |       stringstream_proxy() = default;
  29 | 
  30 |       // Construct with a value.
  31 |       stringstream_proxy(std::string const& value) :
  32 |          stream_(value)
  33 |       {}
  34 | 
  35 |       // Copy constructor: copies the buffer contents and the state flags of 'other'.
  36 |       stringstream_proxy(const stringstream_proxy& other) :
  37 |          stream_(other.stream_.str())
  38 |       {
  39 |          stream_.setstate(other.stream_.rdstate());
  40 |       }
  41 | 
  42 |       void setstate(std::ios_base::iostate state) { stream_.setstate(state); } // forwards to the wrapped stream
  43 | 
  44 |       // Stream out the value of the parameter.
  45 |       // If the conversion was not possible, the stream will enter the fail state,
  46 |       // and operator bool will return false.
  47 |       template<typename T>
  48 |       stringstream_proxy& operator >> (T& thing)
  49 |       {
  50 |          stream_ >> thing;
  51 |          return *this;
  52 |       }
  53 | 
  54 | 
  55 |       // Get the string value.
  56 |       std::string str() const { return stream_.str(); }
  57 | 
  58 |       std::stringbuf* rdbuf() const { return stream_.rdbuf(); } // exposes the wrapped stream's buffer
  59 | 
  60 |       // Check the state of the stream.
  61 |       // False when the most recent stream operation failed.
  62 |       operator bool() const { return !!stream_; }
  63 | 
  64 |       ~stringstream_proxy() = default;
  65 |    private:
  66 |       std::istringstream stream_; // the wrapped stream; every member above forwards to it
  67 |    };
 68 |    using string_stream = stringstream_proxy;
 69 | #endif
  70 |    // Command-line parser: sorts arguments into positional args, flags and parameters.
  71 |    class parser
  72 |    {
  73 |    public:
  74 |       enum Mode { PREFER_FLAG_FOR_UNREG_OPTION = 1 << 0,   // unregistered option + value: option is a flag, value stays positional
  75 |                   PREFER_PARAM_FOR_UNREG_OPTION = 1 << 1,  // unregistered option + value: the value belongs to the option
  76 |                   NO_SPLIT_ON_EQUALSIGN = 1 << 2,          // do not split "name=value" options at '='
  77 |                   SINGLE_DASH_IS_MULTIFLAG = 1 << 3,       // "-abc" records the flags a, b and c
  78 |                 };
  79 | 
  80 |       parser() = default;
  81 | 
  82 |       parser(std::initializer_list<char const* const> pre_reg_names)
  83 |       {  add_params(pre_reg_names); }
  84 | 
  85 |       parser(const char* const argv[], int mode = PREFER_FLAG_FOR_UNREG_OPTION)
  86 |       {  parse(argv, mode); }
  87 | 
  88 |       parser(int argc, const char* const argv[], int mode = PREFER_FLAG_FOR_UNREG_OPTION)
  89 |       {  parse(argc, argv, mode); }
  90 | 
  91 |       void add_param(std::string const& name);
  92 |       void add_params(std::initializer_list<char const* const> init_list);
  93 | 
  94 |       void parse(const char* const argv[], int mode = PREFER_FLAG_FOR_UNREG_OPTION);
  95 |       void parse(int argc, const char* const argv[], int mode = PREFER_FLAG_FOR_UNREG_OPTION);
  96 | 
  97 |       std::multiset<std::string>          const& flags()    const { return flags_;    }
  98 |       std::map<std::string, std::string>  const& params()   const { return params_;   }
  99 |       std::vector<std::string>            const& pos_args() const { return pos_args_; }
 100 | 
 101 |       // begin() and end() for using range-for over positional args.
 102 |       std::vector<std::string>::const_iterator begin() const { return pos_args_.cbegin(); }
 103 |       std::vector<std::string>::const_iterator end()   const { return pos_args_.cend();   }
 104 |       size_t size()                                    const { return pos_args_.size();   }
 105 | 
 106 |       //////////////////////////////////////////////////////////////////////////
 107 |       // Accessors
 108 | 
 109 |       // flag (boolean) accessors: return true if the flag appeared, otherwise false.
 110 |       bool operator[](std::string const& name) const;
 111 | 
 112 |       // multiple flag (boolean) accessors: return true if at least one of the flags appeared, otherwise false.
 113 |       bool operator[](std::initializer_list<char const* const> init_list) const;
 114 | 
 115 |       // returns positional arg string by order. Like argv[] but without the options
 116 |       std::string const& operator[](size_t ind) const;
 117 | 
 118 |       // returns a string_stream that can be used to convert a positional arg to a typed value.
 119 |       string_stream operator()(size_t ind) const;
 120 | 
 121 |       // same as above, but with a default value in case the arg is missing (index out of range).
 122 |       template<typename T>
 123 |       string_stream operator()(size_t ind, T&& def_val) const;
 124 | 
 125 |       // parameter accessors: give a name, get a string_stream that can be used to convert to a typed value.
 126 |       // call .str() on result to get as string
 127 |       string_stream operator()(std::string const& name) const;
 128 | 
 129 |       // accessor for a parameter with multiple names: give a list of names, get a string_stream that can be used to convert to a typed value.
 130 |       // call .str() on result to get as string
 131 |       // returns the first value in the list to be found.
 132 |       string_stream operator()(std::initializer_list<char const* const> init_list) const;
 133 | 
 134 |       // same as above, but with a default value in case the param was missing.
 135 |       // Non-string def_val types must have an operator<<() (output stream operator)
 136 |       // If T only has an input stream operator, pass the string version of the type as in "3" instead of 3.
 137 |       template<typename T>
 138 |       string_stream operator()(std::string const& name, T&& def_val) const;
 139 | 
 140 |       // same as above but for a list of names. returns the first value to be found.
 141 |       template<typename T>
 142 |       string_stream operator()(std::initializer_list<char const* const> init_list, T&& def_val) const;
 143 | 
 144 |    private:
 145 |       string_stream bad_stream() const;
 146 |       std::string trim_leading_dashes(std::string const& name) const;
 147 |       bool is_number(std::string const& arg) const;
 148 |       bool is_option(std::string const& arg) const;
 149 |       bool got_flag(std::string const& name) const;
 150 |       bool is_param(std::string const& name) const;
 151 | 
 152 |    private:
 153 |       std::vector<std::string> args_;               // raw arguments copied from argv
 154 |       std::map<std::string, std::string> params_;   // option name -> value
 155 |       std::vector<std::string> pos_args_;           // non-option arguments, in order of appearance
 156 |       std::multiset<std::string> flags_;            // flag names; repeated flags are kept
 157 |       std::set<std::string> registeredParams_;      // names registered via add_param()/add_params()
 158 |       std::string empty_;                           // returned by operator[](size_t) when the index is out of range
 159 |    };
160 | 
161 | 
162 |    //////////////////////////////////////////////////////////////////////////
 163 |    // Counts the entries of a null-terminated argv, then delegates to parse(argc, argv, mode).
 164 |    inline void parser::parse(const char * const argv[], int mode)
 165 |    {
 166 |       int argc = 0;
 167 |       for (auto argvp = argv; *argvp; ++argc, ++argvp); // empty body: the loop only counts
 168 |       parse(argc, argv, mode);
 169 |    }
170 | 
171 |    //////////////////////////////////////////////////////////////////////////
 172 |    // Sorts argv into positional args, flags and parameters; results are added to the containers, which are not cleared first.
 173 |    inline void parser::parse(int argc, const char* const argv[], int mode /*= PREFER_FLAG_FOR_UNREG_OPTION*/)
 174 |    {
 175 |       // convert to strings
 176 |       args_.resize(argc);
 177 |       std::transform(argv, argv + argc, args_.begin(), [](const char* const arg) { return arg;  });
 178 | 
 179 |       // parse line
 180 |       for (auto i = 0u; i < args_.size(); ++i)
 181 |       {
 182 |          if (!is_option(args_[i]))
 183 |          {
 184 |             pos_args_.emplace_back(args_[i]);
 185 |             continue;
 186 |          }
 187 | 
 188 |          auto name = trim_leading_dashes(args_[i]);
 189 | 
 190 |          if (!(mode & NO_SPLIT_ON_EQUALSIGN))
 191 |          {
 192 |             auto equalPos = name.find('=');
 193 |             if (equalPos != std::string::npos)
 194 |             {
 195 |                params_.insert({ name.substr(0, equalPos), name.substr(equalPos + 1) }); // insert() keeps an existing value for the same name
 196 |                continue;
 197 |             }
 198 |          }
 199 | 
 200 |          // if the option is unregistered and should be a multi-flag
 201 |          if (1 == (args_[i].size() - name.size()) &&         // single dash
 202 |             argh::parser::SINGLE_DASH_IS_MULTIFLAG & mode && // multi-flag mode
 203 |             !is_param(name))                                  // unregistered
 204 |          {
 205 |             std::string keep_param; 
 206 |             
 207 |             if (!name.empty() && is_param(std::string(1ul, name.back()))) // last char is param
 208 |             {
 209 |                keep_param += name.back();
 210 |                name.resize(name.size() - 1);
 211 |             }
 212 | 
 213 |             for (auto const& c : name)
 214 |             {
 215 |                flags_.emplace(std::string{ c });
 216 |             }
 217 | 
 218 |             if (!keep_param.empty())
 219 |             {
 220 |                name = keep_param; // continue below with the trailing registered parameter
 221 |             }
 222 |             else
 223 |             {
 224 |                continue; // do not consider other options for this arg
 225 |             }
 226 |          }
 227 | 
 228 |          // any potential option will get as its value the next arg, unless that arg is an option too
 229 |          // in that case it will be determined a flag.
 230 |          if (i == args_.size() - 1 || is_option(args_[i + 1]))
 231 |          {
 232 |             flags_.emplace(name);
 233 |             continue;
 234 |          }
 235 | 
 236 |          // if 'name' is a pre-registered option, then the next arg cannot be a free parameter, so it is skipped
 237 |          // otherwise we have 2 modes:
 238 |          // PREFER_FLAG_FOR_UNREG_OPTION: a non-registered 'name' is determined a flag. 
 239 |          //                               The following value (the next arg) will be a free parameter.
 240 |          //
 241 |          // PREFER_PARAM_FOR_UNREG_OPTION: a non-registered 'name' is determined a parameter, the next arg
 242 |          //                                will be the value of that option.
 243 | 
 244 |          assert(!(mode & argh::parser::PREFER_FLAG_FOR_UNREG_OPTION)
 245 |              || !(mode & argh::parser::PREFER_PARAM_FOR_UNREG_OPTION)); // the two modes are mutually exclusive
 246 | 
 247 |          bool preferParam = mode & argh::parser::PREFER_PARAM_FOR_UNREG_OPTION;
 248 | 
 249 |          if (is_param(name) || preferParam)
 250 |          {
 251 |             params_.insert({ name, args_[i + 1] });
 252 |             ++i; // skip next value, it is not a free parameter
 253 |             continue;
 254 |          }
 255 |          else
 256 |          {
 257 |             flags_.emplace(name);
 258 |          }
 259 |       };
 260 |    }
261 | 
262 |    //////////////////////////////////////////////////////////////////////////
 263 |    // Returns an empty stream with failbit set; the accessors return it when an argument is missing.
 264 |    inline string_stream parser::bad_stream() const
 265 |    {
 266 |       string_stream bad;
 267 |       bad.setstate(std::ios_base::failbit);
 268 |       return bad;
 269 |    }
270 | 
271 |    //////////////////////////////////////////////////////////////////////////
 272 |    // True when a double can be extracted from the start of 'arg'; whatever follows the number is not checked.
 273 |    inline bool parser::is_number(std::string const& arg) const
 274 |    {
 275 |       // inefficient but simple way to determine if a string is a number (which can start with a '-')
 276 |       std::istringstream istr(arg);
 277 |       double number;
 278 |       istr >> number;
 279 |       return !(istr.fail() || istr.bad());
 280 |    }
281 | 
282 |    //////////////////////////////////////////////////////////////////////////
 283 |    // An option is an arg that starts with '-' and is not accepted by is_number() (so "-5" is not an option).
 284 |    inline bool parser::is_option(std::string const& arg) const
 285 |    {
 286 |       assert(0 != arg.size()); // compiled out when NDEBUG is defined
 287 |       if (is_number(arg))
 288 |          return false;
 289 |       return '-' == arg[0];
 290 |    }
291 | 
292 |    //////////////////////////////////////////////////////////////////////////
 293 |    // Strips all leading '-' characters; an empty or dashes-only string is returned unchanged.
 294 |    inline std::string parser::trim_leading_dashes(std::string const& name) const
 295 |    {
 296 |       auto pos = name.find_first_not_of('-');
 297 |       return std::string::npos != pos ? name.substr(pos) : name;
 298 |    }
299 | 
300 |    //////////////////////////////////////////////////////////////////////////
 301 |    // True when 'name' (leading dashes ignored) was recorded as a flag.
 302 |    inline bool argh::parser::got_flag(std::string const& name) const
 303 |    {
 304 |       return flags_.end() != flags_.find(trim_leading_dashes(name));
 305 |    }
306 | 
307 |    //////////////////////////////////////////////////////////////////////////
 308 |    // True when 'name' was registered via add_param()/add_params(); 'name' is looked up as given, without trimming dashes.
 309 |    inline bool argh::parser::is_param(std::string const& name) const
 310 |    {
 311 |       return registeredParams_.count(name);
 312 |    }
313 | 
314 |    //////////////////////////////////////////////////////////////////////////
 315 |    // Flag lookup by name; forwards to got_flag().
 316 |    inline bool parser::operator[](std::string const& name) const
 317 |    {
 318 |       return got_flag(name);
 319 |    }
320 | 
321 |    //////////////////////////////////////////////////////////////////////////
 322 |    // True when at least one of the listed names was recorded as a flag.
 323 |    inline bool parser::operator[](std::initializer_list<char const* const> init_list) const
 324 |    {
 325 |       return std::any_of(init_list.begin(), init_list.end(), [&](char const* const name) { return got_flag(name); });
 326 |    }
327 | 
328 |    //////////////////////////////////////////////////////////////////////////
 329 |    // Positional arg by index; an out-of-range index yields a reference to an empty string.
 330 |    inline std::string const& parser::operator[](size_t ind) const
 331 |    {
 332 |       if (ind < pos_args_.size())
 333 |          return pos_args_[ind];
 334 |       return empty_;
 335 |    }
336 | 
337 |    //////////////////////////////////////////////////////////////////////////
 338 |    // Stream over the value of parameter 'name' (leading dashes ignored); a failed stream when it is absent.
 339 |    inline string_stream parser::operator()(std::string const& name) const
 340 |    {
 341 |       auto optIt = params_.find(trim_leading_dashes(name));
 342 |       if (params_.end() != optIt)
 343 |          return string_stream(optIt->second);
 344 |       return bad_stream();
 345 |    }
346 | 
347 |    //////////////////////////////////////////////////////////////////////////
 348 |    // Tries the names in list order and streams the first value found; a failed stream when none is present.
 349 |    inline string_stream parser::operator()(std::initializer_list<char const* const> init_list) const
 350 |    {
 351 |       for (auto& name : init_list)
 352 |       {
 353 |          auto optIt = params_.find(trim_leading_dashes(name));
 354 |          if (params_.end() != optIt)
 355 |             return string_stream(optIt->second);
 356 |       }
 357 |       return bad_stream();
 358 |    }
359 | 
360 |    //////////////////////////////////////////////////////////////////////////
 361 |    // Stream over the value of parameter 'name', or over def_val formatted with operator<< when it is absent.
 362 |    template<typename T>
 363 |    string_stream parser::operator()(std::string const& name, T&& def_val) const
 364 |    {
 365 |       auto optIt = params_.find(trim_leading_dashes(name));
 366 |       if (params_.end() != optIt)
 367 |          return string_stream(optIt->second);
 368 | 
 369 |       std::ostringstream ostr;
 370 |       ostr << def_val;
 371 |       return string_stream(ostr.str()); // use default
 372 |    }
373 | 
374 |    //////////////////////////////////////////////////////////////////////////
375 | 
 376 |    // Tries the names in list order and streams the first value found; falls back to def_val formatted with operator<<.
 377 |    template<typename T>
 378 |    string_stream parser::operator()(std::initializer_list<char const* const> init_list, T&& def_val) const
 379 |    {
 380 |       for (auto& name : init_list)
 381 |       {
 382 |          auto optIt = params_.find(trim_leading_dashes(name));
 383 |          if (params_.end() != optIt)
 384 |             return string_stream(optIt->second);
 385 |       }      
 386 |       std::ostringstream ostr;
 387 |       ostr << def_val;
 388 |       return string_stream(ostr.str()); // use default
 389 |    }
390 | 
391 |    //////////////////////////////////////////////////////////////////////////
 392 |    // Stream over the positional arg at 'ind'; a failed stream when 'ind' is out of range.
 393 |    inline string_stream parser::operator()(size_t ind) const
 394 |    {
 395 |       if (pos_args_.size() <= ind)
 396 |          return bad_stream();
 397 | 
 398 |       return string_stream(pos_args_[ind]);
 399 |    }
400 | 
401 |    //////////////////////////////////////////////////////////////////////////
 402 |    // Stream over the positional arg at 'ind', or over def_val formatted with operator<< when 'ind' is out of range.
 403 |    template<typename T>
 404 |    string_stream parser::operator()(size_t ind, T&& def_val) const
 405 |    {
 406 |       if (pos_args_.size() <= ind)
 407 |       {
 408 |          std::ostringstream ostr;
 409 |          ostr << def_val;
 410 |          return string_stream(ostr.str());
 411 |       }
 412 | 
 413 |       return string_stream(pos_args_[ind]);
 414 |    }
415 | 
416 |    //////////////////////////////////////////////////////////////////////////
 417 |    // Registers 'name' (leading dashes stripped) as an option whose following arg is its value.
 418 |    inline void parser::add_param(std::string const& name)
 419 |    {
 420 |       registeredParams_.insert(trim_leading_dashes(name));
 421 |    }
422 | 
423 |    //////////////////////////////////////////////////////////////////////////
 424 |    // Registers every name in the list, each with its leading dashes stripped.
 425 |    inline void parser::add_params(std::initializer_list<char const* const> init_list)
 426 |    {
 427 |       for (auto& name : init_list)
 428 |          registeredParams_.insert(trim_leading_dashes(name));
 429 |    }
430 | }
431 | 
432 | 


--------------------------------------------------------------------------------
/src/blaze.cpp:
--------------------------------------------------------------------------------
  1 | #include <iomanip>
  2 | #include <iostream>
  3 | #include <mutex>
  4 | #include <sstream>
  5 | #include <thread>
  6 | #include <vector>
  7 | 
  8 | #include <curl/curl.h>
  9 | 
 10 | #include "argh.h"
 11 | #include "../vendor/rapidjson/include/rapidjson/document.h"
 12 | #include "../vendor/rapidjson/include/rapidjson/filewritestream.h"
 13 | #include "../vendor/rapidjson/include/rapidjson/writer.h"
 14 | 
 15 | #define DEFAULT_SIZE   5000
 16 | #define DEFAULT_SLICES 5
 17 | #define WRITE_BUF_SIZE 65536
 18 | 
 19 | static std::mutex mtx_out;
  20 | // Authentication and TLS settings that get_or_post_data() applies to a request.
  21 | struct auth_options
  22 | {
  23 |     std::string type;      // "basic" turns on HTTP Basic auth; any other value adds no credentials
  24 |     std::string user;      // user name sent with Basic auth
  25 |     std::string pass;      // password sent with Basic auth
  26 |     bool insecure;         // true disables peer and host certificate checks; NOTE(review): no default here, confirm it is always set
  27 | };
  28 | // Settings for one dump() call.
  29 | struct dump_options
  30 | {
  31 |     std::string  host;       // presumably the --host value; its use is outside this view
  32 |     std::string  index;      // presumably the --index value; its use is outside this view
  33 |     auth_options auth;
  34 |     int          slice_id;   // written as "id" of the query's "slice" object
  35 |     int          slice_max;  // written as "max" of the query's "slice" object
  36 |     int          size;       // written as the query's "size"
  37 | };
  38 | // State object passed to dump() by pointer.
  39 | struct thread_state
  40 | {
  41 |     std::stringstream error; // presumably filled by dump() on failure; the writer is outside this view
  42 | };
  43 | // Bundles a slice id with its thread_state and std::thread (presumably one per slice; creation is outside this view).
  44 | struct thread_container
  45 | {
  46 |     int          slice_id;
  47 |     thread_state state;
  48 |     std::thread  thread;
  49 | };
  50 | // libcurl write callback: appends the received bytes to the std::vector<char> passed in userp.
  51 | size_t write_data(
  52 |     void   * buffer,
  53 |     size_t   size,
  54 |     size_t   nmemb,
  55 |     void   * userp)
  56 | {
  57 |     std::vector<char>* data = reinterpret_cast<std::vector<char>*>(userp);
  58 | 
  59 |     const char* real_buffer = reinterpret_cast<const char*>(buffer);
  60 |     size_t real_size = size * nmemb;
  61 |     data->insert(data->end(), real_buffer, real_buffer + real_size);
  62 |     return real_size; // every received byte is reported as handled
  63 | }
 64 | 
 65 | bool get_or_post_data(
 66 |     CURL                * crl,
 67 |     std::string   const & url,
 68 |     auth_options  const & auth,
 69 |     std::vector<char>   * data,
 70 |     long                * response_code,
 71 |     std::string         * error,
 72 |     std::string           body = "")
 73 | {
 74 |     curl_slist* headers = nullptr;
 75 |     headers = curl_slist_append(headers, "Content-Type: application/json");
 76 | 
 77 |     curl_easy_setopt(crl, CURLOPT_HTTPHEADER,    headers);
 78 |     curl_easy_setopt(crl, CURLOPT_URL,           url.c_str());
 79 |     curl_easy_setopt(crl, CURLOPT_WRITEFUNCTION, &write_data);
 80 |     curl_easy_setopt(crl, CURLOPT_WRITEDATA,     reinterpret_cast<void*>(data));
 81 | 
 82 |     if (auth.insecure)
 83 |     {
 84 |         curl_easy_setopt(crl, CURLOPT_SSL_VERIFYPEER, 0);
 85 |         curl_easy_setopt(crl, CURLOPT_SSL_VERIFYHOST, 0);
 86 |     }
 87 | 
 88 |     if (auth.type == "basic")
 89 |     {
 90 |         std::string user_pass = auth.user + ":" + auth.pass;
 91 |         curl_easy_setopt(crl, CURLOPT_HTTPAUTH, CURLAUTH_BASIC);
 92 |         curl_easy_setopt(crl, CURLOPT_USERPWD,  user_pass.c_str());
 93 |     }
 94 | 
 95 |     if (!body.empty())
 96 |     {
 97 |         curl_easy_setopt(crl, CURLOPT_POSTFIELDS, body.c_str());
 98 |     }
 99 | 
100 |     CURLcode res = curl_easy_perform(crl);
101 |     curl_slist_free_all(headers);
102 | 
103 |     if (res == CURLE_OK)
104 |     {
105 |         curl_easy_getinfo(crl, CURLINFO_RESPONSE_CODE, response_code);
106 |         return true;
107 |     }
108 | 
109 |     *error = curl_easy_strerror(res);
110 |     return false;
111 | }
 112 | // Writes each hit of `document` to stdout as two JSON lines, then reports the scroll id and the number of hits.
 113 | void write_document(
 114 |     rapidjson::Document & document,
 115 |     int                 * hits_count,
 116 |     std::string         * scroll_id)
 117 | {
 118 |     std::unique_lock<std::mutex>      lock(mtx_out); // held until the function returns
 119 | 
 120 |     static char                       buffer[WRITE_BUF_SIZE];
 121 |     static rapidjson::FileWriteStream stream(stdout, buffer, sizeof(buffer)); // static: one stream shared by all calls
 122 | 
 123 |     // Pick out the parts of the response that are used below.
 124 |     auto const& scroll_id_value   = document["_scroll_id"];
 125 |     auto const& hits_object_value = document["hits"];
 126 |     auto const& hits_object       = hits_object_value.GetObject();
 127 |     auto const& hits_value        = hits_object["hits"];
 128 |     auto const& hits              = hits_value.GetArray();
 129 | 
 130 |     // Shared allocator
 131 |     auto& allocator               = document.GetAllocator();
 132 |     auto  writer                  = rapidjson::Writer<rapidjson::FileWriteStream>(stream);
 133 | 
 134 |     for (rapidjson::Value const& hit : hits)
 135 |     {
 136 |         auto meta_index      = rapidjson::Value(rapidjson::kObjectType);
 137 |         auto meta_index_id   = rapidjson::Value();
 138 |         auto meta_object     = rapidjson::Value(rapidjson::kObjectType);
 139 | 
 140 |         meta_index_id.SetString(hit["_id"].GetString(), allocator);
 141 | 
 142 |         meta_index.AddMember("_id",   meta_index_id,   allocator);
 143 | 
 144 |         meta_object.AddMember("index", meta_index, allocator);
 145 | 
 146 |         // Serialize to output stream. Do it in two steps to get
 147 |         // new-line separated JSON.
 148 | 
 149 |         meta_object.Accept(writer);     // first line: {"index":{"_id":...}}
 150 |         stream.Put('\n');
 151 |         stream.Flush();
 152 |         writer.Reset(stream);           // reset so the writer can emit another root value
 153 | 
 154 |         hit["_source"].Accept(writer);  // second line: the hit's _source
 155 |         stream.Put('\n');
 156 |         stream.Flush();
 157 |         writer.Reset(stream);
 158 |     }
 159 | 
 160 |     *scroll_id  = scroll_id_value.GetString();
 161 |     *hits_count = hits.Size();
 162 | }
163 | 
164 | void output_parser_error(
165 |     rapidjson::Document const& doc,
166 |     std::ostream             & stream)
167 | {
168 |     stream << "JSON parsing failed with code: "
169 |            << doc.GetParseError()
170 |            << ", at offset "
171 |            << doc.GetErrorOffset();
172 | }
173 | 
174 | void dump(
175 |     dump_options const& options,
176 |     thread_state      * state)
177 | {
178 |     CURL* crl = curl_easy_init();
179 | 
180 |     std::string query = "{\n"
181 |         "\"size\": " + std::to_string(options.size) + ",\n"
182 |         "\"slice\": {\n"
183 |             "\"id\": " + std::to_string(options.slice_id) + ",\n"
184 |             "\"max\": " + std::to_string(options.slice_max) + "\n"
185 |         "}\n"
186 |     "}";
187 | 
188 |     std::vector<char> buffer;
189 |     long              response_code;
190 |     std::string       error;
191 | 
192 |     bool res = get_or_post_data(
193 |         crl,
194 |         options.host + "/" + options.index + "/_search?scroll=1m",
195 |         options.auth,
196 |         &buffer,
197 |         &response_code,
198 |         &error,
199 |         query);
200 | 
201 |     if (!res)
202 |     {
203 |         state->error << "A HTTP error occured: " << error;
204 |         return;
205 |     }
206 | 
207 |     if (response_code != 200)
208 |     {
209 |         state->error << "Server returned HTTP status " << response_code << ": " << buffer.data();
210 |         return;
211 |     }
212 | 
213 |     rapidjson::Document doc;
214 |     doc.Parse(buffer.data(), buffer.size());
215 | 
216 |     if (doc.HasParseError())
217 |     {
218 |         return output_parser_error(doc, state->error);
219 |     }
220 | 
221 |     std::string scroll_id;
222 |     int         hits_count;
223 | 
224 |     write_document(
225 |         doc,
226 |         &hits_count,
227 |         &scroll_id);
228 | 
229 |     do
230 |     {
231 |         query = "{\n"
232 |             "\"scroll\": \"1m\",\n"
233 |             "\"scroll_id\": \"" + scroll_id + "\"\n"
234 |         "}\n";
235 | 
236 |         buffer.clear();
237 | 
238 |         res = get_or_post_data(
239 |             crl,
240 |             options.host + "/_search/scroll",
241 |             options.auth,
242 |             &buffer,
243 |             &response_code,
244 |             &error,
245 |             query);
246 | 
247 |         if (!res)
248 |         {
249 |             state->error << "A HTTP error occured: " << error;
250 |             return;
251 |         }
252 | 
253 |         if (response_code != 200)
254 |         {
255 |             state->error << "Server returned HTTP status " << response_code;
256 |             return;
257 |         }
258 | 
259 |         rapidjson::Document doc_search;
260 |         doc_search.Parse(buffer.data(), buffer.size());
261 | 
262 |         if (doc_search.HasParseError())
263 |         {
264 |             return output_parser_error(doc_search, state->error);
265 |         }
266 | 
267 |         write_document(
268 |             doc_search,
269 |             &hits_count,
270 |             &scroll_id);
271 |     } while (hits_count > 0);
272 | 
273 |     curl_easy_cleanup(crl);
274 | }
275 | 
276 | int64_t count_documents(
277 |     std::string  const& host,
278 |     std::string  const& index,
279 |     auth_options const& auth)
280 | {
281 |     CURL                * crl = curl_easy_init();
282 |     long                  response_code;
283 |     rapidjson::Document   doc;
284 |     std::string           url = host + "/" + index + "/_count";
285 |     std::string           error;
286 |     std::vector<char>     buffer;
287 | 
288 |     bool res = get_or_post_data(
289 |         crl,
290 |         url,
291 |         auth,
292 |         &buffer,
293 |         &response_code,
294 |         &error);
295 | 
296 |     if (!res)
297 |     {
298 |         std::cerr << "A HTTP error occured: " << error << std::endl;
299 |         return -1;
300 |     }
301 | 
302 |     doc.Parse(buffer.data(), buffer.size());
303 | 
304 |     if (doc.HasParseError())
305 |     {
306 |         output_parser_error(doc, std::cerr);
307 |         return -1;
308 |     }
309 | 
310 |     return doc["count"].GetInt64();
311 | }
312 | 
313 | int dump_mappings(
314 |     std::string  const& host,
315 |     std::string  const& index,
316 |     auth_options const& auth)
317 | {
318 |     static char                       write_buffer[WRITE_BUF_SIZE];
319 |     static rapidjson::FileWriteStream stream(stdout, write_buffer, sizeof(write_buffer));
320 | 
321 |     CURL                            * crl = curl_easy_init();
322 |     long                              response_code;
323 |     rapidjson::Document               doc;
324 |     std::string                       url = host + "/" + index + "/_mapping";
325 |     std::string                       error;
326 |     std::vector<char>                 buffer;
327 | 
328 |     bool res = get_or_post_data(
329 |         crl,
330 |         url,
331 |         auth,
332 |         &buffer,
333 |         &response_code,
334 |         &error);
335 | 
336 |     if (!res)
337 |     {
338 |         std::cerr << "A HTTP error occured: " << error << std::endl;
339 |         return 1;
340 |     }
341 | 
342 |     doc.Parse(buffer.data(), buffer.size());
343 | 
344 |     if (doc.HasParseError())
345 |     {
346 |         output_parser_error(doc, std::cerr);
347 |         return 1;
348 |     }
349 | 
350 |     rapidjson::Writer<rapidjson::FileWriteStream> writer(stream);
351 |     doc[index.c_str()].Accept(writer);
352 |     stream.Put('\n');
353 |     stream.Flush();
354 | 
355 |     curl_easy_cleanup(crl);
356 | 
357 |     return 0;
358 | }
359 | 
360 | int dump_index_info(
361 |     std::string  const& host,
362 |     std::string  const& index,
363 |     auth_options const& auth)
364 | {
365 |     static char                       write_buffer[WRITE_BUF_SIZE];
366 |     static rapidjson::FileWriteStream stream(stdout, write_buffer, sizeof(write_buffer));
367 | 
368 |     CURL                            * crl = curl_easy_init();
369 |     long                              response_code;
370 |     rapidjson::Document               doc;
371 |     std::string                       url = host + "/" + index;
372 |     std::string                       error;
373 |     std::vector<char>                 buffer;
374 | 
375 |     bool res = get_or_post_data(
376 |         crl,
377 |         url,
378 |         auth,
379 |         &buffer,
380 |         &response_code,
381 |         &error);
382 | 
383 |     if (!res)
384 |     {
385 |         std::cerr << "A HTTP error occured: " << error << std::endl;
386 |         return 1;
387 |     }
388 | 
389 |     doc.Parse(buffer.data(), buffer.size());
390 | 
391 |     if (doc.HasParseError())
392 |     {
393 |         output_parser_error(doc, std::cerr);
394 |         return 1;
395 |     }
396 | 
397 |     rapidjson::Writer<rapidjson::FileWriteStream> writer(stream);
398 |     doc[index.c_str()].Accept(writer);
399 |     stream.Put('\n');
400 |     stream.Flush();
401 | 
402 |     curl_easy_cleanup(crl);
403 | 
404 |     return 0;
405 | }
406 | 
407 | int main(
408 |     int    argc,
409 |     char * argv[])
410 | {
411 |     curl_global_init(CURL_GLOBAL_ALL);
412 | 
413 |     std::vector<std::unique_ptr<thread_container>> threads;
414 | 
415 |     // Parse command line options
416 |     argh::parser cmdl(argv);
417 | 
418 |     std::string host;
419 |     if (!(cmdl({"--host"}) >> host))
420 |     {
421 |         std::cerr << "Must provide an Elasticsearch host (--host)" << std::endl;
422 |         return 1;
423 |     }
424 | 
425 |     std::string index;
426 |     if (!(cmdl({"--index"}) >> index))
427 |     {
428 |         std::cerr << "Must provide an index (--index)" << std::endl;
429 |         return 1;
430 |     }
431 | 
432 |     auth_options auth;
433 | 
434 |     if (cmdl({"--auth"}) >> auth.type)
435 |     {
436 |         if (auth.type == "basic")
437 |         {
438 |             if (!(cmdl({"--basic-username"}) >> auth.user))
439 |             {
440 |                 std::cerr << "Must provide --basic-username when passing --auth=basic" << std::endl;
441 |                 return 1;
442 |             }
443 | 
444 |             if (!(cmdl({"--basic-password"}) >> auth.pass))
445 |             {
446 |                 std::cerr << "Must provide --basic-password when passing --auth=basic" << std::endl;
447 |                 return 1;
448 |             }
449 |         }
450 |     }
451 | 
452 |     auth.insecure = cmdl["--insecure"];
453 | 
454 |     if (cmdl["--dump-mappings"])
455 |     {
456 |         return dump_mappings(
457 |             host,
458 |             index,
459 |             auth);
460 |     }
461 |     else if (cmdl["--dump-index-info"])
462 |     {
463 |         return dump_index_info(
464 |             host,
465 |             index,
466 |             auth);
467 |     }
468 | 
469 |     // Sanity check - see if we have any documents in the index at all.
470 |     if (count_documents(host, index, auth) <= 0)
471 |     {
472 |         std::cerr << "Index is empty - no documents found" << std::endl;
473 |         return 0;
474 |     }
475 | 
476 |     int slices;
477 |     cmdl({"--slices"}, DEFAULT_SLICES) >> slices;
478 | 
479 |     int size;
480 |     cmdl({"--size"}, DEFAULT_SIZE) >> size;
481 | 
482 |     for (int i = 0; i < slices; i++)
483 |     {
484 |         dump_options opts;
485 |         opts.host      = host;
486 |         opts.index     = index;
487 |         opts.auth      = auth;
488 |         opts.size      = size;
489 |         opts.slice_id  = i;
490 |         opts.slice_max = slices;
491 | 
492 |         auto cnt       = std::unique_ptr<thread_container>(new thread_container());
493 |         cnt->slice_id  = i;
494 |         cnt->thread    = std::thread(dump, opts, &cnt->state);
495 | 
496 |         threads.push_back(std::move(cnt));
497 |     }
498 | 
499 |     int exit_code = 0;
500 | 
501 |     for (auto& cnt : threads)
502 |     {
503 |         cnt->thread.join();
504 | 
505 |         if (cnt->state.error.tellp() > 0)
506 |         {
507 |             std::cerr << "Slice "
508 |                       << std::setw(2) << std::setfill('0') << cnt->slice_id
509 |                       << " exited with error: "
510 |                       << cnt->state.error.rdbuf()
511 |                       << std::endl;
512 | 
513 |             exit_code = 1;
514 |         }
515 |     }
516 | 
517 |     curl_global_cleanup();
518 | 
519 |     return exit_code;
520 | }
521 | 


--------------------------------------------------------------------------------