├── .github └── workflows │ └── ci.yml ├── .gitignore ├── .gitmodules ├── .travis.yml ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── snap └── snapcraft.yaml └── src ├── argh.h └── blaze.cpp /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | pull_request: 4 | push: 5 | 6 | jobs: 7 | build: 8 | name: Build 9 | runs-on: ${{ matrix.os }} 10 | strategy: 11 | matrix: 12 | os: [ macos-10.15, ubuntu-20.04 ] 13 | 14 | steps: 15 | - uses: actions/checkout@v2 16 | 17 | - name: Setup Git 18 | run: git submodule update --init --recursive 19 | 20 | - name: Install dependencies (Ubuntu 20.04) 21 | run: sudo apt-get install libcurl4-openssl-dev 22 | if: matrix.os == 'ubuntu-20.04' 23 | 24 | - name: Make 25 | run: make 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | blaze 2 | *.o 3 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "vendor/rapidjson"] 2 | path = vendor/rapidjson 3 | url = https://github.com/Tencent/RapidJSON 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: cpp 2 | 3 | compiler: 4 | - clang 5 | - gcc 6 | 7 | os: 8 | - linux 9 | - osx 10 | 11 | script: make 12 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # -- build context 2 | 3 | FROM alpine AS build-env 4 | 5 | LABEL name="blaze in docker" 6 | LABEL version="1.0.0" 7 | LABEL maintainer="norman@khine.net" 8 | 9 | WORKDIR /tmp 10 | 11 | RUN apk update 12 | 13 | RUN apk add 
--no-cache g++ gcc automake make autoconf libtool curl-dev git 14 | 15 | COPY vendor/ ./vendor 16 | COPY src/ ./src 17 | COPY Makefile . 18 | 19 | RUN make 20 | 21 | # -- runtime context 22 | 23 | FROM alpine 24 | 25 | RUN apk update \ 26 | && apk add --no-cache ca-certificates openssl \ 27 | && update-ca-certificates 28 | 29 | COPY --from=build-env /lib /lib 30 | COPY --from=build-env /usr/lib /usr/lib 31 | COPY --from=build-env /usr/local/share /usr/local/share 32 | COPY --from=build-env /usr/local/lib /usr/local/lib 33 | COPY --from=build-env /tmp/blaze /usr/local/bin 34 | 35 | CMD ["/usr/local/bin/blaze"] 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2018 Viktor Elofsson and contributors. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 7 | of the Software, and to permit persons to whom the Software is furnished to do 8 | so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 
20 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CPPFLAGS=--std=c++11 -mtune=native -O3 -DNDEBUG=1 2 | CXX=g++ 3 | RM=rm -f 4 | 5 | all: blaze 6 | 7 | blaze: src/blaze.o 8 | $(CXX) -o blaze src/blaze.o -lcurl -lpthread 9 | 10 | blaze.o: src/blaze.cpp 11 | $(CXX) $(CPPFLAGS) -c src/blaze.cpp -o src/blaze.o 12 | 13 | .PHONY: clean 14 | clean: 15 | $(RM) src/blaze.o 16 | 17 | .PHONY: distclean 18 | distclean: clean 19 | $(RM) blaze 20 | 21 | .PHONY: install 22 | install: blaze 23 | mkdir -p $(DESTDIR)/usr/local/bin 24 | install -m 4755 -o root blaze $(DESTDIR)/usr/local/bin 25 | 26 | .PHONY: uninstall 27 | uninstall: 28 | rm -f $(DESTDIR)/usr/local/bin/blaze 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Blaze 2 | 3 | Are you running Elasticsearch? Want to take your data and get the heck outta 4 | Dodge? **Blaze** provides everything you need in a neat, blazing fast package! 5 | 6 | | **Linux / OSX** | 7 | | --------------- | 8 | | [![Build Status](https://github.com/unidentifieddeveloper/blaze/workflows/CI/badge.svg?branch=master)](https://github.com/unidentifieddeveloper/blaze/actions?query=branch%3Amaster) | 9 | 10 | 11 | ## Features 12 | 13 | - Uses the [Elasticsearch sliced scroll API](https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-scroll.html) to get your data hella fast. 14 | - Written in modern C++ using [libcurl](https://github.com/curl/curl) and [RapidJSON](https://github.com/Tencent/RapidJSON). 15 | - Distributed as a single, tiny binary. 16 | 17 | 18 | ### Performance 19 | 20 | Blaze compared to other Elasticsearch dump tools. The index has ~3.5M rows and 21 | is ~5GB in size. 
Each tool is run under `time` to measure how long it takes to write 22 | a simple JSON dump file. 23 | 24 | | **Tool** | **Time** | 25 | | ----------- | -------- | 26 | | Blaze | 00m40s | 27 | | elasticdump | 04m38s | 28 | 29 | 30 | ## Usage 31 | 32 | Get the binary for your platform from the Releases page or compile it yourself. 33 | If you use it often it might make sense to put it in your `PATH` somewhere. 34 | 35 | ```sh 36 | $ blaze --host=http://localhost:9200 --index=massive_1 > dump.ndjson 37 | ``` 38 | 39 | This will connect to Elasticsearch on the specified host and start downloading 40 | the `massive_1` index to *stdout*. Make sure to redirect this somewhere, such as 41 | a JSON file. 42 | 43 | 44 | ### Output format 45 | 46 | Blaze will dump everything to *stdout* in a format compatible with the 47 | Elasticsearch Bulk API, meaning you can use `curl` to put the data back. 48 | 49 | ```sh 50 | curl -H "Content-Type: application/x-ndjson" -XPOST localhost:9200/other_data/_bulk --data-binary "@dump.ndjson" 51 | ``` 52 | 53 | One issue when working with large datasets is that Elasticsearch has an upper 54 | limit on the size of HTTP requests (2GB). The solution is to split the file 55 | with something like `parallel`. The split should be done on even line numbers 56 | since each command is actually two lines in the file. 57 | 58 | ```sh 59 | cat dump.ndjson | parallel --pipe -l 50000 curl -s -H "Content-Type: application/x-ndjson" -XPOST localhost:9200/other_data/_bulk --data-binary "@-" 60 | ``` 61 | 62 | 63 | ### Command line options 64 | 65 | - `--host=` - the host where Elasticsearch is running. 66 | - `--index=` - the index to dump. 67 | - `--slices=` - *(optional)* the number of slices to split the scroll into. Should be set to the 68 | number of shards for the index (as seen on `/_cat/indices`). Defaults to *5*. 69 | - `--size=` - *(optional)* the size of the response (i.e., length of the `hits` array). 70 | Defaults to *5000*. 
71 | - `--dump-mappings` - specify this flag to dump the index mappings instead of the source. 72 | - `--dump-index-info` - specify this flag to dump the full index information (settings and mappings) instead of the source. 73 | 74 | #### Authentication 75 | 76 | To use HTTP Basic authentication you need to pass the following options. *Note* 77 | that passing a password on the command line will put it in your terminal 78 | history, so please use with care. 79 | 80 | - `--auth=basic` - enable HTTP Basic authentication. 81 | - `--basic-username=foo` - the username. 82 | - `--basic-password=bar` - the password. 83 | - `--insecure` - For HTTPS connections, specify this flag to skip server certificate validation. 84 | 85 | ## Building from source 86 | 87 | Building Blaze is easy. It requires `libcurl`. 88 | 89 | ### On Linux (and OSX) 90 | 91 | ```sh 92 | $ git submodule update --init 93 | $ make 94 | ``` 95 | 96 | ### Run it from docker 97 | 98 | ```terminal 99 | docker build -t blaze . 100 | docker run -it blaze blaze 101 | ``` 102 | 103 | ## License 104 | 105 | Copyright © Viktor Elofsson and contributors. 106 | 107 | Blaze is provided as-is under the MIT license. For more information see 108 | [LICENSE](https://github.com/vktr/blaze/blob/master/LICENSE). 109 | 110 | - For libcurl, see https://curl.haxx.se/docs/copyright.html 111 | - For RapidJSON, see https://github.com/Tencent/rapidjson/blob/master/license.txt 112 | -------------------------------------------------------------------------------- /snap/snapcraft.yaml: -------------------------------------------------------------------------------- 1 | name: blaze 2 | version: 'stable' 3 | summary: Blazing fast ElasticSearch data exporter. 4 | description: | 5 | Blaze exports your ElasticSearch data really, really fast. 6 | grade: devel 7 | confinement: devmode 8 | 9 | parts: 10 | blaze: 11 | source: . 
// Copyable wrapper around std::istringstream, used as the parser's result
// stream on compilers where istringstream has no move constructor (GCC < 5).
// It exposes only the handful of stream operations the parser's callers use.
class stringstream_proxy
{
public:
   stringstream_proxy() = default;
   ~stringstream_proxy() = default;

   // Wrap `value` so it can be extracted with operator>>.
   stringstream_proxy(std::string const& value)
      : stream_(value)
   {}

   // Copy the text of `other` and carry over its state flags. The copy is
   // built from other's full string, so it reads from the beginning again.
   stringstream_proxy(const stringstream_proxy& other)
      : stream_(other.stream_.str())
   {
      stream_.setstate(other.stream_.rdstate());
   }

   // Add `state` to the wrapped stream's state flags.
   void setstate(std::ios_base::iostate state)
   {
      stream_.setstate(state);
   }

   // Extract a value of type T. When the conversion is not possible the
   // wrapped stream enters the fail state and operator bool yields false.
   template<typename T>
   stringstream_proxy& operator >> (T& thing)
   {
      stream_ >> thing;
      return *this;
   }

   // The complete underlying text, regardless of how much has been read.
   std::string str() const
   {
      return stream_.str();
   }

   // Direct access to the wrapped stream's buffer.
   std::stringbuf* rdbuf() const
   {
      return stream_.rdbuf();
   }

   // True unless the most recent stream operation failed.
   operator bool() const
   {
      return !stream_.fail();
   }

private:
   std::istringstream stream_;
};
113 | bool operator[](std::initializer_list init_list) const; 114 | 115 | // returns positional arg string by order. Like argv[] but without the options 116 | std::string const& operator[](size_t ind) const; 117 | 118 | // returns a std::istream that can be used to convert a positional arg to a typed value. 119 | string_stream operator()(size_t ind) const; 120 | 121 | // same as above, but with a default value in case the arg is missing (index out of range). 122 | template 123 | string_stream operator()(size_t ind, T&& def_val) const; 124 | 125 | // parameter accessors, give a name get an std::istream that can be used to convert to a typed value. 126 | // call .str() on result to get as string 127 | string_stream operator()(std::string const& name) const; 128 | 129 | // accessor for a parameter with multiple names, give a list of names, get an std::istream that can be used to convert to a typed value. 130 | // call .str() on result to get as string 131 | // returns the first value in the list to be found. 132 | string_stream operator()(std::initializer_list init_list) const; 133 | 134 | // same as above, but with a default value in case the param was missing. 135 | // Non-string def_val types must have an operator<<() (output stream operator) 136 | // If T only has an input stream operator, pass the string version of the type as in "3" instead of 3. 137 | template 138 | string_stream operator()(std::string const& name, T&& def_val) const; 139 | 140 | // same as above but for a list of names. returns the first value to be found. 
141 | template 142 | string_stream operator()(std::initializer_list init_list, T&& def_val) const; 143 | 144 | private: 145 | string_stream bad_stream() const; 146 | std::string trim_leading_dashes(std::string const& name) const; 147 | bool is_number(std::string const& arg) const; 148 | bool is_option(std::string const& arg) const; 149 | bool got_flag(std::string const& name) const; 150 | bool is_param(std::string const& name) const; 151 | 152 | private: 153 | std::vector args_; 154 | std::map params_; 155 | std::vector pos_args_; 156 | std::multiset flags_; 157 | std::set registeredParams_; 158 | std::string empty_; 159 | }; 160 | 161 | 162 | ////////////////////////////////////////////////////////////////////////// 163 | 164 | inline void parser::parse(const char * const argv[], int mode) 165 | { 166 | int argc = 0; 167 | for (auto argvp = argv; *argvp; ++argc, ++argvp); 168 | parse(argc, argv, mode); 169 | } 170 | 171 | ////////////////////////////////////////////////////////////////////////// 172 | 173 | inline void parser::parse(int argc, const char* const argv[], int mode /*= PREFER_FLAG_FOR_UNREG_OPTION*/) 174 | { 175 | // convert to strings 176 | args_.resize(argc); 177 | std::transform(argv, argv + argc, args_.begin(), [](const char* const arg) { return arg; }); 178 | 179 | // parse line 180 | for (auto i = 0u; i < args_.size(); ++i) 181 | { 182 | if (!is_option(args_[i])) 183 | { 184 | pos_args_.emplace_back(args_[i]); 185 | continue; 186 | } 187 | 188 | auto name = trim_leading_dashes(args_[i]); 189 | 190 | if (!(mode & NO_SPLIT_ON_EQUALSIGN)) 191 | { 192 | auto equalPos = name.find('='); 193 | if (equalPos != std::string::npos) 194 | { 195 | params_.insert({ name.substr(0, equalPos), name.substr(equalPos + 1) }); 196 | continue; 197 | } 198 | } 199 | 200 | // if the option is unregistered and should be a multi-flag 201 | if (1 == (args_[i].size() - name.size()) && // single dash 202 | argh::parser::SINGLE_DASH_IS_MULTIFLAG & mode && // multi-flag mode 
203 | !is_param(name)) // unregistered 204 | { 205 | std::string keep_param; 206 | 207 | if (!name.empty() && is_param(std::string(1ul, name.back()))) // last char is param 208 | { 209 | keep_param += name.back(); 210 | name.resize(name.size() - 1); 211 | } 212 | 213 | for (auto const& c : name) 214 | { 215 | flags_.emplace(std::string{ c }); 216 | } 217 | 218 | if (!keep_param.empty()) 219 | { 220 | name = keep_param; 221 | } 222 | else 223 | { 224 | continue; // do not consider other options for this arg 225 | } 226 | } 227 | 228 | // any potential option will get as its value the next arg, unless that arg is an option too 229 | // in that case it will be determined a flag. 230 | if (i == args_.size() - 1 || is_option(args_[i + 1])) 231 | { 232 | flags_.emplace(name); 233 | continue; 234 | } 235 | 236 | // if 'name' is a pre-registered option, then the next arg cannot be a free parameter to it is skipped 237 | // otherwise we have 2 modes: 238 | // PREFER_FLAG_FOR_UNREG_OPTION: a non-registered 'name' is determined a flag. 239 | // The following value (the next arg) will be a free parameter. 240 | // 241 | // PREFER_PARAM_FOR_UNREG_OPTION: a non-registered 'name' is determined a parameter, the next arg 242 | // will be the value of that option. 
243 | 244 | assert(!(mode & argh::parser::PREFER_FLAG_FOR_UNREG_OPTION) 245 | || !(mode & argh::parser::PREFER_PARAM_FOR_UNREG_OPTION)); 246 | 247 | bool preferParam = mode & argh::parser::PREFER_PARAM_FOR_UNREG_OPTION; 248 | 249 | if (is_param(name) || preferParam) 250 | { 251 | params_.insert({ name, args_[i + 1] }); 252 | ++i; // skip next value, it is not a free parameter 253 | continue; 254 | } 255 | else 256 | { 257 | flags_.emplace(name); 258 | } 259 | }; 260 | } 261 | 262 | ////////////////////////////////////////////////////////////////////////// 263 | 264 | inline string_stream parser::bad_stream() const 265 | { 266 | string_stream bad; 267 | bad.setstate(std::ios_base::failbit); 268 | return bad; 269 | } 270 | 271 | ////////////////////////////////////////////////////////////////////////// 272 | 273 | inline bool parser::is_number(std::string const& arg) const 274 | { 275 | // inefficient but simple way to determine if a string is a number (which can start with a '-') 276 | std::istringstream istr(arg); 277 | double number; 278 | istr >> number; 279 | return !(istr.fail() || istr.bad()); 280 | } 281 | 282 | ////////////////////////////////////////////////////////////////////////// 283 | 284 | inline bool parser::is_option(std::string const& arg) const 285 | { 286 | assert(0 != arg.size()); 287 | if (is_number(arg)) 288 | return false; 289 | return '-' == arg[0]; 290 | } 291 | 292 | ////////////////////////////////////////////////////////////////////////// 293 | 294 | inline std::string parser::trim_leading_dashes(std::string const& name) const 295 | { 296 | auto pos = name.find_first_not_of('-'); 297 | return std::string::npos != pos ? 
name.substr(pos) : name; 298 | } 299 | 300 | ////////////////////////////////////////////////////////////////////////// 301 | 302 | inline bool argh::parser::got_flag(std::string const& name) const 303 | { 304 | return flags_.end() != flags_.find(trim_leading_dashes(name)); 305 | } 306 | 307 | ////////////////////////////////////////////////////////////////////////// 308 | 309 | inline bool argh::parser::is_param(std::string const& name) const 310 | { 311 | return registeredParams_.count(name); 312 | } 313 | 314 | ////////////////////////////////////////////////////////////////////////// 315 | 316 | inline bool parser::operator[](std::string const& name) const 317 | { 318 | return got_flag(name); 319 | } 320 | 321 | ////////////////////////////////////////////////////////////////////////// 322 | 323 | inline bool parser::operator[](std::initializer_list init_list) const 324 | { 325 | return std::any_of(init_list.begin(), init_list.end(), [&](char const* const name) { return got_flag(name); }); 326 | } 327 | 328 | ////////////////////////////////////////////////////////////////////////// 329 | 330 | inline std::string const& parser::operator[](size_t ind) const 331 | { 332 | if (ind < pos_args_.size()) 333 | return pos_args_[ind]; 334 | return empty_; 335 | } 336 | 337 | ////////////////////////////////////////////////////////////////////////// 338 | 339 | inline string_stream parser::operator()(std::string const& name) const 340 | { 341 | auto optIt = params_.find(trim_leading_dashes(name)); 342 | if (params_.end() != optIt) 343 | return string_stream(optIt->second); 344 | return bad_stream(); 345 | } 346 | 347 | ////////////////////////////////////////////////////////////////////////// 348 | 349 | inline string_stream parser::operator()(std::initializer_list init_list) const 350 | { 351 | for (auto& name : init_list) 352 | { 353 | auto optIt = params_.find(trim_leading_dashes(name)); 354 | if (params_.end() != optIt) 355 | return string_stream(optIt->second); 356 
| } 357 | return bad_stream(); 358 | } 359 | 360 | ////////////////////////////////////////////////////////////////////////// 361 | 362 | template 363 | string_stream parser::operator()(std::string const& name, T&& def_val) const 364 | { 365 | auto optIt = params_.find(trim_leading_dashes(name)); 366 | if (params_.end() != optIt) 367 | return string_stream(optIt->second); 368 | 369 | std::ostringstream ostr; 370 | ostr << def_val; 371 | return string_stream(ostr.str()); // use default 372 | } 373 | 374 | ////////////////////////////////////////////////////////////////////////// 375 | 376 | // same as above but for a list of names. returns the first value to be found. 377 | template 378 | string_stream parser::operator()(std::initializer_list init_list, T&& def_val) const 379 | { 380 | for (auto& name : init_list) 381 | { 382 | auto optIt = params_.find(trim_leading_dashes(name)); 383 | if (params_.end() != optIt) 384 | return string_stream(optIt->second); 385 | } 386 | std::ostringstream ostr; 387 | ostr << def_val; 388 | return string_stream(ostr.str()); // use default 389 | } 390 | 391 | ////////////////////////////////////////////////////////////////////////// 392 | 393 | inline string_stream parser::operator()(size_t ind) const 394 | { 395 | if (pos_args_.size() <= ind) 396 | return bad_stream(); 397 | 398 | return string_stream(pos_args_[ind]); 399 | } 400 | 401 | ////////////////////////////////////////////////////////////////////////// 402 | 403 | template 404 | string_stream parser::operator()(size_t ind, T&& def_val) const 405 | { 406 | if (pos_args_.size() <= ind) 407 | { 408 | std::ostringstream ostr; 409 | ostr << def_val; 410 | return string_stream(ostr.str()); 411 | } 412 | 413 | return string_stream(pos_args_[ind]); 414 | } 415 | 416 | ////////////////////////////////////////////////////////////////////////// 417 | 418 | inline void parser::add_param(std::string const& name) 419 | { 420 | registeredParams_.insert(trim_leading_dashes(name)); 421 | } 
422 | 423 | ////////////////////////////////////////////////////////////////////////// 424 | 425 | inline void parser::add_params(std::initializer_list init_list) 426 | { 427 | for (auto& name : init_list) 428 | registeredParams_.insert(trim_leading_dashes(name)); 429 | } 430 | } 431 | 432 | -------------------------------------------------------------------------------- /src/blaze.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | #include "argh.h" 11 | #include "../vendor/rapidjson/include/rapidjson/document.h" 12 | #include "../vendor/rapidjson/include/rapidjson/filewritestream.h" 13 | #include "../vendor/rapidjson/include/rapidjson/writer.h" 14 | 15 | #define DEFAULT_SIZE 5000 16 | #define DEFAULT_SLICES 5 17 | #define WRITE_BUF_SIZE 65536 18 | 19 | static std::mutex mtx_out; 20 | 21 | struct auth_options 22 | { 23 | std::string type; 24 | std::string user; 25 | std::string pass; 26 | bool insecure; 27 | }; 28 | 29 | struct dump_options 30 | { 31 | std::string host; 32 | std::string index; 33 | auth_options auth; 34 | int slice_id; 35 | int slice_max; 36 | int size; 37 | }; 38 | 39 | struct thread_state 40 | { 41 | std::stringstream error; 42 | }; 43 | 44 | struct thread_container 45 | { 46 | int slice_id; 47 | thread_state state; 48 | std::thread thread; 49 | }; 50 | 51 | size_t write_data( 52 | void * buffer, 53 | size_t size, 54 | size_t nmemb, 55 | void * userp) 56 | { 57 | std::vector* data = reinterpret_cast*>(userp); 58 | 59 | const char* real_buffer = reinterpret_cast(buffer); 60 | size_t real_size = size * nmemb; 61 | data->insert(data->end(), real_buffer, real_buffer + real_size); 62 | return real_size; 63 | } 64 | 65 | bool get_or_post_data( 66 | CURL * crl, 67 | std::string const & url, 68 | auth_options const & auth, 69 | std::vector * data, 70 | long * response_code, 71 | std::string * error, 72 | std::string 
body = "") 73 | { 74 | curl_slist* headers = nullptr; 75 | headers = curl_slist_append(headers, "Content-Type: application/json"); 76 | 77 | curl_easy_setopt(crl, CURLOPT_HTTPHEADER, headers); 78 | curl_easy_setopt(crl, CURLOPT_URL, url.c_str()); 79 | curl_easy_setopt(crl, CURLOPT_WRITEFUNCTION, &write_data); 80 | curl_easy_setopt(crl, CURLOPT_WRITEDATA, reinterpret_cast(data)); 81 | 82 | if (auth.insecure) 83 | { 84 | curl_easy_setopt(crl, CURLOPT_SSL_VERIFYPEER, 0); 85 | curl_easy_setopt(crl, CURLOPT_SSL_VERIFYHOST, 0); 86 | } 87 | 88 | if (auth.type == "basic") 89 | { 90 | std::string user_pass = auth.user + ":" + auth.pass; 91 | curl_easy_setopt(crl, CURLOPT_HTTPAUTH, CURLAUTH_BASIC); 92 | curl_easy_setopt(crl, CURLOPT_USERPWD, user_pass.c_str()); 93 | } 94 | 95 | if (!body.empty()) 96 | { 97 | curl_easy_setopt(crl, CURLOPT_POSTFIELDS, body.c_str()); 98 | } 99 | 100 | CURLcode res = curl_easy_perform(crl); 101 | curl_slist_free_all(headers); 102 | 103 | if (res == CURLE_OK) 104 | { 105 | curl_easy_getinfo(crl, CURLINFO_RESPONSE_CODE, response_code); 106 | return true; 107 | } 108 | 109 | *error = curl_easy_strerror(res); 110 | return false; 111 | } 112 | 113 | void write_document( 114 | rapidjson::Document & document, 115 | int * hits_count, 116 | std::string * scroll_id) 117 | { 118 | std::unique_lock lock(mtx_out); 119 | 120 | static char buffer[WRITE_BUF_SIZE]; 121 | static rapidjson::FileWriteStream stream(stdout, buffer, sizeof(buffer)); 122 | 123 | // Epic const unfolding. 
124 | auto const& scroll_id_value = document["_scroll_id"]; 125 | auto const& hits_object_value = document["hits"]; 126 | auto const& hits_object = hits_object_value.GetObject(); 127 | auto const& hits_value = hits_object["hits"]; 128 | auto const& hits = hits_value.GetArray(); 129 | 130 | // Shared allocator 131 | auto& allocator = document.GetAllocator(); 132 | auto writer = rapidjson::Writer(stream); 133 | 134 | for (rapidjson::Value const& hit : hits) 135 | { 136 | auto meta_index = rapidjson::Value(rapidjson::kObjectType); 137 | auto meta_index_id = rapidjson::Value(); 138 | auto meta_object = rapidjson::Value(rapidjson::kObjectType); 139 | 140 | meta_index_id.SetString(hit["_id"].GetString(), allocator); 141 | 142 | meta_index.AddMember("_id", meta_index_id, allocator); 143 | 144 | meta_object.AddMember("index", meta_index, allocator); 145 | 146 | // Serialize to output stream. Do it in two steps to get 147 | // new-line separated JSON. 148 | 149 | meta_object.Accept(writer); 150 | stream.Put('\n'); 151 | stream.Flush(); 152 | writer.Reset(stream); 153 | 154 | hit["_source"].Accept(writer); 155 | stream.Put('\n'); 156 | stream.Flush(); 157 | writer.Reset(stream); 158 | } 159 | 160 | *scroll_id = scroll_id_value.GetString(); 161 | *hits_count = hits.Size(); 162 | } 163 | 164 | void output_parser_error( 165 | rapidjson::Document const& doc, 166 | std::ostream & stream) 167 | { 168 | stream << "JSON parsing failed with code: " 169 | << doc.GetParseError() 170 | << ", at offset " 171 | << doc.GetErrorOffset(); 172 | } 173 | 174 | void dump( 175 | dump_options const& options, 176 | thread_state * state) 177 | { 178 | CURL* crl = curl_easy_init(); 179 | 180 | std::string query = "{\n" 181 | "\"size\": " + std::to_string(options.size) + ",\n" 182 | "\"slice\": {\n" 183 | "\"id\": " + std::to_string(options.slice_id) + ",\n" 184 | "\"max\": " + std::to_string(options.slice_max) + "\n" 185 | "}\n" 186 | "}"; 187 | 188 | std::vector buffer; 189 | long response_code; 
190 | std::string error; 191 | 192 | bool res = get_or_post_data( 193 | crl, 194 | options.host + "/" + options.index + "/_search?scroll=1m", 195 | options.auth, 196 | &buffer, 197 | &response_code, 198 | &error, 199 | query); 200 | 201 | if (!res) 202 | { 203 | state->error << "A HTTP error occured: " << error; 204 | return; 205 | } 206 | 207 | if (response_code != 200) 208 | { 209 | state->error << "Server returned HTTP status " << response_code << ": " << buffer.data(); 210 | return; 211 | } 212 | 213 | rapidjson::Document doc; 214 | doc.Parse(buffer.data(), buffer.size()); 215 | 216 | if (doc.HasParseError()) 217 | { 218 | return output_parser_error(doc, state->error); 219 | } 220 | 221 | std::string scroll_id; 222 | int hits_count; 223 | 224 | write_document( 225 | doc, 226 | &hits_count, 227 | &scroll_id); 228 | 229 | do 230 | { 231 | query = "{\n" 232 | "\"scroll\": \"1m\",\n" 233 | "\"scroll_id\": \"" + scroll_id + "\"\n" 234 | "}\n"; 235 | 236 | buffer.clear(); 237 | 238 | res = get_or_post_data( 239 | crl, 240 | options.host + "/_search/scroll", 241 | options.auth, 242 | &buffer, 243 | &response_code, 244 | &error, 245 | query); 246 | 247 | if (!res) 248 | { 249 | state->error << "A HTTP error occured: " << error; 250 | return; 251 | } 252 | 253 | if (response_code != 200) 254 | { 255 | state->error << "Server returned HTTP status " << response_code; 256 | return; 257 | } 258 | 259 | rapidjson::Document doc_search; 260 | doc_search.Parse(buffer.data(), buffer.size()); 261 | 262 | if (doc_search.HasParseError()) 263 | { 264 | return output_parser_error(doc_search, state->error); 265 | } 266 | 267 | write_document( 268 | doc_search, 269 | &hits_count, 270 | &scroll_id); 271 | } while (hits_count > 0); 272 | 273 | curl_easy_cleanup(crl); 274 | } 275 | 276 | int64_t count_documents( 277 | std::string const& host, 278 | std::string const& index, 279 | auth_options const& auth) 280 | { 281 | CURL * crl = curl_easy_init(); 282 | long response_code; 283 | 
rapidjson::Document doc; 284 | std::string url = host + "/" + index + "/_count"; 285 | std::string error; 286 | std::vector buffer; 287 | 288 | bool res = get_or_post_data( 289 | crl, 290 | url, 291 | auth, 292 | &buffer, 293 | &response_code, 294 | &error); 295 | 296 | if (!res) 297 | { 298 | std::cerr << "A HTTP error occured: " << error << std::endl; 299 | return -1; 300 | } 301 | 302 | doc.Parse(buffer.data(), buffer.size()); 303 | 304 | if (doc.HasParseError()) 305 | { 306 | output_parser_error(doc, std::cerr); 307 | return -1; 308 | } 309 | 310 | return doc["count"].GetInt64(); 311 | } 312 | 313 | int dump_mappings( 314 | std::string const& host, 315 | std::string const& index, 316 | auth_options const& auth) 317 | { 318 | static char write_buffer[WRITE_BUF_SIZE]; 319 | static rapidjson::FileWriteStream stream(stdout, write_buffer, sizeof(write_buffer)); 320 | 321 | CURL * crl = curl_easy_init(); 322 | long response_code; 323 | rapidjson::Document doc; 324 | std::string url = host + "/" + index + "/_mapping"; 325 | std::string error; 326 | std::vector buffer; 327 | 328 | bool res = get_or_post_data( 329 | crl, 330 | url, 331 | auth, 332 | &buffer, 333 | &response_code, 334 | &error); 335 | 336 | if (!res) 337 | { 338 | std::cerr << "A HTTP error occured: " << error << std::endl; 339 | return 1; 340 | } 341 | 342 | doc.Parse(buffer.data(), buffer.size()); 343 | 344 | if (doc.HasParseError()) 345 | { 346 | output_parser_error(doc, std::cerr); 347 | return 1; 348 | } 349 | 350 | rapidjson::Writer writer(stream); 351 | doc[index.c_str()].Accept(writer); 352 | stream.Put('\n'); 353 | stream.Flush(); 354 | 355 | curl_easy_cleanup(crl); 356 | 357 | return 0; 358 | } 359 | 360 | int dump_index_info( 361 | std::string const& host, 362 | std::string const& index, 363 | auth_options const& auth) 364 | { 365 | static char write_buffer[WRITE_BUF_SIZE]; 366 | static rapidjson::FileWriteStream stream(stdout, write_buffer, sizeof(write_buffer)); 367 | 368 | CURL * crl = 
curl_easy_init(); 369 | long response_code; 370 | rapidjson::Document doc; 371 | std::string url = host + "/" + index; 372 | std::string error; 373 | std::vector buffer; 374 | 375 | bool res = get_or_post_data( 376 | crl, 377 | url, 378 | auth, 379 | &buffer, 380 | &response_code, 381 | &error); 382 | 383 | if (!res) 384 | { 385 | std::cerr << "A HTTP error occured: " << error << std::endl; 386 | return 1; 387 | } 388 | 389 | doc.Parse(buffer.data(), buffer.size()); 390 | 391 | if (doc.HasParseError()) 392 | { 393 | output_parser_error(doc, std::cerr); 394 | return 1; 395 | } 396 | 397 | rapidjson::Writer writer(stream); 398 | doc[index.c_str()].Accept(writer); 399 | stream.Put('\n'); 400 | stream.Flush(); 401 | 402 | curl_easy_cleanup(crl); 403 | 404 | return 0; 405 | } 406 | 407 | int main( 408 | int argc, 409 | char * argv[]) 410 | { 411 | curl_global_init(CURL_GLOBAL_ALL); 412 | 413 | std::vector> threads; 414 | 415 | // Parse command line options 416 | argh::parser cmdl(argv); 417 | 418 | std::string host; 419 | if (!(cmdl({"--host"}) >> host)) 420 | { 421 | std::cerr << "Must provide an Elasticsearch host (--host)" << std::endl; 422 | return 1; 423 | } 424 | 425 | std::string index; 426 | if (!(cmdl({"--index"}) >> index)) 427 | { 428 | std::cerr << "Must provide an index (--index)" << std::endl; 429 | return 1; 430 | } 431 | 432 | auth_options auth; 433 | 434 | if (cmdl({"--auth"}) >> auth.type) 435 | { 436 | if (auth.type == "basic") 437 | { 438 | if (!(cmdl({"--basic-username"}) >> auth.user)) 439 | { 440 | std::cerr << "Must provide --basic-username when passing --auth=basic" << std::endl; 441 | return 1; 442 | } 443 | 444 | if (!(cmdl({"--basic-password"}) >> auth.pass)) 445 | { 446 | std::cerr << "Must provide --basic-password when passing --auth=basic" << std::endl; 447 | return 1; 448 | } 449 | } 450 | } 451 | 452 | auth.insecure = cmdl["--insecure"]; 453 | 454 | if (cmdl["--dump-mappings"]) 455 | { 456 | return dump_mappings( 457 | host, 458 | index, 
459 | auth); 460 | } 461 | else if (cmdl["--dump-index-info"]) 462 | { 463 | return dump_index_info( 464 | host, 465 | index, 466 | auth); 467 | } 468 | 469 | // Sanity check - see if we have any documents in the index at all. 470 | if (count_documents(host, index, auth) <= 0) 471 | { 472 | std::cerr << "Index is empty - no documents found" << std::endl; 473 | return 0; 474 | } 475 | 476 | int slices; 477 | cmdl({"--slices"}, DEFAULT_SLICES) >> slices; 478 | 479 | int size; 480 | cmdl({"--size"}, DEFAULT_SIZE) >> size; 481 | 482 | for (int i = 0; i < slices; i++) 483 | { 484 | dump_options opts; 485 | opts.host = host; 486 | opts.index = index; 487 | opts.auth = auth; 488 | opts.size = size; 489 | opts.slice_id = i; 490 | opts.slice_max = slices; 491 | 492 | auto cnt = std::unique_ptr(new thread_container()); 493 | cnt->slice_id = i; 494 | cnt->thread = std::thread(dump, opts, &cnt->state); 495 | 496 | threads.push_back(std::move(cnt)); 497 | } 498 | 499 | int exit_code = 0; 500 | 501 | for (auto& cnt : threads) 502 | { 503 | cnt->thread.join(); 504 | 505 | if (cnt->state.error.tellp() > 0) 506 | { 507 | std::cerr << "Slice " 508 | << std::setw(2) << std::setfill('0') << cnt->slice_id 509 | << " exited with error: " 510 | << cnt->state.error.rdbuf() 511 | << std::endl; 512 | 513 | exit_code = 1; 514 | } 515 | } 516 | 517 | curl_global_cleanup(); 518 | 519 | return exit_code; 520 | } 521 | --------------------------------------------------------------------------------