├── .gitignore ├── README.md ├── async_funcs.hpp ├── block_file_read_policy.hpp ├── block_file_write_policy.hpp ├── block_input_stream.hpp ├── block_memory_policy.hpp ├── block_output_stream.hpp ├── block_types.hpp ├── example ├── Makefile ├── external_sort_custom.hpp ├── external_sort_main.cc ├── logging │ ├── logging.cc │ └── logging.hpp └── simple_example1.cc ├── external_sort.hpp ├── external_sort_merge.hpp ├── external_sort_nolog.hpp └── external_sort_types.hpp /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files 2 | *.slo 3 | *.lo 4 | *.o 5 | *.pyc 6 | 7 | # Compiled Dynamic libraries 8 | *.so 9 | *.dylib 10 | 11 | # Compiled Static libraries 12 | *.lai 13 | *.la 14 | *.a 15 | 16 | # Images 17 | *.png 18 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | external_sort 2 | ------------- 3 | 4 | This is a header-only, multithreaded, policy-based implementation of the [external sort](http://en.wikipedia.org/wiki/External_sorting) in C++11. 5 | 6 | The library works with the basic data types as well as with user-defined custom data types. 7 | 8 | ### External sort algorithm 9 | 10 | #### Phase 1: split and sort 11 | 12 | A big input file is read sequentially in pieces (aka blocks or chunks) small enough to fit into the memory. Each piece is sorted and stored to a separate output file (split). 13 | 14 | There is one thread reading data from the input file. For each block, read but not yet sorted, a new worker thread is spawned to sort it and write the block to the output file. 
15 | 16 | Example: 17 | 18 | external_sort::SplitParams params; 19 | params.mem.size = 100; // memory size 20 | params.mem.unit = external_sort::MB; // memory unit 21 | params.mem.blocks = 2; // max number of memory blocks 22 | params.spl.ifile = "input_file"; // input file to split/sort 23 | params.spl.ofile = "output_file"; // output file prefix 24 | 25 | external_sort::split(params); 26 | if (params.err) { 27 | LOG_ERR(("Error: %s") % params.err.msg()); 28 | } 29 | 30 | #### Phase 2: merge 31 | 32 | The input files (sorted splits) are merged repeatedly until only one file is left. 33 | 34 | There can be more than one ongoing merge at a time. Each merge takes k input files (streams) and merges them into one output file (stream). Each input or output stream has its own thread reading or writing data asynchronously. Thus, each k-merge has k+2 threads: k threads reading data (k input streams), 1 thread performing the actual merge and 1 thread writing data (the output stream). 35 | 36 | Each stream (input or output) has a queue and at least two blocks of data. Two blocks per stream make it possible to perform read/write and merge in two threads in parallel (each thread has its own block to work with). Reasonably, there should be no need for more than two blocks, since either reading/writing or merging is supposed to be consistently slower than the other. 
37 | 38 | Example: 39 | 40 | external_sort::MergeParams params; 41 | params.mem.size = 100; // memory size 42 | params.mem.unit = external_sort::MB; // memory unit 43 | params.mrg.merges = 4; // number of simultaneous merges 44 | params.mrg.kmerge = 4; // number of streams to merge 45 | params.mrg.stmblocks = 2; // number of memory blocks per i/o stream 46 | params.mrg.ifiles = files; // std::list of input files 47 | params.mrg.ofile = "file_merged"; // output file 48 | 49 | external_sort::merge(params); 50 | if (params.err) { 51 | LOG_ERR(("Error: %s") % params.err.msg()); 52 | } 53 | 54 | ### External sort = split + merge 55 | 56 | It is possible to combine both split and merge into a single function call: 57 | 58 | // set split and merge parameters 59 | external_sort::SplitParams sp; 60 | external_sort::MergeParams mp; 61 | sp.mem.size = 10; 62 | sp.mem.unit = external_sort::MB; 63 | mp.mem = sp.mem; 64 | sp.spl.ifile = "big_input_file"; 65 | mp.mrg.ofile = "big_sorted_file"; 66 | 67 | using ValueType = unsigned int; 68 | 69 | // run external sort 70 | external_sort::sort(sp, mp); 71 | 72 | if (sp.err.none && mp.err.none) { 73 | std::cout << "File sorted successfully!" << std::endl; 74 | } else { 75 | std::cout << "External sort failed!" << std::endl; 76 | if (sp.err) { 77 | std::cout << "Split failed: " << sp.err.msg() << std::endl; 78 | } else { 79 | std::cout << "Merge failed: " << mp.err.msg() << std::endl; 80 | } 81 | } 82 | 83 | ### The tool 84 | 85 | In the ./example sub-directory, there is a simple wrapper tool around the external sort functionality of the library. 86 | By default, it sorts uint32_t values (it can be changed to a custom type, see [external_sort_custom.hpp](https://github.com/alveko/external_sort/blob/master/example/external_sort_custom.hpp)). 87 | 88 | Usage: external_sort [options] 89 | 90 | General options: 91 | -h [ --help ] Display this information 92 | 93 | --act arg (=all) Action to perform. 
Possible values: 94 | 95 | gen - Generates random data 96 | spl - Splits and sorts the input 97 | mrg - Merges the input 98 | chk - Checks if the input is sorted 99 | all - All of the above 100 | srt = spl + mrg 101 | 102 | --msize arg (=1) Memory size 103 | --munit arg (=M) Memory unit: 104 | --log arg (=4) Log level: [0-6] 105 | --no_rm Do not remove temporary files 106 | --tmpdir arg (=) Directory for temporary files 107 | (relevant if act includes mrg) 108 | 109 | Options for act=gen (generate): 110 | --gen.ofile arg (=generated) Output file 111 | --gen.fsize arg File size to generate, in memory units. 112 | By default: gen.fsize = 16 * msize 113 | --gen.blocks arg (=2) Number of blocks in memory 114 | 115 | Options for act=spl (phase 1: split and sort): 116 | --srt.ifile arg Same as --spl.ifile 117 | --spl.ifile arg (=) Input file 118 | --spl.ofile arg (=) Output file prefix 119 | --spl.blocks arg (=2) Number of blocks in memory 120 | 121 | Options for act=mrg (phase 2: merge): 122 | --mrg.ifiles arg (=) Input files to be merged into one 123 | (required and only relevant if act=mrg, 124 | otherwise the list of files, i.e. 
125 | sorted splits, is passed over from 126 | phase 1) 127 | --mrg.ofile arg (=.sorted) Output file (required if act=mrg) 128 | --mrg.merges arg (=4) Number of simultaneous merges 129 | --mrg.kmerge arg (=4) Number of streams merged at a time 130 | --mrg.stmblocks arg (=2) Number of memory blocks per stream 131 | 132 | Options for act=chk (check): 133 | --chk.ifile arg (=) Input file 134 | --chk.blocks arg (=2) Number of blocks in memory 135 | -------------------------------------------------------------------------------- /async_funcs.hpp: -------------------------------------------------------------------------------- 1 | #ifndef ASYNC_FUNCS_HPP 2 | #define ASYNC_FUNCS_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | namespace external_sort { 11 | namespace aux { 12 | 13 | template 14 | class AsyncFuncs 15 | { 16 | public: 17 | template 18 | void Async(Fn&& fn, Args&&... args); 19 | ResultType GetAny(); 20 | 21 | bool Empty() const { return All() == 0; } 22 | size_t All() const { return Ready() + Running(); } 23 | size_t Ready() const; 24 | size_t Running() const; 25 | 26 | private: 27 | template 28 | void RunFunc(Fn&& fn, Args&&... 
args); 29 | 30 | private: 31 | TRACEX_NAME("AsyncFuncs"); 32 | // mtx_ guards funcs_ready_; cv_ is signalled whenever a result is added to it 33 | mutable std::mutex mtx_; 34 | std::condition_variable cv_; 35 | // funcs_running_: functions started but not finished; funcs_ready_: results not yet collected 36 | std::atomic funcs_running_ = {0}; 37 | std::list funcs_ready_; 38 | }; 39 | /// Returns the number of functions started via Async() that have not finished yet. 40 | template 41 | size_t AsyncFuncs::Running() const 42 | { 43 | return funcs_running_; 44 | } 45 | /// Returns the number of results waiting to be collected (takes the lock). 46 | template 47 | size_t AsyncFuncs::Ready() const 48 | { 49 | std::unique_lock lck(mtx_); 50 | return funcs_ready_.size(); 51 | } 52 | /// Blocks until at least one result is available, then removes and returns the oldest one. 53 | template 54 | ResultType AsyncFuncs::GetAny() 55 | { 56 | TRACEX_METHOD(); 57 | std::unique_lock lck(mtx_); 58 | while (funcs_ready_.empty()) { 59 | cv_.wait(lck); 60 | } 61 | 62 | ResultType result = funcs_ready_.front(); 63 | funcs_ready_.pop_front(); 64 | TRACEX(("async func collected (%d/%d)") 65 | % funcs_running_ % funcs_ready_.size()); 66 | return result; 67 | } 68 | /// Starts fn(args...) on a new detached thread; the thread keeps using `this`, so the object must outlive it. 69 | template 70 | template 71 | void AsyncFuncs::Async(Fn&& fn, Args&&... args) 72 | { 73 | std::unique_lock lck(mtx_); 74 | funcs_running_++; 75 | TRACEX(("async func starting (%d/%d)") 76 | % funcs_running_ % funcs_ready_.size()); 77 | std::thread task(&AsyncFuncs::RunFunc, this, 78 | std::forward(fn), std::forward(args)...); 79 | task.detach(); 80 | } 81 | /// Thread entry point: calls fn, stores its result in funcs_ready_ and wakes one GetAny() waiter. 82 | template 83 | template 84 | void AsyncFuncs::RunFunc(Fn&& fn, Args&&... 
args) 85 | { 86 | TRACEX(("async func started (%d/%d)") 87 | % funcs_running_ % funcs_ready_.size()); 88 | ResultType result = fn(std::forward(args)...); 89 | // NOTE(review): the trace above reads funcs_ready_.size() before mtx_ is taken - confirm this is acceptable 90 | std::unique_lock lck(mtx_); 91 | funcs_ready_.push_back(result); 92 | funcs_running_--; 93 | TRACEX(("async func ready (%d/%d)") 94 | % funcs_running_ % funcs_ready_.size()); 95 | cv_.notify_one(); 96 | } 97 | 98 | } // namespace aux 99 | } // namespace external_sort 100 | 101 | #endif 102 | -------------------------------------------------------------------------------- /block_file_read_policy.hpp: -------------------------------------------------------------------------------- 1 | #ifndef BLOCK_FILE_READ_HPP 2 | #define BLOCK_FILE_READ_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "block_types.hpp" 9 | 10 | namespace external_sort { 11 | namespace block { 12 | 13 | /// ---------------------------------------------------------------------------- 14 | /// BlockFileReadPolicy 15 | /// Read policy: fills blocks from a binary input file, which can optionally be removed when it is closed. 16 | template 17 | class BlockFileReadPolicy 18 | { 19 | public: 20 | using BlockPtr = typename BlockTraits::BlockPtr; 21 | using ValueType = typename BlockTraits::ValueType; 22 | 23 | /// Policy interface 24 | void Open(); 25 | void Close(); 26 | void Read(BlockPtr& block); 27 | bool Empty() const; 28 | 29 | /// Set/get properties 30 | void set_input_filename(const std::string& ifn) { input_filename_ = ifn; } 31 | const std::string& input_filename() const { return input_filename_; } 32 | // when set, the input file is removed after it has been closed 33 | void set_input_rm_file(bool rm) { input_rm_file_ = rm; } 34 | bool input_rm_file() const { return input_rm_file_; } 35 | 36 | private: 37 | void FileOpen(); 38 | void FileRead(BlockPtr& block); 39 | void FileClose(); 40 | 41 | private: 42 | TRACEX_NAME("BlockFileReadPolicy"); 43 | 44 | std::ifstream ifs_; 45 | std::string input_filename_; 46 | bool input_rm_file_ = {false}; 47 | size_t block_cnt_ = 0; 48 | }; 49 | 50 | /// ---------------------------------------------------------------------------- 51 | /// Policy 
interface methods 52 | /// Opens the input file (see FileOpen). 53 | template 54 | void BlockFileReadPolicy::Open() 55 | { 56 | TRACEX_METHOD(); 57 | FileOpen(); 58 | } 59 | /// Closes the input file and removes it if input_rm_file_ is set (see FileClose). 60 | template 61 | void BlockFileReadPolicy::Close() 62 | { 63 | TRACEX_METHOD(); 64 | FileClose(); 65 | } 66 | /// Fills the block from the file and counts the read. 67 | template 68 | void BlockFileReadPolicy::Read(BlockPtr& block) 69 | { 70 | FileRead(block); 71 | block_cnt_++; 72 | } 73 | /// Returns true when the file is not open or the stream is no longer in a good state. 74 | template 75 | bool BlockFileReadPolicy::Empty() const 76 | { 77 | return !(ifs_.is_open() && ifs_.good()); 78 | } 79 | 80 | /// ---------------------------------------------------------------------------- 81 | /// File operations 82 | /// Opens input_filename_ for binary reading; a failure is only logged. 83 | template 84 | void BlockFileReadPolicy::FileOpen() 85 | { 86 | LOG_INF(("opening file r %s") % input_filename_); 87 | TRACEX(("input file %s") % input_filename_); 88 | ifs_.open(input_filename_, std::ifstream::in | std::ifstream::binary); 89 | if (!ifs_) { 90 | LOG_ERR(("Failed to open input file: %s") % input_filename_); 91 | } 92 | } 93 | /// Reads up to capacity() values into the block; after a short read the block is shrunk to the whole values read. 94 | template 95 | void BlockFileReadPolicy::FileRead(BlockPtr& block) 96 | { 97 | block->resize(block->capacity()); 98 | std::streamsize bsize = block->size() * sizeof(ValueType); 99 | // a short read leaves fewer bytes than requested: keep only the whole values 100 | ifs_.read(reinterpret_cast(block->data()), bsize); 101 | if (ifs_.gcount() < bsize) { 102 | block->resize(ifs_.gcount() / sizeof(ValueType)); 103 | } 104 | TRACEX(("block %014p <= file (%s), is_over = %s, size = %s") 105 | % BlockTraits::RawPtr(block) 106 | % block_cnt_ % Empty() % block->size()); 107 | } 108 | /// Closes the stream if it is open and removes the file when input_rm_file_ is set; a failed removal is logged. 109 | template 110 | void BlockFileReadPolicy::FileClose() 111 | { 112 | if (ifs_.is_open()) { 113 | ifs_.close(); 114 | if (input_rm_file_) { 115 | if (remove(input_filename_.c_str()) != 0) { 116 | LOG_ERR(("Failed to remove file: %s") % input_filename_); 117 | } 118 | } 119 | } 120 | } 121 | 122 | } // namespace block 123 | } // namespace external_sort 124 | 125 | #endif 126 | -------------------------------------------------------------------------------- /block_file_write_policy.hpp: 
-------------------------------------------------------------------------------- 1 | #ifndef BLOCK_FILE_WRITE_HPP 2 | #define BLOCK_FILE_WRITE_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "block_types.hpp" 9 | 10 | namespace external_sort { 11 | namespace block { 12 | 13 | /// ---------------------------------------------------------------------------- 14 | /// BlockFileWritePolicy 15 | /// Write policy: writes the contents of blocks sequentially to a binary output file. 16 | template 17 | class BlockFileWritePolicy 18 | { 19 | public: 20 | using BlockPtr = typename BlockTraits::BlockPtr; 21 | using ValueType = typename BlockTraits::ValueType; 22 | 23 | /// Policy interface 24 | void Open(); 25 | void Close(); 26 | void Write(const BlockPtr& block); 27 | 28 | /// Set/get properties 29 | void set_output_filename(const std::string& ofn) { output_filename_ = ofn; } 30 | const std::string& output_filename() const { return output_filename_; } 31 | 32 | private: 33 | void FileOpen(); 34 | void FileWrite(const BlockPtr& block); 35 | void FileClose(); 36 | 37 | private: 38 | TRACEX_NAME("BlockFileWritePolicy"); 39 | // number of non-empty blocks written so far 40 | size_t block_cnt_ = 0; 41 | std::string output_filename_; 42 | std::ofstream ofs_; 43 | }; 44 | 45 | /// ---------------------------------------------------------------------------- 46 | /// Policy interface methods 47 | /// Opens the output file (see FileOpen). 48 | template 49 | void BlockFileWritePolicy::Open() 50 | { 51 | TRACEX_METHOD(); 52 | FileOpen(); 53 | } 54 | /// Closes the output file (see FileClose). 55 | template 56 | void BlockFileWritePolicy::Close() 57 | { 58 | TRACEX_METHOD(); 59 | FileClose(); 60 | } 61 | /// Writes a block to the file and counts it; null and empty blocks are skipped. 62 | template 63 | void BlockFileWritePolicy::Write(const BlockPtr& block) 64 | { 65 | // ignore null or empty blocks 66 | if (!block || block->empty()) { 67 | return; 68 | } 69 | 70 | // write the block 71 | FileWrite(block); 72 | block_cnt_++; 73 | } 74 | 75 | /// ---------------------------------------------------------------------------- 76 | /// File operations 77 | /// Opens output_filename_ for binary writing; a failure is only logged. 78 | template 79 | void BlockFileWritePolicy::FileOpen() 80 | { 81 | LOG_INF(("opening file w %s") % output_filename_); 82 | 
TRACEX(("output file %s") % output_filename_); 83 | ofs_.open(output_filename_, std::ofstream::out | std::ofstream::binary); 84 | if (!ofs_) { 85 | LOG_ERR(("Failed to open output file: %s") % output_filename_); 86 | } 87 | } 88 | 89 | /// Writes the raw bytes of the block to the output file. 90 | /// A failed write is logged, in the same way as a failed open. 91 | template 92 | void BlockFileWritePolicy::FileWrite(const BlockPtr& block) 93 | { 94 | ofs_.write(reinterpret_cast<const char*>(block->data()), 95 | block->size() * sizeof(ValueType)); 96 | if (!ofs_) { 97 | // e.g. the disk is full: do not lose data silently 98 | LOG_ERR(("Failed to write to output file: %s") % output_filename_); 99 | } 100 | TRACEX(("block %014p => file (%s), bsize = %d") 101 | % BlockTraits::RawPtr(block) % block_cnt_ % block->size()); 102 | } 103 | 104 | /// Closes the output file if it is open. 105 | template 106 | void BlockFileWritePolicy::FileClose() 107 | { 108 | if (ofs_.is_open()) { 109 | ofs_.close(); 110 | } 111 | } 112 | 113 | } // namespace block 114 | } // namespace external_sort 115 | 116 | #endif 117 | -------------------------------------------------------------------------------- /block_input_stream.hpp: -------------------------------------------------------------------------------- 1 | #ifndef BLOCK_INPUT_STREAM_HPP 2 | #define BLOCK_INPUT_STREAM_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "block_types.hpp" 11 | 12 | namespace external_sort { 13 | namespace block { 14 | 15 | template 16 | class BlockInputStream : public ReadPolicy, public MemoryPolicy 17 | { 18 | public: 19 | using BlockType = Block; 20 | using BlockPtr = typename BlockTraits::BlockPtr; 21 | using Iterator = typename BlockTraits::Iterator; 22 | using ValueType = typename BlockTraits::ValueType; 23 | 24 | void Open(); 25 | void Close(); 26 | bool Empty(); 27 | 28 | ValueType& Front(); // get a single value 29 | BlockPtr FrontBlock(); // get entire block 30 | BlockPtr ReadBlock(); // read a block right from the file 31 | 32 | void Pop(); 33 | void PopBlock(); 34 | 35 | private: 36 | void InputLoop(); 37 | void WaitForBlock(); 38 | 39 | private: 40 | TRACEX_NAME("BlockInputStream"); 41 | 42 | mutable std::condition_variable cv_; 43 | mutable std::mutex mtx_; 44 | std::queue blocks_queue_; 45 | 46 | BlockPtr block_ 
= {nullptr}; 47 | Iterator block_iter_; 48 | 49 | std::thread tinput_; 50 | std::atomic empty_ = {false}; 51 | }; 52 | /// Opens the input source and starts the background thread that runs InputLoop(). 53 | template 54 | void BlockInputStream::Open() 55 | { 56 | TRACEX_METHOD(); 57 | ReadPolicy::Open(); 58 | empty_ = false; 59 | tinput_ = std::thread(&BlockInputStream::InputLoop, this); 60 | } 61 | /// Closes the input source and waits for the background thread, if one was started. 62 | template 63 | void BlockInputStream::Close() 64 | { 65 | TRACEX_METHOD(); 66 | ReadPolicy::Close(); 67 | if (tinput_.joinable()) { tinput_.join(); } // Close() without Open(), or a second Close(), must not throw 68 | } 69 | /// Returns true once the input is exhausted; otherwise makes sure there is a current block (may block). 70 | template 71 | bool BlockInputStream::Empty() 72 | { 73 | if (!block_) { 74 | WaitForBlock(); 75 | } 76 | return empty_ && !block_; 77 | } 78 | /// Returns a reference to the current value of the current block. 79 | template 80 | auto BlockInputStream::Front() 81 | -> ValueType& 82 | { 83 | // Empty() must be called first! 84 | 85 | return *block_iter_; 86 | } 87 | /// Advances to the next value; frees the current block once its last value has been consumed. 88 | template 89 | void BlockInputStream::Pop() 90 | { 91 | // Empty() must be called first! 92 | 93 | ++block_iter_; 94 | if (block_iter_ == block_->end()) { 95 | // block is over, free it 96 | auto tmp = block_; 97 | PopBlock(); 98 | MemoryPolicy::Free(tmp); 99 | } 100 | } 101 | /// Returns the current block (nullptr if there is none). 102 | template 103 | auto BlockInputStream::FrontBlock() 104 | -> BlockPtr 105 | { 106 | TRACEX(("block %014p front block") % BlockTraits::RawPtr(block_)); 107 | return block_; 108 | } 109 | /// Detaches the current block from the stream without freeing it. 110 | template 111 | void BlockInputStream::PopBlock() 112 | { 113 | // No MemoryPolicy::Free! The caller has to free the block 114 | block_ = nullptr; 115 | } 116 | /// Body of the background thread: reads blocks and queues them until the input source is empty. 117 | template 118 | void BlockInputStream::InputLoop() 119 | { 120 | TRACEX_METHOD(); 121 | 122 | while (!ReadPolicy::Empty()) { 123 | // Allocate and read the block from the file (blocking!) 
124 | BlockPtr block = ReadBlock(); 125 | 126 | // push the block to the queue 127 | if (block) { 128 | std::unique_lock lck(mtx_); 129 | blocks_queue_.push(block); 130 | TRACEX(("block %014p => input queue (%d)") 131 | % BlockTraits::RawPtr(block) % blocks_queue_.size()); 132 | cv_.notify_one(); 133 | } 134 | } 135 | 136 | // empty_ is needed, since ReadPolicy::Empty() becomes true before 137 | // the last block is pushed into the queue 138 | // (hence it can be intercepted by the other thread) 139 | std::unique_lock lck(mtx_); 140 | empty_ = true; 141 | cv_.notify_one(); 142 | } 143 | /// Allocates a block and fills it from the input source; returns nullptr (and frees the block) if nothing was read. 144 | template 145 | auto BlockInputStream::ReadBlock() 146 | -> BlockPtr 147 | { 148 | // allocate a new block; supposed to be a blocking call! 149 | // waits for chunks to be released if needed 150 | BlockPtr block = MemoryPolicy::Allocate(); 151 | 152 | // read (fill in) the block from the input source 153 | ReadPolicy::Read(block); 154 | if (block->empty()) { 155 | // this happens when the previous block ended right before EOF 156 | TRACEX(("block %014p is empty, ignoring") 157 | % BlockTraits::RawPtr(block)); 158 | MemoryPolicy::Free(block); 159 | block = nullptr; 160 | } 161 | 162 | return block; 163 | } 164 | /// Waits until a block is queued or the input thread has signalled the end of input; a queued block becomes the current one. 165 | template 166 | void BlockInputStream::WaitForBlock() 167 | { 168 | TRACEX_METHOD(); 169 | 170 | std::unique_lock lck(mtx_); 171 | while (blocks_queue_.empty() && !empty_) { 172 | cv_.wait(lck); 173 | } 174 | // take the next block from the queue, if any, and start iterating from its first value 175 | if (!blocks_queue_.empty()) { 176 | block_ = blocks_queue_.front(); 177 | blocks_queue_.pop(); 178 | block_iter_ = block_->begin(); 179 | TRACEX(("block %014p <= input queue (%d)") 180 | % BlockTraits::RawPtr(block_) % blocks_queue_.size()); 181 | } 182 | } 183 | 184 | } // namespace block 185 | } // namespace external_sort 186 | 187 | #endif 188 | -------------------------------------------------------------------------------- /block_memory_policy.hpp: -------------------------------------------------------------------------------- 1 | #ifndef BLOCK_MEMORY_HPP 
2 | #define BLOCK_MEMORY_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "block_types.hpp" 11 | 12 | namespace external_sort { 13 | namespace block { 14 | /// Memory policy: hands out blocks from a pre-allocated pool; the pool can be shared via set_mem_pool(BlockPoolPtr). 15 | template 16 | class BlockMemoryPolicy 17 | { 18 | public: 19 | using BlockPtr = typename BlockTraits::BlockPtr; 20 | class BlockPool; 21 | using BlockPoolPtr = std::shared_ptr; 22 | /// Fixed set of pre-allocated blocks protected by a mutex. 23 | class BlockPool /*: boost::noncopyable*/ { 24 | public: 25 | BlockPool(size_t memsize, size_t memblocks); 26 | ~BlockPool(); 27 | 28 | public: 29 | size_t Allocated() const; 30 | BlockPtr Allocate(); 31 | void Free(BlockPtr block); 32 | 33 | private: 34 | TRACEX_NAME("BlockPool"); 35 | mutable std::mutex mtx_; 36 | std::condition_variable cv_; 37 | std::stack pool_; 38 | size_t blocks_; 39 | size_t blocks_cnt_; 40 | size_t blocks_allocated_; 41 | }; 42 | 43 | inline size_t Allocated() const { return mem_pool_->Allocated(); } 44 | inline BlockPtr Allocate() { return mem_pool_->Allocate(); } 45 | inline void Free(BlockPtr block) { mem_pool_->Free(block); } 46 | 47 | BlockPoolPtr mem_pool() { return mem_pool_; } 48 | void set_mem_pool(size_t memsize, size_t memblocks) { 49 | mem_pool_ = std::make_shared(memsize, memblocks); 50 | }; 51 | void set_mem_pool(BlockPoolPtr pool) { mem_pool_ = pool; }; 52 | 53 | private: 54 | BlockPoolPtr mem_pool_ = {nullptr}; 55 | }; 56 | /// Pre-allocates memblocks blocks; each block gets capacity for (memsize / memblocks) bytes worth of values, but at least one value. 57 | template 58 | BlockMemoryPolicy::BlockPool::BlockPool(size_t memsize, 59 | size_t memblocks) 60 | : blocks_(memblocks), 61 | blocks_cnt_(0), 62 | blocks_allocated_(0) 63 | { 64 | TRACEX(("new block pool: memsize %d, memblocks %d") 65 | % memsize % memblocks); 66 | // memblocks == 0 must not divide by zero 67 | size_t block_size = (memblocks == 0) ? 0 : memsize / memblocks / 68 | (sizeof(typename BlockTraits::ValueType)); 69 | if (block_size == 0) { block_size = 1; } // a zero-capacity block can never be filled, so a reader would never reach EOF 70 | // pre-allocate a pool of blocks 71 | while (pool_.size() < blocks_) { 72 | BlockPtr block(new Block); 73 | block->reserve(block_size); 74 | pool_.push(block); 75 | TRACEX(("new block %014p added to the pool") 76 | % BlockTraits::RawPtr(block)); 77 | } 78 
| } 79 | /// Deletes every block currently in the pool and asserts that no block is still handed out. 80 | template 81 | BlockMemoryPolicy::BlockPool::~BlockPool() 82 | { 83 | TRACEX(("deleting block pool")); 84 | 85 | // free all blocks from the pool 86 | while (!pool_.empty()) { 87 | BlockPtr block = pool_.top(); 88 | TRACEX(("deleting block %014p from the pool") 89 | % BlockTraits::RawPtr(block)); 90 | BlockTraits::DeletePtr(block); 91 | pool_.pop(); 92 | } 93 | assert(blocks_allocated_ == 0); 94 | } 95 | /// Returns the number of blocks currently handed out (takes the lock). 96 | template 97 | size_t BlockMemoryPolicy::BlockPool::Allocated() const 98 | { 99 | std::unique_lock lck(mtx_); 100 | return blocks_allocated_; 101 | } 102 | /// Takes a block from the pool, waiting until one has been returned by Free() if the pool is empty. 103 | template 104 | auto BlockMemoryPolicy::BlockPool::Allocate() 105 | -> BlockPtr 106 | { 107 | std::unique_lock lck(mtx_); 108 | blocks_cnt_++; 109 | TRACEX(("allocating block (%d)...") % blocks_cnt_); 110 | 111 | // get a block from the pre-allocated pool (wait if necessary) 112 | while (pool_.empty()) { 113 | cv_.wait(lck); 114 | } 115 | BlockPtr block = pool_.top(); 116 | pool_.pop(); 117 | 118 | blocks_allocated_++; 119 | TRACEX(("block %014p allocated (%d)! 
(%s/%s), cap = %s") 120 | % BlockTraits::RawPtr(block) % blocks_cnt_ 121 | % blocks_allocated_ % blocks_ % block->capacity()); 122 | return block; 123 | } 124 | /// Empties the block, returns it to the pool and wakes one thread waiting in Allocate(). 125 | template 126 | void BlockMemoryPolicy::BlockPool::Free(BlockPtr block) 127 | { 128 | std::unique_lock lck(mtx_); 129 | blocks_allocated_--; 130 | 131 | // return the block back to the pool 132 | block->resize(0); 133 | pool_.push(block); 134 | 135 | TRACEX(("block %014p deallocated (%s/%s)") 136 | % BlockTraits::RawPtr(block) 137 | % blocks_allocated_ % blocks_); 138 | cv_.notify_one(); 139 | } 140 | 141 | } // namespace block 142 | } // namespace external_sort 143 | 144 | #endif 145 | -------------------------------------------------------------------------------- /block_output_stream.hpp: -------------------------------------------------------------------------------- 1 | #ifndef BLOCK_OUTPUT_STREAM_HPP 2 | #define BLOCK_OUTPUT_STREAM_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "block_types.hpp" 11 | 12 | namespace external_sort { 13 | namespace block { 14 | /// Output stream: collects values into blocks and hands full blocks to a background thread that writes them. 15 | template 16 | class BlockOutputStream : public WritePolicy, public MemoryPolicy 17 | { 18 | public: 19 | using BlockType = Block; 20 | using BlockPtr = typename BlockTraits::BlockPtr; 21 | using Iterator = typename BlockTraits::Iterator; 22 | using ValueType = typename BlockTraits::ValueType; 23 | 24 | void Open(); 25 | void Close(); 26 | 27 | void Push(const ValueType& value); // push a single value 28 | void PushBlock(BlockPtr block); // push entire block 29 | void WriteBlock(BlockPtr block); // write a block directly into a file 30 | 31 | private: 32 | void OutputLoop(); 33 | 34 | private: 35 | TRACEX_NAME("BlockOutputStream"); 36 | // blocks_queue_ is accessed under mtx_; cv_ is signalled when a block is queued or the stream is stopped 37 | mutable std::condition_variable cv_; 38 | mutable std::mutex mtx_; 39 | std::queue blocks_queue_; 40 | // block currently being filled by Push(), nullptr if none 41 | BlockPtr block_ = {nullptr}; 42 | 43 | std::thread toutput_; 44 | std::atomic stopped_ = {false}; 45 | }; 46 | 47 | template 48 | void 
BlockOutputStream::Open() 49 | { 50 | TRACEX_METHOD(); 51 | 52 | WritePolicy::Open(); 53 | stopped_ = false; 54 | toutput_ = std::thread(&BlockOutputStream::OutputLoop, this); 55 | } 56 | /// Flushes the partially filled block, stops the output thread once the queue is drained, and closes the file. 57 | template 58 | void BlockOutputStream::Close() 59 | { 60 | TRACEX_METHOD(); 61 | // flush the partially filled block; reset block_ so it is not reused after the output thread has freed it 62 | PushBlock(block_); block_ = nullptr; 63 | { std::unique_lock<std::mutex> lck(mtx_); stopped_ = true; } // set under the lock so that OutputLoop cannot miss the wake-up 64 | cv_.notify_one(); 65 | toutput_.join(); 66 | WritePolicy::Close(); 67 | } 68 | /// Appends a value to the current block; a full block is handed to the output queue. 69 | template 70 | void BlockOutputStream::Push( 71 | const ValueType& value) 72 | { 73 | if (!block_) { 74 | block_ = MemoryPolicy::Allocate(); 75 | } 76 | block_->push_back(value); 77 | 78 | if (block_->size() == block_->capacity()) { 79 | // block is full, push it to the output queue 80 | PushBlock(block_); 81 | block_ = nullptr; 82 | } 83 | } 84 | /// Queues a block for the output thread; nullptr is ignored. 85 | template 86 | void BlockOutputStream::PushBlock( 87 | BlockPtr block) 88 | { 89 | if (block) { 90 | std::unique_lock lck(mtx_); 91 | blocks_queue_.push(block); 92 | TRACEX(("block %014p => output queue (%d)") 93 | % BlockTraits::RawPtr(block) % blocks_queue_.size()); 94 | cv_.notify_one(); 95 | } 96 | } 97 | /// Body of the output thread: writes queued blocks until the stream is stopped and the queue is empty. 98 | template 99 | void BlockOutputStream::OutputLoop() 100 | { 101 | TRACEX_METHOD(); 102 | for (;;) { 103 | //while (!stopped_ || MemoryPolicy::Allocated()) { 104 | 105 | // wait for a block in the queue or the stop-flag 106 | std::unique_lock lck(mtx_); 107 | while (blocks_queue_.empty() && !stopped_) { 108 | cv_.wait(lck); 109 | } 110 | 111 | if (!blocks_queue_.empty()) { 112 | BlockPtr block = blocks_queue_.front(); 113 | blocks_queue_.pop(); 114 | TRACEX(("block %014p <= output queue (%d)") 115 | % BlockTraits::RawPtr(block) % blocks_queue_.size()); 116 | lck.unlock(); 117 | 118 | WriteBlock(block); 119 | } else if (stopped_) { 120 | // nothing left in the queue and 121 | // the stop flag is set => quit 122 | break; 123 | } 124 | } 125 | } 126 | /// Writes the block via the write policy and returns it to the memory pool. 127 | template 128 | void BlockOutputStream::WriteBlock( 129 | BlockPtr block) 130 | { 131 | WritePolicy::Write(block); 132 | MemoryPolicy::Free(block); 133 | } 134 
| 135 | } // namespace block 136 | } // namespace external_sort 137 | 138 | #endif 139 | -------------------------------------------------------------------------------- /block_types.hpp: -------------------------------------------------------------------------------- 1 | #ifndef BLOCK_TYPES_HPP 2 | #define BLOCK_TYPES_HPP 3 | 4 | #include 5 | #include 6 | 7 | namespace external_sort { 8 | namespace block { 9 | 10 | template 11 | using VectorBlock = std::vector; 12 | 13 | template 14 | struct BlockTraits 15 | { 16 | using Block = BlockType; 17 | 18 | using BlockPtr = Block*; 19 | inline static void* RawPtr(BlockPtr block) { return block; }; 20 | inline static void DeletePtr(BlockPtr block) { delete block; }; 21 | 22 | // Alternatively BlockPtr can be a shared pointer (but it's slower): 23 | // using BlockPtr = std::shared_ptr; 24 | // inline static void* RawPtr(BlockPtr block) { return block.get(); }; 25 | // inline static void DeletePtr(BlockPtr block) { }; 26 | 27 | using Container = Block; 28 | using Iterator = typename Container::iterator; 29 | using ValueType = typename Container::value_type; 30 | }; 31 | 32 | } // namespace block 33 | } // namespace external_sort 34 | 35 | #endif 36 | -------------------------------------------------------------------------------- /example/Makefile: -------------------------------------------------------------------------------- 1 | CXX ?= g++ 2 | 3 | CFLAGS = -std=c++11 -c -Wall -pthread 4 | INCL = -I.. 
-I./logging -I/usr/local/include 5 | LDFLAGS = \ 6 | -pthread \ 7 | -L/usr/local/lib \ 8 | -lboost_program_options \ 9 | -lboost_timer \ 10 | -lboost_system 11 | 12 | EXE = external_sort 13 | SRC = external_sort_main.cc logging/logging.cc 14 | 15 | OBJ = $(SRC:.cc=.o) 16 | 17 | .PHONY: all clean 18 | 19 | all: CFLAGS += -O3 20 | all: $(EXE) 21 | 22 | debug: CFLAGS += -g -DDEBUG -DBOOSTLOG -DBOOST_LOG_DYN_LINK 23 | debug: LDFLAGS += -lboost_log -lboost_log_setup -lboost_thread 24 | debug: $(EXE) 25 | 26 | $(EXE): $(OBJ) 27 | $(CXX) $(OBJ) -o $@ $(LDFLAGS) 28 | 29 | .cc.o: 30 | $(CXX) $(CFLAGS) $(INCL) $< -o $@ 31 | 32 | clean: 33 | rm -f $(EXE) $(OBJ) 34 | -------------------------------------------------------------------------------- /example/external_sort_custom.hpp: -------------------------------------------------------------------------------- 1 | #ifndef EXTERNAL_SORT_CUSTOM_HPP 2 | #define EXTERNAL_SORT_CUSTOM_HPP 3 | 4 | /// ---------------------------------------------------------------------------- 5 | /// custom type to run external sort for 6 | 7 | // Below is an example how to define and use external_sort with custom type! 
8 | /// Fixed-size record used as an example of a custom value type. 9 | struct CustomRecord 10 | { 11 | uint32_t id; 12 | char name[32]; 13 | char text[64]; 14 | }; 15 | /// Orders records by id. 16 | struct CustomRecordComparator 17 | { 18 | bool operator()(const CustomRecord& x, const CustomRecord& y) const { 19 | return x.id < y.id; 20 | } 21 | }; 22 | /// Formats a record as text. 23 | struct CustomRecord2Str 24 | { 25 | std::string operator()(const CustomRecord& x) 26 | { 27 | std::ostringstream ss; 28 | ss << (boost::format("(id = %d; name = '%s'; text = '%s')") 29 | % x.id % x.name % x.text); 30 | return ss.str(); 31 | } 32 | }; 33 | /// Generates records with a random id and a name/text derived from a running counter. 34 | struct CustomRecordGenerator 35 | { 36 | CustomRecord operator()() 37 | { 38 | CustomRecord x; 39 | std::ostringstream name; 40 | std::ostringstream text; 41 | x.id = rand(); 42 | cnt++; 43 | name << boost::format("Name %03d") % cnt; 44 | strncpy(x.name, name.str().c_str(), sizeof(x.name)); // unlike memcpy, stops at the end of the source string and zero-fills the rest 45 | x.name[sizeof(x.name) - 1] = '\0'; 46 | text << boost::format("Text %03d") % cnt; 47 | strncpy(x.text, text.str().c_str(), sizeof(x.text)); // same as above: never read past the source string 48 | x.text[sizeof(x.text) - 1] = '\0'; 49 | return x; 50 | } 51 | size_t cnt = 0; 52 | }; 53 | 54 | namespace external_sort { 55 | template <> 56 | struct ValueTraits 57 | { 58 | using Comparator = CustomRecordComparator; 59 | using Generator = CustomRecordGenerator; 60 | using Value2Str = CustomRecord2Str; 61 | 62 | // .. 
or default generator with all random bytes: 63 | // using Generator = DefaultValueGenerator; 64 | }; 65 | } 66 | 67 | #endif 68 | -------------------------------------------------------------------------------- /example/external_sort_main.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "logging.hpp" 9 | #include "external_sort.hpp" 10 | #include "external_sort_custom.hpp" 11 | 12 | namespace po = boost::program_options; 13 | 14 | /// ---------------------------------------------------------------------------- 15 | /// types 16 | 17 | using ValueType = uint32_t; 18 | 19 | // external sort also works with custom types 20 | // using ValueType = CustomRecord; 21 | 22 | /// ---------------------------------------------------------------------------- 23 | /// consts 24 | 25 | const uint8_t ACT_NONE = 0x00; 26 | const uint8_t ACT_ALL = 0xFF; 27 | const uint8_t ACT_GEN = 1 << 0; // generate 28 | const uint8_t ACT_SPL = 1 << 1; // split 29 | const uint8_t ACT_MRG = 1 << 2; // merge 30 | const uint8_t ACT_CHK = 1 << 3; // check 31 | 32 | const char* DEF_MRG_RES_SFX = ".sorted"; 33 | const char* DEF_GEN_OFILE = "generated"; 34 | 35 | /// ---------------------------------------------------------------------------- 36 | /// auxiliary functions 37 | /// Converts an option value to a string for logging; taken by const reference to avoid copying the held value. 38 | std::string any2str(const boost::any& x) 39 | { 40 | std::ostringstream ss; 41 | if (x.type() == typeid(std::string)) { 42 | ss << boost::any_cast(x); 43 | } else if (x.type() == typeid(size_t)) { 44 | ss << boost::any_cast(x); 45 | } else if (x.type() == typeid(int)) { 46 | ss << boost::any_cast(x); 47 | } else if (x.type() == typeid(bool)) { 48 | ss << (boost::any_cast(x) ? 
"true" : "false" ); 49 | } else { 50 | ss << "..."; 51 | } 52 | return ss.str(); 53 | } 54 | 55 | void log_params(const po::variables_map& params, 56 | const std::string& section = {}) 57 | { 58 | for (auto it = params.begin(); it != params.end(); ++it) { 59 | if ((section.empty() && it->first.find(".") == std::string::npos) || 60 | (!section.empty() && 61 | section.compare(0, section.length(), 62 | it->first, 0, section.length()) == 0)) { 63 | LOG_LOW(("%-10s = %s") % it->first % any2str(it->second.value())); 64 | } 65 | } 66 | } 67 | 68 | std::string basename(const std::string& pathname) 69 | { 70 | return {std::find_if(pathname.rbegin(), pathname.rend(), 71 | [](char c) { return c == '/'; }).base(), 72 | pathname.end()}; 73 | } 74 | 75 | std::string replace_dirname(const std::string& pathname, 76 | const std::string& dirname) 77 | { 78 | if (dirname.size()) { 79 | return dirname + '/' + basename(pathname); 80 | } 81 | return pathname; 82 | } 83 | 84 | /// ---------------------------------------------------------------------------- 85 | /// action: split/sort 86 | 87 | std::list act_split(const po::variables_map& vm) 88 | { 89 | LOG_IMP(("\n*** Phase 1: Splitting and Sorting")); 90 | LOG_IMP(("Input file: %s") % vm["spl.ifile"].as()); 91 | log_params(vm, "spl"); 92 | TIMER("Done in %t sec CPU, %w sec real\n"); 93 | 94 | external_sort::SplitParams params; 95 | params.mem.size = vm["msize"].as(); 96 | params.mem.unit = vm["memunit"].as(); 97 | params.mem.blocks = vm["spl.blocks"].as(); 98 | params.spl.ifile = vm["spl.ifile"].as(); 99 | params.spl.ofile = vm["spl.ofile"].as(); 100 | 101 | external_sort::split(params); 102 | if (params.err) { 103 | LOG_ERR(("Error: %s") % params.err.msg()); 104 | } 105 | return params.out.ofiles; 106 | } 107 | 108 | /// ---------------------------------------------------------------------------- 109 | /// action: merge 110 | 111 | void act_merge(const po::variables_map& vm, std::list& files) 112 | { 113 | LOG_IMP(("\n*** Phase 2: 
Merging")); 114 | log_params(vm, "mrg"); 115 | TIMER("Done in %t sec CPU, %w sec real\n"); 116 | 117 | external_sort::MergeParams params; 118 | params.mem.size = vm["msize"].as(); 119 | params.mem.unit = vm["memunit"].as(); 120 | params.mrg.merges = vm["mrg.merges"].as(); 121 | params.mrg.kmerge = vm["mrg.kmerge"].as(); 122 | params.mrg.stmblocks = vm["mrg.stmblocks"].as(); 123 | params.mrg.ifiles = files; 124 | params.mrg.tfile = vm["mrg.tfile"].as(); 125 | params.mrg.ofile = vm["mrg.ofile"].as(); 126 | params.mrg.rm_input = !vm["no_rm"].as(); 127 | 128 | external_sort::merge(params); 129 | if (params.err) { 130 | LOG_ERR(("Error: %s") % params.err.msg()); 131 | } 132 | } 133 | 134 | /// ---------------------------------------------------------------------------- 135 | /// action: generate 136 | 137 | void act_generate(const po::variables_map& vm) 138 | { 139 | LOG_IMP(("\n*** Generating random data")); 140 | LOG_IMP(("Output file: %s") % vm["gen.ofile"].as()); 141 | log_params(vm, "gen"); 142 | TIMER("Done in %t sec CPU, %w sec real\n"); 143 | 144 | external_sort::GenerateParams params; 145 | params.mem.size = vm["msize"].as(); 146 | params.mem.unit = vm["memunit"].as(); 147 | params.mem.blocks = vm["gen.blocks"].as(); 148 | params.gen.ofile = vm["gen.ofile"].as(); 149 | params.gen.fsize = vm["gen.fsize"].as(); 150 | 151 | external_sort::generate(params); 152 | if (params.err) { 153 | LOG_ERR(("Error: %s") % params.err.msg()); 154 | } 155 | } 156 | 157 | /// ---------------------------------------------------------------------------- 158 | /// action: check 159 | 160 | void act_check(const po::variables_map& vm) 161 | { 162 | LOG_IMP(("\n*** Checking data")); 163 | LOG_IMP(("Input file: %s") % vm["chk.ifile"].as()); 164 | log_params(vm, "chk"); 165 | TIMER("Done in %t sec CPU, %w sec real\n"); 166 | 167 | external_sort::CheckParams params; 168 | params.mem.size = vm["msize"].as(); 169 | params.mem.unit = vm["memunit"].as(); 170 | params.mem.blocks = 
vm["chk.blocks"].as(); 171 | params.chk.ifile = vm["chk.ifile"].as(); 172 | 173 | external_sort::check(params); 174 | if (params.err) { 175 | LOG_ERR(("The input file is NOT sorted!")); 176 | } 177 | LOG_IMP(("%s") % params.err.msg()); 178 | } 179 | 180 | /// ---------------------------------------------------------------------------- 181 | /// main 182 | 183 | int main(int argc, char *argv[]) 184 | { 185 | std::ostringstream ss; 186 | ss << boost::format("\nUsage: %s [options]\n\n" 187 | "General options") % basename(argv[0]); 188 | 189 | po::options_description desc(ss.str()); 190 | desc.add_options() 191 | ("help,h", 192 | "Display this information\n") 193 | 194 | ("act", 195 | po::value()->default_value("all"), 196 | "Action to perform. Possible values:\n" 197 | "\n" 198 | "gen - Generates random data\n" 199 | "spl - Splits and sorts the input\n" 200 | "mrg - Merges the input\n" 201 | "chk - Checks if the input is sorted\n" 202 | "all - All of the above\n" 203 | "srt = spl + mrg\n") 204 | 205 | ("msize", 206 | po::value()->default_value(1), 207 | "Memory size") 208 | 209 | ("munit", 210 | po::value()->default_value("M"), 211 | "Memory unit: ") 212 | 213 | ("log", 214 | po::value()->default_value(4), 215 | "Log level: [0-6]") 216 | 217 | ("no_rm", 218 | po::value()-> 219 | zero_tokens()->default_value(false)->implicit_value(true), 220 | "Do not remove temporary files") 221 | 222 | ("tmpdir", 223 | po::value()->default_value("", ""), 224 | "Directory for temporary files\n(relevant if act includes mrg)"); 225 | 226 | po::options_description gen_desc("Options for act=gen (generate)"); 227 | gen_desc.add_options() 228 | ("gen.ofile", 229 | po::value()->default_value(DEF_GEN_OFILE), 230 | "Output file") 231 | 232 | ("gen.fsize", 233 | po::value(), 234 | "File size to generate, in memory units.\n" 235 | "By default: gen.fsize = 16 * msize") 236 | 237 | ("gen.blocks", 238 | po::value()->default_value(2), 239 | "Number of blocks in memory"); 240 | 241 | 
po::options_description spl_desc( 242 | "Options for act=spl (phase 1: split and sort)"); 243 | spl_desc.add_options() 244 | ("srt.ifile", 245 | po::value()->default_value(""), 246 | "Same as --spl.ifile") 247 | 248 | ("spl.ifile", 249 | po::value()->default_value(""), 250 | "Input file") 251 | 252 | ("spl.ofile", 253 | po::value()->default_value(""), 254 | "Output file prefix") 255 | 256 | ("spl.blocks", 257 | po::value()->default_value(2), 258 | "Number of blocks in memory"); 259 | 260 | po::options_description mrg_desc("Options for act=mrg (phase 2: merge)"); 261 | mrg_desc.add_options() 262 | ("mrg.ifiles", 263 | po::value>()->default_value( 264 | std::vector(), "")->multitoken(), 265 | "Input files to be merged into one\n" 266 | "(required and only relevant if act=mrg, otherwise the list of files, " 267 | "i.e. sorted splits, is passed over from phase 1)") 268 | 269 | ("mrg.ofile", 270 | po::value()->default_value(std::string("") + 271 | DEF_MRG_RES_SFX), 272 | "Output file (required if act=mrg)") 273 | 274 | ("mrg.merges", 275 | po::value()->default_value(4), 276 | "Number of simultaneous merge merges") 277 | 278 | ("mrg.kmerge", 279 | po::value()->default_value(4), 280 | "Number of streams merged at a time") 281 | 282 | ("mrg.stmblocks", 283 | po::value()->default_value(2), 284 | "Number of memory blocks per stream"); 285 | 286 | po::options_description chk_desc("Options for act=chk (check)"); 287 | chk_desc.add_options() 288 | ("chk.ifile", 289 | po::value()->default_value(""), 290 | " Input file") 291 | 292 | ("chk.blocks", 293 | po::value()->default_value(2), 294 | " Number of blocks in memory"); 295 | 296 | spl_desc.add(mrg_desc); 297 | gen_desc.add(spl_desc); 298 | desc.add(gen_desc); 299 | desc.add(chk_desc); 300 | 301 | // parse command line arguments 302 | po::variables_map vm; 303 | 304 | try { 305 | po::store(po::parse_command_line(argc, argv, desc), vm); 306 | if (vm.count("help")) { 307 | std::cout << desc << std::endl; 308 | return 1; 309 | } 
310 | po::notify(vm); 311 | } catch (std::exception& e) { 312 | std::cerr << "Error: " << e.what() << "\n"; 313 | std::cout << desc << std::endl; 314 | return 1; 315 | } catch (...) { 316 | std::cerr << "Unknown error!" << "\n"; 317 | return 1; 318 | } 319 | 320 | severity_level lvl = IMP; 321 | if (vm["log"].as() >= 0 && 322 | vm["log"].as() <= 6) { 323 | lvl = static_cast(vm["log"].as()); 324 | } 325 | 326 | LOG_INIT(lvl); 327 | TRACE_FUNC(); 328 | srand(time(NULL)); 329 | log_params(vm); 330 | 331 | // po::variables_map does not allow to modify variable_values, 332 | // so cast it to std::map to be able to modify the content 333 | auto& mr = static_cast&>(vm); 334 | 335 | // set the default value for fsize 336 | if (!vm.count("gen.fsize")) { 337 | mr["gen.fsize"].value() = mr["msize"].as() * 16; 338 | } 339 | 340 | // get memory unit coefficient 341 | if (vm["munit"].as() == "M") { 342 | vm.insert(std::make_pair("memunit", 343 | po::variable_value(external_sort::MB, false))); 344 | } else if (vm["munit"].as() == "K") { 345 | vm.insert(std::make_pair("memunit", 346 | po::variable_value(external_sort::KB, false))); 347 | } else if (vm["munit"].as() == "B") { 348 | vm.insert(std::make_pair("memunit", 349 | po::variable_value(external_sort::B, false))); 350 | } else { 351 | LOG_INF(("Unknown munit: %s") % vm["munit"].as()); 352 | std::cout << desc << std::endl; 353 | return 1; 354 | } 355 | 356 | uint8_t act = ACT_NONE; 357 | std::string action = vm["act"].as(); 358 | if (action == "all") { 359 | act = ACT_ALL; 360 | } else if (action == "gen") { 361 | act = ACT_GEN; 362 | } else if (action == "spl") { 363 | act = ACT_SPL; 364 | } else if (action == "mrg") { 365 | act = ACT_MRG; 366 | } else if (action == "chk") { 367 | act = ACT_CHK; 368 | } else if (action == "srt") { 369 | act = ACT_SPL | ACT_MRG; 370 | } else { 371 | LOG_INF(("Unknown action: %s") % action); 372 | std::cout << desc << std::endl; 373 | return 1; 374 | } 375 | 376 | std::list files; 377 | 378 | // 
adjust filename variables according to the provided options 379 | if (!vm["srt.ifile"].defaulted()) { 380 | mr["spl.ifile"].value() = mr["srt.ifile"].value(); 381 | } else if (vm["spl.ifile"].defaulted()) { 382 | mr["spl.ifile"].value() = mr["gen.ofile"].value(); 383 | } 384 | if (vm["spl.ofile"].defaulted()) { 385 | mr["spl.ofile"].value() = mr["spl.ifile"].value(); 386 | } 387 | if (!(act & ACT_SPL) && (act & ACT_MRG)) { 388 | // no split/sort phase, but only the merge phase 389 | // check for mandatory parameters 390 | for (auto param : {"mrg.ifiles", "mrg.ofile"}){ 391 | if (vm[param].defaulted()) { 392 | LOG_ERR(("Missing mandatory parameter: %s\n" 393 | "For more information, run: %s --help") 394 | % param % argv[0]); 395 | return 1; 396 | } 397 | } 398 | // copy the provided files into the queue 399 | for (const auto& x : 400 | vm["mrg.ifiles"].as>()) { 401 | files.push_back(x); 402 | } 403 | } 404 | if (vm["mrg.ofile"].defaulted()) { 405 | mr["mrg.ofile"].value() = mr["spl.ifile"].value(); 406 | mr["mrg.ofile"].as() += DEF_MRG_RES_SFX; 407 | } 408 | if (vm["chk.ifile"].defaulted()) { 409 | mr["chk.ifile"].value() = mr["mrg.ofile"].value(); 410 | } 411 | 412 | // prefix for temp splits (in case of merge, use tmpdir, if given) 413 | if (act & ACT_MRG) { 414 | mr["spl.ofile"].as() = replace_dirname( 415 | vm["spl.ifile"].as(), vm["tmpdir"].as()); 416 | } 417 | // prefix for temp merges 418 | vm.insert(std::make_pair("mrg.tfile", 419 | po::variable_value(std::string(), false))); 420 | mr["mrg.tfile"].as() = replace_dirname( 421 | vm["mrg.ofile"].defaulted() ? vm["spl.ifile"].as() 422 | : vm["mrg.ofile"].as(), 423 | vm["tmpdir"].as()); 424 | 425 | TIMER("\nOverall %t sec CPU, %w sec real\n"); 426 | 427 | // action! 
428 | if (act & ACT_GEN) { 429 | act_generate(vm); 430 | } 431 | if (act & ACT_SPL) { 432 | files = act_split(vm); 433 | } 434 | if (act & ACT_MRG) { 435 | act_merge(vm, files); 436 | } 437 | if (act & ACT_CHK) { 438 | act_check(vm); 439 | } 440 | 441 | return 0; 442 | } 443 | -------------------------------------------------------------------------------- /example/logging/logging.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "logging.hpp" 6 | 7 | #ifdef BOOSTLOG 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | namespace logging = boost::log; 17 | namespace expr = boost::log::expressions; 18 | namespace attrs = boost::log::attributes; 19 | namespace keywords = boost::log::keywords; 20 | 21 | BOOST_LOG_ATTRIBUTE_KEYWORD(timestamp, "TimeStamp", boost::posix_time::ptime) 22 | BOOST_LOG_ATTRIBUTE_KEYWORD(thread_id, "ThreadID", attrs::current_thread_id::value_type) 23 | BOOST_LOG_ATTRIBUTE_KEYWORD(severity, "Severity", severity_level) 24 | BOOST_LOG_ATTRIBUTE_KEYWORD(scope, "Scope", attrs::named_scope::value_type) 25 | BOOST_LOG_ATTRIBUTE_KEYWORD(channel, "Channel", std::string) 26 | #endif 27 | 28 | // The operator is used for regular stream formatting 29 | std::ostream& operator<< (std::ostream& strm, severity_level level) 30 | { 31 | static const char* strings[] = 32 | { 33 | "NON", 34 | "FAT", 35 | "ERR", 36 | "WRN", 37 | "IMP", 38 | "INF", 39 | "LOW", 40 | "DBG" 41 | }; 42 | if (static_cast(level) < sizeof(strings) / sizeof(*strings)) { 43 | strm << strings[level]; 44 | } else { 45 | strm << static_cast(level); 46 | } 47 | return strm; 48 | } 49 | 50 | #ifdef BOOSTLOG 51 | inline size_t tid2nid(const logging::aux::thread::native_type& tid) 52 | { 53 | static std::unordered_map tid_map; 55 | 56 | auto nid_iter = tid_map.find(tid); 57 | if (nid_iter == tid_map.end()) { 58 | tid_map[tid] = tid_map.size(); 59 | nid_iter 
= tid_map.find(tid); 60 | } 61 | return nid_iter->second; 62 | } 63 | 64 | struct my_formatter 65 | { 66 | void operator()(logging::record_view const& rec, 67 | logging::formatting_ostream& strm) 68 | { 69 | #ifdef DEBUG 70 | std::ostringstream ss_channel; 71 | ss_channel << "<" << rec[channel] << ">"; 72 | 73 | std::ostringstream ss_scope; 74 | ss_scope << "("; 75 | if (rec[scope]->size() >= 2) { 76 | ss_scope << (*(++rec[scope]->rbegin())).scope_name << "->"; 77 | } 78 | if (rec[scope]->size() >= 1) { 79 | ss_scope << (*(rec[scope]->rbegin())).scope_name; 80 | } 81 | ss_scope << ")"; 82 | #endif 83 | std::string fdt_str; 84 | logging::formatting_ostream fdt_stream(fdt_str); 85 | dt_formatter_(rec, fdt_stream); 86 | 87 | auto tid = rec[thread_id]->native_id(); 88 | 89 | strm << bformat_ 90 | % fdt_str % tid 91 | #ifdef DEBUG 92 | % tid2nid(tid) 93 | #endif 94 | % rec[severity] 95 | #ifdef DEBUG 96 | % ss_channel.str() % ss_scope.str() 97 | #endif 98 | % rec[expr::smessage]; 99 | } 100 | 101 | my_formatter() : 102 | #ifdef DEBUG 103 | bformat_(boost::format("[%s %012x:%03d %s] %-40s %-40s %s")), 104 | #else 105 | bformat_(boost::format("[%s %012x %s] %s")), 106 | #endif 107 | dt_formatter_(expr::stream << 108 | expr::format_date_time( 109 | "TimeStamp", "%Y-%m-%d %H:%M:%S.%f")) 110 | { 111 | } 112 | 113 | private: 114 | boost::format bformat_; 115 | std::function dt_formatter_; 117 | }; 118 | #endif // BOOSTLOG 119 | 120 | severity_level log_lvl = IMP; 121 | 122 | void log_init(severity_level lvl) 123 | { 124 | log_lvl = lvl; 125 | #ifdef BOOSTLOG 126 | // Add attributes 127 | logging::add_common_attributes(); 128 | logging::core::get()->add_global_attribute("Scope", attrs::named_scope()); 129 | 130 | typedef logging::sinks::synchronous_sink< 131 | logging::sinks::text_ostream_backend> text_sink; 132 | 133 | // file sink 134 | boost::shared_ptr< text_sink > sinkFile = boost::make_shared< text_sink >(); 135 | sinkFile->locked_backend()->add_stream( 136 | 
boost::make_shared< std::ofstream >("trace.log")); 137 | sinkFile->locked_backend()->auto_flush(true); 138 | logging::core::get()->add_sink(sinkFile); 139 | 140 | // console sink 141 | boost::shared_ptr sinkConsole = boost::make_shared(); 142 | sinkConsole->locked_backend()->add_stream( 143 | boost::shared_ptr< std::ostream >(&std::cout, logging::empty_deleter())); 144 | sinkConsole->locked_backend()->auto_flush(true); 145 | logging::core::get()->add_sink(sinkConsole); 146 | 147 | sinkFile->set_formatter(my_formatter()); 148 | sinkConsole->set_formatter(my_formatter()); 149 | #endif // BOOSTLOG 150 | } 151 | -------------------------------------------------------------------------------- /example/logging/logging.hpp: -------------------------------------------------------------------------------- 1 | #ifndef LOGGING_HPP 2 | #define LOGGING_HPP 3 | 4 | #include 5 | #include 6 | 7 | // we are going to define the LOG* and TRACE* macros 8 | // undefine the macros first to avoid possible double definition warning 9 | #undef LOG 10 | #undef LOG_FAT 11 | #undef LOG_ERR 12 | #undef LOG_WRN 13 | #undef LOG_IMP 14 | #undef LOG_INF 15 | #undef LOG_LOW 16 | #undef LOG_DBG 17 | 18 | #undef TRACE 19 | #undef TRACE_SCOPE 20 | #undef TRACE_FUNC 21 | #undef TRACEX 22 | #undef TRACEX_SCOPE 23 | #undef TRACEX_METHOD 24 | #undef TRACEX_NAME 25 | 26 | // NOLOG 27 | // STDLOG 28 | // BOOSTLOG 29 | 30 | #define TIMER(x) boost::timer::auto_cpu_timer __x__timer(x); 31 | 32 | enum severity_level 33 | { 34 | NON = 0, // none 35 | FAT = 1, // fatal 36 | ERR = 2, // error 37 | WRN = 3, // warning 38 | IMP = 4, // important information 39 | INF = 5, // information 40 | LOW = 6, // information of low importance 41 | DBG = 7, // debug (only for debug builds) 42 | }; 43 | 44 | extern severity_level log_lvl; 45 | 46 | /// ---------------------------------------------------------------------------- 47 | /// LOG with boost.log 48 | #ifdef BOOSTLOG 49 | #include 50 | 51 | typedef 
boost::log::sources::severity_channel_logger_mt< 52 | severity_level, // the type of the severity level 53 | std::string // the type of the channel name 54 | > channel_logger_mt; 55 | 56 | BOOST_LOG_INLINE_GLOBAL_LOGGER_INIT(my_global_logger, channel_logger_mt) 57 | { 58 | return channel_logger_mt(boost::log::keywords::channel = "global"); 59 | } 60 | 61 | #define LOG(lvl, x) \ 62 | BOOST_LOG_SEV(my_global_logger::get(), lvl) << boost::format x 63 | #else 64 | /// ---------------------------------------------------------------------------- 65 | /// LOG with std::cout 66 | #define LOG(lvl, x) \ 67 | { if (lvl <= log_lvl) std::cout << boost::format x << std::endl; } 68 | #endif 69 | 70 | #define LOG_FAT(x) LOG(FAT, x) 71 | #define LOG_ERR(x) LOG(ERR, x) 72 | #define LOG_WRN(x) LOG(WRN, x) 73 | #define LOG_IMP(x) LOG(IMP, x) 74 | #define LOG_INF(x) LOG(INF, x) 75 | #define LOG_LOW(x) LOG(LOW, x) 76 | #define LOG_DBG(x) LOG(DBG, x) 77 | 78 | #define LOG_INIT log_init 79 | void log_init(severity_level = IMP); 80 | 81 | /// ---------------------------------------------------------------------------- 82 | /// NO DEBUG => all TRACE* macros are empty 83 | #if !defined(DEBUG) || !defined(BOOSTLOG) 84 | 85 | #define TRACE(x) 86 | #define TRACE_SCOPE(scope) 87 | #define TRACE_FUNC() 88 | 89 | #define TRACEX(x) 90 | #define TRACEX_SCOPE(scope) 91 | #define TRACEX_METHOD() 92 | #define TRACEX_NAME(channel) 93 | 94 | #else 95 | /// ---------------------------------------------------------------------------- 96 | /// DEBUG TRACE with boost log 97 | 98 | #define TRACE_WITH_LOGGER(logger, x) \ 99 | BOOST_LOG_SEV(logger, DBG) << boost::format x 100 | 101 | template 102 | class TraceScope { 103 | public: 104 | TraceScope(const Logger& logger, const std::string& scope) 105 | : scope_(scope), logger_(logger) 106 | { 107 | TRACE_WITH_LOGGER(logger_, ("--> Enter %s") % scope_); 108 | } 109 | ~TraceScope() 110 | { 111 | TRACE_WITH_LOGGER(logger_, ("<-- Exit %s") % scope_); 112 | } 113 | 
private: 114 | std::string scope_; 115 | const Logger &logger_; 116 | }; 117 | 118 | #define AUX_TRACE_SCOPE_CLASS(logger, scope) \ 119 | TraceScope aux_trace_scope(logger, scope) 120 | 121 | /// TRACE_*() - global trace 122 | 123 | #define TRACE(x) \ 124 | TRACE_WITH_LOGGER(my_global_logger::get(), x) 125 | 126 | #define TRACE_SCOPE(scope) \ 127 | BOOST_LOG_NAMED_SCOPE(scope); \ 128 | AUX_TRACE_SCOPE_CLASS(my_global_logger::get(), scope); 129 | 130 | #define TRACE_FUNC() \ 131 | TRACE_SCOPE(__func__) 132 | 133 | /// TRACEX_*() - trace with named channel (for example, in a class) 134 | 135 | #define TRACEX(x) \ 136 | TRACE_WITH_LOGGER(get_channel_logger(), x) 137 | 138 | #define TRACEX_SCOPE(scope) \ 139 | BOOST_LOG_NAMED_SCOPE(scope); \ 140 | AUX_TRACE_SCOPE_CLASS(get_channel_logger(), scope) 141 | 142 | #define TRACEX_METHOD() \ 143 | TRACEX_SCOPE(__func__) 144 | 145 | #define TRACEX_NAME(channel_name) \ 146 | mutable std::shared_ptr channel_logger_; \ 147 | inline channel_logger_mt& get_channel_logger() const { \ 148 | if (!channel_logger_) { \ 149 | std::ostringstream ss; \ 150 | ss << boost::format("%014p:%s") % this % channel_name; \ 151 | channel_logger_.reset(new channel_logger_mt( \ 152 | boost::log::keywords::channel = ss.str())); \ 153 | } \ 154 | return *channel_logger_; \ 155 | } 156 | 157 | #endif 158 | 159 | #endif 160 | -------------------------------------------------------------------------------- /example/simple_example1.cc: -------------------------------------------------------------------------------- 1 | // can be compiled with: 2 | // g++ -std=c++11 -I.. 
-pthread simple_example1.cc -o ./simple1 3 | 4 | #include 5 | 6 | #include "external_sort.hpp" 7 | 8 | int main() 9 | { 10 | // set split and merge parameters 11 | external_sort::SplitParams sp; 12 | external_sort::MergeParams mp; 13 | sp.mem.size = 10; 14 | sp.mem.unit = external_sort::MB; 15 | mp.mem = sp.mem; 16 | sp.spl.ifile = "/dir1/big_input_file"; 17 | mp.mrg.ofile = "/dir2/big_sorted_file"; 18 | 19 | using ValueType = unsigned int; 20 | 21 | // run external sort 22 | external_sort::sort(sp, mp); 23 | 24 | if (sp.err.none && mp.err.none) { 25 | std::cout << "File sorted successfully!" << std::endl; 26 | } else { 27 | std::cout << "External sort failed!" << std::endl; 28 | if (sp.err) { 29 | std::cout << "Split failed: " << sp.err.msg() << std::endl; 30 | } else { 31 | std::cout << "Merge failed: " << mp.err.msg() << std::endl; 32 | } 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /external_sort.hpp: -------------------------------------------------------------------------------- 1 | #ifndef EXTERNAL_SORT_HPP 2 | #define EXTERNAL_SORT_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include "external_sort_nolog.hpp" 12 | #include "external_sort_types.hpp" 13 | #include "external_sort_merge.hpp" 14 | #include "async_funcs.hpp" 15 | 16 | namespace external_sort { 17 | 18 | const char* DEF_SPL_TMP_SFX = "split"; 19 | const char* DEF_MRG_TMP_SFX = "merge"; 20 | 21 | /// ---------------------------------------------------------------------------- 22 | /// auxiliary functions 23 | 24 | template 25 | SizeType memsize_in_bytes(const SizeType& memsize, const MemUnit& u) 26 | { 27 | if (u == KB) { 28 | return memsize << 10; 29 | } 30 | if (u == MB) { 31 | return memsize << 20; 32 | } 33 | return memsize; 34 | } 35 | 36 | template 37 | std::string make_tmp_filename(const std::string& prefix, 38 | const std::string& suffix, 39 | const IndexType& index) 40 | { 41 | 
std::ostringstream filename; 42 | filename << prefix << "." << suffix << "." 43 | << std::setfill ('0') << std::setw(3) << index; 44 | return filename.str(); 45 | } 46 | 47 | template 48 | typename Types::OStreamPtr 49 | sort_and_write(typename Types::BlockPtr block, 50 | typename Types::OStreamPtr ostream) 51 | { 52 | // sort the block 53 | std::sort(block->begin(), block->end(), 54 | typename Types::Comparator()); 55 | TRACE(("block %014p sorted") % 56 | Types::BlockTraits::RawPtr(block)); 57 | 58 | // write the block to the output stream 59 | ostream->WriteBlock(block); 60 | return ostream; 61 | } 62 | 63 | /// ---------------------------------------------------------------------------- 64 | /// main external sorting functions 65 | 66 | //! External Split 67 | template 68 | void split(SplitParams& params) 69 | { 70 | TRACE_FUNC(); 71 | size_t file_cnt = 0; 72 | 73 | aux::AsyncFuncs::OStreamPtr> splits; 74 | 75 | // create memory pool to be shared between input and output streams 76 | auto mem_pool = std::make_shared::BlockPool>( 77 | memsize_in_bytes(params.mem.size, params.mem.unit), params.mem.blocks); 78 | 79 | // create the input stream 80 | auto istream = std::make_shared::IStream>(); 81 | istream->set_mem_pool(mem_pool); 82 | istream->set_input_filename(params.spl.ifile); 83 | istream->set_input_rm_file(params.spl.rm_input); 84 | istream->Open(); 85 | 86 | if (params.spl.ofile.empty()) { 87 | // if no output prefix given, use input filename as a prefix 88 | params.spl.ofile = params.spl.ifile; 89 | } 90 | 91 | while (!istream->Empty()) { 92 | // read a block from the input stream 93 | auto block = istream->FrontBlock(); 94 | istream->PopBlock(); 95 | 96 | // create an output stream 97 | auto ostream = std::make_shared::OStream>(); 98 | ostream->set_mem_pool(mem_pool); 99 | ostream->set_output_filename( 100 | make_tmp_filename(params.spl.ofile, DEF_SPL_TMP_SFX, ++file_cnt)); 101 | ostream->Open(); 102 | 103 | // asynchronously sort the block and write it to 
the output stream 104 | splits.Async(&sort_and_write, 105 | std::move(block), std::move(ostream)); 106 | 107 | // collect the results 108 | while ((splits.Ready() > 0) || (splits.Running() && istream->Empty())) { 109 | // wait for any split and get its output filename 110 | auto ostream_ready = splits.GetAny(); 111 | if (ostream_ready) { 112 | ostream_ready->Close(); 113 | params.out.ofiles.push_back(ostream_ready->output_filename()); 114 | } 115 | } 116 | } 117 | istream->Close(); 118 | } 119 | 120 | //! External Merge 121 | template 122 | void merge(MergeParams& params) 123 | { 124 | TRACE_FUNC(); 125 | size_t file_cnt = 0; 126 | 127 | aux::AsyncFuncs::OStreamPtr> merges; 128 | 129 | size_t mem_merge = memsize_in_bytes(params.mem.size, params.mem.unit) / 130 | params.mrg.merges; 131 | size_t mem_ostream = mem_merge / 2; 132 | size_t mem_istream = mem_merge - mem_ostream; 133 | 134 | // Merge files while there is something to merge or there are ongoing merges 135 | auto files = params.mrg.ifiles; 136 | while (files.size() > 1 || !merges.Empty()) { 137 | LOG_INF(("* files left to merge %d") % files.size()); 138 | 139 | // create a set of input streams with next kmerge files from the queue 140 | std::unordered_set::IStreamPtr> istreams; 141 | while (istreams.size() < params.mrg.kmerge && !files.empty()) { 142 | // create input stream 143 | auto is = std::make_shared::IStream>(); 144 | is->set_mem_pool(mem_istream, params.mrg.stmblocks); 145 | is->set_input_filename(files.front()); 146 | is->set_input_rm_file(params.mrg.rm_input); 147 | // add to the set 148 | istreams.insert(is); 149 | files.pop_front(); 150 | } 151 | 152 | // create an output stream 153 | auto ostream = std::make_shared::OStream>(); 154 | ostream->set_mem_pool(mem_ostream, params.mrg.stmblocks); 155 | ostream->set_output_filename(make_tmp_filename( 156 | (params.mrg.tfile.size() ? 
params.mrg.tfile : params.mrg.ofile), 157 | DEF_MRG_TMP_SFX, ++file_cnt)); 158 | 159 | // asynchronously merge and write to the output stream 160 | merges.Async(&merge_streams::IStreamPtr, 161 | typename Types::OStreamPtr>, 162 | std::move(istreams), std::move(ostream)); 163 | 164 | // Wait/get results of asynchroniously running merges if: 165 | // 1) Too few files ready to be merged, while still running merges. 166 | // In other words, more files can be merged at once than 167 | // currently available. So wait for more files. 168 | // 2) There are completed (ready) merges; results shall be collected 169 | // 3) There are simply too many already ongoing merges 170 | while ((files.size() < params.mrg.kmerge && !merges.Empty()) || 171 | (merges.Ready() > 0) || (merges.Running() >= params.mrg.merges)) { 172 | auto ostream_ready = merges.GetAny(); 173 | if (ostream_ready) { 174 | files.push_back(ostream_ready->output_filename()); 175 | } 176 | } 177 | } 178 | 179 | if (files.size()) { 180 | if (rename(files.front().c_str(), params.mrg.ofile.c_str()) == 0) { 181 | LOG_IMP(("Output file: %s") % params.mrg.ofile); 182 | } else { 183 | params.err.none = false; 184 | params.err.stream << "Cannot rename " << files.front() 185 | << " to " << params.mrg.ofile; 186 | } 187 | } else { 188 | params.err.none = false; 189 | params.err.stream << "Merge failed. No input"; 190 | } 191 | } 192 | 193 | //! External Sort (= Split + Merge) 194 | template 195 | void sort(SplitParams& sp, MergeParams& mp) 196 | { 197 | split(sp); 198 | 199 | if (sp.err.none) { 200 | mp.mrg.ifiles = sp.out.ofiles; 201 | merge(mp); 202 | } 203 | } 204 | 205 | //! 
External Check 206 | template 207 | bool check(CheckParams& params) 208 | { 209 | TRACE_FUNC(); 210 | auto comp = typename ValueTraits::Comparator(); 211 | auto vtos = typename ValueTraits::Value2Str(); 212 | 213 | auto istream = std::make_shared::IStream>(); 214 | istream->set_mem_pool(memsize_in_bytes(params.mem.size, params.mem.unit), 215 | params.mem.blocks); 216 | istream->set_input_filename(params.chk.ifile); 217 | istream->Open(); 218 | 219 | size_t cnt = 0, bad = 0; 220 | if (!istream->Empty()) { 221 | auto vcurr = istream->Front(); 222 | auto vprev = vcurr; 223 | auto vfirst = vprev; 224 | auto vmin = vfirst; 225 | auto vmax = vfirst; 226 | istream->Pop(); 227 | ++cnt; 228 | 229 | while (!istream->Empty()) { 230 | vcurr = istream->Front(); 231 | if (comp(vcurr, vprev)) { 232 | if (bad < 10) { 233 | params.err.stream << "Out of order! cnt = " << cnt 234 | << " prev = " << vtos(vprev) 235 | << " curr = " << vtos(vcurr) << "\n"; 236 | } 237 | bad++; 238 | } 239 | if (comp(vcurr, vmin)) { 240 | vmin = vcurr; 241 | } 242 | if (comp(vmax, vcurr)) { 243 | vmax = vcurr; 244 | } 245 | vprev = vcurr; 246 | istream->Pop(); 247 | ++cnt; 248 | } 249 | if (bad) { 250 | params.err.none = false; 251 | params.err.stream << "Total elements out of order: " << bad << "\n"; 252 | } 253 | params.err.stream << "\tmin = " << vtos(vmin) 254 | << ", max = " << vtos(vmax) << "\n"; 255 | params.err.stream << "\tfirst = " << vtos(vfirst) 256 | << ", last = " << vtos(vprev) << "\n"; 257 | } 258 | params.err.stream << "\tsorted = " << ((bad) ? "false" : "true") 259 | << ", elems = " << cnt << ", bad = " << bad; 260 | istream->Close(); 261 | return bad == 0; 262 | } 263 | 264 | //! 
External Generate 265 | template 266 | void generate(const GenerateParams& params) 267 | { 268 | TRACE_FUNC(); 269 | 270 | auto generator = typename ValueTraits::Generator(); 271 | size_t gen_elements = memsize_in_bytes(params.gen.fsize, params.mem.unit) / 272 | sizeof(ValueType); 273 | 274 | auto ostream = std::make_shared::OStream>(); 275 | ostream->set_mem_pool(memsize_in_bytes(params.mem.size, params.mem.unit), 276 | params.mem.blocks); 277 | ostream->set_output_filename(params.gen.ofile); 278 | ostream->Open(); 279 | 280 | for (size_t i = 0; i < gen_elements; i++) { 281 | ostream->Push(generator()); 282 | } 283 | 284 | ostream->Close(); 285 | } 286 | 287 | } // namespace external_sort 288 | 289 | #endif 290 | -------------------------------------------------------------------------------- /external_sort_merge.hpp: -------------------------------------------------------------------------------- 1 | #ifndef EXTERNAL_SORT_MERGE_HPP 2 | #define EXTERNAL_SORT_MERGE_HPP 3 | 4 | namespace external_sort { 5 | 6 | // merges 1 stream (simple copy) 7 | template 8 | void copy_stream(InputStream* sin, OutputStream* sout) 9 | { 10 | TRACE_FUNC(); 11 | while (!sin->Empty()) { 12 | sout->Push(sin->Front()); 13 | sin->Pop(); 14 | } 15 | } 16 | 17 | // merges 2 streams 18 | template 19 | void merge_2streams(StreamSet& sin, OutputStream* sout, 20 | Comparator comp) 21 | { 22 | TRACE_FUNC(); 23 | if (sin.size() != 2) { 24 | LOG_ERR(("Internal error: mismatch in number of streams %d/%d") 25 | % sin.size() % 2); 26 | return; 27 | } 28 | auto it = sin.begin(); 29 | InputStream* s1 = *(it++); 30 | InputStream* s2 = *(it++); 31 | InputStream* smin = s1; 32 | 33 | for (;;) { 34 | smin = comp(s1->Front(), s2->Front()) ? 
s1 : s2; 35 | sout->Push(smin->Front()); 36 | smin->Pop(); 37 | if (smin->Empty()) { 38 | sin.erase(smin); 39 | break; 40 | } 41 | } 42 | copy_stream(*sin.begin(), sout); 43 | } 44 | 45 | // merges 3 streams 46 | template 47 | void merge_3streams(StreamSet& sin, OutputStream* sout, 48 | Comparator comp) 49 | { 50 | TRACE_FUNC(); 51 | if (sin.size() != 3) { 52 | LOG_ERR(("Internal error: mismatch in number of streams %d/%d") 53 | % sin.size() % 3); 54 | return; 55 | } 56 | auto it = sin.begin(); 57 | InputStream* s1 = *(it++); 58 | InputStream* s2 = *(it++); 59 | InputStream* s3 = *(it++); 60 | InputStream* smin = s1; 61 | 62 | for (;;) { 63 | if (comp(s1->Front(),s2->Front())) { 64 | smin = comp(s1->Front(), s3->Front()) ? s1 : s3; 65 | } else { 66 | smin = comp(s2->Front(), s3->Front()) ? s2 : s3; 67 | } 68 | sout->Push(smin->Front()); 69 | smin->Pop(); 70 | if (smin->Empty()) { 71 | sin.erase(smin); 72 | break; 73 | } 74 | } 75 | merge_2streams(sin, sout, comp); 76 | } 77 | 78 | // merges 4 streams 79 | template 80 | void merge_4streams(StreamSet& sin, OutputStream* sout, 81 | Comparator comp) 82 | { 83 | TRACE_FUNC(); 84 | if (sin.size() != 4) { 85 | LOG_ERR(("Internal error: mismatch in number of streams %d/%d") 86 | % sin.size() % 4); 87 | return; 88 | } 89 | auto it = sin.begin(); 90 | InputStream* s1 = *(it++); 91 | InputStream* s2 = *(it++); 92 | InputStream* s3 = *(it++); 93 | InputStream* s4 = *(it++); 94 | InputStream* smin = s1; 95 | 96 | for (;;) { 97 | if (comp(s1->Front(), s2->Front())) { 98 | if (comp(s3->Front(), s4->Front())) 99 | smin = comp(s1->Front(), s3->Front()) ? s1 : s3; 100 | else 101 | smin = comp(s1->Front(), s4->Front()) ? s1 : s4; 102 | } else { 103 | if (comp(s3->Front(), s4->Front())) 104 | smin = comp(s2->Front(), s3->Front()) ? s2 : s3; 105 | else 106 | smin = comp(s2->Front(), s4->Front()) ? 
s2 : s4; 107 | } 108 | sout->Push(smin->Front()); 109 | smin->Pop(); 110 | if (smin->Empty()) { 111 | sin.erase(smin); 112 | break; 113 | } 114 | } 115 | merge_3streams(sin, sout, comp); 116 | } 117 | 118 | template 119 | void merge_nstreams(StreamSet& sin, OutputStream* sout, 120 | Comparator comp) 121 | { 122 | TRACE_FUNC(); 123 | if (sin.size() <= 4) { 124 | LOG_ERR(("Internal error: too few streams for heap-based merge %d") 125 | % sin.size()); 126 | return; 127 | } 128 | 129 | InputStream* smin; 130 | 131 | std::vector heap; 132 | for (auto& s : sin) { 133 | if (!s->Empty()) { 134 | heap.push_back(s); 135 | } 136 | } 137 | auto hcomp = [ &comp ] (InputStream*& s1, InputStream*& s2) { 138 | return comp(s2->Front(), s1->Front()); 139 | }; 140 | std::make_heap(heap.begin(), heap.end(), hcomp); 141 | 142 | while (heap.size() > 4) { 143 | // find minimum element in the input streams 144 | smin = heap.front(); 145 | std::pop_heap(heap.begin(), heap.end(), hcomp); 146 | 147 | // output the minumum element 148 | sout->Push(smin->Front()); 149 | smin->Pop(); 150 | 151 | if (smin->Empty()) { 152 | // end of this stream 153 | heap.pop_back(); 154 | sin.erase(smin); 155 | } else { 156 | // there is more data in the stream, 157 | // push it back to the heap 158 | heap.back() = smin; 159 | std::push_heap(heap.begin(), heap.end(), hcomp); 160 | } 161 | } 162 | merge_4streams(sin, sout, comp); 163 | } 164 | 165 | template 166 | OutputStreamPtr merge_streams(StreamSet sin, 167 | OutputStreamPtr sout) 168 | { 169 | TRACE_FUNC(); 170 | // Make a new StreamSet with raw pointers to pass to the merge functions: 171 | // 1) Raw pointers are faster 172 | // 2) The merge functions will shrink the set as streams get exhausted 173 | // 3) The original StreamSet is needed to close all streams, when it's done 174 | using InputStream = typename InputStreamPtr::element_type; 175 | using OutputStream = typename OutputStreamPtr::element_type; 176 | StreamSet sinp; 177 | OutputStream* soutp = 
sout.get(); 178 | 179 | auto comp = typename Types< 180 | typename InputStream::BlockType::value_type>::Comparator(); 181 | 182 | for (const auto& s : sin) { 183 | s->Open(); 184 | if (!s->Empty()) { 185 | sinp.insert(s.get()); 186 | } 187 | } 188 | 189 | if (sinp.size() > 0) { 190 | sout->Open(); 191 | if (sinp.size() > 4) { 192 | merge_nstreams(sinp, soutp, comp); 193 | } else if (sinp.size() == 4) { 194 | merge_4streams(sinp, soutp, comp); 195 | } else if (sinp.size() == 3) { 196 | merge_3streams(sinp, soutp, comp); 197 | } else if (sinp.size() == 2) { 198 | merge_2streams(sinp, soutp, comp); 199 | } else if (sinp.size() == 1) { 200 | copy_stream(*sinp.begin(), soutp); 201 | } 202 | sout->Close(); 203 | } else { 204 | LOG_ERR(("No input streams to merge!")); 205 | sout.reset(); 206 | } 207 | 208 | for (const auto& s : sin) { 209 | s->Close(); 210 | } 211 | return sout; 212 | } 213 | 214 | } // namespace external_sort 215 | 216 | #endif 217 | -------------------------------------------------------------------------------- /external_sort_nolog.hpp: -------------------------------------------------------------------------------- 1 | #ifndef EXTERNAL_SORT_NOLOG_HPP 2 | #define EXTERNAL_SORT_NOLOG_HPP 3 | 4 | // The external sort source files use LOG* and TRACE* macros to 5 | // log info and trace debug messages. 
In order to be able to include 6 | // external_sort.hpp and compile it even if no such macors are defined 7 | // in the client program, we have to define them empty here 8 | 9 | #if !defined(LOG_FAT) || !defined(LOG_ERR) || !defined(LOG_WRN) || \ 10 | !defined(LOG_IMP) || !defined(LOG_INF) || !defined(LOG_LOW) || \ 11 | !defined(LOG_DBG) 12 | 13 | #define LOG_FAT(x) 14 | #define LOG_ERR(x) 15 | #define LOG_WRN(x) 16 | #define LOG_IMP(x) 17 | #define LOG_INF(x) 18 | #define LOG_LOW(x) 19 | #define LOG_DBG(x) 20 | 21 | #endif 22 | 23 | #if !defined(TRACE) || !defined(TRACE_SCOPE) || !defined(TRACE_FUNC) || \ 24 | !defined(TRACEX) || !defined(TRACEX_SCOPE) || !defined(TRACEX_METHOD) || \ 25 | !defined(TRACEX_NAME) 26 | 27 | #define TRACE(x) 28 | #define TRACE_SCOPE(scope) 29 | #define TRACE_FUNC() 30 | 31 | #define TRACEX(x) 32 | #define TRACEX_SCOPE(scope) 33 | #define TRACEX_METHOD() 34 | #define TRACEX_NAME(channel) 35 | 36 | #endif 37 | 38 | #endif 39 | -------------------------------------------------------------------------------- /external_sort_types.hpp: -------------------------------------------------------------------------------- 1 | #ifndef EXTERNAL_SORT_TYPES_HPP 2 | #define EXTERNAL_SORT_TYPES_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "block_types.hpp" 9 | #include "block_input_stream.hpp" 10 | #include "block_output_stream.hpp" 11 | #include "block_file_read_policy.hpp" 12 | #include "block_file_write_policy.hpp" 13 | #include "block_memory_policy.hpp" 14 | 15 | namespace external_sort { 16 | 17 | /// ---------------------------------------------------------------------------- 18 | /// Parameter objects 19 | 20 | enum MemUnit { MB, KB, B }; 21 | 22 | struct MemParams 23 | { 24 | size_t size = 10; // memory size 25 | MemUnit unit = MB; // memory unit 26 | size_t blocks = 2; // number of blocks memory is divided by 27 | }; 28 | 29 | struct ErrParams 30 | { 31 | bool none = true; // error status 32 | std::ostringstream stream; // 
error stream 33 | 34 | operator bool () const { return !none; } 35 | operator std::string () const { return stream.str(); } 36 | std::string msg() const { return stream.str(); } 37 | 38 | }; 39 | 40 | struct SplitParams 41 | { 42 | MemParams mem; // memory params 43 | ErrParams err; // error params 44 | struct { 45 | std::string ifile; // input file to split 46 | std::string ofile; // output file prefix (prefix of splits) 47 | bool rm_input = false; // ifile should be removed when done? 48 | } spl; 49 | struct { 50 | std::list ofiles; // list of output files (splits) 51 | } out; 52 | }; 53 | 54 | struct MergeParams 55 | { 56 | MemParams mem; // memory params 57 | ErrParams err; // error params 58 | struct { 59 | size_t merges = 4; // number of simultaneous merges 60 | size_t kmerge = 4; // number of streams to merge at a time 61 | size_t stmblocks = 2; // number of memory blocks per stream 62 | std::list ifiles; // list of input files to merge 63 | std::string tfile; // prefix for temporary files 64 | std::string ofile; // output file (the merge result) 65 | bool rm_input = true; // ifile should be removed when done? 66 | } mrg; 67 | }; 68 | 69 | struct CheckParams 70 | { 71 | MemParams mem; // memory params 72 | ErrParams err; // error params 73 | struct { 74 | std::string ifile; // input file to check it it's sorted 75 | } chk; 76 | }; 77 | 78 | struct GenerateParams 79 | { 80 | MemParams mem; // memory params 81 | ErrParams err; // error params 82 | struct { 83 | size_t fsize = 0; // file size to generate (in mem.units) 84 | std::string ofile; // output file 85 | } gen; 86 | }; 87 | 88 | /// ---------------------------------------------------------------------------- 89 | /// Types 90 | 91 | //! 
Default generator 92 | template 93 | struct DefaultValueGenerator 94 | { 95 | T operator()() 96 | { 97 | union { 98 | T data; 99 | uint8_t bytes[sizeof(T)]; 100 | } u; 101 | for (auto& b : u.bytes) { 102 | b = rand() & 0xFF; 103 | } 104 | return u.data; 105 | } 106 | }; 107 | 108 | //! Default value-to-string convertor 109 | template 110 | struct DefaultValue2Str 111 | { 112 | std::string operator()(const ValueType& value) 113 | { 114 | std::ostringstream ss; 115 | ss << value; 116 | return ss.str(); 117 | } 118 | }; 119 | 120 | //! Default ValueType traits 121 | template 122 | struct ValueTraits 123 | { 124 | using Comparator = std::less; 125 | using Generator = DefaultValueGenerator; 126 | using Value2Str = DefaultValue2Str; 127 | 128 | // It can be extended to support non-POD types: 129 | // static const size_t ValueSize = sizeof(ValueType); 130 | // static inline int Serialize(...); 131 | // static inline int Deserialize(...); 132 | }; 133 | 134 | //! Stream set 135 | template 136 | using StreamSet = std::unordered_set; 137 | 138 | //! All types in one place 139 | template 140 | struct Types 141 | { 142 | // Value trait shortcuts 143 | using Comparator = typename ValueTraits::Comparator; 144 | 145 | // Block Types 146 | using Block = block::VectorBlock; 147 | using BlockPtr = typename block::BlockTraits::BlockPtr; 148 | using BlockPool = typename block::BlockMemoryPolicy::BlockPool; 149 | using BlockTraits = block::BlockTraits; 150 | 151 | // Stream Types 152 | using IStream = block::BlockInputStream, 154 | block::BlockMemoryPolicy>; 155 | 156 | using OStream = block::BlockOutputStream, 158 | block::BlockMemoryPolicy>; 159 | 160 | using IStreamPtr = std::shared_ptr; 161 | using OStreamPtr = std::shared_ptr; 162 | }; 163 | 164 | } // namespace external_sort 165 | 166 | #endif 167 | --------------------------------------------------------------------------------