├── .clang-format ├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── LICENSE ├── README.md ├── data └── test_collection.docs ├── include └── interpolative_coding.hpp ├── src ├── CMakeLists.txt ├── check.cpp ├── decode.cpp └── encode.cpp └── test ├── CMakeLists.txt └── example.cpp /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | # BasedOnStyle: Google 4 | AccessModifierOffset: -4 5 | AlignAfterOpenBracket: Align 6 | AlignConsecutiveAssignments: false 7 | AlignConsecutiveDeclarations: false 8 | AlignEscapedNewlines: Left 9 | AlignOperands: true 10 | AlignTrailingComments: true 11 | AllowAllParametersOfDeclarationOnNextLine: true 12 | AllowShortBlocksOnASingleLine: false 13 | AllowShortCaseLabelsOnASingleLine: false 14 | AllowShortFunctionsOnASingleLine: Empty 15 | AllowShortIfStatementsOnASingleLine: true 16 | AllowShortLoopsOnASingleLine: true 17 | AlwaysBreakAfterDefinitionReturnType: None 18 | AlwaysBreakAfterReturnType: None 19 | AlwaysBreakBeforeMultilineStrings: true 20 | AlwaysBreakTemplateDeclarations: Yes 21 | BinPackArguments: true 22 | BinPackParameters: true 23 | BraceWrapping: 24 | AfterClass: false 25 | AfterControlStatement: false 26 | AfterEnum: false 27 | AfterFunction: false 28 | AfterNamespace: false 29 | AfterObjCDeclaration: false 30 | AfterStruct: false 31 | AfterUnion: false 32 | AfterExternBlock: false 33 | BeforeCatch: false 34 | BeforeElse: false 35 | IndentBraces: false 36 | SplitEmptyFunction: true 37 | SplitEmptyRecord: true 38 | SplitEmptyNamespace: true 39 | BreakBeforeBinaryOperators: None 40 | BreakBeforeBraces: Attach 41 | BreakBeforeInheritanceComma: false 42 | BreakInheritanceList: BeforeComma 43 | BreakBeforeTernaryOperators: true 44 | BreakConstructorInitializersBeforeComma: true 45 | BreakConstructorInitializers: BeforeComma 46 | BreakAfterJavaFieldAnnotations: false 47 | BreakStringLiterals: true 48 | ColumnLimit: 80 49 | CommentPragmas: '^ IWYU 
pragma:' 50 | CompactNamespaces: false 51 | ConstructorInitializerAllOnOneLineOrOnePerLine: true 52 | ConstructorInitializerIndentWidth: 4 53 | ContinuationIndentWidth: 4 54 | Cpp11BracedListStyle: true 55 | DerivePointerAlignment: false 56 | DisableFormat: false 57 | ExperimentalAutoDetectBinPacking: false 58 | FixNamespaceComments: true 59 | ForEachMacros: 60 | - foreach 61 | - Q_FOREACH 62 | - BOOST_FOREACH 63 | IncludeBlocks: Preserve 64 | IncludeCategories: 65 | - Regex: '^' 66 | Priority: 2 67 | - Regex: '^<.*\.h>' 68 | Priority: 1 69 | - Regex: '^<.*' 70 | Priority: 2 71 | - Regex: '.*' 72 | Priority: 3 73 | IncludeIsMainRegex: '([-_](test|unittest))?$' 74 | IndentCaseLabels: true 75 | IndentPPDirectives: None 76 | IndentWidth: 4 77 | IndentWrappedFunctionNames: false 78 | JavaScriptQuotes: Leave 79 | JavaScriptWrapImports: true 80 | KeepEmptyLinesAtTheStartOfBlocks: false 81 | MacroBlockBegin: '' 82 | MacroBlockEnd: '' 83 | MaxEmptyLinesToKeep: 1 84 | NamespaceIndentation: None 85 | ObjCBinPackProtocolList: Never 86 | ObjCBlockIndentWidth: 2 87 | ObjCSpaceAfterProperty: false 88 | ObjCSpaceBeforeProtocolList: true 89 | PenaltyBreakAssignment: 2 90 | PenaltyBreakBeforeFirstCallParameter: 1 91 | PenaltyBreakComment: 300 92 | PenaltyBreakFirstLessLess: 120 93 | PenaltyBreakString: 1000 94 | PenaltyBreakTemplateDeclaration: 10 95 | PenaltyExcessCharacter: 1000000 96 | PenaltyReturnTypeOnItsOwnLine: 200 97 | PointerAlignment: Left 98 | RawStringFormats: 99 | - Language: Cpp 100 | Delimiters: 101 | - cc 102 | - CC 103 | - cpp 104 | - Cpp 105 | - CPP 106 | - 'c++' 107 | - 'C++' 108 | CanonicalDelimiter: '' 109 | BasedOnStyle: google 110 | - Language: TextProto 111 | Delimiters: 112 | - pb 113 | - PB 114 | - proto 115 | - PROTO 116 | EnclosingFunctions: 117 | - EqualsProto 118 | - EquivToProto 119 | - PARSE_PARTIAL_TEXT_PROTO 120 | - PARSE_TEST_PROTO 121 | - PARSE_TEXT_PROTO 122 | - ParseTextOrDie 123 | - ParseTextProtoOrDie 124 | CanonicalDelimiter: '' 125 | 
BasedOnStyle: google 126 | ReflowComments: true 127 | SortIncludes: false 128 | SortUsingDeclarations: false 129 | SpaceAfterCStyleCast: false 130 | SpaceAfterTemplateKeyword: true 131 | SpaceBeforeAssignmentOperators: true 132 | SpaceBeforeCpp11BracedList: false 133 | SpaceBeforeCtorInitializerColon: true 134 | SpaceBeforeInheritanceColon: true 135 | SpaceBeforeParens: ControlStatements 136 | SpaceBeforeRangeBasedForLoopColon: true 137 | SpaceInEmptyParentheses: false 138 | SpacesBeforeTrailingComments: 2 139 | SpacesInAngles: false 140 | SpacesInContainerLiterals: true 141 | SpacesInCStyleCastParentheses: false 142 | SpacesInParentheses: false 143 | SpacesInSquareBrackets: false 144 | Standard: Auto 145 | StatementMacros: 146 | - Q_UNUSED 147 | - QT_REQUIRE_VERSION 148 | TabWidth: 8 149 | UseTab: Never 150 | ... 151 | 152 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | build 3 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "external/mm_file"] 2 | path = external/mm_file 3 | url = https://github.com/jermp/mm_file.git 4 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | project(BIC) 3 | 4 | if(NOT CMAKE_BUILD_TYPE) 5 | set(CMAKE_BUILD_TYPE "Release") 6 | endif() 7 | MESSAGE( STATUS "CMAKE_BUILD_TYPE: " ${CMAKE_BUILD_TYPE} ) 8 | 9 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) 10 | 11 | if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") 12 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++") 13 | endif () 14 | 15 | if(UNIX) 16 | 17 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14") 18 | 
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3") 19 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") 20 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb") 21 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wno-missing-braces") 22 | 23 | if(USE_SANITIZERS) 24 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fno-omit-frame-pointer") 25 | endif() 26 | 27 | if(RUNAWARE) 28 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DRUNAWARE") 29 | endif() 30 | 31 | endif() 32 | 33 | include_directories(${BIC_SOURCE_DIR}/include) 34 | 35 | add_subdirectory(src) 36 | add_subdirectory(test) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright 2019 Giulio Ermanno Pibiri 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included 13 | in all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR 19 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 20 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 21 | OTHER DEALINGS IN THE SOFTWARE. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Binary Interpolative Coding 2 | =========================== 3 | 4 | A C++ library implementing the *Binary Interpolative Coding* compression algorithm invented by Alistair Moffat and Lang Stuiver [1]. 5 | 6 | The algorithm can be used to compress sorted integer sequences (here, 7 | assumed to be increasing). 8 | 9 | The implementation comes in different flavours: 10 | you can choose between 11 | simple *binary* codes, *left-most minimal* codes and *centered minimal* codes. 12 | Additionally, the implementation is *run-aware*, i.e., 13 | it optimizes encoding/decoding of runs of consecutive identifiers. 14 | 15 | All details and experiments are provided in the following [technical report](http://pages.di.unipi.it/pibiri/papers/BIC.pdf) [2]. 16 | 17 | ##### Table of contents 18 | * [Compiling the code](#compiling-the-code) 19 | * [Quick Start](#quick-start) 20 | * [Encoding/decoding a collection of sequences](#encodingdecoding-a-collection-of-sequences) 21 | * [Benchmark](#benchmark) 22 | * [Author](#author) 23 | * [References](#references) 24 | 25 | Compiling the code 26 | ------------------ 27 | 28 | The code is tested on Linux with `gcc` 7.3.0, 8.3.0, 9.2.1 and on Mac 10.14 with `clang` 10.0.0. 29 | To build the code, [`CMake`](https://cmake.org/) is required. 30 | 31 | Clone the repository with 32 | 33 | git clone --recursive https://github.com/jermp/interpolative_coding.git 34 | 35 | If you have cloned the repository without `--recursive`, you will need to run the following commands before 36 | compiling: 37 | 38 | git submodule init 39 | git submodule update 40 | 41 | To compile the code for a release environment *and* best performance (see file `CMakeLists.txt` for the compilation flags used), do: 42 | 43 | mkdir build 44 | cd build 45 | cmake .. 
-DRUNAWARE=On 46 | make 47 | 48 | Hint: Use `make -j4` to compile the library in parallel using, e.g., 4 jobs. 49 | 50 | For a testing environment, use the following instead: 51 | 52 | mkdir debug_build 53 | cd debug_build 54 | cmake .. -DCMAKE_BUILD_TYPE=Debug -DUSE_SANITIZERS=On 55 | make 56 | 57 | Quick Start 58 | ------- 59 | 60 | For a quick start, see the source file `test/example.cpp`. 61 | After compilation, run this example with 62 | 63 | ./example leftmost_minimal 64 | 65 | A simpler variation is shown below. 66 | 67 | ```C++ 68 | #include <iostream> 69 | 70 | #include "interpolative_coding.hpp" 71 | using namespace bic; 72 | 73 | template <typename BinaryCode> 74 | void test(std::vector<uint32_t> const& in) { 75 | std::cout << "to be encoded:\n"; 76 | for (auto x : in) { 77 | std::cout << x << " "; 78 | } 79 | std::cout << std::endl; 80 | 81 | uint32_t n = in.size(); 82 | 83 | encoder<typename BinaryCode::writer> enc; 84 | enc.encode(in.data(), n); 85 | 86 | std::vector<uint32_t> out(n); 87 | decoder<typename BinaryCode::reader> dec; 88 | uint32_t m = dec.decode(enc.bits().data(), out.data()); 89 | assert(m == n); 90 | 91 | std::cout << "decoded " << m << " values" << std::endl; 92 | std::cout << "total bits " << enc.num_bits() << std::endl; 93 | std::cout << static_cast<double>(enc.num_bits()) / m << " bits x key" 94 | << std::endl; 95 | 96 | std::cout << "decoded:\n"; 97 | for (auto x : out) { 98 | std::cout << x << " "; 99 | } 100 | std::cout << std::endl; 101 | } 102 | 103 | int main(int argc, char** argv) { 104 | if (argc < 2) { 105 | std::cerr << argv[0] << " binary_code_type" << std::endl; 106 | return 1; 107 | } 108 | 109 | std::vector<uint32_t> in = {3, 4, 7, 13, 14, 15, 21, 25, 36, 38, 54, 62}; 110 | 111 | std::string type(argv[1]); 112 | 113 | if (type == "binary") { 114 | test<binary>(in); 115 | } else if (type == "leftmost_minimal") { 116 | test<leftmost_minimal>(in); 117 | } else if (type == "centered_minimal") { 118 | test<centered_minimal>(in); 119 | } else { 120 | std::cerr << "unknown type '" << type << "'" << std::endl; 121 | return 1; 122 | } 123 | 124 | return 0; 125 | } 126 | ``` 127 | 128 | Encoding/decoding a 
collection of sequences 129 | ---------------------------------- 130 | 131 | Typically, we want to encode all the sequences of 132 | a collection. 133 | In this case, we assume that the input collection 134 | is a binary file with all the sequences being written 135 | as 32-bit integers. In this library, we follow the 136 | input data format of the [`ds2i`](https://github.com/ot/ds2i) library: 137 | each sequence is prefixed by an additional 138 | 32-bit integer representing the size of the sequence. 139 | The collection file starts with a singleton sequence 140 | containing the universe of representation of the sequences, i.e., the maximum representable value. 141 | 142 | We also assume all sequences are *increasing*. 143 | 144 | The file `data/test_collection.docs` is an example of 145 | this format. 146 | 147 | To encode all the sequences from this file, do: 148 | 149 | ./encode leftmost_minimal ../data/test_collection.docs -o test.bin 150 | 151 | To decode all the sequences from the encoded file `test.bin`, do: 152 | 153 | ./decode leftmost_minimal test.bin 154 | 155 | To check correctness of the implementation, use: 156 | 157 | ./check leftmost_minimal test.bin ../data/test_collection.docs 158 | 159 | which will compare every decoded integer against the input collection. 160 | 161 | Benchmark 162 | ------ 163 | For this benchmark we used the whole Gov2 dataset, containing 164 | 5,742,630,292 integers in 35,636,425 sequences. 165 | 166 | We report the average number of bits per integer (bpi) 167 | and nanoseconds spent per decoded integer (with and without the 168 | run-aware optimization). 169 | 170 | We used two different Intel processors: i7-7700 171 | and i9-9900K, both clocked at 3.6 GHz and having 32K L1 caches for 172 | instructions and data. 173 | Both systems run Linux 4.4.0 and have 64 GB of RAM. 174 | The code was compiled with gcc 7.3.0 on the first 175 | system; with gcc 8.3.0 on the second. 
namespace bic {

// Returns the zero-based position of the most significant set bit of `x`.
// `x` must be non-zero (this is asserted before `__builtin_clz` is called).
inline uint32_t msb(uint32_t x) {
    assert(x > 0);
    return 31 - __builtin_clz(x);
}

// Append-only bit stream stored in a vector of 32-bit words.
// Bits are packed starting from the least significant bit of each word.
struct output {
    output() : m_size(0) {}

    // Appends the `len` least significant bits of `bits` to the stream.
    // Preconditions: `len <= 32` and `bits` has no bit set at position
    // `len` or above (stray high bits would be OR-ed into the stream).
    void append(uint32_t bits, uint32_t len) {
        if (!len) return;
        assert(len <= 32);
        assert((uint64_t(bits) >> len) == 0);
        uint32_t pos_in_word = m_size % 32;
        m_size += len;
        if (pos_in_word == 0) {
            // The previous word is full (or the stream is empty).
            m_bits.push_back(bits);
            return;
        }
        // The last word is addressed through the vector itself rather than
        // through a cached pointer, so copies of this object stay valid.
        m_bits.back() |= bits << pos_in_word;
        if (len > 32 - pos_in_word) {
            // The value straddles a word boundary: spill the high part.
            m_bits.push_back(bits >> (32 - pos_in_word));
        }
    }

    // Reserves storage for at least `bytes` bytes of encoded data.
    void reserve(size_t bytes) {
        m_bits.reserve((bytes + sizeof(uint32_t) - 1) / sizeof(uint32_t));
    }

    // Number of bits appended so far.
    size_t num_bits() const {
        return m_size;
    }

    // The encoded words; the unused high bits of the last word are zero.
    std::vector<uint32_t> const& bits() const {
        return m_bits;
    }

private:
    std::vector<uint32_t> m_bits;
    size_t m_size;
};

// Sequential bit reader over an array of 32-bit words written by `output`.
struct input {
    // A default-constructed reader has no source; call init() before take().
    input() : m_in(nullptr), m_avail(0), m_buf(0) {}

    input(uint32_t const* in) {
        init(in);
    }

    // (Re)starts reading from the first bit of `in`.
    void init(uint32_t const* in) {
        m_in = in;
        m_avail = 0;
        m_buf = 0;
    }

    // Extracts the next `len` bits (`len <= 32`) as an unsigned value.
    uint32_t take(uint32_t len) {
        if (!len) return 0;
        if (m_avail < len) {
            // Refill: at most 31 bits are pending, so 32 more always fit
            // in the 64-bit buffer.
            m_buf |= uint64_t(*m_in++) << m_avail;
            m_avail += 32;
        }
        uint32_t val = m_buf & ((uint64_t(1) << len) - 1);
        m_buf >>= len;
        m_avail -= len;
        return val;
    }

private:
    uint32_t const* m_in;
    uint32_t m_avail;
    uint64_t m_buf;
};

// Plain binary code: a value in [0, r] always takes msb(r) + 1 bits.
struct binary {
    struct writer : output {
        // Writes `x`, which must lie in [0, r].
        void write(uint32_t x, uint32_t r) {
#if RUNAWARE
            assert(r > 0);
#else
            if (!r) return;  // a range with a single value needs no bits
#endif
            assert(x <= r);
            uint32_t b = msb(r) + 1;
            append(x, b);
        }
    };

    struct reader : input {
        reader() {}
        reader(uint32_t const* encoded) : input(encoded) {}

        // Reads a value in [0, r] written by writer::write with the same r.
        uint32_t read(uint32_t r) {
#if RUNAWARE
            assert(r > 0);
#else
            if (!r) return 0;
#endif
            uint32_t b = msb(r) + 1;
            uint32_t x = take(b);
            assert(x <= r);
            return x;
        }
    };
};

// Left-most minimal binary code: the first `hi` values of [0, r] take
// msb(r) bits, the remaining ones take one bit more.
struct leftmost_minimal {
    struct writer : output {
        // Writes `x`, which must lie in [0, r].
        void write(uint32_t x, uint32_t r) {
#if RUNAWARE
            assert(r > 0);
#else
            if (!r) return;
#endif
            assert(x <= r);
            uint32_t b = msb(r);
            // Number of values that get the short, b-bit codeword.
            uint32_t hi = (uint64_t(1) << (b + 1)) - r - 1;
            if (x < hi) {
                append(x, b);
            } else {
                x += hi;
                // The high b bits go first so that the reader can decide
                // after b bits whether one more bit follows.
                append(x >> 1, b);
                append(x & 1, 1);
            }
        }
    };

    struct reader : input {
        reader() {}
        reader(uint32_t const* encoded) : input(encoded) {}

        // Reads a value in [0, r] written by writer::write with the same r.
        uint32_t read(uint32_t r) {
#if RUNAWARE
            assert(r > 0);
#else
            if (!r) return 0;
#endif
            uint32_t b = msb(r);
            uint32_t hi = (uint64_t(1) << (b + 1)) - r - 1;
            uint32_t x = take(b);
            if (x >= hi) x = (x << 1) + take(1) - hi;
            assert(x <= r);
            return x;
        }
    };
};

// Centered minimal binary code: the values strictly between `lo` and `hi`
// (the middle of [0, r]) take msb(r) bits, the others one bit more.
struct centered_minimal {
    struct writer : output {
        // Writes `x`, which must lie in [0, r].
        void write(uint32_t x, uint32_t r) {
#if RUNAWARE
            assert(r > 0);
#else
            if (!r) return;
#endif
            assert(x <= r);
            uint32_t b = msb(r);
            uint32_t c = (uint64_t(1) << (b + 1)) - r - 1;
            int64_t half_c = c / 2;
            int64_t half_r = r / 2;
            int64_t lo = half_r - half_c;
            int64_t hi = half_r + half_c + 1;
            if (r % 2 == 0) lo -= 1;
            if (x > lo and x < hi) {
                append(x, b);
            } else {
                append(x, b + 1);
            }
        }
    };

    struct reader : input {
        reader() {}
        reader(uint32_t const* encoded) : input(encoded) {}

        // Reads a value in [0, r] written by writer::write with the same r.
        uint32_t read(uint32_t r) {
#if RUNAWARE
            assert(r > 0);
#else
            if (!r) return 0;
#endif
            uint32_t b = msb(r);
            uint32_t c = (uint64_t(1) << (b + 1)) - r - 1;
            int64_t half_c = c / 2;
            int64_t half_r = r / 2;
            int64_t lo = half_r - half_c;
            if (r % 2 == 0) lo -= 1;
            uint32_t x = take(b);
            // Short codewords have their low b bits above `lo`; anything
            // else is a long codeword with one more (high) bit to read.
            if (x <= lo) x += take(1) << b;
            assert(x <= r);
            return x;
        }
    };
};

// Binary interpolative encoder. `Writer` is one of the `writer` types above.
template <typename Writer>
struct encoder {
    // Encodes the strictly increasing sequence input[0..n).
    // The last value is stored explicitly, followed by `n` when
    // `write_size` is true. Nothing is written for an empty sequence.
    void encode(uint32_t const* input, uint32_t n, bool write_size = true) {
        if (!n) return;
        uint32_t universe = input[n - 1];
        write_binary(universe);
        if (write_size) write_binary(n);
        encode(input, n - 1, 0, universe);
    }

    void reserve(size_t bytes) {
        m_writer.reserve(bytes);
    }

    size_t num_bits() const {
        return m_writer.num_bits();
    }

    auto const& bits() const {
        return m_writer.bits();
    }

private:
    // Encodes input[0..n), all of whose values lie in [lo, hi]: the middle
    // element first, then recursively the two halves.
    void encode(uint32_t const* input, uint32_t n, uint32_t lo, uint32_t hi) {
        if (!n) return;
#if RUNAWARE
        if (hi - lo + 1 == n) return;  // run
#endif
        assert(lo <= hi);
        assert(hi - lo >= n - 1);
        uint32_t m = n / 2;
        uint32_t x = input[m];
        // m values precede x and n - m - 1 follow it, which shrinks the
        // range of x to hi - lo - n + 1.
        m_writer.write(x - lo - m, hi - lo - n + 1);
        encode(input, m, lo, x - 1);
        encode(input + m + 1, n - m - 1, x + 1, hi);
    }

    // Writes x as a 5-bit length field (msb position) plus msb + 1 bits.
    void write_binary(uint32_t x) {
        uint32_t b = 0;
        if (x) b = msb(x);
        assert(b <= 31);
        m_writer.append(b, 5);
        m_writer.append(x, b + 1);
    }

    Writer m_writer;
};

// Binary interpolative decoder. `Reader` must be the `reader` type matching
// the `writer` used for encoding.
template <typename Reader>
struct decoder {
    decoder() {}
    decoder(uint32_t const* encoded) : m_reader(encoded) {}

    // Decodes the next sequence (encoded with its size) into `out` and
    // returns its length. A stored length of 0 yields 0 and writes nothing.
    uint32_t decode(uint32_t* out) {
        uint32_t universe = read_binary();
        uint32_t n = read_binary();
        if (!n) return 0;
        out[n - 1] = universe;
        decode(out, n - 1, 0, universe);
        return n;
    }

    // Decodes the next sequence of known length `n` (encoded without its
    // size) into `out`. The encoder writes nothing for an empty sequence,
    // so nothing is read when `n` is 0.
    uint32_t decode(uint32_t* out, uint32_t n) {
        if (!n) return 0;
        uint32_t universe = read_binary();
        out[n - 1] = universe;
        decode(out, n - 1, 0, universe);
        return n;
    }

    // Restarts from `encoded` and decodes one sequence stored with its size.
    uint32_t decode(uint32_t const* encoded, uint32_t* out) {
        m_reader.init(encoded);
        return decode(out);
    }

private:
    // Mirror of encoder::encode: fills out[0..n) with values in [lo, hi].
    void decode(uint32_t* out, uint32_t n, uint32_t lo, uint32_t hi) {
        if (!n) return;
        assert(lo <= hi);
#if RUNAWARE
        if (hi - lo + 1 == n) {  // run
            for (uint32_t i = 0; i != n; ++i) out[i] = lo++;
            return;
        }
#endif
        uint32_t m = n / 2;
        uint32_t x = m_reader.read(hi - lo - n + 1) + lo + m;
        out[m] = x;
        if (n == 1) return;
        decode(out, m, lo, x - 1);
        decode(out + m + 1, n - m - 1, x + 1, hi);
    }

    // Reads a value written by encoder::write_binary.
    uint32_t read_binary() {
        uint32_t b = m_reader.take(5);
        return m_reader.take(b + 1);
    }

    Reader m_reader;
};

// Compares expected[0..n) with got[0..n); reports the first mismatch on
// std::cerr and returns false, or returns true when all values agree.
// Declared inline because it is defined in a header.
inline bool check(uint32_t const* expected, uint32_t const* got, uint32_t n) {
    for (uint32_t i = 0; i != n; ++i) {
        if (expected[i] != got[i]) {
            std::cerr << "error at " << i << "/" << n << ": ";
            std::cerr << "expected " << expected[i] << " but got " << got[i]
                      << std::endl;
            return false;
        }
    }
    return true;
}

}  // namespace bic
"checking " << sequences << " sequences..." << std::endl; 24 | size_t decoded_ints = 0; 25 | bool all_good = true; 26 | 27 | for (uint32_t i = 0; i != sequences; ++i) { 28 | uint32_t n = dec.decode(decoded.data()); 29 | decoded_ints += n; 30 | if (n != input[0]) { 31 | std::cerr << "decoded " << n << " integers but expected " 32 | << input[0] << std::endl; 33 | return; 34 | } 35 | all_good &= check(input + 1, decoded.data(), n); 36 | input += n + 1; 37 | 38 | if (i and i % 100000 == 0) { 39 | std::cout << " checked " << i << " sequences" << std::endl; 40 | } 41 | } 42 | 43 | std::cout << "DONE" << std::endl; 44 | if (all_good) { 45 | std::cout << "everything good" << std::endl; 46 | } 47 | } 48 | 49 | int main(int argc, char** argv) { 50 | if (argc < 4) { 51 | std::cerr << argv[0] 52 | << " decoder_type binary_filename collection_filename" 53 | << std::endl; 54 | return 1; 55 | } 56 | 57 | std::string type(argv[1]); 58 | char const* binary_filename = argv[2]; 59 | char const* collection_filename = argv[3]; 60 | 61 | if (type == "binary") { 62 | check>(binary_filename, collection_filename); 63 | } else if (type == "leftmost_minimal") { 64 | check>(binary_filename, 65 | collection_filename); 66 | } else if (type == "centered_minimal") { 67 | check>(binary_filename, 68 | collection_filename); 69 | } else { 70 | std::cerr << "unknown type '" << type << "'" << std::endl; 71 | return 1; 72 | } 73 | 74 | return 0; 75 | } 76 | -------------------------------------------------------------------------------- /src/decode.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "../external/mm_file/include/mm_file/mm_file.hpp" 5 | #include "interpolative_coding.hpp" 6 | 7 | using namespace bic; 8 | typedef std::chrono::high_resolution_clock clock_type; 9 | 10 | template 11 | void encode(char const* input_filename) { 12 | int advice = mm::advice::sequential; 13 | mm::file_source input(input_filename, 
advice); 14 | uint32_t const* data = input.data(); 15 | uint32_t universe = data[0]; 16 | uint32_t sequences = data[1]; 17 | Decoder dec(data + 2); 18 | std::vector decoded(universe); 19 | 20 | std::cout << "decoding " << sequences << " sequences..." << std::endl; 21 | size_t decoded_ints = 0; 22 | auto start = clock_type::now(); 23 | for (uint32_t i = 0; i != sequences; ++i) { 24 | uint32_t n = dec.decode(decoded.data()); 25 | decoded_ints += n; 26 | } 27 | auto finish = clock_type::now(); 28 | std::cout << "DONE" << std::endl; 29 | 30 | std::chrono::duration elapsed = finish - start; 31 | std::cout << "decoded " << decoded_ints << " integers in " 32 | << elapsed.count() << " [sec]" << std::endl; 33 | std::cout << elapsed.count() * 1000000000 / decoded_ints << " ns/int" 34 | << std::endl; 35 | std::cout << "using " << input.bytes() * 8.0 / decoded_ints << " bits x int" 36 | << std::endl; 37 | } 38 | 39 | int main(int argc, char** argv) { 40 | if (argc < 3) { 41 | std::cerr << argv[0] << " decoder_type input_filename" << std::endl; 42 | return 1; 43 | } 44 | 45 | std::string type(argv[1]); 46 | char const* input_filename = argv[2]; 47 | std::cout << "type: '" << type << "':\n"; 48 | 49 | if (type == "binary") { 50 | encode>(input_filename); 51 | } else if (type == "leftmost_minimal") { 52 | encode>(input_filename); 53 | } else if (type == "centered_minimal") { 54 | encode>(input_filename); 55 | } else { 56 | std::cerr << "unknown type '" << type << "'" << std::endl; 57 | return 1; 58 | } 59 | 60 | return 0; 61 | } 62 | -------------------------------------------------------------------------------- /src/encode.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "../external/mm_file/include/mm_file/mm_file.hpp" 6 | #include "interpolative_coding.hpp" 7 | 8 | using namespace bic; 9 | static const uint64_t GiB = 1024 * 1024 * 1024; 10 | 11 | template 12 | void encode(char const* 
input_filename, char const* output_filename) { 13 | mm::file_source input(input_filename); 14 | uint32_t const* data = input.data(); 15 | assert(data[0] == 1); 16 | uint32_t universe = data[1]; 17 | 18 | Encoder enc; 19 | enc.reserve(10 * GiB); 20 | 21 | std::cout << "encoding data..." << std::endl; 22 | uint32_t encoded_sequences = 0; 23 | size_t encoded_ints = 0; 24 | for (size_t i = 2; i < input.size();) { 25 | uint32_t n = data[i]; 26 | enc.encode(data + i + 1, n); 27 | i += n + 1; 28 | encoded_ints += n; 29 | encoded_sequences += 1; 30 | if (encoded_sequences % 100000 == 0) { 31 | std::cout << " encoded " << encoded_sequences << " sequences" 32 | << std::endl; 33 | } 34 | } 35 | std::cout << "DONE" << std::endl; 36 | 37 | std::cout << "encoded " << encoded_sequences << " sequences" << std::endl; 38 | std::cout << "encoded " << encoded_ints << " integers" << std::endl; 39 | 40 | // NOTE: slightly larger than enc.num_bits() due to padding 41 | std::cout << "using " 42 | << enc.bits().size() * sizeof(enc.bits().front()) * 8.0 / 43 | encoded_ints 44 | << " bits x int" << std::endl; 45 | 46 | if (output_filename) { 47 | std::ofstream out(output_filename, std::ios::binary); 48 | if (!out.is_open()) { 49 | std::cerr << "error in opening binary file" << std::endl; 50 | return; 51 | } 52 | std::cout << "writing encoded data to disk..." 
<< std::endl; 53 | 54 | // save also: 55 | // - universe (max sequence length) 56 | // - number of encoded sequences 57 | out.write(reinterpret_cast(&universe), sizeof(universe)); 58 | out.write(reinterpret_cast(&encoded_sequences), 59 | sizeof(encoded_sequences)); 60 | 61 | out.write(reinterpret_cast(enc.bits().data()), 62 | static_cast(sizeof(enc.bits().front()) * 63 | enc.bits().size())); 64 | std::cout << "DONE" << std::endl; 65 | } 66 | } 67 | 68 | int main(int argc, char** argv) { 69 | if (argc < 3) { 70 | std::cerr << argv[0] 71 | << " encoder_type input_filename -o output_filename" 72 | << std::endl; 73 | return 1; 74 | } 75 | 76 | std::string type(argv[1]); 77 | char const* input_filename = argv[2]; 78 | char const* output_filename = nullptr; 79 | 80 | if (argc > 3 and argv[3] == std::string("-o")) { 81 | output_filename = argv[4]; 82 | } 83 | 84 | if (type == "binary") { 85 | encode>(input_filename, output_filename); 86 | } else if (type == "leftmost_minimal") { 87 | encode>(input_filename, 88 | output_filename); 89 | } else if (type == "centered_minimal") { 90 | encode>(input_filename, 91 | output_filename); 92 | } else { 93 | std::cerr << "unknown type '" << type << "'" << std::endl; 94 | return 1; 95 | } 96 | 97 | return 0; 98 | } 99 | -------------------------------------------------------------------------------- /test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(example example.cpp) -------------------------------------------------------------------------------- /test/example.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "interpolative_coding.hpp" 4 | using namespace bic; 5 | 6 | template 7 | void test(std::vector const& in) { 8 | std::cout << "to be encoded:\n"; 9 | for (auto x : in) { 10 | std::cout << x << " "; 11 | } 12 | std::cout << std::endl; 13 | 14 | { 15 | uint32_t n = in.size(); 16 | encoder enc; 17 
| enc.encode(in.data(), n); // save n by feault 18 | std::vector out(n); 19 | decoder dec; 20 | uint32_t m = dec.decode(enc.bits().data(), out.data()); 21 | assert(m == n); 22 | 23 | std::cout << "decoded " << m << " values" << std::endl; 24 | std::cout << "total bits " << enc.num_bits() << std::endl; 25 | std::cout << static_cast(enc.num_bits()) / m << " bits x key" 26 | << std::endl; 27 | 28 | std::cout << "decoded:\n"; 29 | for (auto x : out) { 30 | std::cout << x << " "; 31 | } 32 | std::cout << std::endl; 33 | } 34 | 35 | { 36 | uint32_t n = in.size(); 37 | encoder enc; 38 | enc.encode(in.data(), n, 39 | false // do not save n 40 | ); 41 | std::vector out(n); 42 | decoder dec(enc.bits().data()); 43 | uint32_t m = dec.decode(out.data(), n); 44 | assert(m == n); 45 | 46 | std::cout << "decoded " << m << " values" << std::endl; 47 | std::cout << "total bits " << enc.num_bits() << std::endl; 48 | std::cout << static_cast(enc.num_bits()) / m << " bits x key" 49 | << std::endl; 50 | 51 | std::cout << "decoded:\n"; 52 | for (auto x : out) { 53 | std::cout << x << " "; 54 | } 55 | std::cout << std::endl; 56 | } 57 | } 58 | 59 | int main(int argc, char** argv) { 60 | if (argc < 2) { 61 | std::cerr << argv[0] << " binary_code_type" << std::endl; 62 | return 1; 63 | } 64 | 65 | // std::vector in = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 66 | // 12, 67 | // 13, 14, 15, 16, 17, 18, 19, 20, 23, 25, 32}; 68 | // std::vector in = {3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 69 | // 13, 14, 15, 21, 25, 36, 38, 54, 62}; 70 | 71 | std::vector in = {3, 4, 7, 13, 14, 15, 21, 25, 36, 38, 54, 62}; 72 | 73 | // std::vector in = {0, 1, 2, 3, 4, 5, 21, 25, 36, 38, 54, 62}; 74 | 75 | std::string type(argv[1]); 76 | 77 | if (type == "binary") { 78 | test(in); 79 | } else if (type == "leftmost_minimal") { 80 | test(in); 81 | } else if (type == "centered_minimal") { 82 | test(in); 83 | } else { 84 | std::cerr << "unknown type '" << type << "'" << std::endl; 85 | return 1; 86 | } 87 | 88 | return 0; 89 | 
} --------------------------------------------------------------------------------