├── .clang-format
├── .gitignore
├── .gitmodules
├── CMakeLists.txt
├── LICENSE
├── README.md
├── data
    └── test_collection.docs
├── include
    └── interpolative_coding.hpp
├── src
    ├── CMakeLists.txt
    ├── check.cpp
    ├── decode.cpp
    └── encode.cpp
└── test
    ├── CMakeLists.txt
    └── example.cpp


/.clang-format:
--------------------------------------------------------------------------------
  1 | ---
  2 | Language:        Cpp
  3 | # BasedOnStyle:  Google
  4 | AccessModifierOffset: -4
  5 | AlignAfterOpenBracket: Align
  6 | AlignConsecutiveAssignments: false
  7 | AlignConsecutiveDeclarations: false
  8 | AlignEscapedNewlines: Left
  9 | AlignOperands:   true
 10 | AlignTrailingComments: true
 11 | AllowAllParametersOfDeclarationOnNextLine: true
 12 | AllowShortBlocksOnASingleLine: false
 13 | AllowShortCaseLabelsOnASingleLine: false
 14 | AllowShortFunctionsOnASingleLine: Empty
 15 | AllowShortIfStatementsOnASingleLine: true
 16 | AllowShortLoopsOnASingleLine: true
 17 | AlwaysBreakAfterDefinitionReturnType: None
 18 | AlwaysBreakAfterReturnType: None
 19 | AlwaysBreakBeforeMultilineStrings: true
 20 | AlwaysBreakTemplateDeclarations: Yes
 21 | BinPackArguments: true
 22 | BinPackParameters: true
 23 | BraceWrapping:
 24 |   AfterClass:      false
 25 |   AfterControlStatement: false
 26 |   AfterEnum:       false
 27 |   AfterFunction:   false
 28 |   AfterNamespace:  false
 29 |   AfterObjCDeclaration: false
 30 |   AfterStruct:     false
 31 |   AfterUnion:      false
 32 |   AfterExternBlock: false
 33 |   BeforeCatch:     false
 34 |   BeforeElse:      false
 35 |   IndentBraces:    false
 36 |   SplitEmptyFunction: true
 37 |   SplitEmptyRecord: true
 38 |   SplitEmptyNamespace: true
 39 | BreakBeforeBinaryOperators: None
 40 | BreakBeforeBraces: Attach
 41 | BreakBeforeInheritanceComma: false
 42 | BreakInheritanceList: BeforeComma
 43 | BreakBeforeTernaryOperators: true
 44 | BreakConstructorInitializersBeforeComma: true
 45 | BreakConstructorInitializers: BeforeComma
 46 | BreakAfterJavaFieldAnnotations: false
 47 | BreakStringLiterals: true
 48 | ColumnLimit:     80
 49 | CommentPragmas:  '^ IWYU pragma:'
 50 | CompactNamespaces: false
 51 | ConstructorInitializerAllOnOneLineOrOnePerLine: true
 52 | ConstructorInitializerIndentWidth: 4
 53 | ContinuationIndentWidth: 4
 54 | Cpp11BracedListStyle: true
 55 | DerivePointerAlignment: false
 56 | DisableFormat:   false
 57 | ExperimentalAutoDetectBinPacking: false
 58 | FixNamespaceComments: true
 59 | ForEachMacros:
 60 |   - foreach
 61 |   - Q_FOREACH
 62 |   - BOOST_FOREACH
 63 | IncludeBlocks:   Preserve
 64 | IncludeCategories:
 65 |   - Regex:           '^<ext/.*\.h>'
 66 |     Priority:        2
 67 |   - Regex:           '^<.*\.h>'
 68 |     Priority:        1
 69 |   - Regex:           '^<.*'
 70 |     Priority:        2
 71 |   - Regex:           '.*'
 72 |     Priority:        3
 73 | IncludeIsMainRegex: '([-_](test|unittest))?$'
 74 | IndentCaseLabels: true
 75 | IndentPPDirectives: None
 76 | IndentWidth:     4
 77 | IndentWrappedFunctionNames: false
 78 | JavaScriptQuotes: Leave
 79 | JavaScriptWrapImports: true
 80 | KeepEmptyLinesAtTheStartOfBlocks: false
 81 | MacroBlockBegin: ''
 82 | MacroBlockEnd:   ''
 83 | MaxEmptyLinesToKeep: 1
 84 | NamespaceIndentation: None
 85 | ObjCBinPackProtocolList: Never
 86 | ObjCBlockIndentWidth: 2
 87 | ObjCSpaceAfterProperty: false
 88 | ObjCSpaceBeforeProtocolList: true
 89 | PenaltyBreakAssignment: 2
 90 | PenaltyBreakBeforeFirstCallParameter: 1
 91 | PenaltyBreakComment: 300
 92 | PenaltyBreakFirstLessLess: 120
 93 | PenaltyBreakString: 1000
 94 | PenaltyBreakTemplateDeclaration: 10
 95 | PenaltyExcessCharacter: 1000000
 96 | PenaltyReturnTypeOnItsOwnLine: 200
 97 | PointerAlignment: Left
 98 | RawStringFormats:
 99 |   - Language:        Cpp
100 |     Delimiters:
101 |       - cc
102 |       - CC
103 |       - cpp
104 |       - Cpp
105 |       - CPP
106 |       - 'c++'
107 |       - 'C++'
108 |     CanonicalDelimiter: ''
109 |     BasedOnStyle:    google
110 |   - Language:        TextProto
111 |     Delimiters:
112 |       - pb
113 |       - PB
114 |       - proto
115 |       - PROTO
116 |     EnclosingFunctions:
117 |       - EqualsProto
118 |       - EquivToProto
119 |       - PARSE_PARTIAL_TEXT_PROTO
120 |       - PARSE_TEST_PROTO
121 |       - PARSE_TEXT_PROTO
122 |       - ParseTextOrDie
123 |       - ParseTextProtoOrDie
124 |     CanonicalDelimiter: ''
125 |     BasedOnStyle:    google
126 | ReflowComments:  true
127 | SortIncludes:    false
128 | SortUsingDeclarations: false
129 | SpaceAfterCStyleCast: false
130 | SpaceAfterTemplateKeyword: true
131 | SpaceBeforeAssignmentOperators: true
132 | SpaceBeforeCpp11BracedList: false
133 | SpaceBeforeCtorInitializerColon: true
134 | SpaceBeforeInheritanceColon: true
135 | SpaceBeforeParens: ControlStatements
136 | SpaceBeforeRangeBasedForLoopColon: true
137 | SpaceInEmptyParentheses: false
138 | SpacesBeforeTrailingComments: 2
139 | SpacesInAngles:  false
140 | SpacesInContainerLiterals: true
141 | SpacesInCStyleCastParentheses: false
142 | SpacesInParentheses: false
143 | SpacesInSquareBrackets: false
144 | Standard:        Auto
145 | StatementMacros:
146 |   - Q_UNUSED
147 |   - QT_REQUIRE_VERSION
148 | TabWidth:        8
149 | UseTab:          Never
150 | ...
151 | 
152 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | build
3 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "external/mm_file"]
2 | 	path = external/mm_file
3 | 	url = https://github.com/jermp/mm_file.git
4 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.8)
 2 | project(BIC)
 3 | 
 4 | if(NOT CMAKE_BUILD_TYPE)
 5 |   set(CMAKE_BUILD_TYPE "Release")
 6 | endif()
 7 | MESSAGE( STATUS "CMAKE_BUILD_TYPE: " ${CMAKE_BUILD_TYPE} )
 8 | 
 9 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
10 | 
11 | if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
12 |   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++")
13 | endif ()
14 | 
15 | if(UNIX)
16 | 
17 |   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14")
18 |   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
19 |   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
20 |   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ggdb")
21 |   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wno-missing-braces")
22 | 
23 |   if(USE_SANITIZERS)
24 |     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fno-omit-frame-pointer")
25 |   endif()
26 | 
27 |   if(RUNAWARE)
28 |     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DRUNAWARE")
29 |   endif()
30 | 
31 | endif()
32 | 
33 | include_directories(${BIC_SOURCE_DIR}/include)
34 | 
35 | add_subdirectory(src)
36 | add_subdirectory(test)


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright 2019 Giulio Ermanno Pibiri
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included
13 | in all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
19 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
20 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
21 | OTHER DEALINGS IN THE SOFTWARE.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | Binary Interpolative Coding
  2 | ===========================
  3 | 
  4 | A C++ library implementing the *Binary Interpolative Coding* compression algorithm invented by Alistair Moffat and Lang Stuiver [1].
  5 | 
  6 | The algorithm can be used to compress sorted integer sequences (here,
  7 | assumed to be increasing).
  8 | 
  9 | The implementation comes in different flavours:
 10 | you can choose between
 11 | simple *binary* codes, *left-most minimal* codes and *centered minimal* codes.
 12 | Additionally, the implementation is *run-aware*, i.e.,
 13 | it optimizes encoding/decoding of runs of consecutive identifiers.
 14 | 
 15 | All details and experiments are provided in the following [technical report](http://pages.di.unipi.it/pibiri/papers/BIC.pdf) [2].
 16 | 
 17 | ##### Table of contents
 18 | * [Compiling the code](#compiling-the-code)
 19 | * [Quick Start](#quick-start)
 20 | * [Encoding/decoding a collection of sequences](#encodingdecoding-a-collection-of-sequences)
 21 | * [Benchmark](#benchmark)
 22 | * [Author](#author)
 23 | * [References](#references)
 24 | 
 25 | Compiling the code
 26 | ------------------
 27 | 
 28 | The code is tested on Linux with `gcc` 7.3.0, 8.3.0, 9.2.1 and on Mac 10.14 with `clang` 10.0.0.
 29 | To build the code, [`CMake`](https://cmake.org/) is required.
 30 | 
 31 | Clone the repository with
 32 | 
 33 | 	git clone --recursive https://github.com/jermp/interpolative_coding.git
 34 | 
 35 | If you have cloned the repository without `--recursive`, you will need to perform the following commands before
 36 | compiling:
 37 | 
 38 |     git submodule init
 39 |     git submodule update
 40 | 
 41 | To compile the code for a release environment *and* best performance (see file `CMakeLists.txt` for the used compilation flags), do:
 42 | 
 43 |     mkdir build
 44 |     cd build
 45 |     cmake .. -DRUNAWARE=On
 46 |     make
 47 | 
 48 | Hint: Use `make -j4` to compile the library in parallel using, e.g., 4 jobs.
 49 | 
 50 | For a testing environment, use the following instead:
 51 | 
 52 |     mkdir debug_build
 53 |     cd debug_build
 54 |     cmake .. -DCMAKE_BUILD_TYPE=Debug -DUSE_SANITIZERS=On
 55 |     make
 56 | 
 57 | Quick Start
 58 | -------
 59 | 
 60 | For a quick start, see the source file `test/example.cpp`.
 61 | After compilation, run this example with
 62 | 
 63 | 	./example
 64 | 
 65 | A simpler variation is shown below.
 66 | 
 67 | ```C++
 68 | #include <iostream>
 69 | 
 70 | #include "interpolative_coding.hpp"
 71 | using namespace bic;
 72 | 
 73 | template <typename BinaryCode>
 74 | void test(std::vector<uint32_t> const& in) {
 75 |     std::cout << "to be encoded:\n";
 76 |     for (auto x : in) {
 77 |         std::cout << x << " ";
 78 |     }
 79 |     std::cout << std::endl;
 80 | 
 81 |     uint32_t n = in.size();
 82 | 
 83 |     encoder<typename BinaryCode::writer> enc;
 84 |     enc.encode(in.data(), n);
 85 | 
 86 |     std::vector<uint32_t> out(n);
 87 |     decoder<typename BinaryCode::reader> dec;
 88 |     uint32_t m = dec.decode(enc.bits().data(), out.data());
 89 |     assert(m == n);
 90 | 
 91 |     std::cout << "decoded " << m << " values" << std::endl;
 92 |     std::cout << "total bits " << enc.num_bits() << std::endl;
 93 |     std::cout << static_cast<double>(enc.num_bits()) / m << " bits x key"
 94 |               << std::endl;
 95 | 
 96 |     std::cout << "decoded:\n";
 97 |     for (auto x : out) {
 98 |         std::cout << x << " ";
 99 |     }
100 |     std::cout << std::endl;
101 | }
102 | 
103 | int main(int argc, char** argv) {
104 |     if (argc < 2) {
105 |         std::cerr << argv[0] << " binary_code_type" << std::endl;
106 |         return 1;
107 |     }
108 | 
109 |     std::vector<uint32_t> in = {3, 4, 7, 13, 14, 15, 21, 25, 36, 38, 54, 62};
110 | 
111 |     std::string type(argv[1]);
112 | 
113 |     if (type == "binary") {
114 |         test<binary>(in);
115 |     } else if (type == "leftmost_minimal") {
116 |         test<leftmost_minimal>(in);
117 |     } else if (type == "centered_minimal") {
118 |         test<centered_minimal>(in);
119 |     } else {
120 |         std::cerr << "unknown type '" << type << "'" << std::endl;
121 |         return 1;
122 |     }
123 | 
124 |     return 0;
125 | }
126 | ```
127 | 
128 | Encoding/decoding a collection of sequences
129 | ----------------------------------
130 | 
131 | Typically, we want to build all the sequences from
132 | a collection.
133 | In this case, we assume that the input collection
134 | is a binary file with all the sequences being written
135 | as 32-bit integers. In this library, we follow the
136 | input data format of the [`ds2i`](https://github.com/ot/ds2i) library:
137 | each sequence is prefixed by an additional
138 | 32-bit integer representing the size of the sequence.
139 | The collection file starts with a singleton sequence
140 | containing the universe of representation of the sequences, i.e., the maximum representable value.
141 | 
142 | We also assume all sequences are *increasing*.
143 | 
144 | The file `data/test_collection.docs` represents an example of
145 | such organization.
146 | 
147 | To encode all the sequences from this file, do:
148 | 
149 | 	./encode leftmost_minimal ../data/test_collection.docs -o test.bin
150 | 
151 | To decode all the sequences from the encoded file `test.bin`, do:
152 | 
153 | 	./decode leftmost_minimal test.bin
154 | 
155 | To check correctness of the implementation, use:
156 | 
157 | 	./check leftmost_minimal test.bin ../data/test_collection.docs
158 | 
159 | which will compare every decoded integer against the input collection.
160 | 
161 | Benchmark
162 | ------
163 | For this benchmark we used the whole Gov2 dataset, containing
164 | 5,742,630,292 integers in 35,636,425 sequences.
165 | 
166 | We report the average number of bits per integer (bpi)
167 | and nanoseconds spent per decoded integer (with and without the
168 | run-aware optimization).
169 | 
170 | We used two different Intel processors: i7-7700
171 | and i9-9900K, both clocked at 3.6 GHz and having 32K L1 caches for
172 | instructions and data.
173 | Both systems run Linux 4.4.0 and have 64 GB of RAM.
174 | The code was compiled with gcc 7.3.0 on the first
175 | system; with gcc 8.3.0 on the second.
176 | In both cases we used all optimizations
177 | (see also `CMakeLists.txt`).
178 | 
179 | |**Method**        |**bpi** | **ns/int (run-aware) on i7-7700**  | **ns/int (not run-aware) on i7-7700**| **ns/int (run-aware) on i9-9900K** | **ns/int (not run-aware) on i9-9900K**|
180 | |:-----------------|:------:|:------------------:|:------:|:-----:|:-----:|
181 | |simple            |3.532   | 3.45               | 4.65   | 2.52  | 3.37  |
182 | |left-most minimal |3.362   | 5.78               | 7.07   | 4.18  | 5.28  |
183 | |centered minimal  |3.361   | 5.78               | 7.07   | 4.24  | 5.33  |
184 | 
185 | Author
186 | ------
187 | * [Giulio Ermanno Pibiri](http://pages.di.unipi.it/pibiri/), <giulio.ermanno.pibiri@isti.cnr.it>
188 | 
189 | References
190 | -------
191 | * [1] Alistair Moffat and Lang Stuiver. 2000. *Binary Interpolative Coding for Effective Index Compression*. Information Retrieval Journal 3, 1 (2000), 25 – 47.
192 | * [2] Giulio Ermanno Pibiri. 2019. *On Implementing the Binary Interpolative Coding Algorithm*. Technical report. [http://pages.di.unipi.it/pibiri/papers/BIC.pdf](http://pages.di.unipi.it/pibiri/papers/BIC.pdf)
193 | 


--------------------------------------------------------------------------------
/data/test_collection.docs:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jermp/interpolative_coding/bbcbd31ea568c10de37c5f78cedfa85ba43fd98b/data/test_collection.docs


--------------------------------------------------------------------------------
/include/interpolative_coding.hpp:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>
  5 | 
  6 | namespace bic {
  7 | 
  8 | inline uint32_t msb(uint32_t x) {
  9 |     assert(x > 0);
 10 |     return 31 - __builtin_clz(x);
 11 | }
 12 | 
 13 | struct output {
 14 |     output() : m_size(0), m_cur_word(nullptr) {}
 15 | 
 16 |     void append(uint32_t bits, uint32_t len) {
 17 |         if (!len) return;
 18 |         uint32_t pos_in_word = m_size % 32;
 19 |         m_size += len;
 20 |         if (pos_in_word == 0) {
 21 |             m_bits.push_back(bits);
 22 |         } else {
 23 |             *m_cur_word |= bits << pos_in_word;
 24 |             if (len > 32 - pos_in_word) {
 25 |                 m_bits.push_back(bits >> (32 - pos_in_word));
 26 |             }
 27 |         }
 28 |         m_cur_word = &m_bits.back();
 29 |     }
 30 | 
 31 |     void reserve(size_t bytes) {
 32 |         m_bits.reserve((bytes + sizeof(m_bits.front()) - 1) /
 33 |                        sizeof(m_bits.front()));
 34 |     }
 35 | 
 36 |     size_t num_bits() const {
 37 |         return m_size;
 38 |     }
 39 | 
 40 |     std::vector<uint32_t> const& bits() const {
 41 |         return m_bits;
 42 |     }
 43 | 
 44 | private:
 45 |     std::vector<uint32_t> m_bits;
 46 |     size_t m_size;
 47 |     uint32_t* m_cur_word;
 48 | };
 49 | 
 50 | struct input {
 51 |     input() {}
 52 | 
 53 |     input(uint32_t const* in) {
 54 |         init(in);
 55 |     }
 56 | 
 57 |     void init(uint32_t const* in) {
 58 |         m_in = in;
 59 |         m_avail = 0;
 60 |         m_buf = 0;
 61 |     }
 62 | 
 63 |     uint32_t take(uint32_t len) {
 64 |         if (!len) return 0;
 65 |         if (m_avail < len) {
 66 |             m_buf |= uint64_t(*m_in++) << m_avail;
 67 |             m_avail += 32;
 68 |         }
 69 |         uint32_t val = m_buf & ((uint64_t(1) << len) - 1);
 70 |         m_buf >>= len;
 71 |         m_avail -= len;
 72 |         return val;
 73 |     }
 74 | 
 75 | private:
 76 |     uint32_t const* m_in;
 77 |     uint32_t m_avail;
 78 |     uint64_t m_buf;
 79 | };
 80 | 
 81 | struct binary {
 82 |     struct writer : output {
 83 |         void write(uint32_t x, uint32_t r) {
 84 | #if RUNAWARE
 85 |             assert(r > 0);
 86 | #else
 87 |             if (!r) return;
 88 | #endif
 89 |             assert(x <= r);
 90 |             uint32_t b = msb(r) + 1;
 91 |             append(x, b);
 92 |         }
 93 |     };
 94 | 
 95 |     struct reader : input {
 96 |         reader() {}
 97 |         reader(uint32_t const* encoded) : input(encoded) {}
 98 | 
 99 |         uint32_t read(uint32_t r) {
100 | #if RUNAWARE
101 |             assert(r > 0);
102 | #else
103 |             if (!r) return 0;
104 | #endif
105 |             uint32_t b = msb(r) + 1;
106 |             uint32_t x = take(b);
107 |             assert(x <= r);
108 |             return x;
109 |         }
110 |     };
111 | };
112 | 
113 | struct leftmost_minimal {
114 |     struct writer : output {
115 |         void write(uint32_t x, uint32_t r) {
116 | #if RUNAWARE
117 |             assert(r > 0);
118 | #else
119 |             if (!r) return;
120 | #endif
121 |             assert(x <= r);
122 |             uint32_t b = msb(r);
123 |             uint32_t hi = (uint64_t(1) << (b + 1)) - r - 1;
124 |             if (x < hi) {
125 |                 append(x, b);
126 |             } else {
127 |                 x += hi;
128 |                 append(x >> 1, b);
129 |                 append(x & 1, 1);
130 |             }
131 |         }
132 |     };
133 | 
134 |     struct reader : input {
135 |         reader() {}
136 |         reader(uint32_t const* encoded) : input(encoded) {}
137 | 
138 |         uint32_t read(uint32_t r) {
139 | #if RUNAWARE
140 |             assert(r > 0);
141 | #else
142 |             if (!r) return 0;
143 | #endif
144 |             uint32_t b = msb(r);
145 |             uint32_t hi = (uint64_t(1) << (b + 1)) - r - 1;
146 |             uint32_t x = take(b);
147 |             if (x >= hi) x = (x << 1) + take(1) - hi;
148 |             assert(x <= r);
149 |             return x;
150 |         }
151 |     };
152 | };
153 | 
154 | struct centered_minimal {
155 |     struct writer : output {
156 |         void write(uint32_t x, uint32_t r) {
157 | #if RUNAWARE
158 |             assert(r > 0);
159 | #else
160 |             if (!r) return;
161 | #endif
162 |             uint32_t b = msb(r);
163 |             uint32_t c = (uint64_t(1) << (b + 1)) - r - 1;
164 |             int64_t half_c = c / 2;
165 |             int64_t half_r = r / 2;
166 |             int64_t lo, hi;
167 |             lo = half_r - half_c;
168 |             hi = half_r + half_c + 1;
169 |             if (r % 2 == 0) lo -= 1;
170 |             if (x > lo and x < hi) {
171 |                 append(x, b);
172 |             } else {
173 |                 append(x, b + 1);
174 |             }
175 |         }
176 |     };
177 | 
178 |     struct reader : input {
179 |         reader() {}
180 |         reader(uint32_t const* encoded) : input(encoded) {}
181 | 
182 |         uint32_t read(uint32_t r) {
183 | #if RUNAWARE
184 |             assert(r > 0);
185 | #else
186 |             if (!r) return 0;
187 | #endif
188 |             uint32_t b = msb(r);
189 |             uint32_t c = (uint64_t(1) << (b + 1)) - r - 1;
190 |             int64_t half_c = c / 2;
191 |             int64_t half_r = r / 2;
192 |             int64_t lo;
193 |             lo = half_r - half_c;
194 |             if (r % 2 == 0) lo -= 1;
195 |             uint32_t x = take(b);
196 |             if (x <= lo) x += take(1) << b;
197 |             assert(x <= r);
198 |             return x;
199 |         }
200 |     };
201 | };
202 | 
203 | template <typename Writer>
204 | struct encoder {
205 |     void encode(uint32_t const* input, uint32_t n, bool write_size = true) {
206 |         if (!n) return;
207 |         uint32_t universe = input[n - 1];
208 |         write_binary(universe);
209 |         if (write_size) write_binary(n);
210 |         encode(input, n - 1, 0, universe);
211 |     }
212 | 
213 |     void reserve(size_t bytes) {
214 |         return m_writer.reserve(bytes);
215 |     }
216 | 
217 |     size_t num_bits() const {
218 |         return m_writer.num_bits();
219 |     }
220 | 
221 |     auto const& bits() const {
222 |         return m_writer.bits();
223 |     }
224 | 
225 | private:
226 |     void encode(uint32_t const* input, uint32_t n, uint32_t lo, uint32_t hi) {
227 |         if (!n) return;
228 | #if RUNAWARE
229 |         if (hi - lo + 1 == n) return;  // run
230 | #endif
231 |         assert(lo <= hi);
232 |         assert(hi - lo >= n - 1);
233 |         uint32_t m = n / 2;
234 |         uint32_t x = input[m];
235 |         m_writer.write(x - lo - m, hi - lo - n + 1);
236 |         encode(input, m, lo, x - 1);
237 |         encode(input + m + 1, n - m - 1, x + 1, hi);
238 |     }
239 | 
240 |     void write_binary(uint32_t x) {
241 |         uint32_t b = 0;
242 |         if (x) b = msb(x);
243 |         assert(b <= 31);
244 |         m_writer.append(b, 5);
245 |         m_writer.append(x, b + 1);
246 |     }
247 | 
248 |     Writer m_writer;
249 | };
250 | 
// Binary interpolative decoder reading its codewords through `Reader`.
template <typename Reader>
struct decoder {
    decoder() {}
    decoder(uint32_t const* encoded) : m_reader(encoded) {}

    // Decodes the next sequence, whose size is stored in the stream, into
    // `out` and returns the number of decoded values.
    // Returns 0 without touching `out` if the stored size is 0; without this
    // guard `out[n - 1]` would be written far out of bounds.
    uint32_t decode(uint32_t* out) {
        uint32_t universe = read_binary();
        uint32_t n = read_binary();
        if (!n) return 0;
        out[n - 1] = universe;
        decode(out, n - 1, 0, universe);
        return n;
    }

    // Decodes the next sequence of known size `n` (stored without its size)
    // into `out` and returns `n`.
    // For n == 0 nothing is read and 0 is returned: the encoder writes
    // nothing for an empty sequence.
    uint32_t decode(uint32_t* out, uint32_t n) {
        if (!n) return 0;
        uint32_t universe = read_binary();
        out[n - 1] = universe;
        decode(out, n - 1, 0, universe);
        return n;
    }

    // Repositions the reader at `encoded` and decodes one sequence from it.
    uint32_t decode(uint32_t const* encoded, uint32_t* out) {
        m_reader.init(encoded);
        return decode(out);
    }

private:
    // Decodes `n` values lying in [lo, hi] into `out`: the middle value is
    // read first, then the left and right halves recursively.
    void decode(uint32_t* out, uint32_t n, uint32_t lo, uint32_t hi) {
        if (!n) return;
        assert(lo <= hi);
#if RUNAWARE
        // The values fill the whole range: regenerate them, read nothing.
        if (hi - lo + 1 == n) {  // run
            for (uint32_t i = 0; i != n; ++i) out[i] = lo++;
            return;
        }
#endif
        uint32_t m = n / 2;
        uint32_t x = m_reader.read(hi - lo - n + 1) + lo + m;
        out[m] = x;
        if (n == 1) return;  // both halves are empty
        decode(out, m, lo, x - 1);
        decode(out + m + 1, n - m - 1, x + 1, hi);
    }

    // Reads a value stored as a 5-bit field holding the position of its top
    // bit, followed by that many bits plus one of the value itself.
    uint32_t read_binary() {
        uint32_t b = m_reader.take(5);
        return m_reader.take(b + 1);
    }

    Reader m_reader;
};
301 | 
// Compares the first `n` values of `expected` and `got`.
// Returns true if they are all equal; otherwise reports the first mismatch
// on std::cerr and returns false.
//
// Declared `inline` because it is defined in a header: a non-inline
// definition would produce duplicate symbols as soon as the header is
// included from more than one translation unit.
inline bool check(uint32_t const* expected, uint32_t const* got, uint32_t n) {
    for (uint32_t i = 0; i != n; ++i) {
        if (expected[i] != got[i]) {
            std::cerr << "error at " << i << "/" << n << ": ";
            std::cerr << "expected " << expected[i] << " but got " << got[i]
                      << std::endl;
            return false;
        }
    }
    return true;
}
313 | 
314 | }  // namespace bic
315 | 


--------------------------------------------------------------------------------
/src/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_executable(encode encode.cpp)
2 | add_executable(decode decode.cpp)
3 | add_executable(check check.cpp)


--------------------------------------------------------------------------------
/src/check.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <chrono>
 3 | 
 4 | #include "../external/mm_file/include/mm_file/mm_file.hpp"
 5 | #include "interpolative_coding.hpp"
 6 | 
 7 | using namespace bic;
 8 | 
 9 | template <typename Decoder>
10 | void check(char const* binary_filename, char const* collection_filename) {
11 |     int advice = mm::advice::sequential;
12 |     mm::file_source<uint32_t> input_collection(collection_filename, advice);
13 |     uint32_t const* input = input_collection.data();
14 |     input += 2;
15 | 
16 |     mm::file_source<uint32_t> encoded(binary_filename, advice);
17 |     uint32_t const* data = encoded.data();
18 |     uint32_t universe = data[0];
19 |     uint32_t sequences = data[1];
20 |     Decoder dec(data + 2);
21 |     std::vector<uint32_t> decoded(universe);
22 | 
23 |     std::cout << "checking " << sequences << " sequences..." << std::endl;
24 |     size_t decoded_ints = 0;
25 |     bool all_good = true;
26 | 
27 |     for (uint32_t i = 0; i != sequences; ++i) {
28 |         uint32_t n = dec.decode(decoded.data());
29 |         decoded_ints += n;
30 |         if (n != input[0]) {
31 |             std::cerr << "decoded " << n << " integers but expected "
32 |                       << input[0] << std::endl;
33 |             return;
34 |         }
35 |         all_good &= check(input + 1, decoded.data(), n);
36 |         input += n + 1;
37 | 
38 |         if (i and i % 100000 == 0) {
39 |             std::cout << "  checked " << i << " sequences" << std::endl;
40 |         }
41 |     }
42 | 
43 |     std::cout << "DONE" << std::endl;
44 |     if (all_good) {
45 |         std::cout << "everything good" << std::endl;
46 |     }
47 | }
48 | 
49 | int main(int argc, char** argv) {
50 |     if (argc < 4) {
51 |         std::cerr << argv[0]
52 |                   << " decoder_type binary_filename collection_filename"
53 |                   << std::endl;
54 |         return 1;
55 |     }
56 | 
57 |     std::string type(argv[1]);
58 |     char const* binary_filename = argv[2];
59 |     char const* collection_filename = argv[3];
60 | 
61 |     if (type == "binary") {
62 |         check<decoder<binary::reader>>(binary_filename, collection_filename);
63 |     } else if (type == "leftmost_minimal") {
64 |         check<decoder<leftmost_minimal::reader>>(binary_filename,
65 |                                                  collection_filename);
66 |     } else if (type == "centered_minimal") {
67 |         check<decoder<centered_minimal::reader>>(binary_filename,
68 |                                                  collection_filename);
69 |     } else {
70 |         std::cerr << "unknown type '" << type << "'" << std::endl;
71 |         return 1;
72 |     }
73 | 
74 |     return 0;
75 | }
76 | 


--------------------------------------------------------------------------------
/src/decode.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <chrono>
 3 | 
 4 | #include "../external/mm_file/include/mm_file/mm_file.hpp"
 5 | #include "interpolative_coding.hpp"
 6 | 
 7 | using namespace bic;
 8 | typedef std::chrono::high_resolution_clock clock_type;
 9 | 
10 | template <typename Decoder>
11 | void encode(char const* input_filename) {
12 |     int advice = mm::advice::sequential;
13 |     mm::file_source<uint32_t> input(input_filename, advice);
14 |     uint32_t const* data = input.data();
15 |     uint32_t universe = data[0];
16 |     uint32_t sequences = data[1];
17 |     Decoder dec(data + 2);
18 |     std::vector<uint32_t> decoded(universe);
19 | 
20 |     std::cout << "decoding " << sequences << " sequences..." << std::endl;
21 |     size_t decoded_ints = 0;
22 |     auto start = clock_type::now();
23 |     for (uint32_t i = 0; i != sequences; ++i) {
24 |         uint32_t n = dec.decode(decoded.data());
25 |         decoded_ints += n;
26 |     }
27 |     auto finish = clock_type::now();
28 |     std::cout << "DONE" << std::endl;
29 | 
30 |     std::chrono::duration<double> elapsed = finish - start;
31 |     std::cout << "decoded " << decoded_ints << " integers in "
32 |               << elapsed.count() << " [sec]" << std::endl;
33 |     std::cout << elapsed.count() * 1000000000 / decoded_ints << " ns/int"
34 |               << std::endl;
35 |     std::cout << "using " << input.bytes() * 8.0 / decoded_ints << " bits x int"
36 |               << std::endl;
37 | }
38 | 
39 | int main(int argc, char** argv) {
40 |     if (argc < 3) {
41 |         std::cerr << argv[0] << " decoder_type input_filename" << std::endl;
42 |         return 1;
43 |     }
44 | 
45 |     std::string type(argv[1]);
46 |     char const* input_filename = argv[2];
47 |     std::cout << "type: '" << type << "':\n";
48 | 
49 |     if (type == "binary") {
50 |         encode<decoder<binary::reader>>(input_filename);
51 |     } else if (type == "leftmost_minimal") {
52 |         encode<decoder<leftmost_minimal::reader>>(input_filename);
53 |     } else if (type == "centered_minimal") {
54 |         encode<decoder<centered_minimal::reader>>(input_filename);
55 |     } else {
56 |         std::cerr << "unknown type '" << type << "'" << std::endl;
57 |         return 1;
58 |     }
59 | 
60 |     return 0;
61 | }
62 | 


--------------------------------------------------------------------------------
/src/encode.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <fstream>
 3 | #include <cassert>
 4 | 
 5 | #include "../external/mm_file/include/mm_file/mm_file.hpp"
 6 | #include "interpolative_coding.hpp"
 7 | 
 8 | using namespace bic;
 9 | static const uint64_t GiB = 1024 * 1024 * 1024;  // 2^30
10 | 
11 | // Encodes every sequence stored in input_filename with Encoder and prints
12 | // statistics; if output_filename is non-null the result is saved there.
13 | template <typename Encoder>
14 | void encode(char const* input_filename, char const* output_filename) {
15 |     mm::file_source<uint32_t> input(input_filename);
16 |     uint32_t const* data = input.data();
17 |     if (input.size() < 2) {  // the two header words read below must exist
18 |         std::cerr << "error: input file is too short" << std::endl;
19 |         return;
20 |     }
21 |     assert(data[0] == 1);
22 |     uint32_t universe = data[1];
23 |     Encoder enc;
24 |     enc.reserve(10 * GiB);
25 | 
26 |     std::cout << "encoding data..." << std::endl;
27 |     uint32_t encoded_sequences = 0;
28 |     size_t encoded_ints = 0;
29 |     for (size_t i = 2; i < input.size();) {
30 |         uint32_t n = data[i];  // length word, followed by n values
31 |         // The n values must lie inside the file, else encode() overreads.
32 |         if (n > input.size() - i - 1) {
33 |             std::cerr << "error: truncated sequence in input" << std::endl;
34 |             return;
35 |         }
36 |         enc.encode(data + i + 1, n);
37 |         i += static_cast<size_t>(n) + 1;  // widened so n + 1 cannot wrap
38 |         encoded_ints += n;
39 |         if (++encoded_sequences % 100000 == 0) {
40 |             std::cout << "  encoded " << encoded_sequences << " sequences"
41 |                       << std::endl;
42 |         }
43 |     }
44 |     std::cout << "DONE\nencoded " << encoded_sequences << " sequences\n"
45 |               << "encoded " << encoded_ints << " integers" << std::endl;
46 | 
47 |     // NOTE: slightly larger than enc.num_bits() due to padding
48 |     auto const& bits = enc.bits();
49 |     size_t const num_bytes = bits.size() * sizeof(bits.front());
50 |     std::cout << "using " << num_bytes * 8.0 / encoded_ints << " bits x int"
51 |               << std::endl;
52 |     if (!output_filename) return;
53 |     std::ofstream out(output_filename, std::ios::binary);
54 |     if (!out.is_open()) {
55 |         std::cerr << "error in opening binary file" << std::endl;
56 |         return;
57 |     }
58 |     std::cout << "writing encoded data to disk..." << std::endl;
59 |     // header: universe, then the number of encoded sequences
60 |     out.write(reinterpret_cast<char const*>(&universe), sizeof(universe));
61 |     out.write(reinterpret_cast<char const*>(&encoded_sequences),
62 |               sizeof(encoded_sequences));
63 |     out.write(reinterpret_cast<char const*>(bits.data()),
64 |               static_cast<std::streamsize>(num_bytes));
65 |     std::cout << "DONE" << std::endl;
66 | }
67 | 
68 | // Parses "encoder_type input_filename [-o output_filename]" and runs the
69 | // matching encoder. Returns 1 on bad usage or unknown type, 0 otherwise.
70 | int main(int argc, char** argv) {
71 |     bool const has_flag = argc > 3 and argv[3] == std::string("-o");
72 |     // "-o" must be followed by a file name; otherwise argv[4] is null and
73 |     // the encoded data would silently not be written.
74 |     if (argc < 3 or (has_flag and argc < 5)) {
75 |         std::cerr << argv[0]
76 |                   << " encoder_type input_filename -o output_filename"
77 |                   << std::endl;
78 |         return 1;
79 |     }
80 | 
81 |     std::string type(argv[1]);
82 |     char const* input_filename = argv[2];
83 |     char const* output_filename = has_flag ? argv[4] : nullptr;
84 | 
85 |     if (type == "binary") {
86 |         encode<encoder<binary::writer>>(input_filename, output_filename);
87 |     } else if (type == "leftmost_minimal") {
88 |         encode<encoder<leftmost_minimal::writer>>(input_filename,
89 |                                                   output_filename);
90 |     } else if (type == "centered_minimal") {
91 |         encode<encoder<centered_minimal::writer>>(input_filename,
92 |                                                   output_filename);
93 |     } else {
94 |         std::cerr << "unknown type '" << type << "'" << std::endl;
95 |         return 1;
96 |     }
97 |     return 0;
98 | }
99 | 


--------------------------------------------------------------------------------
/test/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_executable(example example.cpp)  # builds the `example` program from example.cpp


--------------------------------------------------------------------------------
/test/example.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | 
 3 | #include "interpolative_coding.hpp"
 4 | using namespace bic;
 5 | 
 6 | // Round-trips `in` through encoder/decoder for the given BinaryCode, once
 7 | // with the sequence length saved in the stream and once without it, and
 8 | // prints the input, the decoded values and the space used.
 9 | template <typename BinaryCode>
10 | void test(std::vector<uint32_t> const& in) {
11 |     using encoder_type = encoder<typename BinaryCode::writer>;
12 |     using decoder_type = decoder<typename BinaryCode::reader>;
13 | 
14 |     // Prints all values on one line, each followed by a space.
15 |     auto print_values = [](std::vector<uint32_t> const& values) {
16 |         for (auto v : values) std::cout << v << " ";
17 |         std::cout << std::endl;
18 |     };
19 | 
20 |     // Prints the statistics and the decoded output of one round trip.
21 |     auto report = [&](encoder_type& enc, std::vector<uint32_t> const& out,
22 |                       uint32_t m) {
23 |         std::cout << "decoded " << m << " values" << std::endl;
24 |         std::cout << "total bits " << enc.num_bits() << std::endl;
25 |         std::cout << static_cast<double>(enc.num_bits()) / m << " bits x key"
26 |                   << std::endl;
27 |         std::cout << "decoded:\n";
28 |         print_values(out);
29 |     };
30 | 
31 |     std::cout << "to be encoded:\n";
32 |     print_values(in);
33 | 
34 |     // First pass: n is saved by default, so the decoder reads it back.
35 |     {
36 |         uint32_t n = in.size();
37 |         encoder_type enc;
38 |         enc.encode(in.data(), n);
39 |         std::vector<uint32_t> out(n);
40 |         decoder_type dec;
41 |         uint32_t m = dec.decode(enc.bits().data(), out.data());
42 |         assert(m == n);
43 |         report(enc, out, m);
44 |     }
45 | 
46 |     // Second pass: n is not saved, so it is handed to the decoder directly.
47 |     {
48 |         uint32_t n = in.size();
49 |         encoder_type enc;
50 |         enc.encode(in.data(), n, false);
51 |         std::vector<uint32_t> out(n);
52 |         decoder_type dec(enc.bits().data());
53 |         uint32_t m = dec.decode(out.data(), n);
54 |         assert(m == n);
55 |         report(enc, out, m);
56 |     }
57 | }
58 | 
59 | // Runs the round-trip test on a fixed sequence with the binary code type
60 | // named by argv[1]; returns 1 on a missing or unknown type.
61 | int main(int argc, char** argv) {
62 |     if (argc < 2) {
63 |         std::cerr << argv[0] << " binary_code_type" << std::endl;
64 |         return 1;
65 |     }
66 | 
67 |     std::string const code = argv[1];
68 | 
69 |     // Other inputs, currently disabled:
70 |     // {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
71 |     //  13, 14, 15, 16, 17, 18, 19, 20, 23, 25, 32}
72 |     // {3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
73 |     //  13, 14, 15, 21, 25, 36, 38, 54, 62}
74 |     // {0, 1, 2, 3, 4, 5, 21, 25, 36, 38, 54, 62}
75 |     std::vector<uint32_t> const sample = {3,  4,  7,  13, 14, 15,
76 |                                           21, 25, 36, 38, 54, 62};
77 |     if (code == "centered_minimal") {
78 |         test<centered_minimal>(sample);
79 |     } else if (code == "leftmost_minimal") {
80 |         test<leftmost_minimal>(sample);
81 |     } else if (code == "binary") {
82 |         test<binary>(sample);
83 |     } else {
84 |         std::cerr << "unknown type '" << code << "'" << std::endl;
85 |         return 1;
86 |     }
87 | 
88 |     return 0;
89 | }


--------------------------------------------------------------------------------