├── .gitignore
├── LICENSE
├── Makefile
├── README.md
├── benchmark
    ├── counters.cpp
    ├── linux-perf-events-wrapper.h
    ├── linux-perf-events.h
    ├── maropuparser.h
    └── ztimer.h
└── include
    ├── fastscancount.h
    ├── fastscancount_avx2.h
    └── fastscancount_avx512.h


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Prerequisites
 2 | *.d
 3 | 
 4 | # Compiled Object files
 5 | *.slo
 6 | *.lo
 7 | *.o
 8 | *.obj
 9 | 
10 | # Precompiled Headers
11 | *.gch
12 | *.pch
13 | 
14 | # Compiled Dynamic libraries
15 | *.so
16 | *.dylib
17 | *.dll
18 | 
19 | # Fortran module files
20 | *.mod
21 | *.smod
22 | 
23 | # Compiled Static libraries
24 | *.lai
25 | *.la
26 | *.a
27 | *.lib
28 | 
29 | # Executables
30 | *.exe
31 | *.out
32 | *.app
33 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | OPT := -O3
 2 | # Leo really doubts -mavx2 helps anything, but one can
 3 | # disable avx512 tests by enforcing -mavx2
 4 | #CXXFLAGS := -std=c++17 $(OPT) -mavx2
 5 | CXXFLAGS := -std=c++17 $(OPT) -march=native
 6 | 
 7 | counter: benchmark/counters.cpp include/*.h Makefile
 8 | 	$(CXX) $(CXXFLAGS) $(CXXEXTRA) -o counter benchmark/counters.cpp -Ibenchmark -Iinclude
 9 | 
10 | clean:
11 | 	rm -f counter
12 | 
13 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # fastscancount
  2 | Fast implementations of the scancount algorithm
  3 | 
  4 | 
  5 | Given a set of arrays of integers, we seek to identify 
  6 | all values that occur more than 'threshold' times. We do so using the
  7 | 'scancount' algorithm. It is assumed
  8 | that you have fewer than 256 arrays of integers and that the threshold is no larger than 254.
  9 | 
 10 | We are effectively providing optimized versions of the following function:
 11 | 
 12 | ```C++
 13 | void scancount(std::vector<std::vector<uint32_t>> &data, std::vector<uint32_t> &out, uint8_t threshold) {
 14 |   std::fill(counters.begin(), counters.end(), 0);
 15 |   out.clear();
 16 |   for (size_t c = 0; c < data.size(); c++) {
 17 |     std::vector<uint32_t> &v = data[c];
 18 |     for (size_t i = 0; i < v.size(); i++) {
 19 |       counters[v[i]]++;
 20 |     }
 21 |   }
 22 |   for (uint32_t i = 0; i < counters.size(); i++) {
 23 |     if (counters[i] > threshold)
 24 |       out.push_back(i);
 25 |   }
 26 | }
 27 | ```
 28 | 
 29 | Our optimized versions assume that your arrays are made of sorted integers.
 30 | 
 31 | There are several headers. `fastscancount.h` uses plain C++ and should
 32 | be portable. It has one main function in the `fastscancount` namespace.
 33 | We always write the result to 'out'.
 34 | 
 35 | ```C++
 36 | void fastscancount(std::vector<std::vector<uint32_t>> &data,
 37 |     std::vector<uint32_t> &out, uint8_t threshold)
 38 | ```
 39 | 
 40 | There is another header `fastscancount_avx2.h`
 41 | which expects an x64 processor supporting the AVX2 instruction set.  
 42 | It has a similar function signature:
 43 | 
 44 | ```C++
 45 | void fastscancount_avx2(std::vector<std::vector<uint32_t>> &data,
 46 |     std::vector<uint32_t> &out, uint8_t threshold)
 47 | ```
 48 | 
 49 | The AVX2 version assumes that you have fewer than 128 arrays of integers.
 50 | 
 51 | Because this library is made solely of headers, there is no
 52 | need for a build system.
 53 | 
 54 | ## Linux benchmark
 55 | 
 56 | If you have bare metal access to a Linux box, you can run cycle-accurate benchmarks.
 57 | 
 58 | ```
 59 | make
 60 | ./counter
 61 | ```
 62 | 
 63 | Sample output with GNU GCC 8.3:
 64 | 
 65 | ```
 66 | $ ./counter
 67 | Got 2497 hits
 68 | optimized cache-sensitive scancount
 69 | 4.01381 cycles/element
 70 | AVX2-based scancount
 71 | 3.58494 cycles/element
 72 | ```
 73 | 
 74 | With LLVM clang, we seem to get better results:
 75 | 
 76 | ```
 77 | $ ./counter
 78 | Got 2497 hits
 79 | optimized cache-sensitive scancount
 80 | 3.54267 cycles/element
 81 | 2.8908 instructions/cycles
 82 | 0.0134279 miss/element
 83 | AVX2-based scancount
 84 | 3.57374 cycles/element
 85 | 2.03391 instructions/cycles
 86 | 0.0109755 miss/element
 87 | ```
 88 | 
 89 | ## Blog post
 90 | 
 91 | [How fast can scancount be?](http://lemire.me/blog/2019/08/30/how-fast-can-scancount-be/)
 92 | 
 93 | ## Using actual data
 94 | 
 95 | ```
 96 | ./counter --postings data/postings.bin --queries data/queries.bin --threshold 3
 97 | ```
 98 | 
 99 | ## Credit
100 | 
101 | The AVX2 version was designed and implemented by Travis Downs.
102 | The scalar version was designed and implemented by Daniel Lemire based on ideas by Nathan Kurz, Travis Downs and others.
103 | 
104 | ## Reference
105 | 
106 | 
107 | Owen Kaser, Daniel Lemire, [Compressed bitmap indexes: beyond unions and intersections](https://arxiv.org/abs/1402.4466), Software: Practice and Experience 46 (2), 2016
108 | 


--------------------------------------------------------------------------------
/benchmark/counters.cpp:
--------------------------------------------------------------------------------
  1 | // Fine-grained statistics are available only on Linux
  2 | #include "fastscancount.h"
  3 | #include "ztimer.h"
  4 | #ifdef __AVX2__
  5 | #include "fastscancount_avx2.h"
  6 | #endif
  7 | #ifdef __AVX512F__
  8 | #include "fastscancount_avx512.h"
  9 | #endif
 10 | #include "linux-perf-events-wrapper.h"
 11 | #include "maropuparser.h"
 12 | #include <algorithm>
 13 | #include <cstdint>
 14 | #include <cstdio>
 15 | #include <immintrin.h>
 16 | #include <iostream>
 17 | #include <vector>
 18 | #include <stdexcept>
 19 | 
 20 | #define REPEATS 10
 21 | #define RUNNINGTESTS
 22 | 
 23 | void scancount(const std::vector<const std::vector<uint32_t>*> &data,
 24 |                std::vector<uint32_t> &out, size_t threshold) {
 25 |   uint64_t largest = 0;
 26 |   for(auto z : data) {
 27 |     const std::vector<uint32_t> & v = *z;
 28 |     if(v[v.size() - 1] > largest) largest = v[v.size() - 1];
 29 |   }
 30 |   std::vector<uint8_t> counters(largest+1);
 31 |   out.clear();
 32 |   for (size_t c = 0; c < data.size(); c++) {
 33 |     const std::vector<uint32_t> &v = *data[c];
 34 |     for (size_t i = 0; i < v.size(); i++) {
 35 |       counters[v[i]]++;
 36 |     }
 37 |   }
 38 |   for (uint32_t i = 0; i < counters.size(); i++) {
 39 |     if (counters[i] > threshold)
 40 |       out.push_back(i);
 41 |   }
 42 | }
 43 | 
 44 | void calc_boundaries(uint32_t largest, uint32_t range_size, 
 45 |                     const std::vector<uint32_t>& data, 
 46 |                     std::vector<uint32_t>& range_ends) {
 47 |   if (!range_size) {
 48 |     throw std::runtime_error("range_size must be > 0");
 49 |   }
 50 |   uint32_t end = 0;
 51 |   range_ends.clear();
 52 |   
 53 |   for (uint32_t start = 0; start <= largest; start += range_size) {
 54 |     uint32_t curr_max = std::min(largest, start + range_size - 1);
 55 |     while (end < data.size() && data[end] <= curr_max) {
 56 |       end++;
 57 |     }
 58 |     range_ends.push_back(end);
 59 |   }
 60 | } 
 61 | 
 62 | const uint32_t range_size_avx512 = 40000;
 63 | 
 64 | void calc_alldata_boundaries(const std::vector<std::vector<uint32_t>>& data,
 65 |                              std::vector<std::vector<uint32_t>>& range_ends,
 66 |                              size_t range_size) {
 67 |   uint32_t largest = 0;
 68 |   range_ends.clear();
 69 |   range_ends.resize(data.size());
 70 |   for(const auto& v : data) {
 71 |     if (!v.empty() && v[v.size() - 1] > largest) largest = v[v.size() - 1];
 72 |   }
 73 |   for (unsigned i = 0; i < data.size(); ++i) {
 74 |     calc_boundaries(largest, range_size, data[i], range_ends[i]); 
 75 |   }
 76 | }
 77 | 
 78 | template <typename F>
 79 | void test(F f, const std::vector<const std::vector<uint32_t>*>& data_ptrs,
 80 |           std::vector<uint32_t>& answer, unsigned threshold, const std::string &name) {
 81 |   scancount(data_ptrs, answer, threshold);
 82 |   size_t s1 = answer.size();
 83 |   auto a1 (answer);
 84 |   std::sort(a1.begin(), a1.end());
 85 |   answer.clear();
 86 |   f();
 87 |   size_t s2 = answer.size();
 88 |   auto a2 (answer);
 89 |   std::sort(a2.begin(), a2.end());
 90 |   if (a1 != a2) {
 91 |     std::cout << "s1: " << s1 << " s2: " << s2 << std::endl;
 92 |     for(size_t j = 0; j < std::min(s1, s2); j++) {
 93 |       std::cout << j << " " << a1[j] << " vs " << a2[j] ;
 94 | 
 95 |       if(a1[j] != a2[j]) std::cout << " oh oh ";
 96 |       std::cout << std::endl;
 97 |     }
 98 |     throw std::runtime_error("bug: " + name);
 99 |   }
100 | }
101 | 
102 | template <typename F>
103 | void bench(F f, const std::string &name,
104 |            LinuxEventsWrapper &unified,
105 |            float& elapsed,
106 |            std::vector<uint32_t> &answer, size_t sum, size_t expected,
107 |            bool print) {
108 |   WallClockTimer tm;
109 |   unified.start();
110 |   f();
111 |   unified.end();
112 |   elapsed += tm.split();
113 |   if (answer.size() != expected)
114 |     std::cerr << "bug: expected " << expected << " but got " << answer.size()
115 |               << "\n";
116 | #ifdef __linux__
117 |   if (print) {
118 |     double cycles = unified.get_result(PERF_COUNT_HW_CPU_CYCLES);
119 |     double instructions = unified.get_result(PERF_COUNT_HW_INSTRUCTIONS);
120 |     double misses = unified.get_result(PERF_COUNT_HW_BRANCH_MISSES);
121 |     std::cout << name << std::endl;
122 |     std::cout << cycles / sum << " cycles/element " << std::endl;
123 |     std::cout << instructions / cycles << " instructions/cycles " << std::endl;
124 |     std::cout << misses / sum << " miss/element " << std::endl;
125 |   }
126 | #endif
127 | }
128 | 
129 | void demo_data(const std::vector<std::vector<uint32_t>>& data,
130 |               const std::vector<std::vector<uint32_t>>& queries,
131 |               size_t threshold) {
132 |   size_t N = 0;
133 |   for (const auto& data_elem : data) {
134 |     size_t sz = data_elem.size();
135 |     if (sz) {
136 |       N = std::max(N, (size_t)data_elem[sz-1] + 1);
137 |     }
138 |   }
139 | 
140 |   std::vector<uint32_t> answer;
141 |   answer.reserve(N);
142 | 
143 |   std::vector<int> evts = {
144 | #ifdef __linux__
145 |                            PERF_COUNT_HW_CPU_CYCLES,
146 |                            PERF_COUNT_HW_INSTRUCTIONS,
147 |                            PERF_COUNT_HW_BRANCH_MISSES,
148 |                            PERF_COUNT_HW_CACHE_REFERENCES,
149 |                            PERF_COUNT_HW_CACHE_MISSES
150 | #endif
151 |                           };
152 |   LinuxEventsWrapper unified(evts);
153 | 
154 |   std::vector<std::vector<uint32_t>> range_boundaries;
155 |   calc_alldata_boundaries(data, range_boundaries, range_size_avx512);
156 | 
157 |   std::vector<const std::vector<uint32_t>*> data_ptrs;
158 |   std::vector<const std::vector<uint32_t>*> range_ptrs;
159 | 
160 |   float elapsed = 0, elapsed_fast = 0, elapsed_avx = 0, elapsed_avx512 = 0;
161 | 
162 |   size_t sum_total = 0;
163 | 
164 |   for (size_t qid = 0; qid < queries.size(); ++qid) {
165 |     const auto& query_elem = queries[qid];
166 |     data_ptrs.clear();
167 |     range_ptrs.clear();
168 |     size_t sum = 0;
169 |     for (uint32_t idx : query_elem) {
170 |       if (idx >= data.size()) {
171 |         std::stringstream err;
172 |         err << "Inconsistent data, posting " << idx << 
173 |                " is >= # of postings " << data.size() << " query id " << qid;
174 |         throw std::runtime_error(err.str());
175 |       }
176 |       sum += data[idx].size();
177 |       data_ptrs.push_back(&data[idx]);
178 |       range_ptrs.push_back(&range_boundaries[idx]);
179 |     }
180 |     sum_total += sum;
181 | 
182 |     scancount(data_ptrs, answer, threshold);
183 |     const size_t expected = answer.size();
184 | 
185 | #ifdef RUNNINGTESTS
186 |     test(
187 |       [&](){
188 |         fastscancount::fastscancount(data_ptrs, answer, threshold);
189 |       }, data_ptrs, answer, threshold, "fastscancount"
190 |     );
191 | #ifdef __AVX2__
192 |     test(
193 |       [&](){
194 |         fastscancount::fastscancount_avx2(data_ptrs, answer, threshold);
195 |       }, data_ptrs, answer, threshold, "fastscancount_avx2"
196 |     );
197 | #endif
198 | 
199 | #ifdef __AVX512F__
200 |     test(
201 |       [&](){
202 |         fastscancount::fastscancount_avx512(range_size_avx512, data_ptrs, range_ptrs, answer, threshold);
203 |       }, data_ptrs, answer, threshold, "fastscancount_avx512"
204 |     );
205 | #endif
206 | 
207 | #endif
208 |     std::cout << "Qid: " << qid << " got " << expected << " hits\n";
209 | 
210 |     bool last = (qid == queries.size() - 1);
211 | 
212 |     bench(
213 |         [&]() {
214 |           scancount(data_ptrs, answer, threshold);
215 |         },
216 |         "baseline scancount", unified, elapsed, answer, sum,
217 |         expected, last);
218 | 
219 |     bench(
220 |         [&]() {
221 |           fastscancount::fastscancount(data_ptrs, answer, threshold);
222 |         },
223 |         "optimized cache-sensitive scancount", unified, elapsed_fast, answer, sum,
224 |         expected, last);
225 | #ifdef __AVX2__
226 |     bench(
227 |         [&]() {
228 |           fastscancount::fastscancount_avx2(data_ptrs, answer, threshold);
229 |         },
230 |         "AVX2-based scancount", unified, elapsed_avx, answer, sum, expected, last);
231 | #endif
232 | #ifdef __AVX512F__
233 |     bench(
234 |         [&]() {
235 |           fastscancount::fastscancount_avx512(range_size_avx512, data_ptrs, range_ptrs, answer, threshold);
236 |         },
237 |         "AVX512-based scancount", unified, elapsed_avx512, answer, sum, expected, last);
238 | #endif
239 |   }
240 |   std::cout << "Elems per millisecond:" << std::endl;
241 |   std::cout << "scancount: " << (sum_total/(elapsed/1e3)) << std::endl; 
242 |   std::cout << "fastscancount: " << (sum_total/(elapsed_fast/1e3)) << std::endl; 
243 | #ifdef __AVX2__
244 |   std::cout << "fastscancount_avx2: " << (sum_total/(elapsed_avx/1e3)) << std::endl; 
245 | #endif
246 | #ifdef __AVX512F__
247 |   std::cout << "fastscancount_avx512: " << (sum_total/(elapsed_avx512/1e3)) << std::endl; 
248 | #endif
249 | 
250 | }
251 | 
252 | 
253 | void demo_random(size_t N, size_t length, size_t array_count, size_t threshold) {
254 |   std::vector<std::vector<uint32_t>> data(array_count);
255 | 
256 |   std::vector<const std::vector<uint32_t>*> data_ptrs;
257 |   std::vector<uint32_t> answer;
258 |   answer.reserve(N);
259 | 
260 |   size_t sum = 0;
261 |   for (size_t c = 0; c < array_count; c++) {
262 |     std::vector<uint32_t> &v = data[c];
263 |     for (size_t i = 0; i < length; i++) {
264 |       v.push_back(rand() % N);
265 |     }
266 |     std::sort(v.begin(), v.end());
267 |     v.resize(std::distance(v.begin(), unique(v.begin(), v.end())));
268 |     sum += v.size();
269 |     data_ptrs.push_back(&data[c]);
270 |   }
271 | 
272 | 
273 |   std::vector<std::vector<uint32_t>> range_boundaries;
274 |   calc_alldata_boundaries(data, range_boundaries, range_size_avx512);
275 |   std::vector<const std::vector<uint32_t>*> range_ptrs;
276 |   for (size_t c = 0; c < array_count; c++) {
277 |     range_ptrs.push_back(&range_boundaries[c]);
278 |   }
279 | 
280 |   std::vector<int> evts = {
281 | #ifdef __linux__
282 |                            PERF_COUNT_HW_CPU_CYCLES,
283 |                            PERF_COUNT_HW_INSTRUCTIONS,
284 |                            PERF_COUNT_HW_BRANCH_MISSES,
285 |                            PERF_COUNT_HW_CACHE_REFERENCES,
286 |                            PERF_COUNT_HW_CACHE_MISSES
287 | #endif
288 |                           };
289 |   LinuxEventsWrapper unified(evts);
290 |   float elapsed = 0, elapsed_fast = 0, elapsed_avx = 0, elapsed_avx512 = 0;
291 |   scancount(data_ptrs, answer, threshold);
292 |   const size_t expected = answer.size();
293 |   std::cout << "Got " << expected << " hits\n";
294 |   size_t sum_total = sum * REPEATS;
295 |   for (size_t t = 0; t < REPEATS; t++) {
296 |     bool last = (t == REPEATS - 1);
297 | 
298 |     bench(
299 |         [&]() {
300 |           scancount(data_ptrs, answer, threshold);
301 |         },
302 |         "baseline scancount", unified, elapsed, answer, sum,
303 |         expected, last);
304 |   }
305 | 
306 |   for (size_t t = 0; t < REPEATS; t++) {
307 |     bool last = (t == REPEATS - 1);
308 | 
309 | #ifdef RUNNINGTESTS
310 |     test(
311 |       [&](){
312 |         fastscancount::fastscancount(data_ptrs, answer, threshold);
313 |       }, data_ptrs, answer, threshold, "fastscancount"
314 |     );
315 | #endif
316 | 
317 |     bench(
318 |         [&]() {
319 |           fastscancount::fastscancount(data_ptrs, answer, threshold);
320 |         },
321 |         "optimized cache-sensitive scancount", unified, elapsed_fast, answer, sum,
322 |         expected, last);
323 |   }
324 | 
325 |   for (size_t t = 0; t < REPEATS; t++) {
326 |     bool last = (t == REPEATS - 1);
327 | 
328 | #ifdef __AVX2__
329 | #ifdef RUNNINGTESTS
330 |     test(
331 |       [&](){
332 |         fastscancount::fastscancount_avx2(data_ptrs, answer, threshold);
333 |       }, data_ptrs, answer, threshold, "fastscancount_avx2"
334 |     );
335 | #endif
336 |     bench(
337 |         [&]() {
338 |           fastscancount::fastscancount_avx2(data_ptrs, answer, threshold);
339 |         },
340 |         "AVX2-based scancount", unified, elapsed_avx, answer, sum, expected, last);
341 | #endif
342 |   }
343 | 
344 |   for (size_t t = 0; t < REPEATS; t++) {
345 |     bool last = (t == REPEATS - 1);
346 | #ifdef __AVX512F__
347 | #ifdef RUNNINGTESTS
348 |     test(
349 |       [&](){
350 |         fastscancount::fastscancount_avx512(range_size_avx512, data_ptrs, range_ptrs, answer, threshold);
351 |       }, data_ptrs, answer, threshold, "fastscancount_avx512"
352 |     );
353 | #endif
354 | 
355 |     bench(
356 |         [&]() {
357 |           fastscancount::fastscancount_avx512(range_size_avx512, data_ptrs, range_ptrs, answer, threshold);
358 |         },
359 |         "AVX512-based scancount", unified, elapsed_avx512, answer, sum, expected, last);
360 | #endif
361 |   }
362 | 
363 |   std::cout << "Elems per millisecond:" << std::endl;
364 |   std::cout << "scancount: " << (sum_total/(elapsed/1e3)) << std::endl; 
365 |   std::cout << "fastscancount: " << (sum_total/(elapsed_fast/1e3)) << std::endl; 
366 | #ifdef __AVX2__
367 |   std::cout << "fastscancount_avx2: " << (sum_total/(elapsed_avx/1e3)) << std::endl; 
368 | #endif
369 | #ifdef __AVX512F__
370 |   std::cout << "fastscancount_avx512: " << (sum_total/(elapsed_avx512/1e3)) << std::endl; 
371 | #endif
372 | }
373 | 
/**
 * Prints the command-line synopsis to std::cerr.
 *
 * @param err  Optional error message; when non-empty it is printed on its own
 *             line before the synopsis.
 */
void usage(const std::string& err="") {
  const bool has_message = err.size() != 0;
  if (has_message) {
    std::cerr << err << std::endl;
  }
  std::cerr
      << "usage: --postings <postings file> --queries <queries file> --threshold <threshold>"
      << std::endl;
}
380 | 
381 | int main(int argc, char *argv[]) {
382 |   // A very naive way to process arguments, 
383 |   // but it's ok unless we need to extend it substantially.
384 |   if (argc != 1) {
385 |     if (argc != 7) {
386 |       usage("");
387 |       return EXIT_FAILURE; 
388 |     }
389 |     std::string postings_file, queries_file;
390 |     int threshold = -1;
391 |     for (int i = 1; i < argc; ++i) {
392 |       if (std::string(argv[i]) == "--postings") {
393 |         postings_file = argv[++i];
394 |       } else if (std::string(argv[i]) == "--queries") {
395 |         queries_file = argv[++i];
396 |       } else if (std::string(argv[i]) == "--threshold") {
397 |         threshold = std::atoi(argv[++i]);
398 |       }
399 |     }
400 |     if (postings_file.empty() || queries_file.empty() || threshold < 0) {
401 |       usage("Specify queries, postings, and the threshold!");
402 |       return EXIT_FAILURE; 
403 |     }
404 |     std::vector<uint32_t> tmp; 
405 |     std::vector<std::vector<uint32_t>> data;
406 |     {
407 |       MaropuGapReader drdr(postings_file);
408 |       if (!drdr.open()) {
409 |         usage("Cannot open: " + postings_file);
410 |         return EXIT_FAILURE; 
411 |       }
412 |       while (drdr.loadIntegers(tmp)) {
413 |         data.push_back(tmp);
414 |       }
415 |     }
416 |     std::vector<std::vector<uint32_t>> queries;
417 |     {
418 |       MaropuGapReader qrdr(queries_file);
419 |       if (!qrdr.open()) {
420 |         usage("Cannot open: " + queries_file);
421 |         return EXIT_FAILURE; 
422 |       }
423 |       while (qrdr.loadIntegers(tmp)) {
424 |         queries.push_back(tmp);
425 |       }
426 |     }
427 |               
428 |     try { 
429 |       demo_data(data, queries, threshold);
430 |     } catch (const std::exception& e) {
431 |       std::cerr << "Exception: " << e.what() << std::endl;
432 |       return EXIT_FAILURE;
433 |     }
434 |   } else {
435 |     try {
436 |       // Previous demo with threshold 3
437 |       //demo_random(20000000, 50000, 100, 3);
438 |       for (unsigned k = 1; k < 10; ++k) {
439 |         std::cout << "Demo threshold:" << k << std::endl;
440 |         demo_random(20000000, 50000, 100, k);
441 |         std::cout << "=======================" << std::endl;
442 |       }
443 |     } catch (const std::exception& e) {
444 |       std::cerr << "Exception: " << e.what() << std::endl;
445 |       return EXIT_FAILURE;
446 |     }
447 |   }
448 |   return EXIT_SUCCESS;
449 | }
450 | 


--------------------------------------------------------------------------------
/benchmark/linux-perf-events-wrapper.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <memory>
 3 | #include <unordered_map>
 4 | #include <vector>
 5 | #ifdef __linux__
 6 | #include "linux-perf-events.h"
 7 | #endif
 8 | 
 9 | #ifdef __linux__
10 | typedef LinuxEvents<PERF_TYPE_HARDWARE> EventClass;
11 | #endif
12 | 
13 | class LinuxEventsWrapper {
14 |   public:
15 |     LinuxEventsWrapper(const std::vector<int> event_codes) {
16 | #ifdef __linux__
17 |       for(int ecode: event_codes) {
18 |         event_obj.emplace(ecode, std::shared_ptr<EventClass>(new EventClass(ecode)));
19 |         event_res.emplace(ecode, 0);
20 |       }
21 | #endif
22 |     }
23 |     void start() {
24 | #ifdef __linux__
25 |       for (const auto& [ecode, ptr]: event_obj) {
26 |         ptr->start();  
27 |       }
28 | #endif
29 |     }
30 |     void end() {
31 | #ifdef __linux__
32 |       for (const auto& [ecode, ptr]: event_obj) {
33 |         event_res[ecode] = ptr->end();  
34 |       }
35 | #endif
36 |     }
37 |     // Throws an exception if the code is not present
38 |     unsigned long get_result(int ecode) {
39 | #ifdef __linux__
40 |       return event_res.at(ecode);
41 | #else
42 |       return 0;
43 | #endif
44 |     }
45 |   private:
46 | #ifdef __linux__
47 |     std::unordered_map<int, std::shared_ptr<EventClass>> event_obj;
48 |     std::unordered_map<int, unsigned long> event_res;
49 | #endif
50 | };
51 | 


--------------------------------------------------------------------------------
/benchmark/linux-perf-events.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <unistd.h>             // for syscall
 4 | #include <sys/ioctl.h>          // for ioctl
 5 | #include <asm/unistd.h>         // for __NR_perf_event_open
 6 | #include <linux/perf_event.h>   // for perf event constants
 7 | 
 8 | #include <cerrno>               // for errno
 9 | #include <cstring>              // for memset
10 | #include <stdexcept>
11 | 
12 | 
// Owns one Linux perf_event file descriptor that counts a single event of
// type TYPE for the calling process, excluding kernel and hypervisor activity.
//
// The object is deliberately non-copyable: the destructor closes the
// descriptor, so a copy would close the same descriptor twice.
// Every failing system call is reported as std::runtime_error carrying the
// strerror text for errno.
template <int TYPE = PERF_TYPE_HARDWARE>
class LinuxEvents {

    int fd;
    perf_event_attr attribs;

public:
    // Opens a (disabled) counter for the event identified by config.
    // Throws std::runtime_error when perf_event_open fails.
    LinuxEvents(int config) : fd(-1) {
        memset(&attribs, 0, sizeof(attribs));
        attribs.type        = TYPE;
        attribs.size        = sizeof(attribs);
        attribs.config      = config;
        attribs.disabled        = 1;
        attribs.exclude_kernel  = 1;
        attribs.exclude_hv      = 1;

        const int pid = 0;    // the current process
        const int cpu = -1;   // all CPUs
        const int group = -1; // no group
        const unsigned long flags = 0;
        fd = static_cast<int>(
            syscall(__NR_perf_event_open, &attribs, pid, cpu, group, flags));
        if (fd == -1) {
            report_error("perf_event_open");
        }
    }

    LinuxEvents(const LinuxEvents&) = delete;
    LinuxEvents& operator=(const LinuxEvents&) = delete;

    ~LinuxEvents() {
        if (fd != -1) {
            close(fd);
        }
    }

    // Zeroes the counter and starts counting.
    void start() {
        if (ioctl(fd, PERF_EVENT_IOC_RESET, 0) == -1) {
            report_error("ioctl(PERF_EVENT_IOC_RESET)");
        }

        if (ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) == -1) {
            report_error("ioctl(PERF_EVENT_IOC_ENABLE)");
        }
    }

    // Stops counting and returns the counter value.
    // Throws std::runtime_error if the counter cannot be disabled or read.
    unsigned long end() {
        if (ioctl(fd, PERF_EVENT_IOC_DISABLE, 0) == -1) {
            report_error("ioctl(PERF_EVENT_IOC_DISABLE)");
        }

        // The kernel hands back a 64-bit count; read exactly that much so the
        // result can never be a partially filled variable.
        unsigned long long result = 0;
        const ssize_t got = read(fd, &result, sizeof(result));
        if (got == -1) {
            report_error("read");
        }
        if (got != static_cast<ssize_t>(sizeof(result))) {
            throw std::runtime_error("read: short read from perf event counter");
        }

        return static_cast<unsigned long>(result);
    }

private:
    // Throws std::runtime_error("<context>: <strerror(errno)>").
    void report_error(const std::string& context) {
        throw std::runtime_error(context + ": " + std::string(strerror(errno)));
    }

};
72 | 
73 | 


--------------------------------------------------------------------------------
/benchmark/maropuparser.h:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * This code is released under the
  3 |  * Apache License Version 2.0 http://www.apache.org/licenses/.
  4 |  *
  5 |  * (c) Daniel Lemire, http://lemire.me/en/
  6 |  */
  7 | 
  8 | #ifndef FASTSCANCOUNT_MAROPUPARSER_H_
  9 | #define FASTSCANCOUNT_MAROPUPARSER_H_
 10 | 
 11 | #include <stdexcept>
 12 | #include <sstream>
 13 | 
 14 | #include <cstdio>
 15 | #include <iostream>
 16 | 
 17 | /**
 18 |  * This is just a bit of code to parse the binary files provided by the
 19 |  * Maropu-Open-Coders library at
 20 |  * http://integerencoding.isti.cnr.it/?page_id=8
 21 |  *
 22 |  * (Despite the name, this does not necessarily read gaps.)
 23 |  *
 24 |  * Note that due to use of strerror this code may be thread-unsafe!
 25 |  *
 26 |  */
 27 | class MaropuGapReader {
 28 | public:
 29 |   MaropuGapReader(const std::string &filename) : mFilename(filename), fd(NULL) {}
 30 | 
 31 |   /**
 32 |    * The copy constructor will assign the same file name,
 33 |    * but the newly constructed object won't be opened.
 34 |    */
 35 |   MaropuGapReader(const MaropuGapReader &mgr)
 36 |       : mFilename(mgr.mFilename), fd(NULL) {}
 37 | 
 38 |   /**
 39 |    * Assignment will close the current reader, and change
 40 |    * the file name. You need to reopen the reader after the assignment.
 41 |    */
 42 |   MaropuGapReader &operator=(const MaropuGapReader &mgr) {
 43 |     close();
 44 |     mFilename = mgr.mFilename;
 45 |     return *this;
 46 |   }
 47 | 
 48 |   ~MaropuGapReader() { close(); }
 49 | 
 50 |   // @daniel: should we worry about our code being compilable on 32-bit
 51 |   // machines?
 52 |   // if so, we need to add -D_FILE_OFFSET_BITS=64 to the makefile
 53 |   // Daniel: it would seem odd to consider 32-bit machines when we assume AVX
 54 |   // support!
 55 |   off_t getPos() {
 56 |     errno = 0;
 57 |     off_t res = ftello(fd);
 58 |     if (res < 0) {
 59 |       std::stringstream err;
 60 |       err << "Error getting file position, IO status: " << strerror(errno);
 61 |       throw std::runtime_error(err.str());
 62 |     }
 63 |     return res;
 64 |   }
 65 | 
 66 |   void setPos(off_t pos) {
 67 |     errno = 0;
 68 |     off_t res = fseeko(fd, pos, SEEK_SET);
 69 |     if (res < 0) {
 70 |       std::stringstream err;
 71 |       err << "Error setting file position, IO status: " << strerror(errno);
 72 |       throw std::runtime_error(err.str());
 73 |     }
 74 |   }
 75 | 
 76 |   /*
 77 |    * Return false if no more data can be loaded.
 78 |    * Throw an exception in the case of IO error.
 79 |    */
 80 |   template <class container> bool loadIntegers(container &buffer) {
 81 |     uint32_t qty = 0;
 82 |     if (!ReadQty(qty))
 83 |       return false; // EOF
 84 |     buffer.resize(qty);
 85 |     errno = 0;
 86 |     size_t result = fread(buffer.data(), sizeof(uint32_t), buffer.size(), fd);
 87 |     if (result != buffer.size()) {
 88 |       if (!errno) {
 89 |         // If we can't read, the file maybe truncated, i.e., corrupt
 90 |         throw std::runtime_error("The file appears to be truncated/corrupt!");
 91 |       }
 92 |       std::stringstream err;
 93 |       err << "Error reading from file, IO status: " << strerror(errno);
 94 |       throw std::runtime_error(err.str());
 95 |     }
 96 |     return true;
 97 |   }
 98 | 
 99 |   /*
100 |    * Return false if no more data can be loaded.
101 |    * Throw an exception in the case of IO error.
102 |    */
103 |   bool readNextPosAndQty(off_t &pos, uint32_t &qty) {
104 |     pos = getPos();
105 |     if (!ReadQty(qty))
106 |       return false; // EOF
107 |     setPos(getPos() + qty * sizeof(uint32_t));
108 |     return true;
109 |   }
110 | 
111 |   /**
112 |   * We must call open before we can use this class  meaningfully.
113 |   */
114 |   bool open() {
115 |     close();
116 |     fd = ::fopen(mFilename.c_str(), "rb");
117 |     if (fd == NULL) {
118 |       return false;
119 |     }
120 |     setvbuf(fd, NULL, _IOFBF, 1024 * 4); // large buffer
121 |     return true;
122 |   }
123 | 
124 |   void close() {
125 |     if (fd != NULL) {
126 |       ::fclose(fd);
127 |       fd = NULL;
128 |     }
129 |   }
130 | 
131 | private:
132 |   /*
133 |    * Returns false on EOF.
134 |    * Throws an exception in the case of IO error.
135 |    */
136 |   bool ReadQty(uint32_t &qty) {
137 |     qty = 0;
138 |     if (fd == NULL) {
139 |       throw std::runtime_error("You forgot to open the file.");
140 |     }
141 |     errno = 0;
142 |     size_t result = fread(&qty, sizeof(qty), 1, fd);
143 |     if (errno) {
144 |       std::stringstream err;
145 |       err << "Error opening file, IO status: " << strerror(errno);
146 |       throw std::runtime_error(err.str());
147 |     }
148 |     if (result != 1) {
149 |       return false;
150 |     }
151 |     if (qty > 1 << 29) {
152 |       std::cout << "warning: reading a very large array (" << qty
153 |                 << " integers) : is your input file in the right format?" << std::endl;
154 |     }
155 |     return true;
156 |   }
157 | 
158 |   std::string mFilename;
159 |   FILE *fd;
160 | };
161 | 
162 | #endif /* FASTSCANCOUNT_MAROPUPARSER_H_ */
163 | 


--------------------------------------------------------------------------------
/benchmark/ztimer.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * This code is released under the
 3 |  * Apache License Version 2.0 http://www.apache.org/licenses/.
 4 |  *
 5 |  * (c) Daniel Lemire, http://lemire.me/en/
 6 |  */
 7 | 
 8 | #ifndef ZTIMER_2014
 9 | #define ZTIMER_2014
10 | 
11 | #include <sys/time.h>
12 | #include <sys/resource.h>
13 | 
14 | 
15 | /**
16 |  *  author: Preston Bannister
17 |  */
// Stopwatch based on gettimeofday(): t1 is the start mark and t2 the most
// recent split mark. All durations are reported in microseconds.
class WallClockTimer {
public:
    struct timeval t1, t2;

    // Starts the stopwatch at construction time.
    WallClockTimer() :
        t1(), t2() {
        reset();
    }

    // Restarts the stopwatch: both marks are set to the current time.
    void reset() {
        gettimeofday(&t1, 0);
        t2 = t1;
    }

    // Microseconds between the start mark and the last split mark.
    uint64_t elapsed() {
        const uint64_t whole_seconds = static_cast<uint64_t>(t2.tv_sec - t1.tv_sec);
        const uint64_t extra_micros = static_cast<uint64_t>(t2.tv_usec - t1.tv_usec);
        return whole_seconds * 1000000ULL + extra_micros;
    }

    // Moves the split mark to the current time and returns elapsed().
    uint64_t split() {
        gettimeofday(&t2, 0);
        return elapsed();
    }
};
38 | 
39 | /**
40 |  *  author: Daniel Lemire
41 |  */
// Stopwatch over the process CPU time reported by getrusage(RUSAGE_SELF):
// t1 is the start mark and t2 the most recent split mark. All durations are
// reported in microseconds.
class CPUTimer {
public:
    struct rusage t1,t2;

    // Starts the stopwatch at construction time.
    CPUTimer() :
        t1(), t2() {
        reset();
    }

    // Restarts the stopwatch: both marks are set to the current usage.
    void reset() {
        getrusage(RUSAGE_SELF, &t1);
        t2 = t1;
    }

    // Same value as totalelapsed().
    uint64_t elapsed() {
        return totalelapsed();
    }

    // User plus system CPU time between the two marks.
    uint64_t totalelapsed() {
        const uint64_t user_part = userelapsed();
        const uint64_t system_part = systemelapsed();
        return user_part + system_part;
    }

    // returns the *user* CPU time in micro seconds (mu s)
    uint64_t userelapsed() {
        return micros_between(t1.ru_utime, t2.ru_utime);
    }

    // returns the *system* CPU time in micro seconds (mu s)
    uint64_t systemelapsed() {
        return micros_between(t1.ru_stime, t2.ru_stime);
    }

    // Moves the split mark to the current usage and returns elapsed().
    uint64_t split() {
        getrusage(RUSAGE_SELF, &t2);
        return elapsed();
    }

private:
    // Microseconds from `from` to `to`.
    static uint64_t micros_between(const struct timeval &from,
                                   const struct timeval &to) {
        const uint64_t whole_seconds = static_cast<uint64_t>(to.tv_sec - from.tv_sec);
        const uint64_t extra_micros = static_cast<uint64_t>(to.tv_usec - from.tv_usec);
        return whole_seconds * 1000000ULL + extra_micros;
    }
};
82 | 
83 | #endif
84 | 
85 | 


--------------------------------------------------------------------------------
/include/fastscancount.h:
--------------------------------------------------------------------------------
  1 | #ifndef FASTSCANCOUNT_H
  2 | #define FASTSCANCOUNT_H
  3 | 
  4 | #include <algorithm>
  5 | #include <cstddef>
  6 | #include <cstdint>
  7 | #include <cstring>
  8 | #include <vector>
  9 | 
 10 | // credit: implementation and design by Nathan Kurz and Daniel Lemire
 11 | 
 12 | namespace fastscancount {
 13 | 
 14 | namespace {
 15 | 
 16 | // used by natefastscancount
 17 | uint32_t *natefastscancount_maincheck(uint8_t *counters, size_t &it,
 18 |                                       const uint32_t *d, size_t start,
 19 |                                       size_t range, uint8_t threshold,
 20 |                                       uint32_t *out) {
 21 |   range += start;
 22 |   counters -= start;
 23 |   size_t i = it;
 24 |   for (uint32_t val = d[i]; val < range; val = d[++i]) {
 25 |     uint8_t c = counters[val];
 26 |     if (c == threshold) *out++ = val;
 27 |     counters[val] = c + 1;
 28 |   }
 29 |   it = i;
 30 |   return out;
 31 | }
 32 | 
 33 | // used by natefastscancount
 34 | uint32_t *natefastscancount_finalcheck(uint8_t *counters, size_t &it,
 35 |                                        const uint32_t *d, size_t start,
 36 |                                        size_t itend, uint8_t threshold,
 37 |                                        uint32_t *out) {
 38 |   uint8_t *const deccounters = counters - start;
 39 |   size_t i = it;
 40 |   for (; i < itend; i++) {
 41 |     uint32_t val = d[i];
 42 |     uint8_t *location = deccounters + val;
 43 |     uint8_t c = *location;
 44 |     if (c == threshold) {
 45 |       *out++ = val;
 46 |     }
 47 |     *location = c + 1;
 48 |   }
 49 |   it = i;
 50 |   return out;
 51 | }
 52 | } // namespace
 53 | 
 54 | void fastscancount(const std::vector<const std::vector<uint32_t>*> &data,
 55 |                    std::vector<uint32_t> &out, uint8_t threshold) {
 56 |   size_t cache_size = 65536;
 57 |   size_t range = cache_size;
 58 |   std::vector<uint8_t> counters(cache_size);
 59 |   size_t ds = data.size();
 60 |   out.resize( 4 * range); // let us add lots of capacity
 61 |   uint32_t *output = out.data();
 62 |   uint32_t *initout = out.data();
 63 |   std::vector<size_t> iters(ds);
 64 |   size_t countsofar = 0;
 65 |   uint32_t largest = 0;
 66 |   for (size_t c = 0; c < ds; c++) {
 67 |     if (largest < (*data[c])[data[c]->size() - 1])
 68 |       largest = (*data[c])[data[c]->size() - 1];
 69 |   }
 70 |   // we are assuming that all vectors in data are non-empty
 71 |   for (size_t start = 0; start < largest; start += range) {
 72 |     // make sure that the capacity is sufficient
 73 |     countsofar = output - initout;
 74 |     if (out.size() - countsofar < range) {
 75 |       out.resize(out.size() + 4 * range);
 76 |       initout = out.data();
 77 |       output = out.data() + countsofar;
 78 |     }
 79 |     memset(counters.data(), 0, range);
 80 |     for (size_t c = 0; c < ds; c++) {
 81 |       size_t it = iters[c]; // recover where we were
 82 |       const std::vector<uint32_t> &d = *data[c];
 83 |       const size_t itend = d.size();
 84 |       if (it == itend) // check that there is data to be processed
 85 |         continue;      // exhausted
 86 |       // check if we need to be careful:
 87 |       bool near_the_end = (d[itend - 1] < start + range);
 88 |       if (near_the_end) {
 89 |         output = natefastscancount_finalcheck(counters.data(), it, d.data(),
 90 |                                               start, itend, threshold, output);
 91 |       } else {
 92 |         output = natefastscancount_maincheck(counters.data(), it, d.data(),
 93 |                                              start, range, threshold, output);
 94 |       }
 95 |       iters[c] = it; // store it for next round
 96 |     }
 97 |   }
 98 |   countsofar = output - initout;
 99 |   out.resize(countsofar);
100 | }
101 | } // namespace fastscancount
102 | 
103 | #endif
104 | 


--------------------------------------------------------------------------------
/include/fastscancount_avx2.h:
--------------------------------------------------------------------------------
  1 | #ifndef FASTSCANCOUNT_AVX2_H
  2 | #define FASTSCANCOUNT_AVX2_H
  3 | 
  4 | // this code expects an x64 processor with AVX2
  5 | 
  6 | #ifdef _MSC_VER
  7 | #include <intrin.h>
  8 | #else
  9 | #include <x86intrin.h>
 10 | #endif
 11 | 
 12 | #include <algorithm>
 13 | #include <cstddef>
 14 | #include <cstdint>
 15 | #include <cstring>
 16 | #include <vector>
 17 | 
 18 | namespace fastscancount {
 19 | namespace {
 20 | // credit: implementation and design by Travis Downes
 21 | static inline size_t find_next_gt(uint8_t *array, const size_t size,
 22 |                                   const uint8_t threshold) {
 23 |   size_t vsize = size / 32;
 24 |   __m256i *varray = (__m256i *)array;
 25 |   const __m256i comprand = _mm256_set1_epi8(threshold);
 26 |   int bits = 0;
 27 | 
 28 |   for (size_t i = 0; i < vsize; i++) {
 29 |     __m256i v = _mm256_loadu_si256(varray + i);
 30 |     __m256i cmp = _mm256_cmpgt_epi8(v, comprand);
 31 |     if ((bits = _mm256_movemask_epi8(cmp))) {
 32 |       return i * 32 + __builtin_ctz(bits);
 33 |     }
 34 |   }
 35 | 
 36 |   // tail handling
 37 |   for (size_t i = vsize * 32; i < size; i++) {
 38 |     auto v = array[i];
 39 |     if (v > threshold)
 40 |       return i;
 41 |   }
 42 | 
 43 |   return SIZE_MAX;
 44 | }
 45 | 
 46 | void populate_hits_avx(std::vector<uint8_t> &counters, size_t range,
 47 |                        size_t threshold, size_t start,
 48 |                        std::vector<uint32_t> &out) {
 49 |   uint8_t *array = counters.data();
 50 | 
 51 |   size_t ro = range;
 52 |   while (true) {
 53 |     size_t next = find_next_gt(array, range, (uint8_t)threshold);
 54 |     if (next == SIZE_MAX)
 55 |       break;
 56 |     out.push_back(start + next);
 57 |     range -= (next + 1);
 58 |     array += (next + 1);
 59 |     start += (next + 1);
 60 |   }
 61 | }
 62 | 
 63 | void update_counters(const uint32_t *&it_, uint8_t *counters,
 64 |                      uint32_t range_end) {
 65 |   const uint32_t *it = it_;
 66 |   for (uint32_t e; (e = *it) < range_end; ++it) {
 67 |     counters[e]++;
 68 |   }
 69 |   it_ = it;
 70 | }
 71 | 
 72 | void update_counters_final(const uint32_t *&it_, const uint32_t *end,
 73 |                            uint8_t *counters) {
 74 |   uint64_t e;
 75 |   const uint32_t *it = it_;
 76 |   for (; it != end; it++) {
 77 |     counters[*it]++;
 78 |   }
 79 |   it_ = end;
 80 | }
 81 | } // namespace
 82 | 
 83 | void fastscancount_avx2(const std::vector<const std::vector<uint32_t>*> &data,
 84 |                         std::vector<uint32_t> &out, uint8_t threshold) {
 85 |   const size_t cache_size = 40000;
 86 |   std::vector<uint8_t> counters(cache_size);
 87 |   out.clear();
 88 |   const size_t dsize = data.size();
 89 | 
 90 |   struct data_info {
 91 |     const uint32_t *cur; // current pointer into data
 92 |     const uint32_t *end; // pointer to end
 93 |     uint32_t last;       // value of last element
 94 |     data_info(const uint32_t *cur, const uint32_t *end, uint32_t last)
 95 |         : cur{cur}, end{end}, last{last} {}
 96 |   };
 97 | 
 98 |   std::vector<data_info> iter_data;
 99 |   iter_data.reserve(dsize);
100 |   for (auto &d : data) {
101 |     iter_data.emplace_back(d->data(), d->data() + d->size(), d->back());
102 |   }
103 | 
104 |   uint32_t largest = 0;
105 |   for (size_t c = 0; c < data.size(); c++) {
106 |     if (largest < (*data[c])[data[c]->size() - 1])
107 |       largest = (*data[c])[data[c]->size() - 1];
108 |   }
109 |   auto cdata = counters.data();
110 |   for (uint32_t start = 0; start < largest; start += cache_size) {
111 |     memset(cdata, 0, cache_size * sizeof(counters[0]));
112 |     for (auto &id : iter_data) {
113 |       // determine if the loop will end because we get to the end of
114 |       // data, or because we get to the end of the range
115 |       if (__builtin_expect(id.last >= start + cache_size, 1)) {
116 |         // the iteration is guaranteed to end because an element becomes >=
117 |         // range_end, so we don't need to check for end of data
118 |         update_counters(id.cur, cdata - start, start + cache_size);
119 |       } else {
120 |         // the iteration is guaranteed to end because we get to the end of the
121 |         // data
122 |         update_counters_final(id.cur, id.end, cdata - start);
123 |       }
124 |     }
125 | 
126 |     populate_hits_avx(counters, cache_size, threshold, start, out);
127 |   }
128 | }
129 | 
130 | } // namespace fastscancount
131 | #endif
132 | 


--------------------------------------------------------------------------------
/include/fastscancount_avx512.h:
--------------------------------------------------------------------------------
  1 | #ifndef FASTSCANCOUNT_AVX512_H
  2 | #define FASTSCANCOUNT_AVX512_H
  3 | 
  4 | // this code expects an x64 processor with AVX-512F and AVX-512BW
  5 | 
  6 | #ifdef _MSC_VER
  7 | #include <intrin.h>
  8 | #else
  9 | #include <x86intrin.h>
 10 | #endif
 11 | 
 12 | #include <algorithm>
 13 | #include <cstddef>
 14 | #include <cstdint>
 15 | #include <cstring>
 16 | #include <vector>
 17 | #include <stdexcept>
 18 | 
 19 | namespace fastscancount {
 20 | namespace {
 21 | 
 22 | // credit: inspired by 256-bit implementation of Travis Downes
 23 | void populate_hits_avx512(std::vector<uint8_t> &counters, size_t range,
 24 |                        size_t threshold, size_t start,
 25 |                        std::vector<uint32_t> &out) {
 26 |   uint8_t *array = counters.data();
 27 | 
 28 |   size_t vsize = range / 64;
 29 |   __m512i *varray = (__m512i *)array;
 30 |   const __m512i comprand = _mm512_set1_epi8(threshold);
 31 | 
 32 |   for (size_t i = 0; i < vsize; i++) {
 33 |     size_t start_add = start + i*64;
 34 |     __m512i v = _mm512_loadu_si512(varray + i);
 35 |     uint64_t bits = _mm512_cmpgt_epi8_mask(v, comprand);
 36 |     while (bits) {
 37 |       unsigned zqty = __builtin_ctzll(bits);
 38 |       bits >>= zqty; 
 39 |       bits >>= 1; // If zqty = 63, shift by 64 is not defined, need to split shifts
 40 |       out.push_back(start_add + zqty);
 41 |       start_add += zqty + 1;
 42 |     }
 43 |   }
 44 | 
 45 |   for (size_t i = vsize * 64; i < range; i++) {
 46 |     auto v = array[i];
 47 |     if (v > threshold)
 48 |       out.push_back(start + i);
 49 |   }
 50 | 
 51 | }
 52 | 
 53 | void update_counters_avx512(const uint32_t  *&it_, const uint32_t  *end,
 54 |                             uint8_t *counters, 
 55 |                             const size_t shift) {
 56 | 
 57 |   if (it_ > end) {
 58 |     throw std::runtime_error("Bug: start > end");
 59 |   }
 60 |   size_t qty = end - it_;
 61 |   size_t vsize = qty / 16;
 62 | 
 63 |   __m512i *varray = (__m512i *)it_;
 64 |   const __m512i add1 = _mm512_set1_epi32(1);
 65 |   const __m512i shift_vect = _mm512_set1_epi32(shift);
 66 | 
 67 |   const __mmask64 blend_mask = 0x1111111111111111ull;
 68 | 
 69 |   for (unsigned i = 0; i < vsize; ++i) {
 70 |     __m512i indx = _mm512_sub_epi32(_mm512_loadu_si512(varray + i), shift_vect);
 71 |     __m512i v_orig = _mm512_i32gather_epi32(indx, (const int*)counters, 1);
 72 |     // Note: works correctly only if counters never overflow
 73 |     // First, we increment counters.
 74 |     __m512i v_inc = _mm512_add_epi32(v_orig, add1);
 75 |     // Then, we will blend by keeping three higher-order bytes in each 32-bit word unmodified
 76 |     // When 32-bit words overlap, the gather operation would first write the old values of the word
 77 |     // then it will overwrite them with new values. So, this should work just fine.
 78 |     __m512i v = _mm512_mask_blend_epi8(blend_mask, v_orig, v_inc);
 79 |     _mm512_i32scatter_epi32((int*)counters, indx, v, 1);
 80 |   }
 81 | 
 82 |   // tail processing
 83 |   const uint32_t  *it = it_ + vsize * 16;
 84 |   for (; it != end; it++) {
 85 |     counters[*it-shift]++;
 86 |   }
 87 |   it_ = end;
 88 | }
 89 | 
 90 | 
 91 | } // namespace
 92 | 
 93 | void fastscancount_avx512(uint32_t cache_size,
 94 |                           const std::vector<const std::vector<uint32_t>*> &data,
 95 |                           const std::vector<const std::vector<uint32_t>*> &range_ends,
 96 |                           std::vector<uint32_t> &out, uint8_t threshold) {
 97 |   std::vector<uint8_t> counters(cache_size);
 98 |   out.clear();
 99 |   const size_t dsize = data.size();
100 |   if (!dsize) {
101 |     return;
102 |   }
103 |   if (dsize != range_ends.size()) {
104 |     throw std::runtime_error("Invalid input: non-matching sizes between data and range_ends");
105 |   }
106 | 
107 |   unsigned range_qty = range_ends[0]->size();
108 |   for (unsigned i = 1; i < dsize; ++i) {
109 |     if (range_ends[i]->size() != range_qty) {
110 |       throw std::runtime_error("Invalid input: different range sizes for different data arrays!");
111 |     }
112 |   }
113 | 
114 |   auto cdata = counters.data();
115 | 
116 |   std::vector<const uint32_t*> it(dsize);
117 |   for (unsigned k = 0; k < dsize; ++k) {
118 |     const auto& v = *data[k];  
119 |     if (!v.empty()) {
120 |       it[k] = &v[0];
121 |     }
122 |   }
123 | 
124 |   for (unsigned i = 0; i < range_qty; ++i) {
125 |     memset(cdata, 0, cache_size * sizeof(counters[0]));
126 |     uint32_t start = i * cache_size;
127 |     for (unsigned k = 0; k < dsize; ++k) {
128 |       const std::vector<uint32_t>& v = *data[k];
129 |       const std::vector<uint32_t>& r = *range_ends[k];
130 |       update_counters_avx512(it[k], &v[0] + r[i], cdata, start);
131 |     }
132 | 
133 |     populate_hits_avx512(counters, cache_size, threshold, start, out);
134 |   }
135 | }
136 | 
137 | } // namespace fastscancount
138 | #endif
139 | 


--------------------------------------------------------------------------------