├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── benchmark ├── counters.cpp ├── linux-perf-events-wrapper.h ├── linux-perf-events.h ├── maropuparser.h └── ztimer.h └── include ├── fastscancount.h ├── fastscancount_avx2.h └── fastscancount_avx512.h /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Compiled Object files 5 | *.slo 6 | *.lo 7 | *.o 8 | *.obj 9 | 10 | # Precompiled Headers 11 | *.gch 12 | *.pch 13 | 14 | # Compiled Dynamic libraries 15 | *.so 16 | *.dylib 17 | *.dll 18 | 19 | # Fortran module files 20 | *.mod 21 | *.smod 22 | 23 | # Compiled Static libraries 24 | *.lai 25 | *.la 26 | *.a 27 | *.lib 28 | 29 | # Executables 30 | *.exe 31 | *.out 32 | *.app 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | OPT := -O3 2 | # Leo really doubts -mavx2 helps anything, but one can 3 | # disable avx512 tests by enforcing -mavx2 4 | #CXXFLAGS := -std=c++17 $(OPT) -mavx2 5 | CXXFLAGS := -std=c++17 $(OPT) -march=native 6 | 7 | counter: benchmark/counters.cpp include/*.h Makefile 8 | $(CXX) $(CXXFLAGS) $(CXXEXTRA) -o counter benchmark/counters.cpp -Ibenchmark -Iinclude 9 | 10 | clean: 11 | rm -f counter 12 | 13 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # fastscancount 2 | Fast implementations of the scancount algorithm 3 | 4 | 5 | Given a set of arrays of integers, we seek to identify 6 | all values that occur more than 'threshold' times. We do so using the 7 | 'scancount' algorithm. 
It is assumed 8 | that you have fewer than 256 arrays of integers and that the threshold is no larger than 254. 9 | 10 | We are effectively providing optimized versions of the following function: 11 | 12 | ```C++ 13 | void scancount(std::vector<std::vector<uint32_t>> &data, std::vector<uint32_t> &out, uint8_t threshold) { 14 | std::fill(counters.begin(), counters.end(), 0); 15 | out.clear(); 16 | for (size_t c = 0; c < data.size(); c++) { 17 | std::vector<uint32_t> &v = data[c]; 18 | for (size_t i = 0; i < v.size(); i++) { 19 | counters[v[i]]++; 20 | } 21 | } 22 | for (uint32_t i = 0; i < counters.size(); i++) { 23 | if (counters[i] > threshold) 24 | out.push_back(i); 25 | } 26 | } 27 | ``` 28 | 29 | Our optimized versions assume that your arrays are made of sorted integers. 30 | 31 | There are two headers. The first, `fastscancount.h`, uses plain C++ and should 32 | be portable. It has one main function in the fastscancount namespace. 33 | We always write the result to 'out'. 34 | 35 | ```C++ 36 | void fastscancount(std::vector<std::vector<uint32_t>> &data, 37 | std::vector<uint32_t> &out, uint8_t threshold) 38 | ``` 39 | 40 | There is another header `fastscancount_avx2.h` 41 | which expects an x64 processor supporting the AVX2 instruction set. 42 | It has a similar function signature: 43 | 44 | ```C++ 45 | void fastscancount_avx2(std::vector<std::vector<uint32_t>> &data, 46 | std::vector<uint32_t> &out, uint8_t threshold) 47 | ``` 48 | 49 | The AVX2 version assumes that you have fewer than 128 arrays of integers. 50 | 51 | Because this library is made solely of headers, there is no 52 | need for a build system. 53 | 54 | ## Linux benchmark 55 | 56 | If you have bare metal access to a Linux box, you can run cycle-accurate benchmarks. 
57 | 58 | ``` 59 | make 60 | ./counter 61 | ``` 62 | 63 | Sample output with GNU GCC 8.3: 64 | 65 | ``` 66 | $ ./counter 67 | Got 2497 hits 68 | optimized cache-sensitive scancount 69 | 4.01381 cycles/element 70 | AVX2-based scancount 71 | 3.58494 cycles/element 72 | ``` 73 | 74 | With LLVM clang, we seem to get better results: 75 | 76 | ``` 77 | $ ./counter 78 | Got 2497 hits 79 | optimized cache-sensitive scancount 80 | 3.54267 cycles/element 81 | 2.8908 instructions/cycles 82 | 0.0134279 miss/element 83 | AVX2-based scancount 84 | 3.57374 cycles/element 85 | 2.03391 instructions/cycles 86 | 0.0109755 miss/element 87 | ``` 88 | 89 | ## Blog post 90 | 91 | [How fast can scancount be?](http://lemire.me/blog/2019/08/30/how-fast-can-scancount-be/ ) 92 | 93 | ## Using actual data 94 | 95 | ``` 96 | ./counter --postings data/postings.bin --queries data/queries.bin --threshold 3 97 | ``` 98 | 99 | ## Credit 100 | 101 | The AVX2 version was designed and implemented by Travis Downs. 102 | The scalar version was designed and implemented by Daniel Lemire based on ideas by Nathan Kurz, Travis Downs and others. 
103 | 104 | ## Reference 105 | 106 | 107 | Owen Kaser, Daniel Lemire, [Compressed bitmap indexes: beyond unions and intersections](https://arxiv.org/abs/1402.4466), Software: Practice and Experience 46 (2), 2016 108 | -------------------------------------------------------------------------------- /benchmark/counters.cpp: -------------------------------------------------------------------------------- 1 | // Fine-grained statistics is available only on Linux 2 | #include "fastscancount.h" 3 | #include "ztimer.h" 4 | #ifdef __AVX2__ 5 | #include "fastscancount_avx2.h" 6 | #endif 7 | #ifdef __AVX512F__ 8 | #include "fastscancount_avx512.h" 9 | #endif 10 | #include "linux-perf-events-wrapper.h" 11 | #include "maropuparser.h" 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #define REPEATS 10 21 | #define RUNNINGTESTS 22 | 23 | void scancount(const std::vector*> &data, 24 | std::vector &out, size_t threshold) { 25 | uint64_t largest = 0; 26 | for(auto z : data) { 27 | const std::vector & v = *z; 28 | if(v[v.size() - 1] > largest) largest = v[v.size() - 1]; 29 | } 30 | std::vector counters(largest+1); 31 | out.clear(); 32 | for (size_t c = 0; c < data.size(); c++) { 33 | const std::vector &v = *data[c]; 34 | for (size_t i = 0; i < v.size(); i++) { 35 | counters[v[i]]++; 36 | } 37 | } 38 | for (uint32_t i = 0; i < counters.size(); i++) { 39 | if (counters[i] > threshold) 40 | out.push_back(i); 41 | } 42 | } 43 | 44 | void calc_boundaries(uint32_t largest, uint32_t range_size, 45 | const std::vector& data, 46 | std::vector& range_ends) { 47 | if (!range_size) { 48 | throw std::runtime_error("range_size must be > 0"); 49 | } 50 | uint32_t end = 0; 51 | range_ends.clear(); 52 | 53 | for (uint32_t start = 0; start <= largest; start += range_size) { 54 | uint32_t curr_max = std::min(largest, start + range_size - 1); 55 | while (end < data.size() && data[end] <= curr_max) { 56 | end++; 57 | } 58 | 
range_ends.push_back(end); 59 | } 60 | } 61 | 62 | const uint32_t range_size_avx512 = 40000; 63 | 64 | void calc_alldata_boundaries(const std::vector>& data, 65 | std::vector>& range_ends, 66 | size_t range_size) { 67 | uint32_t largest = 0; 68 | range_ends.clear(); 69 | range_ends.resize(data.size()); 70 | for(const auto& v : data) { 71 | if (!v.empty() && v[v.size() - 1] > largest) largest = v[v.size() - 1]; 72 | } 73 | for (unsigned i = 0; i < data.size(); ++i) { 74 | calc_boundaries(largest, range_size, data[i], range_ends[i]); 75 | } 76 | } 77 | 78 | template 79 | void test(F f, const std::vector*>& data_ptrs, 80 | std::vector& answer, unsigned threshold, const std::string &name) { 81 | scancount(data_ptrs, answer, threshold); 82 | size_t s1 = answer.size(); 83 | auto a1 (answer); 84 | std::sort(a1.begin(), a1.end()); 85 | answer.clear(); 86 | f(); 87 | size_t s2 = answer.size(); 88 | auto a2 (answer); 89 | std::sort(a2.begin(), a2.end()); 90 | if (a1 != a2) { 91 | std::cout << "s1: " << s1 << " s2: " << s2 << std::endl; 92 | for(size_t j = 0; j < std::min(s1, s2); j++) { 93 | std::cout << j << " " << a1[j] << " vs " << a2[j] ; 94 | 95 | if(a1[j] != a2[j]) std::cout << " oh oh "; 96 | std::cout << std::endl; 97 | } 98 | throw std::runtime_error("bug: " + name); 99 | } 100 | } 101 | 102 | template 103 | void bench(F f, const std::string &name, 104 | LinuxEventsWrapper &unified, 105 | float& elapsed, 106 | std::vector &answer, size_t sum, size_t expected, 107 | bool print) { 108 | WallClockTimer tm; 109 | unified.start(); 110 | f(); 111 | unified.end(); 112 | elapsed += tm.split(); 113 | if (answer.size() != expected) 114 | std::cerr << "bug: expected " << expected << " but got " << answer.size() 115 | << "\n"; 116 | #ifdef __linux__ 117 | if (print) { 118 | double cycles = unified.get_result(PERF_COUNT_HW_CPU_CYCLES); 119 | double instructions = unified.get_result(PERF_COUNT_HW_INSTRUCTIONS); 120 | double misses = 
unified.get_result(PERF_COUNT_HW_BRANCH_MISSES); 121 | std::cout << name << std::endl; 122 | std::cout << cycles / sum << " cycles/element " << std::endl; 123 | std::cout << instructions / cycles << " instructions/cycles " << std::endl; 124 | std::cout << misses / sum << " miss/element " << std::endl; 125 | } 126 | #endif 127 | } 128 | 129 | void demo_data(const std::vector>& data, 130 | const std::vector>& queries, 131 | size_t threshold) { 132 | size_t N = 0; 133 | for (const auto& data_elem : data) { 134 | size_t sz = data_elem.size(); 135 | if (sz) { 136 | N = std::max(N, (size_t)data_elem[sz-1] + 1); 137 | } 138 | } 139 | 140 | std::vector answer; 141 | answer.reserve(N); 142 | 143 | std::vector evts = { 144 | #ifdef __linux__ 145 | PERF_COUNT_HW_CPU_CYCLES, 146 | PERF_COUNT_HW_INSTRUCTIONS, 147 | PERF_COUNT_HW_BRANCH_MISSES, 148 | PERF_COUNT_HW_CACHE_REFERENCES, 149 | PERF_COUNT_HW_CACHE_MISSES 150 | #endif 151 | }; 152 | LinuxEventsWrapper unified(evts); 153 | 154 | std::vector> range_boundaries; 155 | calc_alldata_boundaries(data, range_boundaries, range_size_avx512); 156 | 157 | std::vector*> data_ptrs; 158 | std::vector*> range_ptrs; 159 | 160 | float elapsed = 0, elapsed_fast = 0, elapsed_avx = 0, elapsed_avx512 = 0; 161 | 162 | size_t sum_total = 0; 163 | 164 | for (size_t qid = 0; qid < queries.size(); ++qid) { 165 | const auto& query_elem = queries[qid]; 166 | data_ptrs.clear(); 167 | range_ptrs.clear(); 168 | size_t sum = 0; 169 | for (uint32_t idx : query_elem) { 170 | if (idx >= data.size()) { 171 | std::stringstream err; 172 | err << "Inconsistent data, posting " << idx << 173 | " is >= # of postings " << data.size() << " query id " << qid; 174 | throw std::runtime_error(err.str()); 175 | } 176 | sum += data[idx].size(); 177 | data_ptrs.push_back(&data[idx]); 178 | range_ptrs.push_back(&range_boundaries[idx]); 179 | } 180 | sum_total += sum; 181 | 182 | scancount(data_ptrs, answer, threshold); 183 | const size_t expected = answer.size(); 184 | 185 
| #ifdef RUNNINGTESTS 186 | test( 187 | [&](){ 188 | fastscancount::fastscancount(data_ptrs, answer, threshold); 189 | }, data_ptrs, answer, threshold, "fastscancount" 190 | ); 191 | #ifdef __AVX2__ 192 | test( 193 | [&](){ 194 | fastscancount::fastscancount_avx2(data_ptrs, answer, threshold); 195 | }, data_ptrs, answer, threshold, "fastscancount_avx2" 196 | ); 197 | #endif 198 | 199 | #ifdef __AVX512F__ 200 | test( 201 | [&](){ 202 | fastscancount::fastscancount_avx512(range_size_avx512, data_ptrs, range_ptrs, answer, threshold); 203 | }, data_ptrs, answer, threshold, "fastscancount_avx512" 204 | ); 205 | #endif 206 | 207 | #endif 208 | std::cout << "Qid: " << qid << " got " << expected << " hits\n"; 209 | 210 | bool last = (qid == queries.size() - 1); 211 | 212 | bench( 213 | [&]() { 214 | scancount(data_ptrs, answer, threshold); 215 | }, 216 | "baseline scancount", unified, elapsed, answer, sum, 217 | expected, last); 218 | 219 | bench( 220 | [&]() { 221 | fastscancount::fastscancount(data_ptrs, answer, threshold); 222 | }, 223 | "optimized cache-sensitive scancount", unified, elapsed_fast, answer, sum, 224 | expected, last); 225 | #ifdef __AVX2__ 226 | bench( 227 | [&]() { 228 | fastscancount::fastscancount_avx2(data_ptrs, answer, threshold); 229 | }, 230 | "AVX2-based scancount", unified, elapsed_avx, answer, sum, expected, last); 231 | #endif 232 | #ifdef __AVX512F__ 233 | bench( 234 | [&]() { 235 | fastscancount::fastscancount_avx512(range_size_avx512, data_ptrs, range_ptrs, answer, threshold); 236 | }, 237 | "AVX512-based scancount", unified, elapsed_avx512, answer, sum, expected, last); 238 | #endif 239 | } 240 | std::cout << "Elems per millisecond:" << std::endl; 241 | std::cout << "scancount: " << (sum_total/(elapsed/1e3)) << std::endl; 242 | std::cout << "fastscancount: " << (sum_total/(elapsed_fast/1e3)) << std::endl; 243 | #ifdef __AVX2__ 244 | std::cout << "fastscancount_avx2: " << (sum_total/(elapsed_avx/1e3)) << std::endl; 245 | #endif 246 | #ifdef 
__AVX512F__ 247 | std::cout << "fastscancount_avx512: " << (sum_total/(elapsed_avx512/1e3)) << std::endl; 248 | #endif 249 | 250 | } 251 | 252 | 253 | void demo_random(size_t N, size_t length, size_t array_count, size_t threshold) { 254 | std::vector> data(array_count); 255 | 256 | std::vector*> data_ptrs; 257 | std::vector answer; 258 | answer.reserve(N); 259 | 260 | size_t sum = 0; 261 | for (size_t c = 0; c < array_count; c++) { 262 | std::vector &v = data[c]; 263 | for (size_t i = 0; i < length; i++) { 264 | v.push_back(rand() % N); 265 | } 266 | std::sort(v.begin(), v.end()); 267 | v.resize(std::distance(v.begin(), unique(v.begin(), v.end()))); 268 | sum += v.size(); 269 | data_ptrs.push_back(&data[c]); 270 | } 271 | 272 | 273 | std::vector> range_boundaries; 274 | calc_alldata_boundaries(data, range_boundaries, range_size_avx512); 275 | std::vector*> range_ptrs; 276 | for (size_t c = 0; c < array_count; c++) { 277 | range_ptrs.push_back(&range_boundaries[c]); 278 | } 279 | 280 | std::vector evts = { 281 | #ifdef __linux__ 282 | PERF_COUNT_HW_CPU_CYCLES, 283 | PERF_COUNT_HW_INSTRUCTIONS, 284 | PERF_COUNT_HW_BRANCH_MISSES, 285 | PERF_COUNT_HW_CACHE_REFERENCES, 286 | PERF_COUNT_HW_CACHE_MISSES 287 | #endif 288 | }; 289 | LinuxEventsWrapper unified(evts); 290 | float elapsed = 0, elapsed_fast = 0, elapsed_avx = 0, elapsed_avx512 = 0; 291 | scancount(data_ptrs, answer, threshold); 292 | const size_t expected = answer.size(); 293 | std::cout << "Got " << expected << " hits\n"; 294 | size_t sum_total = sum * REPEATS; 295 | for (size_t t = 0; t < REPEATS; t++) { 296 | bool last = (t == REPEATS - 1); 297 | 298 | bench( 299 | [&]() { 300 | scancount(data_ptrs, answer, threshold); 301 | }, 302 | "baseline scancount", unified, elapsed, answer, sum, 303 | expected, last); 304 | } 305 | 306 | for (size_t t = 0; t < REPEATS; t++) { 307 | bool last = (t == REPEATS - 1); 308 | 309 | #ifdef RUNNINGTESTS 310 | test( 311 | [&](){ 312 | fastscancount::fastscancount(data_ptrs, 
answer, threshold); 313 | }, data_ptrs, answer, threshold, "fastscancount" 314 | ); 315 | #endif 316 | 317 | bench( 318 | [&]() { 319 | fastscancount::fastscancount(data_ptrs, answer, threshold); 320 | }, 321 | "optimized cache-sensitive scancount", unified, elapsed_fast, answer, sum, 322 | expected, last); 323 | } 324 | 325 | for (size_t t = 0; t < REPEATS; t++) { 326 | bool last = (t == REPEATS - 1); 327 | 328 | #ifdef __AVX2__ 329 | #ifdef RUNNINGTESTS 330 | test( 331 | [&](){ 332 | fastscancount::fastscancount_avx2(data_ptrs, answer, threshold); 333 | }, data_ptrs, answer, threshold, "fastscancount_avx2" 334 | ); 335 | #endif 336 | bench( 337 | [&]() { 338 | fastscancount::fastscancount_avx2(data_ptrs, answer, threshold); 339 | }, 340 | "AVX2-based scancount", unified, elapsed_avx, answer, sum, expected, last); 341 | #endif 342 | } 343 | 344 | for (size_t t = 0; t < REPEATS; t++) { 345 | bool last = (t == REPEATS - 1); 346 | #ifdef __AVX512F__ 347 | #ifdef RUNNINGTESTS 348 | test( 349 | [&](){ 350 | fastscancount::fastscancount_avx512(range_size_avx512, data_ptrs, range_ptrs, answer, threshold); 351 | }, data_ptrs, answer, threshold, "fastscancount_avx512" 352 | ); 353 | #endif 354 | 355 | bench( 356 | [&]() { 357 | fastscancount::fastscancount_avx512(range_size_avx512, data_ptrs, range_ptrs, answer, threshold); 358 | }, 359 | "AVX512-based scancount", unified, elapsed_avx512, answer, sum, expected, last); 360 | #endif 361 | } 362 | 363 | std::cout << "Elems per millisecond:" << std::endl; 364 | std::cout << "scancount: " << (sum_total/(elapsed/1e3)) << std::endl; 365 | std::cout << "fastscancount: " << (sum_total/(elapsed_fast/1e3)) << std::endl; 366 | #ifdef __AVX2__ 367 | std::cout << "fastscancount_avx2: " << (sum_total/(elapsed_avx/1e3)) << std::endl; 368 | #endif 369 | #ifdef __AVX512F__ 370 | std::cout << "fastscancount_avx512: " << (sum_total/(elapsed_avx512/1e3)) << std::endl; 371 | #endif 372 | } 373 | 374 | void usage(const std::string& err="") { 375 
| if (!err.empty()) { 376 | std::cerr << err << std::endl; 377 | } 378 | std::cerr << "usage: --postings --queries --threshold " << std::endl; 379 | } 380 | 381 | int main(int argc, char *argv[]) { 382 | // A very naive way to process arguments, 383 | // but it's ok unless we need to extend it substantially. 384 | if (argc != 1) { 385 | if (argc != 7) { 386 | usage(""); 387 | return EXIT_FAILURE; 388 | } 389 | std::string postings_file, queries_file; 390 | int threshold = -1; 391 | for (int i = 1; i < argc; ++i) { 392 | if (std::string(argv[i]) == "--postings") { 393 | postings_file = argv[++i]; 394 | } else if (std::string(argv[i]) == "--queries") { 395 | queries_file = argv[++i]; 396 | } else if (std::string(argv[i]) == "--threshold") { 397 | threshold = std::atoi(argv[++i]); 398 | } 399 | } 400 | if (postings_file.empty() || queries_file.empty() || threshold < 0) { 401 | usage("Specify queries, postings, and the threshold!"); 402 | return EXIT_FAILURE; 403 | } 404 | std::vector tmp; 405 | std::vector> data; 406 | { 407 | MaropuGapReader drdr(postings_file); 408 | if (!drdr.open()) { 409 | usage("Cannot open: " + postings_file); 410 | return EXIT_FAILURE; 411 | } 412 | while (drdr.loadIntegers(tmp)) { 413 | data.push_back(tmp); 414 | } 415 | } 416 | std::vector> queries; 417 | { 418 | MaropuGapReader qrdr(queries_file); 419 | if (!qrdr.open()) { 420 | usage("Cannot open: " + queries_file); 421 | return EXIT_FAILURE; 422 | } 423 | while (qrdr.loadIntegers(tmp)) { 424 | queries.push_back(tmp); 425 | } 426 | } 427 | 428 | try { 429 | demo_data(data, queries, threshold); 430 | } catch (const std::exception& e) { 431 | std::cerr << "Exception: " << e.what() << std::endl; 432 | return EXIT_FAILURE; 433 | } 434 | } else { 435 | try { 436 | // Previous demo with threshold 3 437 | //demo_random(20000000, 50000, 100, 3); 438 | for (unsigned k = 1; k < 10; ++k) { 439 | std::cout << "Demo threshold:" << k << std::endl; 440 | demo_random(20000000, 50000, 100, k); 441 | 
std::cout << "=======================" << std::endl; 442 | } 443 | } catch (const std::exception& e) { 444 | std::cerr << "Exception: " << e.what() << std::endl; 445 | return EXIT_FAILURE; 446 | } 447 | } 448 | return EXIT_SUCCESS; 449 | } 450 | -------------------------------------------------------------------------------- /benchmark/linux-perf-events-wrapper.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #ifdef __linux__ 6 | #include "linux-perf-events.h" 7 | #endif 8 | 9 | #ifdef __linux__ 10 | typedef LinuxEvents EventClass; 11 | #endif 12 | 13 | class LinuxEventsWrapper { 14 | public: 15 | LinuxEventsWrapper(const std::vector event_codes) { 16 | #ifdef __linux__ 17 | for(int ecode: event_codes) { 18 | event_obj.emplace(ecode, std::shared_ptr(new EventClass(ecode))); 19 | event_res.emplace(ecode, 0); 20 | } 21 | #endif 22 | } 23 | void start() { 24 | #ifdef __linux__ 25 | for (const auto& [ecode, ptr]: event_obj) { 26 | ptr->start(); 27 | } 28 | #endif 29 | } 30 | void end() { 31 | #ifdef __linux__ 32 | for (const auto& [ecode, ptr]: event_obj) { 33 | event_res[ecode] = ptr->end(); 34 | } 35 | #endif 36 | } 37 | // Throws an exception if the code is not present 38 | unsigned long get_result(int ecode) { 39 | #ifdef __linux__ 40 | return event_res.at(ecode); 41 | #else 42 | return 0; 43 | #endif 44 | } 45 | private: 46 | #ifdef __linux__ 47 | std::unordered_map> event_obj; 48 | std::unordered_map event_res; 49 | #endif 50 | }; 51 | -------------------------------------------------------------------------------- /benchmark/linux-perf-events.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include // for syscall 4 | #include // for ioctl 5 | #include // for __NR_perf_event_open 6 | #include // for perf event constants 7 | 8 | #include // for errno 9 | #include // for memset 10 | #include 11 | 12 | 13 | 
/**
 * Thin RAII wrapper around one Linux perf_event_open(2) counter for the
 * calling process, counting user-space events only (kernel and hypervisor
 * are excluded).
 *
 * TYPE is the perf event type (attribs.type); the constructor argument is
 * the event config within that type.
 * NOTE(review): the template parameter list was lost in extraction and is
 * reconstructed here -- confirm the default.
 *
 * All failures are reported as std::runtime_error carrying strerror(errno).
 */
template <int TYPE = PERF_TYPE_HARDWARE>
class LinuxEvents {

  int fd;
  perf_event_attr attribs;

public:
  LinuxEvents(int config) : fd(-1) {
    memset(&attribs, 0, sizeof(attribs));
    attribs.type = TYPE;
    attribs.size = sizeof(attribs);
    attribs.config = config;
    attribs.disabled = 1; // counting begins only in start()
    attribs.exclude_kernel = 1;
    attribs.exclude_hv = 1;

    const int pid = 0;    // the current process
    const int cpu = -1;   // all CPUs
    const int group = -1; // no group
    const unsigned long flags = 0;
    fd = static_cast<int>(
        syscall(__NR_perf_event_open, &attribs, pid, cpu, group, flags));
    if (fd == -1) {
      report_error("perf_event_open");
    }
  }

  // The object owns a file descriptor; a copy would close it twice.
  LinuxEvents(const LinuxEvents &) = delete;
  LinuxEvents &operator=(const LinuxEvents &) = delete;

  ~LinuxEvents() {
    if (fd != -1) {
      close(fd);
    }
  }

  // Zeroes the counter and starts counting.
  void start() {
    if (ioctl(fd, PERF_EVENT_IOC_RESET, 0) == -1) {
      report_error("ioctl(PERF_EVENT_IOC_RESET)");
    }

    if (ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) == -1) {
      report_error("ioctl(PERF_EVENT_IOC_ENABLE)");
    }
  }

  // Stops counting and returns the value accumulated since start().
  unsigned long end() {
    if (ioctl(fd, PERF_EVENT_IOC_DISABLE, 0) == -1) {
      report_error("ioctl(PERF_EVENT_IOC_DISABLE)");
    }

    unsigned long result = 0;
    const ssize_t got = read(fd, &result, sizeof(result));
    if (got == -1) {
      report_error("read");
    }
    // A short read does not set errno and would leave `result` partly
    // unwritten, so it is reported separately.
    if (static_cast<size_t>(got) != sizeof(result)) {
      throw std::runtime_error("read: short read from perf event descriptor");
    }

    return result;
  }

private:
  void report_error(const std::string &context) {
    throw std::runtime_error(context + ": " + std::string(strerror(errno)));
  }
};
/**
 * This is just a bit of code to parse the binary files provided by the
 * Maropu-Open-Coders library at
 * http://integerencoding.isti.cnr.it/?page_id=8
 *
 * The file is a sequence of records: a 32-bit count followed by that many
 * 32-bit integers, all in the machine's native byte order.
 *
 * (Despite the name, this does not necessarily read gaps.)
 *
 * Note that due to use of strerror this code may be thread-unsafe!
 */
class MaropuGapReader {
public:
  MaropuGapReader(const std::string &filename)
      : mFilename(filename), fd(nullptr) {}

  /**
   * The copy constructor will assign the same file name,
   * but the newly constructed object won't be opened.
   */
  MaropuGapReader(const MaropuGapReader &mgr)
      : mFilename(mgr.mFilename), fd(nullptr) {}

  /**
   * Assignment will close the current reader, and change
   * the file name. You need to reopen the reader after the assignment.
   */
  MaropuGapReader &operator=(const MaropuGapReader &mgr) {
    close();
    mFilename = mgr.mFilename;
    return *this;
  }

  ~MaropuGapReader() { close(); }

  // @daniel: should we worry about our code being compilable on 32-bit
  // machines?
  // if so, we need to add -D_FILE_OFFSET_BITS=64 to the makefile
  // Daniel: it would seem odd to consider 32-bit machines when we assume AVX
  // support!

  /**
   * Returns the current byte offset in the file.
   * Throws std::runtime_error if the reader is not open or ftello fails.
   */
  off_t getPos() {
    requireOpen();
    errno = 0;
    const off_t res = ftello(fd);
    if (res < 0) {
      throwIo("Error getting file position, IO status: ");
    }
    return res;
  }

  /**
   * Moves to the absolute byte offset pos.
   * Throws std::runtime_error if the reader is not open or fseeko fails.
   */
  void setPos(off_t pos) {
    requireOpen();
    errno = 0;
    if (fseeko(fd, pos, SEEK_SET) < 0) {
      throwIo("Error setting file position, IO status: ");
    }
  }

  /*
   * Reads the next record into buffer (resized to the record length).
   * buffer is expected to hold 32-bit elements in contiguous storage.
   *
   * Return false if no more data can be loaded.
   * Throw an exception in the case of IO error or of a truncated record.
   */
  template <class container> bool loadIntegers(container &buffer) {
    uint32_t qty = 0;
    if (!ReadQty(qty))
      return false; // EOF
    buffer.resize(qty);
    errno = 0;
    const size_t result =
        fread(buffer.data(), sizeof(uint32_t), buffer.size(), fd);
    if (result != buffer.size()) {
      if (!ferror(fd)) {
        // A short read without a stream error means the payload is shorter
        // than its header announced, i.e., the file is corrupt.
        throw std::runtime_error("The file appears to be truncated/corrupt!");
      }
      throwIo("Error reading from file, IO status: ");
    }
    return true;
  }

  /*
   * Reports where the next record starts and how many integers it holds,
   * then skips over its payload without reading it.
   *
   * Return false if no more data can be loaded.
   * Throw an exception in the case of IO error.
   */
  bool readNextPosAndQty(off_t &pos, uint32_t &qty) {
    pos = getPos();
    if (!ReadQty(qty))
      return false; // EOF
    const off_t payload = static_cast<off_t>(qty) *
                          static_cast<off_t>(sizeof(uint32_t));
    setPos(getPos() + payload);
    return true;
  }

  /**
   * We must call open before we can use this class meaningfully.
   * Returns false if the file cannot be opened.
   */
  bool open() {
    close();
    fd = ::fopen(mFilename.c_str(), "rb");
    if (fd == nullptr) {
      return false;
    }
    setvbuf(fd, nullptr, _IOFBF, 1024 * 4); // fully buffered, 4 KiB
    return true;
  }

  void close() {
    if (fd != nullptr) {
      ::fclose(fd);
      fd = nullptr;
    }
  }

private:
  // Every stream operation needs an open FILE*; handing a null stream to
  // the C library is undefined behaviour.
  void requireOpen() const {
    if (fd == nullptr) {
      throw std::runtime_error("You forgot to open the file.");
    }
  }

  // Throws std::runtime_error(prefix + strerror(errno)).
  [[noreturn]] static void throwIo(const char *prefix) {
    std::stringstream err;
    err << prefix << strerror(errno);
    throw std::runtime_error(err.str());
  }

  /*
   * Reads the 32-bit record length.
   * Returns false on EOF.
   * Throws an exception in the case of IO error.
   */
  bool ReadQty(uint32_t &qty) {
    qty = 0;
    requireOpen();
    errno = 0;
    const size_t result = fread(&qty, sizeof(qty), 1, fd);
    if (result != 1) {
      if (ferror(fd)) {
        throwIo("Error reading from file, IO status: ");
      }
      qty = 0; // discard whatever a partial read left behind
      return false;
    }
    if (qty > (UINT32_C(1) << 29)) {
      std::cout << "warning: reading a very large array (" << qty
                << " integers) : is your input file in the right format?"
                << std::endl;
    }
    return true;
  }

  std::string mFilename;
  FILE *fd;
};
/**
 * Wall-clock stopwatch built on gettimeofday; all readings are in
 * microseconds.
 *
 * author: Preston Bannister
 */
class WallClockTimer {
public:
  struct timeval t1, t2; // t1: start mark, t2: latest split mark

  WallClockTimer() : t1(), t2() { reset(); }

  // Restart: both marks are set to "now", so elapsed() reads 0.
  void reset() {
    gettimeofday(&t1, 0);
    t2 = t1;
  }

  // Microseconds between the start mark and the latest split mark.
  uint64_t elapsed() { return micros_between(t1, t2); }

  // Move the split mark to "now" and report the elapsed microseconds.
  uint64_t split() {
    gettimeofday(&t2, 0);
    return elapsed();
  }

private:
  // Whole seconds are scaled to microseconds, then the (possibly negative)
  // microsecond remainder is added in unsigned 64-bit arithmetic.
  static uint64_t micros_between(const struct timeval &from,
                                 const struct timeval &to) {
    const unsigned long long whole = (to.tv_sec - from.tv_sec) * 1000ULL;
    return whole * 1000ULL + (to.tv_usec - from.tv_usec);
  }
};

/**
 * CPU-time stopwatch built on getrusage(RUSAGE_SELF); all readings are in
 * microseconds.
 *
 * author: Daniel Lemire
 */
class CPUTimer {
public:
  struct rusage t1, t2; // t1: start mark, t2: latest split mark

  CPUTimer() : t1(), t2() { reset(); }

  // Restart: both marks are set to the current usage, so elapsed() reads 0.
  void reset() {
    getrusage(RUSAGE_SELF, &t1);
    t2 = t1;
  }

  // Same as totalelapsed(): user plus system CPU time.
  uint64_t elapsed() { return totalelapsed(); }

  // User plus system CPU time between the two marks.
  uint64_t totalelapsed() {
    const uint64_t user = userelapsed();
    const uint64_t sys = systemelapsed();
    return user + sys;
  }

  // returns the *user* CPU time in micro seconds (mu s)
  uint64_t userelapsed() { return micros_between(t1.ru_utime, t2.ru_utime); }

  // returns the *system* CPU time in micro seconds (mu s)
  uint64_t systemelapsed() { return micros_between(t1.ru_stime, t2.ru_stime); }

  // Move the split mark to the current usage and report elapsed().
  uint64_t split() {
    getrusage(RUSAGE_SELF, &t2);
    return elapsed();
  }

private:
  // See WallClockTimer: seconds scaled to microseconds plus the remainder.
  static uint64_t micros_between(const struct timeval &from,
                                 const struct timeval &to) {
    const unsigned long long whole = (to.tv_sec - from.tv_sec) * 1000ULL;
    return whole * 1000ULL + (to.tv_usec - from.tv_usec);
  }
};
// credit: implementation and design by Nathan Kurz and Daniel Lemire

namespace fastscancount {

namespace {

// used by fastscancount
//
// Counts the values of the sorted list d, from index `it` on, that fall
// below start + range. The caller guarantees that the list contains a value
// >= start + range, which acts as the loop sentinel, so no end-of-list check
// is needed. A value is written to out at the moment its counter equals
// threshold, i.e. on its (threshold + 1)-th occurrence in this window.
// `it` is advanced past the consumed values; the new end of the output is
// returned.
uint32_t *natefastscancount_maincheck(uint8_t *counters, size_t &it,
                                      const uint32_t *d, size_t start,
                                      size_t range, uint8_t threshold,
                                      uint32_t *out) {
  const size_t range_end = start + range;
  size_t i = it;
  for (uint32_t val = d[i]; val < range_end; val = d[++i]) {
    // Index relative to the window instead of biasing the base pointer,
    // which would form a pointer before the start of the array.
    uint8_t &slot = counters[val - start];
    if (slot == threshold)
      *out++ = val;
    ++slot;
  }
  it = i;
  return out;
}

// used by fastscancount
//
// Same as natefastscancount_maincheck, for a list whose remaining values all
// belong to the current window: the loop is bounded by itend (the list
// length) rather than by a sentinel value.
uint32_t *natefastscancount_finalcheck(uint8_t *counters, size_t &it,
                                       const uint32_t *d, size_t start,
                                       size_t itend, uint8_t threshold,
                                       uint32_t *out) {
  size_t i = it;
  for (; i < itend; i++) {
    const uint32_t val = d[i];
    uint8_t &slot = counters[val - start];
    if (slot == threshold) {
      *out++ = val;
    }
    ++slot;
  }
  it = i;
  return out;
}
} // namespace

/**
 * Writes to out every value that occurs in more than `threshold` of the
 * lists in data.
 *
 * Each list must be sorted in increasing order without duplicates. Empty
 * lists are allowed and contribute nothing. The value domain is processed in
 * windows of 65536 values; within a window the hits appear in the order in
 * which their counters crossed the threshold, so out is ordered by window
 * but not necessarily inside one.
 *
 * Counters are 8 bits wide: the result is only meaningful with fewer than
 * 256 lists.
 */
void fastscancount(const std::vector<const std::vector<uint32_t> *> &data,
                   std::vector<uint32_t> &out, uint8_t threshold) {
  const size_t cache_size = 65536;
  const size_t range = cache_size;
  const size_t ds = data.size();

  // Largest value over all non-empty lists (lists are sorted, so it is the
  // last element of one of them).
  bool have_values = false;
  uint32_t largest = 0;
  for (size_t c = 0; c < ds; c++) {
    if (data[c]->empty())
      continue;
    have_values = true;
    const uint32_t last = data[c]->back();
    if (largest < last)
      largest = last;
  }
  if (!have_values) {
    out.clear();
    return;
  }

  std::vector<uint8_t> counters(cache_size);
  out.resize(4 * range); // let us add lots of capacity
  uint32_t *output = out.data();
  uint32_t *initout = out.data();
  std::vector<size_t> iters(ds);
  size_t countsofar = 0;

  // `<=` so that the window containing `largest` is always visited, also
  // when `largest` is an exact multiple of the window size (including 0).
  for (size_t start = 0; start <= largest; start += range) {
    // make sure that the capacity is sufficient: a window can produce at
    // most `range` hits
    countsofar = output - initout;
    if (out.size() - countsofar < range) {
      out.resize(out.size() + 4 * range);
      initout = out.data();
      output = out.data() + countsofar;
    }
    std::memset(counters.data(), 0, range);
    for (size_t c = 0; c < ds; c++) {
      size_t it = iters[c]; // recover where we were
      const std::vector<uint32_t> &d = *data[c];
      const size_t itend = d.size();
      if (it == itend) // check that there is data to be processed
        continue;      // exhausted (or empty)
      // check if we need to be careful: when the last value lies inside this
      // window there is no sentinel beyond it
      const bool near_the_end = (d[itend - 1] < start + range);
      if (near_the_end) {
        output = natefastscancount_finalcheck(counters.data(), it, d.data(),
                                              start, itend, threshold, output);
      } else {
        output = natefastscancount_maincheck(counters.data(), it, d.data(),
                                             start, range, threshold, output);
      }
      iters[c] = it; // store it for next round
    }
  }
  countsofar = output - initout;
  out.resize(countsofar);
}
} // namespace fastscancount
namespace fastscancount {
namespace {
// credit: implementation and design by Travis Downes
//
// Returns the index of the first byte in array[0, size) that is strictly
// greater than threshold (unsigned comparison), or SIZE_MAX if there is none.
static inline size_t find_next_gt(uint8_t *array, const size_t size,
                                  const uint8_t threshold) {
  const size_t vsize = size / 32;
  const __m256i *varray = reinterpret_cast<const __m256i *>(array);
  // AVX2 only has a signed byte comparison. Flipping the top bit of both
  // operands maps unsigned order onto signed order, so counters and
  // thresholds >= 128 compare exactly like in the scalar tail below.
  const __m256i sign_flip = _mm256_set1_epi8(static_cast<char>(0x80));
  const __m256i comprand = _mm256_xor_si256(
      _mm256_set1_epi8(static_cast<char>(threshold)), sign_flip);

  for (size_t i = 0; i < vsize; i++) {
    const __m256i v =
        _mm256_xor_si256(_mm256_loadu_si256(varray + i), sign_flip);
    const int bits = _mm256_movemask_epi8(_mm256_cmpgt_epi8(v, comprand));
    if (bits) {
      return i * 32 + __builtin_ctz(bits);
    }
  }

  // tail handling
  for (size_t i = vsize * 32; i < size; i++) {
    if (array[i] > threshold)
      return i;
  }

  return SIZE_MAX;
}

// Appends start + i to out for every i in [0, range) whose counter is
// strictly greater than threshold, in increasing order of i.
void populate_hits_avx(std::vector<uint8_t> &counters, size_t range,
                       size_t threshold, size_t start,
                       std::vector<uint32_t> &out) {
  uint8_t *array = counters.data();

  while (true) {
    const size_t next =
        find_next_gt(array, range, static_cast<uint8_t>(threshold));
    if (next == SIZE_MAX)
      break;
    out.push_back(start + next);
    // resume the search right after the hit
    range -= (next + 1);
    array += (next + 1);
    start += (next + 1);
  }
}

// Increments counters[e] for every value e read from it_ while e is below
// range_end. The caller guarantees that the list holds a value >= range_end,
// which stops the loop. it_ is left on that value.
void update_counters(const uint32_t *&it_, uint8_t *counters,
                     uint32_t range_end) {
  const uint32_t *it = it_;
  for (uint32_t e; (e = *it) < range_end; ++it) {
    counters[e]++;
  }
  it_ = it;
}

// Increments counters[e] for every remaining value e in [it_, end) and
// leaves it_ at end.
void update_counters_final(const uint32_t *&it_, const uint32_t *end,
                           uint8_t *counters) {
  for (const uint32_t *it = it_; it != end; it++) {
    counters[*it]++;
  }
  it_ = end;
}
} // namespace

/**
 * Writes to out, in increasing order, every value that occurs in more than
 * `threshold` of the lists in data.
 *
 * Each list must be sorted in increasing order without duplicates. Empty
 * lists are allowed and contribute nothing. Counters are 8 bits wide: the
 * result is only meaningful with fewer than 256 lists.
 */
void fastscancount_avx2(const std::vector<const std::vector<uint32_t> *> &data,
                        std::vector<uint32_t> &out, uint8_t threshold) {
  const size_t cache_size = 40000;
  std::vector<uint8_t> counters(cache_size);
  out.clear();

  struct data_info {
    const uint32_t *cur; // current pointer into data
    const uint32_t *end; // pointer to end
    uint32_t last;       // value of last element
    data_info(const uint32_t *cur, const uint32_t *end, uint32_t last)
        : cur{cur}, end{end}, last{last} {}
  };

  // One cursor per non-empty list, plus the largest value overall (lists
  // are sorted, so it is the last element of one of them).
  std::vector<data_info> iter_data;
  iter_data.reserve(data.size());
  uint32_t largest = 0;
  for (const auto &d : data) {
    if (d->empty())
      continue; // back() on an empty list is undefined
    iter_data.emplace_back(d->data(), d->data() + d->size(), d->back());
    if (largest < d->back())
      largest = d->back();
  }
  if (iter_data.empty()) {
    return;
  }

  uint8_t *cdata = counters.data();
  // 64-bit window bounds cannot wrap for values close to UINT32_MAX, and
  // `<=` guarantees that the window containing `largest` is visited even
  // when `largest` is an exact multiple of the window size (including 0).
  for (uint64_t start = 0; start <= largest; start += cache_size) {
    const uint64_t range_end = start + cache_size;
    memset(cdata, 0, cache_size * sizeof(counters[0]));
    for (auto &id : iter_data) {
      // determine if the loop will end because we get to the end of
      // data, or because we get to the end of the range
      if (__builtin_expect(id.last >= range_end, 1)) {
        // the iteration is guaranteed to end because an element becomes >=
        // range_end, so we don't need to check for end of data; range_end
        // fits in 32 bits here because it does not exceed id.last
        update_counters(id.cur, cdata - start,
                        static_cast<uint32_t>(range_end));
      } else {
        // the iteration is guaranteed to end because we get to the end of the
        // data
        update_counters_final(id.cur, id.end, cdata - start);
      }
    }

    populate_hits_avx(counters, cache_size, threshold, start, out);
  }
}

} // namespace fastscancount
namespace fastscancount {
namespace {

// credit: inspired by 256-bit implementation of Travis Downes
//
// Appends start + i to out for every i in [0, range) whose counter is
// strictly greater than threshold (unsigned comparison), in increasing
// order of i.
void populate_hits_avx512(std::vector<uint8_t> &counters, size_t range,
                          size_t threshold, size_t start,
                          std::vector<uint32_t> &out) {
  uint8_t *array = counters.data();

  const size_t vsize = range / 64;
  const __m512i *varray = reinterpret_cast<const __m512i *>(array);
  const __m512i comprand = _mm512_set1_epi8(static_cast<char>(threshold));

  for (size_t i = 0; i < vsize; i++) {
    const size_t base = start + i * 64;
    const __m512i v = _mm512_loadu_si512(varray + i);
    // Unsigned compare, so that counters and thresholds >= 128 behave like
    // the scalar tail below.
    uint64_t bits = _mm512_cmpgt_epu8_mask(v, comprand);
    while (bits) {
      out.push_back(base + __builtin_ctzll(bits));
      bits &= bits - 1; // clear the lowest set bit
    }
  }

  for (size_t i = vsize * 64; i < range; i++) {
    if (array[i] > threshold)
      out.push_back(start + i);
  }
}

// Increments counters[e - shift] for every value e in [it_, end) and leaves
// it_ at end.
//
// The vector path gathers and scatters 32-bit words at byte offsets, so it
// touches up to 3 bytes past the highest counter it updates: the counter
// array must be padded by at least 3 bytes.
//
// Throws std::runtime_error if it_ is already past end.
void update_counters_avx512(const uint32_t *&it_, const uint32_t *end,
                            uint8_t *counters, const size_t shift) {

  if (it_ > end) {
    throw std::runtime_error("Bug: start > end");
  }
  const size_t qty = end - it_;
  const size_t vsize = qty / 16;

  const __m512i *varray = reinterpret_cast<const __m512i *>(it_);
  const __m512i add1 = _mm512_set1_epi32(1);
  const __m512i shift_vect = _mm512_set1_epi32(static_cast<int>(shift));

  // selects the lowest byte of every 32-bit word
  const __mmask64 blend_mask = 0x1111111111111111ull;

  for (size_t i = 0; i < vsize; ++i) {
    const __m512i indx =
        _mm512_sub_epi32(_mm512_loadu_si512(varray + i), shift_vect);
    const __m512i v_orig =
        _mm512_i32gather_epi32(indx, reinterpret_cast<const int *>(counters), 1);
    // Note: works correctly only if counters never overflow
    // First, we increment counters.
    const __m512i v_inc = _mm512_add_epi32(v_orig, add1);
    // Then, we will blend by keeping three higher-order bytes in each 32-bit
    // word unmodified. When 32-bit words overlap, the scatter operation would
    // first write the old values of the word, then it will overwrite them
    // with new values (the list is sorted, so the higher index sits in a
    // later lane). So, this should work just fine.
    const __m512i v = _mm512_mask_blend_epi8(blend_mask, v_orig, v_inc);
    _mm512_i32scatter_epi32(reinterpret_cast<int *>(counters), indx, v, 1);
  }

  // tail processing
  for (const uint32_t *it = it_ + vsize * 16; it != end; it++) {
    counters[*it - shift]++;
  }
  it_ = end;
}

} // namespace

/**
 * Writes to out, in increasing order, every value that occurs in more than
 * `threshold` of the lists in data.
 *
 * The value domain is processed in windows of cache_size values. For list k,
 * (*range_ends[k])[i] is the index in *data[k] one past the last value that
 * belongs to window i; every list must provide the same number of windows.
 * Lists must be sorted in increasing order without duplicates.
 * NOTE(review): element types were lost in extraction; 32-bit unsigned is
 * assumed for both data and range_ends -- confirm against the callers.
 *
 * Throws std::runtime_error on inconsistent input.
 */
void fastscancount_avx512(
    uint32_t cache_size, const std::vector<const std::vector<uint32_t> *> &data,
    const std::vector<const std::vector<uint32_t> *> &range_ends,
    std::vector<uint32_t> &out, uint8_t threshold) {
  // 32-bit gathers/scatters at the last counters reach 3 bytes further.
  const size_t gather_padding = sizeof(uint32_t) - 1;
  std::vector<uint8_t> counters(static_cast<size_t>(cache_size) +
                                gather_padding);
  out.clear();
  const size_t dsize = data.size();
  if (!dsize) {
    return;
  }
  if (dsize != range_ends.size()) {
    throw std::runtime_error(
        "Invalid input: non-matching sizes between data and range_ends");
  }

  const size_t range_qty = range_ends[0]->size();
  for (size_t i = 1; i < dsize; ++i) {
    if (range_ends[i]->size() != range_qty) {
      throw std::runtime_error(
          "Invalid input: different range sizes for different data arrays!");
    }
  }

  uint8_t *cdata = counters.data();

  // data() is valid for empty lists too, unlike &v[0].
  std::vector<const uint32_t *> it(dsize);
  for (size_t k = 0; k < dsize; ++k) {
    it[k] = data[k]->data();
  }

  for (size_t i = 0; i < range_qty; ++i) {
    // The padding bytes stay zero: the scatter only ever writes back the
    // value it gathered from them.
    memset(cdata, 0, cache_size * sizeof(counters[0]));
    const uint32_t start = static_cast<uint32_t>(i * cache_size);
    for (size_t k = 0; k < dsize; ++k) {
      const std::vector<uint32_t> &v = *data[k];
      const std::vector<uint32_t> &r = *range_ends[k];
      if (r[i] > v.size()) {
        throw std::runtime_error(
            "Invalid input: range end beyond the end of the data array!");
      }
      update_counters_avx512(it[k], v.data() + r[i], cdata, start);
    }

    populate_hits_avx512(counters, cache_size, threshold, start, out);
  }
}

} // namespace fastscancount