├── .travis.yml ├── LICENSE ├── Makefile ├── README.md ├── TODO.md ├── include ├── binarysearchintersection.h ├── branchless.h ├── common.h ├── gallopingintersection.h ├── hscalableintersection.h ├── hybridintersection.h ├── inoueetal.h ├── intersection.h ├── intersectionfactory.h ├── match.h ├── mediumintersection.h ├── mersenne.h ├── multiSetIntersection.hpp ├── partitionedintersection.h ├── skipping.h ├── stlutil.h ├── synthetic.h ├── tetzank.h ├── thomaswu.h ├── timer.h ├── union.h ├── util.h └── widevectorintersection.h ├── results ├── benchintersection5march2014.gnuplot ├── benchintersection5march2014.txt ├── benchintersection6march2014.gnuplot ├── benchintersection6march2014.txt └── benchintersection6march2014_2.txt ├── scripts ├── disablehyperthreading.sh ├── powerpolicy.sh └── turboboost.sh └── src ├── benchintersection.cpp ├── getmatrix.cpp ├── intersection.cpp ├── match.cpp ├── multiSetIntersection.cpp ├── realintersection.cpp ├── testintersection.cpp ├── thomaswu.cpp └── unit.cpp /.travis.yml: -------------------------------------------------------------------------------- 1 | language: c++ 2 | sudo: false 3 | compiler: 4 | - clang++ 5 | 6 | branches: 7 | only: 8 | - master 9 | 10 | script: make unit && ./unit 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, and 10 | distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by the copyright 13 | owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all other entities 16 | that control, are controlled by, or are under common control with that entity. 17 | For the purposes of this definition, "control" means (i) the power, direct or 18 | indirect, to cause the direction or management of such entity, whether by 19 | contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the 20 | outstanding shares, or (iii) beneficial ownership of such entity. 21 | 22 | "You" (or "Your") shall mean an individual or Legal Entity exercising 23 | permissions granted by this License. 24 | 25 | "Source" form shall mean the preferred form for making modifications, including 26 | but not limited to software source code, documentation source, and configuration 27 | files. 28 | 29 | "Object" form shall mean any form resulting from mechanical transformation or 30 | translation of a Source form, including but not limited to compiled object code, 31 | generated documentation, and conversions to other media types. 32 | 33 | "Work" shall mean the work of authorship, whether in Source or Object form, made 34 | available under the License, as indicated by a copyright notice that is included 35 | in or attached to the work (an example is provided in the Appendix below). 36 | 37 | "Derivative Works" shall mean any work, whether in Source or Object form, that 38 | is based on (or derived from) the Work and for which the editorial revisions, 39 | annotations, elaborations, or other modifications represent, as a whole, an 40 | original work of authorship. For the purposes of this License, Derivative Works 41 | shall not include works that remain separable from, or merely link (or bind by 42 | name) to the interfaces of, the Work and Derivative Works thereof. 
43 | 44 | "Contribution" shall mean any work of authorship, including the original version 45 | of the Work and any modifications or additions to that Work or Derivative Works 46 | thereof, that is intentionally submitted to Licensor for inclusion in the Work 47 | by the copyright owner or by an individual or Legal Entity authorized to submit 48 | on behalf of the copyright owner. For the purposes of this definition, 49 | "submitted" means any form of electronic, verbal, or written communication sent 50 | to the Licensor or its representatives, including but not limited to 51 | communication on electronic mailing lists, source code control systems, and 52 | issue tracking systems that are managed by, or on behalf of, the Licensor for 53 | the purpose of discussing and improving the Work, but excluding communication 54 | that is conspicuously marked or otherwise designated in writing by the copyright 55 | owner as "Not a Contribution." 56 | 57 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf 58 | of whom a Contribution has been received by Licensor and subsequently 59 | incorporated within the Work. 60 | 61 | 2. Grant of Copyright License. 62 | 63 | Subject to the terms and conditions of this License, each Contributor hereby 64 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, 65 | irrevocable copyright license to reproduce, prepare Derivative Works of, 66 | publicly display, publicly perform, sublicense, and distribute the Work and such 67 | Derivative Works in Source or Object form. 68 | 69 | 3. Grant of Patent License. 
70 | 71 | Subject to the terms and conditions of this License, each Contributor hereby 72 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, 73 | irrevocable (except as stated in this section) patent license to make, have 74 | made, use, offer to sell, sell, import, and otherwise transfer the Work, where 75 | such license applies only to those patent claims licensable by such Contributor 76 | that are necessarily infringed by their Contribution(s) alone or by combination 77 | of their Contribution(s) with the Work to which such Contribution(s) was 78 | submitted. If You institute patent litigation against any entity (including a 79 | cross-claim or counterclaim in a lawsuit) alleging that the Work or a 80 | Contribution incorporated within the Work constitutes direct or contributory 81 | patent infringement, then any patent licenses granted to You under this License 82 | for that Work shall terminate as of the date such litigation is filed. 83 | 84 | 4. Redistribution. 
85 | 86 | You may reproduce and distribute copies of the Work or Derivative Works thereof 87 | in any medium, with or without modifications, and in Source or Object form, 88 | provided that You meet the following conditions: 89 | 90 | You must give any other recipients of the Work or Derivative Works a copy of 91 | this License; and 92 | You must cause any modified files to carry prominent notices stating that You 93 | changed the files; and 94 | You must retain, in the Source form of any Derivative Works that You distribute, 95 | all copyright, patent, trademark, and attribution notices from the Source form 96 | of the Work, excluding those notices that do not pertain to any part of the 97 | Derivative Works; and 98 | If the Work includes a "NOTICE" text file as part of its distribution, then any 99 | Derivative Works that You distribute must include a readable copy of the 100 | attribution notices contained within such NOTICE file, excluding those notices 101 | that do not pertain to any part of the Derivative Works, in at least one of the 102 | following places: within a NOTICE text file distributed as part of the 103 | Derivative Works; within the Source form or documentation, if provided along 104 | with the Derivative Works; or, within a display generated by the Derivative 105 | Works, if and wherever such third-party notices normally appear. The contents of 106 | the NOTICE file are for informational purposes only and do not modify the 107 | License. You may add Your own attribution notices within Derivative Works that 108 | You distribute, alongside or as an addendum to the NOTICE text from the Work, 109 | provided that such additional attribution notices cannot be construed as 110 | modifying the License. 
111 | You may add Your own copyright statement to Your modifications and may provide 112 | additional or different license terms and conditions for use, reproduction, or 113 | distribution of Your modifications, or for any such Derivative Works as a whole, 114 | provided Your use, reproduction, and distribution of the Work otherwise complies 115 | with the conditions stated in this License. 116 | 117 | 5. Submission of Contributions. 118 | 119 | Unless You explicitly state otherwise, any Contribution intentionally submitted 120 | for inclusion in the Work by You to the Licensor shall be under the terms and 121 | conditions of this License, without any additional terms or conditions. 122 | Notwithstanding the above, nothing herein shall supersede or modify the terms of 123 | any separate license agreement you may have executed with Licensor regarding 124 | such Contributions. 125 | 126 | 6. Trademarks. 127 | 128 | This License does not grant permission to use the trade names, trademarks, 129 | service marks, or product names of the Licensor, except as required for 130 | reasonable and customary use in describing the origin of the Work and 131 | reproducing the content of the NOTICE file. 132 | 133 | 7. Disclaimer of Warranty. 134 | 135 | Unless required by applicable law or agreed to in writing, Licensor provides the 136 | Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, 137 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, 138 | including, without limitation, any warranties or conditions of TITLE, 139 | NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are 140 | solely responsible for determining the appropriateness of using or 141 | redistributing the Work and assume any risks associated with Your exercise of 142 | permissions under this License. 143 | 144 | 8. Limitation of Liability. 
145 | 146 | In no event and under no legal theory, whether in tort (including negligence), 147 | contract, or otherwise, unless required by applicable law (such as deliberate 148 | and grossly negligent acts) or agreed to in writing, shall any Contributor be 149 | liable to You for damages, including any direct, indirect, special, incidental, 150 | or consequential damages of any character arising as a result of this License or 151 | out of the use or inability to use the Work (including but not limited to 152 | damages for loss of goodwill, work stoppage, computer failure or malfunction, or 153 | any and all other commercial damages or losses), even if such Contributor has 154 | been advised of the possibility of such damages. 155 | 156 | 9. Accepting Warranty or Additional Liability. 157 | 158 | While redistributing the Work or Derivative Works thereof, You may choose to 159 | offer, and charge a fee for, acceptance of support, warranty, indemnity, or 160 | other liability obligations and/or rights consistent with this License. However, 161 | in accepting such obligations, You may act only on Your own behalf and on Your 162 | sole responsibility, not on behalf of any other Contributor, and only if You 163 | agree to indemnify, defend, and hold each Contributor harmless for any liability 164 | incurred by, or claims asserted against, such Contributor by reason of your 165 | accepting any such warranty or additional liability. 166 | 167 | END OF TERMS AND CONDITIONS 168 | 169 | APPENDIX: How to apply the Apache License to your work 170 | 171 | To apply the Apache License to your work, attach the following boilerplate 172 | notice, with the fields enclosed by brackets "[]" replaced with your own 173 | identifying information. (Don't include the brackets!) The text should be 174 | enclosed in the appropriate comment syntax for the file format. 
We also 175 | recommend that a file or class name and description of purpose be included on 176 | the same "printed page" as the copyright notice for easier identification within 177 | third-party archives. 178 | 179 | Copyright [yyyy] [name of copyright owner] 180 | 181 | Licensed under the Apache License, Version 2.0 (the "License"); 182 | you may not use this file except in compliance with the License. 183 | You may obtain a copy of the License at 184 | 185 | http://www.apache.org/licenses/LICENSE-2.0 186 | 187 | Unless required by applicable law or agreed to in writing, software 188 | distributed under the License is distributed on an "AS IS" BASIS, 189 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 190 | See the License for the specific language governing permissions and 191 | limitations under the License. 192 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .SUFFIXES: 2 | # 3 | .SUFFIXES: .cpp .o .c .h 4 | # replace the YOURCXX variable with a path to a C++11 compatible compiler. 5 | ifeq ($(INTEL), 1) 6 | # if you wish to use the Intel compiler, please do "make INTEL=1". 
7 | CXX ?= /opt/intel/bin/icpc 8 | CXXFLAGS = -std=c++0x -O3 -Wall -DNDEBUG=1 -g3 9 | else 10 | CXX ?= g++-4.7 11 | ifeq ($(DEBUG),1) 12 | CXXFLAGS = -march=native -std=c++11 -Weffc++ -pedantic -D_GLIBCXX_DEBUG -DDEBUG=1 -ggdb -Wall -Wextra -Wcast-align -Wconversion -Winline 13 | else 14 | CXXFLAGS = -march=native -std=c++11 -Weffc++ -DNDEBUG=1 -pedantic -O3 -Wall -Wextra -Winline -Wcast-align -Wconversion 15 | endif 16 | endif 17 | 18 | 19 | 20 | 21 | 22 | HEADERS= $(shell ls include/*h) 23 | 24 | all: unit testintersection realintersection getmatrix benchintersection multiSetIntersection 25 | echo "please run unit tests by running the unit executable" 26 | 27 | intersection.o: src/intersection.cpp include/common.h 28 | $(CXX) $(CXXFLAGS) -Iinclude -c src/intersection.cpp 29 | 30 | match.o: src/match.cpp include/match.h 31 | $(CXX) $(CXXFLAGS) -Iinclude -c src/match.cpp 32 | 33 | thomaswu.o: src/thomaswu.cpp $(HEADERS) 34 | $(CXX) $(CXXFLAGS) -Iinclude -c src/thomaswu.cpp 35 | 36 | multiSetIntersection: $(HEADERS) src/multiSetIntersection.cpp match.o thomaswu.o intersection.o 37 | $(CXX) $(CXXFLAGS) -Iinclude -o multiSetIntersection src/multiSetIntersection.cpp match.o thomaswu.o intersection.o 38 | 39 | testintersection: $(HEADERS) src/testintersection.cpp match.o thomaswu.o intersection.o 40 | $(CXX) $(CXXFLAGS) -Iinclude -o testintersection src/testintersection.cpp match.o thomaswu.o intersection.o 41 | 42 | realintersection: $(HEADERS) src/realintersection.cpp match.o thomaswu.o intersection.o 43 | $(CXX) $(CXXFLAGS) -Iinclude -o realintersection src/realintersection.cpp match.o thomaswu.o intersection.o 44 | 45 | getmatrix: $(HEADERS) src/getmatrix.cpp match.o thomaswu.o intersection.o 46 | $(CXX) $(CXXFLAGS) -Iinclude -o getmatrix src/getmatrix.cpp match.o thomaswu.o intersection.o 47 | 48 | unit: $(HEADERS) src/unit.cpp match.o thomaswu.o intersection.o 49 | $(CXX) $(CXXFLAGS) -Iinclude -o unit src/unit.cpp match.o thomaswu.o intersection.o 50 | 
benchintersection: $(HEADERS) src/benchintersection.cpp match.o thomaswu.o intersection.o 51 | $(CXX) $(CXXFLAGS) -Iinclude -o benchintersection src/benchintersection.cpp match.o thomaswu.o intersection.o 52 | 53 | 54 | clean: 55 | rm -f *.o unit testintersection realintersection getmatrix benchintersection multiSetIntersection 56 | 57 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | SIMDIntersections 2 | ================= 3 | [![Build Status](https://travis-ci.org/lemire/SIMDIntersections.png)](https://travis-ci.org/lemire/SIMDIntersections) 4 | 5 | Vectorized intersections : research code. 6 | 7 | Usage: 8 | 9 | ```bash 10 | $ make 11 | $ ./unit 12 | $ ./benchintersection 13 | $ ./realintersection 14 | ``` 15 | 16 | ## Further reading 17 | 18 | - Daniel Lemire, Nathan Kurz, Leonid Boytsov, SIMD Compression and the Intersection of Sorted Integers, Software: Practice and Experience 46 (6), 2016. https://arxiv.org/abs/1401.6399 19 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | Implement galloping swapping svs 2 | 3 | swapping svs 4 | Experiments on adaptive set intersections for text retrieval systems. Algorithm Engineering and Experimentation (ALENEX), pages 91–104, 2001. 5 | 6 | The Baeza Yates algorithm 7 | R. Baeza-Yates. A fast set intersection algorithm for sorted sequences. In Combinatorial Pattern Matching, pages 400–408. Springer, 2004. 8 | -------------------------------------------------------------------------------- /include/binarysearchintersection.h: -------------------------------------------------------------------------------- 1 | /* 2 | * binarysearch.h 3 | * 4 | * Created on: May 13, 2013 5 | * Author: ? 
/**
 * Plain binary search, used by BSintersection and BSintersectioncardinality.
 *
 * Returns the smallest index i > pos such that array[i] >= min, or `length`
 * when no element after `pos` reaches `min`. If pos + 1 is already at or
 * beyond `length`, pos + 1 is returned without touching the array.
 *
 * @param array  values sorted in increasing order
 * @param pos    index after which the search starts
 * @param length number of elements in array
 * @param min    lower bound being searched for
 * @return index of the first element >= min located after pos, else length
 */
static size_t __BSadvanceUntil(const uint32_t *array, const size_t pos,
                               const size_t length, const size_t min) {
    size_t lower = pos + 1;
    // ">=" rather than "==": a position already past the end must never be
    // dereferenced (same guard as the galloping variant of this helper).
    if (lower >= length || array[lower] >= min) {
        return lower;
    }
    // lower < length here, hence length > 0 and length - 1 cannot wrap
    size_t upper = length - 1;
    if (array[upper] < min) {
        return length;
    }
    // invariant: array[upper] >= min, everything before lower is < min
    while (lower < upper) {
        const size_t mid = lower + (upper - lower) / 2; // cannot overflow
        if (array[mid] == min) {
            return mid;
        }
        if (array[mid] < min) {
            lower = mid + 1;
        } else {
            upper = mid;
        }
    }
    return upper;
}

/**
 * Intersection of two sorted sets, skipping ahead with binary search.
 *
 * @param set1,length1 first sorted set (strictly increasing)
 * @param set2,length2 second sorted set (strictly increasing)
 * @param out          receives the common values in increasing order
 * @return number of values written to out
 */
size_t BSintersection(const uint32_t *set1, const size_t length1,
                      const uint32_t *set2, const size_t length2,
                      uint32_t *out) {
    if (length1 == 0 || length2 == 0)
        return 0;
    size_t answer = 0;
    size_t k1 = 0, k2 = 0;
    while (true) {
        if (set1[k1] < set2[k2]) {
            k1 = __BSadvanceUntil(set1, k1, length1, set2[k2]);
            if (k1 == length1)
                return answer;
        }
        // from here on set1[k1] >= set2[k2]
        if (set2[k2] < set1[k1]) {
            k2 = __BSadvanceUntil(set2, k2, length2, set1[k1]);
            if (k2 == length2)
                return answer;
        } else {
            // set2[k2] == set1[k1]
            out[answer++] = set1[k1];
            ++k1;
            if (k1 == length1)
                break;
            ++k2;
            if (k2 == length2)
                break;
        }
    }
    return answer;
}

/**
 * Same traversal as BSintersection, but only counts the common values.
 *
 * @return number of values present in both sorted sets
 */
size_t BSintersectioncardinality(const uint32_t *set1, const size_t length1,
                                 const uint32_t *set2, const size_t length2) {
    if (length1 == 0 || length2 == 0)
        return 0;
    size_t answer = 0;
    size_t k1 = 0, k2 = 0;
    while (true) {
        if (set1[k1] < set2[k2]) {
            k1 = __BSadvanceUntil(set1, k1, length1, set2[k2]);
            if (k1 == length1)
                return answer;
        }
        if (set2[k2] < set1[k1]) {
            k2 = __BSadvanceUntil(set2, k2, length2, set1[k1]);
            if (k2 == length2)
                return answer;
        } else {
            // set2[k2] == set1[k1]
            ++answer;
            ++k1;
            if (k1 == length1)
                break;
            ++k2;
            if (k2 == length2)
                break;
        }
    }
    return answer;
}

/**
 * Binary search over the whole array (it ignores the current position).
 *
 * @param array  values sorted in increasing order
 * @param length number of elements in array
 * @param min    lower bound being searched for
 * @return index of the first element >= min, or length when there is none
 *         (which is 0 for an empty array)
 */
static size_t __FixedBSadvanceUntil(const uint32_t *array, const size_t length,
                                    const size_t min) {
    if (length == 0) {
        return 0; // nothing to search; "length - 1" below would wrap around
    }
    size_t lower = 0;
    size_t upper = length - 1;
    if (array[upper] < min) {
        return length;
    }
    while (lower < upper) {
        const size_t mid = lower + (upper - lower) / 2; // cannot overflow
        if (array[mid] == min) {
            return mid;
        }
        if (array[mid] < min) {
            lower = mid + 1;
        } else {
            upper = mid;
        }
    }
    return upper;
}

/**
 * Cardinality of the intersection of two sorted sets; every skip restarts a
 * binary search from the beginning of the array (__FixedBSadvanceUntil).
 *
 * @return number of values present in both sorted sets
 */
size_t FixedBSintersectioncardinality(const uint32_t *set1,
                                      const size_t length1,
                                      const uint32_t *set2,
                                      const size_t length2) {
    if (length1 == 0 || length2 == 0)
        return 0;
    size_t answer = 0;
    size_t k1 = 0, k2 = 0;
    while (true) {
        if (set1[k1] < set2[k2]) {
            k1 = __FixedBSadvanceUntil(set1, length1, set2[k2]);
            if (k1 == length1)
                return answer;
        }
        if (set2[k2] < set1[k1]) {
            k2 = __FixedBSadvanceUntil(set2, length2, set1[k1]);
            if (k2 == length2)
                return answer;
        } else {
            // set2[k2] == set1[k1]
            ++answer;
            ++k1;
            if (k1 == length1)
                break;
            ++k2;
            if (k2 == length2)
                break;
        }
    }
    return answer;
}
/**
 * Attempt at reproducing the branchless scheme from "Fast Sorted-Set
 * Intersection using SIMD Instructions"; originally by D. Lemire, combined
 * with a design by N. Kurz.
 *
 * Both inputs must be sorted in increasing order. The candidate value is
 * stored to *out on every step and only kept when it is a match.
 *
 * @param set1,length1 first sorted set
 * @param set2,length2 second sorted set
 * @param out          receives the common values
 * @return number of common values written to out
 */
#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER)
// GCC-only attribute (helps a lot there); clang does not know it and warns.
__attribute__((optimize("unroll-loops")))
#endif
size_t branchlessintersection(const uint32_t *set1, const size_t length1,
                              const uint32_t *set2, const size_t length2,
                              uint32_t *out) {
    if (length1 == 0 || length2 == 0)
        return 0;
    const uint32_t *const initout(out);
    const uint32_t *const finalset1(set1 + length1);
    const uint32_t *const finalset2(set2 + length2);

    const unsigned int N = 4;

    // main loop: N steps at a time; each step advances a pointer by at most
    // one, so N remaining elements on both sides make the block safe
    while ((set1 + N <= finalset1) && (set2 + N <= finalset2)) {
#ifdef __INTEL_COMPILER
#pragma unroll(4)
#endif
        for (unsigned int k = 0; k < N; ++k) {
            // this is branchless... (in theory, maybe not in practice)
            const uint32_t a = *set1;
            const uint32_t b = *set2;
            *out = a;
            out = (a == b) ? out + 1 : out;
            set1 = (a <= b) ? set1 + 1 : set1;
            set2 = (b <= a) ? set2 + 1 : set2;
        }
    }
    // remainder, one step at a time
    while ((set1 < finalset1) && (set2 < finalset2)) {
        const uint32_t a = *set1;
        const uint32_t b = *set2;
        *out = a;
        out = (a == b) ? out + 1 : out;
        set1 = (a <= b) ? set1 + 1 : set1;
        set2 = (b <= a) ? set2 + 1 : set2;
    }

    return static_cast<size_t>(out - initout);
}

/**
 * Branchless approach by N. Kurz.
 *
 * @param A,lenA first sorted set
 * @param B,lenB second sorted set
 * @param Match  receives the common values
 * @return number of common values written to Match
 */
size_t scalar_branchless(const uint32_t *A, size_t lenA,
                         const uint32_t *B, size_t lenB,
                         uint32_t *Match) {
    const uint32_t *initMatch = Match;
    const uint32_t *endA = A + lenA;
    const uint32_t *endB = B + lenB;

    while (A < endA && B < endB) {
        int m = (*B == *A) ? 1 : 0; // advance Match only if equal
        int a = (*B >= *A) ? 1 : 0; // advance A if match or B ahead
        int b = (*B <= *A) ? 1 : 0; // advance B if match or B behind

        *Match = *A; // write the result regardless of match
        Match += m;  // but will be rewritten unless advanced
        A += a;
        B += b;
    }

    return static_cast<size_t>(Match - initMatch);
}

/**
 * Branchless intersection that keeps the current element of each input in a
 * register and preloads the following one.
 *
 * The preloading loop only runs while a next element exists on both sides;
 * the last elements are handled by scalar_branchless, so no element outside
 * [A, A + lenA) or [B, B + lenB) is ever read (empty inputs included).
 *
 * @param A,lenA first sorted set
 * @param B,lenB second sorted set
 * @param Match  receives the common values
 * @return number of common values written to Match
 */
size_t scalar_branchless_cached(const uint32_t *A, size_t lenA,
                                const uint32_t *B, size_t lenB,
                                uint32_t *Match) {
    if (lenA == 0 || lenB == 0)
        return 0; // nothing to intersect, and A[0] / B[0] must not be read

    const uint32_t *initMatch = Match;
    const uint32_t *endA = A + lenA;
    const uint32_t *endB = B + lenB;
    const uint32_t *lastA = endA - 1; // while A < lastA, A[1] is readable
    const uint32_t *lastB = endB - 1;

    uint32_t thisA = A[0];
    uint32_t thisB = B[0];

    while (A < lastA && B < lastB) {

#ifdef IACA
        IACA_START;
#endif
        uint32_t nextA = A[1];
        uint32_t nextB = B[1];

        uint32_t oldA = thisA;
        uint32_t oldB = thisB;

        *Match = thisA; // write the result regardless of match

        int m = (oldB == oldA) ? 1 : 0; // advance Match only if equal
        int a = (oldB >= oldA) ? 1 : 0; // advance A if match or B ahead
        int b = (oldB <= oldA) ? 1 : 0; // advance B if match or B behind

        thisA = (oldB >= oldA) ? nextA : thisA; // advance A if match or B ahead
        thisB = (oldB <= oldA) ? nextB : thisB; // advance B if match or B behind

        Match += m; // will be rewritten unless advanced
        A += a;
        B += b;

#ifdef IACA
        IACA_END;
#endif
    }

    // at least one side is down to its last element: finish without lookahead
    const size_t head = static_cast<size_t>(Match - initMatch);
    return head + scalar_branchless(A, static_cast<size_t>(endA - A),
                                    B, static_cast<size_t>(endB - B), Match);
}

/**
 * Like scalar_branchless_cached, but with a lookahead of two elements.
 *
 * The lookahead loop only runs while two further elements exist on both
 * sides; the rest is handled by scalar_branchless, so no element outside the
 * inputs is ever read (inputs shorter than two elements included).
 *
 * @param A,lenA first sorted set
 * @param B,lenB second sorted set
 * @param Match  receives the common values
 * @return number of common values written to Match
 */
size_t scalar_branchless_cached2(const uint32_t *A, size_t lenA,
                                 const uint32_t *B, size_t lenB,
                                 uint32_t *Match) {
    const uint32_t *initMatch = Match;
    const uint32_t *endA = A + lenA;
    const uint32_t *endB = B + lenB;

    if (lenA >= 2 && lenB >= 2) {
        const uint32_t *stopA = endA - 2; // while A < stopA, A[2] is readable
        const uint32_t *stopB = endB - 2;

        uint32_t thisA = A[0];
        uint32_t thisB = B[0];

        uint32_t nextA = A[1];
        uint32_t nextB = B[1];

        while (A < stopA && B < stopB) {
#ifdef IACA
            IACA_START;
#endif

            uint32_t nextNextA = A[2];
            uint32_t nextNextB = B[2];

            uint32_t oldA = thisA;
            uint32_t oldB = thisB;

            *Match = thisA; // write the result regardless of match

            int m = (oldB == oldA) ? 1 : 0; // advance Match only if equal
            int a = (oldB >= oldA) ? 1 : 0; // advance A if match or B ahead
            int b = (oldB <= oldA) ? 1 : 0; // advance B if match or B behind

            thisA = (oldB >= oldA) ? nextA : thisA; // advance A if match or B ahead
            thisB = (oldB <= oldA) ? nextB : thisB; // advance B if match or B behind

            nextA = (oldB >= oldA) ? nextNextA : nextA;
            nextB = (oldB <= oldA) ? nextNextB : nextB;

            Match += m; // Match will be rewritten unless advanced
            A += a;
            B += b;

#ifdef IACA
            IACA_END;
#endif
        }
    }

    // fewer than three elements left on one side: finish without lookahead
    const size_t head = static_cast<size_t>(Match - initMatch);
    return head + scalar_branchless(A, static_cast<size_t>(endA - A),
                                    B, static_cast<size_t>(endB - B), Match);
}

// One branchless step; used by scalar_branchless_unrolled below.
#define BRANCHLESSMATCH() {                     \
        int m = (*B == *A) ? 1 : 0;             \
        int a = (*B >= *A) ? 1 : 0;             \
        int b = (*B <= *A) ? 1 : 0;             \
        *Match = *A;                            \
        Match += m;                             \
        A += a;                                 \
        B += b;                                 \
    }

/**
 * Unrolled branchless approach by N. Kurz.
 *
 * @param A,lenA first sorted set
 * @param B,lenB second sorted set
 * @param Match  receives the common values
 * @return number of common values written to Match
 */
size_t scalar_branchless_unrolled(const uint32_t *A, size_t lenA,
                                  const uint32_t *B, size_t lenB,
                                  uint32_t *Match) {

    const size_t UNROLLED = 4;

    const uint32_t *initMatch = Match;
    const uint32_t *endA = A + lenA;
    const uint32_t *endB = B + lenB;

    if (lenA >= UNROLLED && lenB >= UNROLLED) {
        // each step moves a pointer by at most one element, so stopping
        // UNROLLED elements early keeps the whole unrolled block in bounds
        const uint32_t *stopA = endA - UNROLLED;
        const uint32_t *stopB = endB - UNROLLED;

        while (A < stopA && B < stopB) {
            BRANCHLESSMATCH(); // NOTE: number of calls must match UNROLLED
            BRANCHLESSMATCH();
            BRANCHLESSMATCH();
            BRANCHLESSMATCH();
        }
    }

    // Finish remainder without overstepping
    while (A < endA && B < endB) {
        BRANCHLESSMATCH();
    }

    return static_cast<size_t>(Match - initMatch);
}

#undef BRANCHLESSMATCH

// Intel disassembly for branchless
//  15:   mov    (%rdx),%r11d        # r11 = *B
//  18:   mov    $0x1,%r8d           # r8 = 1
//  1e:   mov    (%rdi),%eax         # eax = *A
//  20:   cmp    %eax,%r11d          # *B <=> *A
//  23:   mov    $0x0,%r11d          # r11 = 0
//  29:   cmove  %r8,%r11            # if *B == *A  r11 = 1
//  2d:   mov    %eax,(%r9)          # *output = *A
//  30:   lea    (%r9,%r11,4),%r9    # output += 4 * r11
//  34:   mov    $0x0,%r11d          # r11 = 0
//  3a:   cmovae %r8,%r11            # if *B >= *A  r11 = 1
//  3e:   lea    (%rdi,%r11,4),%rdi  # A += 4 * r11
//  42:   mov    $0x0,%r11d          # r11 = 0
//  48:   cmovbe %r8,%r11            # if *B <= *A  r11 = 1
//  4c:   lea    (%rdx,%r11,4),%rdx  # B += 4 * r11
Apache License Version 2.0 http://www.apache.org/licenses/. 4 | * 5 | * (c) Daniel Lemire, http://lemire.me/en/ 6 | */ 7 | #ifndef COMMON_H_ 8 | #define COMMON_H_ 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | //#include 42 | //#include 43 | #include 44 | #include 45 | #include 46 | 47 | #define _LIKELY(x) __builtin_expect(!!(x), 1) 48 | #define _UNLIKELY(x) __builtin_expect(!!(x), 0) 49 | #define _NOINLINE __attribute__((noinline)) 50 | #define _ALWAYSINLINE __attribute__((always_inline)) 51 | typedef std::set, 52 | std::function< 53 | bool(const std::vector&, const std::vector&)>>mySet; 54 | 55 | #endif /* COMMON_H_ */ 56 | -------------------------------------------------------------------------------- /include/gallopingintersection.h: -------------------------------------------------------------------------------- 1 | /* 2 | * gallopingintersection.h 3 | * 4 | * Created on: May 13, 2013 5 | * Author: ? 6 | */ 7 | 8 | #ifndef GALLOPINGINTERSECTION_H_ 9 | #define GALLOPINGINTERSECTION_H_ 10 | 11 | /** 12 | * This is often called galloping or exponential search. 13 | * 14 | * Used by frogintersectioncardinality below 15 | * 16 | * Based on binary search... 17 | * Find the smallest integer larger than pos such 18 | * that array[pos]>= min. 19 | * If none can be found, return array.length. 20 | * From code by O. Kaser. 
/**
 * Galloping (exponential) search followed by a binary search.
 *
 * Looks for the smallest index i > pos with array[i] >= min and returns it;
 * returns length when no element after pos reaches min (and pos + 1 when
 * that is already at or past length). From code by O. Kaser.
 *
 * Used by frogintersectioncardinality and the one-sided variants below.
 */
static size_t __frogadvanceUntil(const uint32_t *array, const size_t pos,
                                 const size_t length, const size_t min) {
    size_t lo = pos + 1;

    // possibly common sequential case: the very next slot already qualifies
    if (lo >= length)
        return lo;
    if (array[lo] >= min)
        return lo;

    // gallop: double the span until it overshoots min or leaves the array
    size_t span = 1; // could start larger
    for (; lo + span < length; span *= 2) {
        if (array[lo + span] >= min)
            break;
    }
    const size_t probe = lo + span;
    size_t hi = (probe < length) ? probe : length - 1;

    if (array[hi] < min) // no item >= min anywhere
        return length;

    // the previous (half-sized) span is known to be too small
    lo += span / 2;

    // binary search with array[lo] < min <= array[hi]
    while (hi - lo != 1) {
        const size_t mid = (lo + hi) / 2;
        if (array[mid] == min)
            return mid;
        if (array[mid] < min)
            lo = mid;
        else
            hi = mid;
    }
    return hi;
}

/**
 * EXPERIMENTAL VERSION of __frogadvanceUntil.
 *
 * Same contract; the only difference is an early exit when the element the
 * gallop stopped on is exactly min. From code by O. Kaser.
 */
static size_t __frogadvanceUntil_experimental(const uint32_t *array,
                                              const size_t pos,
                                              const size_t length,
                                              const size_t min) {
    size_t lo = pos + 1;

    // possibly common sequential case
    if (lo >= length)
        return lo;
    if (array[lo] >= min)
        return lo;

    size_t span = 1; // could start larger
    for (; lo + span < length; span *= 2) {
        if (array[lo + span] >= min)
            break;
    }
    const size_t probe = lo + span;
    size_t hi = (probe < length) ? probe : length - 1;

    // maybe we are lucky and landed right on the value
    if (array[hi] == min)
        return hi;

    if (array[hi] < min) // no item >= min anywhere
        return length;

    // the previous (half-sized) span is known to be too small
    lo += span / 2;

    while (hi - lo != 1) {
        const size_t mid = (lo + hi) / 2;
        if (array[mid] == min)
            return mid;
        if (array[mid] < min)
            lo = mid;
        else
            hi = mid;
    }
    return hi;
}

/**
 * Cardinality of the intersection of two sorted sets, galloping on both.
 */
size_t frogintersectioncardinality(const uint32_t *set1, const size_t length1,
                                   const uint32_t *set2,
                                   const size_t length2) {
    if (length1 == 0 || length2 == 0)
        return 0;
    size_t count = 0;
    size_t i = 0;
    size_t j = 0;
    for (;;) {
        if (set1[i] < set2[j]) {
            i = __frogadvanceUntil(set1, i, length1, set2[j]);
            if (i == length1)
                break;
        }
        if (set2[j] < set1[i]) {
            j = __frogadvanceUntil(set2, j, length2, set1[i]);
            if (j == length2)
                break;
            continue;
        }
        // set1[i] == set2[j]
        ++count;
        if (++i == length1)
            break;
        if (++j == length2)
            break;
    }
    return count;
}

/**
 * Cardinality of the intersection, galloping only through the larger set
 * while the smaller one is walked element by element. The arguments are
 * swapped automatically when the "small" set is in fact the longer one.
 */
size_t onesidedgallopingintersectioncardinality(const uint32_t *smallset,
                                                const size_t smalllength,
                                                const uint32_t *largeset,
                                                const size_t largelength) {
    if (largelength < smalllength)
        return onesidedgallopingintersectioncardinality(largeset, largelength,
                                                        smallset, smalllength);
    if (smalllength == 0)
        return 0;
    size_t count = 0;
    size_t big = 0;    // cursor in largeset
    size_t little = 0; // cursor in smallset
    for (;;) {
        if (largeset[big] < smallset[little]) {
            big = __frogadvanceUntil(largeset, big, largelength,
                                     smallset[little]);
            if (big == largelength)
                return count;
        }
        // as long as the heads agree: count, step the small set, re-seek
        while (!(smallset[little] < largeset[big])) {
            ++count;
            if (++little == smalllength)
                return count;
            big = __frogadvanceUntil(largeset, big, largelength,
                                     smallset[little]);
            if (big == largelength)
                return count;
        }
        // small head is behind the large head: skip it
        if (++little == smalllength)
            return count;
    }
}

/**
 * Intersection of two sorted sets written to out, galloping only through
 * the larger set. The arguments are swapped automatically when needed.
 *
 * @return number of values written to out
 */
size_t onesidedgallopingintersection(const uint32_t *smallset,
                                     const size_t smalllength,
                                     const uint32_t *largeset,
                                     const size_t largelength, uint32_t *out) {
    if (largelength < smalllength)
        return onesidedgallopingintersection(largeset, largelength, smallset,
                                             smalllength, out);
    if (smalllength == 0)
        return 0;
    const uint32_t *const first = out;
    size_t big = 0;    // cursor in largeset
    size_t little = 0; // cursor in smallset
    for (;;) {
        if (largeset[big] < smallset[little]) {
            big = __frogadvanceUntil(largeset, big, largelength,
                                     smallset[little]);
            if (big == largelength)
                return out - first;
        }
        // as long as the heads agree: emit, step the small set, re-seek
        while (!(smallset[little] < largeset[big])) {
            *out++ = smallset[little];
            if (++little == smalllength)
                return out - first;
            big = __frogadvanceUntil(largeset, big, largelength,
                                     smallset[little]);
            if (big == largelength)
                return out - first;
        }
        // small head is behind the large head: skip it
        if (++little == smalllength)
            return out - first;
    }
}

/**
 * Same as onesidedgallopingintersection, but seeking with
 * __frogadvanceUntil_experimental.
 *
 * @return number of values written to out
 */
size_t onesidedgallopingintersection_experimental(const uint32_t *smallset,
                                                  const size_t smalllength,
                                                  const uint32_t *largeset,
                                                  const size_t largelength,
                                                  uint32_t *out) {
    if (largelength < smalllength)
        return onesidedgallopingintersection_experimental(
            largeset, largelength, smallset, smalllength, out);
    if (smalllength == 0)
        return 0;
    const uint32_t *const first = out;
    size_t big = 0;    // cursor in largeset
    size_t little = 0; // cursor in smallset
    for (;;) {
        if (largeset[big] < smallset[little]) {
            big = __frogadvanceUntil_experimental(largeset, big, largelength,
                                                  smallset[little]);
            if (big == largelength)
                return out - first;
        }
        // as long as the heads agree: emit, step the small set, re-seek
        while (!(smallset[little] < largeset[big])) {
            *out++ = smallset[little];
            if (++little == smalllength)
                return out - first;
            big = __frogadvanceUntil_experimental(largeset, big, largelength,
                                                  smallset[little]);
            if (big == largelength)
                return out - first;
        }
        // small head is behind the large head: skip it
        if (++little == smalllength)
            return out - first;
    }
}
7 | */ 8 | 9 | #ifndef HSCALABLEINTERSECTION_H_ 10 | #define HSCALABLEINTERSECTION_H_ 11 | 12 | #include "common.h" 13 | 14 | namespace highlyscalablewordpresscom { 15 | /** 16 | * Taken almost verbatim from http://highlyscalable.wordpress.com/2012/06/05/fast-intersection-sorted-lists-sse/ 17 | * (just for comparison) 18 | */ 19 | size_t cardinality_intersect_scalar(const uint32_t *A, const size_t s_a, 20 | const uint32_t *B, const size_t s_b) { 21 | size_t i_a = 0, i_b = 0; 22 | size_t counter = 0; 23 | 24 | while (i_a < s_a && i_b < s_b) { 25 | if (A[i_a] < B[i_b]) { 26 | i_a++; 27 | } else if (B[i_b] < A[i_a]) { 28 | i_b++; 29 | } else { 30 | counter++; 31 | i_a++; 32 | i_b++; 33 | } 34 | } 35 | return counter; 36 | } 37 | 38 | /** 39 | * Taken almost verbatim from http://highlyscalable.wordpress.com/2012/06/05/fast-intersection-sorted-lists-sse/ 40 | * (just for comparison) 41 | */ 42 | size_t intersect_scalar(const uint32_t *A, const size_t s_a, 43 | const uint32_t *B, const size_t s_b, uint32_t * out) { 44 | const uint32_t * const initout (out); 45 | size_t i_a = 0, i_b = 0; 46 | 47 | while (i_a < s_a && i_b < s_b) { 48 | if (A[i_a] < B[i_b]) { 49 | i_a++; 50 | } else if (B[i_b] < A[i_a]) { 51 | i_b++; 52 | } else { 53 | *out++ = B[i_b]; 54 | i_a++; 55 | i_b++; 56 | } 57 | } 58 | return out - initout; 59 | } 60 | /** 61 | * More or less from 62 | * http://highlyscalable.wordpress.com/2012/06/05/fast-intersection-sorted-lists-sse/ 63 | */ 64 | const static __m128i shuffle_mask[16] = { 65 | _mm_set_epi8(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0), 66 | _mm_set_epi8(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0), 67 | _mm_set_epi8(15,14,13,12,11,10,9,8,7,6,5,4,7,6,5,4), 68 | _mm_set_epi8(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0), 69 | _mm_set_epi8(15,14,13,12,11,10,9,8,7,6,5,4,11,10,9,8), 70 | _mm_set_epi8(15,14,13,12,11,10,9,8,11,10,9,8,3,2,1,0), 71 | _mm_set_epi8(15,14,13,12,11,10,9,8,11,10,9,8,7,6,5,4), 72 | _mm_set_epi8(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0), 73 | 
_mm_set_epi8(15,14,13,12,11,10,9,8,7,6,5,4,15,14,13,12), 74 | _mm_set_epi8(15,14,13,12,11,10,9,8,15,14,13,12,3,2,1,0), 75 | _mm_set_epi8(15,14,13,12,11,10,9,8,15,14,13,12,7,6,5,4), 76 | _mm_set_epi8(15,14,13,12,15,14,13,12,7,6,5,4,3,2,1,0), 77 | _mm_set_epi8(15,14,13,12,11,10,9,8,15,14,13,12,11,10,9,8), 78 | _mm_set_epi8(15,14,13,12,15,14,13,12,11,10,9,8,3,2,1,0), 79 | _mm_set_epi8(15,14,13,12,15,14,13,12,11,10,9,8,7,6,5,4), 80 | _mm_set_epi8(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0), 81 | }; 82 | // precomputed dictionary 83 | 84 | 85 | 86 | /*int getBit(int value, int position) { 87 | return ((value & (1 << position)) >> position); 88 | }*/ 89 | 90 | // a simple implementation, we don't care about performance here 91 | /*void prepare_shuffling_dictionary() { 92 | for (int i = 0; i < 16; i++) { 93 | int counter = 0; 94 | char permutation[16]; 95 | memset(permutation, 0xFF, sizeof(permutation)); 96 | for (char b = 0; b < 4; b++) { 97 | if (getBit(i, b)) { 98 | permutation[counter++] = 4 * b; 99 | permutation[counter++] = 4 * b + 1; 100 | permutation[counter++] = 4 * b + 2; 101 | permutation[counter++] = 4 * b + 3; 102 | } 103 | } 104 | __m128i mask = _mm_loadu_si128((const __m128i *) permutation); 105 | shuffle_mask[i] = mask; 106 | } 107 | }*/ 108 | 109 | /** 110 | * Taken almost verbatim from http://highlyscalable.wordpress.com/2012/06/05/fast-intersection-sorted-lists-sse/ 111 | */ 112 | size_t cardinality_intersect_SIMD(const uint32_t *A, const size_t s_a, 113 | const uint32_t *B, const size_t s_b) { 114 | size_t count = 0; 115 | size_t i_a = 0, i_b = 0; 116 | 117 | // trim lengths to be a multiple of 4 118 | size_t st_a = (s_a / 4) * 4; 119 | size_t st_b = (s_b / 4) * 4; 120 | 121 | while (i_a < st_a && i_b < st_b) { 122 | //[ load segments of four 32-bit elements 123 | __m128i v_a = _mm_load_si128((__m128i *) &A[i_a]); 124 | __m128i v_b = _mm_load_si128((__m128i *) &B[i_b]); 125 | //] 126 | 127 | //[ move pointers 128 | const uint32_t a_max = A[i_a + 3]; 129 | 
const uint32_t b_max = B[i_b + 3]; 130 | i_a += (a_max <= b_max) * 4; 131 | i_b += (a_max >= b_max) * 4; 132 | //] 133 | 134 | //[ compute mask of common elements 135 | const uint32_t cyclic_shift = _MM_SHUFFLE(0, 3, 2, 1); 136 | __m128i cmp_mask1 = _mm_cmpeq_epi32(v_a, v_b); // pairwise comparison 137 | v_b = _mm_shuffle_epi32(v_b, cyclic_shift); // shuffling 138 | __m128i cmp_mask2 = _mm_cmpeq_epi32(v_a, v_b); // again... 139 | v_b = _mm_shuffle_epi32(v_b, cyclic_shift); 140 | __m128i cmp_mask3 = _mm_cmpeq_epi32(v_a, v_b); // and again... 141 | v_b = _mm_shuffle_epi32(v_b, cyclic_shift); 142 | __m128i cmp_mask4 = _mm_cmpeq_epi32(v_a, v_b); // and again. 143 | __m128i cmp_mask = _mm_or_si128(_mm_or_si128(cmp_mask1, cmp_mask2), 144 | _mm_or_si128(cmp_mask3, cmp_mask4)); // OR-ing of comparison masks 145 | // convert the 128-bit mask to the 4-bit mask 146 | const int mask = _mm_movemask_ps((__m128 ) cmp_mask); 147 | //] 148 | 149 | //[ copy out common elements 150 | //__m128i p = _mm_shuffle_epi8(v_a, shuffle_mask[mask]); 151 | //_mm_storeu_si128((__m128i*)&C[count], p); 152 | count += _mm_popcnt_u32(mask); // a number of elements is a weight of the mask 153 | //] 154 | } 155 | 156 | // intersect the tail using scalar intersection 157 | while (i_a < s_a && i_b < s_b) { 158 | if (A[i_a] < B[i_b]) { 159 | i_a++; 160 | } else if (B[i_b] < A[i_a]) { 161 | i_b++; 162 | } else { 163 | count++; 164 | i_a++; 165 | i_b++; 166 | } 167 | } 168 | 169 | return count; 170 | } 171 | 172 | 173 | 174 | 175 | 176 | /** 177 | * Taken almost verbatim from http://highlyscalable.wordpress.com/2012/06/05/fast-intersection-sorted-lists-sse/ 178 | * 179 | * It is not safe for out to be either A or B. 
180 | */ 181 | size_t intersect_SIMD(const uint32_t *A, const size_t s_a, 182 | const uint32_t *B, const size_t s_b, uint32_t * out) { 183 | assert(out != A); 184 | assert(out != B); 185 | const uint32_t * const initout (out); 186 | size_t i_a = 0, i_b = 0; 187 | 188 | // trim lengths to be a multiple of 4 189 | size_t st_a = (s_a / 4) * 4; 190 | size_t st_b = (s_b / 4) * 4; 191 | 192 | while (i_a < st_a && i_b < st_b) { 193 | //[ load segments of four 32-bit elements 194 | __m128i v_a = _mm_load_si128((__m128i *) &A[i_a]); 195 | __m128i v_b = _mm_load_si128((__m128i *) &B[i_b]); 196 | //] 197 | 198 | //[ move pointers 199 | const uint32_t a_max = A[i_a + 3]; 200 | const uint32_t b_max = B[i_b + 3]; 201 | i_a += (a_max <= b_max) * 4; 202 | i_b += (a_max >= b_max) * 4; 203 | //] 204 | 205 | //[ compute mask of common elements 206 | const uint32_t cyclic_shift = _MM_SHUFFLE(0, 3, 2, 1); 207 | __m128i cmp_mask1 = _mm_cmpeq_epi32(v_a, v_b); // pairwise comparison 208 | v_b = _mm_shuffle_epi32(v_b, cyclic_shift); // shuffling 209 | __m128i cmp_mask2 = _mm_cmpeq_epi32(v_a, v_b); // again... 210 | v_b = _mm_shuffle_epi32(v_b, cyclic_shift); 211 | __m128i cmp_mask3 = _mm_cmpeq_epi32(v_a, v_b); // and again... 212 | v_b = _mm_shuffle_epi32(v_b, cyclic_shift); 213 | __m128i cmp_mask4 = _mm_cmpeq_epi32(v_a, v_b); // and again. 
214 | __m128i cmp_mask = _mm_or_si128(_mm_or_si128(cmp_mask1, cmp_mask2), 215 | _mm_or_si128(cmp_mask3, cmp_mask4)); // OR-ing of comparison masks 216 | // convert the 128-bit mask to the 4-bit mask 217 | const int mask = _mm_movemask_ps((__m128 ) cmp_mask); 218 | //] 219 | 220 | //[ copy out common elements 221 | const __m128i p = _mm_shuffle_epi8(v_a, shuffle_mask[mask]); 222 | _mm_storeu_si128((__m128i*)out, p); 223 | out += _mm_popcnt_u32(mask); // a number of elements is a weight of the mask 224 | //] 225 | } 226 | 227 | // intersect the tail using scalar intersection 228 | while (i_a < s_a && i_b < s_b) { 229 | if (A[i_a] < B[i_b]) { 230 | i_a++; 231 | } else if (B[i_b] < A[i_a]) { 232 | i_b++; 233 | } else { 234 | *out++ = B[i_b]; ; 235 | i_a++; 236 | i_b++; 237 | } 238 | } 239 | 240 | return out - initout; 241 | } 242 | 243 | size_t dan_cardinality_intersect_SIMD(const uint32_t *A, const size_t s_a, 244 | const uint32_t *B, const size_t s_b) { 245 | size_t count = 0; 246 | size_t i_a = 0, i_b = 0; 247 | const static uint32_t cyclic_shift1 = _MM_SHUFFLE(0, 3, 2, 1); 248 | const static uint32_t cyclic_shift2 = _MM_SHUFFLE(1, 0, 3, 2); 249 | const static uint32_t cyclic_shift3 = _MM_SHUFFLE(2, 1, 0, 3); 250 | 251 | // trim lengths to be a multiple of 4 252 | size_t st_a = (s_a / 4) * 4; 253 | size_t st_b = (s_b / 4) * 4; 254 | if (i_a < st_a && i_b < st_b) { 255 | __m128i v_a, v_b; 256 | v_a = _mm_load_si128((__m128i *) &A[i_a]); 257 | v_b = _mm_load_si128((__m128i *) &B[i_b]); 258 | while (true) { 259 | const __m128i cmp_mask1 = _mm_cmpeq_epi32(v_a, v_b); // pairwise comparison 260 | const __m128i cmp_mask2 = _mm_cmpeq_epi32(v_a, 261 | _mm_shuffle_epi32(v_b, cyclic_shift1)); // again... 262 | __m128i cmp_mask = _mm_or_si128(cmp_mask1, cmp_mask2); 263 | const __m128i cmp_mask3 = _mm_cmpeq_epi32(v_a, 264 | _mm_shuffle_epi32(v_b, cyclic_shift2)); // and again... 
265 | cmp_mask = _mm_or_si128(cmp_mask, cmp_mask3); 266 | const __m128i cmp_mask4 = _mm_cmpeq_epi32(v_a, 267 | _mm_shuffle_epi32(v_b, cyclic_shift3)); // and again. 268 | cmp_mask = _mm_or_si128(cmp_mask, cmp_mask4); 269 | // convert the 128-bit mask to the 4-bit mask 270 | const int mask = _mm_movemask_ps((__m128 ) cmp_mask); 271 | count += _mm_popcnt_u32(mask); // the number of elements is the weight of the mask 272 | const uint32_t a_max = A[i_a + 3]; 273 | if (a_max <= B[i_b + 3]) { 274 | i_a += 4; 275 | if (i_a >= st_a) 276 | break; 277 | v_a = _mm_load_si128((__m128i *) &A[i_a]); 278 | } 279 | if (a_max >= B[i_b + 3]) { 280 | i_b += 4; 281 | if (i_b >= st_b) 282 | break; 283 | v_b = _mm_load_si128((__m128i *) &B[i_b]); 284 | } 285 | 286 | } 287 | } 288 | 289 | // intersect the tail using scalar intersection 290 | while (i_a < s_a && i_b < s_b) { 291 | if (A[i_a] < B[i_b]) { 292 | i_a++; 293 | } else if (B[i_b] < A[i_a]) { 294 | i_b++; 295 | } else { 296 | count++; 297 | i_a++; 298 | i_b++; 299 | } 300 | } 301 | 302 | return count; 303 | } 304 | 305 | 306 | /** 307 | * Optimized version of http://highlyscalable.wordpress.com/2012/06/05/fast-intersection-sorted-lists-sse/ 308 | * 309 | * It is not safe for out to be either A or B. 
310 | */ 311 | size_t dan_intersect_SIMD(const uint32_t *A, const size_t s_a, 312 | const uint32_t *B, const size_t s_b, uint32_t * out) { 313 | assert(out != A); 314 | assert(out != B); 315 | const uint32_t * const initout (out); 316 | size_t i_a = 0, i_b = 0; 317 | const static uint32_t cyclic_shift1 = _MM_SHUFFLE(0, 3, 2, 1); 318 | const static uint32_t cyclic_shift2 = _MM_SHUFFLE(1, 0, 3, 2); 319 | const static uint32_t cyclic_shift3 = _MM_SHUFFLE(2, 1, 0, 3); 320 | 321 | // trim lengths to be a multiple of 4 322 | size_t st_a = (s_a / 4) * 4; 323 | size_t st_b = (s_b / 4) * 4; 324 | if (i_a < st_a && i_b < st_b) { 325 | __m128i v_a, v_b; 326 | v_a = _mm_load_si128((__m128i *) &A[i_a]); 327 | v_b = _mm_load_si128((__m128i *) &B[i_b]); 328 | while (true) { 329 | const __m128i cmp_mask1 = _mm_cmpeq_epi32(v_a, v_b); // pairwise comparison 330 | const __m128i cmp_mask2 = _mm_cmpeq_epi32(v_a, 331 | _mm_shuffle_epi32(v_b, cyclic_shift1)); // again... 332 | __m128i cmp_mask = _mm_or_si128(cmp_mask1, cmp_mask2); 333 | const __m128i cmp_mask3 = _mm_cmpeq_epi32(v_a, 334 | _mm_shuffle_epi32(v_b, cyclic_shift2)); // and again... 335 | cmp_mask = _mm_or_si128(cmp_mask, cmp_mask3); 336 | const __m128i cmp_mask4 = _mm_cmpeq_epi32(v_a, 337 | _mm_shuffle_epi32(v_b, cyclic_shift3)); // and again. 
338 | cmp_mask = _mm_or_si128(cmp_mask, cmp_mask4); 339 | // convert the 128-bit mask to the 4-bit mask 340 | const int mask = _mm_movemask_ps((__m128 ) cmp_mask); 341 | //] 342 | 343 | //[ copy out common elements 344 | const __m128i p = _mm_shuffle_epi8(v_a, shuffle_mask[mask]); 345 | 346 | _mm_storeu_si128((__m128i*)out, p); 347 | //] 348 | out += _mm_popcnt_u32(mask); // a number of elements is a weight of the mask 349 | 350 | const uint32_t a_max = A[i_a + 3]; 351 | //const uint32_t b_max = B[i_b + 3]; 352 | if (a_max <= B[i_b + 3]) { 353 | i_a += 4; 354 | if (i_a >= st_a) 355 | break; 356 | v_a = _mm_load_si128((__m128i *) &A[i_a]); 357 | } 358 | if (a_max >= B[i_b + 3]) { 359 | i_b += 4; 360 | if (i_b >= st_b) 361 | break; 362 | v_b = _mm_load_si128((__m128i *) &B[i_b]); 363 | } 364 | 365 | } 366 | } 367 | 368 | // intersect the tail using scalar intersection 369 | while (i_a < s_a && i_b < s_b) { 370 | if (A[i_a] < B[i_b]) { 371 | i_a++; 372 | } else if (B[i_b] < A[i_a]) { 373 | i_b++; 374 | } else { 375 | *out++ = B[i_b]; 376 | i_a++; 377 | i_b++; 378 | } 379 | } 380 | 381 | return out - initout; 382 | } 383 | 384 | 385 | } 386 | 387 | #endif /* HSCALABLEINTERSECTION_H_ */ 388 | -------------------------------------------------------------------------------- /include/hybridintersection.h: -------------------------------------------------------------------------------- 1 | #ifndef HYBRIDINTERSECTION_H_ 2 | #define HYBRIDINTERSECTION_H_ 3 | 4 | #include "intersection.h" 5 | #include "gallopingintersection.h" 6 | #include "binarysearchintersection.h" 7 | #include "mediumintersection.h" 8 | #include "widevectorintersection.h" 9 | #include "hscalableintersection.h" 10 | #include "match.h" 11 | size_t danielshybridintersectioncardinality(const uint32_t * set1, 12 | const size_t length1, const uint32_t * set2, const size_t length2) { 13 | if ((200 * length1 < length2) or (200 * length2 < length1)) { 14 | if (length1 <= length2) 15 | return 
danfar_count_medium(set1, length1, 16 | set2, length2); 17 | else 18 | return danfar_count_medium(set2, length2, 19 | set1, length1); 20 | } else { 21 | if (length1 <= length2) 22 | return natedanalt_count_medium(set1, length1, set2, length2); 23 | else 24 | return natedanalt_count_medium(set2, length2, set1, length1); 25 | } 26 | } 27 | 28 | size_t olddanielshybridintersection(const uint32_t * set1, 29 | const size_t length1, const uint32_t * set2, const size_t length2, uint32_t *out) { 30 | if ((10 * length1 <= length2) or (10 * length2 <= length1)) { 31 | if ((200 * length1 < length2) or (200 * length2 < length1)) { 32 | if (length1 <= length2) 33 | return danfar_medium(set1, length1, 34 | set2, length2,out); 35 | else 36 | return danfar_medium(set2, length2, 37 | set1, length1,out); 38 | } else { 39 | if (length1 <= length2) 40 | return natedanalt_medium(set1, length1, set2, length2,out); 41 | else 42 | return natedanalt_medium(set2, length2, set1, length1,out); 43 | } 44 | } 45 | return highlyscalablewordpresscom::dan_intersect_SIMD(set1, length1, set2, length2,out); 46 | } 47 | 48 | size_t danielshybridintersection(const uint32_t * set1, 49 | const size_t length1, const uint32_t * set2, const size_t length2, uint32_t *out) { 50 | if ((length1==0) or (length2 == 0)) return 0; 51 | 52 | if ((50 * length1 <= length2) or (50 * length2 <= length1)) { 53 | if (length1 <= length2) 54 | return danfarfar_medium(set1, length1, set2, length2,out); 55 | else 56 | return danfarfar_medium(set2, length2, set1, length1,out); 57 | } 58 | 59 | if (length1 <= length2) 60 | return match_v4_f2_p0(set1, length1, set2, length2, out); 61 | else 62 | return match_v4_f2_p0(set2, length2, set1, length1, out); 63 | 64 | } 65 | size_t SIMDintersection(const uint32_t *set1, 66 | const size_t length1, const uint32_t *set2, const size_t length2, uint32_t *out) { 67 | if ((length1 == 0) or (length2 == 0)) return 0; 68 | 69 | 70 | if ((1000 * length1 <= length2) or (1000 * length2 <= 
length1)) { 71 | if (length1 <= length2) 72 | return SIMDgalloping(set1, length1, set2, length2, out); 73 | else 74 | return SIMDgalloping(set2, length2, set1, length1, out); 75 | } 76 | 77 | if ((50 * length1 <= length2) or (50 * length2 <= length1)) { 78 | if (length1 <= length2) 79 | return v3(set1, length1, set2, length2, out); 80 | else 81 | return v3(set2, length2, set1, length1, out); 82 | } 83 | 84 | if (length1 <= length2) 85 | return v1(set1, length1, set2, length2, out); 86 | else 87 | return v1(set2, length2, set1, length1, out); 88 | } 89 | 90 | 91 | 92 | #endif /* HYBRIDINTERSECTION_H_ */ 93 | -------------------------------------------------------------------------------- /include/inoueetal.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef INOUETAL_H_ 3 | #define INOUETAL_H_ 4 | const static int popcnt_u32_4bit[16] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }; 5 | 6 | const static __m128i shuffle_mask[16] = { 7 | _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), 8 | _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), 9 | _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 7, 6, 5, 4), 10 | _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), 11 | _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 11, 10, 9, 8), 12 | _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 11, 10, 9, 8, 3, 2, 1, 0), 13 | _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 11, 10, 9, 8, 7, 6, 5, 4), 14 | _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), 15 | _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 15, 14, 13, 12), 16 | _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 15, 14, 13, 12, 3, 2, 1, 0), 17 | _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 15, 14, 13, 12, 7, 6, 5, 4), 18 | _mm_set_epi8(15, 14, 13, 12, 15, 14, 13, 12, 7, 6, 5, 4, 3, 2, 1, 0), 19 | _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 15, 14, 13, 12, 11, 10, 9, 8), 20 | _mm_set_epi8(15, 14, 13, 
12, 15, 14, 13, 12, 11, 10, 9, 8, 3, 2, 1, 0), 21 | _mm_set_epi8(15, 14, 13, 12, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4), 22 | _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0), 23 | }; 24 | 25 | 26 | 27 | 28 | /* 29 | * Inspired by "Faster Set Intersection with SIMD instructions 30 | * by Reducing Branch Mispredictions". 31 | */ 32 | size_t SIMDIntersectWithPrefilter(const uint32_t *A, const size_t s_a, 33 | const uint32_t *B, const size_t s_b, uint32_t * out) { 34 | 35 | assert(out != A); 36 | assert(out != B); 37 | size_t i_a = 0, i_b = 0, i_out = 0; 38 | 39 | // trim lengths to be a multiple of 4 40 | size_t st_a = (s_a / 4) * 4; 41 | size_t st_b = (s_b / 4) * 4; 42 | 43 | if (i_a < st_a && i_b < st_b) { 44 | const __m128i a_mask = _mm_set_epi8(12, 12, 12, 12, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0); 45 | const __m128i b_mask = _mm_set_epi8(12, 8, 4, 0, 12, 8, 4, 0, 12, 8, 4, 0, 12, 8, 4, 0); 46 | 47 | // load initial data in registers. 48 | __m128i v_a = _mm_loadu_si128((__m128i *) &A[i_a]); 49 | __m128i v_b = _mm_loadu_si128((__m128i *) &B[i_b]); 50 | __m128i v_a_filter = _mm_shuffle_epi8(v_a, a_mask); 51 | __m128i v_b_filter = _mm_shuffle_epi8(v_b, b_mask); 52 | uint32_t a_max = A[3]; 53 | uint32_t b_max = B[3]; 54 | 55 | for(;;) { 56 | 57 | // check for potential intersection of least significant byte. 
58 | __m128i v_c = _mm_cmpeq_epi8(v_a_filter, v_b_filter); 59 | 60 | if (!_mm_movemask_epi8(v_c)) { 61 | // No hit so load the next 4 lowest bytes from smallest 62 | advance: 63 | const uint32_t a_max_local = a_max; 64 | const uint32_t b_max_local = b_max; 65 | if (a_max_local <= b_max_local) { 66 | i_a += 4; 67 | if (i_a < st_a) { 68 | v_a = _mm_loadu_si128((__m128i *) &A[i_a]); 69 | v_a_filter = _mm_shuffle_epi8(v_a, a_mask); 70 | a_max = _mm_extract_epi32(v_a, 3); 71 | } 72 | else { 73 | break; 74 | } 75 | } 76 | if (a_max_local >= b_max_local) { 77 | i_b += 4; 78 | if (i_b < st_b) { 79 | v_b = _mm_loadu_si128((__m128i *) &B[i_b]); 80 | v_b_filter = _mm_shuffle_epi8(v_b, b_mask); 81 | b_max = _mm_extract_epi32(v_b, 3); 82 | } 83 | else { 84 | break; 85 | } 86 | } 87 | } else { 88 | 89 | // TODO: Any way to figure how to do this without having to copy registers? 90 | // If we can, that would free up more registers when we implement unrolling. 91 | __m128i v_as = v_a; 92 | __m128i v_bs = v_b; 93 | 94 | //[ compute mask of common elements 95 | const uint32_t cyclic_shift = _MM_SHUFFLE(0, 3, 2, 1); 96 | __m128i cmp_mask1 = _mm_cmpeq_epi32(v_as, v_bs); // pairwise comparison 97 | v_bs = _mm_shuffle_epi32(v_bs, cyclic_shift); // shuffling 98 | __m128i cmp_mask2 = _mm_cmpeq_epi32(v_as, v_bs); // again... 99 | v_bs = _mm_shuffle_epi32(v_bs, cyclic_shift); 100 | __m128i cmp_mask3 = _mm_cmpeq_epi32(v_as, v_bs); // and again... 101 | v_bs = _mm_shuffle_epi32(v_bs, cyclic_shift); 102 | __m128i cmp_mask4 = _mm_cmpeq_epi32(v_as, v_bs); // and again. 
103 | __m128i cmp_mask = _mm_or_si128(_mm_or_si128(cmp_mask1, cmp_mask2), 104 | _mm_or_si128(cmp_mask3, cmp_mask4)); // OR-ing of comparison masks 105 | // convert the 128-bit mask to the 4-bit mask 106 | const int mask = _mm_movemask_ps(_mm_castsi128_ps(cmp_mask)); 107 | //] 108 | 109 | //[ copy out common elements 110 | const __m128i p = _mm_shuffle_epi8(v_as, shuffle_mask[mask]); 111 | _mm_storeu_si128((__m128i*)(out + i_out), p); 112 | i_out += popcnt_u32_4bit[mask]; // a number of elements is a weight of the mask 113 | //] 114 | 115 | goto advance; 116 | } 117 | } 118 | } 119 | 120 | 121 | // intersect the tail using scalar intersection 122 | 123 | while (i_a < s_a && i_b < s_b) { 124 | const uint32_t a = A[i_a]; 125 | const uint32_t b = B[i_b]; 126 | if (a != b) { 127 | if (a <= b) { 128 | i_a++; 129 | } 130 | if (a >= b) { 131 | i_b++; 132 | } 133 | } else { 134 | out[i_out++] = a; 135 | i_a++; 136 | i_b++; 137 | } 138 | } 139 | 140 | return i_out; 141 | } 142 | 143 | size_t lemireSIMDIntersectWithPrefilter(const uint32_t *A, const size_t s_a, 144 | const uint32_t *B, const size_t s_b, uint32_t * out) { 145 | 146 | assert(out != A); 147 | assert(out != B); 148 | size_t i_a = 0, i_b = 0, i_out = 0; 149 | 150 | // trim lengths to be a multiple of 4 151 | size_t st_a = (s_a / 4) * 4; 152 | size_t st_b = (s_b / 4) * 4; 153 | 154 | if (i_a < st_a && i_b < st_b) { 155 | const __m128i a_mask = _mm_set_epi8(12, 12, 12, 12, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0); 156 | const __m128i b_mask = _mm_set_epi8(12, 8, 4, 0, 12, 8, 4, 0, 12, 8, 4, 0, 12, 8, 4, 0); 157 | 158 | // load initial data in registers. 
159 | __m128i v_a = _mm_loadu_si128((__m128i *) &A[i_a]); 160 | __m128i v_b = _mm_loadu_si128((__m128i *) &B[i_b]); 161 | __m128i v_a_filter = _mm_shuffle_epi8(v_a, a_mask); 162 | __m128i v_b_filter = _mm_shuffle_epi8(v_b, b_mask); 163 | uint32_t a_max = A[3]; 164 | uint32_t b_max = B[3]; 165 | 166 | for(;;) { 167 | 168 | // check for potential intersection of least significant byte. 169 | __m128i v_c = _mm_cmpeq_epi8(v_a_filter, v_b_filter); 170 | 171 | if (!_mm_movemask_epi8(v_c)) { 172 | // No hit so load the next 4 lowest bytes from smallest 173 | advance: 174 | const uint32_t a_max_local = a_max; 175 | const uint32_t b_max_local = b_max; 176 | if (a_max_local <= b_max_local) { 177 | i_a += 4; 178 | if (i_a < st_a) { 179 | v_a = _mm_loadu_si128((__m128i *) &A[i_a]); 180 | v_a_filter = _mm_shuffle_epi8(v_a, a_mask); 181 | a_max = _mm_extract_epi32(v_a, 3); 182 | } 183 | else { 184 | break; 185 | } 186 | } 187 | if (a_max_local >= b_max_local) { 188 | i_b += 4; 189 | if (i_b < st_b) { 190 | v_b = _mm_loadu_si128((__m128i *) &B[i_b]); 191 | v_b_filter = _mm_shuffle_epi8(v_b, b_mask); 192 | b_max = _mm_extract_epi32(v_b, 3); 193 | } 194 | else { 195 | break; 196 | } 197 | } 198 | } else { 199 | const static uint32_t cyclic_shift1 = _MM_SHUFFLE(0, 3, 2, 1); 200 | const static uint32_t cyclic_shift2 = _MM_SHUFFLE(1, 0, 3, 2); 201 | const static uint32_t cyclic_shift3 = _MM_SHUFFLE(2, 1, 0, 3); 202 | const __m128i cmp_mask1 = _mm_cmpeq_epi32(v_a, v_b); // pairwise comparison 203 | const __m128i cmp_mask2 = _mm_cmpeq_epi32(v_a, 204 | _mm_shuffle_epi32(v_b, cyclic_shift1)); // again... 205 | __m128i cmp_mask = _mm_or_si128(cmp_mask1, cmp_mask2); 206 | const __m128i cmp_mask3 = _mm_cmpeq_epi32(v_a, 207 | _mm_shuffle_epi32(v_b, cyclic_shift2)); // and again... 208 | cmp_mask = _mm_or_si128(cmp_mask, cmp_mask3); 209 | const __m128i cmp_mask4 = _mm_cmpeq_epi32(v_a, 210 | _mm_shuffle_epi32(v_b, cyclic_shift3)); // and again. 
211 | cmp_mask = _mm_or_si128(cmp_mask, cmp_mask4); 212 | // convert the 128-bit mask to the 4-bit mask 213 | const int mask = _mm_movemask_ps(_mm_castsi128_ps(cmp_mask)); 214 | //] 215 | 216 | //[ copy out common elements 217 | const __m128i p = _mm_shuffle_epi8(v_a, shuffle_mask[mask]); 218 | _mm_storeu_si128((__m128i*)(out + i_out), p); 219 | i_out += _mm_popcnt_u32(mask); // a number of elements is a weight of the mask 220 | //] 221 | 222 | goto advance; 223 | } 224 | } 225 | } 226 | 227 | 228 | // intersect the tail using scalar intersection 229 | 230 | while (i_a < s_a && i_b < s_b) { 231 | const uint32_t a = A[i_a]; 232 | const uint32_t b = B[i_b]; 233 | if (a != b) { 234 | if (a <= b) { 235 | i_a++; 236 | } 237 | if (a >= b) { 238 | i_b++; 239 | } 240 | } else { 241 | out[i_out++] = a; 242 | i_a++; 243 | i_b++; 244 | } 245 | } 246 | 247 | return i_out; 248 | } 249 | 250 | 251 | #endif /* INOUETAL_H_ */ 252 | -------------------------------------------------------------------------------- /include/intersection.h: -------------------------------------------------------------------------------- 1 | /** 2 | * This code is released under the 3 | * Apache License Version 2.0 http://www.apache.org/licenses/. 4 | * 5 | * (c) Daniel Lemire, http://lemire.me/en/ 6 | */ 7 | 8 | #ifndef INTERSECTION_H_ 9 | #define INTERSECTION_H_ 10 | 11 | #include "common.h" 12 | typedef size_t (*intersectionfunction)(const uint32_t * set1, 13 | const size_t length1, const uint32_t * set2, const size_t length2, uint32_t * out); 14 | 15 | 16 | typedef size_t (*cardinalityintersectionfunction)(const uint32_t * set1, 17 | const size_t length1, const uint32_t * set2, const size_t length2); 18 | 19 | typedef size_t (*cardinalityintersectionfunctionpart)(const uint16_t *A, 20 | const uint16_t *B, const size_t s_a, const size_t s_b); 21 | 22 | /** 23 | * Compute the *cardinality* of the intersection between two *sorted* 24 | * arrays. 25 | * 26 | * Algorithm design by D. Lemire. 
It uses several while loops on 27 | * purpose. 28 | * 29 | */ 30 | size_t danscalarintersectioncardinality(const uint32_t * set1, const size_t length1, 31 | const uint32_t * set2, const size_t length2); 32 | 33 | /** 34 | * Compute the *cardinality* of the intersection between two *sorted* 35 | * arrays. 36 | * 37 | * Algorithm design by D. Lemire. It uses several while loops on 38 | * purpose. 39 | * 40 | */ 41 | size_t danscalarintersection(const uint32_t * set1, const size_t length1, 42 | const uint32_t * set2, const size_t length2, uint32_t * out) ; 43 | /** 44 | * This is the classical approach 45 | */ 46 | size_t classicalintersectioncardinality(const uint32_t * set1, 47 | const size_t length1, const uint32_t * set2, const size_t length2); 48 | /** 49 | * This is the classical approach 50 | */ 51 | size_t classicalintersection(const uint32_t * set1, 52 | const size_t length1, const uint32_t * set2, const size_t length2, uint32_t * out) ; 53 | 54 | 55 | #endif /* INTERSECTION_H_ */ 56 | -------------------------------------------------------------------------------- /include/intersectionfactory.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef INTERSECTIONFACTORY_H_ 3 | #define INTERSECTIONFACTORY_H_ 4 | 5 | #include "common.h" 6 | #include "intersection.h" 7 | #include "partitionedintersection.h" 8 | #include "hscalableintersection.h" 9 | #include "gallopingintersection.h" 10 | #include "binarysearchintersection.h" 11 | #include "hybridintersection.h" 12 | #include "mediumintersection.h" 13 | #include "widevectorintersection.h" 14 | #include "branchless.h" 15 | #include "match.h" 16 | #include "thomaswu.h" 17 | #include "inoueetal.h" 18 | #include "tetzank.h" 19 | 20 | 21 | 22 | 23 | 24 | std::map realinitializefactory() { 25 | std::map schemes; 26 | schemes[ "inoue" ] = SIMDIntersectWithPrefilter; 27 | schemes[ "lemireinoue" ] = lemireSIMDIntersectWithPrefilter; 28 | schemes[ "V1" ] = V1; 29 | #ifdef __AVX2__ 
30 | schemes[ "V1AVX" ] = V1AVX; 31 | schemes["tetzankshuffle"] = tetzank_intersect_vector_avx2; 32 | #endif 33 | schemes[ "f2p0" ] = match_v4_f2_p0; 34 | schemes[ "f4p0" ] = match_v4_f4_p0; 35 | schemes[ "f8p0" ] = match_v4_f8_p0; 36 | 37 | schemes[ "branchless" ] = branchlessintersection; 38 | schemes[ "scalarbranchlesscached" ] = scalar_branchless_cached; 39 | schemes[ "scalarbranchlesscached2" ] = scalar_branchless_cached2; 40 | schemes[ "scalardanbranchless" ] = branchlessintersection; 41 | schemes[ "scalarbranchless" ] = scalar_branchless; 42 | schemes[ "scalarbranchlessunrolled" ] = scalar_branchless_unrolled; 43 | schemes[ "@hybriddan" ] = danielshybridintersection; 44 | 45 | 46 | schemes[ "widevector" ] = widevector_intersect; 47 | schemes[ "widevectorleo" ] = leowidevector_intersect; 48 | 49 | schemes[ "natemediumdanalt" ] = natedanalt_medium; 50 | schemes[ "danfar" ] = danfar_medium; 51 | schemes[ "danfarmov" ] = danfar_medium_mov; 52 | 53 | schemes[ "danfarfar" ] = danfarfar_medium; 54 | 55 | schemes[ "scalarnate" ] = nate_scalar; 56 | schemes[ "scalarnatewg" ] = nate_scalarwithoutgoto; 57 | 58 | schemes[ "scalar1sgalloping" ] = onesidedgallopingintersection; 59 | schemes[ "v1" ] = v1; 60 | schemes[ "v3" ] = v3; 61 | #ifdef __AVX2__ 62 | schemes[ "v3avx2" ] = v3avx2; 63 | #endif 64 | 65 | schemes[ "simdgalloping" ] = SIMDgalloping; 66 | #ifdef __AVX2__ 67 | schemes[ "simdgalloping_avx2" ] = SIMDgalloping_avx2; 68 | #endif 69 | schemes[ "simdgalloping2" ] = SIMDgalloping2; 70 | schemes[ "hssimd" ] = highlyscalablewordpresscom::intersect_SIMD; 71 | schemes[ "hssimddan" ] = highlyscalablewordpresscom::dan_intersect_SIMD; 72 | 73 | 74 | /*schemes[ "thomas_scalar" ] = compute_intersection; 75 | schemes[ "thomas_gallop" ] = compute_intersection; 76 | schemes[ "thomas_v1" ] = compute_intersection; 77 | schemes[ "thomas_v1_plow" ] = compute_intersection; 78 | schemes[ "thomas_v2" ] = compute_intersection; 79 | schemes[ "thomas_v3" ] = compute_intersection; 80 | 
schemes[ "thomas_v3_aligned" ] = compute_intersection; 81 | schemes[ "thomas_simdgallop_v0" ] = compute_intersection; 82 | schemes[ "thomas_simdgallop_v1" ] = compute_intersection; 83 | schemes[ "thomas_simdgallop_v2" ] = compute_intersection; 84 | schemes[ "thomas_simdgallop_v3" ] = compute_intersection; 85 | schemes[ "thomas_v3cmpeqflagged" ] = compute_intersection_flagged; 86 | schemes[ "thomas_v3cmpeqscalarflagged" ] = compute_intersection_flagged; 87 | schemes[ "thomas_v3cmpeqsimd32flagged" ] = compute_intersection_flagged; 88 | schemes[ "thomas_v3cmpeqsimd8flagged" ] = compute_intersection_flagged; 89 | schemes[ "thomas_v3cmpeqbinaryflagged" ] = compute_intersection_flagged; 90 | */ 91 | return schemes; 92 | } 93 | 94 | 95 | std::map initializefactory() { 96 | std::map schemes; 97 | schemes[ "@hybriddan" ] = danielshybridintersectioncardinality; 98 | #ifdef __AVX2__ 99 | schemes["tetzankshuffle"] = tetzank_intersect_vector_avx2_count; 100 | #endif 101 | schemes[ "widevector" ] = widevector_cardinality_intersect; 102 | schemes[ "widevectorleo" ] = leowidevector_cardinality_intersect; 103 | 104 | schemes[ "scalargalloping" ] = frogintersectioncardinality; 105 | schemes[ "scalar1sgalloping" ] = onesidedgallopingintersectioncardinality; 106 | schemes[ "scalarnate" ] = nate_count_scalar; 107 | 108 | schemes[ "hssimd" ] = highlyscalablewordpresscom::cardinality_intersect_SIMD; 109 | schemes[ "hssimddan" ] = highlyscalablewordpresscom::dan_cardinality_intersect_SIMD; 110 | 111 | schemes[ "natemedium" ] = nate_count_medium; 112 | schemes[ "natemediumdan" ] = natedan_count_medium; 113 | schemes[ "natemediumdanalt" ] = natedanalt_count_medium; 114 | schemes[ "danfar" ] = danfar_count_medium; 115 | 116 | schemes[ "natemediumfarfine" ] = danfarfine_count_medium; 117 | return schemes; 118 | } 119 | 120 | std::set initializebuggy() { 121 | std::set schemes; 122 | schemes.insert("widevectorleo");//makes some assumptions 123 | return schemes; 124 | } 125 | 126 | std::map 
schemes = initializefactory(); 127 | std::map realschemes = realinitializefactory(); 128 | 129 | std::set buggyschemes = initializebuggy(); 130 | 131 | 132 | std::map initializefactorypart() { 133 | std::map partschemes; 134 | partschemes[ "schlegel" ] = partitioned::cardinality_intersect_partitioned; 135 | partschemes[ "danschlegel" ] = partitioned::faster_cardinality_intersect_partitioned; 136 | return partschemes; 137 | } 138 | 139 | 140 | std::map partschemes = initializefactorypart(); 141 | 142 | /** 143 | * Convenience function 144 | */ 145 | std::vector allNames() { 146 | std::vector < std::string > ans; 147 | for (auto i = schemes.begin(); i != schemes.end(); ++i) { 148 | ans.push_back(i->first); 149 | } 150 | for (auto i = partschemes.begin(); i != partschemes.end(); ++i) { 151 | ans.push_back(i->first); 152 | } 153 | return ans; 154 | } 155 | /** 156 | * Convenience function 157 | */ 158 | std::vector allRealNames() { 159 | std::vector < std::string > ans; 160 | for (auto i = realschemes.begin(); i != realschemes.end(); ++i) { 161 | ans.push_back(i->first); 162 | } 163 | return ans; 164 | } 165 | 166 | 167 | #endif /* INTERSECTIONFACTORY_H_ */ 168 | -------------------------------------------------------------------------------- /include/match.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #define VEC_T __m128i 9 | 10 | #define VEC_COPY_LOW(reg_dest, xmm_src) \ 11 | __asm volatile("movd %1, %0" : "=r" (reg_dest) : "x" (xmm_src)) 12 | 13 | #define VEC_OR(dest, other) \ 14 | __asm volatile("por %1, %0" : "+x" (dest) : "x" (other) ) 15 | 16 | #define VEC_ADD_PTEST(var, add, xmm) { \ 17 | typeof(var) _new = var + add; \ 18 | __asm volatile("ptest %2, %2\n\t" \ 19 | "cmovnz %1, %0\n\t" \ 20 | : /* writes */ "+r" (var) \ 21 | : /* reads */ "r" (_new), "x" (xmm) \ 22 | : /* clobbers */ "cc"); \ 23 | } 24 | 25 | 26 | // this macro does a signed 
comparison 27 | #define VEC_CMP_GREATER(dest, other) \ 28 | __asm volatile("pcmpgtd %1, %0" : "+x" (dest) : "x" (other)) 29 | 30 | #define VEC_CMP_EQUAL(dest, other) \ 31 | __asm volatile("pcmpeqd %1, %0" : "+x" (dest) : "x" (other)) 32 | 33 | #define VEC_SET_ALL_TO_INT(reg, int32) \ 34 | __asm volatile("movd %1, %0; pshufd $0, %0, %0" \ 35 | : "=x" (reg) : "g" (int32) ) 36 | 37 | #define VEC_LOAD_OFFSET(xmm, ptr, bytes) \ 38 | __asm volatile("movdqu %c2(%1), %0" : "=x" (xmm) : \ 39 | "r" (ptr), "i" (bytes)) 40 | 41 | #define COMPILER_LIKELY(x) __builtin_expect((x),1) 42 | #define COMPILER_RARELY(x) __builtin_expect((x),0) 43 | 44 | #define ASM_LEA_ADD_BYTES(ptr, bytes) \ 45 | __asm volatile("lea %c1(%0), %0\n\t" : \ 46 | /* reads/writes %0 */ "+r" (ptr) : \ 47 | /* reads */ "i" (bytes)); 48 | 49 | 50 | #ifdef __cplusplus 51 | 52 | #define typeof(arg) decltype(arg) 53 | 54 | extern "C" { 55 | #endif 56 | 57 | size_t match_scalar(const uint32_t *A, const size_t lenA, 58 | const uint32_t *B, const size_t lenB, 59 | uint32_t *out); 60 | 61 | // like match_v4_f2_p0 but more portable 62 | size_t V1 63 | (const uint32_t *rare, size_t lenRare, 64 | const uint32_t *freq, size_t lenFreq, 65 | uint32_t *matchOut); 66 | 67 | #ifdef __AVX2__ 68 | size_t V1AVX 69 | (const uint32_t *rare, size_t lenRare, 70 | const uint32_t *freq, size_t lenFreq, 71 | uint32_t *matchOut); 72 | #endif 73 | 74 | size_t match_v4_f2_p0 75 | (const uint32_t *rare, size_t lenRare, 76 | const uint32_t *freq, size_t lenFreq, 77 | uint32_t *matchOut); 78 | 79 | 80 | // proxy for match_v4_f2_p0 81 | inline size_t v1(const uint32_t *rare, size_t lenRare, 82 | const uint32_t *freq, size_t lenFreq, 83 | uint32_t *matchOut) { 84 | return match_v4_f2_p0(rare,lenRare,freq,lenFreq,matchOut); 85 | } 86 | 87 | size_t match_v4_f4_p0 88 | (const uint32_t *rare, size_t lenRare, 89 | const uint32_t *freq, size_t lenFreq, 90 | uint32_t *matchOut); 91 | 92 | 93 | size_t match_v4_f8_p0 94 | (const uint32_t *rare, size_t 
lenRare, 95 | const uint32_t *freq, size_t lenFreq, 96 | uint32_t *matchOut); 97 | 98 | 99 | // unsafe, assumes signed ints 100 | size_t match_v4_f2_p1 101 | (const uint32_t *rare, size_t lenRare, 102 | const uint32_t *freq, size_t lenFreq, 103 | uint32_t *matchOut); 104 | 105 | // unsafe, assumes signed ints 106 | size_t match_v4_f4_p1 107 | (const uint32_t *rare, size_t lenRare, 108 | const uint32_t *freq, size_t lenFreq, 109 | uint32_t *matchOut); 110 | 111 | 112 | // unsafe, assumes signed ints 113 | size_t match_v4_f8_p1 114 | (const uint32_t *rare, size_t lenRare, 115 | const uint32_t *freq, size_t lenFreq, 116 | uint32_t *matchOut); 117 | 118 | // unsafe, assumes signed ints 119 | size_t match_v4_f2_p2 120 | (const uint32_t *rare, size_t lenRare, 121 | const uint32_t *freq, size_t lenFreq, 122 | uint32_t *matchOut); 123 | 124 | // unsafe, assumes signed ints 125 | size_t match_v4_f4_p2 126 | (const uint32_t *rare, size_t lenRare, 127 | const uint32_t *freq, size_t lenFreq, 128 | uint32_t *matchOut); 129 | 130 | // unsafe, assumes signed ints 131 | size_t match_v4_f8_p2 132 | (const uint32_t *rare, size_t lenRare, 133 | const uint32_t *freq, size_t lenFreq, 134 | uint32_t *matchOut); 135 | 136 | // unsafe, assumes signed ints 137 | size_t match_v4_f2_p3 138 | (const uint32_t *rare, size_t lenRare, 139 | const uint32_t *freq, size_t lenFreq, 140 | uint32_t *matchOut); 141 | 142 | // unsafe, assumes signed ints 143 | size_t match_v4_f4_p3 144 | (const uint32_t *rare, size_t lenRare, 145 | const uint32_t *freq, size_t lenFreq, 146 | uint32_t *matchOut); 147 | 148 | // unsafe, assumes signed ints 149 | size_t match_v4_f8_p3 150 | (const uint32_t *rare, size_t lenRare, 151 | const uint32_t *freq, size_t lenFreq, 152 | uint32_t *matchOut); 153 | 154 | #ifdef __cplusplus 155 | } // extern "C" 156 | #endif 157 | -------------------------------------------------------------------------------- /include/mersenne.h: 
-------------------------------------------------------------------------------- 1 | /** 2 | * This code is released under the 3 | * Apache License Version 2.0 http://www.apache.org/licenses/. 4 | */ 5 | 6 | #ifndef MERSENNE_H_ 7 | #define MERSENNE_H_ 8 | 9 | #include "common.h" 10 | ///#include "util.h" 11 | 12 | /** 13 | * Mersenne twister - random number generator. 14 | * Generate uniform distribution of 32 bit integers with the MT19937 algorithm. 15 | * source: http://bannister.us/weblog/?s=Mersenne 16 | */ 17 | class ZRandom { 18 | 19 | private: 20 | enum { 21 | N = 624, M = 397 22 | }; 23 | unsigned int MT[N + 1]; 24 | unsigned int* map[N]; 25 | int nValues; 26 | 27 | public: 28 | ZRandom(unsigned int iSeed = 20070102); 29 | void seed(unsigned iSeed); 30 | unsigned int getValue(); 31 | unsigned int getValue(const uint32_t MaxValue); 32 | double getDouble(); 33 | bool test(const double p); 34 | 35 | }; 36 | 37 | ZRandom::ZRandom(unsigned iSeed) : 38 | nValues(0) { 39 | seed(iSeed); 40 | } 41 | 42 | void ZRandom::seed(unsigned iSeed) { 43 | nValues = 0; 44 | // Seed the array used in random number generation. 45 | MT[0] = iSeed; 46 | for (int i = 1; i < N; ++i) { 47 | MT[i] = 1 + (69069 * MT[i - 1]); 48 | } 49 | // Compute map once to avoid % in inner loop. 
50 | for (int i = 0; i < N; ++i) { 51 | map[i] = MT + ((i + M) % N); 52 | } 53 | } 54 | 55 | inline bool ZRandom::test(const double p) { 56 | return getDouble() <= p; 57 | } 58 | inline double ZRandom::getDouble() { 59 | return double(getValue()) * (1.0 / 4294967296.0); 60 | } 61 | 62 | unsigned int ZRandom::getValue(const uint32_t MaxValue) { 63 | unsigned int used = MaxValue; 64 | used |= used >> 1; 65 | used |= used >> 2; 66 | used |= used >> 4; 67 | used |= used >> 8; 68 | used |= used >> 16; 69 | 70 | // Draw numbers until one is found in [0,n] 71 | unsigned int i; 72 | do 73 | i = getValue() & used; // toss unused bits to shorten search 74 | while (i > MaxValue); 75 | return i; 76 | } 77 | 78 | unsigned int ZRandom::getValue() { 79 | if (0 == nValues) { 80 | MT[N] = MT[0]; 81 | for (int i = 0; i < N; ++i) { 82 | register unsigned y = (0x80000000 & MT[i]) | (0x7FFFFFFF 83 | & MT[i + 1]); 84 | register unsigned v = *(map[i]) ^ (y >> 1); 85 | if (1 & y) 86 | v ^= 2567483615; 87 | MT[i] = v; 88 | } 89 | nValues = N; 90 | } 91 | register unsigned y = MT[N - nValues--]; 92 | y ^= y >> 11; 93 | y ^= static_cast((y << 7) & 2636928640); 94 | y ^= static_cast((y << 15) & 4022730752); 95 | y ^= y >> 18; 96 | return y; 97 | } 98 | 99 | #endif /* MERSENNE_H_ */ 100 | -------------------------------------------------------------------------------- /include/multiSetIntersection.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * multiSetIntersection.hpp 3 | * 4 | * Created on: 2016/12/20 5 | * Author: SparkleXS 6 | */ 7 | 8 | #ifndef INCLUDE_MULTISETINTERSECTION_HPP_ 9 | #define INCLUDE_MULTISETINTERSECTION_HPP_ 10 | 11 | #include "intersectionfactory.h" 12 | #include "timer.h" 13 | #include "synthetic.h" 14 | #include "util.h" 15 | 16 | namespace msis/*MultiSet InterSection*/{ 17 | // here adapts the range [start,end], different from __BSadvanceUntil 18 | // whose range is [start+1,end-1] 19 | static _ALWAYSINLINE size_t 
binarySearch_wider(const uint32_t * array, 20 | const size_t start, const size_t end, const size_t min) { 21 | size_t lower = start; 22 | size_t upper = end; 23 | if (lower == end || array[lower] >= min) { 24 | return lower; 25 | } 26 | 27 | size_t mid; 28 | while (lower < upper) { 29 | mid = (lower + upper) / 2; 30 | if (array[mid] == min) { 31 | return mid; 32 | } 33 | 34 | if (array[mid] < min) { 35 | lower = mid + 1; 36 | } else { 37 | upper = mid; 38 | } 39 | } 40 | return upper; 41 | } 42 | 43 | // here adapts the range [start,end] 44 | static _ALWAYSINLINE size_t gallopping(const uint32_t * array, 45 | const size_t start, const size_t end, const size_t min) { 46 | size_t lower = start; 47 | 48 | // special handling for a possibly common sequential case 49 | if ((lower >= end) or (array[lower] >= min)) { 50 | return lower; 51 | } 52 | 53 | size_t spansize = 1; // could set larger 54 | // bootstrap an upper limit 55 | 56 | // sxs: here spansize is enlarged to the maximum where its corresponding 57 | // element is geq min 58 | while ((lower + spansize <= end) and (array[lower + spansize] < min)) 59 | spansize *= 2; 60 | size_t upper = (lower + spansize <= end) ? 
lower + spansize : end; 61 | 62 | // maybe we are lucky (could be common case when the seek ahead expected to be small and sequential will otherwise make us look bad) 63 | //if (array[upper] == min) { 64 | // return upper; 65 | //} 66 | 67 | if (array[upper] < min) { // means array has no item >= min 68 | return end; 69 | } 70 | 71 | // we know that the next-smallest span was too small 72 | lower += (spansize / 2); 73 | 74 | // else begin binary search 75 | size_t mid = 0; 76 | while (lower + 1 != upper) { 77 | mid = (lower + upper) / 2; 78 | if (array[mid] == min) { 79 | return mid; 80 | } else if (array[mid] < min) 81 | lower = mid; 82 | else 83 | upper = mid; 84 | } 85 | return upper; 86 | 87 | } 88 | 89 | void small_vs_small(const mySet &sets, std::vector &out); 90 | 91 | void BYintersect_sorted(const uint32_t *D, const size_t &D_end, 92 | const uint32_t *Q, const size_t &Q_end, uint32_t **out, 93 | uint32_t &count); 94 | 95 | // without swap 96 | void set_vs_set(const mySet &sets, std::vector &out); 97 | 98 | void swapping_set_vs_set(const mySet &sets, std::vector &out); 99 | 100 | void adaptive(const mySet &sets, std::vector &out); 101 | 102 | void sequential(const mySet &sets, std::vector &out); 103 | 104 | void small_adaptive(const mySet &sets, std::vector &out); 105 | 106 | //without resorting 107 | void max(const mySet &sets, std::vector &out); 108 | 109 | void BaezaYates(const mySet &sets, std::vector &out); 110 | 111 | } 112 | 113 | #endif /* INCLUDE_MULTISETINTERSECTION_HPP_ */ 114 | -------------------------------------------------------------------------------- /include/partitionedintersection.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Schemes inspired or lifted from 3 | * From Schlegel et al., Fast Sorted-Set Intersection using SIMD Instructions 4 | */ 5 | 6 | #ifndef PARTITIONEDINTERSECTION_H_ 7 | #define PARTITIONEDINTERSECTION_H_ 8 | 9 | #include "common.h" 10 | 11 | namespace partitioned { 
12 | 13 | /** 14 | * Silly function. 15 | */ 16 | uint16_t _high16(uint32_t x) { 17 | return static_cast(x >> 16); 18 | } 19 | /** 20 | * Another function. 21 | */ 22 | uint16_t _low16(uint32_t x) { 23 | return static_cast(x); 24 | } 25 | 26 | /** 27 | * From Schlegel et al., Fast Sorted-Set Intersection using SIMD Instructions 28 | */ 29 | // A - sorted array 30 | // s_a - size of A 31 | // R - partitioned sorted array 32 | size_t partition(const uint32_t *A, const size_t s_a, uint16_t *R, const size_t /*Rlength*/) { 33 | uint16_t high = 0; 34 | int partition_length = 0; 35 | size_t partition_size_position = 1; 36 | size_t counter = 0; 37 | size_t p = 0; 38 | if (p < s_a) { 39 | uint16_t chigh = _high16(A[p]); // upper dword 40 | uint16_t clow = _low16(A[p]); // lower dword 41 | if (chigh == 0) { 42 | R[counter++] = chigh; // partition prefix 43 | R[counter++] = 0; // reserve place for partition size 44 | R[counter++] = clow; // write the first element 45 | partition_length = 1; // reset counters 46 | //R[partition_size_position] = partition_length - 1; // store "-1" 47 | //partition_size_position = counter - 2; 48 | high = chigh; 49 | ++p; 50 | } 51 | 52 | } 53 | for (; p < s_a; p++) { 54 | uint16_t chigh = _high16(A[p]); // upper dword 55 | uint16_t clow = _low16(A[p]); // lower dword 56 | if (chigh == high && p != 0) { // add element to the current partition 57 | R[counter++] = clow; 58 | partition_length++; 59 | } else { // start new partition 60 | R[counter++] = chigh; // partition prefix 61 | R[counter++] = 0; // reserve place for partition size 62 | R[counter++] = clow; // write the first element 63 | R[partition_size_position] = static_cast(partition_length - 1); // store "-1" 64 | partition_length = 1; // reset counters 65 | partition_size_position = counter - 2; 66 | high = chigh; 67 | } 68 | } 69 | R[partition_size_position] = static_cast(partition_length - 1); 70 | 71 | return counter; 72 | } 73 | 74 | /** 75 | * Useful for debugging purposes. 
76 | */ 77 | size_t inverse_partition(uint32_t *A, const size_t /*s_a*/, const uint16_t *R, 78 | const size_t Rlength) { 79 | size_t i = 0; 80 | size_t p = 0; 81 | while (i < Rlength) { 82 | uint16_t chigh = R[i++]; 83 | size_t sizepart = static_cast (R[i++]) + 1; 84 | while (sizepart > 0) { 85 | uint16_t clow = R[i++]; 86 | A[p++] = (static_cast (chigh) << 16) | clow; 87 | --sizepart; 88 | } 89 | } 90 | return p; 91 | } 92 | 93 | /** 94 | * From Schlegel et al., Fast Sorted-Set Intersection using SIMD Instructions 95 | * 96 | * Optimized by D. Lemire on April 30th 2013 97 | */ 98 | static size_t cardinality_intersect_vector16(const uint16_t *A, 99 | const uint16_t *B, const size_t s_a, const size_t s_b/*, uint16_t *C*/) { 100 | size_t count = 0; 101 | size_t i_a = 0, i_b = 0; 102 | 103 | const size_t st_a = (s_a / 8) * 8; 104 | const size_t st_b = (s_b / 8) * 8; 105 | __m128i v_a, v_b; 106 | if ((i_a < st_a) and (i_b < st_b)) { 107 | v_a = _mm_loadu_si128((__m128i *) &A[i_a]); 108 | v_b = _mm_loadu_si128((__m128i *) &B[i_b]); 109 | 110 | while (true) { 111 | const __m128i res_v = _mm_cmpestrm(v_b, 8, v_a, 8, 112 | _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK); 113 | const int r = _mm_extract_epi32(res_v, 0); 114 | //__m128i p = _mm_shuffle_epi8(v_a, shuffle_mask16[r]); 115 | //_mm_storeu_si128((__m128i *) &C[count], p); 116 | count += _mm_popcnt_u32(r); 117 | const uint16_t a_max = A[i_a + 7]; 118 | const uint16_t b_max = B[i_b + 7]; 119 | if (a_max <= b_max) { 120 | i_a += 8; 121 | if (i_a == st_a) 122 | break; 123 | v_a = _mm_loadu_si128((__m128i *) &A[i_a]); 124 | 125 | } 126 | if (b_max <= a_max) { 127 | i_b += 8; 128 | if (i_b == st_b) 129 | break; 130 | v_b = _mm_loadu_si128((__m128i *) &B[i_b]); 131 | 132 | } 133 | } 134 | } 135 | // intersect the tail using scalar intersection 136 | while (i_a < s_a && i_b < s_b) { 137 | if (A[i_a] < B[i_b]) { 138 | i_a++; 139 | } else if (B[i_b] < A[i_a]) { 140 | i_b++; 141 | } else { 142 | count++; 143 | i_a++; 
144 | i_b++; 145 | } 146 | } 147 | 148 | return count; 149 | } 150 | 151 | /** 152 | * From Schlegel et al., Fast Sorted-Set Intersection using SIMD Instructions 153 | * 154 | * Optimized by D. Lemire on May 3rd 2013 155 | */ 156 | static size_t faster_cardinality_intersect_vector16(const uint16_t *A, 157 | const uint16_t *B, const size_t s_a, const size_t s_b/*, uint16_t *C*/) { 158 | size_t count = 0; 159 | size_t i_a = 0, i_b = 0; 160 | 161 | const size_t st_a = (s_a / 8) * 8; 162 | const size_t st_b = (s_b / 8) * 8; 163 | __m128i v_a, v_b; 164 | if ((i_a < st_a) and (i_b < st_b)) { 165 | v_a = _mm_loadu_si128((__m128i *) &A[i_a]); 166 | v_b = _mm_loadu_si128((__m128i *) &B[i_b]); 167 | while ((A[i_a] == 0) or (B[i_b] == 0)) { 168 | const __m128i res_v = _mm_cmpestrm(v_b, 8, v_a, 8, 169 | _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK); 170 | const int r = _mm_extract_epi32(res_v, 0); 171 | //__m128i p = _mm_shuffle_epi8(v_a, shuffle_mask16[r]); 172 | //_mm_storeu_si128((__m128i *) &C[count], p); 173 | count += _mm_popcnt_u32(r); 174 | const uint16_t a_max = A[i_a + 7]; 175 | const uint16_t b_max = B[i_b + 7]; 176 | if (a_max <= b_max) { 177 | i_a += 8; 178 | if (i_a == st_a) 179 | break; 180 | v_a = _mm_loadu_si128((__m128i *) &A[i_a]); 181 | 182 | } 183 | if (b_max <= a_max) { 184 | i_b += 8; 185 | if (i_b == st_b) 186 | break; 187 | v_b = _mm_loadu_si128((__m128i *) &B[i_b]); 188 | 189 | } 190 | 191 | } 192 | if ((i_a < st_a) and (i_b < st_b)) 193 | while (true) { 194 | const __m128i res_v = _mm_cmpistrm(v_b, v_a, 195 | _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK); 196 | const int r = _mm_extract_epi32(res_v, 0); 197 | //__m128i p = _mm_shuffle_epi8(v_a, shuffle_mask16[r]); 198 | //_mm_storeu_si128((__m128i *) &C[count], p); 199 | count += _mm_popcnt_u32(r); 200 | const uint16_t a_max = A[i_a + 7]; 201 | const uint16_t b_max = B[i_b + 7]; 202 | if (a_max <= b_max) { 203 | i_a += 8; 204 | if (i_a == st_a) 205 | break; 206 | v_a = 
_mm_loadu_si128((__m128i *) &A[i_a]); 207 | 208 | } 209 | if (b_max <= a_max) { 210 | i_b += 8; 211 | if (i_b == st_b) 212 | break; 213 | v_b = _mm_loadu_si128((__m128i *) &B[i_b]); 214 | 215 | } 216 | } 217 | } 218 | // intersect the tail using scalar intersection 219 | while (i_a < s_a && i_b < s_b) { 220 | if (A[i_a] < B[i_b]) { 221 | i_a++; 222 | } else if (B[i_b] < A[i_a]) { 223 | i_b++; 224 | } else { 225 | count++; 226 | i_a++; 227 | i_b++; 228 | } 229 | } 230 | 231 | return count; 232 | } 233 | 234 | 235 | /** 236 | * From Schlegel et al., Fast Sorted-Set Intersection using SIMD Instructions 237 | * 238 | * Optimized by D. Lemire on May 3rd 2013 239 | */ 240 | /*static size_t faster2_cardinality_intersect_vector16(const uint16_t *A, 241 | const uint16_t *B, const size_t s_a, const size_t s_b) { 242 | size_t count = 0; 243 | size_t i_a = 0, i_b = 0; 244 | 245 | const size_t st_a = (s_a / 8) * 8; 246 | const size_t st_b = (s_b / 8) * 8; 247 | __m128i v_a, v_b; 248 | if ((i_a < st_a) and (i_b < st_b)) { 249 | v_a = _mm_loadu_si128((__m128i *) &A[i_a]); 250 | v_b = _mm_loadu_si128((__m128i *) &B[i_b]); 251 | while ((A[i_a] == 0) or (B[i_b] == 0)) { 252 | const __m128i res_v = _mm_cmpestrm(v_b, 8, v_a, 8, 253 | _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK); 254 | const int r = _mm_extract_epi32(res_v, 0); 255 | //__m128i p = _mm_shuffle_epi8(v_a, shuffle_mask16[r]); 256 | //_mm_storeu_si128((__m128i *) &C[count], p); 257 | count += _mm_popcnt_u32(r); 258 | const uint16_t a_max = A[i_a + 7]; 259 | const uint16_t b_max = B[i_b + 7]; 260 | if (a_max <= b_max) { 261 | i_a += 8; 262 | if (i_a == st_a) 263 | break; 264 | v_a = _mm_loadu_si128((__m128i *) &A[i_a]); 265 | 266 | } 267 | if (b_max <= a_max) { 268 | i_b += 8; 269 | if (i_b == st_b) 270 | break; 271 | v_b = _mm_loadu_si128((__m128i *) &B[i_b]); 272 | 273 | } 274 | 275 | } 276 | if ((i_a < st_a) and (i_b < st_b)) 277 | while (true) { 278 | const __m128i res_v = _mm_cmpistrm(v_b, v_a, 279 | 
_SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK); 280 | const int r = _mm_extract_epi32(res_v, 0); 281 | //__m128i p = _mm_shuffle_epi8(v_a, shuffle_mask16[r]); 282 | //_mm_storeu_si128((__m128i *) &C[count], p); 283 | count += _mm_popcnt_u32(r); 284 | const uint16_t a_max = A[i_a + 7]; 285 | const uint16_t b_max = B[i_b + 7]; 286 | if (a_max <= b_max) { 287 | i_a += 8; 288 | if (i_a == st_a) 289 | break; 290 | v_a = _mm_loadu_si128((__m128i *) &A[i_a]); 291 | 292 | } 293 | if (b_max <= a_max) { 294 | i_b += 8; 295 | if (i_b == st_b) 296 | break; 297 | v_b = _mm_loadu_si128((__m128i *) &B[i_b]); 298 | 299 | } 300 | } 301 | } 302 | // intersect the tail using scalar intersection 303 | while (i_a < s_a && i_b < s_b) { 304 | if (A[i_a] < B[i_b]) { 305 | i_a++; 306 | } else if (B[i_b] < A[i_a]) { 307 | i_b++; 308 | } else { 309 | count++; 310 | i_a++; 311 | i_b++; 312 | } 313 | } 314 | 315 | return count; 316 | } 317 | */ 318 | 319 | 320 | 321 | /** 322 | * Strictly for testing/debugging purposes. 
323 | */ 324 | size_t scalar_cardinality_intersect_vector16(const uint16_t *A, 325 | const uint16_t *B, const size_t s_a, const size_t s_b/*, uint16_t *C*/) { 326 | size_t count = 0; 327 | size_t i_a = 0, i_b = 0; 328 | // intersect the tail using scalar intersection 329 | while (i_a < s_a && i_b < s_b) { 330 | if (A[i_a] < B[i_b]) { 331 | i_a++; 332 | } else if (B[i_b] < A[i_a]) { 333 | i_b++; 334 | } else { 335 | count++; 336 | i_a++; 337 | i_b++; 338 | } 339 | } 340 | 341 | return count; 342 | } 343 | 344 | /** 345 | * From Schlegel et al., Fast Sorted-Set Intersection using SIMD Instructions 346 | */ 347 | // A, B - partitioned operands 348 | size_t cardinality_intersect_partitioned(const uint16_t *A, const uint16_t *B, 349 | const size_t s_a, const size_t s_b) { 350 | size_t i_a = 0, i_b = 0; 351 | size_t counter = 0; 352 | while (i_a < s_a && i_b < s_b) { 353 | if (A[i_a] < B[i_b]) { 354 | i_a += static_cast (A[i_a + 1]) + 2 + 1; 355 | } else if (B[i_b] < A[i_a]) { 356 | i_b += static_cast (B[i_b + 1]) + 2 + 1; 357 | } else { 358 | //C[counter++] = A[i_a]; // write partition prefix 359 | size_t partition_size = cardinality_intersect_vector16(&A[i_a + 2], 360 | &B[i_b + 2], static_cast (A[i_a + 1]) + 1, 361 | static_cast (B[i_b + 1]) + 1);//, &C[counter + 1]); 362 | //C[counter++] = partition_size; // write partition size 363 | counter += partition_size; 364 | i_a += static_cast (A[i_a + 1]) + 2 + 1; 365 | i_b += static_cast (B[i_b + 1]) + 2 + 1; 366 | } 367 | } 368 | //std::cout<<"partcounter = "<< partcounter< (A[i_a + 1]) + 2 + 1; 384 | if (i_a >= s_a) 385 | goto end; 386 | } while (A[i_a] < B[i_b]); 387 | } 388 | if (B[i_b] < A[i_a]) { 389 | do { 390 | i_b += static_cast (B[i_b + 1]) + 2 + 1; 391 | if (i_b >= s_b) 392 | goto end; 393 | } while (B[i_b] < A[i_a]); 394 | } else { 395 | size_t partition_size = faster_cardinality_intersect_vector16( 396 | &A[i_a + 2], &B[i_b + 2], 397 | static_cast (A[i_a + 1]) + 1, 398 | static_cast (B[i_b + 1]) + 1);//, 
&C[counter + 1]); 399 | //C[counter++] = partition_size; // write partition size 400 | counter += partition_size; 401 | i_a += static_cast (A[i_a + 1]) + 2 + 1; 402 | i_b += static_cast (B[i_b + 1]) + 2 + 1; 403 | } 404 | } 405 | end: return counter; 406 | } 407 | 408 | } 409 | #endif /* PARTITIONEDINTERSECTION_H_ */ 410 | -------------------------------------------------------------------------------- /include/skipping.h: -------------------------------------------------------------------------------- 1 | /* 2 | * This is a simple implementation of a skipping data structure and algorithms similar to 3 | * what is described in 4 | * 5 | * Sanders and Transier, Intersection in Integer Inverted Indices, ALENEX 2007, 2007. 6 | * 7 | * As suggested in their conclusion, we leave the higher-level structure uncompressed. We also 8 | * use differential coding. 9 | * 10 | * To paraphrase Sanders and Transier... 11 | * 12 | * In addition to a delta-encoded compressed list, a top-level data structure stores 13 | * every B-th element of N in t together with its position in the main list (B is a tuning 14 | * parameter). We can now run any search algorithm on t and then scan only the pieces of 15 | * the main list that might contain an element to be located. 16 | * 17 | * In our implementation, we assume that B is a power of two and use 1 << BlockSizeLog as 18 | * the block size. 19 | * 20 | * Sanders and Transier's proposal is similar in spirit to the skipping structure proposed in 21 | * 22 | * Moffat, A., Zobel, J.: Self-indexing inverted files for fast text retrieval. 23 | * ACM Transactions on Information Systems 14 (1996). 
24 | * 25 | * 26 | * Author: Daniel Lemire 27 | */ 28 | 29 | #ifndef SKIPPING_H_ 30 | #define SKIPPING_H_ 31 | 32 | #include "common.h" 33 | 34 | class Skipping { 35 | public: 36 | 37 | 38 | Skipping(uint32_t BS, uint32_t * data, uint32_t length) : 39 | BlockSizeLog(BS), 40 | mainbuffer(), highbuffer(), Length(0) { 41 | if((BlockSizeLog == 0) && (BlockSizeLog >= 32)) throw runtime_error("please use a reasonable BlockSizeLog"); 42 | load(data, length);// cheap constructor 43 | } 44 | 45 | 46 | 47 | ~Skipping() {} 48 | 49 | size_t storageInBytes() const { 50 | return mainbuffer.size() * sizeof(uint8_t) 51 | + highbuffer.size() * sizeof(higharraypair) 52 | + sizeof(Length); // rough estimates (good enough) 53 | } 54 | 55 | uint32_t decompress(uint32_t * out) const { 56 | const uint8_t * bout = mainbuffer.data(); 57 | uint32_t pos = 0; 58 | 59 | uint32_t val = 0; 60 | for(uint32_t k = 0; k < Length;++k) { 61 | bout = decode(bout,val); 62 | out[pos++] = val; 63 | } 64 | return pos; 65 | } 66 | 67 | 68 | uint32_t intersect(const Skipping & otherlarger, uint32_t * out) const { 69 | // we assume that "this" is the smallest of the two 70 | if (otherlarger.Length < Length) 71 | return otherlarger.intersect(*this, out); 72 | if (Length == 0) 73 | return 0;// special silly case 74 | assert(otherlarger.Length>=Length); 75 | assert(otherlarger.Length>0); 76 | uint32_t intersectsize = 0; 77 | 78 | const uint8_t * inbyte = mainbuffer.data(); 79 | const uint8_t * const endbyte = mainbuffer.data() 80 | + mainbuffer.size(); 81 | const uint8_t * largemainpointer = otherlarger.mainbuffer.data(); 82 | uint32_t largemainval = 0; 83 | largemainpointer = decode(largemainpointer, largemainval); 84 | uint32_t val = 0;// where I put decoded values 85 | uint32_t x = 0; 86 | while (endbyte > inbyte) { 87 | inbyte = decode(inbyte, val); 88 | // if the last value of the current block is too small, skip the block entirely 89 | if (otherlarger.highbuffer[x >> otherlarger.BlockSizeLog].first < val) 
{ 90 | do { 91 | x = ((x >> otherlarger.BlockSizeLog) + 1) << otherlarger.BlockSizeLog; 92 | if (x >= otherlarger.Length) { 93 | return intersectsize; 94 | } 95 | } while (otherlarger.highbuffer[x >> otherlarger.BlockSizeLog].first < val); 96 | largemainpointer = otherlarger.mainbuffer.data() 97 | + otherlarger.highbuffer[x >> otherlarger.BlockSizeLog].second; 98 | largemainval = otherlarger.highbuffer[(x >> otherlarger.BlockSizeLog)-1].first; 99 | largemainpointer = decode(largemainpointer, largemainval); 100 | } 101 | // at this point, we have that the last value of the current block is >= val 102 | // this means that we shall decode at most one block before giving up 103 | while (largemainval < val) { 104 | ++x; 105 | if (x >= otherlarger.Length) { 106 | return intersectsize; 107 | } 108 | largemainpointer = decode(largemainpointer, largemainval); 109 | } 110 | if (largemainval == val) { 111 | out[intersectsize++] = val; 112 | } 113 | } 114 | return intersectsize; 115 | } 116 | 117 | uint32_t BlockSizeLog; 118 | vector mainbuffer; 119 | typedef pair higharraypair; 120 | 121 | typedef vector higharray; 122 | higharray highbuffer; 123 | uint32_t Length; 124 | 125 | private: 126 | 127 | Skipping(Skipping && other) : BlockSizeLog(other.BlockSizeLog), mainbuffer(other.mainbuffer), 128 | highbuffer(other.highbuffer), Length(other.Length) { 129 | } 130 | Skipping(const Skipping & other) : BlockSizeLog(other.BlockSizeLog), mainbuffer(other.mainbuffer), 131 | highbuffer(other.highbuffer), Length(other.Length) { 132 | cout<<"Just copied "< 143 | uint8_t extract7bits(const uint32_t val) { 144 | return static_cast ((val >> (7 * i)) & ((1U << 7) - 1)); 145 | } 146 | 147 | template 148 | uint8_t extract7bitsmaskless(const uint32_t val) { 149 | return static_cast ((val >> (7 * i))); 150 | } 151 | static inline const uint8_t * decode(const uint8_t * buffer, uint32_t& prev) { 152 | // manually unrolled for performance 153 | uint32_t v = 0; 154 | uint8_t c = *buffer++; 155 | v += 
(c & 127) ; 156 | if ((c & 128)) { 157 | prev += v; 158 | return buffer; 159 | } 160 | c = *buffer++; 161 | v += ((c & 127) << 7); 162 | if ((c & 128)) { 163 | prev += v; 164 | return buffer; 165 | } 166 | c = *buffer++; 167 | v += ((c & 127) << 14); 168 | if ((c & 128)) { 169 | prev += v; 170 | return buffer; 171 | } 172 | c = *buffer++; 173 | v += ((c & 127) << 21); 174 | if ((c & 128)) { 175 | prev += v; 176 | return buffer; 177 | } 178 | c = *buffer++; 179 | v += ((c & 127) << 30); 180 | prev += v; 181 | return buffer; 182 | } 183 | }; 184 | 185 | void Skipping::load(uint32_t * data, uint32_t len) { 186 | assert(len < (numeric_limits::max() / 5));// check for overflow 187 | Length = len; 188 | if(Length == 0) return; // nothing to do 189 | uint32_t BlockNumber = (Length + (1<= Length); 191 | highbuffer.resize(BlockNumber); 192 | mainbuffer.resize(5 * Length); 193 | uint8_t * bout = mainbuffer.data(); 194 | uint8_t * const boutinit = bout; 195 | uint32_t prev = 0; 196 | for (uint32_t k = 0; k < BlockNumber; ++k) { 197 | const uint32_t howmany = (((k + 1) << BlockSizeLog) > Length) ? 
198 | Length - (k << BlockSizeLog) 199 | : 1 << BlockSizeLog; 200 | highbuffer[k] = make_pair(data[(k << BlockSizeLog) + howmany - 1], 201 | static_cast (bout - boutinit)); 202 | for (uint32_t x = 0; x < howmany; ++x) { 203 | const uint32_t v = data[x + (k << BlockSizeLog)]; 204 | const uint32_t val = v - prev; 205 | prev = v; 206 | if (val < (1U << 7)) { 207 | *bout = static_cast (val | (1U << 7)); 208 | ++bout; 209 | } else if (val < (1U << 14)) { 210 | *bout = extract7bits<0> (val); 211 | ++bout; 212 | *bout = extract7bitsmaskless<1> (val) | (1U << 7); 213 | ++bout; 214 | } else if (val < (1U << 21)) { 215 | *bout = extract7bits<0> (val); 216 | ++bout; 217 | *bout = extract7bits<1> (val); 218 | ++bout; 219 | *bout = extract7bitsmaskless<2> (val) | (1U << 7); 220 | ++bout; 221 | } else if (val < (1U << 28)) { 222 | *bout = extract7bits<0> (val); 223 | ++bout; 224 | *bout = extract7bits<1> (val); 225 | ++bout; 226 | *bout = extract7bits<2> (val); 227 | ++bout; 228 | *bout = extract7bitsmaskless<3> (val) | (1U << 7); 229 | ++bout; 230 | } else { 231 | *bout = extract7bits<0> (val); 232 | ++bout; 233 | *bout = extract7bits<1> (val); 234 | ++bout; 235 | *bout = extract7bits<2> (val); 236 | ++bout; 237 | *bout = extract7bits<3> (val); 238 | ++bout; 239 | *bout = extract7bitsmaskless<4> (val) | (1U << 7); 240 | ++bout; 241 | } 242 | } 243 | } 244 | mainbuffer.resize(static_cast (bout - boutinit)); 245 | mainbuffer.shrink_to_fit(); 246 | } 247 | 248 | #endif /* SKIPPING_H_ */ 249 | -------------------------------------------------------------------------------- /include/stlutil.h: -------------------------------------------------------------------------------- 1 | #ifndef STLUTIL_H_ 2 | #define STLUTIL_H_ 3 | 4 | #include "util.h" 5 | #include "union.h" 6 | #include "intersection.h" 7 | 8 | vector unite(const vector & x, const vector & y) { 9 | vector ans (x.size() + y.size()); 10 | ans.resize(unite(x.data(),x.size(), y.data(),y.size(), ans.data())); 11 | return ans; 12 
| } 13 | 14 | 15 | vector intersect(const vector & x, const vector & y) { 16 | vector ans (x.size() + y.size()); 17 | ans.resize(classicalintersection(x.data(),x.size(), y.data(),y.size(), ans.data())); 18 | return ans; 19 | } 20 | 21 | /** 22 | * Returns the removed elements 23 | */ 24 | vector removeRandom(vector & x, size_t N) { 25 | auto i = shuffleFY(x.begin(),x.end(),N); 26 | vector tmp (i,x.end()); 27 | vector ans (x.begin(),i); 28 | x.swap(tmp); 29 | return ans; 30 | } 31 | 32 | vector getRandom(const vector & x, size_t N) { 33 | vector copy(x); 34 | auto i = shuffleFY(copy.begin(),copy.end(),N); 35 | vector ans (copy.begin(),i); 36 | return ans; 37 | } 38 | 39 | /** 40 | * Like getRandom except that the provided vector is modified. 41 | */ 42 | vector grabRandom(vector & x, size_t N) { 43 | auto i = shuffleFY(x.begin(),x.end(),N); 44 | vector ans (x.begin(),i); 45 | return ans; 46 | } 47 | 48 | 49 | vector difference(const vector &x, const vector &y) { 50 | vector answer(x.size()); 51 | answer.resize( 52 | std::set_difference (x.begin(), x.end(), y.begin(), y.end(), answer.begin()) 53 | - answer.begin()); 54 | return answer; 55 | 56 | } 57 | #endif 58 | -------------------------------------------------------------------------------- /include/tetzank.h: -------------------------------------------------------------------------------- 1 | // imported from https://github.com/tetzank/SIMDSetOperations/ 2 | #ifndef INCLUDE_TETZANK_H_ 3 | #define INCLUDE_TETZANK_H_ 4 | 5 | #if defined(_MSC_VER) 6 | #define ALIGNED(x) __declspec(align(x)) 7 | #else 8 | #if defined(__GNUC__) 9 | #define ALIGNED(x) __attribute__ ((aligned(x))) 10 | #endif 11 | #endif 12 | static uint32_t shuffle_mask_avx[] ALIGNED(0x1000) = { 13 | 7, 6, 5, 4, 3, 2, 1, 0, 14 | 0, 7, 6, 5, 4, 3, 2, 1, 15 | 1, 7, 6, 5, 4, 3, 2, 0, 16 | 0, 1, 7, 6, 5, 4, 3, 2, 17 | 2, 7, 6, 5, 4, 3, 1, 0, 18 | 0, 2, 7, 6, 5, 4, 3, 1, 19 | 1, 2, 7, 6, 5, 4, 3, 0, 20 | 0, 1, 2, 7, 6, 5, 4, 3, 21 | 3, 7, 6, 5, 4, 2, 1, 0, 
22 | 0, 3, 7, 6, 5, 4, 2, 1, 23 | 1, 3, 7, 6, 5, 4, 2, 0, 24 | 0, 1, 3, 7, 6, 5, 4, 2, 25 | 2, 3, 7, 6, 5, 4, 1, 0, 26 | 0, 2, 3, 7, 6, 5, 4, 1, 27 | 1, 2, 3, 7, 6, 5, 4, 0, 28 | 0, 1, 2, 3, 7, 6, 5, 4, 29 | 4, 7, 6, 5, 3, 2, 1, 0, 30 | 0, 4, 7, 6, 5, 3, 2, 1, 31 | 1, 4, 7, 6, 5, 3, 2, 0, 32 | 0, 1, 4, 7, 6, 5, 3, 2, 33 | 2, 4, 7, 6, 5, 3, 1, 0, 34 | 0, 2, 4, 7, 6, 5, 3, 1, 35 | 1, 2, 4, 7, 6, 5, 3, 0, 36 | 0, 1, 2, 4, 7, 6, 5, 3, 37 | 3, 4, 7, 6, 5, 2, 1, 0, 38 | 0, 3, 4, 7, 6, 5, 2, 1, 39 | 1, 3, 4, 7, 6, 5, 2, 0, 40 | 0, 1, 3, 4, 7, 6, 5, 2, 41 | 2, 3, 4, 7, 6, 5, 1, 0, 42 | 0, 2, 3, 4, 7, 6, 5, 1, 43 | 1, 2, 3, 4, 7, 6, 5, 0, 44 | 0, 1, 2, 3, 4, 7, 6, 5, 45 | 5, 7, 6, 4, 3, 2, 1, 0, 46 | 0, 5, 7, 6, 4, 3, 2, 1, 47 | 1, 5, 7, 6, 4, 3, 2, 0, 48 | 0, 1, 5, 7, 6, 4, 3, 2, 49 | 2, 5, 7, 6, 4, 3, 1, 0, 50 | 0, 2, 5, 7, 6, 4, 3, 1, 51 | 1, 2, 5, 7, 6, 4, 3, 0, 52 | 0, 1, 2, 5, 7, 6, 4, 3, 53 | 3, 5, 7, 6, 4, 2, 1, 0, 54 | 0, 3, 5, 7, 6, 4, 2, 1, 55 | 1, 3, 5, 7, 6, 4, 2, 0, 56 | 0, 1, 3, 5, 7, 6, 4, 2, 57 | 2, 3, 5, 7, 6, 4, 1, 0, 58 | 0, 2, 3, 5, 7, 6, 4, 1, 59 | 1, 2, 3, 5, 7, 6, 4, 0, 60 | 0, 1, 2, 3, 5, 7, 6, 4, 61 | 4, 5, 7, 6, 3, 2, 1, 0, 62 | 0, 4, 5, 7, 6, 3, 2, 1, 63 | 1, 4, 5, 7, 6, 3, 2, 0, 64 | 0, 1, 4, 5, 7, 6, 3, 2, 65 | 2, 4, 5, 7, 6, 3, 1, 0, 66 | 0, 2, 4, 5, 7, 6, 3, 1, 67 | 1, 2, 4, 5, 7, 6, 3, 0, 68 | 0, 1, 2, 4, 5, 7, 6, 3, 69 | 3, 4, 5, 7, 6, 2, 1, 0, 70 | 0, 3, 4, 5, 7, 6, 2, 1, 71 | 1, 3, 4, 5, 7, 6, 2, 0, 72 | 0, 1, 3, 4, 5, 7, 6, 2, 73 | 2, 3, 4, 5, 7, 6, 1, 0, 74 | 0, 2, 3, 4, 5, 7, 6, 1, 75 | 1, 2, 3, 4, 5, 7, 6, 0, 76 | 0, 1, 2, 3, 4, 5, 7, 6, 77 | 6, 7, 5, 4, 3, 2, 1, 0, 78 | 0, 6, 7, 5, 4, 3, 2, 1, 79 | 1, 6, 7, 5, 4, 3, 2, 0, 80 | 0, 1, 6, 7, 5, 4, 3, 2, 81 | 2, 6, 7, 5, 4, 3, 1, 0, 82 | 0, 2, 6, 7, 5, 4, 3, 1, 83 | 1, 2, 6, 7, 5, 4, 3, 0, 84 | 0, 1, 2, 6, 7, 5, 4, 3, 85 | 3, 6, 7, 5, 4, 2, 1, 0, 86 | 0, 3, 6, 7, 5, 4, 2, 1, 87 | 1, 3, 6, 7, 5, 4, 2, 0, 88 | 0, 1, 3, 6, 7, 5, 4, 2, 89 | 2, 3, 6, 7, 5, 4, 1, 0, 90 | 0, 2, 3, 6, 7, 5, 4, 
1, 91 | 1, 2, 3, 6, 7, 5, 4, 0, 92 | 0, 1, 2, 3, 6, 7, 5, 4, 93 | 4, 6, 7, 5, 3, 2, 1, 0, 94 | 0, 4, 6, 7, 5, 3, 2, 1, 95 | 1, 4, 6, 7, 5, 3, 2, 0, 96 | 0, 1, 4, 6, 7, 5, 3, 2, 97 | 2, 4, 6, 7, 5, 3, 1, 0, 98 | 0, 2, 4, 6, 7, 5, 3, 1, 99 | 1, 2, 4, 6, 7, 5, 3, 0, 100 | 0, 1, 2, 4, 6, 7, 5, 3, 101 | 3, 4, 6, 7, 5, 2, 1, 0, 102 | 0, 3, 4, 6, 7, 5, 2, 1, 103 | 1, 3, 4, 6, 7, 5, 2, 0, 104 | 0, 1, 3, 4, 6, 7, 5, 2, 105 | 2, 3, 4, 6, 7, 5, 1, 0, 106 | 0, 2, 3, 4, 6, 7, 5, 1, 107 | 1, 2, 3, 4, 6, 7, 5, 0, 108 | 0, 1, 2, 3, 4, 6, 7, 5, 109 | 5, 6, 7, 4, 3, 2, 1, 0, 110 | 0, 5, 6, 7, 4, 3, 2, 1, 111 | 1, 5, 6, 7, 4, 3, 2, 0, 112 | 0, 1, 5, 6, 7, 4, 3, 2, 113 | 2, 5, 6, 7, 4, 3, 1, 0, 114 | 0, 2, 5, 6, 7, 4, 3, 1, 115 | 1, 2, 5, 6, 7, 4, 3, 0, 116 | 0, 1, 2, 5, 6, 7, 4, 3, 117 | 3, 5, 6, 7, 4, 2, 1, 0, 118 | 0, 3, 5, 6, 7, 4, 2, 1, 119 | 1, 3, 5, 6, 7, 4, 2, 0, 120 | 0, 1, 3, 5, 6, 7, 4, 2, 121 | 2, 3, 5, 6, 7, 4, 1, 0, 122 | 0, 2, 3, 5, 6, 7, 4, 1, 123 | 1, 2, 3, 5, 6, 7, 4, 0, 124 | 0, 1, 2, 3, 5, 6, 7, 4, 125 | 4, 5, 6, 7, 3, 2, 1, 0, 126 | 0, 4, 5, 6, 7, 3, 2, 1, 127 | 1, 4, 5, 6, 7, 3, 2, 0, 128 | 0, 1, 4, 5, 6, 7, 3, 2, 129 | 2, 4, 5, 6, 7, 3, 1, 0, 130 | 0, 2, 4, 5, 6, 7, 3, 1, 131 | 1, 2, 4, 5, 6, 7, 3, 0, 132 | 0, 1, 2, 4, 5, 6, 7, 3, 133 | 3, 4, 5, 6, 7, 2, 1, 0, 134 | 0, 3, 4, 5, 6, 7, 2, 1, 135 | 1, 3, 4, 5, 6, 7, 2, 0, 136 | 0, 1, 3, 4, 5, 6, 7, 2, 137 | 2, 3, 4, 5, 6, 7, 1, 0, 138 | 0, 2, 3, 4, 5, 6, 7, 1, 139 | 1, 2, 3, 4, 5, 6, 7, 0, 140 | 0, 1, 2, 3, 4, 5, 6, 7, 141 | 7, 6, 5, 4, 3, 2, 1, 0, 142 | 0, 7, 6, 5, 4, 3, 2, 1, 143 | 1, 7, 6, 5, 4, 3, 2, 0, 144 | 0, 1, 7, 6, 5, 4, 3, 2, 145 | 2, 7, 6, 5, 4, 3, 1, 0, 146 | 0, 2, 7, 6, 5, 4, 3, 1, 147 | 1, 2, 7, 6, 5, 4, 3, 0, 148 | 0, 1, 2, 7, 6, 5, 4, 3, 149 | 3, 7, 6, 5, 4, 2, 1, 0, 150 | 0, 3, 7, 6, 5, 4, 2, 1, 151 | 1, 3, 7, 6, 5, 4, 2, 0, 152 | 0, 1, 3, 7, 6, 5, 4, 2, 153 | 2, 3, 7, 6, 5, 4, 1, 0, 154 | 0, 2, 3, 7, 6, 5, 4, 1, 155 | 1, 2, 3, 7, 6, 5, 4, 0, 156 | 0, 1, 2, 3, 7, 6, 5, 4, 157 | 4, 7, 6, 5, 3, 2, 
1, 0, 158 | 0, 4, 7, 6, 5, 3, 2, 1, 159 | 1, 4, 7, 6, 5, 3, 2, 0, 160 | 0, 1, 4, 7, 6, 5, 3, 2, 161 | 2, 4, 7, 6, 5, 3, 1, 0, 162 | 0, 2, 4, 7, 6, 5, 3, 1, 163 | 1, 2, 4, 7, 6, 5, 3, 0, 164 | 0, 1, 2, 4, 7, 6, 5, 3, 165 | 3, 4, 7, 6, 5, 2, 1, 0, 166 | 0, 3, 4, 7, 6, 5, 2, 1, 167 | 1, 3, 4, 7, 6, 5, 2, 0, 168 | 0, 1, 3, 4, 7, 6, 5, 2, 169 | 2, 3, 4, 7, 6, 5, 1, 0, 170 | 0, 2, 3, 4, 7, 6, 5, 1, 171 | 1, 2, 3, 4, 7, 6, 5, 0, 172 | 0, 1, 2, 3, 4, 7, 6, 5, 173 | 5, 7, 6, 4, 3, 2, 1, 0, 174 | 0, 5, 7, 6, 4, 3, 2, 1, 175 | 1, 5, 7, 6, 4, 3, 2, 0, 176 | 0, 1, 5, 7, 6, 4, 3, 2, 177 | 2, 5, 7, 6, 4, 3, 1, 0, 178 | 0, 2, 5, 7, 6, 4, 3, 1, 179 | 1, 2, 5, 7, 6, 4, 3, 0, 180 | 0, 1, 2, 5, 7, 6, 4, 3, 181 | 3, 5, 7, 6, 4, 2, 1, 0, 182 | 0, 3, 5, 7, 6, 4, 2, 1, 183 | 1, 3, 5, 7, 6, 4, 2, 0, 184 | 0, 1, 3, 5, 7, 6, 4, 2, 185 | 2, 3, 5, 7, 6, 4, 1, 0, 186 | 0, 2, 3, 5, 7, 6, 4, 1, 187 | 1, 2, 3, 5, 7, 6, 4, 0, 188 | 0, 1, 2, 3, 5, 7, 6, 4, 189 | 4, 5, 7, 6, 3, 2, 1, 0, 190 | 0, 4, 5, 7, 6, 3, 2, 1, 191 | 1, 4, 5, 7, 6, 3, 2, 0, 192 | 0, 1, 4, 5, 7, 6, 3, 2, 193 | 2, 4, 5, 7, 6, 3, 1, 0, 194 | 0, 2, 4, 5, 7, 6, 3, 1, 195 | 1, 2, 4, 5, 7, 6, 3, 0, 196 | 0, 1, 2, 4, 5, 7, 6, 3, 197 | 3, 4, 5, 7, 6, 2, 1, 0, 198 | 0, 3, 4, 5, 7, 6, 2, 1, 199 | 1, 3, 4, 5, 7, 6, 2, 0, 200 | 0, 1, 3, 4, 5, 7, 6, 2, 201 | 2, 3, 4, 5, 7, 6, 1, 0, 202 | 0, 2, 3, 4, 5, 7, 6, 1, 203 | 1, 2, 3, 4, 5, 7, 6, 0, 204 | 0, 1, 2, 3, 4, 5, 7, 6, 205 | 6, 7, 5, 4, 3, 2, 1, 0, 206 | 0, 6, 7, 5, 4, 3, 2, 1, 207 | 1, 6, 7, 5, 4, 3, 2, 0, 208 | 0, 1, 6, 7, 5, 4, 3, 2, 209 | 2, 6, 7, 5, 4, 3, 1, 0, 210 | 0, 2, 6, 7, 5, 4, 3, 1, 211 | 1, 2, 6, 7, 5, 4, 3, 0, 212 | 0, 1, 2, 6, 7, 5, 4, 3, 213 | 3, 6, 7, 5, 4, 2, 1, 0, 214 | 0, 3, 6, 7, 5, 4, 2, 1, 215 | 1, 3, 6, 7, 5, 4, 2, 0, 216 | 0, 1, 3, 6, 7, 5, 4, 2, 217 | 2, 3, 6, 7, 5, 4, 1, 0, 218 | 0, 2, 3, 6, 7, 5, 4, 1, 219 | 1, 2, 3, 6, 7, 5, 4, 0, 220 | 0, 1, 2, 3, 6, 7, 5, 4, 221 | 4, 6, 7, 5, 3, 2, 1, 0, 222 | 0, 4, 6, 7, 5, 3, 2, 1, 223 | 1, 4, 6, 7, 5, 3, 2, 0, 224 | 0, 1, 
4, 6, 7, 5, 3, 2, 225 | 2, 4, 6, 7, 5, 3, 1, 0, 226 | 0, 2, 4, 6, 7, 5, 3, 1, 227 | 1, 2, 4, 6, 7, 5, 3, 0, 228 | 0, 1, 2, 4, 6, 7, 5, 3, 229 | 3, 4, 6, 7, 5, 2, 1, 0, 230 | 0, 3, 4, 6, 7, 5, 2, 1, 231 | 1, 3, 4, 6, 7, 5, 2, 0, 232 | 0, 1, 3, 4, 6, 7, 5, 2, 233 | 2, 3, 4, 6, 7, 5, 1, 0, 234 | 0, 2, 3, 4, 6, 7, 5, 1, 235 | 1, 2, 3, 4, 6, 7, 5, 0, 236 | 0, 1, 2, 3, 4, 6, 7, 5, 237 | 5, 6, 7, 4, 3, 2, 1, 0, 238 | 0, 5, 6, 7, 4, 3, 2, 1, 239 | 1, 5, 6, 7, 4, 3, 2, 0, 240 | 0, 1, 5, 6, 7, 4, 3, 2, 241 | 2, 5, 6, 7, 4, 3, 1, 0, 242 | 0, 2, 5, 6, 7, 4, 3, 1, 243 | 1, 2, 5, 6, 7, 4, 3, 0, 244 | 0, 1, 2, 5, 6, 7, 4, 3, 245 | 3, 5, 6, 7, 4, 2, 1, 0, 246 | 0, 3, 5, 6, 7, 4, 2, 1, 247 | 1, 3, 5, 6, 7, 4, 2, 0, 248 | 0, 1, 3, 5, 6, 7, 4, 2, 249 | 2, 3, 5, 6, 7, 4, 1, 0, 250 | 0, 2, 3, 5, 6, 7, 4, 1, 251 | 1, 2, 3, 5, 6, 7, 4, 0, 252 | 0, 1, 2, 3, 5, 6, 7, 4, 253 | 4, 5, 6, 7, 3, 2, 1, 0, 254 | 0, 4, 5, 6, 7, 3, 2, 1, 255 | 1, 4, 5, 6, 7, 3, 2, 0, 256 | 0, 1, 4, 5, 6, 7, 3, 2, 257 | 2, 4, 5, 6, 7, 3, 1, 0, 258 | 0, 2, 4, 5, 6, 7, 3, 1, 259 | 1, 2, 4, 5, 6, 7, 3, 0, 260 | 0, 1, 2, 4, 5, 6, 7, 3, 261 | 3, 4, 5, 6, 7, 2, 1, 0, 262 | 0, 3, 4, 5, 6, 7, 2, 1, 263 | 1, 3, 4, 5, 6, 7, 2, 0, 264 | 0, 1, 3, 4, 5, 6, 7, 2, 265 | 2, 3, 4, 5, 6, 7, 1, 0, 266 | 0, 2, 3, 4, 5, 6, 7, 1, 267 | 1, 2, 3, 4, 5, 6, 7, 0, 268 | 0, 1, 2, 3, 4, 5, 6, 7 269 | }; 270 | 271 | size_t tetzank_intersect_scalar(const uint32_t *list1, size_t size1, const uint32_t *list2, size_t size2, uint32_t *result){ 272 | size_t counter=0; 273 | const uint32_t *end1 = list1+size1, *end2 = list2+size2; 274 | while(list1 != end1 && list2 != end2){ 275 | if(*list1 < *list2){ 276 | list1++; 277 | }else if(*list1 > *list2){ 278 | list2++; 279 | }else{ 280 | result[counter++] = *list1; 281 | list1++; list2++; 282 | } 283 | } 284 | return counter; 285 | } 286 | 287 | size_t tetzank_intersect_scalar_count(const uint32_t *list1, size_t size1, const uint32_t *list2, size_t size2){ 288 | size_t counter=0; 289 | const uint32_t *end1 = 
#ifdef __AVX2__

#include <immintrin.h>

/**
 * Builds the 8-bit match mask for one pair of 8-lane blocks: bit i of the
 * result is set when 32-bit lane i of v_a equals at least one lane of v_b.
 *
 * v_a is compared against eight arrangements of v_b: v_b itself, v_b rotated
 * by one, two and three lanes inside each 128-bit half, and the same four
 * arrangements after swapping the two 128-bit halves. Together these put
 * every lane of v_b opposite every lane of v_a exactly once.
 */
static inline int tetzank_match_mask_avx2(const __m256i v_a, const __m256i v_b) {
    constexpr int rot_one   = _MM_SHUFFLE(0,3,2,1); // rotate by one lane
    constexpr int rot_two   = _MM_SHUFFLE(1,0,3,2); // rotate by two lanes
    constexpr int rot_three = _MM_SHUFFLE(2,1,0,3); // rotate by one lane, other direction

    // The permutes are float-domain instructions; the casts only reinterpret bits.
    const __m256 b = _mm256_castsi256_ps(v_b);
    const __m256 b_swapped = _mm256_permute2f128_ps(b, b, 1); // swap 128-bit halves

    const __m256i m1 = _mm256_cmpeq_epi32(v_a, v_b);
    const __m256i m2 = _mm256_cmpeq_epi32(v_a, _mm256_castps_si256(_mm256_permute_ps(b, rot_one)));
    const __m256i m3 = _mm256_cmpeq_epi32(v_a, _mm256_castps_si256(_mm256_permute_ps(b, rot_two)));
    const __m256i m4 = _mm256_cmpeq_epi32(v_a, _mm256_castps_si256(_mm256_permute_ps(b, rot_three)));
    const __m256i m5 = _mm256_cmpeq_epi32(v_a, _mm256_castps_si256(b_swapped));
    const __m256i m6 = _mm256_cmpeq_epi32(v_a, _mm256_castps_si256(_mm256_permute_ps(b_swapped, rot_one)));
    const __m256i m7 = _mm256_cmpeq_epi32(v_a, _mm256_castps_si256(_mm256_permute_ps(b_swapped, rot_two)));
    const __m256i m8 = _mm256_cmpeq_epi32(v_a, _mm256_castps_si256(_mm256_permute_ps(b_swapped, rot_three)));

    const __m256i any = _mm256_or_si256(
            _mm256_or_si256(_mm256_or_si256(m1, m2), _mm256_or_si256(m3, m4)),
            _mm256_or_si256(_mm256_or_si256(m5, m6), _mm256_or_si256(m7, m8)));
    // One sign bit per 32-bit lane -> 8-bit mask.
    return _mm256_movemask_ps(_mm256_castsi256_ps(any));
}

/**
 * Intersects two lists eight elements at a time and writes the common values
 * to `result`; the remaining tail is handled by tetzank_intersect_scalar.
 *
 * Each iteration stores a full 8-lane vector at result[count], even when fewer
 * lanes matched, so `result` must have 8 writable elements past the current
 * output position.
 * NOTE(review): the block-advance logic presumes both lists are sorted
 * ascending as unsigned values -- confirm against callers.
 *
 * @return number of values written to `result`
 */
size_t tetzank_intersect_vector_avx2(const uint32_t *list1, size_t size1, const uint32_t *list2, size_t size2, uint32_t *result){
    size_t count = 0, i_a = 0, i_b = 0;
    const size_t st_a = (size1 / 8) * 8;
    const size_t st_b = (size2 / 8) * 8;
    while (i_a < st_a && i_b < st_b) {
        const __m256i v_a = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&list1[i_a]));
        const __m256i v_b = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&list2[i_b]));

        // Compare the block maxima as UNSIGNED values: the lists hold uint32_t,
        // and a signed comparison would order values >= 2^31 before small ones
        // and advance the wrong list.
        const uint32_t a_max = list1[i_a + 7];
        const uint32_t b_max = list2[i_b + 7];
        i_a += (a_max <= b_max) * 8;
        i_b += (a_max >= b_max) * 8;

        const int mask = tetzank_match_mask_avx2(v_a, v_b);

        // Row `mask` of the table moves the matching lanes of v_a to the front.
        const __m256i idx = _mm256_load_si256(reinterpret_cast<const __m256i*>(&shuffle_mask_avx[mask * 8]));
        const __m256i packed = _mm256_permutevar8x32_epi32(v_a, idx);
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(&result[count]), packed);

        count += _mm_popcnt_u32(mask);
    }
    // intersect the tail using scalar intersection
    count += tetzank_intersect_scalar(list1 + i_a, size1 - i_a, list2 + i_b, size2 - i_b, result + count);
    return count;
}

/**
 * Same traversal as tetzank_intersect_vector_avx2, but only counts the common
 * values instead of writing them out.
 *
 * @return number of values present in both lists
 */
size_t tetzank_intersect_vector_avx2_count(const uint32_t *list1, size_t size1, const uint32_t *list2, size_t size2){
    size_t count = 0, i_a = 0, i_b = 0;
    const size_t st_a = (size1 / 8) * 8;
    const size_t st_b = (size2 / 8) * 8;
    while (i_a < st_a && i_b < st_b) {
        const __m256i v_a = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&list1[i_a]));
        const __m256i v_b = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&list2[i_b]));

        // Unsigned comparison, see tetzank_intersect_vector_avx2.
        const uint32_t a_max = list1[i_a + 7];
        const uint32_t b_max = list2[i_b + 7];
        i_a += (a_max <= b_max) * 8;
        i_b += (a_max >= b_max) * 8;

        count += _mm_popcnt_u32(tetzank_match_mask_avx2(v_a, v_b));
    }
    // intersect the tail using scalar intersection
    count += tetzank_intersect_scalar_count(list1 + i_a, size1 - i_a, list2 + i_b, size2 - i_b);
    return count;
}
#endif
uint32_t * rare, 18 | const size_t nrare, const uint32_t * freq, const size_t nfreq, uint32_t * out) { 19 | UINT4 goal; 20 | const UINT4 *stop_rare; 21 | UINT4 *init_out; 22 | long j; 23 | long nfreqleft = static_cast(nfreq);// possibly unsafe if nfreq exceeds the range of longs 24 | 25 | init_out = out; 26 | stop_rare = &(rare[nrare]); 27 | while (rare < stop_rare) { 28 | goal = *rare++; 29 | j = FINDFUNCTION(goal,freq,nfreqleft); 30 | 31 | if (j >= nfreqleft) { 32 | return (out - init_out); 33 | } else if (freq[j] == goal) { 34 | *out++ = goal; 35 | } 36 | freq += j; 37 | nfreqleft -= j; 38 | } 39 | return (out - init_out); 40 | } 41 | 42 | 43 | long 44 | Intersection_find_scalar (UINT4 goal, const UINT4 *target, long ntargets); 45 | long 46 | Intersection_find_gallop (UINT4 goal, const UINT4 *target, long ntargets); 47 | long 48 | Intersection_find_v1 (UINT4 goal, const UINT4 *target, long ntargets); 49 | long 50 | Intersection_find_v1_aligned (UINT4 goal, const UINT4 *target, long ntargets); 51 | long 52 | Intersection_find_v1_plow (UINT4 goal, const UINT4 *target, long ntargets); 53 | long 54 | Intersection_find_v2 (UINT4 goal, const UINT4 *target, long ntargets); 55 | long 56 | Intersection_find_v2_aligned (UINT4 goal, const UINT4 *target, long ntargets); 57 | long 58 | Intersection_find_v3 (UINT4 goal, const UINT4 *target, long ntargets); 59 | long 60 | Intersection_find_v3_aligned (UINT4 goal, const UINT4 *target, long ntargets); 61 | long 62 | Intersection_find_simdgallop_v0 (UINT4 goal, const UINT4 *target, long ntargets); 63 | long 64 | Intersection_find_simdgallop_v1 (UINT4 goal, const UINT4 *target, long ntargets); 65 | long 66 | Intersection_find_simdgallop_v2 (UINT4 goal, const UINT4 *target, long ntargets); 67 | long 68 | Intersection_find_simdgallop_v3 (UINT4 goal, const UINT4 *target, long ntargets); 69 | 70 | 71 | typedef long (*flaggedintersectionfindfunction)(int *foundp, UINT4 goal, const UINT4 *target, long ntargets); 72 | 73 | long 74 | 
Intersection_find_v3_cmpeq (int *foundp, UINT4 goal, const UINT4 *target, long ntargets); 75 | 76 | long 77 | Intersection_truefind_v3_cmpeq_scalar (int *foundp, UINT4 goal, const UINT4 *target, long ntargets); 78 | 79 | 80 | long 81 | Intersection_truefind_v3_cmpeq_simd32 (int *foundp, UINT4 goal, const UINT4 *target, long ntargets); 82 | 83 | 84 | long 85 | Intersection_truefind_v3_cmpeq_simd8 (int *foundp, UINT4 goal, const UINT4 *target, long ntargets) ; 86 | 87 | long 88 | Intersection_truefind_v3_cmpeq_binary (int *foundp, UINT4 goal, const UINT4 *target, long ntargets) ; 89 | 90 | template 91 | size_t 92 | compute_intersection_flagged (const uint32_t * rare, 93 | const size_t nrare, const uint32_t * freq, const size_t nfreq, uint32_t * out) { 94 | UINT4 *init_out; 95 | size_t i; 96 | 97 | init_out = out; 98 | 99 | size_t lenFreq = nfreq; 100 | long pos; 101 | int foundp; 102 | for (i = 0; i < nrare; i++) { 103 | pos = FINDFUNCTION(&foundp,rare[i],freq,lenFreq); 104 | if (foundp == 1) { 105 | *out++ = rare[i]; 106 | } 107 | freq += pos; 108 | lenFreq -= pos; 109 | } 110 | return (out - init_out); 111 | } 112 | 113 | 114 | #endif /* THOMASWU_H_ */ 115 | -------------------------------------------------------------------------------- /include/timer.h: -------------------------------------------------------------------------------- 1 | /** 2 | * This code is released under the 3 | * Apache License Version 2.0 http://www.apache.org/licenses/. 4 | * 5 | */ 6 | 7 | #ifndef TIMER_H_ 8 | #define TIMER_H_ 9 | 10 | #include 11 | #include 12 | #include 13 | 14 | class WallClockTimer { 15 | public: 16 | struct timeval t1, t2; 17 | WallClockTimer() : 18 | t1(), t2() { 19 | gettimeofday(&t1, 0); 20 | t2 = t1; 21 | } 22 | void reset() { 23 | gettimeofday(&t1, 0); 24 | t2 = t1; 25 | } 26 | uint64_t elapsed() { 27 | return ((t2.tv_sec - t1.tv_sec) * 1000ULL * 1000ULL) + ((t2.tv_usec 28 | - t1. 
/**
 * Merges two lists into `out`, writing a value that is at the head of both
 * lists only once.
 *
 * The merge only ever compares the current heads of the two lists.
 * NOTE(review): this presumes both inputs are sorted ascending -- confirm
 * with callers.
 *
 * Declared inline because this definition lives in a header that may be
 * included from several translation units.
 *
 * @param set1     first list
 * @param length1  number of values in `set1`
 * @param set2     second list
 * @param length2  number of values in `set2`
 * @param out      destination; up to length1 + length2 values are written
 * @return         number of values written to `out`
 */
inline size_t unite(const uint32_t * set1, const size_t length1,
        const uint32_t * set2, const size_t length2, uint32_t * out) {
    size_t pos = 0;
    size_t k1 = 0, k2 = 0;
    // Merge while both lists still have elements.
    while (k1 < length1 && k2 < length2) {
        const uint32_t a = set1[k1];
        const uint32_t b = set2[k2];
        if (a < b) {
            out[pos++] = a;
            ++k1;
        } else if (b < a) {
            out[pos++] = b;
            ++k2;
        } else { // equal heads: emit once, advance both
            out[pos++] = a;
            ++k1;
            ++k2;
        }
    }
    // At most one of the lists has a remainder; copy it verbatim.
    for (; k1 < length1; ++k1)
        out[pos++] = set1[k1];
    for (; k2 < length2; ++k2)
        out[pos++] = set2[k2];
    return pos;
}
/**
 * Partial Fisher-Yates shuffle: moves N elements chosen with rand() to the
 * front of [begin, end) and returns the iterator just past them.
 *
 * If N is larger than the range, the whole range is shuffled and `end` is
 * returned. Without that clamp the loop would evaluate `rand() % 0` and
 * step past `end`.
 *
 * @param begin  start of the range
 * @param end    end of the range
 * @param N      number of elements to select
 * @return       iterator to the first element that was not selected
 */
template <class iter>
iter shuffleFY(iter begin, iter end, size_t N) {
    size_t M = std::distance(begin, end);
    if (N > M) {
        N = M; // cannot pick more elements than the range holds
    }
    while (N--) {
        // Pick one of the M not-yet-selected elements and swap it to the front.
        iter r = begin;
        std::advance(r, rand() % M);
        std::swap(*begin, *r);
        ++begin;
        --M;
    }
    return begin;
}
line 81 lt rgb "#808080" # grey 8 | 9 | #set grid back linestyle 81 10 | set border 3 back linestyle 80 # Remove border on top and right. These 11 | # borders are useless and make it harder 12 | # to see plotted lines near the border. 13 | # Also, put it in grey; no need for so much emphasis on a border. 14 | set xtics nomirror 15 | set ytics nomirror 16 | 17 | 18 | 19 | set style line 1 lt rgb "#A00000" lw 4 pt 1 ps 0.5 20 | set style line 2 lt rgb "#00A000" lw 4 pt 5 ps 0.5 21 | set style line 3 lt rgb "#5060D0" lw 4 pt 7 ps 0.5 22 | set style line 4 lt rgb "#FF1493" lw 4 pt 9 ps 0.5 23 | set style line 5 lt rgb "red" lw 4 pt 11 ps 0.5 24 | set style line 6 lt rgb "#808000" lw 4 pt 13 ps 0.5 25 | set style line 7 lt rgb "#00008B" lw 4 pt 15 ps 0.5 26 | set style line 8 lt rgb "#800080" lw 4 pt 21 ps 0.5 27 | set style line 9 lt rgb "black" lw 4 pt 63 ps 0.5 28 | set style line 10 lt rgb "blue" lw 4 pt 28 ps 0.5 29 | set style line 11 lt rgb "violet" lw 4 pt 44 ps 0.5 30 | set style line 81 lt 0 # dashed 31 | set style line 81 lt rgb "#808080" # grey 32 | 33 | set grid back linestyle 81 34 | #set xtics 2 35 | #set ytics 1 36 | set term pdfcairo 37 | #fontscale 0.8 38 | 39 | 40 | set out "ratiobitpacking.pdf" 41 | 42 | set xlabel "Ratio length (large list) / length(small list)" 43 | set ylabel "relative speed (scalar = 1)" 44 | #set ylabel "relative speed (galloping = 1)" 45 | 46 | set key bmargin 47 | set key samplen 2 spacing .5 font ",8" maxrows 4 48 | #set xrange [1:8192] 49 | set xrange [1:10000] 50 | set logscale x 2 51 | 52 | set logscale y 2 53 | 54 | 55 | set out "benchintersection5march2014_gallop.pdf" 56 | #set logscale x 2 57 | #set logscale y 58 | 59 | plot "benchintersection5march2014.txt" using 1:($13/$3) ti "gallop" with linespoints lw 2 ps 0.5,\ 60 | "" using 1:($21/$3) ti "SIMD gallop" with linespoints lw 2 ps 0.5,\ 61 | "" using 1:($22/$3) ti "SIMD gallop2" with linespoints lw 2 ps 0.5,\ 62 | "" using 1:($25/$3) ti "Wu SIMD gallop v0" with 
linespoints lw 2 ps 0.5,\ 63 | "" using 1:($26/$3) ti "Wu SIMD gallop v1" with linespoints lw 2 ps 0.5,\ 64 | "" using 1:($27/$3) ti "Wu SIMD gallop v2" with linespoints lw 2 ps 0.5,\ 65 | "" using 1:($28/$3) ti "Wu SIMD gallop v3" with linespoints lw 2 ps 0.5 66 | 67 | 68 | set out "benchintersection5march2014_v1.pdf" 69 | 70 | plot "benchintersection5march2014.txt" using 1:($34/$3) ti "SIMD v1" with linespoints lw 2 ps 0.5,\ 71 | "" using 1:($29/$3) ti "Wu SIMD v1" with linespoints lw 2 ps 0.5,\ 72 | "" using 1:($30/$3) ti "Wu SIMD v1 plow" with linespoints lw 2 ps 0.5 73 | 74 | 75 | 76 | set out "benchintersection5march2014_v3.pdf" 77 | 78 | plot "benchintersection5march2014.txt" using 1:($35/$3) ti "SIMD v3" with linespoints lw 2 ps 0.5,\ 79 | "" using 1:($32/$3) ti "Wu SIMD v3" with linespoints lw 2 ps 0.5,\ 80 | "" using 1:($33/$3) ti "Wu SIMD v3 aligned" with linespoints lw 2 ps 0.5 81 | -------------------------------------------------------------------------------- /results/benchintersection5march2014.txt: -------------------------------------------------------------------------------- 1 | # howmany : 5 2 | # loop : 3 3 | # distribution : clustered 4 | # Big : 22 5 | # intersectionratio : 0.3 6 | # MaxBit : 26 7 | # size-ratio @hybriddan branchless danfar danfarfar danfarmov f2p0 f4p0 f8p0 hssimd hssimddan natemediumdanalt scalar1sgalloping scalarbranchless scalarbranchlesscached scalarbranchlesscached2 scalarbranchlessunrolled scalardanbranchless scalarnate scalarnatewg simdgalloping simdgalloping2 thomas_gallop thomas_scalar thomas_simdgallop_v0 thomas_simdgallop_v1 thomas_simdgallop_v2 thomas_simdgallop_v3 thomas_v1 thomas_v1_plow thomas_v2 thomas_v3 thomas_v3_aligned v1 v3 widevector widevectorleo relative-intersection-size 8 | #generating data...ok. 
9 | 1.001 990.45 438.03 464.16 442.4 463.37 991.06 998.3 783.6 1066.5 1135.3 527.17 388.7 278.78 430.61 438.57 276.69 438.18 434.95 441.22 435.3 474.08 284.21 328.98 204.6 204.88 158.27 155.49 249.05 248.8 188.73 185.11 243.47 990.82 442.54 976.53 1103.7 0.34884 10 | #generating data...ok. 11 | 1.3798 1024.4 427.53 584.81 552.03 587.75 1025 1092 893.83 1034.8 1077.9 653.41 362.24 269.65 424.13 428.98 266.47 427.74 411.6 419.61 541.67 596 286.96 328.09 217.33 223.85 176.02 174.5 268.44 264.31 207.94 203.8 247.84 1024.3 552.3 955.56 1092.6 0.32896 12 | #generating data...ok. 13 | 1.9019 1097.9 429.83 626.36 597.3 625.05 1098.8 1204.4 1035.2 1120.9 1135.7 678.9 376.84 262.49 417.55 422.96 260.7 429.82 440.6 444.6 587.51 630.48 307.21 350.76 240.76 250.5 203.04 202.25 293.87 286.51 236.72 233.42 270.03 1098.9 597.32 958.63 1108 0.34597 14 | #generating data...ok. 15 | 2.6216 1293.8 640.14 880.69 832 886.9 1294.5 1368.3 1201.9 1242.7 1277.1 958.96 542.63 304.91 533.24 536.93 302.96 640.36 663.85 679.38 815.51 882.7 414.48 522.1 337.24 352.85 285.07 281.41 426.33 409.21 338.66 334.11 395.23 1295.5 830.23 969.71 1139 0.32307 16 | #generating data...ok. 17 | 3.6136 1256.5 554.73 1040.3 984.84 1050.4 1258.2 1462.1 1423.6 1207.4 1201.5 1065.6 467.4 283.25 483.46 491.42 282.04 554.98 580.95 588.65 962.99 1038.2 388.85 479.19 348.1 382.41 327.65 326.81 434.91 409.58 374.19 370.57 374.25 1257.5 982.22 946.22 1116.8 0.33031 18 | #generating data...ok. 19 | 4.981 1350.9 571.9 1070.2 1025.9 1072.8 1346 1623.3 1676 1245.8 1218 1072.6 471.58 283.61 490.34 499.2 281.77 571.62 597.29 601.66 1006 1055.6 403.68 492.87 383.43 423.02 376.6 376.9 469.39 442.45 416.09 418.08 395.61 1343.7 1024.4 945.58 1125.6 0.35888 20 | #generating data...ok. 
21 | 6.8659 1559.3 786.68 1442.3 1382.3 1447.4 1553.3 1794 1936 1383.4 1369.9 1406.2 653.65 312 577.21 585.72 311.08 787.2 871.59 880.64 1348.5 1393.9 545.9 717.8 504.68 569.47 535.3 535.7 644.76 592.17 606.14 608.67 578.73 1548.4 1380.9 958.4 1153.2 0.33581 22 | #generating data...ok. 23 | 9.4639 1657.4 837.43 1567.3 1514 1563.2 1661.2 1943.9 2217.2 1441.9 1423.6 1487.2 697 313.74 589.71 598.56 313.99 837.15 953.78 959.17 1482.9 1499.8 606.64 783.7 564.85 642.3 652.78 653.66 710.77 665.95 722.19 735.53 666.88 1659.5 1515.4 968.42 1171 0.34253 24 | #generating data...ok. 25 | 13.045 2154.2 1026.8 1875.9 1842.1 1864.5 2165.9 2382.8 2550.4 1522.6 1506.9 1792.9 970.24 329.31 647.69 654.01 329.86 1026.5 1258.8 1266.5 1794.4 1740.7 820.85 1053.1 742.31 815.57 808.5 819.31 964.85 887.41 915.43 923.51 912.48 2165.5 1843.1 966.06 1189.9 0.41017 26 | #generating data...ok. 27 | 17.981 2433.7 1091.7 2580 2498.9 2587.7 2439.5 2704.7 2941.2 1551.5 1532.1 2361.5 1121.4 330.05 661.37 666.71 330.85 1091.6 1401.2 1417.2 2430.6 2390.4 964.81 1199.8 936.25 1052.6 1048.5 1065.8 1237.8 1100.1 1183.8 1203.8 1101.8 2440.7 2499.4 928.14 1173.9 0.33022 28 | #generating data...ok. 29 | 24.786 2491.7 1191.9 2725.9 2667.5 2730 2512.5 2724.9 3132.8 1592 1572.2 2443.9 1160.2 336.04 684.17 690.27 337.01 1189 1574.2 1570.2 2582.9 2470.5 1017.4 1348.5 1010.4 1146.6 1297.5 1330.5 1356.8 1219.8 1435.1 1491.3 1279.1 2502.5 2642 965.93 1199.8 0.34907 30 | #generating data...ok. 31 | 34.165 2958.9 1302.1 3220.4 3157.8 3242.6 2966.2 3106 3497 1626.2 1606.5 2938.4 1459.5 340.03 704.12 708.03 341.23 1301.6 1817.7 1813.8 3022.7 2893.7 1305.5 1585.7 1286 1415.3 1571.4 1602.9 1754.9 1550.3 1782.4 1799 1584.8 2962.6 3157.4 933.86 1204.1 0.36754 32 | #generating data...ok. 
33 | 47.093 3182.6 1359.7 3462.6 3382.7 3468.2 3218.4 3377.9 3616.4 1648.1 1625.8 3114.1 1578.3 342.35 714.86 720.38 343.44 1360.2 1983.9 1987.7 3280.4 3088.7 1388.2 1746.7 1450.5 1617.9 1817.1 1908.2 2097.5 1768.8 2066.2 2131.1 1843.3 3219.4 3379.5 931.26 1208.2 0.34864 34 | #generating data...ok. 35 | 64.913 4363.4 1470.1 4262.9 4397 4253.5 3422.4 3638.4 4223.1 1665.7 1639.6 3609 2012.7 345.1 730.63 734.01 346.39 1472.3 2181.7 2183.5 4162.3 3773 1851.6 1934.2 1827.7 2028.1 2268.5 2323.6 2367.2 2001.5 2502 2601.3 2266.6 3418.9 4400 905.43 1215.8 0.33951 36 | #generating data...ok. 37 | 89.477 4707.4 1532.7 4556.6 4698.7 4551.7 3701.1 3888.3 4387 1691.7 1668.1 3859.8 2390.3 347.64 742.68 745.14 349.04 1531.1 2385.3 2377.6 4537.7 4209.5 2141.6 2143.1 2164.7 2392.3 2669.7 2782.1 2775.7 2275.7 2864.8 3052 2719.9 3712.1 4701.8 879.54 1218.8 0.48246 38 | #generating data...ok. 39 | 123.33 5049.7 1588 4938.7 5025.6 4977.5 3952.8 4120.8 4589.8 1712.1 1686.8 4273.6 2525.2 349.22 750.44 752.55 350.85 1590.9 2521.4 2553.2 4800.4 4434.4 2427.3 2318.9 2335 2538.4 2785.9 2888.7 3232 2578.9 3227.7 3220.2 2951.4 3962.9 5072.3 839.97 1216.7 0.33693 40 | #generating data...ok. 41 | 170.01 5571.3 1648.3 5441.4 5679.8 5434.9 4169.9 4276.7 4885.6 1702.2 1678.9 4605.7 3036.2 349.89 756.27 757.99 351.55 1649.7 2663.5 2680.9 5340.4 4778.6 2911.8 2440.3 2771.5 2961.4 3214.1 3226.7 3601.1 2820.9 3626.7 3668.9 3483.1 4168.5 5685.9 778.36 1143.7 0.35392 42 | #generating data...ok. 43 | 234.34 7303.8 1683.4 6228.1 7167.9 6110 4182.8 4266.5 5267.9 1703.2 1701.6 5000.6 3958 350.82 759.93 761.66 352.49 1684.2 2775.6 2798.3 6865.6 5977.1 3845.3 2551.4 3702.4 4187.8 4443.2 4495 3742.3 2981.9 4126 4571.8 4480.7 4177.9 7346.2 765.64 1128.9 0.34326 44 | #generating data...ok. 
45 | 323.01 7600.2 1728.7 6563.8 7325.8 6510.3 4353.8 4437.4 5330.7 1723.3 1702.4 5218.3 3945.5 351.37 763.6 765 353.21 1726.5 2904.4 2888.3 6892.2 5957.8 3943.1 2648.9 3705.7 3948.7 4244 4179.7 3935.9 3146.1 4301.6 4573.1 4461.6 4358.7 7338.6 754.68 1150.5 0.35395 46 | #generating data...ok. 47 | 445.24 9107.2 1756.4 7246.2 8943.1 7062.7 4329.7 4464 5540.3 1736.6 1710.5 5502.1 4834.3 351.84 766.24 767.03 353.73 1759.7 2959.9 2971.8 7989 6501.9 4841.7 2673.9 4566.4 5016.6 5359.5 5354.5 4109.3 3272.2 4674.1 5413.3 5503.6 4331.5 8958.4 721.95 988.65 0.34055 48 | #generating data...ok. 49 | 613.72 9882.8 1775.2 7673.3 10021 7399.4 4387.8 4471.9 5712.4 1751.6 1720.6 5637.2 7039 352.14 767.84 769.05 354.19 1777.7 3061.3 3045.9 10488 10601 7535.6 2768.5 6497.5 7739.3 8195.2 8380.5 4210.7 3353.9 5069.8 7476.6 8072.4 4390.5 10244 702.74 923.82 0.33772 50 | #generating data...ok. 51 | 845.95 11248 1794.3 8358.1 11844 7924.9 4405.3 4458.9 5864.2 1702.7 1676.8 5804.2 8999.4 352.43 768.93 769.27 354.49 1797.2 3088.7 3045.4 12748 13054 9665 2808.4 8305.2 9913 10669 10769 4287.2 3436.7 5405.7 9739.8 10199 4422 11950 691.8 803.81 0.51735 52 | #generating data...ok. 53 | 1166.1 13579 1803.7 8743 13498 8901.2 4533.6 4618.8 5994.6 1751.3 1719.1 5970.2 10514 352.43 770.06 771.31 354.67 1809.3 3154.4 3136.9 13787 13812 10960 2829.6 9703.7 11641 12255 12728 4362.1 3507.9 5631.6 10453 11838 4454.7 13536 693.44 817.81 0.33389 54 | #generating data...ok. 55 | 1607.3 15213 1808.2 9270.5 15973 9472.7 4554.8 4631.8 6128 1758.7 1726.3 6106 16220 352.68 770.93 769.39 354.8 1811 3176.5 3188.6 24615 27000 18395 2930.4 15346 19244 20939 21750 4410.9 3546.7 6004.7 14067 16423 4515.9 15172 699.23 800.06 0.40077 56 | #generating data...ok. 
57 | 2215.5 15332 1811.6 9246.1 15942 9485.7 4556.4 4637.7 6117.6 1758.2 1728.6 6089.8 14301 352.75 771.33 772.48 354.82 1818.3 3185.3 3200 20018 22787 17799 2902.2 13317 17900 19366 19437 4444.8 3572 5962.9 13638 16390 4480.8 14664 691.75 797.42 0.35288 58 | #generating data...ok. 59 | 3053.9 13767 1817.5 8763.8 14427 9012.3 4590.3 4673.2 6230.4 1766.1 1725.5 5951.7 22526 354.55 774.9 777.31 356.92 1819.6 3245.7 3232.5 25328 30556 31395 3075 21077 32751 36034 37855 4499.7 3606.3 6267.7 17385 21975 4509.7 13583 691.68 776.76 0.34669 60 | #generating data...ok. 61 | 4209.5 17126 1820.5 9359.4 17906 9981.7 4542.8 4654.8 6229.4 1715.1 1726.6 6196.9 26632 352.74 771 772.96 355.06 1827.6 3230.4 3241.7 37690 47873 36551 2995.7 25273 38310 39965 41909 4471.2 3617.3 6289.8 20358 22652 4492.2 17348 692.35 788.92 0.34237 62 | #generating data...ok. 63 | 5802.4 17006 1834.5 9736.6 18997 10123 4615.2 4691.6 6278.7 1742.5 1732.7 6232 21793 354.89 775.81 778.02 357.06 1840 3260.8 3258.5 31203 30042 32797 3000.6 21554 25964 27400 30897 4586.6 3639.3 6383.7 18849 24366 4530.5 17634 705.35 803.19 0.35961 64 | #generating data...ok. 65 | 7998.1 18041 1820.8 9929.3 20853 10435 4573.9 4667.2 6266.6 1748.7 1732.3 6245.4 28987 352.81 772.41 772.12 355.25 1829.2 3253.9 3248.4 48104 44877 46955 2961.8 29531 40720 52176 59595 4508.6 3627.5 6437.9 20360 26384 4510.5 18903 691.69 786.03 0.3645 66 | # bogus = 2817022208 67 | -------------------------------------------------------------------------------- /results/benchintersection6march2014.gnuplot: -------------------------------------------------------------------------------- 1 | 2 | 3 | set style line 80 lt rgb "#000000" 4 | 5 | # Line style for grid 6 | #set style line 81 lt 0 # dashed 7 | #set style line 81 lt rgb "#808080" # grey 8 | 9 | #set grid back linestyle 81 10 | set border 3 back linestyle 80 # Remove border on top and right. These 11 | # borders are useless and make it harder 12 | # to see plotted lines near the border. 
13 | # Also, put it in grey; no need for so much emphasis on a border. 14 | set xtics nomirror 15 | set ytics nomirror 16 | 17 | 18 | 19 | set style line 1 lt rgb "#A00000" lw 4 pt 1 ps 0.5 20 | set style line 2 lt rgb "#00A000" lw 4 pt 5 ps 0.5 21 | set style line 3 lt rgb "#5060D0" lw 4 pt 7 ps 0.5 22 | set style line 4 lt rgb "#FF1493" lw 4 pt 9 ps 0.5 23 | set style line 5 lt rgb "red" lw 4 pt 11 ps 0.5 24 | set style line 6 lt rgb "#808000" lw 4 pt 13 ps 0.5 25 | set style line 7 lt rgb "#00008B" lw 4 pt 15 ps 0.5 26 | set style line 8 lt rgb "#800080" lw 4 pt 21 ps 0.5 27 | set style line 9 lt rgb "black" lw 4 pt 63 ps 0.5 28 | set style line 10 lt rgb "blue" lw 4 pt 28 ps 0.5 29 | set style line 11 lt rgb "violet" lw 4 pt 44 ps 0.5 30 | set style line 81 lt 0 # dashed 31 | set style line 81 lt rgb "#808080" # grey 32 | 33 | set grid back linestyle 81 34 | #set xtics 2 35 | #set ytics 1 36 | set term pdfcairo 37 | #fontscale 0.8 38 | 39 | 40 | set out "ratiobitpacking.pdf" 41 | 42 | set xlabel "Ratio length (large list) / length(small list)" 43 | set ylabel "relative speed (scalar = 1)" 44 | #set ylabel "relative speed (galloping = 1)" 45 | 46 | set key bmargin 47 | set key samplen 2 spacing .5 font ",8" maxrows 4 48 | #set xrange [1:8192] 49 | set xrange [1:10000] 50 | set logscale x 2 51 | 52 | set logscale y 2 53 | 54 | 55 | 56 | set out "benchintersection6march2014_v3.pdf" 57 | 58 | plot "benchintersection6march2014.txt" using 1:($36/$3) ti "SIMD v3" with linespoints lw 2 ps 0.5,\ 59 | "" using 1:($32/$3) ti "Wu SIMD v3" with linespoints lw 2 ps 0.5,\ 60 | "" using 1:($33/$3) ti "Wu SIMD v3 aligned" with linespoints lw 2 ps 0.5,\ 61 | "" using 1:($34/$3) ti "Wu SIMD v3 cmpeq flagged" with linespoints lw 2 ps 0.5 62 | -------------------------------------------------------------------------------- /results/benchintersection6march2014.txt: -------------------------------------------------------------------------------- 1 | # howmany : 5 2 | # loop : 3 3 | # 
distribution : clustered 4 | # Big : 22 5 | # intersectionratio : 0.3 6 | # MaxBit : 26 7 | # size-ratio @hybriddan branchless danfar danfarfar danfarmov f2p0 f4p0 f8p0 hssimd hssimddan natemediumdanalt scalar1sgalloping scalarbranchless scalarbranchlesscached scalarbranchlesscached2 scalarbranchlessunrolled scalardanbranchless scalarnate scalarnatewg simdgalloping simdgalloping2 thomas_gallop thomas_scalar thomas_simdgallop_v0 thomas_simdgallop_v1 thomas_simdgallop_v2 thomas_simdgallop_v3 thomas_v1 thomas_v1_plow thomas_v2 thomas_v3 thomas_v3_aligned thomas_v3cmpeqflagged v1 v3 widevector widevectorleo relative-intersection-size 8 | #generating data...ok. 9 | 1.001 984.39 388.36 420.99 403.03 418.71 985.11 1001.8 773.9 1019.3 1093.1 473.79 347.27 259.6 390.82 400.61 257.98 388.1 369.84 373.96 396.1 429.05 265.14 315.94 188.92 188.7 149.29 146.33 224.35 227.07 174.35 171.32 228.61 287.44 983.99 403.3 950.9 1085.7 0.3362 10 | #generating data...ok. 11 | 1.3798 1043 406.2 508.19 485.58 506.38 1043 1101.9 885.06 1053 1099.2 562.21 362.62 259.98 402.83 408.62 258.55 406.14 397.27 401.05 478.57 517.06 286.65 341.03 214.89 218.12 174.86 171.5 257.36 257.34 202.45 199.48 251 343.47 1042.8 484.84 944.7 1092.4 0.33065 12 | #generating data...ok. 13 | 1.9019 1059.5 415.22 557.28 534.93 554 1060.4 1207.5 1031.6 1068.5 1077.2 597.91 346.78 258.59 406.79 415.73 256.33 415.27 402.41 406.27 525.38 563.33 282.41 340.76 234.08 244.03 200.42 198.09 279.13 273.27 226.87 224.93 255.99 390.15 1059.6 534.71 930.69 1083.4 0.34479 14 | #generating data...ok. 15 | 2.6216 1300.8 461.9 670.41 646.39 666.92 1296.1 1407.7 1190 1212.6 1233.1 727.41 412.31 262.25 428.14 432.23 262.69 461.73 474.44 473 635.86 675.99 354.55 403.34 271.78 279.3 234.85 232.53 325.04 320.44 268.64 266.65 313.3 478.36 1299.7 646.82 953.29 1132.4 0.32439 16 | #generating data...ok. 
17 | 3.6136 1235.4 547.05 933.09 889 930.74 1235.7 1478.7 1412 1216.4 1206.5 954.32 449.71 281.27 478.29 485.23 279.71 547.61 567.11 571.6 868.25 924.3 382.38 471.7 335.81 368.2 320.45 317.82 411.11 389.95 359.43 357.64 366.9 643.41 1236.1 890.27 934.08 1112.8 0.34676 18 | #generating data...ok. 19 | 4.981 1279.2 579.17 969.57 926.76 961.3 1281.6 1619.1 1652.4 1224.2 1196.8 970.13 449.8 284.01 494.51 502.85 282.19 579 582.89 587.01 910.08 946.21 383.01 487.81 368.83 411.15 373.33 371.84 447.07 419.19 407.53 409.45 384.27 718.32 1283.3 927.64 926.11 1112.8 0.34824 20 | #generating data...ok. 21 | 6.8659 1654.8 721.14 1276.2 1229.2 1271.1 1666.7 1989.4 1947.7 1343 1321.4 1271.5 614.83 302.2 554.48 560.81 301.05 720.86 782.42 786.07 1206.8 1247.1 518.55 654.45 491.83 545.12 489.55 492.05 618.44 571.37 543.47 546.67 531.57 944.53 1668.7 1230 938.9 1140.9 0.34008 22 | #generating data...ok. 23 | 9.4639 1854.6 884.78 1494 1450.8 1485.8 1851.9 2047.9 2176.3 1450.5 1432.3 1465.5 756.61 317.62 607.85 614.19 317.44 884.64 1007.2 1014.9 1425.2 1429.8 634.55 818.41 595.42 652.11 647.8 645.04 754.2 711.31 717.4 728.35 706.68 1163 1859.3 1450.7 949.1 1169.9 0.32946 24 | #generating data...ok. 25 | 13.045 1933.9 955.33 1971.2 1881.7 1971.7 1935.6 2277.6 2552.8 1484.9 1460.5 1832.8 835.73 322.79 627.61 634.56 322.67 955.52 1135.7 1141.3 1838.8 1848.6 727.44 937.61 706.72 806.27 842.94 840.29 905.22 826.88 921.74 930.35 821.68 1526.3 1938.6 1879.9 942.64 1175.4 0.36438 26 | #generating data...ok. 27 | 17.981 2804.4 1249.2 2513.4 2462.8 2506.7 2826 2882.4 2921.3 1602.4 1592.4 2415.9 1389.2 339.68 699.44 701.79 340.84 1248 1680.9 1715.2 2392.8 2310.7 1138 1384.5 1042.7 1116 1103.3 1107.9 1403.2 1276.6 1263.7 1280.6 1341 1958.1 2818.3 2469.8 938.56 1208.6 0.31654 28 | #generating data...ok. 
29 | 24.786 2680.4 1177.6 2919 2910.4 2915 2713.8 3010.4 3304.5 1586.2 1556.7 2607.1 1304.2 333.45 678.85 683.76 334.2 1178.5 1563.3 1570.5 2814.3 2661.9 1126.6 1331 1113.5 1251.1 1285.8 1304.7 1449.6 1272.1 1406 1438.1 1309.5 2374.6 2713.8 2903 918.06 1196.1 0.32648 30 | #generating data...ok. 31 | 34.165 2761.5 1280 3156.7 3285.5 3144.4 2775.1 3017.1 3607 1624 1593.8 2743 1479 338.27 697.85 702.85 339.18 1280.1 1710.7 1710.5 3226.9 3021.3 1299.4 1454.8 1318.4 1487.5 1690.7 1709.7 1552.2 1406.7 1704.3 1856.6 1584.7 2831.9 2774.4 3270.6 856.29 1173.5 0.33213 32 | #generating data...ok. 33 | 47.093 3360.1 1403.7 3697.2 3687.6 3698.3 3401 3577.1 3893.8 1660.2 1633.9 3310.9 1835.5 342.98 722.55 725.94 344.66 1399.7 2072 2066.3 3522.6 3296.4 1586.2 1796.3 1585.8 1746.1 1852.9 1895.4 2185.1 1834.5 2100.2 2103.7 1965.8 3198.7 3405.1 3693.2 863.84 1199.8 0.34141 34 | #generating data...ok. 35 | 64.913 4147.9 1473.7 4146.5 4177.7 4157.3 3331.3 3600 4189.8 1679.2 1650 3579.1 1980.4 345.1 730.24 733.62 346.25 1473.9 2179.7 2182.4 3939.8 3606.1 1821.5 1930.6 1756.3 1984.1 2227.9 2197.9 2300.4 1919.6 2440.7 2455.8 2176.6 3643 3341.4 4155.4 879.34 1217 0.33945 36 | #generating data...ok. 37 | 89.477 4181.9 1512.2 4198 4230.1 4186.6 3792 3957.6 4271 1690.3 1668.1 3702.1 2119.5 346.99 740.39 741.39 348.4 1513 2343.4 2337.8 4063.6 3771.1 1894.7 2039.4 1946.9 2162.4 2408 2536.9 2788.1 2208.1 2718.1 2776.9 2514.8 3930.7 3792 4233.4 867.56 1213.3 0.33949 38 | #generating data...ok. 39 | 123.33 5932.2 1611.8 5311.1 5867.4 5304.5 3924.9 4052.6 4758.7 1718.8 1687.4 4379.2 2942.1 348.99 750.35 753.03 350.62 1613.1 2559.9 2577.7 5586.7 4908.6 2660.2 2369.1 2702.6 2962.9 3203.7 3305.6 3158 2558.1 3315.3 3618.7 3337.4 5156.9 3951.3 5872.3 813.5 1212.3 0.34108 40 | #generating data...ok. 
41 | 170.01 4865.7 1623.1 4939.4 4904.9 4985.7 4146.4 4212.7 4645 1715 1689.1 4372.8 2537.6 349.88 754.88 755.35 351.42 1624.2 2611.1 2628.8 4555.3 4179.8 2479.5 2413.4 2323.8 2497.2 2766.4 2797.7 3501.5 2679.8 3348.2 3159.7 3004.2 4453.4 4154.6 4851.9 815.31 1218 0.34176 42 | #generating data...ok. 43 | 234.34 7252.9 1696.2 6182 7272.1 6109.1 4183.3 4273.5 5194.9 1717 1698.3 4960.7 3889 350.78 759.51 761.73 352.48 1696.1 2799.4 2806.4 6720.6 5493.4 3618.8 2648.9 3639.6 3922 4232.3 4310 3685.3 2921.5 4026.5 4589.9 4358.4 6061 4182.4 7270.4 733.93 1156.5 0.34382 44 | #generating data...ok. 45 | 323.01 8383.6 1739.9 6809.5 8384.7 6807.3 4323.3 4378.2 5334.6 1744.5 1716.3 5240.3 4568.4 351.71 764.99 766.08 353.56 1740.7 2887.2 2933.7 7470.5 6117.6 4263.4 2810.3 4292.2 4486.2 4741.8 4796.5 3984.3 3118.4 4364.6 5251.2 5026.9 6686.1 4323.3 8416 745.73 1158.4 0.33731 46 | #generating data...ok. 47 | 445.24 8177.8 1744.6 6872.8 8184.2 6876.5 4363.4 4437.4 5422.3 1742.4 1712.5 5408.4 4416.6 351.58 765.97 767.14 353.64 1745.8 2942.6 2928.1 7457.2 6234.6 4364 2828.7 4128.3 4358.6 4653.2 4595.2 4130.7 3202.7 4471.4 5013.2 4911.3 6701.8 4367 8192.7 724.91 1089.5 0.34735 48 | #generating data...ok. 49 | 613.72 10459 1772.5 7683.6 10440 7668.6 4404 4461.1 5686.1 1750.1 1718.4 5611.6 6572.8 352.17 767.71 767.88 354.02 1769.3 2992.9 2958.1 9841.2 8165.5 6200.8 2957.2 6113 6481.5 6955.1 7039 4222.3 3290 4948 7177.7 7032.7 8368.3 4408.4 10441 715.05 900.33 0.41703 50 | #generating data...ok. 51 | 845.95 12143 1791.5 8207.8 11979 8172.7 4415.2 4484.6 5778.1 1757.3 1728.8 5777 8508.3 352.39 767.55 770.32 354.37 1792.1 3088.6 3034 11764 10179 8015.6 3028.1 8045.3 8443.3 8995.5 9246.4 4292.4 3388.4 5241.5 9363.3 9060.2 9858.6 4434.1 12169 681.35 825.82 0.34106 52 | #generating data...ok. 
53 | 1166.1 14373 1809.9 8711.6 14402 8710.4 4444 4499.6 5957.2 1762.5 1717.1 5907.5 14043 352.47 770.28 770.87 354.41 1810.1 3126.8 3088.9 18757 18824 13248 3082.3 13143 13818 14851 15166 4322.6 3451 5670.2 13851 13392 14278 4457.9 14452 698.79 865.26 0.4248 54 | #generating data...ok. 55 | 1607.3 14297 1801 8791.5 14412 8787.8 4475.4 4530.8 5971 1758.1 1723.7 5976.1 13323 352.59 770.77 771.83 354.58 1810.6 3117.8 3151.1 16728 16253 12779 3130.8 12656 13175 13893 14111 4416.4 3488.9 5810.1 13608 13329 13617 4473.8 14385 683.16 812.94 0.341 56 | #generating data...ok. 57 | 2215.5 15746 1825.4 9205.4 16006 9139.9 4485 4540 5924.1 1776.6 1731.3 6044.1 17845 352.85 771.77 771.86 354.88 1825.3 3164.8 3105.7 26680 26246 18844 3152.9 17182 19842 20965 21621 4445.1 3527.3 6143.2 15362 17047 18117 4485.3 16006 700.54 833.25 0.4897 58 | #generating data...ok. 59 | 3053.9 16759 1823.4 9327.7 16935 9309.7 4510.4 4555.8 6068.8 1780.8 1737.6 6082.9 22591 353.42 773.04 773.84 355.6 1828.1 3195.9 3205.7 31332 29331 21695 3184.8 21929 22746 23949 25523 4482.4 3547.1 6204 19563 19117 19472 4508.4 16899 681.95 797.12 0.35324 60 | #generating data...ok. 61 | 4209.5 17705 1825.3 9443.7 17866 9586.2 4509 4558 6103.1 1778.9 1738 6134 33478 352.61 771.46 772.93 355.04 1832.5 3184.1 3187.5 51519 47691 34449 3173.2 32242 38124 38806 40118 4483.6 3566.2 6286.1 23551 23411 23827 4502.5 17795 680.37 785.9 0.34538 62 | #generating data...ok. 63 | 5802.4 18248 1820.6 9474.3 18446 9485.7 4517.2 4570.3 6083.3 1777.2 1735.5 6108.1 42612 352.87 772.18 773.13 354.86 1834 3203.9 3222.1 73698 71929 49324 3200.8 41736 58820 54872 59041 4502.3 3587.4 6315.9 25327 25185 24985 4496.8 18349 680.42 787.99 0.34716 64 | #generating data...ok. 
65 | 7998.1 18830 1821.5 9653.4 18960 9695 4517 4572.9 6126.1 1780.5 1744.1 6198 49081 353.47 773.34 774.26 355.73 1837.6 3206.3 3231.8 85657 83165 60571 3219.4 50338 74405 63895 70327 4514.4 3561 6290.4 27455 27527 26675 4519.6 18875 684.1 803.7 0.40267 66 | # bogus = 2851167232 67 | -------------------------------------------------------------------------------- /results/benchintersection6march2014_2.txt: -------------------------------------------------------------------------------- 1 | # howmany : 5 2 | # loop : 3 3 | # distribution : clustered 4 | # Big : 22 5 | # intersectionratio : 0.3 6 | # MaxBit : 26 7 | # size-ratio v3 thomas_v3 thomas_v3_aligned thomas_v3cmpeqbinaryflagged thomas_v3cmpeqflagged thomas_v3cmpeqscalarflagged thomas_v3cmpeqsimd32flagged thomas_v3cmpeqsimd8flagged relative-intersection-size 8 | #generating data...ok. 9 | 1.001 379.47 170.78 238.13 237.37 287.58 233.22 199.56 212.07 0.35448 10 | #generating data...ok. 11 | 1.3798 470.48 184.59 220.84 275.11 353.6 270.9 249.95 261.64 0.333 12 | #generating data...ok. 13 | 1.9019 641.32 239.66 294.75 365.62 458.63 360.54 318.18 332.03 0.34687 14 | #generating data...ok. 15 | 2.6216 732.8 295.66 334.87 411.28 540.23 403.97 381.36 393.35 0.3497 16 | #generating data...ok. 17 | 3.6136 925.98 350.7 365.9 505.32 684.04 503.27 481.33 488.09 0.37275 18 | #generating data...ok. 19 | 4.981 1201.7 491.1 554.38 650.79 881.4 644.83 633.15 637.6 0.32547 20 | #generating data...ok. 21 | 6.8659 1282.5 560.34 555.86 789.65 1008.8 780.07 751.17 764.32 0.33926 22 | #generating data...ok. 23 | 9.4639 1535.4 705.72 681.98 934.63 1235.8 919.32 932.06 947.28 0.35154 24 | #generating data...ok. 25 | 13.045 2082.7 1034.5 959.12 1252.2 1685.6 1248.8 1283.7 1285.5 0.33072 26 | #generating data...ok. 27 | 17.981 2456.2 1217.9 1074.7 1505.4 2035.2 1494.3 1580 1545.6 0.32922 28 | #generating data...ok. 29 | 24.786 2673.5 1443.5 1277.7 1706.8 2309.4 1715.6 1824.6 1754.4 0.34234 30 | #generating data...ok. 
31 | 34.165 2714.2 1714.6 1470 1974.4 2567.1 1997.2 2112 2041.6 0.35479 32 | #generating data...ok. 33 | 47.093 3785 2178.7 1960.3 2619.7 3355.1 2583.9 2703.1 2619.1 0.34748 34 | #generating data...ok. 35 | 64.913 4205.3 2478 2272.6 3040.8 3763.6 3054.3 3175.8 3092.3 0.35194 36 | #generating data...ok. 37 | 89.477 5225.5 3006.4 2909.3 3666.7 4583.1 3614.8 3707.4 3589.3 0.31662 38 | #generating data...ok. 39 | 123.33 5078.9 3183.9 2971.1 3842.4 4638.1 3715.4 3740.2 3682.2 0.34722 40 | #generating data...ok. 41 | 170.01 6270.7 3981.1 3689.9 4719.3 5576.3 4627.4 4713.7 4613.3 0.38902 42 | #generating data...ok. 43 | 234.34 7339.4 4614.1 4473.3 5424 6082.7 5304.2 5397.6 5274.6 0.3213 44 | #generating data...ok. 45 | 323.01 7765.9 4907.9 4704.8 5869.2 6362.1 5841.5 5906.6 5789 0.34363 46 | #generating data...ok. 47 | 445.24 10794 7904.7 7695.4 8897.6 9021 8866.3 9062.5 8875 0.31975 48 | #generating data...ok. 49 | 613.72 9834.9 6657.4 6523.7 7533.8 7921.1 7500.6 7585.5 7448.3 0.3427 50 | #generating data...ok. 51 | 845.95 13522 12976 12635 14433 13994 14186 14641 14263 0.33038 52 | #generating data...ok. 53 | 1166.1 14304 14953 14455 16133 16154 16083 16137 16233 0.38838 54 | #generating data...ok. 55 | 1607.3 14974 16299 16095 17608 17064 17807 17747 17647 0.33755 56 | #generating data...ok. 57 | 2215.5 16221 19194 18704 19980 18911 19917 20044 19879 0.32911 58 | #generating data...ok. 59 | 3053.9 16431 18564 18284 19581 18407 19673 19642 19430 0.33358 60 | #generating data...ok. 61 | 4209.5 17159 22816 22149 23436 22849 23586 23728 23683 0.3253 62 | #generating data...ok. 63 | 5802.4 17779 24633 24518 24867 24281 24995 25125 25215 0.45781 64 | #generating data...ok. 
65 | 7998.1 16146 27857 27054 27491 26823 27648 27795 27993 0.40267 66 | # bogus = 632999104 67 | -------------------------------------------------------------------------------- /scripts/disablehyperthreading.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Be careful to not skip the space at the beginning nor the end 4 | CPUS_TO_SKIP=" $(cat /sys/devices/system/cpu/cpu*/topology/thread_siblings_list | sed 's/[^0-9].*//' | sort | uniq | tr "\r\n" " ") " 5 | 6 | 7 | for CPU_PATH in /sys/devices/system/cpu/cpu[0-9]*; do 8 | CPU="$(echo $CPU_PATH | tr -cd "0-9")" 9 | echo "$CPUS_TO_SKIP" | grep " $CPU " > /dev/null 10 | if [ $? -ne 0 ]; then 11 | echo 0 > $CPU_PATH/online 12 | fi 13 | done 14 | 15 | egrep 'siblings|cpu cores' /proc/cpuinfo | head -2 16 | -------------------------------------------------------------------------------- /scripts/powerpolicy.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # taken from http://hbfs.wordpress.com/2013/06/18/fast-path-finding-part-ii/ 3 | # might require sudo apt-get install cpufrequtils 4 | # invoke with performance, ondemand or list (list only prints cpufreq-info and exits) 5 | # type cpufreq-info to check results, you can also verify with cat /proc/cpuinfo 6 | # enumerate found CPUs 7 | cpus=$( grep processor /proc/cpuinfo | cut -d: -f 2 ) 8 | 9 | 10 | if [ "$1" = "ondemand" ]; then 11 | echo "setting up ondemand" 12 | policy="ondemand" 13 | elif [ "$1" = "performance" ]; then 14 | echo "setting up for performance" 15 | policy="performance" 16 | elif [ "$1" = "list" ]; then 17 | cpufreq-info 18 | exit 0 19 | else 20 | echo "usage: powerpolicy.sh ondemand | performance | list" 21 | exit -1 22 | fi 23 | 24 | echo "chosen policy " $1 25 | # set governor for each CPU 26 | # 27 | for cpu in ${cpus[@]} 28 | do 29 | cpufreq-set -c $cpu -g $1 30 | done 31 | -------------------------------------------------------------------------------- 
/scripts/turboboost.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # stolen from https://github.com/DropD/fnc-simplex/blob/master/linux_turboboost.sh 3 | 4 | # you might need to run sudo apt-get install msr-tools 5 | # Toggle Turbo Boost for Ivy Bridge CPUs (should work for all newer Core) 6 | # Requires a fairly new Linux kernel (let's say 3.0+) 7 | # Written by Donjan Rodic, released for free use 8 | 9 | # check current real frequency with sudo turbostat -s -i1 10 | 11 | sudo modprobe msr 12 | 13 | # all_cores FOO 14 | # perform FOO(i) for each core i 15 | all_cores() { 16 | NPROCS=`cat /proc/cpuinfo | grep "core id" | wc -l` 17 | NPROCS=$(($NPROCS - 1)) 18 | for i in `seq 0 1 $NPROCS`; do 19 | $1 $i 20 | done 21 | } 22 | 23 | 24 | # report Turbo Boost state on core $1 25 | read_tb() { 26 | ret=`sudo rdmsr -p"$1" 0x1a0 -f 38:38` 27 | [ $ret -eq 0 ] && echo "$1": on || echo "$1": off 28 | } 29 | 30 | # enable Turbo Boost on core $1 31 | enable_tb() { 32 | sudo wrmsr -p"$1" 0x1a0 0x850089 33 | } 34 | 35 | # disable Turbo Boost on core $1 36 | disable_tb() { 37 | sudo wrmsr -p"$1" 0x1a0 0x4000850089 38 | } 39 | 40 | 41 | if [ "$1" = "on" ]; then 42 | all_cores enable_tb 43 | all_cores read_tb 44 | elif [ "$1" = "off" ]; then 45 | all_cores disable_tb 46 | all_cores read_tb 47 | elif [ "$1" = "list" ]; then 48 | all_cores read_tb 49 | else 50 | echo "usage: turboboost.sh on | off | list" 51 | fi 52 | -------------------------------------------------------------------------------- /src/benchintersection.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * This code is released under the 3 | * Apache License Version 2.0 http://www.apache.org/licenses/. 
4 | * 5 | */ 6 | 7 | 8 | #include 9 | #include "synthetic.h" 10 | #include "timer.h" 11 | #include "intersection.h" 12 | #include "intersectionfactory.h" 13 | 14 | // https://code.google.com/p/likwid/wiki/LikwidPerfCtr#Using_the_marker_API 15 | #ifdef LIKWID_MARKERS // see 'make likwidintersection' for compiler flags 16 | #include 17 | #endif 18 | 19 | /** 20 | * Goal: have the largest array count about 4M terms (this 21 | * matches our experiments), and vary the size of the 22 | * smallest array vary from 1*4M to 1/1000*4M (or so). 23 | * 24 | * Set the size of the intersection to 30% of the lesser 25 | * array. (Again, this matches our real data...) 26 | * 27 | * To match our clueweb, we use a range of values in [0,2**26). 28 | */ 29 | template 30 | pair, vector> getNaivePair(generator gen, uint32_t minlength, uint32_t Max, float sizeratio, 31 | float intersectionratio) { 32 | if (sizeratio < 1) throw runtime_error("sizeratio should be larger or equal to 1"); 33 | if (intersectionratio < 0) throw runtime_error("intersectionratio should be positive"); 34 | if (intersectionratio > 1) throw runtime_error("intersectionratio cannot be larger than 1"); 35 | const uint32_t maxlenth = static_cast(round(static_cast(minlength) * sizeratio)); 36 | if (maxlenth > Max) throw runtime_error("I can't generate an array so large in such a small range."); 37 | if (maxlenth < minlength) throw runtime_error("something went wrong, possibly an overflow."); 38 | // we basically assume that, if we do nothing, intersections are very small 39 | const uint32_t intersize = static_cast(round(static_cast(minlength) * intersectionratio)); 40 | 41 | vector inter = gen.generate(intersize, Max); 42 | vector smallest = unite(gen.generate(static_cast(minlength - inter.size()), Max), inter); 43 | vector largest = unite(gen.generate(static_cast(maxlenth - inter.size()), Max), inter); 44 | vector intersection = intersect(smallest, largest); 45 | if (largest.size() > smallest.size()) 46 | return pair, 
vector>(smallest, largest); 47 | return pair, vector>(largest, smallest); 48 | 49 | } 50 | 51 | 52 | 53 | void printusage() { 54 | #ifdef LIKWID_MARKERS 55 | cout << "example: likwid -m -C 1 -g BRANCH ./likwidintersection -u > uniform.out" << endl; 56 | #else 57 | cout << " Runs an exhaustive benchmark over a ClusterData distribution."< myschemes = allRealNames(); 74 | 75 | while ((c = getopt(argc, argv, "uns:m:R:M:S:l:hs:")) != -1) 76 | switch (c) { 77 | case 'h': 78 | printusage(); 79 | return 0; 80 | case 'S': 81 | Big = atoi(optarg); 82 | break; 83 | case 'R': 84 | intersectionratio = atof(optarg); 85 | break; 86 | case 's': 87 | myschemes.clear(); 88 | { 89 | const string codecsstr(optarg); 90 | const vector codecslst = split(codecsstr, ",:;"); 91 | for (auto i = codecslst.begin(); i != codecslst.end(); ++i) { 92 | if (realschemes.find(*i) == realschemes.end()) { 93 | cerr << " Warning!!! Warning: unrecognized: " << *i 94 | << endl; 95 | printusage(); 96 | return -1; 97 | 98 | } else { 99 | myschemes.push_back(*i); 100 | } 101 | } 102 | } 103 | break; 104 | case 'M': 105 | MaxBit = atoi(optarg); 106 | if (MaxBit < 1) { 107 | printusage(); 108 | return -1; 109 | } 110 | break; 111 | case 'm': 112 | howmany = atoi(optarg); 113 | if (howmany < 1) { 114 | printusage(); 115 | return -1; 116 | } 117 | break; 118 | case 'l': 119 | loop = atoi(optarg); 120 | if (loop < 1) { 121 | printusage(); 122 | return -1; 123 | } 124 | break; 125 | case 'u': 126 | uniform = true; 127 | break; 128 | default: 129 | printusage(); 130 | abort(); 131 | } 132 | if (howmany == 0) { 133 | howmany = 5; 134 | } 135 | cout << "# howmany : " << howmany << endl; 136 | cout << "# loop : " << loop << endl; 137 | cout << "# distribution : " << (uniform ? 
"uniform" : "clustered") << endl; 138 | cout << "# Big : " << Big << endl; 139 | cout << "# intersectionratio : " << intersectionratio << endl; 140 | cout << "# MaxBit : " << MaxBit << endl; 141 | UniformDataGenerator udg; 142 | ClusteredDataGenerator cdg; 143 | WallClockTimer z; 144 | size_t bogus = 0; 145 | vector buffer(2 * (1U << Big)); 146 | #ifdef LIKWID_MARKERS 147 | char currentMarker[64]; 148 | likwid_markerInit(); 149 | #endif 150 | 151 | cout << "# size-ratio\t"; 152 | for (string intername : myschemes) { 153 | cout << intername << "\t"; 154 | } 155 | cout << "relative-intersection-size " << endl; 156 | 157 | for (float ir = 1.001; ir <= 10000; ir = ir * sqrt(1.9)) { 158 | vector , vector>> data(howmany); 159 | uint32_t smallsize = static_cast(round(static_cast(1 << Big) / ir)); 160 | cout << "#generating data..."; 161 | cout.flush(); 162 | for (size_t k = 0; k < howmany; ++k) { 163 | data[k] = uniform ? getNaivePair(udg , smallsize, 1U << MaxBit, ir, intersectionratio) 164 | : getNaivePair(cdg , smallsize, 1U << MaxBit, ir, intersectionratio); 165 | } 166 | cout << "ok." 
<< endl; 167 | cout << ir << "\t"; 168 | float aratio = 0.0f; 169 | for (string intername : myschemes) { 170 | intersectionfunction interfnc = realschemes[intername]; 171 | size_t volume = 0; 172 | #ifdef LIKWID_MARKERS 173 | snprintf(currentMarker, sizeof(currentMarker), "%s %.2f", intername.c_str(), ir); 174 | likwid_markerStartRegion(currentMarker); 175 | #endif 176 | z.reset(); 177 | for (size_t k = 0; k < data.size(); ++k) { 178 | volume += (data[k].first.size() + data[k].second.size()) * loop; 179 | for (size_t L = 0; L < loop; ++L) { 180 | aratio = interfnc(data[k].first.data(), 181 | (data[k].first).size(), data[k].second.data(), 182 | (data[k].second).size(), buffer.data()); 183 | bogus += aratio; 184 | } 185 | } 186 | cout << setw(10) << setprecision(5) << (volume / (static_cast(z.split()))) << "\t"; 187 | #ifdef LIKWID_MARKERS 188 | likwid_markerStopRegion(currentMarker); 189 | #endif 190 | } 191 | cout << "\t\t" << aratio / smallsize; 192 | cout << endl; 193 | 194 | } 195 | #ifdef LIKWID_MARKERS 196 | likwid_markerClose(); 197 | #endif 198 | 199 | cout << "# bogus = " << bogus << endl; 200 | } 201 | -------------------------------------------------------------------------------- /src/getmatrix.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Algorithm 3: 1:1 2:1 3:1 5:1 10:1 20:1 40:1 80:1 200:1 500:1 1000:1 3 | 100% xx xx xx ... 4 | 80% xx xx xx ... 5 | 60% 6 | 20% 7 | 10% 8 | 5% 9 | 1% 10 | */ 11 | 12 | #include "common.h" 13 | #include "intersectionfactory.h" 14 | #include "timer.h" 15 | #include "synthetic.h" 16 | #include "util.h" 17 | 18 | void printusage() { 19 | cout << " Try ./getmatrix -s scalarnate" << endl; 20 | cout << " Use the -s flag to specify some scheme, choose from: " << endl; 21 | for(string x : allRealNames()) cout <<" "<< x << endl; 22 | cout 23 | << " The -M flag allows you to specific the range in bits (default 31)." 
24 | << endl; 25 | cout 26 | << " The -S flag allows you to specific the log. of the minimal array size (default 10)." 27 | << endl; 28 | } 29 | 30 | int main(int argc, char **argv) { 31 | size_t howmany = 0; 32 | size_t loop = 10; 33 | uint32_t S = 10; 34 | string name; 35 | intersectionfunction myscheme = NULL; 36 | uint32_t MaxBit = 31; 37 | int c; 38 | while ((c = getopt(argc, argv, "ns:m:M:S:l:r:h")) != -1) 39 | switch (c) { 40 | case 'h': 41 | printusage(); 42 | return 0; 43 | case 'S': 44 | S = atoi(optarg); 45 | break; 46 | case 'M': 47 | MaxBit = atoi(optarg); 48 | if (MaxBit < 1) { 49 | printusage(); 50 | return -1; 51 | } 52 | break; 53 | case 'm': 54 | howmany = atoi(optarg); 55 | if (howmany < 1) { 56 | printusage(); 57 | return -1; 58 | } 59 | break; 60 | case 's': 61 | name = optarg; 62 | if (realschemes.find(name) == realschemes.end()) { 63 | cerr << " Warning!!! Warning: unrecognized: " << name << endl; 64 | printusage(); 65 | return -1; 66 | 67 | } else { 68 | 69 | myscheme = realschemes.find(name)->second; 70 | } 71 | break; 72 | case 'l': 73 | loop = atoi(optarg); 74 | if (loop < 1) { 75 | printusage(); 76 | return -1; 77 | } 78 | break; 79 | default: 80 | abort(); 81 | } 82 | if (howmany == 0) { 83 | howmany = 5; 84 | } 85 | if(myscheme == NULL) { 86 | printusage(); 87 | return -1; 88 | } 89 | const uint32_t minlength = 1U << S; 90 | cout << "########### Intersection benchmark ###########" << endl; 91 | 92 | cout << "# speeds in mis" << endl; 93 | cout << "# columns are size ratios" << endl; 94 | cout << "# rows are intersection ratios" << endl; 95 | cout << "# average gaps in bits for smallest array: " << std::setprecision( 96 | 3) << log(1 + (1U << MaxBit) * 1.0 / minlength) << " (use -S and -M flag to change)"<< endl; 97 | #ifdef __INTEL_COMPILER 98 | // Intel's support for C++ sucks 99 | vector intersectionsratios; 100 | intersectionsratios.push_back(1.00); 101 | intersectionsratios.push_back(0.80); 102 | 
intersectionsratios.push_back(0.60); 103 | intersectionsratios.push_back(0.20); 104 | intersectionsratios.push_back(0.10); 105 | intersectionsratios.push_back(0.05); 106 | intersectionsratios.push_back(0.01); 107 | vector < uint32_t > sizeratios; 108 | sizeratios.push_back(1); 109 | sizeratios.push_back(2); 110 | sizeratios.push_back(3); 111 | sizeratios.push_back(5); 112 | sizeratios.push_back(10); 113 | sizeratios.push_back(20); 114 | sizeratios.push_back(40); 115 | sizeratios.push_back(80); 116 | sizeratios.push_back(200); 117 | sizeratios.push_back(500); 118 | sizeratios.push_back(1000); 119 | #else 120 | // proper C++ 121 | vector intersectionsratios = { 1.00, 0.80, 0.60, 0.20, 0.10, 0.05, 122 | 0.01 }; 123 | vector < uint32_t > sizeratios = {1, 2, 3, 5, 10, 20,40,80,200,500,1000}; 124 | #endif 125 | cout<<"# average gaps in bits for last largest array: "< buffer((sr*minlength + 15)/16*16); 145 | vector < 146 | pair< 147 | vector, vector 148 | > 149 | > data(howmany); 150 | for(size_t k = 0; k < howmany; ++k) 151 | data[k] = getPair(cdg, minlength,1U<(sr), ir); 152 | size_t volume = 0; 153 | z.reset(); 154 | for (size_t L = 0; L < loop; ++L) { 155 | /* timed region: bind each pair by const reference so that no vector copy is measured as part of the scheme */ 156 | for (const auto &x : data) { 157 | volume += (x.first).size(); 158 | volume += (x.second).size(); 159 | bogus 160 | += myscheme(&(x.first)[0], 161 | (x.first).size(), &(x.second)[0], 162 | (x.second).size(),&buffer[0]); 163 | } 164 | 165 | } 166 | time = z.split(); 167 | cout << std::setprecision(4) << static_cast(volume) / static_cast(time) << "\t"; 168 | cout.flush(); 169 | 170 | 171 | } 172 | cout< myschemes(realschemes); 33 | int c; 34 | while ((c = getopt(argc, argv, "uns:S:m:l:r:hk:")) != -1) 35 | switch (c) { 36 | case 'k': 37 | skipping = true; 38 | skipgaplog = atoi(optarg); 39 | if ((skipgaplog < 1) or (skipgaplog > 31)) { /* validate the -k value that was just parsed, as the message below describes */ 40 | cerr<<"Skip param needs to be within [1,31]."< 31)) { 60 | printusage(); 61 | return -1; 62 | } 63 | break; 64 | case 'm': 65 | howmany = atoi(optarg); 66 | if (howmany < 1) { 67 | printusage(); 68 | 
return -1; 69 | } 70 | break; 71 | case 's': 72 | myschemes.clear(); 73 | { 74 | const string codecsstr(optarg); 75 | const vector codecslst = split(codecsstr, ",:;"); 76 | for (auto i = codecslst.begin(); i != codecslst.end(); ++i) { 77 | if (realschemes.find(*i) == realschemes.end()) { 78 | cerr << " Warning!!! Warning: unrecognized: " << *i 79 | << endl; 80 | printusage(); 81 | return -1; 82 | 83 | } else { 84 | const auto K = realschemes.find(*i); 85 | const std::string name = K->first; 86 | const intersectionfunction fn = K->second; 87 | myschemes[name] = fn; 88 | } 89 | } 90 | } 91 | break; 92 | case 'l': 93 | loop = atoi(optarg); 94 | if (loop < 1) { 95 | printusage(); 96 | return -1; 97 | } 98 | break; 99 | case 'r': 100 | ratio = atoi(optarg); 101 | if (ratio < 1) { 102 | printusage(); 103 | return -1; 104 | } 105 | break; 106 | default: 107 | abort(); 108 | } 109 | if (howmany == 0) { 110 | if (natemode) 111 | howmany = 1; 112 | else 113 | howmany = 20; 114 | } 115 | uint32_t MaxBit = 31; 116 | if (natemode) { 117 | MaxBit = S + 7; 118 | if (MaxBit > 31) 119 | MaxBit = 31; 120 | } 121 | cout << "# algo: "; 122 | for(auto algo : myschemes) { 123 | cout << algo.first<< " "; 124 | } 125 | cout << endl; 126 | const uint32_t N = 1U << S; 127 | if (!natemode) 128 | if (ratio != 1) { 129 | cout << "# ratio = " << ratio << endl; 130 | cout << "# size of largest array = " << N << endl; 131 | cout << "# size of smallest array = " << N / ratio << endl; 132 | } else { 133 | cout << "# size of arrays = " << N << endl; 134 | } 135 | else 136 | cout << "# in natemode" << endl; 137 | 138 | ClusteredDataGenerator cdg; 139 | WallClockTimer z; 140 | size_t bogus = 0; 141 | size_t volume = 0; 142 | uint64_t time = 0; 143 | cout << "# first column is relative size of intersection" << endl; 144 | if (ratio > 1) { 145 | cout 146 | << "# next two are estimated average bits per int for differential coding" 147 | << endl; 148 | } else { 149 | cout 150 | << "# next is estimated 
average bits per int for differential coding" 151 | << endl; 152 | } 153 | 154 | cout 155 | << "# other columns display speed in mis when computing the intersection" 156 | << endl; 157 | for (uint32_t gap = 0; gap + S <= MaxBit; gap += 1) { 158 | vector < vector > data; 159 | for (size_t zz = 0; zz < howmany; ++zz) { 160 | if (natemode) { 161 | data.push_back( 162 | cdg.generateClustered((1U << (MaxBit - gap)) / ratio, 163 | 1U << MaxBit)); 164 | data.push_back( 165 | cdg.generateClustered((1U << (MaxBit - gap)), 166 | 1U << MaxBit)); 167 | } else { 168 | data.push_back( 169 | cdg.generateClustered(N / ratio, 1U << (gap + S))); 170 | data.push_back(cdg.generateClustered(N, 1U << (gap + S))); 171 | } 172 | } 173 | size_t intersize = 0; 174 | size_t smallestsize = 0; 175 | for (size_t k = 0; k < howmany; k++) { 176 | intersize += classicalintersectioncardinality(&data[2 * k][0], 177 | data[2 * k].size(), &data[2 * k + 1][0], 178 | data[2 * k + 1].size()); 179 | smallestsize 180 | += data[2 * k + 1].size() < data[2 * k].size() ? 
data[2 * k 181 | + 1].size() : data[2 * k].size(); 182 | } 183 | 184 | cout << std::fixed << std::setprecision(3) 185 | << static_cast (intersize) 186 | / static_cast (smallestsize) << "\t"; 187 | cout.flush(); 188 | if (ratio > 1) { 189 | if (natemode) { 190 | cout << log( 191 | 1 + (1U << MaxBit) * 1.0 / ((1U << (MaxBit - gap)) 192 | / ratio)) / log(2) << "\t"; 193 | cout 194 | << log( 195 | 1 + (1U << MaxBit) * 1.0 / ((1U << (MaxBit 196 | - gap)))) / log(2) << "\t"; 197 | } else { 198 | cout << log(1 + (1U << (gap + S)) * 1.0 / (N / ratio)) / log(2) 199 | << "\t"; 200 | cout << log(1 + (1U << (gap + S)) * 1.0 / (N)) / log(2) << "\t"; 201 | } 202 | } else { 203 | if (natemode) { 204 | cout 205 | << log( 206 | 1 + (1U << MaxBit) * 1.0 / ((1U << (MaxBit 207 | - gap)))) / log(2) << "\t"; 208 | } else { 209 | cout << log(1 + (1U << (gap + S)) * 1.0 / (N)) / log(2) << "\t"; 210 | } 211 | 212 | } 213 | cout << "\t"; 214 | cout.flush(); 215 | size_t maxsize = 0; 216 | for(auto x : data) 217 | if(x.size() > maxsize) maxsize = x.size(); 218 | vector < uint32_t > buffer((maxsize + 15) / 16 * 16); 219 | /** 220 | * Skipping is a standard technique in IR. We test it here. 221 | */ 222 | if (skipping) { 223 | vector < shared_ptr > sdata; 224 | for(vector & x : data) 225 | sdata.emplace_back(shared_ptr(new Skipping(skipgaplog,x.data(),static_cast(x.size())))); 226 | for (size_t k = 0; k < 2 * howmany; k += 2) { 227 | vector < uint32_t > out(buffer.size()); 228 | size_t correctanswer = classicalintersection(&data[k][0], 229 | data[k].size(), &data[k + 1][0], data[k + 1].size(), 230 | &out[0]); 231 | out.resize(correctanswer); 232 | vector < uint32_t > out2(buffer.size()); 233 | size_t thisschemesanswer = sdata[k]->intersect(*sdata[k+1],&out2[0]); 234 | out2.resize(thisschemesanswer); 235 | if (out != out2) { 236 | if (thisschemesanswer != correctanswer) { 237 | cerr << "expecting cardinality of " << correctanswer; 238 | cerr << " got " << thisschemesanswer << "." 
<< endl; 239 | int times = 0; 240 | for(size_t jj=0; (jj < thisschemesanswer)&&(jjintersect(*sdata[k+1],&buffer[0]); 267 | } 268 | 269 | } 270 | time = z.split(); 271 | cout << std::setprecision(0) << static_cast(volume) / static_cast(time) << "\t"; 272 | cout.flush(); 273 | } 274 | /** 275 | * End of Skipping 276 | */ 277 | for(auto algo : myschemes) { 278 | if (safe and buggyschemes.find(algo.first) == buggyschemes.end() ) 279 | for (size_t k = 0; k < 2 * howmany; k += 2) { 280 | vector out(buffer.size()); 281 | size_t correctanswer = classicalintersection( 282 | &data[k][0], data[k].size(), &data[k + 1][0], 283 | data[k + 1].size(),&out[0]); 284 | out.resize(correctanswer); 285 | vector out2(buffer.size()); 286 | size_t thisschemesanswer = algo.second( 287 | &data[k][0], data[k].size(), &data[k + 1][0], 288 | data[k + 1].size(),&out2[0]); 289 | out2.resize(thisschemesanswer); 290 | if (out != out2) { 291 | if(thisschemesanswer != correctanswer) { 292 | cerr << "expecting cardinality of " << correctanswer; 293 | cerr << " got " << thisschemesanswer << "." 294 | << endl; 295 | if(correctanswer < 10) 296 | for(uint32_t x : out) 297 | cerr<(volume) / static_cast(time) << "\t"; 325 | cout.flush(); 326 | } 327 | 328 | cout << endl; 329 | 330 | } 331 | cout << "# bogus = " << bogus << endl; 332 | } 333 | -------------------------------------------------------------------------------- /src/testintersection.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * This code is released under the 3 | * Apache License Version 2.0 http://www.apache.org/licenses/. 
4 | * 5 | * (c) Daniel Lemire, http://lemire.me/en/ 6 | */ 7 | #include "intersectionfactory.h" 8 | #include "common.h" 9 | #include "timer.h" 10 | #include "synthetic.h" 11 | #include "util.h" 12 | 13 | 14 | void printusage() { 15 | cout << " Try ./testintersection -r 40" << endl; 16 | cout << " Use the -s flag to specify just some scheme, choose from: " 17 | << endl; 18 | for(string x : allNames()) cout <<" "<< x << endl; 19 | cout << " Separate the schemes by a comma (e.g. -s schlegel,danscalar). "<< endl; 20 | } 21 | 22 | int main(int argc, char **argv) { 23 | size_t howmany = 0; 24 | bool natemode = false; 25 | bool safe = true; 26 | size_t loop = 1000; 27 | uint32_t S = 12; 28 | uint32_t ratio = 1; 29 | map myschemes(schemes); 30 | map mypartschemes(partschemes); 31 | int c; 32 | while ((c = getopt(argc, argv, "uns:S:m:l:r:h")) != -1) 33 | switch (c) { 34 | case 'u': 35 | safe = false; 36 | break; 37 | 38 | case 'n': 39 | natemode = true; 40 | break; 41 | case 'h': 42 | printusage(); 43 | return 0; 44 | case 'S': 45 | S = atoi(optarg); 46 | if ((S < 1) or (S > 31)) { 47 | printusage(); 48 | return -1; 49 | } 50 | break; 51 | case 'm': 52 | howmany = atoi(optarg); 53 | if (howmany < 1) { 54 | printusage(); 55 | return -1; 56 | } 57 | break; 58 | case 's': 59 | myschemes.clear(); 60 | mypartschemes.clear(); 61 | { 62 | const string codecsstr(optarg); 63 | const vector < string > codecslst = split(codecsstr, ",:;"); 64 | for (auto i = codecslst.begin(); i != codecslst.end(); ++i) { 65 | if (schemes.find(*i) == schemes.end()) { 66 | if (partschemes.find(*i) == partschemes.end()) { 67 | cerr << " Warning!!! 
Warning: unrecognized: " << *i 68 | << endl; 69 | printusage(); 70 | return -1; 71 | } else { 72 | const auto K = partschemes.find(*i); 73 | const std::string name = K->first; 74 | const cardinalityintersectionfunctionpart fn = K->second; 75 | mypartschemes[name] = fn; 76 | } 77 | } else { 78 | const auto K = schemes.find(*i); 79 | const std::string name = K->first; 80 | const cardinalityintersectionfunction fn = K->second; 81 | myschemes[name] = fn; 82 | } 83 | } 84 | } 85 | break; 86 | case 'l': 87 | loop = atoi(optarg); 88 | if (loop < 1) { 89 | printusage(); 90 | return -1; 91 | } 92 | break; 93 | case 'r': 94 | ratio = atoi(optarg); 95 | if (ratio < 1) { 96 | printusage(); 97 | return -1; 98 | } 99 | break; 100 | default: 101 | abort(); 102 | } 103 | if (howmany == 0) { 104 | if (natemode) 105 | howmany = 1; 106 | else 107 | howmany = 20; 108 | } 109 | uint32_t MaxBit = 31; 110 | if (natemode) { 111 | MaxBit = S + 7; 112 | if (MaxBit > 31) 113 | MaxBit = 31; 114 | } 115 | cout<<"# algo: "; 116 | for(auto algo : myschemes) { 117 | cout << algo.first<< " "; 118 | } 119 | for(auto algo : mypartschemes) { 120 | cout << algo.first<< " "; 121 | } 122 | cout << endl; 123 | const uint32_t N = 1U << S; 124 | if (!natemode) 125 | if (ratio != 1) { 126 | cout << "# ratio = " << ratio << endl; 127 | cout << "# size of largest array = " << N << endl; 128 | cout << "# size of smallest array = " << N / ratio << endl; 129 | } else { 130 | cout << "# size of arrays = " << N << endl; 131 | } 132 | else 133 | cout << "# in natemode" << endl; 134 | 135 | ClusteredDataGenerator cdg; 136 | WallClockTimer z; 137 | size_t bogus = 0; 138 | size_t volume = 0; 139 | uint64_t time = 0; 140 | cout << "# first column is relative size of intersection" << endl; 141 | if (ratio > 1) { 142 | cout 143 | << "# next two are estimated average bits per int for differential coding" 144 | << endl; 145 | } else { 146 | cout 147 | << "# next is estimated average bits per int for differential coding" 
148 | << endl; 149 | } 150 | 151 | cout 152 | << "# other columns display speed in mis when computing the cardinality of the intersection" 153 | << endl; 154 | for (uint32_t gap = 0; gap + S <= MaxBit; gap += 1) { 155 | vector < vector > data; 156 | for (size_t zz = 0; zz < howmany; ++zz) { 157 | if (natemode) { 158 | data.push_back( 159 | cdg.generateClustered((1U << (MaxBit - gap)) / ratio, 160 | 1U << MaxBit)); 161 | data.push_back( 162 | cdg.generateClustered((1U << (MaxBit - gap)), 163 | 1U << MaxBit)); 164 | } else { 165 | data.push_back( 166 | cdg.generateClustered(N / ratio, 1U << (gap + S))); 167 | data.push_back(cdg.generateClustered(N, 1U << (gap + S))); 168 | } 169 | } 170 | size_t intersize = 0; 171 | size_t smallestsize = 0; 172 | for (size_t k = 0; k < howmany; k++) { 173 | intersize += classicalintersectioncardinality(&data[2 * k][0], 174 | data[2 * k].size(), &data[2 * k + 1][0], 175 | data[2 * k + 1].size()); 176 | smallestsize 177 | += data[2 * k + 1].size() < data[2 * k].size() ? 
data[2 * k 178 | + 1].size() : data[2 * k].size(); 179 | } 180 | 181 | cout << std::fixed << std::setprecision(3) << static_cast(intersize) 182 | / static_cast(smallestsize) << "\t"; 183 | cout.flush(); 184 | if (ratio > 1) { 185 | if (natemode) { 186 | cout << log( 187 | 1 + (1U << MaxBit) * 1.0 / ((1U << (MaxBit - gap)) 188 | / ratio)) / log(2) << "\t"; 189 | cout 190 | << log( 191 | 1 + (1U << MaxBit) * 1.0 / ((1U << (MaxBit 192 | - gap)))) / log(2) << "\t"; 193 | } else { 194 | cout << log(1 + (1U << (gap + S)) * 1.0 / (N / ratio)) / log(2) 195 | << "\t"; 196 | cout << log(1 + (1U << (gap + S)) * 1.0 / (N)) / log(2) << "\t"; 197 | } 198 | } else { 199 | if (natemode) { 200 | cout 201 | << log( 202 | 1 + (1U << MaxBit) * 1.0 / ((1U << (MaxBit 203 | - gap)))) / log(2) << "\t"; 204 | } else { 205 | cout << log(1 + (1U << (gap + S)) * 1.0 / (N)) / log(2) << "\t"; 206 | } 207 | 208 | } 209 | cout << "\t"; 210 | cout.flush(); 211 | for(auto algo : myschemes) { 212 | if (safe and buggyschemes.find(algo.first) == buggyschemes.end() ) 213 | for (size_t k = 0; k < 2 * howmany; k += 2) { 214 | size_t correctanswer = classicalintersectioncardinality( 215 | &data[k][0], data[k].size(), &data[k + 1][0], 216 | data[k + 1].size()); 217 | size_t thisschemesanswer = algo.second( 218 | &data[k][0], data[k].size(), &data[k + 1][0], 219 | data[k + 1].size()); 220 | if (correctanswer != thisschemesanswer) { 221 | cerr << "expecting cardinality of " << correctanswer; 222 | cerr << " got " << thisschemesanswer << " instead." 
223 | << endl; 224 | throw runtime_error("bug"); 225 | } 226 | } 227 | volume = 0; 228 | z.reset(); 229 | for (size_t L = 0; L < loop; ++L) { 230 | 231 | for (size_t k = 0; k < 2 * howmany; k += 2) { 232 | volume += data[k].size(); 233 | volume += data[k + 1].size(); 234 | bogus 235 | += algo.second(&data[k][0], 236 | data[k].size(), &data[k + 1][0], 237 | data[k + 1].size()); 238 | } 239 | 240 | } 241 | time = z.split(); 242 | cout << std::setprecision(0) << static_cast(volume) / static_cast(time) << "\t"; 243 | cout.flush(); 244 | } 245 | 246 | /** 247 | * From Schlegel et al., Fast Sorted-Set Intersection using SIMD Instructions 248 | */ 249 | 250 | vector < vector > pdata(data.size()); 251 | assert(pdata.size() == 2 * howmany); 252 | for (size_t zz = 0; zz < data.size(); ++zz) { 253 | pdata[zz].resize(data[zz].size() * 4); 254 | const size_t c = partitioned::partition(&data[zz][0], 255 | data[zz].size(), &pdata[zz][0], pdata[zz].size()); 256 | pdata[zz].resize(c); 257 | vector (pdata[zz]).swap(pdata[zz]); 258 | assert(pdata[zz].size() == c); 259 | 260 | if (safe) { 261 | vector < uint32_t > testvec(data[zz].size()); 262 | size_t recovsize = partitioned::inverse_partition(&testvec[0], 263 | testvec.size(), &pdata[zz][0], pdata[zz].size()); 264 | if (testvec.size() != recovsize) 265 | throw std::runtime_error("bug"); 266 | if (testvec != data[zz]) 267 | throw std::runtime_error("bug"); 268 | } 269 | 270 | } 271 | cout << "\t"; 272 | //for (uint32_t whichalgo = 0; whichalgo < HOWMANYPARTALGO; ++whichalgo) { 273 | for(auto algo : mypartschemes) { 274 | volume = 0; 275 | if (safe) 276 | for (size_t k = 0; k < 2 * howmany; k += 2) { 277 | size_t correctanswer = classicalintersectioncardinality( 278 | &data[k][0], data[k].size(), &data[k + 1][0], 279 | data[k + 1].size()); 280 | size_t thisschemesanswer = algo.second( 281 | &pdata[k][0], &pdata[k + 1][0], pdata[k].size(), 282 | pdata[k + 1].size()); 283 | if (correctanswer != thisschemesanswer) { 284 | cerr << 
"expecting cardinality of " << correctanswer; 285 | cerr << " got " << thisschemesanswer << " instead." 286 | << endl; 287 | throw runtime_error("bug"); 288 | } 289 | 290 | } 291 | 292 | z.reset(); 293 | 294 | for (size_t k = 0; k < 2 * howmany; k += 2) { 295 | volume += data[k].size(); 296 | volume += data[k + 1].size(); 297 | 298 | bogus += algo.second(&pdata[k][0], 299 | &pdata[k + 1][0], pdata[k].size(), pdata[k + 1].size()); 300 | } 301 | 302 | time = z.split(); 303 | cout << std::setprecision(0) << static_cast(volume) / static_cast(time) << "\t"; 304 | 305 | cout.flush(); 306 | } 307 | cout << endl; 308 | 309 | } 310 | cout << "# bogus = " << bogus << endl; 311 | } 312 | -------------------------------------------------------------------------------- /src/unit.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * This is not the totality of our testing, but rather, this is meant to include 3 | * very specific tests. 4 | * 5 | * See testintersection.cpp for more general tests. 
6 | * 7 | */ 8 | 9 | // todo: add tests on large numbers 10 | 11 | #include "common.h" 12 | #include "intersectionfactory.h" 13 | 14 | int test2(intersectionfunction f) { 15 | const uint32_t firstpost[5] = { 12635, 12921, 12923, 12924, 16 | 12926}; 17 | 18 | const uint32_t secondpost[173] = { 3756, 11996, 12044, 12049, 12109, 12128, 19 | 12131, 12141, 12142, 12150, 12154, 12160, 12167, 12168, 12172, 20 | 12177, 12201, 12208, 12215, 12216, 12223, 12228, 12232, 12233, 21 | 12234, 12235, 12236, 12240, 12241, 12242, 12243, 12254, 12255, 22 | 12256, 12257, 12259, 12260, 12261, 12262, 12264, 12265, 12266, 23 | 12275, 12295, 12471, 12482, 12486, 12508, 12509, 12510, 12511, 24 | 12512, 12530, 12536, 12572, 12573, 12589, 12607, 12609, 12611, 25 | 12630, 12631, 12632, 12633, 12634, 12635, 12636, 12653, 12655, 26 | 12657, 12668, 12672, 12685, 12702, 12716, 12721, 12741, 12745, 27 | 12750, 12755, 12757, 12761, 12765, 12767, 12768, 12794, 12802, 28 | 12803, 12823, 12842, 12851, 12871, 12891, 12893, 12894, 12895, 29 | 12896, 12897, 12915, 12917, 12918, 12919, 12920, 12921, 12922, 30 | 12923, 12924, 12925, 12927, 12929, 12932, 12933, 12934, 12935, 31 | 12936, 12937, 12938, 12939, 12942, 12946, 12951, 12955, 12963, 32 | 12972, 13011, 13013, 13014, 13015, 13017, 13032, 13033, 13036, 33 | 13042, 13050, 13051, 13052, 13057, 13058, 13060, 13090, 13120, 34 | 13132, 13136, 13147, 13185, 13191, 13192, 13193, 13194, 13195, 35 | 13198, 13202, 13205, 13219, 13228, 13230, 13232, 13233, 13238, 36 | 13240, 13246, 13248, 13277, 13278, 13281, 13282, 13283, 13284, 37 | 13291, 13320, 13338, 13346, 13347 }; 38 | vector < uint32_t > inter(173); 39 | size_t s = f(firstpost, 5, secondpost, 173, inter.data()); 40 | inter.resize(s); 41 | vector < uint32_t > correct(173); 42 | size_t cs = classicalintersection(firstpost, 5, secondpost, 173, 43 | correct.data()); 44 | correct.resize(cs); 45 | if (inter != correct) { 46 | cout << inter.size() << " " << correct.size() << endl; 47 | cout<<" correct answer:"< 
inter(13); 65 | size_t s = f(firstpost, 13, secondpost, 13, inter.data()); 66 | inter.resize(s); 67 | vector < uint32_t > correct(13); 68 | size_t cs = classicalintersection(firstpost, 13, secondpost, 13, 69 | correct.data()); 70 | correct.resize(cs); 71 | if (inter != correct) { 72 | cout << inter.size() << " " << correct.size() << endl; 73 | for (size_t i = 0; (i < inter.size()) && (i < correct.size()); ++i) 74 | cout << i << " " << inter[i] << " " << correct[i] << endl; 75 | return 1; 76 | } 77 | if (!testwriteback) 78 | return 0; 79 | vector < uint32_t > inter2(firstpost, firstpost + 13); 80 | size_t s2 = f(inter2.data(), 13, secondpost, 13, inter2.data()); 81 | inter2.resize(s2); 82 | if (inter2 != correct) 83 | return 2; 84 | return 0; 85 | 86 | } 87 | 88 | int test3(intersectionfunction f) { 89 | 90 | vector firstpost; 91 | vector secondpost; 92 | vector trueinter; 93 | 94 | for(uint32_t i = 10; i < 31;++i) { 95 | firstpost.push_back((1U< inter(firstpost.size()); 103 | size_t s = f(firstpost.data(), firstpost.size(), secondpost.data(), secondpost.size(), inter.data()); 104 | inter.resize(s); 105 | if(inter != trueinter) { 106 | cout << inter.size() << " " << trueinter.size() << endl; 107 | for (size_t i = 0; (i < inter.size()) && (i < trueinter.size()); ++i) 108 | cout << i << " " << inter[i] << " " << trueinter[i] << endl; 109 | return 1; 110 | 111 | return 1; 112 | } 113 | return 0; 114 | } 115 | 116 | 117 | int main() { 118 | int error = 0; 119 | for (string n : allRealNames()) { 120 | cout<<"testing "<