├── .travis.yml
├── LICENSE
├── Makefile
├── README.md
├── TODO.md
├── include
    ├── binarysearchintersection.h
    ├── branchless.h
    ├── common.h
    ├── gallopingintersection.h
    ├── hscalableintersection.h
    ├── hybridintersection.h
    ├── inoueetal.h
    ├── intersection.h
    ├── intersectionfactory.h
    ├── match.h
    ├── mediumintersection.h
    ├── mersenne.h
    ├── multiSetIntersection.hpp
    ├── partitionedintersection.h
    ├── skipping.h
    ├── stlutil.h
    ├── synthetic.h
    ├── tetzank.h
    ├── thomaswu.h
    ├── timer.h
    ├── union.h
    ├── util.h
    └── widevectorintersection.h
├── results
    ├── benchintersection5march2014.gnuplot
    ├── benchintersection5march2014.txt
    ├── benchintersection6march2014.gnuplot
    ├── benchintersection6march2014.txt
    └── benchintersection6march2014_2.txt
├── scripts
    ├── disablehyperthreading.sh
    ├── powerpolicy.sh
    └── turboboost.sh
└── src
    ├── benchintersection.cpp
    ├── getmatrix.cpp
    ├── intersection.cpp
    ├── match.cpp
    ├── multiSetIntersection.cpp
    ├── realintersection.cpp
    ├── testintersection.cpp
    ├── thomaswu.cpp
    └── unit.cpp


/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: c++
 2 | sudo: false
 3 | compiler:
 4 |   - clang++
 5 | 
 6 | branches:
 7 |   only:
 8 |     - master
 9 | 
10 | script: make unit && ./unit
11 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | Apache License
  2 | Version 2.0, January 2004
  3 | http://www.apache.org/licenses/
  4 | 
  5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 | 1. Definitions.
  8 | 
  9 | "License" shall mean the terms and conditions for use, reproduction, and
 10 | distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 | "Licensor" shall mean the copyright owner or entity authorized by the copyright
 13 | owner that is granting the License.
 14 | 
 15 | "Legal Entity" shall mean the union of the acting entity and all other entities
 16 | that control, are controlled by, or are under common control with that entity.
 17 | For the purposes of this definition, "control" means (i) the power, direct or
 18 | indirect, to cause the direction or management of such entity, whether by
 19 | contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the
 20 | outstanding shares, or (iii) beneficial ownership of such entity.
 21 | 
 22 | "You" (or "Your") shall mean an individual or Legal Entity exercising
 23 | permissions granted by this License.
 24 | 
 25 | "Source" form shall mean the preferred form for making modifications, including
 26 | but not limited to software source code, documentation source, and configuration
 27 | files.
 28 | 
 29 | "Object" form shall mean any form resulting from mechanical transformation or
 30 | translation of a Source form, including but not limited to compiled object code,
 31 | generated documentation, and conversions to other media types.
 32 | 
 33 | "Work" shall mean the work of authorship, whether in Source or Object form, made
 34 | available under the License, as indicated by a copyright notice that is included
 35 | in or attached to the work (an example is provided in the Appendix below).
 36 | 
 37 | "Derivative Works" shall mean any work, whether in Source or Object form, that
 38 | is based on (or derived from) the Work and for which the editorial revisions,
 39 | annotations, elaborations, or other modifications represent, as a whole, an
 40 | original work of authorship. For the purposes of this License, Derivative Works
 41 | shall not include works that remain separable from, or merely link (or bind by
 42 | name) to the interfaces of, the Work and Derivative Works thereof.
 43 | 
 44 | "Contribution" shall mean any work of authorship, including the original version
 45 | of the Work and any modifications or additions to that Work or Derivative Works
 46 | thereof, that is intentionally submitted to Licensor for inclusion in the Work
 47 | by the copyright owner or by an individual or Legal Entity authorized to submit
 48 | on behalf of the copyright owner. For the purposes of this definition,
 49 | "submitted" means any form of electronic, verbal, or written communication sent
 50 | to the Licensor or its representatives, including but not limited to
 51 | communication on electronic mailing lists, source code control systems, and
 52 | issue tracking systems that are managed by, or on behalf of, the Licensor for
 53 | the purpose of discussing and improving the Work, but excluding communication
 54 | that is conspicuously marked or otherwise designated in writing by the copyright
 55 | owner as "Not a Contribution."
 56 | 
 57 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf
 58 | of whom a Contribution has been received by Licensor and subsequently
 59 | incorporated within the Work.
 60 | 
 61 | 2. Grant of Copyright License.
 62 | 
 63 | Subject to the terms and conditions of this License, each Contributor hereby
 64 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
 65 | irrevocable copyright license to reproduce, prepare Derivative Works of,
 66 | publicly display, publicly perform, sublicense, and distribute the Work and such
 67 | Derivative Works in Source or Object form.
 68 | 
 69 | 3. Grant of Patent License.
 70 | 
 71 | Subject to the terms and conditions of this License, each Contributor hereby
 72 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
 73 | irrevocable (except as stated in this section) patent license to make, have
 74 | made, use, offer to sell, sell, import, and otherwise transfer the Work, where
 75 | such license applies only to those patent claims licensable by such Contributor
 76 | that are necessarily infringed by their Contribution(s) alone or by combination
 77 | of their Contribution(s) with the Work to which such Contribution(s) was
 78 | submitted. If You institute patent litigation against any entity (including a
 79 | cross-claim or counterclaim in a lawsuit) alleging that the Work or a
 80 | Contribution incorporated within the Work constitutes direct or contributory
 81 | patent infringement, then any patent licenses granted to You under this License
 82 | for that Work shall terminate as of the date such litigation is filed.
 83 | 
 84 | 4. Redistribution.
 85 | 
 86 | You may reproduce and distribute copies of the Work or Derivative Works thereof
 87 | in any medium, with or without modifications, and in Source or Object form,
 88 | provided that You meet the following conditions:
 89 | 
 90 | You must give any other recipients of the Work or Derivative Works a copy of
 91 | this License; and
 92 | You must cause any modified files to carry prominent notices stating that You
 93 | changed the files; and
 94 | You must retain, in the Source form of any Derivative Works that You distribute,
 95 | all copyright, patent, trademark, and attribution notices from the Source form
 96 | of the Work, excluding those notices that do not pertain to any part of the
 97 | Derivative Works; and
 98 | If the Work includes a "NOTICE" text file as part of its distribution, then any
 99 | Derivative Works that You distribute must include a readable copy of the
100 | attribution notices contained within such NOTICE file, excluding those notices
101 | that do not pertain to any part of the Derivative Works, in at least one of the
102 | following places: within a NOTICE text file distributed as part of the
103 | Derivative Works; within the Source form or documentation, if provided along
104 | with the Derivative Works; or, within a display generated by the Derivative
105 | Works, if and wherever such third-party notices normally appear. The contents of
106 | the NOTICE file are for informational purposes only and do not modify the
107 | License. You may add Your own attribution notices within Derivative Works that
108 | You distribute, alongside or as an addendum to the NOTICE text from the Work,
109 | provided that such additional attribution notices cannot be construed as
110 | modifying the License.
111 | You may add Your own copyright statement to Your modifications and may provide
112 | additional or different license terms and conditions for use, reproduction, or
113 | distribution of Your modifications, or for any such Derivative Works as a whole,
114 | provided Your use, reproduction, and distribution of the Work otherwise complies
115 | with the conditions stated in this License.
116 | 
117 | 5. Submission of Contributions.
118 | 
119 | Unless You explicitly state otherwise, any Contribution intentionally submitted
120 | for inclusion in the Work by You to the Licensor shall be under the terms and
121 | conditions of this License, without any additional terms or conditions.
122 | Notwithstanding the above, nothing herein shall supersede or modify the terms of
123 | any separate license agreement you may have executed with Licensor regarding
124 | such Contributions.
125 | 
126 | 6. Trademarks.
127 | 
128 | This License does not grant permission to use the trade names, trademarks,
129 | service marks, or product names of the Licensor, except as required for
130 | reasonable and customary use in describing the origin of the Work and
131 | reproducing the content of the NOTICE file.
132 | 
133 | 7. Disclaimer of Warranty.
134 | 
135 | Unless required by applicable law or agreed to in writing, Licensor provides the
136 | Work (and each Contributor provides its Contributions) on an "AS IS" BASIS,
137 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied,
138 | including, without limitation, any warranties or conditions of TITLE,
139 | NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are
140 | solely responsible for determining the appropriateness of using or
141 | redistributing the Work and assume any risks associated with Your exercise of
142 | permissions under this License.
143 | 
144 | 8. Limitation of Liability.
145 | 
146 | In no event and under no legal theory, whether in tort (including negligence),
147 | contract, or otherwise, unless required by applicable law (such as deliberate
148 | and grossly negligent acts) or agreed to in writing, shall any Contributor be
149 | liable to You for damages, including any direct, indirect, special, incidental,
150 | or consequential damages of any character arising as a result of this License or
151 | out of the use or inability to use the Work (including but not limited to
152 | damages for loss of goodwill, work stoppage, computer failure or malfunction, or
153 | any and all other commercial damages or losses), even if such Contributor has
154 | been advised of the possibility of such damages.
155 | 
156 | 9. Accepting Warranty or Additional Liability.
157 | 
158 | While redistributing the Work or Derivative Works thereof, You may choose to
159 | offer, and charge a fee for, acceptance of support, warranty, indemnity, or
160 | other liability obligations and/or rights consistent with this License. However,
161 | in accepting such obligations, You may act only on Your own behalf and on Your
162 | sole responsibility, not on behalf of any other Contributor, and only if You
163 | agree to indemnify, defend, and hold each Contributor harmless for any liability
164 | incurred by, or claims asserted against, such Contributor by reason of your
165 | accepting any such warranty or additional liability.
166 | 
167 | END OF TERMS AND CONDITIONS
168 | 
169 | APPENDIX: How to apply the Apache License to your work
170 | 
171 | To apply the Apache License to your work, attach the following boilerplate
172 | notice, with the fields enclosed by brackets "[]" replaced with your own
173 | identifying information. (Don't include the brackets!) The text should be
174 | enclosed in the appropriate comment syntax for the file format. We also
175 | recommend that a file or class name and description of purpose be included on
176 | the same "printed page" as the copyright notice for easier identification within
177 | third-party archives.
178 | 
179 |    Copyright [yyyy] [name of copyright owner]
180 | 
181 |    Licensed under the Apache License, Version 2.0 (the "License");
182 |    you may not use this file except in compliance with the License.
183 |    You may obtain a copy of the License at
184 | 
185 |      http://www.apache.org/licenses/LICENSE-2.0
186 | 
187 |    Unless required by applicable law or agreed to in writing, software
188 |    distributed under the License is distributed on an "AS IS" BASIS,
189 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
190 |    See the License for the specific language governing permissions and
191 |    limitations under the License.
192 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | .SUFFIXES:
 2 | #
 3 | .SUFFIXES: .cpp .o .c .h
 4 | # replace the YOURCXX variable with a path to a C++11 compatible compiler.
 5 | ifeq ($(INTEL), 1)
 6 | # if you wish to use the Intel compiler, please do "make INTEL=1".
 7 |     CXX ?= /opt/intel/bin/icpc
 8 |     CXXFLAGS = -std=c++0x -O3 -Wall -DNDEBUG=1 -g3
 9 | else 
10 |     CXX ?= g++-4.7
11 | ifeq ($(DEBUG),1)
12 |     CXXFLAGS = -march=native  -std=c++11 -Weffc++ -pedantic -D_GLIBCXX_DEBUG -DDEBUG=1 -ggdb -Wall -Wextra -Wcast-align -Wconversion  -Winline
13 | else
14 |     CXXFLAGS = -march=native -std=c++11 -Weffc++ -DNDEBUG=1 -pedantic -O3 -Wall -Wextra -Winline  -Wcast-align  -Wconversion
15 | endif
16 | endif
17 | 
18 | 
19 | 
20 | 
21 | 
22 | HEADERS= $(shell ls include/*h)
23 | 
24 | all: unit testintersection realintersection getmatrix benchintersection multiSetIntersection
25 | 	echo "please run unit tests by running the unit executable"
26 | 
27 | intersection.o: src/intersection.cpp include/common.h  
28 | 	$(CXX) $(CXXFLAGS) -Iinclude -c src/intersection.cpp  
29 | 
30 | match.o: src/match.cpp include/match.h  
31 | 	$(CXX) $(CXXFLAGS) -Iinclude -c src/match.cpp
32 | 	
33 | thomaswu.o: src/thomaswu.cpp $(HEADERS)  
34 | 	$(CXX) $(CXXFLAGS) -Iinclude -c src/thomaswu.cpp  
35 | 
36 | multiSetIntersection: $(HEADERS) src/multiSetIntersection.cpp  match.o thomaswu.o intersection.o
37 | 	$(CXX) $(CXXFLAGS) -Iinclude -o multiSetIntersection src/multiSetIntersection.cpp  match.o thomaswu.o intersection.o
38 | 	
39 | testintersection: $(HEADERS) src/testintersection.cpp  match.o thomaswu.o intersection.o
40 | 	$(CXX) $(CXXFLAGS) -Iinclude -o testintersection src/testintersection.cpp  match.o thomaswu.o intersection.o
41 | 
42 | realintersection: $(HEADERS) src/realintersection.cpp  match.o thomaswu.o intersection.o
43 | 	$(CXX) $(CXXFLAGS) -Iinclude -o realintersection src/realintersection.cpp  match.o thomaswu.o intersection.o
44 | 
45 | getmatrix: $(HEADERS) src/getmatrix.cpp  match.o thomaswu.o intersection.o
46 | 	$(CXX) $(CXXFLAGS) -Iinclude -o getmatrix src/getmatrix.cpp  match.o thomaswu.o intersection.o
47 | 
48 | unit: $(HEADERS) src/unit.cpp  match.o thomaswu.o intersection.o
49 | 	$(CXX) $(CXXFLAGS) -Iinclude -o unit src/unit.cpp  match.o thomaswu.o intersection.o
50 | benchintersection: $(HEADERS) src/benchintersection.cpp  match.o thomaswu.o intersection.o
51 | 	$(CXX) $(CXXFLAGS) -Iinclude -o benchintersection src/benchintersection.cpp  match.o thomaswu.o intersection.o
52 | 
53 | 
54 | clean: 
55 | 	rm -f *.o unit testintersection realintersection getmatrix benchintersection multiSetIntersection
56 | 
57 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | SIMDIntersections
 2 | =================
 3 | [![Build Status](https://travis-ci.org/lemire/SIMDIntersections.png)](https://travis-ci.org/lemire/SIMDIntersections)
 4 | 
 5 | Vectorized intersections : research code.
 6 | 
 7 | Usage:
 8 | 
 9 | ```bash
10 | $ make
11 | $ ./unit
12 | $ ./benchintersection
13 | $ ./realintersection
14 | ```
15 | 
16 | ## Further reading
17 | 
18 | - Daniel Lemire, Nathan Kurz, Leonid Boytsov, SIMD Compression and the Intersection of Sorted Integers, Software: Practice and Experience 46 (6), 2016.  https://arxiv.org/abs/1401.6399
19 | 


--------------------------------------------------------------------------------
/TODO.md:
--------------------------------------------------------------------------------
1 | Implement galloping swapping svs
2 | 
3 | swapping svs 
4 | E. D. Demaine, A. López-Ortiz, and J. I. Munro. Experiments on adaptive set intersections for text retrieval systems. In Algorithm Engineering and Experimentation (ALENEX), pages 91–104, 2001.
5 | 
6 | The Baeza Yates algorithm
7 | R. Baeza-Yates. A fast set intersection algorithm for sorted sequences. In Combinatorial Pattern Matching, pages 400–408. Springer, 2004.
8 | 


--------------------------------------------------------------------------------
/include/binarysearchintersection.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * binarysearchintersection.h
  3 |  *
  4 |  *  Created on: May 13, 2013
  5 |  *      Author: ?
  6 |  */
  7 | 
  8 | #ifndef BINARYSEARCH_H_
  9 | #define BINARYSEARCH_H_
 10 | 
 11 | #include "common.h"
 12 | 
 13 | /**
 14 |  * This is pure binary search
 15 |  * Used by BSintersectioncardinality below
 16 |  * @param array
 17 |  * @param pos
 18 |  * @param min
 19 |  * @return
 20 |  */
 21 | static size_t __BSadvanceUntil(const uint32_t * array, const size_t pos,
 22 | 		const size_t length, const size_t min) {
 23 | 	size_t lower = pos + 1;
 24 | 	if (lower == length || array[lower] >= min) {
 25 | 		return lower;
 26 | 	}
 27 | 	// can safely assume that length>0
 28 | 	size_t upper = length - 1;
 29 | 	if (array[upper] < min) {
 30 | 		return length;
 31 | 	}
 32 | 	size_t mid;
 33 | 	while (lower < upper) {
 34 | 		mid = (lower + upper) / 2;
 35 | 		if (array[mid] == min) {
 36 | 			return mid;
 37 | 		}
 38 | 
 39 | 		if (array[mid] < min) {
 40 | 			lower = mid + 1;
 41 | 		} else {
 42 | 			upper = mid;
 43 | 		}
 44 | 	}
 45 | 	return upper;
 46 | }
 47 | 
 48 | /**
 49 |  * Based on binary search.
 50 |  */
 51 | size_t BSintersection(const uint32_t * set1, const size_t length1,
 52 | 		const uint32_t * set2, const size_t length2, uint32_t *out) {
 53 | 	if ((0 == length1) or (0 == length2))
 54 | 		return 0;
 55 | 	size_t answer = 0;
 56 | 	size_t k1 = 0, k2 = 0;
 57 | 	while (true) {
 58 | 		if (set1[k1] < set2[k2]) {
 59 | 			k1 = __BSadvanceUntil(set1, k1, length1, set2[k2]);
 60 | 			if (k1 == length1)
 61 | 				return answer;
 62 | 		}
 63 | 		if (set2[k2] < set1[k1]) {
 64 | 			k2 = __BSadvanceUntil(set2, k2, length2, set1[k1]);
 65 | 			if (k2 == length2)
 66 | 				return answer;
 67 | 		} else {
 68 | 			// assert(set2[k2] == set1[k1]);
 69 | 			out[answer++] = set1[k1];
 70 | 			++k1;
 71 | 			if (k1 == length1)
 72 | 				break;
 73 | 			++k2;
 74 | 			if (k2 == length2)
 75 | 				break;
 76 | 		}
 77 | 	}
 78 | 	return answer;
 79 | 
 80 | }
 81 | 
 82 | /**
 83 |  * Based on binary search.
 84 |  */
 85 | size_t BSintersectioncardinality(const uint32_t * set1, const size_t length1,
 86 | 		const uint32_t * set2, const size_t length2) {
 87 | 	if ((0 == length1) or (0 == length2))
 88 | 		return 0;
 89 | 	size_t answer = 0;
 90 | 	size_t k1 = 0, k2 = 0;
 91 | 	while (true) {
 92 | 		if (set1[k1] < set2[k2]) {
 93 | 			k1 = __BSadvanceUntil(set1, k1, length1, set2[k2]);
 94 | 			if (k1 == length1)
 95 | 				return answer;
 96 | 		}
 97 | 		if (set2[k2] < set1[k1]) {
 98 | 			k2 = __BSadvanceUntil(set2, k2, length2, set1[k1]);
 99 | 			if (k2 == length2)
100 | 				return answer;
101 | 		} else {
102 | 			// assert(set2[k2] == set1[k1]);
103 | 			++answer;
104 | 			++k1;
105 | 			if (k1 == length1)
106 | 				break;
107 | 			++k2;
108 | 			if (k2 == length2)
109 | 				break;
110 | 		}
111 | 	}
112 | 	return answer;
113 | 
114 | }
115 | 
/**
 * Binary search over the whole array (it always starts from index 0,
 * regardless of how far the caller has already progressed).
 *
 * The array is expected to be sorted in increasing order and non-empty:
 * length - 1 is used as an index without any check.
 *
 * @param array  sorted input values
 * @param length number of elements in array (must be > 0)
 * @param min    value to reach
 * @return length if the last value is smaller than min, otherwise an index
 *         whose value is >= min
 */
static size_t __FixedBSadvanceUntil(const uint32_t * array, const size_t length,
		const size_t min) {
	size_t hi = length - 1;
	if (array[hi] < min)
		return length;
	// Invariant: array[hi] >= min, everything before lo is < min.
	size_t lo = 0;
	while (lo < hi) {
		const size_t probe = (lo + hi) / 2;
		const uint32_t value = array[probe];
		if (value == min)
			return probe;
		if (value < min)
			lo = probe + 1;
		else
			hi = probe;
	}
	return hi;
}
138 | 
139 | /**
140 |  * Based on binary search.
141 |  */
142 | size_t FixedBSintersectioncardinality(const uint32_t * set1,
143 | 		const size_t length1, const uint32_t * set2, const size_t length2) {
144 | 	if ((0 == length1) or (0 == length2))
145 | 		return 0;
146 | 	size_t answer = 0;
147 | 	size_t k1 = 0, k2 = 0;
148 | 	while (true) {
149 | 		if (set1[k1] < set2[k2]) {
150 | 			k1 = __FixedBSadvanceUntil(set1, length1, set2[k2]);
151 | 			if (k1 == length1)
152 | 				return answer;
153 | 		}
154 | 
155 | 		if (set2[k2] < set1[k1]) {
156 | 			k2 = __FixedBSadvanceUntil(set2, length2, set1[k1]);
157 | 			if (k2 == length2)
158 | 				return answer;
159 | 		} else {
160 | 			// assert(set2[k2] == set1[k1]);
161 | 			++answer;
162 | 			++k1;
163 | 			if (k1 == length1)
164 | 				break;
165 | 			++k2;
166 | 			if (k2 == length2)
167 | 				break;
168 | 		}
169 | 	}
170 | 	return answer;
171 | 
172 | }
173 | 
174 | #endif /* BINARYSEARCH_H_ */
175 | 


--------------------------------------------------------------------------------
/include/branchless.h:
--------------------------------------------------------------------------------
  1 | 
  2 | #ifndef BRANCHLESS_H_
  3 | #define BRANCHLESS_H_
  4 | 
  5 | #include <stdint.h>
  6 | #include <stddef.h>
  7 | 
  8 | #ifdef IACA
  9 | #include </opt/intel/iaca-lin32/include/iacaMarks.h>
 10 | #endif
 11 | 
 12 | /**
 13 |  * Failed (?) attempt at reproducing the good results of the branchless scheme
 14 |  * from Fast Sorted-Set Intersection using SIMD Instructions
 15 |  * originally by D. Lemire but combined with a design by N. Kurz.
 16 |  */
 17 | #ifndef __INTEL_COMPILER
 18 | __attribute__((optimize("unroll-loops"))) // this helps a lot with GCC
 19 | #endif
 20 | size_t branchlessintersection(const uint32_t * set1, const size_t length1,
 21 |         const uint32_t * set2, const size_t length2, uint32_t * out) {
 22 |     if ((0 == length1) or (0 == length2))
 23 |         return 0;
 24 |     const uint32_t * const initout(out);
 25 |     const uint32_t * const finalset1(set1 + length1);
 26 |     const uint32_t * const finalset2(set2 + length2);
 27 | 
 28 |     const unsigned int N = 4;
 29 | 
 30 |     // main loop
 31 |     while ((set1 +N <= finalset1) && (set2 +N <= finalset2)) {
 32 | #ifdef __INTEL_COMPILER
 33 | #pragma unroll(4)
 34 | #endif
 35 |         for (unsigned int k = 0; k < N; ++k) {
 36 |             // this is branchless... (in theory, maybe not in practice)
 37 |             const uint32_t a = *set1;
 38 |             const uint32_t b = *set2;
 39 |             *out = a;
 40 |             out = (a == b) ? out + 1 : out;
 41 |             set1 = (a <= b) ? set1 + 1 : set1;
 42 |             set2 = (b <= a) ? set2 + 1 : set2;
 43 |         }
 44 | 
 45 |     }
 46 |     while ((set1  < finalset1) && (set2 < finalset2)) {
 47 |             // this is branchless... (in theory, maybe not in practice)
 48 |             *out = *set1;
 49 |             const uint32_t a = *set1;
 50 |             const uint32_t b = *set2;
 51 |             out = (a == b) ? out + 1 : out;
 52 |             set1 = (a <= b) ? set1 + 1 : set1;
 53 |             set2 = (b <= a) ? set2 + 1 : set2;
 54 | 
 55 |     }
 56 | 
 57 |     return (out - initout);
 58 | }
 59 | 
 60 | 
 61 | /**
 62 |  * Branchless approach by N. Kurz.
 63 |  */
 64 | size_t scalar_branchless(const uint32_t *A, size_t lenA,
 65 |                          const uint32_t *B, size_t lenB,
 66 |                          uint32_t *Match) {
 67 | 
 68 |     const uint32_t *initMatch = Match;
 69 |     const uint32_t *endA = A + lenA;
 70 |     const uint32_t *endB = B + lenB;
 71 | 
 72 |     while (A < endA && B < endB) {
 73 |         int m = (*B == *A) ? 1 : 0;  // advance Match only if equal
 74 |         int a = (*B >= *A) ? 1 : 0;  // advance A if match or B ahead
 75 |         int b = (*B <= *A) ? 1 : 0;  // advance B if match or B behind
 76 | 
 77 |         *Match = *A;   // write the result regardless of match
 78 |         Match += m;    // but will be rewritten unless advanced
 79 |         A += a;
 80 |         B += b;
 81 |     }
 82 | 
 83 |     size_t count = Match - initMatch;
 84 |     return count;
 85 | }
 86 | 
 87 | // NOTE: Proof of concept function --- reads past end of input
 88 | size_t scalar_branchless_cached(const uint32_t *A, size_t lenA, 
 89 |                                 const uint32_t *B, size_t lenB,
 90 |                                 uint32_t *Match) {
 91 | 
 92 |     const uint32_t *initMatch = Match;
 93 |     const uint32_t *endA = A + lenA;
 94 |     const uint32_t *endB = B + lenB;
 95 | 
 96 |     uint32_t thisA = A[0];
 97 |     uint32_t thisB = B[0];
 98 | 
 99 |     while (A < endA && B < endB) {
100 |         
101 | #ifdef IACA
102 |         IACA_START;
103 | #endif    
104 |         uint32_t nextA = A[1];
105 |         uint32_t nextB = B[1];
106 | 
107 |         uint32_t oldA = thisA;
108 |         uint32_t oldB = thisB;
109 | 
110 |         *Match = thisA;   // write the result regardless of match
111 | 
112 |         int m = (oldB == oldA) ? 1 : 0;  // advance Match only if equal
113 |         int a = (oldB >= oldA) ? 1 : 0;  // advance A if match or B ahead
114 |         int b = (oldB <= oldA) ? 1 : 0;  // advance B if match or B behind
115 | 
116 |         thisA = (oldB >= oldA) ? nextA : thisA;  // advance A if match or B ahead
117 |         thisB = (oldB <= oldA) ? nextB : thisB;  // advance B if match or B behind
118 | 
119 |         Match += m;      // will be rewritten unless advanced
120 |         A += a;        
121 |         B += b;        
122 | 
123 | #ifdef IACA
124 |     IACA_END;
125 | #endif
126 | 
127 |     }
128 | 
129 | 
130 |     size_t count = Match - initMatch; 
131 |     return count; 
132 | }
133 | 
/**
 * Branchless intersection that caches the current head and the following
 * element of each input and pre-loads the element two positions ahead on
 * every step.
 *
 * The cached loop only runs while two further elements exist in BOTH inputs,
 * so the look-ahead loads (A[1], A[2], B[1], B[2]) never leave the arrays;
 * the last elements are handled by a plain branchless tail loop. Inputs that
 * are too short for the look-ahead (including empty ones) go straight to the
 * tail loop.
 *
 * @param A     first sorted input
 * @param lenA  number of elements in A
 * @param B     second sorted input
 * @param lenB  number of elements in B
 * @param Match receives the common values
 * @return number of common values stored at the front of Match
 */
size_t scalar_branchless_cached2(const uint32_t *A, size_t lenA,
                                 const uint32_t *B, size_t lenB,
                                 uint32_t *Match) {

    const size_t LOOKAHEAD = 2;  // elements read beyond the current head

    const uint32_t *initMatch = Match;
    const uint32_t *endA = A + lenA;
    const uint32_t *endB = B + lenB;

    if (lenA > LOOKAHEAD && lenB > LOOKAHEAD) {
        // While A < stopA (resp. B < stopB), A[2] (resp. B[2]) is in bounds.
        const uint32_t *stopA = endA - LOOKAHEAD;
        const uint32_t *stopB = endB - LOOKAHEAD;

        uint32_t thisA = A[0];
        uint32_t thisB = B[0];

        uint32_t nextA = A[1];
        uint32_t nextB = B[1];

        while (A < stopA && B < stopB) {
#ifdef IACA
            IACA_START;
#endif

            uint32_t nextNextA = A[2];
            uint32_t nextNextB = B[2];

            uint32_t oldA = thisA;
            uint32_t oldB = thisB;

            *Match = thisA; // write the result regardless of match

            int m = (oldB == oldA) ? 1 : 0;  // advance Match only if equal
            int a = (oldB >= oldA) ? 1 : 0;  // advance A if match or B ahead
            int b = (oldB <= oldA) ? 1 : 0;  // advance B if match or B behind

            thisA = (oldB >= oldA) ? nextA : thisA;  // advance A if match or B ahead
            thisB = (oldB <= oldA) ? nextB : thisB;  // advance B if match or B behind

            nextA = (oldB >= oldA) ? nextNextA : nextA;
            nextB = (oldB <= oldA) ? nextNextB : nextB;

            Match += m;    // Match will be rewritten unless advanced
            A += a;
            B += b;

#ifdef IACA
        IACA_END;
#endif

        }
    }

    // Tail: fewer than LOOKAHEAD elements follow the head of at least one
    // input, so read straight from memory without any look-ahead.
    while (A < endA && B < endB) {
        const uint32_t valA = *A;
        const uint32_t valB = *B;

        *Match = valA;
        Match += (valB == valA) ? 1 : 0;
        A += (valB >= valA) ? 1 : 0;
        B += (valB <= valA) ? 1 : 0;
    }

    size_t count = Match - initMatch;
    return count;
}
186 | 
// One branchless merge step on the cursors A, B and Match of the enclosing
// function: store the head of A, then move each cursor by the outcome of a
// comparison. Only meant for scalar_branchless_unrolled below.
#define BRANCHLESSMATCH() do {                          \
        const uint32_t headA = *A;                      \
        const uint32_t headB = *B;                      \
        const int isMatch = (headB == headA) ? 1 : 0;   \
        const int stepA = (headB >= headA) ? 1 : 0;     \
        const int stepB = (headB <= headA) ? 1 : 0;     \
        *Match = headA;                                 \
        Match += isMatch;                               \
        A += stepA;                                     \
        B += stepB;                                     \
    } while (0)


/**
 * Unrolled branchless approach by N. Kurz.
 *
 * Performs the branchless merge step four times per loop iteration while
 * more than four elements remain in both inputs, then finishes one step at a
 * time.
 *
 * @param A     first sorted input
 * @param lenA  number of elements in A
 * @param B     second sorted input
 * @param lenB  number of elements in B
 * @param Match receives the common values
 * @return number of common values stored at the front of Match
 */
size_t scalar_branchless_unrolled(const uint32_t *A, size_t lenA,
                                  const uint32_t *B, size_t lenB,
                                  uint32_t *Match) {

    const size_t UNROLLED = 4;

    const uint32_t *const matchBegin = Match;
    const uint32_t *const finishA = A + lenA;
    const uint32_t *const finishB = B + lenB;

    const bool bothLongEnough = (lenA >= UNROLLED) && (lenB >= UNROLLED);
    if (bothLongEnough) {
        // Each step moves a cursor by at most one, so starting a group of
        // UNROLLED steps before these limits cannot run past either end.
        const uint32_t *const limitA = finishA - UNROLLED;
        const uint32_t *const limitB = finishB - UNROLLED;

        while (A < limitA && B < limitB) {
            // NOTE: number of steps must match UNROLLED
            BRANCHLESSMATCH();
            BRANCHLESSMATCH();
            BRANCHLESSMATCH();
            BRANCHLESSMATCH();
        }
    }

    // Finish remainder without overstepping
    while (A < finishA && B < finishB) {
        BRANCHLESSMATCH();
    }

    return Match - matchBegin;
}

#undef BRANCHLESSMATCH
234 | 
235 | 
236 | // Intel disassembly for branchless
237 | //  15:   mov    (%rdx),%r11d  # r11 = *B
238 | //  18:   mov    $0x1,%r8d     # r8 = 1
239 | //  1e:   mov    (%rdi),%eax   # eax = *A
240 | //  20:   cmp    %eax,%r11d    # *B <=> *A
241 | //  23:   mov    $0x0,%r11d    # r11 = 0
242 | //  29:   cmove  %r8,%r11      # if *B == *A r11 = 1
243 | //  2d:   mov    %eax,(%r9)    # *output = *A
244 | //  30:   lea    (%r9,%r11,4),%r9  # output += 4 * r11
245 | //  34:   mov    $0x0,%r11d    # r11 = 0
246 | //  3a:   cmovae %r8,%r11      # if *B >= *A r11 = 1
247 | //  3e:   lea    (%rdi,%r11,4),%rdi  # A += 4 * r11
248 | //  42:   mov    $0x0,%r11d    # r11 = 0
249 | //  48:   cmovbe %r8,%r11      # if *B <= *A r11 = 1
250 | //  4c:   lea    (%rdx,%r11,4),%rdx  # B += 4 * r11
251 | 
252 | 
253 | #endif
254 | 


--------------------------------------------------------------------------------
/include/common.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * This code is released under the
 3 |  * Apache License Version 2.0 http://www.apache.org/licenses/.
 4 |  *
 5 |  * (c) Daniel Lemire, http://lemire.me/en/
 6 |  */
 7 | #ifndef COMMON_H_
 8 | #define COMMON_H_
 9 | 
10 | #include <errno.h>
11 | #include <fcntl.h>
12 | #include <immintrin.h>
13 | #include <iso646.h>
14 | #include <limits.h>
15 | #include <sys/resource.h>
16 | #include <sys/types.h>
17 | #include <stdint.h>
18 | #include <stdio.h>
19 | #include <stdlib.h>
20 | #include <string.h>
21 | #include <sys/time.h>
22 | #include <unistd.h>
23 | #include <sys/mman.h>
24 | #include <sys/stat.h>
25 | #include <time.h>
26 | 
27 | #include <algorithm>
28 | #include <cmath>
29 | #include <cassert>
30 | #include <fstream>
31 | #include <iomanip>
32 | #include <iostream>
33 | #include <map>
34 | #include <memory>
35 | #include <numeric>
36 | #include <queue>
37 | #include <set>
38 | #include <string>
39 | #include <stdexcept>
40 | #include <sstream>
41 | //#include <tr1/memory>
42 | //#include <tr1/unordered_set>
43 | #include <unordered_set>
44 | #include <functional>
45 | #include <vector>
46 | 
// Branch hints: wrap a condition to tell the compiler which outcome to expect.
// The double negation collapses any scalar to 0/1 before __builtin_expect sees it.
#define _LIKELY(x) __builtin_expect(!!(x), 1)
#define _UNLIKELY(x) __builtin_expect(!!(x), 0)
// Shorthands for the GCC-style attributes that forbid / force inlining of a function.
#define _NOINLINE __attribute__((noinline))
#define _ALWAYSINLINE __attribute__((always_inline))
// A std::set of uint32_t vectors whose ordering is a std::function comparator
// (so the comparison is chosen by whoever constructs the set).
typedef std::set<std::vector<uint32_t>,
		std::function<
				bool(const std::vector<uint32_t>&, const std::vector<uint32_t>&)>>mySet;
54 | 
55 | #endif /* COMMON_H_ */
56 | 


--------------------------------------------------------------------------------
/include/gallopingintersection.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * gallopingintersection.h
  3 |  *
  4 |  *  Created on: May 13, 2013
  5 |  *      Author: ?
  6 |  */
  7 | 
  8 | #ifndef GALLOPINGINTERSECTION_H_
  9 | #define GALLOPINGINTERSECTION_H_
 10 | 
 11 | /**
 12 |  * This is often called galloping or exponential search.
 13 |  *
 14 |  * Used by frogintersectioncardinality below
 15 |  *
 16 |  * Based on binary search...
 17 |  * Find the smallest integer larger than pos such
 18 |  * that array[pos]>= min.
 19 |  * If none can be found, return array.length.
 20 |  * From code by O. Kaser.
 21 |  */
 22 | static size_t __frogadvanceUntil(const uint32_t * array, const size_t pos,
 23 |         const size_t length, const size_t min) {
 24 |     size_t lower = pos + 1;
 25 | 
 26 |     // special handling for a possibly common sequential case
 27 |     if ((lower >= length) or (array[lower] >= min)) {
 28 |         return lower;
 29 |     }
 30 | 
 31 |     size_t spansize = 1; // could set larger
 32 |     // bootstrap an upper limit
 33 | 
 34 |     while ((lower + spansize < length) and (array[lower + spansize] < min))
 35 |         spansize *= 2;
 36 |     size_t upper = (lower + spansize < length) ? lower + spansize : length - 1;
 37 | 
 38 |     // maybe we are lucky (could be common case when the seek ahead expected to be small and sequential will otherwise make us look bad)
 39 |     //if (array[upper] == min) {
 40 |     //    return upper;
 41 |     //}
 42 | 
 43 |     if (array[upper] < min) {// means array has no item >= min
 44 |         return length;
 45 |     }
 46 | 
 47 |     // we know that the next-smallest span was too small
 48 |     lower += (spansize / 2);
 49 | 
 50 |     // else begin binary search
 51 |     size_t mid = 0;
 52 |     while (lower + 1 != upper) {
 53 |         mid = (lower + upper) / 2;
 54 |         if (array[mid] == min) {
 55 |             return mid;
 56 |         } else if (array[mid] < min)
 57 |             lower = mid;
 58 |         else
 59 |             upper = mid;
 60 |     }
 61 |     return upper;
 62 | 
 63 | }
 64 | 
 65 | /**
 66 |  * EXPERIMENTAL VERSION
 67 |  *
 68 |  * This is often called galloping or exponential search.
 69 |  *
 70 |  * Used by frogintersectioncardinality below
 71 |  *
 72 |  * Based on binary search...
 73 |  * Find the smallest integer larger than pos such
 74 |  * that array[pos]>= min.
 75 |  * If none can be found, return array.length.
 76 |  * From code by O. Kaser.
 77 |  */
 78 | static size_t __frogadvanceUntil_experimental(const uint32_t * array, const size_t pos,
 79 |         const size_t length, const size_t min) {
 80 |     size_t lower = pos + 1;
 81 | 
 82 |     // special handling for a possibly common sequential case
 83 |     if ((lower >= length) or (array[lower] >= min)) {
 84 |         return lower;
 85 |     }
 86 | 
 87 |     size_t spansize = 1; // could set larger
 88 |     // bootstrap an upper limit
 89 | 
 90 |     while ((lower + spansize < length) and (array[lower + spansize] < min))
 91 |         spansize *= 2;
 92 |     size_t upper = (lower + spansize < length) ? lower + spansize : length - 1;
 93 | 
 94 |     // maybe we are lucky (could be common case when the seek ahead expected to be small and sequential will otherwise make us look bad)
 95 |     if (array[upper] == min) {
 96 |         return upper;
 97 |     }
 98 | 
 99 |     if (array[upper] < min) {// means array has no item >= min
100 |         return length;
101 |     }
102 | 
103 |     // we know that the next-smallest span was too small
104 |     lower += (spansize / 2);
105 | 
106 |     // else begin binary search
107 |     size_t mid = 0;
108 |     while (lower + 1 != upper) {
109 |         mid = (lower + upper) / 2;
110 |         if (array[mid] == min) {
111 |             return mid;
112 |         } else if (array[mid] < min)
113 |             lower = mid;
114 |         else
115 |             upper = mid;
116 |     }
117 |     return upper;
118 | 
119 | }
120 | 
121 | 
122 | /**
123 |  * based on galloping
124 |  */
125 | size_t frogintersectioncardinality(const uint32_t * set1, const size_t length1,
126 |         const uint32_t * set2, const size_t length2) {
127 |     if ((0 == length1) or (0 == length2))
128 |         return 0;
129 |     size_t answer = 0;
130 |     size_t k1 = 0, k2 = 0;
131 |     while (true) {
132 |         if (set1[k1] < set2[k2]) {
133 |             k1 = __frogadvanceUntil(set1, k1, length1, set2[k2]);
134 |             if (k1 == length1)
135 |                 return answer;
136 |         }
137 |         if (set2[k2] < set1[k1]) {
138 |             k2 = __frogadvanceUntil(set2, k2, length2, set1[k1]);
139 |             if (k2 == length2)
140 |                 return answer;
141 |         } else {
142 |             // assert(set2[k2] == set1[k1]);
143 |             ++answer;
144 |             ++k1;
145 |             if (k1 == length1)
146 |                 break;
147 |             ++k2;
148 |             if (k2 == length2)
149 |                 break;
150 |         }
151 |     }
152 |     return answer;
153 | 
154 | }
155 | 
156 | 
157 | size_t onesidedgallopingintersectioncardinality(const uint32_t * smallset,
158 |         const size_t smalllength, const uint32_t * largeset,
159 |         const size_t largelength) {
160 |     if(largelength < smalllength) return onesidedgallopingintersectioncardinality(largeset,largelength,smallset,smalllength);
161 |     if (0 == smalllength)
162 |         return 0;
163 |     size_t answer = 0;
164 |     size_t k1 = 0, k2 = 0;
165 |     while (true) {
166 |         if (largeset[k1] < smallset[k2]) {
167 |             k1 = __frogadvanceUntil(largeset, k1, largelength, smallset[k2]);
168 |             if (k1 == largelength)
169 |                 return answer;
170 |         }
171 |         midpoint: if (smallset[k2] < largeset[k1]) {
172 |             ++k2;
173 |             if (k2 == smalllength)
174 |                 return answer;
175 |         } else {
176 |             ++answer;
177 |             ++k2;
178 |             if (k2 == smalllength)
179 |                 return answer;
180 |             k1 = __frogadvanceUntil(largeset, k1, largelength, smallset[k2]);
181 |             if (k1 == largelength)
182 |                 return answer;
183 |             goto midpoint;
184 |         }
185 |     }
186 |     return answer;
187 | 
188 | }
189 | 
190 | 
191 | size_t onesidedgallopingintersection(const uint32_t * smallset,
192 |         const size_t smalllength, const uint32_t * largeset,
193 |         const size_t largelength, uint32_t * out) {
194 |     if(largelength < smalllength) return onesidedgallopingintersection(largeset,largelength,smallset,smalllength,out);
195 |     if (0 == smalllength)
196 |         return 0;
197 |     const uint32_t * const initout(out);
198 |     size_t k1 = 0, k2 = 0;
199 |     while (true) {
200 |         if (largeset[k1] < smallset[k2]) {
201 |             k1 = __frogadvanceUntil(largeset, k1, largelength, smallset[k2]);
202 |             if (k1 == largelength)
203 |                 break;
204 |         }
205 |         midpoint: if (smallset[k2] < largeset[k1]) {
206 |             ++k2;
207 |             if (k2 == smalllength)
208 |                 break;
209 |         } else {
210 |             *out++ = smallset[k2];
211 |             ++k2;
212 |             if (k2 == smalllength)
213 |                 break;
214 |             k1 = __frogadvanceUntil(largeset, k1, largelength, smallset[k2]);
215 |             if (k1 == largelength)
216 |                 break;
217 |             goto midpoint;
218 |         }
219 |     }
220 |     return out - initout;
221 | 
222 | }
223 | 
224 | 
225 | 
226 | size_t onesidedgallopingintersection_experimental(const uint32_t * smallset,
227 |         const size_t smalllength, const uint32_t * largeset,
228 |         const size_t largelength, uint32_t * out) {
229 |     if(largelength < smalllength) return onesidedgallopingintersection_experimental(largeset,largelength,smallset,smalllength,out);
230 |     if (0 == smalllength)
231 |         return 0;
232 |     const uint32_t * const initout(out);
233 |     size_t k1 = 0, k2 = 0;
234 |     while (true) {
235 |         if (largeset[k1] < smallset[k2]) {
236 |             k1 = __frogadvanceUntil_experimental(largeset, k1, largelength, smallset[k2]);
237 |             if (k1 == largelength)
238 |                 break;
239 |         }
240 |         midpoint: if (smallset[k2] < largeset[k1]) {
241 |             ++k2;
242 |             if (k2 == smalllength)
243 |                 break;
244 |         } else {
245 |             *out++ = smallset[k2];
246 |             ++k2;
247 |             if (k2 == smalllength)
248 |                 break;
249 |             k1 = __frogadvanceUntil_experimental(largeset, k1, largelength, smallset[k2]);
250 |             if (k1 == largelength)
251 |                 break;
252 |             goto midpoint;
253 |         }
254 |     }
255 |     return out - initout;
256 | 
257 | }
258 | 
259 | #endif /* GALLOPINGINTERSECTION_H_ */
260 | 


--------------------------------------------------------------------------------
/include/hscalableintersection.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Schemes inspired or lifted from
  3 |  * http://highlyscalable.wordpress.com/2012/06/05/fast-intersection-sorted-lists-sse/
  4 |  *
  5 |  * The downside of these schemes is that they can't safely write back on the input
  6 |  * buffers.
  7 |  */
  8 | 
  9 | #ifndef HSCALABLEINTERSECTION_H_
 10 | #define HSCALABLEINTERSECTION_H_
 11 | 
 12 | #include "common.h"
 13 | 
 14 | namespace highlyscalablewordpresscom {
 15 | /**
 16 |  * Taken almost verbatim from http://highlyscalable.wordpress.com/2012/06/05/fast-intersection-sorted-lists-sse/
 17 |  * (just for comparison)
 18 |  */
 19 | size_t cardinality_intersect_scalar(const uint32_t *A, const size_t s_a,
 20 |         const uint32_t *B, const size_t s_b) {
 21 |     size_t i_a = 0, i_b = 0;
 22 |     size_t counter = 0;
 23 | 
 24 |     while (i_a < s_a && i_b < s_b) {
 25 |         if (A[i_a] < B[i_b]) {
 26 |             i_a++;
 27 |         } else if (B[i_b] < A[i_a]) {
 28 |             i_b++;
 29 |         } else {
 30 |             counter++;
 31 |             i_a++;
 32 |             i_b++;
 33 |         }
 34 |     }
 35 |     return counter;
 36 | }
 37 | 
 38 | /**
 39 |  * Taken almost verbatim from http://highlyscalable.wordpress.com/2012/06/05/fast-intersection-sorted-lists-sse/
 40 |  * (just for comparison)
 41 |  */
 42 | size_t intersect_scalar(const uint32_t *A, const size_t s_a,
 43 |         const uint32_t *B, const size_t s_b, uint32_t * out) {
 44 |     const uint32_t * const initout (out);
 45 |     size_t i_a = 0, i_b = 0;
 46 | 
 47 |     while (i_a < s_a && i_b < s_b) {
 48 |         if (A[i_a] < B[i_b]) {
 49 |             i_a++;
 50 |         } else if (B[i_b] < A[i_a]) {
 51 |             i_b++;
 52 |         } else {
 53 |             *out++ = B[i_b];
 54 |             i_a++;
 55 |             i_b++;
 56 |         }
 57 |     }
 58 |     return out - initout;
 59 | }
/**
 * More or less from
 * http://highlyscalable.wordpress.com/2012/06/05/fast-intersection-sorted-lists-sse/
 *
 * Precomputed byte-shuffle controls for _mm_shuffle_epi8, indexed by the 4-bit
 * match mask produced by _mm_movemask_ps. Entry i moves the 32-bit lanes whose
 * bit is set in i to the front of the register, in their original order
 * (e.g. entry 5 = 0b0101 puts lane 0 first and lane 2 second). The contents of
 * the remaining lanes are leftovers and carry no meaning.
 * Note that _mm_set_epi8 lists bytes from most significant to least
 * significant, so each row reads right to left.
 */
const static __m128i shuffle_mask[16] = {
        _mm_set_epi8(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0),
        _mm_set_epi8(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0),
        _mm_set_epi8(15,14,13,12,11,10,9,8,7,6,5,4,7,6,5,4),
        _mm_set_epi8(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0),
        _mm_set_epi8(15,14,13,12,11,10,9,8,7,6,5,4,11,10,9,8),
        _mm_set_epi8(15,14,13,12,11,10,9,8,11,10,9,8,3,2,1,0),
        _mm_set_epi8(15,14,13,12,11,10,9,8,11,10,9,8,7,6,5,4),
        _mm_set_epi8(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0),
        _mm_set_epi8(15,14,13,12,11,10,9,8,7,6,5,4,15,14,13,12),
        _mm_set_epi8(15,14,13,12,11,10,9,8,15,14,13,12,3,2,1,0),
        _mm_set_epi8(15,14,13,12,11,10,9,8,15,14,13,12,7,6,5,4),
        _mm_set_epi8(15,14,13,12,15,14,13,12,7,6,5,4,3,2,1,0),
        _mm_set_epi8(15,14,13,12,11,10,9,8,15,14,13,12,11,10,9,8),
        _mm_set_epi8(15,14,13,12,15,14,13,12,11,10,9,8,3,2,1,0),
        _mm_set_epi8(15,14,13,12,15,14,13,12,11,10,9,8,7,6,5,4),
        _mm_set_epi8(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0),
        };
// precomputed dictionary (the commented-out code below shows a way to build it at run time)
 83 | 
 84 | 
 85 | 
 86 | /*int getBit(int value, int position) {
 87 |     return ((value & (1 << position)) >> position);
 88 | }*/
 89 | 
 90 | // a simple implementation, we don't care about performance here
 91 | /*void prepare_shuffling_dictionary() {
 92 |     for (int i = 0; i < 16; i++) {
 93 |         int counter = 0;
 94 |         char permutation[16];
 95 |         memset(permutation, 0xFF, sizeof(permutation));
 96 |         for (char b = 0; b < 4; b++) {
 97 |             if (getBit(i, b)) {
 98 |                 permutation[counter++] = 4 * b;
 99 |                 permutation[counter++] = 4 * b + 1;
100 |                 permutation[counter++] = 4 * b + 2;
101 |                 permutation[counter++] = 4 * b + 3;
102 |             }
103 |         }
104 |         __m128i mask = _mm_loadu_si128((const __m128i *) permutation);
105 |         shuffle_mask[i] = mask;
106 |     }
107 | }*/
108 | 
/**
 * Counts the values common to A and B, comparing 4x4 blocks with SSE and
 * finishing the leftovers with a scalar merge.
 * Adapted from http://highlyscalable.wordpress.com/2012/06/05/fast-intersection-sorted-lists-sse/
 *
 * The inputs may be arbitrarily aligned: blocks are fetched with unaligned
 * loads (the aligned-load form faults when a caller passes a pointer that is
 * not 16-byte aligned, e.g. an offset into a larger buffer).
 *
 * The block-skipping logic relies on A and B being sorted in ascending order.
 *
 * @param A   first array
 * @param s_a number of elements in A
 * @param B   second array
 * @param s_b number of elements in B
 * @return number of common values found
 */
size_t cardinality_intersect_SIMD(const uint32_t *A, const size_t s_a,
        const uint32_t *B, const size_t s_b) {
    size_t count = 0;
    size_t i_a = 0, i_b = 0;

    // trim lengths to be a multiple of 4
    const size_t st_a = (s_a / 4) * 4;
    const size_t st_b = (s_b / 4) * 4;

    while (i_a < st_a && i_b < st_b) {
        //[ load segments of four 32-bit elements (no alignment requirement)
        const __m128i v_a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(A + i_a));
        __m128i v_b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(B + i_b));
        //]

        //[ move pointers: the block with the smaller maximum is exhausted;
        //  on a tie both blocks are
        const uint32_t a_max = A[i_a + 3];
        const uint32_t b_max = B[i_b + 3];
        i_a += (a_max <= b_max) * 4;
        i_b += (a_max >= b_max) * 4;
        //]

        //[ compute mask of common elements: compare v_a against all four
        //  rotations of v_b so that every pair of lanes meets once
        constexpr int cyclic_shift = _MM_SHUFFLE(0, 3, 2, 1);
        const __m128i cmp_mask1 = _mm_cmpeq_epi32(v_a, v_b); // pairwise comparison
        v_b = _mm_shuffle_epi32(v_b, cyclic_shift); // shuffling
        const __m128i cmp_mask2 = _mm_cmpeq_epi32(v_a, v_b); // again...
        v_b = _mm_shuffle_epi32(v_b, cyclic_shift);
        const __m128i cmp_mask3 = _mm_cmpeq_epi32(v_a, v_b); // and again...
        v_b = _mm_shuffle_epi32(v_b, cyclic_shift);
        const __m128i cmp_mask4 = _mm_cmpeq_epi32(v_a, v_b); // and again.
        const __m128i cmp_mask = _mm_or_si128(_mm_or_si128(cmp_mask1, cmp_mask2),
                _mm_or_si128(cmp_mask3, cmp_mask4)); // OR-ing of comparison masks
        // convert the 128-bit mask to the 4-bit mask (one bit per lane of v_a)
        const int mask = _mm_movemask_ps(_mm_castsi128_ps(cmp_mask));
        //]

        // The number of matched lanes is the weight of the mask. The compiler
        // builtin is used so this count-only kernel builds without POPCNT
        // being enabled; with it enabled the same instruction is emitted.
        count += static_cast<size_t>(__builtin_popcount(static_cast<unsigned>(mask)));
    }

    // intersect the tail using scalar intersection
    while (i_a < s_a && i_b < s_b) {
        if (A[i_a] < B[i_b]) {
            i_a++;
        } else if (B[i_b] < A[i_a]) {
            i_b++;
        } else {
            count++;
            i_a++;
            i_b++;
        }
    }

    return count;
}
171 | 
172 | 
173 | 
174 | 
175 | 
176 | /**
177 |  * Taken almost verbatim from http://highlyscalable.wordpress.com/2012/06/05/fast-intersection-sorted-lists-sse/
178 |  *
179 |  * It is not safe for out to be either A or B.
180 |  */
181 | size_t intersect_SIMD(const uint32_t *A, const size_t s_a,
182 |         const uint32_t *B, const size_t s_b, uint32_t * out) {
183 |     assert(out != A);
184 |     assert(out != B);
185 |     const uint32_t * const initout (out);
186 |     size_t i_a = 0, i_b = 0;
187 | 
188 |     // trim lengths to be a multiple of 4
189 |     size_t st_a = (s_a / 4) * 4;
190 |     size_t st_b = (s_b / 4) * 4;
191 | 
192 |     while (i_a < st_a && i_b < st_b) {
193 |         //[ load segments of four 32-bit elements
194 |         __m128i v_a = _mm_load_si128((__m128i *) &A[i_a]);
195 |         __m128i v_b = _mm_load_si128((__m128i *) &B[i_b]);
196 |         //]
197 | 
198 |         //[ move pointers
199 |         const uint32_t a_max = A[i_a + 3];
200 |         const uint32_t b_max = B[i_b + 3];
201 |         i_a += (a_max <= b_max) * 4;
202 |         i_b += (a_max >= b_max) * 4;
203 |         //]
204 | 
205 |         //[ compute mask of common elements
206 |         const uint32_t cyclic_shift = _MM_SHUFFLE(0, 3, 2, 1);
207 |         __m128i cmp_mask1 = _mm_cmpeq_epi32(v_a, v_b); // pairwise comparison
208 |         v_b = _mm_shuffle_epi32(v_b, cyclic_shift); // shuffling
209 |         __m128i cmp_mask2 = _mm_cmpeq_epi32(v_a, v_b); // again...
210 |         v_b = _mm_shuffle_epi32(v_b, cyclic_shift);
211 |         __m128i cmp_mask3 = _mm_cmpeq_epi32(v_a, v_b); // and again...
212 |         v_b = _mm_shuffle_epi32(v_b, cyclic_shift);
213 |         __m128i cmp_mask4 = _mm_cmpeq_epi32(v_a, v_b); // and again.
214 |         __m128i cmp_mask = _mm_or_si128(_mm_or_si128(cmp_mask1, cmp_mask2),
215 |                 _mm_or_si128(cmp_mask3, cmp_mask4)); // OR-ing of comparison masks
216 |         // convert the 128-bit mask to the 4-bit mask
217 |         const int mask = _mm_movemask_ps((__m128 ) cmp_mask);
218 |         //]
219 | 
220 |         //[ copy out common elements
221 |         const __m128i p = _mm_shuffle_epi8(v_a, shuffle_mask[mask]);
222 |         _mm_storeu_si128((__m128i*)out, p);
223 |         out += _mm_popcnt_u32(mask); // a number of elements is a weight of the mask
224 |         //]
225 |     }
226 | 
227 |     // intersect the tail using scalar intersection
228 |     while (i_a < s_a && i_b < s_b) {
229 |         if (A[i_a] < B[i_b]) {
230 |             i_a++;
231 |         } else if (B[i_b] < A[i_a]) {
232 |             i_b++;
233 |         } else {
234 |             *out++ = B[i_b]; ;
235 |             i_a++;
236 |             i_b++;
237 |         }
238 |     }
239 | 
240 |     return out - initout;
241 | }
242 | 
/**
 * Optimized count-only variant of cardinality_intersect_SIMD: a block is
 * reloaded only when its cursor actually moves, and v_b is rotated with three
 * independent shuffles instead of a dependent chain.
 *
 * The inputs may be arbitrarily aligned: blocks are fetched with unaligned
 * loads (the aligned-load form faults on pointers that are not 16-byte
 * aligned).
 *
 * The block-skipping logic relies on A and B being sorted in ascending order.
 *
 * @param A   first array
 * @param s_a number of elements in A
 * @param B   second array
 * @param s_b number of elements in B
 * @return number of common values found
 */
size_t dan_cardinality_intersect_SIMD(const uint32_t *A, const size_t s_a,
        const uint32_t *B, const size_t s_b) {
    size_t count = 0;
    size_t i_a = 0, i_b = 0;
    // Rotations of the four lanes by one, two and three positions.
    constexpr int cyclic_shift1 = _MM_SHUFFLE(0, 3, 2, 1);
    constexpr int cyclic_shift2 = _MM_SHUFFLE(1, 0, 3, 2);
    constexpr int cyclic_shift3 = _MM_SHUFFLE(2, 1, 0, 3);

    // trim lengths to be a multiple of 4
    const size_t st_a = (s_a / 4) * 4;
    const size_t st_b = (s_b / 4) * 4;
    if (i_a < st_a && i_b < st_b) {
        __m128i v_a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(A + i_a));
        __m128i v_b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(B + i_b));
        while (true) {
            const __m128i cmp_mask1 = _mm_cmpeq_epi32(v_a, v_b); // pairwise comparison
            const __m128i cmp_mask2 = _mm_cmpeq_epi32(v_a,
                    _mm_shuffle_epi32(v_b, cyclic_shift1)); // again...
            __m128i cmp_mask = _mm_or_si128(cmp_mask1, cmp_mask2);
            const __m128i cmp_mask3 = _mm_cmpeq_epi32(v_a,
                    _mm_shuffle_epi32(v_b, cyclic_shift2)); // and again...
            cmp_mask = _mm_or_si128(cmp_mask, cmp_mask3);
            const __m128i cmp_mask4 = _mm_cmpeq_epi32(v_a,
                    _mm_shuffle_epi32(v_b, cyclic_shift3)); // and again.
            cmp_mask = _mm_or_si128(cmp_mask, cmp_mask4);
            // convert the 128-bit mask to the 4-bit mask (one bit per lane of v_a)
            const int mask = _mm_movemask_ps(_mm_castsi128_ps(cmp_mask));
            // The number of matched lanes is the weight of the mask. The
            // compiler builtin is used so this count-only kernel builds
            // without POPCNT being enabled; with it enabled the same
            // instruction is emitted.
            count += static_cast<size_t>(__builtin_popcount(static_cast<unsigned>(mask)));

            // Both maxima are read before either cursor moves, so the two
            // tests below see the same pair of blocks.
            const uint32_t a_max = A[i_a + 3];
            const uint32_t b_max = B[i_b + 3];
            if (a_max <= b_max) {
                i_a += 4;
                if (i_a >= st_a)
                    break;
                v_a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(A + i_a));
            }
            if (a_max >= b_max) {
                i_b += 4;
                if (i_b >= st_b)
                    break;
                v_b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(B + i_b));
            }

        }
    }

    // intersect the tail using scalar intersection
    while (i_a < s_a && i_b < s_b) {
        if (A[i_a] < B[i_b]) {
            i_a++;
        } else if (B[i_b] < A[i_a]) {
            i_b++;
        } else {
            count++;
            i_a++;
            i_b++;
        }
    }

    return count;
}
304 | 
305 | 
306 | /**
307 |  * Optimized version of http://highlyscalable.wordpress.com/2012/06/05/fast-intersection-sorted-lists-sse/
308 |  *
309 |  * It is not safe for out to be either A or B.
310 |  */
311 | size_t dan_intersect_SIMD(const uint32_t *A, const size_t s_a,
312 |         const uint32_t *B, const size_t s_b, uint32_t * out) {
313 |     assert(out != A);
314 |     assert(out != B);
315 |     const uint32_t * const initout (out);
316 |     size_t i_a = 0, i_b = 0;
317 |     const static uint32_t cyclic_shift1 = _MM_SHUFFLE(0, 3, 2, 1);
318 |     const static uint32_t cyclic_shift2 = _MM_SHUFFLE(1, 0, 3, 2);
319 |     const static uint32_t cyclic_shift3 = _MM_SHUFFLE(2, 1, 0, 3);
320 | 
321 |     // trim lengths to be a multiple of 4
322 |     size_t st_a = (s_a / 4) * 4;
323 |     size_t st_b = (s_b / 4) * 4;
324 |     if (i_a < st_a && i_b < st_b) {
325 |         __m128i v_a, v_b;
326 |         v_a = _mm_load_si128((__m128i *) &A[i_a]);
327 |         v_b = _mm_load_si128((__m128i *) &B[i_b]);
328 |         while (true) {
329 |             const __m128i cmp_mask1 = _mm_cmpeq_epi32(v_a, v_b); // pairwise comparison
330 |             const __m128i cmp_mask2 = _mm_cmpeq_epi32(v_a,
331 |                     _mm_shuffle_epi32(v_b, cyclic_shift1)); // again...
332 |             __m128i cmp_mask = _mm_or_si128(cmp_mask1, cmp_mask2);
333 |             const __m128i cmp_mask3 = _mm_cmpeq_epi32(v_a,
334 |                     _mm_shuffle_epi32(v_b, cyclic_shift2)); // and again...
335 |             cmp_mask = _mm_or_si128(cmp_mask, cmp_mask3);
336 |             const __m128i cmp_mask4 = _mm_cmpeq_epi32(v_a,
337 |                     _mm_shuffle_epi32(v_b, cyclic_shift3)); // and again.
338 |             cmp_mask = _mm_or_si128(cmp_mask, cmp_mask4);
339 |             // convert the 128-bit mask to the 4-bit mask
340 |             const int mask = _mm_movemask_ps((__m128 ) cmp_mask);
341 |             //]
342 | 
343 |             //[ copy out common elements
344 |             const __m128i p = _mm_shuffle_epi8(v_a, shuffle_mask[mask]);
345 | 
346 |             _mm_storeu_si128((__m128i*)out, p);
347 |             //]
348 |             out += _mm_popcnt_u32(mask); // a number of elements is a weight of the mask
349 | 
350 |             const uint32_t a_max = A[i_a + 3];
351 |             //const uint32_t b_max = B[i_b + 3];
352 |             if (a_max <= B[i_b + 3]) {
353 |                 i_a += 4;
354 |                 if (i_a >= st_a)
355 |                     break;
356 |                 v_a = _mm_load_si128((__m128i *) &A[i_a]);
357 |             }
358 |             if (a_max >= B[i_b + 3]) {
359 |                 i_b += 4;
360 |                 if (i_b >= st_b)
361 |                     break;
362 |                 v_b = _mm_load_si128((__m128i *) &B[i_b]);
363 |             }
364 | 
365 |         }
366 |     }
367 | 
368 |     // intersect the tail using scalar intersection
369 |     while (i_a < s_a && i_b < s_b) {
370 |         if (A[i_a] < B[i_b]) {
371 |             i_a++;
372 |         } else if (B[i_b] < A[i_a]) {
373 |             i_b++;
374 |         } else {
375 |             *out++ = B[i_b];
376 |             i_a++;
377 |             i_b++;
378 |         }
379 |     }
380 | 
381 |     return out - initout;
382 | }
383 | 
384 | 
385 | }
386 | 
387 | #endif /* HSCALABLEINTERSECTION_H_ */
388 | 


--------------------------------------------------------------------------------
/include/hybridintersection.h:
--------------------------------------------------------------------------------
 1 | #ifndef HYBRIDINTERSECTION_H_
 2 | #define HYBRIDINTERSECTION_H_
 3 | 
 4 | #include "intersection.h"
 5 | #include "gallopingintersection.h"
 6 | #include "binarysearchintersection.h"
 7 | #include "mediumintersection.h"
 8 | #include "widevectorintersection.h"
 9 | #include "hscalableintersection.h"
10 | #include "match.h"
11 | size_t danielshybridintersectioncardinality(const uint32_t * set1,
12 |         const size_t length1, const uint32_t * set2, const size_t length2) {
13 |         if ((200 * length1 < length2) or (200 * length2 < length1)) {
14 |             if (length1 <= length2)
15 |                 return danfar_count_medium(set1, length1,
16 |                         set2, length2);
17 |             else
18 |                 return danfar_count_medium(set2, length2,
19 |                         set1, length1);
20 |         } else {
21 |             if (length1 <= length2)
22 |                 return natedanalt_count_medium(set1, length1, set2, length2);
23 |             else
24 |                 return natedanalt_count_medium(set2, length2, set1, length1);
25 |         }
26 | }
27 | 
28 | size_t olddanielshybridintersection(const uint32_t * set1,
29 |         const size_t length1, const uint32_t * set2, const size_t length2, uint32_t *out) {
30 |     if ((10 * length1 <= length2) or (10 * length2 <= length1)) {
31 |         if ((200 * length1 < length2) or (200 * length2 < length1)) {
32 |             if (length1 <= length2)
33 |                 return danfar_medium(set1, length1,
34 |                         set2, length2,out);
35 |             else
36 |                 return danfar_medium(set2, length2,
37 |                         set1, length1,out);
38 |         } else {
39 |             if (length1 <= length2)
40 |                 return natedanalt_medium(set1, length1, set2, length2,out);
41 |             else
42 |                 return natedanalt_medium(set2, length2, set1, length1,out);
43 |         }
44 |     }
45 |     return highlyscalablewordpresscom::dan_intersect_SIMD(set1, length1, set2, length2,out);
46 | }
47 | 
48 | size_t danielshybridintersection(const uint32_t * set1,
49 |         const size_t length1, const uint32_t * set2, const size_t length2, uint32_t *out) {
50 |     if ((length1==0) or (length2 == 0)) return 0;
51 | 
52 |     if ((50 * length1 <= length2) or (50 * length2 <= length1)) {
53 |             if (length1 <= length2)
54 |                 return danfarfar_medium(set1, length1, set2, length2,out);
55 |             else
56 |                 return danfarfar_medium(set2, length2, set1, length1,out);
57 |     }
58 | 
59 |    if (length1 <= length2)
60 |         return match_v4_f2_p0(set1, length1, set2, length2, out);
61 |     else
62 |         return match_v4_f2_p0(set2, length2, set1, length1, out);
63 | 
64 | }
65 | size_t SIMDintersection(const uint32_t *set1,
66 |                         const size_t length1, const uint32_t *set2, const size_t length2, uint32_t *out) {
67 |     if ((length1 == 0) or (length2 == 0)) return 0;
68 | 
69 | 
70 |     if ((1000 * length1 <= length2) or (1000 * length2 <= length1)) {
71 |         if (length1 <= length2)
72 |             return SIMDgalloping(set1, length1, set2, length2, out);
73 |         else
74 |             return SIMDgalloping(set2, length2, set1, length1, out);
75 |     }
76 | 
77 |     if ((50 * length1 <= length2) or (50 * length2 <= length1)) {
78 |         if (length1 <= length2)
79 |             return v3(set1, length1, set2, length2, out);
80 |         else
81 |             return v3(set2, length2, set1, length1, out);
82 |     }
83 | 
84 |     if (length1 <= length2)
85 |         return v1(set1, length1, set2, length2, out);
86 |     else
87 |         return v1(set2, length2, set1, length1, out);
88 | }
89 | 
90 | 
91 | 
92 | #endif /* HYBRIDINTERSECTION_H_ */
93 | 


--------------------------------------------------------------------------------
/include/inoueetal.h:
--------------------------------------------------------------------------------
  1 | 
  2 | #ifndef INOUETAL_H_
  3 | #define INOUETAL_H_
	// Lookup table: popcnt_u32_4bit[i] is the number of set bits in i, for i in 0..15.
	const static int popcnt_u32_4bit[16] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };

	// Byte-shuffle controls indexed by a 4-bit lane mask. Entry i moves the
	// 32-bit lanes whose bit is set in i to the front of the register, in their
	// original order; the remaining lanes hold leftovers with no meaning.
	// _mm_set_epi8 lists bytes from most significant to least significant, so
	// each row reads right to left.
	const static __m128i shuffle_mask[16] = {
		_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
		_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
		_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 7, 6, 5, 4),
		_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
		_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 11, 10, 9, 8),
		_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 11, 10, 9, 8, 3, 2, 1, 0),
		_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 11, 10, 9, 8, 7, 6, 5, 4),
		_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
		_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 15, 14, 13, 12),
		_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 15, 14, 13, 12, 3, 2, 1, 0),
		_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 15, 14, 13, 12, 7, 6, 5, 4),
		_mm_set_epi8(15, 14, 13, 12, 15, 14, 13, 12, 7, 6, 5, 4, 3, 2, 1, 0),
		_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 15, 14, 13, 12, 11, 10, 9, 8),
		_mm_set_epi8(15, 14, 13, 12, 15, 14, 13, 12, 11, 10, 9, 8, 3, 2, 1, 0),
		_mm_set_epi8(15, 14, 13, 12, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4),
		_mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0),
	};
 24 | 
 25 | 
 26 | 
 27 | 
 28 | 	/* Writes the values common to A[0..s_a) and B[0..s_b) to out and returns how many were written.
 29 | 	* Inspired by "Faster Set Intersection with SIMD instructions
 30 | 	* by Reducing Branch Mispredictions".
 31 | 	* The block-advance logic below relies on both inputs being sorted in increasing order. */
 32 | 	 size_t SIMDIntersectWithPrefilter(const uint32_t *A, const size_t s_a,
 33 | 		const uint32_t *B, const size_t s_b, uint32_t * out) {
 34 | 		// Only the base pointers are checked: out must not start at A or at B.
 35 | 		assert(out != A);
 36 | 		assert(out != B);
 37 | 		size_t i_a = 0, i_b = 0, i_out = 0;
 38 | 
 39 | 		// trim lengths to be a multiple of 4
 40 | 		size_t st_a = (s_a / 4) * 4;
 41 | 		size_t st_b = (s_b / 4) * 4;
 42 | 		// Vector phase: entered only when each input holds at least one full block of 4 values.
 43 | 		if (i_a < st_a && i_b < st_b) {
 44 | 			const __m128i a_mask = _mm_set_epi8(12, 12, 12, 12, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0); // low byte of each A value, repeated 4 times
 45 | 			const __m128i b_mask = _mm_set_epi8(12, 8, 4, 0, 12, 8, 4, 0, 12, 8, 4, 0, 12, 8, 4, 0); // the 4 low bytes of B, listed 4 times
 46 | 			// With these two layouts one byte-wise compare tests every A value against every B value.
 47 | 			// load initial data in registers.
 48 | 			__m128i v_a = _mm_loadu_si128((__m128i *) &A[i_a]);
 49 | 			__m128i v_b = _mm_loadu_si128((__m128i *) &B[i_b]);
 50 | 			__m128i v_a_filter = _mm_shuffle_epi8(v_a, a_mask);
 51 | 			__m128i v_b_filter = _mm_shuffle_epi8(v_b, b_mask);
 52 | 			uint32_t a_max = A[3];
 53 | 			uint32_t b_max = B[3];
 54 | 			// a_max / b_max (last value of each current block) decide below which side moves forward.
 55 | 			for(;;) {
 56 | 
 57 | 				// check for potential intersection of least significant byte.
 58 | 				__m128i v_c = _mm_cmpeq_epi8(v_a_filter, v_b_filter);
 59 | 
 60 | 				if (!_mm_movemask_epi8(v_c)) {
 61 | 					// No hit so load the next 4 lowest bytes from smallest
 62 | 				advance: // also entered via goto once a block pair has been fully compared
 63 | 					const uint32_t a_max_local = a_max;
 64 | 					const uint32_t b_max_local = b_max;
 65 | 					if (a_max_local <= b_max_local) {
 66 | 						i_a += 4;
 67 | 						if (i_a < st_a) {
 68 | 							v_a = _mm_loadu_si128((__m128i *) &A[i_a]);
 69 | 							v_a_filter = _mm_shuffle_epi8(v_a, a_mask);
 70 | 							a_max = _mm_extract_epi32(v_a, 3);
 71 | 						}
 72 | 						else {
 73 | 							break;
 74 | 						}
 75 | 					}
 76 | 					if (a_max_local >= b_max_local) {
 77 | 						i_b += 4;
 78 | 						if (i_b < st_b) {
 79 | 							v_b = _mm_loadu_si128((__m128i *) &B[i_b]);
 80 | 							v_b_filter = _mm_shuffle_epi8(v_b, b_mask);
 81 | 							b_max = _mm_extract_epi32(v_b, 3);
 82 | 						}
 83 | 						else {
 84 | 							break;
 85 | 						}
 86 | 					}
 87 | 				} else {
 88 | 					// Some low byte matched: compare the full 32-bit values of the two blocks.
 89 | 					// TODO: Any way to figure how to do this without having to copy registers?
 90 | 					// If we can, that would free up more registers when we implement unrolling.
 91 | 					__m128i v_as = v_a;
 92 | 					__m128i v_bs = v_b;
 93 | 
 94 | 					//[ compute mask of common elements
 95 | 					const uint32_t cyclic_shift = _MM_SHUFFLE(0, 3, 2, 1);
 96 | 					__m128i cmp_mask1 = _mm_cmpeq_epi32(v_as, v_bs); // pairwise comparison
 97 | 					v_bs = _mm_shuffle_epi32(v_bs, cyclic_shift); // shuffling
 98 | 					__m128i cmp_mask2 = _mm_cmpeq_epi32(v_as, v_bs); // again...
 99 | 					v_bs = _mm_shuffle_epi32(v_bs, cyclic_shift);
100 | 					__m128i cmp_mask3 = _mm_cmpeq_epi32(v_as, v_bs); // and again...
101 | 					v_bs = _mm_shuffle_epi32(v_bs, cyclic_shift);
102 | 					__m128i cmp_mask4 = _mm_cmpeq_epi32(v_as, v_bs); // and again.
103 | 					__m128i cmp_mask = _mm_or_si128(_mm_or_si128(cmp_mask1, cmp_mask2),
104 | 						_mm_or_si128(cmp_mask3, cmp_mask4)); // OR-ing of comparison masks
105 | 					// convert the 128-bit mask to the 4-bit mask
106 | 					const int mask = _mm_movemask_ps(_mm_castsi128_ps(cmp_mask));
107 | 					//]
108 | 
109 | 					//[ copy out common elements
110 | 					const __m128i p = _mm_shuffle_epi8(v_as, shuffle_mask[mask]); // pack the matching A values to the front
111 | 					_mm_storeu_si128((__m128i*)(out + i_out), p); // stores 16 bytes regardless of how many values matched
112 | 					i_out += popcnt_u32_4bit[mask]; // a number of elements is a weight of the mask
113 | 					//]
114 | 
115 | 					goto advance;
116 | 				}
117 | 			}
118 | 		}
119 | 
120 | 
121 | 		// intersect the tail using scalar intersection
122 | 		// (starts at i_a / i_b, i.e. where the vector phase stopped)
123 | 		while (i_a < s_a && i_b < s_b) {
124 | 			const uint32_t a = A[i_a];
125 | 			const uint32_t b = B[i_b];
126 | 			if (a != b) {
127 | 				if (a <= b) {
128 | 					i_a++;
129 | 				}
130 | 				if (a >= b) {
131 | 					i_b++;
132 | 				}
133 | 			} else {
134 | 				out[i_out++] = a;
135 | 				i_a++;
136 | 				i_b++;
137 | 			}
138 | 		}
139 | 
140 | 		return i_out;
141 | 	}
142 | 
143 | 	 size_t lemireSIMDIntersectWithPrefilter(const uint32_t *A, const size_t s_a,
144 | 		const uint32_t *B, const size_t s_b, uint32_t * out) {
145 | 		// Writes the values common to A and B to out and returns their count; variant of SIMDIntersectWithPrefilter that rotates v_b without register copies and counts with _mm_popcnt_u32.
146 | 		assert(out != A);
147 | 		assert(out != B);
148 | 		size_t i_a = 0, i_b = 0, i_out = 0;
149 | 
150 | 		// trim lengths to be a multiple of 4
151 | 		size_t st_a = (s_a / 4) * 4;
152 | 		size_t st_b = (s_b / 4) * 4;
153 | 		// Vector phase: entered only when each input holds at least one full block of 4 values.
154 | 		if (i_a < st_a && i_b < st_b) {
155 | 			const __m128i a_mask = _mm_set_epi8(12, 12, 12, 12, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0); // low byte of each A value, repeated 4 times
156 | 			const __m128i b_mask = _mm_set_epi8(12, 8, 4, 0, 12, 8, 4, 0, 12, 8, 4, 0, 12, 8, 4, 0); // the 4 low bytes of B, listed 4 times
157 | 
158 | 			// load initial data in registers.
159 | 			__m128i v_a = _mm_loadu_si128((__m128i *) &A[i_a]);
160 | 			__m128i v_b = _mm_loadu_si128((__m128i *) &B[i_b]);
161 | 			__m128i v_a_filter = _mm_shuffle_epi8(v_a, a_mask);
162 | 			__m128i v_b_filter = _mm_shuffle_epi8(v_b, b_mask);
163 | 			uint32_t a_max = A[3];
164 | 			uint32_t b_max = B[3];
165 | 			// a_max / b_max (last value of each current block) decide below which side moves forward.
166 | 			for(;;) {
167 | 
168 | 				// check for potential intersection of least significant byte.
169 | 				__m128i v_c = _mm_cmpeq_epi8(v_a_filter, v_b_filter);
170 | 
171 | 				if (!_mm_movemask_epi8(v_c)) {
172 | 					// No hit so load the next 4 lowest bytes from smallest
173 | 				advance: // also entered via goto once a block pair has been fully compared
174 | 					const uint32_t a_max_local = a_max;
175 | 					const uint32_t b_max_local = b_max;
176 | 					if (a_max_local <= b_max_local) {
177 | 						i_a += 4;
178 | 						if (i_a < st_a) {
179 | 							v_a = _mm_loadu_si128((__m128i *) &A[i_a]);
180 | 							v_a_filter = _mm_shuffle_epi8(v_a, a_mask);
181 | 							a_max = _mm_extract_epi32(v_a, 3);
182 | 						}
183 | 						else {
184 | 							break;
185 | 						}
186 | 					}
187 | 					if (a_max_local >= b_max_local) {
188 | 						i_b += 4;
189 | 						if (i_b < st_b) {
190 | 							v_b = _mm_loadu_si128((__m128i *) &B[i_b]);
191 | 							v_b_filter = _mm_shuffle_epi8(v_b, b_mask);
192 | 							b_max = _mm_extract_epi32(v_b, 3);
193 | 						}
194 | 						else {
195 | 							break;
196 | 						}
197 | 					}
198 | 				} else {
199 | 				    const static uint32_t cyclic_shift1 = _MM_SHUFFLE(0, 3, 2, 1); // rotate the lanes of v_b by one position
200 | 				    const static uint32_t cyclic_shift2 = _MM_SHUFFLE(1, 0, 3, 2); // ... by two positions
201 | 				    const static uint32_t cyclic_shift3 = _MM_SHUFFLE(2, 1, 0, 3); // ... by three positions
202 | 		            const __m128i cmp_mask1 = _mm_cmpeq_epi32(v_a, v_b); // pairwise comparison
203 | 		            const __m128i cmp_mask2 = _mm_cmpeq_epi32(v_a,
204 | 		                    _mm_shuffle_epi32(v_b, cyclic_shift1)); // again...
205 | 		            __m128i cmp_mask = _mm_or_si128(cmp_mask1, cmp_mask2);
206 | 		            const __m128i cmp_mask3 = _mm_cmpeq_epi32(v_a,
207 | 		                    _mm_shuffle_epi32(v_b, cyclic_shift2)); // and again...
208 | 		            cmp_mask = _mm_or_si128(cmp_mask, cmp_mask3);
209 | 		            const __m128i cmp_mask4 = _mm_cmpeq_epi32(v_a,
210 | 		                    _mm_shuffle_epi32(v_b, cyclic_shift3)); // and again.
211 | 		            cmp_mask = _mm_or_si128(cmp_mask, cmp_mask4);
212 | 					// convert the 128-bit mask to the 4-bit mask
213 | 					const int mask = _mm_movemask_ps(_mm_castsi128_ps(cmp_mask));
214 | 					//] (end of the common-element mask computation)
215 | 
216 | 					//[ copy out common elements
217 | 					const __m128i p = _mm_shuffle_epi8(v_a, shuffle_mask[mask]); // pack the matching A values to the front
218 | 					_mm_storeu_si128((__m128i*)(out + i_out), p); // stores 16 bytes regardless of how many values matched
219 | 					i_out += _mm_popcnt_u32(mask); // a number of elements is a weight of the mask
220 | 					//]
221 | 
222 | 					goto advance;
223 | 				}
224 | 			}
225 | 		}
226 | 
227 | 
228 | 		// intersect the tail using scalar intersection
229 | 		// (starts at i_a / i_b, i.e. where the vector phase stopped)
230 | 		while (i_a < s_a && i_b < s_b) {
231 | 			const uint32_t a = A[i_a];
232 | 			const uint32_t b = B[i_b];
233 | 			if (a != b) {
234 | 				if (a <= b) {
235 | 					i_a++;
236 | 				}
237 | 				if (a >= b) {
238 | 					i_b++;
239 | 				}
240 | 			} else {
241 | 				out[i_out++] = a;
242 | 				i_a++;
243 | 				i_b++;
244 | 			}
245 | 		}
246 | 
247 | 		return i_out;
248 | 	}
249 | 
250 | 
251 | #endif /* INOUETAL_H_ */
252 | 


--------------------------------------------------------------------------------
/include/intersection.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * This code is released under the
 3 |  * Apache License Version 2.0 http://www.apache.org/licenses/.
 4 |  *
 5 |  * (c) Daniel Lemire, http://lemire.me/en/
 6 |  */
 7 | 
 8 | #ifndef INTERSECTION_H_
 9 | #define INTERSECTION_H_
10 | 
11 | #include "common.h"
12 | typedef size_t (*intersectionfunction)(const uint32_t * set1,
13 |         const size_t length1, const uint32_t * set2, const size_t length2, uint32_t * out); // two input arrays with their lengths, plus an output array
14 | 
15 | // Same inputs as intersectionfunction but without an output array.
16 | typedef size_t (*cardinalityintersectionfunction)(const uint32_t * set1,
17 |         const size_t length1, const uint32_t * set2, const size_t length2);
18 | // Variant over uint16_t arrays; note that both pointers come first, then both lengths.
19 | typedef size_t (*cardinalityintersectionfunctionpart)(const uint16_t *A,
20 |         const uint16_t *B, const size_t s_a, const size_t s_b);
21 | 
22 | /**
23 |  * Compute the *cardinality* of the intersection between two *sorted*
24 |  * arrays.
25 |  *
26 |  * Algorithm design by D. Lemire. It uses several while loops on
27 |  * purpose.
28 |  *
29 |  */
30 | size_t danscalarintersectioncardinality(const uint32_t * set1, const size_t length1,
31 |         const uint32_t * set2, const size_t length2);
32 | 
33 | /**
34 |  * Compute the intersection between two *sorted* arrays; unlike
35 |  * danscalarintersectioncardinality, this variant also takes an output array.
36 |  *
37 |  * Algorithm design by D. Lemire. It uses several while loops on
38 |  * purpose.
39 |  *
40 |  */
41 | size_t danscalarintersection(const uint32_t * set1, const size_t length1,
42 |         const uint32_t * set2, const size_t length2, uint32_t * out) ;
43 | /**
44 |  * This is the classical approach
45 |  */
46 | size_t classicalintersectioncardinality(const uint32_t * set1,
47 |         const size_t length1, const uint32_t * set2, const size_t length2);
48 | /**
49 |  * This is the classical approach
50 |  */
51 | size_t classicalintersection(const uint32_t * set1,
52 |         const size_t length1, const uint32_t * set2, const size_t length2, uint32_t * out) ;
53 | 
54 | 
55 | #endif /* INTERSECTION_H_ */
56 | 


--------------------------------------------------------------------------------
/include/intersectionfactory.h:
--------------------------------------------------------------------------------
  1 | 
  2 | #ifndef INTERSECTIONFACTORY_H_
  3 | #define INTERSECTIONFACTORY_H_
  4 | 
  5 | #include "common.h"
  6 | #include "intersection.h"
  7 | #include "partitionedintersection.h"
  8 | #include "hscalableintersection.h"
  9 | #include "gallopingintersection.h"
 10 | #include "binarysearchintersection.h"
 11 | #include "hybridintersection.h"
 12 | #include "mediumintersection.h"
 13 | #include "widevectorintersection.h"
 14 | #include "branchless.h"
 15 | #include "match.h"
 16 | #include "thomaswu.h"
 17 | #include "inoueetal.h"
 18 | #include "tetzank.h"
 19 | 
 20 | 
 21 | 
 22 | 
 23 | 
 24 | std::map<std::string,intersectionfunction> realinitializefactory() { // name -> routine that takes an output array
 25 |     std::map<std::string,intersectionfunction> registry;
 26 |     registry["inoue"] = SIMDIntersectWithPrefilter;
 27 |     registry["lemireinoue"] = lemireSIMDIntersectWithPrefilter;
 28 |     registry["V1"] = V1;
 29 | #ifdef __AVX2__
 30 |     registry["V1AVX"] = V1AVX;
 31 |     registry["tetzankshuffle"] = tetzank_intersect_vector_avx2;
 32 | #endif
 33 |     registry["f2p0"] = match_v4_f2_p0;
 34 |     registry["f4p0"] = match_v4_f4_p0;
 35 |     registry["f8p0"] = match_v4_f8_p0;
 36 |     // "branchless" and "scalardanbranchless" below are two names for the same routine.
 37 |     registry["branchless"] = branchlessintersection;
 38 |     registry["scalarbranchlesscached"] = scalar_branchless_cached;
 39 |     registry["scalarbranchlesscached2"] = scalar_branchless_cached2;
 40 |     registry["scalardanbranchless"] = branchlessintersection;
 41 |     registry["scalarbranchless"] = scalar_branchless;
 42 |     registry["scalarbranchlessunrolled"] = scalar_branchless_unrolled;
 43 |     registry["@hybriddan"] = danielshybridintersection;
 44 | 
 45 | 
 46 |     registry["widevector"] = widevector_intersect;
 47 |     registry["widevectorleo"] = leowidevector_intersect;
 48 | 
 49 |     registry["natemediumdanalt"] = natedanalt_medium;
 50 |     registry["danfar"] = danfar_medium;
 51 |     registry["danfarmov"] = danfar_medium_mov;
 52 | 
 53 |     registry["danfarfar"] = danfarfar_medium;
 54 | 
 55 |     registry["scalarnate"] = nate_scalar;
 56 |     registry["scalarnatewg"] = nate_scalarwithoutgoto;
 57 | 
 58 |     registry["scalar1sgalloping"] = onesidedgallopingintersection;
 59 |     registry["v1"] = v1;
 60 |     registry["v3"] = v3;
 61 | #ifdef __AVX2__
 62 |     registry["v3avx2"] = v3avx2;
 63 | #endif
 64 | 
 65 |     registry["simdgalloping"] = SIMDgalloping;
 66 | #ifdef __AVX2__
 67 |     registry["simdgalloping_avx2"] = SIMDgalloping_avx2;
 68 | #endif
 69 |     registry["simdgalloping2"] = SIMDgalloping2;
 70 |     registry["hssimd"] = highlyscalablewordpresscom::intersect_SIMD;
 71 |     registry["hssimddan"] = highlyscalablewordpresscom::dan_intersect_SIMD;
 72 | 
 73 |     // The entries below are disabled.
 74 |     /*registry[ "thomas_scalar" ] = compute_intersection<Intersection_find_scalar>;
 75 |     registry[ "thomas_gallop" ] =  compute_intersection<Intersection_find_gallop>;
 76 |     registry[ "thomas_v1" ] =  compute_intersection<Intersection_find_v1>;
 77 |     registry[ "thomas_v1_plow" ] =  compute_intersection<Intersection_find_v1_plow>;
 78 |     registry[ "thomas_v2" ] =  compute_intersection<Intersection_find_v2>;
 79 |     registry[ "thomas_v3" ] =  compute_intersection<Intersection_find_v3>;
 80 |     registry[ "thomas_v3_aligned" ] =  compute_intersection<Intersection_find_v3_aligned>;
 81 |     registry[ "thomas_simdgallop_v0" ] =  compute_intersection<Intersection_find_simdgallop_v0>;
 82 |     registry[ "thomas_simdgallop_v1" ] =  compute_intersection<Intersection_find_simdgallop_v1>;
 83 |     registry[ "thomas_simdgallop_v2" ] =  compute_intersection<Intersection_find_simdgallop_v2>;
 84 |     registry[ "thomas_simdgallop_v3" ] =  compute_intersection<Intersection_find_simdgallop_v3>;
 85 |     registry[ "thomas_v3cmpeqflagged" ] =  compute_intersection_flagged<Intersection_find_v3_cmpeq>;
 86 |     registry[ "thomas_v3cmpeqscalarflagged" ] =  compute_intersection_flagged<Intersection_truefind_v3_cmpeq_scalar>;
 87 |     registry[ "thomas_v3cmpeqsimd32flagged" ] =  compute_intersection_flagged<Intersection_truefind_v3_cmpeq_simd32>;
 88 |     registry[ "thomas_v3cmpeqsimd8flagged" ] =  compute_intersection_flagged<Intersection_truefind_v3_cmpeq_simd8>;
 89 |     registry[ "thomas_v3cmpeqbinaryflagged" ] =  compute_intersection_flagged<Intersection_truefind_v3_cmpeq_binary>;
 90 |     */
 91 |     return registry;
 92 | }
 93 | 
 94 | 
 95 | std::map<std::string,cardinalityintersectionfunction> initializefactory() { // name -> cardinality-only routine
 96 |     std::map<std::string,cardinalityintersectionfunction> registry;
 97 |     registry["@hybriddan"] = danielshybridintersectioncardinality;
 98 | #ifdef __AVX2__
 99 |     registry["tetzankshuffle"] = tetzank_intersect_vector_avx2_count;
100 | #endif
101 |     registry["widevector"] = widevector_cardinality_intersect;
102 |     registry["widevectorleo"] = leowidevector_cardinality_intersect;
103 | 
104 |     registry["scalargalloping"] = frogintersectioncardinality;
105 |     registry["scalar1sgalloping"] = onesidedgallopingintersectioncardinality;
106 |     registry["scalarnate"] = nate_count_scalar;
107 | 
108 |     registry["hssimd"] = highlyscalablewordpresscom::cardinality_intersect_SIMD;
109 |     registry["hssimddan"] = highlyscalablewordpresscom::dan_cardinality_intersect_SIMD;
110 | 
111 |     registry["natemedium"] = nate_count_medium;
112 |     registry["natemediumdan"] = natedan_count_medium;
113 |     registry["natemediumdanalt"] = natedanalt_count_medium;
114 |     registry["danfar"] = danfar_count_medium;
115 | 
116 |     registry["natemediumfarfine"] = danfarfine_count_medium;
117 |     return registry;
118 | }
119 | 
120 | std::set<std::string> initializebuggy() {
121 |     // The returned set is what the buggyschemes global is initialized from.
122 |     std::set<std::string> flagged = { "widevectorleo" };//makes some assumptions
123 |     return flagged;
124 | }
125 | 
126 | std::map<std::string,cardinalityintersectionfunction> schemes = initializefactory(); // cardinality-only routines, by name
127 | std::map<std::string,intersectionfunction> realschemes = realinitializefactory(); // routines taking an output array, by name
128 | // NOTE(review): these are non-inline definitions in a header; including it from two translation units of one program would define them twice.
129 | std::set<std::string> buggyschemes = initializebuggy();
130 | 
131 | 
132 | std::map<std::string,cardinalityintersectionfunctionpart> initializefactorypart() {
133 |     std::map<std::string,cardinalityintersectionfunctionpart> registry; // name -> routine over uint16_t arrays
134 |     registry["schlegel"] = partitioned::cardinality_intersect_partitioned;
135 |     registry["danschlegel"] = partitioned::faster_cardinality_intersect_partitioned;
136 |     return registry;
137 | }
138 | 
139 | 
140 | std::map<std::string,cardinalityintersectionfunctionpart> partschemes = initializefactorypart(); // routines with the uint16_t-array signature, by name
141 | 
142 | /**
143 |  * Lists the keys of schemes, followed by the keys of partschemes.
144 |  */
145 | std::vector<std::string> allNames() {
146 |     std::vector<std::string> names;
147 |     for (const auto &entry : schemes) {
148 |         names.push_back(entry.first);
149 |     }
150 |     for (const auto &entry : partschemes) {
151 |         names.push_back(entry.first);
152 |     }
153 |     return names;
154 | }
155 | /**
156 |  * Lists the keys of realschemes, in the map's iteration order.
157 |  */
158 | std::vector<std::string> allRealNames() {
159 |     std::vector<std::string> names;
160 |     for (const auto &entry : realschemes) {
161 |         names.push_back(entry.first);
162 |     }
163 |     return names;
164 | }
165 | 
166 | 
167 | #endif /* INTERSECTIONFACTORY_H_ */
168 | 


--------------------------------------------------------------------------------
/include/match.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <stdint.h>
  4 | #include <stddef.h>
  5 | #include <immintrin.h>
  6 | #include <assert.h>
  7 | 
  8 | #define VEC_T __m128i
  9 | // VEC_COPY_LOW: movd from an XMM register into a general-purpose register (its low 32 bits).
 10 | #define VEC_COPY_LOW(reg_dest, xmm_src)                                 \
 11 |     __asm volatile("movd %1, %0" : "=r" (reg_dest) : "x" (xmm_src))
 12 | // VEC_OR: dest |= other on whole XMM registers (por).
 13 | #define VEC_OR(dest, other)                                             \
 14 |     __asm volatile("por %1, %0" : "+x" (dest) : "x" (other) )
 15 | // VEC_ADD_PTEST: var becomes var + add when xmm has any bit set (ptest, then cmovnz); otherwise var is left alone.
 16 | #define VEC_ADD_PTEST(var, add, xmm)      {                             \
 17 |         typeof(var) _new = var + add;                                   \
 18 |         __asm volatile("ptest %2, %2\n\t"                           \
 19 |                            "cmovnz %1, %0\n\t"                          \
 20 |                            : /* writes */ "+r" (var)                    \
 21 |                            : /* reads */  "r" (_new), "x" (xmm)         \
 22 |                            : /* clobbers */ "cc");                      \
 23 |     }
 24 | 
 25 | // VEC_CMP_GREATER: per 32-bit lane, dest becomes the result of dest > other (pcmpgtd).
 26 | // this macro does a signed comparison
 27 | #define VEC_CMP_GREATER(dest, other)                                    \
 28 |     __asm volatile("pcmpgtd %1, %0" : "+x" (dest) : "x" (other))
 29 | // VEC_CMP_EQUAL: per 32-bit lane, dest becomes the result of dest == other (pcmpeqd).
 30 | #define VEC_CMP_EQUAL(dest, other)                                      \
 31 |     __asm volatile("pcmpeqd %1, %0" : "+x" (dest) : "x" (other))
 32 | // VEC_SET_ALL_TO_INT: loads int32 with movd, then pshufd $0 copies lane 0 to all four lanes.
 33 | #define VEC_SET_ALL_TO_INT(reg, int32)                                  \
 34 |     __asm volatile("movd %1, %0; pshufd $0, %0, %0"                 \
 35 |                        : "=x" (reg) : "g" (int32) )
 36 | // VEC_LOAD_OFFSET: movdqu load from ptr + bytes; bytes is an immediate operand, so it must be a compile-time constant.
 37 | #define VEC_LOAD_OFFSET(xmm, ptr, bytes)                    \
 38 |     __asm volatile("movdqu %c2(%1), %0" : "=x" (xmm) :  \
 39 |                    "r" (ptr), "i" (bytes))
 40 | // Branch hints built on __builtin_expect: expected outcome 1 (LIKELY) or 0 (RARELY).
 41 | #define COMPILER_LIKELY(x)     __builtin_expect((x),1)
 42 | #define COMPILER_RARELY(x)     __builtin_expect((x),0)
 43 | // ASM_LEA_ADD_BYTES: ptr += bytes through lea; bytes is an immediate operand. The expansion already ends with a semicolon.
 44 | #define ASM_LEA_ADD_BYTES(ptr, bytes)                            \
 45 |     __asm volatile("lea %c1(%0), %0\n\t" :                       \
 46 |                    /* reads/writes %0 */  "+r" (ptr) :           \
 47 |                    /* reads */ "i" (bytes));
 48 | 
 49 | 
 50 | #ifdef __cplusplus
 51 | 
 52 | #define typeof(arg) decltype(arg)
 53 | 
 54 | extern "C" {
 55 | #endif
 56 | 
 57 | size_t match_scalar(const uint32_t *A, const size_t lenA,
 58 |                     const uint32_t *B, const size_t lenB,
 59 |                     uint32_t *out);
 60 | 
 61 | // like match_v4_f2_p0 but more portable
 62 | size_t V1
 63 | (const uint32_t *rare, size_t lenRare,
 64 |  const uint32_t *freq, size_t lenFreq,
 65 |  uint32_t *matchOut);
 66 | 
 67 | #ifdef __AVX2__
 68 | size_t V1AVX
 69 | (const uint32_t *rare, size_t lenRare,
 70 |  const uint32_t *freq, size_t lenFreq,
 71 |  uint32_t *matchOut);
 72 | #endif
 73 | 
 74 | size_t match_v4_f2_p0
 75 | (const uint32_t *rare, size_t lenRare,
 76 |  const uint32_t *freq, size_t lenFreq,
 77 |  uint32_t *matchOut);
 78 | 
 79 | 
 80 | // Proxy for match_v4_f2_p0: forwards all five arguments unchanged and returns its result.
 81 | inline size_t v1(const uint32_t *rare, size_t lenRare,
 82 |         const uint32_t *freq, size_t lenFreq,
 83 |         uint32_t *matchOut) {
 84 |     return match_v4_f2_p0(rare,lenRare,freq,lenFreq,matchOut);
 85 | }
 86 | 
 87 | size_t match_v4_f4_p0
 88 | (const uint32_t *rare, size_t lenRare,
 89 |  const uint32_t *freq, size_t lenFreq,
 90 |  uint32_t *matchOut);
 91 | 
 92 | 
 93 | size_t match_v4_f8_p0
 94 | (const uint32_t *rare, size_t lenRare,
 95 |  const uint32_t *freq, size_t lenFreq,
 96 |  uint32_t *matchOut);
 97 | 
 98 | 
 99 | // unsafe, assumes signed ints
100 | size_t match_v4_f2_p1
101 | (const uint32_t *rare, size_t lenRare,
102 |  const uint32_t *freq, size_t lenFreq,
103 |  uint32_t *matchOut);
104 | 
105 | // unsafe, assumes signed ints
106 | size_t match_v4_f4_p1
107 | (const uint32_t *rare, size_t lenRare,
108 |  const uint32_t *freq, size_t lenFreq,
109 |  uint32_t *matchOut);
110 | 
111 | 
112 | // unsafe, assumes signed ints
113 | size_t match_v4_f8_p1
114 | (const uint32_t *rare, size_t lenRare,
115 |  const uint32_t *freq, size_t lenFreq,
116 |  uint32_t *matchOut);
117 | 
118 | // unsafe, assumes signed ints
119 | size_t match_v4_f2_p2
120 | (const uint32_t *rare, size_t lenRare,
121 |  const uint32_t *freq, size_t lenFreq,
122 |  uint32_t *matchOut);
123 | 
124 | // unsafe, assumes signed ints
125 | size_t match_v4_f4_p2
126 | (const uint32_t *rare, size_t lenRare,
127 |  const uint32_t *freq, size_t lenFreq,
128 |  uint32_t *matchOut);
129 | 
130 | // unsafe, assumes signed ints
131 | size_t match_v4_f8_p2
132 | (const uint32_t *rare, size_t lenRare,
133 |  const uint32_t *freq, size_t lenFreq,
134 |  uint32_t *matchOut);
135 | 
136 | // unsafe, assumes signed ints
137 | size_t match_v4_f2_p3
138 | (const uint32_t *rare, size_t lenRare,
139 |  const uint32_t *freq, size_t lenFreq,
140 |  uint32_t *matchOut);
141 | 
142 | // unsafe, assumes signed ints
143 | size_t match_v4_f4_p3
144 | (const uint32_t *rare, size_t lenRare,
145 |  const uint32_t *freq, size_t lenFreq,
146 |  uint32_t *matchOut);
147 | 
148 | // unsafe, assumes signed ints
149 | size_t match_v4_f8_p3
150 | (const uint32_t *rare, size_t lenRare,
151 |  const uint32_t *freq, size_t lenFreq,
152 |  uint32_t *matchOut);
153 | 
154 | #ifdef __cplusplus
155 | } // extern "C"
156 | #endif
157 | 


--------------------------------------------------------------------------------
/include/mersenne.h:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * This code is released under the
  3 |  * Apache License Version 2.0 http://www.apache.org/licenses/.
  4 |  */
  5 | 
  6 | #ifndef MERSENNE_H_
  7 | #define MERSENNE_H_
  8 | 
  9 | #include "common.h"
 10 | ///#include "util.h"
 11 | 
 12 | /**
 13 |  *  Mersenne twister - random number generator.
 14 |  *  Generate uniform distribution of 32 bit integers with the MT19937 algorithm.
 15 |  * source: http://bannister.us/weblog/?s=Mersenne
 16 |  */
 17 | class ZRandom {
 18 |     // All state lives inside the object (map holds indices, not pointers), so copies are independent.
 19 | private:
 20 |     enum {
 21 |         N = 624, M = 397
 22 |     };
 23 |     unsigned int MT[N + 1]; // generator state; slot N is a scratch copy of MT[0] used during a refill
 24 |     int map[N]; // map[i] == (i + M) % N, precomputed by seed()
 25 |     int nValues; // number of words of MT not handed out yet
 26 | 
 27 | public:
 28 |     ZRandom(unsigned int iSeed = 20070102);
 29 |     void seed(unsigned iSeed); // restarts the sequence from iSeed
 30 |     unsigned int getValue(); // next 32-bit value
 31 |     unsigned int getValue(const uint32_t MaxValue); // value in [0, MaxValue]
 32 |     double getDouble(); // getValue() scaled by 2^-32
 33 |     bool test(const double p); // true when getDouble() <= p
 34 | 
 35 | };
 36 | // The member definitions below are inline because this header defines them directly.
 37 | inline ZRandom::ZRandom(unsigned iSeed) :
 38 |     nValues(0) {
 39 |     seed(iSeed);
 40 | }
 41 | 
 42 | inline void ZRandom::seed(unsigned iSeed) {
 43 |     nValues = 0;
 44 |     // Seed the array used in random number generation.
 45 |     MT[0] = iSeed;
 46 |     for (int i = 1; i < N; ++i) {
 47 |         MT[i] = 1 + (69069 * MT[i - 1]);
 48 |     }
 49 |     // Compute map once to avoid % in inner loop.
 50 |     for (int i = 0; i < N; ++i) {
 51 |         map[i] = (i + M) % N;
 52 |     }
 53 | }
 54 | 
 55 | inline bool ZRandom::test(const double p) {
 56 |     return getDouble() <= p;
 57 | }
 58 | inline double ZRandom::getDouble() {
 59 |     return double(getValue()) * (1.0 / 4294967296.0);
 60 | }
 61 | // Rejection sampling: draws are masked to the bit width of MaxValue and retried until one fits.
 62 | inline unsigned int ZRandom::getValue(const uint32_t MaxValue) {
 63 |     unsigned int used = MaxValue;
 64 |     used |= used >> 1;
 65 |     used |= used >> 2;
 66 |     used |= used >> 4;
 67 |     used |= used >> 8;
 68 |     used |= used >> 16;
 69 | 
 70 |     // Draw numbers until one is found in [0,n]
 71 |     unsigned int i;
 72 |     do
 73 |         i = getValue() & used; // toss unused bits to shorten search
 74 |     while (i > MaxValue);
 75 |     return i;
 76 | }
 77 | // Returns the next tempered word; the whole state table is regenerated every N draws.
 78 | inline unsigned int ZRandom::getValue() {
 79 |     if (0 == nValues) { // buffer used up: regenerate all N state words
 80 |         MT[N] = MT[0]; // lets the loop read MT[i + 1] when i == N - 1
 81 |         for (int i = 0; i < N; ++i) {
 82 |             unsigned y = (0x80000000 & MT[i]) | (0x7FFFFFFF
 83 |                     & MT[i + 1]);
 84 |             unsigned v = MT[map[i]] ^ (y >> 1);
 85 |             if (1 & y)
 86 |                 v ^= 2567483615;
 87 |             MT[i] = v;
 88 |         }
 89 |         nValues = N;
 90 |     }
 91 |     unsigned y = MT[N - nValues--]; // plain local: the register specifier no longer exists in C++17
 92 |     y ^= y >> 11;
 93 |     y ^= static_cast<unsigned int>((y << 7) & 2636928640);
 94 |     y ^= static_cast<unsigned int>((y << 15) & 4022730752);
 95 |     y ^= y >> 18;
 96 |     return y;
 97 | }
 98 | 
 99 | #endif /* MERSENNE_H_ */
100 | 


--------------------------------------------------------------------------------
/include/multiSetIntersection.hpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * multiSetIntersection.hpp
  3 |  *
  4 |  *  Created on: 2016/12/20
  5 |  *      Author: SparkleXS
  6 |  */
  7 | 
  8 | #ifndef INCLUDE_MULTISETINTERSECTION_HPP_
  9 | #define INCLUDE_MULTISETINTERSECTION_HPP_
 10 | 
 11 | #include "intersectionfactory.h"
 12 | #include "timer.h"
 13 | #include "synthetic.h"
 14 | #include "util.h"
 15 | 
 16 | namespace msis/*MultiSet InterSection*/{
 17 | // Searches array over the index range [start,end] (unlike __BSadvanceUntil,
 18 | // whose range is [start+1,end-1]) for a value at least as large as min.
 19 | static _ALWAYSINLINE size_t binarySearch_wider(const uint32_t * array,
 20 | 		const size_t start, const size_t end, const size_t min) {
 21 | 	// Either start has reached end or the very first value already qualifies.
 22 | 	if (start == end || array[start] >= min) {
 23 | 		return start;
 24 | 	}
 25 | 
 26 | 	size_t lo = start;
 27 | 	size_t hi = end;
 28 | 	while (lo < hi) {
 29 | 		const size_t probe = (lo + hi) / 2;
 30 | 		const uint32_t value = array[probe];
 31 | 		if (value == min) {
 32 | 			return probe; // exact hit
 33 | 		}
 34 | 		if (value > min) {
 35 | 			hi = probe;
 36 | 		} else {
 37 | 			lo = probe + 1;
 38 | 		}
 39 | 	}
 40 | 	return hi;
 41 | }
 42 | 
 43 | // Galloping (exponential) search over the index range [start,end].
 44 | static _ALWAYSINLINE size_t gallopping(const uint32_t * array,
 45 | 		const size_t start, const size_t end, const size_t min) {
 46 | 	// special handling for a possibly common sequential case
 47 | 	// (or start has already reached end)
 48 | 	if (start >= end or array[start] >= min) {
 49 | 		return start;
 50 | 	}
 51 | 
 52 | 	// Bootstrap an upper limit: double the step until start + step leaves
 53 | 	// the range or lands on a value that is no longer below min.
 54 | 	size_t step = 1; // could set larger
 55 | 	while (start + step <= end and array[start + step] < min) {
 56 | 		step *= 2;
 57 | 	}
 58 | 	size_t hi = start + step;
 59 | 	if (hi > end) {
 60 | 		hi = end; // clamp to the range
 61 | 	}
 62 | 
 63 | 	// NOTE(review): array[end] itself may be read here, so end has to be a
 64 | 	// valid index.
 65 | 	if (array[hi] < min) {    // means array has no item >= min
 66 | 		return end;
 67 | 	}
 68 | 
 69 | 	// array[start + step / 2] is known to be below min, so the answer lies
 70 | 	// in (lo, hi]; narrow that window by bisection.
 71 | 	size_t lo = start + step / 2;
 72 | 	while (lo + 1 != hi) {
 73 | 		const size_t probe = (lo + hi) / 2;
 74 | 		const uint32_t value = array[probe];
 75 | 		if (value == min) {
 76 | 			return probe;
 77 | 		}
 78 | 		if (value < min) {
 79 | 			lo = probe;
 80 | 		} else {
 81 | 			hi = probe;
 82 | 		}
 83 | 	}
 84 | 	// lo and hi are adjacent: hi is the first index whose value is not below min
 85 | 	return hi;
 86 | 
 87 | }
 88 | 
 89 | void small_vs_small(const mySet &sets, std::vector<uint32_t> &out);
 90 | 
 91 | void BYintersect_sorted(const uint32_t *D, const size_t &D_end,
 92 | 		const uint32_t *Q, const size_t &Q_end, uint32_t **out,
 93 | 		uint32_t &count);
 94 | 
 95 | // without swap
 96 | void set_vs_set(const mySet &sets, std::vector<uint32_t> &out);
 97 | 
 98 | void swapping_set_vs_set(const mySet &sets, std::vector<uint32_t> &out);
 99 | 
100 | void adaptive(const mySet &sets, std::vector<uint32_t> &out);
101 | 
102 | void sequential(const mySet &sets, std::vector<uint32_t> &out);
103 | 
104 | void small_adaptive(const mySet &sets, std::vector<uint32_t> &out);
105 | 
106 | //without resorting
107 | void max(const mySet &sets, std::vector<uint32_t> &out);
108 | 
109 | void BaezaYates(const mySet &sets, std::vector<uint32_t> &out);
110 | 
111 | }
112 | 
113 | #endif /* INCLUDE_MULTISETINTERSECTION_HPP_ */
114 | 


--------------------------------------------------------------------------------
/include/partitionedintersection.h:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Schemes inspired or lifted from
  3 |  * From Schlegel et al., Fast Sorted-Set Intersection using SIMD Instructions
  4 |  */
  5 | 
  6 | #ifndef PARTITIONEDINTERSECTION_H_
  7 | #define PARTITIONEDINTERSECTION_H_
  8 | 
  9 | #include "common.h"
 10 | 
 11 | namespace partitioned {
 12 | 
 13 | /**
 14 |  * Returns the upper 16 bits of x.
 15 |  */
 16 | uint16_t _high16(uint32_t x) {
 17 |     return static_cast<uint16_t>(x >> 16);
 18 | }
 19 | /**
 20 |  * Returns the lower 16 bits of x (the cast drops the upper half).
 21 |  */
 22 | uint16_t _low16(uint32_t x) {
 23 |     return static_cast<uint16_t>(x);
 24 | }
 25 | 
 26 | /**
 27 |  * From Schlegel et al., Fast Sorted-Set Intersection using SIMD Instructions
 28 |  *
 29 |  * Splits A into partitions, one per run of consecutive values sharing the
 30 |  * same upper 16 bits. Each partition is written to R as
 31 |  *   [upper 16 bits] [number of values - 1] [lower 16 bits of each value...]
 32 |  *
 33 |  * A - sorted array
 34 |  * s_a - size of A
 35 |  * R - partitioned sorted array (output); up to 3 * s_a entries are written
 36 |  * The fourth parameter (length of R) is ignored.
 37 |  * Returns the number of entries written to R; an empty A writes nothing.
 38 |  */
 39 | size_t partition(const uint32_t *A, const size_t s_a, uint16_t *R, const size_t /*Rlength*/) {
 40 |     if (s_a == 0) {
 41 |         return 0; // no partition to close, so R must stay untouched
 42 |     }
 43 | 
 44 |     size_t counter = 0; // next free slot in R
 45 |     size_t partition_size_position = 0; // slot holding the size of the open partition
 46 |     int partition_length = 0; // values stored in the open partition so far
 47 |     uint16_t high = 0; // upper 16 bits shared by the open partition
 48 | 
 49 |     for (size_t p = 0; p < s_a; p++) {
 50 |         const uint16_t chigh = static_cast<uint16_t>(A[p] >> 16); // upper 16 bits
 51 |         const uint16_t clow = static_cast<uint16_t>(A[p]); // lower 16 bits
 52 |         if (p != 0 && chigh == high) { // add element to the current partition
 53 |             R[counter++] = clow;
 54 |             partition_length++;
 55 |             continue;
 56 |         }
 57 |         if (p != 0) { // close the previous partition: store its size minus one
 58 |             R[partition_size_position] = static_cast<uint16_t>(partition_length - 1);
 59 |         }
 60 |         // start new partition
 61 |         R[counter++] = chigh; // partition prefix
 62 |         partition_size_position = counter; // remember where the size goes
 63 |         R[counter++] = 0; // reserve place for partition size
 64 |         R[counter++] = clow; // write the first element
 65 |         partition_length = 1; // reset counters
 66 |         high = chigh;
 67 |     }
 68 |     // close the last partition
 69 |     R[partition_size_position] = static_cast<uint16_t>(partition_length - 1);
 70 | 
 71 |     return counter;
 72 | }
 73 | 
 74 | /**
 75 |  * Useful for debugging purposes.
 76 |  */
 77 | size_t inverse_partition(uint32_t *A, const size_t /*s_a*/, const uint16_t *R,
 78 |         const size_t Rlength) {
 79 |     size_t i = 0;
 80 |     size_t p = 0;
 81 |     while (i < Rlength) {
 82 |         uint16_t chigh = R[i++];
 83 |         size_t sizepart = static_cast<size_t> (R[i++]) + 1;
 84 |         while (sizepart > 0) {
 85 |             uint16_t clow = R[i++];
 86 |             A[p++] = (static_cast<uint32_t> (chigh) << 16) | clow;
 87 |             --sizepart;
 88 |         }
 89 |     }
 90 |     return p;
 91 | }
 92 | 
/**
 * Counts the values that two arrays of 16-bit integers have in common,
 * comparing blocks of eight values at a time with the SSE4.2 string
 * comparison instruction and finishing with a scalar merge.
 *
 * From Schlegel et al., Fast Sorted-Set Intersection using SIMD Instructions
 *
 * Optimized  by D. Lemire on April 30th 2013
 *
 * @param A   first array
 * @param B   second array
 * @param s_a number of elements in A
 * @param s_b number of elements in B
 * @return number of matches counted
 *
 * NOTE(review): the way blocks are advanced (by comparing their last
 * elements) only gives the intersection size if both arrays are sorted in
 * increasing order without duplicates; nothing in this function checks it.
 */
static size_t cardinality_intersect_vector16(const uint16_t *A,
        const uint16_t *B, const size_t s_a, const size_t s_b/*, uint16_t *C*/) {
    size_t count = 0;
    size_t i_a = 0, i_b = 0;

    // largest multiples of 8 not exceeding the lengths: the SIMD loop only
    // touches complete blocks of eight 16-bit values
    const size_t st_a = (s_a / 8) * 8;
    const size_t st_b = (s_b / 8) * 8;
    __m128i v_a, v_b;
    if ((i_a < st_a) and (i_b < st_b)) {
        v_a = _mm_loadu_si128((__m128i *) &A[i_a]);
        v_b = _mm_loadu_si128((__m128i *) &B[i_b]);

        while (true) {
            // explicit-length (8 and 8) "equal any" comparison on unsigned
            // 16-bit lanes; the bit mask flags the lanes of v_a that also
            // occur in v_b
            const __m128i res_v = _mm_cmpestrm(v_b, 8, v_a, 8,
                    _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK);
            const int r = _mm_extract_epi32(res_v, 0);
            // disabled: would write the matching values to an output array C
            //__m128i p = _mm_shuffle_epi8(v_a, shuffle_mask16[r]);
            //_mm_storeu_si128((__m128i *) &C[count], p);
            count += _mm_popcnt_u32(r); // one set bit per match
            const uint16_t a_max = A[i_a + 7];
            const uint16_t b_max = B[i_b + 7];
            // move past the block whose last element is smaller (both blocks
            // when they are equal); leave as soon as one side has no complete
            // block left
            if (a_max <= b_max) {
                i_a += 8;
                if (i_a == st_a)
                    break;
                v_a = _mm_loadu_si128((__m128i *) &A[i_a]);

            }
            if (b_max <= a_max) {
                i_b += 8;
                if (i_b == st_b)
                    break;
                v_b = _mm_loadu_si128((__m128i *) &B[i_b]);

            }
        }
    }
    // intersect the tail using scalar intersection
    while (i_a < s_a && i_b < s_b) {
        if (A[i_a] < B[i_b]) {
            i_a++;
        } else if (B[i_b] < A[i_a]) {
            i_b++;
        } else {
            count++;
            i_a++;
            i_b++;
        }
    }

    return count;
}
150 | 
/**
 * Counts the values that two arrays of 16-bit integers have in common.
 * Same block-wise scheme as cardinality_intersect_vector16, except that the
 * main loop uses the implicit-length string comparison (_mm_cmpistrm)
 * instead of the explicit-length one (_mm_cmpestrm).
 *
 * From Schlegel et al., Fast Sorted-Set Intersection using SIMD Instructions
 *
 * Optimized by D. Lemire on May 3rd 2013
 *
 * @param A   first array
 * @param B   second array
 * @param s_a number of elements in A
 * @param s_b number of elements in B
 * @return number of matches counted
 *
 * NOTE(review): as for the plain version, the block-advance logic presumes
 * both arrays are sorted in increasing order without duplicates; this is not
 * checked here. The switch to the implicit-length form additionally relies
 * on a zero value only ever appearing as the first element of a block.
 */
static size_t faster_cardinality_intersect_vector16(const uint16_t *A,
        const uint16_t *B, const size_t s_a, const size_t s_b/*, uint16_t *C*/) {
    size_t count = 0;
    size_t i_a = 0, i_b = 0;

    // largest multiples of 8 not exceeding the lengths: the SIMD loops only
    // touch complete blocks of eight 16-bit values
    const size_t st_a = (s_a / 8) * 8;
    const size_t st_b = (s_b / 8) * 8;
    __m128i v_a, v_b;
    if ((i_a < st_a) and (i_b < st_b)) {
        v_a = _mm_loadu_si128((__m128i *) &A[i_a]);
        v_b = _mm_loadu_si128((__m128i *) &B[i_b]);
        // Phase 1: while either current block starts with 0, use the
        // explicit-length comparison. The implicit-length form treats a zero
        // lane as the end of the string and would ignore the lanes after it.
        while ((A[i_a] == 0) or (B[i_b] == 0)) {
            const __m128i res_v = _mm_cmpestrm(v_b, 8, v_a, 8,
                    _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK);
            const int r = _mm_extract_epi32(res_v, 0);
            // disabled: would write the matching values to an output array C
            //__m128i p = _mm_shuffle_epi8(v_a, shuffle_mask16[r]);
            //_mm_storeu_si128((__m128i *) &C[count], p);
            count += _mm_popcnt_u32(r); // one set bit per match
            const uint16_t a_max = A[i_a + 7];
            const uint16_t b_max = B[i_b + 7];
            // move past the block whose last element is smaller (both when
            // equal); stop once one side has no complete block left
            if (a_max <= b_max) {
                i_a += 8;
                if (i_a == st_a)
                    break;
                v_a = _mm_loadu_si128((__m128i *) &A[i_a]);

            }
            if (b_max <= a_max) {
                i_b += 8;
                if (i_b == st_b)
                    break;
                v_b = _mm_loadu_si128((__m128i *) &B[i_b]);

            }

        }
        // Phase 2: only entered if both sides still have a complete block;
        // same loop as above with the implicit-length comparison.
        if ((i_a < st_a) and (i_b < st_b))
            while (true) {
                const __m128i res_v = _mm_cmpistrm(v_b, v_a,
                        _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK);
                const int r = _mm_extract_epi32(res_v, 0);
                // disabled: would write the matching values to an output array C
                //__m128i p = _mm_shuffle_epi8(v_a, shuffle_mask16[r]);
                //_mm_storeu_si128((__m128i *) &C[count], p);
                count += _mm_popcnt_u32(r);
                const uint16_t a_max = A[i_a + 7];
                const uint16_t b_max = B[i_b + 7];
                if (a_max <= b_max) {
                    i_a += 8;
                    if (i_a == st_a)
                        break;
                    v_a = _mm_loadu_si128((__m128i *) &A[i_a]);

                }
                if (b_max <= a_max) {
                    i_b += 8;
                    if (i_b == st_b)
                        break;
                    v_b = _mm_loadu_si128((__m128i *) &B[i_b]);

                }
            }
    }
    // intersect the tail using scalar intersection
    while (i_a < s_a && i_b < s_b) {
        if (A[i_a] < B[i_b]) {
            i_a++;
        } else if (B[i_b] < A[i_a]) {
            i_b++;
        } else {
            count++;
            i_a++;
            i_b++;
        }
    }

    return count;
}
233 | 
234 | 
235 | /**
236 |  * From Schlegel et al., Fast Sorted-Set Intersection using SIMD Instructions
237 |  *
238 |  * Optimized by D. Lemire on May 3rd 2013
239 |  */
240 | /*static size_t faster2_cardinality_intersect_vector16(const uint16_t *A,
241 |         const uint16_t *B, const size_t s_a, const size_t s_b) {
242 |     size_t count = 0;
243 |     size_t i_a = 0, i_b = 0;
244 | 
245 |     const size_t st_a = (s_a / 8) * 8;
246 |     const size_t st_b = (s_b / 8) * 8;
247 |     __m128i v_a, v_b;
248 |     if ((i_a < st_a) and (i_b < st_b)) {
249 |         v_a = _mm_loadu_si128((__m128i *) &A[i_a]);
250 |         v_b = _mm_loadu_si128((__m128i *) &B[i_b]);
251 |         while ((A[i_a] == 0) or (B[i_b] == 0)) {
252 |             const __m128i res_v = _mm_cmpestrm(v_b, 8, v_a, 8,
253 |                     _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK);
254 |             const int r = _mm_extract_epi32(res_v, 0);
255 |             //__m128i p = _mm_shuffle_epi8(v_a, shuffle_mask16[r]);
256 |             //_mm_storeu_si128((__m128i *) &C[count], p);
257 |             count += _mm_popcnt_u32(r);
258 |             const uint16_t a_max = A[i_a + 7];
259 |             const uint16_t b_max = B[i_b + 7];
260 |             if (a_max <= b_max) {
261 |                 i_a += 8;
262 |                 if (i_a == st_a)
263 |                     break;
264 |                 v_a = _mm_loadu_si128((__m128i *) &A[i_a]);
265 | 
266 |             }
267 |             if (b_max <= a_max) {
268 |                 i_b += 8;
269 |                 if (i_b == st_b)
270 |                     break;
271 |                 v_b = _mm_loadu_si128((__m128i *) &B[i_b]);
272 | 
273 |             }
274 | 
275 |         }
276 |         if ((i_a < st_a) and (i_b < st_b))
277 |             while (true) {
278 |                 const __m128i res_v = _mm_cmpistrm(v_b, v_a,
279 |                         _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK);
280 |                 const int r = _mm_extract_epi32(res_v, 0);
281 |                 //__m128i p = _mm_shuffle_epi8(v_a, shuffle_mask16[r]);
282 |                 //_mm_storeu_si128((__m128i *) &C[count], p);
283 |                 count += _mm_popcnt_u32(r);
284 |                 const uint16_t a_max = A[i_a + 7];
285 |                 const uint16_t b_max = B[i_b + 7];
286 |                 if (a_max <= b_max) {
287 |                     i_a += 8;
288 |                     if (i_a == st_a)
289 |                         break;
290 |                     v_a = _mm_loadu_si128((__m128i *) &A[i_a]);
291 | 
292 |                 }
293 |                 if (b_max <= a_max) {
294 |                     i_b += 8;
295 |                     if (i_b == st_b)
296 |                         break;
297 |                     v_b = _mm_loadu_si128((__m128i *) &B[i_b]);
298 | 
299 |                 }
300 |             }
301 |     }
302 |     // intersect the tail using scalar intersection
303 |     while (i_a < s_a && i_b < s_b) {
304 |         if (A[i_a] < B[i_b]) {
305 |             i_a++;
306 |         } else if (B[i_b] < A[i_a]) {
307 |             i_b++;
308 |         } else {
309 |             count++;
310 |             i_a++;
311 |             i_b++;
312 |         }
313 |     }
314 | 
315 |     return count;
316 | }
317 | */
318 | 
319 | 
320 | 
/**
 * Scalar reference: counts the values present in both arrays with a plain
 * two-pointer merge. Strictly for testing/debugging purposes.
 *
 * @param A   first array, sorted in increasing order
 * @param B   second array, sorted in increasing order
 * @param s_a number of elements in A
 * @param s_b number of elements in B
 * @return number of positions at which the merge found equal values
 */
size_t scalar_cardinality_intersect_vector16(const uint16_t *A,
        const uint16_t *B, const size_t s_a, const size_t s_b/*, uint16_t *C*/) {
    const uint16_t *pa = A;
    const uint16_t *pb = B;
    const uint16_t *const end_a = A + s_a;
    const uint16_t *const end_b = B + s_b;
    size_t matches = 0;
    while (pa != end_a && pb != end_b) {
        const uint16_t x = *pa;
        const uint16_t y = *pb;
        if (x == y) { // common value: count it and move both cursors
            ++matches;
            ++pa;
            ++pb;
        } else if (x < y) {
            ++pa;
        } else {
            ++pb;
        }
    }
    return matches;
}
343 | 
344 | /**
345 |  * From Schlegel et al., Fast Sorted-Set Intersection using SIMD Instructions
346 |  */
347 | // A, B - partitioned operands
348 | size_t cardinality_intersect_partitioned(const uint16_t *A, const uint16_t *B,
349 |         const size_t s_a, const size_t s_b) {
350 |     size_t i_a = 0, i_b = 0;
351 |     size_t counter = 0;
352 |     while (i_a < s_a && i_b < s_b) {
353 |         if (A[i_a] < B[i_b]) {
354 |             i_a += static_cast<size_t> (A[i_a + 1]) + 2 + 1;
355 |         } else if (B[i_b] < A[i_a]) {
356 |             i_b += static_cast<size_t> (B[i_b + 1]) + 2 + 1;
357 |         } else {
358 |             //C[counter++] = A[i_a]; // write partition prefix
359 |             size_t partition_size = cardinality_intersect_vector16(&A[i_a + 2],
360 |                     &B[i_b + 2], static_cast<size_t> (A[i_a + 1]) + 1,
361 |                     static_cast<size_t> (B[i_b + 1]) + 1);//, &C[counter + 1]);
362 |             //C[counter++] = partition_size; // write partition size
363 |             counter += partition_size;
364 |             i_a += static_cast<size_t> (A[i_a + 1]) + 2 + 1;
365 |             i_b += static_cast<size_t> (B[i_b + 1]) + 2 + 1;
366 |         }
367 |     }
368 |     //std::cout<<"partcounter = "<< partcounter<<std::endl;
369 |     return counter;
370 | }
371 | 
372 | /**
373 |  * Version optimized by D. Lemire of
374 |  * From Schlegel et al., Fast Sorted-Set Intersection using SIMD Instructions
375 |  */
376 | size_t faster_cardinality_intersect_partitioned(const uint16_t *A,
377 |         const uint16_t *B, const size_t s_a, const size_t s_b) {
378 |     size_t i_a = 0, i_b = 0;
379 |     size_t counter = 0;
380 |     while (i_a < s_a && i_b < s_b) {
381 |         if (A[i_a] < B[i_b]) {
382 |             do {
383 |                 i_a += static_cast<size_t> (A[i_a + 1]) + 2 + 1;
384 |                 if (i_a >= s_a)
385 |                     goto end;
386 |             } while (A[i_a] < B[i_b]);
387 |         }
388 |         if (B[i_b] < A[i_a]) {
389 |             do {
390 |                 i_b += static_cast<size_t> (B[i_b + 1]) + 2 + 1;
391 |                 if (i_b >= s_b)
392 |                     goto end;
393 |             } while (B[i_b] < A[i_a]);
394 |         } else {
395 |             size_t partition_size = faster_cardinality_intersect_vector16(
396 |                     &A[i_a + 2], &B[i_b + 2],
397 |                     static_cast<size_t> (A[i_a + 1]) + 1,
398 |                     static_cast<size_t> (B[i_b + 1]) + 1);//, &C[counter + 1]);
399 |             //C[counter++] = partition_size; // write partition size
400 |             counter += partition_size;
401 |             i_a += static_cast<size_t> (A[i_a + 1]) + 2 + 1;
402 |             i_b += static_cast<size_t> (B[i_b + 1]) + 2 + 1;
403 |         }
404 |     }
405 |     end: return counter;
406 | }
407 | 
408 | }
409 | #endif /* PARTITIONEDINTERSECTION_H_ */
410 | 


--------------------------------------------------------------------------------
/include/skipping.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * This is a simple implementation of a skipping data structure and algorithms similar to
  3 |  * what is described in
  4 |  *
  5 |  * Sanders and Transier, Intersection in Integer Inverted Indices, ALENEX 2007,  2007.
  6 |  *
  7 |  * As suggested in their conclusion, we leave the higher-level structure uncompressed. We also
  8 |  * use differential coding.
  9 |  * 
 10 |  * To paraphrase Sanders and Transier...
 11 |  *
 12 |  * In addition to a delta-encoded compressed list, a top-level data structure stores 
 13 |  * every B-th element of N in t together with its position in the main list (B is a tuning 
 14 |  * parameter). We can now run any  search algorithm on t and then scan only the pieces of 
 15 |  * the main list that might contain an element to be located.
 16 |  *
 17 |  * In our implementation, we assume that B is a power of two and use 1 << BlockSizeLog as
 18 |  * the block size.
 19 |  *
 20 |  * Sanders and Transier's proposal is similar in spirit to the skipping structure proposed in
 21 |  *
 22 |  * Moffat, A., Zobel, J.: Self-indexing inverted files for fast text retrieval.
 23 |  * ACM Transactions on Information Systems 14 (1996).
 24 |  *
 25 |  *
 26 |  *      Author: Daniel Lemire
 27 |  */
 28 | 
 29 | #ifndef SKIPPING_H_
 30 | #define SKIPPING_H_
 31 | 
 32 | #include "common.h"
 33 | 
 34 | class Skipping {
 35 | public:
 36 | 
 37 | 
 38 |     Skipping(uint32_t BS, uint32_t * data, uint32_t length) :
 39 |         BlockSizeLog(BS),
 40 |         mainbuffer(), highbuffer(), Length(0) {
 41 |         if((BlockSizeLog == 0) && (BlockSizeLog >= 32)) throw runtime_error("please use a reasonable BlockSizeLog");
 42 |         load(data, length);// cheap constructor
 43 |     }
 44 | 
 45 | 
 46 | 
 47 |     ~Skipping() {}
 48 | 
 49 |     size_t storageInBytes() const {
 50 |         return mainbuffer.size() * sizeof(uint8_t)
 51 |                 + highbuffer.size() * sizeof(higharraypair)
 52 |                 + sizeof(Length); // rough estimates (good enough)
 53 |     }
 54 | 
 55 |     uint32_t decompress(uint32_t * out) const {
 56 |         const uint8_t * bout = mainbuffer.data();
 57 |         uint32_t pos = 0;
 58 | 
 59 |         uint32_t val = 0;
 60 |         for(uint32_t k = 0; k < Length;++k) {
 61 |           bout = decode(bout,val);
 62 |           out[pos++] = val;
 63 |         }
 64 |         return pos;
 65 |     }
 66 | 
 67 | 
 68 |     uint32_t intersect(const Skipping & otherlarger, uint32_t * out) const {
 69 |         // we assume that "this" is the smallest of the two
 70 |         if (otherlarger.Length < Length)
 71 |             return otherlarger.intersect(*this, out);
 72 |         if (Length == 0)
 73 |             return 0;// special silly case
 74 |         assert(otherlarger.Length>=Length);
 75 |         assert(otherlarger.Length>0);
 76 |         uint32_t intersectsize = 0;
 77 | 
 78 |         const uint8_t * inbyte = mainbuffer.data();
 79 |         const uint8_t * const endbyte = mainbuffer.data()
 80 |                         + mainbuffer.size();
 81 |         const uint8_t * largemainpointer = otherlarger.mainbuffer.data();
 82 |         uint32_t largemainval = 0;
 83 |         largemainpointer = decode(largemainpointer, largemainval);
 84 |         uint32_t val = 0;// where I put decoded values
 85 |         uint32_t x = 0;
 86 |         while (endbyte > inbyte) {
 87 |             inbyte = decode(inbyte, val);
 88 |             // if the last value of the current block is too small, skip the block entirely
 89 |             if (otherlarger.highbuffer[x >> otherlarger.BlockSizeLog].first < val) {
 90 |                 do {
 91 |                     x = ((x >> otherlarger.BlockSizeLog) + 1) << otherlarger.BlockSizeLog;
 92 |                     if (x >= otherlarger.Length) {
 93 |                         return intersectsize;
 94 |                     }
 95 |                 } while (otherlarger.highbuffer[x >> otherlarger.BlockSizeLog].first < val);
 96 |                 largemainpointer = otherlarger.mainbuffer.data()
 97 |                         + otherlarger.highbuffer[x >> otherlarger.BlockSizeLog].second;
 98 |                 largemainval = otherlarger.highbuffer[(x >> otherlarger.BlockSizeLog)-1].first;
 99 |                 largemainpointer = decode(largemainpointer, largemainval);
100 |             }
101 |             // at this point, we have that the last value of the current block is >= val
102 |             // this means that we shall decode at most one block before giving up
103 |             while (largemainval < val) {
104 |                 ++x;
105 |                 if (x >= otherlarger.Length) {
106 |                     return intersectsize;
107 |                 }
108 |                 largemainpointer = decode(largemainpointer, largemainval);
109 |              }
110 |             if (largemainval == val) {
111 |                 out[intersectsize++] = val;
112 |             }
113 |         }
114 |         return intersectsize;
115 |     }
116 | 
117 |     uint32_t BlockSizeLog;
118 |     vector<uint8_t> mainbuffer;
119 |     typedef pair<uint32_t, uint32_t> higharraypair;
120 | 
121 |     typedef vector<higharraypair> higharray;
122 |     higharray highbuffer;
123 |     uint32_t Length;
124 | 
125 | private:
126 | 
127 |     Skipping(Skipping && other) : BlockSizeLog(other.BlockSizeLog), mainbuffer(other.mainbuffer),
128 |     highbuffer(other.highbuffer), Length(other.Length) {
129 |     }
130 |     Skipping(const Skipping & other) : BlockSizeLog(other.BlockSizeLog), mainbuffer(other.mainbuffer),
131 |     highbuffer(other.highbuffer), Length(other.Length) {
132 |         cout<<"Just copied "<<storageInBytes()<<endl;
133 |     }
134 | 
135 |     // making it private on purpose
136 |     Skipping();
137 |     // making it private on purpose
138 |     Skipping & operator=(const Skipping &);
139 | 
140 |     void load(uint32_t * data, uint32_t length);
141 | 
142 |     template<uint32_t i>
143 |     uint8_t extract7bits(const uint32_t val) {
144 |         return static_cast<uint8_t> ((val >> (7 * i)) & ((1U << 7) - 1));
145 |     }
146 | 
147 |     template<uint32_t i>
148 |     uint8_t extract7bitsmaskless(const uint32_t val) {
149 |         return static_cast<uint8_t> ((val >> (7 * i)));
150 |     }
151 |     static inline const uint8_t * decode(const uint8_t * buffer, uint32_t& prev) {
152 |         // manually unrolled for performance
153 |         uint32_t v = 0;
154 |         uint8_t c = *buffer++;
155 |         v += (c & 127) ;
156 |         if ((c & 128)) {
157 |             prev += v;
158 |             return buffer;
159 |         }
160 |         c = *buffer++;
161 |         v += ((c & 127) << 7);
162 |         if ((c & 128)) {
163 |             prev += v;
164 |             return buffer;
165 |         }
166 |         c = *buffer++;
167 |         v += ((c & 127) << 14);
168 |         if ((c & 128)) {
169 |             prev += v;
170 |             return buffer;
171 |         }
172 |         c = *buffer++;
173 |         v += ((c & 127) << 21);
174 |         if ((c & 128)) {
175 |             prev += v;
176 |             return buffer;
177 |         }
178 |         c = *buffer++;
179 |         v += ((c & 127) << 30);
180 |         prev += v;
181 |         return buffer;
182 |     }
183 | };
184 | 
185 | void Skipping::load(uint32_t * data, uint32_t len) {
186 |     assert(len < (numeric_limits<size_t>::max() / 5));// check for overflow
187 |     Length = len;
188 |     if(Length == 0) return; // nothing to do
189 |     uint32_t BlockNumber = (Length + (1<<BlockSizeLog) - 1) / (1<<BlockSizeLog);// count full blocks
190 |     assert(BlockNumber << BlockSizeLog >= Length);
191 |     highbuffer.resize(BlockNumber);
192 |     mainbuffer.resize(5 * Length);
193 |     uint8_t * bout = mainbuffer.data();
194 |     uint8_t * const boutinit = bout;
195 |     uint32_t prev = 0;
196 |     for (uint32_t k = 0; k < BlockNumber; ++k) {
197 |        const uint32_t howmany = (((k + 1)  << BlockSizeLog) > Length) ?
198 |                 Length - (k << BlockSizeLog)
199 |                 : 1 << BlockSizeLog;
200 |         highbuffer[k] = make_pair(data[(k << BlockSizeLog) + howmany - 1],
201 |                 static_cast<uint32_t> (bout - boutinit));
202 |         for (uint32_t x = 0; x < howmany; ++x) {
203 |             const uint32_t v = data[x + (k << BlockSizeLog)];
204 |             const uint32_t val = v - prev;
205 |             prev = v;
206 |             if (val < (1U << 7)) {
207 |                 *bout = static_cast<uint8_t> (val | (1U << 7));
208 |                 ++bout;
209 |             } else if (val < (1U << 14)) {
210 |                 *bout = extract7bits<0> (val);
211 |                 ++bout;
212 |                 *bout = extract7bitsmaskless<1> (val) | (1U << 7);
213 |                 ++bout;
214 |             } else if (val < (1U << 21)) {
215 |                 *bout = extract7bits<0> (val);
216 |                 ++bout;
217 |                 *bout = extract7bits<1> (val);
218 |                 ++bout;
219 |                 *bout = extract7bitsmaskless<2> (val) | (1U << 7);
220 |                 ++bout;
221 |             } else if (val < (1U << 28)) {
222 |                 *bout = extract7bits<0> (val);
223 |                 ++bout;
224 |                 *bout = extract7bits<1> (val);
225 |                 ++bout;
226 |                 *bout = extract7bits<2> (val);
227 |                 ++bout;
228 |                 *bout = extract7bitsmaskless<3> (val) | (1U << 7);
229 |                 ++bout;
230 |             } else {
231 |                 *bout = extract7bits<0> (val);
232 |                 ++bout;
233 |                 *bout = extract7bits<1> (val);
234 |                 ++bout;
235 |                 *bout = extract7bits<2> (val);
236 |                 ++bout;
237 |                 *bout = extract7bits<3> (val);
238 |                 ++bout;
239 |                 *bout = extract7bitsmaskless<4> (val) | (1U << 7);
240 |                 ++bout;
241 |             }
242 |         }
243 |     }
244 |     mainbuffer.resize(static_cast<uint32_t> (bout - boutinit));
245 |     mainbuffer.shrink_to_fit();
246 | }
247 | 
248 | #endif /* SKIPPING_H_ */
249 | 


--------------------------------------------------------------------------------
/include/stlutil.h:
--------------------------------------------------------------------------------
 1 | #ifndef STLUTIL_H_
 2 | #define STLUTIL_H_
 3 | 
 4 | #include "util.h"
 5 | #include "union.h"
 6 | #include "intersection.h"
 7 | 
 8 | vector<uint32_t> unite(const vector<uint32_t> & x, const vector<uint32_t> & y) {
 9 |     vector<uint32_t> ans (x.size() + y.size());
10 |     ans.resize(unite(x.data(),x.size(), y.data(),y.size(), ans.data()));
11 |     return ans;
12 | }
13 | 
14 | 
15 | vector<uint32_t> intersect(const vector<uint32_t> & x, const vector<uint32_t> & y) {
16 |     vector<uint32_t> ans (x.size() + y.size());
17 |     ans.resize(classicalintersection(x.data(),x.size(), y.data(),y.size(), ans.data()));
18 |     return ans;
19 | }
20 | 
21 | /**
22 |  * Returns the removed elements
23 |  */
24 | vector<uint32_t> removeRandom(vector<uint32_t> & x, size_t N) {
25 |     auto i = shuffleFY(x.begin(),x.end(),N);
26 |     vector<uint32_t > tmp (i,x.end());
27 |     vector<uint32_t > ans (x.begin(),i);
28 |     x.swap(tmp);
29 |     return ans;
30 | }
31 | 
32 | vector<uint32_t> getRandom(const vector<uint32_t> & x, size_t N) {
33 |     vector<uint32_t> copy(x);
34 |     auto i = shuffleFY(copy.begin(),copy.end(),N);
35 |     vector<uint32_t > ans (copy.begin(),i);
36 |     return ans;
37 | }
38 | 
39 | /**
40 |  * Like getRandom except that the provided vector is modified.
41 |  */
42 | vector<uint32_t> grabRandom(vector<uint32_t> & x, size_t N) {
43 |     auto i = shuffleFY(x.begin(),x.end(),N);
44 |     vector<uint32_t > ans (x.begin(),i);
45 |     return ans;
46 | }
47 | 
48 | 
/**
 * Set difference of two sorted vectors, computed with std::set_difference.
 *
 * @param x sorted input
 * @param y sorted input
 * @return the values of x that are not matched by a value of y, in order
 */
vector<uint32_t> difference(const vector<uint32_t> &x, const vector<uint32_t> &y) {
    // the result can never be longer than x
    vector<uint32_t> answer(x.size());
    const auto last = std::set_difference(x.begin(), x.end(), y.begin(), y.end(),
            answer.begin());
    const auto kept = last - answer.begin();
    answer.resize(kept);
    return answer;
}
57 | #endif
58 | 


--------------------------------------------------------------------------------
/include/tetzank.h:
--------------------------------------------------------------------------------
  1 | // imported from https://github.com/tetzank/SIMDSetOperations/
  2 | #ifndef INCLUDE_TETZANK_H_
  3 | #define INCLUDE_TETZANK_H_
  4 | 
  5 | #if defined(_MSC_VER)
  6 | #define ALIGNED(x) __declspec(align(x))
  7 | #else
  8 | #if defined(__GNUC__)
  9 | #define ALIGNED(x) __attribute__ ((aligned(x)))
 10 | #endif
 11 | #endif
 12 | static uint32_t shuffle_mask_avx[] ALIGNED(0x1000) = {
 13 | 		7, 6, 5, 4, 3, 2, 1, 0,
 14 | 		0, 7, 6, 5, 4, 3, 2, 1,
 15 | 		1, 7, 6, 5, 4, 3, 2, 0,
 16 | 		0, 1, 7, 6, 5, 4, 3, 2,
 17 | 		2, 7, 6, 5, 4, 3, 1, 0,
 18 | 		0, 2, 7, 6, 5, 4, 3, 1,
 19 | 		1, 2, 7, 6, 5, 4, 3, 0,
 20 | 		0, 1, 2, 7, 6, 5, 4, 3,
 21 | 		3, 7, 6, 5, 4, 2, 1, 0,
 22 | 		0, 3, 7, 6, 5, 4, 2, 1,
 23 | 		1, 3, 7, 6, 5, 4, 2, 0,
 24 | 		0, 1, 3, 7, 6, 5, 4, 2,
 25 | 		2, 3, 7, 6, 5, 4, 1, 0,
 26 | 		0, 2, 3, 7, 6, 5, 4, 1,
 27 | 		1, 2, 3, 7, 6, 5, 4, 0,
 28 | 		0, 1, 2, 3, 7, 6, 5, 4,
 29 | 		4, 7, 6, 5, 3, 2, 1, 0,
 30 | 		0, 4, 7, 6, 5, 3, 2, 1,
 31 | 		1, 4, 7, 6, 5, 3, 2, 0,
 32 | 		0, 1, 4, 7, 6, 5, 3, 2,
 33 | 		2, 4, 7, 6, 5, 3, 1, 0,
 34 | 		0, 2, 4, 7, 6, 5, 3, 1,
 35 | 		1, 2, 4, 7, 6, 5, 3, 0,
 36 | 		0, 1, 2, 4, 7, 6, 5, 3,
 37 | 		3, 4, 7, 6, 5, 2, 1, 0,
 38 | 		0, 3, 4, 7, 6, 5, 2, 1,
 39 | 		1, 3, 4, 7, 6, 5, 2, 0,
 40 | 		0, 1, 3, 4, 7, 6, 5, 2,
 41 | 		2, 3, 4, 7, 6, 5, 1, 0,
 42 | 		0, 2, 3, 4, 7, 6, 5, 1,
 43 | 		1, 2, 3, 4, 7, 6, 5, 0,
 44 | 		0, 1, 2, 3, 4, 7, 6, 5,
 45 | 		5, 7, 6, 4, 3, 2, 1, 0,
 46 | 		0, 5, 7, 6, 4, 3, 2, 1,
 47 | 		1, 5, 7, 6, 4, 3, 2, 0,
 48 | 		0, 1, 5, 7, 6, 4, 3, 2,
 49 | 		2, 5, 7, 6, 4, 3, 1, 0,
 50 | 		0, 2, 5, 7, 6, 4, 3, 1,
 51 | 		1, 2, 5, 7, 6, 4, 3, 0,
 52 | 		0, 1, 2, 5, 7, 6, 4, 3,
 53 | 		3, 5, 7, 6, 4, 2, 1, 0,
 54 | 		0, 3, 5, 7, 6, 4, 2, 1,
 55 | 		1, 3, 5, 7, 6, 4, 2, 0,
 56 | 		0, 1, 3, 5, 7, 6, 4, 2,
 57 | 		2, 3, 5, 7, 6, 4, 1, 0,
 58 | 		0, 2, 3, 5, 7, 6, 4, 1,
 59 | 		1, 2, 3, 5, 7, 6, 4, 0,
 60 | 		0, 1, 2, 3, 5, 7, 6, 4,
 61 | 		4, 5, 7, 6, 3, 2, 1, 0,
 62 | 		0, 4, 5, 7, 6, 3, 2, 1,
 63 | 		1, 4, 5, 7, 6, 3, 2, 0,
 64 | 		0, 1, 4, 5, 7, 6, 3, 2,
 65 | 		2, 4, 5, 7, 6, 3, 1, 0,
 66 | 		0, 2, 4, 5, 7, 6, 3, 1,
 67 | 		1, 2, 4, 5, 7, 6, 3, 0,
 68 | 		0, 1, 2, 4, 5, 7, 6, 3,
 69 | 		3, 4, 5, 7, 6, 2, 1, 0,
 70 | 		0, 3, 4, 5, 7, 6, 2, 1,
 71 | 		1, 3, 4, 5, 7, 6, 2, 0,
 72 | 		0, 1, 3, 4, 5, 7, 6, 2,
 73 | 		2, 3, 4, 5, 7, 6, 1, 0,
 74 | 		0, 2, 3, 4, 5, 7, 6, 1,
 75 | 		1, 2, 3, 4, 5, 7, 6, 0,
 76 | 		0, 1, 2, 3, 4, 5, 7, 6,
 77 | 		6, 7, 5, 4, 3, 2, 1, 0,
 78 | 		0, 6, 7, 5, 4, 3, 2, 1,
 79 | 		1, 6, 7, 5, 4, 3, 2, 0,
 80 | 		0, 1, 6, 7, 5, 4, 3, 2,
 81 | 		2, 6, 7, 5, 4, 3, 1, 0,
 82 | 		0, 2, 6, 7, 5, 4, 3, 1,
 83 | 		1, 2, 6, 7, 5, 4, 3, 0,
 84 | 		0, 1, 2, 6, 7, 5, 4, 3,
 85 | 		3, 6, 7, 5, 4, 2, 1, 0,
 86 | 		0, 3, 6, 7, 5, 4, 2, 1,
 87 | 		1, 3, 6, 7, 5, 4, 2, 0,
 88 | 		0, 1, 3, 6, 7, 5, 4, 2,
 89 | 		2, 3, 6, 7, 5, 4, 1, 0,
 90 | 		0, 2, 3, 6, 7, 5, 4, 1,
 91 | 		1, 2, 3, 6, 7, 5, 4, 0,
 92 | 		0, 1, 2, 3, 6, 7, 5, 4,
 93 | 		4, 6, 7, 5, 3, 2, 1, 0,
 94 | 		0, 4, 6, 7, 5, 3, 2, 1,
 95 | 		1, 4, 6, 7, 5, 3, 2, 0,
 96 | 		0, 1, 4, 6, 7, 5, 3, 2,
 97 | 		2, 4, 6, 7, 5, 3, 1, 0,
 98 | 		0, 2, 4, 6, 7, 5, 3, 1,
 99 | 		1, 2, 4, 6, 7, 5, 3, 0,
100 | 		0, 1, 2, 4, 6, 7, 5, 3,
101 | 		3, 4, 6, 7, 5, 2, 1, 0,
102 | 		0, 3, 4, 6, 7, 5, 2, 1,
103 | 		1, 3, 4, 6, 7, 5, 2, 0,
104 | 		0, 1, 3, 4, 6, 7, 5, 2,
105 | 		2, 3, 4, 6, 7, 5, 1, 0,
106 | 		0, 2, 3, 4, 6, 7, 5, 1,
107 | 		1, 2, 3, 4, 6, 7, 5, 0,
108 | 		0, 1, 2, 3, 4, 6, 7, 5,
109 | 		5, 6, 7, 4, 3, 2, 1, 0,
110 | 		0, 5, 6, 7, 4, 3, 2, 1,
111 | 		1, 5, 6, 7, 4, 3, 2, 0,
112 | 		0, 1, 5, 6, 7, 4, 3, 2,
113 | 		2, 5, 6, 7, 4, 3, 1, 0,
114 | 		0, 2, 5, 6, 7, 4, 3, 1,
115 | 		1, 2, 5, 6, 7, 4, 3, 0,
116 | 		0, 1, 2, 5, 6, 7, 4, 3,
117 | 		3, 5, 6, 7, 4, 2, 1, 0,
118 | 		0, 3, 5, 6, 7, 4, 2, 1,
119 | 		1, 3, 5, 6, 7, 4, 2, 0,
120 | 		0, 1, 3, 5, 6, 7, 4, 2,
121 | 		2, 3, 5, 6, 7, 4, 1, 0,
122 | 		0, 2, 3, 5, 6, 7, 4, 1,
123 | 		1, 2, 3, 5, 6, 7, 4, 0,
124 | 		0, 1, 2, 3, 5, 6, 7, 4,
125 | 		4, 5, 6, 7, 3, 2, 1, 0,
126 | 		0, 4, 5, 6, 7, 3, 2, 1,
127 | 		1, 4, 5, 6, 7, 3, 2, 0,
128 | 		0, 1, 4, 5, 6, 7, 3, 2,
129 | 		2, 4, 5, 6, 7, 3, 1, 0,
130 | 		0, 2, 4, 5, 6, 7, 3, 1,
131 | 		1, 2, 4, 5, 6, 7, 3, 0,
132 | 		0, 1, 2, 4, 5, 6, 7, 3,
133 | 		3, 4, 5, 6, 7, 2, 1, 0,
134 | 		0, 3, 4, 5, 6, 7, 2, 1,
135 | 		1, 3, 4, 5, 6, 7, 2, 0,
136 | 		0, 1, 3, 4, 5, 6, 7, 2,
137 | 		2, 3, 4, 5, 6, 7, 1, 0,
138 | 		0, 2, 3, 4, 5, 6, 7, 1,
139 | 		1, 2, 3, 4, 5, 6, 7, 0,
140 | 		0, 1, 2, 3, 4, 5, 6, 7,
141 | 		7, 6, 5, 4, 3, 2, 1, 0,
142 | 		0, 7, 6, 5, 4, 3, 2, 1,
143 | 		1, 7, 6, 5, 4, 3, 2, 0,
144 | 		0, 1, 7, 6, 5, 4, 3, 2,
145 | 		2, 7, 6, 5, 4, 3, 1, 0,
146 | 		0, 2, 7, 6, 5, 4, 3, 1,
147 | 		1, 2, 7, 6, 5, 4, 3, 0,
148 | 		0, 1, 2, 7, 6, 5, 4, 3,
149 | 		3, 7, 6, 5, 4, 2, 1, 0,
150 | 		0, 3, 7, 6, 5, 4, 2, 1,
151 | 		1, 3, 7, 6, 5, 4, 2, 0,
152 | 		0, 1, 3, 7, 6, 5, 4, 2,
153 | 		2, 3, 7, 6, 5, 4, 1, 0,
154 | 		0, 2, 3, 7, 6, 5, 4, 1,
155 | 		1, 2, 3, 7, 6, 5, 4, 0,
156 | 		0, 1, 2, 3, 7, 6, 5, 4,
157 | 		4, 7, 6, 5, 3, 2, 1, 0,
158 | 		0, 4, 7, 6, 5, 3, 2, 1,
159 | 		1, 4, 7, 6, 5, 3, 2, 0,
160 | 		0, 1, 4, 7, 6, 5, 3, 2,
161 | 		2, 4, 7, 6, 5, 3, 1, 0,
162 | 		0, 2, 4, 7, 6, 5, 3, 1,
163 | 		1, 2, 4, 7, 6, 5, 3, 0,
164 | 		0, 1, 2, 4, 7, 6, 5, 3,
165 | 		3, 4, 7, 6, 5, 2, 1, 0,
166 | 		0, 3, 4, 7, 6, 5, 2, 1,
167 | 		1, 3, 4, 7, 6, 5, 2, 0,
168 | 		0, 1, 3, 4, 7, 6, 5, 2,
169 | 		2, 3, 4, 7, 6, 5, 1, 0,
170 | 		0, 2, 3, 4, 7, 6, 5, 1,
171 | 		1, 2, 3, 4, 7, 6, 5, 0,
172 | 		0, 1, 2, 3, 4, 7, 6, 5,
173 | 		5, 7, 6, 4, 3, 2, 1, 0,
174 | 		0, 5, 7, 6, 4, 3, 2, 1,
175 | 		1, 5, 7, 6, 4, 3, 2, 0,
176 | 		0, 1, 5, 7, 6, 4, 3, 2,
177 | 		2, 5, 7, 6, 4, 3, 1, 0,
178 | 		0, 2, 5, 7, 6, 4, 3, 1,
179 | 		1, 2, 5, 7, 6, 4, 3, 0,
180 | 		0, 1, 2, 5, 7, 6, 4, 3,
181 | 		3, 5, 7, 6, 4, 2, 1, 0,
182 | 		0, 3, 5, 7, 6, 4, 2, 1,
183 | 		1, 3, 5, 7, 6, 4, 2, 0,
184 | 		0, 1, 3, 5, 7, 6, 4, 2,
185 | 		2, 3, 5, 7, 6, 4, 1, 0,
186 | 		0, 2, 3, 5, 7, 6, 4, 1,
187 | 		1, 2, 3, 5, 7, 6, 4, 0,
188 | 		0, 1, 2, 3, 5, 7, 6, 4,
189 | 		4, 5, 7, 6, 3, 2, 1, 0,
190 | 		0, 4, 5, 7, 6, 3, 2, 1,
191 | 		1, 4, 5, 7, 6, 3, 2, 0,
192 | 		0, 1, 4, 5, 7, 6, 3, 2,
193 | 		2, 4, 5, 7, 6, 3, 1, 0,
194 | 		0, 2, 4, 5, 7, 6, 3, 1,
195 | 		1, 2, 4, 5, 7, 6, 3, 0,
196 | 		0, 1, 2, 4, 5, 7, 6, 3,
197 | 		3, 4, 5, 7, 6, 2, 1, 0,
198 | 		0, 3, 4, 5, 7, 6, 2, 1,
199 | 		1, 3, 4, 5, 7, 6, 2, 0,
200 | 		0, 1, 3, 4, 5, 7, 6, 2,
201 | 		2, 3, 4, 5, 7, 6, 1, 0,
202 | 		0, 2, 3, 4, 5, 7, 6, 1,
203 | 		1, 2, 3, 4, 5, 7, 6, 0,
204 | 		0, 1, 2, 3, 4, 5, 7, 6,
205 | 		6, 7, 5, 4, 3, 2, 1, 0,
206 | 		0, 6, 7, 5, 4, 3, 2, 1,
207 | 		1, 6, 7, 5, 4, 3, 2, 0,
208 | 		0, 1, 6, 7, 5, 4, 3, 2,
209 | 		2, 6, 7, 5, 4, 3, 1, 0,
210 | 		0, 2, 6, 7, 5, 4, 3, 1,
211 | 		1, 2, 6, 7, 5, 4, 3, 0,
212 | 		0, 1, 2, 6, 7, 5, 4, 3,
213 | 		3, 6, 7, 5, 4, 2, 1, 0,
214 | 		0, 3, 6, 7, 5, 4, 2, 1,
215 | 		1, 3, 6, 7, 5, 4, 2, 0,
216 | 		0, 1, 3, 6, 7, 5, 4, 2,
217 | 		2, 3, 6, 7, 5, 4, 1, 0,
218 | 		0, 2, 3, 6, 7, 5, 4, 1,
219 | 		1, 2, 3, 6, 7, 5, 4, 0,
220 | 		0, 1, 2, 3, 6, 7, 5, 4,
221 | 		4, 6, 7, 5, 3, 2, 1, 0,
222 | 		0, 4, 6, 7, 5, 3, 2, 1,
223 | 		1, 4, 6, 7, 5, 3, 2, 0,
224 | 		0, 1, 4, 6, 7, 5, 3, 2,
225 | 		2, 4, 6, 7, 5, 3, 1, 0,
226 | 		0, 2, 4, 6, 7, 5, 3, 1,
227 | 		1, 2, 4, 6, 7, 5, 3, 0,
228 | 		0, 1, 2, 4, 6, 7, 5, 3,
229 | 		3, 4, 6, 7, 5, 2, 1, 0,
230 | 		0, 3, 4, 6, 7, 5, 2, 1,
231 | 		1, 3, 4, 6, 7, 5, 2, 0,
232 | 		0, 1, 3, 4, 6, 7, 5, 2,
233 | 		2, 3, 4, 6, 7, 5, 1, 0,
234 | 		0, 2, 3, 4, 6, 7, 5, 1,
235 | 		1, 2, 3, 4, 6, 7, 5, 0,
236 | 		0, 1, 2, 3, 4, 6, 7, 5,
237 | 		5, 6, 7, 4, 3, 2, 1, 0,
238 | 		0, 5, 6, 7, 4, 3, 2, 1,
239 | 		1, 5, 6, 7, 4, 3, 2, 0,
240 | 		0, 1, 5, 6, 7, 4, 3, 2,
241 | 		2, 5, 6, 7, 4, 3, 1, 0,
242 | 		0, 2, 5, 6, 7, 4, 3, 1,
243 | 		1, 2, 5, 6, 7, 4, 3, 0,
244 | 		0, 1, 2, 5, 6, 7, 4, 3,
245 | 		3, 5, 6, 7, 4, 2, 1, 0,
246 | 		0, 3, 5, 6, 7, 4, 2, 1,
247 | 		1, 3, 5, 6, 7, 4, 2, 0,
248 | 		0, 1, 3, 5, 6, 7, 4, 2,
249 | 		2, 3, 5, 6, 7, 4, 1, 0,
250 | 		0, 2, 3, 5, 6, 7, 4, 1,
251 | 		1, 2, 3, 5, 6, 7, 4, 0,
252 | 		0, 1, 2, 3, 5, 6, 7, 4,
253 | 		4, 5, 6, 7, 3, 2, 1, 0,
254 | 		0, 4, 5, 6, 7, 3, 2, 1,
255 | 		1, 4, 5, 6, 7, 3, 2, 0,
256 | 		0, 1, 4, 5, 6, 7, 3, 2,
257 | 		2, 4, 5, 6, 7, 3, 1, 0,
258 | 		0, 2, 4, 5, 6, 7, 3, 1,
259 | 		1, 2, 4, 5, 6, 7, 3, 0,
260 | 		0, 1, 2, 4, 5, 6, 7, 3,
261 | 		3, 4, 5, 6, 7, 2, 1, 0,
262 | 		0, 3, 4, 5, 6, 7, 2, 1,
263 | 		1, 3, 4, 5, 6, 7, 2, 0,
264 | 		0, 1, 3, 4, 5, 6, 7, 2,
265 | 		2, 3, 4, 5, 6, 7, 1, 0,
266 | 		0, 2, 3, 4, 5, 6, 7, 1,
267 | 		1, 2, 3, 4, 5, 6, 7, 0,
268 | 		0, 1, 2, 3, 4, 5, 6, 7
269 | };
270 | 
/**
 * Scalar merge-style intersection of two ascending lists.
 *
 * Walks both lists in lockstep, advancing whichever side currently holds the
 * smaller value; equal values are appended to result and both sides advance.
 *
 * @param list1  first list
 * @param size1  number of elements in list1
 * @param list2  second list
 * @param size2  number of elements in list2
 * @param result receives the common values, in the order they are found
 * @return number of values written to result
 */
size_t tetzank_intersect_scalar(const uint32_t *list1, size_t size1, const uint32_t *list2, size_t size2, uint32_t *result){
	size_t i = 0, j = 0, matched = 0;
	while(i < size1 && j < size2){
		const uint32_t a = list1[i];
		const uint32_t b = list2[j];
		if(a == b){
			// common element: emit once and move past it on both sides
			result[matched++] = a;
			++i;
			++j;
		}else if(a < b){
			++i;
		}else{
			++j;
		}
	}
	return matched;
}
286 | 
/**
 * Counts the values common to two ascending lists without storing them.
 *
 * Same merge walk as tetzank_intersect_scalar, but only the number of
 * matches is tracked.
 *
 * @param list1 first list
 * @param size1 number of elements in list1
 * @param list2 second list
 * @param size2 number of elements in list2
 * @return number of positions at which the two walks met on an equal value
 */
size_t tetzank_intersect_scalar_count(const uint32_t *list1, size_t size1, const uint32_t *list2, size_t size2){
	const uint32_t *const stop1 = list1 + size1;
	const uint32_t *const stop2 = list2 + size2;
	size_t matches = 0;
	for(const uint32_t *p = list1, *q = list2; p != stop1 && q != stop2; ){
		if(*p == *q){
			++matches;
			++p;
			++q;
		}else if(*p < *q){
			++p;
		}else{
			++q;
		}
	}
	return matches;
}
302 | 
303 | 
304 | #ifdef __AVX2__
305 | 
306 | 
307 | #include <immintrin.h>
308 | 
309 | 
310 | size_t tetzank_intersect_vector_avx2(const uint32_t *list1, size_t size1, const uint32_t *list2, size_t size2, uint32_t *result){
311 | 	size_t count=0, i_a=0, i_b=0;
312 | 	size_t st_a = (size1 / 8) * 8;
313 | 	size_t st_b = (size2 / 8) * 8;
314 | 	while(i_a < st_a && i_b < st_b){
315 | 		__m256i v_a = _mm256_loadu_si256((const __m256i*)&list1[i_a]);
316 | 		__m256i v_b = _mm256_loadu_si256((const __m256i*)&list2[i_b]);
317 | 
318 | 		int32_t a_max = list1[i_a+7];
319 | 		int32_t b_max = list2[i_b+7];
320 | 		i_a += (a_max <= b_max) * 8;
321 | 		i_b += (a_max >= b_max) * 8;
322 | 
323 | 		constexpr int32_t cyclic_shift = _MM_SHUFFLE(0,3,2,1); //rotating right
324 | 		constexpr int32_t cyclic_shift2= _MM_SHUFFLE(2,1,0,3); //rotating left
325 | 		constexpr int32_t cyclic_shift3= _MM_SHUFFLE(1,0,3,2); //between
326 | 		__m256i cmp_mask1 = _mm256_cmpeq_epi32(v_a, v_b);
327 | 		__m256 rot1 = _mm256_permute_ps((__m256)v_b, cyclic_shift);
328 | 		__m256i cmp_mask2 = _mm256_cmpeq_epi32(v_a, (__m256i)rot1);
329 | 		__m256 rot2 = _mm256_permute_ps((__m256)v_b, cyclic_shift3);
330 | 		__m256i cmp_mask3 = _mm256_cmpeq_epi32(v_a, (__m256i)rot2);
331 | 		__m256 rot3 = _mm256_permute_ps((__m256)v_b, cyclic_shift2);
332 | 		__m256i cmp_mask4 = _mm256_cmpeq_epi32(v_a, (__m256i)rot3);
333 | 
334 | 		__m256 rot4 = _mm256_permute2f128_ps((__m256)v_b, (__m256)v_b, 1);
335 | 
336 | 		__m256i cmp_mask5 = _mm256_cmpeq_epi32(v_a, (__m256i)rot4);
337 | 		__m256 rot5 = _mm256_permute_ps(rot4, cyclic_shift);
338 | 		__m256i cmp_mask6 = _mm256_cmpeq_epi32(v_a, (__m256i)rot5);
339 | 		__m256 rot6 = _mm256_permute_ps(rot4, cyclic_shift3);
340 | 		__m256i cmp_mask7 = _mm256_cmpeq_epi32(v_a, (__m256i)rot6);
341 | 		__m256 rot7 = _mm256_permute_ps(rot4, cyclic_shift2);
342 | 		__m256i cmp_mask8 = _mm256_cmpeq_epi32(v_a, (__m256i)rot7);
343 | 
344 | 		// AVX2: _mm256_or_si256
345 | 		__m256i cmp_mask = _mm256_or_si256(
346 | 			_mm256_or_si256(
347 | 				_mm256_or_si256(cmp_mask1, cmp_mask2),
348 | 				_mm256_or_si256(cmp_mask3, cmp_mask4)
349 | 			),
350 | 			_mm256_or_si256(
351 | 				_mm256_or_si256(cmp_mask5, cmp_mask6),
352 | 				_mm256_or_si256(cmp_mask7, cmp_mask8)
353 | 			)
354 | 		);
355 | 		int32_t mask = _mm256_movemask_ps((__m256)cmp_mask);
356 | 
357 | 		__m256i idx = _mm256_load_si256((const __m256i*)&shuffle_mask_avx[mask*8]);
358 | 		__m256i p = _mm256_permutevar8x32_epi32(v_a, idx);
359 | 		_mm256_storeu_si256((__m256i*)&result[count], p);
360 | 
361 | 		count += _mm_popcnt_u32(mask);
362 | 	}
363 | 	// intersect the tail using scalar intersection
364 | 	count += tetzank_intersect_scalar(list1+i_a, size1-i_a, list2+i_b, size2-i_b, result+count);
365 | 	return count;
366 | }
367 | size_t tetzank_intersect_vector_avx2_count(const uint32_t *list1, size_t size1, const uint32_t *list2, size_t size2){
368 | 	size_t count=0, i_a=0, i_b=0;
369 | 	size_t st_a = (size1 / 8) * 8;
370 | 	size_t st_b = (size2 / 8) * 8;
371 | 	while(i_a < st_a && i_b < st_b){
372 | 		__m256i v_a = _mm256_loadu_si256((const __m256i*)&list1[i_a]);
373 | 		__m256i v_b = _mm256_loadu_si256((const __m256i*)&list2[i_b]);
374 | 		int32_t a_max = list1[i_a+7];
375 | 		int32_t b_max = list2[i_b+7];
376 | 		i_a += (a_max <= b_max) * 8;
377 | 		i_b += (a_max >= b_max) * 8;
378 | 
379 | 		constexpr int32_t cyclic_shift = _MM_SHUFFLE(0,3,2,1); //rotating right
380 | 		constexpr int32_t cyclic_shift2= _MM_SHUFFLE(2,1,0,3); //rotating left
381 | 		constexpr int32_t cyclic_shift3= _MM_SHUFFLE(1,0,3,2); //between
382 | 		// AVX2: _mm256_cmpeq_epi32
383 | 		__m256i cmp_mask1 = _mm256_cmpeq_epi32(v_a, v_b);
384 | 		__m256 rot1 = _mm256_permute_ps((__m256)v_b, cyclic_shift);
385 | 		__m256i cmp_mask2 = _mm256_cmpeq_epi32(v_a, (__m256i)rot1);
386 | 		__m256 rot2 = _mm256_permute_ps((__m256)v_b, cyclic_shift3);
387 | 		__m256i cmp_mask3 = _mm256_cmpeq_epi32(v_a, (__m256i)rot2);
388 | 		__m256 rot3 = _mm256_permute_ps((__m256)v_b, cyclic_shift2);
389 | 		__m256i cmp_mask4 = _mm256_cmpeq_epi32(v_a, (__m256i)rot3);
390 | 
391 | 		__m256 rot4 = _mm256_permute2f128_ps((__m256)v_b, (__m256)v_b, 1);
392 | 
393 | 		__m256i cmp_mask5 = _mm256_cmpeq_epi32(v_a, (__m256i)rot4);
394 | 		__m256 rot5 = _mm256_permute_ps(rot4, cyclic_shift);
395 | 		__m256i cmp_mask6 = _mm256_cmpeq_epi32(v_a, (__m256i)rot5);
396 | 		__m256 rot6 = _mm256_permute_ps(rot4, cyclic_shift3);
397 | 		__m256i cmp_mask7 = _mm256_cmpeq_epi32(v_a, (__m256i)rot6);
398 | 		__m256 rot7 = _mm256_permute_ps(rot4, cyclic_shift2);
399 | 		__m256i cmp_mask8 = _mm256_cmpeq_epi32(v_a, (__m256i)rot7);
400 | 
401 | 		// AVX2: _mm256_or_si256
402 | 		__m256i cmp_mask = _mm256_or_si256(
403 | 			_mm256_or_si256(
404 | 				_mm256_or_si256(cmp_mask1, cmp_mask2),
405 | 				_mm256_or_si256(cmp_mask3, cmp_mask4)
406 | 			),
407 | 			_mm256_or_si256(
408 | 				_mm256_or_si256(cmp_mask5, cmp_mask6),
409 | 				_mm256_or_si256(cmp_mask7, cmp_mask8)
410 | 			)
411 | 		);
412 | 		int32_t mask = _mm256_movemask_ps((__m256)cmp_mask);
413 | 		count += _mm_popcnt_u32(mask);
414 | 	}
415 | 	// intersect the tail using scalar intersection
416 | 	count += tetzank_intersect_scalar_count(list1+i_a, size1-i_a, list2+i_b, size2-i_b);
417 | 
418 | 	return count;
419 | }
420 | #endif
421 | 
422 | 
423 | 
424 | 
425 | #endif /* INCLUDE_TETZANK_H_ */
426 | 


--------------------------------------------------------------------------------
/include/thomaswu.h:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | #ifndef THOMASWU_H_
  4 | #define THOMASWU_H_
  5 | 
  6 | #include "common.h"
  7 | #include "intersection.h"
  8 | 
  9 | typedef uint32_t UINT4;
 10 | 
 11 | typedef uint64_t UINT8;
 12 | 
 13 | typedef long (*intersectionfindfunction)(UINT4 goal, const UINT4 *target, long ntargets);
 14 | 
 15 | template <intersectionfindfunction FINDFUNCTION>
 16 | size_t
 17 | compute_intersection (const uint32_t * rare,
 18 |         const size_t nrare, const uint32_t * freq, const size_t nfreq, uint32_t * out) {
 19 |   UINT4 goal;
 20 |   const UINT4 *stop_rare;
 21 |   UINT4 *init_out;
 22 |   long j;
 23 |   long nfreqleft = static_cast<int>(nfreq);// possibly unsafe if nfreq exceeds the range of longs
 24 | 
 25 |   init_out = out;
 26 |   stop_rare = &(rare[nrare]);
 27 |   while (rare < stop_rare) {
 28 |     goal = *rare++;
 29 |     j = FINDFUNCTION(goal,freq,nfreqleft);
 30 | 
 31 |     if (j >= nfreqleft) {
 32 |       return (out - init_out);
 33 |     } else if (freq[j] == goal) {
 34 |       *out++ = goal;
 35 |     }
 36 |     freq += j;
 37 |     nfreqleft -= j;
 38 |   }
 39 |   return (out - init_out);
 40 | }
 41 | 
 42 | 
/*
 * Search routines matching the intersectionfindfunction signature, i.e.
 * usable as the FINDFUNCTION template argument of compute_intersection.
 * They are only declared here; their definitions are not part of this
 * header (presumably src/thomaswu.cpp -- confirm).
 *
 * compute_intersection treats the returned value as an index into target:
 * a result >= ntargets ends the intersection, otherwise target[result] is
 * compared with goal.
 * NOTE(review): the names suggest scalar, galloping and SIMD (v1/v2/v3,
 * aligned/unaligned) variants of the same search; verify against the
 * definitions before relying on any difference between them.
 */
long
Intersection_find_scalar (UINT4 goal, const UINT4 *target, long ntargets);
long
Intersection_find_gallop (UINT4 goal, const UINT4 *target, long ntargets);
long
Intersection_find_v1 (UINT4 goal, const UINT4 *target, long ntargets);
long
Intersection_find_v1_aligned (UINT4 goal, const UINT4 *target, long ntargets);
long
Intersection_find_v1_plow (UINT4 goal, const UINT4 *target, long ntargets);
long
Intersection_find_v2 (UINT4 goal, const UINT4 *target, long ntargets);
long
Intersection_find_v2_aligned (UINT4 goal, const UINT4 *target, long ntargets);
long
Intersection_find_v3 (UINT4 goal, const UINT4 *target, long ntargets);
long
Intersection_find_v3_aligned (UINT4 goal, const UINT4 *target, long ntargets);
long
Intersection_find_simdgallop_v0 (UINT4 goal, const UINT4 *target, long ntargets);
long
Intersection_find_simdgallop_v1 (UINT4 goal, const UINT4 *target, long ntargets);
long
Intersection_find_simdgallop_v2 (UINT4 goal, const UINT4 *target, long ntargets);
long
Intersection_find_simdgallop_v3 (UINT4 goal, const UINT4 *target, long ntargets);
 69 | 
 70 | 
/*
 * Signature of a search routine usable with compute_intersection_flagged.
 * Besides returning a position, the routine reports through *foundp whether
 * goal was found: compute_intersection_flagged emits goal when *foundp == 1
 * and then advances the target range by the returned value.
 */
typedef long (*flaggedintersectionfindfunction)(int *foundp, UINT4 goal, const UINT4 *target, long ntargets);

/*
 * Search routines with the flagged signature. Only declared here; the
 * definitions are not part of this header (presumably src/thomaswu.cpp --
 * confirm).
 */
long
Intersection_find_v3_cmpeq (int *foundp, UINT4 goal, const UINT4 *target, long ntargets);

long
Intersection_truefind_v3_cmpeq_scalar (int *foundp, UINT4 goal, const UINT4 *target, long ntargets);


long
Intersection_truefind_v3_cmpeq_simd32 (int *foundp, UINT4 goal, const UINT4 *target, long ntargets);


long
Intersection_truefind_v3_cmpeq_simd8 (int *foundp, UINT4 goal, const UINT4 *target, long ntargets) ;

long
Intersection_truefind_v3_cmpeq_binary (int *foundp, UINT4 goal, const UINT4 *target, long ntargets) ;
 89 | 
 90 | template <flaggedintersectionfindfunction FINDFUNCTION>
 91 | size_t
 92 | compute_intersection_flagged (const uint32_t * rare,
 93 |         const size_t nrare, const uint32_t * freq, const size_t nfreq, uint32_t * out) {
 94 |   UINT4 *init_out;
 95 |   size_t i;
 96 | 
 97 |   init_out = out;
 98 | 
 99 |   size_t lenFreq = nfreq;
100 |   long pos;
101 |   int foundp;
102 |   for (i = 0; i < nrare; i++) {
103 |       pos = FINDFUNCTION(&foundp,rare[i],freq,lenFreq);
104 |       if (foundp == 1) {
105 |         *out++ = rare[i];
106 |       }
107 |       freq += pos;
108 |       lenFreq -= pos;
109 |     }
110 |   return (out - init_out);
111 | }
112 | 
113 | 
114 | #endif /* THOMASWU_H_ */
115 | 


--------------------------------------------------------------------------------
/include/timer.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * This code is released under the
 3 |  * Apache License Version 2.0 http://www.apache.org/licenses/.
 4 |  *
 5 |  */
 6 | 
 7 | #ifndef TIMER_H_
 8 | #define TIMER_H_
 9 | 
10 | #include <sys/stat.h>
11 | #include <sys/time.h>
12 | #include <sys/types.h>
13 | 
/**
 * Wall-clock stopwatch based on gettimeofday().
 *
 * t1 is the start stamp and t2 the most recent stop stamp; all results are
 * expressed in microseconds.
 */
class WallClockTimer {
public:
    struct timeval t1, t2;

    /** Starts the stopwatch: both stamps are set to the current time. */
    WallClockTimer() :
        t1(), t2() {
        reset();
    }

    /** Restarts the stopwatch; elapsed() is 0 right after this call. */
    void reset() {
        gettimeofday(&t1, 0);
        t2 = t1;
    }

    /**
     * Microseconds between the start stamp and the last stop stamp.
     * Does not read the clock; call split() to take a new stop stamp.
     */
    uint64_t elapsed() {
        // whole seconds, converted to microseconds in unsigned arithmetic
        const uint64_t whole = (t2.tv_sec - t1.tv_sec) * 1000ULL * 1000ULL;
        // a negative microsecond difference wraps and cancels out in the sum
        return whole + (t2.tv_usec - t1.tv_usec);
    }

    /** Takes a new stop stamp and returns the microseconds since the start. */
    uint64_t split() {
        gettimeofday(&t2, 0);
        return elapsed();
    }
};
35 | 
36 | #endif /* TIMER_H_ */
37 | 


--------------------------------------------------------------------------------
/include/union.h:
--------------------------------------------------------------------------------
 1 | #ifndef UNION_H_
 2 | #define UNION_H_
 3 | #include "common.h"
 4 | 
 5 | size_t unite(const uint32_t * set1, const size_t length1,
 6 |         const uint32_t * set2, const size_t length2, uint32_t * out) {
 7 |     size_t pos = 0;
 8 |     size_t k1 = 0, k2 = 0;
 9 |     if (0 == length1) {
10 |         for (size_t k = 0; k < length2; ++k)
11 |             out[k] = set2[k];
12 |         return length2;
13 |     }
14 |     if (0 == length2) {
15 |         for (size_t k = 0; k < length1; ++k)
16 |             out[k] = set1[k];
17 |         return length1;
18 |     }
19 |     while (true) {
20 |         if (set1[k1] < set2[k2]) {
21 |             out[pos++] = set1[k1];
22 |             ++k1;
23 |             if (k1 >= length1) {
24 |                 for (; k2 < length2; ++k2)
25 |                     out[pos++] = set2[k2];
26 |                 break;
27 |             }
28 |         } else if (set1[k1] == set2[k2]) {
29 |             out[pos++] = set1[k1];
30 |             ++k1;
31 |             ++k2;
32 |             if (k1 >= length1) {
33 |                 for (; k2 < length2; ++k2)
34 |                     out[pos++] = set2[k2];
35 |                 break;
36 |             }
37 |             if (k2 >= length2) {
38 |                 for (; k1 < length1; ++k1)
39 |                     out[pos++] = set1[k1];
40 |                 break;
41 |             }
42 |         } else {// if (set1[k1]>set2[k2]) {
43 |             out[pos++] = set2[k2];
44 |             ++k2;
45 |             if (k2 >= length2) {
46 |                 for (; k1 < length1; ++k1)
47 |                     out[pos++] = set1[k1];
48 |                 break;
49 |             }
50 |         }
51 |     }
52 |     return pos;
53 |     }
54 | 
55 | #endif /* UNION_H_ */
56 | 


--------------------------------------------------------------------------------
/include/util.h:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * This code is released under the
 3 |  * Apache License Version 2.0 http://www.apache.org/licenses/.
 4 |  *
 5 |  */
 6 | 
 7 | #ifndef UTIL_H_
 8 | #define UTIL_H_
 9 | 
10 | #include "common.h"
11 | 
12 | using namespace std;
13 | 
14 | 
/**
 * Unsaturated packing: keeps only the low 16 bits of every 32-bit lane of
 * both arguments and packs the eight resulting 16-bit values into one
 * vector. The lanes of the first argument (hi) end up in the low 64 bits of
 * the result, those of the second argument (lo) in the high 64 bits.
 * Masking first guarantees that the saturating pack never clamps.
 */
__attribute__((always_inline))
inline __m128i __pack_epu32( __m128i hi, __m128i lo ) {
    // 0x0000FFFF in each 32-bit lane
    const __m128i low16 = _mm_set1_epi32(0xFFFF);
    return _mm_packus_epi32( _mm_and_si128( hi, low16 ),
                             _mm_and_si128( lo, low16 ) );
}
26 | 
/**
 * Alternative to __pack_epu32 built from a shift, a blend and a byte
 * shuffle instead of mask-and-pack. Produces the same layout: the low 16
 * bits of the lanes of hi in the low 64 bits of the result, those of lo in
 * the high 64 bits.
 * Not recommended.
 */
__attribute__((always_inline))
inline __m128i __altpack_epu32( __m128i hi, __m128i lo ) {
    // move the low half-word of every lane of lo into the odd half-word slot
    const __m128i lo_shifted = _mm_slli_si128(lo, 2);
    // 0xAA = 0b10101010: odd half-words from lo_shifted, even ones from hi
    const __m128i interleaved = _mm_blend_epi16(hi, lo_shifted, 0xAA);
    // gather the even half-words first, then the odd ones
    const __m128i gather =
            _mm_set_epi8(15,14,11,10, 7,6,3,2,  13,12,9,8,5,4,1,0);
    return _mm_shuffle_epi8(interleaved, gather);
}
40 | 
41 | 
/**
 * Splits str into the maximal runs of characters that are not in del.
 *
 * Every character of del acts as a separator; consecutive, leading and
 * trailing separators produce no empty tokens.
 *
 * @param str text to split
 * @param del set of separator characters
 * @return the tokens in order of appearance (empty if str has none)
 */
vector<string> split(const string& str, const string& del) {
    vector<string> pieces;
    size_t start = str.find_first_not_of(del, 0);
    while (start != string::npos) {
        // first separator after the token, or npos when the token runs to the end
        const size_t stop = str.find_first_of(del, start);
        pieces.push_back(str.substr(start, stop - start));
        start = str.find_first_not_of(del, stop);
    }
    return pieces;
}
53 | 
54 | 
/**
 * Partial Fisher-Yates shuffle.
 *
 * Moves N randomly chosen elements of [begin, end) to the front of the
 * range, using rand() as the source of randomness. The elements are
 * permuted in place; none is added or removed.
 *
 * @param begin start of the range
 * @param end   end of the range
 * @param N     number of elements to draw; values larger than the range
 *              length are clamped to it (previously this ran past `end`
 *              and evaluated rand() % 0)
 * @return iterator one past the last drawn element, i.e. begin advanced by
 *         the number of elements actually drawn
 */
template<class iter>
iter shuffleFY(iter begin, iter end, size_t N) {
   size_t M = std::distance(begin, end);
   // never draw more elements than the range holds
   if (N > M)
       N = M;
   while (N--) {
       // pick one of the M elements not drawn yet and swap it to the front
       iter r = begin;
       std::advance(r, rand() % M);
       using std::swap;
       swap(*begin, *r);
       ++begin;
       --M;
   }
   return begin;
}
68 | 
69 | #endif /* UTIL_H_ */
70 | 


--------------------------------------------------------------------------------
/results/benchintersection5march2014.gnuplot:
--------------------------------------------------------------------------------
 1 | 
 2 | 
# Plots the results in benchintersection5march2014.txt as three PDF files:
# galloping variants, "v1" variants and "v3" variants. Every curve is one
# data column divided by column 3, drawn against column 1 (the size ratio).

# Line style 80 is used for the border below.
set style line 80 lt rgb "#000000"

# Line style for grid
#set style line 81 lt 0  # dashed
#set style line 81 lt rgb "#808080"  # grey

#set grid back linestyle 81
set border 3 back linestyle 80 # Remove border on top and right.  These
             # borders are useless and make it harder
             # to see plotted lines near the border.
    # NOTE(review): an earlier comment here said the border is drawn in grey,
    # but line style 80 above is black ("#000000").
set xtics nomirror
set ytics nomirror



# Line styles 1-11: colour, width and point type for up to eleven curves.
# NOTE(review): the plot commands below set only lw/ps inline and do not
# reference these styles by number.
set style line 1 lt rgb "#A00000" lw 4 pt 1 ps 0.5
set style line 2 lt rgb "#00A000" lw 4 pt 5 ps 0.5
set style line 3 lt rgb "#5060D0" lw 4 pt 7 ps 0.5
set style line 4 lt rgb "#FF1493" lw 4 pt 9 ps 0.5
set style line 5 lt rgb "red" lw 4 pt 11 ps 0.5
set style line 6 lt rgb "#808000" lw 4 pt 13 ps 0.5
set style line 7 lt rgb "#00008B" lw 4 pt 15 ps 0.5
set style line 8 lt rgb "#800080" lw 4 pt 21 ps 0.5
set style line 9 lt rgb "black" lw 4 pt 63 ps 0.5
set style line 10 lt rgb "blue" lw 4 pt 28 ps 0.5
set style line 11 lt rgb "violet" lw 4 pt 44 ps 0.5
# Line style 81 (grid): the second definition below is the one in effect.
set style line 81 lt 0  # dashed
set style line 81 lt rgb "#808080"  # grey

set grid back linestyle 81
#set xtics 2
#set ytics 1
# PDF output through the cairo terminal.
set term pdfcairo 
#fontscale 0.8


# NOTE(review): no plot command is issued before the next "set out", so this
# file receives no plot.
set out "ratiobitpacking.pdf"

set xlabel "Ratio length (large list) / length(small list)"
# NOTE(review): the curves are divided by column 3, which the header of the
# data file names "branchless"; confirm that this is the intended "scalar"
# baseline.
set ylabel "relative speed (scalar = 1)"
#set ylabel "relative speed (galloping = 1)"

# Legend below the plot, compact, at most four rows.
set key bmargin
set key samplen 2 spacing .5 font ",8" maxrows 4
#set xrange [1:8192]
set xrange [1:10000]
# Both axes use base-2 logarithmic scales.
set logscale x 2

set logscale y 2


# Plot 1: galloping variants. Per the data file header, column 13 is
# scalar1sgalloping, 21/22 are simdgalloping/simdgalloping2 and 25-28 are
# thomas_simdgallop_v0..v3.
set out "benchintersection5march2014_gallop.pdf"
#set logscale x 2
#set logscale y

plot "benchintersection5march2014.txt" using 1:($13/$3) ti "gallop" with linespoints lw 2 ps 0.5,\
"" using 1:($21/$3) ti "SIMD gallop" with linespoints lw 2 ps 0.5,\
"" using 1:($22/$3) ti "SIMD gallop2" with linespoints lw 2 ps 0.5,\
"" using 1:($25/$3) ti "Wu SIMD gallop v0" with linespoints lw 2 ps 0.5,\
"" using 1:($26/$3) ti "Wu SIMD gallop v1" with linespoints lw 2 ps 0.5,\
"" using 1:($27/$3) ti "Wu SIMD gallop v2" with linespoints lw 2 ps 0.5,\
"" using 1:($28/$3) ti "Wu SIMD gallop v3" with linespoints lw 2 ps 0.5


# Plot 2: "v1" variants. Column 34 is v1, 29 is thomas_v1 and 30 is
# thomas_v1_plow.
set out "benchintersection5march2014_v1.pdf"

plot "benchintersection5march2014.txt" using 1:($34/$3) ti "SIMD v1" with linespoints lw 2 ps 0.5,\
"" using 1:($29/$3) ti "Wu SIMD v1" with linespoints lw 2 ps 0.5,\
"" using 1:($30/$3) ti "Wu SIMD v1 plow" with linespoints lw 2 ps 0.5



# Plot 3: "v3" variants. Column 35 is v3, 32 is thomas_v3 and 33 is
# thomas_v3_aligned.
set out "benchintersection5march2014_v3.pdf"

plot "benchintersection5march2014.txt" using 1:($35/$3) ti "SIMD v3" with linespoints lw 2 ps 0.5,\
"" using 1:($32/$3) ti "Wu SIMD v3" with linespoints lw 2 ps 0.5,\
"" using 1:($33/$3) ti "Wu SIMD v3 aligned" with linespoints lw 2 ps 0.5
81 | 


--------------------------------------------------------------------------------
/results/benchintersection5march2014.txt:
--------------------------------------------------------------------------------
 1 | # howmany : 5
 2 | # loop : 3
 3 | # distribution : clustered
 4 | # Big : 22
 5 | # intersectionratio : 0.3
 6 | # MaxBit : 26
 7 | # size-ratio	@hybriddan	branchless	danfar	danfarfar	danfarmov	f2p0	f4p0	f8p0	hssimd	hssimddan	natemediumdanalt	scalar1sgalloping	scalarbranchless	scalarbranchlesscached	scalarbranchlesscached2	scalarbranchlessunrolled	scalardanbranchless	scalarnate	scalarnatewg	simdgalloping	simdgalloping2	thomas_gallop	thomas_scalar	thomas_simdgallop_v0	thomas_simdgallop_v1	thomas_simdgallop_v2	thomas_simdgallop_v3	thomas_v1	thomas_v1_plow	thomas_v2	thomas_v3	thomas_v3_aligned	v1	v3	widevector	widevectorleo	relative-intersection-size 
 8 | #generating data...ok.
 9 | 1.001	    990.45	    438.03	    464.16	     442.4	    463.37	    991.06	     998.3	     783.6	    1066.5	    1135.3	    527.17	     388.7	    278.78	    430.61	    438.57	    276.69	    438.18	    434.95	    441.22	     435.3	    474.08	    284.21	    328.98	     204.6	    204.88	    158.27	    155.49	    249.05	     248.8	    188.73	    185.11	    243.47	    990.82	    442.54	    976.53	    1103.7			0.34884
10 | #generating data...ok.
11 | 1.3798	    1024.4	    427.53	    584.81	    552.03	    587.75	      1025	      1092	    893.83	    1034.8	    1077.9	    653.41	    362.24	    269.65	    424.13	    428.98	    266.47	    427.74	     411.6	    419.61	    541.67	       596	    286.96	    328.09	    217.33	    223.85	    176.02	     174.5	    268.44	    264.31	    207.94	     203.8	    247.84	    1024.3	     552.3	    955.56	    1092.6			0.32896
12 | #generating data...ok.
13 | 1.9019	    1097.9	    429.83	    626.36	     597.3	    625.05	    1098.8	    1204.4	    1035.2	    1120.9	    1135.7	     678.9	    376.84	    262.49	    417.55	    422.96	     260.7	    429.82	     440.6	     444.6	    587.51	    630.48	    307.21	    350.76	    240.76	     250.5	    203.04	    202.25	    293.87	    286.51	    236.72	    233.42	    270.03	    1098.9	    597.32	    958.63	      1108			0.34597
14 | #generating data...ok.
15 | 2.6216	    1293.8	    640.14	    880.69	       832	     886.9	    1294.5	    1368.3	    1201.9	    1242.7	    1277.1	    958.96	    542.63	    304.91	    533.24	    536.93	    302.96	    640.36	    663.85	    679.38	    815.51	     882.7	    414.48	     522.1	    337.24	    352.85	    285.07	    281.41	    426.33	    409.21	    338.66	    334.11	    395.23	    1295.5	    830.23	    969.71	      1139			0.32307
16 | #generating data...ok.
17 | 3.6136	    1256.5	    554.73	    1040.3	    984.84	    1050.4	    1258.2	    1462.1	    1423.6	    1207.4	    1201.5	    1065.6	     467.4	    283.25	    483.46	    491.42	    282.04	    554.98	    580.95	    588.65	    962.99	    1038.2	    388.85	    479.19	     348.1	    382.41	    327.65	    326.81	    434.91	    409.58	    374.19	    370.57	    374.25	    1257.5	    982.22	    946.22	    1116.8			0.33031
18 | #generating data...ok.
19 | 4.981	    1350.9	     571.9	    1070.2	    1025.9	    1072.8	      1346	    1623.3	      1676	    1245.8	      1218	    1072.6	    471.58	    283.61	    490.34	     499.2	    281.77	    571.62	    597.29	    601.66	      1006	    1055.6	    403.68	    492.87	    383.43	    423.02	     376.6	     376.9	    469.39	    442.45	    416.09	    418.08	    395.61	    1343.7	    1024.4	    945.58	    1125.6			0.35888
20 | #generating data...ok.
21 | 6.8659	    1559.3	    786.68	    1442.3	    1382.3	    1447.4	    1553.3	      1794	      1936	    1383.4	    1369.9	    1406.2	    653.65	       312	    577.21	    585.72	    311.08	     787.2	    871.59	    880.64	    1348.5	    1393.9	     545.9	     717.8	    504.68	    569.47	     535.3	     535.7	    644.76	    592.17	    606.14	    608.67	    578.73	    1548.4	    1380.9	     958.4	    1153.2			0.33581
22 | #generating data...ok.
23 | 9.4639	    1657.4	    837.43	    1567.3	      1514	    1563.2	    1661.2	    1943.9	    2217.2	    1441.9	    1423.6	    1487.2	       697	    313.74	    589.71	    598.56	    313.99	    837.15	    953.78	    959.17	    1482.9	    1499.8	    606.64	     783.7	    564.85	     642.3	    652.78	    653.66	    710.77	    665.95	    722.19	    735.53	    666.88	    1659.5	    1515.4	    968.42	      1171			0.34253
24 | #generating data...ok.
25 | 13.045	    2154.2	    1026.8	    1875.9	    1842.1	    1864.5	    2165.9	    2382.8	    2550.4	    1522.6	    1506.9	    1792.9	    970.24	    329.31	    647.69	    654.01	    329.86	    1026.5	    1258.8	    1266.5	    1794.4	    1740.7	    820.85	    1053.1	    742.31	    815.57	     808.5	    819.31	    964.85	    887.41	    915.43	    923.51	    912.48	    2165.5	    1843.1	    966.06	    1189.9			0.41017
26 | #generating data...ok.
27 | 17.981	    2433.7	    1091.7	      2580	    2498.9	    2587.7	    2439.5	    2704.7	    2941.2	    1551.5	    1532.1	    2361.5	    1121.4	    330.05	    661.37	    666.71	    330.85	    1091.6	    1401.2	    1417.2	    2430.6	    2390.4	    964.81	    1199.8	    936.25	    1052.6	    1048.5	    1065.8	    1237.8	    1100.1	    1183.8	    1203.8	    1101.8	    2440.7	    2499.4	    928.14	    1173.9			0.33022
28 | #generating data...ok.
29 | 24.786	    2491.7	    1191.9	    2725.9	    2667.5	      2730	    2512.5	    2724.9	    3132.8	      1592	    1572.2	    2443.9	    1160.2	    336.04	    684.17	    690.27	    337.01	      1189	    1574.2	    1570.2	    2582.9	    2470.5	    1017.4	    1348.5	    1010.4	    1146.6	    1297.5	    1330.5	    1356.8	    1219.8	    1435.1	    1491.3	    1279.1	    2502.5	      2642	    965.93	    1199.8			0.34907
30 | #generating data...ok.
31 | 34.165	    2958.9	    1302.1	    3220.4	    3157.8	    3242.6	    2966.2	      3106	      3497	    1626.2	    1606.5	    2938.4	    1459.5	    340.03	    704.12	    708.03	    341.23	    1301.6	    1817.7	    1813.8	    3022.7	    2893.7	    1305.5	    1585.7	      1286	    1415.3	    1571.4	    1602.9	    1754.9	    1550.3	    1782.4	      1799	    1584.8	    2962.6	    3157.4	    933.86	    1204.1			0.36754
32 | #generating data...ok.
33 | 47.093	    3182.6	    1359.7	    3462.6	    3382.7	    3468.2	    3218.4	    3377.9	    3616.4	    1648.1	    1625.8	    3114.1	    1578.3	    342.35	    714.86	    720.38	    343.44	    1360.2	    1983.9	    1987.7	    3280.4	    3088.7	    1388.2	    1746.7	    1450.5	    1617.9	    1817.1	    1908.2	    2097.5	    1768.8	    2066.2	    2131.1	    1843.3	    3219.4	    3379.5	    931.26	    1208.2			0.34864
34 | #generating data...ok.
35 | 64.913	    4363.4	    1470.1	    4262.9	      4397	    4253.5	    3422.4	    3638.4	    4223.1	    1665.7	    1639.6	      3609	    2012.7	     345.1	    730.63	    734.01	    346.39	    1472.3	    2181.7	    2183.5	    4162.3	      3773	    1851.6	    1934.2	    1827.7	    2028.1	    2268.5	    2323.6	    2367.2	    2001.5	      2502	    2601.3	    2266.6	    3418.9	      4400	    905.43	    1215.8			0.33951
36 | #generating data...ok.
37 | 89.477	    4707.4	    1532.7	    4556.6	    4698.7	    4551.7	    3701.1	    3888.3	      4387	    1691.7	    1668.1	    3859.8	    2390.3	    347.64	    742.68	    745.14	    349.04	    1531.1	    2385.3	    2377.6	    4537.7	    4209.5	    2141.6	    2143.1	    2164.7	    2392.3	    2669.7	    2782.1	    2775.7	    2275.7	    2864.8	      3052	    2719.9	    3712.1	    4701.8	    879.54	    1218.8			0.48246
38 | #generating data...ok.
39 | 123.33	    5049.7	      1588	    4938.7	    5025.6	    4977.5	    3952.8	    4120.8	    4589.8	    1712.1	    1686.8	    4273.6	    2525.2	    349.22	    750.44	    752.55	    350.85	    1590.9	    2521.4	    2553.2	    4800.4	    4434.4	    2427.3	    2318.9	      2335	    2538.4	    2785.9	    2888.7	      3232	    2578.9	    3227.7	    3220.2	    2951.4	    3962.9	    5072.3	    839.97	    1216.7			0.33693
40 | #generating data...ok.
41 | 170.01	    5571.3	    1648.3	    5441.4	    5679.8	    5434.9	    4169.9	    4276.7	    4885.6	    1702.2	    1678.9	    4605.7	    3036.2	    349.89	    756.27	    757.99	    351.55	    1649.7	    2663.5	    2680.9	    5340.4	    4778.6	    2911.8	    2440.3	    2771.5	    2961.4	    3214.1	    3226.7	    3601.1	    2820.9	    3626.7	    3668.9	    3483.1	    4168.5	    5685.9	    778.36	    1143.7			0.35392
42 | #generating data...ok.
43 | 234.34	    7303.8	    1683.4	    6228.1	    7167.9	      6110	    4182.8	    4266.5	    5267.9	    1703.2	    1701.6	    5000.6	      3958	    350.82	    759.93	    761.66	    352.49	    1684.2	    2775.6	    2798.3	    6865.6	    5977.1	    3845.3	    2551.4	    3702.4	    4187.8	    4443.2	      4495	    3742.3	    2981.9	      4126	    4571.8	    4480.7	    4177.9	    7346.2	    765.64	    1128.9			0.34326
44 | #generating data...ok.
45 | 323.01	    7600.2	    1728.7	    6563.8	    7325.8	    6510.3	    4353.8	    4437.4	    5330.7	    1723.3	    1702.4	    5218.3	    3945.5	    351.37	     763.6	       765	    353.21	    1726.5	    2904.4	    2888.3	    6892.2	    5957.8	    3943.1	    2648.9	    3705.7	    3948.7	      4244	    4179.7	    3935.9	    3146.1	    4301.6	    4573.1	    4461.6	    4358.7	    7338.6	    754.68	    1150.5			0.35395
46 | #generating data...ok.
47 | 445.24	    9107.2	    1756.4	    7246.2	    8943.1	    7062.7	    4329.7	      4464	    5540.3	    1736.6	    1710.5	    5502.1	    4834.3	    351.84	    766.24	    767.03	    353.73	    1759.7	    2959.9	    2971.8	      7989	    6501.9	    4841.7	    2673.9	    4566.4	    5016.6	    5359.5	    5354.5	    4109.3	    3272.2	    4674.1	    5413.3	    5503.6	    4331.5	    8958.4	    721.95	    988.65			0.34055
48 | #generating data...ok.
49 | 613.72	    9882.8	    1775.2	    7673.3	     10021	    7399.4	    4387.8	    4471.9	    5712.4	    1751.6	    1720.6	    5637.2	      7039	    352.14	    767.84	    769.05	    354.19	    1777.7	    3061.3	    3045.9	     10488	     10601	    7535.6	    2768.5	    6497.5	    7739.3	    8195.2	    8380.5	    4210.7	    3353.9	    5069.8	    7476.6	    8072.4	    4390.5	     10244	    702.74	    923.82			0.33772
50 | #generating data...ok.
51 | 845.95	     11248	    1794.3	    8358.1	     11844	    7924.9	    4405.3	    4458.9	    5864.2	    1702.7	    1676.8	    5804.2	    8999.4	    352.43	    768.93	    769.27	    354.49	    1797.2	    3088.7	    3045.4	     12748	     13054	      9665	    2808.4	    8305.2	      9913	     10669	     10769	    4287.2	    3436.7	    5405.7	    9739.8	     10199	      4422	     11950	     691.8	    803.81			0.51735
52 | #generating data...ok.
53 | 1166.1	     13579	    1803.7	      8743	     13498	    8901.2	    4533.6	    4618.8	    5994.6	    1751.3	    1719.1	    5970.2	     10514	    352.43	    770.06	    771.31	    354.67	    1809.3	    3154.4	    3136.9	     13787	     13812	     10960	    2829.6	    9703.7	     11641	     12255	     12728	    4362.1	    3507.9	    5631.6	     10453	     11838	    4454.7	     13536	    693.44	    817.81			0.33389
54 | #generating data...ok.
55 | 1607.3	     15213	    1808.2	    9270.5	     15973	    9472.7	    4554.8	    4631.8	      6128	    1758.7	    1726.3	      6106	     16220	    352.68	    770.93	    769.39	     354.8	      1811	    3176.5	    3188.6	     24615	     27000	     18395	    2930.4	     15346	     19244	     20939	     21750	    4410.9	    3546.7	    6004.7	     14067	     16423	    4515.9	     15172	    699.23	    800.06			0.40077
56 | #generating data...ok.
57 | 2215.5	     15332	    1811.6	    9246.1	     15942	    9485.7	    4556.4	    4637.7	    6117.6	    1758.2	    1728.6	    6089.8	     14301	    352.75	    771.33	    772.48	    354.82	    1818.3	    3185.3	      3200	     20018	     22787	     17799	    2902.2	     13317	     17900	     19366	     19437	    4444.8	      3572	    5962.9	     13638	     16390	    4480.8	     14664	    691.75	    797.42			0.35288
58 | #generating data...ok.
59 | 3053.9	     13767	    1817.5	    8763.8	     14427	    9012.3	    4590.3	    4673.2	    6230.4	    1766.1	    1725.5	    5951.7	     22526	    354.55	     774.9	    777.31	    356.92	    1819.6	    3245.7	    3232.5	     25328	     30556	     31395	      3075	     21077	     32751	     36034	     37855	    4499.7	    3606.3	    6267.7	     17385	     21975	    4509.7	     13583	    691.68	    776.76			0.34669
60 | #generating data...ok.
61 | 4209.5	     17126	    1820.5	    9359.4	     17906	    9981.7	    4542.8	    4654.8	    6229.4	    1715.1	    1726.6	    6196.9	     26632	    352.74	       771	    772.96	    355.06	    1827.6	    3230.4	    3241.7	     37690	     47873	     36551	    2995.7	     25273	     38310	     39965	     41909	    4471.2	    3617.3	    6289.8	     20358	     22652	    4492.2	     17348	    692.35	    788.92			0.34237
62 | #generating data...ok.
63 | 5802.4	     17006	    1834.5	    9736.6	     18997	     10123	    4615.2	    4691.6	    6278.7	    1742.5	    1732.7	      6232	     21793	    354.89	    775.81	    778.02	    357.06	      1840	    3260.8	    3258.5	     31203	     30042	     32797	    3000.6	     21554	     25964	     27400	     30897	    4586.6	    3639.3	    6383.7	     18849	     24366	    4530.5	     17634	    705.35	    803.19			0.35961
64 | #generating data...ok.
65 | 7998.1	     18041	    1820.8	    9929.3	     20853	     10435	    4573.9	    4667.2	    6266.6	    1748.7	    1732.3	    6245.4	     28987	    352.81	    772.41	    772.12	    355.25	    1829.2	    3253.9	    3248.4	     48104	     44877	     46955	    2961.8	     29531	     40720	     52176	     59595	    4508.6	    3627.5	    6437.9	     20360	     26384	    4510.5	     18903	    691.69	    786.03			0.3645
66 | # bogus = 2817022208
67 | 


--------------------------------------------------------------------------------
/results/benchintersection6march2014.gnuplot:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | set style line 80 lt rgb "#000000"
 4 | 
 5 | # Line style for grid
 6 | #set style line 81 lt 0  # dashed
 7 | #set style line 81 lt rgb "#808080"  # grey
 8 | 
 9 | #set grid back linestyle 81
10 | set border 3 back linestyle 80 # Remove border on top and right.  These
11 |              # borders are useless and make it harder
12 |              # to see plotted lines near the border.
13 |     # Line style 80 (defined above) draws the remaining border in black ("#000000").
14 | set xtics nomirror
15 | set ytics nomirror
16 | 
17 | 
18 | 
19 | set style line 1 lt rgb "#A00000" lw 4 pt 1 ps 0.5
20 | set style line 2 lt rgb "#00A000" lw 4 pt 5 ps 0.5
21 | set style line 3 lt rgb "#5060D0" lw 4 pt 7 ps 0.5
22 | set style line 4 lt rgb "#FF1493" lw 4 pt 9 ps 0.5
23 | set style line 5 lt rgb "red" lw 4 pt 11 ps 0.5
24 | set style line 6 lt rgb "#808000" lw 4 pt 13 ps 0.5
25 | set style line 7 lt rgb "#00008B" lw 4 pt 15 ps 0.5
26 | set style line 8 lt rgb "#800080" lw 4 pt 21 ps 0.5
27 | set style line 9 lt rgb "black" lw 4 pt 63 ps 0.5
28 | set style line 10 lt rgb "blue" lw 4 pt 28 ps 0.5
29 | set style line 11 lt rgb "violet" lw 4 pt 44 ps 0.5
30 | set style line 81 lt 0  # dashed
31 | set style line 81 lt rgb "#808080"  # grey
32 | 
33 | set grid back linestyle 81
34 | #set xtics 2
35 | #set ytics 1
36 | set term pdfcairo 
37 | #fontscale 0.8
38 | 
39 | 
40 | # (dropped a stray 'set out "ratiobitpacking.pdf"': nothing was plotted before the output was switched below)
41 | 
42 | set xlabel "Ratio length (large list) / length(small list)"
43 | set ylabel "relative speed (scalar = 1)"
44 | #set ylabel "relative speed (galloping = 1)"
45 | 
46 | set key bmargin
47 | set key samplen 2 spacing .5 font ",8" maxrows 4
48 | #set xrange [1:8192]
49 | set xrange [1:10000]
50 | set logscale x 2
51 | 
52 | set logscale y 2
53 | 
54 | # Columns of benchintersection6march2014.txt used below: $1 = size-ratio, $3 = branchless (the divisor),
55 | # $36 = v3, $32 = thomas_v3, $33 = thomas_v3_aligned, $34 = thomas_v3cmpeqflagged.
56 | set out "benchintersection6march2014_v3.pdf"
57 | 
58 | plot "benchintersection6march2014.txt" using 1:($36/$3) ti "SIMD v3" with linespoints lw 2 ps 0.5,\
59 | "" using 1:($32/$3) ti "Wu SIMD v3" with linespoints lw 2 ps 0.5,\
60 | "" using 1:($33/$3) ti "Wu SIMD v3 aligned" with linespoints lw 2 ps 0.5,\
61 | "" using 1:($34/$3) ti "Wu SIMD v3 cmpeq flagged" with linespoints lw 2 ps 0.5
62 | 


--------------------------------------------------------------------------------
/results/benchintersection6march2014.txt:
--------------------------------------------------------------------------------
 1 | # howmany : 5
 2 | # loop : 3
 3 | # distribution : clustered
 4 | # Big : 22
 5 | # intersectionratio : 0.3
 6 | # MaxBit : 26
 7 | # size-ratio	@hybriddan	branchless	danfar	danfarfar	danfarmov	f2p0	f4p0	f8p0	hssimd	hssimddan	natemediumdanalt	scalar1sgalloping	scalarbranchless	scalarbranchlesscached	scalarbranchlesscached2	scalarbranchlessunrolled	scalardanbranchless	scalarnate	scalarnatewg	simdgalloping	simdgalloping2	thomas_gallop	thomas_scalar	thomas_simdgallop_v0	thomas_simdgallop_v1	thomas_simdgallop_v2	thomas_simdgallop_v3	thomas_v1	thomas_v1_plow	thomas_v2	thomas_v3	thomas_v3_aligned	thomas_v3cmpeqflagged	v1	v3	widevector	widevectorleo	relative-intersection-size 
 8 | #generating data...ok.
 9 | 1.001	    984.39	    388.36	    420.99	    403.03	    418.71	    985.11	    1001.8	     773.9	    1019.3	    1093.1	    473.79	    347.27	     259.6	    390.82	    400.61	    257.98	     388.1	    369.84	    373.96	     396.1	    429.05	    265.14	    315.94	    188.92	     188.7	    149.29	    146.33	    224.35	    227.07	    174.35	    171.32	    228.61	    287.44	    983.99	     403.3	     950.9	    1085.7			0.3362
10 | #generating data...ok.
11 | 1.3798	      1043	     406.2	    508.19	    485.58	    506.38	      1043	    1101.9	    885.06	      1053	    1099.2	    562.21	    362.62	    259.98	    402.83	    408.62	    258.55	    406.14	    397.27	    401.05	    478.57	    517.06	    286.65	    341.03	    214.89	    218.12	    174.86	     171.5	    257.36	    257.34	    202.45	    199.48	       251	    343.47	    1042.8	    484.84	     944.7	    1092.4			0.33065
12 | #generating data...ok.
13 | 1.9019	    1059.5	    415.22	    557.28	    534.93	       554	    1060.4	    1207.5	    1031.6	    1068.5	    1077.2	    597.91	    346.78	    258.59	    406.79	    415.73	    256.33	    415.27	    402.41	    406.27	    525.38	    563.33	    282.41	    340.76	    234.08	    244.03	    200.42	    198.09	    279.13	    273.27	    226.87	    224.93	    255.99	    390.15	    1059.6	    534.71	    930.69	    1083.4			0.34479
14 | #generating data...ok.
15 | 2.6216	    1300.8	     461.9	    670.41	    646.39	    666.92	    1296.1	    1407.7	      1190	    1212.6	    1233.1	    727.41	    412.31	    262.25	    428.14	    432.23	    262.69	    461.73	    474.44	       473	    635.86	    675.99	    354.55	    403.34	    271.78	     279.3	    234.85	    232.53	    325.04	    320.44	    268.64	    266.65	     313.3	    478.36	    1299.7	    646.82	    953.29	    1132.4			0.32439
16 | #generating data...ok.
17 | 3.6136	    1235.4	    547.05	    933.09	       889	    930.74	    1235.7	    1478.7	      1412	    1216.4	    1206.5	    954.32	    449.71	    281.27	    478.29	    485.23	    279.71	    547.61	    567.11	     571.6	    868.25	     924.3	    382.38	     471.7	    335.81	     368.2	    320.45	    317.82	    411.11	    389.95	    359.43	    357.64	     366.9	    643.41	    1236.1	    890.27	    934.08	    1112.8			0.34676
18 | #generating data...ok.
19 | 4.981	    1279.2	    579.17	    969.57	    926.76	     961.3	    1281.6	    1619.1	    1652.4	    1224.2	    1196.8	    970.13	     449.8	    284.01	    494.51	    502.85	    282.19	       579	    582.89	    587.01	    910.08	    946.21	    383.01	    487.81	    368.83	    411.15	    373.33	    371.84	    447.07	    419.19	    407.53	    409.45	    384.27	    718.32	    1283.3	    927.64	    926.11	    1112.8			0.34824
20 | #generating data...ok.
21 | 6.8659	    1654.8	    721.14	    1276.2	    1229.2	    1271.1	    1666.7	    1989.4	    1947.7	      1343	    1321.4	    1271.5	    614.83	     302.2	    554.48	    560.81	    301.05	    720.86	    782.42	    786.07	    1206.8	    1247.1	    518.55	    654.45	    491.83	    545.12	    489.55	    492.05	    618.44	    571.37	    543.47	    546.67	    531.57	    944.53	    1668.7	      1230	     938.9	    1140.9			0.34008
22 | #generating data...ok.
23 | 9.4639	    1854.6	    884.78	      1494	    1450.8	    1485.8	    1851.9	    2047.9	    2176.3	    1450.5	    1432.3	    1465.5	    756.61	    317.62	    607.85	    614.19	    317.44	    884.64	    1007.2	    1014.9	    1425.2	    1429.8	    634.55	    818.41	    595.42	    652.11	     647.8	    645.04	     754.2	    711.31	     717.4	    728.35	    706.68	      1163	    1859.3	    1450.7	     949.1	    1169.9			0.32946
24 | #generating data...ok.
25 | 13.045	    1933.9	    955.33	    1971.2	    1881.7	    1971.7	    1935.6	    2277.6	    2552.8	    1484.9	    1460.5	    1832.8	    835.73	    322.79	    627.61	    634.56	    322.67	    955.52	    1135.7	    1141.3	    1838.8	    1848.6	    727.44	    937.61	    706.72	    806.27	    842.94	    840.29	    905.22	    826.88	    921.74	    930.35	    821.68	    1526.3	    1938.6	    1879.9	    942.64	    1175.4			0.36438
26 | #generating data...ok.
27 | 17.981	    2804.4	    1249.2	    2513.4	    2462.8	    2506.7	      2826	    2882.4	    2921.3	    1602.4	    1592.4	    2415.9	    1389.2	    339.68	    699.44	    701.79	    340.84	      1248	    1680.9	    1715.2	    2392.8	    2310.7	      1138	    1384.5	    1042.7	      1116	    1103.3	    1107.9	    1403.2	    1276.6	    1263.7	    1280.6	      1341	    1958.1	    2818.3	    2469.8	    938.56	    1208.6			0.31654
28 | #generating data...ok.
29 | 24.786	    2680.4	    1177.6	      2919	    2910.4	      2915	    2713.8	    3010.4	    3304.5	    1586.2	    1556.7	    2607.1	    1304.2	    333.45	    678.85	    683.76	     334.2	    1178.5	    1563.3	    1570.5	    2814.3	    2661.9	    1126.6	      1331	    1113.5	    1251.1	    1285.8	    1304.7	    1449.6	    1272.1	      1406	    1438.1	    1309.5	    2374.6	    2713.8	      2903	    918.06	    1196.1			0.32648
30 | #generating data...ok.
31 | 34.165	    2761.5	      1280	    3156.7	    3285.5	    3144.4	    2775.1	    3017.1	      3607	      1624	    1593.8	      2743	      1479	    338.27	    697.85	    702.85	    339.18	    1280.1	    1710.7	    1710.5	    3226.9	    3021.3	    1299.4	    1454.8	    1318.4	    1487.5	    1690.7	    1709.7	    1552.2	    1406.7	    1704.3	    1856.6	    1584.7	    2831.9	    2774.4	    3270.6	    856.29	    1173.5			0.33213
32 | #generating data...ok.
33 | 47.093	    3360.1	    1403.7	    3697.2	    3687.6	    3698.3	      3401	    3577.1	    3893.8	    1660.2	    1633.9	    3310.9	    1835.5	    342.98	    722.55	    725.94	    344.66	    1399.7	      2072	    2066.3	    3522.6	    3296.4	    1586.2	    1796.3	    1585.8	    1746.1	    1852.9	    1895.4	    2185.1	    1834.5	    2100.2	    2103.7	    1965.8	    3198.7	    3405.1	    3693.2	    863.84	    1199.8			0.34141
34 | #generating data...ok.
35 | 64.913	    4147.9	    1473.7	    4146.5	    4177.7	    4157.3	    3331.3	      3600	    4189.8	    1679.2	      1650	    3579.1	    1980.4	     345.1	    730.24	    733.62	    346.25	    1473.9	    2179.7	    2182.4	    3939.8	    3606.1	    1821.5	    1930.6	    1756.3	    1984.1	    2227.9	    2197.9	    2300.4	    1919.6	    2440.7	    2455.8	    2176.6	      3643	    3341.4	    4155.4	    879.34	      1217			0.33945
36 | #generating data...ok.
37 | 89.477	    4181.9	    1512.2	      4198	    4230.1	    4186.6	      3792	    3957.6	      4271	    1690.3	    1668.1	    3702.1	    2119.5	    346.99	    740.39	    741.39	     348.4	      1513	    2343.4	    2337.8	    4063.6	    3771.1	    1894.7	    2039.4	    1946.9	    2162.4	      2408	    2536.9	    2788.1	    2208.1	    2718.1	    2776.9	    2514.8	    3930.7	      3792	    4233.4	    867.56	    1213.3			0.33949
38 | #generating data...ok.
39 | 123.33	    5932.2	    1611.8	    5311.1	    5867.4	    5304.5	    3924.9	    4052.6	    4758.7	    1718.8	    1687.4	    4379.2	    2942.1	    348.99	    750.35	    753.03	    350.62	    1613.1	    2559.9	    2577.7	    5586.7	    4908.6	    2660.2	    2369.1	    2702.6	    2962.9	    3203.7	    3305.6	      3158	    2558.1	    3315.3	    3618.7	    3337.4	    5156.9	    3951.3	    5872.3	     813.5	    1212.3			0.34108
40 | #generating data...ok.
41 | 170.01	    4865.7	    1623.1	    4939.4	    4904.9	    4985.7	    4146.4	    4212.7	      4645	      1715	    1689.1	    4372.8	    2537.6	    349.88	    754.88	    755.35	    351.42	    1624.2	    2611.1	    2628.8	    4555.3	    4179.8	    2479.5	    2413.4	    2323.8	    2497.2	    2766.4	    2797.7	    3501.5	    2679.8	    3348.2	    3159.7	    3004.2	    4453.4	    4154.6	    4851.9	    815.31	      1218			0.34176
42 | #generating data...ok.
43 | 234.34	    7252.9	    1696.2	      6182	    7272.1	    6109.1	    4183.3	    4273.5	    5194.9	      1717	    1698.3	    4960.7	      3889	    350.78	    759.51	    761.73	    352.48	    1696.1	    2799.4	    2806.4	    6720.6	    5493.4	    3618.8	    2648.9	    3639.6	      3922	    4232.3	      4310	    3685.3	    2921.5	    4026.5	    4589.9	    4358.4	      6061	    4182.4	    7270.4	    733.93	    1156.5			0.34382
44 | #generating data...ok.
45 | 323.01	    8383.6	    1739.9	    6809.5	    8384.7	    6807.3	    4323.3	    4378.2	    5334.6	    1744.5	    1716.3	    5240.3	    4568.4	    351.71	    764.99	    766.08	    353.56	    1740.7	    2887.2	    2933.7	    7470.5	    6117.6	    4263.4	    2810.3	    4292.2	    4486.2	    4741.8	    4796.5	    3984.3	    3118.4	    4364.6	    5251.2	    5026.9	    6686.1	    4323.3	      8416	    745.73	    1158.4			0.33731
46 | #generating data...ok.
47 | 445.24	    8177.8	    1744.6	    6872.8	    8184.2	    6876.5	    4363.4	    4437.4	    5422.3	    1742.4	    1712.5	    5408.4	    4416.6	    351.58	    765.97	    767.14	    353.64	    1745.8	    2942.6	    2928.1	    7457.2	    6234.6	      4364	    2828.7	    4128.3	    4358.6	    4653.2	    4595.2	    4130.7	    3202.7	    4471.4	    5013.2	    4911.3	    6701.8	      4367	    8192.7	    724.91	    1089.5			0.34735
48 | #generating data...ok.
49 | 613.72	     10459	    1772.5	    7683.6	     10440	    7668.6	      4404	    4461.1	    5686.1	    1750.1	    1718.4	    5611.6	    6572.8	    352.17	    767.71	    767.88	    354.02	    1769.3	    2992.9	    2958.1	    9841.2	    8165.5	    6200.8	    2957.2	      6113	    6481.5	    6955.1	      7039	    4222.3	      3290	      4948	    7177.7	    7032.7	    8368.3	    4408.4	     10441	    715.05	    900.33			0.41703
50 | #generating data...ok.
51 | 845.95	     12143	    1791.5	    8207.8	     11979	    8172.7	    4415.2	    4484.6	    5778.1	    1757.3	    1728.8	      5777	    8508.3	    352.39	    767.55	    770.32	    354.37	    1792.1	    3088.6	      3034	     11764	     10179	    8015.6	    3028.1	    8045.3	    8443.3	    8995.5	    9246.4	    4292.4	    3388.4	    5241.5	    9363.3	    9060.2	    9858.6	    4434.1	     12169	    681.35	    825.82			0.34106
52 | #generating data...ok.
53 | 1166.1	     14373	    1809.9	    8711.6	     14402	    8710.4	      4444	    4499.6	    5957.2	    1762.5	    1717.1	    5907.5	     14043	    352.47	    770.28	    770.87	    354.41	    1810.1	    3126.8	    3088.9	     18757	     18824	     13248	    3082.3	     13143	     13818	     14851	     15166	    4322.6	      3451	    5670.2	     13851	     13392	     14278	    4457.9	     14452	    698.79	    865.26			0.4248
54 | #generating data...ok.
55 | 1607.3	     14297	      1801	    8791.5	     14412	    8787.8	    4475.4	    4530.8	      5971	    1758.1	    1723.7	    5976.1	     13323	    352.59	    770.77	    771.83	    354.58	    1810.6	    3117.8	    3151.1	     16728	     16253	     12779	    3130.8	     12656	     13175	     13893	     14111	    4416.4	    3488.9	    5810.1	     13608	     13329	     13617	    4473.8	     14385	    683.16	    812.94			0.341
56 | #generating data...ok.
57 | 2215.5	     15746	    1825.4	    9205.4	     16006	    9139.9	      4485	      4540	    5924.1	    1776.6	    1731.3	    6044.1	     17845	    352.85	    771.77	    771.86	    354.88	    1825.3	    3164.8	    3105.7	     26680	     26246	     18844	    3152.9	     17182	     19842	     20965	     21621	    4445.1	    3527.3	    6143.2	     15362	     17047	     18117	    4485.3	     16006	    700.54	    833.25			0.4897
58 | #generating data...ok.
59 | 3053.9	     16759	    1823.4	    9327.7	     16935	    9309.7	    4510.4	    4555.8	    6068.8	    1780.8	    1737.6	    6082.9	     22591	    353.42	    773.04	    773.84	     355.6	    1828.1	    3195.9	    3205.7	     31332	     29331	     21695	    3184.8	     21929	     22746	     23949	     25523	    4482.4	    3547.1	      6204	     19563	     19117	     19472	    4508.4	     16899	    681.95	    797.12			0.35324
60 | #generating data...ok.
61 | 4209.5	     17705	    1825.3	    9443.7	     17866	    9586.2	      4509	      4558	    6103.1	    1778.9	      1738	      6134	     33478	    352.61	    771.46	    772.93	    355.04	    1832.5	    3184.1	    3187.5	     51519	     47691	     34449	    3173.2	     32242	     38124	     38806	     40118	    4483.6	    3566.2	    6286.1	     23551	     23411	     23827	    4502.5	     17795	    680.37	     785.9			0.34538
62 | #generating data...ok.
63 | 5802.4	     18248	    1820.6	    9474.3	     18446	    9485.7	    4517.2	    4570.3	    6083.3	    1777.2	    1735.5	    6108.1	     42612	    352.87	    772.18	    773.13	    354.86	      1834	    3203.9	    3222.1	     73698	     71929	     49324	    3200.8	     41736	     58820	     54872	     59041	    4502.3	    3587.4	    6315.9	     25327	     25185	     24985	    4496.8	     18349	    680.42	    787.99			0.34716
64 | #generating data...ok.
65 | 7998.1	     18830	    1821.5	    9653.4	     18960	      9695	      4517	    4572.9	    6126.1	    1780.5	    1744.1	      6198	     49081	    353.47	    773.34	    774.26	    355.73	    1837.6	    3206.3	    3231.8	     85657	     83165	     60571	    3219.4	     50338	     74405	     63895	     70327	    4514.4	      3561	    6290.4	     27455	     27527	     26675	    4519.6	     18875	     684.1	     803.7			0.40267
66 | # bogus = 2851167232
67 | 


--------------------------------------------------------------------------------
/results/benchintersection6march2014_2.txt:
--------------------------------------------------------------------------------
 1 | # howmany : 5
 2 | # loop : 3
 3 | # distribution : clustered
 4 | # Big : 22
 5 | # intersectionratio : 0.3
 6 | # MaxBit : 26
 7 | # size-ratio	v3	thomas_v3	thomas_v3_aligned	thomas_v3cmpeqbinaryflagged	thomas_v3cmpeqflagged	thomas_v3cmpeqscalarflagged	thomas_v3cmpeqsimd32flagged	thomas_v3cmpeqsimd8flagged	relative-intersection-size 
 8 | #generating data...ok.
 9 | 1.001	    379.47	    170.78	    238.13	    237.37	    287.58	    233.22	    199.56	    212.07			0.35448
10 | #generating data...ok.
11 | 1.3798	    470.48	    184.59	    220.84	    275.11	     353.6	     270.9	    249.95	    261.64			0.333
12 | #generating data...ok.
13 | 1.9019	    641.32	    239.66	    294.75	    365.62	    458.63	    360.54	    318.18	    332.03			0.34687
14 | #generating data...ok.
15 | 2.6216	     732.8	    295.66	    334.87	    411.28	    540.23	    403.97	    381.36	    393.35			0.3497
16 | #generating data...ok.
17 | 3.6136	    925.98	     350.7	     365.9	    505.32	    684.04	    503.27	    481.33	    488.09			0.37275
18 | #generating data...ok.
19 | 4.981	    1201.7	     491.1	    554.38	    650.79	     881.4	    644.83	    633.15	     637.6			0.32547
20 | #generating data...ok.
21 | 6.8659	    1282.5	    560.34	    555.86	    789.65	    1008.8	    780.07	    751.17	    764.32			0.33926
22 | #generating data...ok.
23 | 9.4639	    1535.4	    705.72	    681.98	    934.63	    1235.8	    919.32	    932.06	    947.28			0.35154
24 | #generating data...ok.
25 | 13.045	    2082.7	    1034.5	    959.12	    1252.2	    1685.6	    1248.8	    1283.7	    1285.5			0.33072
26 | #generating data...ok.
27 | 17.981	    2456.2	    1217.9	    1074.7	    1505.4	    2035.2	    1494.3	      1580	    1545.6			0.32922
28 | #generating data...ok.
29 | 24.786	    2673.5	    1443.5	    1277.7	    1706.8	    2309.4	    1715.6	    1824.6	    1754.4			0.34234
30 | #generating data...ok.
31 | 34.165	    2714.2	    1714.6	      1470	    1974.4	    2567.1	    1997.2	      2112	    2041.6			0.35479
32 | #generating data...ok.
33 | 47.093	      3785	    2178.7	    1960.3	    2619.7	    3355.1	    2583.9	    2703.1	    2619.1			0.34748
34 | #generating data...ok.
35 | 64.913	    4205.3	      2478	    2272.6	    3040.8	    3763.6	    3054.3	    3175.8	    3092.3			0.35194
36 | #generating data...ok.
37 | 89.477	    5225.5	    3006.4	    2909.3	    3666.7	    4583.1	    3614.8	    3707.4	    3589.3			0.31662
38 | #generating data...ok.
39 | 123.33	    5078.9	    3183.9	    2971.1	    3842.4	    4638.1	    3715.4	    3740.2	    3682.2			0.34722
40 | #generating data...ok.
41 | 170.01	    6270.7	    3981.1	    3689.9	    4719.3	    5576.3	    4627.4	    4713.7	    4613.3			0.38902
42 | #generating data...ok.
43 | 234.34	    7339.4	    4614.1	    4473.3	      5424	    6082.7	    5304.2	    5397.6	    5274.6			0.3213
44 | #generating data...ok.
45 | 323.01	    7765.9	    4907.9	    4704.8	    5869.2	    6362.1	    5841.5	    5906.6	      5789			0.34363
46 | #generating data...ok.
47 | 445.24	     10794	    7904.7	    7695.4	    8897.6	      9021	    8866.3	    9062.5	      8875			0.31975
48 | #generating data...ok.
49 | 613.72	    9834.9	    6657.4	    6523.7	    7533.8	    7921.1	    7500.6	    7585.5	    7448.3			0.3427
50 | #generating data...ok.
51 | 845.95	     13522	     12976	     12635	     14433	     13994	     14186	     14641	     14263			0.33038
52 | #generating data...ok.
53 | 1166.1	     14304	     14953	     14455	     16133	     16154	     16083	     16137	     16233			0.38838
54 | #generating data...ok.
55 | 1607.3	     14974	     16299	     16095	     17608	     17064	     17807	     17747	     17647			0.33755
56 | #generating data...ok.
57 | 2215.5	     16221	     19194	     18704	     19980	     18911	     19917	     20044	     19879			0.32911
58 | #generating data...ok.
59 | 3053.9	     16431	     18564	     18284	     19581	     18407	     19673	     19642	     19430			0.33358
60 | #generating data...ok.
61 | 4209.5	     17159	     22816	     22149	     23436	     22849	     23586	     23728	     23683			0.3253
62 | #generating data...ok.
63 | 5802.4	     17779	     24633	     24518	     24867	     24281	     24995	     25125	     25215			0.45781
64 | #generating data...ok.
65 | 7998.1	     16146	     27857	     27054	     27491	     26823	     27648	     27795	     27993			0.40267
66 | # bogus = 632999104
67 | 


--------------------------------------------------------------------------------
/scripts/disablehyperthreading.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Takes offline every logical CPU except the first one listed in each thread_siblings_list.
 3 | # Be careful to not skip the space at the beginning nor the end
 4 | CPUS_TO_SKIP=" $(cat /sys/devices/system/cpu/cpu*/topology/thread_siblings_list | sed 's/[^0-9].*//' | sort | uniq | tr "\r\n" "  ") "
 5 | 
 6 | # The surrounding spaces make " $CPU " match whole numbers only (e.g. 1 does not match 11).
 7 | for CPU_PATH in /sys/devices/system/cpu/cpu[0-9]*; do
 8 |     CPU="$(echo "$CPU_PATH" | tr -cd "0-9")"
 9 |     # grep -q replaces the redirect to /dev/null; the pipeline status is tested directly.
10 |     if ! echo "$CPUS_TO_SKIP" | grep -q " $CPU "; then
11 |         echo 0 > "$CPU_PATH/online"
12 |     fi
13 | done
14 | # Show the first two matching lines; grep -E replaces the obsolescent egrep.
15 | grep -E 'siblings|cpu cores' /proc/cpuinfo | head -2
16 | 


--------------------------------------------------------------------------------
/scripts/powerpolicy.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # taken from http://hbfs.wordpress.com/2013/06/18/fast-path-finding-part-ii/
 3 | # might require sudo apt-get install cpufrequtils
 4 | # invoke with performance or ondemand (or list, which only runs cpufreq-info)
 5 | # type cpufreq-info to check results, you can also verify with cat /proc/cpuinfo
 6 | # enumerate found CPUs
 7 | cpus=$( grep processor /proc/cpuinfo | cut -d: -f 2 )
 8 | 
 9 | # map the first argument to a governor name; anything else prints the usage and fails
10 | if [ "$1" = "ondemand" ]; then
11 |   echo "setting up ondemand"
12 |   policy="ondemand"
13 | elif [ "$1" = "performance" ]; then
14 |   echo "setting up for performance"
15 |   policy="performance"
16 | elif [ "$1" = "list" ]; then
17 |   cpufreq-info
18 |   exit 0
19 | else
20 |   echo "usage: powerpolicy.sh ondemand | performance | list"
21 |   exit 1
22 | fi
23 | 
24 | echo "chosen policy " "$policy"
25 | # set governor for each CPU
26 | # ($cpus is a whitespace-separated list, so it is left unquoted on purpose)
27 | for cpu in $cpus
28 | do
29 |   cpufreq-set -c "$cpu" -g "$policy"
30 | done
31 | 


--------------------------------------------------------------------------------
/scripts/turboboost.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | # stolen from https://github.com/DropD/fnc-simplex/blob/master/linux_turboboost.sh
 3 | 
 4 | # you might need to run sudo apt-get install msr-tools
 5 | # Toggle Turbo Boost for Ivy Bridge CPUs (should work for all newer Core)
 6 | # Requires a fairly new Linux kernel (let's say 3.0+)
 7 | # Written by Donjan Rodic, released for free use
 8 | 
 9 | # check current real frequency with  sudo turbostat -s -i1
10 | # rdmsr/wrmsr are used by the functions below; presumably they need the msr kernel module loaded
11 | sudo modprobe msr
12 | 
13 | # all_cores FOO
14 | # call FOO once per index 0..N-1, where N is the number of "core id" lines in /proc/cpuinfo
15 | all_cores() {
16 |   # index of the last entry; -1 when nothing matches, which makes seq print nothing
17 |   last=$(( $(grep -c "core id" /proc/cpuinfo) - 1 ))
18 |   for core in $(seq 0 "$last"); do
19 |     "$1" "$core"
20 |   done
21 | }
22 | 
23 | 
24 | # report Turbo Boost state on core $1 (bit 38 of MSR 0x1a0: 0 = on, 1 = off); "unknown" if rdmsr fails
25 | read_tb() {
26 |   ret=`sudo rdmsr -p"$1" 0x1a0 -f 38:38` || { echo "$1": unknown >&2; return 1; }
27 |   [ "$ret" -eq 0 ] && echo "$1": on || echo "$1": off
28 | }
29 | 
30 | # enable Turbo Boost on core $1: writes the fixed value 0x850089 (bit 38 clear) to MSR 0x1a0; NOTE(review): overwrites the whole register
31 | enable_tb() {
32 |   sudo wrmsr -p"$1" 0x1a0 0x850089
33 | }
34 | 
35 | # disable Turbo Boost on core $1: writes 0x4000850089 (0x850089 with bit 38 set) to MSR 0x1a0; NOTE(review): overwrites the whole register
36 | disable_tb() {
37 |   sudo wrmsr -p"$1" 0x1a0 0x4000850089
38 | }
39 | 
40 | 
41 | if [ "$1" = "on" ]; then
42 |   all_cores enable_tb
43 |   all_cores read_tb
44 | elif [ "$1" = "off" ]; then
45 |   all_cores disable_tb
46 |   all_cores read_tb
47 | elif [ "$1" = "list" ]; then
48 |   all_cores read_tb
49 | else
50 |   echo "usage: turboboost.sh on | off | list"
51 | fi
52 | 


--------------------------------------------------------------------------------
/src/benchintersection.cpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * This code is released under the
  3 |  * Apache License Version 2.0 http://www.apache.org/licenses/.
  4 |  *
  5 |  */
  6 | 
  7 | 
  8 | #include <unistd.h>
  9 | #include "synthetic.h"
 10 | #include "timer.h"
 11 | #include "intersection.h"
 12 | #include "intersectionfactory.h"
 13 | 
 14 | // https://code.google.com/p/likwid/wiki/LikwidPerfCtr#Using_the_marker_API
 15 | #ifdef LIKWID_MARKERS  // see 'make likwidintersection' for compiler flags
 16 | #include <likwid.h>
 17 | #endif
 18 | 
 19 | /**
 20 |  * Generates two arrays that share a common subset and returns them as a
 21 |  * pair whose first array is never longer than the second.
 22 |  *
 23 |  * Goal: have the largest array count about 4M terms (this matches our
 24 |  * experiments), and let the size of the smallest array vary from 1*4M
 25 |  * down to 1/1000*4M (or so), with the intersection set to 30% of the
 26 |  * lesser array (again, this matches our real data). To match our
 27 |  * clueweb, we use a range of values in [0,2**26).
 28 |  *
 29 |  * Throws runtime_error when a ratio is out of range or the larger array cannot fit in Max.
 30 |  */
 31 | template <class generator>
 32 | pair<vector<uint32_t>, vector<uint32_t>> getNaivePair(generator gen, uint32_t minlength, uint32_t Max, float sizeratio,
 33 | float intersectionratio) {
 34 |     if (sizeratio < 1) throw runtime_error("sizeratio should be larger or equal to 1");
 35 |     if (intersectionratio < 0) throw runtime_error("intersectionratio should be positive");
 36 |     if (intersectionratio > 1) throw runtime_error("intersectionratio cannot be larger than 1");
 37 |     const uint32_t maxlength = static_cast<uint32_t>(round(static_cast<float>(minlength) * sizeratio));
 38 |     if (maxlength > Max)  throw runtime_error("I can't generate an array so large in such a small range.");
 39 |     if (maxlength < minlength) throw runtime_error("something went wrong, possibly an overflow.");
 40 |     // we basically assume that, if we do nothing, intersections are very small
 41 |     const uint32_t intersize = static_cast<uint32_t>(round(static_cast<float>(minlength) * intersectionratio));
 42 |     vector<uint32_t> inter = gen.generate(intersize, Max);
 43 |     // Build straight into the returned pair so the (potentially large) arrays are never copied.
 44 |     pair<vector<uint32_t>, vector<uint32_t>> arrays;
 45 |     arrays.first = unite(gen.generate(static_cast<uint32_t>(minlength - inter.size()), Max), inter);
 46 |     arrays.second = unite(gen.generate(static_cast<uint32_t>(maxlength - inter.size()), Max), inter);
 47 |     if (arrays.second.size() <= arrays.first.size()) arrays.first.swap(arrays.second); // keep the shorter array first
 48 |     return arrays;
 49 | }
 50 | 
 51 | 
 52 | /** Prints the command-line help; a LIKWID_MARKERS build prints only an example invocation. */
 53 | void printusage() {
 54 | #ifndef LIKWID_MARKERS
 55 |     cout << " Runs an exhaustive benchmark over a ClusterData distribution." << endl;
 56 |     cout << " -s followed by comma-separated values specifies intersections functions, chosen from:" << endl;
 57 |     for (const string &schemename : allRealNames())
 58 |         cout << schemename << endl;
 59 |     cout << " -u switches to uniform distribution" << endl;
 60 | #else
 61 |     cout << "example: likwid -m -C 1 -g BRANCH ./likwidintersection -u > uniform.out" << endl;
 62 | #endif
 63 | }
 64 | 
 65 | int main(int argc, char **argv) {
 66 |     size_t howmany = 0;
 67 |     size_t loop = 3;
 68 |     bool uniform = false;
 69 |     uint32_t Big = 22;
 70 |     float intersectionratio = 0.3f;
 71 |     uint32_t MaxBit = 26;
 72 |     int c;
 73 |     std::vector<std::string> myschemes = allRealNames();
 74 | 
 75 |     while ((c = getopt(argc, argv, "uns:m:R:M:S:l:hs:")) != -1)
 76 |         switch (c) {
 77 |         case 'h':
 78 |             printusage();
 79 |             return 0;
 80 |         case 'S':
 81 |             Big = atoi(optarg);
 82 |             break;
 83 |         case 'R':
 84 |             intersectionratio = atof(optarg);
 85 |             break;
 86 |         case 's':
 87 |             myschemes.clear();
 88 |             {
 89 |                 const string codecsstr(optarg);
 90 |                 const vector<string> codecslst = split(codecsstr, ",:;");
 91 |                 for (auto i = codecslst.begin(); i != codecslst.end(); ++i) {
 92 |                     if (realschemes.find(*i) == realschemes.end()) {
 93 |                         cerr << " Warning!!! Warning: unrecognized: " << *i
 94 |                                 << endl;
 95 |                         printusage();
 96 |                         return -1;
 97 | 
 98 |                     } else {
 99 |                         myschemes.push_back(*i);
100 |                     }
101 |                 }
102 |             }
103 |             break;
104 |         case 'M':
105 |             MaxBit = atoi(optarg);
106 |             if (MaxBit < 1) {
107 |                 printusage();
108 |                 return -1;
109 |             }
110 |             break;
111 |         case 'm':
112 |             howmany = atoi(optarg);
113 |             if (howmany < 1) {
114 |                 printusage();
115 |                 return -1;
116 |             }
117 |             break;
118 |         case 'l':
119 |             loop = atoi(optarg);
120 |             if (loop < 1) {
121 |                 printusage();
122 |                 return -1;
123 |             }
124 |             break;
125 |         case 'u':
126 |             uniform = true;
127 |             break;
128 |         default:
129 |             printusage();
130 |             abort();
131 |         }
132 |     if (howmany == 0) {
133 |         howmany = 5;
134 |     }
135 |     cout << "# howmany : " << howmany << endl;
136 |     cout << "# loop : " << loop << endl;
137 |     cout << "# distribution : " << (uniform ? "uniform" : "clustered") << endl;
138 |     cout << "# Big : " << Big << endl;
139 |     cout << "# intersectionratio : " << intersectionratio << endl;
140 |     cout << "# MaxBit : " << MaxBit << endl;
141 |     UniformDataGenerator udg;
142 |     ClusteredDataGenerator cdg;
143 |     WallClockTimer z;
144 |     size_t bogus = 0;
145 |     vector <uint32_t> buffer(2 * (1U << Big));
146 | #ifdef LIKWID_MARKERS
147 |     char currentMarker[64];
148 |     likwid_markerInit();
149 | #endif
150 | 
151 |     cout << "# size-ratio\t";
152 |     for (string intername : myschemes) {
153 |         cout << intername << "\t";
154 |     }
155 |     cout << "relative-intersection-size " << endl;
156 | 
157 |     for (float ir = 1.001; ir <= 10000; ir = ir * sqrt(1.9)) {
158 |         vector <pair<vector<uint32_t> , vector<uint32_t>>> data(howmany);
159 |         uint32_t smallsize = static_cast<uint32_t>(round(static_cast<float>(1 << Big) / ir));
160 |         cout << "#generating data...";
161 |         cout.flush();
162 |         for (size_t k = 0; k < howmany; ++k) {
163 |             data[k] = uniform ? getNaivePair(udg , smallsize, 1U << MaxBit, ir, intersectionratio)
164 |                       : getNaivePair(cdg , smallsize, 1U << MaxBit, ir, intersectionratio);
165 |         }
166 |         cout << "ok." << endl;
167 |         cout << ir << "\t";
168 |         float aratio = 0.0f;
169 |         for (string intername : myschemes) {
170 |             intersectionfunction interfnc = realschemes[intername];
171 |             size_t volume = 0;
172 | #ifdef LIKWID_MARKERS
173 |             snprintf(currentMarker, sizeof(currentMarker), "%s %.2f", intername.c_str(), ir);
174 |             likwid_markerStartRegion(currentMarker);
175 | #endif
176 |             z.reset();
177 |             for (size_t k = 0; k < data.size(); ++k) {
178 |                 volume += (data[k].first.size() + data[k].second.size()) * loop;
179 |                 for (size_t L = 0; L < loop; ++L) {
180 |                     aratio = interfnc(data[k].first.data(),
181 |                                       (data[k].first).size(), data[k].second.data(),
182 |                                       (data[k].second).size(), buffer.data());
183 |                     bogus += aratio;
184 |                 }
185 |             }
186 |             cout << setw(10) << setprecision(5) << (volume / (static_cast<double>(z.split()))) << "\t";
187 | #ifdef LIKWID_MARKERS
188 |             likwid_markerStopRegion(currentMarker);
189 | #endif
190 |         }
191 |         cout << "\t\t" << aratio / smallsize;
192 |         cout << endl;
193 | 
194 |     }
195 | #ifdef LIKWID_MARKERS
196 |     likwid_markerClose();
197 | #endif
198 | 
199 |     cout << "# bogus = " << bogus << endl;
200 | }
201 | 


--------------------------------------------------------------------------------
/src/getmatrix.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Algorithm 3:  1:1 2:1 3:1 5:1 10:1 20:1 40:1 80:1 200:1 500:1 1000:1
  3 |  100%             xx  xx   xx ...
  4 |  80%               xx  xx   xx ...
  5 |  60%
  6 |  20%
  7 |  10%
  8 |  5%
  9 |  1%
 10 |  */
 11 | 
 12 | #include "common.h"
 13 | #include "intersectionfactory.h"
 14 | #include "timer.h"
 15 | #include "synthetic.h"
 16 | #include "util.h"
 17 | 
 18 | void printusage() {
 19 |     cout << " Try ./getmatrix -s scalarnate" << endl;
 20 |     cout << " Use the -s flag to specify  some scheme, choose from: " << endl;
 21 |     for(string x : allRealNames()) cout <<" "<< x << endl;
 22 |     cout
 23 |             << " The -M flag allows you to specific the range in bits (default 31)."
 24 |             << endl;
 25 |     cout
 26 |             << " The -S flag allows you to specific the log. of the minimal array size (default 10)."
 27 |             << endl;
 28 | }
 29 | 
 30 | int main(int argc, char **argv) {
 31 |     size_t howmany = 0;
 32 |     size_t loop = 10;
 33 |     uint32_t S = 10;
 34 |     string name;
 35 |     intersectionfunction myscheme = NULL;
 36 |     uint32_t MaxBit = 31;
 37 |     int c;
 38 |     while ((c = getopt(argc, argv, "ns:m:M:S:l:r:h")) != -1)
 39 |         switch (c) {
 40 |         case 'h':
 41 |             printusage();
 42 |             return 0;
 43 |         case 'S':
 44 |             S = atoi(optarg);
 45 |             break;
 46 |         case 'M':
 47 |             MaxBit = atoi(optarg);
 48 |             if (MaxBit < 1) {
 49 |                 printusage();
 50 |                 return -1;
 51 |             }
 52 |             break;
 53 |         case 'm':
 54 |             howmany = atoi(optarg);
 55 |             if (howmany < 1) {
 56 |                 printusage();
 57 |                 return -1;
 58 |             }
 59 |             break;
 60 |         case 's':
 61 |             name = optarg;
 62 |             if (realschemes.find(name) == realschemes.end()) {
 63 |                 cerr << " Warning!!! Warning: unrecognized: " << name << endl;
 64 |                 printusage();
 65 |                 return -1;
 66 | 
 67 |             } else {
 68 | 
 69 |                 myscheme = realschemes.find(name)->second;
 70 |             }
 71 |             break;
 72 |         case 'l':
 73 |             loop = atoi(optarg);
 74 |             if (loop < 1) {
 75 |                 printusage();
 76 |                 return -1;
 77 |             }
 78 |             break;
 79 |         default:
 80 |             abort();
 81 |         }
 82 |     if (howmany == 0) {
 83 |         howmany = 5;
 84 |     }
 85 |     if(myscheme == NULL) {
 86 |         printusage();
 87 |         return -1;
 88 |     }
 89 |     const uint32_t minlength = 1U << S;
 90 |     cout << "########### Intersection benchmark ###########" << endl;
 91 | 
 92 |     cout << "# speeds in mis" << endl;
 93 |     cout << "# columns are size ratios" << endl;
 94 |     cout << "# rows are intersection ratios" << endl;
 95 |     cout << "# average gaps in bits for smallest array: " << std::setprecision(
 96 |             3) << log(1 + (1U << MaxBit) * 1.0 / minlength) << " (use -S and -M flag to change)"<< endl;
 97 | #ifdef __INTEL_COMPILER
 98 |     // Intel's support for C++ sucks
 99 |     vector<float> intersectionsratios;
100 |     intersectionsratios.push_back(1.00);
101 |     intersectionsratios.push_back(0.80);
102 |     intersectionsratios.push_back(0.60);
103 |     intersectionsratios.push_back(0.20);
104 |     intersectionsratios.push_back(0.10);
105 |     intersectionsratios.push_back(0.05);
106 |     intersectionsratios.push_back(0.01);
107 |     vector < uint32_t > sizeratios;
108 |     sizeratios.push_back(1);
109 |     sizeratios.push_back(2);
110 |     sizeratios.push_back(3);
111 |     sizeratios.push_back(5);
112 |     sizeratios.push_back(10);
113 |     sizeratios.push_back(20);
114 |     sizeratios.push_back(40);
115 |     sizeratios.push_back(80);
116 |     sizeratios.push_back(200);
117 |     sizeratios.push_back(500);
118 |     sizeratios.push_back(1000);
119 | #else
120 |     // proper C++
121 |     vector<float> intersectionsratios = { 1.00, 0.80, 0.60, 0.20, 0.10, 0.05,
122 |             0.01 };
123 |     vector < uint32_t > sizeratios = {1, 2, 3, 5, 10, 20,40,80,200,500,1000};
124 | #endif
125 |     cout<<"# average gaps in bits for last largest array: "<<std::setprecision(3)<<log(
126 |              1 + (1U << MaxBit) * 1.0 / (sizeratios.back()*minlength))<<endl;
127 |     cout << "#############################################" << endl << endl;
128 | 
129 |     cout<< name << endl << "\t\t";
130 |     ClusteredDataGenerator cdg;
131 |     WallClockTimer z;
132 |     size_t bogus = 0;
133 | 
134 | 
135 |     for(uint32_t sr :  sizeratios) {
136 |         cout<<"1:"<<sr << "\t";
137 |     }
138 |     cout<<endl;
139 |     size_t time = 0;
140 |     for(float ir : intersectionsratios) {
141 |         cout<< std::setprecision(3)<< (100*ir) << "%\t\t";
142 |         cout.flush();
143 |         for(uint32_t sr :  sizeratios) {
144 |             vector<uint32_t> buffer((sr*minlength + 15)/16*16);
145 |             vector <
146 |             pair<
147 |             vector<uint32_t>, vector<uint32_t>
148 |             >
149 |             > data(howmany);
150 |             for(size_t k = 0; k < howmany; ++k)
151 |                 data[k] = getPair(cdg, minlength,1U<<MaxBit, static_cast<float>(sr), ir);
152 |             size_t volume = 0;
153 |             z.reset();
154 |             for (size_t L = 0; L < loop; ++L) {
155 | 
156 |                 for (auto x : data) {
157 |                     volume += (x.first).size();
158 |                     volume += (x.second).size();
159 |                     bogus
160 |                     += myscheme(&(x.first)[0],
161 |                             (x.first).size(), &(x.second)[0],
162 |                             (x.second).size(),&buffer[0]);
163 |                 }
164 | 
165 |             }
166 |             time = z.split();
167 |             cout << std::setprecision(4) << static_cast<double>(volume) / static_cast<double>(time) << "\t";
168 |             cout.flush();
169 | 
170 | 
171 |         }
172 |         cout<<endl;
173 | 
174 |     }
175 | 
176 |     cout << "# bogus = " << bogus << endl;
177 | }
178 | 
179 | 


--------------------------------------------------------------------------------
/src/intersection.cpp:
--------------------------------------------------------------------------------
  1 | #include "intersection.h"
  2 | 
  3 | /**
  4 |  * Compute the *cardinality* of the intersection between two *sorted*
  5 |  * arrays.
  6 |  *
  7 |  * Algorithm design by D. Lemire. It uses several while loops on
  8 |  * purpose.
  9 |  *
 10 |  */
 11 | size_t danscalarintersectioncardinality(const uint32_t * set1, const size_t length1,
 12 |         const uint32_t * set2, const size_t length2) {
 13 |     if ((0 == length1) or (0 == length2))
 14 |         return 0;
 15 |     size_t answer = 0;
 16 |     size_t k1 = 0, k2 = 0;
 17 |     while (true) {
 18 |         if (set1[k1] < set2[k2]) {
 19 |             do {
 20 |                 ++k1;
 21 |                 if (k1 == length1)
 22 |                     return answer;
 23 |             } while (set1[k1] < set2[k2]);
 24 |         }
 25 |         if (set2[k2] < set1[k1]) {
 26 | 
 27 |             do {
 28 |                 ++k2;
 29 |                 if (k2 == length2)
 30 |                     return answer;
 31 |             } while (set2[k2] < set1[k1]);
 32 |         } else {
 33 |             // assert(set2[k2] == set1[k1]);
 34 |             ++answer;
 35 |             ++k1;
 36 |             if (k1 == length1)
 37 |                 break;
 38 |             ++k2;
 39 |             if (k2 == length2)
 40 |                 break;
 41 |         }
 42 |     }
 43 |     return answer;
 44 | }
 45 | 
 46 | /**
 47 |  * Compute the *cardinality* of the intersection between two *sorted*
 48 |  * arrays.
 49 |  *
 50 |  * Algorithm design by D. Lemire. It uses several while loops on
 51 |  * purpose.
 52 |  *
 53 |  */
 54 | size_t danscalarintersection(const uint32_t * set1, const size_t length1,
 55 |         const uint32_t * set2, const size_t length2, uint32_t * out) {
 56 |     if ((0 == length1) or (0 == length2))
 57 |         return 0;
 58 |     size_t answer = 0;
 59 |     size_t k1 = 0, k2 = 0;
 60 |     while (true) {
 61 |         if (set1[k1] < set2[k2]) {
 62 |             do {
 63 |                 ++k1;
 64 |                 if (k1 == length1)
 65 |                     return answer;
 66 |             } while (set1[k1] < set2[k2]);
 67 |         }
 68 |         if (set2[k2] < set1[k1]) {
 69 | 
 70 |             do {
 71 |                 ++k2;
 72 |                 if (k2 == length2)
 73 |                     return answer;
 74 |             } while (set2[k2] < set1[k1]);
 75 |         } else {
 76 | 
 77 |             // assert(set2[k2] == set1[k1]);
 78 |             out[answer++] = set1[k1];
 79 |             ++k1;
 80 |             if (k1 == length1)
 81 |                 break;
 82 |             ++k2;
 83 |             if (k2 == length2)
 84 |                 break;
 85 |         }
 86 |     }
 87 |     return answer;
 88 | }
 89 | 
 90 | /**
 91 |  * This is the classical approach
 92 |  */
 93 | size_t classicalintersectioncardinality(const uint32_t * set1,
 94 |         const size_t length1, const uint32_t * set2, const size_t length2) {
 95 |     if ((0 == length1) or (0 == length2))
 96 |         return 0;
 97 |     size_t answer = 0;
 98 |     size_t k1 = 0, k2 = 0;
 99 |     while (true) {
100 |         if (set1[k1] < set2[k2]) {
101 |             ++k1;
102 |             if (k1 == length1)
103 |                 return answer;
104 |         } else if (set2[k2] < set1[k1]) {
105 |             ++k2;
106 |             if (k2 == length2)
107 |                 return answer;
108 |         } else {
109 |             // (set2[k2] == set1[k1])
110 |             ++answer;
111 |             ++k1;
112 |             if (k1 == length1)
113 |                 break;
114 |             ++k2;
115 |             if (k2 == length2)
116 |                 break;
117 | 
118 |         }
119 |     }
120 |     return answer;
121 | 
122 | }
/**
 * Textbook merge-style scan over two *sorted* arrays that copies the values
 * they have in common into `out`, in increasing order.
 *
 * @param set1    first sorted array
 * @param length1 number of elements in set1
 * @param set2    second sorted array
 * @param length2 number of elements in set2
 * @param out     destination; must have room for every match
 * @return number of values written to out (0 when either array is empty)
 */
size_t classicalintersection(const uint32_t * set1,
        const size_t length1, const uint32_t * set2, const size_t length2, uint32_t * out) {
    size_t written = 0;
    size_t i = 0;
    size_t j = 0;
    // The scan is over as soon as either array is exhausted.
    while (i < length1 && j < length2) {
        const uint32_t left = set1[i];
        const uint32_t right = set2[j];
        if (left < right) {
            ++i;
        } else if (right < left) {
            ++j;
        } else {
            out[written] = left;
            ++written;
            ++i;
            ++j;
        }
    }
    return written;
}
156 | 
157 | 


--------------------------------------------------------------------------------
/src/multiSetIntersection.cpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemire/SIMDIntersections/7b0011cbcb441680f88c1a47254012c325d55cb3/src/multiSetIntersection.cpp


--------------------------------------------------------------------------------
/src/realintersection.cpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * This code is released under the
  3 |  * Apache License Version 2.0 http://www.apache.org/licenses/.
  4 |  *
  5 |  * (c) Daniel Lemire, http://lemire.me/en/
  6 |  */
  7 | #include "common.h"
  8 | #include "intersectionfactory.h"
  9 | #include "timer.h"
 10 | #include "synthetic.h"
 11 | #include "util.h"
 12 | #include "skipping.h"
 13 | 
 14 | void printusage() {
 15 |     cout << " Try ./realintersection -r 40" << endl;
 16 |     cout << " Use the -s flag to specify just some scheme, choose from: "
 17 |             << endl;
 18 |     for(string x : allRealNames()) cout <<" "<< x << endl;
 19 |     cout << " Separate the schemes by a comma (e.g. -s schlegel,danscalar). "
 20 |             << endl;
 21 | }
 22 | 
 23 | int main(int argc, char **argv) {
 24 |     size_t howmany = 0;
 25 |     bool natemode = false; // left undocumented by design, use at your own risk
 26 |     bool safe = true;
 27 |     size_t loop = 1000;
 28 |     uint32_t S = 12;
 29 |     uint32_t ratio = 1;
 30 |     bool skipping = false;
 31 |     uint32_t skipgaplog = 0;
 32 |     map<string, intersectionfunction> myschemes(realschemes);
 33 |     int c;
 34 |     while ((c = getopt(argc, argv, "uns:S:m:l:r:hk:")) != -1)
 35 |         switch (c) {
 36 |         case 'k':
 37 |             skipping = true;
 38 |             skipgaplog = atoi(optarg);
 39 |             if ((S < 1) or (S > 31)) {
 40 |                 cerr<<"Skip param needs to be within [1,31]."<<endl;
 41 |                 printusage();
 42 |                 return -1;
 43 |             }
 44 |             cout<<"#using skip steps of "<<(1<<skipgaplog)<<endl;
 45 |             myschemes.clear();
 46 |             break;
 47 |         case 'u':
 48 |             safe = false;
 49 |             break;
 50 | 
 51 |         case 'n':
 52 |             natemode = true;
 53 |             break;
 54 |         case 'h':
 55 |             printusage();
 56 |             return 0;
 57 |         case 'S':
 58 |             S = atoi(optarg);
 59 |             if ((S < 1) or (S > 31)) {
 60 |                 printusage();
 61 |                 return -1;
 62 |             }
 63 |             break;
 64 |         case 'm':
 65 |             howmany = atoi(optarg);
 66 |             if (howmany < 1) {
 67 |                 printusage();
 68 |                 return -1;
 69 |             }
 70 |             break;
 71 |         case 's':
 72 |             myschemes.clear();
 73 |             {
 74 |                 const string codecsstr(optarg);
 75 |                 const vector<string> codecslst = split(codecsstr, ",:;");
 76 |                 for (auto i = codecslst.begin(); i != codecslst.end(); ++i) {
 77 |                     if (realschemes.find(*i) == realschemes.end()) {
 78 |                         cerr << " Warning!!! Warning: unrecognized: " << *i
 79 |                                 << endl;
 80 |                         printusage();
 81 |                         return -1;
 82 | 
 83 |                     } else {
 84 |                         const auto K = realschemes.find(*i);
 85 |                         const std::string name = K->first;
 86 |                         const intersectionfunction fn = K->second;
 87 |                         myschemes[name] = fn;
 88 |                     }
 89 |                 }
 90 |             }
 91 |             break;
 92 |         case 'l':
 93 |             loop = atoi(optarg);
 94 |             if (loop < 1) {
 95 |                 printusage();
 96 |                 return -1;
 97 |             }
 98 |             break;
 99 |         case 'r':
100 |             ratio = atoi(optarg);
101 |             if (ratio < 1) {
102 |                 printusage();
103 |                 return -1;
104 |             }
105 |             break;
106 |         default:
107 |             abort();
108 |         }
109 |     if (howmany == 0) {
110 |         if (natemode)
111 |             howmany = 1;
112 |         else
113 |             howmany = 20;
114 |     }
115 |     uint32_t MaxBit = 31;
116 |     if (natemode) {
117 |         MaxBit = S + 7;
118 |         if (MaxBit > 31)
119 |             MaxBit = 31;
120 |     }
121 |     cout << "# algo: ";
122 |     for(auto algo : myschemes) {
123 |         cout << algo.first<< " ";
124 |     }
125 |     cout << endl;
126 |     const uint32_t N = 1U << S;
127 |     if (!natemode)
128 |         if (ratio != 1) {
129 |             cout << "# ratio = " << ratio << endl;
130 |             cout << "# size of largest array = " << N << endl;
131 |             cout << "# size of smallest array = " << N / ratio << endl;
132 |         } else {
133 |             cout << "# size of arrays = " << N << endl;
134 |         }
135 |     else
136 |         cout << "# in natemode" << endl;
137 | 
138 |     ClusteredDataGenerator cdg;
139 |     WallClockTimer z;
140 |     size_t bogus = 0;
141 |     size_t volume = 0;
142 |     uint64_t time = 0;
143 |     cout << "# first column is relative size of intersection" << endl;
144 |     if (ratio > 1) {
145 |         cout
146 |                 << "# next two are estimated average bits per int for differential coding"
147 |                 << endl;
148 |     } else {
149 |         cout
150 |                 << "# next is estimated average bits per int for differential coding"
151 |                 << endl;
152 |     }
153 | 
154 |     cout
155 |             << "# other columns display speed in mis when computing the intersection"
156 |             << endl;
157 |     for (uint32_t gap = 0; gap + S <= MaxBit; gap += 1) {
158 |         vector < vector<uint32_t> > data;
159 |         for (size_t zz = 0; zz < howmany; ++zz) {
160 |             if (natemode) {
161 |                 data.push_back(
162 |                         cdg.generateClustered((1U << (MaxBit - gap)) / ratio,
163 |                                 1U << MaxBit));
164 |                 data.push_back(
165 |                         cdg.generateClustered((1U << (MaxBit - gap)),
166 |                                 1U << MaxBit));
167 |             } else {
168 |                 data.push_back(
169 |                         cdg.generateClustered(N / ratio, 1U << (gap + S)));
170 |                 data.push_back(cdg.generateClustered(N, 1U << (gap + S)));
171 |             }
172 |         }
173 |         size_t intersize = 0;
174 |         size_t smallestsize = 0;
175 |         for (size_t k = 0; k < howmany; k++) {
176 |             intersize += classicalintersectioncardinality(&data[2 * k][0],
177 |                     data[2 * k].size(), &data[2 * k + 1][0],
178 |                     data[2 * k + 1].size());
179 |             smallestsize
180 |                     += data[2 * k + 1].size() < data[2 * k].size() ? data[2 * k
181 |                             + 1].size() : data[2 * k].size();
182 |         }
183 | 
184 |         cout << std::fixed << std::setprecision(3)
185 |                 << static_cast<double> (intersize)
186 |                         / static_cast<double> (smallestsize) << "\t";
187 |         cout.flush();
188 |         if (ratio > 1) {
189 |             if (natemode) {
190 |                 cout << log(
191 |                         1 + (1U << MaxBit) * 1.0 / ((1U << (MaxBit - gap))
192 |                                 / ratio)) / log(2) << "\t";
193 |                 cout
194 |                         << log(
195 |                                 1 + (1U << MaxBit) * 1.0 / ((1U << (MaxBit
196 |                                         - gap)))) / log(2) << "\t";
197 |             } else {
198 |                 cout << log(1 + (1U << (gap + S)) * 1.0 / (N / ratio)) / log(2)
199 |                         << "\t";
200 |                 cout << log(1 + (1U << (gap + S)) * 1.0 / (N)) / log(2) << "\t";
201 |             }
202 |         } else {
203 |             if (natemode) {
204 |                 cout
205 |                         << log(
206 |                                 1 + (1U << MaxBit) * 1.0 / ((1U << (MaxBit
207 |                                         - gap)))) / log(2) << "\t";
208 |             } else {
209 |                 cout << log(1 + (1U << (gap + S)) * 1.0 / (N)) / log(2) << "\t";
210 |             }
211 | 
212 |         }
213 |         cout << "\t";
214 |         cout.flush();
215 |         size_t maxsize = 0;
216 |         for(auto x : data)
217 |           if(x.size() > maxsize) maxsize = x.size();
218 |         vector < uint32_t > buffer((maxsize + 15) / 16 * 16);
219 |         /**
220 |          * Skipping is a standard technique in IR. We test it here.
221 |          */
222 |         if (skipping) {
223 |             vector < shared_ptr<Skipping> > sdata;
224 |             for(vector<uint32_t> & x : data)
225 |                 sdata.emplace_back(shared_ptr<Skipping>(new Skipping(skipgaplog,x.data(),static_cast<uint32_t>(x.size()))));
226 |             for (size_t k = 0; k < 2 * howmany; k += 2) {
227 |                 vector < uint32_t > out(buffer.size());
228 |                 size_t correctanswer = classicalintersection(&data[k][0],
229 |                         data[k].size(), &data[k + 1][0], data[k + 1].size(),
230 |                         &out[0]);
231 |                 out.resize(correctanswer);
232 |                 vector < uint32_t > out2(buffer.size());
233 |                 size_t thisschemesanswer = sdata[k]->intersect(*sdata[k+1],&out2[0]);
234 |                 out2.resize(thisschemesanswer);
235 |                 if (out != out2) {
236 |                     if (thisschemesanswer != correctanswer) {
237 |                         cerr << "expecting cardinality of " << correctanswer;
238 |                         cerr << " got " << thisschemesanswer << "." << endl;
239 |                         int times = 0;
240 |                         for(size_t jj=0; (jj < thisschemesanswer)&&(jj<correctanswer)&&(times<10);++jj) {
241 |                           if(out[jj]!=out2[jj]) {
242 |                             cout<<" index = "<<jj<<" expected "<<out[jj]<<" but got "<<out2[jj]<<endl;
243 |                             ++times;
244 |                           }
245 |                         }
246 |                         if(times == 0) cout<<"content is correct, but incomplete/too much"<<endl;
247 |                     } else {
248 |                       cerr << "Same cardinality "<< correctanswer<<". Good. "<< endl;
249 |                       for(size_t jj = 0; jj < correctanswer; ++jj)
250 |                         if(out[jj]!= out2[jj]) {
251 |                           cerr<<"Differ at "<<jj<<" got "<<out2[jj]<<" should find "<<out[jj]<<endl;
252 |                           break;
253 |                        }
254 |                     }
255 |                     throw runtime_error("bug");
256 |                 }
257 |         }
258 |         volume = 0;
259 |         z.reset();
260 |         for (size_t L = 0; L < loop; ++L) {
261 | 
262 |             for (size_t k = 0; k < 2 * howmany; k += 2) {
263 |                 volume += data[k].size();
264 |                 volume += data[k + 1].size();
265 |                 bogus
266 |                 += sdata[k]->intersect(*sdata[k+1],&buffer[0]);
267 |             }
268 | 
269 |         }
270 |         time = z.split();
271 |         cout << std::setprecision(0) << static_cast<double>(volume) / static_cast<double>(time) << "\t";
272 |         cout.flush();
273 |     }
274 |     /**
275 |     * End of Skipping
276 |     */
277 |     for(auto algo : myschemes) {
278 |         if (safe and buggyschemes.find(algo.first) == buggyschemes.end() )
279 |         for (size_t k = 0; k < 2 * howmany; k += 2) {
280 |             vector<uint32_t> out(buffer.size());
281 |             size_t correctanswer = classicalintersection(
282 |                     &data[k][0], data[k].size(), &data[k + 1][0],
283 |                     data[k + 1].size(),&out[0]);
284 |             out.resize(correctanswer);
285 |             vector<uint32_t> out2(buffer.size());
286 |             size_t thisschemesanswer = algo.second(
287 |                     &data[k][0], data[k].size(), &data[k + 1][0],
288 |                     data[k + 1].size(),&out2[0]);
289 |             out2.resize(thisschemesanswer);
290 |             if (out != out2) {
291 |                 if(thisschemesanswer != correctanswer) {
292 |                     cerr << "expecting cardinality of " << correctanswer;
293 |                     cerr << " got " << thisschemesanswer << "."
294 |                     << endl;
295 |                     if(correctanswer < 10)
296 |                     for(uint32_t x : out)
297 |                     cerr<<x<<endl;
298 |                 } else {
299 |                     cerr << "Same cardinality "<< correctanswer<<". Good. "<< endl;
300 |                     for(size_t jj = 0; jj < correctanswer; ++jj)
301 |                     if(out[jj]!= out2[jj]) {
302 |                         cerr<<"Differ at "<<jj<<" got "<<out2[jj]<<" should find "<<out[jj]<<endl;
303 |                         break;
304 |                     }
305 |                 }
306 |                 throw runtime_error("bug");
307 |             }
308 |         }
309 |         volume = 0;
310 |         z.reset();
311 |         for (size_t L = 0; L < loop; ++L) {
312 | 
313 |             for (size_t k = 0; k < 2 * howmany; k += 2) {
314 |                 volume += data[k].size();
315 |                 volume += data[k + 1].size();
316 |                 bogus
317 |                 += algo.second(&data[k][0],
318 |                         data[k].size(), &data[k + 1][0],
319 |                         data[k + 1].size(),&buffer[0]);
320 |             }
321 | 
322 |         }
323 |         time = z.split();
324 |         cout << std::setprecision(0) << static_cast<double>(volume) / static_cast<double>(time) << "\t";
325 |         cout.flush();
326 |     }
327 | 
328 |     cout << endl;
329 | 
330 | }
331 | cout << "# bogus = " << bogus << endl;
332 | }
333 | 


--------------------------------------------------------------------------------
/src/testintersection.cpp:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * This code is released under the
  3 |  * Apache License Version 2.0 http://www.apache.org/licenses/.
  4 |  *
  5 |  * (c) Daniel Lemire, http://lemire.me/en/
  6 |  */
  7 | #include "intersectionfactory.h"
  8 | #include "common.h"
  9 | #include "timer.h"
 10 | #include "synthetic.h"
 11 | #include "util.h"
 12 | 
 13 | 
 14 | void printusage() {
 15 |     cout << " Try ./testintersection -r 40" << endl;
 16 |     cout << " Use the -s flag to specify just some scheme, choose from: "
 17 |             << endl;
 18 |     for(string x : allNames()) cout <<" "<< x << endl;
 19 |     cout << " Separate the schemes by a comma (e.g. -s schlegel,danscalar). "<< endl;
 20 | }
 21 | 
 22 | int main(int argc, char **argv) {
 23 |     size_t howmany = 0;
 24 |     bool natemode = false;
 25 |     bool safe = true;
 26 |     size_t loop = 1000;
 27 |     uint32_t S = 12;
 28 |     uint32_t ratio = 1;
 29 |     map<string, cardinalityintersectionfunction> myschemes(schemes);
 30 |     map<string, cardinalityintersectionfunctionpart> mypartschemes(partschemes);
 31 |     int c;
 32 |     while ((c = getopt(argc, argv, "uns:S:m:l:r:h")) != -1)
 33 |         switch (c) {
 34 |         case 'u':
 35 |             safe = false;
 36 |             break;
 37 | 
 38 |         case 'n':
 39 |             natemode = true;
 40 |             break;
 41 |         case 'h':
 42 |             printusage();
 43 |             return 0;
 44 |         case 'S':
 45 |             S = atoi(optarg);
 46 |             if ((S < 1) or (S > 31)) {
 47 |                 printusage();
 48 |                 return -1;
 49 |             }
 50 |             break;
 51 |         case 'm':
 52 |             howmany = atoi(optarg);
 53 |             if (howmany < 1) {
 54 |                 printusage();
 55 |                 return -1;
 56 |             }
 57 |             break;
 58 |         case 's':
 59 |             myschemes.clear();
 60 |             mypartschemes.clear();
 61 |             {
 62 |                 const string codecsstr(optarg);
 63 |                 const vector < string > codecslst = split(codecsstr, ",:;");
 64 |                 for (auto i = codecslst.begin(); i != codecslst.end(); ++i) {
 65 |                     if (schemes.find(*i) == schemes.end()) {
 66 |                         if (partschemes.find(*i) == partschemes.end()) {
 67 |                             cerr << " Warning!!! Warning: unrecognized: " << *i
 68 |                                     << endl;
 69 |                             printusage();
 70 |                             return -1;
 71 |                         } else {
 72 |                             const auto K = partschemes.find(*i);
 73 |                             const std::string name = K->first;
 74 |                             const cardinalityintersectionfunctionpart fn = K->second;
 75 |                             mypartschemes[name] =  fn;
 76 |                         }
 77 |                     } else {
 78 |                         const auto K = schemes.find(*i);
 79 |                         const std::string name = K->first;
 80 |                         const cardinalityintersectionfunction fn = K->second;
 81 |                         myschemes[name] =  fn;
 82 |                     }
 83 |                 }
 84 |             }
 85 |             break;
 86 |         case 'l':
 87 |             loop = atoi(optarg);
 88 |             if (loop < 1) {
 89 |                 printusage();
 90 |                 return -1;
 91 |             }
 92 |             break;
 93 |         case 'r':
 94 |             ratio = atoi(optarg);
 95 |             if (ratio < 1) {
 96 |                 printusage();
 97 |                 return -1;
 98 |             }
 99 |             break;
100 |         default:
101 |             abort();
102 |         }
103 |     if (howmany == 0) {
104 |         if (natemode)
105 |             howmany = 1;
106 |         else
107 |             howmany = 20;
108 |     }
109 |     uint32_t MaxBit = 31;
110 |     if (natemode) {
111 |         MaxBit = S + 7;
112 |         if (MaxBit > 31)
113 |             MaxBit = 31;
114 |     }
115 |     cout<<"# algo: ";
116 |     for(auto algo : myschemes) {
117 |         cout << algo.first<< " ";
118 |     }
119 |     for(auto algo : mypartschemes) {
120 |             cout << algo.first<< " ";
121 |     }
122 |     cout << endl;
123 |     const uint32_t N = 1U << S;
124 |     if (!natemode)
125 |         if (ratio != 1) {
126 |             cout << "# ratio = " << ratio << endl;
127 |             cout << "# size of largest array = " << N << endl;
128 |             cout << "# size of smallest array = " << N / ratio << endl;
129 |         } else {
130 |             cout << "# size of arrays = " << N << endl;
131 |         }
132 |     else
133 |         cout << "# in natemode" << endl;
134 | 
135 |     ClusteredDataGenerator cdg;
136 |     WallClockTimer z;
137 |     size_t bogus = 0;
138 |     size_t volume = 0;
139 |     uint64_t time = 0;
140 |     cout << "# first column is relative size of intersection" << endl;
141 |     if (ratio > 1) {
142 |         cout
143 |                 << "# next two are estimated average bits per int for differential coding"
144 |                 << endl;
145 |     } else {
146 |         cout
147 |                 << "# next is estimated average bits per int for differential coding"
148 |                 << endl;
149 |     }
150 | 
151 |     cout
152 |             << "# other columns display speed in mis when computing the cardinality of the intersection"
153 |             << endl;
154 |     for (uint32_t gap = 0; gap + S <= MaxBit; gap += 1) {
155 |         vector < vector<uint32_t> > data;
156 |         for (size_t zz = 0; zz < howmany; ++zz) {
157 |             if (natemode) {
158 |                 data.push_back(
159 |                         cdg.generateClustered((1U << (MaxBit - gap)) / ratio,
160 |                                 1U << MaxBit));
161 |                 data.push_back(
162 |                         cdg.generateClustered((1U << (MaxBit - gap)),
163 |                                 1U << MaxBit));
164 |             } else {
165 |                 data.push_back(
166 |                         cdg.generateClustered(N / ratio, 1U << (gap + S)));
167 |                 data.push_back(cdg.generateClustered(N, 1U << (gap + S)));
168 |             }
169 |         }
170 |         size_t intersize = 0;
171 |         size_t smallestsize = 0;
172 |         for (size_t k = 0; k < howmany; k++) {
173 |             intersize += classicalintersectioncardinality(&data[2 * k][0],
174 |                     data[2 * k].size(), &data[2 * k + 1][0],
175 |                     data[2 * k + 1].size());
176 |             smallestsize
177 |                     += data[2 * k + 1].size() < data[2 * k].size() ? data[2 * k
178 |                             + 1].size() : data[2 * k].size();
179 |         }
180 | 
181 |         cout << std::fixed << std::setprecision(3) << static_cast<double>(intersize)
182 |                 / static_cast<double>(smallestsize) << "\t";
183 |         cout.flush();
184 |         if (ratio > 1) {
185 |             if (natemode) {
186 |                 cout << log(
187 |                         1 + (1U << MaxBit) * 1.0 / ((1U << (MaxBit - gap))
188 |                                 / ratio)) / log(2) << "\t";
189 |                 cout
190 |                         << log(
191 |                                 1 + (1U << MaxBit) * 1.0 / ((1U << (MaxBit
192 |                                         - gap)))) / log(2) << "\t";
193 |             } else {
194 |                 cout << log(1 + (1U << (gap + S)) * 1.0 / (N / ratio)) / log(2)
195 |                         << "\t";
196 |                 cout << log(1 + (1U << (gap + S)) * 1.0 / (N)) / log(2) << "\t";
197 |             }
198 |         } else {
199 |             if (natemode) {
200 |                 cout
201 |                         << log(
202 |                                 1 + (1U << MaxBit) * 1.0 / ((1U << (MaxBit
203 |                                         - gap)))) / log(2) << "\t";
204 |             } else {
205 |                 cout << log(1 + (1U << (gap + S)) * 1.0 / (N)) / log(2) << "\t";
206 |             }
207 | 
208 |         }
209 |         cout << "\t";
210 |         cout.flush();
211 |         for(auto algo : myschemes) {
212 |             if (safe and buggyschemes.find(algo.first) == buggyschemes.end() )
213 |             for (size_t k = 0; k < 2 * howmany; k += 2) {
214 |                 size_t correctanswer = classicalintersectioncardinality(
215 |                         &data[k][0], data[k].size(), &data[k + 1][0],
216 |                         data[k + 1].size());
217 |                 size_t thisschemesanswer = algo.second(
218 |                         &data[k][0], data[k].size(), &data[k + 1][0],
219 |                         data[k + 1].size());
220 |                 if (correctanswer != thisschemesanswer) {
221 |                     cerr << "expecting cardinality of " << correctanswer;
222 |                     cerr << " got " << thisschemesanswer << " instead."
223 |                     << endl;
224 |                     throw runtime_error("bug");
225 |                 }
226 |             }
227 |             volume = 0;
228 |             z.reset();
229 |             for (size_t L = 0; L < loop; ++L) {
230 | 
231 |                 for (size_t k = 0; k < 2 * howmany; k += 2) {
232 |                     volume += data[k].size();
233 |                     volume += data[k + 1].size();
234 |                     bogus
235 |                     += algo.second(&data[k][0],
236 |                             data[k].size(), &data[k + 1][0],
237 |                             data[k + 1].size());
238 |                 }
239 | 
240 |             }
241 |             time = z.split();
242 |             cout << std::setprecision(0) << static_cast<double>(volume) / static_cast<double>(time) << "\t";
243 |             cout.flush();
244 |         }
245 | 
246 |         /**
247 |          * From Schlegel et al., Fast Sorted-Set Intersection using SIMD Instructions
248 |          */
249 | 
250 |         vector < vector<uint16_t> > pdata(data.size());
251 |         assert(pdata.size() == 2 * howmany);
252 |         for (size_t zz = 0; zz < data.size(); ++zz) {
253 |             pdata[zz].resize(data[zz].size() * 4);
254 |             const size_t c = partitioned::partition(&data[zz][0],
255 |                     data[zz].size(), &pdata[zz][0], pdata[zz].size());
256 |             pdata[zz].resize(c);
257 |             vector<uint16_t> (pdata[zz]).swap(pdata[zz]);
258 |             assert(pdata[zz].size() == c);
259 | 
260 |             if (safe) {
261 |                 vector < uint32_t > testvec(data[zz].size());
262 |                 size_t recovsize = partitioned::inverse_partition(&testvec[0],
263 |                         testvec.size(), &pdata[zz][0], pdata[zz].size());
264 |                 if (testvec.size() != recovsize)
265 |                     throw std::runtime_error("bug");
266 |                 if (testvec != data[zz])
267 |                     throw std::runtime_error("bug");
268 |             }
269 | 
270 |         }
271 |         cout << "\t";
272 |         //for (uint32_t whichalgo = 0; whichalgo < HOWMANYPARTALGO; ++whichalgo) {
273 |         for(auto algo : mypartschemes) {
274 |             volume = 0;
275 |             if (safe)
276 |             for (size_t k = 0; k < 2 * howmany; k += 2) {
277 |                 size_t correctanswer = classicalintersectioncardinality(
278 |                         &data[k][0], data[k].size(), &data[k + 1][0],
279 |                         data[k + 1].size());
280 |                 size_t thisschemesanswer = algo.second(
281 |                         &pdata[k][0], &pdata[k + 1][0], pdata[k].size(),
282 |                         pdata[k + 1].size());
283 |                 if (correctanswer != thisschemesanswer) {
284 |                     cerr << "expecting cardinality of " << correctanswer;
285 |                     cerr << " got " << thisschemesanswer << " instead."
286 |                     << endl;
287 |                     throw runtime_error("bug");
288 |                 }
289 | 
290 |             }
291 | 
292 |             z.reset();
293 | 
294 |             for (size_t k = 0; k < 2 * howmany; k += 2) {
295 |                 volume += data[k].size();
296 |                 volume += data[k + 1].size();
297 | 
298 |                 bogus += algo.second(&pdata[k][0],
299 |                         &pdata[k + 1][0], pdata[k].size(), pdata[k + 1].size());
300 |             }
301 | 
302 |             time = z.split();
303 |             cout << std::setprecision(0) << static_cast<double>(volume) / static_cast<double>(time) << "\t";
304 | 
305 |             cout.flush();
306 |         }
307 |         cout << endl;
308 | 
309 |     }
310 |     cout << "# bogus = " << bogus << endl;
311 | }
312 | 


--------------------------------------------------------------------------------
/src/unit.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * This is not the totality of our testing, but rather, this is meant to include
  3 |  * very specific tests.
  4 |  *
  5 |  * See testintersection.cpp for more general tests.
  6 |  *
  7 |  */
  8 | 
  9 | // todo: add tests on large numbers
 10 | 
 11 | #include "common.h"
 12 | #include "intersectionfactory.h"
 13 | 
 14 | int test2(intersectionfunction f) {
 15 |     const uint32_t firstpost[5] = { 12635, 12921, 12923, 12924,
 16 |             12926};
 17 | 
 18 |     const uint32_t secondpost[173] = { 3756, 11996, 12044, 12049, 12109, 12128,
 19 |             12131, 12141, 12142, 12150, 12154, 12160, 12167, 12168, 12172,
 20 |             12177, 12201, 12208, 12215, 12216, 12223, 12228, 12232, 12233,
 21 |             12234, 12235, 12236, 12240, 12241, 12242, 12243, 12254, 12255,
 22 |             12256, 12257, 12259, 12260, 12261, 12262, 12264, 12265, 12266,
 23 |             12275, 12295, 12471, 12482, 12486, 12508, 12509, 12510, 12511,
 24 |             12512, 12530, 12536, 12572, 12573, 12589, 12607, 12609, 12611,
 25 |             12630, 12631, 12632, 12633, 12634, 12635, 12636, 12653, 12655,
 26 |             12657, 12668, 12672, 12685, 12702, 12716, 12721, 12741, 12745,
 27 |             12750, 12755, 12757, 12761, 12765, 12767, 12768, 12794, 12802,
 28 |             12803, 12823, 12842, 12851, 12871, 12891, 12893, 12894, 12895,
 29 |             12896, 12897, 12915, 12917, 12918, 12919, 12920, 12921, 12922,
 30 |             12923, 12924, 12925, 12927, 12929, 12932, 12933, 12934, 12935,
 31 |             12936, 12937, 12938, 12939, 12942, 12946, 12951, 12955, 12963,
 32 |             12972, 13011, 13013, 13014, 13015, 13017, 13032, 13033, 13036,
 33 |             13042, 13050, 13051, 13052, 13057, 13058, 13060, 13090, 13120,
 34 |             13132, 13136, 13147, 13185, 13191, 13192, 13193, 13194, 13195,
 35 |             13198, 13202, 13205, 13219, 13228, 13230, 13232, 13233, 13238,
 36 |             13240, 13246, 13248, 13277, 13278, 13281, 13282, 13283, 13284,
 37 |             13291, 13320, 13338, 13346, 13347 };
 38 |     vector < uint32_t > inter(173);
 39 |     size_t s = f(firstpost, 5, secondpost, 173, inter.data());
 40 |     inter.resize(s);
 41 |     vector < uint32_t > correct(173);
 42 |     size_t cs = classicalintersection(firstpost, 5, secondpost, 173,
 43 |             correct.data());
 44 |     correct.resize(cs);
 45 |     if (inter != correct) {
 46 |         cout << inter.size() << " " << correct.size() << endl;
 47 |         cout<<" correct answer:"<<endl;
 48 |         for (size_t i = 0; i < correct.size(); ++i)
 49 |             cout << i << " " << correct[i] << endl;
 50 |         cout<<" bad answer:"<<endl;
 51 |         for (size_t i = 0; i < inter.size(); ++i)
 52 |             cout << i << " " << inter[i] << endl;
 53 |         return 1;
 54 |     }
 55 |     return 0;
 56 | 
 57 | }
 58 | 
 59 | int test1(intersectionfunction f, bool testwriteback) {
 60 | 
 61 |     const uint32_t firstpost[13] = {27181,35350,39241,39277,39278,44682,64706,120447,120450,159274,159290,173895,173942,
 62 | };
 63 |     const uint32_t secondpost[13] = {25369,28789,28790,28792,28794,28797,37750,42317,68797,68877,68881,68990,85488};
 64 |     vector < uint32_t > inter(13);
 65 |     size_t s = f(firstpost, 13, secondpost, 13, inter.data());
 66 |     inter.resize(s);
 67 |     vector < uint32_t > correct(13);
 68 |     size_t cs = classicalintersection(firstpost, 13, secondpost, 13,
 69 |             correct.data());
 70 |     correct.resize(cs);
 71 |     if (inter != correct) {
 72 |         cout << inter.size() << " " << correct.size() << endl;
 73 |         for (size_t i = 0; (i < inter.size()) && (i < correct.size()); ++i)
 74 |             cout << i << " " << inter[i] << " " << correct[i] << endl;
 75 |         return 1;
 76 |     }
 77 |     if (!testwriteback)
 78 |         return 0;
 79 |     vector < uint32_t > inter2(firstpost, firstpost + 13);
 80 |     size_t s2 = f(inter2.data(), 13, secondpost, 13, inter2.data());
 81 |     inter2.resize(s2);
 82 |     if (inter2 != correct)
 83 |         return 2;
 84 |     return 0;
 85 | 
 86 | }
 87 | 
 88 | int test3(intersectionfunction f) {
 89 | 
 90 |     vector<uint32_t> firstpost;
 91 |     vector<uint32_t> secondpost;
 92 |     vector<uint32_t> trueinter;
 93 | 
 94 |     for(uint32_t i = 10; i < 31;++i) {
 95 |         firstpost.push_back((1U<<i) | 3U);
 96 |         trueinter.push_back((1U<<i) | 3U);
 97 |         for(uint32_t j = 3; j< 1000;j+=11) {
 98 |             secondpost.push_back((1U<<i) | j);
 99 |         }
100 |         firstpost.push_back((1U<<i) | 1001U);
101 |     }
102 |     vector < uint32_t > inter(firstpost.size());
103 |     size_t s = f(firstpost.data(), firstpost.size(), secondpost.data(), secondpost.size(), inter.data());
104 |     inter.resize(s);
105 |     if(inter != trueinter) {
106 |         cout << inter.size() << " " << trueinter.size() << endl;
107 |         for (size_t i = 0; (i < inter.size()) && (i < trueinter.size()); ++i)
108 |             cout << i << " " << inter[i] << " " << trueinter[i] << endl;
109 |         return 1;
110 | 
111 |         return 1;
112 |     }
113 |     return 0;
114 | }
115 | 
116 | 
117 | int main() {
118 |     int error = 0;
119 |     for (string n : allRealNames()) {
120 |         cout<<"testing "<<n<<" ... ";
121 |         cout.flush();
122 |         int code;
123 |         bool testwriteback = (n != "hssimd") && (n != "hssimddan");
124 |         if((code = test1(realschemes[n],testwriteback))==0)
125 |         cout<<"ok ";
126 |         else {
127 |             cout<<" Error"<<code<<" ";
128 |             ++error;
129 |         }
130 |         if((code = test2(realschemes[n]))==0)
131 |         cout<<"ok ";
132 |         else {
133 |             cout<<" Error"<<code<<endl;
134 |             ++error;
135 |         }
136 |         if((code = test3(realschemes[n]))==0)
137 |         cout<<"ok"<<endl;
138 |         else {
139 |             cout<<" Error"<<code<<endl;
140 |             ++error;
141 |         }
142 | 
143 |     }
144 |     if (error == 0)
145 |         cout << "Your code is maybe ok." << endl;
146 |     else
147 |         cout << "Your code is buggy, found " << error << " error(s)" << endl;
148 |     return 0;
149 | }
150 | 


--------------------------------------------------------------------------------