null.
151 | */
152 | private static HLLType getType(final int ordinal) {
153 | if((ordinal < 0) || (ordinal >= TYPE_ORDINALS.length)) {
154 | throw new IllegalArgumentException("Invalid type ordinal '" + ordinal + "'. Only 0-" + (TYPE_ORDINALS.length - 1) + " inclusive allowed.");
155 | }
156 | return TYPE_ORDINALS[ordinal];
157 | }
158 | }
159 |
--------------------------------------------------------------------------------
/src/main/java/net/agkn/hll/serialization/SerializationUtil.java:
--------------------------------------------------------------------------------
1 | package net.agkn.hll.serialization;
2 |
3 | /*
4 | * Copyright 2013 Aggregate Knowledge, Inc.
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 |
19 | import net.agkn.hll.HLLType;
20 |
21 | /**
22 | * A collection of constants and utilities for serializing and deserializing
23 | * HLLs.
24 | *
25 | * NOTE: 'package' visibility is used for many methods that only need to be
26 | * used by the {@link ISchemaVersion} implementations. The structure of
27 | * a serialized HLL's metadata should be opaque to the rest of the
28 | * library.
29 | *
30 | * @author timon
31 | */
32 | public class SerializationUtil {
33 | /**
34 | * The number of bits (of the parameters byte) dedicated to encoding the
35 | * width of the registers.
36 | */
37 | /*package*/ static int REGISTER_WIDTH_BITS = 3;
38 |
39 | /**
40 | * A mask to cap the maximum value of the register width.
41 | */
42 | /*package*/ static int REGISTER_WIDTH_MASK = (1 << REGISTER_WIDTH_BITS) - 1;
43 |
44 | /**
45 | * The number of bits (of the parameters byte) dedicated to encoding
46 | * log2(registerCount)
.
47 | */
48 | /*package*/ static int LOG2_REGISTER_COUNT_BITS = 5;
49 |
50 | /**
51 | * A mask to cap the maximum value of log2(registerCount)
.
52 | */
53 | /*package*/ static int LOG2_REGISTER_COUNT_MASK = (1 << LOG2_REGISTER_COUNT_BITS) - 1;
54 |
55 | /**
56 | * The number of bits (of the cutoff byte) dedicated to encoding the
57 | * log-base-2 of the explicit cutoff or sentinel values for
58 | * 'explicit-disabled' or 'auto'.
59 | */
60 | /*package*/ static int EXPLICIT_CUTOFF_BITS = 6;
61 |
62 | /**
63 | * A mask to cap the maximum value of the explicit cutoff choice.
64 | */
65 | /*package*/ static int EXPLICIT_CUTOFF_MASK = (1 << EXPLICIT_CUTOFF_BITS) - 1;
66 |
67 | /**
68 | * Number of bits in a nibble.
69 | */
70 | private static int NIBBLE_BITS = 4;
71 |
72 | /**
73 | * A mask to cap the maximum value of a nibble.
74 | */
75 | private static int NIBBLE_MASK = (1 << NIBBLE_BITS) - 1;
76 |
77 | // ************************************************************************
78 | // Serialization utilities
79 |
80 | /**
81 | * Schema version one (v1).
82 | */
83 | public static ISchemaVersion VERSION_ONE = new SchemaVersionOne();
84 |
85 | /**
86 | * The default schema version for serializing HLLs.
87 | */
88 | public static ISchemaVersion DEFAULT_SCHEMA_VERSION = VERSION_ONE;
89 |
90 | /**
91 | * List of registered schema versions, indexed by their version numbers. If
92 | * an entry is null
, then no such schema version is registered.
93 | * Similarly, registering a new schema version simply entails assigning an
94 | * {@link ISchemaVersion} instance to the appropriate index of this array.
95 | *
96 | * By default, only {@link SchemaVersionOne} is registered. Note that version
97 | * zero will always be reserved for internal (e.g. proprietary, legacy) schema
98 | * specifications/implementations and will never be assigned to in by this
99 | * library.
100 | */
101 | public static ISchemaVersion[] REGISTERED_SCHEMA_VERSIONS = new ISchemaVersion[16];
102 |
103 | static {
104 | REGISTERED_SCHEMA_VERSIONS[1] = VERSION_ONE;
105 | }
106 |
107 | /**
108 | * @param schemaVersionNumber the version number of the {@link ISchemaVersion}
109 | * desired. This must be a registered schema version number.
110 | * @return The {@link ISchemaVersion} for the given number. This will never
111 | * be null
.
112 | */
113 | public static ISchemaVersion getSchemaVersion(final int schemaVersionNumber) {
114 | if(schemaVersionNumber >= REGISTERED_SCHEMA_VERSIONS.length || schemaVersionNumber < 0) {
115 | throw new RuntimeException("Invalid schema version number " + schemaVersionNumber);
116 | }
117 | final ISchemaVersion schemaVersion = REGISTERED_SCHEMA_VERSIONS[schemaVersionNumber];
118 | if(schemaVersion == null) {
119 | throw new RuntimeException("Unknown schema version number " + schemaVersionNumber);
120 | }
121 | return schemaVersion;
122 | }
123 |
124 | /**
125 | * Get the appropriate {@link ISchemaVersion schema version} for the specified
126 | * serialized HLL.
127 | *
128 | * @param bytes the serialized HLL whose schema version is desired.
129 | * @return the schema version for the specified HLL. This will never
130 | * be null
.
131 | */
132 | public static ISchemaVersion getSchemaVersion(final byte[] bytes) {
133 | final byte versionByte = bytes[0];
134 | final int schemaVersionNumber = schemaVersion(versionByte);
135 |
136 | return getSchemaVersion(schemaVersionNumber);
137 | }
138 |
139 | // ************************************************************************
140 | // Package-specific shared helpers
141 |
142 | /**
143 | * Generates a byte that encodes the schema version and the type ordinal
144 | * of the HLL.
145 | *
146 | * The top nibble is the schema version and the bottom nibble is the type
147 | * ordinal.
148 | *
149 | * @param schemaVersion the schema version to encode.
150 | * @param typeOrdinal the type ordinal of the HLL to encode.
151 | * @return the packed version byte
152 | */
153 | public static byte packVersionByte(final int schemaVersion, final int typeOrdinal) {
154 | return (byte)(((NIBBLE_MASK & schemaVersion) << NIBBLE_BITS) | (NIBBLE_MASK & typeOrdinal));
155 | }
156 | /**
157 | * Generates a byte that encodes the log-base-2 of the explicit cutoff
158 | * or sentinel values for 'explicit-disabled' or 'auto', as well as the
159 | * boolean indicating whether to use {@link HLLType#SPARSE}
160 | * in the promotion hierarchy.
161 | *
162 | * The top bit is always padding, the second highest bit indicates the
163 | * 'sparse-enabled' boolean, and the lowest six bits encode the explicit
164 | * cutoff value.
165 | *
166 | * @param explicitCutoff the explicit cutoff value to encode.
167 | *
168 | * -
169 | * If 'explicit-disabled' is chosen, this value should be
0
.
170 | *
171 | * -
172 | * If 'auto' is chosen, this value should be
63
.
173 | *
174 | * -
175 | * If a cutoff of 2n is desired, for
0 <= n < 31
,
176 | * this value should be n + 1
.
177 | *
178 | *
179 | * @param sparseEnabled whether {@link HLLType#SPARSE}
180 | * should be used in the promotion hierarchy to improve HLL
181 | * storage.
182 | *
183 | * @return the packed cutoff byte
184 | */
185 | public static byte packCutoffByte(final int explicitCutoff, final boolean sparseEnabled) {
186 | final int sparseBit = (sparseEnabled ? (1 << EXPLICIT_CUTOFF_BITS) : 0);
187 | return (byte)(sparseBit | (EXPLICIT_CUTOFF_MASK & explicitCutoff));
188 | }
189 |
190 | /**
191 | * Generates a byte that encodes the parameters of a
192 | * {@link HLLType#FULL} or {@link HLLType#SPARSE}
193 | * HLL.
194 | *
195 | * The top 3 bits are used to encode registerWidth - 1
196 | * (range of registerWidth
is thus 1-9) and the bottom 5
197 | * bits are used to encode registerCountLog2
198 | * (range of registerCountLog2
is thus 0-31).
199 | *
200 | * @param registerWidth the register width (must be at least 1 and at
201 | * most 9)
202 | * @param registerCountLog2 the log-base-2 of the register count (must
203 | * be at least 0 and at most 31)
204 | * @return the packed parameters byte
205 | */
206 | public static byte packParametersByte(final int registerWidth, final int registerCountLog2) {
207 | final int widthBits = ((registerWidth - 1) & REGISTER_WIDTH_MASK);
208 | final int countBits = (registerCountLog2 & LOG2_REGISTER_COUNT_MASK);
209 | return (byte)((widthBits << LOG2_REGISTER_COUNT_BITS) | countBits);
210 | }
211 |
212 | /**
213 | * Extracts the 'sparse-enabled' boolean from the cutoff byte of a serialized
214 | * HLL.
215 | *
216 | * @param cutoffByte the cutoff byte of the serialized HLL
217 | * @return the 'sparse-enabled' boolean
218 | */
219 | public static boolean sparseEnabled(final byte cutoffByte) {
220 | return ((cutoffByte >>> EXPLICIT_CUTOFF_BITS) & 1) == 1;
221 | }
222 |
223 | /**
224 | * Extracts the explicit cutoff value from the cutoff byte of a serialized
225 | * HLL.
226 | *
227 | * @param cutoffByte the cutoff byte of the serialized HLL
228 | * @return the explicit cutoff value
229 | */
230 | public static int explicitCutoff(final byte cutoffByte) {
231 | return (cutoffByte & EXPLICIT_CUTOFF_MASK);
232 | }
233 |
234 | /**
235 | * Extracts the schema version from the version byte of a serialized
236 | * HLL.
237 | *
238 | * @param versionByte the version byte of the serialized HLL
239 | * @return the schema version of the serialized HLL
240 | */
241 | public static int schemaVersion(final byte versionByte) {
242 | return NIBBLE_MASK & (versionByte >>> NIBBLE_BITS);
243 | }
244 |
245 | /**
246 | * Extracts the type ordinal from the version byte of a serialized HLL.
247 | *
248 | * @param versionByte the version byte of the serialized HLL
249 | * @return the type ordinal of the serialized HLL
250 | */
251 | public static int typeOrdinal(final byte versionByte) {
252 | return (versionByte & NIBBLE_MASK);
253 | }
254 |
255 | /**
256 | * Extracts the register width from the parameters byte of a serialized
257 | * {@link HLLType#FULL} HLL.
258 | *
259 | * @param parametersByte the parameters byte of the serialized HLL
260 | * @return the register width of the serialized HLL
261 | *
262 | * @see #packParametersByte(int, int)
263 | */
264 | public static int registerWidth(final byte parametersByte) {
265 | return ((parametersByte >>> LOG2_REGISTER_COUNT_BITS) & REGISTER_WIDTH_MASK) + 1;
266 | }
267 |
268 | /**
269 | * Extracts the log2(registerCount) from the parameters byte of a
270 | * serialized {@link HLLType#FULL} HLL.
271 | *
272 | * @param parametersByte the parameters byte of the serialized HLL
273 | * @return log2(registerCount) of the serialized HLL
274 | *
275 | * @see #packParametersByte(int, int)
276 | */
277 | public static int registerCountLog2(final byte parametersByte) {
278 | return (parametersByte & LOG2_REGISTER_COUNT_MASK);
279 | }
280 | }
281 |
--------------------------------------------------------------------------------
/src/main/java/net/agkn/hll/util/BitUtil.java:
--------------------------------------------------------------------------------
1 | package net.agkn.hll.util;
2 |
3 | /*
4 | * Copyright 2013 Aggregate Knowledge, Inc.
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 |
/**
 * A collection of bit utilities.
 *
 * @author rgrzywinski
 */
public class BitUtil {
    /**
     * Computes the least-significant bit of the specified {@code long}
     * that is set to {@code 1}. Zero-indexed.
     *
     * @param  value the {@code long} whose least-significant bit is desired.
     * @return the zero-based index of the least-significant set bit of the
     *         specified {@code long}. {@code -1} is returned if there are
     *         no bits set.
     */
    public static int leastSignificantBit(final long value) {
        // Long.numberOfTrailingZeros() reports 64 for zero, but the contract
        // of this method is -1 when no bit is set
        if(value == 0L) return -1/*by contract*/;
        return Long.numberOfTrailingZeros(value);
    }
}
--------------------------------------------------------------------------------
/src/main/java/net/agkn/hll/util/BitVector.java:
--------------------------------------------------------------------------------
1 | package net.agkn.hll.util;
2 |
3 | /*
4 | * Copyright 2013 Aggregate Knowledge, Inc.
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 |
19 | import net.agkn.hll.serialization.IWordSerializer;
20 |
21 | /**
22 | * A vector (array) of bits that is accessed in units ("registers") of width
23 | * bits which are stored as 64bit "words" ({@code long}s). In this context
24 | * a register is at most 64bits.
25 | *
26 | * @author rgrzywinski
27 | */
28 | public class BitVector implements Cloneable {
29 | // NOTE: in this context, a word is 64bits
30 |
31 | // rather than doing division to determine how a bit index fits into 64bit
32 | // words (i.e. longs), bit shifting is used
33 | private static final int LOG2_BITS_PER_WORD = 6/*=>64bits*/;
34 | private static final int BITS_PER_WORD = 1 << LOG2_BITS_PER_WORD;
35 | private static final int BITS_PER_WORD_MASK = BITS_PER_WORD - 1;
36 |
37 | // ditto from above but for bytes (for output)
38 | private static final int LOG2_BITS_PER_BYTE = 3/*=>8bits*/;
39 | public static final int BITS_PER_BYTE = 1 << LOG2_BITS_PER_BYTE;
40 |
41 | // ========================================================================
42 | public static final int BYTES_PER_WORD = 8/*8 bytes in a long*/;
43 |
44 | // ************************************************************************
45 | // 64bit words
46 | private final long[] words;
47 | public final long[] words() { return words; }
48 | public final int wordCount() { return words.length; }
49 | public final int byteCount() { return wordCount() * BYTES_PER_WORD; }
50 |
51 | // the width of a register in bits (this cannot be more than 64 (the word size))
52 | private final int registerWidth;
53 | public final int registerWidth() { return registerWidth; }
54 |
55 | private final long count;
56 |
57 | // ------------------------------------------------------------------------
58 | private final long registerMask;
59 |
60 | // ========================================================================
61 | /**
62 | * @param width the width of each register. This cannot be negative or
63 | * zero or greater than 63 (the signed word size).
64 | * @param count the number of registers. This cannot be negative or zero
65 | */
66 | public BitVector(final int width, final long count) {
67 | // ceil((width * count)/BITS_PER_WORD)
68 | this.words = new long[(int)(((width * count) + BITS_PER_WORD_MASK) >>> LOG2_BITS_PER_WORD)];
69 | this.registerWidth = width;
70 | this.count = count;
71 |
72 | this.registerMask = (1L << width) - 1;
73 | }
74 |
75 | // ========================================================================
76 | /**
77 | * @param registerIndex the index of the register whose value is to be
78 | * retrieved. This cannot be negative.
79 | * @return the value at the specified register index
80 | * @see #setRegister(long, long)
81 | * @see #setMaxRegister(long, long)
82 | */
83 | // NOTE: if this changes then setMaxRegister() must change
84 | public long getRegister(final long registerIndex) {
85 | final long bitIndex = registerIndex * registerWidth;
86 | final int firstWordIndex = (int)(bitIndex >>> LOG2_BITS_PER_WORD)/*aka (bitIndex / BITS_PER_WORD)*/;
87 | final int secondWordIndex = (int)((bitIndex + registerWidth - 1) >>> LOG2_BITS_PER_WORD)/*see above*/;
88 | final int bitRemainder = (int)(bitIndex & BITS_PER_WORD_MASK)/*aka (bitIndex % BITS_PER_WORD)*/;
89 |
90 | if(firstWordIndex == secondWordIndex)
91 | return ((words[firstWordIndex] >>> bitRemainder) & registerMask);
92 | /* else -- register spans words */
93 | return (words[firstWordIndex] >>> bitRemainder)/*no need to mask since at top of word*/
94 | | (words[secondWordIndex] << (BITS_PER_WORD - bitRemainder)) & registerMask;
95 | }
96 |
97 | /**
98 | * @param registerIndex the index of the register whose value is to be set.
99 | * This cannot be negative
100 | * @param value the value to set in the register
101 | * @see #getRegister(long)
102 | * @see #setMaxRegister(long, long)
103 | */
104 | // NOTE: if this changes then setMaxRegister() must change
105 | public void setRegister(final long registerIndex, final long value) {
106 | final long bitIndex = registerIndex * registerWidth;
107 | final int firstWordIndex = (int)(bitIndex >>> LOG2_BITS_PER_WORD)/*aka (bitIndex / BITS_PER_WORD)*/;
108 | final int secondWordIndex = (int)((bitIndex + registerWidth - 1) >>> LOG2_BITS_PER_WORD)/*see above*/;
109 | final int bitRemainder = (int)(bitIndex & BITS_PER_WORD_MASK)/*aka (bitIndex % BITS_PER_WORD)*/;
110 |
111 | final long words[] = this.words/*for convenience/performance*/;
112 | if(firstWordIndex == secondWordIndex) {
113 | // clear then set
114 | words[firstWordIndex] &= ~(registerMask << bitRemainder);
115 | words[firstWordIndex] |= (value << bitRemainder);
116 | } else {/*register spans words*/
117 | // clear then set each partial word
118 | words[firstWordIndex] &= (1L << bitRemainder) - 1;
119 | words[firstWordIndex] |= (value << bitRemainder);
120 |
121 | words[secondWordIndex] &= ~(registerMask >>> (BITS_PER_WORD - bitRemainder));
122 | words[secondWordIndex] |= (value >>> (BITS_PER_WORD - bitRemainder));
123 | }
124 | }
125 |
126 | // ------------------------------------------------------------------------
127 | /**
128 | * @return a LongIterator
for iterating starting at the register
129 | * with index zero. This will never be null
.
130 | */
131 | public LongIterator registerIterator() {
132 | return new LongIterator() {
133 | final int registerWidth = BitVector.this.registerWidth;
134 | final long[] words = BitVector.this.words;
135 | final long registerMask = BitVector.this.registerMask;
136 |
137 | // register setup
138 | long registerIndex = 0;
139 | int wordIndex = 0;
140 | int remainingWordBits = BITS_PER_WORD;
141 | long word = words[wordIndex];
142 |
143 | @Override public long next() {
144 | long register;
145 | if(remainingWordBits >= registerWidth) {
146 | register = word & registerMask;
147 |
148 | // shift to the next register
149 | word >>>= registerWidth;
150 | remainingWordBits -= registerWidth;
151 | } else { /*insufficient bits remaining in current word*/
152 | wordIndex++/*move to the next word*/;
153 |
154 | register = (word | (words[wordIndex] << remainingWordBits)) & registerMask;
155 |
156 | // shift to the next partial register (word)
157 | word = words[wordIndex] >>> (registerWidth - remainingWordBits);
158 | remainingWordBits += BITS_PER_WORD - registerWidth;
159 | }
160 | registerIndex++;
161 | return register;
162 | }
163 |
164 | @Override public boolean hasNext() {
165 | return registerIndex < count;
166 | }
167 | };
168 | }
169 |
170 | // ------------------------------------------------------------------------
171 | // composite accessors
172 | /**
173 | * Sets the value of the specified index register if and only if the specified
174 | * value is greater than the current value in the register. This is equivalent
175 | * to but much more performant than:
176 | *
177 | * vector.setRegister(index, Math.max(vector.getRegister(index), value));
178 | *
179 | * @param registerIndex the index of the register whose value is to be set.
180 | * This cannot be negative
181 | * @param value the value to set in the register if and only if this value
182 | * is greater than the current value in the register
183 | * @return true
if and only if the specified value is greater
184 | * than or equal to the current register value. false
185 | * otherwise.
186 | * @see #getRegister(long)
187 | * @see #setRegister(long, long)
188 | * @see java.lang.Math#max(long, long)
189 | */
190 | // NOTE: if this changes then setRegister() must change
191 | public boolean setMaxRegister(final long registerIndex, final long value) {
192 | final long bitIndex = registerIndex * registerWidth;
193 | final int firstWordIndex = (int)(bitIndex >>> LOG2_BITS_PER_WORD)/*aka (bitIndex / BITS_PER_WORD)*/;
194 | final int secondWordIndex = (int)((bitIndex + registerWidth - 1) >>> LOG2_BITS_PER_WORD)/*see above*/;
195 | final int bitRemainder = (int)(bitIndex & BITS_PER_WORD_MASK)/*aka (bitIndex % BITS_PER_WORD)*/;
196 |
197 | // NOTE: matches getRegister()
198 | final long registerValue;
199 | final long words[] = this.words/*for convenience/performance*/;
200 | if(firstWordIndex == secondWordIndex)
201 | registerValue = ((words[firstWordIndex] >>> bitRemainder) & registerMask);
202 | else /*register spans words*/
203 | registerValue = (words[firstWordIndex] >>> bitRemainder)/*no need to mask since at top of word*/
204 | | (words[secondWordIndex] << (BITS_PER_WORD - bitRemainder)) & registerMask;
205 |
206 | // determine which is the larger and update as necessary
207 | if(value > registerValue) {
208 | // NOTE: matches setRegister()
209 | if(firstWordIndex == secondWordIndex) {
210 | // clear then set
211 | words[firstWordIndex] &= ~(registerMask << bitRemainder);
212 | words[firstWordIndex] |= (value << bitRemainder);
213 | } else {/*register spans words*/
214 | // clear then set each partial word
215 | words[firstWordIndex] &= (1L << bitRemainder) - 1;
216 | words[firstWordIndex] |= (value << bitRemainder);
217 |
218 | words[secondWordIndex] &= ~(registerMask >>> (BITS_PER_WORD - bitRemainder));
219 | words[secondWordIndex] |= (value >>> (BITS_PER_WORD - bitRemainder));
220 | }
221 | } /* else -- the register value is greater (or equal) so nothing needs to be done */
222 |
223 | return (value >= registerValue);
224 | }
225 |
226 | // ========================================================================
227 | /**
228 | * Fills this bit vector with the specified bit value. This can be used to
229 | * clear the vector by specifying 0
.
230 | *
231 | * @param value the value to set all bits to (only the lowest bit is used)
232 | */
233 | public void fill(final long value) {
234 | for(long i=0; inull.
244 | */
245 | public void getRegisterContents(final IWordSerializer serializer) {
246 | for(final LongIterator iter = registerIterator(); iter.hasNext();) {
247 | serializer.writeWord(iter.next());
248 | }
249 | }
250 |
251 | /**
252 | * Creates a deep copy of this vector.
253 | *
254 | * @see java.lang.Object#clone()
255 | */
256 | @Override
257 | public BitVector clone() {
258 | final BitVector copy = new BitVector(registerWidth, count);
259 | System.arraycopy(words, 0, copy.words, 0, words.length);
260 | return copy;
261 | }
262 | }
--------------------------------------------------------------------------------
/src/main/java/net/agkn/hll/util/HLLUtil.java:
--------------------------------------------------------------------------------
1 | package net.agkn.hll.util;
2 |
3 | /*
4 | * Copyright 2013 Aggregate Knowledge, Inc.
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 |
19 | import net.agkn.hll.HLL;
20 |
21 | /**
22 | * Static functions for computing constants and parameters used in the HLL
23 | * algorithm.
24 | *
25 | * @author timon
26 | */
27 | public final class HLLUtil {
/**
 * Precomputed {@code pwMaxMask} values indexed by {@code registerSizeInBits}
 * (0 through 8). Each entry is calculated with this formula:
 * <pre>
 *     int maxRegisterValue = (1 << registerSizeInBits) - 1;
 *     // Mask with all bits set except for (maxRegisterValue - 1) least significant bits (see #addRaw())
 *     return ~((1L << (maxRegisterValue - 1)) - 1);
 * </pre>
 *
 * NOTE:  Java uses only the low six bits of the shift distance for a
 *        {@code long} shift, so the shift distances for register sizes
 *        0, 7 and 8 (-1, 126 and 254) wrap around modulo 64.
 *
 * @see #pwMaxMask(int)
 */
private static final long[] PW_MASK = new long[9/*register sizes 0 through 8*/];

static {
    for(int registerSizeInBits = 0; registerSizeInBits < PW_MASK.length; registerSizeInBits++) {
        final int maxRegisterValue = (1 << registerSizeInBits) - 1;
        PW_MASK[registerSizeInBits] = ~((1L << (maxRegisterValue - 1)) - 1);
    }
}
50 |
/**
 * Precomputed {@code twoToL} values indexed by a linear combination of
 * {@code regWidth} and {@code log2m}.
 *
 * The array is one-dimensional and can be accessed by using index
 * {@code (REG_WIDTH_INDEX_MULTIPLIER * regWidth) + log2m}
 * for {@code regWidth} and {@code log2m} between the specified
 * {@code HLL.{MINIMUM,MAXIMUM}_{REGWIDTH,LOG2M}_PARAM} constants.
 *
 * @see #largeEstimator(int, int, double)
 * @see #largeEstimatorCutoff(int, int)
 * @see "Blog post with section on 2^L"
 */
private static final double[] TWO_TO_L = new double[(HLL.MAXIMUM_REGWIDTH_PARAM + 1) * (HLL.MAXIMUM_LOG2M_PARAM + 1)];

/**
 * Spacing constant used to compute offsets into {@link #TWO_TO_L}.
 */
private static final int REG_WIDTH_INDEX_MULTIPLIER = HLL.MAXIMUM_LOG2M_PARAM + 1;

static {
    for(int regWidth = HLL.MINIMUM_REGWIDTH_PARAM; regWidth <= HLL.MAXIMUM_REGWIDTH_PARAM; regWidth++) {
        // Since 1 is added to p(w) in the insertion algorithm, only
        // (maxRegisterValue - 1) bits are inspected hence the hash
        // space is one power of two smaller.
        final int maxRegisterValue = (1 << regWidth) - 1;
        final int pwBits = (maxRegisterValue - 1);

        // start of the slots that belong to this register width
        final int rowOffset = (REG_WIDTH_INDEX_MULTIPLIER * regWidth);
        for(int log2m = HLL.MINIMUM_LOG2M_PARAM; log2m <= HLL.MAXIMUM_LOG2M_PARAM; log2m++) {
            final int totalBits = (pwBits + log2m);
            TWO_TO_L[rowOffset + log2m] = Math.pow(2, totalBits);
        }
    }
}
86 |
87 | // ************************************************************************
88 | /**
89 | * Computes the bit-width of HLL registers necessary to estimate a set of
90 | * the specified cardinality.
91 | *
92 | * @param expectedUniqueElements an upper bound on the number of unique
93 | * elements that are expected. This must be greater than zero.
94 | * @return a register size in bits (i.e. log2(log2(n))
)
95 | */
96 | public static int registerBitSize(final long expectedUniqueElements) {
97 | return Math.max(HLL.MINIMUM_REGWIDTH_PARAM,
98 | (int)Math.ceil(NumberUtil.log2(NumberUtil.log2(expectedUniqueElements))));
99 | }
100 |
101 | // ========================================================================
102 | /**
103 | * Computes the 'alpha-m-squared' constant used by the HyperLogLog algorithm.
104 | *
105 | * @param m this must be a power of two, cannot be less than
106 | * 16 (24), and cannot be greater than 65536 (216).
107 | * @return gamma times registerCount
squared where gamma is
108 | * based on the value of registerCount
.
109 | * @throws IllegalArgumentException if registerCount
is less
110 | * than 16.
111 | */
112 | public static double alphaMSquared(final int m) {
113 | switch(m) {
114 | case 1/*2^0*/:
115 | case 2/*2^1*/:
116 | case 4/*2^2*/:
117 | case 8/*2^3*/:
118 | throw new IllegalArgumentException("'m' cannot be less than 16 (" + m + " < 16).");
119 |
120 | case 16/*2^4*/:
121 | return 0.673 * m * m;
122 |
123 | case 32/*2^5*/:
124 | return 0.697 * m * m;
125 |
126 | case 64/*2^6*/:
127 | return 0.709 * m * m;
128 |
129 | default/*>2^6*/:
130 | return (0.7213 / (1.0 + 1.079 / m)) * m * m;
131 | }
132 | }
133 |
134 | // ========================================================================
135 | /**
136 | * Computes a mask that prevents overflow of HyperLogLog registers.
137 | *
138 | * @param registerSizeInBits the size of the HLL registers, in bits.
139 | * @return mask a long
mask to prevent overflow of the registers
140 | * @see #registerBitSize(long)
141 | */
142 | public static long pwMaxMask(final int registerSizeInBits) {
143 | return PW_MASK[registerSizeInBits];
144 | }
145 |
146 | // ========================================================================
147 | /**
148 | * The cutoff for using the "small range correction" formula, in the
149 | * HyperLogLog algorithm.
150 | *
151 | * @param m the number of registers in the HLL. m in the paper.
152 | * @return the cutoff for the small range correction.
153 | * @see #smallEstimator(int, int)
154 | */
155 | public static double smallEstimatorCutoff(final int m) {
156 | return ((double)m * 5) / 2;
157 | }
158 |
159 |     /**
160 |      * Applies the "small range correction" formula of the HyperLogLog algorithm,
161 |      * <code>m * ln(m / V)</code>. Only appropriate if both the estimator is smaller
162 |      * than <code>(5/2) * m</code> and there are still registers with value zero.
163 |      *
164 |      * @param  m the number of registers in the HLL. <em>m</em> in the paper.
165 |      * @param  numberOfZeroes the number of registers with value zero. <em>V</em>
166 |      *         in the paper.
167 |      * @return a corrected cardinality estimate.
168 |      */
169 |     public static double smallEstimator(final int m, final int numberOfZeroes) {
170 |         return Math.log((double)m / numberOfZeroes) * m;
171 |     }
172 |
173 |     /**
174 |      * Computes the threshold above which the "large range correction" formula of
175 |      * the HyperLogLog algorithm, adapted for 64 bit hashes, is used.
176 |      * @param  log2m log-base-2 of the number of registers in the HLL. <em>b</em> in the paper.
177 |      * @param  registerSizeInBits the size of the HLL registers, in bits.
178 |      * @return the cutoff for the large range correction: the matching <code>TWO_TO_L</code> entry divided by 30.
179 |      * @see #largeEstimator(int, int, double)
180 |      * @see "Blog post with section on 64 bit hashes and 'large range correction' cutoff"
181 |      */
182 |     public static double largeEstimatorCutoff(final int log2m, final int registerSizeInBits) {
183 |         final int tableIndex = (REG_WIDTH_INDEX_MULTIPLIER * registerSizeInBits) + log2m;
184 |         return TWO_TO_L[tableIndex] / 30.0;
185 |     }
186 |
187 |     /**
188 |      * Applies the "large range correction" formula of the HyperLogLog algorithm,
189 |      * adapted for 64 bit hashes. Only appropriate for estimators whose value
190 |      * exceeds the return of {@link #largeEstimatorCutoff(int, int)}.
191 |      * @param  log2m log-base-2 of the number of registers in the HLL. <em>b</em> in the paper.
192 |      * @param  registerSizeInBits the size of the HLL registers, in bits.
193 |      * @param  estimator the original estimator ("E" in the paper).
194 |      * @return a corrected cardinality estimate.
195 |      * @see "Blog post with section on 64 bit hashes and 'large range correction'"
196 |      */
197 |     public static double largeEstimator(final int log2m, final int registerSizeInBits, final double estimator) {
198 |         final double hashSpace = TWO_TO_L[(REG_WIDTH_INDEX_MULTIPLIER * registerSizeInBits) + log2m];
199 |         final double ratio = estimator / hashSpace;
200 |         return -hashSpace * Math.log(1.0 - ratio);
201 |     }
202 | }
203 |
--------------------------------------------------------------------------------
/src/main/java/net/agkn/hll/util/LongIterator.java:
--------------------------------------------------------------------------------
1 | package net.agkn.hll.util;
2 |
3 | /*
4 | * Copyright 2013 Aggregate Knowledge, Inc.
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 |
19 | /**
20 | * A <code>long</code>-based iterator. This is not is-a {@link java.util.Iterator}
21 | * to prevent autoboxing between <code>Long</code> and <code>long</code>.
22 | *
23 | * @author rgrzywinski
24 | */
25 | public interface LongIterator {
26 | /**
27 | * @return <code>true</code> if and only if there are more elements to
28 | * iterate over. <code>false</code> otherwise.
29 | */
30 | boolean hasNext();
31 |
32 | /**
33 | * @return the next <code>long</code> in the collection.
34 | */
35 | long next();
36 | }
--------------------------------------------------------------------------------
/src/main/java/net/agkn/hll/util/NumberUtil.java:
--------------------------------------------------------------------------------
1 | package net.agkn.hll.util;
2 |
3 | /*
4 | * Copyright 2013 Aggregate Knowledge, Inc.
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 |
19 | /**
20 | * A collection of utilities to work with numbers.
21 | *
22 | * @author rgrzywinski
23 | */
24 | public class NumberUtil {
25 | // loge(2) (log-base e of 2)
26 | public static final double LOGE_2 = 0.6931471805599453;
27 |
28 | // ************************************************************************
29 |     /**
30 |      * Computes the log<sub>2</sub> (log-base-two) of the specified value by
31 |      * dividing its natural logarithm by <code>LOGE_2</code>.
32 |      *
33 |      * @param  value the <code>double</code> for which the log<sub>2</sub> is desired.
34 |      * @return the log<sub>2</sub> of the specified value
35 |      */
36 |     public static double log2(final double value) {
37 |         final double naturalLog = Math.log(value);
38 |         return naturalLog / LOGE_2/*conversion of bases, REF: http://en.wikipedia.org/wiki/Logarithmic_scale*/;
39 |     }
40 |
41 | // ========================================================================
42 | // the hex characters
43 | private static final char[] HEX = { '0', '1', '2', '3', '4', '5', '6', '7',
44 | '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };
45 |
46 | // ------------------------------------------------------------------------
47 | /**
48 | * Converts the specified array of <code>byte</code>s into a string of
49 | * hex characters (low <code>byte</code> first).
50 | *
51 | * @param bytes the array of <code>byte</code>s that are to be converted.
52 | * This cannot be <code>null</code> though it may be empty.
53 | * @param offset the offset in <code>bytes</code> at which the bytes will
54 | * be taken. This cannot be negative and must be less than
55 | * <code>bytes.length - 1</code>.
56 | * @param count the number of bytes to be retrieved from the specified array.
57 | * This cannot be negative. If greater than <code>bytes.length - offset</code>
58 | * then that value is used.
59 | * @return a string of at most <code>count</code> characters that represents
60 | * the specified byte array in hex. This will never be <code>null</code>
61 | * though it may be empty if <code>bytes</code> is empty or <code>count</code>
62 | * is zero.
63 | * @throws IllegalArgumentException if <code>offset</code> is greater than
64 | * or equal to <code>bytes.length</code>.
65 | * @see #fromHex(String, int, int)
66 | */
67 | public static String toHex(final byte[] bytes, final int offset, final int count) {
68 | if(offset >= bytes.length) throw new IllegalArgumentException("Offset is greater than the length (" + offset + " >= " + bytes.length + ").")/*by contract*/;
69 | final int byteCount = Math.min( (bytes.length - offset), count);
70 | final int upperBound = byteCount + offset;
71 |
72 | final char[] chars = new char[byteCount * 2/*two chars per byte*/];
73 | int charIndex = 0;
74 | for(int i=offset; i>> 4) & 0x0F];
77 | chars[charIndex++] = HEX[value & 0x0F];
78 | }
79 |
80 | return new String(chars);
81 | }
82 |
83 | /**
84 | * Converts the specified array of hex characters into an array of <code>byte</code>s
85 | * (low <code>byte</code> first).
86 | *
87 | * @param string the string of hex characters to be converted into <code>byte</code>s.
88 | * This cannot be <code>null</code> though it may be blank.
89 | * @param offset the offset in the string at which the characters will be
90 | * taken. This cannot be negative and must be less than <code>string.length() - 1</code>.
91 | * @param count the number of characters to be retrieved from the specified
92 | * string. This cannot be negative and must be divisible by two
93 | * (since there are two characters per <code>byte</code>).
94 | * @return the array of <code>byte</code>s that were converted from the
95 | * specified string (in the specified range). This will never be
96 | * <code>null</code> though it may be empty if <code>string</code>
97 | * is empty or <code>count</code> is zero.
98 | * @throws IllegalArgumentException if <code>offset</code> is greater than
99 | * or equal to <code>string.length()</code> or if <code>count</code>
100 | * is not divisible by two.
101 | * @see #toHex(byte[], int, int)
102 | */
103 | public static byte[] fromHex(final String string, final int offset, final int count) {
104 | if(offset >= string.length()) throw new IllegalArgumentException("Offset is greater than the length (" + offset + " >= " + string.length() + ").")/*by contract*/;
105 | if( (count & 0x01) != 0) throw new IllegalArgumentException("Count is not divisible by two (" + count + ").")/*by contract*/;
106 | final int charCount = Math.min((string.length() - offset), count);
107 | final int upperBound = offset + charCount;
108 |
109 | final byte[] bytes = new byte[charCount >>> 1/*aka /2*/];
110 | int byteIndex = 0/*beginning*/;
111 | for(int i=offset; ibyte.
122 | * This cannot be a character other than [a-fA-F0-9].
123 | * @return the value of the specified character. This will be a value 0
124 | * through 15
.
125 | * @throws IllegalArgumentException if the specified character is not in
126 | * [a-fA-F0-9]
127 | */
128 |     private static final int digit(final char character) {
129 |         // Plain ASCII range checks are used on purpose: only [a-fA-F0-9] is
130 |         // accepted, every other character is rejected below.
131 |         if((character >= '0') && (character <= '9')) {
132 |             return character - '0'/*0 through 9*/;
133 |         }
134 |         if((character >= 'a') && (character <= 'f')) {
135 |             return (character - 'a') + 10/*10 through 15*/;
136 |         }
137 |         if((character >= 'A') && (character <= 'F')) {
138 |             return (character - 'A') + 10/*10 through 15*/;
139 |         }
140 |
141 |         throw new IllegalArgumentException("Character is not in [a-fA-F0-9] ('" + character + "').");
142 |     }
173 | }
--------------------------------------------------------------------------------
/src/test/java/net/agkn/hll/ExplicitHLLTest.java:
--------------------------------------------------------------------------------
1 | package net.agkn.hll;
2 |
3 | /*
4 | * Copyright 2013 Aggregate Knowledge, Inc.
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 |
19 | import static org.powermock.reflect.Whitebox.getInternalState;
20 | import static org.testng.Assert.assertEquals;
21 | import static org.testng.Assert.assertTrue;
22 | import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
23 |
24 | import java.util.HashSet;
25 | import java.util.Random;
26 |
27 | import net.agkn.hll.serialization.ISchemaVersion;
28 | import net.agkn.hll.serialization.SerializationUtil;
29 | import org.testng.annotations.Test;
30 |
31 | /**
32 | * Tests {@link HLL} of type {@link HLLType#EXPLICIT}.
33 | *
34 | * @author timon
35 | */
36 | public class ExplicitHLLTest {
37 |     /** Tests basic set semantics of {@link HLL#addRaw(long)}. */
38 |     @Test
39 |     public void addBasicTest() {
40 |         { // Adding a single positive value to an empty set should work.
41 |             final HLL hll = newHLL(128/*arbitrary*/);
42 |             hll.addRaw(1L/*positive*/);
43 |             assertEquals(hll.cardinality(), 1L);
44 |         }
45 |         { // Adding a single negative value to an empty set should work.
46 |             final HLL hll = newHLL(128/*arbitrary*/);
47 |             hll.addRaw(-1L/*negative*/);
48 |             assertEquals(hll.cardinality(), 1L);
49 |         }
50 |         { // Adding a duplicate value to a set should be a no-op.
51 |             final HLL hll = newHLL(128/*arbitrary*/);
52 |             hll.addRaw(1L/*positive*/);
53 |             assertEquals(hll.cardinality(), 1L/*arbitrary*/);
54 |             // add the same value a second time so the check below sees a real duplicate
55 |             hll.addRaw(1L/*dupe*/);
56 |             assertEquals(hll.cardinality(), 1L/*dupe*/);
57 |         }
58 |     }
59 |
60 | // ------------------------------------------------------------------------
61 |     /**
62 |      * Tests {@link HLL#union(HLL)} for disjoint sets, for overlapping sets and
63 |      * for sets whose union outgrows the explicit threshold.
64 |      */
65 |     @Test
66 |     public void unionTest() {
67 |         final int explicitThreshold = 128/*arbitrary*/;
68 |         {// Unioning two distinct sets should work
69 |             final HLL target = newHLL(explicitThreshold);
70 |             final HLL other = newHLL(explicitThreshold);
71 |             target.addRaw(1L);
72 |             target.addRaw(2L);
73 |             other.addRaw(3L);
74 |             target.union(other);
75 |             assertEquals(target.cardinality(), 3);
76 |         }
77 |         {// Unioning two sets whose union doesn't exceed the cardinality cap should not promote
78 |             final HLL target = newHLL(explicitThreshold);
79 |             final HLL other = newHLL(explicitThreshold);
80 |             target.addRaw(1L);
81 |             target.addRaw(2L);
82 |             other.addRaw(1L);
83 |             target.union(other);
84 |             assertEquals(target.cardinality(), 2);
85 |         }
86 |         {// unioning two sets whose union exceeds the cardinality cap should promote
87 |             final HLL target = newHLL(explicitThreshold);
88 |             final HLL other = newHLL(explicitThreshold);
89 |
90 |             // fill up both sets to explicitThreshold with disjoint values
91 |             for(long value=0; value<explicitThreshold; value++) {
92 |                 target.addRaw(value);
93 |                 other.addRaw(value + explicitThreshold);
94 |             }
95 |
96 |             target.union(other);
97 |             assertEquals(target.getType(), HLLType.SPARSE);
98 |         }
99 |     }
100 |
101 | // ------------------------------------------------------------------------
102 |     /**
103 |      * Tests {@link HLL#clear()}: the cardinality drops from one back to zero.
104 |      */
105 |     @Test
106 |     public void clearTest() {
107 |         final HLL explicitSet = newHLL(128/*arbitrary*/);
108 |         explicitSet.addRaw(1L);
109 |         assertEquals(explicitSet.cardinality(), 1L);
110 |         explicitSet.clear();
111 |         assertEquals(explicitSet.cardinality(), 0L);
112 |     }
113 |
114 | // ------------------------------------------------------------------------
115 | /**
116 | * Tests {@link LongSetSlab#toBytes(int, ISchemaVersion)} and
117 | * {@link LongSetSlab#fromBytes(int, byte[], ISchemaVersion)}.
118 | */
119 | @Test
120 | public void toFromBytesTest() {
121 | final ISchemaVersion schemaVersion = SerializationUtil.DEFAULT_SCHEMA_VERSION;
122 | final HLLType type = HLLType.EXPLICIT;
123 | final int padding = schemaVersion.paddingBytes(type);
124 | final int bytesPerWord = 8;
125 |
126 | {// Should work on an empty set
127 | final HLL hll = newHLL(128/*arbitrary*/);
128 |
129 | final byte[] bytes = hll.toBytes(schemaVersion);
130 |
131 | // assert output has correct byte length
132 | assertEquals(bytes.length, padding/*no elements, just padding*/);
133 |
134 | final HLL inHLL = HLL.fromBytes(bytes);
135 |
136 | assertElementsEqual(hll, inHLL);
137 | }
138 | {// Should work on a partially filled set
139 | final HLL hll = newHLL(128/*arbitrary*/);
140 |
141 | for(int i=0; i<3; i++) {
142 | hll.addRaw(i);
143 | }
144 |
145 | final byte[] bytes = hll.toBytes(schemaVersion);
146 |
147 | // assert output has correct byte length
148 | assertEquals(bytes.length, padding + (bytesPerWord * 3/*elements*/));
149 |
150 | final HLL inHLL = HLL.fromBytes(bytes);
151 |
152 | assertElementsEqual(hll, inHLL);
153 | }
154 | {// Should work on a full set
155 | final int explicitThreshold = 128;
156 | final HLL hll = newHLL(explicitThreshold);
157 |
158 | for(int i=0; i canonical = new HashSet();
181 | final HLL hll = newHLL(explicitThreshold);
182 |
183 | final long seed = 1L/*constant so results are reproducible*/;
184 | final Random random = new Random(seed);
185 | for(int i=0;i explicitThreshold = 8*/, false/*sparseon*/, HLLType.EXPLICIT);
211 |
212 | for(int i=0;i<9/* > explicitThreshold */;i++){
213 | hll.addRaw(i);
214 | }
215 | assertEquals(hll.getType(), HLLType.FULL);
216 | }
217 | }
218 |
219 | // ************************************************************************
220 | // assertion helpers
221 |     /**
222 |      * Asserts that the "explicitStorage" sets of both HLLs are exactly equal.
223 |      */
224 |     private static void assertElementsEqual(final HLL hllA, final HLL hllB) {
225 |         final LongOpenHashSet expected = (LongOpenHashSet)getInternalState(hllA, "explicitStorage");
226 |         final LongOpenHashSet actual = (LongOpenHashSet)getInternalState(hllB, "explicitStorage");
227 |         final boolean sameElements = expected.equals(actual);
228 |         assertTrue(sameElements);
229 |     }
230 |
231 | /**
232 | * Builds a {@link HLLType#EXPLICIT} {@link HLL} instance with the specified
233 | * explicit threshold.
234 | *
235 | * @param explicitThreshold explicit threshold to use for the constructed
236 | * {@link HLL}. This must be greater than zero.
237 | * @return a default-sized {@link HLLType#EXPLICIT} empty {@link HLL} instance.
238 | * This will never be null
.
239 | */
240 | private static HLL newHLL(final int explicitThreshold) {
241 | return new HLL(11/*log2m, unused*/, 5/*regwidth, unused*/, explicitThreshold, 256/*sparseThreshold, arbitrary, unused*/, HLLType.EXPLICIT);
242 | }
243 | }
--------------------------------------------------------------------------------
/src/test/java/net/agkn/hll/FullHLLTest.java:
--------------------------------------------------------------------------------
1 | package net.agkn.hll;
2 |
3 | /*
4 | * Copyright 2013 Aggregate Knowledge, Inc.
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 |
19 | import static org.powermock.reflect.Whitebox.getInternalState;
20 | import static org.testng.Assert.assertEquals;
21 | import static org.testng.Assert.assertTrue;
22 | import static org.testng.Assert.assertFalse;
23 |
24 | import net.agkn.hll.serialization.ISchemaVersion;
25 | import net.agkn.hll.serialization.SerializationUtil;
26 | import net.agkn.hll.util.BitVector;
27 | import net.agkn.hll.util.HLLUtil;
28 | import net.agkn.hll.util.LongIterator;
29 |
30 | import org.testng.annotations.Test;
31 |
32 | /**
33 | * Tests {@link HLL} of type {@link HLLType#FULL}.
34 | *
35 | * @author rgrzywinski
36 | * @author timon
37 | */
38 | public class FullHLLTest {
39 | // TODO union test
40 |     /**
41 |      * Smoke test for {@link HLL#cardinality()} and the proper use of the
42 |      * small range correction.
43 |      */
44 |     @Test
45 |     public void smallRangeSmokeTest() {
46 |         final int log2m = 11;
47 |         final int regwidth = 5;
48 |         final int registerCount = (1 << log2m)/*m*/;
49 |
50 |         // only one register set
51 |         {
52 |             final HLL singleRegister = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
53 |             singleRegister.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 0/*ix*/, 1/*val*/));
54 |
55 |             // Trivially true that small correction conditions hold: one register
56 |             // set implies zeroes exist, and estimator trivially smaller than 5m/2.
57 |             // Small range correction: m * log(m/V)
58 |             final int zeroes = registerCount - 1;
59 |             final double corrected = registerCount * Math.log((double)registerCount / zeroes);
60 |             assertEquals(singleRegister.cardinality(), (long)Math.ceil(corrected));
61 |         }
62 |
63 |         // all but one register set
64 |         {
65 |             final HLL nearlyFull = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
66 |             for(int ix=0; ix<(registerCount - 1); ix++) {
67 |                 nearlyFull.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, ix, 1/*val*/));
68 |             }
69 |
70 |             // Trivially true that small correction conditions hold: all but
71 |             // one register set implies a zero exists, and estimator trivially
72 |             // smaller than 5m/2 since it's alpha / ((m-1)/2)
73 |             // Small range correction: m * log(m/V)
74 |             final int zeroes = 1;
75 |             final double corrected = registerCount * Math.log((double)registerCount / zeroes);
76 |             assertEquals(nearlyFull.cardinality(), (long)Math.ceil(corrected));
77 |         }
78 |     }
81 |
82 | /**
83 | * Smoke test for {@link HLL#cardinality()} and the proper use of the
84 | * uncorrected estimator
85 | */
86 | @Test
87 | public void normalRangeSmokeTest() {
88 | final int log2m = 11;
89 | final int regwidth = 5;
90 | // regwidth = 5, so hash space is
91 | // log2m + (2^5 - 1 - 1), so L = log2m + 30
92 | final int l = log2m + 30;
93 | final int m = (1 << log2m);
94 | final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
95 |
96 | // all registers at 'medium' value
97 | {
98 | final int registerValue = 7/*chosen to ensure neither correction kicks in*/;
99 | for(int i=0; i (5 * m /(double)2));
112 |
113 | final long expected = (long)Math.ceil(estimator);
114 | assertEquals(cardinality, expected);
115 | }
116 | }
117 |
118 | /**
119 | * Smoke test for {@link HLL#cardinality()} and the proper use of the large
120 | * range correction.
121 | */
122 | @Test
123 | public void largeRangeSmokeTest() {
124 | final int log2m = 12;
125 | final int regwidth = 5;
126 | // regwidth = 5, so hash space is
127 | // log2m + (2^5 - 1 - 1), so L = log2m + 30
128 | final int l = log2m + 30;
129 | final int m = (1 << log2m);
130 | final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
131 |
132 | {
133 | final int registerValue = 31/*chosen to ensure large correction kicks in*/;
134 | for(int i=0; i Math.pow(2,l)/30);
147 |
148 | // Large range correction: -2^L * log(1 - E/2^L)
149 | final long expected = (long)Math.ceil(-1.0 * Math.pow(2, l) * Math.log(1.0 - estimator/Math.pow(2, l)));
150 | assertEquals(cardinality, expected);
151 | }
152 | }
153 |
154 | // ========================================================================
155 | /**
156 | * Tests the bounds on a register's value for a given raw input value.
157 | */
158 | @Test
159 | public void registerValueTest() {
160 | final int log2m = 4/*small enough to make testing easy (addRaw() shifts by one byte)*/;
161 |
162 | // register width 4 (the minimum size)
163 | { // scoped locally for sanity
164 | final int regwidth = 4;
165 | final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
166 | final BitVector bitVector = (BitVector)getInternalState(hll, "probabilisticStorage")/*for testing convenience*/;
167 |
168 | // lower-bounds of the register
169 | hll.addRaw(0x000000000000001L/*'j'=1*/);
170 | assertEquals(bitVector.getRegister(1/*'j'*/), 0);
171 |
172 | hll.addRaw(0x0000000000000012L/*'j'=2*/);
173 | assertEquals(bitVector.getRegister(2/*'j'*/), 1);
174 |
175 | hll.addRaw(0x0000000000000023L/*'j'=3*/);
176 | assertEquals(bitVector.getRegister(3/*'j'*/), 2);
177 |
178 | hll.addRaw(0x0000000000000044L/*'j'=4*/);
179 | assertEquals(bitVector.getRegister(4/*'j'*/), 3);
180 |
181 | hll.addRaw(0x0000000000000085L/*'j'=5*/);
182 | assertEquals(bitVector.getRegister(5/*'j'*/), 4);
183 |
184 | // upper-bounds of the register
185 | // NOTE: bear in mind that BitVector itself does ensure that
186 | // overflow of a register is prevented
187 | hll.addRaw(0x0000000000010006L/*'j'=6*/);
188 | assertEquals(bitVector.getRegister(6/*'j'*/), 13);
189 |
190 | hll.addRaw(0x0000000000020007L/*'j'=7*/);
191 | assertEquals(bitVector.getRegister(7/*'j'*/), 14);
192 |
193 | hll.addRaw(0x0000000000040008L/*'j'=8*/);
194 | assertEquals(bitVector.getRegister(8/*'j'*/), 15);
195 |
196 | hll.addRaw(0x0000000000080009L/*'j'=9*/);
197 | assertEquals(bitVector.getRegister(9/*'j'*/), 15/*overflow*/);
198 |
199 | // sanity checks to ensure that no other bits above the lowest-set
200 | // bit matters
201 | // NOTE: same as case 'j = 6' above
202 | hll.addRaw(0x000000000003000AL/*'j'=10*/);
203 | assertEquals(bitVector.getRegister(10/*'j'*/), 13);
204 |
205 | hll.addRaw(0x000000000011000BL/*'j'=11*/);
206 | assertEquals(bitVector.getRegister(11/*'j'*/), 13);
207 | }
208 |
209 | // register width 5
210 | { // scoped locally for sanity
211 | final int regwidth = 5;
212 | final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
213 | final BitVector bitVector = (BitVector)getInternalState(hll, "probabilisticStorage")/*for testing convenience*/;
214 |
215 | // lower-bounds of the register
216 | hll.addRaw(0x0000000000000001L/*'j'=1*/);
217 | assertEquals(bitVector.getRegister(1/*'j'*/), 0);
218 |
219 | hll.addRaw(0x0000000000000012L/*'j'=2*/);
220 | assertEquals(bitVector.getRegister(2/*'j'*/), 1);
221 |
222 | hll.addRaw(0x0000000000000023L/*'j'=3*/);
223 | assertEquals(bitVector.getRegister(3/*'j'*/), 2);
224 |
225 | hll.addRaw(0x0000000000000044L/*'j'=4*/);
226 | assertEquals(bitVector.getRegister(4/*'j'*/), 3);
227 |
228 | hll.addRaw(0x0000000000000085L/*'j'=5*/);
229 | assertEquals(bitVector.getRegister(5/*'j'*/), 4);
230 |
231 | // upper-bounds of the register
232 | // NOTE: bear in mind that BitVector itself does ensure that
233 | // overflow of a register is prevented
234 | hll.addRaw(0x0000000100000006L/*'j'=6*/);
235 | assertEquals(bitVector.getRegister(6/*'j'*/), 29);
236 |
237 | hll.addRaw(0x0000000200000007L/*'j'=7*/);
238 | assertEquals(bitVector.getRegister(7/*'j'*/), 30);
239 |
240 | hll.addRaw(0x0000000400000008L/*'j'=8*/);
241 | assertEquals(bitVector.getRegister(8/*'j'*/), 31);
242 |
243 | hll.addRaw(0x0000000800000009L/*'j'=9*/);
244 | assertEquals(bitVector.getRegister(9/*'j'*/), 31/*overflow*/);
245 | }
246 | }
247 |
248 | // ========================================================================
249 | /**
250 | * Tests {@link HLL#clear()}.
251 | */
252 | @Test
253 | public void clearTest() {
254 | final int regwidth = 5;
255 | final int log2m = 4/*16 registers per counter*/;
256 | final int m = 1 << log2m;
257 |
258 | final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
259 | final BitVector bitVector = (BitVector)getInternalState(hll, "probabilisticStorage")/*for testing convenience*/;
260 | for(int i=0; i SPARSE
140 | * - SPARSE U "underpopulated" FULL => SPARSE
141 | * - SPARSE U "barely underpopulated" FULL => FULL
142 | */
143 |     private static void sparseFullRepresentationTest(final ISchemaVersion schemaVersion) throws IOException {
144 |         // Writes the "sparse_full_representation" UNION test file: an EMPTY/EMPTY union
145 |         // line, then three lines pairing a FULL HLL with the SPARSE accumulator.
146 |         final FileWriter output = openOutput(schemaVersion, "sparse_full_representation", TestType.UNION);
147 |         try {
148 |             final HLL emptyHLL1 = newHLL(HLLType.EMPTY);
149 |             final HLL emptyHLL2 = newHLL(HLLType.EMPTY);
150 |             cumulativeUnionLine(output, emptyHLL1, emptyHLL2, schemaVersion);
151 |
152 |             // NOTE: In this test the sparseReference will be the "expected" value
153 |             //       from the C representation, since it doesn't choose representation
154 |             //       based on original encoding, but rather on the promotion rules
155 |             //       and the declared type of the "receiving" field.
156 |             //       It is the manually-constructed union result.
157 |
158 |             // "underpopulated" FULL U EMPTY => SPARSE
159 |             final HLL fullHLL = newHLL(HLLType.FULL);
160 |             fullHLL.addRaw(constructHLLValue(LOG2M, 0/*ix*/, 1/*val*/));
161 |             final HLL sparseHLL = newHLL(HLLType.SPARSE);
162 |             sparseHLL.addRaw(constructHLLValue(LOG2M, 0/*ix*/, 1/*val*/));
163 |             output.write(stringCardinality(fullHLL) + "," + toByteA(fullHLL, schemaVersion) + "," + stringCardinality(sparseHLL) + "," + toByteA(sparseHLL, schemaVersion) + "\n");
164 |             output.flush();
165 |
166 |             // "underpopulated" FULL (small) U SPARSE (small) => SPARSE
167 |             final HLL fullHLL2 = newHLL(HLLType.FULL);
168 |             fullHLL2.addRaw(constructHLLValue(LOG2M, 1/*ix*/, 1/*val*/));
169 |             sparseHLL.addRaw(constructHLLValue(LOG2M, 1/*ix*/, 1/*val*/));
170 |             output.write(stringCardinality(fullHLL2) + "," + toByteA(fullHLL2, schemaVersion) + "," + stringCardinality(sparseHLL) + "," + toByteA(sparseHLL, schemaVersion) + "\n");
171 |             output.flush();
172 |
173 |             // "underpopulated" FULL (just on edge) U SPARSE (small) => FULL
174 |             final HLL fullHLL3 = newHLL(HLLType.FULL);
175 |             for(int i=2; i<(SPARSE_THRESHOLD + 1); i++) {
176 |                 fullHLL3.addRaw(constructHLLValue(LOG2M, i/*ix*/, 1/*val*/));
177 |                 sparseHLL.addRaw(constructHLLValue(LOG2M, i/*ix*/, 1/*val*/));
178 |             }
179 |             output.write(stringCardinality(fullHLL3) + "," + toByteA(fullHLL3, schemaVersion) + "," + stringCardinality(sparseHLL) + "," + toByteA(sparseHLL, schemaVersion) + "\n");
180 |             output.flush();
181 |         } finally {
182 |             // release the file handle even when one of the writes above fails
183 |             output.close();
184 |         }
185 |     }
186 |
187 | /**
188 | * Cumulatively sets successive registers to:
189 | *
190 | * (registerIndex % REGISTER_MAX_VALUE) + 1
191 | *
192 | * by adding specifically constructed values to a SPARSE HLL.
193 | * Does not induce promotion.
194 | *
195 | * Format: cumulative add
196 | * Tests:
197 | * - SPARSE addition (predictable)
198 | */
199 | private static void sparseStepTest(final ISchemaVersion schemaVersion) throws IOException {
200 | final FileWriter output = openOutput(schemaVersion, "sparse_step", TestType.ADD);
201 |
202 | // the accumulator, starts empty sparse probabilistic
203 | final HLL hll = newHLL(HLLType.SPARSE);
204 | initLineAdd(output, hll, schemaVersion);
205 |
206 | for(int i=0; inull.
526 | * @return the algorithm-specific cardinality of the instance as a PostgreSQL-
527 | * compatible String. This will never be null
528 | */
529 |     private static String stringCardinality(final HLL hll) {
530 |         // EMPTY is always "0" and EXPLICIT (promotion has not yet occurred) uses
531 |         // cardinality(); SPARSE and FULL render the value of their own
532 |         // algorithm-specific cardinality method with Double.toString.
533 |         switch(hll.getType()) {
534 |             case EMPTY:    return "0";
535 |             case EXPLICIT: return Long.toString(hll.cardinality());
536 |             case SPARSE:   return Double.toString(hll.sparseProbabilisticAlgorithmCardinality());
537 |             case FULL:     return Double.toString(hll.fullProbabilisticAlgorithmCardinality());
538 |             default:       break/*reported below*/;
539 |         }
540 |
541 |         throw new RuntimeException("Unknown HLL type " + hll.getType());
542 |     }
543 |
544 | /**
545 | * Generates a random HLL and populates it with random values.
546 | *
547 | * @param random the {@link Random random number generator} used to populate
548 | * the HLL. This cannot be null
.
549 | * @return the populated HLL. This will never be null
.
550 | */
551 | public static HLL generateRandomHLL(final Random random) {
552 | final int randomTypeInt = random.nextInt(HLLType.values().length);
553 | final HLLType type;
554 | switch(randomTypeInt) {
555 | case 0:
556 | type = HLLType.EMPTY;
557 | break;
558 | case 1:
559 | type = HLLType.EXPLICIT;
560 | break;
561 | case 2:
562 | type = HLLType.FULL;
563 | break;
564 | case 3:
565 | type = HLLType.EMPTY;
566 | break;
567 | case 4:
568 | type = HLLType.SPARSE;
569 | break;
570 | default:
571 | throw new RuntimeException("Unassigned type int " + randomTypeInt);
572 | }
573 |
574 | final int cardinalityCap;
575 | final int cardinalityBaseline;
576 |
577 | switch(type) {
578 | case EMPTY:
579 | return newHLL(HLLType.EMPTY);
580 | case EXPLICIT:
581 | cardinalityCap = EXPLICIT_THRESHOLD;
582 | cardinalityBaseline = 1;
583 | break;
584 | case SPARSE:
585 | cardinalityCap = SPARSE_THRESHOLD;
586 | cardinalityBaseline = (EXPLICIT_THRESHOLD + 1);
587 | break;
588 | case FULL:
589 | cardinalityCap = 100000;
590 | cardinalityBaseline = (SPARSE_THRESHOLD*10);
591 | break;
592 | default:
593 | throw new RuntimeException("We should never be here.");
594 | }
595 |
596 | final HLL hll = newHLL(HLLType.EMPTY);
597 | for(int i=0; inull.
612 | * @param description Description string used to build the filename.
613 | * This cannot be null
.
614 | * @param type {@link TestType type} of the test file to be written.
615 | * This cannot be null
.
616 | * @return The opened {@link FileWriter writer}. This will never be null
.
617 | */
618 |     private static FileWriter openOutput(final ISchemaVersion schemaVersion, final String description, final TestType type) throws IOException {
619 |         // opens "v<schema version number>_<stem><description>.csv" in OUTPUT_DIRECTORY and writes the CSV header
620 |         final String versionTag = "v"+ schemaVersion.schemaVersionNumber() + "_";
621 |         final String csvHeader;
622 |         final String stem;
623 |         switch(type) {
624 |             case ADD:
625 |                 csvHeader = "cardinality,raw_value,HLL\n";
626 |                 stem = "cumulative_add_";
627 |                 break;
628 |             case UNION:
629 |                 csvHeader = "cardinality,HLL,union_cardinality,union_HLL\n";
630 |                 stem = "cumulative_union_";
631 |                 break;
632 |             default:
633 |                 throw new RuntimeException("Unknown test type " + type);
634 |         }
635 |         final FileWriter writer = new FileWriter(OUTPUT_DIRECTORY + (versionTag + stem + description + ".csv"));
636 |         writer.write(csvHeader);
637 |         writer.flush();
638 |         return writer;
639 |     }
640 |
641 |     /**
642 |      * Adds a raw value to the accumulator and appends the resulting
643 |      * {@link TestType#ADD}-formatted line (cardinality, raw value, HLL).
644 |      * @param  output the {@link FileWriter writer} receiving the line. This cannot be <code>null</code>.
645 |      * @param  hll the "accumulator" HLL; it is modified by this call. This cannot be <code>null</code>.
646 |      * @param  rawValue the raw value added to the accumulator before the line is written.
647 |      * @param  schemaVersion the schema with which to serialize the accumulator. This
648 |      *         cannot be <code>null</code>.
649 |      */
650 |     private static void cumulativeAddLine(final FileWriter output, final HLL hll, final long rawValue, final ISchemaVersion schemaVersion) throws IOException {
651 |         hll.addRaw(rawValue);  // the line describes the accumulator after the add
652 |
653 |         final String line = stringCardinality(hll) + "," + rawValue + "," + toByteA(hll, schemaVersion) + "\n";
654 |         output.write(line);
655 |         output.flush();
656 |     }
657 |
658 |     /**
659 |      * Writes the first line of a {@link TestType#ADD}-formatted test: zero in
660 |      * both the cardinality and raw-value columns, then the serialized HLL.
661 |      * @param  output the {@link FileWriter writer} receiving the line. This cannot be <code>null</code>.
662 |      * @param  hll the "accumulator" HLL instance. This cannot be <code>null</code>.
663 |      *         Nothing is added to it by this method.
664 |      * @param  schemaVersion the schema with which to serialize the HLL. This cannot
665 |      *         be <code>null</code>.
666 |      */
667 |     private static void initLineAdd(final FileWriter output, final HLL hll, final ISchemaVersion schemaVersion) throws IOException {
668 |         output.write("0,0," + toByteA(hll, schemaVersion) + "\n");
669 |         output.flush();
670 |     }
671 |
672 |     /**
673 |      * Unions an HLL into the accumulator and appends the resulting
674 |      * {@link TestType#UNION}-formatted line.
675 |      * @param  output the {@link FileWriter writer} receiving the line. This cannot be <code>null</code>.
676 |      * @param  hll the "accumulator" HLL; it is modified by this call. This cannot be <code>null</code>.
677 |      * @param  increment the HLL that is unioned into the accumulator and reported
678 |      *         in the first two columns of the line. This cannot be <code>null</code>.
679 |      * @param  schemaVersion the schema with which to serialize both HLLs. This cannot
680 |      *         be <code>null</code>.
681 |      */
682 |     private static void cumulativeUnionLine(final FileWriter output, final HLL hll, final HLL increment, final ISchemaVersion schemaVersion) throws IOException {
683 |         hll.union(increment);  // the accumulator columns reflect the state after the union
684 |         final String incrementCount = stringCardinality(increment);
685 |         final String accumulatorCount = stringCardinality(hll);
686 |         final String incrementColumns = incrementCount + "," + toByteA(increment, schemaVersion);
687 |         output.write(incrementColumns + "," + accumulatorCount + "," + toByteA(hll, schemaVersion) + "\n");
688 |         output.flush();
689 |     }
690 |
691 |     /**
692 |      * Renders a HLL in Postgres 9 'bytea' hex-format, for CSV ingest: a
693 |      * backslash-x prefix followed by the hex form of the serialized bytes.
694 |      * @param  hll the HLL to serialize. This cannot be <code>null</code>.
695 |      * @param  schemaVersion the schema with which to serialize the HLL. This
696 |      *         cannot be <code>null</code>.
697 |      * @return a PostgreSQL 'bytea' string representing the HLL.
698 |      */
699 |     private static String toByteA(final HLL hll, final ISchemaVersion schemaVersion) {
700 |         final byte[] serialized = hll.toBytes(schemaVersion);
701 |         return "\\x" + NumberUtil.toHex(serialized, 0/*from the first byte*/, serialized.length);
702 |     }
703 |
704 | /**
705 | * Indicates what kind of test output a test will generate.
706 | */
707 | private static enum TestType {
708 | /**
709 | * This type of test is characterized by values being added to an
710 | * accumulator HLL whose serialized representation (after the value is added)
711 | * is printed to each line along with the cardinality and added value.
712 | */
713 | ADD,
714 | /**
715 | * This type of test is characterized by HLLs being unioned into an
716 | * accumulator HLL whose serialized representation (after the HLL is
717 | * union'd) is printed to each line along with the cardinalities and the
718 | * serialized representation of the HLL union'd in.
719 | */
720 | UNION;
721 | }
722 | }
723 |
--------------------------------------------------------------------------------
/src/test/java/net/agkn/hll/ProbabilisticTestUtil.java:
--------------------------------------------------------------------------------
1 | package net.agkn.hll;
2 |
3 | /*
4 | * Copyright 2013 Aggregate Knowledge, Inc.
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 |
19 | import net.agkn.hll.util.BitUtil;
20 |
21 | /**
22 |  * A collection of test utilities for constructing input values to HLLs and for
23 |  * computing their serialized size.
24 |  *
25 |  * @author timon
26 |  */
27 | public class ProbabilisticTestUtil {
28 |     /**
29 |      * Constructs a value that when added raw to a HLL will set the register at
30 |      * <code>registerIndex</code> to <code>registerValue</code>.
31 |      *
32 |      * The low <code>log2m</code> bits of the result hold the register index;
33 |      * above them a single bit is set, <code>registerValue - 1</code> positions
34 |      * up.
35 |      *
36 |      * @param  log2m the log-base-2 of the number of registers in the HLL
37 |      * @param  registerIndex the index of the register to set
38 |      * @param  registerValue the value to set the register to. NOTE(review):
39 |      *         presumably this must be at least 1; a value of 0 makes the shift
40 |      *         distance -1, which Java treats as a shift by 63 for a long.
41 |      * @return the value
42 |      */
43 |     public static long constructHLLValue(final int log2m, final int registerIndex, final int registerValue) {
44 |         final long partition = registerIndex;
45 |         final long substreamValue = (1L << (registerValue - 1));
46 |         return (substreamValue << log2m) | partition;
47 |     }
48 |
49 |     /**
50 |      * Extracts the HLL register index from a raw value.
51 |      *
52 |      * @param  rawValue the raw value to take the index from
53 |      * @param  log2m the number of low bits of <code>rawValue</code> that make
54 |      *         up the index
55 |      * @return the low <code>log2m</code> bits of <code>rawValue</code>,
56 |      *         narrowed to a short (so only the low 16 bits survive)
57 |      */
58 |     public static short getRegisterIndex(final long rawValue, final int log2m) {
59 |         // The mask is built with a long shift: an int shift only uses the low
60 |         // five bits of the distance and would wrap for log2m >= 32.
61 |         final long mBitsMask = (1L << log2m) - 1;
62 |         return (short)(rawValue & mBitsMask);
63 |     }
64 |
65 |     /**
66 |      * Extracts the HLL register value from a raw value.
67 |      *
68 |      * @param  rawValue the raw value to take the register value from
69 |      * @param  log2m the number of low bits of <code>rawValue</code> to discard
70 |      *         before looking at the remaining bits
71 |      * @return 0 if no bit is set above the low <code>log2m</code> bits,
72 |      *         otherwise one more than the {@link BitUtil#leastSignificantBit(long)}
73 |      *         of those upper bits, capped at 31
74 |      */
75 |     public static byte getRegisterValue(final long rawValue, final int log2m) {
76 |         final long substreamValue = (rawValue >>> log2m);
77 |         final byte p_w;
78 |
79 |         if (substreamValue == 0L) {
80 |             // The paper does not cover p(0x0), so the special value 0 is used.
81 |             // 0 is the original initialization value of the registers, so by
82 |             // doing this the HLL simply ignores it. This is acceptable
83 |             // because the probability is 1/(2^(2^registerSizeInBits)).
84 |             p_w = 0;
85 |         } else {
86 |             p_w = (byte)Math.min(1 + BitUtil.leastSignificantBit(substreamValue), 31);
87 |         }
88 |
89 |         return p_w;
90 |     }
91 |
92 |     /**
93 |      * Computes the size of a packed register array.
94 |      *
95 |      * @param  shortWordLength the width of each register, in bits
96 |      * @param  registerCount the number of registers
97 |      * @return the number of bytes required to pack <code>registerCount</code>
98 |      *         registers of width <code>shortWordLength</code>.
99 |      */
100 |     public static int getRequiredBytes(final int shortWordLength, final int registerCount) {
101 |         // Exact integer ceiling of bits/8. The product is formed as a long so
102 |         // it cannot overflow, and no float rounding is involved for bit
103 |         // counts above 2^24.
104 |         final long bits = (long)registerCount * shortWordLength;
105 |         return (int)((bits + 7) >> 3);
106 |     }
107 | }
80 |
--------------------------------------------------------------------------------
/src/test/java/net/agkn/hll/SparseHLLTest.java:
--------------------------------------------------------------------------------
1 | package net.agkn.hll;
2 |
3 | /*
4 | * Copyright 2013 Aggregate Knowledge, Inc.
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 |
19 | import static org.powermock.reflect.Whitebox.getInternalState;
20 | import static org.testng.Assert.assertEquals;
21 | import static org.testng.Assert.assertTrue;
22 | import it.unimi.dsi.fastutil.ints.Int2ByteOpenHashMap;
23 | import java.util.Random;
24 |
25 | import net.agkn.hll.serialization.ISchemaVersion;
26 | import net.agkn.hll.serialization.SerializationUtil;
27 | import net.agkn.hll.util.HLLUtil;
28 |
29 | import org.testng.annotations.Test;
30 |
31 | /**
32 | * Tests {@link HLL} of type {@link HLLType#SPARSE}.
33 | *
34 | * @author timon
35 | */
36 | public class SparseHLLTest {
37 | private static final int log2m = 11;
38 |
39 |     /**
40 |      * Exercises {@link HLL#addRaw(long)} on a {@link HLLType#SPARSE} HLL, one scenario per scoped block.
41 |      */
42 |     @Test
43 |     public void addTest() {
44 |         { // register value 1 (the minimum set value) is stored as-is
45 |             final int index = 0;
46 |             final int value = 1;
47 |             final HLL sparse = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
48 |
49 |             // the raw value encodes both the target register and the value it receives
50 |             sparse.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, index, value));
51 |
52 |             assertOneRegisterSet(sparse, index, (byte)value);
53 |         }
54 |         { // register value 31 (the maximum set value) is stored as-is
55 |             final int index = 0;
56 |             final int value = 31;
57 |             final HLL sparse = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
58 |
59 |             // 31 is the largest value the test expects a register to hold
60 |             sparse.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, index, value));
61 |
62 |             assertOneRegisterSet(sparse, index, (byte)value);
63 |         }
64 |         { // a value past 31 could overflow the register; 31 is expected instead
65 |             final int index = 0;
66 |             final int value = 36;
67 |             final HLL sparse = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
68 |
69 |             // 36 is deliberately larger than the register maximum
70 |             sparse.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, index, value));
71 |
72 |             assertOneRegisterSet(sparse, (short)index, (byte)31/*register max*/);
73 |         }
74 |         { // adding the same element twice changes nothing
75 |             final int index = 0;
76 |             final int value = 1;
77 |             final long raw = ProbabilisticTestUtil.constructHLLValue(log2m, index, value);
78 |
79 |             final HLL sparse = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
80 |             sparse.addRaw(raw);
81 |             sparse.addRaw(raw);
82 |
83 |             assertOneRegisterSet(sparse, index, (byte)value);
84 |         }
85 |         { // a higher value for the same register replaces the lower one
86 |             final int index = 0;
87 |             final int lower = 1;
88 |             final int higher = 2;
89 |
90 |             final HLL sparse = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
91 |             sparse.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, index, lower));
92 |
93 |             // same register, larger value
94 |             sparse.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, index, higher));
95 |
96 |             // only the larger of the two values may remain
97 |             assertOneRegisterSet(sparse, index, (byte)higher);
98 |         }
99 |         { // a lower value for the same register leaves the higher one in place
100 |             final int index = 0;
101 |             final int higher = 2;
102 |             final int lower = 1;
103 |
104 |             final HLL sparse = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
105 |             sparse.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, index, higher));
106 |
107 |             // same register, smaller value
108 |             sparse.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, index, lower));
109 |
110 |             // the value added first is the one that must still be there
111 |             assertOneRegisterSet(sparse, index, (byte)higher);
112 |         }
113 |     }
114 |
115 |     /**
116 |      * Smoke test for {@link HLL#cardinality()}: with one register set, and with
117 |      * all but one set, the small range correction must be what is reported.
118 |      */
119 |     @Test
120 |     public void smallRangeSmokeTest() {
121 |         final int registerCountLog2 = 11;
122 |         final int registerCount = (1 << registerCountLog2);
123 |         final int registerWidth = 5;
124 |
125 |         // exactly one register set
126 |         {
127 |             final HLL sparse = new HLL(registerCountLog2, registerWidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
128 |             sparse.addRaw(ProbabilisticTestUtil.constructHLLValue(registerCountLog2, 0, 1));
129 |
130 |             // One set register leaves zero-valued registers behind and keeps the
131 |             // estimator trivially below 5m/2, so the small range correction
132 |             // m * log(m/V) applies, with V the number of zero registers.
133 |             final int zeroRegisters = registerCount - 1;
134 |             final long expected = (long)Math.ceil(registerCount * Math.log((double)registerCount / zeroRegisters));
135 |
136 |             assertEquals(sparse.cardinality(), expected);
137 |         }
138 |
139 |         // every register but the last one set
140 |         {
141 |             final HLL sparse = new HLL(registerCountLog2, registerWidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
142 |             final int zeroRegisters = 1;
143 |             for(int index=0; index<(registerCount - zeroRegisters); index++) {
144 |                 sparse.addRaw(ProbabilisticTestUtil.constructHLLValue(registerCountLog2, index, 1));
145 |             }
146 |
147 |             // A zero register still exists and the estimator is trivially
148 |             // smaller than 5m/2, so the small range correction
149 |             // m * log(m/V) applies here as well.
150 |             final long expected = (long)Math.ceil(registerCount * Math.log((double)registerCount / zeroRegisters));
151 |
152 |             final long cardinality = sparse.cardinality();
153 |             assertEquals(cardinality, expected);
154 |         }
155 |     }
156 |
157 | /**
158 | * Smoke test for {@link HLL#cardinality()} and the proper use of the
159 | * uncorrected estimator.
160 | */
161 | @Test
162 | public void normalRangeSmokeTest() {
163 | final int log2m = 11;
164 | final int m = (1 << log2m);
165 | final int regwidth = 5;
166 | // regwidth = 5, so hash space is
167 | // log2m + (2^5 - 1 - 1), so L = log2m + 30
168 | final int l = log2m + 30;
169 |
170 | // all registers at 'medium' value
171 | {
172 | final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, m/*sparseThreshold*/, HLLType.SPARSE);
173 |
174 | final int registerValue = 7/*chosen to ensure neither correction kicks in*/;
175 | for(int i=0; i (5 * m /(double)2));
187 |
188 | final long expected = (long)Math.ceil(estimator);
189 | assertEquals(cardinality, expected);
190 | }
191 | }
192 |
193 | /**
194 | * Smoke test for {@link HLL#cardinality()} and the proper use of the large
195 | * range correction.
196 | */
197 | @Test
198 | public void largeRangeSmokeTest() {
199 | final int log2m = 11;
200 | final int m = (1 << log2m);
201 | final int regwidth = 5;
202 | // regwidth = 5, so hash space is
203 | // log2m + (2^5 - 1 - 1), so L = log2m + 30
204 | final int l = log2m + 30;
205 |
206 | // all registers at large value
207 | {
208 | final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, m/*sparseThreshold*/, HLLType.SPARSE);
209 |
210 | final int registerValue = 31/*chosen to ensure large correction kicks in*/;
211 | for(int i=0; i Math.pow(2, l)/30);
223 |
224 | // Large range correction: -2^32 * log(1 - E/2^32)
225 | final long expected = (long)Math.ceil(-1.0 * Math.pow(2, l) * Math.log(1.0 - estimator/Math.pow(2, l)));
226 | assertEquals(cardinality, expected);
227 | }
228 | }
229 |
230 | /**
231 | * Tests {@link HLL#union(HLL)}.
232 | */
233 | @Test
234 | public void unionTest() {
235 | final int log2m = 11/*arbitrary*/;
236 | final int sparseThreshold = 256/*arbitrary*/;
237 |
238 | { // two empty multisets should union to an empty set
239 | final HLL hllA = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
240 | final HLL hllB = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
241 |
242 | hllA.union(hllB);
243 |
244 | assertEquals(hllA.getType(), HLLType.SPARSE/*unchanged*/);
245 | assertEquals(hllA.cardinality(), 0L);
246 | }
247 | { // two disjoint multisets should union properly
248 | final HLL hllA = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
249 | hllA.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 1, 1));
250 | final HLL hllB = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
251 | hllB.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 2, 1));
252 |
253 |
254 | hllA.union(hllB);
255 |
256 | assertEquals(hllA.getType(), HLLType.SPARSE/*unchanged*/);
257 | assertEquals(hllA.cardinality(), 3L/*precomputed*/);
258 | assertRegisterPresent(hllA, 1, (byte)1);
259 | assertRegisterPresent(hllA, 2, (byte)1);
260 | }
261 | { // two exactly overlapping multisets should union properly
262 | final HLL hllA = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
263 | hllA.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 1, 10));
264 | final HLL hllB = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
265 | hllB.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 1, 13));
266 |
267 | hllA.union(hllB);
268 |
269 | assertEquals(hllA.getType(), HLLType.SPARSE/*unchanged*/);
270 | assertEquals(hllA.cardinality(), 2L/*precomputed*/);
271 | assertOneRegisterSet(hllA, 1, (byte)13/*max(10,13)*/);
272 | }
273 | { // overlapping multisets should union properly
274 | final HLL hllA = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
275 | final HLL hllB = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
276 | // register index = 3
277 | final long rawValueA = ProbabilisticTestUtil.constructHLLValue(log2m, 3, 11);
278 |
279 | // register index = 4
280 | final long rawValueB = ProbabilisticTestUtil.constructHLLValue(log2m, 4, 13);
281 | final long rawValueBPrime = ProbabilisticTestUtil.constructHLLValue(log2m, 4, 21);
282 |
283 | // register index = 5
284 | final long rawValueC = ProbabilisticTestUtil.constructHLLValue(log2m, 5, 14);
285 |
286 | hllA.addRaw(rawValueA);
287 | hllA.addRaw(rawValueB);
288 |
289 | hllB.addRaw(rawValueBPrime);
290 | hllB.addRaw(rawValueC);
291 |
292 | hllA.union(hllB);
293 | // union should have three registers set, with partition B set to the
294 | // max of the two registers
295 | assertRegisterPresent(hllA, 3, (byte)11);
296 | assertRegisterPresent(hllA, 4, (byte)21/*max(21,13)*/);
297 | assertRegisterPresent(hllA, 5, (byte)14);
298 | }
299 | { // too-large unions should promote
300 | final HLL hllA = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
301 | final HLL hllB = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
302 |
303 | // fill up sets to maxCapacity
304 | for(int i=0; i 0x4F -> 79
178 | // 1100 0010 -> 0xC2 -> -62
179 |
180 | final byte[] bytes = serializer.getBytes();
181 | final byte[] expectedBytes = new byte[] { 79, -62 };
182 | assertTrue(Arrays.equals(bytes, expectedBytes));
183 | }
184 | {// Should work on a byte-divisible sequence, with no padding.
185 | final BigEndianAscendingWordSerializer serializer =
186 | new BigEndianAscendingWordSerializer(shortWordLength,
187 | 8/*wordCount*/,
188 | 0/*bytePadding, none*/);
189 |
190 | for(int i=1; i<9; i++) {
191 | serializer.writeWord(i);
192 | }
193 |
194 | // Values: 1-8
195 | // Corresponding bits:
196 | // ------------------
197 | // 00001
198 | // 00010
199 | // 00011
200 | // 00100
201 | // 00101
202 | // 00110
203 | // 00111
204 | // 01000
205 |
206 | // And the hex:
207 | // ------------
208 | // 0000 1000 => 0x08 => 8
209 | // 1000 0110 => 0x86 => -122
210 | // 0100 0010 => 0x62 => 66
211 | // 1001 1000 => 0x98 => -104
212 | // 1110 1000 => 0xE8 => -24
213 |
214 | final byte[] bytes = serializer.getBytes();
215 | final byte[] expectedBytes = new byte[] { 8, -122, 66, -104, -24 };
216 | assertTrue(Arrays.equals(bytes, expectedBytes));
217 | }
218 | {// Should pad the array correctly.
219 | final BigEndianAscendingWordSerializer serializer =
220 | new BigEndianAscendingWordSerializer(shortWordLength,
221 | 1/*wordCount*/,
222 | 1/*bytePadding*/);
223 |
224 | serializer.writeWord(1);
225 | // 1 byte leading padding | value 1 | trailing padding
226 | // 0000 0000 | 0000 1|000
227 | final byte[] bytes = serializer.getBytes();
228 | final byte[] expectedBytes = new byte[] { 0, 8 };
229 | assertTrue(Arrays.equals(bytes, expectedBytes));
230 | }
231 | }
232 |
233 |     /**
234 |      * Smoke test for typical parameters used in practice: 17-bit words written through {@link BigEndianAscendingWordSerializer}.
235 |      */
236 |     @Test
237 |     public void smokeTestSparseParams() {
238 |         // XXX: revisit
239 |         final int shortWordLength = 17;
240 |         {// Should work on an empty sequence, with no padding.
241 |             final BigEndianAscendingWordSerializer serializer =
242 |                 new BigEndianAscendingWordSerializer(shortWordLength,
243 |                                                      0/*wordCount*/,
244 |                                                      0/*bytePadding, none*/);
245 |
246 |             assertTrue(Arrays.equals(serializer.getBytes(), new byte[0]));  // not the 'assert' keyword: that is skipped unless the JVM runs with -ea
247 |         }
248 |         {// Should work on a non-byte-divisible sequence, with no padding.
249 |             final BigEndianAscendingWordSerializer serializer =
250 |                 new BigEndianAscendingWordSerializer(shortWordLength,
251 |                                                      3/*wordCount*/,
252 |                                                      0/*bytePadding, none*/);
253 |
254 |             serializer.writeWord(9);
255 |             serializer.writeWord(42);
256 |             serializer.writeWord(75);
257 |
258 |             // The values:
259 |             // -----------
260 |             // 9 |42 |75 |padding
261 |
262 |             // Corresponding bits:
263 |             // ------------------
264 |             // 0000 0000 0000 0100 1|000 0000 0000 1010 10|00 0000 0000 1001 011|0 0000
265 |
266 |             // And the hex/decimal (remember Java bytes are signed):
267 |             // -----------------------------------------------------
268 |             // 0000 0000 -> 0x00 -> 0
269 |             // 0000 0100 -> 0x04 -> 4
270 |             // 1000 0000 -> 0x80 -> -128
271 |             // 0000 1010 -> 0x0A -> 10
272 |             // 1000 0000 -> 0x80 -> -128
273 |             // 0000 1001 -> 0x09 -> 9
274 |             // 0110 0000 -> 0x60 -> 96
275 |
276 |             final byte[] bytes = serializer.getBytes();
277 |             final byte[] expectedBytes = new byte[] { 0, 4, -128, 10, -128, 9, 96 };
278 |             assertTrue(Arrays.equals(bytes, expectedBytes));
279 |         }
280 |         {// Should work on a byte-divisible sequence, with no padding.
281 |             final BigEndianAscendingWordSerializer serializer =
282 |                 new BigEndianAscendingWordSerializer(shortWordLength,
283 |                                                      8/*wordCount*/,
284 |                                                      0/*bytePadding, none*/);
285 |
286 |             for(int i=1; i<9; i++) {
287 |                 serializer.writeWord(i);
288 |             }
289 |
290 |             // Values: 1-8
291 |             // Corresponding bits:
292 |             // ------------------
293 |             // 0000 0000 0000 0000 1
294 |             // 000 0000 0000 0000 10
295 |             // 00 0000 0000 0000 011
296 |             // 0 0000 0000 0000 0100
297 |
298 |             // 0000 0000 0000 0010 1
299 |             // 000 0000 0000 0001 10
300 |             // 00 0000 0000 0000 111
301 |             // 0 0000 0000 0000 1000
302 |
303 |             // And the hex:
304 |             // ------------
305 |             // 0000 0000 -> 0x00 -> 0
306 |             // 0000 0000 -> 0x00 -> 0
307 |             // 1000 0000 -> 0x80 -> -128
308 |             // 0000 0000 -> 0x00 -> 0
309 |             // 1000 0000 -> 0x80 -> -128
310 |             // 0000 0000 -> 0x00 -> 0
311 |             // 0110 0000 -> 0x60 -> 96
312 |             // 0000 0000 -> 0x00 -> 0
313 |             // 0100 0000 -> 0x40 -> 64
314 |             // 0000 0000 -> 0x00 -> 0
315 |             // 0010 1000 -> 0x28 -> 40
316 |             // 0000 0000 -> 0x00 -> 0
317 |             // 0001 1000 -> 0x18 -> 24
318 |             // 0000 0000 -> 0x00 -> 0
319 |             // 0000 1110 -> 0x0E -> 14
320 |             // 0000 0000 -> 0x00 -> 0
321 |             // 0000 1000 -> 0x08 -> 8
322 |
323 |             final byte[] bytes = serializer.getBytes();
324 |             final byte[] expectedBytes = new byte[] { 0, 0, -128, 0, -128, 0, 96, 0, 64, 0, 40, 0, 24, 0, 14, 0, 8 };
325 |             assertTrue(Arrays.equals(bytes, expectedBytes));
326 |         }
327 |         {// Should pad the array correctly.
328 |             final BigEndianAscendingWordSerializer serializer =
329 |                 new BigEndianAscendingWordSerializer(shortWordLength,
330 |                                                      1/*wordCount*/,
331 |                                                      1/*bytePadding*/);
332 |
333 |             serializer.writeWord(1);
334 |             // 1 byte leading padding | value 1 | trailing padding
335 |             // 0000 0000 | 0000 0000 0000 0000 1|000 0000
336 |             // 0x00 0x00 0x00 0x80
337 |             final byte[] bytes = serializer.getBytes();
338 |             final byte[] expectedBytes = new byte[] { 0, 0, 0, -128 };
339 |             assertTrue(Arrays.equals(bytes, expectedBytes));
340 |         }
341 |     }
342 | }
343 |
--------------------------------------------------------------------------------
/src/test/java/net/agkn/hll/serialization/HLLSerializationTest.java:
--------------------------------------------------------------------------------
1 | package net.agkn.hll.serialization;
2 |
3 | import net.agkn.hll.HLL;
4 | import net.agkn.hll.HLLType;
5 | import org.testng.annotations.Test;
6 |
7 | import java.util.ArrayList;
8 | import java.util.Collection;
9 | import java.util.List;
10 | import java.util.Random;
11 |
12 | import static net.agkn.hll.HLL.MAXIMUM_EXPTHRESH_PARAM;
13 | import static net.agkn.hll.HLL.MAXIMUM_REGWIDTH_PARAM;
14 | import static net.agkn.hll.HLL.MINIMUM_EXPTHRESH_PARAM;
15 | import static net.agkn.hll.HLL.MINIMUM_LOG2M_PARAM;
16 | import static net.agkn.hll.HLL.MINIMUM_REGWIDTH_PARAM;
17 | import static org.testng.Assert.assertEquals;
18 |
19 | /**
20 | * Serialization smoke-tests.
21 | *
22 | * @author yerenkow
23 | * @author benl
24 | */
25 | public class HLLSerializationTest {
26 | // A fixed random seed so that this test is reproducible.
27 | private static final long RANDOM_SEED = 1L;
28 |
29 | /**
30 | * A smoke-test that covers serialization/deserialization of an HLL
31 | * under all possible parameters.
32 | */
33 | @Test
34 | public void serializationSmokeTest() throws Exception {
35 | final Random random = new Random(RANDOM_SEED);
36 | final int randomCount = 250;
37 | final List randoms = new ArrayList(randomCount){{
38 | for (int i=0; i items)
56 | throws CloneNotSupportedException {
57 | for(int log2m=MINIMUM_LOG2M_PARAM; log2m<=16; log2m++) {
58 | for(int regw=MINIMUM_REGWIDTH_PARAM; regw<=MAXIMUM_REGWIDTH_PARAM; regw++) {
59 | for(int expthr=MINIMUM_EXPTHRESH_PARAM; expthr<=MAXIMUM_EXPTHRESH_PARAM; expthr++ ) {
60 | for(final boolean sparse: new boolean[]{true, false}) {
61 | HLL hll = new HLL(log2m, regw, expthr, sparse, hllType);
62 | for(final Long item: items) {
63 | hll.addRaw(item);
64 | }
65 | HLL copy = HLL.fromBytes(hll.toBytes());
66 | assertEquals(copy.cardinality(), hll.cardinality());
67 | assertEquals(copy.getType(), hll.getType());
68 | assertEquals(copy.toBytes(), hll.toBytes());
69 |
70 | HLL clone = hll.clone();
71 | assertEquals(clone.cardinality(), hll.cardinality());
72 | assertEquals(clone.getType(), hll.getType());
73 | assertEquals(clone.toBytes(), hll.toBytes());
74 | }
75 | }
76 | }
77 | }
78 | }
79 | }
80 |
--------------------------------------------------------------------------------
/src/test/java/net/agkn/hll/util/BitVectorTest.java:
--------------------------------------------------------------------------------
1 | package net.agkn.hll.util;
2 |
3 | /*
4 | * Copyright 2013 Aggregate Knowledge, Inc.
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 |
19 | import static org.testng.Assert.assertEquals;
20 | import static org.testng.Assert.assertFalse;
21 | import static org.testng.Assert.assertTrue;
22 |
23 | import org.testng.annotations.Test;
24 |
25 | /**
26 | * Unit tests for {@link BitVector}.
27 | *
28 | * @author rgrzywinski
29 | * @author timon
30 | */
31 | public class BitVectorTest {
32 |     /**
33 |      * Tests that {@link BitVector#getRegister(long)} reads back what {@link BitVector#setRegister(long, long)} stored.
34 |      */
35 |     @Test
36 |     public void getSetRegisterTest() {
37 |         final int width = 5;    // registers are only 5 bits wide, so 0x1F is the largest value
38 |         final int count = 128;  // 2^7 registers per vector
39 |         final BitVector allOnes = new BitVector(width, count);
40 |         final BitVector cycling = new BitVector(width, count);
41 |         final BitVector reverseCycling = new BitVector(width, count);
42 |         final BitVector alternating = new BitVector(width, count);
43 |
44 |         for(int register=0; register<count; register++) {
45 |             allOnes.setRegister(register, 0x1F);
46 |             cycling.setRegister(register, (register & 0x1F));
47 |             reverseCycling.setRegister(register, ((127 - register) & 0x1F));
48 |             alternating.setRegister(register, 0x15);
49 |         }
50 |
51 |         // Every register must read back exactly what was stored in it.
52 |         for(int register=0; register<count; register++) {
53 |             assertEquals(allOnes.getRegister(register), 0x1F);
54 |             assertEquals(cycling.getRegister(register), (register & 0x1F));
55 |             assertEquals(reverseCycling.getRegister(register), ((127 - register) & 0x1F));
56 |             assertEquals(alternating.getRegister(register), 0x15);
57 |         }
58 |     }
59 |
60 | // ========================================================================
61 |     /**
62 |      * Tests {@link BitVector#registerIterator()}: iteration must yield every register, in index order, then stop.
63 |      */
64 |     @Test
65 |     public void registerIteratorTest() {
66 |         { // four fully populated vectors, walked in lock step
67 |             // NOTE: registers are only 5bits wide
68 |             final BitVector allOnes = new BitVector(5/*width*/, 128/*count, 2^7*/);
69 |             final BitVector cycling = new BitVector(5/*width*/, 128/*count, 2^7*/);
70 |             final BitVector reverseCycling = new BitVector(5/*width*/, 128/*count, 2^7*/);
71 |             final BitVector alternating = new BitVector(5/*width*/, 128/*count, 2^7*/);
72 |
73 |             for(int register=0; register<128/*2^7*/; register++) {
74 |                 allOnes.setRegister(register, 0x1F);
75 |                 cycling.setRegister(register, (register & 0x1F));
76 |                 reverseCycling.setRegister(register, ((127 - register) & 0x1F));
77 |                 alternating.setRegister(register, 0x15);
78 |             }
79 |
80 |             final LongIterator allOnesIterator = allOnes.registerIterator();
81 |             final LongIterator cyclingIterator = cycling.registerIterator();
82 |             final LongIterator reverseCyclingIterator = reverseCycling.registerIterator();
83 |             final LongIterator alternatingIterator = alternating.registerIterator();
84 |             for(int register=0; register<128/*2^7*/; register++) {
85 |                 assertEquals(allOnesIterator.hasNext(), true);
86 |                 assertEquals(cyclingIterator.hasNext(), true);
87 |                 assertEquals(reverseCyclingIterator.hasNext(), true);
88 |                 assertEquals(alternatingIterator.hasNext(), true);
89 |
90 |                 assertEquals(allOnesIterator.next(), 0x1F);
91 |                 assertEquals(cyclingIterator.next(), (register & 0x1F));
92 |                 assertEquals(reverseCyclingIterator.next(), ((127 - register) & 0x1F));
93 |                 assertEquals(alternatingIterator.next(), 0x15);
94 |             }
95 |             assertEquals(allOnesIterator.hasNext(), false/*exhausted*/);
96 |             assertEquals(cyclingIterator.hasNext(), false/*exhausted*/);
97 |             assertEquals(reverseCyclingIterator.hasNext(), false/*exhausted*/);
98 |             assertEquals(alternatingIterator.hasNext(), false/*exhausted*/);
99 |         }
100 |
101 |         { // sizes that do not line up with whole words
102 |             // Vectors that are shorter than one word
103 |             assertIterator(1, 12/* 1*12=12 bits, fewer than a single word */);
104 |             assertIterator(2, 12/* 2*12=24 bits, fewer than a single word */);
105 |             assertIterator(3, 12/* 3*12=36 bits, fewer than a single word */);
106 |             assertIterator(4, 12/* 4*12=48 bits, fewer than a single word */);
107 |
108 |             // Vectors that don't fit exactly into longs
109 |             assertIterator(5, 16/* 5*16=80 bits */);
110 |             assertIterator(5, 32/* 5*32=160 bits */);
111 |         }
112 |
113 |         // TODO: iterate over vectors that are padded (no such case is exercised here)
114 |     }
115 |
116 | private static void assertIterator(final int width, final int count) {
117 | final BitVector vector = new BitVector(width, count);
118 | final LongIterator iter = vector.registerIterator();
119 |
120 | for(int i=0; i