├── .gitignore ├── LICENSE ├── README.md ├── docs ├── allclasses-frame.html ├── allclasses-noframe.html ├── com │ └── adroll │ │ └── cantor │ │ ├── HLLCounter.html │ │ ├── HLLWritable.html │ │ ├── class-use │ │ ├── HLLCounter.html │ │ └── HLLWritable.html │ │ ├── package-frame.html │ │ ├── package-summary.html │ │ ├── package-tree.html │ │ └── package-use.html ├── constant-values.html ├── deprecated-list.html ├── help-doc.html ├── index-all.html ├── index.html ├── overview-tree.html ├── package-list ├── resources │ ├── background.gif │ ├── tab.gif │ ├── titlebar.gif │ └── titlebar_end.gif ├── serialized-form.html └── stylesheet.css ├── pom.xml ├── src ├── main │ └── java │ │ └── com │ │ └── adroll │ │ └── cantor │ │ ├── HLLCounter.java │ │ ├── HLLWritable.java │ │ └── package-info.java └── test │ └── java │ └── com │ └── adroll │ └── cantor │ ├── TestHLLCounter.java │ └── TestHLLWritable.java └── utils ├── minhash_k.py └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | target/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 AdRoll 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Cantor 2 | ====== 3 | 4 | Cantor provides utilities for estimating the cardinality 5 | of large sets. 6 | 7 | The algorithms herein are parallelizable, and a Hadoop 8 | wrapper class is provided for convenience. 9 | 10 | It employs most of the HyperLogLog++ algorithm as seen in 11 | [this paper](http://research.google.com/pubs/pub40671.html), 12 | excluding the sparse scheme, and using a simple linear 13 | interpolation instead of kNN. In addition, it can use MinHash 14 | structures to estimate cardinalities of intersections of these 15 | sets, as described in 16 | [this blog post](http://tech.adroll.com/blog/data/2013/07/10/hll-minhash.html). 17 | 18 | Both HyperLogLog and MinHash require a precision 19 | parameter. Basic guidelines are available as follows, 20 | and `HLLCounter.MIN_P = 4 <= p <= 18 = HLLCounter.MAX_P`. 
21 | 22 | ####HyperLogLog p @ 99.7% Confidence 23 | p | Relative Error 24 | ---:|---: 25 | 4 | 75% 26 | 5 | 65% 27 | 6 | 47% 28 | 7 | 32% 29 | 8 | 23% 30 | 9 | 16% 31 | 10 | 10% 32 | 11 | 8% 33 | 12 | 5% 34 | 13 | 4% 35 | 14 | 2.5% 36 | 15 | 2% 37 | 16 | 1.3% 38 | 17 | 1% 39 | 18 | 0.7% 40 | 41 | ####MinHash k @ 99% Confidence 42 | **Relative Error** | **Intersection Size -->** | | | | * 43 | :------------------|--------------------------:|-------:|-----:|------:|-----: 44 | - | 0.01% | 0.1% |1.0% | 5.0% |10.0% 45 | 100% | 90000 | 9000 |900 | 170 |75 46 | 50% | 313334 | 31334 |3134 | 587 |280 47 | 25% | - | 116800 |11520 | 2208 |1040 48 | 10% | - | - |68455 | 13128 |6210 49 | 50 | This MinHash k table can be generated by using `minhash_k.py` in the `utils` 51 | directory. For now, the only requirement is scipy, which you can install with 52 | `pip install -r utils/requirements.txt`. Then, for example, you can do: 53 | 54 | ``` 55 | %> ./utils/minhash_k.py --jaccard 0.0001 --error 1 --confidence 0.99 56 | MinHash k: 90000 57 | Error at k: 1.0 58 | %> ./utils/minhash_k.py --jaccard 0.01 --error 0.25 --confidence 0.99 59 | MinHash k: 11520 60 | Error at k: 0.25 61 | %> ./utils/minhash_k.py --jaccard 0.01 --error 0.25 --confidence 0.90 62 | MinHash k: 4800 63 | Error at k: 0.25 64 | ``` 65 | 66 | Additional information is available with `./utils/minhash_k.py --help`. -------------------------------------------------------------------------------- /docs/allclasses-frame.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 |public class HLLCounter 104 | extends Object 105 | implements Serializable106 |
HLLCounter
allows for cardinality estimation of
107 | large sets with a compact data structure.
108 |
109 | For guidance on setting precisions for a desired level of
110 | error, see the package info
.
Modifier and Type | 128 |Field and Description | 129 |
---|---|
static int |
132 | DEFAULT_K
133 | Default MinHash precision of 8192 if intersectable
134 | and HLL precision is DEFAULT_P
135 | |
136 |
static byte |
139 | DEFAULT_P
140 | Default HLL precision of 18
141 | |
142 |
static byte |
145 | MAX_P
146 | Maximum HLL precision of 18
147 | |
148 |
static byte |
151 | MIN_P
152 | Minimum HLL precision of 4
153 | |
154 |
Constructor and Description | 168 |
---|
HLLCounter()
171 | Constructs a non-intersectable
174 | HLLCounter
172 | that can be used to estimate the cardinality of a set
173 | of items with precision DEFAULT_P . |
175 |
HLLCounter(boolean intersectable)
178 | Constructs an
182 | HLLCounter that can be used to
179 | estimate the cardinality of a set of items with
180 | DEFAULT_P and, if intersectable is
181 | true , DEFAULT_K . |
183 |
HLLCounter(boolean intersectable,
186 | int k)
187 | Constructs an
191 | HLLCounter that can be used to
188 | estimate the cardinality of a set of items with
189 | DEFAULT_P and, if intersectable is
190 | true , specified k . |
192 |
HLLCounter(byte p)
195 | Constructs a non-intersectable
198 | HLLCounter object
196 | that can be used to estimate the cardinality of a set of items
197 | with given precision. |
199 |
HLLCounter(byte p,
202 | boolean intersectable)
203 | Constructs an
208 | HLLCounter that can be used to
204 | estimate the cardinality of a set of items with specified
205 | precision and, if intersectable is
206 | true , k is a reasonable precision
207 | guess based on p . |
209 |
HLLCounter(byte p,
212 | boolean intersectable,
213 | int k)
214 | Constructs an
218 | HLLCounter that can be used to
215 | estimate the cardinality of a set of items with specified
216 | precision and, if intersectable is
217 | true , specified k . |
219 |
HLLCounter(byte p,
222 | boolean intersectable,
223 | int k,
224 | byte[] M,
225 | TreeSet<Long> ts)
226 | Constructs an
231 | HLLCounter that can be used to
227 | estimate the cardinality of a set of items with specified
228 | precision and, if intersectable is
229 | true , specified k , along with
230 | pre-computed HLL and MinHash structures. |
232 |
Modifier and Type | 246 |Method and Description | 247 |
---|---|
void |
250 | clear()
251 | Clears all data in the HLL and MinHash structures.
252 | |
253 |
void |
256 | combine(HLLCounter... hs)
257 | Performs a destructive union of this
259 | HLLCounter
258 | and all the ones passed in. |
260 |
void |
263 | combine(HLLCounter h)
264 | Performs a destructive union of this
265 |
266 | HLLCounter and the one passed in. |
267 |
void |
270 | fold(byte q)
271 | Reduces the precision from
272 | p to q . |
273 |
byte[] |
276 | getByteArray()
277 | Returns the raw HLL structure.
278 | |
279 |
int |
282 | getK()
283 | Returns the precision of the MinHash structure.
284 | |
285 |
TreeSet<Long> |
288 | getMinHash()
289 | Returns the raw MinHash structure.
290 | |
291 |
byte |
294 | getP()
295 | Returns the precision of the HLL structure.
296 | |
297 |
static long |
300 | intersect(HLLCounter... hs)
301 | Returns an estimate of the size of the intersection
302 | of the given
303 | HLLCounters . |
304 |
boolean |
307 | isIntersectable()
308 | Returns whether this structure is intersectable.
309 | |
310 |
void |
313 | put(String... vs)
314 | Insert multiple elements into the
316 | HLLCounter
315 | structure. |
317 |
void |
320 | put(String v)
321 | Insert an element into the
322 | HLLCounter structure. |
323 |
static byte[] |
326 | safeUnion(byte[] Q,
327 | byte[] R)
328 | Returns an HLL structure that is the effective union
329 | of two other HLL structures.
330 | |
331 |
long |
334 | size()
335 | Returns the estimated number of unique insertions into
336 | the
337 | HLLCounter structure. |
338 |
public static final byte DEFAULT_P368 |
public static final int DEFAULT_K379 |
public static final byte MIN_P391 |
public static final byte MAX_P402 |
public HLLCounter()421 |
HLLCounter
422 | that can be used to estimate the cardinality of a set
423 | of items with precision DEFAULT_P
.public HLLCounter(byte p)433 |
HLLCounter
object
434 | that can be used to estimate the cardinality of a set of items
435 | with given precision.p
- the byte
precision of the HLL structure,
437 | MIN_P <= p <= MAX_P
public HLLCounter(boolean intersectable)447 |
HLLCounter
that can be used to
448 | estimate the cardinality of a set of items with
449 | DEFAULT_P
and, if intersectable
is
450 | true
, DEFAULT_K
.intersectable
- boolean
to make the structure
452 | intersectablepublic HLLCounter(byte p, 462 | boolean intersectable)463 |
HLLCounter
that can be used to
464 | estimate the cardinality of a set of items with specified
465 | precision and, if intersectable
is
466 | true
, k
is a reasonable precision
467 | guess based on p
.p
- the byte
precision of the
469 | HLL structure, MIN_P <= p <=
470 | MAX_P
intersectable
- boolean
to make the structure
471 | intersectablepublic HLLCounter(boolean intersectable, 481 | int k)482 |
HLLCounter
that can be used to
483 | estimate the cardinality of a set of items with
484 | DEFAULT_P
and, if intersectable
is
485 | true
, specified k
.intersectable
- boolean
to make the structure
487 | intersectablek
- the int
precision of MinHash
488 | structurepublic HLLCounter(byte p, 498 | boolean intersectable, 499 | int k)500 |
HLLCounter
that can be used to
501 | estimate the cardinality of a set of items with specified
502 | precision and, if intersectable
is
503 | true
, specified k
.p
- the byte
precision of the
505 | HLL structure, MIN_P <= p <=
506 | MAX_P
intersectable
- boolean
to make the structure
507 | intersectablek
- the int
precision of MinHash
508 | structurepublic HLLCounter(byte p, 518 | boolean intersectable, 519 | int k, 520 | byte[] M, 521 | TreeSet<Long> ts)522 |
HLLCounter
that can be used to
523 | estimate the cardinality of a set of items with specified
524 | precision and, if intersectable
is
525 | true
, specified k
, along with
526 | pre-computed HLL and MinHash structures.
527 |
528 | One should only use this constructor when absolutely
529 | necessary, for example to copy one HLLCounter
's
530 | fields into a new HLLCounter
.
p
- the byte
precision of the
532 | HLL structure, MIN_P <= p <=
533 | MAX_P
intersectable
- boolean
to make the structure
534 | intersectablek
- the int
precision of MinHash
535 | structureM
- the byte[]
HLL structure of
536 | length 2^p
ts
- the TreeSet<Long>
MinHash structure of
537 | length k
public void put(String v)555 |
HLLCounter
structure.
556 |
557 | The String v
is hashed to a long
,
558 | which, by the HLL algorithm is inserted in the HLL
559 | byte[]
.
560 |
561 | Additionally, if this HLLCounter
is intersectable,
562 | the long
is added to the MinHash
563 | TreeSet<Long>
if it is one of the k
564 | least long
s seen thus far.
v
- the String
to insertpublic void put(String... vs)575 |
HLLCounter
576 | structure.
577 |
578 | This iterates the put(String v)
method.
vs
- the String...
elements to insertpublic long size()589 |
HLLCounter
structure.
591 |
592 | For sets of cardinality less than k
, this
593 | is exact.
long
estimated size of unique
595 | insertionspublic void clear()605 |
public void combine(HLLCounter h)615 |
HLLCounter
and the one passed in.
617 |
618 | This HLLCounter
will now contain data that is
619 | equivalent to all insertions from both
620 | HLLCounter
s going into a single
621 | HLLCounter
. The precisions of this
622 | HLLCounter
will be the smaller of the precisions
623 | between each HLLCounter
.
h
- the HLLCounter
to combine into this onepublic void combine(HLLCounter... hs)634 |
HLLCounter
635 | and all the ones passed in.
636 |
637 | This iterates the combine(HLLCounter h)
method.
hs
- the HLLCounter...
to combine into
639 | this onepublic void fold(byte q)649 |
p
to q
.q
- the byte
new precisionpublic byte[] getByteArray()660 |
byte[]
of the HLLpublic byte getP()671 |
byte
precision of the HLLpublic boolean isIntersectable()682 |
boolean
that indicates
684 | whether this is intersectablepublic TreeSet<Long> getMinHash()694 |
TreeSet<Long>
of the
696 | k
least elements hashedpublic int getK()706 |
int
precision of MinHashpublic static long intersect(HLLCounter... hs)717 |
HLLCounters
.
719 |
720 | This method only works for intersectable
721 | HLLCounters
. It doesn't actually perform
722 | an intersection, but rather returns an estimate of the
723 | cardinality of the intersection.
724 |
725 | It calculates a global MinHash structure based on the
726 | minimum MinHash precision k
across all the
727 | structures. A global HLL structure is also computed
728 | with the minimum precision across all structures. That
729 | is, the estimate's only as accurate as the least accurate
730 | HLLCounter
.
hs
- the HLLCounter...
to "intersect"long
estimate of the intersection
733 | cardinalitypublic static byte[] safeUnion(byte[] Q, 743 | byte[] R)744 |
747 | This method is primarily used for non-destructive unions,
748 | unlike the combine(HLLCounter h)
method.
Q
- the byte[]
first HLL structure
750 | to unionR
- the byte[]
second HLL structure
751 | to unionbyte[]
HLL structure of
753 | the unionCopyright © 2014. All rights reserved.
823 | 824 | 825 | -------------------------------------------------------------------------------- /docs/com/adroll/cantor/HLLWritable.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 |public class HLLWritable 104 | extends Object 105 | implements org.apache.hadoop.io.Writable106 |
HLLWritable
allows for serialization and
107 | deserialization of HLLCounter
objects in a
108 | Hadoop framework.Modifier and Type | 125 |Field and Description | 126 |
---|---|
protected int |
129 | k
130 | The MinHash precision of the contained
131 | HLLCounter representation. |
132 |
protected byte[] |
135 | M
136 | The HLL structure of the contained
137 | HLLCounter representation. |
138 |
protected long[] |
141 | minhash
142 | The contents of the MinHash structure of the contained
143 |
144 | HLLCounter representation. |
145 |
protected byte |
148 | p
149 | The HLL precision of the contained
150 | HLLCounter representation. |
151 |
protected int |
154 | s
155 | The number of current elements in the MinHash structure
156 | of the contained
157 | HLLCounter representation. |
158 |
Constructor and Description | 172 |
---|
HLLWritable()
175 | Constructs an
178 | HLLWritable that contains a representation
176 | of the default HLLCounter constructed by
177 | HLLCounter.HLLCounter() . |
179 |
HLLWritable(byte p,
182 | int k,
183 | int s,
184 | byte[] M,
185 | long[] minhash)
186 | Constructs an
187 | HLLWritable with the given set of fields. |
188 |
HLLWritable(HLLCounter h)
191 | Constructs an
193 | HLLWritable that contains a representation
192 | of the provided HLLCounter . |
194 |
Modifier and Type | 208 |Method and Description | 209 |
---|---|
HLLWritable |
212 | combine(HLLWritable other)
213 | Returns a new
217 | HLLWritable that contains a
214 | representation of combining its internal
215 | HLLCounter 's representation with
216 | the other's. |
218 |
boolean |
221 | equals(Object obj)
222 | Returns whether this
224 | HLLWritable
223 | is equivalent to the given Object . |
225 |
HLLCounter |
228 | get()
229 | Returns a new
232 | HLLCounter that is constructed
230 | from the internal representation of the HLLCounter
231 | that this HLLWritable contains. |
233 |
int |
236 | hashCode()
237 | Hashes this
239 | HLLWritable based on its
238 | internal structures. |
240 |
void |
243 | readFields(DataInput in)
244 | Deserialize the fields of this
246 | HLLWritable
245 | from the given DataInput . |
247 |
void |
250 | set(HLLCounter h)
251 | Encapsulates a representation of the given
253 | HLLCounter
252 | in this HLLWritable . |
254 |
String |
257 | toString()
258 | Returns a
260 | String representation of this
259 | HLLWritable . |
261 |
void |
264 | write(DataOutput out)
265 | Serializes this
267 | HLLWritable to the given
266 | DataOutput . |
268 |
protected byte p298 |
HLLCounter
representation.
299 | HLLCounter.MIN_P
<= p <=
HLLCounter.MAX_P
.protected int k309 |
HLLCounter
representation.protected int s319 |
HLLCounter
representation.protected byte[] M330 |
HLLCounter
representation.protected long[] minhash340 |
HLLCounter
representation.public HLLWritable()359 |
HLLWritable
that contains a representation
360 | of the default HLLCounter
constructed by
361 | HLLCounter.HLLCounter()
.public HLLWritable(HLLCounter h)371 |
HLLWritable
that contains a representation
372 | of the provided HLLCounter
.h
- the HLLCounter
to represent and containpublic HLLWritable(byte p, 383 | int k, 384 | int s, 385 | byte[] M, 386 | long[] minhash)387 |
HLLWritable
with the given set of fields.p
- the byte
precision of the HLL
389 | structure. HLLCounter.MIN_P
<= p
390 | <=
HLLCounter.MAX_P
.k
- the int
precision of the MinHash
391 | structures
- the int
number of elements in the
392 | MinHash structureM
- the byte[]
HLL structureminhash
- the long[]
elements in the MinHash
393 | structurepublic void set(HLLCounter h)411 |
HLLCounter
412 | in this HLLWritable
.h
- the HLLCounter
to represent and containpublic HLLCounter get()423 |
HLLCounter
that is constructed
424 | from the internal representation of the HLLCounter
425 | that this HLLWritable
contains.HLLCounter
this HLLWritable
427 | represents.public HLLWritable combine(HLLWritable other)437 |
HLLWritable
that contains a
438 | representation of combining its internal
439 | HLLCounter
's representation with
440 | the other's.
441 |
442 | It is functionally equivalent to combining two
443 | HLLCounter
s
444 | (HLLCounter.combine(HLLCounter h)
) and creating a
445 | new HLLWritable
out of that.
446 |
447 | Returns null
if the combination fails.
other
- the HLLWritable
to combineHLLWritable
that represents
450 | the union, null
if the combination fails,
451 | this
if other
452 | is null
.public void write(DataOutput out) 462 | throws IOException463 |
HLLWritable
to the given
464 | DataOutput
.
465 | 466 | Generally, this method should not be called on its own.
write
in interface org.apache.hadoop.io.Writable
out
- the DataOutput
object to write toIOException
public void readFields(DataInput in) 482 | throws IOException483 |
HLLWritable
484 | from the given DataInput
.
485 | 486 | Generally, this method should not be called on its own. 487 | For efficiency, implementations should attempt to re-use 488 | storage in the existing object where possible.
readFields
in interface org.apache.hadoop.io.Writable
in
- the DataInput
to read fromIOException
public int hashCode()504 |
HLLWritable
based on its
505 | internal structures.public boolean equals(Object obj)519 |
HLLWritable
520 | is equivalent to the given Object
.
521 |
522 | If the input is another HLLWritable
,
523 | the two are considered equivalent if all of their
524 | fields are equivalent (that is, the two
525 | HLLCounters
likely saw the exact same
526 | data).
Copyright © 2014. All rights reserved.
619 | 620 | 621 | -------------------------------------------------------------------------------- /docs/com/adroll/cantor/class-use/HLLCounter.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 |Modifier and Type | 80 |Method and Description | 81 |
---|---|
HLLCounter |
85 | HLLWritable.get()
86 | Returns a new
89 | HLLCounter that is constructed
87 | from the internal representation of the HLLCounter
88 | that this HLLWritable contains. |
90 |
Modifier and Type | 97 |Method and Description | 98 |
---|---|
void |
102 | HLLCounter.combine(HLLCounter... hs)
103 | Performs a destructive union of this
105 | HLLCounter
104 | and all the ones passed in. |
106 |
void |
109 | HLLCounter.combine(HLLCounter h)
110 | Performs a destructive union of this
111 |
112 | HLLCounter and the one passed in. |
113 |
static long |
116 | HLLCounter.intersect(HLLCounter... hs)
117 | Returns an estimate of the size of the intersection
118 | of the given
119 | HLLCounters . |
120 |
void |
123 | HLLWritable.set(HLLCounter h)
124 | Encapsulates a representation of the given
126 | HLLCounter
125 | in this HLLWritable . |
127 |
Constructor and Description | 134 |
---|
HLLWritable(HLLCounter h)
138 | Constructs an
140 | HLLWritable that contains a representation
139 | of the provided HLLCounter . |
141 |
Copyright © 2014. All rights reserved.
194 | 195 | 196 | -------------------------------------------------------------------------------- /docs/com/adroll/cantor/class-use/HLLWritable.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 |Modifier and Type | 80 |Method and Description | 81 |
---|---|
HLLWritable |
85 | HLLWritable.combine(HLLWritable other)
86 | Returns a new
90 | HLLWritable that contains a
87 | representation of combining its internal
88 | HLLCounter 's representation with
89 | the other's. |
91 |
Modifier and Type | 98 |Method and Description | 99 |
---|---|
HLLWritable |
103 | HLLWritable.combine(HLLWritable other)
104 | Returns a new
108 | HLLWritable that contains a
105 | representation of combining its internal
106 | HLLCounter 's representation with
107 | the other's. |
109 |
Copyright © 2014. All rights reserved.
162 | 163 | 164 | -------------------------------------------------------------------------------- /docs/com/adroll/cantor/package-frame.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 |See: Description
72 |Class | 80 |Description | 81 |
---|---|
HLLCounter | 85 |
86 | HLLCounter allows for cardinality estimation of
87 | large sets with a compact data structure. |
89 |
HLLWritable | 92 |
93 | HLLWritable allows for serialization and
94 | deserialization of HLLCounter objects in a
95 | Hadoop framework. |
97 |
109 | The algorithms herein are parallelizable, and a Hadoop 110 | wrapper class is provided for convenience. 111 |
112 | It employs most of the HyperLogLog++ algorithm as seen in 113 | 114 | this paper, excluding the sparse scheme, and using 115 | a simple linear interpolation instead of kNN. In addition, 116 | it can use MinHash structures to estimate cardinalities of 117 | intersections of these sets, as described in 118 | 119 | this blog post. 120 |
121 | Both HyperLogLog and MinHash require a precision
122 | parameter. Basic guidelines are available as follows,
123 | and HLLCounter.MIN_P
= 4 <= p <= 18 =
124 | HLLCounter.MAX_P
.
125 |
126 |
129 |
|
149 |
150 |
|
160 |
Copyright © 2014. All rights reserved.
208 | 209 | 210 | -------------------------------------------------------------------------------- /docs/com/adroll/cantor/package-tree.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 |Copyright © 2014. All rights reserved.
124 | 125 | 126 | -------------------------------------------------------------------------------- /docs/com/adroll/cantor/package-use.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 |Class and Description | 77 |
---|
HLLCounter
81 | HLLCounter allows for cardinality estimation of
82 | large sets with a compact data structure. |
84 |
HLLWritable
87 | HLLWritable allows for serialization and
88 | deserialization of HLLCounter objects in a
89 | Hadoop framework. |
91 |
Copyright © 2014. All rights reserved.
142 | 143 | 144 | -------------------------------------------------------------------------------- /docs/constant-values.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 |Modifier and Type | 82 |Constant Field | 83 |Value | 84 |
---|---|---|
88 |
89 | public static final int |
90 | DEFAULT_K |
91 | 8192 |
92 |
95 |
96 | public static final byte |
97 | DEFAULT_P |
98 | 18 |
99 |
102 |
103 | public static final byte |
104 | MAX_P |
105 | 18 |
106 |
109 |
110 | public static final byte |
111 | MIN_P |
112 | 4 |
113 |
Copyright © 2014. All rights reserved.
164 | 165 | 166 | -------------------------------------------------------------------------------- /docs/deprecated-list.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 |Copyright © 2014. All rights reserved.
114 | 115 | 116 | -------------------------------------------------------------------------------- /docs/help-doc.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 |Each package has a page that contains a list of its classes and interfaces, with a summary for each. This page can contain six categories:
74 |Each class, interface, nested class and nested interface has its own separate page. Each of these pages has three sections consisting of a class/interface description, summary tables, and detailed member descriptions:
86 |Each summary entry contains the first sentence from the detailed description for that item. The summary entries are alphabetical, while the detailed descriptions are in the order they appear in the source code. This preserves the logical groupings established by the programmer.
106 |Each annotation type has its own separate page with the following sections:
110 |Each enum has its own separate page with the following sections:
121 |Each documented package, class and interface has its own Use page. This page describes what packages, classes, methods, constructors and fields use any part of the given class or package. Given a class or interface A, its Use page includes subclasses of A, fields declared as A, methods that return A, and methods and constructors with parameters of type A. You can access this page by first going to the package, class or interface, then clicking on the "Use" link in the navigation bar.
131 |There is a Class Hierarchy page for all packages, plus a hierarchy for each package. Each hierarchy page contains a list of classes and a list of interfaces. The classes are organized by inheritance structure starting with java.lang.Object
. The interfaces do not inherit from java.lang.Object
.
The Deprecated API page lists all of the API that have been deprecated. A deprecated API is not recommended for use, generally due to improvements, and a replacement API is usually given. Deprecated APIs may be removed in future implementations.
143 |The Index contains an alphabetic list of all classes, interfaces, constructors, methods, and fields.
147 |These links take you to the next or previous class, interface, package, or related page.
151 |These links show and hide the HTML frames. All pages are available with or without frames.
155 |The All Classes link shows all classes and interfaces except non-static nested types.
159 |Each serializable or externalizable class has a description of its serialization fields and methods. This information is of interest to re-implementors, not to developers using the API. While there is no link in the navigation bar, you can get to this information by going to any serialized class and clicking "Serialized Form" in the "See also" section of the class description.
163 |The Constant Field Values page lists the static final fields and their values.
167 |Copyright © 2014. All rights reserved.
215 | 216 | 217 | -------------------------------------------------------------------------------- /docs/index-all.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 |HLLCounter
and the one passed in.HLLCounter
87 | and all the ones passed in.HLLWritable
that contains a
92 | representation of combining its internal
93 | HLLCounter
's representation with
94 | the other's.HLLWritable
120 | is equivalent to the given Object
.p
to q
.HLLCounter
that is constructed
141 | from the internal representation of the HLLCounter
142 | that this HLLWritable
contains.HLLWritable
based on its
169 | internal structures.HLLCounter
allows for cardinality estimation of
174 | large sets with a compact data structure.HLLCounter
179 | that can be used to estimate the cardinality of a set
180 | of items with precision DEFAULT_P
.HLLCounter
object
185 | that can be used to estimate the cardinality of a set of items
186 | with given precision.HLLCounter
that can be used to
191 | estimate the cardinality of a set of items with
192 | DEFAULT_P
and, if intersectable
is
193 | true
, DEFAULT_K
.HLLCounter
that can be used to
198 | estimate the cardinality of a set of items with specified
199 | precision and, if intersectable
is
200 | true
, k
is a reasonable precision
201 | guess based on p
.HLLCounter
that can be used to
206 | estimate the cardinality of a set of items with
207 | DEFAULT_P
and, if intersectable
is
208 | true
, specified k
.HLLCounter
that can be used to
213 | estimate the cardinality of a set of items with specified
214 | precision and, if intersectable
is
215 | true
, specified k
.HLLCounter
that can be used to
220 | estimate the cardinality of a set of items with specified
221 | precision and, if intersectable
is
222 | true
, specified k
, along with
223 | pre-computed HLL and MinHash structures.HLLWritable
allows for serialization and
228 | deserialization of HLLCounter
objects in a
229 | Hadoop framework.HLLWritable
that contains a representation
234 | of the default HLLCounter
constructed by
235 | HLLCounter()
.HLLWritable
that contains a representation
240 | of the provided HLLCounter
.HLLWritable
with the given set of fields.HLLCounters
.HLLCounter
representation.HLLCounter
representation.HLLCounter
representation.HLLCounter
representation.HLLCounter
structure.HLLCounter
311 | structure.HLLWritable
322 | from the given DataInput
.HLLCounter
representation.HLLCounter
343 | in this HLLWritable
.HLLCounter
structure.String
representation of this
359 | HLLWritable
.HLLWritable
to the given
370 | DataOutput
.Copyright © 2014. All rights reserved.
419 | 420 | 421 | -------------------------------------------------------------------------------- /docs/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 |Copyright © 2014. All rights reserved.
128 | 129 | 130 | -------------------------------------------------------------------------------- /docs/package-list: -------------------------------------------------------------------------------- 1 | com.adroll.cantor 2 | -------------------------------------------------------------------------------- /docs/resources/background.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AdRoll/cantor/76b4e1b4fcca28e57e3a23cb6ea61fb428275442/docs/resources/background.gif -------------------------------------------------------------------------------- /docs/resources/tab.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AdRoll/cantor/76b4e1b4fcca28e57e3a23cb6ea61fb428275442/docs/resources/tab.gif -------------------------------------------------------------------------------- /docs/resources/titlebar.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AdRoll/cantor/76b4e1b4fcca28e57e3a23cb6ea61fb428275442/docs/resources/titlebar.gif -------------------------------------------------------------------------------- /docs/resources/titlebar_end.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AdRoll/cantor/76b4e1b4fcca28e57e3a23cb6ea61fb428275442/docs/resources/titlebar_end.gif -------------------------------------------------------------------------------- /docs/serialized-form.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 |byte p86 |
MIN_P <= p <= MAX_P
int m91 |
p
, length of HLL arraydouble a96 |
m
byte[] M102 |
boolean intersectable107 |
TreeSet<E> ts112 |
int k117 |
Copyright © 2014. All rights reserved.
172 | 173 | 174 | -------------------------------------------------------------------------------- /docs/stylesheet.css: -------------------------------------------------------------------------------- 1 | /* Javadoc style sheet */ 2 | /* 3 | Overall document style 4 | */ 5 | body { 6 | background-color:#ffffff; 7 | color:#353833; 8 | font-family:Arial, Helvetica, sans-serif; 9 | font-size:76%; 10 | margin:0; 11 | } 12 | a:link, a:visited { 13 | text-decoration:none; 14 | color:#4c6b87; 15 | } 16 | a:hover, a:focus { 17 | text-decoration:none; 18 | color:#bb7a2a; 19 | } 20 | a:active { 21 | text-decoration:none; 22 | color:#4c6b87; 23 | } 24 | a[name] { 25 | color:#353833; 26 | } 27 | a[name]:hover { 28 | text-decoration:none; 29 | color:#353833; 30 | } 31 | pre { 32 | font-size:1.3em; 33 | } 34 | h1 { 35 | font-size:1.8em; 36 | } 37 | h2 { 38 | font-size:1.5em; 39 | } 40 | h3 { 41 | font-size:1.4em; 42 | } 43 | h4 { 44 | font-size:1.3em; 45 | } 46 | h5 { 47 | font-size:1.2em; 48 | } 49 | h6 { 50 | font-size:1.1em; 51 | } 52 | ul { 53 | list-style-type:disc; 54 | } 55 | code, tt { 56 | font-size:1.2em; 57 | } 58 | dt code { 59 | font-size:1.2em; 60 | } 61 | table tr td dt code { 62 | font-size:1.2em; 63 | vertical-align:top; 64 | } 65 | sup { 66 | font-size:.6em; 67 | } 68 | /* 69 | Document title and Copyright styles 70 | */ 71 | .clear { 72 | clear:both; 73 | height:0px; 74 | overflow:hidden; 75 | } 76 | .aboutLanguage { 77 | float:right; 78 | padding:0px 21px; 79 | font-size:.8em; 80 | z-index:200; 81 | margin-top:-7px; 82 | } 83 | .legalCopy { 84 | margin-left:.5em; 85 | } 86 | .bar a, .bar a:link, .bar a:visited, .bar a:active { 87 | color:#FFFFFF; 88 | text-decoration:none; 89 | } 90 | .bar a:hover, .bar a:focus { 91 | color:#bb7a2a; 92 | } 93 | .tab { 94 | background-color:#0066FF; 95 | background-image:url(resources/titlebar.gif); 96 | background-position:left top; 97 | background-repeat:no-repeat; 98 | color:#ffffff; 99 | padding:8px; 100 | width:5em; 101 
| font-weight:bold; 102 | } 103 | /* 104 | Navigation bar styles 105 | */ 106 | .bar { 107 | background-image:url(resources/background.gif); 108 | background-repeat:repeat-x; 109 | color:#FFFFFF; 110 | padding:.8em .5em .4em .8em; 111 | height:auto;/*height:1.8em;*/ 112 | font-size:1em; 113 | margin:0; 114 | } 115 | .topNav { 116 | background-image:url(resources/background.gif); 117 | background-repeat:repeat-x; 118 | color:#FFFFFF; 119 | float:left; 120 | padding:0; 121 | width:100%; 122 | clear:right; 123 | height:2.8em; 124 | padding-top:10px; 125 | overflow:hidden; 126 | } 127 | .bottomNav { 128 | margin-top:10px; 129 | background-image:url(resources/background.gif); 130 | background-repeat:repeat-x; 131 | color:#FFFFFF; 132 | float:left; 133 | padding:0; 134 | width:100%; 135 | clear:right; 136 | height:2.8em; 137 | padding-top:10px; 138 | overflow:hidden; 139 | } 140 | .subNav { 141 | background-color:#dee3e9; 142 | border-bottom:1px solid #9eadc0; 143 | float:left; 144 | width:100%; 145 | overflow:hidden; 146 | } 147 | .subNav div { 148 | clear:left; 149 | float:left; 150 | padding:0 0 5px 6px; 151 | } 152 | ul.navList, ul.subNavList { 153 | float:left; 154 | margin:0 25px 0 0; 155 | padding:0; 156 | } 157 | ul.navList li{ 158 | list-style:none; 159 | float:left; 160 | padding:3px 6px; 161 | } 162 | ul.subNavList li{ 163 | list-style:none; 164 | float:left; 165 | font-size:90%; 166 | } 167 | .topNav a:link, .topNav a:active, .topNav a:visited, .bottomNav a:link, .bottomNav a:active, .bottomNav a:visited { 168 | color:#FFFFFF; 169 | text-decoration:none; 170 | } 171 | .topNav a:hover, .bottomNav a:hover { 172 | text-decoration:none; 173 | color:#bb7a2a; 174 | } 175 | .navBarCell1Rev { 176 | background-image:url(resources/tab.gif); 177 | background-color:#a88834; 178 | color:#FFFFFF; 179 | margin: auto 5px; 180 | border:1px solid #c9aa44; 181 | } 182 | /* 183 | Page header and footer styles 184 | */ 185 | .header, .footer { 186 | clear:both; 187 | margin:0 
20px; 188 | padding:5px 0 0 0; 189 | } 190 | .indexHeader { 191 | margin:10px; 192 | position:relative; 193 | } 194 | .indexHeader h1 { 195 | font-size:1.3em; 196 | } 197 | .title { 198 | color:#2c4557; 199 | margin:10px 0; 200 | } 201 | .subTitle { 202 | margin:5px 0 0 0; 203 | } 204 | .header ul { 205 | margin:0 0 25px 0; 206 | padding:0; 207 | } 208 | .footer ul { 209 | margin:20px 0 5px 0; 210 | } 211 | .header ul li, .footer ul li { 212 | list-style:none; 213 | font-size:1.2em; 214 | } 215 | /* 216 | Heading styles 217 | */ 218 | div.details ul.blockList ul.blockList ul.blockList li.blockList h4, div.details ul.blockList ul.blockList ul.blockListLast li.blockList h4 { 219 | background-color:#dee3e9; 220 | border-top:1px solid #9eadc0; 221 | border-bottom:1px solid #9eadc0; 222 | margin:0 0 6px -8px; 223 | padding:2px 5px; 224 | } 225 | ul.blockList ul.blockList ul.blockList li.blockList h3 { 226 | background-color:#dee3e9; 227 | border-top:1px solid #9eadc0; 228 | border-bottom:1px solid #9eadc0; 229 | margin:0 0 6px -8px; 230 | padding:2px 5px; 231 | } 232 | ul.blockList ul.blockList li.blockList h3 { 233 | padding:0; 234 | margin:15px 0; 235 | } 236 | ul.blockList li.blockList h2 { 237 | padding:0px 0 20px 0; 238 | } 239 | /* 240 | Page layout container styles 241 | */ 242 | .contentContainer, .sourceContainer, .classUseContainer, .serializedFormContainer, .constantValuesContainer { 243 | clear:both; 244 | padding:10px 20px; 245 | position:relative; 246 | } 247 | .indexContainer { 248 | margin:10px; 249 | position:relative; 250 | font-size:1.0em; 251 | } 252 | .indexContainer h2 { 253 | font-size:1.1em; 254 | padding:0 0 3px 0; 255 | } 256 | .indexContainer ul { 257 | margin:0; 258 | padding:0; 259 | } 260 | .indexContainer ul li { 261 | list-style:none; 262 | } 263 | .contentContainer .description dl dt, .contentContainer .details dl dt, .serializedFormContainer dl dt { 264 | font-size:1.1em; 265 | font-weight:bold; 266 | margin:10px 0 0 0; 267 | 
color:#4E4E4E; 268 | } 269 | .contentContainer .description dl dd, .contentContainer .details dl dd, .serializedFormContainer dl dd { 270 | margin:10px 0 10px 20px; 271 | } 272 | .serializedFormContainer dl.nameValue dt { 273 | margin-left:1px; 274 | font-size:1.1em; 275 | display:inline; 276 | font-weight:bold; 277 | } 278 | .serializedFormContainer dl.nameValue dd { 279 | margin:0 0 0 1px; 280 | font-size:1.1em; 281 | display:inline; 282 | } 283 | /* 284 | List styles 285 | */ 286 | ul.horizontal li { 287 | display:inline; 288 | font-size:0.9em; 289 | } 290 | ul.inheritance { 291 | margin:0; 292 | padding:0; 293 | } 294 | ul.inheritance li { 295 | display:inline; 296 | list-style:none; 297 | } 298 | ul.inheritance li ul.inheritance { 299 | margin-left:15px; 300 | padding-left:15px; 301 | padding-top:1px; 302 | } 303 | ul.blockList, ul.blockListLast { 304 | margin:10px 0 10px 0; 305 | padding:0; 306 | } 307 | ul.blockList li.blockList, ul.blockListLast li.blockList { 308 | list-style:none; 309 | margin-bottom:25px; 310 | } 311 | ul.blockList ul.blockList li.blockList, ul.blockList ul.blockListLast li.blockList { 312 | padding:0px 20px 5px 10px; 313 | border:1px solid #9eadc0; 314 | background-color:#f9f9f9; 315 | } 316 | ul.blockList ul.blockList ul.blockList li.blockList, ul.blockList ul.blockList ul.blockListLast li.blockList { 317 | padding:0 0 5px 8px; 318 | background-color:#ffffff; 319 | border:1px solid #9eadc0; 320 | border-top:none; 321 | } 322 | ul.blockList ul.blockList ul.blockList ul.blockList li.blockList { 323 | margin-left:0; 324 | padding-left:0; 325 | padding-bottom:15px; 326 | border:none; 327 | border-bottom:1px solid #9eadc0; 328 | } 329 | ul.blockList ul.blockList ul.blockList ul.blockList li.blockListLast { 330 | list-style:none; 331 | border-bottom:none; 332 | padding-bottom:0; 333 | } 334 | table tr td dl, table tr td dl dt, table tr td dl dd { 335 | margin-top:0; 336 | margin-bottom:1px; 337 | } 338 | /* 339 | Table styles 340 | */ 341 | 
.contentContainer table, .classUseContainer table, .constantValuesContainer table { 342 | border-bottom:1px solid #9eadc0; 343 | width:100%; 344 | } 345 | .contentContainer ul li table, .classUseContainer ul li table, .constantValuesContainer ul li table { 346 | width:100%; 347 | } 348 | .contentContainer .description table, .contentContainer .details table { 349 | border-bottom:none; 350 | } 351 | .contentContainer ul li table th.colOne, .contentContainer ul li table th.colFirst, .contentContainer ul li table th.colLast, .classUseContainer ul li table th, .constantValuesContainer ul li table th, .contentContainer ul li table td.colOne, .contentContainer ul li table td.colFirst, .contentContainer ul li table td.colLast, .classUseContainer ul li table td, .constantValuesContainer ul li table td{ 352 | vertical-align:top; 353 | padding-right:20px; 354 | } 355 | .contentContainer ul li table th.colLast, .classUseContainer ul li table th.colLast,.constantValuesContainer ul li table th.colLast, 356 | .contentContainer ul li table td.colLast, .classUseContainer ul li table td.colLast,.constantValuesContainer ul li table td.colLast, 357 | .contentContainer ul li table th.colOne, .classUseContainer ul li table th.colOne, 358 | .contentContainer ul li table td.colOne, .classUseContainer ul li table td.colOne { 359 | padding-right:3px; 360 | } 361 | .overviewSummary caption, .packageSummary caption, .contentContainer ul.blockList li.blockList caption, .summary caption, .classUseContainer caption, .constantValuesContainer caption { 362 | position:relative; 363 | text-align:left; 364 | background-repeat:no-repeat; 365 | color:#FFFFFF; 366 | font-weight:bold; 367 | clear:none; 368 | overflow:hidden; 369 | padding:0px; 370 | margin:0px; 371 | } 372 | caption a:link, caption a:hover, caption a:active, caption a:visited { 373 | color:#FFFFFF; 374 | } 375 | .overviewSummary caption span, .packageSummary caption span, .contentContainer ul.blockList li.blockList caption span, 
.summary caption span, .classUseContainer caption span, .constantValuesContainer caption span { 376 | white-space:nowrap; 377 | padding-top:8px; 378 | padding-left:8px; 379 | display:block; 380 | float:left; 381 | background-image:url(resources/titlebar.gif); 382 | height:18px; 383 | } 384 | .overviewSummary .tabEnd, .packageSummary .tabEnd, .contentContainer ul.blockList li.blockList .tabEnd, .summary .tabEnd, .classUseContainer .tabEnd, .constantValuesContainer .tabEnd { 385 | width:10px; 386 | background-image:url(resources/titlebar_end.gif); 387 | background-repeat:no-repeat; 388 | background-position:top right; 389 | position:relative; 390 | float:left; 391 | } 392 | ul.blockList ul.blockList li.blockList table { 393 | margin:0 0 12px 0px; 394 | width:100%; 395 | } 396 | .tableSubHeadingColor { 397 | background-color: #EEEEFF; 398 | } 399 | .altColor { 400 | background-color:#eeeeef; 401 | } 402 | .rowColor { 403 | background-color:#ffffff; 404 | } 405 | .overviewSummary td, .packageSummary td, .contentContainer ul.blockList li.blockList td, .summary td, .classUseContainer td, .constantValuesContainer td { 406 | text-align:left; 407 | padding:3px 3px 3px 7px; 408 | } 409 | th.colFirst, th.colLast, th.colOne, .constantValuesContainer th { 410 | background:#dee3e9; 411 | border-top:1px solid #9eadc0; 412 | border-bottom:1px solid #9eadc0; 413 | text-align:left; 414 | padding:3px 3px 3px 7px; 415 | } 416 | td.colOne a:link, td.colOne a:active, td.colOne a:visited, td.colOne a:hover, td.colFirst a:link, td.colFirst a:active, td.colFirst a:visited, td.colFirst a:hover, td.colLast a:link, td.colLast a:active, td.colLast a:visited, td.colLast a:hover, .constantValuesContainer td a:link, .constantValuesContainer td a:active, .constantValuesContainer td a:visited, .constantValuesContainer td a:hover { 417 | font-weight:bold; 418 | } 419 | td.colFirst, th.colFirst { 420 | border-left:1px solid #9eadc0; 421 | white-space:nowrap; 422 | } 423 | td.colLast, th.colLast { 424 
| border-right:1px solid #9eadc0; 425 | } 426 | td.colOne, th.colOne { 427 | border-right:1px solid #9eadc0; 428 | border-left:1px solid #9eadc0; 429 | } 430 | table.overviewSummary { 431 | padding:0px; 432 | margin-left:0px; 433 | } 434 | table.overviewSummary td.colFirst, table.overviewSummary th.colFirst, 435 | table.overviewSummary td.colOne, table.overviewSummary th.colOne { 436 | width:25%; 437 | vertical-align:middle; 438 | } 439 | table.packageSummary td.colFirst, table.overviewSummary th.colFirst { 440 | width:25%; 441 | vertical-align:middle; 442 | } 443 | /* 444 | Content styles 445 | */ 446 | .description pre { 447 | margin-top:0; 448 | } 449 | .deprecatedContent { 450 | margin:0; 451 | padding:10px 0; 452 | } 453 | .docSummary { 454 | padding:0; 455 | } 456 | /* 457 | Formatting effect styles 458 | */ 459 | .sourceLineNo { 460 | color:green; 461 | padding:0 30px 0 0; 462 | } 463 | h1.hidden { 464 | visibility:hidden; 465 | overflow:hidden; 466 | font-size:.9em; 467 | } 468 | .block { 469 | display:block; 470 | margin:3px 0 0 0; 471 | } 472 | .strong { 473 | font-weight:bold; 474 | } 475 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 |HLLWritable
allows for serialization and
17 | deserialization of {@link HLLCounter} objects in a
18 | Hadoop framework.
19 | */
20 | public class HLLWritable implements Writable {
21 |
22 | private static final Logger LOG = LoggerFactory.getLogger(HLLWritable.class);
23 |
24 | /** The HLL precision of the contained {@code HLLCounter} representation.
25 | {@link HLLCounter#MIN_P} <= {@code p} <= {@link HLLCounter#MAX_P}.
26 | */
27 | protected byte p;
28 | /** The MinHash precision of the contained {@code HLLCounter} representation. */
29 | protected int k;
30 | /** The number of current elements in the MinHash structure
31 | of the contained {@code HLLCounter} representation. */
32 | protected int s;
33 | /** The HLL structure of the contained {@code HLLCounter} representation. */
34 | protected byte[] M;
35 | /** The contents of the MinHash structure of the contained
36 | {@code HLLCounter} representation. */
37 | protected long[] minhash;
38 |
39 | /**
40 | Constructs an {@code HLLWritable} that contains a representation
41 | of the default {@code HLLCounter} constructed by
42 | {@link HLLCounter#HLLCounter()}. Delegates to {@link #set(HLLCounter)}.
43 | */
44 | public HLLWritable() {
45 | set(new HLLCounter());
46 | }
47 |
48 | /**
49 | Constructs an {@code HLLWritable} that contains a representation
50 | of the provided {@code HLLCounter}. Delegates to {@link #set(HLLCounter)}.
51 |
52 | @param h the {@code HLLCounter} to represent and contain
53 | */
54 | public HLLWritable(HLLCounter h) {
55 | set(h);
56 | }
57 |
58 | /**
59 | Constructs an {@code HLLWritable} with the given set of fields.
60 | The values are stored as given: nothing is validated and the arrays are kept by reference, not copied.
61 | @param p the {@code byte} precision of the HLL
62 | structure. {@link HLLCounter#MIN_P} <= {@code p}
63 | <= {@link HLLCounter#MAX_P}.
64 | @param k the {@code int} precision of the MinHash
65 | structure
66 | @param s the {@code int} number of elements in the
67 | MinHash structure
68 | @param M the {@code byte[]} HLL structure
69 | @param minhash the {@code long[]} elements in the MinHash
70 | structure
71 | */
72 | public HLLWritable(byte p, int k, int s, byte[] M, long[] minhash){
73 | this.p = p;
74 | this.k = k;
75 | this.s = s;
76 | this.M = M;
77 | this.minhash = minhash;
78 | }
79 |
80 |     /**
81 |        Makes this {@code HLLWritable} represent the given
82 |        {@code HLLCounter}, overwriting all of its fields.
83 |
84 |        @param h the {@code HLLCounter} to represent and contain
85 |     */
86 |     public void set(HLLCounter h) {
87 |         p = h.getP();
88 |         M = h.getByteArray();
89 |         k = h.getK();
90 |         // s is the MinHash size, or 0 when the counter is not intersectable.
91 |         s = h.isIntersectable() ? h.getMinHash().size() : 0;
92 |         // Keep the current buffer when it already has exactly s slots.
93 |         boolean reusable = minhash != null && minhash.length == s;
94 |         if (!reusable) {
95 |             minhash = new long[s];
96 |         }
97 |         if (h.getMinHash() == null) {
98 |             return;
99 |         }
100 |         int next = 0;
101 |         for (long hash : h.getMinHash()) {
102 |             minhash[next++] = hash;
103 |         }
104 |     }
106 |
107 | /**
108 | Returns a new HLLCounter
that is constructed
109 | from the internal representation of the HLLCounter
110 | that this HLLWritable
contains.
111 |
112 | @return the HLLCounter
this HLLWritable
113 | represents.
114 | */
115 | public HLLCounter get() {
116 | TreeSetHLLWritable
that contains a
126 | representation of combining its internal
127 | HLLCounter
's representation with
128 | the other's.
129 |
130 | It is functionally equivalent to combining two
131 | HLLCounter
s
132 | ({@link HLLCounter#combine(HLLCounter h)}) and creating a
133 | new HLLWritable
out of that.
134 |
135 | Returns null
if the combination fails.
136 |
137 | @param other the HLLWritable
to combine
138 | @return the HLLWritable
that represents
139 | the union, null
if fails,
140 | this
if other
141 | is null
.
142 | */
143 |     public HLLWritable combine(HLLWritable other){
144 |         // Nothing to merge with: hand back this object unchanged.
145 |         if (other == null) {
146 |             return this;
147 |         }
148 |
149 |         // The result takes the smaller p and the smaller k of the two operands.
150 |         final byte unionP = (byte) Math.min(p, other.p);
151 |         final int unionK = Math.min(k, other.k);
152 |         final byte[] unionM = HLLCounter.safeUnion(M, other.M);
153 |         // Room for at most unionK hashes; trimmed below when fewer are found.
154 |         long[] unionHashes = new long[unionK];
155 |         int mine = 0;
156 |         int theirs = 0;
157 |         int filled = 0;
158 |
159 |         try {
160 |             if (unionK > 0) {
161 |                 // Two-way merge of both minhash arrays (assumed to be sorted
162 |                 // ascending -- TODO confirm against HLLCounter), stopping once
163 |                 // unionK values are collected or both inputs are used up.
164 |                 while (filled < unionK && (mine < s || theirs < other.s)) {
165 |                     if (theirs >= other.s) {
166 |                         // Only this side has values left.
167 |                         unionHashes[filled] = minhash[mine++];
168 |                     } else if (mine >= s) {
169 |                         unionHashes[filled] = other.minhash[theirs++];
170 |                     } else {
171 |                         final long left = minhash[mine];
172 |                         final long right = other.minhash[theirs];
173 |                         unionHashes[filled] = Math.min(left, right);
174 |                         // On a tie both cursors move, so the value is kept once.
175 |                         if (left <= right) {
176 |                             mine++;
177 |                         }
178 |                         if (right <= left) {
179 |                             theirs++;
180 |                         }
181 |                     }
182 |                     filled++;
183 |                 }
184 |                 // Drop the unused tail so no trailing zeros are kept around.
185 |                 if (filled < unionK) {
186 |                     unionHashes = Arrays.copyOf(unionHashes, filled);
187 |                 }
188 |             }
189 |             return new HLLWritable(unionP, unionK, filled, unionM, unionHashes);
190 |         } catch (Exception e){
191 |             LOG.error("Failed combining", e);
192 |             return null;
193 |         }
194 |     }
196 |
197 | // Writable
198 |     /**
199 |        Serializes this {@code HLLWritable} to the given
200 |        {@link java.io.DataOutput}.
201 |
202 |        Generally, this method should not be called on its own.
203 |
204 |        While the MinHash holds fewer than {@code k} values, {@code M} is
205 |        left out and {@code -p} is written in place of {@code p} to flag that.
206 |
207 |        @param out the {@code DataOutput} object to write to
208 |        @throws IOException if {@code out} fails, or if the fields cannot
209 |                be written (for example a missing or too-short array)
210 |     */
211 |     public void write(DataOutput out) throws IOException {
212 |         // With fewer than k MinHash values M is redundant, so it is skipped.
213 |         final boolean omitM = s < k;
214 |         try {
215 |             // A negated p signals that no M follows.
216 |             out.writeByte(omitM ? -p : p);
217 |             out.writeInt(k);
218 |             out.writeInt(s);
219 |             if (!omitM) {
220 |                 out.write(M);
221 |             }
222 |             for (int i = 0; i < s; i++) {
223 |                 out.writeLong(minhash[i]);
224 |             }
225 |         } catch (RuntimeException e) {
226 |             // Report broken state as a failed write instead of silently
227 |             // leaving a truncated record in the stream.
228 |             throw new IOException("Failed writing", e);
229 |         }
230 |     }
232 |
233 |     /**
234 |        Deserializes the fields of this {@code HLLWritable} from the given
235 |        {@link java.io.DataInput}. Generally, this method should not be called on its own.
236 |
237 |        A negative precision byte marks a record written without {@code M};
238 |        {@code M} is then rebuilt from the MinHash values that follow.
239 |
240 |        @param in the {@code DataInput} to read from
241 |        @throws IOException if reading fails or the input ends early (passed
242 |                through unchanged), or if the record is malformed, e.g. a
243 |                precision outside {@code HLLCounter.MIN_P}..{@code HLLCounter.MAX_P}
244 |     */
245 |     public void readFields(DataInput in) throws IOException {
246 |         try {
247 |             final byte rawP = in.readByte();
248 |             k = in.readInt();
249 |             s = in.readInt();
250 |             if (k == 0) {
251 |                 s = 0;
252 |             }
253 |             // A negative precision byte means no M was written.
254 |             final boolean hasM = rawP >= 0;
255 |             p = hasM ? rawP : (byte) -rawP;
256 |             // p sizes M and serves as a shift distance, so reject corrupt values early.
257 |             if (p < HLLCounter.MIN_P || p > HLLCounter.MAX_P) {
258 |                 throw new IOException("Invalid HLL precision: " + rawP);
259 |             }
260 |             M = new byte[1 << p];
261 |             if (hasM) {
262 |                 in.readFully(M);
263 |             }
264 |             minhash = new long[s];
265 |             for (int i = 0; i < s; i++) {
266 |                 final long x = in.readLong();
267 |                 minhash[i] = x;
268 |                 // Fold every hash back into M: this rebuilds M when it was omitted,
269 |                 // and changes nothing (it is just a max) when M was read.
270 |                 final int idx = (int) (x >>> (64 - p));
271 |                 final long w = x << p;
272 |                 M[idx] = (byte) Math.max(M[idx], Long.numberOfLeadingZeros(w) + 1);
273 |             }
274 |         } catch (RuntimeException e) {
275 |             // e.g. a negative s read from corrupt data
276 |             throw new IOException(e);
277 |         }
278 |     }
280 |
281 | /**
282 | Hashes this HLLWritable
based on its
283 | internal structures.
284 |
285 | @return the int
hash value
286 | */
287 | @Override
288 | public int hashCode() {
289 | final int prime = 31;
290 | int result = 1;
291 | result = prime * result + Arrays.hashCode(M);
292 | result = prime * result + k;
293 | result = prime * result + Arrays.hashCode(minhash);
294 | result = prime * result + p;
295 | result = prime * result + s;
296 | return result;
297 | }
298 |
299 |     /**
300 |        Returns whether this {@code HLLWritable}
301 |        is equivalent to the given {@code Object}.
302 |
303 |        The two are equivalent only when the argument is an
304 |        object of exactly the same class and all five fields
305 |        ({@code p}, {@code k}, {@code s}, {@code M} and
306 |        {@code minhash}) match, that is, the two
307 |        {@code HLLCounters} likely saw the exact same data.
308 |
309 |        @param obj the {@code Object} to compare to
310 |
311 |        @return the {@code boolean} of the comparison
312 |     */
313 |     @Override
314 |     public boolean equals(Object obj) {
315 |         if (this == obj) {
316 |             return true;
317 |         }
318 |         // Null and instances of any other class (subclasses included) never match.
319 |         if (obj == null || getClass() != obj.getClass()) {
320 |             return false;
321 |         }
322 |         final HLLWritable that = (HLLWritable) obj;
323 |         // Scalar fields first, element-wise array comparisons last.
324 |         return p == that.p
325 |             && k == that.k
326 |             && s == that.s
327 |             && Arrays.equals(M, that.M)
328 |             && Arrays.equals(minhash, that.minhash);
329 |     }
342 |
343 |     /**
344 |        Returns a {@code String} representation of this {@code HLLWritable}
345 |        that encodes the {@code p}, {@code k}, and {@code s} fields.
346 |
347 |        @return the {@code String} representation
348 |     */
349 |     @Override
350 |     public String toString() {
351 |         final StringBuilder text = new StringBuilder("HLLWritable [p=");
352 |         text.append(p).append(", k=").append(k);
353 |         text.append(", s=").append(s).append(']');
354 |         return text.toString();
355 |     }
356 | }
357 |
--------------------------------------------------------------------------------
/src/main/java/com/adroll/cantor/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | Cantor provides utilities for estimating the cardinality
3 | of large sets.
4 |
5 | The algorithms herein are parallelizable, and a Hadoop 6 | wrapper class is provided for convenience. 7 |
8 | It employs most of the HyperLogLog++ algorithm as seen in 9 | 10 | this paper, excluding the sparse scheme, and using 11 | a simple linear interpolation instead of kNN. In addition, 12 | it can use MinHash structures to estimate cardinalities of 13 | intersections of these sets, as described in 14 | 15 | this blog post. 16 |
17 | Both HyperLogLog and MinHash require a precision
18 | parameter. Basic guidelines are available as follows,
19 | and {@link com.adroll.cantor.HLLCounter#MIN_P} = 4 <= p <= 18 =
20 | {@link com.adroll.cantor.HLLCounter#MAX_P}.
21 |
22 |
25 |
|
45 |
46 |
|
56 |