├── LICENSE ├── README.markdown ├── RELEASE_NOTES.markdown ├── pom.xml └── src ├── main └── java │ └── net │ └── agkn │ └── hll │ ├── HLL.java │ ├── HLLType.java │ ├── serialization │ ├── BigEndianAscendingWordDeserializer.java │ ├── BigEndianAscendingWordSerializer.java │ ├── HLLMetadata.java │ ├── IHLLMetadata.java │ ├── ISchemaVersion.java │ ├── IWordDeserializer.java │ ├── IWordSerializer.java │ ├── SchemaVersionOne.java │ └── SerializationUtil.java │ └── util │ ├── BitUtil.java │ ├── BitVector.java │ ├── HLLUtil.java │ ├── LongIterator.java │ └── NumberUtil.java └── test └── java └── net └── agkn └── hll ├── ExplicitHLLTest.java ├── FullHLLTest.java ├── IntegrationTestGenerator.java ├── ProbabilisticTestUtil.java ├── SparseHLLTest.java ├── serialization ├── BigEndianAscendingWordDeserializerTest.java ├── BigEndianAscendingWordSerializerTest.java └── HLLSerializationTest.java └── util ├── BitVectorTest.java └── HLLUtilTest.java /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | 3 | Version 2.0, January 2004 4 | 5 | http://www.apache.org/licenses/ 6 | 7 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 8 | 9 | 1. Definitions. 10 | 11 | "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. 16 | 17 | "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. 18 | 19 | "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. 20 | 21 | "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. 22 | 23 | "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). 24 | 25 | "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 26 | 27 | "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." 28 | 29 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 30 | 31 | 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 32 | 33 | 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 34 | 35 | 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: 36 | 37 | You must give any other recipients of the Work or Derivative Works a copy of this License; and 38 | 39 | You must cause any modified files to carry prominent notices stating that You changed the files; and 40 | 41 | You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and 42 | 43 | If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 44 | 45 | 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 46 | 47 | 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 48 | 49 | 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 50 | 51 | 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 52 | 53 | 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. 54 | 55 | END OF TERMS AND CONDITIONS 56 | 57 | APPENDIX: How to apply the Apache License to your work 58 | To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. 59 | 60 | Copyright 2013 Aggregate Knowledge, Inc. 61 | 62 | Licensed under the Apache License, Version 2.0 (the "License"); 63 | you may not use this file except in compliance with the License. 64 | You may obtain a copy of the License at 65 | 66 | http://www.apache.org/licenses/LICENSE-2.0 67 | 68 | Unless required by applicable law or agreed to in writing, software 69 | distributed under the License is distributed on an "AS IS" BASIS, 70 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 71 | See the License for the specific language governing permissions and 72 | limitations under the License. -------------------------------------------------------------------------------- /README.markdown: -------------------------------------------------------------------------------- 1 | java-hll 2 | ======== 3 | 4 | A Java implementation of [HyperLogLog](http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf) whose goal is to be [storage-compatible](https://github.com/aggregateknowledge/hll-storage-spec) with other similar offerings from [Aggregate Knowledge](http://blog.aggregateknowledge.com/). 5 | 6 | 7 | **NOTE:** This implementation fully implements reading and writing all formats in the [v1.0.0 storage specification](https://github.com/aggregateknowledge/hll-storage-spec/blob/v1.0.0/STORAGE.md), but internal memory representation (and hence space-tradeoffs) may cause automatic "promotion" between representations to occur at different implementation-dependent points. To ensure interoperability between, for example, the [PostgreSQL implementation](https://github.com/aggregateknowledge/postgresql-hll) and this library, all promotion cutoffs should be explicitly defined. 8 | 9 | Similarly, certain parameters have different bounds in order to deal with VM limitations like maximum array length. Specifically, `log2m` has a maximum value of 30 in this implementation whereas the storage specification states a maximum value of 31 (which can be realized in the PostgreSQL implementation). 10 | 11 | Overview 12 | -------- 13 | 14 | HyperLogLog (HLL) is a fixed-size, set-like structure used for distinct value counting with tunable precision. For example, in 1280 bytes HLL can estimate the count of tens of billions of distinct values with only a few percent error. 15 | 16 | In addition to the algorithm proposed in the [original paper](http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf), this implementation is augmented to improve its accuracy and memory use without sacrificing much speed. See below for more details. 17 | 18 | Algorithms 19 | ---------- 20 | 21 | A `hll` is a combination of different set/distinct-value-counting algorithms that can be thought of as a hierarchy, along with rules for moving up that hierarchy. In order to distinguish between said algorithms, we have given them names: 22 | 23 | ### `EMPTY` ### 24 | A constant value that denotes the empty set. 25 | 26 | ### `EXPLICIT` ### 27 | An explicit, unique, sorted list of integers in the set, which is maintained up to a fixed cardinality. 28 | 29 | ### `SPARSE` ### 30 | A 'lazy', map-based implementation of HyperLogLog, a probabilistic set data structure. Only stores the indices and values of non-zero registers in a map, until the number of non-zero registers exceeds a fixed cardinality. 31 | 32 | ### `FULL` ### 33 | A fully-materialized, list-based implementation of HyperLogLog. Explicitly stores the value of every register in a list ordered by register index. 34 | 35 | Motivation 36 | ---------- 37 | 38 | Our motivation for augmenting the original HLL algorithm went something like this: 39 | 40 | * Naively, a HLL takes `regwidth * 2^log2m` bits to store. 41 | * In typical usage, `log2m = 11` and `regwidth = 5`, it requires 10,240 bits or 1,280 bytes. 42 | * That's a lot of bytes! 43 | 44 | The first addition to the original HLL algorithm came from realizing that 1,280 bytes is the size of 160 64-bit integers. So, if we wanted more accuracy at low cardinalities, we could just keep an explicit set of the inputs as a sorted list of 64-bit integers until we hit the 161st distinct value. This would give us the true representation of the distinct values in the stream while requiring the same amount of memory. (This is the `EXPLICIT` algorithm.) 45 | 46 | The second came from the realization that we didn't need to store registers whose value was zero. We could simply represent the set of registers that had non-zero values as a map from index to values. This is map is stored as a list of index-value pairs that are bit-packed "short words" of length `log2m + regwidth`. (This is the `SPARSE` algorithm.) 47 | 48 | Combining these two augmentations, we get a "promotion hierarchy" that allows the algorithm to be tuned for better accuracy, memory, or performance. 49 | 50 | Initializing and storing a new `hll` object will simply allocate a small sentinel value symbolizing the empty set (`EMPTY`). When you add the first few values, a sorted list of unique integers is stored in an `EXPLICIT` set. When you wish to cease trading off accuracy for memory, the values in the sorted list are "promoted" to a `SPARSE` map-based HyperLogLog structure. Finally, when there are enough registers, the map-based HLL will be converted to a bit-packed `FULL` HLL structure. 51 | 52 | Empirically, the insertion rate of `EMPTY`, `EXPLICIT`, and `SPARSE` representations is measured in 200k/s - 300k/s range, while the throughput of the `FULL` representation is in the millions of inserts per second on relatively new hardware ('10 Xeon). 53 | 54 | Naturally, the cardinality estimates of the `EMPTY` and `EXPLICIT` representations is exact, while the `SPARSE` and `FULL` representations' accuracies are governed by the guarantees provided by the original HLL algorithm. 55 | 56 | * * * * * * * * * * * * * * * * * * * * * * * * * 57 | 58 | 59 | The Importance of Hashing 60 | ========================= 61 | 62 | In brief, it is absolutely crucial to hash inputs to an HLL. A close approximation of uniform randomness in the inputs ensures that the error guarantees laid out in the original paper hold. We've empirically determined that [MurmurHash 3](http://guava-libraries.googlecode.com/git/guava/src/com/google/common/hash/Murmur3_128HashFunction.java), from Google's Guava, is an excellent and fast hash function to use in conjunction with `java-hll` module. 63 | 64 | The seed to the hash call must remain constant for all inputs to a given HLL. Similarly, if one plans to compute the union of two HLLs, the input values must have been hashed using the same seed. 65 | 66 | For a good overview of the importance of hashing and hash functions when using probabilistic algorithms as well as an analysis of MurmurHash 3, refer to these blog posts: 67 | 68 | * [K-Minimum Values: Sketching Error, Hash Functions, and You](http://blog.aggregateknowledge.com/2012/08/20/k-minimum-values-sketching-error-hash-functions-and-you/) 69 | * [Choosing a Good Hash Function, Part 1](http://blog.aggregateknowledge.com/2011/12/05/choosing-a-good-hash-function-part-1/) 70 | * [Choosing a Good Hash Function, Part 2](http://blog.aggregateknowledge.com/2011/12/29/choosing-a-good-hash-function-part-2/) 71 | * [Choosing a Good Hash Function, Part 3](http://blog.aggregateknowledge.com/2012/02/02/choosing-a-good-hash-function-part-3/) 72 | 73 | 74 | On Unions and Intersections 75 | =========================== 76 | 77 | HLLs have the useful property that the union of any number of HLLs is equal to the HLL that would have been populated by playing back all inputs to those '_n_' HLLs into a single HLL. Colloquially, one can say that HLLs have "lossless" unions because the same cardinality error guarantees that apply to a single HLL apply to a union of HLLs. See the `union()` function. 78 | 79 | Using the [inclusion-exclusion principle](http://en.wikipedia.org/wiki/Inclusion%E2%80%93exclusion_principle) and the `union()` function, one can also estimate the intersection of sets represented by HLLs. Note, however, that error is proportional to the union of the two HLLs, while the result can be significantly smaller than the union, leading to disproportionately large error relative to the actual intersection cardinality. For instance, if one HLL has a cardinality of 1 billion, while the other has a cardinality of 10 million, with an overlap of 5 million, the intersection cardinality can easily be dwarfed by even a 1% error estimate in the larger HLLs cardinality. 80 | 81 | For more information on HLL intersections, see [this blog post](http://blog.aggregateknowledge.com/2012/12/17/hll-intersections-2/). 82 | 83 | Usage 84 | ===== 85 | 86 | HLL is available in Maven Central. Include it in your project with: 87 | 88 | ```xml 89 | 90 | net.agkn 91 | hll 92 | 1.6.0 93 | 94 | ``` 95 | 96 | 97 | Hashing and adding a value to a new HLL: 98 | 99 | ```java 100 | final int seed = 123456; 101 | final Murmur3_128HashFunction hash = new Murmur3_128HashFunction(seed); 102 | final Hasher hasher = hash.newHasher(); 103 | hasher.putLong(1L/*value to hash*/); 104 | 105 | final long hashedValue = hasher.hash().asLong(); 106 | 107 | final HLL hll = new HLL(13/*log2m*/, 5/*registerWidth*/); 108 | hll.addRaw(hashedValue); 109 | ``` 110 | 111 | Retrieving the cardinality of an HLL: 112 | 113 | ```java 114 | final long cardinality = hll.cardinality(); 115 | ``` 116 | 117 | Unioning two HLLs together (and retrieving the resulting cardinality): 118 | 119 | ```java 120 | final HLL hll1 = new HLL(13/*log2m*/, 5/*registerWidth*/); 121 | final HLL hll2 = new HLL(13/*log2m*/, 5/*registerWidth*/); 122 | 123 | // ... (add values to both sets) ... 124 | 125 | hll1.union(hll2)/*modifies hll1 to contain the union*/; 126 | final long cardinalityUnion = hll1.cardinality(); 127 | ``` 128 | 129 | Reading an HLL from a hex representation of [storage specification, v1.0.0](https://github.com/aggregateknowledge/hll-storage-spec/blob/v1.0.0/STORAGE.md) (for example, retrieved from a [PostgreSQL database](https://github.com/aggregateknowledge/postgresql-hll)): 130 | 131 | ```java 132 | final HLL hll = HLL.fromBytes(NumberUtil.fromHex(hexString)); 133 | ``` 134 | 135 | Writing an HLL to its hex representation of [storage specification, v1.0.0](https://github.com/aggregateknowledge/hll-storage-spec/blob/v1.0.0/STORAGE.md) (for example, to be inserted into a [PostgreSQL database](https://github.com/aggregateknowledge/postgresql-hll)): 136 | 137 | ```java 138 | final byte[] bytes = hll.toBytes(); 139 | final String output = "\\x" + NumberUtil.toHex(bytes, 0, bytes.length) 140 | ``` 141 | 142 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * 143 | 144 | Building 145 | -------- 146 | 147 | * Requires [Maven 2.0](http://maven.apache.org/) 148 | * `mvn clean package` in the base directory 149 | 150 | A `target` directory will be created and a jar containing the library will be created therein. 151 | 152 | 153 | Testing 154 | ------- 155 | 156 | * `mvn test` in the base directory. 157 | -------------------------------------------------------------------------------- /RELEASE_NOTES.markdown: -------------------------------------------------------------------------------- 1 | v1.6.0 - Jul 29, 2014 2 | --------------------- 3 | * Added support for registering schema versions. 4 | 5 | v1.5.2 - Jul 16, 2014 6 | --------------------- 7 | * Fixed #10: Long overflow bug in `TWO_TO_L` calculation when `regwidth = 6`. 8 | 9 | v1.5.1 - Feb 26, 2014 10 | --------------------- 11 | * Fixed serialization compabitility issue. `expthresh` was not being decoded properly. 12 | 13 | v1.5.0 - Feb 21, 2014 14 | --------------------- 15 | * Fixed #5: Added HLL#clone(). 16 | 17 | v1.4.0 - Feb 04, 2014 18 | --------------------- 19 | * Fixed #4: lowered JDK requirement to 1.6 from 1.7. 20 | 21 | v1.3.0 - Jan 31, 2014 22 | --------------------- 23 | * Fixed #3: added new, simple HLL constructor. 24 | 25 | v1.2.1 - Jan 31, 2014 26 | --------------------- 27 | * Fixed #2: fix HLL when `log2m * regwidth` is small. 28 | 29 | v1.2.0 - Jan 17, 2014 30 | --------------------- 31 | * Reworked pom for Maven Central publishing, via Sonnatype. 32 | 33 | v1.1.0 - Jan 10, 2014 34 | --------------------- 35 | * Documentation fixes. 36 | * Added parameter checking in HLL constructor. 37 | 38 | v1.0.0 - Dec 22, 2013 39 | --------------------- 40 | * Initial public release. -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 4 | 4.0.0 5 | net.agkn 6 | hll 7 | jar 8 | HyperLogLog: approximate distinct value counting algoritm 9 | https://github.com/aggregateknowledge/java-hll 10 | 1.6.0 11 | HyperLogLog in Java 12 | 13 | 14 | The Apache Software License, Version 2.0 15 | http://www.apache.org/licenses/LICENSE-2.0.txt 16 | repo 17 | 18 | 19 | 20 | scm:git:git@github.com:aggregateknowledge/java-hll.git 21 | scm:git:git@github.com:aggregateknowledge/java-hll.git 22 | scm:git:git@github.com:aggregateknowledge/java-hll.git 23 | 24 | 25 | 26 | timonk 27 | Timon Karnezos 28 | timon.karnezos@neustar.biz 29 | 30 | 31 | 32 | 33 | 34 | ${project.artifactId}-${project.version} 35 | 36 | 37 | 38 | org.apache.maven.plugins 39 | maven-compiler-plugin 40 | 3.1 41 | 42 | 1.6 43 | 1.6 44 | 45 | 46 | 47 | 48 | org.apache.maven.plugins 49 | maven-source-plugin 50 | 2.2.1 51 | 52 | 53 | attach-sources 54 | 55 | jar 56 | 57 | 58 | 59 | 60 | 61 | 62 | org.apache.maven.plugins 63 | maven-javadoc-plugin 64 | 2.9.1 65 | 66 | 67 | attach-javadocs 68 | 69 | jar 70 | 71 | 72 | 73 | 74 | 75 | 76 | org.apache.maven.plugins 77 | maven-gpg-plugin 78 | 79 | 80 | sign-artifacts 81 | verify 82 | 83 | sign 84 | 85 | 86 | 87 | 88 | 89 | 90 | org.apache.maven.plugins 91 | maven-surefire-plugin 92 | 2.16 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | release-sign-artifacts 102 | 103 | 104 | performRelease 105 | true 106 | 107 | 108 | 109 | 110 | 111 | org.apache.maven.plugins 112 | maven-gpg-plugin 113 | 1.4 114 | 115 | 116 | sign-artifacts 117 | verify 118 | 119 | sign 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | sonatype-nexus-snapshots 132 | https://oss.sonatype.org/content/repositories/snapshots/ 133 | 134 | 135 | sonatype-nexus-staging 136 | https://oss.sonatype.org/service/local/staging/deploy/maven2/ 137 | 138 | 139 | 140 | 141 | org.sonatype.oss 142 | oss-parent 143 | 7 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | it.unimi.dsi 152 | fastutil 153 | ${fastutil-version} 154 | 155 | 156 | 157 | 158 | org.easymock 159 | easymock 160 | ${easymock-version} 161 | test 162 | 163 | 164 | org.powermock 165 | powermock-module-junit4 166 | ${powermock-version} 167 | test 168 | 169 | 170 | org.powermock 171 | powermock-api-easymock 172 | ${powermock-version} 173 | test 174 | 175 | 176 | 177 | org.testng 178 | testng 179 | ${testng-version} 180 | test 181 | jdk15 182 | 183 | 184 | 185 | 186 | 187 | 188 | 1.8 189 | 190 | 191 | 3.0 192 | 1.4.8 193 | 5.7 194 | 6.5.11 195 | 196 | -------------------------------------------------------------------------------- /src/main/java/net/agkn/hll/HLLType.java: -------------------------------------------------------------------------------- 1 | package net.agkn.hll; 2 | 3 | /* 4 | * Copyright 2013 Aggregate Knowledge, Inc. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | /** 20 | * The types of algorithm/data structure that {@link HLL} can utilize. For more 21 | * information, see the Javadoc for {@link HLL}. 22 | */ 23 | public enum HLLType { 24 | EMPTY, 25 | EXPLICIT, 26 | SPARSE, 27 | FULL, 28 | UNDEFINED/*used by the PostgreSQL implementation to indicate legacy/corrupt/incompatible/unknown formats*/; 29 | } -------------------------------------------------------------------------------- /src/main/java/net/agkn/hll/serialization/BigEndianAscendingWordDeserializer.java: -------------------------------------------------------------------------------- 1 | package net.agkn.hll.serialization; 2 | 3 | /* 4 | * Copyright 2013 Aggregate Knowledge, Inc. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | /** 20 | * A corresponding deserializer for {@link BigEndianAscendingWordSerializer}. 21 | * 22 | * @author timon 23 | */ 24 | public class BigEndianAscendingWordDeserializer implements IWordDeserializer { 25 | // The number of bits per byte. 26 | private static final int BITS_PER_BYTE = 8; 27 | 28 | // long mask for the maximum value stored in a byte 29 | private static final long BYTE_MASK = (1L << BITS_PER_BYTE) - 1L; 30 | 31 | // ************************************************************************ 32 | // The length in bits of the words to be read. 33 | private final int wordLength; 34 | 35 | // The byte array to which the words are serialized. 36 | private final byte[] bytes; 37 | 38 | // The number of leading padding bytes in 'bytes' to be ignored. 39 | private final int bytePadding; 40 | 41 | // The number of words that the byte array contains. 42 | private final int wordCount; 43 | 44 | // The current read state. 45 | private int currentWordIndex; 46 | 47 | // ======================================================================== 48 | /** 49 | * @param wordLength the length in bits of the words to be deserialized. Must 50 | * be less than or equal to 64 and greater than or equal to 1. 51 | * @param bytePadding the number of leading bytes that pad the serialized words. 52 | * Must be greater than or equal to zero. 53 | * @param bytes the byte array containing the serialized words. Cannot be 54 | * null. 55 | */ 56 | public BigEndianAscendingWordDeserializer(final int wordLength, final int bytePadding, final byte[] bytes) { 57 | if((wordLength < 1) || (wordLength > 64)) { 58 | throw new IllegalArgumentException("Word length must be >= 1 and <= 64. (was: " + wordLength + ")"); 59 | } 60 | 61 | if(bytePadding < 0) { 62 | throw new IllegalArgumentException("Byte padding must be >= zero. (was: " + bytePadding + ")"); 63 | } 64 | 65 | this.wordLength = wordLength; 66 | this.bytes = bytes; 67 | this.bytePadding = bytePadding; 68 | 69 | final int dataBytes = (bytes.length - bytePadding); 70 | final long dataBits = (dataBytes * BITS_PER_BYTE); 71 | 72 | this.wordCount = (int)(dataBits/wordLength); 73 | 74 | currentWordIndex = 0; 75 | } 76 | 77 | // ======================================================================== 78 | /* (non-Javadoc) 79 | * @see net.agkn.hll.serialization.IWordDeserializer#readWord() 80 | */ 81 | @Override 82 | public long readWord() { 83 | final long word = readWord(currentWordIndex); 84 | currentWordIndex++; 85 | 86 | return word; 87 | } 88 | 89 | // ------------------------------------------------------------------------ 90 | /** 91 | * Reads the word at the specified sequence position (zero-indexed). 92 | * 93 | * @param position the zero-indexed position of the word to be read. This 94 | * must be greater than or equal to zero. 95 | * @return the value of the serialized word at the specified position. 96 | */ 97 | private long readWord(final int position) { 98 | if(position < 0) { 99 | throw new ArrayIndexOutOfBoundsException(position); 100 | } 101 | 102 | // First bit of the word 103 | final long firstBitIndex = (position * wordLength); 104 | final int firstByteIndex = (bytePadding + (int)(firstBitIndex / BITS_PER_BYTE)); 105 | final int firstByteSkipBits = (int)(firstBitIndex % BITS_PER_BYTE); 106 | 107 | // Last bit of the word 108 | final long lastBitIndex = (firstBitIndex + wordLength - 1); 109 | final int lastByteIndex = (bytePadding + (int)(lastBitIndex / BITS_PER_BYTE)); 110 | final int lastByteBitsToConsume; 111 | 112 | final int bitsAfterByteBoundary = (int)((lastBitIndex + 1) % BITS_PER_BYTE); 113 | // If the word terminates at the end of the last byte, consume the whole 114 | // last byte. 115 | if(bitsAfterByteBoundary == 0) { 116 | lastByteBitsToConsume = BITS_PER_BYTE; 117 | } else { 118 | // Otherwise, only consume what is necessary. 119 | lastByteBitsToConsume = bitsAfterByteBoundary; 120 | } 121 | 122 | if(lastByteIndex >= bytes.length) { 123 | throw new ArrayIndexOutOfBoundsException("Word out of bounds of backing array."); 124 | } 125 | 126 | // Accumulator 127 | long value = 0; 128 | 129 | // -------------------------------------------------------------------- 130 | // First byte 131 | final int bitsRemainingInFirstByte = (BITS_PER_BYTE - firstByteSkipBits); 132 | final int bitsToConsumeInFirstByte = Math.min(bitsRemainingInFirstByte, wordLength); 133 | long firstByte = (long)bytes[firstByteIndex]; 134 | 135 | // Mask off the bits to skip in the first byte. 136 | final long firstByteMask = ((1L << bitsRemainingInFirstByte) - 1L); 137 | firstByte &= firstByteMask; 138 | // Right-align relevant bits of first byte. 139 | firstByte >>>= (bitsRemainingInFirstByte - bitsToConsumeInFirstByte); 140 | 141 | value |= firstByte; 142 | 143 | // If the first byte contains the whole word, short-circuit. 144 | if(firstByteIndex == lastByteIndex) { 145 | return value; 146 | } 147 | 148 | // -------------------------------------------------------------------- 149 | // Middle bytes 150 | final int middleByteCount = (lastByteIndex - firstByteIndex - 1); 151 | for(int i=0; i>= (BITS_PER_BYTE - lastByteBitsToConsume); 162 | value <<= lastByteBitsToConsume; 163 | value |= lastByte; 164 | return value; 165 | } 166 | 167 | /* (non-Javadoc) 168 | * @see net.agkn.hll.serialization.IWordDeserializer#totalWordCount() 169 | */ 170 | @Override 171 | public int totalWordCount() { 172 | return wordCount; 173 | } 174 | } 175 | -------------------------------------------------------------------------------- /src/main/java/net/agkn/hll/serialization/BigEndianAscendingWordSerializer.java: -------------------------------------------------------------------------------- 1 | package net.agkn.hll.serialization; 2 | 3 | /* 4 | * Copyright 2013 Aggregate Knowledge, Inc. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | /** 20 | * A serializer that writes a sequence of fixed bit-width 'words' to a byte array. 21 | * Bitwise OR is used to write words into bytes, so a low bit in a word is also 22 | * a low bit in a byte. However, a high byte in a word is written at a lower index 23 | * in the array than a low byte in a word. The first word is written at the lowest 24 | * array index. Each serializer is one time use and returns its backing byte 25 | * array.

26 | * 27 | * This encoding was chosen so that when reading bytes as octets in the typical 28 | * first-octet-is-the-high-nibble fashion, an octet-to-binary conversion 29 | * would yield a high-to-low, left-to-right view of the "short words".

30 | * 31 | * Example:

32 | * 33 | * Say short words are 5 bits wide. Our word sequence is the values 34 | * [31, 1, 5]. In big-endian binary format, the values are 35 | * [0b11111, 0b00001, 0b00101]. We use 15 of 16 bits in two bytes 36 | * and pad the last (lowest) bit of the last byte with a zero: 37 | * 38 | * 39 | * [0b11111000, 0b01001010] = [0xF8, 0x4A] 40 | * . 41 | * 42 | * @author timon 43 | */ 44 | public class BigEndianAscendingWordSerializer implements IWordSerializer { 45 | // The number of bits per byte. 46 | private static final int BITS_PER_BYTE = 8; 47 | 48 | // ************************************************************************ 49 | // The length in bits of the words to be written. 50 | private final int wordLength; 51 | // The number of words to be written. 52 | private final int wordCount; 53 | 54 | // The byte array to which the words are serialized. 55 | private final byte[] bytes; 56 | 57 | // ------------------------------------------------------------------------ 58 | // Write state 59 | // Number of bits that remain writable in the current byte. 60 | private int bitsLeftInByte; 61 | // Index of byte currently being written to. 62 | private int byteIndex; 63 | // Number of words written. 64 | private int wordsWritten; 65 | 66 | // ======================================================================== 67 | /** 68 | * @param wordLength the length in bits of the words to be serialized. Must 69 | * be greater than or equal to 1 and less than or equal to 64. 70 | * @param wordCount the number of words to be serialized. Must be greater than 71 | * or equal to zero. 72 | * @param bytePadding the number of leading bytes that should pad the 73 | * serialized words. Must be greater than or equal to zero. 74 | */ 75 | public BigEndianAscendingWordSerializer(final int wordLength, final int wordCount, final int bytePadding) { 76 | if((wordLength < 1) || (wordLength > 64)) { 77 | throw new IllegalArgumentException("Word length must be >= 1 and <= 64. (was: " + wordLength + ")"); 78 | } 79 | if(wordCount < 0) { 80 | throw new IllegalArgumentException("Word count must be >= 0. (was: " + wordCount + ")"); 81 | } 82 | if(bytePadding < 0) { 83 | throw new IllegalArgumentException("Byte padding must be must be >= 0. (was: " + bytePadding + ")"); 84 | } 85 | 86 | this.wordLength = wordLength; 87 | this.wordCount = wordCount; 88 | 89 | final long bitsRequired = (wordLength * wordCount); 90 | final boolean leftoverBits = ((bitsRequired % BITS_PER_BYTE) != 0); 91 | final int bytesRequired = (int)(bitsRequired / BITS_PER_BYTE) + (leftoverBits ? 1 : 0) + bytePadding; 92 | bytes = new byte[bytesRequired]; 93 | 94 | bitsLeftInByte = BITS_PER_BYTE; 95 | byteIndex = bytePadding; 96 | wordsWritten = 0; 97 | } 98 | 99 | /* (non-Javadoc) 100 | * @see net.agkn.hll.serialization.IWordSerializer#writeWord(long) 101 | * @throws RuntimeException if the number of words written is greater than the 102 | * wordCount parameter in the constructor. 103 | */ 104 | @Override 105 | public void writeWord(final long word) { 106 | if(wordsWritten == wordCount) { 107 | throw new RuntimeException("Cannot write more words, backing array full!"); 108 | } 109 | 110 | int bitsLeftInWord = wordLength; 111 | 112 | while(bitsLeftInWord > 0) { 113 | // Move to the next byte if the current one is fully packed. 114 | if(bitsLeftInByte == 0) { 115 | byteIndex++; 116 | bitsLeftInByte = BITS_PER_BYTE; 117 | } 118 | 119 | final long consumedMask; 120 | if(bitsLeftInWord == 64) { 121 | consumedMask = ~0L; 122 | } else { 123 | consumedMask = ((1L << bitsLeftInWord) - 1L); 124 | } 125 | 126 | // Fix how many bits will be written in this cycle. Choose the 127 | // smaller of the remaining bits in the word or byte. 128 | final int numberOfBitsToWrite = Math.min(bitsLeftInByte, bitsLeftInWord); 129 | final int bitsInByteRemainingAfterWrite = (bitsLeftInByte - numberOfBitsToWrite); 130 | 131 | // In general, we write the highest bits of the word first, so we 132 | // strip the highest bits that were consumed in previous cycles. 133 | final long remainingBitsOfWordToWrite = (word & consumedMask); 134 | 135 | final long bitsThatTheByteCanAccept; 136 | // If there is more left in the word than can be written to this 137 | // byte, shift off the bits that can't be written off the bottom. 138 | if(bitsLeftInWord > numberOfBitsToWrite) { 139 | bitsThatTheByteCanAccept = (remainingBitsOfWordToWrite >>> (bitsLeftInWord - bitsLeftInByte)); 140 | } else { 141 | // If the byte can accept all remaining bits, there is no need 142 | // to shift off the bits that won't be written in this cycle. 143 | bitsThatTheByteCanAccept = remainingBitsOfWordToWrite; 144 | } 145 | 146 | // Align the word bits to write up against the byte bits that have 147 | // already been written. This shift may do nothing if the remainder 148 | // of the byte is being consumed in this cycle. 149 | final long alignedBits = (bitsThatTheByteCanAccept << bitsInByteRemainingAfterWrite); 150 | 151 | // Update the byte with the alignedBits. 152 | bytes[byteIndex] |= (byte)alignedBits; 153 | 154 | // Update state with bit count written. 155 | bitsLeftInWord -= numberOfBitsToWrite; 156 | bitsLeftInByte = bitsInByteRemainingAfterWrite; 157 | } 158 | 159 | wordsWritten ++; 160 | } 161 | 162 | /* (non-Javadoc) 163 | * @see net.agkn.hll.serialization.IWordSerializer#getBytes() 164 | * @throws RuntimeException if the number of words written is fewer than the 165 | * wordCount parameter in the constructor. 166 | */ 167 | @Override 168 | public byte[] getBytes() { 169 | if(wordsWritten < wordCount) { 170 | throw new RuntimeException("Not all words have been written! (" + wordsWritten + "/" + wordCount + ")"); 171 | } 172 | 173 | return bytes; 174 | } 175 | } 176 | -------------------------------------------------------------------------------- /src/main/java/net/agkn/hll/serialization/HLLMetadata.java: -------------------------------------------------------------------------------- 1 | package net.agkn.hll.serialization; 2 | 3 | /* 4 | * Copyright 2013 Aggregate Knowledge, Inc. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | import net.agkn.hll.HLLType; 20 | 21 | /** 22 | * A concrete {@link IHLLMetadata} implemented as a simple struct. 23 | * 24 | * @author timon 25 | */ 26 | public class HLLMetadata implements IHLLMetadata { 27 | private final int schemaVersion; 28 | private final HLLType type; 29 | private final int registerCountLog2; 30 | private final int registerWidth; 31 | private final int log2ExplicitCutoff; 32 | private final boolean explicitOff; 33 | private final boolean explicitAuto; 34 | private final boolean sparseEnabled; 35 | 36 | /** 37 | * @param schemaVersion the schema version number of the HLL. This must 38 | * be greater than or equal to zero. 39 | * @param type the {@link HLLType type} of the HLL. This cannot 40 | * be null. 41 | * @param registerCountLog2 the log-base-2 register count parameter for 42 | * probabilistic HLLs. This must be greater than or equal to zero. 43 | * @param registerWidth the register width parameter for probabilistic 44 | * HLLs. This must be greater than or equal to zero. 45 | * @param log2ExplicitCutoff the log-base-2 of the explicit cardinality cutoff, 46 | * if it is explicitly defined. (If explicitOff or 47 | * explicitAuto is true then this has no 48 | * meaning.) 49 | * @param explicitOff the flag for 'explicit off'-mode, where the 50 | * {@link HLLType#EXPLICIT} representation is not used. Both this and 51 | * explicitAuto cannot be true at the same 52 | * time. 53 | * @param explicitAuto the flag for 'explicit auto'-mode, where the 54 | * {@link HLLType#EXPLICIT} representation's promotion cutoff is 55 | * determined based on in-memory size automatically. Both this and 56 | * explicitOff cannot be true at the same 57 | * time. 58 | * @param sparseEnabled the flag for 'sparse-enabled'-mode, where the 59 | * {@link HLLType#SPARSE} representation is used. 60 | */ 61 | public HLLMetadata(final int schemaVersion, 62 | final HLLType type, 63 | final int registerCountLog2, 64 | final int registerWidth, 65 | final int log2ExplicitCutoff, 66 | final boolean explicitOff, 67 | final boolean explicitAuto, 68 | final boolean sparseEnabled) { 69 | this.schemaVersion = schemaVersion; 70 | this.type = type; 71 | this.registerCountLog2 = registerCountLog2; 72 | this.registerWidth = registerWidth; 73 | this.log2ExplicitCutoff = log2ExplicitCutoff; 74 | this.explicitOff = explicitOff; 75 | this.explicitAuto = explicitAuto; 76 | this.sparseEnabled = sparseEnabled; 77 | } 78 | 79 | /* (non-Javadoc) 80 | * @see net.agkn.hll.serialization.IHLLMetadata#schemaVersion() 81 | */ 82 | @Override 83 | public int schemaVersion() { return schemaVersion; } 84 | 85 | /* (non-Javadoc) 86 | * @see net.agkn.hll.serialization.IHLLMetadata#HLLType() 87 | */ 88 | @Override 89 | public HLLType HLLType() { return type; } 90 | 91 | /* (non-Javadoc) 92 | * @see net.agkn.hll.serialization.IHLLMetadata#registerCountLog2() 93 | */ 94 | @Override 95 | public int registerCountLog2() { return registerCountLog2; } 96 | 97 | /* (non-Javadoc) 98 | * @see net.agkn.hll.serialization.IHLLMetadata#registerWidth() 99 | */ 100 | @Override 101 | public int registerWidth() { return registerWidth; } 102 | 103 | /* (non-Javadoc) 104 | * @see net.agkn.hll.serialization.IHLLMetadata#log2ExplicitCutoff() 105 | */ 106 | @Override 107 | public int log2ExplicitCutoff() { return log2ExplicitCutoff; } 108 | 109 | /* (non-Javadoc) 110 | * @see net.agkn.hll.serialization.IHLLMetadata#explicitOff() 111 | */ 112 | @Override 113 | public boolean explicitOff() { 114 | return explicitOff; 115 | } 116 | 117 | /* (non-Javadoc) 118 | * @see net.agkn.hll.serialization.IHLLMetadata#explicitAuto() 119 | * @see net.agkn.hll.serialization.IHLLMetadata#log2ExplicitCutoff() 120 | */ 121 | @Override 122 | public boolean explicitAuto() { 123 | return explicitAuto; 124 | } 125 | 126 | /* (non-Javadoc) 127 | * @see net.agkn.hll.serialization.IHLLMetadata#sparseEnabled() 128 | */ 129 | @Override 130 | public boolean sparseEnabled() { return sparseEnabled; } 131 | 132 | /* (non-Javadoc) 133 | * @see java.lang.Object#toString() 134 | */ 135 | @Override 136 | public String toString() { 137 | return ""; 138 | } 139 | } 140 | -------------------------------------------------------------------------------- /src/main/java/net/agkn/hll/serialization/IHLLMetadata.java: -------------------------------------------------------------------------------- 1 | package net.agkn.hll.serialization; 2 | 3 | /* 4 | * Copyright 2013 Aggregate Knowledge, Inc. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | import net.agkn.hll.HLLType; 20 | 21 | /** 22 | * The metadata and parameters associated with a HLL. 23 | */ 24 | public interface IHLLMetadata { 25 | /** 26 | * @return the schema version of the HLL. This will never be null. 27 | */ 28 | int schemaVersion(); 29 | 30 | /** 31 | * @return the type of the HLL. This will never be null. 32 | */ 33 | HLLType HLLType(); 34 | 35 | /** 36 | * @return the log-base-2 of the register count parameter of the HLL. This 37 | * will always be greater than or equal to 4 and less than or equal 38 | * to 31. 39 | */ 40 | int registerCountLog2(); 41 | 42 | /** 43 | * @return the register width parameter of the HLL. This will always be 44 | * greater than or equal to 1 and less than or equal to 8. 45 | */ 46 | int registerWidth(); 47 | 48 | /** 49 | * @return the log-base-2 of the explicit cutoff cardinality. This will always 50 | * be greater than or equal to zero and less than 31, per the specification. 51 | */ 52 | int log2ExplicitCutoff(); 53 | 54 | /** 55 | * @return true if the {@link HLLType#EXPLICIT} representation 56 | * has been disabled. false otherwise. 57 | */ 58 | boolean explicitOff(); 59 | 60 | /** 61 | * @return true if the {@link HLLType#EXPLICIT} representation 62 | * cutoff cardinality is set to be automatically chosen, 63 | * false otherwise. 64 | */ 65 | boolean explicitAuto(); 66 | 67 | /** 68 | * @return true if the {@link HLLType#SPARSE} representation 69 | * is enabled. 70 | */ 71 | boolean sparseEnabled(); 72 | } -------------------------------------------------------------------------------- /src/main/java/net/agkn/hll/serialization/ISchemaVersion.java: -------------------------------------------------------------------------------- 1 | package net.agkn.hll.serialization; 2 | 3 | /* 4 | * Copyright 2013 Aggregate Knowledge, Inc. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | import net.agkn.hll.HLLType; 20 | 21 | /** 22 | * A serialization schema for HLLs. Reads and writes HLL metadata to 23 | * and from byte[] representations. 24 | * 25 | * @author timon 26 | */ 27 | public interface ISchemaVersion { 28 | /** 29 | * The number of metadata bytes required for a serialized HLL of the 30 | * specified type. 31 | * 32 | * @param type the type of the serialized HLL 33 | * @return the number of padding bytes needed in order to fully accommodate 34 | * the needed metadata. 35 | */ 36 | int paddingBytes(HLLType type); 37 | 38 | /** 39 | * Writes metadata bytes to serialized HLL. 40 | * 41 | * @param bytes the padded data bytes of the HLL 42 | * @param metadata the metadata to write to the padding bytes 43 | */ 44 | void writeMetadata(byte[] bytes, IHLLMetadata metadata); 45 | 46 | /** 47 | * Reads the metadata bytes of the serialized HLL. 48 | * 49 | * @param bytes the serialized HLL 50 | * @return the HLL metadata 51 | */ 52 | IHLLMetadata readMetadata(byte[] bytes); 53 | 54 | /** 55 | * Builds an HLL serializer that matches this schema version. 56 | * 57 | * @param type the HLL type that will be serialized. This cannot be 58 | * null. 59 | * @param wordLength the length of the 'words' that comprise the data of the 60 | * HLL. Words must be at least 5 bits and at most 64 bits long. 61 | * @param wordCount the number of 'words' in the HLL's data. 62 | * @return a byte array serializer used to serialize a HLL according 63 | * to this schema version's specification. 64 | * @see #paddingBytes(HLLType) 65 | * @see IWordSerializer 66 | */ 67 | IWordSerializer getSerializer(HLLType type, int wordLength, int wordCount); 68 | 69 | /** 70 | * Builds an HLL deserializer that matches this schema version. 71 | * 72 | * @param type the HLL type that will be deserialized. This cannot be 73 | * null. 74 | * @param wordLength the length of the 'words' that comprise the data of the 75 | * serialized HLL. Words must be at least 5 bits and at most 64 76 | * bits long. 77 | * @param bytes the serialized HLL to deserialize. This cannot be 78 | * null. 79 | * @return a byte array deserializer used to deserialize a HLL serialized 80 | * according to this schema version's specification. 81 | */ 82 | IWordDeserializer getDeserializer(HLLType type, int wordLength, byte[] bytes); 83 | 84 | /** 85 | * @return the schema version number. 86 | */ 87 | int schemaVersionNumber(); 88 | } 89 | -------------------------------------------------------------------------------- /src/main/java/net/agkn/hll/serialization/IWordDeserializer.java: -------------------------------------------------------------------------------- 1 | package net.agkn.hll.serialization; 2 | 3 | /* 4 | * Copyright 2013 Aggregate Knowledge, Inc. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | /** 20 | * Reads 'words' of a fixed width, in sequence, from a byte array. 21 | * 22 | * @author timon 23 | */ 24 | public interface IWordDeserializer { 25 | /** 26 | * @return the next word in the sequence. Should not be called more than 27 | * {@link #totalWordCount()} times. 28 | */ 29 | long readWord(); 30 | 31 | /** 32 | * Returns the number of words that could be encoded in the sequence.

33 | * 34 | * NOTE: the sequence that was encoded may be shorter than the value this 35 | * method returns due to padding issues within bytes. This guarantees 36 | * only an upper bound on the number of times {@link #readWord()} 37 | * can be called. 38 | * 39 | * @return the maximum number of words that could be read from the sequence. 40 | */ 41 | int totalWordCount(); 42 | } -------------------------------------------------------------------------------- /src/main/java/net/agkn/hll/serialization/IWordSerializer.java: -------------------------------------------------------------------------------- 1 | package net.agkn.hll.serialization; 2 | 3 | /* 4 | * Copyright 2013 Aggregate Knowledge, Inc. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | /** 20 | * Writes 'words' of fixed width, in sequence, to a byte array. 21 | * 22 | * @author timon 23 | */ 24 | public interface IWordSerializer { 25 | 26 | /** 27 | * Writes the word to the backing array. 28 | * 29 | * @param word the word to write. 30 | */ 31 | void writeWord(final long word); 32 | 33 | /** 34 | * Returns the backing array of bytes that contain the serialized 35 | * words. 36 | * @return the serialized words as a byte[]. 37 | */ 38 | byte[] getBytes(); 39 | 40 | } -------------------------------------------------------------------------------- /src/main/java/net/agkn/hll/serialization/SchemaVersionOne.java: -------------------------------------------------------------------------------- 1 | package net.agkn.hll.serialization; 2 | 3 | /* 4 | * Copyright 2013 Aggregate Knowledge, Inc. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | import net.agkn.hll.HLLType; 20 | 21 | /** 22 | * A concrete {@link ISchemaVersion} representing schema version one. 23 | * 24 | * @author timon 25 | */ 26 | public class SchemaVersionOne implements ISchemaVersion { 27 | /** 28 | * The schema version number for this instance. 29 | */ 30 | public static final int SCHEMA_VERSION = 1; 31 | 32 | // ------------------------------------------------------------------------ 33 | // Version-specific ordinals (array position) for each of the HLL types 34 | private static final HLLType[] TYPE_ORDINALS = new HLLType[] { 35 | HLLType.UNDEFINED, 36 | HLLType.EMPTY, 37 | HLLType.EXPLICIT, 38 | HLLType.SPARSE, 39 | HLLType.FULL 40 | }; 41 | 42 | // ------------------------------------------------------------------------ 43 | // number of header bytes for all HLL types 44 | private static final int HEADER_BYTE_COUNT = 3; 45 | 46 | // sentinel values from the spec for explicit off and auto 47 | private static final int EXPLICIT_OFF = 0; 48 | private static final int EXPLICIT_AUTO = 63; 49 | 50 | // ************************************************************************ 51 | /* (non-Javadoc) 52 | * @see net.agkn.hll.serialization.ISchemaVersion#paddingBytes(HLLType) 53 | */ 54 | @Override 55 | public int paddingBytes(final HLLType type) { 56 | return HEADER_BYTE_COUNT; 57 | } 58 | 59 | /* (non-Javadoc) 60 | * @see net.agkn.hll.serialization.ISchemaVersion#writeMetadata(byte[], IHLLMetadata) 61 | */ 62 | @Override 63 | public void writeMetadata(final byte[] bytes, final IHLLMetadata metadata) { 64 | final HLLType type = metadata.HLLType(); 65 | final int typeOrdinal = getOrdinal(type); 66 | 67 | final int explicitCutoffValue; 68 | if(metadata.explicitOff()) { 69 | explicitCutoffValue = EXPLICIT_OFF; 70 | } else if(metadata.explicitAuto()) { 71 | explicitCutoffValue = EXPLICIT_AUTO; 72 | } else { 73 | explicitCutoffValue = metadata.log2ExplicitCutoff() + 1/*per spec*/; 74 | } 75 | 76 | bytes[0] = SerializationUtil.packVersionByte(SCHEMA_VERSION, typeOrdinal); 77 | bytes[1] = SerializationUtil.packParametersByte(metadata.registerWidth(), metadata.registerCountLog2()); 78 | bytes[2] = SerializationUtil.packCutoffByte(explicitCutoffValue, metadata.sparseEnabled()); 79 | } 80 | 81 | /* (non-Javadoc) 82 | * @see net.agkn.hll.serialization.ISchemaVersion#readMetadata(byte[]) 83 | */ 84 | @Override 85 | public IHLLMetadata readMetadata(final byte[] bytes) { 86 | final byte versionByte = bytes[0]; 87 | final byte parametersByte = bytes[1]; 88 | final byte cutoffByte = bytes[2]; 89 | 90 | final int typeOrdinal = SerializationUtil.typeOrdinal(versionByte); 91 | final int explicitCutoffValue = SerializationUtil.explicitCutoff(cutoffByte); 92 | final boolean explicitOff = (explicitCutoffValue == EXPLICIT_OFF); 93 | final boolean explicitAuto = (explicitCutoffValue == EXPLICIT_AUTO); 94 | final int log2ExplicitCutoff = (explicitOff || explicitAuto) ? -1/*sentinel*/ : (explicitCutoffValue - 1/*per spec*/); 95 | 96 | return new HLLMetadata(SCHEMA_VERSION, 97 | getType(typeOrdinal), 98 | SerializationUtil.registerCountLog2(parametersByte), 99 | SerializationUtil.registerWidth(parametersByte), 100 | log2ExplicitCutoff, 101 | explicitOff, 102 | explicitAuto, 103 | SerializationUtil.sparseEnabled(cutoffByte)); 104 | } 105 | 106 | /* (non-Javadoc) 107 | * @see net.agkn.hll.serialization.ISchemaVersion#getSerializer(HLLType, int, int) 108 | */ 109 | @Override 110 | public IWordSerializer getSerializer(HLLType type, int wordLength, int wordCount) { 111 | return new BigEndianAscendingWordSerializer(wordLength, wordCount, paddingBytes(type)); 112 | } 113 | 114 | /* (non-Javadoc) 115 | * @see net.agkn.hll.serialization.ISchemaVersion#getDeserializer(HLLType, int, byte[]) 116 | */ 117 | @Override 118 | public IWordDeserializer getDeserializer(HLLType type, int wordLength, byte[] bytes) { 119 | return new BigEndianAscendingWordDeserializer(wordLength, paddingBytes(type), bytes); 120 | } 121 | 122 | /* (non-Javadoc) 123 | * @see net.agkn.hll.serialization.ISchemaVersion#schemaVersionNumber() 124 | */ 125 | @Override 126 | public int schemaVersionNumber() { 127 | return SCHEMA_VERSION; 128 | } 129 | 130 | // ======================================================================== 131 | // Type/Ordinal lookups 132 | /** 133 | * Gets the ordinal for the specified {@link HLLType}. 134 | * 135 | * @param type the type whose ordinal is desired 136 | * @return the ordinal for the specified type, to be used in the version byte. 137 | * This will always be non-negative. 138 | */ 139 | private static int getOrdinal(final HLLType type) { 140 | for(int i=0; inull. 151 | */ 152 | private static HLLType getType(final int ordinal) { 153 | if((ordinal < 0) || (ordinal >= TYPE_ORDINALS.length)) { 154 | throw new IllegalArgumentException("Invalid type ordinal '" + ordinal + "'. Only 0-" + (TYPE_ORDINALS.length - 1) + " inclusive allowed."); 155 | } 156 | return TYPE_ORDINALS[ordinal]; 157 | } 158 | } 159 | -------------------------------------------------------------------------------- /src/main/java/net/agkn/hll/serialization/SerializationUtil.java: -------------------------------------------------------------------------------- 1 | package net.agkn.hll.serialization; 2 | 3 | /* 4 | * Copyright 2013 Aggregate Knowledge, Inc. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | import net.agkn.hll.HLLType; 20 | 21 | /** 22 | * A collection of constants and utilities for serializing and deserializing 23 | * HLLs. 24 | * 25 | * NOTE: 'package' visibility is used for many methods that only need to be 26 | * used by the {@link ISchemaVersion} implementations. The structure of 27 | * a serialized HLL's metadata should be opaque to the rest of the 28 | * library. 29 | * 30 | * @author timon 31 | */ 32 | public class SerializationUtil { 33 | /** 34 | * The number of bits (of the parameters byte) dedicated to encoding the 35 | * width of the registers. 36 | */ 37 | /*package*/ static int REGISTER_WIDTH_BITS = 3; 38 | 39 | /** 40 | * A mask to cap the maximum value of the register width. 41 | */ 42 | /*package*/ static int REGISTER_WIDTH_MASK = (1 << REGISTER_WIDTH_BITS) - 1; 43 | 44 | /** 45 | * The number of bits (of the parameters byte) dedicated to encoding 46 | * log2(registerCount). 47 | */ 48 | /*package*/ static int LOG2_REGISTER_COUNT_BITS = 5; 49 | 50 | /** 51 | * A mask to cap the maximum value of log2(registerCount). 52 | */ 53 | /*package*/ static int LOG2_REGISTER_COUNT_MASK = (1 << LOG2_REGISTER_COUNT_BITS) - 1; 54 | 55 | /** 56 | * The number of bits (of the cutoff byte) dedicated to encoding the 57 | * log-base-2 of the explicit cutoff or sentinel values for 58 | * 'explicit-disabled' or 'auto'. 59 | */ 60 | /*package*/ static int EXPLICIT_CUTOFF_BITS = 6; 61 | 62 | /** 63 | * A mask to cap the maximum value of the explicit cutoff choice. 64 | */ 65 | /*package*/ static int EXPLICIT_CUTOFF_MASK = (1 << EXPLICIT_CUTOFF_BITS) - 1; 66 | 67 | /** 68 | * Number of bits in a nibble. 69 | */ 70 | private static int NIBBLE_BITS = 4; 71 | 72 | /** 73 | * A mask to cap the maximum value of a nibble. 74 | */ 75 | private static int NIBBLE_MASK = (1 << NIBBLE_BITS) - 1; 76 | 77 | // ************************************************************************ 78 | // Serialization utilities 79 | 80 | /** 81 | * Schema version one (v1). 82 | */ 83 | public static ISchemaVersion VERSION_ONE = new SchemaVersionOne(); 84 | 85 | /** 86 | * The default schema version for serializing HLLs. 87 | */ 88 | public static ISchemaVersion DEFAULT_SCHEMA_VERSION = VERSION_ONE; 89 | 90 | /** 91 | * List of registered schema versions, indexed by their version numbers. If 92 | * an entry is null, then no such schema version is registered. 93 | * Similarly, registering a new schema version simply entails assigning an 94 | * {@link ISchemaVersion} instance to the appropriate index of this array.

95 | * 96 | * By default, only {@link SchemaVersionOne} is registered. Note that version 97 | * zero will always be reserved for internal (e.g. proprietary, legacy) schema 98 | * specifications/implementations and will never be assigned to in by this 99 | * library. 100 | */ 101 | public static ISchemaVersion[] REGISTERED_SCHEMA_VERSIONS = new ISchemaVersion[16]; 102 | 103 | static { 104 | REGISTERED_SCHEMA_VERSIONS[1] = VERSION_ONE; 105 | } 106 | 107 | /** 108 | * @param schemaVersionNumber the version number of the {@link ISchemaVersion} 109 | * desired. This must be a registered schema version number. 110 | * @return The {@link ISchemaVersion} for the given number. This will never 111 | * be null. 112 | */ 113 | public static ISchemaVersion getSchemaVersion(final int schemaVersionNumber) { 114 | if(schemaVersionNumber >= REGISTERED_SCHEMA_VERSIONS.length || schemaVersionNumber < 0) { 115 | throw new RuntimeException("Invalid schema version number " + schemaVersionNumber); 116 | } 117 | final ISchemaVersion schemaVersion = REGISTERED_SCHEMA_VERSIONS[schemaVersionNumber]; 118 | if(schemaVersion == null) { 119 | throw new RuntimeException("Unknown schema version number " + schemaVersionNumber); 120 | } 121 | return schemaVersion; 122 | } 123 | 124 | /** 125 | * Get the appropriate {@link ISchemaVersion schema version} for the specified 126 | * serialized HLL. 127 | * 128 | * @param bytes the serialized HLL whose schema version is desired. 129 | * @return the schema version for the specified HLL. This will never 130 | * be null. 131 | */ 132 | public static ISchemaVersion getSchemaVersion(final byte[] bytes) { 133 | final byte versionByte = bytes[0]; 134 | final int schemaVersionNumber = schemaVersion(versionByte); 135 | 136 | return getSchemaVersion(schemaVersionNumber); 137 | } 138 | 139 | // ************************************************************************ 140 | // Package-specific shared helpers 141 | 142 | /** 143 | * Generates a byte that encodes the schema version and the type ordinal 144 | * of the HLL. 145 | * 146 | * The top nibble is the schema version and the bottom nibble is the type 147 | * ordinal. 148 | * 149 | * @param schemaVersion the schema version to encode. 150 | * @param typeOrdinal the type ordinal of the HLL to encode. 151 | * @return the packed version byte 152 | */ 153 | public static byte packVersionByte(final int schemaVersion, final int typeOrdinal) { 154 | return (byte)(((NIBBLE_MASK & schemaVersion) << NIBBLE_BITS) | (NIBBLE_MASK & typeOrdinal)); 155 | } 156 | /** 157 | * Generates a byte that encodes the log-base-2 of the explicit cutoff 158 | * or sentinel values for 'explicit-disabled' or 'auto', as well as the 159 | * boolean indicating whether to use {@link HLLType#SPARSE} 160 | * in the promotion hierarchy. 161 | * 162 | * The top bit is always padding, the second highest bit indicates the 163 | * 'sparse-enabled' boolean, and the lowest six bits encode the explicit 164 | * cutoff value. 165 | * 166 | * @param explicitCutoff the explicit cutoff value to encode. 167 | *

179 | * @param sparseEnabled whether {@link HLLType#SPARSE} 180 | * should be used in the promotion hierarchy to improve HLL 181 | * storage. 182 | * 183 | * @return the packed cutoff byte 184 | */ 185 | public static byte packCutoffByte(final int explicitCutoff, final boolean sparseEnabled) { 186 | final int sparseBit = (sparseEnabled ? (1 << EXPLICIT_CUTOFF_BITS) : 0); 187 | return (byte)(sparseBit | (EXPLICIT_CUTOFF_MASK & explicitCutoff)); 188 | } 189 | 190 | /** 191 | * Generates a byte that encodes the parameters of a 192 | * {@link HLLType#FULL} or {@link HLLType#SPARSE} 193 | * HLL.

194 | * 195 | * The top 3 bits are used to encode registerWidth - 1 196 | * (range of registerWidth is thus 1-9) and the bottom 5 197 | * bits are used to encode registerCountLog2 198 | * (range of registerCountLog2 is thus 0-31). 199 | * 200 | * @param registerWidth the register width (must be at least 1 and at 201 | * most 9) 202 | * @param registerCountLog2 the log-base-2 of the register count (must 203 | * be at least 0 and at most 31) 204 | * @return the packed parameters byte 205 | */ 206 | public static byte packParametersByte(final int registerWidth, final int registerCountLog2) { 207 | final int widthBits = ((registerWidth - 1) & REGISTER_WIDTH_MASK); 208 | final int countBits = (registerCountLog2 & LOG2_REGISTER_COUNT_MASK); 209 | return (byte)((widthBits << LOG2_REGISTER_COUNT_BITS) | countBits); 210 | } 211 | 212 | /** 213 | * Extracts the 'sparse-enabled' boolean from the cutoff byte of a serialized 214 | * HLL. 215 | * 216 | * @param cutoffByte the cutoff byte of the serialized HLL 217 | * @return the 'sparse-enabled' boolean 218 | */ 219 | public static boolean sparseEnabled(final byte cutoffByte) { 220 | return ((cutoffByte >>> EXPLICIT_CUTOFF_BITS) & 1) == 1; 221 | } 222 | 223 | /** 224 | * Extracts the explicit cutoff value from the cutoff byte of a serialized 225 | * HLL. 226 | * 227 | * @param cutoffByte the cutoff byte of the serialized HLL 228 | * @return the explicit cutoff value 229 | */ 230 | public static int explicitCutoff(final byte cutoffByte) { 231 | return (cutoffByte & EXPLICIT_CUTOFF_MASK); 232 | } 233 | 234 | /** 235 | * Extracts the schema version from the version byte of a serialized 236 | * HLL. 237 | * 238 | * @param versionByte the version byte of the serialized HLL 239 | * @return the schema version of the serialized HLL 240 | */ 241 | public static int schemaVersion(final byte versionByte) { 242 | return NIBBLE_MASK & (versionByte >>> NIBBLE_BITS); 243 | } 244 | 245 | /** 246 | * Extracts the type ordinal from the version byte of a serialized HLL. 247 | * 248 | * @param versionByte the version byte of the serialized HLL 249 | * @return the type ordinal of the serialized HLL 250 | */ 251 | public static int typeOrdinal(final byte versionByte) { 252 | return (versionByte & NIBBLE_MASK); 253 | } 254 | 255 | /** 256 | * Extracts the register width from the parameters byte of a serialized 257 | * {@link HLLType#FULL} HLL. 258 | * 259 | * @param parametersByte the parameters byte of the serialized HLL 260 | * @return the register width of the serialized HLL 261 | * 262 | * @see #packParametersByte(int, int) 263 | */ 264 | public static int registerWidth(final byte parametersByte) { 265 | return ((parametersByte >>> LOG2_REGISTER_COUNT_BITS) & REGISTER_WIDTH_MASK) + 1; 266 | } 267 | 268 | /** 269 | * Extracts the log2(registerCount) from the parameters byte of a 270 | * serialized {@link HLLType#FULL} HLL. 271 | * 272 | * @param parametersByte the parameters byte of the serialized HLL 273 | * @return log2(registerCount) of the serialized HLL 274 | * 275 | * @see #packParametersByte(int, int) 276 | */ 277 | public static int registerCountLog2(final byte parametersByte) { 278 | return (parametersByte & LOG2_REGISTER_COUNT_MASK); 279 | } 280 | } 281 | -------------------------------------------------------------------------------- /src/main/java/net/agkn/hll/util/BitUtil.java: -------------------------------------------------------------------------------- 1 | package net.agkn.hll.util; 2 | 3 | /* 4 | * Copyright 2013 Aggregate Knowledge, Inc. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | /** 20 | * A collection of bit utilities. 21 | * 22 | * @author rgrzywinski 23 | */ 24 | public class BitUtil { 25 | /** 26 | * The set of least-significant bits for a given byte. -1 27 | * is used if no bits are set (so as to not be confused with "index of zero" 28 | * meaning that the least significant bit is the 0th (1st) bit). 29 | * 30 | * @see #leastSignificantBit(long) 31 | */ 32 | private static final int[] LEAST_SIGNIFICANT_BIT = { 33 | -1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 34 | 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 35 | 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 36 | 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 37 | 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 38 | 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 39 | 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 40 | 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 41 | 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 42 | 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 43 | 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 44 | 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 45 | 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 46 | 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 47 | 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 48 | 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 49 | }; 50 | 51 | /** 52 | * Computes the least-significant bit of the specified long 53 | * that is set to 1. Zero-indexed. 54 | * 55 | * @param value the long whose least-significant bit is desired. 56 | * @return the least-significant bit of the specified long. 57 | * -1 is returned if there are no bits set. 58 | */ 59 | // REF: http://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set 60 | // REF: http://www-graphics.stanford.edu/~seander/bithacks.html 61 | public static int leastSignificantBit(final long value) { 62 | if(value == 0L) return -1/*by contract*/; 63 | if((value & 0xFFL) != 0) return LEAST_SIGNIFICANT_BIT[(int)( (value >>> 0) & 0xFF)] + 0; 64 | if((value & 0xFFFFL) != 0) return LEAST_SIGNIFICANT_BIT[(int)( (value >>> 8) & 0xFF)] + 8; 65 | if((value & 0xFFFFFFL) != 0) return LEAST_SIGNIFICANT_BIT[(int)( (value >>> 16) & 0xFF)] + 16; 66 | if((value & 0xFFFFFFFFL) != 0) return LEAST_SIGNIFICANT_BIT[(int)( (value >>> 24) & 0xFF)] + 24; 67 | if((value & 0xFFFFFFFFFFL) != 0) return LEAST_SIGNIFICANT_BIT[(int)( (value >>> 32) & 0xFF)] + 32; 68 | if((value & 0xFFFFFFFFFFFFL) != 0) return LEAST_SIGNIFICANT_BIT[(int)( (value >>> 40) & 0xFF)] + 40; 69 | if((value & 0xFFFFFFFFFFFFFFL) != 0) return LEAST_SIGNIFICANT_BIT[(int)( (value >>> 48) & 0xFF)] + 48; 70 | return LEAST_SIGNIFICANT_BIT[(int)( (value >>> 56) & 0xFFL)] + 56; 71 | } 72 | } -------------------------------------------------------------------------------- /src/main/java/net/agkn/hll/util/BitVector.java: -------------------------------------------------------------------------------- 1 | package net.agkn.hll.util; 2 | 3 | /* 4 | * Copyright 2013 Aggregate Knowledge, Inc. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | import net.agkn.hll.serialization.IWordSerializer; 20 | 21 | /** 22 | * A vector (array) of bits that is accessed in units ("registers") of width 23 | * bits which are stored as 64bit "words" (longs). In this context 24 | * a register is at most 64bits. 25 | * 26 | * @author rgrzywinski 27 | */ 28 | public class BitVector implements Cloneable { 29 | // NOTE: in this context, a word is 64bits 30 | 31 | // rather than doing division to determine how a bit index fits into 64bit 32 | // words (i.e. longs), bit shifting is used 33 | private static final int LOG2_BITS_PER_WORD = 6/*=>64bits*/; 34 | private static final int BITS_PER_WORD = 1 << LOG2_BITS_PER_WORD; 35 | private static final int BITS_PER_WORD_MASK = BITS_PER_WORD - 1; 36 | 37 | // ditto from above but for bytes (for output) 38 | private static final int LOG2_BITS_PER_BYTE = 3/*=>8bits*/; 39 | public static final int BITS_PER_BYTE = 1 << LOG2_BITS_PER_BYTE; 40 | 41 | // ======================================================================== 42 | public static final int BYTES_PER_WORD = 8/*8 bytes in a long*/; 43 | 44 | // ************************************************************************ 45 | // 64bit words 46 | private final long[] words; 47 | public final long[] words() { return words; } 48 | public final int wordCount() { return words.length; } 49 | public final int byteCount() { return wordCount() * BYTES_PER_WORD; } 50 | 51 | // the width of a register in bits (this cannot be more than 64 (the word size)) 52 | private final int registerWidth; 53 | public final int registerWidth() { return registerWidth; } 54 | 55 | private final long count; 56 | 57 | // ------------------------------------------------------------------------ 58 | private final long registerMask; 59 | 60 | // ======================================================================== 61 | /** 62 | * @param width the width of each register. This cannot be negative or 63 | * zero or greater than 63 (the signed word size). 64 | * @param count the number of registers. This cannot be negative or zero 65 | */ 66 | public BitVector(final int width, final long count) { 67 | // ceil((width * count)/BITS_PER_WORD) 68 | this.words = new long[(int)(((width * count) + BITS_PER_WORD_MASK) >>> LOG2_BITS_PER_WORD)]; 69 | this.registerWidth = width; 70 | this.count = count; 71 | 72 | this.registerMask = (1L << width) - 1; 73 | } 74 | 75 | // ======================================================================== 76 | /** 77 | * @param registerIndex the index of the register whose value is to be 78 | * retrieved. This cannot be negative. 79 | * @return the value at the specified register index 80 | * @see #setRegister(long, long) 81 | * @see #setMaxRegister(long, long) 82 | */ 83 | // NOTE: if this changes then setMaxRegister() must change 84 | public long getRegister(final long registerIndex) { 85 | final long bitIndex = registerIndex * registerWidth; 86 | final int firstWordIndex = (int)(bitIndex >>> LOG2_BITS_PER_WORD)/*aka (bitIndex / BITS_PER_WORD)*/; 87 | final int secondWordIndex = (int)((bitIndex + registerWidth - 1) >>> LOG2_BITS_PER_WORD)/*see above*/; 88 | final int bitRemainder = (int)(bitIndex & BITS_PER_WORD_MASK)/*aka (bitIndex % BITS_PER_WORD)*/; 89 | 90 | if(firstWordIndex == secondWordIndex) 91 | return ((words[firstWordIndex] >>> bitRemainder) & registerMask); 92 | /* else -- register spans words */ 93 | return (words[firstWordIndex] >>> bitRemainder)/*no need to mask since at top of word*/ 94 | | (words[secondWordIndex] << (BITS_PER_WORD - bitRemainder)) & registerMask; 95 | } 96 | 97 | /** 98 | * @param registerIndex the index of the register whose value is to be set. 99 | * This cannot be negative 100 | * @param value the value to set in the register 101 | * @see #getRegister(long) 102 | * @see #setMaxRegister(long, long) 103 | */ 104 | // NOTE: if this changes then setMaxRegister() must change 105 | public void setRegister(final long registerIndex, final long value) { 106 | final long bitIndex = registerIndex * registerWidth; 107 | final int firstWordIndex = (int)(bitIndex >>> LOG2_BITS_PER_WORD)/*aka (bitIndex / BITS_PER_WORD)*/; 108 | final int secondWordIndex = (int)((bitIndex + registerWidth - 1) >>> LOG2_BITS_PER_WORD)/*see above*/; 109 | final int bitRemainder = (int)(bitIndex & BITS_PER_WORD_MASK)/*aka (bitIndex % BITS_PER_WORD)*/; 110 | 111 | final long words[] = this.words/*for convenience/performance*/; 112 | if(firstWordIndex == secondWordIndex) { 113 | // clear then set 114 | words[firstWordIndex] &= ~(registerMask << bitRemainder); 115 | words[firstWordIndex] |= (value << bitRemainder); 116 | } else {/*register spans words*/ 117 | // clear then set each partial word 118 | words[firstWordIndex] &= (1L << bitRemainder) - 1; 119 | words[firstWordIndex] |= (value << bitRemainder); 120 | 121 | words[secondWordIndex] &= ~(registerMask >>> (BITS_PER_WORD - bitRemainder)); 122 | words[secondWordIndex] |= (value >>> (BITS_PER_WORD - bitRemainder)); 123 | } 124 | } 125 | 126 | // ------------------------------------------------------------------------ 127 | /** 128 | * @return a LongIterator for iterating starting at the register 129 | * with index zero. This will never be null. 130 | */ 131 | public LongIterator registerIterator() { 132 | return new LongIterator() { 133 | final int registerWidth = BitVector.this.registerWidth; 134 | final long[] words = BitVector.this.words; 135 | final long registerMask = BitVector.this.registerMask; 136 | 137 | // register setup 138 | long registerIndex = 0; 139 | int wordIndex = 0; 140 | int remainingWordBits = BITS_PER_WORD; 141 | long word = words[wordIndex]; 142 | 143 | @Override public long next() { 144 | long register; 145 | if(remainingWordBits >= registerWidth) { 146 | register = word & registerMask; 147 | 148 | // shift to the next register 149 | word >>>= registerWidth; 150 | remainingWordBits -= registerWidth; 151 | } else { /*insufficient bits remaining in current word*/ 152 | wordIndex++/*move to the next word*/; 153 | 154 | register = (word | (words[wordIndex] << remainingWordBits)) & registerMask; 155 | 156 | // shift to the next partial register (word) 157 | word = words[wordIndex] >>> (registerWidth - remainingWordBits); 158 | remainingWordBits += BITS_PER_WORD - registerWidth; 159 | } 160 | registerIndex++; 161 | return register; 162 | } 163 | 164 | @Override public boolean hasNext() { 165 | return registerIndex < count; 166 | } 167 | }; 168 | } 169 | 170 | // ------------------------------------------------------------------------ 171 | // composite accessors 172 | /** 173 | * Sets the value of the specified index register if and only if the specified 174 | * value is greater than the current value in the register. This is equivalent 175 | * to but much more performant than:

176 | * 177 | *

vector.setRegister(index, Math.max(vector.getRegister(index), value));
178 | * 179 | * @param registerIndex the index of the register whose value is to be set. 180 | * This cannot be negative 181 | * @param value the value to set in the register if and only if this value 182 | * is greater than the current value in the register 183 | * @return true if and only if the specified value is greater 184 | * than or equal to the current register value. false 185 | * otherwise. 186 | * @see #getRegister(long) 187 | * @see #setRegister(long, long) 188 | * @see java.lang.Math#max(long, long) 189 | */ 190 | // NOTE: if this changes then setRegister() must change 191 | public boolean setMaxRegister(final long registerIndex, final long value) { 192 | final long bitIndex = registerIndex * registerWidth; 193 | final int firstWordIndex = (int)(bitIndex >>> LOG2_BITS_PER_WORD)/*aka (bitIndex / BITS_PER_WORD)*/; 194 | final int secondWordIndex = (int)((bitIndex + registerWidth - 1) >>> LOG2_BITS_PER_WORD)/*see above*/; 195 | final int bitRemainder = (int)(bitIndex & BITS_PER_WORD_MASK)/*aka (bitIndex % BITS_PER_WORD)*/; 196 | 197 | // NOTE: matches getRegister() 198 | final long registerValue; 199 | final long words[] = this.words/*for convenience/performance*/; 200 | if(firstWordIndex == secondWordIndex) 201 | registerValue = ((words[firstWordIndex] >>> bitRemainder) & registerMask); 202 | else /*register spans words*/ 203 | registerValue = (words[firstWordIndex] >>> bitRemainder)/*no need to mask since at top of word*/ 204 | | (words[secondWordIndex] << (BITS_PER_WORD - bitRemainder)) & registerMask; 205 | 206 | // determine which is the larger and update as necessary 207 | if(value > registerValue) { 208 | // NOTE: matches setRegister() 209 | if(firstWordIndex == secondWordIndex) { 210 | // clear then set 211 | words[firstWordIndex] &= ~(registerMask << bitRemainder); 212 | words[firstWordIndex] |= (value << bitRemainder); 213 | } else {/*register spans words*/ 214 | // clear then set each partial word 215 | words[firstWordIndex] &= (1L << bitRemainder) - 1; 216 | words[firstWordIndex] |= (value << bitRemainder); 217 | 218 | words[secondWordIndex] &= ~(registerMask >>> (BITS_PER_WORD - bitRemainder)); 219 | words[secondWordIndex] |= (value >>> (BITS_PER_WORD - bitRemainder)); 220 | } 221 | } /* else -- the register value is greater (or equal) so nothing needs to be done */ 222 | 223 | return (value >= registerValue); 224 | } 225 | 226 | // ======================================================================== 227 | /** 228 | * Fills this bit vector with the specified bit value. This can be used to 229 | * clear the vector by specifying 0. 230 | * 231 | * @param value the value to set all bits to (only the lowest bit is used) 232 | */ 233 | public void fill(final long value) { 234 | for(long i=0; inull. 244 | */ 245 | public void getRegisterContents(final IWordSerializer serializer) { 246 | for(final LongIterator iter = registerIterator(); iter.hasNext();) { 247 | serializer.writeWord(iter.next()); 248 | } 249 | } 250 | 251 | /** 252 | * Creates a deep copy of this vector. 253 | * 254 | * @see java.lang.Object#clone() 255 | */ 256 | @Override 257 | public BitVector clone() { 258 | final BitVector copy = new BitVector(registerWidth, count); 259 | System.arraycopy(words, 0, copy.words, 0, words.length); 260 | return copy; 261 | } 262 | } -------------------------------------------------------------------------------- /src/main/java/net/agkn/hll/util/HLLUtil.java: -------------------------------------------------------------------------------- 1 | package net.agkn.hll.util; 2 | 3 | /* 4 | * Copyright 2013 Aggregate Knowledge, Inc. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | import net.agkn.hll.HLL; 20 | 21 | /** 22 | * Static functions for computing constants and parameters used in the HLL 23 | * algorithm. 24 | * 25 | * @author timon 26 | */ 27 | public final class HLLUtil { 28 | /** 29 | * Precomputed pwMaxMask values indexed by registerSizeInBits. 30 | * Calculated with this formula: 31 | *
 32 |      *     int maxRegisterValue = (1 << registerSizeInBits) - 1;
 33 |      *     // Mask with all bits set except for (maxRegisterValue - 1) least significant bits (see #addRaw())
 34 |      *     return ~((1L << (maxRegisterValue - 1)) - 1);
 35 |      * 
36 | * 37 | * @see #pwMaxMask(int) 38 | */ 39 | private static final long[] PW_MASK = { 40 | ~((1L << (((1 << 0) - 1) - 1)) - 1), 41 | ~((1L << (((1 << 1) - 1) - 1)) - 1), 42 | ~((1L << (((1 << 2) - 1) - 1)) - 1), 43 | ~((1L << (((1 << 3) - 1) - 1)) - 1), 44 | ~((1L << (((1 << 4) - 1) - 1)) - 1), 45 | ~((1L << (((1 << 5) - 1) - 1)) - 1), 46 | ~((1L << (((1 << 6) - 1) - 1)) - 1), 47 | ~((1L << (((1 << 7) - 1) - 1)) - 1), 48 | ~((1L << (((1 << 8) - 1) - 1)) - 1) 49 | }; 50 | 51 | /** 52 | * Precomputed twoToL values indexed by a linear combination of 53 | * regWidth and log2m. 54 | * 55 | * The array is one-dimensional and can be accessed by using index 56 | * (REG_WIDTH_INDEX_MULTIPLIER * regWidth) + log2m 57 | * for regWidth and log2m between the specified 58 | * HLL.{MINIMUM,MAXIMUM}_{REGWIDTH,LOG2M}_PARAM constants. 59 | * 60 | * @see #largeEstimator(int, int, double) 61 | * @see #largeEstimatorCutoff(int, int) 62 | * @see Blog post with section on 2^L 63 | */ 64 | private static final double[] TWO_TO_L = new double[(HLL.MAXIMUM_REGWIDTH_PARAM + 1) * (HLL.MAXIMUM_LOG2M_PARAM + 1)]; 65 | 66 | /** 67 | * Spacing constant used to compute offsets into {@link TWO_TO_L}. 68 | */ 69 | private static final int REG_WIDTH_INDEX_MULTIPLIER = HLL.MAXIMUM_LOG2M_PARAM + 1; 70 | 71 | static { 72 | for(int regWidth = HLL.MINIMUM_REGWIDTH_PARAM; regWidth <= HLL.MAXIMUM_REGWIDTH_PARAM; regWidth++) { 73 | for(int log2m = HLL.MINIMUM_LOG2M_PARAM ; log2m <= HLL.MAXIMUM_LOG2M_PARAM; log2m++) { 74 | int maxRegisterValue = (1 << regWidth) - 1; 75 | 76 | // Since 1 is added to p(w) in the insertion algorithm, only 77 | // (maxRegisterValue - 1) bits are inspected hence the hash 78 | // space is one power of two smaller. 79 | final int pwBits = (maxRegisterValue - 1); 80 | final int totalBits = (pwBits + log2m); 81 | final double twoToL = Math.pow(2, totalBits); 82 | TWO_TO_L[(REG_WIDTH_INDEX_MULTIPLIER * regWidth) + log2m] = twoToL; 83 | } 84 | } 85 | } 86 | 87 | // ************************************************************************ 88 | /** 89 | * Computes the bit-width of HLL registers necessary to estimate a set of 90 | * the specified cardinality. 91 | * 92 | * @param expectedUniqueElements an upper bound on the number of unique 93 | * elements that are expected. This must be greater than zero. 94 | * @return a register size in bits (i.e. log2(log2(n))) 95 | */ 96 | public static int registerBitSize(final long expectedUniqueElements) { 97 | return Math.max(HLL.MINIMUM_REGWIDTH_PARAM, 98 | (int)Math.ceil(NumberUtil.log2(NumberUtil.log2(expectedUniqueElements)))); 99 | } 100 | 101 | // ======================================================================== 102 | /** 103 | * Computes the 'alpha-m-squared' constant used by the HyperLogLog algorithm. 104 | * 105 | * @param m this must be a power of two, cannot be less than 106 | * 16 (24), and cannot be greater than 65536 (216). 107 | * @return gamma times registerCount squared where gamma is 108 | * based on the value of registerCount. 109 | * @throws IllegalArgumentException if registerCount is less 110 | * than 16. 111 | */ 112 | public static double alphaMSquared(final int m) { 113 | switch(m) { 114 | case 1/*2^0*/: 115 | case 2/*2^1*/: 116 | case 4/*2^2*/: 117 | case 8/*2^3*/: 118 | throw new IllegalArgumentException("'m' cannot be less than 16 (" + m + " < 16)."); 119 | 120 | case 16/*2^4*/: 121 | return 0.673 * m * m; 122 | 123 | case 32/*2^5*/: 124 | return 0.697 * m * m; 125 | 126 | case 64/*2^6*/: 127 | return 0.709 * m * m; 128 | 129 | default/*>2^6*/: 130 | return (0.7213 / (1.0 + 1.079 / m)) * m * m; 131 | } 132 | } 133 | 134 | // ======================================================================== 135 | /** 136 | * Computes a mask that prevents overflow of HyperLogLog registers. 137 | * 138 | * @param registerSizeInBits the size of the HLL registers, in bits. 139 | * @return mask a long mask to prevent overflow of the registers 140 | * @see #registerBitSize(long) 141 | */ 142 | public static long pwMaxMask(final int registerSizeInBits) { 143 | return PW_MASK[registerSizeInBits]; 144 | } 145 | 146 | // ======================================================================== 147 | /** 148 | * The cutoff for using the "small range correction" formula, in the 149 | * HyperLogLog algorithm. 150 | * 151 | * @param m the number of registers in the HLL. m in the paper. 152 | * @return the cutoff for the small range correction. 153 | * @see #smallEstimator(int, int) 154 | */ 155 | public static double smallEstimatorCutoff(final int m) { 156 | return ((double)m * 5) / 2; 157 | } 158 | 159 | /** 160 | * The "small range correction" formula from the HyperLogLog algorithm. Only 161 | * appropriate if both the estimator is smaller than
(5/2) * m
and 162 | * there are still registers that have the zero value. 163 | * 164 | * @param m the number of registers in the HLL. m in the paper. 165 | * @param numberOfZeroes the number of registers with value zero. V 166 | * in the paper. 167 | * @return a corrected cardinality estimate. 168 | */ 169 | public static double smallEstimator(final int m, final int numberOfZeroes) { 170 | return m * Math.log((double)m / numberOfZeroes); 171 | } 172 | 173 | /** 174 | * The cutoff for using the "large range correction" formula, from the 175 | * HyperLogLog algorithm, adapted for 64 bit hashes. 176 | * 177 | * @param log2m log-base-2 of the number of registers in the HLL. b in the paper. 178 | * @param registerSizeInBits the size of the HLL registers, in bits. 179 | * @return the cutoff for the large range correction. 180 | * @see #largeEstimator(int, int, double) 181 | * @see Blog post with section on 64 bit hashes and "large range correction" cutoff 182 | */ 183 | public static double largeEstimatorCutoff(final int log2m, final int registerSizeInBits) { 184 | return (TWO_TO_L[(REG_WIDTH_INDEX_MULTIPLIER * registerSizeInBits) + log2m]) / 30.0; 185 | } 186 | 187 | /** 188 | * The "large range correction" formula from the HyperLogLog algorithm, adapted 189 | * for 64 bit hashes. Only appropriate for estimators whose value exceeds 190 | * the return of {@link #largeEstimatorCutoff(int, int)}. 191 | * 192 | * @param log2m log-base-2 of the number of registers in the HLL. b in the paper. 193 | * @param registerSizeInBits the size of the HLL registers, in bits. 194 | * @param estimator the original estimator ("E" in the paper). 195 | * @return a corrected cardinality estimate. 196 | * @see Blog post with section on 64 bit hashes and "large range correction" 197 | */ 198 | public static double largeEstimator(final int log2m, final int registerSizeInBits, final double estimator) { 199 | final double twoToL = TWO_TO_L[(REG_WIDTH_INDEX_MULTIPLIER * registerSizeInBits) + log2m]; 200 | return -1 * twoToL * Math.log(1.0 - (estimator/twoToL)); 201 | } 202 | } 203 | -------------------------------------------------------------------------------- /src/main/java/net/agkn/hll/util/LongIterator.java: -------------------------------------------------------------------------------- 1 | package net.agkn.hll.util; 2 | 3 | /* 4 | * Copyright 2013 Aggregate Knowledge, Inc. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | /** 20 | * A long-based iterator. This is not is-a {@link java.util.Iterator} 21 | * to prevent autoboxing between Long and long. 22 | * 23 | * @author rgrzywinski 24 | */ 25 | public interface LongIterator { 26 | /** 27 | * @return true if and only if there are more elements to 28 | * iterate over. false otherwise. 29 | */ 30 | boolean hasNext(); 31 | 32 | /** 33 | * @return the next long in the collection. 34 | */ 35 | long next(); 36 | } -------------------------------------------------------------------------------- /src/main/java/net/agkn/hll/util/NumberUtil.java: -------------------------------------------------------------------------------- 1 | package net.agkn.hll.util; 2 | 3 | /* 4 | * Copyright 2013 Aggregate Knowledge, Inc. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | /** 20 | * A collection of utilities to work with numbers. 21 | * 22 | * @author rgrzywinski 23 | */ 24 | public class NumberUtil { 25 | // loge(2) (log-base e of 2) 26 | public static final double LOGE_2 = 0.6931471805599453; 27 | 28 | // ************************************************************************ 29 | /** 30 | * Computes the log2 (log-base-two) of the specified value. 31 | * 32 | * @param value the double for which the log2 is 33 | * desired. 34 | * @return the log2 of the specified value 35 | */ 36 | public static double log2(final double value) { 37 | // REF: http://en.wikipedia.org/wiki/Logarithmic_scale (conversion of bases) 38 | return Math.log(value) / LOGE_2; 39 | } 40 | 41 | // ======================================================================== 42 | // the hex characters 43 | private static final char[] HEX = { '0', '1', '2', '3', '4', '5', '6', '7', 44 | '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' }; 45 | 46 | // ------------------------------------------------------------------------ 47 | /** 48 | * Converts the specified array of bytes into a string of 49 | * hex characters (low byte first). 50 | * 51 | * @param bytes the array of bytes that are to be converted. 52 | * This cannot be null though it may be empty. 53 | * @param offset the offset in bytes at which the bytes will 54 | * be taken. This cannot be negative and must be less than 55 | * bytes.length - 1. 56 | * @param count the number of bytes to be retrieved from the specified array. 57 | * This cannot be negative. If greater than bytes.length - offset 58 | * then that value is used. 59 | * @return a string of at most count characters that represents 60 | * the specified byte array in hex. This will never be null 61 | * though it may be empty if bytes is empty or count 62 | * is zero. 63 | * @throws IllegalArgumentException if offset is greater than 64 | * or equal to bytes.length. 65 | * @see #fromHex(String, int, int) 66 | */ 67 | public static String toHex(final byte[] bytes, final int offset, final int count) { 68 | if(offset >= bytes.length) throw new IllegalArgumentException("Offset is greater than the length (" + offset + " >= " + bytes.length + ").")/*by contract*/; 69 | final int byteCount = Math.min( (bytes.length - offset), count); 70 | final int upperBound = byteCount + offset; 71 | 72 | final char[] chars = new char[byteCount * 2/*two chars per byte*/]; 73 | int charIndex = 0; 74 | for(int i=offset; i>> 4) & 0x0F]; 77 | chars[charIndex++] = HEX[value & 0x0F]; 78 | } 79 | 80 | return new String(chars); 81 | } 82 | 83 | /** 84 | * Converts the specified array of hex characters into an array of bytes 85 | * (low byte first). 86 | * 87 | * @param string the string of hex characters to be converted into bytes. 88 | * This cannot be null though it may be blank. 89 | * @param offset the offset in the string at which the characters will be 90 | * taken. This cannot be negative and must be less than string.length() - 1. 91 | * @param count the number of characters to be retrieved from the specified 92 | * string. This cannot be negative and must be divisible by two 93 | * (since there are two characters per byte). 94 | * @return the array of bytes that were converted from the 95 | * specified string (in the specified range). This will never be 96 | * null though it may be empty if string 97 | * is empty or count is zero. 98 | * @throws IllegalArgumentException if offset is greater than 99 | * or equal to string.length() or if count 100 | * is not divisible by two. 101 | * @see #toHex(byte[], int, int) 102 | */ 103 | public static byte[] fromHex(final String string, final int offset, final int count) { 104 | if(offset >= string.length()) throw new IllegalArgumentException("Offset is greater than the length (" + offset + " >= " + string.length() + ").")/*by contract*/; 105 | if( (count & 0x01) != 0) throw new IllegalArgumentException("Count is not divisible by two (" + count + ").")/*by contract*/; 106 | final int charCount = Math.min((string.length() - offset), count); 107 | final int upperBound = offset + charCount; 108 | 109 | final byte[] bytes = new byte[charCount >>> 1/*aka /2*/]; 110 | int byteIndex = 0/*beginning*/; 111 | for(int i=offset; ibyte. 122 | * This cannot be a character other than [a-fA-F0-9]. 123 | * @return the value of the specified character. This will be a value 0 124 | * through 15. 125 | * @throws IllegalArgumentException if the specified character is not in 126 | * [a-fA-F0-9] 127 | */ 128 | private static final int digit(final char character) { 129 | switch(character) { 130 | case '0': 131 | return 0; 132 | case '1': 133 | return 1; 134 | case '2': 135 | return 2; 136 | case '3': 137 | return 3; 138 | case '4': 139 | return 4; 140 | case '5': 141 | return 5; 142 | case '6': 143 | return 6; 144 | case '7': 145 | return 7; 146 | case '8': 147 | return 8; 148 | case '9': 149 | return 9; 150 | case 'a': 151 | case 'A': 152 | return 10; 153 | case 'b': 154 | case 'B': 155 | return 11; 156 | case 'c': 157 | case 'C': 158 | return 12; 159 | case 'd': 160 | case 'D': 161 | return 13; 162 | case 'e': 163 | case 'E': 164 | return 14; 165 | case 'f': 166 | case 'F': 167 | return 15; 168 | 169 | default: 170 | throw new IllegalArgumentException("Character is not in [a-fA-F0-9] ('" + character + "')."); 171 | } 172 | } 173 | } -------------------------------------------------------------------------------- /src/test/java/net/agkn/hll/ExplicitHLLTest.java: -------------------------------------------------------------------------------- 1 | package net.agkn.hll; 2 | 3 | /* 4 | * Copyright 2013 Aggregate Knowledge, Inc. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | import static org.powermock.reflect.Whitebox.getInternalState; 20 | import static org.testng.Assert.assertEquals; 21 | import static org.testng.Assert.assertTrue; 22 | import it.unimi.dsi.fastutil.longs.LongOpenHashSet; 23 | 24 | import java.util.HashSet; 25 | import java.util.Random; 26 | 27 | import net.agkn.hll.serialization.ISchemaVersion; 28 | import net.agkn.hll.serialization.SerializationUtil; 29 | import org.testng.annotations.Test; 30 | 31 | /** 32 | * Tests {@link HLL} of type {@link HLLType#EXPLICIT}. 33 | * 34 | * @author timon 35 | */ 36 | public class ExplicitHLLTest { 37 | /** 38 | * Tests basic set semantics of {@link HLL#addRaw(long)}. 39 | */ 40 | @Test 41 | public void addBasicTest() { 42 | { // Adding a single positive value to an empty set should work. 43 | final HLL hll = newHLL(128/*arbitrary*/); 44 | hll.addRaw(1L/*positive*/); 45 | assertEquals(hll.cardinality(), 1L); 46 | } 47 | { // Adding a single negative value to an empty set should work. 48 | final HLL hll = newHLL(128/*arbitrary*/); 49 | hll.addRaw(-1L/*negative*/); 50 | assertEquals(hll.cardinality(), 1L); 51 | } 52 | { // Adding a duplicate value to a set should be a no-op. 53 | final HLL hll = newHLL(128/*arbitrary*/); 54 | hll.addRaw(1L/*positive*/); 55 | assertEquals(hll.cardinality(), 1L/*arbitrary*/); 56 | assertEquals(hll.cardinality(), 1L/*dupe*/); 57 | } 58 | } 59 | 60 | // ------------------------------------------------------------------------ 61 | /** 62 | * Tests {@link HLL#union(HLL)}. 63 | */ 64 | @Test 65 | public void unionTest() { 66 | {// Unioning two distinct sets should work 67 | final HLL hllA = newHLL(128/*arbitrary*/); 68 | final HLL hllB = newHLL(128/*arbitrary*/); 69 | hllA.addRaw(1L); 70 | hllA.addRaw(2L); 71 | hllB.addRaw(3L); 72 | 73 | hllA.union(hllB); 74 | assertEquals(hllA.cardinality(), 3); 75 | } 76 | {// Unioning two sets whose union doesn't exceed the cardinality cap should not promote 77 | final HLL hllA = newHLL(128/*arbitrary*/); 78 | final HLL hllB = newHLL(128/*arbitrary*/); 79 | hllA.addRaw(1L); 80 | hllA.addRaw(2L); 81 | hllB.addRaw(1L); 82 | 83 | hllA.union(hllB); 84 | assertEquals(hllA.cardinality(), 2); 85 | } 86 | {// unioning two sets whose union exceeds the cardinality cap should promote 87 | final HLL hllA = newHLL(128/*arbitrary*/); 88 | final HLL hllB = newHLL(128/*arbitrary*/); 89 | 90 | // fill up sets to explicitThreshold 91 | for(long i=0; i<128/*explicitThreshold*/; i++) { 92 | hllA.addRaw(i); 93 | hllB.addRaw(i + 128); 94 | } 95 | 96 | hllA.union(hllB); 97 | assertEquals(hllA.getType(), HLLType.SPARSE); 98 | } 99 | } 100 | 101 | // ------------------------------------------------------------------------ 102 | /** 103 | * Tests {@link HLL#clear()} 104 | */ 105 | @Test 106 | public void clearTest() { 107 | final HLL hll = newHLL(128/*arbitrary*/); 108 | hll.addRaw(1L); 109 | assertEquals(hll.cardinality(), 1L); 110 | hll.clear(); 111 | assertEquals(hll.cardinality(), 0L); 112 | } 113 | 114 | // ------------------------------------------------------------------------ 115 | /** 116 | * Tests {@link LongSetSlab#toBytes(int, ISchemaVersion)} and 117 | * {@link LongSetSlab#fromBytes(int, byte[], ISchemaVersion)}. 118 | */ 119 | @Test 120 | public void toFromBytesTest() { 121 | final ISchemaVersion schemaVersion = SerializationUtil.DEFAULT_SCHEMA_VERSION; 122 | final HLLType type = HLLType.EXPLICIT; 123 | final int padding = schemaVersion.paddingBytes(type); 124 | final int bytesPerWord = 8; 125 | 126 | {// Should work on an empty set 127 | final HLL hll = newHLL(128/*arbitrary*/); 128 | 129 | final byte[] bytes = hll.toBytes(schemaVersion); 130 | 131 | // assert output has correct byte length 132 | assertEquals(bytes.length, padding/*no elements, just padding*/); 133 | 134 | final HLL inHLL = HLL.fromBytes(bytes); 135 | 136 | assertElementsEqual(hll, inHLL); 137 | } 138 | {// Should work on a partially filled set 139 | final HLL hll = newHLL(128/*arbitrary*/); 140 | 141 | for(int i=0; i<3; i++) { 142 | hll.addRaw(i); 143 | } 144 | 145 | final byte[] bytes = hll.toBytes(schemaVersion); 146 | 147 | // assert output has correct byte length 148 | assertEquals(bytes.length, padding + (bytesPerWord * 3/*elements*/)); 149 | 150 | final HLL inHLL = HLL.fromBytes(bytes); 151 | 152 | assertElementsEqual(hll, inHLL); 153 | } 154 | {// Should work on a full set 155 | final int explicitThreshold = 128; 156 | final HLL hll = newHLL(explicitThreshold); 157 | 158 | for(int i=0; i canonical = new HashSet(); 181 | final HLL hll = newHLL(explicitThreshold); 182 | 183 | final long seed = 1L/*constant so results are reproducible*/; 184 | final Random random = new Random(seed); 185 | for(int i=0;i explicitThreshold = 8*/, false/*sparseon*/, HLLType.EXPLICIT); 211 | 212 | for(int i=0;i<9/* > explicitThreshold */;i++){ 213 | hll.addRaw(i); 214 | } 215 | assertEquals(hll.getType(), HLLType.FULL); 216 | } 217 | } 218 | 219 | // ************************************************************************ 220 | // assertion helpers 221 | /** 222 | * Asserts that values in both sets are exactly equal. 223 | */ 224 | private static void assertElementsEqual(final HLL hllA, final HLL hllB) { 225 | final LongOpenHashSet internalSetA = (LongOpenHashSet)getInternalState(hllA, "explicitStorage"); 226 | final LongOpenHashSet internalSetB = (LongOpenHashSet)getInternalState(hllB, "explicitStorage"); 227 | 228 | assertTrue(internalSetA.equals(internalSetB)); 229 | } 230 | 231 | /** 232 | * Builds a {@link HLLType#EXPLICIT} {@link HLL} instance with the specified 233 | * explicit threshold. 234 | * 235 | * @param explicitThreshold explicit threshold to use for the constructed 236 | * {@link HLL}. This must be greater than zero. 237 | * @return a default-sized {@link HLLType#EXPLICIT} empty {@link HLL} instance. 238 | * This will never be null. 239 | */ 240 | private static HLL newHLL(final int explicitThreshold) { 241 | return new HLL(11/*log2m, unused*/, 5/*regwidth, unused*/, explicitThreshold, 256/*sparseThreshold, arbitrary, unused*/, HLLType.EXPLICIT); 242 | } 243 | } -------------------------------------------------------------------------------- /src/test/java/net/agkn/hll/FullHLLTest.java: -------------------------------------------------------------------------------- 1 | package net.agkn.hll; 2 | 3 | /* 4 | * Copyright 2013 Aggregate Knowledge, Inc. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | import static org.powermock.reflect.Whitebox.getInternalState; 20 | import static org.testng.Assert.assertEquals; 21 | import static org.testng.Assert.assertTrue; 22 | import static org.testng.Assert.assertFalse; 23 | 24 | import net.agkn.hll.serialization.ISchemaVersion; 25 | import net.agkn.hll.serialization.SerializationUtil; 26 | import net.agkn.hll.util.BitVector; 27 | import net.agkn.hll.util.HLLUtil; 28 | import net.agkn.hll.util.LongIterator; 29 | 30 | import org.testng.annotations.Test; 31 | 32 | /** 33 | * Tests {@link HLL} of type {@link HLLType#FULL}. 34 | * 35 | * @author rgrzywinski 36 | * @author timon 37 | */ 38 | public class FullHLLTest { 39 | // TODO union test 40 | /** 41 | * Smoke test for {@link HLL#cardinality(int)} and the proper use of the 42 | * small range correction. 43 | */ 44 | @Test 45 | public void smallRangeSmokeTest() { 46 | final int log2m = 11; 47 | final int m = (1 << log2m); 48 | final int regwidth = 5; 49 | 50 | // only one register set 51 | { 52 | final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL); 53 | hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 0/*ix*/, 1/*val*/)); 54 | 55 | final long cardinality = hll.cardinality(); 56 | 57 | // Trivially true that small correction conditions hold: one register 58 | // set implies zeroes exist, and estimator trivially smaller than 5m/2. 59 | // Small range correction: m * log(m/V) 60 | final long expected = (long)Math.ceil(m * Math.log((double)m / (m - 1)/*# of zeroes*/)); 61 | assertEquals(cardinality, expected); 62 | } 63 | 64 | // all but one register set 65 | { 66 | final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL); 67 | for(int i=0; i<(m - 1); i++) { 68 | hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, i/*ix*/, 1/*val*/)); 69 | } 70 | 71 | // Trivially true that small correction conditions hold: all but 72 | // one register set implies a zero exists, and estimator trivially 73 | // smaller than 5m/2 since it's alpha / ((m-1)/2) 74 | final long cardinality = hll.cardinality(); 75 | 76 | // Small range correction: m * log(m/V) 77 | final long expected = (long)Math.ceil(m * Math.log((double)m / 1/*# of zeroes*/)); 78 | assertEquals(cardinality, expected); 79 | } 80 | } 81 | 82 | /** 83 | * Smoke test for {@link HLL#cardinality()} and the proper use of the 84 | * uncorrected estimator 85 | */ 86 | @Test 87 | public void normalRangeSmokeTest() { 88 | final int log2m = 11; 89 | final int regwidth = 5; 90 | // regwidth = 5, so hash space is 91 | // log2m + (2^5 - 1 - 1), so L = log2m + 30 92 | final int l = log2m + 30; 93 | final int m = (1 << log2m); 94 | final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL); 95 | 96 | // all registers at 'medium' value 97 | { 98 | final int registerValue = 7/*chosen to ensure neither correction kicks in*/; 99 | for(int i=0; i (5 * m /(double)2)); 112 | 113 | final long expected = (long)Math.ceil(estimator); 114 | assertEquals(cardinality, expected); 115 | } 116 | } 117 | 118 | /** 119 | * Smoke test for {@link HLL#cardinality()} and the proper use of the large 120 | * range correction. 121 | */ 122 | @Test 123 | public void largeRangeSmokeTest() { 124 | final int log2m = 12; 125 | final int regwidth = 5; 126 | // regwidth = 5, so hash space is 127 | // log2m + (2^5 - 1 - 1), so L = log2m + 30 128 | final int l = log2m + 30; 129 | final int m = (1 << log2m); 130 | final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL); 131 | 132 | { 133 | final int registerValue = 31/*chosen to ensure large correction kicks in*/; 134 | for(int i=0; i Math.pow(2,l)/30); 147 | 148 | // Large range correction: -2^L * log(1 - E/2^L) 149 | final long expected = (long)Math.ceil(-1.0 * Math.pow(2, l) * Math.log(1.0 - estimator/Math.pow(2, l))); 150 | assertEquals(cardinality, expected); 151 | } 152 | } 153 | 154 | // ======================================================================== 155 | /** 156 | * Tests the bounds on a register's value for a given raw input value. 157 | */ 158 | @Test 159 | public void registerValueTest() { 160 | final int log2m = 4/*small enough to make testing easy (addRaw() shifts by one byte)*/; 161 | 162 | // register width 4 (the minimum size) 163 | { // scoped locally for sanity 164 | final int regwidth = 4; 165 | final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL); 166 | final BitVector bitVector = (BitVector)getInternalState(hll, "probabilisticStorage")/*for testing convenience*/; 167 | 168 | // lower-bounds of the register 169 | hll.addRaw(0x000000000000001L/*'j'=1*/); 170 | assertEquals(bitVector.getRegister(1/*'j'*/), 0); 171 | 172 | hll.addRaw(0x0000000000000012L/*'j'=2*/); 173 | assertEquals(bitVector.getRegister(2/*'j'*/), 1); 174 | 175 | hll.addRaw(0x0000000000000023L/*'j'=3*/); 176 | assertEquals(bitVector.getRegister(3/*'j'*/), 2); 177 | 178 | hll.addRaw(0x0000000000000044L/*'j'=4*/); 179 | assertEquals(bitVector.getRegister(4/*'j'*/), 3); 180 | 181 | hll.addRaw(0x0000000000000085L/*'j'=5*/); 182 | assertEquals(bitVector.getRegister(5/*'j'*/), 4); 183 | 184 | // upper-bounds of the register 185 | // NOTE: bear in mind that BitVector itself does ensure that 186 | // overflow of a register is prevented 187 | hll.addRaw(0x0000000000010006L/*'j'=6*/); 188 | assertEquals(bitVector.getRegister(6/*'j'*/), 13); 189 | 190 | hll.addRaw(0x0000000000020007L/*'j'=7*/); 191 | assertEquals(bitVector.getRegister(7/*'j'*/), 14); 192 | 193 | hll.addRaw(0x0000000000040008L/*'j'=8*/); 194 | assertEquals(bitVector.getRegister(8/*'j'*/), 15); 195 | 196 | hll.addRaw(0x0000000000080009L/*'j'=9*/); 197 | assertEquals(bitVector.getRegister(9/*'j'*/), 15/*overflow*/); 198 | 199 | // sanity checks to ensure that no other bits above the lowest-set 200 | // bit matters 201 | // NOTE: same as case 'j = 6' above 202 | hll.addRaw(0x000000000003000AL/*'j'=10*/); 203 | assertEquals(bitVector.getRegister(10/*'j'*/), 13); 204 | 205 | hll.addRaw(0x000000000011000BL/*'j'=11*/); 206 | assertEquals(bitVector.getRegister(11/*'j'*/), 13); 207 | } 208 | 209 | // register width 5 210 | { // scoped locally for sanity 211 | final int regwidth = 5; 212 | final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL); 213 | final BitVector bitVector = (BitVector)getInternalState(hll, "probabilisticStorage")/*for testing convenience*/; 214 | 215 | // lower-bounds of the register 216 | hll.addRaw(0x0000000000000001L/*'j'=1*/); 217 | assertEquals(bitVector.getRegister(1/*'j'*/), 0); 218 | 219 | hll.addRaw(0x0000000000000012L/*'j'=2*/); 220 | assertEquals(bitVector.getRegister(2/*'j'*/), 1); 221 | 222 | hll.addRaw(0x0000000000000023L/*'j'=3*/); 223 | assertEquals(bitVector.getRegister(3/*'j'*/), 2); 224 | 225 | hll.addRaw(0x0000000000000044L/*'j'=4*/); 226 | assertEquals(bitVector.getRegister(4/*'j'*/), 3); 227 | 228 | hll.addRaw(0x0000000000000085L/*'j'=5*/); 229 | assertEquals(bitVector.getRegister(5/*'j'*/), 4); 230 | 231 | // upper-bounds of the register 232 | // NOTE: bear in mind that BitVector itself does ensure that 233 | // overflow of a register is prevented 234 | hll.addRaw(0x0000000100000006L/*'j'=6*/); 235 | assertEquals(bitVector.getRegister(6/*'j'*/), 29); 236 | 237 | hll.addRaw(0x0000000200000007L/*'j'=7*/); 238 | assertEquals(bitVector.getRegister(7/*'j'*/), 30); 239 | 240 | hll.addRaw(0x0000000400000008L/*'j'=8*/); 241 | assertEquals(bitVector.getRegister(8/*'j'*/), 31); 242 | 243 | hll.addRaw(0x0000000800000009L/*'j'=9*/); 244 | assertEquals(bitVector.getRegister(9/*'j'*/), 31/*overflow*/); 245 | } 246 | } 247 | 248 | // ======================================================================== 249 | /** 250 | * Tests {@link HLL#clear()}. 251 | */ 252 | @Test 253 | public void clearTest() { 254 | final int regwidth = 5; 255 | final int log2m = 4/*16 registers per counter*/; 256 | final int m = 1 << log2m; 257 | 258 | final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL); 259 | final BitVector bitVector = (BitVector)getInternalState(hll, "probabilisticStorage")/*for testing convenience*/; 260 | for(int i=0; i SPARSE 140 | * - SPARSE U "underpopulated" FULL => SPARSE 141 | * - SPARSE U "barely underpopulated" FULL => FULL 142 | */ 143 | private static void sparseFullRepresentationTest(final ISchemaVersion schemaVersion) throws IOException { 144 | final FileWriter output = openOutput(schemaVersion, "sparse_full_representation", TestType.UNION); 145 | 146 | final HLL emptyHLL1 = newHLL(HLLType.EMPTY); 147 | final HLL emptyHLL2 = newHLL(HLLType.EMPTY); 148 | 149 | cumulativeUnionLine(output, emptyHLL1, emptyHLL2, schemaVersion); 150 | 151 | // NOTE: In this test the sparseReference will be the "expected" value 152 | // from the C representation, since it doesn't choose representation 153 | // based on original encoding, but rather on the promotion rules 154 | // and the declared type of the "receiving" field. 155 | // It is the manually-constructed union result. 156 | 157 | // "underpopulated" FULL U EMPTY => SPARSE 158 | final HLL fullHLL = newHLL(HLLType.FULL); 159 | fullHLL.addRaw(constructHLLValue(LOG2M, 0/*ix*/, 1/*val*/)); 160 | 161 | final HLL sparseHLL = newHLL(HLLType.SPARSE); 162 | sparseHLL.addRaw(constructHLLValue(LOG2M, 0/*ix*/, 1/*val*/)); 163 | 164 | output.write(stringCardinality(fullHLL) + "," + toByteA(fullHLL, schemaVersion) + "," + stringCardinality(sparseHLL) + "," + toByteA(sparseHLL, schemaVersion) + "\n"); 165 | output.flush(); 166 | 167 | // "underpopulated" FULL (small) U SPARSE (small) => SPARSE 168 | final HLL fullHLL2 = newHLL(HLLType.FULL); 169 | fullHLL2.addRaw(constructHLLValue(LOG2M, 1/*ix*/, 1/*val*/)); 170 | 171 | sparseHLL.addRaw(constructHLLValue(LOG2M, 1/*ix*/, 1/*val*/)); 172 | 173 | output.write(stringCardinality(fullHLL2) + "," + toByteA(fullHLL2, schemaVersion) + "," + stringCardinality(sparseHLL) + "," + toByteA(sparseHLL, schemaVersion) + "\n"); 174 | output.flush(); 175 | 176 | // "underpopulated" FULL (just on edge) U SPARSE (small) => FULL 177 | final HLL fullHLL3 = newHLL(HLLType.FULL); 178 | for(int i=2; i<(SPARSE_THRESHOLD + 1); i++) { 179 | fullHLL3.addRaw(constructHLLValue(LOG2M, i/*ix*/, 1/*val*/)); 180 | sparseHLL.addRaw(constructHLLValue(LOG2M, i/*ix*/, 1/*val*/)); 181 | } 182 | 183 | output.write(stringCardinality(fullHLL3) + "," + toByteA(fullHLL3, schemaVersion) + "," + stringCardinality(sparseHLL) + "," + toByteA(sparseHLL, schemaVersion) + "\n"); 184 | output.flush(); 185 | } 186 | 187 | /** 188 | * Cumulatively sets successive registers to: 189 | * 190 | * (registerIndex % REGISTER_MAX_VALUE) + 1 191 | * 192 | * by adding specifically constructed values to a SPARSE HLL. 193 | * Does not induce promotion. 194 | * 195 | * Format: cumulative add 196 | * Tests: 197 | * - SPARSE addition (predictable) 198 | */ 199 | private static void sparseStepTest(final ISchemaVersion schemaVersion) throws IOException { 200 | final FileWriter output = openOutput(schemaVersion, "sparse_step", TestType.ADD); 201 | 202 | // the accumulator, starts empty sparse probabilistic 203 | final HLL hll = newHLL(HLLType.SPARSE); 204 | initLineAdd(output, hll, schemaVersion); 205 | 206 | for(int i=0; inull. 526 | * @return the algorithm-specific cardinality of the instance as a PostgreSQL- 527 | * compatible String. This will never be null 528 | */ 529 | private static String stringCardinality(final HLL hll) { 530 | switch(hll.getType()) { 531 | case EMPTY: 532 | return "0"; 533 | case EXPLICIT:/*promotion has not yet occurred*/ 534 | return Long.toString(hll.cardinality()); 535 | case SPARSE: 536 | return Double.toString(hll.sparseProbabilisticAlgorithmCardinality()); 537 | case FULL: 538 | return Double.toString(hll.fullProbabilisticAlgorithmCardinality()); 539 | default: 540 | throw new RuntimeException("Unknown HLL type " + hll.getType()); 541 | } 542 | } 543 | 544 | /** 545 | * Generates a random HLL and populates it with random values. 546 | * 547 | * @param random the {@link Random random number generator} used to populate 548 | * the HLL. This cannot be null. 549 | * @return the populated HLL. This will never be null. 550 | */ 551 | public static HLL generateRandomHLL(final Random random) { 552 | final int randomTypeInt = random.nextInt(HLLType.values().length); 553 | final HLLType type; 554 | switch(randomTypeInt) { 555 | case 0: 556 | type = HLLType.EMPTY; 557 | break; 558 | case 1: 559 | type = HLLType.EXPLICIT; 560 | break; 561 | case 2: 562 | type = HLLType.FULL; 563 | break; 564 | case 3: 565 | type = HLLType.EMPTY; 566 | break; 567 | case 4: 568 | type = HLLType.SPARSE; 569 | break; 570 | default: 571 | throw new RuntimeException("Unassigned type int " + randomTypeInt); 572 | } 573 | 574 | final int cardinalityCap; 575 | final int cardinalityBaseline; 576 | 577 | switch(type) { 578 | case EMPTY: 579 | return newHLL(HLLType.EMPTY); 580 | case EXPLICIT: 581 | cardinalityCap = EXPLICIT_THRESHOLD; 582 | cardinalityBaseline = 1; 583 | break; 584 | case SPARSE: 585 | cardinalityCap = SPARSE_THRESHOLD; 586 | cardinalityBaseline = (EXPLICIT_THRESHOLD + 1); 587 | break; 588 | case FULL: 589 | cardinalityCap = 100000; 590 | cardinalityBaseline = (SPARSE_THRESHOLD*10); 591 | break; 592 | default: 593 | throw new RuntimeException("We should never be here."); 594 | } 595 | 596 | final HLL hll = newHLL(HLLType.EMPTY); 597 | for(int i=0; inull. 612 | * @param description Description string used to build the filename. 613 | * This cannot be null. 614 | * @param type {@link TestType type} of the test file to be written. 615 | * This cannot be null. 616 | * @return The opened {@link FileWriter writer}. This will never be null. 617 | */ 618 | private static FileWriter openOutput(final ISchemaVersion schemaVersion, final String description, final TestType type) throws IOException { 619 | final String schemaVersionPrefix = "v"+ schemaVersion.schemaVersionNumber() + "_"; 620 | final String header; 621 | final String filename; 622 | switch(type) { 623 | case ADD: 624 | header = "cardinality,raw_value,HLL\n"; 625 | filename = schemaVersionPrefix + "cumulative_add_" + description + ".csv"; 626 | break; 627 | case UNION: 628 | header = "cardinality,HLL,union_cardinality,union_HLL\n"; 629 | filename = schemaVersionPrefix + "cumulative_union_" + description + ".csv"; 630 | break; 631 | default: 632 | throw new RuntimeException("Unknown test type " + type); 633 | } 634 | 635 | final FileWriter output = new FileWriter(OUTPUT_DIRECTORY + filename); 636 | output.write(header); 637 | output.flush(); 638 | return output; 639 | } 640 | 641 | /** 642 | * Writes out a {@link TestType#ADD}-formatted test line. 643 | * 644 | * @param output The output {@link FileWriter writer}. This cannot be null. 645 | * @param hll The "accumulator" HLL instance. This cannot be null. 646 | * @param rawValue The raw value added to the HLL. 647 | * @param schemaVersion the schema with which to serialize the HLLs. This cannot 648 | * be null. 649 | */ 650 | private static void cumulativeAddLine(final FileWriter output, final HLL hll, final long rawValue, final ISchemaVersion schemaVersion) throws IOException { 651 | hll.addRaw(rawValue); 652 | final String accumulatorCardinality = stringCardinality(hll); 653 | 654 | output.write(accumulatorCardinality + "," + rawValue + "," + toByteA(hll, schemaVersion) + "\n"); 655 | output.flush(); 656 | } 657 | 658 | /** 659 | * Writes an initial line for a {@link TestType#ADD}-formatted test. 660 | * 661 | * @param output The output {@link FileWriter writer}. This cannot be null. 662 | * @param hll The "accumulator" HLL instance. This cannot be null. 663 | * @param rawValue The raw value added to the HLL. 664 | * @param schemaVersion the schema with which to serialize the HLLs. This cannot 665 | * be null. 666 | */ 667 | private static void initLineAdd(final FileWriter output, final HLL hll, final ISchemaVersion schemaVersion) throws IOException { 668 | output.write(0 + "," + 0 + "," + toByteA(hll, schemaVersion) + "\n"); 669 | output.flush(); 670 | } 671 | 672 | /** 673 | * Writes out a {@link TestType#UNION}-formatted test line. 674 | * 675 | * @param output The output {@link FileWriter writer}. This cannot be null. 676 | * @param hll The "accumulator" HLL instance. This cannot be null. 677 | * @param increment The "increment" HLL instance which will be unioned into 678 | * the accumulator. This cannot be null. 679 | * @param schemaVersion the schema with which to serialize the HLLs. This cannot 680 | * be null. 681 | */ 682 | private static void cumulativeUnionLine(final FileWriter output, final HLL hll, final HLL increment, final ISchemaVersion schemaVersion) throws IOException { 683 | hll.union(increment); 684 | 685 | final String incrementCardinality = stringCardinality(increment); 686 | final String accumulatorCardinality = stringCardinality(hll); 687 | output.write(incrementCardinality + "," + toByteA(increment, schemaVersion) + "," + accumulatorCardinality + "," + toByteA(hll, schemaVersion) + "\n"); 688 | output.flush(); 689 | } 690 | 691 | /** 692 | * Serializes a HLL to Postgres 9 'bytea' hex-format, for CSV ingest. 693 | * 694 | * @param hll the HLL to serialize. This cannot be null. 695 | * @param schemaVersion the schema with which to serialize the HLLs. This cannot 696 | * be null. 697 | * @return a PostgreSQL 'bytea' string representing the HLL. 698 | */ 699 | private static String toByteA(final HLL hll, final ISchemaVersion schemaVersion) { 700 | final byte[] bytes = hll.toBytes(schemaVersion); 701 | return ("\\x" + NumberUtil.toHex(bytes, 0, bytes.length)); 702 | } 703 | 704 | /** 705 | * Indicates what kind of test output a test will generate. 706 | */ 707 | private static enum TestType { 708 | /** 709 | * This type of test is characterized by values being added to an 710 | * accumulator HLL whose serialized representation (after the value is added) 711 | * is printed to each line along with the cardinality and added value. 712 | */ 713 | ADD, 714 | /** 715 | * This type of test is characterized by HLLs being unioned into an 716 | * accumulator HLL whose serialized representation (after the HLL is 717 | * union'd) is printed to each line along with the cardinalities and the 718 | * serialized representation of the HLL union'd in. 719 | */ 720 | UNION; 721 | } 722 | } 723 | -------------------------------------------------------------------------------- /src/test/java/net/agkn/hll/ProbabilisticTestUtil.java: -------------------------------------------------------------------------------- 1 | package net.agkn.hll; 2 | 3 | /* 4 | * Copyright 2013 Aggregate Knowledge, Inc. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | import net.agkn.hll.util.BitUtil; 20 | 21 | /** 22 | * A collection of test utilities for constructing input values to HLLs and for 23 | * computing their serialized size. 24 | * 25 | * @author timon 26 | */ 27 | public class ProbabilisticTestUtil { 28 | /** 29 | * Constructs a value that when added raw to a HLL will set the register at 30 | * registerIndex to registerValue. 31 | * 32 | * @param log2m the log-base-2 of the number of registers in the HLL 33 | * @param registerIndex the index of the register to set 34 | * @param registerValue the value to set the register to 35 | * @return the value 36 | */ 37 | public static long constructHLLValue(final int log2m, final int registerIndex, final int registerValue) { 38 | final long partition = registerIndex; 39 | final long substreamValue = (1L << (registerValue - 1)); 40 | return (substreamValue << log2m) | partition; 41 | } 42 | 43 | /** 44 | * Extracts the HLL register index from a raw value. 45 | */ 46 | public static short getRegisterIndex(final long rawValue, final int log2m) { 47 | final long mBitsMask = (1 << log2m) - 1; 48 | final short j = (short)(rawValue & mBitsMask); 49 | return j; 50 | } 51 | 52 | /** 53 | * Extracts the HLL register value from a raw value. 54 | */ 55 | public static byte getRegisterValue(final long rawValue, final int log2m) { 56 | final long substreamValue = (rawValue >>> log2m); 57 | final byte p_w; 58 | 59 | if (substreamValue == 0L) { 60 | // The paper does not cover p(0x0), so the special value 0 is used. 61 | // 0 is the original initialization value of the registers, so by 62 | // doing this the HLL simply ignores it. This is acceptable 63 | // because the probability is 1/(2^(2^registerSizeInBits)). 64 | p_w = 0; 65 | } else { 66 | p_w = (byte)Math.min(1 + BitUtil.leastSignificantBit(substreamValue), 31); 67 | } 68 | 69 | return p_w; 70 | } 71 | 72 | /** 73 | * @return the number of bytes required to pack registerCount 74 | * registers of width shortWordLength. 75 | */ 76 | public static int getRequiredBytes(final int shortWordLength, final int registerCount) { 77 | return (int)Math.ceil((registerCount * shortWordLength)/(float)8); 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/test/java/net/agkn/hll/SparseHLLTest.java: -------------------------------------------------------------------------------- 1 | package net.agkn.hll; 2 | 3 | /* 4 | * Copyright 2013 Aggregate Knowledge, Inc. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | import static org.powermock.reflect.Whitebox.getInternalState; 20 | import static org.testng.Assert.assertEquals; 21 | import static org.testng.Assert.assertTrue; 22 | import it.unimi.dsi.fastutil.ints.Int2ByteOpenHashMap; 23 | import java.util.Random; 24 | 25 | import net.agkn.hll.serialization.ISchemaVersion; 26 | import net.agkn.hll.serialization.SerializationUtil; 27 | import net.agkn.hll.util.HLLUtil; 28 | 29 | import org.testng.annotations.Test; 30 | 31 | /** 32 | * Tests {@link HLL} of type {@link HLLType#SPARSE}. 33 | * 34 | * @author timon 35 | */ 36 | public class SparseHLLTest { 37 | private static final int log2m = 11; 38 | 39 | /** 40 | * Tests {@link HLL#addRaw(long)}. 41 | */ 42 | @Test 43 | public void addTest() { 44 | { // insert an element with register value 1 (minimum set value) 45 | final int registerIndex = 0; 46 | final int registerValue = 1; 47 | final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue); 48 | 49 | final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE); 50 | hll.addRaw(rawValue); 51 | 52 | assertOneRegisterSet(hll, registerIndex, (byte)registerValue); 53 | } 54 | { // insert an element with register value 31 (maximum set value) 55 | final int registerIndex = 0; 56 | final int registerValue = 31; 57 | final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue); 58 | 59 | final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE); 60 | hll.addRaw(rawValue); 61 | 62 | assertOneRegisterSet(hll, registerIndex, (byte)registerValue); 63 | } 64 | { // insert an element that could overflow the register (past 31) 65 | final int registerIndex = 0; 66 | final int registerValue = 36; 67 | final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue); 68 | 69 | final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE); 70 | hll.addRaw(rawValue); 71 | 72 | assertOneRegisterSet(hll, (short)registerIndex, (byte)31/*register max*/); 73 | } 74 | { // insert duplicate elements, observe no change 75 | final int registerIndex = 0; 76 | final int registerValue = 1; 77 | final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue); 78 | 79 | final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE); 80 | hll.addRaw(rawValue); 81 | hll.addRaw(rawValue); 82 | 83 | assertOneRegisterSet(hll, registerIndex, (byte)registerValue); 84 | } 85 | { // insert elements that increase a register's value 86 | final int registerIndex = 0; 87 | final int registerValue = 1; 88 | final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue); 89 | 90 | final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE); 91 | hll.addRaw(rawValue); 92 | 93 | final int registerValue2 = 2; 94 | final long rawValue2 = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue2); 95 | hll.addRaw(rawValue2); 96 | 97 | assertOneRegisterSet(hll, registerIndex, (byte)registerValue2); 98 | } 99 | { // insert elements that have lower register values, observe no change 100 | final int registerIndex = 0; 101 | final int registerValue = 2; 102 | final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue); 103 | 104 | final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE); 105 | hll.addRaw(rawValue); 106 | 107 | final int registerValue2 = 1; 108 | final long rawValue2 = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue2); 109 | hll.addRaw(rawValue2); 110 | 111 | assertOneRegisterSet(hll, registerIndex, (byte)registerValue); 112 | } 113 | } 114 | 115 | /** 116 | * Smoke test for {@link HLL#cardinality()} and the proper use of the small 117 | * range correction. 118 | */ 119 | @Test 120 | public void smallRangeSmokeTest() { 121 | final int log2m = 11; 122 | final int m = (1 << log2m); 123 | final int regwidth = 5; 124 | 125 | // only one register set 126 | { 127 | final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE); 128 | hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 0, 1)); 129 | 130 | final long cardinality = hll.cardinality(); 131 | 132 | // Trivially true that small correction conditions hold: one register 133 | // set implies zeroes exist, and estimator trivially smaller than 5m/2. 134 | // Small range correction: m * log(m/V) 135 | final long expected = (long)Math.ceil(m * Math.log((double)m / (m - 1)/*# of zeroes*/)); 136 | assertEquals(cardinality, expected); 137 | } 138 | 139 | // all but one register set 140 | { 141 | final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE); 142 | for(int i=0; i<(m - 1); i++) { 143 | hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, i, 1)); 144 | } 145 | 146 | // Trivially true that small correction conditions hold: all but 147 | // one register set implies a zero exists, and estimator trivially 148 | // smaller than 5m/2 since it's alpha / ((m-1)/2) 149 | final long cardinality = hll.cardinality(); 150 | 151 | // Small range correction: m * log(m/V) 152 | final long expected = (long)Math.ceil(m * Math.log((double)m / 1/*# of zeroes*/)); 153 | assertEquals(cardinality, expected); 154 | } 155 | } 156 | 157 | /** 158 | * Smoke test for {@link HLL#cardinality()} and the proper use of the 159 | * uncorrected estimator. 160 | */ 161 | @Test 162 | public void normalRangeSmokeTest() { 163 | final int log2m = 11; 164 | final int m = (1 << log2m); 165 | final int regwidth = 5; 166 | // regwidth = 5, so hash space is 167 | // log2m + (2^5 - 1 - 1), so L = log2m + 30 168 | final int l = log2m + 30; 169 | 170 | // all registers at 'medium' value 171 | { 172 | final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, m/*sparseThreshold*/, HLLType.SPARSE); 173 | 174 | final int registerValue = 7/*chosen to ensure neither correction kicks in*/; 175 | for(int i=0; i (5 * m /(double)2)); 187 | 188 | final long expected = (long)Math.ceil(estimator); 189 | assertEquals(cardinality, expected); 190 | } 191 | } 192 | 193 | /** 194 | * Smoke test for {@link HLL#cardinality()} and the proper use of the large 195 | * range correction. 196 | */ 197 | @Test 198 | public void largeRangeSmokeTest() { 199 | final int log2m = 11; 200 | final int m = (1 << log2m); 201 | final int regwidth = 5; 202 | // regwidth = 5, so hash space is 203 | // log2m + (2^5 - 1 - 1), so L = log2m + 30 204 | final int l = log2m + 30; 205 | 206 | // all registers at large value 207 | { 208 | final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, m/*sparseThreshold*/, HLLType.SPARSE); 209 | 210 | final int registerValue = 31/*chosen to ensure large correction kicks in*/; 211 | for(int i=0; i Math.pow(2, l)/30); 223 | 224 | // Large range correction: -2^32 * log(1 - E/2^32) 225 | final long expected = (long)Math.ceil(-1.0 * Math.pow(2, l) * Math.log(1.0 - estimator/Math.pow(2, l))); 226 | assertEquals(cardinality, expected); 227 | } 228 | } 229 | 230 | /** 231 | * Tests {@link HLL#union(HLL)}. 232 | */ 233 | @Test 234 | public void unionTest() { 235 | final int log2m = 11/*arbitrary*/; 236 | final int sparseThreshold = 256/*arbitrary*/; 237 | 238 | { // two empty multisets should union to an empty set 239 | final HLL hllA = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE); 240 | final HLL hllB = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE); 241 | 242 | hllA.union(hllB); 243 | 244 | assertEquals(hllA.getType(), HLLType.SPARSE/*unchanged*/); 245 | assertEquals(hllA.cardinality(), 0L); 246 | } 247 | { // two disjoint multisets should union properly 248 | final HLL hllA = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE); 249 | hllA.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 1, 1)); 250 | final HLL hllB = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE); 251 | hllB.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 2, 1)); 252 | 253 | 254 | hllA.union(hllB); 255 | 256 | assertEquals(hllA.getType(), HLLType.SPARSE/*unchanged*/); 257 | assertEquals(hllA.cardinality(), 3L/*precomputed*/); 258 | assertRegisterPresent(hllA, 1, (byte)1); 259 | assertRegisterPresent(hllA, 2, (byte)1); 260 | } 261 | { // two exactly overlapping multisets should union properly 262 | final HLL hllA = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE); 263 | hllA.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 1, 10)); 264 | final HLL hllB = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE); 265 | hllB.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 1, 13)); 266 | 267 | hllA.union(hllB); 268 | 269 | assertEquals(hllA.getType(), HLLType.SPARSE/*unchanged*/); 270 | assertEquals(hllA.cardinality(), 2L/*precomputed*/); 271 | assertOneRegisterSet(hllA, 1, (byte)13/*max(10,13)*/); 272 | } 273 | { // overlapping multisets should union properly 274 | final HLL hllA = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE); 275 | final HLL hllB = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE); 276 | // register index = 3 277 | final long rawValueA = ProbabilisticTestUtil.constructHLLValue(log2m, 3, 11); 278 | 279 | // register index = 4 280 | final long rawValueB = ProbabilisticTestUtil.constructHLLValue(log2m, 4, 13); 281 | final long rawValueBPrime = ProbabilisticTestUtil.constructHLLValue(log2m, 4, 21); 282 | 283 | // register index = 5 284 | final long rawValueC = ProbabilisticTestUtil.constructHLLValue(log2m, 5, 14); 285 | 286 | hllA.addRaw(rawValueA); 287 | hllA.addRaw(rawValueB); 288 | 289 | hllB.addRaw(rawValueBPrime); 290 | hllB.addRaw(rawValueC); 291 | 292 | hllA.union(hllB); 293 | // union should have three registers set, with partition B set to the 294 | // max of the two registers 295 | assertRegisterPresent(hllA, 3, (byte)11); 296 | assertRegisterPresent(hllA, 4, (byte)21/*max(21,13)*/); 297 | assertRegisterPresent(hllA, 5, (byte)14); 298 | } 299 | { // too-large unions should promote 300 | final HLL hllA = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE); 301 | final HLL hllB = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE); 302 | 303 | // fill up sets to maxCapacity 304 | for(int i=0; i 0x4F -> 79 178 | // 1100 0010 -> 0xC2 -> -62 179 | 180 | final byte[] bytes = serializer.getBytes(); 181 | final byte[] expectedBytes = new byte[] { 79, -62 }; 182 | assertTrue(Arrays.equals(bytes, expectedBytes)); 183 | } 184 | {// Should work on a byte-divisible sequence, with no padding. 185 | final BigEndianAscendingWordSerializer serializer = 186 | new BigEndianAscendingWordSerializer(shortWordLength, 187 | 8/*wordCount*/, 188 | 0/*bytePadding, none*/); 189 | 190 | for(int i=1; i<9; i++) { 191 | serializer.writeWord(i); 192 | } 193 | 194 | // Values: 1-8 195 | // Corresponding bits: 196 | // ------------------ 197 | // 00001 198 | // 00010 199 | // 00011 200 | // 00100 201 | // 00101 202 | // 00110 203 | // 00111 204 | // 01000 205 | 206 | // And the hex: 207 | // ------------ 208 | // 0000 1000 => 0x08 => 8 209 | // 1000 0110 => 0x86 => -122 210 | // 0100 0010 => 0x62 => 66 211 | // 1001 1000 => 0x98 => -104 212 | // 1110 1000 => 0xE8 => -24 213 | 214 | final byte[] bytes = serializer.getBytes(); 215 | final byte[] expectedBytes = new byte[] { 8, -122, 66, -104, -24 }; 216 | assertTrue(Arrays.equals(bytes, expectedBytes)); 217 | } 218 | {// Should pad the array correctly. 219 | final BigEndianAscendingWordSerializer serializer = 220 | new BigEndianAscendingWordSerializer(shortWordLength, 221 | 1/*wordCount*/, 222 | 1/*bytePadding*/); 223 | 224 | serializer.writeWord(1); 225 | // 1 byte leading padding | value 1 | trailing padding 226 | // 0000 0000 | 0000 1|000 227 | final byte[] bytes = serializer.getBytes(); 228 | final byte[] expectedBytes = new byte[] { 0, 8 }; 229 | assertTrue(Arrays.equals(bytes, expectedBytes)); 230 | } 231 | } 232 | 233 | /** 234 | * Smoke test for typical parameters used in practice. 235 | */ 236 | @Test 237 | public void smokeTestSparseParams() { 238 | // XXX: revisit 239 | final int shortWordLength = 17; 240 | {// Should work on an empty sequence, with no padding. 241 | final BigEndianAscendingWordSerializer serializer = 242 | new BigEndianAscendingWordSerializer(shortWordLength, 243 | 0/*wordCount*/, 244 | 0/*bytePadding, none*/); 245 | 246 | assert(Arrays.equals(serializer.getBytes(), new byte[0])); 247 | } 248 | {// Should work on a non-byte-divisible sequence, with no padding. 249 | final BigEndianAscendingWordSerializer serializer = 250 | new BigEndianAscendingWordSerializer(shortWordLength, 251 | 3/*wordCount*/, 252 | 0/*bytePadding, none*/); 253 | 254 | serializer.writeWord(9); 255 | serializer.writeWord(42); 256 | serializer.writeWord(75); 257 | 258 | // The values: 259 | // ----------- 260 | // 9 |42 |75 |padding 261 | 262 | // Corresponding bits: 263 | // ------------------ 264 | // 0000 0000 0000 0100 1|000 0000 0000 1010 10|00 0000 0000 1001 011|0 0000 265 | 266 | // And the hex/decimal (remember Java bytes are signed): 267 | // ----------------------------------------------------- 268 | // 0000 0000 -> 0x00 -> 0 269 | // 0000 0100 -> 0x04 -> 4 270 | // 1000 0000 -> 0x80 -> -128 271 | // 0000 1010 -> 0x0A -> 10 272 | // 1000 0000 -> 0x80 -> -128 273 | // 0000 1001 -> 0x09 -> 9 274 | // 0110 0000 -> 0x60 -> 96 275 | 276 | final byte[] bytes = serializer.getBytes(); 277 | final byte[] expectedBytes = new byte[] { 0, 4, -128, 10, -128, 9, 96 }; 278 | assertTrue(Arrays.equals(bytes, expectedBytes)); 279 | } 280 | {// Should work on a byte-divisible sequence, with no padding. 281 | final BigEndianAscendingWordSerializer serializer = 282 | new BigEndianAscendingWordSerializer(shortWordLength, 283 | 8/*wordCount*/, 284 | 0/*bytePadding, none*/); 285 | 286 | for(int i=1; i<9; i++) { 287 | serializer.writeWord(i); 288 | } 289 | 290 | // Values: 1-8 291 | // Corresponding bits: 292 | // ------------------ 293 | // 0000 0000 0000 0000 1 294 | // 000 0000 0000 0000 10 295 | // 00 0000 0000 0000 011 296 | // 0 0000 0000 0000 0100 297 | 298 | // 0000 0000 0000 0010 1 299 | // 000 0000 0000 0001 10 300 | // 00 0000 0000 0000 111 301 | // 0 0000 0000 0000 1000 302 | 303 | // And the hex: 304 | // ------------ 305 | // 0000 0000 -> 0x00 -> 0 306 | // 0000 0000 -> 0x00 -> 0 307 | // 1000 0000 -> 0x80 -> -128 308 | // 0000 0000 -> 0x00 -> 0 309 | // 1000 0000 -> 0x80 -> -128 310 | // 0000 0000 -> 0x00 -> 0 311 | // 0110 0000 -> 0x60 -> 96 312 | // 0000 0000 -> 0x00 -> 0 313 | // 0100 0000 -> 0x40 -> 64 314 | // 0000 0000 -> 0x00 -> 0 315 | // 0010 1000 -> 0x28 -> 40 316 | // 0000 0000 -> 0x00 -> 0 317 | // 0001 1000 -> 0x18 -> 24 318 | // 0000 0000 -> 0x00 -> 0 319 | // 0000 1110 -> 0x0D -> 14 320 | // 0000 0000 -> 0x00 -> 0 321 | // 0000 1000 -> 0x08 -> 8 322 | 323 | final byte[] bytes = serializer.getBytes(); 324 | final byte[] expectedBytes = new byte[] { 0, 0, -128, 0, -128, 0, 96, 0, 64, 0, 40, 0, 24, 0, 14, 0, 8 }; 325 | assertTrue(Arrays.equals(bytes, expectedBytes)); 326 | } 327 | {// Should pad the array correctly. 328 | final BigEndianAscendingWordSerializer serializer = 329 | new BigEndianAscendingWordSerializer(shortWordLength, 330 | 1/*wordCount*/, 331 | 1/*bytePadding*/); 332 | 333 | serializer.writeWord(1); 334 | // 1 byte leading padding | value 1 | trailing padding 335 | // 0000 0000 | 0000 0000 0000 0000 1|000 0000 336 | // 0x00 0x00 0x00 0x80 337 | final byte[] bytes = serializer.getBytes(); 338 | final byte[] expectedBytes = new byte[] { 0, 0, 0, -128 }; 339 | assertTrue(Arrays.equals(bytes, expectedBytes)); 340 | } 341 | } 342 | } 343 | -------------------------------------------------------------------------------- /src/test/java/net/agkn/hll/serialization/HLLSerializationTest.java: -------------------------------------------------------------------------------- 1 | package net.agkn.hll.serialization; 2 | 3 | import net.agkn.hll.HLL; 4 | import net.agkn.hll.HLLType; 5 | import org.testng.annotations.Test; 6 | 7 | import java.util.ArrayList; 8 | import java.util.Collection; 9 | import java.util.List; 10 | import java.util.Random; 11 | 12 | import static net.agkn.hll.HLL.MAXIMUM_EXPTHRESH_PARAM; 13 | import static net.agkn.hll.HLL.MAXIMUM_REGWIDTH_PARAM; 14 | import static net.agkn.hll.HLL.MINIMUM_EXPTHRESH_PARAM; 15 | import static net.agkn.hll.HLL.MINIMUM_LOG2M_PARAM; 16 | import static net.agkn.hll.HLL.MINIMUM_REGWIDTH_PARAM; 17 | import static org.testng.Assert.assertEquals; 18 | 19 | /** 20 | * Serialization smoke-tests. 21 | * 22 | * @author yerenkow 23 | * @author benl 24 | */ 25 | public class HLLSerializationTest { 26 | // A fixed random seed so that this test is reproducible. 27 | private static final long RANDOM_SEED = 1L; 28 | 29 | /** 30 | * A smoke-test that covers serialization/deserialization of an HLL 31 | * under all possible parameters. 32 | */ 33 | @Test 34 | public void serializationSmokeTest() throws Exception { 35 | final Random random = new Random(RANDOM_SEED); 36 | final int randomCount = 250; 37 | final List randoms = new ArrayList(randomCount){{ 38 | for (int i=0; i items) 56 | throws CloneNotSupportedException { 57 | for(int log2m=MINIMUM_LOG2M_PARAM; log2m<=16; log2m++) { 58 | for(int regw=MINIMUM_REGWIDTH_PARAM; regw<=MAXIMUM_REGWIDTH_PARAM; regw++) { 59 | for(int expthr=MINIMUM_EXPTHRESH_PARAM; expthr<=MAXIMUM_EXPTHRESH_PARAM; expthr++ ) { 60 | for(final boolean sparse: new boolean[]{true, false}) { 61 | HLL hll = new HLL(log2m, regw, expthr, sparse, hllType); 62 | for(final Long item: items) { 63 | hll.addRaw(item); 64 | } 65 | HLL copy = HLL.fromBytes(hll.toBytes()); 66 | assertEquals(copy.cardinality(), hll.cardinality()); 67 | assertEquals(copy.getType(), hll.getType()); 68 | assertEquals(copy.toBytes(), hll.toBytes()); 69 | 70 | HLL clone = hll.clone(); 71 | assertEquals(clone.cardinality(), hll.cardinality()); 72 | assertEquals(clone.getType(), hll.getType()); 73 | assertEquals(clone.toBytes(), hll.toBytes()); 74 | } 75 | } 76 | } 77 | } 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/test/java/net/agkn/hll/util/BitVectorTest.java: -------------------------------------------------------------------------------- 1 | package net.agkn.hll.util; 2 | 3 | /* 4 | * Copyright 2013 Aggregate Knowledge, Inc. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | import static org.testng.Assert.assertEquals; 20 | import static org.testng.Assert.assertFalse; 21 | import static org.testng.Assert.assertTrue; 22 | 23 | import org.testng.annotations.Test; 24 | 25 | /** 26 | * Unit tests for {@link BitVector}. 27 | * 28 | * @author rgrzywinski 29 | * @author timon 30 | */ 31 | public class BitVectorTest { 32 | /** 33 | * Tests {@link BitVector#getRegister(long)} and {@link BitVector#setRegister(long, long)}. 34 | */ 35 | @Test 36 | public void getSetRegisterTest() { 37 | { // locally scoped for sanity 38 | // NOTE: registers are only 5bits wide 39 | final BitVector vector1 = new BitVector(5/*width*/, 128/*count, 2^7*/); 40 | final BitVector vector2 = new BitVector(5/*width*/, 128/*count, 2^7*/); 41 | final BitVector vector3 = new BitVector(5/*width*/, 128/*count, 2^7*/); 42 | final BitVector vector4 = new BitVector(5/*width*/, 128/*count, 2^7*/); 43 | 44 | for(int i=0; i<128/*2^7*/; i++) { 45 | vector1.setRegister(i, 0x1F); 46 | vector2.setRegister(i, (i & 0x1F)); 47 | vector3.setRegister(i, ((127 - i) & 0x1F)); 48 | vector4.setRegister(i, 0x15); 49 | } 50 | 51 | for(int i=0; i<128/*2^7*/; i++) { 52 | assertEquals(vector1.getRegister(i), 0x1F); 53 | assertEquals(vector2.getRegister(i), (i & 0x1F)); 54 | assertEquals(vector3.getRegister(i), ((127 - i) & 0x1F)); 55 | assertEquals(vector4.getRegister(i), 0x15); 56 | } 57 | } 58 | } 59 | 60 | // ======================================================================== 61 | /** 62 | * Tests {@link BitVector#registerIterator()} 63 | */ 64 | @Test 65 | public void registerIteratorTest() { 66 | { // scoped locally for sanity 67 | // NOTE: registers are only 5bits wide 68 | final BitVector vector1 = new BitVector(5/*width*/, 128/*count, 2^7*/); 69 | final BitVector vector2 = new BitVector(5/*width*/, 128/*count, 2^7*/); 70 | final BitVector vector3 = new BitVector(5/*width*/, 128/*count, 2^7*/); 71 | final BitVector vector4 = new BitVector(5/*width*/, 128/*count, 2^7*/); 72 | 73 | for(int i=0; i<128/*2^7*/; i++) { 74 | vector1.setRegister(i, 0x1F); 75 | vector2.setRegister(i, (i & 0x1F)); 76 | vector3.setRegister(i, ((127 - i) & 0x1F)); 77 | vector4.setRegister(i, 0x15); 78 | } 79 | 80 | final LongIterator registerIterator1 = vector1.registerIterator(); 81 | final LongIterator registerIterator2 = vector2.registerIterator(); 82 | final LongIterator registerIterator3 = vector3.registerIterator(); 83 | final LongIterator registerIterator4 = vector4.registerIterator(); 84 | for(int i=0; i<128/*2^7*/; i++) { 85 | assertEquals(registerIterator1.hasNext(), true); 86 | assertEquals(registerIterator2.hasNext(), true); 87 | assertEquals(registerIterator3.hasNext(), true); 88 | assertEquals(registerIterator4.hasNext(), true); 89 | 90 | assertEquals(registerIterator1.next(), 0x1F); 91 | assertEquals(registerIterator2.next(), (i & 0x1F)); 92 | assertEquals(registerIterator3.next(), ((127 - i) & 0x1F)); 93 | assertEquals(registerIterator4.next(), 0x15); 94 | } 95 | assertEquals(registerIterator1.hasNext(), false/*no more*/); 96 | assertEquals(registerIterator2.hasNext(), false/*no more*/); 97 | assertEquals(registerIterator3.hasNext(), false/*no more*/); 98 | assertEquals(registerIterator4.hasNext(), false/*no more*/); 99 | } 100 | 101 | { // scoped locally for sanity 102 | // Vectors that are shorter than one word 103 | assertIterator(1, 12/* 1*12=12 bits, fewer than a single word */); 104 | assertIterator(2, 12/* 2*12=24 bits, fewer than a single word */); 105 | assertIterator(3, 12/* 3*12=36 bits, fewer than a single word */); 106 | assertIterator(4, 12/* 4*12=48 bits, fewer than a single word */); 107 | 108 | // Vectors that don't fit exactly into longs 109 | assertIterator(5, 16/* 5*16=80 bits */); 110 | assertIterator(5, 32/* 5*32=160 bits */); 111 | } 112 | 113 | // Iterate over vectors that are padded 114 | } 115 | 116 | private static void assertIterator(final int width, final int count) { 117 | final BitVector vector = new BitVector(width, count); 118 | final LongIterator iter = vector.registerIterator(); 119 | 120 | for(int i=0; i