├── LICENSE
├── README.markdown
├── RELEASE_NOTES.markdown
├── pom.xml
└── src
    ├── main
        └── java
        │   └── net
        │       └── agkn
        │           └── hll
        │               ├── HLL.java
        │               ├── HLLType.java
        │               ├── serialization
        │                   ├── BigEndianAscendingWordDeserializer.java
        │                   ├── BigEndianAscendingWordSerializer.java
        │                   ├── HLLMetadata.java
        │                   ├── IHLLMetadata.java
        │                   ├── ISchemaVersion.java
        │                   ├── IWordDeserializer.java
        │                   ├── IWordSerializer.java
        │                   ├── SchemaVersionOne.java
        │                   └── SerializationUtil.java
        │               └── util
        │                   ├── BitUtil.java
        │                   ├── BitVector.java
        │                   ├── HLLUtil.java
        │                   ├── LongIterator.java
        │                   └── NumberUtil.java
    └── test
        └── java
            └── net
                └── agkn
                    └── hll
                        ├── ExplicitHLLTest.java
                        ├── FullHLLTest.java
                        ├── IntegrationTestGenerator.java
                        ├── ProbabilisticTestUtil.java
                        ├── SparseHLLTest.java
                        ├── serialization
                            ├── BigEndianAscendingWordDeserializerTest.java
                            ├── BigEndianAscendingWordSerializerTest.java
                            └── HLLSerializationTest.java
                        └── util
                            ├── BitVectorTest.java
                            └── HLLUtilTest.java


/LICENSE:
--------------------------------------------------------------------------------
 1 | Apache License
 2 | 
 3 | Version 2.0, January 2004
 4 | 
 5 | http://www.apache.org/licenses/
 6 | 
 7 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
 8 | 
 9 | 1. Definitions.
10 | 
11 | "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.
12 | 
13 | "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.
14 | 
15 | "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.
16 | 
17 | "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License.
18 | 
19 | "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files.
20 | 
21 | "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.
22 | 
23 | "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below).
24 | 
25 | "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof.
26 | 
27 | "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."
28 | 
29 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work.
30 | 
31 | 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.
32 | 
33 | 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed.
34 | 
35 | 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:
36 | 
37 | You must give any other recipients of the Work or Derivative Works a copy of this License; and
38 | 
39 | You must cause any modified files to carry prominent notices stating that You changed the files; and
40 | 
41 | You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and
42 | 
43 | If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License.
44 | 
45 | 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions.
46 | 
47 | 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file.
48 | 
49 | 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License.
50 | 
51 | 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.
52 | 
53 | 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.
54 | 
55 | END OF TERMS AND CONDITIONS
56 | 
57 | APPENDIX: How to apply the Apache License to your work
58 | To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives.
59 | 
60 |    Copyright 2013 Aggregate Knowledge, Inc.
61 | 
62 |    Licensed under the Apache License, Version 2.0 (the "License");
63 |    you may not use this file except in compliance with the License.
64 |    You may obtain a copy of the License at
65 | 
66 |        http://www.apache.org/licenses/LICENSE-2.0
67 | 
68 |    Unless required by applicable law or agreed to in writing, software
69 |    distributed under the License is distributed on an "AS IS" BASIS,
70 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
71 |    See the License for the specific language governing permissions and
72 |    limitations under the License.


--------------------------------------------------------------------------------
/README.markdown:
--------------------------------------------------------------------------------
  1 | java-hll
  2 | ========
  3 | 
  4 | A Java implementation of [HyperLogLog](http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf) whose goal is to be [storage-compatible](https://github.com/aggregateknowledge/hll-storage-spec) with other similar offerings from [Aggregate Knowledge](http://blog.aggregateknowledge.com/).
  5 | 
  6 | 
  7 | **NOTE:** This implementation fully implements reading and writing all formats in the [v1.0.0 storage specification](https://github.com/aggregateknowledge/hll-storage-spec/blob/v1.0.0/STORAGE.md), but internal memory representation (and hence space-tradeoffs) may cause automatic "promotion" between representations to occur at different implementation-dependent points. To ensure interoperability between, for example, the [PostgreSQL implementation](https://github.com/aggregateknowledge/postgresql-hll) and this library, all promotion cutoffs should be explicitly defined.
  8 | 
  9 | Similarly, certain parameters have different bounds in order to deal with VM limitations like maximum array length. Specifically, `log2m` has a maximum value of 30 in this implementation whereas the storage specification states a maximum value of 31 (which can be realized in the PostgreSQL implementation).
 10 | 
 11 | Overview
 12 | --------
 13 | 
 14 | HyperLogLog (HLL) is a fixed-size, set-like structure used for distinct value counting with tunable precision. For example, in 1280 bytes HLL can estimate the count of tens of billions of distinct values with only a few percent error.
 15 | 
 16 | In addition to the algorithm proposed in the [original paper](http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf), this implementation is augmented to improve its accuracy and memory use without sacrificing much speed. See below for more details.
 17 | 
 18 | Algorithms
 19 | ----------
 20 | 
 21 | A `hll` is a combination of different set/distinct-value-counting algorithms that can be thought of as a hierarchy, along with rules for moving up that hierarchy. In order to distinguish between said algorithms, we have given them names:
 22 | 
 23 | ### `EMPTY` ###
 24 | A constant value that denotes the empty set.
 25 | 
 26 | ### `EXPLICIT` ###
 27 | An explicit, unique, sorted list of integers in the set, which is maintained up to a fixed cardinality.
 28 | 
 29 | ### `SPARSE` ###
 30 | A 'lazy', map-based implementation of HyperLogLog, a probabilistic set data structure. Only stores the indices and values of non-zero registers in a map, until the number of non-zero registers exceeds a fixed cardinality.
 31 | 
 32 | ### `FULL` ###
 33 | A fully-materialized, list-based implementation of HyperLogLog. Explicitly stores the value of every register in a list ordered by register index.
 34 | 
 35 | Motivation
 36 | ----------
 37 | 
 38 | Our motivation for augmenting the original HLL algorithm went something like this:
 39 | 
 40 | * Naively, a HLL takes `regwidth * 2^log2m` bits to store.
 41 | * In typical usage, `log2m = 11` and `regwidth = 5`, it requires 10,240 bits or 1,280 bytes.
 42 | * That's a lot of bytes!
 43 | 
 44 | The first addition to the original HLL algorithm came from realizing that 1,280 bytes is the size of 160 64-bit integers. So, if we wanted more accuracy at low cardinalities, we could just keep an explicit set of the inputs as a sorted list of 64-bit integers until we hit the 161st distinct value. This would give us the true representation of the distinct values in the stream while requiring the same amount of memory. (This is the `EXPLICIT` algorithm.)
 45 | 
 46 | The second came from the realization that we didn't need to store registers whose value was zero. We could simply represent the set of registers that had non-zero values as a map from index to values. This is map is stored as a list of index-value pairs that are bit-packed "short words" of length `log2m + regwidth`. (This is the `SPARSE` algorithm.)
 47 | 
 48 | Combining these two augmentations, we get a "promotion hierarchy" that allows the algorithm to be tuned for better accuracy, memory, or performance.
 49 | 
 50 | Initializing and storing a new `hll` object will simply allocate a small sentinel value symbolizing the empty set (`EMPTY`). When you add the first few values, a sorted list of unique integers is stored in an `EXPLICIT` set. When you wish to cease trading off accuracy for memory, the values in the sorted list are "promoted" to a `SPARSE` map-based HyperLogLog structure. Finally, when there are enough registers, the map-based HLL will be converted to a bit-packed `FULL` HLL structure.
 51 | 
 52 | Empirically, the insertion rate of `EMPTY`, `EXPLICIT`, and `SPARSE` representations is measured in 200k/s - 300k/s range, while the throughput of the `FULL` representation is in the millions of inserts per second on relatively new hardware ('10 Xeon).
 53 | 
 54 | Naturally, the cardinality estimates of the `EMPTY` and `EXPLICIT` representations is exact, while the `SPARSE` and `FULL` representations' accuracies are governed by the guarantees provided by the original HLL algorithm.
 55 | 
 56 | * * * * * * * * * * * * * * * * * * * * * * * * *
 57 | 
 58 | 
 59 | The Importance of Hashing
 60 | =========================
 61 | 
 62 | In brief, it is absolutely crucial to hash inputs to an HLL. A close approximation of uniform randomness in the inputs ensures that the error guarantees laid out in the original paper hold. We've empirically determined that [MurmurHash 3](http://guava-libraries.googlecode.com/git/guava/src/com/google/common/hash/Murmur3_128HashFunction.java), from Google's Guava, is an excellent and fast hash function to use in conjunction with `java-hll` module.
 63 | 
 64 | The seed to the hash call must remain constant for all inputs to a given HLL.  Similarly, if one plans to compute the union of two HLLs, the input values must have been hashed using the same seed.
 65 | 
 66 | For a good overview of the importance of hashing and hash functions when using probabilistic algorithms as well as an analysis of MurmurHash 3, refer to these blog posts:
 67 | 
 68 | * [K-Minimum Values: Sketching Error, Hash Functions, and You](http://blog.aggregateknowledge.com/2012/08/20/k-minimum-values-sketching-error-hash-functions-and-you/)
 69 | * [Choosing a Good Hash Function, Part 1](http://blog.aggregateknowledge.com/2011/12/05/choosing-a-good-hash-function-part-1/)
 70 | * [Choosing a Good Hash Function, Part 2](http://blog.aggregateknowledge.com/2011/12/29/choosing-a-good-hash-function-part-2/)
 71 | * [Choosing a Good Hash Function, Part 3](http://blog.aggregateknowledge.com/2012/02/02/choosing-a-good-hash-function-part-3/)
 72 | 
 73 | 
 74 | On Unions and Intersections
 75 | ===========================
 76 | 
 77 | HLLs have the useful property that the union of any number of HLLs is equal to the HLL that would have been populated by playing back all inputs to those '_n_' HLLs into a single HLL. Colloquially, one can say that HLLs have "lossless" unions because the same cardinality error guarantees that apply to a single HLL apply to a union of HLLs. See the `union()` function.
 78 | 
 79 | Using the [inclusion-exclusion principle](http://en.wikipedia.org/wiki/Inclusion%E2%80%93exclusion_principle) and the `union()` function, one can also estimate the intersection of sets represented by HLLs. Note, however, that error is proportional to the union of the two HLLs, while the result can be significantly smaller than the union, leading to disproportionately large error relative to the actual intersection cardinality. For instance, if one HLL has a cardinality of 1 billion, while the other has a cardinality of 10 million, with an overlap of 5 million, the intersection cardinality can easily be dwarfed by even a 1% error estimate in the larger HLLs cardinality.
 80 | 
 81 | For more information on HLL intersections, see [this blog post](http://blog.aggregateknowledge.com/2012/12/17/hll-intersections-2/).
 82 | 
 83 | Usage
 84 | =====
 85 | 
 86 | HLL is available in Maven Central. Include it in your project with:
 87 | 
 88 | ```xml
 89 | <dependency>
 90 |     <groupId>net.agkn</groupId>
 91 |     <artifactId>hll</artifactId>
 92 |     <version>1.6.0</version>
 93 | </dependency>
 94 | ```
 95 | 
 96 | 
 97 | Hashing and adding a value to a new HLL:
 98 | 
 99 | ```java
100 | final int seed = 123456;
101 | final Murmur3_128HashFunction hash = new Murmur3_128HashFunction(seed);
102 | final Hasher hasher = hash.newHasher();
103 | hasher.putLong(1L/*value to hash*/);
104 | 
105 | final long hashedValue = hasher.hash().asLong();
106 | 
107 | final HLL hll = new HLL(13/*log2m*/, 5/*registerWidth*/);
108 | hll.addRaw(hashedValue);
109 | ```
110 | 
111 | Retrieving the cardinality of an HLL:
112 | 
113 | ```java
114 | final long cardinality = hll.cardinality();
115 | ```
116 | 
117 | Unioning two HLLs together (and retrieving the resulting cardinality):
118 | 
119 | ```java
120 | final HLL hll1 = new HLL(13/*log2m*/, 5/*registerWidth*/);
121 | final HLL hll2 = new HLL(13/*log2m*/, 5/*registerWidth*/);
122 | 
123 | // ... (add values to both sets) ...
124 | 
125 | hll1.union(hll2)/*modifies hll1 to contain the union*/;
126 | final long cardinalityUnion = hll1.cardinality();
127 | ```
128 | 
129 | Reading an HLL from a hex representation of [storage specification, v1.0.0](https://github.com/aggregateknowledge/hll-storage-spec/blob/v1.0.0/STORAGE.md) (for example, retrieved from a [PostgreSQL database](https://github.com/aggregateknowledge/postgresql-hll)):
130 | 
131 | ```java
132 | final HLL hll = HLL.fromBytes(NumberUtil.fromHex(hexString));
133 | ```
134 | 
135 | Writing an HLL to its hex representation of [storage specification, v1.0.0](https://github.com/aggregateknowledge/hll-storage-spec/blob/v1.0.0/STORAGE.md) (for example, to be inserted into a [PostgreSQL database](https://github.com/aggregateknowledge/postgresql-hll)):
136 | 
137 | ```java
138 | final byte[] bytes = hll.toBytes();
139 | final String output = "\\x" + NumberUtil.toHex(bytes, 0, bytes.length)
140 | ```
141 | 
142 | * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
143 | 
144 | Building
145 | --------
146 | 
147 | *  Requires [Maven 2.0](http://maven.apache.org/)
148 | *  `mvn clean package` in the base directory
149 | 
150 |    A `target` directory will be created and a jar containing the library will be created therein.
151 | 
152 | 
153 | Testing
154 | -------
155 | 
156 | *  `mvn test` in the base directory.
157 | 


--------------------------------------------------------------------------------
/RELEASE_NOTES.markdown:
--------------------------------------------------------------------------------
 1 | v1.6.0 - Jul 29, 2014
 2 | ---------------------
 3 | * Added support for registering schema versions.
 4 | 
 5 | v1.5.2 - Jul 16, 2014
 6 | ---------------------
 7 | * Fixed #10: Long overflow bug in `TWO_TO_L` calculation when `regwidth = 6`.
 8 | 
 9 | v1.5.1 - Feb 26, 2014
10 | ---------------------
11 | * Fixed serialization compabitility issue. `expthresh` was not being decoded properly.
12 | 
13 | v1.5.0 - Feb 21, 2014
14 | ---------------------
15 | * Fixed #5: Added HLL#clone().
16 | 
17 | v1.4.0 - Feb 04, 2014
18 | ---------------------
19 | * Fixed #4: lowered JDK requirement to 1.6 from 1.7.
20 | 
21 | v1.3.0 - Jan 31, 2014
22 | ---------------------
23 | * Fixed #3: added new, simple HLL constructor.
24 | 
25 | v1.2.1 - Jan 31, 2014
26 | ---------------------
27 | * Fixed #2: fix HLL when `log2m * regwidth` is small.
28 | 
29 | v1.2.0 - Jan 17, 2014
30 | ---------------------
31 | * Reworked pom for Maven Central publishing, via Sonnatype.
32 | 
33 | v1.1.0 - Jan 10, 2014
34 | ---------------------
35 | * Documentation fixes.
36 | * Added parameter checking in HLL constructor.
37 | 
38 | v1.0.0 - Dec 22, 2013
39 | ---------------------
40 | * Initial public release.


--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
  1 | <project xmlns="http://maven.apache.org/POM/4.0.0"
  2 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  3 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
  4 |     <modelVersion>4.0.0</modelVersion>
  5 |     <groupId>net.agkn</groupId>
  6 |     <artifactId>hll</artifactId>
  7 |     <packaging>jar</packaging>
  8 |     <description>HyperLogLog: approximate distinct value counting algoritm</description>
  9 |     <url>https://github.com/aggregateknowledge/java-hll</url>
 10 |     <version>1.6.0</version>
 11 |     <name>HyperLogLog in Java</name>
 12 |     <licenses>
 13 |         <license>
 14 |             <name>The Apache Software License, Version 2.0</name>
 15 |             <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
 16 |             <distribution>repo</distribution>
 17 |         </license>
 18 |     </licenses>
 19 |     <scm>
 20 |         <connection>scm:git:git@github.com:aggregateknowledge/java-hll.git</connection>
 21 |         <url>scm:git:git@github.com:aggregateknowledge/java-hll.git</url>
 22 |         <developerConnection>scm:git:git@github.com:aggregateknowledge/java-hll.git</developerConnection>
 23 |     </scm>
 24 |     <developers>
 25 |         <developer>
 26 |             <id>timonk</id>
 27 |             <name>Timon Karnezos</name>
 28 |             <email>timon.karnezos@neustar.biz</email>
 29 |         </developer>
 30 |     </developers>
 31 | 
 32 |     <!-- ================================================================== -->
 33 |     <build>
 34 |         <finalName>${project.artifactId}-${project.version}</finalName>
 35 | 
 36 |         <plugins>
 37 |             <plugin>
 38 |                 <groupId>org.apache.maven.plugins</groupId>
 39 |                 <artifactId>maven-compiler-plugin</artifactId>
 40 |                 <version>3.1</version>
 41 |                 <configuration>
 42 |                     <source>1.6</source>
 43 |                     <target>1.6</target>
 44 |                 </configuration>
 45 |             </plugin>
 46 | 
 47 |             <plugin>
 48 |                 <groupId>org.apache.maven.plugins</groupId>
 49 |                 <artifactId>maven-source-plugin</artifactId>
 50 |                 <version>2.2.1</version>
 51 |                 <executions>
 52 |                     <execution>
 53 |                         <id>attach-sources</id>
 54 |                         <goals>
 55 |                             <goal>jar</goal>
 56 |                         </goals>
 57 |                     </execution>
 58 |                 </executions>
 59 |             </plugin>
 60 | 
 61 |             <plugin>
 62 |                 <groupId>org.apache.maven.plugins</groupId>
 63 |                 <artifactId>maven-javadoc-plugin</artifactId>
 64 |                 <version>2.9.1</version>
 65 |                 <executions>
 66 |                     <execution>
 67 |                         <id>attach-javadocs</id>
 68 |                         <goals>
 69 |                             <goal>jar</goal>
 70 |                         </goals>
 71 |                     </execution>
 72 |                 </executions>
 73 |             </plugin>
 74 | 
 75 |             <plugin>
 76 |                 <groupId>org.apache.maven.plugins</groupId>
 77 |                 <artifactId>maven-gpg-plugin</artifactId>
 78 |                 <executions>
 79 |                     <execution>
 80 |                         <id>sign-artifacts</id>
 81 |                         <phase>verify</phase>
 82 |                         <goals>
 83 |                             <goal>sign</goal>
 84 |                         </goals>
 85 |                     </execution>
 86 |                 </executions>
 87 |             </plugin>
 88 |             <!-- .......................................................... -->
 89 |             <plugin>
 90 |                 <groupId>org.apache.maven.plugins</groupId>
 91 |                 <artifactId>maven-surefire-plugin</artifactId>
 92 |                 <version>2.16</version>
 93 |                 <configuration/>
 94 |             </plugin>
 95 | 
 96 |         </plugins>
 97 |     </build>
 98 | 
 99 |     <profiles>
100 |       <profile>
101 |         <id>release-sign-artifacts</id>
102 |         <activation>
103 |           <property>
104 |             <name>performRelease</name>
105 |             <value>true</value>
106 |           </property>
107 |         </activation>
108 |         <build>
109 |           <plugins>
110 |             <plugin>
111 |               <groupId>org.apache.maven.plugins</groupId>
112 |               <artifactId>maven-gpg-plugin</artifactId>
113 |               <version>1.4</version>
114 |               <executions>
115 |                 <execution>
116 |                   <id>sign-artifacts</id>
117 |                   <phase>verify</phase>
118 |                   <goals>
119 |                     <goal>sign</goal>
120 |                   </goals>
121 |                 </execution>
122 |               </executions>
123 |             </plugin>
124 |           </plugins>
125 |         </build>
126 |       </profile>
127 |     </profiles>
128 | 
129 |     <distributionManagement>
130 |         <snapshotRepository>
131 |             <id>sonatype-nexus-snapshots</id>
132 |             <url>https://oss.sonatype.org/content/repositories/snapshots/</url>
133 |         </snapshotRepository>
134 |         <repository>
135 |             <id>sonatype-nexus-staging</id>
136 |             <url>https://oss.sonatype.org/service/local/staging/deploy/maven2/</url>
137 |         </repository>
138 |     </distributionManagement>
139 | 
140 |     <parent>
141 |         <groupId>org.sonatype.oss</groupId>
142 |         <artifactId>oss-parent</artifactId>
143 |         <version>7</version>
144 |     </parent>
145 | 
146 |     <!-- ================================================================== -->
147 |     <!-- NOTE:  all version numbers are defined in the properties section -->
148 |     <dependencies>
149 |         <!-- Production dependencies :: Default scope -->
150 |         <dependency>
151 |             <groupId>it.unimi.dsi</groupId>
152 |             <artifactId>fastutil</artifactId>
153 |             <version>${fastutil-version}</version>
154 |         </dependency>
155 | 
156 |         <!-- Test dependencies :: Test scope -->
157 |         <dependency>
158 |             <groupId>org.easymock</groupId>
159 |             <artifactId>easymock</artifactId>
160 |             <version>${easymock-version}</version>
161 |             <scope>test</scope>
162 |         </dependency>
163 |         <dependency>
164 |             <groupId>org.powermock</groupId>
165 |             <artifactId>powermock-module-junit4</artifactId>
166 |             <version>${powermock-version}</version>
167 |             <scope>test</scope>
168 |         </dependency>
169 |         <dependency>
170 |             <groupId>org.powermock</groupId>
171 |             <artifactId>powermock-api-easymock</artifactId>
172 |             <version>${powermock-version}</version>
173 |             <scope>test</scope>
174 |         </dependency>
175 |         <!-- NOTE:  the "jdk15" classifier is "JDK 1.5+" -->
176 |         <dependency>
177 |             <groupId>org.testng</groupId>
178 |             <artifactId>testng</artifactId>
179 |             <version>${testng-version}</version>
180 |             <scope>test</scope>
181 |             <classifier>jdk15</classifier>
182 |         </dependency>
183 |     </dependencies>
184 | 
185 |     <!-- ================================================================== -->
186 |     <properties>
187 |         <!-- Production scope -->
188 |         <commons-codec-version>1.8</commons-codec-version>
189 | 
190 |         <!-- Testing versions -->
191 |         <easymock-version>3.0</easymock-version>
192 |         <powermock-version>1.4.8</powermock-version>
193 |         <testng-version>5.7</testng-version>
194 |         <fastutil-version>6.5.11</fastutil-version>
195 |     </properties>
196 | </project>


--------------------------------------------------------------------------------
/src/main/java/net/agkn/hll/HLLType.java:
--------------------------------------------------------------------------------
 1 | package net.agkn.hll;
 2 | 
 3 | /*
 4 |  * Copyright 2013 Aggregate Knowledge, Inc.
 5 |  *
 6 |  * Licensed under the Apache License, Version 2.0 (the "License");
 7 |  * you may not use this file except in compliance with the License.
 8 |  * You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | 
19 | /**
20 |  * The types of algorithm/data structure that {@link HLL} can utilize. For more
21 |  * information, see the Javadoc for {@link HLL}.
22 |  */
23 | public enum HLLType {
24 |     EMPTY,
25 |     EXPLICIT,
26 |     SPARSE,
27 |     FULL,
28 |     UNDEFINED/*used by the PostgreSQL implementation to indicate legacy/corrupt/incompatible/unknown formats*/;
29 | }


--------------------------------------------------------------------------------
/src/main/java/net/agkn/hll/serialization/BigEndianAscendingWordDeserializer.java:
--------------------------------------------------------------------------------
  1 | package net.agkn.hll.serialization;
  2 | 
  3 | /*
  4 |  * Copyright 2013 Aggregate Knowledge, Inc.
  5 |  *
  6 |  * Licensed under the Apache License, Version 2.0 (the "License");
  7 |  * you may not use this file except in compliance with the License.
  8 |  * You may obtain a copy of the License at
  9 |  *
 10 |  *     http://www.apache.org/licenses/LICENSE-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS,
 14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 |  * See the License for the specific language governing permissions and
 16 |  * limitations under the License.
 17 |  */
 18 | 
 19 | /**
 20 |  * A corresponding deserializer for {@link BigEndianAscendingWordSerializer}.
 21 |  *
 22 |  * @author timon
 23 |  */
 24 | public class BigEndianAscendingWordDeserializer implements IWordDeserializer {
 25 |     // The number of bits per byte.
 26 |     private static final int BITS_PER_BYTE = 8;
 27 | 
 28 |     // long mask for the maximum value stored in a byte
 29 |     private static final long BYTE_MASK = (1L << BITS_PER_BYTE) - 1L;
 30 | 
 31 |     // ************************************************************************
 32 |     // The length in bits of the words to be read.
 33 |     private final int wordLength;
 34 | 
 35 |     // The byte array to which the words are serialized.
 36 |     private final byte[] bytes;
 37 | 
 38 |     // The number of leading padding bytes in 'bytes' to be ignored.
 39 |     private final int bytePadding;
 40 | 
 41 |     // The number of words that the byte array contains.
 42 |     private final int wordCount;
 43 | 
 44 |     // The current read state.
 45 |     private int currentWordIndex;
 46 | 
 47 |     // ========================================================================
 48 |     /**
 49 |      * @param wordLength the length in bits of the words to be deserialized. Must
 50 |      *        be less than or equal to 64 and greater than or equal to 1.
 51 |      * @param bytePadding the number of leading bytes that pad the serialized words.
 52 |      *        Must be greater than or equal to zero.
 53 |      * @param bytes the byte array containing the serialized words. Cannot be
 54 |      *        <code>null</code>.
 55 |      */
 56 |     public BigEndianAscendingWordDeserializer(final int wordLength, final int bytePadding, final byte[] bytes) {
 57 |         if((wordLength < 1) || (wordLength > 64)) {
 58 |             throw new IllegalArgumentException("Word length must be >= 1 and <= 64. (was: " + wordLength + ")");
 59 |         }
 60 | 
 61 |         if(bytePadding < 0) {
 62 |             throw new IllegalArgumentException("Byte padding must be >= zero. (was: " + bytePadding + ")");
 63 |         }
 64 | 
 65 |         this.wordLength = wordLength;
 66 |         this.bytes = bytes;
 67 |         this.bytePadding = bytePadding;
 68 | 
 69 |         final int dataBytes = (bytes.length - bytePadding);
 70 |         final long dataBits = (dataBytes * BITS_PER_BYTE);
 71 | 
 72 |         this.wordCount = (int)(dataBits/wordLength);
 73 | 
 74 |         currentWordIndex = 0;
 75 |     }
 76 | 
 77 |     // ========================================================================
 78 |     /* (non-Javadoc)
 79 |      * @see net.agkn.hll.serialization.IWordDeserializer#readWord()
 80 |      */
 81 |     @Override
 82 |     public long readWord() {
 83 |         final long word = readWord(currentWordIndex);
 84 |         currentWordIndex++;
 85 | 
 86 |         return word;
 87 |     }
 88 | 
 89 |     // ------------------------------------------------------------------------
 90 |     /**
 91 |      * Reads the word at the specified sequence position (zero-indexed).
 92 |      *
 93 |      * @param  position the zero-indexed position of the word to be read. This
 94 |      *         must be greater than or equal to zero.
 95 |      * @return the value of the serialized word at the specified position.
 96 |      */
 97 |     private long readWord(final int position) {
 98 |         if(position < 0) {
 99 |             throw new ArrayIndexOutOfBoundsException(position);
100 |         }
101 | 
102 |         // First bit of the word
103 |         final long firstBitIndex = (position * wordLength);
104 |         final int firstByteIndex = (bytePadding + (int)(firstBitIndex / BITS_PER_BYTE));
105 |         final int firstByteSkipBits = (int)(firstBitIndex % BITS_PER_BYTE);
106 | 
107 |         // Last bit of the word
108 |         final long lastBitIndex = (firstBitIndex + wordLength - 1);
109 |         final int lastByteIndex = (bytePadding + (int)(lastBitIndex / BITS_PER_BYTE));
110 |         final int lastByteBitsToConsume;
111 | 
112 |         final int bitsAfterByteBoundary = (int)((lastBitIndex + 1) % BITS_PER_BYTE);
113 |         // If the word terminates at the end of the last byte, consume the whole
114 |         // last byte.
115 |         if(bitsAfterByteBoundary == 0) {
116 |             lastByteBitsToConsume = BITS_PER_BYTE;
117 |         } else {
118 |             // Otherwise, only consume what is necessary.
119 |             lastByteBitsToConsume = bitsAfterByteBoundary;
120 |         }
121 | 
122 |         if(lastByteIndex >= bytes.length) {
123 |             throw new ArrayIndexOutOfBoundsException("Word out of bounds of backing array.");
124 |         }
125 | 
126 |         // Accumulator
127 |         long value = 0;
128 | 
129 |         // --------------------------------------------------------------------
130 |         // First byte
131 |         final int bitsRemainingInFirstByte = (BITS_PER_BYTE - firstByteSkipBits);
132 |         final int bitsToConsumeInFirstByte = Math.min(bitsRemainingInFirstByte, wordLength);
133 |         long firstByte = (long)bytes[firstByteIndex];
134 | 
135 |         // Mask off the bits to skip in the first byte.
136 |         final long firstByteMask = ((1L << bitsRemainingInFirstByte) - 1L);
137 |         firstByte &= firstByteMask;
138 |         // Right-align relevant bits of first byte.
139 |         firstByte >>>= (bitsRemainingInFirstByte - bitsToConsumeInFirstByte);
140 | 
141 |         value |= firstByte;
142 | 
143 |         // If the first byte contains the whole word, short-circuit.
144 |         if(firstByteIndex == lastByteIndex) {
145 |             return value;
146 |         }
147 | 
148 |         // --------------------------------------------------------------------
149 |         // Middle bytes
150 |         final int middleByteCount = (lastByteIndex - firstByteIndex - 1);
151 |         for(int i=0; i<middleByteCount; i++) {
152 |             final long middleByte = (bytes[firstByteIndex + i + 1] & BYTE_MASK);
153 |             // Push middle byte onto accumulator.
154 |             value <<= BITS_PER_BYTE;
155 |             value |= middleByte;
156 |         }
157 | 
158 |         // --------------------------------------------------------------------
159 |         // Last byte
160 |         long lastByte = (bytes[lastByteIndex] & BYTE_MASK);
161 |         lastByte >>= (BITS_PER_BYTE - lastByteBitsToConsume);
162 |         value <<= lastByteBitsToConsume;
163 |         value |= lastByte;
164 |         return value;
165 |     }
166 | 
167 |     /* (non-Javadoc)
168 |      * @see net.agkn.hll.serialization.IWordDeserializer#totalWordCount()
169 |      */
170 |     @Override
171 |     public int totalWordCount() {
172 |         return wordCount;
173 |     }
174 | }
175 | 


--------------------------------------------------------------------------------
/src/main/java/net/agkn/hll/serialization/BigEndianAscendingWordSerializer.java:
--------------------------------------------------------------------------------
  1 | package net.agkn.hll.serialization;
  2 | 
  3 | /*
  4 |  * Copyright 2013 Aggregate Knowledge, Inc.
  5 |  *
  6 |  * Licensed under the Apache License, Version 2.0 (the "License");
  7 |  * you may not use this file except in compliance with the License.
  8 |  * You may obtain a copy of the License at
  9 |  *
 10 |  *     http://www.apache.org/licenses/LICENSE-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS,
 14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 |  * See the License for the specific language governing permissions and
 16 |  * limitations under the License.
 17 |  */
 18 | 
 19 | /**
 20 |  * A serializer that writes a sequence of fixed bit-width 'words' to a byte array.
 21 |  * Bitwise OR is used to write words into bytes, so a low bit in a word is also
 22 |  * a low bit in a byte. However, a high byte in a word is written at a lower index
 23 |  * in the array than a low byte in a word. The first word is written at the lowest
 24 |  * array index. Each serializer is one time use and returns its backing byte
 25 |  * array.<p/>
 26 |  *
 27 |  * This encoding was chosen so that when reading bytes as octets in the typical
 28 |  * first-octet-is-the-high-nibble fashion, an octet-to-binary conversion
 29 |  * would yield a high-to-low, left-to-right view of the "short words".<p/>
 30 |  *
 31 |  * Example:<p/>
 32 |  *
 33 |  * Say short words are 5 bits wide. Our word sequence is the values
 34 |  * <code>[31, 1, 5]</code>. In big-endian binary format, the values are
 35 |  * <code>[0b11111, 0b00001, 0b00101]</code>. We use 15 of 16 bits in two bytes
 36 |  * and pad the last (lowest) bit of the last byte with a zero:
 37 |  *
 38 |  * <code>
 39 |  *  [0b11111000, 0b01001010] = [0xF8, 0x4A]
 40 |  * </code>.
 41 |  *
 42 |  * @author timon
 43 |  */
 44 | public class BigEndianAscendingWordSerializer implements IWordSerializer {
 45 |     // The number of bits per byte.
 46 |     private static final int BITS_PER_BYTE = 8;
 47 | 
 48 |     // ************************************************************************
 49 |     // The length in bits of the words to be written.
 50 |     private final int wordLength;
 51 |     // The number of words to be written.
 52 |     private final int wordCount;
 53 | 
 54 |     // The byte array to which the words are serialized.
 55 |     private final byte[] bytes;
 56 | 
 57 |     // ------------------------------------------------------------------------
 58 |     // Write state
 59 |     // Number of bits that remain writable in the current byte.
 60 |     private int bitsLeftInByte;
 61 |     // Index of byte currently being written to.
 62 |     private int byteIndex;
 63 |     // Number of words written.
 64 |     private int wordsWritten;
 65 | 
 66 |     // ========================================================================
 67 |     /**
 68 |      * @param wordLength the length in bits of the words to be serialized. Must
 69 |      *        be greater than or equal to 1 and less than or equal to 64.
 70 |      * @param wordCount the number of words to be serialized. Must be greater than
 71 |      *        or equal to zero.
 72 |      * @param bytePadding the number of leading bytes that should pad the
 73 |      *        serialized words. Must be greater than or equal to zero.
 74 |      */
 75 |     public BigEndianAscendingWordSerializer(final int wordLength, final int wordCount, final int bytePadding) {
 76 |         if((wordLength < 1) || (wordLength > 64)) {
 77 |             throw new IllegalArgumentException("Word length must be >= 1 and <= 64. (was: " + wordLength + ")");
 78 |         }
 79 |         if(wordCount < 0) {
 80 |             throw new IllegalArgumentException("Word count must be >= 0. (was: " + wordCount + ")");
 81 |         }
 82 |         if(bytePadding < 0) {
 83 |             throw new IllegalArgumentException("Byte padding must be must be >= 0. (was: " + bytePadding + ")");
 84 |         }
 85 | 
 86 |         this.wordLength = wordLength;
 87 |         this.wordCount = wordCount;
 88 | 
 89 |         final long bitsRequired = (wordLength * wordCount);
 90 |         final boolean leftoverBits = ((bitsRequired % BITS_PER_BYTE) != 0);
 91 |         final int bytesRequired = (int)(bitsRequired / BITS_PER_BYTE) + (leftoverBits ? 1 : 0) + bytePadding;
 92 |         bytes = new byte[bytesRequired];
 93 | 
 94 |         bitsLeftInByte = BITS_PER_BYTE;
 95 |         byteIndex = bytePadding;
 96 |         wordsWritten = 0;
 97 |     }
 98 | 
 99 |     /* (non-Javadoc)
100 |      * @see net.agkn.hll.serialization.IWordSerializer#writeWord(long)
101 |      * @throws RuntimeException if the number of words written is greater than the
102 |      *         <code>wordCount</code> parameter in the constructor.
103 |      */
104 |     @Override
105 |     public void writeWord(final long word) {
106 |         if(wordsWritten == wordCount) {
107 |             throw new RuntimeException("Cannot write more words, backing array full!");
108 |         }
109 | 
110 |         int bitsLeftInWord = wordLength;
111 | 
112 |         while(bitsLeftInWord > 0) {
113 |             // Move to the next byte if the current one is fully packed.
114 |             if(bitsLeftInByte == 0) {
115 |                 byteIndex++;
116 |                 bitsLeftInByte = BITS_PER_BYTE;
117 |             }
118 | 
119 |             final long consumedMask;
120 |             if(bitsLeftInWord == 64) {
121 |                 consumedMask = ~0L;
122 |             } else {
123 |                 consumedMask = ((1L << bitsLeftInWord) - 1L);
124 |             }
125 | 
126 |             // Fix how many bits will be written in this cycle. Choose the
127 |             // smaller of the remaining bits in the word or byte.
128 |             final int numberOfBitsToWrite = Math.min(bitsLeftInByte, bitsLeftInWord);
129 |             final int bitsInByteRemainingAfterWrite = (bitsLeftInByte - numberOfBitsToWrite);
130 | 
131 |             // In general, we write the highest bits of the word first, so we
132 |             // strip the highest bits that were consumed in previous cycles.
133 |             final long remainingBitsOfWordToWrite = (word & consumedMask);
134 | 
135 |             final long bitsThatTheByteCanAccept;
136 |             // If there is more left in the word than can be written to this
137 |             // byte, shift off the bits that can't be written off the bottom.
138 |             if(bitsLeftInWord > numberOfBitsToWrite) {
139 |                 bitsThatTheByteCanAccept = (remainingBitsOfWordToWrite >>> (bitsLeftInWord - bitsLeftInByte));
140 |             } else {
141 |                 // If the byte can accept all remaining bits, there is no need
142 |                 // to shift off the bits that won't be written in this cycle.
143 |                 bitsThatTheByteCanAccept = remainingBitsOfWordToWrite;
144 |             }
145 | 
146 |             // Align the word bits to write up against the byte bits that have
147 |             // already been written. This shift may do nothing if the remainder
148 |             // of the byte is being consumed in this cycle.
149 |             final long alignedBits = (bitsThatTheByteCanAccept << bitsInByteRemainingAfterWrite);
150 | 
151 |             // Update the byte with the alignedBits.
152 |             bytes[byteIndex] |= (byte)alignedBits;
153 | 
154 |             // Update state with bit count written.
155 |             bitsLeftInWord -= numberOfBitsToWrite;
156 |             bitsLeftInByte = bitsInByteRemainingAfterWrite;
157 |         }
158 | 
159 |         wordsWritten ++;
160 |     }
161 | 
162 |     /* (non-Javadoc)
163 |      * @see net.agkn.hll.serialization.IWordSerializer#getBytes()
164 |      * @throws RuntimeException if the number of words written is fewer than the
165 |      *         <code>wordCount</code> parameter in the constructor.
166 |      */
167 |     @Override
168 |     public byte[] getBytes() {
169 |         if(wordsWritten < wordCount) {
170 |             throw new RuntimeException("Not all words have been written! (" + wordsWritten + "/" + wordCount + ")");
171 |         }
172 | 
173 |         return bytes;
174 |     }
175 | }
176 | 


--------------------------------------------------------------------------------
/src/main/java/net/agkn/hll/serialization/HLLMetadata.java:
--------------------------------------------------------------------------------
  1 | package net.agkn.hll.serialization;
  2 | 
  3 | /*
  4 |  * Copyright 2013 Aggregate Knowledge, Inc.
  5 |  *
  6 |  * Licensed under the Apache License, Version 2.0 (the "License");
  7 |  * you may not use this file except in compliance with the License.
  8 |  * You may obtain a copy of the License at
  9 |  *
 10 |  *     http://www.apache.org/licenses/LICENSE-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS,
 14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 |  * See the License for the specific language governing permissions and
 16 |  * limitations under the License.
 17 |  */
 18 | 
 19 | import net.agkn.hll.HLLType;
 20 | 
 21 | /**
 22 |  * A concrete {@link IHLLMetadata} implemented as a simple struct.
 23 |  *
 24 |  * @author timon
 25 |  */
 26 | public class HLLMetadata implements IHLLMetadata {
 27 |     private final int schemaVersion;
 28 |     private final HLLType type;
 29 |     private final int registerCountLog2;
 30 |     private final int registerWidth;
 31 |     private final int log2ExplicitCutoff;
 32 |     private final boolean explicitOff;
 33 |     private final boolean explicitAuto;
 34 |     private final boolean sparseEnabled;
 35 | 
 36 |     /**
 37 |      * @param schemaVersion the schema version number of the HLL. This must
 38 |      *        be greater than or equal to zero.
 39 |      * @param type the {@link HLLType type} of the HLL. This cannot
 40 |      *        be <code>null</code>.
 41 |      * @param registerCountLog2 the log-base-2 register count parameter for
 42 |      *        probabilistic HLLs. This must be greater than or equal to zero.
 43 |      * @param registerWidth the register width parameter for probabilistic
 44 |      *        HLLs. This must be greater than or equal to zero.
 45 |      * @param log2ExplicitCutoff the log-base-2 of the explicit cardinality cutoff,
 46 |      *        if it is explicitly defined. (If <code>explicitOff</code> or
 47 |      *        <code>explicitAuto</code> is <code>true</code> then this has no
 48 |      *        meaning.)
 49 |      * @param explicitOff the flag for 'explicit off'-mode, where the
 50 |      *        {@link HLLType#EXPLICIT} representation is not used. Both this and
 51 |      *        <code>explicitAuto</code> cannot be <code>true</code> at the same
 52 |      *        time.
 53 |      * @param explicitAuto the flag for 'explicit auto'-mode, where the
 54 |      *        {@link HLLType#EXPLICIT} representation's promotion cutoff is
 55 |      *        determined based on in-memory size automatically. Both this and
 56 |      *        <code>explicitOff</code> cannot be <code>true</code> at the same
 57 |      *        time.
 58 |      * @param sparseEnabled the flag for 'sparse-enabled'-mode, where the
 59 |      *        {@link HLLType#SPARSE} representation is used.
 60 |      */
 61 |     public HLLMetadata(final int schemaVersion,
 62 |                        final HLLType type,
 63 |                        final int registerCountLog2,
 64 |                        final int registerWidth,
 65 |                        final int log2ExplicitCutoff,
 66 |                        final boolean explicitOff,
 67 |                        final boolean explicitAuto,
 68 |                        final boolean sparseEnabled) {
 69 |         this.schemaVersion = schemaVersion;
 70 |         this.type = type;
 71 |         this.registerCountLog2 = registerCountLog2;
 72 |         this.registerWidth = registerWidth;
 73 |         this.log2ExplicitCutoff = log2ExplicitCutoff;
 74 |         this.explicitOff = explicitOff;
 75 |         this.explicitAuto = explicitAuto;
 76 |         this.sparseEnabled = sparseEnabled;
 77 |     }
 78 | 
 79 |     /* (non-Javadoc)
 80 |      * @see net.agkn.hll.serialization.IHLLMetadata#schemaVersion()
 81 |      */
 82 |     @Override
 83 |     public int schemaVersion() { return schemaVersion; }
 84 | 
 85 |     /* (non-Javadoc)
 86 |      * @see net.agkn.hll.serialization.IHLLMetadata#HLLType()
 87 |      */
 88 |     @Override
 89 |     public HLLType HLLType() { return type; }
 90 | 
 91 |     /* (non-Javadoc)
 92 |      * @see net.agkn.hll.serialization.IHLLMetadata#registerCountLog2()
 93 |      */
 94 |     @Override
 95 |     public int registerCountLog2() { return registerCountLog2; }
 96 | 
 97 |     /* (non-Javadoc)
 98 |      * @see net.agkn.hll.serialization.IHLLMetadata#registerWidth()
 99 |      */
100 |     @Override
101 |     public int registerWidth() { return registerWidth; }
102 | 
103 |     /* (non-Javadoc)
104 |      * @see net.agkn.hll.serialization.IHLLMetadata#log2ExplicitCutoff()
105 |      */
106 |     @Override
107 |     public int log2ExplicitCutoff() { return log2ExplicitCutoff; }
108 | 
109 |     /* (non-Javadoc)
110 |      * @see net.agkn.hll.serialization.IHLLMetadata#explicitOff()
111 |      */
112 |     @Override
113 |     public boolean explicitOff() {
114 |         return explicitOff;
115 |     }
116 | 
117 |     /* (non-Javadoc)
118 |      * @see net.agkn.hll.serialization.IHLLMetadata#explicitAuto()
119 |      * @see net.agkn.hll.serialization.IHLLMetadata#log2ExplicitCutoff()
120 |      */
121 |     @Override
122 |     public boolean explicitAuto() {
123 |         return explicitAuto;
124 |     }
125 | 
126 |     /* (non-Javadoc)
127 |      * @see net.agkn.hll.serialization.IHLLMetadata#sparseEnabled()
128 |      */
129 |     @Override
130 |     public boolean sparseEnabled() { return sparseEnabled; }
131 | 
132 |     /* (non-Javadoc)
133 |      * @see java.lang.Object#toString()
134 |      */
135 |     @Override
136 |     public String toString() {
137 |         return "<HLLMetadata schemaVersion: " + this.schemaVersion + ", type: " + this.type.toString() + ", registerCountLog2: " + this.registerCountLog2 + ", registerWidth: " + this.registerWidth + ", log2ExplicitCutoff: " + this.log2ExplicitCutoff + ", explicitOff: " + this.explicitOff + ", explicitAuto: " +this.explicitAuto + ">";
138 |     }
139 | }
140 | 


--------------------------------------------------------------------------------
/src/main/java/net/agkn/hll/serialization/IHLLMetadata.java:
--------------------------------------------------------------------------------
 1 | package net.agkn.hll.serialization;
 2 | 
 3 | /*
 4 |  * Copyright 2013 Aggregate Knowledge, Inc.
 5 |  *
 6 |  * Licensed under the Apache License, Version 2.0 (the "License");
 7 |  * you may not use this file except in compliance with the License.
 8 |  * You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | 
19 | import net.agkn.hll.HLLType;
20 | 
21 | /**
22 |  * The metadata and parameters associated with a HLL.
23 |  */
24 | public interface IHLLMetadata {
25 |     /**
26 |      * @return the schema version of the HLL. This will never be <code>null</code>.
27 |      */
28 |     int schemaVersion();
29 | 
30 |     /**
31 |      * @return the type of the HLL. This will never be <code>null</code>.
32 |      */
33 |     HLLType HLLType();
34 | 
35 |     /**
36 |      * @return the log-base-2 of the register count parameter of the HLL. This
37 |      *         will always be greater than or equal to 4 and less than or equal
38 |      *         to 31.
39 |      */
40 |     int registerCountLog2();
41 | 
42 |     /**
43 |      * @return the register width parameter of the HLL. This will always be
44 |      *         greater than or equal to 1 and less than or equal to 8.
45 |      */
46 |     int registerWidth();
47 | 
48 |     /**
49 |      * @return the log-base-2 of the explicit cutoff cardinality. This will always
50 |      *         be greater than or equal to zero and less than 31, per the specification.
51 |      */
52 |     int log2ExplicitCutoff();
53 | 
54 |     /**
55 |      * @return <code>true</code> if the {@link HLLType#EXPLICIT} representation
56 |      *         has been disabled. <code>false</code> otherwise.
57 |      */
58 |     boolean explicitOff();
59 | 
60 |     /**
61 |      * @return <code>true</code> if the {@link HLLType#EXPLICIT} representation
62 |      *         cutoff cardinality is set to be automatically chosen,
63 |      *         <code>false</code> otherwise.
64 |      */
65 |     boolean explicitAuto();
66 | 
67 |     /**
68 |      * @return <code>true</code> if the {@link HLLType#SPARSE} representation
69 |      *         is enabled.
70 |      */
71 |     boolean sparseEnabled();
72 | }


--------------------------------------------------------------------------------
/src/main/java/net/agkn/hll/serialization/ISchemaVersion.java:
--------------------------------------------------------------------------------
 1 | package net.agkn.hll.serialization;
 2 | 
 3 | /*
 4 |  * Copyright 2013 Aggregate Knowledge, Inc.
 5 |  *
 6 |  * Licensed under the Apache License, Version 2.0 (the "License");
 7 |  * you may not use this file except in compliance with the License.
 8 |  * You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | 
19 | import net.agkn.hll.HLLType;
20 | 
21 | /**
22 |  * A serialization schema for HLLs. Reads and writes HLL metadata to
23 |  * and from <code>byte[]</code> representations.
24 |  *
25 |  * @author timon
26 |  */
27 | public interface ISchemaVersion {
28 |     /**
29 |      * The number of metadata bytes required for a serialized HLL of the
30 |      * specified type.
31 |      *
32 |      * @param  type the type of the serialized HLL
33 |      * @return the number of padding bytes needed in order to fully accommodate
34 |      *         the needed metadata.
35 |      */
36 |     int paddingBytes(HLLType type);
37 | 
38 |     /**
39 |      * Writes metadata bytes to serialized HLL.
40 |      *
41 |      * @param bytes the padded data bytes of the HLL
42 |      * @param metadata the metadata to write to the padding bytes
43 |      */
44 |     void writeMetadata(byte[] bytes, IHLLMetadata metadata);
45 | 
46 |     /**
47 |      * Reads the metadata bytes of the serialized HLL.
48 |      *
49 |      * @param  bytes the serialized HLL
50 |      * @return the HLL metadata
51 |      */
52 |     IHLLMetadata readMetadata(byte[] bytes);
53 | 
54 |     /**
55 |      * Builds an HLL serializer that matches this schema version.
56 |      *
57 |      * @param  type the HLL type that will be serialized. This cannot be
58 |      *         <code>null</code>.
59 |      * @param  wordLength the length of the 'words' that comprise the data of the
60 |      *         HLL. Words must be at least 5 bits and at most 64 bits long.
61 |      * @param  wordCount the number of 'words' in the HLL's data.
62 |      * @return a byte array serializer used to serialize a HLL according
63 |      *         to this schema version's specification.
64 |      * @see #paddingBytes(HLLType)
65 |      * @see IWordSerializer
66 |      */
67 |     IWordSerializer getSerializer(HLLType type, int wordLength, int wordCount);
68 | 
69 |     /**
70 |      * Builds an HLL deserializer that matches this schema version.
71 |      *
72 |      * @param  type the HLL type that will be deserialized. This cannot be
73 |      *         <code>null</code>.
74 |      * @param  wordLength the length of the 'words' that comprise the data of the
75 |      *         serialized HLL. Words must be at least 5 bits and at most 64
76 |      *         bits long.
77 |      * @param  bytes the serialized HLL to deserialize. This cannot be
78 |      *         <code>null</code>.
79 |      * @return a byte array deserializer used to deserialize a HLL serialized
80 |      *         according to this schema version's specification.
81 |      */
82 |     IWordDeserializer getDeserializer(HLLType type, int wordLength, byte[] bytes);
83 | 
84 |     /**
85 |      * @return the schema version number.
86 |      */
87 |     int schemaVersionNumber();
88 | }
89 | 


--------------------------------------------------------------------------------
/src/main/java/net/agkn/hll/serialization/IWordDeserializer.java:
--------------------------------------------------------------------------------
 1 | package net.agkn.hll.serialization;
 2 | 
 3 | /*
 4 |  * Copyright 2013 Aggregate Knowledge, Inc.
 5 |  *
 6 |  * Licensed under the Apache License, Version 2.0 (the "License");
 7 |  * you may not use this file except in compliance with the License.
 8 |  * You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | 
19 | /**
20 |  * Reads 'words' of a fixed width, in sequence, from a byte array.
21 |  *
22 |  * @author timon
23 |  */
24 | public interface IWordDeserializer {
25 |     /**
26 |      * @return the next word in the sequence. Should not be called more than
27 |      * {@link #totalWordCount()} times.
28 |      */
29 |     long readWord();
30 | 
31 |     /**
32 |      * Returns the number of words that could be encoded in the sequence.<p/>
33 |      *
34 |      * NOTE:  the sequence that was encoded may be shorter than the value this
35 |      *        method returns due to padding issues within bytes. This guarantees
36 |      *        only an upper bound on the number of times {@link #readWord()}
37 |      *        can be called.
38 |      *
39 |      * @return the maximum number of words that could be read from the sequence.
40 |      */
41 |     int totalWordCount();
42 | }


--------------------------------------------------------------------------------
/src/main/java/net/agkn/hll/serialization/IWordSerializer.java:
--------------------------------------------------------------------------------
 1 | package net.agkn.hll.serialization;
 2 | 
 3 | /*
 4 |  * Copyright 2013 Aggregate Knowledge, Inc.
 5 |  *
 6 |  * Licensed under the Apache License, Version 2.0 (the "License");
 7 |  * you may not use this file except in compliance with the License.
 8 |  * You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | 
19 | /**
20 |  * Writes 'words' of fixed width, in sequence, to a byte array.
21 |  *
22 |  * @author timon
23 |  */
24 | public interface IWordSerializer {
25 | 
26 |     /**
27 |      * Writes the word to the backing array.
28 |      *
29 |      * @param  word the word to write.
30 |      */
31 |     void writeWord(final long word);
32 | 
33 |     /**
34 |      * Returns the backing array of <code>byte</code>s that contain the serialized
35 |      * words.
36 |      * @return the serialized words as a <code>byte[]</code>.
37 |      */
38 |     byte[] getBytes();
39 | 
40 | }


--------------------------------------------------------------------------------
/src/main/java/net/agkn/hll/serialization/SchemaVersionOne.java:
--------------------------------------------------------------------------------
  1 | package net.agkn.hll.serialization;
  2 | 
  3 | /*
  4 |  * Copyright 2013 Aggregate Knowledge, Inc.
  5 |  *
  6 |  * Licensed under the Apache License, Version 2.0 (the "License");
  7 |  * you may not use this file except in compliance with the License.
  8 |  * You may obtain a copy of the License at
  9 |  *
 10 |  *     http://www.apache.org/licenses/LICENSE-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS,
 14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 |  * See the License for the specific language governing permissions and
 16 |  * limitations under the License.
 17 |  */
 18 | 
 19 | import net.agkn.hll.HLLType;
 20 | 
 21 | /**
 22 |  * A concrete {@link ISchemaVersion} representing schema version one.
 23 |  *
 24 |  * @author timon
 25 |  */
 26 | public class SchemaVersionOne implements ISchemaVersion {
 27 |     /**
 28 |      * The schema version number for this instance.
 29 |      */
 30 |     public static final int SCHEMA_VERSION = 1;
 31 | 
 32 |     // ------------------------------------------------------------------------
 33 |     // Version-specific ordinals (array position) for each of the HLL types
 34 |     private static final HLLType[] TYPE_ORDINALS = new HLLType[] {
 35 |         HLLType.UNDEFINED,
 36 |         HLLType.EMPTY,
 37 |         HLLType.EXPLICIT,
 38 |         HLLType.SPARSE,
 39 |         HLLType.FULL
 40 |     };
 41 | 
 42 |     // ------------------------------------------------------------------------
 43 |     // number of header bytes for all HLL types
 44 |     private static final int HEADER_BYTE_COUNT = 3;
 45 | 
 46 |     // sentinel values from the spec for explicit off and auto
 47 |     private static final int EXPLICIT_OFF = 0;
 48 |     private static final int EXPLICIT_AUTO = 63;
 49 | 
 50 |     // ************************************************************************
 51 |     /* (non-Javadoc)
 52 |      * @see net.agkn.hll.serialization.ISchemaVersion#paddingBytes(HLLType)
 53 |      */
 54 |     @Override
 55 |     public int paddingBytes(final HLLType type) {
 56 |         return HEADER_BYTE_COUNT;
 57 |     }
 58 | 
 59 |     /* (non-Javadoc)
 60 |      * @see net.agkn.hll.serialization.ISchemaVersion#writeMetadata(byte[], IHLLMetadata)
 61 |      */
 62 |     @Override
 63 |     public void writeMetadata(final byte[] bytes, final IHLLMetadata metadata) {
 64 |         final HLLType type = metadata.HLLType();
 65 |         final int typeOrdinal = getOrdinal(type);
 66 | 
 67 |         final int explicitCutoffValue;
 68 |         if(metadata.explicitOff()) {
 69 |             explicitCutoffValue = EXPLICIT_OFF;
 70 |         } else if(metadata.explicitAuto()) {
 71 |             explicitCutoffValue = EXPLICIT_AUTO;
 72 |         } else {
 73 |             explicitCutoffValue = metadata.log2ExplicitCutoff() + 1/*per spec*/;
 74 |         }
 75 | 
 76 |         bytes[0] = SerializationUtil.packVersionByte(SCHEMA_VERSION, typeOrdinal);
 77 |         bytes[1] = SerializationUtil.packParametersByte(metadata.registerWidth(), metadata.registerCountLog2());
 78 |         bytes[2] = SerializationUtil.packCutoffByte(explicitCutoffValue, metadata.sparseEnabled());
 79 |     }
 80 | 
 81 |     /* (non-Javadoc)
 82 |      * @see net.agkn.hll.serialization.ISchemaVersion#readMetadata(byte[])
 83 |      */
 84 |     @Override
 85 |     public IHLLMetadata readMetadata(final byte[] bytes) {
 86 |         final byte versionByte = bytes[0];
 87 |         final byte parametersByte = bytes[1];
 88 |         final byte cutoffByte = bytes[2];
 89 | 
 90 |         final int typeOrdinal = SerializationUtil.typeOrdinal(versionByte);
 91 |         final int explicitCutoffValue = SerializationUtil.explicitCutoff(cutoffByte);
 92 |         final boolean explicitOff = (explicitCutoffValue == EXPLICIT_OFF);
 93 |         final boolean explicitAuto = (explicitCutoffValue == EXPLICIT_AUTO);
 94 |         final int log2ExplicitCutoff = (explicitOff || explicitAuto) ? -1/*sentinel*/ : (explicitCutoffValue - 1/*per spec*/);
 95 | 
 96 |         return new HLLMetadata(SCHEMA_VERSION,
 97 |                                     getType(typeOrdinal),
 98 |                                     SerializationUtil.registerCountLog2(parametersByte),
 99 |                                     SerializationUtil.registerWidth(parametersByte),
100 |                                     log2ExplicitCutoff,
101 |                                     explicitOff,
102 |                                     explicitAuto,
103 |                                     SerializationUtil.sparseEnabled(cutoffByte));
104 |     }
105 | 
106 |     /* (non-Javadoc)
107 |      * @see net.agkn.hll.serialization.ISchemaVersion#getSerializer(HLLType, int, int)
108 |      */
109 |     @Override
110 |     public IWordSerializer getSerializer(HLLType type, int wordLength, int wordCount) {
111 |         return new BigEndianAscendingWordSerializer(wordLength, wordCount, paddingBytes(type));
112 |     }
113 | 
114 |     /* (non-Javadoc)
115 |      * @see net.agkn.hll.serialization.ISchemaVersion#getDeserializer(HLLType, int, byte[])
116 |      */
117 |     @Override
118 |     public IWordDeserializer getDeserializer(HLLType type, int wordLength, byte[] bytes) {
119 |         return new BigEndianAscendingWordDeserializer(wordLength, paddingBytes(type), bytes);
120 |     }
121 | 
122 |     /* (non-Javadoc)
123 |      * @see net.agkn.hll.serialization.ISchemaVersion#schemaVersionNumber()
124 |      */
125 |     @Override
126 |     public int schemaVersionNumber() {
127 |         return SCHEMA_VERSION;
128 |     }
129 | 
130 |     // ========================================================================
131 |     // Type/Ordinal lookups
132 |     /**
133 |      * Gets the ordinal for the specified {@link HLLType}.
134 |      *
135 |      * @param  type the type whose ordinal is desired
136 |      * @return the ordinal for the specified type, to be used in the version byte.
137 |      *         This will always be non-negative.
138 |      */
139 |     private static int getOrdinal(final HLLType type) {
140 |         for(int i=0; i<TYPE_ORDINALS.length; i++) {
141 |             if(TYPE_ORDINALS[i].equals(type)) return i;
142 |         }
143 |         throw new RuntimeException("Unknown HLL type " + type);
144 |     }
145 | 
146 |     /**
147 |      * Gets the {@link HLLType} for the specified ordinal.
148 |      *
149 |      * @param  ordinal the ordinal whose type is desired
150 |      * @return the type for the specified ordinal. This will never be <code>null</code>.
151 |      */
152 |     private static HLLType getType(final int ordinal) {
153 |         if((ordinal < 0) || (ordinal >= TYPE_ORDINALS.length)) {
154 |             throw new IllegalArgumentException("Invalid type ordinal '" + ordinal + "'. Only 0-" + (TYPE_ORDINALS.length - 1) + " inclusive allowed.");
155 |         }
156 |         return TYPE_ORDINALS[ordinal];
157 |     }
158 | }
159 | 


--------------------------------------------------------------------------------
/src/main/java/net/agkn/hll/serialization/SerializationUtil.java:
--------------------------------------------------------------------------------
  1 | package net.agkn.hll.serialization;
  2 | 
  3 | /*
  4 |  * Copyright 2013 Aggregate Knowledge, Inc.
  5 |  *
  6 |  * Licensed under the Apache License, Version 2.0 (the "License");
  7 |  * you may not use this file except in compliance with the License.
  8 |  * You may obtain a copy of the License at
  9 |  *
 10 |  *     http://www.apache.org/licenses/LICENSE-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS,
 14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 |  * See the License for the specific language governing permissions and
 16 |  * limitations under the License.
 17 |  */
 18 | 
 19 | import net.agkn.hll.HLLType;
 20 | 
 21 | /**
 22 |  * A collection of constants and utilities for serializing and deserializing
 23 |  * HLLs.
 24 |  *
 25 |  * NOTE:  'package' visibility is used for many methods that only need to be
 26 |  *        used by the {@link ISchemaVersion} implementations. The structure of
 27 |  *        a serialized HLL's metadata should be opaque to the rest of the
 28 |  *        library.
 29 |  *
 30 |  * @author timon
 31 |  */
 32 | public class SerializationUtil {
 33 |     /**
 34 |      * The number of bits (of the parameters byte) dedicated to encoding the
 35 |      * width of the registers.
 36 |      */
 37 |     /*package*/ static int REGISTER_WIDTH_BITS = 3;
 38 | 
 39 |     /**
 40 |      * A mask to cap the maximum value of the register width.
 41 |      */
 42 |     /*package*/ static int REGISTER_WIDTH_MASK = (1 << REGISTER_WIDTH_BITS) - 1;
 43 | 
 44 |     /**
 45 |      * The number of bits (of the parameters byte) dedicated to encoding
 46 |      * <code>log2(registerCount)</code>.
 47 |      */
 48 |     /*package*/ static int LOG2_REGISTER_COUNT_BITS = 5;
 49 | 
 50 |     /**
 51 |      * A mask to cap the maximum value of <code>log2(registerCount)</code>.
 52 |      */
 53 |     /*package*/ static int LOG2_REGISTER_COUNT_MASK = (1 << LOG2_REGISTER_COUNT_BITS) - 1;
 54 | 
 55 |     /**
 56 |      * The number of bits (of the cutoff byte) dedicated to encoding the
 57 |      * log-base-2 of the explicit cutoff or sentinel values for
 58 |      * 'explicit-disabled' or 'auto'.
 59 |      */
 60 |     /*package*/ static int EXPLICIT_CUTOFF_BITS = 6;
 61 | 
 62 |     /**
 63 |      * A mask to cap the maximum value of the explicit cutoff choice.
 64 |      */
 65 |     /*package*/ static int EXPLICIT_CUTOFF_MASK = (1 << EXPLICIT_CUTOFF_BITS) - 1;
 66 | 
 67 |     /**
 68 |      * Number of bits in a nibble.
 69 |      */
 70 |     private static int NIBBLE_BITS = 4;
 71 | 
 72 |     /**
 73 |      * A mask to cap the maximum value of a nibble.
 74 |      */
 75 |     private static int NIBBLE_MASK = (1 << NIBBLE_BITS) - 1;
 76 | 
 77 |     // ************************************************************************
 78 |     // Serialization utilities
 79 | 
 80 |     /**
 81 |      * Schema version one (v1).
 82 |      */
 83 |     public static ISchemaVersion VERSION_ONE = new SchemaVersionOne();
 84 | 
 85 |     /**
 86 |      * The default schema version for serializing HLLs.
 87 |      */
 88 |     public static ISchemaVersion DEFAULT_SCHEMA_VERSION = VERSION_ONE;
 89 | 
 90 |     /**
 91 |      * List of registered schema versions, indexed by their version numbers. If
 92 |      * an entry is <code>null</code>, then no such schema version is registered.
 93 |      * Similarly, registering a new schema version simply entails assigning an
 94 |      * {@link ISchemaVersion} instance to the appropriate index of this array.<p/>
 95 |      *
 96 |      * By default, only {@link SchemaVersionOne} is registered. Note that version
 97 |      * zero will always be reserved for internal (e.g. proprietary, legacy) schema
 98 |      * specifications/implementations and will never be assigned to in by this
 99 |      * library.
100 |      */
101 |     public static ISchemaVersion[] REGISTERED_SCHEMA_VERSIONS = new ISchemaVersion[16];
102 | 
103 |     static {
104 |         REGISTERED_SCHEMA_VERSIONS[1] = VERSION_ONE;
105 |     }
106 | 
107 |     /**
108 |      * @param  schemaVersionNumber the version number of the {@link ISchemaVersion}
109 |      *         desired. This must be a registered schema version number.
110 |      * @return The {@link ISchemaVersion} for the given number. This will never
111 |      *         be <code>null</code>.
112 |      */
113 |     public static ISchemaVersion getSchemaVersion(final int schemaVersionNumber) {
114 |         if(schemaVersionNumber >= REGISTERED_SCHEMA_VERSIONS.length || schemaVersionNumber < 0) {
115 |             throw new RuntimeException("Invalid schema version number " + schemaVersionNumber);
116 |         }
117 |         final ISchemaVersion schemaVersion = REGISTERED_SCHEMA_VERSIONS[schemaVersionNumber];
118 |         if(schemaVersion == null) {
119 |             throw new RuntimeException("Unknown schema version number " + schemaVersionNumber);
120 |         }
121 |         return schemaVersion;
122 |     }
123 | 
124 |     /**
125 |      * Get the appropriate {@link ISchemaVersion schema version} for the specified
126 |      * serialized HLL.
127 |      *
128 |      * @param  bytes the serialized HLL whose schema version is desired.
129 |      * @return the schema version for the specified HLL. This will never
130 |      *         be <code>null</code>.
131 |      */
132 |     public static ISchemaVersion getSchemaVersion(final byte[] bytes) {
133 |         final byte versionByte = bytes[0];
134 |         final int schemaVersionNumber = schemaVersion(versionByte);
135 | 
136 |         return getSchemaVersion(schemaVersionNumber);
137 |     }
138 | 
139 |     // ************************************************************************
140 |     // Package-specific shared helpers
141 | 
142 |     /**
143 |      * Generates a byte that encodes the schema version and the type ordinal
144 |      * of the HLL.
145 |      *
146 |      * The top nibble is the schema version and the bottom nibble is the type
147 |      * ordinal.
148 |      *
149 |      * @param schemaVersion the schema version to encode.
150 |      * @param typeOrdinal the type ordinal of the HLL to encode.
151 |      * @return the packed version byte
152 |      */
153 |     public static byte packVersionByte(final int schemaVersion, final int typeOrdinal) {
154 |         return (byte)(((NIBBLE_MASK & schemaVersion) << NIBBLE_BITS) | (NIBBLE_MASK & typeOrdinal));
155 |     }
156 |     /**
157 |      * Generates a byte that encodes the log-base-2 of the explicit cutoff
158 |      * or sentinel values for 'explicit-disabled' or 'auto', as well as the
159 |      * boolean indicating whether to use {@link HLLType#SPARSE}
160 |      * in the promotion hierarchy.
161 |      *
162 |      * The top bit is always padding, the second highest bit indicates the
163 |      * 'sparse-enabled' boolean, and the lowest six bits encode the explicit
164 |      * cutoff value.
165 |      *
166 |      * @param  explicitCutoff the explicit cutoff value to encode.
167 |      *         <ul>
168 |      *           <li>
169 |      *             If 'explicit-disabled' is chosen, this value should be <code>0</code>.
170 |      *           </li>
171 |      *           <li>
172 |      *             If 'auto' is chosen, this value should be <code>63</code>.
173 |      *           </li>
174 |      *           <li>
175 |      *             If a cutoff of 2<sup>n</sup> is desired, for <code>0 <= n < 31</code>,
176 |      *             this value should be <code>n + 1</code>.
177 |      *           </li>
178 |      *         </ul>
179 |      * @param  sparseEnabled whether {@link HLLType#SPARSE}
180 |      *         should be used in the promotion hierarchy to improve HLL
181 |      *         storage.
182 |      *
183 |      * @return the packed cutoff byte
184 |      */
185 |     public static byte packCutoffByte(final int explicitCutoff, final boolean sparseEnabled) {
186 |         final int sparseBit = (sparseEnabled ? (1 << EXPLICIT_CUTOFF_BITS) : 0);
187 |         return (byte)(sparseBit | (EXPLICIT_CUTOFF_MASK & explicitCutoff));
188 |     }
189 | 
190 |     /**
191 |      * Generates a byte that encodes the parameters of a
192 |      * {@link HLLType#FULL} or {@link HLLType#SPARSE}
193 |      * HLL.<p/>
194 |      *
195 |      * The top 3 bits are used to encode <code>registerWidth - 1</code>
196 |      * (range of <code>registerWidth</code> is thus 1-9) and the bottom 5
197 |      * bits are used to encode <code>registerCountLog2</code>
198 |      * (range of <code>registerCountLog2</code> is thus 0-31).
199 |      *
200 |      * @param  registerWidth the register width (must be at least 1 and at
201 |      *         most 9)
202 |      * @param  registerCountLog2 the log-base-2 of the register count (must
203 |      *         be at least 0 and at most 31)
204 |      * @return the packed parameters byte
205 |      */
206 |     public static byte packParametersByte(final int registerWidth, final int registerCountLog2) {
207 |         final int widthBits = ((registerWidth - 1) & REGISTER_WIDTH_MASK);
208 |         final int countBits = (registerCountLog2 & LOG2_REGISTER_COUNT_MASK);
209 |         return (byte)((widthBits << LOG2_REGISTER_COUNT_BITS) | countBits);
210 |     }
211 | 
212 |     /**
213 |      * Extracts the 'sparse-enabled' boolean from the cutoff byte of a serialized
214 |      * HLL.
215 |      *
216 |      * @param  cutoffByte the cutoff byte of the serialized HLL
217 |      * @return the 'sparse-enabled' boolean
218 |      */
219 |     public static boolean sparseEnabled(final byte cutoffByte) {
220 |         return ((cutoffByte >>> EXPLICIT_CUTOFF_BITS) & 1) == 1;
221 |     }
222 | 
223 |     /**
224 |      * Extracts the explicit cutoff value from the cutoff byte of a serialized
225 |      * HLL.
226 |      *
227 |      * @param  cutoffByte the cutoff byte of the serialized HLL
228 |      * @return the explicit cutoff value
229 |      */
230 |     public static int explicitCutoff(final byte cutoffByte) {
231 |         return (cutoffByte & EXPLICIT_CUTOFF_MASK);
232 |     }
233 | 
234 |     /**
235 |      * Extracts the schema version from the version byte of a serialized
236 |      * HLL.
237 |      *
238 |      * @param  versionByte the version byte of the serialized HLL
239 |      * @return the schema version of the serialized HLL
240 |      */
241 |     public static int schemaVersion(final byte versionByte) {
242 |         return NIBBLE_MASK & (versionByte >>> NIBBLE_BITS);
243 |     }
244 | 
245 |     /**
246 |      * Extracts the type ordinal from the version byte of a serialized HLL.
247 |      *
248 |      * @param  versionByte the version byte of the serialized HLL
249 |      * @return the type ordinal of the serialized HLL
250 |      */
251 |     public static int typeOrdinal(final byte versionByte) {
252 |         return (versionByte & NIBBLE_MASK);
253 |     }
254 | 
255 |     /**
256 |      * Extracts the register width from the parameters byte of a serialized
257 |      * {@link HLLType#FULL} HLL.
258 |      *
259 |      * @param  parametersByte the parameters byte of the serialized HLL
260 |      * @return the register width of the serialized HLL
261 |      *
262 |      * @see #packParametersByte(int, int)
263 |      */
264 |     public static int registerWidth(final byte parametersByte) {
265 |         return ((parametersByte >>> LOG2_REGISTER_COUNT_BITS) & REGISTER_WIDTH_MASK) + 1;
266 |     }
267 | 
268 |     /**
269 |      * Extracts the log2(registerCount) from the parameters byte of a
270 |      * serialized {@link HLLType#FULL} HLL.
271 |      *
272 |      * @param  parametersByte the parameters byte of the serialized HLL
273 |      * @return log2(registerCount) of the serialized HLL
274 |      *
275 |      * @see #packParametersByte(int, int)
276 |      */
277 |     public static int registerCountLog2(final byte parametersByte) {
278 |         return (parametersByte & LOG2_REGISTER_COUNT_MASK);
279 |     }
280 | }
281 | 


--------------------------------------------------------------------------------
/src/main/java/net/agkn/hll/util/BitUtil.java:
--------------------------------------------------------------------------------
 1 | package net.agkn.hll.util;
 2 | 
 3 | /*
 4 |  * Copyright 2013 Aggregate Knowledge, Inc.
 5 |  *
 6 |  * Licensed under the Apache License, Version 2.0 (the "License");
 7 |  * you may not use this file except in compliance with the License.
 8 |  * You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | 
19 | /**
20 |  * A collection of bit utilities.
21 |  *
22 |  * @author rgrzywinski
23 |  */
24 | public class BitUtil {
25 |     /**
26 |      * The set of least-significant bits for a given <code>byte</code>.  <code>-1</code>
27 |      * is used if no bits are set (so as to not be confused with "index of zero"
28 |      * meaning that the least significant bit is the 0th (1st) bit).
29 |      *
30 |      * @see #leastSignificantBit(long)
31 |      */
32 |     private static final int[] LEAST_SIGNIFICANT_BIT = {
33 |        -1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
34 |         4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
35 |         5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
36 |         4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
37 |         6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
38 |         4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
39 |         5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
40 |         4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
41 |         7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
42 |         4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
43 |         5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
44 |         4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
45 |         6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
46 |         4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
47 |         5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
48 |         4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
49 |     };
50 | 
51 |     /**
52 |      * Computes the least-significant bit of the specified <code>long</code>
53 |      * that is set to <code>1</code>. Zero-indexed.
54 |      *
55 |      * @param  value the <code>long</code> whose least-significant bit is desired.
56 |      * @return the least-significant bit of the specified <code>long</code>.
57 |      *         <code>-1</code> is returned if there are no bits set.
58 |      */
59 |     // REF:  http://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set
60 |     // REF:  http://www-graphics.stanford.edu/~seander/bithacks.html
61 |     public static int leastSignificantBit(final long value) {
62 |         if(value == 0L) return -1/*by contract*/;
63 |         if((value & 0xFFL) != 0) return LEAST_SIGNIFICANT_BIT[(int)( (value >>>  0) & 0xFF)] +  0;
64 |         if((value & 0xFFFFL) != 0) return LEAST_SIGNIFICANT_BIT[(int)( (value >>>  8) & 0xFF)] +  8;
65 |         if((value & 0xFFFFFFL) != 0) return LEAST_SIGNIFICANT_BIT[(int)( (value >>> 16) & 0xFF)] + 16;
66 |         if((value & 0xFFFFFFFFL) != 0) return LEAST_SIGNIFICANT_BIT[(int)( (value >>> 24) & 0xFF)] + 24;
67 |         if((value & 0xFFFFFFFFFFL) != 0) return LEAST_SIGNIFICANT_BIT[(int)( (value >>> 32) & 0xFF)] + 32;
68 |         if((value & 0xFFFFFFFFFFFFL) != 0) return LEAST_SIGNIFICANT_BIT[(int)( (value >>> 40) & 0xFF)] + 40;
69 |         if((value & 0xFFFFFFFFFFFFFFL) != 0) return LEAST_SIGNIFICANT_BIT[(int)( (value >>> 48) & 0xFF)] + 48;
70 |         return LEAST_SIGNIFICANT_BIT[(int)( (value >>> 56) & 0xFFL)] + 56;
71 |     }
72 | }


--------------------------------------------------------------------------------
/src/main/java/net/agkn/hll/util/BitVector.java:
--------------------------------------------------------------------------------
  1 | package net.agkn.hll.util;
  2 | 
  3 | /*
  4 |  * Copyright 2013 Aggregate Knowledge, Inc.
  5 |  *
  6 |  * Licensed under the Apache License, Version 2.0 (the "License");
  7 |  * you may not use this file except in compliance with the License.
  8 |  * You may obtain a copy of the License at
  9 |  *
 10 |  *     http://www.apache.org/licenses/LICENSE-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS,
 14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 |  * See the License for the specific language governing permissions and
 16 |  * limitations under the License.
 17 |  */
 18 | 
 19 | import net.agkn.hll.serialization.IWordSerializer;
 20 | 
 21 | /**
 22 |  * A vector (array) of bits that is accessed in units ("registers") of <code>width</code>
 23 |  * bits which are stored as 64bit "words" (<code>long</code>s).  In this context
 24 |  * a register is at most 64bits.
 25 |  *
 26 |  * @author rgrzywinski
 27 |  */
 28 | public class BitVector implements Cloneable {
 29 |     // NOTE:  in this context, a word is 64bits
 30 | 
 31 |     // rather than doing division to determine how a bit index fits into 64bit
 32 |     // words (i.e. longs), bit shifting is used
 33 |     private static final int LOG2_BITS_PER_WORD = 6/*=>64bits*/;
 34 |     private static final int BITS_PER_WORD = 1 << LOG2_BITS_PER_WORD;
 35 |     private static final int BITS_PER_WORD_MASK = BITS_PER_WORD - 1;
 36 | 
 37 |     // ditto from above but for bytes (for output)
 38 |     private static final int LOG2_BITS_PER_BYTE = 3/*=>8bits*/;
 39 |     public static final int BITS_PER_BYTE = 1 << LOG2_BITS_PER_BYTE;
 40 | 
 41 |     // ========================================================================
 42 |     public static final int BYTES_PER_WORD = 8/*8 bytes in a long*/;
 43 | 
 44 |     // ************************************************************************
 45 |     // 64bit words
 46 |     private final long[] words;
 47 |     public final long[] words() { return words; }
 48 |     public final int wordCount() { return words.length; }
 49 |     public final int byteCount() { return wordCount() * BYTES_PER_WORD; }
 50 | 
 51 |     // the width of a register in bits (this cannot be more than 64 (the word size))
 52 |     private final int registerWidth;
 53 |     public final int registerWidth() { return registerWidth; }
 54 | 
 55 |     private final long count;
 56 | 
 57 |     // ------------------------------------------------------------------------
 58 |     private final long registerMask;
 59 | 
 60 |     // ========================================================================
 61 |     /**
 62 |      * @param  width the width of each register.  This cannot be negative or
 63 |      *         zero or greater than 63 (the signed word size).
 64 |      * @param  count the number of registers.  This cannot be negative or zero
 65 |      */
 66 |     public BitVector(final int width, final long count) {
 67 |         // ceil((width * count)/BITS_PER_WORD)
 68 |         this.words = new long[(int)(((width * count) + BITS_PER_WORD_MASK) >>> LOG2_BITS_PER_WORD)];
 69 |         this.registerWidth = width;
 70 |         this.count = count;
 71 | 
 72 |         this.registerMask = (1L << width) - 1;
 73 |     }
 74 | 
 75 |     // ========================================================================
 76 |     /**
 77 |      * @param  registerIndex the index of the register whose value is to be
 78 |      *         retrieved.  This cannot be negative.
 79 |      * @return the value at the specified register index
 80 |      * @see #setRegister(long, long)
 81 |      * @see #setMaxRegister(long, long)
 82 |      */
 83 |     // NOTE:  if this changes then setMaxRegister() must change
 84 |     public long getRegister(final long registerIndex) {
 85 |         final long bitIndex = registerIndex * registerWidth;
 86 |         final int firstWordIndex = (int)(bitIndex >>> LOG2_BITS_PER_WORD)/*aka (bitIndex / BITS_PER_WORD)*/;
 87 |         final int secondWordIndex = (int)((bitIndex + registerWidth - 1) >>> LOG2_BITS_PER_WORD)/*see above*/;
 88 |         final int bitRemainder = (int)(bitIndex & BITS_PER_WORD_MASK)/*aka (bitIndex % BITS_PER_WORD)*/;
 89 | 
 90 |         if(firstWordIndex == secondWordIndex)
 91 |             return ((words[firstWordIndex] >>> bitRemainder) & registerMask);
 92 |         /* else -- register spans words */
 93 |         return (words[firstWordIndex] >>> bitRemainder)/*no need to mask since at top of word*/
 94 |              | (words[secondWordIndex] << (BITS_PER_WORD - bitRemainder)) & registerMask;
 95 |     }
 96 | 
 97 |     /**
 98 |      * @param registerIndex the index of the register whose value is to be set.
 99 |      *        This cannot be negative
100 |      * @param value the value to set in the register
101 |      * @see #getRegister(long)
102 |      * @see #setMaxRegister(long, long)
103 |      */
104 |     // NOTE:  if this changes then setMaxRegister() must change
105 |     public void setRegister(final long registerIndex, final long value) {
106 |         final long bitIndex = registerIndex * registerWidth;
107 |         final int firstWordIndex = (int)(bitIndex >>> LOG2_BITS_PER_WORD)/*aka (bitIndex / BITS_PER_WORD)*/;
108 |         final int secondWordIndex = (int)((bitIndex + registerWidth - 1) >>> LOG2_BITS_PER_WORD)/*see above*/;
109 |         final int bitRemainder = (int)(bitIndex & BITS_PER_WORD_MASK)/*aka (bitIndex % BITS_PER_WORD)*/;
110 | 
111 |         final long words[] = this.words/*for convenience/performance*/;
112 |         if(firstWordIndex == secondWordIndex) {
113 |             // clear then set
114 |             words[firstWordIndex] &= ~(registerMask << bitRemainder);
115 |             words[firstWordIndex] |= (value << bitRemainder);
116 |         } else {/*register spans words*/
117 |             // clear then set each partial word
118 |             words[firstWordIndex] &= (1L << bitRemainder) - 1;
119 |             words[firstWordIndex] |= (value << bitRemainder);
120 | 
121 |             words[secondWordIndex] &= ~(registerMask >>> (BITS_PER_WORD - bitRemainder));
122 |             words[secondWordIndex] |= (value >>> (BITS_PER_WORD - bitRemainder));
123 |         }
124 |     }
125 | 
126 |     // ------------------------------------------------------------------------
127 |     /**
128 |      * @return a <code>LongIterator</code> for iterating starting at the register
129 |      *         with index zero. This will never be <code>null</code>.
130 |      */
131 |     public LongIterator registerIterator() {
132 |         return new LongIterator() {
133 |             final int registerWidth = BitVector.this.registerWidth;
134 |             final long[] words = BitVector.this.words;
135 |             final long registerMask = BitVector.this.registerMask;
136 | 
137 |             // register setup
138 |             long registerIndex = 0;
139 |             int wordIndex = 0;
140 |             int remainingWordBits = BITS_PER_WORD;
141 |             long word = words[wordIndex];
142 | 
143 |             @Override public long next() {
144 |                 long register;
145 |                 if(remainingWordBits >= registerWidth) {
146 |                     register = word & registerMask;
147 | 
148 |                     // shift to the next register
149 |                     word >>>= registerWidth;
150 |                     remainingWordBits -= registerWidth;
151 |                 } else { /*insufficient bits remaining in current word*/
152 |                     wordIndex++/*move to the next word*/;
153 | 
154 |                     register = (word | (words[wordIndex] << remainingWordBits)) & registerMask;
155 | 
156 |                     // shift to the next partial register (word)
157 |                     word = words[wordIndex] >>> (registerWidth - remainingWordBits);
158 |                     remainingWordBits += BITS_PER_WORD - registerWidth;
159 |                 }
160 |                 registerIndex++;
161 |                 return register;
162 |             }
163 | 
164 |             @Override public boolean hasNext() {
165 |                 return registerIndex < count;
166 |             }
167 |         };
168 |     }
169 | 
170 |     // ------------------------------------------------------------------------
171 |     // composite accessors
172 |     /**
173 |      * Sets the value of the specified index register if and only if the specified
174 |      * value is greater than the current value in the register.  This is equivalent
175 |      * to but much more performant than:<p/>
176 |      *
177 |      * <pre>vector.setRegister(index, Math.max(vector.getRegister(index), value));</pre>
178 |      *
179 |      * @param  registerIndex the index of the register whose value is to be set.
180 |      *         This cannot be negative
181 |      * @param  value the value to set in the register if and only if this value
182 |      *         is greater than the current value in the register
183 |      * @return <code>true</code> if and only if the specified value is greater
184 |      *         than or equal to the current register value.  <code>false</code>
185 |      *         otherwise.
186 |      * @see #getRegister(long)
187 |      * @see #setRegister(long, long)
188 |      * @see java.lang.Math#max(long, long)
189 |      */
190 |     // NOTE:  if this changes then setRegister() must change
191 |     public boolean setMaxRegister(final long registerIndex, final long value) {
192 |         final long bitIndex = registerIndex * registerWidth;
193 |         final int firstWordIndex = (int)(bitIndex >>> LOG2_BITS_PER_WORD)/*aka (bitIndex / BITS_PER_WORD)*/;
194 |         final int secondWordIndex = (int)((bitIndex + registerWidth - 1) >>> LOG2_BITS_PER_WORD)/*see above*/;
195 |         final int bitRemainder = (int)(bitIndex & BITS_PER_WORD_MASK)/*aka (bitIndex % BITS_PER_WORD)*/;
196 | 
197 |         // NOTE:  matches getRegister()
198 |         final long registerValue;
199 |         final long words[] = this.words/*for convenience/performance*/;
200 |         if(firstWordIndex == secondWordIndex)
201 |             registerValue = ((words[firstWordIndex] >>> bitRemainder) & registerMask);
202 |         else /*register spans words*/
203 |             registerValue = (words[firstWordIndex] >>> bitRemainder)/*no need to mask since at top of word*/
204 |                           | (words[secondWordIndex] << (BITS_PER_WORD - bitRemainder)) & registerMask;
205 | 
206 |         // determine which is the larger and update as necessary
207 |         if(value > registerValue) {
208 |             // NOTE:  matches setRegister()
209 |             if(firstWordIndex == secondWordIndex) {
210 |                 // clear then set
211 |                 words[firstWordIndex] &= ~(registerMask << bitRemainder);
212 |                 words[firstWordIndex] |= (value << bitRemainder);
213 |             } else {/*register spans words*/
214 |                 // clear then set each partial word
215 |                 words[firstWordIndex] &= (1L << bitRemainder) - 1;
216 |                 words[firstWordIndex] |= (value << bitRemainder);
217 | 
218 |                 words[secondWordIndex] &= ~(registerMask >>> (BITS_PER_WORD - bitRemainder));
219 |                 words[secondWordIndex] |= (value >>> (BITS_PER_WORD - bitRemainder));
220 |             }
221 |         } /* else -- the register value is greater (or equal) so nothing needs to be done */
222 | 
223 |         return (value >= registerValue);
224 |     }
225 | 
226 |     // ========================================================================
227 |     /**
228 |      * Fills this bit vector with the specified bit value.  This can be used to
229 |      * clear the vector by specifying <code>0</code>.
230 |      *
231 |      * @param  value the value to set all bits to (only the lowest bit is used)
232 |      */
233 |     public void fill(final long value) {
234 |         for(long i=0; i<count; i++) {
235 |             setRegister(i, value);
236 |         }
237 |     }
238 | 
239 |     // ------------------------------------------------------------------------
240 |     /**
241 |      * Serializes the registers of the vector using the specified serializer.
242 |      *
243 |      * @param serializer the serializer to use. This cannot be <code>null</code>.
244 |      */
245 |     public void getRegisterContents(final IWordSerializer serializer) {
246 |         for(final LongIterator iter = registerIterator(); iter.hasNext();) {
247 |             serializer.writeWord(iter.next());
248 |         }
249 |     }
250 | 
251 |     /**
252 |      * Creates a deep copy of this vector.
253 |      *
254 |      * @see java.lang.Object#clone()
255 |      */
256 |     @Override
257 |     public BitVector clone() {
258 |         final BitVector copy = new BitVector(registerWidth, count);
259 |         System.arraycopy(words, 0, copy.words, 0, words.length);
260 |         return copy;
261 |     }
262 | }


--------------------------------------------------------------------------------
/src/main/java/net/agkn/hll/util/HLLUtil.java:
--------------------------------------------------------------------------------
  1 | package net.agkn.hll.util;
  2 | 
  3 | /*
  4 |  * Copyright 2013 Aggregate Knowledge, Inc.
  5 |  *
  6 |  * Licensed under the Apache License, Version 2.0 (the "License");
  7 |  * you may not use this file except in compliance with the License.
  8 |  * You may obtain a copy of the License at
  9 |  *
 10 |  *     http://www.apache.org/licenses/LICENSE-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS,
 14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 |  * See the License for the specific language governing permissions and
 16 |  * limitations under the License.
 17 |  */
 18 | 
 19 | import net.agkn.hll.HLL;
 20 | 
 21 | /**
 22 |  * Static functions for computing constants and parameters used in the HLL
 23 |  * algorithm.
 24 |  *
 25 |  * @author timon
 26 |  */
 27 | public final class HLLUtil {
 28 |     /**
 29 |      * Precomputed <code>pwMaxMask</code> values indexed by <code>registerSizeInBits</code>.
 30 |      * Calculated with this formula:
 31 |      * <pre>
 32 |      *     int maxRegisterValue = (1 << registerSizeInBits) - 1;
 33 |      *     // Mask with all bits set except for (maxRegisterValue - 1) least significant bits (see #addRaw())
 34 |      *     return ~((1L << (maxRegisterValue - 1)) - 1);
 35 |      * </pre>
 36 |      *
 37 |      * @see #pwMaxMask(int)
 38 |      */
 39 |     private static final long[] PW_MASK = {
 40 |             ~((1L << (((1 << 0) - 1) - 1)) - 1),
 41 |             ~((1L << (((1 << 1) - 1) - 1)) - 1),
 42 |             ~((1L << (((1 << 2) - 1) - 1)) - 1),
 43 |             ~((1L << (((1 << 3) - 1) - 1)) - 1),
 44 |             ~((1L << (((1 << 4) - 1) - 1)) - 1),
 45 |             ~((1L << (((1 << 5) - 1) - 1)) - 1),
 46 |             ~((1L << (((1 << 6) - 1) - 1)) - 1),
 47 |             ~((1L << (((1 << 7) - 1) - 1)) - 1),
 48 |             ~((1L << (((1 << 8) - 1) - 1)) - 1)
 49 |     };
 50 | 
 51 |     /**
 52 |      * Precomputed <code>twoToL</code> values indexed by a linear combination of
 53 |      * <code>regWidth</code> and <code>log2m</code>.
 54 |      *
 55 |      * The array is one-dimensional and can be accessed by using index
 56 |      * <code>(REG_WIDTH_INDEX_MULTIPLIER * regWidth) + log2m</code>
 57 |      * for <code>regWidth</code> and <code>log2m</code> between the specified
 58 |      * <code>HLL.{MINIMUM,MAXIMUM}_{REGWIDTH,LOG2M}_PARAM</code> constants.
 59 |      *
 60 |      * @see #largeEstimator(int, int, double)
 61 |      * @see #largeEstimatorCutoff(int, int)
 62 |      * @see <a href='http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll/'>Blog post with section on 2^L</a>
 63 |      */
 64 |     private static final double[] TWO_TO_L = new double[(HLL.MAXIMUM_REGWIDTH_PARAM + 1) * (HLL.MAXIMUM_LOG2M_PARAM + 1)];
 65 | 
 66 |     /**
 67 |      * Spacing constant used to compute offsets into {@link TWO_TO_L}.
 68 |      */
 69 |     private static final int REG_WIDTH_INDEX_MULTIPLIER = HLL.MAXIMUM_LOG2M_PARAM + 1;
 70 | 
 71 |     static {
 72 |         for(int regWidth = HLL.MINIMUM_REGWIDTH_PARAM; regWidth <= HLL.MAXIMUM_REGWIDTH_PARAM; regWidth++) {
 73 |             for(int log2m = HLL.MINIMUM_LOG2M_PARAM ; log2m <= HLL.MAXIMUM_LOG2M_PARAM; log2m++) {
 74 |                 int maxRegisterValue = (1 << regWidth) - 1;
 75 | 
 76 |                 // Since 1 is added to p(w) in the insertion algorithm, only
 77 |                 // (maxRegisterValue - 1) bits are inspected hence the hash
 78 |                 // space is one power of two smaller.
 79 |                 final int pwBits = (maxRegisterValue - 1);
 80 |                 final int totalBits = (pwBits + log2m);
 81 |                 final double twoToL = Math.pow(2, totalBits);
 82 |                 TWO_TO_L[(REG_WIDTH_INDEX_MULTIPLIER * regWidth) + log2m] = twoToL;
 83 |             }
 84 |         }
 85 |     }
 86 | 
 87 |     // ************************************************************************
 88 |     /**
 89 |      * Computes the bit-width of HLL registers necessary to estimate a set of
 90 |      * the specified cardinality.
 91 |      *
 92 |      * @param  expectedUniqueElements an upper bound on the number of unique
 93 |      *         elements that are expected.  This must be greater than zero.
 94 |      * @return a register size in bits (i.e. <code>log2(log2(n))</code>)
 95 |      */
 96 |     public static int registerBitSize(final long expectedUniqueElements) {
 97 |         return Math.max(HLL.MINIMUM_REGWIDTH_PARAM,
 98 |                         (int)Math.ceil(NumberUtil.log2(NumberUtil.log2(expectedUniqueElements))));
 99 |     }
100 | 
101 |     // ========================================================================
102 |     /**
103 |      * Computes the 'alpha-m-squared' constant used by the HyperLogLog algorithm.
104 |      *
105 |      * @param  m this must be a power of two, cannot be less than
106 |      *         16 (2<sup>4</sup>), and cannot be greater than 65536 (2<sup>16</sup>).
107 |      * @return gamma times <code>registerCount</code> squared where gamma is
108 |      *         based on the value of <code>registerCount</code>.
109 |      * @throws IllegalArgumentException if <code>registerCount</code> is less
110 |      *         than 16.
111 |      */
112 |     public static double alphaMSquared(final int m) {
113 |         switch(m) {
114 |             case 1/*2^0*/:
115 |             case 2/*2^1*/:
116 |             case 4/*2^2*/:
117 |             case 8/*2^3*/:
118 |                 throw new IllegalArgumentException("'m' cannot be less than 16 (" + m + " < 16).");
119 | 
120 |             case 16/*2^4*/:
121 |                 return 0.673 * m * m;
122 | 
123 |             case 32/*2^5*/:
124 |                 return 0.697 * m * m;
125 | 
126 |             case 64/*2^6*/:
127 |                 return 0.709 * m * m;
128 | 
129 |             default/*>2^6*/:
130 |                 return (0.7213 / (1.0 + 1.079 / m)) * m * m;
131 |         }
132 |     }
133 | 
134 |     // ========================================================================
135 |     /**
136 |      * Computes a mask that prevents overflow of HyperLogLog registers.
137 |      *
138 |      * @param  registerSizeInBits the size of the HLL registers, in bits.
139 |      * @return mask a <code>long</code> mask to prevent overflow of the registers
140 |      * @see #registerBitSize(long)
141 |      */
142 |     public static long pwMaxMask(final int registerSizeInBits) {
143 |         return PW_MASK[registerSizeInBits];
144 |     }
145 | 
146 |     // ========================================================================
147 |     /**
148 |      * The cutoff for using the "small range correction" formula, in the
149 |      * HyperLogLog algorithm.
150 |      *
151 |      * @param  m the number of registers in the HLL. <em>m<em> in the paper.
152 |      * @return the cutoff for the small range correction.
153 |      * @see #smallEstimator(int, int)
154 |      */
155 |     public static double smallEstimatorCutoff(final int m) {
156 |         return ((double)m * 5) / 2;
157 |     }
158 | 
159 |     /**
160 |      * The "small range correction" formula from the HyperLogLog algorithm. Only
161 |      * appropriate if both the estimator is smaller than <pre>(5/2) * m</pre> and
162 |      * there are still registers that have the zero value.
163 |      *
164 |      * @param  m the number of registers in the HLL. <em>m<em> in the paper.
165 |      * @param  numberOfZeroes the number of registers with value zero. <em>V</em>
166 |      *         in the paper.
167 |      * @return a corrected cardinality estimate.
168 |      */
169 |     public static double smallEstimator(final int m, final int numberOfZeroes) {
170 |         return m * Math.log((double)m / numberOfZeroes);
171 |     }
172 | 
173 |     /**
174 |      * The cutoff for using the "large range correction" formula, from the
175 |      * HyperLogLog algorithm, adapted for 64 bit hashes.
176 |      *
177 |      * @param  log2m log-base-2 of the number of registers in the HLL. <em>b<em> in the paper.
178 |      * @param  registerSizeInBits the size of the HLL registers, in bits.
179 |      * @return the cutoff for the large range correction.
180 |      * @see #largeEstimator(int, int, double)
181 |      * @see <a href='http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll/'>Blog post with section on 64 bit hashes and "large range correction" cutoff</a>
182 |      */
183 |     public static double largeEstimatorCutoff(final int log2m, final int registerSizeInBits) {
184 |         return (TWO_TO_L[(REG_WIDTH_INDEX_MULTIPLIER * registerSizeInBits) + log2m]) / 30.0;
185 |     }
186 | 
187 |     /**
188 |      * The "large range correction" formula from the HyperLogLog algorithm, adapted
189 |      * for 64 bit hashes. Only appropriate for estimators whose value exceeds
190 |      * the return of {@link #largeEstimatorCutoff(int, int)}.
191 |      *
192 |      * @param  log2m log-base-2 of the number of registers in the HLL. <em>b<em> in the paper.
193 |      * @param  registerSizeInBits the size of the HLL registers, in bits.
194 |      * @param  estimator the original estimator ("E" in the paper).
195 |      * @return a corrected cardinality estimate.
196 |      * @see <a href='http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll/'>Blog post with section on 64 bit hashes and "large range correction"</a>
197 |      */
198 |     public static double largeEstimator(final int log2m, final int registerSizeInBits, final double estimator) {
199 |         final double twoToL = TWO_TO_L[(REG_WIDTH_INDEX_MULTIPLIER * registerSizeInBits) + log2m];
200 |         return -1 * twoToL * Math.log(1.0 - (estimator/twoToL));
201 |     }
202 | }
203 | 


--------------------------------------------------------------------------------
/src/main/java/net/agkn/hll/util/LongIterator.java:
--------------------------------------------------------------------------------
 1 | package net.agkn.hll.util;
 2 | 
 3 | /*
 4 |  * Copyright 2013 Aggregate Knowledge, Inc.
 5 |  *
 6 |  * Licensed under the Apache License, Version 2.0 (the "License");
 7 |  * you may not use this file except in compliance with the License.
 8 |  * You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | 
19 | /**
20 |  * A <code>long</code>-based iterator.  This is not <i>is-a</i> {@link java.util.Iterator}
21 |  * to prevent autoboxing between <code>Long</code> and <code>long</code>.
22 |  *
23 |  * @author rgrzywinski
24 |  */
25 | public interface LongIterator {
26 |     /**
27 |      * @return <code>true</code> if and only if there are more elements to
28 |      *         iterate over.  <code>false</code> otherwise.
29 |      */
30 |     boolean hasNext();
31 | 
32 |     /**
33 |      * @return the next <code>long</code> in the collection.
34 |      */
35 |     long next();
36 | }


--------------------------------------------------------------------------------
/src/main/java/net/agkn/hll/util/NumberUtil.java:
--------------------------------------------------------------------------------
  1 | package net.agkn.hll.util;
  2 | 
  3 | /*
  4 |  * Copyright 2013 Aggregate Knowledge, Inc.
  5 |  *
  6 |  * Licensed under the Apache License, Version 2.0 (the "License");
  7 |  * you may not use this file except in compliance with the License.
  8 |  * You may obtain a copy of the License at
  9 |  *
 10 |  *     http://www.apache.org/licenses/LICENSE-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS,
 14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 |  * See the License for the specific language governing permissions and
 16 |  * limitations under the License.
 17 |  */
 18 | 
 19 | /**
 20 |  * A collection of utilities to work with numbers.
 21 |  *
 22 |  * @author rgrzywinski
 23 |  */
 24 | public class NumberUtil {
 25 |     // loge(2) (log-base e of 2)
 26 |     public static final double LOGE_2 = 0.6931471805599453;
 27 | 
 28 |     // ************************************************************************
 29 |     /**
 30 |      * Computes the <code>log2</code> (log-base-two) of the specified value.
 31 |      *
 32 |      * @param  value the <code>double</code> for which the <code>log2</code> is
 33 |      *         desired.
 34 |      * @return the <code>log2</code> of the specified value
 35 |      */
 36 |     public static double log2(final double value) {
 37 |         // REF:  http://en.wikipedia.org/wiki/Logarithmic_scale (conversion of bases)
 38 |         return Math.log(value) / LOGE_2;
 39 |     }
 40 | 
 41 |     // ========================================================================
 42 |     // the hex characters
 43 |     private static final char[] HEX = { '0', '1', '2', '3', '4', '5', '6', '7',
 44 |                                         '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };
 45 | 
 46 |     // ------------------------------------------------------------------------
 47 |     /**
 48 |      * Converts the specified array of <code>byte</code>s into a string of
 49 |      * hex characters (low <code>byte</code> first).
 50 |      *
 51 |      * @param  bytes the array of <code>byte</code>s that are to be converted.
 52 |      *         This cannot be <code>null</code> though it may be empty.
 53 |      * @param  offset the offset in <code>bytes</code> at which the bytes will
 54 |      *         be taken.  This cannot be negative and must be less than
 55 |      *         <code>bytes.length - 1</code>.
 56 |      * @param  count the number of bytes to be retrieved from the specified array.
 57 |      *         This cannot be negative.  If greater than <code>bytes.length - offset</code>
 58 |      *         then that value is used.
 59 |      * @return a string of at most <code>count</code> characters that represents
 60 |      *         the specified byte array in hex.  This will never be <code>null</code>
 61 |      *         though it may be empty if <code>bytes</code> is empty or <code>count</code>
 62 |      *         is zero.
 63 |      * @throws IllegalArgumentException if <code>offset</code> is greater than
 64 |      *         or equal to <code>bytes.length</code>.
 65 |      * @see #fromHex(String, int, int)
 66 |      */
 67 |     public static String toHex(final byte[] bytes, final int offset, final int count) {
 68 |         if(offset >= bytes.length) throw new IllegalArgumentException("Offset is greater than the length (" + offset + " >= " + bytes.length + ").")/*by contract*/;
 69 |         final int byteCount = Math.min( (bytes.length - offset), count);
 70 |         final int upperBound = byteCount + offset;
 71 | 
 72 |         final char[] chars = new char[byteCount * 2/*two chars per byte*/];
 73 |         int charIndex = 0;
 74 |         for(int i=offset; i<upperBound; i++) {
 75 |             final byte value = bytes[i];
 76 |             chars[charIndex++] = HEX[(value >>> 4) & 0x0F];
 77 |             chars[charIndex++] = HEX[value & 0x0F];
 78 |         }
 79 | 
 80 |         return new String(chars);
 81 |     }
 82 | 
 83 |     /**
 84 |      * Converts the specified array of hex characters into an array of <code>byte</code>s
 85 |      * (low <code>byte</code> first).
 86 |      *
 87 |      * @param  string the string of hex characters to be converted into <code>byte</code>s.
 88 |      *         This cannot be <code>null</code> though it may be blank.
 89 |      * @param  offset the offset in the string at which the characters will be
 90 |      *         taken.  This cannot be negative and must be less than <code>string.length() - 1</code>.
 91 |      * @param  count the number of characters to be retrieved from the specified
 92 |      *         string.  This cannot be negative and must be divisible by two
 93 |      *         (since there are two characters per <code>byte</code>).
 94 |      * @return the array of <code>byte</code>s that were converted from the
 95 |      *         specified string (in the specified range).  This will never be
 96 |      *         <code>null</code> though it may be empty if <code>string</code>
 97 |      *         is empty or <code>count</code> is zero.
 98 |      * @throws IllegalArgumentException if <code>offset</code> is greater than
 99 |      *         or equal to <code>string.length()</code> or if <code>count</code>
100 |      *         is not divisible by two.
101 |      * @see #toHex(byte[], int, int)
102 |      */
103 |     public static byte[] fromHex(final String string, final int offset, final int count) {
104 |         if(offset >= string.length()) throw new IllegalArgumentException("Offset is greater than the length (" + offset + " >= " + string.length() + ").")/*by contract*/;
105 |         if( (count & 0x01) != 0) throw new IllegalArgumentException("Count is not divisible by two (" + count + ").")/*by contract*/;
106 |         final int charCount = Math.min((string.length() - offset), count);
107 |         final int upperBound = offset + charCount;
108 | 
109 |         final byte[] bytes = new byte[charCount >>> 1/*aka /2*/];
110 |         int byteIndex = 0/*beginning*/;
111 |         for(int i=offset; i<upperBound; i+=2) {
112 |             bytes[byteIndex++] = (byte)(( (digit(string.charAt(i)) << 4)
113 |                                          | digit(string.charAt(i + 1))) & 0xFF);
114 |         }
115 | 
116 |         return bytes;
117 |     }
118 | 
119 |     // ------------------------------------------------------------------------
120 |     /**
121 |      * @param  character a hex character to be converted to a <code>byte</code>.
122 |      *         This cannot be a character other than [a-fA-F0-9].
123 |      * @return the value of the specified character.  This will be a value <code>0</code>
124 |      *         through <code>15</code>.
125 |      * @throws IllegalArgumentException if the specified character is not in
126 |      *         [a-fA-F0-9]
127 |      */
128 |     private static final int digit(final char character) {
129 |         switch(character) {
130 |             case '0':
131 |                 return 0;
132 |             case '1':
133 |                 return 1;
134 |             case '2':
135 |                 return 2;
136 |             case '3':
137 |                 return 3;
138 |             case '4':
139 |                 return 4;
140 |             case '5':
141 |                 return 5;
142 |             case '6':
143 |                 return 6;
144 |             case '7':
145 |                 return 7;
146 |             case '8':
147 |                 return 8;
148 |             case '9':
149 |                 return 9;
150 |             case 'a':
151 |             case 'A':
152 |                 return 10;
153 |             case 'b':
154 |             case 'B':
155 |                 return 11;
156 |             case 'c':
157 |             case 'C':
158 |                 return 12;
159 |             case 'd':
160 |             case 'D':
161 |                 return 13;
162 |             case 'e':
163 |             case 'E':
164 |                 return 14;
165 |             case 'f':
166 |             case 'F':
167 |                 return 15;
168 | 
169 |             default:
170 |                 throw new IllegalArgumentException("Character is not in [a-fA-F0-9] ('" + character + "').");
171 |         }
172 |     }
173 | }


--------------------------------------------------------------------------------
/src/test/java/net/agkn/hll/ExplicitHLLTest.java:
--------------------------------------------------------------------------------
  1 | package net.agkn.hll;
  2 | 
  3 | /*
  4 |  * Copyright 2013 Aggregate Knowledge, Inc.
  5 |  *
  6 |  * Licensed under the Apache License, Version 2.0 (the "License");
  7 |  * you may not use this file except in compliance with the License.
  8 |  * You may obtain a copy of the License at
  9 |  *
 10 |  *     http://www.apache.org/licenses/LICENSE-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS,
 14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 |  * See the License for the specific language governing permissions and
 16 |  * limitations under the License.
 17 |  */
 18 | 
 19 | import static org.powermock.reflect.Whitebox.getInternalState;
 20 | import static org.testng.Assert.assertEquals;
 21 | import static org.testng.Assert.assertTrue;
 22 | import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
 23 | 
 24 | import java.util.HashSet;
 25 | import java.util.Random;
 26 | 
 27 | import net.agkn.hll.serialization.ISchemaVersion;
 28 | import net.agkn.hll.serialization.SerializationUtil;
 29 | import org.testng.annotations.Test;
 30 | 
 31 | /**
 32 |  * Tests {@link HLL} of type {@link HLLType#EXPLICIT}.
 33 |  *
 34 |  * @author timon
 35 |  */
 36 | public class ExplicitHLLTest {
 37 |     /**
 38 |      * Tests basic set semantics of {@link HLL#addRaw(long)}.
 39 |      */
 40 |     @Test
 41 |     public void addBasicTest() {
 42 |         { // Adding a single positive value to an empty set should work.
 43 |             final HLL hll = newHLL(128/*arbitrary*/);
 44 |             hll.addRaw(1L/*positive*/);
 45 |             assertEquals(hll.cardinality(), 1L);
 46 |         }
 47 |         { // Adding a single negative value to an empty set should work.
 48 |             final HLL hll = newHLL(128/*arbitrary*/);
 49 |             hll.addRaw(-1L/*negative*/);
 50 |             assertEquals(hll.cardinality(), 1L);
 51 |         }
 52 |         { // Adding a duplicate value to a set should be a no-op.
 53 |             final HLL hll = newHLL(128/*arbitrary*/);
 54 |             hll.addRaw(1L/*positive*/);
 55 |             assertEquals(hll.cardinality(), 1L/*arbitrary*/);
 56 |             assertEquals(hll.cardinality(), 1L/*dupe*/);
 57 |         }
 58 |     }
 59 | 
 60 |     // ------------------------------------------------------------------------
 61 |     /**
 62 |      * Tests {@link HLL#union(HLL)}.
 63 |      */
 64 |     @Test
 65 |     public void unionTest() {
 66 |         {// Unioning two distinct sets should work
 67 |             final HLL hllA = newHLL(128/*arbitrary*/);
 68 |             final HLL hllB = newHLL(128/*arbitrary*/);
 69 |             hllA.addRaw(1L);
 70 |             hllA.addRaw(2L);
 71 |             hllB.addRaw(3L);
 72 | 
 73 |             hllA.union(hllB);
 74 |             assertEquals(hllA.cardinality(), 3);
 75 |         }
 76 |         {// Unioning two sets whose union doesn't exceed the cardinality cap should not promote
 77 |             final HLL hllA = newHLL(128/*arbitrary*/);
 78 |             final HLL hllB = newHLL(128/*arbitrary*/);
 79 |             hllA.addRaw(1L);
 80 |             hllA.addRaw(2L);
 81 |             hllB.addRaw(1L);
 82 | 
 83 |             hllA.union(hllB);
 84 |             assertEquals(hllA.cardinality(), 2);
 85 |         }
 86 |         {// unioning two sets whose union exceeds the cardinality cap should promote
 87 |             final HLL hllA = newHLL(128/*arbitrary*/);
 88 |             final HLL hllB = newHLL(128/*arbitrary*/);
 89 | 
 90 |             // fill up sets to explicitThreshold
 91 |             for(long i=0; i<128/*explicitThreshold*/; i++) {
 92 |                 hllA.addRaw(i);
 93 |                 hllB.addRaw(i + 128);
 94 |             }
 95 | 
 96 |             hllA.union(hllB);
 97 |             assertEquals(hllA.getType(), HLLType.SPARSE);
 98 |         }
 99 |     }
100 | 
101 |     // ------------------------------------------------------------------------
102 |     /**
103 |      * Tests {@link HLL#clear()}
104 |      */
105 |     @Test
106 |     public void clearTest() {
107 |         final HLL hll = newHLL(128/*arbitrary*/);
108 |         hll.addRaw(1L);
109 |         assertEquals(hll.cardinality(), 1L);
110 |         hll.clear();
111 |         assertEquals(hll.cardinality(), 0L);
112 |     }
113 | 
114 |     // ------------------------------------------------------------------------
115 |     /**
116 |      * Tests {@link LongSetSlab#toBytes(int, ISchemaVersion)} and
117 |      * {@link LongSetSlab#fromBytes(int, byte[], ISchemaVersion)}.
118 |      */
119 |     @Test
120 |     public void toFromBytesTest() {
121 |         final ISchemaVersion schemaVersion = SerializationUtil.DEFAULT_SCHEMA_VERSION;
122 |         final HLLType type = HLLType.EXPLICIT;
123 |         final int padding = schemaVersion.paddingBytes(type);
124 |         final int bytesPerWord = 8;
125 | 
126 |         {// Should work on an empty set
127 |             final HLL hll = newHLL(128/*arbitrary*/);
128 | 
129 |             final byte[] bytes = hll.toBytes(schemaVersion);
130 | 
131 |             // assert output has correct byte length
132 |             assertEquals(bytes.length, padding/*no elements, just padding*/);
133 | 
134 |             final HLL inHLL = HLL.fromBytes(bytes);
135 | 
136 |             assertElementsEqual(hll, inHLL);
137 |         }
138 |         {// Should work on a partially filled set
139 |             final HLL hll = newHLL(128/*arbitrary*/);
140 | 
141 |             for(int i=0; i<3; i++) {
142 |                 hll.addRaw(i);
143 |             }
144 | 
145 |             final byte[] bytes = hll.toBytes(schemaVersion);
146 | 
147 |             // assert output has correct byte length
148 |             assertEquals(bytes.length, padding + (bytesPerWord * 3/*elements*/));
149 | 
150 |             final HLL inHLL = HLL.fromBytes(bytes);
151 | 
152 |             assertElementsEqual(hll, inHLL);
153 |         }
154 |         {// Should work on a full set
155 |             final int explicitThreshold = 128;
156 |             final HLL hll = newHLL(explicitThreshold);
157 | 
158 |             for(int i=0; i<explicitThreshold; i++) {
159 |                 hll.addRaw(27 + i/*arbitrary*/);
160 |             }
161 | 
162 |             final byte[] bytes = hll.toBytes(schemaVersion);
163 | 
164 |             // assert output has correct byte length
165 |             assertEquals(bytes.length, padding + (bytesPerWord * explicitThreshold/*elements*/));
166 | 
167 |             final HLL inHLL = HLL.fromBytes(bytes);
168 | 
169 |             assertElementsEqual(hll, inHLL);
170 |         }
171 |     }
172 | 
173 |     // ------------------------------------------------------------------------
174 |     /**
175 |      * Tests correctness against {@link java.util.HashSet}.
176 |      */
177 |     @Test
178 |     public void randomValuesTest() {
179 |         final int explicitThreshold = 4096;
180 |         final HashSet<Long> canonical = new HashSet<Long>();
181 |         final HLL hll = newHLL(explicitThreshold);
182 | 
183 |         final long seed = 1L/*constant so results are reproducible*/;
184 |         final Random random = new Random(seed);
185 |         for(int i=0;i<explicitThreshold;i++){
186 |             long randomLong = random.nextLong();
187 |             canonical.add(new Long(randomLong));
188 |             hll.addRaw(randomLong);
189 |         }
190 |         final int canonicalCardinality = canonical.size();
191 |         assertEquals(hll.cardinality(), canonicalCardinality);
192 |     }
193 | 
194 |     // ------------------------------------------------------------------------
195 |     /**
196 |      * Tests promotion to {@link HLLType#SPARSE} and {@link HLLType#FULL}.
197 |      */
198 |     @Test
199 |     public void promotionTest() {
200 |         { // locally scoped for sanity
201 |             final int explicitThreshold = 128;
202 |             final HLL hll = new HLL(11/*log2m, unused*/, 5/*regwidth, unused*/, explicitThreshold, 256/*sparseThreshold*/, HLLType.EXPLICIT);
203 | 
204 |             for(int i=0;i<explicitThreshold + 1;i++){
205 |                 hll.addRaw(i);
206 |             }
207 |             assertEquals(hll.getType(), HLLType.SPARSE);
208 |         }
209 |         { // locally scoped for sanity
210 |             final HLL hll = new HLL(11/*log2m, unused*/, 5/*regwidth, unused*/, 4/*expthresh => explicitThreshold = 8*/, false/*sparseon*/, HLLType.EXPLICIT);
211 | 
212 |             for(int i=0;i<9/* > explicitThreshold */;i++){
213 |                 hll.addRaw(i);
214 |             }
215 |             assertEquals(hll.getType(), HLLType.FULL);
216 |         }
217 |     }
218 | 
219 |     // ************************************************************************
220 |     // assertion helpers
221 |     /**
222 |      * Asserts that values in both sets are exactly equal.
223 |      */
224 |     private static void assertElementsEqual(final HLL hllA, final HLL hllB) {
225 |         final LongOpenHashSet internalSetA = (LongOpenHashSet)getInternalState(hllA, "explicitStorage");
226 |         final LongOpenHashSet internalSetB = (LongOpenHashSet)getInternalState(hllB, "explicitStorage");
227 | 
228 |         assertTrue(internalSetA.equals(internalSetB));
229 |     }
230 | 
231 |     /**
232 |      * Builds a {@link HLLType#EXPLICIT} {@link HLL} instance with the specified
233 |      * explicit threshold.
234 |      *
235 |      * @param  explicitThreshold explicit threshold to use for the constructed
236 |      *         {@link HLL}. This must be greater than zero.
237 |      * @return a default-sized {@link HLLType#EXPLICIT} empty {@link HLL} instance.
238 |      *         This will never be <code>null</code>.
239 |      */
240 |     private static HLL newHLL(final int explicitThreshold) {
241 |         return new HLL(11/*log2m, unused*/, 5/*regwidth, unused*/, explicitThreshold, 256/*sparseThreshold, arbitrary, unused*/, HLLType.EXPLICIT);
242 |     }
243 | }


--------------------------------------------------------------------------------
/src/test/java/net/agkn/hll/FullHLLTest.java:
--------------------------------------------------------------------------------
  1 | package net.agkn.hll;
  2 | 
  3 | /*
  4 |  * Copyright 2013 Aggregate Knowledge, Inc.
  5 |  *
  6 |  * Licensed under the Apache License, Version 2.0 (the "License");
  7 |  * you may not use this file except in compliance with the License.
  8 |  * You may obtain a copy of the License at
  9 |  *
 10 |  *     http://www.apache.org/licenses/LICENSE-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS,
 14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 |  * See the License for the specific language governing permissions and
 16 |  * limitations under the License.
 17 |  */
 18 | 
 19 | import static org.powermock.reflect.Whitebox.getInternalState;
 20 | import static org.testng.Assert.assertEquals;
 21 | import static org.testng.Assert.assertTrue;
 22 | import static org.testng.Assert.assertFalse;
 23 | 
 24 | import net.agkn.hll.serialization.ISchemaVersion;
 25 | import net.agkn.hll.serialization.SerializationUtil;
 26 | import net.agkn.hll.util.BitVector;
 27 | import net.agkn.hll.util.HLLUtil;
 28 | import net.agkn.hll.util.LongIterator;
 29 | 
 30 | import org.testng.annotations.Test;
 31 | 
 32 | /**
 33 |  * Tests {@link HLL} of type {@link HLLType#FULL}.
 34 |  *
 35 |  * @author rgrzywinski
 36 |  * @author timon
 37 |  */
 38 | public class FullHLLTest {
 39 |     // TODO union test
 40 |     /**
 41 |      * Smoke test for {@link HLL#cardinality(int)} and the proper use of the
 42 |      * small range correction.
 43 |      */
 44 |     @Test
 45 |     public void smallRangeSmokeTest() {
 46 |         final int log2m = 11;
 47 |         final int m = (1 << log2m);
 48 |         final int regwidth = 5;
 49 | 
 50 |         // only one register set
 51 |         {
 52 |             final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
 53 |             hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 0/*ix*/, 1/*val*/));
 54 | 
 55 |             final long cardinality = hll.cardinality();
 56 | 
 57 |             // Trivially true that small correction conditions hold: one register
 58 |             // set implies zeroes exist, and estimator trivially smaller than 5m/2.
 59 |             // Small range correction: m * log(m/V)
 60 |             final long expected = (long)Math.ceil(m * Math.log((double)m / (m - 1)/*# of zeroes*/));
 61 |             assertEquals(cardinality, expected);
 62 |         }
 63 | 
 64 |         // all but one register set
 65 |         {
 66 |             final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
 67 |             for(int i=0; i<(m - 1); i++) {
 68 |                 hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, i/*ix*/, 1/*val*/));
 69 |             }
 70 | 
 71 |             // Trivially true that small correction conditions hold: all but
 72 |             // one register set implies a zero exists, and estimator trivially
 73 |             // smaller than 5m/2 since it's alpha / ((m-1)/2)
 74 |             final long cardinality = hll.cardinality();
 75 | 
 76 |             // Small range correction: m * log(m/V)
 77 |             final long expected = (long)Math.ceil(m * Math.log((double)m / 1/*# of zeroes*/));
 78 |             assertEquals(cardinality, expected);
 79 |         }
 80 |     }
 81 | 
 82 |     /**
 83 |      * Smoke test for {@link HLL#cardinality()} and the proper use of the
 84 |      * uncorrected estimator
 85 |      */
 86 |     @Test
 87 |     public void normalRangeSmokeTest() {
 88 |         final int log2m = 11;
 89 |         final int regwidth = 5;
 90 |         // regwidth = 5, so hash space is
 91 |         // log2m + (2^5 - 1 - 1), so L = log2m + 30
 92 |         final int l = log2m + 30;
 93 |         final int m = (1 << log2m);
 94 |         final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
 95 | 
 96 |         // all registers at 'medium' value
 97 |         {
 98 |             final int registerValue = 7/*chosen to ensure neither correction kicks in*/;
 99 |             for(int i=0; i<m; i++) {
100 |                 hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, i, registerValue));
101 |             }
102 | 
103 |             final long cardinality = hll.cardinality();
104 | 
105 | 
106 |             // Simplified estimator when all registers take same value: alpha / (m/2^val)
107 |             final double estimator = HLLUtil.alphaMSquared(m)/((double)m/Math.pow(2, registerValue));
108 | 
109 |             // Assert conditions for uncorrected range
110 |             assertTrue(estimator <= Math.pow(2, l)/30);
111 |             assertTrue(estimator > (5 * m /(double)2));
112 | 
113 |             final long expected = (long)Math.ceil(estimator);
114 |             assertEquals(cardinality, expected);
115 |         }
116 |     }
117 | 
118 |     /**
119 |      * Smoke test for {@link HLL#cardinality()} and the proper use of the large
120 |      * range correction.
121 |      */
122 |     @Test
123 |     public void largeRangeSmokeTest() {
124 |         final int log2m = 12;
125 |         final int regwidth = 5;
126 |         // regwidth = 5, so hash space is
127 |         // log2m + (2^5 - 1 - 1), so L = log2m + 30
128 |         final int l = log2m + 30;
129 |         final int m = (1 << log2m);
130 |         final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
131 | 
132 |         {
133 |             final int registerValue = 31/*chosen to ensure large correction kicks in*/;
134 |             for(int i=0; i<m; i++) {
135 |                 hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, i, registerValue));
136 |             }
137 | 
138 |             final long cardinality = hll.cardinality();
139 | 
140 | 
141 |             // Simplified estimator when all registers take same value: alpha / (m/2^val)
142 |             final double estimator = HLLUtil.alphaMSquared(m)/((double)m/Math.pow(2, registerValue));
143 | 
144 |             // Assert conditions for large range
145 | 
146 |             assertTrue(estimator > Math.pow(2,l)/30);
147 | 
148 |             // Large range correction: -2^L * log(1 - E/2^L)
149 |             final long expected = (long)Math.ceil(-1.0 * Math.pow(2, l) * Math.log(1.0 - estimator/Math.pow(2, l)));
150 |             assertEquals(cardinality, expected);
151 |         }
152 |     }
153 | 
154 |     // ========================================================================
155 |     /**
156 |      * Tests the bounds on a register's value for a given raw input value.
157 |      */
158 |     @Test
159 |     public void registerValueTest() {
160 |         final int log2m = 4/*small enough to make testing easy (addRaw() shifts by one byte)*/;
161 | 
162 |         // register width 4 (the minimum size)
163 |         { // scoped locally for sanity
164 |             final int regwidth = 4;
165 |             final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
166 |             final BitVector bitVector = (BitVector)getInternalState(hll, "probabilisticStorage")/*for testing convenience*/;
167 | 
168 |             // lower-bounds of the register
169 |             hll.addRaw(0x000000000000001L/*'j'=1*/);
170 |             assertEquals(bitVector.getRegister(1/*'j'*/), 0);
171 | 
172 |             hll.addRaw(0x0000000000000012L/*'j'=2*/);
173 |             assertEquals(bitVector.getRegister(2/*'j'*/), 1);
174 | 
175 |             hll.addRaw(0x0000000000000023L/*'j'=3*/);
176 |             assertEquals(bitVector.getRegister(3/*'j'*/), 2);
177 | 
178 |             hll.addRaw(0x0000000000000044L/*'j'=4*/);
179 |             assertEquals(bitVector.getRegister(4/*'j'*/), 3);
180 | 
181 |             hll.addRaw(0x0000000000000085L/*'j'=5*/);
182 |             assertEquals(bitVector.getRegister(5/*'j'*/), 4);
183 | 
184 |             // upper-bounds of the register
185 |             // NOTE:  bear in mind that BitVector itself does ensure that
186 |             //        overflow of a register is prevented
187 |             hll.addRaw(0x0000000000010006L/*'j'=6*/);
188 |             assertEquals(bitVector.getRegister(6/*'j'*/), 13);
189 | 
190 |             hll.addRaw(0x0000000000020007L/*'j'=7*/);
191 |             assertEquals(bitVector.getRegister(7/*'j'*/), 14);
192 | 
193 |             hll.addRaw(0x0000000000040008L/*'j'=8*/);
194 |             assertEquals(bitVector.getRegister(8/*'j'*/), 15);
195 | 
196 |             hll.addRaw(0x0000000000080009L/*'j'=9*/);
197 |             assertEquals(bitVector.getRegister(9/*'j'*/), 15/*overflow*/);
198 | 
199 |             // sanity checks to ensure that no other bits above the lowest-set
200 |             // bit matters
201 |             // NOTE:  same as case 'j = 6' above
202 |             hll.addRaw(0x000000000003000AL/*'j'=10*/);
203 |             assertEquals(bitVector.getRegister(10/*'j'*/), 13);
204 | 
205 |             hll.addRaw(0x000000000011000BL/*'j'=11*/);
206 |             assertEquals(bitVector.getRegister(11/*'j'*/), 13);
207 |         }
208 | 
209 |         // register width 5
210 |         { // scoped locally for sanity
211 |             final int regwidth = 5;
212 |             final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
213 |             final BitVector bitVector = (BitVector)getInternalState(hll, "probabilisticStorage")/*for testing convenience*/;
214 | 
215 |             // lower-bounds of the register
216 |             hll.addRaw(0x0000000000000001L/*'j'=1*/);
217 |             assertEquals(bitVector.getRegister(1/*'j'*/), 0);
218 | 
219 |             hll.addRaw(0x0000000000000012L/*'j'=2*/);
220 |             assertEquals(bitVector.getRegister(2/*'j'*/), 1);
221 | 
222 |             hll.addRaw(0x0000000000000023L/*'j'=3*/);
223 |             assertEquals(bitVector.getRegister(3/*'j'*/), 2);
224 | 
225 |             hll.addRaw(0x0000000000000044L/*'j'=4*/);
226 |             assertEquals(bitVector.getRegister(4/*'j'*/), 3);
227 | 
228 |             hll.addRaw(0x0000000000000085L/*'j'=5*/);
229 |             assertEquals(bitVector.getRegister(5/*'j'*/), 4);
230 | 
231 |             // upper-bounds of the register
232 |             // NOTE:  bear in mind that BitVector itself does ensure that
233 |             //        overflow of a register is prevented
234 |             hll.addRaw(0x0000000100000006L/*'j'=6*/);
235 |             assertEquals(bitVector.getRegister(6/*'j'*/), 29);
236 | 
237 |             hll.addRaw(0x0000000200000007L/*'j'=7*/);
238 |             assertEquals(bitVector.getRegister(7/*'j'*/), 30);
239 | 
240 |             hll.addRaw(0x0000000400000008L/*'j'=8*/);
241 |             assertEquals(bitVector.getRegister(8/*'j'*/), 31);
242 | 
243 |             hll.addRaw(0x0000000800000009L/*'j'=9*/);
244 |             assertEquals(bitVector.getRegister(9/*'j'*/), 31/*overflow*/);
245 |         }
246 |     }
247 | 
248 |     // ========================================================================
249 |     /**
250 |      * Tests {@link HLL#clear()}.
251 |      */
252 |     @Test
253 |     public void clearTest() {
254 |         final int regwidth = 5;
255 |         final int log2m = 4/*16 registers per counter*/;
256 |         final int m = 1 << log2m;
257 | 
258 |         final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
259 |         final BitVector bitVector = (BitVector)getInternalState(hll, "probabilisticStorage")/*for testing convenience*/;
260 |         for(int i=0; i<m; i++)
261 |             bitVector.setRegister(i, i);
262 | 
263 |         hll.clear();
264 |         for(int i=0; i<m; i++){
265 |             assertEquals(bitVector.getRegister(i), 0L/*default value of register*/);
266 |         }
267 |     }
268 | 
269 |     // ========================================================================
270 |     // Serialization
271 |     /**
272 |      * Tests {@link HLL#toBytes(ISchemaVersion)} and {@link HLL#fromBytes(byte[])}.
273 |      */
274 |     @Test
275 |     public void toFromBytesTest() {
276 |         final int log2m = 11/*arbitrary*/;
277 |         final int regwidth = 5;
278 | 
279 |         final ISchemaVersion schemaVersion = SerializationUtil.DEFAULT_SCHEMA_VERSION;
280 |         final HLLType type = HLLType.FULL;
281 |         final int padding = schemaVersion.paddingBytes(type);
282 |         final int dataByteCount = ProbabilisticTestUtil.getRequiredBytes(regwidth, (1 << log2m)/*aka 2^log2m = m*/);
283 |         final int expectedByteCount = padding + dataByteCount;
284 | 
285 |         {// Should work on an empty element
286 |             final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
287 |             final byte[] bytes = hll.toBytes(schemaVersion);
288 | 
289 |             // assert output length is correct
290 |             assertEquals(bytes.length, expectedByteCount);
291 | 
292 |             final HLL inHLL = HLL.fromBytes(bytes);
293 | 
294 |             // assert register values correct
295 |             assertElementsEqual(hll, inHLL);
296 |         }
297 |         {// Should work on a partially filled element
298 |             final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
299 | 
300 |             for(int i=0; i<3; i++) {
301 |                 final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, i, (i+9));
302 |                 hll.addRaw(rawValue);
303 |             }
304 | 
305 |             final byte[] bytes = hll.toBytes(schemaVersion);
306 | 
307 |             // assert output length is correct
308 |             assertEquals(bytes.length, expectedByteCount);
309 | 
310 |             final HLL inHLL = HLL.fromBytes(bytes);
311 | 
312 |             // assert register values correct
313 |             assertElementsEqual(hll, inHLL);
314 |         }
315 |         {// Should work on a full set
316 |             final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.FULL);
317 | 
318 |             for(int i=0; i<(1 << log2m)/*aka 2^log2m*/; i++) {
319 |                 final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, i, (i % 9) + 1);
320 |                 hll.addRaw(rawValue);
321 |             }
322 | 
323 |             final byte[] bytes = hll.toBytes(schemaVersion);
324 | 
325 |             // assert output length is correct
326 |             assertEquals(bytes.length, expectedByteCount);
327 | 
328 |             final HLL inHLL = HLL.fromBytes(bytes);
329 | 
330 |             // assert register values correct
331 |             assertElementsEqual(hll, inHLL);
332 |         }
333 |     }
334 | 
335 |     // ************************************************************************
336 |     // Assertion Helpers
337 |     /**
338 |      * Asserts that the two HLLs are register-wise equal.
339 |      */
340 |     private static void assertElementsEqual(final HLL hllA, final HLL hllB) {
341 |         final BitVector bitVectorA = (BitVector)getInternalState(hllA, "probabilisticStorage")/*for testing convenience*/;
342 |         final BitVector bitVectorB = (BitVector)getInternalState(hllA, "probabilisticStorage")/*for testing convenience*/;
343 | 
344 |         final LongIterator iterA = bitVectorA.registerIterator();
345 |         final LongIterator iterB = bitVectorB.registerIterator();
346 | 
347 |         for(;iterA.hasNext() && iterB.hasNext();) {
348 |             assertEquals(iterA.next(), iterB.next());
349 |         }
350 |         assertFalse(iterA.hasNext());
351 |         assertFalse(iterB.hasNext());
352 |     }
353 | }


--------------------------------------------------------------------------------
/src/test/java/net/agkn/hll/IntegrationTestGenerator.java:
--------------------------------------------------------------------------------
  1 | package net.agkn.hll;
  2 | 
  3 | /*
  4 |  * Copyright 2013 Aggregate Knowledge, Inc.
  5 |  *
  6 |  * Licensed under the Apache License, Version 2.0 (the "License");
  7 |  * you may not use this file except in compliance with the License.
  8 |  * You may obtain a copy of the License at
  9 |  *
 10 |  *     http://www.apache.org/licenses/LICENSE-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS,
 14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 |  * See the License for the specific language governing permissions and
 16 |  * limitations under the License.
 17 |  */
 18 | 
 19 | import java.io.FileWriter;
 20 | import java.io.IOException;
 21 | import java.util.Random;
 22 | 
 23 | import net.agkn.hll.HLL;
 24 | import net.agkn.hll.HLLType;
 25 | import net.agkn.hll.serialization.ISchemaVersion;
 26 | import net.agkn.hll.serialization.SerializationUtil;
 27 | import net.agkn.hll.util.NumberUtil;
 28 | import static net.agkn.hll.ProbabilisticTestUtil.constructHLLValue;
 29 | 
 30 | /**
 31 |  * Generates test files for testing other implementations of HLL
 32 |  * serialization/deserialization, namely the PostgreSQL implementation.
 33 |  *
 34 |  * @author timon
 35 |  */
 36 | public class IntegrationTestGenerator {
 37 |     // ************************************************************************
 38 |     // directory to output the generated tests
 39 |     private static final String OUTPUT_DIRECTORY = "/tmp/hll_test/";
 40 |     // seed to make results reproducible
 41 |     private static final long SEED = 1L;
 42 | 
 43 |     // ------------------------------------------------------------------------
 44 |     // configurations for HLLs, should mirror settings in PostgreSQL impl. tests
 45 |     private static final int REGWIDTH = 5;
 46 |     private static final int LOG2M = 11;
 47 |     // NOTE:  This differs from the PostgreSQL impl. parameter 'expthresh'. This
 48 |     //        is a literal threshold to use in the promotion hierarchy, implying
 49 |     //        that both EXPLICIT representation should be used and it should
 50 |     //        NOT be automatically computed. This is done to ensure that the
 51 |     //        parameters of the test are very explicitly defined.
 52 |     private static final int EXPLICIT_THRESHOLD = 256;
 53 |     // NOTE:  This is not the PostgreSQL impl. parameter 'sparseon'. 'sparseon'
 54 |     //        is assumed to be true and this is a literal register-count threshold
 55 |     //        to use in the promotion hierarchy. This is done to ensure that the
 56 |     //        parameters of the test are very explicitly defined.
 57 |     private static final int SPARSE_THRESHOLD = 850;
 58 | 
 59 |     // ------------------------------------------------------------------------
 60 |     // computed constants
 61 |     private static final int REGISTER_COUNT = (1 << LOG2M);
 62 |     private static final int REGISTER_MAX_VALUE = (1 << REGWIDTH) - 1;
 63 | 
 64 |     // ========================================================================
 65 |     // Tests
 66 |     /**
 67 |      * Cumulatively adds random values to a FULL HLL through the small range
 68 |      * correction, uncorrected range, and large range correction of the HLL's
 69 |      * cardinality estimator.
 70 |      *
 71 |      * Format: cumulative add
 72 |      * Tests:
 73 |      * - FULL cardinality computation
 74 |      */
 75 |     private static void fullCardinalityCorrectionTest(final ISchemaVersion schemaVersion) throws IOException {
 76 |         final FileWriter output = openOutput(schemaVersion, "cardinality_correction", TestType.ADD);
 77 | 
 78 |         // the accumulator, starts empty
 79 |         final HLL hll = newHLL(HLLType.FULL);
 80 |         initLineAdd(output, hll, schemaVersion);
 81 | 
 82 |         // run through some values in the small range correction
 83 |         for(int i=0; i<((1 << LOG2M) - 1); i++) {
 84 |             final long rawValue = constructHLLValue(LOG2M, i, 1);
 85 |             cumulativeAddLine(output, hll, rawValue, schemaVersion);
 86 |         }
 87 | 
 88 |         // run up past some values in the uncorrected range
 89 |         for(int i=0; i<(1 << LOG2M); i++) {
 90 |             final long rawValue = constructHLLValue(LOG2M, i, 7);
 91 |             cumulativeAddLine(output, hll, rawValue, schemaVersion);
 92 |         }
 93 | 
 94 |         // run through some values in the large range correction
 95 |         for(int i=0; i<(1 << LOG2M); i++) {
 96 |             final long rawValue = constructHLLValue(LOG2M, i, 30);
 97 |             cumulativeAddLine(output, hll, rawValue, schemaVersion);
 98 |         }
 99 | 
100 |         output.flush();
101 |         output.close();
102 |     }
103 | 
104 |     /**
105 |      * Cumulatively adds random values to an EMPTY HLL.
106 |      *
107 |      * Format: cumulative add
108 |      * Tests:
109 |      * - EMPTY, EXPLICIT, SPARSE, PROBABILSTIC addition
110 |      * - EMPTY to EXPLICIT promotion
111 |      * - EXPLICIT to SPARSE promotion
112 |      * - SPARSE to FULL promotion
113 |      */
114 |     private static void globalStepTest(final ISchemaVersion schemaVersion) throws IOException {
115 |         final FileWriter output = openOutput(schemaVersion, "comprehensive_promotion", TestType.ADD);
116 | 
117 |         final Random random = new Random(SEED);
118 | 
119 |         // the accumulator, starts empty
120 |         final HLL hll = newHLL(HLLType.EMPTY);
121 |         initLineAdd(output, hll, schemaVersion);
122 | 
123 |         for(int i=0; i<10000/*arbitrary*/; i++) {
124 |             cumulativeAddLine(output, hll, random.nextLong(), schemaVersion);
125 |         }
126 | 
127 |         output.flush();
128 |         output.close();
129 |     }
130 | 
131 |     /**
132 |      * Cumulatively unions "underpopulated" FULL HLLs into the
133 |      * accumulator to verify the correct behavior from the PostgreSQL implementation.
134 |      * The PostgreSQL implementation's representations of probabilistic HLLs should
135 |      * depend exclusively on the chosen SPARSE-to-FULL cutoff.
136 |      *
137 |      * Format: cumulative union
138 |      * Tests:
139 |      * - EMPTY U "underpopulated" FULL => SPARSE
140 |      * - SPARSE U "underpopulated" FULL => SPARSE
141 |      * - SPARSE U "barely underpopulated" FULL => FULL
142 |      */
143 |     private static void sparseFullRepresentationTest(final ISchemaVersion schemaVersion) throws IOException {
144 |         final FileWriter output = openOutput(schemaVersion, "sparse_full_representation", TestType.UNION);
145 | 
146 |         final HLL emptyHLL1 = newHLL(HLLType.EMPTY);
147 |         final HLL emptyHLL2 = newHLL(HLLType.EMPTY);
148 | 
149 |         cumulativeUnionLine(output, emptyHLL1, emptyHLL2, schemaVersion);
150 | 
151 |         // NOTE:  In this test the sparseReference will be the "expected" value
152 |         //        from the C representation, since it doesn't choose representation
153 |         //        based on original encoding, but rather on the promotion rules
154 |         //        and the declared type of the "receiving" field.
155 |         //        It is the manually-constructed union result.
156 | 
157 |         // "underpopulated" FULL U EMPTY => SPARSE
158 |         final HLL fullHLL = newHLL(HLLType.FULL);
159 |         fullHLL.addRaw(constructHLLValue(LOG2M, 0/*ix*/, 1/*val*/));
160 | 
161 |         final HLL sparseHLL = newHLL(HLLType.SPARSE);
162 |         sparseHLL.addRaw(constructHLLValue(LOG2M, 0/*ix*/, 1/*val*/));
163 | 
164 |         output.write(stringCardinality(fullHLL) + "," + toByteA(fullHLL, schemaVersion) + "," + stringCardinality(sparseHLL) + "," + toByteA(sparseHLL, schemaVersion) + "\n");
165 |         output.flush();
166 | 
167 |         // "underpopulated" FULL (small) U SPARSE (small) => SPARSE
168 |         final HLL fullHLL2 = newHLL(HLLType.FULL);
169 |         fullHLL2.addRaw(constructHLLValue(LOG2M, 1/*ix*/, 1/*val*/));
170 | 
171 |         sparseHLL.addRaw(constructHLLValue(LOG2M, 1/*ix*/, 1/*val*/));
172 | 
173 |         output.write(stringCardinality(fullHLL2) + "," + toByteA(fullHLL2, schemaVersion) + "," + stringCardinality(sparseHLL) + "," + toByteA(sparseHLL, schemaVersion) + "\n");
174 |         output.flush();
175 | 
176 |         // "underpopulated" FULL (just on edge) U SPARSE (small) => FULL
177 |         final HLL fullHLL3 = newHLL(HLLType.FULL);
178 |         for(int i=2; i<(SPARSE_THRESHOLD + 1); i++) {
179 |             fullHLL3.addRaw(constructHLLValue(LOG2M, i/*ix*/, 1/*val*/));
180 |             sparseHLL.addRaw(constructHLLValue(LOG2M, i/*ix*/, 1/*val*/));
181 |         }
182 | 
183 |         output.write(stringCardinality(fullHLL3) + "," + toByteA(fullHLL3, schemaVersion) + "," + stringCardinality(sparseHLL) + "," + toByteA(sparseHLL, schemaVersion) + "\n");
184 |         output.flush();
185 |     }
186 | 
187 |     /**
188 |      * Cumulatively sets successive registers to:
189 |      *
190 |      *     <code>(registerIndex % REGISTER_MAX_VALUE) + 1</code>
191 |      *
192 |      * by adding specifically constructed values to a SPARSE HLL.
193 |      * Does not induce promotion.
194 |      *
195 |      * Format: cumulative add
196 |      * Tests:
197 |      * - SPARSE addition (predictable)
198 |      */
199 |     private static void sparseStepTest(final ISchemaVersion schemaVersion) throws IOException {
200 |         final FileWriter output = openOutput(schemaVersion, "sparse_step", TestType.ADD);
201 | 
202 |         // the accumulator, starts empty sparse probabilistic
203 |         final HLL hll = newHLL(HLLType.SPARSE);
204 |         initLineAdd(output, hll, schemaVersion);
205 | 
206 |         for(int i=0; i<SPARSE_THRESHOLD; i++) {
207 |             final long rawValue = constructHLLValue(LOG2M, i, ((i % REGISTER_MAX_VALUE) + 1));
208 |             cumulativeAddLine(output, hll, rawValue, schemaVersion);
209 |         }
210 | 
211 |         output.flush();
212 |         output.close();
213 |     }
214 | 
215 |     /**
216 |      * Cumulatively sets random registers of a SPARSE HLL to
217 |      * random values by adding random values. Does not induce promotion.
218 |      *
219 |      * Format: cumulative add
220 |      * Tests:
221 |      * - SPARSE addition (random)
222 |      */
223 |     private static void sparseRandomTest(final ISchemaVersion schemaVersion) throws IOException {
224 |         final FileWriter output = openOutput(schemaVersion, "sparse_random", TestType.ADD);
225 | 
226 |         final Random random = new Random(SEED);
227 | 
228 |         // the accumulator, starts empty
229 |         final HLL hll = newHLL(HLLType.SPARSE);
230 |         initLineAdd(output, hll, schemaVersion);
231 | 
232 |         for(int i=0; i<SPARSE_THRESHOLD; i++) {
233 |             final int registerIndex = Math.abs(random.nextInt()) % REGISTER_COUNT;
234 |             final int registerValue = ((Math.abs(random.nextInt()) % REGISTER_MAX_VALUE) + 1);
235 |             final long rawValue = constructHLLValue(LOG2M, registerIndex, registerValue);
236 | 
237 |             cumulativeAddLine(output, hll, rawValue, schemaVersion);
238 |         }
239 | 
240 |         output.flush();
241 |         output.close();
242 |     }
243 | 
244 |     /**
245 |      * Cumulatively sets the first register (index 0) to value 2, the last
246 |      * register (index m-1) to value 2, and then sets registers with indices in
247 |      * the range 2 to (sparseCutoff + 2) to value 1 to trigger promotion.
248 |      *
249 |      * This tests for register alignment in the promotion from SPARSE
250 |      * to FULL.
251 |      *
252 |      * Format: cumulative add
253 |      * Tests:
254 |      * - SPARSE addition
255 |      * - SPARSE to FULL promotion
256 |      */
257 |     private static void sparseEdgeTest(final ISchemaVersion schemaVersion) throws IOException {
258 |         final FileWriter output = openOutput(schemaVersion, "sparse_edge", TestType.ADD);
259 | 
260 |         // the accumulator, starts empty
261 |         final HLL hll = newHLL(HLLType.SPARSE);
262 |         initLineAdd(output, hll, schemaVersion);
263 | 
264 |         final long firstValue = constructHLLValue(LOG2M, 0, 2);
265 |         cumulativeAddLine(output, hll, firstValue, schemaVersion);
266 | 
267 |         final long lastValue = constructHLLValue(LOG2M, (1 << LOG2M) - 1, 2);
268 |         cumulativeAddLine(output, hll, lastValue, schemaVersion);
269 | 
270 |         for(int i=2; i<(SPARSE_THRESHOLD + 2); i++) {
271 |             final long middleValue = constructHLLValue(LOG2M, i, 1);
272 | 
273 |             cumulativeAddLine(output, hll, middleValue, schemaVersion);
274 |         }
275 | 
276 |         output.flush();
277 |         output.close();
278 |     }
279 | 
280 |     /**
281 |      * Unions an EMPTY accumulator with EXPLICIT HLLs, each containing a
282 |      * single random value.
283 |      *
284 |      * Format: cumulative union
285 |      * Tests:
286 |      * - EMPTY U EXPLICIT
287 |      * - EXPLICIT U EXPLICIT
288 |      * - EXPLICIT to SPARSE promotion
289 |      * - SPARSE U EXPLICIT
290 |      */
291 |     private static void explicitPromotionTest(final ISchemaVersion schemaVersion) throws IOException {
292 |         final FileWriter output = openOutput(schemaVersion, "explicit_promotion", TestType.UNION);
293 | 
294 |         final Random random = new Random(SEED);
295 | 
296 |         // the accumulator, starts empty
297 |         final HLL hll = newHLL(HLLType.EMPTY);
298 |         final HLL emptyHLL = newHLL(HLLType.EMPTY);
299 |         cumulativeUnionLine(output, hll, emptyHLL, schemaVersion);
300 | 
301 |         for(int i=0; i<(EXPLICIT_THRESHOLD+500)/*should be greater than promotion cutoff*/; i++) {
302 |             // make an EXPLICIT set and populate with cardinality 1
303 |             final HLL explicitHLL = newHLL(HLLType.EXPLICIT);
304 |             explicitHLL.addRaw(random.nextLong());
305 | 
306 |             cumulativeUnionLine(output, hll, explicitHLL, schemaVersion);
307 |         }
308 | 
309 |         output.flush();
310 |         output.close();
311 |     }
312 | 
313 |     /**
314 |      * Unions an EMPTY accumulator with SPARSE HLLs, each
315 |      * having one register set.
316 |      *
317 |      * Format: cumulative union
318 |      * Tests:
319 |      * - EMPTY U SPARSE
320 |      * - SPARSE U SPARSE
321 |      * - SPARSE promotion
322 |      * - SPARSE U FULL
323 |      */
324 |     private static void sparseProbabilisticPromotionTest(final ISchemaVersion schemaVersion) throws IOException {
325 |         final FileWriter output = openOutput(schemaVersion, "sparse_promotion", TestType.UNION);
326 | 
327 |         final Random random = new Random(SEED);
328 | 
329 |         // the accumulator, starts empty
330 |         final HLL hll = newHLL(HLLType.EMPTY);
331 |         final HLL emptyHLL = newHLL(HLLType.EMPTY);
332 |         cumulativeUnionLine(output, hll, emptyHLL, schemaVersion);
333 | 
334 | 
335 |         for(int i=0; i<(SPARSE_THRESHOLD + 1000)/*should be greater than promotion cutoff*/; i++) {
336 |             // make a SPARSE set and populate with cardinality 1
337 |             final HLL sparseHLL = newHLL(HLLType.SPARSE);
338 | 
339 |             final int registerIndex = Math.abs(random.nextInt()) % REGISTER_COUNT;
340 |             final int registerValue = ((Math.abs(random.nextInt()) % REGISTER_MAX_VALUE) + 1);
341 |             final long rawValue = constructHLLValue(LOG2M, registerIndex, registerValue);
342 |             sparseHLL.addRaw(rawValue);
343 | 
344 |             cumulativeUnionLine(output, hll, sparseHLL, schemaVersion);
345 |         }
346 | 
347 |         output.flush();
348 |         output.close();
349 |     }
350 | 
351 |     /**
352 |      * Unions an EMPTY accumulator with EXPLICIT HLLs, each having a single
353 |      * random value, twice in a row to verify that the set properties are
354 |      * satisfied.
355 |      *
356 |      * Format: cumulative union
357 |      * Tests:
358 |      * - EMPTY U EXPLICIT
359 |      * - EXPLICIT U EXPLICIT
360 |      */
361 |     private static void explicitOverlapTest(final ISchemaVersion schemaVersion) throws IOException {
362 |         final FileWriter output = openOutput(schemaVersion, "explicit_explicit", TestType.UNION);
363 | 
364 |         final Random random = new Random(SEED);
365 | 
366 |         // the accumulator, starts empty
367 |         final HLL hll = newHLL(HLLType.EMPTY);
368 |         final HLL emptyHLL = newHLL(HLLType.EMPTY);
369 | 
370 |         cumulativeUnionLine(output, hll, emptyHLL, schemaVersion);
371 | 
372 |         for(int i=0; i<EXPLICIT_THRESHOLD; i++) {
373 |             // make an EXPLICIT set and populate with cardinality 1
374 |             final HLL explicitHLL = newHLL(HLLType.EXPLICIT);
375 |             explicitHLL.addRaw(random.nextLong());
376 | 
377 |             // union it into the accumulator twice, to test overlap (cardinality should not change)
378 |             cumulativeUnionLine(output, hll, explicitHLL, schemaVersion);
379 |             cumulativeUnionLine(output, hll, explicitHLL, schemaVersion);
380 |         }
381 | 
382 |         output.flush();
383 |         output.close();
384 |     }
385 | 
386 |     /**
387 |      * Unions an EMPTY accumulator with SPARSE HLLs, each
388 |      * having a single register set, twice in a row to verify that the set
389 |      * properties are satisfied.
390 |      *
391 |      * Format: cumulative union
392 |      * Tests:
393 |      * - EMPTY U SPARSE
394 |      * - SPARSE U SPARSE
395 |      */
396 |     private static void sparseProbabilisticOverlapTest(final ISchemaVersion schemaVersion) throws IOException {
397 |         final FileWriter output = openOutput(schemaVersion, "sparse_sparse", TestType.UNION);
398 | 
399 |         final Random random = new Random(SEED);
400 | 
401 |         // the accumulator, starts empty
402 |         final HLL hll = newHLL(HLLType.EMPTY);
403 |         final HLL emptyHLL = newHLL(HLLType.EMPTY);
404 | 
405 |         cumulativeUnionLine(output, hll, emptyHLL, schemaVersion);
406 | 
407 |         for(int i=0; i<SPARSE_THRESHOLD; i++) {
408 |             // make a SPARSE set and populate with cardinality 1
409 |             final HLL sparseHLL = newHLL(HLLType.SPARSE);
410 |             final int registerIndex = Math.abs(random.nextInt()) % REGISTER_COUNT;
411 |             final int registerValue = ((Math.abs(random.nextInt()) % REGISTER_MAX_VALUE) + 1);
412 |             final long rawValue = constructHLLValue(LOG2M, registerIndex, registerValue);
413 |             sparseHLL.addRaw(rawValue);
414 | 
415 |             cumulativeUnionLine(output, hll, sparseHLL, schemaVersion);
416 |         }
417 | 
418 |         output.flush();
419 |         output.close();
420 |     }
421 | 
422 |     /**
423 |      * Unions an EMPTY accumulator with FULL HLLs, each having
424 |      * many registers set, twice in a row to verify that the set properties are
425 |      * satisfied.
426 |      *
427 |      * Format: cumulative union
428 |      * Tests:
429 |      * - EMPTY U FULL
430 |      * - FULL U FULL
431 |      */
432 |     private static void probabilisticUnionTest(final ISchemaVersion schemaVersion) throws IOException {
433 |         final FileWriter output = openOutput(schemaVersion, "probabilistic_probabilistic", TestType.UNION);
434 | 
435 |         final Random random = new Random(SEED);
436 | 
437 |         // the accumulator, starts empty
438 |         final HLL hll = newHLL(HLLType.EMPTY);
439 |         final HLL emptyHLL = newHLL(HLLType.EMPTY);
440 |         cumulativeUnionLine(output, hll, emptyHLL, schemaVersion);
441 | 
442 |         for(int i=0; i<1000/*number of rows to generate*/; i++) {
443 |             // make a FULL set and populate with
444 |             final HLL fullHLL = newHLL(HLLType.FULL);
445 |             final int elementCount = random.nextInt(10000/*arbitrary maximum cardinality*/);
446 |             for(int j=0;j<elementCount;j++) {
447 |                 fullHLL.addRaw(random.nextLong());
448 |             }
449 | 
450 |             cumulativeUnionLine(output, hll, fullHLL, schemaVersion);
451 |         }
452 | 
453 |         output.flush();
454 |         output.close();
455 |     }
456 | 
457 |     /**
458 |      * Unions an EMPTY accumulator with random HLLs.
459 |      *
460 |      * Format: cumulative union
461 |      * Tests:
462 |      * - hopefully all union possibilities
463 |      */
464 |     private static void globalUnionTest(final ISchemaVersion schemaVersion) throws IOException {
465 |         final FileWriter output = openOutput(schemaVersion, "comprehensive", TestType.UNION);
466 | 
467 |         final Random random = new Random(SEED);
468 | 
469 |         // the accumulator, starts empty
470 |         final HLL hll = newHLL(HLLType.EMPTY);
471 |         final HLL emptyHLL = newHLL(HLLType.EMPTY);
472 | 
473 |         cumulativeUnionLine(output, hll, emptyHLL, schemaVersion);
474 | 
475 |         for(int i=0; i<1000/*number of rows to generate*/; i++) {
476 |             final HLL randomHLL = generateRandomHLL(random);
477 |             cumulativeUnionLine(output, hll, randomHLL, schemaVersion);
478 |         }
479 | 
480 |         output.flush();
481 |         output.close();
482 |     }
483 | 
484 |     // ========================================================================
485 |     // Main
486 |     public static void fullSuite(final ISchemaVersion schemaVersion) throws IOException {
487 |         fullCardinalityCorrectionTest(schemaVersion);
488 |         globalUnionTest(schemaVersion);
489 |         globalStepTest(schemaVersion);
490 |         probabilisticUnionTest(schemaVersion);
491 |         explicitPromotionTest(schemaVersion);
492 |         explicitOverlapTest(schemaVersion);
493 |         sparseFullRepresentationTest(schemaVersion);
494 |         sparseStepTest(schemaVersion);
495 |         sparseRandomTest(schemaVersion);
496 |         sparseEdgeTest(schemaVersion);
497 |         sparseProbabilisticPromotionTest(schemaVersion);
498 |         sparseProbabilisticOverlapTest(schemaVersion);
499 |     }
500 | 
501 |     public static void main(String[] args) throws IOException {
502 |         fullSuite(SerializationUtil.VERSION_ONE);
503 |     }
504 | 
505 |     // ************************************************************************
506 |     // Helpers
507 |     /**
508 |      * Shortcut for testing constructor, which uses the constants defined at
509 |      * the top of the file as default parameters.
510 |      *
511 |      * @return a new {@link HLL} of specified type, which uses the parameters
512 |      *         ({@link #LOG2M}, {@link #REGWIDTH}, {@link #EXPLICIT_THRESHOLD},
513 |      *         and {@link #SPARSE_THRESHOLD}) specified above.
514 |      */
515 |     private static HLL newHLL(final HLLType type) {
516 |         return newHLL(type);
517 |     }
518 | 
519 |     /**
520 |      * Returns the algorithm-specific cardinality of the specified {@link HLL}
521 |      * as a {@link String} appropriate for comparison with the algorithm-specific
522 |      * cardinality provided by the PostgreSQL implementation.
523 |      *
524 |      * @param  hll the HLL whose algorithm-specific cardinality is to be printed.
525 |      *         This cannot be <code>null</code>.
526 |      * @return the algorithm-specific cardinality of the instance as a PostgreSQL-
527 |      *         compatible String. This will never be <code>null</code>
528 |      */
529 |     private static String stringCardinality(final HLL hll) {
530 |         switch(hll.getType()) {
531 |             case EMPTY:
532 |                 return "0";
533 |             case EXPLICIT:/*promotion has not yet occurred*/
534 |                 return Long.toString(hll.cardinality());
535 |             case SPARSE:
536 |                 return Double.toString(hll.sparseProbabilisticAlgorithmCardinality());
537 |             case FULL:
538 |                 return Double.toString(hll.fullProbabilisticAlgorithmCardinality());
539 |             default:
540 |                 throw new RuntimeException("Unknown HLL type " + hll.getType());
541 |         }
542 |     }
543 | 
544 |     /**
545 |      * Generates a random HLL and populates it with random values.
546 |      *
547 |      * @param  random the {@link Random random number generator} used to populate
548 |      *         the HLL. This cannot be <code>null</code>.
549 |      * @return the populated HLL. This will never be <code>null</code>.
550 |      */
551 |     public static HLL generateRandomHLL(final Random random) {
552 |         final int randomTypeInt = random.nextInt(HLLType.values().length);
553 |         final HLLType type;
554 |         switch(randomTypeInt) {
555 |             case 0:
556 |                 type = HLLType.EMPTY;
557 |                 break;
558 |             case 1:
559 |                 type = HLLType.EXPLICIT;
560 |                 break;
561 |             case 2:
562 |                 type = HLLType.FULL;
563 |                 break;
564 |             case 3:
565 |                 type = HLLType.EMPTY;
566 |                 break;
567 |             case 4:
568 |                 type = HLLType.SPARSE;
569 |                 break;
570 |             default:
571 |                 throw new RuntimeException("Unassigned type int " + randomTypeInt);
572 |         }
573 | 
574 |         final int cardinalityCap;
575 |         final int cardinalityBaseline;
576 | 
577 |         switch(type) {
578 |             case EMPTY:
579 |                 return newHLL(HLLType.EMPTY);
580 |             case EXPLICIT:
581 |                 cardinalityCap = EXPLICIT_THRESHOLD;
582 |                 cardinalityBaseline = 1;
583 |                 break;
584 |             case SPARSE:
585 |                 cardinalityCap = SPARSE_THRESHOLD;
586 |                 cardinalityBaseline = (EXPLICIT_THRESHOLD + 1);
587 |                 break;
588 |             case FULL:
589 |                 cardinalityCap = 100000;
590 |                 cardinalityBaseline = (SPARSE_THRESHOLD*10);
591 |                 break;
592 |             default:
593 |                 throw new RuntimeException("We should never be here.");
594 |         }
595 | 
596 |         final HLL hll = newHLL(HLLType.EMPTY);
597 |         for(int i=0; i<cardinalityBaseline; i++) {
598 |             hll.addRaw(random.nextLong());
599 |         }
600 |         for(int i=0; i<random.nextInt(cardinalityCap - cardinalityBaseline); i++) {
601 |             hll.addRaw(random.nextLong());
602 |         }
603 | 
604 |         return hll;
605 |     }
606 | 
607 |     /**
608 |      * Opens a {@link FileWriter} and writes out an appropriate CSV header.
609 |      *
610 |      * @param  schemaVersion Schema version of the output. This cannot be
611 |      *         <code>null</code>.
612 |      * @param  description Description string used to build the filename.
613 |      *         This cannot be <code>null</code>.
614 |      * @param  type {@link TestType type} of the test file to be written.
615 |      *         This cannot be <code>null</code>.
616 |      * @return The opened {@link FileWriter writer}. This will never be <code>null</code>.
617 |      */
618 |     private static FileWriter openOutput(final ISchemaVersion schemaVersion, final String description, final TestType type) throws IOException {
619 |         final String schemaVersionPrefix = "v"+ schemaVersion.schemaVersionNumber() + "_";
620 |         final String header;
621 |         final String filename;
622 |         switch(type) {
623 |             case ADD:
624 |                 header = "cardinality,raw_value,HLL\n";
625 |                 filename = schemaVersionPrefix + "cumulative_add_" + description + ".csv";
626 |                 break;
627 |             case UNION:
628 |                 header = "cardinality,HLL,union_cardinality,union_HLL\n";
629 |                 filename = schemaVersionPrefix + "cumulative_union_" + description + ".csv";
630 |                 break;
631 |             default:
632 |                 throw new RuntimeException("Unknown test type " + type);
633 |         }
634 | 
635 |         final FileWriter output = new FileWriter(OUTPUT_DIRECTORY + filename);
636 |         output.write(header);
637 |         output.flush();
638 |         return output;
639 |     }
640 | 
641 |     /**
642 |      * Writes out a {@link TestType#ADD}-formatted test line.
643 |      *
644 |      * @param  output The output {@link FileWriter writer}. This cannot be <code>null</code>.
645 |      * @param  hll The "accumulator" HLL instance. This cannot be <code>null</code>.
646 |      * @param  rawValue The raw value added to the HLL.
647 |      * @param  schemaVersion the schema with which to serialize the HLLs. This cannot
648 |      *         be <code>null</code>.
649 |      */
650 |     private static void cumulativeAddLine(final FileWriter output, final HLL hll, final long rawValue, final ISchemaVersion schemaVersion) throws IOException {
651 |         hll.addRaw(rawValue);
652 |         final String accumulatorCardinality = stringCardinality(hll);
653 | 
654 |         output.write(accumulatorCardinality + "," + rawValue + "," + toByteA(hll, schemaVersion) + "\n");
655 |         output.flush();
656 |     }
657 | 
658 |     /**
659 |      * Writes an initial line for a {@link TestType#ADD}-formatted test.
660 |      *
661 |      * @param  output The output {@link FileWriter writer}. This cannot be <code>null</code>.
662 |      * @param  hll The "accumulator" HLL instance. This cannot be <code>null</code>.
663 |      * @param  rawValue The raw value added to the HLL.
664 |      * @param  schemaVersion the schema with which to serialize the HLLs. This cannot
665 |      *         be <code>null</code>.
666 |      */
667 |     private static void initLineAdd(final FileWriter output, final HLL hll, final ISchemaVersion schemaVersion) throws IOException {
668 |         output.write(0 + "," + 0 + "," + toByteA(hll, schemaVersion) + "\n");
669 |         output.flush();
670 |     }
671 | 
672 |     /**
673 |      * Writes out a {@link TestType#UNION}-formatted test line.
674 |      *
675 |      * @param  output The output {@link FileWriter writer}. This cannot be <code>null</code>.
676 |      * @param  hll The "accumulator" HLL instance. This cannot be <code>null</code>.
677 |      * @param  increment The "increment" HLL instance which will be unioned into
678 |      *         the accumulator. This cannot be <code>null</code>.
679 |      * @param  schemaVersion the schema with which to serialize the HLLs. This cannot
680 |      *         be <code>null</code>.
681 |      */
682 |     private static void cumulativeUnionLine(final FileWriter output, final HLL hll, final HLL increment, final ISchemaVersion schemaVersion) throws IOException {
683 |         hll.union(increment);
684 | 
685 |         final String incrementCardinality = stringCardinality(increment);
686 |         final String accumulatorCardinality = stringCardinality(hll);
687 |         output.write(incrementCardinality + "," + toByteA(increment, schemaVersion) + "," + accumulatorCardinality + "," + toByteA(hll, schemaVersion) + "\n");
688 |         output.flush();
689 |     }
690 | 
691 |     /**
692 |      * Serializes a HLL to Postgres 9 'bytea' hex-format, for CSV ingest.
693 |      *
694 |      * @param  hll the HLL to serialize. This cannot be <code>null</code>.
695 |      * @param  schemaVersion the schema with which to serialize the HLLs. This cannot
696 |      *         be <code>null</code>.
697 |      * @return a PostgreSQL 'bytea' string representing the HLL.
698 |      */
699 |     private static String toByteA(final HLL hll, final ISchemaVersion schemaVersion) {
700 |         final byte[] bytes = hll.toBytes(schemaVersion);
701 |         return ("\\x" + NumberUtil.toHex(bytes, 0, bytes.length));
702 |     }
703 | 
704 |     /**
705 |      * Indicates what kind of test output a test will generate.
706 |      */
707 |     private static enum TestType {
708 |         /**
709 |          * This type of test is characterized by values being added to an
710 |          * accumulator HLL whose serialized representation (after the value is added)
711 |          * is printed to each line along with the cardinality and added value.
712 |          */
713 |         ADD,
714 |         /**
715 |          * This type of test is characterized by HLLs being unioned into an
716 |          * accumulator HLL whose serialized representation (after the HLL is
717 |          * union'd) is printed to each line along with the cardinalities and the
718 |          * serialized representation of the HLL union'd in.
719 |          */
720 |         UNION;
721 |     }
722 | }
723 | 


--------------------------------------------------------------------------------
/src/test/java/net/agkn/hll/ProbabilisticTestUtil.java:
--------------------------------------------------------------------------------
 1 | package net.agkn.hll;
 2 | 
 3 | /*
 4 |  * Copyright 2013 Aggregate Knowledge, Inc.
 5 |  *
 6 |  * Licensed under the Apache License, Version 2.0 (the "License");
 7 |  * you may not use this file except in compliance with the License.
 8 |  * You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | 
19 | import net.agkn.hll.util.BitUtil;
20 | 
21 | /**
22 |  * A collection of test utilities for constructing input values to HLLs and for
23 |  * computing their serialized size.
24 |  *
25 |  * @author timon
26 |  */
27 | public class ProbabilisticTestUtil {
28 |     /**
29 |      * Constructs a value that when added raw to a HLL will set the register at
30 |      * <code>registerIndex</code> to <code>registerValue</code>.
31 |      *
32 |      * @param  log2m the log-base-2 of the number of registers in the HLL
33 |      * @param  registerIndex the index of the register to set
34 |      * @param  registerValue the value to set the register to
35 |      * @return the value
36 |      */
37 |     public static long constructHLLValue(final int log2m, final int registerIndex, final int registerValue) {
38 |         final long partition = registerIndex;
39 |         final long substreamValue = (1L << (registerValue - 1));
40 |         return (substreamValue << log2m) | partition;
41 |     }
42 | 
43 |     /**
44 |      * Extracts the HLL register index from a raw value.
45 |      */
46 |     public static short getRegisterIndex(final long rawValue, final int log2m) {
47 |         final long mBitsMask = (1 << log2m) - 1;
48 |         final short j = (short)(rawValue & mBitsMask);
49 |         return j;
50 |     }
51 | 
52 |     /**
53 |      * Extracts the HLL register value from a raw value.
54 |      */
55 |     public static byte getRegisterValue(final long rawValue, final int log2m) {
56 |         final long substreamValue = (rawValue >>> log2m);
57 |         final byte p_w;
58 | 
59 |         if (substreamValue == 0L) {
60 |             // The paper does not cover p(0x0), so the special value 0 is used.
61 |             // 0 is the original initialization value of the registers, so by
62 |             // doing this the HLL simply ignores it. This is acceptable
63 |             // because the probability is 1/(2^(2^registerSizeInBits)).
64 |             p_w = 0;
65 |         } else {
66 |             p_w = (byte)Math.min(1 + BitUtil.leastSignificantBit(substreamValue), 31);
67 |         }
68 | 
69 |         return p_w;
70 |     }
71 | 
72 |     /**
73 |      * @return the number of bytes required to pack <code>registerCount</code>
74 |      *         registers of width <code>shortWordLength</code>.
75 |      */
76 |     public static int getRequiredBytes(final int shortWordLength, final int registerCount) {
77 |         return (int)Math.ceil((registerCount * shortWordLength)/(float)8);
78 |     }
79 | }
80 | 


--------------------------------------------------------------------------------
/src/test/java/net/agkn/hll/SparseHLLTest.java:
--------------------------------------------------------------------------------
  1 | package net.agkn.hll;
  2 | 
  3 | /*
  4 |  * Copyright 2013 Aggregate Knowledge, Inc.
  5 |  *
  6 |  * Licensed under the Apache License, Version 2.0 (the "License");
  7 |  * you may not use this file except in compliance with the License.
  8 |  * You may obtain a copy of the License at
  9 |  *
 10 |  *     http://www.apache.org/licenses/LICENSE-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS,
 14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 |  * See the License for the specific language governing permissions and
 16 |  * limitations under the License.
 17 |  */
 18 | 
 19 | import static org.powermock.reflect.Whitebox.getInternalState;
 20 | import static org.testng.Assert.assertEquals;
 21 | import static org.testng.Assert.assertTrue;
 22 | import it.unimi.dsi.fastutil.ints.Int2ByteOpenHashMap;
 23 | import java.util.Random;
 24 | 
 25 | import net.agkn.hll.serialization.ISchemaVersion;
 26 | import net.agkn.hll.serialization.SerializationUtil;
 27 | import net.agkn.hll.util.HLLUtil;
 28 | 
 29 | import org.testng.annotations.Test;
 30 | 
 31 | /**
 32 |  * Tests {@link HLL} of type {@link HLLType#SPARSE}.
 33 |  *
 34 |  * @author timon
 35 |  */
 36 | public class SparseHLLTest {
 37 |     private static final int log2m = 11;
 38 | 
 39 |     /**
 40 |      * Tests {@link HLL#addRaw(long)}.
 41 |      */
 42 |     @Test
 43 |     public void addTest() {
 44 |         { // insert an element with register value 1 (minimum set value)
 45 |             final int registerIndex = 0;
 46 |             final int registerValue = 1;
 47 |             final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue);
 48 | 
 49 |             final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
 50 |             hll.addRaw(rawValue);
 51 | 
 52 |             assertOneRegisterSet(hll, registerIndex, (byte)registerValue);
 53 |         }
 54 |         { // insert an element with register value 31 (maximum set value)
 55 |             final int registerIndex = 0;
 56 |             final int registerValue = 31;
 57 |             final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue);
 58 | 
 59 |             final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
 60 |             hll.addRaw(rawValue);
 61 | 
 62 |             assertOneRegisterSet(hll, registerIndex, (byte)registerValue);
 63 |         }
 64 |         { // insert an element that could overflow the register (past 31)
 65 |             final int registerIndex = 0;
 66 |             final int registerValue = 36;
 67 |             final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue);
 68 | 
 69 |             final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
 70 |             hll.addRaw(rawValue);
 71 | 
 72 |             assertOneRegisterSet(hll, (short)registerIndex, (byte)31/*register max*/);
 73 |         }
 74 |         { // insert duplicate elements, observe no change
 75 |             final int registerIndex = 0;
 76 |             final int registerValue = 1;
 77 |             final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue);
 78 | 
 79 |             final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
 80 |             hll.addRaw(rawValue);
 81 |             hll.addRaw(rawValue);
 82 | 
 83 |             assertOneRegisterSet(hll, registerIndex, (byte)registerValue);
 84 |         }
 85 |         { // insert elements that increase a register's value
 86 |             final int registerIndex = 0;
 87 |             final int registerValue = 1;
 88 |             final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue);
 89 | 
 90 |             final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
 91 |             hll.addRaw(rawValue);
 92 | 
 93 |             final int registerValue2 = 2;
 94 |             final long rawValue2 = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue2);
 95 |             hll.addRaw(rawValue2);
 96 | 
 97 |             assertOneRegisterSet(hll, registerIndex, (byte)registerValue2);
 98 |         }
 99 |         { // insert elements that have lower register values, observe no change
100 |             final int registerIndex = 0;
101 |             final int registerValue = 2;
102 |             final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue);
103 | 
104 |             final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
105 |             hll.addRaw(rawValue);
106 | 
107 |             final int registerValue2 = 1;
108 |             final long rawValue2 = ProbabilisticTestUtil.constructHLLValue(log2m, registerIndex, registerValue2);
109 |             hll.addRaw(rawValue2);
110 | 
111 |             assertOneRegisterSet(hll, registerIndex, (byte)registerValue);
112 |         }
113 |     }
114 | 
115 |     /**
116 |      * Smoke test for {@link HLL#cardinality()} and the proper use of the small
117 |      * range correction.
118 |      */
119 |     @Test
120 |     public void smallRangeSmokeTest() {
121 |         final int log2m = 11;
122 |         final int m = (1 << log2m);
123 |         final int regwidth = 5;
124 | 
125 |         // only one register set
126 |         {
127 |             final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
128 |             hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 0, 1));
129 | 
130 |             final long cardinality = hll.cardinality();
131 | 
132 |             // Trivially true that small correction conditions hold: one register
133 |             // set implies zeroes exist, and estimator trivially smaller than 5m/2.
134 |             // Small range correction: m * log(m/V)
135 |             final long expected = (long)Math.ceil(m * Math.log((double)m / (m - 1)/*# of zeroes*/));
136 |             assertEquals(cardinality, expected);
137 |         }
138 | 
139 |         // all but one register set
140 |         {
141 |             final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary*/, HLLType.SPARSE);
142 |             for(int i=0; i<(m - 1); i++) {
143 |                 hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, i, 1));
144 |             }
145 | 
146 |             // Trivially true that small correction conditions hold: all but
147 |             // one register set implies a zero exists, and estimator trivially
148 |             // smaller than 5m/2 since it's alpha / ((m-1)/2)
149 |             final long cardinality = hll.cardinality();
150 | 
151 |             // Small range correction: m * log(m/V)
152 |             final long expected = (long)Math.ceil(m * Math.log((double)m / 1/*# of zeroes*/));
153 |             assertEquals(cardinality, expected);
154 |         }
155 |     }
156 | 
157 |     /**
158 |      * Smoke test for {@link HLL#cardinality()} and the proper use of the
159 |      * uncorrected estimator.
160 |      */
161 |     @Test
162 |     public void normalRangeSmokeTest() {
163 |         final int log2m = 11;
164 |         final int m = (1 << log2m);
165 |         final int regwidth = 5;
166 |         // regwidth = 5, so hash space is
167 |         // log2m + (2^5 - 1 - 1), so L = log2m + 30
168 |         final int l = log2m + 30;
169 | 
170 |         // all registers at 'medium' value
171 |         {
172 |             final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, m/*sparseThreshold*/, HLLType.SPARSE);
173 | 
174 |             final int registerValue = 7/*chosen to ensure neither correction kicks in*/;
175 |             for(int i=0; i<m; i++) {
176 |                 hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, i, registerValue));
177 |             }
178 | 
179 |             final long cardinality = hll.cardinality();
180 | 
181 |             // Simplified estimator when all registers take same value: alpha / (m/2^val)
182 |             final double estimator = HLLUtil.alphaMSquared(m)/((double)m/Math.pow(2, registerValue));
183 | 
184 |             // Assert conditions for uncorrected range
185 |             assertTrue(estimator <= Math.pow(2,l)/30);
186 |             assertTrue(estimator > (5 * m /(double)2));
187 | 
188 |             final long expected = (long)Math.ceil(estimator);
189 |             assertEquals(cardinality, expected);
190 |         }
191 |     }
192 | 
193 |     /**
194 |      * Smoke test for {@link HLL#cardinality()} and the proper use of the large
195 |      * range correction.
196 |      */
197 |     @Test
198 |     public void largeRangeSmokeTest() {
199 |         final int log2m = 11;
200 |         final int m = (1 << log2m);
201 |         final int regwidth = 5;
202 |         // regwidth = 5, so hash space is
203 |         // log2m + (2^5 - 1 - 1), so L = log2m + 30
204 |         final int l = log2m + 30;
205 | 
206 |         // all registers at large value
207 |         {
208 |             final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, m/*sparseThreshold*/, HLLType.SPARSE);
209 | 
210 |             final int registerValue = 31/*chosen to ensure large correction kicks in*/;
211 |             for(int i=0; i<m; i++) {
212 |                 hll.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, i, registerValue));
213 |             }
214 | 
215 |             final long cardinality = hll.cardinality();
216 | 
217 | 
218 |             // Simplified estimator when all registers take same value: alpha / (m/2^val)
219 |             final double estimator = HLLUtil.alphaMSquared(m)/((double)m/Math.pow(2, registerValue));
220 | 
221 |             // Assert conditions for large range
222 |             assertTrue(estimator > Math.pow(2, l)/30);
223 | 
224 |             // Large range correction: -2^32 * log(1 - E/2^32)
225 |             final long expected = (long)Math.ceil(-1.0 * Math.pow(2, l) * Math.log(1.0 - estimator/Math.pow(2, l)));
226 |             assertEquals(cardinality, expected);
227 |         }
228 |     }
229 | 
230 |     /**
231 |      * Tests {@link HLL#union(HLL)}.
232 |      */
233 |     @Test
234 |     public void unionTest() {
235 |         final int log2m = 11/*arbitrary*/;
236 |         final int sparseThreshold = 256/*arbitrary*/;
237 | 
238 |         { // two empty multisets should union to an empty set
239 |             final HLL hllA = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
240 |             final HLL hllB = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
241 | 
242 |             hllA.union(hllB);
243 | 
244 |             assertEquals(hllA.getType(), HLLType.SPARSE/*unchanged*/);
245 |             assertEquals(hllA.cardinality(), 0L);
246 |         }
247 |         { // two disjoint multisets should union properly
248 |             final HLL hllA = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
249 |             hllA.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 1, 1));
250 |             final HLL hllB = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
251 |             hllB.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 2, 1));
252 | 
253 | 
254 |             hllA.union(hllB);
255 | 
256 |             assertEquals(hllA.getType(), HLLType.SPARSE/*unchanged*/);
257 |             assertEquals(hllA.cardinality(), 3L/*precomputed*/);
258 |             assertRegisterPresent(hllA, 1, (byte)1);
259 |             assertRegisterPresent(hllA, 2, (byte)1);
260 |         }
261 |         { // two exactly overlapping multisets should union properly
262 |             final HLL hllA = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
263 |             hllA.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 1, 10));
264 |             final HLL hllB = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
265 |             hllB.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, 1, 13));
266 | 
267 |             hllA.union(hllB);
268 | 
269 |             assertEquals(hllA.getType(), HLLType.SPARSE/*unchanged*/);
270 |             assertEquals(hllA.cardinality(), 2L/*precomputed*/);
271 |             assertOneRegisterSet(hllA, 1, (byte)13/*max(10,13)*/);
272 |         }
273 |         { // overlapping multisets should union properly
274 |             final HLL hllA = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
275 |             final HLL hllB = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
276 |             // register index = 3
277 |             final long rawValueA = ProbabilisticTestUtil.constructHLLValue(log2m, 3, 11);
278 | 
279 |             // register index = 4
280 |             final long rawValueB = ProbabilisticTestUtil.constructHLLValue(log2m, 4, 13);
281 |             final long rawValueBPrime = ProbabilisticTestUtil.constructHLLValue(log2m, 4, 21);
282 | 
283 |             // register index = 5
284 |             final long rawValueC = ProbabilisticTestUtil.constructHLLValue(log2m, 5, 14);
285 | 
286 |             hllA.addRaw(rawValueA);
287 |             hllA.addRaw(rawValueB);
288 | 
289 |             hllB.addRaw(rawValueBPrime);
290 |             hllB.addRaw(rawValueC);
291 | 
292 |             hllA.union(hllB);
293 |             // union should have three registers set, with partition B set to the
294 |             // max of the two registers
295 |             assertRegisterPresent(hllA, 3, (byte)11);
296 |             assertRegisterPresent(hllA, 4, (byte)21/*max(21,13)*/);
297 |             assertRegisterPresent(hllA, 5, (byte)14);
298 |         }
299 |         { // too-large unions should promote
300 |             final HLL hllA = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
301 |             final HLL hllB = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
302 | 
303 |             // fill up sets to maxCapacity
304 |             for(int i=0; i<sparseThreshold; i++) {
305 |                 hllA.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, i, 1));
306 |                 hllB.addRaw(ProbabilisticTestUtil.constructHLLValue(log2m, (i + sparseThreshold)/*non-overlapping*/, 1));
307 |             }
308 | 
309 |             hllA.union(hllB);
310 | 
311 |             assertEquals(hllA.getType(), HLLType.FULL);
312 |         }
313 |     }
314 | 
315 |     /**
316 |      * Tests {@link HLL#clear()}.
317 |      */
318 |     @Test
319 |     public void clearTest() {
320 |         final HLL hll = new HLL(log2m, 5/*regwidth*/, 128/*explicitThreshold, arbitrary, unused*/, 256/*sparseThreshold, arbitrary, unused*/, HLLType.SPARSE);
321 |         hll.addRaw(1L);
322 |         hll.clear();
323 |         assertEquals(hll.cardinality(), 0L);
324 |     }
325 | 
326 |     /**
327 |      * Tests {@link HLL#toBytes(ISchemaVersion)} and
328 |      * {@link HLL#fromBytes(byte[])}.
329 |      */
330 |     @Test
331 |     public void toFromBytesTest() {
332 |         final int log2m = 11/*arbitrary*/;
333 |         final int regwidth = 5/*arbitrary*/;
334 |         final int sparseThreshold = 256/*arbitrary*/;
335 |         final int shortWordLength = 16/*log2m + regwidth = 11 + 5*/;
336 | 
337 |         final ISchemaVersion schemaVersion = SerializationUtil.DEFAULT_SCHEMA_VERSION;
338 |         final HLLType type = HLLType.SPARSE;
339 |         final int padding = schemaVersion.paddingBytes(type);
340 | 
341 |         {// Should work on an empty element
342 |             final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
343 |             final byte[] bytes = hll.toBytes(schemaVersion);
344 | 
345 |             // output should just be padding since no registers are used
346 |             assertEquals(bytes.length, padding);
347 | 
348 |             final HLL inHLL = HLL.fromBytes(bytes);
349 | 
350 |             // assert register values correct
351 |             assertElementsEqual(hll, inHLL);
352 |         }
353 |         {// Should work on a partially filled element
354 |             final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
355 | 
356 |             for(int i=0; i<3; i++) {
357 |                 final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, i, (i+9));
358 |                 hll.addRaw(rawValue);
359 |             }
360 | 
361 |             final byte[] bytes = hll.toBytes(schemaVersion);
362 | 
363 |             assertEquals(bytes.length, padding + ProbabilisticTestUtil.getRequiredBytes(shortWordLength, 3/*registerCount*/));
364 | 
365 |             final HLL inHLL = HLL.fromBytes(bytes);
366 | 
367 |             // assert register values correct
368 |             assertElementsEqual(hll, inHLL);
369 |         }
370 |         {// Should work on a full set
371 |             final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
372 | 
373 |             for(int i=0; i<sparseThreshold; i++) {
374 |                 final long rawValue = ProbabilisticTestUtil.constructHLLValue(log2m, i, (i % 9) + 1);
375 |                 hll.addRaw(rawValue);
376 |             }
377 | 
378 |             final byte[] bytes = hll.toBytes(schemaVersion);
379 | 
380 |             // 'short words' should be 12 bits + 5 bits = 17 bits long
381 |             assertEquals(bytes.length, padding + ProbabilisticTestUtil.getRequiredBytes(shortWordLength, sparseThreshold));
382 | 
383 |             final HLL inHLL = HLL.fromBytes(bytes);
384 | 
385 |             // assert register values correct
386 |             assertElementsEqual(hll, inHLL);
387 |         }
388 |     }
389 | 
390 |     /**
391 |      * Smoke tests the multisets by adding random values.
392 |      */
393 |     @Test
394 |     public void randomValuesTest() {
395 |         final int log2m = 11/*arbitrary*/;
396 |         final int regwidth = 5/*arbitrary*/;
397 |         final int sparseThreshold = 256/*arbitrary*/;
398 | 
399 |         final long seed = 1L;
400 |         final Random random = new Random(seed);
401 | 
402 |         for(int run=0; run<100; run++) {
403 |             final HLL hll = new HLL(log2m, regwidth, 128/*explicitThreshold, arbitrary, unused*/, sparseThreshold, HLLType.SPARSE);
404 | 
405 |             final Int2ByteOpenHashMap map = new Int2ByteOpenHashMap();
406 |             map.defaultReturnValue((byte)0);
407 | 
408 |             for(int i=0; i<sparseThreshold; i++) {
409 |                 final long rawValue = random.nextLong();
410 | 
411 |                 final short registerIndex = ProbabilisticTestUtil.getRegisterIndex(rawValue, log2m);
412 |                 final byte registerValue = ProbabilisticTestUtil.getRegisterValue(rawValue, log2m);
413 |                 if(map.get(registerIndex) < registerValue) {
414 |                     map.put(registerIndex, registerValue);
415 |                 }
416 | 
417 |                 hll.addRaw(rawValue);
418 |             }
419 | 
420 |             for(int key : map.keySet()) {
421 |                 final byte expectedRegisterValue = map.get(key);
422 |                 assertRegisterPresent(hll, key, expectedRegisterValue);
423 |             }
424 |         }
425 |     }
426 | 
427 |     //*************************************************************************
428 |     // assertion helpers
429 |     /**
430 |      * Asserts that the register at the specified index is set to the specified
431 |      * value.
432 |      */
433 |     private static void assertRegisterPresent(final HLL hll,
434 |                                               final int registerIndex,
435 |                                               final int registerValue) {
436 |         final Int2ByteOpenHashMap sparseProbabilisticStorage = (Int2ByteOpenHashMap)getInternalState(hll, "sparseProbabilisticStorage");
437 |         assertEquals(sparseProbabilisticStorage.get(registerIndex), registerValue);
438 |     }
439 | 
440 |     /**
441 |      * Asserts that only the specified register is set and has the specified value.
442 |      */
443 |     private static void assertOneRegisterSet(final HLL hll,
444 |                                              final int registerIndex,
445 |                                              final byte registerValue) {
446 |         final Int2ByteOpenHashMap sparseProbabilisticStorage = (Int2ByteOpenHashMap)getInternalState(hll, "sparseProbabilisticStorage");
447 |         assertEquals(sparseProbabilisticStorage.size(), 1);
448 |         assertEquals(sparseProbabilisticStorage.get(registerIndex), registerValue);
449 |     }
450 | 
451 |     /**
452 |      * Asserts that all registers in the two {@link HLL} instances are identical.
453 |      */
454 |     private static void assertElementsEqual(final HLL hllA, final HLL hllB) {
455 |         final Int2ByteOpenHashMap sparseProbabilisticStorageA = (Int2ByteOpenHashMap)getInternalState(hllA, "sparseProbabilisticStorage");
456 |         final Int2ByteOpenHashMap sparseProbabilisticStorageB = (Int2ByteOpenHashMap)getInternalState(hllB, "sparseProbabilisticStorage");
457 |         assertEquals(sparseProbabilisticStorageA.size(), sparseProbabilisticStorageB.size());
458 |         for(final int index : sparseProbabilisticStorageA.keySet()) {
459 |             assertEquals(sparseProbabilisticStorageA.get(index), sparseProbabilisticStorageB.get(index));
460 |         }
461 |     }
462 | }
463 | 


--------------------------------------------------------------------------------
/src/test/java/net/agkn/hll/serialization/BigEndianAscendingWordDeserializerTest.java:
--------------------------------------------------------------------------------
  1 | package net.agkn.hll.serialization;
  2 | 
  3 | /*
  4 |  * Copyright 2013 Aggregate Knowledge, Inc.
  5 |  *
  6 |  * Licensed under the Apache License, Version 2.0 (the "License");
  7 |  * you may not use this file except in compliance with the License.
  8 |  * You may obtain a copy of the License at
  9 |  *
 10 |  *     http://www.apache.org/licenses/LICENSE-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS,
 14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 |  * See the License for the specific language governing permissions and
 16 |  * limitations under the License.
 17 |  */
 18 | 
 19 | 
 20 | import org.testng.annotations.Test;
 21 | import java.util.Random;
 22 | 
 23 | import static org.testng.Assert.assertEquals;
 24 | import static org.testng.Assert.assertTrue;
 25 | import static org.testng.Assert.fail;
 26 | 
 27 | /**
 28 |  * Unit and smoke tests for {@link BigEndianAscendingWordDeserializer}.
 29 |  *
 30 |  * @author timon
 31 |  */
 32 | public class BigEndianAscendingWordDeserializerTest {
 33 |     /**
 34 |      * Error checking tests for constructor.
 35 |      */
 36 |     @SuppressWarnings("unused")
 37 |     @Test
 38 |     public void constructorErrorTest() {
 39 |         // word length too small
 40 |         try {
 41 |             new BigEndianAscendingWordDeserializer(0/*wordLength, below minimum of 1*/, 0/*bytePadding, arbitrary*/, new byte[1]/*bytes, arbitrary, not used here*/);
 42 |             fail("Should complain about too-short words.");
 43 |         } catch(final IllegalArgumentException e) {
 44 |             assertTrue(e.getMessage().contains("Word length must be"));
 45 |         }
 46 | 
 47 |         // word length too large
 48 |         try {
 49 |             new BigEndianAscendingWordDeserializer(65/*wordLength, above maximum of 64*/, 0/*bytePadding, arbitrary*/, new byte[1]/*bytes, arbitrary, not used here*/);
 50 |             fail("Should complain about too-long words.");
 51 |         } catch(final IllegalArgumentException e) {
 52 |             assertTrue(e.getMessage().contains("Word length must be"));
 53 |         }
 54 | 
 55 |         // byte padding negative
 56 |         try {
 57 |             new BigEndianAscendingWordDeserializer(5/*wordLength, arbitrary*/, -1/*bytePadding, too small*/, new byte[1]/*bytes, arbitrary, not used here*/);
 58 |             fail("Should complain about negative byte padding.");
 59 |         } catch(final IllegalArgumentException e) {
 60 |             assertTrue(e.getMessage().contains("Byte padding must be"));
 61 |         }
 62 |     }
 63 | 
 64 |     /**
 65 |      * Smoke test using 64-bit short words and special word values.
 66 |      */
 67 |     @Test
 68 |     public void smokeTest64BitWord() {
 69 |         final BigEndianAscendingWordSerializer serializer =
 70 |             new BigEndianAscendingWordSerializer(64/*wordLength*/,
 71 |                                                  5/*wordCount*/,
 72 |                                                  0/*bytePadding, arbitrary*/);
 73 | 
 74 |         // Check that the sign bit is being preserved.
 75 |         serializer.writeWord(-1L);
 76 |         serializer.writeWord(-112894714L);
 77 | 
 78 |         // Check "special" values
 79 |         serializer.writeWord(0L);
 80 |         serializer.writeWord(Long.MAX_VALUE);
 81 |         serializer.writeWord(Long.MIN_VALUE);
 82 | 
 83 |         final byte[] bytes = serializer.getBytes();
 84 | 
 85 |         final BigEndianAscendingWordDeserializer deserializer =
 86 |             new BigEndianAscendingWordDeserializer(64/*wordLength*/, 0/*bytePadding*/, bytes);
 87 | 
 88 |         assertEquals(deserializer.totalWordCount(), 5/*wordCount*/);
 89 | 
 90 |         assertEquals(deserializer.readWord(), -1L);
 91 |         assertEquals(deserializer.readWord(), -112894714L);
 92 |         assertEquals(deserializer.readWord(), 0L);
 93 |         assertEquals(deserializer.readWord(), Long.MAX_VALUE);
 94 |         assertEquals(deserializer.readWord(), Long.MIN_VALUE);
 95 |     }
 96 | 
 97 |     /**
 98 |      * A smoke/fuzz test for ascending (from zero) word values.
 99 |      */
100 |     @Test
101 |     public void ascendingSmokeTest() {
102 |         for(int wordLength=5; wordLength<65; wordLength++) {
103 |             runAscendingTest(wordLength, 3/*bytePadding, arbitrary*/, 100000/*wordCount, arbitrary*/);
104 |         }
105 |     }
106 | 
107 |     /**
108 |      * A smoke/fuzz test for random word values.
109 |      */
110 |     @Test
111 |     public void randomSmokeTest() {
112 |         for(int wordLength=5; wordLength<65; wordLength++) {
113 |             runRandomTest(wordLength, 3/*bytePadding, arbitrary*/, 100000/*wordCount, arbitrary*/, (long)wordLength/*seed, arbitrary*/);
114 |         }
115 |     }
116 | 
117 |     // ------------------------------------------------------------------------
118 |     /**
119 |      * Runs a test which serializes and deserializes random word values.
120 |      *
121 |      * @param wordLength the length of words to test
122 |      * @param bytePadding the number of bytes padding the byte array
123 |      * @param wordCount the number of word values to test
124 |      * @param seed the seed to the random value generator
125 |      */
126 |     private static void runRandomTest(final int wordLength, final int bytePadding, final int wordCount, final long seed) {
127 |         final Random random = new Random(seed);
128 |         final Random verificationRandom = new Random(seed);
129 | 
130 |         final long wordMask;
131 |         if(wordLength == 64) {
132 |             wordMask = ~0L;
133 |         } else {
134 |             wordMask = (1L << wordLength) - 1L;
135 |         }
136 | 
137 |         final BigEndianAscendingWordSerializer serializer =
138 |             new BigEndianAscendingWordSerializer(wordLength/*wordLength, arbitrary*/,
139 |                                                  wordCount,
140 |                                                  bytePadding/*bytePadding, arbitrary*/);
141 | 
142 |         for(int i=0; i<wordCount; i++) {
143 |             final long value = random.nextLong() & wordMask;
144 |             serializer.writeWord(value);
145 |         }
146 | 
147 |         final byte[] bytes = serializer.getBytes();
148 | 
149 |         final BigEndianAscendingWordDeserializer deserializer =
150 |             new BigEndianAscendingWordDeserializer(wordLength, bytePadding, bytes);
151 | 
152 |         assertEquals(deserializer.totalWordCount(), wordCount);
153 |         for(int i=0; i<wordCount; i++) {
154 |             assertEquals(deserializer.readWord(), (verificationRandom.nextLong() & wordMask));
155 |         }
156 |     }
157 | 
158 |     /**
159 |      * Runs a test which serializes and deserializes ascending (from zero) word values.
160 |      *
161 |      * @param wordLength the length of words to test
162 |      * @param bytePadding the number of bytes padding the byte array
163 |      * @param wordCount the number of word values to test
164 |      */
165 |     private static void runAscendingTest(final int wordLength, final int bytePadding, final int wordCount) {
166 |         final long wordMask;
167 |         if(wordLength == 64) {
168 |             wordMask = ~0L;
169 |         } else {
170 |             wordMask = (1L << wordLength) - 1L;
171 |         }
172 | 
173 |         final BigEndianAscendingWordSerializer serializer =
174 |             new BigEndianAscendingWordSerializer(wordLength/*wordLength, arbitrary*/,
175 |                                                  wordCount,
176 |                                                  bytePadding/*bytePadding, arbitrary*/);
177 | 
178 |         for(long i=0; i<wordCount; i++) {
179 |             serializer.writeWord(i & wordMask);
180 |         }
181 | 
182 |         final byte[] bytes = serializer.getBytes();
183 | 
184 |         final BigEndianAscendingWordDeserializer deserializer =
185 |             new BigEndianAscendingWordDeserializer(wordLength, bytePadding, bytes);
186 | 
187 |         assertEquals(deserializer.totalWordCount(), wordCount);
188 |         for(long i=0; i<wordCount; i++) {
189 |             assertEquals(deserializer.readWord(), i & wordMask);
190 |         }
191 |     }
192 | }
193 | 


--------------------------------------------------------------------------------
/src/test/java/net/agkn/hll/serialization/BigEndianAscendingWordSerializerTest.java:
--------------------------------------------------------------------------------
  1 | package net.agkn.hll.serialization;
  2 | 
  3 | /*
  4 |  * Copyright 2013 Aggregate Knowledge, Inc.
  5 |  *
  6 |  * Licensed under the Apache License, Version 2.0 (the "License");
  7 |  * you may not use this file except in compliance with the License.
  8 |  * You may obtain a copy of the License at
  9 |  *
 10 |  *     http://www.apache.org/licenses/LICENSE-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS,
 14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 |  * See the License for the specific language governing permissions and
 16 |  * limitations under the License.
 17 |  */
 18 | 
 19 | import static org.testng.Assert.assertTrue;
 20 | import static org.testng.Assert.fail;
 21 | 
 22 | import java.util.Arrays;
 23 | 
 24 | import org.testng.annotations.Test;
 25 | 
 26 | /**
 27 |  * Unit tests for {@link BigEndianAscendingWordSerializer}.
 28 |  *
 29 |  * @author timon
 30 |  */
 31 | public class BigEndianAscendingWordSerializerTest {
 32 |     /**
 33 |      * Error checking tests for constructor.
 34 |      */
 35 |     @SuppressWarnings("unused")
 36 |     @Test
 37 |     public void constructorErrorTest() {
 38 |         // word length too small
 39 |         try {
 40 |             new BigEndianAscendingWordSerializer(0/*wordLength, below minimum of 1*/, 1/*wordCount, arbitrary*/, 0/*bytePadding, arbitrary*/);
 41 |             fail("Should complain about too-short words.");
 42 |         } catch(final IllegalArgumentException e) {
 43 |             assertTrue(e.getMessage().contains("Word length must be"));
 44 |         }
 45 | 
 46 |         // word length too large
 47 |         try {
 48 |             new BigEndianAscendingWordSerializer(65/*wordLength, above max of 64*/, 1/*wordCount, arbitrary*/, 0/*bytePadding, arbitrary*/);
 49 |             fail("Should complain about too-long words.");
 50 |         } catch(final IllegalArgumentException e) {
 51 |             assertTrue(e.getMessage().contains("Word length must be"));
 52 |         }
 53 | 
 54 |         // word count negative
 55 |         try {
 56 |             new BigEndianAscendingWordSerializer(5/*wordLength, arbitrary*/, -1/*wordCount, too small*/, 0/*bytePadding, arbitrary*/);
 57 |             fail("Should complain about negative word count.");
 58 |         } catch(final IllegalArgumentException e) {
 59 |             assertTrue(e.getMessage().contains("Word count must be"));
 60 |         }
 61 | 
 62 |         // byte padding negative
 63 |         try {
 64 |             new BigEndianAscendingWordSerializer(5/*wordLength, arbitrary*/, 1/*wordCount, arbitrary*/, -1/*bytePadding, too small*/);
 65 |             fail("Should complain about negative byte padding.");
 66 |         } catch(final IllegalArgumentException e) {
 67 |             assertTrue(e.getMessage().contains("Byte padding must be"));
 68 |         }
 69 |     }
 70 | 
 71 |     /**
 72 |      * Tests runtime exception thrown at premature call to {@link BigEndianAscendingWordSerializer#getBytes()}.
 73 |      */
 74 |     @Test
 75 |     public void earlyGetBytesTest() {
 76 |         final BigEndianAscendingWordSerializer serializer =
 77 |             new BigEndianAscendingWordSerializer(5/*wordLength, arbitrary*/,
 78 |                                                  1/*wordCount*/,
 79 |                                                  0/*bytePadding, arbitrary*/);
 80 | 
 81 |         // getBytes without enough writeWord should throw
 82 |         try {
 83 |             serializer.getBytes();
 84 |             fail("Should throw.");
 85 |         } catch(final RuntimeException e) {
 86 |             assertTrue(e.getMessage().contains("Not all words"));
 87 |         }
 88 |     }
 89 | 
 90 |     /**
 91 |      * Smoke test for typical parameters used in {@link LongSetSlab}.
 92 |      */
 93 |     @Test
 94 |     public void smokeTestExplicitParams() {
 95 |         final int shortWordLength = 64/*longs used in LongSetSlab*/;
 96 | 
 97 |         {// Should work on an empty sequence, with no padding.
 98 |             final BigEndianAscendingWordSerializer serializer =
 99 |                 new BigEndianAscendingWordSerializer(shortWordLength,
100 |                                                      0/*wordCount*/,
101 |                                                      0/*bytePadding, none*/);
102 | 
103 |             assert(Arrays.equals(serializer.getBytes(), new byte[0]));
104 |         }
105 |         {// Should work on a byte-divisible sequence, with no padding.
106 |             final BigEndianAscendingWordSerializer serializer =
107 |                 new BigEndianAscendingWordSerializer(shortWordLength,
108 |                                                      2/*wordCount*/,
109 |                                                      0/*bytePadding, none*/);
110 | 
111 |             serializer.writeWord(0xBAAAAAAAAAAAAAACL);
112 |             serializer.writeWord(0x8FFFFFFFFFFFFFF1L);
113 | 
114 |             // Bytes:
115 |             // ======
116 |             // 0xBA 0xAA 0xAA 0xAA 0xAA 0xAA 0xAA 0xAC
117 |             // 0x8F 0xFF 0xFF 0xFF 0xFF 0xFF 0xFF 0xF1
118 |             //
119 |             // -70 -86 ...                        -84
120 |             // -113 -1 ...                        -15
121 |             final byte[] bytes = serializer.getBytes();
122 |             final byte[] expectedBytes = new byte[] { -70, -86, -86, -86, -86, -86, -86, -84,
123 |                                                       -113, -1, -1, -1, -1, -1, -1, -15 };
124 |             assertTrue(Arrays.equals(bytes, expectedBytes));
125 |         }
126 |         {// Should pad the array correctly.
127 |             final BigEndianAscendingWordSerializer serializer =
128 |                 new BigEndianAscendingWordSerializer(shortWordLength,
129 |                                                      1/*wordCount*/,
130 |                                                      1/*bytePadding*/);
131 | 
132 |             serializer.writeWord(1);
133 |             // 1 byte leading padding | value 1 | trailing padding
134 |             // 0000 0000 | 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0001
135 |             // 0x00 | 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x01
136 |             final byte[] bytes = serializer.getBytes();
137 |             final byte[] expectedBytes = new byte[] { 0, 0, 0, 0, 0, 0, 0, 0, 1 };
138 |             assertTrue(Arrays.equals(bytes, expectedBytes));
139 |         }
140 |     }
141 | 
142 |     /**
143 |      * Smoke test for typical parameters used in practice.
144 |      */
145 |     @Test
146 |     public void smokeTestProbabilisticParams() {
147 |         // XXX: revisit this
148 |         final int shortWordLength = 5;
149 |         {// Should work on an empty sequence, with no padding.
150 |             final BigEndianAscendingWordSerializer serializer =
151 |                 new BigEndianAscendingWordSerializer(shortWordLength,
152 |                                                      0/*wordCount*/,
153 |                                                      0/*bytePadding, none*/);
154 | 
155 |             assert(Arrays.equals(serializer.getBytes(), new byte[0]));
156 |         }
157 |         {// Should work on a non-byte-divisible sequence, with no padding.
158 |             final BigEndianAscendingWordSerializer serializer =
159 |                 new BigEndianAscendingWordSerializer(shortWordLength,
160 |                                                      3/*wordCount*/,
161 |                                                      0/*bytePadding, none*/);
162 | 
163 |             serializer.writeWord(9);
164 |             serializer.writeWord(31);
165 |             serializer.writeWord(1);
166 | 
167 |             // The values:
168 |             // -----------
169 |             // 9     |31    |1     |padding
170 | 
171 |             // Corresponding bits:
172 |             // ------------------
173 |             // 0100 1|111 11|00 001|0
174 | 
175 |             // And the hex/decimal (remember Java bytes are signed):
176 |             // -----------------------------------------------------
177 |             // 0100 1111 -> 0x4F -> 79
178 |             // 1100 0010 -> 0xC2 -> -62
179 | 
180 |             final byte[] bytes = serializer.getBytes();
181 |             final byte[] expectedBytes = new byte[] { 79, -62 };
182 |             assertTrue(Arrays.equals(bytes, expectedBytes));
183 |         }
184 |         {// Should work on a byte-divisible sequence, with no padding.
185 |             final BigEndianAscendingWordSerializer serializer =
186 |                 new BigEndianAscendingWordSerializer(shortWordLength,
187 |                                                      8/*wordCount*/,
188 |                                                      0/*bytePadding, none*/);
189 | 
190 |             for(int i=1; i<9; i++) {
191 |                 serializer.writeWord(i);
192 |             }
193 | 
194 |             // Values: 1-8
195 |             // Corresponding bits:
196 |             // ------------------
197 |             // 00001
198 |             // 00010
199 |             // 00011
200 |             // 00100
201 |             // 00101
202 |             // 00110
203 |             // 00111
204 |             // 01000
205 | 
206 |             // And the hex:
207 |             // ------------
208 |             // 0000 1000 => 0x08 => 8
209 |             // 1000 0110 => 0x86 => -122
210 |             // 0100 0010 => 0x62 => 66
211 |             // 1001 1000 => 0x98 => -104
212 |             // 1110 1000 => 0xE8 => -24
213 | 
214 |             final byte[] bytes = serializer.getBytes();
215 |             final byte[] expectedBytes = new byte[] { 8, -122, 66, -104, -24 };
216 |             assertTrue(Arrays.equals(bytes, expectedBytes));
217 |         }
218 |         {// Should pad the array correctly.
219 |             final BigEndianAscendingWordSerializer serializer =
220 |                 new BigEndianAscendingWordSerializer(shortWordLength,
221 |                                                      1/*wordCount*/,
222 |                                                      1/*bytePadding*/);
223 | 
224 |             serializer.writeWord(1);
225 |             // 1 byte leading padding | value 1 | trailing padding
226 |             // 0000 0000 | 0000 1|000
227 |             final byte[] bytes = serializer.getBytes();
228 |             final byte[] expectedBytes = new byte[] { 0, 8 };
229 |             assertTrue(Arrays.equals(bytes, expectedBytes));
230 |         }
231 |     }
232 | 
233 |     /**
234 |      * Smoke test for typical parameters used in practice.
235 |      */
236 |     @Test
237 |     public void smokeTestSparseParams() {
238 |         // XXX: revisit
239 |         final int shortWordLength = 17;
240 |         {// Should work on an empty sequence, with no padding.
241 |             final BigEndianAscendingWordSerializer serializer =
242 |                 new BigEndianAscendingWordSerializer(shortWordLength,
243 |                                                      0/*wordCount*/,
244 |                                                      0/*bytePadding, none*/);
245 | 
246 |             assert(Arrays.equals(serializer.getBytes(), new byte[0]));
247 |         }
248 |         {// Should work on a non-byte-divisible sequence, with no padding.
249 |             final BigEndianAscendingWordSerializer serializer =
250 |                 new BigEndianAscendingWordSerializer(shortWordLength,
251 |                                                      3/*wordCount*/,
252 |                                                      0/*bytePadding, none*/);
253 | 
254 |             serializer.writeWord(9);
255 |             serializer.writeWord(42);
256 |             serializer.writeWord(75);
257 | 
258 |             // The values:
259 |             // -----------
260 |             // 9                    |42                   |75                   |padding
261 | 
262 |             // Corresponding bits:
263 |             // ------------------
264 |             // 0000 0000 0000 0100 1|000 0000 0000 1010 10|00 0000 0000 1001 011|0 0000
265 | 
266 |             // And the hex/decimal (remember Java bytes are signed):
267 |             // -----------------------------------------------------
268 |             // 0000 0000 -> 0x00 -> 0
269 |             // 0000 0100 -> 0x04 -> 4
270 |             // 1000 0000 -> 0x80 -> -128
271 |             // 0000 1010 -> 0x0A -> 10
272 |             // 1000 0000 -> 0x80 -> -128
273 |             // 0000 1001 -> 0x09 -> 9
274 |             // 0110 0000 -> 0x60 -> 96
275 | 
276 |             final byte[] bytes = serializer.getBytes();
277 |             final byte[] expectedBytes = new byte[] { 0, 4, -128, 10, -128, 9, 96 };
278 |             assertTrue(Arrays.equals(bytes, expectedBytes));
279 |         }
280 |         {// Should work on a byte-divisible sequence, with no padding.
281 |             final BigEndianAscendingWordSerializer serializer =
282 |                 new BigEndianAscendingWordSerializer(shortWordLength,
283 |                                                      8/*wordCount*/,
284 |                                                      0/*bytePadding, none*/);
285 | 
286 |             for(int i=1; i<9; i++) {
287 |                 serializer.writeWord(i);
288 |             }
289 | 
290 |             // Values: 1-8
291 |             // Corresponding bits:
292 |             // ------------------
293 |             // 0000 0000 0000 0000 1
294 |             // 000 0000 0000 0000 10
295 |             // 00 0000 0000 0000 011
296 |             // 0 0000 0000 0000 0100
297 | 
298 |             // 0000 0000 0000 0010 1
299 |             // 000 0000 0000 0001 10
300 |             // 00 0000 0000 0000 111
301 |             // 0 0000 0000 0000 1000
302 | 
303 |             // And the hex:
304 |             // ------------
305 |             // 0000 0000 -> 0x00 -> 0
306 |             // 0000 0000 -> 0x00 -> 0
307 |             // 1000 0000 -> 0x80 -> -128
308 |             // 0000 0000 -> 0x00 -> 0
309 |             // 1000 0000 -> 0x80 -> -128
310 |             // 0000 0000 -> 0x00 -> 0
311 |             // 0110 0000 -> 0x60 -> 96
312 |             // 0000 0000 -> 0x00 -> 0
313 |             // 0100 0000 -> 0x40 -> 64
314 |             // 0000 0000 -> 0x00 -> 0
315 |             // 0010 1000 -> 0x28 -> 40
316 |             // 0000 0000 -> 0x00 -> 0
317 |             // 0001 1000 -> 0x18 -> 24
318 |             // 0000 0000 -> 0x00 -> 0
319 |             // 0000 1110 -> 0x0D -> 14
320 |             // 0000 0000 -> 0x00 -> 0
321 |             // 0000 1000 -> 0x08 -> 8
322 | 
323 |             final byte[] bytes = serializer.getBytes();
324 |             final byte[] expectedBytes = new byte[] { 0, 0, -128, 0, -128, 0, 96, 0, 64, 0, 40, 0, 24, 0, 14, 0, 8 };
325 |             assertTrue(Arrays.equals(bytes, expectedBytes));
326 |         }
327 |         {// Should pad the array correctly.
328 |             final BigEndianAscendingWordSerializer serializer =
329 |                 new BigEndianAscendingWordSerializer(shortWordLength,
330 |                                                      1/*wordCount*/,
331 |                                                      1/*bytePadding*/);
332 | 
333 |             serializer.writeWord(1);
334 |             // 1 byte leading padding | value 1 | trailing padding
335 |             // 0000 0000 | 0000 0000 0000 0000 1|000 0000
336 |             // 0x00 0x00 0x00 0x80
337 |             final byte[] bytes = serializer.getBytes();
338 |             final byte[] expectedBytes = new byte[] { 0, 0, 0, -128 };
339 |             assertTrue(Arrays.equals(bytes, expectedBytes));
340 |         }
341 |     }
342 | }
343 | 


--------------------------------------------------------------------------------
/src/test/java/net/agkn/hll/serialization/HLLSerializationTest.java:
--------------------------------------------------------------------------------
 1 | package net.agkn.hll.serialization;
 2 | 
 3 | import net.agkn.hll.HLL;
 4 | import net.agkn.hll.HLLType;
 5 | import org.testng.annotations.Test;
 6 | 
 7 | import java.util.ArrayList;
 8 | import java.util.Collection;
 9 | import java.util.List;
10 | import java.util.Random;
11 | 
12 | import static net.agkn.hll.HLL.MAXIMUM_EXPTHRESH_PARAM;
13 | import static net.agkn.hll.HLL.MAXIMUM_REGWIDTH_PARAM;
14 | import static net.agkn.hll.HLL.MINIMUM_EXPTHRESH_PARAM;
15 | import static net.agkn.hll.HLL.MINIMUM_LOG2M_PARAM;
16 | import static net.agkn.hll.HLL.MINIMUM_REGWIDTH_PARAM;
17 | import static org.testng.Assert.assertEquals;
18 | 
19 | /**
20 |  * Serialization smoke-tests.
21 |  *
22 |  * @author yerenkow
23 |  * @author benl
24 |  */
25 | public class HLLSerializationTest {
26 |     // A fixed random seed so that this test is reproducible.
27 |     private static final long RANDOM_SEED = 1L;
28 | 
29 |     /**
30 |      * A smoke-test that covers serialization/deserialization of an HLL
31 |      * under all possible parameters.
32 |      */
33 |     @Test
34 |     public void serializationSmokeTest() throws Exception {
35 |         final Random random = new Random(RANDOM_SEED);
36 |         final int randomCount = 250;
37 |         final List<Long> randoms = new ArrayList<Long>(randomCount){{
38 |             for (int i=0; i<randomCount; i++) {
39 |                 add(random.nextLong());
40 |             }
41 |         }};
42 | 
43 |         assertCardinality(HLLType.EMPTY, randoms);
44 |         assertCardinality(HLLType.EXPLICIT, randoms);
45 |         assertCardinality(HLLType.SPARSE, randoms);
46 |         assertCardinality(HLLType.FULL, randoms);
47 |     }
48 | 
49 |     // NOTE: log2m<=16 was chosen as the max log2m parameter so that the test
50 |     //       completes in a reasonable amount of time. Not much is gained by
51 |     //       testing larger values - there are no more known serialization
52 |     //       related edge cases that appear as log2m gets even larger.
53 |     // NOTE: This test completed successfully with log2m<=MAXIMUM_LOG2M_PARAM
54 |     //       on 2014-01-30.
55 |     private static void assertCardinality(final HLLType hllType, final Collection<Long> items)
56 |            throws CloneNotSupportedException {
57 |         for(int log2m=MINIMUM_LOG2M_PARAM; log2m<=16; log2m++) {
58 |             for(int regw=MINIMUM_REGWIDTH_PARAM; regw<=MAXIMUM_REGWIDTH_PARAM; regw++) {
59 |                 for(int expthr=MINIMUM_EXPTHRESH_PARAM; expthr<=MAXIMUM_EXPTHRESH_PARAM; expthr++ ) {
60 |                     for(final boolean sparse: new boolean[]{true, false}) {
61 |                         HLL hll = new HLL(log2m, regw, expthr, sparse, hllType);
62 |                         for(final Long item: items) {
63 |                             hll.addRaw(item);
64 |                         }
65 |                         HLL copy = HLL.fromBytes(hll.toBytes());
66 |                         assertEquals(copy.cardinality(), hll.cardinality());
67 |                         assertEquals(copy.getType(), hll.getType());
68 |                         assertEquals(copy.toBytes(), hll.toBytes());
69 | 
70 |                         HLL clone = hll.clone();
71 |                         assertEquals(clone.cardinality(), hll.cardinality());
72 |                         assertEquals(clone.getType(), hll.getType());
73 |                         assertEquals(clone.toBytes(), hll.toBytes());
74 |                     }
75 |                 }
76 |             }
77 |         }
78 |     }
79 | }
80 | 


--------------------------------------------------------------------------------
/src/test/java/net/agkn/hll/util/BitVectorTest.java:
--------------------------------------------------------------------------------
  1 | package net.agkn.hll.util;
  2 | 
  3 | /*
  4 |  * Copyright 2013 Aggregate Knowledge, Inc.
  5 |  *
  6 |  * Licensed under the Apache License, Version 2.0 (the "License");
  7 |  * you may not use this file except in compliance with the License.
  8 |  * You may obtain a copy of the License at
  9 |  *
 10 |  *     http://www.apache.org/licenses/LICENSE-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS,
 14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 |  * See the License for the specific language governing permissions and
 16 |  * limitations under the License.
 17 |  */
 18 | 
 19 | import static org.testng.Assert.assertEquals;
 20 | import static org.testng.Assert.assertFalse;
 21 | import static org.testng.Assert.assertTrue;
 22 | 
 23 | import org.testng.annotations.Test;
 24 | 
 25 | /**
 26 |  * Unit tests for {@link BitVector}.
 27 |  *
 28 |  * @author rgrzywinski
 29 |  * @author timon
 30 |  */
 31 | public class BitVectorTest {
 32 |     /**
 33 |      * Tests {@link BitVector#getRegister(long)} and {@link BitVector#setRegister(long, long)}.
 34 |      */
 35 |     @Test
 36 |     public void getSetRegisterTest() {
 37 |         { // locally scoped for sanity
 38 |             // NOTE:  registers are only 5bits wide
 39 |             final BitVector vector1 = new BitVector(5/*width*/, 128/*count, 2^7*/);
 40 |             final BitVector vector2 = new BitVector(5/*width*/, 128/*count, 2^7*/);
 41 |             final BitVector vector3 = new BitVector(5/*width*/, 128/*count, 2^7*/);
 42 |             final BitVector vector4 = new BitVector(5/*width*/, 128/*count, 2^7*/);
 43 | 
 44 |             for(int i=0; i<128/*2^7*/; i++) {
 45 |                 vector1.setRegister(i, 0x1F);
 46 |                 vector2.setRegister(i, (i & 0x1F));
 47 |                 vector3.setRegister(i, ((127 - i) & 0x1F));
 48 |                 vector4.setRegister(i, 0x15);
 49 |             }
 50 | 
 51 |             for(int i=0; i<128/*2^7*/; i++) {
 52 |                 assertEquals(vector1.getRegister(i), 0x1F);
 53 |                 assertEquals(vector2.getRegister(i), (i & 0x1F));
 54 |                 assertEquals(vector3.getRegister(i), ((127 - i) & 0x1F));
 55 |                 assertEquals(vector4.getRegister(i), 0x15);
 56 |             }
 57 |         }
 58 |     }
 59 | 
 60 |     // ========================================================================
 61 |     /**
 62 |      * Tests {@link BitVector#registerIterator()}
 63 |      */
 64 |     @Test
 65 |     public void registerIteratorTest() {
 66 |         { // scoped locally for sanity
 67 |             // NOTE:  registers are only 5bits wide
 68 |             final BitVector vector1 = new BitVector(5/*width*/, 128/*count, 2^7*/);
 69 |             final BitVector vector2 = new BitVector(5/*width*/, 128/*count, 2^7*/);
 70 |             final BitVector vector3 = new BitVector(5/*width*/, 128/*count, 2^7*/);
 71 |             final BitVector vector4 = new BitVector(5/*width*/, 128/*count, 2^7*/);
 72 | 
 73 |             for(int i=0; i<128/*2^7*/; i++) {
 74 |                 vector1.setRegister(i, 0x1F);
 75 |                 vector2.setRegister(i, (i & 0x1F));
 76 |                 vector3.setRegister(i, ((127 - i) & 0x1F));
 77 |                 vector4.setRegister(i, 0x15);
 78 |             }
 79 | 
 80 |             final LongIterator registerIterator1 = vector1.registerIterator();
 81 |             final LongIterator registerIterator2 = vector2.registerIterator();
 82 |             final LongIterator registerIterator3 = vector3.registerIterator();
 83 |             final LongIterator registerIterator4 = vector4.registerIterator();
 84 |             for(int i=0; i<128/*2^7*/; i++) {
 85 |                 assertEquals(registerIterator1.hasNext(), true);
 86 |                 assertEquals(registerIterator2.hasNext(), true);
 87 |                 assertEquals(registerIterator3.hasNext(), true);
 88 |                 assertEquals(registerIterator4.hasNext(), true);
 89 | 
 90 |                 assertEquals(registerIterator1.next(), 0x1F);
 91 |                 assertEquals(registerIterator2.next(), (i & 0x1F));
 92 |                 assertEquals(registerIterator3.next(), ((127 - i) & 0x1F));
 93 |                 assertEquals(registerIterator4.next(), 0x15);
 94 |             }
 95 |             assertEquals(registerIterator1.hasNext(), false/*no more*/);
 96 |             assertEquals(registerIterator2.hasNext(), false/*no more*/);
 97 |             assertEquals(registerIterator3.hasNext(), false/*no more*/);
 98 |             assertEquals(registerIterator4.hasNext(), false/*no more*/);
 99 |         }
100 | 
101 |         { // scoped locally for sanity
102 |             // Vectors that are shorter than one word
103 |             assertIterator(1, 12/* 1*12=12 bits, fewer than a single word */);
104 |             assertIterator(2, 12/* 2*12=24 bits, fewer than a single word */);
105 |             assertIterator(3, 12/* 3*12=36 bits, fewer than a single word */);
106 |             assertIterator(4, 12/* 4*12=48 bits, fewer than a single word */);
107 | 
108 |             // Vectors that don't fit exactly into longs
109 |             assertIterator(5, 16/* 5*16=80 bits */);
110 |             assertIterator(5, 32/* 5*32=160 bits */);
111 |         }
112 | 
113 |         // Iterate over vectors that are padded
114 |     }
115 | 
116 |     private static void assertIterator(final int width, final int count) {
117 |         final BitVector vector = new BitVector(width, count);
118 |         final LongIterator iter = vector.registerIterator();
119 | 
120 |         for(int i=0; i<count; i++) {
121 |             assertTrue(iter.hasNext(), String.format("expected more elements: width=%s, count=%s", width, count));
122 |             // TODO: fill with a sentinel value
123 |             assertEquals(iter.next(), 0);
124 |         }
125 |         assertFalse(iter.hasNext(), String.format("expected no more elements: width=%s, count=%s", width, count));
126 |     }
127 | 
128 |     // ========================================================================
129 |     /**
130 |      * Tests {@link BitVector#setMaxRegister(long, long)}
131 |      */
132 |     @Test
133 |     public void setMaxRegisterTest() {
134 |         final BitVector vector = new BitVector(5/*width*/, 128/*count, 2^7*/);
135 | 
136 |         vector.setRegister(0, 10);
137 |         // should replace with a larger value
138 |         vector.setMaxRegister(0, 11);
139 |         assertEquals(vector.getRegister(0), 11);
140 |         // should not replace with a smaller or equal value
141 |         vector.setMaxRegister(0, 9);
142 |         assertEquals(vector.getRegister( 0), 11);
143 |         vector.setMaxRegister(0, 11);
144 |         assertEquals(vector.getRegister(0), 11);
145 |     }
146 | 
147 |     // ========================================================================
148 |     // fill
149 |     /**
150 |      * Tests {@link BitVector#fill(long)}
151 |      */
152 |     @Test
153 |     public void fillTest() {
154 |         final BitVector vector = new BitVector(5/*width*/, 128/*count, 2^7*/);
155 | 
156 |         for(int i=0; i<128/*2^7*/; i++) {
157 |             vector.setRegister(i, i);
158 |         }
159 | 
160 |         vector.fill(0L);
161 | 
162 |         for(int i=0; i<128/*2^7*/; i++) {
163 |             assertEquals(vector.getRegister(i), 0);
164 |         }
165 | 
166 |         vector.fill(17L/*arbitrary*/);
167 | 
168 |         for(int i=0; i<128/*2^7*/; i++) {
169 |             assertEquals(vector.getRegister(i), 17/*arbitrary*/);
170 |         }
171 |     }
172 | }


--------------------------------------------------------------------------------
/src/test/java/net/agkn/hll/util/HLLUtilTest.java:
--------------------------------------------------------------------------------
 1 | package net.agkn.hll.util;
 2 | 
 3 | /*
 4 |  * Copyright 2014 Neustar, Inc.
 5 |  *
 6 |  * Licensed under the Apache License, Version 2.0 (the "License");
 7 |  * you may not use this file except in compliance with the License.
 8 |  * You may obtain a copy of the License at
 9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | 
19 | import static org.testng.Assert.assertEquals;
20 | import net.agkn.hll.HLL;
21 | 
22 | import org.testng.annotations.Test;
23 | 
24 | /**
25 |  * Tests {@link HLLUtil} static methods.
26 |  *
27 |  * @author tkarnezo
28 |  */
29 | public class HLLUtilTest {
30 |     /**
31 |      * Tests that {@link HLLUtil#largeEstimatorCutoff(int, int)} is the same
32 |      * as a trivial implementation.
33 |      */
34 |     @Test
35 |     public void largeEstimatorCutoffTest() {
36 |         for(int log2m=HLL.MINIMUM_LOG2M_PARAM; log2m<=HLL.MAXIMUM_LOG2M_PARAM; log2m++) {
37 |             for(int regWidth=HLL.MINIMUM_REGWIDTH_PARAM; regWidth<=HLL.MINIMUM_REGWIDTH_PARAM; regWidth++) {
38 |                 final double cutoff = HLLUtil.largeEstimatorCutoff(log2m, regWidth);
39 | 
40 |                 // See blog post (http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll/)
41 |                 // and original paper (Fig. 3) for information on 2^L and
42 |                 // "large range correction" cutoff.
43 |                 final double expected = Math.pow(2, Math.pow(2, regWidth) - 2 + log2m) / 30.0;
44 |                 assertEquals(cutoff, expected);
45 |             }
46 |         }
47 |     }
48 | }
49 | 


--------------------------------------------------------------------------------