├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── benchmarks ├── pom.xml └── src │ └── main │ └── java │ └── com │ └── github │ └── prasanthj │ └── hyperloglog │ └── HyperLogLogAdd.java ├── hll ├── pom.xml └── src ├── java └── com │ └── github │ └── prasanthj │ └── hll │ ├── HLLConstants.java │ ├── HLLDenseRegister.java │ ├── HLLRegister.java │ ├── HLLSparseRegister.java │ ├── HyperLogLog.java │ ├── HyperLogLogUtils.java │ ├── Murmur3.java │ └── tools │ └── HyperLogLogCLI.java ├── main └── resources │ └── .gitignore └── test └── com └── github └── prasanthj └── hll ├── TestHLLNoBias.java ├── TestHLLSerialization.java ├── TestHyperLogLog.java ├── TestHyperLogLogDense.java ├── TestHyperLogLogMerge.java ├── TestHyperLogLogSparse.java ├── TestMurmur3.java └── TestSparseEncodeHash.java /.gitignore: -------------------------------------------------------------------------------- 1 | *target* 2 | *.jar 3 | *.war 4 | *.ear 5 | *.class 6 | .idea/ 7 | *.iml 8 | 9 | # eclipse specific git ignore 10 | *.pydevproject 11 | .project 12 | .metadata 13 | bin/** 14 | tmp/** 15 | tmp/**/* 16 | *.tmp 17 | *.bak 18 | *.swp 19 | *~.nib 20 | local.properties 21 | .classpath 22 | .settings/ 23 | .loadpath 24 | .DS_Store 25 | *.hll 26 | *.dat 27 | 28 | # External tool builders 29 | .externalToolBuilders/ 30 | 31 | # Locally stored "Eclipse launch configurations" 32 | *.launch 33 | 34 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 35 | hs_err_pid* 36 | 37 | # Mobile Tools for Java (J2ME) 38 | .mtj.tmp/ 39 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Prasanth Jayachandran 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | # https://docs.travis-ci.com/user/ci-environment/ 17 | # trusty - 7.5GB memory and 2 cores 18 | sudo: required 19 | dist: trusty 20 | 21 | language: java 22 | jdk: 23 | - oraclejdk8 24 | 25 | cache: 26 | directories: 27 | - $HOME/.m2 28 | 29 | install: true 30 | 31 | script: mvn clean install && mvn cobertura:cobertura 32 | 33 | after_success: 34 | - bash <(curl -s https://codecov.io/bash) 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | HyperLogLog [![Build Status](https://travis-ci.org/prasanthj/hyperloglog.svg?branch=master)](https://travis-ci.org/prasanthj/hyperloglog/branches) [![codecov](https://codecov.io/gh/prasanthj/hyperloglog/branch/master/graph/badge.svg)](https://codecov.io/gh/prasanthj/hyperloglog) 2 | ![Maven Central](https://maven-badges.herokuapp.com/maven-central/com.github.prasanthj/hyperloglog/badge.svg) 3 | =========== 4 | 5 | HyperLogLog is an amazing data structure for estimating the cardinality (with very high accuracy) of large data sets that uses very little memory. This implementation of HyperLogLog contains the original algorithm by [Flajolet et. al] as well as the hyperloglog++ algorithm by [Heule et. al]. Refer to the 'References' section for blog posts/papers that explain the inner workings of hyperloglog. 
6 | 7 | 8 | Features 9 | -------- 10 | - Built-in support for 32-bit and 64-bit hashcodes (Murmur3_32 and Murmur3_128 respectively) 11 | - API support for specifying hashcode directly (instead of using internal ones) 12 | - SPARSE and DENSE encoding support 13 | - Bit-packing of DENSE registers for better compression. Serialized hyperloglog size with bitpacking is ~10KB for millions of distinct items, ~12K for few billion distinct items. When bit-packing is disabled the serialized size is ~16KB. 14 | - Delta encoding and varints for SPARSE registers. Serialized hyperloglog size with sparse representation is from as low as 10s of bytes (boolean column) and above. 15 | - Bias correction using lookup table for better accuracy 16 | - Command line tool (hll) 17 | - Configurable options to enable/disable the above features 18 | 19 | Installation 20 | -------------- 21 | 22 | ```sh 23 | git clone https://github.com/prasanthj/hyperloglog.git hyperloglog 24 | cd hyperloglog 25 | mvn package -DskipTests 26 | ``` 27 | 28 | hll - Command Line Tool 29 | ----------------------- 30 | After running ```mvn package -DskipTests```, run ```hll``` to display the usage options 31 | ```sh 32 | Example usage: hll -n 1000 hll -f /tmp/input.txt hll -d -i /tmp/out.hll 33 | usage: HyperLogLog 34 | -b,--enable-bitpacking enable bit-packing of registers. default = 35 | true 36 | -c,--no-bias use bias correction table (no-bias 37 | algorithm). default = true 38 | -d,--deserialize deserialize hyperloglog from file. specify 39 | -i for input file 40 | -e,--encoding specify encoding to use (SPARSE or DENSE). 41 | default = SPARSE 42 | -f,--file specify file to read input data 43 | -i,--input-file specify input file for deserialization 44 | -n,--num-random-values number of random values to generate 45 | -o,--output-file specify output file for serialization 46 | -p,--num-register-bits number of bits from hashcode used as 47 | register index between 4 and 16 (both 48 | inclusive). 
default = 14 49 | -r,--relative-error print relative error calculation 50 | -s,--serialize serialize hyperloglog to file. specify -o 51 | for output file 52 | -t,--standard-in read data from standard in 53 | 54 | ``` 55 | 56 | Examples 57 | -------- 58 | Test with 'n' random numbers 59 | 60 | ``` 61 | #./hll -r -n 20000 62 | Actual count: 20000 63 | Encoding: DENSE, p: 14, estimatedCardinality: 19993 64 | Relative error: 0.034999847% 65 | ``` 66 | 67 | Test with input file 68 | ``` 69 | #./hll -r -f /etc/passwd 70 | Actual count: 84 71 | Encoding: SPARSE, p: 14, estimatedCardinality: 84 72 | Relative error: 0.0% 73 | ``` 74 | 75 | Test serialization 76 | ``` 77 | #./hll -r -n 100000000 -s -o /tmp/out.hll 78 | Actual count: 100000000 79 | Encoding: DENSE, p: 14, estimatedCardinality: 100069607 80 | Relative error: -0.069606304% 81 | Serialized hyperloglog to /tmp/out.hll 82 | Serialized size: 10248 bytes 83 | Serialization time: 20 ms 84 | 85 | ./hll -r -f /etc/passwd -s -o /tmp/out.hll 86 | Actual count: 84 87 | Encoding: SPARSE, p: 14, estimatedCardinality: 84 88 | Relative error: 0.0% 89 | Serialized hyperloglog to /tmp/out.hll 90 | Serialized size: 337 bytes 91 | Serialization time: 5 ms 92 | ``` 93 | 94 | Test deserialization 95 | ``` 96 | #./hll -d -i /tmp/passwd.hll 97 | Encoding: SPARSE, p: 14, estimatedCardinality: 84 98 | Count after deserialization: 84 99 | Deserialization time: 42 ms 100 | ``` 101 | 102 | Test disabling bit-packing of registers 103 | ``` 104 | #./hll -r -n 10000000 -b false -s -o /tmp/out.hll 105 | Actual count: 10000000 106 | Encoding: DENSE, p: 14, estimatedCardinality: 10052011 107 | Relative error: -0.52011013% 108 | Serialized hyperloglog to /tmp/out.hll 109 | Serialized size: 16392 bytes 110 | Serialization time: 27 ms 111 | ``` 112 | 113 | Test reading from standard in 114 | ``` 115 | #cat /etc/passwd | ./hll -r -t 116 | Actual count: 84 117 | Encoding: SPARSE, p: 14, estimatedCardinality: 84 118 | Relative error: 0.0% 119 | ``` 
120 | 121 | Issues 122 | ------ 123 | Bug fixes or improvements are welcome! Please fork the project and send pull request on github. Or report issues here https://github.com/prasanthj/hyperloglog/issues 124 | 125 | 126 | License 127 | ------- 128 | 129 | Apache licensed. 130 | 131 | References 132 | ---------- 133 | [1] http://research.neustar.biz/2012/10/25/sketch-of-the-day-hyperloglog-cornerstone-of-a-big-data-infrastructure/ 134 | 135 | [2] http://metamarkets.com/2012/fast-cheap-and-98-right-cardinality-estimation-for-big-data/ 136 | 137 | [3] http://research.neustar.biz/tag/flajolet-martin-sketch/ 138 | 139 | [4] http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll/ 140 | 141 | [5] http://antirez.com/news/75 142 | 143 | 144 | [Flajolet et. al]:http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf 145 | [Heule et. al]:http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/40671.pdf 146 | -------------------------------------------------------------------------------- /benchmarks/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | com.github.prasanthj 8 | hyperloglog-benchmarks 9 | 1.2-SNAPSHOT 10 | jar 11 | 12 | 13 | UTF-8 14 | 15 | 18 | 1.21 19 | 20 | 23 | 1.8 24 | 25 | 28 | benchmarks 29 | 30 | 31 | 32 | org.openjdk.jmh 33 | jmh-core 34 | ${jmh.version} 35 | 36 | 37 | org.openjdk.jmh 38 | jmh-generator-annprocess 39 | ${jmh.version} 40 | provided 41 | 42 | 43 | com.github.prasanthj 44 | hyperloglog 45 | 1.2-SNAPSHOT 46 | 47 | 48 | org.apache.hive 49 | hive-standalone-metastore 50 | 3.0.0-SNAPSHOT 51 | 52 | 53 | 54 | 55 | 56 | 57 | org.apache.maven.plugins 58 | maven-compiler-plugin 59 | 3.1 60 | 61 | ${javac.target} 62 | ${javac.target} 63 | ${javac.target} 64 | 65 | 66 | 67 | org.apache.maven.plugins 68 | maven-shade-plugin 69 | 2.2 70 | 71 | 72 | package 73 | 74 | shade 75 | 76 | 77 | ${uberjar.name} 78 | 79 | 80 | org.openjdk.jmh.Main 
81 | 82 | 83 | 84 | 85 | 89 | *:* 90 | 91 | META-INF/*.SF 92 | META-INF/*.DSA 93 | META-INF/*.RSA 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | maven-clean-plugin 106 | 2.5 107 | 108 | 109 | maven-deploy-plugin 110 | 2.8.1 111 | 112 | 113 | maven-install-plugin 114 | 2.5.1 115 | 116 | 117 | maven-jar-plugin 118 | 2.4 119 | 120 | 121 | maven-javadoc-plugin 122 | 2.9.1 123 | 124 | 125 | maven-resources-plugin 126 | 2.6 127 | 128 | 129 | maven-site-plugin 130 | 3.3 131 | 132 | 133 | maven-source-plugin 134 | 2.2.1 135 | 136 | 137 | maven-surefire-plugin 138 | 2.17 139 | 140 | 141 | 142 | 143 | -------------------------------------------------------------------------------- /benchmarks/src/main/java/com/github/prasanthj/hyperloglog/HyperLogLogAdd.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2014 Prasanth Jayachandran 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * Unless required by applicable law or agreed to in writing, software 8 | * distributed under the License is distributed on an "AS IS" BASIS, 9 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | * See the License for the specific language governing permissions and 11 | * limitations under the License. 
12 | */ 13 | package com.github.prasanthj.hyperloglog; 14 | 15 | import java.util.ArrayList; 16 | import java.util.List; 17 | import java.util.Random; 18 | import java.util.concurrent.TimeUnit; 19 | 20 | import org.openjdk.jmh.annotations.Benchmark; 21 | import org.openjdk.jmh.annotations.BenchmarkMode; 22 | import org.openjdk.jmh.annotations.Fork; 23 | import org.openjdk.jmh.annotations.Measurement; 24 | import org.openjdk.jmh.annotations.Mode; 25 | import org.openjdk.jmh.annotations.OperationsPerInvocation; 26 | import org.openjdk.jmh.annotations.OutputTimeUnit; 27 | import org.openjdk.jmh.annotations.Scope; 28 | import org.openjdk.jmh.annotations.State; 29 | import org.openjdk.jmh.annotations.Warmup; 30 | import org.openjdk.jmh.infra.Blackhole; 31 | import org.openjdk.jmh.profile.LinuxPerfAsmProfiler; 32 | import org.openjdk.jmh.profile.LinuxPerfNormProfiler; 33 | import org.openjdk.jmh.profile.LinuxPerfProfiler; 34 | import org.openjdk.jmh.runner.Runner; 35 | import org.openjdk.jmh.runner.RunnerException; 36 | import org.openjdk.jmh.runner.options.Options; 37 | import org.openjdk.jmh.runner.options.OptionsBuilder; 38 | 39 | import com.github.prasanthj.hll.HyperLogLog; 40 | 41 | @State(Scope.Benchmark) 42 | @Warmup(iterations = 10, time = 1) 43 | @Measurement(iterations = 10, time = 1) 44 | @Fork(1) 45 | @BenchmarkMode(Mode.AverageTime) 46 | @OutputTimeUnit(TimeUnit.NANOSECONDS) 47 | public class HyperLogLogAdd { 48 | 49 | private static List hashcodes; 50 | static { 51 | hashcodes = new ArrayList<>(); 52 | Random random = new Random(123); 53 | for (int i = 0; i < 100; i++) { 54 | hashcodes.add(random.nextLong()); 55 | } 56 | } 57 | 58 | @Benchmark 59 | @OperationsPerInvocation(100) 60 | public void testHLLAdd(Blackhole blackhole) { 61 | final HyperLogLog hll = HyperLogLog 62 | .builder() 63 | .setNumRegisterIndexBits(10) 64 | .setEncoding(HyperLogLog.EncodingType.SPARSE) 65 | .build(); 66 | for (long hashcode : hashcodes) { 67 | hll.add(hashcode); 68 | } 69 | 
blackhole.consume(hll); 70 | } 71 | 72 | @Benchmark 73 | @OperationsPerInvocation(100) 74 | public void testHLLAddHive(Blackhole blackhole) { 75 | final org.apache.hadoop.hive.common.ndv.hll.HyperLogLog hiveHll = org.apache.hadoop.hive.common.ndv.hll 76 | .HyperLogLog 77 | .builder() 78 | .setNumRegisterIndexBits(10) 79 | .setEncoding(org.apache.hadoop.hive.common.ndv.hll.HyperLogLog.EncodingType.SPARSE) 80 | .build(); 81 | for (long hashcode : hashcodes) { 82 | hiveHll.add(hashcode); 83 | } 84 | blackhole.consume(hiveHll); 85 | } 86 | 87 | /* 88 | * ============================== HOW TO RUN THIS TEST: ==================================== 89 | * 90 | * You can run this test: 91 | * 92 | * a) Via the command line: 93 | * $ mvn clean install 94 | * $ java -jar target/benchmarks.jar HyperLogLogAdd -prof perf -f 1 (Linux) 95 | * $ java -jar target/benchmarks.jar HyperLogLogAdd -prof perfnorm -f 3 (Linux) 96 | * $ java -jar target/benchmarks.jar HyperLogLogAdd -prof perfasm -f 1 (Linux) 97 | * $ java -jar target/benchmarks.jar HyperLogLogAdd -prof perf -jvmArgsAppend "-XX:AllocatePrefetchStyle=2" 98 | */ 99 | public static void main(String[] args) throws RunnerException { 100 | Options opt = new OptionsBuilder() 101 | .include(HyperLogLogAdd.class.getSimpleName()) 102 | .addProfiler(LinuxPerfProfiler.class) 103 | .addProfiler(LinuxPerfNormProfiler.class) 104 | .addProfiler(LinuxPerfAsmProfiler.class) 105 | .build(); 106 | 107 | new Runner(opt).run(); 108 | } 109 | } -------------------------------------------------------------------------------- /hll: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | HLL_HOME="$PWD" 4 | HLL_CLASSPATH="$HLL_HOME/target/*:$HLL_HOME/target/lib/*:$HLL_HOME/target/classes" 5 | 6 | java -cp $HLL_CLASSPATH com.github.prasanthj.hll.tools.HyperLogLogCLI "$@" 7 | -------------------------------------------------------------------------------- /pom.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | com.github.prasanthj 4 | hyperloglog 5 | 1.2-SNAPSHOT 6 | jar 7 | hyperloglog 8 | Implementation of HyperLogLog and HyperLogLog++ in Java 9 | https://github.com/prasanthj/hyperloglog 10 | 11 | 12 | 13 | 14 | The Apache Software License, Version 2.0 15 | http://www.apache.org/licenses/LICENSE-2.0.txt 16 | repo 17 | 18 | 19 | 20 | 21 | 22 | https://github.com/prasanthj/hyperloglog 23 | scm:git:git://github.com/prasanthj/hyperloglog.git 24 | scm:git:git@github.com:prasanthj/hyperloglog.git 25 | HEAD 26 | 27 | 28 | 29 | 30 | 31 | prasanthj 32 | Prasanth Jayachandran 33 | prasanthj@apache.org 34 | https://github.com/prasanthj 35 | 36 | 37 | 38 | 39 | UTF-8 40 | UTF-8 41 | 1.8 42 | 1.8 43 | 44 | 45 | 46 | 47 | 48 | sonatype-nexus-snapshots 49 | Sonatype Nexus snapshot repository 50 | https://oss.sonatype.org/content/repositories/snapshots 51 | 52 | 53 | sonatype-nexus-staging 54 | Sonatype Nexus release repository 55 | https://oss.sonatype.org/service/local/staging/deploy/maven2 56 | 57 | 58 | 59 | 60 | 61 | 62 | release-sign-artifacts 63 | 64 | 65 | performRelease 66 | true 67 | 68 | 69 | 70 | 71 | 72 | org.apache.maven.plugins 73 | maven-gpg-plugin 74 | 1.4 75 | 76 | 77 | sign-artifacts 78 | verify 79 | 80 | sign 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | it.unimi.dsi 93 | fastutil 94 | 6.5.15 95 | 96 | 97 | commons-cli 98 | commons-cli 99 | 1.2 100 | 101 | 102 | junit 103 | junit 104 | [4.13.1,) 105 | test 106 | 107 | 108 | com.google.guava 109 | guava 110 | [24.1.1,) 111 | test 112 | 113 | 114 | org.openjdk.jmh 115 | jmh-core 116 | 1.18 117 | 118 | 119 | org.apache.hive 120 | hive-standalone-metastore 121 | 3.0.0-SNAPSHOT 122 | 123 | 124 | 125 | 126 | target 127 | target/classes 128 | ${project.artifactId}-${project.version} 129 | target/test-classes 130 | src/java 131 | src/test 132 | 133 | 134 | 135 | 136 | ${project.basedir} 137 | 138 | README* 139 | 
NOTICE* 140 | LICENSE* 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | maven-dependency-plugin 149 | 150 | 151 | package 152 | 153 | copy-dependencies 154 | 155 | 156 | ${project.build.directory}/lib 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | org.apache.maven.plugins 165 | maven-surefire-plugin 166 | 2.16 167 | 168 | false 169 | 170 | en_US.UTF-8 171 | 172 | false 173 | 174 | 175 | 176 | 177 | 178 | org.codehaus.mojo 179 | cobertura-maven-plugin 180 | 2.7 181 | 182 | 183 | html 184 | xml 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | org.apache.maven.plugins 193 | maven-source-plugin 194 | 2.2.1 195 | 196 | 197 | attach-sources 198 | verify 199 | 200 | jar-no-fork 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | org.apache.maven.plugins 209 | maven-javadoc-plugin 210 | 2.9.1 211 | 212 | 213 | attach-javadoc 214 | verify 215 | 216 | jar 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | org.apache.maven.plugins 225 | maven-release-plugin 226 | 2.5 227 | 228 | 229 | 230 | 231 | 235 | 236 | 237 | 238 | 239 | 242 | forked-path 243 | 244 | 245 | 246 | 247 | 248 | 249 | org.sonatype.plugins 250 | nexus-staging-maven-plugin 251 | 1.6.4 252 | true 253 | 254 | sonatype-nexus-staging 255 | https://oss.sonatype.org/ 256 | true 257 | 258 | 259 | 260 | 261 | 262 | -------------------------------------------------------------------------------- /src/java/com/github/prasanthj/hll/HLLDenseRegister.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2017 Prasanth Jayachandran 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.github.prasanthj.hll; 18 | 19 | import java.util.Arrays; 20 | 21 | public class HLLDenseRegister implements HLLRegister { 22 | 23 | // 2^p number of bytes for register 24 | private byte[] register; 25 | 26 | // max value stored in registered is cached to determine the bit width for 27 | // bit packing 28 | private int maxRegisterValue; 29 | 30 | // number of register bits 31 | private int p; 32 | 33 | // m = 2^p 34 | private int m; 35 | 36 | public HLLDenseRegister(int p) { 37 | this(p, true); 38 | } 39 | 40 | public HLLDenseRegister(int p, boolean bitPack) { 41 | this.p = p; 42 | this.m = 1 << p; 43 | this.register = new byte[m]; 44 | this.maxRegisterValue = 0; 45 | if (bitPack == false) { 46 | this.maxRegisterValue = 0xff; 47 | } 48 | } 49 | 50 | public boolean add(long hashcode) { 51 | 52 | // LSB p bits 53 | final int registerIdx = (int) (hashcode & (m - 1)); 54 | 55 | // MSB 64 - p bits 56 | final long w = hashcode >>> p; 57 | 58 | // longest run of trailing zeroes 59 | final int lr = Long.numberOfTrailingZeros(w) + 1; 60 | return set(registerIdx, (byte) lr); 61 | } 62 | 63 | // this is a lossy invert of the function above, which produces a hashcode 64 | // which collides with the current winner of the register (we lose all higher 65 | // bits, but we get all bits useful for lesser p-bit options) 66 | 67 | // +-------------|-------------+ 68 | // |xxxx100000000|1000000000000| (lr=9 + idx=1024) 69 | // +-------------|-------------+ 70 | // \ 71 | // +---------------|-----------+ 72 | // 
|xxxx10000000010|00000000000| (lr=2 + idx=0) 73 | // +---------------|-----------+ 74 | 75 | // This shows the relevant bits of the original hash value 76 | // and how the conversion is moving bits from the index value 77 | // over to the leading zero computation 78 | 79 | public void extractLowBitsTo(HLLRegister dest) { 80 | for (int idx = 0; idx < register.length; idx++) { 81 | byte lr = register[idx]; // this can be a max of 65, never > 127 82 | if (lr != 0) { 83 | dest.add((long) ((1 << (p + lr - 1)) | idx)); 84 | } 85 | } 86 | } 87 | 88 | public boolean set(int idx, byte value) { 89 | boolean updated = false; 90 | if (idx < register.length && value > register[idx]) { 91 | 92 | // update max register value 93 | if (value > maxRegisterValue) { 94 | maxRegisterValue = value; 95 | } 96 | 97 | // set register value and compute inverse pow of 2 for register value 98 | register[idx] = value; 99 | 100 | updated = true; 101 | } 102 | return updated; 103 | } 104 | 105 | public int size() { 106 | return register.length; 107 | } 108 | 109 | public int getNumZeroes() { 110 | int numZeroes = 0; 111 | for (byte b : register) { 112 | if (b == 0) { 113 | numZeroes++; 114 | } 115 | } 116 | return numZeroes; 117 | } 118 | 119 | public void merge(HLLRegister hllRegister) { 120 | if (hllRegister instanceof HLLDenseRegister) { 121 | HLLDenseRegister hdr = (HLLDenseRegister) hllRegister; 122 | byte[] inRegister = hdr.getRegister(); 123 | 124 | // merge only if the register length matches 125 | if (register.length != inRegister.length) { 126 | throw new IllegalArgumentException( 127 | "The size of register sets of HyperLogLogs to be merged does not match."); 128 | } 129 | 130 | // compare register values and store the max register value 131 | for (int i = 0; i < inRegister.length; i++) { 132 | final byte cb = register[i]; 133 | final byte ob = inRegister[i]; 134 | register[i] = ob > cb ? 
ob : cb; 135 | } 136 | 137 | // update max register value 138 | if (hdr.getMaxRegisterValue() > maxRegisterValue) { 139 | maxRegisterValue = hdr.getMaxRegisterValue(); 140 | } 141 | } else { 142 | throw new IllegalArgumentException("Specified register is not instance of HLLDenseRegister"); 143 | } 144 | } 145 | 146 | public byte[] getRegister() { 147 | return register; 148 | } 149 | 150 | public void setRegister(byte[] register) { 151 | this.register = register; 152 | } 153 | 154 | public int getMaxRegisterValue() { 155 | return maxRegisterValue; 156 | } 157 | 158 | public double getSumInversePow2() { 159 | double sum = 0; 160 | for (byte b : register) { 161 | sum += HLLConstants.inversePow2Data[b]; 162 | } 163 | return sum; 164 | } 165 | 166 | @Override 167 | public String toString() { 168 | StringBuilder sb = new StringBuilder(); 169 | sb.append("HLLDenseRegister - "); 170 | sb.append("p: "); 171 | sb.append(p); 172 | sb.append(" numZeroes: "); 173 | sb.append(getNumZeroes()); 174 | sb.append(" maxRegisterValue: "); 175 | sb.append(maxRegisterValue); 176 | return sb.toString(); 177 | } 178 | 179 | public String toExtendedString() { 180 | return toString() + " register: " + Arrays.toString(register); 181 | } 182 | 183 | @Override 184 | public boolean equals(Object obj) { 185 | if (!(obj instanceof HLLDenseRegister)) { 186 | return false; 187 | } 188 | HLLDenseRegister other = (HLLDenseRegister) obj; 189 | return getNumZeroes() == other.getNumZeroes() && maxRegisterValue == other.maxRegisterValue 190 | && Arrays.equals(register, other.register); 191 | } 192 | 193 | @Override 194 | public int hashCode() { 195 | int hashcode = 0; 196 | hashcode += 31 * getNumZeroes(); 197 | hashcode += 31 * maxRegisterValue; 198 | hashcode += Arrays.hashCode(register); 199 | return hashcode; 200 | } 201 | 202 | } 203 | -------------------------------------------------------------------------------- /src/java/com/github/prasanthj/hll/HLLRegister.java: 
--------------------------------------------------------------------------------
/**
 * Copyright 2017 Prasanth Jayachandran
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.github.prasanthj.hll;

public interface HLLRegister {

  /**
   * Specify a hashcode to add to hyperloglog register.
   * @param hashcode
   *          - hashcode to add
   * @return true if register value is updated else false
   */
  public boolean add(long hashcode);

  /**
   * Instead of specifying hashcode, this interface can be used to directly
   * specify the register index and register value. This interface is useful
   * when reconstructing hyperloglog from a serialized representation where it's
   * not possible to regenerate the hashcode.
   * @param idx
   *          - register index
   * @param value
   *          - register value
   * @return true if register value is updated else false
   */
  public boolean set(int idx, byte value);

  /**
   * Merge hyperloglog registers of the same type (SPARSE or DENSE register)
   * @param reg
   *          - register to be merged
   */
  public void merge(HLLRegister reg);
}
--------------------------------------------------------------------------------
/src/java/com/github/prasanthj/hll/HLLSparseRegister.java:
--------------------------------------------------------------------------------
/**
 * Copyright 2017 Prasanth Jayachandran
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.github.prasanthj.hll;

import java.util.Map;
import java.util.Map.Entry;

import it.unimi.dsi.fastutil.ints.Int2ByteAVLTreeMap;
import it.unimi.dsi.fastutil.ints.Int2ByteSortedMap;

/**
 * SPARSE register layout for HyperLogLog++: register values are kept in a
 * sorted primitive map keyed by the pPrime-bit register index, with a small
 * unsorted temp list batching insertions before they are merged into the map.
 */
public class HLLSparseRegister implements HLLRegister {

  // maintains sorted list of register indices and its corresponding values.
  // It's easier to use a primitive sorted map as opposed to int[] used in this
  // paper
  // http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/40671.pdf
  private Int2ByteSortedMap sparseMap;

  // for a better insertion performance values are added to temporary unsorted
  // list which will be merged to sparse map after a threshold
  private int[] tempList;
  private int tempListIdx;

  // number of register bits
  private final int p;

  // new number of register bits for higher accuracy
  private final int pPrime;

  // number of bits to store the number of zero runs
  private final int qPrime;

  // masks for quicker extraction of p, pPrime, qPrime values
  private final int mask;
  private final int pPrimeMask;
  private final int qPrimeMask;

  public HLLSparseRegister(int p, int pp, int qp) {
    this.p = p;
    this.sparseMap = new Int2ByteAVLTreeMap();
    this.tempList = new int[HLLConstants.TEMP_LIST_DEFAULT_SIZE];
    this.tempListIdx = 0;
    this.pPrime = pp;
    this.qPrime = qp;
    this.mask = ((1 << pPrime) - 1) ^ ((1 << p) - 1);
    this.pPrimeMask = ((1 << pPrime) - 1);
    this.qPrimeMask = (1 << qPrime) - 1;
  }

  /**
   * Encodes the hashcode and buffers it in the temp list; when the list is
   * full it is flushed into the sparse map first.
   * @param hashcode - 64-bit hashcode to add
   * @return true if the register state changed
   */
  public boolean add(long hashcode) {
    boolean updated;

    // fill the temp list before merging to sparse map
    if (tempListIdx < tempList.length) {
      int encodedHash = encodeHash(hashcode);
      tempList[tempListIdx++] = encodedHash;
      updated = true;
    } else {
      updated = mergeTempListToSparseMap();
      // BUG FIX: previously the hashcode that triggered the flush was silently
      // dropped; store it in the now-empty temp list as well
      tempList[tempListIdx++] = encodeHash(hashcode);
      updated = true;
    }

    return updated;
  }

  /**
   * Adds temp list to sparse map. The key for sparse map entry is the register
   * index determined by pPrime and value is the number of trailing zeroes.
   * @return true if any register value was updated by the flush
   */
  private boolean mergeTempListToSparseMap() {
    boolean updated = false;
    for (int i = 0; i < tempListIdx; i++) {
      int encodedHash = tempList[i];
      int key = encodedHash & pPrimeMask;
      byte value = (byte) (encodedHash >>> pPrime);
      byte nr = 0;
      // if MSB is set to 1 then next qPrime MSB bits contains the value of
      // number of zeroes.
      // if MSB is set to 0 then number of zeroes is contained within pPrime - p
      // bits.
      if (encodedHash < 0) {
        nr = (byte) (value & qPrimeMask);
      } else {
        nr = (byte) (Integer.numberOfTrailingZeros(encodedHash >>> p) + 1);
      }
      // BUG FIX: accumulate with |= instead of overwriting, so the return
      // value reports whether ANY entry updated a register (not just the last)
      updated |= set(key, nr);
    }

    // reset temp list index
    tempListIdx = 0;
    return updated;
  }

  /**
   * <pre>
   * Input: 64 bit hashcode
   *
   * |---------w-------------| |------------p'----------|
   * 10101101.......1010101010 10101010101 01010101010101
   *                                       |------p-----|
   *
   * Output: 32 bit int
   *
   * |b| |-q'-|  |------------p'----------|
   *  1  010101  01010101010 10101010101010
   *                         |------p-----|
   *
   * The default values of p', q' and b are 25, 6, 1 (total 32 bits) respectively.
   * This function will return an int encoded in the following format
   *
   * p  - LSB p bits represent the register index
   * p' - LSB p' bits are used for increased accuracy in estimation
   * q' - q' bits after p' are left as such from the hashcode if b = 0 else
   *      q' bits encode the longest trailing zero run in (w-p) input bits
   * b  - 0 if longest trailing zero run is contained within (p'-p) bits
   *      1 if longest trailing zero run is computed from (w-p) input bits and
   *      its value is stored in q' bits
   * </pre>
   * @param hashcode - hashcode value
   * @return - encoded hash code
   */
  public int encodeHash(long hashcode) {
    // x = p' - p
    int x = (int) (hashcode & mask);
    if (x == 0) {
      // more bits should be considered for finding q (longest zero runs)
      // set MSB to 1
      int ntr = Long.numberOfTrailingZeros(hashcode >> p) + 1;
      long newHashCode = hashcode & pPrimeMask;
      newHashCode |= ntr << pPrime;
      newHashCode |= 0x80000000;
      return (int) newHashCode;
    } else {
      // q is contained within p' - p
      // set MSB to 0
      return (int) (hashcode & 0x7FFFFFFF);
    }
  }

  public int getSize() {
    return sparseMap.size() + tempListIdx;
  }

  public void merge(HLLRegister hllRegister) {
    if (hllRegister instanceof HLLSparseRegister) {
      HLLSparseRegister hsr = (HLLSparseRegister) hllRegister;

      // retain only the largest value for a register index
      for (Map.Entry<Integer, Byte> entry : hsr.getSparseMap().entrySet()) {
        int key = entry.getKey();
        byte value = entry.getValue();
        set(key, value);
      }
    } else {
      throw new IllegalArgumentException("Specified register not instance of HLLSparseRegister");
    }
  }

  /**
   * Retains only the largest value seen for a register index.
   * @param key - register index
   * @param value - candidate register value
   * @return true if the stored value was replaced
   */
  public boolean set(int key, byte value) {
    // primitive map returns 0 (the default) for an absent key, so a single
    // comparison handles both insert and update
    byte containedValue = sparseMap.get(key);
    if (value > containedValue) {
      sparseMap.put(key, value);
      return true;
    }
    return false;
  }

  public Int2ByteSortedMap getSparseMap() {
    return getMergedSparseMap();
  }

  private Int2ByteSortedMap getMergedSparseMap() {
    if (tempListIdx != 0) {
      mergeTempListToSparseMap();
    }
    return sparseMap;
  }

  // this is effectively the same as the dense register impl.
  public void extractLowBitsTo(HLLRegister dest) {
    for (Entry<Integer, Byte> entry : getSparseMap().entrySet()) {
      int idx = entry.getKey();
      byte lr = entry.getValue(); // this can be a max of 65, never > 127
      if (lr != 0) {
        // should be a no-op for sparse
        // BUG FIX: use 1L - p + lr - 1 can exceed 31, and an int shift wraps
        // the distance mod 32 (JLS 15.19), producing a wrong hashcode
        dest.add((1L << (p + lr - 1)) | idx);
      }
    }
  }

  public int getP() {
    return p;
  }

  public int getPPrime() {
    return pPrime;
  }

  @Override
  public String toString() {
    StringBuilder sb = new StringBuilder();
    sb.append("HLLSparseRegister - ");
    sb.append("p: ");
    sb.append(p);
    sb.append(" pPrime: ");
    sb.append(pPrime);
    sb.append(" qPrime: ");
    sb.append(qPrime);
    return sb.toString();
  }

  public String toExtendedString() {
    return toString() + " register: " + sparseMap.toString();
  }

  @Override
  public boolean equals(Object obj) {
    if (!(obj instanceof HLLSparseRegister)) {
      return false;
    }
    HLLSparseRegister other = (HLLSparseRegister) obj;
    boolean result = p == other.p && pPrime == other.pPrime && qPrime == other.qPrime
        && tempListIdx == other.tempListIdx;
    if (result) {
      for (int i = 0; i < tempListIdx; i++) {
        if (tempList[i] != other.tempList[i]) {
          return false;
        }
      }

      result = result && sparseMap.equals(other.sparseMap);
    }
    return result;
  }

  @Override
  public int hashCode() {
    int hashcode = 0;
    hashcode += 31 * p;
    hashcode += 31 * pPrime;
    hashcode += 31 * qPrime;
    for (int i = 0; i < tempListIdx; i++) {
      // BUG FIX: previously read tempList[tempListIdx] (one past the last
      // filled slot) on every iteration instead of tempList[i]
      hashcode += 31 * tempList[i];
    }
    hashcode += sparseMap.hashCode();
    return hashcode;
  }

}
--------------------------------------------------------------------------------
/src/java/com/github/prasanthj/hll/HyperLogLog.java:
--------------------------------------------------------------------------------
/**
 * Copyright 2017 Prasanth Jayachandran
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.github.prasanthj.hll;

import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.Map;

import it.unimi.dsi.fastutil.doubles.Double2IntAVLTreeMap;
import it.unimi.dsi.fastutil.doubles.Double2IntSortedMap;

/**
 * <pre>
 * This is an implementation of the following variants of hyperloglog (HLL)
 * algorithm
 * Original  - Original HLL algorithm from Flajolet et al. from
 *             http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf
 * HLLNoBias - Google's implementation of bias correction based on lookup table
 *             http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/40671.pdf
 * HLL++     - Google's implementation of HLL++ algorithm that uses SPARSE registers
 *             http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/40671.pdf
 *
 * Following are the constructor parameters that determine which algorithm is
 * used
 * numRegisterIndexBits - number of LSB hashcode bits to be used as register index.
 *                        Default is 14. min = 4 and max = 16
 * numHashBits - number of bits for hashcode. Default is 64. min = 32 and max = 128
 * encoding - Type of encoding to use (SPARSE or DENSE). The algorithm automatically
 *            switches to DENSE beyond a threshold. Default: SPARSE
 * enableBitPacking - To enable bit packing or not. Bit packing improves compression
 *                    at the cost of more CPU cycles. Default: true
 * noBias - Use Google's bias table lookup for short range bias correction.
 *          Enabling this will highly improve the estimation accuracy for short
 *          range values. Default: true
 *
 * NOTE: instances are not thread-safe; in particular the add* convenience
 * methods write to static scratch ByteBuffers shared across all instances.
 * </pre>
 */
public class HyperLogLog {
  private final static int DEFAULT_HASH_BITS = 64;
  private final static long HASH64_ZERO = Murmur3.hash64(new byte[]{0});
  private final static long HASH64_ONE = Murmur3.hash64(new byte[]{1});
  // shared scratch buffers for primitive-to-byte[] conversion; this makes the
  // add* methods non-thread-safe across every HyperLogLog instance
  private final static ByteBuffer SHORT_BUFFER = ByteBuffer.allocate(Short.BYTES);
  private final static ByteBuffer INT_BUFFER = ByteBuffer.allocate(Integer.BYTES);
  private final static ByteBuffer LONG_BUFFER = ByteBuffer.allocate(Long.BYTES);

  public enum EncodingType {
    SPARSE, DENSE
  }

  // number of bits to address registers
  private final int p;

  // number of registers - 2^p
  private final int m;

  // refer paper
  private float alphaMM;

  // enable/disable bias correction using table lookup
  private final boolean noBias;

  // enable/disable bitpacking
  private final boolean bitPacking;

  // Not making it configurable for perf reasons (avoid checks)
  private final int chosenHashBits = DEFAULT_HASH_BITS;

  private HLLDenseRegister denseRegister;
  private HLLSparseRegister sparseRegister;

  // counts are cached to avoid repeated complex computation. If register value
  // is updated the count will be computed again.
  private long cachedCount;
  private boolean invalidateCount;

  private EncodingType encoding;

  // threshold to switch from SPARSE to DENSE encoding
  private int encodingSwitchThreshold;

  private HyperLogLog(HyperLogLogBuilder hllBuilder) {
    if (hllBuilder.numRegisterIndexBits < HLLConstants.MIN_P_VALUE
        || hllBuilder.numRegisterIndexBits > HLLConstants.MAX_P_VALUE) {
      throw new IllegalArgumentException("p value should be between " + HLLConstants.MIN_P_VALUE
          + " to " + HLLConstants.MAX_P_VALUE);
    }
    this.p = hllBuilder.numRegisterIndexBits;
    this.m = 1 << p;
    this.noBias = hllBuilder.noBias;
    this.bitPacking = hllBuilder.bitPacking;

    // the threshold should be less than 12K bytes for p = 14.
    // The reason to divide by 5 is, in sparse mode after serialization the
    // entries in sparse map are compressed, and delta encoded as varints. The
    // worst case size of varints are 5 bytes. Hence, 12K/5 ~= 2400 entries in
    // sparse map.
    if (bitPacking) {
      this.encodingSwitchThreshold = ((m * 6) / 8) / 5;
    } else {
      // if bitpacking is disabled, all register values takes 8 bits and hence
      // we can be more flexible with the threshold. For p=14, 16K/5 = 3200
      // entries in sparse map can be allowed.
      this.encodingSwitchThreshold = m / 3;
    }

    // initializeAlpha(DEFAULT_HASH_BITS);
    // alphaMM value for 128 bits hash seems to perform better for default 64 hash bits
    this.alphaMM = 0.7213f / (1 + 1.079f / m);
    // For efficiency alpha is multiplied by m^2
    this.alphaMM = this.alphaMM * m * m;

    this.cachedCount = -1;
    this.invalidateCount = false;
    this.encoding = hllBuilder.encoding;
    if (encoding.equals(EncodingType.SPARSE)) {
      this.sparseRegister = new HLLSparseRegister(p, HLLConstants.P_PRIME_VALUE,
          HLLConstants.Q_PRIME_VALUE);
      this.denseRegister = null;
    } else {
      this.sparseRegister = null;
      this.denseRegister = new HLLDenseRegister(p, bitPacking);
    }
  }

  public static HyperLogLogBuilder builder() {
    return new HyperLogLogBuilder();
  }

  /** Fluent builder for {@link HyperLogLog}; see class javadoc for parameter semantics. */
  public static class HyperLogLogBuilder {
    private int numRegisterIndexBits = 14;
    private EncodingType encoding = EncodingType.SPARSE;
    private boolean bitPacking = true;
    private boolean noBias = true;

    public HyperLogLogBuilder() {
    }

    public HyperLogLogBuilder setNumRegisterIndexBits(int b) {
      this.numRegisterIndexBits = b;
      return this;
    }

    public HyperLogLogBuilder setEncoding(EncodingType enc) {
      this.encoding = enc;
      return this;
    }

    public HyperLogLogBuilder enableBitPacking(boolean b) {
      this.bitPacking = b;
      return this;
    }

    public HyperLogLogBuilder enableNoBias(boolean nb) {
      this.noBias = nb;
      return this;
    }

    public HyperLogLog build() {
      return new HyperLogLog(this);
    }
  }

  // see paper for alpha initialization. Currently unused: the constructor uses
  // the 128-bit alpha formula directly (see comment there).
  private void initializeAlpha(final int hashBits) {
    if (hashBits <= 16) {
      alphaMM = 0.673f;
    } else if (hashBits <= 32) {
      alphaMM = 0.697f;
    } else if (hashBits <= 64) {
      alphaMM = 0.709f;
    } else {
      alphaMM = 0.7213f / (float) (1 + 1.079f / m);
    }

    // For efficiency alpha is multiplied by m^2
    alphaMM = alphaMM * m * m;
  }

  public void addBoolean(boolean val) {
    add(val ? HASH64_ONE : HASH64_ZERO);
  }

  public void addByte(byte val) {
    add(Murmur3.hash64(new byte[]{val}));
  }

  public void addBytes(byte[] val) {
    add(Murmur3.hash64(val));
  }

  public void addShort(short val) {
    SHORT_BUFFER.putShort(0, val);
    add(Murmur3.hash64(SHORT_BUFFER.array()));
  }

  public void addInt(int val) {
    INT_BUFFER.putInt(0, val);
    add(Murmur3.hash64(INT_BUFFER.array()));
  }

  public void addLong(long val) {
    LONG_BUFFER.putLong(0, val);
    add(Murmur3.hash64(LONG_BUFFER.array()));
  }

  public void addFloat(float val) {
    INT_BUFFER.putFloat(0, val);
    add(Murmur3.hash64(INT_BUFFER.array()));
  }

  public void addDouble(double val) {
    LONG_BUFFER.putDouble(0, val);
    add(Murmur3.hash64(LONG_BUFFER.array()));
  }

  public void addChar(char val) {
    SHORT_BUFFER.putChar(0, val);
    add(Murmur3.hash64(SHORT_BUFFER.array()));
  }

  /**
   * Java's default charset will be used for strings.
   * @param val
   *          - input string
   */
  public void addString(String val) {
    add(Murmur3.hash64(val.getBytes()));
  }

  public void addString(String val, Charset charset) {
    add(Murmur3.hash64(val.getBytes(charset)));
  }

  public void add(long hashcode) {
    if (encoding.equals(EncodingType.SPARSE)) {
      if (sparseRegister.add(hashcode)) {
        invalidateCount = true;
      }

      // if size of sparse map exceeds the threshold convert the sparse map to
      // dense register and switch to DENSE encoding
      if (sparseRegister.getSize() > encodingSwitchThreshold) {
        encoding = EncodingType.DENSE;
        denseRegister = sparseToDenseRegister(sparseRegister);
        sparseRegister = null;
        invalidateCount = true;
      }
    } else {
      if (denseRegister.add(hashcode)) {
        invalidateCount = true;
      }
    }
  }

  /**
   * Returns the estimated cardinality. The estimate is cached and only
   * recomputed after the register state has changed.
   */
  public long count() {

    // compute count only if the register values are updated else return the
    // cached count
    if (invalidateCount || cachedCount < 0) {
      if (encoding.equals(EncodingType.SPARSE)) {

        // if encoding is still SPARSE use linear counting with increased
        // accuracy (as we use pPrime bits for register index)
        int mPrime = 1 << sparseRegister.getPPrime();
        cachedCount = linearCount(mPrime, mPrime - sparseRegister.getSparseMap().size());
      } else {

        // for DENSE encoding, use bias table lookup for HLLNoBias algorithm
        // else fallback to HLLOriginal algorithm
        double sum = denseRegister.getSumInversePow2();
        long numZeros = denseRegister.getNumZeroes();

        // cardinality estimate from normalized bias corrected harmonic mean on
        // the registers
        cachedCount = (long) (alphaMM * (1.0 / sum));
        long pow = (long) Math.pow(2, chosenHashBits);

        // when bias correction is enabled
        if (noBias) {
          cachedCount = cachedCount <= 5 * m ? (cachedCount - estimateBias(cachedCount))
              : cachedCount;
          long h = cachedCount;
          if (numZeros != 0) {
            h = linearCount(m, numZeros);
          }

          if (h < getThreshold()) {
            cachedCount = h;
          }
        } else {
          // HLL algorithm shows stronger bias for values in (2.5 * m) range.
          // To compensate for this short range bias, linear counting is used
          // for values before this short range. The original paper also says
          // similar bias is seen for long range values due to hash collisions
          // in range >1/30*(2^32). For the default case, we do not have to
          // worry about this long range bias as the paper used 32-bit hashing
          // and we use 64-bit hashing as default. 2^64 values are too high to
          // observe long range bias (hash collisions).
          if (cachedCount <= 2.5 * m) {

            // for short range use linear counting
            if (numZeros != 0) {
              cachedCount = linearCount(m, numZeros);
            }
          } else if (chosenHashBits < 64 && cachedCount > (0.033333 * pow)) {

            // long range bias for 32-bit hashcodes
            // BUG FIX: (1 / 30) is integer division and evaluates to 0, which
            // made this guard always true; pow / 30 is ~3.3% of 2^hashBits
            // as the paper intends
            if (cachedCount > pow / 30) {
              cachedCount = (long) (-pow * Math.log(1.0 - (double) cachedCount / (double) pow));
            }
          }
        }
      }
      invalidateCount = false;
    }

    return cachedCount;
  }

  private long getThreshold() {
    return (long) (HLLConstants.thresholdData[p - 4] + 0.5);
  }

  /**
   * Estimate bias from lookup table
   * @param count
   *          - cardinality before bias correction
   * @return cardinality after bias correction
   */
  private long estimateBias(long count) {
    double[] rawEstForP = HLLConstants.rawEstimateData[p - 4];

    // compute distance and store it in sorted map
    Double2IntSortedMap estIndexMap = new Double2IntAVLTreeMap();
    double distance = 0;
    for (int i = 0; i < rawEstForP.length; i++) {
      distance = Math.pow(count - rawEstForP[i], 2);
      estIndexMap.put(distance, i);
    }

    // take top-k closest neighbors and compute the bias corrected cardinality
    long result = 0;
    double[] biasForP = HLLConstants.biasData[p - 4];
    double biasSum = 0;
    int kNeighbors = HLLConstants.K_NEAREST_NEIGHBOR;
    for (Map.Entry<Double, Integer> entry : estIndexMap.entrySet()) {
      biasSum += biasForP[entry.getValue()];
      kNeighbors--;
      if (kNeighbors <= 0) {
        break;
      }
    }

    // 0.5 added for rounding off
    result = (long) ((biasSum / HLLConstants.K_NEAREST_NEIGHBOR) + 0.5);
    return result;
  }

  public void setCount(long count) {
    this.cachedCount = count;
    this.invalidateCount = true;
  }

  private long linearCount(int mVal, long numZeros) {
    return (long) (Math.round(mVal * Math.log(mVal / ((double) numZeros))));
  }

  // refer paper
  public double getStandardError() {
    return 1.04 / Math.sqrt(m);
  }

  public HLLDenseRegister getHLLDenseRegister() {
    return denseRegister;
  }

  public HLLSparseRegister getHLLSparseRegister() {
    return sparseRegister;
  }

  /**
   * Reconstruct sparse map from serialized integer list
   * @param reg
   *          - uncompressed and delta decoded integer list
   */
  public void setHLLSparseRegister(int[] reg) {
    for (int i : reg) {
      // NOTE(review): key/value extraction here (>>> Q_PRIME_VALUE, & 0x3f)
      // must mirror the serializer's encoding - verify against HyperLogLogUtils
      int key = i >>> HLLConstants.Q_PRIME_VALUE;
      byte value = (byte) (i & 0x3f);
      sparseRegister.set(key, value);
    }
  }

  /**
   * Reconstruct dense registers from byte array
   * @param reg
   *          - unpacked byte array
   */
  public void setHLLDenseRegister(byte[] reg) {
    int i = 0;
    for (byte b : reg) {
      denseRegister.set(i, b);
      i++;
    }
  }

  /**
   * Merge the specified hyperloglog to the current one. Encoding switches
   * automatically after merge if the encoding switch threshold is exceeded.
   * @param hll
   *          - hyperloglog to be merged
   * @throws IllegalArgumentException - thrown when incompatible HLLs are merged
   */
  public void merge(HyperLogLog hll) {

    if (chosenHashBits != hll.chosenHashBits) {
      throw new IllegalArgumentException(
          "HyperLogLog cannot be merged as either p or hashbits are different. Current: "
              + toString() + " Provided: " + hll.toString());
    }
    if (p > hll.p) {
      throw new IllegalArgumentException(
          "HyperLogLog cannot merge a smaller p into a larger one : "
              + toString() + " Provided: " + hll.toString());
    }
    if (p != hll.p) {
      // invariant: p > hll.p
      hll = hll.squash(p);
    }

    EncodingType otherEncoding = hll.getEncoding();

    if (encoding.equals(EncodingType.SPARSE) && otherEncoding.equals(EncodingType.SPARSE)) {
      sparseRegister.merge(hll.getHLLSparseRegister());
      // if after merge the sparse switching threshold is exceeded then change
      // to dense encoding
      if (sparseRegister.getSize() > encodingSwitchThreshold) {
        encoding = EncodingType.DENSE;
        denseRegister = sparseToDenseRegister(sparseRegister);
        sparseRegister = null;
      }
    } else if (encoding.equals(EncodingType.DENSE) && otherEncoding.equals(EncodingType.DENSE)) {
      denseRegister.merge(hll.getHLLDenseRegister());
    } else if (encoding.equals(EncodingType.SPARSE) && otherEncoding.equals(EncodingType.DENSE)) {
      denseRegister = sparseToDenseRegister(sparseRegister);
      denseRegister.merge(hll.getHLLDenseRegister());
      sparseRegister = null;
      encoding = EncodingType.DENSE;
    } else if (encoding.equals(EncodingType.DENSE) && otherEncoding.equals(EncodingType.SPARSE)) {
      HLLDenseRegister otherDenseRegister = sparseToDenseRegister(hll.getHLLSparseRegister());
      denseRegister.merge(otherDenseRegister);
    }

    invalidateCount = true;
  }

  /**
   * Reduces the accuracy of the HLL provided to a smaller size
   * @param p0
   *          - new p size for the new HyperLogLog (smaller or no change)
   * @return reduced (or same) HyperLogLog instance
   */
  public HyperLogLog squash(final int p0) {
    if (p0 > p) {
      throw new IllegalArgumentException(
          "HyperLogLog cannot be be squashed to be bigger. Current: "
              + toString() + " Provided: " + p0);
    }

    if (p0 == p) {
      return this;
    }

    final HyperLogLog hll = new HyperLogLogBuilder()
        .setNumRegisterIndexBits(p0).setEncoding(EncodingType.DENSE)
        .enableNoBias(noBias).build();
    final HLLDenseRegister result = hll.denseRegister;

    if (encoding == EncodingType.SPARSE) {
      sparseRegister.extractLowBitsTo(result);
    } else if (encoding == EncodingType.DENSE) {
      denseRegister.extractLowBitsTo(result);
    }
    return hll;
  }

  /**
   * Converts sparse to dense hll register
   * @param sparseRegister
   *          - sparse register to be converted
   * @return converted dense register
   */
  private HLLDenseRegister sparseToDenseRegister(HLLSparseRegister sparseRegister) {
    if (sparseRegister == null) {
      return null;
    }
    int p = sparseRegister.getP();
    int pMask = (1 << p) - 1;
    HLLDenseRegister result = new HLLDenseRegister(p, bitPacking);
    for (Map.Entry<Integer, Byte> entry : sparseRegister.getSparseMap().entrySet()) {
      int key = entry.getKey();
      int idx = key & pMask;
      result.set(idx, entry.getValue());
    }
    return result;
  }

  @Override
  public String toString() {
    StringBuilder sb = new StringBuilder();
    sb.append("Encoding: ");
    sb.append(encoding);
    sb.append(", p: ");
    sb.append(p);
    sb.append(", estimatedCardinality: ");
    sb.append(count());
    return sb.toString();
  }

  public String toStringExtended() {
    if (encoding.equals(EncodingType.DENSE)) {
      return toString() + ", " + denseRegister.toExtendedString();
    } else if (encoding.equals(EncodingType.SPARSE)) {
      return toString() + ", " + sparseRegister.toExtendedString();
    }

    return toString();
  }

  public int getNumRegisterIndexBits() {
    return p;
  }

  public EncodingType getEncoding() {
    return encoding;
  }

  public void setEncoding(EncodingType encoding) {
    this.encoding = encoding;
  }

  @Override
  public boolean equals(Object obj) {
    if (!(obj instanceof HyperLogLog)) {
      return false;
    }

    HyperLogLog other = (HyperLogLog) obj;
    long count = count();
    long otherCount = other.count();
    boolean result = p == other.p && chosenHashBits == other.chosenHashBits
        && encoding.equals(other.encoding) && count == otherCount;
    if (encoding.equals(EncodingType.DENSE)) {
      result = result && denseRegister.equals(other.getHLLDenseRegister());
    }

    if (encoding.equals(EncodingType.SPARSE)) {
      result = result && sparseRegister.equals(other.getHLLSparseRegister());
    }
    return result;
  }

  @Override
  public int hashCode() {
    int hashcode = 0;
    hashcode += 31 * p;
    hashcode += 31 * chosenHashBits;
    hashcode += encoding.hashCode();
    hashcode += 31 * count();
    if (encoding.equals(EncodingType.DENSE)) {
      hashcode += 31 * denseRegister.hashCode();
    }

    if (encoding.equals(EncodingType.SPARSE)) {
      hashcode += 31 * sparseRegister.hashCode();
    }
    return hashcode;
  }
}
--------------------------------------------------------------------------------
/src/java/com/github/prasanthj/hll/HyperLogLogUtils.java:
/**
 * Copyright 2017 Prasanth Jayachandran
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.github.prasanthj.hll;

import com.github.prasanthj.hll.HyperLogLog.EncodingType;
import it.unimi.dsi.fastutil.ints.Int2ByteSortedMap;

import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Arrays;
import java.util.Map;

/**
 * HyperLogLog serialization utilities.
 */
public class HyperLogLogUtils {

  public static final byte[] MAGIC = new byte[] { 'H', 'L', 'L' };

  /**
   * HyperLogLog is serialized using the following format
   *
   * <pre>
   * |-4 byte-|------varlong----|varint (optional)|----------|
   * ---------------------------------------------------------
   * | header | estimated-count | register-length | register |
   * ---------------------------------------------------------
   *
   * 4 byte header is encoded like below
   * 3 bytes - HLL magic string to identify serialized stream
   * 4 bits  - p (number of bits to be used as register index)
   * 1 bit   - spare bit (not used)
   * 3 bits  - encoding (000 - sparse, 001..110 - n bit packing, 111 - no bit packing)
   *
   * Followed by header are 3 fields that are required for reconstruction
   * of hyperloglog
   * Estimated count - variable length long to store last computed estimated count.
   *                   This is just for quick lookup without deserializing registers
   * Register length - number of entries in the register (required only
   *                   for sparse representation. For bit-packing, the register
   *                   length can be found from p)
   * </pre>
   *
   * NOTE(review): p only gets 4 header bits, so p == 16 cannot round-trip
   * through this format — confirm serialized HLLs are limited to p &lt;= 15.
   *
   * @param out
   *          - output stream to write to
   * @param hll
   *          - hyperloglog that needs to be serialized
   * @throws IOException - thrown by OutputStream
   */
  public static void serializeHLL(OutputStream out, HyperLogLog hll) throws IOException {

    // write header
    out.write(MAGIC);
    int fourthByte = 0;
    int p = hll.getNumRegisterIndexBits();
    fourthByte = (p & 0xff) << 4;

    int bitWidth = 0;
    EncodingType enc = hll.getEncoding();

    // determine bit width for bitpacking and encode it in header
    if (enc.equals(EncodingType.DENSE)) {
      int lzr = hll.getHLLDenseRegister().getMaxRegisterValue();
      bitWidth = getBitWidth(lzr);

      // the max value of number of zeroes for 64 bit hash can be encoded using
      // only 6 bits. So we will disable bit packing for any values >6
      if (bitWidth > 6) {
        fourthByte |= 7;
        bitWidth = 8;
      } else {
        fourthByte |= (bitWidth & 7);
      }
    }

    // write fourth byte of header
    out.write(fourthByte);

    // write estimated count
    long estCount = hll.count();
    writeVulong(out, estCount);

    // serialize dense/sparse registers. Dense registers are bitpacked whereas
    // sparse registers are delta and variable length encoded
    if (enc.equals(EncodingType.DENSE)) {
      byte[] register = hll.getHLLDenseRegister().getRegister();
      bitpackHLLRegister(out, register, bitWidth);
    } else if (enc.equals(EncodingType.SPARSE)) {
      Map<Integer, Byte> sparseMap = hll.getHLLSparseRegister().getSparseMap();

      // write the number of elements in sparse map (required for
      // reconstruction)
      writeVulong(out, sparseMap.size());

      // compute deltas and write the values as varints; the map is sorted by
      // key, so deltas are non-negative
      int prev = 0;
      for (Map.Entry<Integer, Byte> entry : sparseMap.entrySet()) {
        if (prev == 0) {
          prev = (entry.getKey() << HLLConstants.Q_PRIME_VALUE) | entry.getValue();
          writeVulong(out, prev);
        } else {
          int curr = (entry.getKey() << HLLConstants.Q_PRIME_VALUE) | entry.getValue();
          int delta = curr - prev;
          writeVulong(out, delta);
          prev = curr;
        }
      }
    }
  }

  /**
   * Refer serializeHLL() for the serialization format. This function
   * deserializes serialized hyperloglogs.
   * @param in
   *          - input stream
   * @return deserialized hyperloglog
   * @throws IOException - thrown by InputStream
   */
  public static HyperLogLog deserializeHLL(InputStream in) throws IOException {
    checkMagicString(in);
    int fourthByte = in.read() & 0xff;
    int p = fourthByte >>> 4;

    // read type of encoding from the low 3 bits of the header byte
    int enc = fourthByte & 7;
    EncodingType encoding = null;
    int bitSize = 0;
    if (enc == 0) {
      encoding = EncodingType.SPARSE;
    } else if (enc > 0 && enc < 7) {
      bitSize = enc;
      encoding = EncodingType.DENSE;
    } else {
      // bit packing disabled
      bitSize = 8;
      encoding = EncodingType.DENSE;
    }

    // estimated count
    long estCount = readVulong(in);

    HyperLogLog result = null;
    if (encoding.equals(EncodingType.SPARSE)) {
      result = HyperLogLog.builder().setNumRegisterIndexBits(p)
          .setEncoding(EncodingType.SPARSE).build();
      int numRegisterEntries = (int) readVulong(in);
      int[] reg = new int[numRegisterEntries];
      int prev = 0;

      // reconstruct the sparse map from delta encoded and varint input stream
      if (numRegisterEntries > 0) {
        prev = (int) readVulong(in);
        reg[0] = prev;
      }
      int delta = 0;
      int curr = 0;
      for (int i = 1; i < numRegisterEntries; i++) {
        delta = (int) readVulong(in);
        curr = prev + delta;
        reg[i] = curr;
        prev = curr;
      }
      result.setHLLSparseRegister(reg);
    } else {

      // explicitly disable bit packing when the encoder did
      if (bitSize == 8) {
        result = HyperLogLog.builder().setNumRegisterIndexBits(p)
            .setEncoding(EncodingType.DENSE).enableBitPacking(false).build();
      } else {
        result = HyperLogLog.builder().setNumRegisterIndexBits(p)
            .setEncoding(EncodingType.DENSE).enableBitPacking(true).build();
      }
      int m = 1 << p;
      byte[] register = unpackHLLRegister(in, m, bitSize);
      result.setHLLDenseRegister(register);
    }

    result.setCount(estCount);

    return result;
  }

  /**
   * Bit packs the dense HLL register into the output stream using bitWidth
   * bits per register value. The register length (a power of two >= 16) times
   * any bitWidth is a multiple of 8, so the loop always ends on a byte
   * boundary and no trailing partial byte needs flushing.
   *
   * @param out output stream
   * @param register dense register values, each &lt; 2^bitWidth
   * @param bitWidth bits per value; 8 disables packing
   * @throws IOException thrown by OutputStream
   */
  private static void bitpackHLLRegister(OutputStream out, byte[] register, int bitWidth)
      throws IOException {
    int bitsLeft = 8;
    byte current = 0;

    if (bitWidth == 8) {
      fastPathWrite(out, register);
      return;
    }

    // write the blob
    for (byte value : register) {
      int bitsToWrite = bitWidth;
      while (bitsToWrite > bitsLeft) {
        // add the bits to the bottom of the current word
        current |= value >>> (bitsToWrite - bitsLeft);
        // subtract out the bits we just added
        bitsToWrite -= bitsLeft;
        // zero out the bits above bitsToWrite
        value &= (1 << bitsToWrite) - 1;
        out.write(current);
        current = 0;
        bitsLeft = 8;
      }
      bitsLeft -= bitsToWrite;
      current |= value << bitsLeft;
      if (bitsLeft == 0) {
        out.write(current);
        current = 0;
        bitsLeft = 8;
      }
    }

    out.flush();
  }

  /** Writes register bytes through unmodified (bit packing disabled). */
  private static void fastPathWrite(OutputStream out, byte[] register) throws IOException {
    for (byte b : register) {
      out.write(b);
    }
  }

  /**
   * Unpack the bitpacked HyperLogLog register.
   * @param in
   *          - input stream
   * @param length
   *          - number of register entries to read (2^p), not the byte count
   * @param bitSize
   *          - bits per register value; 8 means no packing
   * @return unpacked HLL register
   * @throws IOException thrown by InputStream
   */
  private static byte[] unpackHLLRegister(InputStream in, int length, int bitSize)
      throws IOException {
    int mask = (1 << bitSize) - 1;
    int bitsLeft = 8;

    if (bitSize == 8) {
      return fastPathRead(in, length);
    }

    // NOTE(review): in.read() returning -1 (EOF) is not detected in this
    // path; a truncated stream yields garbage registers rather than an error.
    byte current = (byte) (0xff & in.read());

    byte[] output = new byte[length];
    for (int i = 0; i < output.length; i++) {
      byte result = 0;
      int bitsLeftToRead = bitSize;
      while (bitsLeftToRead > bitsLeft) {
        // consume the rest of the current byte
        result <<= bitsLeft;
        result |= current & ((1 << bitsLeft) - 1);
        bitsLeftToRead -= bitsLeft;
        current = (byte) (0xff & in.read());
        bitsLeft = 8;
      }
      if (bitsLeftToRead > 0) {
        result <<= bitsLeftToRead;
        bitsLeft -= bitsLeftToRead;
        result |= (current >>> bitsLeft) & ((1 << bitsLeftToRead) - 1);
      }
      output[i] = (byte) (result & mask);
    }
    return output;
  }

  /**
   * Reads exactly length register bytes (bit packing disabled).
   * FIX: EOF (-1) was previously cast silently to 0xff; a truncated stream
   * now fails fast, consistent with readVulong().
   */
  private static byte[] fastPathRead(InputStream in, int length) throws IOException {
    byte[] result = new byte[length];
    for (int i = 0; i < length; i++) {
      int b = in.read();
      if (b < 0) {
        throw new EOFException("Reading HLL register past EOF");
      }
      result[i] = (byte) b;
    }
    return result;
  }

  /**
   * Get estimated cardinality without deserializing HLL
   * @param in
   *          - serialized HLL
   * @return - cardinality
   * @throws IOException - thrown by InputStream
   */
  public static long getEstimatedCountFromSerializedHLL(InputStream in) throws IOException {
    checkMagicString(in);
    // skip the fourth header byte (p + encoding); the varlong count follows
    in.read();
    return readVulong(in);
  }

  /**
   * Check if the specified input stream is actually a HLL stream
   * @param in
   *          - input stream
   * @throws IOException thrown by InputStream
   */
  private static void checkMagicString(InputStream in) throws IOException {
    byte[] magic = new byte[3];
    magic[0] = (byte) in.read();
    magic[1] = (byte) in.read();
    magic[2] = (byte) in.read();

    if (!Arrays.equals(magic, MAGIC)) {
      throw new IllegalArgumentException("The input stream is not a HyperLogLog stream.");
    }
  }

  /**
   * Minimum bits required to encode the specified value.
   * @param val
   *          - input value (register maxima, expected non-negative)
   * @return number of significant bits in val
   */
  private static int getBitWidth(int val) {
    // FIX: dropped a spurious (byte) cast on the shifted value; it was
    // harmless for valid register values (<= 64) but wrong for inputs > 255
    int count = 0;
    while (val != 0) {
      count++;
      val >>>= 1;
    }
    return count;
  }

  /**
   * Return relative error between actual and estimated cardinality
   * @param actualCount
   *          - actual count (must be non-zero)
   * @param estimatedCount
   *          - estimated count
   * @return relative error in percent (negative when overestimated)
   */
  public static float getRelativeError(long actualCount, long estimatedCount) {
    float err = (1.0f - ((float) estimatedCount / (float) actualCount)) * 100.0f;
    return err;
  }

  /**
   * Write variable length encoded longs to output stream
   * @param output
   *          - out stream
   * @param value
   *          - long, treated as unsigned; 7 bits per byte, LSB first
   * @throws IOException thrown by OutputStream
   */
  private static void writeVulong(OutputStream output, long value) throws IOException {
    while (true) {
      if ((value & ~0x7f) == 0) {
        output.write((byte) value);
        return;
      } else {
        output.write((byte) (0x80 | (value & 0x7f)));
        value >>>= 7;
      }
    }
  }

  /**
   * Read variable length encoded longs from input stream
   * @param in
   *          - input stream
   * @return decoded long value
   * @throws IOException thrown by InputStream
   */
  private static long readVulong(InputStream in) throws IOException {
    long result = 0;
    long b;
    int offset = 0;
    do {
      b = in.read();
      if (b == -1) {
        throw new EOFException("Reading Vulong past EOF");
      }
      result |= (0x7f & b) << offset;
      offset += 7;
    } while (b >= 0x80);
    return result;
  }

}
/**
 * Murmur3 32 and 128 bit variants.
 * 32-bit Java port of https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp#94
 * 128-bit Java port of https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp#255
 */
public class Murmur3 {
  // Constants for 32 bit variant
  private static final int C1_32 = 0xcc9e2d51;
  private static final int C2_32 = 0x1b873593;
  private static final int R1_32 = 15;
  private static final int R2_32 = 13;
  private static final int M_32 = 5;
  private static final int N_32 = 0xe6546b64;

  // Constants for 128 bit variant
  private static final long C1 = 0x87c37b91114253d5L;
  private static final long C2 = 0x4cf5ad432745937fL;
  private static final int R1 = 31;
  private static final int R2 = 27;
  private static final int R3 = 33;
  private static final int M = 5;
  private static final int N1 = 0x52dce729;
  private static final int N2 = 0x38495ab5;

  private static final int DEFAULT_SEED = 123;

  /**
   * Murmur3 32-bit variant.
   *
   * @param data - input byte array
   * @return - hashcode
   */
  public static int hash32(byte[] data) {
    return hash32(data, data.length, DEFAULT_SEED);
  }

  /**
   * Murmur3 32-bit variant.
   *
   * @param data - input byte array
   * @param length - number of bytes of data to hash
   * @param seed - seed (the no-arg overload uses DEFAULT_SEED, 123)
   * @return - hashcode
   */
  public static int hash32(byte[] data, int length, int seed) {
    int hash = seed;
    final int nblocks = length >> 2;

    // body: 4-byte little-endian blocks
    for (int i = 0; i < nblocks; i++) {
      int i_4 = i << 2;
      int k = (data[i_4] & 0xff)
          | ((data[i_4 + 1] & 0xff) << 8)
          | ((data[i_4 + 2] & 0xff) << 16)
          | ((data[i_4 + 3] & 0xff) << 24);

      // mix functions
      k *= C1_32;
      k = Integer.rotateLeft(k, R1_32);
      k *= C2_32;
      hash ^= k;
      hash = Integer.rotateLeft(hash, R2_32) * M_32 + N_32;
    }

    // tail: remaining 1-3 bytes.
    // FIX: tail bytes must be masked with 0xff before shifting. Java bytes
    // are signed and previously sign-extended here, diverging from the
    // reference C implementation (hash64/hash128 below already mask).
    // NOTE: this changes hash32 output for inputs whose trailing 1-3 bytes
    // have the high bit set; sketches built with the old hash32 are not
    // comparable for such inputs.
    int idx = nblocks << 2;
    int k1 = 0;
    switch (length - idx) {
    case 3:
      k1 ^= (data[idx + 2] & 0xff) << 16;
    case 2:
      k1 ^= (data[idx + 1] & 0xff) << 8;
    case 1:
      k1 ^= (data[idx] & 0xff);

      // mix functions
      k1 *= C1_32;
      k1 = Integer.rotateLeft(k1, R1_32);
      k1 *= C2_32;
      hash ^= k1;
    }

    // finalization (fmix32)
    hash ^= length;
    hash ^= (hash >>> 16);
    hash *= 0x85ebca6b;
    hash ^= (hash >>> 13);
    hash *= 0xc2b2ae35;
    hash ^= (hash >>> 16);

    return hash;
  }

  /**
   * Murmur3 64-bit variant.
   * NOTE(review): the javadoc previously claimed this equals the MSB 8 bytes
   * of the 128-bit variant; this implementation mixes differently and is a
   * standalone 64-bit hash — confirm before relying on cross-compatibility.
   *
   * @param data - input byte array
   * @return - hashcode
   */
  public static long hash64(byte[] data) {
    return hash64(data, data.length, DEFAULT_SEED);
  }

  /**
   * Murmur3 64-bit variant.
   *
   * @param data - input byte array
   * @param length - number of bytes of data to hash
   * @param seed - seed (the no-arg overload uses DEFAULT_SEED, 123)
   * @return - hashcode
   */
  public static long hash64(byte[] data, int length, int seed) {
    long hash = seed;
    final int nblocks = length >> 3;

    // body: 8-byte little-endian blocks
    for (int i = 0; i < nblocks; i++) {
      final int i8 = i << 3;
      long k = ((long) data[i8] & 0xff)
          | (((long) data[i8 + 1] & 0xff) << 8)
          | (((long) data[i8 + 2] & 0xff) << 16)
          | (((long) data[i8 + 3] & 0xff) << 24)
          | (((long) data[i8 + 4] & 0xff) << 32)
          | (((long) data[i8 + 5] & 0xff) << 40)
          | (((long) data[i8 + 6] & 0xff) << 48)
          | (((long) data[i8 + 7] & 0xff) << 56);

      // mix functions
      k *= C1;
      k = Long.rotateLeft(k, R1);
      k *= C2;
      hash ^= k;
      hash = Long.rotateLeft(hash, R2) * M + N1;
    }

    // tail: remaining 1-7 bytes (already correctly masked)
    long k1 = 0;
    int tailStart = nblocks << 3;
    switch (length - tailStart) {
    case 7:
      k1 ^= ((long) data[tailStart + 6] & 0xff) << 48;
    case 6:
      k1 ^= ((long) data[tailStart + 5] & 0xff) << 40;
    case 5:
      k1 ^= ((long) data[tailStart + 4] & 0xff) << 32;
    case 4:
      k1 ^= ((long) data[tailStart + 3] & 0xff) << 24;
    case 3:
      k1 ^= ((long) data[tailStart + 2] & 0xff) << 16;
    case 2:
      k1 ^= ((long) data[tailStart + 1] & 0xff) << 8;
    case 1:
      k1 ^= ((long) data[tailStart] & 0xff);
      k1 *= C1;
      k1 = Long.rotateLeft(k1, R1);
      k1 *= C2;
      hash ^= k1;
    }

    // finalization
    hash ^= length;
    hash = fmix64(hash);

    return hash;
  }

  /**
   * Murmur3 128-bit variant.
   *
   * @param data - input byte array
   * @return - hashcode (2 longs)
   */
  public static long[] hash128(byte[] data) {
    return hash128(data, data.length, DEFAULT_SEED);
  }

  /**
   * Murmur3 128-bit variant.
   *
   * @param data - input byte array
   * @param length - number of bytes of data to hash
   * @param seed - seed (the no-arg overload uses DEFAULT_SEED, 123)
   * @return - hashcode (2 longs)
   */
  public static long[] hash128(byte[] data, int length, int seed) {
    long h1 = seed;
    long h2 = seed;
    final int nblocks = length >> 4;

    // body: 16-byte blocks as two little-endian longs
    for (int i = 0; i < nblocks; i++) {
      final int i16 = i << 4;
      long k1 = ((long) data[i16] & 0xff)
          | (((long) data[i16 + 1] & 0xff) << 8)
          | (((long) data[i16 + 2] & 0xff) << 16)
          | (((long) data[i16 + 3] & 0xff) << 24)
          | (((long) data[i16 + 4] & 0xff) << 32)
          | (((long) data[i16 + 5] & 0xff) << 40)
          | (((long) data[i16 + 6] & 0xff) << 48)
          | (((long) data[i16 + 7] & 0xff) << 56);

      long k2 = ((long) data[i16 + 8] & 0xff)
          | (((long) data[i16 + 9] & 0xff) << 8)
          | (((long) data[i16 + 10] & 0xff) << 16)
          | (((long) data[i16 + 11] & 0xff) << 24)
          | (((long) data[i16 + 12] & 0xff) << 32)
          | (((long) data[i16 + 13] & 0xff) << 40)
          | (((long) data[i16 + 14] & 0xff) << 48)
          | (((long) data[i16 + 15] & 0xff) << 56);

      // mix functions for k1
      k1 *= C1;
      k1 = Long.rotateLeft(k1, R1);
      k1 *= C2;
      h1 ^= k1;
      h1 = Long.rotateLeft(h1, R2);
      h1 += h2;
      h1 = h1 * M + N1;

      // mix functions for k2
      k2 *= C2;
      k2 = Long.rotateLeft(k2, R3);
      k2 *= C1;
      h2 ^= k2;
      h2 = Long.rotateLeft(h2, R1);
      h2 += h1;
      h2 = h2 * M + N2;
    }

    // tail: remaining 1-15 bytes (already correctly masked)
    long k1 = 0;
    long k2 = 0;
    int tailStart = nblocks << 4;
    switch (length - tailStart) {
    case 15:
      k2 ^= (long) (data[tailStart + 14] & 0xff) << 48;
    case 14:
      k2 ^= (long) (data[tailStart + 13] & 0xff) << 40;
    case 13:
      k2 ^= (long) (data[tailStart + 12] & 0xff) << 32;
    case 12:
      k2 ^= (long) (data[tailStart + 11] & 0xff) << 24;
    case 11:
      k2 ^= (long) (data[tailStart + 10] & 0xff) << 16;
    case 10:
      k2 ^= (long) (data[tailStart + 9] & 0xff) << 8;
    case 9:
      k2 ^= (long) (data[tailStart + 8] & 0xff);
      k2 *= C2;
      k2 = Long.rotateLeft(k2, R3);
      k2 *= C1;
      h2 ^= k2;

    case 8:
      k1 ^= (long) (data[tailStart + 7] & 0xff) << 56;
    case 7:
      k1 ^= (long) (data[tailStart + 6] & 0xff) << 48;
    case 6:
      k1 ^= (long) (data[tailStart + 5] & 0xff) << 40;
    case 5:
      k1 ^= (long) (data[tailStart + 4] & 0xff) << 32;
    case 4:
      k1 ^= (long) (data[tailStart + 3] & 0xff) << 24;
    case 3:
      k1 ^= (long) (data[tailStart + 2] & 0xff) << 16;
    case 2:
      k1 ^= (long) (data[tailStart + 1] & 0xff) << 8;
    case 1:
      k1 ^= (long) (data[tailStart] & 0xff);
      k1 *= C1;
      k1 = Long.rotateLeft(k1, R1);
      k1 *= C2;
      h1 ^= k1;
    }

    // finalization
    h1 ^= length;
    h2 ^= length;

    h1 += h2;
    h2 += h1;

    h1 = fmix64(h1);
    h2 = fmix64(h2);

    h1 += h2;
    h2 += h1;

    return new long[] { h1, h2 };
  }

  /** 64-bit finalization mix (avalanche) from the reference implementation. */
  private static long fmix64(long h) {
    h ^= (h >>> 33);
    h *= 0xff51afd7ed558ccdL;
    h ^= (h >>> 33);
    h *= 0xc4ceb9fe1a85ec53L;
    h ^= (h >>> 33);
    return h;
  }
}
/**
 * Copyright 2017 Prasanth Jayachandran
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.github.prasanthj.hll.tools;

import com.github.prasanthj.hll.HyperLogLog;
import com.github.prasanthj.hll.HyperLogLogUtils;

import java.io.*;
import java.util.HashSet;
import java.util.Random;
import java.util.Set;

import org.apache.commons.cli.BasicParser;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;

/**
 * Command line interface for building, inspecting, serializing and
 * deserializing HyperLogLog sketches.
 */
public class HyperLogLogCLI {

  public static void main(String[] args) {
    Options options = new Options();
    addOptions(options);

    CommandLineParser parser = new BasicParser();
    CommandLine cli = null;
    long n = 0;
    long seed = 123; // fixed seed so repeated runs are reproducible
    HyperLogLog.EncodingType enc = HyperLogLog.EncodingType.SPARSE;
    int p = 14;
    boolean bitPack = true;
    boolean noBias = true;
    boolean printRelativeError = false;
    // NOTE(review): 'unique' is never set from any option, so the bounded
    // random branch below is currently unreachable — confirm intent.
    int unique = -1;
    String filePath = null;
    BufferedReader br = null;
    String outFile = null;
    String inFile = null;
    FileOutputStream fos = null;
    DataOutputStream out = null;
    FileInputStream fis = null;
    DataInputStream in = null;
    try {
      cli = parser.parse(options, args);

      if (!(cli.hasOption('n') || cli.hasOption('f') || cli.hasOption('d') || cli.hasOption('t'))) {
        System.out.println("Example usage: hll -n 1000\n"
            + "               hll -f /tmp/input.txt\n"
            + "               hll -d -i /tmp/out.hll\n"
            + "               cat file | hll -t\n");
        usage(options);
        return;
      }

      if (cli.hasOption('n')) {
        n = Long.parseLong(cli.getOptionValue('n'));
      }

      if (cli.hasOption('e')) {
        String value = cli.getOptionValue('e');
        if (value.equals(HyperLogLog.EncodingType.DENSE.name())) {
          enc = HyperLogLog.EncodingType.DENSE;
        }
      }

      if (cli.hasOption('p')) {
        p = Integer.parseInt(cli.getOptionValue('p'));
        // FIX: was 'p < 4 && p > 16', which is always false so out-of-range
        // values were silently accepted instead of being reset to 14
        if (p < 4 || p > 16) {
          System.out.println("Warning! Out-of-range value specified for p. Using p=14.");
          p = 14;
        }
      }

      if (cli.hasOption('c')) {
        noBias = Boolean.parseBoolean(cli.getOptionValue('c'));
      }

      if (cli.hasOption('b')) {
        bitPack = Boolean.parseBoolean(cli.getOptionValue('b'));
      }

      if (cli.hasOption('f')) {
        filePath = cli.getOptionValue('f');
        br = new BufferedReader(new FileReader(new File(filePath)));
      }

      if (filePath != null && cli.hasOption('n')) {
        System.out.println("'-f' (input file) specified. Ignoring -n.");
      }

      if (cli.hasOption('t')) {
        br = new BufferedReader(new InputStreamReader(System.in));
      }

      if (cli.hasOption('r')) {
        printRelativeError = true;
      }

      if (cli.hasOption('s')) {
        if (cli.hasOption('o')) {
          outFile = cli.getOptionValue('o');
          fos = new FileOutputStream(new File(outFile));
          out = new DataOutputStream(fos);
        } else {
          System.err.println("Specify output file. Example usage: hll -s -o /tmp/out.hll");
          usage(options);
          return;
        }
      }

      if (cli.hasOption('d')) {
        if (cli.hasOption('i')) {
          inFile = cli.getOptionValue('i');
          fis = new FileInputStream(new File(inFile));
          in = new DataInputStream(fis);
        } else {
          System.err.println("Specify input file. Example usage: hll -d -i /tmp/in.hll");
          usage(options);
          return;
        }
      }

      // return after deserialization
      if (fis != null && in != null) {
        long start = System.currentTimeMillis();
        HyperLogLog deserializedHLL = HyperLogLogUtils.deserializeHLL(in);
        long end = System.currentTimeMillis();
        System.out.println(deserializedHLL.toString());
        System.out.println("Count after deserialization: " + deserializedHLL.count());
        System.out.println("Deserialization time: " + (end - start) + " ms");
        return;
      }

      // construct hll and serialize it if required
      HyperLogLog hll = HyperLogLog.builder().enableBitPacking(bitPack).enableNoBias(noBias)
          .setEncoding(enc).setNumRegisterIndexBits(p).build();

      if (br != null) {
        Set<String> hashset = new HashSet<String>();
        String line;
        while ((line = br.readLine()) != null) {
          hll.addString(line);

          // ignore hashset overhead if no relative error needed
          if (printRelativeError) {
            hashset.add(line);
          }
        }

        n = hashset.size();
      } else {
        Random rand = new Random(seed);
        for (int i = 0; i < n; i++) {
          if (unique < 0) {
            hll.addLong(rand.nextLong());
          } else {
            int val = rand.nextInt(unique);
            hll.addLong(val);
          }
        }
      }

      long estCount = hll.count();
      System.out.println(hll.toString());
      if (printRelativeError) {
        System.out.println("Actual count: " + n);
        System.out.println("Relative error: " + HyperLogLogUtils.getRelativeError(n, estCount) + "%");
      }

      if (fos != null && out != null) {
        long start = System.currentTimeMillis();
        HyperLogLogUtils.serializeHLL(out, hll);
        long end = System.currentTimeMillis();
        System.out.println("Serialized hyperloglog to " + outFile);
        System.out.println("Serialized size: " + out.size() + " bytes");
        System.out.println("Serialization time: " + (end - start) + " ms");
        out.close();
      }
    } catch (ParseException e) {
      System.err.println("Invalid parameter.");
      usage(options);
    } catch (NumberFormatException e) {
      System.err.println("Invalid type for parameter.");
      usage(options);
    } catch (FileNotFoundException e) {
      System.err.println("Specified file not found.");
      usage(options);
    } catch (IOException e) {
      System.err.println("Exception occurred while reading file.");
      usage(options);
    } finally {
      // FIX: streams were previously leaked on early-return and error paths
      closeQuietly(br);
      closeQuietly(out);
      closeQuietly(fos);
      closeQuietly(in);
      closeQuietly(fis);
    }
  }

  /** Closes a stream, ignoring both null references and close() failures. */
  private static void closeQuietly(Closeable c) {
    if (c != null) {
      try {
        c.close();
      } catch (IOException ignored) {
        // best-effort cleanup only
      }
    }
  }

  /** Registers all supported command line options. */
  private static void addOptions(Options options) {
    options.addOption("p", "num-register-bits", true, "number of bits from "
        + "hashcode used as register index between 4 and 16 (both inclusive). " + "default = 14");
    options.addOption("e", "encoding", true, "specify encoding to use (SPARSE "
        + "or DENSE). default = SPARSE");
    options.addOption("b", "enable-bitpacking", true, "enable bit-packing of"
        + " registers. default = true");
    options.addOption("c", "no-bias", true, "use bias correction table "
        + "(no-bias algorithm). default = true");
    options.addOption("n", "num-random-values", true, "number of random values to generate");
    options.addOption("f", "file", true, "specify file to read input data");
    options.addOption("s", "serialize", false,
        "serialize hyperloglog to file. specify -o for output file");
    options.addOption("o", "output-file", true, "specify output file for serialization");
    options.addOption("d", "deserialize", false,
        "deserialize hyperloglog from file. specify -i for input file");
    options.addOption("i", "input-file", true, "specify input file for deserialization");
    options.addOption("t", "standard-in", false, "read data from standard in");
    options.addOption("r", "relative-error", false, "print relative error calculation");
  }

  /** Prints the formatted option help. */
  static void usage(Options options) {
    HelpFormatter formatter = new HelpFormatter();
    formatter.printHelp("HyperLogLog", options);
  }
}
15 | */ 16 | 17 | package com.github.prasanthj.hll; 18 | 19 | import static org.junit.Assert.assertEquals; 20 | 21 | import java.util.Arrays; 22 | import java.util.Collection; 23 | import java.util.HashSet; 24 | import java.util.Random; 25 | import java.util.Set; 26 | 27 | import org.junit.Test; 28 | import org.junit.runner.RunWith; 29 | import org.junit.runners.Parameterized; 30 | import org.junit.runners.Parameterized.Parameters; 31 | 32 | @RunWith(value = Parameterized.class) 33 | public class TestHLLNoBias { 34 | 35 | // 1.5% tolerance for long range bias (when no bias enabled) and 5% when (no 36 | // bias is disabled) and 37 | // 0.5% for short range bias 38 | private float noBiaslongRangeTolerance = 1.5f; 39 | private float biasedlongRangeTolerance = 5.0f; 40 | private float shortRangeTolerance = 0.5f; 41 | 42 | private int size; 43 | 44 | public TestHLLNoBias(int n) { 45 | this.size = n; 46 | } 47 | 48 | @Parameters 49 | public static Collection data() { 50 | Object[][] data = new Object[][] { { 30000 }, { 41000 }, { 50000 }, { 60000 }, { 75000 }, 51 | { 80000 }, { 81920 } }; 52 | return Arrays.asList(data); 53 | } 54 | 55 | @Test 56 | public void testHLLAdd() { 57 | Random rand = new Random(size); 58 | HyperLogLog hll = HyperLogLog.builder().build(); 59 | int size = 100; 60 | for (int i = 0; i < size; i++) { 61 | hll.addLong(rand.nextLong()); 62 | } 63 | double threshold = size > 40000 ? noBiaslongRangeTolerance : shortRangeTolerance; 64 | double delta = threshold * size / 100; 65 | assertEquals((double) size, (double) hll.count(), delta); 66 | } 67 | 68 | @Test 69 | public void testHLLAddHalfDistinct() { 70 | Random rand = new Random(size); 71 | HyperLogLog hll = HyperLogLog.builder().build(); 72 | int unique = size / 2; 73 | Set hashset = new HashSet(); 74 | for (int i = 0; i < size; i++) { 75 | long val = rand.nextInt(unique); 76 | hashset.add(val); 77 | hll.addLong(val); 78 | } 79 | double threshold = size > 40000 ? 
noBiaslongRangeTolerance : shortRangeTolerance; 80 | double delta = threshold * hashset.size() / 100; 81 | assertEquals((double) hashset.size(), (double) hll.count(), delta); 82 | } 83 | 84 | @Test 85 | public void testHLLNoBiasDisabled() { 86 | Random rand = new Random(size); 87 | HyperLogLog hll = HyperLogLog.builder().enableNoBias(false).build(); 88 | int size = 100; 89 | for (int i = 0; i < size; i++) { 90 | hll.addLong(rand.nextLong()); 91 | } 92 | double threshold = size > 40000 ? biasedlongRangeTolerance : shortRangeTolerance; 93 | double delta = threshold * size / 100; 94 | assertEquals((double) size, (double) hll.count(), delta); 95 | } 96 | 97 | @Test 98 | public void testHLLNoBiasDisabledHalfDistinct() { 99 | Random rand = new Random(size); 100 | HyperLogLog hll = HyperLogLog.builder().enableNoBias(false).build(); 101 | int unique = size / 2; 102 | Set hashset = new HashSet(); 103 | for (int i = 0; i < size; i++) { 104 | long val = rand.nextInt(unique); 105 | hashset.add(val); 106 | hll.addLong(val); 107 | } 108 | double threshold = size > 40000 ? biasedlongRangeTolerance : shortRangeTolerance; 109 | double delta = threshold * hashset.size() / 100; 110 | assertEquals((double) hashset.size(), (double) hll.count(), delta); 111 | } 112 | 113 | } 114 | -------------------------------------------------------------------------------- /src/test/com/github/prasanthj/hll/TestHLLSerialization.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.github.prasanthj.hll; 20 | 21 | import static org.junit.Assert.assertEquals; 22 | import com.github.prasanthj.hll.HyperLogLog.EncodingType; 23 | 24 | import java.io.DataInputStream; 25 | import java.io.DataOutputStream; 26 | import java.io.File; 27 | import java.io.FileInputStream; 28 | import java.io.FileOutputStream; 29 | import java.io.IOException; 30 | import java.util.Arrays; 31 | import java.util.Collection; 32 | import java.util.HashSet; 33 | import java.util.Random; 34 | import java.util.Set; 35 | 36 | import org.junit.After; 37 | import org.junit.Rule; 38 | import org.junit.Test; 39 | import org.junit.rules.TestName; 40 | import org.junit.runner.RunWith; 41 | import org.junit.runners.Parameterized; 42 | import org.junit.runners.Parameterized.Parameters; 43 | 44 | @RunWith(value = Parameterized.class) 45 | public class TestHLLSerialization { 46 | 47 | private int size; 48 | private File testFile; 49 | private static final String pathPrefix = "."; 50 | private static final int SEED = 100; 51 | // 5% tolerance for long range bias and 2.5% for short range bias 52 | private float longRangeTolerance = 5.0f; 53 | private float shortRangeTolerance = 2.5f; 54 | 55 | public TestHLLSerialization(int n) { 56 | this.size = n; 57 | this.testFile = new File(pathPrefix + testCaseName.getMethodName() + "_" + size + ".hll"); 58 | } 59 | 60 | @Parameters 61 | public static Collection data() { 62 | Object[][] data = new Object[][] { { 2 }, { 10 }, { 100 }, { 1000 }, { 2000 }, { 3000 }, 63 | { 5000 }, { 6000 }, 
{ 10000 }, { 100000 }, { 1000000 } }; 64 | return Arrays.asList(data); 65 | } 66 | 67 | @After 68 | public void close() { 69 | if (testFile.exists()) { 70 | testFile.delete(); 71 | } 72 | } 73 | 74 | @Rule 75 | public TestName testCaseName = new TestName(); 76 | 77 | @Test 78 | public void testHLLSparseSerialization() throws IOException { 79 | HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build(); 80 | Random rand = new Random(SEED); 81 | for (int i = 0; i < size; i++) { 82 | hll.addLong(rand.nextLong()); 83 | } 84 | FileOutputStream fos = new FileOutputStream(testFile); 85 | DataOutputStream out = new DataOutputStream(fos); 86 | HyperLogLogUtils.serializeHLL(out, hll); 87 | FileInputStream fis = new FileInputStream(testFile); 88 | DataInputStream in = new DataInputStream(fis); 89 | HyperLogLog deserializedHLL = HyperLogLogUtils.deserializeHLL(in); 90 | assertEquals(hll, deserializedHLL); 91 | assertEquals(hll.toString(), deserializedHLL.toString()); 92 | assertEquals(hll.toStringExtended(), deserializedHLL.toStringExtended()); 93 | assertEquals(hll.hashCode(), deserializedHLL.hashCode()); 94 | assertEquals(hll.count(), deserializedHLL.count()); 95 | } 96 | 97 | @Test 98 | public void testHLLSparseSerializationHalfDistinct() throws IOException { 99 | HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build(); 100 | Random rand = new Random(SEED); 101 | Set hashset = new HashSet(); 102 | for (int i = 0; i < size; i++) { 103 | int val = rand.nextInt(size / 2); 104 | hll.addLong(val); 105 | hashset.add(val); 106 | } 107 | FileOutputStream fos = new FileOutputStream(testFile); 108 | DataOutputStream out = new DataOutputStream(fos); 109 | HyperLogLogUtils.serializeHLL(out, hll); 110 | double threshold = size > 40000 ? 
longRangeTolerance : shortRangeTolerance; 111 | double delta = threshold * hashset.size() / 100; 112 | FileInputStream fis = new FileInputStream(testFile); 113 | DataInputStream in = new DataInputStream(fis); 114 | HyperLogLog deserializedHLL = HyperLogLogUtils.deserializeHLL(in); 115 | assertEquals(hll, deserializedHLL); 116 | assertEquals(hll.toString(), deserializedHLL.toString()); 117 | assertEquals(hll.toStringExtended(), deserializedHLL.toStringExtended()); 118 | assertEquals(hll.hashCode(), deserializedHLL.hashCode()); 119 | assertEquals(hll.count(), deserializedHLL.count()); 120 | assertEquals(hashset.size(), hll.count(), delta); 121 | assertEquals(hashset.size(), deserializedHLL.count(), delta); 122 | } 123 | 124 | @Test 125 | public void testHLLSparseNoBitPacking() throws IOException { 126 | HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.SPARSE) 127 | .enableBitPacking(false).build(); 128 | Random rand = new Random(SEED); 129 | for (int i = 0; i < size; i++) { 130 | hll.addLong(rand.nextLong()); 131 | } 132 | FileOutputStream fos = new FileOutputStream(testFile); 133 | DataOutputStream out = new DataOutputStream(fos); 134 | HyperLogLogUtils.serializeHLL(out, hll); 135 | FileInputStream fis = new FileInputStream(testFile); 136 | DataInputStream in = new DataInputStream(fis); 137 | HyperLogLog deserializedHLL = HyperLogLogUtils.deserializeHLL(in); 138 | assertEquals(hll, deserializedHLL); 139 | assertEquals(hll.toString(), deserializedHLL.toString()); 140 | assertEquals(hll.toStringExtended(), deserializedHLL.toStringExtended()); 141 | assertEquals(hll.hashCode(), deserializedHLL.hashCode()); 142 | assertEquals(hll.count(), deserializedHLL.count()); 143 | } 144 | 145 | @Test 146 | public void testHLLSparseNoBitPackingHalfDistinct() throws IOException { 147 | HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.SPARSE) 148 | .enableBitPacking(false).build(); 149 | Random rand = new Random(SEED); 150 | Set hashset = new 
HashSet(); 151 | for (int i = 0; i < size; i++) { 152 | int val = rand.nextInt(size / 2); 153 | hll.addLong(val); 154 | hashset.add(val); 155 | } 156 | FileOutputStream fos = new FileOutputStream(testFile); 157 | DataOutputStream out = new DataOutputStream(fos); 158 | HyperLogLogUtils.serializeHLL(out, hll); 159 | double threshold = size > 40000 ? longRangeTolerance : shortRangeTolerance; 160 | double delta = threshold * hashset.size() / 100; 161 | FileInputStream fis = new FileInputStream(testFile); 162 | DataInputStream in = new DataInputStream(fis); 163 | HyperLogLog deserializedHLL = HyperLogLogUtils.deserializeHLL(in); 164 | assertEquals(hll, deserializedHLL); 165 | assertEquals(hll.toString(), deserializedHLL.toString()); 166 | assertEquals(hll.toStringExtended(), deserializedHLL.toStringExtended()); 167 | assertEquals(hll.hashCode(), deserializedHLL.hashCode()); 168 | assertEquals(hll.count(), deserializedHLL.count()); 169 | assertEquals(hashset.size(), hll.count(), delta); 170 | assertEquals(hashset.size(), deserializedHLL.count(), delta); 171 | } 172 | 173 | @Test 174 | public void testHLLDenseSerialization() throws IOException { 175 | HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.DENSE).build(); 176 | Random rand = new Random(SEED); 177 | for (int i = 0; i < size; i++) { 178 | hll.addLong(rand.nextLong()); 179 | } 180 | FileOutputStream fos = new FileOutputStream(testFile); 181 | DataOutputStream out = new DataOutputStream(fos); 182 | HyperLogLogUtils.serializeHLL(out, hll); 183 | FileInputStream fis = new FileInputStream(testFile); 184 | DataInputStream in = new DataInputStream(fis); 185 | HyperLogLog deserializedHLL = HyperLogLogUtils.deserializeHLL(in); 186 | assertEquals(hll, deserializedHLL); 187 | assertEquals(hll.toString(), deserializedHLL.toString()); 188 | assertEquals(hll.toStringExtended(), deserializedHLL.toStringExtended()); 189 | assertEquals(hll.hashCode(), deserializedHLL.hashCode()); 190 | assertEquals(hll.count(), 
deserializedHLL.count()); 191 | } 192 | 193 | @Test 194 | public void testHLLDenseSerializationHalfDistinct() throws IOException { 195 | HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.DENSE).build(); 196 | Random rand = new Random(SEED); 197 | Set hashset = new HashSet(); 198 | for (int i = 0; i < size; i++) { 199 | int val = rand.nextInt(size / 2); 200 | hll.addLong(val); 201 | hashset.add(val); 202 | } 203 | FileOutputStream fos = new FileOutputStream(testFile); 204 | DataOutputStream out = new DataOutputStream(fos); 205 | HyperLogLogUtils.serializeHLL(out, hll); 206 | double threshold = size > 40000 ? longRangeTolerance : shortRangeTolerance; 207 | double delta = threshold * hashset.size() / 100; 208 | FileInputStream fis = new FileInputStream(testFile); 209 | DataInputStream in = new DataInputStream(fis); 210 | HyperLogLog deserializedHLL = HyperLogLogUtils.deserializeHLL(in); 211 | assertEquals(hll, deserializedHLL); 212 | assertEquals(hll.toString(), deserializedHLL.toString()); 213 | assertEquals(hll.toStringExtended(), deserializedHLL.toStringExtended()); 214 | assertEquals(hll.hashCode(), deserializedHLL.hashCode()); 215 | assertEquals(hll.count(), deserializedHLL.count()); 216 | assertEquals(hashset.size(), hll.count(), delta); 217 | assertEquals(hashset.size(), deserializedHLL.count(), delta); 218 | } 219 | 220 | @Test 221 | public void testHLLDenseNoBitPacking() throws IOException { 222 | HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.DENSE).enableBitPacking(false) 223 | .build(); 224 | Random rand = new Random(SEED); 225 | for (int i = 0; i < size; i++) { 226 | hll.addLong(rand.nextLong()); 227 | } 228 | FileOutputStream fos = new FileOutputStream(testFile); 229 | DataOutputStream out = new DataOutputStream(fos); 230 | HyperLogLogUtils.serializeHLL(out, hll); 231 | FileInputStream fis = new FileInputStream(testFile); 232 | DataInputStream in = new DataInputStream(fis); 233 | HyperLogLog deserializedHLL = 
HyperLogLogUtils.deserializeHLL(in); 234 | assertEquals(hll, deserializedHLL); 235 | assertEquals(hll.toString(), deserializedHLL.toString()); 236 | assertEquals(hll.toStringExtended(), deserializedHLL.toStringExtended()); 237 | assertEquals(hll.hashCode(), deserializedHLL.hashCode()); 238 | assertEquals(hll.count(), deserializedHLL.count()); 239 | } 240 | 241 | @Test 242 | public void testHLLDenseNoBitPackingHalfDistinct() throws IOException { 243 | HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.DENSE).enableBitPacking(false) 244 | .build(); 245 | Random rand = new Random(SEED); 246 | Set hashset = new HashSet(); 247 | for (int i = 0; i < size; i++) { 248 | int val = rand.nextInt(size / 2); 249 | hll.addLong(val); 250 | hashset.add(val); 251 | } 252 | FileOutputStream fos = new FileOutputStream(testFile); 253 | DataOutputStream out = new DataOutputStream(fos); 254 | HyperLogLogUtils.serializeHLL(out, hll); 255 | double threshold = size > 40000 ? longRangeTolerance : shortRangeTolerance; 256 | double delta = threshold * hashset.size() / 100; 257 | FileInputStream fis = new FileInputStream(testFile); 258 | DataInputStream in = new DataInputStream(fis); 259 | HyperLogLog deserializedHLL = HyperLogLogUtils.deserializeHLL(in); 260 | assertEquals(hll, deserializedHLL); 261 | assertEquals(hll.toString(), deserializedHLL.toString()); 262 | assertEquals(hll.toStringExtended(), deserializedHLL.toStringExtended()); 263 | assertEquals(hll.hashCode(), deserializedHLL.hashCode()); 264 | assertEquals(hll.count(), deserializedHLL.count()); 265 | assertEquals(hashset.size(), hll.count(), delta); 266 | assertEquals(hashset.size(), deserializedHLL.count(), delta); 267 | } 268 | } 269 | -------------------------------------------------------------------------------- /src/test/com/github/prasanthj/hll/TestHyperLogLog.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2017 Prasanth Jayachandran 3 | * 4 | * 
Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.github.prasanthj.hll; 18 | 19 | import static org.junit.Assert.assertEquals; 20 | import com.github.prasanthj.hll.HyperLogLog.EncodingType; 21 | 22 | import org.junit.Test; 23 | 24 | public class TestHyperLogLog { 25 | // 5% tolerance for estimated count 26 | private float longRangeTolerance = 5.0f; 27 | private float shortRangeTolerance = 2.0f; 28 | 29 | @Test(expected = IllegalArgumentException.class) 30 | public void testHLLDenseMerge() { 31 | HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.DENSE).build(); 32 | HyperLogLog hll2 = HyperLogLog.builder().setEncoding(EncodingType.DENSE).build(); 33 | HyperLogLog hll3 = HyperLogLog.builder().setEncoding(EncodingType.DENSE).build(); 34 | HyperLogLog hll4 = HyperLogLog.builder().setNumRegisterIndexBits(16) 35 | .setEncoding(EncodingType.DENSE).build(); 36 | HyperLogLog hll5 = HyperLogLog.builder().setNumRegisterIndexBits(12) 37 | .setEncoding(EncodingType.DENSE).build(); 38 | int size = 1000; 39 | for (int i = 0; i < size; i++) { 40 | hll.addLong(i); 41 | hll2.addLong(size + i); 42 | hll3.addLong(2 * size + i); 43 | hll4.addLong(3 * size + i); 44 | } 45 | double threshold = size > 40000 ? 
longRangeTolerance : shortRangeTolerance; 46 | double delta = threshold * size / 100; 47 | double delta4 = threshold * (4*size) / 100; 48 | assertEquals((double) size, (double) hll.count(), delta); 49 | assertEquals((double) size, (double) hll2.count(), delta); 50 | 51 | // merge 52 | hll.merge(hll2); 53 | assertEquals((double) 2 * size, (double) hll.count(), delta); 54 | assertEquals(EncodingType.DENSE, hll.getEncoding()); 55 | 56 | // merge should update registers and hence the count 57 | hll.merge(hll2); 58 | assertEquals((double) 2 * size, (double) hll.count(), delta); 59 | assertEquals(EncodingType.DENSE, hll.getEncoding()); 60 | 61 | // new merge 62 | hll.merge(hll3); 63 | assertEquals((double) 3 * size, (double) hll.count(), delta); 64 | assertEquals(EncodingType.DENSE, hll.getEncoding()); 65 | 66 | // valid merge -- register set size gets bigger (also 4k items 67 | hll.merge(hll4); 68 | assertEquals((double) 4 * size, (double) hll.count(), delta4); 69 | assertEquals(EncodingType.DENSE, hll.getEncoding()); 70 | 71 | // invalid merge -- smaller register merge to bigger 72 | hll.merge(hll5); 73 | } 74 | 75 | @Test(expected = IllegalArgumentException.class) 76 | public void testHLLSparseMerge() { 77 | HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build(); 78 | HyperLogLog hll2 = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build(); 79 | HyperLogLog hll3 = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build(); 80 | HyperLogLog hll4 = HyperLogLog.builder().setNumRegisterIndexBits(16) 81 | .setEncoding(EncodingType.SPARSE).build(); 82 | HyperLogLog hll5 = HyperLogLog.builder().setNumRegisterIndexBits(12) 83 | .setEncoding(EncodingType.SPARSE).build(); 84 | int size = 500; 85 | for (int i = 0; i < size; i++) { 86 | hll.addLong(i); 87 | hll2.addLong(size + i); 88 | hll3.addLong(2 * size + i); 89 | hll4.addLong(3 * size + i); 90 | } 91 | double threshold = size > 40000 ? 
longRangeTolerance : shortRangeTolerance; 92 | double delta = threshold * size / 100; 93 | double delta4 = threshold * (4*size) / 100; 94 | assertEquals((double) size, (double) hll.count(), delta); 95 | assertEquals((double) size, (double) hll2.count(), delta); 96 | 97 | // merge 98 | hll.merge(hll2); 99 | assertEquals((double) 2 * size, (double) hll.count(), delta); 100 | assertEquals(EncodingType.SPARSE, hll.getEncoding()); 101 | 102 | // merge should update registers and hence the count 103 | hll.merge(hll2); 104 | assertEquals((double) 2 * size, (double) hll.count(), delta); 105 | assertEquals(EncodingType.SPARSE, hll.getEncoding()); 106 | 107 | // new merge 108 | hll.merge(hll3); 109 | assertEquals((double) 3 * size, (double) hll.count(), delta); 110 | assertEquals(EncodingType.SPARSE, hll.getEncoding()); 111 | 112 | // valid merge -- register set size gets bigger & dense automatically 113 | hll.merge(hll4); 114 | assertEquals((double) 4 * size, (double) hll.count(), delta4); 115 | assertEquals(EncodingType.DENSE, hll.getEncoding()); 116 | 117 | // invalid merge -- smaller register merge to bigger 118 | hll.merge(hll5); 119 | } 120 | 121 | @Test(expected = IllegalArgumentException.class) 122 | public void testHLLSparseDenseMerge() { 123 | HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build(); 124 | HyperLogLog hll2 = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build(); 125 | HyperLogLog hll3 = HyperLogLog.builder().setEncoding(EncodingType.DENSE).build(); 126 | HyperLogLog hll4 = HyperLogLog.builder().setNumRegisterIndexBits(16) 127 | .setEncoding(EncodingType.DENSE).build(); 128 | HyperLogLog hll5 = HyperLogLog.builder().setNumRegisterIndexBits(12) 129 | .setEncoding(EncodingType.DENSE).build(); 130 | int size = 1000; 131 | for (int i = 0; i < size; i++) { 132 | hll.addLong(i); 133 | hll2.addLong(size + i); 134 | hll3.addLong(2 * size + i); 135 | hll4.addLong(3 * size + i); 136 | } 137 | double threshold = size > 40000 ? 
longRangeTolerance : shortRangeTolerance; 138 | double delta = threshold * size / 100; 139 | assertEquals((double) size, (double) hll.count(), delta); 140 | assertEquals((double) size, (double) hll2.count(), delta); 141 | 142 | // sparse-sparse merge 143 | hll.merge(hll2); 144 | assertEquals((double) 2 * size, (double) hll.count(), delta); 145 | assertEquals(EncodingType.SPARSE, hll.getEncoding()); 146 | 147 | // merge should update registers and hence the count 148 | hll.merge(hll2); 149 | assertEquals((double) 2 * size, (double) hll.count(), delta); 150 | assertEquals(EncodingType.SPARSE, hll.getEncoding()); 151 | 152 | // sparse-dense merge 153 | hll.merge(hll3); 154 | assertEquals((double) 3 * size, (double) hll.count(), delta); 155 | assertEquals(EncodingType.DENSE, hll.getEncoding()); 156 | 157 | // merge should convert hll2 to DENSE 158 | hll2.merge(hll4); 159 | assertEquals((double) 2 * size, (double) hll2.count(), delta); 160 | assertEquals(EncodingType.DENSE, hll2.getEncoding()); 161 | 162 | // invalid merge -- smaller register merge to bigger 163 | hll.merge(hll5); 164 | } 165 | 166 | @Test(expected = IllegalArgumentException.class) 167 | public void testHLLDenseSparseMerge() { 168 | HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.DENSE).build(); 169 | HyperLogLog hll2 = HyperLogLog.builder().setEncoding(EncodingType.DENSE).build(); 170 | HyperLogLog hll3 = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build(); 171 | HyperLogLog hll4 = HyperLogLog.builder().setNumRegisterIndexBits(16) 172 | .setEncoding(EncodingType.SPARSE).build(); 173 | HyperLogLog hll5 = HyperLogLog.builder().setNumRegisterIndexBits(12) 174 | .setEncoding(EncodingType.SPARSE).build(); 175 | int size = 1000; 176 | for (int i = 0; i < size; i++) { 177 | hll.addLong(i); 178 | hll2.addLong(size + i); 179 | hll3.addLong(2 * size + i); 180 | hll4.addLong(3 * size + i); 181 | } 182 | double threshold = size > 40000 ? 
longRangeTolerance : shortRangeTolerance; 183 | double delta = threshold * size / 100; 184 | assertEquals((double) size, (double) hll.count(), delta); 185 | assertEquals((double) size, (double) hll2.count(), delta); 186 | 187 | // sparse-sparse merge 188 | hll.merge(hll2); 189 | assertEquals((double) 2 * size, (double) hll.count(), delta); 190 | assertEquals(EncodingType.DENSE, hll.getEncoding()); 191 | 192 | // merge should update registers and hence the count 193 | hll.merge(hll2); 194 | assertEquals((double) 2 * size, (double) hll.count(), delta); 195 | assertEquals(EncodingType.DENSE, hll.getEncoding()); 196 | 197 | // sparse-dense merge 198 | hll.merge(hll3); 199 | assertEquals((double) 3 * size, (double) hll.count(), delta); 200 | assertEquals(EncodingType.DENSE, hll.getEncoding()); 201 | 202 | // merge should convert hll3 to DENSE 203 | hll3.merge(hll4); 204 | assertEquals((double) 2 * size, (double) hll3.count(), delta); 205 | assertEquals(EncodingType.DENSE, hll3.getEncoding()); 206 | 207 | // invalid merge -- smaller register merge to bigger 208 | hll.merge(hll5); 209 | 210 | } 211 | 212 | @Test(expected = IllegalArgumentException.class) 213 | public void testHLLSparseOverflowMerge() { 214 | HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build(); 215 | HyperLogLog hll2 = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build(); 216 | HyperLogLog hll3 = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).build(); 217 | HyperLogLog hll4 = HyperLogLog.builder().setNumRegisterIndexBits(16) 218 | .setEncoding(EncodingType.SPARSE).build(); 219 | HyperLogLog hll5 = HyperLogLog.builder().setNumRegisterIndexBits(12) 220 | .setEncoding(EncodingType.SPARSE).build(); 221 | int size = 1000; 222 | for (int i = 0; i < size; i++) { 223 | hll.addLong(i); 224 | hll2.addLong(size + i); 225 | hll3.addLong(2 * size + i); 226 | hll4.addLong(3 * size + i); 227 | } 228 | double threshold = size > 40000 ? 
longRangeTolerance : shortRangeTolerance; 229 | double delta = threshold * size / 100; 230 | assertEquals((double) size, (double) hll.count(), delta); 231 | assertEquals((double) size, (double) hll2.count(), delta); 232 | 233 | // sparse-sparse merge 234 | hll.merge(hll2); 235 | assertEquals((double) 2 * size, (double) hll.count(), delta); 236 | assertEquals(EncodingType.SPARSE, hll.getEncoding()); 237 | 238 | // merge should update registers and hence the count 239 | hll.merge(hll2); 240 | assertEquals((double) 2 * size, (double) hll.count(), delta); 241 | assertEquals(EncodingType.SPARSE, hll.getEncoding()); 242 | 243 | // sparse-sparse overload to dense 244 | hll.merge(hll3); 245 | assertEquals((double) 3 * size, (double) hll.count(), delta); 246 | assertEquals(EncodingType.DENSE, hll.getEncoding()); 247 | 248 | // merge should convert hll2 to DENSE 249 | hll2.merge(hll4); 250 | assertEquals((double) 2 * size, (double) hll2.count(), delta); 251 | assertEquals(EncodingType.DENSE, hll2.getEncoding()); 252 | 253 | // invalid merge -- smaller register merge to bigger 254 | hll.merge(hll5); 255 | } 256 | 257 | @Test 258 | public void testHLLSparseMoreRegisterBits() { 259 | HyperLogLog hll = HyperLogLog.builder().setEncoding(EncodingType.SPARSE) 260 | .setNumRegisterIndexBits(16).build(); 261 | int size = 1000; 262 | for (int i = 0; i < size; i++) { 263 | hll.addLong(i); 264 | } 265 | double threshold = size > 40000 ? 
longRangeTolerance : shortRangeTolerance; 266 | double delta = threshold * size / 100; 267 | assertEquals((double) size, (double) hll.count(), delta); 268 | } 269 | 270 | @Test 271 | public void testHLLSquash() { 272 | 273 | int[] sizes = new int[] { 500, 1000, 2300, 4096}; 274 | int minBits = 9; 275 | for (final int size : sizes) { 276 | 277 | HyperLogLog hlls[] = new HyperLogLog[16]; 278 | for (int k = minBits; k < hlls.length; k++) { 279 | final HyperLogLog hll = HyperLogLog.builder() 280 | .setEncoding(EncodingType.DENSE).setNumRegisterIndexBits(k).build(); 281 | for (int i = 0; i < size; i++) { 282 | hll.addLong(i); 283 | } 284 | hlls[k] = hll; 285 | } 286 | 287 | for (int k = minBits; k < hlls.length; k++) { 288 | for (int j = k + 1; j < hlls.length; j++) { 289 | final HyperLogLog large = hlls[j]; 290 | final HyperLogLog small = hlls[k]; 291 | final HyperLogLog mush = large 292 | .squash(small.getNumRegisterIndexBits()); 293 | assertEquals(small.count(), mush.count(), 0); 294 | double delta = Math.ceil(small.getStandardError()*size); 295 | assertEquals((double) size, (double) mush.count(), delta); 296 | } 297 | } 298 | } 299 | } 300 | 301 | @Test 302 | public void testHLLDenseDenseSquash() { 303 | HyperLogLog p14HLL = HyperLogLog.builder().setEncoding(EncodingType.DENSE).setNumRegisterIndexBits(14).build(); 304 | HyperLogLog p10HLL = HyperLogLog.builder().setEncoding(EncodingType.DENSE).setNumRegisterIndexBits(10).build(); 305 | int size = 1_000_000; 306 | for (int i = 0; i < size; i++) { 307 | p14HLL.addLong(i); 308 | } 309 | 310 | for (int i = 0; i < 10_000; i++) { 311 | p10HLL.addLong(i); 312 | } 313 | 314 | p14HLL.squash(p10HLL.getNumRegisterIndexBits()); 315 | assertEquals((double) size, p14HLL.count(), longRangeTolerance * size / 100.0); 316 | } 317 | 318 | @Test 319 | public void testHLLSparseDenseSquash() { 320 | HyperLogLog p14HLL = HyperLogLog.builder().setEncoding(EncodingType.SPARSE).setNumRegisterIndexBits(14).build(); 321 | HyperLogLog p10HLL = 
HyperLogLog.builder().setEncoding(EncodingType.DENSE).setNumRegisterIndexBits(10).build(); 322 | int size = 2000; 323 | for (int i = 0; i < size; i++) { 324 | p14HLL.addLong(i); 325 | } 326 | 327 | for (int i = 0; i < 10_000; i++) { 328 | p10HLL.addLong(i); 329 | } 330 | 331 | p14HLL.squash(p10HLL.getNumRegisterIndexBits()); 332 | assertEquals((double) size, p14HLL.count(), longRangeTolerance * size / 100.0); 333 | } 334 | } 335 | -------------------------------------------------------------------------------- /src/test/com/github/prasanthj/hll/TestHyperLogLogDense.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2017 Prasanth Jayachandran 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.github.prasanthj.hll; 18 | 19 | import static org.junit.Assert.assertEquals; 20 | 21 | import java.util.Arrays; 22 | import java.util.Collection; 23 | import java.util.HashSet; 24 | import java.util.Random; 25 | import java.util.Set; 26 | 27 | import org.junit.Test; 28 | import org.junit.runner.RunWith; 29 | import org.junit.runners.Parameterized; 30 | import org.junit.runners.Parameterized.Parameters; 31 | 32 | @RunWith(value = Parameterized.class) 33 | public class TestHyperLogLogDense { 34 | 35 | // 5% tolerance for long range bias and 3% for short range bias 36 | private float longRangeTolerance = 5.0f; 37 | private float shortRangeTolerance = 3.0f; 38 | 39 | private int size; 40 | 41 | public TestHyperLogLogDense(int n) { 42 | this.size = n; 43 | } 44 | 45 | @Parameters 46 | public static Collection data() { 47 | Object[][] data = new Object[][] { { 2 }, { 10 }, { 100 }, { 1000 }, { 10000 }, { 100000 }, 48 | { 1000000 } }; 49 | return Arrays.asList(data); 50 | } 51 | 52 | @Test 53 | public void testHLLAdd() { 54 | Random rand = new Random(size); 55 | HyperLogLog hll = HyperLogLog.builder().setEncoding(HyperLogLog.EncodingType.DENSE).build(); 56 | int size = 100; 57 | for (int i = 0; i < size; i++) { 58 | hll.addLong(rand.nextLong()); 59 | } 60 | double threshold = size > 40000 ? longRangeTolerance : shortRangeTolerance; 61 | double delta = threshold * size / 100; 62 | assertEquals((double) size, (double) hll.count(), delta); 63 | } 64 | 65 | @Test 66 | public void testHLLAddHalfDistinct() { 67 | Random rand = new Random(size); 68 | HyperLogLog hll = HyperLogLog.builder().setEncoding(HyperLogLog.EncodingType.DENSE).build(); 69 | int unique = size / 2; 70 | Set hashset = new HashSet(); 71 | for (int i = 0; i < size; i++) { 72 | long val = rand.nextInt(unique); 73 | hashset.add(val); 74 | hll.addLong(val); 75 | } 76 | double threshold = size > 40000 ? 
longRangeTolerance : shortRangeTolerance; 77 | double delta = threshold * hashset.size() / 100; 78 | assertEquals((double) hashset.size(), (double) hll.count(), delta); 79 | } 80 | 81 | } 82 | -------------------------------------------------------------------------------- /src/test/com/github/prasanthj/hll/TestHyperLogLogMerge.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2017 Prasanth Jayachandran 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
/**
 * Copyright 2017 Prasanth Jayachandran
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.github.prasanthj.hll;

import static org.junit.Assert.assertEquals;

import java.util.Arrays;
import java.util.Collection;

import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;

/**
 * Parameterized tests for {@link HyperLogLog#merge}: two sketches whose key
 * ranges overlap by a known fraction are merged and the estimate is checked
 * against the exact union cardinality. (The unused EncodingType import that
 * was here has been removed.)
 */
@RunWith(Parameterized.class)
public class TestHyperLogLogMerge {
  // 5% tolerance for estimated count
  private final float longRangeTolerance = 5.0f;
  private final float shortRangeTolerance = 2.0f;

  // per-sketch cardinality, injected by the Parameterized runner
  final int size;

  @Parameterized.Parameters
  public static Collection<Object[]> data() {
    return Arrays.asList(new Object[][] {
      { 1_000 }, { 10_000 }, { 100_000 }, { 1_000_000 }, { 10_000_000 }
      // { 100_000_000 }, { 1_000_000_000 } 1B passed but is super slow
    });
  }

  public TestHyperLogLogMerge(int size) {
    this.size = size;
  }

  /**
   * Builds two sketches of {@code size} longs each whose ranges overlap by
   * {@code overlap} (0.0 = disjoint, 1.0 = identical), merges them and
   * asserts the estimate against the exact union size (2 - overlap) * size.
   *
   * @param overlap fraction of the second sketch's range shared with the first
   */
  private void verifyMerge(double overlap) {
    HyperLogLog hll1 = HyperLogLog.builder().setNumRegisterIndexBits(16).build();
    for (int i = 0; i < size; i++) {
      hll1.addLong(i);
    }
    HyperLogLog hll2 = HyperLogLog.builder().setNumRegisterIndexBits(16).build();
    // second range is [(1 - overlap) * size, (2 - overlap) * size)
    int start = (int) ((1.0 - overlap) * size);
    int end = (int) ((2.0 - overlap) * size);
    for (int i = start; i < end; i++) {
      hll2.addLong(i);
    }
    hll1.merge(hll2);
    double threshold = size > 40000 ? longRangeTolerance : shortRangeTolerance;
    double delta = threshold * size / 100;
    long expected = (long) ((2.0 - overlap) * size);
    long actual = hll1.count();
    assertEquals(expected, actual, delta);
  }

  @Test
  public void testHLLMergeDisjoint() {
    verifyMerge(0.0);
  }

  @Test
  public void testHLLMerge25PercentOverlap() {
    verifyMerge(0.25);
  }

  @Test
  public void testHLLMerge50PercentOverlap() {
    verifyMerge(0.5);
  }

  @Test
  public void testHLLMerge75PercentOverlap() {
    verifyMerge(0.75);
  }

  @Test
  public void testHLLMerge100PercentOverlap() {
    verifyMerge(1.0);
  }

}
/**
 * Copyright 2017 Prasanth Jayachandran
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.github.prasanthj.hll;

import static org.junit.Assert.assertEquals;

import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.Random;
import java.util.Set;

import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
import org.junit.runners.Parameterized.Parameters;

/**
 * Parameterized accuracy tests for {@link HyperLogLog} with the default
 * (sparse) encoding. Each run adds {@code size} values and checks the
 * estimate against a percentage tolerance of the true cardinality.
 */
@RunWith(value = Parameterized.class)
public class TestHyperLogLogSparse {

  // 5% tolerance for long range bias and 1% for short range bias
  private final float longRangeTolerance = 5.0f;
  private final float shortRangeTolerance = 1.0f;

  // cardinality under test, injected by the Parameterized runner
  private final int size;

  public TestHyperLogLogSparse(int n) {
    this.size = n;
  }

  @Parameters
  public static Collection<Object[]> data() {
    Object[][] data = new Object[][] { { 2 }, { 10 }, { 100 }, { 1000 }, { 10000 }, { 100000 },
        { 1000000 } };
    return Arrays.asList(data);
  }

  /**
   * Adds {@code size} random longs and verifies the estimate is within
   * tolerance of {@code size}.
   */
  @Test
  public void testHLLAdd() {
    Random rand = new Random(size);
    HyperLogLog hll = HyperLogLog.builder().build();
    // FIX: a local "int size = 100" previously shadowed the parameterized
    // field, so every parameter only ever exercised 100 values. Use the
    // field so the test actually covers the declared cardinalities, matching
    // testHLLAddHalfDistinct below.
    for (int i = 0; i < size; i++) {
      hll.addLong(rand.nextLong());
    }
    double threshold = size > 40000 ? longRangeTolerance : shortRangeTolerance;
    double delta = threshold * size / 100;
    assertEquals((double) size, (double) hll.count(), delta);
  }

  /**
   * Adds {@code size} values drawn from only {@code size / 2} distinct
   * candidates and verifies the estimate against the exact distinct count
   * tracked by a {@link HashSet}.
   */
  @Test
  public void testHLLAddHalfDistinct() {
    Random rand = new Random(size);
    HyperLogLog hll = HyperLogLog.builder().build();
    int unique = size / 2;
    Set<Long> hashset = new HashSet<>();
    for (int i = 0; i < size; i++) {
      long val = rand.nextInt(unique);
      hashset.add(val);
      hll.addLong(val);
    }
    double threshold = size > 40000 ? longRangeTolerance : shortRangeTolerance;
    double delta = threshold * hashset.size() / 100;
    assertEquals((double) hashset.size(), (double) hll.count(), delta);
  }
}
/**
 * Copyright 2017 Prasanth Jayachandran
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.github.prasanthj.hll;

import static org.junit.Assert.assertEquals;

import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;

import org.junit.Test;

import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.charset.StandardCharsets;
import java.util.Random;

/**
 * Tests for Murmur3 variants: the 32-bit and 128-bit hashes are checked
 * against Guava's reference implementation for strings, ints, longs and
 * doubles.
 */
public class TestMurmur3 {

  /**
   * Asserts that {@link Murmur3#hash128} matches Guava's murmur3_128 for the
   * given bytes. Guava returns the 128-bit code in little-endian byte order,
   * so the reference bytes are decoded as two little-endian longs.
   */
  private static void assertHash128Matches(HashFunction hf, byte[] data, int seed) {
    ByteBuffer buf = ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN);
    buf.put(hf.hashBytes(data).asBytes());
    buf.flip();
    long gl1 = buf.getLong();
    long gl2 = buf.getLong(8);
    long[] hc = Murmur3.hash128(data, data.length, seed);
    assertEquals(gl1, hc[0]);
    assertEquals(gl2, hc[1]);
  }

  @Test
  public void testHashCodesM3_32_string() {
    // FIX: use an explicit charset instead of the platform default so the
    // test is environment-independent (identical bytes for these ASCII keys).
    String key = "test";
    int seed = 123;
    HashFunction hf = Hashing.murmur3_32(seed);
    byte[] data = key.getBytes(StandardCharsets.UTF_8);
    assertEquals(hf.hashBytes(data).asInt(), Murmur3.hash32(data, data.length, seed));

    key = "testkey";
    data = key.getBytes(StandardCharsets.UTF_8);
    assertEquals(hf.hashBytes(data).asInt(), Murmur3.hash32(data, data.length, seed));
  }

  @Test
  public void testHashCodesM3_32_ints() {
    int seed = 123;
    Random rand = new Random(seed);
    HashFunction hf = Hashing.murmur3_32(seed);
    for (int i = 0; i < 1000; i++) {
      int val = rand.nextInt();
      byte[] data = ByteBuffer.allocate(4).putInt(val).array();
      assertEquals(hf.hashBytes(data).asInt(), Murmur3.hash32(data, data.length, seed));
    }
  }

  @Test
  public void testHashCodesM3_32_longs() {
    int seed = 123;
    Random rand = new Random(seed);
    HashFunction hf = Hashing.murmur3_32(seed);
    for (int i = 0; i < 1000; i++) {
      long val = rand.nextLong();
      byte[] data = ByteBuffer.allocate(8).putLong(val).array();
      assertEquals(hf.hashBytes(data).asInt(), Murmur3.hash32(data, data.length, seed));
    }
  }

  @Test
  public void testHashCodesM3_32_double() {
    int seed = 123;
    Random rand = new Random(seed);
    HashFunction hf = Hashing.murmur3_32(seed);
    for (int i = 0; i < 1000; i++) {
      double val = rand.nextDouble();
      byte[] data = ByteBuffer.allocate(8).putDouble(val).array();
      assertEquals(hf.hashBytes(data).asInt(), Murmur3.hash32(data, data.length, seed));
    }
  }

  @Test
  public void testHashCodesM3_128_string() {
    int seed = 123;
    HashFunction hf = Hashing.murmur3_128(seed);
    assertHash128Matches(hf, "test".getBytes(StandardCharsets.UTF_8), seed);
    assertHash128Matches(hf, "testkey128_testkey128".getBytes(StandardCharsets.UTF_8), seed);
  }

  @Test
  public void testHashCodesM3_128_ints() {
    int seed = 123;
    Random rand = new Random(seed);
    HashFunction hf = Hashing.murmur3_128(seed);
    for (int i = 0; i < 1000; i++) {
      byte[] data = ByteBuffer.allocate(4).putInt(rand.nextInt()).array();
      assertHash128Matches(hf, data, seed);
    }
  }

  @Test
  public void testHashCodesM3_128_longs() {
    int seed = 123;
    Random rand = new Random(seed);
    HashFunction hf = Hashing.murmur3_128(seed);
    for (int i = 0; i < 1000; i++) {
      byte[] data = ByteBuffer.allocate(8).putLong(rand.nextLong()).array();
      assertHash128Matches(hf, data, seed);
    }
  }

  @Test
  public void testHashCodesM3_128_double() {
    int seed = 123;
    Random rand = new Random(seed);
    HashFunction hf = Hashing.murmur3_128(seed);
    for (int i = 0; i < 1000; i++) {
      byte[] data = ByteBuffer.allocate(8).putDouble(rand.nextDouble()).array();
      assertHash128Matches(hf, data, seed);
    }
  }
}
/**
 * Copyright 2017 Prasanth Jayachandran
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.github.prasanthj.hll;

import static org.junit.Assert.assertEquals;

import java.util.Arrays;
import java.util.Collection;

import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
import org.junit.runners.Parameterized.Parameters;

/**
 * Parameterized test for {@link HLLSparseRegister#encodeHash}: each case
 * pairs a 64-bit input hash with its expected 32-bit sparse encoding.
 */
@RunWith(value = Parameterized.class)
public class TestSparseEncodeHash {

  // 64-bit hash value fed to the encoder
  private final long input;
  // expected 32-bit sparse encoding for that hash
  private final int expected;

  public TestSparseEncodeHash(long inputHash, int expectedEncoding) {
    this.input = inputHash;
    this.expected = expectedEncoding;
  }

  @Parameters
  public static Collection<Object[]> data() {
    return Arrays.asList(new Object[][] {
        { 11111111111L, 373692871 },
        { 4314495982023L, -1711269433 },
        { 4314529536455L, -1744823865 },
        { 4314563074503L, 268425671 },
        { 17257983908295L, -1644160569 },
        { 536861127L, 536861127 },
        { 536844743L, 536844743 },
        { 144115188075862471L, -671082041 } });
  }

  @Test
  public void testEncodeHash() {
    // register configured with p=14, pPrime=25, qPrime=6 to match the cases
    HLLSparseRegister reg = new HLLSparseRegister(14, 25, 6);
    assertEquals(expected, reg.encodeHash(input));
  }
}