├── .gitignore ├── .travis.yml ├── 3rd_party_licenses.txt ├── CHANGES.txt ├── LICENSE.txt ├── NOTICE.txt ├── README.mdown ├── bin ├── card-test-and-graph.sh ├── cardinality └── topk ├── pom.xml └── src ├── main └── java │ └── com │ └── clearspring │ ├── analytics │ ├── hash │ │ ├── Lookup3Hash.java │ │ └── MurmurHash.java │ ├── stream │ │ ├── ConcurrentStreamSummary.java │ │ ├── Counter.java │ │ ├── ISampleSet.java │ │ ├── ITopK.java │ │ ├── SampleSet.java │ │ ├── ScoredItem.java │ │ ├── StochasticTopper.java │ │ ├── StreamSummary.java │ │ ├── cardinality │ │ │ ├── AdaptiveCounting.java │ │ │ ├── CardinalityMergeException.java │ │ │ ├── CountThenEstimate.java │ │ │ ├── HyperLogLog.java │ │ │ ├── HyperLogLogPlus.java │ │ │ ├── ICardinality.java │ │ │ ├── LinearCounting.java │ │ │ ├── LogLog.java │ │ │ └── RegisterSet.java │ │ ├── frequency │ │ │ ├── ConservativeAddSketch.java │ │ │ ├── CountMinSketch.java │ │ │ ├── FrequencyMergeException.java │ │ │ └── IFrequency.java │ │ ├── membership │ │ │ ├── BitSetSerializer.java │ │ │ ├── BloomCalculations.java │ │ │ ├── BloomFilter.java │ │ │ ├── DataInputBuffer.java │ │ │ ├── DataOutputBuffer.java │ │ │ ├── Filter.java │ │ │ └── ICompactSerializer.java │ │ └── quantile │ │ │ ├── GroupTree.java │ │ │ ├── IQuantileEstimator.java │ │ │ ├── QDigest.java │ │ │ └── TDigest.java │ └── util │ │ ├── AbstractIterator.java │ │ ├── Bits.java │ │ ├── DoublyLinkedList.java │ │ ├── ExternalizableUtil.java │ │ ├── IBuilder.java │ │ ├── ListNode2.java │ │ ├── Lists.java │ │ ├── ObyCount.java │ │ ├── Pair.java │ │ ├── Preconditions.java │ │ ├── TopK.java │ │ ├── UnsignedIntComparator.java │ │ └── Varint.java │ └── experimental │ └── stream │ └── cardinality │ └── HyperBitBit.java └── test ├── java ├── com │ └── clearspring │ │ ├── analytics │ │ ├── TestUtils.java │ │ ├── hash │ │ │ ├── TestLookup3Hash.java │ │ │ └── TestMurmurHash.java │ │ ├── stream │ │ │ ├── TestConcurrentStreamSummary.java │ │ │ ├── TestSampleSet.java │ │ │ ├── 
TestStochasticTopper.java │ │ │ ├── TestStreamSummary.java │ │ │ ├── cardinality │ │ │ │ ├── RegisterSetTest.java │ │ │ │ ├── TestAdaptiveCounting.java │ │ │ │ ├── TestAndGraphResults.java │ │ │ │ ├── TestCountThenEstimate.java │ │ │ │ ├── TestHyperLogLog.java │ │ │ │ ├── TestHyperLogLogPlus.java │ │ │ │ ├── TestICardinality.java │ │ │ │ ├── TestLinearCounting.java │ │ │ │ └── TestLogLog.java │ │ │ ├── data │ │ │ │ └── NasaVoyager2.csv │ │ │ ├── frequency │ │ │ │ ├── ConservativeAddSketchTest.java │ │ │ │ └── CountMinSketchTest.java │ │ │ ├── membership │ │ │ │ ├── Base64Test.java │ │ │ │ ├── BloomFilterTest.java │ │ │ │ ├── FilterTest.java │ │ │ │ ├── KeyGenerator.java │ │ │ │ └── ResetableIterator.java │ │ │ └── quantile │ │ │ │ ├── GroupTreeTest.java │ │ │ │ ├── QDigestTest.java │ │ │ │ └── TDigestTest.java │ │ └── util │ │ │ └── TestDoublyLinkedList.java │ │ └── experimental │ │ └── stream │ │ └── cardinality │ │ └── TestHyperBitBit.java └── org │ └── apache │ └── commons │ └── lang3 │ └── RandomStringUtils.java └── resources └── com └── clearspring └── analytics └── stream └── membership └── encoded_random_keys.bloom /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | .classpath 3 | .project 4 | .settings/ 5 | .idea/ 6 | pom.xml.versionsBackup 7 | .#* 8 | target/ 9 | pom.xml.releaseBackup 10 | release.properties -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: java 2 | jdk: 3 | - oraclejdk8 4 | -------------------------------------------------------------------------------- /3rd_party_licenses.txt: -------------------------------------------------------------------------------- 1 | 2 | LICENSES FOR THIRD-PARTY COMPONENTS 3 | 4 | =============================================================================== 5 | 6 | The following sections list licensing information 
for 7 | libraries included with the stream-lib source and components 8 | used to test stream-lib. 9 | 10 | The following software may be included in this product: 11 | 12 | =============================================================================== 13 | 14 | Fastutil » 8.1.1 15 | 16 | Fastutil » 8.1.1 uses the Apache 2.0 license, shown below. See the License for details about distribution rights, and the specific rights regarding derivate works. 17 | 18 | http://www.apache.org/licenses/LICENSE-2.0.txt 19 | 20 | --------------------------------------------------------------------------- 21 | 22 | JUnit 23 | 24 | 25 | 26 | JUnit » 4.12 uses Eclipse Public License - Version 1.0, shown below. See the License for details about distribution rights, and the specific rights regarding derivate works. 27 | 28 | 29 | http://www.eclipse.org/org/documents/epl-v10.php 30 | 31 | 32 | --------------------------------------------------------------------------- 33 | 34 | 35 | SLF4J Simple Binding 36 | 37 | SLF4J API Module 38 | 39 | Copyright (c) 2004-2007 QOS.ch 40 | 41 | SLF4J Simple Binding 1.7.25 and SLF4J API Module 1.7.25 use MIT license, shown below. See the License for details about distribution rights, and the specific rights regarding derivate works. 42 | 43 | 44 | https://opensource.org/licenses/mit-license.php 45 | 46 | --------------------------------------------------------------------------- 47 | 48 | Colt » 1.2.0 49 | 50 | Packages cern.colt* , cern.jet*, cern.clhep 51 | 52 | Copyright (c) 1999 CERN - European Organization for Nuclear Research. 53 | Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose is hereby granted without fee, provided that the above copyright notice appear in all copies and that both that copyright notice and this permission notice appear in supporting documentation. CERN makes no representations about the suitability of this software for any purpose. 
It is provided "as is" without expressed or implied warranty. 54 | 55 | Packages hep.aida.* 56 | 57 | Written by Pavel Binko, Dino Ferrero Merlino, Wolfgang Hoschek, Tony Johnson, Andreas Pfeiffer, and others. Check the FreeHEP home page for more info. Permission to use and/or redistribute this work is granted under the terms of the LGPL License, with the exception that any usage related to military applications is expressly forbidden. The software and documentation made available under the terms of this license are provided with no warranty. 58 | 59 | --------------------------------------------------------------------------- 60 | 61 | Charts4j » 1.3 62 | 63 | https://github.com/julienchastang/charts4j/blob/master/LICENSE.txt 64 | 65 | /** 66 | * 67 | * The MIT License 68 | * 69 | * Copyright (c) 2011 the original author or authors. 70 | * 71 | * Permission is hereby granted, free of charge, to any person obtaining a copy 72 | * of this software and associated documentation files (the "Software"), to deal 73 | * in the Software without restriction, including without limitation the rights 74 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 75 | * copies of the Software, and to permit persons to whom the Software is 76 | * furnished to do so, subject to the following conditions: 77 | 78 | * The above copyright notice and this permission notice shall be included in 79 | * all copies or substantial portions of the Software. 80 | 81 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 82 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 83 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 84 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 85 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 86 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 87 | * THE SOFTWARE. 
88 | */ 89 | 90 | --------------------------------------------------------------------------- 91 | 92 | Apache Commons Codec » 1.11 93 | 94 | 95 | Apache 2.0 license, shown below. See the License for details about distribution rights, and the specific rights regarding derivate works. 96 | 97 | https://www.apache.org/licenses/LICENSE-2.0.txt 98 | 99 | --------------------------------------------------------------------------- 100 | 101 | Guava: Google Core Libraries For Java 102 | 103 | Guava: Google Core Libraries For Java » 24.1-jre uses the Apache 2.0 license, shown below. See the License for details about distribution rights, and the specific rights regarding derivate works. 104 | 105 | http://www.apache.org/licenses/LICENSE-2.0.txt 106 | 107 | --------------------------------------------------------------------------- 108 | 109 | Mahout Math » 0.13.0 110 | 111 | Mahout Math » 0.13.0 uses the Apache 2.0 license, shown below. See the License for details about distribution rights, and the specific rights regarding derivate works. 112 | 113 | 114 | https://github.com/apache/mahout/blob/master/LICENSE.txt -------------------------------------------------------------------------------- /CHANGES.txt: -------------------------------------------------------------------------------- 1 | 2.0.0: 2 | * Initial Release. 3 | -------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- 1 | stream-lib 2 | Copyright 2016 AddThis 3 | 4 | This product includes software developed by AddThis. 
5 | 6 | This product also includes code adapted from: 7 | 8 | Apache Solr (http://lucene.apache.org/solr/) 9 | Copyright 2014 The Apache Software Foundation 10 | 11 | Apache Mahout (http://mahout.apache.org/) 12 | Copyright 2014 The Apache Software Foundation 13 | -------------------------------------------------------------------------------- /README.mdown: -------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.org/addthis/stream-lib.svg?branch=master)](https://travis-ci.org/addthis/stream-lib) 2 | 3 | ## Description 4 | 5 | A Java library for summarizing data in streams for which it is 6 | infeasible to store all events. More specifically, there are classes 7 | for estimating: cardinality (i.e. counting things); set membership; 8 | top-k elements and frequency. One particularly useful feature is that 9 | cardinality estimators with compatible configurations may be safely 10 | merged. 11 | 12 | These classes may be used directly in a JVM project or with the 13 | provided shell scripts and good old Unix IO redirection. 14 | 15 | The ideas here are not original to us. We have endeavored to create 16 | useful implementations from iterating over the existing academic 17 | literature. As such this library relies heavily on the work of 18 | others. Please read the [Sources](#Sources) and 19 | [Reference](#References) sections. 
20 | 21 | ## Examples 22 | 23 | $ echo -e "foo\nfoo\nbar" | ./bin/topk 24 | item count error 25 | ---- ----- ----- 26 | foo 2 0 27 | bar 1 0 28 | 29 | Item count: 3 30 | 31 | 32 | $ echo -e "foo\nfoo\nbar" | ./bin/cardinality 33 | Item Count Cardinality Estimate 34 | ---------- -------------------- 35 | 3 2 36 | 37 | 38 | ## Maven Artifact [![Maven Central](https://maven-badges.herokuapp.com/maven-central/com.clearspring.analytics/stream/badge.svg)](https://maven-badges.herokuapp.com/maven-central/com.clearspring.analytics/stream) 39 | 40 | ``` xml 41 | 42 | com.clearspring.analytics 43 | stream 44 | 2.9.5 45 | 46 | ``` 47 | 48 | ## Building 49 | 50 | Assuming you have [Apache Maven](http://maven.apache.org/) installed 51 | and configured: 52 | 53 | mvn package 54 | 55 | And you should be all set. 56 | 57 | ## Where People Hang Out 58 | 59 | Mailing list: http://groups.google.com/group/stream-lib-user 60 | 61 | 62 | ## Sources 63 | 64 | The set membership code is the Bloom Filter implementation from Apache 65 | Cassandra circa December 2009. The changes here are minimal and were 66 | for the purpose of testing and independent use. Apache Software 67 | Foundation headers have been retained on these files. By extension we 68 | also include [murmurhash](http://murmurhash.googlepages.com/). 69 | 70 | We were inspired to use this code by Jonathan Ellis' post 71 | [All you ever wanted to know about writing bloom filters](http://spyced.blogspot.com/2009/01/all-you-ever-wanted-to-know-about.html). 72 | 73 | ## References 74 | 75 | There are javadoc references to specific papers. These were the ones 76 | we found most relevant during our research. 77 | 78 | #### Cardinality 79 | 80 | * Min Cai, Jianping Pan, Yu K. Kwok, and Kai Hwang. Fast and accurate 81 | traffic matrix measurement using adaptive cardinality counting. In 82 | MineNet ’05: Proceedings of the 2005 ACM SIGCOMM workshop on 83 | Mining network data, pages 205–206, New York, NY, USA, 2005. ACM.
84 | 85 | * Ahmed Metwally, Divyakant Agrawal, and Amr E. Abbadi. Why go 86 | logarithmic if we can go linear?: Towards effective distinct counting of 87 | search traffic. In EDBT ’08: Proceedings of the 11th international 88 | conference on Extending database technology, pages 618–629, New York, 89 | NY, USA, 2008. ACM. 90 | 91 | * Nikos Ntarmos, Peter Triantafillou, and Gerhard Weikum. Counting at 92 | large: Efficient cardinality estimation in Internet-Scale data networks. 93 | In ICDE ’06: Proceedings of the 22nd International Conference on Data 94 | Engineering, pages 40+, Washington, DC, USA, 2006. IEEE Computer 95 | Society. 96 | 97 | * Marianne Durand and Philippe Flajolet. LogLog counting of large 98 | cardinalities. In ESA03, volume 2832 of LNCS, pages 605–617, 2003. 99 | 100 | * Kyu Y. Whang, Brad T. Vander Zanden, and Howard M. Taylor. A 101 | linear-time probabilistic counting algorithm for database applications. 102 | ACM Trans. Database Syst., 15(2):208–229, 1990. 103 | 104 | * Moses Charikar, Kevin Chen, and Martin F. Colton. Finding frequent 105 | items in data streams. In ICALP ’02: Proceedings of the 29th 106 | International Colloquium on Automata, Languages and Programming, 107 | pages 693–703, London, UK, 2002. Springer-Verlag. 108 | 109 | * Stefan Heule, Marc Nunkesser, Alex Hall. HyperLogLog in Practice: 110 | Algorithmic Engineering of a State of The Art Cardinality Estimation 111 | Algorithm. Proceedings of the EDBT 2013 Conference, ACM, Genoa, Italy 112 | 113 | 114 | #### Top-K 115 | 116 | * Graham Cormode and S. Muthukrishnan. An improved data stream 117 | summary: The Count-Min sketch and its applications. pages 29–38. 118 | 2004. 10.1016/j.jalgor.2003.12.001 119 | http://dl.acm.org/citation.cfm?id=1073718 120 | 121 | * Cheqing Jin, Weining Qian, Chaofeng Sha, Jeffrey X. Yu, and Aoying 122 | Zhou. Dynamically maintaining frequent items over a data stream. 
In 123 | CIKM ’03: Proceedings of the twelfth international conference on 124 | Information and knowledge management, pages 287–294, New York, 125 | NY, USA, 2003. ACM. 10.1145/956863.956918 126 | http://dl.acm.org/citation.cfm?id=956918 127 | 128 | * Ahmed Metwally, Divyakant Agrawal, and Amr Abbadi. Efficient 129 | computation of frequent and top-k elements in data streams. pages 130 | 398–412. 2005. 10.1007/978-3-540-30570-5_27 131 | http://link.springer.com/chapter/10.1007/978-3-540-30570-5_27 132 | 133 | #### Frequency 134 | 135 | * Graham Cormode and S. Muthukrishnan. An improved data stream 136 | summary: The Count-Min sketch and its applications. 2004. 10.1016/j.jalgor.2003.12.001 137 | http://dl.acm.org/citation.cfm?id=1073718 138 | -------------------------------------------------------------------------------- /bin/card-test-and-graph.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -x 2 | 3 | # Wrap the maven boilerplate to run the cardinality tests and graph results. 4 | 5 | mvn -e exec:java -Dexec.classpathScope="test" -Dexec.mainClass="com.clearspring.analytics.stream.cardinality.TestAndGraphResults" -Dexec.args="$*" 6 | -------------------------------------------------------------------------------- /bin/cardinality: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | STREAM_LIB_DIR=${STREAM_LIB_DIR:-./target} 4 | HELP=0 5 | 6 | function help () { 7 | cat <<-HELP 8 | cardinality -- Cardinality estimation command line utility. 9 | 10 | Usage: $( basename $0 ) [options] [RATE] 11 | 12 | Calculates the cardinality of a stream, reporting a summary at the end. 13 | cardinality looks for the Stream Summary analytics library at the location 14 | of the environment variable STREAM_LIB_DIR (default is ./target). 15 | 16 | Arguments: 17 | RATE Report interim summary every RATE elements. 18 | 19 | Options: 20 | -h Displays this help. 
21 | 22 | Stream Lib Dir: ${STREAM_LIB_DIR} 23 | 24 | HELP 25 | } 26 | 27 | function fail () { 28 | echo "PREDICTABLE FAILURE. $1" 29 | if [ "$2" ]; then 30 | help 31 | fi 32 | exit 1 33 | } 34 | 35 | SHIFT=0 36 | function incshift () { 37 | SHIFT=$(( $SHIFT + ${1:-1} )) 38 | } 39 | 40 | for opt in $*; do 41 | case "$opt" in 42 | -h | -he | -hel | -help | --h | --he | --hel | --help ) 43 | HELP=1 ;; 44 | esac 45 | done 46 | 47 | while getopts "h" opt; do 48 | case $opt in 49 | h ) HELP=1; incshift ;; 50 | # $opt ) B=$OPTARG; incshift 2 ;; 51 | esac 52 | done 53 | shift $SHIFT 54 | 55 | if test $HELP == 1; then 56 | help 57 | exit 0 58 | fi 59 | 60 | java -cp "${STREAM_LIB_DIR}"/stream-*SNAPSHOT.jar com.clearspring.analytics.util.ObyCount $* 61 | 62 | -------------------------------------------------------------------------------- /bin/topk: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | STREAM_LIB_DIR=${STREAM_LIB_DIR:-./target} 4 | HELP=0 5 | 6 | function help () { 7 | cat <<-HELP 8 | topk -- Finds the top elements in a stream. 9 | 10 | Usage: $( basename $0 ) [options] [CAPACITY] [RATE] 11 | 12 | Finds the top elements in a stream, reporting a summary at the end. 13 | topk looks for the Stream Summary analytics library at the location 14 | of the environment variable STREAM_LIB_DIR (default is ./target). 15 | 16 | Arguments: 17 | CAPACITY Size of top / k (defaults to 1000) 18 | RATE Report interim summary every RATE elements. 19 | 20 | Options: 21 | -h Displays this help. 22 | 23 | Stream Lib Dir: ${STREAM_LIB_DIR} 24 | 25 | HELP 26 | } 27 | 28 | function fail () { 29 | echo "PREDICTABLE FAILURE. 
$1" 30 | if [ "$2" ]; then 31 | help 32 | fi 33 | exit 1 34 | } 35 | 36 | SHIFT=0 37 | function incshift () { 38 | SHIFT=$(( $SHIFT + ${1:-1} )) 39 | } 40 | 41 | for opt in $*; do 42 | case "$opt" in 43 | -h | -he | -hel | -help | --h | --he | --hel | --help ) 44 | HELP=1 ;; 45 | esac 46 | done 47 | 48 | while getopts "h" opt; do 49 | case $opt in 50 | h ) HELP=1; incshift ;; 51 | # $opt ) B=$OPTARG; incshift 2 ;; 52 | esac 53 | done 54 | shift $SHIFT 55 | 56 | if test $HELP == 1; then 57 | help 58 | exit 0 59 | fi 60 | 61 | java -cp "${STREAM_LIB_DIR}"/stream-*SNAPSHOT.jar com.clearspring.analytics.util.TopK $* 62 | 63 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | jar-pom 5 | com.addthis.common.build.maven.pom 6 | 3.8.0 7 | 8 | 9 | 4.0.0 10 | com.clearspring.analytics 11 | stream 12 | jar 13 | 2.9.9-SNAPSHOT 14 | stream-lib 15 | A library for summarizing data in streams for which it is infeasible to store all events 16 | https://github.com/addthis/stream-lib 17 | 18 | 19 | Apache License, Version 2.0 20 | http://www.apache.org/licenses/LICENSE-2.0.txt 21 | 22 | 23 | 24 | 25 | 1.8 26 | 27 | 28 | 29 | 30 | 31 | com.google.guava 32 | guava 33 | 23.3-jre 34 | 35 | 36 | it.unimi.dsi 37 | fastutil 38 | 8.1.1 39 | 40 | 41 | 42 | 43 | 44 | 45 | it.unimi.dsi 46 | fastutil 47 | 48 | 49 | 50 | 51 | junit 52 | junit 53 | 4.12 54 | test 55 | 56 | 57 | org.slf4j 58 | slf4j-simple 59 | 1.7.25 60 | test 61 | 62 | 63 | colt 64 | colt 65 | 1.2.0 66 | test 67 | 68 | 69 | com.googlecode.charts4j 70 | charts4j 71 | 1.3 72 | test 73 | 74 | 75 | commons-codec 76 | commons-codec 77 | 1.11 78 | test 79 | 80 | 81 | com.google.guava 82 | guava 83 | test 84 | 85 | 86 | org.apache.mahout 87 | mahout-math 88 | 0.13.0 89 | test 90 | 91 | 92 | 93 | 94 | 95 | Matt Abrams 96 | abramsm@addthis.com 97 | 98 | 99 | Chris Burroughs 100 | christopher@addthis.com 
101 | 102 | 103 | 104 | 105 | scm:git:git@github.com:addthis/stream-lib.git 106 | scm:git:git@github.com:addthis/stream-lib.git 107 | https://github.com/addthis/stream-lib 108 | HEAD 109 | 110 | 111 | -------------------------------------------------------------------------------- /src/main/java/com/clearspring/analytics/hash/MurmurHash.java: -------------------------------------------------------------------------------- 1 | package com.clearspring.analytics.hash; 2 | 3 | /** 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with this 6 | * work for additional information regarding copyright ownership. The ASF 7 | * licenses this file to you under the Apache License, Version 2.0 (the 8 | * "License"); you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 15 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 16 | * License for the specific language governing permissions and limitations under 17 | * the License. 18 | */ 19 | 20 | /** 21 | * This is a very fast, non-cryptographic hash suitable for general hash-based 22 | * lookup. See http://murmurhash.googlepages.com/ for more details. 23 | *

24 | *

25 | * The C version of MurmurHash 2.0 found at that site was ported to Java by 26 | * Andrzej Bialecki (ab at getopt org). 27 | *

/**
 * MurmurHash 2.0: a very fast, non-cryptographic hash suitable for general
 * hash-based lookup. See http://murmurhash.googlepages.com/ for more details.
 * <p>
 * The C version of MurmurHash 2.0 found at that site was ported to Java by
 * Andrzej Bialecki (ab at getopt org).
 * <p>
 * Strings are always hashed over their UTF-8 encoding, so the hash of a given
 * string does not depend on the JVM's default charset. On JVMs whose default
 * charset is already UTF-8 the results are identical to earlier versions of
 * this class; hashes of byte arrays and of pure-ASCII strings are unaffected
 * everywhere.
 */
public class MurmurHash {

    /**
     * Charset used to turn strings into bytes before hashing. Referenced by
     * its fully-qualified name so this class needs no additional import.
     */
    private static final java.nio.charset.Charset STRING_CHARSET =
            java.nio.charset.StandardCharsets.UTF_8;

    /**
     * Computes a 32 bit hash of an arbitrary object.
     * <p>
     * {@code Long}, {@code Integer}, {@code Double} and {@code Float} are
     * hashed through {@link #hashLong(long)} (floating point values via their
     * raw bit patterns); {@code String} is hashed over its UTF-8 bytes;
     * {@code byte[]} is hashed directly; any other object is hashed through
     * its {@code toString()} representation.
     *
     * @param o object to hash, may be {@code null}
     * @return 32 bit hash, or {@code 0} when {@code o} is {@code null}
     */
    public static int hash(Object o) {
        if (o == null) {
            return 0;
        }
        if (o instanceof Long) {
            return hashLong((Long) o);
        }
        if (o instanceof Integer) {
            return hashLong((Integer) o);
        }
        if (o instanceof Double) {
            return hashLong(Double.doubleToRawLongBits((Double) o));
        }
        if (o instanceof Float) {
            return hashLong(Float.floatToRawIntBits((Float) o));
        }
        if (o instanceof String) {
            // Explicit charset: the no-arg getBytes() depends on the platform default.
            return hash(((String) o).getBytes(STRING_CHARSET));
        }
        if (o instanceof byte[]) {
            return hash((byte[]) o);
        }
        // Re-enters this method through the String branch.
        return hash(o.toString());
    }

    /**
     * Hashes the whole array with the default seed {@code -1}.
     *
     * @param data bytes to hash
     * @return 32 bit hash
     */
    public static int hash(byte[] data) {
        return hash(data, data.length, -1);
    }

    /**
     * Hashes the whole array with the given seed.
     *
     * @param data bytes to hash
     * @param seed initial seed value
     * @return 32 bit hash
     */
    public static int hash(byte[] data, int seed) {
        return hash(data, data.length, seed);
    }

    /**
     * Hashes the first {@code length} bytes of {@code data}.
     *
     * @param data   bytes to hash
     * @param length number of leading bytes of {@code data} to hash
     * @param seed   initial seed value
     * @return 32 bit hash
     */
    public static int hash(byte[] data, int length, int seed) {
        final int m = 0x5bd1e995;
        final int r = 24;

        int h = seed ^ length;

        // Number of complete 4-byte groups.
        final int len_4 = length >> 2;

        for (int i = 0; i < len_4; i++) {
            final int i_4 = i << 2;
            // Assemble the 4 bytes into an int, lowest index in the low byte.
            int k = data[i_4 + 3];
            k = k << 8;
            k = k | (data[i_4 + 2] & 0xff);
            k = k << 8;
            k = k | (data[i_4 + 1] & 0xff);
            k = k << 8;
            k = k | (data[i_4 + 0] & 0xff);
            k *= m;
            k ^= k >>> r;
            k *= m;
            h *= m;
            h ^= k;
        }

        // avoid calculating modulo
        final int len_m = len_4 << 2;
        final int left = length - len_m;

        if (left != 0) {
            // NOTE(review): unlike the loop above, the trailing bytes are mixed in
            // without an "& 0xff" mask (so negative bytes are sign-extended).
            // Kept exactly as-is: changing it would alter every hash value
            // produced so far for inputs whose length is not a multiple of 4.
            if (left >= 3) {
                h ^= (int) data[length - 3] << 16;
            }
            if (left >= 2) {
                h ^= (int) data[length - 2] << 8;
            }
            if (left >= 1) {
                h ^= (int) data[length - 1];
            }

            h *= m;
        }

        // Final avalanche.
        h ^= h >>> 13;
        h *= m;
        h ^= h >>> 15;

        return h;
    }

    /**
     * Hashes a {@code long} by mixing its low and high 32 bit halves.
     *
     * @param data value to hash
     * @return 32 bit hash
     */
    public static int hashLong(long data) {
        final int m = 0x5bd1e995;
        final int r = 24;

        int h = 0;

        // Low 32 bits (the cast binds tighter than the multiplication).
        int k = (int) data * m;
        k ^= k >>> r;
        h ^= k * m;

        // High 32 bits.
        k = (int) (data >> 32) * m;
        k ^= k >>> r;
        h *= m;
        h ^= k * m;

        h ^= h >>> 13;
        h *= m;
        h ^= h >>> 15;

        return h;
    }

    /**
     * Computes a 64 bit hash of an arbitrary object.
     * <p>
     * {@code String} is hashed over its UTF-8 bytes, {@code byte[]} is hashed
     * directly, and any other object is hashed through its {@code toString()}
     * representation.
     *
     * @param o object to hash, may be {@code null}
     * @return 64 bit hash, or {@code 0} when {@code o} is {@code null}
     */
    public static long hash64(Object o) {
        if (o == null) {
            return 0L;
        } else if (o instanceof String) {
            // Explicit charset: the no-arg getBytes() depends on the platform default.
            final byte[] bytes = ((String) o).getBytes(STRING_CHARSET);
            return hash64(bytes, bytes.length);
        } else if (o instanceof byte[]) {
            final byte[] bytes = (byte[]) o;
            return hash64(bytes, bytes.length);
        }
        // Re-enters this method through the String branch.
        return hash64(o.toString());
    }

    // 64 bit implementation copied from here:  https://github.com/tnm/murmurhash-java

    /**
     * Generates 64 bit hash from byte array with default seed value.
     *
     * @param data   byte array to hash
     * @param length length of the array to hash
     * @return 64 bit hash of the given string
     */
    public static long hash64(final byte[] data, int length) {
        return hash64(data, length, 0xe17a1465);
    }

    /**
     * Generates 64 bit hash from byte array of the given length and seed.
     *
     * @param data   byte array to hash
     * @param length length of the array to hash
     * @param seed   initial seed value
     * @return 64 bit hash of the given array
     */
    public static long hash64(final byte[] data, int length, int seed) {
        final long m = 0xc6a4a7935bd1e995L;
        final int r = 47;

        // The seed is treated as an unsigned 32 bit value.
        long h = (seed & 0xffffffffL) ^ (length * m);

        final int length8 = length / 8;

        for (int i = 0; i < length8; i++) {
            final int i8 = i * 8;
            // Assemble 8 bytes into a long, lowest index in the low byte.
            long k = ((long) data[i8 + 0] & 0xff) + (((long) data[i8 + 1] & 0xff) << 8)
                     + (((long) data[i8 + 2] & 0xff) << 16) + (((long) data[i8 + 3] & 0xff) << 24)
                     + (((long) data[i8 + 4] & 0xff) << 32) + (((long) data[i8 + 5] & 0xff) << 40)
                     + (((long) data[i8 + 6] & 0xff) << 48) + (((long) data[i8 + 7] & 0xff) << 56);

            k *= m;
            k ^= k >>> r;
            k *= m;

            h ^= k;
            h *= m;
        }

        // Mix in the 1..7 trailing bytes; every case deliberately falls through.
        switch (length % 8) {
            case 7:
                h ^= (long) (data[(length & ~7) + 6] & 0xff) << 48;
                // fall through
            case 6:
                h ^= (long) (data[(length & ~7) + 5] & 0xff) << 40;
                // fall through
            case 5:
                h ^= (long) (data[(length & ~7) + 4] & 0xff) << 32;
                // fall through
            case 4:
                h ^= (long) (data[(length & ~7) + 3] & 0xff) << 24;
                // fall through
            case 3:
                h ^= (long) (data[(length & ~7) + 2] & 0xff) << 16;
                // fall through
            case 2:
                h ^= (long) (data[(length & ~7) + 1] & 0xff) << 8;
                // fall through
            case 1:
                h ^= (long) (data[length & ~7] & 0xff);
                h *= m;
                break;
            default:
                break;
        }

        // Final avalanche.
        h ^= h >>> r;
        h *= m;
        h ^= h >>> r;

        return h;
    }
}
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.clearspring.analytics.stream; 18 | 19 | import java.util.ArrayList; 20 | import java.util.Collections; 21 | import java.util.List; 22 | import java.util.Map; 23 | import java.util.concurrent.ConcurrentHashMap; 24 | import java.util.concurrent.atomic.AtomicBoolean; 25 | import java.util.concurrent.atomic.AtomicLong; 26 | import java.util.concurrent.atomic.AtomicReference; 27 | 28 | /** 29 | * Based on the Space-Saving algorithm and the Stream-Summary 30 | * data structure as described in: 31 | * Efficient Computation of Frequent and Top-k Elements in Data Streams 32 | * by Metwally, Agrawal, and Abbadi 33 | *

34 | * Ideally used in multithreaded applications, otherwise see {@link StreamSummary} 35 | * 36 | * @param type of data in the stream to be summarized 37 | * @author Eric Vlaanderen 38 | */ 39 | public class ConcurrentStreamSummary implements ITopK { 40 | 41 | private final int capacity; 42 | private final ConcurrentHashMap> itemMap; 43 | private final AtomicReference> minVal; 44 | private final AtomicLong size; 45 | private final AtomicBoolean reachCapacity; 46 | 47 | public ConcurrentStreamSummary(final int capacity) { 48 | this.capacity = capacity; 49 | this.minVal = new AtomicReference>(); 50 | this.size = new AtomicLong(0); 51 | this.itemMap = new ConcurrentHashMap>(capacity); 52 | this.reachCapacity = new AtomicBoolean(false); 53 | } 54 | 55 | @Override 56 | public boolean offer(final T element) { 57 | return offer(element, 1); 58 | } 59 | 60 | @Override 61 | public boolean offer(final T element, final int incrementCount) { 62 | long val = incrementCount; 63 | ScoredItem value = new ScoredItem(element, incrementCount); 64 | ScoredItem oldVal = itemMap.putIfAbsent(element, value); 65 | if (oldVal != null) { 66 | val = oldVal.addAndGetCount(incrementCount); 67 | } else if (reachCapacity.get() || size.incrementAndGet() > capacity) { 68 | reachCapacity.set(true); 69 | 70 | ScoredItem oldMinVal = minVal.getAndSet(value); 71 | itemMap.remove(oldMinVal.getItem()); 72 | 73 | while (oldMinVal.isNewItem()) { 74 | // Wait for the oldMinVal so its error and value are completely up to date. 75 | // no thread.sleep here due to the overhead of calling it - the waiting time will be microseconds. 
76 | } 77 | long count = oldMinVal.getCount(); 78 | 79 | value.addAndGetCount(count); 80 | value.setError(count); 81 | } 82 | value.setNewItem(false); 83 | minVal.set(getMinValue()); 84 | 85 | return val != incrementCount; 86 | } 87 | 88 | private ScoredItem getMinValue() { 89 | ScoredItem minVal = null; 90 | for (ScoredItem entry : itemMap.values()) { 91 | if (minVal == null || (!entry.isNewItem() && entry.getCount() < minVal.getCount())) { 92 | minVal = entry; 93 | } 94 | } 95 | return minVal; 96 | } 97 | 98 | @Override 99 | public String toString() { 100 | StringBuilder sb = new StringBuilder(); 101 | sb.append("["); 102 | for (ScoredItem entry : itemMap.values()) { 103 | sb.append("(" + entry.getCount() + ": " + entry.getItem() + ", e: " + entry.getError() + "),"); 104 | } 105 | sb.deleteCharAt(sb.length() - 1); 106 | sb.append("]"); 107 | return sb.toString(); 108 | } 109 | 110 | @Override 111 | public List peek(final int k) { 112 | List toReturn = new ArrayList(k); 113 | List> values = peekWithScores(k); 114 | for (ScoredItem value : values) { 115 | toReturn.add(value.getItem()); 116 | } 117 | return toReturn; 118 | } 119 | 120 | public List> peekWithScores(final int k) { 121 | List> values = new ArrayList>(); 122 | for (Map.Entry> entry : itemMap.entrySet()) { 123 | ScoredItem value = entry.getValue(); 124 | values.add(new ScoredItem(value.getItem(), value.getCount(), value.getError())); 125 | } 126 | Collections.sort(values); 127 | values = values.size() > k ? values.subList(0, k) : values; 128 | return values; 129 | } 130 | } 131 | -------------------------------------------------------------------------------- /src/main/java/com/clearspring/analytics/stream/Counter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011 Clearspring Technologies, Inc. 
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.clearspring.analytics.stream; 18 | 19 | import java.io.Externalizable; 20 | import java.io.IOException; 21 | import java.io.ObjectInput; 22 | import java.io.ObjectOutput; 23 | 24 | import com.clearspring.analytics.util.ListNode2; 25 | 26 | public class Counter implements Externalizable { 27 | 28 | protected ListNode2.Bucket> bucketNode; 29 | 30 | protected T item; 31 | protected long count; 32 | protected long error; 33 | 34 | /** 35 | * For de-serialization 36 | */ 37 | public Counter() { 38 | } 39 | 40 | public Counter(ListNode2.Bucket> bucket, T item) { 41 | this.bucketNode = bucket; 42 | this.count = 0; 43 | this.error = 0; 44 | this.item = item; 45 | } 46 | 47 | public T getItem() { 48 | return item; 49 | } 50 | 51 | public long getCount() { 52 | return count; 53 | } 54 | 55 | public long getError() { 56 | return error; 57 | } 58 | 59 | @Override 60 | public String toString() { 61 | return item + ":" + count + ':' + error; 62 | } 63 | 64 | @SuppressWarnings("unchecked") 65 | @Override 66 | public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException { 67 | item = (T) in.readObject(); 68 | count = in.readLong(); 69 | error = in.readLong(); 70 | } 71 | 72 | @Override 73 | public void writeExternal(ObjectOutput out) throws IOException { 74 | out.writeObject(item); 75 | out.writeLong(count); 76 | out.writeLong(error); 77 | } 78 
/**
 * A set of sampled elements with per-element counts.
 *
 * @param <T> type of the sampled elements
 */
public interface ISampleSet<T> {

    /**
     * Adds one occurrence of {@code element}.
     *
     * @param element the element to add
     * @return the count reported for the element after the update
     */
    long put(T element);

    /**
     * Adds {@code incrementCount} occurrences of {@code element}.
     *
     * @param element        the element to add
     * @param incrementCount number of occurrences to add
     * @return the count reported for the element after the update
     */
    long put(T element, int incrementCount);

    /**
     * Removes one randomly chosen occurrence from the set.
     *
     * @return the element whose occurrence was removed
     */
    T removeRandom();

    /**
     * @return the first element of the set without removing it
     */
    T peek();

    /**
     * @param k maximum number of elements to return
     * @return up to {@code k} leading elements of the set
     */
    List<T> peek(int k);

    /**
     * @return the size of the set
     */
    int size();

    /**
     * @return the count maintained by the set
     */
    long count();
}
/**
 * Tracks the most frequent elements offered to it.
 *
 * @param <T> type of the offered elements
 */
public interface ITopK<T> {

    /**
     * offer a single element to the top.
     *
     * @param element - the element to add to the top
     * @return false if the element was already in the top
     */
    boolean offer(T element);

    /**
     * offer a single element to the top and increment the count
     * for that element by incrementCount.
     *
     * @param element        - the element to add to the top
     * @param incrementCount - the amount to add to the count of the given element
     * @return false if the element was already in the top
     */
    boolean offer(T element, int incrementCount);

    /**
     * @param k maximum number of elements to return
     * @return top k elements offered (may be an approximation)
     */
    List<T> peek(int k);
}
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.clearspring.analytics.stream; 18 | 19 | import java.util.ArrayList; 20 | import java.util.HashMap; 21 | import java.util.List; 22 | import java.util.Map; 23 | import java.util.Random; 24 | 25 | public class SampleSet implements ISampleSet { 26 | 27 | private Map> sampleMap; 28 | private int size; 29 | private long count; 30 | private Random random; 31 | 32 | /** 33 | * Element with the highest frequency in the set 34 | */ 35 | private Node head; 36 | 37 | /** 38 | * Element with the lowest frequency in the set 39 | */ 40 | private Node tail; 41 | 42 | public SampleSet() { 43 | this(7); 44 | } 45 | 46 | public SampleSet(int capacity) { 47 | this(capacity, new Random()); 48 | } 49 | 50 | public SampleSet(int capacity, Random random) { 51 | sampleMap = new HashMap>(capacity); 52 | this.random = random; 53 | } 54 | 55 | public T peek() { 56 | return (head != null) ? 
head.element : null; 57 | } 58 | 59 | public List peek(int k) { 60 | List topK = new ArrayList(k); 61 | for (Node itr = head; itr != null && topK.size() < k; itr = itr.next) { 62 | topK.add(itr.element); 63 | } 64 | return topK; 65 | } 66 | 67 | public long put(T element) { 68 | return put(element, 1); 69 | } 70 | 71 | public long put(T element, int incrementCount) { 72 | Node node = sampleMap.get(element); 73 | if (node != null) { 74 | node.count = node.count + incrementCount; 75 | promote(node); 76 | } else { 77 | node = new Node(); 78 | node.element = element; 79 | node.count = incrementCount; 80 | node.prev = tail; 81 | if (tail != null) { 82 | tail.next = node; 83 | } 84 | tail = node; 85 | if (head == null) { 86 | head = node; 87 | } 88 | sampleMap.put(element, node); 89 | size++; 90 | } 91 | count++; 92 | return node.count; 93 | } 94 | 95 | public T removeRandom() { 96 | double p = random.nextDouble(); 97 | long weight = 0; 98 | for (Node itr = head; itr != null; itr = itr.next) { 99 | weight += itr.count; 100 | if (p < weight / (double) count) { 101 | itr.count--; 102 | count--; 103 | demote(itr); 104 | if (itr.count == 0) { 105 | removeMin(); 106 | } 107 | return itr.element; 108 | } 109 | } 110 | return null; 111 | } 112 | 113 | protected T removeMin() { 114 | if (tail == null) { 115 | return null; 116 | } 117 | size--; 118 | count -= tail.count; 119 | T minElement = tail.element; 120 | tail = tail.prev; 121 | if (tail != null) { 122 | tail.next = null; 123 | } 124 | sampleMap.remove(minElement); 125 | return minElement; 126 | } 127 | 128 | public int size() { 129 | return size; 130 | } 131 | 132 | public long count() { 133 | return count; 134 | } 135 | 136 | protected T peekMin() { 137 | return tail.element; 138 | } 139 | 140 | protected void promote(Node node) { 141 | // Bring node closer to the head as necessary 142 | while (node.prev != null && node.count > node.prev.count) { 143 | // BEFORE head... 
[A]node.prev.prev --> [B]node.prev --> [C]node --> [D]node.next ...tail 144 | // AFTER head... [A]node.prev.prev --> [C]node --> [B]node.prev --> [D]node.next ...tail 145 | Node b = node.prev, c = node, d = node.next, a = (b == null) ? null : b.prev; 146 | 147 | // Re-link each of 3 neighboring pairs 148 | if (a != null) { 149 | a.next = c; 150 | } 151 | c.prev = a; 152 | 153 | c.next = b; 154 | b.prev = c; 155 | 156 | b.next = d; 157 | if (d != null) { 158 | d.prev = b; 159 | } 160 | 161 | // B and C may have switched head/tail roles 162 | if (head == b) { 163 | head = c; 164 | } 165 | if (tail == c) { 166 | tail = b; 167 | } 168 | } 169 | } 170 | 171 | protected void demote(Node node) { 172 | // Bring node closer to the tail as necessary 173 | while (node.next != null && node.count < node.next.count) { 174 | // BEFORE head... [A]node.prev --> [B]node --> [C]node.next --> [D]node.next.next ...tail 175 | // AFTER head... [A]node.prev --> [C]node.next --> [B]node --> [D]node.next.next ...tail 176 | Node a = node.prev, b = node, c = node.next, d = (c == null) ? null : c.next; 177 | 178 | // Re-link each of 3 neighboring pairs 179 | if (a != null) { 180 | a.next = c; 181 | } 182 | c.prev = a; 183 | 184 | c.next = b; 185 | b.prev = c; 186 | 187 | if (d != null) { 188 | d.prev = b; 189 | } 190 | b.next = d; 191 | 192 | // B and C may have switched head/tail roles 193 | if (head == b) { 194 | head = c; 195 | } 196 | if (tail == c) { 197 | tail = b; 198 | } 199 | } 200 | } 201 | 202 | private class Node { 203 | 204 | private Node next; 205 | private Node prev; 206 | private E element; 207 | private long count; 208 | } 209 | } 210 | -------------------------------------------------------------------------------- /src/main/java/com/clearspring/analytics/stream/ScoredItem.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011 Clearspring Technologies, Inc. 
/**
 * An item with an atomically updatable count, error and "new item" flag.
 * <p>
 * The natural ordering sorts items by descending count: an item with a higher
 * count compares as smaller. The ordering looks at the count only, so it is
 * not consistent with {@code equals}.
 *
 * @param <T> type of the scored item
 * @author Eric Vlaanderen
 */
public class ScoredItem<T> implements Comparable<ScoredItem<T>> {

    private final AtomicLong error;
    private final AtomicLong count;
    private final AtomicBoolean newItem;
    private final T item;

    /**
     * Creates an item flagged as new.
     *
     * @param item  the scored item
     * @param count initial count
     * @param error initial error
     */
    public ScoredItem(final T item, final long count, final long error) {
        this.item = item;
        this.error = new AtomicLong(error);
        this.count = new AtomicLong(count);
        this.newItem = new AtomicBoolean(true);
    }

    /**
     * Creates an item flagged as new, with an error of zero.
     */
    public ScoredItem(final T item, final long count) {
        this(item, count, 0L);
    }

    /**
     * Atomically adds {@code delta} to the count.
     *
     * @return the updated count
     */
    public long addAndGetCount(final long delta) {
        return this.count.addAndGet(delta);
    }

    public void setError(final long newError) {
        this.error.set(newError);
    }

    public long getError() {
        return error.get();
    }

    public T getItem() {
        return item;
    }

    public boolean isNewItem() {
        return newItem.get();
    }

    public long getCount() {
        return count.get();
    }

    /**
     * Orders by descending count.
     *
     * @return a negative value if this item's count is greater than
     *         {@code o}'s, zero if the counts are equal, a positive value otherwise
     */
    @Override
    public int compareTo(final ScoredItem<T> o) {
        // Arguments are deliberately reversed: larger counts sort first.
        return Long.compare(o.count.get(), count.get());
    }

    @Override
    public String toString() {
        return "Value: " + item
                + ", Count: " + count
                + ", Error: " + error
                + ", object: " + super.toString();
    }


    public void setNewItem(final boolean newItem) {
        this.newItem.set(newItem);
    }
}

29 | * Warning: this class is not thread safe. 30 | */ 31 | public class StochasticTopper implements ITopK { 32 | 33 | private int sampleSize; 34 | private ISampleSet sample; 35 | private Random random; 36 | private long count; 37 | 38 | public StochasticTopper(int sampleSize) { 39 | this(sampleSize, null); 40 | } 41 | 42 | public StochasticTopper(int sampleSize, Long seed) { 43 | this.sample = new SampleSet(sampleSize); 44 | this.sampleSize = sampleSize; 45 | 46 | if (seed != null) { 47 | random = new Random(seed); 48 | } else { 49 | random = new Random(); 50 | } 51 | } 52 | 53 | public boolean offer(T item, int incrementCount) { 54 | count++; 55 | boolean taken = false; 56 | if (sample.count() < sampleSize) { 57 | sample.put(item, incrementCount); 58 | taken = true; 59 | } else if (random.nextDouble() < sampleSize / (double) count) { 60 | sample.removeRandom(); 61 | sample.put(item, incrementCount); 62 | taken = true; 63 | } 64 | 65 | return taken; 66 | } 67 | 68 | public boolean offer(T item) { 69 | return offer(item, 1); 70 | } 71 | 72 | /** 73 | * Retrieve top k items 74 | */ 75 | public List peek(int k) { 76 | return sample.peek(k); 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /src/main/java/com/clearspring/analytics/stream/cardinality/AdaptiveCounting.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011 Clearspring Technologies, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.clearspring.analytics.stream.cardinality; 18 | 19 | import java.io.Serializable; 20 | 21 | import java.util.Arrays; 22 | 23 | import com.clearspring.analytics.hash.Lookup3Hash; 24 | import com.clearspring.analytics.util.IBuilder; 25 | 26 | /** 27 | *

28 | * Based on the adaptive counting approach of:
29 | * Fast and Accurate Traffic Matrix Measurement Using Adaptive Cardinality Counting
30 | * by: Cai, Pan, Kwok, and Hwang 31 | *

32 | *

33 | * TODO: use 5 bits/bucket instead of 8 (37.5% size reduction)
34 | * TODO: super-LogLog optimizations 35 | */ 36 | public class AdaptiveCounting extends LogLog { 37 | 38 | /** 39 | * Number of empty buckets 40 | */ 41 | protected int b_e; 42 | 43 | /** 44 | * Switching empty bucket ratio 45 | */ 46 | protected final double B_s = 0.051; 47 | 48 | public AdaptiveCounting(int k) { 49 | super(k); 50 | b_e = m; 51 | } 52 | 53 | public AdaptiveCounting(byte[] M) { 54 | super(M); 55 | 56 | for (byte b : M) { 57 | if (b == 0) { 58 | b_e++; 59 | } 60 | } 61 | } 62 | 63 | @Override 64 | public boolean offer(Object o) { 65 | boolean modified = false; 66 | 67 | long x = Lookup3Hash.lookup3ycs64(o.toString()); 68 | int j = (int) (x >>> (Long.SIZE - k)); 69 | byte r = (byte) (Long.numberOfLeadingZeros((x << k) | (1 << (k - 1))) + 1); 70 | if (M[j] < r) { 71 | Rsum += r - M[j]; 72 | if (M[j] == 0) { 73 | b_e--; 74 | } 75 | M[j] = r; 76 | modified = true; 77 | } 78 | 79 | return modified; 80 | } 81 | 82 | @Override 83 | public long cardinality() { 84 | double B = (b_e / (double) m); 85 | if (B >= B_s) { 86 | return (long) Math.round(-m * Math.log(B)); 87 | } 88 | 89 | return super.cardinality(); 90 | } 91 | 92 | 93 | /** 94 | * Computes the position of the first set bit of the last Long.SIZE-k bits 95 | * 96 | * @return Long.SIZE-k if the last k bits are all zero 97 | */ 98 | protected static byte rho(long x, int k) { 99 | return (byte) (Long.numberOfLeadingZeros((x << k) | (1 << (k - 1))) + 1); 100 | } 101 | 102 | /** 103 | * @return this if estimators is null or no arguments are passed 104 | * @throws LogLogMergeException if estimators are not mergeable (all estimators must be instances of LogLog of the same size) 105 | */ 106 | @Override 107 | public ICardinality merge(ICardinality... 
estimators) throws LogLogMergeException { 108 | LogLog res = (LogLog) super.merge(estimators); 109 | return new AdaptiveCounting(res.M); 110 | } 111 | 112 | /** 113 | * Merges estimators to produce an estimator for their combined streams 114 | * 115 | * @param estimators 116 | * @return merged estimator or null if no estimators were provided 117 | * @throws LogLogMergeException if estimators are not mergeable (all estimators must be the same size) 118 | */ 119 | public static AdaptiveCounting mergeEstimators(LogLog... estimators) throws LogLogMergeException { 120 | if (estimators == null || estimators.length == 0) { 121 | return null; 122 | } 123 | return (AdaptiveCounting) estimators[0].merge(Arrays.copyOfRange(estimators, 1, estimators.length)); 124 | } 125 | 126 | public static class Builder implements IBuilder, Serializable { 127 | 128 | private static final long serialVersionUID = 2205437102378081334L; 129 | 130 | protected final int k; 131 | 132 | public Builder() { 133 | this(16); 134 | } 135 | 136 | public Builder(int k) { 137 | this.k = k; 138 | } 139 | 140 | @Override 141 | public AdaptiveCounting build() { 142 | return new AdaptiveCounting(k); 143 | } 144 | 145 | @Override 146 | public int sizeof() { 147 | return 1 << k; 148 | } 149 | 150 | /** 151 | *

152 | * For cardinalities less than 4.25M, obyCount provides a LinearCounting Builder 153 | * (see LinearCounting.Builder.onePercentError() ) using only the 154 | * space required to provide estimates within 1% of the actual cardinality, 155 | * up to ~65k. 156 | *

157 | *

158 | * For cardinalities greater than 4.25M, an AdaptiveCounting builder is returned 159 | * that allocates ~65KB and provides estimates with a Gaussian error distribution 160 | * with an average error of 0.5% and a standard deviation of 0.5% 161 | *

162 | * 163 | * @param maxCardinality 164 | * @throws IllegalArgumentException if maxCardinality is not a positive integer 165 | * @see LinearCounting.Builder#onePercentError(int) 166 | */ 167 | public static IBuilder obyCount(long maxCardinality) { 168 | if (maxCardinality <= 0) { 169 | throw new IllegalArgumentException("maxCardinality (" + maxCardinality + ") must be a positive integer"); 170 | } 171 | 172 | if (maxCardinality < 4250000) { 173 | return LinearCounting.Builder.onePercentError((int) maxCardinality); 174 | } 175 | 176 | return new Builder(16); 177 | } 178 | } 179 | } 180 | -------------------------------------------------------------------------------- /src/main/java/com/clearspring/analytics/stream/cardinality/CardinalityMergeException.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011 Clearspring Technologies, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
/**
 * Base type for the checked exceptions raised when cardinality estimators
 * cannot be merged. It is abstract, so callers throw a concrete subclass.
 */
@SuppressWarnings("serial")
public abstract class CardinalityMergeException extends Exception {

    /**
     * @param message description of why the merge failed
     */
    public CardinalityMergeException(String message) {
        super(message);
    }
}
/**
 * A counter of the distinct elements of a stream, exact or estimated.
 */
public interface ICardinality {

    /**
     * Offers a stream element to the estimator.
     *
     * @param o stream element
     * @return false if the value returned by cardinality() is unaffected by the appearance of o in the stream.
     */
    boolean offer(Object o);

    /**
     * Offer the value as a hashed long value
     *
     * @param hashedLong - the hash of the item to offer to the estimator
     * @return false if the value returned by cardinality() is unaffected by the appearance of hashedLong in the stream
     */
    boolean offerHashed(long hashedLong);

    /**
     * Offer the value as a hashed int value
     *
     * @param hashedInt - the hash of the item to offer to the estimator
     * @return false if the value returned by cardinality() is unaffected by the appearance of hashedInt in the stream
     */
    boolean offerHashed(int hashedInt);

    /**
     * @return the number of unique elements in the stream or an estimate thereof
     */
    long cardinality();

    /**
     * @return size in bytes needed for serialization
     */
    int sizeof();

    /**
     * @return the estimator's state as a byte array
     *         (NOTE(review): the layout is defined by each implementation - confirm before relying on it)
     * @throws IOException if the state cannot be written out
     */
    byte[] getBytes() throws IOException;

    /**
     * Merges estimators to produce a new estimator for the combined streams
     * of this estimator and those passed as arguments.
     * <p/>
     * Neither this estimator nor the ones passed as parameters are modified.
     *
     * @param estimators Zero or more compatible estimators
     * @return a new estimator for the combined streams
     * @throws CardinalityMergeException If at least one of the estimators is not compatible with this one
     */
    ICardinality merge(ICardinality... estimators) throws CardinalityMergeException;
}
15 | */ 16 | 17 | package com.clearspring.analytics.stream.cardinality; 18 | 19 | import java.util.Arrays; 20 | 21 | import com.clearspring.analytics.hash.MurmurHash; 22 | import com.clearspring.analytics.util.IBuilder; 23 | 24 | public class LogLog implements ICardinality { 25 | 26 | /** 27 | * Gamma function computed using Mathematica 28 | * AccountingForm[ 29 | * N[With[{m = 2^Range[0, 31]}, 30 | * m (Gamma[-1/m]*(1 - 2^(1/m))/Log[2])^-m], 14]] 31 | */ 32 | protected static final double[] mAlpha = { 33 | 0, 34 | 0.44567926005415, 35 | 1.2480639342271, 36 | 2.8391255240079, 37 | 6.0165231584809, 38 | 12.369319965552, 39 | 25.073991603111, 40 | 50.482891762408, 41 | 101.30047482584, 42 | 202.93553338100, 43 | 406.20559696699, 44 | 812.74569744189, 45 | 1625.8258850594, 46 | 3251.9862536323, 47 | 6504.3069874480, 48 | 13008.948453415, 49 | 26018.231384516, 50 | 52036.797246302, 51 | 104073.92896967, 52 | 208148.19241629, 53 | 416296.71930949, 54 | 832593.77309585, 55 | 1665187.8806686, 56 | 3330376.0958140, 57 | 6660752.5261049, 58 | 13321505.386687, 59 | 26643011.107850, 60 | 53286022.550177, 61 | 106572045.43483, 62 | 213144091.20414, 63 | 426288182.74275, 64 | 852576365.81999 65 | }; 66 | 67 | protected final int k; 68 | protected int m; 69 | protected double Ca; 70 | protected byte[] M; 71 | protected int Rsum = 0; 72 | 73 | public LogLog(int k) { 74 | if (k >= (mAlpha.length - 1)) { 75 | throw new IllegalArgumentException(String.format("Max k (%d) exceeded: k=%d", mAlpha.length - 1, k)); 76 | } 77 | 78 | this.k = k; 79 | this.m = 1 << k; 80 | this.Ca = mAlpha[k]; 81 | this.M = new byte[m]; 82 | } 83 | 84 | public LogLog(byte[] M) { 85 | this.M = M; 86 | this.m = M.length; 87 | this.k = Integer.numberOfTrailingZeros(m); 88 | assert (m == (1 << k)) : "Invalid array size: M.length must be a power of 2"; 89 | this.Ca = mAlpha[k]; 90 | for (byte b : M) { 91 | Rsum += b; 92 | } 93 | } 94 | 95 | @Override 96 | public byte[] getBytes() { 97 | return M; 98 | } 99 | 100 
| public int sizeof() { 101 | return m; 102 | } 103 | 104 | @Override 105 | public long cardinality() { 106 | /* 107 | for(int j=0; j>> (Integer.SIZE - k); 125 | byte r = (byte) (Integer.numberOfLeadingZeros((hashedInt << k) | (1 << (k - 1))) + 1); 126 | if (M[j] < r) { 127 | Rsum += r - M[j]; 128 | M[j] = r; 129 | modified = true; 130 | } 131 | 132 | return modified; 133 | } 134 | 135 | @Override 136 | public boolean offer(Object o) { 137 | int x = MurmurHash.hash(o); 138 | return offerHashed(x); 139 | } 140 | 141 | /** 142 | * Computes the position of the first set bit of the last Integer.SIZE-k bits 143 | * 144 | * @return Integer.SIZE-k if the last k bits are all zero 145 | */ 146 | protected static int rho(int x, int k) { 147 | return Integer.numberOfLeadingZeros((x << k) | (1 << (k - 1))) + 1; 148 | } 149 | 150 | /** 151 | * @return this if estimators is null or no arguments are passed 152 | * @throws LogLogMergeException if estimators are not mergeable (all estimators must be instances of LogLog of the same size) 153 | */ 154 | @Override 155 | public ICardinality merge(ICardinality... 
estimators) throws LogLogMergeException { 156 | if (estimators == null) { 157 | return new LogLog(M); 158 | } 159 | 160 | byte[] mergedBytes = Arrays.copyOf(this.M, this.M.length); 161 | for (ICardinality estimator : estimators) { 162 | if (!(this.getClass().isInstance(estimator))) { 163 | throw new LogLogMergeException("Cannot merge estimators of different class"); 164 | } 165 | if (estimator.sizeof() != this.sizeof()) { 166 | throw new LogLogMergeException("Cannot merge estimators of different sizes"); 167 | } 168 | LogLog ll = (LogLog) estimator; 169 | for (int i = 0; i < mergedBytes.length; ++i) { 170 | mergedBytes[i] = (byte) Math.max(mergedBytes[i], ll.M[i]); 171 | } 172 | } 173 | 174 | return new LogLog(mergedBytes); 175 | } 176 | 177 | /** 178 | * Merges estimators to produce an estimator for their combined streams 179 | * 180 | * @param estimators 181 | * @return merged estimator or null if no estimators were provided 182 | * @throws LogLogMergeException if estimators are not mergeable (all estimators must be the same size) 183 | */ 184 | public static LogLog mergeEstimators(LogLog... 
estimators) throws LogLogMergeException { 185 | if (estimators == null || estimators.length == 0) { 186 | return null; 187 | } 188 | return (LogLog) estimators[0].merge(Arrays.copyOfRange(estimators, 1, estimators.length)); 189 | } 190 | 191 | 192 | @SuppressWarnings("serial") 193 | protected static class LogLogMergeException extends CardinalityMergeException { 194 | 195 | public LogLogMergeException(String message) { 196 | super(message); 197 | } 198 | } 199 | 200 | public static class Builder implements IBuilder { 201 | 202 | protected final int k; 203 | 204 | public Builder() { 205 | this(16); 206 | } 207 | 208 | public Builder(int k) { 209 | this.k = k; 210 | } 211 | 212 | @Override 213 | public LogLog build() { 214 | return new LogLog(k); 215 | } 216 | 217 | @Override 218 | public int sizeof() { 219 | return 1 << k; 220 | } 221 | } 222 | } 223 | -------------------------------------------------------------------------------- /src/main/java/com/clearspring/analytics/stream/cardinality/RegisterSet.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 Clearspring Technologies, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
/**
 * Fixed-width registers packed into an {@code int} array.
 * <p>
 * Each register is {@link #REGISTER_SIZE} bits wide and each array word holds
 * {@link #LOG2_BITS_PER_WORD} registers: register {@code p} is stored in word
 * {@code p / 6}, shifted left by {@code 5 * (p % 6)} bits.
 * <p>
 * This class performs no synchronization.
 */
public class RegisterSet {

    /** Number of registers stored per array word (used as the divisor for positions). */
    public final static int LOG2_BITS_PER_WORD = 6;
    /** Width of one register in bits. */
    public final static int REGISTER_SIZE = 5;

    /** Register count this set was created for. */
    public final int count;
    /** Length of the backing array. */
    public final int size;

    private final int[] M;

    /**
     * Creates a zeroed register set for {@code count} registers.
     */
    public RegisterSet(int count) {
        this(count, null);
    }

    /**
     * @param count         number of registers
     * @param initialValues backing array to use directly (it is not copied),
     *                      or null to allocate a zeroed array
     */
    public RegisterSet(int count, int[] initialValues) {
        this.count = count;
        this.M = (initialValues != null) ? initialValues : new int[getSizeForCount(count)];
        this.size = this.M.length;
    }

    /**
     * @return {@code count / LOG2_BITS_PER_WORD}
     */
    public static int getBits(int count) {
        return count / LOG2_BITS_PER_WORD;
    }

    /**
     * @return the backing array length used for {@code count} registers
     */
    public static int getSizeForCount(int count) {
        final int words = getBits(count);
        if (words == 0) {
            return 1;
        }
        return (words % Integer.SIZE == 0) ? words : words + 1;
    }

    /**
     * Overwrites the register at {@code position} with {@code value}.
     */
    public void set(int position, int value) {
        final int word = position / LOG2_BITS_PER_WORD;
        final int shift = REGISTER_SIZE * (position % LOG2_BITS_PER_WORD);
        final int cleared = this.M[word] & ~(0x1f << shift);
        this.M[word] = cleared | (value << shift);
    }

    /**
     * @return the value of the register at {@code position}
     */
    public int get(int position) {
        final int word = position / LOG2_BITS_PER_WORD;
        final int shift = REGISTER_SIZE * (position % LOG2_BITS_PER_WORD);
        return (this.M[word] >>> shift) & 0x1f;
    }

    /**
     * Stores {@code value} at {@code position} if it is greater than the
     * current register value.
     *
     * @return true if the register was changed
     */
    public boolean updateIfGreater(int position, int value) {
        final int word = position / LOG2_BITS_PER_WORD;
        final int shift = REGISTER_SIZE * (position % LOG2_BITS_PER_WORD);
        final int mask = 0x1f << shift;

        // Compare both values in their shifted position.
        final int current = this.M[word] & mask;
        final int candidate = value << shift;
        if (current >= candidate) {
            return false;
        }
        this.M[word] = (this.M[word] & ~mask) | candidate;
        return true;
    }

    /**
     * Replaces every register of this set with the larger of its own value and
     * the corresponding register of {@code that}.
     */
    public void merge(RegisterSet that) {
        for (int word = 0; word < M.length; word++) {
            int merged = 0;
            for (int slot = 0; slot < LOG2_BITS_PER_WORD; slot++) {
                final int mask = 0x1f << (REGISTER_SIZE * slot);
                merged |= Math.max(this.M[word] & mask, that.M[word] & mask);
            }
            this.M[word] = merged;
        }
    }

    /**
     * @return the backing array itself; callers must not modify it
     */
    int[] readOnlyBits() {
        return M;
    }

    /**
     * @return a copy of the backing array
     */
    public int[] bits() {
        return M.clone();
    }
}
24 | */ 25 | public class ConservativeAddSketch extends CountMinSketch { 26 | 27 | ConservativeAddSketch() { 28 | super(); 29 | } 30 | 31 | public ConservativeAddSketch(int depth, int width, int seed) { 32 | super(depth, width, seed); 33 | } 34 | 35 | public ConservativeAddSketch(double epsOfTotalCount, double confidence, int seed) { 36 | super(epsOfTotalCount, confidence, seed); 37 | } 38 | 39 | ConservativeAddSketch(int depth, int width, long size, long[] hashA, long[][] table) { 40 | super(depth, width, size, hashA, table); 41 | } 42 | 43 | @Override 44 | public void add(long item, long count) { 45 | if (count < 0) { 46 | // Negative values are not implemented in the regular version, and do not 47 | // play nicely with this algorithm anyway 48 | throw new IllegalArgumentException("Negative increments not implemented"); 49 | } 50 | int[] buckets = new int[depth]; 51 | for (int i = 0; i < depth; ++i) { 52 | buckets[i] = hash(item, i); 53 | } 54 | long min = table[0][buckets[0]]; 55 | for (int i = 1; i < depth; ++i) { 56 | min = Math.min(min, table[i][buckets[i]]); 57 | } 58 | for (int i = 0; i < depth; ++i) { 59 | long newVal = Math.max(table[i][buckets[i]], min + count); 60 | table[i][buckets[i]] = newVal; 61 | } 62 | size += count; 63 | } 64 | 65 | @Override 66 | public void add(String item, long count) { 67 | if (count < 0) { 68 | // Negative values are not implemented in the regular version, and do not 69 | // play nicely with this algorithm anyway 70 | throw new IllegalArgumentException("Negative increments not implemented"); 71 | } 72 | int[] buckets = Filter.getHashBuckets(item, depth, width); 73 | long min = table[0][buckets[0]]; 74 | for (int i = 1; i < depth; ++i) { 75 | min = Math.min(min, table[i][buckets[i]]); 76 | } 77 | for (int i = 0; i < depth; ++i) { 78 | long newVal = Math.max(table[i][buckets[i]], min + count); 79 | table[i][buckets[i]] = newVal; 80 | } 81 | size += count; 82 | } 83 | } 84 | 
--------------------------------------------------------------------------------
/src/main/java/com/clearspring/analytics/stream/frequency/FrequencyMergeException.java:
--------------------------------------------------------------------------------
/*
 * Copyright (C) 2011 Clearspring Technologies, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.clearspring.analytics.stream.frequency;

/**
 * Abstract checked exception carrying a message; concrete subclasses are defined elsewhere.
 * NOTE(review): presumably raised when two frequency sketches cannot be merged - confirm
 * against the subclasses and their throw sites.
 */
@SuppressWarnings("serial")
public abstract class FrequencyMergeException extends Exception {

    /**
     * @param message detail message passed to {@link Exception}
     */
    public FrequencyMergeException(String message) {
        super(message);
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/clearspring/analytics/stream/frequency/IFrequency.java:
--------------------------------------------------------------------------------
package com.clearspring.analytics.stream.frequency;

/**
 * Contract for structures that record item occurrences and report per-item counts.
 * Exact versus approximate semantics are defined by the implementations.
 */
public interface IFrequency {

    /** Records {@code count} occurrences of the numeric {@code item}. */
    void add(long item, long count);

    /** Records {@code count} occurrences of the string {@code item}. */
    void add(String item, long count);

    /** Returns the (possibly estimated) count recorded for the numeric {@code item}. */
    long estimateCount(long item);

    /** Returns the (possibly estimated) count recorded for the string {@code item}. */
    long estimateCount(String item);

    // NOTE(review): presumably the sum of all counts added so far - confirm per implementation.
    long size();
}

--------------------------------------------------------------------------------
/src/main/java/com/clearspring/analytics/stream/membership/BitSetSerializer.java:
--------------------------------------------------------------------------------
/*
 * Licensed to the Apache
Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | package com.clearspring.analytics.stream.membership; 20 | 21 | import java.io.DataInputStream; 22 | import java.io.DataOutputStream; 23 | import java.io.IOException; 24 | import java.io.ObjectInputStream; 25 | import java.io.ObjectOutputStream; 26 | 27 | import java.util.BitSet; 28 | 29 | public class BitSetSerializer { 30 | 31 | public static void serialize(BitSet bs, DataOutputStream dos) throws IOException { 32 | ObjectOutputStream oos = new ObjectOutputStream(dos); 33 | oos.writeObject(bs); 34 | oos.flush(); 35 | } 36 | 37 | public static BitSet deserialize(DataInputStream dis) throws IOException { 38 | ObjectInputStream ois = new ObjectInputStream(dis); 39 | try { 40 | return (BitSet) ois.readObject(); 41 | } catch (ClassNotFoundException e) { 42 | throw new RuntimeException(e); 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/java/com/clearspring/analytics/stream/membership/BloomCalculations.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor 
license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.clearspring.analytics.stream.membership; 20 | 21 | /** 22 | * The following calculations are taken from: 23 | * http://www.cs.wisc.edu/~cao/papers/summary-cache/node8.html 24 | * "Bloom Filters - the math" 25 | *

26 | * This class's static methods are meant to facilitate the use of the Bloom 27 | * Filter class by helping to choose correct values of 'bits per element' and 28 | * 'number of hash functions, k'. 29 | */ 30 | public class BloomCalculations { 31 | 32 | private static final int maxBuckets = 15; 33 | private static final int minBuckets = 2; 34 | private static final int minK = 1; 35 | private static final int maxK = 8; 36 | private static final int[] optKPerBuckets = 37 | new int[]{1, // dummy K for 0 buckets per element 38 | 1, // dummy K for 1 buckets per element 39 | 1, 2, 3, 3, 4, 5, 5, 6, 7, 8, 8, 9, 10, 10, 11, 12, 12, 13, 14}; 40 | 41 | /** 42 | * In the following table, the row 'i' shows false positive rates if i buckets 43 | * per element are used. Column 'j' shows false positive rates if j hash 44 | * functions are used. The first row is 'i=0', the first column is 'j=0'. 45 | * Each cell (i,j) the false positive rate determined by using i buckets per 46 | * element and j hash functions. 
47 | */ 48 | static final double[][] probs = new double[][]{ 49 | {1.0}, // dummy row representing 0 buckets per element 50 | {1.0, 1.0}, // dummy row representing 1 buckets per element 51 | {1.0, 0.393, 0.400}, 52 | {1.0, 0.283, 0.237, 0.253}, 53 | {1.0, 0.221, 0.155, 0.147, 0.160}, 54 | {1.0, 0.181, 0.109, 0.092, 0.092, 0.101}, // 5 55 | {1.0, 0.154, 0.0804, 0.0609, 0.0561, 0.0578, 0.0638}, 56 | {1.0, 0.133, 0.0618, 0.0423, 0.0359, 0.0347, 0.0364}, 57 | {1.0, 0.118, 0.0489, 0.0306, 0.024, 0.0217, 0.0216, 0.0229}, 58 | {1.0, 0.105, 0.0397, 0.0228, 0.0166, 0.0141, 0.0133, 0.0135, 0.0145}, 59 | {1.0, 0.0952, 0.0329, 0.0174, 0.0118, 0.00943, 0.00844, 0.00819, 0.00846}, // 10 60 | {1.0, 0.0869, 0.0276, 0.0136, 0.00864, 0.0065, 0.00552, 0.00513, 0.00509}, 61 | {1.0, 0.08, 0.0236, 0.0108, 0.00646, 0.00459, 0.00371, 0.00329, 0.00314}, 62 | {1.0, 0.074, 0.0203, 0.00875, 0.00492, 0.00332, 0.00255, 0.00217, 0.00199, 0.00194}, 63 | {1.0, 0.0689, 0.0177, 0.00718, 0.00381, 0.00244, 0.00179, 0.00146, 0.00129, 0.00121, 0.0012}, 64 | {1.0, 0.0645, 0.0156, 0.00596, 0.003, 0.00183, 0.00128, 0.001, 0.000852, 0.000775, 0.000744}, // 15 65 | {1.0, 0.0606, 0.0138, 0.005, 0.00239, 0.00139, 0.000935, 0.000702, 0.000574, 0.000505, 0.00047, 0.000459}, 66 | {1.0, 0.0571, 0.0123, 0.00423, 0.00193, 0.00107, 0.000692, 0.000499, 0.000394, 0.000335, 0.000302, 0.000287, 0.000284}, 67 | {1.0, 0.054, 0.0111, 0.00362, 0.00158, 0.000839, 0.000519, 0.00036, 0.000275, 0.000226, 0.000198, 0.000183, 0.000176}, 68 | {1.0, 0.0513, 0.00998, 0.00312, 0.0013, 0.000663, 0.000394, 0.000264, 0.000194, 0.000155, 0.000132, 0.000118, 0.000111, 0.000109}, 69 | {1.0, 0.0488, 0.00906, 0.0027, 0.00108, 0.00053, 0.000303, 0.000196, 0.00014, 0.000108, 8.89e-05, 7.77e-05, 7.12e-05, 6.79e-05, 6.71e-05} // 20 70 | }; // the first column is a dummy column representing K=0. 
71 | 72 | /** 73 | * Given the number of buckets that can be used per element, return the optimal 74 | * number of hash functions in order to minimize the false positive rate. 75 | * 76 | * @param bucketsPerElement 77 | * @return The number of hash functions that minimize the false positive rate. 78 | */ 79 | public static int computeBestK(int bucketsPerElement) { 80 | assert bucketsPerElement >= 0; 81 | if (bucketsPerElement >= optKPerBuckets.length) { 82 | return optKPerBuckets[optKPerBuckets.length - 1]; 83 | } 84 | return optKPerBuckets[bucketsPerElement]; 85 | } 86 | 87 | /** 88 | * A wrapper class that holds two key parameters for a Bloom Filter: the 89 | * number of hash functions used, and the number of buckets per element used. 90 | */ 91 | public static final class BloomSpecification { 92 | 93 | final int K; // number of hash functions. 94 | final int bucketsPerElement; 95 | 96 | public BloomSpecification(int k, int bucketsPerElement) { 97 | K = k; 98 | this.bucketsPerElement = bucketsPerElement; 99 | } 100 | } 101 | 102 | /** 103 | * Given a maximum tolerable false positive probability, compute a Bloom 104 | * specification which will give less than the specified false positive rate, 105 | * but minimize the number of buckets per element and the number of hash 106 | * functions used. Because bandwidth (and therefore total bitvector size) 107 | * is considered more expensive than computing power, preference is given 108 | * to minimizing buckets per element rather than number of hash functions. 109 | * 110 | * @param maxFalsePosProb The maximum tolerable false positive rate. 111 | * @return A Bloom Specification which would result in a false positive rate 112 | * less than specified by the function call. 
113 | */ 114 | public static BloomSpecification computeBucketsAndK(double maxFalsePosProb) { 115 | // Handle the trivial cases 116 | if (maxFalsePosProb >= probs[minBuckets][minK]) { 117 | return new BloomSpecification(2, optKPerBuckets[2]); 118 | } 119 | if (maxFalsePosProb < probs[maxBuckets][maxK]) { 120 | return new BloomSpecification(maxK, maxBuckets); 121 | } 122 | 123 | // First find the minimal required number of buckets: 124 | int bucketsPerElement = 2; 125 | int K = optKPerBuckets[2]; 126 | while (probs[bucketsPerElement][K] > maxFalsePosProb) { 127 | bucketsPerElement++; 128 | K = optKPerBuckets[bucketsPerElement]; 129 | } 130 | // Now that the number of buckets is sufficient, see if we can relax K 131 | // without losing too much precision. 132 | while (probs[bucketsPerElement][K - 1] <= maxFalsePosProb) { 133 | K--; 134 | } 135 | 136 | return new BloomSpecification(K, bucketsPerElement); 137 | } 138 | 139 | /** 140 | * Calculate the probability of a false positive given the specified 141 | * number of inserted elements. 142 | * 143 | * @param bucketsPerElement number of inserted elements. 144 | * @param hashCount 145 | * @return probability of a false positive. 146 | */ 147 | public static double getFalsePositiveProbability(int bucketsPerElement, int hashCount) { 148 | // (1 - e^(-k * n / m)) ^ k 149 | return Math.pow(1 - Math.exp(-hashCount * (1 / (double) bucketsPerElement)), hashCount); 150 | 151 | } 152 | } 153 | -------------------------------------------------------------------------------- /src/main/java/com/clearspring/analytics/stream/membership/BloomFilter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. 
The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.clearspring.analytics.stream.membership;

import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

import java.util.BitSet;

/**
 * Bloom filter backed by a {@link BitSet}. Bucket positions for a key come from
 * {@link Filter#getHashBuckets}, using this filter's hash count and bucket count.
 */
public class BloomFilter extends Filter {

    static ICompactSerializer<BloomFilter> serializer_ = new BloomFilterSerializer();

    /**
     * @return the shared serializer for Bloom filters (also located reflectively by
     *         {@link Filter#getSerializer()})
     */
    public static ICompactSerializer<BloomFilter> serializer() {
        return serializer_;
    }

    private BitSet filter_;

    /**
     * Creates a filter sized for {@code numElements} elements at {@code bucketsPerElement}
     * buckets each (plus 20 spare bits), using the best hash count for that density.
     */
    public BloomFilter(int numElements, int bucketsPerElement) {
        this(BloomCalculations.computeBestK(bucketsPerElement), new BitSet(numElements * bucketsPerElement + 20));
    }

    /**
     * Creates a filter for {@code numElements} elements whose bucket density and hash count
     * are chosen by {@link BloomCalculations#computeBucketsAndK(double)}.
     */
    public BloomFilter(int numElements, double maxFalsePosProbability) {
        BloomCalculations.BloomSpecification spec = BloomCalculations
                .computeBucketsAndK(maxFalsePosProbability);
        filter_ = new BitSet(numElements * spec.bucketsPerElement + 20);
        hashCount = spec.K;
    }

    /*
     * This version is only used by the deserializer.
     */
    BloomFilter(int hashes, BitSet filter) {
        hashCount = hashes;
        filter_ = filter;
    }

    /** Clears every bit of the filter. */
    public void clear() {
        filter_.clear();
    }

    /** @return the number of buckets, i.e. {@link BitSet#size()} of the backing set */
    @Override
    public int buckets() {
        return filter_.size();
    }

    BitSet filter() {
        return filter_;
    }

    /** @return {@code false} if {@code key} was definitely never added, {@code true} otherwise */
    @Override
    public boolean isPresent(String key) {
        for (int bucketIndex : getHashBuckets(key)) {
            if (!filter_.get(bucketIndex)) {
                return false;
            }
        }
        return true;
    }

    /** Byte-array variant of {@link #isPresent(String)}. */
    public boolean isPresent(byte[] key) {
        for (int bucketIndex : getHashBuckets(key)) {
            if (!filter_.get(bucketIndex)) {
                return false;
            }
        }
        return true;
    }

    /*
     @param key -- value whose hash is used to fill
     the filter_.
     This is a general purpose API.
     */
    @Override
    public void add(String key) {
        for (int bucketIndex : getHashBuckets(key)) {
            filter_.set(bucketIndex);
        }
    }

    /** Byte-array variant of {@link #add(String)}. */
    public void add(byte[] key) {
        for (int bucketIndex : getHashBuckets(key)) {
            filter_.set(bucketIndex);
        }
    }

    @Override
    public String toString() {
        return filter_.toString();
    }

    ICompactSerializer<BloomFilter> tserializer() {
        return serializer_;
    }

    /** @return the number of buckets whose bit is not set */
    @Override
    int emptyBuckets() {
        int n = 0;
        for (int i = 0; i < buckets(); i++) {
            if (!filter_.get(i)) {
                n++;
            }
        }
        return n;
    }

    /**
     * ORs the bits of {@code other} into this filter.
     * <p>
     * Both filters must use the same hash count and the same number of buckets:
     * bucket positions are computed modulo the bucket count, so bits taken from a
     * differently sized filter would not line up with this filter's lookups.
     *
     * @throws IllegalArgumentException if the hash counts or bucket counts differ
     */
    public void addAll(BloomFilter other) {
        if (this.getHashCount() != other.getHashCount()) {
            throw new IllegalArgumentException("Cannot merge filters with different hash counts: "
                    + this.getHashCount() + " vs " + other.getHashCount());
        }
        if (this.buckets() != other.buckets()) {
            throw new IllegalArgumentException("Cannot merge filters of different sizes: "
                    + this.buckets() + " vs " + other.buckets());
        }

        this.filter().or(other.filter());
    }

    /**
     * Returns a new filter holding the union of this filter and {@code filters};
     * this filter is not modified.
     *
     * @throws IllegalArgumentException if an argument is not a {@link BloomFilter} or is
     *                                  incompatible (see {@link #addAll(BloomFilter)})
     */
    public Filter merge(Filter... filters) {
        BloomFilter merged = new BloomFilter(this.getHashCount(), (BitSet) this.filter().clone());

        if (filters == null) {
            return merged;
        }

        for (Filter filter : filters) {
            if (!(filter instanceof BloomFilter)) {
                throw new IllegalArgumentException("Cannot merge filters of different class");
            }
            BloomFilter bf = (BloomFilter) filter;
            merged.addAll(bf);
        }

        return merged;
    }

    /**
     * @return a BloomFilter that always returns a positive match, for testing
     */
    public static BloomFilter alwaysMatchingBloomFilter() {
        BitSet set = new BitSet(64);
        set.set(0, 64);
        return new BloomFilter(1, set);
    }

    /**
     * Serializes {@code filter} to a byte array.
     * An {@link IOException} is printed and whatever was written so far is returned.
     */
    public static byte[] serialize(BloomFilter filter) {
        DataOutputBuffer out = new DataOutputBuffer();
        try {
            BloomFilter.serializer().serialize(filter, out);
            out.close();
        } catch (IOException e) {
            e.printStackTrace();
        }

        return out.getData();
    }

    /**
     * Rebuilds a filter from bytes produced by {@link #serialize(BloomFilter)}.
     *
     * @return the filter, or {@code null} if an {@link IOException} occurred (it is printed)
     */
    public static BloomFilter deserialize(byte[] bytes) {
        BloomFilter filter = null;
        DataInputBuffer in = new DataInputBuffer();
        in.reset(bytes, bytes.length);
        try {
            filter = BloomFilter.serializer().deserialize(in);
            in.close();
        } catch (IOException e) {
            e.printStackTrace();
        }

        return filter;
    }
}

/**
 * Wire format: the hash count as an int, followed by the bit set as written by
 * {@link BitSetSerializer}.
 */
class BloomFilterSerializer implements ICompactSerializer<BloomFilter> {

    public void serialize(BloomFilter bf, DataOutputStream dos)
            throws IOException {
        dos.writeInt(bf.getHashCount());
        BitSetSerializer.serialize(bf.filter(), dos);
    }

    public BloomFilter deserialize(DataInputStream dis) throws IOException {
        int hashes = dis.readInt();
        BitSet bs = BitSetSerializer.deserialize(dis);
        return new BloomFilter(hashes, bs);
    }
}

-------------------------------------------------------------------------------- /src/main/java/com/clearspring/analytics/stream/membership/DataInputBuffer.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.clearspring.analytics.stream.membership; 20 | 21 | import java.io.ByteArrayInputStream; 22 | import java.io.DataInputStream; 23 | 24 | 25 | /** 26 | * An implementation of the DataInputStream interface. This instance is completely thread 27 | * unsafe. 
28 | */ 29 | 30 | public final class DataInputBuffer extends DataInputStream { 31 | 32 | private static class Buffer extends ByteArrayInputStream { 33 | 34 | public Buffer() { 35 | super(new byte[]{}); 36 | } 37 | 38 | public void reset(byte[] input, int start, int length) { 39 | this.buf = input; 40 | this.count = start + length; 41 | this.mark = start; 42 | this.pos = start; 43 | } 44 | 45 | public int getPosition() { 46 | return pos; 47 | } 48 | 49 | public void setPosition(int position) { 50 | pos = position; 51 | } 52 | 53 | public int getLength() { 54 | return count; 55 | } 56 | } 57 | 58 | private Buffer buffer_; 59 | 60 | /** 61 | * Constructs a new empty buffer. 62 | */ 63 | public DataInputBuffer() { 64 | this(new Buffer()); 65 | } 66 | 67 | private DataInputBuffer(Buffer buffer) { 68 | super(buffer); 69 | this.buffer_ = buffer; 70 | } 71 | 72 | /** 73 | * Resets the data that the buffer reads. 74 | */ 75 | public void reset(byte[] input, int length) { 76 | buffer_.reset(input, 0, length); 77 | } 78 | 79 | /** 80 | * Resets the data that the buffer reads. 81 | */ 82 | public void reset(byte[] input, int start, int length) { 83 | buffer_.reset(input, start, length); 84 | } 85 | 86 | /** 87 | * Returns the length of the input. 88 | */ 89 | public int getLength() { 90 | return buffer_.getLength(); 91 | } 92 | 93 | public int getPosition() { 94 | return buffer_.getPosition(); 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /src/main/java/com/clearspring/analytics/stream/membership/DataOutputBuffer.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. 
The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | n * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.clearspring.analytics.stream.membership; 20 | 21 | import java.io.ByteArrayOutputStream; 22 | import java.io.DataInput; 23 | import java.io.DataOutputStream; 24 | import java.io.IOException; 25 | 26 | import java.util.Arrays; 27 | 28 | 29 | /** 30 | * An implementation of the DataOutputStream interface. This class is completely thread 31 | * unsafe. 32 | */ 33 | public class DataOutputBuffer extends DataOutputStream { 34 | 35 | private static class Buffer extends ByteArrayOutputStream { 36 | 37 | public byte[] getData() { 38 | return Arrays.copyOf(buf, getLength()); 39 | //return buf; 40 | } 41 | 42 | public int getLength() { 43 | return count; 44 | } 45 | 46 | public void reset() { 47 | count = 0; 48 | } 49 | 50 | public void write(DataInput in, int len) throws IOException { 51 | int newcount = count + len; 52 | if (newcount > buf.length) { 53 | byte newbuf[] = new byte[Math.max(buf.length << 1, newcount)]; 54 | System.arraycopy(buf, 0, newbuf, 0, count); 55 | buf = newbuf; 56 | } 57 | in.readFully(buf, count, len); 58 | count = newcount; 59 | } 60 | } 61 | 62 | private Buffer buffer; 63 | 64 | /** 65 | * Constructs a new empty buffer. 
66 | */ 67 | public DataOutputBuffer() { 68 | this(new Buffer()); 69 | } 70 | 71 | private DataOutputBuffer(Buffer buffer) { 72 | super(buffer); 73 | this.buffer = buffer; 74 | } 75 | 76 | /** 77 | * Returns the current contents of the buffer. Data is only valid to 78 | * {@link #getLength()}. 79 | */ 80 | public byte[] getData() { 81 | return buffer.getData(); 82 | } 83 | 84 | /** 85 | * Returns the length of the valid data currently in the buffer. 86 | */ 87 | public int getLength() { 88 | return buffer.getLength(); 89 | } 90 | 91 | /** 92 | * Resets the buffer to empty. 93 | */ 94 | public DataOutputBuffer reset() { 95 | this.written = 0; 96 | buffer.reset(); 97 | return this; 98 | } 99 | 100 | /** 101 | * Writes bytes from a DataInput directly into the buffer. 102 | */ 103 | public void write(DataInput in, int length) throws IOException { 104 | buffer.write(in, length); 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /src/main/java/com/clearspring/analytics/stream/membership/Filter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. 
See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | package com.clearspring.analytics.stream.membership; 20 | 21 | import java.io.UnsupportedEncodingException; 22 | 23 | import java.lang.reflect.Method; 24 | 25 | import com.clearspring.analytics.hash.MurmurHash; 26 | 27 | public abstract class Filter { 28 | 29 | int hashCount; 30 | 31 | public int getHashCount() { 32 | return hashCount; 33 | } 34 | 35 | public int[] getHashBuckets(String key) { 36 | return Filter.getHashBuckets(key, hashCount, buckets()); 37 | } 38 | 39 | public int[] getHashBuckets(byte[] key) { 40 | return Filter.getHashBuckets(key, hashCount, buckets()); 41 | } 42 | 43 | 44 | abstract int buckets(); 45 | 46 | public abstract void add(String key); 47 | 48 | public abstract boolean isPresent(String key); 49 | 50 | // for testing 51 | abstract int emptyBuckets(); 52 | 53 | @SuppressWarnings("unchecked") 54 | ICompactSerializer getSerializer() { 55 | Method method = null; 56 | try { 57 | method = getClass().getMethod("serializer"); 58 | return (ICompactSerializer) method.invoke(null); 59 | } catch (Exception e) { 60 | throw new RuntimeException(e); 61 | } 62 | } 63 | 64 | // Murmur is faster than an SHA-based approach and provides as-good collision 65 | // resistance. The combinatorial generation approach described in 66 | // https://gnunet.org/sites/default/files/LessHashing2006Kirsch.pdf 67 | // does prove to work in actual tests, and is obviously faster 68 | // than performing further iterations of murmur. 
69 | public static int[] getHashBuckets(String key, int hashCount, int max) { 70 | byte[] b; 71 | try { 72 | b = key.getBytes("UTF-16"); 73 | } catch (UnsupportedEncodingException e) { 74 | throw new RuntimeException(e); 75 | } 76 | return getHashBuckets(b, hashCount, max); 77 | } 78 | 79 | static int[] getHashBuckets(byte[] b, int hashCount, int max) { 80 | int[] result = new int[hashCount]; 81 | int hash1 = MurmurHash.hash(b, b.length, 0); 82 | int hash2 = MurmurHash.hash(b, b.length, hash1); 83 | for (int i = 0; i < hashCount; i++) { 84 | result[i] = Math.abs((hash1 + i * hash2) % max); 85 | } 86 | return result; 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /src/main/java/com/clearspring/analytics/stream/membership/ICompactSerializer.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | 19 | package com.clearspring.analytics.stream.membership; 20 | 21 | import java.io.DataInputStream; 22 | import java.io.DataOutputStream; 23 | import java.io.IOException; 24 | 25 | /** 26 | * Allows for the controlled serialization/deserialization of a given type. 27 | */ 28 | 29 | public interface ICompactSerializer { 30 | 31 | /** 32 | * Serialize the specified type into the specified DataOutputStream instance. 33 | * 34 | * @param t type that needs to be serialized 35 | * @param dos DataOutput into which serialization needs to happen. 36 | * @throws IOException 37 | */ 38 | public void serialize(T t, DataOutputStream dos) throws IOException; 39 | 40 | /** 41 | * Deserialize into the specified DataInputStream instance. 42 | * 43 | * @param dis DataInput from which deserialization needs to happen. 44 | * @return the type that was deserialized 45 | * @throws IOException 46 | */ 47 | public T deserialize(DataInputStream dis) throws IOException; 48 | } 49 | -------------------------------------------------------------------------------- /src/main/java/com/clearspring/analytics/stream/quantile/IQuantileEstimator.java: -------------------------------------------------------------------------------- 1 | package com.clearspring.analytics.stream.quantile; 2 | 3 | public interface IQuantileEstimator { 4 | 5 | void offer(long value); 6 | 7 | long getQuantile(double q); 8 | } 9 | -------------------------------------------------------------------------------- /src/main/java/com/clearspring/analytics/util/AbstractIterator.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.clearspring.analytics.util; 19 | 20 | import java.util.Iterator; 21 | import java.util.NoSuchElementException; 22 | 23 | /** 24 | * Rough and ready clone of the Guava AbstractIterator. I just did this 25 | * to avoid needing to add the guava dependency. It would be better to 26 | * just use quava. 27 | */ 28 | public abstract class AbstractIterator implements Iterator { 29 | 30 | private enum State { 31 | NOT_STARTED, DONE, HAS_DATA, EMPTY 32 | } 33 | 34 | private T next; 35 | 36 | private State currentState = State.NOT_STARTED; 37 | 38 | @Override 39 | public boolean hasNext() { 40 | switch (currentState) { 41 | case DONE: 42 | return false; 43 | case NOT_STARTED: 44 | currentState = State.HAS_DATA; 45 | next = computeNext(); 46 | break; 47 | case HAS_DATA: 48 | return true; 49 | case EMPTY: 50 | currentState = State.HAS_DATA; 51 | next = computeNext(); 52 | break; 53 | } 54 | return currentState != State.DONE; 55 | } 56 | 57 | @Override 58 | public T next() { 59 | if (hasNext()) { 60 | T r = next; 61 | currentState = State.EMPTY; 62 | return r; 63 | } else { 64 | throw new NoSuchElementException(); 65 | } 66 | } 67 | 68 | @Override 69 | public void remove() { 70 | throw new UnsupportedOperationException("Can't remove from an abstract iterator"); 71 | } 72 | 73 | protected abstract T computeNext(); 74 | 75 | public T endOfData() { 
76 | currentState = State.DONE; 77 | return null; 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/main/java/com/clearspring/analytics/util/Bits.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011 Clearspring Technologies, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.clearspring.analytics.util; 18 | 19 | import java.io.ByteArrayInputStream; 20 | import java.io.DataInput; 21 | import java.io.DataInputStream; 22 | import java.io.IOException; 23 | 24 | public class Bits { 25 | 26 | public static int[] getBits(byte[] mBytes) throws IOException { 27 | int bitSize = mBytes.length / 4; 28 | int[] bits = new int[bitSize]; 29 | DataInputStream dis = new DataInputStream(new ByteArrayInputStream(mBytes)); 30 | for (int i = 0; i < bitSize; i++) { 31 | bits[i] = dis.readInt(); 32 | } 33 | return bits; 34 | } 35 | 36 | /** 37 | * This method might be better described as 38 | * "byte array to int array" or "data input to int array" 39 | */ 40 | public static int[] getBits(DataInput dataIn, int byteLength) throws IOException { 41 | int bitSize = byteLength / 4; 42 | int[] bits = new int[bitSize]; 43 | for (int i = 0; i < bitSize; i++) { 44 | bits[i] = dataIn.readInt(); 45 | } 46 | return bits; 47 | } 48 | 49 | } 50 | 
-------------------------------------------------------------------------------- /src/main/java/com/clearspring/analytics/util/DoublyLinkedList.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011 Clearspring Technologies, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.clearspring.analytics.util; 18 | 19 | import java.util.ConcurrentModificationException; 20 | import java.util.Iterator; 21 | 22 | public class DoublyLinkedList implements Iterable { 23 | 24 | protected int size; 25 | protected ListNode2 tail; 26 | protected ListNode2 head; 27 | 28 | /** 29 | * Append to head of list 30 | */ 31 | public ListNode2 add(T value) { 32 | ListNode2 node = new ListNode2(value); 33 | if (size++ == 0) { 34 | tail = node; 35 | } else { 36 | node.prev = head; 37 | head.next = node; 38 | } 39 | 40 | head = node; 41 | 42 | return node; 43 | } 44 | 45 | /** 46 | * Prepend to tail of list 47 | */ 48 | public ListNode2 enqueue(T value) { 49 | ListNode2 node = new ListNode2(value); 50 | if (size++ == 0) { 51 | head = node; 52 | } else { 53 | node.next = tail; 54 | tail.prev = node; 55 | } 56 | 57 | tail = node; 58 | 59 | return node; 60 | } 61 | 62 | public void add(ListNode2 node) { 63 | node.prev = head; 64 | node.next = null; 65 | 66 | if (size++ == 0) { 67 | tail = node; 68 | } else { 69 | head.next = node; 70 | } 71 | 72 | head = node; 73 | } 
74 | 75 | public ListNode2 addAfter(ListNode2 node, T value) { 76 | ListNode2 newNode = new ListNode2(value); 77 | addAfter(node, newNode); 78 | return newNode; 79 | } 80 | 81 | public void addAfter(ListNode2 node, ListNode2 newNode) { 82 | newNode.next = node.next; 83 | newNode.prev = node; 84 | node.next = newNode; 85 | if (newNode.next == null) { 86 | head = newNode; 87 | } else { 88 | newNode.next.prev = newNode; 89 | } 90 | size++; 91 | } 92 | 93 | public void remove(ListNode2 node) { 94 | if (node == tail) { 95 | tail = node.next; 96 | } else { 97 | node.prev.next = node.next; 98 | } 99 | 100 | if (node == head) { 101 | head = node.prev; 102 | } else { 103 | node.next.prev = node.prev; 104 | } 105 | size--; 106 | } 107 | 108 | public int size() { 109 | return size; 110 | } 111 | 112 | 113 | @Override 114 | public Iterator iterator() { 115 | return new DoublyLinkedListIterator(this); 116 | } 117 | 118 | protected class DoublyLinkedListIterator implements Iterator { 119 | 120 | protected DoublyLinkedList list; 121 | protected ListNode2 itr; 122 | protected int length; 123 | 124 | public DoublyLinkedListIterator(DoublyLinkedList list) { 125 | this.length = list.size; 126 | this.list = list; 127 | this.itr = list.tail; 128 | } 129 | 130 | @Override 131 | public boolean hasNext() { 132 | return itr != null; 133 | } 134 | 135 | @Override 136 | public T next() { 137 | if (length != list.size) { 138 | throw new ConcurrentModificationException(); 139 | } 140 | T next = itr.value; 141 | itr = itr.next; 142 | return next; 143 | } 144 | 145 | @Override 146 | public void remove() { 147 | throw new UnsupportedOperationException(); 148 | } 149 | 150 | } 151 | 152 | public T first() { 153 | return tail == null ? null : tail.getValue(); 154 | } 155 | 156 | public T last() { 157 | return head == null ? 
null : head.getValue(); 158 | } 159 | 160 | public ListNode2 head() { 161 | return head; 162 | } 163 | 164 | public ListNode2 tail() { 165 | return tail; 166 | } 167 | 168 | public boolean isEmpty() { 169 | return size == 0; 170 | } 171 | 172 | @SuppressWarnings("unchecked") 173 | public T[] toArray() { 174 | T[] a = (T[]) new Object[size]; 175 | int i = 0; 176 | for (T v : this) { 177 | a[i++] = v; 178 | } 179 | return a; 180 | } 181 | } 182 | -------------------------------------------------------------------------------- /src/main/java/com/clearspring/analytics/util/ExternalizableUtil.java: -------------------------------------------------------------------------------- 1 | package com.clearspring.analytics.util; 2 | 3 | import java.io.ByteArrayOutputStream; 4 | import java.io.Externalizable; 5 | import java.io.IOException; 6 | import java.io.ObjectOutputStream; 7 | 8 | public class ExternalizableUtil { 9 | 10 | public static byte[] toBytes(Externalizable o) throws IOException { 11 | ByteArrayOutputStream baos = new ByteArrayOutputStream(); 12 | ObjectOutputStream out = new ObjectOutputStream(baos); 13 | o.writeExternal(out); 14 | out.flush(); 15 | return baos.toByteArray(); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/main/java/com/clearspring/analytics/util/IBuilder.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011 Clearspring Technologies, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.clearspring.analytics.util; 18 | 19 | 20 | public interface IBuilder { 21 | 22 | T build(); 23 | 24 | int sizeof(); 25 | } 26 | -------------------------------------------------------------------------------- /src/main/java/com/clearspring/analytics/util/ListNode2.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011 Clearspring Technologies, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.clearspring.analytics.util; 18 | 19 | public class ListNode2 { 20 | 21 | protected T value; 22 | protected ListNode2 prev; 23 | protected ListNode2 next; 24 | 25 | public ListNode2(T value) { 26 | this.value = value; 27 | } 28 | 29 | public ListNode2 getPrev() { 30 | return prev; 31 | } 32 | 33 | public ListNode2 getNext() { 34 | return next; 35 | } 36 | 37 | public T getValue() { 38 | return value; 39 | } 40 | 41 | public void setValue(T value) { 42 | this.value = value; 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/main/java/com/clearspring/analytics/util/Lists.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.clearspring.analytics.util; 19 | 20 | import java.util.ArrayList; 21 | import java.util.List; 22 | 23 | /** 24 | * Toy version of the guava class. Only implemented here to avoid adding 25 | * a dependency. It would be better to just depend on guava. 
26 | */ 27 | public class Lists { 28 | 29 | public static List newArrayList(Iterable source) { 30 | List r = new ArrayList(); 31 | for (T x : source) { 32 | r.add(x); 33 | } 34 | return r; 35 | } 36 | 37 | public static List newArrayList() { 38 | return new ArrayList(); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/main/java/com/clearspring/analytics/util/ObyCount.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011 Clearspring Technologies, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.clearspring.analytics.util; 18 | 19 | import java.io.BufferedReader; 20 | import java.io.IOException; 21 | import java.io.InputStreamReader; 22 | 23 | import com.clearspring.analytics.stream.cardinality.HyperLogLogPlus; 24 | 25 | 26 | /** 27 | * Simple cardinality estimation command line utility 28 | *

29 | * Usage: 30 | * > obycount [update-rate] 31 | *

32 | * update-rate: output results after every update-rate elements/lines 33 | *

34 | * Example: 35 | * > cat elements.txt | obycount 36 | */ 37 | public class ObyCount { 38 | 39 | public static void usage() { 40 | System.err.println 41 | ( 42 | "obycount [update-rate]\n" + 43 | "\n" + 44 | "update-rate: output results after every update-rate elements/lines" + 45 | "\n" + 46 | "Example:" + 47 | "> cat elements.txt | obycount" + 48 | "\n" 49 | ); 50 | 51 | System.exit(-1); 52 | } 53 | 54 | public static void main(String[] args) throws IOException { 55 | long updateRate = -1; 56 | long count = 0; 57 | 58 | if (args.length > 0) { 59 | try { 60 | updateRate = Long.parseLong(args[0]); 61 | } catch (NumberFormatException e) { 62 | System.err.print("Bad update rate: '" + args[0] + "' Update rate must be an integer."); 63 | usage(); 64 | } 65 | } 66 | 67 | BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); 68 | 69 | HyperLogLogPlus card = new HyperLogLogPlus(14, 25); 70 | 71 | String line = null; 72 | while ((line = in.readLine()) != null) { 73 | card.offer(line); 74 | count++; 75 | 76 | if (updateRate > 0 && count % updateRate == 0) { 77 | System.out.println(formatSummary(count, card.cardinality())); 78 | } 79 | } 80 | 81 | System.out.println(formatSummary(count, card.cardinality())); 82 | } 83 | 84 | protected static String formatSummary(long count, long cardinality) { 85 | String cntStr = Long.toString(count); 86 | int len = cntStr.length(); 87 | int l1 = Math.max(len, 10); 88 | int l2 = Math.max(len, 20); 89 | String fmt = "%" + l1 + "s %" + l2 + "s"; 90 | StringBuilder sb = new StringBuilder(); 91 | sb.append(String.format(fmt, "Item Count", "Cardinality Estimate")).append('\n'); 92 | sb.append(String.format(fmt, TopK.string('-', l1), TopK.string('-', l2))).append('\n'); 93 | sb.append(String.format(fmt, count, cardinality)).append('\n'); 94 | return sb.toString(); 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /src/main/java/com/clearspring/analytics/util/Pair.java: 
-------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package com.clearspring.analytics.util; 20 | 21 | public class Pair { 22 | 23 | public final T1 left; 24 | public final T2 right; 25 | 26 | public Pair(T1 left, T2 right) { 27 | this.left = left; 28 | this.right = right; 29 | } 30 | 31 | @Override 32 | public final int hashCode() { 33 | int hashCode = 31 + (left == null ? 0 : left.hashCode()); 34 | return 31 * hashCode + (right == null ? 
0 : right.hashCode()); 35 | } 36 | 37 | @Override 38 | public final boolean equals(Object o) { 39 | if (!(o instanceof Pair)) { 40 | return false; 41 | } 42 | Pair that = (Pair) o; 43 | // handles nulls properly 44 | return equal(left, that.left) && equal(right, that.right); 45 | } 46 | 47 | // From Apache Licensed guava: 48 | private boolean equal(Object a, Object b) { 49 | return a == b || (a != null && a.equals(b)); 50 | } 51 | 52 | 53 | @Override 54 | public String toString() { 55 | return "(" + left + "," + right + ")"; 56 | } 57 | 58 | public static Pair create(X x, Y y) { 59 | return new Pair(x, y); 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/main/java/com/clearspring/analytics/util/Preconditions.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.clearspring.analytics.util; 19 | 20 | /** 21 | * Toy version of the guava class. Only implemented here to avoid the 22 | * extra depenency. 
23 | */ 24 | public class Preconditions { 25 | 26 | public static void checkState(boolean condition, String msg) { 27 | if (!condition) { 28 | throw new IllegalStateException(msg); 29 | } 30 | } 31 | 32 | public static void checkArgument(boolean condition) { 33 | if (!condition) { 34 | throw new IllegalArgumentException(); 35 | } 36 | } 37 | 38 | public static void checkState(boolean condition) { 39 | if (!condition) { 40 | throw new IllegalStateException(); 41 | } 42 | } 43 | 44 | public static void checkArgument(boolean condition, String format, Object... args) { 45 | if (!condition) { 46 | throw new IllegalArgumentException(String.format(format, args)); 47 | } 48 | } 49 | 50 | public static void checkState(boolean condition, String format, Object... args) { 51 | if (!condition) { 52 | throw new IllegalStateException(String.format(format, args)); 53 | } 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/main/java/com/clearspring/analytics/util/TopK.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011 Clearspring Technologies, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.clearspring.analytics.util; 18 | 19 | import java.io.BufferedReader; 20 | import java.io.IOException; 21 | import java.io.InputStreamReader; 22 | 23 | import java.util.List; 24 | 25 | import com.clearspring.analytics.stream.Counter; 26 | import com.clearspring.analytics.stream.StreamSummary; 27 | 28 | 29 | /** 30 | * Simple TopK command line utility 31 | *

32 | * Usage: 33 | * > topk [capacity] [update-rate] 34 | *

35 | * capacity : size of top / k (defaults to 1000) 36 | * update-rate: output results after every update-rate elements/lines 37 | *

38 | * Example: 39 | * > cat elements.txt | topk 10 40 | */ 41 | public class TopK { 42 | 43 | public static void usage() { 44 | System.err.println 45 | ( 46 | "topk [capacity] [update-rate]\n" + 47 | "\n" + 48 | "capacity : size of top / k (defaults to 1000)" + 49 | "update-rate: output results after every update-rate elements/lines" + 50 | "\n" + 51 | "Example:" + 52 | "> cat elements.txt | topk 10" + 53 | "\n" 54 | ); 55 | 56 | System.exit(-1); 57 | } 58 | 59 | public static void main(String[] args) throws IOException { 60 | long updateRate = -1; 61 | long count = 0; 62 | int capacity = 1000; 63 | 64 | if (args.length > 0) { 65 | try { 66 | capacity = Integer.parseInt(args[0]); 67 | } catch (NumberFormatException e) { 68 | System.err.print("Bad capacity: '" + args[0] + "' Capacity must be an integer."); 69 | usage(); 70 | } 71 | } 72 | 73 | if (args.length > 1) { 74 | try { 75 | updateRate = Long.parseLong(args[1]); 76 | } catch (NumberFormatException e) { 77 | System.err.print("Bade update rate: '" + args[1] + "' Update rate must be an integer."); 78 | usage(); 79 | } 80 | } 81 | 82 | StreamSummary topk = new StreamSummary(capacity); 83 | 84 | BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); 85 | 86 | String line = null; 87 | while ((line = in.readLine()) != null) { 88 | topk.offer(line); 89 | count++; 90 | 91 | if (updateRate > 0 && count % updateRate == 0) { 92 | System.out.println(formatSummary(topk)); 93 | System.out.println("Item count: " + count); 94 | System.out.println(); 95 | } 96 | } 97 | 98 | System.out.println(formatSummary(topk)); 99 | System.out.println("Item count: " + count); 100 | } 101 | 102 | public static String formatSummary(StreamSummary topk) { 103 | StringBuilder sb = new StringBuilder(); 104 | 105 | List> counters = topk.topK(topk.getCapacity()); 106 | String itemHeader = "item"; 107 | String countHeader = "count"; 108 | String errorHeader = "error"; 109 | 110 | int maxItemLen = itemHeader.length(); 111 | int 
maxCountLen = countHeader.length(); 112 | int maxErrorLen = errorHeader.length(); 113 | 114 | for (Counter counter : counters) { 115 | maxItemLen = Math.max(counter.getItem().length(), maxItemLen); 116 | maxCountLen = Math.max(Long.toString(counter.getCount()).length(), maxCountLen); 117 | maxErrorLen = Math.max(Long.toString(counter.getError()).length(), maxErrorLen); 118 | } 119 | 120 | sb.append(String.format("%" + maxItemLen + "s %" + maxCountLen + "s %" + maxErrorLen + "s", itemHeader, countHeader, errorHeader)); 121 | sb.append('\n'); 122 | sb.append(String.format("%" + maxItemLen + "s %" + maxCountLen + "s %" + maxErrorLen + "s", string('-', maxItemLen), string('-', maxCountLen), string('-', maxErrorLen))); 123 | sb.append('\n'); 124 | 125 | for (Counter counter : counters) { 126 | sb.append(String.format("%" + maxItemLen + "s %" + maxCountLen + "d %" + maxErrorLen + "d", counter.getItem(), counter.getCount(), counter.getError())); 127 | sb.append('\n'); 128 | } 129 | 130 | return sb.toString(); 131 | } 132 | 133 | public static String string(char c, int len) { 134 | StringBuilder sb = new StringBuilder(len); 135 | for (int i = 0; i < len; i++) { 136 | sb.append(c); 137 | } 138 | return sb.toString(); 139 | } 140 | } 141 | -------------------------------------------------------------------------------- /src/main/java/com/clearspring/analytics/util/UnsignedIntComparator.java: -------------------------------------------------------------------------------- 1 | package com.clearspring.analytics.util; 2 | 3 | 4 | import java.util.Comparator; 5 | 6 | public class UnsignedIntComparator implements Comparator { 7 | 8 | @Override 9 | public int compare(byte[] left, byte[] right) { 10 | int l = Varint.readUnsignedVarInt(left); 11 | int r = Varint.readUnsignedVarInt(right); 12 | return l - r; 13 | } 14 | } -------------------------------------------------------------------------------- /src/main/java/com/clearspring/experimental/stream/cardinality/HyperBitBit.java: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011 Clearspring Technologies, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.clearspring.experimental.stream.cardinality; 18 | 19 | import com.clearspring.analytics.hash.MurmurHash; 20 | import com.clearspring.analytics.stream.cardinality.ICardinality; 21 | import com.clearspring.analytics.stream.cardinality.CardinalityMergeException; 22 | 23 | import java.io.IOException; 24 | 25 | /** 26 | * Java implementation of HyperBitBit (HBB) algorithm as seen on the presentation 27 | * by Robert Sedgewick: 28 | *

29 | * https://www.cs.princeton.edu/~rs/talks/AC11-Cardinality.pdf 30 | *

31 | * HBB aims to beat HyperLogLog. 32 | * From the talk, on practical data: 33 | * - HyperBitBit, for N < 2^64, 34 | * - Uses 128 + 6 bits. (in this implementation case 128 + 8) 35 | * - Estimates cardinality within 10% of the actual. 36 | *

37 | * The algorithm still need some improvements. 38 | * - If you insert twice the same element the structure can change (not as in HLL) 39 | * - For small cardinalities it does not work AT ALL. 40 | * - The constatn 5.4 used in the cardinality estimation formula should be refined 41 | * with real world applications feedback 42 | *

43 | * Even so, HyperBitBit has the necessary characteristics to become 44 | * a better algorithm than HyperLogLog: 45 | * - Makes one pass through the stream. 46 | * - Uses a few dozen machine instructions per value 47 | * - Uses a few hundred bits 48 | * - Achieves 10% relative accuracy or better 49 | *

50 | * Any feedback to improve the algorithm in its weak points will be welcome. 51 | *

52 | */ 53 | 54 | public class HyperBitBit implements ICardinality { 55 | 56 | int lgN; 57 | long sketch; 58 | long sketch2; 59 | 60 | /** 61 | * Create a new HyperBitBit instance. 62 | * 63 | * Remember that it does not work well for small cardinalities! 64 | */ 65 | public HyperBitBit() { 66 | lgN = 5; 67 | sketch = 0; 68 | sketch2 = 0; 69 | } 70 | 71 | @Override 72 | public boolean offer(Object o) { 73 | final long x = MurmurHash.hash64(o); 74 | return offerHashed(x); 75 | } 76 | 77 | @Override 78 | public boolean offerHashed(long hashedLong) { 79 | long k = (hashedLong << 58) >> 58; 80 | // Calculate the position of the leftmost 1-bit. 81 | int r = Long.numberOfLeadingZeros(hashedLong >> 6) - 6; 82 | 83 | boolean modified = false; 84 | 85 | if (r > lgN) { 86 | modified = true; 87 | sketch = sketch | 1L << k; 88 | } 89 | if (r > lgN+1) { 90 | modified = true; 91 | sketch2 = sketch2 | 1L << k; 92 | } 93 | if (Long.bitCount(sketch) > 31) { 94 | modified = true; 95 | sketch = sketch2; 96 | sketch2 = 0; 97 | ++lgN; 98 | } 99 | 100 | return modified; 101 | } 102 | 103 | @Override 104 | public boolean offerHashed(int hashedInt) { 105 | throw new UnsupportedOperationException(); 106 | } 107 | 108 | @Override 109 | public long cardinality() { 110 | double exponent = lgN + 5.4 + Long.bitCount(sketch)/32.0; 111 | return (long) Math.pow(2, exponent); 112 | } 113 | 114 | @Override 115 | public int sizeof() { 116 | return 0; 117 | } 118 | 119 | @Override 120 | public byte[] getBytes() throws IOException { 121 | return new byte[0]; 122 | } 123 | 124 | @Override 125 | public ICardinality merge(ICardinality... 
estimators) throws CardinalityMergeException { 126 | throw new HyperBitBitMergeException("Cannot merge estimators of HyperBitBit class"); 127 | } 128 | 129 | @SuppressWarnings("serial") 130 | static class HyperBitBitMergeException extends CardinalityMergeException { 131 | public HyperBitBitMergeException(String message) { 132 | super(message); 133 | } 134 | } 135 | } 136 | -------------------------------------------------------------------------------- /src/test/java/com/clearspring/analytics/TestUtils.java: -------------------------------------------------------------------------------- 1 | package com.clearspring.analytics; 2 | 3 | import java.io.*; 4 | 5 | public class TestUtils { 6 | 7 | public static byte[] serialize(Serializable obj) throws IOException { 8 | ByteArrayOutputStream baos = new ByteArrayOutputStream(512); 9 | ObjectOutputStream out = null; 10 | try { 11 | // stream closed in the finally 12 | out = new ObjectOutputStream(baos); 13 | out.writeObject(obj); 14 | } finally { 15 | if (out != null) { 16 | out.close(); 17 | } 18 | } 19 | return baos.toByteArray(); 20 | } 21 | 22 | public static Object deserialize(byte[] bytes) throws ClassNotFoundException, IOException { 23 | ByteArrayInputStream bais = new ByteArrayInputStream(bytes); 24 | ObjectInputStream in = null; 25 | try { 26 | // stream closed in the finally 27 | in = new ObjectInputStream(bais); 28 | return in.readObject(); 29 | } finally { 30 | if (in != null) { 31 | in.close(); 32 | } 33 | } 34 | 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/test/java/com/clearspring/analytics/hash/TestLookup3Hash.java: -------------------------------------------------------------------------------- 1 | package com.clearspring.analytics.hash; 2 | /** 3 | * Licensed to the Apache Software Foundation (ASF) under one or more 4 | * contributor license agreements. 
See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | import java.util.Random; 20 | 21 | import org.junit.Test; 22 | 23 | import static org.junit.Assert.assertEquals; 24 | 25 | /** 26 | * Tests for lookup3ycs hash functions 27 | * 28 | * @author yonik 29 | */ 30 | public class TestLookup3Hash { 31 | 32 | // Test that the java version produces the same output as the C version 33 | 34 | @Test 35 | public void testEqualsLOOKUP3() { 36 | int[] hashes = new int[]{0xc4c20dd5, 0x3ab04cc3, 0xebe874a3, 0x0e770ef3, 0xec321498, 0x73845e86, 0x8a2db728, 0x03c313bb, 0xfe5b9199, 0x95965125, 0xcbc4e7c2}; 37 | /*** the hash values were generated by adding the following to lookup3.c 38 | * 39 | * char* s = "hello world"; 40 | * int len = strlen(s); 41 | * uint32_t a[len]; 42 | * for (int i=0; i vs = new ConcurrentStreamSummary(3); 36 | String[] stream = {"X", "X", "Y", "Z", "A", "B", "C", "X", "X", "A", "A", "A"}; 37 | for (String i : stream) { 38 | vs.offer(i); 39 | /* 40 | for(String s : vs.poll(3)) 41 | System.out.print(s+" "); 42 | */ 43 | System.out.println(vs); 44 | } 45 | } 46 | 47 | @Test 48 | public void testTopK() { 49 | ConcurrentStreamSummary vs = new ConcurrentStreamSummary(3); 50 | String[] stream = {"X", "X", "Y", "Z", "A", "B", "C", "X", "X", "A", "C", "A", "A"}; 51 | for (String i : 
stream) { 52 | vs.offer(i); 53 | } 54 | List> topK = vs.peekWithScores(3); 55 | for (ScoredItem c : topK) { 56 | assertTrue(Arrays.asList("A", "C", "X").contains(c.getItem())); 57 | } 58 | } 59 | 60 | @Test 61 | public void testTopKWithIncrement() { 62 | ConcurrentStreamSummary vs = new ConcurrentStreamSummary(3); 63 | String[] stream = {"X", "X", "Y", "Z", "A", "B", "C", "X", "X", "A", "C", "A", "A"}; 64 | for (String i : stream) { 65 | vs.offer(i, 10); 66 | } 67 | List> topK = vs.peekWithScores(3); 68 | for (ScoredItem c : topK) { 69 | assertTrue(Arrays.asList("A", "C", "X").contains(c.getItem())); 70 | } 71 | } 72 | 73 | @Test 74 | public void testGeometricDistribution() { 75 | ConcurrentStreamSummary vs = new ConcurrentStreamSummary(10); 76 | RandomEngine re = RandomEngine.makeDefault(); 77 | 78 | for (int i = 0; i < NUM_ITERATIONS; i++) { 79 | int z = Distributions.nextGeometric(0.25, re); 80 | vs.offer(z); 81 | } 82 | 83 | List top = vs.peek(5); 84 | System.out.println("Geometric:"); 85 | for (Integer e : top) { 86 | System.out.println(e); 87 | } 88 | 89 | int tippyTop = top.get(0); 90 | assertEquals(0, tippyTop); 91 | System.out.println(vs); 92 | } 93 | 94 | } 95 | -------------------------------------------------------------------------------- /src/test/java/com/clearspring/analytics/stream/TestSampleSet.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011 Clearspring Technologies, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /** 18 | * 19 | */ 20 | package com.clearspring.analytics.stream; 21 | 22 | import java.util.List; 23 | 24 | import org.junit.Before; 25 | import org.junit.Test; 26 | 27 | import static org.junit.Assert.assertEquals; 28 | import static org.junit.Assert.assertNull; 29 | import static org.junit.Assert.assertTrue; 30 | 31 | 32 | public class TestSampleSet { 33 | 34 | private SampleSet set; 35 | private String[] e; 36 | 37 | /** 38 | * @throws java.lang.Exception 39 | */ 40 | @Before 41 | public void setUp() throws Exception { 42 | set = new SampleSet(); 43 | 44 | e = new String[3]; 45 | for (int i = 0; i < e.length; i++) { 46 | e[i] = "Element_" + i; 47 | } 48 | } 49 | 50 | 51 | @Test 52 | public void testPeekK() { 53 | set.put(e[0]); 54 | for (int i = 0; i < 2; i++) { 55 | set.put(e[1]); 56 | } 57 | 58 | for (int i = 0; i < 3; i++) { 59 | set.put(e[2]); 60 | } 61 | 62 | List top = null; 63 | // Negative 64 | boolean caught = false; 65 | try { 66 | top = set.peek(-1); 67 | } catch (IllegalArgumentException e) { 68 | caught = true; 69 | } 70 | assertTrue(caught); 71 | 72 | // 0 73 | top = set.peek(0); 74 | assertEquals(0, top.size()); 75 | 76 | // 1 77 | top = set.peek(1); 78 | assertEquals(1, top.size()); 79 | assertEquals(set.peek(), top.get(0)); 80 | 81 | // 2 (more than one but less than size) 82 | top = set.peek(2); 83 | assertEquals(2, top.size()); 84 | for (int i = 0; i < 2; i++) { 85 | assertEquals(e[2 - i], top.get(i)); 86 | } 87 | 88 | // 3 (size) 89 | top = set.peek(3); 90 | assertEquals(3, top.size()); 91 | for (int i = 0; i < 3; i++) { 92 | assertEquals(e[2 - i], top.get(i)); 93 | } 94 | 95 | // 4 (more than size) 96 | top = set.peek(4); 97 | assertEquals(3, top.size()); 98 | for (int i = 0; i < 3; i++) { 99 | assertEquals(e[2 - i], top.get(i)); 100 | } 101 | } 102 | 103 | @Test 104 | public void testPut() { 105 | // Empty set 
106 | assertEquals(1L, set.put(e[0])); 107 | assertEquals(e[0], set.peek()); 108 | assertEquals(e[0], ((SampleSet) set).peekMin()); 109 | } 110 | 111 | @Test 112 | public void testPutWithIncrement() { 113 | // Empty set 114 | assertEquals(10L, set.put(e[0], 10)); 115 | assertEquals(e[0], set.peek()); 116 | assertEquals(e[0], ((SampleSet) set).peekMin()); 117 | 118 | } 119 | 120 | @Test 121 | public void testRemoveMin() { 122 | // Empty set 123 | assertNull(set.removeMin()); 124 | assertEquals(0, set.size()); 125 | assertEquals(0L, set.count()); 126 | 127 | // Maintaining order 128 | set.put(e[0]); 129 | for (int i = 0; i < 2; i++) { 130 | set.put(e[1]); 131 | } 132 | 133 | for (int i = 0; i < 3; i++) { 134 | set.put(e[2]); 135 | } 136 | 137 | assertEquals(3, set.size()); 138 | assertEquals(6L, set.count()); 139 | 140 | assertEquals(e[0], set.removeMin()); 141 | assertEquals(2, set.size()); 142 | assertEquals(5L, set.count()); 143 | 144 | assertEquals(e[1], set.removeMin()); 145 | assertEquals(1, set.size()); 146 | assertEquals(3L, set.count()); 147 | 148 | assertEquals(e[2], set.removeMin()); 149 | assertEquals(0, set.size()); 150 | assertEquals(0L, set.count()); 151 | 152 | assertEquals(null, set.removeMin()); 153 | assertEquals(0, set.size()); 154 | assertEquals(0L, set.count()); 155 | } 156 | } 157 | -------------------------------------------------------------------------------- /src/test/java/com/clearspring/analytics/stream/TestStochasticTopper.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011 Clearspring Technologies, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /** 18 | * 19 | */ 20 | package com.clearspring.analytics.stream; 21 | 22 | import java.util.List; 23 | import java.util.Random; 24 | 25 | import org.junit.Before; 26 | import org.junit.Test; 27 | 28 | import cern.jet.random.Distributions; 29 | import cern.jet.random.engine.RandomEngine; 30 | import static org.junit.Assert.assertTrue; 31 | 32 | 33 | public class TestStochasticTopper { 34 | 35 | private static final int NUM_ITERATIONS = 100000; 36 | private static final int NUM_ELEMENTS = 10; 37 | private StochasticTopper vs; 38 | private Random random; 39 | 40 | @Before 41 | public void setUp() { 42 | vs = new StochasticTopper(200); 43 | random = new Random(340340990L); 44 | } 45 | 46 | 47 | @Test 48 | public void testGaussianDistribution() { 49 | for (int i = 0; i < NUM_ITERATIONS; i++) { 50 | vs.offer(new Integer((int) Math.round((random.nextGaussian() * NUM_ELEMENTS)))); 51 | } 52 | 53 | List top = vs.peek(5); 54 | System.out.println("Gaussian:"); 55 | for (Integer e : top) { 56 | System.out.println(e); 57 | } 58 | 59 | int tippyTop = top.get(0); 60 | assertTrue(tippyTop > -15 && tippyTop < 15); 61 | } 62 | 63 | @Test 64 | public void testZipfianDistribution() { 65 | RandomEngine re = RandomEngine.makeDefault(); 66 | 67 | for (int i = 0; i < NUM_ITERATIONS; i++) { 68 | int z = Distributions.nextZipfInt(1.2D, re); 69 | vs.offer(z); 70 | } 71 | 72 | List top = vs.peek(5); 73 | System.out.println("Zipfian:"); 74 | for (Integer e : top) { 75 | System.out.println(e); 76 | } 77 | 78 | int tippyTop = top.get(0); 
79 | assertTrue(tippyTop < 3); 80 | } 81 | 82 | @Test 83 | public void testGeometricDistribution() { 84 | RandomEngine re = RandomEngine.makeDefault(); 85 | 86 | for (int i = 0; i < NUM_ITERATIONS; i++) { 87 | int z = Distributions.nextGeometric(0.25, re); 88 | vs.offer(z); 89 | } 90 | 91 | List top = vs.peek(5); 92 | System.out.println("Geometric:"); 93 | for (Integer e : top) { 94 | System.out.println(e); 95 | } 96 | 97 | int tippyTop = top.get(0); 98 | assertTrue(tippyTop < 3); 99 | } 100 | 101 | @Test 102 | public void testRandomEngine() { 103 | int[] maxcounts = new int[10]; 104 | int[] counts = new int[20]; 105 | 106 | RandomEngine re = RandomEngine.makeDefault(); 107 | 108 | for (int i = 0; i < NUM_ITERATIONS; i++) { 109 | // int z = Distributions.nextZipfInt(1.2D, re); 110 | int z = Distributions.nextGeometric(0.25, re); 111 | if (z > Integer.MAX_VALUE - 9) { 112 | maxcounts[Integer.MAX_VALUE - z]++; 113 | } 114 | if (z < 20) { 115 | counts[z]++; 116 | } 117 | } 118 | 119 | for (int i = 0; i < 20; i++) { 120 | System.out.println(i + ": " + counts[i]); 121 | } 122 | 123 | for (int i = 9; i >= 0; i--) { 124 | System.out.println((Integer.MAX_VALUE - i) + ": " + maxcounts[i]); 125 | } 126 | 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /src/test/java/com/clearspring/analytics/stream/TestStreamSummary.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011 Clearspring Technologies, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.clearspring.analytics.stream; 18 | 19 | import java.io.ByteArrayInputStream; 20 | import java.io.ByteArrayOutputStream; 21 | import java.io.IOException; 22 | import java.io.ObjectInput; 23 | import java.io.ObjectInputStream; 24 | import java.io.ObjectOutput; 25 | import java.io.ObjectOutputStream; 26 | 27 | import java.util.Arrays; 28 | import java.util.List; 29 | 30 | import org.junit.Test; 31 | 32 | import cern.jet.random.Distributions; 33 | import cern.jet.random.engine.RandomEngine; 34 | import static org.junit.Assert.assertEquals; 35 | import static org.junit.Assert.assertTrue; 36 | 37 | 38 | public class TestStreamSummary { 39 | 40 | private static final int NUM_ITERATIONS = 100000; 41 | 42 | @Test 43 | public void testStreamSummary() { 44 | StreamSummary vs = new StreamSummary(3); 45 | String[] stream = {"X", "X", "Y", "Z", "A", "B", "C", "X", "X", "A", "A", "A"}; 46 | for (String i : stream) { 47 | vs.offer(i); 48 | /* 49 | for(String s : vs.poll(3)) 50 | System.out.print(s+" "); 51 | */ 52 | System.out.println(vs); 53 | } 54 | } 55 | 56 | @Test 57 | public void testTopK() { 58 | StreamSummary vs = new StreamSummary(3); 59 | String[] stream = {"X", "X", "Y", "Z", "A", "B", "C", "X", "X", "A", "C", "A", "A"}; 60 | for (String i : stream) { 61 | vs.offer(i); 62 | } 63 | List> topK = vs.topK(3); 64 | for (Counter c : topK) { 65 | assertTrue(Arrays.asList("A", "C", "X").contains(c.getItem())); 66 | } 67 | } 68 | 69 | @Test 70 | public void testTopKWithIncrement() { 71 | StreamSummary vs = new 
StreamSummary(3); 72 | String[] stream = {"X", "X", "Y", "Z", "A", "B", "C", "X", "X", "A", "C", "A", "A"}; 73 | for (String i : stream) { 74 | vs.offer(i, 10); 75 | } 76 | List> topK = vs.topK(3); 77 | for (Counter c : topK) { 78 | assertTrue(Arrays.asList("A", "C", "X").contains(c.getItem())); 79 | } 80 | } 81 | 82 | @Test 83 | public void testTopKWithIncrementOutOfOrder() { 84 | StreamSummary vs_increment = new StreamSummary(3); 85 | StreamSummary vs_single = new StreamSummary(3); 86 | String[] stream = {"A", "B", "C", "D", "A"}; 87 | Integer[] increments = {15, 20, 25, 30, 1}; 88 | 89 | for (int i = 0; i < stream.length; i++) { 90 | vs_increment.offer(stream[i], increments[i]); 91 | for (int k = 0; k < increments[i]; k++) { 92 | vs_single.offer(stream[i]); 93 | } 94 | } 95 | System.out.println("Insert with counts vs. single inserts:"); 96 | System.out.println(vs_increment); 97 | System.out.println(vs_single); 98 | 99 | List> topK_increment = vs_increment.topK(3); 100 | List> topK_single = vs_single.topK(3); 101 | 102 | for (int i = 0; i < topK_increment.size(); i++) { 103 | assertEquals(topK_increment.get(i).getItem(), 104 | topK_single.get(i).getItem()); 105 | } 106 | } 107 | 108 | @Test 109 | public void testGeometricDistribution() { 110 | StreamSummary vs = new StreamSummary(10); 111 | RandomEngine re = RandomEngine.makeDefault(); 112 | 113 | for (int i = 0; i < NUM_ITERATIONS; i++) { 114 | int z = Distributions.nextGeometric(0.25, re); 115 | vs.offer(z); 116 | } 117 | 118 | List top = vs.peek(5); 119 | System.out.println("Geometric:"); 120 | for (Integer e : top) { 121 | System.out.println(e); 122 | } 123 | 124 | int tippyTop = top.get(0); 125 | assertEquals(0, tippyTop); 126 | System.out.println(vs); 127 | } 128 | 129 | @SuppressWarnings("unchecked") 130 | @Test 131 | public void testCounterSerialization() throws IOException, ClassNotFoundException { 132 | StreamSummary vs = new StreamSummary(3); 133 | String[] stream = {"X", "X", "Y", "Z", "A", "B", "C", 
"X", "X", "A", "C", "A", "A"}; 134 | for (String i : stream) { 135 | vs.offer(i); 136 | } 137 | List> topK = vs.topK(3); 138 | for (Counter c : topK) { 139 | ByteArrayOutputStream baos = new ByteArrayOutputStream(); 140 | ObjectOutput oo = new ObjectOutputStream(baos); 141 | oo.writeObject(c); 142 | oo.close(); 143 | 144 | ObjectInput oi = new ObjectInputStream(new ByteArrayInputStream(baos.toByteArray())); 145 | Counter clone = (Counter) oi.readObject(); 146 | assertEquals(c.getCount(), clone.getCount()); 147 | assertEquals(c.getError(), clone.getError()); 148 | assertEquals(c.getItem(), clone.getItem()); 149 | } 150 | } 151 | 152 | 153 | @SuppressWarnings("unchecked") 154 | @Test 155 | public void testSerialization() throws IOException, ClassNotFoundException { 156 | StreamSummary vs = new StreamSummary(3); 157 | String[] stream = {"X", "X", "Y", "Z", "A", "B", "C", "X", "X", "A", "C", "A", "A"}; 158 | for (String i : stream) { 159 | vs.offer(i); 160 | } 161 | 162 | ByteArrayOutputStream baos = new ByteArrayOutputStream(); 163 | ObjectOutput oo = new ObjectOutputStream(baos); 164 | oo.writeObject(vs); 165 | oo.close(); 166 | 167 | ObjectInput oi = new ObjectInputStream(new ByteArrayInputStream(baos.toByteArray())); 168 | StreamSummary clone = (StreamSummary) oi.readObject(); 169 | 170 | assertEquals(vs.toString(), clone.toString()); 171 | } 172 | 173 | 174 | @Test 175 | public void testByteSerialization() throws IOException, ClassNotFoundException { 176 | StreamSummary vs = new StreamSummary(3); 177 | String[] stream = {"X", "X", "Y", "Z", "A", "B", "C", "X", "X", "A", "C", "A", "A"}; 178 | for (String i : stream) { 179 | vs.offer(i); 180 | } 181 | 182 | testSerialization(vs); 183 | 184 | // Empty 185 | vs = new StreamSummary(0); 186 | testSerialization(vs); 187 | } 188 | 189 | private void testSerialization(StreamSummary vs) throws IOException, ClassNotFoundException { 190 | byte[] bytes = vs.toBytes(); 191 | StreamSummary clone = new StreamSummary(bytes); 192 | 
193 | assertEquals(vs.toString(), clone.toString()); 194 | } 195 | } 196 | -------------------------------------------------------------------------------- /src/test/java/com/clearspring/analytics/stream/cardinality/RegisterSetTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2012 Clearspring Technologies, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.clearspring.analytics.stream.cardinality; 18 | 19 | import java.util.Random; 20 | 21 | import org.junit.Test; 22 | 23 | import static org.junit.Assert.assertEquals; 24 | 25 | public class RegisterSetTest { 26 | 27 | @Test 28 | public void testGetAndSet() throws Exception { 29 | RegisterSet rs = new RegisterSet((int) Math.pow(2, 4)); 30 | rs.set(0, 11); 31 | assertEquals(11, rs.get(0)); 32 | } 33 | 34 | @Test 35 | public void testGetAndSet_allPositions() throws Exception { 36 | RegisterSet rs = new RegisterSet((int) Math.pow(2, 4)); 37 | for (int i = 0; i < Math.pow(2, 4); i++) { 38 | rs.set(i, i % 31); 39 | assertEquals(i % 31, rs.get(i)); 40 | } 41 | } 42 | 43 | @Test 44 | public void testGetAndSet_withSmallBits() throws Exception { 45 | RegisterSet rs = new RegisterSet(6); 46 | rs.set(0, 11); 47 | assertEquals(11, rs.get(0)); 48 | } 49 | 50 | @Test 51 | public void testMerge() { 52 | Random rand = new Random(2); 53 | int count = 32; 54 | RegisterSet rs = new 
RegisterSet(count); 55 | RegisterSet[] rss = new RegisterSet[5]; 56 | 57 | for (int i = 0; i < rss.length; i++) { 58 | rss[i] = new RegisterSet(count); 59 | 60 | for (int pos = 0; pos < rs.count; pos++) { 61 | int val = rand.nextInt(10); 62 | rs.updateIfGreater(pos, val); 63 | rss[i].set(pos, val); 64 | } 65 | } 66 | 67 | RegisterSet merged = new RegisterSet(count); 68 | for (int i = 0; i < rss.length; i++) { 69 | merged.merge(rss[i]); 70 | } 71 | 72 | for (int pos = 0; pos < rs.count; pos++) { 73 | assertEquals(rs.get(pos), merged.get(pos)); 74 | } 75 | } 76 | 77 | @Test 78 | public void testMergeUsingUpdate() { 79 | Random rand = new Random(2); 80 | int count = 32; 81 | RegisterSet rs = new RegisterSet(count); 82 | RegisterSet[] rss = new RegisterSet[5]; 83 | 84 | for (int i = 0; i < rss.length; i++) { 85 | rss[i] = new RegisterSet(count); 86 | 87 | for (int pos = 0; pos < rs.count; pos++) { 88 | int val = rand.nextInt(10); 89 | rs.updateIfGreater(pos, val); 90 | rss[i].set(pos, val); 91 | } 92 | } 93 | 94 | RegisterSet merged = new RegisterSet(count); 95 | for (int i = 0; i < rss.length; i++) { 96 | for (int pos = 0; pos < rs.count; pos++) { 97 | merged.updateIfGreater(pos, rss[i].get(pos)); 98 | } 99 | } 100 | 101 | for (int pos = 0; pos < rs.count; pos++) { 102 | assertEquals(rs.get(pos), merged.get(pos)); 103 | } 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /src/test/java/com/clearspring/analytics/stream/cardinality/TestAdaptiveCounting.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011 Clearspring Technologies, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.clearspring.analytics.stream.cardinality; 18 | 19 | import java.util.Arrays; 20 | 21 | import org.junit.Ignore; 22 | import org.junit.Test; 23 | 24 | import static org.junit.Assert.assertArrayEquals; 25 | import static org.junit.Assert.assertEquals; 26 | 27 | 28 | public class TestAdaptiveCounting { 29 | 30 | @Test 31 | public void testRho() { 32 | assertEquals(17, LogLog.rho(0, 16)); 33 | assertEquals(16, LogLog.rho(1, 16)); 34 | assertEquals(15, LogLog.rho(2, 16)); 35 | assertEquals(1, LogLog.rho(0x00008000, 16)); 36 | 37 | assertEquals(23, LogLog.rho(0, 10)); 38 | assertEquals(22, LogLog.rho(1, 10)); 39 | assertEquals(21, LogLog.rho(2, 10)); 40 | assertEquals(1, LogLog.rho(0x00200000, 10)); 41 | } 42 | 43 | @Test 44 | public void testRhoL() { 45 | assertEquals(49, AdaptiveCounting.rho(0L, 16)); 46 | assertEquals(48, AdaptiveCounting.rho(1L, 16)); 47 | assertEquals(47, AdaptiveCounting.rho(2L, 16)); 48 | assertEquals(1, AdaptiveCounting.rho(0x80008000L, 32)); 49 | 50 | assertEquals(55, AdaptiveCounting.rho(0L, 10)); 51 | assertEquals(54, AdaptiveCounting.rho(1L, 10)); 52 | assertEquals(53, AdaptiveCounting.rho(2L, 10)); 53 | assertEquals(1, AdaptiveCounting.rho(0x0020000000000000L, 10)); 54 | 55 | assertEquals(3, AdaptiveCounting.rho(0xDEA07EEFFEEDCAFEL, 15)); 56 | } 57 | 58 | @Test 59 | public void testJ() { 60 | long x = 0xDEADBEEFFEEDCAFEL; 61 | int k = 12; 62 | int j = (int) (x >>> (Long.SIZE - k)); 63 | assertEquals(0xDEA, j); 64 | } 65 | 66 | @Test 67 | public void testMerge() throws 
CardinalityMergeException { 68 | int numToMerge = 10; 69 | int cardinality = 10000; 70 | 71 | AdaptiveCounting[] lcs = new AdaptiveCounting[numToMerge]; 72 | AdaptiveCounting baseline = new AdaptiveCounting(16); 73 | for (int i = 0; i < numToMerge; i++) { 74 | lcs[i] = new AdaptiveCounting(16); 75 | for (int j = 0; j < cardinality; j++) { 76 | double val = Math.random(); 77 | lcs[i].offer(val); 78 | baseline.offer(val); 79 | } 80 | } 81 | 82 | int expectedCardinality = numToMerge * cardinality; 83 | long mergedEstimate = AdaptiveCounting.mergeEstimators(lcs).cardinality(); 84 | double error = Math.abs(mergedEstimate - expectedCardinality) / (double) expectedCardinality; 85 | assertEquals(0.01, error, 0.01); 86 | 87 | AdaptiveCounting lc = lcs[0]; 88 | lcs = Arrays.asList(lcs).subList(1, lcs.length).toArray(new AdaptiveCounting[0]); 89 | mergedEstimate = lc.merge(lcs).cardinality(); 90 | error = Math.abs(mergedEstimate - expectedCardinality) / (double) expectedCardinality; 91 | assertEquals(0.01, error, 0.01); 92 | 93 | assertEquals(baseline.cardinality(), mergedEstimate); 94 | } 95 | 96 | @Ignore 97 | @Test 98 | public void testLongCardinality() { 99 | ICardinality ac = new AdaptiveCounting(16); 100 | for (long i = 0; i < 5000000000L; i++) { 101 | ac.offer(Long.valueOf(i)); 102 | if (i % 10000000 == 0) { 103 | System.out.println("actual: " + i + ", estimated: " + ac.cardinality()); 104 | } 105 | } 106 | 107 | System.out.println(ac.cardinality()); 108 | assertEquals(5000000000L, ac.cardinality(), 100000000); 109 | 110 | } 111 | 112 | @Test 113 | public void testSerialization() { 114 | AdaptiveCounting ac = new AdaptiveCounting(10); 115 | testSerialization(ac); 116 | } 117 | 118 | private void testSerialization(AdaptiveCounting ac) { 119 | AdaptiveCounting clone = new AdaptiveCounting(ac.getBytes()); 120 | assertAdaptiveCountingEquals(ac, clone); 121 | 122 | assertEquals(0, ac.cardinality()); 123 | 124 | for (int i = 0; i < 100; i++) { 125 | ac.offer(i); 126 | } 127 
| 128 | clone = new AdaptiveCounting(ac.getBytes()); 129 | assertAdaptiveCountingEquals(ac, clone); 130 | 131 | for (int i = 0; i < 1000000; i++) { 132 | ac.offer(i); 133 | } 134 | 135 | clone = new AdaptiveCounting(ac.getBytes()); 136 | assertAdaptiveCountingEquals(ac, clone); 137 | } 138 | 139 | private void assertAdaptiveCountingEquals(AdaptiveCounting expected, AdaptiveCounting actual) { 140 | assertArrayEquals(expected.M, actual.M); 141 | assertEquals(expected.k, actual.k); 142 | assertEquals(expected.m, actual.m); 143 | assertEquals(expected.Ca, actual.Ca, 0.00000001); 144 | assertEquals(expected.Rsum, actual.Rsum); 145 | 146 | assertEquals(expected.b_e, actual.b_e); 147 | assertEquals(expected.B_s, actual.B_s, 0.00000001); 148 | 149 | assertEquals(expected.sizeof(), actual.sizeof()); 150 | assertEquals(expected.cardinality(), actual.cardinality()); 151 | } 152 | } 153 | -------------------------------------------------------------------------------- /src/test/java/com/clearspring/analytics/stream/cardinality/TestHyperLogLog.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011 Clearspring Technologies, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.clearspring.analytics.stream.cardinality; 18 | 19 | import java.io.IOException; 20 | 21 | import java.util.Arrays; 22 | 23 | import com.clearspring.analytics.TestUtils; 24 | 25 | import com.google.common.base.Charsets; 26 | import com.google.common.hash.HashFunction; 27 | import com.google.common.hash.Hashing; 28 | 29 | import org.junit.Ignore; 30 | import org.junit.Test; 31 | 32 | import static org.junit.Assert.assertEquals; 33 | import static org.junit.Assert.assertTrue; 34 | 35 | public class TestHyperLogLog { 36 | 37 | @Test 38 | public void testComputeCount() { 39 | HyperLogLog hyperLogLog = new HyperLogLog(16); 40 | hyperLogLog.offer(0); 41 | hyperLogLog.offer(1); 42 | hyperLogLog.offer(2); 43 | hyperLogLog.offer(3); 44 | hyperLogLog.offer(16); 45 | hyperLogLog.offer(17); 46 | hyperLogLog.offer(18); 47 | hyperLogLog.offer(19); 48 | hyperLogLog.offer(19); 49 | assertEquals(8, hyperLogLog.cardinality()); 50 | } 51 | 52 | @Test 53 | public void testSerialization() throws IOException, ClassNotFoundException { 54 | HyperLogLog hll = new HyperLogLog(8); 55 | hll.offer("a"); 56 | hll.offer("b"); 57 | hll.offer("c"); 58 | hll.offer("d"); 59 | hll.offer("e"); 60 | 61 | HyperLogLog hll2 = (HyperLogLog) TestUtils.deserialize(TestUtils.serialize(hll)); 62 | assertEquals(hll.cardinality(), hll2.cardinality()); 63 | } 64 | 65 | @Test 66 | public void testSerializationUsingBuilder() throws IOException { 67 | HyperLogLog hll = new HyperLogLog(8); 68 | hll.offer("a"); 69 | hll.offer("b"); 70 | hll.offer("c"); 71 | hll.offer("d"); 72 | hll.offer("e"); 73 | 74 | HyperLogLog hll2 = HyperLogLog.Builder.build(hll.getBytes()); 75 | assertEquals(hll.cardinality(), hll2.cardinality()); 76 | } 77 | 78 | @Test 79 | public void testHighCardinality() { 80 | long start = System.currentTimeMillis(); 81 | HyperLogLog hyperLogLog = new HyperLogLog(10); 82 | int size = 10000000; 83 | for (int i = 0; i < size; i++) { 84 | 
hyperLogLog.offer(TestICardinality.streamElement(i)); 85 | } 86 | System.out.println("time: " + (System.currentTimeMillis() - start)); 87 | long estimate = hyperLogLog.cardinality(); 88 | double err = Math.abs(estimate - size) / (double) size; 89 | System.out.println(err); 90 | assertTrue(err < .1); 91 | } 92 | 93 | @Test 94 | public void testHighCardinality_withDefinedRSD() { 95 | long start = System.currentTimeMillis(); 96 | HyperLogLog hyperLogLog = new HyperLogLog(0.01); 97 | int size = 100000000; 98 | for (int i = 0; i < size; i++) { 99 | hyperLogLog.offer(TestICardinality.streamElement(i)); 100 | } 101 | System.out.println("time: " + (System.currentTimeMillis() - start)); 102 | long estimate = hyperLogLog.cardinality(); 103 | double err = Math.abs(estimate - size) / (double) size; 104 | System.out.println(err); 105 | assertTrue(err < .1); 106 | } 107 | 108 | @Test 109 | public void testMerge() throws CardinalityMergeException { 110 | int numToMerge = 5; 111 | int bits = 16; 112 | int cardinality = 1000000; 113 | 114 | HyperLogLog[] hyperLogLogs = new HyperLogLog[numToMerge]; 115 | HyperLogLog baseline = new HyperLogLog(bits); 116 | for (int i = 0; i < numToMerge; i++) { 117 | hyperLogLogs[i] = new HyperLogLog(bits); 118 | for (int j = 0; j < cardinality; j++) { 119 | double val = Math.random(); 120 | hyperLogLogs[i].offer(val); 121 | baseline.offer(val); 122 | } 123 | } 124 | 125 | 126 | long expectedCardinality = numToMerge * cardinality; 127 | HyperLogLog hll = hyperLogLogs[0]; 128 | hyperLogLogs = Arrays.asList(hyperLogLogs).subList(1, hyperLogLogs.length).toArray(new HyperLogLog[0]); 129 | long mergedEstimate = hll.merge(hyperLogLogs).cardinality(); 130 | long baselineEstimate = baseline.cardinality(); 131 | double se = expectedCardinality * (1.04 / Math.sqrt(Math.pow(2, bits))); 132 | 133 | System.out.println("Baseline estimate: " + baselineEstimate); 134 | System.out.println("Expect estimate: " + mergedEstimate + " is between " + (expectedCardinality - 
(3 * se)) + " and " + (expectedCardinality + (3 * se))); 135 | 136 | assertTrue(mergedEstimate >= expectedCardinality - (3 * se)); 137 | assertTrue(mergedEstimate <= expectedCardinality + (3 * se)); 138 | assertEquals(mergedEstimate, baselineEstimate); 139 | } 140 | 141 | /** 142 | * should not fail with HyperLogLogMergeException: "Cannot merge estimators of different sizes" 143 | */ 144 | @Test 145 | public void testMergeWithRegisterSet() throws CardinalityMergeException { 146 | HyperLogLog first = new HyperLogLog(16, new RegisterSet(1 << 20)); 147 | HyperLogLog second = new HyperLogLog(16, new RegisterSet(1 << 20)); 148 | first.offer(0); 149 | second.offer(1); 150 | first.merge(second); 151 | } 152 | 153 | @Test 154 | @Ignore 155 | public void testPrecise() throws CardinalityMergeException { 156 | int cardinality = 1000000000; 157 | int b = 12; 158 | HyperLogLog baseline = new HyperLogLog(b); 159 | HyperLogLog guava128 = new HyperLogLog(b); 160 | HashFunction hf128 = Hashing.murmur3_128(); 161 | for (int j = 0; j < cardinality; j++) { 162 | Double val = Math.random(); 163 | String valString = val.toString(); 164 | baseline.offer(valString); 165 | guava128.offerHashed(hf128.hashString(valString, Charsets.UTF_8).asLong()); 166 | if (j > 0 && j % 1000000 == 0) { 167 | System.out.println("current count: " + j); 168 | } 169 | } 170 | 171 | 172 | long baselineEstimate = baseline.cardinality(); 173 | long g128Estimate = guava128.cardinality(); 174 | double se = cardinality * (1.04 / Math.sqrt(Math.pow(2, b))); 175 | double baselineError = (baselineEstimate - cardinality) / (double) cardinality; 176 | double g128Error = (g128Estimate - cardinality) / (double) cardinality; 177 | System.out.format("b: %f g128 %f", baselineError, g128Error); 178 | assertTrue("baseline estimate bigger than expected", baselineEstimate >= cardinality - (2 * se)); 179 | assertTrue("baseline estimate smaller than expected", baselineEstimate <= cardinality + (2 * se)); 180 | assertTrue("g128 
estimate bigger than expected", g128Estimate >= cardinality - (2 * se)); 181 | assertTrue("g128 estimate smaller than expected", g128Estimate <= cardinality + (2 * se)); 182 | } 183 | } 184 | -------------------------------------------------------------------------------- /src/test/java/com/clearspring/analytics/stream/cardinality/TestICardinality.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011 Clearspring Technologies, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.clearspring.analytics.stream.cardinality; 18 | 19 | import java.util.Arrays; 20 | import java.util.Collection; 21 | import java.util.Random; 22 | 23 | import org.junit.Test; 24 | import org.junit.runner.RunWith; 25 | import org.junit.runners.Parameterized; 26 | import org.junit.runners.Parameterized.Parameters; 27 | 28 | import static org.junit.Assert.assertFalse; 29 | 30 | @RunWith(Parameterized.class) 31 | public class TestICardinality { 32 | 33 | private int N = 1000000; 34 | private ICardinality cardinalityEstimator; 35 | private static Random prng = new Random(); 36 | private static char[] hex = "0123456789abcdef".toCharArray(); 37 | 38 | public TestICardinality(ICardinality cardinalityEstimator) { 39 | super(); 40 | this.cardinalityEstimator = cardinalityEstimator; 41 | } 42 | 43 | @Test 44 | public void testOffer() { 45 | cardinalityEstimator.offer("A"); 46 | cardinalityEstimator.offer("B"); 47 | cardinalityEstimator.offer("C"); 48 | assertFalse(cardinalityEstimator.offer("C")); 49 | assertFalse(cardinalityEstimator.offer("B")); 50 | assertFalse(cardinalityEstimator.offer("A")); 51 | cardinalityEstimator.offer("ABCCBA"); 52 | cardinalityEstimator.offer("CBAABC"); 53 | cardinalityEstimator.offer("ABCABC"); 54 | cardinalityEstimator.offer("CBACBA"); 55 | assertFalse(cardinalityEstimator.offer("ABCCBA")); 56 | } 57 | 58 | @Test 59 | public void testICardinality() { 60 | System.out.println("size: " + cardinalityEstimator.sizeof() + " bytes"); 61 | for (int i = 0; i < N; i++) { 62 | cardinalityEstimator.offer(streamElement(i)); 63 | } 64 | 65 | long estimate = cardinalityEstimator.cardinality(); 66 | System.out.println(estimate); 67 | double err = Math.abs(estimate - N) / (double) N; 68 | System.out.println("% Error: " + err * 100); 69 | } 70 | 71 | static int se = 0; 72 | 73 | public static Object streamElement(int i) { 74 | return Long.toHexString(prng.nextLong()); 75 | //return se++; 76 | } 77 | 78 | @Parameters 79 | public static 
Collection regExValues() { 80 | return Arrays.asList(new Object[][]{ 81 | //{ new LinearCounting(65536) }, 82 | //{ new CountThenEstimate() }, 83 | {new AdaptiveCounting(16)}, 84 | //{ new LogLog(10) }, 85 | //{ new LogLog(12) }, 86 | //{ new LogLog(14) }, 87 | }); 88 | } 89 | 90 | } 91 | -------------------------------------------------------------------------------- /src/test/java/com/clearspring/analytics/stream/cardinality/TestLinearCounting.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011 Clearspring Technologies, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.clearspring.analytics.stream.cardinality; 18 | 19 | import java.util.Arrays; 20 | 21 | import com.clearspring.analytics.stream.cardinality.LinearCounting.Builder; 22 | import com.clearspring.analytics.stream.cardinality.LinearCounting.LinearCountingMergeException; 23 | 24 | import org.junit.Test; 25 | 26 | import static org.junit.Assert.assertArrayEquals; 27 | import static org.junit.Assert.assertEquals; 28 | import static org.junit.Assert.assertTrue; 29 | 30 | public class TestLinearCounting { 31 | 32 | @Test 33 | public void testComputeCount() { 34 | LinearCounting lc = new LinearCounting(4); 35 | lc.offer(0); 36 | lc.offer(1); 37 | lc.offer(2); 38 | lc.offer(3); 39 | lc.offer(16); 40 | lc.offer(17); 41 | lc.offer(18); 42 | lc.offer(19); 43 | assertEquals(27, lc.computeCount()); 44 | } 45 | 46 | @Test 47 | public void testSaturation() { 48 | LinearCounting lc = new LinearCounting(1); 49 | for (int i = 0; i < 27; i++) { 50 | lc.offer(i); 51 | } 52 | 53 | assertTrue(lc.isSaturated()); 54 | assertEquals(0, lc.getCount()); 55 | assertEquals(Long.MAX_VALUE, lc.cardinality()); 56 | } 57 | 58 | @Test 59 | public void testBuilder() { 60 | assertEquals(630, Builder.onePercentError(1).size); 61 | assertEquals(630, Builder.onePercentError(99).size); 62 | assertEquals(630, Builder.onePercentError(100).size); 63 | assertEquals(630, Builder.onePercentError(101).size); 64 | assertEquals(759, Builder.onePercentError(3375).size); 65 | assertEquals(995, Builder.onePercentError(9999).size); 66 | assertEquals(995, Builder.onePercentError(10000).size); 67 | assertEquals(996, Builder.onePercentError(10001).size); 68 | assertEquals(7501, Builder.onePercentError(305028).size); 69 | assertEquals(19272, Builder.onePercentError(1000000).size); 70 | assertEquals(23027, Builder.onePercentError(1250000).size); 71 | assertEquals(74962, Builder.onePercentError(5000000).size); 72 | assertEquals(81372, Builder.onePercentError(5500000).size); 73 | assertEquals(131030, 
Builder.onePercentError(9500000).size); 74 | assertEquals(137073, Builder.onePercentError(10000000).size); 75 | assertEquals(137073, Builder.onePercentError(10000001).size); 76 | assertEquals(355055, Builder.onePercentError(30000000).size); 77 | assertEquals(573038, Builder.onePercentError(50000000).size); 78 | assertEquals(822207, Builder.onePercentError(75000000).size); 79 | assertEquals(1071377, Builder.onePercentError(100000000).size); 80 | assertEquals(1167722, Builder.onePercentError(110000000).size); 81 | assertEquals(1264067, Builder.onePercentError(120000000).size); 82 | assertEquals(2500000, Builder.onePercentError(240000000).size); 83 | } 84 | 85 | @Test 86 | public void testArbitraryStdErrorSize() { 87 | // Some sanity check with 1% error 88 | assertEquals(630, Builder.withError(0.01, 100).size); 89 | assertEquals(759, Builder.withError(0.01, 3375).size); 90 | 91 | // Checking for 10% error (values from original paper) 92 | assertEquals(10, Builder.withError(0.1, 100).size); 93 | assertEquals(34, Builder.withError(0.1, 1000).size); 94 | assertEquals(214, Builder.withError(0.1, 10000).size); 95 | assertEquals(1593, Builder.withError(0.1, 100000).size); 96 | assertEquals(12610, Builder.withError(0.1, 1000000).size); 97 | assertEquals(103977, Builder.withError(0.1, 10000000).size); 98 | assertEquals(882720, Builder.withError(0.1, 100000000).size); 99 | } 100 | 101 | @Test(expected = IllegalArgumentException.class) 102 | public void testBuilderIllegalArgumentZero() { 103 | Builder.onePercentError(0); 104 | } 105 | 106 | @Test(expected = IllegalArgumentException.class) 107 | public void testBuilderIllegalArgumentNegative() { 108 | Builder.onePercentError(-1); 109 | } 110 | 111 | @Test 112 | public void testSerialization() { 113 | LinearCounting lc = new LinearCounting(4); 114 | lc.offer("a"); 115 | lc.offer("b"); 116 | lc.offer("c"); 117 | lc.offer("d"); 118 | lc.offer("e"); 119 | 120 | LinearCounting lc2 = new LinearCounting(lc.getBytes()); 121 | 
assertArrayEquals(lc.map, lc2.map); 122 | assertEquals(lc.count, lc2.count); 123 | assertEquals(lc.length, lc2.length); 124 | } 125 | 126 | @Test 127 | public void testMerge() throws LinearCountingMergeException { 128 | int numToMerge = 5; 129 | int size = 65536; 130 | int cardinality = 1000; 131 | 132 | LinearCounting[] lcs = new LinearCounting[numToMerge]; 133 | LinearCounting baseline = new LinearCounting(size); 134 | for (int i = 0; i < numToMerge; i++) { 135 | lcs[i] = new LinearCounting(size); 136 | for (int j = 0; j < cardinality; j++) { 137 | double val = Math.random(); 138 | lcs[i].offer(val); 139 | baseline.offer(val); 140 | } 141 | } 142 | 143 | int expectedCardinality = numToMerge * cardinality; 144 | long mergedEstimate = LinearCounting.mergeEstimators(lcs).cardinality(); 145 | double error = Math.abs(mergedEstimate - expectedCardinality) / (double) expectedCardinality; 146 | assertEquals(0.01, error, 0.01); 147 | 148 | LinearCounting lc = lcs[0]; 149 | lcs = Arrays.asList(lcs).subList(1, lcs.length).toArray(new LinearCounting[0]); 150 | mergedEstimate = lc.merge(lcs).cardinality(); 151 | error = Math.abs(mergedEstimate - expectedCardinality) / (double) expectedCardinality; 152 | assertEquals(0.01, error, 0.01); 153 | 154 | long baselineEstimate = baseline.cardinality(); 155 | assertEquals(baselineEstimate, mergedEstimate); 156 | } 157 | } 158 | -------------------------------------------------------------------------------- /src/test/java/com/clearspring/analytics/stream/cardinality/TestLogLog.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011 Clearspring Technologies, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.clearspring.analytics.stream.cardinality; 18 | 19 | import java.io.IOException; 20 | 21 | import java.util.Arrays; 22 | 23 | import com.google.common.base.Charsets; 24 | import com.google.common.hash.HashFunction; 25 | import com.google.common.hash.Hashing; 26 | 27 | import org.junit.Ignore; 28 | import org.junit.Test; 29 | 30 | import static org.junit.Assert.assertEquals; 31 | import static org.junit.Assert.assertTrue; 32 | 33 | public class TestLogLog { 34 | 35 | @Test 36 | public void testSerialization() throws IOException { 37 | LogLog hll = new LogLog(8); 38 | hll.offer("a"); 39 | hll.offer("b"); 40 | hll.offer("c"); 41 | hll.offer("d"); 42 | hll.offer("e"); 43 | 44 | LogLog hll2 = new LogLog(hll.getBytes()); 45 | assertEquals(hll.cardinality(), hll2.cardinality()); 46 | } 47 | 48 | @Test 49 | public void testHighCardinality() { 50 | long start = System.currentTimeMillis(); 51 | LogLog loglog = new LogLog(10); 52 | int size = 10000000; 53 | for (int i = 0; i < size; i++) { 54 | loglog.offer(TestICardinality.streamElement(i)); 55 | } 56 | System.out.println("time: " + (System.currentTimeMillis() - start)); 57 | long estimate = loglog.cardinality(); 58 | double err = Math.abs(estimate - size) / (double) size; 59 | System.out.println(err); 60 | assertTrue(err < .11); 61 | } 62 | 63 | @Test 64 | public void testHighCardinalityHighOrder() { 65 | long start = System.currentTimeMillis(); 66 | LogLog loglog = new LogLog(25); 67 | int size = 100000000; 68 | for (int i = 0; i < size; i++) { 69 | 
loglog.offer(TestICardinality.streamElement(i)); 70 | } 71 | System.out.println("time: " + (System.currentTimeMillis() - start)); 72 | long estimate = loglog.cardinality(); 73 | double err = Math.abs(estimate - size) / (double) size; 74 | System.out.println(size); 75 | System.out.println(estimate); 76 | System.out.println(err); 77 | assertTrue(err < .06); 78 | } 79 | 80 | @Test 81 | public void testMerge() throws CardinalityMergeException { 82 | int numToMerge = 5; 83 | int bits = 16; 84 | int cardinality = 1000000; 85 | 86 | LogLog[] loglogs = new LogLog[numToMerge]; 87 | LogLog baseline = new LogLog(bits); 88 | for (int i = 0; i < numToMerge; i++) { 89 | loglogs[i] = new LogLog(bits); 90 | for (int j = 0; j < cardinality; j++) { 91 | double val = Math.random(); 92 | loglogs[i].offer(val); 93 | baseline.offer(val); 94 | } 95 | } 96 | 97 | 98 | LogLog hll = loglogs[0]; 99 | loglogs = Arrays.asList(loglogs).subList(1, loglogs.length).toArray(new LogLog[0]); 100 | long mergedEstimate = hll.merge(loglogs).cardinality(); 101 | long baselineEstimate = baseline.cardinality(); 102 | 103 | System.out.println("Baseline estimate: " + baselineEstimate); 104 | 105 | assertEquals(mergedEstimate, baselineEstimate); 106 | } 107 | 108 | @Test 109 | @Ignore 110 | public void testPrecise() throws CardinalityMergeException { 111 | int cardinality = 1000000000; 112 | int b = 12; 113 | LogLog baseline = new LogLog(b); 114 | LogLog guava128 = new LogLog(b); 115 | HashFunction hf128 = Hashing.murmur3_128(); 116 | for (int j = 0; j < cardinality; j++) { 117 | Double val = Math.random(); 118 | String valString = val.toString(); 119 | baseline.offer(valString); 120 | guava128.offerHashed(hf128.hashString(valString, Charsets.UTF_8).asLong()); 121 | if (j > 0 && j % 1000000 == 0) { 122 | System.out.println("current count: " + j); 123 | } 124 | } 125 | 126 | 127 | long baselineEstimate = baseline.cardinality(); 128 | long g128Estimate = guava128.cardinality(); 129 | double se = cardinality * 
(1.04 / Math.sqrt(Math.pow(2, b))); 130 | double baselineError = (baselineEstimate - cardinality) / (double) cardinality; 131 | double g128Error = (g128Estimate - cardinality) / (double) cardinality; 132 | System.out.format("b: %f g128 %f", baselineError, g128Error); 133 | assertTrue("baseline estimate bigger than expected", baselineEstimate >= cardinality - (2 * se)); 134 | assertTrue("baseline estimate smaller than expected", baselineEstimate <= cardinality + (2 * se)); 135 | assertTrue("g128 estimate bigger than expected", g128Estimate >= cardinality - (2 * se)); 136 | assertTrue("g128 estimate smaller than expected", g128Estimate <= cardinality + (2 * se)); 137 | } 138 | } 139 | -------------------------------------------------------------------------------- /src/test/java/com/clearspring/analytics/stream/membership/Base64Test.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011 Clearspring Technologies, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.clearspring.analytics.stream.membership; 18 | 19 | import java.io.IOException; 20 | 21 | import java.nio.charset.Charset; 22 | 23 | import com.google.common.io.Resources; 24 | 25 | import org.apache.commons.codec.binary.Base64; 26 | 27 | import org.junit.Test; 28 | 29 | import static org.junit.Assert.assertFalse; 30 | import static org.junit.Assert.assertTrue; 31 | 32 | 33 | public class Base64Test { 34 | 35 | @Test 36 | public void testBase64EncodedBloomFilter() throws IOException, ClassNotFoundException { 37 | BloomFilter bf = BloomFilter.deserialize(Base64.decodeBase64(Resources.toString(Resources.getResource(Base64Test.class, "encoded_random_keys.bloom"), Charset.forName("UTF-8")))); 38 | assertTrue(bf.isPresent("4a7137513e61adbb")); 39 | assertTrue(bf.isPresent("4ba145c986af5848")); 40 | assertTrue(bf.isPresent("4b8c73a241c9d017")); 41 | assertTrue(bf.isPresent("4bafd549baae6a0c")); 42 | assertTrue(bf.isPresent("4b98ed851c5fc689")); 43 | assertTrue(bf.isPresent("4bbead53d3600f7c")); 44 | assertTrue(bf.isPresent("4bc21f2d4a4a8941")); 45 | assertTrue(bf.isPresent("4b991b45226abc99")); 46 | assertFalse(bf.isPresent("blurg")); 47 | assertFalse(bf.isPresent("bowzer")); 48 | assertFalse(bf.isPresent("4b991b45226abc90")); 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/test/java/com/clearspring/analytics/stream/membership/FilterTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | package com.clearspring.analytics.stream.membership; 20 | 21 | import java.io.IOException; 22 | 23 | import java.util.HashSet; 24 | import java.util.Iterator; 25 | import java.util.Set; 26 | 27 | import org.junit.Test; 28 | 29 | import static org.junit.Assert.assertEquals; 30 | import static org.junit.Assert.assertFalse; 31 | import static org.junit.Assert.assertTrue; 32 | 33 | public class FilterTest { 34 | 35 | public void testManyHashes(Iterator keys) { 36 | int MAX_HASH_COUNT = 128; 37 | Set hashes = new HashSet(); 38 | int collisions = 0; 39 | while (keys.hasNext()) { 40 | hashes.clear(); 41 | for (int hashIndex : Filter.getHashBuckets(keys.next(), MAX_HASH_COUNT, 1024 * 1024)) { 42 | hashes.add(hashIndex); 43 | } 44 | collisions += (MAX_HASH_COUNT - hashes.size()); 45 | } 46 | assertTrue("Collisions: " + collisions, collisions <= 100); 47 | } 48 | 49 | @Test 50 | public void testManyRandom() { 51 | testManyHashes(randomKeys()); 52 | } 53 | 54 | // used by filter subclass tests 55 | 56 | static final double MAX_FAILURE_RATE = 0.1; 57 | public static final BloomCalculations.BloomSpecification spec = BloomCalculations.computeBucketsAndK(MAX_FAILURE_RATE); 58 | static final int ELEMENTS = 10000; 59 | 60 | static final ResetableIterator intKeys() { 61 | return new KeyGenerator.IntGenerator(ELEMENTS); 62 | } 63 | 64 | static final ResetableIterator randomKeys() { 65 | return new KeyGenerator.RandomStringGenerator(314159, ELEMENTS); 66 | } 67 | 68 | static final ResetableIterator randomKeys2() { 69 | return new 
KeyGenerator.RandomStringGenerator(271828, ELEMENTS); 70 | } 71 | 72 | public static void testFalsePositives(Filter f, ResetableIterator keys, ResetableIterator otherkeys) { 73 | assertEquals(keys.size(), otherkeys.size()); 74 | 75 | while (keys.hasNext()) { 76 | f.add(keys.next()); 77 | } 78 | 79 | int fp = 0; 80 | while (otherkeys.hasNext()) { 81 | if (f.isPresent(otherkeys.next())) { 82 | fp++; 83 | } 84 | } 85 | 86 | double fp_ratio = fp / (keys.size() * BloomCalculations.probs[spec.bucketsPerElement][spec.K]); 87 | assertTrue("FP ratio: " + fp_ratio, fp_ratio < 1.03); 88 | } 89 | 90 | public static Filter testSerialize(Filter f) throws IOException { 91 | f.add("a"); 92 | DataOutputBuffer out = new DataOutputBuffer(); 93 | f.getSerializer().serialize(f, out); 94 | 95 | DataInputBuffer in = new DataInputBuffer(); 96 | in.reset(out.getData(), out.getLength()); 97 | Filter f2 = f.getSerializer().deserialize(in); 98 | 99 | assertTrue(f2.isPresent("a")); 100 | assertFalse(f2.isPresent("b")); 101 | return f2; 102 | } 103 | 104 | } 105 | -------------------------------------------------------------------------------- /src/test/java/com/clearspring/analytics/stream/membership/KeyGenerator.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | package com.clearspring.analytics.stream.membership; 20 | 21 | import java.io.BufferedReader; 22 | import java.io.FileInputStream; 23 | import java.io.FileNotFoundException; 24 | import java.io.IOException; 25 | import java.io.InputStreamReader; 26 | 27 | import java.util.Iterator; 28 | import java.util.Random; 29 | 30 | public class KeyGenerator { 31 | 32 | private static String randomKey(Random r) { 33 | StringBuilder buffer = new StringBuilder(); 34 | for (int j = 0; j < 16; j++) { 35 | buffer.append((char) r.nextInt()); 36 | } 37 | return buffer.toString(); 38 | } 39 | 40 | static class RandomStringGenerator implements ResetableIterator, Iterable { 41 | 42 | int i, n, seed; 43 | Random random; 44 | 45 | RandomStringGenerator(int seed, int n) { 46 | i = 0; 47 | this.seed = seed; 48 | this.n = n; 49 | reset(); 50 | } 51 | 52 | public int size() { 53 | return n; 54 | } 55 | 56 | public void reset() { 57 | random = new Random(seed); 58 | } 59 | 60 | public boolean hasNext() { 61 | return i < n; 62 | } 63 | 64 | public String next() { 65 | i++; 66 | return randomKey(random); 67 | } 68 | 69 | public void remove() { 70 | throw new UnsupportedOperationException(); 71 | } 72 | 73 | @Override 74 | public Iterator iterator() { 75 | return this; 76 | } 77 | } 78 | 79 | static class IntGenerator implements ResetableIterator { 80 | 81 | private int i, start, n; 82 | 83 | IntGenerator(int n) { 84 | this(0, n); 85 | } 86 | 87 | IntGenerator(int start, int n) { 88 | this.start = start; 89 | this.n = n; 90 | reset(); 91 | } 
92 | 93 | public int size() { 94 | return n - start; 95 | } 96 | 97 | public void reset() { 98 | i = start; 99 | } 100 | 101 | public boolean hasNext() { 102 | return i < n; 103 | } 104 | 105 | public String next() { 106 | return Integer.toString(i++); 107 | } 108 | 109 | public void remove() { 110 | throw new UnsupportedOperationException(); 111 | } 112 | } 113 | 114 | static class WordGenerator implements ResetableIterator { 115 | 116 | static int WORDS; 117 | 118 | static { 119 | try { 120 | BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream("/usr/share/dict/words"))); 121 | while (br.ready()) { 122 | br.readLine(); 123 | WORDS++; 124 | } 125 | } catch (IOException e) { 126 | WORDS = 0; 127 | } 128 | } 129 | 130 | BufferedReader reader; 131 | private int modulo; 132 | private int skip; 133 | String next; 134 | 135 | WordGenerator(int skip, int modulo) { 136 | this.skip = skip; 137 | this.modulo = modulo; 138 | reset(); 139 | } 140 | 141 | public int size() { 142 | return (1 + WORDS - skip) / modulo; 143 | } 144 | 145 | public void reset() { 146 | try { 147 | reader = new BufferedReader(new InputStreamReader(new FileInputStream("/usr/share/dict/words"))); 148 | } catch (FileNotFoundException e) { 149 | throw new RuntimeException(e); 150 | } 151 | for (int i = 0; i < skip; i++) { 152 | try { 153 | reader.readLine(); 154 | } catch (IOException e) { 155 | throw new RuntimeException(e); 156 | } 157 | } 158 | next(); 159 | } 160 | 161 | public boolean hasNext() { 162 | return next != null; 163 | } 164 | 165 | public String next() { 166 | try { 167 | String s = next; 168 | for (int i = 0; i < modulo; i++) { 169 | next = reader.readLine(); 170 | } 171 | return s; 172 | } catch (IOException e) { 173 | throw new RuntimeException(e); 174 | } 175 | } 176 | 177 | public void remove() { 178 | throw new UnsupportedOperationException(); 179 | } 180 | } 181 | } 182 | -------------------------------------------------------------------------------- 
/src/test/java/com/clearspring/analytics/stream/membership/ResetableIterator.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 
18 | */ 19 | package com.clearspring.analytics.stream.membership; 20 | 21 | import java.util.Iterator; 22 | 23 | public interface ResetableIterator extends Iterator { 24 | 25 | public void reset(); 26 | 27 | int size(); 28 | } 29 | -------------------------------------------------------------------------------- /src/test/java/com/clearspring/analytics/stream/quantile/QDigestTest.java: -------------------------------------------------------------------------------- 1 | package com.clearspring.analytics.stream.quantile; 2 | 3 | import java.util.Arrays; 4 | 5 | import org.junit.Test; 6 | 7 | import cern.jet.random.Normal; 8 | import cern.jet.random.engine.MersenneTwister64; 9 | import cern.jet.random.engine.RandomEngine; 10 | import static org.junit.Assert.assertEquals; 11 | import static org.junit.Assert.assertTrue; 12 | 13 | public class QDigestTest { 14 | 15 | @Test 16 | public void testComprehensiveOnMixture() { 17 | RandomEngine r = new MersenneTwister64(0); 18 | Normal[] dists = new Normal[]{ 19 | new Normal(100, 50, r), 20 | new Normal(150, 20, r), 21 | new Normal(500, 300, r), 22 | new Normal(10000, 10000, r), 23 | new Normal(1200, 300, r), 24 | }; 25 | for (int numSamples : new int[]{1, 10, 100, 1000, 10000}) { 26 | long[][] samples = new long[dists.length][]; 27 | for (int i = 0; i < dists.length; ++i) { 28 | samples[i] = new long[numSamples]; 29 | for (int j = 0; j < samples[i].length; ++j) { 30 | samples[i][j] = (long) Math.max(0, dists[i].nextDouble()); 31 | } 32 | } 33 | double compressionFactor = 1000; 34 | int logCapacity = 1; 35 | long max = 0; 36 | for (long[] s : samples) { 37 | for (long x : s) max = Math.max(max, x); 38 | } 39 | for (double scale = 1; scale < max; scale *= 2, logCapacity++) { 40 | ; 41 | } 42 | double eps = logCapacity / compressionFactor; 43 | 44 | QDigest[] digests = new QDigest[dists.length]; 45 | for (int i = 0; i < digests.length; ++i) { 46 | digests[i] = new QDigest(compressionFactor); 47 | for (long x : samples[i]) { 48 | 
digests[i].offer(x); 49 | } 50 | assertEquals(samples[i].length, digests[i].computeActualSize()); 51 | } 52 | 53 | int numTotal = 0; 54 | for (int i = 0; i < digests.length; ++i) { 55 | for (double q = 0; q <= 1; q += 0.01) { 56 | long res = digests[i].getQuantile(q); 57 | double[] actualRank = actualRankOf(res, samples[i]); 58 | assertTrue( 59 | actualRank[0] + " .. " + actualRank[1] + " outside error bound for " + q, 60 | q >= actualRank[0] - eps && q <= actualRank[1] + eps); 61 | } 62 | 63 | // Test the same on the union of all distributions up to i-th 64 | numTotal += samples[i].length; 65 | long[] total = new long[numTotal]; 66 | int offset = 0; 67 | QDigest totalDigest = new QDigest(compressionFactor); 68 | long expectedSize = 0; 69 | for (int j = 0; j <= i; ++j) { 70 | System.arraycopy(samples[j], 0, total, offset, samples[j].length); 71 | offset += samples[j].length; 72 | totalDigest = QDigest.unionOf(totalDigest, digests[j]); 73 | expectedSize += samples[j].length; 74 | } 75 | assertEquals(expectedSize, totalDigest.computeActualSize()); 76 | 77 | for (double q = 0; q <= 1; q += 0.01) { 78 | long res = totalDigest.getQuantile(q); 79 | double[] actualRank = actualRankOf(res, total); 80 | assertTrue( 81 | actualRank[0] + " .. 
" + actualRank[1] + " outside error bound for " + q, 82 | q >= actualRank[0] - eps && q <= actualRank[1] + eps); 83 | } 84 | } 85 | } 86 | } 87 | 88 | private double[] actualRankOf(long x, long[] ys) { 89 | int numSmaller = 0; 90 | int numEqual = 0; 91 | for (long y : ys) if (y < x) numSmaller++; 92 | for (long y : ys) if (y == x) numEqual++; 93 | return new double[]{ 94 | 1.0 * numSmaller / ys.length, 95 | 1.0 * (numSmaller + numEqual) / ys.length 96 | }; 97 | } 98 | 99 | /** 100 | * Test for bug identified and corrected by http://github.com/addthis/stream-lib/pull/52 101 | */ 102 | @Test 103 | public void testMerge() { 104 | int compressionFactor = 2; 105 | 106 | long[] aSamples = {0, 0, 1, 0, 1, 1}; 107 | long[] bSamples = {0, 1, 0, 0, 0, 3}; 108 | long[] allSamples = Arrays.copyOf(aSamples, aSamples.length + bSamples.length); 109 | System.arraycopy(bSamples, 0, allSamples, aSamples.length, bSamples.length); 110 | 111 | QDigest a = new QDigest(compressionFactor); 112 | QDigest b = new QDigest(compressionFactor); 113 | QDigest c = new QDigest(compressionFactor); 114 | for (long x : aSamples) a.offer(x); 115 | for (long x : bSamples) b.offer(x); 116 | for (long x : allSamples) c.offer(x); 117 | QDigest ab = QDigest.unionOf(a, b); 118 | 119 | System.out.println("a: " + a); 120 | System.out.println("b: " + b); 121 | System.out.println("ab: " + ab); 122 | System.out.println("c: " + c); 123 | 124 | assertEquals(allSamples.length, c.computeActualSize()); 125 | 126 | int logCapacity = 1; 127 | long max = 0; 128 | for (long x : allSamples) max = Math.max(max, x); 129 | for (double scale = 1; scale < max; scale *= compressionFactor, logCapacity++) { 130 | } 131 | 132 | double eps = logCapacity / compressionFactor; 133 | for (double q = 0; q <= 1; q += 0.01) { 134 | long res = c.getQuantile(q); 135 | double[] actualRank = actualRankOf(res, allSamples); 136 | assertTrue( 137 | actualRank[0] + " .. 
" + actualRank[1] + " outside error bound for " + q, 138 | q >= actualRank[0] - eps && q <= actualRank[1] + eps); 139 | } 140 | } 141 | 142 | /** 143 | * Test for bug identified and corrected by http://github.com/addthis/stream-lib/pull/53 144 | */ 145 | @Test 146 | public void testSerialization() { 147 | long[] samples = {0, 20}; 148 | QDigest digestA = new QDigest(2); 149 | 150 | for (int i = 0; i < samples.length; i++) { 151 | digestA.offer(samples[i]); 152 | } 153 | byte[] serialized = QDigest.serialize(digestA); 154 | 155 | QDigest deserializedA = QDigest.deserialize(serialized); 156 | 157 | QDigest digestB = new QDigest(2); 158 | for (int i = 0; i < samples.length; i++) { 159 | digestB.offer(samples[i]); 160 | } 161 | 162 | QDigest.unionOf(digestA, deserializedA); 163 | 164 | 165 | } 166 | } 167 | -------------------------------------------------------------------------------- /src/test/java/com/clearspring/analytics/util/TestDoublyLinkedList.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011 Clearspring Technologies, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.clearspring.analytics.util; 18 | 19 | import java.util.ConcurrentModificationException; 20 | 21 | import org.junit.Test; 22 | 23 | import static org.junit.Assert.assertArrayEquals; 24 | import static org.junit.Assert.assertEquals; 25 | import static org.junit.Assert.assertFalse; 26 | import static org.junit.Assert.assertNull; 27 | import static org.junit.Assert.assertTrue; 28 | import static org.junit.Assert.fail; 29 | 30 | public class TestDoublyLinkedList { 31 | 32 | @Test 33 | public void testDoublyLinkedList() { 34 | DoublyLinkedList list = new DoublyLinkedList(); 35 | assertIsEmpty(list); 36 | } 37 | 38 | @Test 39 | public void testAdd() { 40 | DoublyLinkedList list = new DoublyLinkedList(); 41 | list.add(1); 42 | assertFalse(list.isEmpty()); 43 | assertEquals(1, list.size()); 44 | assertArrayEquals(new Integer[]{1}, list.toArray()); 45 | list.add(2); 46 | assertFalse(list.isEmpty()); 47 | assertEquals(2, list.size()); 48 | assertArrayEquals(new Integer[]{1, 2}, list.toArray()); 49 | list.add(3); 50 | assertFalse(list.isEmpty()); 51 | assertEquals(3, list.size()); 52 | assertArrayEquals(new Integer[]{1, 2, 3}, list.toArray()); 53 | assertEquals(new Integer(1), list.first()); 54 | } 55 | 56 | @Test 57 | public void testEnqueue() { 58 | DoublyLinkedList list = new DoublyLinkedList(); 59 | list.enqueue(1); 60 | assertFalse(list.isEmpty()); 61 | assertEquals(1, list.size()); 62 | assertArrayEquals(new Integer[]{1}, list.toArray()); 63 | list.enqueue(2); 64 | assertFalse(list.isEmpty()); 65 | assertEquals(2, list.size()); 66 | assertArrayEquals(new Integer[]{2, 1}, list.toArray()); 67 | list.enqueue(3); 68 | assertFalse(list.isEmpty()); 69 | assertEquals(3, list.size()); 70 | assertArrayEquals(new Integer[]{3, 2, 1}, list.toArray()); 71 | assertEquals(new Integer(3), list.first()); 72 | assertEquals(new Integer(1), list.last()); 73 | } 74 | 75 | @Test 76 | public void testAddNode() { 77 | DoublyLinkedList list = new DoublyLinkedList(); 
78 | list.add(new ListNode2(1)); 79 | assertFalse(list.isEmpty()); 80 | assertEquals(1, list.size()); 81 | assertArrayEquals(new Integer[]{1}, list.toArray()); 82 | list.add(new ListNode2(2)); 83 | assertFalse(list.isEmpty()); 84 | assertEquals(2, list.size()); 85 | assertArrayEquals(new Integer[]{1, 2}, list.toArray()); 86 | list.add(new ListNode2(3)); 87 | assertFalse(list.isEmpty()); 88 | assertEquals(3, list.size()); 89 | assertArrayEquals(new Integer[]{1, 2, 3}, list.toArray()); 90 | assertEquals(new Integer(1), list.first()); 91 | } 92 | 93 | @Test 94 | public void testAddAfter() { 95 | DoublyLinkedList list = new DoublyLinkedList(); 96 | list.add(1); 97 | ListNode2 node2 = list.add(2); 98 | ListNode2 node4 = list.add(4); 99 | 100 | list.addAfter(node2, 3); 101 | assertEquals(4, list.size()); 102 | assertArrayEquals(new Integer[]{1, 2, 3, 4}, list.toArray()); 103 | 104 | ListNode2 node5 = list.addAfter(node4, 5); 105 | assertEquals(5, list.size()); 106 | assertArrayEquals(new Integer[]{1, 2, 3, 4, 5}, list.toArray()); 107 | assertEquals(new Integer(5), list.last()); 108 | assertEquals(node5, list.head()); 109 | } 110 | 111 | @Test 112 | public void testRemove() { 113 | DoublyLinkedList list = new DoublyLinkedList(); 114 | ListNode2 node1 = list.add(1); 115 | list.remove(node1); 116 | 117 | node1 = list.add(1); 118 | ListNode2 node2 = list.add(2); 119 | list.remove(node1); 120 | assertEquals(1, list.size()); 121 | assertEquals(new Integer(2), list.first()); 122 | assertEquals(node2, list.head()); 123 | assertArrayEquals(new Integer[]{2}, list.toArray()); 124 | list.remove(node2); 125 | assertIsEmpty(list); 126 | 127 | node1 = list.add(1); 128 | node2 = list.add(2); 129 | list.remove(node2); 130 | assertEquals(1, list.size()); 131 | assertEquals(new Integer(1), list.first()); 132 | assertEquals(node1, list.head()); 133 | assertArrayEquals(new Integer[]{1}, list.toArray()); 134 | 135 | node2 = list.add(2); 136 | list.add(3); 137 | assertEquals(3, list.size()); 
138 | assertArrayEquals(new Integer[]{1, 2, 3}, list.toArray()); 139 | list.remove(node2); 140 | assertEquals(2, list.size()); 141 | assertEquals(node1, list.tail()); 142 | assertEquals(new Integer(3), list.last()); 143 | assertArrayEquals(new Integer[]{1, 3}, list.toArray()); 144 | } 145 | 146 | @Test(expected = ConcurrentModificationException.class) 147 | public void testConcurrentModification() { 148 | DoublyLinkedList list = new DoublyLinkedList(); 149 | list.add(1); 150 | list.add(2); 151 | list.add(3); 152 | 153 | for (int i : list) { 154 | if (i == 2) { 155 | list.add(4); 156 | } 157 | } 158 | } 159 | 160 | private void assertIsEmpty(DoublyLinkedList list) { 161 | assertNull(list.tail()); 162 | assertNull(list.head()); 163 | assertNull(list.first()); 164 | assertNull(list.last()); 165 | assertTrue(list.isEmpty()); 166 | assertEquals(0, list.size()); 167 | for (T i : list) { 168 | fail("What is this: " + i + " ?"); 169 | } 170 | } 171 | 172 | } 173 | -------------------------------------------------------------------------------- /src/test/java/com/clearspring/experimental/stream/cardinality/TestHyperBitBit.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2011 Clearspring Technologies, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
*/

package com.clearspring.experimental.stream.cardinality;

import com.clearspring.analytics.stream.cardinality.TestICardinality;

import org.junit.Test;
import org.junit.Ignore;
import static org.junit.Assert.assertTrue;


/**
 * Accuracy checks for {@link HyperBitBit} on large streams.
 *
 * <p>Each test feeds elements into a fresh estimator, compares the reported
 * cardinality against the known number of distinct inputs and requires the
 * relative error to stay below {@link #MAX_RELATIVE_ERROR}.
 */
public class TestHyperBitBit {
    // These tests are "in progress" until HyperBitBit has better known error bounds.
    // For now they are all marked @Ignore, so they can never fail a build.

    /** Upper bound on the relative error, shared by the printed messages and the assertions. */
    private static final double MAX_RELATIVE_ERROR = 0.1;

    /**
     * Runs several repetitions over the elements produced by
     * {@code TestICardinality.streamElement(i)} and checks the mean relative error.
     */
    @Test
    @Ignore
    public void testSimpleHighCardinality() {
        int size = 10000000;
        int maxRepetitions = 5;
        double meanError = 0;
        for (int repetition = 0; repetition < maxRepetitions; ++repetition) {
            long start = System.currentTimeMillis();
            HyperBitBit hyperBitBit = new HyperBitBit();

            for (int i = 0; i < size; i++) {
                hyperBitBit.offer(TestICardinality.streamElement(i));
            }
            System.out.println("time: " + (System.currentTimeMillis() - start));
            double err = relativeError(hyperBitBit.cardinality(), size);
            // Accumulate the mean incrementally: each run contributes err / maxRepetitions.
            meanError += (err / maxRepetitions);
            System.out.println(err);
        }
        String message = "This value should be less than " + MAX_RELATIVE_ERROR + ": " + meanError;
        System.out.println(message);
        assertTrue(message, meanError < MAX_RELATIVE_ERROR);
    }

    /** Offers every value four times in a row; duplicates must not inflate the estimate. */
    @Test
    @Ignore
    public void testMultipleOrderedHighCardinality() {
        int size = 10000000;

        long start = System.currentTimeMillis();

        HyperBitBit hyperBitBit = new HyperBitBit();

        for (int i = 0; i < size; i++) {
            hyperBitBit.offer(i);
            hyperBitBit.offer(i);
            hyperBitBit.offer(i);
            hyperBitBit.offer(i);
        }

        System.out.println("time: " + (System.currentTimeMillis() - start));
        double err = relativeError(hyperBitBit.cardinality(), size);
        System.out.println(err);
        String message = "This value should be less than " + MAX_RELATIVE_ERROR + ": " + err;
        System.out.println(message);
        assertTrue(message, err < MAX_RELATIVE_ERROR);
    }

    /** Offers the whole value range three times over; duplicates must not inflate the estimate. */
    @Test
    @Ignore
    public void testMultipleUnorderedHighCardinality() {
        int size = 10000000;

        long start = System.currentTimeMillis();

        HyperBitBit hyperBitBit = new HyperBitBit();

        for (int pass = 0; pass < 3; pass++) {
            for (int i = 0; i < size; i++) {
                hyperBitBit.offer(i);
            }
        }

        System.out.println("time: " + (System.currentTimeMillis() - start));
        double err = relativeError(hyperBitBit.cardinality(), size);
        System.out.println(err);

        String message = "This value should be less than " + MAX_RELATIVE_ERROR + ": " + err;
        System.out.println(message);
        assertTrue(message, err < MAX_RELATIVE_ERROR);
    }

    /**
     * Computes the relative error of an estimate.
     *
     * @param estimate cardinality reported by the estimator
     * @param expected true number of distinct elements; must be non-zero
     * @return {@code |estimate - expected| / expected}
     */
    private static double relativeError(long estimate, int expected) {
        return Math.abs(estimate - expected) / (double) expected;
    }
}

--------------------------------------------------------------------------------