#!/usr/bin/env bash
# vim: ft=bash
#
# Runs the test suite after making sure the Java classes are compiled.
#
#   bin/test              run the tests; extra arguments go to the :test alias
#   bin/test check        run the :check alias
#   bin/test coverage ... run the :coverage alias; extra arguments are forwarded
#
# NOTE(review): the deps.edn shown alongside this script only defines the
# :dev, :build and :test aliases -- confirm :check and :coverage exist.

# Stop at the first failure (as bin/build does). Without this a failed `cd`
# or a failed Java compile was ignored and the tests ran anyway.
set -e

cd "$(dirname "${BASH_SOURCE[0]}")/.."

# The Clojure code imports bigml.sketchy.MurmurUtil, so compile it first.
bin/build javac

if [[ $1 = check ]]; then
  exec clojure -M:check
elif [[ $1 = coverage ]]; then
  shift
  exec clojure -M:coverage "$@"
else
  exec clojure -M:test "$@"
fi
14 | -------------------------------------------------------------------------------- /project.clj: -------------------------------------------------------------------------------- 1 | (defproject bigml/sketchy "0.4.2" 2 | :description "Sketching algorithms in Clojure" 3 | :url "https://github.com/bigmlcom/sketchy" 4 | :license {:name "Apache License, Version 2.0" 5 | :url "http://www.apache.org/licenses/LICENSE-2.0"} 6 | :aliases {"lint" ["do" "check," "eastwood"] 7 | "distcheck" ["do" "clean," "lint," "test"]} 8 | :source-paths ["src/clj"] 9 | :java-source-paths ["src/java"] 10 | :jvm-opts ^:replace ["-server"] 11 | :profiles {:dev {:plugins [[jonase/eastwood "0.2.3"]]}} 12 | :dependencies [[org.clojure/clojure "1.9.0"] 13 | [org.clj-commons/byte-transforms "0.2.2"]]) 14 | -------------------------------------------------------------------------------- /test/bigml/sketchy/test/bits.clj: -------------------------------------------------------------------------------- 1 | ;; Copyright 2013, 2014 BigML 2 | ;; Licensed under the Apache License, Version 2.0 3 | ;; http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | (ns bigml.sketchy.test.bits 6 | (:require [clojure.test :refer :all] 7 | (bigml.sketchy [bits :as bits]))) 8 | 9 | (deftest bits-test 10 | (let [[s1 s2] (partition 128 (shuffle (range 256))) 11 | bs1 (apply bits/set (bits/create 256) s1) 12 | bs2 (apply bits/set (bits/create 256) s2)] 13 | (is (= (sort s1) (bits/set-seq bs1))) 14 | (is (= (sort s2) (bits/clear-seq bs1))) 15 | (is (empty? (bits/set-seq (apply bits/flip bs1 s1)))) 16 | (is (empty? (bits/set-seq (bits/and bs1 bs2)))) 17 | (is (empty? 
(bits/clear-seq (bits/or bs1 bs2)))))) 18 | -------------------------------------------------------------------------------- /test/bigml/sketchy/test/bloom.clj: -------------------------------------------------------------------------------- 1 | ;; Copyright 2013, 2014 BigML 2 | ;; Licensed under the Apache License, Version 2.0 3 | ;; http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | (ns bigml.sketchy.test.bloom 6 | (:require [clojure.test :refer :all] 7 | (bigml.sketchy [bloom :as bloom]))) 8 | 9 | (deftest bloom 10 | (let [d1 (range 10000) 11 | d2 (range 5000 15000) 12 | d3 (range 20000) 13 | b1 (reduce bloom/insert (bloom/create 15000 0.02) d1) 14 | b2 (reduce bloom/insert (bloom/create 15000 0.02) d2) 15 | true-count #(count (filter true? (map (partial bloom/contains? %1) %2)))] 16 | (is (= (true-count b1 d1) 10000)) ;; Never any false negatives 17 | (is (<= 10000 (true-count b1 d3) 10200)) 18 | (is (<= 15000 (true-count (bloom/merge b1 b2) d3) 15300)))) 19 | -------------------------------------------------------------------------------- /test/bigml/sketchy/test/count_min.clj: -------------------------------------------------------------------------------- 1 | ;; Copyright 2013, 2014, 2015 BigML 2 | ;; Licensed under the Apache License, Version 2.0 3 | ;; http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | (ns bigml.sketchy.test.count-min 6 | (:require [clojure.test :refer :all] 7 | (bigml.sketchy [count-min :as cm]))) 8 | 9 | (defn- make-data [size] 10 | (let [rnd (java.util.Random. 
;; Tokens are separated by line breaks or single spaces.
(def ^:private split-pattern #"\r?\n| ")

;; Tokens dropped from the output: the empty string and bare punctuation.
(def ^:private stop-tokens #{"" "?" "." "," "!" ";" ":"})

(defn- get-tokens
  "Reads the gzipped text file, splits it on `split-pattern`, drops
   every token in `stop-tokens` and lower-cases what is left.

   The file is slurped and split while the stream is still open, so
   the lazy seq returned here never touches the closed stream."
  [file]
  (with-open [in (-> file io/input-stream GZIPInputStream.)]
    (map str/lower-case
         (remove stop-tokens
                 (str/split (slurp in) split-pattern)))))

;; The paths below are relative, so they resolve against the directory
;; the JVM was started from.
(def hamlet-tokens (get-tokens "res/hamlet.txt.gz"))

;; Split each play into two halves for the similarity examples.
(let [parts (partition-all (/ (count hamlet-tokens) 2) hamlet-tokens)]
  (def hamlet-part1 (first parts))
  (def hamlet-part2 (second parts)))

(def midsummer-tokens (get-tokens "res/midsummer.txt.gz"))

(let [parts (partition-all (/ (count midsummer-tokens) 2) midsummer-tokens)]
  (def midsummer-part1 (first parts))
  (def midsummer-part2 (second parts)))
-3694557840885048153] 14 | [{:foo 1 :bar 2} -584811730086931020]]] 15 | (doseq [[val expected-hash] pairs] 16 | (is (= expected-hash (murmur/hash val)))))) 17 | 18 | (deftest vary-bits 19 | (let [pairs [[63 6231696022289519434] 20 | [31 1825051466] 21 | [15 4938] 22 | [7 74] 23 | [3 2]]] 24 | (doseq [[bits expected-hash] pairs] 25 | (is (= expected-hash (murmur/truncate (murmur/hash "foo") bits)))))) 26 | 27 | (deftest seeds 28 | (is (= (murmur/hash "foo" 0) 6231696022289519434)) 29 | (is (= (murmur/hash "foo" 123) 4010191379894525224)) 30 | (is (= (take 3 (murmur/hash-seq "foo")) 31 | '(6231696022289519434 -1965669315023635442 -4826411765733908310)))) 32 | -------------------------------------------------------------------------------- /deps.edn: -------------------------------------------------------------------------------- 1 | {:description "Sketching algorithms in Clojure" 2 | :url "https://github.com/bigmlcom/sketchy" 3 | :license {:name "Apache License, Version 2.0" 4 | :url "http://www.apache.org/licenses/LICENSE-2.0"} 5 | :paths ["src/clj" "target/classes"] 6 | :deps {org.clj-commons/byte-transforms {:mvn/version "0.2.1"}} 7 | :deps/prep-lib {:ensure "target/classes" 8 | :alias :build 9 | :fn javac} 10 | 11 | :aliases {:dev {:extra-deps {}} 12 | :build 13 | {:deps {org.clojure/clojure {:mvn/version "1.11.1"} 14 | org.slf4j/slf4j-api {:mvn/version "2.0.6"} 15 | org.clojure/tools.build {:mvn/version "0.8.4"} 16 | slipset/deps-deploy {:git/url "https://github.com/slipset/deps-deploy.git" 17 | :git/sha "c6c67a065dc24ef61cae756ec836e0db179b767f"}} 18 | :ns-default build} 19 | :test 20 | {:extra-paths ["test"] 21 | :extra-deps {lambdaisland/kaocha {:mvn/version "1.84.1335"}} 22 | :jvm-opts ["-XX:-OmitStackTraceInFastThrow" 23 | "-Duser.language=en" 24 | "-Duser.country=US"] 25 | :main-opts ["-m" "kaocha.runner"]}}} -------------------------------------------------------------------------------- /test/bigml/sketchy/test/min_hash.clj: 
-------------------------------------------------------------------------------- 1 | ;; Copyright 2013, 2014, 2015, 2016 BigML 2 | ;; Licensed under the Apache License, Version 2.0 3 | ;; http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | (ns bigml.sketchy.test.min-hash 6 | (:require [clojure.test :refer :all] 7 | (bigml.sketchy [min-hash :as mh]))) 8 | 9 | (deftest similarity-test 10 | (is (<= 0.85 11 | (mh/jaccard-similarity (mh/into (mh/create) (range 0 5000)) 12 | (mh/into (mh/create) (range 500 5000))) 13 | 0.95)) 14 | (is (== 1 (mh/jaccard-similarity (mh/merge (mh/into (mh/create) 15 | (range 1000)) 16 | (mh/into (mh/create) 17 | (range 500 1500))) 18 | (mh/into (mh/create) (range 1500)))))) 19 | 20 | (deftest speed-test 21 | (let [start (System/currentTimeMillis) 22 | similarity 23 | (mh/jaccard-similarity (mh/into (mh/create) (range 0.0 1E6)) 24 | (mh/into (mh/create) (range 1E5 1E6))) 25 | end (System/currentTimeMillis)] 26 | ;; On a 2.2 GHz Intel Core i7 this takes ~160 seconds before 27 | ;; commit e7a629b, after those changes it takes ~300 ms 28 | (is (< (- end start) 10000)) 29 | (is (< 0.85 similarity 0.95)))) 30 | -------------------------------------------------------------------------------- /src/java/bigml/sketchy/MurmurUtil.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2014 BigML 3 | * Licensed under the Apache License, Version 2.0 4 | * http://www.apache.org/licenses/LICENSE-2.0 5 | */ 6 | package bigml.sketchy; 7 | 8 | import byte_transforms.CassandraMurmurHash; 9 | import java.nio.ByteBuffer; 10 | 11 | public class MurmurUtil { 12 | 13 | public static long hash(Object o, long seed) { 14 | Class cls = o.getClass(); 15 | 16 | ByteBuffer bytes; 17 | 18 | if (cls == Double.class) { 19 | bytes = ByteBuffer.allocate(8); 20 | bytes.putDouble((Double) o); 21 | } else if (cls == Long.class) { 22 | bytes = ByteBuffer.allocate(8); 23 | bytes.putLong((Long) o); 24 | } else if (cls == 
String.class) { 25 | byte[] rawBytes = ((String) o).getBytes(); 26 | bytes = ByteBuffer.allocate(rawBytes.length); 27 | bytes.put(rawBytes); 28 | } else if (cls == Integer.class) { 29 | bytes = ByteBuffer.allocate(4); 30 | bytes.putInt((Integer) o); 31 | } else if (cls == Short.class) { 32 | bytes = ByteBuffer.allocate(2); 33 | bytes.putShort((Short) o); 34 | } else if (cls == Byte.class) { 35 | bytes = ByteBuffer.allocate(1); 36 | bytes.put((Byte) o); 37 | } else { 38 | return 0; 39 | } 40 | bytes.rewind(); 41 | return CassandraMurmurHash.hash2_64(bytes, 0, bytes.remaining(), seed); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/clj/bigml/sketchy/murmur.clj: -------------------------------------------------------------------------------- 1 | ;; Copyright 2014, 2015 BigML 2 | ;; Licensed under the Apache License, Version 2.0 3 | ;; http://www.apache.org/licenses/LICENSE-2.0 4 | 5 | (ns bigml.sketchy.murmur 6 | "Functions for Murmur hashing. 7 | http://en.wikipedia.org/wiki/MurmurHash" 8 | (:refer-clojure :exclude [hash]) 9 | (:import (bigml.sketchy MurmurUtil))) 10 | 11 | (def ^:private default-seed 1651860712) 12 | 13 | (defn- hash* [val seed] 14 | (if val 15 | (let [hv (MurmurUtil/hash val seed)] 16 | (if (zero? hv) 17 | (hash* (clojure.core/hash val) seed) 18 | hv)) 19 | 0)) 20 | 21 | (defn- seed->long [seed] 22 | (cond (nil? seed) default-seed 23 | (not (instance? Long seed)) (clojure.core/hash seed) 24 | (zero? seed) default-seed 25 | :else seed)) 26 | 27 | (defn hash 28 | "Returns a long hash given a value and an optional seed." 29 | ([val] 30 | (hash* val default-seed)) 31 | ([val seed] 32 | (hash* val (seed->long seed)))) 33 | 34 | (defn truncate 35 | "Truncates the hash-value given the desired number of bits." 
#!/usr/bin/env bash
# vim: ft=bash
#
# Build entry point. The first argument selects the task:
#   clean    remove the target directory
#   hiera    run the :hiera alias
#   javac    compile the Java sources when they are newer than the classes
#   deploy   deploy via the :build alias, prompting for Clojars credentials
#   <other>  passed straight to `clojure -T:build`
#
# NOTE(review): the deps.edn shown alongside this script defines no :hiera
# alias -- confirm it exists before relying on the `hiera` task.

set -e

cd "$(dirname "${BASH_SOURCE[0]}")/.."

# Prints the most recently modified file below directory $1 whose name ends
# in ".$2", or an empty line when there is none.
find-latest() {
  local file latest=
  # NUL-delimited read: the previous `for file in $(find ...)` split paths
  # containing whitespace into several bogus names.
  while IFS= read -r -d '' file; do
    [[ -z $latest || $file -nt $latest ]] && latest=$file
  done < <(find "$1" -type f -name "*.$2" -print0)
  echo "$latest"
}

if [[ $1 = clean ]]; then
  rm -rf target
elif [[ $1 = hiera ]]; then
  shift
  exec clojure -J-Dclojure.main.report=stderr -X:hiera "$@"
elif [[ $1 = javac ]]; then
  shift
  # Recompile when there are no classes yet or a source file is newer than
  # the newest class file; otherwise fall through and exit successfully.
  if [[ ! -d target/classes || "$(find-latest src/java java)" -nt "$(find-latest target/classes class)" ]]; then
    echo "Compiling Java class files"
    exec clojure -J-Dclojure.main.report=stderr -T:build javac "$@"
  fi
elif [[ $1 = deploy ]]; then
  shift
  if [[ -z $CLOJARS_USERNAME ]]; then
    read -r -p "Clojars username: " CLOJARS_USERNAME
    if [[ -z $CLOJARS_USERNAME ]]; then
      echo "No username available, aborting" >&2
      exit 1
    fi
    export CLOJARS_USERNAME
  fi
  if [[ -z $CLOJARS_PASSWORD ]]; then
    # -s keeps the deploy token from being echoed to the terminal.
    read -r -s -p "Clojars deploy token: " CLOJARS_PASSWORD
    echo >&2
    if [[ -z $CLOJARS_PASSWORD ]]; then
      echo "No deploy token available, aborting" >&2
      exit 1
    fi
    export CLOJARS_PASSWORD
  fi
  exec clojure -J-Dclojure.main.report=stderr -T:build deploy "$@"
else
  exec clojure -J-Dclojure.main.report=stderr -T:build "$@"
fi
(defn- hash-offsets
  "Returns a vector with one counter index per hasher for the value.
   Hasher i owns the i-th run of 2^hash-bits counters, so its index is
   the low hash-bits bits of the hash seeded with i, plus i * 2^hash-bits."
  [val hashers hash-bits]
  (let [offset (bit-shift-left 1 hash-bits)
        doffset (unchecked-dec offset)]
    (loop [i 0
           offsets []]
      (if (= i hashers)
        offsets
        (recur (inc i)
               (conj offsets (+ (bit-and (murmur/hash val i) doffset)
                                (* offset i))))))))

(defn create
  "Creates a count-min sketch given the desired number of hash-bits
   and the number of hashers. The total number of counters maintained
   by the sketch will be (2^hash-bits)*hashers, so choose these values
   carefully. Defaults to 15 hash-bits and 3 hashers."
  [& {:keys [hash-bits hashers] :or {hash-bits 15 hashers 3}}]
  {:inserts 0
   :hash-bits hash-bits
   :hashers hashers
   :counters (vec (repeat (* hashers (bit-shift-left 1 hash-bits)) 0))})

(defn- insert*
  "Adds one occurrence of val: bumps :inserts and increments the
   counter selected by each hasher."
  [sketch val]
  (let [{:keys [hashers hash-bits counters inserts]} sketch]
    (assoc sketch
      :inserts (inc inserts)
      :counters (reduce #(update %1 %2 inc)
                        counters
                        (hash-offsets val hashers hash-bits)))))

(defn insert
  "Inserts one or more values into the count-min sketch."
  [sketch & vals]
  (reduce insert* sketch vals))

(defn into
  "Inserts a collection of values into the count-min sketch."
  [sketch coll]
  (reduce insert* sketch coll))

(defn- merge*
  "Merges two sketches by adding their insert totals and their
   counters position by position. Throws when the sketches were
   created with different :hashers or :hash-bits."
  [sketch1 sketch2]
  (when (apply not= (map (juxt :hashers :hash-bits) [sketch1 sketch2]))
    (throw (Exception. "Sketch options must match for merging.")))
  (assoc sketch1
    :inserts (+ (:inserts sketch1) (:inserts sketch2))
    :counters (mapv + (:counters sketch1) (:counters sketch2))))

(defn merge
  "Merges the count-min sketches."
  [sketch & more]
  (reduce merge* sketch more))

(defn estimate-count
  "Returns an estimated occurrence count for the value: the smallest
   of the counters the value hashes to. Every insert of the value
   increments all of those counters, so the estimate never undercounts;
   the true count is guaranteed to be no greater than the estimate.
   Returns 0 for a sketch with no hashers."
  [sketch val]
  (let [{:keys [hashers hash-bits counters]} sketch
        offsets (hash-offsets val hashers hash-bits)]
    ;; Zero counters must take part in the minimum: a single zero proves
    ;; the value was never inserted. Filtering zeros out beforehand (as
    ;; this function used to) reported the collision noise held in the
    ;; remaining counters instead of 0.
    (if (empty? offsets)
      0
      (apply min (map counters offsets)))))
;; Natural log of 2, used when sizing the filter.
(def ^:private log2 (Math/log 2))

(defn- choose-params
  "Returns [bits k] for the expected population and false positive
   probability: the filter holds 2^bits bins (the first power of two
   that is not below the computed bin count) and uses k hashes per
   value, never fewer than one."
  [population false-positive-prob]
  (let [wanted-bins (- (/ (* population (Math/log false-positive-prob))
                          (* log2 log2)))
        bins (->> (iterate #(* 2 %) 1)
                  (drop-while #(< % wanted-bins))
                  first)
        k (double (* log2 (/ bins population)))]
    [(Long/numberOfTrailingZeros bins) (max (Math/round k) 1)]))

(defn create
  "Builds an empty bloom filter sized for the expected number of
   unique values and the desired false positive rate."
  [population false-positive-rate]
  (let [[bits k] (choose-params population false-positive-rate)]
    {:bits bits
     :k k
     :bins (bits/create (bit-shift-left 1 bits))}))

(defn- insert*
  "Sets the k bins selected by the first k seeded hashes of val."
  [{:keys [bits k] :as bloom} val]
  (update bloom :bins
          #(apply bits/set % (take k (murmur/hash-seq val bits)))))

(defn insert
  "Adds each of the given values to the bloom filter."
  [bloom & vals]
  (reduce insert* bloom vals))

(defn into
  "Adds every value in the collection to the bloom filter."
  [bloom coll]
  (reduce insert* bloom coll))

(defn- merge*
  "Ors the bins of two filters. Throws when their :k or :bits differ."
  [bloom1 bloom2]
  (let [options (juxt :k :bits)]
    (when-not (= (options bloom1) (options bloom2))
      (throw (Exception. "Bloom options must match for merging."))))
  (update bloom1 :bins bits/or (:bins bloom2)))

(defn merge
  "Combines the bloom filters into one."
  [bloom & more]
  (reduce merge* bloom more))

(defn contains?
  "Returns true when every bin selected for the value is set, false
   otherwise. A true answer may be a false positive; a false answer
   is always correct."
  [bloom val]
  (let [{:keys [bits k bins]} bloom]
    (every? #(true? (bits/test bins %))
            (take k (murmur/hash-seq val bits)))))

(defn distinct
  "Lazily removes items the filter has already seen. Membership is
   tracked with a bloom filter (population defaults to 1E4, false
   positive rate to 1E-2), so a false positive drops an item even
   though it had not occurred before."
  [vals & {:keys [population false-positive-rate]}]
  (let [seen (atom (create (or population 1E4)
                           (or false-positive-rate 1E-2)))
        seen? (fn [item]
                (let [member? (contains? @seen item)]
                  (when-not member?
                    (swap! seen insert item))
                  member?))]
    (remove seen? vals)))
(defn- check-size!
  "Throws unless both sketches have the same number of bins."
  [sketch1 sketch2]
  (when (not= (count sketch1) (count sketch2))
    (throw (Exception. "HyperLogLog sketches must be the same size."))))

(defn- merge*
  "Merges two sketches by keeping the larger value of each bin."
  [sketch1 sketch2]
  (check-size! sketch1 sketch2)
  (mapv (comp byte max) sketch1 sketch2))

(defn merge
  "Merges the hyper-loglog sketches."
  [sketch & more]
  (reduce merge* sketch more))

(defn distinct-count
  "Estimates the number of distinct values inserted into the
   hyper-loglog sketch. Returns a long."
  [sketch]
  (let [m (count sketch)
        ;; bins that still hold zero
        v (count (filter zero? sketch))
        ;; raw estimate built from the sum of 2^-bin over all bins
        e (* (/ (reduce + (map #(Math/pow 2 (- %)) sketch)))
             (bit-shift-left m 1)
             (/ 0.7213 (inc (/ 1.079 m)))
             m)]
    ;; For small estimates with zero bins left, estimate from the
    ;; fraction of zero bins instead of the raw value.
    (long (if (and (< e (* 5 m)) (pos? v))
            (* 2 m (Math/log (/ m v)))
            e))))

(defn jaccard-similarity
  "Calculates an estimate of the Jaccard similarity between the sets
   each sketch represents. Returns nil when the estimated union is
   empty (for example two sketches with nothing inserted), where the
   similarity is undefined."
  [sketch1 sketch2]
  (check-size! sketch1 sketch2)
  (let [union (distinct-count (merge sketch1 sketch2))
        intersection (- (+ (distinct-count sketch1)
                           (distinct-count sketch2))
                        union)]
    ;; distinct-count returns longs, so an empty union used to throw
    ;; an ArithmeticException (divide by zero) here.
    (when (pos? union)
      (double (/ intersection union)))))
(defn test
  "Returns true or false for the bit at the given index."
  [bits index]
  ;; index / 64 selects the word, index mod 64 the bit inside it
  (bit-test (bits (bit-shift-right index 6))
            (bit-and index 0x3f)))

(defn- set* [bits index]
  (let [word-index (bit-shift-right index 6)]
    (assoc bits word-index (bit-set (bits word-index)
                                    (bit-and index 0x3f)))))

(defn set
  "Sets the bits associated with each index."
  [bits & indices]
  (reduce set* bits indices))

(defn clear* [bits index]
  (let [word-index (bit-shift-right index 6)]
    (assoc bits word-index (bit-clear (bits word-index)
                                      (bit-and index 0x3f)))))

(defn clear
  "Clears the bits associated with each index."
  [bits & indices]
  (reduce clear* bits indices))

(defn flip* [bits index]
  (let [word-index (bit-shift-right index 6)]
    (assoc bits word-index (bit-flip (bits word-index)
                                     (bit-and index 0x3f)))))

(defn flip
  "Flips the bits associated with each index."
  [bits & indices]
  (reduce flip* bits indices))

(defn set-seq
  "Returns a seq containing the indices of all set bits."
  [bits]
  ;; TODO - Replace with a not-so-slow implementation!
  (keep #(when (test bits %) %) (range (* 64 (count bits)))))

(defn clear-seq
  "Returns a seq containing the indices of all unset bits."
  [bits]
  ;; TODO - Replace with a not-so-slow implementation!
  (keep #(when-not (test bits %) %) (range (* 64 (count bits)))))

(defn- check-size!
  "Throws unless both bitsets are backed by the same number of longs."
  [bits1 bits2]
  (when (not= (count bits1) (count bits2))
    (throw (Exception. "Bit sets are not the same size."))))

(defn and
  "Ands the two bitsets."
  [bits1 bits2]
  (check-size! bits1 bits2)
  (mapv bit-and bits1 bits2))

(defn or
  "Ors the two bitsets."
  [bits1 bits2]
  (check-size! bits1 bits2)
  (mapv bit-or bits1 bits2))

(defn set-count
  "Returns the total number of set bits."
  [bits]
  (reduce + (map #(Long/bitCount %) bits)))

(defn hamming-distance
  "Returns the hamming distance between the bit sets (same as L1 or
   manhattan distance)."
  [bits1 bits2]
  (check-size! bits1 bits2)
  (set-count (map bit-xor bits1 bits2)))

(defn dot-product
  "Returns the dot product (or inner product) of the bit sets."
  [bits1 bits2]
  (check-size! bits1 bits2)
  (set-count (map bit-and bits1 bits2)))

(defn jaccard-similarity
  "Calculates the Jaccard similarity between the bit sets. Returns
   nil when neither bitset has any bit set. Throws when the bitsets
   differ in size."
  [bits1 bits2]
  ;; Check before computing the union: `map` stops at the shorter
  ;; bitset, so mismatched all-zero inputs used to return nil quietly.
  (check-size! bits1 bits2)
  (let [union (set-count (map bit-or bits1 bits2))]
    (when (pos? union)
      (double (/ (dot-product bits1 bits2) union)))))

(defn cosine-similarity
  "Calculates the cosine similarity between the bit sets. Returns
   nil when either bitset has no bit set. Throws when the bitsets
   differ in size."
  [bits1 bits2]
  ;; Same up-front check as the other two-bitset functions; without it
  ;; a zero magnitude skipped the size check inside dot-product.
  (check-size! bits1 bits2)
  (let [magnitude (Math/sqrt (* (set-count bits1) (set-count bits2)))]
    (when (pos? magnitude)
      (/ (dot-product bits1 bits2) magnitude))))
;; Marks a bin that has not received any value yet.
(def ^:private max-long Long/MAX_VALUE)

;; Expands one long into a seq of 64 booleans, one per bit, lowest
;; bit first.
(defn- make-rand-shifts [bits]
  (let [arr (long-array 1)
        _ (aset-long arr 0 bits)
        bs (BitSet/valueOf arr)]
    (for [i (range 64)] (.get bs i))))

;; Lazy, unbounded and deterministic stream of booleans: the bits of
;; 1, (hash 1), (hash (hash 1)), ... Used by densify to pick which
;; neighbour an empty bin borrows from.
(def ^:private rand-shifts
  (mapcat make-rand-shifts (iterate murmur/hash 1)))

(defn create
  "Create a min-hash with an optional desired error rate when
   calculating similarity estimates (defaults to 0.05)."
  ([] (create 0.05))
  ([error-rate]
     ;; The sketch holds the first power of two (starting at 128) that
     ;; is not below 1/error-rate^2; every bin starts out empty.
     (let [min-size (/ (* error-rate error-rate))]
       (vec (repeat (->> (iterate (partial * 2) 128)
                         (drop-while #(< % min-size))
                         (first))
                    max-long)))))

;; Hashes val once: the low bits of the hash choose the bin and the
;; remaining bits (shifted down past the index bits) are the candidate
;; value. The bin keeps the smallest candidate seen so far.
(defn- insert* [sketch val]
  (let [hv (murmur/hash val)
        index (bit-and (dec (count sketch)) hv)
        hv2 (->> (count sketch)
                 (Long/numberOfTrailingZeros)
                 (bit-shift-right hv))]
    (if (> (sketch index) hv2)
      (assoc sketch index hv2)
      sketch)))

(defn insert
  "Inserts one or more values into the min-hash."
  [sketch & vals]
  (reduce insert* sketch vals))

(defn into
  "Inserts a collection of values into the min-hash."
  [sketch coll]
  (reduce insert* sketch coll))

;; Throws unless both sketches have the same number of bins.
(defn- check-size! [sketch1 sketch2]
  (when (not= (count sketch1) (count sketch2))
    (throw (Exception. "Min-hash sketches must be the same size."))))

(defn- densify
  "Densifies the min-hash sketch so it may be used for similarity
   estimation."
  [sketch]
  (let [filled-bins (remove #(= % max-long) sketch)]
    (loop [result []
           bins sketch
           ;; Cursor over the filled bins, wrapping around: its first
           ;; element is the filled bin before the current position and
           ;; its second element is the next filled bin.
           filled-bins (cons (last filled-bins) (cycle filled-bins))
           shifts rand-shifts]
      (cond (empty? bins) result
            ;; NOTE(review): the cursor above is built with cons, so it
            ;; does not look like this branch can be taken -- confirm.
            (empty? filled-bins) bins
            ;; Empty bin: borrow from the previous or the next filled
            ;; bin, chosen by the next boolean; the cursor stays put.
            (= (first bins) max-long)
            (recur (conj result
                         (if (first shifts)
                           (first filled-bins)
                           (second filled-bins)))
                   (next bins)
                   filled-bins
                   (next shifts))
            ;; Filled bin: keep its value and advance the cursor.
            :else
            (recur (conj result (first bins))
                   (next bins)
                   (next filled-bins)
                   (next shifts))))))

(defn jaccard-similarity
  "Calculates an estimate of the Jaccard similarity between the sets
   each sketch represents."
  [sketch1 sketch2]
  (check-size! sketch1 sketch2)
  ;; Fraction of positions at which the two densified sketches agree.
  (-> (filter true? (map = (densify sketch1) (densify sketch2)))
      (count)
      (/ (count sketch1))
      (double)))

;; Bin-wise minimum of two equally sized sketches.
(defn- merge* [sketch1 sketch2]
  (check-size! sketch1 sketch2)
  (mapv min sketch1 sketch2))

(defn merge
  "Merges the min-hashes (analogous to a union of the sets they
   represent)."
  [sketch & more]
  (reduce merge* sketch more))
[stream-lib](https://github.com/addthis/stream-lib) is a 19 | good alternative for those in need of speed. 20 | 21 | General Utilities: 22 | - [MurmurHash](#murmurhash) 23 | - [Immutable Bitset](#immutable-bitset) 24 | 25 | Sketching/hash-based algorithms: 26 | - [Bloom Filter](#bloom-filter) 27 | - [Min Hash](#min-hash) 28 | - [Hyper-LogLog](#hyper-loglog) 29 | - [Count-Min](#count-min) 30 | 31 | As we review each section, feel free to follow along in the REPL. Note 32 | that `bigml.sketchy.test.demo` loads *Hamlet* and *A Midsummer Night's 33 | Dream* into memory for our code examples. 34 | 35 | ```clojure 36 | user> (ns test 37 | (:use [bigml.sketchy.test.demo]) 38 | (:require (bigml.sketchy [murmur :as murmur] 39 | [bits :as bits] 40 | [bloom :as bloom] 41 | [min-hash :as min-hash] 42 | [hyper-loglog :as hll] 43 | [count-min :as count-min]))) 44 | ``` 45 | 46 | ## MurmurHash 47 | 48 | The `bigml.sketchy.murmur` namespace makes it easy to generate seeded 49 | [Murmur hashes](http://en.wikipedia.org/wiki/MurmurHash). Murmur hashes 50 | are popular as they are reasonably quick to produce and adequately 51 | random. 52 | 53 | These Murmur hashes are all produced as 64 bit longs. A simple example 54 | hashing the string "foo" to a long: 55 | 56 | ```clojure 57 | test> (murmur/hash "foo") 58 | 6231696022289519434 59 | ``` 60 | 61 | Anything that `clojure.core/hash` accepts may also be used with this 62 | hash fn: 63 | 64 | ```clojure 65 | test> (murmur/hash {:foo "bar"}) 66 | -7720779806311024803 67 | ``` 68 | 69 | An optional seed parameter selects a unique hashing function. Anything 70 | that's hashable by `clojure.core/hash` is valid as a seed. 
71 | 72 | ```clojure 73 | test> (murmur/hash "foo" 0) 74 | 6231696022289519434 75 | test> (murmur/hash "foo" 42) 76 | -8820575662888368925 77 | test> (murmur/hash "foo" :bar) 78 | -8527955061573093315 79 | ``` 80 | 81 | The `truncate` function truncates a hash to the given number of bits 82 | (must be less than 64 and more than 0). 83 | 84 | ```clojure 85 | test> (murmur/truncate (murmur/hash "foo") 32) 86 | 3972535114 87 | test> (murmur/truncate (murmur/hash "foo") 16) 88 | 4938 89 | test> (murmur/truncate (murmur/hash "foo") 8) 90 | 74 91 | ``` 92 | 93 | If you need multiple unique hashes for a value, `hash-seq` is a 94 | convenience function for that. It applies an infinite sequence of 95 | unique hash functions (always in the same order), so `take` as many 96 | as you need. 97 | 98 | ```clojure 99 | test> (take 3 (murmur/hash-seq "foo")) 100 | (6231696022289519434 -1965669315023635442 -4826411765733908310) 101 | ``` 102 | 103 | ## Immutable Bitset 104 | 105 | Besides being my favorite name for a namespace, `bigml.sketchy.bits` 106 | provides an immutable bitset supporting bit-level operations for any 107 | number of bits. The bitset is backed by a vector of longs. 108 | 109 | The `create` function builds a bitset given the desired number of 110 | bits. Every bit will be initialized as clear (all zero). 111 | 112 | The `set` function sets the bits at the given indices. The `test` 113 | function returns true if the bit at the given index is set. 114 | 115 | ```clojure 116 | test> (def my-bits (-> (bits/create 256) 117 | (bits/set 2 48 58 184 233))) 118 | test> (bits/test my-bits 47) 119 | false 120 | test> (bits/test my-bits 48) 121 | true 122 | ``` 123 | 124 | The `set-seq` function returns the indices of every set 125 | bit. Alternatively, `clear-seq` returns all the clear bits.
126 | 127 | ```clojure 128 | test> (bits/set-seq my-bits) 129 | (2 48 58 184 233) 130 | ``` 131 | 132 | The `clear` function complements `set` by clearing the bits for the 133 | given indices. Similarly, the `flip` function reverses a bit's state. 134 | 135 | ```clojure 136 | test> (bits/set-seq (bits/clear my-bits 48)) 137 | (2 58 184 233) 138 | test> (bits/set-seq (bits/flip my-bits 48)) 139 | (2 58 184 233) 140 | ``` 141 | 142 | Moreover, the namespace offers functions to `and` and `or` two 143 | bitsets. You can also measure `hamming-distance`, 144 | `jaccard-similarity`, or `cosine-similarity`. 145 | 146 | ## Bloom Filter 147 | 148 | `bigml.sketchy.bloom` contains an implementation of a [Bloom 149 | filter](http://en.wikipedia.org/wiki/Bloom_filter), useful for testing 150 | set membership. When checking set membership for an item, false 151 | positives are possible but false negatives are not. 152 | 153 | You may `create` a Bloom filter by providing the expected number of 154 | items to be inserted into the filter and the acceptable 155 | false positive rate. 156 | 157 | After creating the filter, you may either `insert` individual items or 158 | add an entire collection of items `into` the Bloom filter. 159 | 160 | ```clojure 161 | test> (def hamlet-bloom 162 | (reduce bloom/insert 163 | (bloom/create (count hamlet-tokens) 0.01) 164 | hamlet-tokens)) 165 | 166 | test> (def midsummer-bloom 167 | (bloom/into (bloom/create (count midsummer-tokens) 0.01) 168 | midsummer-tokens)) 169 | ``` 170 | 171 | Item membership is tested with `contains?`. 172 | 173 | ```clojure 174 | test> (bloom/contains? hamlet-bloom "puck") 175 | false 176 | test> (bloom/contains? midsummer-bloom "puck") 177 | true 178 | ``` 179 | 180 | The Bloom filters are also merge friendly as long as they are 181 | initialized with the same parameters. 
182 | 183 | ```clojure 184 | test> (def summerham-bloom 185 | (let [total (+ (count hamlet-tokens) (count midsummer-tokens))] 186 | (bloom/merge (bloom/into (bloom/create total 0.01) midsummer-tokens) 187 | (bloom/into (bloom/create total 0.01) hamlet-tokens)))) 188 | test> (bloom/contains? summerham-bloom "puck") 189 | true 190 | test> (bloom/contains? summerham-bloom "yorick") 191 | true 192 | test> (bloom/contains? summerham-bloom "henry") 193 | false 194 | ``` 195 | 196 | ## Min-Hash 197 | 198 | `bigml.sketchy.min-hash` contains an implementation of the 199 | [MinHash](http://en.wikipedia.org/wiki/MinHash) algorithm, useful for 200 | estimating the [Jaccard 201 | similarity](http://en.wikipedia.org/wiki/Jaccard_index) of two sets. 202 | 203 | This implementation includes the improvements recommended in 204 | "[Improved Densification of One Permutation Hashing](http://arxiv.org/abs/1406.4784)", 205 | which greatly reduce the algorithmic complexity of building a MinHash. 206 | 207 | To `create` a MinHash, you may provide a target error rate for 208 | similarity (default is 0.05). After that, you can either `insert` 209 | individual values or add collections `into` the MinHash. 210 | 211 | In the following example we break *A Midsummer Night's Dream* into two 212 | halves (`midsummer-part1` and `midsummer-part2`) and build a MinHash 213 | for each. We then compare the two halves with each other, and the first 214 | half with a MinHash of *Hamlet*, to see which pair is more similar. 215 | 216 | As we'd expect, the two halves of *A Midsummer Night's Dream* are more 217 | alike than the first half and *Hamlet*.
218 | 219 | ```clojure 220 | test> (def hamlet-hash (min-hash/into (min-hash/create) hamlet-tokens)) 221 | test> (def midsummer1-hash (min-hash/into (min-hash/create) midsummer-part1)) 222 | test> (def midsummer2-hash (min-hash/into (min-hash/create) midsummer-part2)) 223 | test> (min-hash/jaccard-similarity midsummer1-hash midsummer2-hash) 224 | 0.2852 225 | test> (min-hash/jaccard-similarity midsummer1-hash hamlet-hash) 226 | 0.2012 227 | ``` 228 | 229 | The MinHashes are merge friendly as long as they're initialized with 230 | the same target error rate. 231 | 232 | ```clojure 233 | test> (def midsummer-hash (min-hash/into (min-hash/create) midsummer-tokens)) 234 | test> (min-hash/jaccard-similarity midsummer-hash 235 | (min-hash/merge midsummer1-hash 236 | midsummer2-hash)) 237 | 1.0 238 | ``` 239 | 240 | ## Hyper-LogLog 241 | 242 | `bigml.sketchy.hyper-loglog` contains an implementation of the 243 | [HyperLogLog](http://research.google.com/pubs/pub40671.html) sketch, 244 | useful for estimating the number of distinct items in a set. This is a 245 | technique popular for tracking unique visitors over time. 246 | 247 | To `create` a HyperLogLog sketch, you may provide a target error rate 248 | for distinct item estimation (default is 0.05). After that, you can 249 | either `insert` individual values or add collections `into` the 250 | sketch. 251 | 252 | ```clojure 253 | test> (def hamlet-hll (hll/into (hll/create 0.01) hamlet-tokens)) 254 | test> (def midsummer-hll (hll/into (hll/create 0.01) midsummer-tokens)) 255 | test> (count (distinct hamlet-tokens)) ;; actual 256 | 4793 257 | test> (hll/distinct-count hamlet-hll) ;; estimated 258 | 4868 259 | test> (count (distinct midsummer-tokens)) ;; actual 260 | 3034 261 | test> (hll/distinct-count midsummer-hll) ;; estimated 262 | 3018 263 | ``` 264 | 265 | HyperLogLog sketches may be merged if they're initialized with the 266 | same error rate. 
267 | 268 | ```clojure 269 | test> (count (distinct (concat hamlet-tokens midsummer-tokens))) ;; actual 270 | 6275 271 | test> (hll/distinct-count (hll/merge hamlet-hll midsummer-hll)) ;; estimated 272 | 6312 273 | ``` 274 | 275 | Similar to MinHash, HyperLogLog sketches can also provide an estimate 276 | of the [Jaccard 277 | similarity](http://en.wikipedia.org/wiki/Jaccard_index) between two 278 | sets. 279 | 280 | ```clojure 281 | test> (def midsummer1-hll (hll/into (hll/create 0.01) midsummer-part1)) 282 | test> (def midsummer2-hll (hll/into (hll/create 0.01) midsummer-part2)) 283 | test> (hll/jaccard-similarity midsummer1-hll midsummer2-hll) 284 | 0.2833001988071571 285 | test> (hll/jaccard-similarity midsummer1-hll hamlet-hll) 286 | 0.201231310466139 287 | ``` 288 | 289 | ## Count-Min 290 | 291 | `bigml.sketchy.count-min` provides an implementation of the [Count-Min 292 | sketch](http://en.wikipedia.org/wiki/Count-Min_sketch), useful for 293 | estimating frequencies of arbitrary items in a stream. 294 | 295 | To `create` a count-min sketch you may define the desired number of 296 | hash-bits and the number of independent hash functions. The total 297 | number of counters maintained by the sketch will be 298 | (2^hash-bits)*hashers, so choose these values carefully. 299 | 300 | After creating a sketch, you may either `insert` individual values or 301 | add collections `into` the sketch. 302 | 303 | In the example below we build a Count-Min sketch that uses roughly 1500 304 | counters to estimate frequencies for the roughly 4800 unique tokens in 305 | *Hamlet*.
306 | 307 | ```clojure 308 | test> (def hamlet-cm (count-min/into (count-min/create :hash-bits 9) 309 | hamlet-tokens)) 310 | test> (count (:counters hamlet-cm)) 311 | 1536 312 | test> ((frequencies hamlet-tokens) "hamlet") 313 | 77 314 | test> (count-min/estimate-count hamlet-cm "hamlet") 315 | 87 316 | test> ((frequencies hamlet-tokens) "rosencrantz") 317 | 7 318 | test> (count-min/estimate-count hamlet-cm "rosencrantz") 319 | 15 320 | ``` 321 | 322 | As with the other sketching algorithms, Count-Min sketches may be 323 | merged if they're initialized with the same parameters. 324 | 325 | ```clojure 326 | test> (def midsummer1-cm (count-min/into (count-min/create :hash-bits 9) 327 | midsummer-part1)) 328 | test> (def midsummer2-cm (count-min/into (count-min/create :hash-bits 9) 329 | midsummer-part2)) 330 | test> ((frequencies midsummer-tokens) "love") ;; actual count 331 | 98 332 | test> (count-min/estimate-count (count-min/merge midsummer1-cm midsummer2-cm) 333 | "love") 334 | 104 335 | ``` 336 | 337 | ## Contributing to this project 338 | 339 | See doc/contributing.md 340 | 341 | ## License 342 | 343 | Copyright (C) 2013 BigML Inc. 344 | 345 | Distributed under the Apache License, Version 2.0. 346 | --------------------------------------------------------------------------------