├── hadoop ├── checkouts │ └── tesser.math ├── demo │ ├── doc │ │ └── intro.md │ ├── .gitignore │ ├── project.clj │ ├── src │ │ └── tesser │ │ │ └── hadoop │ │ │ └── demo │ │ │ └── core.clj │ ├── README.md │ └── LICENSE ├── .gitignore ├── src │ └── tesser │ │ ├── hadoop_support │ │ └── FressianWritable.java │ │ ├── hadoop │ │ └── serialization.clj │ │ └── hadoop.clj ├── test │ └── tesser │ │ └── hadoop │ │ └── serialization_test.clj ├── project.clj └── LICENSE ├── math ├── checkouts │ └── tesser.core ├── doc │ └── intro.md ├── .gitignore ├── project.clj ├── src │ └── tesser │ │ ├── cardinality.clj │ │ ├── quantiles.clj │ │ └── math.clj ├── test │ └── tesser │ │ ├── math_test.clj │ │ └── quantiles_test.clj └── LICENSE ├── img ├── logo.jpg ├── reduce-combine.jpg ├── combiner-identity-post.jpg └── reducer-identity-post.jpg ├── all ├── test │ └── all │ │ └── core_test.clj ├── src │ └── all │ │ └── core.clj ├── .gitignore ├── project.clj └── LICENSE ├── .gitignore ├── deploy ├── core ├── project.clj ├── src │ └── tesser │ │ ├── simple.clj │ │ └── utils.clj ├── test │ └── tesser │ │ ├── simple_test.clj │ │ ├── utils_test.clj │ │ ├── bench_test.clj │ │ └── core_test.clj └── LICENSE └── README.markdown /hadoop/checkouts/tesser.math: -------------------------------------------------------------------------------- 1 | ../../math -------------------------------------------------------------------------------- /math/checkouts/tesser.core: -------------------------------------------------------------------------------- 1 | ../../core/ -------------------------------------------------------------------------------- /img/logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphyr/tesser/HEAD/img/logo.jpg -------------------------------------------------------------------------------- /img/reduce-combine.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aphyr/tesser/HEAD/img/reduce-combine.jpg -------------------------------------------------------------------------------- /img/combiner-identity-post.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphyr/tesser/HEAD/img/combiner-identity-post.jpg -------------------------------------------------------------------------------- /img/reducer-identity-post.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphyr/tesser/HEAD/img/reducer-identity-post.jpg -------------------------------------------------------------------------------- /all/test/all/core_test.clj: -------------------------------------------------------------------------------- 1 | (ns all.core-test 2 | (:require [clojure.test :refer :all] 3 | [all.core :refer :all])) 4 | -------------------------------------------------------------------------------- /all/src/all/core.clj: -------------------------------------------------------------------------------- 1 | (ns all.core) 2 | 3 | (defn foo 4 | "I don't do a whole lot." 
5 | [x] 6 | (println x "Hello, World!")) 7 | -------------------------------------------------------------------------------- /hadoop/demo/doc/intro.md: -------------------------------------------------------------------------------- 1 | # Introduction to demo 2 | 3 | TODO: write [great documentation](http://jacobian.org/writing/what-to-write/) 4 | -------------------------------------------------------------------------------- /math/doc/intro.md: -------------------------------------------------------------------------------- 1 | # Introduction to tesser.math 2 | 3 | TODO: write [great documentation](http://jacobian.org/writing/what-to-write/) 4 | -------------------------------------------------------------------------------- /all/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /classes 3 | /checkouts 4 | pom.xml 5 | pom.xml.asc 6 | *.jar 7 | *.class 8 | /.lein-* 9 | /.nrepl-port 10 | -------------------------------------------------------------------------------- /hadoop/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /classes 3 | /checkouts 4 | pom.xml 5 | pom.xml.asc 6 | *.jar 7 | *.class 8 | /.lein-* 9 | /.nrepl-port 10 | -------------------------------------------------------------------------------- /math/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /classes 3 | /checkouts 4 | pom.xml 5 | pom.xml.asc 6 | *.jar 7 | *.class 8 | /.lein-* 9 | /.nrepl-port 10 | -------------------------------------------------------------------------------- /hadoop/demo/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /classes 3 | /checkouts 4 | pom.xml 5 | pom.xml.asc 6 | *.jar 7 | *.class 8 | /.lein-* 9 | /.nrepl-port 10 | -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | */target 2 | */classes 3 | */checkouts 4 | */doc 5 | pom.xml 6 | pom.xml.asc 7 | *.jar 8 | *.class 9 | *.hprof 10 | .*.swp 11 | .lein-* 12 | .nrepl-port 13 | -------------------------------------------------------------------------------- /deploy: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | dir=`pwd` 3 | 4 | cd $dir/core && lein do clean, test, install, deploy clojars && \ 5 | cd $dir/math && lein do clean, test, install, deploy clojars && \ 6 | #cd $dir/hadoop && lein do clean, test, install, deploy clojars && \ 7 | cd $dir/all && lein do clean, codox && \ 8 | cd $dir/all/doc && git commit -am "docs" && git push && \ 9 | cd $dir 10 | -------------------------------------------------------------------------------- /math/project.clj: -------------------------------------------------------------------------------- 1 | (defproject tesser.math "1.0.9-SNAPSHOT" 2 | :description "Concurrent folds for statistical analysis" 3 | :url "http://github.com/aphyr/tesser" 4 | :license {:name "Eclipse Public License" 5 | :url "http://www.eclipse.org/legal/epl-v10.html"} 6 | 7 | :dependencies [[tesser.core "1.0.8"] 8 | ; [com.tdunning/t-digest "3.0"] 9 | [com.clearspring.analytics/stream "2.9.8"] 10 | ; 2.1.10 changed quantiles; see https://github.com/HdrHistogram/HdrHistogram/issues/194. 
11 | [org.hdrhistogram/HdrHistogram "2.1.9"] 12 | [org.clojure/math.combinatorics "0.3.0"] 13 | [org.clojure/math.numeric-tower "0.1.0"]] 14 | :profiles {:dev {:dependencies [[org.clojure/clojure "1.12.3"] 15 | [metametadata/multiset "0.1.1"] 16 | [criterium "0.4.6"] 17 | [org.clojure/test.check "1.1.1"]]}}) 18 | -------------------------------------------------------------------------------- /core/project.clj: -------------------------------------------------------------------------------- 1 | (defproject tesser.core "1.0.9-SNAPSHOT" 2 | :description "Composable concurrent folds for Clojure." 3 | :url "http://github.com/aphyr/tesser" 4 | :license {:name "Eclipse Public License" 5 | :url "http://www.eclipse.org/legal/epl-v10.html"} 6 | :dependencies [[interval-metrics "1.0.1"]] 7 | :jvm-opts ["-server" 8 | "-Xms1024m" 9 | "-Xmx1024m" 10 | ; "-Dcom.sun.management.jmxremote" 11 | ; "-XX:+UnlockCommercialFeatures" 12 | ; "-XX:+FlightRecorder" 13 | ] 14 | :profiles {:dev {:dependencies [[org.clojure/clojure "1.12.3"] 15 | [metametadata/multiset "0.1.1"] 16 | [criterium "0.4.6"] 17 | [org.clojure/test.check "1.1.1"]] 18 | ; :jvm-opts ["-XX:-OmitStackTraceInFastThrow"] 19 | }} 20 | :test-selectors {:default #(not-any? % [:stress :bench]) 21 | :focus :focus 22 | :bench :bench 23 | :stress :stress}) 24 | -------------------------------------------------------------------------------- /hadoop/src/tesser/hadoop_support/FressianWritable.java: -------------------------------------------------------------------------------- 1 | package tesser.hadoop_support; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import org.apache.hadoop.io.Writable; 6 | import clojure.lang.IFn; 7 | 8 | // Mutable container for reading and writing Fressian data structures using 9 | // Hadoop. Has a pair of static IFns for reading and writing data, which will 10 | // be filled in by Fressian code later. 
readFieldsFn is invoked with this and 11 | // the input, and is expected to clobber this's state. writeFn is invoked with 12 | // this and an output, and is expected to serialize this's state to that 13 | // output. 14 | // 15 | // Ugly, ugly hack. 16 | // 17 | // Totally not thread safe, but what's life without a little danger? 18 | public class FressianWritable implements Writable { 19 | public static IFn readFieldsFn; 20 | public static IFn writeFn; 21 | 22 | public Object state; 23 | 24 | public FressianWritable() { 25 | } 26 | 27 | public void readFields(DataInput in) { 28 | readFieldsFn.invoke(this, in); 29 | } 30 | 31 | public void write(DataOutput out) { 32 | writeFn.invoke(this, out); 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /core/src/tesser/simple.clj: -------------------------------------------------------------------------------- 1 | (ns tesser.simple 2 | "Drop-in replacement for `reducers/fold` and `core/reduce`, where order 3 | doesn't matter." 4 | (:refer-clojure :exclude [reduce]) 5 | (:require [tesser.core :as t] 6 | [tesser.utils :refer :all])) 7 | 8 | (defn fold 9 | "Folds over a collection using a parallel reduce-combine strategy. Collection 10 | is partitioned into chunks of approximately `n` (default 512), and folded 11 | over with `t/tesser`. Unlike reducers `fold`, this function does not preserve 12 | order." 13 | ([reducef coll] 14 | (fold reducef reducef coll)) 15 | ([combinef reducef coll] 16 | (fold 512 combinef reducef coll)) 17 | ([n combinef reducef coll] 18 | (t/tesser (t/chunk n coll) 19 | (t/fold {:reducer reducef 20 | :combiner combinef})))) 21 | 22 | (defn reduce 23 | "Like `clojure.core/reduce`, but parallel, using `t/tesser` over 512-element 24 | chunks. Unlike `core/reduce`, does not preserve order, init must be an 25 | identity element, f must be associative, etc." 
26 | ([f init coll] 27 | (t/tesser (t/chunk 512 coll) 28 | (t/fold {:reducer f 29 | :identity (constantly init)})))) 30 | -------------------------------------------------------------------------------- /core/test/tesser/simple_test.clj: -------------------------------------------------------------------------------- 1 | (ns tesser.simple-test 2 | (:require [clojure.test :refer :all] 3 | [clojure.test.check :as tc] 4 | [clojure.test.check [clojure-test :refer :all] 5 | [generators :as gen] 6 | [properties :as prop]] 7 | [multiset.core :refer [multiset]] 8 | [tesser.simple :as s] 9 | [clojure.core.reducers :as r] 10 | [clojure.set :as set])) 11 | 12 | (def test-opts {:num-tests 1000 13 | :par 256}) 14 | 15 | (def flat-ints (gen/one-of [(gen/list gen/int) 16 | (gen/vector gen/int) 17 | (gen/fmap long-array (gen/vector gen/int))])) 18 | 19 | (defspec fold-spec 20 | test-opts 21 | (let [reducer (r/monoid conj hash-set)] 22 | (prop/for-all [xs flat-ints] 23 | (and (is (= (r/fold + xs) 24 | (s/fold + xs))) 25 | (is (= (r/fold set/union reducer xs) 26 | (s/fold set/union reducer xs))))))) 27 | 28 | (defspec reduce-spec 29 | test-opts 30 | (prop/for-all [xs flat-ints] 31 | (and (is (= (reduce + 0 xs) 32 | (s/reduce + 0 xs)))))) 33 | -------------------------------------------------------------------------------- /hadoop/test/tesser/hadoop/serialization_test.clj: -------------------------------------------------------------------------------- 1 | (ns tesser.hadoop.serialization-test 2 | (:require [clojure.test :refer :all] 3 | [clojure.pprint :refer [pprint]] 4 | [clojure.test.check :as tc] 5 | [clojure.test.check [clojure-test :refer :all] 6 | [generators :as gen] 7 | [properties :as prop]] 8 | [tesser [core :as t] 9 | [math :as m] 10 | [quantiles :as q]] 11 | [tesser.hadoop.serialization :as s]) 12 | (:import (org.HdrHistogram DoubleHistogram) 13 | (java.nio ByteBuffer) 14 | (java.util.zip Deflater))) 15 | 16 | (def test-opts {:num-tests 1000 17 | :par 48}) 18 | 19 | 
(defspec digest-serialization-spec 20 | test-opts 21 | (prop/for-all [xs (gen/vector gen/pos-int)] 22 | (let [digest (q/dual q/hdr-histogram)] 23 | ; Fill digest 24 | (doseq [x xs] 25 | (q/add-point! digest x)) 26 | 27 | ; Serialize and deserialize 28 | (let [digest' (-> digest 29 | s/write-byte-array 30 | s/read-byte-array)] 31 | (is (= digest digest')))))) 32 | -------------------------------------------------------------------------------- /hadoop/project.clj: -------------------------------------------------------------------------------- 1 | (defproject tesser.hadoop "1.0.3" 2 | :description "Tesser: Hadoop support via Parkour." 3 | :url "http://github.com/aphyr/tesser" 4 | :license {:name "Eclipse Public License" 5 | :url "http://www.eclipse.org/legal/epl-v10.html"} 6 | :java-source-paths ["src/"] 7 | :javac-options ["-target" "1.5" 8 | "-source" "1.5"] 9 | :repositories 10 | {"cloudera" "https://repository.cloudera.com/artifactory/cloudera-repos/"} 11 | :dependencies [[tesser.core "1.0.3"] 12 | [tesser.math "1.0.3"] 13 | [org.clojure/data.fressian "0.2.0"] 14 | [com.damballa/parkour "0.5.4"] 15 | [org.codehaus.jsr166-mirror/jsr166y "1.7.0"] 16 | [proteus "0.1.4"]] 17 | :profiles {:dev 18 | {:dependencies [[org.clojure/clojure "1.9.0"] 19 | [org.clojars.achim/multiset "0.1.0-SNAPSHOT"] 20 | [criterium "0.4.3"] 21 | [org.clojure/test.check "0.7.0"]]} 22 | :provided 23 | {:dependencies 24 | ; Just so we can compile our Writable 25 | [[org.apache.hadoop/hadoop-client "2.0.0-mr1-cdh4.3.0" 26 | :exclusions [org.slf4j/slf4j-api]] 27 | ; for compiling without tesser.math 28 | [com.clearspring.analytics/stream "2.7.0"]]}}) 29 | -------------------------------------------------------------------------------- /all/project.clj: -------------------------------------------------------------------------------- 1 | (defproject tesser.all "1.0.6" 2 | :description "Not a real library; just a placeholder for unifying all the docs." 
3 | :url "http://github.com/aphyr/tesser" 4 | :license {:name "Eclipse Public License" 5 | :url "http://www.eclipse.org/legal/epl-v10.html"} 6 | :repositories {"cloudera" "https://repository.cloudera.com/artifactory/cloudera-repos/"} 7 | :dependencies [[tesser.core "1.0.6"] 8 | [tesser.math "1.0.6"] 9 | ; [tesser.hadoop "1.0.3" 10 | ; :exclusions [org.codehaus.jackson/jackson-mapper-asl 11 | ; org.codehaus.jackson/jackson-core-asl]] 12 | ] 13 | :codox {:source-paths ["../core/src" 14 | "../math/src" 15 | ; "../hadoop/src" 16 | ] 17 | :output-path "doc/" 18 | :source-uri "http://github.com/aphyr/tesser/blob/{version}/{filepath}#L{line}" 19 | :metadata {:doc/format :markdown}} 20 | :plugins [[lein-codox "0.10.8"]] 21 | :profiles {:dev {:dependencies [[org.clojure/clojure "1.11.1"] 22 | ]} 23 | ; Parkour won't compile without hadoop 24 | ; :provided 25 | ; {:dependencies 26 | ; [[org.apache.hadoop/hadoop-client "2.0.0-mr1-cdh4.3.0" 27 | ; :exclusions [; Breaks codox via incompatible asm 28 | ; ; FUCK EVERYTHING GRAAAAR 29 | ; asm]]]}}) 30 | }) 31 | -------------------------------------------------------------------------------- /core/test/tesser/utils_test.clj: -------------------------------------------------------------------------------- 1 | (ns tesser.utils-test 2 | (:require [clojure.test :refer :all] 3 | [clojure.test.check :as tc] 4 | [clojure.test.check [clojure-test :refer :all] 5 | [generators :as gen] 6 | [properties :as prop]] 7 | [multiset.core :refer [multiset]] 8 | [tesser.utils :refer :all])) 9 | 10 | (def test-opts {:num-tests 1000 11 | :par 256}) 12 | 13 | (defspec differences-spec 14 | test-opts 15 | (prop/for-all [coll (gen/such-that not-empty (gen/vector gen/int))] 16 | (is (= coll 17 | (->> coll 18 | differences 19 | (cumulative-sums (first coll))))))) 20 | 21 | (defspec partition-fast-spec 22 | {:num-tests 1000} 23 | (prop/for-all [coll (gen/one-of [(gen/list gen/int) 24 | (gen/vector gen/int) 25 | (gen/map gen/int gen/int) 26 | (gen/fmap 
long-array (gen/vector gen/int))]) 27 | n (gen/choose 1 50)] 28 | (or (is (= (->> coll 29 | (reducible-chunk n) 30 | (map (partial into []))) 31 | (partition-all n coll))) 32 | (prn :n n :coll (seq coll))))) 33 | 34 | (deftest bytes?-test 35 | (is (bytes? (byte-array [1 2 3]))) 36 | (is (not (bytes? nil))) 37 | (is (not (bytes? (byte 4))))) 38 | -------------------------------------------------------------------------------- /hadoop/demo/project.clj: -------------------------------------------------------------------------------- 1 | (defproject tesser-hadoop-demo "0.1.0-SNAPSHOT" 2 | :description "An example Hadoop job with Tesser" 3 | :url "https://github.com/aphyr/tesser" 4 | :license {:name "Eclipse Public License" 5 | :url "http://www.eclipse.org/legal/epl-v10.html"} 6 | ; You may need to add a repository for your hadoop client if it isn't in 7 | ; maven-central 8 | :repositories {"cloudera" "https://repository.cloudera.com/artifactory/cloudera-repos/"} 9 | ; You'll want to depend on clojure and tesser.hadoop, plus any libraries 10 | ; you'd like to use. 
11 | :dependencies [[org.clojure/clojure "1.6.0"] 12 | [tesser.hadoop "0.1.0-SNAPSHOT" 13 | ; Hadoop loves to conflict with everything, so you may 14 | ; have to exclude some things pulled in via tesser.hadoop's 15 | ; parkour deps 16 | :exclusions [org.codehaus.jackson/jackson-core-asl 17 | org.codehaus.jackson/jackson-mapper-asl]]] 18 | :jvm-opts ["-server"] 19 | ; This is the namespace where Hadoop will look for a -main defn when called 20 | ; with hadoop -jar 21 | :main tesser.hadoop.demo.core 22 | :profiles 23 | ; We AOT-compile the main namespace when providing an uberjar to Hadoop 24 | {:uberjar {:aot [tesser.hadoop.demo.core]} 25 | :provided {:dependencies 26 | ; Replace this with the appropriate hadoop client for 27 | ; your hadoop cluster 28 | [[org.apache.hadoop/hadoop-client "2.0.0-mr1-cdh4.3.0"]]}}) 29 | -------------------------------------------------------------------------------- /math/src/tesser/cardinality.clj: -------------------------------------------------------------------------------- 1 | (ns tesser.cardinality 2 | "Cardinality estimate digest using HyperLogLog+. 3 | For examples, check tesser.math/digest." 4 | (:import [com.clearspring.analytics.stream.cardinality 5 | HyperLogLogPlus HyperLogLogPlus$Builder]) 6 | (:require [tesser.quantiles :as q])) 7 | 8 | (defn hll 9 | "Construct a new HLL cardinality estimator based on the improved HyperLogLog+ 10 | algorithm which features sparse sets and bias correction. 11 | Optionally accepts an options map with keys: 12 | 13 | :p - Precision for the Normal set representation. 14 | :sp - Precision for the Sparse set representation. 15 | 16 | For the sparse representation: 17 | :p must be a value between 4 and :sp 18 | :sp must be less than 32. 19 | 20 | For old behaviour: 21 | :sp must be set to 0. 22 | 23 | Default: 24 | :p 16, :sp 0" 25 | (^HyperLogLogPlus [] (hll {:p 16 :sp 0})) 26 | (^HyperLogLogPlus 27 | [{:keys [p sp]}] 28 | {:pre [(or (zero? 
sp) ;old behaviour 29 | (and (<= 4 p sp) 30 | (< sp 32)))]} 31 | (HyperLogLogPlus. ^int p ^int sp))) 32 | 33 | 34 | (extend-type HyperLogLogPlus 35 | q/Digest 36 | (add-point! [hll x] (doto hll (.offer x))) 37 | (merge-digest! [hll ^HyperLogLogPlus other] (doto hll (.addAll other))) 38 | (point-count [hll] (.cardinality hll))) 39 | 40 | 41 | (defn to-byte-array 42 | "Convert the HLL estimator to a ByteArray" 43 | ^bytes [^HyperLogLogPlus hll] 44 | (.getBytes hll)) 45 | 46 | 47 | (defn from-byte-array 48 | "Convert a ByteArray into an HLL estimator instance." 49 | ^HyperLogLogPlus [^bytes bs] 50 | (HyperLogLogPlus$Builder/build bs)) 51 | -------------------------------------------------------------------------------- /core/test/tesser/bench_test.clj: -------------------------------------------------------------------------------- 1 | (ns tesser.bench-test 2 | (:require [clojure.test :refer :all] 3 | [clojure.test.check :as tc] 4 | [clojure.test.check [clojure-test :refer :all] 5 | [generators :as gen] 6 | [properties :as prop]] 7 | [multiset.core :refer [multiset]] 8 | [criterium.core :refer [with-progress-reporting quick-bench bench]] 9 | [tesser.core :as t] 10 | [tesser.simple :as s] 11 | [tesser.utils :refer :all] 12 | [clojure.core.reducers :as r] 13 | [clojure.set :as set])) 14 | 15 | (def n 1000000) 16 | 17 | (defn long-ary [] 18 | (->> #(rand-int n) 19 | repeatedly 20 | (take n) 21 | long-array)) 22 | 23 | (defn long-vec [] 24 | (->> #(rand-int n) 25 | repeatedly 26 | (take n) 27 | vec)) 28 | 29 | (defn sep 30 | ([& args] 31 | (prn) 32 | (prn) 33 | (apply println args) 34 | (prn))) 35 | 36 | (deftest ^:bench sum 37 | (sep "###### Simple sum ########") 38 | (dorun 39 | (for [collf [#'long-ary #'long-vec] 40 | reducef [#'reduce #'r/fold #'s/fold]] 41 | (let [coll (@collf) 42 | reducef @reducef] 43 | (sep collf reducef) 44 | (quick-bench (reducef + coll)))))) 45 | 46 | (deftest ^:bench map-filter-sum 47 | (sep "####### map/filter/sum #######") 48 | (dorun 49 | 
(for [collf [#'long-ary #'long-vec]] 50 | (let [coll (@collf)] 51 | (sep collf "seq/reduce") 52 | (quick-bench (->> coll 53 | (map inc) 54 | (filter even?) 55 | (reduce +))) 56 | 57 | (sep collf "reducers fold") 58 | (quick-bench (->> coll 59 | (r/map inc) 60 | (r/filter even?) 61 | (r/fold +))) 62 | 63 | (sep collf "tesser") 64 | (quick-bench (->> (t/map inc) 65 | (t/filter even?) 66 | (t/fold +) 67 | (t/tesser (t/chunk 16384 coll)))))))) 68 | 69 | (deftest ^:bench fuse 70 | (sep "##### Fuse #####") 71 | (let [coll (long-ary)] 72 | (time (->> (t/fuse {:sum (t/fold +) 73 | :evens (->> (t/filter even?) (t/count)) 74 | :odds (->> (t/filter odd?) (t/count))}) 75 | (t/tesser (t/chunk 16384 coll)))))) 76 | 77 | ; For profiling 78 | (deftest ^:stress stress 79 | (let [a (long-vec)] 80 | (dotimes [i 10000000] 81 | (->> (t/map inc) 82 | (t/filter even?) 83 | (t/fold +) 84 | (t/tesser (t/chunk 1024 a)))))) 85 | -------------------------------------------------------------------------------- /hadoop/demo/src/tesser/hadoop/demo/core.clj: -------------------------------------------------------------------------------- 1 | (ns tesser.hadoop.demo.core 2 | (:gen-class) 3 | (:require [tesser [core :as t] 4 | [hadoop :as h]] 5 | [parkour [tool :as tool]] 6 | [parkour.io [text :as text]] 7 | [clojure.pprint :refer [pprint]])) 8 | 9 | ;; First, define the fold that we want to run. Note that all this work takes 10 | ;; place in the Hadoop cluster: mappers map and compute local frequency 11 | ;; distributions, and the reducer merges those frequency distributions and 12 | ;; sorts/reverses in the post-combine phase. 13 | (defn char-freq 14 | "Returns a fold that computes a frequency distribution over characters in 15 | input strings. Takes a single argument for case normalization: :upper, 16 | :lower, or nil to leave strings untransformed." 
17 | [normalizer] 18 | (->> ; Normalize case 19 | (t/map (case normalizer 20 | :upper #(.toUpperCase ^String %) 21 | :lower #(.toLowerCase ^String %) 22 | nil identity)) 23 | ; Break up strings into characters 24 | (t/mapcat seq) 25 | ; And take a frequency distribution 26 | (t/frequencies) 27 | ; Sort the final results by their frequency 28 | (t/post-combine (partial sort-by second)) 29 | ; In descending order 30 | (t/post-combine reverse))) 31 | 32 | ;; On the local JVM, we'll set up a Hadoop job and tell it to evaluate 33 | ;; (char-freq :lower) on our inputs, writing results to the work directory. 34 | (defn run 35 | "Takes a parkour jobconf and runs char-freq over the given input files, 36 | writing temporary results to work-dir. When the Hadoop job is complete, 37 | returns the results of the fold to the local JVM." 38 | [conf [work-dir input-file]] 39 | (h/fold conf 40 | (text/dseq input-file) 41 | work-dir 42 | ; Note that we pass the fold-creating function as a var, and can 43 | ; provide arguments to the function, which will be automatically 44 | ; serialized and sent to each Hadoop worker to create local instances 45 | ; of the fold as needed. Namespaced vars work fine too. 46 | #'char-freq :lower)) 47 | 48 | ;; Hadoop machinery 49 | 50 | (defn tool 51 | "Top-level hadoop runner. Returns an exit code." 52 | [conf & args] 53 | (try 54 | (pprint (run conf args)) 55 | 0 ; Return successfully 56 | 57 | ; The default Parkour exception handling is pretty minimal; we'll print out 58 | ; our own here. 59 | (catch clojure.lang.ExceptionInfo e 60 | (let [data (ex-data e)] 61 | (if (h/error? 
data) 62 | (h/print-error data) 63 | (throw e))) 64 | 65 | ; Exit status 66 | 1))) 67 | 68 | (defn -main [& args] 69 | (System/exit (tool/run tool args))) 70 | -------------------------------------------------------------------------------- /hadoop/demo/README.md: -------------------------------------------------------------------------------- 1 | # tesser.hadoop.demo 2 | 3 | A simple project that runs a character-counting fold. Take a look at 4 | project.clj and src/tesser/hadoop/demo/core.clj for the code, since the 5 | packaging requirements for a Parkour project are a little tricky to get right. 6 | This project.clj is set up for a Hadoop 2.0 Cloudera 4 cluster; you'll probably 7 | want to choose the Hadoop client version appropriate for your environment. 8 | 9 | ## Usage 10 | 11 | Before working with Tesser and Hadoop, make sure that `hadoop jar whatever.jar` 12 | works correctly. You'll probably need to authenticate by running `kinit`. 13 | 14 | We need some text files in HDFS to read, and a temporary directory in HDFS to write results to. First, we'll make sure that directory is empty: 15 | 16 | ``` 17 | hadoop fs -rm -r /tmp/kingsbury/demo 18 | ``` 19 | 20 | Then, we'll package the demo project as a fat jar: 21 | 22 | ``` 23 | lein do clean, uberjar 24 | ``` 25 | 26 | Then we'll run that jar in Hadoop against our temp dir and input files: 27 | 28 | ``` 29 | hadoop jar \ 30 | target/tesser-hadoop-demo-0.1.0-SNAPSHOT-standalone.jar \ 31 | hdfs:/tmp/kingsbury/demo \ 32 | hdfs:/data/foo/0_1/part-* 33 | ``` 34 | 35 | Tesser automatically generates a short random job name and submits it to the 36 | cluster. Stderr looks like: 37 | 38 | ``` 39 | 15/01/08 15:40:48 INFO parkour.graph: Launching job audience-analysis-262[1/1] 40 | 15/01/08 15:40:48 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same. 
41 | 15/01/08 15:40:48 INFO hdfs.DFSClient: Created HDFS_DELEGATION_TOKEN token 193994 for kingsbury on ha-hdfs:dev 42 | 15/01/08 15:40:48 INFO security.TokenCache: Got dt for hdfs://dev; Kind: HDFS_DELEGATION_TOKEN, Service: ha-hdfs:dev, Ident: (HDFS_DELEGATION_TOKEN token 193994 for kingsbury) 43 | 15/01/08 15:40:48 INFO input.FileInputFormat: Total input paths to process : 722 44 | 15/01/08 15:40:48 INFO lzo.GPLNativeCodeLoader: Loaded native gpl library 45 | 15/01/08 15:40:48 INFO lzo.LzoCodec: Successfully loaded & initialized native-lzo library [hadoop-lzo rev d0f5a10f99f1b2af4f6610447052c5a67b8b1cc7] 46 | 15/01/08 15:46:16 INFO parkour.graph: Job audience-analysis-262[1/1] succeeded 47 | 15/01/08 15:46:16 INFO hdfs.DFSClient: Created HDFS_DELEGATION_TOKEN token 193997 for kingsbury on ha-hdfs:dev 48 | 15/01/08 15:46:16 INFO security.TokenCache: Got dt for hdfs://dev; Kind: HDFS_DELEGATION_TOKEN, Service: ha-hdfs:dev, Ident: (HDFS_DELEGATION_TOKEN token 193997 for kingsbury) 49 | 15/01/08 15:46:16 INFO input.FileInputFormat: Total input paths to process : 1 50 | ``` 51 | 52 | And stdout: a frequency distribution of characters in the input text. 53 | 54 | ``` 55 | ([\space 1946238846] 56 | [\0 1797333149] 57 | [\1 1325564417] 58 | [\4 1249022283] 59 | [\8 1123651791] 60 | [\2 1122908302] 61 | [\3 1120085668] 62 | [\e 1102461872] 63 | [\5 1102221964] 64 | [\7 1097686896] 65 | [\6 1091183208] 66 | [\9 1072278238] 67 | [\: 789290251] 68 | [\n 709239137] 69 | [\, 690404976] 70 | [\a 690187809] 71 | [\t 680228177] 72 | [\i 664018328] 73 | [\l 602011267] 74 | [\- 519792746] 75 | [\s 515397081] 76 | [\c 453300270] 77 | [\_ 430756037] 78 | ... 79 | ``` 80 | 81 | ## License 82 | 83 | Copyright © 2015 Kyle Kingsbury 84 | 85 | Distributed under the Eclipse Public License either version 1.0 or (at 86 | your option) any later version. 
87 | -------------------------------------------------------------------------------- /math/test/tesser/math_test.clj: -------------------------------------------------------------------------------- 1 | (ns tesser.math-test 2 | (:require [clojure.test :refer :all] 3 | [clojure.math.numeric-tower :refer [expt sqrt]] 4 | [clojure.test.check :as tc] 5 | [clojure.test.check [clojure-test :refer :all] 6 | [generators :as gen] 7 | [properties :as prop]] 8 | [multiset.core :refer [multiset]] 9 | [tesser.utils :refer :all] 10 | [tesser [core :as t] 11 | [math :as m] 12 | [quantiles :as q]])) 13 | 14 | (def test-opts {:num-tests 100 15 | :par 4}) 16 | 17 | (deftest map-sum-test 18 | (is (= (->> (t/map inc) 19 | (m/sum) 20 | (t/tesser [[1 2 3] [4 5 6] []])) 21 | 27))) 22 | 23 | ;; Utility functions 24 | (defn smaller 25 | "Make numbers smaller" 26 | [x] 27 | (double (/ x 100))) 28 | 29 | (defn approx= 30 | "Equal to within err fraction, or if one is zero, to within err absolute." 31 | ([err x y] 32 | (or (= x y) 33 | (if (or (zero? x) (zero? y)) 34 | (< (- err) (- x y) err) 35 | (< (- 1 err) (/ x y) (+ 1 err))))) 36 | ([err x y & more] 37 | (->> more 38 | (cons y) 39 | (every? (partial approx= err x))))) 40 | 41 | (def =ish 42 | "Almost equal" 43 | (partial approx= 1/1000)) 44 | 45 | (defn chunks 46 | "Given a generator for inputs, returns a generator that builds 47 | sequences of sequences of inputs." 48 | [input-gen] 49 | (gen/vector (gen/vector input-gen) 0 5)) 50 | 51 | ; Bug in simple-check: ints don't grow that fast 52 | (def bigger-ints (gen/sized (fn [size] (gen/resize (* size size) gen/int)))) 53 | 54 | (defn flatten1 55 | "Flattens a single level." 56 | [seq-of-seqs] 57 | (apply concat seq-of-seqs)) 58 | 59 | ;; Numeric folds 60 | 61 | (defspec sum-spec 62 | test-opts 63 | (prop/for-all [chunks (chunks gen/int)] 64 | (is (= (t/tesser chunks (m/sum)) 65 | (reduce + 0 (flatten1 chunks)))))) 66 | 67 | (defn mean 68 | [coll] 69 | (assert (not (empty? 
coll))) 70 | (/ (reduce + coll) (count coll))) 71 | 72 | (defspec mean-spec 73 | test-opts 74 | (prop/for-all [chunks (gen/such-that (partial some not-empty) 75 | (chunks gen/int))] 76 | (is (== (t/tesser chunks (m/mean)) 77 | (mean (flatten1 chunks)))))) 78 | 79 | (defn variance 80 | [coll] 81 | (/ (->> coll 82 | (map #(expt (- % (mean coll)) 2)) 83 | (reduce +)) 84 | (max (dec (count coll)) 1))) 85 | 86 | (defspec variance-spec 87 | test-opts 88 | (prop/for-all [chunks (gen/such-that (partial some not-empty) 89 | (chunks gen/int))] 90 | (=ish (t/tesser chunks (m/variance)) 91 | (variance (flatten1 chunks))))) 92 | 93 | (defspec standard-deviation-spec 94 | test-opts 95 | (prop/for-all [chunks (gen/such-that (partial some not-empty) 96 | (chunks gen/int))] 97 | (=ish (t/tesser chunks (m/standard-deviation)) 98 | (sqrt (variance (flatten1 chunks)))))) 99 | 100 | (defn covariance 101 | [fx fy coll] 102 | (let [coll (filter fx (filter fy coll))] 103 | (if (empty? coll) 104 | nil 105 | (let [mean-x (mean (map fx coll)) 106 | mean-y (mean (map fy coll))] 107 | (double (/ (reduce + (map #(* (- (fx %) mean-x) 108 | (- (fy %) mean-y)) 109 | coll)) 110 | (count coll))))))) 111 | 112 | (defspec covariance-spec 113 | test-opts 114 | ; Take maps like {}, {:x 1}, {:x 2 :y 3} and compute covariance 115 | (prop/for-all [chunks (chunks (gen/map (gen/elements [:x :y]) gen/int))] 116 | (is (= (->> (m/covariance :x :y) 117 | (t/tesser chunks)) 118 | (covariance :x :y (flatten1 chunks)))))) 119 | 120 | (defspec covariance-matrix-spec 121 | test-opts 122 | (prop/for-all [chunks (chunks (gen/map (gen/elements [:x :y :z]) gen/int))] 123 | (let [inputs (flatten1 chunks)] 124 | (is (= (->> (m/covariance-matrix {"x" :x "y" :y "z" :z}) 125 | (t/tesser chunks)) 126 | ; NOTE: depends on math.combinatorics order; fragile 127 | ; but easy to fix. 
128 | (let [xy (covariance :x :y inputs) 129 | xz (covariance :x :z inputs) 130 | yz (covariance :y :z inputs)] 131 | {["x" "y"] xy 132 | ["x" "z"] xz 133 | ["y" "x"] xy 134 | ["y" "z"] yz 135 | ["z" "x"] xz 136 | ["z" "y"] yz})))))) 137 | 138 | (defn correlation 139 | "Computes the correlation coefficient over a collection of points, given two 140 | functions of a point `(fx point)` and `(fy point)`. See 141 | http://mathworld.wolfram.com/CorrelationCoefficient.html" 142 | [fx fy coll] 143 | (let [coll (filter fx (filter fy coll))] 144 | (when-not (empty? coll) 145 | (let [xs (map fx coll) 146 | ys (map fy coll) 147 | mx (mean (map fx coll)) 148 | my (mean (map fy coll)) 149 | mxs (map #(- % mx) xs) 150 | mys (map #(- % my) ys)] 151 | (try 152 | (/ (reduce + (map * mxs mys)) 153 | (sqrt (* (reduce + (map * mxs mxs)) 154 | (reduce + (map * mys mys))))) 155 | (catch ArithmeticException e 156 | nil)))))) 157 | 158 | (defspec correlation-spec 159 | test-opts 160 | (prop/for-all [chunks (chunks (gen/map (gen/elements [:x :y]) gen/int))] 161 | (is (= (->> (m/correlation :x :y) 162 | (t/tesser chunks)) 163 | (correlation :x :y (flatten1 chunks)))))) 164 | 165 | (defspec correlation-matrix-spec 166 | test-opts 167 | (prop/for-all [chunks (chunks (gen/map (gen/elements [:x :y :z]) gen/int))] 168 | (let [inputs (flatten1 chunks)] 169 | (is (= (->> (m/correlation-matrix {"x" :x "y" :y "z" :z}) 170 | (t/tesser chunks)) 171 | (let [xy (correlation :x :y inputs) 172 | xz (correlation :x :z inputs) 173 | yz (correlation :y :z inputs)] 174 | {["x" "y"] xy 175 | ["x" "z"] xz 176 | ["y" "x"] xy 177 | ["y" "z"] yz 178 | ["z" "x"] xz 179 | ["z" "y"] yz})))))) 180 | 181 | (defspec digest-spec 182 | test-opts 183 | (prop/for-all [chunks (chunks (gen/fmap smaller gen/int))] 184 | (let [inputs (flatten1 chunks) 185 | digest (->> (m/digest #(q/dual q/hdr-histogram)) 186 | (t/tesser chunks))] 187 | (and (is (=ish (q/min digest) 188 | (when (seq inputs) (reduce min inputs)))) 189 | (is 
(=ish (q/max digest) 190 | (when (seq inputs) (reduce max inputs)))) 191 | (is (= (q/point-count digest) 192 | (count inputs))))))) 193 | -------------------------------------------------------------------------------- /math/src/tesser/quantiles.clj: -------------------------------------------------------------------------------- 1 | (ns tesser.quantiles 2 | "Supports various streaming quantile sketches" 3 | (:refer-clojure :exclude [min max]) 4 | (:import (org.HdrHistogram EncodableHistogram 5 | DoubleHistogram 6 | DoubleHistogramIterationValue) 7 | (java.nio ByteBuffer) 8 | (java.util.zip Deflater)) 9 | (:require [tesser.utils :refer :all] 10 | [clojure.core.reducers :as r] 11 | [clojure.math.numeric-tower :refer []] 12 | [clojure.math.combinatorics :as combo] 13 | [clojure.set :as set] 14 | [clojure.core :as core])) 15 | 16 | (defprotocol Digest 17 | (add-point! [digest x] 18 | "Add a value to the given digest, mutating the digest. Returns 19 | digest.") 20 | (merge-digest! [digest other] 21 | "Merges the second digest into the first, mutating the 22 | first. Returns the first digest.") 23 | (point-count [digest] 24 | "How many points went into this digest?")) 25 | 26 | (defprotocol Quantile 27 | (min [digest] "The minimum point in the digest. For empty digests, 28 | nil.") 29 | (max [digest] "The maximum point in the digest. For empty digests, 30 | nil.") 31 | (quantile [digest q] "Returns a point near the given quantile.")) 32 | 33 | (defprotocol CumulativeDistribution 34 | (cumulative-distribution [digest] 35 | "A non-normalized discrete cumulative distribution function for 36 | a digest, represented as an ascending-order sequence of `[point 37 | total]` pairs, where `total` is the number of points in the 38 | digest which are less than or equal to `point`. `point` ranges 39 | from min to max, inclusive. 
40 | 41 | The cumulative distribution for an empty digest is an empty 42 | seq.")) 43 | 44 | (defn distribution 45 | "A discrete distribution function for a digest, represented as an 46 | ascending-order sequence of `[point count]` pairs, where `count` is the 47 | number of points less than or equal to `point`, and greater than the previous 48 | point. Point ranges from min to max inclusive." 49 | [digest] 50 | (let [cd (cumulative-distribution digest)] 51 | (->> cd 52 | (cons [nil 0]) 53 | successive-pairs 54 | (map (fn [[[_ c] 55 | [x' c']]] 56 | [x' (- c' c)]))))) 57 | 58 | (defn ^"[B" byte-buffer->bytes 59 | "Convert a byte buffer to a byte array." 60 | [^ByteBuffer buffer] 61 | (let [array (byte-array (.remaining buffer))] 62 | (.get buffer array) 63 | array)) 64 | 65 | (defn ^ByteBuffer bytes->byte-buffer 66 | "Convert a byte array to a byte buffer (wraps the array; no copy is made here)." 67 | [^bytes bs] 68 | (ByteBuffer/wrap bs)) 69 | 70 | ; TODO: dynamically detect dependencies and load these extensions 71 | ; I'd include em all by default but the jar gets *massive* 72 | ;(extend-type AVLTreeDigest 73 | ; Quantile 74 | ; (add-point! [digest x] (.add digest x)) 75 | ; (merge-digest! [digest ^AVLTreeDigest d] (.add digest d)) 76 | ; (point-count [digest] (.size digest)) 77 | ; (quantile [digest q] (.quantile digest q)) 78 | 79 | ; Buffers 80 | ; (buf-capacity [digest] (.smallByteSize digest)) 81 | ; (write-buf! [digest b] (.asSmallBytes digest b) 82 | ; (read-buf! [digest b] (AVLTreeDigest/fromBytes b)))) 83 | 84 | (extend-type DoubleHistogram 85 | Digest 86 | (add-point! [digest x] (.recordValue digest x) digest) 87 | (merge-digest! [digest ^DoubleHistogram d] (.add digest d) digest) 88 | (point-count [digest] (.getTotalCount digest)) 89 | 90 | Quantile 91 | (min [digest] (when-not (zero? (point-count digest)) 92 | (.getMinValue digest))) 93 | (max [digest] (when-not (zero? 
(point-count digest)) 94 | (.getMaxValue digest))) 95 | (quantile [digest q] (.getValueAtPercentile digest (* q 100))) 96 | 97 | CumulativeDistribution 98 | (cumulative-distribution [digest] 99 | (->> (.recordedValues digest) 100 | (mapv (fn [^DoubleHistogramIterationValue i] 101 | [(.getValueIteratedTo i) 102 | (.getTotalCountToThisValue i)]))))) 103 | 104 | (defn hdr-histogram 105 | "Constructs a new HDRHistogram for doubles. 106 | Default options: 107 | 108 | {:highest-to-lowest-value-ratio 1e8 109 | :significant-value-digits 3}" 110 | ([] (hdr-histogram {})) 111 | ([opts] 112 | (DoubleHistogram. (long (or (:highest-to-lowest-value-ratio opts) 1e8)) 113 | (int (or (:significant-value-digits opts) 3))))) 114 | 115 | ; A histogram that covers both negative and positive numbers by routing to 116 | ; two distinct histograms. 0 is considered positive here. 117 | (defrecord DualHistogram [neg pos] 118 | Digest 119 | (add-point! [this x] 120 | (if (neg? x) 121 | (add-point! neg (- x)) 122 | (add-point! pos x)) 123 | this) 124 | 125 | (merge-digest! [this other] 126 | (assert (instance? DualHistogram other)) 127 | (merge-digest! neg (:neg other)) 128 | (merge-digest! pos (:pos other)) 129 | this) 130 | 131 | (point-count [this] 132 | (+ (point-count neg) 133 | (point-count pos))) 134 | 135 | Quantile 136 | (min [this] (if-let [x (max neg)] 137 | (- x) 138 | (min pos))) 139 | 140 | (max [this] (or (max pos) 141 | (when-let [x (min neg)] 142 | (- x)))) 143 | 144 | (quantile [this q] 145 | (let [n (point-count neg) 146 | p (point-count pos) 147 | N (+ n p)] 148 | ; (println "finding quantile" q ": neg" n "pos" p "total" N) 149 | 150 | (cond ; No negatives 151 | (zero? n) 152 | (quantile pos q) 153 | 154 | ; No positives 155 | (zero? 
p) 156 | (let [neg-q (- 1 (clojure.core/max 0 (- q (/ N))))] 157 | ; (println "quantile" q "mapped to all-neg quantile" neg-q 158 | ; "with value" (- (quantile neg neg-q))) 159 | (- (quantile neg neg-q))) 160 | 161 | ; Falls in negative range 162 | (<= q (/ (+ n 1/2) N)) 163 | (let [neg-q (- 1 (* (/ N n) (- q (/ N))))] 164 | ; (println "quantile" q "mapped to neg quantile" neg-q "with value" 165 | ; (- (quantile neg neg-q))) 166 | (- (quantile neg neg-q))) 167 | 168 | ; Falls in positive range 169 | true 170 | (let [pos-q (/ (- (* N q) n) p)] 171 | ; (println "quantile" q "mapped to pos quantile" 172 | ; pos-q "with value" (quantile pos pos-q)) 173 | (quantile pos pos-q))))) 174 | 175 | CumulativeDistribution 176 | (cumulative-distribution [this] 177 | (cond 178 | (zero? (point-count neg)) 179 | (cumulative-distribution pos) 180 | 181 | true 182 | (let [neg-dist (cumulative-distribution neg) 183 | pos-offset (point-count neg)] 184 | ; Ugh, inefficient 185 | (concat (map vector 186 | ; Points 187 | (->> neg-dist 188 | reverse 189 | (map first) 190 | (map -)) 191 | ; Totals 192 | (->> neg-dist 193 | (map second) 194 | (cons 0) 195 | differences 196 | reverse 197 | cumulative-sums)) 198 | ; Positive distribution 199 | (->> (cumulative-distribution pos) 200 | (map (fn [[point total]] 201 | [point (+ total pos-offset)])))))))) 202 | 203 | (defn dual 204 | "HDRHistogram can't deal with negative values. This function takes a function 205 | to construct a quantile estimator and returns a quantile estimator that uses 206 | *two* of the underlying estimator, one for positive numbers, and one for 207 | negative numbers. 208 | 209 | (dual hdr-histogram {:significant-value-digits 4}))" 210 | ([h] (dual h {})) 211 | ([h opts] 212 | (DualHistogram. 
(h opts) (h opts)))) 213 | -------------------------------------------------------------------------------- /math/test/tesser/quantiles_test.clj: -------------------------------------------------------------------------------- 1 | (ns tesser.quantiles-test 2 | (:require [clojure.test :refer :all] 3 | [clojure.pprint :refer [pprint]] 4 | [clojure.test.check :as tc] 5 | [clojure.test.check [clojure-test :refer :all] 6 | [generators :as gen] 7 | [properties :as prop]] 8 | [tesser.math-test :refer [correlation approx= =ish bigger-ints 9 | smaller]] 10 | [tesser.quantiles :as q])) 11 | 12 | ; For larger values we start failing the distribution analysis due to 13 | ; floating-point offset errors. Ughhh. 14 | (def test-opts {:num-tests 1000 15 | :par 48}) 16 | 17 | (defn fill! 18 | "Applies all points to a given digest." 19 | [digest points] 20 | (doseq [p points] (q/add-point! digest p))) 21 | 22 | (defn check-count 23 | "Verifies that a quantile estimator knows how many points it received." 24 | [digest points] 25 | (testing "count" 26 | (is (= (count points) (q/point-count digest))))) 27 | 28 | ; Just a wrapper for the quantile comparison tuples 29 | (defrecord QC [quantile actual estimate]) 30 | 31 | (defn quantile-comparison 32 | "Given a digest and a set of points, returns a sorted sequence of maps of 33 | {:quantile 0, :actual 0, :estimate 0.24}, where quantile ranges from 0 to 1, 34 | :actual is the real value for that quantile, drawn from `points`, and 35 | :estimate is the estimated value for that quantile from the digest." 36 | [digest points] 37 | (let [n (count points)] 38 | (->> points 39 | sort 40 | ; inc i because indices start at 1; see 41 | ; http://en.wikipedia.org/wiki/Quantile#Quantiles_of_a_population 42 | (map-indexed (fn [i x] 43 | (let [q (/ (inc i) n)] 44 | (QC. q x (q/quantile digest q)))))))) 45 | 46 | (defn check-quantiles-exact 47 | "If every quantile comparison is almost the same, we're trivially done. 
We do 48 | this check because flat distributions like [3 3 3 3 3 3] lead to undefined 49 | correlations (because there's no extent to the domain." 50 | [quantile-comparison] 51 | (testing "exact" 52 | (every? #(=ish (:actual %) (:estimate %)) quantile-comparison))) 53 | 54 | (defn check-quantiles-correlation 55 | "Compute the correlation between the true and estimated quantiles." 56 | [quantile-comparison] 57 | (testing "correlation" 58 | (let [correlation (correlation :actual :estimate quantile-comparison)] 59 | (or (is (< 0.99 correlation)) 60 | (pprint "Bad correlation") 61 | (pprint quantile-comparison) 62 | (prn :correlation correlation))))) 63 | 64 | (defn check-quantiles 65 | "Evaluates how well the digest estimates quantiles." 66 | [digest points] 67 | (testing "quantiles" 68 | (case (count points) 69 | ; For the empty set, we don't care 70 | 0 true 71 | 72 | ; A single point should be the same as every quantile. 73 | 1 (and (is (=ish (first points) 74 | (q/quantile digest 0) 75 | (q/quantile digest 0.5) 76 | (q/quantile digest 1)))) 77 | 78 | ; Statistical checks 79 | (let [qc (quantile-comparison digest points)] 80 | ; Try an exact match, then fall back to statistical methods. 81 | (or (check-quantiles-exact qc) 82 | (and (check-quantiles-correlation qc))))))) 83 | 84 | (defn equiv-distributions 85 | "Floating point is a problem. 86 | 87 | test.check is really good at finding inputs like 88 | 89 | :points (-1.05 -0.89 -0.45 0.73 0.73 1.98) 90 | :digest ([-1.05003 1] [-0.88999 1] [-0.44999 1] [0.72998 2] [1.98001 1]) 91 | :actual [[-1.05003 0] [-0.88999 2] [-0.44999 1] [0.72998 0] [1.98001 3]] 92 | 93 | Note that 0.73 has been represented in the digest as 0.72998, which pushed 94 | both 0.73's up into the next bucket: 1.98001. 95 | 96 | So, to verify distribution equivalence, we check: 97 | 98 | 1. Every point is identical. 99 | 2. If we ever see a count that is not equal, we can make it up by stealing 100 | from the next bucket. 
101 | 102 | dist1 is the digest, dist2 is the actual." 103 | [dist1 dist2] 104 | (and (testing "points" 105 | (is (= (map first dist1) (map first dist2)))) 106 | (testing "counts" 107 | (let [c1 (map second dist1) 108 | c2 (map second dist2)] 109 | (loop [[p1 & c1' :as c1] (map second dist1) 110 | [p2 & c2' :as c2] (map second dist2)] 111 | (if (nil? p1) 112 | ; Done 113 | true 114 | 115 | (if (= p1 p2) 116 | ; Good. 117 | (recur c1' c2') 118 | 119 | (let [p1' (first c1') 120 | p2' (first c2') 121 | delta (- p1 p2)] 122 | ; (prn :p1 p1 :p2 p2 :delta delta) 123 | (if-not (and p1' p2') 124 | ; We're at the end, too bad. 125 | false 126 | 127 | ; Can we find (= p1 p2) by rebalancing p2 and p2', 128 | ; keeping both positive? 129 | (let [p2-new (+ p2 delta) 130 | p2'-new (- p2' delta)] 131 | ; (prn :p2-new p2-new :p2'-new p2'-new) 132 | (if (or (neg? p2-new) (neg? p2'-new)) 133 | ; Can't rebalance enough 134 | false 135 | 136 | ; Try again with the new balance. 137 | (recur c1 (cons p2-new (cons p2'-new (next c2'))))))))))))))) 138 | 139 | 140 | (defn check-distribution 141 | "Verifies that the distribution over the digest matches the points." 142 | [digest points] 143 | (testing "distribution" 144 | (if (empty? points) 145 | (is (= [] (q/distribution digest))) 146 | 147 | (let [digest-dist (q/distribution digest) 148 | cutoffs (map first digest-dist) 149 | points (sort points) 150 | real-dist (loop [[cutoff & cutoffs' :as cutoffs] cutoffs 151 | [point & points' :as points] points 152 | dist [] 153 | n 0] 154 | (cond ; Done! 155 | (nil? point) 156 | (conj dist [cutoff n]) 157 | 158 | (or (empty? 
cutoffs') (<= point cutoff)) 159 | (recur cutoffs points' dist (inc n)) 160 | 161 | true 162 | (recur cutoffs' points 163 | (conj dist [cutoff n]) 164 | 0)))] 165 | (and (is (=ish (first points) (first cutoffs))) 166 | (is (=ish (last points) (last cutoffs))) 167 | (or (is (equiv-distributions digest-dist real-dist)) 168 | (prn :points points) 169 | (prn :digest digest-dist) 170 | (prn :actual real-dist))))))) 171 | 172 | (defn check-digest 173 | "Check that a quantile estimator handles a given set of inputs OK." 174 | [digest points] 175 | (fill! digest points) 176 | (and (check-count digest points) 177 | (check-quantiles digest points) 178 | (check-distribution digest points))) 179 | 180 | (defn runs 181 | "Quickcheck likes to emit uniformly distributed vectors, but we need 182 | pathological distributions to test quantile estimators. This generator takes 183 | an underlying generator and emits a vector containing *runs* of those 184 | values, which translate to spikes in the probability density." 185 | [gen] 186 | (assert (gen/generator? 
gen) "Arg to runs must be a generator") 187 | (gen/bind (gen/vector gen/pos-int) 188 | (fn [run-lengths] 189 | (gen/bind (gen/vector gen (count run-lengths)) 190 | (fn [values] 191 | (gen/return 192 | (mapcat repeat run-lengths values))))))) 193 | 194 | (defspec hdr-histogram-spec 195 | test-opts 196 | (prop/for-all [points (gen/vector (gen/fmap smaller bigger-ints))] 197 | ;(prn :points points) 198 | (check-digest (q/dual 199 | q/hdr-histogram 200 | {:highest-to-lowest-value-ratio 1e6 201 | :significant-value-digits 4}) points))) 202 | 203 | (defspec hdr-histogram-runs-spec 204 | (assoc test-opts :num-tests 20) 205 | (prop/for-all [points (runs (gen/fmap smaller bigger-ints))] 206 | ;(prn :points points) 207 | (check-digest (q/dual 208 | q/hdr-histogram 209 | {:highest-to-lowest-value-ratio 1e6 210 | :significant-value-digits 4}) points))) 211 | -------------------------------------------------------------------------------- /hadoop/src/tesser/hadoop/serialization.clj: -------------------------------------------------------------------------------- 1 | (ns tesser.hadoop.serialization 2 | "Lets us serialize Tesser reduction state for transport through various 3 | distributed systems" 4 | (:require [clojure.data.fressian :as fress] 5 | [clojure.walk :as walk] 6 | [parkour.wrapper :as wrapper] 7 | [tesser.quantiles :as quantiles]) 8 | (:import (tesser.hadoop_support FressianWritable) 9 | (java.io DataInput 10 | DataOutput 11 | ByteArrayInputStream 12 | ByteArrayOutputStream) 13 | (java.nio ByteBuffer) 14 | (org.fressian.handlers WriteHandler 15 | ReadHandler) 16 | (org.HdrHistogram DoubleHistogram) 17 | (tesser.quantiles DualHistogram) 18 | (com.clearspring.analytics.stream.cardinality 19 | HyperLogLogPlus 20 | HyperLogLogPlus$Builder))) 21 | 22 | ; TODO: extract serialization for math into tesser.math itself? Common 23 | ; interface somewhere? 
24 | 25 | (defmacro handler 26 | "Takes a classname as a symbol, a tag name as a string, and bodies for write 27 | and read functions. Provides a special syntax for writing the component 28 | count: (write-tag! some-number), which expands to (.writeTag writer tag 29 | some-number). Returns a map with two keys: :readers, and :writers, each value 30 | being a map suitable for use as a Fressian reader or writer, respectively. 31 | 32 | (handler QDigest \"q-digest\" 33 | (write [_ writer digest] 34 | (write-tag! 1) 35 | (.writeBytes writer (QDigest/serialize digest))) 36 | (read [_ reader tag component-count] 37 | (QDigest/deserialize ^bytes (.readObject reader))))" 38 | [classname tag write-expr read-expr] 39 | (let [writer-sym (-> write-expr second second) 40 | write-expr (walk/prewalk 41 | (fn [form] 42 | (if (and (list? form) 43 | (= 'write-tag! (first form))) 44 | (let [count-expr (second form)] 45 | (assert 46 | (= 2 (count form)) 47 | "write-tag! takes 1 argument: a component count.") 48 | `(.writeTag ~writer-sym ~tag ~count-expr)) 49 | form)) 50 | write-expr)] 51 | `{:writers {~classname {~tag (reify WriteHandler ~write-expr)}} 52 | :readers {~tag (reify ReadHandler ~read-expr)}})) 53 | 54 | (defmacro handlers 55 | "Takes a flat series of handler quartets: class-name, tag, writer, reader, as 56 | per `handler`. Returns a `{:writers {...}, :readers {...}}` map, where all 57 | writers are merged into a unified map, merged with the clojure default 58 | handlers, and wrapped with inheritance/associative lookups. Does the same for 59 | the readers map, but without inheritance lookups. `:readers` and `:writers` 60 | may be passed to Fressian. 61 | 62 | (handlers 63 | QDigest \"q-digest\" 64 | (write [_ writer digest] 65 | (write-tag! 
1) 66 | (.writeBytes writer (QDigest/serialize digest))) 67 | (read [_ reader tag component-count] 68 | (QDigest/deserialize ^bytes (.readObject reader))) 69 | 70 | clojure.lang.PersistentVector \"vector\" 71 | (write [_ writer v] 72 | (write-tag! (count v)) 73 | (doseq [e v] 74 | (.writeObject writer e))) 75 | (read [_ rdr tag component-count] 76 | (let [v (transient [])] 77 | (dotimes [_ component-count] 78 | (conj! v (.readObject rdr))) 79 | (persistent! v))))" 80 | [& quartets] 81 | (let [handlers (partition 4 quartets) 82 | names (repeatedly (count handlers) (partial gensym "handler"))] 83 | ; Bind each handler to a symbol 84 | `(let [~@(->> handlers 85 | (map (partial cons `handler)) 86 | (interleave names)) 87 | ; Wrap up handlers into a vector 88 | handlers# [~@names] 89 | ; Extract writers and readers 90 | writers# (map :writers handlers#) 91 | readers# (map :readers handlers#)] 92 | ; Merge writers/readers together into unified maps 93 | {:writers (->> writers# 94 | (cons fress/clojure-write-handlers) 95 | (reduce merge) 96 | fress/associative-lookup 97 | fress/inheritance-lookup) 98 | :readers (->> readers# 99 | (cons fress/clojure-read-handlers) 100 | (reduce merge) 101 | fress/associative-lookup)}))) 102 | 103 | (defn ^ByteBuffer rewind 104 | "Rewind a byte buffer, returning it." 105 | [^ByteBuffer buf] 106 | (doto buf (.rewind))) 107 | 108 | (def fress-handlers 109 | "All the serialization you could ever need in one fat monolithic, hmm, I 110 | dunno, is this really such a good idea?" 111 | (handlers 112 | DualHistogram "tesser.digest/dual" 113 | (write [_ writer digest] 114 | (write-tag! 2) 115 | (fress/write-object writer (:neg digest)) 116 | (fress/write-object writer (:pos digest))) 117 | 118 | (read [_ reader tag component-count] 119 | (DualHistogram. (.readObject reader) 120 | (.readObject reader))) 121 | 122 | 123 | DoubleHistogram "org.HdrHistogram.DoubleHistogram" 124 | (write [_ writer digest] 125 | (write-tag! 
1) 126 | (let [buf (ByteBuffer/allocate 127 | (.getNeededByteBufferCapacity digest))] 128 | (.encodeIntoCompressedByteBuffer digest buf) 129 | (->> buf 130 | rewind 131 | quantiles/byte-buffer->bytes 132 | (.writeBytes writer)))) 133 | 134 | (read [_ reader tag component-count] 135 | (-> (quantiles/bytes->byte-buffer ^bytes (.readObject reader)) 136 | (DoubleHistogram/decodeFromCompressedByteBuffer 0))) 137 | 138 | 139 | clojure.lang.ISeq "seq" 140 | (write [_ w l] 141 | (write-tag! 2) 142 | (.writeInt w (count l)) 143 | (doseq [e l] 144 | (fress/write-object w e))) 145 | 146 | (read [_ rdr tag component-count] 147 | (let [c (.readInt rdr)] 148 | ; Empty seqs eval to nil, which breaks '() roundtrip equality. 149 | (if (zero? c) 150 | '() 151 | (let [ary (object-array c)] 152 | (dotimes [i c] 153 | (aset ary i (.readObject rdr))) 154 | (seq ary))))) 155 | 156 | 157 | clojure.lang.PersistentVector "vector" 158 | (write [_ writer v] 159 | (write-tag! 2) 160 | (.writeInt writer (count v)) 161 | (doseq [e v] 162 | (fress/write-object writer e))) 163 | 164 | (read [_ rdr tag component-count] 165 | (loop [i (.readInt rdr) 166 | v (transient [])] 167 | (if (pos? i) 168 | (recur (dec i) 169 | (conj! v (.readObject rdr))) 170 | (persistent! v)))) 171 | 172 | clojure.lang.IPersistentMap "persistent-map" 173 | (write [_ w m] 174 | (write-tag! 2) 175 | (.writeInt w (count m)) 176 | (doseq [[k v] m] 177 | (.writeObject w k) 178 | (.writeObject w v))) 179 | 180 | (read [_ rdr tag component-count] 181 | (loop [i (.readInt rdr) 182 | m (transient {})] 183 | (if (pos? i) 184 | (recur (dec i) 185 | (assoc! m (.readObject rdr) (.readObject rdr))) 186 | (persistent! m)))) 187 | 188 | 189 | clojure.lang.PersistentTreeMap "sorted-map" 190 | (write [_ w m] 191 | (write-tag! 
2) 192 | (.writeInt w (count m)) 193 | (doseq [[k v] m] 194 | (.writeObject w k) 195 | (.writeObject w v))) 196 | 197 | (read [_ rdr tag component-count] 198 | (loop [i (.readInt rdr) 199 | m (sorted-map)] 200 | (if (pos? i) 201 | (recur (dec i) 202 | (assoc m (.readObject rdr) (.readObject rdr))) 203 | m))) 204 | 205 | 206 | clojure.lang.IPersistentSet "persistent-set" 207 | (write [_ w set] 208 | (write-tag! 2) 209 | (.writeInt w (count set)) 210 | (doseq [e set] 211 | (.writeObject w e))) 212 | 213 | (read [_ rdr tag component-count] 214 | (loop [i (.readInt rdr) 215 | s (transient (hash-set))] 216 | (if (pos? i) 217 | (recur (dec i) 218 | (conj! s (.readObject rdr))) 219 | (persistent! s)))) 220 | 221 | 222 | clojure.lang.PersistentTreeSet "sorted-set" 223 | (write [_ w set] 224 | (write-tag! 2) 225 | (.writeInt w (count set)) 226 | (doseq [e set] 227 | (.writeObject w e))) 228 | 229 | (read [_ rdr tag component-count] 230 | (loop [i (.readInt rdr) 231 | s (sorted-set)] 232 | (if (pos? i) 233 | (recur (dec i) 234 | (conj s (.readObject rdr))) 235 | s))) 236 | 237 | 238 | HyperLogLogPlus "hyperloglogplus-estimator" 239 | (write [_ w hll] 240 | (write-tag! 1) 241 | (.writeBytes w (.getBytes ^HyperLogLogPlus hll))) 242 | 243 | (read [_ rdr _ _] 244 | (HyperLogLogPlus$Builder/build ^bytes (.readObject rdr))))) 245 | 246 | (defn ^bytes write-byte-array 247 | "Dump a structure to a byte array using our handlers." 248 | [x] 249 | (let [out (ByteArrayOutputStream.) 250 | writer (fress/create-writer out :handlers (:writers fress-handlers))] 251 | (fress/write-object writer x) 252 | (.toByteArray out))) 253 | 254 | (defn read-byte-array 255 | "Parse a byte array and return a single object, using our handlers." 256 | [^bytes bs] 257 | (let [in (ByteArrayInputStream. 
bs) 258 | reader (fress/create-reader in :handlers (:readers fress-handlers))] 259 | (fress/read-object reader))) 260 | 261 | ; Serializes values to and from Fressian records, delimited by a 32-bit int 262 | ; length header. Lord have mercy on my soul. 263 | (set! FressianWritable/readFieldsFn 264 | (fn read-fields [^FressianWritable w ^DataInput in] 265 | (let [buffer (-> in 266 | .readInt 267 | byte-array)] 268 | ; Copy input to buffer 269 | (.readFully in buffer) 270 | (set! (.state w) 271 | (-> buffer 272 | (ByteArrayInputStream.) 273 | (fress/create-reader :handlers (:readers fress-handlers)) 274 | (fress/read-object)))))) 275 | 276 | (set! FressianWritable/writeFn 277 | (fn write [^FressianWritable w ^DataOutput out] 278 | (let [value (.state w) 279 | buf (ByteArrayOutputStream.) 280 | writer (fress/create-writer 281 | buf :handlers (:writers fress-handlers)) 282 | _ (fress/write-object writer value)] 283 | (.writeInt out (.size buf)) 284 | (.write out (.toByteArray buf))))) 285 | 286 | ; Tell Parkour how to clobber/extract values from our Fressian-backed 287 | ; FressianWritable 288 | (extend-protocol wrapper/Wrapper 289 | FressianWritable 290 | (unwrap [this] 291 | (.state this)) 292 | (rewrap [this obj] 293 | (set! (.state this) obj) 294 | this)) 295 | -------------------------------------------------------------------------------- /hadoop/src/tesser/hadoop.clj: -------------------------------------------------------------------------------- 1 | (ns tesser.hadoop 2 | "Helps you run a fold on Hadoop!" 
3 | (:require [clojure [string :as str] 4 | [pprint :refer [pprint]]] 5 | [clojure.core.reducers :as r] 6 | [clojure.data.fressian :as fress] 7 | tesser.hadoop.serialization 8 | [tesser 9 | [utils :refer :all] 10 | [core :as t]] 11 | [parkour [conf :as conf] 12 | [fs :as fs] 13 | [mapreduce :as mr] 14 | [graph :as pg] 15 | [tool :as tool] 16 | [wrapper :as wrapper]] 17 | [parkour.io [text :as text] 18 | [seqf :as seqf] 19 | [sample :as sample]]) 20 | (:import (tesser.hadoop_support FressianWritable) 21 | (java.io DataInput 22 | DataOutput 23 | IOException 24 | EOFException 25 | ByteArrayInputStream 26 | ByteArrayOutputStream) 27 | (java.nio ByteBuffer) 28 | (org.apache.hadoop.io Text 29 | LongWritable 30 | NullWritable 31 | BytesWritable 32 | Writable) 33 | (org.apache.hadoop.mapred JobPriority) 34 | (org.apache.hadoop.conf Configuration))) 35 | 36 | (defn resolve+ 37 | "Resolves a symbol to a var, requiring the namespace if necessary. If the 38 | namespace doesn't exist, throws just like `clojure.core/require`. If the 39 | symbol doesn't exist after requiring, returns nil." 40 | [sym] 41 | (or (resolve sym) 42 | (let [ns (->> sym str (re-find #"(.+)\/") second symbol)] 43 | (require ns) 44 | (resolve sym)))) 45 | 46 | (defn rehydrate-fold 47 | "Takes the name of a function that generates a fold (a symbol) and args for 48 | that function, and invokes the function with args to build a fold, which is 49 | then compiled and returned." 50 | [fold-name fold-args] 51 | (-> fold-name 52 | resolve+ 53 | deref 54 | (apply fold-args) 55 | t/compile-fold)) 56 | 57 | (defn serialize-error 58 | "Convert an exception to an error." 59 | [state input e] 60 | ; Log locally so we'll have something in the hadoop logs 61 | (.printStackTrace e) 62 | {::error? 
true 63 | :class (.getName (class e)) 64 | :message (.getMessage e) 65 | :string (.toString e) 66 | :trace (->> (.getStackTrace e) 67 | (map (fn [^StackTraceElement frame] 68 | (str (.getClassName frame) " " 69 | (.getMethodName frame) " (" 70 | (.getFileName frame) ":" 71 | (.getLineNumber frame) ")"))) 72 | (str/join "\n")) 73 | :state state 74 | :input input}) 75 | 76 | (defn error? 77 | "Is this an error object?" 78 | [x] 79 | (and (map? x) 80 | (::error? x))) 81 | 82 | (defn print-error 83 | "Print an error to *err*." 84 | [e] 85 | (locking *out* 86 | (binding [*out* *err*] 87 | (println) 88 | (println "## Hadoop Error:" (:class e)) 89 | (println) 90 | (println "State prior to reduction error:") 91 | (pprint (:state e)) 92 | (println) 93 | (println "Input that caused reduction error:") 94 | (println (:input e)) 95 | (println) 96 | (println (:class e)) 97 | (println (:message e)) 98 | (println (:string e)) 99 | (println (:trace e))))) 100 | 101 | (defn fold-mapper 102 | "A generic, stateful hadoop mapper for applying a fold to a Hadoop dataset. 103 | This function returns a mapper for fold defined by make-fold 104 | applied to fold-name & additional args." 
105 | {::mr/source-as :vals 106 | ::mr/sink-as :vals} 107 | [fold-name fold-args input] 108 | (list (try (let [fold (rehydrate-fold fold-name fold-args) 109 | red (:reducer fold) 110 | post (:post-reducer fold) 111 | state (reduce (fn [acc line] 112 | (try 113 | (red acc line) 114 | (catch Exception e 115 | (reduced (serialize-error acc line e))))) 116 | ((:reducer-identity fold)) 117 | input)] 118 | (if (error? state) state (post state))) 119 | (catch Exception e 120 | (serialize-error nil nil e))))) 121 | 122 | (defn fold-reducer 123 | "This function returns a parkour reducer for fold defined by make-fold 124 | applied to fold-name & additional args" 125 | {::mr/source-as :vals 126 | ::mr/sink-as :vals} 127 | [fold-name fold-args input] 128 | (list (try (let [fold (rehydrate-fold fold-name fold-args) 129 | combiner (:combiner fold) 130 | combined (reduce (fn [acc x] 131 | (try 132 | (if (error? x) 133 | (reduced x) 134 | (combiner acc x)) 135 | (catch Exception e 136 | (reduced (serialize-error acc x e))))) 137 | ((:combiner-identity fold)) 138 | input)] 139 | (if (error? combined) 140 | combined 141 | ((:post-combiner fold) combined))) 142 | (catch Exception e 143 | (serialize-error nil nil e))))) 144 | 145 | (defn fold* 146 | "Takes a Parkour graph and applies a fold to it. Takes a var for a function, 147 | taking `args`, which constructs a fold. Returns a new (unexecuted) graph. 148 | The output of this job will be a single-element Fressian structure containing 149 | the results of the fold applied to the job's inputs." 150 | [graph fold-var & args] 151 | (let [fold-name (var->sym fold-var)] 152 | (-> graph 153 | (pg/map #'fold-mapper fold-name args) 154 | (pg/partition [NullWritable FressianWritable]) 155 | (pg/reduce #'fold-reducer fold-name args)))) 156 | 157 | (defonce job-name-counter 158 | (atom (rand-int 1000))) 159 | 160 | (defn gen-job-name! 161 | "Generates a new job name. Job names start at a random small integer and 162 | increment sequentially from there. 
Job names are printed to stderr when 163 | generated." 164 | [] 165 | (let [n (str "tesser-" (swap! job-name-counter inc))] 166 | (binding [*out* *err*] 167 | (println "\n## Job" n "\n")) 168 | n)) 169 | 170 | (defn execute 171 | "Like `parkour.graph/execute`, but specialized for folds. Takes a parkour 172 | graph, a jobconf, and a job name. Executes the job, then returns a sequence 173 | of fold results. Job names will be automatically generated if not provided." 174 | ([graph conf] 175 | (execute graph conf (gen-job-name!))) 176 | ([graph conf job-name] 177 | ; For each phase, extract the first tuple, then the value. 178 | (map (comp second reduce-first) 179 | (pg/execute graph conf job-name)))) 180 | 181 | (defn dsink 182 | "Given a work directory and a string name for this file, builds a dsink for a 183 | fold to dump its output to, in `[NullWritable FressianWritable]` format." 184 | [work-dir file-name] 185 | (->> file-name 186 | (fs/path work-dir) 187 | (seqf/dsink [NullWritable FressianWritable]))) 188 | 189 | (defn ^:private output-path 190 | "Extract the output path from fold-var metadata. If not provided, fallback to 191 | conf key or just the fold-var name. 192 | 193 | Metadata key: :tesser.hadoop/output-path 194 | Configuration key: tesser.output.path" 195 | [^Configuration conf fold-var] 196 | (.replaceAll ;remove trailing slashes, if any 197 | ^String (or (::output-path (meta fold-var)) 198 | (.get conf "tesser.output.path") 199 | (name (var->sym fold-var))) 200 | "/+$" "")) 201 | 202 | (defn fold 203 | "A simple, all-in-one fold operation. Takes a jobconf, input dseq, workdir, 204 | var which points to a fold function, and arguments for the fold function. 205 | Runs the fold against the dseq and returns its results. Names output dsink 206 | after metadata key :tesser.hadoop/output-path on the fold var. If absent, uses 207 | the conf key tesser.output.path and finally falls back 208 | to the fold symbol. On error, throws an `ex-info`." 
209 | [conf input workdir fold-var & args] 210 | (let [in (pg/input input) 211 | path (output-path conf fold-var)] 212 | (try 213 | (let [x (-> (apply fold* in fold-var args) 214 | (pg/output (dsink workdir path)) 215 | (execute conf) 216 | first)] 217 | (when (error? x) (throw (ex-info "Hadoop fold error" x))) 218 | x)))) 219 | 220 | (defn identity-mapper 221 | "Does nothing in the map step." 222 | {::mr/source-as :vals 223 | ::mr/sink-as :vals} 224 | [records] 225 | records) 226 | 227 | (defn partition-randomly 228 | "Partitions map task outputs randomly and uniformly among the reduce tasks." 229 | ^long [_ _ ^long nparts] 230 | (rand-int nparts)) 231 | 232 | (defn set-one-reducer! 233 | "Takes a jobconf, returns the jobconf with mapred.reduce.tasks set to 1." 234 | [conf] 235 | (conf/assoc! conf "mapred.reduce.tasks" 1)) 236 | 237 | (defn fold-reducer-without-post-combiner 238 | "Like fold-reducer, but omits :post-combiner so it can be used in reduce tasks that are continued in multiple jobs." 239 | {::mr/source-as :vals 240 | ::mr/sink-as :vals} 241 | [fold-name fold-args input] 242 | (list (try (let [fold (rehydrate-fold fold-name fold-args) 243 | combiner (:combiner fold) 244 | combined (reduce (fn [acc x] 245 | (try 246 | (if (error? x) 247 | (reduced x) 248 | (combiner acc x)) 249 | (catch Exception e 250 | (reduced (serialize-error acc x e))))) 251 | ((:combiner-identity fold)) 252 | input)] 253 | combined) 254 | (catch Exception e 255 | (serialize-error nil nil e))))) 256 | 257 | (defn fold-reduce-twice 258 | "Like fold, but in two stages (jobs). First job uses the number of reduce tasks specified in mapred.reduce.tasks. 259 | The second job does nothing in the map step, then uses one reduce task." 
260 | [conf input workdir fold-var & args] 261 | (let [fold-name (var->sym fold-var) 262 | path (output-path conf fold-var) 263 | path1 (str path "-1") 264 | path2 (str path "-2") 265 | kv-classes [NullWritable FressianWritable] 266 | 267 | run (fn [conf graph] 268 | (let [x (-> graph 269 | (execute conf) 270 | first)] 271 | (when (error? x) (throw (ex-info "Hadoop fold error" x))) 272 | x))] 273 | (run conf 274 | (-> (pg/input input) 275 | (pg/map #'fold-mapper fold-name args) 276 | (pg/partition kv-classes #'partition-randomly) 277 | (pg/reduce #'fold-reducer-without-post-combiner fold-name args) 278 | (pg/output (dsink workdir path1)))) 279 | (run (set-one-reducer! conf) 280 | (-> (pg/input (dsink workdir path1)) 281 | (pg/map #'identity-mapper) 282 | (pg/partition kv-classes) 283 | (pg/reduce #'fold-reducer fold-name args) 284 | (pg/output (dsink workdir path2)))))) 285 | -------------------------------------------------------------------------------- /core/src/tesser/utils.clj: -------------------------------------------------------------------------------- 1 | (ns tesser.utils 2 | "Toolbox." 3 | (:refer-clojure :exclude [update]) 4 | (:import (java.lang.reflect Array)) 5 | (:require [clojure [set :as set] 6 | [string :as str] 7 | [walk :as walk]] 8 | [clojure.core.reducers :as r])) 9 | 10 | (defn prepend 11 | "Prepends a single value to the beginning of a sequence. O(1) for sequences 12 | using cons, O(n) for vectors. Returns a singleton vector when coll is nil." 13 | [coll element] 14 | (cond (nil? coll) [element] 15 | (vector? coll) (vec (cons element coll)) 16 | true (cons element coll))) 17 | 18 | (defn append 19 | "Appends a single value to the end of a sequence. O(1); uses conj for 20 | vectors, concat for other seqs." 21 | [coll element] 22 | (if (vector? coll) 23 | (conj coll element) 24 | (concat coll (list element)))) 25 | 26 | ;; A mutable pair datatype, intended for use during singlethreaded reductions. 
27 | (defprotocol Pair 28 | (a [pair] "Returns the first element in the Pair.") 29 | (b [pair] "Returns the second element in the Pair.") 30 | (set-a! [pair a'] "Set the first element in the Pair.") 31 | (set-b! [pair b'] "Set the second element in the Pair.") 32 | (set-both! [pair a' b'] "Set both the first and second element in the pair.")) 33 | 34 | (deftype UnsafePair [^:unsynchronized-mutable a ^:unsynchronized-mutable b] 35 | Pair 36 | (a [_] a) 37 | (b [_] b) 38 | (set-a! [this a'] (set! a a') this) 39 | (set-b! [this b'] (set! b b') this) 40 | (set-both! [this a' b'] 41 | (set! a a') 42 | (set! b b') 43 | this)) 44 | 45 | (defn unsafe-pair 46 | "Constructs a new unsynchronized mutable pair object, suitable for 47 | single-threaded mutation." 48 | ([] (UnsafePair. nil nil)) 49 | ([a b] (UnsafePair. a b))) 50 | 51 | (defn successive-pairs 52 | "A much faster version of (partition 2 1 coll) which generates vectors, not 53 | lazy seqs." 54 | ([coll] (successive-pairs (first coll) (next coll))) 55 | ([prev coll] 56 | (lazy-seq 57 | (when-let [s (seq coll)] 58 | (let [x (first s)] 59 | (cons [prev x] (successive-pairs x (next coll)))))))) 60 | 61 | (defn differences 62 | "A seq of the differences between successive elements in a collection. For 63 | example, 64 | 65 | (differences [1 2 4 5 2]) 66 | ; (1 2 1 -3)" 67 | [coll] 68 | (->> coll 69 | successive-pairs 70 | (map (fn [[x x']] (- x' x))))) 71 | 72 | (defn cumulative-sums 73 | "A seq of the cumulative sums of all elements in `coll`, starting at `init` 74 | or the first element of `coll` if `init` is not provided. If `differences` 75 | provides differentials, `cumulative-sums` provides integrals. 76 | 77 | (cumulative-sums 1 [1 2 1 -3]) 78 | ; (1 2 4 5 2)" 79 | ([coll] 80 | (reductions + coll)) 81 | ([init coll] 82 | (reductions + init coll))) 83 | 84 | (defn map-vals 85 | "Maps over a key-value map, returning a new map by transforming each value 86 | with (f v)." 
87 | [f m] 88 | (->> m 89 | (reduce (fn [m [k v]] 90 | (assoc! m k (f v))) 91 | (transient {})) 92 | persistent!)) 93 | 94 | (defn index-by 95 | "Given an indexing function f and a collection of xs, return a map of (f x) 96 | -> x." 97 | [f xs] 98 | (persistent! 99 | (reduce (fn [m x] (assoc! m (f x) x)) 100 | (transient {}) 101 | xs))) 102 | 103 | (defn path-fn 104 | "Takes a path for get-in and converts it to a function that operates on 105 | associative structures." 106 | [path] 107 | (fn [x] (get-in x path))) 108 | 109 | (defn var->sym 110 | "Converts a var to fully qualified symbol." 111 | [^clojure.lang.Var v] 112 | (symbol (name (.name (.ns v))) (name (.sym v)))) 113 | 114 | (defn complete-triangular-matrix 115 | "Given a map of `[x y]` keys to values, returns a map where *both* `[x y]` 116 | and `[y x]` point to identical values. Useful for pairwise comparisons which 117 | compute triangular matrices but want to return a full matrix." 118 | [m] 119 | (->> m (map (fn [[[x y] value]] [[y x] value])) (into m))) 120 | 121 | (defn first-non-nil-reducer 122 | "A reducing function that simply returns the first non-nil element in the 123 | collection." 124 | [_ x] 125 | (when-not (nil? x) (reduced x))) 126 | 127 | (defn first-reducer 128 | "A reducing function that returns the first input it sees, or not-found." 129 | ([] 130 | (first-reducer nil)) 131 | ([not-found] 132 | (fn reducer 133 | ([] ::not-found) 134 | ([x] (if (identical? x ::not-found) 135 | not-found 136 | x)) 137 | ([_ x] 138 | (if (identical? x ::not-found) 139 | x 140 | (reduced x)))))) 141 | 142 | (defn last-reducer 143 | "A reducing function that returns the last input it sees, or not-found." 144 | ([] 145 | (last-reducer nil)) 146 | ([not-found] 147 | (fn reducer 148 | ([] ::not-found) 149 | ([x] (if (identical? x ::not-found) 150 | not-found 151 | x)) 152 | ([acc x] 153 | (if (identical? 
x ::not-found) 154 | acc 155 | x))))) 156 | 157 | (defn reduce-first 158 | "clojure.core/first, but for reducibles." 159 | [reducible] 160 | (reduce (first-reducer) nil reducible)) 161 | 162 | (defmacro scred 163 | "Helper for short-circuiting nested reduction functions which can emit 164 | reduced values. Given the name of a function that could emit a reduced 165 | value, and an expression: 166 | 167 | (scred rfn [1 (rfn x y)]) 168 | 169 | Expands to code that converts the expression to a reduced value whenever 170 | the underlying function emits a reduced value: 171 | 172 | (let [acc (rfn x y)] 173 | (if (reduced? acc) 174 | (let [acc @acc] (reduced [1 acc])) 175 | [1 acc])) 176 | 177 | scred does not interpret lexical scope, so don't rebind rfn in expr. 178 | Uses prewalk, so the outermost fn is where scred will cut out an expr. 179 | Keep this as simple as possible, haha. Macroexpansion fails an assertion unless expr contains exactly one (rfn ...) call." 180 | [rfn-name expr] 181 | (let [acc (gensym 'acc) 182 | reduced-expr (promise) 183 | expr (walk/prewalk (fn [form] 184 | ; Match (rfn ...) 185 | (if (and (list? form) 186 | (= rfn-name (first form))) 187 | ; Snarf the expression for later 188 | (do (assert 189 | (not (realized? reduced-expr))) 190 | (deliver reduced-expr form) 191 | acc) 192 | form)) 193 | expr) 194 | reduced-expr (deref reduced-expr 0 nil)] ; timed deref: a plain @ on an undelivered promise would block macroexpansion forever 195 | (assert reduced-expr (str "scred: no (" rfn-name " ...) call found in expr")) 196 | `(let [~acc ~reduced-expr] 197 | (if (reduced? ~acc) 198 | (let [~acc (deref ~acc)] (reduced ~expr)) 199 | ~expr)))) 200 | 201 | (defmacro def-type-predicate 202 | "Takes an instance of an object and defines a function that tests an object 203 | to see if its class is an instance of the exemplar's." 204 | [name exemplar] 205 | `(let [c# (class ~exemplar)] 206 | (defn ~name [x#] (instance? c# x#)))) 207 | 208 | (def-type-predicate shorts? (short-array 0)) 209 | (def-type-predicate ints? (int-array 0)) 210 | (def-type-predicate longs? (long-array 0)) 211 | (def-type-predicate floats? (float-array 0)) 212 | (def-type-predicate doubles?
(double-array 0)) 213 | (def-type-predicate objects? (object-array 0)) 214 | 215 | (defmacro reducible-slice 216 | "A reducible slice of an indexed collection. Expands into a reified 217 | CollReduce which uses `(getter coll i)` to return the `i`th element. 218 | Defined as a macro so we can do primitive agets, which are waaaay faster for 219 | arrays. The slice starts at index `offset` and covers at most `length` elements, stopping early at the end of `coll`. An empty slice (non-positive `length`, or `offset` at or past the end) reduces to the initial value without calling the getter. If the reducing fn returns a `reduced` value, reduction stops and the unwrapped value is returned." 220 | [getter coll length offset] 221 | `(reify 222 | clojure.core.protocols/CollReduce 223 | 224 | (coll-reduce [this# f#] 225 | (clojure.core.protocols/coll-reduce this# f# (f#))) 226 | 227 | (coll-reduce [_ f# init#] 228 | (let [length# (long ~length) 229 | offset# (long ~offset) 230 | end# (min (count ~coll) (+ offset# length#))] 231 | (loop [i# offset# 232 | acc# init#] 233 | (if (<= end# i#) acc# ; past the end of the slice (also covers empty slices) 234 | (let [acc# (f# acc# (~getter ~coll i#))] 235 | (if (reduced? acc#) 236 | (deref acc#) ; reduce callers expect the unwrapped value, not the Reduced box 237 | (recur (inc i#) acc#))))))))) 238 | 239 | ; Slices over primitive arrays 240 | 241 | (defn reducible-slice-bytes 242 | [^bytes ary chunk-size offset] 243 | (reducible-slice aget ary chunk-size offset)) 244 | 245 | (defn reducible-slice-shorts 246 | [^shorts ary chunk-size offset] 247 | (reducible-slice aget ary chunk-size offset)) 248 | 249 | (defn reducible-slice-ints 250 | [^ints ary chunk-size offset] 251 | (reducible-slice aget ary chunk-size offset)) 252 | 253 | (defn reducible-slice-longs 254 | [^longs ary chunk-size offset] 255 | (reducible-slice aget ary chunk-size offset)) 256 | 257 | (defn reducible-slice-floats 258 | [^floats ary chunk-size offset] 259 | (reducible-slice aget ary chunk-size offset)) 260 | 261 | (defn reducible-slice-doubles 262 | [^doubles ary chunk-size offset] 263 | (reducible-slice aget ary chunk-size offset)) 264 | 265 | (defn reducible-slice-objects 266 | [^objects ary chunk-size offset] 267 | (reducible-slice aget ary chunk-size offset)) 268 | 269 | (defn chunk-array 270 | "Partitions an array into reducibles of size
`chunk-size` (like 271 | chunk), but faster." 272 | ([^long chunk-size ary] 273 | (let [slicer (cond 274 | (bytes? ary) reducible-slice-bytes 275 | (shorts? ary) reducible-slice-shorts 276 | (ints? ary) reducible-slice-ints 277 | (longs? ary) reducible-slice-longs 278 | (floats? ary) reducible-slice-floats 279 | (doubles? ary) reducible-slice-doubles 280 | (objects? ary) reducible-slice-objects)] 281 | (->> (range 0 (count ary) chunk-size) 282 | (map (partial slicer ary chunk-size)))))) 283 | 284 | (defn chunk-vec 285 | "Partitions a vector into reducibles of size n (somewhat like partition-all) 286 | but uses subvec for speed. 287 | 288 | (chunk-vec 2 [1]) ; => ([1]) 289 | (chunk-vec 2 [1 2 3]) ; => ([1 2] [3]) 290 | 291 | Useful for supplying vectors to tesser.core/tesser." 292 | ([^long n v] 293 | (let [c (count v)] 294 | (->> (range 0 c n) 295 | (map #(subvec v % (min c (+ % n)))))))) 296 | 297 | (defn reducible-chunk 298 | "Like partition-all, but only emits reducibles. Faster for vectors and 299 | arrays. May return chunks of any reducible type. Useful for supplying colls 300 | to tesser. A nil coll yields an empty seq, as with partition-all. 301 | 302 | (->> [1 2 3 4 5 6 7 8] 303 | (reducible-chunk 2) 304 | (map (partial into []))) 305 | ; => ([1 2] [3 4] [5 6] [7 8])" 306 | [^long n coll] 307 | (cond 308 | (vector? coll) (chunk-vec n coll) 309 | (and (some? coll) (.isArray (class coll))) (chunk-array n coll) ; (class nil) is nil, so guard before .isArray 310 | true (partition-all n coll))) 311 | 312 | (defn maybe-unary 313 | "Not all functions used in `tesser/fold` and `tesser/reduce` have a 314 | single-arity form. This takes a function `f` and returns a fn `g` such that 315 | `(g x)` is `(f x)` unless `(f x)` throws ArityException, in which case `(g 316 | x)` returns just `x`."
317 | [f] 318 | (fn wrapper 319 | ([] (f)) 320 | ([x] (try 321 | (f x) 322 | (catch clojure.lang.ArityException e 323 | x))) 324 | ([x y] (f x y)) 325 | ([x y & more] (apply f x y more)))) 326 | -------------------------------------------------------------------------------- /all/LICENSE: -------------------------------------------------------------------------------- 1 | THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS ECLIPSE PUBLIC 2 | LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION OF THE PROGRAM 3 | CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT. 4 | 5 | 1. DEFINITIONS 6 | 7 | "Contribution" means: 8 | 9 | a) in the case of the initial Contributor, the initial code and 10 | documentation distributed under this Agreement, and 11 | 12 | b) in the case of each subsequent Contributor: 13 | 14 | i) changes to the Program, and 15 | 16 | ii) additions to the Program; 17 | 18 | where such changes and/or additions to the Program originate from and are 19 | distributed by that particular Contributor. A Contribution 'originates' from 20 | a Contributor if it was added to the Program by such Contributor itself or 21 | anyone acting on such Contributor's behalf. Contributions do not include 22 | additions to the Program which: (i) are separate modules of software 23 | distributed in conjunction with the Program under their own license 24 | agreement, and (ii) are not derivative works of the Program. 25 | 26 | "Contributor" means any person or entity that distributes the Program. 27 | 28 | "Licensed Patents" mean patent claims licensable by a Contributor which are 29 | necessarily infringed by the use or sale of its Contribution alone or when 30 | combined with the Program. 31 | 32 | "Program" means the Contributions distributed in accordance with this 33 | Agreement. 34 | 35 | "Recipient" means anyone who receives the Program under this Agreement, 36 | including all Contributors. 37 | 38 | 2. 
GRANT OF RIGHTS 39 | 40 | a) Subject to the terms of this Agreement, each Contributor hereby grants 41 | Recipient a non-exclusive, worldwide, royalty-free copyright license to 42 | reproduce, prepare derivative works of, publicly display, publicly perform, 43 | distribute and sublicense the Contribution of such Contributor, if any, and 44 | such derivative works, in source code and object code form. 45 | 46 | b) Subject to the terms of this Agreement, each Contributor hereby grants 47 | Recipient a non-exclusive, worldwide, royalty-free patent license under 48 | Licensed Patents to make, use, sell, offer to sell, import and otherwise 49 | transfer the Contribution of such Contributor, if any, in source code and 50 | object code form. This patent license shall apply to the combination of the 51 | Contribution and the Program if, at the time the Contribution is added by the 52 | Contributor, such addition of the Contribution causes such combination to be 53 | covered by the Licensed Patents. The patent license shall not apply to any 54 | other combinations which include the Contribution. No hardware per se is 55 | licensed hereunder. 56 | 57 | c) Recipient understands that although each Contributor grants the licenses 58 | to its Contributions set forth herein, no assurances are provided by any 59 | Contributor that the Program does not infringe the patent or other 60 | intellectual property rights of any other entity. Each Contributor disclaims 61 | any liability to Recipient for claims brought by any other entity based on 62 | infringement of intellectual property rights or otherwise. As a condition to 63 | exercising the rights and licenses granted hereunder, each Recipient hereby 64 | assumes sole responsibility to secure any other intellectual property rights 65 | needed, if any. 
For example, if a third party patent license is required to 66 | allow Recipient to distribute the Program, it is Recipient's responsibility 67 | to acquire that license before distributing the Program. 68 | 69 | d) Each Contributor represents that to its knowledge it has sufficient 70 | copyright rights in its Contribution, if any, to grant the copyright license 71 | set forth in this Agreement. 72 | 73 | 3. REQUIREMENTS 74 | 75 | A Contributor may choose to distribute the Program in object code form under 76 | its own license agreement, provided that: 77 | 78 | a) it complies with the terms and conditions of this Agreement; and 79 | 80 | b) its license agreement: 81 | 82 | i) effectively disclaims on behalf of all Contributors all warranties and 83 | conditions, express and implied, including warranties or conditions of title 84 | and non-infringement, and implied warranties or conditions of merchantability 85 | and fitness for a particular purpose; 86 | 87 | ii) effectively excludes on behalf of all Contributors all liability for 88 | damages, including direct, indirect, special, incidental and consequential 89 | damages, such as lost profits; 90 | 91 | iii) states that any provisions which differ from this Agreement are offered 92 | by that Contributor alone and not by any other party; and 93 | 94 | iv) states that source code for the Program is available from such 95 | Contributor, and informs licensees how to obtain it in a reasonable manner on 96 | or through a medium customarily used for software exchange. 97 | 98 | When the Program is made available in source code form: 99 | 100 | a) it must be made available under this Agreement; and 101 | 102 | b) a copy of this Agreement must be included with each copy of the Program. 103 | 104 | Contributors may not remove or alter any copyright notices contained within 105 | the Program. 
106 | 107 | Each Contributor must identify itself as the originator of its Contribution, 108 | if any, in a manner that reasonably allows subsequent Recipients to identify 109 | the originator of the Contribution. 110 | 111 | 4. COMMERCIAL DISTRIBUTION 112 | 113 | Commercial distributors of software may accept certain responsibilities with 114 | respect to end users, business partners and the like. While this license is 115 | intended to facilitate the commercial use of the Program, the Contributor who 116 | includes the Program in a commercial product offering should do so in a 117 | manner which does not create potential liability for other Contributors. 118 | Therefore, if a Contributor includes the Program in a commercial product 119 | offering, such Contributor ("Commercial Contributor") hereby agrees to defend 120 | and indemnify every other Contributor ("Indemnified Contributor") against any 121 | losses, damages and costs (collectively "Losses") arising from claims, 122 | lawsuits and other legal actions brought by a third party against the 123 | Indemnified Contributor to the extent caused by the acts or omissions of such 124 | Commercial Contributor in connection with its distribution of the Program in 125 | a commercial product offering. The obligations in this section do not apply 126 | to any claims or Losses relating to any actual or alleged intellectual 127 | property infringement. In order to qualify, an Indemnified Contributor must: 128 | a) promptly notify the Commercial Contributor in writing of such claim, and 129 | b) allow the Commercial Contributor tocontrol, and cooperate with the 130 | Commercial Contributor in, the defense and any related settlement 131 | negotiations. The Indemnified Contributor may participate in any such claim 132 | at its own expense. 133 | 134 | For example, a Contributor might include the Program in a commercial product 135 | offering, Product X. That Contributor is then a Commercial Contributor. 
If 136 | that Commercial Contributor then makes performance claims, or offers 137 | warranties related to Product X, those performance claims and warranties are 138 | such Commercial Contributor's responsibility alone. Under this section, the 139 | Commercial Contributor would have to defend claims against the other 140 | Contributors related to those performance claims and warranties, and if a 141 | court requires any other Contributor to pay any damages as a result, the 142 | Commercial Contributor must pay those damages. 143 | 144 | 5. NO WARRANTY 145 | 146 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, THE PROGRAM IS PROVIDED ON 147 | AN "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER 148 | EXPRESS OR IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR 149 | CONDITIONS OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A 150 | PARTICULAR PURPOSE. Each Recipient is solely responsible for determining the 151 | appropriateness of using and distributing the Program and assumes all risks 152 | associated with its exercise of rights under this Agreement , including but 153 | not limited to the risks and costs of program errors, compliance with 154 | applicable laws, damage to or loss of data, programs or equipment, and 155 | unavailability or interruption of operations. 156 | 157 | 6. DISCLAIMER OF LIABILITY 158 | 159 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, NEITHER RECIPIENT NOR ANY 160 | CONTRIBUTORS SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL, 161 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING WITHOUT LIMITATION 162 | LOST PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 163 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 164 | ARISING IN ANY WAY OUT OF THE USE OR DISTRIBUTION OF THE PROGRAM OR THE 165 | EXERCISE OF ANY RIGHTS GRANTED HEREUNDER, EVEN IF ADVISED OF THE POSSIBILITY 166 | OF SUCH DAMAGES. 167 | 168 | 7. 
GENERAL 169 | 170 | If any provision of this Agreement is invalid or unenforceable under 171 | applicable law, it shall not affect the validity or enforceability of the 172 | remainder of the terms of this Agreement, and without further action by the 173 | parties hereto, such provision shall be reformed to the minimum extent 174 | necessary to make such provision valid and enforceable. 175 | 176 | If Recipient institutes patent litigation against any entity (including a 177 | cross-claim or counterclaim in a lawsuit) alleging that the Program itself 178 | (excluding combinations of the Program with other software or hardware) 179 | infringes such Recipient's patent(s), then such Recipient's rights granted 180 | under Section 2(b) shall terminate as of the date such litigation is filed. 181 | 182 | All Recipient's rights under this Agreement shall terminate if it fails to 183 | comply with any of the material terms or conditions of this Agreement and 184 | does not cure such failure in a reasonable period of time after becoming 185 | aware of such noncompliance. If all Recipient's rights under this Agreement 186 | terminate, Recipient agrees to cease use and distribution of the Program as 187 | soon as reasonably practicable. However, Recipient's obligations under this 188 | Agreement and any licenses granted by Recipient relating to the Program shall 189 | continue and survive. 190 | 191 | Everyone is permitted to copy and distribute copies of this Agreement, but in 192 | order to avoid inconsistency the Agreement is copyrighted and may only be 193 | modified in the following manner. The Agreement Steward reserves the right to 194 | publish new versions (including revisions) of this Agreement from time to 195 | time. No one other than the Agreement Steward has the right to modify this 196 | Agreement. The Eclipse Foundation is the initial Agreement Steward. 
The 197 | Eclipse Foundation may assign the responsibility to serve as the Agreement 198 | Steward to a suitable separate entity. Each new version of the Agreement will 199 | be given a distinguishing version number. The Program (including 200 | Contributions) may always be distributed subject to the version of the 201 | Agreement under which it was received. In addition, after a new version of 202 | the Agreement is published, Contributor may elect to distribute the Program 203 | (including its Contributions) under the new version. Except as expressly 204 | stated in Sections 2(a) and 2(b) above, Recipient receives no rights or 205 | licenses to the intellectual property of any Contributor under this 206 | Agreement, whether expressly, by implication, estoppel or otherwise. All 207 | rights in the Program not expressly granted under this Agreement are 208 | reserved. 209 | 210 | This Agreement is governed by the laws of the State of New York and the 211 | intellectual property laws of the United States of America. No party to this 212 | Agreement will bring a legal action under this Agreement more than one year 213 | after the cause of action arose. Each party waives its rights to a jury trial 214 | in any resulting litigation. 215 | -------------------------------------------------------------------------------- /core/LICENSE: -------------------------------------------------------------------------------- 1 | THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS ECLIPSE PUBLIC 2 | LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION OF THE PROGRAM 3 | CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT. 4 | 5 | 1. 
DEFINITIONS 6 | 7 | "Contribution" means: 8 | 9 | a) in the case of the initial Contributor, the initial code and 10 | documentation distributed under this Agreement, and 11 | 12 | b) in the case of each subsequent Contributor: 13 | 14 | i) changes to the Program, and 15 | 16 | ii) additions to the Program; 17 | 18 | where such changes and/or additions to the Program originate from and are 19 | distributed by that particular Contributor. A Contribution 'originates' from 20 | a Contributor if it was added to the Program by such Contributor itself or 21 | anyone acting on such Contributor's behalf. Contributions do not include 22 | additions to the Program which: (i) are separate modules of software 23 | distributed in conjunction with the Program under their own license 24 | agreement, and (ii) are not derivative works of the Program. 25 | 26 | "Contributor" means any person or entity that distributes the Program. 27 | 28 | "Licensed Patents" mean patent claims licensable by a Contributor which are 29 | necessarily infringed by the use or sale of its Contribution alone or when 30 | combined with the Program. 31 | 32 | "Program" means the Contributions distributed in accordance with this 33 | Agreement. 34 | 35 | "Recipient" means anyone who receives the Program under this Agreement, 36 | including all Contributors. 37 | 38 | 2. GRANT OF RIGHTS 39 | 40 | a) Subject to the terms of this Agreement, each Contributor hereby grants 41 | Recipient a non-exclusive, worldwide, royalty-free copyright license to 42 | reproduce, prepare derivative works of, publicly display, publicly perform, 43 | distribute and sublicense the Contribution of such Contributor, if any, and 44 | such derivative works, in source code and object code form. 
45 | 46 | b) Subject to the terms of this Agreement, each Contributor hereby grants 47 | Recipient a non-exclusive, worldwide, royalty-free patent license under 48 | Licensed Patents to make, use, sell, offer to sell, import and otherwise 49 | transfer the Contribution of such Contributor, if any, in source code and 50 | object code form. This patent license shall apply to the combination of the 51 | Contribution and the Program if, at the time the Contribution is added by the 52 | Contributor, such addition of the Contribution causes such combination to be 53 | covered by the Licensed Patents. The patent license shall not apply to any 54 | other combinations which include the Contribution. No hardware per se is 55 | licensed hereunder. 56 | 57 | c) Recipient understands that although each Contributor grants the licenses 58 | to its Contributions set forth herein, no assurances are provided by any 59 | Contributor that the Program does not infringe the patent or other 60 | intellectual property rights of any other entity. Each Contributor disclaims 61 | any liability to Recipient for claims brought by any other entity based on 62 | infringement of intellectual property rights or otherwise. As a condition to 63 | exercising the rights and licenses granted hereunder, each Recipient hereby 64 | assumes sole responsibility to secure any other intellectual property rights 65 | needed, if any. For example, if a third party patent license is required to 66 | allow Recipient to distribute the Program, it is Recipient's responsibility 67 | to acquire that license before distributing the Program. 68 | 69 | d) Each Contributor represents that to its knowledge it has sufficient 70 | copyright rights in its Contribution, if any, to grant the copyright license 71 | set forth in this Agreement. 72 | 73 | 3. 
REQUIREMENTS 74 | 75 | A Contributor may choose to distribute the Program in object code form under 76 | its own license agreement, provided that: 77 | 78 | a) it complies with the terms and conditions of this Agreement; and 79 | 80 | b) its license agreement: 81 | 82 | i) effectively disclaims on behalf of all Contributors all warranties and 83 | conditions, express and implied, including warranties or conditions of title 84 | and non-infringement, and implied warranties or conditions of merchantability 85 | and fitness for a particular purpose; 86 | 87 | ii) effectively excludes on behalf of all Contributors all liability for 88 | damages, including direct, indirect, special, incidental and consequential 89 | damages, such as lost profits; 90 | 91 | iii) states that any provisions which differ from this Agreement are offered 92 | by that Contributor alone and not by any other party; and 93 | 94 | iv) states that source code for the Program is available from such 95 | Contributor, and informs licensees how to obtain it in a reasonable manner on 96 | or through a medium customarily used for software exchange. 97 | 98 | When the Program is made available in source code form: 99 | 100 | a) it must be made available under this Agreement; and 101 | 102 | b) a copy of this Agreement must be included with each copy of the Program. 103 | 104 | Contributors may not remove or alter any copyright notices contained within 105 | the Program. 106 | 107 | Each Contributor must identify itself as the originator of its Contribution, 108 | if any, in a manner that reasonably allows subsequent Recipients to identify 109 | the originator of the Contribution. 110 | 111 | 4. COMMERCIAL DISTRIBUTION 112 | 113 | Commercial distributors of software may accept certain responsibilities with 114 | respect to end users, business partners and the like. 
While this license is 115 | intended to facilitate the commercial use of the Program, the Contributor who 116 | includes the Program in a commercial product offering should do so in a 117 | manner which does not create potential liability for other Contributors. 118 | Therefore, if a Contributor includes the Program in a commercial product 119 | offering, such Contributor ("Commercial Contributor") hereby agrees to defend 120 | and indemnify every other Contributor ("Indemnified Contributor") against any 121 | losses, damages and costs (collectively "Losses") arising from claims, 122 | lawsuits and other legal actions brought by a third party against the 123 | Indemnified Contributor to the extent caused by the acts or omissions of such 124 | Commercial Contributor in connection with its distribution of the Program in 125 | a commercial product offering. The obligations in this section do not apply 126 | to any claims or Losses relating to any actual or alleged intellectual 127 | property infringement. In order to qualify, an Indemnified Contributor must: 128 | a) promptly notify the Commercial Contributor in writing of such claim, and 129 | b) allow the Commercial Contributor tocontrol, and cooperate with the 130 | Commercial Contributor in, the defense and any related settlement 131 | negotiations. The Indemnified Contributor may participate in any such claim 132 | at its own expense. 133 | 134 | For example, a Contributor might include the Program in a commercial product 135 | offering, Product X. That Contributor is then a Commercial Contributor. If 136 | that Commercial Contributor then makes performance claims, or offers 137 | warranties related to Product X, those performance claims and warranties are 138 | such Commercial Contributor's responsibility alone. 
Under this section, the 139 | Commercial Contributor would have to defend claims against the other 140 | Contributors related to those performance claims and warranties, and if a 141 | court requires any other Contributor to pay any damages as a result, the 142 | Commercial Contributor must pay those damages. 143 | 144 | 5. NO WARRANTY 145 | 146 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, THE PROGRAM IS PROVIDED ON 147 | AN "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER 148 | EXPRESS OR IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR 149 | CONDITIONS OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A 150 | PARTICULAR PURPOSE. Each Recipient is solely responsible for determining the 151 | appropriateness of using and distributing the Program and assumes all risks 152 | associated with its exercise of rights under this Agreement , including but 153 | not limited to the risks and costs of program errors, compliance with 154 | applicable laws, damage to or loss of data, programs or equipment, and 155 | unavailability or interruption of operations. 156 | 157 | 6. DISCLAIMER OF LIABILITY 158 | 159 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, NEITHER RECIPIENT NOR ANY 160 | CONTRIBUTORS SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL, 161 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING WITHOUT LIMITATION 162 | LOST PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 163 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 164 | ARISING IN ANY WAY OUT OF THE USE OR DISTRIBUTION OF THE PROGRAM OR THE 165 | EXERCISE OF ANY RIGHTS GRANTED HEREUNDER, EVEN IF ADVISED OF THE POSSIBILITY 166 | OF SUCH DAMAGES. 167 | 168 | 7. 
GENERAL 169 | 170 | If any provision of this Agreement is invalid or unenforceable under 171 | applicable law, it shall not affect the validity or enforceability of the 172 | remainder of the terms of this Agreement, and without further action by the 173 | parties hereto, such provision shall be reformed to the minimum extent 174 | necessary to make such provision valid and enforceable. 175 | 176 | If Recipient institutes patent litigation against any entity (including a 177 | cross-claim or counterclaim in a lawsuit) alleging that the Program itself 178 | (excluding combinations of the Program with other software or hardware) 179 | infringes such Recipient's patent(s), then such Recipient's rights granted 180 | under Section 2(b) shall terminate as of the date such litigation is filed. 181 | 182 | All Recipient's rights under this Agreement shall terminate if it fails to 183 | comply with any of the material terms or conditions of this Agreement and 184 | does not cure such failure in a reasonable period of time after becoming 185 | aware of such noncompliance. If all Recipient's rights under this Agreement 186 | terminate, Recipient agrees to cease use and distribution of the Program as 187 | soon as reasonably practicable. However, Recipient's obligations under this 188 | Agreement and any licenses granted by Recipient relating to the Program shall 189 | continue and survive. 190 | 191 | Everyone is permitted to copy and distribute copies of this Agreement, but in 192 | order to avoid inconsistency the Agreement is copyrighted and may only be 193 | modified in the following manner. The Agreement Steward reserves the right to 194 | publish new versions (including revisions) of this Agreement from time to 195 | time. No one other than the Agreement Steward has the right to modify this 196 | Agreement. The Eclipse Foundation is the initial Agreement Steward. 
The 197 | Eclipse Foundation may assign the responsibility to serve as the Agreement 198 | Steward to a suitable separate entity. Each new version of the Agreement will 199 | be given a distinguishing version number. The Program (including 200 | Contributions) may always be distributed subject to the version of the 201 | Agreement under which it was received. In addition, after a new version of 202 | the Agreement is published, Contributor may elect to distribute the Program 203 | (including its Contributions) under the new version. Except as expressly 204 | stated in Sections 2(a) and 2(b) above, Recipient receives no rights or 205 | licenses to the intellectual property of any Contributor under this 206 | Agreement, whether expressly, by implication, estoppel or otherwise. All 207 | rights in the Program not expressly granted under this Agreement are 208 | reserved. 209 | 210 | This Agreement is governed by the laws of the State of New York and the 211 | intellectual property laws of the United States of America. No party to this 212 | Agreement will bring a legal action under this Agreement more than one year 213 | after the cause of action arose. Each party waives its rights to a jury trial 214 | in any resulting litigation. 215 | -------------------------------------------------------------------------------- /hadoop/LICENSE: -------------------------------------------------------------------------------- 1 | THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS ECLIPSE PUBLIC 2 | LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION OF THE PROGRAM 3 | CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT. 4 | 5 | 1. 
DEFINITIONS 6 | 7 | "Contribution" means: 8 | 9 | a) in the case of the initial Contributor, the initial code and 10 | documentation distributed under this Agreement, and 11 | 12 | b) in the case of each subsequent Contributor: 13 | 14 | i) changes to the Program, and 15 | 16 | ii) additions to the Program; 17 | 18 | where such changes and/or additions to the Program originate from and are 19 | distributed by that particular Contributor. A Contribution 'originates' from 20 | a Contributor if it was added to the Program by such Contributor itself or 21 | anyone acting on such Contributor's behalf. Contributions do not include 22 | additions to the Program which: (i) are separate modules of software 23 | distributed in conjunction with the Program under their own license 24 | agreement, and (ii) are not derivative works of the Program. 25 | 26 | "Contributor" means any person or entity that distributes the Program. 27 | 28 | "Licensed Patents" mean patent claims licensable by a Contributor which are 29 | necessarily infringed by the use or sale of its Contribution alone or when 30 | combined with the Program. 31 | 32 | "Program" means the Contributions distributed in accordance with this 33 | Agreement. 34 | 35 | "Recipient" means anyone who receives the Program under this Agreement, 36 | including all Contributors. 37 | 38 | 2. GRANT OF RIGHTS 39 | 40 | a) Subject to the terms of this Agreement, each Contributor hereby grants 41 | Recipient a non-exclusive, worldwide, royalty-free copyright license to 42 | reproduce, prepare derivative works of, publicly display, publicly perform, 43 | distribute and sublicense the Contribution of such Contributor, if any, and 44 | such derivative works, in source code and object code form. 
45 | 46 | b) Subject to the terms of this Agreement, each Contributor hereby grants 47 | Recipient a non-exclusive, worldwide, royalty-free patent license under 48 | Licensed Patents to make, use, sell, offer to sell, import and otherwise 49 | transfer the Contribution of such Contributor, if any, in source code and 50 | object code form. This patent license shall apply to the combination of the 51 | Contribution and the Program if, at the time the Contribution is added by the 52 | Contributor, such addition of the Contribution causes such combination to be 53 | covered by the Licensed Patents. The patent license shall not apply to any 54 | other combinations which include the Contribution. No hardware per se is 55 | licensed hereunder. 56 | 57 | c) Recipient understands that although each Contributor grants the licenses 58 | to its Contributions set forth herein, no assurances are provided by any 59 | Contributor that the Program does not infringe the patent or other 60 | intellectual property rights of any other entity. Each Contributor disclaims 61 | any liability to Recipient for claims brought by any other entity based on 62 | infringement of intellectual property rights or otherwise. As a condition to 63 | exercising the rights and licenses granted hereunder, each Recipient hereby 64 | assumes sole responsibility to secure any other intellectual property rights 65 | needed, if any. For example, if a third party patent license is required to 66 | allow Recipient to distribute the Program, it is Recipient's responsibility 67 | to acquire that license before distributing the Program. 68 | 69 | d) Each Contributor represents that to its knowledge it has sufficient 70 | copyright rights in its Contribution, if any, to grant the copyright license 71 | set forth in this Agreement. 72 | 73 | 3. 
REQUIREMENTS 74 | 75 | A Contributor may choose to distribute the Program in object code form under 76 | its own license agreement, provided that: 77 | 78 | a) it complies with the terms and conditions of this Agreement; and 79 | 80 | b) its license agreement: 81 | 82 | i) effectively disclaims on behalf of all Contributors all warranties and 83 | conditions, express and implied, including warranties or conditions of title 84 | and non-infringement, and implied warranties or conditions of merchantability 85 | and fitness for a particular purpose; 86 | 87 | ii) effectively excludes on behalf of all Contributors all liability for 88 | damages, including direct, indirect, special, incidental and consequential 89 | damages, such as lost profits; 90 | 91 | iii) states that any provisions which differ from this Agreement are offered 92 | by that Contributor alone and not by any other party; and 93 | 94 | iv) states that source code for the Program is available from such 95 | Contributor, and informs licensees how to obtain it in a reasonable manner on 96 | or through a medium customarily used for software exchange. 97 | 98 | When the Program is made available in source code form: 99 | 100 | a) it must be made available under this Agreement; and 101 | 102 | b) a copy of this Agreement must be included with each copy of the Program. 103 | 104 | Contributors may not remove or alter any copyright notices contained within 105 | the Program. 106 | 107 | Each Contributor must identify itself as the originator of its Contribution, 108 | if any, in a manner that reasonably allows subsequent Recipients to identify 109 | the originator of the Contribution. 110 | 111 | 4. COMMERCIAL DISTRIBUTION 112 | 113 | Commercial distributors of software may accept certain responsibilities with 114 | respect to end users, business partners and the like. 
While this license is 115 | intended to facilitate the commercial use of the Program, the Contributor who 116 | includes the Program in a commercial product offering should do so in a 117 | manner which does not create potential liability for other Contributors. 118 | Therefore, if a Contributor includes the Program in a commercial product 119 | offering, such Contributor ("Commercial Contributor") hereby agrees to defend 120 | and indemnify every other Contributor ("Indemnified Contributor") against any 121 | losses, damages and costs (collectively "Losses") arising from claims, 122 | lawsuits and other legal actions brought by a third party against the 123 | Indemnified Contributor to the extent caused by the acts or omissions of such 124 | Commercial Contributor in connection with its distribution of the Program in 125 | a commercial product offering. The obligations in this section do not apply 126 | to any claims or Losses relating to any actual or alleged intellectual 127 | property infringement. In order to qualify, an Indemnified Contributor must: 128 | a) promptly notify the Commercial Contributor in writing of such claim, and 129 | b) allow the Commercial Contributor to control, and cooperate with the 130 | Commercial Contributor in, the defense and any related settlement 131 | negotiations. The Indemnified Contributor may participate in any such claim 132 | at its own expense. 133 | 134 | For example, a Contributor might include the Program in a commercial product 135 | offering, Product X. That Contributor is then a Commercial Contributor. If 136 | that Commercial Contributor then makes performance claims, or offers 137 | warranties related to Product X, those performance claims and warranties are 138 | such Commercial Contributor's responsibility alone.
Under this section, the 139 | Commercial Contributor would have to defend claims against the other 140 | Contributors related to those performance claims and warranties, and if a 141 | court requires any other Contributor to pay any damages as a result, the 142 | Commercial Contributor must pay those damages. 143 | 144 | 5. NO WARRANTY 145 | 146 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, THE PROGRAM IS PROVIDED ON 147 | AN "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER 148 | EXPRESS OR IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR 149 | CONDITIONS OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A 150 | PARTICULAR PURPOSE. Each Recipient is solely responsible for determining the 151 | appropriateness of using and distributing the Program and assumes all risks 152 | associated with its exercise of rights under this Agreement , including but 153 | not limited to the risks and costs of program errors, compliance with 154 | applicable laws, damage to or loss of data, programs or equipment, and 155 | unavailability or interruption of operations. 156 | 157 | 6. DISCLAIMER OF LIABILITY 158 | 159 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, NEITHER RECIPIENT NOR ANY 160 | CONTRIBUTORS SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL, 161 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING WITHOUT LIMITATION 162 | LOST PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 163 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 164 | ARISING IN ANY WAY OUT OF THE USE OR DISTRIBUTION OF THE PROGRAM OR THE 165 | EXERCISE OF ANY RIGHTS GRANTED HEREUNDER, EVEN IF ADVISED OF THE POSSIBILITY 166 | OF SUCH DAMAGES. 167 | 168 | 7. 
GENERAL 169 | 170 | If any provision of this Agreement is invalid or unenforceable under 171 | applicable law, it shall not affect the validity or enforceability of the 172 | remainder of the terms of this Agreement, and without further action by the 173 | parties hereto, such provision shall be reformed to the minimum extent 174 | necessary to make such provision valid and enforceable. 175 | 176 | If Recipient institutes patent litigation against any entity (including a 177 | cross-claim or counterclaim in a lawsuit) alleging that the Program itself 178 | (excluding combinations of the Program with other software or hardware) 179 | infringes such Recipient's patent(s), then such Recipient's rights granted 180 | under Section 2(b) shall terminate as of the date such litigation is filed. 181 | 182 | All Recipient's rights under this Agreement shall terminate if it fails to 183 | comply with any of the material terms or conditions of this Agreement and 184 | does not cure such failure in a reasonable period of time after becoming 185 | aware of such noncompliance. If all Recipient's rights under this Agreement 186 | terminate, Recipient agrees to cease use and distribution of the Program as 187 | soon as reasonably practicable. However, Recipient's obligations under this 188 | Agreement and any licenses granted by Recipient relating to the Program shall 189 | continue and survive. 190 | 191 | Everyone is permitted to copy and distribute copies of this Agreement, but in 192 | order to avoid inconsistency the Agreement is copyrighted and may only be 193 | modified in the following manner. The Agreement Steward reserves the right to 194 | publish new versions (including revisions) of this Agreement from time to 195 | time. No one other than the Agreement Steward has the right to modify this 196 | Agreement. The Eclipse Foundation is the initial Agreement Steward. 
The 197 | Eclipse Foundation may assign the responsibility to serve as the Agreement 198 | Steward to a suitable separate entity. Each new version of the Agreement will 199 | be given a distinguishing version number. The Program (including 200 | Contributions) may always be distributed subject to the version of the 201 | Agreement under which it was received. In addition, after a new version of 202 | the Agreement is published, Contributor may elect to distribute the Program 203 | (including its Contributions) under the new version. Except as expressly 204 | stated in Sections 2(a) and 2(b) above, Recipient receives no rights or 205 | licenses to the intellectual property of any Contributor under this 206 | Agreement, whether expressly, by implication, estoppel or otherwise. All 207 | rights in the Program not expressly granted under this Agreement are 208 | reserved. 209 | 210 | This Agreement is governed by the laws of the State of New York and the 211 | intellectual property laws of the United States of America. No party to this 212 | Agreement will bring a legal action under this Agreement more than one year 213 | after the cause of action arose. Each party waives its rights to a jury trial 214 | in any resulting litigation. 215 | -------------------------------------------------------------------------------- /math/LICENSE: -------------------------------------------------------------------------------- 1 | THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS ECLIPSE PUBLIC 2 | LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION OF THE PROGRAM 3 | CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT. 4 | 5 | 1. 
DEFINITIONS 6 | 7 | "Contribution" means: 8 | 9 | a) in the case of the initial Contributor, the initial code and 10 | documentation distributed under this Agreement, and 11 | 12 | b) in the case of each subsequent Contributor: 13 | 14 | i) changes to the Program, and 15 | 16 | ii) additions to the Program; 17 | 18 | where such changes and/or additions to the Program originate from and are 19 | distributed by that particular Contributor. A Contribution 'originates' from 20 | a Contributor if it was added to the Program by such Contributor itself or 21 | anyone acting on such Contributor's behalf. Contributions do not include 22 | additions to the Program which: (i) are separate modules of software 23 | distributed in conjunction with the Program under their own license 24 | agreement, and (ii) are not derivative works of the Program. 25 | 26 | "Contributor" means any person or entity that distributes the Program. 27 | 28 | "Licensed Patents" mean patent claims licensable by a Contributor which are 29 | necessarily infringed by the use or sale of its Contribution alone or when 30 | combined with the Program. 31 | 32 | "Program" means the Contributions distributed in accordance with this 33 | Agreement. 34 | 35 | "Recipient" means anyone who receives the Program under this Agreement, 36 | including all Contributors. 37 | 38 | 2. GRANT OF RIGHTS 39 | 40 | a) Subject to the terms of this Agreement, each Contributor hereby grants 41 | Recipient a non-exclusive, worldwide, royalty-free copyright license to 42 | reproduce, prepare derivative works of, publicly display, publicly perform, 43 | distribute and sublicense the Contribution of such Contributor, if any, and 44 | such derivative works, in source code and object code form. 
45 | 46 | b) Subject to the terms of this Agreement, each Contributor hereby grants 47 | Recipient a non-exclusive, worldwide, royalty-free patent license under 48 | Licensed Patents to make, use, sell, offer to sell, import and otherwise 49 | transfer the Contribution of such Contributor, if any, in source code and 50 | object code form. This patent license shall apply to the combination of the 51 | Contribution and the Program if, at the time the Contribution is added by the 52 | Contributor, such addition of the Contribution causes such combination to be 53 | covered by the Licensed Patents. The patent license shall not apply to any 54 | other combinations which include the Contribution. No hardware per se is 55 | licensed hereunder. 56 | 57 | c) Recipient understands that although each Contributor grants the licenses 58 | to its Contributions set forth herein, no assurances are provided by any 59 | Contributor that the Program does not infringe the patent or other 60 | intellectual property rights of any other entity. Each Contributor disclaims 61 | any liability to Recipient for claims brought by any other entity based on 62 | infringement of intellectual property rights or otherwise. As a condition to 63 | exercising the rights and licenses granted hereunder, each Recipient hereby 64 | assumes sole responsibility to secure any other intellectual property rights 65 | needed, if any. For example, if a third party patent license is required to 66 | allow Recipient to distribute the Program, it is Recipient's responsibility 67 | to acquire that license before distributing the Program. 68 | 69 | d) Each Contributor represents that to its knowledge it has sufficient 70 | copyright rights in its Contribution, if any, to grant the copyright license 71 | set forth in this Agreement. 72 | 73 | 3. 
REQUIREMENTS 74 | 75 | A Contributor may choose to distribute the Program in object code form under 76 | its own license agreement, provided that: 77 | 78 | a) it complies with the terms and conditions of this Agreement; and 79 | 80 | b) its license agreement: 81 | 82 | i) effectively disclaims on behalf of all Contributors all warranties and 83 | conditions, express and implied, including warranties or conditions of title 84 | and non-infringement, and implied warranties or conditions of merchantability 85 | and fitness for a particular purpose; 86 | 87 | ii) effectively excludes on behalf of all Contributors all liability for 88 | damages, including direct, indirect, special, incidental and consequential 89 | damages, such as lost profits; 90 | 91 | iii) states that any provisions which differ from this Agreement are offered 92 | by that Contributor alone and not by any other party; and 93 | 94 | iv) states that source code for the Program is available from such 95 | Contributor, and informs licensees how to obtain it in a reasonable manner on 96 | or through a medium customarily used for software exchange. 97 | 98 | When the Program is made available in source code form: 99 | 100 | a) it must be made available under this Agreement; and 101 | 102 | b) a copy of this Agreement must be included with each copy of the Program. 103 | 104 | Contributors may not remove or alter any copyright notices contained within 105 | the Program. 106 | 107 | Each Contributor must identify itself as the originator of its Contribution, 108 | if any, in a manner that reasonably allows subsequent Recipients to identify 109 | the originator of the Contribution. 110 | 111 | 4. COMMERCIAL DISTRIBUTION 112 | 113 | Commercial distributors of software may accept certain responsibilities with 114 | respect to end users, business partners and the like. 
While this license is 115 | intended to facilitate the commercial use of the Program, the Contributor who 116 | includes the Program in a commercial product offering should do so in a 117 | manner which does not create potential liability for other Contributors. 118 | Therefore, if a Contributor includes the Program in a commercial product 119 | offering, such Contributor ("Commercial Contributor") hereby agrees to defend 120 | and indemnify every other Contributor ("Indemnified Contributor") against any 121 | losses, damages and costs (collectively "Losses") arising from claims, 122 | lawsuits and other legal actions brought by a third party against the 123 | Indemnified Contributor to the extent caused by the acts or omissions of such 124 | Commercial Contributor in connection with its distribution of the Program in 125 | a commercial product offering. The obligations in this section do not apply 126 | to any claims or Losses relating to any actual or alleged intellectual 127 | property infringement. In order to qualify, an Indemnified Contributor must: 128 | a) promptly notify the Commercial Contributor in writing of such claim, and 129 | b) allow the Commercial Contributor to control, and cooperate with the 130 | Commercial Contributor in, the defense and any related settlement 131 | negotiations. The Indemnified Contributor may participate in any such claim 132 | at its own expense. 133 | 134 | For example, a Contributor might include the Program in a commercial product 135 | offering, Product X. That Contributor is then a Commercial Contributor. If 136 | that Commercial Contributor then makes performance claims, or offers 137 | warranties related to Product X, those performance claims and warranties are 138 | such Commercial Contributor's responsibility alone.
Under this section, the 139 | Commercial Contributor would have to defend claims against the other 140 | Contributors related to those performance claims and warranties, and if a 141 | court requires any other Contributor to pay any damages as a result, the 142 | Commercial Contributor must pay those damages. 143 | 144 | 5. NO WARRANTY 145 | 146 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, THE PROGRAM IS PROVIDED ON 147 | AN "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER 148 | EXPRESS OR IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR 149 | CONDITIONS OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A 150 | PARTICULAR PURPOSE. Each Recipient is solely responsible for determining the 151 | appropriateness of using and distributing the Program and assumes all risks 152 | associated with its exercise of rights under this Agreement , including but 153 | not limited to the risks and costs of program errors, compliance with 154 | applicable laws, damage to or loss of data, programs or equipment, and 155 | unavailability or interruption of operations. 156 | 157 | 6. DISCLAIMER OF LIABILITY 158 | 159 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, NEITHER RECIPIENT NOR ANY 160 | CONTRIBUTORS SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL, 161 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING WITHOUT LIMITATION 162 | LOST PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 163 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 164 | ARISING IN ANY WAY OUT OF THE USE OR DISTRIBUTION OF THE PROGRAM OR THE 165 | EXERCISE OF ANY RIGHTS GRANTED HEREUNDER, EVEN IF ADVISED OF THE POSSIBILITY 166 | OF SUCH DAMAGES. 167 | 168 | 7. 
GENERAL 169 | 170 | If any provision of this Agreement is invalid or unenforceable under 171 | applicable law, it shall not affect the validity or enforceability of the 172 | remainder of the terms of this Agreement, and without further action by the 173 | parties hereto, such provision shall be reformed to the minimum extent 174 | necessary to make such provision valid and enforceable. 175 | 176 | If Recipient institutes patent litigation against any entity (including a 177 | cross-claim or counterclaim in a lawsuit) alleging that the Program itself 178 | (excluding combinations of the Program with other software or hardware) 179 | infringes such Recipient's patent(s), then such Recipient's rights granted 180 | under Section 2(b) shall terminate as of the date such litigation is filed. 181 | 182 | All Recipient's rights under this Agreement shall terminate if it fails to 183 | comply with any of the material terms or conditions of this Agreement and 184 | does not cure such failure in a reasonable period of time after becoming 185 | aware of such noncompliance. If all Recipient's rights under this Agreement 186 | terminate, Recipient agrees to cease use and distribution of the Program as 187 | soon as reasonably practicable. However, Recipient's obligations under this 188 | Agreement and any licenses granted by Recipient relating to the Program shall 189 | continue and survive. 190 | 191 | Everyone is permitted to copy and distribute copies of this Agreement, but in 192 | order to avoid inconsistency the Agreement is copyrighted and may only be 193 | modified in the following manner. The Agreement Steward reserves the right to 194 | publish new versions (including revisions) of this Agreement from time to 195 | time. No one other than the Agreement Steward has the right to modify this 196 | Agreement. The Eclipse Foundation is the initial Agreement Steward. 
The 197 | Eclipse Foundation may assign the responsibility to serve as the Agreement 198 | Steward to a suitable separate entity. Each new version of the Agreement will 199 | be given a distinguishing version number. The Program (including 200 | Contributions) may always be distributed subject to the version of the 201 | Agreement under which it was received. In addition, after a new version of 202 | the Agreement is published, Contributor may elect to distribute the Program 203 | (including its Contributions) under the new version. Except as expressly 204 | stated in Sections 2(a) and 2(b) above, Recipient receives no rights or 205 | licenses to the intellectual property of any Contributor under this 206 | Agreement, whether expressly, by implication, estoppel or otherwise. All 207 | rights in the Program not expressly granted under this Agreement are 208 | reserved. 209 | 210 | This Agreement is governed by the laws of the State of New York and the 211 | intellectual property laws of the United States of America. No party to this 212 | Agreement will bring a legal action under this Agreement more than one year 213 | after the cause of action arose. Each party waives its rights to a jury trial 214 | in any resulting litigation. 215 | -------------------------------------------------------------------------------- /hadoop/demo/LICENSE: -------------------------------------------------------------------------------- 1 | THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS ECLIPSE PUBLIC 2 | LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION OF THE PROGRAM 3 | CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT. 4 | 5 | 1. 
DEFINITIONS 6 | 7 | "Contribution" means: 8 | 9 | a) in the case of the initial Contributor, the initial code and 10 | documentation distributed under this Agreement, and 11 | 12 | b) in the case of each subsequent Contributor: 13 | 14 | i) changes to the Program, and 15 | 16 | ii) additions to the Program; 17 | 18 | where such changes and/or additions to the Program originate from and are 19 | distributed by that particular Contributor. A Contribution 'originates' from 20 | a Contributor if it was added to the Program by such Contributor itself or 21 | anyone acting on such Contributor's behalf. Contributions do not include 22 | additions to the Program which: (i) are separate modules of software 23 | distributed in conjunction with the Program under their own license 24 | agreement, and (ii) are not derivative works of the Program. 25 | 26 | "Contributor" means any person or entity that distributes the Program. 27 | 28 | "Licensed Patents" mean patent claims licensable by a Contributor which are 29 | necessarily infringed by the use or sale of its Contribution alone or when 30 | combined with the Program. 31 | 32 | "Program" means the Contributions distributed in accordance with this 33 | Agreement. 34 | 35 | "Recipient" means anyone who receives the Program under this Agreement, 36 | including all Contributors. 37 | 38 | 2. GRANT OF RIGHTS 39 | 40 | a) Subject to the terms of this Agreement, each Contributor hereby grants 41 | Recipient a non-exclusive, worldwide, royalty-free copyright license to 42 | reproduce, prepare derivative works of, publicly display, publicly perform, 43 | distribute and sublicense the Contribution of such Contributor, if any, and 44 | such derivative works, in source code and object code form. 
45 | 46 | b) Subject to the terms of this Agreement, each Contributor hereby grants 47 | Recipient a non-exclusive, worldwide, royalty-free patent license under 48 | Licensed Patents to make, use, sell, offer to sell, import and otherwise 49 | transfer the Contribution of such Contributor, if any, in source code and 50 | object code form. This patent license shall apply to the combination of the 51 | Contribution and the Program if, at the time the Contribution is added by the 52 | Contributor, such addition of the Contribution causes such combination to be 53 | covered by the Licensed Patents. The patent license shall not apply to any 54 | other combinations which include the Contribution. No hardware per se is 55 | licensed hereunder. 56 | 57 | c) Recipient understands that although each Contributor grants the licenses 58 | to its Contributions set forth herein, no assurances are provided by any 59 | Contributor that the Program does not infringe the patent or other 60 | intellectual property rights of any other entity. Each Contributor disclaims 61 | any liability to Recipient for claims brought by any other entity based on 62 | infringement of intellectual property rights or otherwise. As a condition to 63 | exercising the rights and licenses granted hereunder, each Recipient hereby 64 | assumes sole responsibility to secure any other intellectual property rights 65 | needed, if any. For example, if a third party patent license is required to 66 | allow Recipient to distribute the Program, it is Recipient's responsibility 67 | to acquire that license before distributing the Program. 68 | 69 | d) Each Contributor represents that to its knowledge it has sufficient 70 | copyright rights in its Contribution, if any, to grant the copyright license 71 | set forth in this Agreement. 72 | 73 | 3. 
REQUIREMENTS 74 | 75 | A Contributor may choose to distribute the Program in object code form under 76 | its own license agreement, provided that: 77 | 78 | a) it complies with the terms and conditions of this Agreement; and 79 | 80 | b) its license agreement: 81 | 82 | i) effectively disclaims on behalf of all Contributors all warranties and 83 | conditions, express and implied, including warranties or conditions of title 84 | and non-infringement, and implied warranties or conditions of merchantability 85 | and fitness for a particular purpose; 86 | 87 | ii) effectively excludes on behalf of all Contributors all liability for 88 | damages, including direct, indirect, special, incidental and consequential 89 | damages, such as lost profits; 90 | 91 | iii) states that any provisions which differ from this Agreement are offered 92 | by that Contributor alone and not by any other party; and 93 | 94 | iv) states that source code for the Program is available from such 95 | Contributor, and informs licensees how to obtain it in a reasonable manner on 96 | or through a medium customarily used for software exchange. 97 | 98 | When the Program is made available in source code form: 99 | 100 | a) it must be made available under this Agreement; and 101 | 102 | b) a copy of this Agreement must be included with each copy of the Program. 103 | 104 | Contributors may not remove or alter any copyright notices contained within 105 | the Program. 106 | 107 | Each Contributor must identify itself as the originator of its Contribution, 108 | if any, in a manner that reasonably allows subsequent Recipients to identify 109 | the originator of the Contribution. 110 | 111 | 4. COMMERCIAL DISTRIBUTION 112 | 113 | Commercial distributors of software may accept certain responsibilities with 114 | respect to end users, business partners and the like. 
While this license is 115 | intended to facilitate the commercial use of the Program, the Contributor who 116 | includes the Program in a commercial product offering should do so in a 117 | manner which does not create potential liability for other Contributors. 118 | Therefore, if a Contributor includes the Program in a commercial product 119 | offering, such Contributor ("Commercial Contributor") hereby agrees to defend 120 | and indemnify every other Contributor ("Indemnified Contributor") against any 121 | losses, damages and costs (collectively "Losses") arising from claims, 122 | lawsuits and other legal actions brought by a third party against the 123 | Indemnified Contributor to the extent caused by the acts or omissions of such 124 | Commercial Contributor in connection with its distribution of the Program in 125 | a commercial product offering. The obligations in this section do not apply 126 | to any claims or Losses relating to any actual or alleged intellectual 127 | property infringement. In order to qualify, an Indemnified Contributor must: 128 | a) promptly notify the Commercial Contributor in writing of such claim, and 129 | b) allow the Commercial Contributor tocontrol, and cooperate with the 130 | Commercial Contributor in, the defense and any related settlement 131 | negotiations. The Indemnified Contributor may participate in any such claim 132 | at its own expense. 133 | 134 | For example, a Contributor might include the Program in a commercial product 135 | offering, Product X. That Contributor is then a Commercial Contributor. If 136 | that Commercial Contributor then makes performance claims, or offers 137 | warranties related to Product X, those performance claims and warranties are 138 | such Commercial Contributor's responsibility alone. 
Under this section, the 139 | Commercial Contributor would have to defend claims against the other 140 | Contributors related to those performance claims and warranties, and if a 141 | court requires any other Contributor to pay any damages as a result, the 142 | Commercial Contributor must pay those damages. 143 | 144 | 5. NO WARRANTY 145 | 146 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, THE PROGRAM IS PROVIDED ON 147 | AN "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER 148 | EXPRESS OR IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR 149 | CONDITIONS OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A 150 | PARTICULAR PURPOSE. Each Recipient is solely responsible for determining the 151 | appropriateness of using and distributing the Program and assumes all risks 152 | associated with its exercise of rights under this Agreement , including but 153 | not limited to the risks and costs of program errors, compliance with 154 | applicable laws, damage to or loss of data, programs or equipment, and 155 | unavailability or interruption of operations. 156 | 157 | 6. DISCLAIMER OF LIABILITY 158 | 159 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, NEITHER RECIPIENT NOR ANY 160 | CONTRIBUTORS SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL, 161 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING WITHOUT LIMITATION 162 | LOST PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 163 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 164 | ARISING IN ANY WAY OUT OF THE USE OR DISTRIBUTION OF THE PROGRAM OR THE 165 | EXERCISE OF ANY RIGHTS GRANTED HEREUNDER, EVEN IF ADVISED OF THE POSSIBILITY 166 | OF SUCH DAMAGES. 167 | 168 | 7. 
GENERAL 169 | 170 | If any provision of this Agreement is invalid or unenforceable under 171 | applicable law, it shall not affect the validity or enforceability of the 172 | remainder of the terms of this Agreement, and without further action by the 173 | parties hereto, such provision shall be reformed to the minimum extent 174 | necessary to make such provision valid and enforceable. 175 | 176 | If Recipient institutes patent litigation against any entity (including a 177 | cross-claim or counterclaim in a lawsuit) alleging that the Program itself 178 | (excluding combinations of the Program with other software or hardware) 179 | infringes such Recipient's patent(s), then such Recipient's rights granted 180 | under Section 2(b) shall terminate as of the date such litigation is filed. 181 | 182 | All Recipient's rights under this Agreement shall terminate if it fails to 183 | comply with any of the material terms or conditions of this Agreement and 184 | does not cure such failure in a reasonable period of time after becoming 185 | aware of such noncompliance. If all Recipient's rights under this Agreement 186 | terminate, Recipient agrees to cease use and distribution of the Program as 187 | soon as reasonably practicable. However, Recipient's obligations under this 188 | Agreement and any licenses granted by Recipient relating to the Program shall 189 | continue and survive. 190 | 191 | Everyone is permitted to copy and distribute copies of this Agreement, but in 192 | order to avoid inconsistency the Agreement is copyrighted and may only be 193 | modified in the following manner. The Agreement Steward reserves the right to 194 | publish new versions (including revisions) of this Agreement from time to 195 | time. No one other than the Agreement Steward has the right to modify this 196 | Agreement. The Eclipse Foundation is the initial Agreement Steward. 
The 197 | Eclipse Foundation may assign the responsibility to serve as the Agreement 198 | Steward to a suitable separate entity. Each new version of the Agreement will 199 | be given a distinguishing version number. The Program (including 200 | Contributions) may always be distributed subject to the version of the 201 | Agreement under which it was received. In addition, after a new version of 202 | the Agreement is published, Contributor may elect to distribute the Program 203 | (including its Contributions) under the new version. Except as expressly 204 | stated in Sections 2(a) and 2(b) above, Recipient receives no rights or 205 | licenses to the intellectual property of any Contributor under this 206 | Agreement, whether expressly, by implication, estoppel or otherwise. All 207 | rights in the Program not expressly granted under this Agreement are 208 | reserved. 209 | 210 | This Agreement is governed by the laws of the State of New York and the 211 | intellectual property laws of the United States of America. No party to this 212 | Agreement will bring a legal action under this Agreement more than one year 213 | after the cause of action arose. Each party waives its rights to a jury trial 214 | in any resulting litigation. 215 | -------------------------------------------------------------------------------- /math/src/tesser/math.clj: -------------------------------------------------------------------------------- 1 | (ns tesser.math 2 | "Folds over numbers! Calculate sums, means, variance, standard deviation, 3 | covariance and linear correlations, and matrices thereof, plus quantiles and 4 | histograms estimates backed by probabilistic QDigests." 
5 | (:require [tesser.core :as t :refer [deftransform]] 6 | [tesser.utils :refer :all] 7 | [tesser.quantiles :as q] 8 | [clojure.core.reducers :as r] 9 | [clojure.math.numeric-tower :refer [sqrt]] 10 | [clojure.math.combinatorics :as combo] 11 | [clojure.set :as set] 12 | [clojure.core :as core])) 13 | 14 | (deftransform sum 15 | "Finds the sum of numeric elements." 16 | [] 17 | (assert (nil? downstream)) 18 | {:reducer-identity (constantly 0) 19 | :reducer + 20 | :post-reducer identity 21 | :combiner-identity (constantly 0) 22 | :combiner + 23 | :post-combiner identity}) 24 | 25 | (deftransform mean 26 | "Finds the arithmetic mean of numeric inputs." 27 | [] 28 | (assert (nil? downstream)) 29 | {:reducer-identity (constantly [0 0]) 30 | :reducer (fn reducer [[s c] x] 31 | [(+ s x) (inc c)]) 32 | :post-reducer identity 33 | :combiner-identity (constantly [0 0]) 34 | :combiner (fn combiner [x y] (core/mapv + x y)) ; eager, so repeated combines don't nest lazy seqs 35 | :post-combiner (fn post-combiner [x] 36 | (double (/ (first x) (core/max 1 (last x)))))}) 37 | 38 | (deftransform variance 39 | "Unbiased variance estimation. Given numeric inputs, returns their 40 | variance." 41 | [] 42 | (assert (nil? downstream)) 43 | {:reducer-identity (constantly [0 0 0]) 44 | :reducer (fn count-mean-sq [[count mean sum-of-squares] x] 45 | (let [count' (inc count) 46 | mean' (+ mean (/ (- x mean) count'))] 47 | [count' 48 | mean' 49 | (+ sum-of-squares (* (- x mean') (- x mean)))])) 50 | :post-reducer identity 51 | :combiner-identity (constantly [0 0 0]) 52 | :combiner (fn partcmsq [[c m sq] [c2 m2 sq2]] 53 | (let [count (+ c c2)] 54 | (if (zero? count) 55 | [c m sq] 56 | [count 57 | (/ (+ (* c m) (* c2 m2)) count) 58 | (+ sq sq2 (/ (* (- m2 m) (- m2 m) c c2) count))]))) 59 | :post-combiner (fn vardiv [x] (double (/ (last x) (core/max 1 (dec (first x))))))}) 60 | 61 | (defn standard-deviation 62 | "Estimates the standard deviation of numeric inputs."
63 | [& [f]] 64 | (->> f (variance) (t/post-combine sqrt))) 65 | 66 | (deftransform covariance 67 | "Given two functions of an input `(fx input)` and `(fy input)`, each of which 68 | returns a number, estimates the covariance of those functions over 69 | inputs. Divides by the number of inputs n (population covariance), not n - 1. 70 | 71 | Ignores any inputs where `(fx input)` or `(fy input)` are nil. If no inputs 72 | have both x and y, returns nil." 73 | [fx fy] 74 | (assert (nil? downstream)) 75 | {:reducer-identity (constantly [0 0 0 0]) 76 | :reducer (fn count-mean2-sq [[count meanx meany sum-of-squares 77 | :as current-state] elt] 78 | (let [x (fx elt) 79 | y (fy elt)] 80 | (if (or (nil? x) (nil? y)) 81 | current-state 82 | (let [count' (inc count) 83 | meanx' (+ meanx (/ (- x meanx) count')) 84 | meany' (+ meany (/ (- y meany) count'))] 85 | [count' 86 | meanx' 87 | meany' 88 | (+ sum-of-squares (* (- x meanx') (- y meany)))])))) 89 | :post-reducer identity 90 | :combiner-identity (constantly [0 0 0 0]) 91 | :combiner (fn partcm2sq [[c mx my sq :as current-state] [c2 mx2 my2 sq2]] 92 | (let [count (+ c c2)] 93 | (if (zero? count) 94 | current-state 95 | [count 96 | (/ (+ (* c mx) (* c2 mx2)) count) 97 | (/ (+ (* c my) (* c2 my2)) count) 98 | (+ sq sq2 (/ (* (- mx2 mx) (- my2 my) c c2) count))]))) 99 | :post-combiner (fn vardiv [[c _ _ sq]] 100 | (when (pos? c) ; Return nil if no inputs 101 | (double (/ sq c))))}) 102 | 103 | (defn fuse-matrix 104 | "Given: 105 | 106 | 1. A function like `covariance` that takes two functions of an input and 107 | yields a fold, and 108 | 2. A map of key names to functions that extract values for 109 | those keys from an input, 110 | 111 | fuse-matrix computes that fold over each *pair* of keys, returning a map 112 | of name pairs to the result of that pairwise fold over the inputs. You can 113 | think of this like an N^2 version of `fuse`."
114 | [fold keymap & [downstream]] 115 | (->> downstream 116 | ; For this transform, map inputs to a temporary map of keys->values; 117 | ; we'll be doing O(keys) lookups on each key, so having a flat map cuts 118 | ; down on having to re-run expensive extractor functions. 119 | (t/map (fn project [input] 120 | (->> keymap 121 | (core/reduce-kv (fn extract [m k extractor] 122 | (assoc! m k (extractor input))) 123 | (transient {})) 124 | persistent!))) 125 | 126 | ; And pass those maps into a fused transform built from `fold` 127 | (t/fuse (->> (combo/combinations (core/keys keymap) 2) 128 | ; Turn pairs into [pair, pairwise-fold] 129 | (core/map (fn [[k1 k2]] 130 | [[k1 k2] (fold #(get % k1) #(get % k2))])) 131 | (core/into {}))) 132 | 133 | ; And return both halves of the resulting triangular matrix 134 | (t/post-combine complete-triangular-matrix))) 135 | 136 | (defn covariance-matrix 137 | "Given a map of key names to functions that extract values for those keys 138 | from an input, computes the covariance for each of the n^2 key pairs, 139 | returning a map of name pairs to their covariance. For example: 140 | 141 | (m/covariance-matrix {:name-length #(.length (:name %)) 142 | :age :age 143 | :num-cats (comp count :cats)})" 144 | [& args] 145 | (apply fuse-matrix covariance args)) 146 | 147 | (deftransform correlation+count 148 | "Given two functions: (fx input) and (fy input), each of which returns a 149 | number, estimates the unbiased linear correlation coefficient between fx and 150 | fy over inputs. Ignores any records where fx or fy are nil. If there are no 151 | records with values for fx and fy, the correlation is nil. See 152 | http://mathworld.wolfram.com/CorrelationCoefficient.html. 153 | 154 | This function returns a map of correlation and count, like 155 | 156 | {:correlation 0.34 :count 142} 157 | 158 | which is useful for significance testing."
159 | [fx fy] 160 | {:reducer-identity (constantly [0 0 0 0 0 0]) 161 | :reducer (->> (fn count-m2-sq3 [[count meanx meany ssx ssy ssxy :as acc] elt] 162 | (let [x (fx elt) 163 | y (fy elt)] 164 | (if-not (and x y) 165 | acc 166 | (let [count' (inc count) 167 | meanx' (+ meanx (/ (- x meanx) count')) 168 | meany' (+ meany (/ (- y meany) count'))] 169 | [count' 170 | meanx' 171 | meany' 172 | (+ ssx (* (- x meanx') (- x meanx))) 173 | (+ ssy (* (- y meany') (- y meany))) 174 | (+ ssxy (* (- x meanx') (- y meany)))]))))) 175 | :post-reducer identity 176 | :combiner-identity (constantly [0 0 0 0 0 0]) 177 | :combiner (fn partcm2sq3 [[c mx my ssx ssy ssxy] 178 | [c2 mx2 my2 ssx2 ssy2 ssxy2]] 179 | (let [count (+ c c2)] 180 | (if (zero? count) 181 | [c mx my ssx ssy ssxy] 182 | [count 183 | (/ (+ (* c mx) (* c2 mx2)) count) 184 | (/ (+ (* c my) (* c2 my2)) count) 185 | (+ ssx ssx2 (/ (* (- mx2 mx) (- mx2 mx) c c2) count)) 186 | (+ ssy ssy2 (/ (* (- my2 my) (- my2 my) c c2) count)) 187 | (+ ssxy ssxy2 (/ (* (- mx2 mx) (- my2 my) c c2) count))]))) 188 | :post-combiner (fn corrdiv [[c mx my ssx ssy ssxy]] 189 | (let [div (sqrt (* ssx ssy))] 190 | (when-not (zero? div) 191 | {:count c 192 | :correlation (/ ssxy div)})))}) 193 | 194 | (defn correlation+count-matrix 195 | "Given a map of key names to functions that extract values for those keys 196 | from an input, computes the correlations for each of the n^2 key 197 | pairs, returning a map of name pairs to the their correlations and counts. 198 | See correlation+count. 
For example: 199 | 200 | (m/correlation+count-matrix {:name-length #(.length (:name %)) 201 | :age :age 202 | :num-cats (comp count :cats)}) 203 | 204 | will, when executed, return a map like 205 | 206 | {[:name-length :age] {:count 150 :correlation 0.56} 207 | [:name-length :num-cats] {:count 150 :correlation 0.95} 208 | ...}" 209 | [& args] 210 | (apply fuse-matrix correlation+count args)) 211 | 212 | (defn correlation 213 | "Like correlation+count, but only returns the correlation." 214 | [& args] 215 | (->> args 216 | (apply correlation+count) 217 | (t/post-combine :correlation))) 218 | 219 | (defn correlation-matrix 220 | "Like correlation+count-matrix, but returns just correlation coefficients 221 | instead of maps of :correlation and :count." 222 | [& args] 223 | (apply fuse-matrix correlation args)) 224 | 225 | (deftransform digest 226 | "You've got a set of numeric inputs and want to know their quantiles 227 | distribution, histogram, etc. This fold takes numeric inputs and 228 | produces a statistical estimate of their distribution. 229 | 230 | `digest` takes a function that returns a `tesser.quantiles/Digest`. The fold 231 | returns an instance of that digest. 232 | 233 | For example, to compute an HDRHistogram over both positive and negative 234 | doubles (or longs, rationals, etc): 235 | 236 | Compute a digest using e.g. 237 | 238 | (def digest (->> (m/digest q/hdr-histogram) 239 | (t/tesser [[1 1 1 1 1 1 2 2 2 3 3 4 5]]))) 240 | ; => # 241 | 242 | To specify options for the digest, just use partial or (fn [] ...) 243 | 244 | (m/digest (partial q/hdr-histogram {:significant-value-digits 4 245 | :highest-to-lowest-value-ratio 1e6})) 246 | 247 | DoubleHistogram, like many quantile estimators, only works over positive 248 | values.
To cover positives and negatives together, use 249 | `tesser.quantiles/dual`: 250 | 251 | (m/digest #(q/dual q/hdr-histogram {:significant-value-digits 2})) 252 | 253 | Once you've computed a digest, you can find a particular quantile using 254 | `tesser.quantiles/quantile` 255 | 256 | (q/quantile digest 0) ; => 1.0 257 | (q/quantile digest 0.5) ; => 1.0 258 | (q/quantile digest 4/5) ; => 2.0009765625 259 | (q/quantile digest 1) ; => 3.0009765625 260 | 261 | The total number of points in the sample: 262 | 263 | (q/point-count digest) ; => 5 264 | 265 | Minima and maxima: 266 | 267 | (q/min digest) ; => 1.0 268 | (q/max digest) ; => 3.0009765625 269 | 270 | Or find the distribution of values less than or equal to each point, with 271 | resolution given by the internal granularity of the digest: 272 | 273 | (q/distribution digest) 274 | ; => ([1.0 3] [2.0009765625 1] [3.0009765625 1]) 275 | 276 | (q/cumulative-distribution digest) 277 | ; => ([1.0 3] [2.0009765625 4] [3.0009765625 5]) 278 | 279 | You don't have to return the whole digest; any of these derivative 280 | operations can be merged directly into the fold via 281 | `tesser.core/post-combine`. 282 | 283 | (->> (m/digest q/hdr-histogram) 284 | (t/post-combine #(q/quantile % 1/2)) 285 | (t/tesser [[1 2 2 3 3 3 3 3 3 3 3]])) 286 | ; => 3.0009765625 287 | 288 | You may also use `tesser.cardinality/hll` for estimating the cardinality of a 289 | set. HLL+ uses a probabilistic data-structure to compute set cardinality using 290 | very little memory with accuracy tradeoffs. 
291 | 292 | The HLL digest can be used like the above mentioned histograms: 293 | 294 | (def digest (->> (m/digest cardinality/hll) 295 | (t/tesser [[1 1 1 1 1 1 2 2 2 3 3 4 5]]))) 296 | ; => # 297 | 298 | Getting the cardinality out through a post-combine step: 299 | 300 | (->> (m/digest cardinality/hll) 301 | (t/post-combine #(q/point-count %)) 302 | (t/tesser [[1 2 2 3 3 3 3 3 3 3 3]])) 303 | ; => 3 304 | 305 | I want to emphasize that depending on the size of your data, its 306 | distribution, and the number of digests you want to compute, you may need 307 | different digest algorithms and widely varying tuning parameters. Until we 308 | have a better grasp of the space/error tradeoffs here, I won't choose 309 | defaults for you." 310 | [digest-generator] 311 | (assert (nil? downstream)) 312 | {:reducer-identity digest-generator 313 | :reducer q/add-point! 314 | :post-reducer identity 315 | :combiner-identity digest-generator 316 | :combiner q/merge-digest! 317 | :post-combiner identity}) 318 | -------------------------------------------------------------------------------- /core/test/tesser/core_test.clj: -------------------------------------------------------------------------------- 1 | (ns tesser.core-test 2 | (:require [clojure.test :refer :all] 3 | [clojure.test.check :as tc] 4 | [clojure.test.check [clojure-test :refer :all] 5 | [generators :as gen] 6 | [properties :as prop]] 7 | [clojure.core.reducers :as r] 8 | [multiset.core :refer [multiset]] 9 | [tesser.utils :refer :all] 10 | [tesser.core :as t])) 11 | 12 | (def test-opts {:num-tests 1000 13 | :par 256}) 14 | 15 | (prn test-opts) 16 | 17 | (defn option 18 | "Generator that may return nil." 19 | [gen] 20 | (gen/one-of [(gen/return nil) gen])) 21 | 22 | (defn chunks 23 | "Given a generator for inputs, returns a generator that builds 24 | sequences of sequences of inputs." 25 | [input-gen] 26 | (gen/vector (gen/vector input-gen) 0 5)) 27 | 28 | (defn flatten1 29 | "Flattens a single level." 
30 | [seq-of-seqs] 31 | (apply concat seq-of-seqs)) 32 | 33 | ;; Tests 34 | 35 | (defspec chunk-spec 36 | test-opts 37 | (prop/for-all [inputs (gen/vector gen/int)] 38 | (is (= (->> (t/map inc) 39 | (t/frequencies) 40 | (t/tesser (t/chunk 10 inputs))) 41 | (->> inputs 42 | (map inc) 43 | frequencies))))) 44 | 45 | (defspec fold-full-spec 46 | test-opts 47 | (prop/for-all [chunks (chunks gen/int)] 48 | (is (= (->> (t/filter even?) 49 | (t/fold {:reducer-identity vector 50 | :reducer conj 51 | :post-reducer (comp vec rseq) 52 | :combiner-identity sorted-set 53 | :combiner conj 54 | :post-combiner reverse}) 55 | (t/tesser chunks)) 56 | (->> chunks 57 | (map (fn [chunk] 58 | (->> chunk 59 | (filter even?) 60 | (into []) 61 | rseq 62 | vec))) 63 | (into (sorted-set)) 64 | reverse))))) 65 | 66 | (defspec fold-reducer-spec 67 | test-opts 68 | (prop/for-all [chunks (chunks gen/int)] 69 | (is (= (->> (t/fold {:identity hash-set 70 | :reducer conj 71 | :combiner into}) 72 | (t/tesser chunks)) 73 | (->> chunks flatten1 set))))) 74 | 75 | 76 | (defspec fold-fn-spec 77 | test-opts 78 | (prop/for-all [chunks (chunks gen/int)] 79 | (is (= (->> (t/fold +) 80 | (t/tesser chunks)) 81 | (->> chunks flatten1 (reduce +)))))) 82 | 83 | (defspec transform-spec 84 | {:num-tests 1000 ; same option key as test-opts 85 | :par 1} 86 | (prop/for-all [chunks (chunks gen/int)] 87 | (is (= (->> (t/transform #(assoc % :combiner-identity 88 | (constantly #{:hi}))) 89 | (t/set) 90 | (t/tesser chunks)) 91 | (->> chunks flatten1 (cons :hi) set))))) 92 | 93 | (defspec wrap-transform-spec 94 | {:num-tests 1000 ; same option key as test-opts 95 | :par 1} 96 | (prop/for-all [chunks (chunks gen/int)] 97 | (is (= (->> (t/set) 98 | (t/wrap-transform #(assoc % :combiner-identity 99 | (constantly #{:hi}))) 100 | (t/tesser chunks)) 101 | (->> chunks flatten1 (cons :hi) set))))) 102 | 103 | (defspec map-spec 104 | test-opts 105 | (prop/for-all [chunks (chunks gen/int)] 106 | (is (= (->> (t/map inc) 107 | (t/map (partial * 2)) 108 | (t/into (multiset)) 109 | (t/tesser
chunks)) 110 | (->> chunks 111 | flatten1 112 | (map inc) 113 | (map (partial * 2)) 114 | (into (multiset))))))) 115 | 116 | (defspec replace-spec 117 | test-opts 118 | (prop/for-all [chunks (chunks (option gen/boolean))] 119 | (let [subs {true nil 120 | nil false 121 | false true}] 122 | (is (= (->> (t/replace subs) 123 | (t/into (multiset)) 124 | (t/tesser chunks)) 125 | (->> chunks 126 | flatten1 127 | (replace subs) 128 | (into (multiset)))))))) 129 | 130 | (defspec mapcat-spec 131 | test-opts 132 | (prop/for-all [chunks (chunks gen/int)] 133 | (is (= (->> (t/mapcat range) 134 | (t/filter even?) 135 | (t/into (multiset)) 136 | (t/tesser chunks)) 137 | (->> chunks 138 | flatten1 139 | (mapcat range) 140 | (filter even?) 141 | (into (multiset))))))) 142 | 143 | (defspec keep-spec 144 | test-opts 145 | (prop/for-all [chunks (chunks gen/int)] 146 | (is (= (->> (t/keep #(when (even? %) (inc %))) 147 | (t/into (multiset)) 148 | (t/tesser chunks)) 149 | (->> chunks 150 | flatten1 151 | (keep #(when (even? %) (inc %))) 152 | (into (multiset))))))) 153 | 154 | (defspec filter-spec 155 | test-opts 156 | (prop/for-all [chunks (chunks gen/int)] 157 | (is (= (->> (t/filter odd?) 158 | (t/into (multiset)) 159 | (t/tesser chunks)) 160 | (->> chunks 161 | flatten1 162 | (filter odd?) 163 | (into (multiset))))))) 164 | 165 | (defspec remove-spec 166 | test-opts 167 | (prop/for-all [chunks (chunks gen/int)] 168 | (is (= (->> (t/remove odd?) 169 | (t/into (multiset)) 170 | (t/tesser chunks)) 171 | (->> chunks 172 | flatten1 173 | (remove odd?) 
174 | (into (multiset))))))) 175 | 176 | (defspec reduce-spec 177 | test-opts 178 | (prop/for-all [chunks (chunks gen/int)] 179 | (is (= (->> (t/map inc) 180 | (t/reduce + 0) 181 | (t/tesser chunks)) 182 | (->> chunks 183 | flatten1 184 | (r/map inc) 185 | (reduce + 0)))))) 186 | 187 | (defspec into-vec-spec 188 | test-opts 189 | (prop/for-all [chunks (chunks gen/int)] 190 | (is (= (sort (t/tesser chunks (t/into []))) 191 | (sort (flatten1 chunks)))))) 192 | 193 | (defspec into-set-spec 194 | test-opts 195 | (prop/for-all [chunks (chunks gen/int)] 196 | (is (= (t/tesser chunks (t/into #{})) 197 | (set (flatten1 chunks)))))) 198 | 199 | (defspec take-spec 200 | test-opts 201 | ; Our chunks will be random partitionings of the integers, and we'll take 202 | ; them into a vector, then verify it contains n unique elements. 203 | (prop/for-all [n gen/pos-int 204 | sizes (gen/vector gen/pos-int 0 10)] 205 | ; (prn) (prn) 206 | (let [total (reduce + sizes) 207 | chunks (->> sizes 208 | (reduce (fn [[start chunks] size] 209 | [(+ start size) 210 | (->> start 211 | (iterate inc) 212 | (take size) 213 | (conj chunks))]) 214 | [0 []]) 215 | second)] 216 | ; (prn :n n :chunks chunks) 217 | (let [x (->> (t/take n) 218 | (t/into []) 219 | (t/tesser chunks))] 220 | ; (prn :total total :n n :result x) 221 | (is (and (or (= (count x) n) ; Exactly n inputs 222 | (and (< (count x) n) ; Not enough to hit n 223 | (= (count x) total))) 224 | ; Unique 225 | (= (count x) 226 | (count (set x))))))))) 227 | 228 | (defspec take-take-spec 229 | test-opts 230 | ; Our chunks will be random partitionings of the integers, and we'll take 231 | ; them into a vector, then verify it contains n unique elements. Performing 232 | ; two takes verifies that the reduced optimizations compose well. 
:) 233 | (prop/for-all [n gen/pos-int 234 | sizes (gen/vector gen/pos-int 0 10)] 235 | ; (prn) (prn) 236 | (let [total (reduce + sizes) 237 | chunks (->> sizes 238 | (reduce (fn [[start chunks] size] 239 | [(+ start size) 240 | (->> start 241 | (iterate inc) 242 | (take size) 243 | (conj chunks))]) 244 | [0 []]) 245 | second)] 246 | ; (prn :n n :chunks chunks) 247 | (let [x (->> (t/take (inc n)) 248 | (t/take n) 249 | (t/into []) 250 | (t/tesser chunks))] 251 | ; (prn :total total :n n :result x) 252 | (is (and (or (= (count x) n) ; Exactly n inputs 253 | (and (< (count x) n) ; Not enough to hit n 254 | (= (count x) total))) 255 | ; Unique 256 | (= (count x) 257 | (count (set x))))))))) 258 | 259 | (defspec post-combine-spec ; Two stacked post-combines (sort, then read-string over each element) must give the same result as the equivalent clojure.core pipeline over the flattened chunks 260 | test-opts 261 | (prop/for-all [chunks (chunks gen/int)] 262 | (is (= (->> (t/map inc) 263 | (t/map str) 264 | (t/into #{}) 265 | (t/post-combine sort) 266 | (t/post-combine (partial map read-string)) 267 | (t/tesser chunks)) 268 | (->> (flatten1 chunks) 269 | (map inc) 270 | (map str) 271 | (into #{}) 272 | sort 273 | (map read-string)))))) 274 | 275 | ;; Splitting folds 276 | 277 | (defspec group-by-spec ; Grouping by (mod x 3) and collecting each group into a multiset must match clojure.core/group-by with each group's values turned into a multiset 278 | test-opts 279 | (prop/for-all [chunks (chunks gen/int)] 280 | (let [g #(mod % 3)] 281 | (is (= (->> (t/group-by g) 282 | (t/into (multiset)) 283 | (t/tesser chunks)) 284 | (->> (flatten1 chunks) 285 | (group-by g) 286 | (map (fn [[k vs]] [k (apply multiset vs)])) 287 | (into {}))))))) 288 | 289 | (defspec group-by-post-reducer-spec ; Per-group maximum via a reducer whose accumulator is a [:secret m] pair and whose 1-arity unwraps m; must match (reduce max 0 ...) over each clojure.core group 290 | test-opts 291 | (prop/for-all [chunks (chunks gen/int)] 292 | (let [g #(mod % 3)] 293 | (is (= (->> (flatten1 chunks) 294 | (group-by g) 295 | (map-vals (partial reduce max 0))) 296 | (->> (t/group-by g) 297 | ; In each group, find maximum, but with a weird 298 | ; intermediate datatype 299 | (t/fold {:reducer 300 | (fn 301 | ([] [:secret 0]) 302 | ([[_ m]] m) 303 | ([[_ m] x] [:secret (max m x)]))}) 304 | (t/tesser chunks))))))) 305 | 306 | 307 | (defspec facet-spec 308 | test-opts 309 | ; Min
over maps of keywords to ints 310 | (prop/for-all [chunks (chunks 311 | (gen/resize 4 (gen/map (gen/elements [:a :b :c :d :e]) 312 | gen/int)))] 313 | (is (= (->> (t/facet) 314 | (t/min) 315 | (t/tesser chunks)) 316 | (->> chunks 317 | flatten1 318 | (apply merge-with min {})))))) 319 | 320 | (defspec fuse-spec 321 | test-opts 322 | ; max, set, and multiset over ints 323 | (prop/for-all [chunks (chunks gen/int)] 324 | (let [inputs (flatten1 chunks)] 325 | (is (= (->> (t/fuse {:max (t/max) 326 | :set (t/into #{}) 327 | :multiset (t/into (multiset))}) 328 | (t/tesser chunks)) 329 | {:max (when-not (empty? inputs) 330 | (reduce max inputs)) 331 | :set (set inputs) 332 | :multiset (into (multiset) inputs)}))))) 333 | 334 | ;; Basic reductions 335 | 336 | (defspec count-spec ; t/count must equal the number of elements across all chunks 337 | test-opts 338 | (prop/for-all [chunks (chunks gen/int)] 339 | (is (= (t/tesser chunks (t/count)) 340 | (count (flatten1 chunks)))))) 341 | 342 | (defspec set-spec 343 | test-opts 344 | (prop/for-all [chunks (chunks gen/int)] 345 | (is (= (t/tesser chunks (t/set)) 346 | (set (flatten1 chunks)))))) 347 | 348 | (defspec frequencies-spec 349 | test-opts 350 | (prop/for-all [chunks (chunks gen/int)] 351 | (is (= (t/tesser chunks (t/frequencies)) 352 | (frequencies (flatten1 chunks)))))) 353 | 354 | (defspec some-spec ; Uses the set #{1} as the predicate, so the expected result is 1 when present and nil otherwise 355 | test-opts 356 | (prop/for-all [chunks (chunks gen/int)] 357 | (is (= (t/tesser chunks (t/some #{1})) 358 | (some #{1} (flatten1 chunks)))))) 359 | 360 | (defspec any-spec ; The result must be the first element of some non-empty chunk, or nil when no chunk has elements 361 | test-opts 362 | (prop/for-all [chunks (chunks gen/int)] 363 | (let [e (t/tesser chunks (t/any)) 364 | candidates (->> chunks 365 | (filter seq) 366 | (map first) 367 | set)] 368 | (is (or (contains? candidates e) 369 | (and (empty? candidates) (nil? e))))))) 370 | 371 | (defspec last-spec ; The result must be the last element of some non-empty chunk, or nil when no chunk has elements 372 | test-opts 373 | (prop/for-all [chunks (chunks gen/int)] 374 | (let [e (t/tesser chunks (t/last)) 375 | candidates (->> chunks 376 | (filter seq) 377 | (map last) 378 | set)] 379 | (is (or (contains?
candidates e) 380 | (and (empty? candidates) (nil? e))))))) 381 | 382 | ;; Predicate folds 383 | (defspec empty?-spec ; t/empty? must agree with clojure.core/empty? on the flattened chunks 384 | test-opts 385 | (prop/for-all [chunks (chunks (option gen/boolean))] 386 | (is (= (t/tesser chunks (t/empty?)) 387 | (empty? (flatten1 chunks)))))) 388 | 389 | (defspec every?-spec 390 | test-opts 391 | (prop/for-all [chunks (chunks gen/int)] 392 | (is (= (t/tesser chunks (t/every? odd?)) 393 | (every? odd? (flatten1 chunks)))))) 394 | 395 | (defspec not-every?-spec 396 | test-opts 397 | (prop/for-all [chunks (chunks gen/int)] 398 | (is (= (t/tesser chunks (t/not-every? odd?)) 399 | (not-every? odd? (flatten1 chunks)))))) 400 | 401 | ;; Comparable folds 402 | 403 | (defspec max-spec ; No `is` wrapper here: the property body returns the boolean directly. Expects nil when every chunk is empty, otherwise the maximum of all inputs 404 | test-opts 405 | (prop/for-all [chunks (chunks gen/int)] 406 | (let [m (t/tesser chunks (t/max))] 407 | (if (every? empty? chunks) 408 | (nil? m) 409 | (= m (reduce max (flatten1 chunks))))))) 410 | 411 | (defspec min-spec ; No `is` wrapper here: the property body returns the boolean directly. Expects nil when every chunk is empty, otherwise the minimum of all inputs 412 | test-opts 413 | (prop/for-all [chunks (chunks gen/int)] 414 | (let [m (t/tesser chunks (t/min))] 415 | (if (every? empty? chunks) 416 | (nil? m) 417 | (= m (reduce min (flatten1 chunks))))))) 418 | 419 | (defspec range-spec 420 | test-opts 421 | (prop/for-all [chunks (chunks gen/int)] 422 | (let [inputs (flatten1 chunks)] 423 | (= (t/tesser chunks (t/range)) 424 | (if (every? empty?
chunks) 425 | [nil nil] 426 | [(reduce min inputs) (reduce max inputs)]))))) 427 | -------------------------------------------------------------------------------- /README.markdown: -------------------------------------------------------------------------------- 1 | | API docs | Clojars | Purpose | 2 | |----------|---------|---------| 3 | | [tesser.core](http://aphyr.github.io/tesser/tesser.core.html) | [tesser.core](https://clojars.org/tesser.core) | The core library and essential folds | 4 | | [tesser.math](http://aphyr.github.io/tesser/tesser.math.html) | [tesser.math](https://clojars.org/tesser.math) | Statistical folds: means, correlations, covariance matrices, quantiles, etc. | 5 | 6 | # Tesser 7 | 8 | > "Now we will tesser, we will wrinkle again. Do you understand?" "No," 9 | > Meg said flatly. Mrs. Whatsit sighed. "Explanations are not easy when they 10 | > are about things for which your civilization still has no words. Calvin 11 | > talked about traveling at the speed of light. You understand that, little 12 | > Meg?" "Yes," Meg nodded. "That, of course, is the impractical, long way 13 | > around. We have learned to take short cuts wherever possible." "Sort of 14 | > like in math?" Meg asked. "Like in math." 15 | 16 | — Madeline L'Engle, *A Wrinkle In Time*. 17 | 18 | ## A Motivating Example 19 | 20 | You've got a big pile of data--say, JSON in files on disk, or TSVs in 21 | Hadoop--and you'd like to reduce over that data: computing some statistics, 22 | searching for special values, etc. You might want to find the median housing 23 | price in each city given a collection of all sales, or find the total mass of 24 | all main-sequence stars in a region of sky, or search for an anticorrelation 25 | between vaccine use and the prevalence of a disease. These are all *folds*: 26 | collapsing a collection of data into a smaller value. 27 | 28 | In Clojure, we're used to writing programs like 29 | 30 | ```clj 31 | (->> stars 32 | (filter main-sequence?) 
33 | (map :mass) 34 | (reduce +)) 35 | ``` 36 | 37 | But this reduction is *single-threaded*, and can only run on a single machine. 38 | You've got 48 cores in your desktop computer. Why aren't they all helping? 39 | 40 | ```clj 41 | (require '[tesser.core :as t]) 42 | (->> (t/filter main-sequence?) 43 | (t/map :mass) 44 | (t/fold +) 45 | (t/tesser (t/chunk 1024 stars))) 46 | ``` 47 | 48 | Tesser goes much deeper, but this is the essence: writing understandable, 49 | composable, *fast* programs for exploring datasets. 50 | 51 | ## A Clojure Library for Concurrent & Commutative Folds 52 | 53 | Tesser gives us a library for building up *folds*, and applying those folds to 54 | a collection of *inputs*, divided into *chunks*. Chunks are reduced with 55 | maximal parallelism, and the results of those reductions are reduced together. 56 | We call the concurrent reduction `:reducer`, and the serial reduction 57 | `:combiner`. 58 | 59 | ![Reduce/combine diagram](/img/reduce-combine.jpg) 60 | 61 | In order to reduce over a chunk, we need an *initial value*. A 62 | `:reducer-identity` function generates this initial value. Once the reduction 63 | is complete, we may want to transform it using a `:post-reducer` function--for 64 | instance, converting from a transient to a persistent data structure, or 65 | discarding internal reducer state. 66 | 67 | ![Diagram of reducer identities and post-reduce](/img/reducer-identity-post.jpg) 68 | 69 | Likewise, we need a `:combiner-identity` function to generate an initial value 70 | for the combine reduction, and a final `:post-combiner` function to transform 71 | the combiner's output. 72 | 73 | ![Diagram of combiner identity and post-combine](/img/combiner-identity-post.jpg) 74 | 75 | Tesser folds are *associative* and *commutative* monoids: they can be applied 76 | concurrently and do not preserve order. We make this tradeoff to reduce the 77 | need for coordination between reducers.
Not all concurrent folds 78 | require/provide commutativity: Algebird, for example, provides ordered 79 | distributed folds, and clojure.core.reducers preserves order as well. Right 80 | now Tesser doesn't, but we might change that someday. 81 | 82 | Unlike CRDTs, Tesser's folds do not require idempotence. In Tesser, 1 + 1 is 2, 83 | not 1. Each input factors into the reduced value exactly once--though the 84 | order is up to Tesser. 85 | 86 | ## Representing a Fold 87 | 88 | If all you need is a quick-n-dirty replacement for an existing `reduce` or 89 | `fold`, just drop `tesser.simple` into your program. 90 | 91 | ```clj 92 | (require '[tesser.simple :as s]) 93 | user=> (s/reduce + 0 (range 10000000)) 94 | 49999995000000 95 | ``` 96 | 97 | But Tesser is capable of *composing* folds together--combining simple functions 98 | to express complex ideas. In Tesser, we represent a compiled fold as a map of six 99 | functions: 100 | 101 | ```clj 102 | {:reducer-identity (fn [] ...) 103 | :reducer (fn [accumulator input] ...) 104 | :post-reducer (fn [accumulator] ...) 105 | :combiner-identity (fn [] ...) 106 | :combiner (fn [accumulator post-reducer-result] ...) 107 | :post-combiner (fn [accumulator] ...)} 108 | ``` 109 | 110 | For instance, here's a fold to find the sum of all inputs. While the reducer 111 | and combiner often have the same accumulator type and identities, 112 | this is not always the case. 113 | 114 | ```clj 115 | (require '[tesser.core :as t]) 116 | (t/tesser [[1] [2 3]] 117 | (t/fold {:reducer-identity (constantly 0) 118 | :reducer + 119 | :post-reducer identity 120 | :combiner-identity (constantly 0) 121 | :combiner + 122 | :post-combiner identity})) 123 | ; => 6 124 | ``` 125 | 126 | Since `(+)` returns `0` (the additive identity), we can leave off the 127 | identities; `t/fold` will use the reducer as the identity function.
The same 128 | goes for the post-reducer and post-combiner: `(+ x)` returns `x`, so we can 129 | leave them off too: 130 | 131 | ```clj 132 | (t/tesser [[1] [2 3]] 133 | (t/fold {:reducer + 134 | :combiner +})) 135 | ; => 6 136 | ``` 137 | 138 | Or simply pass a function to `t/fold`, which will be used for all 6 functions. 139 | 140 | ```clj 141 | (t/tesser [[1] [2 3]] (t/fold +)) 142 | ; => 6 143 | ``` 144 | 145 | But that's not all; we can transform the summing fold into one that operates on strings like "1" with the `map` function: 146 | 147 | ```clj 148 | (->> (t/map read-string) 149 | (t/fold +) 150 | (t/tesser [["1" "2" "3"]])) 151 | ; => 6 152 | ``` 153 | 154 | Tesser provides a rich library of fold transforms, allowing you to build up 155 | complex folds out of simple, modular parts. 156 | 157 | ## Core 158 | 159 | [Tesser.core](http://aphyr.github.io/tesser/tesser.core.html) looks a lot like 160 | the Clojure seq API, and many of its functions have similar names. Their 161 | semantics differ, however: Tesser folds do not preserve the order of inputs, 162 | and when executed, they run in *parallel*. 163 | 164 | Applying a fold using `tesser.core/tesser` uses multiple threads proportional 165 | to processor cores. Unlike reducers, we don't use the Java forkjoin pool, just 166 | plain old threads; it avoids contention issues and improves performance on most 167 | JDKs. 168 | 169 | ```clj 170 | (require '[tesser.core :as t]) 171 | 172 | (t/tesser [[1 2 3] [4 5 6]] (t/into [] (t/map str))) 173 | => ["4" "5" "6" "1" "2" "3"] 174 | 175 | (->> (t/map inc) ; Increment each number 176 | (t/filter odd?) 
; Take only odd numbers 177 | (t/take 5) ; *which* five odd numbers are selected is arbitrary 178 | (t/mapcat range) ; Explode each odd number n into the numbers from 0 to n-1 179 | (t/frequencies) ; Compute the frequency of appearances 180 | (t/tesser (partition 3 (range 100)))) 181 | => {0 5, 7 2, 20 1, 27 1, 1 4, 24 1, 4 3, 15 2, 21 1, 13 2, 22 1, 6 2, 28 1, 25 1, 17 1, 3 3, 12 2, 2 4, 23 1, 19 1, 11 2, 9 2, 5 2, 14 2, 26 1, 16 2, 10 2, 18 1, 8 2} 182 | ``` 183 | 184 | Fold combinators like 185 | [facet](http://aphyr.github.io/tesser/tesser.core.html#var-facet) and 186 | [fuse](http://aphyr.github.io/tesser/tesser.core.html#var-fuse) allow multiple 187 | reductions to be done in a single pass, possibly sharing expensive operations 188 | like deserialization. This is a particularly effective way of working with a 189 | set of data files on disk or in Hadoop. 190 | 191 | Given JSON records about a codebase like 192 | 193 | ```clj 194 | {"year": 2004, 195 | "lines-of-code": {"ruby": 100, 196 | "c": 1693}} 197 | ``` 198 | 199 | We can find the range of years *and* the total lines of code in each language 200 | in a single pass. 201 | 202 | ```clj 203 | (->> (t/map #(json/parse-string % true)) 204 | (t/fuse {:year-range (t/range (t/map :year)) 205 | :total-code (->> (t/map :lines-of-code) 206 | (t/facet) 207 | (t/reduce + 0))}) 208 | (t/tesser records)) 209 | 210 | => {:year-range [1986 2014] 211 | :total-code {:c 153423, 212 | :ruby 4578, 213 | :tcl 3453 214 | :bf 1}} 215 | ``` 216 | 217 | Ready to get started? Start with the [tesser.core docs](http://aphyr.github.io/tesser/tesser.core.html) 218 | 219 | ## Math 220 | 221 | [Tesser.math](http://aphyr.github.io/tesser/tesser.math.html) provides 222 | statistical folds. Some, like sum, mean, and correlation, are exact. Others, 223 | like quantiles, are estimates.
224 | 225 | For instance, to find a Pearson correlation matrix between ln(age), height, and 226 | weight; and in the same pass, to find the total number of samples: 227 | 228 | ```clj 229 | (require '[tesser.math :as m]) 230 | (->> (t/fuse {:count (t/count) 231 | :corrs (m/correlation-matrix {:log-age #(Math/log (:age %)) 232 | :height :height 233 | :weight :weight})}) 234 | (t/tesser records)) 235 | 236 | => {:count 7123525 237 | :corrs {[:log-age :weight] 0.74 238 | [:log-age :height] 0.86 239 | [:weight :height] 0.91 240 | ... and the same keys in symmetric order ...}} 241 | ``` 242 | 243 | Ready? [To the tesser.math API!](http://aphyr.github.io/tesser/tesser.math.html) 244 | 245 | ## Hadoop 246 | 247 | *Update, 2021-03-01: Parkour, the library tesser.hadoop used to talk to Hadoop, 248 | hasn't been maintained in some time: its dependencies are now unresolvable. 249 | Tesser is likely still a good fit for Hadoop, but I need someone who actually 250 | uses Hadoop to take this over. PRs, please!* 251 | 252 | The [tesser.hadoop API](http://aphyr.github.io/tesser/tesser.hadoop.html) takes 253 | Tesser folds and distributes them using the 254 | [Parkour](https://github.com/damballa/parkour) Hadoop library. You can test 255 | your folds locally, then run them on a cluster to reduce over huge datasets. 256 | 257 | ```clj 258 | (require '[tesser [core :as t] 259 | [math :as m] 260 | [hadoop :as h]]) 261 | 262 | (defn analyze 263 | "A fold that analyzes measurements of trees from a certain location."
264 | [location] 265 | (->> (t/map parse-record) 266 | (t/filter #(= location (:location %))) 267 | (t/fuse {:count (t/count) 268 | :oldest (->> (t/map :age) 269 | (t/max)) 270 | :corrs (m/correlation-matrix 271 | {:age :age 272 | :log-mass #(Math/log (:mass %)) 273 | :growth-rings :growth-rings 274 | :humidity :humidity})}))) 275 | 276 | (h/fold conf 277 | (text/dseq "hdfs:/some/file/part-*") 278 | "hdfs:/tmp/tesser" 279 | #'analyze "Redwood National Park") 280 | ``` 281 | 282 | See the [Hadoop demo 283 | project](https://github.com/aphyr/tesser/tree/master/hadoop/demo) for an 284 | example of how to run a fold in Hadoop. 285 | 286 | ## An integrative example 287 | 288 | Let's say we're public health researchers, and we're trying to understand what 289 | factors influence the prevalence of a communicable disease. There are three 290 | medications, x, y, and z, which show promise for preventing its spread, but not everyone takes medications regularly. We have anonymized case reports for each infection, like: 291 | 292 | ```clj 293 | {:year 2011 ; The year the infection was reported 294 | :age 22 ; The patient's age 295 | :zip 94110 ; The patient's zip code 296 | :primary-facility "City Clinic" ; Where does this person go for care? 297 | :medications [:x, :y] ; The medications this person has been using 298 | :compliance {:x 0.9 ; How often do they adhere to the 299 | :y 0.6}} ; prescribed dosage? 300 | ``` 301 | 302 | ```clj 303 | (require '[tesser.core :as t] 304 | '[tesser.math :as m] 305 | '[tesser.quantiles :as q]) 306 | 307 | (defn fold 308 | "Computes aggregate statistics over infection cases" 309 | [] 310 | (t/fuse 311 | {; Total number of cases 312 | :total (t/count) 313 | 314 | ; How has the number of cases changed over time?
315 | :trend (->> (t/group-by :year) 316 | (t/count)) 317 | 318 | ; Number of cases for the year 2018, broken down by zip code 319 | :recent-cases (->> (t/filter #(= 2018 (:year %))) 320 | (t/group-by :zip) 321 | (t/count)) 322 | 323 | ; A sorted map of ages to the number of cases with that age 324 | :age-range (->> (t/map :age) 325 | (t/frequencies) 326 | (t/post-combine (partial into (sorted-map)))) 327 | 328 | ; How many cases were on no medications at all? 329 | :no-meds (->> (t/filter (comp empty? :medications)) 330 | (t/count)) 331 | 332 | ; A histogram of medication compliance, broken down by medication type 333 | :med-compliance (->> (t/map :compliance) 334 | (t/facet) 335 | (m/digest (partial q/dual q/hdr-histogram)) 336 | (t/post-combine q/distribution)) 337 | 338 | ; Correlation and co-occurrence count between age and number of medications 339 | :age-meds (m/correlation+count :age (comp count :medications)) 340 | 341 | ; The covariance matrix between medication compliance--are any pair of 342 | ; medications linearly covariant? 343 | :med-cov (->> (t/map :compliance) 344 | (m/covariance-matrix {:x :x 345 | :y :y 346 | :z :z}))})) 347 | ``` 348 | 349 | Because these folds are collection-independent, and defined in small chunks, we 350 | can break them up into functions, write small tests to verify each fold's 351 | behavior independently, then compose them into larger programs. We're free to 352 | name transformations at any level by binding them to `let` variables or 353 | `defn`s, or to build complex folds in a single pass. 354 | 355 | ## Invariants 356 | 357 | In order for Tesser to execute a fold concurrently, a fold must obey some 358 | simple invariants. 359 | 360 | - All functions should be deterministic functions purely of their inputs. 361 | Constructing two copies of the same fold on, say, different nodes in a 362 | cluster should result in equivalent behavior. You *cannot* squirrel away state 363 | in a lexical closure, for instance.
364 | - Accumulators may be mutable, and folds never modify the same accumulator 365 | concurrently. You may mutate the accumulator in a reducer, combiner, or 366 | post-fn without synchronization. 367 | - Reducers and combiners must be associative: `(f a (f b c))` = `(f (f a b) 368 | c)`. 369 | - Reducers and combiners must be commutative: `(f a b)` = `(f b a)`. 370 | - Reducers and combiners may emit `reduced` values, which force immediate 371 | completion of that particular reduce or combine. If a reducer emits a reduced 372 | value, it has no impact on the execution of other reducers, or the combiner. 373 | 374 | ## Performance 375 | 376 | When the computation you're performing on each input dominates, Tesser should 377 | be significantly faster than `clojure.core/reduce` and somewhat faster than 378 | `clojure.core.reducers/fold`. Tesser performs worst when the reducing 379 | operations are cheap compared to the cost of traversing the collection. Even in 380 | these cases, Tesser's performance on laptop and server-class x64 hardware ain't 381 | too shabby. 382 | 383 | On a 48-way (including HT) E5-2697, summing 10 million random longs: 384 | 385 | | Collection | Clojure reduce | Reducers fold | Tesser | 386 | |------------|----------------|---------------|----------| 387 | | Array | 460 MHz | 420 MHz | 2900 MHz | 388 | | Vector | 490 MHz | 4300 MHz | 4700 MHz | 389 | 390 | And the equivalent of `(->> (map inc) (filter even?) (reduce +))` over those 10 391 | million longs: 392 | 393 | | Collection | Clojure reduce | Reducers fold | Tesser | 394 | |------------|----------------|---------------|----------| 395 | | Array | 43 MHz | 270 MHz | 2400 MHz | 396 | | Vector | 120 MHz | 3400 MHz | 3200 MHz | 397 | 398 | Run `lein test :bench` to reproduce results on your hardware. 399 | 400 | In general, Tesser... 
401 | 402 | - Sees the same benefits of stream fusion as reducers: fewer allocations for 403 | intermediate seqs, 404 | - Can parallelize over primitive arrays (unlike reducers), and 405 | - Reduces thread contention versus Java forkjoin (used by `reducers/fold`). 406 | 407 | However, Tesser cannot automatically partition and traverse vectors as 408 | efficiently as core.reducers can, which makes it slightly slower when you have 409 | a single vector and ask Tesser to partition it for you. Passing a series of 410 | vectors to `tesser.core/tesser` makes Tesser's traversal costs identical to 411 | core.reducers. 412 | 413 | There's also some low-hanging fruit in `fuse`, `take`, etc. that can be 414 | optimized later; we allocate fresh vectors to box reduction state on every new 415 | input, instead of clobbering a variable in place. Haven't gotten around to 416 | tuning that yet. 417 | 418 | In real-world applications, Tesser has significantly improved single-node 419 | performance relative to `reducers/fold`, but YMMV. 420 | 421 | ## Vs Reducers and Transducers 422 | 423 | Clojure's reducers and transducers embody sequential folds: they move from left 424 | to right over a sequence. Reducers also includes a less-well-known 425 | *hierarchical* fold which parallelizes a reduction via Java's fork-join pool, 426 | but this reduction is still fundamentally ordered and local to a single 427 | machine. 428 | 429 | Tesser explores a different niche. It offers: 430 | 431 | - *Commutativity.* Tesser folds must not depend on the order of inputs. 432 | - *Concurrency.* Reductions over independent chunks require no coordination, 433 | making them good candidates for distributed contexts like Hadoop. 434 | - *Stream fusion.* Like Reducers and Transducers, `map`, `filter`, etc. are all 435 | folded into a single reduction function. Intermediate values are 436 | stack-allocated, reducing GC load. 
437 | - *Collection independence.* Like Transducers, Tesser folds are abstract 438 | transformations and can be re-used against varying types of collections. 439 | 440 | ## Building 441 | 442 | - Update project versions in core/project.clj, math/project.clj, hadoop/project.clj, all/project.clj 443 | 444 | Test and install 445 | 446 | ``` 447 | cd core/ && lein do test, install && cd ../ 448 | cd math/ && lein do test, install && cd ../ 449 | cd hadoop/ && lein do test, install && cd ../ 450 | ``` 451 | 452 | Commit 453 | 454 | ``` 455 | VERSION="x.x.x" 456 | git commit -a -m "Version $VERSION" 457 | git tag "v$VERSION" 458 | git push 459 | git push --tags 460 | ``` 461 | 462 | Deploy 463 | 464 | ``` 465 | cd core/ && lein deploy clojars && cd ../ 466 | cd math/ && lein deploy clojars && cd ../ 467 | cd hadoop/ && lein deploy clojars && cd ../ 468 | ``` 469 | 470 | Rebuild documentation 471 | 472 | ``` 473 | cd all/ && lein codox 474 | ``` 475 | 476 | Docs commit on gh-pages branch (assumes you've got all/doc set up as 477 | gh-pages) 478 | 479 | ``` 480 | cd all/doc && git commit -am "Docs for version x.x.x" 481 | git push 482 | ``` 483 | 484 | ## Contributors 485 | 486 | - [Kyle Kingsbury](mailto:aphyr@aphyr.com) 487 | - [Natasha Whitney](mailto:natiwhitney@gmail.com) 488 | - Factual, Inc 489 | 490 | ## License 491 | 492 | Eclipse Public License v1.0, same as Clojure. 493 | --------------------------------------------------------------------------------