├── Gemfile
├── Gemfile.lock
├── bin
│   ├── simmer
│   └── install-algebird-snapshot.rb
├── src
│   └── main
│       └── scala
│           └── com
│               └── stripe
│                   └── simmer
│                       ├── package.scala
│                       ├── UDPInput.scala
│                       ├── IO.scala
│                       ├── Main.scala
│                       ├── Parallel.scala
│                       ├── Redis.scala
│                       ├── Http.scala
│                       ├── Registry.scala
│                       ├── Core.scala
│                       └── Aggregators.scala
├── Rakefile
├── pom.xml
└── README.md
/Gemfile:
--------------------------------------------------------------------------------
gem 'rake'
--------------------------------------------------------------------------------
/Gemfile.lock:
--------------------------------------------------------------------------------
GEM
  specs:
    rake (0.9.2.2)

PLATFORMS
  ruby

DEPENDENCIES
  rake
--------------------------------------------------------------------------------
/bin/simmer:
--------------------------------------------------------------------------------
#!/usr/bin/env ruby

# Launch the JVM with the Maven dependency classpath (from .classpath)
# plus our compiled classes.
ROOT = File.expand_path(File.join(File.dirname(__FILE__), '..'))
classpath = File.read(ROOT + "/.classpath") + ":#{ROOT}/target/classes"
exec("java -Xmx2G -cp #{classpath} com.stripe.simmer.Main " + ARGV.join(" "))
--------------------------------------------------------------------------------
/bin/install-algebird-snapshot.rb:
--------------------------------------------------------------------------------
#!/usr/bin/env ruby

# Install a locally-built algebird-core SNAPSHOT jar/pom into the local Maven repo.
Dir["../algebird/algebird-core/target/scala-2.9.2/*"].each do |file|
  if file =~ /algebird-core_2\.9\.2-(.*-SNAPSHOT)\.(jar|pom)$/
    system("mvn install:install-file -Dfile=#{file} -DgroupId=com.twitter -DartifactId=algebird-core_2.9.2 -Dversion=#{$1} -Dpackaging=#{$2}")
  end
end
--------------------------------------------------------------------------------
/src/main/scala/com/stripe/simmer/package.scala:
--------------------------------------------------------------------------------
package com.stripe

package object simmer {
  // Split str at the first occurrence of delim, returning Some((head, rest)),
  // or None if there is no delimiter or the head is empty.
  def split(str : String, delim : String) = {
    val parts = str.split(delim)
    val head = parts.head
    if(parts.size > 1 && head.size > 0)
      Some((head, str.drop(head.size + 1)))
    else
      None
  }
}
--------------------------------------------------------------------------------
/src/main/scala/com/stripe/simmer/UDPInput.scala:
--------------------------------------------------------------------------------
package com.stripe.simmer

import java.io._
import java.net._

// Reads one tab-delimited key/value row per UDP datagram.
class UDPInput(port : Int) extends Input {
  def run(simmer : Simmer) {
    System.err.println("Listening on UDP port " + port)

    val sock = new DatagramSocket(port)
    val buf = new Array[Byte](1024)
    while(true) {
      val packet = new DatagramPacket(buf, buf.length)
      sock.receive(packet)
      val str = new String(packet.getData, 0, packet.getLength)
      val columns = str.split("\t")
      if(columns.size > 1)
        simmer.update(columns(0), columns(1))
    }
  }
}
--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
def find_scalac
  scalac = ["zinc", "scalac"].map{|c| `which #{c}`}.detect{|c| c != ""}
  unless scalac
    $stderr.puts "Could not find a Scala compiler"
    exit(1)
  end
  scalac.chomp!
  scalac += " -nailed" if(scalac =~ /zinc/)
  scalac
end

task :default => ["build"]

task "build" => [".classpath"] do |t|
  Dir.mkdir("target") unless Dir.exists?("target")
  Dir.mkdir("target/classes") unless Dir.exists?("target/classes")

  classpath = File.read(".classpath")
  sources = Dir["src/**/*.scala"]
  sh "#{find_scalac} -d target/classes -cp #{classpath} #{sources.join(' ')}"
end

file ".classpath" => ["pom.xml"] do |t|
  sh "rm -f .classpath && mvn -q dependency:build-classpath -Dmdep.outputFile=.classpath"
end
--------------------------------------------------------------------------------
/src/main/scala/com/stripe/simmer/IO.scala:
--------------------------------------------------------------------------------
package com.stripe.simmer

import com.twitter.util.Future

trait Input {
  def run(simmer : Simmer)
}

trait Output {
  def write[A](key : String, value : A, aggregator : Aggregator[A]) : Boolean
}

trait Lookup {
  def read(key : String) : Future[Option[(String,String)]]
}


object StdInput extends Input {
  def run(simmer : Simmer) {
    for(line <- io.Source.stdin.getLines) {
      val columns = line.split("\t")
      if(columns.size > 1)
        simmer.update(columns(0), columns(1))
    }

    System.exit(0)
  }
}

object StdOutput extends Output {
  def write[A](key : String, value : A, aggregator : Aggregator[A]) = {
    println(key + "\t" + aggregator.serialize(value) + "\t" + aggregator.present(value))
    true
  }
}

object NullLookup extends Lookup {
  def read(key : String) = Future.value(None)
}
--------------------------------------------------------------------------------
/src/main/scala/com/stripe/simmer/Main.scala:
--------------------------------------------------------------------------------
package com.stripe.simmer

import org.rogach.scallop._

object Main {
  def main(args : Array[String]) {
    AlgebirdAggregators.load

    object Conf extends ScallopConf(args) {
      val version = "0.0.1"
      val capacity = opt[Int]("capacity", 'c', "maximum number of keys to keep in memory", Some(5000))
      val flushEvery = opt[Int]("flush", 'f', "flush a key once it hits this many values")
      val udp = opt[Int]("udp", 'u', "UDP port to listen for input")
      val redis = opt[String]("redis", 'r', "connect to Redis at host:port")
      val http = opt[Int]("http", 'h', "TCP port to listen for HTTP queries")
    }

    val input = Conf.udp.get match {
      case Some(port) => new UDPInput(port)
      case None => StdInput
    }

    val redis = Conf.redis.get.map{host => new Redis(host)}
    val output = redis.getOrElse(StdOutput)

    val simmer = new Simmer(output, Conf.capacity(), Conf.flushEvery.get)

    for(port <- Conf.http) {
      val lookup = redis.getOrElse(NullLookup)
      new Http(port, simmer, lookup)
    }

    input.run(simmer)
  }
}
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.stripe</groupId>
  <artifactId>simmer</artifactId>
  <version>0.0.1</version>

  <dependencies>
    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-library</artifactId>
      <version>2.9.2</version>
    </dependency>
    <dependency>
      <groupId>com.twitter</groupId>
      <artifactId>algebird-core_2.9.2</artifactId>
      <version>0.1.13</version>
    </dependency>
    <dependency>
      <groupId>com.twitter</groupId>
      <artifactId>chill_2.9.2</artifactId>
      <version>0.2.0</version>
    </dependency>
    <dependency>
      <groupId>org.rogach</groupId>
      <artifactId>scallop_2.9.2</artifactId>
      <version>0.9.1</version>
    </dependency>
    <dependency>
      <groupId>com.twitter</groupId>
      <artifactId>finagle-redis_2.9.2</artifactId>
      <version>6.3.0</version>
    </dependency>
    <dependency>
      <groupId>com.twitter</groupId>
      <artifactId>finagle-http_2.9.2</artifactId>
      <version>6.3.0</version>
    </dependency>
  </dependencies>
</project>
--------------------------------------------------------------------------------
/src/main/scala/com/stripe/simmer/Parallel.scala:
--------------------------------------------------------------------------------
package com.stripe.simmer
import java.util.concurrent._
import scala.concurrent.ops._

//disabling for now
/*
class ParallelScrubber(nShards : Int = 16) {
  val shards = (1 to nShards).map{i => new Shard}
  val rand = new scala.util.Random

  def update(aggKey : String, valueKey : String, value : String) {
    val mainShardIndex = (valueKey.hashCode % nShards + nShards) % nShards
    if(!shards(mainShardIndex).queue.offer((aggKey, valueKey, value)))
      shards(rand.nextInt(nShards)).queue.put((aggKey, valueKey, value))
  }


  def flush(output : Output) {
    while(shards.exists{_.queue.size > 0})
      Thread.sleep(100)

    System.err.println("flushing")

    val mergeOutput = new MergeOutput
    shards.foreach{_.scrubber.flush(mergeOutput)}
    mergeOutput.flush(output)
  }
}

class Shard {
  val queue = new ArrayBlockingQueue[(String,String,String)](1000)
  val scrubber = new Scrubber

  spawn {
    while(true) {
      val (aggKey, valueKey, value) = queue.take
      scrubber.update(aggKey, valueKey, value)
    }
  }
}


class MergeOutput extends Output {
  val scrubber = new Scrubber

  def write[A](valueKey : String, value : A, aggKey : String, aggregator : Aggregator[A]) {
    System.err.println("merging " + valueKey)
    scrubber.update(aggKey, valueKey, aggregator.serialize(value))
  }

  def flush(output: Output) {
    scrubber.flush(output)
  }
}
*/
--------------------------------------------------------------------------------
/src/main/scala/com/stripe/simmer/Redis.scala:
--------------------------------------------------------------------------------
package com.stripe.simmer

import com.twitter.finagle.redis.{TransactionalClient, ClientError}
import com.twitter.finagle.redis.util._
import com.twitter.finagle.redis.protocol.{Set => SetCommand}
import com.twitter.util.Future

class Redis(host : String) extends Output with Lookup {
  System.err.println("Connecting to redis at " + host)
  val client = TransactionalClient(host)

  // Read-modify-write under WATCH: merge the new value into whatever is
  // already stored at the key, and return false if the transaction fails
  // (for example, because of a concurrent write).
  def write[A](key : String, value : A, aggregator : Aggregator[A]) : Boolean = {
    val keyCB = StringToChannelBuffer(key)

    val future = client.watch(List(keyCB)).flatMap { unit =>
      client.get(keyCB).flatMap { result =>
        val newValue =
          result match {
            case Some(cb) => {
              val str = CBToString(cb)
              val columns = str.split("\t")
              val oldValue = aggregator.deserialize(columns(0)).get
              aggregator.reduce(oldValue, value)
            }
            case None => value
          }

        val output = aggregator.serialize(newValue) + "\t" + aggregator.present(newValue)
        client.transaction(List(SetCommand(keyCB, StringToChannelBuffer(output))))
      }
    }

    try {
      future.get
      true
    } catch {
      case ex : ClientError => {
        System.err.println(ex)
        false
      }
    }
  }

  def read(key : String) : Future[Option[(String,String)]] = {
    val keyCB = StringToChannelBuffer(key)

    client.get(keyCB).map{value =>
      value.map{cb =>
        val str = CBToString(cb)
        val columns = str.split("\t")
        (columns(0), columns(1))
      }
    }
  }
}
--------------------------------------------------------------------------------
/src/main/scala/com/stripe/simmer/Http.scala:
--------------------------------------------------------------------------------
package com.stripe.simmer

import com.twitter.finagle.Service
import org.jboss.netty.handler.codec.http._
import org.jboss.netty.handler.codec.http.HttpResponseStatus._
import org.jboss.netty.handler.codec.http.HttpVersion.HTTP_1_1
import org.jboss.netty.buffer.ChannelBuffers.copiedBuffer
import org.jboss.netty.util.CharsetUtil.UTF_8
import com.twitter.util.Future
import java.net.InetSocketAddress
import com.twitter.finagle.builder.{Server, ServerBuilder}
import com.twitter.finagle.http.{Http => HttpCodec}

class Http(port : Int, simmer : Simmer, lookup : Lookup) {
  System.err.println("Listening on HTTP port " + port)

  ServerBuilder()
    .codec(HttpCodec())
    .bindTo(new InetSocketAddress(port))
    .name("http")
    .build(new Service[HttpRequest, HttpResponse] {
      def apply(request: HttpRequest) = handle(request)
    })

  def extractKey(request : HttpRequest) : String = {
    val uri = request.getUri
    val parts = uri.split("[/?]")
    if(parts.size > 1)
      parts(1)
    else
      "sum:all"
  }

  // Answer a query for a key by merging the stored value (if any) with the
  // live in-memory accumulator (if any) and returning the human-readable result.
  def handle(request : HttpRequest) = {
    val key = extractKey(request)

    lookup.read(key).map{result =>

      val response = new DefaultHttpResponse(HTTP_1_1, OK)
      val acc = simmer.accumulators.get(key)

      val output = result match {
        case Some((serialized, presented)) => {
          if(acc == null) {
            presented
          } else {
            acc.mergeAndPresent(serialized)
          }
        }

        case None => {
          if(acc == null) {
            ""
          } else {
            acc.present
          }
        }
      }

      response.setContent(copiedBuffer(output, UTF_8))
      response
    }
  }
}
--------------------------------------------------------------------------------
/src/main/scala/com/stripe/simmer/Registry.scala:
--------------------------------------------------------------------------------
package com.stripe.simmer

object Registry {
  var registry = Map[String,(Option[Int],Option[Aggregator[_]])=>Aggregator[_]]()
  var cache = Map[String,Aggregator[_]]()

  def register(typeKey : String)(fn : (Option[Int], Option[Aggregator[_]])=>Aggregator[_]) {
    registry += typeKey -> fn
  }

  //this is a bit of a mess: a key like "top3:sum:x" is parsed into a type
  //("top"), an optional integer parameter (3), and an optional nested
  //aggregation ("sum"), with the remainder being the key name itself
  def get(key: String) : Option[Aggregator[_]] = {
    val keyRegex = """([a-zA-Z]+)(\d*)(:[a-zA-Z0-9:]+)?:[a-zA-Z0-9]+""".r

    keyRegex.findFirstMatchIn(key).flatMap{m =>
      val typeKey = m.group(1)
      val optionalInt = m.group(2)
      val optionalRecursion = if(m.group(3) == null) "" else m.group(3).tail
      val fullTypeKey = typeKey + optionalInt + optionalRecursion

      cache.get(fullTypeKey).orElse{
        val created = createAggregator(typeKey, optionalInt, optionalRecursion)
        if(created.isDefined)
          cache += fullTypeKey -> created.get
        created
      }
    }
  }

  def createAggregator(typeKey : String, optionalInt : String, optionalRecursion : String) = {
    val int = if(optionalInt.isEmpty) None else Some(optionalInt.toInt)
    val inner = if(optionalRecursion.isEmpty) None else get(optionalRecursion + ":dummy")

    registry.get(typeKey).map {fn => fn(int, inner)}
  }
}

trait Registrar {
  def register(typeKey : String, agg: Aggregator[_]) {
    register(typeKey, 0){n => agg}
  }

  def register(typeKey : String, default : Int)(fn : Int=>Aggregator[_]) {
    Registry.register(typeKey){(optInt, optRec) => fn(optInt.getOrElse(default))}
  }

  def registerRecursive(typeKey : String, default : Int)(fn : (Int,Aggregator[_])=>Aggregator[_]) {
    Registry.register(typeKey){
      (optInt, optRec) =>
        fn(optInt.getOrElse(default), optRec.getOrElse(error(typeKey + " requires a nested aggregation")))
    }
  }

  def load {}
}
--------------------------------------------------------------------------------
/src/main/scala/com/stripe/simmer/Core.scala:
--------------------------------------------------------------------------------
package com.stripe.simmer

import java.util.{Map => JMap, LinkedHashMap => JLinkedHashMap}
import scala.collection.JavaConverters._

trait Aggregator[A] {
  def createAccumulator(input : String) = new Accumulator(this, parse(input))
  def parse(input : String) : A = deserialize(input).getOrElse(prepare(input))
  def reduce(left : A, right : A) : A
  def prepare(input : String) : A
  def serialize(value : A) : String
  def deserialize(serialized : String) : Option[A]
  def present(value : A) : String
}

class Simmer(output : Output, capacity : Int, flushEvery : Option[Int]) {

  Runtime.getRuntime.addShutdownHook(new Thread { override def run { flush } })

  //an access-ordered LinkedHashMap: once over capacity, the least recently
  //used accumulator is flushed to the output (and dropped if the write succeeds)
  val accumulators = new JLinkedHashMap[String,Accumulator[_]](capacity, 0.75f, true) {
    override def removeEldestEntry(eldest : JMap.Entry[String, Accumulator[_]]) = {
      if(this.size > capacity) {
        eldest.getValue.write(eldest.getKey, output)
      } else {
        false
      }
    }
  }

  def update(key : String, value : String) {
    val acc = accumulators.get(key)
    if(acc == null) {
      Registry.get(key) match {
        case Some(agg) => {
          val newAcc = agg.createAccumulator(value)
          accumulators.put(key, newAcc)
        }
        case None => error("Could not find aggregator for key " + key)
      }
    } else {
      acc.update(value)
      if(flushEvery.isDefined && acc.count >= flushEvery.get) {
        if(acc.write(key, output)) {
          accumulators.remove(key)
        }
      }
    }
  }

  def flush {
    //TODO respect the return value from write()
    accumulators.asScala.foreach{case (key,acc) => acc.write(key, output)}
    accumulators.clear
  }
}

class Accumulator[A](aggregator : Aggregator[A], var value : A) {
  var count = 1

  def update(input : String) {
    value = merge(input)
    count += 1
  }

  def merge(input : String) = {
    val newValue = aggregator.parse(input)
    aggregator.reduce(value, newValue)
  }

  def mergeAndPresent(input : String) = {
    aggregator.present(merge(input))
  }

  def present = aggregator.present(value)

  def write(key : String, output : Output) = {
    output.write(key, value, aggregator)
  }
}
--------------------------------------------------------------------------------
/src/main/scala/com/stripe/simmer/Aggregators.scala:
--------------------------------------------------------------------------------
package com.stripe.simmer

import com.twitter.algebird._
import com.twitter.bijection._
import com.twitter.chill._
import java.util.Calendar._
import java.util.GregorianCalendar

object AlgebirdAggregators extends Registrar {
  register("sum", DoubleSum)
  register("max", DoubleMax)
  register("min", DoubleMin)
  register("uv", 12){new HyperLogLog(_)}
  register("mh", 64){new MinHash(_)}
  register("pct", 50){new Percentile(_)}
  register("fh", 10){new HashingTrick(_)}
  register("dcy", 86400){new Decay(_)}
  registerRecursive("top", 10){(k,inner) => new HeavyHitters(k,inner)}
  registerRecursive("bot", 10){(k,inner) => new HeavyHitters(k,inner,-1.0)}
}

trait MonoidAggregator[A] extends Aggregator[A] {
  def monoid : Monoid[A]
  def reduce(left : A, right : A) = monoid.plus(left, right)
}

trait NumericAggregator[A] extends MonoidAggregator[A] {
  def presentNumeric(value : A) : Double
}

trait AlgebirdAggregator[A] extends MonoidAggregator[A] {
  val MAGIC = "%%%"

  def injection : Injection[A, String]

  def serialize(value : A) = MAGIC + injection(value)
  def deserialize(serialized : String) = {
    if(serialized.startsWith(MAGIC))
      injection.invert(serialized.drop(MAGIC.size))
    else
      None
  }
}

trait BufferableAggregator[A] extends AlgebirdAggregator[A] {
  val injection : Injection[A,String] =
    Bufferable.injectionOf(bufferable) andThen
    Bijection.bytes2Base64 andThen
    Base64String.unwrap
  def bufferable : Bufferable[A]
}

trait KryoAggregator[A] extends AlgebirdAggregator[A] {
  val injection : Injection[A,String] =
    KryoInjection.asInstanceOf[Injection[A, Array[Byte]]] andThen
    Bijection.bytes2Base64 andThen
    Base64String.unwrap
}

trait DoubleAggregator extends NumericAggregator[Double] {
  def prepare(in : String) = in.toDouble
  def serialize(value : Double) = value.toString
  def deserialize(serialized : String) = Some(serialized.toDouble)
  def present(value : Double) = value.toString
  def presentNumeric(value : Double) = value
}

object DoubleSum extends DoubleAggregator {
  val monoid = implicitly[Monoid[Double]]
}

object DoubleMax extends DoubleAggregator {
  val monoid = new Monoid[Double] {
    val zero = Double.MinValue
    def plus(left : Double, right: Double) = left.max(right)
  }
}

object DoubleMin extends DoubleAggregator {
  val monoid = new Monoid[Double] {
    val zero = Double.MaxValue
    def plus(left : Double, right: Double) = left.min(right)
  }
}

class HyperLogLog(size : Int) extends BufferableAggregator[HLL] with NumericAggregator[HLL] {
  val monoid = new HyperLogLogMonoid(size)
  def prepare(in : String) = monoid.create(in.getBytes)
  def present(out : HLL) = out.estimatedSize.toInt.toString
  def presentNumeric(out : HLL) = out.estimatedSize

  def bufferable = Bufferable.build[HLL] { (bb, hll) =>
    Bufferable.reallocatingPut(bb) { Bufferable.put(_, HyperLogLog.toBytes(hll)) }
  } { bb =>
    Bufferable.get[Array[Byte]](bb).map { tup =>
      (tup._1, HyperLogLog.fromBytes(tup._2))
    }
  }
}

class Percentile(pct : Int) extends KryoAggregator[QTree[Double]] with NumericAggregator[QTree[Double]] {
  val monoid = new QTreeSemigroup[Double](6) with Monoid[QTree[Double]] {
    val zero = QTree(0.0) //not actually right but should never be used?
  }

  def prepare(in : String) = QTree(in.toDouble)
  def present(out : QTree[Double]) = presentNumeric(out).toString
  def presentNumeric(out : QTree[Double]) = out.quantileBounds(pct.toDouble / 100)._2
}

class Decay(halflife : Int) extends KryoAggregator[DecayedValue] with NumericAggregator[DecayedValue] {
  val monoid = DecayedValue.monoidWithEpsilon(0.000001)
  def prepare(in : String) = {
    val (timestamp, value) = split(in, ":").get
    DecayedValue.build(value.toDouble, timestamp.toDouble, halflife.toDouble)
  }

  def timestampAsOfEndOfDay = {
    val calendar = new GregorianCalendar
    calendar.add(DATE, 1)
    calendar.set(HOUR_OF_DAY, 0)
    calendar.set(MINUTE, 0)
    calendar.set(SECOND, 0)
    calendar.set(MILLISECOND, 0)
    calendar.getTimeInMillis / 1000
  }

  def presentNumeric(out : DecayedValue) = {
    val adjusted = monoid.plus(out, DecayedValue.build(0.0, timestampAsOfEndOfDay, halflife.toDouble))
    adjusted.value
  }

  def present(out : DecayedValue) = presentNumeric(out).toString
}

class HeavyHitters[A](k : Int, inner : Aggregator[A], order : Double = 1.0) extends BufferableAggregator[SketchMap[String, A]] {
  val innerNumeric = inner match {
    case in : NumericAggregator[A] => in
    case _ => error("top and bot require a numeric aggregation")
  }
  implicit val str2Bytes = (x : String) => x.getBytes
  implicit val innerMonoid = innerNumeric.monoid
  implicit val ordering = Ordering.by{a : A => innerNumeric.presentNumeric(a) * order}
  val monoid = new SketchMapMonoid[String,A](100,5,123456,k)

  def prepare(in : String) = {
    val (key, value) = split(in, ":").get
    monoid.create(key, inner.prepare(value))
  }

  def present(out : SketchMap[String, A]) = {
    out.heavyHitters.map{case (k,v) => k + ":" + inner.present(v)}.mkString(",")
  }

  def bufferable = Bufferable.build[SketchMap[String, A]] { (bb, sm) =>
    var newBb = bb
    val totalValueString = inner.serialize(sm.totalValue)
    newBb = Bufferable.reallocatingPut(newBb) { Bufferable.put(_, totalValueString) }
    newBb = Bufferable.reallocatingPut(newBb) { Bufferable.put(_, sm.heavyHitterKeys) }
    for(row <- (0 to monoid.params.depth - 1);
        col <- (0 to monoid.params.width - 1)) {
      val value = sm.valuesTable.getValue(row, col)
      val valueString = inner.serialize(value)
      newBb = Bufferable.reallocatingPut(newBb) { Bufferable.put(_, valueString) }
    }
    newBb
  } { bb =>
    Bufferable.get[String](bb).flatMap { tup =>
      Bufferable.get[List[String]](tup._1).map { tup2 =>
        var newBb = tup2._1
        var matrix = monoid.zero.valuesTable
        for(row <- (0 to monoid.params.depth - 1);
            col <- (0 to monoid.params.width - 1)) {
          val (bb3, str) = Bufferable.get[String](newBb).get
          newBb = bb3
          matrix = matrix.updated((row,col), inner.deserialize(str).get)
        }
        (newBb, SketchMap(monoid.params, matrix, tup2._2, inner.deserialize(tup._2).get))
      }
    }
  }
}

class HashingTrick(bits : Int) extends KryoAggregator[AdaptiveVector[Double]] {
  val monoid = new HashingTrickMonoid[Double](bits)
  def prepare(in : String) = {
    if(in.contains(":")) {
      val (key, value) = split(in, ":").get
      monoid.init(key.getBytes, value.toDouble)
    } else {
      monoid.init(in.getBytes, 1.0)
    }
  }

  def present(out : AdaptiveVector[Double]) = {
    out.mkString(",")
  }
}

class MinHash(hashes : Int) extends AlgebirdAggregator[Array[Byte]] {
  val monoid = new MinHasher16(0.1, hashes * 2)
  val injection = Bijection.bytes2Base64 andThen Base64String.unwrap
  def prepare(in : String) = monoid.init(in)
  def present(out : Array[Byte]) = {
    out.grouped(2).toList.map{h => h.map{"%02X".format(_)}.mkString}.mkString(":")
  }
}
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# simmer
Avi Bryant

Simmer is a streaming aggregation tool. It can be used in several contexts to incrementally and efficiently summarize large volumes of data using a fixed amount of memory. Some of the ways it can be used include:

* As a filter in a unix pipeline processing logs or other text files
* As a combiner and reducer in Hadoop streaming jobs
* As a statsd-style metrics service over UDP, optionally backed by Redis

Some of the aggregations it supports include:

* counts of unique values
* exponentially decaying values
* top k most frequent values
* percentiles
* min-hash signatures

Simmer's aggregations are commutative and associative, which is to say that you can always use simmer to combine simmer's own output.

It was inspired in part by [Hadoop streaming's Aggregate package](http://hadoop.apache.org/docs/r1.1.2/streaming.html#Hadoop+Aggregate+Package), but uses the probabilistic aggregation algorithms from Twitter's [Algebird](http://github.com/twitter/algebird).

### To build:

````sh
rake
````

### To run:

````sh
bin/simmer < /path/to/data.tsv
````

### To run listening on UDP and writing to Redis on every 10 updates to a key:

````sh
bin/simmer -u 8000 -r localhost:6379 -f 10
````

### Input format

The simmer command takes tab-delimited key-value input and combines all of the values for each key. Here's a very simple sample input:

````
sum:x 1
min:y 3
min:y 4
sum:x 2
min:y 3
````

And here are the keys and values of the output:

````
sum:x 3
min:y 3
````

simmer has taken the two values for the key "sum:x", 1 and 2, and produced their sum, 3; it has also taken the three values for the key "min:y", 3, 4 and 3, and produced their minimum, 3.
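
To try this from a shell, something like the following should work (a sketch: `printf` supplies the literal tabs, `cut -f 1,3` trims off the machine-readable column described under "Output format" below, and the results come back formatted as doubles, e.g. `3.0`):

````sh
printf 'sum:x\t1\nmin:y\t3\nmin:y\t4\nsum:x\t2\nmin:y\t3\n' | bin/simmer | cut -f 1,3
````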

The prefix of each key, before the colon, determines how its values will be combined; in this case, the values for "sum:x" are summed, and the values for "min:y" are combined by taking the minimum. As in this example, you can freely mix different types of aggregation in the same input stream.

Note that the prefix is treated not just as an annotation, but as an integral part of the key. It's often useful to aggregate the same set of values in multiple ways; since, for example, "min:x" and "max:x" are different keys, there's no problem including both and aggregating them separately.

Many of the aggregations can be parameterized by including an integer in the prefix. For example, the percentile aggregator might appear as the prefix "pct95" (to compute the 95th percentile) or as "pct50" (to compute the median). A full list of the supported aggregations, and their parameterizations, is below.
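
As an illustration, here are some parameterized keys one might feed in (hypothetical input; the tables below define what each prefix means):

````
pct95:latency 120
pct99:latency 120
uv16:visitors alice
````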

### Output format

The output is, like the input format, a tab-separated key-value stream. The output is designed to be easy for humans to read, while at the same time allowing multiple outputs to be combined and fed back into simmer for further aggregation. As a simple example of how these two goals conflict, consider an aggregation producing the average of all of the values for a key. The human-readable output is just a single number, the average. To properly combine multiple averages, however, you have to know how many values originally went into each one, so that you can weight them properly.

simmer solves this by producing two values for each key: one possibly opaque, machine-readable value that is suitable for further aggregation, and one that includes a human-readable version of the value. Often, it's convenient to filter simmer's output through "cut -f 1,3" to see only the human-readable versions.

For simple cases like sum, the human-readable and machine-readable formats are identical, so the output looks like this:

````sh
sum:x 3 3
````

For other aggregations, it might look more like this:

````sh
dcy:x %%%AQBjb20udHdpdHRlci5hbGdlYmlyZC5EZWNheWVkVmFsdeUBQMVkIdW357VAWQAAAAAAAA== 8.752114744797748
````

Simmer will ignore the human-readable values if it's given its own output to consume, because it only looks at the first two columns of input. It will also distinguish properly between new single values and previously aggregated output for the same key, and will happily combine these with each other. This means, for example, that you can take the aggregated output of yesterday's logs, cat it together with the raw input for today's logs, and get the combined output of both.
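
A minimal sketch of that pattern (the file names are illustrative):

````sh
# simmer's output is valid simmer input, so partial summaries combine cleanly
cat yesterday-summary.tsv today-raw.tsv | bin/simmer > combined-summary.tsv
````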

### Flushing

The simmer command takes two optional integer arguments. The first, --capacity or -c, sets how many keys it should hold in memory at once. Whenever adding a new key would exceed this capacity, the current aggregate value for the least recently used key is flushed. In general these will be infrequent keys that may never recur; but if one does, you may see multiple outputs for the same key, and these need to be aggregated in turn (perhaps by feeding the output back through simmer) to get the complete result.

The second argument, --flush or -f, controls the maximum number of values to aggregate for any one key before flushing. If this is set to 0, there is no maximum, and frequently seen keys will only be output when there is no more input. However, if you have an infinite stream of input, you will want to set this to some non-zero value to get intermediate results out. Again, this means there may be multiple values for a single key that need to be combined after the fact.

The defaults are equivalent to:

````
bin/simmer -c 5000 -f 0
````
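
Since flushed partial results are themselves valid input, a second pass recombines them; for example (a sketch, with a deliberately small capacity to force evictions):

````sh
bin/simmer -c 10 < data.tsv | bin/simmer > combined.tsv
````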

### UDP

If you start simmer with --udp or -u, followed by a port number, it will listen on that UDP port instead of on stdin for rows of data; one UDP packet per row.
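
For example, one way to send a single row by hand is with netcat (a sketch; the port matches the example above, and `echo -e` turns `\t` into a real tab):

````sh
echo -e "sum:x\t1" | nc -u -w1 localhost 8000
````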

### Redis

If you start simmer with --redis or -r, followed by host:port, it will write to Redis instead of stdout; the first column of output (the key) will be used as the Redis key, and the second two columns, tab-separated, will be used as a Redis string value. Any existing data stored in Redis at that key will be merged with the output data whenever simmer flushes.
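
So once the `sum:x` example above has been flushed, one would expect something like the following (a sketch; the stored string holds the machine-readable and human-readable columns, tab-separated):

````sh
redis-cli GET sum:x
# "3.0\t3.0"
````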

### Numeric Aggregations

The human-readable output of these is always a single number for each key.

| Prefix | Description | Parameter | Default | Sample input | Sample output |
| ------ | ----------- | --------- | ------- | ------------ | ------------- |
| sum | Sum | n/a | n/a | `sum:x 1`<br>`sum:x 2` | `sum:x 3` |
| min | Minimum | n/a | n/a | `min:x 1`<br>`min:x 2` | `min:x 1` |
| max | Maximum | n/a | n/a | `max:x 1`<br>`max:x 2` | `max:x 2` |
| uv | Unique values (estimated using the HyperLogLog algorithm) | number of hash bits; memory use is 2^n | uv12 | `uv:x a`<br>`uv:x b`<br>`uv:x a` | `uv:x 2` |
| pct | Percentile | which percentile to output | pct50 (ie median) | `pct50:x 2`<br>`pct50:x 4`<br>`pct50:x 4`<br>`pct50:x 100` | The output will be an upper bound on the estimated percentile, expressed as a double:<br>`pct50:x 4.0000152587890625` |
| dcy | Exponentially decayed sum | half-life of a value, in seconds | dcy86400 (ie, half-life of one day) | Data should be in the format timestamp:value:<br>`dcy:x 1365187171:100`<br>`dcy:x 1365100771:100`<br>`dcy:x 1365014371:100` | Human-readable output will be the decayed value as of the end of the current day:<br>`dcy:y 122.3` |

### Other Aggregations

These are more specialized than, or build in some way on, the numeric aggregations.

| Prefix | Description | Parameter | Default | Sample input | Sample output |
| ------ | ----------- | --------- | ------- | ------------ | ------------- |
| top | Top K (by any numeric aggregation) | how many top values to retain (also requires a secondary prefix - see example) | top10 | This will find the top 3 items by the sum of their values, assuming an item:value format:<br>`top3:sum:x a:1`<br>`top3:sum:x b:2`<br>`top3:sum:x a:2`<br>`top3:sum:x c:1`<br>`top3:sum:x d:3`<br>However, top10:uv:x, or top5:pct95:x, or top + any other numeric aggregation, would also be valid keys. | `top3:sum:x a:3,d:3,b:2` |
| bot | Bottom K (by any numeric aggregation) | works just like top | bot10 | `bot3:sum:x a:1`<br>`bot3:sum:x b:2`<br>`bot3:sum:x a:2`<br>`bot3:sum:x c:1`<br>`bot3:sum:x d:3` | `bot3:sum:x c:1,b:2,d:3` |
| mh | Min-hash signature (used for estimating set similarity) | number of hashes to use | mh64 | Each value should be a single element of the set represented by the key:<br>`mh:x a`<br>`mh:x b`<br>`mh:x c` | Hex representation of n 16-bit hashes. If two sets have k matching hash values, their estimated Jaccard similarity is k/n.<br>`mh:x 0FCC:2E1F:0DD7:0049:3BF3:10D4:6460:75D4:392B:07AF:2064:27F0:6931:6717:3A0A:16D9:122E:51C6:8632:64BD:0CAE:0D15:8357:39A5:2008:4ED7:5733:44F8:1F70:02F7:23D5:59AE:0ECB:8EE0:4E1C:0249:9804:610B:0DBD:0316` |
| fh | Feature hashing (projects any number of features into a fixed-size vector) | number of hash bits to use (output vector size will be 2^n) | fh10 | Values can either be just a token, for categorical features, or token:number for continuous features:<br>`fh4:x hello`<br>`fh4:x world`<br>`fh4:x temp:32` | `fh4 0.0,0.0,-1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-32.0,0.0` |

### TODO

See https://github.com/avibryant/simmer/issues
--------------------------------------------------------------------------------