├── project ├── build.properties └── plugins.sbt ├── version.sbt ├── Filo.jpg ├── .travis.yml ├── schema └── flatbuffers │ ├── simplestring.fbs │ ├── conststring.fbs │ ├── diffprimitive.fbs │ ├── gen-java │ └── org │ │ └── velvia │ │ └── filo │ │ └── vector │ │ ├── MaskType.java │ │ ├── DataInfo.java │ │ ├── DDTVars.java │ │ ├── NaMask.java │ │ ├── ConstStringVector.java │ │ ├── SimpleStringVector.java │ │ ├── SimplePrimitiveVector.java │ │ ├── DiffPrimitiveVector.java │ │ ├── DictStringVector.java │ │ └── DiffDateTimeVector.java │ ├── dictstring.fbs │ ├── simpleprimitive.fbs │ ├── diffdatetime.fbs │ └── vector.fbs ├── .gitignore ├── filo-scala └── src │ ├── main │ └── scala │ │ └── org.velvia.filo │ │ ├── DefaultValues.scala │ │ ├── codecs │ │ ├── ConstWrappers.scala │ │ ├── DictEncodingWrappers.scala │ │ ├── SimpleWrappers.scala │ │ ├── DictEncodingEncoders.scala │ │ ├── ConstEncoders.scala │ │ ├── SimpleEncoders.scala │ │ ├── DiffWrappers.scala │ │ ├── DiffEncoders.scala │ │ ├── Utils.scala │ │ └── PrimitiveDataVectBuilder.scala │ │ ├── WireFormat.scala │ │ ├── vectors │ │ ├── ObjectVector.scala │ │ ├── ConstVector.scala │ │ ├── DictUTF8Vector.scala │ │ └── DeltaDeltaVector.scala │ │ ├── RowReaderAppender.scala │ │ ├── FiloRowReader.scala │ │ ├── RowToVectorBuilder.scala │ │ ├── TypedBufferReader.scala │ │ ├── FastBufferReader.scala │ │ ├── BuilderEncoder.scala │ │ ├── FiloVector.scala │ │ ├── ZeroCopyBinary.scala │ │ ├── VectorReader.scala │ │ └── VectorBuilder.scala │ └── test │ └── scala │ └── org.velvia.filo │ ├── DictEncodingTest.scala │ ├── RowToColumnBuilderTest.scala │ ├── ZeroCopyBinaryTest.scala │ ├── RowReaderTest.scala │ ├── DiffEncodingTest.scala │ ├── vectors │ └── DoubleVectorTest.scala │ └── EncodingPropertiesTest.scala ├── flatbuffers └── src │ └── main │ └── java │ └── com │ └── google │ └── flatbuffers │ ├── Struct.java │ ├── Constants.java │ └── Table.java ├── filo-scala-jmh └── src │ └── main │ └── scala │ └── org.velvia.filo │ ├── 
ScalaReadBenchmark.scala │ ├── UTF8StringBenchmark.scala │ ├── FastFiloRowReaderBenchmark.scala │ ├── DictStringBenchmark.scala │ ├── BasicFiloBenchmark.scala │ └── EncodingBenchmark.scala ├── wire_format.md ├── README.md └── scalastyle-config.xml /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=0.13.11 2 | -------------------------------------------------------------------------------- /version.sbt: -------------------------------------------------------------------------------- 1 | version in ThisBuild := "0.3.8-SNAPSHOT" -------------------------------------------------------------------------------- /Filo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/velvia/filo/HEAD/Filo.jpg -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: scala 2 | env: 3 | global: 4 | # _JAVA_OPTIONS="-Xmx1500m -XX:MaxPermSize=512m -Dakka.test.timefactor=3" 5 | jdk: 6 | - oraclejdk8 7 | scala: 8 | - 2.10.6 9 | - 2.11.8 10 | -------------------------------------------------------------------------------- /schema/flatbuffers/simplestring.fbs: -------------------------------------------------------------------------------- 1 | include "vector.fbs"; 2 | 3 | namespace org.velvia.filo.vector; 4 | 5 | table SimpleStringVector { 6 | naMask: NaMask; 7 | data: [string]; 8 | } 9 | 10 | root_type SimpleStringVector; 11 | 12 | -------------------------------------------------------------------------------- /schema/flatbuffers/conststring.fbs: -------------------------------------------------------------------------------- 1 | include "vector.fbs"; 2 | 3 | namespace org.velvia.filo.vector; 4 | 5 | table ConstStringVector { 6 | len: int; 7 | naMask: NaMask; 8 | str: string; 9 | } 10 | 11 | root_type ConstStringVector; 12 
/**
 * Per-type default constants.
 * NOTE(review): presumably these are the values handed back for NA/missing elements --
 * confirm against the readers that reference them.
 */
object DefaultValues {
  // Numeric and boolean defaults
  val DefaultInt: Int = 0
  val DefaultLong: Long = 0L
  val DefaultFloat: Float = 0.0F
  val DefaultDouble: Double = 0.0
  val DefaultBool: Boolean = false

  // Text defaults (both are the empty string)
  val DefaultString: String = ""
  val DefaultUTF8String = ZeroCopyUTF8String("")

  // Date/time defaults, both constructed from millisecond value 0
  val DefaultTimestamp: java.sql.Timestamp = new java.sql.Timestamp(0L)
  val DefaultDateTime: org.joda.time.DateTime = new org.joda.time.DateTime(0L)
}
compactness. 6 | // NOTE: This is not the same as deltas from consecutive values, as this design still allows 7 | // for random access. 8 | table DiffPrimitiveVector { 9 | len: int; 10 | naMask: NaMask; 11 | base: long; 12 | info: DataInfo; 13 | data: [ubyte]; 14 | } 15 | 16 | root_type DiffPrimitiveVector; 17 | -------------------------------------------------------------------------------- /schema/flatbuffers/gen-java/org/velvia/filo/vector/MaskType.java: -------------------------------------------------------------------------------- 1 | // automatically generated, do not modify 2 | 3 | package org.velvia.filo.vector; 4 | 5 | public final class MaskType { 6 | private MaskType() { } 7 | public static final byte AllZeroes = 0; 8 | public static final byte SimpleBitMask = 1; 9 | public static final byte AllOnes = 2; 10 | 11 | private static final String[] names = { "AllZeroes", "SimpleBitMask", "AllOnes", }; 12 | 13 | public static String name(int e) { return names[e]; } 14 | }; 15 | 16 | -------------------------------------------------------------------------------- /schema/flatbuffers/dictstring.fbs: -------------------------------------------------------------------------------- 1 | include "vector.fbs"; 2 | 3 | namespace org.velvia.filo.vector; 4 | 5 | // Dictionary encoded string column, with NA/missing value support 6 | // The 0 value for a code is reserved for NA. This means dictionary size is actually 1 + (# uniques). 7 | // In most cases, this results in smaller binaries (no naMask) 8 | // Also codes must translate to an integer type. 
/**
 * Read-side wrapper over a ConstStringVector: every element resolves to the same
 * string, and availability of each element is decided by the vector's NA mask.
 */
class ConstStringWrapper(csv: ConstStringVector)
extends NaMaskAvailable[String](csv.naMask) {
  val _len = csv.len
  val _str = csv.str

  /** Number of elements, as recorded in the underlying vector. */
  final def length: Int = _len

  /** Returns the single shared string regardless of the index. */
  final def apply(i: Int): String = _str

  /** Invokes fn once for each element that the NA mask reports as available. */
  final def foreach[B](fn: String => B): Unit = {
    val limit = length
    var idx = 0
    while (idx < limit) {
      if (isAvailable(idx)) fn(apply(idx))
      idx += 1
    }
  }
}
10 | table SimplePrimitiveVector { 11 | len: int; 12 | naMask: NaMask; 13 | info: DataInfo; 14 | data: [ubyte]; 15 | } 16 | 17 | root_type SimplePrimitiveVector; 18 | -------------------------------------------------------------------------------- /schema/flatbuffers/diffdatetime.fbs: -------------------------------------------------------------------------------- 1 | include "vector.fbs"; 2 | 3 | namespace org.velvia.filo.vector; 4 | 5 | // A vector designed to store DateTime values, including a default time zone and other things 6 | // For now, chronologies are ignored (only ISOChronology, which is with rare exceptions what is used) 7 | // If tzLength() == 0, then baseTz is used for the time zone for all values. 8 | table DiffDateTimeVector { 9 | naMask: NaMask; 10 | vars: DDTVars; 11 | millisInfo: DataInfo; 12 | millis: [ubyte]; 13 | tzInfo: DataInfo; 14 | tz: [ubyte]; 15 | } 16 | 17 | // NOTE: using a struct here saves us 2 vtable entries or 4 bytes 18 | // These are always required anyways 19 | struct DDTVars { 20 | len: int; 21 | baseTz: byte; 22 | baseMillis: long; 23 | } 24 | 25 | root_type DiffDateTimeVector; 26 | -------------------------------------------------------------------------------- /schema/flatbuffers/vector.fbs: -------------------------------------------------------------------------------- 1 | // Google FlatBuffers IDL for efficient, zero-copy serialized data vectors 2 | // http://google.github.io/flatbuffers/index.html 3 | // 4 | // Please stick to FlatBuffers versioning guide for backwards compatibility 5 | // 6 | // NOTE: There can only be one root_type or FlatBuffers object per .fbs file.... boo :/ 7 | 8 | namespace org.velvia.filo.vector; 9 | 10 | // Many vectors have a bitmask for representing NA values. 
11 | // AllZeroes = every value is available; AllOnes = no value is available == empty 12 | enum MaskType : byte { AllZeroes, SimpleBitMask, AllOnes } 13 | 14 | table NaMask { 15 | maskType: MaskType = AllOnes; 16 | /// for type = SimpleBitMask 17 | bitMask: [long]; 18 | } 19 | 20 | struct DataInfo { 21 | nbits: ushort; 22 | signed: bool = false; 23 | } 24 | -------------------------------------------------------------------------------- /schema/flatbuffers/gen-java/org/velvia/filo/vector/DataInfo.java: -------------------------------------------------------------------------------- 1 | // automatically generated, do not modify 2 | 3 | package org.velvia.filo.vector; 4 | 5 | import java.nio.*; 6 | import java.lang.*; 7 | import java.util.*; 8 | import com.google.flatbuffers.*; 9 | 10 | @SuppressWarnings("unused") 11 | public final class DataInfo extends Struct { 12 | public DataInfo __init(int _i, ByteBuffer _bb) { bb_pos = _i; bb = _bb; return this; } 13 | 14 | public int nbits() { return bb.getShort(bb_pos + 0) & 0xFFFF; } 15 | public boolean signed() { return 0!=bb.get(bb_pos + 2); } 16 | 17 | public static int createDataInfo(FlatBufferBuilder builder, int nbits, boolean signed) { 18 | builder.prep(2, 4); 19 | builder.pad(1); 20 | builder.putBoolean(signed); 21 | builder.putShort((short)(nbits & 0xFFFF)); 22 | return builder.offset(); 23 | } 24 | }; 25 | 26 | -------------------------------------------------------------------------------- /schema/flatbuffers/gen-java/org/velvia/filo/vector/DDTVars.java: -------------------------------------------------------------------------------- 1 | // automatically generated, do not modify 2 | 3 | package org.velvia.filo.vector; 4 | 5 | import java.nio.*; 6 | import java.lang.*; 7 | import java.util.*; 8 | import com.google.flatbuffers.*; 9 | 10 | @SuppressWarnings("unused") 11 | public final class DDTVars extends Struct { 12 | public DDTVars __init(int _i, ByteBuffer _bb) { bb_pos = _i; bb = _bb; return this; } 13 | 14 | 
public int len() { return bb.getInt(bb_pos + 0); } 15 | public byte baseTz() { return bb.get(bb_pos + 4); } 16 | public long baseMillis() { return bb.getLong(bb_pos + 8); } 17 | 18 | public static int createDDTVars(FlatBufferBuilder builder, int len, byte baseTz, long baseMillis) { 19 | builder.prep(8, 16); 20 | builder.putLong(baseMillis); 21 | builder.pad(3); 22 | builder.putByte(baseTz); 23 | builder.putInt(len); 24 | return builder.offset(); 25 | } 26 | }; 27 | 28 | -------------------------------------------------------------------------------- /flatbuffers/src/main/java/com/google/flatbuffers/Struct.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2014 Google Inc. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.google.flatbuffers; 18 | 19 | import java.nio.ByteBuffer; 20 | 21 | // All structs in the generated code derive from this class, and add their own accessors. 22 | public class Struct { 23 | protected int bb_pos; 24 | protected ByteBuffer bb; 25 | } 26 | -------------------------------------------------------------------------------- /flatbuffers/src/main/java/com/google/flatbuffers/Constants.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2014 Google Inc. All rights reserved. 
/**
 * Measures read speeds for Scala collections and Arrays for comparison
 * to Filo vectors.
 *
 * Each benchmark sums the same `numValues` random Ints; only the container and
 * the iteration construct differ between the methods.
 */
@State(Scope.Thread)
class ScalaReadBenchmark {
  // Number of random Ints generated once per benchmark state and summed by every method below.
  val numValues = 10000

  // The same random data held as a Scala sequence and as a primitive array.
  val randomInts = (0 until numValues).map(i => util.Random.nextInt)
  val randomIntsAray = randomInts.toArray

  // Scala Seq sum
  @Benchmark
  @BenchmarkMode(Array(Mode.AverageTime))
  @OutputTimeUnit(TimeUnit.MICROSECONDS)
  def sumAllIntsScalaSeqFoldLeft(): Int = {
    randomInts.foldLeft(0)(_ + _)
  }

  // Array sum via foldLeft
  @Benchmark
  @BenchmarkMode(Array(Mode.AverageTime))
  @OutputTimeUnit(TimeUnit.MICROSECONDS)
  def sumAllIntsScalaArrayFoldLeft(): Int = {
    randomIntsAray.foldLeft(0)(_ + _)
  }

  // Array sum via an index loop written with scalaxy's `optimized` range.
  // The total is returned so the loop result is actually consumed.
  @Benchmark
  @BenchmarkMode(Array(Mode.AverageTime))
  @OutputTimeUnit(TimeUnit.MICROSECONDS)
  def sumAllIntsScalaArrayWhileLoop(): Int = {
    var total = 0
    for { i <- 0 until numValues optimized } {
      total += randomIntsAray(i)
    }
    total
  }
}
11 | val NoString = "" 12 | } 13 | 14 | abstract class DictStringWrapper(val dsv: DictStringVector) extends FiloVector[String] { 15 | import DictStringWrapper._ 16 | 17 | private val _len = dsv.len 18 | val reader = FastBufferReader(dsv.codesAsByteBuffer()) 19 | 20 | // To be mixed in depending on type of code vector 21 | def getCode(index: Int): Int 22 | 23 | // Cache the Strings so we only pay cost of deserializing each unique string once 24 | val strCache = Array.fill(dsv.dictionaryLength())(NoString) 25 | 26 | final private def dictString(code: Int): String = { 27 | val cacheValue = strCache(code) 28 | if (cacheValue == NoString) { 29 | val strFromDict = dsv.dictionary(code) 30 | strCache(code) = strFromDict 31 | strFromDict 32 | } else { 33 | cacheValue 34 | } 35 | } 36 | 37 | final def isAvailable(index: Int): Boolean = getCode(index) != 0 38 | 39 | final def apply(index: Int): String = dictString(getCode(index)) 40 | 41 | final def length: Int = _len 42 | 43 | final def foreach[B](fn: String => B): Unit = { 44 | for { i <- 0 until length optimized } { 45 | val code = getCode(i) 46 | if (code != 0) fn(dictString(code)) 47 | } 48 | } 49 | } -------------------------------------------------------------------------------- /filo-scala/src/main/scala/org.velvia.filo/WireFormat.scala: -------------------------------------------------------------------------------- 1 | package org.velvia.filo 2 | 3 | /** 4 | * Filo wire format definitions - especially for the header bytes. 5 | * See [wire_format.md] for details. 
/**
 * Filo wire format definitions - especially for the header bytes.
 * See [wire_format.md] for details.
 *
 * As encoded and decoded by the methods below, a header is a single Int:
 *   - bits 0-7  : major vector type (one of the VECTORTYPE_* constants)
 *   - bits 8-15 : vector subtype (one of the SUBTYPE_* constants)
 * For VECTORTYPE_EMPTY headers, bits 8-31 carry the vector length instead.
 */
object WireFormat {
  val VECTORTYPE_EMPTY = 0x01
  val VECTORTYPE_SIMPLE = 0x02
  val VECTORTYPE_DICT = 0x03
  val VECTORTYPE_CONST = 0x04
  val VECTORTYPE_DIFF = 0x05
  val VECTORTYPE_BINSIMPLE = 0x06
  val VECTORTYPE_BINDICT = 0x07
  val VECTORTYPE_DELTA2 = 0x08    // Delta-delta encoded

  /** Extracts the major vector type (the low 8 bits) from a header. */
  def majorVectorType(headerBytes: Int): Int = headerBytes & 0x00ff

  /**
   * Extracts the length stored in the upper 24 bits of an empty-vector header.
   * @throws IllegalArgumentException if the header's major type is not VECTORTYPE_EMPTY
   */
  def emptyVectorLen(headerBytes: Int): Int = {
    require(majorVectorType(headerBytes) == VECTORTYPE_EMPTY)
    // Unsigned shift drops the type byte and keeps the 24-bit length non-negative.
    headerBytes >>> 8
  }

  val SUBTYPE_PRIMITIVE = 0x00
  val SUBTYPE_STRING = 0x01
  val SUBTYPE_UTF8 = 0x02
  val SUBTYPE_FIXEDMAXUTF8 = 0x03    // fixed max size per blob, length byte
  val SUBTYPE_DATETIME = 0x04
  val SUBTYPE_PRIMITIVE_NOMASK = 0x05
  val SUBTYPE_REPEATED = 0x06        // vectors.ConstVector
  val SUBTYPE_INT = 0x07             // Int gets special type because Longs and Doubles may be encoded as Int
  val SUBTYPE_INT_NOMASK = 0x08

  /** Extracts the vector subtype (bits 8-15) from a header. */
  def vectorSubType(headerBytes: Int): Int = (headerBytes & 0x00ff00) >> 8

  // Largest length that fits in the 24 bits available in an empty-vector header.
  val MaxEmptyVectorLen = 0x00ffffff

  /**
   * Builds the header for an empty vector of the given length.
   * @param len number of elements; must be within 0..MaxEmptyVectorLen
   * @throws IllegalArgumentException if len is negative or exceeds MaxEmptyVectorLen
   */
  def emptyVector(len: Int): Int = {
    // A negative len would otherwise pass the upper-bound check and set the high
    // header bits, decoding back as a bogus large length.
    require(len >= 0, "Vector len must not be negative")
    require(len <= MaxEmptyVectorLen, "Vector len too long")
    (len << 8) | VECTORTYPE_EMPTY
  }

  /** Builds a header from a major vector type and a subtype; each is truncated to 8 bits. */
  def apply(majorVectorType: Int, subType: Int): Int =
    ((subType & 0x00ff) << 8) | (majorVectorType & 0x00ff)
}
/**
 * Represents either an empty column (length 0) or a column where none of the
 * values are available (null).
 *
 * @param len the number of elements the vector reports via `length`
 */
class EmptyFiloVector[A](len: Int) extends FiloVector[A] {
  /** No element is ever available. */
  final def isAvailable(index: Int): Boolean = false

  /** Nothing is available, so fn is never invoked. */
  final def foreach[B](fn: A => B): Unit = {}

  /**
   * Returns the null/zero value of A for any in-range index.
   * @throws ArrayIndexOutOfBoundsException if index is negative or not below `length`
   */
  final def apply(index: Int): A =
    // Negative indices must be rejected too; checking only the upper bound let them through.
    if (index >= 0 && index < len) { null.asInstanceOf[A] }
    else { throw new ArrayIndexOutOfBoundsException(index) }

  final def length: Int = len
}

/**
 * Base wrapper for a SimplePrimitiveVector. Subclasses supply `apply` to decode
 * an element using `reader`; availability comes from the vector's NA mask.
 */
abstract class SimplePrimitiveWrapper[@specialized A](spv: SimplePrimitiveVector)
    extends NaMaskAvailable[A](spv.naMask) {
  val info = spv.info
  val _len = spv.len
  val reader = FastBufferReader(spv.dataAsByteBuffer())

  final def length: Int = _len

  /** Invokes fn for every available element, in index order. */
  final def foreach[B](fn: A => B): Unit = {
    if (isEmptyMask) {   // every value available!
      // Skip the per-element availability check entirely.
      for { i <- 0 until length optimized } { fn(apply(i)) }
    } else {
      for { i <- 0 until length optimized } { if (isAvailable(i)) fn(apply(i)) }
    }
  }
}

// TODO: ditch naMask
/**
 * Wrapper for a SimpleStringVector; the length is the number of strings in the data vector.
 */
class SimpleStringWrapper(ssv: SimpleStringVector)
    extends NaMaskAvailable[String](ssv.naMask) {
  val _len = ssv.dataLength

  final def length: Int = _len

  /** Reads the string at position i from the underlying vector. */
  final def apply(i: Int): String = ssv.data(i)

  /** Invokes fn for every element that the NA mask reports as available. */
  final def foreach[B](fn: String => B): Unit = {
    for { i <- 0 until length optimized } { if (isAvailable(i)) fn(apply(i)) }
  }
}
/**
 * Measures the speed of common string operations vs standard Java strings
 *
 * For a description of the JMH measurement modes, see
 * https://github.com/ktoso/sbt-jmh/blob/master/src/sbt-test/sbt-jmh/jmh-run/src/main/scala/org/openjdk/jmh/samples/JMHSample_02_BenchmarkModes.scala
 */
@State(Scope.Thread)
class UTF8StringBenchmark {

  // Two inputs that differ only by one trailing character, each held both as a
  // native String and as a ZeroCopyUTF8String.
  val str = "xylophonemania"
  val str2 = "xylophonemaniac"
  val zcStr = ZeroCopyUTF8String(str)
  val zcStr2 = ZeroCopyUTF8String(str2)

  // According to @ktosopl, be sure to return some value if possible so that JVM won't
  // optimize out the method body.  However JMH is apparently very good at avoiding this.
  // Every benchmark below therefore returns the result of the operation it measures.
  @Benchmark
  @BenchmarkMode(Array(Mode.Throughput))
  @OutputTimeUnit(TimeUnit.SECONDS)
  def utf8StrCompare(): Int = {
    zcStr.compare(zcStr2)
  }

  // Native String counterpart of utf8StrCompare
  @Benchmark
  @BenchmarkMode(Array(Mode.Throughput))
  @OutputTimeUnit(TimeUnit.SECONDS)
  def nativeStrCompare(): Int = {
    str.compare(str2)
  }

  @Benchmark
  @BenchmarkMode(Array(Mode.Throughput))
  @OutputTimeUnit(TimeUnit.SECONDS)
  def utf8Substring(): ZeroCopyUTF8String = {
    zcStr.substring(2, 6)
  }

  // Native String counterpart of utf8Substring
  @Benchmark
  @BenchmarkMode(Array(Mode.Throughput))
  @OutputTimeUnit(TimeUnit.SECONDS)
  def nativeSubstring(): String = {
    str.substring(2, 6)
  }

  @Benchmark
  @BenchmarkMode(Array(Mode.Throughput))
  @OutputTimeUnit(TimeUnit.SECONDS)
  def utf8hash(): Int = {
    zcStr.hashCode
  }
}
/**
 * Encoders for dictionary encoding/compression
 */
object DictEncodingEncoders extends ThreadLocalBuffers {
  import Utils._

  // Incremented on every toStringVector call. Plain unsynchronized var.
  var count = 0

  // Note: This is a way to avoid storing null and dealing with NPEs for NA values
  val NaString = ""

  /**
   * Dictionary-encodes a sequence of strings into a DictStringVector buffer.
   *
   * Code 0 is reserved for NA; the i-th element of `stringSet` (in its iteration order)
   * is assigned code i + 1, matching the dictionary written as NaString followed by the uniques.
   *
   * @param data      the strings to encode; entries at NA positions are ignored
   * @param naMask    bit i set means element i is NA
   * @param stringSet the distinct non-NA strings occurring in `data`
   * @return the buffer produced by putHeaderAndGet for the finished vector
   * @throws IllegalArgumentException if a non-NA element of `data` is not in `stringSet`
   */
  def toStringVector(data: Seq[String], naMask: BitSet, stringSet: collection.Set[String]): ByteBuffer = {
    import DictStringVector._

    count += 1
    val builder = AutoIntegralDVBuilders.IntDataVectBuilder

    // Convert the set of strings to an encoding
    val uniques = stringSet.toSeq
    // NOTE: sorry but java's HashMap is just much faster (for the next step)
    // This used to be `uniques.zipWithIndex.toMap`
    // Values are boxed Integers so a missing key is observable as null instead of
    // silently unboxing to 0. Codes are stored already shifted by one (0 == NA).
    val strToCode = new HashMap[String, Integer]()
    val uniqueIt = uniques.iterator
    var nextCode = 1
    while (uniqueIt.hasNext) {
      strToCode.put(uniqueIt.next(), Integer.valueOf(nextCode))
      nextCode += 1
    }

    // Encode each string to the code per the map above
    // Iterators are used rather than data(i) so that linear Seqs are not traversed quadratically.
    val codes = ArrayBuffer.fill(data.length)(0)
    val dataIt = data.iterator
    var i = 0
    while (dataIt.hasNext) {
      val str = dataIt.next()
      if (!naMask(i)) {
        val code = strToCode.get(str)
        // Previously an unknown string was silently encoded as the first dictionary entry.
        require(code != null, s"String at index $i is not a member of stringSet")
        codes(i) = code.intValue
      }
      i += 1
    }

    val fbb = new FlatBufferBuilder(getBuffer)
    val ((dataOffset, nbits), signed) = builder.build(fbb, codes, 0, stringSet.size + 1)
    val dictVect = stringVect(fbb, Seq(NaString) ++ uniques)
    startDictStringVector(fbb)
    addDictionary(fbb, dictVect)
    addLen(fbb, data.length)
    addCodes(fbb, dataOffset)
    addInfo(fbb, DataInfo.createDataInfo(fbb, nbits, signed))
    finishDictStringVectorBuffer(fbb, endDictStringVector(fbb))
    putHeaderAndGet(fbb, WireFormat.VECTORTYPE_DICT, WireFormat.SUBTYPE_STRING)
  }
}
__vector_len(o) : 0; } 22 | public ByteBuffer bitMaskAsByteBuffer() { return __vector_as_bytebuffer(6, 8); } 23 | 24 | public static int createNaMask(FlatBufferBuilder builder, 25 | byte maskType, 26 | int bitMask) { 27 | builder.startObject(2); 28 | NaMask.addBitMask(builder, bitMask); 29 | NaMask.addMaskType(builder, maskType); 30 | return NaMask.endNaMask(builder); 31 | } 32 | 33 | public static void startNaMask(FlatBufferBuilder builder) { builder.startObject(2); } 34 | public static void addMaskType(FlatBufferBuilder builder, byte maskType) { builder.addByte(0, maskType, 2); } 35 | public static void addBitMask(FlatBufferBuilder builder, int bitMaskOffset) { builder.addOffset(1, bitMaskOffset, 0); } 36 | public static int createBitMaskVector(FlatBufferBuilder builder, long[] data) { builder.startVector(8, data.length, 8); for (int i = data.length - 1; i >= 0; i--) builder.addLong(data[i]); return builder.endVector(); } 37 | public static void startBitMaskVector(FlatBufferBuilder builder, int numElems) { builder.startVector(8, numElems, 8); } 38 | public static int endNaMask(FlatBufferBuilder builder) { 39 | int o = builder.endObject(); 40 | return o; 41 | } 42 | }; 43 | 44 | -------------------------------------------------------------------------------- /filo-scala-jmh/src/main/scala/org.velvia.filo/FastFiloRowReaderBenchmark.scala: -------------------------------------------------------------------------------- 1 | package org.velvia.filo 2 | 3 | import java.sql.Timestamp 4 | import org.openjdk.jmh.annotations.Benchmark 5 | import org.openjdk.jmh.annotations.BenchmarkMode 6 | import org.openjdk.jmh.annotations.{Mode, State, Scope} 7 | import org.openjdk.jmh.annotations.OutputTimeUnit 8 | import scalaxy.loops._ 9 | import scala.language.postfixOps 10 | 11 | import java.util.concurrent.TimeUnit 12 | 13 | /** 14 | * Measures the speed of creating a FastFiloRowReader, 15 | * parsing the chunks into FiloVectors, and iterating through rows and reading values. 
16 | * 17 | * For a description of the JMH measurement modes, see 18 | * https://github.com/ktoso/sbt-jmh/blob/master/src/sbt-test/sbt-jmh/jmh-run/src/main/scala/org/openjdk/jmh/samples/JMHSample_02_BenchmarkModes.scala 19 | */ 20 | @State(Scope.Thread) 21 | class FastFiloRowReaderBenchmark { 22 | import VectorReader._ 23 | 24 | // Ok, create an IntColumn and benchmark it. 25 | val numValues = 10000 26 | 27 | val randomInts = (0 until numValues).map(i => util.Random.nextInt) 28 | val randomLongs = randomInts.map(_.toLong) 29 | val randomTs = randomLongs.map(l => new Timestamp(l)) 30 | 31 | val chunks = Array(VectorBuilder(randomInts).toFiloBuffer, 32 | VectorBuilder(randomLongs).toFiloBuffer, 33 | VectorBuilder(randomTs).toFiloBuffer) 34 | val clazzes = Array[Class[_]](classOf[Int], classOf[Long], classOf[Timestamp]) 35 | 36 | // According to @ktosopl, be sure to return some value if possible so that JVM won't 37 | // optimize out the method body. However JMH is apparently very good at avoiding this. 
38 | // fastest loop possible using FiloVectorApply method 39 | @Benchmark 40 | @BenchmarkMode(Array(Mode.AverageTime)) 41 | @OutputTimeUnit(TimeUnit.MICROSECONDS) 42 | def createFastFiloRowReader(): RowReader = { 43 | new FastFiloRowReader(chunks, clazzes) 44 | } 45 | 46 | val fastReader = new FastFiloRowReader(chunks, clazzes) 47 | 48 | @Benchmark 49 | @BenchmarkMode(Array(Mode.Throughput)) 50 | @OutputTimeUnit(TimeUnit.SECONDS) 51 | def fastFiloRowReaderReadOne(): Int = { 52 | fastReader.setRowNo(0) 53 | if (fastReader.notNull(0)) fastReader.getInt(0) + 1 else 0 54 | } 55 | } -------------------------------------------------------------------------------- /filo-scala/src/main/scala/org.velvia.filo/codecs/ConstEncoders.scala: -------------------------------------------------------------------------------- 1 | package org.velvia.filo.codecs 2 | 3 | import com.google.flatbuffers.FlatBufferBuilder 4 | import java.nio.{ByteBuffer, ByteOrder} 5 | import scala.collection.mutable.BitSet 6 | 7 | import org.velvia.filo._ 8 | import org.velvia.filo.vector._ 9 | 10 | /** 11 | * Encoders for sequences where the non-NA values are all the same 12 | */ 13 | object ConstEncoders extends ThreadLocalBuffers { 14 | import Utils._ 15 | 16 | var count = 0 17 | 18 | /** 19 | * Creates a SimplePrimitiveVector-based Filo vector. 20 | * @param min the minimum value from the data points that are available. 21 | * Be careful not to include points from NA parts of the data sequence. 
object ObjectVector {
  // Size in bytes of one slot of an Array[Any], as reported by Unsafe.arrayIndexScale.
  val objectRefSize = UnsafeUtils.unsafe.arrayIndexScale(classOf[Array[Any]])
}

import BuilderEncoder._

/**
 * Technically not a binary/serializable vector, rather one that holds object references.
 * However it conforms to the BinaryVector API for convenience. Probably faster than a normal Scala Seq.
 * Should never be serialized -- and this is enforced by not implementing a few members:
 * vectMajorType and vectSubType are left as `???` and finishCompaction always throws.
 *
 * Elements are written with Unsafe.putObject starting at `offset` within `base`, `perElem` bytes
 * apart. An NA is stored as a null reference.
 *
 * @param base     the object the references are written into
 * @param offset   the starting offset within `base`
 * @param maxBytes the limit handed to checkSize before every append
 * @param perElem  the number of bytes occupied by each element slot
 */
abstract class ObjectVector[T](val base: Any,
                               val offset: Long,
                               val maxBytes: Int,
                               perElem: Int = ObjectVector.objectRefSize) extends BinaryAppendableVector[T] {
  /** Number of bytes written so far. */
  def numBytes: Int = (writeOffset - offset).toInt
  /** Number of elements (including NAs) written so far. */
  override final def length: Int = numBytes / perElem

  private var writeOffset = offset
  private var numNAs = 0

  /**
   * Rewinds the vector so it can be refilled from the start.
   * The NA counter is cleared together with the write offset; otherwise isAllNA and noNAs
   * would keep reporting the NAs of the previous contents.
   */
  final def reset(): Unit = {
    writeOffset = offset
    numNAs = 0
  }

  val maybeNAs = true

  /** True when the slot at `index` holds a non-null reference. */
  final def isAvailable(index: Int): Boolean =
    UnsafeUtils.unsafe.getObject(base, offset + perElem * index) != null
  /** Returns the reference stored at `index` (null for an NA slot), cast to T. */
  final def apply(index: Int): T =
    UnsafeUtils.unsafe.getObject(base, offset + perElem * index).asInstanceOf[T]

  /** Appends one element after checking the current size against maxBytes. */
  def addData(data: T): Unit = {
    checkSize(numBytes, maxBytes)
    UnsafeUtils.unsafe.putObject(base, writeOffset, data)
    writeOffset += perElem
  }

  /** Appends one NA, stored as a null reference, and counts it. */
  def addNA(): Unit = {
    checkSize(numBytes, maxBytes)
    UnsafeUtils.unsafe.putObject(base, writeOffset, null)
    writeOffset += perElem
    numNAs += 1
  }

  final def isAllNA: Boolean = numNAs == length
  final def noNAs: Boolean = numNAs == 0

  // Deliberately unimplemented: an ObjectVector has no wire format.
  def vectMajorType: Int = ???
  def vectSubType: Int = ???
  def finishCompaction(newBase: Any, newOff: Long): BinaryVector[T] =
    throw new RuntimeException("Cannot finalize an ObjectVector")

  /** Delegates to suboptimize, which subclasses implement to produce a real BinaryVector. */
  override def optimize(hint: EncodingHint = AutoDetect): BinaryVector[T] = suboptimize(hint)
  def suboptimize(hint: EncodingHint = AutoDetect): BinaryVector[T]
}
VectorBuilder.fromOptions(orig).toFiloBuffer(DictionaryEncoding) 41 | val binarySeq = FiloVector[String](buf) 42 | 43 | binarySeq.length should equal (orig.length) 44 | binarySeq.toSeq should equal (Seq("apple", "banana")) 45 | binarySeq.optionIterator.toSeq should equal (orig) 46 | } 47 | 48 | // Negative byte values might not get converted to ints properly, leading 49 | // to an ArrayOutOfBoundsException. 50 | it("should ensure proper conversion when there are 128-255 unique strings") { 51 | val orig = (0 to 130).map(_.toString).toSeq 52 | val buf = VectorBuilder(orig).toFiloBuffer(DictionaryEncoding) 53 | val binarySeq = FiloVector[String](buf) 54 | 55 | binarySeq.length should equal (orig.length) 56 | binarySeq.toSeq should equal (orig) 57 | } 58 | } -------------------------------------------------------------------------------- /schema/flatbuffers/gen-java/org/velvia/filo/vector/ConstStringVector.java: -------------------------------------------------------------------------------- 1 | // automatically generated, do not modify 2 | 3 | package org.velvia.filo.vector; 4 | 5 | import java.nio.*; 6 | import java.lang.*; 7 | import java.util.*; 8 | import com.google.flatbuffers.*; 9 | 10 | @SuppressWarnings("unused") 11 | public final class ConstStringVector extends Table { 12 | public static ConstStringVector getRootAsConstStringVector(ByteBuffer _bb) { return getRootAsConstStringVector(_bb, new ConstStringVector()); } 13 | public static ConstStringVector getRootAsConstStringVector(ByteBuffer _bb, ConstStringVector obj) { _bb.order(ByteOrder.LITTLE_ENDIAN); return (obj.__init(_bb.getInt(_bb.position()) + _bb.position(), _bb)); } 14 | public ConstStringVector __init(int _i, ByteBuffer _bb) { bb_pos = _i; bb = _bb; return this; } 15 | 16 | public int len() { int o = __offset(4); return o != 0 ? bb.getInt(o + bb_pos) : 0; } 17 | public NaMask naMask() { return naMask(new NaMask()); } 18 | public NaMask naMask(NaMask obj) { int o = __offset(6); return o != 0 ? 
/**
 * A trait for making RowReader appends to BinaryAppendableVector efficient and unboxed.
 * The code that reads a value of a specific type and adds it to the matching vector type lives
 * in each type-specific implementation, which avoids boxing overhead.
 */
trait RowReaderAppender {
  /** The vector that receives the appended values. */
  def appender: BinaryAppendableVector[_]
  /** The column number passed to the RowReader getters. */
  def col: Int

  /**
   * Appends the value of column `col` of the row to the appender. Type specific code.
   * Assumes the value is available from the reader.
   */
  def appendData(row: RowReader): Unit

  /**
   * Appends a row to appender, checking for availability: a null column becomes an NA.
   */
  final def append(row: RowReader): Unit = {
    if (!row.notNull(col)) appender.addNA()
    else appendData(row)
  }
}

// NOTE: we have to define type specific stuff to ensure the right specialized method of add is used.
// No way to make this generic with specialization unfortunately, because the append signature does not
// take in anything with type A.
// Even if you try returning something of type A, it does not work.
class IntReaderAppender(val appender: BinaryAppendableVector[Int], val col: Int)
extends RowReaderAppender {
  final def appendData(row: RowReader): Unit = {
    appender.addData(row.getInt(col))
  }
}

class LongReaderAppender(val appender: BinaryAppendableVector[Long], val col: Int)
extends RowReaderAppender {
  final def appendData(row: RowReader): Unit = {
    appender.addData(row.getLong(col))
  }
}

class DoubleReaderAppender(val appender: BinaryAppendableVector[Double], val col: Int)
extends RowReaderAppender {
  final def appendData(row: RowReader): Unit = {
    appender.addData(row.getDouble(col))
  }
}

class StringReaderAppender(val appender: BinaryAppendableVector[ZeroCopyUTF8String], val col: Int)
extends RowReaderAppender {
  final def appendData(row: RowReader): Unit = {
    appender.addData(row.filoUTF8String(col))
  }
}

/**
 * An appender that creates correctly-sized ConstVectors for static partition columns, and skips
 * reading from the RowReader for efficiency: appendData never touches the row, it only calls
 * addNA() on the appender.
 */
final case class ConstAppender[A](appender: ConstAppendingVector[A], col: Int)
extends RowReaderAppender {
  final def appendData(row: RowReader): Unit = {
    appender.addNA()
  }
}
/schema/flatbuffers/gen-java/org/velvia/filo/vector/SimpleStringVector.java: -------------------------------------------------------------------------------- 1 | // automatically generated, do not modify 2 | 3 | package org.velvia.filo.vector; 4 | 5 | import java.nio.*; 6 | import java.lang.*; 7 | import java.util.*; 8 | import com.google.flatbuffers.*; 9 | 10 | @SuppressWarnings("unused") 11 | public final class SimpleStringVector extends Table { 12 | public static SimpleStringVector getRootAsSimpleStringVector(ByteBuffer _bb) { return getRootAsSimpleStringVector(_bb, new SimpleStringVector()); } 13 | public static SimpleStringVector getRootAsSimpleStringVector(ByteBuffer _bb, SimpleStringVector obj) { _bb.order(ByteOrder.LITTLE_ENDIAN); return (obj.__init(_bb.getInt(_bb.position()) + _bb.position(), _bb)); } 14 | public SimpleStringVector __init(int _i, ByteBuffer _bb) { bb_pos = _i; bb = _bb; return this; } 15 | 16 | public NaMask naMask() { return naMask(new NaMask()); } 17 | public NaMask naMask(NaMask obj) { int o = __offset(4); return o != 0 ? obj.__init(__indirect(o + bb_pos), bb) : null; } 18 | public String data(int j) { int o = __offset(6); return o != 0 ? __string(__vector(o) + j * 4) : null; } 19 | public int dataLength() { int o = __offset(6); return o != 0 ? 
/**
 * A whole bunch of encoders for simple (no compression) binary representation of sequences,
 * using Google FlatBuffers
 */
object SimpleEncoders extends ThreadLocalBuffers {
  import Utils._

  // Incremented once per toPrimitiveVector call.
  var count = 0

  /**
   * Creates a SimplePrimitiveVector-based Filo vector.
   * @param data the values to encode; its length is stored as the vector length
   * @param naMask the NA mask passed to populateNaMask
   * @param min the minimum value from the data points that are available.
   *            Be careful not to include points from NA parts of the data sequence.
   * @param max the maximum value, handed to the data vector builder together with min
   * @return the finished buffer, with the simple/primitive wire-format header
   */
  def toPrimitiveVector[A: PrimitiveDataVectBuilder](data: Seq[A],
                                                     naMask: BitSet,
                                                     min: A,
                                                     max: A): ByteBuffer = {
    import SimplePrimitiveVector._

    count += 1
    val numElems = data.length
    val builder = new FlatBufferBuilder(getBuffer)
    // The NA mask and the data vector must exist before the enclosing table is started.
    val maskOffset = populateNaMask(builder, naMask, numElems)
    val ((valuesOffset, bitsPerValue), isSigned) =
      implicitly[PrimitiveDataVectBuilder[A]].build(builder, data, min, max)
    startSimplePrimitiveVector(builder)
    addNaMask(builder, maskOffset)
    addLen(builder, numElems)
    addData(builder, valuesOffset)
    // The DataInfo struct is created in place, while the table is being built.
    addInfo(builder, DataInfo.createDataInfo(builder, bitsPerValue, isSigned))
    val tableOffset = endSimplePrimitiveVector(builder)
    finishSimplePrimitiveVectorBuffer(builder, tableOffset)
    putHeaderAndGet(builder, WireFormat.VECTORTYPE_SIMPLE, WireFormat.SUBTYPE_PRIMITIVE)
  }

  /**
   * Creates a 4-byte little-endian buffer holding WireFormat.emptyVector(len), positioned at 0.
   */
  def toEmptyVector(len: Int): ByteBuffer =
    ByteBuffer.allocate(4)
      .order(ByteOrder.LITTLE_ENDIAN)
      .putInt(0, WireFormat.emptyVector(len))   // absolute put: the position stays at 0

  /**
   * Creates a SimpleStringVector-based Filo vector from the strings and their NA mask.
   */
  def toStringVector(data: Seq[String], naMask: BitSet): ByteBuffer = {
    val builder = new FlatBufferBuilder(getBuffer)
    val maskOffset = populateNaMask(builder, naMask, data.length)
    val stringsOffset = stringVect(builder, data)
    val tableOffset =
      SimpleStringVector.createSimpleStringVector(builder, maskOffset, stringsOffset)
    SimpleStringVector.finishSimpleStringVectorBuffer(builder, tableOffset)
    putHeaderAndGet(builder, WireFormat.VECTORTYPE_SIMPLE, WireFormat.SUBTYPE_STRING)
  }
}
/**
 * Measures read speed for a dictionary-encoded string Filo column.
 * Has tests for both no-NA and some-NA read speed.
 * Since real world datasets tend to contain lots of string data, this is
 * probably a much more realistic speed benchmark than Ints.
 * Simulates somewhat-realistic data by varying string length and using alphanum chars.
 *
 * TODO: compare against Seq[String] encoding in MessagePack, etc.
 */
@State(Scope.Thread)
class DictStringBenchmark {
  import scala.util.Random.{alphanumeric, nextInt, nextFloat}
  import VectorReader._

  val numValues = 10000
  // NOTE: results show that time spent is heavily influenced by ratio of unique strings...
  val numUniqueStrings = 500
  val maxStringLength = 15
  val minStringLength = 5
  val naChance = 0.05    //5% of values will be NA

  /** Returns a random alphanumeric string of exactly `len` characters. */
  def randString(len: Int): String = alphanumeric.take(len).mkString

  // nextInt's bound is exclusive, so lengths range from minStringLength to maxStringLength - 1.
  val uniqueStrings = (0 until numUniqueStrings).map { i =>
    randString(minStringLength + nextInt(maxStringLength - minStringLength))
  }
  val randomStrings = (0 until numValues).map(i => uniqueStrings(nextInt(numUniqueStrings)))
  val filoBufferNoNA = VectorBuilder(randomStrings).toFiloBuffer
  val scNoNA = FiloVector[String](filoBufferNoNA)

  /** Randomly true with probability naChance. */
  def shouldNA: Boolean = nextFloat < naChance

  val filoBufferNA = VectorBuilder.fromOptions(
                       randomStrings.map(str => if (shouldNA) None else Some(str))
                     ).toFiloBuffer
  val scNA = FiloVector[String](filoBufferNA)

  // Measures indexed (apply) read speed on the column without NAs
  @Benchmark
  @BenchmarkMode(Array(Mode.AverageTime))
  @OutputTimeUnit(TimeUnit.MICROSECONDS)
  def rawStringLengthTotal(): Int = {
    var totalLen = 0
    for { i <- 0 until numValues optimized } {
      totalLen += scNoNA(i).length
    }
    totalLen
  }

  // TODO: also a benchmark for the foreach/fold of a column with no NA's?

  // Measures foreach and NA read speed.
  // Returns the accumulated length so that the result is handed back to JMH; with a Unit
  // return type the trailing `totalLen` expression was simply discarded.
  @Benchmark
  @BenchmarkMode(Array(Mode.AverageTime))
  @OutputTimeUnit(TimeUnit.MICROSECONDS)
  def withNAlengthTotal(): Int = {
    var totalLen = 0
    scNA.foreach { str => totalLen += str.length }
    totalLen
  }
}
/**
 * Tests that RowToVectorBuilder turns a sequence of rows into per-column Filo buffers,
 * for both String and ZeroCopyUTF8String name columns.
 */
class RowToVectorBuilderTest extends FunSpec with Matchers {
  val schema = Seq(
    VectorInfo("name", classOf[String]),
    VectorInfo("age", classOf[Int])
  )

  // The second row has no age.
  val rows = Seq(
    (Some("Matthew Perry"), Some(18)),
    (Some("Michelle Pfeiffer"), None),
    (Some("George C"), Some(59)),
    (Some("Rich Sherman"), Some(26))
  )

  val utf8schema = Seq(
    VectorInfo("name", classOf[ZeroCopyUTF8String]),
    VectorInfo("age", classOf[Int])
  )

  // The names of `rows`, in row order; shared by all three tests.
  private val expectedNames =
    List("Matthew Perry", "Michelle Pfeiffer", "George C", "Rich Sherman")

  describe("RowToVectorBuilder") {
    import VectorReader._

    it("should add rows and convert them to Filo binary Seqs") {
      val builder = new RowToVectorBuilder(schema)
      rows.foreach { row => builder.addRow(TupleRowReader(row)) }
      builder.addEmptyRow()
      val encoded = builder.convertToBytes()

      encoded.keys should equal (Set("name", "age"))

      val names = FiloVector[String](encoded("name"))
      names.toList should equal (expectedNames)

      // Four rows plus the empty row.
      val ages = FiloVector[Int](encoded("age"))
      ages should have length (5)
      ages(0) should equal (18)
      ages.toList should equal (List(18, 59, 26))
    }

    it("should add UTF8 rows and convert to Filo binary seqs") {
      val builder = new RowToVectorBuilder(utf8schema)
      rows.foreach { row => builder.addRow(TupleRowReader(row)) }
      builder.addEmptyRow()
      val encoded = builder.convertToBytes()

      encoded.keys should equal (Set("name", "age"))

      val names = FiloVector[ZeroCopyUTF8String](encoded("name"))
      names.length should equal (rows.length + 1)
      names.map(_.toString) should equal (expectedNames)
    }

    it("convenience func should turn rows into bytes") {
      val rowReaders = rows.map(TupleRowReader).iterator
      val encoded = RowToVectorBuilder.buildFromRows(rowReaders,
                                                     schema,
                                                     BuilderEncoder.SimpleEncoding)

      encoded.keys should equal (Set("name", "age"))

      val names = FiloVector[String](encoded("name"))
      names.toList should equal (expectedNames)
    }
  }
}
/**
 * Base class for FiloVectors backed by a DiffPrimitiveVector.
 * Exposes a typed reader over the vector's data (using the stored nbits/signed info) and a
 * reader created from the vector's base value; subclasses supply apply().
 */
abstract class DiffPrimitiveWrapper[A: TypedReaderProvider, P](dpv: DiffPrimitiveVector)
extends NaMaskAvailable[P](dpv.naMask) {
  val info = dpv.info
  val _len = dpv.len
  val dataReader = TypedBufferReader[A](FastBufferReader(dpv.dataAsByteBuffer()),
                                        info.nbits, info.signed)
  val baseReader = FastBufferReader(dpv.base)

  final def length: Int = _len

  /** Calls fn on every available element, in index order. */
  final def foreach[B](fn: P => B): Unit = {
    if (!isEmptyMask) {
      for { i <- 0 until length optimized } { if (isAvailable(i)) fn(apply(i)) }
    } else {
      // Empty mask: every value is available, so the per-element check is skipped.
      for { i <- 0 until length optimized } { fn(apply(i)) }
    }
  }
}

/**
 * A FiloVector that represents DateTime's with a fixed single TimeZone
 * and a differentially encoded millis vector
 */
abstract class DiffDateTimeWrapperBase(ddtv: DiffDateTimeVector)
extends NaMaskAvailable[DateTime](ddtv.naMask) {
  import TypedBufferReader._

  val _len = ddtv.vars.len
  // Each element's millis value is read as an offset that is added to this base.
  val millisBase: Long = ddtv.vars.baseMillis
  val millisReader = TypedBufferReader[Long](FastBufferReader(ddtv.millisAsByteBuffer),
                                             ddtv.millisInfo.nbits, ddtv.millisInfo.signed)

  final def length: Int = _len

  /** Calls fn on every available element, in index order. */
  final def foreach[B](fn: DateTime => B): Unit = {
    if (!isEmptyMask) {
      for { i <- 0 until length optimized } { if (isAvailable(i)) fn(apply(i)) }
    } else {
      // Empty mask: every value is available, so the per-element check is skipped.
      for { i <- 0 until length optimized } { fn(apply(i)) }
    }
  }
}

/** DateTime vector in which all elements share the zone derived from vars.baseTz. */
class DiffDateTimeWrapper(ddtv: DiffDateTimeVector) extends DiffDateTimeWrapperBase(ddtv) {
  val zone = DateTimeZone.forOffsetMillis(ddtv.vars.baseTz * VectorBuilder.FifteenMinMillis)

  final def apply(i: Int): DateTime = {
    val millis = millisBase + millisReader.read(i)
    new DateTime(millis, zone)
  }
}

/**
 * A variant of DiffDateTimeWrapper that can create a different time zone for each element
 */
class DiffDateTimeWithTZWrapper(ddtv: DiffDateTimeVector) extends DiffDateTimeWrapperBase(ddtv) {
  import TypedBufferReader._

  val tzBase: Byte = ddtv.vars.baseTz
  val tzReader = TypedBufferReader[Int](FastBufferReader(ddtv.tzAsByteBuffer),
                                        ddtv.tzInfo.nbits, ddtv.tzInfo.signed)

  final def apply(i: Int): DateTime = {
    // The zone offset is stored in units of VectorBuilder.FifteenMinMillis, relative to tzBase.
    val offsetMillis = (tzBase + tzReader.read(i)) * VectorBuilder.FifteenMinMillis
    new DateTime(millisBase + millisReader.read(i), DateTimeZone.forOffsetMillis(offsetMillis))
  }
}
(obj.__init(_bb.getInt(_bb.position()) + _bb.position(), _bb)); } 14 | public DiffPrimitiveVector __init(int _i, ByteBuffer _bb) { bb_pos = _i; bb = _bb; return this; } 15 | 16 | public int len() { int o = __offset(4); return o != 0 ? bb.getInt(o + bb_pos) : 0; } 17 | public NaMask naMask() { return naMask(new NaMask()); } 18 | public NaMask naMask(NaMask obj) { int o = __offset(6); return o != 0 ? obj.__init(__indirect(o + bb_pos), bb) : null; } 19 | public long base() { int o = __offset(8); return o != 0 ? bb.getLong(o + bb_pos) : 0; } 20 | public DataInfo info() { return info(new DataInfo()); } 21 | public DataInfo info(DataInfo obj) { int o = __offset(10); return o != 0 ? obj.__init(o + bb_pos, bb) : null; } 22 | public int data(int j) { int o = __offset(12); return o != 0 ? bb.get(__vector(o) + j * 1) & 0xFF : 0; } 23 | public int dataLength() { int o = __offset(12); return o != 0 ? __vector_len(o) : 0; } 24 | public ByteBuffer dataAsByteBuffer() { return __vector_as_bytebuffer(12, 1); } 25 | 26 | public static void startDiffPrimitiveVector(FlatBufferBuilder builder) { builder.startObject(5); } 27 | public static void addLen(FlatBufferBuilder builder, int len) { builder.addInt(0, len, 0); } 28 | public static void addNaMask(FlatBufferBuilder builder, int naMaskOffset) { builder.addOffset(1, naMaskOffset, 0); } 29 | public static void addBase(FlatBufferBuilder builder, long base) { builder.addLong(2, base, 0); } 30 | public static void addInfo(FlatBufferBuilder builder, int infoOffset) { builder.addStruct(3, infoOffset, 0); } 31 | public static void addData(FlatBufferBuilder builder, int dataOffset) { builder.addOffset(4, dataOffset, 0); } 32 | public static int createDataVector(FlatBufferBuilder builder, byte[] data) { builder.startVector(1, data.length, 1); for (int i = data.length - 1; i >= 0; i--) builder.addByte(data[i]); return builder.endVector(); } 33 | public static void startDataVector(FlatBufferBuilder builder, int numElems) { builder.startVector(1, 
object ConstVector {
  /**
   * Allocates and returns bytes for a ConstVector: a 4-byte length followed by one element.
   * @param len the logical length or # of repeats
   * @param neededBytes the bytes needed for one element
   * @param fillBytes a function to fill out bytes at given base and offset
   * @return the (base, offset, numBytes) for the ConstVector
   */
  def make(len: Int, neededBytes: Int)(fillBytes: (Any, Long) => Unit): (Any, Long, Int) = {
    val (vecBase, vecOffset, totalBytes) = BinaryVector.allocWithMagicHeader(4 + neededBytes)
    // The repeat count goes first, the single element right after it.
    UnsafeUtils.setInt(vecBase, vecOffset, len)
    fillBytes(vecBase, vecOffset + 4)
    (vecBase, vecOffset, totalBytes)
  }
}

/** Wire-format type codes shared by ConstVector and its appender. */
trait ConstVectorType {
  val vectMajorType = WireFormat.VECTORTYPE_BINSIMPLE
  val vectSubType = WireFormat.SUBTYPE_REPEATED
}

/**
 * A vector which holds the value of one element repeated n times.
 * The length is the Int stored at `offset`; the element data starts 4 bytes later.
 */
abstract class ConstVector[A](val base: Any, val offset: Long, val numBytes: Int)
extends BinaryVector[A] with ConstVectorType {
  override val length = UnsafeUtils.getInt(base, offset)
  protected val dataOffset = offset + 4
  val maybeNAs = false
  final def isAvailable(i: Int): Boolean = true
}

import BuilderEncoder._

/**
 * An AppendingVector-API compatible class for situations (such as fixed partition keys) where you know
 * the values will be constant and just need an Appender. All this class really does is count up however
 * many instances to repeat, then generates the ConstVector. Both addData and addNA only increment
 * that count; the value passed to addData is ignored.
 *
 * @param value       the element returned by apply for every index
 * @param neededBytes the bytes needed to store one element
 * @param initLen     the initial repeat count
 */
abstract class ConstAppendingVector[@specialized(Int, Long, Double) A](value: A,
                                                                       neededBytes: Int,
                                                                       initLen: Int = 0)
extends BinaryAppendableVector[A] with ConstVectorType {
  private var len = initLen

  // The code to store the value
  def fillBytes(base: Any, offset: Long): Unit

  final def apply(index: Int): A = value
  final def isAvailable(index: Int): Boolean = true
  final def addData(data: A): Unit = { len += 1 }
  final def addNA(): Unit = { len += 1 }
  final def reset(): Unit = { len = 0 }

  final def base: Any = this
  final def offset: Long = 0
  final def numBytes: Int = frozenSize
  final override def length: Int = len

  // 4 bytes for the repeat count plus one element.
  override def frozenSize: Int = 4 + neededBytes
  val maxBytes = frozenSize
  val isAllNA = false
  val noNAs = true
  val maybeNAs = false

  /** Builds the ConstVector bytes for the current count and hands them to finishCompaction. */
  override def optimize(hint: EncodingHint = AutoDetect): BinaryVector[A] = {
    val made = ConstVector.make(len, neededBytes)(fillBytes)
    finishCompaction(made._1, made._2)
  }
}
-------------------------------------------------------------------------------- 1 | // automatically generated, do not modify 2 | 3 | package org.velvia.filo.vector; 4 | 5 | import java.nio.*; 6 | import java.lang.*; 7 | import java.util.*; 8 | import com.google.flatbuffers.*; 9 | 10 | @SuppressWarnings("unused") 11 | public final class DictStringVector extends Table { 12 | public static DictStringVector getRootAsDictStringVector(ByteBuffer _bb) { return getRootAsDictStringVector(_bb, new DictStringVector()); } 13 | public static DictStringVector getRootAsDictStringVector(ByteBuffer _bb, DictStringVector obj) { _bb.order(ByteOrder.LITTLE_ENDIAN); return (obj.__init(_bb.getInt(_bb.position()) + _bb.position(), _bb)); } 14 | public DictStringVector __init(int _i, ByteBuffer _bb) { bb_pos = _i; bb = _bb; return this; } 15 | 16 | public int len() { int o = __offset(4); return o != 0 ? bb.getInt(o + bb_pos) : 0; } 17 | public String dictionary(int j) { int o = __offset(6); return o != 0 ? __string(__vector(o) + j * 4) : null; } 18 | public int dictionaryLength() { int o = __offset(6); return o != 0 ? __vector_len(o) : 0; } 19 | public DataInfo info() { return info(new DataInfo()); } 20 | public DataInfo info(DataInfo obj) { int o = __offset(8); return o != 0 ? obj.__init(o + bb_pos, bb) : null; } 21 | public int codes(int j) { int o = __offset(10); return o != 0 ? bb.get(__vector(o) + j * 1) & 0xFF : 0; } 22 | public int codesLength() { int o = __offset(10); return o != 0 ? 
__vector_len(o) : 0; } 23 | public ByteBuffer codesAsByteBuffer() { return __vector_as_bytebuffer(10, 1); } 24 | 25 | public static void startDictStringVector(FlatBufferBuilder builder) { builder.startObject(4); } 26 | public static void addLen(FlatBufferBuilder builder, int len) { builder.addInt(0, len, 0); } 27 | public static void addDictionary(FlatBufferBuilder builder, int dictionaryOffset) { builder.addOffset(1, dictionaryOffset, 0); } 28 | public static int createDictionaryVector(FlatBufferBuilder builder, int[] data) { builder.startVector(4, data.length, 4); for (int i = data.length - 1; i >= 0; i--) builder.addOffset(data[i]); return builder.endVector(); } 29 | public static void startDictionaryVector(FlatBufferBuilder builder, int numElems) { builder.startVector(4, numElems, 4); } 30 | public static void addInfo(FlatBufferBuilder builder, int infoOffset) { builder.addStruct(2, infoOffset, 0); } 31 | public static void addCodes(FlatBufferBuilder builder, int codesOffset) { builder.addOffset(3, codesOffset, 0); } 32 | public static int createCodesVector(FlatBufferBuilder builder, byte[] data) { builder.startVector(1, data.length, 1); for (int i = data.length - 1; i >= 0; i--) builder.addByte(data[i]); return builder.endVector(); } 33 | public static void startCodesVector(FlatBufferBuilder builder, int numElems) { builder.startVector(1, numElems, 1); } 34 | public static int endDictStringVector(FlatBufferBuilder builder) { 35 | int o = builder.endObject(); 36 | return o; 37 | } 38 | public static void finishDictStringVectorBuffer(FlatBufferBuilder builder, int offset) { builder.finish(offset); } 39 | }; 40 | 41 | -------------------------------------------------------------------------------- /filo-scala/src/main/scala/org.velvia.filo/codecs/DiffEncoders.scala: -------------------------------------------------------------------------------- 1 | package org.velvia.filo.codecs 2 | 3 | import com.google.flatbuffers.FlatBufferBuilder 4 | import 
java.nio.{ByteBuffer, ByteOrder} 5 | import org.joda.time.DateTime 6 | import scala.collection.mutable.BitSet 7 | 8 | import org.velvia.filo._ 9 | import org.velvia.filo.vector._ 10 | 11 |
/**
 * Encoders that store deltas from a base value to reduce the FiloVector size.
 */
object DiffEncoders extends ThreadLocalBuffers {
  import Utils._

  // Bumped once by every encode call below. It is a plain var updated with +=, so the
  // read-modify-write is not atomic.
  var count = 0

  /**
   * Creates a DiffPrimitiveVector-based Filo vector.
   * @param data the values to encode
   * @param naMask the bitmask passed to populateNaMask together with data.length
   * @param min the minimum value from the data points that are available.
   *            Be careful not to include points from NA parts of the data sequence.
   *            It is also stored (as a Long) as the vector's base.
   * @param max the maximum value, handed to buildDeltas along with min
   */
  def toPrimitiveVector[A: PrimitiveDataVectBuilder](data: Seq[A],
                                                     naMask: BitSet,
                                                     min: A,
                                                     max: A): ByteBuffer = {
    import DiffPrimitiveVector._

    val deltaBuilder = implicitly[PrimitiveDataVectBuilder[A]]
    count += 1
    val fbb = new FlatBufferBuilder(getBuffer)
    val naOffset = populateNaMask(fbb, naMask, data.length)

    // The NA mask and the delta vector are written before the table is started;
    // the table then only records their offsets.
    val (offsetAndBits, deltasSigned) = deltaBuilder.buildDeltas(fbb, data, min, max)
    val (deltasOffset, deltaBits) = offsetAndBits

    startDiffPrimitiveVector(fbb)
    addNaMask(fbb, naOffset)
    addLen(fbb, data.length)
    addData(fbb, deltasOffset)
    addInfo(fbb, DataInfo.createDataInfo(fbb, deltaBits, deltasSigned))
    addBase(fbb, deltaBuilder.toLong(min))
    val rootOffset = endDiffPrimitiveVector(fbb)
    finishDiffPrimitiveVectorBuffer(fbb, rootOffset)

    putHeaderAndGet(fbb, WireFormat.VECTORTYPE_DIFF, WireFormat.SUBTYPE_PRIMITIVE)
  }

  /**
   * Creates a DiffDateTimeVector-based Filo vector from separate millis and timezone builders.
   * The timezone delta vector (and its DataInfo) is only written when tz.min differs from tz.max;
   * tz.min itself is always stored in the DDTVars struct.
   */
  def toDateTimeVector(millis: LongVectorBuilder,
                       tz: IntVectorBuilder,
                       naMask: BitSet): ByteBuffer = {
    import DiffDateTimeVector._

    val intVectBuilder = AutoIntegralDVBuilders.IntDataVectBuilder
    val longVectBuilder = AutoIntegralDVBuilders.LongDataVectBuilder
    count += 1
    val fbb = new FlatBufferBuilder(getBuffer)
    val naOffset = populateNaMask(fbb, naMask, millis.length)

    val (millisOffsetAndBits, msigned) =
      longVectBuilder.buildDeltas(fbb, millis.data, millis.min, millis.max)
    val (mOffset, mnbits) = millisOffsetAndBits

    // Only build timezone vector if they are different.  Most DateTime's have same TZ
    // A negative offset marks "no timezone vector".
    val ((tOffset, tnbits), tsigned) =
      if (tz.min != tz.max) intVectBuilder.buildDeltas(fbb, tz.data, tz.min, tz.max)
      else ((-1, -1), false)

    startDiffDateTimeVector(fbb)
    addNaMask(fbb, naOffset)
    addVars(fbb, DDTVars.createDDTVars(fbb, millis.length, tz.min.toByte, millis.min))
    addMillisInfo(fbb, DataInfo.createDataInfo(fbb, mnbits, msigned))
    addMillis(fbb, mOffset)
    val hasTzVector = tOffset >= 0
    if (hasTzVector) {
      addTzInfo(fbb, DataInfo.createDataInfo(fbb, tnbits, tsigned))
      addTz(fbb, tOffset)
    }
    val rootOffset = endDiffDateTimeVector(fbb)
    finishDiffDateTimeVectorBuffer(fbb, rootOffset)

    putHeaderAndGet(fbb, WireFormat.VECTORTYPE_DIFF, WireFormat.SUBTYPE_DATETIME)
  }
}
-------------------------------------------------------------------------------- /filo-scala/src/main/scala/org.velvia.filo/FiloRowReader.scala: -------------------------------------------------------------------------------- 1 | package org.velvia.filo 2 | 3 | import java.nio.ByteBuffer 4 | 5 | /** 6 | * A RowReader designed for iteration over rows of multiple Filo vectors, ideally all 7 | * with the same length. 8 | * An Iterator[RowReader] sets the rowNo and returns this RowReader, and 9 | * the application is responsible for calling the right method to extract each value. 10 | * For example, a Spark Row can inherit from RowReader. 11 | */ 12 | trait FiloRowReader extends RowReader { 13 | def parsers: Array[FiloVector[_]] 14 | def rowNo: Int 15 | def setRowNo(newRowNo: Int) 16 | } 17 | 18 | /** 19 | * Just a concrete implementation. 20 | * Designed to minimize allocation by having iterator repeatedly set/update rowNo. 21 | * Thus, this is not appropriate for Seq[RowReader] or conversion to Seq.
22 | */ 23 | class FastFiloRowReader(val parsers: Array[FiloVector[_]]) extends FiloRowReader { 24 | var rowNo: Int = -1 25 | def setRowNo(newRowNo: Int): Unit = { rowNo = newRowNo } 26 | 27 | def this(chunks: Array[ByteBuffer], classes: Array[Class[_]], emptyLen: Int = 0) = 28 | this(FiloVector.makeVectors(chunks, classes, emptyLen)) 29 | 30 | final def notNull(columnNo: Int): Boolean = parsers(columnNo).isAvailable(rowNo) 31 | final def getBoolean(columnNo: Int): Boolean = parsers(columnNo).asInstanceOf[FiloVector[Boolean]](rowNo) 32 | final def getInt(columnNo: Int): Int = parsers(columnNo).asInstanceOf[FiloVector[Int]](rowNo) 33 | final def getLong(columnNo: Int): Long = parsers(columnNo).asInstanceOf[FiloVector[Long]](rowNo) 34 | final def getDouble(columnNo: Int): Double = parsers(columnNo).asInstanceOf[FiloVector[Double]](rowNo) 35 | final def getFloat(columnNo: Int): Float = parsers(columnNo).asInstanceOf[FiloVector[Float]](rowNo) 36 | final def getString(columnNo: Int): String = parsers(columnNo).asInstanceOf[FiloVector[String]](rowNo) 37 | override final def filoUTF8String(columnNo: Int): ZeroCopyUTF8String = 38 | parsers(columnNo).asInstanceOf[FiloVector[ZeroCopyUTF8String]](rowNo) 39 | final def getAny(columnNo: Int): Any = parsers(columnNo).boxed(rowNo) 40 | } 41 | 42 | // A RowReader that can safely be used in Seqs. IE the rowNo is final and won't change. 
case class SafeFiloRowReader(reader: FiloRowReader, rowNo: Int) extends FiloRowReader {
  // Shares the wrapped reader's vectors; only the row number is pinned
  val parsers: Array[FiloVector[_]] = reader.parsers

  // Upper bound is checked against the first column only
  require(rowNo < parsers(0).length)

  // Deliberately a no-op: the row number of this reader never changes
  def setRowNo(newRowNo: Int): Unit = ()

  final def notNull(columnNo: Int): Boolean =
    parsers(columnNo).isAvailable(rowNo)

  final def getBoolean(columnNo: Int): Boolean =
    parsers(columnNo).asInstanceOf[FiloVector[Boolean]].apply(rowNo)

  final def getInt(columnNo: Int): Int =
    parsers(columnNo).asInstanceOf[FiloVector[Int]].apply(rowNo)

  final def getLong(columnNo: Int): Long =
    parsers(columnNo).asInstanceOf[FiloVector[Long]].apply(rowNo)

  final def getDouble(columnNo: Int): Double =
    parsers(columnNo).asInstanceOf[FiloVector[Double]].apply(rowNo)

  final def getFloat(columnNo: Int): Float =
    parsers(columnNo).asInstanceOf[FiloVector[Float]].apply(rowNo)

  final def getString(columnNo: Int): String =
    parsers(columnNo).asInstanceOf[FiloVector[String]].apply(rowNo)

  override final def filoUTF8String(columnNo: Int): ZeroCopyUTF8String =
    parsers(columnNo).asInstanceOf[FiloVector[ZeroCopyUTF8String]].apply(rowNo)

  final def getAny(columnNo: Int): Any =
    parsers(columnNo).boxed(rowNo)
}
59 | 60 | -------------------------------------------------------------------------------- /filo-scala/src/test/scala/org.velvia.filo/ZeroCopyBinaryTest.scala: -------------------------------------------------------------------------------- 1 | package org.velvia.filo 2 | 3 | import org.scalatest.{FunSpec, Matchers} 4 | import org.scalatest.prop.PropertyChecks 5 | 6 | class ZeroCopyBinaryTest extends FunSpec with Matchers with PropertyChecks { 7 | describe("ZeroCopyUTF8String") { 8 | it("should convert back and forth between regular strings") { 9 | ZeroCopyUTF8String("sheep").asNewString should equal ("sheep") 10 | } 11 | 12 | import ZeroCopyUTF8String._ 13 | import Ordered._ 14 | 15 | it("should compare two strings properly") { 16 | // Unequal lengths, equal prefix 17 | ZeroCopyUTF8String("boobeebob") should be > 
(ZeroCopyUTF8String("boobee")) 18 | 19 | // Equal lengths, different content 20 | // First comparison fights against int comparisons without proper byte ordering 21 | ZeroCopyUTF8String("aaab") should be < (ZeroCopyUTF8String("baaa")) 22 | "bobcat".utf8 should equal ("bobcat".utf8) 23 | 24 | // Strings longer than 8 chars (in case comparison uses long compare) 25 | "dictionary".utf8 should be < ("pictionar".utf8) 26 | "dictionary".utf8 should be > ("dictionaries".utf8) 27 | 28 | // Calling equals to some other type should return false 29 | ZeroCopyUTF8String("dictionary") should not equal ("dictionary") 30 | } 31 | 32 | it("should compare random strings properly") { 33 | import java.lang.Integer.signum 34 | forAll { (strs: (String, String)) => 35 | val nativeCmp = signum(strs._1.compare(strs._2)) 36 | signum(ZeroCopyUTF8String(strs._1).compare(ZeroCopyUTF8String(strs._2))) should equal (nativeCmp) 37 | } 38 | } 39 | 40 | it("should get bytes back and convert back to instance, and compare equally") { 41 | val origUTF8Str = ZeroCopyUTF8String("dictionary") 42 | ZeroCopyUTF8String(origUTF8Str.bytes) should equal (origUTF8Str) 43 | } 44 | 45 | it("should generate same hashcode for same content") { 46 | "bobcat".utf8.hashCode should equal ("bobcat".utf8.hashCode) 47 | "bobcat".utf8.hashCode should not equal (ZeroCopyUTF8String("bob").hashCode) 48 | 49 | "bobcat".utf8.cachedHash64 should equal ("bobcat".utf8.cachedHash64) 50 | "bobcat".utf8.cachedHash64 should not equal (ZeroCopyUTF8String("bob").cachedHash64) 51 | } 52 | 53 | val str1 = ZeroCopyUTF8String("1234") 54 | val str2 = ZeroCopyUTF8String("一2三4") 55 | val str3 = ZeroCopyUTF8String("一二34") 56 | 57 | it("should get substring correctly") { 58 | str1.substring(3, 2) should equal (ZeroCopyUTF8String("")) 59 | str2.substring(0, 2) should equal (ZeroCopyUTF8String("一2")) 60 | str2.substring(1, 5) should equal (ZeroCopyUTF8String("2三4")) 61 | str3.substring(0, 3) should equal (ZeroCopyUTF8String("一二3")) 62 | 
str2.substring(1, 3) should equal (ZeroCopyUTF8String("2三")) 63 | } 64 | 65 | it("should startsWith and endsWith correctly") { 66 | str2.startsWith(ZeroCopyUTF8String("一2")) should equal (true) 67 | str2.startsWith(ZeroCopyUTF8String("2三")) should equal (false) 68 | str2.startsWith(str1) should equal (false) 69 | 70 | str2.endsWith(str3) should equal (false) 71 | str2.endsWith(ZeroCopyUTF8String("4")) should equal (true) 72 | } 73 | 74 | it("should check contains correctly") { 75 | str2.contains(ZeroCopyUTF8String("2三")) should equal (true) 76 | str2.contains(str1) should equal (false) 77 | } 78 | } 79 | } -------------------------------------------------------------------------------- /filo-scala/src/main/scala/org.velvia.filo/RowToVectorBuilder.scala: -------------------------------------------------------------------------------- 1 | package org.velvia.filo 2 | 3 | import java.nio.ByteBuffer 4 | import scala.language.existentials 5 | import scala.language.postfixOps 6 | import scalaxy.loops._ 7 | 8 | import BuilderEncoder.{EncodingHint, AutoDetect} 9 | 10 | case class VectorInfo(name: String, dataType: Class[_]) 11 | 12 | // To help matching against the ClassTag in the VectorBuilder 13 | private object Classes { 14 | val Boolean = classOf[Boolean] 15 | val Byte = java.lang.Byte.TYPE 16 | val Short = java.lang.Short.TYPE 17 | val Int = java.lang.Integer.TYPE 18 | val Long = java.lang.Long.TYPE 19 | val Float = java.lang.Float.TYPE 20 | val Double = java.lang.Double.TYPE 21 | val String = classOf[String] 22 | val DateTime = classOf[org.joda.time.DateTime] 23 | val SqlTimestamp = classOf[java.sql.Timestamp] 24 | val UTF8 = classOf[ZeroCopyUTF8String] 25 | } 26 | 27 | object RowToVectorBuilder { 28 | /** 29 | * A convenience method to turn a bunch of rows R to Filo serialized columnar chunks. 
30 | * @param rows the rows to convert to columnar chunks 31 | * @param schema a Seq of VectorInfo describing the [[VectorBuilder]] used for each column 32 | * @param hint an EncodingHint for the encoder 33 | * @return a Map of column name to the byte chunks 34 | */ 35 | def buildFromRows(rows: Iterator[RowReader], 36 | schema: Seq[VectorInfo], 37 | hint: EncodingHint = AutoDetect): Map[String, ByteBuffer] = { 38 | val builder = new RowToVectorBuilder(schema) 39 | rows.foreach(builder.addRow) 40 | builder.convertToBytes(hint) 41 | } 42 | } 43 | 44 | /** 45 | * Class to help transpose a set of rows to Filo binary vectors. 46 | * @param schema a Seq of VectorInfo describing the data type used for each vector 47 | * @param builderMap pass in a custom BuilderMap to extend the supported vector types 48 | * 49 | * TODO: Add stats about # of rows, chunks/buffers encoded, bytes encoded, # NA's etc. 50 | */ 51 | class RowToVectorBuilder(schema: Seq[VectorInfo], 52 | builderMap: VectorBuilder.BuilderMap = VectorBuilder.defaultBuilderMap) { 53 | val builders = schema.map { case VectorInfo(_, dataType) => VectorBuilder(dataType, builderMap) } 54 | val numColumns = schema.length 55 | 56 | /** 57 | * Resets the VectorBuilders. Call this before the next batch of rows to transpose. 58 | * @return {[type]} [description] 59 | */ 60 | def reset(): Unit = { 61 | builders.foreach(_.reset()) 62 | } 63 | 64 | /** 65 | * Adds a single row of data to each of the VectorBuilders. 66 | * @param row the row of data to transpose. Each column will be added to the right Builders. 67 | */ 68 | def addRow(row: RowReader): Unit = { 69 | for { i <- 0 until numColumns optimized } { 70 | builders(i).add(row, i) 71 | } 72 | } 73 | 74 | /** 75 | * Adds a single blank NA value to all builders 76 | */ 77 | def addEmptyRow(): Unit = { 78 | builders.foreach(_.addNA()) 79 | } 80 | 81 | /** 82 | * Converts the contents of the [[VectorBuilder]]s to ByteBuffers for writing or transmission. 
83 | * @param hint an EncodingHint for the encoder 84 | */ 85 | def convertToBytes(hint: EncodingHint = AutoDetect): Map[String, ByteBuffer] = { 86 | val chunks = builders.map(_.toFiloBuffer(hint)) 87 | schema.zip(chunks).map { case (VectorInfo(colName, _), bytes) => (colName, bytes) }.toMap 88 | } 89 | 90 | private def unsupportedInput(typ: Any) = 91 | throw new RuntimeException("Unsupported input type " + typ) 92 | } 93 | 94 | -------------------------------------------------------------------------------- /filo-scala-jmh/src/main/scala/org.velvia.filo/BasicFiloBenchmark.scala: -------------------------------------------------------------------------------- 1 | package org.velvia.filo 2 | 3 | import org.openjdk.jmh.annotations.Benchmark 4 | import org.openjdk.jmh.annotations.BenchmarkMode 5 | import org.openjdk.jmh.annotations.{Mode, State, Scope} 6 | import org.openjdk.jmh.annotations.OutputTimeUnit 7 | import scalaxy.loops._ 8 | import scala.language.postfixOps 9 | 10 | import java.util.concurrent.TimeUnit 11 | 12 | /** 13 | * Measures basic read benchmark with no NAs for an IntColumn. 14 | * Just raw read speed basically. 15 | * Measures read speed of different encodings (int, byte, diff) as well as 16 | * different read methods. 17 | * 18 | * For a description of the JMH measurement modes, see 19 | * https://github.com/ktoso/sbt-jmh/blob/master/src/sbt-test/sbt-jmh/jmh-run/src/main/scala/org/openjdk/jmh/samples/JMHSample_02_BenchmarkModes.scala 20 | */ 21 | @State(Scope.Thread) 22 | class BasicFiloBenchmark { 23 | import VectorReader._ 24 | import vectors.IntBinaryVector 25 | 26 | // Ok, create an IntColumn and benchmark it. 
27 | val numValues = 10000 28 | 29 | val randomInts = (0 until numValues).map(i => util.Random.nextInt) 30 | val randomIntsAray = randomInts.toArray 31 | val filoBuffer = VectorBuilder(randomInts).toFiloBuffer 32 | val sc = FiloVector[Int](filoBuffer) 33 | 34 | val ivbuilder = IntBinaryVector.appendingVectorNoNA(numValues) 35 | randomInts.foreach(ivbuilder.addData) 36 | val iv = IntBinaryVector(ivbuilder.base, ivbuilder.offset, ivbuilder.numBytes) 37 | 38 | val byteFiloBuf = VectorBuilder(randomInts.map(_ % 128)).toFiloBuffer 39 | val byteVect = FiloVector[Int](byteFiloBuf) 40 | 41 | val diffFiloBuf = VectorBuilder(randomInts.map(10000 + _ % 128)).toFiloBuffer 42 | val diffVect = FiloVector[Int](diffFiloBuf) 43 | 44 | // According to @ktosopl, be sure to return some value if possible so that JVM won't 45 | // optimize out the method body. However JMH is apparently very good at avoiding this. 46 | // fastest loop possible using FiloVectorApply method 47 | @Benchmark 48 | @BenchmarkMode(Array(Mode.AverageTime)) 49 | @OutputTimeUnit(TimeUnit.MICROSECONDS) 50 | def sumAllIntsFiloApply(): Int = { 51 | var total = 0 52 | for { i <- 0 until numValues optimized } { 53 | total += sc(i) 54 | } 55 | total 56 | } 57 | 58 | @Benchmark 59 | @BenchmarkMode(Array(Mode.AverageTime)) 60 | @OutputTimeUnit(TimeUnit.MICROSECONDS) 61 | def sumAllIntsBinaryVectApply(): Int = { 62 | var total = 0 63 | for { i <- 0 until numValues optimized } { 64 | total += iv(i) 65 | } 66 | total 67 | } 68 | 69 | @Benchmark 70 | @BenchmarkMode(Array(Mode.AverageTime)) 71 | @OutputTimeUnit(TimeUnit.MICROSECONDS) 72 | def sumAllIntsFiloByteApply(): Int = { 73 | var total = 0 74 | for { i <- 0 until numValues optimized } { 75 | total += byteVect(i) 76 | } 77 | total 78 | } 79 | 80 | @Benchmark 81 | @BenchmarkMode(Array(Mode.AverageTime)) 82 | @OutputTimeUnit(TimeUnit.MICROSECONDS) 83 | def sumAllIntsFiloDiffApply(): Int = { 84 | var total = 0 85 | for { i <- 0 until numValues optimized } { 86 | total += 
diffVect(i) 87 | } 88 | total 89 | } 90 | 91 | @Benchmark 92 | @BenchmarkMode(Array(Mode.AverageTime)) 93 | @OutputTimeUnit(TimeUnit.MICROSECONDS) 94 | def sumAllNotNullIntsFiloApply(): Int = { 95 | var total = 0 96 | for { i <- 0 until numValues optimized } { 97 | if (sc.isAvailable(i)) total += sc(i) 98 | } 99 | total 100 | } 101 | 102 | // sum which uses foreach from FiloVector 103 | @Benchmark 104 | @BenchmarkMode(Array(Mode.AverageTime)) 105 | @OutputTimeUnit(TimeUnit.MICROSECONDS) 106 | def sumAllIntsFiloForeachFoldLeft(): Int = { 107 | sc.foldLeft(0)(_ + _) 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /schema/flatbuffers/gen-java/org/velvia/filo/vector/DiffDateTimeVector.java: -------------------------------------------------------------------------------- 1 | // automatically generated, do not modify 2 | 3 | package org.velvia.filo.vector; 4 | 5 | import java.nio.*; 6 | import java.lang.*; 7 | import java.util.*; 8 | import com.google.flatbuffers.*; 9 | 10 | @SuppressWarnings("unused") 11 | public final class DiffDateTimeVector extends Table { 12 | public static DiffDateTimeVector getRootAsDiffDateTimeVector(ByteBuffer _bb) { return getRootAsDiffDateTimeVector(_bb, new DiffDateTimeVector()); } 13 | public static DiffDateTimeVector getRootAsDiffDateTimeVector(ByteBuffer _bb, DiffDateTimeVector obj) { _bb.order(ByteOrder.LITTLE_ENDIAN); return (obj.__init(_bb.getInt(_bb.position()) + _bb.position(), _bb)); } 14 | public DiffDateTimeVector __init(int _i, ByteBuffer _bb) { bb_pos = _i; bb = _bb; return this; } 15 | 16 | public NaMask naMask() { return naMask(new NaMask()); } 17 | public NaMask naMask(NaMask obj) { int o = __offset(4); return o != 0 ? obj.__init(__indirect(o + bb_pos), bb) : null; } 18 | public DDTVars vars() { return vars(new DDTVars()); } 19 | public DDTVars vars(DDTVars obj) { int o = __offset(6); return o != 0 ? 
obj.__init(o + bb_pos, bb) : null; } 20 | public DataInfo millisInfo() { return millisInfo(new DataInfo()); } 21 | public DataInfo millisInfo(DataInfo obj) { int o = __offset(8); return o != 0 ? obj.__init(o + bb_pos, bb) : null; } 22 | public int millis(int j) { int o = __offset(10); return o != 0 ? bb.get(__vector(o) + j * 1) & 0xFF : 0; } 23 | public int millisLength() { int o = __offset(10); return o != 0 ? __vector_len(o) : 0; } 24 | public ByteBuffer millisAsByteBuffer() { return __vector_as_bytebuffer(10, 1); } 25 | public DataInfo tzInfo() { return tzInfo(new DataInfo()); } 26 | public DataInfo tzInfo(DataInfo obj) { int o = __offset(12); return o != 0 ? obj.__init(o + bb_pos, bb) : null; } 27 | public int tz(int j) { int o = __offset(14); return o != 0 ? bb.get(__vector(o) + j * 1) & 0xFF : 0; } 28 | public int tzLength() { int o = __offset(14); return o != 0 ? __vector_len(o) : 0; } 29 | public ByteBuffer tzAsByteBuffer() { return __vector_as_bytebuffer(14, 1); } 30 | 31 | public static void startDiffDateTimeVector(FlatBufferBuilder builder) { builder.startObject(6); } 32 | public static void addNaMask(FlatBufferBuilder builder, int naMaskOffset) { builder.addOffset(0, naMaskOffset, 0); } 33 | public static void addVars(FlatBufferBuilder builder, int varsOffset) { builder.addStruct(1, varsOffset, 0); } 34 | public static void addMillisInfo(FlatBufferBuilder builder, int millisInfoOffset) { builder.addStruct(2, millisInfoOffset, 0); } 35 | public static void addMillis(FlatBufferBuilder builder, int millisOffset) { builder.addOffset(3, millisOffset, 0); } 36 | public static int createMillisVector(FlatBufferBuilder builder, byte[] data) { builder.startVector(1, data.length, 1); for (int i = data.length - 1; i >= 0; i--) builder.addByte(data[i]); return builder.endVector(); } 37 | public static void startMillisVector(FlatBufferBuilder builder, int numElems) { builder.startVector(1, numElems, 1); } 38 | public static void addTzInfo(FlatBufferBuilder builder, 
int tzInfoOffset) { builder.addStruct(4, tzInfoOffset, 0); } 39 | public static void addTz(FlatBufferBuilder builder, int tzOffset) { builder.addOffset(5, tzOffset, 0); } 40 | public static int createTzVector(FlatBufferBuilder builder, byte[] data) { builder.startVector(1, data.length, 1); for (int i = data.length - 1; i >= 0; i--) builder.addByte(data[i]); return builder.endVector(); } 41 | public static void startTzVector(FlatBufferBuilder builder, int numElems) { builder.startVector(1, numElems, 1); } 42 | public static int endDiffDateTimeVector(FlatBufferBuilder builder) { 43 | int o = builder.endObject(); 44 | return o; 45 | } 46 | public static void finishDiffDateTimeVectorBuffer(FlatBufferBuilder builder, int offset) { builder.finish(offset); } 47 | }; 48 | 49 | -------------------------------------------------------------------------------- /filo-scala-jmh/src/main/scala/org.velvia.filo/EncodingBenchmark.scala: -------------------------------------------------------------------------------- 1 | package org.velvia.filo 2 | 3 | import java.sql.Timestamp 4 | import java.util.concurrent.TimeUnit 5 | import org.openjdk.jmh.annotations.Benchmark 6 | import org.openjdk.jmh.annotations.BenchmarkMode 7 | import org.openjdk.jmh.annotations.OutputTimeUnit 8 | import org.openjdk.jmh.annotations.{Mode, State, Scope} 9 | import scala.language.postfixOps 10 | import scalaxy.loops._ 11 | 12 | import org.velvia.filo.vectors._ 13 | 14 | /** 15 | * Measures the speed of encoding different types of data, 16 | * including just Filo vector encoding and encoding from RowReaders. 
17 | * 18 | * For a description of the JMH measurement modes, see 19 | * https://github.com/ktoso/sbt-jmh/blob/master/src/sbt-test/sbt-jmh/jmh-run/src/main/scala/org/openjdk/jmh/samples/JMHSample_02_BenchmarkModes.scala 20 | */ 21 | @State(Scope.Thread) 22 | class EncodingBenchmark { 23 | import BuilderEncoder.SimpleEncoding 24 | import scala.util.Random.{alphanumeric, nextInt, nextFloat} 25 | import VectorReader._ 26 | 27 | // Ok, create an IntColumn and benchmark it. 28 | val numValues = 10000 29 | 30 | val randomInts = (0 until numValues).map(i => util.Random.nextInt) 31 | val randomLongs = randomInts.map(_.toLong) 32 | 33 | // NOTE: results show that time spent is heavily influenced by ratio of unique strings... 34 | val numUniqueStrings = 500 35 | val maxStringLength = 15 36 | val minStringLength = 5 37 | val naChance = 0.05 //5% of values will be NA 38 | 39 | def randString(len: Int): String = alphanumeric.take(len).mkString 40 | 41 | val uniqueStrings = (0 until numUniqueStrings).map { i => 42 | randString(minStringLength + nextInt(maxStringLength - minStringLength)) 43 | } 44 | val randomStrings = (0 until numValues).map(i => uniqueStrings(nextInt(numUniqueStrings))) 45 | 46 | @Benchmark 47 | @BenchmarkMode(Array(Mode.Throughput)) 48 | @OutputTimeUnit(TimeUnit.SECONDS) 49 | // Measures encoding speed of strings that are often repeated 50 | def dictStringEncoding(): Unit = { 51 | VectorBuilder(randomStrings).toFiloBuffer 52 | } 53 | 54 | @Benchmark 55 | @BenchmarkMode(Array(Mode.Throughput)) 56 | @OutputTimeUnit(TimeUnit.SECONDS) 57 | def simpleStringEncoding(): Unit = { 58 | VectorBuilder(randomStrings).toFiloBuffer(SimpleEncoding) 59 | } 60 | 61 | @Benchmark 62 | @BenchmarkMode(Array(Mode.Throughput)) 63 | @OutputTimeUnit(TimeUnit.SECONDS) 64 | def intVectorEncoding(): Unit = { 65 | VectorBuilder(randomInts).toFiloBuffer 66 | } 67 | 68 | val intArray = randomInts.toArray 69 | 70 | @Benchmark 71 | @BenchmarkMode(Array(Mode.Throughput)) 72 | 
@OutputTimeUnit(TimeUnit.SECONDS) 73 | def newIntVectorEncoding(): Unit = { 74 | val cb = IntBinaryVector.appendingVector(numValues) 75 | for { i <- 0 until numValues optimized } { 76 | cb.addData(intArray(i)) 77 | } 78 | cb.optimize().toFiloBuffer 79 | } 80 | 81 | val utf8strings = randomStrings.map(ZeroCopyUTF8String.apply).toArray 82 | 83 | @Benchmark 84 | @BenchmarkMode(Array(Mode.Throughput)) 85 | @OutputTimeUnit(TimeUnit.SECONDS) 86 | def newUtf8VectorEncoding(): Unit = { 87 | val cb = UTF8Vector.appendingVector(numValues, 16 + numValues * 20) 88 | for { i <- 0 until numValues optimized } { 89 | cb.addData(utf8strings(i)) 90 | } 91 | cb.toFiloBuffer 92 | } 93 | // TODO: RowReader based vector building 94 | 95 | val utf8cb = UTF8Vector.appendingVector(numValues, 16 + numValues * 20) 96 | for { i <- 0 until numValues optimized } { 97 | utf8cb.addData(utf8strings(i)) 98 | } 99 | 100 | @Benchmark 101 | @BenchmarkMode(Array(Mode.Throughput)) 102 | @OutputTimeUnit(TimeUnit.SECONDS) 103 | def newUtf8AddVector(): Unit = { 104 | val cb = UTF8Vector.appendingVector(numValues, 16 + numValues * 20) 105 | cb.addVector(utf8cb) 106 | } 107 | 108 | @Benchmark 109 | @BenchmarkMode(Array(Mode.Throughput)) 110 | @OutputTimeUnit(TimeUnit.SECONDS) 111 | def newDictUtf8VectorEncoding(): Unit = { 112 | UTF8Vector.writeOptimizedBuffer(utf8strings, samplingRate=0.5) 113 | } 114 | } -------------------------------------------------------------------------------- /filo-scala/src/main/scala/org.velvia.filo/TypedBufferReader.scala: -------------------------------------------------------------------------------- 1 | package org.velvia.filo 2 | 3 | trait TypedBufferReader[@specialized A] { 4 | def read(i: Int): A 5 | } 6 | 7 | trait TypedReaderProvider[A] { 8 | def getReader(reader: FastBufferReader): TypedBufferReader.NBitsToReader[A] 9 | } 10 | 11 | object TypedBufferReader { 12 | type NBitsToReader[A] = PartialFunction[(Int, Boolean), TypedBufferReader[A]] 13 | 14 | def 
UnsupportedReaderPF[A]: NBitsToReader[A] = { 15 | case (nbits, signed) => throw new RuntimeException(s"Unsupported Filo vector nbits=$nbits signed=$signed") 16 | } 17 | 18 | def apply[A: TypedReaderProvider](reader: FastBufferReader, nbits: Int, signed: Boolean): 19 | TypedBufferReader[A] = 20 | (implicitly[TypedReaderProvider[A]].getReader(reader) orElse UnsupportedReaderPF)((nbits, signed)) 21 | 22 | implicit object BoolReaderProvider extends TypedReaderProvider[Boolean] { 23 | def getReader(reader: FastBufferReader): NBitsToReader[Boolean] = { 24 | case (64, _) => 25 | new TypedBufferReader[Boolean] { 26 | final def read(i: Int): Boolean = ((reader.readByte(i >> 3) >> (i & 0x07)) & 0x01) != 0 27 | } 28 | } 29 | } 30 | 31 | implicit object IntReaderProvider extends TypedReaderProvider[Int] { 32 | def getReader(reader: FastBufferReader): NBitsToReader[Int] = { 33 | case (32, _) => new TypedBufferReader[Int] { 34 | final def read(i: Int): Int = reader.readInt(i) 35 | } 36 | case (16, true) => new TypedBufferReader[Int] { 37 | final def read(i: Int): Int = reader.readShort(i).toInt 38 | } 39 | case (8, true) => new TypedBufferReader[Int] { 40 | final def read(i: Int): Int = reader.readByte(i).toInt 41 | } 42 | case (16, false) => new TypedBufferReader[Int] { 43 | final def read(i: Int): Int = (reader.readShort(i) & 0x0ffff).toInt 44 | } 45 | case (8, false) => new TypedBufferReader[Int] { 46 | final def read(i: Int): Int = (reader.readByte(i) & 0x00ff).toInt 47 | } 48 | } 49 | } 50 | 51 | implicit object LongReaderProvider extends TypedReaderProvider[Long] { 52 | def getReader(reader: FastBufferReader): NBitsToReader[Long] = { 53 | case (64, _) => new TypedBufferReader[Long] { 54 | final def read(i: Int): Long = reader.readLong(i) 55 | } 56 | case (32, true) => new TypedBufferReader[Long] { 57 | final def read(i: Int): Long = reader.readInt(i).toLong 58 | } 59 | case (16, true) => new TypedBufferReader[Long] { 60 | final def read(i: Int): Long = 
reader.readShort(i).toLong 61 | } 62 | case (8, true) => new TypedBufferReader[Long] { 63 | final def read(i: Int): Long = reader.readByte(i).toLong 64 | } 65 | case (32, false) => new TypedBufferReader[Long] { 66 | final def read(i: Int): Long = (reader.readInt(i) & 0x0ffffffffL).toLong 67 | } 68 | case (16, false) => new TypedBufferReader[Long] { 69 | final def read(i: Int): Long = (reader.readShort(i) & 0x0ffff).toLong 70 | } 71 | case (8, false) => new TypedBufferReader[Long] { 72 | final def read(i: Int): Long = (reader.readByte(i) & 0x00ff).toLong 73 | } 74 | } 75 | } 76 | 77 | implicit object DoubleReaderProvider extends TypedReaderProvider[Double] { 78 | def getReader(reader: FastBufferReader): NBitsToReader[Double] = { 79 | case (64, false) => new TypedBufferReader[Double] { 80 | final def read(i: Int): Double = reader.readDouble(i) 81 | } 82 | } 83 | } 84 | 85 | implicit object FloatReaderProvider extends TypedReaderProvider[Float] { 86 | def getReader(reader: FastBufferReader): NBitsToReader[Float] = { 87 | case (32, false) => new TypedBufferReader[Float] { 88 | final def read(i: Int): Float = reader.readFloat(i) 89 | } 90 | } 91 | } 92 | } -------------------------------------------------------------------------------- /filo-scala/src/test/scala/org.velvia.filo/RowReaderTest.scala: -------------------------------------------------------------------------------- 1 | package org.velvia.filo 2 | 3 | import org.joda.time.DateTime 4 | import java.sql.Timestamp 5 | import org.scalatest.FunSpec 6 | import org.scalatest.Matchers 7 | 8 | class RowReaderTest extends FunSpec with Matchers { 9 | val schema = Seq( 10 | VectorInfo("name", classOf[String]), 11 | VectorInfo("age", classOf[Int]), 12 | VectorInfo("timestamp", classOf[Timestamp]) 13 | ) 14 | 15 | val rows = Seq( 16 | (Some("Matthew Perry"), Some(18), Some(new Timestamp(10000L))), 17 | (Some("Michelle Pfeiffer"), None, Some(new Timestamp(10010L))), 18 | (Some("George C"), Some(59), None), 19 | (Some("Rich 
Sherman"), Some(26), Some(new Timestamp(10000L))) 20 | ) 21 | 22 | val csvRows = Seq( 23 | "Matthew Perry,18,1973-01-25T00Z", 24 | "Michelle Pfeiffer,,1970-07-08T00Z", 25 | "George C,59,", 26 | "Rich Sherman,26,1991-10-12T00Z" 27 | ).map(str => (str.split(',') :+ "").take(3)) 28 | 29 | def readValues[T](r: FastFiloRowReader, len: Int)(f: FiloRowReader => T): Seq[T] = { 30 | (0 until len).map { i => 31 | r.rowNo = i 32 | f(r) 33 | } 34 | } 35 | 36 | it("should extract from columns back to rows") { 37 | val columnData = RowToVectorBuilder.buildFromRows(rows.map(TupleRowReader).toIterator, 38 | schema, 39 | BuilderEncoder.SimpleEncoding) 40 | val chunks = Array(columnData("name"), columnData("age"), columnData("timestamp")) 41 | val types = schema.map(_.dataType) 42 | val reader = new FastFiloRowReader(chunks, types.toArray) 43 | 44 | readValues(reader, 4)(_.getString(0)) should equal ( 45 | Seq("Matthew Perry", "Michelle Pfeiffer", "George C", "Rich Sherman")) 46 | 47 | reader.rowNo = 1 48 | reader.notNull(1) should equal (false) 49 | reader.as[Timestamp](2) should equal (new Timestamp(10010L)) 50 | } 51 | 52 | it("should write to columns from ArrayStringRowReader and read back properly") { 53 | val columnData = RowToVectorBuilder.buildFromRows(csvRows.map(ArrayStringRowReader).toIterator, 54 | schema, 55 | BuilderEncoder.SimpleEncoding) 56 | val chunks = Array(columnData("name"), columnData("age"), columnData("timestamp")) 57 | val types = schema.map(_.dataType) 58 | val reader = new FastFiloRowReader(chunks, types.toArray) 59 | 60 | readValues(reader, 4)(_.getString(0)) should equal ( 61 | Seq("Matthew Perry", "Michelle Pfeiffer", "George C", "Rich Sherman")) 62 | 63 | reader.rowNo = 1 64 | reader.notNull(1) should equal (false) 65 | reader.as[Timestamp](2) should equal (new Timestamp(DateTime.parse("1970-07-08T00Z").getMillis)) 66 | } 67 | 68 | it("should read longs from timestamp strings from ArrayStringRowReader") { 69 | 
ArrayStringRowReader(csvRows.head).getLong(2) should equal (96768000000L) 70 | } 71 | 72 | import org.velvia.filo.{vectors => bv} 73 | 74 | it("should append to BinaryAppendableVector from Readers with RowReaderAppender") { 75 | val readers = rows.map(TupleRowReader) 76 | val appenders = Seq( 77 | new IntReaderAppender(bv.IntBinaryVector.appendingVector(10), 1), 78 | new LongReaderAppender(bv.LongBinaryVector.appendingVector(10), 2) 79 | ) 80 | readers.foreach { r => appenders.foreach(_.append(r)) } 81 | val bufs = appenders.map(_.appender.optimize().toFiloBuffer).toArray 82 | val reader = new FastFiloRowReader(bufs, Array(classOf[Int], classOf[Long])) 83 | 84 | readValues(reader, 4)(_.getInt(0)) should equal (Seq(18, 0, 59, 26)) 85 | reader.rowNo = 1 86 | reader.notNull(0) should equal (false) 87 | } 88 | 89 | import RowReader._ 90 | it("should compare RowReaders using TypedFieldExtractor") { 91 | val readers = rows.map(TupleRowReader) 92 | StringFieldExtractor.compare(readers(1), readers(2), 0) should be > (0) 93 | IntFieldExtractor.compare(readers(0), readers(2), 1) should be < (0) 94 | TimestampFieldExtractor.compare(readers(0), readers(3), 2) should equal (0) 95 | 96 | // Ok, we should be able to compare the reader with the NA / None too 97 | IntFieldExtractor.compare(readers(1), readers(2), 1) should be < (0) 98 | } 99 | } -------------------------------------------------------------------------------- /flatbuffers/src/main/java/com/google/flatbuffers/Table.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2014 Google Inc. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.google.flatbuffers; 18 | 19 | import static com.google.flatbuffers.Constants.*; 20 | import java.nio.ByteBuffer; 21 | import java.nio.ByteOrder; 22 | 23 | // All tables in the generated code derive from this class, and add their own accessors. 24 | public class Table { 25 | protected int bb_pos; 26 | protected ByteBuffer bb; 27 | 28 | public ByteBuffer getByteBuffer() { return bb; } 29 | 30 | // Look up a field in the vtable, return an offset into the object, or 0 if the field is not 31 | // present. 32 | protected int __offset(int vtable_offset) { 33 | int vtable = bb_pos - bb.getInt(bb_pos); 34 | return vtable_offset < bb.getShort(vtable) ? bb.getShort(vtable + vtable_offset) : 0; 35 | } 36 | 37 | // Retrieve the relative offset stored at "offset" 38 | protected int __indirect(int offset) { 39 | return offset + bb.getInt(offset); 40 | } 41 | 42 | // Create a java String from UTF-8 data stored inside the flatbuffer. 43 | // This allocates a new string and converts to wide chars upon each access, 44 | // which is not very efficient. Instead, each FlatBuffer string also comes with an 45 | // accessor based on __vector_as_bytebuffer below, which is much more efficient, 46 | // assuming your Java program can handle UTF-8 data directly. 
47 | protected String __string(int offset) { 48 | offset += bb.getInt(offset); 49 | if (bb.hasArray()) { 50 | return new String(bb.array(), bb.arrayOffset() + offset + SIZEOF_INT, bb.getInt(offset), FlatBufferBuilder.utf8charset); 51 | } else { 52 | // We can't access .array(), since the ByteBuffer is read-only, 53 | // off-heap or a memory map 54 | ByteBuffer bb = this.bb.duplicate().order(ByteOrder.LITTLE_ENDIAN); 55 | // We're forced to make an extra copy: 56 | byte[] copy = new byte[bb.getInt(offset)]; 57 | bb.position(offset + SIZEOF_INT); 58 | bb.get(copy); 59 | return new String(copy, 0, copy.length, FlatBufferBuilder.utf8charset); 60 | } 61 | } 62 | 63 | // Get the length of a vector whose offset is stored at "offset" in this object. 64 | protected int __vector_len(int offset) { 65 | offset += bb_pos; 66 | offset += bb.getInt(offset); 67 | return bb.getInt(offset); 68 | } 69 | 70 | // Get the start of data of a vector whose offset is stored at "offset" in this object. 71 | protected int __vector(int offset) { 72 | offset += bb_pos; 73 | return offset + bb.getInt(offset) + SIZEOF_INT; // data starts after the length 74 | } 75 | 76 | // Get a whole vector as a ByteBuffer. This is efficient, since it only allocates a new 77 | // bytebuffer object, but does not actually copy the data, it still refers to the same 78 | // bytes as the original ByteBuffer. 79 | // Also useful with nested FlatBuffers etc. 80 | protected ByteBuffer __vector_as_bytebuffer(int vector_offset, int elem_size) { 81 | int o = __offset(vector_offset); 82 | if (o == 0) return null; 83 | ByteBuffer bb = this.bb.duplicate().order(ByteOrder.LITTLE_ENDIAN); 84 | int vectorstart = __vector(o); 85 | bb.position(vectorstart); 86 | bb.limit(vectorstart + __vector_len(o) * elem_size); 87 | return bb; 88 | } 89 | 90 | // Initialize any Table-derived type to point to the union at the given offset. 
91 | protected Table __union(Table t, int offset) { 92 | offset += bb_pos; 93 | t.bb_pos = offset + bb.getInt(offset); 94 | t.bb = bb; 95 | return t; 96 | } 97 | 98 | protected static boolean __has_identifier(ByteBuffer bb, String ident) { 99 | if (ident.length() != FILE_IDENTIFIER_LENGTH) 100 | throw new AssertionError("FlatBuffers: file identifier must be length " + 101 | FILE_IDENTIFIER_LENGTH); 102 | for (int i = 0; i < FILE_IDENTIFIER_LENGTH; i++) { 103 | if (ident.charAt(i) != (char)bb.get(bb.position() + SIZEOF_INT + i)) return false; 104 | } 105 | return true; 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /filo-scala/src/test/scala/org.velvia.filo/DiffEncodingTest.scala: -------------------------------------------------------------------------------- 1 | package org.velvia.filo 2 | 3 | import java.nio.{ByteBuffer, ByteOrder} 4 | import java.sql.Timestamp 5 | import org.joda.time.{DateTime, DateTimeZone} 6 | 7 | import org.scalatest.FunSpec 8 | import org.scalatest.Matchers 9 | 10 | class DiffEncodingTest extends FunSpec with Matchers { 11 | import BuilderEncoder.DiffEncoding 12 | import VectorReader._ 13 | 14 | private def checkVectorType(bb: ByteBuffer, majorType: Int, minorType: Int): Unit = { 15 | val headerBytes = bb.getInt(0) 16 | WireFormat.majorVectorType(headerBytes) should equal (majorType) 17 | WireFormat.vectorSubType(headerBytes) should equal (minorType) 18 | } 19 | 20 | it("should not diff encode int and long sequences that could be simply encoded efficiently") { 21 | // sequence contains integers already within 256/2^16 etc boundaries 22 | val seq1 = Seq(0, 255) 23 | val buf1 = VectorBuilder(seq1).toFiloBuffer(DiffEncoding) 24 | checkVectorType(buf1, WireFormat.VECTORTYPE_BINSIMPLE, WireFormat.SUBTYPE_INT_NOMASK) 25 | 26 | val seq1a = Seq(0, 65432) 27 | val buf1a = VectorBuilder(seq1a).toFiloBuffer(DiffEncoding) 28 | checkVectorType(buf1, WireFormat.VECTORTYPE_BINSIMPLE, 
WireFormat.SUBTYPE_INT_NOMASK) 29 | 30 | // sequence contains longs within 2^32 but min and max far apart 31 | val seq2 = Seq(0L, 65432 * 65432L) 32 | val buf2 = VectorBuilder(seq2).toFiloBuffer(DiffEncoding) 33 | checkVectorType(buf1, WireFormat.VECTORTYPE_BINSIMPLE, WireFormat.SUBTYPE_INT_NOMASK) 34 | 35 | // sequence contains integers beyond 2^16 36 | // (such that max - min might wrap around) 37 | val seq3 = Seq(Int.MinValue, Int.MaxValue) 38 | val buf3 = VectorBuilder(seq3).toFiloBuffer(DiffEncoding) 39 | checkVectorType(buf1, WireFormat.VECTORTYPE_BINSIMPLE, WireFormat.SUBTYPE_INT_NOMASK) 40 | 41 | val seq4 = Seq(Short.MinValue.toInt, Short.MaxValue.toInt + 1) 42 | val buf4 = VectorBuilder(seq4).toFiloBuffer(DiffEncoding) 43 | checkVectorType(buf1, WireFormat.VECTORTYPE_BINSIMPLE, WireFormat.SUBTYPE_INT_NOMASK) 44 | 45 | // sequence contains longs beyond 2^32 min max 46 | val seq5 = Seq(Int.MinValue.toLong, Int.MaxValue.toLong + 1) 47 | val buf5 = VectorBuilder(seq5).toFiloBuffer(DiffEncoding) 48 | checkVectorType(buf1, WireFormat.VECTORTYPE_BINSIMPLE, WireFormat.SUBTYPE_INT_NOMASK) 49 | } 50 | 51 | it("should correctly diff encode int and long sequences that fit criteria") { 52 | val seq1 = Seq(500, 254, 257) 53 | val buf1 = VectorBuilder(seq1).toFiloBuffer(DiffEncoding) 54 | // TODO: disable parts of this test because diff encoding for IntBinaryVectors not done yet 55 | // buf1.capacity should equal (76) 56 | // checkVectorType(buf1, WireFormat.VECTORTYPE_DIFF, WireFormat.SUBTYPE_PRIMITIVE) 57 | val binarySeq1 = FiloVector[Int](buf1) 58 | 59 | binarySeq1.length should equal (seq1.length) 60 | binarySeq1.sum should equal (seq1.sum) 61 | 62 | val maxUInt = 65536L * 65536L 63 | val seq2 = Seq(maxUInt + 1, maxUInt, maxUInt + 255, maxUInt + 3) 64 | val buf2 = VectorBuilder(seq2).toFiloBuffer(DiffEncoding) 65 | buf2.capacity should equal (76) 66 | checkVectorType(buf2, WireFormat.VECTORTYPE_DIFF, WireFormat.SUBTYPE_PRIMITIVE) 67 | val binarySeq2 = 
FiloVector[Long](buf2) 68 | 69 | binarySeq2.length should equal (seq2.length) 70 | binarySeq2.sum should equal (seq2.sum) 71 | } 72 | 73 | val behindGmtZone = DateTimeZone.forOffsetHours(-5) 74 | val aheadGmtZone = DateTimeZone.forOffsetHours(4) 75 | val dt1 = new DateTime("2012-01-12T03:45Z", behindGmtZone) 76 | val ts1 = dt1.getMillis 77 | 78 | it("should correctly encode DateTime sequences in same time zone") { 79 | val seq1 = Seq(dt1, dt1.plusMillis(1), dt1.plusSeconds(2)) 80 | val buf1 = VectorBuilder(seq1).toFiloBuffer 81 | checkVectorType(buf1, WireFormat.VECTORTYPE_DIFF, WireFormat.SUBTYPE_DATETIME) 82 | val binarySeq1 = FiloVector[DateTime](buf1) 83 | 84 | binarySeq1.length should equal (seq1.length) 85 | binarySeq1.toSeq should equal (seq1) 86 | 87 | val dt2 = dt1.withZone(aheadGmtZone) 88 | val seq2 = Seq(dt2, dt2.plusMinutes(1), dt2.minusSeconds(10)) 89 | val buf2 = VectorBuilder(seq2).toFiloBuffer 90 | checkVectorType(buf2, WireFormat.VECTORTYPE_DIFF, WireFormat.SUBTYPE_DATETIME) 91 | val binarySeq2 = FiloVector[DateTime](buf2) 92 | 93 | binarySeq2.length should equal (seq2.length) 94 | binarySeq2.toSeq should equal (seq2) 95 | } 96 | 97 | it("should correctly encode DateTime sequences with mixed NAs") { 98 | val seq1 = Seq(None, None, Some(dt1), Some(dt1.plusMillis(150))) 99 | val buf1 = VectorBuilder.fromOptions(seq1).toFiloBuffer 100 | checkVectorType(buf1, WireFormat.VECTORTYPE_DIFF, WireFormat.SUBTYPE_DATETIME) 101 | val binarySeq1 = FiloVector[DateTime](buf1) 102 | 103 | binarySeq1.length should equal (seq1.length) 104 | binarySeq1.optionIterator.toSeq should equal (seq1) 105 | 106 | val seq2 = Seq(Some(dt1), None, Some(dt1.plusMillis(150)), None, None) 107 | val buf2 = VectorBuilder.fromOptions(seq2).toFiloBuffer 108 | checkVectorType(buf2, WireFormat.VECTORTYPE_DIFF, WireFormat.SUBTYPE_DATETIME) 109 | val binarySeq2 = FiloVector[DateTime](buf2) 110 | 111 | binarySeq2.length should equal (seq2.length) 112 | binarySeq2.optionIterator.toSeq should 
equal (seq2) 113 | } 114 | 115 | it("should correctly encode Timestamp sequences with mixed NAs") { 116 | val seq1 = Seq(None, Some(new Timestamp(ts1)), None, Some(new Timestamp(ts1 + 15000L))) 117 | val buf1 = VectorBuilder.fromOptions(seq1).toFiloBuffer 118 | checkVectorType(buf1, WireFormat.VECTORTYPE_DIFF, WireFormat.SUBTYPE_PRIMITIVE) 119 | val binarySeq1 = FiloVector[Timestamp](buf1) 120 | 121 | binarySeq1.length should equal (seq1.length) 122 | binarySeq1.optionIterator.toSeq should equal (seq1) 123 | } 124 | 125 | it("should correctly encode DateTime sequences with different time zones") { 126 | val dt2 = dt1.withZone(aheadGmtZone) 127 | val seq2 = Seq(dt1, dt2.plusMinutes(1), dt1.minusSeconds(5), dt2.minusSeconds(10)) 128 | val buf2 = VectorBuilder(seq2).toFiloBuffer 129 | checkVectorType(buf2, WireFormat.VECTORTYPE_DIFF, WireFormat.SUBTYPE_DATETIME) 130 | val binarySeq2 = FiloVector[DateTime](buf2) 131 | 132 | binarySeq2.length should equal (seq2.length) 133 | binarySeq2.toSeq should equal (seq2) 134 | } 135 | } 136 | -------------------------------------------------------------------------------- /filo-scala/src/main/scala/org.velvia.filo/vectors/DictUTF8Vector.scala: -------------------------------------------------------------------------------- 1 | package org.velvia.filo.vectors 2 | 3 | import java.nio.ByteBuffer 4 | import java.util.HashMap 5 | import org.velvia.filo._ 6 | import scalaxy.loops._ 7 | 8 | import org.velvia.filo.{ZeroCopyUTF8String => UTF8Str} 9 | 10 | case class DictUTF8Info(codeMap: HashMap[UTF8Str, Int], 11 | dictStrings: BinaryAppendableVector[UTF8Str], 12 | codes: BinaryAppendableVector[Int]) 13 | 14 | object DictUTF8Vector { 15 | /** 16 | * Determines if it is worth it to do dictionary encoding (which takes longer). 
Tries to use up 17 | * minimal amount of time to make this determination by sampling or going through only a portion 18 | * of the source vector, and stops building the expensive hash and dictionary if its not worth it. 19 | * This approach might not work for source vectors that are very biased but the sampling rate is 20 | * adjustable. 21 | * 22 | * @param sourceVector the source UTF8 vector. Recommended this be a UTF8PtrAppendable. 23 | * @param spaceThreshold a number between 0.0 and 1.0, the fraction of the original 24 | * space below which the DictUTF8Vector should be sized to be 25 | * worth doing dictionary encoding for. Make this >1.0 if you want to force it 26 | * @param samplingRate the fraction (0.0 <= n < 1.0) of the source vector to use to determine 27 | * if dictionary encoding will be worth it 28 | * @param maxDictSize the max number of bytes that the dictionary coukd grow to 29 | * @return Option[DictUTF8Info] contains info for building the dictionary if it is worth it 30 | */ 31 | def shouldMakeDict(sourceVector: BinaryAppendableVector[UTF8Str], 32 | spaceThreshold: Double = 0.6, 33 | samplingRate: Double = 0.3, 34 | maxDictSize: Int = 10000): Option[DictUTF8Info] = { 35 | val sourceLen = sourceVector.length 36 | val codeMap = new HashMap[UTF8Str, Int](sourceLen, 0.5F) 37 | val sampleSize = (sourceLen * samplingRate).toInt 38 | // The max size for the dict we will tolerate given the sample size and orig vector size 39 | // Above this, cardinality is not likely to be low enough for dict encoding 40 | val dictThreshold = (sampleSize * spaceThreshold).toInt 41 | val dictVect = UTF8Vector.flexibleAppending(sourceLen + 1, maxDictSize) 42 | val codeVect = IntBinaryVector.appendingVectorNoNA(sourceLen) 43 | dictVect.addNA() // first code point 0 == NA 44 | 45 | for { i <- 0 until sourceLen optimized } { 46 | val item = sourceVector(i) 47 | if (item != null && item != ZeroCopyUTF8String.NA) { 48 | val newCode = codeMap.size + 1 49 | val orig = 
codeMap.putIfAbsent(item, newCode) // Just one hashcode/compare 50 | if (orig == 0) { 51 | dictVect.addData(item) 52 | codeVect.addData(newCode) 53 | } else { 54 | codeVect.addData(orig) 55 | } 56 | } else { 57 | codeVect.addData(0) 58 | } 59 | // Now check if we are over the threshold already 60 | if (i <= sampleSize && dictVect.length > dictThreshold) return None 61 | } 62 | Some(DictUTF8Info(codeMap, dictVect, codeVect)) 63 | } 64 | 65 | /** 66 | * Creates the dictionary-encoding frozen vector from intermediate data. 67 | */ 68 | def makeVector(info: DictUTF8Info, offheap: Boolean = false): DictUTF8Vector = { 69 | // Estimate and allocate enough space for the UTF8Vector 70 | val (nbits, signed) = IntBinaryVector.minMaxToNbitsSigned(0, info.codeMap.size) 71 | val codeVectSize = IntBinaryVector.noNAsize(info.codes.length, nbits) 72 | val dictVectSize = info.dictStrings.frozenSize 73 | val bytesRequired = 8 + dictVectSize + codeVectSize 74 | val (base, off, nBytes) = BinaryVector.allocWithMagicHeader(bytesRequired, offheap) 75 | 76 | // Copy over the dictionary strings 77 | // TODO: optimize in future to FIXED UTF8 vector? 78 | info.dictStrings.freeze(Some((base, off + 8))) 79 | 80 | // Fill up the codes - directly in the allocated space for the DictUTF8Vector 81 | val codeVect = IntBinaryVector.appendingVectorNoNA(base, 82 | off + 8 + dictVectSize, 83 | codeVectSize, 84 | nbits, signed) 85 | codeVect.addVector(info.codes) 86 | 87 | // Write 8 bytes of metadata at beginning 88 | UnsafeUtils.setInt(base, off, WireFormat.SUBTYPE_UTF8) 89 | UnsafeUtils.setInt(base, off + 4, 8 + dictVectSize) 90 | 91 | new DictUTF8Vector(base, off, bytesRequired) 92 | } 93 | 94 | /** 95 | * Wraps bytes with a DictUTF8Vector so it can be read. 
96 | */ 97 | def apply(buffer: ByteBuffer): DictUTF8Vector = { 98 | val (base, off, len) = UnsafeUtils.BOLfromBuffer(buffer) 99 | new DictUTF8Vector(base, off, len) 100 | } 101 | } 102 | 103 | /** 104 | * Dictionary-encoding UTF8 string BinaryVector 105 | * Layout: 106 | * +0 Int WireFormat vector subtype of dictionary 107 | * +4 Int relative offset to integer vector for dictionary codes 108 | * +8 String dictionary, either UTF8Vector or FixedMaxUTF8Vector 109 | * +.... 110 | * 111 | * The code zero is used to mark NA. Thus the first entry of the string dictionary is also NA. 112 | * Unlike the FlatBuffer-based DictStringVector, this one does not need to cache because there is no 113 | * string deserialization to be done, thus the code is much much simpler. 114 | */ 115 | class DictUTF8Vector(val base: Any, val offset: Long, val numBytes: Int) extends BinaryVector[UTF8Str] { 116 | val vectMajorType = WireFormat.VECTORTYPE_BINDICT 117 | val vectSubType = WireFormat.SUBTYPE_UTF8 118 | val maybeNAs = true 119 | private val dictSubtype = UnsafeUtils.getInt(base, offset) 120 | private val codeVectOffset = UnsafeUtils.getInt(base, offset + 4) 121 | 122 | private final val dict = dictSubtype match { 123 | case WireFormat.SUBTYPE_UTF8 => UTF8Vector(base, offset + 8, codeVectOffset - 8) 124 | } 125 | 126 | private final val codes = IntBinaryVector(base, offset + codeVectOffset, numBytes - codeVectOffset) 127 | 128 | override final def length: Int = codes.length 129 | final def isAvailable(i: Int): Boolean = codes(i) != 0 130 | final def apply(i: Int): UTF8Str = dict(codes(i)) 131 | } -------------------------------------------------------------------------------- /filo-scala/src/main/scala/org.velvia.filo/FastBufferReader.scala: -------------------------------------------------------------------------------- 1 | package org.velvia.filo 2 | 3 | import com.kenai.jffi.MemoryIO 4 | import java.nio.ByteBuffer 5 | import sun.misc.Unsafe 6 | 7 | /** 8 | * TODO: use something like 
Agrona so the classes below will work with non-array-based ByteBuffers 9 | * Fast (machine-speed/intrinsic) readers for ByteBuffer values, assuming bytebuffers are vectors 10 | * of fixed size. 11 | */ 12 | object FastBufferReader { 13 | /** 14 | * Instantiates the correct BufferReader implementation: 15 | * - FastUnsafeArrayBufferReader is used if the ByteBuffer is backed by an array 16 | * - SlowBufferReader just uses ByteBuffer 17 | * 18 | * This allows future-proof implementations: for example for JDK9 / better Unsafe changes, and 19 | * for extending to DirectMemory allocated ByteBuffers, for example. 20 | * 21 | * @param buf the ByteBuffer containing an array of fixed values to wrap for fast access 22 | */ 23 | def apply(buf: ByteBuffer): FastBufferReader = { 24 | if (buf.hasArray) { new FastUnsafeArrayBufferReader(buf) } 25 | else { throw new RuntimeException("Cannot support this ByteBuffer") } 26 | } 27 | 28 | def apply(long: Long): FastBufferReader = new FastLongBufferReader(long) 29 | } 30 | 31 | object UnsafeUtils { 32 | val unsafe = scala.concurrent.util.Unsafe.instance 33 | val memoryIO = MemoryIO.getInstance 34 | 35 | // scalastyle:off 36 | val ZeroPointer: Any = null 37 | // scalastyle:on 38 | 39 | val arayOffset = unsafe.arrayBaseOffset(classOf[Array[Byte]]) 40 | 41 | /** Translate ByteBuffer into base, offset, numBytes */ 42 | def BOLfromBuffer(buf: ByteBuffer): (Any, Long, Int) = { 43 | if (buf.hasArray) { 44 | (buf.array, arayOffset.toLong + buf.arrayOffset + buf.position, buf.limit - buf.position) 45 | } else { 46 | throw new RuntimeException("Cannot support this ByteBuffer!") 47 | } 48 | } 49 | 50 | /** 51 | * Allocates off-heap (OS / malloc) memory. This is not direct memory, this is really off-heap. 52 | * The memory needs to be manually tracked and freed using freeOffheap. 
53 | * @param numBytes the number of bytes to allocate 54 | * @param initialize if true, zero out the memory first 55 | * @return a Long (64-bit) address or raw pointer to the memory. 56 | */ 57 | def allocOffheap(numBytes: Int, initialize: Boolean = false): Long = 58 | memoryIO.allocateMemory(numBytes, initialize) 59 | 60 | def freeOffheap(addr: Long): Unit = memoryIO.freeMemory(addr) 61 | 62 | /** 63 | * Generic methods to read and write data to any offset from a base object location. Be careful, this 64 | * can easily crash the system! 65 | */ 66 | final def getByte(obj: Any, offset: Long): Byte = unsafe.getByte(obj, offset) 67 | final def getShort(obj: Any, offset: Long): Short = unsafe.getShort(obj, offset) 68 | final def getInt(obj: Any, offset: Long): Int = unsafe.getInt(obj, offset) 69 | final def getLong(obj: Any, offset: Long): Long = unsafe.getLong(obj, offset) 70 | final def getDouble(obj: Any, offset: Long): Double = unsafe.getDouble(obj, offset) 71 | final def getFloat(obj: Any, offset: Long): Double = unsafe.getFloat(obj, offset) 72 | 73 | final def setByte(obj: Any, offset: Long, byt: Byte): Unit = unsafe.putByte(obj, offset, byt) 74 | final def setShort(obj: Any, offset: Long, s: Short): Unit = unsafe.putShort(obj, offset, s) 75 | final def setInt(obj: Any, offset: Long, i: Int): Unit = unsafe.putInt(obj, offset, i) 76 | final def setLong(obj: Any, offset: Long, l: Long): Unit = unsafe.putLong(obj, offset, l) 77 | final def setDouble(obj: Any, offset: Long, d: Double): Unit = unsafe.putDouble(obj, offset, d) 78 | final def setFloat(obj: Any, offset: Long, f: Float): Unit = unsafe.putFloat(obj, offset, f) 79 | 80 | /** 81 | * Compares two memory buffers of length numBytes, returns true if they are byte for byte equal 82 | * Compares long words for speed 83 | */ 84 | def equate(srcObj: Any, srcOffset: Long, destObj: Any, destOffset: Long, numBytes: Int): Boolean = { 85 | var i = 0 86 | while (i <= numBytes - 8) { 87 | if (getLong(srcObj, srcOffset + 
i) != getLong(destObj, destOffset + i)) return false 88 | i += 8 89 | } 90 | while (i < numBytes) { 91 | if (getByte(srcObj, srcOffset + i) != getByte(destObj, destOffset + i)) return false 92 | i += 1 93 | } 94 | true 95 | } 96 | 97 | 98 | // Comparison of two memories assuming both are word aligned and length is rounded to next word (4 bytes) 99 | // Also assumes a little-endian (eg Intel) architecture 100 | def wordCompare(srcObj: Any, srcOffset: Long, destObj: Any, destOffset: Long, n: Int): Int = { 101 | import java.lang.Integer.reverseBytes 102 | var i = 0 103 | while (i < n) { 104 | val srcWord = reverseBytes(getInt(srcObj, srcOffset + i)) ^ 0x80000000 105 | val destWord = reverseBytes(getInt(destObj, destOffset + i)) ^ 0x80000000 106 | if (srcWord < destWord) return -1 else if (srcWord != destWord) return 1 107 | i += 4 108 | } 109 | 0 110 | } 111 | } 112 | 113 | trait FastBufferReader { 114 | def readByte(i: Int): Byte 115 | def readShort(i: Int): Short 116 | def readInt(i: Int): Int 117 | def readLong(i: Int): Long 118 | def readDouble(i: Int): Double 119 | def readFloat(i: Int): Float 120 | } 121 | 122 | import UnsafeUtils._ 123 | 124 | trait FastUnsafeBufferReader extends FastBufferReader { 125 | def base: Any 126 | def bufOffset: Long 127 | 128 | final def readByte(i: Int): Byte = unsafe.getByte(base, (bufOffset + i).toLong) 129 | final def readShort(i: Int): Short = unsafe.getShort(base, (bufOffset + i * 2).toLong) 130 | final def readInt(i: Int): Int = unsafe.getInt(base, (bufOffset + i * 4).toLong) 131 | final def readLong(i: Int): Long = unsafe.getLong(base, (bufOffset + i * 8).toLong) 132 | final def readDouble(i: Int): Double = unsafe.getDouble(base, (bufOffset + i * 8).toLong) 133 | final def readFloat(i: Int): Float = unsafe.getFloat(base, (bufOffset + i * 4).toLong) 134 | } 135 | 136 | class FastUnsafeArrayBufferReader(buf: ByteBuffer) extends FastUnsafeBufferReader { 137 | val (base, bufOffset, _) = BOLfromBuffer(buf) 138 | } 139 | 140 | 
class FastLongBufferReader(long: Long) extends FastBufferReader { 141 | def readByte(i: Int): Byte = (long >> (8 * i)).toByte 142 | def readShort(i: Int): Short = (long >> (16 * i)).toShort 143 | def readInt(i: Int): Int = (long >> (32 * i)).toInt 144 | def readLong(i: Int): Long = long 145 | def readDouble(i: Int): Double = ??? 146 | def readFloat(i: Int): Float = ??? 147 | } 148 | -------------------------------------------------------------------------------- /filo-scala/src/test/scala/org.velvia.filo/vectors/DoubleVectorTest.scala: -------------------------------------------------------------------------------- 1 | package org.velvia.filo.vectors 2 | 3 | import org.scalatest.{FunSpec, Matchers} 4 | import org.velvia.filo.{FiloVector, GrowableVector, VectorTooSmall} 5 | 6 | class DoubleVectorTest extends FunSpec with Matchers { 7 | describe("DoubleMaskedAppendableVector") { 8 | it("should append a list of all NAs and read all NAs back") { 9 | val builder = DoubleVector.appendingVector(100) 10 | builder.addNA 11 | builder.isAllNA should be (true) 12 | builder.noNAs should be (false) 13 | val sc = builder.freeze() 14 | sc.length should equal (1) 15 | sc(0) // Just to make sure this does not throw an exception 16 | sc.isAvailable(0) should equal (false) 17 | sc.toList should equal (Nil) 18 | sc.optionIterator.toSeq should equal (Seq(None)) 19 | } 20 | 21 | it("should encode a mix of NAs and Doubles and decode iterate and skip NAs") { 22 | val cb = DoubleVector.appendingVector(5) 23 | cb.addNA 24 | cb.addData(101) 25 | cb.addData(102.5) 26 | cb.addData(103) 27 | cb.addNA 28 | cb.isAllNA should be (false) 29 | cb.noNAs should be (false) 30 | val sc = cb.freeze() 31 | 32 | sc.length should equal (5) 33 | sc.isAvailable(0) should equal (false) 34 | sc.isAvailable(1) should equal (true) 35 | sc.isAvailable(4) should equal (false) 36 | sc(1) should equal (101) 37 | sc.boxed(2) should equal (102.5) 38 | sc.boxed(2) shouldBe a [java.lang.Double] 39 | sc.get(0) should equal 
(None) 40 | sc.get(-1) should equal (None) 41 | sc.get(2) should equal (Some(102.5)) 42 | sc.toList should equal (List(101, 102.5, 103)) 43 | } 44 | 45 | it("should be able to append lots of Doubles and grow vector") { 46 | val numDoubles = 1000 47 | val builder = DoubleVector.appendingVector(numDoubles / 2) 48 | (0 until numDoubles).map(_.toDouble).foreach(builder.addData) 49 | builder.length should equal (numDoubles) 50 | builder.isAllNA should be (false) 51 | builder.noNAs should be (true) 52 | } 53 | 54 | it("should be able to append lots of Doubles off-heap and grow vector") { 55 | val numDoubles = 1000 56 | val builder = DoubleVector.appendingVector(numDoubles / 2, offheap=true) 57 | (0 until numDoubles).map(_.toDouble).foreach(builder.addData) 58 | builder.length should equal (numDoubles) 59 | builder.isOffheap shouldEqual true 60 | builder.isAllNA should be (false) 61 | builder.noNAs should be (true) 62 | } 63 | 64 | it("should be able to return minMax accurately with NAs") { 65 | val cb = DoubleVector.appendingVector(5) 66 | cb.addNA 67 | cb.addData(10.1) 68 | cb.addData(102) 69 | cb.addData(1.03E9) 70 | cb.addNA 71 | val inner = cb.asInstanceOf[GrowableVector[Double]].inner.asInstanceOf[MaskedDoubleAppendingVector] 72 | inner.minMax should equal ((10.1, 1.03E9)) 73 | } 74 | 75 | it("should be able to freeze() and minimize bytes used") { 76 | val builder = DoubleVector.appendingVector(100) 77 | // Test numBytes to make sure it's accurate 78 | builder.numBytes should equal (4 + 16 + 4) // 2 long words needed for 100 bits 79 | (0 to 4).map(_.toDouble).foreach(builder.addData) 80 | builder.numBytes should equal (4 + 16 + 4 + 40) 81 | val frozen = builder.freeze() 82 | frozen.numBytes should equal (4 + 8 + 4 + 40) // bitmask truncated 83 | 84 | frozen.length should equal (5) 85 | frozen.toSeq should equal (0 to 4) 86 | } 87 | 88 | it("should toFiloBuffer and read back using FiloVector.apply") { 89 | val cb = DoubleVector.appendingVector(5) 90 | cb.addNA 91 | 
cb.addData(101) 92 | cb.addData(102) 93 | cb.addData(103.7) 94 | cb.addNA 95 | val buffer = cb.optimize().toFiloBuffer 96 | val readVect = FiloVector[Double](buffer) 97 | readVect.toSeq should equal (Seq(101.0, 102.0, 103.7)) 98 | } 99 | 100 | it("should be able to optimize all integral vector to IntBinaryVector") { 101 | val builder = DoubleVector.appendingVector(100) 102 | (0 to 4).map(_.toDouble).foreach(builder.addData) 103 | val optimized = builder.optimize() 104 | optimized.length should equal (5) 105 | optimized.toSeq should equal (0 to 4) 106 | optimized(0) should equal (0.0) 107 | optimized.numBytes should equal (4 + 3) // nbits=4, so only 3 extra bytes 108 | val readVect = FiloVector[Double](optimized.toFiloBuffer) 109 | readVect.toSeq should equal (0 to 4) 110 | } 111 | 112 | it("should be able to optimize off-heap all integral vector to IntBinaryVector") { 113 | val builder = DoubleVector.appendingVector(100, offheap=true) 114 | (0 to 4).map(_.toDouble).foreach(builder.addData) 115 | val optimized = builder.optimize() 116 | optimized.length shouldEqual 5 117 | optimized.isOffheap shouldEqual true 118 | optimized.toSeq should equal (0 to 4) 119 | optimized(0) should equal (0.0) 120 | optimized.numBytes should equal (4 + 3) // nbits=4, so only 3 extra bytes 121 | val readVect = FiloVector[Double](optimized.toFiloBuffer) 122 | readVect.toSeq should equal (0 to 4) 123 | } 124 | 125 | it("should be able to optimize constant Doubles to an IntConstVector") { 126 | val builder = DoubleVector.appendingVector(100) 127 | (0 to 4).foreach(n => builder.addData(99.9)) 128 | val buf = builder.optimize().toFiloBuffer 129 | val readVect = FiloVector[Double](buf) 130 | readVect shouldBe a[DoubleConstVector] 131 | readVect.toSeq should equal (Seq(99.9, 99.9, 99.9, 99.9, 99.9)) 132 | } 133 | 134 | it("should support resetting and optimizing AppendableVector multiple times") { 135 | val cb = DoubleVector.appendingVector(5) 136 | // Use large numbers on purpose so cannot 
optimize to Doubles or const 137 | val orig = Seq(11.11E101, -2.2E-176, 1.77E88) 138 | cb.addNA() 139 | orig.foreach(cb.addData) 140 | cb.toSeq should equal (orig) 141 | val optimized = cb.optimize() 142 | assert(optimized.base != cb.base) // just compare instances 143 | val readVect1 = FiloVector[Double](optimized.toFiloBuffer) 144 | readVect1.toSeq should equal (orig) 145 | 146 | // Now the optimize should not have damaged original vector 147 | cb.toSeq should equal (orig) 148 | cb.reset() 149 | val orig2 = orig.map(_ * 2) 150 | orig2.foreach(cb.addData) 151 | val readVect2 = FiloVector[Double](cb.optimize().toFiloBuffer) 152 | readVect2.toSeq should equal (orig2) 153 | cb.toSeq should equal (orig2) 154 | } 155 | } 156 | } -------------------------------------------------------------------------------- /wire_format.md: -------------------------------------------------------------------------------- 1 | # Filo Wire Format 2 | 3 | The Filo binary vector wire format is mostly based on FlatBuffer chunks, with some extra stuff around it. 4 | 5 | ## Header bytes 6 | 7 | The first four bytes of any Filo chunk indicates the type of binary vector and structure following it. This allows for future evolution. 
8 | 9 | | Offset | Type | Description | 10 | |--------|-------|-----------------| 11 | | +0 | u32 | Vector type | 12 | 13 | The valid values of the vector type: 14 | 15 | | Value | Vector Type | Description | 16 | |-------------|----------------|----------------| 17 | | 0xnnnnnn01 | EmptyVector | Every value in vector of length nnnnnn is NA | 18 | | 0x00000002 | SimplePrimitiveVector | FlatBuffer Vector with fixed bit size per element | 19 | | 0x00000102 | SimpleStringVector | FlatBuffer Vector with variable-size string elements | 20 | | 0x00000103 | DictStringVector | FlatBuffer Vector with dictionary-encoded strings | 21 | | 0x00000004 | ConstPrimitiveVector | A SimplePrimitiveVector for representing primitives holding the same value for entire vector | 22 | | 0x00000104 | ConstStringVector | Same string vector for entire vector | 23 | | 0x00000005 | DiffPrimitiveVector | Stores base + deltas for primitives for more compact representation | 24 | | 0x00000405 | DiffDateTimeVector | Delta-encoded timestamp with TZ info for joda.time.DateTime | 25 | | 0x00000006 | Primitive bitmap-mask vector | New-style BinaryVector with fixed bit size (primitives) per element and bitmask for NA | 26 | | 0x00000206 | UTF8Vector | New-style BinaryVector with variable sized UTF8 strings or binary blobs | 27 | | 0x00000306 | FixedMaxUTF8Vector | Actually a PrimitiveVector for UTF8/blob elements with a fixed max size and a size byte | 28 | | 0x00000506 | PrimitiveAppendableVector | New-style BinaryVector with fixed bit size (primitives) per element and no NA mask | 29 | | 0x00000207 | DictUTF8Vector | New-style BinaryVector with dictionary-encoded UTF8/blobs | 30 | 31 | See `WireFormat.scala` for code definitions. 32 | 33 | ## Data vector type 34 | 35 | Many of the vectors consist of at least one fixed-size element array.
These data vectors are, for flexibility and compactness, not represented with the native FlatBuffer arrays, but rather with two elements: 36 | 37 | * info - a struct with the following elements: 38 | * nbits - a u8 representing the number of bits per element 39 | * signed - a bool, true if the vector contains signed integers 40 | * data - a u8 array with each element aligned to every nbits, stored in little endian order. 41 | - For example, if nbits was 4, and the array was [1, 2, 3, 4], then in file byte order, the bytes would be 0x21 0x43 42 | 43 | The following types and standard nbits sizes should be supported by all Filo implementations: 44 | 45 | | nbits | Types supported | 46 | |-------|--------------------| 47 | | 1 | bit vector | 48 | | 8 | byte / u8 | 49 | | 16 | short int | 50 | | 32 | int, uint, IEEE Float | 51 | | 64 | long, IEEE Double | 52 | 53 | ## EmptyVector 54 | 55 | Nothing follows the 4 header bytes since the length is already captured there. 56 | 57 | ## Simple*Vector 58 | 59 | The FlatBuffers buffer for each type follows the header bytes. See the *.fbs files for exact definition. 60 | 61 | SimplePrimitiveVector contains a data vector as described above with nbits and signed struct, with a separate len field, while SimpleStringVector simply contains a [string]. 62 | 63 | Note that when implementing custom types, if the binary blobs are all fixed size, it is probably more efficient to use SimplePrimitiveVector - assuming the blobs are less than 256 bytes long each. 64 | 65 | ## FixedMaxUTF8Vector 66 | 67 | `SimpleStringVector` is actually inefficient for strings less than about 16 chars long, or string columns where the strings are all the same size. This is because a `[string]` vector has 8 bytes of overhead compared to a primitive fixed array: a string vector is an array of int offsets to a `[byte]` vector, which is itself a 4-byte length element plus the bytes. 
68 | 69 | `UTF8Vector` is much more space efficient than `SimpleStringVector` -- it stores 20 bits for offset and 11 bits for string length, and the strings are packed together and not word-aligned -- thus when strings fit into 1MB and are less than 2^11 bytes long, the overhead is just 4 bytes per string. For fixed length strings though we can do better. 70 | 71 | The `FixedMaxUTF8Vector` uses a `PrimitiveAppendableVector` for short strings and stores them inline in the data vector itself. The first byte is a length field, followed by the UTF8 bytes. `nbits` is set to `(longest string length + 1) * 8`. Another benefit is better cache efficiency. 72 | 73 | The actual equation for determining when `FixedMaxUTF8Vector` works well is this: 74 | 75 | (maxStringLen + 1) < (4 + 4 + avgStringLen) 76 | 77 | ## SimpleConstVector 78 | 79 | This is really a SimplePrimitiveVector, in which the data vector contains one element only. This vector logically represents this single element repeated *len* times. The NA bitmask is still available. 80 | 81 | ## DictStringVector 82 | 83 | Here, each unique string is given a number, or code, starting from 1. 84 | There are two inner vectors within DictStringVector: 85 | 86 | * codes - represents the UTF8 string for each dictionary code. The code 0 is reserved for null or NA, and represented with the empty string, so the code vector looks like this: `["", "firstUnique", ...]` 87 | * data - represents the code for each string in the vector 88 | 89 | ## Diff*Vector 90 | 91 | Stores the base value as a separate single element vector, and all other elements as a vector of deltas from the base value. 92 | 93 | DiffDateTimeVector is similar, storing the millis since Epoch as the data vector with a base value (since many timestamp vectors contain values relatively close together), with an optional TZ vector if the DateTimes differ in timezone. Timezones are encoded as integer increments of 15 minute offsets from UTC (eg, UTC+1hr = 4).
94 | 95 | ## Double Encoding 96 | 97 | If all the doubles are integers and within the int32 range, then an `IntBinaryVector` is used to take advantage of more compact integer encodings, including diff, delta-delta, and less bit width encodings. 98 | 99 | ## Future Formats 100 | 101 | See the [Prometheus TSDB Video](https://promcon.io/2016-berlin/talks/the-prometheus-time-series-database/) for lots of ideas: 102 | 103 | * Integer Delta-Delta encoding (use Prometheus's clever idea of a linear slope: delta for index i = i * deltadelta, where deltadelta is the initial delta between index 0 and 1. This allows random access) 104 | * Float/Double XOR Delta encoding - encode the XOR between initial value and subsequent values 105 | * XOR + delta or delta delta? -------------------------------------------------------------------------------- /filo-scala/src/main/scala/org.velvia.filo/BuilderEncoder.scala: -------------------------------------------------------------------------------- 1 | package org.velvia.filo 2 | 3 | import java.nio.ByteBuffer 4 | import java.sql.Timestamp 5 | import org.joda.time.DateTime 6 | import scala.reflect.ClassTag 7 | import scalaxy.loops._ 8 | 9 | import org.velvia.filo.codecs._ 10 | 11 | /** 12 | * Type class for encoding a VectorBuilder to queryable binary Filo format 13 | */ 14 | trait BuilderEncoder[A] { 15 | def encodeInner(builder: VectorBuilderBase, hint: BuilderEncoder.EncodingHint): ByteBuffer 16 | def encode(builder: VectorBuilderBase, hint: BuilderEncoder.EncodingHint): ByteBuffer = { 17 | if (builder.isAllNA && builder.length <= WireFormat.MaxEmptyVectorLen) { 18 | SimpleEncoders.toEmptyVector(builder.length) 19 | } else { 20 | encodeInner(builder.asInstanceOf[VectorBuilderBase], hint) 21 | } 22 | } 23 | } 24 | 25 | trait MinMaxEncoder[A] { 26 | def minMaxZero(builder: VectorBuilderBase): (A, A, A) = { 27 | val minMaxBuilder = builder.asInstanceOf[MinMaxVectorBuilder[A]] 28 | (minMaxBuilder.min, minMaxBuilder.max, 
minMaxBuilder.zero) 29 | } 30 | } 31 | 32 | trait SingleMethodEncoder[A] extends BuilderEncoder[A] { 33 | val encodingPF: BuilderEncoder.EncodingPF 34 | val default: BuilderEncoder.EncodingPF = { 35 | case x: Any => 36 | throw new RuntimeException("Unsupported VectorBuilder") 37 | } 38 | 39 | def encodeInner(builder: VectorBuilderBase, hint: BuilderEncoder.EncodingHint): ByteBuffer = { 40 | (encodingPF orElse default)((builder, hint)) 41 | } 42 | } 43 | 44 | abstract class IntegralEncoder[A: PrimitiveDataVectBuilder] extends BuilderEncoder[A] with MinMaxEncoder[A] { 45 | val bufBuilder = implicitly[PrimitiveDataVectBuilder[A]] 46 | def encodeInner(vBuilder: VectorBuilderBase, hint: BuilderEncoder.EncodingHint): ByteBuffer = { 47 | val (min, max, zero) = minMaxZero(vBuilder) 48 | val builder = vBuilder.asInstanceOf[VectorBuilder[A]] 49 | if (min == max) { 50 | ConstEncoders.toPrimitiveVector(builder.data, builder.naMask.result, min, max) 51 | } else if ((hint == BuilderEncoder.AutoDetect || hint == BuilderEncoder.DiffEncoding) && 52 | bufBuilder.shouldBuildDeltas(min, max)) { 53 | DiffEncoders.toPrimitiveVector(builder.data, builder.naMask.result, min, max) 54 | } else { 55 | SimpleEncoders.toPrimitiveVector(builder.data, builder.naMask.result, min, max) 56 | } 57 | } 58 | } 59 | 60 | abstract class FloatDoubleEncoder[A: PrimitiveDataVectBuilder] extends 61 | BuilderEncoder[A] with MinMaxEncoder[A] { 62 | def encodeInner(vBuilder: VectorBuilderBase, hint: BuilderEncoder.EncodingHint): ByteBuffer = { 63 | val (min, max, _) = minMaxZero(vBuilder) 64 | val builder = vBuilder.asInstanceOf[VectorBuilder[A]] 65 | if (min == max) { 66 | ConstEncoders.toPrimitiveVector(builder.data, builder.naMask.result, min, max) 67 | } else { 68 | vBuilder match { 69 | case d: DoubleVectorBuilder if d.useIntVector => 70 | val intVect = vectors.IntBinaryVector.appendingVector(d.length) 71 | for { i <- 0 until d.length optimized } { 72 | if (d.naMask.contains(i)) intVect.addNA() else 
intVect.addData(d.data(i).toInt) 73 | } 74 | intVect.optimize().toFiloBuffer 75 | case o: Any => 76 | SimpleEncoders.toPrimitiveVector(builder.data, builder.naMask.result, min, max) 77 | } 78 | } 79 | } 80 | } 81 | /** 82 | * Classes to encode a Builder to a queryable binary Filo format. 83 | * Methods automatically detect the best encoding method to use, but hints are available 84 | * to pass to the methods. 85 | * 86 | * To extend the encoder for additional base types A, implement a type class BuilderEncoder[A]. 87 | */ 88 | object BuilderEncoder { 89 | sealed trait EncodingHint 90 | case object AutoDetect extends EncodingHint 91 | case object AutoDetectDispose extends EncodingHint // Dispose of old/existing vector 92 | case object SimpleEncoding extends EncodingHint 93 | case object DictionaryEncoding extends EncodingHint 94 | case object DiffEncoding extends EncodingHint 95 | final case class AutoDictString(spaceThreshold: Double = 0.6, samplingRate: Double = 0.3) extends EncodingHint 96 | 97 | type EncodingPF = PartialFunction[(VectorBuilderBase, EncodingHint), ByteBuffer] 98 | 99 | import AutoIntegralDVBuilders._ 100 | implicit object BoolEncoder extends IntegralEncoder[Boolean] 101 | implicit object IntEncoder extends IntegralEncoder[Int] 102 | implicit object LongEncoder extends IntegralEncoder[Long] 103 | 104 | import FPBuilders._ 105 | implicit object DoubleEncoder extends FloatDoubleEncoder[Double] 106 | implicit object FloatEncoder extends FloatDoubleEncoder[Float] 107 | 108 | implicit object StringEncoder extends BuilderEncoder[String] { 109 | def encodeInner(builder: VectorBuilderBase, hint: EncodingHint): ByteBuffer = { 110 | val useDictEncoding = hint match { 111 | case DictionaryEncoding => true 112 | case SimpleEncoding => false 113 | case x: Any => builder match { 114 | case sb: StringVectorBuilder => 115 | // If the string cardinality is below say half of # of elements 116 | // then definitely worth it to do dictionary encoding. 
117 | // Empty/missing elements do not count towards cardinality, so columns with 118 | // many NA values will get dict encoded, which saves space 119 | sb.stringSet.size <= (sb.data.size / 2) 120 | case x: Any => // Someone used something other than our own builder. Oh well. TODO: log 121 | false 122 | } 123 | } 124 | (useDictEncoding, builder) match { 125 | case (_, sb: StringVectorBuilder) if sb.stringSet.size == 1 => 126 | ConstEncoders.toStringVector(sb.stringSet.head, sb.data.length, sb.naMask.result) 127 | case (true, sb: StringVectorBuilder) => 128 | DictEncodingEncoders.toStringVector(sb.data, sb.naMask.result, sb.stringSet) 129 | case x: Any => 130 | val bldr = builder.asInstanceOf[VectorBuilder[String]] 131 | SimpleEncoders.toStringVector(bldr.data, bldr.naMask.result) 132 | } 133 | } 134 | } 135 | 136 | implicit object DateTimeEncoder extends SingleMethodEncoder[DateTime] { 137 | val encodingPF: EncodingPF = { 138 | case (b: DateTimeVectorBuilder, _) => 139 | DiffEncoders.toDateTimeVector(b.millisBuilder, b.tzBuilder, b.millisBuilder.naMask.result) 140 | } 141 | } 142 | 143 | implicit object SqlTimestampEncoder extends SingleMethodEncoder[Timestamp] with MinMaxEncoder[Long] { 144 | val encodingPF: EncodingPF = { 145 | case (b: SqlTimestampVectorBuilder, _) => 146 | val (min, max, zero) = minMaxZero(b.millisBuilder) 147 | DiffEncoders.toPrimitiveVector(b.millisBuilder.data, b.millisBuilder.naMask.result, min, max) 148 | } 149 | } 150 | } 151 | 152 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # filo 2 | ![filo](Filo.jpg) 3 | 4 | A thin layer of dough for baking ultra high performance, memory-efficient, minimal-deserialization, binary data vectors into your app. 5 | 6 | Think of it as the good parts of Parquet without the HDFS and file format cruft -- just the serdes and fast columnar storage. 
7 | 8 | For Scala, you get `Seq[A]` or `Traversable[A]` APIs directly on top of binary vectors, with minimal/lazy deserialization. 9 | 10 | The Scala implementation IntColumns have been clocked at 2 billion integer reads per second per thread using JMH on my laptop. 11 | 12 | ## What it is Not 13 | 14 | Filo is not a generic FlatBuffers wrapper for Scala. 15 | 16 | ## Properties 17 | 18 | * A [wire format](wire_format.md) for efficient data vectors for reading with zero or minimal/lazy deserialization 19 | - Very compact and fast string vectors using cached dictionary encoding 20 | - Numeric vectors compressed with minimal bits, differential encoding, other techniques 21 | * Random or linear access, no need to deserialize everything for random access 22 | * Support for missing / Not Available values, even for primitive vectors 23 | * Designed for long term persistence - based on Google [FlatBuffers](https://github.com/google/flatbuffers) which has schema evolution 24 | 25 | Perfect for efficiently representing your data for storing in files, mmap, NoSQL or key-value stores, etc. etc. 26 | 27 | ## Current Status 28 | 29 | Wire format is stable; header bytes enable future expansion into even non-FlatBuffer based binary formats. 
30 | 31 | ## Filo-Scala 32 | 33 | Get it here: 34 | 35 | resolvers += "Velvia Bintray" at "https://dl.bintray.com/velvia/maven" 36 | 37 | libraryDependencies += "org.velvia.filo" %% "filo-scala" % "0.2.2" 38 | 39 | Using a `VectorBuilder` to progressively build a column: 40 | 41 | ```scala 42 | scala> import org.velvia.filo._ 43 | import org.velvia.filo._ 44 | 45 | scala> val cb = new IntVectorBuilder 46 | cb: org.velvia.filo.IntVectorBuilder = org.velvia.filo.IntVectorBuilder@48cbb760 47 | 48 | scala> cb.addNA 49 | 50 | scala> cb.addData(101) 51 | 52 | scala> cb.addData(102) 53 | 54 | scala> cb.addData(103) 55 | 56 | scala> cb.addNA 57 | ``` 58 | 59 | Encoding it to a Filo binary `ByteBuffer`: 60 | 61 | ```scala 62 | scala> cb.toFiloBuffer 63 | res5: java.nio.ByteBuffer = java.nio.HeapByteBuffer[pos=0 lim=84 cap=84] 64 | ``` 65 | 66 | The `toFiloBuffer` method takes an optional encoding hint. By default, `VectorBuilder`s will automatically detect the most space efficient encoding method. 67 | 68 | Parsing and iterating through the ByteBuffer as a collection: 69 | 70 | ```scala 71 | scala> import VectorReader._ 72 | import VectorReader._ 73 | 74 | scala> FiloVector[Int](res5).foreach(println) 75 | 101 76 | 102 77 | 103 78 | ``` 79 | 80 | All `FiloVectors` implement `scala.collection.Traversable` for transforming 81 | and iterating over the non-missing elements of a Filo binary vector. There are 82 | also methods for accessing and iterating over all elements. 83 | 84 | ### Converting rows to Filo binary vectors 85 | 86 | Filo is designed to enable efficient conversion and composition between rows having heterogeneous types and Filo vectors. 87 | 88 | Please see `RowToVectorBuilder` and the `RowToVectorBuilderTest` for an example. 89 | There is a convenience function to convert a whole bunch of rows at once. 90 | 91 | Also see `FiloRowReader` for extracting rows out of a bunch of heterogeneous Filo vectors. 
Both this and the `RowToVectorBuilder` work with `RowReader`s, to facilitate composing rows to and from Filo vectors. 92 | 93 | ### Support for Seq[A] and Seq[Option[A]] 94 | 95 | You can also encode a `Seq[A]` to a buffer easily: 96 | 97 | ```scala 98 | scala> import org.velvia.filo._ 99 | import org.velvia.filo._ 100 | 101 | scala> val orig = Seq(1, 2, -5, 101) 102 | orig: Seq[Int] = List(1, 2, -5, 101) 103 | 104 | scala> val buf = VectorBuilder(orig).toFiloBuffer 105 | buf: java.nio.ByteBuffer = java.nio.HeapByteBuffer[pos=0 lim=76 cap=76] 106 | 107 | scala> val binarySeq = FiloVector[Int](buf) 108 | binarySeq: org.velvia.filo.FiloVector[Int] = VectorReader(1, 2, -5, 101) 109 | 110 | scala> binarySeq.sum == orig.sum 111 | res10: Boolean = true 112 | ``` 113 | 114 | Note that even though a `FiloVector` implements `Traversable`, it only 115 | traverses over defined elements that are not NA. To work with collections of 116 | potentially missing elements, start with a `Seq[Option[A]]`, then use 117 | `VectorBuilder.fromOptions`. You can extract out an 118 | `Iterator[Option[A]]` with the `optionIterator` method. 119 | 120 | ### Performance Benchmarking 121 | 122 | To just get overall run times: 123 | 124 | sbt filoScalaJmh/run -i 10 -wi 10 -f5 125 | 126 | To also get profiling of top methods: 127 | 128 | sbt filoScalaJmh/run -i 10 -wi 10 -f5 -prof stack -jvmArgsAppend -Djmh.stack.lines=3 129 | 130 | For help, do `sbt filoScalaJmh/run -h`. 131 | 132 | See this [gist](https://gist.github.com/velvia/213b837c6e02c4982a9a) for how I improved the `FiloVector.apply()` method performance by 50x. 133 | 134 | ## Contributions 135 | 136 | Contributions are very welcome. You might need to install FlatBuffers if you change the FBB schema: 137 | 138 | 1. Clone the Google Flatbuffers [repo](https://github.com/google/flatbuffers). 139 | 1. Install cmake - on OSX: `brew install cmake` 140 | 1. `cmake -G "Unix Makefiles"` 141 | 2.
Run `make` at the root of the flatbuffers dir 142 | 3. Put the `flatc` compiler binary in your path 143 | 4. In SBT, run the command `compileJavaSchema` 144 | 145 | ## Future directions 146 | 147 | Cross-platform support - Go, C/C++, etc. 148 | 149 | ### Additional Encodings 150 | 151 | Still random access: 152 | * A much more compact encoding for sparse values 153 | * Combo delta + pack into float for double vector compression 154 | * Use [JavaEWAH](https://github.com/lemire/javaewah) `ImmutableBitSet` for efficient compressed bit vectors / NA masks 155 | * Encode a set or a hash, perhaps using Murmur3 hash for keys with an open hash design 156 | * Encode other data structures in [Open Data Structures](http://opendatastructures.org/)... a BTree would be fun 157 | 158 | No longer zero serialization: 159 | * Use the super fast byte packing algorithm from Cap'n Proto for much smaller wire representation 160 | * [Jsmaz](https://github.com/RyanAD/jsmaz) and [Shoco](http://ed-von-schleck.github.io/shoco/) for small string compression 161 | * [JavaFastPFor](https://github.com/lemire/JavaFastPFOR) for integer array compression 162 | 163 | ### General Compression 164 | 165 | My feeling is that we don't need general compression algorithms like LZ4, 166 | Snappy, etc. (An interesting new one is 167 | [Z-STD](http://fastcompression.blogspot.fr/2015/01/zstd-stronger-compression-algorithm.html?m=1)). 168 | The whole goal of this project is to be able to read from 169 | disk or database with minimal or no deserialization / decompression step. Many 170 | databases, such as Cassandra, already default to some kind of on-disk 171 | compression as well.
172 | -------------------------------------------------------------------------------- /filo-scala/src/main/scala/org.velvia.filo/codecs/Utils.scala: -------------------------------------------------------------------------------- 1 | package org.velvia.filo.codecs 2 | 3 | import com.google.flatbuffers.{FlatBufferBuilder, Table} 4 | import java.nio.{ByteBuffer, ByteOrder} 5 | import scala.collection.mutable.BitSet 6 | 7 | import org.velvia.filo.vector._ 8 | import org.velvia.filo.WireFormat 9 | 10 | /** 11 | * Common utilities for creating FlatBuffers, including mask and data vector building 12 | */ 13 | object Utils { 14 | // default initial size of bytebuffer to allocate. flatbufferbuilder will expand the buffer if needed. 15 | // don't make this too big, because one buffer needs to be allocated per column, and writing many columns 16 | // at once will use up a ton of memory otherwise. 17 | val BufferSize = 64 * 1024 18 | val SizeOfInt = 4 19 | 20 | // Returns true if every element in the data to be encoded is marked as NA (mask is set) 21 | def isAllNA(mask: BitSet, dataLength: Int): Boolean = mask.size == dataLength 22 | 23 | // Ensures that the resulting array(long) has at least as many bits as numBits 24 | def makeBitMask(mask: BitSet, numBits: Int): Array[Long] = { 25 | val initialBits = mask.toBitMask 26 | val numLongs = roundUp(numBits, 64) / 64 27 | initialBits ++ Array.fill(numLongs - initialBits.size)(0L) 28 | } 29 | 30 | // @return offset of mask table 31 | def populateNaMask(fbb: FlatBufferBuilder, mask: BitSet, dataLen: Int): Int = { 32 | val empty = mask.size == 0 33 | val full = isAllNA(mask, dataLen) 34 | var bitMaskOffset = 0 35 | 36 | // Simple bit mask, 1 bit per row 37 | // One option is to use JavaEWAH compressed bitmaps, requires no deserialization now 38 | // RoaringBitmap is really cool, but very space inefficient when you have less than 4096 integers 39 | // it's much better when you have 100000 or more rows 40 | // NOTE: we cannot nest 
structure creation, so have to create bitmask vector first :( 41 | if (!empty && !full) bitMaskOffset = NaMask.createBitMaskVector(fbb, makeBitMask(mask, dataLen)) 42 | 43 | NaMask.startNaMask(fbb) 44 | NaMask.addMaskType(fbb, if (full) { MaskType.AllOnes } 45 | else if (empty) { MaskType.AllZeroes } 46 | else { MaskType.SimpleBitMask }) 47 | 48 | if (!empty && !full) NaMask.addBitMask(fbb, bitMaskOffset) 49 | NaMask.endNaMask(fbb) 50 | } 51 | 52 | private final def roundUp(n: Int, align: Int): Int = ((n + (align - 1)) / align) * align 53 | 54 | /** 55 | * Sets up and closes the FlatBuffer [ubyte] vector inside of many Filo vectors, figuring 56 | * out proper translation to byte vector length and alignment. 57 | * NOTE: We don't really use FlatBuffer's individual element read methods, so I suppose the 58 | * length in the FBB vector doesn't matter, but it's much better to be consistent to avoid bugs 59 | * @param nbits the # of bits per element 60 | * @param numElems the number of nbits length elements 61 | * @param alignment the byte alignment, eg 1 = byte aligned, 4 = int aligned 62 | * This is basically what chunk size is going to fill up the FBB. 63 | * Should be a power of two, I think. 
64 | * @param addFunc a func to populate the elements in FBB, in reverse order 65 | * @return (offset, nbits) 66 | */ 67 | def makeByteVector(fbb: FlatBufferBuilder, nbits: Int, numElems: Int, alignment: Int) 68 | (addFunc: FlatBufferBuilder => Unit): (Int, Int) = { 69 | fbb.startVector(1, roundUp(nbits * numElems, alignment * 8) / 8, alignment) 70 | addFunc(fbb) 71 | (fbb.endVector(), nbits) 72 | } 73 | 74 | // Builds the FB [ubyte] vector for data of different types 75 | // They are all fed a reverse iterator of items 76 | def byteVect(fbb: FlatBufferBuilder, len: Int, reverseElems: Iterator[Byte]): (Int, Int) = 77 | makeByteVector(fbb, 8, len, 1) { fbb => 78 | while (reverseElems.hasNext) { fbb.putByte(reverseElems.next) } 79 | } 80 | 81 | def shortVect(fbb: FlatBufferBuilder, len: Int, reverseElems: Iterator[Short]): (Int, Int) = 82 | makeByteVector(fbb, 16, len, 2) { fbb => 83 | while (reverseElems.hasNext) { fbb.putShort(reverseElems.next) } 84 | } 85 | 86 | def intVect(fbb: FlatBufferBuilder, len: Int, reverseElems: Iterator[Int]): (Int, Int) = 87 | makeByteVector(fbb, 32, len, 4) { fbb => 88 | while (reverseElems.hasNext) { fbb.putInt(reverseElems.next) } 89 | } 90 | 91 | def longVect(fbb: FlatBufferBuilder, len: Int, reverseElems: Iterator[Long]): (Int, Int) = 92 | makeByteVector(fbb, 64, len, 8) { fbb => 93 | while (reverseElems.hasNext) { fbb.putLong(reverseElems.next) } 94 | } 95 | 96 | def doubleVect(fbb: FlatBufferBuilder, len: Int, reverseElems: Iterator[Double]): (Int, Int) = 97 | makeByteVector(fbb, 64, len, 8) { fbb => 98 | while (reverseElems.hasNext) { fbb.putDouble(reverseElems.next) } 99 | } 100 | 101 | def floatVect(fbb: FlatBufferBuilder, len: Int, reverseElems: Iterator[Float]): (Int, Int) = 102 | makeByteVector(fbb, 32, len, 4) { fbb => 103 | while (reverseElems.hasNext) { fbb.putFloat(reverseElems.next) } 104 | } 105 | 106 | // stringVect is fed a Seq of strings in normal order 107 | // Only the offset is returned, nbits is not useful for 
def stringVect(fbb: FlatBufferBuilder, data: Seq[String]): Int = {
  // Vector creation cannot be nested in FBB, so every string is serialized up front.
  // Strings are created last-to-first because FBB fills its buffer from the top down;
  // a null entry is recorded as offset 0.
  val offsetsLastFirst: List[Int] =
    data.reverseIterator.map { s => if (s == null) 0 else fbb.createString(s) }.toList
  // Only now open the [string] vector (4-byte offsets, 4-byte aligned) and add the offsets.
  fbb.startVector(4, data.length, 4)
  offsetsLastFirst.foreach { strOffset => fbb.addOffset(strOffset) }
  fbb.endVector()
}
trait ThreadLocalBuffers {
  val bb = new ThreadLocal[ByteBuffer]

  // Returns the calling thread's buffer, allocating one of Utils.BufferSize bytes and
  // storing it in the thread-local the first time that thread asks for it.
  def getBuffer: ByteBuffer = bb.get match {
    case null =>
      val fresh = ByteBuffer.allocate(Utils.BufferSize)
      bb.set(fresh)
      fresh
    case existing =>
      existing
  }
}
// Generates a sequence of optional ints whose values lie in a randomly chosen [minVal, maxVal]
// window; the window changes per sample so that different int compression widths get exercised.
def boundedIntList: Gen[Seq[Option[Int]]] =
  Gen.oneOf(Int.MinValue, -65536, -32768, -256, -128, 0).flatMap { minVal =>
    Gen.oneOf(15, 127, 255, 32767, Int.MaxValue).flatMap { maxVal =>
      val boundedArb = Arbitrary(Gen.choose(minVal, maxVal))
      Gen.containerOf[Seq, Option[Int]](noneOrThing[Int](boundedArb))
    }
  }
| s.foreach(intVect.add) 101 | val binarySeq = FiloVector[Int](intVect.optimize().toFiloBuffer) 102 | binarySeq.length should equal (s.length) 103 | val elements = binarySeq.optionIterator.toSeq 104 | elements should equal (s) 105 | } 106 | } 107 | 108 | it("should match elements and length for offheap BinaryIntVectors with missing/NA elements") { 109 | import vectors.IntBinaryVector 110 | forAll(boundedIntList) { s => 111 | val intVect = IntBinaryVector.appendingVector(1000, offheap=true) 112 | s.foreach(intVect.add) 113 | val binarySeq = FiloVector[Int](intVect.optimize().toFiloBuffer) 114 | binarySeq.length should equal (s.length) 115 | val elements = binarySeq.optionIterator.toSeq 116 | elements should equal (s) 117 | } 118 | } 119 | 120 | it("should match elements and length for simple string vectors with missing/NA elements") { 121 | forAll(optionList[String]) { s => 122 | val buf = VectorBuilder.fromOptions(s).toFiloBuffer(SimpleEncoding) 123 | val binarySeq = FiloVector[String](buf) 124 | 125 | binarySeq.length should equal (s.length) 126 | val elements = binarySeq.optionIterator.toSeq 127 | elements should equal (s) 128 | } 129 | } 130 | 131 | ignore("should match elements and length for UTF8Vectors with missing/NA elements") { 132 | forAll(optionList[ZeroCopyUTF8String]) { s => 133 | val utf8vect = vectors.UTF8Vector.appendingVector(500) 134 | s.foreach(utf8vect.add) 135 | val buf = utf8vect.optimize().toFiloBuffer 136 | val binarySeq = FiloVector[ZeroCopyUTF8String](buf) 137 | binarySeq.length should equal (s.length) 138 | val elements = binarySeq.optionIterator.toSeq 139 | elements should equal (s) 140 | } 141 | } 142 | 143 | it("should match elements and length for dictionary string vectors with missing/NA elements") { 144 | forAll(optionList[String]) { s => 145 | val buf = VectorBuilder.fromOptions(s).toFiloBuffer(DictionaryEncoding) 146 | val binarySeq = FiloVector[String](buf) 147 | 148 | binarySeq.length should equal (s.length) 149 | val elements = 
binarySeq.optionIterator.toSeq 150 | elements should equal (s) 151 | } 152 | } 153 | 154 | it("should match elements and length for Double vectors with missing/NA elements") { 155 | forAll(optionList[Double]) { s => 156 | val buf = VectorBuilder.fromOptions(s).toFiloBuffer 157 | val binarySeq = FiloVector[Double](buf) 158 | 159 | binarySeq.length should equal (s.length) 160 | val elements = binarySeq.optionIterator.toSeq 161 | elements should equal (s) 162 | } 163 | } 164 | 165 | // Right now empty NA strings might get returned as available. Debug later. 166 | ignore("should match elements and length for DictUTF8Vectors with missing/NA elements") { 167 | forAll(optionList[ZeroCopyUTF8String]) { s => 168 | val utf8strs = s.map(_.getOrElse(ZeroCopyUTF8String.NA)) 169 | val buf = vectors.UTF8Vector(utf8strs).optimize(AutoDictString(spaceThreshold=0.8)).toFiloBuffer 170 | val binarySeq = FiloVector[ZeroCopyUTF8String](buf) 171 | binarySeq.length should equal (s.length) 172 | val elements = binarySeq.optionIterator.toSeq 173 | elements should equal (s) 174 | } 175 | } 176 | } -------------------------------------------------------------------------------- /filo-scala/src/main/scala/org.velvia.filo/codecs/PrimitiveDataVectBuilder.scala: -------------------------------------------------------------------------------- 1 | package org.velvia.filo.codecs 2 | 3 | import com.google.flatbuffers.FlatBufferBuilder 4 | import scala.collection.mutable.BitSet 5 | import scala.language.postfixOps 6 | import scalaxy.loops._ 7 | 8 | /** 9 | * A trait to build the smallest space fitting data vector possible given numeric 10 | * inputs. Generally designed for Integer types; would also work for BigInteger etc. 11 | */ 12 | trait PrimitiveDataVectBuilder[A] { 13 | /** 14 | * Populates the FlatBuffers binary data vector based on the sequence of elements and predetermined 15 | * mins and maxes. May determine the smallest representation automatically. 
implicit object BoolDataVectBuilder extends PrimitiveDataVectBuilder[Boolean] {
  /**
   * Packs the booleans into a bit mask and writes the mask's Long words as the data vector.
   * Bit i of the BitSet is set when data(i) is true. min and max are not used.
   * @return ((offset, nbits), signed) where signed is always false
   */
  def build(fbb: FlatBufferBuilder, data: Seq[Boolean], min: Boolean, max: Boolean):
      ((Int, Int), Boolean) = {
    // TODO: handle case where all booleans are true or false
    val bitset = new BitSet
    // Walk the sequence once with an iterator: data(i) is O(i) on linear Seqs such as List,
    // which made the previous index-based loop quadratic for those inputs.
    val elems = data.iterator
    var index = 0
    while (elems.hasNext) {
      if (elems.next()) bitset += index
      index += 1
    }
    val mask = makeBitMask(bitset, data.length)
    (longVect(fbb, mask.size, mask.reverseIterator), false)
  }
}
override def shouldBuildDeltas(min: Int, max: Int): Boolean = {
  // Delta encoding pays off when the spread (max - min) fits a narrower width than the raw
  // values themselves would need.
  val spread = max - min
  if (spread <= 0) {
    false
  } else if (min >= 0) {
    val byteSpreadOnly = spread < 256 && max >= 256
    val shortSpreadOnly = spread < 65536 && max >= 65536
    byteSpreadOnly || shortSpreadOnly
  } else {
    val outsideByte = min < Byte.MinValue || max > Byte.MaxValue
    val outsideShort = min < Short.MinValue || max > Short.MaxValue
    (spread < 256 && outsideByte) || (spread < 65536 && outsideShort)
  }
}
def build(fbb: FlatBufferBuilder, data: Seq[Long], min: Long, max: Long): ((Int, Int), Boolean) = {
  // Range predicates over the precomputed min/max of the data.
  def within(lo: Long, hi: Long): Boolean = min >= lo && max <= hi
  def unsignedBelow(limit: Long): Boolean = min >= 0L && max < limit

  val count = data.size
  val reversed = data.reverseIterator
  // Try the narrowest width first, signed before unsigned at each width; the Boolean in the
  // result is the signed flag.
  if (within(Byte.MinValue, Byte.MaxValue)) {
    (byteVect(fbb, count, reversed.map(_.toByte)), true)
  } else if (unsignedBelow(256L)) {
    (byteVect(fbb, count, reversed.map(_.toByte)), false)
  } else if (within(Short.MinValue, Short.MaxValue)) {
    (shortVect(fbb, count, reversed.map(_.toShort)), true)
  } else if (unsignedBelow(65536L)) {
    (shortVect(fbb, count, reversed.map(_.toShort)), false)
  } else if (within(Int.MinValue, Int.MaxValue)) {
    (intVect(fbb, count, reversed.map(_.toInt)), true)
  } else if (unsignedBelow(maxUInt)) {
    (intVect(fbb, count, reversed.map(_.toInt)), false)
  } else {
    (longVect(fbb, count, reversed), false)
  }
}
implicit object FloatDataVectBuilder extends PrimitiveDataVectBuilder[Float] {
  // Floats are written as-is through floatVect; min and max are ignored and the signed
  // flag of the result is always false.
  def build(fbb: FlatBufferBuilder, data: Seq[Float], min: Float, max: Float): ((Int, Int), Boolean) = {
    val offsetAndBits = floatVect(fbb, data.size, data.reverseIterator)
    (offsetAndBits, false)
  }
}
def appendingVector(maxElements: Int,
                    initValue: Long,
                    slope: Int,
                    nbits: Short,
                    signed: Boolean,
                    offheap: Boolean = false): DeltaDeltaAppendingVector = {
  // 12 bytes are reserved ahead of the packed deltas (the appending vector writes an
  // 8-byte initial value and a 4-byte slope there).
  val headerBytes = 12
  val totalBytes = headerBytes + IntBinaryVector.noNAsize(maxElements, nbits)
  BinaryVector.allocWithMagicHeader(totalBytes, offheap) match {
    case (memBase, memOffset, allocatedBytes) =>
      new DeltaDeltaAppendingVector(memBase, memOffset, allocatedBytes, initValue, slope, nbits, signed)
  }
}
def fromLongVector(inputVect: BinaryAppendableVector[Long],
                   min: Long, max: Long,
                   samples: Int = 4): Option[DeltaDeltaAppendingVector] = {
  val numElems = inputVect.length
  // Guard both integer divisions on this path: `samples + 1` below and `numElements - 1`
  // inside getSlope. A negative sample count or fewer than two elements cannot describe a
  // slope, so such inputs are reported as not eligible instead of throwing ArithmeticException.
  if (samples < 0 || numElems < 2) {
    None
  } else {
    // Distance between sampled indices; 0 means the vector is too short for this many samples.
    val indexDelta = numElems / (samples + 1)
    if (inputVect.noNAs && indexDelta > 0) {
      val diffs = (0 until samples).map { n => inputVect((n + 1) * indexDelta) - inputVect(n * indexDelta) }
      // Good: all diffs positive, min=first elem, max=last elem
      if (min == inputVect(0) && max == inputVect(numElems - 1) && diffs.forall(_ > 0)) {
        for { slope <- getSlope(min, max, numElems)
              deltaVect = appendingVector(numElems, min, slope, 32, true,
                                          inputVect.isOffheap)
              // Any failure while copying the values (e.g. a delta that does not fit)
              // makes the input ineligible rather than propagating the exception.
              appended <- Try(deltaVect.addVector(inputVect)).toOption
        } yield { deltaVect }
      // TODO(velvia): Maybe add less stringent case of slope fitting, flat or negative slope good too.
      } else { None }
    } else { None }
  }
}
/**
 * Read-only view over a delta-delta encoded vector of Longs.
 * Layout, as read below: an 8-byte initial value at `offset`, a 4-byte slope at `offset + 8`,
 * then an inner Int vector of per-element deltas starting at `offset + 12`.
 * Element i decodes to initValue + slope * i + inner(i). NAs are not supported.
 */
final case class DeltaDeltaVector(base: Any, offset: Long, numBytes: Int) extends BinaryVector[Long] {
  val vectMajorType = WireFormat.VECTORTYPE_DELTA2
  val vectSubType = WireFormat.SUBTYPE_INT_NOMASK
  val maybeNAs = false
  private final val initValue = UnsafeUtils.getLong(base, offset)
  private final val slope = UnsafeUtils.getInt(base, offset + 8)
  private final val inner = IntBinaryVector(base, offset + 12, numBytes - 12)

  override val length: Int = inner.length
  final def isAvailable(index: Int): Boolean = true
  // slope is widened to Long BEFORE multiplying: Int * Int wraps around once slope * index
  // exceeds Int.MaxValue (e.g. slope 100000 at index 30000), which silently corrupted values.
  final def apply(index: Int): Long = initValue + slope.toLong * index + inner(index)
}
/**
 * Appendable delta-delta vector of Longs. On construction it writes the 8-byte initial value
 * at `offset` and the 4-byte slope at `offset + 8`; the per-element deltas from the expected
 * line (initValue + slope * index) go into an inner Int vector starting at `offset + 12`.
 * The smallest and largest delta seen are tracked so optimize() can shrink the bit width.
 * NAs are not supported.
 */
class DeltaDeltaAppendingVector(val base: Any,
                                val offset: Long,
                                val maxBytes: Int,
                                initValue: Long,
                                slope: Int,
                                nbits: Short,
                                signed: Boolean) extends BinaryAppendableVector[Long] {
  val isAllNA = false
  val noNAs = true
  val maybeNAs = false
  val vectMajorType = WireFormat.VECTORTYPE_DELTA2
  val vectSubType = WireFormat.SUBTYPE_INT_NOMASK

  private val deltas = IntBinaryVector.appendingVectorNoNA(base, offset + 12, maxBytes - 12, nbits, signed)
  // Value the next appended element would have if it sat exactly on the line
  private var expected = initValue
  private var innerMin: Int = Int.MaxValue
  private var innerMax: Int = Int.MinValue

  UnsafeUtils.setLong(base, offset, initValue)
  UnsafeUtils.setInt(base, offset + 8, slope)

  override def length: Int = deltas.length
  final def isAvailable(index: Int): Boolean = true
  // slope is widened to Long before multiplying; the Int * Int product overflowed for
  // large slope * index and returned wrong values.
  final def apply(index: Int): Long = initValue + slope.toLong * index + deltas(index)
  final def numBytes: Int = 12 + deltas.numBytes

  final def addNA(): Unit = ???   // NAs are not supported for delta delta for now

  /**
   * Appends a value, stored as its difference from the expected point on the line.
   * If the difference does not fit in an Int, the vector is disposed and DeltaTooLarge is thrown.
   */
  final def addData(data: Long): Unit = {
    val innerValue = data - expected
    if (innerValue <= Int.MaxValue && innerValue >= Int.MinValue) { deltas.addData(innerValue.toInt) }
    else {
      dispose()
      throw DeltaTooLarge(data, expected)
    }
    innerMin = Math.min(innerMin, innerValue.toInt)
    innerMax = Math.max(innerMax, innerValue.toInt)
    expected += slope
  }

  // Clears the appended deltas and the min/max tracking; initial value and slope are kept.
  def reset(): Unit = {
    expected = initValue
    innerMin = Int.MaxValue
    innerMax = Int.MinValue
    deltas.reset()
  }

  // Copies already-computed deltas from another inner vector, then re-derives `expected`.
  def addInnerVectors(other: IntAppendingVector): Unit = {
    for { i <- 0 until other.length optimized } {
      val orig = other(i)
      deltas.addData(orig)
      innerMin = Math.min(innerMin, orig)
      innerMax = Math.max(innerMax, orig)
    }
    // Long arithmetic: length * slope as Int * Int overflowed for long vectors / steep slopes.
    expected = initValue + length.toLong * slope
  }

  def finishCompaction(newBase: Any, newOff: Long): BinaryVector[Long] =
    DeltaDeltaVector(newBase, newOff, numBytes)

  /**
   * Re-packs the deltas into fewer bits when the observed min/max allow it; otherwise
   * freezes this vector as is. The original is disposed only for the AutoDetectDispose hint.
   */
  override def optimize(hint: EncodingHint = AutoDetect): BinaryVector[Long] = {
    // Just optimize nbits.
    val (newNbits, newSigned) = IntBinaryVector.minMaxToNbitsSigned(innerMin, innerMax)
    if (newNbits < nbits) {
      val newVect = DeltaDeltaVector.appendingVector(deltas.length, initValue, slope, newNbits, newSigned,
                                                     offheap=this.isOffheap)
      newVect.addInnerVectors(deltas)
      if (hint == AutoDetectDispose) dispose()
      newVect.freeze(copy = false)   // already writing new vector
    } else {
      freeze()
    }
  }
}
def apply[A](buf: ByteBuffer, emptyLen: Int = 0)(implicit cm: VectorReader[A]): FiloVector[A] = {
  if (buf == null) {
    // No buffer at all: stand in with an all-empty vector of the requested length.
    new EmptyFiloVector[A](emptyLen)
  } else {
    val origPos = buf.position
    try {
      buf.order(ByteOrder.LITTLE_ENDIAN)
      // Reading the 4-byte header advances the buffer position.
      val headerBytes = buf.getInt()
      majorVectorType(headerBytes) match {
        case VECTORTYPE_EMPTY =>
          new EmptyFiloVector[A](emptyVectorLen(headerBytes))
        case _ =>
          cm.makeVector(buf, headerBytes)
      }
    } finally {
      // Restore the caller's position even when decoding throws, so a failed parse does not
      // leave a shared buffer pointing past its header.
      buf.position(origPos)
    }
  }
}
/**
 * Builds one FiloVector per raw chunk, pairing chunks(i) with classes(i) and delegating
 * each pair to make().
 * @param chunks      the raw ByteBuffers, one per vector
 * @param classes     the element class of each chunk; must have the same size as chunks
 * @param emptyLen    passed through to make() for every chunk
 * @param vectorMaker passed through to make() for every chunk
 */
def makeVectors(chunks: Array[ByteBuffer],
                classes: Array[Class[_]],
                emptyLen: Int = 0,
                vectorMaker: VectorMaker = defaultVectorMaker): Array[FiloVector[_]] = {
  require(chunks.size == classes.size, "chunks must be same length as classes")
  val numChunks = chunks.size
  val vectors = new Array[FiloVector[_]](numChunks)
  var pos = 0
  while (pos < numChunks) {
    vectors(pos) = make(chunks(pos), classes(pos), emptyLen, vectorMaker)
    pos += 1
  }
  vectors
}
/**
 * Object-typed variant of apply(): the element at `index` as Any (so primitives come back
 * boxed), or null when isAvailable(index) is false.
 * Intended for APIs such as Spark that need boxed output; expect it to be slow.
 */
def boxed(index: Int): Any = {
  if (!isAvailable(index)) {
    null
  } else {
    val element: Any = apply(index)
    element
  }
}
/**
 * NaMaskReader backed by the bit mask stored in a NaMask: an element is available when its
 * bit is 0. The mask is read as 64-bit words, 64 elements per word.
 */
class RegularMaskReader(naMask: NaMask) extends NaMaskReader {
  val maskLen = naMask.bitMaskLength()
  private val maskReader = FastBufferReader(naMask.bitMaskAsByteBuffer)

  final def isAvailable(index: Int): Boolean = {
    val wordIndex = index >> 6
    // The stored mask may be shorter than (length / 64) words; indexes past its end are
    // treated as having a zero bit, i.e. available.
    if (wordIndex >= maskLen) {
      true
    } else {
      val bit = 1L << (index & 63)
      (maskReader.readLong(wordIndex) & bit) == 0
    }
  }
}
/**
 * Essentially like a (void *) pointer to an untyped binary blob which supports very basic operations.
 * Allows us to do zero-copy comparisons and other ops.
 * Intended for small blobs like UTF8-encoded strings.
 * The API is immutable, but the underlying bytes could change - that is not recommended; note that
 * cachedHash64 is computed once and would keep returning the hash of the old contents.
 */
trait ZeroCopyBinary extends Ordered[ZeroCopyBinary] {
  import ZeroCopyBinary._

  /** The object passed, together with `offset`, to every UnsafeUtils access (e.g. a byte array) */
  def base: Any
  /** Position of byte 0 of this blob, relative to `base` */
  def offset: Long
  /** Number of bytes in this blob */
  def numBytes: Int

  def length: Int = numBytes

  /**
   * Compares byte by byte starting with byte 0.
   * The common prefix is compared in 4-byte-aligned chunks via UnsafeUtils.wordCompare, the
   * remaining 0-3 bytes individually; if all shared bytes are equal the shorter blob sorts first.
   */
  def compare(other: ZeroCopyBinary): Int = {
    val minLen = Math.min(numBytes, other.numBytes)
    // TODO: compare 4 bytes at a time, or even 8
    val minLenAligned = minLen & -4   // round down to a multiple of 4
    val wordComp = UnsafeUtils.wordCompare(base, offset, other.base, other.offset, minLenAligned)
    if (wordComp == 0) {
      for { i <- minLenAligned until minLen optimized } {
        val res = getByte(i) - other.getByte(i)
        if (res != 0) return res
      }
      return numBytes - other.numBytes
    } else wordComp
  }

  final def getByte(byteNum: Int): Byte = UnsafeUtils.getByte(base, offset + byteNum)

  /**
   * Copies `n` bytes, starting `delta` bytes into this blob, to `dest` at `destOffset`.
   */
  final def copyTo(dest: Any, destOffset: Long, delta: Int = 0, n: Int = numBytes): Unit =
    UnsafeUtils.unsafe.copyMemory(base, offset + delta, dest, destOffset, n)

  /** Always allocates a fresh array holding exactly the numBytes bytes of this blob. */
  final def asNewByteArray: Array[Byte] = {
    val newArray = new Array[Byte](numBytes)
    copyTo(newArray, UnsafeUtils.arayOffset)
    newArray
  }

  /**
   * Returns an array of bytes.  If this ZeroCopyBinary is already a byte array
   * with exactly numBytes bytes, then just return that, to avoid another copy.
   * Otherwise, call asNewByteArray to return a copy.
   */
  def bytes: Array[Byte] = base match {
    // The length check matters: a blob may be a prefix of a larger array (offset at the array
    // base but numBytes < array length), in which case the backing array must NOT be returned.
    case a: Array[Byte] if offset == UnsafeUtils.arayOffset && a.length == numBytes => a
    case _ => asNewByteArray
  }

  /** Two blobs are equal when they have the same length and the same bytes. */
  override def equals(other: Any): Boolean = other match {
    case z: ZeroCopyBinary =>
      (numBytes == z.numBytes) && UnsafeUtils.equate(base, offset, z.base, z.offset, numBytes)
    // Wildcard so that anything else, including null, simply compares unequal
    case _ =>
      false
  }

  // -1L doubles as the "not computed yet" marker
  private var hash64: Long = -1L

  // Ideally, hash without copying to another byte array, esp if the base storage is a byte array already
  def cachedHash64: Long = {
    if (hash64 == -1L) {
      val hash = base match {
        case a: Array[Byte] => hasher64.hash(a, offset.toInt - UnsafeUtils.arayOffset, numBytes, Seed)
        // Wildcard so that a null base also takes the copy-then-hash path
        case _              => hasher64.hash(asNewByteArray, 0, numBytes, Seed)
      }
      hash64 = hash
    }
    hash64
  }

  /** Folds the 64-bit hash into 32 bits by XORing its two halves. */
  override def hashCode: Int = cachedHash64.toInt ^ (cachedHash64 >> 32).toInt
}
numBytesForFirstByte(getByte(i)) 114 | } 115 | len 116 | } 117 | 118 | /** 119 | * Returns a substring of this. The returned string does not have bytes copied; simply different 120 | * pointers to the same area of memory. This is possible because ZCB's are immutable. 121 | * @param start the position of first code point 122 | * @param until the position after last code point, exclusive. 123 | */ 124 | final def substring(start: Int, until: Int): ZeroCopyUTF8String = { 125 | if (until <= start || start >= numBytes) { 126 | empty 127 | } else { 128 | var i = 0 129 | var c = 0 130 | while (i < numBytes && c < start) { 131 | i += numBytesForFirstByte(getByte(i)) 132 | c += 1 133 | } 134 | 135 | val j = i 136 | while (i < numBytes && c < until) { 137 | i += numBytesForFirstByte(getByte(i)) 138 | c += 1 139 | } 140 | 141 | if (i > j) new ZeroCopyUTF8String(base, offset + j, i - j) else empty 142 | } 143 | } 144 | 145 | override def equals(other: Any): Boolean = other match { 146 | case u: UTF8Wrapper => super.equals(u.utf8) 147 | case o: Any => super.equals(o) 148 | case UnsafeUtils.ZeroPointer => false 149 | } 150 | 151 | private def matchAt(s: ZeroCopyUTF8String, pos: Int): Boolean = 152 | if (s.numBytes + pos > numBytes || pos < 0) { false } 153 | else { UnsafeUtils.equate(base, offset + pos, s.base, s.offset, s.numBytes) } 154 | 155 | final def startsWith(prefix: ZeroCopyUTF8String): Boolean = matchAt(prefix, 0) 156 | 157 | final def endsWith(suffix: ZeroCopyUTF8String): Boolean = matchAt(suffix, numBytes - suffix.numBytes) 158 | 159 | final def contains(substring: ZeroCopyUTF8String): Boolean = 160 | if (substring.numBytes == 0) { true } 161 | else { 162 | val firstByte = substring.getByte(0) 163 | for { i <- 0 to (numBytes - substring.numBytes) optimized } { 164 | if (getByte(i) == firstByte && matchAt(substring, i)) return true 165 | } 166 | false 167 | } 168 | } 169 | 170 | @SerialVersionUID(1012L) 171 | case class UTF8Wrapper(var utf8: ZeroCopyUTF8String) 
/**
 * Factory methods, constants and implicits for ZeroCopyUTF8String.
 */
object ZeroCopyUTF8String {
  /** Wraps the whole byte array; no copy is made. */
  def apply(bytes: Array[Byte]): ZeroCopyUTF8String =
    new ZeroCopyUTF8String(bytes, UnsafeUtils.arayOffset, bytes.size)

  /**
   * Wraps `len` bytes of the array starting at index `offset`; no copy is made.
   * Fails with IllegalArgumentException if the requested range is not inside the array.
   */
  def apply(bytes: Array[Byte], offset: Int, len: Int): ZeroCopyUTF8String = {
    // The bytes are later read through UnsafeUtils, so an invalid range must be rejected here
    require(offset >= 0 && len >= 0, s"offset ($offset) and len ($len) must be non-negative")
    // Sum as Long so that a huge offset + len cannot wrap around and slip past the check
    require(offset.toLong + len <= bytes.size, s"offset + len ($offset + $len) exceeds size ${bytes.size}")
    new ZeroCopyUTF8String(bytes, UnsafeUtils.arayOffset + offset, len)
  }

  /** Encodes the string as UTF-8 and wraps the resulting bytes. */
  def apply(str: String): ZeroCopyUTF8String = apply(str.getBytes("UTF-8"))

  val empty = ZeroCopyUTF8String("")

  // The official ZeroCopyUTF8String instance designated to equal NA / not available
  val NA = empty

  // NA-ness is decided by the base object alone (the same base that `empty` uses)
  final def isNA(utf8: ZeroCopyUTF8String): Boolean = utf8.base == NA.base

  // Sequence length by lead byte, indexed by (lead byte - 192).
  // Covers lead bytes 192..253 only: 32 twos, 16 threes, 8 fours, 4 fives, 2 sixes.
  val bytesOfCodePointInUTF8 = Array(
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    4, 4, 4, 4, 4, 4, 4, 4,
    5, 5, 5, 5,
    6, 6)

  /**
   * Returns the number of bytes for a code point with the first byte as `b`.
   * Bytes that have no entry in the table (below 192, or 254 and 255) count as a single byte,
   * so callers always advance and never index outside the table.
   */
  def numBytesForFirstByte(b: Byte): Int = {
    val offset = (b & 0xFF) - 192
    if (offset >= 0 && offset < bytesOfCodePointInUTF8.length) bytesOfCodePointInUTF8(offset) else 1
  }

  /** Adds `.utf8` to String for converting to a ZeroCopyUTF8String */
  implicit class StringToUTF8(str: String) {
    def utf8: ZeroCopyUTF8String = ZeroCopyUTF8String(str)
  }

  /** Ordering that delegates to ZeroCopyBinary's byte-wise compare */
  implicit object ZeroCopyUTF8BinaryOrdering extends Ordering[ZeroCopyUTF8String] {
    def compare(a: ZeroCopyUTF8String, b: ZeroCopyUTF8String): Int = a.compare(b)
  }
}
/**
 * VectorReader is a type class to help create FiloVector's from the raw Filo binary byte buffers --
 * mostly parsing the header bytes and ensuring the creation of the right FiloVector parsing class.
 * This object holds one implicit reader instance per supported element type.
 *
 * NOTE: I KNOW there is LOTS of repetition here, but apply() method is the inner loop and must be
 * super fast.  Traits would slow it WAY down.  Instead maybe we can use macros.
 */
object VectorReader {
  import WireFormat._
  import TypedBufferReader._

  implicit object BoolVectorReader extends PrimitiveVectorReader[Boolean]

  implicit object IntVectorReader extends PrimitiveVectorReader[Int] {
    // Diff-encoded values are a single base value plus a per-element delta
    override def makeDiffVector(dpv: DiffPrimitiveVector): FiloVector[Int] =
      new DiffPrimitiveWrapper[Int, Int](dpv) {
        val base = baseReader.readInt(0)
        final def apply(i: Int): Int = base + dataReader.read(i)
      }

    // Binary (non-FlatBuffers) vector formats, tried before the FlatBuffers ones
    override val otherMaker: PartialFunction[(Int, Int, ByteBuffer), FiloVector[Int]] = {
      case (VECTORTYPE_BINSIMPLE, SUBTYPE_INT, buf)        => bv.IntBinaryVector.masked(buf)
      case (VECTORTYPE_BINSIMPLE, SUBTYPE_INT_NOMASK, buf) => bv.IntBinaryVector(buf)
      case (VECTORTYPE_BINSIMPLE, SUBTYPE_REPEATED, buf)   => bv.IntBinaryVector.const(buf)
    }
  }

  implicit object LongVectorReader extends PrimitiveVectorReader[Long] {
    override def makeDiffVector(dpv: DiffPrimitiveVector): FiloVector[Long] =
      new DiffPrimitiveWrapper[Long, Long](dpv) {
        val base = baseReader.readLong(0)
        final def apply(i: Int): Long = base + dataReader.read(i)
      }

    override val otherMaker: PartialFunction[(Int, Int, ByteBuffer), FiloVector[Long]] = {
      case (VECTORTYPE_BINSIMPLE, SUBTYPE_INT, buf)              => bv.LongBinaryVector.fromMaskedIntBuf(buf)
      case (VECTORTYPE_BINSIMPLE, SUBTYPE_INT_NOMASK, buf)       => bv.LongBinaryVector.fromIntBuf(buf)
      case (VECTORTYPE_BINSIMPLE, SUBTYPE_REPEATED, buf)         => bv.LongBinaryVector.const(buf)
      case (VECTORTYPE_DELTA2, SUBTYPE_INT_NOMASK, buf)          => bv.DeltaDeltaVector(buf)
      case (VECTORTYPE_BINSIMPLE, SUBTYPE_PRIMITIVE, buf)        => bv.LongBinaryVector.masked(buf)
      case (VECTORTYPE_BINSIMPLE, SUBTYPE_PRIMITIVE_NOMASK, buf) => bv.LongBinaryVector(buf)
    }
  }

  implicit object DoubleVectorReader extends PrimitiveVectorReader[Double] {
    override val otherMaker: PartialFunction[(Int, Int, ByteBuffer), FiloVector[Double]] = {
      case (VECTORTYPE_BINSIMPLE, SUBTYPE_INT, buf)              => bv.DoubleVector.fromMaskedIntBuf(buf)
      case (VECTORTYPE_BINSIMPLE, SUBTYPE_INT_NOMASK, buf)       => bv.DoubleVector.fromIntBuf(buf)
      case (VECTORTYPE_BINSIMPLE, SUBTYPE_REPEATED, buf)         => bv.DoubleVector.const(buf)
      case (VECTORTYPE_BINSIMPLE, SUBTYPE_PRIMITIVE, buf)        => bv.DoubleVector.masked(buf)
      case (VECTORTYPE_BINSIMPLE, SUBTYPE_PRIMITIVE_NOMASK, buf) => bv.DoubleVector(buf)
    }
  }

  implicit object FloatVectorReader extends PrimitiveVectorReader[Float]

  implicit object StringVectorReader extends VectorReader[String] {
    def makeVector(buf: ByteBuffer, headerBytes: Int): FiloVector[String] = {
      val vectType = majorVectorType(headerBytes)
      val subType = vectorSubType(headerBytes)
      (vectType, subType) match {
        case (VECTORTYPE_SIMPLE, SUBTYPE_STRING) =>
          new SimpleStringWrapper(SimpleStringVector.getRootAsSimpleStringVector(buf))

        case (VECTORTYPE_CONST, SUBTYPE_STRING) =>
          new ConstStringWrapper(ConstStringVector.getRootAsConstStringVector(buf))

        case (VECTORTYPE_DICT, SUBTYPE_STRING) =>
          val dsv = DictStringVector.getRootAsDictStringVector(buf)
          new DictStringWrapper(dsv) {
            val intReader = TypedBufferReader[Int](reader, dsv.info.nbits, dsv.info.signed)
            final def getCode(i: Int): Int = intReader.read(i)
          }

        case _ =>
          throw UnsupportedFiloType(vectType, subType)
      }
    }
  }

  implicit object UTF8VectorReader extends VectorReader[ZeroCopyUTF8String] {
    def makeVector(buf: ByteBuffer, headerBytes: Int): FiloVector[ZeroCopyUTF8String] = {
      val vectType = majorVectorType(headerBytes)
      val subType = vectorSubType(headerBytes)
      (vectType, subType) match {
        case (VECTORTYPE_BINSIMPLE, SUBTYPE_UTF8)         => bv.UTF8Vector(buf)
        case (VECTORTYPE_BINSIMPLE, SUBTYPE_FIXEDMAXUTF8) => bv.UTF8Vector.fixedMax(buf)
        case (VECTORTYPE_BINDICT, SUBTYPE_UTF8)           => bv.DictUTF8Vector(buf)
        case (VECTORTYPE_BINSIMPLE, SUBTYPE_REPEATED)     => bv.UTF8Vector.const(buf)
        case _                                            => throw UnsupportedFiloType(vectType, subType)
      }
    }
  }

  implicit object DateTimeVectorReader extends VectorReader[DateTime] {
    def makeVector(buf: ByteBuffer, headerBytes: Int): FiloVector[DateTime] = {
      val vectType = majorVectorType(headerBytes)
      val subType = vectorSubType(headerBytes)
      (vectType, subType) match {
        case (VECTORTYPE_DIFF, SUBTYPE_DATETIME) =>
          val ddtv = DiffDateTimeVector.getRootAsDiffDateTimeVector(buf)
          // A vector without any time zone entries gets the wrapper without TZ handling
          if (ddtv.tzLength == 0) new DiffDateTimeWrapper(ddtv)
          else new DiffDateTimeWithTZWrapper(ddtv)

        case _ =>
          throw UnsupportedFiloType(vectType, subType)
      }
    }
  }

  implicit object TimestampVectorReader extends VectorReader[Timestamp] {
    def makeVector(buf: ByteBuffer, headerBytes: Int): FiloVector[Timestamp] = {
      val vectType = majorVectorType(headerBytes)
      val subType = vectorSubType(headerBytes)
      (vectType, subType) match {
        case (VECTORTYPE_DIFF, SUBTYPE_PRIMITIVE) =>
          val dpv = DiffPrimitiveVector.getRootAsDiffPrimitiveVector(buf)
          new DiffPrimitiveWrapper[Long, Timestamp](dpv) {
            val base = baseReader.readLong(0)
            final def apply(i: Int): Timestamp = new Timestamp(base + dataReader.read(i))
          }

        case _ =>
          throw UnsupportedFiloType(vectType, subType)
      }
    }
  }
}
/**
 * VectorReader for primitive element types.  The vector type and subtype from the header select
 * a maker: first `otherMaker` (empty here, overridable), then the FlatBuffers-based formats,
 * and finally a fallback that throws UnsupportedFiloType.
 */
// NOTE: we MUST @specialize here so that the apply method below will not create boxing
class PrimitiveVectorReader[@specialized A: TypedReaderProvider] extends VectorReader[A] {
  import VectorReader._
  import WireFormat._

  def makeVector(buf: ByteBuffer, headerBytes: Int): FiloVector[A] = {
    val key = (majorVectorType(headerBytes), vectorSubType(headerBytes), buf)
    vectMaker(key)
  }

  val fbbPrimitiveMaker: PartialFunction[(Int, Int, ByteBuffer), FiloVector[A]] = {
    case (VECTORTYPE_SIMPLE, SUBTYPE_PRIMITIVE, buf) =>
      val simpleVect = SimplePrimitiveVector.getRootAsSimplePrimitiveVector(buf)
      new SimplePrimitiveWrapper[A](simpleVect) {
        val typedReader = TypedBufferReader[A](reader, simpleVect.info.nbits, simpleVect.info.signed)
        final def apply(i: Int): A = typedReader.read(i)
      }

    case (VECTORTYPE_CONST, SUBTYPE_PRIMITIVE, buf) =>
      val constVect = SimplePrimitiveVector.getRootAsSimplePrimitiveVector(buf)
      new SimplePrimitiveWrapper[A](constVect) {
        val typedReader = TypedBufferReader[A](reader, constVect.info.nbits, constVect.info.signed)
        // Every index answers with element 0
        final def apply(i: Int): A = typedReader.read(0)
      }

    case (VECTORTYPE_DIFF, SUBTYPE_PRIMITIVE, buf) =>
      makeDiffVector(DiffPrimitiveVector.getRootAsDiffPrimitiveVector(buf))
  }

  // Matches everything, so it must stay last in the chain
  val defaultMaker: PartialFunction[(Int, Int, ByteBuffer), FiloVector[A]] = {
    case (vectType, subType, _) => throw UnsupportedFiloType(vectType, subType)
  }

  // Hook for additional formats; defined nowhere unless overridden
  def otherMaker: PartialFunction[(Int, Int, ByteBuffer), FiloVector[A]] = PartialFunction.empty

  // lazy, so the chain is only assembled on first use
  lazy val vectMaker: PartialFunction[(Int, Int, ByteBuffer), FiloVector[A]] =
    otherMaker orElse fbbPrimitiveMaker orElse defaultMaker

  // Not implemented here; readers that support diff-encoded vectors override this
  def makeDiffVector(dpv: DiffPrimitiveVector): FiloVector[A] = ???
}
/**
 * A VectorBuilder which additionally tracks the smallest and largest value passed to addData.
 * NA elements (added via addNA) do not take part in the tracking.
 * @param minValue the smallest possible value of A; `max` starts out (and is reset to) this
 * @param maxValue the largest possible value of A; `min` starts out (and is reset to) this
 * @param zero the value stored in the data for NA elements
 */
sealed abstract class MinMaxVectorBuilder[A](minValue: A,
                                             maxValue: A,
                                             val zero: A)
                                            (implicit val ordering: Ordering[A],
                                             implicit val extractor: TypedFieldExtractor[A],
                                             implicit val builder: BuilderEncoder[A])
extends VectorBuilder(zero) {
  // Start at the opposite extremes so that the first value added replaces both
  var min: A = maxValue
  var max: A = minValue

  override def addData(value: A): Unit = {
    super.addData(value)
    if (ordering.compare(value, max) > 0) max = value
    if (ordering.compare(value, min) < 0) min = value
  }

  /**
   * Resets the data and NA mask and also the min/max tracking, so that values from a previous
   * column cannot leak into the min/max of the next one built with this builder.
   */
  override def reset(): Unit = {
    super.reset()
    min = maxValue
    max = minValue
  }
}
116 | * @param builderMap the map of classes to Builders. Being able to pass one in 117 | * allows for customization. 118 | */ 119 | def apply(dataType: Class[_], 120 | builderMap: BuilderMap = defaultBuilderMap): VectorBuilderBase = 121 | builderMap(dataType)() 122 | 123 | // Please add your builder here when you add a type 124 | val defaultBuilderMap = Map[Class[_], () => VectorBuilderBase]( 125 | Classes.Boolean -> (() => new BoolVectorBuilder), 126 | Classes.Int -> 127 | (() => new vectors.IntVectorBuilder(vectors.IntBinaryVector.appendingVector(2000))), 128 | Classes.Long -> (() => new LongVectorBuilder), 129 | Classes.Double -> (() => new DoubleVectorBuilder), 130 | Classes.Float -> (() => new FloatVectorBuilder), 131 | Classes.String -> (() => new StringVectorBuilder), 132 | Classes.DateTime -> (() => new DateTimeVectorBuilder), 133 | Classes.SqlTimestamp -> (() => new SqlTimestampVectorBuilder), 134 | Classes.UTF8 -> 135 | (() => new vectors.UTF8VectorBuilder(vectors.UTF8Vector.appendingVector(1000))) 136 | ) 137 | 138 | import BuilderEncoder._ 139 | 140 | val FifteenMinMillis = 15 * org.joda.time.DateTimeConstants.MILLIS_PER_MINUTE 141 | 142 | /** 143 | * Builds a VectorBuilder automatically from a scala collection. 144 | * All values will be marked available. 145 | */ 146 | def apply[A: ClassTag: BuilderEncoder](seq: collection.Seq[A]): VectorBuilderBase = { 147 | val builder = apply(implicitly[ClassTag[A]].runtimeClass).asInstanceOf[VectorBuilderBase { type T = A}] 148 | seq.foreach(builder.addData) 149 | builder 150 | } 151 | 152 | /** 153 | * Encodes a sequence of type Option[A] to a Filo format ByteBuffer. 154 | * Elements which are None will get encoded as NA bits. 
/**
 * Builder for Double vectors.  Besides min and max it counts how many of the added values are
 * whole numbers, so that useIntVector can tell whether every element would fit into an Int.
 */
class DoubleVectorBuilder extends MinMaxVectorBuilder(Double.MinValue, Double.MaxValue, DefaultDouble) {
  // Number of values passed to addData that have no fractional part
  var numInts = 0

  override def addData(value: Double): Unit = {
    super.addData(value)
    if (Math.rint(value) == value) numInts += 1
  }

  /**
   * Clears all state that useIntVector depends on.  Without this a count left over from a
   * previous column could equal the new length and wrongly report non-integral data as ints.
   */
  override def reset(): Unit = {
    super.reset()
    numInts = 0
    // Set explicitly here as well, so that useIntVector never sees a stale range
    min = Double.MaxValue
    max = Double.MinValue
  }

  /** True if every element counted as a whole number and min/max lie within the Int range */
  def useIntVector: Boolean =
    (numInts == length) && min >= Int.MinValue.toDouble && max <= Int.MaxValue.toDouble
}
/**
 * Builder for DateTime vectors.  Each value is split over two builders that are always appended
 * to together: the millis go into the inner Long builder, the zone offset (in units of
 * 15 minutes) into a separate Int builder.
 */
class DateTimeVectorBuilder extends NestedVectorBuilder[DateTime, Long](new LongVectorBuilder) {
  val millisBuilder = innerBuilder.asInstanceOf[LongVectorBuilder]
  val tzBuilder = new IntVectorBuilder

  def addData(value: DateTime): Unit = {
    millisBuilder.addData(value.getMillis)
    // NOTE(review): the offset is looked up at instant 0, not at the value's own instant --
    // confirm this is intended for zones whose offset changes over time
    val zoneOffsetMillis = value.getZone.getOffset(0)
    tzBuilder.addData(zoneOffsetMillis / VectorBuilder.FifteenMinMillis)
  }

  // An NA occupies a slot in both builders
  override def addNA(): Unit = {
    millisBuilder.addNA()
    tzBuilder.addNA()
  }

  override def reset(): Unit = {
    millisBuilder.reset()
    tzBuilder.reset()
  }
}
-------------------------------------------------------------------------------- /scalastyle-config.xml: -------------------------------------------------------------------------------- 1 | 2 | Scalastyle main configuration 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | ;\r?\n 16 | 17 | No semicolon at end of line 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 1 75 | ^\"\"$ 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | ((lazy)\s+(override|private|protected|final|implicit))|((implicit)\s+(override|private|protected|final))|((final)\s+(override|private|protected))|((private|protected)\s+(override)) 140 | 141 | Modifiers should be declared in the following order: "(override) (private|protected) (abstract|final|sealed) (implicit) (lazy)". 142 | 143 | 144 | --------------------------------------------------------------------------------