├── GolombEncode.hs ├── GolombQuery.hs ├── GolombQuery.js ├── HashSequence.hs ├── README.md ├── base-64-encode.hs ├── golomb-decode.hs ├── golomb-encode.hs ├── golomb-query.hs ├── make-hash-sequences.hs ├── make-hash-sequences.rb ├── make-index.hs ├── mk-gcs.sh └── pedagogicalGolombCode.hs /GolombEncode.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | module GolombEncode where 3 | import Data.Word 4 | import Data.Bits 5 | 6 | linewiseDiff :: [Integer] -> [Int] 7 | linewiseDiff ints = zipWith (\ a b -> fromIntegral (a-b) ) ints (0:ints) 8 | 9 | fastGolombEncode :: Int -> [Int] -> [Word8] 10 | fastGolombEncode !binaryBits !ints = fGEUnary 7 0 (head ints) binaryBits (tail ints) 11 | 12 | fGEUnary :: Int -> Word8 -> Int -> Int -> [Int] -> [Word8] 13 | fGEUnary (-1) !currentWord !currentInt !binaryBits !ints = currentWord : (fGEUnary 7 0 currentInt binaryBits ints) 14 | fGEUnary !bitIndex !currentWord !currentInt !binaryBits !ints | currentInt < (2 ^ binaryBits) = fGEBinary (bitIndex-1) currentWord currentInt (binaryBits-1) binaryBits ints 15 | | otherwise = fGEUnary (bitIndex-1) (currentWord `setBit` bitIndex) (currentInt - (2 ^ binaryBits)) binaryBits ints 16 | 17 | fGEBinary :: Int -> Word8 -> Int -> Int -> Int -> [Int] -> [Word8] 18 | fGEBinary (-1) !currentWord !currentInt !intBitIndex !binaryBits !ints = currentWord : (fGEBinary 7 0 currentInt intBitIndex binaryBits ints) 19 | fGEBinary !bitIndex !currentWord !currentInt (-1) !binaryBits !ints = if null ints then [currentWord] else (fGEUnary bitIndex currentWord (head ints) binaryBits (tail ints)) 20 | fGEBinary !bitIndex !currentWord !currentInt !intBitIndex !binaryBits !ints = fGEBinary (bitIndex-1) (if (currentInt `testBit` intBitIndex) then (currentWord `setBit` bitIndex) else currentWord) currentInt (intBitIndex-1) binaryBits ints -------------------------------------------------------------------------------- /GolombQuery.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | module GolombQuery where 3 | import HashSequence 4 | import Data.Word 5 | import qualified Data.ByteString.Lazy.Char8 as B 6 | import qualified Data.ByteString.Lazy as BW 7 | import Data.List.Ordered (isect) 8 | import Data.List (sort) 9 | import Data.Bits (testBit) 10 | 11 | linewiseSum :: [Int] -> [Integer] 12 | linewiseSum = scanl ((. fromIntegral) . (+)) 0 13 | 14 | golombFilterQueries :: Integer -> Integer -> Word8 -> BW.ByteString -> [B.ByteString] -> [B.ByteString] 15 | golombFilterQueries lineCount modulus binaryBits golombCodedSequence queries = filteredQueries 16 | where sortedHashes = sort [h | (q, h) <- queriesHashes] 17 | queriesHashes = [(q, hashMod modulus q) | q <- queries] 18 | filteredHashes = isect unencodedSequence sortedHashes 19 | filteredQueries = [q | (q,h) <- queriesHashes, h `elem` filteredHashes] 20 | unencodedSequence = linewiseSum (fastGolombDecode lineCount binaryBits golombCodedSequence) 21 | 22 | fastGolombDecode :: Integer -> Word8 -> BW.ByteString -> [Int] 23 | fastGolombDecode !lineCount !binaryBits !bytes = fGDUnary (-1) 0 0 lineCount binaryBits bytes 24 | 25 | fGDUnary :: Int -> Word8 -> Int -> Integer -> Word8 -> BW.ByteString -> [Int] 26 | fGDUnary !bitIndex !currentWord !currentIntAccum 0 !binaryBytes !bytes = [] 27 | fGDUnary (-1) !currentWord !currentIntAccum !intsRemaining !binaryBytes !bytes = fGDUnary 7 (BW.head bytes) currentIntAccum intsRemaining binaryBytes (BW.tail bytes) 28 | fGDUnary !bitIndex !currentWord !currentIntAccum !intsRemaining !binaryBytes !bytes = if currentWord `testBit` bitIndex 29 | then fGDUnary (bitIndex-1) currentWord (currentIntAccum+1) intsRemaining binaryBytes bytes 30 | else fGDBinary (bitIndex-1) currentWord binaryBytes currentIntAccum intsRemaining binaryBytes bytes 31 | 32 | fGDBinary :: Int -> Word8 -> Word8 -> Int -> Integer -> Word8 -> BW.ByteString -> [Int] 33 | fGDBinary !bitIndex !currentWord 0 !currentIntAccum !intsRemaining !binaryBytes !bytes = currentIntAccum : (fGDUnary bitIndex currentWord 0 (intsRemaining-1) binaryBytes bytes) 34 | fGDBinary (-1) !currentWord !intBitsRemaining !currentIntAccum !intsRemaining !binaryBytes !bytes = fGDBinary 7 (BW.head bytes) intBitsRemaining currentIntAccum intsRemaining binaryBytes (BW.tail bytes) 35 | fGDBinary !bitIndex !currentWord !intBitsRemaining !currentIntAccum !intsRemaining !binaryBytes !bytes = fGDBinary (bitIndex-1) currentWord (intBitsRemaining-1) (2*currentIntAccum + (if currentWord `testBit` bitIndex then 1 else 0)) intsRemaining binaryBytes bytes 36 | -------------------------------------------------------------------------------- /GolombQuery.js: -------------------------------------------------------------------------------- 1 | function golombFilterQueries(lineCount, modulus, binaryBits, golombCodedSequence, queries, partialSumBitcounts) { // queries :: [(String, kWin :: IO (), kFail :: IO ())] 2 | var queriesHashes = queries.map(function(s_kW_kF) { return [hashMod(modulus,s_kW_kF[0]), s_kW_kF[1], s_kW_kF[2]] }); 3 | var sortedHashes = queriesHashes.sort(function(s_kW_kF1, s_kW_kF2) { return s_kW_kF1[0] - s_kW_kF2[0] }); 4 | fastGolombDecodeIsectK(lineCount, binaryBits, golombCodedSequence, sortedHashes, partialSumBitcounts.slice(0)); 5 | } 6 | 7 | function hashMod(modulus, string) { 8 | return BigInteger.parse(hex_md5(string),16).remainder(BigInteger(modulus)).toJSValue(); 9 | // we return JS integers (doubles) instead of big ints because even at 0.001% FP rate, you'd need well over a quadrillion values inserted into your sequence for it to matter 10 | } 11 | 12 | // fastGolombDecode decodes lineCount bytes from the start of a base64-encoded golomb-coded sequence in the string bytes, fuses the iterative sum, and the set intersection with queries. 13 | // the code is identical to the haskell original except that the tail recursion has been hand-compiled to an imperative loop, and the input is base64-encoded. 14 | // it also skips ahead based on partial sums and bit positions collected in an index. 15 | function fastGolombDecodeIsectK(lineCount, binaryBits, bytes, queries, partialSumBitcounts) { 16 | var alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/'; 17 | var mapping = {}; 18 | alphabet.split(/(?=.)/).forEach(function(char,i) { mapping[char] = i }); 19 | var sequenceAccumulator = 0; 20 | var unary = true, bytePtr = 0, bitIndex = 0, currentWord = 0, intBitsRemaining = 0, currentIntAccum = 0, intsRemaining = 0; 21 | bitIndex = -1; 22 | var firstQueryHash = queries[0][0]; 23 | var firstPartialSum = partialSumBitcounts[0][0]; 24 | intsRemaining = lineCount; 25 | while (true) { 26 | //console.log("unary?", unary, "firstQuery", JSON.stringify(queries[0]), "firstIndexItem", JSON.stringify(partialSumBitcounts[0]), "bytePtr", bytePtr, "bitIndex", bitIndex, "currentWord", currentWord.toString(2), "intBitsRemaining", intBitsRemaining, "currentIntAccum", currentIntAccum.toString(2), "intsRemaining", intsRemaining); 27 | if(!unary) { 28 | if(intBitsRemaining == 0) { 29 | sequenceAccumulator += currentIntAccum; 30 | if(sequenceAccumulator > firstQueryHash) { 31 | queries[0][2](); 32 | queries.shift(); 33 | if(queries.length > 0) { 34 | firstQueryHash = queries[0][0]; 35 | } 36 | } else { 37 | while(queries.length > 0 && sequenceAccumulator == firstQueryHash) { 38 | queries[0][1](); 39 | queries.shift(); 40 | if(queries.length > 0) { 41 | firstQueryHash = queries[0][0]; 42 | } 43 | } 44 | } 45 | if(queries.length == 0) return; 46 | unary = true; 47 | // bitIndex stays as is 48 | // currentWord stays as is 49 | currentIntAccum = 0; 50 | intsRemaining -= 1; 51 | 52 | while(partialSumBitcounts.length > 0 && firstPartialSum < firstQueryHash) { 53 | sequenceAccumulator = partialSumBitcounts[0][0]; 54 | bitIndex = 5 - (partialSumBitcounts[0][1] % 6); 55 | bytePtr = (partialSumBitcounts[0][1] - (5 - bitIndex)) / 6; 56 | currentWord = mapping[bytes[bytePtr]]; 57 | bytePtr += 1; // bytePtr points to (tail word8s). 58 | intsRemaining = lineCount - partialSumBitcounts[0][2]; 59 | //console.log("pulled off an index entry, indices and queries are ", JSON.stringify(partialSumBitcounts), JSON.stringify(queries), "bytePtr", bytePtr, "bitIndex", bitIndex); 60 | partialSumBitcounts.shift(); 61 | if(partialSumBitcounts.length > 0) firstPartialSum = partialSumBitcounts[0][0]; 62 | } 63 | //console.log("sequence accumulator", sequenceAccumulator, "first query", JSON.stringify(queries[0])); 64 | 65 | } else if(bitIndex == -1) { 66 | bitIndex = 5; 67 | currentWord = mapping[bytes[bytePtr]]; 68 | // intBitsRemaining stays as is 69 | // currentIntAccum stays as is 70 | // intsRemaining stays as is 71 | bytePtr += 1; 72 | } else { 73 | currentIntAccum = (currentIntAccum << 1) + ((currentWord >> bitIndex) & 1); //OoO because bitIndex changes 74 | bitIndex -= 1; 75 | // currentWord stays as is 76 | intBitsRemaining -= 1; 77 | // intsRemaining stays as is 78 | } 79 | continue; 80 | } 81 | // unary is true, because of the continue 82 | if(intsRemaining == 0) { 83 | while(queries.length > 0) { 84 | queries[0][2](); queries.shift(); 85 | } 86 | return; 87 | } 88 | if(bitIndex == -1) { 89 | bitIndex = 5; 90 | currentWord = mapping[bytes[bytePtr]]; 91 | // currentIntAccum stays as is 92 | // intsRemaining stays as is 93 | bytePtr += 1; 94 | } else if(currentWord & (1 << bitIndex)) { 95 | bitIndex -= 1; 96 | // currentWord stays as is 97 | currentIntAccum += 1; 98 | // intsRemaining stays as is 99 | } else { 100 | unary = false; 101 | bitIndex -= 1; 102 | // currentWord stays as is 103 | intBitsRemaining = binaryBits; 104 | // currentIntAccum stays as is 105 | } 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /HashSequence.hs: -------------------------------------------------------------------------------- 1 | module HashSequence where 2 | import qualified Data.ByteString.Lazy.Char8 as B 3 | import Data.Digest.Pure.MD5 (md5) 4 | import qualified Text.Show.ByteString as BS (show) 5 | 6 | hashMod :: Integer -> B.ByteString -> Integer 7 | hashMod modulus string = (read $ ("0x" ++) $ show $ md5 string) `mod` modulus 8 | 9 | processLine modulus = BS.show . hashMod modulus 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | gcs 2 | === 3 | 4 | Golomb-compressed sequences (with indices) for large datasets 5 | 6 | To create a golomb-compressed sequence, first install packages and compile code. 7 | 8 | cabal install bytestring-show pure-md5 9 | 10 | ghc --make -O2 make-hash-sequences.hs 11 | ghc --make -O2 golomb-encode.hs 12 | ghc --make -O2 make-index.hs 13 | ghc --make -O2 base-64-encode.hs 14 | 15 | And then, write one key per line on stdin to mk-gcs.sh, and mk-gcs.sh will write binary data to stdout, and will write a JSON object to stderr. 16 | (The JSON object makes sense to use in a browser, the binary data makes sense to use on the command line.) 17 | -------------------------------------------------------------------------------- /base-64-encode.hs: -------------------------------------------------------------------------------- 1 | import Data.ByteString.Lazy.Char8 as B 2 | import Data.ByteString.Base64.Lazy as B64 3 | 4 | main = B.interact B64.encode 5 | -------------------------------------------------------------------------------- /golomb-decode.hs: -------------------------------------------------------------------------------- 1 | import GolombQuery 2 | import System.Environment (getArgs) 3 | import qualified Data.ByteString.Lazy as BWord 4 | 5 | main = do [lineCount, binaryBits] <- getArgs -- todo: use the -index via aeson 6 | f <- BWord.getContents 7 | putStr $ unlines $ map show $ fastGolombDecode (read lineCount) (read binaryBits) f 8 | -------------------------------------------------------------------------------- /golomb-encode.hs: -------------------------------------------------------------------------------- 1 | import GolombEncode 2 | import System.Environment (getArgs) 3 | import qualified Data.ByteString.Lazy.Char8 as B 4 | import Data.Maybe (fromJust) 5 | 6 | main = do [binaryBits] <- getArgs 7 | B.interact (B.pack . map (toEnum . fromEnum) . fastGolombEncode (read binaryBits) . linewiseDiff . map (fst . fromJust . B.readInteger) . B.lines) 8 | -------------------------------------------------------------------------------- /golomb-query.hs: -------------------------------------------------------------------------------- 1 | import GolombQuery 2 | import System.Environment (getArgs) 3 | import qualified Data.ByteString.Lazy as BWord 4 | import qualified Data.ByteString.Lazy.Char8 as BChar 5 | 6 | main = do [lineCount, modulus, binaryBits, gcsFile] <- getArgs -- todo: use the -index via aeson 7 | f <- BWord.readFile gcsFile 8 | queries <- BChar.getContents 9 | BChar.putStr $ BChar.unlines $ golombFilterQueries (read lineCount) (read modulus) (read binaryBits) f (BChar.lines queries) 10 | -------------------------------------------------------------------------------- /make-hash-sequences.hs: -------------------------------------------------------------------------------- 1 | import qualified Data.ByteString.Lazy.Char8 as B 2 | import System.Environment (getArgs) 3 | import HashSequence (processLine) 4 | 5 | main = do [modulus] <- getArgs 6 | B.interact (B.unlines . map (processLine (read modulus)) . B.lines) 7 | -------------------------------------------------------------------------------- /make-hash-sequences.rb: -------------------------------------------------------------------------------- 1 | require 'digest' 2 | 3 | Modulus = ARGV[0].to_i 4 | STDIN.each_line {|line| puts Digest::MD5.hexdigest(line.chomp).to_i(16) % Modulus } 5 | -------------------------------------------------------------------------------- /make-index.hs: -------------------------------------------------------------------------------- 1 | import System.Environment (getArgs) 2 | 3 | main = do [binaryBits, indexFrequency] <- getArgs 4 | interact (show . every (read indexFrequency) . sumsBitcounts (read binaryBits) . map read . lines) 5 | 6 | sumsBitcounts :: Integer -> [Integer] -> [[Integer]] 7 | sumsBitcounts binaryBits = linewiseTupleSum . map (\ di -> [di, sizeof binaryBits di, 1] ) . linewiseDiff 8 | 9 | linewiseTupleSum = scanl1 (zipWith (+)) 10 | 11 | linewiseDiff is = zipWith (-) is (0:is) 12 | 13 | sizeof binaryBits i = (i `div` (2 ^ binaryBits)) + 1 + binaryBits 14 | 15 | every n xs = case drop (n-1) xs of 16 | (y:ys) -> y : every n ys 17 | [] -> [] -------------------------------------------------------------------------------- /mk-gcs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | input=`mktemp` 3 | hashes=`mktemp` 4 | sequence=`mktemp` 5 | gzip -c1 > $input 6 | linecount=`gzip -cd $input | wc -l` 7 | 8 | fp=100 9 | binarybits= 10 | modulus=$(( linecount * fp )) 11 | indexfrequency=16384 12 | while getopts "hp:b:m:i:" OPTION 13 | do 14 | case $OPTION in 15 | h) 16 | echo "usage: $0 -p false-positive-rate (default: 100) -b bits-for-binary-Rice-encoding (default: floor(log2(fp))) -m hash-modulus (default: linecount * fp) -i index-stride (default: 16k)" 17 | echo "stdin: one key per line. stdout: golomb-compressed sequence. stderr: json with base64-encoded gcs and index." 18 | exit 1 19 | ;; 20 | p) 21 | fp=$OPTARG 22 | modulus=$(( linecount * fp )) 23 | ;; 24 | b) 25 | binarybits=$OPTARG 26 | ;; 27 | m) 28 | modulus=$OPTARG 29 | ;; 30 | i) 31 | indexfrequency=$OPTARG 32 | ;; 33 | esac 34 | done 35 | 36 | if [ -z $binarybits ] 37 | then 38 | binarybits=`awk "BEGIN { print int(log($fp)/log(2)) }"` 39 | fi 40 | 41 | #sort -n can't handle bigints 42 | export LC_ALL=C 43 | gzip -cd $input | ./make-hash-sequences $modulus | awk -v len=${#modulus} '{ printf("% " len "s\n",$1) }' | sort -S 50% -u | tr -d ' ' | gzip -c1 > $hashes 44 | 45 | gzip -cd $hashes | ./golomb-encode $binarybits > $sequence 46 | 47 | outputlinecount=`gzip -cd $hashes | wc -l` 48 | (echo '{' 49 | echo '"lineCount":' ${outputlinecount}, 50 | echo '"modulus":' ${modulus}, 51 | echo '"binaryBits":' ${binarybits}, 52 | echo -n '"partialSumBitcounts":' ; gzip -cd $hashes | ./make-index $binarybits $indexfrequency ; echo , 53 | echo -n '"b64EncodedGolombCodedSequence": "' ; ./base-64-encode < $sequence ; echo '"' 54 | echo '}') > /dev/stderr 55 | cat $sequence 56 | rm $input $hashes $sequence 57 | -------------------------------------------------------------------------------- /pedagogicalGolombCode.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | module GolombCode where 3 | import Data.List (unfoldr, foldl') 4 | import Data.Word 5 | 6 | golombCode :: Int -> Int -> [Bool] 7 | golombCode !binaryBits !integer = encodeUnary quotient ++ encodeBinary binaryBits remainder 8 | where (quotient, remainder) = integer `divMod` (2 ^ binaryBits) 9 | 10 | encodeUnary !quotient = go quotient [False] 11 | where go 0 bits = bits 12 | go !n bits = go (n-1) (True : bits) 13 | 14 | encodeBinary !binaryBits !integer = reverse $ unfoldr numToBools (binaryBits,integer) 15 | where numToBools (!mB,!i) = if mB == 0 then Nothing else Just (odd i, (mB-1, i `div` 2)) 16 | 17 | golombDecode :: Int -> [Bool] -> (Int, [Bool]) 18 | golombDecode binaryBits bits = (quotient * (2 ^ binaryBits) + remainder, rest) 19 | where (!quotient, binaryAndRest) = decodeUnary bits 20 | (!remainder, rest) = decodeBinary binaryBits binaryAndRest 21 | 22 | decodeUnary bits = (length ones, rest) 23 | where (ones, z : rest) = span id bits 24 | 25 | decodeBinary binaryBits bits = (int, rest) 26 | where (binary, rest) = splitAt binaryBits bits 27 | int = boolsToInt binary 28 | 29 | boolsToInt :: (Num a) => [Bool] -> a 30 | boolsToInt = foldl' (\ z e -> z*2 + (if e then 1 else 0)) 0 31 | 32 | boolsToBytes :: [Bool] -> Bool -> [Word8] 33 | boolsToBytes (b7:(b6:(b5:(b4:(b3:(b2:(b1:(b0:nextBytes)))))))) padding = (boolsToInt [b7,b6,b5,b4,b3,b2,b1,b0]) : (boolsToBytes nextBytes padding) 34 | boolsToBytes [] padding = [] 35 | boolsToBytes fewBitsShort padding = [boolsToInt $ take 8 $ fewBitsShort ++ repeat padding] 36 | 37 | decodeMany 0 binaryBits bits = [] 38 | decodeMany !n binaryBits bits = i : (decodeMany (n-1) binaryBits rest) 39 | where (i,rest) = golombDecode binaryBits bits 40 | 41 | golombCodesNoBlocks :: Int -> [Int] -> [Word8] 42 | golombCodesNoBlocks !binaryBits ints = boolsToBytes (concatMap (golombCode binaryBits) ints) False 43 | 44 | golombDecodesNoBlocks :: Int -> Integer -> [Word8] -> [Int] 45 | golombDecodesNoBlocks !binaryBits !count word8s = decodeMany count binaryBits (concatMap (encodeBinary 8) word8s) 46 | 47 | linewiseDiff :: [Integer] -> [Int] 48 | linewiseDiff ints = zipWith (\ a b -> fromIntegral (a-b) ) ints (0:ints) 49 | 50 | -- test_golomb = quickCheckWith (stdArgs {maxSuccess = 10000}) (\ ints -> let positives = map abs ints in golombDecodesNoBlocks 2 (fromIntegral $ length ints) (golombCodesNoBlocks 2 positives) == positives) --------------------------------------------------------------------------------