├── GolombEncode.hs
├── GolombQuery.hs
├── GolombQuery.js
├── HashSequence.hs
├── README.md
├── base-64-encode.hs
├── golomb-decode.hs
├── golomb-encode.hs
├── golomb-query.hs
├── make-hash-sequences.hs
├── make-hash-sequences.rb
├── make-index.hs
├── mk-gcs.sh
└── pedagogicalGolombCode.hs


/GolombEncode.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE BangPatterns #-}
 2 | module GolombEncode where
 3 | import Data.Word
 4 | import Data.Bits
 5 | 
 6 | linewiseDiff :: [Integer] -> [Int]
 7 | linewiseDiff ints = zipWith (\ a b -> fromIntegral (a-b) ) ints (0:ints)
 8 | 
 9 | fastGolombEncode :: Int -> [Int] -> [Word8]
10 | fastGolombEncode !binaryBits !ints = fGEUnary 7 0 (head ints) binaryBits (tail ints)
11 | 
12 | fGEUnary :: Int -> Word8 -> Int -> Int -> [Int] -> [Word8]
13 | fGEUnary (-1) !currentWord !currentInt !binaryBits !ints = currentWord : (fGEUnary 7 0 currentInt binaryBits ints)
14 | fGEUnary !bitIndex !currentWord !currentInt !binaryBits !ints | currentInt < (2 ^ binaryBits) = fGEBinary (bitIndex-1) currentWord currentInt (binaryBits-1) binaryBits ints
15 |                                                               | otherwise = fGEUnary (bitIndex-1) (currentWord `setBit` bitIndex) (currentInt - (2 ^ binaryBits)) binaryBits ints
16 | 
17 | fGEBinary :: Int -> Word8 -> Int -> Int -> Int -> [Int] -> [Word8]
18 | fGEBinary (-1) !currentWord !currentInt !intBitIndex !binaryBits !ints = currentWord : (fGEBinary 7 0 currentInt intBitIndex binaryBits ints)
19 | fGEBinary !bitIndex !currentWord !currentInt (-1) !binaryBits !ints = if null ints then [currentWord] else (fGEUnary bitIndex currentWord (head ints) binaryBits (tail ints))
20 | fGEBinary !bitIndex !currentWord !currentInt !intBitIndex !binaryBits !ints = fGEBinary (bitIndex-1) (if (currentInt `testBit` intBitIndex) then (currentWord `setBit` bitIndex) else currentWord) currentInt (intBitIndex-1) binaryBits ints


--------------------------------------------------------------------------------
/GolombQuery.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE BangPatterns #-}
 2 | module GolombQuery where
 3 | import HashSequence
 4 | import Data.Word
 5 | import qualified Data.ByteString.Lazy.Char8 as B
 6 | import qualified Data.ByteString.Lazy as BW
 7 | import Data.List.Ordered (isect)
 8 | import Data.List (sort)
 9 | import Data.Bits (testBit)
10 | 
11 | linewiseSum :: [Int] -> [Integer]
12 | linewiseSum = scanl ((. fromIntegral) . (+)) 0
13 | 
14 | golombFilterQueries :: Integer -> Integer -> Word8 -> BW.ByteString -> [B.ByteString] -> [B.ByteString]
15 | golombFilterQueries lineCount modulus binaryBits golombCodedSequence queries = filteredQueries
16 |   where sortedHashes = sort [h | (q, h) <- queriesHashes]
17 |         queriesHashes = [(q, hashMod modulus q) | q <- queries]
18 |         filteredHashes = isect unencodedSequence sortedHashes
19 |         filteredQueries = [q | (q,h) <- queriesHashes, h `elem` filteredHashes]
20 |         unencodedSequence = linewiseSum (fastGolombDecode lineCount binaryBits golombCodedSequence)
21 | 
22 | fastGolombDecode :: Integer -> Word8 -> BW.ByteString -> [Int]
23 | fastGolombDecode !lineCount !binaryBits !bytes = fGDUnary (-1) 0 0 lineCount binaryBits bytes
24 | 
25 | fGDUnary :: Int -> Word8 -> Int -> Integer -> Word8 -> BW.ByteString -> [Int]
26 | fGDUnary !bitIndex !currentWord !currentIntAccum 0 !binaryBytes !bytes = []
27 | fGDUnary (-1) !currentWord !currentIntAccum !intsRemaining !binaryBytes !bytes = fGDUnary 7 (BW.head bytes) currentIntAccum intsRemaining binaryBytes (BW.tail bytes)
28 | fGDUnary !bitIndex !currentWord !currentIntAccum !intsRemaining !binaryBytes !bytes = if currentWord `testBit` bitIndex
29 |                                                                                         then fGDUnary (bitIndex-1) currentWord (currentIntAccum+1) intsRemaining binaryBytes bytes
30 |                                                                                         else fGDBinary (bitIndex-1) currentWord binaryBytes currentIntAccum intsRemaining binaryBytes bytes
31 | 
32 | fGDBinary :: Int -> Word8 -> Word8 -> Int -> Integer -> Word8 -> BW.ByteString -> [Int]
33 | fGDBinary !bitIndex !currentWord 0 !currentIntAccum !intsRemaining !binaryBytes !bytes = currentIntAccum : (fGDUnary bitIndex currentWord 0 (intsRemaining-1) binaryBytes bytes)
34 | fGDBinary (-1) !currentWord !intBitsRemaining !currentIntAccum !intsRemaining !binaryBytes !bytes = fGDBinary 7 (BW.head bytes) intBitsRemaining currentIntAccum intsRemaining binaryBytes (BW.tail bytes)
35 | fGDBinary !bitIndex !currentWord !intBitsRemaining !currentIntAccum !intsRemaining !binaryBytes !bytes = fGDBinary (bitIndex-1) currentWord (intBitsRemaining-1) (2*currentIntAccum + (if currentWord `testBit` bitIndex then 1 else 0)) intsRemaining binaryBytes bytes
36 | 


--------------------------------------------------------------------------------
/GolombQuery.js:
--------------------------------------------------------------------------------
  1 | function golombFilterQueries(lineCount, modulus, binaryBits, golombCodedSequence, queries, partialSumBitcounts) { // queries :: [(String, kWin :: IO (), kFail :: IO ())]
  2 |     var queriesHashes = queries.map(function(s_kW_kF) { return [hashMod(modulus,s_kW_kF[0]), s_kW_kF[1], s_kW_kF[2]] });
  3 |     var sortedHashes = queriesHashes.sort(function(s_kW_kF1, s_kW_kF2) { return s_kW_kF1[0] - s_kW_kF2[0] });
  4 |     fastGolombDecodeIsectK(lineCount, binaryBits, golombCodedSequence, sortedHashes, partialSumBitcounts.slice(0));
  5 | }
  6 | 
  7 | function hashMod(modulus, string) {
  8 |     return BigInteger.parse(hex_md5(string),16).remainder(BigInteger(modulus)).toJSValue();
  9 |     // we return JS integers (doubles) instead of big ints because even at 0.001% FP rate, you'd need well over a quadrillion values inserted into your sequence for it to matter
 10 | }
 11 | 
 12 | // fastGolombDecode decodes lineCount bytes from the start of a base64-encoded golomb-coded sequence in the string bytes, fuses the iterative sum, and the set intersection with queries.
 13 | // the code is identical to the haskell original except that the tail recursion has been hand-compiled to an imperative loop, and the input is base64-encoded.
 14 | // it also skips ahead based on partial sums and bit positions collected in an index.
 15 | function fastGolombDecodeIsectK(lineCount, binaryBits, bytes, queries, partialSumBitcounts) {
 16 |     var alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/';
 17 |     var mapping = {};
 18 |     alphabet.split(/(?=.)/).forEach(function(char,i) { mapping[char] = i });
 19 |     var sequenceAccumulator = 0;
 20 |     var unary = true, bytePtr = 0, bitIndex = 0, currentWord = 0, intBitsRemaining = 0, currentIntAccum = 0, intsRemaining = 0;
 21 |     bitIndex = -1;
 22 |     var firstQueryHash = queries[0][0];
 23 |     var firstPartialSum = partialSumBitcounts[0][0];
 24 |     intsRemaining = lineCount;
 25 |     while (true) {
 26 | 	//console.log("unary?", unary, "firstQuery", JSON.stringify(queries[0]), "firstIndexItem", JSON.stringify(partialSumBitcounts[0]), "bytePtr", bytePtr, "bitIndex", bitIndex, "currentWord", currentWord.toString(2), "intBitsRemaining", intBitsRemaining, "currentIntAccum", currentIntAccum.toString(2), "intsRemaining", intsRemaining);
 27 | 	if(!unary) {
 28 | 	    if(intBitsRemaining == 0) {
 29 | 		sequenceAccumulator += currentIntAccum;
 30 | 		if(sequenceAccumulator > firstQueryHash) {
 31 | 		    queries[0][2]();
 32 | 		    queries.shift();
 33 | 		    if(queries.length > 0) {
 34 |  			firstQueryHash = queries[0][0];
 35 | 		    }
 36 | 		} else {
 37 | 		    while(queries.length > 0 && sequenceAccumulator == firstQueryHash) {
 38 | 			queries[0][1]();
 39 | 			queries.shift();
 40 | 			if(queries.length > 0) {
 41 | 			    firstQueryHash = queries[0][0];
 42 | 			}
 43 | 		    }
 44 | 		}
 45 | 		if(queries.length == 0) return;
 46 | 		unary = true;
 47 | 		// bitIndex stays as is
 48 | 		// currentWord stays as is
 49 | 		currentIntAccum = 0;
 50 | 		intsRemaining -= 1;
 51 | 
 52 | 		while(partialSumBitcounts.length > 0 && firstPartialSum < firstQueryHash) {
 53 | 		    sequenceAccumulator = partialSumBitcounts[0][0];
 54 | 		    bitIndex = 5 - (partialSumBitcounts[0][1] % 6);
 55 | 		    bytePtr = (partialSumBitcounts[0][1] - (5 - bitIndex)) / 6;
 56 | 		    currentWord = mapping[bytes[bytePtr]];
 57 | 		    bytePtr += 1; // bytePtr points to (tail word8s).
 58 | 		    intsRemaining = lineCount - partialSumBitcounts[0][2];
 59 | 		    //console.log("pulled off an index entry, indices and queries are ", JSON.stringify(partialSumBitcounts), JSON.stringify(queries), "bytePtr", bytePtr, "bitIndex", bitIndex);
 60 | 		    partialSumBitcounts.shift();
 61 | 		    if(partialSumBitcounts.length > 0) firstPartialSum = partialSumBitcounts[0][0];
 62 | 		}
 63 | 		//console.log("sequence accumulator", sequenceAccumulator, "first query", JSON.stringify(queries[0]));
 64 | 
 65 | 	    } else if(bitIndex == -1) {
 66 | 		bitIndex = 5;
 67 | 		currentWord = mapping[bytes[bytePtr]];
 68 | 		// intBitsRemaining stays as is
 69 | 		// currentIntAccum stays as is
 70 | 		// intsRemaining stays as is
 71 | 		bytePtr += 1;
 72 | 	    } else {
 73 | 		currentIntAccum = (currentIntAccum << 1) + ((currentWord >> bitIndex) & 1); //OoO because bitIndex changes
 74 | 		bitIndex -= 1;
 75 | 		// currentWord stays as is
 76 | 		intBitsRemaining -= 1;
 77 | 		// intsRemaining stays as is
 78 | 	    }
 79 | 	    continue;
 80 | 	}
 81 | 	// unary is true, because of the continue
 82 | 	if(intsRemaining == 0) {
 83 | 	    while(queries.length > 0) {
 84 | 		queries[0][2](); queries.shift();
 85 | 	    }
 86 | 	    return;
 87 | 	}
 88 |         if(bitIndex == -1) {
 89 | 	    bitIndex = 5;
 90 | 	    currentWord = mapping[bytes[bytePtr]];
 91 | 	    // currentIntAccum stays as is
 92 | 	    // intsRemaining stays as is
 93 | 	    bytePtr += 1;
 94 | 	} else if(currentWord & (1 << bitIndex)) {
 95 | 	    bitIndex -= 1;
 96 | 	    // currentWord stays as is
 97 | 	    currentIntAccum += 1;
 98 | 	    // intsRemaining stays as is
 99 | 	} else {
100 | 	    unary = false;
101 | 	    bitIndex -= 1;
102 | 	    // currentWord stays as is
103 | 	    intBitsRemaining = binaryBits;
104 | 	    // currentIntAccum stays as is
105 | 	}
106 |     }
107 | }
108 | 


--------------------------------------------------------------------------------
/HashSequence.hs:
--------------------------------------------------------------------------------
 1 | module HashSequence where
 2 | import qualified Data.ByteString.Lazy.Char8 as B
 3 | import Data.Digest.Pure.MD5 (md5)
 4 | import qualified Text.Show.ByteString as BS (show)
 5 | 
 6 | hashMod :: Integer -> B.ByteString -> Integer
 7 | hashMod modulus string = (read $ ("0x" ++) $ show $ md5 string) `mod` modulus
 8 | 
 9 | processLine modulus = BS.show . hashMod modulus
10 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | gcs
 2 | ===
 3 | 
 4 | Golomb-compressed sequences (with indices) for large datasets
 5 | 
 6 | To create a golomb-compressed sequence, first install packages and compile code.
 7 | 
 8 |     cabal install bytestring-show pure-md5
 9 | 
10 |     ghc --make -O2 make-hash-sequences.hs
11 |     ghc --make -O2 golomb-encode.hs
12 |     ghc --make -O2 make-index.hs
13 |     ghc --make -O2 base-64-encode.hs
14 | 
15 | And then, write one key per line on stdin to mk-gcs.sh, and mk-gcs.sh will write binary data to stdout, and will write a JSON object to stderr.
16 | (The JSON object makes sense to use in a browser, the binary data makes sense to use on the command line.)
17 | 


--------------------------------------------------------------------------------
/base-64-encode.hs:
--------------------------------------------------------------------------------
1 | import Data.ByteString.Lazy.Char8 as B
2 | import Data.ByteString.Base64.Lazy as B64
3 | 
4 | main = B.interact B64.encode
5 | 


--------------------------------------------------------------------------------
/golomb-decode.hs:
--------------------------------------------------------------------------------
1 | import GolombQuery
2 | import System.Environment (getArgs)
3 | import qualified Data.ByteString.Lazy as BWord
4 | 
5 | main = do [lineCount, binaryBits] <- getArgs  -- todo: use the -index via aeson
6 |           f <- BWord.getContents
7 | 	  putStr $ unlines $ map show $ fastGolombDecode (read lineCount) (read binaryBits) f
8 | 


--------------------------------------------------------------------------------
/golomb-encode.hs:
--------------------------------------------------------------------------------
1 | import GolombEncode
2 | import System.Environment (getArgs)
3 | import qualified Data.ByteString.Lazy.Char8 as B
4 | import Data.Maybe (fromJust)
5 | 
6 | main = do [binaryBits] <- getArgs
7 |           B.interact (B.pack . map (toEnum . fromEnum) . fastGolombEncode (read binaryBits) . linewiseDiff . map (fst . fromJust . B.readInteger) . B.lines)
8 | 


--------------------------------------------------------------------------------
/golomb-query.hs:
--------------------------------------------------------------------------------
 1 | import GolombQuery
 2 | import System.Environment (getArgs)
 3 | import qualified Data.ByteString.Lazy as BWord
 4 | import qualified Data.ByteString.Lazy.Char8 as BChar
 5 | 
 6 | main = do [lineCount, modulus, binaryBits, gcsFile] <- getArgs  -- todo: use the -index via aeson
 7 |           f <- BWord.readFile gcsFile
 8 | 	  queries <- BChar.getContents
 9 | 	  BChar.putStr $ BChar.unlines $ golombFilterQueries (read lineCount) (read modulus) (read binaryBits) f (BChar.lines queries)
10 | 


--------------------------------------------------------------------------------
/make-hash-sequences.hs:
--------------------------------------------------------------------------------
1 | import qualified Data.ByteString.Lazy.Char8 as B
2 | import System.Environment (getArgs)
3 | import HashSequence (processLine)
4 | 
5 | main = do [modulus] <- getArgs
6 |           B.interact (B.unlines . map (processLine (read modulus)) . B.lines)
7 | 


--------------------------------------------------------------------------------
/make-hash-sequences.rb:
--------------------------------------------------------------------------------
1 | require 'digest'
2 | 
3 | Modulus = ARGV[0].to_i
4 | STDIN.each_line {|line| puts Digest::MD5.hexdigest(line.chomp).to_i(16) % Modulus }
5 | 


--------------------------------------------------------------------------------
/make-index.hs:
--------------------------------------------------------------------------------
 1 | import System.Environment (getArgs)
 2 | 
 3 | main = do [binaryBits, indexFrequency] <- getArgs
 4 |           interact (show . every (read indexFrequency) . sumsBitcounts (read binaryBits) . map read . lines)
 5 | 
 6 | sumsBitcounts :: Integer -> [Integer] -> [[Integer]]
 7 | sumsBitcounts binaryBits = linewiseTupleSum . map (\ di -> [di, sizeof binaryBits di, 1] ) . linewiseDiff
 8 | 
 9 | linewiseTupleSum = scanl1 (zipWith (+))
10 | 
11 | linewiseDiff is = zipWith (-) is (0:is)
12 | 
13 | sizeof binaryBits i = (i `div` (2 ^ binaryBits)) + 1 + binaryBits
14 | 
15 | every n xs = case drop (n-1) xs of
16 |               (y:ys) -> y : every n ys
17 |               [] -> []


--------------------------------------------------------------------------------
/mk-gcs.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | input=`mktemp`
 3 | hashes=`mktemp`
 4 | sequence=`mktemp`
 5 | gzip -c1 > $input
 6 | linecount=`gzip -cd $input | wc -l`
 7 | 
 8 | fp=100
 9 | binarybits=
10 | modulus=$(( linecount * fp ))
11 | indexfrequency=16384
12 | while getopts "hp:b:m:i:" OPTION
13 | do
14 |     case $OPTION in
15 | 	h)
16 | 	    echo "usage: $0 -p false-positive-rate (default: 100) -b bits-for-binary-Rice-encoding (default: floor(log2(fp))) -m hash-modulus (default: linecount * fp) -i index-stride (default: 16k)"
17 | 	    echo "stdin: one key per line. stdout: golomb-compressed sequence. stderr: json with base64-encoded gcs and index."
18 | 	    exit 1
19 | 	    ;;
20 | 	p)
21 | 	    fp=$OPTARG
22 | 	    modulus=$(( linecount * fp ))
23 | 	    ;;
24 | 	b)
25 | 	    binarybits=$OPTARG
26 | 	    ;;
27 | 	m)
28 | 	    modulus=$OPTARG
29 | 	    ;;
30 | 	i)
31 | 	    indexfrequency=$OPTARG
32 | 	    ;;
33 |     esac
34 | done
35 | 
36 | if [ -z $binarybits ]
37 | then
38 |     binarybits=`awk "BEGIN { print int(log($fp)/log(2)) }"`
39 | fi
40 | 
41 | #sort -n can't handle bigints
42 | export LC_ALL=C
43 | gzip -cd $input | ./make-hash-sequences $modulus | awk -v len=${#modulus} '{ printf("% " len "s\n",$1) }' | sort -S 50% -u | tr -d ' ' | gzip -c1 > $hashes
44 | 
45 | gzip -cd $hashes | ./golomb-encode $binarybits > $sequence
46 | 
47 | outputlinecount=`gzip -cd $hashes | wc -l`
48 | (echo '{'
49 |   echo '"lineCount":' ${outputlinecount},
50 |   echo '"modulus":' ${modulus},
51 |   echo '"binaryBits":' ${binarybits},
52 |   echo -n '"partialSumBitcounts":' ; gzip -cd $hashes | ./make-index $binarybits $indexfrequency ; echo ,
53 |   echo -n '"b64EncodedGolombCodedSequence": "' ; ./base-64-encode < $sequence ; echo '"'
54 | echo '}') > /dev/stderr
55 | cat $sequence
56 | rm $input $hashes $sequence
57 | 


--------------------------------------------------------------------------------
/pedagogicalGolombCode.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE BangPatterns #-}
 2 | module GolombCode where
 3 | import Data.List (unfoldr, foldl')
 4 | import Data.Word
 5 | 
 6 | golombCode :: Int -> Int -> [Bool]
 7 | golombCode !binaryBits !integer = encodeUnary quotient ++ encodeBinary binaryBits remainder
 8 |   where (quotient, remainder) = integer `divMod` (2 ^ binaryBits)
 9 | 
10 | encodeUnary !quotient = go quotient [False]
11 |   where go 0 bits = bits
12 |         go !n bits = go (n-1) (True : bits)
13 | 
14 | encodeBinary !binaryBits !integer = reverse $ unfoldr numToBools (binaryBits,integer)
15 |   where numToBools (!mB,!i) = if mB == 0 then Nothing else Just (odd i, (mB-1, i `div` 2))
16 | 
17 | golombDecode :: Int -> [Bool] -> (Int, [Bool])
18 | golombDecode binaryBits bits = (quotient * (2 ^ binaryBits) + remainder, rest)
19 |   where (!quotient, binaryAndRest) = decodeUnary bits
20 |         (!remainder, rest) = decodeBinary binaryBits binaryAndRest
21 | 
22 | decodeUnary bits = (length ones, rest)
23 |   where (ones, z : rest) = span id bits
24 | 
25 | decodeBinary binaryBits bits = (int, rest)
26 |   where (binary, rest) = splitAt binaryBits bits
27 |         int = boolsToInt binary
28 | 
29 | boolsToInt :: (Num a) => [Bool] -> a
30 | boolsToInt = foldl' (\ z e -> z*2 + (if e then 1 else 0)) 0
31 | 
32 | boolsToBytes :: [Bool] -> Bool -> [Word8]
33 | boolsToBytes (b7:(b6:(b5:(b4:(b3:(b2:(b1:(b0:nextBytes)))))))) padding = (boolsToInt [b7,b6,b5,b4,b3,b2,b1,b0]) : (boolsToBytes nextBytes padding)
34 | boolsToBytes [] padding = []
35 | boolsToBytes fewBitsShort padding = [boolsToInt $ take 8 $ fewBitsShort ++ repeat padding]
36 | 
37 | decodeMany 0 binaryBits bits = []
38 | decodeMany !n binaryBits bits = i : (decodeMany (n-1) binaryBits rest)
39 |   where (i,rest) = golombDecode binaryBits bits
40 | 
41 | golombCodesNoBlocks :: Int -> [Int] -> [Word8]
42 | golombCodesNoBlocks !binaryBits ints = boolsToBytes (concatMap (golombCode binaryBits) ints) False
43 | 
44 | golombDecodesNoBlocks :: Int -> Integer -> [Word8] -> [Int]
45 | golombDecodesNoBlocks !binaryBits !count word8s = decodeMany count binaryBits (concatMap (encodeBinary 8) word8s)
46 | 
47 | linewiseDiff :: [Integer] -> [Int]
48 | linewiseDiff ints = zipWith (\ a b -> fromIntegral (a-b) ) ints (0:ints)
49 | 
50 | -- test_golomb = quickCheckWith (stdArgs {maxSuccess = 10000}) (\ ints -> let positives = map abs ints in golombDecodesNoBlocks 2 (fromIntegral $ length ints) (golombCodesNoBlocks 2 positives) == positives)


--------------------------------------------------------------------------------