├── adix ├── nim.cfg ├── stat.nim ├── cpuCT.nim ├── memutil.nim ├── metab.nim ├── ways.nim ├── bitop.nim ├── uniqce.nim ├── topk.nim ├── amoft.nim ├── bltab.nim ├── oats.nim ├── embist.nim ├── cumsum.nim ├── lghisto.nim ├── sequint.nim ├── hist.nim ├── lmbist.nim ├── lna.nim ├── bist.nim └── tdigest.nim ├── util ├── nim.cfg └── lfreq.nim ├── tests ├── nim.cfg ├── mostCommon.nim ├── ctab.nim ├── wf.nims ├── toSet.nim ├── tBadHash2.nim ├── edkey.nim ├── repeats.nim ├── testTab.nim ├── tHighTab.nim ├── tRandom.nim ├── lfreq.awk ├── tParams.nim ├── mvWinGen ├── tSuperHigh.nim ├── anaForum.nim ├── dtshell.nim ├── tBadHash.nim ├── writeHash.nim ├── ppss.nim ├── anaPrime.nim ├── sshell.nim ├── kmCmp.sh ├── ucl.nim ├── tshell.nim ├── wu.nim ├── wfr.nim ├── wf.nim ├── bl.nim └── btshell.nim ├── .gitignore ├── adix.nimble ├── adix.nim ├── LICENSE ├── .github └── workflows │ └── ci.yml ├── TODO.md ├── NOTES.md └── README.md /adix/nim.cfg: -------------------------------------------------------------------------------- 1 | path=".." 2 | -------------------------------------------------------------------------------- /util/nim.cfg: -------------------------------------------------------------------------------- 1 | path=".." 2 | -------------------------------------------------------------------------------- /tests/nim.cfg: -------------------------------------------------------------------------------- 1 | path=".." 2 | path="../adix" 3 | path="../../cg" # cligen 4 | -------------------------------------------------------------------------------- /adix/stat.nim: -------------------------------------------------------------------------------- 1 | {.deprecated: "Use mvstat instead".} 2 | import mvstat 3 | export mvstat 4 | -------------------------------------------------------------------------------- /tests/mostCommon.nim: -------------------------------------------------------------------------------- 1 | import adix/lptabz 2 | let data = ["a", "b", "c", "c", "d", "e", "e", "f", "g", "g", "g", "h"] 3 | for tup in mostCommon(data, 3): echo tup 4 | -------------------------------------------------------------------------------- /tests/ctab.nim: -------------------------------------------------------------------------------- 1 | import metab 2 | var c = initTab[int8,int]() 3 | c.inc 2 4 | c.inc 3 5 | c.inc 3 6 | c.inc 2, -1 7 | c.inc 1 8 | echo c 9 | for k,v in c.topByVal(n=1): 10 | echo k," ",v 11 | -------------------------------------------------------------------------------- /tests/wf.nims: -------------------------------------------------------------------------------- 1 | switch("threads", "on") 2 | if defined(tcc): 3 | switch("tlsEmulation", "on") 4 | if (NimMajor,NimMinor,NimPatch) >= (1,6,0): switch("mm", "markAndSweep") 5 | else: switch("gc", "markAndSweep") 6 | switch("passL","-lm") 7 | -------------------------------------------------------------------------------- /tests/toSet.nim: -------------------------------------------------------------------------------- 1 | import cligen, adix/lptabz 2 | 3 | proc test(nums: seq[int32]) = 4 | when defined(fromVar): 5 | var s: LPSet[int32] 6 | s.setCap(nums.len) 7 | else: 8 | var s = initLPSet[int32](nums.len, minFree=0) 9 | for x in nums: s.incl x 10 | echo s.getCap 11 | echo s 12 | 13 | dispatch(test) 14 | -------------------------------------------------------------------------------- /tests/tBadHash2.nim: -------------------------------------------------------------------------------- 1 | import metab 2 | 3 | #import althash 4 | #proc hash(h, 
salt: Hash): Hash = hashRoMu1(h) xor cast[Hash](salt) 5 | 6 | var one = initSet[int]() 7 | for i in 0 ..< (1 shl 23): 8 | one.incl (i shl 25) 9 | 10 | var ds = one.depths 11 | echo ds 12 | echo one.len, "/", one.getCap 13 | echo "Stats: ", one.depthStats 14 | -------------------------------------------------------------------------------- /tests/edkey.nim: -------------------------------------------------------------------------------- 1 | import metab 2 | var c = initTab[int8,int]() 3 | c.inc 2 4 | c.inc 3 5 | c.inc 4 6 | c.inc 5 7 | c.inc 6 8 | c.inc 7 9 | c.inc 9 10 | c.inc 9 11 | c.editKey 9, 8 12 | for i in 2'i8..8: 13 | echo i, " ", c[i] 14 | 15 | let t = c 16 | echo t.nthPair(6) 17 | let tup = c.nthPair(0) 18 | echo tup[0] 19 | tup[1][] = 9 20 | echo c 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | noVC 2 | tests/anaPrime 3 | tests/anaForum 4 | tests/anaPrime 5 | tests/btshell 6 | tests/dtshell 7 | tests/ppss 8 | tests/repeats 9 | tests/sshell 10 | tests/tBadHash2 11 | tests/tBadHash 12 | tests/testTab 13 | tests/tHighTab 14 | tests/tParams 15 | tests/tRandom 16 | tests/tshell 17 | tests/tSuperHigh 18 | tests/writeHash 19 | tests/bl 20 | tests/wf 21 | patches/ 22 | -------------------------------------------------------------------------------- /adix.nimble: -------------------------------------------------------------------------------- 1 | # Package 2 | version = "0.7.6" 3 | author = "Charles Blake" 4 | description = "An Adaptive Index Library for Nim" 5 | license = "MIT/ISC" 6 | 7 | # Deps 8 | requires "nim >= 2.0.0" 9 | requires "cligen >= 1.9.5" 10 | skipDirs = @[ "tests" ] 11 | 12 | # Older Nim must use adix < 0.5.5 & comment out the below `bin`. 13 | bin = @[ 14 | "util/lfreq", # Somewhat efficient line frequency calculator 15 | ] 16 | -------------------------------------------------------------------------------- /tests/repeats.nim: -------------------------------------------------------------------------------- 1 | when not declared(stdin): import std/syncio 2 | import tables, cligen 3 | 4 | proc repeats(thresh=2) = 5 | ## Read 8 byte hashes from stdin & print histogram of any with count > thresh. 6 | var h: int64 7 | var cnt: Table[int64, int] 8 | while stdin.readBuffer(cast[cstring](h.addr), 8) == 8: 9 | cnt.mgetOrPut(h, 0).inc 10 | for h,c in cnt: 11 | if c >= thresh: 12 | echo "h: ", h, " count: ", c 13 | 14 | dispatch(repeats) 15 | -------------------------------------------------------------------------------- /tests/testTab.nim: -------------------------------------------------------------------------------- 1 | ## This is an `include` file to attach a test suite to a given ??tab.nim impl. 
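## (Usage sketch inferred from the code below, not stated elsewhere: a tiny
## driver file supplies a concrete, zero-argument `initTab` accepting `uint32`
## keys and `int` counts, then does `include testTab` to get the CLI test.)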
2 | 3 | import cligen 4 | 5 | proc test*(nums: seq[int32]) = 6 | var t = initTab() 7 | for x in nums: 8 | if x >= 0: 9 | echo "ADD ", $x 10 | t.mgetOrPut(x.uint32, 0).inc 11 | elif (-x).uint32 in t: echo "HAS ", $(-x) 12 | else: echo "NO ", $(-x) 13 | # echo t.s.data 14 | echo t 15 | # echo t.s.data 16 | # echo t.s.depths 17 | 18 | dispatch(test) 19 | -------------------------------------------------------------------------------- /tests/tHighTab.nim: -------------------------------------------------------------------------------- 1 | when not declared(addFloat): import std/formatfloat 2 | import metab, althash, times 3 | 4 | const shift = 49'u 5 | proc hash(x: uint64): Hash = hashRoMu1(x) 6 | 7 | var one = initTab[uint64, int](4, rehash=false) 8 | var t0 = epochTime() 9 | for i in 0'u ..< ((1'u shl 15) - 1): 10 | one[uint64(i shl shift)] = 2 11 | echo epochTime() - t0, " seconds" 12 | var ds = one.depths 13 | echo ds 14 | echo "MAX DEPTH: ", ds.len 15 | echo one.len, "/", one.getCap 16 | echo "Stats: ", one.depthStats 17 | # one.debugDump 18 | -------------------------------------------------------------------------------- /tests/tRandom.nim: -------------------------------------------------------------------------------- 1 | when not declared(addFloat): import std/formatfloat 2 | import os, strutils, metab, random 3 | 4 | randomize() 5 | 6 | let num = if paramCount() > 0: parseInt(paramStr(1)) else: 1 7 | let den = if paramCount() > 1: parseInt(paramStr(2)) else: 4 8 | let cnt = if paramCount() > 2: parseInt(paramStr(3)) else: 3*(1 shl 10) 9 | let nTr = if paramCount() > 3: parseInt(paramStr(4)) else: 30 10 | let rob = paramCount() > 4 11 | 12 | #echo "USING ", num, '/', den, " and ", cnt, " entries." 13 | for t in 1..nTr: 14 | var one = initSet[int](numer=num, denom=den, robinhood=rob) 15 | for i in 1..cnt: 16 | one.incl rand(1 shl 32) 17 | echo "Ut: ", one.len.float/one.getCap.float, " St: ", one.depthStats 18 | -------------------------------------------------------------------------------- /adix.nim: -------------------------------------------------------------------------------- 1 | when defined(nimdoc): 2 | import adix/althash 3 | import adix/amoft 4 | import adix/bist 5 | import adix/bitop 6 | import adix/bltab 7 | import adix/btree 8 | import adix/cpuCT 9 | import adix/cumsum 10 | import adix/ditab 11 | import adix/embist 12 | import adix/hist 13 | import adix/lghisto 14 | import adix/lmbist 15 | import adix/lna 16 | import adix/lptabz 17 | import adix/memutil 18 | import adix/metab 19 | import adix/mvstat 20 | import adix/nsort 21 | import adix/oats 22 | import adix/sequint 23 | import adix/stat 24 | import adix/tdigest 25 | import adix/topk 26 | import adix/uniqce 27 | import adix/xhist1 28 | import adix/ways 29 | else: 30 | {.error: "use `import adix/{module of interest}`".} 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2020 Charles L. Blake. 2 | 3 | Permission to use, copy, modify, and/or distribute this software for any 4 | purpose with or without fee is hereby granted, provided that the above 5 | copyright notice and this permission notice appear in all copies. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE 12 | OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | PERFORMANCE OF THIS SOFTWARE. 14 | -------------------------------------------------------------------------------- /tests/lfreq.awk: -------------------------------------------------------------------------------- 1 | #!/bin/awk -f 2 | {c[$0]++}END{for(k in c)print c[k],k} 3 | 4 | # As has surely been noted about a gajillion times by now, an improvement on 5 | # Doug McIlroy's solution in Knuth-McIlroy is (probably): 6 | # tr -cs A-Za-z \\n | # Non-alpha -> newline; NOTE: apostrophe-quoting ambig. 7 | # tr A-Z a-z | # ASCII upper to lower 8 | # lfreq.awk | # Histogram lines (this script) 9 | # sort -n | tail # top 10; Add `| tac` if you like decreasing 10 | # 11 | # While almost any timing strongly depends on used vocab & its sampled growth, 12 | # I get `mawk` 2..3X slower than optimized Nim & `gawk` ~2X slower than `mawk`. 13 | # 14 | # Given enough CPU cores, all above stages run in parallel & execution time is 15 | # bounded by pipe BW & the slowest stage - likely this AWK script. McIlroy's 16 | # `sort|uniq -c` method may be better if unique lines exceed avail. phys RAM & 17 | # next level of mem hierarchy has high rand.access latency (eg. Winchester,net). 18 | -------------------------------------------------------------------------------- /tests/tParams.nim: -------------------------------------------------------------------------------- 1 | when not declared(addFloat): import std/formatfloat 2 | import random, stats, adix/[metab, althash, bitop], cligen 3 | 4 | proc statsVratio(N=7000, MaxNum=(1 shl 32), numer=1, denom=1, 5 | rehash=true, Robin=true, trials=10) = 6 | randomize() 7 | var util, dmax, dmean, dvar: RunningStat 8 | for t in 1..trials: 9 | var set = initSet[int](numer=numer, denom=denom, 10 | rehash=rehash, robinhood=Robin) 11 | for i in 1..N: 12 | let c0 = set.getCap 13 | let ut = set.len.float / set.getCap.float 14 | let ds = set.depthStats 15 | set.incl rand(MaxNum) 16 | if set.getCap != c0 and c0 > 64: # resized 17 | util.push ut 18 | dmean.push ds[0] 19 | dvar.push ds[1] 20 | dmax.push ds[2].float / lg(set.getCap).float 21 | 22 | echo "ratio: " , numer, "/", denom, " util: " , util.mean, 23 | " dmean: ", dmean.mean, " dvar: " , dvar.mean, " dmax/lg: " , dmax.mean 24 | 25 | dispatch(statsVratio) 26 | -------------------------------------------------------------------------------- /tests/mvWinGen: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | if [ $# -lt 1 ]; then 3 | cat <<-EOF 4 | Convert numbers on stdin to ppss|bt-shell ops tracking a size \$1 moving window. 5 | Repeatably tests trees of many shapes, sizes, duplicate key distros, op mixes. 6 | Zsh Eg. to gen input numbers: (repeat 99 {rshuffle \$(echo {1..99})|tr ' ' \\n}) 7 | EOF 8 | exit 1 9 | fi 10 | DEL="-0"; ADD="+1" # FIFO order 11 | if [ "$1" = "-u" ]; then # unordered 12 | DEL="-"; ADD="+"; shift 1 13 | fi 14 | win="$1" # window is $*/$@ or $1 .. 
$$# 15 | set dummy; shift 1 # set w/no arg dumps defs which is unwanted 16 | 17 | seq=1 18 | while read a 19 | do 20 | if [ $seq -gt $win ]; then 21 | echo $DEL $1 # w/dups s sets path to 0-side 22 | shift 1 23 | fi 24 | echo $ADD $a $seq # w/dups i1 appends at 1-side 25 | set "$@" $a 26 | echo n0 $(($#/2)) # query uninterpolated moving median 27 | if [ -n "$xtra" ]; then 28 | echo $xtra 29 | fi 30 | seq=$((seq+1)) 31 | done 32 | -------------------------------------------------------------------------------- /tests/tSuperHigh.nim: -------------------------------------------------------------------------------- 1 | when not declared(addFloat): import std/[formatfloat, objectdollar] 2 | import metab, althash, times#, math 3 | # 0 1 2 3 4 5 6 7 8 9 4 | #tombstone methods get depths ~ @[2, 1, 1, 1, 1, 32, 174, 40, 2, 1] 5 | #That is "roughly" a counter example for Python probe sequence, but it would be 6 | #better to push a lot more past depth 7 since 7x slower maybe isn't so obvious. 7 | # 8 | #rehash/robinhood mitigations are not enough in `lpset`. What seems to work 9 | #well is defining below hash which then makes actual hash RoMu1(RoMu1(x)). 10 | # 11 | #proc hash(x: uint64): Hash = hashRoMu1(x) 12 | #proc hash(x: uint64): Hash = hashRoMu2(x) # good hash for these keys @any shift 13 | 14 | const shift = 56'u 15 | 16 | var one = initSet[uint64](256, robinhood=true, rehash=true, numer=10, denom=1) 17 | var t0 = epochTime() 18 | for i in 0'u ..< 234: 19 | one.incl uint64(i shl shift) 20 | echo (epochTime() - t0)*1e9/234.0, " ns/elt" 21 | echo one.len, "/", one.getCap, " = ", one.len.float/one.getCap.float 22 | echo "Depths:", one.depths 23 | echo "Stats: ", one.depthStats 24 | one.debugDump 25 | -------------------------------------------------------------------------------- /adix/cpuCT.nim: -------------------------------------------------------------------------------- 1 | ## gcc/clang error out if the generated C includes a tmmintrin.h header on CPUs 2 | ## without -march=enabling the instructions. An admittedly expensive staticExec 3 | ## lets us probe a build-time system for all pre-defined C preprocessor macros 4 | ## in one execution. We then postprocess these into a set of flags for Nim 5 | ## compile-time `when` checks to make "fall back" easy/natural. 6 | from std/strutils import contains 7 | 8 | const ccDumpMacro {.used.} = " -dM -E -x c - 0: {.passc: "-march=native".} 25 | -------------------------------------------------------------------------------- /tests/anaForum.nim: -------------------------------------------------------------------------------- 1 | # Condensed & generalized version of Nim Forum post. It's 2 | # here for reference but is 17x slower than anaPrime due 3 | # to: sortedSig, Tab[*,seq[string]], hasKey, no presize... 
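# (For contrast: anaPrime.nim keys on a product of per-letter primes instead of
# a sorted-string signature, mmaps the word file, and pre-sizes its LPTabz --
# addressing the differences listed above.)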
4 | 5 | when not declared(open): import std/syncio 6 | import strutils, algorithm, os, tables, unidecode 7 | 8 | proc signature(word: string): string = 9 | let ascii = unidecode(word).toLowerAscii 10 | let sorted_word = sorted(ascii, system.cmp) 11 | result = sorted_word.join() 12 | 13 | proc main = 14 | if paramCount() < 2: # Parse command line 15 | quit("Usage: anagram ") 16 | let lookup_word = paramStr(2) 17 | let lookup_signature = signature(lookup_word) 18 | echo "Looking up '", lookup_word, "'" 19 | 20 | var anagrams = initTable[string, seq[string]]() 21 | for word in open(paramStr(1)).lines(): 22 | let signature = signature(word) 23 | if anagrams.hasKey(signature): 24 | anagrams[signature].add(word) 25 | else: 26 | anagrams[signature] = @[word] 27 | 28 | if anagrams[lookup_signature].len == 1: 29 | echo "'", lookup_word, "' has no anagrams" 30 | else: 31 | echo anagrams[lookup_signature] 32 | 33 | main() 34 | -------------------------------------------------------------------------------- /tests/dtshell.nim: -------------------------------------------------------------------------------- 1 | when not declared(stdin): import std/[syncio, objectdollar] 2 | import metab, os, strutils 3 | 4 | proc main() = 5 | var lgsz = parseInt(getEnv("LGSZ", "2")) 6 | var nP0, nP1, nG0, nG1, nD0, nD1: int = 0 7 | if paramCount() > 1: 8 | echo("Usage:\n ", paramStr(0), "< [gpdTPD]K [..]") 9 | quit(1) 10 | var t = initTab[int8,int](1 shl lgsz, rehash=false) 11 | var had: bool 12 | for line in lines(stdin): # Dispatch operations 13 | let cols = line.split 14 | let c = cols[0][0] 15 | let k = int8(if cols[0].len > 1: parseInt(cols[0][1 .. ^1]) else: 0) 16 | let v = if cols.len > 1: parseInt(cols[1]) else: 0 17 | case c 18 | of 'g': 19 | if k in t: nG1.inc 20 | else : nG0.inc 21 | of 'p': 22 | discard t.mgetOrPut(k, v, had) 23 | if had: nP0.inc 24 | else : nP1.inc 25 | of 'a': 26 | t.add(k, v) 27 | of 'd': 28 | if t.missingOrExcl(k): nD0.inc 29 | else : nD1.inc 30 | of 'T': echo t 31 | of 'D': echo t.depths 32 | of 'P': t.debugDump 33 | else: echo "UNKNOWN COMMAND:", c.repr; quit 2 34 | echo "nP1: ", nP1, " nP0: ", nP0, " nG1: ", nG1, " nG0: ", nG0, 35 | " nD1: ", nD1, " nD0: ", nD0 36 | main() 37 | -------------------------------------------------------------------------------- /tests/tBadHash.nim: -------------------------------------------------------------------------------- 1 | when not declared(addFloat): import std/formatfloat 2 | import metab, althash, times 3 | 4 | let iniSz = 4 5 | #let iniSz = 1 shl 15 6 | #let iniSz = 1 shl 18 7 | #let iniSz = 1 shl 23 8 | when defined(startWithRehash): 9 | let rh = true 10 | else: 11 | let rh = false 12 | when defined(althash): 13 | proc hash(x: int): Hash = hashRoMu1(x) 14 | elif defined(althash2): 15 | proc hash(x: int): Hash = hashRevFib(x) 16 | 17 | echo "START... " 18 | var t0 = epochTime() 19 | 20 | const shift = 25 21 | 22 | var one = initSet[int](iniSz, rehash=rh) 23 | #for i in 0 ..< ((1 shl 15) - 1): one.incl i 24 | for i in 0 ..< ((1 shl 23) - 19): one.incl (i shl shift) 25 | echo epochTime() - t0, " seconds" 26 | var ds = one.depths 27 | echo ds 28 | echo "MAX DEPTH: ", ds.len 29 | echo one.len, "/", one.getCap 30 | echo "Stats: ", one.depthStats 31 | # one.debugDump 32 | 33 | echo "LOOKING UP... " 34 | for i in 0 ..< ((1 shl 23) - 19): 35 | if (i shl shift) notin one: echo i, " WAS MISSING" 36 | echo "CLONING... 
" 37 | 38 | t0 = epochTime() 39 | var two = initSet[int](iniSz, rehash=rh) 40 | for v in one: two.incl v 41 | echo epochTime() - t0, " seconds" 42 | ds = two.depths 43 | echo ds 44 | echo "MAX DEPTH: ", ds.len 45 | echo two.len, "/", two.getCap 46 | echo "Stats: ", two.depthStats 47 | # two.debugDump 48 | 49 | echo "DONE... " 50 | -------------------------------------------------------------------------------- /adix/memutil.nim: -------------------------------------------------------------------------------- 1 | when defined(robinHoodMoveMem): 2 | # This branch gets SEGV (with seq[T] vals); I have not yet tracked down why 3 | proc pushUp*[T](x: var seq[T], i, n: int) {.inline.} = 4 | ## move n items up 1; i.e. ``x[i+1..i+n] = x[i..i+n-1]`` 5 | # if n < 1: return 6 | moveMem x[i+1].addr, x[i].addr, n * T.sizeof 7 | 8 | proc pullDown*[T](x: var seq[T], i, n: int) {.inline.} = 9 | ## move n items down 1; i.e. ``x[i..i+n-1] = x[i+1..i+n]`` 10 | # if n < 1: return 11 | moveMem x[i].addr, x[i+1].addr, n * T.sizeof 12 | elif defined(robinHoodSlice): 13 | proc pushUp*[T](x: var seq[T], i, n: int) {.inline.} = 14 | ## move n items up 1; i.e. ``x[i+1..i+n] = x[i..i+n-1]`` 15 | # if n < 1: return 16 | x[i+1 .. i+n] = x[i .. i+n-1] 17 | 18 | proc pullDown*[T](x: var seq[T], i, n: int) {.inline.} = 19 | ## move n items down 1; i.e. ``x[i..i+n-1] = x[i+1..i+n]`` 20 | # if n < 1: return 21 | x[i .. i+n-1] = x[i+1 .. i+n] 22 | else: 23 | proc pushUp*[T](x: var seq[T], i, n: int) {.inline.} = 24 | ## move n items up 1; i.e. ``x[i+1..i+n] = x[i..i+n-1]`` 25 | # if n < 1: return 26 | for j in countdown(i + n - 1, i): 27 | x[j+1] = move x[j] 28 | 29 | proc pullDown*[T](x: var seq[T], i, n: int) {.inline.} = 30 | ## move n items down 1; i.e. ``x[i..i+n-1] = x[i+1..i+n]`` 31 | # if n < 1: return 32 | for j in countup(i, i + n - 1): 33 | x[j] = move x[j+1] 34 | -------------------------------------------------------------------------------- /tests/writeHash.nim: -------------------------------------------------------------------------------- 1 | # Just writes 8 byte binary hashes of 0..[0|1][[]] { [] denotes optionality }. 15 | ## This factoring allows shells to have negligible dispatch overhead/string 16 | ## handling and so be appropriate for benchmarks/timing experiments. 17 | let fi = if i == "/dev/stdin" : stdin else: open(i, fmRead) 18 | let fo = if o == "/dev/stdout": stdout else: open(o, fmWrite) 19 | var cout: Command 20 | for buf in lines(fi): 21 | if buf.len == 0 or buf[0] == '#': continue 22 | let cols = buf.split 23 | if cols.len < 1 or cols[0].len < 1: continue 24 | cout.letter = cols[0][0] 25 | cout.sided = cols[0].len > 1 26 | cout.side = if cout.sided and cols[0][1] == '1': true else: false 27 | cout.key = int16(if cols.len > 1: parseInt(cols[1]) else: -99) 28 | cout.val = int16(if cols.len > 2: parseInt(cols[2]) else: 0) 29 | discard fo.writeBuffer(cout.addr, cout.sizeof) 30 | fo.flushFile 31 | 32 | proc readObject*(f: File, buffer: pointer, size: Natural): int {.inline.} = 33 | proc c_fread(buf: pointer, size, n: culong, f: File): culong {. 
34 | importc: "fread_unlocked", header: "" .} 35 | result = int(c_fread(buffer, cast[culong](size), 1, f)) 36 | 37 | when isMainModule: 38 | import cligen; dispatch(preproc) 39 | -------------------------------------------------------------------------------- /tests/anaPrime.nim: -------------------------------------------------------------------------------- 1 | import strutils, times, lptabz, althash, cligen, cligen/[mfile, mslice, osUt] 2 | proc hash(x: uint64): Hash {.inline.} = hashRoMu1(x) # =~ 1.05x faster 3 | 4 | type Word = distinct uint32 # 24 bits of byte-offset, 8 bits of word length 5 | 6 | proc initWord(off, len: int): Word {.inline.} = 7 | Word(uint32(off) shl 8 or uint32(len)) 8 | 9 | proc toString(w: Word, mf: MFile): string {.inline.} = 10 | let off = uint32(w) shr 8 11 | let len = uint32(w) and 255 12 | result.setLen len 13 | copyMem result[0].addr, mf.mem +! off, len 14 | 15 | proc sig(word: MSlice): uint64 {.inline.} = # word signature 16 | const prime = [ #9/267751 oflow 17 | 7'u64, 61, 41, 53, 2, 71, 47, 29, 3, 97, 89, 17, 59, 18 | 19, 5, 31, 101, 11, 13, 23, 37, 79, 73, 67, 43, 83 ] 19 | result = 1'u64 20 | for ch in word: result *= prime[ord(ch) - ord('A')] 21 | 22 | proc getAna(dict="words", mf: MFile): LPTabz[uint64,Word,uint64,0] = 23 | try: result.mmap(findPathPattern(dict & '.')) 24 | except CatchableError: 25 | result.init(mf.len div 10, numer=3, denom=1) 26 | for word in mf.mSlices: 27 | result.add word.sig, initWord(word.mem -! mf.mem, word.len) 28 | result.save(dict) 29 | 30 | proc qry(dict="words", stats=false, query: seq[string]) = 31 | let t0 = getTime() 32 | if (let mf = mopen(dict); mf) != nil: 33 | let ana = dict.getAna(mf) 34 | let t1 = getTime() 35 | for word in query: 36 | let word = word.toUpperAscii 37 | let key = word.toMSlice.sig 38 | echo word, ":" 39 | for ana in ana.allValues(key): 40 | echo " ", ana.toString(mf) 41 | if stats: 42 | echo "Prep Time: ", (t1 - t0).inMicroseconds, " us" 43 | when compiles(ana.depths): 44 | echo "Depths: ", ana.depths # hash table perf 45 | echo "FinalTable: ", ana.len, "/", ana.getCap 46 | mf.close 47 | 48 | when isMainModule: dispatch qry 49 | -------------------------------------------------------------------------------- /tests/sshell.nim: -------------------------------------------------------------------------------- 1 | when not declared(stdin): import std/[syncio, objectdollar] 2 | import metab, strutils, os, times 3 | 4 | proc now(): int64 {.inline.} = cast[int64](epochTime() * 1e9) 5 | 6 | proc main() = 7 | var verb = getEnv("VERB", "xyzpdq") != "xyzpdq" 8 | var nLoop0, nLoop1, nP0, nP1, nG0, nG1, nD0, nD1: int = 0 9 | var t0, t1, tL0, tL1: int64 10 | var stopped = false 11 | var op: seq[char] = @[] # operation & Key sequences 12 | when defined(directIndex): 13 | var size = parseInt(getEnv("SIZE", "0")) 14 | var ky: seq[int8] = @[] 15 | else: 16 | var size = parseInt(getEnv("SIZE", "2")) 17 | var ky: seq[int] = @[] 18 | var inp: string = stdin.readAll # Pre-read+parse to not time that 19 | inp.setLen inp.len - 1 # Chop last nl; .. is inclusive 20 | for line in inp.split('\n'): 21 | op.add line[0] 22 | ky.add(typeof(ky[0])(if line.len > 1: parseInt(line[1 .. 
^1]) else: 0)) 23 | if op.len < 1 or paramCount() > 1: 24 | echo("Usage:\n ", paramStr(0), "< [gpdTZzLl.]K [..]") 25 | quit(1) 26 | var s = initSet[typeof(ky[0])](size, rehash=false, numer=3, denom=1) 27 | t0 = now() 28 | for i in 0 ..< ky.len: # Dispatch operations 29 | let c = op[i] 30 | let k = ky[i] 31 | if verb: echo c, k # Verb mode helpful to trap bugs 32 | case c 33 | of 'a': s.add k 34 | of 'g': (if k in s: nG1.inc else: nG0.inc) 35 | of 'p': (if s.containsOrIncl(k): nP0.inc else: nP1.inc) 36 | of 'd': (if s.missingOrExcl(k): nD0.inc else: nD1.inc) 37 | of '-': 38 | if k == 0: discard s.pop() 39 | else: (var kk = k; discard s.pop(kk)) 40 | of 'T': echo s 41 | of 'Z': t0 = now(); nP0 = 0; nP1 = 0; nG0 = 0; nG1 = 0; nD0 = 0; nD1 = 0 42 | of 'z': t1 = now(); stopped = true 43 | of 'L': tL0 = now(); nLoop0 = i 44 | of 'l': tL1 = now(); nLoop1 = i 45 | of '.': discard # Just to time op dispatch ovrhead 46 | of 'P': s.debugDump 47 | of 'D': echo s.depths 48 | else: echo "UNKNOWN COMMAND:", c.repr; quit 2 49 | if not stopped: t1 = now() 50 | t1 -= t0 51 | if nLoop1 - nLoop0 > 0: 52 | var perDispatch = float(tL1 - tL0) / float(nLoop1 - nLoop0) 53 | t1 -= int64(float(nP0 + nP1 + nG0 + nG1 + nD0 + nD1) * perDispatch) 54 | echo "nP1: ", nP1, " nP0: ", nP0, " nG1: ", nG1, " nG0: ", nG0, 55 | " nD1: ", nD1, " nD0: ", nD0 56 | main() 57 | -------------------------------------------------------------------------------- /tests/kmCmp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # nim-pgo build of: wu lfreq wfr wf 3 | export I=/dev/shm/ToTC # totc=Tale Of Two Cities from Project Gutenberg 4 | export i=/dev/shm/totc # $I pre-processed to lower case via tr A-Z a-z 5 | export n=/dev/null # totc=Tale Of Two Cities from Project Gutenberg 6 | tim 'wu<$I>$n' \ 7 | \ 8 | "tr -cs A-Za-z \\\\n<$I|tr A-Z a-z|mawk '{cnt[\$0]++}'>$n" \ 9 | 'tr -cs A-Za-z \\n<$I|tr A-Z a-z|lfreq -n-1' \ 10 | \ 11 | 'wfr -n-1 <$I' \ 12 | \ 13 | 'wf -n-1 -j1 <$I' \ 14 | 'wf -n-1 -j2 <$I' \ 15 | 'wf -n-1 -j3 <$I' \ 16 | 'wf -n-1 -j4 <$I' \ 17 | \ 18 | 'wf -n-1 -j1 <$i' \ 19 | 'wf -n-1 -j2 <$i' \ 20 | 'wf -n-1 -j3 <$i' \ 21 | 'wf -n-1 -j4 <$i' 22 | 23 | # (5.693 +- 0.031)e-03 wu<$I>$n # Simpler - less L1 CPU Cache pressure 24 | # 25 | # Now all the same - accumulate counts, but do no processing after that (this 26 | # is the -n-1 trick in the Nim programs): 27 | # 0.010905 +- 0.000016 tr -cs A-Za-z \\n<$I|tr A-Z a-z|mawk '{cnt[$0]++}'>$n 28 | # (5.840 +- 0.019)e-03 tr -cs A-Za-z \\n<$I|tr A-Z a-z|lfreq -n-1 29 | # 30 | # (5.9686 +- 0.0055)e-03 wfr -n-1 <$I 31 | # 32 | # (4.9756 +- 0.0009)e-03 wf -n-1 -j1 <$I 33 | # (3.842 +- 0.013)e-03 wf -n-1 -j2 <$I 34 | # (3.760 +- 0.020)e-03 wf -n-1 -j3 <$I 35 | # (3.75 +- 0.10)e-03 wf -n-1 -j4 <$I # No point in higher j 36 | # 37 | # These "cheat" by using already lower-cased $i as input, to measure the 38 | # impact of MAP_PRIVATE trick. 39 | # (4.436 +- 0.013)e-03 wf -n-1 -j1 <$i 40 | # (3.3724 +- 0.0031)e-03 wf -n-1 -j2 <$i 41 | # (3.297 +- 0.011)e-03 wf -n-1 -j3 <$i 42 | # (3.2845 +- 0.0071)e-03 wf -n-1 -j4 <$i # No point in higher j 43 | # 44 | # So, basically, `mawk` is not that bad (~2X worse than `lfreq`), mmap input 45 | # is 5.9686/4.9756=1.2X faster, MAP_PRIVATE costs about 1.14X, and almost all 46 | # parallel speed-up comes from the first doubling of L1 storage. 
47 | # 48 | # Also of interest is the approximate algorithm using `bu/oft`: 49 | # 0.01674 +- 0.00017 tr -cs A-Za-z \\n<$I|tr A-Z a-z|oft 1 >$n 50 | # Results match exactly on Tale Of Two Cities for top 12. While `oft` uses much 51 | # less memory, 17ms is also (2.866 +- 0.031)X slower than `lfreq`. As mentioned 52 | # in README.md, many sketches need VERY steep space cliffs to pay off in time. 53 | # Aggressive `oft -e0.1 -c0.5` severely degrades matches for only 1.3X speed-up. 54 | # 55 | # In conclusion, advice for vocabulary analysis is use `lfreq` w/preprocessing 56 | # { A) you won't do > ~2X better & B) definition of "word" is likely unstable / 57 | # context-specific; Preproc preserves flex for B without losing much since A. } 58 | -------------------------------------------------------------------------------- /tests/ucl.nim: -------------------------------------------------------------------------------- 1 | when not declared(stdin): import std/[syncio, formatfloat] 2 | import std/[hashes, times], cligen, cligen/[mslice, osUt], adix/oats 3 | 4 | const bLen {.intdefine.} = 8 # <256 long; RT limits nicer but harder 5 | const bOff {.intdefine.} = 24 # <16MiB UNIQUE line data 6 | type 7 | Count {.packed.} = object # Dense-ish hash Count type 8 | when defined hashCache: hc: uint32 # 4B|8B per cell 9 | len {.bitsize: bLen.}: uint32 10 | off {.bitsize: bOff.}: uint32 11 | Counts = object 12 | dat: seq[Count] 13 | nUsed: int 14 | 15 | var s = " "; oatKStack s, Counts, Count, off,uint32, MSlice, MSlice 16 | proc key(c: Counts, i: int): MSlice = c.dat[i].key 17 | proc used(c: Counts, i: int): bool = c.dat[i].off!=0 18 | when defined hashCache: # def auto-triggers use 19 | proc hash(ms: MSlice): Hash = mslice.hash(ms).uint32.Hash 20 | proc hash(c: var Counts, i: int, hc: Hash) {.used.} = c.dat[i].hc = hc.uint32 21 | proc hash(c: Counts, i: int): Hash = c.dat[i].hc.Hash 22 | oatCounted c,Counts, c.nUsed; oatSeq Counts, dat # make counted & resizable 23 | #when Counts is ROat[MSlice, MSlice]: {.warning: "Counts is a ROat"} 24 | 25 | proc incFailed(h: var Counts, ms: MSlice): bool = 26 | var ms = ms 27 | if ms.len > (1 shl bLen) - 1: # Careful to not overflow XXX rate limit msgs 28 | erru "truncating too long (", $ms.len, ") line: ", ($ms)[0..<256], "...\n" 29 | ms.len = (1 shl bLen) - 1 # Truncation makes count potentially off 30 | h.upSert(ms, i): discard # Found key @i: nothing to do 31 | do: # Novel key->i: 32 | h.dat[i].off = s.add(ms, (1 shl bOff) - 1): 33 | erru "unique word data overflow at:",$ms,"\n" #XXX rate limit msgs 34 | return true # Cannot go on GLOBALLY 35 | h.dat[i].len = ms.len.uint32 # Init 36 | 37 | proc ucl(size=9999, dSize=81920, tm=false) = 38 | ## Count unique & total lines on `stdin`. <256B long; <16 MiB unique data. 
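  ## (Hypothetical invocation via the cligen dispatch below; long option names
  ## come from the parameter names: `ucl --size=300000 --tm < lines.txt`.)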
39 | let t0 = if tm: epochTime() else: 0.0 40 | var h: Counts; h.setCap size # pre-size table & data 41 | s.setLen dSize; s.setLen 1 42 | var nTot = 0 43 | block IO: 44 | for (line, nLine) in stdin.getDelims: 45 | let ms = MSlice(mem: line, len: nLine - 1) 46 | inc nTot # Always bump `nTotal` 47 | if h.incFailed(ms): break IO 48 | echo h.len," unique ",nTot," total ",s.len," B" 49 | if tm: stderr.write epochTime() - t0, "\n" 50 | 51 | when isMainModule: dispatch ucl, help={ 52 | "size" : "pre-size hash table for size slots", 53 | "dSize": "pre-size str data area to this many bytes", 54 | "tm" : "emit wall time of counting to stderr & quit"} 55 | -------------------------------------------------------------------------------- /tests/tshell.nim: -------------------------------------------------------------------------------- 1 | when not declared(stdin): import std/[syncio, objectdollar, formatfloat] 2 | import metab, strutils, os, times 3 | 4 | proc now(): int64 {.inline.} = cast[int64](epochTime() * 1e9) 5 | 6 | proc main() = 7 | var verb = getEnv("VERB", "xyzpdq") != "xyzpdq" 8 | var nLoop0, nLoop1, nP0, nP1, nG0, nG1, nD0, nD1: int = 0 9 | var t0, t1, tL0, tL1: int64 10 | var stopped = false 11 | var op: seq[char] = @[] # operation, Key, Val seqs 12 | when defined(directIndex): 13 | var size = parseInt(getEnv("SIZE", "0")) 14 | var ky: seq[int8] = @[] 15 | else: 16 | var size = parseInt(getEnv("SIZE", "2")) 17 | var ky: seq[int] = @[] 18 | var vl: seq[int] = @[] 19 | var inp: string = stdin.readAll # Pre-read+parse to not time that 20 | inp.setLen inp.len - 1 # Chop last nl; .. is inclusive 21 | for line in inp.split('\n'): 22 | let cols = line.split 23 | op.add cols[0][0] 24 | ky.add(typeof(ky[0])(if cols[0].len > 1: parseInt(cols[0][1..^1]) else: 0)) 25 | vl.add(if cols.len > 1: parseInt(cols[1]) else: 0) 26 | if op.len < 1 or paramCount() > 1: 27 | echo "Usage:\n ", paramStr(0), "< [gpdaTZzLl.PD]K V" 28 | quit 1 29 | var t = initTab[typeof(ky[0]), int](size, rehash=false) 30 | var had: bool 31 | t0 = now() 32 | for i in 0 ..< ky.len: # Dispatch operations 33 | let c = op[i] 34 | let k = ky[i] 35 | let v = vl[i] 36 | if verb: echo c, k, " ", v # Verb mode helpful to trap bugs 37 | case c 38 | of 'g': (if k in t: nG1.inc else: nG0.inc) 39 | of 'p': 40 | discard t.mgetOrPut(k, v, had) 41 | if had: nP1.inc else: nP0.inc 42 | of 'd': 43 | if t.missingOrExcl(k): nD0.inc 44 | else : nD1.inc 45 | of '-': 46 | if k == 0: discard t.pop() 47 | else: (var kk = k; var vv = v; discard t.pop(kk, vv)) 48 | of 'a': t.add(k, v) 49 | of 'T': echo t 50 | of 'Z': t0 = now(); nP0 = 0; nP1 = 0; nG0 = 0; nG1 = 0; nD0 = 0; nD1 = 0 51 | of 'z': t1 = now(); stopped = true 52 | of 'L': tL0 = now(); nLoop0 = i 53 | of 'l': tL1 = now(); nLoop1 = i 54 | of '.': discard # Just to time op dispatch ovrhead 55 | of 'P': t.debugDump 56 | of 'D': echo t.depths 57 | else: echo "UNKNOWN COMMAND:", c.repr; quit 2 58 | if not stopped: t1 = now() 59 | t1 -= t0 60 | if nLoop1 - nLoop0 > 0: 61 | var perDispatch = float(tL1 - tL0) / float(nLoop1 - nLoop0) 62 | t1 -= int64(float(nP0 + nP1 + nG0 + nG1 + nD0 + nD1) * perDispatch) 63 | echo "ns: ", t1, " nP1: ", nP1, " nP0: ", nP0, " nG1: ", nG1, " nG0: ", nG0, 64 | " nD1: ", nD1, " nD0: ", nD0, " a: ", t.len.float / t.getCap.float, 65 | " M: ", t.getCap 66 | main() 67 | -------------------------------------------------------------------------------- /tests/wu.nim: -------------------------------------------------------------------------------- 1 | when not declared(stdin): import 
std/[syncio, formatfloat] 2 | import std/[hashes, times], cligen, cligen/[mslice, osUt], adix/oats 3 | 4 | const bLen {.intdefine.} = 5 # <32B long; RT params better but less easy 5 | const bOff {.intdefine.} = 27 # <128MiB UNIQUE word data 6 | type 7 | Count {.packed.} = object # Dense-ish hash Count type 8 | when defined hashCache: hc: uint32 # 4B|8B per cell 9 | len {.bitsize: bLen.}: uint8 10 | off {.bitsize: bOff.}: uint32 11 | Counts = object 12 | dat: seq[Count] 13 | nUsed: int 14 | 15 | var s: string; oatKStack s, Counts, Count, off,uint32, MSlice, MSlice 16 | proc key(c: Counts, i: int): MSlice = c.dat[i].key 17 | proc used(c: Counts, i: int): bool = c.dat[i].len != 0 18 | when defined hashCache: # 2nd def triggers saving lpt behavior 19 | proc hash(ms: MSlice): Hash = mslice.hash(ms).uint32.Hash 20 | proc hash(c: var Counts, i: int, hc: Hash) {.used.} = c.dat[i].hc = hc.uint32 21 | proc hash(c: Counts, i: int): Hash = c.dat[i].hc.Hash 22 | oatCounted c,Counts, c.nUsed; oatSeq Counts, dat # make counted & resizable 23 | #when Counts is ROat[MSlice, MSlice]: {.warning: "Counts is a ROat"} 24 | 25 | proc incFailed(h: var Counts, ms: MSlice): bool = 26 | var ms = ms 27 | if ms.len > (1 shl bLen) - 1: # Careful to not overflow XXX rate limit msgs 28 | erru "truncating too long (", $ms.len, ") word: ", ($ms)[0..<32], "...\n" 29 | ms.len = (1 shl bLen) - 1 30 | h.upSert(ms, i): discard # Found key @i: 31 | do: # Novel key->i: 32 | h.dat[i].off = s.add(ms, (1 shl bOff) - 1): 33 | erru "unique word data overflow at:",$ms,"\n" #XXX rate limit 34 | return true # Cannot go on GLOBALLY 35 | h.dat[i].len = ms.len.uint8 # Init 36 | 37 | const d = " \t\r,;:.?!'\"()[]{}|<>=+-*/\\0123456789&`~$#%^" 38 | proc wu(size=9999,dSize=81920, tm=false, Dlm="") = 39 | ## Count unique & total words on `stdin`. <32B long; <128 MiB unique data. 40 | let sep = initSep(if Dlm.len != 0: Dlm else: d) 41 | let t0 = if tm: epochTime() else: 0.0 42 | var h: Counts; h.setCap size # pre-size table & data 43 | s.setLen dSize; s.setLen 0 44 | var nTot = 0 45 | block IO: 46 | for (line, nLine) in stdin.getDelims: 47 | for tok in MSlice(mem: line, len: nLine - 1).frame(sep): 48 | if not tok.isSep and tok.ms.len > 0: 49 | inc nTot # Always bump `nTotal` 50 | if h.incFailed(tok.ms): break IO 51 | echo h.len," unique ",nTot," total ",s.len," B" 52 | if tm: stderr.write epochTime() - t0, "\n" 53 | 54 | when isMainModule: dispatch wu, help={ 55 | "size" : "pre-size hash table for size unique entries", 56 | "dSize": "pre-size str data area to this many bytes", 57 | "tm" : "emit wall time of counting to stderr & quit", 58 | "Dlm":"""chars by which words inside lines are delimited 59 | ""=>SPC,;:.?!'"()[]{}|<>=+-\*/\\0123456789&`~$#%^"""} 60 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | #schedule: 4 | # - cron: '30 5 * * *' 5 | push: 6 | branches: 7 | - master 8 | pull_request: 9 | branches: 10 | - '*' 11 | jobs: 12 | changes: 13 | if: github.event_name != 'schedule' # Do not want to skip scheduled runs 14 | continue-on-error: true # Ensure errors don't stop us 15 | runs-on: ubuntu-latest 16 | outputs: 17 | src: ${{ steps.filter.outputs.src }} 18 | steps: 19 | - if: github.event_name != 'pull_request' # Github API path filter check=> 20 | name: Checkout (if not PR) #..No need to checkout. 
21 | uses: actions/checkout@v2 22 | - uses: dorny/paths-filter@v2 23 | id: filter 24 | with: 25 | filters: | 26 | src: 27 | - '**.cfg' 28 | - '**.nims' 29 | - '**.nim' 30 | - '**.nimble' 31 | - 'tests/**' 32 | - '.github/workflows/ci.yml' 33 | build: 34 | needs: changes # Build if cared-about files changed 35 | # always() is needed here for the job to always run despite Github docs. 36 | # See: https://github.com/actions/runner/issues/491 37 | if: always() && needs.changes.outputs.src != 'false' 38 | strategy: 39 | fail-fast: false 40 | matrix: 41 | os: ['ubuntu-latest'] 42 | nim: ['version-2-0'] 43 | name: '${{ matrix.os }} (${{ matrix.nim }})' 44 | runs-on: ${{ matrix.os }} 45 | steps: 46 | - name: Checkout 47 | uses: actions/checkout@v2 48 | with: 49 | path: ci 50 | - name: Setup Nim 51 | uses: alaviss/setup-nim@0.1.1 52 | with: 53 | path: nim 54 | version: ${{ matrix.nim }} 55 | - name: Build docs 56 | if: ${{ matrix.docs }} == 'true' 57 | shell: bash 58 | run: | 59 | cd ci 60 | branch=${{ github.ref }} 61 | branch=${branch##*/} 62 | nimble doc --project --outdir:docs --path="." \ 63 | '--git.url:https://github.com/${{ github.repository }}' \ 64 | '--git.commit:${{ github.sha }}' \ 65 | "--git.devel:$branch" \ 66 | adix.nim 67 | cp docs/{the,}index.html || true # Ignore failures for older Nim 68 | - name: Publish docs 69 | if: > 70 | github.event_name == 'push' && github.ref == 'refs/heads/master' && 71 | matrix.os == 'ubuntu-latest' && matrix.nim == 'version-2-0' 72 | uses: crazy-max/ghaction-github-pages@v2.5.0 73 | with: 74 | build_dir: ci/docs 75 | env: 76 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 77 | success: # Set check-required on this 78 | needs: build 79 | runs-on: ubuntu-latest 80 | name: 'All check passes' 81 | steps: 82 | - run: | 83 | echo "This is a workaround for Github's broken software" 84 | -------------------------------------------------------------------------------- /adix/metab.nim: -------------------------------------------------------------------------------- 1 | ## This module provides an easy way to do compile-time switched impl swaps for 2 | ## various table/set reprs with various compile-time switched defaults. You 3 | ## should really just learn how to use `LPTabz[..]` directly, though. 4 | import core/macros, std/strformat 5 | 6 | when defined(axRehash): 7 | let rDefault = true 8 | else: 9 | let rDefault = false 10 | 11 | when defined(axRobinHood): 12 | let rhDefault = true 13 | else: 14 | let rhDefault = false 15 | 16 | proc rightSz*(x: Natural): int {.inline,deprecated: "Only identity now".} = x 17 | 18 | macro doAlias(ns: string, root: string, tabP: string, setP: string) = 19 | let inline = "{.inline.}" 20 | parseStmt(&""" 21 | type Tab*[K,V] = {root}{tabP} 22 | 23 | proc initTab*[K,V](sz={ns}InitialSize, numer={ns}Numer, denom={ns}Denom, 24 | minFree={ns}MinFree, growPow2={ns}GrowPow2, rehash=rDefault, 25 | robinHood=rhDefault): Tab[K,V] {inline} = 26 | result.init(sz, numer, denom, minFree, growPow2, rehash, robinHood) 27 | 28 | proc toTab*[K,V](pairs: openArray[(K,V)], dups=false): Tab[K,V] = 29 | result.init pairs.len # calling to{root}{tabp}(pairs, dups) fails; mixin? 
30 | if dups: 31 | for k, v in items(pairs): result.add(k, v) 32 | else: 33 | for k, v in items(pairs): result[k] = v 34 | 35 | type Set*[K] = {root}{setP} 36 | 37 | proc initSet*[K](sz={ns}InitialSize, numer={ns}Numer, denom={ns}Denom, 38 | minFree={ns}MinFree, growPow2={ns}GrowPow2, rehash=rDefault, 39 | robinHood=rhDefault): Set[K] {inline} = 40 | result.init(sz, numer, denom, minFree, growPow2, rehash, robinHood) 41 | 42 | proc toSet*[K](keys: openArray[K], dups=false): Set[K] = 43 | result.init keys.len # calling to{root}{tabp}(pairs, dups) fails; mixin? 44 | if dups: 45 | for k in keys: result.add k 46 | else: 47 | for k in keys: result.incl k""") 48 | 49 | when defined(axStdlib): #NOTE: stdlib version cannot ctrl, eg. `initialSize` 50 | import std/[tables, sets] # when client just declares `var x: Tab`. 51 | export tables, sets 52 | type Tab*[K,V] = Table[K,V] 53 | type Set*[K] = HashSet[K] 54 | proc initTab*[K,V](sz=4, numer=1, denom=1, minFree=1, growPow2=1, 55 | rehash=false, robinHood=false): Tab[K,V] {.inline.} = 56 | initTable[K,V](sz) 57 | proc initSet*[K](sz=4, numer=1, denom=1, minFree=1, growPow2=1, rehash=false, 58 | robinHood=false): Set[K] {.inline.} = 59 | initHashSet[K](sz) 60 | elif defined(axDirect): 61 | import adix/ditab 62 | export ditab 63 | doAlias("di", "DITab", "[K,V]", "[K,void]") 64 | elif defined(axInOrder): 65 | import adix/lptabz # Extra generic params are void|not|order sentinel flag, 66 | export lptabz #..then z|num bits for a hash code in the index part. 67 | type InsOrd = distinct int8 # 8 bits blocks most dbl indirections on misses 68 | doAlias("lp", "LPTabz", "[K,V,InsOrd,8]", "[K,void,InsOrd,8]") 69 | else: 70 | import adix/lptabz # Extra generic params here are void|not sentinel flag, z. 71 | export lptabz 72 | when defined(axIntTab0): 73 | doAlias("lp", "LPTabz", "[K,V,K,0]", "[K,void,K,0]") 74 | elif defined(axIntTabM1): 75 | doAlias("lp", "LPTabz", "[K,V,K,-1]", "[K,void,K,-1]") 76 | else: 77 | doAlias("lp", "LPTabz", "[K,V,void,0]", "[K,void,void,0]") 78 | -------------------------------------------------------------------------------- /adix/ways.nim: -------------------------------------------------------------------------------- 1 | ## To paraphrase Mandalorians: "These are the ways" (various algorithms). 2 | import std/heapqueue 3 | 4 | iterator kWayMerge*[T](itrs: openArray[iterator(): T]): T = 5 | ## k-way merge of ordered `itrs[i]` yields using `std/heapqueue`. 6 | if itrs.len > 1: 7 | type HeapItem = (T, int) 8 | var hq = initHeapQueue[HeapItem]() 9 | for i, it in itrs: # Load min-heap with the first yield of each. 10 | let vNext = it() # Must call for system to know exhaustion 11 | if not it.finished: #..but want if-guard before push to avoid 12 | hq.push (vNext, i) #..having exhausted iterators in the heap. 13 | while hq.len > 0: # While heap is not empty: 14 | let (v, i) = hq.pop # get & yield the next min. 15 | yield v 16 | let it = itrs[i] # push next item from just yielded.. 
17 | let vNext = it() 18 | if not it.finished: # ..(unless it's exhausted) 19 | hq.push (vNext, i) 20 | elif itrs.len == 1: # special case of only 1 (elif=>or 0) iter 21 | for v in itrs[0](): yield v 22 | 23 | when isMainModule: 24 | iterator i0: int {.closure.} = discard 25 | iterator i1: int {.closure.} = yield 3 26 | iterator i2: int {.closure.} = yield 1; yield 5 27 | iterator i3: int {.closure.} = yield 2; yield 4; yield 6 28 | for i in [i0, i1, i2, i3].kWayMerge: echo i 29 | 30 | iterator succPairs*[T](src: iterator:T; stride=1): (T, T) = 31 | ## Yield successive pairs (src[i - stride], src[i]) for all valid i 32 | var counter = stride # Whether to act|wait 33 | var it0: T # Running reference value 34 | var haveOne = false # Flag indicating we have above 35 | for it in src(): 36 | if haveOne: # We are waiting 37 | dec counter 38 | if counter == 0: # Waited right amount 39 | yield (it0, it) 40 | it0 = it 41 | counter = stride 42 | else: # Transition -> waiting 43 | it0 = it #..with a held reference val 44 | haveOne = true 45 | 46 | iterator diffs*[T](src: iterator:T; stride=1): T = 47 | ## First differences 48 | for x0, x in succPairs(src, stride): 49 | yield x - x0 50 | 51 | iterator diffs2*[T](src: iterator:T; stride=1): T = 52 | ## Second differences 53 | proc diffs1: iterator: T = 54 | iterator: T = 55 | for it in diffs(src): yield it 56 | for x in diffs(diffs1(), stride): yield x 57 | 58 | iterator ratios*[T](src: iterator:T; stride=1): T = 59 | ## First ratios 60 | for x0, x in succPairs(src, stride): 61 | yield x/x0 # guard w/if x0 != 0? 62 | 63 | iterator returns*[T](src: iterator:T; stride=1): T = 64 | ## Arithmetic returns (of, e.g. prices) 65 | for x0, x in succPairs(src, stride): 66 | yield x/x0 - 1 67 | 68 | proc seqItems[T](src: seq[T]): iterator:T = # Cannot be openArray since.. 69 | iterator:T = (for x in src: yield x) #..that can live on the stack. 70 | 71 | proc diffs*[T](src: seq[T]; stride=1): seq[T] = 72 | ## Batch first differences of random-access `src` (vectorizable). 73 | # when T is uint8: .. # To vectorize must at least.. 74 | # when T is float32: .. #..fan-out based on `T`. 75 | for d in diffs(src.seqItems, stride): result.add d # slow for now 76 | 77 | when isMainModule: 78 | iterator nums: float {.closure.} = (for i in 1..9: yield i.float) 79 | for d in diffs(nums): echo d # 1 yields 2-1, 3-2, 4-3, .. = 9 1s 80 | for d in diffs2(nums): echo d # 2 yields 1-1, 1-1, 1-1, .. = 8 0s 81 | for r in ratios(nums): echo r # 3 yields 2/1,3/2,4/3, .. 82 | for r in returns(nums): echo r # 4 yields 2/1-1,3/2-1,4/3-1, .. 
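  # (Added demo of the building block itself, default stride=1:)
  for a, b in succPairs(nums): echo a, " ", b # 8 pairs: (1,2) (2,3) .. (8,9)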
83 | let x = [1, 2, 3, 4, 5, 6, 7, 8, 9]; echo diffs(@x) # AOT 1 84 | -------------------------------------------------------------------------------- /tests/wfr.nim: -------------------------------------------------------------------------------- 1 | when not declared(stdin): import std/[syncio, formatfloat] 2 | import std/[hashes, times], cligen, cligen/[mslice, osUt], adix/oats 3 | 4 | const bLen {.intdefine.} = 5 # <32B long; RT params better but less easy 5 | const bOff {.intdefine.} = 27 # <128MiB UNIQUE word data 6 | const bCnt {.intdefine.} = 32 # <4 GiCount 7 | type 8 | Count {.packed.} = object # Dense-ish hash Count type 9 | when defined hashCache: hc: uint32 # 8B|12B per cell 10 | len {.bitsize: bLen.}: uint8 11 | off {.bitsize: bOff.}: uint32 12 | cnt {.bitsize: bCnt.}: uint32 13 | Counts = object 14 | dat: seq[Count] 15 | nUsed: int 16 | 17 | var s: string; oatKStack s, Counts, Count, off,uint32, MSlice, MSlice 18 | proc key(c: Counts, i: int): MSlice = c.dat[i].key 19 | proc val(c: var Counts, i: int, v: uint32) {.used.} = c.dat[i].cnt = v 20 | proc val(c: Counts, i: int): uint32 = c.dat[i].cnt 21 | proc used(c: Counts, i: int): bool = c.dat[i].len != 0 22 | when defined hashCache: # 2nd def triggers saving lpt behavior 23 | proc hash(ms: MSlice): Hash = mslice.hash(ms).uint32.Hash 24 | proc hash(c: var Counts, i: int, hc: Hash) {.used.} = c.dat[i].hc = hc.uint32 25 | proc hash(c: Counts, i: int): Hash = c.dat[i].hc.Hash 26 | oatCounted c,Counts, c.nUsed; oatSeq Counts, dat # make counted & resizable 27 | #when Counts is VROat[MSlice, MSlice, uint32]: {.warning: "Counts is a VROat"} 28 | 29 | proc incFailed(h: var Counts, ms: MSlice): bool = 30 | var ms = ms 31 | if ms.len > (1 shl bLen) - 1: # Careful to not overflow XXX rate limit msgs 32 | erru "truncating too long (", $ms.len, ") word: ", ($ms)[0..<32], "...\n" 33 | ms.len = (1 shl bLen) - 1 # Truncation makes count potentially off 34 | h.upSert(ms, i): # Found key @i: 35 | if h.dat[i].cnt == (1 shl bCnt) - 1: 36 | erru "counter overflow for: ",$ms,"\n" # no update XXX rate limit 37 | else: h.dat[i].cnt.inc # bump 38 | do: # Novel key->i: 39 | h.dat[i].off = s.add(ms, (1 shl bOff) - 1): 40 | erru "unique word data overflow at:",$ms,"\n" #XXX rate limit 41 | return true # Cannot go on GLOBALLY 42 | h.dat[i].len = ms.len.uint8 # Init 43 | h.dat[i].cnt = 1u32 44 | 45 | const d = " \t\r,;:.?!'\"()[]{}|<>=+-*/\\0123456789&`~$#%^" 46 | proc wfr(n=10, count=false,Norm=false, size=9999,dSize=81920, tm=false, Dlm="")= 47 | ## Histogram words on `stdin`. <128 MiB unique data; <32B long; <4 GiCount. 
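  ## (e.g. `wfr -n-1 <$I` as driven by tests/kmCmp.sh above, which benchmarks
  ## the counting pass alone since `n = -1` emits nothing.)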
48 | let sep = initSep(if Dlm.len != 0: Dlm else: d) 49 | let t0 = if tm: epochTime() else: 0.0 50 | var h: Counts; h.setCap size # pre-size table & data 51 | s.setLen dSize; s.setLen 0 52 | var nTot = 0 53 | block IO: 54 | for (line, nLine) in stdin.getDelims: 55 | for tok in MSlice(mem: line, len: nLine - 1).frame(sep): 56 | if not tok.isSep and tok.ms.len > 0: 57 | inc nTot # Always bump `nTotal` 58 | if h.incFailed(tok.ms): break IO 59 | if count: echo h.len," unique ",nTot," total ",s.len," B" 60 | template output = 61 | if Norm: outu c.float/nTot.float," ",k,"\n" else: outu c," ",k,"\n" 62 | if n == 0: (for (k, c) in pairs(h): output()) 63 | elif n > 0 : (for (k, c) in topByVal[MSlice, MSlice, uint32](h, n): output()) 64 | elif n < -1: (for (k, c) in topByVal[MSlice, MSlice, uint32](h, -n, order=Descending): output()) 65 | if tm: stderr.write epochTime() - t0, "\n" 66 | 67 | when isMainModule: dispatch wfr, help={ 68 | "n" : "emit `n`-most common lines(0:all; <0 sorted)", 69 | "count": "only emit counts: unique & grand total", 70 | "Norm" : "normalize frequencies by dividing by grand tot", 71 | "size" : "pre-size hash table for size unique entries", 72 | "dSize": "pre-size str data area to this many bytes", 73 | "tm" : "emit wall time of counting to stderr & quit", 74 | "Dlm":"""chars by which words inside lines are delimited 75 | ""=>SPC,;:.?!'"()[]{}|<>=+-\*/\\0123456789&`~$#%^"""} 76 | -------------------------------------------------------------------------------- /adix/bitop.nim: -------------------------------------------------------------------------------- 1 | ## This is a reimplementation of some things we need from bitops which has CT 2 | ## trouble due to importc's. (I feel it's a better naming/factoring, too). 3 | 4 | proc `&=`*[T,U](a: var T, b: U) = a = a and b ## Updating bit-wise `and` 5 | proc `|=`*[T,U](a: var T, b: U) = a = a or b ## Updating bit-wise `or` 6 | proc `^=`*[T,U](a: var T, b: U) = a = a xor b ## Updating bit-wise `xor` 7 | proc `<<=`*[T,U](a: var T, b: U) = a = a shl b ## Updating bit-wise `shl` 8 | proc `>>=`*[T,U](a: var T, b: U) = a = a shr b ## Updating bit-wise `shr` 9 | 10 | proc ceilPow2*(x: int): int {.noSideEffect, inline.} = 11 | ## Returns ``x`` rounded up to the nearest power of two. <= 0 get 1. 12 | result = x - 1 13 | when defined(cpu64): 14 | result |= result shr 32 15 | when sizeof(int) > 2: 16 | result |= result shr 16 17 | result |= result shr 8 18 | result |= result shr 4 19 | result |= result shr 2 20 | result |= result shr 1 21 | result += 1 + ord(x <= 0) 22 | 23 | proc floorPow2*(x: int): int {.noSideEffect, inline.} = 24 | ## Returns ``x`` rounded down to the nearest power of two. 25 | result |= result shr 1 26 | result |= result shr 2 27 | result |= result shr 4 28 | result |= result shr 8 29 | when sizeof(int) > 2: 30 | result |= result shr 16 31 | when defined(cpu64): 32 | result |= result shr 32 33 | result -= result shr 1 34 | 35 | # https://stackoverflow.com/questions/3465098/bit-twiddling-which-bit-is-set/ 36 | # This is essentially just a perfect hash a la Leiserson98-UsingDeBruijnSeqs. 
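# (Why it works: for single-bit x, x*constant merely shifts the De Bruijn
# constant left by the bit's index, so the top 6 (64-bit) or 5 (32-bit) bits
# form a distinct pattern per index; the table maps that pattern back.)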
37 | when defined(cpu64): 38 | const deBruijn8 = [ 0, 1, 2, 53, 3, 7, 54, 27, 4, 38, 41, 8, 34, 55, 48, 28, 39 | 62, 5, 39, 46, 44, 42, 22, 9, 24, 35, 59, 56, 49, 18, 29, 11, 63, 52, 6, 40 | 26, 37, 40, 33, 47, 61, 45, 43, 21, 23, 58, 17, 10, 51, 25, 36, 32, 60, 20, 41 | 57, 16, 50, 31, 19, 15, 30, 14, 13, 12 ] 42 | else: 43 | const deBruijn4 = [ 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 44 | 4, 8, 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 ] 45 | 46 | proc lgPow2*(x: int): int {.inline.} = 47 | when defined(cpu64): 48 | deBruijn8[(uint64(x) * 0x022FDD63CC95386D'u64) shr 58] 49 | else: 50 | deBruijn4[(uint32(x) * 0x077CB531'u32) shr 27] 51 | 52 | proc lgCeil*(x: int): int {.inline.} = lgPow2(ceilPow2(x)) 53 | ## integer-math only impl of ceil(log2(x)) 54 | 55 | proc lgFloor*(x: int): int {.inline.} = lgPow2(floorPow2(x)) 56 | ## integer-math only impl of floor(log2(x)) 57 | 58 | proc lg*(x: int): int {.inline.} = lgCeil(x) ## short alias for lgCeil 59 | 60 | proc rotateLeftBits*(a: uint64, numBits: int): uint64 {.inline.} = ## like bitops 61 | (a shl numBits) or (a shr (uint64.sizeof * 8 - numBits)) 62 | 63 | proc rotateRightBits*(a: uint64, numBits: int): uint64 {.inline.} = ## like bitops 64 | (a shr numBits) or (a shl (uint.sizeof * 8 - numBits)) 65 | 66 | proc reverseBitsByte*(x: uint8): uint8 {.inline.} = 67 | const reversed = [ 0b0000'u8, 0b1000, 0b0100, 0b1100, 68 | 0b0010 , 0b1010, 0b0110, 0b1110, 69 | 0b0001 , 0b1001, 0b0101, 0b1101, 70 | 0b0011 , 0b1011, 0b0111, 0b1111 ] 71 | result = (reversed[x and 15] shl 4) or reversed[x shr 4] 72 | 73 | proc reverseBitsMakeTable(): array[256, uint8] = 74 | for i in 0 ..< 256: 75 | result[i] = reverseBitsByte(uint8(i)) 76 | 77 | const revByte = reverseBitsMakeTable() 78 | 79 | proc reverseBits*(x: uint32): uint32 = 80 | (uint32(revByte[int((x and 0x000000FF'u32) )]) shl 24) or 81 | (uint32(revByte[int((x and 0x0000FF00'u32) shr 8)]) shl 16) or 82 | (uint32(revByte[int((x and 0x00FF0000'u32) shr 16)]) shl 8) or 83 | uint32(revByte[int( x shr 24)]) 84 | 85 | proc reverseBits*(x: uint64): uint64 = 86 | (uint64(revByte[int((x and 0x00000000000000FF'u64) )]) shl 56) or 87 | (uint64(revByte[int((x and 0x000000000000FF00'u64) shr 8)]) shl 48) or 88 | (uint64(revByte[int((x and 0x0000000000FF0000'u64) shr 16)]) shl 40) or 89 | (uint64(revByte[int((x and 0x00000000FF000000'u64) shr 24)]) shl 32) or 90 | (uint64(revByte[int((x and 0x000000FF00000000'u64) shr 32)]) shl 24) or 91 | (uint64(revByte[int((x and 0x0000FF0000000000'u64) shr 40)]) shl 16) or 92 | (uint64(revByte[int((x and 0x00FF000000000000'u64) shr 48)]) shl 8) or 93 | uint64(revByte[int( x shr 56)]) 94 | 95 | proc isPow2*(x: int): bool = 96 | if x == 0: return false 97 | (x and (x - 1)) == 0 98 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | 0) Make the "mitigation sequence" user-adjustable somehow...E.g., first rehash, 2 | then maybe rerehash, then robinhood, then maybe Btree. (Could maybe also 3 | make warnings adjustable just calling some imported tooFull type proc.) 4 | 5 | 1) While things work well in the tests/\*shell.nim's, should add many tests both 6 | to exercise everything and ensure a compiler changes don't cause regressions. 7 | See if we can just use all the one's already in the Nim stdlib. Add in the 8 | weak vitanim benchmarks. 
Would be nice to do a whole more real suite like 9 | the probablydance guy & beyond; Unsure I'll have the time/interest. 10 | 11 | 2) Make a `diset` using `sequint`, e.g. `dcset`; Should be easy. 12 | 13 | 3) Add `reindex` proc for o\*set/o\*tab. Should be easy..just clear idx and 14 | loop over `data[]` inserting. Maybe add after `sort` (to be more drop-in 15 | for Nim stdlib's ordered table workings). 16 | 17 | 4) add `ref` variants, maybe via a `defref.nim` 18 | 19 | 5) Possible lpset/ilset/olset micro-optimizations most relevant for L1/L2 cases: 20 | 21 | a) `moveMem` type element shifting should be faster than current pushUp/etc., 22 | especially for larger element sizes. 23 | 24 | b) One thing that might boost some workloads is the idea here: 25 | https://web.archive.org/web/20170623234417/https://pubby8.wordpress.com/ 26 | The idea is replacing the lower bits of the hash code which recapitulate 27 | the table index after it's computed with the probe depth (after Amble&Knuth 28 | 1973). The benefit is that the `hc != 0 and d > (hc-i)and mask` check can 29 | be folded into only `d > hc and mask`. The trade off (unmentioned in that 30 | blog post) is that pushUp/pullDn must update depths in the upper half of 31 | the cluster, however big it is, blocking optimized just-memmove shifting. 32 | Also, you save 1/3 not so predictable branches in the find loop per cluster 33 | element, but also create 5 ops per cluster element for mutation ops (load, 34 | mask out, inc/dec, mask in, store) for the cluster upper half. The extra 35 | ops are more predictable work, but so is an optimized memmove. Edit heavy 36 | workloads probably shake out to near a net wash. For "mostly miss, read- 37 | only after build" workloads it could help 1.5x (e.g. empty set intersects). 38 | That's also when Robin Hood *already* wins big from its half-depth search. 39 | Given that memory layout is identical to *not* saving depths, this could 40 | maybe be a run-time/per-instance option { like Robin re-org itself is }, 41 | not necessarily on by default. "bulk conversions" of hcodes to (upper hc, 42 | depth) combos & back can be faster than a full resize/rehash anyway. 43 | 44 | c) Can simplify rawPut/rawDel a lot if assume a strong enough hash by keeping 45 | an overflow area at the high end of s.data, i.e., s.data is longer than `1 46 | shl s.pow2` by denom/numer\*s.pow2 ish. The amount longer will always be 47 | bounded by that from table resize policy *except* if a hash is so bad that 48 | this limit is violated with a sparsely full table, which can happen with 49 | enough probability to be a serious concern. Graceful degradation is better 50 | and all we currently do in this overlong at < 50% full circumstance is warn 51 | or acivate mitigations. In short, I doubt 52 | probablydance.com/2017/02/26/i-wrote-the-fastest-hashtable/ makes the right 53 | safety judgement call for a general purpose table..elsewise a worthwhile 54 | blog post with less lame benchmarks than usual. I independently had the 55 | idea to trigger growth based on probe depth with a memory attack safeguard, 56 | though. As did the Rust guys, apparently and surely many in the 1970s. 57 | It's really a pretty obvious tilt-your-head-the-other-way look at a usual 58 | probes vs.load graph. The probablydance guy seems to have missed the 59 | safeness side of it, though and focused on low loads with good hashes. 
He 60 | figured this out in the end, but going w/a short-hop linked variant: 61 | https://probablydance.com/2018/05/28/a-new-fast-hash-table-in-response-to-googles-new-fast-hash-table/ 62 | 63 | 7) It'd be nice if we provided the option to use writable mmaps for all these 64 | tables instead of just `seq` backing store. This could maybe be as simple as 65 | passing an allocator proc to the various constructor functions and doing our 66 | own pointer arithmetic. Not too hard, really. 67 | 68 | 8) Do external chaining impl for mutating while iterating. Look at internal 69 | chain of probablydance to assess insert/delete-while-iterating abilities. 70 | -------------------------------------------------------------------------------- /NOTES.md: -------------------------------------------------------------------------------- 1 | These notes are about lptabz/LPTabz. 2 | 3 | Unlike most hash table libraries, we do not use load factor to *guess* when 4 | growth *might* be necessary to preserve performance (conditioned upon good hash 5 | functions relative to the key set). Instead, we measure probe sequence depth 6 | during insert and grow only if it is too deep (or some small `minFree` limit is 7 | hit). "Too long" is `> numer/denom*lg(size)` since the worst per table case on 8 | random data scales that way. This approach is both more robust to weakly 9 | scrambling hashes and more space conservative for "better than random" hashes. 10 | It also fixes this problem: 11 | https://accidentallyquadratic.tumblr.com/post/153545455987/rust-hash-iteration-reinsertion 12 | 13 | However, we also want to avoid memory exhaustion. So, we only grow tables if 14 | not "too sparse", i.e. `count/length > 1/(1 shl growPo2)`. For `growPo2=1` this 15 | means we might still double at only 50% full taking it down to 25%. If 25-50% 16 | load cannot give expected collision cluster sizes then A) the hash function is 17 | inadequate, B) the table is under attack, or C) it is being abused with many 18 | duplicate keys. The first two of these situations are basically the same with 19 | a natural response - use a more scrambling hash function with hard to guess data 20 | mixed in. Ignorance of the underlying keys only allows us to do that "on top" 21 | of the `hash` the user already provides which is what we do here. This will be 22 | ineffective if that provided hash outputs too few hash code values. Our re-hash 23 | of the hash mixes in the VM address of its data area, unique to each table and 24 | table size. We usually emit a warning when activating this feature, though that 25 | can be disabled. 26 | 27 | This resize protocol makes performance much more deterministic, but also makes 28 | space utilization non-deterministic. Utilization can be both much better than 29 | typical load-based resize with a near perfect hash as well as a little worse 30 | with a too weak hash. This seems "how it should be". Safer performance also 31 | seems worth more than deterministic size, and you cannot have both at once with 32 | an abstract key and user-settable hash. I'm a bit surprised this resize 33 | protocol isn't more popular. 34 | 35 | If you want to monitor space utilization you can do `.len/.getCap`. The tables 36 | here also all provide a query function `depths` to inspect distribution of probe 37 | depth. `depths` is about as expensive to compute as looking each item up once. 38 | It can be a bit faster on some key types. 
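For example, a minimal monitoring sketch (using the `metab` convenience constructors; the same `len`/`getCap`/`depths` calls work on any `lptabz` instantiation):

```nim
import adix/metab
var s = initSet[int]()
for i in 1 .. 100_000: s.incl i
echo "load:   ", s.len / s.getCap   # space utilization
echo "depths: ", s.depths           # histogram of probe depths
```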
More general performance forensics 39 | are available if `hashStats` is defined to count various important events. 40 | That activates counters for a variety of events like probes, different resize 41 | conditions and so on. Each counter just starts at zero and goes up. So, you 42 | just use them a bit like `epochTime`/`getTime` and friends. You can "time" at 43 | whatever granularity is desired. They are just global variables and so not 44 | exactly multi-thread safe, but the worst that can happen under MT contention is 45 | that you lose a few counts. { E.g., A) load old val, B) load old val, both 46 | inc, only one writes back. So, you get 1 inc instead of 2. } 47 | 48 | Some OSes can accelerate context switches if FP registers are never dirtied. 49 | So, this library is careful to avoid floating point with integer ratios. 50 | It is also careful to avoid high pipeline latency integer divisions. 51 | 52 | These tables also allow shrinkage in case many deletes have occurred and more 53 | than one iteration might later be performed. { A `setCap` is slower than one 54 | iteration. So, if it's only 1 more there is no point. } The default new size 55 | parameter (or an explicit negative one) causes the table to grow by one standard 56 | expansion, while zero (or any small positive number) compacts it 57 | to the minimum size that can fit its current population (possibly plus a fudge 58 | factor based on hash randomness assumptions for tables that need that). 59 | 60 | A perhaps non-obvious subtlety about Robin Hood hashing with depth-triggered 61 | growth is that `pushUp` can increase depth at the end of the collision cluster. 62 | That is the depth we want tested in `tooFull`. So, `rawPut` is two phases for 63 | RH. `rawPut1` finds the end of a collision cluster. `rawPut2` does the actual 64 | shift. In between, we can call `tooFull` to see if resizing is necessary. 65 | Note that the depth `rawPut1` calculates is actually the "shift size" not the 66 | max search depth of moved elements. This is the sense of depth you want since 67 | the point of depth-triggered resize is to avoid both large scans and large data 68 | motion. { "Cost" might be a more clear word than "depth". } 69 | -------------------------------------------------------------------------------- /adix/uniqce.nim: -------------------------------------------------------------------------------- 1 | ## The min-count sketch (NOT count-min) idea is to see hash(x) as a U(0,1) & use 2 | P(sampleMax *average* 5 | gap between k-1 uniques&averaging cuts noise.) See Bar-Yossef 2002 "Counting 6 | Distinct..", Giroire05 "Order statistics & estimating cardinalities" & Ting14 7 | "Streamed approximate counting..". 8 | # NOTE: Speed-accuracy-space trade-off triangle. To keep `lowerBound` & rare 9 | # `moveMem` fast, want `tail` cached, meaning small `k` & ~big expected error. 10 | # TODO Augment to AKMV for multiset queries & "cardinality one stop shopping". 11 | # TODO Add a B-Tree for when k>>1000 & folks care about "perf during warm up" 12 | # (As-is is very simple and not slow post warm-up; uniqce 100000 1024 16383 1000 13 | # =~ 2 sec for 100e6 => 20 ns/update. Better than 4x boost seems v.unlikely.) 14 | 15 | const axUCEdefaultK {.intdefine.} = 1024 # -d:axUCEdefaultK=X edits unInit val. 16 | # avg|err| <+-~0.5..1.8%, avgMax1000|err|<~2..8.0% (dep on dups).
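# A minimal usage sketch (`keys` is a hypothetical stream of items; `push` wants
# a number on [0,1], e.g. a 64-bit hash scaled as in the self-test further down):
#   var uc = initUniqCe[float32](k=1024)
#   for key in keys:
#     uc.push float32(cast[uint64](hash(key)))*(1.0/1.8446744073709551615e19)
#   echo uc.nUnique, " +- ", uc.nUniqueErr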
17 | import std/[algorithm, hashes, math, sets] 18 | type # NOTE: Want no dups & fast del reorg if seq is replaced 19 | UniqCe*[F:SomeFloat] = object 20 | tail: seq[F] # Sorted seq of `k` biggest `Hash`; Q: Add B-Tree mode? 21 | k: int # State scale & lower bound of true answer 22 | est: float64 # Running estimate a la Ting2014 23 | 24 | proc initUniqCe*[F:SomeFloat](k=1024): UniqCe[F] = result.k = k 25 | ## Return initialized `UniqCe` with tail size `k`. k=1024 costs 4K|1VM page 26 | 27 | proc push*[F:SomeFloat](uc: var UniqCe[F], h: F) = 28 | ## Incorporate visitation of an element. NOTE: std/hashes.hash(string) OUTPUT 29 | ## ONLY FILLS 32-BITS => push takes `x on [0,1]` instead of a key|hash() val. 30 | if uc.k == 0: uc.k = axUCEdefaultK # Make ok w/`var x: UniqCe` 31 | if uc.tail.len == 0: # Initial empty check 32 | uc.tail.add h.F # Branch-predicted out 33 | return 34 | if uc.tail.len == uc.k and h <= uc.tail[0]: # No change to tail 35 | return # Most activity for large inputs ends here post-warm-up;ByDef("tail") 36 | let i = uc.tail.lowerBound(h) # BINARY SEARCH for ins. spot 37 | if i < uc.tail.len and uc.tail[i] == h: # Already in tail; !DUPS=>done 38 | return 39 | if uc.tail.len < uc.k: # BUILD PHASE; always insert 40 | uc.tail.insert h, i 41 | uc.est += 1.0/float64(1.0 - uc.tail[0]) 42 | elif h > uc.tail[0]: # RARE: TAIL GETS NEW ELT 43 | if i > 1: # i >= 1: must make room 44 | moveMem uc.tail[0].addr, uc.tail[1].addr, (i - 1)*uc.tail[0].sizeof 45 | uc.tail[i - 1] = h # i is pre-downshift spot 46 | uc.est += 1.0/float64(1.0 - uc.tail[0]) 47 | 48 | proc nUnique*[F:SomeFloat](uc: UniqCe[F]): float32 = 49 | ## Estimate number of unique elements seen so far. 50 | if uc.tail.len < uc.k: uc.tail.len.float32 else: max(uc.k.float32, uc.est) 51 | 52 | proc nUniqueErr*[F:SomeFloat](uc: UniqCe[F]): float32 = 53 | ## Estimated error on estimate of unique elements seen so far. 54 | if uc.tail.len 0: parseInt(paramStr(1)) else: 100000 81 | let k = if paramCount() > 1: parseInt(paramStr(2)) else: axUCEdefaultK 82 | let d = if paramCount() > 2: parseInt(paramStr(3)).uint64 else: 0xFFFFFFFF'u64 83 | let m = if paramCount() > 3: parseInt(paramStr(4)) else: 1000 84 | let s = paramCount() > 4 # skip tracking HashSet error if present 85 | var err, nSt, eer: RunningStat 86 | for i in 1..m: 87 | var uc = initUniqCe[float32](k) 88 | var st = initHashSet[uint64](n) 89 | for j in 1..n: 90 | let key = (randState.next and d) 91 | uc.push float32(cast[uint64](hash(key)))*(1.0/1.8446744073709551615e19) 92 | if not s: st.incl key 93 | let est = uc.nUnique 94 | if not s: 95 | let e = abs(est - st.len.float)/st.len.float 96 | err.push e; eer.push uc.nUniqueErr/est/e 97 | nSt.push est 98 | echo nSt.mean, " ", err.mean*100, " % ", err.max*100, " % ", eer.mean 99 | -------------------------------------------------------------------------------- /adix/topk.nim: -------------------------------------------------------------------------------- 1 | ##[ Top-k of n using "Buffered QuickSelect" for θ(n) scaling in large `k` cases 2 | rather than `std/heapqueue` { which has [O|θ](n*lg k) }. ]## 3 | import std/[random, algorithm] # quickwit.io/blog/top-k-complexity explains BUT 4 | type #..NOTE heap [θO](n*lg k) &this θ(n),O(n*lg k). 5 | Partn* = enum last, ran #XXX Tukey's 9th | median-of-medians | PDQ? 
6 | TopK*[T] = object ## A TopK accumulator; init; push; iterate 7 | i, k: int 8 | partn: Partn # One can do variant/case obj using HeapQueue for small k, but 9 | first: bool #..that is only arch-dependent 1.15-1.30X faster for k<100s & 10 | thr: T #..saves only 1/2 the space when little is used anyway. OTOH, 11 | s: seq[T] #..a better Partn may make this way "always fastest period". 12 | TopKOrder* = enum Descending, Ascending, Cheap 13 | 14 | proc supportsCopyMem(t: typedesc): bool {.magic: "TypeTrait".} 15 | proc initTopK*[T](k=10, partn=last): TopK[T] = 16 | ## Initialize a TopK-accumulator for top-`k`; Usage is simply: 17 | ## 18 | ## .. code-block:: nim 19 | ## var t = initTopK(); for e in 1..99: t.push e 20 | ## for e in t: echo e 21 | result = TopK[T](i: -1, k: k, partn: partn, first: true) 22 | when supportsCopyMem(T) and declared newSeqUninit: 23 | result.s = newSeqUninit[T](2*k); result.s.setLen 0 24 | 25 | proc qpLast[T](a: var openArray[T]; L, R: int): int = 26 | let piv = a[R] # QuickPartition about last element 27 | var i = L - 1 28 | for j in L..= Descending 29 | if a[j] >= piv: inc i; swap a[i], a[j] 30 | swap a[i + 1], a[R] 31 | i + 1 32 | 33 | proc qpRand[T](a: var openArray[T]; L, R: int): int = 34 | swap a[rand(L..R)], a[R] # QuickPartition about random element 35 | a.qpLast L, R 36 | 37 | proc qPart[T](a: var openArray[T]; L, R: int; partn: Partn): int = 38 | case partn # QuickPartition w/various strategies 39 | of last: qpLast a, L, R 40 | of ran : qpRand a, L, R 41 | 42 | proc pqs[T](a: var openArray[T]; k: int, partn: Partn): T = # Partial QuickSort 43 | var (L, R) = (a.low, a.high) # Returns pivot element 44 | while L <= R: # Partition a[L..R] about piv; Find its pos 45 | let pivIx = qPart(a, L, R, partn) 46 | if pivIx == k - 1: return a[pivIx] # piv itself is k-th 47 | elif pivIx > k - 1: R = pivIx - 1 # k-th on left 48 | else : L = pivIx + 1 # k-th on right 49 | 50 | proc push*[T](t: var TopK[T], e: sink T) = 51 | ## Incorporate element `e` into `t` for eventual exact `for e in t: ..`. 52 | inc t.i 53 | if t.i < t.k: t.s.add e # Build phase: Always Add 54 | elif t.first or e > t.thr: # Filter into batches 55 | t.s.add e # Add if building | over threshold 56 | if t.s.len == 2*t.k: 57 | t.thr = t.s.pqs(t.k, t.partn) # Median -> new thr & all >= put on left 58 | t.first = false # Mark thr active 59 | t.s.setLen t.k # Drop < new median half 60 | 61 | proc saw*[T](t: TopK[T]): int = t.i + 1 ## `push` count since last init|clear 62 | 63 | iterator items*[T](t: var TopK[T]): lent T = 64 | ## iterate over `t` yielding top items in cheapest/system order. 65 | if t.saw > t.k: 66 | t.thr = t.s.pqs(t.k, t.partn) # Median -> new thr & all >= put on left 67 | t.first = false # Mark thr active 68 | t.s.setLen t.k # Drop < new median half 69 | for e in t.s: yield e 70 | 71 | iterator descending*[T](t: var TopK[T]): lent T = 72 | ## iterate over `t` yielding top items in DESCENDING order. 73 | t.s.sort(order=SortOrder.Descending); t.s.setLen min(t.k, t.saw) 74 | for e in t.s: yield e 75 | 76 | iterator ascending*[T](t: var TopK[T]): lent T = 77 | ## iterate over `t` yielding top items in ASCENDING order. 78 | t.s.sort(order=SortOrder.Descending); t.s.setLen min(t.k, t.saw) 79 | t.s.reverse 80 | for e in t.s: yield e 81 | 82 | iterator maybeOrdered*[T](t: var TopK[T], order=topk.Cheap): lent T = 83 | ## iterate over `t` yielding top items in various orders. 
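  ## A minimal usage sketch (top-3 of a tiny input, largest first):
  ##
  ## .. code-block:: nim
  ##   var t = initTopK[int](3)
  ##   for e in [5, 1, 9, 7, 3]: t.push e
  ##   for e in t.maybeOrdered(topk.Descending): echo e   # 9, 7, 5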
84 | case order 85 | of topk.Cheap : (for e in topk.items(t) : yield e) 86 | of topk.Ascending : (for e in topk.ascending(t) : yield e) 87 | of topk.Descending: (for e in topk.descending(t): yield e) 88 | 89 | proc clear*[T](t: var TopK[T]) = ## Reset `TopK` accumulator 90 | t.i = -1; t.s.setLen 0; t.first = true 91 | 92 | when isMainModule: # Good check: nim r -d:ck -d:r topk -qk3 -n10 -t3628800 #10! 93 | when not declared stderr: import std/syncio 94 | import cligen, std/[times, sugar, math, sets] 95 | when defined danger: randomize() 96 | proc top(k=500, n=50000, trials=50, partn=last, quiet=false) = 97 | let tScl = 1e12/n.float/log2(k.float) # picosec/work-scale 98 | var t = initTopK[int](k, partn) 99 | var a = collect(for i in 0.. (1 shl bLen) - 1: # Careful to not overflow XXX rate limit msgs 38 | erru "truncating too long (", $ms.len, ") line: ", ($ms)[0..<128], "...\n" 39 | h.upSert(ms, i): # Found key @i: 40 | if (when defined intCnt: h.dat[i].cnt == (1 shl bCnt) - 1 else: false): 41 | erru "counter overflow for: ",$ms,"\n" # no update XXX rate limit msgs 42 | else: h.dat[i].cnt += w; wTot += w # bump 43 | do: # Novel key->i: 44 | h.dat[i].off = s.add(ms, (1 shl bOff) - 1): 45 | erru "unique line data overflow at:",$ms,"\n" #XXX rate limit msgs 46 | return true # Cannot go on GLOBALLY 47 | h.dat[i].len = ms.len.uint32 # Init 48 | h.dat[i].cnt = w; wTot += w 49 | when not defined(intCnt): # Do 1-param frecency idea; Simpler than firefox 50 | w *= wMul # Always-grow-for-new-data EWMA update 51 | if w > wMax: # Nearing per-key weight repr limit.. 52 | let sclInv = 1.0/wMax # *= twice since this pushes FP repr limits 53 | for c in mitems h.dat: c.cnt *= sclInv; c.cnt *= sclInv 54 | w *= sclInv; w *= sclInv # These *='s move all FP numbers from.. 55 | wTot *= sclInv; wTot *= sclInv #..near top of repr to near bottom. 56 | 57 | proc lfreq(n=10, count=false, size=9999, dSize=81920, recTerm='\n', 58 | format="@f @k", RecTerm="\n", old=1.0, tm=false) = 59 | ## Histogram `stdin` lines (read w/non-memory mapped IO to be pipe friendly). 60 | ## Limits: <4 GiB unique data; <16 KiB lines; <4 GiCount. If `old < 1.0`, 61 | ## frequency -> simple 1-parameter "frecency" where counts are virtual-decayed 62 | ## by a factor `old` after each line (i.e. by index not by wall time). 63 | let t0 = if tm: epochTime() else: 0.0 64 | var h: Counts; h.setCap size # pre-size table & data 65 | s.setLen dSize; s.setLen 1 # `1` here lets us encode empty as 0-offset 66 | when not defined(intCnt): wMul = 1.0/old 67 | block IO: 68 | for (line, nLine) in stdin.getDelims(recTerm): 69 | let ms = MSlice(mem: line, len: nLine - 1) 70 | if h.incFailed(ms): break IO 71 | if count: outu h.len," unique ",wTot," total ",s.len," B\n" 72 | let wInv = 1.0/wTot.float; var cs, fs: string # Setup for.. 
73 | let prs = format.tmplParsed('@') #..nice output 74 | template output = 75 | for (id, arg, call) in prs: 76 | if id.idIsLiteral: outu MSlice(mem: format[arg.a].addr, len: arg.len) 77 | elif format[id.a] == 'k': outu k 78 | elif format[id.a] == 'c': 79 | when defined intCnt: cs.setLen 0; cs.addInt c; outu cs 80 | else : cs.setLen 0; cs.ecvt c.float, 6; outu cs 81 | elif format[id.a] == 'f': fs.setLen 0; fs.fcvt c.float*wInv, 9; outu fs 82 | else: outu MSlice(mem: format[call.a].addr, len: call.len) 83 | outu RecTerm 84 | if n == 0: (for (k, c) in pairs(h): output()) 85 | elif n > 0 : (for (k, c) in topByVal[MSlice,MSlice,Counter](h, n): output()) 86 | elif n < -1: (for (k, c) in topByVal[MSlice,MSlice,Counter](h, -n, order=Descending): output()) 87 | if tm: stderr.write epochTime() - t0, "\n" # -n-1 for only time output 88 | 89 | when isMainModule: dispatch lfreq, help={ 90 | "n" : "emit `n`-most common lines(0:all; <0 sorted)", 91 | "count": "only emit counts: unique & grand total", 92 | "size" : "pre-size hash table for size unique entries", 93 | "dSize": "pre-size str data area to this many bytes", 94 | "recTerm": "input record terminator", 95 | "RecTerm": "output record terminator", 96 | "format" : "output format: @k=key @c=count @f=fraction", 97 | "old" : "exponen.weight for 'old' ages (if not intCnt)", 98 | "tm" : "emit wall time of counting to stderr & quit"} 99 | -------------------------------------------------------------------------------- /tests/wf.nim: -------------------------------------------------------------------------------- 1 | when not declared(addFloat): import std/[formatfloat, typedthreads] 2 | when not declared(Thread): import std/threads 3 | import std/[hashes,osproc,times], adix/lptabz, cligen/[mfile,mslice,osUt],cligen 4 | type 5 | Word = distinct uint32 6 | Count = uint32 7 | Histo = LPTabz[Word, Count, Word, 0] 8 | ThrDat = tuple[part: ptr MSlice, hp: ptr Histo, nT: ptr uint64] 9 | 10 | template initHisto(sz): untyped = # 4*16*8=512B max depth at 65536 entry 11 | initLPTabz[Word, Count, Word, 0](sz, numer=4, denom=1, robinHood=false) 12 | 13 | var mf: MFile 14 | var hs: seq[Histo] # NEED -d:useMalloc 15 | var nTs: seq[uint64] 16 | var thrs: seq[Thread[ThrDat]] 17 | 18 | const wb = 5 # word len bits 19 | const wm = (1 shl wb) - 1 # max word len 20 | 21 | proc initWord(off, len: int): Word = 22 | if len > wm: 23 | var s = newStringOfCap(len) 24 | copyMem s[0].addr, mf.mem +! off, len 25 | raise newException(RangeDefect, "\"" & s & "\" too long") 26 | Word((off.uint32 shl wb) or len.uint32) 27 | 28 | proc len(w: Word): uint32 = uint32(w) and wm 29 | 30 | proc mem(w: Word): pointer = mf.mem +! int(w.uint32 shr wb) 31 | 32 | # Case insens hash/==|Local stack allocator | may be faster than MAP_PRIVATE. 
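# (Each Word is one uint32: the byte offset into the mmap in the high 32-wb bits
#  and the length in the low wb bits; e.g. with wb=5, a 7-byte word at offset
#  1000 packs as (1000 shl 5) or 7 == 32007. `hash`/`==`/`<` below decode that
#  to reach the underlying bytes.)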
33 | proc hash(w: Word): Hash {.inline.} = 34 | hash toOpenArray[byte](cast[ptr UncheckedArray[byte]](w.mem), 0, w.len.int-1) 35 | 36 | proc `==`(a, b: Word): bool {.inline.} = 37 | a.len == b.len and cmemcmp(a.mem, b.mem, a.len) == 0 38 | 39 | proc `<`(a, b: Word): bool {.inline.} = # for topk.push 40 | let c = cmemcmp(a.mem, b.mem, min(a.len, b.len)) 41 | if c == 0: a.len < b.len else: c < 0 42 | 43 | proc `$`(w: Word): string = # for output 44 | result.setLen w.len 45 | copyMem result[0].addr, w.mem, w.len 46 | 47 | when defined(benhoyt): # Ben Hoyt definition of "words" 48 | iterator lowCaseWords(ms: MSlice): Word = 49 | var wd, n: int 50 | for i, ch in ms: 51 | if ch in {'A'..'Z'}: # `tr A-Z a-z` preprocess to avoid 52 | ms[i] = char(ord(ch) + 32) # needs MAP_PRIVATE 53 | if n == 0: wd = (ms.mem +! i) -! mf.mem 54 | n.inc # extend 55 | elif ord(ch) > ord(' '): # in-word ch 56 | if n == 0: wd = (ms.mem +! i) -! mf.mem 57 | n.inc # extend 58 | elif n > 0: # non-word ch & have data 59 | yield initWord(wd, n); n = 0 # yield & reset 60 | if n > 0: yield initWord(wd, n) # any final word 61 | else: # Knuth-McIlroy definition of "words" 62 | iterator lowCaseWords(ms: MSlice): Word = 63 | var wd, n: int 64 | for i, ch in ms: 65 | if ch in {'a'..'z'}: # in-word ch 66 | if n == 0: wd = (ms.mem +! i) -! mf.mem 67 | n.inc # extend 68 | elif ch in {'A'..'Z'}: # `tr A-Z a-z` preprocess to avoid 69 | ms[i] = char(ord(ch) + 32) # needs MAP_PRIVATE 70 | if n == 0: wd = (ms.mem +! i) -! mf.mem 71 | n.inc # extend 72 | elif n > 0: # non-word ch & have data 73 | yield initWord(wd, n); n = 0 # yield & reset 74 | if n > 0: yield initWord(wd, n) # any final word 75 | 76 | proc work(td: ThrDat) {.thread.} = # Histogram one segment of an mmap 77 | setAffinity() # pin to CPU initially assigned 78 | var nT = 0u64 # Local accumulator to not thrash 79 | for w in td.part[].lowCaseWords: 80 | nT.inc; td.hp[].mgetOrPut(w, 0).inc 81 | td.nT[] = nT 82 | 83 | proc count(p: int, path: string) = # split path into `p` ~equal segments 84 | var (mfLoc, parts) = p.nSplit(path, flags=MAP_PRIVATE) 85 | mf = mfLoc 86 | if mf != nil: 87 | if mf.len > 1 shl (32 - wb): 88 | raise newException(RangeDefect, "\"" & path & "\" too large") 89 | if p > 1: # add mf.len > 65536|something? 90 | for i in 0 ..< parts.len: # spawn workers 91 | createThread thrs[i], work, (parts[i].addr, hs[i].addr, nTs[i].addr) 92 | joinThreads thrs 93 | else: work (parts[0].addr, hs[0].addr, nTs[0].addr) # ST-mode: No spawn 94 | else: stderr.write "wf: \"", path, "\" missing/irregular\n" 95 | 96 | proc wf(path:seq[string], n=10, c=false, N=false, jobs=1, sz=9999, tm=false) = 97 | ## Parallel word frequency tool for one file < 128 MiB and words < 32 chars. 98 | ## Aggregate multiple via, e.g., `cat \*\*/\*.txt > /dev/shm/inp`. Similar 99 | ## to Knuth-McIlroy `tr A-Z a-z|tr -sc a-z \\n|sort|uniq -c|sort -n|tail`, 100 | ## but ~46X faster than the pipeline (on TOTC; depends on vocab). 101 | let path = if path.len > 1: path[1] else: "/dev/stdin" 102 | let t0 = epochTime() 103 | let p = if jobs > 0: jobs else: countProcessors() 104 | thrs.setLen p # allocate `thrs` & histos 105 | for i in 0 ..< p: hs.add initHisto(sz); nTs.add 0u64 106 | p.count path 107 | for i in 1 ..< p: # hs[0] += [1.. 
0 : (for w, c in hs[0].topByVal(n): o()) # unsorted top N 114 | elif n < -1: (for w, c in hs[0].topByVal(n, order=Descending): o()) # sorted 115 | if tm: stderr.write epochTime() - t0, " sec\n" # n == -1: only `c`/tm 116 | 117 | dispatch(wf, help={"n": "do top n; 0all,<0sort", "c": "count only", "N": "norm", 118 | "tm": "time", "jobs": "num threads; 0=>auto", "sz": "init size"}) 119 | -------------------------------------------------------------------------------- /adix/amoft.nim: -------------------------------------------------------------------------------- 1 | ## `Approximately k-Most Oft `_. 2 | ## This is constant space & constant inc&query time with adjustably small error. 3 | ## `AMOft[K,C]` augments the sketch with an O(k) paired Table & HeapQueue. 4 | ## Present impl does NOT scale well to very large `k` (past ~top-1000). E.g.: 5 | ## 6 | ## .. code-block:: nim 7 | ## var amo = initAMOft[int, uint32](k=10) 8 | ## for i in 0..<500: amo.inc i, i # Not v.skewed => not v.accurate 9 | ## for (i, c) in amo.mostCommon(5): echo i, " ", c 10 | 11 | import std/[hashes, heapqueue, tables, algorithm] 12 | type 13 | CtMnSketch*[K,C] = object ## CountMinSketch over hashable `K` & counters `C`. 14 | data: seq[seq[C]] 15 | salts: seq[Hash] 16 | w: int 17 | 18 | AMOft*[K,C] = object ## Track most often hashable `K` with counters `C`. 19 | sketch: CtMnSketch[K,C] # Sketch for gigantic `K` spaces 20 | top: HeapQueue[(C,int)] # Most frequent counts, keys 21 | no2key: seq[K] # For expensive K, like string, the heap ops.. 22 | key2no: Table[K, int] #..& scan are much faster with Indirect K. 23 | k: int # Num most often keys to track (max `top` len) 24 | 25 | proc initCtMnSketch*[K,C](w: int, d=4, salts: seq[int] = @[]): CtMnSketch[K,C] = 26 | ## `w`=Size(count tables), larger implies less overestimation. `d`=nTables, 27 | ## larger implies less P(overEst). `salts`=Override default generated salts. 28 | if w <= 0 or d <= 0: 29 | raise newException(ValueError, "Table size(`w`) & hashes(`d`) must be > 0") 30 | result.w = w 31 | if salts.len>0: result.salts = salts 32 | else : result.salts.setLen d 33 | result.data.setLen result.salts.len 34 | for i, t in result.data.mpairs: 35 | t.setLen w 36 | result.salts[i] = hash(if salts.len>0: salts[i] else: cast[int](t[i].addr)) 37 | 38 | proc inc*[K,C](cs: var CtMnSketch[K,C], key: K, r=1): C {.discardable.} = 39 | ## Count `key` `r` times; Gives new counts; I.e., `(key, r=0)` can query. 40 | result = C.high 41 | let kh = hash(key) 42 | when defined(cmsOnePass): # This updates faster/more independently 43 | for i, s in cs.salts: #..BUT has less accurate estimates. 44 | let h = Hash(uint(!$(kh !& s)) mod cs.w.uint) 45 | cs.data[i][h] += r.C 46 | result = min(result, cs.data[i][h]) 47 | else: 48 | var old = C.high 49 | var hs: array[32, Hash] # Avoid inner loop alloc w/bound; 128-256B of stack 50 | for i, s in cs.salts: 51 | hs[i] = Hash(uint(!$(kh !& s)) mod cs.w.uint) 52 | old = min(old, cs.data[i][hs[i]]) 53 | old += r.C 54 | for i in 0 ..< cs.salts.len: 55 | cs.data[i][hs[i]] = max(cs.data[i][hs[i]], old) 56 | result = min(result, cs.data[i][hs[i]]) 57 | 58 | proc initAMOft*[K,C](k, w: int; d=4, salts: seq[int] = @[]): AMOft[K,C] = 59 | result.sketch = initCtMnSketch[K,C](w,d,salts); result.k = k 60 | 61 | proc slot[K,C](a: var AMOft[K,C], kn: int): int = 62 | result = -1 # Must search only by key since updates 63 | for i in 0 ..< a.top.len: #..of OTHER keys MAY bump old estimates. 
64 | if a.top[i][1] == kn: # Also cannot idx with Table since each 65 | return i #..replace re-orders heap slots. Links? 66 | # While above is a linear scan, Table look-up ensures it only happens if needed. 67 | 68 | proc inc*[K,C](a: var AMOft[K,C], key: K, r=1) = 69 | ## Count `key` as being seen `r` times. 70 | let c = a.sketch.inc(key, r) # Form (sketch count, keyNo) 2-tuple 71 | var new = (c, a.key2no.getOrDefault(key, -1)) 72 | if c.int > r and new[1] != -1 and (let i = a.slot(new[1]); i >= 0): 73 | a.top.del i # Hit: update existing; O(k+lg k fix-up) 74 | a.top.push new # O(lg k) 75 | elif a.top.len < a.k: # Miss/initial build 76 | new[1] = a.no2key.len 77 | a.no2key.add key 78 | a.key2no[key] = new[1] 79 | a.top.push new 80 | elif new > a.top[0]: # Miss/frequent enough to bump old top: 81 | new[1] = a.top[0][1] 82 | discard a.top.replace(new) # pop min, push new, discard old 83 | a.key2no.del a.no2key[new[1]] 84 | a.key2no[key] = new[1] 85 | a.no2key[new[1]] = key 86 | 87 | iterator mostCommon*[K,C](a: AMOft[K,C], k=0): (K, C) = 88 | ## Yield (`k`-most common values in `a`, each count) tuples; `k<=0` => known. 89 | let k = if k > 0: k else: a.top.len 90 | var cpy = a.top # A tree top can duck both O(N) copy.. 91 | var res: seq[(C, int)] #..to not edit & need to collect for top 92 | while cpy.len > 0: res.add cpy.pop 93 | res.sort 94 | var v: (K, C) 95 | for i in res.len - k ..< res.len: 96 | v[0] = a.no2key[res[i][1]]; v[1] = res[i][0] 97 | yield v 98 | 99 | when isMainModule: # Check/demo CtMnSketch[K,C], AMOft[K,C] 100 | when not declared(assert): import std/assertions 101 | var c = initCtMnSketch[int, uint32](w=16, salts = @[1,2,3]) 102 | for i in 0..<16: c.inc i, i 103 | for i in 0..<16: 104 | try: assert c.inc(i, 0) == i.uint32 # |exclusions also work for cmsOnePass 105 | except AssertionDefect: (if i notin [6,9]: echo "mismatch at ", i) 106 | 107 | var a = initAMOft[string, uint8](k=4, w=16, salts = @[1,2,3,4,5,6,7]) 108 | for i in 0..<32: a.inc $i, 32-i 109 | var res = "" 110 | for (i, c) in a.mostCommon(3): res.add i 111 | assert res == "210" # Top 3 112 | 113 | var oft = initAMOft[int, uint32](k=50, w=8192, d=7) # Linear dist not v.skewed 114 | for i in 0..<50000: oft.inc i, i # 50000*49999/2 = 1_249_975_000 virt.events 115 | for (i, c) in oft.mostCommon(25): echo i, " ", c, " ", c.int - i,"/1249975000" 116 | -------------------------------------------------------------------------------- /tests/bl.nim: -------------------------------------------------------------------------------- 1 | when not declared(addFloat): import std/formatfloat 2 | import std/[os, strutils, times], adix/[althash, bltab] 3 | 4 | template h(x):untyped = (when defined(lessRan): hashRoMu1(x) else: hashNASAM(x)) 5 | 6 | let nTab = parseInt(paramStr(1)) 7 | let mask = parseInt(paramStr(2)) 8 | 9 | var s = initBLTab(nTab, mask) 10 | var d: seq[int] 11 | for i in 3 .. 
paramCount(): d.add parseInt(paramStr(i)) 12 | let verb = "V".existsEnv 13 | template maybeEcho(x: varargs[untyped]) = 14 | if verb: echo x 15 | 16 | let t0 = epochTime() 17 | for j in d: 18 | if j > 0: 19 | let k = h(j) and mask 20 | if s.containsOrIncl(k): maybeEcho "had ", j 21 | else: maybeEcho "added ", (j, k) 22 | elif j < 0: 23 | let k = h(-j) and mask 24 | if s.missingOrExcl(k): maybeEcho "did nothing" 25 | else: maybeEcho "removed ", (-j, k) 26 | let dt = epochTime() - t0 27 | 28 | let ds = s.depths 29 | if verb: s.debugDump 30 | echo "dt(s): ", dt 31 | echo "hashLd: ", float(s.len)/float(s.getCap), " ", ds.len, " depths: ", ds 32 | echo paramCount()-2-s.len, '/', paramCount()-2, ". false pos. (if all +)" 33 | 34 | # ./bl $[1<<17] $[(1<<26) - 1] {1..$[3<<15]} |tail -n2 35 | # hashLd: 0.7494 19 depths: @[36541, 25619, 15310, 8917, 5087, 2965, 1708, 934, 486, 272, 174, 82, 56, 36, 17, 13, 5, 1, 2] 36 | # 79/98304. false pos. (if all +); fpr=0.0008; 19*26/8=61.75 < 64B cache line. 37 | # 38 | # A Bloom filter is less space -1.44*lg.0008=14.8 bit/num (57% of 26 bits, 42.7% 39 | # adjusting for 75% hashLd), BUT needs -lg .0008 = 10.3 hash funs ~10 line lds. 40 | # *MANY* would pay a 1/.427=2.34X space increase to get a 10+X speed boost. In 41 | # fact, unless you somehow know you will be right at a "cache cliff", almost no 42 | # one would not choose to spend 2.3X space for a 10X speed-up. 43 | # 44 | # Aux 2..3-bit counter fields (28..29./26*2.34=2.52..2.61x space) can buy you 45 | # deletes that are mostly reliable OR dup keys could be allowed (as in `lptabz`) 46 | # for full reliability at the cost of longer collision clusters. (Well, full 47 | # reliability modulo fingerprint collisions..). 48 | # 49 | # Even compared with Cuckoo filters things are 2x faster in the worst case and 50 | # 1.5x faster on average (depending on probability of 2nd lookup being needed). 51 | # 52 | # Of course, one can do other tests, such as a 1e6 insert 29-bit one: 53 | # ./bl $[1<<21] $[(1<<29)-1] {1..1000000} 54 | # dt(s): 0.02814388275146484 55 | # hashLd: 0.4764270782470703 10 depths: @[669828, 237265, 67871, 17965, 4647, 1192, 275, 71, 23, 3] 56 | # 860/1000000. false pos. (if all +) 57 | # That will take 29*(1<<21) bits or a 7.6 MiB `seq`. 58 | # 59 | # And one can sometimes do better with a *less* random hash, of course at some 60 | # substantial risk that your hash is over-tuned to your key sets: 61 | # nim c -d:danger -d:lessRan bl 62 | # ./bl $[1<<21] $[(1<<22)-1] {1..1000000} 63 | # dt(s): 0.01258683204650879 64 | # hashLd: 0.476837158203125 2 depths: @[801457, 198543] 65 | # 0/1000000. false pos. (if all +) 66 | # That will take 22*(1<<21) bits or a 5.7 MiB `seq`. 67 | # 68 | # 69 | # This is all amenable to more formal analysis for those so inclined. Here is 70 | # an excerpt of an e-mail I wrote in Summer 2001 (unadjusted for slightly better 71 | # Robin Hood Linear Probing): 72 | # ---------------------------------------------------------------------------- 73 | # A few nights ago David M raised some nice specific doubts and prompted me 74 | # to do some simple calculations. The compellingness of Bloom filters seems 75 | # limited, but very well defined. 76 | # 77 | # The executive summary is just this: for small false positive probabilities 78 | # Bloom filters help if you're trading memory against disk accesses, but 79 | # probably not for fast vs. slow memory where "slow" is only 5..8 times higher 80 | # latency. Some basic math will perhaps clarify the issue. 
81 | # 82 | # The short of it is just this: 83 | # { ^ -> exponentiation[not xor], lg=log base 2 } 84 | # 85 | # Consider N objects/packet-types/whatever and an M-bit table. 86 | # Then let a = N / M be the "load". We have (see, e.g. Knuth) 87 | # P(false positive) = p = (1 - exp(-k*a))^k 88 | # 89 | # Solve for a(k,p)=-log(1-p^(1/k))/k, differentiate with respect to k, set 90 | # it equal to zero, and solve to get k = lg(1/p) as the maximizer of a, or 91 | # the *minimizer of M*. I.e., a Bloom filter with either ceil(-lg p) or 92 | # floor(-lg p) gives the minimum memory usage for a given target p. 93 | # What is the memory usage? Substituting back in notice p^(1/k)=1/2 and: 94 | # a = -log(1-1/2)/lg(1/p) = log 2 / lg(1/p), or 95 | # M = -lg e * N * lg p = 1.44*N*lg(1/p) 96 | # 97 | # Compare this with recording "existence" in a hash table of B-bit values. 98 | # Suppose we manage collisions with open-addressed linear probing (a cache 99 | # friendly thing). To achieve 2 table accesses/query (probably 1 slow memory 100 | # access) we need the load to be ~70% (see Knuth 6.4 table 4). Specifically, 101 | # M' = 1.44*N slots = 1.44*N*B bits. Anything in the address space does get 102 | # stored in the table. So the false positive rate we expect is the collision 103 | # rate in the B-bit address space for N objects. 104 | # 105 | # Standard binomial birthday simplification of multinomial collision analysis is 106 | # p = 1 − (1 − 1/M')^(N-1), or as M,N get big 107 | # p = 1 - exp(N*log(1 - 1/M')) =~ 1 - exp(-N/M') = 1 - exp(-N/2^B) or 108 | # B = lg(-N/log(1-p)). And so, 109 | # M' = 1.44*N*lg(-N/log(1-p)) bits. 110 | # Now if p << 1, again using log(1-p) =~ -p { err =~ .5*p^2 < 5% for p < .1 } 111 | # M' = 1.44*N*lg(N/p). 112 | # 113 | # So there you have it. The ratio of storage needed for M' (hash table) over 114 | # M (Bloom filter) simplifies for "small" p to (with log_1/p == log base 1/p): 115 | # 116 | # M'/M = lg(N/p) / lg(1/p) = (lg(1/p) + lg N)/log(1/p) = 117 | # = 1 + lg N/lg(1/p) = 1 + log_1/p (N) = 1 + lg N/-lg p 118 | # 119 | # This tells you exactly what you need to know -- Bloom filters save space only 120 | # when N is very large relative to 1/p. E.g., N=1e6 and p=1% give M' = 4M. 121 | # This may surprise as a naive perception may be that you want to Bloom when you 122 | # want small false positive rates. However, for low p, this costs a lot of time 123 | # as you end up with many hash functions also probing memory randomly. 124 | # 125 | # In time, Bloom only pays off when you are near enough a cliff in latency of 126 | # the memory hierarchy where `k` Bloom accesses beat the 1 LinearProbe access 127 | # because the `k` can operate in region (1+lgN/-lg p)X smaller. 128 | -------------------------------------------------------------------------------- /adix/bltab.nim: -------------------------------------------------------------------------------- 1 | ## This module specializes to the case where keys are 1..k-bit ints & values are 2 | ## 0..v-bit ints (`k+v<=8*int.sizeof`) using one `SeqUInt` as backing store. 3 | ## (Mnemonically, "BL" = "Bit Level" or start of "BLoom Filter", a sometimes 4 | ## competing data structure.) Users must give a number of bits for the key. 5 | ## Bits for values and the sentinel key default to 0. `BLTab` otherwise tries to 6 | ## be similar to hash variants of multisets. 7 | 8 | import althash, sequint 9 | when not declared(stderr): import std/assertions 10 | type 11 | BLTab* = object ## RobinHoodLP set of B-bit int keys w/small false pos. 
rate 12 | data: SeqUint # number array 13 | count: int # count of entered slots 14 | k, v, numer, denom, minFree, growPow2, pow2: uint8 # size policy parameters 15 | rehash, robin: bool # Steal 2-bits from `salt`? 16 | salt: Hash # ~unpredictable salt 17 | z: uint # sentinel 18 | 19 | var blInitialSize* = 2 ## default initial size aka capacity aka cap 20 | var blValueBits* = 0'u8 ## default bits for value in k+v-bit uint 21 | var blSentinel* = 0'u8 ## default sentinel value for k+v-bit uint 22 | var blNumer* = 3'u8 ## default numerator for lg(n) probe depth limit 23 | var blDenom* = 1'u8 ## default denominator for lg(n) probe depth limit 24 | var blMinFree* = 1'u8 ## default min free slots; (>= 1) 25 | var blGrowPow2* = 1'u8 ## default growth power of 2; 1 means double 26 | var blRobinHood* = false ## default to Robin Hood re-org; auto-activated 27 | var blRehash* = false ## default hcode rehashing behavior; auto-activated 28 | 29 | when defined(hashStats): # Power user inspectable/zeroable stats. These are 30 | template ifStats(x) = x # all kind of like "times" - you v0=val;...; val-v0 31 | var blDepth* = 0 ## Counts total search depth 32 | var blTooFull* = 0 ## Counts resizes from minFree boundary 33 | var blTooDeep* = 0 ## Counts resizes from deep probe sequences 34 | var blTooSparse* = 0 ## Counts skips of depth-triggered resize from sparsity 35 | else: 36 | template ifStats(x) = discard 37 | 38 | proc len*(s: BLTab): int {.inline.} = s.count 39 | proc getCap*(s: BLTab): int {.inline.} = s.data.len 40 | 41 | proc save*(t: BLTab, pathStub: string) = discard 42 | proc load*(t: var BLTab, path: string) = discard 43 | proc loadBLTab*(path: string): BLTab = discard 44 | proc mmap*(t: var BLTab, path: string) = discard 45 | 46 | proc pushUp(x: var SeqUint, i, n: int) {.inline.} = # move n items up 1 47 | for j in countdown(i + n - 1, i): x[j+1] = x[j] 48 | 49 | proc pullDown(x: var SeqUint, i, n: int) {.inline.} = # move n items down 1 50 | for j in countup(i, i + n - 1): x[j] = x[j+1] 51 | 52 | proc isUsed(s: BLTab, i: int): bool {.inline.} = s.data[uint(i)] != 0 53 | 54 | proc depth(i, hc, mask: Hash): Hash {.inline.} = 55 | let i = uint(i) 56 | let hc = uint(hc) 57 | let mask = uint(mask) 58 | cast[Hash]((i - hc) and mask) # Search depth of entry w/hcode @ix`i` 59 | 60 | iterator probeSeq(hc, mask: Hash): int = 61 | var i: Hash = hc and mask # Start w/home address 62 | while true: 63 | yield i 64 | i = (i + 1) and mask # Linear Probing 65 | 66 | proc rawGet(s: BLTab; hc: Hash, d: var Hash): int {.inline.} = 67 | assert(s.data.len > 0, "Uninitialized BLTab") # Ensure in *caller* not here 68 | var t {.noinit.}: int # Where to insert if missing 69 | for i in probeSeq(hc, s.data.high): 70 | t = i 71 | if not s.isUsed(i): 72 | break 73 | if d > depth(i, cast[Hash](s.data[uint(i)]), s.data.high): 74 | break 75 | if s.data[i] == uint(hc): 76 | return i 77 | d.inc 78 | ifStats blDepth.inc 79 | if d == s.data.len: # Handle fully saturated table case 80 | break 81 | result = -1 - t # < 0 => MISSING and insert idx = -1 - result 82 | 83 | proc rawGet(s: BLTab, hc: Hash): int {.inline.} = 84 | var d: Hash 85 | rawGet(s, hc, d) # < 0 => MISSING and insert idx = -1 - result 86 | 87 | proc depth(s: BLTab; hc: Hash): int {.inline.} = 88 | var d: Hash 89 | discard rawGet(s, hc, d) 90 | d 91 | 92 | proc rawPut1(s: var BLTab, i: Hash; d: var int): int {.inline.} = 93 | result = i # Linear probe to first empty slot 94 | while s.isUsed(result): 95 | result = (result + 1) and s.data.high 96 | d.inc 97 | if d == 
s.data.len: 98 | raise newException(ResourceExhaustedError, "BLTab saturated") 99 | 100 | proc rawPut2(s: var BLTab, i, j: Hash): int {.inline.} = 101 | if j > i: # No table wrap around; just shift up 102 | pushUp s.data, i, j - i 103 | elif j < i: # j wrapped to low indices 104 | pushUp s.data, 0, j 105 | s.data[0] = s.data[s.data.high] 106 | pushUp s.data, i, s.data.high - i 107 | result = i # j == i => already have space @i; done 108 | 109 | proc rawDel(s: var BLTab, i: Hash) {.inline.} = 110 | let mask = s.data.high 111 | var k = i 112 | var j = (i + 1) and mask # Find next empty|at home position entry 113 | while s.isUsed(j) and j != (int(s.data[j]) and mask): 114 | j = (j + 1) and mask 115 | if j > i + 1: # No table wrap around; just shift down 116 | pullDown s.data, i, j - 1 - i 117 | k = j - 1 # Mark just-past-shift-block entry empty 118 | elif ((j + mask - i) and mask) > 0: # j wrapped to low indices; Did >0 j.inc 119 | pullDown s.data, i, mask - i 120 | s.data[mask] = s.data[0] 121 | pullDown s.data, 0, j - 1 122 | k = (j + mask) and mask # [j-1 mod tabSz] is now empty 123 | # else: # k == i is already home position 124 | s.data[k] = 0 125 | 126 | proc init*(s: var BLTab, size, mask: int) {.inline.} = 127 | s.data = initSeqUint(size, numBound=mask) 128 | s.count = 0 129 | 130 | proc initBLTab*(size, mask: int): BLTab{.inline.} = result.init size, mask 131 | 132 | proc contains*(s: BLTab, hc: Hash): bool {.inline.} = 133 | assert(s.data.len > 0, "Uninitialized BLTab") # Ensure in *caller* not here 134 | s.rawGet(hc) >= 0 135 | 136 | proc containsOrIncl*(s: var BLTab, hc: Hash): bool {.inline.} = 137 | assert(s.data.len > 0, "Uninitialized BLTab") # Ensure in *caller* not here 138 | var d: Hash 139 | let i = s.rawGet(hc, d) 140 | if i < 0: 141 | var j = s.rawPut1(-1 - i, d) 142 | let k = s.rawPut2(-1 - i, j) # Maybe allocate a slot 143 | s.count.inc 144 | s.data[k] = hc 145 | else: 146 | result = true 147 | 148 | proc missingOrExcl*(s: var BLTab, hc: Hash): bool {.inline.} = 149 | assert(s.data.len > 0, "Uninitialized BLTab") # Ensure in *caller* not here 150 | let i = s.rawGet(hc) 151 | if i >= 0: 152 | s.data[i] = 0 153 | s.rawDel i 154 | s.count.dec 155 | else: 156 | return true 157 | 158 | proc clear*(s: var BLTab) {.inline.} = 159 | s.data.clear 160 | s.count = 0 161 | 162 | iterator items*(s: BLTab): Hash = 163 | let L = s.len 164 | for i in 0 ..< s.data.len: 165 | assert(s.len == L, "cannot change a set while iterating over it") 166 | if s.isUsed(i): yield cast[Hash](s.data[i]) 167 | 168 | iterator pairs*(s: BLTab): tuple[a: int, b: Hash] = 169 | let L = s.len 170 | var j = 0 171 | for i in 0 ..< s.data.len: 172 | assert(s.len == L, "cannot change a set while iterating over it") 173 | if s.isUsed(i): yield (j, cast[Hash](s.data[i])) 174 | j.inc 175 | 176 | proc depths*(s: BLTab): seq[int] = 177 | for elt in s: 178 | let d = s.depth(elt) 179 | if d >= result.len: result.setLen(d + 1) 180 | result[d] += 1 181 | 182 | proc debugDump*(s: BLTab, label="") = 183 | if label.len > 0: echo label 184 | echo s.len, " items" 185 | for i, cell in s.data: 186 | echo "i: ",i," depth: ",if cell != 0: depth(i, int(s.data[i]), s.data.high) 187 | else: 0, " ", cell 188 | -------------------------------------------------------------------------------- /adix/oats.nim: -------------------------------------------------------------------------------- 1 | import std/hashes, adix/[bitop, topk]; export topk.TopKOrder 2 | type # More adaptable than Nim std/sets|tables (named ROats|VROats here) 3 | 
Oat*[K, Q] = concept t # Base Concept: Open-Addressed Table 4 | cap(t) is int # Query allocated slot count 5 | used(t, int) is bool # Test if slot `i` is used | free 6 | key(t, int) is K # Get key/key ref for slot `i` 7 | keyQ(t, K) is Q # Make a Q)uery from a stored K)ey 8 | keyR(t, Q) is K # Convert internal Q to ref type K 9 | hash(Q) is Hash # hash key `k` 10 | eq(t, K, K) is bool # Stored key/ref `a` == stored `b` 11 | eq(t, K, Q) is bool # Stored key/ref `a` == Query `b` 12 | 13 | Resizable* = concept t # Gives grow/shrink-ability 14 | newOfCap(t, int) is type(t) # Get a new `n`-slot instance 15 | copy(var t, int, t, int) # Abstract t[i] = u[j] 16 | setNew(t, var t) # Efficiently set all t = u 17 | 18 | Valued*[V] = concept t # Gives dictionary-like interfaces 19 | val(var t, int, V) # Set val for slot `i` 20 | val(t, int) is V # Get val for slot `i` 21 | 22 | VOat*[K,Q,V] = concept t # Valued-Oat; Needs val; Adds []/{}/.values/etc. 23 | t is Valued[V]; t is Oat[K,Q] 24 | 25 | ROat*[K,Q] = concept t # R)esizable Oat; Needs new/cp/set;Adds setCap.. 26 | t is Resizable; t is Oat[K,Q] 27 | 28 | VROat*[K,Q,V] = concept t # V)alued, R)esizable Oat; Needs & adds both 29 | t is Valued[V]; t is ROat[K,Q] 30 | 31 | PutKey*[K] = concept t # incl,mgetOrPut.. `SERT` puts keys as an atom 32 | key(var t, int, K) # Set key for slot `i` (upSert can inline) 33 | POat*[K,Q] = concept t # PutKey-Oat 34 | t is PutKey[K]; t is Oat[K,Q] 35 | VPOat*[K,Q,V] = concept t # Valued-PutKey-Oat 36 | t is PutKey[K]; t is VOat[K,Q] 37 | 38 | Counted* = concept t # Gives cheap total used slots; else O(N) 39 | inUse(var t, int) # Set count of slots in use 40 | inUse(t) is int # Get count of slots in use 41 | 42 | SavedHash* = concept t # Speed find&more so resize;<64 bits ok if small 43 | hash(t, 0, Hash) # Set hash of slot `i` 44 | hash(t, 0) is Hash # Get hash of slot `i` 45 | 46 | proc len*[K,Q](t: Oat[K,Q]): int = ## stdlib-style slots in-use; Uncounted=>O(N) 47 | when t is Counted: t.inUse 48 | else: (for i in 0 ..< t.cap: (if t.used i: inc result)) 49 | 50 | iterator probeSeq(h, mask: Hash): int = 51 | var i: Hash = h and mask # Start w/home address 52 | while true: yield i; i = (i+1) and mask # Linear Probing 53 | 54 | func oatSlots*(n: int, mnFree=600): int = ceilPow2(n + max(1, mnFree)) 55 | ## Guess of the right table size from expected number of items. 56 | 57 | proc oatSlot*[K,Q](t: Oat[K,Q]; q: Q; h: Hash; d: var Hash): int = 58 | mixin eq # Q: Set d=0 or just assume like now? 59 | var j {.noinit.}: int # Where to insert if missing 60 | for i in probeSeq(h, t.cap - 1): 61 | j = i 62 | if not t.used i: break # Need >= 1 FREE slot to halt search 63 | if (when t is SavedHash: t.hash(i) == h else: true) and t.eq(t.key i, q): 64 | return i 65 | d.inc # Q: Also break if d == t.cap? 66 | -1 - j # <0 =>MISSING&insert idx = -1 - result 67 | 68 | proc tooFull*[K,Q](t: Oat[K,Q]; d: int; newSize: var int): bool = 69 | #-> user proc w/some provided default BUT there's a circular dep through `len` 70 | let sLen=t.len # Could be a cap-long loop 71 | if sLen + 1 + 1 > t.cap: # Call setCap pre-put? +1 new, +1 free 72 | newSize = t.cap shl 1; return true 73 | let p2 = lgCeil(t.cap) # NOT an over-deep search; Would like to test 74 | if d < 3*p2 + 1: #..first since it is guaranteed cheap, but need 75 | return false #..cond to ensure tiny tables terminate probeSeq 76 | if sLen > t.cap shr 1: # Over-deep on under-full: re-salt hash? 
77 | newSize = t.cap; return true 78 | 79 | proc setCap*[K,Q](t: var ROat[K,Q]; newSize = -1) = 80 | let newSz = if newSize < 0: max(2, t.cap shl 1) 81 | else: oatSlots(max(newSize, t.len), 1) # max blocks over-shrink 82 | if newSz == t.cap and newSize == -1: return 83 | var ns = t.newOfCap newSz 84 | var d: int 85 | for i in 0 ..< t.cap: 86 | if t.used i: 87 | let q = t.key i 88 | let h = when t is SavedHash: t.hash(i) else: t.keyQ(q).hash 89 | ns.copy -1 - ns.oatSlot(q, h, d), t, i 90 | t.setNew ns 91 | 92 | template upSert*[K,Q](t: var Oat[K,Q], q, i, UP, SERT) = 93 | var d, newSize: Hash 94 | let h = q.hash 95 | var i = oatSlot(t, q, h, d) 96 | if i >= 0: UP 97 | else: 98 | if t.tooFull(d, newSize): 99 | when t is Resizable: 100 | oats.setCap t, newSize; d = 0 101 | i = oatSlot(t, q, h, d) 102 | else: raise newException(ValueError, "non-resizable table too full") 103 | i = -1 - i 104 | SERT 105 | when t is SavedHash: t.hash i, h # Late in case `SERT` aborts insert 106 | when t is Counted: t.inUse t.len + 1 107 | 108 | proc incl*[K,Q](t: var POat[K,Q], q: Q) = 109 | t.upSert q, i, UP=(discard), SERT=(t.key(i, t.keyR q)) 110 | 111 | proc `[]`*[K,Q,V](t: VOat[K,Q,V], q: Q): V = 112 | if (var d: Hash; let i = oatSlot(t, q, q.hash, d); i >= 0): result = t.val i 113 | else: raise newException(KeyError, "no such key") 114 | 115 | proc `[]=`*[K,Q,V](t: var VPOat[K,Q,V], q: Q, v: V) = 116 | t.upSert q, i, UP=(t.val i), SERT=(t.key(i, t.keyR q); t.val i, v) 117 | 118 | proc mgetOrPut*[K,Q,V](t: var VPOat[K,Q,V], q: Q, v: V): var V = 119 | t.upSert q, i, UP=(t.val i), SERT=(t.key(i, t.keyR q); t.val i, v) 120 | 121 | proc getOrDefault*[K,Q,V](t: VOat[K,Q,V], q: Q, def=default(V)): V = 122 | if (var d: Hash; let i = oatSlot(t, q, q.hash, d); i >= 0): t.val i else: def 123 | 124 | iterator items*[K,Q](t: Oat[K,Q]): K = 125 | for i in 0 ..< t.cap: (if t.used i: yield t.key i) 126 | 127 | iterator values*[K,Q,V](t: VOat[K,Q,V]): V = 128 | for i in 0 ..< t.cap: (if t.used i: yield t.val i) 129 | 130 | iterator pairs*[K,Q,V](t: VOat[K,Q,V]): (K, V) = 131 | for i in 0 ..< t.cap: (if t.used i: yield (t.key i, t.val i)) 132 | 133 | iterator topByVal*[K,Q,V](s:VOat[K,Q,V], n=10,min=V.low,order=topk.Cheap):(K,V)= 134 | ## Yield biggest `n` items by value >= `min` in `s` in `order`. 135 | var t = initTopK[(V,K)](n) 136 | for k, v in oats.pairs(s): (if v >= min: t.push (v, k)) 137 | for e in topk.maybeOrdered(t, order): yield (e[1], e[0]) 138 | 139 | template oatKStack*(s, Self, Cell, off, offT, K, Q) = ## Defs for stacked varLen 140 | ## `string`-like keys backed by `string`-like `s` w/`cligen/MSlice`-like `Q`. 
141 | proc mem(c: Cell): pointer = s[c.off].addr # Accessor 142 | proc key(c: Cell): K = K(mem: c.mem, len: c.len.int) # AccessorForUsr-Accessor 143 | proc keyQ(t: Self, k: K): Q = k 144 | proc keyR(t: Self, q: Q): K {.used.} = q 145 | proc eq(t: Self, a: K, b: K|Q): bool = 146 | when b is K: a == b # Compare internal as ints for faster resize 147 | else: t.keyQ(a) == b # Compare Q bytes with memcmp 148 | template add(b; k: Q; limit: int; fail): untyped = 149 | if b.len + k.len <= limit: # Ensure addr space 150 | let off = b.len; b.setLen off + k.len # Ensure alloc room 151 | if k.len > 0: copyMem b[off].addr, k.mem, k.len # Maybe copy 152 | offT(off) # Yield new offset 153 | else: fail 154 | 155 | template oatSeq*(Self, dat) = ## Add routines for `seq`-ish `Self` 156 | proc cap(t: Self): int = t.dat.len 157 | proc newOfCap(t: Self, n: int): Self = result.dat.setLen n 158 | proc copy(t: var Self, i: int, d: Self, j: int) = t.dat[i] = d.dat[j] 159 | proc setNew(t, d: var Self) = swap t.dat, d.dat # efficient t=d (& d=t) 160 | 161 | template oatCounted*(t, Self, cDotPop) = ## Add inUse for var maybe-ref'd off t. 162 | proc inUse(t: var Self, n: int) = cDotPop = typeof(cDotPop)(n) #TODO user grow policy 163 | proc inUse(t: Self): int {.used.} = cDotPop.int 164 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | While this began just as a kind of adaptive index/hash table library, it has 2 | grown into more a collection in the theme of database/big-data related data 3 | structures & algorithms. { Let's say the "ad" in "adix" now stands for 4 | "ADvanced" | "AscenDant" as well as "adaptive" ;-) } Most of these are *à la 5 | carte* and I hope you find them useful. I try to keep the source code short & 6 | to the point. In particular, as an overview/index, here be: 7 | 8 | - The original associative lookup modules: 9 | - [ditab](https://c-blake.github.io/adix/adix/ditab.html): direct-indexed 10 | table for small universe keys 11 | - [lptabz](https://c-blake.github.io/adix/adix/lptabz.html): a hash table 12 | that should be safe for untrusted key sets 13 | - [metab](https://c-blake.github.io/adix/adix/metab.html): an instantiation/ 14 | table portability helper for lptabz 15 | - [btree](https://c-blake.github.io/adix/adix/btree.html): a B-Tree with 16 | various optional features (histogram, ranks, bulk loading, etc.) 17 | 18 | - Basic Sorting: [nsort](https://c-blake.github.io/adix/adix/nsort.html) 19 | Radix sort only by NEEDED BITS; Often 5-10X faster than `algorithm.sort` if 20 | you sort a lot of meso-scale data (merge sorts *always* win for HUGE data; 21 | Very few have it). (Could buffer writes to ensure full cache-line pokes.) 22 | 23 | - Basic Sketches (abbreviated/approximate stores; aka "Digests") for: 24 | - Membership: [bltab](https://c-blake.github.io/adix/adix/bltab.html) 25 | (bit-level table; Like more successfully marketed Bloom|Cuckoo filters, 26 | but lower latency[^1] & ~2X bigger) 27 | - Count Distinct: [uniqce](https://c-blake.github.io/adix/adix/uniqce.html) 28 | aka count unique or cardinality estimation 29 | - Approx Most Often: [amoft](https://c-blake.github.io/adix/adix/amoft.html) 30 | (aka approximate top-K most frequent | heavy-hitters) 31 | 32 | - Distributions/Quantiles: 33 | * [tdigest](https://c-blake.github.io/adix/adix/tdigest.html) (for slower, 34 | more accurate in tail only quantiles (medians generalized). 
35 | * for a more complete & adaptive picture, you want accuracy-everywhere / 36 | full histograms able to realize fast moving quantile transforms backed 37 | by [Fenwick/BIST trees](https://c-blake.github.io/adix/adix/bist.html) 38 | with time kernels that are flat, 39 | [linear](https://c-blake.github.io/adix/adix/lmbist.html) or 40 | [exponential](https://c-blake.github.io/adix/adix/embist.html) or [simple 41 | histograms](https://c-blake.github.io/adix/adix/hist.html). 42 | * Optional discretizing for binning via logs/sqrt/whatever values gives 43 | [lghisto](https://c-blake.github.io/adix/adix/lghisto.html), a high 44 | dynamic range (HDR) module that handles that one-stop shopping style or 45 | [xhist1](https://c-blake.github.io/adix/adix/xhist1.html), its 46 | generalization to any transform|backing histogram/time kernel. 47 | - An amalgam: [`mvstat`](https://c-blake.github.io/adix/adix/mvstat.html) 48 | that works like `std/stats` but supports `del`, i.e. sliding/moving/rolling 49 | windows over data streams (like moving averages) as well as running/dynamic 50 | quantiles via `lghisto`. Also includes bulk array stats that in some compile 51 | modes get fully SIMD vectorized inner loops. 52 | 53 | And some utility modules: 54 | - [althash](https://c-blake.github.io/adix/adix/althash.html): salt-able 55 | alternate hash functions for lptabz 56 | - [sequint](https://c-blake.github.io/adix/adix/sequint.html): a fixed stride 57 | "bit matrix" using "batch"/number ops. 58 | - [memutil](https://c-blake.github.io/adix/adix/memutil.html): memory shifting 59 | utilities 60 | - [cumsum](https://c-blake.github.io/adix/adix/cumsum.html): parallel prefix 61 | sum using Intel SIMD for nsort 62 | - [bitop](https://c-blake.github.io/adix/adix/bitop.html): re-impl std/bitops 63 | things to be more CT friendly / provide bitwise updating operators 64 | - [topk](https://c-blake.github.io/adix/adix/topk.html): spends 2X the (small) 65 | space of `std/heapqueue`-based top-k stream algo to scale O(lg k) better via 66 | what one might call "buffered quickselect". 67 | - [lna](https://c-blake.github.io/adix/adix/lna.html): natural log(abs(x)) 68 | approximation more tunable with regard to speed-precision trade-offs with 69 | 5-levels of work (3..11 series terms)/precision (11..24 bits). It's not 70 | always faster, but is more reliably fast and more tunable than libc using 2 71 | simple ideas: IEEE exponent to narrow problem & two series for the remaining 72 | near-unity interval. 73 | - [ways](https://c-blake.github.io/adix/adix/ways.html): Various algos. 74 | Presently, scalable, k-way ordered merge. 75 | 76 | A Word/Paragraph Of Caution 77 | =========================== 78 | While sketches are popular, like merge sort (vs. radix sort), they often need 79 | huge data to pay off. Essentially, probabilistic analysis ("Oh wow, I can do 80 | that?!") distracts from more practical space-time trade-offs. This distraction 81 | is worsened by there being space-time-accuracy "trade-off pyramids". This 82 | results in an odd state of affairs where I can say here "spending *a bit* more 83 | space can yield major speed-ups", and it sounds blatantly obvious to even the 84 | most casual observer. *Yet* such is also neglected in context countless times. 
The academic literature does not help, often being "blood sport" for more 86 | compressed data | accuracy with no regard to speed.[^2] 87 | 88 | So, e.g., on my primary 32 GiB RAM dev box with `bu/zipf`, I cannot make exact 89 | `lfreq` slower than Approximately Most Often sketches (`bu/oft`). `tests/bl.nim` 90 | shows another example (also written up 91 | [here](https://blog.cloudflare.com/when-bloom-filters-dont-bloom/) in a Bloom 92 | filter / membership approximation context) where spending 2-4X what a Bloom takes 93 | space-wise can buy a 7-10X latency shrink.[^1] (Histograms & UCE are both pretty 94 | good deals, though, if errors are acceptable, and `adix/bltab` with fingerprint 95 | keys is arguably just "a better 'sketch' "). 96 | 97 | A little more on LPTabz & friends 98 | ================================= 99 | As a brief guide I would start with [`NOTES.md`](NOTES.md) and then look at the 100 | top part of the [`lptabz` doc](https://c-blake.github.io/adix/adix/lptabz.html). 101 | [`TODO.md`](TODO.md) also has a lot of notes in it. My overarching vision is to 102 | allow "the fast way" most of the time, especially for developers who know how 103 | to provide a good `hash`, but to also have auto fallbacks to "safer ways" with 104 | optional messages to let devs know they may need to intervene by changing some 105 | defaults at table construction time (or else let users/sysadmins know that some 106 | input may be violating the assumptions of some code sensitive to inputs). 107 | Commercial database systems may have done this for decades, but it hasn't really 108 | percolated into commonly available runtime libs. (Depth-based growth trigger is 109 | likely the simplest example of Profile-Guided Optimization for data structures. 110 | A. Dain Samples' 1993 PhD thesis has some more.) 111 | 112 | [^1]: Note that hardware memory systems got more sophisticated about speculative 113 | workahead execution and parallel fetch which can mask most or all of the extra 114 | latency in a hot loop benchmark, but this is still "more work/mem bandwidth" 115 | competing with other work you *might* want a CPU to be doing instead, and The 116 | Memory Wall has been around for like 30 years now. Also, the bonus Robin-Hood 117 | Linear Probing adds over Cuckoo is graceful degradation with weak hashes - a 118 | real risk whenever you let users pick `hash` -- which you kind of _must_. 119 | 120 | [^2]: The basic issue seems to be a need for apparent novelty over practicality 121 | to interest peer reviewers. Besides weak motivation, "expert" committees only 122 | have whatever narrow exposure they have to various domains of ideas/assumption 123 | frameworks. With human psychology & incentives this leads to research fads/gobs 124 | of work & "many new names for old ideas" to sift through in deep dives. One 125 | hoped search engines/LLMs might have eased such sifting, but it seems harder, 126 | perhaps because [synonym density](https://github.com/c-blake/thes) is simply too 127 | great and more folks are at work. 128 | -------------------------------------------------------------------------------- /adix/embist.nim: -------------------------------------------------------------------------------- 1 | ##[ This is like `bist.nim`, but *grows* weight added to counters (some kind of 2 | floating point) exponentially as new updates happen. Up to floating point 3 | arithmetic, this is equivalent to `ctrs vector*= wOld; ctr[i] += 1`.
Doing it 4 | this way allows BIST use for efficient exponentially weighted moving quantile. 5 | It does need rescaling BUT this can be made rare. For wOld=0.9, 1.11^(6736|842) 6 | =~ 1.8e308|3.4e38 ~= dbl_max. Re-scaling to tinies can ~2X that to ~13k|1.7k 7 | data points. Rescaling is very CPU-vectorizable & so should be 8..16x\*lg(nBin) 8 | =~ 80..320X faster per it; <=~2X to total amortized cost at <136..270,000 bins. 9 | 10 | Like EWMAs, IF YOU NEVER `dec` then this filter has technically infinite, yet 11 | *usually* short time-memory, but can operate only with a small histogram space. 12 | EWMoving Median inherits a usual high breakdown point/robustness in that, even 13 | if infinite memory/having enduring time-breakdown, influencing the median takes 14 | *MUCH* more than one wild data point long ago in history - it takes a downright 15 | different epoch / non-stationarity which is not nearly as bothersome. 16 | 17 | Also, note that weighting styles of distributions & averages are analogous but 18 | distinct. `*=wOld,+=(1-wOld)*x` is a normalized average, but weight in *distros* 19 | is about "faked-repetition" of relative weight. So, while things like half-life 20 | & lag are the same formulae, meaning varies since distros have MANY interacting 21 | relative weights. (See `LMBist` for linear weight moving medians w/non-flat 22 | recency weight + *strict* windows or `Bist` for flat, strict windows.) The API 23 | is the same as `Bist[F]`. The bottom of this module has a small test/timing 24 | program showing the differences. 25 | 26 | Happy to cite someone, but as far as I can tell, this is a novel (though fairly 27 | obvious) application of Fenwick BISTs for a fast EWMMedian transform. Luxenberg 28 | & Boyd (2024) "Exponentially Weighted Moving Models" does something ~100X more 29 | complex & surely slower than the one-pass O(n\*lg nBin) (20 ns/item!) way done 30 | here. Coleman at https://randorithms.com/2022/01/31/streaming-histograms.html 31 | has some nice animations, but it has pedagogical/poorly scaling O(n\*nBin) code. 32 | It seems likely someone doing big data analytics has this somewhere, though and 33 | I am happy to give credit when due. Similarly, please cite this github repo if 34 | this code inspires your work. ]## 35 | import adix/bist, std/math 36 | template maxFinite(T: typedesc[SomeFloat]): T = # Should be in std/math, IMO 37 | when T is float32: 3.4028235e+38'f32 38 | elif T is float64 or T is float: 1.7976931348623157e+308'f64 39 | 40 | type EMBist*[F: SomeFloat] = object ## Exponentially weighted moving distrib. 41 | cnt: Bist[F] # Raw count; This F *could* become its own generic param. 42 | w, grow: float64 # Running weight next data point will add, growth factor. 43 | 44 | proc len*[F](d: EMBist[F]): int = d.cnt.data.len ## Number of bins & bytes 45 | func space*[F](d: EMBist[F]): int = d.sizeof + d.cnt.space 46 | proc tot*[F](d: EMBist[F]): F = d.cnt.tot ## Raw total 47 | proc count*[F](d: EMBist[F]): F = d.tot ## Total Weight 48 | 49 | proc init*[F](d: var EMBist[F]; len: int, wOld: float=0.9375) = 50 | d.cnt.init len; d.w = 1.0; d.grow = F(1/wOld) # start w at 1/thresh? 
51 | proc initEMBist*[F](len: int, wOld: float): EMBist[F] = result.init len, wOld 52 | proc clear*[F](d: var EMBist[F]) = d.cnt.clear; d.tot = 0.0 53 | 54 | proc inc*[F](d: var EMBist[F]; i: int, w: F=1) = ## Add weight `w` to bin `i` 55 | const lim = F.maxFinite/1e9 # 1e9 just to leave some room for `w` variation 56 | const scl = 1/lim 57 | # Can pair up *= scl (ensuring multiplier stays FP representable), but this is 58 | # ~pointless since it 2x's BOTH rarity AND cost. Subtracting 2*lg lim from 59 | # binary exponents is pure rarity savings *IF* it can be vectorized similarly. 60 | d.cnt.inc i, w*d.w 61 | if d.cnt.tot > lim: # Re-scale so future adds do not overflow 62 | for c in mitems d.cnt.data: c *= scl # Both BIST counts.. 63 | d.cnt.tot *= scl; d.w *= scl #..and meta-data. 64 | d.w *= d.grow 65 | 66 | proc scale*[F](d: EMBist[F]; age: int): F = d.grow^(-age) 67 | ## Scale for more rare un-count old; Can re-use if dec @same relative age. 68 | proc dec*[F](d: var EMBist[F]; i: int; w: F=1) = d.cnt.dec i, w*d.w 69 | ## Un-count-old operation for more rare EW with strict windows; Use .scale! 70 | 71 | proc up*[F](d: var EMBist[F]) = discard ## Simple no-op for EMBist 72 | 73 | proc cdf*[F](d: EMBist[F], i: int): F = d.cnt.cdf(i) / d.count ## wrap Bist.cdf 74 | proc pmf*[F](d: EMBist[F], i: int): F = d.cnt.pmf(i) / d.count ## wrap Bist.pdf 75 | proc invCDF*[F](d: EMBist[F], s: F; s0: var F): int = d.cnt.invCDF s, s0 76 | ## wrap Bist.invCDF 77 | proc invCDF*[F](d: EMBist[F]; s: F; s0,s1: var F): int = d.cnt.invCDF s, s0,s1 78 | ## wrap Bist.invCDF 79 | proc min*[F](d: EMBist[F]): int = d.cnt.min ## Simple wrapper of `Bist.min`. 80 | proc max*[F](d: EMBist[F]): int = d.cnt.max ## Simple wrapper of `Bist.max`. 81 | proc quantile*[F](d: EMBist[F]; q: float; iL,iH: var int): float = ## wrap Bist.quantile 82 | d.cnt.quantile q, iL,iH 83 | proc quantile*[F](d: EMBist[F]; q: float): float = d.cnt.quantile q ## wrap Bist.quantile 84 | 85 | proc nPDF*[F](d: EMBist[F]): seq[F] = 86 | result.setLen d.cnt.len;let s=1.0/d.tot;for i,r in mpairs result:r = s*d.pmf(i).F 87 | 88 | proc nCDF*[F](d: EMBist[F]): seq[F] = 89 | result.setLen d.cnt.len;let s=1.0/d.tot;for i,r in mpairs result:r = s*d.cdf(i).F 90 | 91 | when isMainModule: 92 | type F = float64 93 | const slow {.booldefine.} = false # VERY limited differences below 94 | when not declared addFloat: import std/[syncio, formatFloat] 95 | import std/[times, strformat], cligen, cligen/sysUt 96 | proc embist(xs: seq[int], wOld=0.75, q = -2.0, pdf=false,cdf=false,time=false, 97 | xMn=0,xMx=7) = 98 | template toI(x): untyped = max(xMn, min(xMx, x)) - xMn # Clip & shift 99 | if wOld <= 0: Value !! "wOld " & $wOld & " too small" 100 | if wOld >= 1: Value !! 
"wOld " & $wOld & " too big" 101 | when slow: (var d = initBist[F](xMx - xMn + 1)) 102 | else : (var d = initEMBist[F](xMx - xMn + 1, wOld)) 103 | let t0 = epochTime() 104 | var tQ = 0.0 # Report avg qtl to ensure compiler cannot elide 105 | for t, x in xs: 106 | let x = x.toI # xOld frm Deque=moreGeneral 107 | when slow: # On full data win, decay ALL old weight 108 | for cnt in mitems d.data: cnt *= wOld # BIG LOOP 109 | d.tot *= wOld 110 | d.inc x, 1.0 # Unit entering weight 111 | else: # Remove weight for leaving data point 112 | d.inc x, 1.0 # Unit entering weight 113 | if pdf: echo t," b: tot: ",d.tot," ewmPMF: ",d.nPDF 114 | if cdf: echo t," b: tot: ",d.tot," ewmCDF: ",d.nCDF 115 | if q > -2.0: 116 | if time: tQ += d.quantile(q) # `formatFloat` slow=>just total 117 | else: echo d.quantile(q) # Report inverseCDF(q) 118 | if time: 119 | let n = xs.len.float; let dt = (epochTime() - t0)*1e9/n 120 | stderr.write &"n: {xs.len} ns/no: {dt:.1f} w: {wOld} : {tQ/n}\n" 121 | 122 | dispatch embist, short={"xMn":'a', "xMx":'b'}, help={"xs": "x values", 123 | "wOld": "per-update decay factor" , "q" : "quantile to report; 0.5=median", 124 | "pdf" : "print PDF each time step", "cdf": "print CDF each time step", 125 | "time": "print timing statistics", 126 | "xMn" : "`xs[i]` clipped to this `a` on `[a, xs]`", 127 | "xMx" : "`xs[i]` clipped to this `b` on `[xs, b]`"} 128 | #[ A Zsh session showing basic correctness&boost of optimization. Sets up env, 129 | compiles ref & optimized; makes nums; Tests various q & w; Finally measures 'em. 130 | nim=(nim c -d:danger); t=/tmp/nums # Set up 131 | $nim -d:slow -o=sembist embist; $nim embist 132 | ( for i in {1..10000}; printf " %s" $((RANDOM%8)) ) > $t 133 | ( for q in .1 .25 .5 .75 .9; { for x in {1..9}; { w=$[10./(10.+x)] 134 | paste <(./sembist -w$w -q.1 `<$t`) <(./embist -w$w -q.1 `<$t`) | 135 | awk '{print $1-$2}' | sort -g | tails -h1 -t1 }}) 2>/dev/null|unfold -n3 136 | ./sembist -b8300 -tw.99 -q.5 `<$t`; ./embist -b8300 -tw.99 -q.5 `<$t` 137 | I get NO DIFF between ref & optimized, optimized about 112X faster at -b8300 and 138 | always faster, even at -b2. ]# 139 | -------------------------------------------------------------------------------- /adix/cumsum.nim: -------------------------------------------------------------------------------- 1 | import adix/cpuCT 2 | 3 | proc `[]`[C, W](h: ptr UncheckedArray[C], i: W): var C {.inline.} = 4 | cast[ptr C](cast[uint](h) + i.uint * C.sizeof.uint)[] 5 | 6 | proc cumsum*[T](c: ptr UncheckedArray[T]; n: uint) = 7 | for i in 1 ..< n: 8 | c[i] += c[i - 1] 9 | 10 | #NOTE: SSSE3 => SSE2 which is also used. 
11 | when defined(amd64) and not defined(noSIMD) and x86ssse3 in x86features: 12 | when defined(cpuPrefetch): import cligen/prefetch 13 | 14 | template workToAligned(c, n, i, align: untyped) {.dirty.} = 15 | i = 1 16 | while i < n and (cast[uint](c[i].addr) and (align - 1)) != 0: 17 | c[i] += c[i - 1] 18 | i.inc 19 | 20 | template workRemainder(c, n, i: untyped) {.dirty.} = 21 | while i < n: 22 | c[i] += c[i - 1] 23 | i.inc 24 | 25 | type m128i {.importc: "__m128i", header: "emmintrin.h".} = object 26 | proc mm_load(adr: ptr m128i): m128i {.importc: "_mm_load_si128", nodecl, header: "emmintrin.h".} 27 | proc mm_store(adr: ptr m128i, val: m128i) {.importc: "_mm_store_si128", nodecl, header: "emmintrin.h".} 28 | proc mm_set1(ch: uint8): m128i {.importc: "_mm_set1_epi8", nodecl, header: "emmintrin.h".} 29 | proc mm_set1(wd: uint16): m128i {.importc: "_mm_set1_epi16", nodecl, header: "emmintrin.h".} 30 | proc mm_set1(dw: uint32): m128i {.importc: "_mm_set1_epi32", nodecl, header: "emmintrin.h".} 31 | proc mm_add_epi32(a, b: m128i): m128i {.importc: "_mm_add_epi32", nodecl, header: "emmintrin.h".} 32 | proc mm_shuffle_epi32(a: m128i, msk: cint): m128i {.importc: "_mm_shuffle_epi32", nodecl, header: "emmintrin.h".} 33 | proc mm_add_epi8(a, b: m128i): m128i {.importc: "_mm_add_epi8", nodecl, header: "emmintrin.h".} 34 | proc mm_add_epi16(a, b: m128i): m128i {.importc: "_mm_add_epi16", nodecl, header: "emmintrin.h".} 35 | proc mm_slli_si128(a: m128i, n: cint): m128i {.importc: "_mm_slli_si128", nodecl, header: "emmintrin.h".} 36 | proc mm_shuffle_epi8(a, b: m128i): m128i {.importc: "_mm_shuffle_epi8", nodecl, header: "tmmintrin.h".} 37 | 38 | proc cumsum*(c: ptr UncheckedArray[uint8]; n: uint) = 39 | var i = n 40 | workToAligned(c, n, i, 16) #Loop to next 16B align 41 | let n64 = i + ((n - i) and not 63'u64) #Round dn to mult of 64 42 | var off = mm_set1(c[i - 1]) #Initial off=last c[]. 43 | let msk = mm_set1(15'u8) 44 | var v0, v1, v2, v3: m128i #SSE vecs 45 | template do16(v, b: untyped) {.dirty.} = 46 | v = mm_load(cast[ptr m128i](c[b + i].addr)) 47 | v = mm_add_epi8(v, mm_slli_si128(v, 1)) #0+1 1+2 2+3 3+4 4+5 5+6 6+7 7+8 8+9 9+A A+B B+C C+D D+E E+F F 48 | v = mm_add_epi8(v, mm_slli_si128(v, 2)) #0..3 1..4 2..6 3..6 4..7 5..8 6..9 7..A 8..B 9..C A..D B..E C..F D..F E+F F 49 | v = mm_add_epi8(v, mm_slli_si128(v, 4)) #0..7 1..8 2..9 3..A 4..B 5..C 6..D 7..E 8..F 9..F A..F B..F C..F D..F E+F F 50 | v = mm_add_epi8(v, mm_slli_si128(v, 8)) #0..F 1..F 2..F 3..F 4..F 5..F 6..F 7..F 8..F 9..F A..F B..F C..F D..F E+F F 51 | v = mm_add_epi8(v, off) #Add in offset 52 | mm_store(cast[ptr m128i](c[b + i].addr), v) #Update array 53 | off = mm_shuffle_epi8(v, msk) #off=bcast high elt 54 | while i < n64: #1-cache line at a time 55 | when defined(cpuPrefetch): prefetchw(c[i + 64].addr) 56 | do16(v0, 0) 57 | do16(v1, 16) 58 | do16(v2, 32) 59 | do16(v3, 48) 60 | inc(i, 64) #XXX After cache-line loop, could do few more vectorized doX's 61 | workRemainder(c, n, i) #Loop to end 62 | 63 | proc cumsum*(c: ptr UncheckedArray[uint16]; n: uint) = 64 | var i = n 65 | workToAligned(c, n, i, 16) #Loop to next 16B align 66 | let n32 = i + ((n - i) and not 31'u64) #Round dn to mult of 32 67 | var off = mm_set1(c[i - 1]) #Initial off=last c[] 68 | let msk = mm_set1(0x0F0E'u16) #s.t. 
shuffle_epi8 =~ shuffle_epi16 69 | var v0, v1, v2, v3: m128i #SSE vectors 70 | template do8(v, b: untyped) {.dirty.} = 71 | v = mm_load(cast[ptr m128i](c[b + i].addr)) 72 | v = mm_add_epi16(v, mm_slli_si128(v, 2)) #0+1 1+2 2+3 3+4 4+5 5+6 6+7 7 73 | v = mm_add_epi16(v, mm_slli_si128(v, 4)) #0..3 1..4 2..5 3..6 4..7 5..7 6+7 7 74 | v = mm_add_epi16(v, mm_slli_si128(v, 8)) #0..7 1..7 2..7 3..7 4..7 5..7 6+7 7 75 | v = mm_add_epi16(v, off) #Add in offset 76 | mm_store(cast[ptr m128i](c[b + i].addr), v) #Update array 77 | off = mm_shuffle_epi8(v, msk) #off=bcast high elt 78 | while i < n32: #1-cache line at a time 79 | when defined(cpuPrefetch): prefetchw(c[i + 32].addr) 80 | do8(v0, 0) 81 | do8(v1, 8) 82 | do8(v2, 16) 83 | do8(v3, 24) 84 | inc(i, 32) 85 | workRemainder(c, n, i) #Loop to end 86 | 87 | proc cumsum*(c: ptr UncheckedArray[uint32]; n: uint) = 88 | var i = n 89 | workToAligned(c, n, i, 16) #Loop to next 16B align 90 | let n16 = i + ((n - i) and not 15'u64) #Round dn to mult of 16 91 | var off = mm_set1(c[i - 1]) #Initial off=last c[] 92 | var v0, v1, v2, v3: m128i #SSE vectors 93 | const msk = 0xFF.cint 94 | template do4(v, b: untyped) {.dirty.} = 95 | v = mm_load(cast[ptr m128i](c[b + i].addr)) 96 | v = mm_add_epi32(v, mm_slli_si128(v, 4)) #0+1 1+2 2+3 3 97 | v = mm_add_epi32(v, mm_slli_si128(v, 8)) #0+1+2+3 1+2+3 2+3 3 98 | v = mm_add_epi32(v, off) #Add in offset 99 | mm_store(cast[ptr m128i](c[b + i].addr), v) #Update array 100 | off = mm_shuffle_epi32(v, msk) #off=bcast high elt 101 | while i < n16: #1-cache line at a time 102 | when defined(cpuPrefetch): prefetchw(c[i + 16].addr) 103 | do4(v0, 0) 104 | do4(v1, 4) 105 | do4(v2, 8) 106 | do4(v3, 12) 107 | inc(i, 16) 108 | workRemainder(c, n, i) #Loop to end 109 | 110 | proc cumsum*(c: ptr UncheckedArray[uint64]; n: uint) = 111 | for i in 1 ..< n: #Could speed this up, but time will never be big cmp to 112 | c[i] += c[i - 1] #..moving billions of items around in a counting sort. 113 | 114 | proc cumsum*[T](c: var openArray[T]) {.inline.} = 115 | cumsum(cast[ptr UncheckedArray[T]](c[0].addr), c.len.uint) 116 | 117 | proc cumsum*[T](c: ptr T, n: uint) {.inline.} = 118 | cumsum(cast[ptr UncheckedArray[T]](c), n) 119 | 120 | when isMainModule: 121 | import std/[random, times, stats], cligen 122 | when not declared(stderr): import std/[syncio, formatfloat] 123 | 124 | proc gen[T](x: var openArray[T]; low, range: int) = 125 | for i in 0 ..< x.len: x[i] = low.T + rand(range.float).T 126 | 127 | type Kind = enum kU1, kU2, kU4, kU8 128 | 129 | proc doIt[T](k: T, n=9, low=0, range=9, bench=false, minN=5, avgN=5, 130 | data: seq[uint64]) = 131 | randomize() 132 | var x = newSeq[T](n) 133 | var t = x 134 | var dtsMin: RunningStat 135 | for avgIt in 1..avgN: 136 | var dtMin = float.high 137 | for minIt in 1..minN: 138 | if data.len > 0: #Allow passing specific data for reproducible debug 139 | for i in 0 ..< x.len: x[i] = data[i].T 140 | else: 141 | x.gen low, range 142 | if not bench: t = x 143 | let t0 = epochTime() 144 | x.cumsum 145 | dtMin = min(dtMin, (epochTime() - t0) * 1e9) 146 | if not bench: 147 | var t0 = t 148 | for i in 1 ..< n: 149 | t[i] += t[i - 1] 150 | for i in 1 ..< n: 151 | if x[i] != t[i]: 152 | echo "bad cumsum at: ",i," x: ",x[i]," shouldBe: ",t[i]; echo "" 153 | echo "x0[]: ", t0; echo "" 154 | echo "t[]: ", t; echo "" 155 | echo "x[]: ", x 156 | return 157 | dtsMin.push dtMin 158 | echo "n: ", n, " ", $T, " cumsum_ns: ", dtsMin.min, " .. 
", dtsMin.max 159 | 160 | proc testTime(kind=kU1, n=256, low=0, range=128, bench=false, minN=9, avgN=9, 161 | data: seq[uint64]) = 162 | case kind 163 | of kU1: doIt(0'u8 , n, low, range, bench, minN, avgN, data) 164 | of kU2: doIt(0'u16, n, low, range, bench, minN, avgN, data) 165 | of kU4: doIt(0'u32, n, low, range, bench, minN, avgN, data) 166 | of kU8: doIt(0'u64, n, low, range, bench, minN, avgN, data) 167 | 168 | dispatch(testTime, cmdName="cumsum") 169 | -------------------------------------------------------------------------------- /adix/lghisto.nim: -------------------------------------------------------------------------------- 1 | ##[ `LgHisto` is an application of BISTs to histograms of logs giving efficient, 2 | dynamic quantiles. Logs give high dynamic range at low cost while Fenwick/BIST 3 | supports dynamic membership w/operation-balanced perf. 4 | 5 | Quantile error is absolute { not relative to `q*(1-q)` like t-Digests } & easily 6 | bounded as <~ 1/2 bin width { ~ 10^(log_10(b/a)/n) }. If you need 3 places or 7 | your data is clustered within a few orders of magnitude then you can likely just 8 | use 1e4 bins & counters will remain L1 cache resident, depending on resource 9 | competition. Cache is the main cost Re: speed. Re: space, since 99% of bins 10 | are 0 in many cases, simple run-length encoding can improve net/disk transfers. 11 | 12 | The way Fenwick BISTs work, the generic parameter `C` must be a wide enough int 13 | type to hold both elemental bin counts AND cumulatives. `uint32` is likely 14 | enough for many applications, though some might sneak by with `uint16` and a few 15 | might need `uint64`. This scales bin size/space cost. 16 | 17 | t-Digests are a well marketed competitor using ~10X less space BUT with >>5X 18 | slower quantiles of similar accuracy. Actual costs are sensitive to operation 19 | mixes. { This may change, but present t-Digest impls, even with trees, linear 20 | scan for quantile/CDFs. None even offer "batch" APIs to do N quantiles in one 21 | such scan. "Histo B-trees" should allow better scaling for such. } A BIST basis 22 | differs from t-Digests in other important ways. E.g., BISTs are well suited for 23 | `pop` (or moving data window ops) with *strict* finite memory to, e.g. 
translate 24 | full streams to moving quantiles as in Bollinger Band-style smooths.]## 25 | 26 | when not declared(addFloat): import std/formatfloat 27 | import adix/[bist, lna]; from std/math import exp, isNaN 28 | type 29 | LgHisto*[C] = object ## Log-spaced histogram with `Bist[C]` backing 30 | n: int # number of bins 31 | a, b: float # histogram covers [-b, -a], (-a, a) in zero, [a, b] 32 | aLn, h, hInv: float # index scale conversion pre-computes 33 | bist: Bist[C] # actual smart array of counters: [0, 2*n] -> PMF/CDF 34 | 35 | func underflows*[C](s: LgHisto[C]): int = s.bist.pmf 0 36 | func overflows*[C](s: LgHisto[C]): int = s.bist.pmf 2*s.n 37 | func low*[C](s: LgHisto[C]): float = s.a 38 | func high*[C](s: LgHisto[C]): float = s.b 39 | func nBin*[C](s: LgHisto[C]): int = s.n 40 | func bist*[C](s: LgHisto[C]): Bist[C] = s.bist 41 | 42 | func init*[C](s: var LgHisto[C], a=1e-16, b=1e20, n=8300) = 43 | ## Init histo w/2n+1 log-spaced bins: `[-∞..-b; -b..-a; 0; a..0") 46 | if n < 2: raise newException(ValueError, "n must >= 2") 47 | s.n = n 48 | s.a = a 49 | s.b = b 50 | s.aLn = lna(a) 51 | s.h = (lna(b) - lna(a))/float(n - 1) 52 | s.hInv = 1.0/s.h 53 | s.bist = initBist[C](2*n + 1) 54 | 55 | func initLgHisto*[C](a=1e-16, b=1e20, n=8300): LgHisto[C] = result.init a, b, n 56 | ## Get Histo w/2n+1 log-spaced bins: `[-inf..<-b; -b..<-a; 0; a..= -s.b: result = s.n - 1 - int( (lna(-x) - s.aLn)*s.hInv) 67 | else : result = 0 68 | elif x >= +s.a: 69 | if x <= +s.b: result = s.n + 1 + int( (lna(+x) - s.aLn)*s.hInv) 70 | else : result = 2*s.n 71 | else: result = s.n 72 | 73 | func fromIx*[F,C](s: LgHisto[C], i: int, offset: F=0.5): F = 74 | ## Geometric mean of left&right edge log-shifted `offset` fraction into bin 75 | if i < s.n: -exp(s.aLn + s.h*(F(s.n - i - 1) + F(1) - offset)) 76 | elif i > s.n: +exp(s.aLn + s.h*(F(i - s.n - 1) + offset)) 77 | else: 0.0 # The bin containing x=zero cannot really be offset in the same way 78 | 79 | func binAB*[F,C](s: LgHisto[C], x: F): (float, float) = 80 | ## Range in data space of the bin containing `x`; Costs 2 `fromIx`s. 
81 | let i = s.toIx(x) 82 | if i == 0 : result[0] = -Inf ; result[1] = -s.b 83 | elif i == 1 : result[0] = -s.b ; result[1] = s.fromIx(i, 1.0) 84 | elif i == 2*s.n-1: result[0] = s.fromIx(i, 0.0); result[1] = +s.b 85 | elif i == 2*s.n : result[0] = +s.b ; result[1] = +Inf 86 | elif x < -s.a : result[0] = s.fromIx(i, 0.0); result[1] = s.fromIx(i, 1.0) 87 | elif x >= +s.a : result[0] = s.fromIx(i, 0.0); result[1] = s.fromIx(i, 1.0) 88 | else : result[0] = -s.a ; result[1] = +s.a 89 | 90 | func add*[F,C](s: var LgHisto[C], x: F, w=1.C) = 91 | ## Increment bin for value `x` by weight `w` 92 | if not isNaN(x): s.bist.inc(s.toIx(x), w) 93 | 94 | func pop*[F,C](s: var LgHisto[C], x: F, w=1.C) = 95 | ## Alias for `add` with a negative weight argument 96 | if not isNaN(x): s.bist.dec(s.toIx(x), w) 97 | 98 | iterator bins*[C](s: LgHisto[C]): (float, float, C) = 99 | ## Yield `(lo, hi, count)` for each bin covered 100 | yield (-Inf, -s.b, s.bist.pmf 0) 101 | yield (-s.b, s.fromIx(1,1.0), s.bist.pmf 1) 102 | for i in 2 ..< s.n: yield (s.fromIx(i,0.0),s.fromIx(i,1.0),s.bist.pmf i) 103 | yield (-s.a, s.a, s.bist.pmf s.n) # middle bin breaks geometric mean formula 104 | for i in s.n+1..<2*s.n-1: yield (s.fromIx(i,0.0),s.fromIx(i,1.0),s.bist.pmf i) 105 | yield (s.fromIx(2*s.n-1,0.0), +s.b, s.bist.pmf 2*s.n-1) 106 | yield (+s.b, +Inf, s.bist.pmf 2*s.n) 107 | 108 | proc `$`*[C](s: LgHisto[C], nonZero=true): string = 109 | ## Formatting operator; Warning: output can be large, esp. if nonZero=false 110 | result.add "n: " & $s.n & "\ta: " & $s.a & "\tb: " & $s.b & "\n" 111 | result.add "aLn: " & $s.aLn & "\th: " & $s.h & "\thInv: " & $s.hInv & "\n" 112 | result.add "bins,cnts:\n" 113 | var tot = 0; var n = 0 114 | for (a, b, c) in s.bins: 115 | let c = int(c); tot += c 116 | if nonZero: 117 | if c != 0: result.add " [ " & $a & " , " & $b & " ): " & $c & "\n"; inc n 118 | else : result.add " [ " & $a & " , " & $b & " ): " & $c & "\n" 119 | result[^1] = '\n' 120 | result.add "totalCount: " & $tot & (if nonZero: " non0Bins: " & $n else: "") 121 | 122 | func quantile*[F,C](s: LgHisto[C], q: F): F = 123 | ## Basic quantile; XXX Can log-spacing savvy interpolation be more accurate? 124 | if q < 0.0 or q > 1.0: return NaN 125 | var iL, iH: int 126 | let fL = s.bist.quantile(q, iL, iH) 127 | fL*s.fromIx(iL) + (1 - fL)*s.fromIx(iH) 128 | 129 | func cdf*[F,C](s: LgHisto[C], x: F): C = 130 | ## Raw count; Leave to caller to multiply by 1/s.bist.count; XXX Interpolate? 131 | if x.isNaN: NaN else: s.bist.cdf(s.toIx(x)) 132 | 133 | func merge*[C](dst: var LgHisto[C], src: LgHisto[C]) = 134 | ## Merge counts from src into dst. 
135 | if src.n != dst.n or src.a != dst.a or src.b != dst.b: 136 | raise newException(ValueError, "src-dst histogram parameter mismatch") 137 | for i in 0..2*src.n: dst.bist.inc i, src.bist.pmf(i) # Flat array can be fastr 138 | 139 | when isMainModule: 140 | when defined(test): # Helpful to run against: \ -12 \ -8 \ -4 \ -1 0 1 4 8 12 141 | proc lghist(a=0.125, b=10.0, n=8, qs = @[0.25, 0.5, 0.75], xs: seq[float]) = 142 | var lh = initLgHisto[uint16](a, b, n) 143 | for x in xs: lh.add x 144 | echo `$`(lh, nonZero=false) 145 | for (a, b, c) in lh.bins: 146 | if (a,b) != lh.binAB((a+b)/2) or a >= b: 147 | echo "a: ",a," b: ",b," c: ",c," ab(mid(a,b)): ",lh.binAB((a+b)/2) 148 | if lh.tot > 0: (for q in qs: echo "q",q,": ",lh.quantile(q)) 149 | import cligen; dispatch lghist 150 | else: 151 | import std/[random, times, strformat] 152 | when defined danger: randomize() 153 | const N = 750_000 154 | var data = newSeq[float32](N) 155 | const Q = [0.001,0.01,0.05,0.1587,0.25,0.50,0.75,0.8413,0.95,0.99,0.999] 156 | var res = newSeq[float32](Q.len) 157 | for i in 0.. 0: lg(numBound) 63 | elif initialSize > 0: lg(initialSize) 64 | else: 0 65 | let bitsz = initialSize * bits 66 | s.data.setLen if bitsz > 0: (roundUp(bitsz) shr iShf) else: 1 67 | s.len = initialSize 68 | s.bits = bits.int8 69 | 70 | proc initSeqUint*(initialSize=0, numBound=0): SeqUint {.inline.} = 71 | result.init(initialSize, numBound) 72 | 73 | template BadIndex: untyped = 74 | when declared(IndexDefect): IndexDefect else: IndexError 75 | 76 | # Consider storing 3 bit numbers packed into 8 bit words big-endian-wise like: 77 | # indices for trad. R2Left ops: 76543210 76543210 76543210 78 | # The layout can be either A) m=1 [....210.] OR B) m=7 [0.......][......21]. 79 | # Goes to bit-algebra `(w shr m) and msk` OR `(w1 and 3) shl 2 or (w0 shr m)` 80 | # where `m == bitix % 8` is the modulus of low order bit index relative to wdsz. 81 | proc `[]`*(s: SeqUint, i: int|uint): uint {.inline.} = 82 | if int(i) >= s.len: 83 | raise newException(BadIndex(), formatErrorIndexBound(int(i), s.len)) 84 | let sbits = uint(s.bits) 85 | let bitix = uint(i) * sbits 86 | let wdix = bitix shr iShf 87 | let wdmod = bitix and iMsk 88 | let bitend = wdmod + sbits 89 | if bitend <= iBit: 90 | result = (s.data[wdix] shr wdmod) and ((1'u shl sbits) - 1) 91 | else: 92 | let w0bit = iBit - wdmod 93 | let oFlow = sbits - w0bit 94 | let oMask = (1'u shl oFlow) - 1 95 | result = ((s.data[wdix+1]and oMask) shl w0bit) or (s.data[wdix] shr wdmod) 96 | 97 | # Reconsider the above bit extraction diagram/example for bit deposit. Here we 98 | # update one or two words. In A) the new value is one 3-way bitwise OR of two 99 | # old & 1 new parts, e.g.: (old and 240)or(num shl 1)or(old and 1) or more 100 | # generally (wd and hiM) or (num shl m) or (wd and mMask) where hiM is the 101 | # complement of the m+3 shift and mMask the mask for m bits. Case B) does two 102 | # bitwise ORs stored to the pair of words. The 1st goes to ((num and 1) shl 7) 103 | # or (w0 and loM) while the 2nd to (w1 and not oMask) or (num shr w0bit). 
104 | proc `[]=`*(s: var SeqUint, i: int|uint, x: int|uint) {.inline.} = 105 | let x = uint(x) and ((1'u shl s.bits) - 1) 106 | if int(i) >= s.len: 107 | raise newException(BadIndex(), formatErrorIndexBound(i, s.len)) 108 | let sbits = uint(s.bits) 109 | let bitix = uint(i) * sbits 110 | let wdix = bitix shr iShf 111 | let wdmod = bitix and iMsk 112 | let bitend = wdmod + sbits 113 | if bitend <= iBit: 114 | let wd = s.data[wdix] 115 | let hiM = if bitend == iBit: 0'u else: (not 0'u) shr bitend shl bitend 116 | let mMask = (1'u shl wdmod) - 1 117 | s.data[wdix] = (wd and hiM) or (x shl wdmod) or (wd and mMask) 118 | else: 119 | let w0bit = iBit - wdmod 120 | let oFlow = sbits - w0bit 121 | let w0 = s.data[wdix] 122 | let w1 = s.data[wdix + 1] 123 | let oMask = (1'u shl oFlow) - 1 124 | let loM = (1'u shl wdmod) - 1 125 | let cMask = (1'u shl (iBit - wdmod)) - 1 126 | s.data[wdix] = ((x and cMask) shl wdmod) or (w0 and loM) 127 | s.data[wdix+1] = (w1 and not oMask) or (x shr w0bit) 128 | 129 | proc setLen*(s: var SeqUint, size: int) {.inline.} = 130 | let bitsz = size * s.bits 131 | s.data.setLen (roundUp(bitsz) shr iShf) 132 | s.len = size 133 | 134 | proc add*(s: var SeqUint, v: uint) {.inline.} = 135 | let i = s.len 136 | s.setLen i + 1 137 | s[i] = v 138 | 139 | iterator items*(s: SeqUint): uint = 140 | for i in 0 ..< s.len: yield s[i] 141 | 142 | iterator pairs*(s: SeqUint): (int, uint) = 143 | for i in 0 ..< s.len: yield (i, s[i]) 144 | 145 | proc `$`*(s: SeqUint): string = 146 | result = "[" 147 | for i, v in s: result.add (if i < s.len - 1: $v & ", " else: $v) 148 | result.add "]" 149 | 150 | when isMainModule: 151 | var s1 = initSeqUint(16) 152 | for i in 0 ..< s1.len: # Single big word, even small per big, fwd order 153 | let n = uint(i) 154 | s1[i] = n 155 | if s1[i] != n: echo "i: ", i, " SET ", n, " BUT GOT ", s1[i], " BACK" 156 | 157 | var s2 = initSeqUint(44, numBound=16) 158 | for i in 0 ..< s2.len: # Three big words, even small per big, fwd order 159 | let n = uint(i and 15) 160 | s2[i] = n 161 | if s2[i] != n: echo "i: ", i, " SET ", n, " BUT GOT ", s2[i], " BACK" 162 | 163 | var s3 = initSeqUint(128, 8) 164 | for i in 0 ..< s3.len: # Six big words, uneven small per big, fwd 165 | let n = uint(i and 7) 166 | s3[i] = n 167 | if s3[i] != n: echo "i: ", i, " SET ", n, " BUT GOT ", s3[i], " BACK" 168 | 169 | var s4 = initSeqUint(64*13, numBound=32) 170 | for i in 0 ..< s4.len: # 65 big words, 5-bit nums, pseudo-rand vals 171 | let n = uint((i * 19) and 31) 172 | s4[i] = n 173 | if s4[i] != n: echo "i: ", i, " SET ", n, " BUT GOT ", s4[i], " BACK" 174 | 175 | # Now all the same as above but looping high to low 176 | var s5 = initSeqUint(16) 177 | for i in countdown(s5.len - 1, 0): 178 | let n = uint(i) 179 | s5[i] = n 180 | if s5[i] != n: echo "i: ", i, " SET ", n, " BUT GOT ", s5[i], " BACK" 181 | 182 | var s6 = initSeqUint(44, numBound=16) 183 | for i in countdown(s6.len - 1, 0): 184 | let n = uint(i and 15) 185 | s6[i] = n 186 | if s6[i] != n: echo "i: ", i, " SET ", n, " BUT GOT ", s6[i], " BACK" 187 | 188 | var s7 = initSeqUint(128, 8) 189 | for i in countdown(s7.len - 1, 0): 190 | let n = uint(i and 7) 191 | s7[i] = n 192 | if s7[i] != n: echo "i: ", i, " SET ", n, " BUT GOT ", s7[i], " BACK" 193 | 194 | var s8 = initSeqUint(64*13, numBound=32) 195 | for i in countdown(s8.len - 1, 0): 196 | let n = uint((i * 19) and 31) 197 | s8[i] = n 198 | if s8[i] != n: echo "i: ", i, " SET ", n, " BUT GOT ", s8[i], " BACK" 199 | 200 | var s9 = initSeqUint(0, numBound=32) 201 | for x 
in s8: s9.add x 202 | if $s9 != $s8: echo "grown seqUint != assigned"; echo s9; echo s8 203 | s9.setLen 0 204 | for x in s6: s9.add x 205 | if $s9 != $s6: echo "grown seqUint != assigned2"; echo s9; echo s6 206 | -------------------------------------------------------------------------------- /adix/hist.nim: -------------------------------------------------------------------------------- 1 | ##[ This provides a simple regular histogram with the `bist.nim` interface, but 2 | using vanilla bin counters mostly so it can be used with adix/xhist1. Bin incs 3 | are fast & `adix/cumsum` can be at least tolerable via parallel prefix sum on 4 | x86_64. Re-scaling|shifting bins can also be done (externally) post-decrement & 5 | pre-increment and is very vectorizable. This enables general weight decay, not 6 | only exponential/linear/flat, but adds per-point expense proportional to nBin. 7 | For specific weight decays this could be ~2X optimized some by "always counting 8 | up", as with Fenwick-backed embist/lmbist, but we hold off on that for now to 9 | provide a distinct calculation with distinct FP math trade offs. 10 | 11 | Performance break-even vs BIST methods depends on (at least!) counter size / 12 | vectorization, platform, time kernel, and work load. Ballpark expectations 13 | might be to use this <= ~300 bins for 1:1 ratios of dec, inc & quantile with 14 | this being about 4X faster for 8 bins & nBin/lg(nBin) slower for larger nBin. 15 | This can be strictly faster if your use case has many counter updates per 16 | quantile query. This can also potentially be accelerated by GPUs (especially in 17 | the context of transforming whole, already existing arrays rather than online / 18 | incremental transformation). The bottom of this module has a small test/timing 19 | program against a bist. ]## 20 | when not declared assert: import std/assertions 21 | import adix/cumsum, std/algorithm 22 | 23 | type Hist*[N: SomeNumber] = object ## Simple Histogram 24 | cnt*, csum*: seq[N] ## PDF/PMF/counter array and its cumulative sum 25 | tot*: N ## csum[^1], but always up to date 26 | dirty*: bool ## Flag indicating if .csum may be out of date from .cnt 27 | 28 | proc len*[N](h: Hist[N]): int = h.cnt.len ## Number of bins & bytes 29 | func size*[N](h: Hist[N]): int = h.len*N.sizeof 30 | func space*[N](h: Hist[N]): int = h.sizeof + 2*h.size 31 | proc tot*[N](h: Hist[N]): N = h.tot ## Raw total 32 | proc count*[N](h: Hist[N]): N = h.tot ## Total Weight 33 | 34 | proc init*[N](h: var Hist[N]; len: int) = h.cnt.setLen len; h.csum.setLen len 35 | proc initHist*[N](len: int): Hist[N] = result.init len 36 | proc clear*[N](h: var Hist[N]) = 37 | zeroMem h.cnt[0].addr, h.size; zeroMem h.csum[0].addr, h.size 38 | h.tot = 0.0; h.dirty = false 39 | proc inc*[N](h: var Hist[N]; i:int, w:N=1) = h.cnt[i]+=w; h.tot+=w; h.dirty=true 40 | ## Add weight `w` to bin `i` & `.tot`; set dirty 41 | proc dec*[N](h: var Hist[N]; i:int; w:N=1) = h.cnt[i]-=w; h.tot-=w; h.dirty=true 42 | ## Subtract weight `w` from bin `i` & `.tot`; set dirty 43 | proc up*[N](h: var Hist[N]) = ## Update `.csum` field after various inc/dec's 44 | if h.dirty and h.csum.len > 0 and h.cnt.len > 0: 45 | copyMem h.csum[0].addr, h.cnt[0].addr, h.size 46 | cumsum.cumsum h.csum; h.dirty = false 47 | 48 | proc cdf*[N](h: Hist[N], i: int): N = h.csum[i] 49 | proc pmf*[N](h: Hist[N], i: int): N = h.cnt[i] 50 | 51 | proc invCDF*[N](h: Hist[N], s: N; s0: var N): int = 52 | ## For `0 < s <= tot`, bracket ECDF jump `>= s`. I.e. 
find `i0, s0` so `s0 = 53 | ## sum(..< i0) < s yet sum(..i0) >= s` in `lgCeil n` array probes. 54 | assert 0<=s and s<=h.tot, "Hist.invCDF OORange sum " & $s & " of " & $h.tot 55 | result = h.csum.lowerBound(s) #NOTE: s<0|s>tot are invalid inputs 56 | if result >= h.cnt.high: result = h.cnt.high; s0 = h.tot 57 | else: s0 = h.csum[result] - h.cnt[result] 58 | 59 | proc `$`*[N](h: Hist[N]): string = "tot: " & $h.count & " pmf: " & $h.nPDF 60 | 61 | proc invCDF*[N](h: Hist[N], s: N): (int, N) = result[0] = h.invCDF(s, result[1]) 62 | ## For `0 < s <= tot` return `(i0,s0)` so `sum(..=s` 63 | 64 | proc invCDF*[N](h: Hist[N]; s: N; s0, s1: var N): int = 65 | ## For `0 < s <= tot`, find `i0,s0,s1` so `s0 < s <= s1` and `s0+pmf(i0)==s1`. 66 | result = h.invCDF(s, s0) 67 | if result == h.cnt.high: s1 = s0; s0 = s1 - h.cnt[result] 68 | else: s1 = s0 + h.cnt[result] 69 | 70 | proc min*[N](h: Hist[N]): int = ## Simple wrapper: invCDF(h, 1) 71 | var s0: N; h.invCDF(1, s0) 72 | 73 | proc max*[N](h: Hist[N]): int = ## Simple wrapper: invCDF(h,h.count). 74 | var s0: N; h.invCDF(h.tot.N, s0) 75 | 76 | from std/fenv import epsilon #XXX Centralize thrice replicated Parzen(invCDF) 77 | proc quantile*[N](h: Hist[N]; q: float; iL,iH: var int): float = 78 | ## Parzen-interpolated quantile; E.g., q=0.9 => 90th percentile. ``answer = 79 | ## result*iL + (1-result)*iH``, but is left to caller to do { in case it is 80 | ## mapping larger numeric ranges to/from iL,iH }. Tm ~ ``2*lg(addrSpace)``. 81 | ## Unlike other (broken!) quantile-interpolation methods, Parzen's connects 82 | ## midpoints of vertical CDF jumps, not horizontal. This makes more sense, 83 | ## corresponding to Wilcoxon 1945 & later tie mid-ranking recommendations. 84 | assert h.tot > 0, "quantile(Hist[N]) requires non-empty Hist." 85 | var sL0, sL1, sH0, sH1: N #You probably want to draw a CDF to 86 | let n = h.tot.float #..fully understand this code. 87 | let qN = q*n 88 | let wq = when N is SomeFloat: N.epsilon*n else: 1.N # A Quantum Of Ctr Wgt 89 | if qN <= 0.5*wq.float : iL = h.min;iH=0;return 1 #Early tails rets; Pure iL 90 | if qN >= n - 0.5*wq.float: iL = h.max;iH=0;return 1 #{Early body are pure iH.} 91 | let dqN=when N is SomeFloat: wq else: 1.5 # Min round-off + max odds high side 92 | iH = h.invCDF(N(qN + dqN), sH0, sH1) # sH0all iH 99 | if sH0 < wq: return 0 #For qN this small, iH = iL = min. 100 | iL = h.invCDF(sH0, sL0, sL1) #..Also, cannot call invCDF(0). 101 | when N is SomeFloat: # Should be impossible,but round-off 102 | if sL1 > sH0 + wq: #..makes it happen sometimes & when 103 | iL = h.invCDF(sH0 - wq, sL0, sL1) #..it does, we want next lower bin. 
104 | let sMidL = 0.5*float(sL0 + sL1) #Mid-vertJump(nxtLwrBin) gives line 105 | min (sMidH - qN)/(sMidH - sMidL), 1.0 #Runs of N.eps-sized bins=>anomalies 106 | 107 | proc quantile*[N](h: Hist[N], q: float): float = 108 | ## Parzen-interpolated quantile when no caller index mapping is needed 109 | var iL, iH: int 110 | let fL = h.quantile(q, iL, iH) 111 | fL*iL.float + (1 - fL)*iH.float 112 | 113 | proc nPDF*[N](h: Hist[N]): seq[float32] = 114 | result.setLen h.len;let s=1/h.tot.float32;for i,r in mpairs result:r=s*h.pmf(i).float32 115 | 116 | proc nCDF*[N](h: Hist[N]): seq[float32] = 117 | result.setLen h.len;let s=1/h.tot.float32;for i,r in mpairs result:r=s*h.cdf(i).float32 118 | 119 | when isMainModule: 120 | const fast {.booldefine.} = false # VERY limited differences below 121 | when not declared addFloat: import std/[syncio, formatFloat] 122 | import std/[times, strformat], cligen, cligen/sysUt 123 | when fast: import adix/bist 124 | proc hist(xs: seq[int], win=3, q = -2.0, pdf=false,cdf=false,time=false, 125 | xMn=0,xMx=7) = 126 | template toI(x): untyped = max(xMn, min(xMx, x)) - xMn # Clip & shift 127 | if win < 2: Value !! "win " & $win & " too small" 128 | when fast: (var d = initBist[uint32](xMx - xMn + 1)) 129 | else : (var d = initHist[uint32](xMx - xMn + 1)) 130 | let t0 = epochTime() 131 | var tQ = 0.0 # Report avg qtl to ensure compiler cannot elide 132 | for t, x in xs: 133 | let x = x.toI # xOld frm Deque=moreGeneral 134 | if t >= win: d.dec xs[t - win].toI, 1 # Remove leaving 135 | d.inc x, 1 # Add entering 136 | if pdf: echo t," b: tot: ",d.tot," mPMF: ",d.nPDF 137 | when not fast: d.up # Make callers do this only once @top-level 138 | if cdf: echo t," b: tot: ",d.tot," mCDF: ",d.nCDF 139 | if q > -2.0: 140 | if time: tQ += d.quantile(q) # `formatFloat` slow=>just total 141 | else: echo d.quantile(q) # Report inverseCDF(q) 142 | if time: 143 | let n = xs.len.float; let dt = (epochTime() - t0)*1e9/n 144 | stderr.write &"n: {xs.len} ns/no: {dt:.1f} w: {win} : {tQ/n}\n" 145 | 146 | dispatch hist, short={"xMn":'a', "xMx":'b'}, help={"xs": "x values", 147 | "win" : "moving data window in points","q" : "quantile to report; 0.5=median", 148 | "pdf" : "print PDF each time step" ,"cdf": "print CDF each time step", 149 | "time": "print timing statistics", 150 | "xMn" : "`xs[i]` clipped to this `a` on `[a, xs]`", 151 | "xMx" : "`xs[i]` clipped to this `b` on `[xs, b]`"} 152 | -------------------------------------------------------------------------------- /tests/btshell.nim: -------------------------------------------------------------------------------- 1 | when not declared(stdout): import std/[syncio, objectdollar, formatfloat] 2 | import ppss, btree, strutils, strformat, os, times 3 | 4 | var verb = false # global verbosity flag 5 | var found = false # global last-seek found flag 6 | var f = stdout # to allow switch to shared stderr 7 | when defined(btTall): # Deep 234 trees hit all cases fast.. 
8 | const nodz = 32 9 | else: #..But default to 2*64B cache lines 10 | const nodz = 128 11 | 12 | type # Instantiate a tree type/suite 13 | Pair = tuple[key, val: int16] 14 | Ix = int16 #XXX should not need signed type; Chk 15 | Ln = uint16 16 | 17 | proc getKey(x: Pair): int16 = x.key 18 | 19 | when defined(btHisto): 20 | proc size(ob: Pair): int = int(ob.val) 21 | defBTree(Pair, int16, int32, int16, nodz, Ix, Ln) 22 | else: 23 | defBTree(Pair, int16, int32, void, nodz, Ix, Ln) 24 | const m = btOrder() 25 | 26 | proc print(path: Path) = # This prints `path` on one line, but 27 | if not verb: return #..*ONLY* in verbose mode; else no-op. 28 | f.write &"PATH({path.len})" 29 | if path.len > 0: 30 | let k = path[^1].p[].ob[path[^1].i].key 31 | let v = path[^1].p[].ob[path[^1].i].val 32 | f.write &": {path} K: {k} V: {v}" 33 | f.write '\n' 34 | 35 | # For B-Trees, both objects on internal nodes bracketing the link are "inPath". 36 | # This is at least better than highlighting the *entire* B-tree node. 37 | const colorL = [ "30;7", "101", "102", "103", "104", "105", "106", "107" ] 38 | const colorD = [ "40;7", "41", "42", "43", "44", "45", "46", "47" ] 39 | let color = if getEnv("LC_THEME", "L").toUpper[0] == 'L': colorL else: colorD 40 | proc print(t: Ln, path: Path, lab="") = 41 | const indent = 8 42 | proc inPath(t: Ln, path: Path, ob: Pair): bool = # path markup flag 43 | for i, e in path: # `apply` acts on obs=>colorize obs around each e.p,i link 44 | if (e.p == t and e.i < 2*m-1 and e.p[].ob[e.i] == ob) or 45 | (i < path.len-1 and e.i > 0 and e.p[].ob[e.i-1] == ob): 46 | return true 47 | if lab.len > 0: f.write lab 48 | for (t, ob, depth) in t.nodes: 49 | let c = if t.inPath(path,ob): color[0] else: color[(76543*int(t)) mod 7 + 1] 50 | f.write repeat(" ", depth * indent), "\e[", c, "m", 51 | &"{int(t)}: k{ob.key},v{ob.val} (w{t[].wt})", "\e[0m\n" 52 | path.print 53 | 54 | proc treeCk(t: Ln): int = 55 | result = t.check 56 | if result == 0: result += int(t.refCk) 57 | 58 | let help = """c check many tree invariants 59 | p print colorized tree 60 | P print current `path` 61 | h height/occupancy statistics 62 | m set path to s-most side; 0=min; 1=max 63 | a set path to [01]-side neighbor 64 | k set path to where ob with key is/should be 65 | i insert k,v at current path 66 | d delete ob at current path 67 | n[01] set path to 0-origin -th element; optional [01] => print key,ob 68 | r compute 1-origin rank; show seekNth autocalc 69 | + seek k, then s=0|1 (pre|ap)pend k,v ; ins|ctr inc in btHisto mode 70 | - seek k, then s=0|1 (front|back)pop k; ctr dec|del in btHisto mode 71 | A bulk load/add k,0 from empty in s=0|1 rev|fwd order; spare 72 | D done with bulk adds 73 | t start stop-watch 74 | T stop stop-watch and print nanoseconds, #ops 75 | z print node size in bytes 76 | X no-op; maybe useful to time loop dispatch""" 77 | 78 | proc btshell(verbose=false, quiet=false, check=false, errstd=false): int = 79 | ## This shell reads ``ppss`` output to test all BTree ops & post-mutate check 80 | ## tree invariants. Interactive use (e.g. ppss|btshell) is good to see how it 81 | ## works/fails "in the small". Small programs to generate series of inputs in 82 | ## ``check`` mode is good to exercise all usage modes/trap bugs reproducibly. 83 | ## Color-structure highlighted print outs helps show structural bugs/features. 
84 | verb = verbose # Propagate CL -> convenience globals 85 | if errstd: f = stderr 86 | let t = newN() # Init necessary to start w/empty tree 87 | var path: Path 88 | var r = 0 89 | var cin: Command 90 | var t0: Time 91 | var nOp = 0 92 | var ob: Pair 93 | template maybeCk = 94 | if check and t.treeCk > 0: t.print(path, "ERR\n"); return 1 95 | while stdin.readObject(cin.addr, cin.sizeof) == 1: 96 | nOp.inc 97 | if verbose: f.write cin, '\n' 98 | let s = cin.side; let k = int16(cin.key); let v = int16(cin.val) 99 | ob.key = k; ob.val = v 100 | case cin.letter 101 | of 'c': (let nE = t.treeCk; if nE > 0: f.write nE, " ERRS\n") # check 102 | of 'p': t.print(path) # print tree 103 | of 'P': (let tmp = verb; verb = true; path.print; verb = tmp) # print path 104 | of 'h': # height stats 105 | var nN,nO: int; let h = stats(t,nN,nO); let u=float(nO)/float(nN*(2*m-1)) 106 | f.write &"nOb: {nO} nNode: {nN} height: {h} : {u}\n" 107 | of 'm': path.setLen 0; seekMost(t, path, bool(k)); path.print # most `k` 108 | of 'a': seekAdj(path, bool(k)); path.print # adjacent `k` 109 | of 'k': # key search 110 | found = if cin.sided: t.seekKeys(path, s, k) else: t.seekKey(path, k) 111 | path.print 112 | of 'i': # insert ob @path 113 | if path.len > 0: path.add(ob, bool(k), found) 114 | else: f.write "cannot insert @empty path\n" 115 | maybeCk 116 | of 'd': (if path.len > 0: path.del; maybeCk) # Delete Ob @current path 117 | of 'n': # Move path to Nth elt 118 | r = t.seekNth(path, k); path.print 119 | if cin.sided and path.len > 0: 120 | if s: f.write path[^1].p[].ob[path[^1].i], '\n' 121 | else: f.write path[^1].p[].ob[path[^1].i].key, '\n' 122 | of 'r': f.write &"rnk: {rank(path)} r: {r}\n" # 1/0 origin rank of path 123 | of '+': 124 | when defined(btHisto): # This eg. ignores `sided`, doing only ctr; 125 | found = t.seekKey(path, k) #..With GC allocator could instead do seq 126 | if found: #..with real prepend/append with offset `r`. 127 | path[^1].p[].ob[path[^1].i].val.inc 128 | for j in 0 ..< path.len: path[j].p[].wt += 1 129 | else: 130 | ob.val = 1 # Only reason `ob` needs to be `var` 131 | path.add true, ob 132 | else: # +0 k v prepend k,v; +1 k v append k,v 133 | if cin.sided: # Unconditional add 134 | found = t.seekKeys(path, s, k) 135 | path.add(ob, s, found) 136 | else: # Add if missing 137 | found = t.seekKey(path, k) 138 | if found: f.write k, " already present\n" 139 | else: 140 | path.add(ob, s, found) 141 | if check: # Maybe dbl ck path post-add 142 | var path2: Path; discard t.seekKey(path2, k) 143 | if path2 != path: f.write "post add path mismatch\n" 144 | maybeCk 145 | of '-': 146 | when defined(btHisto): # Externally managed-rank integrated key 147 | found = t.seekKey(path, k) #..counter example just decrements|pops. 
148 | if found: 149 | if path[^1].p[].ob[path[^1].i].val == 1: path.del 150 | else: 151 | path[^1].p[].ob[path[^1].i].val.dec 152 | for j in 0 ..< path.len: path[j].p[].wt -= 1 153 | elif not quiet: f.write &"{k} not found\n" 154 | else: # -0 k front pops k; -1 k pops k 155 | found = if cin.sided: t.seekKeys(path, s, k) else: t.seekKey(path, k) 156 | if found: path.del; maybeCk 157 | elif not quiet: f.write &"{k} not found\n" 158 | of 'A': badd(path, ob, s, t, v) # A1 k S bulk adds k,0 w/spare S 159 | of 'D': baddDone(t, s, k); maybeCk # Finalize after bulk adds 160 | of 'X': discard 161 | of 'z': f.write "node size: ", Node.sizeof, " bytes\n" 162 | of 't': nOp = 0; t0 = getTime() 163 | of 'T': f.write nOp, " ops in ", (getTime() - t0).inNanoseconds, " ns\n" 164 | else: f.write &"unknown command '{cin.letter}'; choices are:\n{help}\n" 165 | return nOp mod 2 # Ensure compiler cannot elide calc 166 | 167 | when isMainModule: 168 | import cligen 169 | dispatch(btshell, help={ "verbose": "echo read ops & path post [maknp]", 170 | "quiet" : "\"not found\" del vs silent no-op", 171 | "check" : "auto check after all mutating ops", 172 | "errstd" : "echos -> stderr" }) 173 | -------------------------------------------------------------------------------- /adix/lmbist.nim: -------------------------------------------------------------------------------- 1 | ##[ `Bist[T]` (& clients `lghisto`, `mvstat`) already support quantiles over 2 | moving data windows. Sometimes one wants recent values to carry more weight in 3 | summary statistics, as in a Linearly Weighted Moving Average. In the context of 4 | a distribution, one weights by replication - more copies of more recent points 5 | vs. earlier & earlier. While one can do this by literal data point repetition, 6 | that expands both space & time costs. So, one should prefer virtual replication 7 | - a histogram putting weight into bins with the same time structure. 8 | 9 | A naive implementation decays weight for each point in the window as each point 10 | leaves and adds a new point with weight `w`. This is `O(nBin)` - no faster than 11 | a full rebuild. This CAN be done with no loops longer than `lg(nBin)`, though. 12 | The key insight is to "count up forever" adding in new points with weight `t+1`, 13 | but subtract a virtual zero level. If no actual duplicates exist, this 0-level 14 | is simply `t-w`. Each duplicate @various lags (common w/binning, e.g. `lghisto`) 15 | gets 1 "copy" of this virtual 0-level - which "stack up" in a bin. This can be 16 | handled with a *second* `Bist` tracking *only* membership. The actual distro is 17 | then the linear combination `cnt[] - zero*nLag[]`. 18 | 19 | We thus get `LMBist[T]` which bundles up this dual `Bist[T]` idea. The API is 20 | the same as `Bist[T]` & `EMBist[T]`. The bottom of this module has a small 21 | test/timing prog showing the differences. 22 | 23 | Happy to cite someone, but as far as I can tell, this is a completely novel 24 | application of Fenwick BISTs for a fast Linear weight Moving Median filter 25 | transform. I certainly came up with it on my own. The linearly weighted moving 26 | median itself while mostly obvious from its term and the more famous LWMA is all 27 | but unheard of in academic literature regardless of implementation efficiency. 28 | Khodakarami, et. al 2019 about Parkinson's is literally *THE ONLY* match on 29 | scholar.google.com as of this writing. 
Cohen & Strauss were likely somehow 30 | aware of the idea, but unaware of the "LWM Average" term when they decided to 31 | name this "chordal weighting" in their Cohen,Strauss2003 SIGMOD paper. Due to 32 | its simplicity (see adix/embist) & especially terse *average* form, exponential 33 | time kernels SO dominate that other things are usually unattended. See, e.g., 34 | Akinshin 2023 "Weighted Quantile Estimators" which does *not* consider linear 35 | time kernels (but is otherwise a very interesting paper). It is true that one 36 | needs to keep *just one window* of data points in a std/deque|ring buffer to 37 | expire items, but the same is true of exponential weighting with strict windows, 38 | needed for good history/"time breakdown point" behavior in either average or 39 | quantile settings. It is only fairly extreme scenarios where just one window 40 | exceeds CPU L3 cache, let alone DRAM, though. Please cite this github repo if 41 | this code inspires your work. ]## 42 | when not declared assert: import std/assertions # debugging 43 | import adix/[bist, bitop], cligen/sysUt 44 | 45 | type LMBist*[T: SomeNumber] = object 46 | cnt, nLag: Bist[T] # Raw count, number of Lags in-window@`i` 47 | zero: T # Window size, Bottom Level, Root Finding Guess/Return 48 | 49 | proc len*[T](d: LMBist[T]): int = d.cnt.data.len 50 | func space*[T](d: LMBist[T]): int = 2*(d.sizeof + (d.cnt.data.len + 1)*T.sizeof) 51 | proc tot*[T](d: LMBist[T]): T = d.cnt.tot - d.zero*d.nLag.tot 52 | proc count*[T](d: LMBist[T]): T = d.tot 53 | 54 | proc init*[T](d: var LMBist[T]; len: int) = d.cnt.init len; d.nLag.init len 55 | proc initLMBist*[T](len: int): LMBist[T] = result.init len 56 | proc clear*[T](d: var LMBist[T]) = d.cnt.clear; d.nLag.clear; d.zero = 0 57 | 58 | proc inc*[T](d: var LMBist[T]; i: int, w: T) = 59 | d.cnt.inc i, w; d.nLag.inc i, 1 # track both weight & membership 60 | 61 | proc dec*[T](d: var LMBist[T]; i: int, w: T) = 62 | d.cnt.dec i, w; d.nLag.dec i, 1 # track both weight & membership 63 | d.zero += 1 # & the bottom or virtual zero. 64 | 65 | proc up*[T](d: var LMBist[T]) = discard ## Simple no-op for LMBist 66 | 67 | proc cdf*[T](d: LMBist[T], i: int): T = d.cnt.cdf(i) - d.zero*d.nLag.cdf(i) 68 | proc pmf*[T](d: LMBist[T], i: int): T = d.cnt.pmf(i) - d.zero*d.nLag.pmf(i) 69 | 70 | proc invCDF*[T](d: LMBist[T], s: T; s0: var T): int = 71 | assert 0<=s and s<=d.tot, "LMBist.invCDF OORange sum " & $s & " of " & $d.tot 72 | var c = s #NOTE: s<0|s>tot are invalid inputs 73 | cfor (var half = d.cnt.data.len.ceilPow2 shr 1), half != 0, half >>= 1: 74 | var m = result + half - 1 # midpoint in binary search 75 | if m < d.cnt.data.len and d.cnt[m] - d.zero*d.nLag[m] < c: 76 | c -= d.cnt[m] - d.zero*d.nLag[m] 77 | result = m + 1 78 | s0 = s - c 79 | 80 | proc invCDF*[T](d: LMBist[T]; s: T; s0, s1: var T): int = 81 | result = d.invCDF(s, s0); s1 = s0 + d.pmf(result) 82 | proc min*[T](d: LMBist[T]): int = d.nLag.min ## Simple wrapper of `d.nLag.min`. 83 | proc max*[T](d: LMBist[T]): int = d.nLag.max ## Simple wrapper of `d.nLag.max`. 84 | 85 | proc quantile*[T](d: LMBist[T]; q:float; iL,iH: var int): float = 86 | assert d.tot > 0, "quantile(LMBist[T]) requires non-empty LMBist." 87 | var sL0, sL1, sH0, sH1: T #You probably want to draw a CDF to 88 | let tot = d.tot; let n = tot.float #..fully understand this code. 
89 | let qN = q*n 90 | if qN <= 0.5 : iL = d.min; iH = 0; return 1 #Early rets for tails; Pure iL 91 | if qN >= n - 0.5: iL = d.max; iH = 0; return 1 #{Early for body are pure iH.} 92 | iH = d.invCDF(T(qN + 1.5), sH0, sH1) 93 | var sMidH = 0.5*float(sH0 + sH1) #This guess works 90+% of the time.. 94 | if sMidH < qN: #..but can fail for large sH1 - sH0. 95 | if sH1 < tot: #When it fails, want next higher bin 96 | iH = d.invCDF(sH1 + 1, sH0, sH1) 97 | sMidH = 0.5*float(sH0 + sH1) 98 | else: return 0 #..unless @HIGHEST already=>all iH 99 | if sH0 == 0: return 0 #For qN this small, iH = iL = min. 100 | iL = d.invCDF(sH0, sL0, sL1) #..Also, cannot call invCDF(0). 101 | let sMidL = 0.5*float(sL0 + sL1) #Mid-vertJump(nxtLwrBin) gives line 102 | (sMidH - qN)/(sMidH - sMidL) 103 | 104 | proc quantile*[T](d: LMBist[T]; q: float): float = 105 | var iL, iH: int 106 | let fL = d.quantile(q, iL, iH) 107 | fL*iL.float + (1 - fL)*iH.float 108 | 109 | proc nPDF*[T](d: LMBist[T]): seq[float32] = 110 | result.setLen d.cnt.len;let s=1/d.tot.float32;for i,r in mpairs result:r=s*d.pmf(i).float32 111 | 112 | proc nCDF*[T](d: LMBist[T]): seq[float32] = 113 | result.setLen d.cnt.len;let s=1/d.tot.float32;for i,r in mpairs result:r=s*d.cdf(i).float32 114 | 115 | when isMainModule: 116 | const slow {.booldefine.} = false # VERY limited differences below 117 | when not declared addFloat: import std/[syncio, formatFloat] 118 | import std/[times, strformat], cligen 119 | proc lmbist(xs: seq[int], win=3, q = -2.0, pdf=false,cdf=false,time=false, 120 | xMn=0,xMx=7) = 121 | template toI(x): untyped = max(xMn, min(xMx, x)) - xMn # Clip & shift 122 | if win < 2: Value !! "win " & $win & " too small" 123 | when slow: (var d = initBist[uint32](xMx - xMn + 1)) 124 | else : (var d = initLMBist[uint32](xMx - xMn + 1)) 125 | let t0 = epochTime() 126 | var tQ = 0.0 # Report avg qtl to ensure compiler cannot elide 127 | for t, x in xs: 128 | let x = x.toI # xOld frm Deque=moreGeneral 129 | when slow: # On full data win, decay ALL old weight 130 | if t >= win: (for tw in t - win ..< t: d.dec xs[tw].toI, 1) # BIG LOOP 131 | d.inc x, min(t + 1, win).uint32 # Small entering weight 132 | else: # Remove weight for leaving data point 133 | if t >= win: d.dec xs[t - win].toI, uint32(t + 1 - win) 134 | d.inc x, uint32(t + 1) # Large entering weight 135 | if pdf: echo t," b: tot: ",d.tot," lwmPMF: ",d.nPDF 136 | if cdf: echo t," b: tot: ",d.tot," lwmCDF: ",d.nCDF 137 | if q > -2.0: 138 | if time: tQ += d.quantile(q) # `formatFloat` slow=>just total 139 | else: echo d.quantile(q) # Report inverseCDF(q) 140 | if time: 141 | let n = xs.len.float; let dt = (epochTime() - t0)*1e9/n 142 | stderr.write &"n: {xs.len} ns/no: {dt:.1f} w: {win} : {tQ/n}\n" 143 | 144 | dispatch lmbist, short={"xMn":'a', "xMx":'b'}, help={"xs": "x values", 145 | "win" : "moving data window in points","q" : "quantile to report; 0.5=median", 146 | "pdf" : "print PDF each time step" ,"cdf": "print CDF each time step", 147 | "time": "print timing statistics", 148 | "xMn" : "`xs[i]` clipped to this `a` on `[a, xs]`", 149 | "xMx" : "`xs[i]` clipped to this `b` on `[xs, b]`"} 150 | #[ A Zsh session showing basic correctness&boost of optimization. Sets up env, 151 | compiles ref & optimized; makes nums; Tests various q & w; Finally measures 'em. 
152 | nim=(nim c -d:danger); t=/tmp/nums # Set up 153 | $nim -d:slow -o=slmbist lmbist; $nim lmbist 154 | ( for i in {1..10000}; printf " %s" $((RANDOM%8)) ) > $t 155 | ( for q in .1 .25 .5 .75 .9; { for w in {2..10}; { 156 | paste <(./lmbist -w$w -q.1 `<$t`) <(./lmbist -w$w -q.1 `<$t`) | 157 | awk '{print $1-$2}' | sort -g | tails -h1 -t1 }}) 2>/dev/null|unfold -n3 158 | ./slmbist -tw756 -q.5 `<$t`; ./lmbist -tw756 -q.5 `<$t` 159 | I get NO DIFF between ref & optimized, optimized about 25X faster. I also get a 160 | -w BreakEven of -w4 for when version marked "slow" is faster, but it's only 1.2X 161 | faster at smallest sensical -w2. So, -w2..4 not really worth conditioning. ]# 162 | -------------------------------------------------------------------------------- /adix/lna.nim: -------------------------------------------------------------------------------- 1 | ## FastIEEESinglePrecNaturalLogAbs; Just arctanh Taylor@1. Was 5X fastr'n mid00s 2 | ## x87. On SkyLake/glibc2.40/gcc14 ~1.2-2X faster; ARM64glibc somehow(fastHW?) 3 | ## ~4X faster. Unsure about Win/OSX. See news.ycombinator.com/item?id=40758562 4 | type f4s {.packed.} = object # De-structuring object for IEEE-single 5 | frac1 {.bitsize: 16}: cuint # Little-Endian format only right now 6 | frac0 {.bitsize: 7}: cuint # `cuint` should make Nim use `unsigned` 7 | expo {.bitsize: 8}: cuint #..for expressions like `expo-127`. Not 8 | sign {.bitsize: 1}: cuint #..sure where this is documented. 9 | 10 | const r1_2 = 0.70710678118654752440f64 11 | const LN2 = 0.69314718055994530942f64 12 | const LNr2 = 0.34657359027997265471f64 13 | 14 | func lnaSeries(s: float): float {.inline.} = # Worst case accuracies: 15 | when defined o3: 2.0/3.0*s + 2.0 # 11.22 rel bits 16 | elif defined o5: (0.4*s + 2.0/3.0)*s + 2.0 # 17.305 rel bits 17 | elif defined o7: ((2.0/7.0*s + 0.4)*s + 2.0/3.0)*s + 2.0 # 22.774 rel bits 18 | elif defined o11: # 24.0000002 rel bits 19 | ((((2.0/11.0*s + 2.0/9.0)*s + 2.0/7.0)*s + 0.4)*s + 2.0/3.0)*s + 2.0 20 | else: (((2.0/9.0*s + 2.0/7.0)*s + 0.4)*s + 2.0/3.0)*s + 2.0 # 23.97 rel bits 21 | 22 | func lna*(x: float32): float32 {.inline.} = 23 | ## Return fast,approx Natural Log(Abs(x)) { 'a' for a)bs | a)pprox } by ATanH 24 | ## 82% of the time & by std ln(1+x) series 12+6=18%. { BUT 0.lna=-88,not -Inf| 25 | ## +Inf@Inf; Not fixed inline here since outer caller should block|handle 0. } 26 | var x = x 27 | let p = cast[ptr f4s](x.addr) 28 | p.sign = 0 # abs(); Force x to be positive 29 | let e = (p.expo.cint - 127).float*LN2 # ln(x*2^y) == ln(x) + y*ln2 30 | p.expo = 127 # force x to [1, 2) 31 | if x > 1.88f32: # Small y in x=2-y: 6 terms of ln(1+y)=Σy^i/i BUT .. 32 | let y = x.float*0.5 - 1.0 #..adjusted for NEXT octave. 33 | e + LN2 + y*(1.0 + y*(-0.5 + y*(1.0/3.0 + y*(-0.25 + y*(0.2 - y/6.0))))) 34 | elif x < 1.06f32: # Small y in x=1+y: 6 terms of ln(1+y)=Σy^i/i 35 | let y = x.float - 1.0 36 | e + y*(1.0 + y*(-0.5 + y*(1.0/3.0 + y*(-0.25 + y*(0.2 - y/6.0))))) 37 | else: # 2*atanh(x) = ln((1+x)/(1-x)) = 2*Σx^o/o for odd 'o' 38 | let d = x.float * r1_2 # x -> dbl [sqrt(1/2), sqrt2) 39 | # d=(1+r)/(1-r); (1+r)=(1-r)*d=d-rd; d-rd-r-1; r*(d+1)=d-1; r=(d-1)/(d+1) 40 | let r = (d.float - 1.0)/(d.float + 1.0) # r for r)atio of -1/+1 41 | let s = r*r # s for s)quare 42 | e + LNr2 + r*s.lnaSeries # (1.37288 +- 0.00003)X faster on SkyLake 43 | 44 | when isMainModule: 45 | when defined(bench): 46 | import std/[times, math, strformat] 47 | var sum0 = 0.0; var sum = 0.0; var n = 0 48 | let t00 = epochTime() 49 | for i in 0 .. 
(1u64 shl 32) - 1: 50 | var i = uint32(i) 51 | let x = cast[ptr float32](i.addr)[] 52 | if x.isNaN: continue 53 | if x == 0.0f32: continue # -inf 54 | inc n 55 | if not (x.isNaN or 2*x==x): sum0 += x 56 | let dt0 = epochTime() - t00 57 | let t0 = epochTime() 58 | for i in 0 .. (1u64 shl 32) - 1: 59 | var i = uint32(i) 60 | let x = cast[ptr float32](i.addr)[] 61 | if x.isNaN: continue 62 | if x == 0.0f32: continue # -inf 63 | when defined(stdlib): (let l = ln(abs(x))) 64 | else : (let l = lna(x)) 65 | inc n 66 | if not (l.isNaN or 2*x==x): sum += l 67 | let dt = epochTime() - t0 - dt0 68 | echo &"sX:{sum0:.2g} sL:{sum:.0f} in {dt0:.5f} + {dt:.5f} s;n: {n}; {dt/n.float*1e9:.3f} ns/eval" 69 | else: 70 | when not declared(stdout): import std/[syncio, formatFloat] 71 | import std/[math, heapqueue] 72 | proc lnaT*(x: float32): float32 {.inline.} = 73 | var x = x 74 | let p = cast[ptr f4s](x.addr) 75 | p.sign = 0 # force x to be positive 76 | let e = (p.expo.cint - 127).float*LN2 # ln(x*2^y) == ln(x) + y*ln2 77 | p.expo = 127 # force x to [1, 2) 78 | if x > 1.88f32: 79 | let y = x.float*0.5 - 1.0 80 | echo "x: ",x," e: ",e," y: ",y 81 | e + LN2 + y*(1.0 + y*(-0.5 + y*(1.0/3.0 + y*(-0.25 + y*(0.2 - y/6.0))))) 82 | elif x < 1.06f32: 83 | let y = x.float - 1.0 84 | echo "X: ",x," E: ",e," Y: ",y 85 | e + y*(1.0 + y*(-0.5 + y*(1.0/3.0 + y*(-0.25 + y*(0.2 - y/6.0))))) 86 | else: 87 | let d = x.float * r1_2 # x -> dbl [sqrt(1/2), sqrt2) 88 | let r = (d.float - 1.0)/(d.float + 1.0) # r for r)atio of -1/+1 89 | let s = r*r # s for s)quare 90 | echo "x: ",x," d: ",d," e: ",e," r: ",r," s: ",s," iM: ",s.lnaSeries 91 | float32(e + LNr2 + r*s.lnaSeries) 92 | const n = 15 # echo top absolute & relative errors 93 | var abErr, rlErr: HeapQueue[(float, float32, float32, float32)] 94 | for i in 0 .. 
(1u64 shl 32) - 1: 95 | var i = uint32(i) 96 | let x = cast[ptr float32](i.addr)[] 97 | if x.isNaN: continue 98 | if x < 0: continue 99 | if x == float32.low: continue 100 | if x == float32.high: continue 101 | if x < 1.1754944e-38: continue # under IEEE single limit 102 | if x > 1.7014118e38: continue # above IEEE single limit 103 | if x < 0.5: continue # accelerators 104 | if x > 2.0: continue # accelerators 105 | if x == 0.0f32: continue # -inf 106 | if x == 1.0f32: continue # exactly 0.0 107 | let accu = ln(abs(x.float)) # let accu = lnf(x.float) 108 | let appr = lna(x).float 109 | let aerr = (abs(appr - accu), x, accu.float32, appr.float32) 110 | if abErr.len < n : abErr.push(aerr) 111 | elif aerr[0] > abErr[0][0]: discard abErr.replace(aerr) 112 | let rerr = (abs(appr/accu - 1.0), x, accu.float32, appr.float32) 113 | if rlErr.len < n : rlErr.push(rerr) 114 | elif rerr[0] > rlErr[0][0]: discard rlErr.replace(rerr) 115 | if (i and 0x00FFFFFFu32) == 0: stdout.write "."; stdout.flushFile 116 | echo "\n" 117 | echo "abs: ";(while abErr.len>0:(let e=abErr.pop;echo " ",e,lnaT(e[1]))) 118 | echo "rel: ";(while rlErr.len>0:(let e=rlErr.pop;echo " ",e,lnaT(e[1]))) 119 | #[ b=(chrt 99 taskset -c 2-3 env -i HOME=/u/cb PATH=/u/cb/bin:/usr/local/bin:/usr/bin) 120 | i7_6700k$ for mode in '' -d:fm -d:fim -d:stdlib;{nim c -d:r -d:bench $mode lna>&/n;repeat 3 nor 0 $b ./lna} 121 | S0:5.3e+36 sL:1652640659.073322 in 13.312009 second;n: 8556380160; 1.56 ns/eval 122 | S0:5.3e+36 sL:1652640659.073322 in 13.285249 second;n: 8556380160; 1.55 ns/eval 123 | S0:5.3e+36 sL:1652640659.073322 in 13.296462 second;n: 8556380160; 1.55 ns/eval 124 | 1.55 +- 0.003 125 | S0:5.3e+36 sL:1652640659.073322 in 10.756718 second;n: 8556380160; 1.26 ns/eval 126 | S0:5.3e+36 sL:1652640659.073322 in 10.712787 second;n: 8556380160; 1.25 ns/eval 127 | S0:5.3e+36 sL:1652640659.073322 in 10.718144 second;n: 8556380160; 1.25 ns/eval 128 | 1.25 +- 0.003 129 | S0:5.3e+36 sL:1652640659.073322 in 11.040576 second;n: 8556380160; 1.29 ns/eval 130 | S0:5.3e+36 sL:1652640659.073322 in 11.030243 second;n: 8556380160; 1.29 ns/eval 131 | S0:5.3e+36 sL:1652640659.073322 in 10.904071 second;n: 8556380160; 1.27 ns/eval 132 | 1.27 +- 0.006 133 | S0:5.3e+36 sL:1641011596.122295 in 20.227257 second;n: 8556380160; 2.36 ns/eval 134 | S0:5.3e+36 sL:1641011596.122295 in 20.227509 second;n: 8556380160; 2.36 ns/eval 135 | S0:5.3e+36 sL:1641011596.122295 in 20.231506 second;n: 8556380160; 2.36 ns/eval 136 | 2.36 +- 0.003 137 | i7_1370P$ for mode in '' -d:fm -d:fim -d:stdlib;{nim c -d:r -d:bench $mode lna>&/n;repeat 3 nor 0 $b ./lna} 138 | S0:5.3e+36 sL:1652640659.073322 in 6.615819 second;n: 8556380160; 0.77 ns/eval 139 | S0:5.3e+36 sL:1652640659.073322 in 6.980933 second;n: 8556380160; 0.82 ns/eval 140 | S0:5.3e+36 sL:1652640659.073322 in 7.320486 second;n: 8556380160; 0.86 ns/eval 141 | 0.773 +- 0.017 142 | S0:5.3e+36 sL:1652640659.073322 in 7.609612 second;n: 8556380160; 0.89 ns/eval 143 | S0:5.3e+36 sL:1652640659.073322 in 7.661034 second;n: 8556380160; 0.90 ns/eval 144 | S0:5.3e+36 sL:1652640659.073322 in 8.191683 second;n: 8556380160; 0.96 ns/eval 145 | 0.889 +- 0.003 146 | S0:5.3e+36 sL:1652640659.073322 in 8.293185 second;n: 8556380160; 0.97 ns/eval 147 | S0:5.3e+36 sL:1652640659.073322 in 8.342474 second;n: 8556380160; 0.98 ns/eval 148 | S0:5.3e+36 sL:1652640659.073322 in 8.330391 second;n: 8556380160; 0.97 ns/eval 149 | 0.969 +- 0.003 150 | S0:5.3e+36 sL:1641011596.122295 in 7.963845 second;n: 8556380160; 0.93 ns/eval 151 | S0:5.3e+36 
sL:1641011596.122295 in 8.605017 second;n: 8556380160; 1.01 ns/eval 152 | S0:5.3e+36 sL:1641011596.122295 in 9.766621 second;n: 8556380160; 1.14 ns/eval 153 | 0.931 +- 0.027 154 | In Summary: Skylake(4.7GHz) AlderLake (5.2GHzPcore) 2ndBatch δ 155 | 1.55 +- 0.003 0.773 +- 0.017 (0.773 +- 0.016) 0.0σ 156 | 1.25 +- 0.003 0.889 +- 0.003 (0.881 +- 0.0013) 2.5σ 157 | 1.27 +- 0.006 0.969 +- 0.003 (0.893 +- 0.034) 2.2σ 158 | 1.89x 2.36 +- 0.003 1.20x 0.931 +- 0.027 (0.980 +- 0.01) 1.7σ 159 | Note that assessing CPU superscalar pipeline util is much more subtle than raw 160 | wall clock time. These "speed-ups" are really ratios of "incremental wall time 161 | per loop per lna() eval" in best possible, hot-everything cases. Min estimate 162 | here is simply min3 +- (med-min3)/3 which works ok-ish as per final δ. ]# 163 | -------------------------------------------------------------------------------- /adix/bist.nim: -------------------------------------------------------------------------------- 1 | ##[ Binary Indexed Sum Tree (BIST); Fenwick proposed "BIT" but that A) collides 2 | w/many uses B) takes partial (S)ums as implied, but explicit is better (though 3 | products can work) and C) does not rhyme with "dist" (for distribution - what it 4 | is mostly about). While Inet has tutorials, to my knowledge no one (yet) 5 | collects all these algos in one place. Fenwick1994 itself messed up `invCDF`, 6 | correcting w/a tech report a year later. This code only allocates needed space 7 | & uses 0-based array indexing. See https://en.wikipedia.org/wiki/Fenwick_tree 8 | 9 | The idea of a standard binary heap with `kids(k)@[2k],[2k+1]` for dynamic 10 | distributions goes back to Wong&Easton 1980 (or earlier?). Fenwick's clever 11 | index encoding/overlaid trees idea allows using 1/4 to 1/2 that space (only max 12 | index+1 array elements vs `2*ceilPow2(n)`), a constant factor improvement. Good 13 | explanations truly need figures, as in the original Fenwick paper | Wikipedia. 14 | 15 | The `Bist[T]` type in this module is generic over the type of counters used for 16 | partial sums|counts. For few total items, you can use a `Bist[uint8]` while for 17 | many you want to use `Bist[uint32]`. This can be space-optimized up to 2X 18 | further with `adix/sequint` specialized to store an array of B-bit counters. 19 | Ranked B-trees are faster for >24..28-bit index spaces as L3 CPU caching fails, 20 | but needing >7..8 decimal dynamic ranges is also rare.
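A minimal usage sketch (the `uint32` counter type and the tiny domain of 8 values are arbitrary illustrative choices; `initBist`, `inc`, `pmf`, `cdf` & `quantile` are all defined below):

  var b = initBist[uint32](8)   # counters for values 0..7
  b.inc 3, 1                    # one sample of value 3
  b.inc 5, 2                    # two samples of value 5
  assert b.pmf(5) == 2          # point count at 5
  assert b.cdf(5) == 3          # inclusive prefix sum over 0..5
  echo b.quantile(0.5)          # Parzen-interpolated median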
]## 21 | when not declared assert: import std/assertions 22 | import cligen/sysUt, bitop # cfor, `>>=`, `&=`; `ceilPow2` 23 | 24 | type Bist*[T: SomeNumber] = object ## A razor thin wrapper around `seq[T]` 25 | tot*: T # total counted population, via history of inc(i, d) 26 | data*: seq[T] # The Fenwick array/BIST; Relevant seq ops pass through 27 | 28 | proc init*[T](t: var Bist[T], len: int) = t.data.setLen len 29 | proc initBist*[T](len: int): Bist[T] = result.init len 30 | proc len*[T](t: Bist[T]): int = t.data.len 31 | func space*[T](t: Bist[T]): int = t.sizeof + t.data.len*T.sizeof 32 | proc count*[T](t: Bist[T]): T = t.tot 33 | proc `[]`*[T](t: Bist[T], i: int): T = t.data[i] 34 | proc `[]`*[T](t: var Bist[T], i: int): var T = t.data[i] 35 | proc `[]=`*[T](t: var Bist[T], i: int, x: T) = t.data[i] = x 36 | proc clear*[T](t: var Bist[T]) = 37 | t.tot = 0; zeroMem t.data[0].addr, t.len*T.sizeof 38 | 39 | proc inc*[T](t: var Bist[T]; i: int; d: T) = 40 | ## Adjust for count increment by `d`; Tm ~ 1/2..3/4 lg n 41 | t.tot += d 42 | cfor (var i = i.int), i < t.len, i |= i + 1: t[i] += d #Go down update tree 43 | 44 | proc dec*[T](t: var Bist[T]; i: int; d: T) = 45 | ## Adjust for count decrement by `d`; Tm ~ 1/2..3/4 lg n 46 | t.tot -= d 47 | cfor (var i = i.int), i < t.len, i |= i + 1: t[i] -= d #Go down update tree 48 | 49 | proc up*[T](t: var Bist[T]) = discard ## Simple no-op for BISTs 50 | 51 | proc cdf*[T](t: Bist[T], i: int): T = 52 | ## INCLUSIVE `sum(pmf[0..i])`, (rank,EDF,prefix sum,scan,..); Tm ~ #1-bits in `i` 53 | cfor (var i = i + 1), i > 0, i &= i - 1: #Go up interrogation tree 54 | result += t[i - 1] 55 | 56 | proc pmf*[T](t: Bist[T], i: int): T = 57 | ## Probability Mass Function @i; Avg Tm ~ 2 probes; Max Tm ~ lg n 58 | result = t[i] 59 | cfor (var mask = 1), (i and mask) == mask, mask <<= 1: 60 | result -= t[i - mask] #while LSB==1: subtract & mv up tree 61 | 62 | proc invCDF*[T](t: Bist[T], s: T; s0: var T): int = 63 | ## For `0 < s <= tot`, bracket ECDF jump `>= s`. I.e. find `i0, s0` so `s0 = 64 | ## sum(..< i0) < s yet sum(..i0) >= s` in `lgCeil n` array probes. 65 | assert 0<=s and s<=t.tot, "Bist.invCDF OORange sum " & $s & " of " & $t.tot 66 | var c = s #NOTE: s<0|s>tot are invalid inputs 67 | cfor (var half = t.data.len.ceilPow2 shr 1), half != 0, half >>= 1: 68 | var mid = result + half - 1 69 | if mid < t.data.len and t[mid] < c: 70 | c -= t[mid] 71 | result = mid + 1 72 | s0 = s - c 73 | 74 | proc fromCnts*[T](t: var Bist[T]) = 75 | ## In-place bulk convert/reformat `t[]` from counts to BIST; Max time `~1*n`. 76 | t.tot = 0 77 | for i in 0 ..< t.len: 78 | let j = i or (i + 1) 79 | if j < t.len: 80 | t[j] += t[i] 81 | else: t.tot += t[i] #..nodes w/no in-range parent sum to tot 82 | 83 | proc toCnts*[T](t: var Bist[T]) = 84 | ## In-place bulk convert/reformat `t[]` from BIST to counts; Max time ~1*n 85 | ## *Unlike the others, this routine only works for power of 2-sized arrays*. 86 | cfor (var i = t.len), i != 0, i >>= 1: #Long strides give ~n inner loops. 87 | cfor (var j = 2*i - 1), j < t.len, j += 2*i: #*Might* be slower than just 88 | t[j] -= t[j - i] #..looping & calling `pmf`. 89 | 90 | proc nPDF*[T](t: Bist[T]): seq[float32] = ## Return classic PMF from read-only BIST 91 | result.setLen t.len; let s=1/t.tot.float32;for i,r in mpairs result:r=s*t.pmf(i).float32 92 | 93 | proc nCDF*[T](t: Bist[T]): seq[float32] = ## Return classic CDF from read-only BIST 94 | result = t.nPDF; for i in 1 ..< t.len: result[i] += result[i - 1] # .cumsum?
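# Worked layout example (illustrative only): for per-index counts [1, 2, 1, 3]
# the overlaid-tree array built by `inc`/`fromCnts` is data = [1, 3, 1, 7] with
# tot = 7: data[1] holds count(0..1), data[3] holds count(0..3), while data[0]
# & data[2] hold single counts; `pmf`, `cdf` & `invCDF` above unwind that.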
95 | 96 | proc `$`*[T](t: Bist[T]): string = "tot: " & $t.count & " pmf: " & $t.nPDF 97 | 98 | proc invCDF*[T](t: Bist[T], s: T): (int, T) = result[0] = t.invCDF(s, result[1]) 99 | ## For `0 < s <= tot` return `(i0,s0)` so `sum(..<i0) = s0 < s` yet `sum(..i0) >= s` 100 | 101 | proc invCDF*[T](t: Bist[T]; s: T; s0, s1: var T): int = 102 | ## For `0 < s <= tot`, find `i0,s0,s1` so `s0 < s <= s1` and `s0+pmf(i0)==s1`. 103 | result = t.invCDF(s, s0) 104 | s1 = s0 + t.pmf(result) 105 | 106 | proc min*[T](t: Bist[T]): int = ## Simple wrapper: invCDF(t, 1) 107 | var s0: T; t.invCDF(1, s0) 108 | 109 | proc max*[T](t: Bist[T]): int = ## Simple wrapper: invCDF(t,t.count). 110 | var s0: T; t.invCDF(t.tot.T, s0) 111 | 112 | from std/fenv import epsilon 113 | proc quantile*[T](t: Bist[T]; q: float; iL,iH: var int): float = 114 | ## Parzen-interpolated quantile; E.g., q=0.9 => 90th percentile. ``answer = 115 | ## result*iL + (1-result)*iH``, but is left to caller to do { in case it is 116 | ## mapping larger numeric ranges to/from iL,iH }. Tm ~ ``2*lg(addrSpace)``. 117 | ## Unlike other (broken!) quantile-interpolation methods, Parzen's connects 118 | ## midpoints of vertical CDF jumps, not horizontal. This makes more sense, 119 | ## corresponding to Wilcoxon 1945 & later tie mid-ranking recommendations. 120 | assert t.tot > 0, "quantile(Bist[T]) requires non-empty Bist." 121 | var sL0, sL1, sH0, sH1: T #You probably want to draw a CDF to 122 | let n = t.tot.float #..fully understand this code. 123 | let qN = q*n 124 | let wq = when T is SomeFloat: T.epsilon*n else: 1.T # A Quantum Of Ctr Wgt 125 | if qN <= 0.5*wq.float : iL = t.min;iH=0;return 1 #Early tails rets; Pure iL 126 | if qN >= n - 0.5*wq.float: iL = t.max;iH=0;return 1 #{Early body are pure iH.} 127 | let dqN=when T is SomeFloat: wq else: 1.5 # Min round-off + max odds high side 128 | iH = t.invCDF(T(qN + dqN), sH0, sH1) # sH0 < qN+dqN <= sH1 129 | var sMidH = 0.5*float(sH0 + sH1) #This guess works 90+% of the time.. 130 | if sMidH < qN: #..but can fail for large sH1 - sH0. 131 | if sH1 < t.tot: #When it fails, want next higher bin 132 | iH = t.invCDF(sH1 + wq, sH0, sH1) 133 | sMidH = 0.5*float(sH0 + sH1) 134 | else: return 0 #..unless @HIGHEST already=>all iH 135 | if sH0 < wq: return 0 #For qN this small, iH = iL = min. 136 | iL = t.invCDF(sH0, sL0, sL1) #..Also, cannot call invCDF(0). 137 | when T is SomeFloat: # Should be impossible,but round-off 138 | if sL1 > sH0 + wq: #..makes it happen sometimes & when 139 | iL = t.invCDF(sH0 - wq, sL0, sL1) #..it does, we want next lower bin. 140 | let sMidL = 0.5*float(sL0 + sL1) #Mid-vertJump(nxtLwrBin) gives line 141 | min (sMidH - qN)/(sMidH - sMidL), 1.0 #Runs of T.eps-sized bins=>anomalies 142 | 143 | proc quantile*[T](t: Bist[T], q: float): float = 144 | ## Parzen-interpolated quantile when no caller index mapping is needed 145 | var iL, iH: int 146 | let fL = t.quantile(q, iL, iH) 147 | fL*iL.float + (1 - fL)*iH.float 148 | 149 | when isMainModule: 150 | import cligen, std/strutils 151 | when not declared(addFloat): import std/formatfloat 152 | type ct = uint16 153 | proc tbist(num=9, verb=false, parzen=false, thresh=0.03, args: seq[int]): int= 154 | ##[Eg `tbist $(echo 0 2 4 4 4 6 6 6 6 8 | tr \ \\n | shuf)`. Exit status is 155 | bitmask of PMF|CDF|invCDF|Extremes|discontinuousQtls|badFrom|badToCnts. ]## 156 | result = 0 #Set to non-zero on failure for easy halt of randomized tests.
157 | if args.len == 0: quit "Called with no args; --help explains more", 1 158 | var cntR = newSeq[ct](num) #Reference count/PMF/histo 159 | var sumR = newSeq[ct](num) #Reference prefix sum/CDF 160 | var minR = int.high 161 | var maxR = int.low 162 | var b = initBist[ct](num) 163 | for a in args: #Load up bist & references 164 | if a < 0 : echo "tbist: ignoring negative ", a ; continue 165 | if a >= num: echo "tbist: ignoring out of bounds ", a; continue 166 | cntR[a].inc #Reference cntR 167 | minR = min(minR, a) 168 | maxR = max(maxR, a) 169 | b.inc(a, +1) 170 | sumR[0] = cntR[0] #Low-Tech Prefix Sum/CDF 171 | for i in 1 ..< num: 172 | sumR[i] = sumR[i-1] + cntR[i] #Ref cumulative/pfx sum 173 | if verb: #Print Table 174 | echo "i(dec)\ti(bin)\tT\tcount\tcsum" 175 | for i in 0 ..< num: 176 | echo "$1\t$2\t$3\t$4\t$5"%[ $i, toBin(i,6), $b[i], $cntR[i], $sumR[i] ] 177 | for i in 0 ..< b.len: #Test pmf() 178 | if b.pmf(i) != cntR[i]: 179 | echo "i: ", i, "\tcntR: ", cntR[i], " b.pmf:", b.pmf(i); result |= 1 180 | for i in 0 ..< b.len: #Test cdf() 181 | if b.cdf(i) != sumR[i]: 182 | echo "i: ", i, "\tsumR: ", sumR[i], " b.cdf:", b.cdf(i); result |= 2 183 | for s in 1.ct .. args.len.ct: #Test invCDF 4all cumSums 184 | let (i, s0) = b.invCDF(s) 185 | let j = i - 1; let s1 = s0 + cntR[i] 186 | if s1 != sumR[i] or (j >= 0 and s0 != sumR[j]) or not(s0 < s and s <= s1): 187 | echo "cs: ",s," im1: ",j," s0: ",s0," i: ",i," s1: ",s1; result |= 4 188 | if b.min != minR: echo "wrong min: ", b.min, " not ", minR; result |= 8 189 | if b.max != maxR: echo "wrong max: ", b.max, " not ", maxR; result |= 8 190 | let dq = 1.0/2048.0 #Test quantile continuity 191 | var q0 = -1.0; var qP0 = 0.0 #Take dq as param? 192 | cfor (var q = 0.0), q <= 1.0, q += dq: 193 | let qP = b.quantile(q) 194 | if parzen : echo "P: ", q, " ", qP 195 | if q0 > -1 and abs(qP - qP0) > thresh: 196 | result |= 16 #NOTE: Test less objective; Set parzen to assess manually. 197 | echo "PdisCont: ",q0," -> ",q," ",qP0," -> ",qP," |qP0-qP|: ",abs(qP-qP0) 198 | q0 = q; qP0 = qP #save last loop values 199 | var t = b; t.data = cntR #Bulk Histogram -> BIST 200 | t.fromCnts 201 | if b.data != t.data: 202 | echo "- bad fromCnts chk -"; result |= 32 203 | for i in 0 ..< b.len: echo "i: ", i, "\tT: ", t[i] 204 | if num.isPow2: 205 | b.toCnts #Bulk BIST -> Histogram 206 | if b.data != cntR: #NOTE: `b` is clobbered 207 | echo "- bad toCnts chk -"; result |= 64 208 | for i in 0.. -------------------------------------------------------------------------------- /adix/tdigest.nim: -------------------------------------------------------------------------------- 87 | ... # => merge into existing group 88 | s.mrg[s.nM].m = s.mrg[s.nM].m + (s.buf[i].m - 89 | s.mrg[s.nM].m)*s.buf[i].W/s.mrg[s.nM].W 90 | s.buf[i].w = 0 91 | else: # didn't fit => mv2next output; Cp 1st group 92 | wSoFar += s.mrg[s.nM].W 93 | k = ks[s.scale](wSoFar/s.wTot, norm) 94 | wLim = s.wTot * qs[s.scale](k + 1, norm) 95 | s.nM.inc 96 | if s.nM >= s.mrg.len: 97 | s.mrg.setLen s.nM + 1; {.cast(noSideEffect).}: echo "AUTO-EXPAND" 98 | s.buf.setLen s.buf.len + 1 99 | s.mrg[s.nM].m = s.buf[i].m 100 | s.mrg[s.nM].w = s.buf[i].w 101 | s.buf[i].w = 0 102 | inc s.nM # points to next empty cell 103 | if s.wTot > 0: # update extreme values 104 | s.min = min(s.min, s.mrg[0].m) 105 | s.max = max(s.max, s.mrg[s.nM - 1].m) 106 | 107 | func mergeNew*(s: var DigesT, force=false, cpr = -1.0) = 108 | if s.wTot == 0 and s.wBuf == 0: return 109 | if force or s.wBuf > 0: # Do merge in reverse @odd times to avoid lo2hi bias.
110 | s.merge cpr 111 | s.nMerges.inc 112 | s.nT = 0 113 | s.wBuf = 0 114 | 115 | func add*(s: var DigesT, x: float, w=1) = ## Main update API 116 | if isNaN(x): raise newException(ValueError, "cannot add NaN") 117 | if s.nT >= s.buf.len - s.nM - 1: 118 | s.mergeNew 119 | let i = s.nT; inc s.nT 120 | s.min = min(s.min, x) 121 | s.max = max(s.max, x) 122 | s.buf[i].w = w 123 | s.buf[i].m = x 124 | s.wBuf += w.float 125 | 126 | func compress*(s: var DigesT) = s.mergeNew(true, s.cpr) 127 | ## best done only when we want to show results to the outside world. 128 | 129 | iterator groups*(s: DigesT): Group = 130 | for i in 0 ..< s.nM: yield Group(m: s.mrg[i].m, w: s.mrg[i].w) 131 | 132 | func add*(s: var DigesT, others: var openArray[DigesT]) = 133 | for other in mitems(others): 134 | other.compress 135 | for c in other.groups: s.add(c.m, c.w) 136 | 137 | func weightAvgOrd(x1, w1, x2, w2: float): float {.inline.} = 138 | let x = (x1*w1 + x2*w2)/(w1 + w2) 139 | return max(x1, min(x, x2)) 140 | 141 | func weightedAverage(x1, w1, x2, w2: float): float {.inline.} = 142 | if x1 <= x2: weightAvgOrd(x1, w1, x2, w2) # WeightedAvg of `x1, s1` & `x2, w2` 143 | else : weightAvgOrd(x2, w2, x1, w1) # Guaranteed on `[x1, x2]` 144 | 145 | func quantile*(s: var DigesT, q: float): float = 146 | if q < 0.0 or q > 1.0: 147 | raise newException(ValueError, "q must be on [0,1], not " & $q) 148 | s.mergeNew 149 | if s.nM == 0: return NaN 150 | if s.nM == 1: return s.mrg[0].m 151 | var n = s.nM # At least two groups now 152 | var ix = q * s.wTot.float # weight units offset we want 153 | if ix < 1: return s.min # boundaries; return min|max; likely moot 154 | # If lo group has >1 sample, still know 1 sample occurred @min => interpol. 155 | if s.mrg[0].w > 1 and ix < s.mrg[0].W/2.0: # only 1 sample @min => less weight 156 | return s.min + (ix - 1)/(s.mrg[0].W/2.0 - 1) * (s.mrg[0].m - s.min) 157 | if ix > s.wTot - 1: return s.max # likely moot 158 | # If hi group has >1 sample, still know 1 sample occurred @max => interpol. 
159 | if s.mrg[n-1].w > 1 and s.wTot - ix <= s.mrg[n-1].W/2.0: 160 | return s.max - (s.wTot-ix-1)/(s.mrg[n-1].W/2.0 - 1)*(s.max - s.mrg[n-1].m) 161 | var wSoFar = s.mrg[0].W/2.0 # between exVals, interpol betw groups 162 | for i in 0 ..< n - 1: 163 | let dw = float(s.mrg[i].w + s.mrg[i+1].w)/2.0 164 | if wSoFar + dw > ix: # groups i, i+1 bracket current point 165 | var leftUnit = 0.0 # check for unit weight 166 | if s.mrg[i].w == 1: 167 | if ix - wSoFar < 0.5: 168 | return s.mrg[i].m # within the singleton's sphere 169 | else: 170 | leftUnit = 0.5 171 | var rightUnit = 0.0 172 | if s.mrg[i+1].w == 1: 173 | if wSoFar + dw - ix <= 0.5: 174 | return s.mrg[i+1].m # no interpolation needed near singleton 175 | rightUnit = 0.5 176 | let z1 = ix - wSoFar - leftUnit 177 | let z2 = wSoFar + dw - ix - rightUnit 178 | return weightedAverage(s.mrg[i].m, z2, s.mrg[i+1].m, z1) 179 | wSoFar += dw 180 | # Handled singleton@end above 181 | let z1 = ix - s.wTot - s.mrg[n-1].W/2.0 # wSoFar =~ s.wTot - s.mrg[n-1].w/2 182 | let z2 = s.mrg[n-1].W/2.0 - z1 # =>interp out to max value ever seen 183 | return weightedAverage(s.mrg[n-1].m, z1, s.max, z2) 184 | 185 | func cdf*(s: var DigesT, x: float): float = 186 | if x.isNaN: return NaN 187 | s.mergeNew 188 | if s.nM == 0: return NaN # no data to examine 189 | if x < s.min: return 0.0 # -inf works fine 190 | if x > s.max: return 1.0 # +inf works fine 191 | if s.nM == 1: # exactly one group, should have max==min 192 | let width = s.max - s.min 193 | if x - s.min <= width: return 0.5 # min & max too close to interpolate 194 | else: return (x - s.min)/(s.max-s.min) # interpol if weight>0, max != min 195 | let n = s.nM 196 | if x < s.mrg[0].m: # check for the LO TAIL 197 | let dx = s.mrg[0].m - s.min 198 | if dx > 0.0: # do not divide by zero in interpol 199 | return if x == s.min: 0.5/s.wTot # sample exactly @min 200 | else: (1.0 + (x - s.min)/dx * (s.mrg[0].W/2.0 - 1.0))/s.wTot 201 | else: return 0.0 # should be redundant with the check x < s.min 202 | if x > s.mrg[n-1].m: # and the HI TAIL 203 | let dx = s.max - s.mrg[n-1].m 204 | if dx > 0.0: 205 | return if x == s.max: 1.0 - 0.5/s.wTot # single sample exactly @max 206 | else: 1.0 - (1.0 + (s.max-x)/dx*(s.mrg[n-1].W/2.0 - 1.0))/s.wTot 207 | else: return 1.0 # should be redundant with the check x > s.max 208 | var wSoFar = 0.0 # Now mrg[0].m<=x<=mrg[n-1].m >= 2 groups; either >=1 209 | for i in 0 ..< n-1: # consecutive groups all @exactly x OR c0 < x < c1 210 | if s.mrg[i].m == x: # wSoFar does not yet include s.mrg[i].w 211 | var dw = 0.0 # Have >=1 groups @x 212 | for j in i ..< n: # treat as 1, accumulating weight in dw 213 | if s.mrg[j].m != x: break 214 | dw += s.mrg[j].W 215 | return (wSoFar + dw/2.0)/s.wTot 216 | elif s.mrg[i].m <= x and x < s.mrg[i+1].m: # betw groups 217 | if s.mrg[i+1].m - s.mrg[i].m > 0.0: # handle FP issues 218 | var loExclW = 0.0 # Singleton groups have all weight @mean 219 | var hiExclW = 0.0 # & should not be smoothed/interpolated.
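# Interpolation plan for the generic betw-groups case (matches the code just
# below): exclude 0.5 count on any singleton side (loExclW/hiExclW), then map
# x linearly from [lo, hi] onto the remaining half-weights, i.e. return
# (wSoFar + mrg[i].W/2 + loExclW + dwNoSingleton*(x - lo)/(hi - lo))/wTot.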
220 | if s.mrg[i].w == 1: 221 | if s.mrg[i+1].w == 1: # 2 singletons=>no interpol; lo in, hi out 222 | return (wSoFar + 1.0)/s.wTot 223 | else: 224 | loExclW = 0.5 225 | elif s.mrg[i+1].w == 1: 226 | hiExclW = 0.5 227 | let dw = float(s.mrg[i].w + s.mrg[i+1].w)/2.0 228 | let lo = s.mrg[i].m # adjust endpoints for any singleton 229 | let hi = s.mrg[i+1].m # adjusts have limited effect on endpoints 230 | let dwNoSingleton = dw - loExclW - hiExclW 231 | let base = wSoFar + s.mrg[i].W/2.0 + loExclW 232 | return (base + dwNoSingleton * (x - lo)/(hi - lo))/s.wTot 233 | else: # distinct but too close for safe interpolation 234 | return (wSoFar + float(s.mrg[i].w + s.mrg[i+1].w)/2.0)/s.wTot 235 | else: 236 | wSoFar += s.mrg[i].W 237 | 1.0 - 0.5/s.wTot 238 | 239 | when isMainModule: 240 | when defined(test): 241 | import std/[os, strutils] # Helpful to run against: -12 -8 -4 -1 0 1 4 8 12 242 | var s = initDigesT(a=0.125, b=10.0, n=8) 243 | for i in 1 .. paramCount(): s.add parseFloat(paramStr(i)) 244 | for q in [0.01, 0.05, 0.25, 0.50, 0.75, 0.95, 0.99]: echo s.quantile(q) 245 | echo "s: ", s 246 | else: 247 | import std/[random, times, strformat] 248 | when defined danger: randomize() 249 | const N = 750_000 250 | var data = newSeq[float](N) 251 | const Q = [0.001,0.01,0.05,0.1587,0.25,0.50,0.75,0.8413,0.95,0.99,0.999] 252 | var res = newSeq[float](Q.len) 253 | for i in 0..