├── adix ├── nim.cfg ├── stat.nim ├── cpuCT.nim ├── memutil.nim ├── metab.nim ├── ways.nim ├── bitop.nim ├── uniqce.nim ├── topk.nim ├── amoft.nim ├── bltab.nim ├── oats.nim ├── embist.nim ├── cumsum.nim ├── lghisto.nim ├── sequint.nim ├── hist.nim ├── lmbist.nim ├── lna.nim ├── bist.nim └── tdigest.nim ├── util ├── nim.cfg └── lfreq.nim ├── tests ├── nim.cfg ├── mostCommon.nim ├── ctab.nim ├── wf.nims ├── toSet.nim ├── tBadHash2.nim ├── edkey.nim ├── repeats.nim ├── testTab.nim ├── tHighTab.nim ├── tRandom.nim ├── lfreq.awk ├── tParams.nim ├── mvWinGen ├── tSuperHigh.nim ├── anaForum.nim ├── dtshell.nim ├── tBadHash.nim ├── writeHash.nim ├── ppss.nim ├── anaPrime.nim ├── sshell.nim ├── kmCmp.sh ├── ucl.nim ├── tshell.nim ├── wu.nim ├── wfr.nim ├── wf.nim ├── bl.nim └── btshell.nim ├── .gitignore ├── adix.nimble ├── adix.nim ├── LICENSE ├── .github └── workflows │ └── ci.yml ├── TODO.md ├── NOTES.md └── README.md /adix/nim.cfg: -------------------------------------------------------------------------------- 1 | path=".." 2 | -------------------------------------------------------------------------------- /util/nim.cfg: -------------------------------------------------------------------------------- 1 | path=".." 2 | -------------------------------------------------------------------------------- /tests/nim.cfg: -------------------------------------------------------------------------------- 1 | path=".." 2 | path="../adix" 3 | path="../../cg" # cligen 4 | -------------------------------------------------------------------------------- /adix/stat.nim: -------------------------------------------------------------------------------- 1 | {.deprecated: "Use mvstat instead".} 2 | import mvstat 3 | export mvstat 4 | -------------------------------------------------------------------------------- /tests/mostCommon.nim: -------------------------------------------------------------------------------- 1 | import adix/lptabz 2 | let data = ["a", "b", "c", "c", "d", "e", "e", "f", "g", "g", "g", "h"] 3 | for tup in mostCommon(data, 3): echo tup 4 | -------------------------------------------------------------------------------- /tests/ctab.nim: -------------------------------------------------------------------------------- 1 | import metab 2 | var c = initTab[int8,int]() 3 | c.inc 2 4 | c.inc 3 5 | c.inc 3 6 | c.inc 2, -1 7 | c.inc 1 8 | echo c 9 | for k,v in c.topByVal(n=1): 10 | echo k," ",v 11 | -------------------------------------------------------------------------------- /tests/wf.nims: -------------------------------------------------------------------------------- 1 | switch("threads", "on") 2 | if defined(tcc): 3 | switch("tlsEmulation", "on") 4 | if (NimMajor,NimMinor,NimPatch) >= (1,6,0): switch("mm", "markAndSweep") 5 | else: switch("gc", "markAndSweep") 6 | switch("passL","-lm") 7 | -------------------------------------------------------------------------------- /tests/toSet.nim: -------------------------------------------------------------------------------- 1 | import cligen, adix/lptabz 2 | 3 | proc test(nums: seq[int32]) = 4 | when defined(fromVar): 5 | var s: LPSet[int32] 6 | s.setCap(nums.len) 7 | else: 8 | var s = initLPSet[int32](nums.len, minFree=0) 9 | for x in nums: s.incl x 10 | echo s.getCap 11 | echo s 12 | 13 | dispatch(test) 14 | -------------------------------------------------------------------------------- /tests/tBadHash2.nim: -------------------------------------------------------------------------------- 1 | import metab 2 | 3 | #import althash 4 | #proc hash(h, 
salt: Hash): Hash = hashRoMu1(h) xor cast[Hash](salt) 5 | 6 | var one = initSet[int]() 7 | for i in 0 ..< (1 shl 23): 8 | one.incl (i shl 25) 9 | 10 | var ds = one.depths 11 | echo ds 12 | echo one.len, "/", one.getCap 13 | echo "Stats: ", one.depthStats 14 | -------------------------------------------------------------------------------- /tests/edkey.nim: -------------------------------------------------------------------------------- 1 | import metab 2 | var c = initTab[int8,int]() 3 | c.inc 2 4 | c.inc 3 5 | c.inc 4 6 | c.inc 5 7 | c.inc 6 8 | c.inc 7 9 | c.inc 9 10 | c.inc 9 11 | c.editKey 9, 8 12 | for i in 2'i8..8: 13 | echo i, " ", c[i] 14 | 15 | let t = c 16 | echo t.nthPair(6) 17 | let tup = c.nthPair(0) 18 | echo tup[0] 19 | tup[1][] = 9 20 | echo c 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | noVC 2 | tests/anaPrime 3 | tests/anaForum 4 | tests/anaPrime 5 | tests/btshell 6 | tests/dtshell 7 | tests/ppss 8 | tests/repeats 9 | tests/sshell 10 | tests/tBadHash2 11 | tests/tBadHash 12 | tests/testTab 13 | tests/tHighTab 14 | tests/tParams 15 | tests/tRandom 16 | tests/tshell 17 | tests/tSuperHigh 18 | tests/writeHash 19 | tests/bl 20 | tests/wf 21 | patches/ 22 | -------------------------------------------------------------------------------- /adix.nimble: -------------------------------------------------------------------------------- 1 | # Package 2 | version = "0.7.6" 3 | author = "Charles Blake" 4 | description = "An Adaptive Index Library for Nim" 5 | license = "MIT/ISC" 6 | 7 | # Deps 8 | requires "nim >= 2.0.0" 9 | requires "cligen >= 1.9.5" 10 | skipDirs = @[ "tests" ] 11 | 12 | # Older Nim must use adix < 0.5.5 & comment out the below `bin`. 13 | bin = @[ 14 | "util/lfreq", # Somewhat efficient line frequency calculator 15 | ] 16 | -------------------------------------------------------------------------------- /tests/repeats.nim: -------------------------------------------------------------------------------- 1 | when not declared(stdin): import std/syncio 2 | import tables, cligen 3 | 4 | proc repeats(thresh=2) = 5 | ## Read 8 byte hashes from stdin & print histogram of any with count > thresh. 6 | var h: int64 7 | var cnt: Table[int64, int] 8 | while stdin.readBuffer(cast[cstring](h.addr), 8) == 8: 9 | cnt.mgetOrPut(h, 0).inc 10 | for h,c in cnt: 11 | if c >= thresh: 12 | echo "h: ", h, " count: ", c 13 | 14 | dispatch(repeats) 15 | -------------------------------------------------------------------------------- /tests/testTab.nim: -------------------------------------------------------------------------------- 1 | ## This is an `include` file to attach a test suite to a given ??tab.nim impl. 
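## (Usage sketch inferred from the code below, not stated elsewhere: a tiny
## driver file supplies a concrete, zero-argument `initTab` accepting `uint32`
## keys and `int` counts, then does `include testTab` to get the CLI test.)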
2 | 3 | import cligen 4 | 5 | proc test*(nums: seq[int32]) = 6 | var t = initTab() 7 | for x in nums: 8 | if x >= 0: 9 | echo "ADD ", $x 10 | t.mgetOrPut(x.uint32, 0).inc 11 | elif (-x).uint32 in t: echo "HAS ", $(-x) 12 | else: echo "NO ", $(-x) 13 | # echo t.s.data 14 | echo t 15 | # echo t.s.data 16 | # echo t.s.depths 17 | 18 | dispatch(test) 19 | -------------------------------------------------------------------------------- /tests/tHighTab.nim: -------------------------------------------------------------------------------- 1 | when not declared(addFloat): import std/formatfloat 2 | import metab, althash, times 3 | 4 | const shift = 49'u 5 | proc hash(x: uint64): Hash = hashRoMu1(x) 6 | 7 | var one = initTab[uint64, int](4, rehash=false) 8 | var t0 = epochTime() 9 | for i in 0'u ..< ((1'u shl 15) - 1): 10 | one[uint64(i shl shift)] = 2 11 | echo epochTime() - t0, " seconds" 12 | var ds = one.depths 13 | echo ds 14 | echo "MAX DEPTH: ", ds.len 15 | echo one.len, "/", one.getCap 16 | echo "Stats: ", one.depthStats 17 | # one.debugDump 18 | -------------------------------------------------------------------------------- /tests/tRandom.nim: -------------------------------------------------------------------------------- 1 | when not declared(addFloat): import std/formatfloat 2 | import os, strutils, metab, random 3 | 4 | randomize() 5 | 6 | let num = if paramCount() > 0: parseInt(paramStr(1)) else: 1 7 | let den = if paramCount() > 1: parseInt(paramStr(2)) else: 4 8 | let cnt = if paramCount() > 2: parseInt(paramStr(3)) else: 3*(1 shl 10) 9 | let nTr = if paramCount() > 3: parseInt(paramStr(4)) else: 30 10 | let rob = paramCount() > 4 11 | 12 | #echo "USING ", num, '/', den, " and ", cnt, " entries." 13 | for t in 1..nTr: 14 | var one = initSet[int](numer=num, denom=den, robinhood=rob) 15 | for i in 1..cnt: 16 | one.incl rand(1 shl 32) 17 | echo "Ut: ", one.len.float/one.getCap.float, " St: ", one.depthStats 18 | -------------------------------------------------------------------------------- /adix.nim: -------------------------------------------------------------------------------- 1 | when defined(nimdoc): 2 | import adix/althash 3 | import adix/amoft 4 | import adix/bist 5 | import adix/bitop 6 | import adix/bltab 7 | import adix/btree 8 | import adix/cpuCT 9 | import adix/cumsum 10 | import adix/ditab 11 | import adix/embist 12 | import adix/hist 13 | import adix/lghisto 14 | import adix/lmbist 15 | import adix/lna 16 | import adix/lptabz 17 | import adix/memutil 18 | import adix/metab 19 | import adix/mvstat 20 | import adix/nsort 21 | import adix/oats 22 | import adix/sequint 23 | import adix/stat 24 | import adix/tdigest 25 | import adix/topk 26 | import adix/uniqce 27 | import adix/xhist1 28 | import adix/ways 29 | else: 30 | {.error: "use `import adix/{module of interest}`".} 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2020 Charles L. Blake. 2 | 3 | Permission to use, copy, modify, and/or distribute this software for any 4 | purpose with or without fee is hereby granted, provided that the above 5 | copyright notice and this permission notice appear in all copies. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 8 | REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 9 | AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 10 | INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 11 | LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE 12 | OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 13 | PERFORMANCE OF THIS SOFTWARE. 14 | -------------------------------------------------------------------------------- /tests/lfreq.awk: -------------------------------------------------------------------------------- 1 | #!/bin/awk -f 2 | {c[$0]++}END{for(k in c)print c[k],k} 3 | 4 | # As has surely been noted about a gajillion times by now, an improvement on 5 | # Doug McIlroy's solution in Knuth-McIlroy is (probably): 6 | # tr -cs A-Za-z \\n | # Non-alpha -> newline; NOTE: apostrophe-quoting ambig. 7 | # tr A-Z a-z | # ASCII upper to lower 8 | # lfreq.awk | # Histogram lines (this script) 9 | # sort -n | tail # top 10; Add `| tac` if you like decreasing 10 | # 11 | # While almost any timing strongly depends on used vocab & its sampled growth, 12 | # I get `mawk` 2..3X slower than optimized Nim & `gawk` ~2X slower than `mawk`. 13 | # 14 | # Given enough CPU cores, all above stages run in parallel & execution time is 15 | # bounded by pipe BW & the slowest stage - likely this AWK script. McIlroy's 16 | # `sort|uniq -c` method may be better if unique lines exceed avail. phys RAM & 17 | # next level of mem hierarchy has high rand.access latency (eg. Winchester,net). 18 | -------------------------------------------------------------------------------- /tests/tParams.nim: -------------------------------------------------------------------------------- 1 | when not declared(addFloat): import std/formatfloat 2 | import random, stats, adix/[metab, althash, bitop], cligen 3 | 4 | proc statsVratio(N=7000, MaxNum=(1 shl 32), numer=1, denom=1, 5 | rehash=true, Robin=true, trials=10) = 6 | randomize() 7 | var util, dmax, dmean, dvar: RunningStat 8 | for t in 1..trials: 9 | var set = initSet[int](numer=numer, denom=denom, 10 | rehash=rehash, robinhood=Robin) 11 | for i in 1..N: 12 | let c0 = set.getCap 13 | let ut = set.len.float / set.getCap.float 14 | let ds = set.depthStats 15 | set.incl rand(MaxNum) 16 | if set.getCap != c0 and c0 > 64: # resized 17 | util.push ut 18 | dmean.push ds[0] 19 | dvar.push ds[1] 20 | dmax.push ds[2].float / lg(set.getCap).float 21 | 22 | echo "ratio: " , numer, "/", denom, " util: " , util.mean, 23 | " dmean: ", dmean.mean, " dvar: " , dvar.mean, " dmax/lg: " , dmax.mean 24 | 25 | dispatch(statsVratio) 26 | -------------------------------------------------------------------------------- /tests/mvWinGen: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | if [ $# -lt 1 ]; then 3 | cat <<-EOF 4 | Convert numbers on stdin to ppss|bt-shell ops tracking a size \$1 moving window. 5 | Repeatably tests trees of many shapes, sizes, duplicate key distros, op mixes. 6 | Zsh Eg. to gen input numbers: (repeat 99 {rshuffle \$(echo {1..99})|tr ' ' \\n}) 7 | EOF 8 | exit 1 9 | fi 10 | DEL="-0"; ADD="+1" # FIFO order 11 | if [ "$1" = "-u" ]; then # unordered 12 | DEL="-"; ADD="+"; shift 1 13 | fi 14 | win="$1" # window is $*/$@ or $1 .. 
$$# 15 | set dummy; shift 1 # set w/no arg dumps defs which is unwanted 16 | 17 | seq=1 18 | while read a 19 | do 20 | if [ $seq -gt $win ]; then 21 | echo $DEL $1 # w/dups s sets path to 0-side 22 | shift 1 23 | fi 24 | echo $ADD $a $seq # w/dups i1 appends at 1-side 25 | set "$@" $a 26 | echo n0 $(($#/2)) # query uninterpolated moving median 27 | if [ -n "$xtra" ]; then 28 | echo $xtra 29 | fi 30 | seq=$((seq+1)) 31 | done 32 | -------------------------------------------------------------------------------- /tests/tSuperHigh.nim: -------------------------------------------------------------------------------- 1 | when not declared(addFloat): import std/[formatfloat, objectdollar] 2 | import metab, althash, times#, math 3 | # 0 1 2 3 4 5 6 7 8 9 4 | #tombstone methods get depths ~ @[2, 1, 1, 1, 1, 32, 174, 40, 2, 1] 5 | #That is "roughly" a counter example for Python probe sequence, but it would be 6 | #better to push a lot more past depth 7 since 7x slower maybe isn't so obvious. 7 | # 8 | #rehash/robinhood mitigations are not enough in `lpset`. What seems to work 9 | #well is defining below hash which then makes actual hash RoMu1(RoMu1(x)). 10 | # 11 | #proc hash(x: uint64): Hash = hashRoMu1(x) 12 | #proc hash(x: uint64): Hash = hashRoMu2(x) # good hash for these keys @any shift 13 | 14 | const shift = 56'u 15 | 16 | var one = initSet[uint64](256, robinhood=true, rehash=true, numer=10, denom=1) 17 | var t0 = epochTime() 18 | for i in 0'u ..< 234: 19 | one.incl uint64(i shl shift) 20 | echo (epochTime() - t0)*1e9/234.0, " ns/elt" 21 | echo one.len, "/", one.getCap, " = ", one.len.float/one.getCap.float 22 | echo "Depths:", one.depths 23 | echo "Stats: ", one.depthStats 24 | one.debugDump 25 | -------------------------------------------------------------------------------- /adix/cpuCT.nim: -------------------------------------------------------------------------------- 1 | ## gcc/clang error out if the generated C includes a tmmintrin.h header on CPUs 2 | ## without -march=enabling the instructions. An admittedly expensive staticExec 3 | ## lets us probe a build-time system for all pre-defined C preprocessor macros 4 | ## in one execution. We then postprocess these into a set of flags for Nim 5 | ## compile-time `when` checks to make "fall back" easy/natural. 6 | from std/strutils import contains 7 | 8 | const ccDumpMacro {.used.} = " -dM -E -x c - 0: {.passc: "-march=native".} 25 | -------------------------------------------------------------------------------- /tests/anaForum.nim: -------------------------------------------------------------------------------- 1 | # Condensed & generalized version of Nim Forum post. It's 2 | # here for reference but is 17x slower than anaPrime due 3 | # to: sortedSig, Tab[*,seq[string]], hasKey, no presize... 
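# (For contrast: anaPrime.nim keys on a product of per-letter primes instead of
# a sorted-string signature, mmaps the word file, and pre-sizes its LPTabz --
# addressing the differences listed above.)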
4 | 5 | when not declared(open): import std/syncio 6 | import strutils, algorithm, os, tables, unidecode 7 | 8 | proc signature(word: string): string = 9 | let ascii = unidecode(word).toLowerAscii 10 | let sorted_word = sorted(ascii, system.cmp) 11 | result = sorted_word.join() 12 | 13 | proc main = 14 | if paramCount() < 2: # Parse command line 15 | quit("Usage: anagram ") 16 | let lookup_word = paramStr(2) 17 | let lookup_signature = signature(lookup_word) 18 | echo "Looking up '", lookup_word, "'" 19 | 20 | var anagrams = initTable[string, seq[string]]() 21 | for word in open(paramStr(1)).lines(): 22 | let signature = signature(word) 23 | if anagrams.hasKey(signature): 24 | anagrams[signature].add(word) 25 | else: 26 | anagrams[signature] = @[word] 27 | 28 | if anagrams[lookup_signature].len == 1: 29 | echo "'", lookup_word, "' has no anagrams" 30 | else: 31 | echo anagrams[lookup_signature] 32 | 33 | main() 34 | -------------------------------------------------------------------------------- /tests/dtshell.nim: -------------------------------------------------------------------------------- 1 | when not declared(stdin): import std/[syncio, objectdollar] 2 | import metab, os, strutils 3 | 4 | proc main() = 5 | var lgsz = parseInt(getEnv("LGSZ", "2")) 6 | var nP0, nP1, nG0, nG1, nD0, nD1: int = 0 7 | if paramCount() > 1: 8 | echo("Usage:\n ", paramStr(0), "< [gpdTPD]K [..]") 9 | quit(1) 10 | var t = initTab[int8,int](1 shl lgsz, rehash=false) 11 | var had: bool 12 | for line in lines(stdin): # Dispatch operations 13 | let cols = line.split 14 | let c = cols[0][0] 15 | let k = int8(if cols[0].len > 1: parseInt(cols[0][1 .. ^1]) else: 0) 16 | let v = if cols.len > 1: parseInt(cols[1]) else: 0 17 | case c 18 | of 'g': 19 | if k in t: nG1.inc 20 | else : nG0.inc 21 | of 'p': 22 | discard t.mgetOrPut(k, v, had) 23 | if had: nP0.inc 24 | else : nP1.inc 25 | of 'a': 26 | t.add(k, v) 27 | of 'd': 28 | if t.missingOrExcl(k): nD0.inc 29 | else : nD1.inc 30 | of 'T': echo t 31 | of 'D': echo t.depths 32 | of 'P': t.debugDump 33 | else: echo "UNKNOWN COMMAND:", c.repr; quit 2 34 | echo "nP1: ", nP1, " nP0: ", nP0, " nG1: ", nG1, " nG0: ", nG0, 35 | " nD1: ", nD1, " nD0: ", nD0 36 | main() 37 | -------------------------------------------------------------------------------- /tests/tBadHash.nim: -------------------------------------------------------------------------------- 1 | when not declared(addFloat): import std/formatfloat 2 | import metab, althash, times 3 | 4 | let iniSz = 4 5 | #let iniSz = 1 shl 15 6 | #let iniSz = 1 shl 18 7 | #let iniSz = 1 shl 23 8 | when defined(startWithRehash): 9 | let rh = true 10 | else: 11 | let rh = false 12 | when defined(althash): 13 | proc hash(x: int): Hash = hashRoMu1(x) 14 | elif defined(althash2): 15 | proc hash(x: int): Hash = hashRevFib(x) 16 | 17 | echo "START... " 18 | var t0 = epochTime() 19 | 20 | const shift = 25 21 | 22 | var one = initSet[int](iniSz, rehash=rh) 23 | #for i in 0 ..< ((1 shl 15) - 1): one.incl i 24 | for i in 0 ..< ((1 shl 23) - 19): one.incl (i shl shift) 25 | echo epochTime() - t0, " seconds" 26 | var ds = one.depths 27 | echo ds 28 | echo "MAX DEPTH: ", ds.len 29 | echo one.len, "/", one.getCap 30 | echo "Stats: ", one.depthStats 31 | # one.debugDump 32 | 33 | echo "LOOKING UP... " 34 | for i in 0 ..< ((1 shl 23) - 19): 35 | if (i shl shift) notin one: echo i, " WAS MISSING" 36 | echo "CLONING... 
" 37 | 38 | t0 = epochTime() 39 | var two = initSet[int](iniSz, rehash=rh) 40 | for v in one: two.incl v 41 | echo epochTime() - t0, " seconds" 42 | ds = two.depths 43 | echo ds 44 | echo "MAX DEPTH: ", ds.len 45 | echo two.len, "/", two.getCap 46 | echo "Stats: ", two.depthStats 47 | # two.debugDump 48 | 49 | echo "DONE... " 50 | -------------------------------------------------------------------------------- /adix/memutil.nim: -------------------------------------------------------------------------------- 1 | when defined(robinHoodMoveMem): 2 | # This branch gets SEGV (with seq[T] vals); I have not yet tracked down why 3 | proc pushUp*[T](x: var seq[T], i, n: int) {.inline.} = 4 | ## move n items up 1; i.e. ``x[i+1..i+n] = x[i..i+n-1]`` 5 | # if n < 1: return 6 | moveMem x[i+1].addr, x[i].addr, n * T.sizeof 7 | 8 | proc pullDown*[T](x: var seq[T], i, n: int) {.inline.} = 9 | ## move n items down 1; i.e. ``x[i..i+n-1] = x[i+1..i+n]`` 10 | # if n < 1: return 11 | moveMem x[i].addr, x[i+1].addr, n * T.sizeof 12 | elif defined(robinHoodSlice): 13 | proc pushUp*[T](x: var seq[T], i, n: int) {.inline.} = 14 | ## move n items up 1; i.e. ``x[i+1..i+n] = x[i..i+n-1]`` 15 | # if n < 1: return 16 | x[i+1 .. i+n] = x[i .. i+n-1] 17 | 18 | proc pullDown*[T](x: var seq[T], i, n: int) {.inline.} = 19 | ## move n items down 1; i.e. ``x[i..i+n-1] = x[i+1..i+n]`` 20 | # if n < 1: return 21 | x[i .. i+n-1] = x[i+1 .. i+n] 22 | else: 23 | proc pushUp*[T](x: var seq[T], i, n: int) {.inline.} = 24 | ## move n items up 1; i.e. ``x[i+1..i+n] = x[i..i+n-1]`` 25 | # if n < 1: return 26 | for j in countdown(i + n - 1, i): 27 | x[j+1] = move x[j] 28 | 29 | proc pullDown*[T](x: var seq[T], i, n: int) {.inline.} = 30 | ## move n items down 1; i.e. ``x[i..i+n-1] = x[i+1..i+n]`` 31 | # if n < 1: return 32 | for j in countup(i, i + n - 1): 33 | x[j] = move x[j+1] 34 | -------------------------------------------------------------------------------- /tests/writeHash.nim: -------------------------------------------------------------------------------- 1 | # Just writes 8 byte binary hashes of 0..[0|1][[]] { [] denotes optionality }. 15 | ## This factoring allows shells to have negligible dispatch overhead/string 16 | ## handling and so be appropriate for benchmarks/timing experiments. 17 | let fi = if i == "/dev/stdin" : stdin else: open(i, fmRead) 18 | let fo = if o == "/dev/stdout": stdout else: open(o, fmWrite) 19 | var cout: Command 20 | for buf in lines(fi): 21 | if buf.len == 0 or buf[0] == '#': continue 22 | let cols = buf.split 23 | if cols.len < 1 or cols[0].len < 1: continue 24 | cout.letter = cols[0][0] 25 | cout.sided = cols[0].len > 1 26 | cout.side = if cout.sided and cols[0][1] == '1': true else: false 27 | cout.key = int16(if cols.len > 1: parseInt(cols[1]) else: -99) 28 | cout.val = int16(if cols.len > 2: parseInt(cols[2]) else: 0) 29 | discard fo.writeBuffer(cout.addr, cout.sizeof) 30 | fo.flushFile 31 | 32 | proc readObject*(f: File, buffer: pointer, size: Natural): int {.inline.} = 33 | proc c_fread(buf: pointer, size, n: culong, f: File): culong {. 
34 | importc: "fread_unlocked", header: "" .} 35 | result = int(c_fread(buffer, cast[culong](size), 1, f)) 36 | 37 | when isMainModule: 38 | import cligen; dispatch(preproc) 39 | -------------------------------------------------------------------------------- /tests/anaPrime.nim: -------------------------------------------------------------------------------- 1 | import strutils, times, lptabz, althash, cligen, cligen/[mfile, mslice, osUt] 2 | proc hash(x: uint64): Hash {.inline.} = hashRoMu1(x) # =~ 1.05x faster 3 | 4 | type Word = distinct uint32 # 24 bits of byte-offset, 8 bits of word length 5 | 6 | proc initWord(off, len: int): Word {.inline.} = 7 | Word(uint32(off) shl 8 or uint32(len)) 8 | 9 | proc toString(w: Word, mf: MFile): string {.inline.} = 10 | let off = uint32(w) shr 8 11 | let len = uint32(w) and 255 12 | result.setLen len 13 | copyMem result[0].addr, mf.mem +! off, len 14 | 15 | proc sig(word: MSlice): uint64 {.inline.} = # word signature 16 | const prime = [ #9/267751 oflow 17 | 7'u64, 61, 41, 53, 2, 71, 47, 29, 3, 97, 89, 17, 59, 18 | 19, 5, 31, 101, 11, 13, 23, 37, 79, 73, 67, 43, 83 ] 19 | result = 1'u64 20 | for ch in word: result *= prime[ord(ch) - ord('A')] 21 | 22 | proc getAna(dict="words", mf: MFile): LPTabz[uint64,Word,uint64,0] = 23 | try: result.mmap(findPathPattern(dict & '.')) 24 | except CatchableError: 25 | result.init(mf.len div 10, numer=3, denom=1) 26 | for word in mf.mSlices: 27 | result.add word.sig, initWord(word.mem -! mf.mem, word.len) 28 | result.save(dict) 29 | 30 | proc qry(dict="words", stats=false, query: seq[string]) = 31 | let t0 = getTime() 32 | if (let mf = mopen(dict); mf) != nil: 33 | let ana = dict.getAna(mf) 34 | let t1 = getTime() 35 | for word in query: 36 | let word = word.toUpperAscii 37 | let key = word.toMSlice.sig 38 | echo word, ":" 39 | for ana in ana.allValues(key): 40 | echo " ", ana.toString(mf) 41 | if stats: 42 | echo "Prep Time: ", (t1 - t0).inMicroseconds, " us" 43 | when compiles(ana.depths): 44 | echo "Depths: ", ana.depths # hash table perf 45 | echo "FinalTable: ", ana.len, "/", ana.getCap 46 | mf.close 47 | 48 | when isMainModule: dispatch qry 49 | -------------------------------------------------------------------------------- /tests/sshell.nim: -------------------------------------------------------------------------------- 1 | when not declared(stdin): import std/[syncio, objectdollar] 2 | import metab, strutils, os, times 3 | 4 | proc now(): int64 {.inline.} = cast[int64](epochTime() * 1e9) 5 | 6 | proc main() = 7 | var verb = getEnv("VERB", "xyzpdq") != "xyzpdq" 8 | var nLoop0, nLoop1, nP0, nP1, nG0, nG1, nD0, nD1: int = 0 9 | var t0, t1, tL0, tL1: int64 10 | var stopped = false 11 | var op: seq[char] = @[] # operation & Key sequences 12 | when defined(directIndex): 13 | var size = parseInt(getEnv("SIZE", "0")) 14 | var ky: seq[int8] = @[] 15 | else: 16 | var size = parseInt(getEnv("SIZE", "2")) 17 | var ky: seq[int] = @[] 18 | var inp: string = stdin.readAll # Pre-read+parse to not time that 19 | inp.setLen inp.len - 1 # Chop last nl; .. is inclusive 20 | for line in inp.split('\n'): 21 | op.add line[0] 22 | ky.add(typeof(ky[0])(if line.len > 1: parseInt(line[1 .. 
^1]) else: 0)) 23 | if op.len < 1 or paramCount() > 1: 24 | echo("Usage:\n ", paramStr(0), "< [gpdTZzLl.]K [..]") 25 | quit(1) 26 | var s = initSet[typeof(ky[0])](size, rehash=false, numer=3, denom=1) 27 | t0 = now() 28 | for i in 0 ..< ky.len: # Dispatch operations 29 | let c = op[i] 30 | let k = ky[i] 31 | if verb: echo c, k # Verb mode helpful to trap bugs 32 | case c 33 | of 'a': s.add k 34 | of 'g': (if k in s: nG1.inc else: nG0.inc) 35 | of 'p': (if s.containsOrIncl(k): nP0.inc else: nP1.inc) 36 | of 'd': (if s.missingOrExcl(k): nD0.inc else: nD1.inc) 37 | of '-': 38 | if k == 0: discard s.pop() 39 | else: (var kk = k; discard s.pop(kk)) 40 | of 'T': echo s 41 | of 'Z': t0 = now(); nP0 = 0; nP1 = 0; nG0 = 0; nG1 = 0; nD0 = 0; nD1 = 0 42 | of 'z': t1 = now(); stopped = true 43 | of 'L': tL0 = now(); nLoop0 = i 44 | of 'l': tL1 = now(); nLoop1 = i 45 | of '.': discard # Just to time op dispatch ovrhead 46 | of 'P': s.debugDump 47 | of 'D': echo s.depths 48 | else: echo "UNKNOWN COMMAND:", c.repr; quit 2 49 | if not stopped: t1 = now() 50 | t1 -= t0 51 | if nLoop1 - nLoop0 > 0: 52 | var perDispatch = float(tL1 - tL0) / float(nLoop1 - nLoop0) 53 | t1 -= int64(float(nP0 + nP1 + nG0 + nG1 + nD0 + nD1) * perDispatch) 54 | echo "nP1: ", nP1, " nP0: ", nP0, " nG1: ", nG1, " nG0: ", nG0, 55 | " nD1: ", nD1, " nD0: ", nD0 56 | main() 57 | -------------------------------------------------------------------------------- /tests/kmCmp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # nim-pgo build of: wu lfreq wfr wf 3 | export I=/dev/shm/ToTC # totc=Tale Of Two Cities from Project Gutenberg 4 | export i=/dev/shm/totc # $I pre-processed to lower case via tr A-Z a-z 5 | export n=/dev/null # totc=Tale Of Two Cities from Project Gutenberg 6 | tim 'wu<$I>$n' \ 7 | \ 8 | "tr -cs A-Za-z \\\\n<$I|tr A-Z a-z|mawk '{cnt[\$0]++}'>$n" \ 9 | 'tr -cs A-Za-z \\n<$I|tr A-Z a-z|lfreq -n-1' \ 10 | \ 11 | 'wfr -n-1 <$I' \ 12 | \ 13 | 'wf -n-1 -j1 <$I' \ 14 | 'wf -n-1 -j2 <$I' \ 15 | 'wf -n-1 -j3 <$I' \ 16 | 'wf -n-1 -j4 <$I' \ 17 | \ 18 | 'wf -n-1 -j1 <$i' \ 19 | 'wf -n-1 -j2 <$i' \ 20 | 'wf -n-1 -j3 <$i' \ 21 | 'wf -n-1 -j4 <$i' 22 | 23 | # (5.693 +- 0.031)e-03 wu<$I>$n # Simpler - less L1 CPU Cache pressure 24 | # 25 | # Now all the same - accumulate counts, but do no processing after that (this 26 | # is the -n-1 trick in the Nim programs): 27 | # 0.010905 +- 0.000016 tr -cs A-Za-z \\n<$I|tr A-Z a-z|mawk '{cnt[$0]++}'>$n 28 | # (5.840 +- 0.019)e-03 tr -cs A-Za-z \\n<$I|tr A-Z a-z|lfreq -n-1 29 | # 30 | # (5.9686 +- 0.0055)e-03 wfr -n-1 <$I 31 | # 32 | # (4.9756 +- 0.0009)e-03 wf -n-1 -j1 <$I 33 | # (3.842 +- 0.013)e-03 wf -n-1 -j2 <$I 34 | # (3.760 +- 0.020)e-03 wf -n-1 -j3 <$I 35 | # (3.75 +- 0.10)e-03 wf -n-1 -j4 <$I # No point in higher j 36 | # 37 | # These "cheat" by using already lower-cased $i as input, to measure the 38 | # impact of MAP_PRIVATE trick. 39 | # (4.436 +- 0.013)e-03 wf -n-1 -j1 <$i 40 | # (3.3724 +- 0.0031)e-03 wf -n-1 -j2 <$i 41 | # (3.297 +- 0.011)e-03 wf -n-1 -j3 <$i 42 | # (3.2845 +- 0.0071)e-03 wf -n-1 -j4 <$i # No point in higher j 43 | # 44 | # So, basically, `mawk` is not that bad (~2X worse than `lfreq`), mmap input 45 | # is 5.9686/4.9756=1.2X faster, MAP_PRIVATE costs about 1.14X, and almost all 46 | # parallel speed-up comes from the first doubling of L1 storage. 
47 | # 48 | # Also of interest is the approximate algorithm using `bu/oft`: 49 | # 0.01674 +- 0.00017 tr -cs A-Za-z \\n<$I|tr A-Z a-z|oft 1 >$n 50 | # Results match exactly on Tale Of Two Cities for top 12. While `oft` uses much 51 | # less memory, 17ms is also (2.866 +- 0.031)X slower than `lfreq`. As mentioned 52 | # in README.md, many sketches need VERY steep space cliffs to pay off in time. 53 | # Aggressive `oft -e0.1 -c0.5` severely degrades matches for only 1.3X speed-up. 54 | # 55 | # In conclusion, advice for vocabulary analysis is use `lfreq` w/preprocessing 56 | # { A) you won't do > ~2X better & B) definition of "word" is likely unstable / 57 | # context-specific; Preproc preserves flex for B without losing much since A. } 58 | -------------------------------------------------------------------------------- /tests/ucl.nim: -------------------------------------------------------------------------------- 1 | when not declared(stdin): import std/[syncio, formatfloat] 2 | import std/[hashes, times], cligen, cligen/[mslice, osUt], adix/oats 3 | 4 | const bLen {.intdefine.} = 8 # <256 long; RT limits nicer but harder 5 | const bOff {.intdefine.} = 24 # <16MiB UNIQUE line data 6 | type 7 | Count {.packed.} = object # Dense-ish hash Count type 8 | when defined hashCache: hc: uint32 # 4B|8B per cell 9 | len {.bitsize: bLen.}: uint32 10 | off {.bitsize: bOff.}: uint32 11 | Counts = object 12 | dat: seq[Count] 13 | nUsed: int 14 | 15 | var s = " "; oatKStack s, Counts, Count, off,uint32, MSlice, MSlice 16 | proc key(c: Counts, i: int): MSlice = c.dat[i].key 17 | proc used(c: Counts, i: int): bool = c.dat[i].off!=0 18 | when defined hashCache: # def auto-triggers use 19 | proc hash(ms: MSlice): Hash = mslice.hash(ms).uint32.Hash 20 | proc hash(c: var Counts, i: int, hc: Hash) {.used.} = c.dat[i].hc = hc.uint32 21 | proc hash(c: Counts, i: int): Hash = c.dat[i].hc.Hash 22 | oatCounted c,Counts, c.nUsed; oatSeq Counts, dat # make counted & resizable 23 | #when Counts is ROat[MSlice, MSlice]: {.warning: "Counts is a ROat"} 24 | 25 | proc incFailed(h: var Counts, ms: MSlice): bool = 26 | var ms = ms 27 | if ms.len > (1 shl bLen) - 1: # Careful to not overflow XXX rate limit msgs 28 | erru "truncating too long (", $ms.len, ") line: ", ($ms)[0..<256], "...\n" 29 | ms.len = (1 shl bLen) - 1 # Truncation makes count potentially off 30 | h.upSert(ms, i): discard # Found key @i: nothing to do 31 | do: # Novel key->i: 32 | h.dat[i].off = s.add(ms, (1 shl bOff) - 1): 33 | erru "unique word data overflow at:",$ms,"\n" #XXX rate limit msgs 34 | return true # Cannot go on GLOBALLY 35 | h.dat[i].len = ms.len.uint32 # Init 36 | 37 | proc ucl(size=9999, dSize=81920, tm=false) = 38 | ## Count unique & total lines on `stdin`. <256B long; <16 MiB unique data. 
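  ## (Hypothetical invocation via the cligen dispatch below; long option names
  ## come from the parameter names: `ucl --size=300000 --tm < lines.txt`.)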
39 | let t0 = if tm: epochTime() else: 0.0 40 | var h: Counts; h.setCap size # pre-size table & data 41 | s.setLen dSize; s.setLen 1 42 | var nTot = 0 43 | block IO: 44 | for (line, nLine) in stdin.getDelims: 45 | let ms = MSlice(mem: line, len: nLine - 1) 46 | inc nTot # Always bump `nTotal` 47 | if h.incFailed(ms): break IO 48 | echo h.len," unique ",nTot," total ",s.len," B" 49 | if tm: stderr.write epochTime() - t0, "\n" 50 | 51 | when isMainModule: dispatch ucl, help={ 52 | "size" : "pre-size hash table for size slots", 53 | "dSize": "pre-size str data area to this many bytes", 54 | "tm" : "emit wall time of counting to stderr & quit"} 55 | -------------------------------------------------------------------------------- /tests/tshell.nim: -------------------------------------------------------------------------------- 1 | when not declared(stdin): import std/[syncio, objectdollar, formatfloat] 2 | import metab, strutils, os, times 3 | 4 | proc now(): int64 {.inline.} = cast[int64](epochTime() * 1e9) 5 | 6 | proc main() = 7 | var verb = getEnv("VERB", "xyzpdq") != "xyzpdq" 8 | var nLoop0, nLoop1, nP0, nP1, nG0, nG1, nD0, nD1: int = 0 9 | var t0, t1, tL0, tL1: int64 10 | var stopped = false 11 | var op: seq[char] = @[] # operation, Key, Val seqs 12 | when defined(directIndex): 13 | var size = parseInt(getEnv("SIZE", "0")) 14 | var ky: seq[int8] = @[] 15 | else: 16 | var size = parseInt(getEnv("SIZE", "2")) 17 | var ky: seq[int] = @[] 18 | var vl: seq[int] = @[] 19 | var inp: string = stdin.readAll # Pre-read+parse to not time that 20 | inp.setLen inp.len - 1 # Chop last nl; .. is inclusive 21 | for line in inp.split('\n'): 22 | let cols = line.split 23 | op.add cols[0][0] 24 | ky.add(typeof(ky[0])(if cols[0].len > 1: parseInt(cols[0][1..^1]) else: 0)) 25 | vl.add(if cols.len > 1: parseInt(cols[1]) else: 0) 26 | if op.len < 1 or paramCount() > 1: 27 | echo "Usage:\n ", paramStr(0), "< [gpdaTZzLl.PD]K V" 28 | quit 1 29 | var t = initTab[typeof(ky[0]), int](size, rehash=false) 30 | var had: bool 31 | t0 = now() 32 | for i in 0 ..< ky.len: # Dispatch operations 33 | let c = op[i] 34 | let k = ky[i] 35 | let v = vl[i] 36 | if verb: echo c, k, " ", v # Verb mode helpful to trap bugs 37 | case c 38 | of 'g': (if k in t: nG1.inc else: nG0.inc) 39 | of 'p': 40 | discard t.mgetOrPut(k, v, had) 41 | if had: nP1.inc else: nP0.inc 42 | of 'd': 43 | if t.missingOrExcl(k): nD0.inc 44 | else : nD1.inc 45 | of '-': 46 | if k == 0: discard t.pop() 47 | else: (var kk = k; var vv = v; discard t.pop(kk, vv)) 48 | of 'a': t.add(k, v) 49 | of 'T': echo t 50 | of 'Z': t0 = now(); nP0 = 0; nP1 = 0; nG0 = 0; nG1 = 0; nD0 = 0; nD1 = 0 51 | of 'z': t1 = now(); stopped = true 52 | of 'L': tL0 = now(); nLoop0 = i 53 | of 'l': tL1 = now(); nLoop1 = i 54 | of '.': discard # Just to time op dispatch ovrhead 55 | of 'P': t.debugDump 56 | of 'D': echo t.depths 57 | else: echo "UNKNOWN COMMAND:", c.repr; quit 2 58 | if not stopped: t1 = now() 59 | t1 -= t0 60 | if nLoop1 - nLoop0 > 0: 61 | var perDispatch = float(tL1 - tL0) / float(nLoop1 - nLoop0) 62 | t1 -= int64(float(nP0 + nP1 + nG0 + nG1 + nD0 + nD1) * perDispatch) 63 | echo "ns: ", t1, " nP1: ", nP1, " nP0: ", nP0, " nG1: ", nG1, " nG0: ", nG0, 64 | " nD1: ", nD1, " nD0: ", nD0, " a: ", t.len.float / t.getCap.float, 65 | " M: ", t.getCap 66 | main() 67 | -------------------------------------------------------------------------------- /tests/wu.nim: -------------------------------------------------------------------------------- 1 | when not declared(stdin): import 
std/[syncio, formatfloat] 2 | import std/[hashes, times], cligen, cligen/[mslice, osUt], adix/oats 3 | 4 | const bLen {.intdefine.} = 5 # <32B long; RT params better but less easy 5 | const bOff {.intdefine.} = 27 # <128MiB UNIQUE word data 6 | type 7 | Count {.packed.} = object # Dense-ish hash Count type 8 | when defined hashCache: hc: uint32 # 4B|8B per cell 9 | len {.bitsize: bLen.}: uint8 10 | off {.bitsize: bOff.}: uint32 11 | Counts = object 12 | dat: seq[Count] 13 | nUsed: int 14 | 15 | var s: string; oatKStack s, Counts, Count, off,uint32, MSlice, MSlice 16 | proc key(c: Counts, i: int): MSlice = c.dat[i].key 17 | proc used(c: Counts, i: int): bool = c.dat[i].len != 0 18 | when defined hashCache: # 2nd def triggers saving lpt behavior 19 | proc hash(ms: MSlice): Hash = mslice.hash(ms).uint32.Hash 20 | proc hash(c: var Counts, i: int, hc: Hash) {.used.} = c.dat[i].hc = hc.uint32 21 | proc hash(c: Counts, i: int): Hash = c.dat[i].hc.Hash 22 | oatCounted c,Counts, c.nUsed; oatSeq Counts, dat # make counted & resizable 23 | #when Counts is ROat[MSlice, MSlice]: {.warning: "Counts is a ROat"} 24 | 25 | proc incFailed(h: var Counts, ms: MSlice): bool = 26 | var ms = ms 27 | if ms.len > (1 shl bLen) - 1: # Careful to not overflow XXX rate limit msgs 28 | erru "truncating too long (", $ms.len, ") word: ", ($ms)[0..<32], "...\n" 29 | ms.len = (1 shl bLen) - 1 30 | h.upSert(ms, i): discard # Found key @i: 31 | do: # Novel key->i: 32 | h.dat[i].off = s.add(ms, (1 shl bOff) - 1): 33 | erru "unique word data overflow at:",$ms,"\n" #XXX rate limit 34 | return true # Cannot go on GLOBALLY 35 | h.dat[i].len = ms.len.uint8 # Init 36 | 37 | const d = " \t\r,;:.?!'\"()[]{}|<>=+-*/\\0123456789&`~$#%^" 38 | proc wu(size=9999,dSize=81920, tm=false, Dlm="") = 39 | ## Count unique & total words on `stdin`. <32B long; <128 MiB unique data. 40 | let sep = initSep(if Dlm.len != 0: Dlm else: d) 41 | let t0 = if tm: epochTime() else: 0.0 42 | var h: Counts; h.setCap size # pre-size table & data 43 | s.setLen dSize; s.setLen 0 44 | var nTot = 0 45 | block IO: 46 | for (line, nLine) in stdin.getDelims: 47 | for tok in MSlice(mem: line, len: nLine - 1).frame(sep): 48 | if not tok.isSep and tok.ms.len > 0: 49 | inc nTot # Always bump `nTotal` 50 | if h.incFailed(tok.ms): break IO 51 | echo h.len," unique ",nTot," total ",s.len," B" 52 | if tm: stderr.write epochTime() - t0, "\n" 53 | 54 | when isMainModule: dispatch wu, help={ 55 | "size" : "pre-size hash table for size unique entries", 56 | "dSize": "pre-size str data area to this many bytes", 57 | "tm" : "emit wall time of counting to stderr & quit", 58 | "Dlm":"""chars by which words inside lines are delimited 59 | ""=>SPC,;:.?!'"()[]{}|<>=+-\*/\\0123456789&`~$#%^"""} 60 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | #schedule: 4 | # - cron: '30 5 * * *' 5 | push: 6 | branches: 7 | - master 8 | pull_request: 9 | branches: 10 | - '*' 11 | jobs: 12 | changes: 13 | if: github.event_name != 'schedule' # Do not want to skip scheduled runs 14 | continue-on-error: true # Ensure errors don't stop us 15 | runs-on: ubuntu-latest 16 | outputs: 17 | src: ${{ steps.filter.outputs.src }} 18 | steps: 19 | - if: github.event_name != 'pull_request' # Github API path filter check=> 20 | name: Checkout (if not PR) #..No need to checkout. 
21 | uses: actions/checkout@v2 22 | - uses: dorny/paths-filter@v2 23 | id: filter 24 | with: 25 | filters: | 26 | src: 27 | - '**.cfg' 28 | - '**.nims' 29 | - '**.nim' 30 | - '**.nimble' 31 | - 'tests/**' 32 | - '.github/workflows/ci.yml' 33 | build: 34 | needs: changes # Build if cared-about files changed 35 | # always() is needed here for the job to always run despite Github docs. 36 | # See: https://github.com/actions/runner/issues/491 37 | if: always() && needs.changes.outputs.src != 'false' 38 | strategy: 39 | fail-fast: false 40 | matrix: 41 | os: ['ubuntu-latest'] 42 | nim: ['version-2-0'] 43 | name: '${{ matrix.os }} (${{ matrix.nim }})' 44 | runs-on: ${{ matrix.os }} 45 | steps: 46 | - name: Checkout 47 | uses: actions/checkout@v2 48 | with: 49 | path: ci 50 | - name: Setup Nim 51 | uses: alaviss/setup-nim@0.1.1 52 | with: 53 | path: nim 54 | version: ${{ matrix.nim }} 55 | - name: Build docs 56 | if: ${{ matrix.docs }} == 'true' 57 | shell: bash 58 | run: | 59 | cd ci 60 | branch=${{ github.ref }} 61 | branch=${branch##*/} 62 | nimble doc --project --outdir:docs --path="." \ 63 | '--git.url:https://github.com/${{ github.repository }}' \ 64 | '--git.commit:${{ github.sha }}' \ 65 | "--git.devel:$branch" \ 66 | adix.nim 67 | cp docs/{the,}index.html || true # Ignore failures for older Nim 68 | - name: Publish docs 69 | if: > 70 | github.event_name == 'push' && github.ref == 'refs/heads/master' && 71 | matrix.os == 'ubuntu-latest' && matrix.nim == 'version-2-0' 72 | uses: crazy-max/ghaction-github-pages@v2.5.0 73 | with: 74 | build_dir: ci/docs 75 | env: 76 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 77 | success: # Set check-required on this 78 | needs: build 79 | runs-on: ubuntu-latest 80 | name: 'All check passes' 81 | steps: 82 | - run: | 83 | echo "This is a workaround for Github's broken software" 84 | -------------------------------------------------------------------------------- /adix/metab.nim: -------------------------------------------------------------------------------- 1 | ## This module provides an easy way to do compile-time switched impl swaps for 2 | ## various table/set reprs with various compile-time switched defaults. You 3 | ## should really just learn how to use `LPTabz[..]` directly, though. 4 | import core/macros, std/strformat 5 | 6 | when defined(axRehash): 7 | let rDefault = true 8 | else: 9 | let rDefault = false 10 | 11 | when defined(axRobinHood): 12 | let rhDefault = true 13 | else: 14 | let rhDefault = false 15 | 16 | proc rightSz*(x: Natural): int {.inline,deprecated: "Only identity now".} = x 17 | 18 | macro doAlias(ns: string, root: string, tabP: string, setP: string) = 19 | let inline = "{.inline.}" 20 | parseStmt(&""" 21 | type Tab*[K,V] = {root}{tabP} 22 | 23 | proc initTab*[K,V](sz={ns}InitialSize, numer={ns}Numer, denom={ns}Denom, 24 | minFree={ns}MinFree, growPow2={ns}GrowPow2, rehash=rDefault, 25 | robinHood=rhDefault): Tab[K,V] {inline} = 26 | result.init(sz, numer, denom, minFree, growPow2, rehash, robinHood) 27 | 28 | proc toTab*[K,V](pairs: openArray[(K,V)], dups=false): Tab[K,V] = 29 | result.init pairs.len # calling to{root}{tabp}(pairs, dups) fails; mixin? 
30 | if dups: 31 | for k, v in items(pairs): result.add(k, v) 32 | else: 33 | for k, v in items(pairs): result[k] = v 34 | 35 | type Set*[K] = {root}{setP} 36 | 37 | proc initSet*[K](sz={ns}InitialSize, numer={ns}Numer, denom={ns}Denom, 38 | minFree={ns}MinFree, growPow2={ns}GrowPow2, rehash=rDefault, 39 | robinHood=rhDefault): Set[K] {inline} = 40 | result.init(sz, numer, denom, minFree, growPow2, rehash, robinHood) 41 | 42 | proc toSet*[K](keys: openArray[K], dups=false): Set[K] = 43 | result.init keys.len # calling to{root}{tabp}(pairs, dups) fails; mixin? 44 | if dups: 45 | for k in keys: result.add k 46 | else: 47 | for k in keys: result.incl k""") 48 | 49 | when defined(axStdlib): #NOTE: stdlib version cannot ctrl, eg. `initialSize` 50 | import std/[tables, sets] # when client just declares `var x: Tab`. 51 | export tables, sets 52 | type Tab*[K,V] = Table[K,V] 53 | type Set*[K] = HashSet[K] 54 | proc initTab*[K,V](sz=4, numer=1, denom=1, minFree=1, growPow2=1, 55 | rehash=false, robinHood=false): Tab[K,V] {.inline.} = 56 | initTable[K,V](sz) 57 | proc initSet*[K](sz=4, numer=1, denom=1, minFree=1, growPow2=1, rehash=false, 58 | robinHood=false): Set[K] {.inline.} = 59 | initHashSet[K](sz) 60 | elif defined(axDirect): 61 | import adix/ditab 62 | export ditab 63 | doAlias("di", "DITab", "[K,V]", "[K,void]") 64 | elif defined(axInOrder): 65 | import adix/lptabz # Extra generic params are void|not|order sentinel flag, 66 | export lptabz #..then z|num bits for a hash code in the index part. 67 | type InsOrd = distinct int8 # 8 bits blocks most dbl indirections on misses 68 | doAlias("lp", "LPTabz", "[K,V,InsOrd,8]", "[K,void,InsOrd,8]") 69 | else: 70 | import adix/lptabz # Extra generic params here are void|not sentinel flag, z. 71 | export lptabz 72 | when defined(axIntTab0): 73 | doAlias("lp", "LPTabz", "[K,V,K,0]", "[K,void,K,0]") 74 | elif defined(axIntTabM1): 75 | doAlias("lp", "LPTabz", "[K,V,K,-1]", "[K,void,K,-1]") 76 | else: 77 | doAlias("lp", "LPTabz", "[K,V,void,0]", "[K,void,void,0]") 78 | -------------------------------------------------------------------------------- /adix/ways.nim: -------------------------------------------------------------------------------- 1 | ## To paraphrase Mandalorians: "These are the ways" (various algorithms). 2 | import std/heapqueue 3 | 4 | iterator kWayMerge*[T](itrs: openArray[iterator(): T]): T = 5 | ## k-way merge of ordered `itrs[i]` yields using `std/heapqueue`. 6 | if itrs.len > 1: 7 | type HeapItem = (T, int) 8 | var hq = initHeapQueue[HeapItem]() 9 | for i, it in itrs: # Load min-heap with the first yield of each. 10 | let vNext = it() # Must call for system to know exhaustion 11 | if not it.finished: #..but want if-guard before push to avoid 12 | hq.push (vNext, i) #..having exhausted iterators in the heap. 13 | while hq.len > 0: # While heap is not empty: 14 | let (v, i) = hq.pop # get & yield the next min. 15 | yield v 16 | let it = itrs[i] # push next item from just yielded.. 
17 | let vNext = it() 18 | if not it.finished: # ..(unless it's exhausted) 19 | hq.push (vNext, i) 20 | elif itrs.len == 1: # special case of only 1 (elif=>or 0) iter 21 | for v in itrs[0](): yield v 22 | 23 | when isMainModule: 24 | iterator i0: int {.closure.} = discard 25 | iterator i1: int {.closure.} = yield 3 26 | iterator i2: int {.closure.} = yield 1; yield 5 27 | iterator i3: int {.closure.} = yield 2; yield 4; yield 6 28 | for i in [i0, i1, i2, i3].kWayMerge: echo i 29 | 30 | iterator succPairs*[T](src: iterator:T; stride=1): (T, T) = 31 | ## Yield successive pairs (src[i - stride], src[i]) for all valid i 32 | var counter = stride # Whether to act|wait 33 | var it0: T # Running reference value 34 | var haveOne = false # Flag indicating we have above 35 | for it in src(): 36 | if haveOne: # We are waiting 37 | dec counter 38 | if counter == 0: # Waited right amount 39 | yield (it0, it) 40 | it0 = it 41 | counter = stride 42 | else: # Transition -> waiting 43 | it0 = it #..with a held reference val 44 | haveOne = true 45 | 46 | iterator diffs*[T](src: iterator:T; stride=1): T = 47 | ## First differences 48 | for x0, x in succPairs(src, stride): 49 | yield x - x0 50 | 51 | iterator diffs2*[T](src: iterator:T; stride=1): T = 52 | ## Second differences 53 | proc diffs1: iterator: T = 54 | iterator: T = 55 | for it in diffs(src): yield it 56 | for x in diffs(diffs1(), stride): yield x 57 | 58 | iterator ratios*[T](src: iterator:T; stride=1): T = 59 | ## First ratios 60 | for x0, x in succPairs(src, stride): 61 | yield x/x0 # guard w/if x0 != 0? 62 | 63 | iterator returns*[T](src: iterator:T; stride=1): T = 64 | ## Arithmetic returns (of, e.g. prices) 65 | for x0, x in succPairs(src, stride): 66 | yield x/x0 - 1 67 | 68 | proc seqItems[T](src: seq[T]): iterator:T = # Cannot be openArray since.. 69 | iterator:T = (for x in src: yield x) #..that can live on the stack. 70 | 71 | proc diffs*[T](src: seq[T]; stride=1): seq[T] = 72 | ## Batch first differences of random-access `src` (vectorizable). 73 | # when T is uint8: .. # To vectorize must at least.. 74 | # when T is float32: .. #..fan-out based on `T`. 75 | for d in diffs(src.seqItems, stride): result.add d # slow for now 76 | 77 | when isMainModule: 78 | iterator nums: float {.closure.} = (for i in 1..9: yield i.float) 79 | for d in diffs(nums): echo d # 1 yields 2-1, 3-2, 4-3, .. = 9 1s 80 | for d in diffs2(nums): echo d # 2 yields 1-1, 1-1, 1-1, .. = 8 0s 81 | for r in ratios(nums): echo r # 3 yields 2/1,3/2,4/3, .. 82 | for r in returns(nums): echo r # 4 yields 2/1-1,3/2-1,4/3-1, .. 
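  # (Added demo of the building block itself, default stride=1:)
  for a, b in succPairs(nums): echo a, " ", b # 8 pairs: (1,2) (2,3) .. (8,9)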
83 | let x = [1, 2, 3, 4, 5, 6, 7, 8, 9]; echo diffs(@x) # AOT 1 84 | -------------------------------------------------------------------------------- /tests/wfr.nim: -------------------------------------------------------------------------------- 1 | when not declared(stdin): import std/[syncio, formatfloat] 2 | import std/[hashes, times], cligen, cligen/[mslice, osUt], adix/oats 3 | 4 | const bLen {.intdefine.} = 5 # <32B long; RT params better but less easy 5 | const bOff {.intdefine.} = 27 # <128MiB UNIQUE word data 6 | const bCnt {.intdefine.} = 32 # <4 GiCount 7 | type 8 | Count {.packed.} = object # Dense-ish hash Count type 9 | when defined hashCache: hc: uint32 # 8B|12B per cell 10 | len {.bitsize: bLen.}: uint8 11 | off {.bitsize: bOff.}: uint32 12 | cnt {.bitsize: bCnt.}: uint32 13 | Counts = object 14 | dat: seq[Count] 15 | nUsed: int 16 | 17 | var s: string; oatKStack s, Counts, Count, off,uint32, MSlice, MSlice 18 | proc key(c: Counts, i: int): MSlice = c.dat[i].key 19 | proc val(c: var Counts, i: int, v: uint32) {.used.} = c.dat[i].cnt = v 20 | proc val(c: Counts, i: int): uint32 = c.dat[i].cnt 21 | proc used(c: Counts, i: int): bool = c.dat[i].len != 0 22 | when defined hashCache: # 2nd def triggers saving lpt behavior 23 | proc hash(ms: MSlice): Hash = mslice.hash(ms).uint32.Hash 24 | proc hash(c: var Counts, i: int, hc: Hash) {.used.} = c.dat[i].hc = hc.uint32 25 | proc hash(c: Counts, i: int): Hash = c.dat[i].hc.Hash 26 | oatCounted c,Counts, c.nUsed; oatSeq Counts, dat # make counted & resizable 27 | #when Counts is VROat[MSlice, MSlice, uint32]: {.warning: "Counts is a VROat"} 28 | 29 | proc incFailed(h: var Counts, ms: MSlice): bool = 30 | var ms = ms 31 | if ms.len > (1 shl bLen) - 1: # Careful to not overflow XXX rate limit msgs 32 | erru "truncating too long (", $ms.len, ") word: ", ($ms)[0..<32], "...\n" 33 | ms.len = (1 shl bLen) - 1 # Truncation makes count potentially off 34 | h.upSert(ms, i): # Found key @i: 35 | if h.dat[i].cnt == (1 shl bCnt) - 1: 36 | erru "counter overflow for: ",$ms,"\n" # no update XXX rate limit 37 | else: h.dat[i].cnt.inc # bump 38 | do: # Novel key->i: 39 | h.dat[i].off = s.add(ms, (1 shl bOff) - 1): 40 | erru "unique word data overflow at:",$ms,"\n" #XXX rate limit 41 | return true # Cannot go on GLOBALLY 42 | h.dat[i].len = ms.len.uint8 # Init 43 | h.dat[i].cnt = 1u32 44 | 45 | const d = " \t\r,;:.?!'\"()[]{}|<>=+-*/\\0123456789&`~$#%^" 46 | proc wfr(n=10, count=false,Norm=false, size=9999,dSize=81920, tm=false, Dlm="")= 47 | ## Histogram words on `stdin`. <128 MiB unique data; <32B long; <4 GiCount. 
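  ## (e.g. `wfr -n-1 <$I` as driven by tests/kmCmp.sh above, which benchmarks
  ## the counting pass alone since `n = -1` emits nothing.)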
48 | let sep = initSep(if Dlm.len != 0: Dlm else: d) 49 | let t0 = if tm: epochTime() else: 0.0 50 | var h: Counts; h.setCap size # pre-size table & data 51 | s.setLen dSize; s.setLen 0 52 | var nTot = 0 53 | block IO: 54 | for (line, nLine) in stdin.getDelims: 55 | for tok in MSlice(mem: line, len: nLine - 1).frame(sep): 56 | if not tok.isSep and tok.ms.len > 0: 57 | inc nTot # Always bump `nTotal` 58 | if h.incFailed(tok.ms): break IO 59 | if count: echo h.len," unique ",nTot," total ",s.len," B" 60 | template output = 61 | if Norm: outu c.float/nTot.float," ",k,"\n" else: outu c," ",k,"\n" 62 | if n == 0: (for (k, c) in pairs(h): output()) 63 | elif n > 0 : (for (k, c) in topByVal[MSlice, MSlice, uint32](h, n): output()) 64 | elif n < -1: (for (k, c) in topByVal[MSlice, MSlice, uint32](h, -n, order=Descending): output()) 65 | if tm: stderr.write epochTime() - t0, "\n" 66 | 67 | when isMainModule: dispatch wfr, help={ 68 | "n" : "emit `n`-most common lines(0:all; <0 sorted)", 69 | "count": "only emit counts: unique & grand total", 70 | "Norm" : "normalize frequencies by dividing by grand tot", 71 | "size" : "pre-size hash table for size unique entries", 72 | "dSize": "pre-size str data area to this many bytes", 73 | "tm" : "emit wall time of counting to stderr & quit", 74 | "Dlm":"""chars by which words inside lines are delimited 75 | ""=>SPC,;:.?!'"()[]{}|<>=+-\*/\\0123456789&`~$#%^"""} 76 | -------------------------------------------------------------------------------- /adix/bitop.nim: -------------------------------------------------------------------------------- 1 | ## This is a reimplementation of some things we need from bitops which has CT 2 | ## trouble due to importc's. (I feel it's a better naming/factoring, too). 3 | 4 | proc `&=`*[T,U](a: var T, b: U) = a = a and b ## Updating bit-wise `and` 5 | proc `|=`*[T,U](a: var T, b: U) = a = a or b ## Updating bit-wise `or` 6 | proc `^=`*[T,U](a: var T, b: U) = a = a xor b ## Updating bit-wise `xor` 7 | proc `<<=`*[T,U](a: var T, b: U) = a = a shl b ## Updating bit-wise `shl` 8 | proc `>>=`*[T,U](a: var T, b: U) = a = a shr b ## Updating bit-wise `shr` 9 | 10 | proc ceilPow2*(x: int): int {.noSideEffect, inline.} = 11 | ## Returns ``x`` rounded up to the nearest power of two. <= 0 get 1. 12 | result = x - 1 13 | when defined(cpu64): 14 | result |= result shr 32 15 | when sizeof(int) > 2: 16 | result |= result shr 16 17 | result |= result shr 8 18 | result |= result shr 4 19 | result |= result shr 2 20 | result |= result shr 1 21 | result += 1 + ord(x <= 0) 22 | 23 | proc floorPow2*(x: int): int {.noSideEffect, inline.} = 24 | ## Returns ``x`` rounded down to the nearest power of two. 25 | result |= result shr 1 26 | result |= result shr 2 27 | result |= result shr 4 28 | result |= result shr 8 29 | when sizeof(int) > 2: 30 | result |= result shr 16 31 | when defined(cpu64): 32 | result |= result shr 32 33 | result -= result shr 1 34 | 35 | # https://stackoverflow.com/questions/3465098/bit-twiddling-which-bit-is-set/ 36 | # This is essentially just a perfect hash a la Leiserson98-UsingDeBruijnSeqs. 
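# (Why it works: for single-bit x, x*constant merely shifts the De Bruijn
# constant left by the bit's index, so the top 6 (64-bit) or 5 (32-bit) bits
# form a distinct pattern per index; the table maps that pattern back.)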
37 | when defined(cpu64): 38 | const deBruijn8 = [ 0, 1, 2, 53, 3, 7, 54, 27, 4, 38, 41, 8, 34, 55, 48, 28, 39 | 62, 5, 39, 46, 44, 42, 22, 9, 24, 35, 59, 56, 49, 18, 29, 11, 63, 52, 6, 40 | 26, 37, 40, 33, 47, 61, 45, 43, 21, 23, 58, 17, 10, 51, 25, 36, 32, 60, 20, 41 | 57, 16, 50, 31, 19, 15, 30, 14, 13, 12 ] 42 | else: 43 | const deBruijn4 = [ 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 44 | 4, 8, 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 ] 45 | 46 | proc lgPow2*(x: int): int {.inline.} = 47 | when defined(cpu64): 48 | deBruijn8[(uint64(x) * 0x022FDD63CC95386D'u64) shr 58] 49 | else: 50 | deBruijn4[(uint32(x) * 0x077CB531'u32) shr 27] 51 | 52 | proc lgCeil*(x: int): int {.inline.} = lgPow2(ceilPow2(x)) 53 | ## integer-math only impl of ceil(log2(x)) 54 | 55 | proc lgFloor*(x: int): int {.inline.} = lgPow2(floorPow2(x)) 56 | ## integer-math only impl of floor(log2(x)) 57 | 58 | proc lg*(x: int): int {.inline.} = lgCeil(x) ## short alias for lgCeil 59 | 60 | proc rotateLeftBits*(a: uint64, numBits: int): uint64 {.inline.} = ## like bitops 61 | (a shl numBits) or (a shr (uint64.sizeof * 8 - numBits)) 62 | 63 | proc rotateRightBits*(a: uint64, numBits: int): uint64 {.inline.} = ## like bitops 64 | (a shr numBits) or (a shl (uint.sizeof * 8 - numBits)) 65 | 66 | proc reverseBitsByte*(x: uint8): uint8 {.inline.} = 67 | const reversed = [ 0b0000'u8, 0b1000, 0b0100, 0b1100, 68 | 0b0010 , 0b1010, 0b0110, 0b1110, 69 | 0b0001 , 0b1001, 0b0101, 0b1101, 70 | 0b0011 , 0b1011, 0b0111, 0b1111 ] 71 | result = (reversed[x and 15] shl 4) or reversed[x shr 4] 72 | 73 | proc reverseBitsMakeTable(): array[256, uint8] = 74 | for i in 0 ..< 256: 75 | result[i] = reverseBitsByte(uint8(i)) 76 | 77 | const revByte = reverseBitsMakeTable() 78 | 79 | proc reverseBits*(x: uint32): uint32 = 80 | (uint32(revByte[int((x and 0x000000FF'u32) )]) shl 24) or 81 | (uint32(revByte[int((x and 0x0000FF00'u32) shr 8)]) shl 16) or 82 | (uint32(revByte[int((x and 0x00FF0000'u32) shr 16)]) shl 8) or 83 | uint32(revByte[int( x shr 24)]) 84 | 85 | proc reverseBits*(x: uint64): uint64 = 86 | (uint64(revByte[int((x and 0x00000000000000FF'u64) )]) shl 56) or 87 | (uint64(revByte[int((x and 0x000000000000FF00'u64) shr 8)]) shl 48) or 88 | (uint64(revByte[int((x and 0x0000000000FF0000'u64) shr 16)]) shl 40) or 89 | (uint64(revByte[int((x and 0x00000000FF000000'u64) shr 24)]) shl 32) or 90 | (uint64(revByte[int((x and 0x000000FF00000000'u64) shr 32)]) shl 24) or 91 | (uint64(revByte[int((x and 0x0000FF0000000000'u64) shr 40)]) shl 16) or 92 | (uint64(revByte[int((x and 0x00FF000000000000'u64) shr 48)]) shl 8) or 93 | uint64(revByte[int( x shr 56)]) 94 | 95 | proc isPow2*(x: int): bool = 96 | if x == 0: return false 97 | (x and (x - 1)) == 0 98 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | 0) Make the "mitigation sequence" user-adjustable somehow...E.g., first rehash, 2 | then maybe rerehash, then robinhood, then maybe Btree. (Could maybe also 3 | make warnings adjustable just calling some imported tooFull type proc.) 4 | 5 | 1) While things work well in the tests/\*shell.nim's, should add many tests both 6 | to exercise everything and ensure a compiler changes don't cause regressions. 7 | See if we can just use all the one's already in the Nim stdlib. Add in the 8 | weak vitanim benchmarks. 
Would be nice to do a whole more real suite like 9 | the probablydance guy & beyond; Unsure I'll have the time/interest. 10 | 11 | 2) Make a `diset` using `sequint`, e.g. `dcset`; Should be easy. 12 | 13 | 3) Add `reindex` proc for o\*set/o\*tab. Should be easy..just clear idx and 14 | loop over `data[]` inserting. Maybe add after `sort` (to be more drop-in 15 | for Nim stdlib's ordered table workings). 16 | 17 | 4) add `ref` variants, maybe via a `defref.nim` 18 | 19 | 5) Possible lpset/ilset/olset micro-optimizations most relevant for L1/L2 cases: 20 | 21 | a) `moveMem` type element shifting should be faster than current pushUp/etc., 22 | especially for larger element sizes. 23 | 24 | b) One thing that might boost some workloads is the idea here: 25 | https://web.archive.org/web/20170623234417/https://pubby8.wordpress.com/ 26 | The idea is replacing the lower bits of the hash code which recapitulate 27 | the table index after it's computed with the probe depth (after Amble&Knuth 28 | 1973). The benefit is that the `hc != 0 and d > (hc-i)and mask` check can 29 | be folded into only `d > hc and mask`. The trade off (unmentioned in that 30 | blog post) is that pushUp/pullDn must update depths in the upper half of 31 | the cluster, however big it is, blocking optimized just-memmove shifting. 32 | Also, you save 1/3 not so predictable branches in the find loop per cluster 33 | element, but also create 5 ops per cluster element for mutation ops (load, 34 | mask out, inc/dec, mask in, store) for the cluster upper half. The extra 35 | ops are more predictable work, but so is an optimized memmove. Edit heavy 36 | workloads probably shake out to near a net wash. For "mostly miss, read- 37 | only after build" workloads it could help 1.5x (e.g. empty set intersects). 38 | That's also when Robin Hood *already* wins big from its half-depth search. 39 | Given that memory layout is identical to *not* saving depths, this could 40 | maybe be a run-time/per-instance option { like Robin re-org itself is }, 41 | not necessarily on by default. "bulk conversions" of hcodes to (upper hc, 42 | depth) combos & back can be faster than a full resize/rehash anyway. 43 | 44 | c) Can simplify rawPut/rawDel a lot if assume a strong enough hash by keeping 45 | an overflow area at the high end of s.data, i.e., s.data is longer than `1 46 | shl s.pow2` by denom/numer\*s.pow2 ish. The amount longer will always be 47 | bounded by that from table resize policy *except* if a hash is so bad that 48 | this limit is violated with a sparsely full table, which can happen with 49 | enough probability to be a serious concern. Graceful degradation is better 50 | and all we currently do in this overlong at < 50% full circumstance is warn 51 | or acivate mitigations. In short, I doubt 52 | probablydance.com/2017/02/26/i-wrote-the-fastest-hashtable/ makes the right 53 | safety judgement call for a general purpose table..elsewise a worthwhile 54 | blog post with less lame benchmarks than usual. I independently had the 55 | idea to trigger growth based on probe depth with a memory attack safeguard, 56 | though. As did the Rust guys, apparently and surely many in the 1970s. 57 | It's really a pretty obvious tilt-your-head-the-other-way look at a usual 58 | probes vs.load graph. The probablydance guy seems to have missed the 59 | safeness side of it, though and focused on low loads with good hashes. 
He 60 | figured this out in the end, but going w/a short-hop linked variant: 61 | https://probablydance.com/2018/05/28/a-new-fast-hash-table-in-response-to-googles-new-fast-hash-table/ 62 | 63 | 7) It'd be nice if we provided the option to use writable mmaps for all these 64 | tables instead of just `seq` backing store. This could maybe be as simple as 65 | passing an allocator proc to the various constructor functions and doing our 66 | own pointer arithmetic. Not too hard, really. 67 | 68 | 8) Do external chaining impl for mutating while iterating. Look at internal 69 | chain of probablydance to assess insert/delete-while-iterating abilities. 70 | -------------------------------------------------------------------------------- /NOTES.md: -------------------------------------------------------------------------------- 1 | These notes are about lptabz/LPTabz. 2 | 3 | Unlike most hash table libraries, we do not use load factor to *guess* when 4 | growth *might* be necessary to preserve performance (conditioned upon good hash 5 | functions relative to the key set). Instead, we measure probe sequence depth 6 | during insert and grow only if it is too deep (or some small `minFree` limit is 7 | hit). "Too long" is `> numer/denom*lg(size)` since the worst per table case on 8 | random data scales that way. This approach is both more robust to weakly 9 | scrambling hashes and more space conservative for "better than random" hashes. 10 | It also fixes this problem: 11 | https://accidentallyquadratic.tumblr.com/post/153545455987/rust-hash-iteration-reinsertion 12 | 13 | However, we also want to avoid memory exhaustion. So, we only grow tables if 14 | not "too sparse", i.e. `count/length > 1/(1 shl growPo2)`. For `growPo2=1` this 15 | means we might still double at only 50% full taking it down to 25%. If 25-50% 16 | load cannot give expected collision cluster sizes then A) the hash function is 17 | inadequate, B) the table is under attack, or C) it is being abused with many 18 | duplicate keys. The first two of these situations are basically the same with 19 | a natural response - use a more scrambling hash function with hard to guess data 20 | mixed in. Ignorance of the underlying keys only allows us to do that "on top" 21 | of the `hash` the user already provides which is what we do here. This will be 22 | ineffective if that provided hash outputs too few hash code values. Our re-hash 23 | of the hash mixes in the VM address of its data area, unique to each table and 24 | table size. We usually emit a warning when activating this feature, though that 25 | can be disabled. 26 | 27 | This resize protocol makes performance much more deterministic, but also makes 28 | space utilization non-deterministic. Utilization can be both much better than 29 | typical load-based resize with a near perfect hash as well as a little worse 30 | with a too weak hash. This seems "how it should be". Safer performance also 31 | seems worth more than deterministic size, and you cannot have both at once with 32 | an abstract key and user-settable hash. I'm a bit surprised this resize 33 | protocol isn't more popular. 34 | 35 | If you want to monitor space utilization you can do `.len/.getCap`. The tables 36 | here also all provide a query function `depths` to inspect distribution of probe 37 | depth. `depths` is about as expensive to compute as looking each item up once. 38 | It can be a bit faster on some key types. 
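For example, a minimal monitoring sketch (using the `metab` convenience constructors; the same `len`/`getCap`/`depths` calls work on any `lptabz` instantiation):

```nim
import adix/metab
var s = initSet[int]()
for i in 1 .. 100_000: s.incl i
echo "load:   ", s.len / s.getCap   # space utilization
echo "depths: ", s.depths           # histogram of probe depths
```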
More general performance forensics 39 | are available if `hashStats` is defined to count various important events. 40 | That activates counters for a variety of events like probes, different resize 41 | conditions and so on. Each counter just starts at zero and goes up. So, you 42 | just use them a bit like `epochTime`/`getTime` and friends. You can "time" at 43 | whatever granularity is desired. They are just global variables and so not 44 | exactly multi-thread safe, but the worst that can happen under MT contention is 45 | that you lose a few counts. { E.g., A) load old val, B) load old val, both 46 | inc, only one writes back. So, you get 1 inc instead of 2. } 47 | 48 | Some OSes can accelerate context switches if FP registers are never dirtied. 49 | So, this library is careful to avoid floating point with integer ratios. 50 | It is also careful to avoid high pipeline latency integer divisions. 51 | 52 | These tables also allow shrinkage in case many deletes have occurred and more 53 | than one iteration might later be performed. { A `setCap` is slower than one 54 | iteration. So, if it's only 1 more there is no point. } The default new size 55 | parameter (or an explicit negative one) causes the table to grow by one standard 56 | expansion, while zero (or any small positive number) compacts it 57 | to the minimum size that can fit its current population (possibly plus a fudge 58 | factor based on hash randomness assumptions for tables that need that). 59 | 60 | A perhaps non-obvious subtlety about Robin Hood hashing with depth-triggered 61 | growth is that `pushUp` can increase depth at the end of the collision cluster. 62 | That is the depth we want tested in `tooFull`. So, `rawPut` is two phases for 63 | RH. `rawPut1` finds the end of a collision cluster. `rawPut2` does the actual 64 | shift. In between, we can call `tooFull` to see if resizing is necessary. 65 | Note that the depth `rawPut1` calculates is actually the "shift size" not the 66 | max search depth of moved elements. This is the sense of depth you want since 67 | the point of depth-triggered resize is to avoid both large scans and large data 68 | motion. { "Cost" might be a more clear word than "depth". } 69 | -------------------------------------------------------------------------------- /adix/uniqce.nim: -------------------------------------------------------------------------------- 1 | ## The min-count sketch (NOT count-min) idea is to see hash(x) as a U(0,1) & use 2 | P(sampleMax *average* 5 | gap between k-1 uniques&averaging cuts noise.) See Bar-Yossef 2002 "Counting 6 | Distinct..", Giroire05 "Order statistics & estimating cardinalities" & Ting14 7 | "Streamed approximate counting..". 8 | # NOTE: Speed-accuracy-space trade-off triangle. To keep `lowerBound` & rare 9 | # `moveMem` fast, want `tail` cached, meaning small `k` & ~big expected error. 10 | # TODO Augment to AKMV for multiset queries & "cardinality one stop shopping". 11 | # TODO Add a B-Tree for when k>>1000 & folks care about "perf during warm up" 12 | # (As-is is very simple and not slow post warm-up; uniqce 100000 1024 16383 1000 13 | # =~ 2 sec for 100e6 => 20 ns/update. Better than 4x boost seems v.unlikely.) 14 | 15 | const axUCEdefaultK {.intdefine.} = 1024 # -d:axUCEdefaultK=X edits unInit val. 16 | # avg|err| <+-~0.5..1.8%, avgMax1000|err|<~2..8.0% (dep on dups).
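# A minimal usage sketch (`keys` is a hypothetical stream of items; `push` wants
# a number on [0,1], e.g. a 64-bit hash scaled as in the self-test further down):
#   var uc = initUniqCe[float32](k=1024)
#   for key in keys:
#     uc.push float32(cast[uint64](hash(key)))*(1.0/1.8446744073709551615e19)
#   echo uc.nUnique, " +- ", uc.nUniqueErr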
17 | import std/[algorithm, hashes, math, sets] 18 | type # NOTE: Want no dups & fast del reorg if seq is replaced 19 | UniqCe*[F:SomeFloat] = object 20 | tail: seq[F] # Sorted seq of `k` biggest `Hash`; Q: Add B-Tree mode? 21 | k: int # State scale & lower bound of true answer 22 | est: float64 # Running estimate a la Ting2014 23 | 24 | proc initUniqCe*[F:SomeFloat](k=1024): UniqCe[F] = result.k = k 25 | ## Return initialized `UniqCe` with tail size `k`. k=1024 costs 4K|1VM page 26 | 27 | proc push*[F:SomeFloat](uc: var UniqCe[F], h: F) = 28 | ## Incorporate visitation of an element. NOTE: std/hashes.hash(string) OUTPUT 29 | ## ONLY FILLS 32-BITS => push takes `x on [0,1]` instead of a key|hash() val. 30 | if uc.k == 0: uc.k = axUCEdefaultK # Make ok w/`var x: UniqCe` 31 | if uc.tail.len == 0: # Initial empty check 32 | uc.tail.add h.F # Branch-predicted out 33 | return 34 | if uc.tail.len == uc.k and h <= uc.tail[0]: # No change to tail 35 | return # Most activity for large inputs ends here post-warm-up;ByDef("tail") 36 | let i = uc.tail.lowerBound(h) # BINARY SEARCH for ins. spot 37 | if i < uc.tail.len and uc.tail[i] == h: # Already in tail; !DUPS=>done 38 | return 39 | if uc.tail.len < uc.k: # BUILD PHASE; always insert 40 | uc.tail.insert h, i 41 | uc.est += 1.0/float64(1.0 - uc.tail[0]) 42 | elif h > uc.tail[0]: # RARE: TAIL GETS NEW ELT 43 | if i > 1: # i >= 1: must make room 44 | moveMem uc.tail[0].addr, uc.tail[1].addr, (i - 1)*uc.tail[0].sizeof 45 | uc.tail[i - 1] = h # i is pre-downshift spot 46 | uc.est += 1.0/float64(1.0 - uc.tail[0]) 47 | 48 | proc nUnique*[F:SomeFloat](uc: UniqCe[F]): float32 = 49 | ## Estimate number of unique elements seen so far. 50 | if uc.tail.len < uc.k: uc.tail.len.float32 else: max(uc.k.float32, uc.est) 51 | 52 | proc nUniqueErr*[F:SomeFloat](uc: UniqCe[F]): float32 = 53 | ## Estimated error on estimate of unique elements seen so far. 54 | if uc.tail.len 0: parseInt(paramStr(1)) else: 100000 81 | let k = if paramCount() > 1: parseInt(paramStr(2)) else: axUCEdefaultK 82 | let d = if paramCount() > 2: parseInt(paramStr(3)).uint64 else: 0xFFFFFFFF'u64 83 | let m = if paramCount() > 3: parseInt(paramStr(4)) else: 1000 84 | let s = paramCount() > 4 # skip tracking HashSet error if present 85 | var err, nSt, eer: RunningStat 86 | for i in 1..m: 87 | var uc = initUniqCe[float32](k) 88 | var st = initHashSet[uint64](n) 89 | for j in 1..n: 90 | let key = (randState.next and d) 91 | uc.push float32(cast[uint64](hash(key)))*(1.0/1.8446744073709551615e19) 92 | if not s: st.incl key 93 | let est = uc.nUnique 94 | if not s: 95 | let e = abs(est - st.len.float)/st.len.float 96 | err.push e; eer.push uc.nUniqueErr/est/e 97 | nSt.push est 98 | echo nSt.mean, " ", err.mean*100, " % ", err.max*100, " % ", eer.mean 99 | -------------------------------------------------------------------------------- /adix/topk.nim: -------------------------------------------------------------------------------- 1 | ##[ Top-k of n using "Buffered QuickSelect" for θ(n) scaling in large `k` cases 2 | rather than `std/heapqueue` { which has [O|θ](n*lg k) }. ]## 3 | import std/[random, algorithm] # quickwit.io/blog/top-k-complexity explains BUT 4 | type #..NOTE heap [θO](n*lg k) &this θ(n),O(n*lg k). 5 | Partn* = enum last, ran #XXX Tukey's 9th | median-of-medians | PDQ? 
6 | TopK*[T] = object ## A TopK accumulator; init; push; iterate 7 | i, k: int 8 | partn: Partn # One can do variant/case obj using HeapQueue for small k, but 9 | first: bool #..that is only arch-dependent 1.15-1.30X faster for k<100s & 10 | thr: T #..saves only 1/2 the space when little is used anyway. OTOH, 11 | s: seq[T] #..a better Partn may make this way "always fastest period". 12 | TopKOrder* = enum Descending, Ascending, Cheap 13 | 14 | proc supportsCopyMem(t: typedesc): bool {.magic: "TypeTrait".} 15 | proc initTopK*[T](k=10, partn=last): TopK[T] = 16 | ## Initialize a TopK-accumulator for top-`k`; Usage is simply: 17 | ## 18 | ## .. code-block:: nim 19 | ## var t = initTopK(); for e in 1..99: t.push e 20 | ## for e in t: echo e 21 | result = TopK[T](i: -1, k: k, partn: partn, first: true) 22 | when supportsCopyMem(T) and declared newSeqUninit: 23 | result.s = newSeqUninit[T](2*k); result.s.setLen 0 24 | 25 | proc qpLast[T](a: var openArray[T]; L, R: int): int = 26 | let piv = a[R] # QuickPartition about last element 27 | var i = L - 1 28 | for j in L..= Descending 29 | if a[j] >= piv: inc i; swap a[i], a[j] 30 | swap a[i + 1], a[R] 31 | i + 1 32 | 33 | proc qpRand[T](a: var openArray[T]; L, R: int): int = 34 | swap a[rand(L..R)], a[R] # QuickPartition about random element 35 | a.qpLast L, R 36 | 37 | proc qPart[T](a: var openArray[T]; L, R: int; partn: Partn): int = 38 | case partn # QuickPartition w/various strategies 39 | of last: qpLast a, L, R 40 | of ran : qpRand a, L, R 41 | 42 | proc pqs[T](a: var openArray[T]; k: int, partn: Partn): T = # Partial QuickSort 43 | var (L, R) = (a.low, a.high) # Returns pivot element 44 | while L <= R: # Partition a[L..R] about piv; Find its pos 45 | let pivIx = qPart(a, L, R, partn) 46 | if pivIx == k - 1: return a[pivIx] # piv itself is k-th 47 | elif pivIx > k - 1: R = pivIx - 1 # k-th on left 48 | else : L = pivIx + 1 # k-th on right 49 | 50 | proc push*[T](t: var TopK[T], e: sink T) = 51 | ## Incorporate element `e` into `t` for eventual exact `for e in t: ..`. 52 | inc t.i 53 | if t.i < t.k: t.s.add e # Build phase: Always Add 54 | elif t.first or e > t.thr: # Filter into batches 55 | t.s.add e # Add if building | over threshold 56 | if t.s.len == 2*t.k: 57 | t.thr = t.s.pqs(t.k, t.partn) # Median -> new thr & all >= put on left 58 | t.first = false # Mark thr active 59 | t.s.setLen t.k # Drop < new median half 60 | 61 | proc saw*[T](t: TopK[T]): int = t.i + 1 ## `push` count since last init|clear 62 | 63 | iterator items*[T](t: var TopK[T]): lent T = 64 | ## iterate over `t` yielding top items in cheapest/system order. 65 | if t.saw > t.k: 66 | t.thr = t.s.pqs(t.k, t.partn) # Median -> new thr & all >= put on left 67 | t.first = false # Mark thr active 68 | t.s.setLen t.k # Drop < new median half 69 | for e in t.s: yield e 70 | 71 | iterator descending*[T](t: var TopK[T]): lent T = 72 | ## iterate over `t` yielding top items in DESCENDING order. 73 | t.s.sort(order=SortOrder.Descending); t.s.setLen min(t.k, t.saw) 74 | for e in t.s: yield e 75 | 76 | iterator ascending*[T](t: var TopK[T]): lent T = 77 | ## iterate over `t` yielding top items in ASCENDING order. 78 | t.s.sort(order=SortOrder.Descending); t.s.setLen min(t.k, t.saw) 79 | t.s.reverse 80 | for e in t.s: yield e 81 | 82 | iterator maybeOrdered*[T](t: var TopK[T], order=topk.Cheap): lent T = 83 | ## iterate over `t` yielding top items in various orders. 
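  ## A minimal usage sketch (top-3 of a tiny input, largest first):
  ##
  ## .. code-block:: nim
  ##   var t = initTopK[int](3)
  ##   for e in [5, 1, 9, 7, 3]: t.push e
  ##   for e in t.maybeOrdered(topk.Descending): echo e   # 9, 7, 5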
84 | case order 85 | of topk.Cheap : (for e in topk.items(t) : yield e) 86 | of topk.Ascending : (for e in topk.ascending(t) : yield e) 87 | of topk.Descending: (for e in topk.descending(t): yield e) 88 | 89 | proc clear*[T](t: var TopK[T]) = ## Reset `TopK` accumulator 90 | t.i = -1; t.s.setLen 0; t.first = true 91 | 92 | when isMainModule: # Good check: nim r -d:ck -d:r topk -qk3 -n10 -t3628800 #10! 93 | when not declared stderr: import std/syncio 94 | import cligen, std/[times, sugar, math, sets] 95 | when defined danger: randomize() 96 | proc top(k=500, n=50000, trials=50, partn=last, quiet=false) = 97 | let tScl = 1e12/n.float/log2(k.float) # picosec/work-scale 98 | var t = initTopK[int](k, partn) 99 | var a = collect(for i in 0.. (1 shl bLen) - 1: # Careful to not overflow XXX rate limit msgs 38 | erru "truncating too long (", $ms.len, ") line: ", ($ms)[0..<128], "...\n" 39 | h.upSert(ms, i): # Found key @i: 40 | if (when defined intCnt: h.dat[i].cnt == (1 shl bCnt) - 1 else: false): 41 | erru "counter overflow for: ",$ms,"\n" # no update XXX rate limit msgs 42 | else: h.dat[i].cnt += w; wTot += w # bump 43 | do: # Novel key->i: 44 | h.dat[i].off = s.add(ms, (1 shl bOff) - 1): 45 | erru "unique line data overflow at:",$ms,"\n" #XXX rate limit msgs 46 | return true # Cannot go on GLOBALLY 47 | h.dat[i].len = ms.len.uint32 # Init 48 | h.dat[i].cnt = w; wTot += w 49 | when not defined(intCnt): # Do 1-param frecency idea; Simpler than firefox 50 | w *= wMul # Always-grow-for-new-data EWMA update 51 | if w > wMax: # Nearing per-key weight repr limit.. 52 | let sclInv = 1.0/wMax # *= twice since this pushes FP repr limits 53 | for c in mitems h.dat: c.cnt *= sclInv; c.cnt *= sclInv 54 | w *= sclInv; w *= sclInv # These *='s move all FP numbers from.. 55 | wTot *= sclInv; wTot *= sclInv #..near top of repr to near bottom. 56 | 57 | proc lfreq(n=10, count=false, size=9999, dSize=81920, recTerm='\n', 58 | format="@f @k", RecTerm="\n", old=1.0, tm=false) = 59 | ## Histogram `stdin` lines (read w/non-memory mapped IO to be pipe friendly). 60 | ## Limits: <4 GiB unique data; <16 KiB lines; <4 GiCount. If `old < 1.0`, 61 | ## frequency -> simple 1-parameter "frecency" where counts are virtual-decayed 62 | ## by a factor `old` after each line (i.e. by index not by wall time). 63 | let t0 = if tm: epochTime() else: 0.0 64 | var h: Counts; h.setCap size # pre-size table & data 65 | s.setLen dSize; s.setLen 1 # `1` here lets us encode empty as 0-offset 66 | when not defined(intCnt): wMul = 1.0/old 67 | block IO: 68 | for (line, nLine) in stdin.getDelims(recTerm): 69 | let ms = MSlice(mem: line, len: nLine - 1) 70 | if h.incFailed(ms): break IO 71 | if count: outu h.len," unique ",wTot," total ",s.len," B\n" 72 | let wInv = 1.0/wTot.float; var cs, fs: string # Setup for.. 
73 | let prs = format.tmplParsed('@') #..nice output 74 | template output = 75 | for (id, arg, call) in prs: 76 | if id.idIsLiteral: outu MSlice(mem: format[arg.a].addr, len: arg.len) 77 | elif format[id.a] == 'k': outu k 78 | elif format[id.a] == 'c': 79 | when defined intCnt: cs.setLen 0; cs.addInt c; outu cs 80 | else : cs.setLen 0; cs.ecvt c.float, 6; outu cs 81 | elif format[id.a] == 'f': fs.setLen 0; fs.fcvt c.float*wInv, 9; outu fs 82 | else: outu MSlice(mem: format[call.a].addr, len: call.len) 83 | outu RecTerm 84 | if n == 0: (for (k, c) in pairs(h): output()) 85 | elif n > 0 : (for (k, c) in topByVal[MSlice,MSlice,Counter](h, n): output()) 86 | elif n < -1: (for (k, c) in topByVal[MSlice,MSlice,Counter](h, -n, order=Descending): output()) 87 | if tm: stderr.write epochTime() - t0, "\n" # -n-1 for only time output 88 | 89 | when isMainModule: dispatch lfreq, help={ 90 | "n" : "emit `n`-most common lines(0:all; <0 sorted)", 91 | "count": "only emit counts: unique & grand total", 92 | "size" : "pre-size hash table for size unique entries", 93 | "dSize": "pre-size str data area to this many bytes", 94 | "recTerm": "input record terminator", 95 | "RecTerm": "output record terminator", 96 | "format" : "output format: @k=key @c=count @f=fraction", 97 | "old" : "exponen.weight for 'old' ages (if not intCnt)", 98 | "tm" : "emit wall time of counting to stderr & quit"} 99 | -------------------------------------------------------------------------------- /tests/wf.nim: -------------------------------------------------------------------------------- 1 | when not declared(addFloat): import std/[formatfloat, typedthreads] 2 | when not declared(Thread): import std/threads 3 | import std/[hashes,osproc,times], adix/lptabz, cligen/[mfile,mslice,osUt],cligen 4 | type 5 | Word = distinct uint32 6 | Count = uint32 7 | Histo = LPTabz[Word, Count, Word, 0] 8 | ThrDat = tuple[part: ptr MSlice, hp: ptr Histo, nT: ptr uint64] 9 | 10 | template initHisto(sz): untyped = # 4*16*8=512B max depth at 65536 entry 11 | initLPTabz[Word, Count, Word, 0](sz, numer=4, denom=1, robinHood=false) 12 | 13 | var mf: MFile 14 | var hs: seq[Histo] # NEED -d:useMalloc 15 | var nTs: seq[uint64] 16 | var thrs: seq[Thread[ThrDat]] 17 | 18 | const wb = 5 # word len bits 19 | const wm = (1 shl wb) - 1 # max word len 20 | 21 | proc initWord(off, len: int): Word = 22 | if len > wm: 23 | var s = newStringOfCap(len) 24 | copyMem s[0].addr, mf.mem +! off, len 25 | raise newException(RangeDefect, "\"" & s & "\" too long") 26 | Word((off.uint32 shl wb) or len.uint32) 27 | 28 | proc len(w: Word): uint32 = uint32(w) and wm 29 | 30 | proc mem(w: Word): pointer = mf.mem +! int(w.uint32 shr wb) 31 | 32 | # Case insens hash/==|Local stack allocator | may be faster than MAP_PRIVATE. 
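# (Each Word is one uint32: the byte offset into the mmap in the high 32-wb bits
#  and the length in the low wb bits; e.g. with wb=5, a 7-byte word at offset
#  1000 packs as (1000 shl 5) or 7 == 32007. `hash`/`==`/`<` below decode that
#  to reach the underlying bytes.)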
33 | proc hash(w: Word): Hash {.inline.} = 34 | hash toOpenArray[byte](cast[ptr UncheckedArray[byte]](w.mem), 0, w.len.int-1) 35 | 36 | proc `==`(a, b: Word): bool {.inline.} = 37 | a.len == b.len and cmemcmp(a.mem, b.mem, a.len) == 0 38 | 39 | proc `<`(a, b: Word): bool {.inline.} = # for topk.push 40 | let c = cmemcmp(a.mem, b.mem, min(a.len, b.len)) 41 | if c == 0: a.len < b.len else: c < 0 42 | 43 | proc `$`(w: Word): string = # for output 44 | result.setLen w.len 45 | copyMem result[0].addr, w.mem, w.len 46 | 47 | when defined(benhoyt): # Ben Hoyt definition of "words" 48 | iterator lowCaseWords(ms: MSlice): Word = 49 | var wd, n: int 50 | for i, ch in ms: 51 | if ch in {'A'..'Z'}: # `tr A-Z a-z` preprocess to avoid 52 | ms[i] = char(ord(ch) + 32) # needs MAP_PRIVATE 53 | if n == 0: wd = (ms.mem +! i) -! mf.mem 54 | n.inc # extend 55 | elif ord(ch) > ord(' '): # in-word ch 56 | if n == 0: wd = (ms.mem +! i) -! mf.mem 57 | n.inc # extend 58 | elif n > 0: # non-word ch & have data 59 | yield initWord(wd, n); n = 0 # yield & reset 60 | if n > 0: yield initWord(wd, n) # any final word 61 | else: # Knuth-McIlroy definition of "words" 62 | iterator lowCaseWords(ms: MSlice): Word = 63 | var wd, n: int 64 | for i, ch in ms: 65 | if ch in {'a'..'z'}: # in-word ch 66 | if n == 0: wd = (ms.mem +! i) -! mf.mem 67 | n.inc # extend 68 | elif ch in {'A'..'Z'}: # `tr A-Z a-z` preprocess to avoid 69 | ms[i] = char(ord(ch) + 32) # needs MAP_PRIVATE 70 | if n == 0: wd = (ms.mem +! i) -! mf.mem 71 | n.inc # extend 72 | elif n > 0: # non-word ch & have data 73 | yield initWord(wd, n); n = 0 # yield & reset 74 | if n > 0: yield initWord(wd, n) # any final word 75 | 76 | proc work(td: ThrDat) {.thread.} = # Histogram one segment of an mmap 77 | setAffinity() # pin to CPU initially assigned 78 | var nT = 0u64 # Local accumulator to not thrash 79 | for w in td.part[].lowCaseWords: 80 | nT.inc; td.hp[].mgetOrPut(w, 0).inc 81 | td.nT[] = nT 82 | 83 | proc count(p: int, path: string) = # split path into `p` ~equal segments 84 | var (mfLoc, parts) = p.nSplit(path, flags=MAP_PRIVATE) 85 | mf = mfLoc 86 | if mf != nil: 87 | if mf.len > 1 shl (32 - wb): 88 | raise newException(RangeDefect, "\"" & path & "\" too large") 89 | if p > 1: # add mf.len > 65536|something? 90 | for i in 0 ..< parts.len: # spawn workers 91 | createThread thrs[i], work, (parts[i].addr, hs[i].addr, nTs[i].addr) 92 | joinThreads thrs 93 | else: work (parts[0].addr, hs[0].addr, nTs[0].addr) # ST-mode: No spawn 94 | else: stderr.write "wf: \"", path, "\" missing/irregular\n" 95 | 96 | proc wf(path:seq[string], n=10, c=false, N=false, jobs=1, sz=9999, tm=false) = 97 | ## Parallel word frequency tool for one file < 128 MiB and words < 32 chars. 98 | ## Aggregate multiple via, e.g., `cat \*\*/\*.txt > /dev/shm/inp`. Similar 99 | ## to Knuth-McIlroy `tr A-Z a-z|tr -sc a-z \\n|sort|uniq -c|sort -n|tail`, 100 | ## but ~46X faster than the pipeline (on TOTC; depends on vocab). 101 | let path = if path.len > 1: path[1] else: "/dev/stdin" 102 | let t0 = epochTime() 103 | let p = if jobs > 0: jobs else: countProcessors() 104 | thrs.setLen p # allocate `thrs` & histos 105 | for i in 0 ..< p: hs.add initHisto(sz); nTs.add 0u64 106 | p.count path 107 | for i in 1 ..< p: # hs[0] += [1.. 
0 : (for w, c in hs[0].topByVal(n): o()) # unsorted top N 114 | elif n < -1: (for w, c in hs[0].topByVal(n, order=Descending): o()) # sorted 115 | if tm: stderr.write epochTime() - t0, " sec\n" # n == -1: only `c`/tm 116 | 117 | dispatch(wf, help={"n": "do top n; 0all,<0sort", "c": "count only", "N": "norm", 118 | "tm": "time", "jobs": "num threads; 0=>auto", "sz": "init size"}) 119 | -------------------------------------------------------------------------------- /adix/amoft.nim: -------------------------------------------------------------------------------- 1 | ## `Approximately k-Most Oft `_. 2 | ## This is constant space & constant inc&query time with adjustably small error. 3 | ## `AMOft[K,C]` augments the sketch with an O(k) paired Table & HeapQueue. 4 | ## Present impl does NOT scale well to very large `k` (past ~top-1000). E.g.: 5 | ## 6 | ## .. code-block:: nim 7 | ## var amo = initAMOft[int, uint32](k=10) 8 | ## for i in 0..<500: amo.inc i, i # Not v.skewed => not v.accurate 9 | ## for (i, c) in amo.mostCommon(5): echo i, " ", c 10 | 11 | import std/[hashes, heapqueue, tables, algorithm] 12 | type 13 | CtMnSketch*[K,C] = object ## CountMinSketch over hashable `K` & counters `C`. 14 | data: seq[seq[C]] 15 | salts: seq[Hash] 16 | w: int 17 | 18 | AMOft*[K,C] = object ## Track most often hashable `K` with counters `C`. 19 | sketch: CtMnSketch[K,C] # Sketch for gigantic `K` spaces 20 | top: HeapQueue[(C,int)] # Most frequent counts, keys 21 | no2key: seq[K] # For expensive K, like string, the heap ops.. 22 | key2no: Table[K, int] #..& scan are much faster with Indirect K. 23 | k: int # Num most often keys to track (max `top` len) 24 | 25 | proc initCtMnSketch*[K,C](w: int, d=4, salts: seq[int] = @[]): CtMnSketch[K,C] = 26 | ## `w`=Size(count tables), larger implies less overestimation. `d`=nTables, 27 | ## larger implies less P(overEst). `salts`=Override default generated salts. 28 | if w <= 0 or d <= 0: 29 | raise newException(ValueError, "Table size(`w`) & hashes(`d`) must be > 0") 30 | result.w = w 31 | if salts.len>0: result.salts = salts 32 | else : result.salts.setLen d 33 | result.data.setLen result.salts.len 34 | for i, t in result.data.mpairs: 35 | t.setLen w 36 | result.salts[i] = hash(if salts.len>0: salts[i] else: cast[int](t[i].addr)) 37 | 38 | proc inc*[K,C](cs: var CtMnSketch[K,C], key: K, r=1): C {.discardable.} = 39 | ## Count `key` `r` times; Gives new counts; I.e., `(key, r=0)` can query. 40 | result = C.high 41 | let kh = hash(key) 42 | when defined(cmsOnePass): # This updates faster/more independently 43 | for i, s in cs.salts: #..BUT has less accurate estimates. 44 | let h = Hash(uint(!$(kh !& s)) mod cs.w.uint) 45 | cs.data[i][h] += r.C 46 | result = min(result, cs.data[i][h]) 47 | else: 48 | var old = C.high 49 | var hs: array[32, Hash] # Avoid inner loop alloc w/bound; 128-256B of stack 50 | for i, s in cs.salts: 51 | hs[i] = Hash(uint(!$(kh !& s)) mod cs.w.uint) 52 | old = min(old, cs.data[i][hs[i]]) 53 | old += r.C 54 | for i in 0 ..< cs.salts.len: 55 | cs.data[i][hs[i]] = max(cs.data[i][hs[i]], old) 56 | result = min(result, cs.data[i][hs[i]]) 57 | 58 | proc initAMOft*[K,C](k, w: int; d=4, salts: seq[int] = @[]): AMOft[K,C] = 59 | result.sketch = initCtMnSketch[K,C](w,d,salts); result.k = k 60 | 61 | proc slot[K,C](a: var AMOft[K,C], kn: int): int = 62 | result = -1 # Must search only by key since updates 63 | for i in 0 ..< a.top.len: #..of OTHER keys MAY bump old estimates. 
64 | if a.top[i][1] == kn: # Also cannot idx with Table since each 65 | return i #..replace re-orders heap slots. Links? 66 | # While above is a linear scan, Table look-up ensures it only happens if needed. 67 | 68 | proc inc*[K,C](a: var AMOft[K,C], key: K, r=1) = 69 | ## Count `key` as being seen `r` times. 70 | let c = a.sketch.inc(key, r) # Form (sketch count, keyNo) 2-tuple 71 | var new = (c, a.key2no.getOrDefault(key, -1)) 72 | if c.int > r and new[1] != -1 and (let i = a.slot(new[1]); i >= 0): 73 | a.top.del i # Hit: update existing; O(k+lg k fix-up) 74 | a.top.push new # O(lg k) 75 | elif a.top.len < a.k: # Miss/initial build 76 | new[1] = a.no2key.len 77 | a.no2key.add key 78 | a.key2no[key] = new[1] 79 | a.top.push new 80 | elif new > a.top[0]: # Miss/frequent enough to bump old top: 81 | new[1] = a.top[0][1] 82 | discard a.top.replace(new) # pop min, push new, discard old 83 | a.key2no.del a.no2key[new[1]] 84 | a.key2no[key] = new[1] 85 | a.no2key[new[1]] = key 86 | 87 | iterator mostCommon*[K,C](a: AMOft[K,C], k=0): (K, C) = 88 | ## Yield (`k`-most common values in `a`, each count) tuples; `k<=0` => known. 89 | let k = if k > 0: k else: a.top.len 90 | var cpy = a.top # A tree top can duck both O(N) copy.. 91 | var res: seq[(C, int)] #..to not edit & need to collect for top 92 | while cpy.len > 0: res.add cpy.pop 93 | res.sort 94 | var v: (K, C) 95 | for i in res.len - k ..< res.len: 96 | v[0] = a.no2key[res[i][1]]; v[1] = res[i][0] 97 | yield v 98 | 99 | when isMainModule: # Check/demo CtMnSketch[K,C], AMOft[K,C] 100 | when not declared(assert): import std/assertions 101 | var c = initCtMnSketch[int, uint32](w=16, salts = @[1,2,3]) 102 | for i in 0..<16: c.inc i, i 103 | for i in 0..<16: 104 | try: assert c.inc(i, 0) == i.uint32 # |exclusions also work for cmsOnePass 105 | except AssertionDefect: (if i notin [6,9]: echo "mismatch at ", i) 106 | 107 | var a = initAMOft[string, uint8](k=4, w=16, salts = @[1,2,3,4,5,6,7]) 108 | for i in 0..<32: a.inc $i, 32-i 109 | var res = "" 110 | for (i, c) in a.mostCommon(3): res.add i 111 | assert res == "210" # Top 3 112 | 113 | var oft = initAMOft[int, uint32](k=50, w=8192, d=7) # Linear dist not v.skewed 114 | for i in 0..<50000: oft.inc i, i # 50000*49999/2 = 1_249_975_000 virt.events 115 | for (i, c) in oft.mostCommon(25): echo i, " ", c, " ", c.int - i,"/1249975000" 116 | -------------------------------------------------------------------------------- /tests/bl.nim: -------------------------------------------------------------------------------- 1 | when not declared(addFloat): import std/formatfloat 2 | import std/[os, strutils, times], adix/[althash, bltab] 3 | 4 | template h(x):untyped = (when defined(lessRan): hashRoMu1(x) else: hashNASAM(x)) 5 | 6 | let nTab = parseInt(paramStr(1)) 7 | let mask = parseInt(paramStr(2)) 8 | 9 | var s = initBLTab(nTab, mask) 10 | var d: seq[int] 11 | for i in 3 .. 
paramCount(): d.add parseInt(paramStr(i)) 12 | let verb = "V".existsEnv 13 | template maybeEcho(x: varargs[untyped]) = 14 | if verb: echo x 15 | 16 | let t0 = epochTime() 17 | for j in d: 18 | if j > 0: 19 | let k = h(j) and mask 20 | if s.containsOrIncl(k): maybeEcho "had ", j 21 | else: maybeEcho "added ", (j, k) 22 | elif j < 0: 23 | let k = h(-j) and mask 24 | if s.missingOrExcl(k): maybeEcho "did nothing" 25 | else: maybeEcho "removed ", (-j, k) 26 | let dt = epochTime() - t0 27 | 28 | let ds = s.depths 29 | if verb: s.debugDump 30 | echo "dt(s): ", dt 31 | echo "hashLd: ", float(s.len)/float(s.getCap), " ", ds.len, " depths: ", ds 32 | echo paramCount()-2-s.len, '/', paramCount()-2, ". false pos. (if all +)" 33 | 34 | # ./bl $[1<<17] $[(1<<26) - 1] {1..$[3<<15]} |tail -n2 35 | # hashLd: 0.7494 19 depths: @[36541, 25619, 15310, 8917, 5087, 2965, 1708, 934, 486, 272, 174, 82, 56, 36, 17, 13, 5, 1, 2] 36 | # 79/98304. false pos. (if all +); fpr=0.0008; 19*26/8=61.75 < 64B cache line. 37 | # 38 | # A Bloom filter is less space -1.44*lg.0008=14.8 bit/num (57% of 26 bits, 42.7% 39 | # adjusting for 75% hashLd), BUT needs -lg .0008 = 10.3 hash funs ~10 line lds. 40 | # *MANY* would pay a 1/.427=2.34X space increase to get a 10+X speed boost. In 41 | # fact, unless you somehow know you will be right at a "cache cliff", almost no 42 | # one would not choose to spend 2.3X space for a 10X speed-up. 43 | # 44 | # Aux 2..3-bit counter fields (28..29./26*2.34=2.52..2.61x space) can buy you 45 | # deletes that are mostly reliable OR dup keys could be allowed (as in `lptabz`) 46 | # for full reliability at the cost of longer collision clusters. (Well, full 47 | # reliability modulo fingerprint collisions..). 48 | # 49 | # Even compared with Cuckoo filters things are 2x faster in the worst case and 50 | # 1.5x faster on average (depending on probability of 2nd lookup being needed). 51 | # 52 | # Of course, one can do other tests, such as a 1e6 insert 29-bit one: 53 | # ./bl $[1<<21] $[(1<<29)-1] {1..1000000} 54 | # dt(s): 0.02814388275146484 55 | # hashLd: 0.4764270782470703 10 depths: @[669828, 237265, 67871, 17965, 4647, 1192, 275, 71, 23, 3] 56 | # 860/1000000. false pos. (if all +) 57 | # That will take 29*(1<<21) bits or a 7.6 MiB `seq`. 58 | # 59 | # And one can sometimes do better with a *less* random hash, of course at some 60 | # substantial risk that your hash is over-tuned to your key sets: 61 | # nim c -d:danger -d:lessRan bl 62 | # ./bl $[1<<21] $[(1<<22)-1] {1..1000000} 63 | # dt(s): 0.01258683204650879 64 | # hashLd: 0.476837158203125 2 depths: @[801457, 198543] 65 | # 0/1000000. false pos. (if all +) 66 | # That will take 22*(1<<21) bits or a 5.7 MiB `seq`. 67 | # 68 | # 69 | # This is all amenable to more formal analysis for those so inclined. Here is 70 | # an excerpt of an e-mail I wrote in Summer 2001 (unadjusted for slightly better 71 | # Robin Hood Linear Probing): 72 | # ---------------------------------------------------------------------------- 73 | # A few nights ago David M raised some nice specific doubts and prompted me 74 | # to do some simple calculations. The compellingness of Bloom filters seems 75 | # limited, but very well defined. 76 | # 77 | # The executive summary is just this: for small false positive probabilities 78 | # Bloom filters help if you're trading memory against disk accesses, but 79 | # probably not for fast vs. slow memory where "slow" is only 5..8 times higher 80 | # latency. Some basic math will perhaps clarify the issue. 
81 | # 82 | # The short of it is just this: 83 | # { ^ -> exponentiation[not xor], lg=log base 2 } 84 | # 85 | # Consider N objects/packet-types/whatever and an M-bit table. 86 | # Then let a = N / M be the "load". We have (see, e.g. Knuth) 87 | # P(false positive) = p = (1 - exp(-k*a))^k 88 | # 89 | # Solve for a(k,p)=-log(1-p^(1/k))/k, differentiate with respect to k, set 90 | # it equal to zero, and solve to get k = lg(1/p) as the maximizer of a, or 91 | # the *minimizer of M*. I.e., a Bloom filter with either ceil(-lg p) or 92 | # floor(-lg p) gives the minimum memory usage for a given target p. 93 | # What is the memory usage? Substituting back in notice p^(1/k)=1/2 and: 94 | # a = -log(1-1/2)/lg(1/p) = log 2 / lg(1/p), or 95 | # M = -lg e * N * lg p = 1.44*N*lg(1/p) 96 | # 97 | # Compare this with recording "existence" in a hash table of B-bit values. 98 | # Suppose we manage collisions with open-addressed linear probing (a cache 99 | # friendly thing). To achieve 2 table accesses/query (probably 1 slow memory 100 | # access) we need the load to be ~70% (see Knuth 6.4 table 4). Specifically, 101 | # M' = 1.44*N slots = 1.44*N*B bits. Anything in the address space does get 102 | # stored in the table. So the false positive rate we expect is the collision 103 | # rate in the B-bit address space for N objects. 104 | # 105 | # Standard binomial birthday simplification of multinomial collision analysis is 106 | # p = 1 − (1 − 1/M')^(N-1), or as M,N get big 107 | # p = 1 - exp(N*log(1 - 1/M')) =~ 1 - exp(-N/M') = 1 - exp(-N/2^B) or 108 | # B = lg(-N/log(1-p)). And so, 109 | # M' = 1.44*N*lg(-N/log(1-p)) bits. 110 | # Now if p << 1, again using log(1-p) =~ -p { err =~ .5*p^2 < 5% for p < .1 } 111 | # M' = 1.44*N*lg(N/p). 112 | # 113 | # So there you have it. The ratio of storage needed for M' (hash table) over 114 | # M (Bloom filter) simplifies for "small" p to (with log_1/p == log base 1/p): 115 | # 116 | # M'/M = lg(N/p) / lg(1/p) = (lg(1/p) + lg N)/log(1/p) = 117 | # = 1 + lg N/lg(1/p) = 1 + log_1/p (N) = 1 + lg N/-lg p 118 | # 119 | # This tells you exactly what you need to know -- Bloom filters save space only 120 | # when N is very large relative to 1/p. E.g., N=1e6 and p=1% give M' = 4M. 121 | # This may surprise as a naive perception may be that you want to Bloom when you 122 | # want small false positive rates. However, for low p, this costs a lot of time 123 | # as you end up with many hash functions also probing memory randomly. 124 | # 125 | # In time, Bloom only pays off when you are near enough a cliff in latency of 126 | # the memory hierarchy where `k` Bloom accesses beat the 1 LinearProbe access 127 | # because the `k` can operate in region (1+lgN/-lg p)X smaller. 128 | -------------------------------------------------------------------------------- /adix/bltab.nim: -------------------------------------------------------------------------------- 1 | ## This module specializes to the case where keys are 1..k-bit ints & values are 2 | ## 0..v-bit ints (`k+v<=8*int.sizeof`) using one `SeqUInt` as backing store. 3 | ## (Mnemonically, "BL" = "Bit Level" or start of "BLoom Filter", a sometimes 4 | ## competing data structure.) Users must give a number of bits for the key. 5 | ## Bits for values and the sentinel key default to 0. `BLTab` otherwise tries to 6 | ## be similar to hash variants of multisets. 7 | 8 | import althash, sequint 9 | when not declared(stderr): import std/assertions 10 | type 11 | BLTab* = object ## RobinHoodLP set of B-bit int keys w/small false pos. 
rate 12 | data: SeqUint # number array 13 | count: int # count of entered slots 14 | k, v, numer, denom, minFree, growPow2, pow2: uint8 # size policy parameters 15 | rehash, robin: bool # Steal 2-bits from `salt`? 16 | salt: Hash # ~unpredictable salt 17 | z: uint # sentinel 18 | 19 | var blInitialSize* = 2 ## default initial size aka capacity aka cap 20 | var blValueBits* = 0'u8 ## default bits for value in k+v-bit uint 21 | var blSentinel* = 0'u8 ## default sentinel value for k+v-bit uint 22 | var blNumer* = 3'u8 ## default numerator for lg(n) probe depth limit 23 | var blDenom* = 1'u8 ## default denominator for lg(n) probe depth limit 24 | var blMinFree* = 1'u8 ## default min free slots; (>= 1) 25 | var blGrowPow2* = 1'u8 ## default growth power of 2; 1 means double 26 | var blRobinHood* = false ## default to Robin Hood re-org; auto-activated 27 | var blRehash* = false ## default hcode rehashing behavior; auto-activated 28 | 29 | when defined(hashStats): # Power user inspectable/zeroable stats. These are 30 | template ifStats(x) = x # all kind of like "times" - you v0=val;...; val-v0 31 | var blDepth* = 0 ## Counts total search depth 32 | var blTooFull* = 0 ## Counts resizes from minFree boundary 33 | var blTooDeep* = 0 ## Counts resizes from deep probe sequences 34 | var blTooSparse* = 0 ## Counts skips of depth-triggered resize from sparsity 35 | else: 36 | template ifStats(x) = discard 37 | 38 | proc len*(s: BLTab): int {.inline.} = s.count 39 | proc getCap*(s: BLTab): int {.inline.} = s.data.len 40 | 41 | proc save*(t: BLTab, pathStub: string) = discard 42 | proc load*(t: var BLTab, path: string) = discard 43 | proc loadBLTab*(path: string): BLTab = discard 44 | proc mmap*(t: var BLTab, path: string) = discard 45 | 46 | proc pushUp(x: var SeqUint, i, n: int) {.inline.} = # move n items up 1 47 | for j in countdown(i + n - 1, i): x[j+1] = x[j] 48 | 49 | proc pullDown(x: var SeqUint, i, n: int) {.inline.} = # move n items down 1 50 | for j in countup(i, i + n - 1): x[j] = x[j+1] 51 | 52 | proc isUsed(s: BLTab, i: int): bool {.inline.} = s.data[uint(i)] != 0 53 | 54 | proc depth(i, hc, mask: Hash): Hash {.inline.} = 55 | let i = uint(i) 56 | let hc = uint(hc) 57 | let mask = uint(mask) 58 | cast[Hash]((i - hc) and mask) # Search depth of entry w/hcode @ix`i` 59 | 60 | iterator probeSeq(hc, mask: Hash): int = 61 | var i: Hash = hc and mask # Start w/home address 62 | while true: 63 | yield i 64 | i = (i + 1) and mask # Linear Probing 65 | 66 | proc rawGet(s: BLTab; hc: Hash, d: var Hash): int {.inline.} = 67 | assert(s.data.len > 0, "Uninitialized BLTab") # Ensure in *caller* not here 68 | var t {.noinit.}: int # Where to insert if missing 69 | for i in probeSeq(hc, s.data.high): 70 | t = i 71 | if not s.isUsed(i): 72 | break 73 | if d > depth(i, cast[Hash](s.data[uint(i)]), s.data.high): 74 | break 75 | if s.data[i] == uint(hc): 76 | return i 77 | d.inc 78 | ifStats blDepth.inc 79 | if d == s.data.len: # Handle fully saturated table case 80 | break 81 | result = -1 - t # < 0 => MISSING and insert idx = -1 - result 82 | 83 | proc rawGet(s: BLTab, hc: Hash): int {.inline.} = 84 | var d: Hash 85 | rawGet(s, hc, d) # < 0 => MISSING and insert idx = -1 - result 86 | 87 | proc depth(s: BLTab; hc: Hash): int {.inline.} = 88 | var d: Hash 89 | discard rawGet(s, hc, d) 90 | d 91 | 92 | proc rawPut1(s: var BLTab, i: Hash; d: var int): int {.inline.} = 93 | result = i # Linear probe to first empty slot 94 | while s.isUsed(result): 95 | result = (result + 1) and s.data.high 96 | d.inc 97 | if d == 
s.data.len: 98 | raise newException(ResourceExhaustedError, "BLTab saturated") 99 | 100 | proc rawPut2(s: var BLTab, i, j: Hash): int {.inline.} = 101 | if j > i: # No table wrap around; just shift up 102 | pushUp s.data, i, j - i 103 | elif j < i: # j wrapped to low indices 104 | pushUp s.data, 0, j 105 | s.data[0] = s.data[s.data.high] 106 | pushUp s.data, i, s.data.high - i 107 | result = i # j == i => already have space @i; done 108 | 109 | proc rawDel(s: var BLTab, i: Hash) {.inline.} = 110 | let mask = s.data.high 111 | var k = i 112 | var j = (i + 1) and mask # Find next empty|at home position entry 113 | while s.isUsed(j) and j != (int(s.data[j]) and mask): 114 | j = (j + 1) and mask 115 | if j > i + 1: # No table wrap around; just shift down 116 | pullDown s.data, i, j - 1 - i 117 | k = j - 1 # Mark just-past-shift-block entry empty 118 | elif ((j + mask - i) and mask) > 0: # j wrapped to low indices; Did >0 j.inc 119 | pullDown s.data, i, mask - i 120 | s.data[mask] = s.data[0] 121 | pullDown s.data, 0, j - 1 122 | k = (j + mask) and mask # [j-1 mod tabSz] is now empty 123 | # else: # k == i is already home position 124 | s.data[k] = 0 125 | 126 | proc init*(s: var BLTab, size, mask: int) {.inline.} = 127 | s.data = initSeqUint(size, numBound=mask) 128 | s.count = 0 129 | 130 | proc initBLTab*(size, mask: int): BLTab{.inline.} = result.init size, mask 131 | 132 | proc contains*(s: BLTab, hc: Hash): bool {.inline.} = 133 | assert(s.data.len > 0, "Uninitialized BLTab") # Ensure in *caller* not here 134 | s.rawGet(hc) >= 0 135 | 136 | proc containsOrIncl*(s: var BLTab, hc: Hash): bool {.inline.} = 137 | assert(s.data.len > 0, "Uninitialized BLTab") # Ensure in *caller* not here 138 | var d: Hash 139 | let i = s.rawGet(hc, d) 140 | if i < 0: 141 | var j = s.rawPut1(-1 - i, d) 142 | let k = s.rawPut2(-1 - i, j) # Maybe allocate a slot 143 | s.count.inc 144 | s.data[k] = hc 145 | else: 146 | result = true 147 | 148 | proc missingOrExcl*(s: var BLTab, hc: Hash): bool {.inline.} = 149 | assert(s.data.len > 0, "Uninitialized BLTab") # Ensure in *caller* not here 150 | let i = s.rawGet(hc) 151 | if i >= 0: 152 | s.data[i] = 0 153 | s.rawDel i 154 | s.count.dec 155 | else: 156 | return true 157 | 158 | proc clear*(s: var BLTab) {.inline.} = 159 | s.data.clear 160 | s.count = 0 161 | 162 | iterator items*(s: BLTab): Hash = 163 | let L = s.len 164 | for i in 0 ..< s.data.len: 165 | assert(s.len == L, "cannot change a set while iterating over it") 166 | if s.isUsed(i): yield cast[Hash](s.data[i]) 167 | 168 | iterator pairs*(s: BLTab): tuple[a: int, b: Hash] = 169 | let L = s.len 170 | var j = 0 171 | for i in 0 ..< s.data.len: 172 | assert(s.len == L, "cannot change a set while iterating over it") 173 | if s.isUsed(i): yield (j, cast[Hash](s.data[i])) 174 | j.inc 175 | 176 | proc depths*(s: BLTab): seq[int] = 177 | for elt in s: 178 | let d = s.depth(elt) 179 | if d >= result.len: result.setLen(d + 1) 180 | result[d] += 1 181 | 182 | proc debugDump*(s: BLTab, label="") = 183 | if label.len > 0: echo label 184 | echo s.len, " items" 185 | for i, cell in s.data: 186 | echo "i: ",i," depth: ",if cell != 0: depth(i, int(s.data[i]), s.data.high) 187 | else: 0, " ", cell 188 | -------------------------------------------------------------------------------- /adix/oats.nim: -------------------------------------------------------------------------------- 1 | import std/hashes, adix/[bitop, topk]; export topk.TopKOrder 2 | type # More adaptable than Nim std/sets|tables (named ROats|VROats here) 3 | 
Oat*[K, Q] = concept t # Base Concept: Open-Addressed Table 4 | cap(t) is int # Query allocated slot count 5 | used(t, int) is bool # Test if slot `i` is used | free 6 | key(t, int) is K # Get key/key ref for slot `i` 7 | keyQ(t, K) is Q # Make a Q)uery from a stored K)ey 8 | keyR(t, Q) is K # Convert internal Q to ref type K 9 | hash(Q) is Hash # hash key `k` 10 | eq(t, K, K) is bool # Stored key/ref `a` == stored `b` 11 | eq(t, K, Q) is bool # Stored key/ref `a` == Query `b` 12 | 13 | Resizable* = concept t # Gives grow/shrink-ability 14 | newOfCap(t, int) is type(t) # Get a new `n`-slot instance 15 | copy(var t, int, t, int) # Abstract t[i] = u[j] 16 | setNew(t, var t) # Efficiently set all t = u 17 | 18 | Valued*[V] = concept t # Gives dictionary-like interfaces 19 | val(var t, int, V) # Set val for slot `i` 20 | val(t, int) is V # Get val for slot `i` 21 | 22 | VOat*[K,Q,V] = concept t # Valued-Oat; Needs val; Adds []/{}/.values/etc. 23 | t is Valued[V]; t is Oat[K,Q] 24 | 25 | ROat*[K,Q] = concept t # R)esizable Oat; Needs new/cp/set;Adds setCap.. 26 | t is Resizable; t is Oat[K,Q] 27 | 28 | VROat*[K,Q,V] = concept t # V)alued, R)esizable Oat; Needs & adds both 29 | t is Valued[V]; t is ROat[K,Q] 30 | 31 | PutKey*[K] = concept t # incl,mgetOrPut.. `SERT` puts keys as an atom 32 | key(var t, int, K) # Set key for slot `i` (upSert can inline) 33 | POat*[K,Q] = concept t # PutKey-Oat 34 | t is PutKey[K]; t is Oat[K,Q] 35 | VPOat*[K,Q,V] = concept t # Valued-PutKey-Oat 36 | t is PutKey[K]; t is VOat[K,Q] 37 | 38 | Counted* = concept t # Gives cheap total used slots; else O(N) 39 | inUse(var t, int) # Set count of slots in use 40 | inUse(t) is int # Get count of slots in use 41 | 42 | SavedHash* = concept t # Speed find&more so resize;<64 bits ok if small 43 | hash(t, 0, Hash) # Set hash of slot `i` 44 | hash(t, 0) is Hash # Get hash of slot `i` 45 | 46 | proc len*[K,Q](t: Oat[K,Q]): int = ## stdlib-style slots in-use; Uncounted=>O(N) 47 | when t is Counted: t.inUse 48 | else: (for i in 0 ..< t.cap: (if t.used i: inc result)) 49 | 50 | iterator probeSeq(h, mask: Hash): int = 51 | var i: Hash = h and mask # Start w/home address 52 | while true: yield i; i = (i+1) and mask # Linear Probing 53 | 54 | func oatSlots*(n: int, mnFree=600): int = ceilPow2(n + max(1, mnFree)) 55 | ## Guess of the right table size from expected number of items. 56 | 57 | proc oatSlot*[K,Q](t: Oat[K,Q]; q: Q; h: Hash; d: var Hash): int = 58 | mixin eq # Q: Set d=0 or just assume like now? 59 | var j {.noinit.}: int # Where to insert if missing 60 | for i in probeSeq(h, t.cap - 1): 61 | j = i 62 | if not t.used i: break # Need >= 1 FREE slot to halt search 63 | if (when t is SavedHash: t.hash(i) == h else: true) and t.eq(t.key i, q): 64 | return i 65 | d.inc # Q: Also break if d == t.cap? 66 | -1 - j # <0 =>MISSING&insert idx = -1 - result 67 | 68 | proc tooFull*[K,Q](t: Oat[K,Q]; d: int; newSize: var int): bool = 69 | #-> user proc w/some provided default BUT there's a circular dep through `len` 70 | let sLen=t.len # Could be a cap-long loop 71 | if sLen + 1 + 1 > t.cap: # Call setCap pre-put? +1 new, +1 free 72 | newSize = t.cap shl 1; return true 73 | let p2 = lgCeil(t.cap) # NOT an over-deep search; Would like to test 74 | if d < 3*p2 + 1: #..first since it is guaranteed cheap, but need 75 | return false #..cond to ensure tiny tables terminate probeSeq 76 | if sLen > t.cap shr 1: # Over-deep on under-full: re-salt hash? 
77 | newSize = t.cap; return true 78 | 79 | proc setCap*[K,Q](t: var ROat[K,Q]; newSize = -1) = 80 | let newSz = if newSize < 0: max(2, t.cap shl 1) 81 | else: oatSlots(max(newSize, t.len), 1) # max blocks over-shrink 82 | if newSz == t.cap and newSize == -1: return 83 | var ns = t.newOfCap newSz 84 | var d: int 85 | for i in 0 ..< t.cap: 86 | if t.used i: 87 | let q = t.key i 88 | let h = when t is SavedHash: t.hash(i) else: t.keyQ(q).hash 89 | ns.copy -1 - ns.oatSlot(q, h, d), t, i 90 | t.setNew ns 91 | 92 | template upSert*[K,Q](t: var Oat[K,Q], q, i, UP, SERT) = 93 | var d, newSize: Hash 94 | let h = q.hash 95 | var i = oatSlot(t, q, h, d) 96 | if i >= 0: UP 97 | else: 98 | if t.tooFull(d, newSize): 99 | when t is Resizable: 100 | oats.setCap t, newSize; d = 0 101 | i = oatSlot(t, q, h, d) 102 | else: raise newException(ValueError, "non-resizable table too full") 103 | i = -1 - i 104 | SERT 105 | when t is SavedHash: t.hash i, h # Late in case `SERT` aborts insert 106 | when t is Counted: t.inUse t.len + 1 107 | 108 | proc incl*[K,Q](t: var POat[K,Q], q: Q) = 109 | t.upSert q, i, UP=(discard), SERT=(t.key(i, t.keyR q)) 110 | 111 | proc `[]`*[K,Q,V](t: VOat[K,Q,V], q: Q): V = 112 | if (var d: Hash; let i = oatSlot(t, q, q.hash, d); i >= 0): result = t.val i 113 | else: raise newException(KeyError, "no such key") 114 | 115 | proc `[]=`*[K,Q,V](t: var VPOat[K,Q,V], q: Q, v: V) = 116 | t.upSert q, i, UP=(t.val i), SERT=(t.key(i, t.keyR q); t.val i, v) 117 | 118 | proc mgetOrPut*[K,Q,V](t: var VPOat[K,Q,V], q: Q, v: V): var V = 119 | t.upSert q, i, UP=(t.val i), SERT=(t.key(i, t.keyR q); t.val i, v) 120 | 121 | proc getOrDefault*[K,Q,V](t: VOat[K,Q,V], q: Q, def=default(V)): V = 122 | if (var d: Hash; let i = oatSlot(t, q, q.hash, d); i >= 0): t.val i else: def 123 | 124 | iterator items*[K,Q](t: Oat[K,Q]): K = 125 | for i in 0 ..< t.cap: (if t.used i: yield t.key i) 126 | 127 | iterator values*[K,Q,V](t: VOat[K,Q,V]): V = 128 | for i in 0 ..< t.cap: (if t.used i: yield t.val i) 129 | 130 | iterator pairs*[K,Q,V](t: VOat[K,Q,V]): (K, V) = 131 | for i in 0 ..< t.cap: (if t.used i: yield (t.key i, t.val i)) 132 | 133 | iterator topByVal*[K,Q,V](s:VOat[K,Q,V], n=10,min=V.low,order=topk.Cheap):(K,V)= 134 | ## Yield biggest `n` items by value >= `min` in `s` in `order`. 135 | var t = initTopK[(V,K)](n) 136 | for k, v in oats.pairs(s): (if v >= min: t.push (v, k)) 137 | for e in topk.maybeOrdered(t, order): yield (e[1], e[0]) 138 | 139 | template oatKStack*(s, Self, Cell, off, offT, K, Q) = ## Defs for stacked varLen 140 | ## `string`-like keys backed by `string`-like `s` w/`cligen/MSlice`-like `Q`. 
141 | proc mem(c: Cell): pointer = s[c.off].addr # Accessor 142 | proc key(c: Cell): K = K(mem: c.mem, len: c.len.int) # AccessorForUsr-Accessor 143 | proc keyQ(t: Self, k: K): Q = k 144 | proc keyR(t: Self, q: Q): K {.used.} = q 145 | proc eq(t: Self, a: K, b: K|Q): bool = 146 | when b is K: a == b # Compare internal as ints for faster resize 147 | else: t.keyQ(a) == b # Compare Q bytes with memcmp 148 | template add(b; k: Q; limit: int; fail): untyped = 149 | if b.len + k.len <= limit: # Ensure addr space 150 | let off = b.len; b.setLen off + k.len # Ensure alloc room 151 | if k.len > 0: copyMem b[off].addr, k.mem, k.len # Maybe copy 152 | offT(off) # Yield new offset 153 | else: fail 154 | 155 | template oatSeq*(Self, dat) = ## Add routines for `seq`-ish `Self` 156 | proc cap(t: Self): int = t.dat.len 157 | proc newOfCap(t: Self, n: int): Self = result.dat.setLen n 158 | proc copy(t: var Self, i: int, d: Self, j: int) = t.dat[i] = d.dat[j] 159 | proc setNew(t, d: var Self) = swap t.dat, d.dat # efficient t=d (& d=t) 160 | 161 | template oatCounted*(t, Self, cDotPop) = ## Add inUse for var maybe-ref'd off t. 162 | proc inUse(t: var Self, n: int) = cDotPop = typeof(cDotPop)(n) #TODO user grow policy 163 | proc inUse(t: Self): int {.used.} = cDotPop.int 164 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | While this began just as a kind of adaptive index/hash table library, it has 2 | grown into more a collection in the theme of database/big-data related data 3 | structures & algorithms. { Let's say the "ad" in "adix" now stands for 4 | "ADvanced" | "AscenDant" as well as "adaptive" ;-) } Most of these are *à la 5 | carte* and I hope you find them useful. I try to keep the source code short & 6 | to the point. In particular, as an overview/index, here be: 7 | 8 | - The original associative lookup modules: 9 | - [ditab](https://c-blake.github.io/adix/adix/ditab.html): direct-indexed 10 | table for small universe keys 11 | - [lptabz](https://c-blake.github.io/adix/adix/lptabz.html): a hash table 12 | that should be safe for untrusted key sets 13 | - [metab](https://c-blake.github.io/adix/adix/metab.html): an instantiation/ 14 | table portability helper for lptabz 15 | - [btree](https://c-blake.github.io/adix/adix/btree.html): a B-Tree with 16 | various optional features (histogram, ranks, bulk loading, etc.) 17 | 18 | - Basic Sorting: [nsort](https://c-blake.github.io/adix/adix/nsort.html) 19 | Radix sort only by NEEDED BITS; Often 5-10X faster than `algorithm.sort` if 20 | you sort a lot of meso-scale data (merge sorts *always* win for HUGE data; 21 | Very few have it). (Could buffer writes to ensure full cache-line pokes.) 22 | 23 | - Basic Sketches (abbreviated/approximate stores; aka "Digests") for: 24 | - Membership: [bltab](https://c-blake.github.io/adix/adix/bltab.html) 25 | (bit-level table; Like more successfully marketed Bloom|Cuckoo filters, 26 | but lower latency[^1] & ~2X bigger) 27 | - Count Distinct: [uniqce](https://c-blake.github.io/adix/adix/uniqce.html) 28 | aka count unique or cardinality estimation 29 | - Approx Most Often: [amoft](https://c-blake.github.io/adix/adix/amoft.html) 30 | (aka approximate top-K most frequent | heavy-hitters) 31 | 32 | - Distributions/Quantiles: 33 | * [tdigest](https://c-blake.github.io/adix/adix/tdigest.html) (for slower, 34 | more accurate in tail only quantiles (medians generalized). 
35 | * for a more complete & adaptive picture, you want accuracy-everywhere / 36 | full histograms able to realize fast moving quantile transforms backed 37 | by [Fenwick/BIST trees](https://c-blake.github.io/adix/adix/bist.html) 38 | with time kernels that are flat, 39 | [linear](https://c-blake.github.io/adix/adix/lmbist.html) or 40 | [exponential](https://c-blake.github.io/adix/adix/embist.html) or [simple 41 | histograms](https://c-blake.github.io/adix/adix/hist.html). 42 | * Optional discretizing for binning via logs/sqrt/whatever values gives 43 | [lghisto](https://c-blake.github.io/adix/adix/lghisto.html), a high 44 | dynamic range (HDR) module that handles that one-stop shopping style or 45 | [xhist1](https://c-blake.github.io/adix/adix/xhist1.html), its 46 | generalization to any transform|backing histogram/time kernel. 47 | - An amalgam: [`mvstat`](https://c-blake.github.io/adix/adix/mvstat.html) 48 | that works like `std/stats` but supports `del`, i.e. sliding/moving/rolling 49 | windows over data streams (like moving averages) as well as running/dynamic 50 | quantiles via `lghisto`. Also includes bulk array stats that in some compile 51 | modes get fully SIMD vectorized inner loops. 52 | 53 | And some utility modules: 54 | - [althash](https://c-blake.github.io/adix/adix/althash.html): salt-able 55 | alternate hash functions for lptabz 56 | - [sequint](https://c-blake.github.io/adix/adix/sequint.html): a fixed stride 57 | "bit matrix" using "batch"/number ops. 58 | - [memutil](https://c-blake.github.io/adix/adix/memutil.html): memory shifting 59 | utilities 60 | - [cumsum](https://c-blake.github.io/adix/adix/cumsum.html): parallel prefix 61 | sum using Intel SIMD for nsort 62 | - [bitop](https://c-blake.github.io/adix/adix/bitop.html): re-impl std/bitops 63 | things to be more CT friendly / provide bitwise updating operators 64 | - [topk](https://c-blake.github.io/adix/adix/topk.html): spends 2X the (small) 65 | space of `std/heapqueue`-based top-k stream algo to scale O(lg k) better via 66 | what one might call "buffered quickselect". 67 | - [lna](https://c-blake.github.io/adix/adix/lna.html): natural log(abs(x)) 68 | approximation more tunable with regard to speed-precision trade-offs with 69 | 5-levels of work (3..11 series terms)/precision (11..24 bits). It's not 70 | always faster, but is more reliably fast and more tunable than libc using 2 71 | simple ideas: IEEE exponent to narrow problem & two series for the remaining 72 | near-unity interval. 73 | - [ways](https://c-blake.github.io/adix/adix/ways.html): Various algos. 74 | Presently, scalable, k-way ordered merge. 75 | 76 | A Word/Paragraph Of Caution 77 | =========================== 78 | While sketches are popular, like merge sort (vs. radix sort), they often need 79 | huge data to pay off. Essentially, probabilistic analysis ("Oh wow, I can do 80 | that?!") distracts from more practical space-time trade-offs. This distraction 81 | is worsened by there being space-time-accuracy "trade-off pyramids". This 82 | results in an odd state of affairs where I can say here "spending *a bit* more 83 | space can yield major speed-ups", and it sounds blatantly obvious to even the 84 | most casual observer. *Yet* such is also neglected in context countless times. 
The academic literature does not help, often being "blood sport" for more 86 | compressed data | accuracy with no regard to speed.[^2] 87 | 88 | So, e.g., on my primary 32 GiB RAM dev box with `bu/zipf`, I cannot make exact 89 | `lfreq` slower than Approximately Most Often sketches (`bu/oft`). `tests/bl.nim` 90 | shows another example (also written up 91 | [here](https://blog.cloudflare.com/when-bloom-filters-dont-bloom/) in a Bloom 92 | filter / membership approximation context) where spending 2-4X what a Bloom takes 93 | space-wise can buy a 7-10X latency shrink.[^1] (Histograms & UCE are both pretty 94 | good deals, though, if errors are acceptable, and `adix/bltab` with fingerprint 95 | keys is arguably just "a better 'sketch' "). 96 | 97 | A little more on LPTabz & friends 98 | ================================= 99 | As a brief guide I would start with [`NOTES.md`](NOTES.md) and then look at the 100 | top part of the [`lptabz` doc](https://c-blake.github.io/adix/adix/lptabz.html). 101 | [`TODO.md`](TODO.md) also has a lot of notes in it. My overarching vision is to 102 | allow "the fast way" most of the time, especially for developers who know how 103 | to provide a good `hash`, but to also have auto fallbacks to "safer ways" with 104 | optional messages to let devs know they may need to intervene by changing some 105 | defaults at table construction time (or else let users/sysadmins know that some 106 | input may be violating the assumptions of some code sensitive to inputs). 107 | Commercial database systems may have done this for decades, but it hasn't really 108 | percolated into commonly available runtime libs. (Depth-based growth trigger is 109 | likely the simplest example of Profile-Guided Optimization for data structures. 110 | A. Dain Samples' 1993 PhD thesis has some more.) 111 | 112 | [^1]: Note that hardware memory systems got more sophisticated about speculative 113 | workahead execution and parallel fetch which can mask most or all of the extra 114 | latency in a hot loop benchmark, but this is still "more work/mem bandwidth" 115 | competing with other work you *might* want a CPU to be doing instead, and The 116 | Memory Wall has been around for like 30 years now. Also, the bonus Robin-Hood 117 | Linear Probing adds over Cuckoo is graceful degradation with weak hashes - a 118 | real risk whenever you let users pick `hash` -- which you kind of _must_. 119 | 120 | [^2]: The basic issue seems to be a need for apparent novelty over practicality 121 | to interest peer reviewers. Besides weak motivation, "expert" committees only 122 | have whatever narrow exposure they have to various domains of ideas/assumption 123 | frameworks. With human psychology & incentives this leads to research fads/gobs 124 | of work & "many new names for old ideas" to sift through in deep dives. One 125 | hoped search engines/LLMs might have eased such sifting, but it seems harder, 126 | perhaps because [synonym density](https://github.com/c-blake/thes) is simply too 127 | great and more folks are at work. 128 | -------------------------------------------------------------------------------- /adix/embist.nim: -------------------------------------------------------------------------------- 1 | ##[ This is like `bist.nim`, but *grows* weight added to counters (some kind of 2 | floating point) exponentially as new updates happen. Up to floating point 3 | arithmetic, this is equivalent to `ctrs vector*= wOld; ctr[i] += 1`.
Doing it 4 | this way allows BIST use for efficient exponentially weighted moving quantile. 5 | It does need rescaling BUT this can be made rare. For wOld=0.9, 1.11^(6736|842) 6 | =~ 1.8e308|3.4e38 ~= dbl_max. Re-scaling to tinies can ~2X that to ~13k|1.7k 7 | data points. Rescaling is very CPU-vectorizable & so should be 8..16x\*lg(nBin) 8 | =~ 80..320X faster per it; <=~2X to total amortized cost at <136..270,000 bins. 9 | 10 | Like EWMAs, IF YOU NEVER `dec` then this filter has technically infinite, yet 11 | *usually* short time-memory, but can operate only with a small histogram space. 12 | EWMoving Median inherits a usual high breakdown point/robustness in that, even 13 | if infinite memory/having enduring time-breakdown, influencing the median takes 14 | *MUCH* more than one wild data point long ago in history - it takes a downright 15 | different epoch / non-stationarity which is not nearly as bothersome. 16 | 17 | Also, note that weighting styles of distributions & averages are analogous but 18 | distinct. `*=wOld,+=(1-wOld)*x` is a normalized average, but weight in *distros* 19 | is about "faked-repetition" of relative weight. So, while things like half-life 20 | & lag are the same formulae, meaning varies since distros have MANY interacting 21 | relative weights. (See `LMBist` for linear weight moving medians w/non-flat 22 | recency weight + *strict* windows or `Bist` for flat, strict windows.) The API 23 | is the same as `Bist[F]`. The bottom of this module has a small test/timing 24 | program showing the differences. 25 | 26 | Happy to cite someone, but as far as I can tell, this is a novel (though fairly 27 | obvious) application of Fenwick BISTs for a fast EWMMedian transform. Luxenberg 28 | & Boyd (2024) "Exponentially Weighted Moving Models" does something ~100X more 29 | complex & surely slower than the one-pass O(n\*lg nBin) (20 ns/item!) way done 30 | here. Coleman at https://randorithms.com/2022/01/31/streaming-histograms.html 31 | has some nice animations, but it has pedagogical/poorly scaling O(n\*nBin) code. 32 | It seems likely someone doing big data analytics has this somewhere, though and 33 | I am happy to give credit when due. Similarly, please cite this github repo if 34 | this code inspires your work. ]## 35 | import adix/bist, std/math 36 | template maxFinite(T: typedesc[SomeFloat]): T = # Should be in std/math, IMO 37 | when T is float32: 3.4028235e+38'f32 38 | elif T is float64 or T is float: 1.7976931348623157e+308'f64 39 | 40 | type EMBist*[F: SomeFloat] = object ## Exponentially weighted moving distrib. 41 | cnt: Bist[F] # Raw count; This F *could* become its own generic param. 42 | w, grow: float64 # Running weight next data point will add, growth factor. 43 | 44 | proc len*[F](d: EMBist[F]): int = d.cnt.data.len ## Number of bins & bytes 45 | func space*[F](d: EMBist[F]): int = d.sizeof + d.cnt.space 46 | proc tot*[F](d: EMBist[F]): F = d.cnt.tot ## Raw total 47 | proc count*[F](d: EMBist[F]): F = d.tot ## Total Weight 48 | 49 | proc init*[F](d: var EMBist[F]; len: int, wOld: float=0.9375) = 50 | d.cnt.init len; d.w = 1.0; d.grow = F(1/wOld) # start w at 1/thresh? 
51 | proc initEMBist*[F](len: int, wOld: float): EMBist[F] = result.init len, wOld 52 | proc clear*[F](d: var EMBist[F]) = d.cnt.clear; d.tot = 0.0 53 | 54 | proc inc*[F](d: var EMBist[F]; i: int, w: F=1) = ## Add weight `w` to bin `i` 55 | const lim = F.maxFinite/1e9 # 1e9 just to leave some room for `w` variation 56 | const scl = 1/lim 57 | # Can pair up *= scl (ensuring multiplier stays FP representable), but this is 58 | # ~pointless since it 2x's BOTH rarity AND cost. Subtracting 2*lg lim from 59 | # binary exponents is pure rarity savings *IF* it can be vectorized similarly. 60 | d.cnt.inc i, w*d.w 61 | if d.cnt.tot > lim: # Re-scale so future adds do not overflow 62 | for c in mitems d.cnt.data: c *= scl # Both BIST counts.. 63 | d.cnt.tot *= scl; d.w *= scl #..and meta-data. 64 | d.w *= d.grow 65 | 66 | proc scale*[F](d: EMBist[F]; age: int): F = d.grow^(-age) 67 | ## Scale for more rare un-count old; Can re-use if dec @same relative age. 68 | proc dec*[F](d: var EMBist[F]; i: int; w: F=1) = d.cnt.dec i, w*d.w 69 | ## Un-count-old operation for more rare EW with strict windows; Use .scale! 70 | 71 | proc up*[F](d: var EMBist[F]) = discard ## Simple no-op for EMBist 72 | 73 | proc cdf*[F](d: EMBist[F], i: int): F = d.cnt.cdf(i) / d.count ## wrap Bist.cdf 74 | proc pmf*[F](d: EMBist[F], i: int): F = d.cnt.pmf(i) / d.count ## wrap Bist.pdf 75 | proc invCDF*[F](d: EMBist[F], s: F; s0: var F): int = d.cnt.invCDF s, s0 76 | ## wrap Bist.invCDF 77 | proc invCDF*[F](d: EMBist[F]; s: F; s0,s1: var F): int = d.cnt.invCDF s, s0,s1 78 | ## wrap Bist.invCDF 79 | proc min*[F](d: EMBist[F]): int = d.cnt.min ## Simple wrapper of `Bist.min`. 80 | proc max*[F](d: EMBist[F]): int = d.cnt.max ## Simple wrapper of `Bist.max`. 81 | proc quantile*[F](d: EMBist[F]; q: float; iL,iH: var int): float = ## wrap Bist.quantile 82 | d.cnt.quantile q, iL,iH 83 | proc quantile*[F](d: EMBist[F]; q: float): float = d.cnt.quantile q ## wrap Bist.quantile 84 | 85 | proc nPDF*[F](d: EMBist[F]): seq[F] = 86 | result.setLen d.cnt.len;let s=1.0/d.tot;for i,r in mpairs result:r = s*d.pmf(i).F 87 | 88 | proc nCDF*[F](d: EMBist[F]): seq[F] = 89 | result.setLen d.cnt.len;let s=1.0/d.tot;for i,r in mpairs result:r = s*d.cdf(i).F 90 | 91 | when isMainModule: 92 | type F = float64 93 | const slow {.booldefine.} = false # VERY limited differences below 94 | when not declared addFloat: import std/[syncio, formatFloat] 95 | import std/[times, strformat], cligen, cligen/sysUt 96 | proc embist(xs: seq[int], wOld=0.75, q = -2.0, pdf=false,cdf=false,time=false, 97 | xMn=0,xMx=7) = 98 | template toI(x): untyped = max(xMn, min(xMx, x)) - xMn # Clip & shift 99 | if wOld <= 0: Value !! "wOld " & $wOld & " too small" 100 | if wOld >= 1: Value !! 
"wOld " & $wOld & " too big" 101 | when slow: (var d = initBist[F](xMx - xMn + 1)) 102 | else : (var d = initEMBist[F](xMx - xMn + 1, wOld)) 103 | let t0 = epochTime() 104 | var tQ = 0.0 # Report avg qtl to ensure compiler cannot elide 105 | for t, x in xs: 106 | let x = x.toI # xOld frm Deque=moreGeneral 107 | when slow: # On full data win, decay ALL old weight 108 | for cnt in mitems d.data: cnt *= wOld # BIG LOOP 109 | d.tot *= wOld 110 | d.inc x, 1.0 # Unit entering weight 111 | else: # Remove weight for leaving data point 112 | d.inc x, 1.0 # Unit entering weight 113 | if pdf: echo t," b: tot: ",d.tot," ewmPMF: ",d.nPDF 114 | if cdf: echo t," b: tot: ",d.tot," ewmCDF: ",d.nCDF 115 | if q > -2.0: 116 | if time: tQ += d.quantile(q) # `formatFloat` slow=>just total 117 | else: echo d.quantile(q) # Report inverseCDF(q) 118 | if time: 119 | let n = xs.len.float; let dt = (epochTime() - t0)*1e9/n 120 | stderr.write &"n: {xs.len} ns/no: {dt:.1f} w: {wOld} : {tQ/n}\n" 121 | 122 | dispatch embist, short={"xMn":'a', "xMx":'b'}, help={"xs": "x values", 123 | "wOld": "per-update decay factor" , "q" : "quantile to report; 0.5=median", 124 | "pdf" : "print PDF each time step", "cdf": "print CDF each time step", 125 | "time": "print timing statistics", 126 | "xMn" : "`xs[i]` clipped to this `a` on `[a, xs]`", 127 | "xMx" : "`xs[i]` clipped to this `b` on `[xs, b]`"} 128 | #[ A Zsh session showing basic correctness&boost of optimization. Sets up env, 129 | compiles ref & optimized; makes nums; Tests various q & w; Finally measures 'em. 130 | nim=(nim c -d:danger); t=/tmp/nums # Set up 131 | $nim -d:slow -o=sembist embist; $nim embist 132 | ( for i in {1..10000}; printf " %s" $((RANDOM%8)) ) > $t 133 | ( for q in .1 .25 .5 .75 .9; { for x in {1..9}; { w=$[10./(10.+x)] 134 | paste <(./sembist -w$w -q.1 `<$t`) <(./embist -w$w -q.1 `<$t`) | 135 | awk '{print $1-$2}' | sort -g | tails -h1 -t1 }}) 2>/dev/null|unfold -n3 136 | ./sembist -b8300 -tw.99 -q.5 `<$t`; ./embist -b8300 -tw.99 -q.5 `<$t` 137 | I get NO DIFF between ref & optimized, optimized about 112X faster at -b8300 and 138 | always faster, even at -b2. ]# 139 | -------------------------------------------------------------------------------- /adix/cumsum.nim: -------------------------------------------------------------------------------- 1 | import adix/cpuCT 2 | 3 | proc `[]`[C, W](h: ptr UncheckedArray[C], i: W): var C {.inline.} = 4 | cast[ptr C](cast[uint](h) + i.uint * C.sizeof.uint)[] 5 | 6 | proc cumsum*[T](c: ptr UncheckedArray[T]; n: uint) = 7 | for i in 1 ..< n: 8 | c[i] += c[i - 1] 9 | 10 | #NOTE: SSSE3 => SSE2 which is also used. 
11 | when defined(amd64) and not defined(noSIMD) and x86ssse3 in x86features: 12 | when defined(cpuPrefetch): import cligen/prefetch 13 | 14 | template workToAligned(c, n, i, align: untyped) {.dirty.} = 15 | i = 1 16 | while i < n and (cast[uint](c[i].addr) and (align - 1)) != 0: 17 | c[i] += c[i - 1] 18 | i.inc 19 | 20 | template workRemainder(c, n, i: untyped) {.dirty.} = 21 | while i < n: 22 | c[i] += c[i - 1] 23 | i.inc 24 | 25 | type m128i {.importc: "__m128i", header: "emmintrin.h".} = object 26 | proc mm_load(adr: ptr m128i): m128i {.importc: "_mm_load_si128", nodecl, header: "emmintrin.h".} 27 | proc mm_store(adr: ptr m128i, val: m128i) {.importc: "_mm_store_si128", nodecl, header: "emmintrin.h".} 28 | proc mm_set1(ch: uint8): m128i {.importc: "_mm_set1_epi8", nodecl, header: "emmintrin.h".} 29 | proc mm_set1(wd: uint16): m128i {.importc: "_mm_set1_epi16", nodecl, header: "emmintrin.h".} 30 | proc mm_set1(dw: uint32): m128i {.importc: "_mm_set1_epi32", nodecl, header: "emmintrin.h".} 31 | proc mm_add_epi32(a, b: m128i): m128i {.importc: "_mm_add_epi32", nodecl, header: "emmintrin.h".} 32 | proc mm_shuffle_epi32(a: m128i, msk: cint): m128i {.importc: "_mm_shuffle_epi32", nodecl, header: "emmintrin.h".} 33 | proc mm_add_epi8(a, b: m128i): m128i {.importc: "_mm_add_epi8", nodecl, header: "emmintrin.h".} 34 | proc mm_add_epi16(a, b: m128i): m128i {.importc: "_mm_add_epi16", nodecl, header: "emmintrin.h".} 35 | proc mm_slli_si128(a: m128i, n: cint): m128i {.importc: "_mm_slli_si128", nodecl, header: "emmintrin.h".} 36 | proc mm_shuffle_epi8(a, b: m128i): m128i {.importc: "_mm_shuffle_epi8", nodecl, header: "tmmintrin.h".} 37 | 38 | proc cumsum*(c: ptr UncheckedArray[uint8]; n: uint) = 39 | var i = n 40 | workToAligned(c, n, i, 16) #Loop to next 16B align 41 | let n64 = i + ((n - i) and not 63'u64) #Round dn to mult of 64 42 | var off = mm_set1(c[i - 1]) #Initial off=last c[]. 43 | let msk = mm_set1(15'u8) 44 | var v0, v1, v2, v3: m128i #SSE vecs 45 | template do16(v, b: untyped) {.dirty.} = 46 | v = mm_load(cast[ptr m128i](c[b + i].addr)) 47 | v = mm_add_epi8(v, mm_slli_si128(v, 1)) #0+1 1+2 2+3 3+4 4+5 5+6 6+7 7+8 8+9 9+A A+B B+C C+D D+E E+F F 48 | v = mm_add_epi8(v, mm_slli_si128(v, 2)) #0..3 1..4 2..6 3..6 4..7 5..8 6..9 7..A 8..B 9..C A..D B..E C..F D..F E+F F 49 | v = mm_add_epi8(v, mm_slli_si128(v, 4)) #0..7 1..8 2..9 3..A 4..B 5..C 6..D 7..E 8..F 9..F A..F B..F C..F D..F E+F F 50 | v = mm_add_epi8(v, mm_slli_si128(v, 8)) #0..F 1..F 2..F 3..F 4..F 5..F 6..F 7..F 8..F 9..F A..F B..F C..F D..F E+F F 51 | v = mm_add_epi8(v, off) #Add in offset 52 | mm_store(cast[ptr m128i](c[b + i].addr), v) #Update array 53 | off = mm_shuffle_epi8(v, msk) #off=bcast high elt 54 | while i < n64: #1-cache line at a time 55 | when defined(cpuPrefetch): prefetchw(c[i + 64].addr) 56 | do16(v0, 0) 57 | do16(v1, 16) 58 | do16(v2, 32) 59 | do16(v3, 48) 60 | inc(i, 64) #XXX After cache-line loop, could do few more vectorized doX's 61 | workRemainder(c, n, i) #Loop to end 62 | 63 | proc cumsum*(c: ptr UncheckedArray[uint16]; n: uint) = 64 | var i = n 65 | workToAligned(c, n, i, 16) #Loop to next 16B align 66 | let n32 = i + ((n - i) and not 31'u64) #Round dn to mult of 32 67 | var off = mm_set1(c[i - 1]) #Initial off=last c[] 68 | let msk = mm_set1(0x0F0E'u16) #s.t. 
shuffle_epi8 =~ shuffle_epi16 69 | var v0, v1, v2, v3: m128i #SSE vectors 70 | template do8(v, b: untyped) {.dirty.} = 71 | v = mm_load(cast[ptr m128i](c[b + i].addr)) 72 | v = mm_add_epi16(v, mm_slli_si128(v, 2)) #0+1 1+2 2+3 3+4 4+5 5+6 6+7 7 73 | v = mm_add_epi16(v, mm_slli_si128(v, 4)) #0..3 1..4 2..5 3..6 4..7 5..7 6+7 7 74 | v = mm_add_epi16(v, mm_slli_si128(v, 8)) #0..7 1..7 2..7 3..7 4..7 5..7 6+7 7 75 | v = mm_add_epi16(v, off) #Add in offset 76 | mm_store(cast[ptr m128i](c[b + i].addr), v) #Update array 77 | off = mm_shuffle_epi8(v, msk) #off=bcast high elt 78 | while i < n32: #1-cache line at a time 79 | when defined(cpuPrefetch): prefetchw(c[i + 32].addr) 80 | do8(v0, 0) 81 | do8(v1, 8) 82 | do8(v2, 16) 83 | do8(v3, 24) 84 | inc(i, 32) 85 | workRemainder(c, n, i) #Loop to end 86 | 87 | proc cumsum*(c: ptr UncheckedArray[uint32]; n: uint) = 88 | var i = n 89 | workToAligned(c, n, i, 16) #Loop to next 16B align 90 | let n16 = i + ((n - i) and not 15'u64) #Round dn to mult of 16 91 | var off = mm_set1(c[i - 1]) #Initial off=last c[] 92 | var v0, v1, v2, v3: m128i #SSE vectors 93 | const msk = 0xFF.cint 94 | template do4(v, b: untyped) {.dirty.} = 95 | v = mm_load(cast[ptr m128i](c[b + i].addr)) 96 | v = mm_add_epi32(v, mm_slli_si128(v, 4)) #0+1 1+2 2+3 3 97 | v = mm_add_epi32(v, mm_slli_si128(v, 8)) #0+1+2+3 1+2+3 2+3 3 98 | v = mm_add_epi32(v, off) #Add in offset 99 | mm_store(cast[ptr m128i](c[b + i].addr), v) #Update array 100 | off = mm_shuffle_epi32(v, msk) #off=bcast high elt 101 | while i < n16: #1-cache line at a time 102 | when defined(cpuPrefetch): prefetchw(c[i + 16].addr) 103 | do4(v0, 0) 104 | do4(v1, 4) 105 | do4(v2, 8) 106 | do4(v3, 12) 107 | inc(i, 16) 108 | workRemainder(c, n, i) #Loop to end 109 | 110 | proc cumsum*(c: ptr UncheckedArray[uint64]; n: uint) = 111 | for i in 1 ..< n: #Could speed this up, but time will never be big cmp to 112 | c[i] += c[i - 1] #..moving billions of items around in a counting sort. 113 | 114 | proc cumsum*[T](c: var openArray[T]) {.inline.} = 115 | cumsum(cast[ptr UncheckedArray[T]](c[0].addr), c.len.uint) 116 | 117 | proc cumsum*[T](c: ptr T, n: uint) {.inline.} = 118 | cumsum(cast[ptr UncheckedArray[T]](c), n) 119 | 120 | when isMainModule: 121 | import std/[random, times, stats], cligen 122 | when not declared(stderr): import std/[syncio, formatfloat] 123 | 124 | proc gen[T](x: var openArray[T]; low, range: int) = 125 | for i in 0 ..< x.len: x[i] = low.T + rand(range.float).T 126 | 127 | type Kind = enum kU1, kU2, kU4, kU8 128 | 129 | proc doIt[T](k: T, n=9, low=0, range=9, bench=false, minN=5, avgN=5, 130 | data: seq[uint64]) = 131 | randomize() 132 | var x = newSeq[T](n) 133 | var t = x 134 | var dtsMin: RunningStat 135 | for avgIt in 1..avgN: 136 | var dtMin = float.high 137 | for minIt in 1..minN: 138 | if data.len > 0: #Allow passing specific data for reproducible debug 139 | for i in 0 ..< x.len: x[i] = data[i].T 140 | else: 141 | x.gen low, range 142 | if not bench: t = x 143 | let t0 = epochTime() 144 | x.cumsum 145 | dtMin = min(dtMin, (epochTime() - t0) * 1e9) 146 | if not bench: 147 | var t0 = t 148 | for i in 1 ..< n: 149 | t[i] += t[i - 1] 150 | for i in 1 ..< n: 151 | if x[i] != t[i]: 152 | echo "bad cumsum at: ",i," x: ",x[i]," shouldBe: ",t[i]; echo "" 153 | echo "x0[]: ", t0; echo "" 154 | echo "t[]: ", t; echo "" 155 | echo "x[]: ", x 156 | return 157 | dtsMin.push dtMin 158 | echo "n: ", n, " ", $T, " cumsum_ns: ", dtsMin.min, " .. 
", dtsMin.max 159 | 160 | proc testTime(kind=kU1, n=256, low=0, range=128, bench=false, minN=9, avgN=9, 161 | data: seq[uint64]) = 162 | case kind 163 | of kU1: doIt(0'u8 , n, low, range, bench, minN, avgN, data) 164 | of kU2: doIt(0'u16, n, low, range, bench, minN, avgN, data) 165 | of kU4: doIt(0'u32, n, low, range, bench, minN, avgN, data) 166 | of kU8: doIt(0'u64, n, low, range, bench, minN, avgN, data) 167 | 168 | dispatch(testTime, cmdName="cumsum") 169 | -------------------------------------------------------------------------------- /adix/lghisto.nim: -------------------------------------------------------------------------------- 1 | ##[ `LgHisto` is an application of BISTs to histograms of logs giving efficient, 2 | dynamic quantiles. Logs give high dynamic range at low cost while Fenwick/BIST 3 | supports dynamic membership w/operation-balanced perf. 4 | 5 | Quantile error is absolute { not relative to `q*(1-q)` like t-Digests } & easily 6 | bounded as <~ 1/2 bin width { ~ 10^(log_10(b/a)/n) }. If you need 3 places or 7 | your data is clustered within a few orders of magnitude then you can likely just 8 | use 1e4 bins & counters will remain L1 cache resident, depending on resource 9 | competition. Cache is the main cost Re: speed. Re: space, since 99% of bins 10 | are 0 in many cases, simple run-length encoding can improve net/disk transfers. 11 | 12 | The way Fenwick BISTs work, the generic parameter `C` must be a wide enough int 13 | type to hold both elemental bin counts AND cumulatives. `uint32` is likely 14 | enough for many applications, though some might sneak by with `uint16` and a few 15 | might need `uint64`. This scales bin size/space cost. 16 | 17 | t-Digests are a well marketed competitor using ~10X less space BUT with >>5X 18 | slower quantiles of similar accuracy. Actual costs are sensitive to operation 19 | mixes. { This may change, but present t-Digest impls, even with trees, linear 20 | scan for quantile/CDFs. None even offer "batch" APIs to do N quantiles in one 21 | such scan. "Histo B-trees" should allow better scaling for such. } A BIST basis 22 | differs from t-Digests in other important ways. E.g., BISTs are well suited for 23 | `pop` (or moving data window ops) with *strict* finite memory to, e.g. 
translate 24 | full streams to moving quantiles as in Bollinger Band-style smooths.]## 25 | 26 | when not declared(addFloat): import std/formatfloat 27 | import adix/[bist, lna]; from std/math import exp, isNaN 28 | type 29 | LgHisto*[C] = object ## Log-spaced histogram with `Bist[C]` backing 30 | n: int # number of bins 31 | a, b: float # histogram covers [-b, -a], (-a, a) in zero, [a, b] 32 | aLn, h, hInv: float # index scale conversion pre-computes 33 | bist: Bist[C] # actual smart array of counters: [0, 2*n] -> PMF/CDF 34 | 35 | func underflows*[C](s: LgHisto[C]): int = s.bist.pmf 0 36 | func overflows*[C](s: LgHisto[C]): int = s.bist.pmf 2*s.n 37 | func low*[C](s: LgHisto[C]): float = s.a 38 | func high*[C](s: LgHisto[C]): float = s.b 39 | func nBin*[C](s: LgHisto[C]): int = s.n 40 | func bist*[C](s: LgHisto[C]): Bist[C] = s.bist 41 | 42 | func init*[C](s: var LgHisto[C], a=1e-16, b=1e20, n=8300) = 43 | ## Init histo w/2n+1 log-spaced bins: `[-∞..-b; -b..-a; 0; a..0") 46 | if n < 2: raise newException(ValueError, "n must >= 2") 47 | s.n = n 48 | s.a = a 49 | s.b = b 50 | s.aLn = lna(a) 51 | s.h = (lna(b) - lna(a))/float(n - 1) 52 | s.hInv = 1.0/s.h 53 | s.bist = initBist[C](2*n + 1) 54 | 55 | func initLgHisto*[C](a=1e-16, b=1e20, n=8300): LgHisto[C] = result.init a, b, n 56 | ## Get Histo w/2n+1 log-spaced bins: `[-inf..<-b; -b..<-a; 0; a..= -s.b: result = s.n - 1 - int( (lna(-x) - s.aLn)*s.hInv) 67 | else : result = 0 68 | elif x >= +s.a: 69 | if x <= +s.b: result = s.n + 1 + int( (lna(+x) - s.aLn)*s.hInv) 70 | else : result = 2*s.n 71 | else: result = s.n 72 | 73 | func fromIx*[F,C](s: LgHisto[C], i: int, offset: F=0.5): F = 74 | ## Geometric mean of left&right edge log-shifted `offset` fraction into bin 75 | if i < s.n: -exp(s.aLn + s.h*(F(s.n - i - 1) + F(1) - offset)) 76 | elif i > s.n: +exp(s.aLn + s.h*(F(i - s.n - 1) + offset)) 77 | else: 0.0 # The bin containing x=zero cannot really be offset in the same way 78 | 79 | func binAB*[F,C](s: LgHisto[C], x: F): (float, float) = 80 | ## Range in data space of the bin containing `x`; Costs 2 `fromIx`s. 
81 | let i = s.toIx(x) 82 | if i == 0 : result[0] = -Inf ; result[1] = -s.b 83 | elif i == 1 : result[0] = -s.b ; result[1] = s.fromIx(i, 1.0) 84 | elif i == 2*s.n-1: result[0] = s.fromIx(i, 0.0); result[1] = +s.b 85 | elif i == 2*s.n : result[0] = +s.b ; result[1] = +Inf 86 | elif x < -s.a : result[0] = s.fromIx(i, 0.0); result[1] = s.fromIx(i, 1.0) 87 | elif x >= +s.a : result[0] = s.fromIx(i, 0.0); result[1] = s.fromIx(i, 1.0) 88 | else : result[0] = -s.a ; result[1] = +s.a 89 | 90 | func add*[F,C](s: var LgHisto[C], x: F, w=1.C) = 91 | ## Increment bin for value `x` by weight `w` 92 | if not isNaN(x): s.bist.inc(s.toIx(x), w) 93 | 94 | func pop*[F,C](s: var LgHisto[C], x: F, w=1.C) = 95 | ## Alias for `add` with a negative weight argument 96 | if not isNaN(x): s.bist.dec(s.toIx(x), w) 97 | 98 | iterator bins*[C](s: LgHisto[C]): (float, float, C) = 99 | ## Yield `(lo, hi, count)` for each bin covered 100 | yield (-Inf, -s.b, s.bist.pmf 0) 101 | yield (-s.b, s.fromIx(1,1.0), s.bist.pmf 1) 102 | for i in 2 ..< s.n: yield (s.fromIx(i,0.0),s.fromIx(i,1.0),s.bist.pmf i) 103 | yield (-s.a, s.a, s.bist.pmf s.n) # middle bin breaks geometric mean formula 104 | for i in s.n+1..<2*s.n-1: yield (s.fromIx(i,0.0),s.fromIx(i,1.0),s.bist.pmf i) 105 | yield (s.fromIx(2*s.n-1,0.0), +s.b, s.bist.pmf 2*s.n-1) 106 | yield (+s.b, +Inf, s.bist.pmf 2*s.n) 107 | 108 | proc `$`*[C](s: LgHisto[C], nonZero=true): string = 109 | ## Formatting operator; Warning: output can be large, esp. if nonZero=false 110 | result.add "n: " & $s.n & "\ta: " & $s.a & "\tb: " & $s.b & "\n" 111 | result.add "aLn: " & $s.aLn & "\th: " & $s.h & "\thInv: " & $s.hInv & "\n" 112 | result.add "bins,cnts:\n" 113 | var tot = 0; var n = 0 114 | for (a, b, c) in s.bins: 115 | let c = int(c); tot += c 116 | if nonZero: 117 | if c != 0: result.add " [ " & $a & " , " & $b & " ): " & $c & "\n"; inc n 118 | else : result.add " [ " & $a & " , " & $b & " ): " & $c & "\n" 119 | result[^1] = '\n' 120 | result.add "totalCount: " & $tot & (if nonZero: " non0Bins: " & $n else: "") 121 | 122 | func quantile*[F,C](s: LgHisto[C], q: F): F = 123 | ## Basic quantile; XXX Can log-spacing savvy interpolation be more accurate? 124 | if q < 0.0 or q > 1.0: return NaN 125 | var iL, iH: int 126 | let fL = s.bist.quantile(q, iL, iH) 127 | fL*s.fromIx(iL) + (1 - fL)*s.fromIx(iH) 128 | 129 | func cdf*[F,C](s: LgHisto[C], x: F): C = 130 | ## Raw count; Leave to caller to multiply by 1/s.bist.count; XXX Interpolate? 131 | if x.isNaN: NaN else: s.bist.cdf(s.toIx(x)) 132 | 133 | func merge*[C](dst: var LgHisto[C], src: LgHisto[C]) = 134 | ## Merge counts from src into dst. 
135 | if src.n != dst.n or src.a != dst.a or src.b != dst.b: 136 | raise newException(ValueError, "src-dst histogram parameter mismatch") 137 | for i in 0..2*src.n: dst.bist.inc i, src.bist.pmf(i) # Flat array can be fastr 138 | 139 | when isMainModule: 140 | when defined(test): # Helpful to run against: \ -12 \ -8 \ -4 \ -1 0 1 4 8 12 141 | proc lghist(a=0.125, b=10.0, n=8, qs = @[0.25, 0.5, 0.75], xs: seq[float]) = 142 | var lh = initLgHisto[uint16](a, b, n) 143 | for x in xs: lh.add x 144 | echo `$`(lh, nonZero=false) 145 | for (a, b, c) in lh.bins: 146 | if (a,b) != lh.binAB((a+b)/2) or a >= b: 147 | echo "a: ",a," b: ",b," c: ",c," ab(mid(a,b)): ",lh.binAB((a+b)/2) 148 | if lh.tot > 0: (for q in qs: echo "q",q,": ",lh.quantile(q)) 149 | import cligen; dispatch lghist 150 | else: 151 | import std/[random, times, strformat] 152 | when defined danger: randomize() 153 | const N = 750_000 154 | var data = newSeq[float32](N) 155 | const Q = [0.001,0.01,0.05,0.1587,0.25,0.50,0.75,0.8413,0.95,0.99,0.999] 156 | var res = newSeq[float32](Q.len) 157 | for i in 0.. 0: lg(numBound) 63 | elif initialSize > 0: lg(initialSize) 64 | else: 0 65 | let bitsz = initialSize * bits 66 | s.data.setLen if bitsz > 0: (roundUp(bitsz) shr iShf) else: 1 67 | s.len = initialSize 68 | s.bits = bits.int8 69 | 70 | proc initSeqUint*(initialSize=0, numBound=0): SeqUint {.inline.} = 71 | result.init(initialSize, numBound) 72 | 73 | template BadIndex: untyped = 74 | when declared(IndexDefect): IndexDefect else: IndexError 75 | 76 | # Consider storing 3 bit numbers packed into 8 bit words big-endian-wise like: 77 | # indices for trad. R2Left ops: 76543210 76543210 76543210 78 | # The layout can be either A) m=1 [....210.] OR B) m=7 [0.......][......21]. 79 | # Goes to bit-algebra `(w shr m) and msk` OR `(w1 and 3) shl 2 or (w0 shr m)` 80 | # where `m == bitix % 8` is the modulus of low order bit index relative to wdsz. 81 | proc `[]`*(s: SeqUint, i: int|uint): uint {.inline.} = 82 | if int(i) >= s.len: 83 | raise newException(BadIndex(), formatErrorIndexBound(int(i), s.len)) 84 | let sbits = uint(s.bits) 85 | let bitix = uint(i) * sbits 86 | let wdix = bitix shr iShf 87 | let wdmod = bitix and iMsk 88 | let bitend = wdmod + sbits 89 | if bitend <= iBit: 90 | result = (s.data[wdix] shr wdmod) and ((1'u shl sbits) - 1) 91 | else: 92 | let w0bit = iBit - wdmod 93 | let oFlow = sbits - w0bit 94 | let oMask = (1'u shl oFlow) - 1 95 | result = ((s.data[wdix+1]and oMask) shl w0bit) or (s.data[wdix] shr wdmod) 96 | 97 | # Reconsider the above bit extraction diagram/example for bit deposit. Here we 98 | # update one or two words. In A) the new value is one 3-way bitwise OR of two 99 | # old & 1 new parts, e.g.: (old and 240)or(num shl 1)or(old and 1) or more 100 | # generally (wd and hiM) or (num shl m) or (wd and mMask) where hiM is the 101 | # complement of the m+3 shift and mMask the mask for m bits. Case B) does two 102 | # bitwise ORs stored to the pair of words. The 1st goes to ((num and 1) shl 7) 103 | # or (w0 and loM) while the 2nd to (w1 and not oMask) or (num shr w0bit). 
104 | proc `[]=`*(s: var SeqUint, i: int|uint, x: int|uint) {.inline.} = 105 | let x = uint(x) and ((1'u shl s.bits) - 1) 106 | if int(i) >= s.len: 107 | raise newException(BadIndex(), formatErrorIndexBound(i, s.len)) 108 | let sbits = uint(s.bits) 109 | let bitix = uint(i) * sbits 110 | let wdix = bitix shr iShf 111 | let wdmod = bitix and iMsk 112 | let bitend = wdmod + sbits 113 | if bitend <= iBit: 114 | let wd = s.data[wdix] 115 | let hiM = if bitend == iBit: 0'u else: (not 0'u) shr bitend shl bitend 116 | let mMask = (1'u shl wdmod) - 1 117 | s.data[wdix] = (wd and hiM) or (x shl wdmod) or (wd and mMask) 118 | else: 119 | let w0bit = iBit - wdmod 120 | let oFlow = sbits - w0bit 121 | let w0 = s.data[wdix] 122 | let w1 = s.data[wdix + 1] 123 | let oMask = (1'u shl oFlow) - 1 124 | let loM = (1'u shl wdmod) - 1 125 | let cMask = (1'u shl (iBit - wdmod)) - 1 126 | s.data[wdix] = ((x and cMask) shl wdmod) or (w0 and loM) 127 | s.data[wdix+1] = (w1 and not oMask) or (x shr w0bit) 128 | 129 | proc setLen*(s: var SeqUint, size: int) {.inline.} = 130 | let bitsz = size * s.bits 131 | s.data.setLen (roundUp(bitsz) shr iShf) 132 | s.len = size 133 | 134 | proc add*(s: var SeqUint, v: uint) {.inline.} = 135 | let i = s.len 136 | s.setLen i + 1 137 | s[i] = v 138 | 139 | iterator items*(s: SeqUint): uint = 140 | for i in 0 ..< s.len: yield s[i] 141 | 142 | iterator pairs*(s: SeqUint): (int, uint) = 143 | for i in 0 ..< s.len: yield (i, s[i]) 144 | 145 | proc `$`*(s: SeqUint): string = 146 | result = "[" 147 | for i, v in s: result.add (if i < s.len - 1: $v & ", " else: $v) 148 | result.add "]" 149 | 150 | when isMainModule: 151 | var s1 = initSeqUint(16) 152 | for i in 0 ..< s1.len: # Single big word, even small per big, fwd order 153 | let n = uint(i) 154 | s1[i] = n 155 | if s1[i] != n: echo "i: ", i, " SET ", n, " BUT GOT ", s1[i], " BACK" 156 | 157 | var s2 = initSeqUint(44, numBound=16) 158 | for i in 0 ..< s2.len: # Three big words, even small per big, fwd order 159 | let n = uint(i and 15) 160 | s2[i] = n 161 | if s2[i] != n: echo "i: ", i, " SET ", n, " BUT GOT ", s2[i], " BACK" 162 | 163 | var s3 = initSeqUint(128, 8) 164 | for i in 0 ..< s3.len: # Six big words, uneven small per big, fwd 165 | let n = uint(i and 7) 166 | s3[i] = n 167 | if s3[i] != n: echo "i: ", i, " SET ", n, " BUT GOT ", s3[i], " BACK" 168 | 169 | var s4 = initSeqUint(64*13, numBound=32) 170 | for i in 0 ..< s4.len: # 65 big words, 5-bit nums, pseudo-rand vals 171 | let n = uint((i * 19) and 31) 172 | s4[i] = n 173 | if s4[i] != n: echo "i: ", i, " SET ", n, " BUT GOT ", s4[i], " BACK" 174 | 175 | # Now all the same as above but looping high to low 176 | var s5 = initSeqUint(16) 177 | for i in countdown(s5.len - 1, 0): 178 | let n = uint(i) 179 | s5[i] = n 180 | if s5[i] != n: echo "i: ", i, " SET ", n, " BUT GOT ", s5[i], " BACK" 181 | 182 | var s6 = initSeqUint(44, numBound=16) 183 | for i in countdown(s6.len - 1, 0): 184 | let n = uint(i and 15) 185 | s6[i] = n 186 | if s6[i] != n: echo "i: ", i, " SET ", n, " BUT GOT ", s6[i], " BACK" 187 | 188 | var s7 = initSeqUint(128, 8) 189 | for i in countdown(s7.len - 1, 0): 190 | let n = uint(i and 7) 191 | s7[i] = n 192 | if s7[i] != n: echo "i: ", i, " SET ", n, " BUT GOT ", s7[i], " BACK" 193 | 194 | var s8 = initSeqUint(64*13, numBound=32) 195 | for i in countdown(s8.len - 1, 0): 196 | let n = uint((i * 19) and 31) 197 | s8[i] = n 198 | if s8[i] != n: echo "i: ", i, " SET ", n, " BUT GOT ", s8[i], " BACK" 199 | 200 | var s9 = initSeqUint(0, numBound=32) 201 | for x 
in s8: s9.add x 202 | if $s9 != $s8: echo "grown seqUint != assigned"; echo s9; echo s8 203 | s9.setLen 0 204 | for x in s6: s9.add x 205 | if $s9 != $s6: echo "grown seqUint != assigned2"; echo s9; echo s6 206 | -------------------------------------------------------------------------------- /adix/hist.nim: -------------------------------------------------------------------------------- 1 | ##[ This provides a simple regular histogram with the `bist.nim` interface, but 2 | using vanilla bin counters mostly so it can be used with adix/xhist1. Bin incs 3 | are fast & `adix/cumsum` can be at least tolerable via parallel prefix sum on 4 | x86_64. Re-scaling|shifting bins can also be done (externally) post-decrement & 5 | pre-increment and is very vectorizable. This enables general weight decay, not 6 | only exponential/linear/flat, but adds per-point expense proportional to nBin. 7 | For specific weight decays this could be ~2X optimized some by "always counting 8 | up", as with Fenwick-backed embist/lmbist, but we hold off on that for now to 9 | provide a distinct calculation with distinct FP math trade offs. 10 | 11 | Performance break-even vs BIST methods depends on (at least!) counter size / 12 | vectorization, platform, time kernel, and work load. Ballpark expectations 13 | might be to use this <= ~300 bins for 1:1 ratios of dec, inc & quantile with 14 | this being about 4X faster for 8 bins & nBin/lg(nBin) slower for larger nBin. 15 | This can be strictly faster if your use case has many counter updates per 16 | quantile query. This can also potentially be accelerated by GPUs (especially in 17 | the context of transforming whole, already existing arrays rather than online / 18 | incremental transformation). The bottom of this module has a small test/timing 19 | program against a bist. ]## 20 | when not declared assert: import std/assertions 21 | import adix/cumsum, std/algorithm 22 | 23 | type Hist*[N: SomeNumber] = object ## Simple Histogram 24 | cnt*, csum*: seq[N] ## PDF/PMF/counter array and its cumulative sum 25 | tot*: N ## csum[^1], but always up to date 26 | dirty*: bool ## Flag indicating if .csum may be out of date from .cnt 27 | 28 | proc len*[N](h: Hist[N]): int = h.cnt.len ## Number of bins & bytes 29 | func size*[N](h: Hist[N]): int = h.len*N.sizeof 30 | func space*[N](h: Hist[N]): int = h.sizeof + 2*h.size 31 | proc tot*[N](h: Hist[N]): N = h.tot ## Raw total 32 | proc count*[N](h: Hist[N]): N = h.tot ## Total Weight 33 | 34 | proc init*[N](h: var Hist[N]; len: int) = h.cnt.setLen len; h.csum.setLen len 35 | proc initHist*[N](len: int): Hist[N] = result.init len 36 | proc clear*[N](h: var Hist[N]) = 37 | zeroMem h.cnt[0].addr, h.size; zeroMem h.csum[0].addr, h.size 38 | h.tot = 0.0; h.dirty = false 39 | proc inc*[N](h: var Hist[N]; i:int, w:N=1) = h.cnt[i]+=w; h.tot+=w; h.dirty=true 40 | ## Add weight `w` to bin `i` & `.tot`; set dirty 41 | proc dec*[N](h: var Hist[N]; i:int; w:N=1) = h.cnt[i]-=w; h.tot-=w; h.dirty=true 42 | ## Subtract weight `w` from bin `i` & `.tot`; set dirty 43 | proc up*[N](h: var Hist[N]) = ## Update `.csum` field after various inc/dec's 44 | if h.dirty and h.csum.len > 0 and h.cnt.len > 0: 45 | copyMem h.csum[0].addr, h.cnt[0].addr, h.size 46 | cumsum.cumsum h.csum; h.dirty = false 47 | 48 | proc cdf*[N](h: Hist[N], i: int): N = h.csum[i] 49 | proc pmf*[N](h: Hist[N], i: int): N = h.cnt[i] 50 | 51 | proc invCDF*[N](h: Hist[N], s: N; s0: var N): int = 52 | ## For `0 < s <= tot`, bracket ECDF jump `>= s`. I.e. 
find `i0, s0` so `s0 = 53 | ## sum(..< i0) < s yet sum(..i0) >= s` in `lgCeil n` array probes. 54 | assert 0<=s and s<=h.tot, "Hist.invCDF OORange sum " & $s & " of " & $h.tot 55 | result = h.csum.lowerBound(s) #NOTE: s<0|s>tot are invalid inputs 56 | if result >= h.cnt.high: result = h.cnt.high; s0 = h.tot 57 | else: s0 = h.csum[result] - h.cnt[result] 58 | 59 | proc `$`*[N](h: Hist[N]): string = "tot: " & $h.count & " pmf: " & $h.nPDF 60 | 61 | proc invCDF*[N](h: Hist[N], s: N): (int, N) = result[0] = h.invCDF(s, result[1]) 62 | ## For `0 < s <= tot` return `(i0,s0)` so `sum(..=s` 63 | 64 | proc invCDF*[N](h: Hist[N]; s: N; s0, s1: var N): int = 65 | ## For `0 < s <= tot`, find `i0,s0,s1` so `s0 < s <= s1` and `s0+pmf(i0)==s1`. 66 | result = h.invCDF(s, s0) 67 | if result == h.cnt.high: s1 = s0; s0 = s1 - h.cnt[result] 68 | else: s1 = s0 + h.cnt[result] 69 | 70 | proc min*[N](h: Hist[N]): int = ## Simple wrapper: invCDF(h, 1) 71 | var s0: N; h.invCDF(1, s0) 72 | 73 | proc max*[N](h: Hist[N]): int = ## Simple wrapper: invCDF(h,h.count). 74 | var s0: N; h.invCDF(h.tot.N, s0) 75 | 76 | from std/fenv import epsilon #XXX Centralize thrice replicated Parzen(invCDF) 77 | proc quantile*[N](h: Hist[N]; q: float; iL,iH: var int): float = 78 | ## Parzen-interpolated quantile; E.g., q=0.9 => 90th percentile. ``answer = 79 | ## result*iL + (1-result)*iH``, but is left to caller to do { in case it is 80 | ## mapping larger numeric ranges to/from iL,iH }. Tm ~ ``2*lg(addrSpace)``. 81 | ## Unlike other (broken!) quantile-interpolation methods, Parzen's connects 82 | ## midpoints of vertical CDF jumps, not horizontal. This makes more sense, 83 | ## corresponding to Wilcoxon 1945 & later tie mid-ranking recommendations. 84 | assert h.tot > 0, "quantile(Hist[N]) requires non-empty Hist." 85 | var sL0, sL1, sH0, sH1: N #You probably want to draw a CDF to 86 | let n = h.tot.float #..fully understand this code. 87 | let qN = q*n 88 | let wq = when N is SomeFloat: N.epsilon*n else: 1.N # A Quantum Of Ctr Wgt 89 | if qN <= 0.5*wq.float : iL = h.min;iH=0;return 1 #Early tails rets; Pure iL 90 | if qN >= n - 0.5*wq.float: iL = h.max;iH=0;return 1 #{Early body are pure iH.} 91 | let dqN=when N is SomeFloat: wq else: 1.5 # Min round-off + max odds high side 92 | iH = h.invCDF(N(qN + dqN), sH0, sH1) # sH0all iH 99 | if sH0 < wq: return 0 #For qN this small, iH = iL = min. 100 | iL = h.invCDF(sH0, sL0, sL1) #..Also, cannot call invCDF(0). 101 | when N is SomeFloat: # Should be impossible,but round-off 102 | if sL1 > sH0 + wq: #..makes it happen sometimes & when 103 | iL = h.invCDF(sH0 - wq, sL0, sL1) #..it does, we want next lower bin. 
104 | let sMidL = 0.5*float(sL0 + sL1) #Mid-vertJump(nxtLwrBin) gives line 105 | min (sMidH - qN)/(sMidH - sMidL), 1.0 #Runs of N.eps-sized bins=>anomalies 106 | 107 | proc quantile*[N](h: Hist[N], q: float): float = 108 | ## Parzen-interpolated quantile when no caller index mapping is needed 109 | var iL, iH: int 110 | let fL = h.quantile(q, iL, iH) 111 | fL*iL.float + (1 - fL)*iH.float 112 | 113 | proc nPDF*[N](h: Hist[N]): seq[float32] = 114 | result.setLen h.len;let s=1/h.tot.float32;for i,r in mpairs result:r=s*h.pmf(i).float32 115 | 116 | proc nCDF*[N](h: Hist[N]): seq[float32] = 117 | result.setLen h.len;let s=1/h.tot.float32;for i,r in mpairs result:r=s*h.cdf(i).float32 118 | 119 | when isMainModule: 120 | const fast {.booldefine.} = false # VERY limited differences below 121 | when not declared addFloat: import std/[syncio, formatFloat] 122 | import std/[times, strformat], cligen, cligen/sysUt 123 | when fast: import adix/bist 124 | proc hist(xs: seq[int], win=3, q = -2.0, pdf=false,cdf=false,time=false, 125 | xMn=0,xMx=7) = 126 | template toI(x): untyped = max(xMn, min(xMx, x)) - xMn # Clip & shift 127 | if win < 2: Value !! "win " & $win & " too small" 128 | when fast: (var d = initBist[uint32](xMx - xMn + 1)) 129 | else : (var d = initHist[uint32](xMx - xMn + 1)) 130 | let t0 = epochTime() 131 | var tQ = 0.0 # Report avg qtl to ensure compiler cannot elide 132 | for t, x in xs: 133 | let x = x.toI # xOld frm Deque=moreGeneral 134 | if t >= win: d.dec xs[t - win].toI, 1 # Remove leaving 135 | d.inc x, 1 # Add entering 136 | if pdf: echo t," b: tot: ",d.tot," mPMF: ",d.nPDF 137 | when not fast: d.up # Make callers do this only once @top-level 138 | if cdf: echo t," b: tot: ",d.tot," mCDF: ",d.nCDF 139 | if q > -2.0: 140 | if time: tQ += d.quantile(q) # `formatFloat` slow=>just total 141 | else: echo d.quantile(q) # Report inverseCDF(q) 142 | if time: 143 | let n = xs.len.float; let dt = (epochTime() - t0)*1e9/n 144 | stderr.write &"n: {xs.len} ns/no: {dt:.1f} w: {win} : {tQ/n}\n" 145 | 146 | dispatch hist, short={"xMn":'a', "xMx":'b'}, help={"xs": "x values", 147 | "win" : "moving data window in points","q" : "quantile to report; 0.5=median", 148 | "pdf" : "print PDF each time step" ,"cdf": "print CDF each time step", 149 | "time": "print timing statistics", 150 | "xMn" : "`xs[i]` clipped to this `a` on `[a, xs]`", 151 | "xMx" : "`xs[i]` clipped to this `b` on `[xs, b]`"} 152 | -------------------------------------------------------------------------------- /tests/btshell.nim: -------------------------------------------------------------------------------- 1 | when not declared(stdout): import std/[syncio, objectdollar, formatfloat] 2 | import ppss, btree, strutils, strformat, os, times 3 | 4 | var verb = false # global verbosity flag 5 | var found = false # global last-seek found flag 6 | var f = stdout # to allow switch to shared stderr 7 | when defined(btTall): # Deep 234 trees hit all cases fast.. 
8 | const nodz = 32 9 | else: #..But default to 2*64B cache lines 10 | const nodz = 128 11 | 12 | type # Instantiate a tree type/suite 13 | Pair = tuple[key, val: int16] 14 | Ix = int16 #XXX should not need signed type; Chk 15 | Ln = uint16 16 | 17 | proc getKey(x: Pair): int16 = x.key 18 | 19 | when defined(btHisto): 20 | proc size(ob: Pair): int = int(ob.val) 21 | defBTree(Pair, int16, int32, int16, nodz, Ix, Ln) 22 | else: 23 | defBTree(Pair, int16, int32, void, nodz, Ix, Ln) 24 | const m = btOrder() 25 | 26 | proc print(path: Path) = # This prints `path` on one line, but 27 | if not verb: return #..*ONLY* in verbose mode; else no-op. 28 | f.write &"PATH({path.len})" 29 | if path.len > 0: 30 | let k = path[^1].p[].ob[path[^1].i].key 31 | let v = path[^1].p[].ob[path[^1].i].val 32 | f.write &": {path} K: {k} V: {v}" 33 | f.write '\n' 34 | 35 | # For B-Trees, both objects on internal nodes bracketing the link are "inPath". 36 | # This is at least better than highlighting the *entire* B-tree node. 37 | const colorL = [ "30;7", "101", "102", "103", "104", "105", "106", "107" ] 38 | const colorD = [ "40;7", "41", "42", "43", "44", "45", "46", "47" ] 39 | let color = if getEnv("LC_THEME", "L").toUpper[0] == 'L': colorL else: colorD 40 | proc print(t: Ln, path: Path, lab="") = 41 | const indent = 8 42 | proc inPath(t: Ln, path: Path, ob: Pair): bool = # path markup flag 43 | for i, e in path: # `apply` acts on obs=>colorize obs around each e.p,i link 44 | if (e.p == t and e.i < 2*m-1 and e.p[].ob[e.i] == ob) or 45 | (i < path.len-1 and e.i > 0 and e.p[].ob[e.i-1] == ob): 46 | return true 47 | if lab.len > 0: f.write lab 48 | for (t, ob, depth) in t.nodes: 49 | let c = if t.inPath(path,ob): color[0] else: color[(76543*int(t)) mod 7 + 1] 50 | f.write repeat(" ", depth * indent), "\e[", c, "m", 51 | &"{int(t)}: k{ob.key},v{ob.val} (w{t[].wt})", "\e[0m\n" 52 | path.print 53 | 54 | proc treeCk(t: Ln): int = 55 | result = t.check 56 | if result == 0: result += int(t.refCk) 57 | 58 | let help = """c check many tree invariants 59 | p print colorized tree 60 | P print current `path` 61 | h height/occupancy statistics 62 | m set path to s-most side; 0=min; 1=max 63 | a set path to [01]-side neighbor 64 | k set path to where ob with key is/should be 65 | i insert k,v at current path 66 | d delete ob at current path 67 | n[01] set path to 0-origin -th element; optional [01] => print key,ob 68 | r compute 1-origin rank; show seekNth autocalc 69 | + seek k, then s=0|1 (pre|ap)pend k,v ; ins|ctr inc in btHisto mode 70 | - seek k, then s=0|1 (front|back)pop k; ctr dec|del in btHisto mode 71 | A bulk load/add k,0 from empty in s=0|1 rev|fwd order; spare 72 | D done with bulk adds 73 | t start stop-watch 74 | T stop stop-watch and print nanoseconds, #ops 75 | z print node size in bytes 76 | X no-op; maybe useful to time loop dispatch""" 77 | 78 | proc btshell(verbose=false, quiet=false, check=false, errstd=false): int = 79 | ## This shell reads ``ppss`` output to test all BTree ops & post-mutate check 80 | ## tree invariants. Interactive use (e.g. ppss|btshell) is good to see how it 81 | ## works/fails "in the small". Small programs to generate series of inputs in 82 | ## ``check`` mode is good to exercise all usage modes/trap bugs reproducibly. 83 | ## Color-structure highlighted print outs helps show structural bugs/features. 
84 | verb = verbose # Propagate CL -> convenience globals 85 | if errstd: f = stderr 86 | let t = newN() # Init necessary to start w/empty tree 87 | var path: Path 88 | var r = 0 89 | var cin: Command 90 | var t0: Time 91 | var nOp = 0 92 | var ob: Pair 93 | template maybeCk = 94 | if check and t.treeCk > 0: t.print(path, "ERR\n"); return 1 95 | while stdin.readObject(cin.addr, cin.sizeof) == 1: 96 | nOp.inc 97 | if verbose: f.write cin, '\n' 98 | let s = cin.side; let k = int16(cin.key); let v = int16(cin.val) 99 | ob.key = k; ob.val = v 100 | case cin.letter 101 | of 'c': (let nE = t.treeCk; if nE > 0: f.write nE, " ERRS\n") # check 102 | of 'p': t.print(path) # print tree 103 | of 'P': (let tmp = verb; verb = true; path.print; verb = tmp) # print path 104 | of 'h': # height stats 105 | var nN,nO: int; let h = stats(t,nN,nO); let u=float(nO)/float(nN*(2*m-1)) 106 | f.write &"nOb: {nO} nNode: {nN} height: {h} : {u}\n" 107 | of 'm': path.setLen 0; seekMost(t, path, bool(k)); path.print # most `k` 108 | of 'a': seekAdj(path, bool(k)); path.print # adjacent `k` 109 | of 'k': # key search 110 | found = if cin.sided: t.seekKeys(path, s, k) else: t.seekKey(path, k) 111 | path.print 112 | of 'i': # insert ob @path 113 | if path.len > 0: path.add(ob, bool(k), found) 114 | else: f.write "cannot insert @empty path\n" 115 | maybeCk 116 | of 'd': (if path.len > 0: path.del; maybeCk) # Delete Ob @current path 117 | of 'n': # Move path to Nth elt 118 | r = t.seekNth(path, k); path.print 119 | if cin.sided and path.len > 0: 120 | if s: f.write path[^1].p[].ob[path[^1].i], '\n' 121 | else: f.write path[^1].p[].ob[path[^1].i].key, '\n' 122 | of 'r': f.write &"rnk: {rank(path)} r: {r}\n" # 1/0 origin rank of path 123 | of '+': 124 | when defined(btHisto): # This eg. ignores `sided`, doing only ctr; 125 | found = t.seekKey(path, k) #..With GC allocator could instead do seq 126 | if found: #..with real prepend/append with offset `r`. 127 | path[^1].p[].ob[path[^1].i].val.inc 128 | for j in 0 ..< path.len: path[j].p[].wt += 1 129 | else: 130 | ob.val = 1 # Only reason `ob` needs to be `var` 131 | path.add true, ob 132 | else: # +0 k v prepend k,v; +1 k v append k,v 133 | if cin.sided: # Unconditional add 134 | found = t.seekKeys(path, s, k) 135 | path.add(ob, s, found) 136 | else: # Add if missing 137 | found = t.seekKey(path, k) 138 | if found: f.write k, " already present\n" 139 | else: 140 | path.add(ob, s, found) 141 | if check: # Maybe dbl ck path post-add 142 | var path2: Path; discard t.seekKey(path2, k) 143 | if path2 != path: f.write "post add path mismatch\n" 144 | maybeCk 145 | of '-': 146 | when defined(btHisto): # Externally managed-rank integrated key 147 | found = t.seekKey(path, k) #..counter example just decrements|pops. 
148 | if found: 149 | if path[^1].p[].ob[path[^1].i].val == 1: path.del 150 | else: 151 | path[^1].p[].ob[path[^1].i].val.dec 152 | for j in 0 ..< path.len: path[j].p[].wt -= 1 153 | elif not quiet: f.write &"{k} not found\n" 154 | else: # -0 k front pops k; -1 k pops k 155 | found = if cin.sided: t.seekKeys(path, s, k) else: t.seekKey(path, k) 156 | if found: path.del; maybeCk 157 | elif not quiet: f.write &"{k} not found\n" 158 | of 'A': badd(path, ob, s, t, v) # A1 k S bulk adds k,0 w/spare S 159 | of 'D': baddDone(t, s, k); maybeCk # Finalize after bulk adds 160 | of 'X': discard 161 | of 'z': f.write "node size: ", Node.sizeof, " bytes\n" 162 | of 't': nOp = 0; t0 = getTime() 163 | of 'T': f.write nOp, " ops in ", (getTime() - t0).inNanoseconds, " ns\n" 164 | else: f.write &"unknown command '{cin.letter}'; choices are:\n{help}\n" 165 | return nOp mod 2 # Ensure compiler cannot elide calc 166 | 167 | when isMainModule: 168 | import cligen 169 | dispatch(btshell, help={ "verbose": "echo read ops & path post [maknp]", 170 | "quiet" : "\"not found\" del vs silent no-op", 171 | "check" : "auto check after all mutating ops", 172 | "errstd" : "echos -> stderr" }) 173 | -------------------------------------------------------------------------------- /adix/lmbist.nim: -------------------------------------------------------------------------------- 1 | ##[ `Bist[T]` (& clients `lghisto`, `mvstat`) already support quantiles over 2 | moving data windows. Sometimes one wants recent values to carry more weight in 3 | summary statistics, as in a Linearly Weighted Moving Average. In the context of 4 | a distribution, one weights by replication - more copies of more recent points 5 | vs. earlier & earlier. While one can do this by literal data point repetition, 6 | that expands both space & time costs. So, one should prefer virtual replication 7 | - a histogram putting weight into bins with the same time structure. 8 | 9 | A naive implementation decays weight for each point in the window as each point 10 | leaves and adds a new point with weight `w`. This is `O(nBin)` - no faster than 11 | a full rebuild. This CAN be done with no loops longer than `lg(nBin)`, though. 12 | The key insight is to "count up forever" adding in new points with weight `t+1`, 13 | but subtract a virtual zero level. If no actual duplicates exist, this 0-level 14 | is simply `t-w`. Each duplicate @various lags (common w/binning, e.g. `lghisto`) 15 | gets 1 "copy" of this virtual 0-level - which "stack up" in a bin. This can be 16 | handled with a *second* `Bist` tracking *only* membership. The actual distro is 17 | then the linear combination `cnt[] - zero*nLag[]`. 18 | 19 | We thus get `LMBist[T]` which bundles up this dual `Bist[T]` idea. The API is 20 | the same as `Bist[T]` & `EMBist[T]`. The bottom of this module has a small 21 | test/timing prog showing the differences. 22 | 23 | Happy to cite someone, but as far as I can tell, this is a completely novel 24 | application of Fenwick BISTs for a fast Linear weight Moving Median filter 25 | transform. I certainly came up with it on my own. The linearly weighted moving 26 | median itself while mostly obvious from its term and the more famous LWMA is all 27 | but unheard of in academic literature regardless of implementation efficiency. 28 | Khodakarami, et. al 2019 about Parkinson's is literally *THE ONLY* match on 29 | scholar.google.com as of this writing. 
Cohen & Strauss were likely somehow 30 | aware of the idea, but unaware of the "LWM Average" term when they decided to 31 | name this "chordal weighting" in their Cohen,Strauss2003 SIGMOD paper. Due to 32 | its simplicity (see adix/embist) & especially terse *average* form, exponential 33 | time kernels SO dominate that other things are usually unattended. See, e.g., 34 | Akinshin 2023 "Weighted Quantile Estimators" which does *not* consider linear 35 | time kernels (but is otherwise a very interesting paper). It is true that one 36 | needs to keep *just one window* of data points in a std/deque|ring buffer to 37 | expire items, but the same is true of exponential weighting with strict windows, 38 | needed for good history/"time breakdown point" behavior in either average or 39 | quantile settings. It is only fairly extreme scenarios where just one window 40 | exceeds CPU L3 cache, let alone DRAM, though. Please cite this github repo if 41 | this code inspires your work. ]## 42 | when not declared assert: import std/assertions # debugging 43 | import adix/[bist, bitop], cligen/sysUt 44 | 45 | type LMBist*[T: SomeNumber] = object 46 | cnt, nLag: Bist[T] # Raw count, number of Lags in-window@`i` 47 | zero: T # Window size, Bottom Level, Root Finding Guess/Return 48 | 49 | proc len*[T](d: LMBist[T]): int = d.cnt.data.len 50 | func space*[T](d: LMBist[T]): int = 2*(d.sizeof + (d.cnt.data.len + 1)*T.sizeof) 51 | proc tot*[T](d: LMBist[T]): T = d.cnt.tot - d.zero*d.nLag.tot 52 | proc count*[T](d: LMBist[T]): T = d.tot 53 | 54 | proc init*[T](d: var LMBist[T]; len: int) = d.cnt.init len; d.nLag.init len 55 | proc initLMBist*[T](len: int): LMBist[T] = result.init len 56 | proc clear*[T](d: var LMBist[T]) = d.cnt.clear; d.nLag.clear; d.zero = 0 57 | 58 | proc inc*[T](d: var LMBist[T]; i: int, w: T) = 59 | d.cnt.inc i, w; d.nLag.inc i, 1 # track both weight & membership 60 | 61 | proc dec*[T](d: var LMBist[T]; i: int, w: T) = 62 | d.cnt.dec i, w; d.nLag.dec i, 1 # track both weight & membership 63 | d.zero += 1 # & the bottom or virtual zero. 64 | 65 | proc up*[T](d: var LMBist[T]) = discard ## Simple no-op for LMBist 66 | 67 | proc cdf*[T](d: LMBist[T], i: int): T = d.cnt.cdf(i) - d.zero*d.nLag.cdf(i) 68 | proc pmf*[T](d: LMBist[T], i: int): T = d.cnt.pmf(i) - d.zero*d.nLag.pmf(i) 69 | 70 | proc invCDF*[T](d: LMBist[T], s: T; s0: var T): int = 71 | assert 0<=s and s<=d.tot, "LMBist.invCDF OORange sum " & $s & " of " & $d.tot 72 | var c = s #NOTE: s<0|s>tot are invalid inputs 73 | cfor (var half = d.cnt.data.len.ceilPow2 shr 1), half != 0, half >>= 1: 74 | var m = result + half - 1 # midpoint in binary search 75 | if m < d.cnt.data.len and d.cnt[m] - d.zero*d.nLag[m] < c: 76 | c -= d.cnt[m] - d.zero*d.nLag[m] 77 | result = m + 1 78 | s0 = s - c 79 | 80 | proc invCDF*[T](d: LMBist[T]; s: T; s0, s1: var T): int = 81 | result = d.invCDF(s, s0); s1 = s0 + d.pmf(result) 82 | proc min*[T](d: LMBist[T]): int = d.nLag.min ## Simple wrapper of `d.nLag.min`. 83 | proc max*[T](d: LMBist[T]): int = d.nLag.max ## Simple wrapper of `d.nLag.max`. 84 | 85 | proc quantile*[T](d: LMBist[T]; q:float; iL,iH: var int): float = 86 | assert d.tot > 0, "quantile(LMBist[T]) requires non-empty LMBist." 87 | var sL0, sL1, sH0, sH1: T #You probably want to draw a CDF to 88 | let tot = d.tot; let n = tot.float #..fully understand this code. 
89 | let qN = q*n 90 | if qN <= 0.5 : iL = d.min; iH = 0; return 1 #Early rets for tails; Pure iL 91 | if qN >= n - 0.5: iL = d.max; iH = 0; return 1 #{Early for body are pure iH.} 92 | iH = d.invCDF(T(qN + 1.5), sH0, sH1) 93 | var sMidH = 0.5*float(sH0 + sH1) #This guess works 90+% of the time.. 94 | if sMidH < qN: #..but can fail for large sH1 - sH0. 95 | if sH1 < tot: #When it fails, want next higher bin 96 | iH = d.invCDF(sH1 + 1, sH0, sH1) 97 | sMidH = 0.5*float(sH0 + sH1) 98 | else: return 0 #..unless @HIGHEST already=>all iH 99 | if sH0 == 0: return 0 #For qN this small, iH = iL = min. 100 | iL = d.invCDF(sH0, sL0, sL1) #..Also, cannot call invCDF(0). 101 | let sMidL = 0.5*float(sL0 + sL1) #Mid-vertJump(nxtLwrBin) gives line 102 | (sMidH - qN)/(sMidH - sMidL) 103 | 104 | proc quantile*[T](d: LMBist[T]; q: float): float = 105 | var iL, iH: int 106 | let fL = d.quantile(q, iL, iH) 107 | fL*iL.float + (1 - fL)*iH.float 108 | 109 | proc nPDF*[T](d: LMBist[T]): seq[float32] = 110 | result.setLen d.cnt.len;let s=1/d.tot.float32;for i,r in mpairs result:r=s*d.pmf(i).float32 111 | 112 | proc nCDF*[T](d: LMBist[T]): seq[float32] = 113 | result.setLen d.cnt.len;let s=1/d.tot.float32;for i,r in mpairs result:r=s*d.cdf(i).float32 114 | 115 | when isMainModule: 116 | const slow {.booldefine.} = false # VERY limited differences below 117 | when not declared addFloat: import std/[syncio, formatFloat] 118 | import std/[times, strformat], cligen 119 | proc lmbist(xs: seq[int], win=3, q = -2.0, pdf=false,cdf=false,time=false, 120 | xMn=0,xMx=7) = 121 | template toI(x): untyped = max(xMn, min(xMx, x)) - xMn # Clip & shift 122 | if win < 2: Value !! "win " & $win & " too small" 123 | when slow: (var d = initBist[uint32](xMx - xMn + 1)) 124 | else : (var d = initLMBist[uint32](xMx - xMn + 1)) 125 | let t0 = epochTime() 126 | var tQ = 0.0 # Report avg qtl to ensure compiler cannot elide 127 | for t, x in xs: 128 | let x = x.toI # xOld frm Deque=moreGeneral 129 | when slow: # On full data win, decay ALL old weight 130 | if t >= win: (for tw in t - win ..< t: d.dec xs[tw].toI, 1) # BIG LOOP 131 | d.inc x, min(t + 1, win).uint32 # Small entering weight 132 | else: # Remove weight for leaving data point 133 | if t >= win: d.dec xs[t - win].toI, uint32(t + 1 - win) 134 | d.inc x, uint32(t + 1) # Large entering weight 135 | if pdf: echo t," b: tot: ",d.tot," lwmPMF: ",d.nPDF 136 | if cdf: echo t," b: tot: ",d.tot," lwmCDF: ",d.nCDF 137 | if q > -2.0: 138 | if time: tQ += d.quantile(q) # `formatFloat` slow=>just total 139 | else: echo d.quantile(q) # Report inverseCDF(q) 140 | if time: 141 | let n = xs.len.float; let dt = (epochTime() - t0)*1e9/n 142 | stderr.write &"n: {xs.len} ns/no: {dt:.1f} w: {win} : {tQ/n}\n" 143 | 144 | dispatch lmbist, short={"xMn":'a', "xMx":'b'}, help={"xs": "x values", 145 | "win" : "moving data window in points","q" : "quantile to report; 0.5=median", 146 | "pdf" : "print PDF each time step" ,"cdf": "print CDF each time step", 147 | "time": "print timing statistics", 148 | "xMn" : "`xs[i]` clipped to this `a` on `[a, xs]`", 149 | "xMx" : "`xs[i]` clipped to this `b` on `[xs, b]`"} 150 | #[ A Zsh session showing basic correctness&boost of optimization. Sets up env, 151 | compiles ref & optimized; makes nums; Tests various q & w; Finally measures 'em. 
152 | nim=(nim c -d:danger); t=/tmp/nums # Set up 153 | $nim -d:slow -o=slmbist lmbist; $nim lmbist 154 | ( for i in {1..10000}; printf " %s" $((RANDOM%8)) ) > $t 155 | ( for q in .1 .25 .5 .75 .9; { for w in {2..10}; { 156 | paste <(./lmbist -w$w -q.1 `<$t`) <(./lmbist -w$w -q.1 `<$t`) | 157 | awk '{print $1-$2}' | sort -g | tails -h1 -t1 }}) 2>/dev/null|unfold -n3 158 | ./slmbist -tw756 -q.5 `<$t`; ./lmbist -tw756 -q.5 `<$t` 159 | I get NO DIFF between ref & optimized, optimized about 25X faster. I also get a 160 | -w BreakEven of -w4 for when version marked "slow" is faster, but it's only 1.2X 161 | faster at smallest sensical -w2. So, -w2..4 not really worth conditioning. ]# 162 | -------------------------------------------------------------------------------- /adix/lna.nim: -------------------------------------------------------------------------------- 1 | ## FastIEEESinglePrecNaturalLogAbs; Just arctanh Taylor@1. Was 5X fastr'n mid00s 2 | ## x87. On SkyLake/glibc2.40/gcc14 ~1.2-2X faster; ARM64glibc somehow(fastHW?) 3 | ## ~4X faster. Unsure about Win/OSX. See news.ycombinator.com/item?id=40758562 4 | type f4s {.packed.} = object # De-structuring object for IEEE-single 5 | frac1 {.bitsize: 16}: cuint # Little-Endian format only right now 6 | frac0 {.bitsize: 7}: cuint # `cuint` should make Nim use `unsigned` 7 | expo {.bitsize: 8}: cuint #..for expressions like `expo-127`. Not 8 | sign {.bitsize: 1}: cuint #..sure where this is documented. 9 | 10 | const r1_2 = 0.70710678118654752440f64 11 | const LN2 = 0.69314718055994530942f64 12 | const LNr2 = 0.34657359027997265471f64 13 | 14 | func lnaSeries(s: float): float {.inline.} = # Worst case accuracies: 15 | when defined o3: 2.0/3.0*s + 2.0 # 11.22 rel bits 16 | elif defined o5: (0.4*s + 2.0/3.0)*s + 2.0 # 17.305 rel bits 17 | elif defined o7: ((2.0/7.0*s + 0.4)*s + 2.0/3.0)*s + 2.0 # 22.774 rel bits 18 | elif defined o11: # 24.0000002 rel bits 19 | ((((2.0/11.0*s + 2.0/9.0)*s + 2.0/7.0)*s + 0.4)*s + 2.0/3.0)*s + 2.0 20 | else: (((2.0/9.0*s + 2.0/7.0)*s + 0.4)*s + 2.0/3.0)*s + 2.0 # 23.97 rel bits 21 | 22 | func lna*(x: float32): float32 {.inline.} = 23 | ## Return fast,approx Natural Log(Abs(x)) { 'a' for a)bs | a)pprox } by ATanH 24 | ## 82% of the time & by std ln(1+x) series 12+6=18%. { BUT 0.lna=-88,not -Inf| 25 | ## +Inf@Inf; Not fixed inline here since outer caller should block|handle 0. } 26 | var x = x 27 | let p = cast[ptr f4s](x.addr) 28 | p.sign = 0 # abs(); Force x to be positive 29 | let e = (p.expo.cint - 127).float*LN2 # ln(x*2^y) == ln(x) + y*ln2 30 | p.expo = 127 # force x to [1, 2) 31 | if x > 1.88f32: # Small y in x=2-y: 6 terms of ln(1+y)=Σy^i/i BUT .. 32 | let y = x.float*0.5 - 1.0 #..adjusted for NEXT octave. 33 | e + LN2 + y*(1.0 + y*(-0.5 + y*(1.0/3.0 + y*(-0.25 + y*(0.2 - y/6.0))))) 34 | elif x < 1.06f32: # Small y in x=1+y: 6 terms of ln(1+y)=Σy^i/i 35 | let y = x.float - 1.0 36 | e + y*(1.0 + y*(-0.5 + y*(1.0/3.0 + y*(-0.25 + y*(0.2 - y/6.0))))) 37 | else: # 2*atanh(x) = ln((1+x)/(1-x)) = 2*Σx^o/o for odd 'o' 38 | let d = x.float * r1_2 # x -> dbl [sqrt(1/2), sqrt2) 39 | # d=(1+r)/(1-r); (1+r)=(1-r)*d=d-rd; d-rd-r-1; r*(d+1)=d-1; r=(d-1)/(d+1) 40 | let r = (d.float - 1.0)/(d.float + 1.0) # r for r)atio of -1/+1 41 | let s = r*r # s for s)quare 42 | e + LNr2 + r*s.lnaSeries # (1.37288 +- 0.00003)X faster on SkyLake 43 | 44 | when isMainModule: 45 | when defined(bench): 46 | import std/[times, math, strformat] 47 | var sum0 = 0.0; var sum = 0.0; var n = 0 48 | let t00 = epochTime() 49 | for i in 0 .. 
(1u64 shl 32) - 1: 50 | var i = uint32(i) 51 | let x = cast[ptr float32](i.addr)[] 52 | if x.isNaN: continue 53 | if x == 0.0f32: continue # -inf 54 | inc n 55 | if not (x.isNaN or 2*x==x): sum0 += x 56 | let dt0 = epochTime() - t00 57 | let t0 = epochTime() 58 | for i in 0 .. (1u64 shl 32) - 1: 59 | var i = uint32(i) 60 | let x = cast[ptr float32](i.addr)[] 61 | if x.isNaN: continue 62 | if x == 0.0f32: continue # -inf 63 | when defined(stdlib): (let l = ln(abs(x))) 64 | else : (let l = lna(x)) 65 | inc n 66 | if not (l.isNaN or 2*x==x): sum += l 67 | let dt = epochTime() - t0 - dt0 68 | echo &"sX:{sum0:.2g} sL:{sum:.0f} in {dt0:.5f} + {dt:.5f} s;n: {n}; {dt/n.float*1e9:.3f} ns/eval" 69 | else: 70 | when not declared(stdout): import std/[syncio, formatFloat] 71 | import std/[math, heapqueue] 72 | proc lnaT*(x: float32): float32 {.inline.} = 73 | var x = x 74 | let p = cast[ptr f4s](x.addr) 75 | p.sign = 0 # force x to be positive 76 | let e = (p.expo.cint - 127).float*LN2 # ln(x*2^y) == ln(x) + y*ln2 77 | p.expo = 127 # force x to [1, 2) 78 | if x > 1.88f32: 79 | let y = x.float*0.5 - 1.0 80 | echo "x: ",x," e: ",e," y: ",y 81 | e + LN2 + y*(1.0 + y*(-0.5 + y*(1.0/3.0 + y*(-0.25 + y*(0.2 - y/6.0))))) 82 | elif x < 1.06f32: 83 | let y = x.float - 1.0 84 | echo "X: ",x," E: ",e," Y: ",y 85 | e + y*(1.0 + y*(-0.5 + y*(1.0/3.0 + y*(-0.25 + y*(0.2 - y/6.0))))) 86 | else: 87 | let d = x.float * r1_2 # x -> dbl [sqrt(1/2), sqrt2) 88 | let r = (d.float - 1.0)/(d.float + 1.0) # r for r)atio of -1/+1 89 | let s = r*r # s for s)quare 90 | echo "x: ",x," d: ",d," e: ",e," r: ",r," s: ",s," iM: ",s.lnaSeries 91 | float32(e + LNr2 + r*s.lnaSeries) 92 | const n = 15 # echo top absolute & relative errors 93 | var abErr, rlErr: HeapQueue[(float, float32, float32, float32)] 94 | for i in 0 .. 
(1u64 shl 32) - 1: 95 | var i = uint32(i) 96 | let x = cast[ptr float32](i.addr)[] 97 | if x.isNaN: continue 98 | if x < 0: continue 99 | if x == float32.low: continue 100 | if x == float32.high: continue 101 | if x < 1.1754944e-38: continue # under IEEE single limit 102 | if x > 1.7014118e38: continue # above IEEE single limit 103 | if x < 0.5: continue # accelerators 104 | if x > 2.0: continue # accelerators 105 | if x == 0.0f32: continue # -inf 106 | if x == 1.0f32: continue # exactly 0.0 107 | let accu = ln(abs(x.float)) # let accu = lnf(x.float) 108 | let appr = lna(x).float 109 | let aerr = (abs(appr - accu), x, accu.float32, appr.float32) 110 | if abErr.len < n : abErr.push(aerr) 111 | elif aerr[0] > abErr[0][0]: discard abErr.replace(aerr) 112 | let rerr = (abs(appr/accu - 1.0), x, accu.float32, appr.float32) 113 | if rlErr.len < n : rlErr.push(rerr) 114 | elif rerr[0] > rlErr[0][0]: discard rlErr.replace(rerr) 115 | if (i and 0x00FFFFFFu32) == 0: stdout.write "."; stdout.flushFile 116 | echo "\n" 117 | echo "abs: ";(while abErr.len>0:(let e=abErr.pop;echo " ",e,lnaT(e[1]))) 118 | echo "rel: ";(while rlErr.len>0:(let e=rlErr.pop;echo " ",e,lnaT(e[1]))) 119 | #[ b=(chrt 99 taskset -c 2-3 env -i HOME=/u/cb PATH=/u/cb/bin:/usr/local/bin:/usr/bin) 120 | i7_6700k$ for mode in '' -d:fm -d:fim -d:stdlib;{nim c -d:r -d:bench $mode lna>&/n;repeat 3 nor 0 $b ./lna} 121 | S0:5.3e+36 sL:1652640659.073322 in 13.312009 second;n: 8556380160; 1.56 ns/eval 122 | S0:5.3e+36 sL:1652640659.073322 in 13.285249 second;n: 8556380160; 1.55 ns/eval 123 | S0:5.3e+36 sL:1652640659.073322 in 13.296462 second;n: 8556380160; 1.55 ns/eval 124 | 1.55 +- 0.003 125 | S0:5.3e+36 sL:1652640659.073322 in 10.756718 second;n: 8556380160; 1.26 ns/eval 126 | S0:5.3e+36 sL:1652640659.073322 in 10.712787 second;n: 8556380160; 1.25 ns/eval 127 | S0:5.3e+36 sL:1652640659.073322 in 10.718144 second;n: 8556380160; 1.25 ns/eval 128 | 1.25 +- 0.003 129 | S0:5.3e+36 sL:1652640659.073322 in 11.040576 second;n: 8556380160; 1.29 ns/eval 130 | S0:5.3e+36 sL:1652640659.073322 in 11.030243 second;n: 8556380160; 1.29 ns/eval 131 | S0:5.3e+36 sL:1652640659.073322 in 10.904071 second;n: 8556380160; 1.27 ns/eval 132 | 1.27 +- 0.006 133 | S0:5.3e+36 sL:1641011596.122295 in 20.227257 second;n: 8556380160; 2.36 ns/eval 134 | S0:5.3e+36 sL:1641011596.122295 in 20.227509 second;n: 8556380160; 2.36 ns/eval 135 | S0:5.3e+36 sL:1641011596.122295 in 20.231506 second;n: 8556380160; 2.36 ns/eval 136 | 2.36 +- 0.003 137 | i7_1370P$ for mode in '' -d:fm -d:fim -d:stdlib;{nim c -d:r -d:bench $mode lna>&/n;repeat 3 nor 0 $b ./lna} 138 | S0:5.3e+36 sL:1652640659.073322 in 6.615819 second;n: 8556380160; 0.77 ns/eval 139 | S0:5.3e+36 sL:1652640659.073322 in 6.980933 second;n: 8556380160; 0.82 ns/eval 140 | S0:5.3e+36 sL:1652640659.073322 in 7.320486 second;n: 8556380160; 0.86 ns/eval 141 | 0.773 +- 0.017 142 | S0:5.3e+36 sL:1652640659.073322 in 7.609612 second;n: 8556380160; 0.89 ns/eval 143 | S0:5.3e+36 sL:1652640659.073322 in 7.661034 second;n: 8556380160; 0.90 ns/eval 144 | S0:5.3e+36 sL:1652640659.073322 in 8.191683 second;n: 8556380160; 0.96 ns/eval 145 | 0.889 +- 0.003 146 | S0:5.3e+36 sL:1652640659.073322 in 8.293185 second;n: 8556380160; 0.97 ns/eval 147 | S0:5.3e+36 sL:1652640659.073322 in 8.342474 second;n: 8556380160; 0.98 ns/eval 148 | S0:5.3e+36 sL:1652640659.073322 in 8.330391 second;n: 8556380160; 0.97 ns/eval 149 | 0.969 +- 0.003 150 | S0:5.3e+36 sL:1641011596.122295 in 7.963845 second;n: 8556380160; 0.93 ns/eval 151 | S0:5.3e+36 
sL:1641011596.122295 in 8.605017 second;n: 8556380160; 1.01 ns/eval 152 | S0:5.3e+36 sL:1641011596.122295 in 9.766621 second;n: 8556380160; 1.14 ns/eval 153 | 0.931 +- 0.027 154 | In Summary: Skylake(4.7GHz) AlderLake (5.2GHzPcore) 2ndBatch δ 155 | 1.55 +- 0.003 0.773 +- 0.017 (0.773 +- 0.016) 0.0σ 156 | 1.25 +- 0.003 0.889 +- 0.003 (0.881 +- 0.0013) 2.5σ 157 | 1.27 +- 0.006 0.969 +- 0.003 (0.893 +- 0.034) 2.2σ 158 | 1.89x 2.36 +- 0.003 1.20x 0.931 +- 0.027 (0.980 +- 0.01) 1.7σ 159 | Note that assessing CPU superscalar pipeline util is much more subtle than raw 160 | wall clock time. These "speed-ups" are really ratios of "incremental wall time 161 | per loop per lna() eval" in best possible, hot-everything cases. Min estimate 162 | here is simply min3 +- (med-min3)/3 which works ok-ish as per final δ. ]# 163 | -------------------------------------------------------------------------------- /adix/bist.nim: -------------------------------------------------------------------------------- 1 | ##[ Binary Indexed Sum Tree (BIST); Fenwick proposed "BIT" but that A) collides 2 | w/many uses B) takes partial (S)ums as implied, but explicit is better (though 3 | products can work) and C) does not rhyme with "dist" (for distribution - what it 4 | is mostly about). While Inet has tutorials, to my knowledge no one (yet) 5 | collects all these algos in one place. Fenwick1994 itself messed up `invCDF`, 6 | correcting w/a tech report a year later. This code only allocates needed space 7 | & uses 0-based array indexing. See https://en.wikipedia.org/wiki/Fenwick_tree 8 | 9 | The idea of a standard binary heap with `kids(k)@[2k],[2k+1]` for dynamic 10 | distributions goes back to Wong&Easton 1980 (or earlier?). Fenwick's clever 11 | index encoding/overlaid trees idea allows using 1/4 to 1/2 that space (only max 12 | index+1 array elements vs `2*ceilPow2(n)`), a constant factor improvement. Good 13 | explanations truly need figures, as in the original Fenwick paper | Wikipedia. 14 | 15 | The `Bist[T]` type in this module is generic over the type of counters used for 16 | partial sums|counts. For few total items, you can use a `Bist[uint8]` while for 17 | many you want to use `Bist[uint32]`. This can be space-optimized up to 2X 18 | further with `adix/sequint` specialized to store an array of B-bit counters. 19 | Ranked B-trees are faster for >24..28-bit index spaces as L3 CPU caching fails, 20 | but needing >7..8 decimal dynamic ranges is also rare.
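A minimal usage sketch (the `uint32` counter type and the tiny domain of 8 values are arbitrary illustrative choices; `initBist`, `inc`, `pmf`, `cdf` & `quantile` are all defined below):

  var b = initBist[uint32](8)   # counters for values 0..7
  b.inc 3, 1                    # one sample of value 3
  b.inc 5, 2                    # two samples of value 5
  assert b.pmf(5) == 2          # point count at 5
  assert b.cdf(5) == 3          # inclusive prefix sum over 0..5
  echo b.quantile(0.5)          # Parzen-interpolated median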
]## 21 | when not declared assert: import std/assertions 22 | import cligen/sysUt, bitop # cfor, `>>=`, `&=`; `ceilPow2` 23 | 24 | type Bist*[T: SomeNumber] = object ## A razor thin wrapper around `seq[T]` 25 | tot*: T # total counted population, via history of inc(i, d) 26 | data*: seq[T] # The Fenwick array/BIST; Relevant seq ops pass through 27 | 28 | proc init*[T](t: var Bist[T], len: int) = t.data.setLen len 29 | proc initBist*[T](len: int): Bist[T] = result.init len 30 | proc len*[T](t: Bist[T]): int = t.data.len 31 | func space*[T](t: Bist[T]): int = t.sizeof + t.data.len*T.sizeof 32 | proc count*[T](t: Bist[T]): T = t.tot 33 | proc `[]`*[T](t: Bist[T], i: int): T = t.data[i] 34 | proc `[]`*[T](t: var Bist[T], i: int): var T = t.data[i] 35 | proc `[]=`*[T](t: var Bist[T], i: int, x: T) = t.data[i] = x 36 | proc clear*[T](t: var Bist[T]) = 37 | t.tot = 0; zeroMem t.data[0].addr, t.len*T.sizeof 38 | 39 | proc inc*[T](t: var Bist[T]; i: int; d: T) = 40 | ## Adjust for count increment by `d`; Tm ~ 1/2..3/4 lg n 41 | t.tot += d 42 | cfor (var i = i.int), i < t.len, i |= i + 1: t[i] += d #Go down update tree 43 | 44 | proc dec*[T](t: var Bist[T]; i: int; d: T) = 45 | ## Adjust for count decrement by `d`; Tm ~ 1/2..3/4 lg n 46 | t.tot -= d 47 | cfor (var i = i.int), i < t.len, i |= i + 1: t[i] -= d #Go down update tree 48 | 49 | proc up*[T](t: var Bist[T]) = discard ## Simple no-op for BISTs 50 | 51 | proc cdf*[T](t: Bist[T], i: int): T = 52 | ## INCLUSIVE `sum(pmf[0..i])`, (rank,EDF,prefix sum,scan,..); Tm ~ #1-bits in `i` 53 | cfor (var i = i + 1), i > 0, i &= i - 1: #Go up interrogation tree 54 | result += t[i - 1] 55 | 56 | proc pmf*[T](t: Bist[T], i: int): T = 57 | ## Probability Mass Function @i; Avg Tm ~ 2 probes; Max Tm ~ lg n 58 | result = t[i] 59 | cfor (var mask = 1), (i and mask) == mask, mask <<= 1: 60 | result -= t[i - mask] #while LSB==1: subtract & mv up tree 61 | 62 | proc invCDF*[T](t: Bist[T], s: T; s0: var T): int = 63 | ## For `0 < s <= tot`, bracket ECDF jump `>= s`. I.e. find `i0, s0` so `s0 = 64 | ## sum(..< i0) < s yet sum(..i0) >= s` in `lgCeil n` array probes. 65 | assert 0<=s and s<=t.tot, "Bist.invCDF OORange sum " & $s & " of " & $t.tot 66 | var c = s #NOTE: s<0|s>tot are invalid inputs 67 | cfor (var half = t.data.len.ceilPow2 shr 1), half != 0, half >>= 1: 68 | var mid = result + half - 1 69 | if mid < t.data.len and t[mid] < c: 70 | c -= t[mid] 71 | result = mid + 1 72 | s0 = s - c 73 | 74 | proc fromCnts*[T](t: var Bist[T]) = 75 | ## In-place bulk convert/reformat `t[]` from counts to BIST; Max time `~1*n`. 76 | t.tot = 0 77 | for i in 0 ..< t.len: 78 | let j = i or (i + 1) 79 | if j < t.len: 80 | t[j] += t[i] 81 | else: t.tot += t[i] #..nodes w/no in-range parent sum to tot 82 | 83 | proc toCnts*[T](t: var Bist[T]) = 84 | ## In-place bulk convert/reformat `t[]` from BIST to counts; Max time ~1*n 85 | ## *Unlike the others, this routine only works for power of 2-sized arrays*. 86 | cfor (var i = t.len), i != 0, i >>= 1: #Long strides give ~n inner loops. 87 | cfor (var j = 2*i - 1), j < t.len, j += 2*i: #*Might* be slower than just 88 | t[j] -= t[j - i] #..looping & calling `pmf`. 89 | 90 | proc nPDF*[T](t: Bist[T]): seq[float32] = ## Return classic PMF from read-only BIST 91 | result.setLen t.len; let s=1/t.tot.float32;for i,r in mpairs result:r=s*t.pmf(i).float32 92 | 93 | proc nCDF*[T](t: Bist[T]): seq[float32] = ## Return classic CDF from read-only BIST 94 | result = t.nPDF; for i in 1 ..< t.len: result[i] += result[i - 1] # .cumsum?
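# Worked layout example (illustrative only): for per-index counts [1, 2, 1, 3]
# the overlaid-tree array built by `inc`/`fromCnts` is data = [1, 3, 1, 7] with
# tot = 7: data[1] holds count(0..1), data[3] holds count(0..3), while data[0]
# & data[2] hold single counts; `pmf`, `cdf` & `invCDF` above unwind that.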
95 | 96 | proc `$`*[T](t: Bist[T]): string = "tot: " & $t.count & " pmf: " & $t.nPDF 97 | 98 | proc invCDF*[T](t: Bist[T], s: T): (int, T) = result[0] = t.invCDF(s, result[1]) 99 | ## For `0 < s <= tot` return `(i0,s0)` so `sum(..<i0) = s0 < s` yet `sum(..i0) >= s` 100 | 101 | proc invCDF*[T](t: Bist[T]; s: T; s0, s1: var T): int = 102 | ## For `0 < s <= tot`, find `i0,s0,s1` so `s0 < s <= s1` and `s0+pmf(i0)==s1`. 103 | result = t.invCDF(s, s0) 104 | s1 = s0 + t.pmf(result) 105 | 106 | proc min*[T](t: Bist[T]): int = ## Simple wrapper: invCDF(t, 1) 107 | var s0: T; t.invCDF(1, s0) 108 | 109 | proc max*[T](t: Bist[T]): int = ## Simple wrapper: invCDF(t,t.count). 110 | var s0: T; t.invCDF(t.tot.T, s0) 111 | 112 | from std/fenv import epsilon 113 | proc quantile*[T](t: Bist[T]; q: float; iL,iH: var int): float = 114 | ## Parzen-interpolated quantile; E.g., q=0.9 => 90th percentile. ``answer = 115 | ## result*iL + (1-result)*iH``, but is left to caller to do { in case it is 116 | ## mapping larger numeric ranges to/from iL,iH }. Tm ~ ``2*lg(addrSpace)``. 117 | ## Unlike other (broken!) quantile-interpolation methods, Parzen's connects 118 | ## midpoints of vertical CDF jumps, not horizontal. This makes more sense, 119 | ## corresponding to Wilcoxon 1945 & later tie mid-ranking recommendations. 120 | assert t.tot > 0, "quantile(Bist[T]) requires non-empty Bist." 121 | var sL0, sL1, sH0, sH1: T #You probably want to draw a CDF to 122 | let n = t.tot.float #..fully understand this code. 123 | let qN = q*n 124 | let wq = when T is SomeFloat: T.epsilon*n else: 1.T # A Quantum Of Ctr Wgt 125 | if qN <= 0.5*wq.float : iL = t.min;iH=0;return 1 #Early tails rets; Pure iL 126 | if qN >= n - 0.5*wq.float: iL = t.max;iH=0;return 1 #{Early body are pure iH.} 127 | let dqN=when T is SomeFloat: wq else: 1.5 # Min round-off + max odds high side 128 | iH = t.invCDF(T(qN + dqN), sH0, sH1) # sH0 < qN+dqN <= sH1 129 | var sMidH = 0.5*float(sH0 + sH1) #This guess works 90+% of the time.. 130 | if sMidH < qN: #..but can fail for large sH1 - sH0. 131 | if sH1 < t.tot: #When it fails, want next higher bin 132 | iH = t.invCDF(sH1 + wq, sH0, sH1) 133 | sMidH = 0.5*float(sH0 + sH1) 134 | else: return 0 #..unless @HIGHEST already=>all iH 135 | if sH0 < wq: return 0 #For qN this small, iH = iL = min. 136 | iL = t.invCDF(sH0, sL0, sL1) #..Also, cannot call invCDF(0). 137 | when T is SomeFloat: # Should be impossible,but round-off 138 | if sL1 > sH0 + wq: #..makes it happen sometimes & when 139 | iL = t.invCDF(sH0 - wq, sL0, sL1) #..it does, we want next lower bin. 140 | let sMidL = 0.5*float(sL0 + sL1) #Mid-vertJump(nxtLwrBin) gives line 141 | min (sMidH - qN)/(sMidH - sMidL), 1.0 #Runs of T.eps-sized bins=>anomalies 142 | 143 | proc quantile*[T](t: Bist[T], q: float): float = 144 | ## Parzen-interpolated quantile when no caller index mapping is needed 145 | var iL, iH: int 146 | let fL = t.quantile(q, iL, iH) 147 | fL*iL.float + (1 - fL)*iH.float 148 | 149 | when isMainModule: 150 | import cligen, std/strutils 151 | when not declared(addFloat): import std/formatfloat 152 | type ct = uint16 153 | proc tbist(num=9, verb=false, parzen=false, thresh=0.03, args: seq[int]): int= 154 | ##[Eg `tbist $(echo 0 2 4 4 4 6 6 6 6 8 | tr \ \\n | shuf)`. Exit status is 155 | bitmask of PMF|CDF|invCDF|Extremes|discontinuousQtls|badFrom|badToCnts. ]## 156 | result = 0 #Set to non-zero on failure for easy halt of randomized tests.
157 | if args.len == 0: quit "Called with no args; --help explains more", 1 158 | var cntR = newSeq[ct](num) #Reference count/PMF/histo 159 | var sumR = newSeq[ct](num) #Reference prefix sum/CDF 160 | var minR = int.high 161 | var maxR = int.low 162 | var b = initBist[ct](num) 163 | for a in args: #Load up bist & references 164 | if a < 0 : echo "tbist: ignoring negative ", a ; continue 165 | if a >= num: echo "tbist: ignoring out of bounds ", a; continue 166 | cntR[a].inc #Reference cntR 167 | minR = min(minR, a) 168 | maxR = max(maxR, a) 169 | b.inc(a, +1) 170 | sumR[0] = cntR[0] #Low-Tech Prefix Sum/CDF 171 | for i in 1 ..< num: 172 | sumR[i] = sumR[i-1] + cntR[i] #Ref cumulative/pfx sum 173 | if verb: #Print Table 174 | echo "i(dec)\ti(bin)\tT\tcount\tcsum" 175 | for i in 0 ..< num: 176 | echo "$1\t$2\t$3\t$4\t$5"%[ $i, toBin(i,6), $b[i], $cntR[i], $sumR[i] ] 177 | for i in 0 ..< b.len: #Test pmf() 178 | if b.pmf(i) != cntR[i]: 179 | echo "i: ", i, "\tcntR: ", cntR[i], " b.pmf:", b.pmf(i); result |= 1 180 | for i in 0 ..< b.len: #Test cdf() 181 | if b.cdf(i) != sumR[i]: 182 | echo "i: ", i, "\tsumR: ", sumR[i], " b.cdf:", b.cdf(i); result |= 2 183 | for s in 1.ct .. args.len.ct: #Test invCDF 4all cumSums 184 | let (i, s0) = b.invCDF(s) 185 | let j = i - 1; let s1 = s0 + cntR[i] 186 | if s1 != sumR[i] or (j >= 0 and s0 != sumR[j]) or not(s0 < s and s <= s1): 187 | echo "cs: ",s," im1: ",j," s0: ",s0," i: ",i," s1: ",s1; result |= 4 188 | if b.min != minR: echo "wrong min: ", b.min, " not ", minR; result |= 8 189 | if b.max != maxR: echo "wrong max: ", b.max, " not ", maxR; result |= 8 190 | let dq = 1.0/2048.0 #Test quantile continuity 191 | var q0 = -1.0; var qP0 = 0.0 #Take dq as param? 192 | cfor (var q = 0.0), q <= 1.0, q += dq: 193 | let qP = b.quantile(q) 194 | if parzen : echo "P: ", q, " ", qP 195 | if q0 > -1 and abs(qP - qP0) > thresh: 196 | result |= 16 #NOTE: Test less objective; Set parzen to assess manually. 197 | echo "PdisCont: ",q0," -> ",q," ",qP0," -> ",qP," |qP0-qP|: ",abs(qP-qP0) 198 | q0 = q; qP0 = qP #save last loop values 199 | var t = b; t.data = cntR #Bulk Histogram -> BIST 200 | t.fromCnts 201 | if b.data != t.data: 202 | echo "- bad fromCnts chk -"; result |= 32 203 | for i in 0 ..< b.len: echo "i: ", i, "\tT: ", t[i] 204 | if num.isPow2: 205 | b.toCnts #Bulk BIST -> Histogram 206 | if b.data != cntR: #NOTE: `b` is clobbered 207 | echo "- bad toCnts chk -"; result |= 64 208 | for i in 0.. -------------------------------------------------------------------------------- /adix/tdigest.nim: -------------------------------------------------------------------------------- 87 | ... # => merge into existing group 88 | s.mrg[s.nM].m = s.mrg[s.nM].m + (s.buf[i].m - 89 | s.mrg[s.nM].m)*s.buf[i].W/s.mrg[s.nM].W 90 | s.buf[i].w = 0 91 | else: # didn't fit => mv2next output; Cp 1st group 92 | wSoFar += s.mrg[s.nM].W 93 | k = ks[s.scale](wSoFar/s.wTot, norm) 94 | wLim = s.wTot * qs[s.scale](k + 1, norm) 95 | s.nM.inc 96 | if s.nM >= s.mrg.len: 97 | s.mrg.setLen s.nM + 1; {.cast(noSideEffect).}: echo "AUTO-EXPAND" 98 | s.buf.setLen s.buf.len + 1 99 | s.mrg[s.nM].m = s.buf[i].m 100 | s.mrg[s.nM].w = s.buf[i].w 101 | s.buf[i].w = 0 102 | inc s.nM # points to next empty cell 103 | if s.wTot > 0: # update extreme values 104 | s.min = min(s.min, s.mrg[0].m) 105 | s.max = max(s.max, s.mrg[s.nM - 1].m) 106 | 107 | func mergeNew*(s: var DigesT, force=false, cpr = -1.0) = 108 | if s.wTot == 0 and s.wBuf == 0: return 109 | if force or s.wBuf > 0: # Do merge in reverse @odd times to avoid lo2hi bias.
110 | s.merge cpr 111 | s.nMerges.inc 112 | s.nT = 0 113 | s.wBuf = 0 114 | 115 | func add*(s: var DigesT, x: float, w=1) = ## Main update API 116 | if isNaN(x): raise newException(ValueError, "cannot add NaN") 117 | if s.nT >= s.buf.len - s.nM - 1: 118 | s.mergeNew 119 | let i = s.nT; inc s.nT 120 | s.min = min(s.min, x) 121 | s.max = max(s.max, x) 122 | s.buf[i].w = w 123 | s.buf[i].m = x 124 | s.wBuf += w.float 125 | 126 | func compress*(s: var DigesT) = s.mergeNew(true, s.cpr) 127 | ## best done only when we want to show results to the outside world. 128 | 129 | iterator groups*(s: DigesT): Group = 130 | for i in 0 ..< s.nM: yield Group(m: s.mrg[i].m, w: s.mrg[i].w) 131 | 132 | func add*(s: var DigesT, others: var openArray[DigesT]) = 133 | for other in mitems(others): 134 | other.compress 135 | for c in other.groups: s.add(c.m, c.w) 136 | 137 | func weightAvgOrd(x1, w1, x2, w2: float): float {.inline.} = 138 | let x = (x1*w1 + x2*w2)/(w1 + w2) 139 | return max(x1, min(x, x2)) 140 | 141 | func weightedAverage(x1, w1, x2, w2: float): float {.inline.} = 142 | if x1 <= x2: weightAvgOrd(x1, w1, x2, w2) # WeightedAvg of `x1, s1` & `x2, w2` 143 | else : weightAvgOrd(x2, w2, x1, w1) # Guaranteed on `[x1, x2]` 144 | 145 | func quantile*(s: var DigesT, q: float): float = 146 | if q < 0.0 or q > 1.0: 147 | raise newException(ValueError, "q must be on [0,1], not " & $q) 148 | s.mergeNew 149 | if s.nM == 0: return NaN 150 | if s.nM == 1: return s.mrg[0].m 151 | var n = s.nM # At least two groups now 152 | var ix = q * s.wTot.float # weight units offset we want 153 | if ix < 1: return s.min # boundaries; return min|max; likely moot 154 | # If lo group has >1 sample, still know 1 sample occurred @min => interpol. 155 | if s.mrg[0].w > 1 and ix < s.mrg[0].W/2.0: # only 1 sample @min => less weight 156 | return s.min + (ix - 1)/(s.mrg[0].W/2.0 - 1) * (s.mrg[0].m - s.min) 157 | if ix > s.wTot - 1: return s.max # likely moot 158 | # If hi group has >1 sample, still know 1 sample occurred @max => interpol. 
159 | if s.mrg[n-1].w > 1 and s.wTot - ix <= s.mrg[n-1].W/2.0: 160 | return s.max - (s.wTot-ix-1)/(s.mrg[n-1].W/2.0 - 1)*(s.max - s.mrg[n-1].m) 161 | var wSoFar = s.mrg[0].W/2.0 # between exVals, interpol betw groups 162 | for i in 0 ..< n - 1: 163 | let dw = float(s.mrg[i].w + s.mrg[i+1].w)/2.0 164 | if wSoFar + dw > ix: # groups i, i+1 bracket current point 165 | var leftUnit = 0.0 # check for unit weight 166 | if s.mrg[i].w == 1: 167 | if ix - wSoFar < 0.5: 168 | return s.mrg[i].m # within the singleton's sphere 169 | else: 170 | leftUnit = 0.5 171 | var rightUnit = 0.0 172 | if s.mrg[i+1].w == 1: 173 | if wSoFar + dw - ix <= 0.5: 174 | return s.mrg[i+1].m # no interpolation needed near singleton 175 | rightUnit = 0.5 176 | let z1 = ix - wSoFar - leftUnit 177 | let z2 = wSoFar + dw - ix - rightUnit 178 | return weightedAverage(s.mrg[i].m, z2, s.mrg[i+1].m, z1) 179 | wSoFar += dw 180 | # Handled singleton@end above 181 | let z1 = ix - s.wTot - s.mrg[n-1].W/2.0 # wSoFar =~ s.wTot - s.mrg[n-1].w/2 182 | let z2 = s.mrg[n-1].W/2.0 - z1 # =>interp out to max value ever seen 183 | return weightedAverage(s.mrg[n-1].m, z1, s.max, z2) 184 | 185 | func cdf*(s: var DigesT, x: float): float = 186 | if x.isNaN: return NaN 187 | s.mergeNew 188 | if s.nM == 0: return NaN # no data to examine 189 | if x < s.min: return 0.0 # -inf works fine 190 | if x > s.max: return 1.0 # +inf works fine 191 | if s.nM == 1: # exactly one group, should have max==min 192 | let width = s.max - s.min 193 | if x - s.min <= width: return 0.5 # min & max too close to interpolate 194 | else: return (x - s.min)/(s.max-s.min) # interpol if weight>0, max != min 195 | let n = s.nM 196 | if x < s.mrg[0].m: # check for the LO TAIL 197 | let dx = s.mrg[0].m - s.min 198 | if dx > 0.0: # do not divide by zero in interpol 199 | return if x == s.min: 0.5/s.wTot # sample exactly @min 200 | else: (1.0 + (x - s.min)/dx * (s.mrg[0].W/2.0 - 1.0))/s.wTot 201 | else: return 0.0 # should be redundant with the check x < s.min 202 | if x > s.mrg[n-1].m: # and the HI TAIL 203 | let dx = s.max - s.mrg[n-1].m 204 | if dx > 0.0: 205 | return if x == s.max: 1.0 - 0.5/s.wTot # single sample exactly @max 206 | else: 1.0 - (1.0 + (s.max-x)/dx*(s.mrg[n-1].W/2.0 - 1.0))/s.wTot 207 | else: return 1.0 # should be redundant with the check x > s.max 208 | var wSoFar = 0.0 # Now mrg[0].m<=x<=mrg[n-1].m >= 2 groups; either >=1 209 | for i in 0 ..< n-1: # consecutive groups all @exactly x OR c0 < x < c1 210 | if s.mrg[i].m == x: # wSoFar does not yet include s.mrg[i].w 211 | var dw = 0.0 # Have >=1 groups @x 212 | for j in i ..< n: # treat as 1, accumulating weight in dw 213 | if s.mrg[j].m != x: break 214 | dw += s.mrg[j].W 215 | return (wSoFar + dw/2.0)/s.wTot 216 | elif s.mrg[i].m <= x and x < s.mrg[i+1].m: # betw groups 217 | if s.mrg[i+1].m - s.mrg[i].m > 0.0: # handle FP issues 218 | var loExclW = 0.0 # Singleton groups have all weight @mean 219 | var hiExclW = 0.0 # & should not be smoothed/interpolated.
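# Interpolation plan for the generic betw-groups case (matches the code just
# below): exclude 0.5 count on any singleton side (loExclW/hiExclW), then map
# x linearly from [lo, hi] onto the remaining half-weights, i.e. return
# (wSoFar + mrg[i].W/2 + loExclW + dwNoSingleton*(x - lo)/(hi - lo))/wTot.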
220 | if s.mrg[i].w == 1: 221 | if s.mrg[i+1].w == 1: # 2 singletons=>no interpol; lo in, hi out 222 | return (wSoFar + 1.0)/s.wTot 223 | else: 224 | loExclW = 0.5 225 | elif s.mrg[i+1].w == 1: 226 | hiExclW = 0.5 227 | let dw = float(s.mrg[i].w + s.mrg[i+1].w)/2.0 228 | let lo = s.mrg[i].m # adjust endpoints for any singleton 229 | let hi = s.mrg[i+1].m # adjusts have limited effect on endpoints 230 | let dwNoSingleton = dw - loExclW - hiExclW 231 | let base = wSoFar + s.mrg[i].W/2.0 + loExclW 232 | return (base + dwNoSingleton * (x - lo)/(hi - lo))/s.wTot 233 | else: # distinct but too close for safe interpolation 234 | return (wSoFar + float(s.mrg[i].w + s.mrg[i+1].w)/2.0)/s.wTot 235 | else: 236 | wSoFar += s.mrg[i].W 237 | 1.0 - 0.5/s.wTot 238 | 239 | when isMainModule: 240 | when defined(test): 241 | import std/[os, strutils] # Helpful to run against: -12 -8 -4 -1 0 1 4 8 12 242 | var s = initDigesT(a=0.125, b=10.0, n=8) 243 | for i in 1 .. paramCount(): s.add parseFloat(paramStr(i)) 244 | for q in [0.01, 0.05, 0.25, 0.50, 0.75, 0.95, 0.99]: echo s.quantile(q) 245 | echo "s: ", s 246 | else: 247 | import std/[random, times, strformat] 248 | when defined danger: randomize() 249 | const N = 750_000 250 | var data = newSeq[float](N) 251 | const Q = [0.001,0.01,0.05,0.1587,0.25,0.50,0.75,0.8413,0.95,0.99,0.999] 252 | var res = newSeq[float](Q.len) 253 | for i in 0..