├── .gitignore ├── README.md ├── config.nims ├── fitl.nim ├── fitl.nimble ├── fitl ├── basicLA.nim ├── bdsTest.nim ├── boots.nim ├── cds.nim ├── covar.nim ├── dists.nim ├── estMI.nim ├── gof.nim ├── ksamp.nim ├── linfit.nim ├── min1d.nim ├── qtl.nim └── svdx.nim └── test ├── gen.nim └── polyf.nim /.gitignore: -------------------------------------------------------------------------------- 1 | .gitignore 2 | fitl.out 3 | fitl/qtl 4 | fitl/gof 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # fitl: A Self-contained fit of linear models with regression diagnostics 2 | 3 | I do a lot of performance analysis and other data analysis. Simple regression 4 | with a handful of columns is often adequate, but has many hazards addressable 5 | through various mitigations & diagnostics I have not found bundled into one, 6 | self-contained tool. So, I wrote this program, pronounced almost like "fiddle" 7 | originally in C but have now ported to Nim. 8 | 9 | It is both a Nim library and command-line utility which is made easy using the 10 | [cligen](https://github.com/c-blake/cligen/) framework. `nimble install fitl` 11 | should just work, but if it fails then just git clone cligen & spfun and then 12 | `nim c -d:danger --path:cligen --path:spfun fitl` to build the CLI. 13 | 14 | # Example use 15 | 16 | XXX TODO Do me. 17 | 18 | # Truly self-contained - even the linear algebra 19 | 20 | BLAS/LAPACK from the Fortran world have many implementations - Netlib-Fortran 21 | compiled, vendor optimized, CUDA/GPU optimized, etc. This is nice for large 22 | problems, but overkill for simple linear models with <100s of predictor and 23 | 0 and cs[0] != '#': 52 | sep.split MSlice(mem: cs, len: n), nums 53 | if nums.len < need: 54 | stderr.write &"{iNm}:{i}:skipping too few columns ({nums.len}<{need})\n" 55 | else: 56 | let w = if ixW == 0: F(1) else: F(1)/nums.f(i, ixW) 57 | for j in result[1]: xT.add w*nums.f(i, j) 58 | result[0] = xT.len div cols.len 59 | X = xpose(xT, result[0], cols.len) 60 | 61 | proc fmtCov*[F](s: string; v: seq[F]; m=0; norm=false, label=false): string = 62 | proc elt(i, j: int): F = # fmt cov/corr 63 | if norm: 64 | if i==j: sqrt(v[m*i + i]) # std.errs 65 | else : v[m*i + j]/sqrt(v[m*i + i]*v[m*j + j]) # corr.coefs 66 | else: v[m*i + j] 67 | result.add s & "-" & (if norm: "stderr-corr" else: "covariance") & " matrix\n" 68 | if label: # FULL SYMM. MATRIX WITH LABELS 69 | for i in 0.. F(0))] 80 | result.add (if ch == 'c' : &"(${ix} {pm} {abs(o)})" 81 | elif ch in {'z','m'}: &"(${ix} {pm} {abs(o)})/{s}" 82 | else: &"${ix}") 83 | result.add sep 84 | 85 | proc fmtModel*[F](cols: seq[string]; ixX: seq[int]; M: int; 86 | b, v, o, s: seq[F]): string = 87 | result.add fmtBasis(cols[0][0], ixX[0], o[0], s[0], "= ") 88 | for j in 1.. [ .. ] 123 | ## y1 basis1(x1) [ basis2(x1) .. ] { Permutable cols. 124 | ## y2 basis1(x2) [ basis2(x2) .. ] 1-origin numbers. } 125 | ## . . [ . . ] 126 | ## . . [ . . ] 127 | ## NOTE: Input ^$ blanks & lines beginning with '#' are skipped. colNo '0' -> 128 | ## 1 for all data points -- useful as both flat reciprocal weight (aka sigma) 129 | ## & intercept col. A colNo prefix of z => Z)SCORE it (mean0,var1); c => only 130 | ## C)ENTER it (mean0); m =>(x-min)/(max-min) {->[0,1]}. 131 | if cols.len < 2: raise newException(HelpError,"Too few columns; Full ${HELP}") 132 | let resF = if resid.len != 0: open(resid, fmWrite) else: nil 133 | let logF = if log.len != 0: open(log , fmAppend) else: nil 134 | if file == "-": iNm = "stdin"; iFl = stdin 135 | else: iNm = file; iFl = open(file) 136 | let sep = initSep(delim) 137 | let M = cols.len # total num columns 138 | let m = M - 1 # num x/predictor columns 139 | var X: seq[F] # Parse cols,text->Y,DesignMatrix 140 | var (n,ixX,xfm) = parseInp(cols,sep,X,wtCol) #..as well as centr&stdize ctrls 141 | if m > n: quit &"fewer data rows in {iNm} than columns", 3 142 | var o = newSeq[F](M) # Offset/Origin for each column (0.0) 143 | var s = newSeq[F](M, 1.0) # Scale to divide by to normalize data 144 | var b = newSeq[F](m) # Do not clobber X w/u; Since X is col-major aka bck2bck 145 | var u = X #..cols, first col can be "y" & rest is still 1 mem block 146 | var w = newSeq[F](m) # Singular values/recips 147 | var v = newSeq[F](m*m) # Right sing.vectors/Cov(b) matrix 148 | var r = newSeq[F](n) # Fit Residuals; Q: conditional alloc? 149 | var h: seq[F] #TODO Hat-matrix/influence functions 150 | var thr = F(sv) 151 | let (ssR,df,ssY) = linFit(X,n,M, b,u,w,v, r,h, o,s, trim,its,xfm, thr,xv,logF) 152 | echo fmtModel(cols, ixX, M, b, v, o, s) # emit the model 153 | if resF != nil: (for i in 0..0: # *some* EDF-based residual test 162 | mV = r.mvars; r.u01ize mV # mean-vars => PITz just once 163 | if gofD in gof: echo fmtGf("KSgaussRes" , r.gofTest(mV, gfD ), 4, 3) 164 | if gofW2 in gof: echo fmtGf("CvMgaussRes", r.gofTest(mV, gfW2), 4, 3) 165 | if gofA2 in gof: echo fmtGf("ADgaussRes" , r.gofTest(mV, gfA2), 4, 3) 166 | if gofV in gof: echo fmtGf("KuiGaussRes", r.gofTest(mV, gfV ), 4, 3) 167 | if gofU2 in gof: echo fmtGf("WatGaussRes", r.gofTest(mV, gfU2), 4, 3) 168 | var bT: seq[F] # Collect `b` for raw `Cov(b)` 169 | if boot > 0: # Bootstrapped cov(parameters) 170 | let N = closestMultiple(n, Block) # Fit synthetic new data sets.. 171 | var Xp = newSeq[F](N*M) #..of *nearest possible* size,.. 172 | var bK: seq[F]; var b = newSeq[F](m) #..*given* Block boot size. 173 | r.setLen 0; h.setLen 0; let nn = xfm.needNormalize 174 | template put(i, j) = colCpy Xp[i].addr, X[j].addr, N, n, M, F.sizeof 175 | forBoot(n, boot, mode, Block, put): # Gen data,reset,fit,save coefs 176 | v.zero; thr=sv; if nn: o.zero; s.set 1.0 # reset 177 | linFit(Xp,n,M, b,u,w,v, r,h, o,s,xfm, thr,xv,logF) # get best fit b 178 | bK.add b # save b 179 | bT.setLen bK.len; bT.xpose(bK, boot, m) 180 | covMat(v, bT, boot, m) # Replace `v` w/boostrapped cov 181 | if covBoot in cov: echo fmtCov("bootstrap",v,m,covNorm in cov,covLab in cov) 182 | if gofPar in gof: echo &"Param Significance Breakdown:\n", fmtPar(" ",b,v,bT) 183 | (if resF != nil: resF.close); (if logF != nil: logF.close) 184 | if iFl != stdin: iFl.close # stdin must have seen EOF; So close not so wrong. 185 | 186 | when isMainModule: include cligen/mergeCfgEnv; dispatch fitl, help={ 187 | "cols" : "1-origin-yCol xCol.. 0=>all 1s; ?[cs]=>Centr/Std", 188 | "file" : "input file; \"-\" => stdin", 189 | "delim": "`initSep` input delim; w=repeated whitespace", 190 | "wtCol": "1-origin sigma aka inverse weight column", 191 | "sv" : "regularize: >0 SVclip <0 -manualRidge ==0 CV", 192 | "xv" : "auto-ridge cross-validation score: GCV LOO", 193 | "resid": "log residuals to this pathname", 194 | "acf" : "emit resid serial AutoCorrFunc up to this lag", 195 | "Corr" : "correlation coefficient to use: linear rank", 196 | "altH" : "alt hyp for CC pVal: - + twoSide form(ula)", 197 | "boot" : "num resamples for Cov(b); 0=>estimated Cov(b)", 198 | "mode" : "re-sample block mode: moving|circular", 199 | "Block": "re-sample block size, e.g. 1 for IID", 200 | "gof" :"""emit goodness of fit diagnostics: 201 | r2: R^2; xsq: Chi-Square{aka SSR},df,pValue 202 | param: parameter significance breakdown 203 | GoF tests residuals are Gaussian: 204 | kolmogorovSmirnovD cramerVonMisesW2 205 | andersonDarlingA2 vKuiper watsonU2""", 206 | "cov" : "emit Cov(b) with flags: est norm label boot", 207 | "log" : "path to log (trimming, model selection..) to", 208 | "trim":"""trim pnts>="Nqtl(x/(2n)) sdevs" from reg surf 209 | [x=num pts expected if resids REALLY Normal]""", 210 | "its" : "max trimming itrs; < 0 => until fixed point."}, short={"altH": 'A'} 211 | -------------------------------------------------------------------------------- /fitl.nimble: -------------------------------------------------------------------------------- 1 | # Package 2 | version = "0.6.4" 3 | author = "Charles Blake" 4 | description = "Self-contained fit of linear models with regression diagnostics" 5 | license = "MIT/ISC" 6 | 7 | # Deps 8 | requires "nim >= 1.6.0" 9 | requires "cligen >= 1.9.2" 10 | requires "spfun >= 0.7.6" 11 | skipDirs = @["fitl"] 12 | installExt = @[".nim"] 13 | bin = @["fitl", "fitl/qtl", "fitl/gof", "fitl/dists", "fitl/estMI"] 14 | -------------------------------------------------------------------------------- /fitl/basicLA.nim: -------------------------------------------------------------------------------- 1 | ## This is just a few BLAS Level 1 operations. With gcc -ffast-math auto-vec 2 | ## they are about as fast (or sometimes a bit faster) than optimized libs. (A 3 | ## bit faster since they are more specific to dense/non-strided iterations.) 4 | {.passc: "-O3 -ffast-math -march=native -mtune=native".} 5 | 6 | proc sum*[F](x: ptr F; n: int): F = 7 | ## Sum of elements with accumulator in same arithmetic width as params. 8 | let x = cast[ptr UncheckedArray[F]](x) 9 | for i in 0.. 0: 69 | result[0] = xs[0]; result[1] = xs[0] 70 | for x in xs: result[0] = min(result[0], x); result[1] = max(result[1], x) 71 | else: result[0]=F(NaN); result[1]=F(NaN) 72 | 73 | func mvar*[F: SomeFloat](xs: openArray[F]): (F, F) = 74 | ## One pass mean & variance; NOTE: pop variance (1/n). 75 | if xs.len > 0: # --passC:-ffast-math can autovec whole loop into 76 | let dx = xs[0] #..very tight vsubp[sd]/vaddp[sd] instructions. 77 | var av, vr: F 78 | for x in xs: 79 | let x = x - dx; av += x; vr += x*x 80 | av /= F(xs.len); vr /= F(xs.len) 81 | result[0] = F(av.float + dx) 82 | result[1] = F(max(1e-30*av*av, vr - av*av)) 83 | else: result[0]=F(NaN); result[1]=F(NaN) 84 | 85 | func mvars*[F: SomeFloat](xs: openArray[F]): (F, F) = 86 | ## One pass mean & sample variance 87 | let mV = xs.mvar 88 | (mV[0], mV[1]*F(xs.len)/F(xs.len - 1)) 89 | 90 | proc newSeq*[T](len: Natural, val: T): seq[T] = 91 | ## Allocate & initialize a seq to a value 92 | result.setLen len 93 | for i in 0.. y: ix = y; iy = x 24 | iy = iy - ix - 1 25 | let ipos = iy div NBIT 26 | let ibit = NBIT - 1 - iy mod NBIT 27 | let p = bStart[ix] +! ipos 28 | p[] = p[] or ibit.bit 29 | 30 | proc mkMask(r, n, nbit, drop: int; mask: pua int) = 31 | mask[1]=ALLBITS; mask[0]=mask[1] # mask[0], mask[1]: 2-word mask. 32 | let last = (n - r - 1) div nbit # Row `r`; `nbit`,`drop`: bits used&dropped 33 | for i in n - drop ..< n: 34 | let itrue = i - r - 1 35 | let j = last - itrue div nbit 36 | let k = nbit - 1 - itrue mod nbit 37 | mask[j] = mask[j] xor k.bit 38 | 39 | proc embed(bStart: seq[ptr int16]; n, dim: int) = 40 | for j in 0 ..< n - dim: # Embed to next higher dim; `g(i,j) &= g(i+1,j+1)` 41 | var i = bStart[j] 42 | for i2 in bStart[j + 1] ..< bStart[j + 2]: 43 | i[] = i[] and i2[]; i = i +! 1 44 | if i != bStart[j + 1]: i[] = 0 45 | 46 | type 47 | Pos[F] = tuple[val: F, pos: int] 48 | BDS*[F] = object ## Holds state for BDS stat calc for `n` observations. 49 | posTab: seq[Pos[F]] 50 | mask, lookup: seq[int] 51 | grid : seq[int16] 52 | start : seq[ptr int16] 53 | 54 | proc space[T](x: seq[T]): int = x.len*x[0].sizeof 55 | proc space*[F](b: BDS[F]): int = ## Lower bound indirect space usage of `b` 56 | b.sizeof+b.posTab.space+b.mask.space+b.lookup.space+b.grid.space+b.start.space 57 | 58 | proc initBDS*[F](n: int): BDS[F] = ## Make BDS calc holder for `n` observations. 59 | result.posTab.setLen n 60 | result.mask.setLen 2*n 61 | result.lookup.setLen TABLEN + 1 62 | result.start.setLen n + 1 63 | var sz = 0 # Find grid size 64 | for i in 0..n: sz.inc (n - i) div NBIT + 1 65 | result.grid.setLen sz # Grid is defined as short (2 byte ints) 66 | result.start[0] = result.grid[0].addr 67 | for i in 1..n: result.start[i] = result.start[i - 1] +! ((n - i) div NBIT + 1) 68 | for i in 0..TABLEN: # Table for bit counting 69 | for j in 0 ..< NBIT: (if (i and j.bit) != 0: inc result.lookup[i]) 70 | 71 | proc evalc[F](b: BDS[F]; n: int): float = 72 | var cnt = 0 # Return count stats for grid; Zero uncounted parts using mask 73 | for j in 0.. 2: 75 | for i in b.start[j] ..< b.start[j + 1] -! 2: 76 | cnt.inc b.lookup[i[]] 77 | if b.lookup[i[]] > NBIT: ResourceExhausted !! &"{i[]} {b.lookup[i[]]}" 78 | for i in b.start[j + 1] -! 2 ..< b.start[j + 1]: 79 | cnt.inc b.lookup[i[] and b.mask[2*j + (b.start[j + 1] -! i - 1)]] 80 | else: 81 | for i in b.start[j] ..< b.start[j + 1]: 82 | cnt.inc b.lookup[i[] and b.mask[2*j + (b.start[j + 1] -! i - 1)]] 83 | if trace: echo "count = ",cnt 84 | return 2*cnt.float / float(n*(n - 1)) 85 | 86 | proc kc*[F](b: var BDS[F]; x: openArray[F]; m=2; drop = -1; eps: F): 87 | tuple[k: float, c: seq[float]] = 88 | ## `x`: time series to test; Length must be same as in `b=initBDS(x.len)`. 89 | ## `m`: cstats will be done for dim/lag 1..m; `drop`: num.data to drop @end. 90 | ## { c(2) can use more data than c(3),.. => Ease ignoring last few so all 91 | ## c() are done over same data. Eg. for m=3 we might use x(1..n-2). } 92 | ## `eps`: epsilon value for close points. 93 | ## Returns `k` & `c`: 1-origin indexed raw c values c[1], c[2], c[3], ... 94 | if x.len != b.posTab.len: Value !! &"{x.len} != {b.posTab.len}" 95 | let drop = if drop != -1: drop else: m - 1 # -1=>All c[] estim w/n-m+1 points 96 | let n = x.len; let nOb = n - drop; let nObF = nOb.float 97 | for ip in b.grid[0].addr ..< b.start[n] +! 1: ip[] = 0 98 | for i in 0..test stat asymptotically~N(0,1). Brock,Hsieh,LeBaron 1991 Chap2,pg43. 130 | ## `c`: c[1]; `cm`: c[m]; `k`: k stat; `m`: embedding dim; `n` = nPoints. 131 | var v = 0.0; let M = m.float 132 | for j in 1..~ 500. 144 | var b = initBDS[float](x.len) # Raw k&c statistics 145 | let (k, c) = b.kc(x, m, m - 1, es*sqrt(x.sampleVar)) 146 | if trace: (echo(&"k = ",k); for i in 1..m: echo &"c({i}) {c[i]}") 147 | result.setLen m + 1 148 | for i in 2..m: result[i] = cstat(c[1], c[i], k, i, x.len - m + 1) 149 | if trace: echo "b.space: ",b.space," bytes" 150 | 151 | when isMainModule: 152 | proc test(x: seq[float]; m=3; es=0.5; trace=false) = 153 | ## BDS Test command (like R tseries::bdstest). Asymptopia at n >~ 500. 154 | if x.len < m + 1: Value !! "\e[1mNOT ENOUGH DATA\e[m" 155 | bdsTest.trace = trace; for e in BDS_N01s(x, m, es)[2..^1]: echo &"{e:.2f}" 156 | dispatch test, cmdName="bdsTest" # 1.1 2.05 3.03 4.02 5.01 6.0 7.1 8.05 157 | -------------------------------------------------------------------------------- /fitl/boots.nim: -------------------------------------------------------------------------------- 1 | ##[ Define template to re-sample with replacement, aka statistical bootstrap.]## 2 | import random; export random 3 | if defined release: randomize() 4 | type BootKind* = enum bMoving="moving", bCircular="circular" ## block boot mode 5 | 6 | proc closestMultiple*(n, B: int): int = 7 | ## Return `n` rounded to the nearest multiple of `B`. Stdlib addition? 8 | let L = (n div B)*B; let H = L + B 9 | if n - L < H - n: L else: H 10 | # |-- Nim behavior blocks B=1 11 | template forBoot*(n: int; boot=1000; mode=bMoving; B, put, fit) = 12 | ##[ Logic template to run `boot` successive estimations on data sets of size 13 | `n` re-sampled w/replacement in `mode bMoving|bCircular` blocks of length `B`. 14 | Caller provides `put(i, j)` to put 1 original point @`j` into some new data 15 | array@`i` (can be implicitly @end) & run a `fit`. Centralizes `n mod B==0` 16 | check, `bMoving` end avoidance & `bCircular` wraparound. While this only 17 | insists on n==K*B, it is statistically best for synthetic data set sizes to 18 | be as close to actual sample sizes as possible a la `closestMultiple`. ]## 19 | template putB(i, j0, B) = 20 | for j in j0 ..< j0 + B: put i, j 21 | 22 | if n mod B != 0: raise newException(ValueError, 23 | "`n` must be a multiple of block size `B`; Call with `closestMultiple`") 24 | for k in 1..boot: 25 | var i = 0 # Re-sample a data set 26 | while i < n: 27 | let j0 = rand(n - (if mode == bMoving: B else: 1)) 28 | case mode 29 | of bMoving: putB i, j0, B # Validity guaranteed by rand(n - B) 30 | of bCircular: # Same as above, but with EOData wraparound 31 | if (let overflow = j0 + B - n; overflow > 0): 32 | putB i, j0, B - overflow 33 | putB i, 0 , overflow # Copy B slots total: some above, rest here 34 | else: # Sample "kinda" needs S1 topology to make sense 35 | putB i, j0, B 36 | i += B 37 | fit # E.g., init aux, estimate, save params 38 | -------------------------------------------------------------------------------- /fitl/cds.nim: -------------------------------------------------------------------------------- 1 | when not declared(stdin): import std/[syncio, formatfloat] 2 | import std/[strutils, strformat, algorithm, random] 3 | from fitl/qtl import quantile 4 | from spfun/binom import initBinomP, est 5 | 6 | proc cds*(x: seq[float], m=50, sort=false): seq[seq[float]] = 7 | ## Use an *already sorted* `x` to make `m` Parzen Qmid re-samples, maybe sort. 8 | result.setLen m 9 | for i in 0..0: iput.open else: stdin): x.add f.strip.parseFloat 21 | x.sort; let n = x.len 22 | let xs = x.cds(m, sort=true) 23 | let g = if gplot.len > 0: open(gplot, fmWrite) else: nil 24 | let e = open(&"{oput}EDF", fmWrite) 25 | let tagL = &"{0.5 - 0.5*ci:.03f}" 26 | let tagH = &"{0.5 + 0.5*ci:.03f}" 27 | let l = open(&"{oput}{tagL}" , fmWrite) 28 | let h = open(&"{oput}{tagH}" , fmWrite) 29 | for j, f in x: 30 | e.write f, "\n" 31 | let (lo, hi) = initBinomP(j, n).est(ci) #XXX Check alignment & maybe 32 | l.write f," ",lo,"\n" #.. emit leading & trailing 33 | h.write f," ",hi,"\n" #.. edges to connect to 0,1 34 | e.close 35 | if g != nil: g.write &"""#set terminal png size 1920,1080 font "Helvetica,10" 36 | #set output "rsSwarm.png" 37 | set key top left noautotitle # EDFs go bot left->up right;Dot keys crowd plot 38 | set style data steps 39 | set xlabel "Sample Value" 40 | set ylabel "Probability" 41 | set linetype 1 lc rgb "blue" lw 3 42 | set linetype 2 lc rgb "red" lw 1 43 | set linetype 3 lc rgb "red" lw 2 44 | set linetype 4 lc rgb "black" dashtype 0 45 | plot """ 46 | for i, x in xs: 47 | let opath = &"{oput}{i:03}" 48 | let o = open(opath, fmWrite) 49 | for f in x: o.write f, "\n" 50 | o.close 51 | if g != nil: 52 | g.write (if i==0: "" else: ",\\\n "), &"'{opath}' u 1:($0/{n}) ls 4" 53 | if g != nil: 54 | g.write &",\\\n '{oput}EDF' u 1:($0/{n}) title 'EDF' ls 1" 55 | g.write &",\\\n '{oput}{tagL}' title 'EDF{tagL}' ls 2" 56 | g.write &",\\\n '{oput}{tagH}' title 'EDF{tagH}' ls 3" 57 | g.write "\n"; g.close 58 | 59 | when isMainModule: 60 | when not declared(stdin): import std/[syncio, formatfloat] 61 | when defined danger: randomize() 62 | import cligen; include cligen/mergeCfgEnv; dispatch cdswarm, help={ 63 | "iput" : "input path or \"\" for stdin", 64 | "oput" : "output path prefix; outs Get numbered", 65 | "m" : "number of resamples; e.g. for plots", 66 | "gplot": "generate a gnuplot script to plot", 67 | "ci" : "CI for Wilson score confidence bands"} 68 | -------------------------------------------------------------------------------- /fitl/covar.nim: -------------------------------------------------------------------------------- 1 | import basicLA; from std/math import sqrt 2 | {.passc: "-O3 -ffast-math -march=native -mtune=native".} 3 | 4 | proc mean[F](x: ptr F; n: int): F = sum(x, n) / F(n) 5 | 6 | proc corr[F](x, y: openArray[F]; xm, ym: F; n: int; yO=0): F = 7 | var syy, sxy, sxx: F 8 | for i in 0 ..< n: 9 | sxx += (x[i] - xm)*(x[i] - xm) 10 | syy += (y[yO+i] - ym)*(y[yO+i] - ym) 11 | sxy += (x[i] - xm)*(y[yO+i] - ym) 12 | sxy / sqrt(sxx * syy) 13 | 14 | proc corr*[F](x, y: openArray[F], yO=0): F = 15 | ## Return Pearson linear correlation coefficient between x & y (over leading 16 | ## elements if one array is longer). 17 | let n = min(x.len, y.len) 18 | corr x, y, mean(x[0].unsafeAddr, n), mean(y[yO].unsafeAddr, n), n, yO 19 | 20 | proc corrAuto*[F](x: openArray[F], lag=1): F = 21 | if lag < x.len: corr(x, x, lag) else: F(0) 22 | 23 | from spfun/studentT import ccPv, Corr, AltH; export Corr, AltH 24 | template pua(T: typedesc): untyped = ptr UncheckedArray[T] 25 | proc corrAuto*[F](x: openArray[F], lag=1, B=9, cc=rank, altH=form): (F, F, F) = 26 | let hi = x.len - lag - 1 27 | ccPv(toOpenArray(cast[pua F](x[ 0 ].unsafeAddr), 0, hi), 28 | toOpenArray(cast[pua F](x[lag].unsafeAddr), 0, hi), B, 1000*B, 29 | 0.99, 0.05, cc, altH) 30 | 31 | proc covMat*[F](v: var openArray[F]; x: openArray[F]; n, m: int) = 32 | ## Save in `v` usual symmetric m*m Covariance matrix for n*m input matrix `x` 33 | ## where `x[i+n*j]` is the j-th column of the i-th row. Aka, samples is the 34 | ## faster moving index, not variables; Aka column-major. 35 | var ni = if n > 1: 1.0 / F(n-1) else: 1.0 36 | var xm = newSeq[F](m) # means 37 | for j in 0 ..< m: xm[j] = mean(x[n*j].unsafeAddr, n) 38 | for i in 0 ..< m: # lower triangle & diag 39 | for j in 0..i: 40 | v[m*i+j] = dots(x[n*i].unsafeAddr, x[n*j].unsafeAddr, xm[i], xm[j], n)*ni 41 | for i in 0 ..< m: # fill in upper from lower 42 | for j in i + 1 ..< m: v[m*i+j] = v[m*j+i] 43 | 44 | proc covMat*[F](x: openArray[F]; m, n: int): seq[F] = 45 | ## Return usual symmetric m*m Covariance matrix for n*m input matrix x where 46 | ## x[i+n*j] is the j-th column of the i-th row. I.e., samples is the faster 47 | ## moving index, not variables. 48 | result.setLen m*m 49 | covMat(result, x, m, n) 50 | 51 | when isMainModule: 52 | when not declared(stdout): import std/formatfloat 53 | echo corrAuto([1.0, 1.9, 3.2, 4.1]) 54 | echo covMat([ 1.0, 2, 3, 4, 5, 6, 7, 8 ], 4, 2) 55 | -------------------------------------------------------------------------------- /fitl/dists.nim: -------------------------------------------------------------------------------- 1 | ## Module defines a common framework for continuous probability distributions. 2 | when not declared(stderr): import std/syncio 3 | import std/[math, algorithm, random, critbits, strutils], spfun/[gauss, cauchy] 4 | type 5 | T = float64 # Things may become generic over this; T to not confuse with F(x) 6 | CDist* = tuple[ ## A 1-Dimensional Probability Distribution 7 | pdf, cdf, qtl: proc(x: T): T, ## Density, Cumulative Distro, qtl/inverse CDF 8 | gen : proc(): T, ## Sample pseudo-random deviates of this dist 9 | support: seq[T], ## where non-zero (for plots, num.integ., etc. 10 | modes : seq[T]] ## Locations of modes (local maxima) 11 | 12 | template newton(p, cdf, pdf, support, x0, fTol) {.dirty.}= # [pc]df must use `x` 13 | var x = x0 # Newton's method should be an ok inverter/solver here. 14 | for it in 1..50: # Quadratically cvgent. If 50 fails,likely nothing works 15 | let pr = (cdf) - p 16 | if abs(pr) < fTol and it > 1: return x 17 | x -= pr / (pdf) # x=min(max(x, support[0]), support[^1]) # breaks BiLogPeak 18 | result = if p < T(0.5): support[0] else: support[^1] 19 | 20 | proc mix*(compons: seq[(T, CDist, T, T)]; support: seq[T] = @[], 21 | modes: seq[T] = @[]): CDist = 22 | ## Make a distribution by mixing with component fractions (must total 1.0) 23 | ## various `CDist`s with location & scale shifts. If empty, `support` & 24 | ## `modes` are inferred from components (with some assumptions, obviously). 25 | for (_, _, _, scale) in compons: 26 | if scale < 0: raise newException(ValueError, "negative scale") 27 | let pd = (proc(x: T): T = 28 | for (coef, dist, location, scale) in compons: 29 | result += coef / scale * dist.pdf((x - location)/scale)) 30 | let cd = (proc(x: T): T = 31 | for (coef, dist, location, scale) in compons: 32 | result += coef * dist.cdf((x - location)/scale)) 33 | var cumWt: seq[T] = @[compons[0][0]] # This is captured in qtl closure 34 | for i in 1..compons.high: cumWt.add cumWt[^1] + compons[i][0] 35 | if abs(cumWt[^1] - T(1)) > 1e-6: stderr.write "unnormalized mixture!\n" 36 | result.gen = (proc(): T = 37 | let p = rand(1.0) 38 | for i, cw in cumWt: # Can binary search if MANY components. 39 | if p <= cw: # `i` is now the right component. 40 | let (coef, dist, location, scale) = compons[i] 41 | return location + scale*dist.qtl((cw - p)/coef) 42 | compons[^1][1].support[^1]) # p =~ 1.0 should be very rare 43 | let supp = if support.len > 0: support else: ( 44 | var (supp0, supp1) = (T.high,T.low) # Low-side & High-side inferred support 45 | for (_, dist, location, scale) in compons: 46 | supp0 = min(supp0, dist.support[0]*scale + location) 47 | supp1 = max(supp1, dist.support[^1]*scale + location) 48 | @[supp0, supp1]) #NOTE: R code calls these "breaks" 49 | result.qtl = (proc(p: T): T = newton(p, cd(x), pd(x), supp, 0.11, 5e-6)) 50 | result.pdf = pd; result.cdf = cd; result.support = supp 51 | result.modes = if modes.len > 0: modes else: (var modes: seq[T]; 52 | for (coef, dist, location, scale) in compons: 53 | for mode in dist.modes: 54 | if (let m = mode*scale + location; m notin modes): modes.add m 55 | modes.sort; modes) 56 | 57 | const z=T(0); const mH=T(-0.5); const pQ=T(0.25); const m6=T(-6);const p2=T(2) 58 | const o=T(1); const pH=T(+0.5); const p4=T(+4) ; const p6=T(+6);const oT=T(0.1) 59 | const p20 = T(20); const o3=T(1)/T(3) #z)ero,o)ne,mX=-X, pX=+X; oT=(o)ne(T)enth 60 | 61 | template PD(pd, cd, qt, supp, mo): untyped = # Invertible formula P.Distros 62 | ((proc(x {.inject.}: T): T {.closure.} = pd), 63 | (proc(x {.inject.}: T): T {.closure.} = cd), 64 | (proc(p {.inject.}: T): T {.closure.} = qt), 65 | (proc(): T {.closure.} = (let p {.inject.} = rand(1.0); qt)), 66 | supp, mo) 67 | 68 | template PDn(pd, cd, supp, mo; guess: untyped=0.125): untyped = # Numer.Inverse 69 | ((proc(x {.inject.}: T): T {.closure.} = pd), # expr in terms of `x` 70 | (proc(x {.inject.}: T): T {.closure.} = cd), # expr in terms of `x` 71 | (proc(p {.inject.}: T): T {.closure.} = newton(p, cd, pd, supp, guess,2e-6)), 72 | (proc(): T {.closure.} = 73 | let p {.inject.} = rand(1.0); newton(p, cd, pd, supp, guess, 2e-6)), 74 | supp, mo) 75 | 76 | # Define common `CDist`s; Export so importers can do their own mixtures/etc. 77 | let dU*: CDist = PD(if x < z: z elif x <= o: o else: z, # {}[0] needs a type 78 | if x < z: z elif x <= o: x else: o, 79 | if p < z: z elif p <= o: p else: z, @[z, o], @[pH]) 80 | 81 | let dN* = PD(gauss.pdf[T](x), gauss.cdf[T](x), gauss.qtl[T](p), @[m6, p6], @[z]) 82 | 83 | let dCauchy* = PD(cauchy.pdf[T](x), cauchy.cdf[T](x), cauchy.qtl[T](p), 84 | @[-p20, p20], @[z]) # Instantiate generics from `spfun` 85 | 86 | let dExp* = PD(if x < z: z else: exp(-x), if x < z: z else: o - exp(-x), 87 | if p < z: z else: -ln(o - p), @[z, T(10)], @[z]) 88 | 89 | let dLaplace* = PD(if x < z : pH*exp(x) else: pH*exp(-x), 90 | if x < z : pH*exp(x) else: o - pH*exp(-x), 91 | if p < pH: ln(p2*p) else: -ln(p2 - p2*p), @[m6, p6], @[z]) 92 | 93 | let dTri* = PD( 94 | if x < -o: z elif x < z : o + x elif x < o: o - x else: z, 95 | if x < -o: z elif x < z : pH*(x+o)^2 elif x < o: o - pH*(o-x)^2 else: o, 96 | if p < z : z elif p < pH: -o+sqrt(p2*p) elif p < o: o-sqrt(p2*(o-p)) else: o, 97 | @[-o, o], @[z]) ## Triangular on [-1,1] 98 | 99 | let dLogNormal* = PD(gauss.pdf[T](ln(x))/x, gauss.cdf[T](ln(x)), 100 | exp(gauss.qtl[T](p)), @[z, p20], @[exp(-o)]) 101 | 102 | let dBeta22* = PD(6*x*(1-x), (3-2*x)*x*x, # 2x^3-3x^2+0x+p=0; a=2,b=-3,c=0,d=p 103 | (let P = T(-0.75); # (3ac - b^2)/(3a^2) 104 | let Q = -pQ + pH*p; # (2b^3 - 9abc + 27da^2)/(27a^3) 105 | let rp3 = sqrt(-P*o3); # wikipedia.org/wiki/Cubic_equation 106 | let t1 = p2*rp3*cos(o3*arccos(Q/(o3*p2*P*rp3))-2*PI*o3*1); 107 | t1 + T(0.5)), # xk = tk - b/(3a); k=1 middle root 108 | @[z, o], @[pH]) 109 | 110 | let distros* = {"U01":dU, "Exp": dExp, #TODO Check supports&modes; Maybe plot? 111 | "Maxwell": PD(x*exp(mH*x*x), o-exp(mH*x*x), sqrt(-p2*ln(o-p)), @[m6,p6],@[o]), 112 | "Laplace": dLaplace, 113 | "Logistic": PD(exp(-x)/(o+exp(-x))^2,o/(1+exp(-x)),ln(p/(o-p)),@[m6,p6],@[z]), 114 | "Cauchy": dCauchy, 115 | "ExtVal" : PD(exp(-exp(-x)-x), exp(-exp(-x)), -ln(-ln(p)), @[m6, p6], @[z]), 116 | "InfPeak" : PD(o/sqrt(p4*x), x.sqrt, p*p, @[z, o], @[z]), 117 | "AsymPareto" : PD(pH/pow(x,T(1.5)), o-o/x.sqrt, o/(o-p)^2, @[o, p20], @[z]), 118 | "SymPareto" : PD(pQ*pow(o+x.abs,T(-1.5)), 119 | if x < 0: pH/sqrt(o+x.abs) else: o-pH/sqrt(o+x.abs), 120 | if p<0.5: o-o/(o-abs(o-p2*p))^2 121 | else : o/(o-abs(p2*p-o))^2-o, @[-p20, p20], @[z]), 122 | "N01": dN, "logNormal": dLogNormal, 123 | "3BinHisto1" : mix(@[(pH,dU,mH,o), (pH,dU,T(-5),T(10))], modes = @[z]), 124 | "Matterhorn" : PD(if abs(x) > exp(-p2): 0.0 else: o/(x.abs*ln(x.abs)^2), 125 | if x.abs < -exp(-p2): z elif x.abs=oT and x.abs<=T(1.1): p2*(o-pow(x2-oT,o3)) else: z), 148 | if x < -T(1.1): z 149 | elif x < -oT : pH*(o-(p4*(x.abs-oT)-3*pow(abs(-x-oT), p4*o3))) 150 | elif x < oT : pH 151 | elif x < T(1.1) : pH + 0.5*(4*(x-0.1)-3*pow(abs(x-0.1), p4*o3)) 152 | else: 1.0, @[-T(1.1), -oT, oT, T(1.1)], @[-oT, oT], 153 | (if abs(p - 0.50)>0.06: T(-1.1) + p*T(2.2) elif p > 0.44 and p < pH: 154 | p/1.48179547 - 0.437185547 else: p/1.48179547 - 0.23781445)), 155 | "TriModeU": mix(@[(pQ,dU,T(-20.1),oT), (pH,dU,-o,p2), (pQ,dU,p20,oT)]), 156 | "Sawtooth": mix(@[(oT, dTri,T(-9),o), (oT, dTri,T(-7),o), (oT, dTri,T(-5),o), 157 | (oT, dTri,T(-3),o), (oT, dTri,T(-1),o), (oT, dTri,T(+1),o), 158 | (oT, dTri,T(+3),o), (oT, dTri,T(+5),o), (oT, dTri,T(+7),o), 159 | (oT, dTri,T(+9),o) ]), 160 | "BiLogPeak": PDn(mH*ln(abs(x*(o-x))), 161 | pH*(-x*x.abs.ln + (o-x)*abs(o-x).ln) + x, @[z,o], @[z,o]), 162 | "Bimodal" : mix(@[(pH, dN,-o,o), (pH, dN,o,o)], modes = @[z]), 163 | "10Normal": mix(@[(oT, dN,T(-25),o), (oT, dN,T(-20),o), (oT, dN,T(-15),o), 164 | (oT, dN,T(-10),o), (oT, dN,T( -5),o), (oT, dN,T( +0),o), 165 | (oT, dN,T( +5),o), (oT, dN,T(+10),o), (oT, dN,T(+15),o), 166 | (oT, dN,T(+20),o)]), 167 | "unif": PD(if x < -o: z elif x <= o: pH else: z, 168 | if x < -o: z elif x <= o: pH*(x+o) else: o, 169 | if p < z: -o elif p <= o: p2*p-o else: o, @[-o, o], @[z]), 170 | "epanechnikov": PD(if x <= -o:z elif x 1: raise newException(IOError, 222 | ("Ambiguous prefix for continuous distro; \"$1\" matches:\n $2\n" & 223 | "Empty string lists all") % [ pfx, ks.join("\n ")]) 224 | else: raise newException(IOError, ("No prefix match for \"$1\"" % [pfx])) 225 | 226 | when isMainModule: # A trivial command line driver mostly for testing. 227 | when not declared addFloat: import std/formatFloat 228 | import cligen; when defined(release): randomize() 229 | template C(a,b,th):untyped = a.abs>th and abs(a-b)>th and abs(a/b-1)>sqrt(th) 230 | type Plot = enum pNone="none", pPDF="pdf", pCDF="cdf", pQtl="qtl" 231 | proc dists(distro="U01", nSamp=0, x=Inf, plot=pNone, mode=false, v=false) = 232 | var dno = 0 233 | try: dno = distros.match(distro) 234 | except IOError as e: echo e.msg; quit(1) 235 | if v: stderr.write "Distro[", dno+1, "]: ", distros[dno][0], '\n' 236 | let (nm,dist) = distros[dno]; let (pdf, cdf, qtl, gen, support, modes)=dist 237 | for i in 1..nSamp: echo gen() 238 | if x != Inf: # User gave some `x` at which to test this PDist 239 | let h = 5e-5 # for numerical derivative of CDF 240 | let p = pdf(x) # This tests ok (except 24 27): for D in {1..37}; { 241 | if p > 0f: # for X in {0..400}; dists -d$D -x$[(X-200)*0.01] } 242 | let c = cdf(x); let q = qtl(c) 243 | let dc = (cdf(x + h) - c)/h 244 | if C(p/dc, 1, 1e-4): echo nm," dCdf!=pdf; x: ",x," pd: ",p," dc: ",dc 245 | if C(q , x, 1e-4): echo nm," qtl(c)!=I; x: ",x," cd: ",c," qt: ",q 246 | if plot in {pPDF, pCDF}: 247 | let scl = (support[^1] - support[0])/4096.0 248 | for i in 1..4095: 249 | let x = support[0] + i.T * scl 250 | echo x, " ", if plot == pCDF: cdf(x) else: pdf(x) 251 | echo support[^1], " ", 0f 252 | elif plot == pQtl: 253 | for i in 0..1024: (let x = i.float*0.0009765625; echo x," ",qtl(x)) 254 | if mode: (for m in modes: echo m) 255 | include cligen/mergeCfgEnv; dispatch dists 256 | -------------------------------------------------------------------------------- /fitl/estMI.nim: -------------------------------------------------------------------------------- 1 | import std/[math, random, algorithm, stats], spfun/[gauss, digamma] 2 | type F = float32 # Can make [F] a common generic parameter someday 3 | # Common params: `x`: input; `n`: #pairs; `nB`: #boxes; `scl`: scale factor; 4 | # `eGinv`: box size; `c(1|2)`: 1st|2nd component for making 2-D grid; `box`: 5 | # each array elt is num of last point in box; `lis`: each array elt is num of 6 | # prev points in box|-1; `mxi`: cumulative n(pnts) in box; eFoo =~ epsilon_Foo 7 | 8 | func ssDup[T](n, d2, o: int, src: seq[seq[T]]): seq[seq[T]] = # seq[seq[]] dup 9 | result.setLen d2 10 | for d in 0 ..< d2: 11 | result[d] = newSeq[T](n) 12 | copyMem result[d][0].addr, src[o + d][0].addr, n*T.sizeof 13 | 14 | proc xcol(x: seq[seq[F]]; i, dim: int): seq[F] = 15 | result.setLen dim; for d in 0 ..< dim: result[d] = x[d][i] 16 | 17 | func make_box1(x: seq[F]; n: int; scl: F; nB: int; box,lis,mxi: var seq[int]) = 18 | for i in 0..nB: box[i] = -1; mxi[i] = 0 # Make 1-D box 19 | for i in 0 ..< n: 20 | let ix = int(x[i]*scl) 21 | lis[i] = box[ix]; box[ix] = i 22 | mxi[ix].inc 23 | for i in 1..nB: mxi[i] += mxi[i-1] 24 | 25 | func make_box2ind(x: var seq[seq[F]]; dim, n, c1, c2, nB, eGinv: int; 26 | ind, lis: var seq[int]; box: var seq[seq[int]]) = 27 | let ib = nB - 1 # Make 2-D box; Re-orders! but saves idx of orig data->`ind`. 28 | var xx = ssDup(n, dim, 0, x) 29 | for ix in 0 ..< nB: (for iy in 0..= 0: 38 | inc i 39 | for d in 0 ..< dim: x[d][i] = xx[d][ixy] 40 | ind[ixy] = i; ixy = lis[ixy] 41 | box[ix][iy] = -1 42 | for i in 0 ..< n: 43 | let (ix,iy) = (int(x[c1][i]*eGinv.F) and ib, int(x[c2][i]*eGinv.F) and ib) 44 | lis[i] = box[ix][iy]; box[ix][iy] = i 45 | 46 | func neiE1(x: seq[F]; scl: F; i,nB: int; eps: F; box,lis,mxi: seq[int]): int = 47 | let xc = x[i] # Count nbors of point i in eps-nborhood in 1-D 48 | let mp = min(nB, int((xc + eps)*scl)) 49 | let mm = max(0, int((xc - eps)*scl)) 50 | var mi = box[mp] 51 | while mi >= 0: 52 | let dd = x[mi] - xc 53 | if dd.abs <= eps: inc result 54 | mi = lis[mi] 55 | if mm >= mp: return result - 1 56 | mi = box[mm] 57 | while mi >= 0: 58 | let dd = xc - x[mi] 59 | if dd.abs <= eps: inc result 60 | mi = lis[mi] 61 | inc result, mxi[mp - 1] - mxi[mm] 62 | result - 1 63 | 64 | iterator els(box: seq[seq[int]]; lis: seq[int]; ix2, iy1, ib: int): int = 65 | var el = box[ix2][iy1 and ib] 66 | while el != -1: 67 | yield el; el = lis[el] 68 | 69 | template gridScan(eps, jj, ix, iy, ib, maxDscan) = 70 | while eps > eGrid*(jj.F - 1): # Outer loop logic for maxDscans in `nei[KE]` 71 | let step = if jj != 0: 2*jj else: 1 72 | for ix1 in ix - jj .. ix + jj: 73 | let ix2 = ix1 and ib 74 | for iy1 in countup(iy - jj, iy + jj, step): maxDscan ix2, iy1 75 | for ix1 in countup(ix - jj, ix + jj, step): 76 | let ix2 = ix1 and ib 77 | for iy1 in iy - jj + 1 .. iy + jj - 1: maxDscan ix2, iy1 78 | inc jj 79 | if jj == nB div 2: break 80 | if jj == nB div 2: # Half of the layer 81 | for ix1 in ix - jj ..< ix + jj: maxDscan ix1 and ib, iy - jj 82 | let ix2 = (ix - jj) and ib 83 | for iy1 in iy - jj + 1 .. iy + jj - 1: maxDscan ix2, iy1 84 | 85 | func neiK(x: seq[seq[F]]; dim, c1, c2, nB, i: int; eGrid: F; k: int; 86 | box: seq[seq[int]]; lis: seq[int]): seq[int] = 87 | let ib = nB - 1 # Search for k nbors of point i in dim-D .. 88 | let xx = xcol(x, i, dim) # eGrid: size of grid; returns ixes(k nbors). 89 | let (ix, iy) = (int(xx[c1]/eGrid) and ib, int(xx[c2]/eGrid) and ib) 90 | var jj: int # Zero by default 91 | result.setLen k + 1 92 | var dn = newSeq[F](k + 1) 93 | for kk in 1..k: dn[kk] = F.high 94 | template maxDscan(ix2, iy1) = 95 | for el in els(box, lis, ix2, iy1, ib): 96 | if el != i: 97 | var dd = abs(xx[0] - x[0][el]) 98 | for d in 1 ..< dim: 99 | if (let dy = abs(xx[d] - x[d][el]); dy > dd): dd = dy 100 | if dd < dn[k]: 101 | var kk = k 102 | while dd < dn[kk]: 103 | if kk < k: dn[kk + 1] = dn[kk]; result[kk + 1] = result[kk] 104 | dec kk 105 | dn[kk + 1] = dd; result[kk + 1] = el 106 | gridScan dn[k], jj, ix, iy, ib, maxDscan 107 | 108 | func neiE(x: seq[seq[F]]; dim, c1, c2, nB, i: int; eGrid, eps: F; 109 | box: seq[seq[int]]; lis: seq[int]): int = 110 | let ib = nB - 1 # Count nbors in eps-nborhood of point i in 111 | let xx = xcol(x, i, dim) # dim-D. eGrid: size of grid 112 | let (ix, iy) = (int(xx[c1]/eGrid) and ib, int(xx[c2]/eGrid) and ib) 113 | var jj, nx: int 114 | template maxDscan(ix2, iy1) = 115 | for el in els(box, lis, ix2, iy1, ib): 116 | var dd = abs(xx[0] - x[0][el]) 117 | for d in 1 ..< dim: 118 | if (let dy = abs(xx[d] - x[d][el]); dy > dd): 119 | dd = dy 120 | if dd > eps: break 121 | if dd <= eps: inc nx 122 | gridScan eps, jj, ix, iy, ib, maxDscan 123 | nx - 1 124 | 125 | proc mir_xnyn(x: var seq[seq[F]]; dimx, dimy, n, k: int; scl: seq[F]): F = 126 | let d2 = dimx + dimy # Compute mutual info among vectors by rectangle 127 | let box1 = n - 5 # method for one k; IN: x: (dimx+dimy)-n-vectors 128 | var xc = newSeq[F](d2) # dim(x|y): Dim of (x|y)vector; k: max num nbors 129 | var nB = 1; while nB*nB*k < 2*n: nB *= 2 130 | let eGinv = nB div 4; let eGrid = 1.0/eGinv.F 131 | var xx, yy: seq[seq[F]] 132 | var boxx, boxy, box: seq[seq[int]] 133 | var lisx,boxx1,lisx1,mxi, lisy,boxy1,lisy1,myi, lis,ind,indx,indy: seq[int] 134 | var scalx, scaly: F 135 | if dimx>1: 136 | xx = ssDup(n, dimx, 0, x) # Save x to xx if data would be re-ordered 137 | boxx.setLen nB; for i in 0 ..< nB: boxx[i] = newSeq[int](nB) 138 | lisx.setLen n 139 | else: boxx1.setLen box1 + 1; lisx1.setLen n; mxi.setLen box1 + 1 140 | if dimy>1: 141 | yy = ssDup(n, dimy,dimx, x) # Save x to yy if data would be re-ordered 142 | boxy.setLen nB; for i in 0 ..< nB: boxy[i] = newSeq[int](nB) 143 | lisy.setLen n 144 | else: boxy1.setLen box1 + 1; lisy1.setLen n; myi.setLen box1 + 1 145 | box.setLen nB 146 | for i in 0 ..< nB: box[i] = newSeq[int](nB) 147 | lis.setLen n; ind.setLen n; indx.setLen n; indy.setLen n 148 | make_box2ind x, d2, n, 0, dimx, nB, eGinv, ind, lis, box 149 | if dimx>1: make_box2ind xx, dimx, n, 0, dimx-1, nB, eGinv, indx, lisx, boxx 150 | else: scalx = scl[0]; make_box1 x[0], n, scalx, box1, boxx1, lisx1, mxi 151 | if dimy>1: make_box2ind yy, dimy, n, 0, dimy-1, nB, eGinv, indy, lisy, boxy 152 | else: scaly = scl[dimx]; make_box1 x[dimx], n, scaly, box1, boxy1, lisy1, myi 153 | var dxy = 0.0 154 | for i in 0 ..< n: 155 | for d in 0 ..< d2: xc[d] = x[d][ind[i]] 156 | let nn = neiK(x, d2, 0, dimx, nB, ind[i], eGrid, k, box, lis) 157 | var ex = 0.0 158 | for d in 0 ..< dimx: 159 | for kk in 1..k: (if (let dx=abs(xc[d] - x[d][nn[kk]]); dx>ex): ex = dx) 160 | var ey = 0.0 161 | for d in dimx ..< d2: 162 | for kk in 1..k: (if (let dy=abs(xc[d] - x[d][nn[kk]]); dy>ey): ey = dy) 163 | let nx=if dimx==1: neiE1 x[0], scalx, ind[i], box1, ex, boxx1, lisx1, mxi 164 | else: neiE xx, dimx, 0, dimx-1, nB, indx[i], eGrid, ex, boxx, lisx 165 | let ny=if dimy==1: neiE1 x[dimx], scaly, ind[i], box1, ey, boxy1, lisy1, myi 166 | else: neiE yy, dimy, 0, dimy-1, nB, indy[i], eGrid, ey, boxy, lisy 167 | dxy += psi(nx) + psi(ny) #echo "nx: ", nx, " ny: ", ny 168 | psi(n) + (psi(k) - 1.0/k.F) - dxy/n.F # middle term = "phi[k]" 169 | 170 | func zscoreScale(x: var seq[F]): F = 171 | var me, s: F; let n = x.len # Make mean0,var1 & non-negative 172 | for i in 0 ..< n: me += x[i] # This is what KSG 2004 recommends 173 | me /= n.F 174 | for i in 0 ..< n: s += (x[i] - me)^2 175 | s /= n.F - 1; s = 1.0/(sqrt s) 176 | var (mn, mx) = (F.high, F.low) 177 | for i in 0 ..< n: 178 | x[i] = s*(x[i] - me) 179 | if x[i] < mn: mn = x[i] 180 | if x[i] > mx: mx = x[i] 181 | for i in 0 ..< n: x[i] -= mn 182 | F(n - 5)/(mx - mn) #echo "scl=", result 183 | 184 | proc normalScoreScale*(pntsD: var seq[F]): F = 185 | let n = pntsD.len # This is what Holmes 2019 recommends. Their.. 186 | var xs = newSeq[(F, int)](n) #.. "reparameterize" is aka "normal scoring". 187 | for i, x in pntsD: xs[i] = (x, i) 188 | xs.sort 189 | let q0 = gauss.qtl(0.5/n.F) # Similar mean0,var1 but 0-offset as zscore 190 | for i in 0 ..< n: pntsD[xs[i][1]] = q0 + gauss.qtl((i.F + 0.5)/n.F) 191 | F(n - 5)/(gauss.qtl((n.F - 0.5)/n.F) - q0) 192 | 193 | proc miKNN*(pnts: seq[F]; dimx, dimy, k: int; score=false): F = 194 | ## Mutual Info Estimate (bits) from data set of dimx-dimy vector pairs by k-th 195 | ## near-nbor rectangle method. `pnts` is `n` back-to-back dimx+dimy vectors. 196 | let d2 = dimx + dimy 197 | let n = pnts.len div d2 # echo "n: ",n," k: ",k," dimx: ",dimx," dimy: ",dimy 198 | if pnts.len mod d2 != 0: raise newException(IOError,"data-dimx+dimy mismatch") 199 | var x = newSeq[seq[F]](d2) 200 | var scl = newSeq[F](d2) 201 | for d in 0 ..< d2: # Set up whole coordinate columns 202 | x[d] = newSeq[F](n) 203 | for i in 0 ..< n: x[d][i] = pnts[d2*i + d] 204 | scl[d] = if score: x[d].normalScoreScale else: x[d].zscoreScale 205 | mir_xnyn(x, dimx, dimy, n, k, scl)/ln(2.0) # Info is in bits not "nats" 206 | 207 | iterator subsampled*(pnts: seq[F]; dimx, dimy, k, nS: int; score=false): F = 208 | ## Yield subsampled `miKNN` estimates for random 1/nS subsets of input `pnts`, 209 | ## sharing all other parameters. 210 | let n = pnts.len div nS # Q: Insist `nS > 1`? 211 | var all = newSeq[int](pnts.len) 212 | for i in 0 ..< pnts.len: all[i] = i 213 | all.shuffle 214 | var ps = newSeq[F](n*(dimx + dimy)) 215 | let bytes = (dimx + dimy)*F.sizeof 216 | for s in 1..nS: 217 | for i in 0 ..< n: copyMem ps[i].addr, pnts[all[(s - 1)*n + i]].addr, bytes 218 | yield miKNN(ps, dimx, dimy, k, score) 219 | 220 | proc miErr*(pnts: seq[F]; dimx, dimy, k: int; score=false, err=4): (F, F) = 221 | result[0] = miKNN(pnts, dimx, dimy, k, score) 222 | var varSum = 0.0; var niSum = 0 # Paper derives assuming variance ~ χ². 223 | for ni in 2..err: # Q: Do a more general `listSplitSizes`? 224 | for trial in 1..3: # ~10X overall slowdown factor w/defaults 225 | var rs: RunningStat 226 | for mi in subsampled(pnts, dimx, dimy, k, ni, score): rs.push mi 227 | varSum += F(ni - 1)/ni.F * rs.varianceS # Holmes2019Eq(9)seems to have.. 228 | niSum += ni - 1 #..missed canceling N from Eq(8). 229 | result[1] = sqrt(varSum/niSum.F) #..Their Matlab _stddev code also drops it. 230 | 231 | when isMainModule: 232 | import std/[syncio, strutils, enumerate], cligen, cligen/strUt 233 | when defined(danger): randomize() 234 | 235 | proc load(f: File, noise: F): seq[F] = 236 | var nCol = 0 237 | for i, row in enumerate(f.lines): 238 | let n0 = result.len 239 | for col in row.split: 240 | result.add col.strip.parseFloat + rand(1.0)*noise 241 | if nCol > 0 and result.len - n0 != nCol: 242 | stderr.write "stdin:",i,": irregular\n" 243 | nCol = result.len - n0 244 | 245 | proc estMI(k=6, file="", xDim=1, yDim=0, noise=0.0, score=false, err=4) = 246 | ##Estim.Mutual Info (bits) from data set of 1-line vector pairs by rectangle 247 | ##method of Kraskov, Stogbauer & Grassberger 2004, PhysRevE 69(6)066138 with 248 | ##follow-on ideas by Holmes & Nemenman 2019 PhysRevE 100(022404). 249 | let yDim = if yDim == 0: xDim else: yDim 250 | let pnts = load(if file.len > 0: file.open else: stdin, noise) 251 | let (val,err) = pnts.miErr(xDim, yDim, k, score); echo fmtUncertain(val,err) 252 | 253 | include cligen/mergeCfgEnv; dispatch estMI, help={ 254 | "k": "k-th out nearest neighbor", 255 | "file": "filename; \"\" => stdin; Fmt: xDim+yDim rows", 256 | "xDim": "x vector dimension", "yDim": "y vector dimension (0=>same as x)", 257 | "noise": "U01-scale noise to add to each coord", "score": "t=normal score", 258 | "err": "max number of subdivisions to estim err with"} 259 | -------------------------------------------------------------------------------- /fitl/gof.nim: -------------------------------------------------------------------------------- 1 | ## Implement metrics from the book Goodness-of-fit Techniques, 1986 edited by 2 | ## Ralph B. D'Agostino and Michael A. Stephens (referred to now as ds86). 3 | import std/[math,random,algorithm,sugar],spfun/[gauss,gamma,binom],basicLA,ksamp 4 | type # Open Topology Circular Topology 5 | GoFTest* = enum gfD = "kolmogorovSmirnovD", gfV = "vKuiper" , # L_infinity 6 | gfW2 = "cramerVonMisesW2" , gfU2 = "watsonU2", # L2 norm 7 | gfA2 = "andersonDarlingA2" 8 | 9 | GoFMod* = enum gfFin = "finiteN", gfEst = "estimates" 10 | 11 | const gofName*: array[GoFTest, string] = [ "D", "V", "W^2", "U^2", "A^2" ] 12 | const noMod: set[GoFMod] = {} 13 | 14 | func kolmogorovSmirnovPM[F](ps: seq[F]): (F, F) = 15 | let nInv = F(1.0/float(ps.len)) # ds86 Eq 4.2 16 | for i, p in ps: #NOTE: formula converted to 0-origin indexes. 17 | result[0] = max(result[0], F(i+1)*nInv - p) 18 | result[1] = max(result[1], p - F(i)*nInv) 19 | 20 | func kolmogorovSmirnov*[F](ps: seq[F], mods=noMod): F = 21 | ## Kolmogorov-Smirnov max(D+,D-) modified by ds86 for estimated mean, var, 22 | ## finite n. `u01ize` first! 23 | let (dP, dM) = ps.kolmogorovSmirnovPM # ds86 Eq 4.2 24 | result = max(dP, dM); let n = ps.len.F 25 | if gfFin in mods: result *= sqrt(n) + F(0.12) + F(0.11)/sqrt(n) 26 | if gfEst in mods: result *= sqrt(n) - F(0.01) + F(0.85)/sqrt(n) 27 | 28 | func kuiperV*[F](ps: seq[F], mods=noMod): F = 29 | ## Kuiper V=Dp+Dm modified by ds86 for est.mean,var,finite n. `u01ize` first! 30 | let (dP, dM) = ps.kolmogorovSmirnovPM 31 | result = dP + dM; let n = ps.len.F # ds86 Eq 4.2 32 | if gfFin in mods: result *= sqrt(n) + F(0.155) + F(0.24)/sqrt(n) 33 | if gfEst in mods: result *= sqrt(n) + F(0.050) + F(0.82)/sqrt(n) 34 | 35 | func cramerVonMises*[F](ps: seq[F], mods=noMod): F = 36 | ## W^2 modified by ds86 for estimated mean,var, finite n. `u01ize` first! 37 | let n = ps.len 38 | let nI2 = F(0.5)/F(n) #NOTE: formula converted to 0-origin indexes. 39 | result = nI2/F(6) + sum0(j, n, (ps[j] - F(2*j + 1)*nI2)^2) # ds86 Eq 4.2 40 | if gfFin in mods: 41 | result *= (F(1) - F(0.4)/F(n) + F(0.6)/F(n*n))*(F(1) + F(1)/F(n)) 42 | # Stephens1970, ds86 Table 4.2 & Eq6.19 all say '*',but '/' in ^^^ E.g. 4.4.1 43 | if gfEst in mods: result *= F(1) + nI2 44 | 45 | func watsonU2*[F](ps: seq[F], mods=noMod): F = 46 | ## Watson U^2 modified by ds86 for estim.mean,var, finite n. `u01ize` first! 47 | let n = ps.len; let nF = n.F 48 | let nI2 = F(0.5)/nF #NOTE: Formula converted to 0-origin indexes. 49 | let mn = sum0(j, n, F(ps[j]))/nF # ds86 Eq 6.18 50 | result = nI2/F(6) - nF*(mn - 0.5)^2 + sum0(j, n, (ps[j] - F(2*j + 1)*nI2)^2) 51 | if gfFin in mods: 52 | result *= (F(1) - F(0.1)/F(n) + F(0.1)/F(n*n))*(F(1) + F(0.8)/F(n)) 53 | # Stephens1970, ds86 Table 4.2 & Eq6.19 all say '*',but '/' in ^^^ E.g. 4.4.1 54 | if gfEst in mods: result *= F(1) + nI2 55 | 56 | func andersonDarling*[F](ps: seq[F], mods=noMod): F = 57 | ## A^2 modified by ds86 for estimated mean,var, finite n. `u01ize` first! 58 | let n = ps.len # ds86 Eq 4.2 59 | let nInv = F(1)/F(n) #NOTE: Formula converted to 0-origin indexes. 60 | result = F(-n) - nInv*sum0(j, n, F(2*j + 1)*ln(ps[j]*(F(1) - ps[n-1-j]))) 61 | if gfEst in mods: result *= F(1) + F(0.75)*nInv + F(2.25)*nInv*nInv 62 | 63 | func zscore*[F](xs: var seq[F]; mnVr: (F, F)) = 64 | ## Unitize a data sample `xs[i]` to its z-scores `(xs[i] - mn)/sqrt(vr))`. 65 | let scl = F(1)/sqrt(mnVr[1]) 66 | for x in mitems(xs): x -= mnVr[0]; x *= scl 67 | 68 | func pitz*[F](zs: var seq[F]) = 69 | ## z-scores `N(0,1)->U(0,1)` via Probability Integral Transform (PIT). 70 | for z in mitems(zs): z = gauss.cdf(z) 71 | 72 | func u01ize*[F](xs: var seq[F], mnVr: (F, F)) = 73 | ## Convert into Z-scores, sort, and PIT-transform to be U(0,1) 74 | xs.zscore mnVr; xs.sort; xs.pitz 75 | 76 | func gofStat*[F](sample: var seq[F], mnVr: (F, F), gof=gfA2, mods=noMod, 77 | u01d=false): F = 78 | ## Calculate goodness-of-fit stat for `sample` (clobbering `sample`). 79 | if not u01d: sample.u01ize(mnVr) 80 | case gof 81 | of gfD : sample.kolmogorovSmirnov(mods) 82 | of gfV : sample.kuiperV(mods) 83 | of gfW2: sample.cramerVonMises(mods) 84 | of gfU2: sample.watsonU2(mods) 85 | of gfA2: sample.andersonDarling(mods) 86 | 87 | func gofStat*[F](sample: seq[F], mnVr: (F, F), gof=gfA2, mods=noMod, 88 | u01d=false): F = 89 | ## Calculate goodness-of-fit stat for `sample` (preserving `sample`). 90 | var sample = sample 91 | sample.gofStat mnVr, gof, mods, u01d 92 | 93 | template gofDistT(F, cdf, mV, gof, mods): untyped = 94 | let (mu, sig) = (mV[0], sqrt(mV[1])) 95 | var sample = newSeq[F](n) 96 | cdf.setLen 0 97 | for k in 1..m: 98 | for i in 0..val|Gauss): {p:.4g}" 207 | if eDist in emit: echo &"cdf({gName}): ", cdf 208 | nSignificantDepartures 209 | 210 | include cligen/mergeCfgEnv; dispatch gof, positional="sample", help={ 211 | "sample": "x_1 x_2 .. x_n data sample", 212 | "gofs": 213 | "kolmogorovSmirnovD cramerVonMisesW2 `andersonDarlingA2` vKuiper watsonU2", 214 | "adj" : "adjust GoF stat for: **estimates** finiteN", 215 | "emit" : "emits: z, PITz, stat, dist, prob", 216 | "s" : "number of n-samples to estimate CDF", 217 | "knownM": "known *mean* Gaussian; kV=0 => **estimate**", 218 | "knownV": "known *var* Gaussian; 0 => **estimate**", 219 | "pval" : "exit status = number of `prob` < `pval`", 220 | "cf" : "test serial autoCorrFunc up to this lag", 221 | "mi" : "test serial auto-Mutual Info up to this lag", 222 | "ident" : "test identically distributed w/`b` trials of `a`-way splits", 223 | "ran" : "randomize() for sampling"}, short={"knownM":'M',"knownV":'V'} 224 | -------------------------------------------------------------------------------- /fitl/ksamp.nim: -------------------------------------------------------------------------------- 1 | ## k-sample statistical homogeneity test based on Fritz Scholz' R package. 2 | import std/[math, random, algorithm] 3 | type F = float 4 | func count(dat: openArray[F], z: F): int = (for e in dat: (if e==z: inc result)) 5 | func getSum(x: openArray[int], lim=0): int = (for i in 0..= observed 54 | ## for non-parametric (rank) test described in Scholz,Stephens 1987 for later 55 | ## `binomP` p-value estimates. Here, `x` is a catenated list of observations, 56 | ## `ns` are the k sample sizes (in the same order), `m` is the number of sims, 57 | ## and `s0, sA` optional saved test statistic values. 58 | var u = x[0..^1] # Copy x for sort & de-duplicate 59 | var x = u # Copy x again for shuffling 60 | u.sortU # sort & de-duplicate 61 | let (datA0, datAA) = x.adkStat(u, ns) # Get observed AkN2,AakN2 62 | # echo "A0: ",datA0," AA: ",datAA 63 | for i in 1..m: # Simulate 64 | x.shuffle 65 | let (a1, aa) = x.adkStat(u, ns) # Get simulated AkN2,AakN2 66 | if not s0.isNil: s0[].add a1 67 | if not sA.isNil: sA[].add aa 68 | inc result[0], (a1 >= datA0).int 69 | inc result[1], (aa >= datAA).int 70 | 71 | when isMainModule: 72 | when defined danger: randomize() 73 | let x=[0.824,0.216,0.538,0.685,0.448,0.348,0.443,0.722,0.403,0.268,0.440,0.087] 74 | echo adkSim(x, [4,4,4], m=1000000) # ADStatVsn2 = 2.6240 != 2.6200, pV matches 75 | -------------------------------------------------------------------------------- /fitl/linfit.nim: -------------------------------------------------------------------------------- 1 | import std/[strformat, strutils, sequtils, algorithm, math], 2 | spfun/[gauss, gamma], basicLA, svdx, min1d; export gamma 3 | when not declared(File): import std/syncio 4 | 5 | type CrVal* = enum xvGCV = "GCV", xvLOO = "LOO" 6 | 7 | proc colDel(x: pointer; n, m, J, sz: int) = # In-place column deletion 8 | if m < 2: return # 2*n memmoves copying most data 9 | var X = cast[cstring](x) # One place that col-major sucks. 10 | let M = m - 1 11 | for i in 0 ..< n: 12 | if J > 0: moveMem X[M*i*sz].addr, X[m*i*sz].addr, J*sz 13 | if J < M: moveMem X[(M*i + J)*sz].addr, X[(m*i + J + 1)*sz].addr, (M - J)*sz 14 | 15 | type CrossVal[F] = object 16 | n: int # number of rows of U/data points 17 | m: int # number of predictors/params; dim of z,w 18 | z: seq[F] # z[j] sum(U_ij*y_i); Data vector in canonical coordinates 19 | u: ptr seq[F] # u[i+n*j] ith element of jth left singular vector 20 | y: seq[F] # y[i] ith element of response vector 21 | s: ptr seq[F] # s[j] singular values 22 | rss0: F # residual sum of squares for full model (input) 23 | df: F # effective degrees of freedom (output) 24 | 25 | proc loo[F](o: var CrossVal[F]; L: F): F = 26 | # Return Leave-One-Out Cross-Validation score also known as "Allen's PRESS" as 27 | # a function of ridge parameter L & in-hand Singular Value Decomposition. 28 | let n = o.n; let m = o.m 29 | for j in 0 ..< m: o.z[j] = sum0(i, n, o.u[][i + n*j]*o.y[i]) # zj = Uj*y 30 | var ss: F 31 | for i in 0 ..< n: 32 | var Hii, yEst: F 33 | for j in 0 ..< m: 34 | let adj = 1.0 / (1.0 + L / (o.s[][j]*o.s[][j])) 35 | yEst += o.u[][i + n*j]*o.z[j]*adj 36 | Hii += o.u[][i + n*j]*o.u[][i + n*j]*adj 37 | let pr = (o.y[i] - yEst)/(1.0 - Hii) # prediction residual 38 | ss += pr*pr # sum of its square 39 | return ss / n.F # LOO aka Allen PRESS 40 | 41 | proc gcv[F](o: var CrossVal[F]; L: F): F = 42 | # Return Generalized Cross-Validation score as a function of ridge parameter L 43 | # &in-hand Singular Value Decomposition; Implementation follows Golub79 Eq2.3. 44 | let n = o.n; let m = o.m 45 | var df = F(n - m) 46 | var rss = 0.0 47 | if o.rss0 == 0.0: 48 | for j in 0 ..< m: o.z[j] = sum0(i,n, o.u[][i + n*j]*o.y[i]) # zj=Uj*y 49 | for i in 0 ..< n: # r = y - yEst = y - U.z 50 | let yEst = sum0(j,m, o.u[][i + n*j]*o.z[j]) 51 | rss += (o.y[i] - yEst)*(o.y[i] - yEst) # rss against OLS yEst 52 | o.rss0 = rss 53 | else: rss = o.rss0 54 | for j in 0 ..< m: 55 | let adj = 1.0 / (1.0 + o.s[][j]*o.s[][j] / L) # L-adjustment for s[j] 56 | df += adj 57 | rss += (adj*o.z[j])*(adj*o.z[j]) 58 | o.df = df 59 | return n.F*rss/(df*df) # Adding this to rss & re-eval makes =~ LOOCV 60 | 61 | proc linFit*[F](y, x: openArray[F]; n, m: int; b, u, s, v, r, h: var seq[F]; 62 | thr: var F=1e-6; xv=xvLOO; log: File=nil): (F,F,F) {.discardable.} = 63 | ## Find best fit b in Xb=y; Caller pre-adjusts design matrix X&y for y weight. 64 | ## X is column major for dot product speed in SVD. Ie. x[i + n*j] indexes j-th 65 | ## predictor at i-th data point (&U similarly). Fills b[m] w/coefs, s with adj 66 | ## sing.value reciprocals. v.len>0 => fill w/Cov(b) estimate. r.len>0 => fill 67 | ## w/resids. h.len>0 => fill w/n DIAG vals of Hat Matrix X*inv(X'*X)*X'. thr 68 | ## >0 is SV CLIP as *frac* of maxSV; <0 => ridge w/lambda -thr; ==0 => LOO|GCV 69 | ## -optimal ridge. Returns (ssR, df, ssY) if r.len>0 else (0, df, 0). 70 | let doC = v.len > 0; let doR = r.len > 0; let doH = h.len > 0 71 | b.setLen m # maybe alloc all needed RAM 72 | if u[0].addr != x[0].unsafeAddr: # setLen & copy 73 | u.setLen x.len 74 | copyMem u[0].addr, x[0].unsafeAddr, x.len*F.sizeof 75 | s.setLen m 76 | v.setLen m*m # need if need `b` => always 77 | if doR: r.setLen n 78 | if doH: h.setLen n 79 | let svMx = svdx(u, s, v, n, m) # SVD LHS of X.b=y design eq 80 | if doH: 81 | for i in 0 ..< n: h[i] = sum0(j,m, u[i + n*j]*u[i + n*j]) 82 | var df = F(n - m) 83 | if thr > 0: # Manual singular value clip 84 | for j in 0 ..< m: s[j] = if s[j] > thr*svMx: 1.0 / s[j] else: 0.0 85 | else: 86 | if thr < 0: thr = -thr # Manual ridge parameter 87 | else: # GCV|LOO-optimal ridge param 88 | var o = CrossVal[F](n: n, m: m, z: newSeq[F](m), 89 | u: u.addr, y: y.toSeq, s: s.addr) 90 | proc xvScore(lam: F): F = (if xv == xvGCV: o.gcv(lam) else: o.loo(lam)) 91 | minBrent(thr, F(0), F(svMx)*F(n), xvScore, tol=0.001, itMx=99) 92 | df = n.F 93 | for j in 0 ..< m: # adjust SVs; eff.deg.freedom 94 | var recipDen = 1.0 / (s[j]*s[j] + thr) 95 | df -= s[j]*s[j]*recipDen #NOTE: s = s / (s^2 + thr) 96 | s[j] = s[j]*recipDen # Ridge adj reduces in lim to s=1/s w/thr>0 above 97 | if not log.isNil: 98 | log.write &"sThr: {thr:.4g} n-df: {n.F - df:.5g} 1/sAdj: " 99 | for j in 0 ..< m: log.write &"{s[j]:.4g}%s", (if j < m-1: " " else: "\n") 100 | for k in 0 ..< m: # Calc best fit coefs `b` 101 | b[k] = sum0(j,m, sum0(i,n, v[k + m*j]*s[j]*u[i + n*j]*y[i])) 102 | var ssR, ssY: F 103 | if doR or doC: # residuals if requested|needed 104 | let (_, vY) = mvar(y); ssY = vY*F(n) 105 | for i in 0 ..< n: # yEst: estimated y from x&b 106 | let yEst = sum0(j,m, b[j]*x[i + n*j]) 107 | if doR: r[i] = y[i] - yEst 108 | ssR += (y[i] - yEst)*(y[i] - yEst) 109 | if doC: # Cov(b) = inv(X'X) = V.W^2.Vt 110 | var cov = newSeq[F](m*m) 111 | let redCsq = if df > 0.0: ssR/df else: 0.0 # reduced Chi-square 112 | for i in 0 ..< m: 113 | for j in 0 .. i: 114 | let Cij = redCsq * sum0(k,m, v[i + m*k]*v[j + m*k] * s[k]*s[k]) 115 | cov[m*i + j] = Cij; cov[m*j + i] = cov[m*i + j] 116 | v = cov 117 | (ssR, df, ssY) 118 | 119 | proc normalize*[F](x: var openArray[F]; n,M: int; xfm: string; o,s: var seq[F])= 120 | ## Here `xfm[j]` specifies 'c': centering only; 'z': z-score (0mean,1var); 121 | ## 'm': min-max (data range->[0,1]). For explanations see: 122 | ## www.listendata.com/2017/04/how-to-standardize-variable-in-regression.html. 123 | for j, ch in xfm: 124 | case ch.toLowerAscii 125 | of 'c': 126 | let (avg, _) = mvar(x[j*n ..< (j+1)*n]) 127 | o[j] += avg 128 | vadd x[j*n].addr, n, -avg 129 | of 'z': 130 | let (avg, vr) = mvar(x[j*n ..< (j+1)*n]) 131 | o[j] += avg; s[j] *= sqrt(vr) 132 | vadd x[j*n].addr, n, -avg 133 | vmul x[j*n].addr, n, F(1)/sqrt(vr) 134 | of 'm': 135 | let (mn, mx) = mnmx(x[j*n ..< (j+1)*n]) 136 | o[j] += mn; s[j] *= mx - mn 137 | vadd x[j*n].addr, n, -mn 138 | vmul x[j*n].addr, n, F(1)/(mx - mn) 139 | else: discard 140 | 141 | proc needNormalize*(xfm: string): bool = 142 | ## Test if `xfm` implies normalize is a no-op. 143 | for ch in xfm: 144 | if ch in {'c','z','m'}: return true 145 | 146 | proc linFit*[F](X: var openArray[F]; n,M: int; b,u,s,v, r,h, oX,sX: var seq[F]; 147 | xfm=""; thr: var F=1e-6; xv=xvLOO; log: File=nil):(F,F,F){.discardable.}= 148 | ## This wraps 2-input `linFit` to normalize predictors/response according to 149 | ## `xfm` returning the offset,scale in oX,sX. Other parameters & return are 150 | ## the same, but this 1-input `X` variant assumes the first column of cap `X` 151 | ## is `y` (for convenience with non-flat pre-weighted input & naming offScales 152 | ## for `y`). `xfm` is described in `normalize` doc above. 153 | normalize(X, n, M, xfm, oX, sX) 154 | linFit(X[0.. F(0): r.setLen n # Csq result MUST be meaningful to trim 164 | for it in 0..its: 165 | result = linFit(X, n,M, b,u,s,v, r,h, oX,sX,xfm, thr, xv, log) 166 | if trim > F(0): 167 | let rtReducedCsq = sqrt(result[0]/result[1]) 168 | var t = -gauss.qtl(trim/F(2)/F(n))*rtReducedCsq # E[outlier] => r units 169 | let n0 = n 170 | for i in 0 ..< n: 171 | if r[i].abs > t: 172 | colDel X[0].addr, M, n, i - (n0 - n), F.sizeof 173 | dec n 174 | if n == n0: break # No points trimmed => done 175 | if log != nil: log.write &"{n0-n} / {n0} > {t/rtReducedCsq} sds\n" 176 | -------------------------------------------------------------------------------- /fitl/min1d.nim: -------------------------------------------------------------------------------- 1 | import math # Module with algos to find minima of 1-D scalar functions. 2 | 3 | type ScalarF*[F] = proc(x: F): F 4 | ## Nim requires casting proc types too much. 5 | 6 | proc minBrent*[F](x: var F; a, b: F; f: ScalarF[F]; tol=1e-7, itMx=100, 7 | dxMin0=1e-9): F {.discardable.} = # can want only arg min 8 | ## Return minimum of a univariate function f(x) on [a,b] by Brent's method. 9 | ## x is also set to arg min_[a,b] f(x). 10 | const golden = F(0.38196601125010515180) # 1-1/phi where phi^2-phi==1. 11 | var a = a; var b = b 12 | var u, v, w, fx, fu, fv, fw, xm, dx0, dxMin, dx, s: F 13 | x = a + golden*(b - a); w = x ; v = x # Initial evaluation 14 | fx = f(x) ; fw = fx; fv = fx 15 | for it in 1..itMx: 16 | xm = (a + b)/F(2) 17 | dxMin = tol*abs(x) + F(dxMin0) 18 | if abs(x - xm) < F(2)*dxMin - (b - a)/F(2): 19 | break # Converged 20 | if abs(dx) <= dxMin: # Golden section step 21 | dx = (if x >= xm: a else: b) - x 22 | s = golden*dx 23 | else: # Parabolic interpolation step 24 | let xw = (x - w)*(fx - fv) 25 | let xv = (x - v)*(fx - fw) 26 | var vw = F(2)*(xv - xw) 27 | var p = (x - v)*xv - (x - w)*xw 28 | if vw > F(0): p = -p 29 | else : vw = -vw 30 | dx0 = dx 31 | dx = s 32 | if p > vw*(a - x) and p < vw*(b - x) and abs(p) < abs(vw*dx0/F(2)): 33 | s = p / vw # Parabolic step is useful 34 | u = x + s 35 | if (u - a) < F(2)*dxMin or (b - u) < F(2)*dxMin: 36 | s = copySign(dxMin, xm - x) 37 | else: # Unuseful => golden section 38 | dx = (if x >= xm: a else: b) - x 39 | s = golden*dx 40 | u = x + (if abs(s) >= dxMin: s else: copySign(dxMin, s)) 41 | fu = f(u) # Split by at least dxMin 42 | if fu > fx: 43 | if u < x: a = u 44 | else : b = u 45 | if fu <= fw or w == x: 46 | v = w; fv = fw 47 | w = u; fw = fu 48 | elif fu <= fv or v == x or v == w: 49 | v = u; fv = fu 50 | else: 51 | if u >= x: a = x 52 | else : b = x 53 | v = w; fv = fw 54 | w = x; fw = fx 55 | x = u; fx = fu 56 | fx 57 | 58 | when isMainModule: 59 | when not declared(addFloat): import std/formatfloat 60 | var nEval = 0 61 | proc g(x: float): float = (inc nEval; ln(1.0 + (x - 2.0)*(x - 2.0))) 62 | var x: float 63 | echo "min: ", minBrent(x, 0.0, 100.0, g), " at: ",x," after ",nEval," evals" 64 | -------------------------------------------------------------------------------- /fitl/qtl.nim: -------------------------------------------------------------------------------- 1 | const adx = defined(release) and defined(useAdix) 2 | when adx: import adix/nsort # Usually faster linear-time number sort 3 | else : import std/algorithm 4 | proc rsort*[E](xs: var seq[E]) = ## ~ radix `sort` & `sorted` 5 | when adx: 6 | when E is float32: nsort xs, 0u32, xfFlt 7 | else: nsort xs, 0u64, xfFlt # !float32 => float64 8 | else: sort xs 9 | proc rsorted*[E](xs: openArray[E]): seq[E] = result = xs; rsort result 10 | 11 | proc parzen*[F:SomeFloat, U:SomeFloat](x: openArray[F], q: U): U = 12 | ##[ Return Parzen Qmid Quantile of sorted openArray. Ma, Genton &Parzen 2011: 13 | "Asymptotic properties of sample quantiles of discrete distributions" has more 14 | discussion. Personally, I see this as better than any alternative definition 15 | & also the right generalization of ancient "mid-ranking ties" ideas in rank 16 | correlation calculations (or even Wilcoxon's original 1945 paper). Qmid is 17 | sadly not as widely used|even known as it should be. ]## 18 | let n = x.len; let N = n.float 19 | if n < 1: return 0.0 # NO DATA ALL 20 | let qN = q.float*N # Should be near the answer 21 | if qN <= 0.5: return x[0] # Special case for lower tail 22 | if qN >= N - 0.5: return x[n - 1] # Special case for upper tail 23 | var iL, jL, iH, jH: int # The rank edges below & above `q` 24 | var xL, xH, cL, cH: float # Corresponding x[]'s & mid-ranks 25 | jL = qN.int; iL = jL #XXX Could replace scans with exponen. 26 | xL = x[iL] # ..expanding search for *many* dups. 27 | while iL > 0 and x[iL-1] == xL: dec iL # Make iL index of first x[]==xL 28 | while jL < n and x[jL] == xL: inc jL # Make jL index of last x[]==xL|n 29 | cL = 0.5*float(iL + jL) 30 | if cL <= qN: # Scan higher 31 | iH = jL; jH = iH # NOTE else: x[LH] COULD early return 32 | xH = if jH < n: x[jH] else: xL # Axes EDF-oriented ----/- 33 | while jH < n and x[jH] == xH: inc jH # ^ jH| / 34 | cH = 0.5*float(iH + jH) # |<- ixes go up cH/ 35 | else: # flip low -> high; scan lower # | /| 36 | cH = cL; iH = iL; jH = jL; xH = xL # | / | 37 | jL = iH; iL = jL # | ---/--| 38 | xL = if iL > 0: x[iL - 1] else: xH # |qN ~~~~~~~~~~|~/ iH 39 | while iL>0 and x[iL-1] == xL: dec iL # | jL cL/! xH 40 | cL = 0.5*float(iL + jL) # | /| ! 41 | let r = (qN - cL)/(cH - cL) # | iL ----/-| ^<- answer 42 | return U((1.0 - r)*xL + r*xH) # | xL -----> x goes up 43 | 44 | proc walshAverages*[F: SomeFloat](x: openArray[F]): seq[F] = 45 | ##[ Walsh 1950: Note on a theorem due to Hoeffding; Ann.Math.Stat. 21(1) first 46 | framed discussion of pairwise-including-self averages. ]## 47 | let n = x.len; var k = 0 48 | result.setLen n*(n + 1)div 2 49 | for i in 0.. `p` complement 68 | let nInv = 1.0/float(n) 69 | var last = betaI(0.0, p, q, err) 70 | for i in 0..=2 (eg. ``210``), instead emit THAT many re-samples 108 | ## to stdout. Eg. `|qtl 210|split -nl/21` creates 10 files each w/21 data. 109 | var x: seq[float] 110 | for line in stdin.lines: x.add parseFloat(line.strip) 111 | rsort x 112 | if ps.len == 1 and ps[0] == ps[0].int.float and ps[0] >= 2: 113 | for i in 1 .. int(ps[0]): echo x.quantile(rand(1.0)) 114 | else: 115 | for i, p in ps: 116 | if i > 0: stdout.write " " 117 | stdout.write x.quantile(p, `method`, cbα) 118 | stdout.write '\n' 119 | include cligen/mergeCfgEnv; dispatch qtl, help={ 120 | "method": """ p)arzen 121 | d(HarrellDavis) 122 | l(ParzenQmid-HodgesLehmann) 123 | b(both HD&HL, i.e. HD(Walsh)""", 124 | "cbα" : "DKWM conf.band α"} 125 | -------------------------------------------------------------------------------- /fitl/svdx.nim: -------------------------------------------------------------------------------- 1 | ## Not so rotten self-contained SVD in 42 lines of non-comment/blank code 2 | from std/math import sqrt, copySign; from basicLA import dot, sum0 3 | 4 | proc jacobi[F](a, b, c: F): (F, F) {.inline.} = 5 | if abs(2.0 * c) != 0.0: # return (cos,sin)(Jacobi Angle) 6 | let z = (b - a) / (2.0 * c) 7 | let t = copySign(1.0, z) / (abs(z) + sqrt(1.0 + z * z)) 8 | result[0] = 1.0 / sqrt(1.0 + t * t) 9 | result[1] = -result[0] * t 10 | else: 11 | result[0] = 1.0 # result[1] = 0.0 # cos=1, sin=0; cos^2+sin^2=1 12 | 13 | proc rot2[F](x, y: var F; co, sn: F) {.inline.} = 14 | let (u, v) = (x, y) # Apply 2D Givens Rotation 15 | x = co*u + sn*v 16 | y = -sn*u + co*v 17 | 18 | template xe(i, j: untyped): untyped = xu[i + n*j] # Elt accessor templates 19 | template ve(i, j: untyped): untyped = v[i + m*j] 20 | 21 | proc svdx*[F](xu, s, v: var openArray[F]; n, m: int; tol=1e-6; mxIt=40): F = 22 | ## Thin/econ SVD (factor to u*s*vT) of a general N*M column major matrix xu. 23 | ## xu is replaced by U on return; Cols of U&V are Left&Right singular vectors. 24 | ## s & v must be presized to m and m*m, respectively. v.len==0 skips calc of 25 | ## right singular vectors (e.g. for symm X). tol=max off-diag matrix element 26 | ## (in units of sqrt(Si*Sj)) to converge. Returns largest singular value. 27 | ## NOTE 1: Unsorted - sort[(s,j)..] to find the permutation if you need that. 28 | ## NOTE 2: u*(s>=0)*vT has 2^m sign choices; A flip in v cancels one in u. 29 | let doV = v.len > 0 30 | let t = tol * tol 31 | if doV: 32 | for i in 0 ..< m: ve(i, i) = 1.0 # V = I 33 | for it in 1..mxIt: 34 | var eMx = F(0) # Hestenes Algo which is just cyclic 35 | for i in 0 ..< m: #..Givens Rotations by Jacobi Angles. 36 | for j in i + 1 ..< m: # std upper triangle 37 | let a = dot(xe(0, i).addr, xe(0, i).addr, n) # compute (Xt*xe)_ij 38 | let b = dot(xe(0, j).addr, xe(0, j).addr, n) 39 | let c = dot(xe(0, i).addr, xe(0, j).addr, n) 40 | let e = c * c / (a * b) # a, b, c all >= 0 41 | eMx = max(eMx, e) 42 | if e > t: # Find angle to annihilate off diag elts 43 | let (co,sn) = jacobi(a, b, c) #..and then apply it all the way down u. 44 | for k in 0 ..< n : rot2(xe(k, i), xe(k, j), co, sn) 45 | if doV: # optionally also apply to v cols 46 | for k in 0 ..< m: rot2(ve(k, i), ve(k, j), co, sn) 47 | if eMx < t: break # converged 48 | var sMax: F 49 | for j in 0 ..< m: # compute singular values 50 | s[j] = sqrt(dot(xe(0, j).addr, xe(0, j).addr, n)) 51 | sMax = max(sMax, s[j]) 52 | let sInv = 1.0 / s[j] # also normalize U 53 | for i in 0 ..< n: xe(i, j) *= sInv 54 | sMax 55 | 56 | template toOA(p, n) = toOpenArray[F](p, 0, n - 1) 57 | proc polyFit*[F](b: var seq[F]; x,y,w: openArray[F], m=3, svThr=1e-6, tol=1e-6)= 58 | ## Fill `b` with `m` least-squares-best fit coefs in ascending order by power 59 | ## of polynomial model for y_i(x_i) maybe weighted by w_i. w.len==0 => w_i=1. 60 | ## `svThr` is a singular value threshold in units of the max SV. Smaller SVs 61 | ## are set to 0 for numerical stability even with (near) collinearities. 62 | let n = x.len 63 | if n < m: raise newException(ValueError, "n < m") 64 | if y.len != n: raise newException(ValueError, "y.len != x.len") 65 | if w.len>0 and w.len!=n: raise newException(ValueError, "w.len != x.len") 66 | if m < 1: raise newException(ValueError, "num.coefs m < 1") 67 | b.setLen (1 + 1 + m)*m + n*(m + 1) # Output b, s, v, weighted Design Matrix 68 | var xuy = cast[ptr UncheckedArray[F]](b[(m + 2)*m].addr) 69 | for i in 0.. 0: w[i] else: F(1) 71 | xuy[i + n*0] = val # 0-th power of x[i] is 1.0 72 | for j in 1.. 0: w[i] else: F(1)) # weight y[i] 75 | var s = cast[ptr UncheckedArray[F]](b[m].addr) # skip b 76 | var v = cast[ptr UncheckedArray[F]](b[m+m].addr) # skip b & s 77 | let thr = svThr*svdx(toOA(xuy, n*m), toOA(s, m), toOA(v, m*m), n, m, tol) 78 | for j in 0..= mix: gaussScale*gauss.qtl(rand(1.0)) 13 | else : cauchScale*cauchy.qtl(rand(1.0)) 14 | for j in 0..p: y += float(1)/float(j+1)*x^j 15 | stdout.write &"{y:.8g}" 16 | for j in 0..p: stdout.write &" {x^j:.7g}" 17 | stdout.write '\n' 18 | x += dx 19 | 20 | when isMainModule: 21 | import cligen 22 | dispatch gen, help={ 23 | "x0" : "starting x", 24 | "dx" : "x step", 25 | "n" : "num points", 26 | "p" : "polynomial order", 27 | "mix" : "fraction of Cauchy samples", 28 | "gaussScale": "scale of Gaussian noise", 29 | "cauchScale": "scale of Cauchy noise", 30 | "reseed" : "randomize RNG seed"} 31 | -------------------------------------------------------------------------------- /test/polyf.nim: -------------------------------------------------------------------------------- 1 | # E.g.: nim r test/gen -g0.1 | awk '{print $3,$1}' | nim r test/polyf 2 | when not declared(stdin): import std/[syncio, formatfloat] 3 | import fitl/svdx, std/strutils 4 | 5 | var x, y, w, b: seq[float] 6 | for line in stdin.lines: 7 | let cols = line.split() 8 | x.add parseFloat(cols[0]) 9 | y.add parseFloat(cols[1]) 10 | if cols.len > 2: w.add parseFloat(cols[2]) 11 | 12 | b.polyFit x, y, w 13 | echo b 14 | --------------------------------------------------------------------------------