├── kodhy ├── __init__.py ├── macros.hy └── util.hy ├── .gitignore ├── conftest.py ├── tests └── test_recategorize.hy └── README.rst /kodhy/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | /.pytest_cache 3 | -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- 1 | import hy, pytest 2 | 3 | def pytest_collect_file(file_path, parent): 4 | if file_path.name.startswith('test_') and file_path.suffix == '.hy': 5 | return pytest.Module.from_parent(parent, path = file_path) 6 | -------------------------------------------------------------------------------- /tests/test_recategorize.hy: -------------------------------------------------------------------------------- 1 | (import 2 | pandas :as pd 3 | kodhy.util [recategorize]) 4 | 5 | (defn test-recat [] 6 | (setv x (pd.Series (list "aabaddabcabbdbb") :dtype "category")) 7 | (assert (= (.join "" x.cat.categories) "abcd")) 8 | (setv x (recategorize x 9 | "a" "Z" 10 | "b" "Z" 11 | "d" None 12 | "c" "X")) 13 | (assert (= (.join "" x.cat.categories) "ZX")) 14 | (assert (= (.join "-" (map str x)) "Z-Z-Z-Z-nan-nan-Z-Z-X-Z-Z-Z-nan-Z-Z"))) 15 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | This repository contains many small functions and macros that I use to write `Hy`_ code. Python 3 and bleeding-edge Hy (Git master) are assumed. 2 | 3 | The tests require ``pytest``. Run them by saying ``pytest test``. 4 | 5 | License 6 | ============================================================ 7 | 8 | This program is copyright 2025 Kodi Arfer. 9 | 10 | This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 11 | 12 | This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the `GNU General Public License`_ for more details. 13 | 14 | .. _Hy: http://hylang.org 15 | .. _`GNU General Public License`: http://www.gnu.org/licenses/ 16 | -------------------------------------------------------------------------------- /kodhy/macros.hy: -------------------------------------------------------------------------------- 1 | (require 2 | hyrule [defmacro!]) 3 | 4 | (import 5 | collections 6 | hyrule [coll?]) 7 | 8 | (defmacro incf [expr] 9 | `(+= ~expr 1)) 10 | 11 | (defreader p 12 | ; Pun. 13 | ; #p a => {"a" a} 14 | ; #p [a b c] => {"a" a "b" b "c" c} 15 | (setv expr (.parse-one-form &reader)) 16 | (when (isinstance expr hy.models.Symbol) 17 | (setv expr [expr])) 18 | (hy.models.Dict (+ #* (lfor x expr [(str x) x])))) 19 | 20 | (defmacro lc [vars a1 [a2 None]] 21 | "A more Lispy syntax for list comprehensions. 22 | (lc [x (range 10)] (str x)) 23 | (lc [x (range 10)] (> x 3) (str x))" 24 | `(lfor 25 | ~@vars 26 | ~@(if a2 [:if a1] []) 27 | ~(or a2 a1))) 28 | 29 | (defmacro! rmap [arglist #* expr] 30 | ; (rmap [[i x] (enumerate "xyzzy")] (setv y (.upper x)) (+ (str i) y)) 31 | ; => ["0X" "1Y" "2Z" "3Z" "4Y"] 32 | (setv [lvalue args] arglist) 33 | `(list (map 34 | (fn [~g!arg] 35 | (setv ~lvalue ~g!arg) 36 | ~@expr) 37 | ~args))) 38 | 39 | (defmacro amap [expr args] 40 | "'a' stands for 'anaphoric'." 41 | `(list (map (fn [it] ~expr) ~args))) 42 | 43 | (defmacro filt [expr args] 44 | `(list (filter (fn [it] ~expr) ~args))) 45 | 46 | (defmacro fmap [gen-expr filter-expr args] 47 | `(lfor it ~args :if ~filter-expr ~gen-expr)) 48 | 49 | (defmacro! amap2 [expr args] 50 | ; (amap2 (+ a b) (range 10)) => [1 5 9 13 17] 51 | `(do 52 | (setv ~g!args (list ~args)) 53 | (when (% (len ~g!args) 2) 54 | (raise (ValueError "iterable argument must have an even number of elements"))) 55 | (list (map 56 | (fn [~g!i] 57 | (setv a (get ~g!args ~g!i)) 58 | (setv b (get ~g!args (+ ~g!i 1))) 59 | ~expr) 60 | (range 0 (len ~g!args) 2))))) 61 | 62 | (defmacro! map-dvals [expr d] 63 | `(dict (map 64 | (fn [~g!pair] 65 | (setv it (get ~g!pair 1)) 66 | #((get ~g!pair 0) ~expr)) 67 | (.items ~d)))) 68 | 69 | (defmacro! tally [expr args] 70 | `(do 71 | (setv ~g!n 0) 72 | (for [it ~args] 73 | (when ~expr 74 | (+= ~g!n 1))) 75 | ~g!n)) 76 | 77 | (defmacro afind [expr args] 78 | `(try 79 | (next (filter (fn [it] ~expr) ~args)) 80 | (except [StopIteration] (raise (ValueError "afind: no matching value found"))))) 81 | 82 | (defmacro afind-or [expr args [default None]] 83 | "The default expression 'default' is evaluated (and its value returned) 84 | if no matching value is found." 85 | `(try 86 | (next (filter (fn [it] ~expr) ~args)) 87 | (except [StopIteration] ~default))) 88 | 89 | (defmacro whenn [expr #* body] 90 | "Analogous to Haskell's liftM for Maybe. Evaluates 91 | 'expr' and, if its value is not None, evaluates 'body' with the 92 | value bound to 'it'." 93 | `(do 94 | (setv it ~expr) 95 | (when (is-not it None) 96 | ~@body))) 97 | 98 | (defmacro replicate [n #* body] 99 | `(list (map (fn [_] ~@body) (range ~n)))) 100 | 101 | (defn recur-sym-replace [expr f] (cond 102 | ; Recursive symbol replacement. 103 | (isinstance expr hy.models.Symbol) 104 | (f expr) 105 | (coll? expr) 106 | ((type expr) (amap (recur-sym-replace it f) expr)) 107 | True 108 | expr)) 109 | 110 | (defmacro λ [#* body] 111 | `(fn [it] ~@body)) 112 | 113 | ;(defmacro λ2 [#* body] 114 | ; `(fn [x y] ~@body)) 115 | 116 | (defmacro qw [#* words] 117 | "(qw foo bar baz \"a b\" 5) => ['foo', 'bar', 'baz', 'a b', '5']" 118 | (hy.models.List (gfor w words (hy.models.String 119 | (if (isinstance w #(hy.models.Symbol hy.models.String)) 120 | w 121 | (.lstrip (hy.repr w) "'")))))) 122 | 123 | (defmacro meth [param-list #* body] 124 | "(meth [foo] (+ @bar foo)) => (fn [self foo] (+ self.bar foo))" 125 | (meth-f param-list body)) 126 | 127 | (defmacro cmeth [param-list #* body] 128 | `(classmethod ~(meth-f param-list body))) 129 | 130 | (defn meth-f [param-list body] 131 | `(fn [self ~@param-list] ~@(recur-sym-replace body (fn [sym] (cond 132 | (in sym ["@" "@="]) 133 | sym 134 | (= sym "@@") 135 | 'self 136 | (.startswith sym "@") 137 | `(. self ~@(amap (hy.models.Symbol it) (.split (cut sym 1 None) "."))) 138 | (.startswith sym "is_@") 139 | `(. self ~@(amap (hy.models.Symbol it) (.split (+ "is_" (cut sym (len "is_@") None)) "."))) 140 | True 141 | sym))))) 142 | 143 | (defmacro getl [obj key1 [key2 None] [key3 None]] 144 | ; Given a pd.DataFrame 'mtcars': 145 | ; (getl mtcars "4 Drive" "hp") => the cell "4 Drive", "hp" 146 | ; (getl mtcars "4 Drive") => the row "4 Drive" 147 | ; (getl mtcars : "hp") => the column "hp" 148 | ; (getl mtcars : (: "cyl" "hp")) => columns "cyl" through "hp" 149 | (panda-get 'loc obj key1 key2 key3)) 150 | 151 | (defmacro geti [obj key1 [key2 None] [key3 None]] 152 | (panda-get 'iloc obj key1 key2 key3)) 153 | 154 | (setv COLON :) 155 | (defmacro $ [obj key] 156 | ; Given a pd.DataFrame 'mtcars': 157 | ; ($ mtcars hp) => the column "hp" 158 | `(get ~obj ~(hy.models.String key))) 159 | 160 | (defmacro geta [obj #* keys] 161 | "For numpy arrays." 162 | `(get ~obj #(~@(map parse-key keys)))) 163 | 164 | (defn parse-key [key] 165 | "Keys can be: 166 | : => Empty slice object 167 | (: ...) => slice(...) 168 | anything else => itself" 169 | (cond 170 | (= key :) 171 | '(slice None) 172 | (and (isinstance key hy.models.Expression) (= (get key 0) :)) 173 | `(slice ~@(cut key 1 None)) 174 | True 175 | key)) 176 | 177 | (defn panda-get [attr obj key1 [key2 None] [key3 None]] 178 | `(get (. ~obj ~attr) ~(cond 179 | (is-not key3 None) `#(~(parse-key key1) ~(parse-key key2) ~(parse-key key3)) 180 | (is-not key2 None) `#(~(parse-key key1) ~(parse-key key2)) 181 | True (parse-key key1)))) 182 | 183 | (defn dollar-replace [df-sym expr] 184 | (recur-sym-replace expr (fn [sym] 185 | (if (.startswith sym "$") 186 | (if (= (len sym) 1) 187 | df-sym 188 | (panda-get 'loc df-sym COLON (hy.models.String (cut sym 1 None)))) 189 | sym)))) 190 | 191 | (defmacro wc [df #* body] 192 | "With columns. 193 | (wc df (+ $a $b)) => (+ ($ df a) ($ df b)) 194 | The replacement is recursive. 195 | `$` on its own becomes simply `df`." 196 | (setv df-sym (hy.gensym)) 197 | (setv body (dollar-replace df-sym body)) 198 | `(do (setv ~df-sym ~df) ~@body)) 199 | 200 | (defmacro ss [df #* body] 201 | "Subset. Evaluate `body` like `wc`, which should produce a 202 | boolean vector. Return `df` indexed by the boolean vector." 203 | (setv df-sym (hy.gensym)) 204 | (setv body (dollar-replace df-sym body)) 205 | `(do (setv ~df-sym ~df) (get ~df-sym ~@body))) 206 | 207 | (defmacro ssi [df #* body] 208 | "Subset index. Like `ss`, but returns a list of the indices that 209 | matched." 210 | (setv df-sym (hy.gensym)) 211 | (setv body (dollar-replace df-sym body)) 212 | `(do 213 | (setv ~df-sym ~df) 214 | (.tolist (. (get ~df-sym ~@body) index)))) 215 | 216 | (defmacro ordf [df #* exprs] 217 | "Order data frame. (ordf d (.abs $baz) $bar) sorts first by the 218 | absolute value of the column `baz`, then by `bar`." 219 | (setv [df-sym pd sorting-df] [(hy.gensym) (hy.gensym) (hy.gensym)]) 220 | (setv exprs (dollar-replace df-sym exprs)) 221 | `(do 222 | (setv ~df-sym ~df) 223 | (import pandas :as ~pd) 224 | (setv ~sorting-df (.reset-index (.concat ~pd [~@exprs] 1) None True)) 225 | (geti ~df-sym (. (.sort-values ~sorting-df (list (. ~sorting-df columns))) index)))) 226 | ; ~pd 227 | 228 | (defmacro wcby [df by #* body] 229 | (setv df-sym (hy.gensym) it-sym (hy.gensym)) 230 | `(do 231 | (setv ~df-sym ~df) 232 | (.apply 233 | (.groupby ~df-sym ~(dollar-replace df-sym by)) 234 | (fn [~it-sym] ~@(dollar-replace it-sym body))))) 235 | 236 | (defmacro! cbind [#* args] 237 | `(do 238 | (import kodhy.util [cbind-join :as ~g!cj]) 239 | (~g!cj "outer" ~@(gfor a args 240 | (if (isinstance a hy.models.Keyword) a.name a))))) 241 | 242 | (defmacro cached [expr [bypass 'None] [cache-dir 'None]] 243 | `(do 244 | (import kodhy.util) 245 | (kodhy.util.cached-eval 246 | (kodhy.util.show-expr '~expr) 247 | (fn [] ~expr) 248 | ~bypass 249 | ~cache-dir))) 250 | 251 | (defmacro show-time-elapsed [#* expr] 252 | (setv t (hy.gensym)) 253 | `(do 254 | (setv ~t (hy.I.time.perf-counter)) 255 | (try 256 | ~@expr 257 | (finally 258 | (setv ~t (round (- (hy.I.time.perf-counter) ~t))) 259 | (print "Time elapsed:" #* (gfor 260 | [k v] (.items (dict 261 | :h (// ~t (* 60 60)) 262 | :min (// (% ~t (* 60 60)) 60) 263 | :s (% ~t 60))) 264 | :if v 265 | f"{v} {k}")))))) 266 | -------------------------------------------------------------------------------- /kodhy/util.hy: -------------------------------------------------------------------------------- 1 | (require 2 | hyrule [unless] 3 | kodhy.macros *) 4 | 5 | (import 6 | functools [reduce] 7 | itertools [combinations] 8 | collections.abc [Iterable] 9 | numbers [Number] 10 | hyrule [distinct flatten inc dec coll?] 11 | toolz [first second partition identity]) 12 | 13 | (setv T True) 14 | (setv F False) 15 | 16 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 17 | ;; * Numbers and arrays 18 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 19 | 20 | (defn signum [x] (cond 21 | (< x 0) -1 22 | (> x 0) 1 23 | T 0)) 24 | 25 | (defn product [l] 26 | (setv a 1) 27 | (for [x l] 28 | (*= a x)) 29 | a) 30 | 31 | (defn odds-ratio [p1 p2] 32 | (/ (* p1 (- 1 p2)) (* p2 (- 1 p1)))) 33 | 34 | (defn odds-ratio-on-p [oratio p] 35 | ; How an odds ratio changes a probability. 36 | ; Identity: (= p1 (odds-ratio-on-p (odds-ratio p1 p2) p2)) 37 | (/ (* oratio p) (+ 1 (* p (- oratio 1))))) 38 | 39 | (defn int-binomial [n k] 40 | ; `n` choose `k` as an integer. (scipy.special.binom uses floating point.) 41 | (when (and (= n 0) (!= k 0)) 42 | (return 0)) 43 | (when (= k 0) 44 | (return 1)) 45 | (// 46 | (reduce * (seq (- n (dec k)) n)) 47 | (reduce * (seq 1 k)))) 48 | 49 | (defn frac-binom-pmf [k n p] 50 | ; Probability mass function of the binomial distribution, returning 51 | ; a Fraction so long as `p` is a Fraction. 52 | ; The output is the probability of exactly `k` successes among `n` 53 | ; independent Bernoulli trials, each with probability `p`. 54 | (* 55 | (int-binomial n k) 56 | (** p k) 57 | (** (- 1 p) (- n k)))) 58 | 59 | (defn logit [x] 60 | (import numpy) 61 | (numpy.log (/ x (- 1 x)))) 62 | 63 | (defn ilogit [x] 64 | (import numpy) 65 | (/ 1 (+ 1 (numpy.exp (- x))))) 66 | 67 | (defn zscore [x] 68 | (/ (- x (.mean x)) (.std x :ddof 0))) 69 | 70 | (defn hzscore [x] 71 | "Half the z-score. Divides by two SDs instead of one, per: 72 | Gelman, A. (2008). Scaling regression inputs by dividing by two standard deviations. Statistics in Medicine, 27(15), 2865–2873. doi:10.1002/sim.3107" 73 | (/ (- x (.mean x)) (* 2 (.std x :ddof 0)))) 74 | 75 | (defn rmse [v1 v2] 76 | "Root mean square error." 77 | (import numpy :as np) 78 | (np.sqrt (np.mean (** (- v1 v2) 2)))) 79 | 80 | (defn mean-ad [v1 v2] 81 | "Mean absolute deviation." 82 | (import numpy :as np) 83 | (np.mean (np.abs (- v1 v2)))) 84 | 85 | (defn jitter [v [factor 100]] 86 | (import numpy :as np) 87 | (setv b (/ (- (.max v) (.min v)) (* 2 factor))) 88 | (+ v (np.random.uniform (- b) b (len v)))) 89 | 90 | (defn valcounts [x [y None]] 91 | (import pandas :as pd) 92 | 93 | (setv [x y] (gfor v [x y] (cond 94 | (is v None) 95 | v 96 | (isinstance v pd.Series) 97 | (if (and (= v.dtype "category") (.any (pd.isnull v))) 98 | (.fillna (.cat.add-categories v "~N/A") "~N/A") 99 | v) 100 | True 101 | (pd.Series (list v))))) 102 | 103 | (if (is y None) 104 | (.rename 105 | ((if (in "float" (str x.dtype)) (fn [x] (.sort-index x)) identity) 106 | (.value-counts x :sort F :dropna F)) 107 | (λ (if (pd.isnull it) "~N/A" it))) 108 | (pd.crosstab 109 | (.fillna x "~N/A") (.fillna y "~N/A")))) 110 | 111 | (defn valprops [x [y None] [d None]] 112 | (setv out (valcounts x y)) 113 | (setv out (/ out (.sum out))) 114 | (if (is d None) out (.round out :decimals d))) 115 | 116 | (defn weighted-choice [l] 117 | ; The argument should be a list of (weight, object) pairs. 118 | ; http://stackoverflow.com/a/3679747 119 | (import random) 120 | (setv r (random.uniform 0 (sum (map first l)))) 121 | (for [[w x] l] 122 | (-= r w) 123 | (when (<= r 0) 124 | (break))) 125 | x) 126 | 127 | (defn pds-from-pairs [l #** kwargs] 128 | (import pandas :as pd) 129 | (setv l (list l)) 130 | (pd.Series (amap (second it) l) (amap (first it) l) #** kwargs)) 131 | 132 | (defn pd-posix-time [series] 133 | (import numpy :as np) 134 | (// (.astype series np.int64) (int 1e9))) 135 | 136 | (defn pd-rename-cats [series f-or-dict] 137 | (.cat.rename-categories series (amap 138 | (if (callable f-or-dict) 139 | (f-or-dict it) 140 | (.get f-or-dict it it)) 141 | series.cat.categories))) 142 | 143 | (defn recategorize [x #* kv] 144 | (import pandas :as pd) 145 | (setv kv (list (partition 2 kv))) 146 | (setv d (dict kv)) 147 | (unless (= (sorted x.cat.categories) (sorted (.keys d))) 148 | (raise (ValueError "original categories don't match"))) 149 | (.astype 150 | (.replace (.astype x object) d) 151 | (pd.api.types.CategoricalDtype :categories (distinct 152 | (gfor 153 | [_ v] kv 154 | :if (not (pd.isnull v)) 155 | v))))) 156 | 157 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 158 | ;; * Matrices and DataFrames 159 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 160 | 161 | (defn cbind-join [join #* args] 162 | (import pandas :as pd) 163 | (setv args (list args)) 164 | (setv final-index None) 165 | (when (and (isinstance (first args) str) (= (first args) "I")) 166 | (shift args) 167 | (setv final-index (shift args))) 168 | (defn scalar? [x] 169 | (or (isinstance x str) (not (isinstance x Iterable)))) 170 | (setv height (max (gfor 171 | a args 172 | (if (scalar? a) 1 (len a))))) 173 | 174 | (setv chunks [] assume-index final-index) 175 | (while args 176 | (setv name None x (shift args)) 177 | (when (isinstance x str) 178 | (setv name x x (shift args))) 179 | (when (and 180 | (is assume-index None) 181 | (isinstance x #(pd.Series pd.DataFrame))) 182 | (setv assume-index x.index)) 183 | (setv x (cond 184 | (isinstance x #(pd.Series pd.DataFrame)) 185 | (if name (.copy x) x) 186 | (scalar? x) 187 | (pd.Series (* [x] height) :index assume-index) 188 | T 189 | (pd.Series x))) 190 | (when name 191 | (setv x.name name)) 192 | (.append chunks x)) 193 | 194 | (setv result (pd.concat :objs chunks :axis 1 :join join)) 195 | (unless (is final-index None) 196 | (setv result.index final-index)) 197 | result) 198 | 199 | (defn df-from-pairs [l] 200 | (import pandas :as pd) 201 | (setv d (pd.DataFrame (lc [row l] (lc [[_ v] row] v)))) 202 | (setv d.columns (amap (first it) (first l))) 203 | d) 204 | 205 | (defn drop-unused-cats [d [inplace F]] 206 | ; Drops unused categories from all categorical columns. 207 | ; Can also be applied to a Series. 208 | (import pandas :as pd) 209 | (unless inplace 210 | (setv d (.copy d))) 211 | (for [[_ col] (if (isinstance d pd.Series) [[None d]] (.iteritems d))] 212 | (when (hasattr col "cat") 213 | (.remove-unused-categories col.cat :inplace T))) 214 | d) 215 | 216 | (defn cols2map [d k-col v-col [keep None]] 217 | (setv d (getl d : [k-col v-col])) 218 | (when keep 219 | (setv d (.drop-duplicates d :keep keep))) 220 | (setv d (.set-index d k-col :verify-integrity (not keep))) 221 | (geti d : 0)) 222 | 223 | (defn -number-format [x f] 224 | (import numpy :as np pandas :as pd) 225 | (cond 226 | (isinstance x pd.DataFrame) (do 227 | (setv x (.copy x)) 228 | (for [r (range (first x.shape))] 229 | (for [c (range (second x.shape))] 230 | (setv (geti x r c) (-number-format (geti x r c) f)))) 231 | x) 232 | (isinstance x pd.Series) (do 233 | (setv x (.copy x)) 234 | (for [i (range (len x))] 235 | (setv (geti x i) (-number-format (geti x i) f))) 236 | x) 237 | (isinstance x np.ndarray) 238 | (f x) 239 | (coll? x) 240 | ((type x) (amap (-number-format it f) x)) 241 | (isinstance x Number) 242 | (f x) 243 | T 244 | x)) 245 | 246 | (defn rd [a1 [a2 None]] 247 | "Round for display. Takes just a number, array, Series, DataFrame, 248 | or other collection, or both a number of digits to round to and 249 | such an object." 250 | (import numpy :as np) 251 | (setv [x digits] (if (is a2 None) [a1 3] [a2 a1])) 252 | (-number-format x (fn [v] (np.round v digits)))) 253 | 254 | (defn thousep [x] 255 | (import numpy :as np) 256 | (setv vec-f (np.vectorize (fn [v] (format v ",")))) 257 | (-number-format x (fn [v] (if (isinstance v np.ndarray) (vec-f v) (format v ","))))) 258 | 259 | (defn with-1o-interacts [m [column-names None]] 260 | "Given a data matrix m, return a matrix with a new column 261 | for each first-order interaction. Constant columns are removed." 262 | (import numpy :as np itertools [combinations]) 263 | (when column-names 264 | (assert (= (len column-names) (second m.shape)))) 265 | (setv [new-names new-cols] (zip #* (filt 266 | (not (.all (= (second it) (first (second it))))) 267 | (lc [[v1 v2] (combinations (range (second m.shape)) 2)] (, 268 | (when column-names 269 | (tuple (sorted [(get column-names v1) (get column-names v2)]))) 270 | (np.multiply (geta m : v1) (geta m : v2))))))) 271 | (setv new-m (np.column-stack (+ #(m) new-cols))) 272 | (if column-names 273 | #(new-m (+ (tuple column-names) new-names)) 274 | new-m)) 275 | 276 | (defn print-big-pd [obj] 277 | (import pandas :as pd) 278 | (with [(pd.option-context 279 | "display.max_rows" (int 5000) 280 | "display.max_columns" (int 100) 281 | "display.width" (int 1000) 282 | "display.max_colwidth" (int 500))] 283 | (print obj))) 284 | 285 | (defn pd-to-pretty-json [path df] 286 | ; Serializes a Pandas dataframe to an obvious-looking JSON format. 287 | ; Information about categorial columns is saved as metadata. 288 | (import math [isnan] numpy :as np collections [OrderedDict]) 289 | (setv out (OrderedDict)) 290 | 291 | (setv (get out "categories") (OrderedDict (rmap [col (ssi df.dtypes (= $ "category"))] 292 | [col (OrderedDict [ 293 | #("ordered" (. (getl df : col) cat ordered)) 294 | #("categories" (list (. (getl df : col) cat categories)))])]))) 295 | (unless (get out "categories") 296 | (del (get out "categories"))) 297 | 298 | (setv cols (list df.columns)) 299 | (setv table (.astype df.values object)) 300 | (setv table ((np.vectorize :otypes [object] 301 | (λ (if (and (isinstance it float) (isnan it)) None it))) table)) 302 | (setv (get out "first_col_is_row_labels") F) 303 | (when (or df.index.name 304 | (not (.all (= df.index (list (range (len df))))))) 305 | ; We only include the index as a column if it has a name or 306 | ; is something other than consecutive integers starting from 307 | ; 0. 308 | (setv (get out "first_col_is_row_labels") T) 309 | (setv table (np.column-stack [df.index table])) 310 | (setv cols (+ [df.index.name] cols))) 311 | (setv (get out "table") (+ [cols] (.tolist table))) 312 | 313 | (setv jstr (json-dumps-pretty out :sort-keys F)) 314 | (if path (barf path jstr) jstr)) 315 | 316 | (defn pretty-json-to-pd [path] 317 | (import json pandas :as pd) 318 | (setv j (json.loads (slurp path))) 319 | (setv df (pd.DataFrame (cut (get j "table") 1 None) 320 | :columns (get j "table" 0))) 321 | (when (get j "first_col_is_row_labels") 322 | (setv df (.set-index df (first df.columns)))) 323 | (for [[catcol meta] (.items (.get j "categories" {}))] 324 | (setv (getl df : catcol) (.astype (getl df : catcol) 325 | (pd.api.types.CategoricalDtype #** meta)))) 326 | df) 327 | 328 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 329 | ;; * Strings 330 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 331 | 332 | (defn cat [#* args #** kwargs] 333 | (.join 334 | (or (.get kwargs "sep") "") 335 | (lc [x args] x (string x)))) 336 | 337 | (defn ucfirst [s] 338 | (and s (+ (.upper (first s)) (cut s 1 None)))) 339 | 340 | (defn double-quote [s] 341 | (.format "\"{}\"" 342 | (.replace (.replace s "\\" "\\\\") "\"" "\\\""))) 343 | 344 | (defn show-expr [x] 345 | "Stringify Hy expressions to a fairly pretty form, albeit 346 | without newlines outside string literals." 347 | (cond 348 | (isinstance x hy.models.Expression) 349 | (.format "({})" (.join " " (list (map show-expr x)))) 350 | (isinstance x hy.models.Dict) 351 | (.format "{{{}}}" (.join " " (list (map show-expr x)))) 352 | (isinstance x hy.models.Keyword) 353 | (+ ":" x.name) 354 | (isinstance x hy.models.Symbol) 355 | (str x) 356 | (isinstance x list) 357 | (.format "[{}]" (.join " " (list (map show-expr x)))) 358 | (isinstance x tuple) 359 | (.format "#({})" (.join " " (list (map show-expr x)))) 360 | (isinstance x str) 361 | (double-quote (str x)) 362 | T 363 | (str x))) 364 | 365 | (defn keyword->str [x] 366 | (if (isinstance x hy.models.Keyword) 367 | x.name 368 | x)) 369 | 370 | (defn str->keyword [x] 371 | (if (isinstance x hy.models.Keyword) 372 | x 373 | (hy.models.Keyword x))) 374 | 375 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 376 | ;; * Lists and other basic data structures 377 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 378 | 379 | (defn concat [ll] 380 | (reduce (fn [accum x] (+ accum x)) ll [])) 381 | 382 | (defn merge-dicts [#* ds] 383 | (setv out {}) 384 | (for [d ds] 385 | (.update out d)) 386 | out) 387 | 388 | (defn seq [lo hi [step 1]] 389 | (list (range lo (+ hi step) step))) 390 | 391 | (defn shift [l] 392 | (.pop l 0)) 393 | 394 | (defn all-unique? [l] 395 | (setv seen (set)) 396 | (for [x l] 397 | (when (in x seen) 398 | (return False)) 399 | (.add seen x)) 400 | True) 401 | 402 | (defn mins [iterable [key (λ it)] [comparator-fn None] [agg-fn min]] 403 | ; Returns a list of minimizing values of the iterable, 404 | ; in their original order. 405 | (unless comparator-fn 406 | (import operator) 407 | (setv comparator-fn operator.le)) 408 | (setv items (list iterable)) 409 | (unless items 410 | (return [])) 411 | (setv vals (list (map key items))) 412 | (setv vm (agg-fn vals)) 413 | (lc [[item val] (zip items vals)] 414 | (comparator-fn val vm) 415 | item)) 416 | 417 | (defn maxes [iterable [key (λ it)]] 418 | (import operator) 419 | (mins iterable key operator.ge max)) 420 | 421 | (defn rget [obj regex] 422 | (import re) 423 | (setv regex (re.compile regex)) 424 | (setv keys (filt (.search regex it) (.keys obj))) 425 | (cond 426 | (> (len keys) 1) 427 | (raise (LookupError "Ambiguous matcher")) 428 | (= (len keys) 0) 429 | (raise (LookupError "No match")) 430 | T 431 | (get obj (get keys 0)))) 432 | 433 | (defn pairs [#* a] 434 | (setv a (list a)) 435 | (setv r []) 436 | (while a 437 | (.append r #((keyword->str (shift a)) (keyword->str (shift a))))) 438 | r) 439 | 440 | (defn by-ns [n iterable] 441 | (zip #* (* [(iter iterable)] n))) 442 | (defn by-2s [iterable] 443 | (by-ns 2 iterable)) 444 | 445 | (defn iter-with-prev [iterable] 446 | (setv prev None) 447 | (for [item iterable] 448 | (yield #(prev item)) 449 | (setv prev item))) 450 | 451 | (defn iter-with-prev1 [iterable] 452 | ; Like iter-with-prev, but skips the first pair, which has 453 | ; None for the previous value. 454 | (import itertools) 455 | (itertools.islice (iter-with-prev iterable) 1 None)) 456 | 457 | (defn c+1 [counter key] 458 | (+= (get counter key) 1)) 459 | 460 | (defclass ad [dict] 461 | "Attribute dictionary. A dictionary with which you can set, get, and 462 | delete items as if they were attributes." 463 | (defn __getattr__ [self k] 464 | (get self k)) 465 | (defn __setattr__ [self k v] 466 | (setv (get self k) v)) 467 | (defn __delattr__ [self k] 468 | (del (get self k)))) 469 | 470 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 471 | ;; * Files 472 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 473 | 474 | (defn slurp [name [mode None] [encoding None] [buffering None]] 475 | (setv f open) 476 | (when encoding 477 | (import codecs) 478 | (setv f codecs.open)) 479 | (with [o (f name #** (dict (+ 480 | (if (is mode None) [] [#("mode" mode)]) 481 | (if (is encoding None) [] [#("encoding" encoding)]) 482 | (if (is buffering None) [] [#("buffering" buffering)]))))] 483 | (o.read))) 484 | 485 | (defn barf [name content [mode "w"] [encoding None] [buffering None]] 486 | (setv f open) 487 | (when encoding 488 | (import codecs) 489 | (setv f codecs.open)) 490 | (with [o (f name mode #** (dict (+ 491 | (if (is encoding None) [] [#("encoding" encoding)]) 492 | (if (is buffering None) [] [#("buffering" buffering)]))))] 493 | (o.write content))) 494 | 495 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 496 | ;; * JSON 497 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 498 | 499 | (defn json-dumps-pretty [o #** kwargs] 500 | ; Like json.dumps, but arrays or objects of atomic values are 501 | ; printed without internal indents, and with different 502 | ; option defaults. 503 | (import json uuid) 504 | (for [[option value] [ 505 | ["indent" 2] ["separators" #("," ": ")] ["sort_keys" T]]] 506 | (when (is (.get kwargs option) None) 507 | (setv (get kwargs option) value))) 508 | (setv substituted-parts {}) 509 | (defn recursive-subst [x] 510 | ; Replaces lists or dictionaries of atomic values with UUID 511 | ; strings. 512 | (if (isinstance x #(list tuple dict)) 513 | (if (all (lc [v (if (isinstance x dict) (.values x) x)] 514 | (isinstance v #(bool (type None) int float str bytes)))) 515 | (do 516 | (setv my-id (. (uuid.uuid4) hex)) 517 | (setv (get substituted-parts my-id) x) 518 | my-id) 519 | (if (isinstance x dict) 520 | ((type x) (lc [[k v] (.items x)] #(k (recursive-subst v)))) 521 | (lc [v x] (recursive-subst v)))) 522 | x)) 523 | (setv json-str (json.dumps (recursive-subst o) #** kwargs)) 524 | (setv (get kwargs "indent") None) 525 | (setv (get kwargs "separators") #(", " ": ")) 526 | (for [[my-id x] (.items substituted-parts)] 527 | (setv json-str (.replace json-str (+ "\"" my-id "\"") 528 | (.rstrip (json.dumps x #** kwargs)) 529 | 1))) 530 | json-str) 531 | 532 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 533 | ;; * Cross-validation 534 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 535 | 536 | (defn kfold-cv-pred [x y f [k-folds 10] [shuffle T] [random-state None] [folds None]] 537 | "Return a np of predictions of y given x using f. x is expected to be 538 | a numpy matrix, not a pandas DataFrame. 539 | 540 | f will generally be of the form (fn [x-train y-train x-test] ...), 541 | and should return a 1D nparray of predictions given x-test." 542 | (import numpy :as np) 543 | (setv y-pred None) 544 | (unless folds 545 | (setv folds (cut 546 | (np.tile (np.arange k-folds) (int (np.ceil (/ (len x) k-folds)))) 547 | (len x))) 548 | (.shuffle (np.random.default-rng random-state) folds) 549 | (setv folds (lfor 550 | i (range k-folds) 551 | [(np.where (!= folds i)) (np.where (= folds i))]))) 552 | (for [[train-i test-i] folds] 553 | (setv result (f (get x train-i) (get y train-i) (get x test-i))) 554 | (when (is y-pred None) 555 | (setv y-pred (np.empty-like y :dtype result.dtype))) 556 | (setv (get y-pred test-i) result)) 557 | y-pred) 558 | 559 | (defn choose-labeled-cv-folds [subjects labels bin-label-possibilities] 560 | ; Put the subjects into cross-validation folds such that all the 561 | ; subjects with a given label are in the same fold. 562 | ; bin-label-possibilities should be the return value of 563 | ; bin-labels. 564 | (import random [choice shuffle] collections [Counter]) 565 | (setv group-sizes (Counter labels)) 566 | (setv bins (list (choice bin-label-possibilities))) 567 | (shuffle bins) 568 | (amap 569 | (concat (amap 570 | (do 571 | (setv target-size it) 572 | (setv label (first (afind (= (second it) target-size) (.items group-sizes)))) 573 | (del (get group-sizes label)) 574 | (lc [[s l] (zip subjects labels)] (= l label) s)) 575 | it)) 576 | bins)) 577 | 578 | (defn bin-labels [labels [n-bins 10] [max-bin-size None]] 579 | ; A routine to prepare input for choose-labeled-cv-folds. 580 | ; 581 | ; Finds ways to sort a list of label objects (which is 582 | ; expected to have lots of duplicates) into a fixed number 583 | ; bins in a way that gets the bin sizes as close to equal 584 | ; as possible. This is just the multiprocessor-scheduling problem 585 | ; ( https://en.wikipedia.org/wiki/Multiprocessor_scheduling ) 586 | ; with a different metric to optimize. 587 | ; 588 | ; The algorithm simply enumerates all possibilities, so it will 589 | ; will be too slow with large inputs. 590 | ; 591 | ; Returns a tuple of possibilities, each of which is a tuple of 592 | ; bins. Each bin is a tuple of numbers representing the size 593 | ; of a labelled group. 594 | 595 | (import collections [Counter]) 596 | 597 | (setv initial-state (, 598 | (tuple (sorted (.values (Counter labels)))) 599 | (* #((,)) n-bins))) 600 | 601 | (setv states (set [initial-state])) 602 | (setv explore [initial-state]) 603 | (setv iteration 0) 604 | (setv explore-len-was (len explore)) 605 | 606 | (print "Generating possible arrangements") 607 | (while explore 608 | (+= iteration 1) 609 | (unless (% iteration 10000) 610 | (print (len explore) (.format "({:+d})" (- (len explore) explore-len-was))) 611 | (setv explore-len-was (len explore))) 612 | (setv [remaining bins] (.pop explore 0)) 613 | (setv rseen (set)) 614 | (for [r-i (range (len remaining))] 615 | (setv x (get remaining r-i)) 616 | (when (in x rseen) 617 | (continue)) 618 | (.add rseen x) 619 | (setv new-remaining (+ (cut remaining 0 r-i) (cut remaining (+ r-i 1) None))) 620 | (for [b-i (range n-bins)] 621 | (setv new-bin (+ (get bins b-i) #(x))) 622 | (when (and max-bin-size (> (sum new-bin) max-bin-size)) 623 | (continue)) 624 | (setv new-bins (tuple (sorted (+ 625 | (cut bins b-i) 626 | #((tuple (sorted new-bin))) 627 | (cut bins (+ b-i 1) None))))) 628 | (setv new-state #(new-remaining new-bins)) 629 | (when (in new-state states) 630 | (continue)) 631 | (.add states new-state) 632 | (.append explore new-state)))) 633 | 634 | (print "Finding minima") 635 | (setv target-bin-size (/ (len labels) n-bins)) 636 | (mins 637 | (lc [[remaining bins] states] (and (not remaining) (all bins)) bins) 638 | (λ (sum (amap (** (- (sum it) target-bin-size) 2) it))))) 639 | 640 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 641 | ;; * Caching 642 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 643 | 644 | (setv _default-cache-dir (do 645 | (import os.path) 646 | (os.path.join (os.path.expanduser "~") ".daylight" "py-cache"))) 647 | 648 | (defn cached-eval [key f [bypass None] [cache-dir None]] 649 | "Call `f`, caching the value with the string `key`. If `bypass` 650 | is provided, its value is written to the cache and returned 651 | instead of calling `f` or consulting the existing cache." 652 | (import pickle hashlib base64 os errno time) 653 | ; (unless (os.path.exists cache-dir) 654 | ; (os.makedirs cache-dir)) 655 | (unless cache-dir 656 | (setv cache-dir _default-cache-dir)) 657 | (setv basename (str (cut 658 | (base64.b64encode 659 | (.digest (hashlib.md5 (.encode key "UTF8"))) 660 | (.encode "+_" "ASCII")) 661 | 0 -2) "ASCII")) 662 | (setv path (os.path.join cache-dir basename)) 663 | (setv value bypass) 664 | (setv write-value T) 665 | (when (is value None) 666 | (if (os.path.exists path) 667 | (do 668 | (setv value (with [o (open path "rb")] 669 | (get (pickle.load o) "value"))) 670 | (setv write-value F)) 671 | (setv value (f)))) 672 | (when write-value 673 | (setv d { 674 | "basename" basename 675 | "key" key 676 | "value" value 677 | "time" (time.time)}) 678 | (with [o (open path "wb")] 679 | (pickle.dump d o pickle.HIGHEST-PROTOCOL))) 680 | value) 681 | 682 | (defn show-cache [[cache-dir _default-cache-dir] [pretty T] [regex ""]] 683 | "Prints the caches of 'cached-eval' in chronological order." 684 | (import re pickle os os.path datetime) 685 | (setv regex (re.compile regex)) 686 | (setv items 687 | (sorted :key (λ (get it "time")) 688 | (amap (with [o (open (os.path.join cache-dir it) "rb")] 689 | (pickle.load o)) 690 | (filt (os.path.isfile it) 691 | (os.listdir cache-dir))))) 692 | (for [item items] 693 | (unless (.search regex (get item "key")) 694 | (continue)) 695 | (unless pretty 696 | (print (get item "basename")) 697 | (continue)) 698 | (print "Basename:" (get item "basename")) 699 | (print "Date:" (.strftime 700 | (datetime.datetime.fromtimestamp (get item "time")) 701 | "%-d %b %Y, %-I:%M:%S %p")) 702 | (print (.format "Size: {:,}" (. 703 | (os.stat (os.path.join cache-dir (get item "basename"))) 704 | st_size))) 705 | (print "Key:" (get item "key")) 706 | (print)) 707 | None) 708 | 709 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 710 | ;; * Support for Tversky 711 | ;; https://github.com/Kodiologist/Tversky 712 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 713 | 714 | (defn tversky-format-s [s] 715 | (amap (.format "s{:03d}" it) s)) 716 | 717 | (defn unpack-tversky [db-path 718 | [include-incomplete T] 719 | [exclude-sns None]] 720 | (import sqlite3 pandas :as pd) 721 | (try 722 | (do 723 | (setv db (sqlite3.connect db-path)) 724 | (.execute db "pragma foreign_keys = on") 725 | 726 | (setv sb (pd.read-sql-query :con db :index-col "sn" 727 | :parse-dates (dfor k (qw consented_t began_t completed_t) 728 | k (dict :unit "s" :utc T)) 729 | "select 730 | sn, experimenter, ip, task, 731 | consented_t, 732 | began_t, 733 | case when completed_t = 'assumed' then null else completed_t end 734 | as completed_t, 735 | MTurk.hitid as hit, 736 | MTurk.assignmentid as asgmt, 737 | MTurk.workerid as worker 738 | from Subjects 739 | left join 740 | (select sn, min(first_sent) as began_t from Timing group by sn) 741 | using (sn) 742 | left join MTurk using (sn)")) 743 | ; Make some columns categorical, with the levels ordered 744 | ; chronologically. 745 | (for [c (qw experimenter ip hit task)] 746 | (setv (getl sb : c) (pd.Categorical 747 | (getl sb : c) 748 | :ordered T 749 | :categories (list (.unique (.dropna (getl (.sort-values sb "began_t") : c))))))) 750 | (setv ($ sb tv) (+ 1 (. ($ sb task) cat codes))) 751 | ; "tv" for "task version". 752 | (when exclude-sns 753 | (setv sb (.drop sb exclude-sns))) 754 | (unless include-incomplete 755 | (setv sb (.dropna sb :subset ["completed_t"]))) 756 | 757 | (.execute db "create temporary table IncludeSN(sn integer primary key)") 758 | (.executemany db "insert into IncludeSN (sn) values (?)" (amap #(it) sb.index)) 759 | 760 | (setv dat (.sort-index (geti (pd.read-sql-query :con db :index-col ["sn" "k"] 761 | "select * from D where sn in (select * from IncludeSN)") : 0))) 762 | 763 | (setv timing (.sort-index (pd.read-sql-query :con db :index-col ["sn" "k"] 764 | :parse-dates (dfor k (qw first_sent received) 765 | k (dict :unit "s" :utc T)) 766 | "select * from Timing where sn in (select * from IncludeSN)"))) 767 | 768 | (setv sb.index (tversky-format-s sb.index)) 769 | (for [df [dat timing]] 770 | (setv df.index (.set-levels df.index :level "sn" 771 | (tversky-format-s (first df.index.levels))))) 772 | 773 | #(sb dat timing)) 774 | 775 | (finally 776 | (.close db)))) 777 | 778 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 779 | ;; * Interoperability with R 780 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 781 | 782 | (setv _Rproc None) 783 | 784 | (defn _R-setup [] 785 | (import pyper) 786 | (global _Rproc) 787 | (when (is _Rproc None) 788 | (setv _Rproc (pyper.R)))) 789 | 790 | (defn R-run [expr] 791 | (_R-setup) 792 | (.run _Rproc expr)) 793 | 794 | (defn R-assign [lvalue rvalue] 795 | (_R-setup) 796 | (.assign _Rproc lvalue rvalue)) 797 | 798 | (defn R-get [expr] 799 | (_R-setup) 800 | (.get _Rproc expr)) 801 | 802 | (defn R-call [fn-expr #* args [print-it True] #** kwargs] 803 | (import collections [OrderedDict]) 804 | (_R-setup) 805 | (setv arg-string "") 806 | (for [[i a] (enumerate (+ args (tuple (sorted (.items kwargs)))))] 807 | (setv kw (>= i (len args))) 808 | (setv value (if kw (second a) a)) 809 | (if (and (is (type value) list) (= (first value) :raw)) 810 | (setv expr (second value)) 811 | (do 812 | (setv expr (+ "Kodhy_arg_" (str i))) 813 | (.assign _Rproc expr value) 814 | (when (in "DataFrame" (str (type value))) 815 | ; Make sure each Categorical Series becomes a factor, with 816 | ; the correct levels. 817 | (for [j (range (second value.shape))] 818 | (when (in "category" (str (get value.dtypes j))) 819 | (setv v (geti value : j)) 820 | (unless (all (gfor c v.cat.categories (isinstance c str))) 821 | (raise (ValueError "Only string levels are allowed in Categoricals"))) 822 | (.run _Rproc (.format 823 | "{}[,{}] = factor({}[,{}], levels = c({}))" 824 | expr (inc j) expr (inc j) 825 | (.join "," (map double-quote v.cat.categories))))))))) 826 | ; (if (in "DataFrame" (str (type value))) 827 | ; (do 828 | ; ; Work around a bug in Pyper where large DataFrames don't 829 | ; ; get assigned for some reason. The workaround is not very 830 | ; ; general, but should work for Comorbid. 831 | ; (.assign _Rproc variable (.as-matrix value)) 832 | ; (.run _Rproc (.format "{} = data.frame({})" variable variable)) 833 | ; (.assign _Rproc (.format "colnames({})" variable) value.columns.values)) 834 | ; (.assign _Rproc variable value)) 835 | (when i 836 | (+= arg-string ", ")) 837 | (when kw 838 | (+= arg-string (+ (first a) " = "))) 839 | (+= arg-string expr)) 840 | ((if print-it print identity) 841 | (.run _Rproc (.format "Kodhy_out = ({})({})" fn-expr arg-string))) 842 | (.get _Rproc "Kodhy_out")) 843 | 844 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 845 | ;; * Plotting 846 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 847 | 848 | (defn dotplot [xs [diam 1] [group None] [ax None] #** kwargs] 849 | "A plot of 1D data where each value is represented as a circle, 850 | and circles that would overlap are stacked vertically, a bit 851 | like a histogram. Missing values are silently ignored." 852 | 853 | (import 854 | matplotlib.pyplot :as plt 855 | matplotlib.collections [PatchCollection] 856 | numpy [isnan] 857 | toolz [unique]) 858 | 859 | (when (is group None) 860 | (setv group (* #(True) (len xs)))) 861 | (assert (= (len group) (len xs))) 862 | (setv levels (list (unique group))) 863 | (setv group-vert-space (* 3 diam)) 864 | 865 | (setv rows (dfor l levels l [])) 866 | (for [[x g] (sorted (gfor 867 | [x g] (zip xs group) 868 | :if (and (not (isnan x)) (is-not x None)) [x g]))] 869 | (for [row (get rows g)] 870 | (when (>= (- x (get row -1)) diam) 871 | (.append row x) 872 | (break)) 873 | (else 874 | (.append (get rows g) [x])))) 875 | 876 | (setv x (flatten (.values rows))) 877 | 878 | (setv yg (lfor rs (.values rows) (flatten (gfor 879 | [n row] (enumerate rs) 880 | (* [(* diam (+ n .5))] (len row)))))) 881 | (setv ybumps [0]) 882 | (for [i (range 1 (len yg))] 883 | (.append ybumps (+ (max (get yg (dec i))) group-vert-space)) 884 | (setv (get yg i) (lfor v (get yg i) (+ v (get ybumps -1))))) 885 | (setv y (flatten yg)) 886 | 887 | (unless ax 888 | (setv ax (plt.gca))) 889 | (.set-aspect ax "equal") 890 | (for [side (qw left right top)] 891 | (.set-visible (get ax.spines side) F)) 892 | (.set-xlim ax (- (min x) diam) (+ (max x) diam)) 893 | (.set-ylim ax 0 (+ (max y) diam)) 894 | (.tick-params ax :left F :labelleft F) 895 | 896 | ; Now add the visible markers. 897 | (unless (in "color" kwargs) 898 | (setv (get kwargs "color") "black")) 899 | (.add-collection ax (PatchCollection #** kwargs 900 | (if (.pop kwargs "rect" F) 901 | (lfor [x0 y0] (zip x y) 902 | (plt.Rectangle #((- x0 (/ diam 2)) (- y0 (/ diam 2))) diam diam)) 903 | (lfor [x0 y0] (zip x y) 904 | (plt.Circle #(x0 y0) (/ diam 2)))))) 905 | 906 | ; Add the level labels. 907 | (when (> (len levels) 1) 908 | (for [[i level] (enumerate levels)] 909 | (.text ax (- (min x) (* 2 diam)) (get ybumps i) (str level) 910 | :horizontalalignment "right")))) 911 | 912 | (defn rectplot [xs [diam 1] [ax None] #** kwargs] 913 | "`dotplot` using rectangles instead of circles." 914 | (setv (get kwargs "rect") T) 915 | (dotplot xs diam ax #** kwargs)) 916 | 917 | (defn density-plot [xs [bw None] [lo None] [hi None] [steps 257] [ax None] #** kwargs] 918 | ; The default `steps` is chosen to be 1 plus a power of 2. 919 | 920 | (import 921 | numpy :as np 922 | scipy.stats :as scist 923 | matplotlib.pyplot :as plt) 924 | 925 | (setv kde (scist.gaussian-kde xs :bw-method bw)) 926 | (setv test-points (np.linspace :num steps 927 | (if (is lo None) (np.min xs) lo) 928 | (if (is hi None) (np.max xs) hi))) 929 | 930 | (unless ax 931 | (setv ax (plt.gca))) 932 | (for [side (qw left right top)] 933 | (.set-visible (get ax.spines side) F)) 934 | (.tick-params ax :left F :labelleft F) 935 | 936 | (.plot ax test-points (kde test-points) #** kwargs)) 937 | --------------------------------------------------------------------------------