├── kodhy
    ├── __init__.py
    ├── macros.hy
    └── util.hy
├── .gitignore
├── conftest.py
├── tests
    └── test_recategorize.hy
└── README.rst


/kodhy/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | /.pytest_cache
3 | 


--------------------------------------------------------------------------------
/conftest.py:
--------------------------------------------------------------------------------
1 | import hy, pytest
2 | 
3 | def pytest_collect_file(file_path, parent):
4 |     if file_path.name.startswith('test_') and file_path.suffix == '.hy':
5 |         return pytest.Module.from_parent(parent, path = file_path)
6 | 


--------------------------------------------------------------------------------
/tests/test_recategorize.hy:
--------------------------------------------------------------------------------
 1 | (import
 2 |   pandas :as pd
 3 |   kodhy.util [recategorize])
 4 | 
 5 | (defn test-recat []
 6 |   (setv x (pd.Series (list "aabaddabcabbdbb") :dtype "category"))
 7 |   (assert (= (.join "" x.cat.categories) "abcd"))
 8 |   (setv x (recategorize x
 9 |     "a" "Z"
10 |     "b" "Z"
11 |     "d" None
12 |     "c" "X"))
13 |   (assert (= (.join "" x.cat.categories) "ZX"))
14 |   (assert (= (.join "-" (map str x)) "Z-Z-Z-Z-nan-nan-Z-Z-X-Z-Z-Z-nan-Z-Z")))
15 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
 1 | This repository contains many small functions and macros that I use to write `Hy`_ code. Python 3 and bleeding-edge Hy (Git master) are assumed.
 2 | 
 3 | The tests require ``pytest``. Run them by saying ``pytest test``.
 4 | 
 5 | License
 6 | ============================================================
 7 | 
 8 | This program is copyright 2025 Kodi Arfer.
 9 | 
10 | This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
11 | 
12 | This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the `GNU General Public License`_ for more details.
13 | 
14 | .. _Hy: http://hylang.org
15 | .. _`GNU General Public License`: http://www.gnu.org/licenses/
16 | 


--------------------------------------------------------------------------------
/kodhy/macros.hy:
--------------------------------------------------------------------------------
  1 | (require
  2 |   hyrule [defmacro!])
  3 | 
  4 | (import
  5 |   collections
  6 |   hyrule [coll?])
  7 | 
  8 | (defmacro incf [expr]
  9 |   `(+= ~expr 1))
 10 | 
 11 | (defreader p
 12 |   ; Pun.
 13 |   ; #p a  =>  {"a" a}
 14 |   ; #p [a b c]  =>  {"a" a  "b" b  "c" c}
 15 |   (setv expr (.parse-one-form &reader))
 16 |   (when (isinstance expr hy.models.Symbol)
 17 |     (setv expr [expr]))
 18 |   (hy.models.Dict (+ #* (lfor x expr [(str x) x]))))
 19 | 
 20 | (defmacro lc [vars a1 [a2 None]]
 21 | "A more Lispy syntax for list comprehensions.
 22 |     (lc [x (range 10)] (str x))
 23 |     (lc [x (range 10)] (> x 3) (str x))"
 24 |   `(lfor
 25 |     ~@vars
 26 |     ~@(if a2 [:if a1] [])
 27 |     ~(or a2 a1)))
 28 | 
 29 | (defmacro! rmap [arglist #* expr]
 30 | ; (rmap [[i x] (enumerate "xyzzy")] (setv y (.upper x)) (+ (str i) y))
 31 | ;   => ["0X" "1Y" "2Z" "3Z" "4Y"]
 32 |   (setv [lvalue args] arglist)
 33 |   `(list (map
 34 |     (fn [~g!arg]
 35 |       (setv ~lvalue ~g!arg)
 36 |       ~@expr)
 37 |     ~args)))
 38 | 
 39 | (defmacro amap [expr args]
 40 | "'a' stands for 'anaphoric'."
 41 |   `(list (map (fn [it] ~expr) ~args)))
 42 | 
 43 | (defmacro filt [expr args]
 44 |   `(list (filter (fn [it] ~expr) ~args)))
 45 | 
 46 | (defmacro fmap [gen-expr filter-expr args]
 47 |   `(lfor  it ~args  :if ~filter-expr  ~gen-expr))
 48 | 
 49 | (defmacro! amap2 [expr args]
 50 | ; (amap2 (+ a b) (range 10))  =>  [1 5 9 13 17]
 51 |   `(do
 52 |     (setv ~g!args (list ~args))
 53 |     (when (% (len ~g!args) 2)
 54 |       (raise (ValueError "iterable argument must have an even number of elements")))
 55 |     (list (map
 56 |       (fn [~g!i]
 57 |         (setv a (get ~g!args ~g!i))
 58 |         (setv b (get ~g!args (+ ~g!i 1)))
 59 |         ~expr)
 60 |       (range 0 (len ~g!args) 2)))))
 61 | 
 62 | (defmacro! map-dvals [expr d]
 63 |   `(dict (map
 64 |     (fn [~g!pair]
 65 |       (setv it (get ~g!pair 1))
 66 |       #((get ~g!pair 0) ~expr))
 67 |     (.items ~d))))
 68 | 
 69 | (defmacro! tally [expr args]
 70 |   `(do
 71 |     (setv ~g!n 0)
 72 |     (for [it ~args]
 73 |       (when ~expr
 74 |         (+= ~g!n 1)))
 75 |     ~g!n))
 76 | 
 77 | (defmacro afind [expr args]
 78 |   `(try
 79 |     (next (filter (fn [it] ~expr) ~args))
 80 |     (except [StopIteration] (raise (ValueError "afind: no matching value found")))))
 81 | 
 82 | (defmacro afind-or [expr args [default None]]
 83 | "The default expression 'default' is evaluated (and its value returned)
 84 | if no matching value is found."
 85 |   `(try
 86 |     (next (filter (fn [it] ~expr) ~args))
 87 |     (except [StopIteration] ~default)))
 88 | 
 89 | (defmacro whenn [expr #* body]
 90 | "Analogous to Haskell's liftM for Maybe. Evaluates
 91 | 'expr' and, if its value is not None, evaluates 'body' with the
 92 | value bound to 'it'."
 93 |   `(do
 94 |     (setv it ~expr)
 95 |     (when (is-not it None)
 96 |       ~@body)))
 97 | 
 98 | (defmacro replicate [n #* body]
 99 |   `(list (map (fn [_] ~@body) (range ~n))))
100 | 
101 | (defn recur-sym-replace [expr f] (cond
102 |   ; Recursive symbol replacement.
103 |   (isinstance expr hy.models.Symbol)
104 |     (f expr)
105 |   (coll? expr)
106 |     ((type expr) (amap (recur-sym-replace it f) expr))
107 |   True
108 |     expr))
109 | 
110 | (defmacro λ [#* body]
111 |   `(fn [it] ~@body))
112 | 
113 | ;(defmacro λ2 [#* body]
114 | ;  `(fn [x y] ~@body))
115 | 
116 | (defmacro qw [#* words]
117 | "(qw foo bar baz \"a b\" 5) => ['foo', 'bar', 'baz', 'a b', '5']"
118 |   (hy.models.List (gfor  w words  (hy.models.String
119 |     (if (isinstance w #(hy.models.Symbol hy.models.String))
120 |       w
121 |       (.lstrip (hy.repr w) "'"))))))
122 | 
123 | (defmacro meth [param-list #* body]
124 | "(meth [foo] (+ @bar foo))  =>  (fn [self foo] (+ self.bar foo))"
125 |   (meth-f param-list body))
126 | 
127 | (defmacro cmeth [param-list #* body]
128 |   `(classmethod ~(meth-f param-list body)))
129 | 
130 | (defn meth-f [param-list body]
131 |   `(fn [self ~@param-list] ~@(recur-sym-replace body (fn [sym] (cond
132 |     (in sym ["@" "@="])
133 |       sym
134 |     (= sym "@@")
135 |       'self
136 |     (.startswith sym "@")
137 |       `(. self ~@(amap (hy.models.Symbol it) (.split (cut sym 1 None) ".")))
138 |     (.startswith sym "is_@")
139 |       `(. self ~@(amap (hy.models.Symbol it) (.split (+ "is_" (cut sym (len "is_@") None)) ".")))
140 |     True
141 |       sym)))))
142 | 
143 | (defmacro getl [obj key1 [key2 None] [key3 None]]
144 | ; Given a pd.DataFrame 'mtcars':
145 | ;    (getl mtcars "4 Drive" "hp")    =>  the cell "4 Drive", "hp"
146 | ;    (getl mtcars "4 Drive")         =>  the row "4 Drive"
147 | ;    (getl mtcars : "hp")            =>  the column "hp"
148 | ;    (getl mtcars : (: "cyl" "hp"))  =>  columns "cyl" through "hp"
149 |   (panda-get 'loc obj key1 key2 key3))
150 | 
151 | (defmacro geti [obj key1 [key2 None] [key3 None]]
152 |   (panda-get 'iloc obj key1 key2 key3))
153 | 
154 | (setv COLON :)
155 | (defmacro $ [obj key]
156 | ; Given a pd.DataFrame 'mtcars':
157 | ;     ($ mtcars hp)            =>  the column "hp"
158 |   `(get ~obj ~(hy.models.String key)))
159 | 
160 | (defmacro geta [obj #* keys]
161 | "For numpy arrays."
162 |   `(get ~obj #(~@(map parse-key keys))))
163 | 
164 | (defn parse-key [key]
165 | "Keys can be:
166 |     :  =>  Empty slice object
167 |     (: ...)  =>  slice(...)
168 |     anything else => itself"
169 |   (cond
170 |     (= key :)
171 |       '(slice None)
172 |     (and (isinstance key hy.models.Expression) (= (get key 0) :))
173 |       `(slice ~@(cut key 1 None))
174 |     True
175 |       key))
176 | 
177 | (defn panda-get [attr obj key1 [key2 None] [key3 None]]
178 |   `(get (. ~obj ~attr) ~(cond
179 |     (is-not key3 None) `#(~(parse-key key1) ~(parse-key key2) ~(parse-key key3))
180 |     (is-not key2 None) `#(~(parse-key key1) ~(parse-key key2))
181 |     True (parse-key key1))))
182 | 
183 | (defn dollar-replace [df-sym expr]
184 |   (recur-sym-replace expr (fn [sym]
185 |     (if (.startswith sym "$")
186 |       (if (= (len sym) 1)
187 |         df-sym
188 |         (panda-get 'loc df-sym COLON (hy.models.String (cut sym 1 None))))
189 |       sym))))
190 | 
191 | (defmacro wc [df #* body]
192 | "With columns.
193 |     (wc df (+ $a $b))  =>  (+ ($ df a) ($ df b))
194 | The replacement is recursive.
195 | `$` on its own becomes simply `df`."
196 |   (setv df-sym (hy.gensym))
197 |   (setv body (dollar-replace df-sym body))
198 |   `(do (setv ~df-sym ~df) ~@body))
199 | 
200 | (defmacro ss [df #* body]
201 | "Subset. Evaluate `body` like `wc`, which should produce a
202 | boolean vector. Return `df` indexed by the boolean vector."
203 |   (setv df-sym (hy.gensym))
204 |   (setv body (dollar-replace df-sym body))
205 |   `(do (setv ~df-sym ~df) (get ~df-sym ~@body)))
206 | 
207 | (defmacro ssi [df #* body]
208 | "Subset index. Like `ss`, but returns a list of the indices that
209 | matched."
210 |   (setv df-sym (hy.gensym))
211 |   (setv body (dollar-replace df-sym body))
212 |   `(do
213 |     (setv ~df-sym ~df)
214 |     (.tolist (. (get ~df-sym ~@body) index))))
215 | 
216 | (defmacro ordf [df #* exprs]
217 | "Order data frame. (ordf d (.abs $baz) $bar) sorts first by the
218 | absolute value of the column `baz`, then by `bar`."
219 |   (setv [df-sym pd sorting-df] [(hy.gensym) (hy.gensym) (hy.gensym)])
220 |   (setv exprs (dollar-replace df-sym exprs))
221 |   `(do
222 |     (setv ~df-sym ~df)
223 |     (import  pandas :as ~pd)
224 |     (setv ~sorting-df (.reset-index (.concat ~pd [~@exprs] 1) None True))
225 |     (geti ~df-sym (. (.sort-values ~sorting-df (list (. ~sorting-df columns))) index))))
226 | ; ~pd
227 | 
228 | (defmacro wcby [df by #* body]
229 |   (setv df-sym (hy.gensym)  it-sym (hy.gensym))
230 |   `(do
231 |     (setv ~df-sym ~df)
232 |     (.apply
233 |       (.groupby ~df-sym ~(dollar-replace df-sym by))
234 |       (fn [~it-sym] ~@(dollar-replace it-sym body)))))
235 | 
236 | (defmacro! cbind [#* args]
237 |  `(do
238 |     (import  kodhy.util [cbind-join :as ~g!cj])
239 |     (~g!cj "outer" ~@(gfor a args
240 |       (if (isinstance a hy.models.Keyword) a.name a)))))
241 | 
242 | (defmacro cached [expr [bypass 'None] [cache-dir 'None]]
243 |   `(do
244 |      (import kodhy.util)
245 |      (kodhy.util.cached-eval
246 |        (kodhy.util.show-expr '~expr)
247 |        (fn [] ~expr)
248 |        ~bypass
249 |        ~cache-dir)))
250 | 
251 | (defmacro show-time-elapsed [#* expr]
252 |   (setv t (hy.gensym))
253 |   `(do
254 |     (setv ~t (hy.I.time.perf-counter))
255 |     (try
256 |       ~@expr
257 |       (finally
258 |         (setv ~t (round (- (hy.I.time.perf-counter) ~t)))
259 |         (print "Time elapsed:" #* (gfor
260 |             [k v] (.items (dict
261 |               :h (// ~t (* 60 60))
262 |               :min (// (% ~t (* 60 60)) 60)
263 |               :s (% ~t 60)))
264 |             :if v
265 |             f"{v} {k}"))))))
266 | 


--------------------------------------------------------------------------------
/kodhy/util.hy:
--------------------------------------------------------------------------------
  1 | (require
  2 |   hyrule [unless]
  3 |   kodhy.macros *)
  4 | 
  5 | (import
  6 |   functools [reduce]
  7 |   itertools [combinations]
  8 |   collections.abc [Iterable]
  9 |   numbers [Number]
 10 |   hyrule [distinct flatten inc dec coll?]
 11 |   toolz [first second partition identity])
 12 | 
 13 | (setv T True)
 14 | (setv F False)
 15 | 
 16 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 17 | ;; * Numbers and arrays
 18 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 19 | 
 20 | (defn signum [x] (cond
 21 |   (< x 0) -1
 22 |   (> x 0)  1
 23 |   T        0))
 24 | 
 25 | (defn product [l]
 26 |   (setv a 1)
 27 |   (for [x l]
 28 |     (*= a x))
 29 |   a)
 30 | 
 31 | (defn odds-ratio [p1 p2]
 32 |   (/ (* p1 (- 1 p2)) (* p2 (- 1 p1))))
 33 | 
 34 | (defn odds-ratio-on-p [oratio p]
 35 |   ; How an odds ratio changes a probability.
 36 |   ; Identity: (= p1 (odds-ratio-on-p (odds-ratio p1 p2) p2))
 37 |   (/ (* oratio p) (+ 1 (* p (- oratio 1)))))
 38 | 
 39 | (defn int-binomial [n k]
 40 |   ; `n` choose `k` as an integer. (scipy.special.binom uses floating point.)
 41 |   (when (and (= n 0) (!= k 0))
 42 |     (return 0))
 43 |   (when (= k 0)
 44 |     (return 1))
 45 |   (//
 46 |     (reduce * (seq (- n (dec k)) n))
 47 |     (reduce * (seq 1 k))))
 48 | 
 49 | (defn frac-binom-pmf [k n p]
 50 |   ; Probability mass function of the binomial distribution, returning
 51 |   ; a Fraction so long as `p` is a Fraction.
 52 |   ; The output is the probability of exactly `k` successes among `n`
 53 |   ; independent Bernoulli trials, each with probability `p`.
 54 |   (*
 55 |     (int-binomial n k)
 56 |     (** p k)
 57 |     (** (- 1 p) (- n k))))
 58 | 
 59 | (defn logit [x]
 60 |   (import numpy)
 61 |   (numpy.log (/ x (- 1 x))))
 62 | 
 63 | (defn ilogit [x]
 64 |   (import numpy)
 65 |   (/ 1 (+ 1 (numpy.exp (- x)))))
 66 | 
 67 | (defn zscore [x]
 68 |   (/ (- x (.mean x)) (.std x :ddof 0)))
 69 | 
 70 | (defn hzscore [x]
 71 | "Half the z-score. Divides by two SDs instead of one, per:
 72 | Gelman, A. (2008). Scaling regression inputs by dividing by two standard deviations. Statistics in Medicine, 27(15), 2865–2873. doi:10.1002/sim.3107"
 73 |   (/ (- x (.mean x)) (* 2 (.std x :ddof 0))))
 74 | 
 75 | (defn rmse [v1 v2]
 76 | "Root mean square error."
 77 |   (import  numpy :as np)
 78 |   (np.sqrt (np.mean (** (- v1 v2) 2))))
 79 | 
 80 | (defn mean-ad [v1 v2]
 81 | "Mean absolute deviation."
 82 |   (import  numpy :as np)
 83 |   (np.mean (np.abs (- v1 v2))))
 84 | 
 85 | (defn jitter [v [factor 100]]
 86 |   (import  numpy :as np)
 87 |   (setv b (/ (- (.max v) (.min v)) (* 2 factor)))
 88 |   (+ v (np.random.uniform (- b) b (len v))))
 89 | 
 90 | (defn valcounts [x [y None]]
 91 |   (import pandas :as pd)
 92 | 
 93 |   (setv [x y] (gfor v [x y] (cond
 94 |     (is v None)
 95 |       v
 96 |     (isinstance v pd.Series)
 97 |       (if (and (= v.dtype "category") (.any (pd.isnull v)))
 98 |         (.fillna (.cat.add-categories v "~N/A") "~N/A")
 99 |         v)
100 |     True
101 |       (pd.Series (list v)))))
102 | 
103 |   (if (is y None)
104 |     (.rename
105 |       ((if (in "float" (str x.dtype)) (fn [x] (.sort-index x)) identity)
106 |         (.value-counts x :sort F :dropna F))
107 |       (λ (if (pd.isnull it) "~N/A" it)))
108 |     (pd.crosstab
109 |       (.fillna x "~N/A") (.fillna y "~N/A"))))
110 | 
111 | (defn valprops [x [y None] [d None]]
112 |   (setv out (valcounts x y))
113 |   (setv out (/ out (.sum out)))
114 |   (if (is d None) out (.round out :decimals d)))
115 | 
116 | (defn weighted-choice [l]
117 | ; The argument should be a list of (weight, object) pairs.
118 | ; http://stackoverflow.com/a/3679747
119 |   (import random)
120 |   (setv r (random.uniform 0 (sum (map first l))))
121 |   (for [[w x] l]
122 |     (-= r w)
123 |     (when (<= r 0)
124 |       (break)))
125 |   x)
126 | 
127 | (defn pds-from-pairs [l #** kwargs]
128 |   (import  pandas :as pd)
129 |   (setv l (list l))
130 |   (pd.Series (amap (second it) l) (amap (first it) l) #** kwargs))
131 | 
132 | (defn pd-posix-time [series]
133 |   (import  numpy :as np)
134 |   (// (.astype series np.int64) (int 1e9)))
135 | 
136 | (defn pd-rename-cats [series f-or-dict]
137 |   (.cat.rename-categories series (amap
138 |     (if (callable f-or-dict)
139 |       (f-or-dict it)
140 |       (.get f-or-dict it it))
141 |     series.cat.categories)))
142 | 
143 | (defn recategorize [x #* kv]
144 |   (import  pandas :as pd)
145 |   (setv kv (list (partition 2 kv)))
146 |   (setv d (dict kv))
147 |   (unless (= (sorted x.cat.categories) (sorted (.keys d)))
148 |     (raise (ValueError "original categories don't match")))
149 |   (.astype
150 |     (.replace (.astype x object) d)
151 |     (pd.api.types.CategoricalDtype :categories (distinct
152 |       (gfor
153 |         [_ v] kv
154 |         :if (not (pd.isnull v))
155 |         v)))))
156 | 
157 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
158 | ;; * Matrices and DataFrames
159 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
160 | 
161 | (defn cbind-join [join #* args]
162 |   (import  pandas :as pd)
163 |   (setv args (list args))
164 |   (setv final-index None)
165 |   (when (and (isinstance (first args) str) (= (first args) "I"))
166 |     (shift args)
167 |     (setv final-index (shift args)))
168 |   (defn scalar? [x]
169 |     (or (isinstance x str) (not (isinstance x Iterable))))
170 |   (setv height (max (gfor
171 |     a args
172 |     (if (scalar? a) 1 (len a)))))
173 | 
174 |   (setv  chunks []  assume-index final-index)
175 |   (while args
176 |     (setv  name None  x (shift args))
177 |     (when (isinstance x str)
178 |       (setv  name x  x (shift args)))
179 |     (when (and
180 |         (is assume-index None)
181 |         (isinstance x #(pd.Series pd.DataFrame)))
182 |       (setv assume-index x.index))
183 |     (setv x (cond
184 |       (isinstance x #(pd.Series pd.DataFrame))
185 |         (if name (.copy x) x)
186 |       (scalar? x)
187 |         (pd.Series (* [x] height) :index assume-index)
188 |       T
189 |         (pd.Series x)))
190 |     (when name
191 |       (setv x.name name))
192 |     (.append chunks x))
193 | 
194 |   (setv result (pd.concat :objs chunks :axis 1 :join join))
195 |   (unless (is final-index None)
196 |     (setv result.index final-index))
197 |   result)
198 | 
199 | (defn df-from-pairs [l]
200 |   (import  pandas :as pd)
201 |   (setv d (pd.DataFrame (lc [row l] (lc [[_ v] row] v))))
202 |   (setv d.columns (amap (first it) (first l)))
203 |   d)
204 | 
205 | (defn drop-unused-cats [d [inplace F]]
206 |   ; Drops unused categories from all categorical columns.
207 |   ; Can also be applied to a Series.
208 |   (import  pandas :as pd)
209 |   (unless inplace
210 |     (setv d (.copy d)))
211 |   (for [[_ col] (if (isinstance d pd.Series) [[None d]] (.iteritems d))]
212 |     (when (hasattr col "cat")
213 |       (.remove-unused-categories col.cat :inplace T)))
214 |   d)
215 | 
216 | (defn cols2map [d k-col v-col [keep None]]
217 |   (setv d (getl d : [k-col v-col]))
218 |   (when keep
219 |     (setv d (.drop-duplicates d :keep keep)))
220 |   (setv d (.set-index d k-col :verify-integrity (not keep)))
221 |   (geti d : 0))
222 | 
223 | (defn -number-format [x f]
224 |   (import  numpy :as np  pandas :as pd)
225 |   (cond
226 |     (isinstance x pd.DataFrame) (do
227 |       (setv x (.copy x))
228 |       (for [r (range (first x.shape))]
229 |         (for [c (range (second x.shape))]
230 |           (setv (geti x r c) (-number-format (geti x r c) f))))
231 |       x)
232 |    (isinstance x pd.Series) (do
233 |      (setv x (.copy x))
234 |      (for [i (range (len x))]
235 |          (setv (geti x i) (-number-format (geti x i) f)))
236 |      x)
237 |    (isinstance x np.ndarray)
238 |      (f x)
239 |    (coll? x)
240 |     ((type x) (amap (-number-format it f) x))
241 |    (isinstance x Number)
242 |      (f x)
243 |    T
244 |      x))
245 | 
246 | (defn rd [a1 [a2 None]]
247 | "Round for display. Takes just a number, array, Series, DataFrame,
248 | or other collection, or both a number of digits to round to and
249 | such an object."
250 |   (import  numpy :as np)
251 |   (setv [x digits] (if (is a2 None) [a1 3] [a2 a1]))
252 |   (-number-format x (fn [v] (np.round v digits))))
253 | 
254 | (defn thousep [x]
255 |   (import  numpy :as np)
256 |   (setv vec-f (np.vectorize (fn [v] (format v ","))))
257 |   (-number-format x (fn [v] (if (isinstance v np.ndarray) (vec-f v) (format v ",")))))
258 | 
259 | (defn with-1o-interacts [m [column-names None]]
260 | "Given a data matrix m, return a matrix with a new column
261 | for each first-order interaction. Constant columns are removed."
262 |   (import  numpy :as np  itertools [combinations])
263 |   (when column-names
264 |     (assert (= (len column-names) (second m.shape))))
265 |   (setv [new-names new-cols] (zip #* (filt
266 |     (not (.all (= (second it) (first (second it)))))
267 |     (lc [[v1 v2] (combinations (range (second m.shape)) 2)] (,
268 |       (when column-names
269 |         (tuple (sorted [(get column-names v1) (get column-names v2)])))
270 |       (np.multiply (geta m : v1) (geta m : v2)))))))
271 |   (setv new-m (np.column-stack (+ #(m) new-cols)))
272 |   (if column-names
273 |     #(new-m (+ (tuple column-names) new-names))
274 |     new-m))
275 | 
276 | (defn print-big-pd [obj]
277 |   (import  pandas :as pd)
278 |   (with [(pd.option-context
279 |       "display.max_rows" (int 5000)
280 |       "display.max_columns" (int 100)
281 |       "display.width" (int 1000)
282 |       "display.max_colwidth" (int 500))]
283 |     (print obj)))
284 | 
285 | (defn pd-to-pretty-json [path df]
286 |   ; Serializes a Pandas dataframe to an obvious-looking JSON format.
287 |   ; Information about categorial columns is saved as metadata.
288 |   (import  math [isnan]  numpy :as np  collections [OrderedDict])
289 |   (setv out (OrderedDict))
290 | 
291 |   (setv (get out "categories") (OrderedDict (rmap [col (ssi df.dtypes (= $ "category"))]
292 |     [col (OrderedDict [
293 |       #("ordered" (. (getl df : col) cat ordered))
294 |       #("categories" (list (. (getl df : col) cat categories)))])])))
295 |   (unless (get out "categories")
296 |     (del (get out "categories")))
297 | 
298 |   (setv cols (list df.columns))
299 |   (setv table (.astype df.values object))
300 |   (setv table ((np.vectorize :otypes [object]
301 |      (λ (if (and (isinstance it float) (isnan it)) None it))) table))
302 |   (setv (get out "first_col_is_row_labels") F)
303 |   (when (or df.index.name
304 |       (not (.all (= df.index (list (range (len df)))))))
305 |     ; We only include the index as a column if it has a name or
306 |     ; is something other than consecutive integers starting from
307 |     ; 0.
308 |     (setv (get out "first_col_is_row_labels") T)
309 |     (setv table (np.column-stack [df.index table]))
310 |     (setv cols (+ [df.index.name] cols)))
311 |   (setv (get out "table") (+ [cols] (.tolist table)))
312 | 
313 |   (setv jstr (json-dumps-pretty out :sort-keys F))
314 |   (if path (barf path jstr) jstr))
315 | 
316 | (defn pretty-json-to-pd [path]
317 |   (import  json  pandas :as pd)
318 |   (setv j (json.loads (slurp path)))
319 |   (setv df (pd.DataFrame (cut (get j "table") 1 None)
320 |     :columns (get j "table" 0)))
321 |   (when (get j "first_col_is_row_labels")
322 |     (setv df (.set-index df (first df.columns))))
323 |   (for [[catcol meta] (.items (.get j "categories" {}))]
324 |     (setv (getl df : catcol) (.astype (getl df : catcol)
325 |       (pd.api.types.CategoricalDtype #** meta))))
326 |   df)
327 | 
328 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
329 | ;; * Strings
330 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
331 | 
332 | (defn cat [#* args #** kwargs]
333 |   (.join
334 |     (or (.get kwargs "sep") "")
335 |     (lc [x args] x (string x))))
336 | 
337 | (defn ucfirst [s]
338 |   (and s (+ (.upper (first s)) (cut s 1 None))))
339 | 
340 | (defn double-quote [s]
341 |   (.format "\"{}\""
342 |     (.replace (.replace s "\\" "\\\\") "\"" "\\\"")))
343 | 
344 | (defn show-expr [x]
345 | "Stringify Hy expressions to a fairly pretty form, albeit
346 | without newlines outside string literals."
347 |   (cond
348 |     (isinstance x hy.models.Expression)
349 |       (.format "({})" (.join " " (list (map show-expr x))))
350 |     (isinstance x hy.models.Dict)
351 |       (.format "{{{}}}" (.join " " (list (map show-expr x))))
352 |     (isinstance x hy.models.Keyword)
353 |       (+ ":" x.name)
354 |     (isinstance x hy.models.Symbol)
355 |       (str x)
356 |     (isinstance x list)
357 |       (.format "[{}]" (.join " " (list (map show-expr x))))
358 |     (isinstance x tuple)
359 |       (.format "#({})" (.join " " (list (map show-expr x))))
360 |     (isinstance x str)
361 |       (double-quote (str x))
362 |     T
363 |       (str x)))
364 | 
365 | (defn keyword->str [x]
366 |   (if (isinstance x hy.models.Keyword)
367 |     x.name
368 |     x))
369 | 
370 | (defn str->keyword [x]
371 |   (if (isinstance x hy.models.Keyword)
372 |     x
373 |     (hy.models.Keyword x)))
374 | 
375 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
376 | ;; * Lists and other basic data structures
377 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
378 | 
379 | (defn concat [ll]
380 |   (reduce (fn [accum x] (+ accum x)) ll []))
381 | 
382 | (defn merge-dicts [#* ds]
383 |   (setv out {})
384 |   (for [d ds]
385 |     (.update out d))
386 |   out)
387 | 
388 | (defn seq [lo hi [step 1]]
389 |   (list (range lo (+ hi step) step)))
390 | 
391 | (defn shift [l]
392 |   (.pop l 0))
393 | 
394 | (defn all-unique? [l]
395 |   (setv seen (set))
396 |   (for [x l]
397 |     (when (in x seen)
398 |       (return False))
399 |     (.add seen x))
400 |   True)
401 | 
402 | (defn mins [iterable [key (λ it)] [comparator-fn None] [agg-fn min]]
403 |   ; Returns a list of minimizing values of the iterable,
404 |   ; in their original order.
405 |   (unless comparator-fn
406 |     (import operator)
407 |     (setv comparator-fn operator.le))
408 |   (setv items (list iterable))
409 |   (unless items
410 |     (return []))
411 |   (setv vals (list (map key items)))
412 |   (setv vm (agg-fn vals))
413 |   (lc [[item val] (zip items vals)]
414 |     (comparator-fn val vm)
415 |     item))
416 | 
417 | (defn maxes [iterable [key (λ it)]]
418 |   (import operator)
419 |   (mins iterable key operator.ge max))
420 | 
421 | (defn rget [obj regex]
422 |   (import re)
423 |   (setv regex (re.compile regex))
424 |   (setv keys (filt (.search regex it) (.keys obj)))
425 |   (cond
426 |     (> (len keys) 1)
427 |       (raise (LookupError "Ambiguous matcher"))
428 |     (= (len keys) 0)
429 |       (raise (LookupError "No match"))
430 |     T
431 |       (get obj (get keys 0))))
432 | 
433 | (defn pairs [#* a]
434 |   (setv a (list a))
435 |   (setv r [])
436 |   (while a
437 |     (.append r #((keyword->str (shift a)) (keyword->str (shift a)))))
438 |   r)
439 | 
440 | (defn by-ns [n iterable]
441 |   (zip #* (* [(iter iterable)] n)))
442 | (defn by-2s [iterable]
443 |   (by-ns 2 iterable))
444 | 
445 | (defn iter-with-prev [iterable]
446 |   (setv prev None)
447 |   (for [item iterable]
448 |     (yield #(prev item))
449 |     (setv prev item)))
450 | 
451 | (defn iter-with-prev1 [iterable]
452 |   ; Like iter-with-prev, but skips the first pair, which has
453 |   ; None for the previous value.
454 |   (import itertools)
455 |   (itertools.islice (iter-with-prev iterable) 1 None))
456 | 
457 | (defn c+1 [counter key]
458 |   (+= (get counter key) 1))
459 | 
460 | (defclass ad [dict]
461 |   "Attribute dictionary. A dictionary with which you can set, get, and
462 |   delete items as if they were attributes."
463 |   (defn __getattr__ [self k]
464 |     (get self k))
465 |   (defn __setattr__ [self k v]
466 |     (setv (get self k) v))
467 |   (defn __delattr__ [self k]
468 |     (del (get self k))))
469 | 
470 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
471 | ;; * Files
472 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
473 | 
474 | (defn slurp [name [mode None] [encoding None] [buffering None]]
475 |   (setv f open)
476 |   (when encoding
477 |     (import codecs)
478 |     (setv f codecs.open))
479 |   (with [o (f name #** (dict (+
480 |       (if (is mode None)      [] [#("mode" mode)])
481 |       (if (is encoding None)  [] [#("encoding" encoding)])
482 |       (if (is buffering None) [] [#("buffering" buffering)]))))]
483 |     (o.read)))
484 | 
485 | (defn barf [name content [mode "w"] [encoding None] [buffering None]]
486 |   (setv f open)
487 |   (when encoding
488 |     (import codecs)
489 |     (setv f codecs.open))
490 |   (with [o (f name mode #** (dict (+
491 |       (if (is encoding None)  [] [#("encoding" encoding)])
492 |       (if (is buffering None) [] [#("buffering" buffering)]))))]
493 |     (o.write content)))
494 | 
495 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
496 | ;; * JSON
497 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
498 | 
499 | (defn json-dumps-pretty [o #** kwargs]
500 |   ; Like json.dumps, but arrays or objects of atomic values are
501 |   ; printed without internal indents, and with different
502 |   ; option defaults.
503 |   (import json uuid)
504 |   (for [[option value] [
505 |       ["indent" 2] ["separators" #("," ": ")] ["sort_keys" T]]]
506 |     (when (is (.get kwargs option) None)
507 |       (setv (get kwargs option) value)))
508 |   (setv substituted-parts {})
509 |   (defn recursive-subst [x]
510 |     ; Replaces lists or dictionaries of atomic values with UUID
511 |     ; strings.
512 |     (if (isinstance x #(list tuple dict))
513 |       (if (all (lc [v (if (isinstance x dict) (.values x) x)]
514 |             (isinstance v #(bool (type None) int float str bytes))))
515 |         (do
516 |           (setv my-id (. (uuid.uuid4) hex))
517 |           (setv (get substituted-parts my-id) x)
518 |           my-id)
519 |         (if (isinstance x dict)
520 |           ((type x) (lc [[k v] (.items x)] #(k (recursive-subst v))))
521 |           (lc [v x] (recursive-subst v))))
522 |       x))
523 |   (setv json-str (json.dumps (recursive-subst o) #** kwargs))
524 |   (setv (get kwargs "indent") None)
525 |   (setv (get kwargs "separators") #(", " ": "))
526 |   (for [[my-id x] (.items substituted-parts)]
527 |     (setv json-str (.replace json-str (+ "\"" my-id "\"")
528 |       (.rstrip (json.dumps x #** kwargs))
529 |       1)))
530 |   json-str)
531 | 
532 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
533 | ;; * Cross-validation
534 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
535 | 
536 | (defn kfold-cv-pred [x y f [k-folds 10] [shuffle T] [random-state None] [folds None]]
537 | "Return a np of predictions of y given x using f. x is expected to be
538 | a numpy matrix, not a pandas DataFrame.
539 | 
540 | f will generally be of the form (fn [x-train y-train x-test] ...),
541 | and should return a 1D nparray of predictions given x-test."
542 |   (import numpy :as np)
543 |   (setv y-pred None)
544 |   (unless folds
545 |     (setv folds (cut
546 |       (np.tile (np.arange k-folds) (int (np.ceil (/ (len x) k-folds))))
547 |       (len x)))
548 |     (.shuffle (np.random.default-rng random-state) folds)
549 |     (setv folds (lfor
550 |       i (range k-folds)
551 |       [(np.where (!= folds i)) (np.where (= folds i))])))
552 |   (for [[train-i test-i] folds]
553 |     (setv result (f (get x train-i) (get y train-i) (get x test-i)))
554 |     (when (is y-pred None)
555 |       (setv y-pred (np.empty-like y :dtype result.dtype)))
556 |     (setv (get y-pred test-i) result))
557 |   y-pred)
558 | 
559 | (defn choose-labeled-cv-folds [subjects labels bin-label-possibilities]
560 | ; Put the subjects into cross-validation folds such that all the
561 | ; subjects with a given label are in the same fold.
562 | ; bin-label-possibilities should be the return value of
563 | ; bin-labels.
564 |   (import  random [choice shuffle]  collections [Counter])
565 |   (setv group-sizes (Counter labels))
566 |   (setv bins (list (choice bin-label-possibilities)))
567 |   (shuffle bins)
568 |   (amap
569 |     (concat (amap
570 |       (do
571 |         (setv target-size it)
572 |         (setv label (first (afind (= (second it) target-size) (.items group-sizes))))
573 |         (del (get group-sizes label))
574 |         (lc [[s l] (zip subjects labels)] (= l label) s))
575 |       it))
576 |     bins))
577 | 
578 | (defn bin-labels [labels [n-bins 10] [max-bin-size None]]
579 | ; A routine to prepare input for choose-labeled-cv-folds.
580 | ;
581 | ; Finds ways to sort a list of label objects (which is
582 | ; expected to have lots of duplicates) into a fixed number
583 | ; bins in a way that gets the bin sizes as close to equal
584 | ; as possible. This is just the multiprocessor-scheduling problem
585 | ; ( https://en.wikipedia.org/wiki/Multiprocessor_scheduling )
586 | ; with a different metric to optimize.
587 | ;
588 | ; The algorithm simply enumerates all possibilities, so it will
589 | ; will be too slow with large inputs.
590 | ;
591 | ; Returns a tuple of possibilities, each of which is a tuple of
592 | ; bins. Each bin is a tuple of numbers representing the size
593 | ; of a labelled group.
594 | 
595 |   (import  collections [Counter])
596 | 
597 |   (setv initial-state (,
598 |     (tuple (sorted (.values (Counter labels))))
599 |     (* #((,)) n-bins)))
600 | 
601 |   (setv states (set [initial-state]))
602 |   (setv explore [initial-state])
603 |   (setv iteration 0)
604 |   (setv explore-len-was (len explore))
605 | 
606 |   (print "Generating possible arrangements")
607 |   (while explore
608 |     (+= iteration 1)
609 |     (unless (% iteration 10000)
610 |       (print (len explore) (.format "({:+d})" (- (len explore) explore-len-was)))
611 |       (setv explore-len-was (len explore)))
612 |     (setv [remaining bins] (.pop explore 0))
613 |     (setv rseen (set))
614 |     (for [r-i (range (len remaining))]
615 |       (setv x (get remaining r-i))
616 |       (when (in x rseen)
617 |         (continue))
618 |       (.add rseen x)
619 |       (setv new-remaining (+ (cut remaining 0 r-i) (cut remaining (+ r-i 1) None)))
620 |       (for [b-i (range n-bins)]
621 |         (setv new-bin (+ (get bins b-i) #(x)))
622 |         (when (and max-bin-size (> (sum new-bin) max-bin-size))
623 |           (continue))
624 |         (setv new-bins (tuple (sorted (+
625 |           (cut bins b-i)
626 |           #((tuple (sorted new-bin)))
627 |           (cut bins (+ b-i 1) None)))))
628 |         (setv new-state #(new-remaining new-bins))
629 |         (when (in new-state states)
630 |           (continue))
631 |         (.add states new-state)
632 |         (.append explore new-state))))
633 | 
634 |   (print "Finding minima")
635 |   (setv target-bin-size (/ (len labels) n-bins))
636 |   (mins
637 |     (lc [[remaining bins] states] (and (not remaining) (all bins)) bins)
638 |     (λ (sum (amap (** (- (sum it) target-bin-size) 2) it)))))
639 | 
640 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
641 | ;; * Caching
642 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
643 | 
644 | (setv _default-cache-dir (do
645 |   (import os.path)
646 |   (os.path.join (os.path.expanduser "~") ".daylight" "py-cache")))
647 | 
648 | (defn cached-eval [key f [bypass None] [cache-dir None]]
649 | "Call `f`, caching the value with the string `key`. If `bypass`
650 | is provided, its value is written to the cache and returned
651 | instead of calling `f` or consulting the existing cache."
652 |   (import pickle hashlib base64 os errno time)
653 | ;  (unless (os.path.exists cache-dir)
654 | ;    (os.makedirs cache-dir))
655 |   (unless cache-dir
656 |     (setv cache-dir _default-cache-dir))
657 |   (setv basename (str (cut
658 |     (base64.b64encode
659 |       (.digest (hashlib.md5 (.encode key "UTF8")))
660 |       (.encode "+_" "ASCII"))
661 |     0 -2) "ASCII"))
662 |   (setv path (os.path.join cache-dir basename))
663 |   (setv value bypass)
664 |   (setv write-value T)
665 |   (when (is value None)
666 |     (if (os.path.exists path)
667 |       (do
668 |         (setv value (with [o (open path "rb")]
669 |           (get (pickle.load o) "value")))
670 |         (setv write-value F))
671 |       (setv value (f))))
672 |   (when write-value
673 |     (setv d {
674 |       "basename" basename
675 |       "key" key
676 |       "value" value
677 |       "time" (time.time)})
678 |     (with [o (open path "wb")]
679 |       (pickle.dump d o pickle.HIGHEST-PROTOCOL)))
680 |   value)
681 | 
682 | (defn show-cache [[cache-dir _default-cache-dir] [pretty T] [regex ""]]
683 | "Prints the caches of 'cached-eval' in chronological order."
684 |   (import re pickle os os.path datetime)
685 |   (setv regex (re.compile regex))
686 |   (setv items
687 |     (sorted :key (λ (get it "time"))
688 |     (amap (with [o (open (os.path.join cache-dir it) "rb")]
689 |       (pickle.load o))
690 |     (filt (os.path.isfile it)
691 |     (os.listdir cache-dir)))))
692 |   (for [item items]
693 |     (unless (.search regex (get item "key"))
694 |       (continue))
695 |     (unless pretty
696 |       (print (get item "basename"))
697 |       (continue))
698 |     (print "Basename:" (get item "basename"))
699 |     (print "Date:" (.strftime
700 |       (datetime.datetime.fromtimestamp (get item "time"))
701 |       "%-d %b %Y, %-I:%M:%S %p"))
702 |     (print (.format "Size: {:,}" (.
703 |       (os.stat (os.path.join cache-dir (get item "basename")))
704 |       st_size)))
705 |     (print "Key:" (get item "key"))
706 |     (print))
707 |   None)
708 | 
709 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
710 | ;; * Support for Tversky
711 | ;; https://github.com/Kodiologist/Tversky
712 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
713 | 
714 | (defn tversky-format-s [s]
715 |   (amap (.format "s{:03d}" it) s))
716 | 
717 | (defn unpack-tversky [db-path
718 |     [include-incomplete T]
719 |     [exclude-sns None]]
720 |   (import  sqlite3  pandas :as pd)
721 |   (try
722 |     (do
723 |       (setv db (sqlite3.connect db-path))
724 |       (.execute db "pragma foreign_keys = on")
725 | 
726 |       (setv sb (pd.read-sql-query :con db :index-col "sn"
727 |         :parse-dates (dfor k (qw consented_t began_t completed_t)
728 |           k (dict  :unit "s"  :utc T))
729 |         "select
730 |             sn, experimenter, ip, task,
731 |             consented_t,
732 |             began_t,
733 |             case when completed_t = 'assumed' then null else completed_t end
734 |               as completed_t,
735 |             MTurk.hitid as hit,
736 |             MTurk.assignmentid as asgmt,
737 |             MTurk.workerid as worker
738 |         from Subjects
739 |             left join
740 |                 (select sn, min(first_sent) as began_t from Timing group by sn)
741 |                 using (sn)
742 |             left join MTurk using (sn)"))
743 |       ; Make some columns categorical, with the levels ordered
744 |       ; chronologically.
745 |       (for [c (qw experimenter ip hit task)]
746 |         (setv (getl sb : c) (pd.Categorical
747 |           (getl sb : c)
748 |           :ordered T
749 |           :categories (list (.unique (.dropna (getl (.sort-values sb "began_t") : c)))))))
750 |       (setv ($ sb tv) (+ 1 (. ($ sb task) cat codes)))
751 |         ; "tv" for "task version".
752 |       (when exclude-sns
753 |         (setv sb (.drop sb exclude-sns)))
754 |       (unless include-incomplete
755 |         (setv sb (.dropna sb :subset ["completed_t"])))
756 | 
757 |       (.execute db "create temporary table IncludeSN(sn integer primary key)")
758 |       (.executemany db "insert into IncludeSN (sn) values (?)" (amap #(it) sb.index))
759 | 
760 |       (setv dat (.sort-index (geti (pd.read-sql-query :con db :index-col ["sn" "k"]
761 |         "select * from D where sn in (select * from IncludeSN)") : 0)))
762 | 
763 |       (setv timing (.sort-index (pd.read-sql-query :con db :index-col ["sn" "k"]
764 |         :parse-dates (dfor k (qw first_sent received)
765 |           k (dict  :unit "s"  :utc T))
766 |         "select * from Timing where sn in (select * from IncludeSN)")))
767 | 
768 |       (setv sb.index (tversky-format-s sb.index))
769 |       (for [df [dat timing]]
770 |         (setv df.index (.set-levels df.index :level "sn"
771 |           (tversky-format-s (first df.index.levels)))))
772 | 
773 |       #(sb dat timing))
774 | 
775 |     (finally
776 |       (.close db))))
777 | 
778 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
779 | ;; * Interoperability with R
780 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
781 | 
782 | (setv _Rproc None)
783 | 
784 | (defn _R-setup []
785 |   (import pyper)
786 |   (global _Rproc)
787 |   (when (is _Rproc None)
788 |     (setv _Rproc (pyper.R))))
789 | 
790 | (defn R-run [expr]
791 |   (_R-setup)
792 |   (.run _Rproc expr))
793 | 
794 | (defn R-assign [lvalue rvalue]
795 |   (_R-setup)
796 |   (.assign _Rproc lvalue rvalue))
797 | 
798 | (defn R-get [expr]
799 |   (_R-setup)
800 |   (.get _Rproc expr))
801 | 
802 | (defn R-call [fn-expr #* args [print-it True] #** kwargs]
803 |   (import  collections [OrderedDict])
804 |   (_R-setup)
805 |   (setv arg-string "")
806 |   (for [[i a] (enumerate (+ args (tuple (sorted (.items kwargs)))))]
807 |     (setv kw (>= i (len args)))
808 |     (setv value (if kw (second a) a))
809 |     (if (and (is (type value) list) (= (first value) :raw))
810 |       (setv expr (second value))
811 |       (do
812 |         (setv expr (+ "Kodhy_arg_" (str i)))
813 |         (.assign _Rproc expr value)
814 |         (when (in "DataFrame" (str (type value)))
815 |           ; Make sure each Categorical Series becomes a factor, with
816 |           ; the correct levels.
817 |           (for [j (range (second value.shape))]
818 |             (when (in "category" (str (get value.dtypes j)))
819 |               (setv v (geti value : j))
820 |               (unless (all (gfor  c v.cat.categories  (isinstance c str)))
821 |                 (raise (ValueError "Only string levels are allowed in Categoricals")))
822 |               (.run _Rproc (.format
823 |                 "{}[,{}] = factor({}[,{}], levels = c({}))"
824 |                 expr (inc j) expr (inc j)
825 |                 (.join "," (map double-quote v.cat.categories)))))))))
826 | ;    (if (in "DataFrame" (str (type value)))
827 | ;      (do
828 | ;        ; Work around a bug in Pyper where large DataFrames don't
829 | ;        ; get assigned for some reason. The workaround is not very
830 | ;        ; general, but should work for Comorbid.
831 | ;        (.assign _Rproc variable (.as-matrix value))
832 | ;        (.run _Rproc (.format "{} = data.frame({})" variable variable))
833 | ;        (.assign _Rproc (.format "colnames({})" variable) value.columns.values))
834 | ;      (.assign _Rproc variable value))
835 |     (when i
836 |       (+= arg-string ", "))
837 |     (when kw
838 |       (+= arg-string (+ (first a) " = ")))
839 |     (+= arg-string expr))
840 |   ((if print-it print identity)
841 |     (.run _Rproc (.format "Kodhy_out = ({})({})" fn-expr arg-string)))
842 |   (.get _Rproc "Kodhy_out"))
843 | 
844 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
845 | ;; * Plotting
846 | ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
847 | 
848 | (defn dotplot [xs [diam 1] [group None] [ax None] #** kwargs]
849 | "A plot of 1D data where each value is represented as a circle,
850 | and circles that would overlap are stacked vertically, a bit
851 | like a histogram. Missing values are silently ignored."
852 | 
853 |   (import
854 |     matplotlib.pyplot :as plt
855 |     matplotlib.collections [PatchCollection]
856 |     numpy [isnan]
857 |     toolz [unique])
858 | 
859 |   (when (is group None)
860 |     (setv group (* #(True) (len xs))))
861 |   (assert (= (len group) (len xs)))
862 |   (setv levels (list (unique group)))
863 |   (setv group-vert-space (* 3 diam))
864 | 
865 |   (setv rows (dfor  l levels  l []))
866 |   (for [[x g] (sorted (gfor
867 |           [x g] (zip xs group)
868 |           :if (and (not (isnan x)) (is-not x None)) [x g]))]
869 |     (for [row (get rows g)]
870 |       (when (>= (- x (get row -1)) diam)
871 |         (.append row x)
872 |         (break))
873 |       (else
874 |         (.append (get rows g) [x]))))
875 | 
876 |   (setv x (flatten (.values rows)))
877 | 
878 |   (setv yg (lfor  rs (.values rows)  (flatten (gfor
879 |     [n row] (enumerate rs)
880 |     (* [(* diam (+ n .5))] (len row))))))
881 |   (setv ybumps [0])
882 |   (for [i (range 1 (len yg))]
883 |     (.append ybumps (+ (max (get yg (dec i))) group-vert-space))
884 |     (setv (get yg i) (lfor  v (get yg i)  (+ v (get ybumps -1)))))
885 |   (setv y (flatten yg))
886 | 
887 |   (unless ax
888 |     (setv ax (plt.gca)))
889 |   (.set-aspect ax "equal")
890 |   (for [side (qw left right top)]
891 |     (.set-visible (get ax.spines side) F))
892 |   (.set-xlim ax (- (min x) diam) (+ (max x) diam))
893 |   (.set-ylim ax 0 (+ (max y) diam))
894 |   (.tick-params ax :left F :labelleft F)
895 | 
896 |   ; Now add the visible markers.
897 |   (unless (in "color" kwargs)
898 |     (setv (get kwargs "color") "black"))
899 |   (.add-collection ax (PatchCollection #** kwargs
900 |     (if (.pop kwargs "rect" F)
901 |       (lfor  [x0 y0] (zip x y)
902 |         (plt.Rectangle #((- x0 (/ diam 2)) (- y0 (/ diam 2))) diam diam))
903 |       (lfor  [x0 y0] (zip x y)
904 |         (plt.Circle #(x0 y0) (/ diam 2))))))
905 | 
906 |   ; Add the level labels.
907 |   (when (> (len levels) 1)
908 |     (for [[i level] (enumerate levels)]
909 |       (.text ax (- (min x) (* 2 diam)) (get ybumps i) (str level)
910 |         :horizontalalignment "right"))))
911 | 
912 | (defn rectplot [xs [diam 1] [ax None] #** kwargs]
913 | "`dotplot` using rectangles instead of circles."
914 |   (setv (get kwargs "rect") T)
915 |   (dotplot xs diam ax #** kwargs))
916 | 
917 | (defn density-plot [xs [bw None] [lo None] [hi None] [steps 257] [ax None] #** kwargs]
918 |   ; The default `steps` is chosen to be 1 plus a power of 2.
919 | 
920 |   (import
921 |     numpy :as np
922 |     scipy.stats :as scist
923 |     matplotlib.pyplot :as plt)
924 | 
925 |   (setv kde (scist.gaussian-kde xs :bw-method bw))
926 |   (setv test-points (np.linspace :num steps
927 |     (if (is lo None) (np.min xs) lo)
928 |     (if (is hi None) (np.max xs) hi)))
929 | 
930 |   (unless ax
931 |     (setv ax (plt.gca)))
932 |   (for [side (qw left right top)]
933 |     (.set-visible (get ax.spines side) F))
934 |   (.tick-params ax :left F :labelleft F)
935 | 
936 |   (.plot ax test-points (kde test-points) #** kwargs))
937 | 


--------------------------------------------------------------------------------