├── .gitignore ├── LICENSE ├── README.md ├── doc └── intro.md ├── examples └── examples.cljw ├── project.clj ├── src └── huri │ ├── core.clj │ ├── etl.clj │ ├── io.clj │ ├── math.clj │ ├── plot.clj │ └── time.clj └── test └── huri └── core_test.clj /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /classes 3 | /checkouts 4 | pom.xml 5 | pom.xml.asc 6 | *.jar 7 | *.class 8 | /.lein-* 9 | /.nrepl-port 10 | .hgignore 11 | .hg/ 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS ECLIPSE PUBLIC 2 | LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION OF THE PROGRAM 3 | CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT. 4 | 5 | 1. DEFINITIONS 6 | 7 | "Contribution" means: 8 | 9 | a) in the case of the initial Contributor, the initial code and 10 | documentation distributed under this Agreement, and 11 | 12 | b) in the case of each subsequent Contributor: 13 | 14 | i) changes to the Program, and 15 | 16 | ii) additions to the Program; 17 | 18 | where such changes and/or additions to the Program originate from and are 19 | distributed by that particular Contributor. A Contribution 'originates' from 20 | a Contributor if it was added to the Program by such Contributor itself or 21 | anyone acting on such Contributor's behalf. Contributions do not include 22 | additions to the Program which: (i) are separate modules of software 23 | distributed in conjunction with the Program under their own license 24 | agreement, and (ii) are not derivative works of the Program. 25 | 26 | "Contributor" means any person or entity that distributes the Program. 
27 | 28 | "Licensed Patents" mean patent claims licensable by a Contributor which are 29 | necessarily infringed by the use or sale of its Contribution alone or when 30 | combined with the Program. 31 | 32 | "Program" means the Contributions distributed in accordance with this 33 | Agreement. 34 | 35 | "Recipient" means anyone who receives the Program under this Agreement, 36 | including all Contributors. 37 | 38 | 2. GRANT OF RIGHTS 39 | 40 | a) Subject to the terms of this Agreement, each Contributor hereby grants 41 | Recipient a non-exclusive, worldwide, royalty-free copyright license to 42 | reproduce, prepare derivative works of, publicly display, publicly perform, 43 | distribute and sublicense the Contribution of such Contributor, if any, and 44 | such derivative works, in source code and object code form. 45 | 46 | b) Subject to the terms of this Agreement, each Contributor hereby grants 47 | Recipient a non-exclusive, worldwide, royalty-free patent license under 48 | Licensed Patents to make, use, sell, offer to sell, import and otherwise 49 | transfer the Contribution of such Contributor, if any, in source code and 50 | object code form. This patent license shall apply to the combination of the 51 | Contribution and the Program if, at the time the Contribution is added by the 52 | Contributor, such addition of the Contribution causes such combination to be 53 | covered by the Licensed Patents. The patent license shall not apply to any 54 | other combinations which include the Contribution. No hardware per se is 55 | licensed hereunder. 56 | 57 | c) Recipient understands that although each Contributor grants the licenses 58 | to its Contributions set forth herein, no assurances are provided by any 59 | Contributor that the Program does not infringe the patent or other 60 | intellectual property rights of any other entity. 
Each Contributor disclaims 61 | any liability to Recipient for claims brought by any other entity based on 62 | infringement of intellectual property rights or otherwise. As a condition to 63 | exercising the rights and licenses granted hereunder, each Recipient hereby 64 | assumes sole responsibility to secure any other intellectual property rights 65 | needed, if any. For example, if a third party patent license is required to 66 | allow Recipient to distribute the Program, it is Recipient's responsibility 67 | to acquire that license before distributing the Program. 68 | 69 | d) Each Contributor represents that to its knowledge it has sufficient 70 | copyright rights in its Contribution, if any, to grant the copyright license 71 | set forth in this Agreement. 72 | 73 | 3. REQUIREMENTS 74 | 75 | A Contributor may choose to distribute the Program in object code form under 76 | its own license agreement, provided that: 77 | 78 | a) it complies with the terms and conditions of this Agreement; and 79 | 80 | b) its license agreement: 81 | 82 | i) effectively disclaims on behalf of all Contributors all warranties and 83 | conditions, express and implied, including warranties or conditions of title 84 | and non-infringement, and implied warranties or conditions of merchantability 85 | and fitness for a particular purpose; 86 | 87 | ii) effectively excludes on behalf of all Contributors all liability for 88 | damages, including direct, indirect, special, incidental and consequential 89 | damages, such as lost profits; 90 | 91 | iii) states that any provisions which differ from this Agreement are offered 92 | by that Contributor alone and not by any other party; and 93 | 94 | iv) states that source code for the Program is available from such 95 | Contributor, and informs licensees how to obtain it in a reasonable manner on 96 | or through a medium customarily used for software exchange. 
97 | 98 | When the Program is made available in source code form: 99 | 100 | a) it must be made available under this Agreement; and 101 | 102 | b) a copy of this Agreement must be included with each copy of the Program. 103 | 104 | Contributors may not remove or alter any copyright notices contained within 105 | the Program. 106 | 107 | Each Contributor must identify itself as the originator of its Contribution, 108 | if any, in a manner that reasonably allows subsequent Recipients to identify 109 | the originator of the Contribution. 110 | 111 | 4. COMMERCIAL DISTRIBUTION 112 | 113 | Commercial distributors of software may accept certain responsibilities with 114 | respect to end users, business partners and the like. While this license is 115 | intended to facilitate the commercial use of the Program, the Contributor who 116 | includes the Program in a commercial product offering should do so in a 117 | manner which does not create potential liability for other Contributors. 118 | Therefore, if a Contributor includes the Program in a commercial product 119 | offering, such Contributor ("Commercial Contributor") hereby agrees to defend 120 | and indemnify every other Contributor ("Indemnified Contributor") against any 121 | losses, damages and costs (collectively "Losses") arising from claims, 122 | lawsuits and other legal actions brought by a third party against the 123 | Indemnified Contributor to the extent caused by the acts or omissions of such 124 | Commercial Contributor in connection with its distribution of the Program in 125 | a commercial product offering. The obligations in this section do not apply 126 | to any claims or Losses relating to any actual or alleged intellectual 127 | property infringement. 
In order to qualify, an Indemnified Contributor must: 128 | a) promptly notify the Commercial Contributor in writing of such claim, and 129 | b) allow the Commercial Contributor tocontrol, and cooperate with the 130 | Commercial Contributor in, the defense and any related settlement 131 | negotiations. The Indemnified Contributor may participate in any such claim 132 | at its own expense. 133 | 134 | For example, a Contributor might include the Program in a commercial product 135 | offering, Product X. That Contributor is then a Commercial Contributor. If 136 | that Commercial Contributor then makes performance claims, or offers 137 | warranties related to Product X, those performance claims and warranties are 138 | such Commercial Contributor's responsibility alone. Under this section, the 139 | Commercial Contributor would have to defend claims against the other 140 | Contributors related to those performance claims and warranties, and if a 141 | court requires any other Contributor to pay any damages as a result, the 142 | Commercial Contributor must pay those damages. 143 | 144 | 5. NO WARRANTY 145 | 146 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, THE PROGRAM IS PROVIDED ON 147 | AN "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER 148 | EXPRESS OR IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR 149 | CONDITIONS OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A 150 | PARTICULAR PURPOSE. Each Recipient is solely responsible for determining the 151 | appropriateness of using and distributing the Program and assumes all risks 152 | associated with its exercise of rights under this Agreement , including but 153 | not limited to the risks and costs of program errors, compliance with 154 | applicable laws, damage to or loss of data, programs or equipment, and 155 | unavailability or interruption of operations. 156 | 157 | 6. 
DISCLAIMER OF LIABILITY 158 | 159 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, NEITHER RECIPIENT NOR ANY 160 | CONTRIBUTORS SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL, 161 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING WITHOUT LIMITATION 162 | LOST PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 163 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 164 | ARISING IN ANY WAY OUT OF THE USE OR DISTRIBUTION OF THE PROGRAM OR THE 165 | EXERCISE OF ANY RIGHTS GRANTED HEREUNDER, EVEN IF ADVISED OF THE POSSIBILITY 166 | OF SUCH DAMAGES. 167 | 168 | 7. GENERAL 169 | 170 | If any provision of this Agreement is invalid or unenforceable under 171 | applicable law, it shall not affect the validity or enforceability of the 172 | remainder of the terms of this Agreement, and without further action by the 173 | parties hereto, such provision shall be reformed to the minimum extent 174 | necessary to make such provision valid and enforceable. 175 | 176 | If Recipient institutes patent litigation against any entity (including a 177 | cross-claim or counterclaim in a lawsuit) alleging that the Program itself 178 | (excluding combinations of the Program with other software or hardware) 179 | infringes such Recipient's patent(s), then such Recipient's rights granted 180 | under Section 2(b) shall terminate as of the date such litigation is filed. 181 | 182 | All Recipient's rights under this Agreement shall terminate if it fails to 183 | comply with any of the material terms or conditions of this Agreement and 184 | does not cure such failure in a reasonable period of time after becoming 185 | aware of such noncompliance. If all Recipient's rights under this Agreement 186 | terminate, Recipient agrees to cease use and distribution of the Program as 187 | soon as reasonably practicable. 
However, Recipient's obligations under this 188 | Agreement and any licenses granted by Recipient relating to the Program shall 189 | continue and survive. 190 | 191 | Everyone is permitted to copy and distribute copies of this Agreement, but in 192 | order to avoid inconsistency the Agreement is copyrighted and may only be 193 | modified in the following manner. The Agreement Steward reserves the right to 194 | publish new versions (including revisions) of this Agreement from time to 195 | time. No one other than the Agreement Steward has the right to modify this 196 | Agreement. The Eclipse Foundation is the initial Agreement Steward. The 197 | Eclipse Foundation may assign the responsibility to serve as the Agreement 198 | Steward to a suitable separate entity. Each new version of the Agreement will 199 | be given a distinguishing version number. The Program (including 200 | Contributions) may always be distributed subject to the version of the 201 | Agreement under which it was received. In addition, after a new version of 202 | the Agreement is published, Contributor may elect to distribute the Program 203 | (including its Contributions) under the new version. Except as expressly 204 | stated in Sections 2(a) and 2(b) above, Recipient receives no rights or 205 | licenses to the intellectual property of any Contributor under this 206 | Agreement, whether expressly, by implication, estoppel or otherwise. All 207 | rights in the Program not expressly granted under this Agreement are 208 | reserved. 209 | 210 | This Agreement is governed by the laws of the State of New York and the 211 | intellectual property laws of the United States of America. No party to this 212 | Agreement will bring a legal action under this Agreement more than one year 213 | after the cause of action arose. Each party waives its rights to a jury trial 214 | in any resulting litigation. 
215 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # huri 2 | 3 | > A long time ago, there was a girl named Huri. Huri’s mother loved her daughter very much, and she always praised Huri. Huri’s mother stopped everybody who passed. She would point at the chair where Huri was sitting. She would say, “That’s my beautiful, hardworking Huri!” Huri always sat outside. She sometimes took naps in her chair. Other times, she sang simple songs. And Huri’s mother praised her hard work and beauty. All of the people in the town listened to Huri’s mother speak of how hard Huri worked. But they were confused; no one ever saw Huri do any work. Soon, the people in town began to giggle when Huri’s mother praised Huri. When they saw her, they whispered, “Here comes Lazy Huri’s mother.” One day, a stranger came into the town. When Huri’s mother saw him, she told him about Huri. She said, “My daughter is very beautiful and hardworking.” The next day, the man visited the king; he was the king’s messenger. He told the king about the beautiful and hardworking girl. The king said, “My son, the prince, will marry this hardworking girl!” The very next week, the prince and Huri were married! Everybody thought that Huri’s mother had planned the whole thing. That is why she had lied about how hard Huri worked; she did it to trick the prince! People sometimes asked Huri’s mother if she had tricked the prince into marrying her daughter. But she never admitted it. She only smiled and winked. 4 | > 5 | > — Armenian folktale 6 | 7 | __Huri__ is a Clojure library for the lazy data scientists. It consists of 8 | * __huri.core__ a loose set of functions on vanilla Clojure collections that constitute an ad-hoc specification of a data frame; along with some utility and math functions. 
9 | * __huri.time__ time handling utilities built on top of [clj-time](https://github.com/clj-time/clj-time). 10 | * __huri.io__ I/O utilities following the API (`slurp-x`, `spit-x`, `cast-fns`, ...) used by [Semantic CSV](https://github.com/metasoarous/semantic-csv) 11 | * __huri.plot__ a DSL for plotting that compiles to R (ggplot2) meant to be used with [Gorilla REPL](http://gorilla-repl.org/) 12 | * __huri.etl__ some light-weight ETL scaffolding built on top of [~~Prismatic~~Plumatic Graph](https://github.com/plumatic/plumbing) 13 | 14 | ## Status 15 | 16 | Huri is still in flux. However, it is already used extensively (and has been for some time) at [GoOpti](https://goopti.com), so it can be considered at least somewhat battle-tested. 17 | 18 | ## Design philosophy 19 | 20 | I gave a talk about the motivation and design philosophy behind Huri at ClojureD 2016: [video](https://www.youtube.com/watch?v=PSTSO8K80U4), [slides](http://www.slideshare.net/simonbelak/doing-data-science-with-clojure). 21 | 22 | ## Usage 23 | 24 | Add this dependency to your project: 25 | 26 | ```clj 27 | [huri "0.10.0-SNAPSHOT"] 28 | ``` 29 | 30 | To get the plots working make sure you have R installed, and on your path so it's accessible from the command line. If you can run Rscript from the command line, then you should be good to go. 
You will also need to have some libraries installed which you can do from R REPL with: 31 | ```r 32 | install.packages("ggplot2") 33 | install.packages("scales") 34 | install.packages("grid") 35 | install.packages("RColorBrewer") 36 | install.packages("ggrepel") 37 | install.packages("svglite") 38 | install.packages("directlabels") 39 | ``` 40 | 41 | ## [Examples](http://viewer.gorilla-repl.org/view.html?source=github&user=sbelak&repo=huri&path=examples/examples.cljw) 42 | 43 | ## Huri likes playing with 44 | 45 | * http://gorilla-repl.org/ 46 | * https://github.com/clj-time/clj-time 47 | * https://github.com/plumatic/plumbing 48 | * https://github.com/metasoarous/semantic-csv 49 | * https://github.com/expez/superstring 50 | * https://github.com/sbelak/tide 51 | * https://github.com/bigmlcom/sampling 52 | 53 | ## For the future 54 | 55 | * Interactive charts; 56 | * Optimizing `->>` that rewrites code on the fly to do as much as possible in a single pass and use transducer fusion more extensively (intermediate results don't need to be end user consumable). 57 | 58 | 59 | ## Contributing 60 | 61 | Feel free to submit a pull request. 62 | If you're looking for things to help with, please take a look at the [GH issues](https://github.com/sbelak/huri/issues) page. 63 | Contributing to the issues with comments, feedback, or requests is also greatly appreciated. 64 | 65 | 66 | ## License 67 | 68 | Copyright © 2016 Simon Belak 69 | 70 | Distributed under the Eclipse Public License either version 1.0 or (at 71 | your option) any later version. 
72 | -------------------------------------------------------------------------------- /doc/intro.md: -------------------------------------------------------------------------------- 1 | # Introduction to huri 2 | 3 | TODO: write [great documentation](http://jacobian.org/writing/what-to-write/) 4 | -------------------------------------------------------------------------------- /project.clj: -------------------------------------------------------------------------------- 1 | (defproject huri "0.10.0-SNAPSHOT" 2 | :description "Tools for the lazy data scientist" 3 | :url "https://github.com/sbelak/huri" 4 | :license {:name "Eclipse Public License" 5 | :url "http://www.eclipse.org/legal/epl-v10.html"} 6 | :dependencies [[org.clojure/clojure "1.9.0-alpha17"] 7 | [clj-time "0.13.0"] 8 | [prismatic/plumbing "0.5.4"] 9 | [org.clojure/math.numeric-tower "0.0.4"] 10 | [org.clojure/data.priority-map "0.0.7"] 11 | [net.cgrand/xforms "0.9.3"] 12 | [cheshire "5.7.1"] 13 | [com.taoensso/timbre "4.10.0"] 14 | [gorilla-renderable "2.0.0"]]) 15 | -------------------------------------------------------------------------------- /src/huri/core.clj: -------------------------------------------------------------------------------- 1 | (ns huri.core 2 | (:require (plumbing [core :refer [distinct-fast map-vals safe-get for-map 3 | map-from-vals map-from-keys]]) 4 | [net.cgrand.xforms :as x] 5 | [clojure.data.priority-map :refer [priority-map-by]] 6 | [clj-time.core :as t] 7 | [clojure.core.reducers :as r] 8 | [clojure.spec.alpha :as s] 9 | [clojure.spec.test.alpha :as s.test]) 10 | (:import org.joda.time.DateTime)) 11 | 12 | (defn papply 13 | "partial that applies its arguments." 14 | [f & args] 15 | (apply partial apply f args)) 16 | 17 | (defn pcomp 18 | ([f g] 19 | (fn [& args] 20 | (let [intermediate (apply g args)] 21 | (if (fn? 
intermediate) 22 | (pcomp f intermediate) 23 | (f intermediate))))) 24 | ([f g & fs] 25 | (reduce pcomp (list* f g fs)))) 26 | 27 | (defn mapply 28 | "map that applies f via apply." 29 | ([f] 30 | (map (papply f))) 31 | ([f coll] 32 | (sequence (mapply f) coll))) 33 | 34 | (defn mapm 35 | "Maps f over coll and pours the results into a hash map. f is expected to return something `into {}` accepts, e.g. a [key value] pair." 36 | [f coll] 37 | (into {} (map f) coll)) 38 | 39 | (defn juxtm "Takes a map of functions and returns a function that applies each of them to its arguments, returning a map from the same keys to the results." 40 | [m] 41 | (fn [& args] 42 | (map-vals #(apply % args) m))) 43 | 44 | (defmacro for-cat "Like for, but concatenates the sequences produced by the body." 45 | [& for-body] 46 | `(apply concat (for ~@for-body))) 47 | 48 | (defmacro for-keep "Like for, but removes nil results." 49 | [& for-body] 50 | `(remove nil? (for ~@for-body))) 51 | 52 | (defmacro with-conformer "Returns an s/conformer for a [tag value] pair (the shape s/or conforms to): binds the value to x and selects the expression to evaluate by matching the tag against tagvals, which are case clauses." 53 | [x & tagvals] 54 | `(s/conformer (fn [[tag# ~x]] 55 | (case tag# ~@tagvals)))) 56 | 57 | (defmacro parallel-map 58 | "A hash map constructor that evaluates its arguments in parallel." 59 | [& keyvals] 60 | `(apply hash-map (pvalues ~@keyvals))) 61 | 62 | (defn fsome 63 | "Takes a function f and returns a function that gets called only if all 64 | the arguments passed in are not nil. Else returns nil." 65 | [f] 66 | (fn [& args] 67 | (when (every? some? args) 68 | (apply f args)))) 69 | 70 | (def patch-nil 71 | "Replaces nil with patch value, passes other values through." 72 | (partial fnil identity)) 73 | 74 | (def transpose 75 | "Transposes vector of vectors." 76 | (papply map vector)) 77 | 78 | (defn val-or-seq "Returns a spec matching either a collection of element-type or a single element-type. Conforms to a collection in both cases: a single value gets wrapped in a vector." 79 | [element-type] 80 | (s/and 81 | (s/or :seq (s/coll-of element-type) 82 | :val element-type) 83 | (with-conformer x 84 | :seq x 85 | :val [x]))) 86 | 87 | (def ensure-seq "Conforms its argument with (val-or-seq any?): collections pass through, any other value is wrapped in a vector." (partial s/conform (val-or-seq any?))) 88 | 89 | (def flatten1 "Concatenates the elements of a collection after passing each through ensure-seq, i.e. flattens by one level." (partial mapcat ensure-seq)) 90 | 91 | (s/def ::keyfn (s/and 92 | (s/or :fn (s/and ifn? (complement (some-fn keyword? 
vector?))) 93 | :key (constantly true)) 94 | (with-conformer x 95 | :fn x 96 | :key #(safe-get % x)))) 97 | 98 | (def ->keyfn (partial s/conform ::keyfn)) 99 | 100 | (s/def ::dataframe (s/nilable (s/every map?))) 101 | 102 | (defn col 103 | ([k] 104 | (map (->keyfn k))) 105 | ([k df] 106 | (sequence (col k) df))) 107 | 108 | (s/def ::col-transforms 109 | (s/map-of keyword? 110 | (s/and (s/or :vec (s/and vector? 111 | (s/cat :f ifn? :keyfns (s/+ ::keyfn))) 112 | :ifn ifn?) 113 | (with-conformer x 114 | :vec (comp (papply (:f x)) (apply juxt (:keyfns x))) 115 | :ifn x)))) 116 | 117 | (s/fdef derive-cols 118 | :args (s/cat :new-cols ::col-transforms 119 | :df ::dataframe) 120 | :ret ::dataframe) 121 | 122 | (defn derive-cols 123 | [new-cols df] 124 | (map (->> new-cols 125 | (s/conform ::col-transforms) 126 | (map (fn [[ks f]] 127 | (fn [row] 128 | (assoc row ks (f row))))) 129 | (apply comp)) 130 | df)) 131 | 132 | (defn update-cols 133 | [update-fns df] 134 | (derive-cols (for-map [[k f] update-fns] 135 | k (comp f k)) 136 | df)) 137 | 138 | (defn any-of 139 | [& keyfns] 140 | {::combinator some-fn 141 | ::keyfns keyfns}) 142 | 143 | (defn every-of 144 | [& keyfns] 145 | {::combinator every-pred 146 | ::keyfns keyfns}) 147 | 148 | (s/def ::keyfns (s/+ (val-or-seq ::keyfn))) 149 | 150 | (s/def ::key-combinator (s/and 151 | (s/or :combinator (s/keys :req [::combinator ::keyfns]) 152 | :keyfns (val-or-seq ::keyfn)) 153 | (with-conformer x 154 | :combinator x 155 | :keyfns {::combinator identity 156 | ::keyfns [x]}))) 157 | 158 | (s/def ::pred (s/and 159 | (s/or :vec (s/and vector? (s/cat :f ifn? :args (s/* any?))) 160 | :fn ifn? 
161 | :val (complement ifn?)) 162 | (with-conformer x 163 | :vec #(apply (:f x) (concat % (:args x))) 164 | :fn (papply x) 165 | :val (papply = x)))) 166 | 167 | (s/def ::filters (s/and 168 | (s/or :map (s/map-of ::key-combinator ::pred :conform-keys true) 169 | :pred ::keyfn) 170 | (with-conformer x 171 | :map (->> x 172 | (map (fn [[{:keys [::combinator ::keyfns]} pred]] 173 | (->> keyfns 174 | (map #(comp pred (apply juxt %))) 175 | (apply combinator)))) 176 | (apply every-pred)) 177 | :pred x))) 178 | 179 | (s/fdef where 180 | :args (s/alt :curried ::filters 181 | :full (s/cat :filters ::filters 182 | :df (s/nilable coll?))) 183 | :ret coll?) 184 | 185 | (defn where "Returns the elements of df that match filters (see the ::filters spec). Any sequence (seq?) is filtered lazily so that element order is preserved; any other collection is poured back into an empty collection of the same type. Returns a curried version when only filters is provided." 186 | ([filters] 187 | (partial where filters)) 188 | ([filters df] 189 | (let [pred (s/conform ::filters filters)] 190 | (if (seq? df) 191 | (filter pred df) 192 | (into (empty df) (filter pred) df))))) 193 | 194 | (s/fdef summarize 195 | :args (s/alt :curried ::summary-fn 196 | :simple (s/cat :f ::summary-fn 197 | :df (s/nilable coll?)) 198 | :keyfn (s/cat :f ::summary-fn 199 | :keyfn (val-or-seq ::keyfn) 200 | :df (s/nilable coll?))) 201 | :ret any?) 202 | 203 | (s/def ::summary-fn 204 | (s/or :map (s/map-of keyword? (s/and 205 | (s/or :vec (s/cat :f ifn? 206 | :keyfn (val-or-seq ::keyfn) 207 | :filters (s/? ::filters)) 208 | :fn fn?) 
209 | (with-conformer x 210 | :vec x 211 | :fn {:f x 212 | :keyfn [identity]}))) 213 | :fn fn?)) 214 | 215 | (defn summarize 216 | ([f] 217 | (partial summarize f)) 218 | ([f df] 219 | (summarize f identity df)) 220 | ([f keyfn df] 221 | (let [[tag f] (s/conform ::summary-fn f)] 222 | (if (= tag :map) 223 | (into {} 224 | (pmap (fn [[k {f :f keyfn-local :keyfn filters :filters}]] 225 | [k (summarize f (cond 226 | (= keyfn identity) keyfn-local 227 | (= keyfn-local [identity]) keyfn 228 | :else (map #(comp % keyfn) keyfn-local)) 229 | (if filters 230 | (where filters df) 231 | df))]) 232 | f)) 233 | (apply f (map #(col % df) (ensure-seq keyfn))))))) 234 | 235 | (s/fdef rollup 236 | :args (s/alt :curried (s/cat :groupfn ::keyfn 237 | :f ::summary-fn) 238 | :simple (s/cat :groupfn ::keyfn 239 | :f ::summary-fn 240 | :df (s/nilable coll?)) 241 | :keyfn (s/cat :groupfn ::keyfn 242 | :f ::summary-fn 243 | :keyfn ::keyfn 244 | :df (s/nilable coll?))) 245 | :ret (s/and map? sorted?)) 246 | 247 | (defn rollup 248 | ([groupfn f] 249 | (partial rollup groupfn f)) 250 | ([groupfn f df] 251 | (rollup groupfn f identity df)) 252 | ([groupfn f keyfn df] 253 | (into (sorted-map) 254 | (x/by-key (->keyfn groupfn) (comp (x/into []) 255 | (map (partial summarize f keyfn)))) 256 | df))) 257 | 258 | (def rollup-vals (pcomp vals rollup)) 259 | (def rollup-keep (pcomp (partial remove nil?) rollup-vals)) 260 | (def rollup-cat (pcomp (papply concat) rollup-vals)) 261 | 262 | (s/def ::fuse-fn (s/and (s/or :map map? 263 | :vec sequential? 264 | :kw keyword? 265 | :fn ifn?) 
266 | (with-conformer x 267 | :map (map-vals ->keyfn x) 268 | :vec (zipmap x x) 269 | :kw {x (->keyfn x)} 270 | :fn {::group x}))) 271 | 272 | (s/fdef rollup-fuse 273 | :args (s/alt :curried (s/cat :groupfn ::fuse-fn 274 | :f ::summary-fn) 275 | :simple (s/cat :groupfn ::fuse-fn 276 | :f ::summary-fn 277 | :df (s/nilable coll?)) 278 | :keyfn (s/cat :groupfn ::fuse-fn 279 | :f ::summary-fn 280 | :keyfn ::keyfn 281 | :df (s/nilable coll?))) 282 | :ret coll?) 283 | 284 | (defn rollup-fuse 285 | ([groupfn f] 286 | (partial rollup-fuse groupfn f)) 287 | ([groupfn f df] 288 | (rollup-fuse groupfn f identity df)) 289 | ([groupfn f keyfn df] 290 | (let [groupfn (s/conform ::fuse-fn groupfn)] 291 | (rollup-vals (apply juxt (vals groupfn)) 292 | (fn [group] 293 | (merge (into {} (summarize f keyfn group)) 294 | ((juxtm groupfn) (first group)))) 295 | df)))) 296 | 297 | (s/fdef rollup-transpose 298 | :args (s/alt :curried (s/cat :indexfn ::keyfn 299 | :f (s/and ::summary-fn map?)) 300 | :full (s/cat :indexfn ::keyfn 301 | :f (s/and ::summary-fn map?) 302 | :df (s/nilable coll?))) 303 | :ret map?) 304 | 305 | (defn rollup-transpose 306 | ([indexfn f] 307 | (partial rollup-transpose indexfn f)) 308 | ([indexfn f df] 309 | (->> df 310 | (rollup indexfn f) 311 | (reduce-kv (fn [acc idx kvs] 312 | (reduce-kv (fn [acc k v] 313 | (update acc k conj [idx v])) 314 | acc 315 | kvs)) 316 | (zipmap (keys f) (repeat (sorted-map))))))) 317 | 318 | (s/fdef window 319 | :args (s/alt :curried ifn? 320 | :simple (s/cat :f ifn? 321 | :df (s/nilable coll?)) 322 | :keyfn (s/cat :f ifn? 323 | :keyfn ::keyfn 324 | :df (s/nilable coll?)) 325 | :lag (s/cat :lag pos-int? 326 | :f ifn? 327 | :keyfn ::keyfn 328 | :df (s/nilable coll?))) 329 | :ret coll?) 
330 | 331 | (defn window 332 | ([f] 333 | (partial window f)) 334 | ([f df] 335 | (window f identity df)) 336 | ([f keyfn df] 337 | (window 1 f keyfn df)) 338 | ([lag f keyfn df] 339 | (let [xs (col keyfn df)] 340 | (map f (drop lag xs) xs)))) 341 | 342 | (s/fdef size 343 | :args (s/cat :df (s/every coll?)) 344 | :ret (s/cat :rows int? :cols int?)) 345 | 346 | (defn size 347 | [df] 348 | [(count df) (count (first df))]) 349 | 350 | (s/fdef cols 351 | :args (s/cat :df ::dataframe) 352 | :ret coll?) 353 | 354 | (defn cols 355 | [df] 356 | (keys (first df))) 357 | 358 | (defn col-oriented "Turns a dataframe (a sequence of row maps) into a map from column key to the sequence of that column's values. The column keys are taken from the first row." 359 | [df] 360 | (for-map [k (cols df)] 361 | k (col k df))) 362 | 363 | (defn row-oriented "Turns a map from column key to a sequence of values into a sequence of row maps. Returns an empty sequence for an empty or nil map (applying map to no collections would otherwise yield a transducer)." 364 | [m] 365 | (if (seq m) (apply map (comp (partial zipmap (keys m)) vector) (vals m)) ())) 366 | 367 | (defn ->data-frame 368 | [cols xs] 369 | (if (and (not= (count cols) (count (first xs))) 370 | (some coll? (first xs))) 371 | (->data-frame cols (map flatten1 xs)) 372 | (map (partial zipmap cols) xs))) 373 | 374 | (defn select-cols 375 | [cols df] 376 | (map (juxtm (map-from-keys ->keyfn cols)) df)) 377 | 378 | (s/def ::join-on (s/and 379 | (s/or :vec (s/cat :left ::keyfn :right ::keyfn) 380 | :singleton ::keyfn) 381 | (with-conformer x 382 | :vec x 383 | :singleton {:left x :right x}))) 384 | 385 | (s/def ::op #{:inner-join :semi-join :anti-join :left-join}) 386 | 387 | (s/fdef join 388 | :args (s/alt :default (s/cat :on ::join-on 389 | :left ::dataframe 390 | :right ::dataframe) 391 | :with-op (s/cat :op ::op 392 | :on ::join-on 393 | :left ::dataframe 394 | :right ::dataframe)) 395 | :ret ::dataframe) 396 | 397 | (defn join 398 | ([on left right] 399 | (join :left-join on left right)) 400 | ([op on left right] 401 | (let [{lkey :left rkey :right} (s/conform ::join-on on) 402 | left->right (comp (map-from-vals rkey right) lkey)] 403 | (if (#{:semi-join :anti-join} op) 404 | (where (if (= op :semi-join) 405 | left->right 406 | (comp nil? 
left->right)) 407 | left) 408 | (doall 409 | (for [row left 410 | :when (or (= op :left-join) (left->right row))] 411 | (merge row (left->right row)))))))) 412 | 413 | (defn count-where "Returns the number of elements of df that match filters (same filter format as where). Returns a curried version when only filters is provided." 414 | ([filters] 415 | (partial count-where filters)) 416 | ([filters df] 417 | (count (where filters df)))) 418 | 419 | (defn count-distinct "Returns the number of distinct values in df. Optionally takes a keyfn to extract the values." 420 | ([df] 421 | (count (distinct-fast df))) 422 | ([keyfn df] 423 | (count-distinct (col keyfn df)))) 424 | 425 | (defn safe-divide "Like /, but coerces the result to a double and returns nil instead of dividing by zero, i.e. when any denominator is zero or, when called with no denominators, when numerator is zero." 426 | [numerator & denominators] 427 | (when (or (and (not-empty denominators) (not-any? zero? denominators)) 428 | (and (not (zero? numerator)) (empty? denominators))) 429 | (double (apply / numerator denominators)))) 430 | 431 | (s/fdef sum 432 | :args (s/alt :coll (s/nilable coll?) 433 | :keyfn (s/cat :keyfn ::keyfn 434 | :df (s/nilable coll?))) 435 | :ret number?) 436 | 437 | (defn sum 438 | "Returns the sum of the values in df, ignoring nils. Optionally takes a keyfn to extract the values." 439 | ([df] 440 | (sum identity df)) 441 | ([keyfn df] 442 | (transduce (keep (->keyfn keyfn)) + df))) 443 | 444 | (s/fdef rate 445 | :args (s/alt :curried (s/cat :keyfn-a ::keyfn 446 | :keyfn-b ::keyfn) 447 | :full (s/cat :keyfn-a ::keyfn 448 | :keyfn-b ::keyfn 449 | :df (s/nilable coll?))) 450 | :ret (s/nilable number?)) 451 | 452 | (defn rate 453 | "Returns the quotient of the sum of values extracted by keyfn-a and 454 | keyfn-b. 455 | Returns a curried version when only keyfns are provided." 456 | ([keyfn-a keyfn-b] 457 | (partial rate keyfn-a keyfn-b)) 458 | ([keyfn-a keyfn-b df] 459 | (let [keyfn-a (->keyfn keyfn-a) 460 | keyfn-b (->keyfn keyfn-b)] 461 | (transduce identity 462 | (fn 463 | ([] [0 0]) 464 | ([[sa sb :as acc] e] 465 | (let [a (keyfn-a e) 466 | b (keyfn-b e)] 467 | (if (or (nil? a) (nil? 
b)) 468 | acc 469 | [(+ a sa) (+ b sb)]))) 470 | ([[a b]] 471 | (safe-divide a b))) 472 | df)))) 473 | 474 | (s/fdef share 475 | :args (s/alt :curried ::filters 476 | :simple (s/cat :filters ::filters 477 | :df (s/nilable coll?)) 478 | :weightfn (s/cat :filters ::filters 479 | :weightfn ::keyfn 480 | :df (s/nilable coll?)))) 481 | 482 | (defn share 483 | "Returns the share of values in df for which filter returns true. 484 | Optionally takes a weightfn to provide weights for each data point. 485 | Uses the same filter format as where. 486 | Returns a curried version when only filter is provided." 487 | ([filters] 488 | (partial share filters)) 489 | ([filters df] 490 | (share filters (constantly 1) df)) 491 | ([filters weightfn df] 492 | (safe-divide (sum weightfn (where filters df)) 493 | (sum weightfn df)))) 494 | 495 | (s/fdef mean 496 | :args (s/alt :coll (s/nilable coll?) 497 | :keyfn (s/cat :keyfn ::keyfn 498 | :df (s/nilable coll?)) 499 | :weightfn (s/cat :keyfn ::keyfn 500 | :weightfn ::keyfn 501 | :df (s/nilable coll?))) 502 | :ret (s/nilable number?)) 503 | 504 | (defn mean 505 | "Calculates the arithmetic mean. 506 | Optionally takes a keyfn to extract the values and weightfn to provide weights 507 | for each data point. Data points whose value or weight is nil are skipped (consistent with sum and rate) instead of causing an exception." 508 | ([df] 509 | (mean identity df)) 510 | ([keyfn df] 511 | (mean keyfn (constantly 1) df)) 512 | ([keyfn weightfn df] 513 | (let [keyfn (->keyfn keyfn) 514 | weightfn (->keyfn weightfn) times (fsome *)] 515 | (rate #(times (keyfn %) (weightfn %)) weightfn df)))) 516 | 517 | (def rollup-mean 518 | "Rollup and return the mean of aggregations." 519 | (pcomp mean rollup-vals)) 520 | 521 | (defn top-n 522 | "Return n biggest values in coll. 523 | Optionally takes a keyfn to extract the values. 524 | Returns a curried version when only n is provided." 
525 | ([n] 526 | (partial top-n n)) 527 | ([n df] 528 | (top-n n identity df)) 529 | ([n keyfn df] 530 | (into (empty df) (take n) (sort-by (->keyfn keyfn) > df)))) 531 | 532 | (s/fdef distribution 533 | :args (s/alt :coll (s/nilable coll?) 534 | :keyfn (s/cat :keyfn ::keyfn 535 | :df (s/nilable coll?)) 536 | :weightfn (s/cat :keyfn ::keyfn 537 | :weightfn ::keyfn 538 | :df (s/nilable coll?))) 539 | :ret (s/and map? sorted?)) 540 | 541 | (defn distribution 542 | "Returns a map between all distinct values in df and their relative frequency. 543 | Optionally takes a keyfn to extract the values and weightfn to provide weights 544 | for each data point." 545 | ([df] 546 | (distribution identity df)) 547 | ([keyfn df] 548 | (distribution keyfn (constantly 1) df)) 549 | ([keyfn weightfn df] 550 | (when-let [norm (safe-divide (sum weightfn df))] 551 | (into (priority-map-by >) 552 | (rollup keyfn (comp (partial * norm) sum) weightfn df))))) 553 | 554 | (defn extent 555 | "Returns a pair of [smallest, biggest] or [earliest, latest] if passed a coll 556 | of dates. 557 | Optionally takes a keyfn to extract the values." 558 | ([xs] 559 | (let [[x & xs] xs] 560 | (r/fold (r/monoid (if (instance? 
org.joda.time.DateTime x) 561 | (fn [[acc-min acc-max] x] 562 | [(t/earliest acc-min x) (t/latest acc-max x)]) 563 | (fn [[acc-min acc-max] x] 564 | [(min acc-min x) (max acc-max x)])) 565 | (constantly [x x])) 566 | xs))) 567 | ([keyfn df] 568 | (extent (col keyfn df)))) 569 | 570 | (s.test/instrument) 571 | -------------------------------------------------------------------------------- /src/huri/etl.clj: -------------------------------------------------------------------------------- 1 | (ns huri.etl 2 | (:require (plumbing [graph :as graph] 3 | [core :refer [defnk]] 4 | [map :refer [map-leaves-and-path keep-leaves 5 | safe-select-keys]]) 6 | (clj-time [core :as t] 7 | [periodic :as t.periodic]) 8 | [plumbing.fnk.pfnk :as pfnk] 9 | [taoensso.timbre :as log])) 10 | 11 | (def task-graph (atom {})) 12 | 13 | (def register-task (partial swap! task-graph assoc)) 14 | 15 | (defmacro deftask 16 | [task [& args] & body] 17 | `(do 18 | (defnk ~task [~@args] ~@body) 19 | (register-task ~(keyword task) 20 | (vary-meta ~task (partial merge (meta (var ~task))))))) 21 | 22 | (def exception? (partial instance? Exception)) 23 | 24 | (def with-error-handler 25 | (partial map-leaves-and-path 26 | (fn [ks f] 27 | (pfnk/fn->fnk 28 | (fn [m] 29 | (try 30 | (if (some (comp exception? 
val) m) 31 | (throw (ex-info "Upstream error" {})) 32 | (f m)) 33 | (catch Exception e 34 | (log/error e ks) 35 | e))) 36 | [(pfnk/input-schema f) 37 | (pfnk/output-schema f)])))) 38 | 39 | (defn run 40 | [& {:keys [execution-strategy] :or {execution-strategy :parallel}}] 41 | (into {} (((case execution-strategy 42 | :parallel graph/par-compile 43 | :sequential graph/compile) 44 | (with-error-handler @task-graph)) {}))) 45 | 46 | (defn run-only 47 | [tasks & {:keys [evaluation-strategy] :or {evaluation-strategy :parallel}}] 48 | (safe-select-keys ((graph/lazy-compile (with-error-handler @task-graph)) {}) 49 | tasks)) 50 | 51 | (defn run-if "Runs, via run-only, the tasks of task-graph that keep-leaves retains for pred, and returns their results keyed by task." 52 | [pred] 53 | (run-only (keys (keep-leaves pred @task-graph)))) 54 | 55 | (defn refreshing 56 | [at period f] 57 | (let [cache (atom ::empty) 58 | schedule (atom (t.periodic/periodic-seq at period))] 59 | (reify clojure.lang.IDeref 60 | (deref [_] 61 | (if (or (t/after? (t/now) (first @schedule)) (= @cache ::empty)) 62 | (do 63 | (swap! schedule (partial drop-while (partial t/after? (t/now)))) 64 | (reset! cache nil) 65 | (reset!
cache (f))) 66 | @cache))))) 67 | -------------------------------------------------------------------------------- /src/huri/io.clj: -------------------------------------------------------------------------------- 1 | (ns huri.io 2 | (:require [huri.core :refer [update-cols]] 3 | [cheshire.core :as json] 4 | [clojure.java.io :as io])) 5 | 6 | (defn spit-json 7 | "Encodes x as JSON into f, which is opened with clojure.java.io/writer and closed once encoding has finished. When the options map contains :cast-fns, x is first passed through (update-cols cast-fns x)." 8 | ([f x] 9 | (spit-json f {} x)) 10 | ([f {:keys [cast-fns]} x] 11 | (with-open [w (io/writer f)] 12 | (json/encode-stream (cond->> x 13 | cast-fns (update-cols cast-fns)) 14 | w)))) 15 | 16 | (defn slurp-json 17 | "Reads f as JSON with keyword keys. The reader is closed before returning, so a top-level array (which cheshire parses lazily) is fully realized first." 18 | [f] 19 | (with-open [r (io/reader f)] 20 | (let [parsed (json/decode-stream r true)] 21 | (cond-> parsed 22 | (seq? parsed) doall)))) 23 | -------------------------------------------------------------------------------- /src/huri/math.clj: -------------------------------------------------------------------------------- 1 | (ns huri.math 2 | (:require [huri.core :refer [safe-divide distribution sum mean]] 3 | [net.cgrand.xforms :as x] 4 | [clojure.math.numeric-tower :refer [expt round sqrt abs]])) 5 | 6 | (defn smooth 7 | [window xs] 8 | (sequence (x/partition window 1 x/avg) xs)) 9 | 10 | (defn growth 11 | [b a] 12 | (safe-divide (* (if (neg?
a) -1 1) (- b a)) a)) 13 | 14 | (defn decay 15 | [lambda t] 16 | (expt Math/E (- (* lambda t)))) 17 | 18 | (defn logistic 19 | [L k x0 x] 20 | (/ L (+ 1 (decay k (- x x0))))) 21 | 22 | (def entropy (comp - 23 | (partial sum #(* % (Math/log %))) 24 | vals 25 | distribution)) 26 | 27 | (defn round-to 28 | ([precision] 29 | (partial round-to precision)) 30 | ([precision x] 31 | (let [scale (/ precision)] 32 | (/ (round (* x scale)) scale)))) 33 | 34 | (defn clamp 35 | ([bounds] 36 | (partial clamp bounds)) 37 | ([[lower upper] x] 38 | (clamp lower upper x)) 39 | ([lower upper x] 40 | (max (min x upper) lower))) 41 | 42 | (def cdf (comp (partial reductions (fn [[_ acc] [x y]] 43 | [x (+ y acc)])) 44 | (partial sort-by key) 45 | distribution)) 46 | 47 | (defn percentiles 48 | ([df] 49 | (percentiles identity df)) 50 | ([keyfn df] 51 | (percentiles keyfn (constantly 1) df)) 52 | ([keyfn weightfn df] 53 | (loop [[[k p] & tail] (seq (distribution keyfn weightfn df)) 54 | percentile 1 55 | acc {}] 56 | (if k 57 | (recur tail (- percentile p) (assoc acc k percentile)) 58 | acc)))) 59 | 60 | (defn kl-divergence 61 | "Kullback-Leibler divergence of discrete probability distributions `p` and `q`. 62 | https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence" 63 | [p q] 64 | (reduce + (map (fn [pi qi] 65 | (if (or (zero? pi) (zero? qi)) 66 | 0 67 | (* pi (Math/log (/ pi qi))))) 68 | p q))) 69 | 70 | (defn js-divergence 71 | "Jensen-Shannon divergence of discrete probability distributions `p` and `q`. 72 | Note returned is the square root of JS-divergence, so that it obeys the 73 | metric laws. 74 | https://en.wikipedia.org/wiki/Jensen%E2%80%93Shannon_divergence" 75 | [p q] 76 | (let [m (map (comp (partial * 0.5) +) p q)] 77 | (sqrt (+ (* 0.5 (kl-divergence p m)) (* 0.5 (kl-divergence q m)))))) 78 | 79 | (defn euclidean-distance 80 | "Euclidean distance between vectors `p` and `q`." 
81 | [p q] 82 | (sqrt (reduce + (map (comp #(* % %) -) p q)))) 83 | 84 | (defn center 85 | [p] 86 | (let [mu (mean p)] 87 | (map #(- % mu) p))) 88 | 89 | (defn em-distance 90 | [a b] 91 | (transduce identity 92 | (fn 93 | ([] 94 | {:total-distance 0 95 | :last-distance 0}) 96 | ([{:keys [total-distance]}] total-distance) 97 | ([{:keys [total-distance last-distance]} delta] 98 | (let [current-distance (+ delta last-distance)] 99 | {:total-distance (+ total-distance (abs current-distance)) 100 | :last-distance current-distance}))) 101 | (map - a b))) 102 | -------------------------------------------------------------------------------- /src/huri/plot.clj: -------------------------------------------------------------------------------- 1 | ;;;; R interoperability code is based on Jony Hudson's 2 | ;;;; https://github.com/JonyEpsilon/gg4clj 3 | 4 | (ns huri.plot 5 | (:require [huri.core :refer [rollup derive-cols sum col extent cols size 6 | col-oriented]] 7 | [clojure.string :as s] 8 | [plumbing.core :refer [map-vals for-map assoc-when]] 9 | [clojure.java.shell :as shell] 10 | [clojure.walk :as walk] 11 | [gorilla-renderable.core :as render] 12 | [clojure.xml :as xml] 13 | [clj-time.core :as t]) 14 | (:import org.joda.time.DateTime 15 | java.io.File 16 | java.util.UUID)) 17 | 18 | (declare ->r) 19 | 20 | (defn- quote-string 21 | [s] 22 | (str "\"" s "\"")) 23 | 24 | (defn- function-name 25 | [f] 26 | (case f 27 | :+ (quote-string "+") 28 | :<- (quote-string "<-") 29 | (name f))) 30 | 31 | (defn- fn-from-vec 32 | [[head & tail]] 33 | (str (function-name head) "(" (s/join ", " (map ->r tail)) ")")) 34 | 35 | (defn- named-args-from-map 36 | [arg-map] 37 | (->> arg-map 38 | keys 39 | (map #(str (name %) " = " (->r (% arg-map)))) 40 | (s/join ", "))) 41 | 42 | (defn r+ 43 | [& args] 44 | (reduce (partial vector :+) args)) 45 | 46 | (defn ->r 47 | [code] 48 | (cond 49 | (vector? code) (if (vector? 
(first code)) 50 | (s/join ";\n" (map ->r code)) 51 | (fn-from-vec code)) 52 | (map? code) (named-args-from-map code) 53 | (keyword? code) (name code) 54 | (string? code) (quote-string code) 55 | (or (true? code) (false? code)) (s/upper-case (str code)) 56 | :else (pr-str code))) 57 | 58 | (defn- rscript 59 | [script-path] 60 | (let [return-val (shell/sh "Rscript" "--vanilla" script-path)] 61 | (when-not (zero? (:exit return-val)) 62 | (println (:err return-val))))) 63 | 64 | (defn- ggsave 65 | [command filepath width height] 66 | [command [:ggsave {:filename filepath :width width :height height}]]) 67 | 68 | (defn- fresh-ids 69 | [svg] 70 | (->> svg 71 | (tree-seq coll? identity) 72 | (filter map?) 73 | (keep :id) 74 | (map (fn [new old] 75 | {old new 76 | (str "#" old) (str "#" new) 77 | (format "url(#%s)" old) (format "url(#%s)" new)}) 78 | (repeatedly #(str (UUID/randomUUID)))) 79 | (apply merge))) 80 | 81 | (defn- mangle-ids 82 | "ggplot produces SVGs with elements that have id attributes. These ids are 83 | unique within each plot, but are generated in such a way that they clash when 84 | there's more than one plot in a document. 85 | This function is a workaround for that. It takes an SVG string and replaces 86 | the ids with globally unique ids, returning a string." 87 | [svg] 88 | (let [svg (xml/parse (java.io.ByteArrayInputStream. (.getBytes svg))) 89 | smap (fresh-ids svg) 90 | mangle (fn [x] 91 | (if (map? x) 92 | (into {} 93 | (for [[k v] x] 94 | [k (if (or (= :id k) 95 | (and (string? v) 96 | (or (s/starts-with? v "#") 97 | (s/starts-with? 
v "url(#")))) 98 | (get smap v v) 99 | v)])) 100 | x))] 101 | (with-out-str 102 | (xml/emit (walk/prewalk mangle svg))))) 103 | 104 | (defn render 105 | ([plot-command] 106 | (render plot-command {})) 107 | ([plot-command options] 108 | (let [width (or (:width options) 6.5) 109 | height (or (:height options) (/ width 1.618)) 110 | r-file (File/createTempFile "huri-plot" ".r") 111 | r-path (.getAbsolutePath r-file) 112 | out-file (File/createTempFile "huri-plot" ".svg") 113 | out-path (.getAbsolutePath out-file) 114 | _ (spit r-path (->r (ggsave plot-command out-path width height))) 115 | _ (rscript r-path) 116 | rendered-plot (slurp out-path) 117 | _ (.delete r-file) 118 | _ (.delete out-file)] 119 | (mangle-ids rendered-plot)))) 120 | 121 | (defrecord GGView [plot-command options]) 122 | 123 | (extend-type GGView 124 | render/Renderable 125 | (render [self] 126 | {:type :html 127 | :content (render (:plot-command self) (:options self)) 128 | :value (pr-str self)})) 129 | 130 | (defn view 131 | ([plot-command] (view plot-command {})) 132 | ([plot-command options] 133 | (GGView. plot-command options))) 134 | 135 | (defn- sanitize-key "Turns a column key into a keyword that can be emitted as an R name. Every non-word character and a leading digit are replaced by __ followed by the character code; a name that would then start with an underscore is prefixed with g. Sequential keys are sanitized element-wise and nil is returned unchanged." 136 | [k] 137 | (cond 138 | (nil? k) nil 139 | (sequential? k) (map sanitize-key k) 140 | :else (let [sanitized (-> (if (or (keyword? k) (symbol? k) (string? k)) 141 | (name k) 142 | (str k)) 143 | (s/replace #"(?:^\d)|\W" (comp (partial str "__") 144 | int first)))] 145 | (keyword 146 | (if (s/starts-with?
sanitized "_") 147 | (str "g" sanitized) 148 | sanitized))))) 149 | 150 | (defmulti ->r-type class) 151 | 152 | (defmethod ->r-type clojure.lang.Keyword 153 | [x] 154 | (name x)) 155 | 156 | (defmethod ->r-type java.lang.String 157 | [x] 158 | x) 159 | 160 | (defmethod ->r-type java.lang.Long 161 | [x] 162 | x) 163 | 164 | (defmethod ->r-type java.lang.Double 165 | [x] 166 | x) 167 | 168 | (defmethod ->r-type java.lang.Integer 169 | [x] 170 | x) 171 | 172 | (defmethod ->r-type java.lang.Float 173 | [x] 174 | x) 175 | 176 | (defmethod ->r-type clojure.lang.BigInt 177 | [x] 178 | (long x)) 179 | 180 | (defmethod ->r-type clojure.lang.Ratio 181 | [x] 182 | (double x)) 183 | 184 | (defmethod ->r-type nil 185 | [x] 186 | :NA) 187 | 188 | (defmethod ->r-type :default 189 | [x] 190 | (str x)) 191 | 192 | (defmethod ->r-type org.joda.time.DateTime 193 | [x] 194 | [:as.Date (str x)]) 195 | 196 | (defn r-template 197 | [template & params] 198 | (->> params 199 | (map #(if (keyword? %) 200 | (name %) 201 | %)) 202 | (apply format template) 203 | keyword)) 204 | 205 | (defn- ->col-oriented 206 | [df] 207 | (cond 208 | (map? df) (->col-oriented (seq df)) 209 | (map? (first df)) (for-map [k (keys (first df))] 210 | (sanitize-key k) (col k df)) 211 | (sequential? (first df)) (->> df 212 | (map (partial zipmap [:x__auto :y__auto])) 213 | ->col-oriented) 214 | (not-empty df) (->col-oriented (map vector (range) df)))) 215 | 216 | (defn ->col 217 | [xs] 218 | (into [:c] (map ->r-type xs))) 219 | 220 | (defn ->matrix 221 | [df] 222 | (let [[m n] (size df)] 223 | [:matrix (->col (apply concat (vals (col-oriented df)))) 224 | {:nrow m 225 | :ncol n 226 | :dimnames [:list :NULL (->col (cols df))]}])) 227 | 228 | (defn melt 229 | [cols value-col group-col df] 230 | (mapcat (fn [col df] 231 | (for [row df] 232 | (assoc row value-col (row col) 233 | group-col col))) 234 | cols 235 | (repeat df))) 236 | 237 | (defn- typespec 238 | [df] 239 | (map-vals (comp #(cond 240 | (number?
%) :number 241 | (instance? org.joda.time.DateTime %) :date 242 | :else :categorical) 243 | first) 244 | df)) 245 | 246 | (defn- date-scale-resolution 247 | [dts] 248 | (let [[start end] (extent dts)] 249 | (if (< (t/in-days (t/interval start end)) 90) 250 | "%d-%b" 251 | "%b-%y"))) 252 | 253 | (def preamble [[:library :ggplot2] 254 | [:library :scales] 255 | [:library :grid] 256 | [:library :RColorBrewer] 257 | [:library :ggrepel] 258 | [:library :directlabels] 259 | [:<- :palette [:brewer.pal "Greys" {:n 9}]] 260 | {:color.background (keyword "palette[2]")} 261 | {:color.grid.major (keyword "palette[3]")} 262 | {:color.axis.text (keyword "palette[6]")} 263 | {:color.axis.title (keyword "palette[7]")} 264 | {:color.title (keyword "palette[9]")}]) 265 | 266 | (def theme 267 | (r+ 268 | [:theme_bw {:base_size 9}] 269 | [:theme {:panel.background [:element_rect {:fill :color.background 270 | :color :color.background}]}] 271 | [:theme {:plot.background [:element_rect {:fill :color.background 272 | :color :color.background}]}] 273 | [:theme {:panel.border [:element_rect {:color :color.background}]}] 274 | [:theme {:panel.grid.major [:element_line {:color :color.grid.major 275 | :size 0.25}]}] 276 | [:theme {:panel.grid.minor [:element_blank]}] 277 | [:theme {:axis.ticks [:element_blank]}] 278 | [:theme {:legend.background [:element_rect {:fill :color.background}]}] 279 | [:theme {:legend.key [:element_rect {:fill :color.background 280 | :color :color.background}]}] 281 | [:theme {:legend.text [:element_text {:size 6 282 | :color :color.axis.title}]}] 283 | [:theme {:legend.title [:element_blank]}] 284 | [:theme {:plot.title [:element_text {:size 10 285 | :color :color.title 286 | :vjust 1.25}]}] 287 | [:theme {:axis.text.x [:element_text {:size 6 288 | :color :color.axis.text}]}] 289 | [:theme {:axis.text.y [:element_text {:size 6 290 | :color :color.axis.text}]}] 291 | [:theme {:axis.title.x [:element_text {:size 7 292 | :color :color.axis.title 293 | :vjust 0}]}] 294 
| [:theme {:axis.title.y [:element_text {:size 7 295 | :color :color.axis.title 296 | :vjust 1.25}]}] 297 | [:theme {:plot.margin [:unit [:c 0.35 0.2 0.3 0.35] "cm"]}])) 298 | 299 | (defmacro defplot 300 | [name & args] 301 | (let [params (butlast args) 302 | positional-params (butlast params) 303 | [x y] positional-params 304 | defaults (merge {:x-label nil 305 | :y-label nil 306 | :title "" 307 | :x-scale :auto 308 | :y-scale :auto 309 | :group-by nil 310 | :colour "#c0392b" 311 | :alpha 0.75 312 | :legend :auto 313 | :sort-by nil 314 | :share-x? false 315 | :trendline? false 316 | :smoothing-method nil 317 | :facet nil 318 | :x-rotate nil 319 | :width 9 320 | :height 5} 321 | (last params)) 322 | body (last args)] 323 | `(defn ~name 324 | ([df#] 325 | (~name ~@(if y 326 | [:x__auto :y__auto] 327 | [:y__auto]) {} df#)) 328 | ~@(if y 329 | [`([options# df#] 330 | (~name :x__auto :y__auto options# df#)) 331 | `([~@positional-params df#] 332 | (~name ~@positional-params {} df#))] 333 | [`([arg# df#] 334 | (if (map? arg#) 335 | (~name :y__auto arg# df#) 336 | (~name arg# {} df#)))]) 337 | ([~@positional-params options# df#] 338 | (if (sequential? ~(last positional-params)) 339 | (~name ~@(butlast positional-params) :y__auto 340 | (assoc options# :group-by :series__auto) 341 | (melt ~(last positional-params) :y__auto :series__auto df#)) 342 | (when (not-empty df#) 343 | (let [{:keys ~(mapv #(symbol (subs (str %) 1)) (keys defaults)) 344 | :as options#} (merge ~defaults options#) 345 | total# (when (and (:stacked? options#) 346 | (or ~'trendline? (:show-values? options#))) 347 | (comp (rollup ~(first positional-params) sum 348 | ~(second positional-params) 349 | df#) 350 | ~(first positional-params))) 351 | used-cols# (->> options# 352 | vals 353 | (concat ~(vec positional-params) [:group__total]) 354 | flatten 355 | (filter keyword?) 
356 | (map sanitize-key)) 357 | ~'*df* (select-keys (->col-oriented 358 | (if total# 359 | (derive-cols {:group__total total#} df#) 360 | df#)) 361 | used-cols#) 362 | col-types# (typespec ~'*df*) 363 | ~'x-scale (if (= ~'x-scale :auto) 364 | (case (cond->> (col-types# (sanitize-key ~x)) 365 | (= '~name '~'bar-chart) 366 | (#(get #{:date} % :categorical))) 367 | :date :dates 368 | :categorical :categorical 369 | :linear) 370 | ~'x-scale) 371 | ~'y-scale (if (= ~'y-scale :auto) 372 | (case (col-types# (sanitize-key ~y)) 373 | :date :dates 374 | :categorical :categorical 375 | :linear) 376 | ~'y-scale) 377 | ~'x-label (or ~'x-label 378 | (if (#{:x__auto :y__auto} ~x) 379 | "" 380 | (name ~x))) 381 | ~'y-label (or ~'y-label 382 | ~(if y 383 | `(if (not= ~y :y__auto) 384 | (name ~y) 385 | "") 386 | ""))] 387 | (view 388 | [[:<- :g [:data.frame (map-vals ->col ~'*df*)]] 389 | preamble 390 | (->> (let [~@(mapcat #(vector % `(sanitize-key ~%)) 391 | (concat positional-params 392 | ['group-by 'facet 'sort-by]))] 393 | (concat ~body 394 | [(when ~'facet 395 | [:facet_grid (keyword 396 | (if (sequential? ~'facet) 397 | (->> ~'facet 398 | (map name) 399 | (s/join " ~ ")) 400 | (str "~" (name ~'facet))))]) 401 | (when ~'trendline? 402 | [:geom_smooth 403 | [:aes (cond 404 | total# {:y :group__total} 405 | ~'group-by {:group ~'group-by} 406 | :else {})] 407 | (merge {:alpha 0.2 408 | :colour "black" 409 | :fill "black"} 410 | (when ~'smoothing-method 411 | {:method (name ~'smoothing-method)}))]) 412 | (when (and ~'share-x? ~'group-by) 413 | [:facet_grid (r-template "%s ~ ." 
~'group-by) 414 | {:scales "free_y"}]) 415 | (let [scale# (fn [scale-type# labels#] 416 | [scale-type# 417 | (merge {:labels labels#} 418 | (when (= ~'legend :direct) 419 | {:expand [:c 0.1 0]}))])] 420 | (case ~'x-scale 421 | :log (scale# :scale_x_log10 :comma) 422 | :sqrt (scale# :scale_x_sqrt :comma) 423 | :linear (scale# :scale_x_continuous :comma) 424 | :percent (scale# :scale_x_continuous :percent) 425 | :dates (scale# :scale_x_date 426 | [:date_format 427 | (date-scale-resolution 428 | (~'*df* ~x))]) 429 | :categorical nil)) 430 | (case ~'y-scale 431 | :log [:scale_y_log10 {:labels :comma}] 432 | :sqrt [:scale_y_sqrt {:labels :comma}] 433 | :linear [:scale_y_continuous {:labels :comma}] 434 | :percent [:scale_y_continuous {:labels :percent}] 435 | :dates [:scale_y_date 436 | {:labels [:date_format 437 | (date-scale-resolution 438 | (~'*df* ~y))]}] 439 | :categorical nil) 440 | theme 441 | (when-not (or (true? ~'legend) 442 | (and (= :auto ~'legend) 443 | (or ~'group-by 444 | (:size options#)) 445 | (not ~'share-x?) 446 | (not ~'facet))) 447 | [:theme {:legend.position "none"}]) 448 | (when (= :direct ~'legend) 449 | [:geom_dl [:aes {:label ~'group-by}] 450 | {:method [:list "last.bumpup" {:cex 0.6}]}]) 451 | (when (or (number? ~'x-rotate) 452 | (and (= ~'x-rotate :auto) 453 | (nil? (:flip? options#)))) 454 | [:theme 455 | {:axis.text.x [:element_text 456 | {:angle (if (number? ~'x-rotate) 457 | ~'x-rotate 458 | 45) 459 | :hjust 1}]}]) 460 | [:labs {:x ~'x-label 461 | :y ~'y-label 462 | :title ~'title}]])) 463 | (remove nil?) 464 | (apply r+))] 465 | {:width ~'width :height ~'height})))))))) 466 | 467 | (defn format-value 468 | [x percent?] 469 | (if percent? 470 | [:sprintf "%1.2f%%" (r-template "100*%s" x)] 471 | x)) 472 | 473 | (defplot histogram x {:bins 20 474 | :bin-width nil 475 | :density? false 476 | :frequency? false 477 | :show-mean? 
true} 478 | (let [bin-width (or bin-width 479 | (/ (- (apply max (*df* x)) (apply min (*df* x))) 480 | bins)) 481 | aesthetics (if frequency? 482 | {:colour (or group-by colour)} 483 | {:fill (or group-by colour)})] 484 | [[:ggplot :g (if density? 485 | [:aes x (keyword "..density..") (if group-by 486 | aesthetics 487 | {})] 488 | [:aes x (if group-by 489 | aesthetics 490 | {})])] 491 | [(if frequency? 492 | :geom_freqpoly 493 | :geom_histogram) (merge {:binwidth bin-width 494 | :alpha alpha} 495 | (when-not group-by 496 | aesthetics))] 497 | (when show-mean? 498 | [:geom_vline [:aes {:xintercept [:mean x]}] {:linetype "dashed" 499 | :color (or group-by colour) 500 | :size 0.5}]) 501 | [:geom_hline {:yintercept 0 :size 0.4 :colour "black"}]])) 502 | 503 | (defplot line-chart x y {:show-points? :auto 504 | :fill? false 505 | :alpha 0.5 506 | :size nil 507 | :show-labels? false} 508 | [[:ggplot :g [:aes (merge {:x x :y y} 509 | (when group-by 510 | {:group group-by 511 | :colour group-by 512 | :fill group-by}))]] 513 | [:geom_line (if group-by 514 | {} 515 | {:colour colour})] 516 | (when (or (true? show-points?) 517 | (and (= show-points? :auto) 518 | (not fill?) 519 | (< (count (*df* x)) 50))) 520 | (let [aesthetics (merge {:alpha (if size 521 | alpha 522 | 1)} 523 | (when-not group-by 524 | {:colour colour}))] 525 | (if size 526 | [:geom_point [:aes {:size size}] aesthetics] 527 | [:geom_point aesthetics]))) 528 | (when show-labels? 529 | [:geom_label_repel 530 | [:aes (-> {:label (format-value y (= y-scale :percent))} 531 | (assoc-when :fill (some->> group-by (vector :factor))))] 532 | {:size 2.5 533 | :color (if group-by "white" "black") 534 | :show.legend false}]) 535 | (when fill? 536 | [:geom_area (merge {:alpha alpha} 537 | (when-not group-by 538 | {:fill colour}))])]) 539 | 540 | (defplot bar-chart x y {:stacked? false 541 | :flip? false 542 | :sort-by nil 543 | :show-values? 
false 544 | :x-rotate :auto} 545 | [[:ggplot :g [:aes (-> {:x (if (= :dates x-scale) 546 | x 547 | [:reorder x (or sort-by y)]) 548 | :y y} 549 | (assoc-when :fill group-by))]] 550 | [:geom_bar (merge {:stat "identity"} 551 | (if group-by 552 | (when-not stacked? 553 | {:position "dodge"}) 554 | {:fill colour}))] 555 | (when show-values? 556 | [:geom_text [:aes {:label (format-value y (= y-scale :percent)) 557 | :hjust (cond 558 | (and flip? stacked?) 559 | [:ifelse (r-template "%s >= 0" y) 1.3 -0.3] 560 | 561 | flip? 562 | [:ifelse (r-template "%s >= 0" y) -0.3 1.3] 563 | 564 | :else 0.5) 565 | :vjust (cond 566 | flip? 0.5 567 | stacked? [:ifelse (r-template "%s >= 0" y) 568 | 1.8 -0.5] 569 | :else [:ifelse (r-template "%s >= 0" y) 570 | -0.3 1.3])}] 571 | {:size 2 572 | :color (if stacked? "white" "black") 573 | :position (if (and (not stacked?) flip?) [:position_dodge 1] "stack")}]) 574 | (when (and stacked? show-values?) 575 | [:geom_text [:aes {:label (format-value :group__total (= y-scale :percent)) 576 | :y :group__total 577 | :hjust (if flip? 578 | [:ifelse (r-template "%s >= 0" y) -0.3 1.3] 579 | 0.5) 580 | :vjust (if flip? 581 | 0.5 582 | [:ifelse (r-template "%s >= 0" :group__total) 583 | -0.3 1.3])}] 584 | {:size 2}]) 585 | (when flip? 586 | [:coord_flip])]) 587 | 588 | (defplot scatter-plot x y {:alpha 0.5 589 | :label nil 590 | :size nil} 591 | [[:ggplot :g [:aes (-> {:x x :y y} 592 | (assoc-when :colour group-by) 593 | )]] 594 | [:geom_point [:aes (if size 595 | {:size size} 596 | {})] 597 | (merge {:alpha alpha} 598 | (when-not group-by 599 | {:colour colour}))] 600 | (when label 601 | [:geom_label_repel 602 | [:aes (-> {:label label} 603 | (assoc-when :color (some->> group-by (vector :factor))))] 604 | {:size 2.5 605 | :show.legend false}])]) 606 | 607 | (defplot box-plot x y {:legend false} 608 | [[:ggplot :g [:aes {:x x :y y :fill x}]] 609 | [:geom_boxplot]]) 610 | 611 | (defplot violin-plot x y {:legend false 612 | :trim? 
true 613 | :scale :count 614 | :summary? true} 615 | [[:ggplot :g [:aes {:x x :y y :fill x}]] 616 | [:geom_violin {:alpha 0.5 617 | :colour (keyword "palette[4]") 618 | :trim (boolean trim?) 619 | :scale (name scale)}] 620 | (when summary? 621 | [:stat_summary {:fun.data "mean_se" 622 | :geom "pointrange"}])]) 623 | 624 | (defplot heatmap x y z {:extent nil 625 | :z-label nil 626 | :legend true 627 | :legend-title true} 628 | [[:ggplot :g [:aes x y {:fill z}]] 629 | [:geom_tile] 630 | [:scale_fill_distiller (or z-label (name z)) 631 | (-> {:palette "RdYlBu"} 632 | (assoc-when :limit (some->> extent ->col)))]]) 633 | -------------------------------------------------------------------------------- /src/huri/time.clj: -------------------------------------------------------------------------------- 1 | (ns huri.time 2 | (:require [clj-time.core :as t] 3 | [clojure.math.numeric-tower :refer [ceil]])) 4 | 5 | (defn quarter-of-year 6 | [dt] 7 | (ceil (/ (t/month dt) 3))) 8 | 9 | (defn quarter 10 | [dt] 11 | (t/date-time (t/year dt) (inc (* (dec (quarter-of-year dt)) 3)))) 12 | 13 | (defn date 14 | [dt] 15 | (t/floor dt t/day)) 16 | 17 | (defn year-month 18 | [dt] 19 | (t/floor dt t/month)) 20 | 21 | (defn week-of-year 22 | [dt] 23 | (.getWeekOfWeekyear dt)) 24 | 25 | (defn week 26 | [dt] 27 | (t/minus (date dt) (t/days (dec (t/day-of-week dt))))) 28 | 29 | (defn day-of-year 30 | [dt] 31 | (inc (t/in-days (t/interval (t/floor dt t/year) dt)))) 32 | 33 | (defn after? 34 | [this & that] 35 | (t/after? this (if (instance? org.joda.time.DateTime (first that)) 36 | (first that) 37 | (apply t/date-time that)))) 38 | 39 | (defn before? 40 | [this & that] 41 | (t/before? this (if (instance? org.joda.time.DateTime (first that)) 42 | (first that) 43 | (apply t/date-time that)))) 44 | 45 | (defn before-now? 46 | [dt] 47 | (t/before? dt (t/now))) 48 | 49 | (defn after-now? 50 | [dt] 51 | (t/after? dt (t/now))) 52 | 53 | (def not-before? (complement before?)) 54 | (def not-after? 
(complement after?)) 55 | 56 | (defn between? 57 | [this start end] 58 | (t/within? (t/interval start end) this)) 59 | 60 | (defn in? 61 | ([dt y] 62 | (= (t/year dt) y)) 63 | ([dt y m] 64 | (= (year-month dt) (t/date-time y m)))) 65 | 66 | (defn since? 67 | [this p] 68 | (not-before? this (t/minus (if (#{org.joda.time.Years org.joda.time.Months} 69 | (class p)) 70 | (year-month (t/now)) 71 | (date (t/now))) 72 | p))) 73 | -------------------------------------------------------------------------------- /test/huri/core_test.clj: -------------------------------------------------------------------------------- 1 | (ns huri.core-test 2 | (:require [clojure.test :refer :all] 3 | [huri.core :refer :all])) 4 | 5 | (deftest a-test 6 | (testing "safe-divide returns a double, or nil instead of dividing by zero" 7 | (is (= 2.0 (safe-divide 4 2))) 8 | (is (= 0.25 (safe-divide 4))) 9 | (is (nil? (safe-divide 1 0))) 10 | (is (nil? (safe-divide 0))))) 11 | --------------------------------------------------------------------------------