├── test └── clojask │ ├── correct_outputs │ ├── 1-10.csv │ ├── 1-11.csv │ ├── 1-3.csv │ ├── 1-9.csv │ ├── 1-2.csv │ ├── 1-1.csv │ ├── 1-6.csv │ ├── 1-5.csv │ ├── 1-7.csv │ ├── 1-8.csv │ └── 1-4.csv │ ├── Employees-info-example.csv │ ├── melt.csv │ ├── Employees-example.csv │ ├── dcast.csv │ ├── inmemory_test.clj │ └── core_test.clj ├── docs ├── diagram.jpg ├── diagram.png ├── clojask_functions.png ├── img │ ├── image-20220405210757274.png │ ├── image-20220405210826777.png │ └── image-20220405211348723.png ├── intro.md ├── clojask types.md ├── aggregation functions.md └── clojask.extensions.md ├── examples ├── readme.md └── timezone dataframe │ ├── sales.csv │ └── timezone.clj ├── .gitignore ├── src └── main │ ├── java │ ├── TypeException.java │ ├── ExecutionException.java │ └── OperationException.java │ └── clojure │ └── clojask │ ├── terminal.clj │ ├── classes │ ├── DataStat.clj │ ├── RowInfo.clj │ ├── MGroup.clj │ ├── ColInfo.clj │ └── JoinedDataFrame.clj │ ├── api │ ├── aggregate.clj │ └── gb_aggregate.clj │ ├── extensions │ ├── reshape.clj │ └── bind.clj │ ├── clojask_input.clj │ ├── join │ ├── outer_output.clj │ ├── outer_input.clj │ └── outer_onyx_comps.clj │ ├── sort.clj │ ├── aggregate │ ├── aggre_output.clj │ ├── aggre_input.clj │ └── aggre_onyx_comps.clj │ ├── clojask_groupby.clj │ ├── debug.clj │ ├── clojask_output.clj │ ├── clojask_join.clj │ ├── clojask_aggre.clj │ ├── preview.clj │ ├── groupby.clj │ ├── utils.clj │ └── join.clj ├── LICENSE ├── project.clj ├── benchmark ├── dask-benchmark.ipynb ├── .ipynb_checkpoints │ └── dask-benchmark-checkpoint.ipynb └── clojure-benchmark.clj └── README.md /test/clojask/correct_outputs/1-10.csv: -------------------------------------------------------------------------------- 1 | new-Salary 2 | 50000.0 3 | -------------------------------------------------------------------------------- /test/clojask/correct_outputs/1-11.csv: -------------------------------------------------------------------------------- 1 | 
Department 2 | 12 3 | 21 4 | 13 5 | 11 6 | -------------------------------------------------------------------------------- /docs/diagram.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clojure-finance/clojask/HEAD/docs/diagram.jpg -------------------------------------------------------------------------------- /docs/diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clojure-finance/clojask/HEAD/docs/diagram.png -------------------------------------------------------------------------------- /docs/clojask_functions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clojure-finance/clojask/HEAD/docs/clojask_functions.png -------------------------------------------------------------------------------- /test/clojask/correct_outputs/1-3.csv: -------------------------------------------------------------------------------- 1 | Department,new-Salary 2 | 13,800.0 3 | 11,50000.0 4 | 12,1000.0 5 | 21,700.0 6 | -------------------------------------------------------------------------------- /docs/img/image-20220405210757274.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clojure-finance/clojask/HEAD/docs/img/image-20220405210757274.png -------------------------------------------------------------------------------- /docs/img/image-20220405210826777.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/clojure-finance/clojask/HEAD/docs/img/image-20220405210826777.png -------------------------------------------------------------------------------- /docs/img/image-20220405211348723.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/clojure-finance/clojask/HEAD/docs/img/image-20220405211348723.png -------------------------------------------------------------------------------- /examples/readme.md: -------------------------------------------------------------------------------- 1 | This folder will be moved out as an independent repository in the future when Clojask has been deployed to Clojars. -------------------------------------------------------------------------------- /docs/intro.md: -------------------------------------------------------------------------------- 1 | # Introduction to techml_onyx 2 | 3 | TODO: write [great documentation](http://jacobian.org/writing/what-to-write/) 4 | -------------------------------------------------------------------------------- /test/clojask/correct_outputs/1-9.csv: -------------------------------------------------------------------------------- 1 | Employee,EmployeeName 2 | 1,Alice 3 | 2,Bob 4 | 3,Carla 5 | 4,Daniel 6 | 5,Evelyn 7 | 6,Ferdinand 8 | 7,Amy 9 | -------------------------------------------------------------------------------- /examples/timezone dataframe/sales.csv: -------------------------------------------------------------------------------- 1 | date,cust,item,sold 2 | 2010-01-19 UTC,101,2,11 3 | 2010-01-22 HKG,102,1,7 4 | 2010-01-24 UK,102,2,9 5 | 2010-01-25 DUB,101,2,9 6 | 2010-01-26 LA,101,1,10 -------------------------------------------------------------------------------- /test/clojask/Employees-info-example.csv: -------------------------------------------------------------------------------- 1 | Employee,EmployeeName,DayOff,UpdateDate 2 | 1,Alice,20,2020/12/10 3 | 2,Bob,15,2020/12/05 4 | 3,Carla,5,2020/12/03 5 | 7,Angel,30,2020/12/11 6 | 8,Jack,4,2019/03/21 -------------------------------------------------------------------------------- /test/clojask/melt.csv: -------------------------------------------------------------------------------- 1 | family_id,age_mother,dob_child1,dob_child2,dob_child3 
2 | 1,30,1998-11-26,2000-01-29, 3 | 2,27,1996-06-22,, 4 | 3,26,2002-07-11,2004-04-05,2007-09-02 5 | 4,32,2004-10-10,2009-08-27,2012-07-21 6 | 5,29,2000-12-05,2005-02-28, -------------------------------------------------------------------------------- /test/clojask/correct_outputs/1-2.csv: -------------------------------------------------------------------------------- 1 | Employee,EmployeeName,Department,Salary,UpdateDate,new-col 2 | 1,Alice,11,300.0,2020/12/12,1300.0 3 | 2,Bob,11,600.0,2020/12/01,2600.0 4 | 5,Evelyn,13,800.0,2020/12/03,5800.0 5 | 6,Ferdinand,21,700.0,2020/12/05,6700.0 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | onyx.log* 3 | *.log 4 | resources/ 5 | outputs/ 6 | test/clojask/test_outputs/* 7 | .lein-failures 8 | .lein-repl-history 9 | .nrepl-port 10 | target/ 11 | .clojask/ 12 | .lsp 13 | .calva 14 | .clj-kondo 15 | api_design.clj 16 | # sqlite.db 17 | # _*.csv 18 | .vscode 19 | *.csv -------------------------------------------------------------------------------- /test/clojask/Employees-example.csv: -------------------------------------------------------------------------------- 1 | Employee,EmployeeName,Department,Salary,UpdateDate 2 | 1,Alice,11,300,2020/12/12 3 | 2,Bob,11,600,2020/12/01 4 | 3,Carla,12,900,2020/12/03 5 | 4,Daniel,12,1000,2020/12/05 6 | 5,Evelyn,13,800,2020/12/03 7 | 6,Ferdinand,21,700,2020/12/05 8 | 7,Amy,11,50000,2020/11/26 9 | -------------------------------------------------------------------------------- /test/clojask/correct_outputs/1-1.csv: -------------------------------------------------------------------------------- 1 | Employee,EmployeeName,Department,Salary,UpdateDate 2 | 1,Alice,11,-300.0!,2020/12/12 3 | 2,Bob,11,-600.0!,2020/12/01 4 | 3,Carla,12,-900.0!,2020/12/03 5 | 4,Daniel,12,-1000.0!,2020/12/05 6 | 5,Evelyn,13,-800.0!,2020/12/03 7 | 
6,Ferdinand,21,-700.0!,2020/12/05 8 | 7,Amy,11,-50000.0!,2020/11/26 9 | -------------------------------------------------------------------------------- /test/clojask/correct_outputs/1-6.csv: -------------------------------------------------------------------------------- 1 | 2_Employee,2_EmployeeName,2_DayOff,2_UpdateDate,1_Employee,1_EmployeeName,1_Department,1_Salary,1_UpdateDate 2 | 2,Bob,15,2020/12/05,2,Bob,11,600,2020/12/01 3 | 3,Carla,5,2020/12/03,3,Carla,12,900,2020/12/03 4 | 1,Alice,20,2020/12/10,1,Alice,11,300,2020/12/12 5 | 7,Angel,30,2020/12/11,7,Amy,11,50000,2020/11/26 6 | -------------------------------------------------------------------------------- /test/clojask/correct_outputs/1-5.csv: -------------------------------------------------------------------------------- 1 | 2_Employee,2_EmployeeName,2_DayOff,2_UpdateDate,1_Employee,1_EmployeeName,1_Department,1_Salary,1_UpdateDate 2 | 7,Angel,30,2020/12/11,7,Amy,11,50000,2020/11/26 3 | 8,Jack,4,2019/03/21,,,,, 4 | 1,Alice,20,2020/12/10,1,Alice,11,300,2020/12/12 5 | 3,Carla,5,2020/12/03,3,Carla,12,900,2020/12/03 6 | 2,Bob,15,2020/12/05,2,Bob,11,600,2020/12/01 7 | -------------------------------------------------------------------------------- /test/clojask/correct_outputs/1-7.csv: -------------------------------------------------------------------------------- 1 | 1_Employee,1_EmployeeName,1_Department,1_Salary,1_UpdateDate,2_Employee,2_EmployeeName,2_DayOff,2_UpdateDate 2 | 7,Amy,11,50000,2020/11/26,,,, 3 | 3,Carla,12,900,2020/12/03,3,Carla,5,2020/12/03 4 | 4,Daniel,12,1000,2020/12/05,,,, 5 | 5,Evelyn,13,800,2020/12/03,,,, 6 | 1,Alice,11,300,2020/12/12,1,Alice,20,2020/12/10 7 | 2,Bob,11,600,2020/12/01,,,, 8 | 6,Ferdinand,21,700,2020/12/05,,,, 9 | -------------------------------------------------------------------------------- /test/clojask/correct_outputs/1-8.csv: -------------------------------------------------------------------------------- 1 | 
1_Employee,1_EmployeeName,1_Department,1_Salary,1_UpdateDate,2_Employee,2_EmployeeName,2_DayOff,2_UpdateDate 2 | 5,Evelyn,13,800,2020/12/03,,,, 3 | 1,Alice,11,300,2020/12/12,,,, 4 | 2,Bob,11,600,2020/12/01,2,Bob,15,2020/12/05 5 | 6,Ferdinand,21,700,2020/12/05,,,, 6 | 7,Amy,11,50000,2020/11/26,,,, 7 | 3,Carla,12,900,2020/12/03,3,Carla,5,2020/12/03 8 | 4,Daniel,12,1000,2020/12/05,,,, 9 | -------------------------------------------------------------------------------- /test/clojask/correct_outputs/1-4.csv: -------------------------------------------------------------------------------- 1 | 1_Employee,1_EmployeeName,1_Department,1_Salary,1_UpdateDate,2_Employee,2_EmployeeName,2_DayOff,2_UpdateDate 2 | 5,Evelyn,13,800,2020/12/03,,,, 3 | 4,Daniel,12,1000,2020/12/05,,,, 4 | 1,Alice,11,300,2020/12/12,1,Alice,20,2020/12/10 5 | 3,Carla,12,900,2020/12/03,3,Carla,5,2020/12/03 6 | 6,Ferdinand,21,700,2020/12/05,,,, 7 | 2,Bob,11,600,2020/12/01,2,Bob,15,2020/12/05 8 | 7,Amy,11,50000,2020/11/26,7,Angel,30,2020/12/11 9 | -------------------------------------------------------------------------------- /test/clojask/dcast.csv: -------------------------------------------------------------------------------- 1 | family_id,age_mother,measure,value 2 | 1,30,dob_child1,1998-11-26 3 | 1,30,dob_child2,2000-01-29 4 | 1,30,dob_child3, 5 | 2,27,dob_child1,1996-06-22 6 | 2,27,dob_child2, 7 | 2,27,dob_child3, 8 | 3,26,dob_child1,2002-07-11 9 | 3,26,dob_child2,2004-04-05 10 | 3,26,dob_child3,2007-09-02 11 | 4,32,dob_child1,2004-10-10 12 | 4,32,dob_child2,2009-08-27 13 | 4,32,dob_child3,2012-07-21 14 | 5,29,dob_child1,2000-12-05 15 | 5,29,dob_child2,2005-02-28 16 | 5,29,dob_child3, 17 | -------------------------------------------------------------------------------- /examples/timezone dataframe/timezone.clj: -------------------------------------------------------------------------------- 1 | (ns examples.timezone 2 | (:require [clojask.dataframe :as clojask])) 3 | 4 | (defn timezone-parser 5 | "the 
(defn main
  "Loads `sales.csv` through `clojask/dataframe` and binds the result to the
  namespace-level var `df`. Takes no arguments."
  []
  ;; The original form was `(def main [] ...)`, which does not compile:
  ;; `def` accepts at most a symbol, an optional docstring and one init form.
  ;; NOTE(review): `def` inside a function body (re)defines a namespace-level
  ;; var on every call; kept so `df` stays reachable exactly as the stub intended.
  (def df (clojask/dataframe "sales.csv")))
/**
 * Checked exception whose message is always prefixed with
 * "Failed in running operation: ".
 */
public class OperationException extends Exception {

    /** Text placed in front of every message carried by this exception. */
    private static final String PREFIX = "Failed in running operation: ";

    /**
     * @param s detail text appended to the fixed prefix
     */
    public OperationException(String s) {
        super(PREFIX + s);
    }

    /**
     * @param s   detail text appended to the fixed prefix
     * @param err underlying cause, preserved as this exception's cause
     */
    public OperationException(String s, Throwable err) {
        super(PREFIX + s, err);
    }
}
(deftype DataStat
  ;; Mutable holder for two statistics of a data source: its size and its
  ;; number of rows. Both fields are unsynchronized-mutable.
  [^:unsynchronized-mutable file-size
   ^:unsynchronized-mutable num-rows]

  DataIntf

  ;; Fill in the statistics from either `file` or `source`.
  ;; - `file` truthy: it is invoked with no arguments and the :size entry of
  ;;   its result becomes the size.
  ;; - `source` is a function: the size is unknown (nil).
  ;; - otherwise `source` is handed to `io/file` and its length is used;
  ;;   num-rows is left untouched in this branch.
  (init
    [this source file]
    (cond
      file
      (do (set! file-size (:size (file)))
          (set! num-rows nil))

      (fn? source)
      (do (set! file-size nil)
          (set! num-rows nil))

      :else
      (set! file-size (.length (io/file source)))))

  ;; Fill in the statistics from `io-func`, a zero-argument function whose
  ;; result carries the size under :size.
  (initWithIO
    [this io-func]
    (set! file-size (:size (io-func)))
    (set! num-rows nil))

  ;; Return the recorded size (nil when it is unknown).
  (getSize
    [this]
    file-size))
## Clojask Types

### Supported Types

string

int

double

date

datetime

### string

The default type for all columns

Class: `java.lang.String`

#### Examples

```clojure
(set-type dataframe "col-name" "string")
```

### int

Most efficiently stores an integer

Class: `java.lang.Integer`

#### Examples

```clojure
(set-type dataframe "col-name" "int")
```

### double

Accepts floats and integers

Class: `java.lang.Double`

#### Examples

```clojure
(set-type dataframe "col-name" "double")
```

### date

Transform a date string (no time field)

Class: `java.time.LocalDate` (default format string: `yyyy-MM-dd`)

#### Examples

```clojure
;; if the date looks like this 2020/11/12
(set-type dataframe "col-name" "date:yyyy/MM/dd")
```

### datetime

Transform a datetime string (date with a time field)

Class: `java.time.LocalDateTime` (default format string: `yyyy-MM-dd HH:mm:ss`)

#### Examples

```clojure
;; if the datetime looks like this 2020/11/12 12:12:36
(set-type dataframe "col-name" "datetime:yyyy/MM/dd HH:mm:ss")
```
;; Sentinel for "no previous result yet". It is deliberately declared without
;; a value; every function below compares its accumulator against it to
;; detect the first step of an aggregation.
(def start)

;; (defn aggre-func
;;   "prev value could be start"
;;   [prev new])

;; single row aggregation functions

(defn max
  "Return the larger of `a` and `b` according to `compare`.
  When `a` is `start`, `b` is returned."
  [a b]
  (if (or (= a start) (> (compare b a) 0))
    b
    a))

(defn min
  "Return the smaller of `a` and `b` according to `compare`.
  When `a` is `start`, `b` is returned."
  [a b]
  (if (or (= a start) (< (compare b a) 0))
    b
    a))

(defn sum
  "Return `a` + `b`. When `a` is `start`, `b` is returned."
  [a b]
  (if (= a start)
    b
    (+ a b)))

(defn count
  "Return the running count: 1 when `a` is `start`, otherwise `a` + 1.
  `b` is ignored."
  [a b]
  (if (= a start)
    1
    (inc a)))

;; multi-row aggregation functions

(defn smallest3
  "Return the smallest three entries."
  [a b]
  (cond
    (= start a) [b]
    :else (take 3 (sort (conj a b)))))

(defn smallestk
  "Return the smallest k entries (the performance is better with smaller k)."
  [a b k]
  (cond
    (= start a) [b]
    :else (take k (sort (conj a b)))))

;; In the original, the docstrings of the two functions below were placed
;; after the argument vector, where they are plain expressions that get
;; evaluated and thrown away instead of being attached to the var.

(defn largest3
  "Return the largest three entries."
  [a b]
  (cond
    (= start a) [b]
    :else (take 3 (sort (fn [x y] (compare y x)) (conj a b)))))

(defn largestk
  "Return the largest k entries (the performance is better with smaller k)."
  [a b k]
  (cond
    (= start a) [b]
    :else (take k (sort (fn [x y] (compare y x)) (conj a b)))))
| [org.onyxplatform/onyx "0.14.6"] 11 | [com.taoensso/timbre "5.2.1"] 12 | ;; [techascent/tech.ml.dataset "5.17" :exclusions [[ch.qos.logback/logback-classic][org.slf4j/slf4j-api]]] 13 | [com.google.code.externalsortinginjava/externalsortinginjava "0.6.0"] 14 | [com.github.clojure-finance/clojask-io "1.0.6"] 15 | [com.github.clojure-finance/clojure-heap "1.0.3"]] 16 | :repl-options {:init-ns clojask.debug 17 | :timeout 180000} 18 | :plugins [[lein-update-dependency "0.1.2"]] 19 | :main ^:skip-aot clojask.debug/-main 20 | :source-paths ["src/main/clojure"] 21 | :java-source-paths ["src/main/java"] 22 | :javac-options ["-target" "1.8" "-source" "1.8" "-Xlint:-options"] 23 | :jvm-opts ["-XX:+UseG1GC" "-server"] 24 | :test-paths ["test/clojask"] 25 | ;:java-test-paths ["test/java"] 26 | ;;:injections [(.. System (setProperty "clojure.core.async.pool-size" "8"))] 27 | ) 28 | -------------------------------------------------------------------------------- /src/main/clojure/clojask/extensions/reshape.clj: -------------------------------------------------------------------------------- 1 | (ns clojask.extensions.reshape 2 | "Contains functions that extends the power of clojask, while not directly applying to the dataframe class" 3 | (:require [clojure.data.csv :as csv] 4 | [clojure.java.io :as io] 5 | [clojure.string :as str] 6 | [clojask.dataframe :as ck])) 7 | 8 | (defn melt 9 | "Reshape the clojask dataframe from wide to long." 
(defn- dcast-1
  "Render one group as a comma-separated line.
  `seq` is a collection of pairs whose first element is a key and whose
  second element is the value; `order` lists the keys in output order.
  Keys with no (or a falsey) value produce an empty field."
  [seq order]
  (let [lookup (into {}
                     (map (fn [pair] [(first pair) (second pair)]))
                     seq)]
    (->> order
         (mapv (fn [k]
                 (if-let [v (get lookup k)]
                   (str v)
                   "")))
         (str/join ","))))
(defn mode
  "Return a sequence of the most frequent value(s) in `list`.
  Several values are returned when they tie for the highest frequency."
  [list]
  (->> list
       frequencies                    ;; value -> number of occurrences
       (clojure.core/group-by val)    ;; occurrences -> [[value occurrences] ...]
       sort                           ;; ascending by occurrences
       last                           ;; entry with the highest occurrences
       val
       (map key)))
aggregate on the whole dataframe, or on the group-by dataframe. We call the first case "simple aggregation" and the second "group-by aggregation". Some given functions for simple aggregation are defined in namespace `clojask.api.aggregate`, and the given functions for group-by aggregation are defined in namespace `clojask.api.gb-aggregate`.

Below is the full list of given functions for the two types.

#### `clojask.api.aggregate`:

`max`: Find the max value (use `clojure.core/compare` as the comparator)

`min`: Find the min value (use `clojure.core/compare` as the comparator)

#### `clojask.api.gb-aggregate`:

`max`: Find the max value (use `clojure.core/compare` as the comparator)

`min`: Find the min value (use `clojure.core/compare` as the comparator)

Besides these given functions, you are also welcome to define your own.

#### How to define group-by aggregation functions?

This is the template:

```clojure
(defn gb-aggre-template
  [col] ;; take only one argument which is the aggregation column in the format of vector
  ;; ... your implementation
  result ;; return one variable (could be int / double / string / collection of above)
  )
```

Basically, the function should take one argument only, which is the full aggregation column. ***Here we simply assume this column should be smaller than memory!***

You may find that many built-in functions in Clojure also fulfil this requirement, for example, `count`, `mean`, and countless functions constructed from [`reduce`](https://clojuredocs.org/clojure.core/reduce).

#### How to define simple aggregation functions?

This is the template:

```clojure
(defn aggre-template
  ;; [new-value old-result]
  [old-result new-value]
  ;; old-result: the result returned by the previous call of aggre-template
  ;; new-value: the value for the column on the current row
  ;; ... your implementation
  new-result ;; return the new result, and this will be passed as old-result to the next call of aggre-template
  )
```

**Notes:**

1. The old-result for the first `aggre-template` is `clojask.api.aggregate/start`. So your function must be able to deal with cases when the first argument is `clojask.api.aggregate/start`.
2. Your function should be self-sustainable, meaning that the result of `aggre-template` should be safe as the input for `aggre-template`.
3. To better understand this template, you may refer to the documentation of [`reduce`](https://clojuredocs.org/clojure.core/reduce); the `aggre-func` should be usable in `reduce`.
"df = dd.read_csv(filename, dtype={'exchg': 'float64', 'sic': 'float64'})\n", 27 | "#other = dd.read_csv(crsp_filename)\n", 28 | "\n", 29 | "#ddf = dd.from_pandas(df, npartitions=10)\n", 30 | "\n", 31 | "# =================== Change this part to test time taken ====================== #\n", 32 | "\n", 33 | "# element-wise operations\n", 34 | "df['new_col'] = df['datacqtr'] + 20 # Compustat\n", 35 | "#df['new_col'] = df['PRC'] + 20 # CRSP\n", 36 | "\n", 37 | "# row-wise\n", 38 | "#df = df[df.datacqtr > 1000.0]\n", 39 | "\n", 40 | "# aggregation\n", 41 | "#df = df.datacqtr.max()\n", 42 | "\n", 43 | "# groupby aggregate\n", 44 | "#df = df.groupby(df.conm).datafqtr.max()\n", 45 | "\n", 46 | "# left join\n", 47 | "#df = df.join(other, how='left')\n", 48 | "\n", 49 | "# left join\n", 50 | "#df = df.join(other, how='right')\n", 51 | "\n", 52 | "# inner join\n", 53 | "#df = df.join(other, how='inner')\n", 54 | "\n", 55 | "# ========================================================================= #\n", 56 | "\n", 57 | "#df.to_csv('./output/data-Compustat-output-*.csv') # output as separate csv files\n", 58 | "df.to_csv('dask_output.csv', single_file=True) # output as a single file" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [] 67 | } 68 | ], 69 | "metadata": { 70 | "kernelspec": { 71 | "display_name": "Python 3 (ipykernel)", 72 | "language": "python", 73 | "name": "python3" 74 | }, 75 | "language_info": { 76 | "codemirror_mode": { 77 | "name": "ipython", 78 | "version": 3 79 | }, 80 | "file_extension": ".py", 81 | "mimetype": "text/x-python", 82 | "name": "python", 83 | "nbconvert_exporter": "python", 84 | "pygments_lexer": "ipython3", 85 | "version": "3.9.6" 86 | } 87 | }, 88 | "nbformat": 4, 89 | "nbformat_minor": 4 90 | } 91 | -------------------------------------------------------------------------------- /benchmark/.ipynb_checkpoints/dask-benchmark-checkpoint.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%%time\n", 10 | "\n", 11 | "from dask.distributed import Client\n", 12 | "\n", 13 | "client = Client(n_workers=4)\n", 14 | "\n", 15 | "import dask.dataframe as dd\n", 16 | "import pandas as pd\n", 17 | "import os\n", 18 | "import dask\n", 19 | "\n", 20 | "filename = os.path.join('../clojure-datasets/data-Compustat-lohi.csv') # 1.8 M dataset\n", 21 | "#filename = os.path.join('../clojure-datasets/data-Compustat-x2.csv') # 3.6 M dataset\n", 22 | "#filename = os.path.join('../clojure-datasets/data-CRSP.csv') # 80 M dataset\n", 23 | "\n", 24 | "#crsp_filename = os.path.join('../clojure-datasets/CRSP-extract.csv') # 80 M dataset\n", 25 | "\n", 26 | "df = dd.read_csv(filename, dtype={'exchg': 'float64', 'sic': 'float64'})\n", 27 | "#other = dd.read_csv(crsp_filename)\n", 28 | "\n", 29 | "#ddf = dd.from_pandas(df, npartitions=10)\n", 30 | "\n", 31 | "# =================== Change this part to test speed ====================== #\n", 32 | "\n", 33 | "# element-wise operations\n", 34 | "df['new_col'] = df['datacqtr'] + 20 # Compustat\n", 35 | "#df['new_col'] = df['PRC'] + 20 # CRSP\n", 36 | "\n", 37 | "# row-wise\n", 38 | "#df = df[df.datacqtr > 1000.0]\n", 39 | "\n", 40 | "# aggregation\n", 41 | "#df = df.datacqtr.max()\n", 42 | "\n", 43 | "# groupby aggregate\n", 44 | "#df = df.groupby(df.conm).datafqtr.max()\n", 45 | "\n", 46 | "# left join\n", 47 | "#df = df.join(other, how='left')\n", 48 | "\n", 49 | "# left join\n", 50 | "#df = df.join(other, how='right')\n", 51 | "\n", 52 | "# inner join\n", 53 | "#df = df.join(other, how='inner')\n", 54 | "\n", 55 | "# ========================================================================= #\n", 56 | "\n", 57 | "#df.to_csv('./output/data-Compustat-output-*.csv') # output as separate csv files\n", 58 | 
(ns clojask.clojask-input
  (:require [clojure.core.async :refer [poll! timeout chan close!]]
            [clojure.set :refer [join]]
            [onyx.plugin.protocols :as p]
            [clojure.data.csv :as csv]
            [clojask.utils :refer [filter-check]]
            [taoensso.timbre :refer [fatal info debug] :as timbre])
  (:import (java.io BufferedReader)))

;; Input plugin record implementing the `onyx.plugin.protocols` Plugin,
;; Checkpointed, BarrierSynchronization and Input protocols.
;;
;;   reader       zero-argument function; calling it yields the row sequence
;;   batch-size   number of rows grouped into one emitted segment
;;   rst          volatile holding the segments that have not been polled yet
;;   completed?   volatile flag, set to true once `rst` is exhausted
;;   checkpoint?  when falsey, `checkpoint` reports nil instead of the offset
;;   offset       volatile holding the offset recorded by `recover!`
;;
;; `filters`, `types` and `have-col` are stored on the record but are not read
;; by any method defined here.
(defrecord AbsSeqReader [event reader filters types have-col rst completed? checkpoint? offset batch-size]
  p/Plugin

  (start [this event]
    this)

  (stop [this event]
    this)

  p/Checkpointed
  (checkpoint [this]
    (when checkpoint? @offset))

  (recover! [this _ checkpoint]
    (vreset! completed? false)
    (let [;; rows are cut into batches of `batch-size`; the `[]` pad lets the
          ;; final, shorter batch through instead of dropping it
          batches  (partition batch-size batch-size [] (reader))
          ;; every batch becomes one segment {:id <running index> :d <rows>}
          segments (map-indexed (fn [id batch] {:id id :d batch}) batches)
          resume?  (not (nil? checkpoint))]
      (when resume?
        (info "clojask.clojask-input is recovering state by dropping" checkpoint "elements."))
      (vreset! rst (if resume? (drop checkpoint segments) segments))
      (vreset! offset (if resume? checkpoint 0))))

  (checkpointed! [this epoch])

  p/BarrierSynchronization
  (synced? [this epoch]
    true)

  (completed? [this]
    @completed?)

  p/Input
  (poll! [this _ _]
    ;; Hand out the next pending segment, or flag completion and return nil.
    ;; NOTE(review): `offset` is not advanced here, so `checkpoint` keeps
    ;; reporting the value stored by `recover!` -- confirm this is intended.
    (let [seg (first @rst)]
      (if seg
        (do
          (vswap! rst rest)
          seg)
        (do
          (vreset! completed? true)
          nil)))))

(defn inject-dataframe
  "Binds the namespace-level var `df` to `dataframe` so that `input` can read
  it. Must be called before `input`."
  [dataframe]
  (def df dataframe))

(defn input
  "Builds an `AbsSeqReader` for the Onyx `event`, taking the reader function,
  filters, column types, `:have-col` and `:batch-size` from the dataframe
  previously stored by `inject-dataframe`."
  [{:keys [onyx.core/task-map] :as event}]
  (let [;; checkpointing stays on unless the task map says exactly `false`
        checkpoint? (-> task-map :seq/checkpoint? false? not)]
    ;; positional arguments follow the field order of AbsSeqReader
    (->AbsSeqReader event
                    (.getFunc df)
                    (.getFilters (:row-info df))
                    (.getType (:col-info df))
                    (:have-col df)
                    (volatile! nil)
                    (volatile! false)
                    checkpoint?
                    (volatile! nil)
                    (:batch-size df))))

;; No lifecycle calls are registered for the reader.
(def reader-calls
  {})

(defn inject-lifecycle-seq
  "Lifecycle hook: exposes the lifecycle's `:seq/sequential` value under
  `:seq/seq`."
  [_ lifecycle]
  {:seq/seq (:seq/sequential lifecycle)})

(def inject-seq-via-lifecycle
  {:lifecycle/before-task-start inject-lifecycle-seq})
(ns benchmark.core
  (:require [clojask.dataframe :refer :all]
            [clojure.core.async :as async]))

(defn main
  "Benchmark driver. Loads one dataset into the var `y`, then times a single
  `compute` call that writes to resources/test.csv. To benchmark a particular
  operation, uncomment exactly one of the scenarios below before running."
  []
  ;; Pick ONE dataset.
  (let [dataset "../clojure-datasets/data-Compustat-lohi.csv"] ; 1.8 M dataset
    (def y (dataframe dataset)))
  ;(def y (dataframe "../clojure-datasets/data-Compustat-x2.csv")) ; 3.6 M dataset
  ;(def y (dataframe "../clojure-datasets/data-CRSP.csv")) ; 80 M dataset

  ; =================== Change this part to test time taken ====================== ;

  ;; NOTE(review): the aggregate scenarios refer to the aliases `aggre/` and
  ;; `gb-aggre/`, which the ns form above does not require -- add the matching
  ;; requires before uncommenting them.

  ;; Compustat

  ; element-wise
  ; (set-type y "prccq" "double")
  ; (operate y (fn [val] (if val (+ val 10.0) 0.0)) "prccq")
  ; (time (compute y 4 "resources/test.csv" :select ["datadate" "tic" "prccq"] :exception false))

  ; row-wise
  ; (operate y str ["datadate" "tic"] "new-col")
  ; (time (compute y 4 "resources/test.csv" :select ["datadate" "tic" "prccq" "new-col"] :exception false))

  ; groupby-aggregate
  ; (set-type y "prccq" "double")
  ; (group-by y "tic")
  ; (aggregate y gb-aggre/max ["prccq"] ["prccq-max"])
  ; (time (compute y 4 "resources/test.csv" :select ["tic" "prccq-max"] :exception false))

  ; aggregate -> error?
  ; (set-type y "prccq" "double")
  ; (aggregate y aggre/max ["prccq"] ["prccq-max"])
  ; (time (compute y 4 "resources/test.csv" :select ["datadate" "tic" "prccq" "prccq-max"] :exception false))

  ;; CRSP

  ; element-wise
  ; NOTE(review): the type is set on "PRC" but `operate` targets "prccq" -- probably meant "PRC"
  ; (set-type y "PRC" "double")
  ; (operate y (fn [val] (if val (+ val 10.0) 0.0)) "prccq")
  ; (time (compute y 4 "resources/test.csv" :select ["date" "TICKER" "PRC"] :exception false))

  ; row-wise
  ; (operate y str ["PERMCO" "PERMNO"] "new-col")
  ; (time (compute y 4 "resources/test.csv" :select ["date" "TICKER" "PRC" "new-col"] :exception false))

  ; groupby-aggregate
  ; NOTE(review): groups by "tic" but selects "TICKER" -- probably meant "TICKER"
  ; (set-type y "PRC" "double")
  ; (group-by y "tic")
  ; (aggregate y gb-aggre/max ["PRC"] ["PRC-max"])
  ; (time (compute y 4 "resources/test.csv" :select ["TICKER" "PRC-max"] :exception false))

  ;; join APIs

  ;(def x (dataframe "../clojure-datasets/data-CRSP.csv"))
  ;(def x (dataframe "resources/CRSP-extract.csv"))
  ;(def y (dataframe "../clojure-datasets/data-Compustat-lohi.csv"))

  ; NOTE(review): every other `compute` call in this file passes
  ; `4 "resources/test.csv"` BEFORE `:select`; the calls below put `:select`
  ; first -- confirm the argument order before uncommenting.

  ; (def output-df (left-join x y ["date" "TICKER"] ["datadate" "TICKER"] "date" "datadate"))
  ; (time (compute output-df :select ["1_date" "1_TICKER" "1_prccq" "2_datadate" "2_TICKER" "2_PRC"] 4 "resources/test.csv" :exception false))

  ; (def output-df (right-join x y ["date" "TICKER"] ["datadate" "TICKER"] "date" "datadate"))
  ; (time (compute output-df :select ["1_date" "1_TICKER" "1_prccq" "2_datadate" "2_TICKER" "2_PRC"] 4 "resources/test.csv" :exception false))

  ; (def output-df (inner-join x y ["date" "TICKER"] ["datadate" "TICKER"] "date" "datadate"))
  ; (time (compute output-df :select ["1_date" "1_TICKER" "1_prccq" "2_datadate" "2_TICKER" "2_PRC"] 4 "resources/test.csv" :exception false))

  ; (def output-df (rolling-join-forward x y ["TICKER"] ["tic"] "date" "datadate"))
  ; (time (compute output-df :select ["1_date" "1_TICKER" "1_prccq" "2_datadate" "2_TICKER" "2_PRC"] 4 "resources/test.csv" :exception false))

  ;; obtain results -- kept as the last form so `main` returns what `compute` returns
  (time (compute y 4 "resources/test.csv" :exception false)))
write-func func)) 14 | 15 | (defn- inject-into-eventmap 16 | [event lifecycle] 17 | (let [wtr (io/writer (:buffered-wtr/filename lifecycle) :append true)] 18 | {:clojask/wtr wtr})) 19 | 20 | (defn- close-writer [event lifecycle] 21 | (.close (:clojask/wtr event))) 22 | 23 | ;; Map of lifecycle calls that are required to use this plugin. 24 | ;; Users will generally always have to include these in their lifecycle calls 25 | ;; when submitting the job. 26 | (def writer-calls 27 | {:lifecycle/before-task-start inject-into-eventmap 28 | :lifecycle/after-task-stop close-writer}) 29 | 30 | (defrecord ClojaskOutput [write-func] 31 | p/Plugin 32 | (start [this event] 33 | ;; Initialize the plugin, generally by assoc'ing any initial state. 34 | this) 35 | 36 | (stop [this event] 37 | ;; Nothing is required here. However, most plugins have resources 38 | ;; (e.g. a connection) to clean up. 39 | ;; Mind that such cleanup is also achievable with lifecycles. 40 | this) 41 | 42 | p/Checkpointed 43 | ;; Nothing is required here. This is normally useful for checkpointing in 44 | ;; input plugins. 45 | (checkpoint [this]) 46 | 47 | ;; Nothing is required here. This is normally useful for checkpointing in 48 | ;; input plugins. 49 | (recover! [this replica-version checkpoint]) 50 | 51 | ;; Nothing is required here. This is normally useful for checkpointing in 52 | ;; input plugins. 53 | (checkpointed! [this epoch]) 54 | 55 | p/BarrierSynchronization 56 | (synced? [this epoch] 57 | ;; Nothing is required here. This is commonly used to check whether all 58 | ;; async writes have finished. 59 | true) 60 | 61 | (completed? [this] 62 | ;; Nothing is required here. This is commonly used to check whether all 63 | ;; async writes have finished (just like synced). 64 | true) 65 | 66 | p/Output 67 | (prepare-batch [this event replica messenger] 68 | ;; Nothing is required here. This is useful for some initial preparation, 69 | ;; before write-batch is called repeatedly. 
(ns clojask.sort
  (:require [clojure.java.io :as io]
            [clojure.data.csv :as csv]
            [clojask.groupby :as gb])
  (:import [com.google.code.externalsorting.csv CsvExternalSort]
           [com.google.code.externalsorting.csv CsvSortOptions CsvSortOptions$Builder]
           [java.io File]))

(defn template-compare?
  "Template for a row comparator.

  row1 - the first row
  row2 - the second row

  An implementation must return an int: negative when row1 sorts before row2,
  zero when they are equal, positive when row1 sorts after row2. This template
  has no body and returns nil."
  [row1 row2]
  )

(defn salary-compare?
  "Orders two rows by the integer parsed from their :Salary string.
  Returns -1, 0 or 1.

  `compare` is used rather than subtraction: a difference of two ints can
  exceed the int range, and the sign is lost when a comparator result is
  narrowed to int."
  [row1 row2]
  (compare (Integer/parseInt (get row1 :Salary))
           (Integer/parseInt (get row2 :Salary))))

(defn prc-compare?
  "Orders two rows by the double parsed from their :PRC string; rows whose
  :PRC is the empty string sort first. Returns a negative int, 0, or a
  positive int.

  A raw double difference is not returned because comparator results are
  narrowed to int, so 1.5 vs 1.0 (difference 0.5) would compare as equal.
  Two rows that both have an empty :PRC compare as equal (0), which keeps the
  ordering consistent in both argument orders."
  [row1 row2]
  (let [a (get row1 :PRC)
        b (get row2 :PRC)]
    (cond
      (and (= a "") (= b "")) 0
      (= a "")                -1
      (= b "")                1
      :else (Double/compare (Double/parseDouble a) (Double/parseDouble b)))))

(defn- rows-from
  "Parses CSV text from the open reader `rdr` into a lazy seq of maps whose
  keys are the keywordized cells of the first (header) row,
  e.g. {:tic \"AAPL\"}. The caller owns `rdr` and must consume the seq before
  closing it."
  [rdr]
  (let [csv-data (csv/read-csv rdr)
        header   (map keyword (first csv-data))]
    (map #(zipmap header %) (rest csv-data))))

(defn get-seq
  "Returns the rows of the CSV file at `input-dir` as a lazy seq of maps keyed
  by the keywordized header row.

  The reader opened here cannot be closed by this function because the result
  is lazy; prefer `with-open` plus `rows-from` when the seq is consumed locally."
  [input-dir]
  (rows-from (io/reader input-dir)))

(defn internal-sort-large
  "Sorts the CSV file at `input-dir` with `comparator` without loading it into
  memory, appending the string form of each row map plus a newline to
  `out-dir`. Returns \"success\".

  Each round re-reads the input twice: once to find the smallest row that is
  strictly greater than the previously written one, once to write every row
  that compares equal to it. The number of rounds therefore equals the number
  of distinct sort keys.

  State lives in loop locals, not in namespace-level vars, so concurrent or
  nested calls do not interfere, and every reader is closed after its pass."
  [input-dir out-dir comparator]
  (with-open [wtr (io/writer out-dir :append true)]
    (loop [prev nil]
      (let [;; pass 1: smallest row strictly greater than `prev`
            curr (with-open [rdr (io/reader input-dir)]
                   (reduce (fn [best row]
                             (if (and (or (nil? prev) (pos? (comparator row prev)))
                                      (or (nil? best) (neg? (comparator row best))))
                               row
                               best))
                           nil
                           (rows-from rdr)))]
        ;; nil means nothing is left above `prev`: the sort is finished
        (when (some? curr)
          ;; pass 2: emit every row that ties with `curr`
          (with-open [rdr (io/reader input-dir)]
            (doseq [row (rows-from rdr)
                    :when (zero? (comparator row curr))]
              (.write wtr (str row "\n"))))
          (.flush wtr)
          (recur curr))))
    "success"))

(defn internal-sort-small
  "Sorts the CSV file at `input-dir` in memory with `comparator` and writes the
  string form of each row map plus a newline to `out-dir`, replacing any
  existing content. Returns \"success\"."
  [input-dir out-dir comparator]
  (with-open [rdr (io/reader input-dir)
              wtr (io/writer out-dir)]
    ;; `sort` realizes the whole seq, so the reader is fully consumed before
    ;; `with-open` closes it
    (doseq [row (sort comparator (rows-from rdr))]
      (.write wtr (str row "\n"))))
  "success")

(defn use-external-sort
  "Sorts the CSV file `input` into `output` with the external-sorting library,
  using the comparator `comp`. Temporary batch files go to ./.clojask/sort.
  Returns a string reporting the row count returned by the merge step."
  [input output comp]
  ;; create the output file, or truncate it if it already exists
  (with-open [wtr (io/writer output)]
    (.write wtr ""))
  ;; make-parents creates the directories above the given path, i.e. ./.clojask/sort
  (io/make-parents "./.clojask/sort/a.txt")
  (let [in-file     (File. input)
        out-file    (File. output)
        sort-option (.build
                      (doto (CsvSortOptions$Builder. comp
                                                     CsvExternalSort/DEFAULTMAXTEMPFILES
                                                     (* 5 (CsvExternalSort/estimateAvailableMemory)))
                        (.numHeader 1)
                        (.skipHeader false)))
        ;; passed empty to sortInBatch, then handed on to mergeSortedFiles
        header      (java.util.ArrayList.)
        file-list   (CsvExternalSort/sortInBatch in-file (File. "./.clojask/sort") sort-option header)]
    (str "Sorted in total "
         (CsvExternalSort/mergeSortedFiles file-list out-file sort-option true header)
         " rows.")))
20 | (def writer-calls 21 | {:lifecycle/before-task-start inject-into-eventmap 22 | :lifecycle/after-task-stop close-writer}) 23 | 24 | (def df (atom nil)) 25 | (def output-func (atom nil)) 26 | 27 | (defn inject-dataframe 28 | [dataframe out] 29 | (reset! df dataframe) 30 | (reset! output-func out)) 31 | 32 | (defrecord ClojaskOutput [output-func] 33 | p/Plugin 34 | (start [this event] 35 | ;; Initialize the plugin, generally by assoc'ing any initial state. 36 | this) 37 | 38 | (stop [this event] 39 | ;; Nothing is required here. However, most plugins have resources 40 | ;; (e.g. a connection) to clean up. 41 | ;; Mind that such cleanup is also achievable with lifecycles. 42 | this) 43 | 44 | p/Checkpointed 45 | ;; Nothing is required here. This is normally useful for checkpointing in 46 | ;; input plugins. 47 | (checkpoint [this]) 48 | 49 | ;; Nothing is required here. This is normally useful for checkpointing in 50 | ;; input plugins. 51 | (recover! [this replica-version checkpoint]) 52 | 53 | ;; Nothing is required here. This is normally useful for checkpointing in 54 | ;; input plugins. 55 | (checkpointed! [this epoch]) 56 | 57 | p/BarrierSynchronization 58 | (synced? [this epoch] 59 | ;; Nothing is required here. This is commonly used to check whether all 60 | ;; async writes have finished. 61 | true) 62 | 63 | (completed? [this] 64 | ;; Nothing is required here. This is commonly used to check whether all 65 | ;; async writes have finished (just like synced). 66 | true) 67 | 68 | p/Output 69 | (prepare-batch [this event replica messenger] 70 | ;; Nothing is required here. This is useful for some initial preparation, 71 | ;; before write-batch is called repeatedly. 72 | true) 73 | 74 | (write-batch [this {:keys [onyx.core/write-batch clojask/wtr]} replica messenger] 75 | ;; keys [:Departement] 76 | ;; Write the batch to your datasink. 77 | ;; In this case we are conjoining elements onto a collection. 
(ns clojask.classes.RowInfo
  ;; (:require [clojask.utils :refer :all])
  )

(import '[com.clojask.exception TypeException]
        '[com.clojask.exception OperationException])

;; Operations available on a RowInfo.
;; NOTE(review): `getAggreOldKeys` and `renameRowInfo` are declared here but the
;; RowInfo deftype below does not implement them; calling either on a RowInfo
;; fails at runtime.
(definterface RowIntf
  (getFilters [])
  (getAggreOldKeys [])
  (getAggreNewKeys [])
  (getAggreFunc [])
  (getGroupbyKeys [])
  (filter [cols predicate])
  (groupby [a])
  (aggregate [func old-key new-key])
  (setRowInfo [new-col-desc new-col-set])
  (renameRowInfo [new-col-names])
  (copy [])
  (rollback [])
  (commit []))

;; Mutable record of the row-level operations requested on a dataframe.
;;
;;   filters        collection of [predicate cols] entries
;;   groupby-key    the group-by keys (replaced wholesale by `groupby`)
;;   aggre-func     collection of [func old-key] entries
;;   aggre-new-key  collection of output names for the aggregations
;;   hist           snapshot of the four fields above taken by `copy`;
;;                  {} means "no snapshot"
;;
;; Every mutating method snapshots the current state first, so one `rollback`
;; undoes the most recent mutation and `commit` discards the snapshot.
(deftype RowInfo
         [^:unsynchronized-mutable filters
          ^:unsynchronized-mutable groupby-key
          ^:unsynchronized-mutable aggre-func
          ;; ^:unsynchronized-mutable aggre-old-key
          ^:unsynchronized-mutable aggre-new-key
          ^:unsynchronized-mutable hist]
  RowIntf

  (getFilters
    [this]
    filters)

  (getGroupbyKeys
    [this]
    groupby-key)

  ;; (getAggreOldKeys
  ;;   [this]
  ;;   aggre-old-key)

  (getAggreNewKeys
    [this]
    aggre-new-key)

  (getAggreFunc
    [this]
    aggre-func)

  ;; Adds one filter entry [predicate cols]. Returns nil.
  (filter
    [this cols predicate]
    (.copy this)
    (set! filters (conj filters [predicate cols]))
    nil)

  ;; Replaces the group-by keys with `key`. Returns nil.
  (groupby
    [this key]
    (.copy this)
    (set! groupby-key key)
    nil)

  ;; Adds one [func old-key] entry per element of `old-keys`, and every element
  ;; of `new-keys` to the output names. Returns nil.
  ;; No prior group-by is required: the previous `(if true ...)` wrapper made
  ;; its "must first group" throw unreachable, so that dead branch is gone.
  (aggregate
    [this func old-keys new-keys]
    (.copy this)
    (set! aggre-func (into aggre-func (map (fn [old-key] [func old-key]) old-keys)))
    (set! aggre-new-key (into aggre-new-key new-keys))
    nil)

  ;; Re-targets the columns referenced by the stored filters, group-by keys and
  ;; aggregation functions. Each old column reference is looked up in
  ;; `new-col-desc` and replaced by `(ffirst <looked-up value>)`; the function
  ;; half of every entry is kept. Empty collections are left untouched.
  ;; `new-col-set` is not used.
  (setRowInfo
    [this new-col-desc new-col-set]
    (.copy this)
    (let [remap           (fn [col] (ffirst (get new-col-desc col)))
          ;; entries are rebuilt as [first-element remapped-last-element]
          remap-entry     (fn [entry] [(first entry) (remap (last entry))])
          current-filters filters
          current-groupby groupby-key
          current-aggre   aggre-func]
      (when (seq current-filters)
        ;; a filter's last element is a collection of columns: remap each one
        (set! filters (mapv (fn [entry] [(first entry) (map remap (last entry))])
                            current-filters)))
      (when (seq current-groupby)
        (set! groupby-key (mapv remap-entry current-groupby)))
      (when (seq current-aggre)
        (set! aggre-func (mapv remap-entry current-aggre)))))

  ;; Snapshots the current state into `hist`.
  (copy
    [this]
    (set! hist {:filters filters
                :groupby-key groupby-key
                :aggre-func aggre-func
                :aggre-new-key aggre-new-key}))

  ;; Restores the state saved by the last `copy`; no-op when `hist` is {}.
  (rollback
    [this]
    (when (not= hist {})
      (set! filters (:filters hist))
      (set! groupby-key (:groupby-key hist))
      (set! aggre-func (:aggre-func hist))
      (set! aggre-new-key (:aggre-new-key hist))))

  ;; Drops the snapshot so a later `rollback` does nothing.
  (commit
    [this]
    (set! hist {})))
tmp inc) 44 | {:id @tmp :file file :d (read-string (u/decode-str (.getName file)))}) 45 | (rest (file-seq (java.io/file path)))))) 46 | ] 47 | (if (nil? checkpoint) 48 | (do 49 | (vreset! rst data) 50 | (vreset! offset 0)) 51 | (do 52 | (info "clojask.aggregate.aggre-input is recovering state by dropping" checkpoint "elements.") 53 | (vreset! rst (drop checkpoint data)) 54 | (vreset! offset checkpoint))))) 55 | 56 | (checkpointed! [this epoch]) 57 | 58 | p/BarrierSynchronization 59 | (synced? [this epoch] 60 | true) 61 | 62 | (completed? [this] 63 | @completed?) 64 | 65 | p/Input 66 | (poll! [this _ _] 67 | ;; (if (> (mem-usage) 500) 68 | ;; (Thread/sleep 10)) 69 | ;; (while (not (filter-check filters types (:d (first @rst)))) 70 | ;; (vswap! rst rest)) 71 | (if-let [seg (first @rst)] 72 | (do 73 | (vswap! rst rest) 74 | seg) 75 | (do (vreset! completed? true) 76 | nil)) 77 | ;; (if-let [seg (first @rst)] 78 | ;; (do (vswap! rst rest) 79 | ;; (vswap! offset inc) 80 | ;; ;; (spit "resources/debug.txt" (str seg) :append true) 81 | ;; seg) 82 | ;; (do (vreset! completed? true) 83 | ;; nil)) 84 | )) 85 | 86 | (defn inject-dataframe 87 | [dataframe _source] 88 | (def df dataframe) 89 | (def source _source)) 90 | 91 | (defn input [{:keys [onyx.core/task-map] :as event}] 92 | ;; (println (:seq/rdr event)) 93 | (map->AbsSeqReader {:event event 94 | ;; :sequential (:seq/seq event) 95 | ;; :reader (:seq/rdr event) 96 | ;; :filters (.getFilters (:row-info df)) 97 | ;; :types (.getType (:col-info df)) 98 | :path (:buffered-reader/path event) 99 | :rst (volatile! nil) 100 | :completed? (volatile! false) 101 | :checkpoint? (not (false? (:seq/checkpoint? task-map))) 102 | :offset (volatile! 
(ns clojask.join.outer-input
  (:require [clojure.core.async :refer [poll! timeout chan close!]]
            [clojure.set :refer [join]]
            [onyx.plugin.protocols :as p]
            [clojure.data.csv :as csv]
            [clojask.utils :refer [filter-check]]
            [taoensso.timbre :refer [fatal info debug] :as timbre]
            [clojure.java.io :as java.io])
  (:import (java.io BufferedReader)))

;; Set by `inject-dataframe`. While `mgroup-a` is nil the reader lists the
;; files under `path`; otherwise it iterates `(.getKeys mgroup-a)`.
(def mgroup-a nil)
(def mgroup-b nil)

;; Input plugin record implementing the `onyx.plugin.protocols` Plugin,
;; Checkpointed, BarrierSynchronization and Input protocols.
;;
;;   path         directory whose entries are emitted when `mgroup-a` is nil
;;   rst          volatile holding the segments that have not been polled yet
;;   completed?   volatile flag, set to true once `rst` is exhausted
;;   checkpoint?  when falsey, `checkpoint` reports nil instead of the offset
;;   offset       volatile holding the offset recorded by `recover!`
(defrecord AbsSeqReader [event path rst completed? checkpoint? offset]
  p/Plugin

  (start [this event]
    this)

  (stop [this event]
    this)

  p/Checkpointed
  (checkpoint [this]
    (when checkpoint? @offset))

  (recover!
    [this _ checkpoint]
    (vreset! completed? false)
    (let [directory  (java.io/file path)
          in-memory? (some? mgroup-a)
          files      (if in-memory?
                       (.getKeys mgroup-a)
                       ;; file-seq lists the directory itself first; skip it
                       (rest (file-seq directory)))
          ;; One segment {:id <running index> :d <string form of the key/file>}
          ;; per entry. The index comes from `map-indexed`, so each reader
          ;; numbers its own segments from 0. A namespace-level counter created
          ;; with `def` would be shared, and rebound by any later `recover!`
          ;; while this lazy seq is still being consumed.
          data       (map-indexed
                       (fn [id file]
                         ;; in the in-memory case each emitted key is also
                         ;; removed from `mgroup-b`, when one is set
                         (when (and in-memory? (some? mgroup-b))
                           (.delete mgroup-b file))
                         {:id id :d (str file)})
                       files)]
      (if (nil? checkpoint)
        (do
          (vreset! rst data)
          (vreset! offset 0))
        (do
          (info "clojask.join.outer-input is recovering state by dropping" checkpoint "elements.")
          (vreset! rst (drop checkpoint data))
          (vreset! offset checkpoint)))))

  (checkpointed! [this epoch])

  p/BarrierSynchronization
  (synced? [this epoch]
    true)

  (completed? [this]
    @completed?)

  p/Input
  (poll! [this _ _]
    ;; Hand out the next pending segment, or flag completion and return nil.
    (if-let [seg (first @rst)]
      (do
        (vswap! rst rest)
        seg)
      (do
        (vreset! completed? true)
        nil))))

(defn inject-dataframe
  "Rebinds the namespace-level vars `mgroup-a` and `mgroup-b`. Must be called
  before `recover!` runs on a reader."
  [_mgroup-a _mgroup-b]
  (def mgroup-a _mgroup-a)
  (def mgroup-b _mgroup-b))

(defn input
  "Builds an `AbsSeqReader` for the Onyx `event`, reading the directory path
  from the event's `:buffered-reader/path` key. Checkpointing is enabled unless
  the task map sets `:seq/checkpoint?` to exactly `false`."
  [{:keys [onyx.core/task-map] :as event}]
  (map->AbsSeqReader {:event event
                      :path (:buffered-reader/path event)
                      :rst (volatile! nil)
                      :completed? (volatile! false)
                      :checkpoint? (not (false? (:seq/checkpoint? task-map)))
                      :offset (volatile! nil)}))

;; No lifecycle calls are registered for the reader.
(def reader-calls
  {})

(defn inject-lifecycle-seq
  "Lifecycle hook: exposes the lifecycle's `:seq/sequential` value under
  `:seq/seq`."
  [_ lifecycle]
  {:seq/seq (:seq/sequential lifecycle)})

(def inject-seq-via-lifecycle
  {:lifecycle/before-task-start inject-lifecycle-seq})
43 | ;; Users will generally always have to include these in their lifecycle calls 44 | ;; when submitting the job. 45 | (def writer-aggre-calls 46 | {:lifecycle/before-task-start inject-into-eventmap}) 47 | 48 | (defrecord ClojaskGroupby [write-index] 49 | p/Plugin 50 | (start [this event] 51 | ;; Initialize the plugin, generally by assoc'ing any initial state. 52 | this) 53 | 54 | (stop [this event] 55 | ;; Nothing is required here. However, most plugins have resources 56 | ;; (e.g. a connection) to clean up. 57 | ;; Mind that such cleanup is also achievable with lifecycles. 58 | this) 59 | 60 | p/Checkpointed 61 | ;; Nothing is required here. This is normally useful for checkpointing in 62 | ;; input plugins. 63 | (checkpoint [this]) 64 | 65 | ;; Nothing is required here. This is normally useful for checkpointing in 66 | ;; input plugins. 67 | (recover! [this replica-version checkpoint]) 68 | 69 | ;; Nothing is required here. This is normally useful for checkpointing in 70 | ;; input plugins. 71 | (checkpointed! [this epoch]) 72 | 73 | p/BarrierSynchronization 74 | (synced? [this epoch] 75 | ;; Nothing is required here. This is commonly used to check whether all 76 | ;; async writes have finished. 77 | true) 78 | 79 | (completed? [this] 80 | ;; Nothing is required here. This is commonly used to check whether all 81 | ;; async writes have finished (just like synced). 82 | true) 83 | 84 | p/Output 85 | (prepare-batch [this event replica messenger] 86 | ;; Nothing is required here. This is useful for some initial preparation, 87 | ;; before write-batch is called repeatedly. 88 | true) 89 | 90 | (write-batch [this {:keys [onyx.core/write-batch clojask/dist clojask/groupby-keys clojask/key-index clojask/formatter]} replica messenger] 91 | ;; keys [:Departement] 92 | ;; Write the batch to your datasink. 93 | ;; In this case we are conjoining elements onto a collection. 94 | (doseq [msg write-batch] 95 | (doseq [data (:d msg)] 96 | ;; (swap! 
example-datasink conj msg) 97 | (if (not= data nil) 98 | (do 99 | ;(.write wtr (str msg "\n")) 100 | ;; !! define argument (debug) 101 | ;; (def groupby-keys [:Department :EmployeeName]) 102 | (output-groupby dist data groupby-keys key-index formatter write-index (deref format_)))))) 103 | true)) 104 | 105 | ;; Builder function for your output plugin. 106 | ;; Instantiates a record. 107 | ;; It is highly recommended you inject and pre-calculate frequently used data 108 | ;; from your task-map here, in order to improve the performance of your plugin 109 | ;; Extending the function below is likely good for most use cases. 110 | (defn groupby [pipeline-data] 111 | (->ClojaskGroupby (deref write-index))) -------------------------------------------------------------------------------- /src/main/clojure/clojask/classes/MGroup.clj: -------------------------------------------------------------------------------- 1 | (ns clojask.classes.MGroup 2 | (:require [clojure.set :as set] 3 | [clojask.utils :as u]) 4 | (:import [com.clojask.exception ExecutionException])) 5 | 6 | (definterface MGroupIntf 7 | (final []) 8 | (getKeys []) 9 | (exists [key]) 10 | (write [key msg write-index formatter] "mimic a bufferedwriter, add a row to a group") 11 | (getKey [key])) 12 | 13 | (definterface MGroupJoinIntf 14 | (getKeyBoth [key]) 15 | (delete [key])) 16 | 17 | (deftype MGroup 18 | [^:unsynchronized-mutable groups] 19 | 20 | MGroupIntf 21 | 22 | (final 23 | [this] 24 | (set! groups (persistent! groups))) 25 | 26 | (getKeys 27 | [this] 28 | ;; (println (keys groups)) 29 | (keys groups)) 30 | 31 | (exists 32 | [this key] 33 | (some? (get groups key))) ;; true when `key` already has a group; the previous `nil?` test returned the opposite 34 | 35 | (write 36 | [this key msg write-index formatter] 37 | (if-let [group (get groups key)] 38 | (set! groups (assoc! groups key (conj! group (u/gets msg write-index)))) 39 | (set! groups (assoc! groups key (transient [(u/gets msg write-index)]))))) 40 | 41 | (getKey 42 | [this key] 43 | (persistent! 
(get groups key)))) 44 | 45 | (deftype MGroupJoin 46 | [^:unsynchronized-mutable groups 47 | ;; ^:unsynchronized-mutable unformat-groups 48 | ^:volatile-mutable _keys 49 | rolling] 50 | MGroupIntf 51 | (final 52 | [this] 53 | (let [tmp-keys (persistent! _keys)] 54 | ;; (if rolling 55 | ;; (doseq [key (keys tmp-keys)] 56 | ;; (set! groups (assoc! groups key (persistent! (get groups key)))) 57 | ;; (set! unformat-groups (assoc! unformat-groups key (persistent! (get unformat-groups key))))) 58 | (doseq [key (keys tmp-keys)] 59 | (set! groups (assoc! groups key (persistent! (get groups key))))) 60 | ;; ) 61 | (set! _keys (transient tmp-keys))) 62 | (set! groups (persistent! groups)) 63 | ;; (println rolling) 64 | ;; (println groups) 65 | ;; (set! unformat-groups (persistent! unformat-groups)) 66 | ) 67 | 68 | (getKeys 69 | [this] 70 | (keys groups)) 71 | 72 | (exists 73 | [this key] 74 | (contains? _keys key)) 75 | 76 | (write 77 | [this key msg write-index formatter] 78 | (if-let [group (get groups key)] 79 | (do 80 | (if rolling 81 | (set! groups (assoc! groups key (conj! group [(u/gets-format msg write-index formatter) (u/gets msg write-index)]))) 82 | (set! groups (assoc! groups key (conj! group (u/gets-format msg write-index formatter)))))) 83 | (do 84 | (if rolling 85 | (set! groups (assoc! groups key (transient [[(u/gets-format msg write-index formatter) (u/gets msg write-index)]]))) 86 | (set! groups (assoc! groups key (transient [(u/gets-format msg write-index formatter)])))) 87 | (set! _keys (assoc! _keys key 1))))) 88 | 89 | (getKey 90 | [this key] 91 | (get groups key)) 92 | ) 93 | 94 | 95 | (deftype MGroupJoinOuter 96 | [^:unsynchronized-mutable groups 97 | ;; ^:unsynchronized-mutable unformat-groups 98 | ^:volatile-mutable _keys 99 | rolling] 100 | MGroupIntf 101 | (final 102 | [this] 103 | (set! _keys (persistent! _keys)) 104 | ;; (let [tmp-keys (persistent! _keys)] 105 | ;; ;; (if rolling 106 | ;; ;; (doseq [key (keys tmp-keys)] 107 | ;; ;; (set! 
groups (assoc! groups key (persistent! (get groups key)))) 108 | ;; ;; (set! unformat-groups (assoc! unformat-groups key (persistent! (get unformat-groups key))))) 109 | ;; (doseq [key (keys tmp-keys)] 110 | ;; (set! groups (assoc! groups key (persistent! (get groups key))))) 111 | ;; ;; ) 112 | ;; (set! _keys (transient tmp-keys))) 113 | ;; (set! groups (persistent! groups)) 114 | ;; (println rolling) 115 | ;; (println groups) 116 | ;; (set! unformat-groups (persistent! unformat-groups)) 117 | ) 118 | 119 | (getKeys 120 | [this] 121 | (keys _keys)) 122 | 123 | (exists 124 | [this key] 125 | (contains? groups key)) 126 | 127 | (write 128 | [this key msg write-index formatter] 129 | (if-let [group (get groups key)] 130 | (set! groups (assoc! groups key (conj! group (u/gets-format msg write-index formatter)))) 131 | (set! groups (assoc! groups key (transient [(u/gets-format msg write-index formatter)])))) 132 | (set! _keys (assoc! _keys key 1))) 133 | 134 | (getKey 135 | [this key] 136 | (persistent! (get groups key))) 137 | 138 | MGroupJoinIntf 139 | ;; (getKeyBoth 140 | ;; [this key] 141 | ;; (if (.exists this key) (get unformat-groups key))) 142 | 143 | (delete 144 | [this key] 145 | (set! _keys (dissoc! _keys key)))) 146 | -------------------------------------------------------------------------------- /src/main/clojure/clojask/debug.clj: -------------------------------------------------------------------------------- 1 | (ns clojask.debug 2 | (:require [clojask.dataframe :refer :all] 3 | [clojask.utils :as u] 4 | [clojask.groupby :refer :all] 5 | [clojask.sort :as sort] 6 | [clojask.api.aggregate :as agg] 7 | [clojask.api.gb-aggregate :as gb-agg] 8 | [clojure.string :as str] 9 | [clojask.extensions.bind :refer :all] 10 | [clojask.extensions.reshape :refer :all]) 11 | (:refer-clojure :exclude [group-by filter dedupe sort])) 12 | "For debugging purposes only, will not be used in production." 
13 | 14 | (defn -main 15 | [] 16 | ;(def x "Hello world") 17 | ;(-> (clojure.core/format "Expression '%s' not defined." x)(MyOwnException.)(throw)) 18 | 19 | (def x (dataframe "./resources/Employees.csv" :have-col true)) 20 | ;; (set-type x "Employee" "double") 21 | ;; (group-by x ["Department"]) 22 | (aggregate x agg/min ["Employee"]) 23 | (print-df x) 24 | ;; (def y (dataframe "resources/Employees-info.csv" :have-col true)) 25 | ;; (def z (left-join x y ["Employee"] ["Employee"])) 26 | ;(time (compute z 8 "resources/test.csv" :select ["1_Employee" "2_EmployeeName"] :exception true)) 27 | (def output-df (compute x 8 "resources/test.csv" :exception true)) 28 | ;(compute z 8 "resources/test.csv" :exception true) 29 | ;(time (compute x 8 "resources/test.csv" :select ["new-employee"] :exception true)) 30 | 31 | ;(time (rolling-join-forward x y ["EmployeeName"] ["EmployeeName"] "Employee" "Employee" 8 "resources/test.csv" :exception false)) 32 | 33 | ;(select-col y ["Salary" "EmployeeName"]) 34 | ;(delete-col y ["Salary" "EmployeeName"]) 35 | ;(print-df y) 36 | 37 | ;(println (.getKeys (.col-info y))) 38 | ;(set-type y "Salary" "double") 39 | ;(set-type y "EmployeeName" "double") ;; gives exception 40 | 41 | ;(operate y "Salary" (fn [x] (+ 10 x))) 42 | ;(operate y "Salary" (fn [] 2)) ;; gives exception 43 | 44 | ;(operate y str ["Employee" "Salary"] "new-col") 45 | ;(operate y ["Employee" "Salary"] "new-col" (fn [] 2)) ;; gives exception 46 | 47 | ;(print-df y) 48 | ;(filter y "Salary" (fn [salary] (<= salary 800))) 49 | ;(set-parser y "Department" #(Double/parseDouble %)) 50 | 51 | ;(delete-col y ["Salary" "Department"]) 52 | ;(println (col-names y)) 53 | 54 | ;; (group-by y ["Department" "Employee"]) 55 | ;; (aggregate y min ["Employee"] ["new-employee"]) 56 | ;; (rename-col y ["Employee" "Department-A" "EmployeeName" "Salary"]) 57 | 58 | ;; (set-type y "Department" "double") 59 | ;; (set-parser y "Salary" #(Double/parseDouble %)) 60 | ;; (operate y - "Department") 
61 | ;; (operate y str ["Employee" "Salary"] "new-col") 62 | 63 | ;(time (compute y 8 "resources/test.csv" :exception true :order true)) 64 | 65 | ;; (-> (dataframe "resources/Employees-large.csv" :have-col true) 66 | ;; (set-type "Salary" "double") 67 | ;; (filter "Salary" (fn [salary] (<= salary 800))) 68 | ;; (set-type "Department" "double") 69 | ;; (operate - "Department") 70 | ;; (operate str ["Employee" "Salary"] "new-col") 71 | ;; (group-by ["Department"]) 72 | ;; (aggregate min ["Employee"] ["new-employee"]) 73 | ;; (compute 4 "resources/test.csv" :exception true :order true) 74 | ;; time) 75 | 76 | ;; (println (.getKeys (.col-info y))) 77 | ;; ;(println "Renaming columns...") 78 | ;; (filter y ["Salary" "Department"] (fn [salary] (<= salary 800))) 79 | ;; (filter y ["Salary" "Department"] (fn [salary] (<= salary 800))) 80 | ;; (group-by y "Department") 81 | ;; (aggregate y aggre-avg ["Department" "Salary"] ["dept-avg" "salary-avg"]) 82 | ;; (reorder-col y ["Employee" "Department-x" "EmployeeName" "Salary"]) 83 | ;; (.reorderCol (.row-info y) (.getDesc (.col-info y)) ["Employee" "Department" "EmployeeName" "Salary"]) 84 | ;; (println (.getKeys (.col-info y))) 85 | 86 | ;; Benchmarking 87 | 88 | ;(def y (dataframe "../clojure-datasets/data-CRSP.csv")) 89 | (def y (dataframe "../clojure-datasets/data-Compustat-lohi.csv")) 90 | (set-type y "prccq" "double") 91 | ;(operate y (fn [val] (if val (+ val 10.0) 0.0)) "prccq") 92 | ;(operate y str ["datadate" "tic"] "new-col") 93 | (group-by y "tic") 94 | (aggregate y gb-agg/max ["prccq"] ["prccq-max"]) 95 | (time (compute y 4 "resources/test.csv" :select ["tic" "prccq-max"] :exception false)) 96 | ;(time (compute y 4 "resources/test.csv" :select ["datadate" "TICKER" "prccq"] :exception false)) 97 | 98 | ;; CRSP Benchmarking 99 | 100 | ;(def x (dataframe "../clojure-datasets/data-CRSP.csv")) 101 | ;(def x (dataframe "resources/CRSP-extract.csv")) 102 | ;(def y (dataframe "../clojure-datasets/data-Compustat-lohi.csv")) 
103 | 104 | ; join on (TIC, DATE) 105 | ;(time (rolling-join-forward x y ["TICKER"] ["tic"] "date" "datadate" 4 "resources/test.csv" :exception false)) 106 | ;(time (inner-join x y ["date" "TICKER"] ["datadate" "TICKER"] 4 "resources/test.csv" :exception false)) 107 | ) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Clojask 2 | > Clojure data processing framework with parallel computing on larger-than-memory datasets 3 | 4 | ### Features 5 | 6 | - **Unlimited Size** 7 | 8 | It supports datasets larger than memory. 9 | 10 | - **Various Operations** 11 | 12 | Although Clojask, like NoSQL systems, is designed for larger-than-memory datasets, it does not sacrifice common operations on relational dataframes, such as [group by](https://clojure-finance.github.io/clojask-website/posts-output/API/#group-by), [aggregate](https://clojure-finance.github.io/clojask-website/posts-output/API/#aggregate), and [join](https://clojure-finance.github.io/clojask-website/posts-output/API/#inner-join--left-join--right-join). 13 | 14 | - **Fast** 15 | 16 | Faster than Dask in most operations, and the larger the dataframe is, the bigger the advantage. Please find the benchmarks [here](https://clojure-finance.github.io/clojask-website/pages-output/about/#benchmarks). 17 | 18 | - **All Native Types** 19 | 20 | All the datatypes used to store data are native Clojure (or Java) types. 21 | 22 | - **From File to File** 23 | 24 | IO is integrated into the dataframe. No need to write your own read-in and output functions. 25 | 26 | - **Parallel** 27 | 28 | Most operations can be executed across multiple threads or even multiple machines. See the principle in [Onyx](http://www.onyxplatform.org/). 29 | 30 | - **Lazy Operations** 31 | 32 | Most operations will not be executed immediately. The dataframe will intelligently pipeline all the operations together at computation time. 
33 | 34 | - **Few Constraints on Programming** 35 | 36 | Except for some aggregations where you need to write customized functions subject to simple templates, operations in Clojask support arbitrary Clojure functions as input. 37 | 38 | ### Installation 39 | 40 | Available on [Clojars](https://clojars.org/com.github.clojure-finance/clojask) ![Clojars Project](https://img.shields.io/clojars/v/com.github.clojure-finance/clojask.svg). 41 | 42 | Insert this line into your `project.clj` if using Leiningen. 43 | 44 | ``` 45 | [com.github.clojure-finance/clojask "2.0.0"] 46 | ``` 47 | 48 | Insert this line into your `deps.edn` if using the Clojure CLI. 49 | 50 | ```clojure 51 | com.github.clojure-finance/clojask {:mvn/version "2.0.0"} 52 | ``` 53 | 54 | **Requirements:** 55 | 56 | - macOS or Linux 57 | - Java 8 - 11 58 | 59 | ### Example Usage 60 | 61 | 1. Import `Clojask` 62 | 63 | ```clojure 64 | (require '[clojask.dataframe :as ck]) 65 | ``` 66 | 67 | 2. Initialize a dataframe 68 | 69 | ```clojure 70 | (def df (ck/dataframe "Employees-example.csv")) 71 | ``` 72 | 73 | The source file can be found [here](https://github.com/clojure-finance/clojask/blob/1.x.x/test/clojask/Employees-example.csv). 74 | 75 | See [`dataframe`](https://clojure-finance.github.io/clojask-website/posts-output/API/#dataframe) 76 | 77 | 3. Preview the first few lines of the dataframe 78 | 79 | ```clojure 80 | (ck/print-df df) 81 | ``` 82 | 83 | ![image-20220405210757274](docs/img/image-20220405210757274.png) 84 | 85 | See [`print-df`](https://clojure-finance.github.io/clojask-website/posts-output/API/#print-df) 86 | 87 | 4. Change the data type of some columns 88 | 89 | ```clojure 90 | (ck/set-type df "Salary" "double") 91 | (ck/set-type df "UpdateDate" "date:yyyy/MM/dd") 92 | (ck/print-df df) 93 | ``` 94 | 95 | ![image-20220405210826777](docs/img/image-20220405210826777.png) 96 | 97 | See [`set-type`](https://clojure-finance.github.io/clojask-website/posts-output/API/#set-type) 98 | 99 | 5. 
Add 100 to Bob's salary and store the result as `NewSalary` 100 | 101 | ```clojure 102 | (ck/operate df (fn [EmployeeName Salary] (if (= EmployeeName "Bob") (+ Salary 100) Salary)) ["EmployeeName" "Salary"] "NewSalary") 103 | (ck/print-df df) 104 | ``` 105 | 106 | ![image-20220405211348723](docs/img/image-20220405211348723.png) 107 | 108 | See [`operate`](https://clojure-finance.github.io/clojask-website/posts-output/API/#operate-in-place-modification) 109 | 110 | 6. Output the resulting dataset to "result.csv" (using 8 threads) 111 | 112 | ```clojure 113 | (ck/compute df 8 "result.csv" :select ["Employee" "EmployeeName" "Department" "NewSalary" "UpdateDate"]) 114 | ``` 115 | 116 | See [`compute`](https://clojure-finance.github.io/clojask-website/posts-output/API/#compute) 117 | 118 | ### Supported Functions and Procedures 119 | 120 | ![clojask functions](docs/clojask_functions.png) 121 | 122 | - *The solid arrows point to the fixed next step; dotted arrows point to all possible next steps.* 123 | - *Any step except for Initialization is optional.* 124 | 125 | ### Documentation 126 | 127 | The detailed documentation for every API can be found [here](https://clojure-finance.github.io/clojask-website/posts-output/API/). 128 | 129 | ### Examples 130 | 131 | A separate repository with some typical usages of Clojask can be found [here](https://github.com/clojure-finance/clojask-examples). 132 | 133 | ### Problem Feedback 134 | 135 | If your question is not answered in the existing [issues](https://github.com/clojure-finance/clojask/issues), feel free to create a new one. 
136 | -------------------------------------------------------------------------------- /src/main/clojure/clojask/clojask_output.clj: -------------------------------------------------------------------------------- 1 | (ns clojask.clojask-output 2 | (:require [onyx.peer.function :as function] 3 | [onyx.plugin.protocols :as p] 4 | [clojure.java.io :as io] 5 | [taoensso.timbre :refer [debug info] :as timbre] 6 | [clojure.string :as string] 7 | [clojure-heap.core :as heap] 8 | [clojure.set :as set] 9 | [clojask.join.outer-output :as output]) 10 | (:import (java.io BufferedReader FileReader BufferedWriter FileWriter))) 11 | 12 | (def df (atom nil)) 13 | (def output-func (atom nil)) 14 | 15 | (defn inject-dataframe 16 | [dataframe out] 17 | (reset! df dataframe) 18 | (reset! output-func out) 19 | ) 20 | 21 | (defn- inject-into-eventmap 22 | [event lifecycle] 23 | (let [wtr (io/writer (:buffered-wtr/filename lifecycle) :append true) 24 | order (:order lifecycle) 25 | indices (:indices lifecycle) 26 | formatter (.getFormatter (:col-info (deref df)))] 27 | {:clojask/wtr wtr :clojask/order order 28 | ;; :clojask/formatter (set/rename-keys formatter (zipmap indices (iterate inc 0))) 29 | })) 30 | 31 | (defn- close-writer [event lifecycle] 32 | (.close (:clojask/wtr event))) 33 | 34 | (defn- write-msg 35 | [wtr msg melt output-func] 36 | ;; (if (not= (:d msg) nil) 37 | ;; (doseq [] 38 | ;; (output-func wtr (melt (:d msg))) 39 | ;; ;; !! define argument (debug) 40 | ;; )) 41 | (doseq [row (remove nil? (:d msg))] 42 | (output-func wtr (melt row))) 43 | ) 44 | 45 | (defn- order-write 46 | [wtr msg heap exp-id melt output-func] 47 | (let [id (:id msg)] 48 | ;; (println (str msg " " (deref exp-id))) 49 | (if (= id (deref exp-id)) 50 | (do 51 | (write-msg wtr msg melt output-func) 52 | (swap! exp-id inc) 53 | (while (= (:id (heap/peek heap)) (deref exp-id)) 54 | (write-msg wtr (heap/poll heap) melt output-func) 55 | (swap! 
exp-id inc))) 56 | (do 57 | (heap/add heap msg) 58 | ;; (println (heap/get-size heap)) 59 | ) 60 | ))) 61 | 62 | ;; Map of lifecycle calls that are required to use this plugin. 63 | ;; Users will generally always have to include these in their lifecycle calls 64 | ;; when submitting the job. 65 | (def writer-calls 66 | {:lifecycle/before-task-start inject-into-eventmap 67 | :lifecycle/after-task-stop close-writer}) 68 | 69 | (def melt (atom nil)) 70 | 71 | (defn inject-melt 72 | [tmp] 73 | (reset! melt tmp)) 74 | 75 | (defrecord ClojaskOutput [melt heap exp-id output] 76 | p/Plugin 77 | (start [this event] 78 | ;; Initialize the plugin, generally by assoc'ing any initial state. 79 | this) 80 | 81 | (stop [this event] 82 | ;; Nothing is required here. However, most plugins have resources 83 | ;; (e.g. a connection) to clean up. 84 | ;; Mind that such cleanup is also achievable with lifecycles. 85 | ;; (println (heap/get-size heap)) 86 | (if (not= (heap/get-size heap) 0) (throw (Exception. (str "The order enforcement failed. " (heap/get-size heap) " rows have been shuffled or missing.")))) 87 | this) 88 | 89 | p/Checkpointed 90 | ;; Nothing is required here. This is normally useful for checkpointing in 91 | ;; input plugins. 92 | (checkpoint [this]) 93 | 94 | ;; Nothing is required here. This is normally useful for checkpointing in 95 | ;; input plugins. 96 | (recover! [this replica-version checkpoint]) 97 | 98 | ;; Nothing is required here. This is normally useful for checkpointing in 99 | ;; input plugins. 100 | (checkpointed! [this epoch]) 101 | 102 | p/BarrierSynchronization 103 | (synced? [this epoch] 104 | ;; Nothing is required here. This is commonly used to check whether all 105 | ;; async writes have finished. 106 | true) 107 | 108 | (completed? [this] 109 | ;; Nothing is required here. This is commonly used to check whether all 110 | ;; async writes have finished (just like synced). 
111 | true) 112 | 113 | p/Output 114 | (prepare-batch [this event replica messenger] 115 | ;; Nothing is required here. This is useful for some initial preparation, 116 | ;; before write-batch is called repeatedly. 117 | true) 118 | 119 | (write-batch [this {:keys [onyx.core/write-batch clojask/wtr clojask/order]} replica messenger] 120 | ;; keys [:Departement] 121 | ;; Write the batch to your datasink. 122 | ;; In this case we are conjoining elements onto a collection. 123 | (if order 124 | (doseq [msg write-batch] 125 | (order-write wtr msg heap exp-id melt output)) 126 | (let [] 127 | (doseq [msg write-batch] 128 | ;; (println msg) 129 | (write-msg wtr msg melt output)))) 130 | true)) 131 | 132 | ;; Builder function for your output plugin. 133 | ;; Instantiates a record. 134 | ;; It is highly recommended you inject and pre-calculate frequently used data 135 | ;; from your task-map here, in order to improve the performance of your plugin 136 | ;; Extending the function below is likely good for most use cases. 
137 | (defn output [pipeline-data] 138 | (->ClojaskOutput (deref melt) (heap/heap (fn [a b] (<= (:id a) (:id b)))) (atom 0) (deref output-func))) -------------------------------------------------------------------------------- /src/main/clojure/clojask/clojask_join.clj: -------------------------------------------------------------------------------- 1 | (ns clojask.clojask-join 2 | (:require [clojask.join :as join] 3 | [clojure.set :as set] 4 | [onyx.peer.function :as function] 5 | [clojure.java.io :as io] 6 | [onyx.plugin.protocols :as p] 7 | [taoensso.timbre :refer [debug info] :as timbre]) 8 | (:import (java.io BufferedReader FileReader BufferedWriter FileWriter))) 9 | 10 | (def a (atom nil)) 11 | (def b (atom nil)) 12 | (def a-keys (atom nil)) 13 | (def b-keys (atom nil)) 14 | (def a-index (atom nil)) 15 | (def b-index (atom nil)) 16 | (def b-format (atom nil)) 17 | (def join-index (atom nil)) 18 | (def output-func (atom nil)) 19 | 20 | (defn inject-dataframe 21 | [d-a d-b a-key b-key -a-index -b-index -join-index -b-format out] 22 | (reset! a d-a) 23 | (reset! b d-b) 24 | (reset! a-keys a-key) 25 | (reset! b-keys b-key) 26 | (reset! a-index -a-index) 27 | (reset! b-index -b-index) 28 | (reset! b-format -b-format) 29 | (reset! join-index -join-index) 30 | (reset! 
output-func out)) 31 | 32 | (defn- inject-into-eventmap 33 | [event lifecycle] 34 | (let [wtr (io/writer (:buffered-wtr/filename lifecycle) :append true) 35 | ;; a-map (.getKeyIndex (.col-info (deref a))) 36 | a-format (.getFormatter (.col-info (deref a))) 37 | ;; b-map (.getKeyIndex (.col-info (deref b))) 38 | ;; a-format (set/rename-keys a-format (zipmap (deref a-index) (iterate inc 0))) 39 | ;; b-format (.getFormatter (.col-info (deref b))) 40 | ;; b-format (set/rename-keys b-format (zipmap (deref b-index) (iterate inc 0))) 41 | b-format (deref b-format) 42 | ] 43 | 44 | {:clojask/wtr wtr 45 | ;; :clojask/a-keys (:clojask/a-keys lifecycle) 46 | :clojask/a-keys (deref a-keys) 47 | ;; :clojask/b-keys (:clojask/b-keys lifecycle) 48 | :clojask/b-keys (deref b-keys) 49 | :clojask/a-roll (:clojask/a-roll lifecycle) 50 | :clojask/b-roll (:clojask/b-roll lifecycle) 51 | :clojask/a-map (:clojask/a-map lifecycle) 52 | :clojask/b-map (:clojask/b-map lifecycle) 53 | :clojask/a-format a-format 54 | :clojask/b-format b-format 55 | :clojask/join-type (:clojask/join-type lifecycle)})) 56 | 57 | (defn- close-writer [event lifecycle] 58 | (.close (:clojask/wtr event))) 59 | 60 | ;; Map of lifecycle calls that are required to use this plugin. 61 | ;; Users will generally always have to include these in their lifecycle calls 62 | ;; when submitting the job. 63 | (def writer-join-calls 64 | {:lifecycle/before-task-start inject-into-eventmap 65 | :lifecycle/after-task-stop close-writer}) 66 | 67 | (defrecord ClojaskJoin [a-index b-index join-index write-func] 68 | p/Plugin 69 | (start [this event] 70 | ;; Initialize the plugin, generally by assoc'ing any initial state. 71 | this) 72 | 73 | (stop [this event] 74 | ;; Nothing is required here. However, most plugins have resources 75 | ;; (e.g. a connection) to clean up. 76 | ;; Mind that such cleanup is also achievable with lifecycles. 77 | this) 78 | 79 | p/Checkpointed 80 | ;; Nothing is required here. 
This is normally useful for checkpointing in 81 | ;; input plugins. 82 | (checkpoint [this]) 83 | 84 | ;; Nothing is required here. This is normally useful for checkpointing in 85 | ;; input plugins. 86 | (recover! [this replica-version checkpoint]) 87 | 88 | ;; Nothing is required here. This is normally useful for checkpointing in 89 | ;; input plugins. 90 | (checkpointed! [this epoch]) 91 | 92 | p/BarrierSynchronization 93 | (synced? [this epoch] 94 | ;; Nothing is required here. This is commonly used to check whether all 95 | ;; async writes have finished. 96 | true) 97 | 98 | (completed? [this] 99 | ;; Nothing is required here. This is commonly used to check whether all 100 | ;; async writes have finished (just like synced). 101 | true) 102 | 103 | p/Output 104 | (prepare-batch [this event replica messenger] 105 | ;; Nothing is required here. This is useful for some initial preparation, 106 | ;; before write-batch is called repeatedly. 107 | true) 108 | 109 | (write-batch [this {:keys [onyx.core/write-batch clojask/wtr clojask/a-keys clojask/b-keys clojask/a-roll clojask/b-roll clojask/a-map clojask/b-map clojask/a-format clojask/b-format clojask/join]} replica messenger] 110 | ;; keys [:Departement] 111 | ;; Write the batch to your datasink. 112 | ;; In this case we are conjoining elements onto a collection. 113 | (doseq [msg write-batch] 114 | (doseq [data (:d msg)] 115 | ;; (swap! example-datasink conj msg) 116 | (if (not= data nil) 117 | (do 118 | ;(.write wtr (str msg "\n")) 119 | ;; !! define argument (debug) 120 | ;; (def groupby-keys [:Department :EmployeeName]) 121 | (join/output-join wtr data a-keys a-map b-keys (count b-map) a-roll b-roll a-format b-format a-index b-index join-index write-func))))) 122 | true)) 123 | 124 | ;; Builder function for your output plugin. 125 | ;; Instantiates a record. 
126 | ;; It is highly recommended you inject and pre-calculate frequently used data 127 | ;; from your task-map here, in order to improve the performance of your plugin 128 | ;; Extending the function below is likely good for most use cases. 129 | (defn join [pipeline-data] 130 | (->ClojaskJoin (deref a-index) (deref b-index) (deref join-index) (deref output-func))) -------------------------------------------------------------------------------- /src/main/clojure/clojask/clojask_aggre.clj: -------------------------------------------------------------------------------- 1 | (ns clojask.clojask-aggre 2 | (:require [onyx.peer.function :as function] 3 | [onyx.plugin.protocols :as p] 4 | [clojure.java.io :as io] 5 | [taoensso.timbre :refer [debug info] :as timbre] 6 | [clojure.string :as string] 7 | [clojask.api.aggregate :refer [start]] 8 | [clojask.utils :as u]) 9 | (:import [java.io BufferedReader FileReader BufferedWriter FileWriter] 10 | [com.clojask.exception ExecutionException])) 11 | 12 | (def df (atom nil)) 13 | (def aggre-func (atom nil)) 14 | (def select (atom nil)) 15 | (def output-func (atom nil)) 16 | 17 | (defn inject-dataframe 18 | [dataframe a b out] 19 | (reset! df dataframe) 20 | (reset! aggre-func a) 21 | (reset! select b) 22 | (reset! output-func out) 23 | ) 24 | 25 | (defn c-count 26 | [a] 27 | (if (coll? a) 28 | (count a) 29 | 1)) 30 | 31 | (defn- inject-into-eventmap 32 | [event lifecycle] 33 | (let [wtr (io/writer (:buffered-wtr/filename lifecycle) :append true) 34 | order (:order lifecycle) 35 | aggre-func (.getAggreFunc (:row-info (deref df)))] 36 | {:clojask/wtr wtr 37 | :clojask/aggre-func aggre-func})) 38 | 39 | (defn- close-writer [event lifecycle] 40 | (.close (:clojask/wtr event))) 41 | 42 | ;; Map of lifecycle calls that are required to use this plugin. 43 | ;; Users will generally always have to include these in their lifecycle calls 44 | ;; when submitting the job. 
(def writer-calls
  {:lifecycle/before-task-start inject-into-eventmap
   :lifecycle/after-task-stop close-writer})

;; Output plugin that folds every incoming row into `memo` (a volatile holding
;; one running value per aggregation function) and writes the final values
;; when the task stops.
(defrecord ClojaskOutput
  [memo
   aggre-func
   select
   output-func]
  p/Plugin
  (start [this event]
    ;; Initialize the plugin, generally by assoc'ing any initial state.
    this)

  (stop [this event]
    ;; Wrap scalar results so every aggregation result is a collection, check
    ;; that all results have the same length, then emit them row by row
    ;; through `output-func` and flush the writer.
    (let [data (mapv (fn [_] (if (coll? _) _ [_])) (deref memo))
          wtr (:clojask/wtr event)]
      (if (apply = (map count data))
        (do
          (mapv
           (fn [msg] (output-func wtr [(u/gets msg select)]))
           (apply map vector data))
          (.flush wtr))
        (throw (ExecutionException. "aggregation result is not of the same length"))))
    this)

  p/Checkpointed
  ;; Nothing is required in the three methods below. They are normally useful
  ;; for checkpointing in input plugins.
  (checkpoint [this])

  (recover! [this replica-version checkpoint])

  (checkpointed! [this epoch])

  p/BarrierSynchronization
  (synced? [this epoch]
    ;; Nothing is required here. This is commonly used to check whether all
    ;; async writes have finished.
    true)

  (completed? [this]
    ;; Nothing is required here. This is commonly used to check whether all
    ;; async writes have finished (just like synced).
    true)

  p/Output
  (prepare-batch [this event replica messenger]
    ;; Nothing is required here. This is useful for some initial preparation,
    ;; before write-batch is called repeatedly.
    true)

  (write-batch [this {:keys [onyx.core/write-batch clojask/wtr]} replica messenger]
    ;; For every non-nil element of each message's :d, replace each running
    ;; value with (func previous (nth element col)), where [func col] is the
    ;; entry of `aggre-func` at the same position.
    (doseq [msg write-batch]
      (doseq [data (:d msg)]
        (if (not= data nil)
          (vreset! memo
                   (doall
                    (map-indexed
                     (fn [ind prev]
                       (let [pair (nth aggre-func ind)]
                         ((nth pair 0) prev (nth data (nth pair 1)))))
                     (deref memo)))))))
    true))

;; Builder function for your output plugin.
;; Instantiates a record.
;; It is highly recommended you inject and pre-calculate frequently used data
;; from your task-map here, in order to improve the performance of your plugin
;; Extending the function below is likely good for most use cases.
(defn output
  "Builds a ClojaskOutput whose memo starts with one `start` value per
  injected aggregation function. `pipeline-data` is accepted but not read."
  [pipeline-data]
  (->ClojaskOutput (volatile! (doall (take (count (deref aggre-func))
                                           (repeat start))))
                   (deref aggre-func)
                   (deref select)
                   ;; (.getOutput (deref df))
                   (deref output-func)))
-------------------------------------------------------------------------------- /src/main/clojure/clojask/classes/ColInfo.clj: --------------------------------------------------------------------------------
(ns clojask.classes.ColInfo
  (:require [clojure.set :as set]
            [clojask.utils :refer []]))

(import '[com.clojask.exception TypeException]
        '[com.clojask.exception OperationException])

(definterface ColIntf
  (init [colNames])
  (operate [operation col])
  (operate [operation col newCol])
  (setType [operation col])
  (getDesc [] "get column description")
  (getType [] "get column type")
  (getKeys [] "get collection of keys")
  (getKeyIndex [] "get map with key = column name, value = index")
  (getIndexKey [] "get map with key = index, value = column name")
  (getDeletedCol [] "get indices of deleted columns")
  (setFormatter [format col])
  (getFormatter [])
  (delCol [col-to-del])
  (setColInfo [new-col-set])
  (renameColInfo [old-col new-col])
  (copy [] "copy all the information for rollback purpose")
  (rollback [] "undo the change making use of the copied")
  (commit []))


(deftype ColInfo
  ;; the column description about whether a change is made to this column
  [^:unsynchronized-mutable col-keys
   ^:unsynchronized-mutable key-index
   ^:unsynchronized-mutable index-key
   ^:unsynchronized-mutable col-dsp
   ^:unsynchronized-mutable col-type
   ^:unsynchronized-mutable col-format
   ^:unsynchronized-mutable col-deleted
   ^:unsynchronized-mutable hist]

  ;; method
  ColIntf

  (init
    [this colNames]
    (set! col-keys (vec colNames)) ;; contains only the original keys
    (set! key-index (zipmap col-keys (iterate inc 0)))
    (set! index-key (zipmap (iterate inc 0) col-keys))
    ;; every column i starts with the description [[i]]: its own source index
    ;; and no operations
    (set! col-dsp (zipmap (take (count colNames) (iterate inc 0)) (map vector (map vector (iterate inc 0)))))
    (set! col-deleted (set nil)))

  (getFormatter
    [this]
    col-format)

  (getDesc
    [this]
    col-dsp)

  (getType
    [this]
    col-type)

  (getKeys
    [this]
    ;; column names ordered by index 0..n-1
    (mapv (fn [index] (get index-key index))
          (take (count index-key) (iterate inc 0))))

  (getKeyIndex
    [this]
    key-index)

  (getIndexKey
    [this]
    index-key)

  (getDeletedCol
    [this]
    col-deleted)

  (operate
    [this operation col]
    ;; Appends `operation` to the description of the existing column `col`.
    ;; Returns nil on success; throws OperationException for an unknown column.
    (.copy this)
    (if (contains? key-index col)
      (do
        (set! col-dsp (assoc col-dsp (get key-index col) (conj (get col-dsp (get key-index col)) operation)))
        ;; "success"
        nil)
      (throw (OperationException. "Column name passed to operate not found"))))

  (operate
    [this operation col newCol]
    ;; Registers `newCol` as `operation` applied to the source column(s) `col`.
    ;; Returns nil on success and a message string when `newCol` already
    ;; exists; throws OperationException when a source column is not one of
    ;; `col-keys`.
    (.copy this)
    (let [col (if (coll? col)
                col
                [col])
          external (vec (filter (fn [_] (not (.contains col-keys _))) col))]
      (if (= (count external) 0)
        (if (contains? key-index newCol)
          (str newCol " is already exist")
          (do
            ;; (set! col-keys (conj col-keys newCol))
            (set! key-index (assoc key-index newCol (count key-index)))
            (set! index-key (assoc index-key (count index-key) newCol))
            (set! col-dsp (assoc col-dsp (get key-index newCol) (conj [(vec (map (fn [_] (get key-index _)) col))] operation)))
            ;; "success"
            nil))
        (do
          (throw (OperationException. (str external " are not original column names")))))))

  (setType
    [this operation col]
    (.copy this)
    (if (.contains col-keys col)
      ;; if this column has been assigned a type
      (do
        (set! col-type (assoc col-type (get key-index col) operation))
        ;; "success"
        nil)
      (throw (OperationException. "Column name passed to setType not found"))))

  (setFormatter
    [this format col]
    (.copy this)
    (set! col-format (assoc col-format (get key-index col) format)))

  (delCol
    [this col-to-delete]
    (.copy this)
    (let [col-indices (set (map key-index col-to-delete))]
      (set! col-deleted (set/union col-deleted col-indices))))

  (setColInfo
    [this new-col-set]
    (.copy this)
    (let [original-key-index (.getKeyIndex this)
          ;; Look the old indices up in the order given by `new-col-set`.
          ;; The previous `(vals (select-keys ...))` stopped following that
          ;; order once more than 8 columns were selected, because the map
          ;; built by select-keys is then no longer insertion-ordered.
          ;; Names missing from the old mapping are skipped, as before.
          new-col-dsp-vals (keep (fn [col] (get original-key-index col)) new-col-set)
          original-type (.getType this)
          original-format (.getFormatter this)]
      (set! col-keys (vec new-col-set))
      (set! key-index (zipmap new-col-set (iterate inc 0)))
      (set! index-key (zipmap (iterate inc 0) new-col-set))
      (set! col-dsp (zipmap (take (count col-keys) (iterate inc 0)) (map vector (map vector new-col-dsp-vals))))
      (if (not (empty? (.getType this)))
        (set! col-type (zipmap (map #(first (first (get col-dsp (first %)))) original-type) (map last original-type))))
      (if (not (empty? (.getFormatter this)))
        (set! col-format (zipmap (map #(first (first (get col-dsp (first %)))) original-format) (map last original-format))))))

  (renameColInfo
    [this old-col new-col]
    (.copy this)
    (set! col-keys (mapv (fn [_] (if (= _ old-col) new-col _)) col-keys))
    (let [index (get key-index old-col)]
      (set! key-index (set/rename-keys key-index {old-col new-col}))
      (set! index-key (update index-key index (fn [_] new-col)))))

  (copy
    [this]
    ;; snapshot of every field, taken at the start of each mutating method
    (set! hist {:col-keys col-keys
                :key-index key-index
                :index-key index-key
                :col-dsp col-dsp
                :col-type col-type
                :col-format col-format
                :col-deleted col-deleted}))

  (rollback
    [this]
    ;; Restore the last snapshot. `(seq hist)` treats both {} (set by commit)
    ;; and nil (no snapshot was ever taken) as "nothing to roll back"; the
    ;; previous `(not= hist {})` check reset every field to nil when hist
    ;; was nil.
    (if (seq hist)
      (do (set! col-keys (:col-keys hist)) ;; contains only the original keys
          (set! key-index (:key-index hist))
          (set! index-key (:index-key hist))
          (set! col-type (:col-type hist))
          (set! col-format (:col-format hist))
          (set! col-dsp (:col-dsp hist))
          (set! col-deleted (:col-deleted hist)))))

  (commit
    [this]
    (set! hist {})))
-------------------------------------------------------------------------------- /src/main/clojure/clojask/classes/JoinedDataFrame.clj: --------------------------------------------------------------------------------
(ns clojask.classes.JoinedDataFrame
  (:require [clojure.set :as set]
            [clojask.classes.ColInfo :refer [->ColInfo]]
            [clojask.classes.RowInfo :refer [->RowInfo]]
            [clojask.classes.DataStat :refer [->DataStat]]
            [clojask.classes.MGroup :refer [->MGroup ->MGroupJoin ->MGroupJoinOuter]]
            [clojask.classes.DataFrame :refer [->DataFrame]]
            [clojask.onyx-comps :refer [start-onyx start-onyx-aggre-only start-onyx-groupby start-onyx-join]]
            ;; [clojask.aggregate.aggre-onyx-comps :refer [start-onyx-aggre]]
            [clojask.join.outer-onyx-comps :refer [start-onyx-outer]]
            [clojure.java.io :as io]
            [clojask.utils :as u])
  (:import
   [clojask.classes.ColInfo ColInfo]
   [clojask.classes.RowInfo RowInfo]
   [clojask.classes.DataStat DataStat]
   [clojask.classes.MGroup MGroup MGroupJoin MGroupJoinOuter]
   [clojask.classes.DataFrame GenDFIntf DataFrame]
   [com.clojask.exception TypeException OperationException]))

;; ============= Below is the definition for the joineddataframe ================
(definterface JDFIntf
  (compute [^int num-worker ^String output-dir ^boolean exception ^boolean order select ifheader out inmemory]))

(defrecord JoinedDataFrame
  [^clojask.classes.DataFrame.DataFrame a
   ^clojask.classes.DataFrame.DataFrame b
   a-keys
   b-keys
   a-roll
   b-roll
   type
   limit
   prefix
   output-func]

  GenDFIntf

  (checkInputPathClash
    [this path]
    (.checkInputPathClash a path)
    (.checkInputPathClash b path))

  (getColNames
    [this]
    ;; column names of `a` then `b`, each prefixed with "<prefix>_"
    (let [a-col-prefix (first prefix)
          b-col-prefix (last prefix)
          a-col-set (.getColNames a)
          b-col-set (.getColNames b)
          a-col-header (mapv #(str a-col-prefix "_" %) a-col-set)
          b-col-header (mapv #(str b-col-prefix "_" %) b-col-set)]
      (concat a-col-header b-col-header)))

  (setOutput
    [this output]
    (reset! output-func output))

  (getOutput
    [this]
    (deref output-func))

  (printCol
    ;; print column names, called by compute
    [this output-path selected-index out]
    (let [col-set (if (= selected-index [nil]) (.getColNames this) (mapv (vec (.getColNames this)) selected-index))
          wrtr (if output-path (io/writer output-path) nil)]
      (try
        ((or out (.getOutput this)) wrtr [col-set])
        ;; keep returning nil, as before
        nil
        (finally
          ;; close the writer even when the output function throws
          (if wrtr (.close wrtr))))))

  (preview
    [this sample-size output-size format]
    ;; previews both sides, renames their keys to the prefixed names and
    ;; merges the rows pairwise
    (let [data-a (.preview a sample-size output-size format)
          data-b (.preview b sample-size output-size format)
          old-a (.getColNames a)
          old-b (.getColNames b)
          rep-key-a (zipmap old-a (take (count old-a) (.getColNames this)))
          rep-key-b (zipmap old-b (take-last (count old-b) (.getColNames this)))
          data-a (map #(set/rename-keys % rep-key-a) data-a)
          data-b (map #(set/rename-keys % rep-key-b) data-b)
          data (map (fn [row-a row-b] (merge row-a row-b)) data-a data-b)]
      data))

  JDFIntf

  (compute
    [this ^int num-worker ^String output-dir ^boolean exception ^boolean order select ifheader out inmemory]
    (let [col-names (vec (.getColNames this))
          ;; number of columns known to each side; hoisted because it is used
          ;; by several bindings below
          a-width (count (.getKeyIndex (.col-info a)))
          b-width (count (.getKeyIndex (.col-info b)))
          select (if (coll? select) select [select])
          ;; translate the selected column names into indices over the
          ;; concatenated (a then b) columns; [nil] means "all columns"
          select (if (= select [nil])
                   (vec (take (+ a-width b-width) (iterate inc 0)))
                   (mapv (fn [key]
                           (let [i (.indexOf col-names key)]
                             ;; an unknown name used to become index -1 and
                             ;; was carried silently into a-index
                             (if (neg? i)
                               (throw (OperationException. (str "Selected column " key " is not a column of the joined dataframe")))
                               i)))
                         select))
          a-index (vec (apply sorted-set (remove (fn [num] (>= num a-width)) select)))
          ;; a-write
          b-index (mapv #(- % a-width) (apply sorted-set (remove (fn [num] (< num a-width)) select)))
          ;; the rolling column of b must always be read; afterwards b-roll is
          ;; re-expressed as a position inside b-index
          b-index (if b-roll (vec (apply sorted-set (conj b-index b-roll))) b-index)
          b-roll (if b-roll (count (remove #(>= % b-roll) b-index)) nil)
          ;; b-write
          a-format (set/rename-keys (.getFormatter (.col-info a)) (zipmap a-index (iterate inc 0)))
          b-format (set/rename-keys (.getFormatter (.col-info b)) (zipmap b-index (iterate inc 0)))
          write-index (mapv (fn [num] (count (remove #(>= % num) (concat a-index (mapv #(+ % a-width) b-index))))) select)
          ;; test (println a-index b-index b-format write-index b-roll)
          mgroup-a (MGroupJoinOuter. (transient {}) (transient {}) false)
          mgroup-b (if (not= type 3)
                     (MGroupJoin. (transient {}) (transient {}) (or (= 4 type) (= 5 type)))
                     (MGroupJoinOuter. (transient {}) (transient {}) (or (= 4 type) (= 5 type))))]
      ;; (u/init-file output-dir)
      ;; print column names
      (if (= ifheader true) (.printCol this output-dir select out))
      (cond
        (or (= type 0) (= type 1) (= type 2)) ;; inner left right join
        (do
          (if inmemory
            (start-onyx-groupby num-worker 10 b mgroup-b b-keys b-index exception)
            (start-onyx-groupby num-worker 10 b "./.clojask/join/b/" b-keys b-index exception :format true))
          (.final mgroup-b)
          (start-onyx-join num-worker 10 a b (if inmemory mgroup-b nil) output-dir exception a-keys b-keys a-roll b-roll type limit a-index (vec (take (count b-index) (iterate inc 0))) b-format write-index out))
        (= type 3) ;; outer join
        (do
          (if inmemory
            (do
              (start-onyx-groupby num-worker 10 a mgroup-a a-keys a-index exception)
              (start-onyx-groupby num-worker 10 b mgroup-b b-keys b-index exception)
              (.final mgroup-a)
              ;; (.final mgroup-b)
              )
            (do
              (start-onyx-groupby num-worker 10 a "./.clojask/join/a/" a-keys a-index exception :format true)
              (start-onyx-groupby num-worker 10 b "./.clojask/join/b/" b-keys b-index exception :format true)))
          (start-onyx-outer num-worker 10 a b (if inmemory mgroup-a nil) (if inmemory mgroup-b nil) output-dir exception a-index b-index a-format b-format write-index out))
        (or (= type 4) (= type 5)) ;; rolling join
        (do
          (if inmemory
            (start-onyx-groupby num-worker 10 b mgroup-b b-keys b-index exception)
            ;; NOTE(review): unlike the other on-disk branches this call does
            ;; not pass `:format true` - confirm that is intentional
            (start-onyx-groupby num-worker 10 b "./.clojask/join/b/" b-keys b-index exception))
          (.final mgroup-b)
          (start-onyx-join num-worker 10 a b (if inmemory mgroup-b nil) output-dir exception a-keys b-keys a-roll b-roll type limit a-index (vec (take (count b-index) (iterate inc 0))) b-format write-index out))))))

-------------------------------------------------------------------------------- /src/main/clojure/clojask/preview.clj:
--------------------------------------------------------------------------------
(ns clojask.preview
  (:require [clojure.set :as set]
            ;; [clojask.classes.ColInfo :refer [->ColInfo]]
            ;; [clojask.classes.RowInfo :refer [->RowInfo]]
            [clojure.data.csv :as csv]
            [clojure.java.io :as io]
            [clojask.utils :refer [eval-res eval-res-ne filter-check]]
            [clojask.groupby :refer [gen-groupby-filenames]]
            ;; [clojask.onyx-comps :refer [start-onyx start-onyx-groupby start-onyx-join]]
            ;; [clojask.sort :as sort]
            ;; [clojask.join :as join]
            ;; [clojask.aggregate.aggre-onyx-comps :refer [start-onyx-aggre]]
            [clojure.string :as str]
            ;; NOTE(review): this namespace requires itself; kept as-is
            [clojask.preview :as preview]
            [clojask.api.aggregate :as aggre]))

(defn preview
  "Computes a small in-process preview of `dataframe`.

  Takes at most `sample-size` rows from the sequence returned by the
  dataframe's source function, applies the row filters and the column
  operations, and keeps at most `return-size` resulting rows. When
  `formatting` is truthy the column formatters are applied to the output.

  Returns a vector of maps from column name to value. If the dataframe has
  aggregation functions and/or group-by keys, they are evaluated on the
  sampled rows and the result rows are the aggregated ones.

  Throws Exception when the aggregation results do not all have the same
  length."
  [dataframe sample-size return-size formatting]
  (let [col-info (:col-info dataframe)
        row-info (:row-info dataframe)
        formatters (.getFormatter col-info)
        ;; column indices that make up a result row, in output order
        index (.getColIndex dataframe)
        header (.getColNames dataframe)
        csv-data ((.getFunc dataframe))
        ;; wrap every source row as {:id n :d row}
        data (map zipmap (repeat [:id :d]) (map vector (iterate inc 0) csv-data))
        sample (take sample-size data) ;; lazy source data (take sample size)
        operations (.getDesc col-info)
        types (.getType col-info)
        filters (.getFilters row-info)
        no-aggre (= (.getAggreFunc row-info) []) ;; if need to groupby & aggregate
        no-groupby (= (.getGroupbyKeys row-info) [])
        ;; takes over the work done in the worker nodes: returns {:d values},
        ;; one value per entry of `index`, or {} for a filtered-out row
        preview-work-func (fn [seg]
                            (let [data (:d seg)]
                              (if (filter-check filters types data)
                                {:d (mapv (fn [_] (eval-res data types formatters operations _)) index)}
                                {})))
        ;; takes over the work done in the output node. Returns nil for a
        ;; filtered-out row so that it is skipped below.
        preview-output-func (if (and formatting no-aggre no-groupby)
                              (fn [row]
                                (when-let [values (:d row)]
                                  ;; (:d row) is positional: its i-th value
                                  ;; belongs to column (nth index i). Pair the
                                  ;; two instead of indexing the values by
                                  ;; column index, which read the wrong slot
                                  ;; or ran out of bounds once `index` was
                                  ;; not 0..n-1.
                                  (mapv (fn [col-index value]
                                          (if-let [formatter (get formatters col-index)]
                                            (formatter value)
                                            value))
                                        index values)))
                              (fn [row]
                                (:d row)))
        ;; the result of the normal compute: stop as soon as `return-size`
        ;; rows are collected (checked first, so no further source row is
        ;; realised) or the sample is exhausted
        compute-res (loop [rows sample
                           res (transient [])]
                      (if (or (>= (count res) return-size) (empty? rows))
                        (persistent! res)
                        (let [row-res (preview-output-func (preview-work-func (first rows)))]
                          (recur (rest rows)
                                 (if row-res (conj! res row-res) res)))))]
    (if (and no-groupby no-aggre)
      (mapv (fn [row-v] (zipmap header row-v)) compute-res)
      ;; need to do aggregate
      (if no-groupby
        ;; need to do simple aggregate
        (let [aggre-funcs (.getAggreFunc row-info)
              new-keys (.getAggreNewKeys row-info)
              aggre-res (for [[func col] aggre-funcs]
                          (let [res (reduce func aggre/start (mapv (fn [row] (nth row col)) compute-res))]
                            (if (coll? res)
                              res
                              [res])))]
          (if (apply = (map count aggre-res))
            (mapv (fn [row-v] (zipmap new-keys row-v)) (apply map vector aggre-res))
            (throw (Exception. "aggregation result is not of the same length"))))
        ;; need to do groupby aggregate
        (let [key-index (.getKeyIndex col-info)
              groupby-keys (.getGroupbyKeys row-info)
              ;; group key -> vector of rows, in first-seen order of the rows
              groupby-res (group-by (fn [row] (gen-groupby-filenames nil row groupby-keys key-index formatters))
                                    compute-res)
              aggre-funcs (.getAggreFunc row-info)
              ;; column names of the aggregated result
              col-names (.getAggreColNames dataframe)
              ;; aggregates the rows of one group into one or more result rows
              aggregate-group
              (fn [rows]
                (let [first-row (first rows)
                      ;; the group-by part of the result row, taken from the
                      ;; first row of the group
                      pre (mapv (fn [pair]
                                  (let [func (first pair)
                                        col (nth pair 1)
                                        value (nth first-row col)]
                                    (cond
                                      func (func value)
                                      formatting ((or (get formatters col) identity) value)
                                      :else value)))
                                groupby-keys)
                      ;; position -> all values of that position in the group
                      columns (zipmap (iterate inc 0) (apply map vector rows))]
                  (loop [remaining aggre-funcs
                         res []]
                    (if (empty? remaining)
                      (if (= res [])
                        [pre]
                        (mapv concat (repeat pre) (apply map vector res)))
                      (let [pair (first remaining)
                            func (first pair)
                            col (nth pair 1)
                            raw (func (get columns col))
                            values (if (coll? raw)
                                     raw
                                     (vector raw))
                            values (if formatting
                                     (mapv (fn [_] (if-let [formatter (get formatters col)]
                                                     (formatter _)
                                                     _)) values)
                                     values)]
                        (if (or (= res []) (= (count values) (count (last res))))
                          (recur (rest remaining) (conj res values))
                          (throw (Exception. "aggregation result is not of the same length"))))))))]
          ;; mapcat instead of repeatedly wrapping the accumulator in `concat`,
          ;; which nests one lazy seq per group
          (mapv (fn [row-v] (zipmap col-names row-v))
                (mapcat (fn [group] (aggregate-group (nth group 1))) groupby-res)))))))
-------------------------------------------------------------------------------- /docs/clojask.extensions.md: -------------------------------------------------------------------------------- 1 | ### Directory: clojask.extensions 2 | 3 | Like many popular Python libraries, such as numpy and pandas, third-party users can extend the function of Clojask by introducing more codes above the basic source code. This attempt is interesting and encouraged. Here is an example of creating such extension functions. 4 | 5 | ### ns: clojask.extensions.bind 6 | 7 | Contains functions that can help to bind several dataset files together on both directions. 8 | 9 | #### API Foundation 10 | 11 | When defining a clojask.classes.DataFrame.DataFrame using `dataframe` function, one can input a function instead of the path of the source file. This function should produce a sequence. If this sequence is lazy, the theoretical length of the sequence can be infinite. Otherwise, it must have a finite length that is smaller than the memory size. 12 | 13 | ``` 14 | (def x (dataframe (fn [] ["col1,col2" "1,2" "3,4"]))) 15 | ``` 16 | 17 | Based on this API, we can define the `cbind` and `rbind` function for two csv files.
18 | 19 | #### `cbind-csv` 20 | 21 | Joins some csv files into a new dataframe by columns. 22 | 23 | | Argument | Type | Function | Remarks | 24 | | ---------- | ------ | ------------------------------- | ----------------------------------------------------------- | 25 | | path-a | String | The path of the first csv file | Can be absolute or relative path | 26 | | path-b | String | The path of the second csv file | Can be absolute or relative path | 27 | | [path-c's] | String | The paths of additional csv files | Can be absolute or relative path; the number is not limited | 28 | 29 | **Example** 30 | 31 | ```clojure 32 | ;; file a 33 | ;; date,item,price 34 | ;; 2010-01-20,1,18.3 35 | ;; 2010-01-20,2,38.3 36 | ;; 2010-01-23,1,18.9 37 | ;; 2010-01-23,2,48.9 38 | ;; 2010-01-26,1,19.1 39 | ;; 2010-01-26,2,59.1 40 | ;; file b 41 | ;; date,cust,Item,sold 42 | ;; 2010-01-19,101,2,11 43 | ;; 2010-01-22,102,1,7 44 | ;; 2010-01-24,102,2,9 45 | ;; 2010-01-25,101,2,9 46 | ;; 2010-01-26,101,1,10 47 | (def x (cbind-csv "path/to/a" "path/to/b")) 48 | ;; x 49 | ;; date1,item,price,date2,cust,Item,sold 50 | ;; 2010-01-20,1,18.3,2010-01-19,101,2,11 51 | ;; 2010-01-20,2,38.3,2010-01-22,102,1,7 52 | ;; 2010-01-23,1,18.9,2010-01-24,102,2,9 53 | ;; 2010-01-23,2,48.9,2010-01-25,101,2,9 54 | ;; 2010-01-26,1,19.1,2010-01-26,101,1,10 55 | ``` 56 | 57 | #### `rbind-csv` 58 | 59 | Joins some csv files into a new dataframe by rows.
60 | 61 | | Argument | Type | Function | Remarks | 62 | | ---------- | ------ | ------------------------------- | --------------------------------------------------------- | 63 | | path-a | String | The path of the first csv file | Can be absolute or relative path | 64 | | path-b | String | The path of the second csv file | Can be absolute or relative path | 65 | | [path-c's] | String | The paths of additional csv files | Can be absolute or relative path; the number is not fixed | 66 | 67 | **Example** 68 | 69 | ```clojure 70 | ;; file a 71 | ;; date,item,price 72 | ;; 2010-01-20,1,18.3 73 | ;; 2010-01-20,2,38.3 74 | ;; 2010-01-23,1,18.9 75 | ;; 2010-01-23,2,48.9 76 | ;; 2010-01-26,1,19.1 77 | ;; 2010-01-26,2,59.1 78 | ;; file b 79 | ;; date,cust,Item,sold 80 | ;; 2010-01-19,101,2,11 81 | ;; 2010-01-22,102,1,7 82 | ;; 2010-01-24,102,2,9 83 | ;; 2010-01-25,101,2,9 84 | ;; 2010-01-26,101,1,10 85 | (def x (rbind-csv "path/to/a" "path/to/b")) 86 | (print-df x) 87 | | date | item | price | 88 | |------------------+------------------+------------------| 89 | | java.lang.String | java.lang.String | java.lang.String | 90 | | 2010-01-20 | 1 | 18.3 | 91 | | 2010-01-20 | 2 | 38.3 | 92 | | 2010-01-23 | 1 | 18.9 | 93 | | 2010-01-23 | 2 | 48.9 | 94 | | 2010-01-26 | 1 | 19.1 | 95 | | 2010-01-26 | 2 | 59.1 | 96 | | 2010-01-19 | 101 | 2 | 97 | | 2010-01-22 | 102 | 1 | 98 | | 2010-01-24 | 102 | 2 | 99 | | 2010-01-25 | 101 | 2 | 100 | ``` 101 | 102 | #### **It is also possible and encouraged to create more binding functions for other file types.** 103 | 104 | ### ns: clojask.extensions.reshape 105 | 106 | Contains functions that can reshape a clojask dataframe from wide to long or from long to wide. 107 | 108 | #### API Foundation 109 | 110 | When defining a clojask.classes.DataFrame.DataFrame using `dataframe` function, you can specify the option `:melt`, which should be a function that will be applied to each resultant row vector in the end. The default is vector, which will not affect the results.
However, if `:melt` is set to 111 | 112 | ```clojure 113 | (fn [x] 114 | (repeat 2 x)) 115 | ``` 116 | 117 | , then each row will be output twice. 118 | 119 | #### `melt` 120 | 121 | Reshape the dataframe from wide to long. 122 | 123 | | Argument | Type | Function | Remarks | 124 | | -------------- | ----------------- | ----------------------------------------- | ------------------------------------------------------------ | 125 | | dataframe | clojask.classes.DataFrame.DataFrame | Specify the dataframe | | 126 | | output-path | String | The path of the output | Can be absolute or relative path with respect to the `project.clj` file. | 127 | | id | vector of strings | The fixed portion of the columns | These columns must have a perfect correlation. | 128 | | measurement | vector of strings | The measurement columns | In the result, the measurement names will become one column and the values will become another. | 129 | | [measure_name] | String | The name of the measurement in the result | By default "measure" | 130 | | [value_name] | String | The name of the value in the result | By default "value" | 131 | 132 | **Example** 133 | 134 | ```clojure 135 | ;; x 136 | ;; family_id,age_mother,dob_child1,dob_child2,dob_child3 137 | ;; 1,30,1998-11-26,2000-01-29, 138 | ;; 2,27,1996-06-22,, 139 | ;; 3,26,2002-07-11,2004-04-05,2007-09-02 140 | ;; 4,32,2004-10-10,2009-08-27,2012-07-21 141 | ;; 5,29,2000-12-05,2005-02-28, 142 | (melt x "path/to/output" ["family_id" "age_mother"] ["dob_child1" "dob_child2" "dob_child3"]) 143 | ``` 144 | 145 | #### `dcast` 146 | 147 | Reshape the dataframe from long to wide. Reversible to `melt`. 
148 | 149 | | Argument | Type | Function | Remarks | 150 | | ------------ | ------------------------------------ | ------------------------------------------- | ------------------------------------------------------------ | 151 | | dataframe | clojask.classes.DataFrame.DataFrame | Specify the dataframe | | 152 | | output-path | String | The path of the output | Can be absolute or relative path with respect to the `project.clj` file. | 153 | | id | vector of strings | The fixed portion of the columns | These columns must have a perfect correlation. | 154 | | measure-name | String | The name of the measurement | By default "measure" | 155 | | value-name | String | The name of the value | By default "value" | 156 | | values | vector of string/int/double/datetime | The value choices of the measurement column | The order matters as in the result file. | 157 | | [vals-name] | vector of string | The name of the value columns | By default, same as `values` | 158 | 159 | **Example** 160 | 161 | ``` clojure 162 | ;; x 163 | ;; family_id,age_mother,measure,value 164 | ;; 1,30,dob_child1,1998-11-26 165 | ;; 1,30,dob_child2,2000-01-29 166 | ;; 1,30,dob_child3, 167 | ;; 2,27,dob_child1,1996-06-22 168 | ;; 2,27,dob_child2, 169 | ;; 2,27,dob_child3, 170 | ;; 3,26,dob_child1,2002-07-11 171 | ;; 3,26,dob_child2,2004-04-05 172 | ;; 3,26,dob_child3,2007-09-02 173 | ;; 4,32,dob_child1,2004-10-10 174 | ;; 4,32,dob_child2,2009-08-27 175 | ;; 4,32,dob_child3,2012-07-21 176 | ;; 5,29,dob_child1,2000-12-05 177 | ;; 5,29,dob_child2,2005-02-28 178 | ;; 5,29,dob_child3, 179 | (dcast x "resources/test.csv" ["family_id" "age_mother"] "measure" "value" ["dob_child1" "dob_child2" "dob_child3"]) 180 | ``` 181 | 182 | -------------------------------------------------------------------------------- /src/main/clojure/clojask/groupby.clj: --------------------------------------------------------------------------------
(ns clojask.groupby
  "contains the utility functions to group by and aggregate"
  (:require [clojure.java.io :as io]
            ;[clojure-csv.core :as csv]
            [clojask.utils :as u]
            [clojask.classes.MGroup :refer [->MGroup]])
  (:import [clojask.classes.MGroup MGroup]))

(defn compute-groupby
  "map the result to different files"
  ;; currently an empty stub: returns nil
  [dataframe num-worker output-dir exception])

(defn compute-aggregate
  "aggregate the output files to the final destination"
  ;; currently an empty stub: returns nil
  [dateframe output-dir exp])

;; ;; the example of how to write a set of aggregate function
;; (defn min-pre
;;   []
;;   (def memo (atom 1)))

;; (defn min
;;   [row]
;;   (reset! memo (min (deref memo) row)))

;; (defn min-result
;;   []
;;   (deref memo))

(defn gen-groupby-filenames
  "internal function to generate files csv line with groupby key(s)

  `groupby-keys` is a sequence of [func index] pairs. For each pair the value
  at `index` of the row vector `msg` is taken and, when `func` is non-nil,
  passed through `func`. The resulting vector is stringified. When `dist` is
  a string the result is `dist` followed by the encoded key string; otherwise
  it is the plain key string. `key-index` and `formatters` are accepted but
  not read."
  [dist msg groupby-keys key-index formatters]
  (let [val (mapv (fn [pair]
                    (let [func (nth pair 0)
                          col (nth pair 1)]
                      (if func
                        (func (nth msg col))
                        (nth msg col))))
                  groupby-keys)]
    (if (string? dist)
      (str dist (u/encode-str (str val)))
      (str val))))

(defn output-groupby
  "internal function called by output when aggregation is applied

  `msg` is a row vector. When `dist` is a string, one line is appended to the
  file named by `gen-groupby-filenames`: the formatted selection of `msg`
  when `_format` is exactly true, otherwise the plain selection. When `dist`
  is not a string, the row is handed to its `.write` method together with the
  generated key."
  [dist msg groupby-keys key-index formatter write-index _format]
  ;; key-index contains the one to one correspondence of key value to index
  ;; value, it is a map, eg "Salary" -> 3
  (let [output-filename (gen-groupby-filenames dist msg groupby-keys key-index formatter)]
    (if (string? dist)
      ;; with-open closes the writer itself; the extra explicit .close that
      ;; used to sit inside the body was redundant
      (with-open [groupby-wrtr (io/writer output-filename :append true)]
        (.write groupby-wrtr (str (if (= true _format)
                                    (u/gets-format msg write-index formatter)
                                    (u/gets msg write-index))
                                  "\n")))
      (.write dist output-filename msg write-index formatter))))

(defn insert-mgroup
  ;; NOTE(review): `def` inside a function (re)binds the namespace-level var
  ;; `mgroup` on every call; kept because other code may read that var.
  [_mgroup]
  (def mgroup _mgroup))

(defn read-csv-seq
  "takes file name and reads data

  Returns a lazy sequence with one element per line of `filename`, each line
  parsed with `read-string`. The underlying reader is closed once the
  sequence has been consumed to the end (it used to be left open); a
  partially consumed sequence still keeps the reader open."
  [filename]
  (let [rdr (io/reader filename)
        step (fn step [lines]
               (lazy-seq
                (if-let [s (seq lines)]
                  ;; NOTE(review): read-string honours *read-eval*, so these
                  ;; files must be trusted. Presumably they are the
                  ;; intermediate files written by output-groupby - confirm,
                  ;; or switch to clojure.edn/read-string.
                  (cons (read-string (first s)) (step (rest s)))
                  (do (.close rdr) nil))))]
    (step (line-seq rdr))))

94 | 95 | 96 | ;; (defn write-file 97 | ;; [dir seq] 98 | ;; (with-open [wtr (io/writer dir :append true)] 99 | ;; (doseq [row seq] 100 | ;; (if (not= row nil) 101 | ;; (.write wtr (str row "\n")))))) 102 | 103 | ;; (defn internal-aggregate-write 104 | ;; "called by child thread function" 105 | ;; [func out-dir groupby-keys keys file & [new-keys]] 106 | ;; (async/thread 107 | ;; (write-file out-dir (func (read-csv-seq file) groupby-keys keys new-keys)))) 108 | 109 | ;; (defn internal-aggregate 110 | ;; "aggregate one group use the function" 111 | ;; [func out-dir key-index groupby-keys keys & [new-keys]] 112 | ;; (let [directory (clojure.java.io/file "./.clojask/grouped/") 113 | ;; files (file-seq directory)] 114 | ;; (doseq [file (rest files)] 115 | ;; ;; w/o multi-threading 116 | ;; (write-file out-dir (func (read-csv-seq file) groupby-keys keys new-keys key-index)) 117 | ;; ;; multi-threading 118 | ;; ;(async/go (async/ curr-val curr-max) 164 | ;; (swap! _max assoc (.indexOf keys groupby-key) (nth row vec-index)))) 165 | ;; ))) 166 | ;; ;(println (deref _max)) 167 | ;; [(deref _max)] 168 | ;; ) 169 | ;; ) 170 | 171 | ;; (defn square [n] (* n n)) 172 | 173 | ;; (defn mean [a] (/ (reduce + a) (count a))) 174 | 175 | ;; (defn standard-deviation 176 | ;; [a] 177 | ;; (let [mn (mean a)] 178 | ;; (Math/sqrt 179 | ;; (/ (reduce #(+ %1 (square (- %2 mn))) 0 a) 180 | ;; (dec (count a)))))) 181 | 182 | ;; ;; !!
check if new-keys are float/int cols 183 | ;; (defn aggre-sum 184 | ;; "get the sum of some keys" 185 | ;; [seq groupby-keys keys new-keys key-index] 186 | ;; (let [_sum (atom [])] 187 | ;; (assert (= (count keys) (count new-keys)) "number of new keys not equal to number of aggregation keys") 188 | ;; (doseq [groupby-key keys] 189 | ;; (let [vec-index (get key-index groupby-key)] ;; get index number in vector 190 | ;; ;; initialise max with zero 191 | ;; (swap! _sum assoc (.indexOf keys groupby-key) 0.0) 192 | ;; (doseq [row seq] 193 | ;; ;; do one iteration to get sum 194 | ;; (let [curr-val (Float/parseFloat (nth row vec-index)) 195 | ;; curr-sum (nth (deref _sum) (.indexOf keys groupby-key))] 196 | ;; (swap! _sum assoc (.indexOf keys groupby-key) (+ curr-val curr-sum))) 197 | ;; ))) 198 | ;; (println (deref _sum)) 199 | ;; [(deref _sum)] 200 | ;; ) 201 | ;; ) 202 | 203 | ;; ;; !! check if new-keys are float/int cols 204 | ;; (defn aggre-avg 205 | ;; "get the average of some keys" 206 | ;; [seq groupby-keys keys new-keys key-index] 207 | ;; (let [_avg (atom [])] 208 | ;; (assert (= (count keys) (count new-keys)) "number of new keys not equal to number of aggregation keys") 209 | ;; (doseq [groupby-key keys] 210 | ;; (let [vec-index (get key-index groupby-key) ;; get index number in vector 211 | ;; avg-value (/ (reduce + (doall (map #(Float/parseFloat (nth % vec-index)) seq))) (count seq))] 212 | ;; (swap! _avg assoc (.indexOf keys groupby-key) avg-value) 213 | ;; )) 214 | ;; ;(println (deref _avg)) 215 | ;; [(deref _avg)] 216 | ;; ) 217 | ;; ) 218 | 219 | ;; ;; !! 
;; ;; check if new-keys are float/int cols
;; (defn aggre-sd
;;   "get the standard deviation (sd) of some keys"
;;   [seq groupby-keys keys new-keys key-index]
;;   (let [_sd (atom [])]
;;     (assert (= (count keys) (count new-keys)) "number of new keys not equal to number of aggregation keys")
;;     (doseq [groupby-key keys]
;;       (let [vec-index (get key-index groupby-key) ;; get index number in vector
;;             sd-value (standard-deviation (doall (map #(Float/parseFloat (nth % vec-index)) seq)))]
;;         (swap! _sd assoc (.indexOf keys groupby-key) sd-value)
;;         ))
;;     ;(println (deref _sd))
;;     [(deref _sd)]
;;     )
;;   )

;; (defn template
;;   "The template for aggregate functions"
;;   ;; seq: is a seq of maps (lazy) of the data from one of the file
;;   ;; groupby-keys: is a vector of the group by keys
;;   ;; old-keys: the columns to which this function applies
;;   ;; new-keys: the new-keys to replace the old-keys and receive the aggregation result
;;   [seq groupby-keys old-keys new-keys])
;; ;; the return should be an vector of map (better lazy)

--------------------------------------------------------------------------------
/test/clojask/inmemory_test.clj:
--------------------------------------------------------------------------------
(ns inmemory-test
  (:require [clojure.test :refer :all]
            [clojask.dataframe :refer :all]
            [clojask.utils :refer :all]
            [clojask.groupby :refer :all]
            [clojask.api.gb-aggregate :as gb-aggre]
            [clojask.api.aggregate :as aggre]
            [clojask.sort :refer :all]
            [clojure.string :as str]))

(use '[clojure.java.shell :only [sh]])

;; an alternative for diff | sort with better compatibility
(defn _get-diff
  "Compares files `a` and `b` line by line.

  When `order` is nil or truthy both files are sorted before the comparison,
  so only the set of lines matters; pass `false` to compare in file order.
  Returns a map shaped like the result of `sh`: `{:out \"\" :err \"\"}` when
  the files match, a non-empty `:out` otherwise. Any exception (e.g. a missing
  file) is reported as `:out \"has exception\"` with the exception in `:err`."
  [a b & [order]]
  (try
    (let [order (if (nil? order) true order)
          data-a (str/split (slurp a) #"\n")
          data-b (str/split (slurp b) #"\n")
          ;; clojure.core/sort is spelled out because clojask.sort is referred
          data-a (if order (clojure.core/sort data-a) data-a)
          data-b (if order (clojure.core/sort data-b) data-b)]
      (if (zero? (compare (vec data-a) (vec data-b)))
        {:out "" :err ""}
        {:out (str "not the same: " data-a data-b) :err ""}))
    (catch Exception e {:out "has exception" :err (str e)})))

(defn get-diff
  "Runs `_get-diff`, retrying up to three more times while the files differ,
  and returns the result of the last attempt."
  [a b & [order]]
  (loop [retries-left 3]
    (let [res (_get-diff a b order)]
      (if (or (< retries-left 1) (= (:out res) ""))
        res
        (recur (dec retries-left))))))

(defn assert-output-matches
  "Asserts that test_outputs/<case-id>.csv equals correct_outputs/<case-id>.csv.
  `order` is forwarded to `get-diff` (nil/true: ignore line order)."
  [case-id & [order]]
  (let [result (get-diff (str "test/clojask/test_outputs/" case-id ".csv")
                         (str "test/clojask/correct_outputs/" case-id ".csv")
                         order)]
    (is (= "" (:out result)) (str "output " case-id ".csv differs from the expected file"))
    (is (= "" (:err result)) (str "output " case-id ".csv could not be compared"))))

(enable-debug)

(deftest df-api-test
  (testing "Single dataframe manipulation APIs"
    (def y (dataframe "test/clojask/Employees-example.csv" :have-col true))
    (is (= clojask.classes.DataFrame.DataFrame (type y)))
    (is (= clojask.classes.DataFrame.DataFrame (type (set-type y "Salary" "double"))))
    (is (= clojask.classes.DataFrame.DataFrame (type (set-parser y "Department" #(Double/parseDouble %)))))
    (is (= clojask.classes.DataFrame.DataFrame (type (filter y "Salary" (fn [salary] (<= salary 800))))))
    (is (= clojask.classes.DataFrame.DataFrame (type (operate y - "Salary"))))
    (is (= clojask.classes.DataFrame.DataFrame (type (operate y str ["Employee" "Salary"] "new-col"))))
    (is (= clojask.classes.DataFrame.DataFrame (type (group-by y ["Department"]))))
    (is (= clojask.classes.DataFrame.DataFrame (type (aggregate y max ["Salary"] ["Salary-max"]))))
    (is (= clojask.classes.DataFrame.DataFrame (type (compute y 8 "test/clojask/test_outputs/tmp.csv"))))))

;; The files written below are compared with correct_outputs in test-ns-hook.
(deftest df-api-output-test
  (testing "Single dataframe manipulation APIs"
    ;; element-operation
    (def y (dataframe "test/clojask/Employees-example.csv" :have-col true))
    (set-type y "Salary" "double")
    (operate y - "Salary")
    (set-formatter y "Salary" #(str % "!"))
    (compute y 8 "test/clojask/test_outputs/1-1.csv" :exception false :order true :in-memory true)
    ;; filter and row-operation
    (def y (dataframe "test/clojask/Employees-example.csv" :have-col true))
    (set-type y "Salary" "double")
    (filter y "Salary" (fn [salary] (<= salary 800)))
    (operate y str ["Employee" "Salary"] "new-col")
    (compute y 8 "test/clojask/test_outputs/1-2.csv" :exception false :in-memory true)
    ;; groupby and aggregate
    (def y (dataframe "test/clojask/Employees-example.csv" :have-col true))
    (set-type y "Salary" "double")
    (group-by y ["Department"])
    (aggregate y gb-aggre/max ["Salary"] ["new-Salary"])
    (compute y 8 "test/clojask/test_outputs/1-3.csv" :exception false :in-memory true)
    ;; aggregate only
    (def y (dataframe "test/clojask/Employees-example.csv" :have-col true))
    (set-type y "Salary" "double")
    (aggregate y aggre/max ["Salary"] ["new-Salary"])
    (compute y 8 "test/clojask/test_outputs/1-10.csv" :exception false :in-memory true)
    ;; groupby only
    (def y (dataframe "test/clojask/Employees-example.csv" :have-col true))
    (group-by y ["Department"])
    (compute y 8 "test/clojask/test_outputs/1-11.csv" :exception false :in-memory true)))

(deftest col-api-test
  (testing "Column manipulation APIs"
    (def y (dataframe "test/clojask/Employees-example.csv" :have-col true))
    (reorder-col y ["Employee" "Department" "EmployeeName" "Salary" "UpdateDate"])
    (is (= (get-col-names y) ["Employee" "Department" "EmployeeName" "Salary" "UpdateDate"]))
    (rename-col y "Department" "new-Department")
    ;; (map (fn [a b] (rename-col y a b)) (get-col-names y) ["Employee" "new-Department" "EmployeeName" "Salary" "UpdateDate"])
    (is (= (get-col-names y) ["Employee" "new-Department" "EmployeeName" "Salary" "UpdateDate"]))))

;; The file written below is compared with correct_outputs in test-ns-hook.
(deftest col-select-output-test
  (testing "Select column(s) argument"
    (def y (dataframe "test/clojask/Employees-example.csv" :have-col true))
    (compute y 8 "test/clojask/test_outputs/1-9.csv" :select ["Employee", "EmployeeName"] :exception false :in-memory true)))

(deftest join-api-test
  (testing "Join dataframes APIs"
    (def x (dataframe "test/clojask/Employees-example.csv"))
    (def y (dataframe "test/clojask/Employees-example.csv"))
    (is (= clojask.classes.DataFrame.DataFrame (type (compute (left-join x y ["Employee"] ["Employee"]) 8 "resources/test.csv" :exception false))))
    (is (= clojask.classes.DataFrame.DataFrame (type (compute (right-join x y ["Employee"] ["Employee"]) 8 "resources/test.csv" :exception false))))
    (is (= clojask.classes.DataFrame.DataFrame (type (compute (inner-join x y ["Employee"] ["Employee"]) 8 "resources/test.csv" :exception false))))
    (is (= clojask.classes.DataFrame.DataFrame (type (compute (rolling-join-forward x y ["Employee"] ["Employee"] "Salary" "Salary") 8 "resources/test.csv" :exception false))))))

;; The files written below are compared with correct_outputs in test-ns-hook.
(deftest join-api-output-test
  (testing "Join dataframes APIs"
    (def x (dataframe "test/clojask/Employees-example.csv"))
    (set-type x "UpdateDate" "date:yyyy/MM/dd")
    (def y (dataframe "test/clojask/Employees-info-example.csv"))
    (set-type y "UpdateDate" "date:yyyy/MM/dd")
    (compute (left-join x y ["Employee"] ["Employee"]) 8 "test/clojask/test_outputs/1-4.csv" :exception false :in-memory true)
    (compute (right-join x y ["Employee"] ["Employee"]) 8 "test/clojask/test_outputs/1-5.csv" :exception false :in-memory true)
    (def z (inner-join x y ["Employee"] ["Employee"]))
    (compute z 8 "test/clojask/test_outputs/1-6.csv" :exception false :select ["2_Employee" "2_EmployeeName" "2_DayOff" "2_UpdateDate" "1_Employee" "1_EmployeeName" "1_Department" "1_Salary" "1_UpdateDate"])
    (compute (rolling-join-forward x y ["EmployeeName"] ["EmployeeName"] "UpdateDate" "UpdateDate") 8 "test/clojask/test_outputs/1-7.csv" :exception false :in-memory true)
    (compute (rolling-join-backward x y ["EmployeeName"] ["EmployeeName"] "UpdateDate" "UpdateDate") 8 "test/clojask/test_outputs/1-8.csv" :exception false :in-memory true)
    (compute (outer-join x y ["Employee"] ["Employee"]) 8 "test/clojask/test_outputs/1-12.csv" :select ["1_Department" "1_Salary" "1_UpdateDate" "2_Employee" "2_EmployeeName" "2_DayOff" "2_UpdateDate"] :in-memory true)))

(defn all-in-one
  "Runs every test of this namespace that produces output files, in order."
  []
  (df-api-test)
  (df-api-output-test)
  (col-api-test)
  (col-select-output-test)
  (join-api-test)
  (join-api-output-test))

(deftest test-ns-hook
  (testing "Check all the outputs in a nested way"
    (all-in-one)
    ;; 1-1 is computed with :order true, so it is compared in file order
    (assert-output-matches "1-1" false)
    (doseq [case-id ["1-2" "1-3" "1-10" "1-11" "1-9" "1-4" "1-5" "1-6" "1-7" "1-8" "1-12"]]
      (assert-output-matches case-id))))

--------------------------------------------------------------------------------
/test/clojask/core_test.clj:
--------------------------------------------------------------------------------
(ns core-test
  (:require [clojure.test :refer :all]
            [clojask.dataframe :refer :all]
            [clojask.utils :refer :all]
            [clojask.groupby :refer :all]
            [clojask.api.gb-aggregate :as gb-aggre]
            [clojask.api.aggregate :as aggre]
            [clojask.sort :refer :all]
            [clojure.data.csv :as csv]
            [clojure.string :as str]))

(use '[clojure.java.shell :only [sh]])

;; an alternative for diff | sort with better compatibility
(defn _get-diff
  "Compares files `a` and `b` line by line.

  When `order` is nil or truthy both files are sorted before the comparison,
  so only the set of lines matters; pass `false` to compare in file order.
  Returns a map shaped like the result of `sh`: `{:out \"\" :err \"\"}` when
  the files match, a non-empty `:out` otherwise. Any exception (e.g. a missing
  file) is reported as `:out \"has exception\"` with the exception in `:err`."
  [a b & [order]]
  (try
    (let [order (if (nil? order) true order)
          data-a (str/split (slurp a) #"\n")
          data-b (str/split (slurp b) #"\n")
          ;; clojure.core/sort is spelled out because clojask.sort is referred
          data-a (if order (clojure.core/sort data-a) data-a)
          data-b (if order (clojure.core/sort data-b) data-b)]
      (if (zero? (compare (vec data-a) (vec data-b)))
        {:out "" :err ""}
        {:out (str "not the same: " data-a data-b) :err ""}))
    (catch Exception e {:out "has exception" :err (str e)})))

(defn get-diff
  "Runs `_get-diff`, retrying up to three more times while the files differ,
  and returns the result of the last attempt."
  [a b & [order]]
  (loop [retries-left 3]
    (let [res (_get-diff a b order)]
      (if (or (< retries-left 1) (= (:out res) ""))
        res
        (recur (dec retries-left))))))

(defn assert-output-matches
  "Asserts that test_outputs/<case-id>.csv equals correct_outputs/<case-id>.csv.
  `order` is forwarded to `get-diff` (nil/true: ignore line order)."
  [case-id & [order]]
  (let [result (get-diff (str "test/clojask/test_outputs/" case-id ".csv")
                         (str "test/clojask/correct_outputs/" case-id ".csv")
                         order)]
    (is (= "" (:out result)) (str "output " case-id ".csv differs from the expected file"))
    (is (= "" (:err result)) (str "output " case-id ".csv could not be compared"))))

;; (enable-debug)

(deftest df-api-test
  (testing "Single dataframe manipulation APIs"
    (def y (dataframe "test/clojask/Employees-example.csv" :have-col true))
    (is (= clojask.classes.DataFrame.DataFrame (type y)))
    (is (= clojask.classes.DataFrame.DataFrame (type (set-type y "Salary" "double"))))
    (is (= clojask.classes.DataFrame.DataFrame (type (set-parser y "Department" #(Double/parseDouble %)))))
    (is (= clojask.classes.DataFrame.DataFrame (type (filter y "Salary" (fn [salary] (<= salary 800))))))
    (is (= clojask.classes.DataFrame.DataFrame (type (operate y - "Salary"))))
    (is (= clojask.classes.DataFrame.DataFrame (type (operate y str ["Employee" "Salary"] "new-col"))))
    (is (= clojask.classes.DataFrame.DataFrame (type (group-by y ["Department"]))))
    (is (= clojask.classes.DataFrame.DataFrame (type (aggregate y max ["Salary"] ["Salary-max"]))))
    (is (= clojask.classes.DataFrame.DataFrame (type (compute y 8 "test/clojask/test_outputs/tmp.csv"))))))

;; The files written below are compared with correct_outputs in test-ns-hook.
(deftest df-api-output-test
  (testing "Single dataframe manipulation APIs"
    ;; element-operation
    (def y (dataframe "test/clojask/Employees-example.csv" :have-col true))
    (set-type y "Salary" "double")
    (operate y - "Salary")
    (set-formatter y "Salary" #(str % "!"))
    (compute y 8 "test/clojask/test_outputs/1-1.csv" :exception false :order true)
    ;; filter and row-operation
    (def y (dataframe "test/clojask/Employees-example.csv" :have-col true))
    (set-type y "Salary" "double")
    (filter y "Salary" (fn [salary] (<= salary 800)))
    (operate y str ["Employee" "Salary"] "new-col")
    (compute y 8 "test/clojask/test_outputs/1-2.csv" :exception false)
    ;; groupby and aggregate
    (def y (dataframe "test/clojask/Employees-example.csv" :have-col true))
    (set-type y "Salary" "double")
    (group-by y ["Department"])
    (aggregate y gb-aggre/max ["Salary"] ["new-Salary"])
    (compute y 8 "test/clojask/test_outputs/1-3.csv" :exception false)
    ;; aggregate only
    (def y (dataframe "test/clojask/Employees-example.csv" :have-col true))
    (set-type y "Salary" "double")
    (aggregate y aggre/max ["Salary"] ["new-Salary"])
    (compute y 8 "test/clojask/test_outputs/1-10.csv" :exception false)
    ;; groupby only
    (def y (dataframe "test/clojask/Employees-example.csv" :have-col true))
    (group-by y ["Department"])
    (compute y 8 "test/clojask/test_outputs/1-11.csv" :exception false)))

(deftest col-api-test
  (testing "Column manipulation APIs"
    (def y (dataframe "test/clojask/Employees-example.csv" :have-col true))
    (reorder-col y ["Employee" "Department" "EmployeeName" "Salary" "UpdateDate"])
    (is (= (get-col-names y) ["Employee" "Department" "EmployeeName" "Salary" "UpdateDate"]))
    (rename-col y "Department" "new-Department")
    ;; (map (fn [a b] (rename-col y a b)) (get-col-names y) ["Employee" "new-Department" "EmployeeName" "Salary" "UpdateDate"])
    (is (= (get-col-names y) ["Employee" "new-Department" "EmployeeName" "Salary" "UpdateDate"]))))

;; The file written below is compared with correct_outputs in test-ns-hook.
(deftest col-select-output-test
  (testing "Select column(s) argument"
    (def y (dataframe "test/clojask/Employees-example.csv" :have-col true))
    (compute y 8 "test/clojask/test_outputs/1-9.csv" :select ["Employee", "EmployeeName"] :exception false)))

(deftest join-api-test
  (testing "Join dataframes APIs"
    (def x (dataframe "test/clojask/Employees-example.csv"))
    (def y (dataframe "test/clojask/Employees-example.csv"))
    (is (= clojask.classes.DataFrame.DataFrame (type (compute (left-join x y ["Employee"] ["Employee"]) 8 "resources/test.csv" :exception false))))
    (is (= clojask.classes.DataFrame.DataFrame (type (compute (right-join x y ["Employee"] ["Employee"]) 8 "resources/test.csv" :exception false))))
    (is (= clojask.classes.DataFrame.DataFrame (type (compute (inner-join x y ["Employee"] ["Employee"]) 8 "resources/test.csv" :exception false))))
    (is (= clojask.classes.DataFrame.DataFrame (type (compute (rolling-join-forward x y ["Employee"] ["Employee"] "Salary" "Salary") 8 "resources/test.csv" :exception false))))))

;; The files written below are compared with correct_outputs in test-ns-hook.
(deftest join-api-output-test
  (testing "Join dataframes APIs"
    (def x (dataframe "test/clojask/Employees-example.csv"))
    (set-type x "UpdateDate" "date:yyyy/MM/dd")
    (def y (dataframe "test/clojask/Employees-info-example.csv"))
    (set-type y "UpdateDate" "date:yyyy/MM/dd")
    (compute (left-join x y ["Employee"] ["Employee"]) 8 "test/clojask/test_outputs/1-4.csv" :exception false)
    (compute (right-join x y ["Employee"] ["Employee"]) 8 "test/clojask/test_outputs/1-5.csv" :exception false)
    (def z (inner-join x y ["Employee"] ["Employee"]))
    (compute z 8 "test/clojask/test_outputs/1-6.csv" :exception false :select ["2_Employee" "2_EmployeeName" "2_DayOff" "2_UpdateDate" "1_Employee" "1_EmployeeName" "1_Department" "1_Salary" "1_UpdateDate"])
    (compute (rolling-join-forward x y ["EmployeeName"] ["EmployeeName"] "UpdateDate" "UpdateDate") 8 "test/clojask/test_outputs/1-7.csv" :exception false)
    (compute (rolling-join-backward x y ["EmployeeName"] ["EmployeeName"] "UpdateDate" "UpdateDate") 8 "test/clojask/test_outputs/1-8.csv" :exception false)
    (compute (outer-join x y ["Employee"] ["Employee"]) 8 "test/clojask/test_outputs/1-12.csv" :select ["1_Department" "1_Salary" "1_UpdateDate" "2_Employee" "2_EmployeeName" "2_DayOff" "2_UpdateDate"])))

(deftest test-ns-hook
  (testing "Check all the outputs in a nested way"
    (df-api-test)
    (df-api-output-test)
    (col-api-test)
    (col-select-output-test)
    (join-api-test)
    (join-api-output-test)
    ;; 1-1 is computed with :order true, so it is compared in file order
    (assert-output-matches "1-1" false)
    ;; The join outputs (1-4 .. 1-12) used to be compared through
    ;; `zsh -c "diff <(sort ...) <(sort ...)"`; get-diff performs the same
    ;; order-insensitive comparison without requiring zsh on the machine.
    (doseq [case-id ["1-2" "1-3" "1-10" "1-11" "1-9" "1-4" "1-5" "1-6" "1-7" "1-8" "1-12"]]
      (assert-output-matches case-id))))

--------------------------------------------------------------------------------
/src/main/clojure/clojask/aggregate/aggre_onyx_comps.clj:
--------------------------------------------------------------------------------
(ns clojask.aggregate.aggre-onyx-comps
  (:require [clojask.aggregate.aggre-input :as input]
            [clojask.aggregate.aggre-output :as output]
            ;; [clojask.clojask-groupby :as groupby]
            ;; [clojask.clojask-join :as join]
            [onyx.api :refer :all]
            [clojure.string :as string]
            [onyx.test-helper :refer [with-test-env feedback-exception!]]
            ;; [tech.v3.dataset :as ds]
            [clojure.data.csv :as csv]
            [clojask.utils :as u]
            [clojure.set :as set]
            [clojask.groupby :refer [read-csv-seq insert-mgroup]])
  (:import (java.io BufferedReader FileReader BufferedWriter FileWriter)
           [com.clojask.exception ExecutionException]))


(def id (java.util.UUID/randomUUID))

(defn workflow-gen
  "Generate workflow for running Onyx.

  Defines the namespace var `workflow` as a vector of edges: first one
  `[:in :sample-workerN]` edge per worker (N = 1..num-work), then one
  `[:sample-workerN :output]` edge per worker. Returns nil."
  [num-work]
  (let [workers (mapv #(keyword (str "sample-worker" %)) (range 1 (inc num-work)))]
    ;; build the whole vector first and define the var once, instead of
    ;; re-defining it on every loop iteration
    (def workflow
      (into (mapv (fn [worker-name] [:in worker-name]) workers)
            (map (fn [worker-name] [worker-name :output]) workers)))
    nil))

(def dataframe (atom nil))


(defn worker-func-gen
  "Stores `df` in the `dataframe` atom and (re)defines `worker-func`, the
  function run by every worker task, closing over the arguments given here.

  - aggre-funcs: sequence whose elements are read as [function column-index ...]
  - index:       output column order, applied with `u/gets` to the group-by
                 values followed by the aggregation results
  - formatter:   map from column index to a formatting function; results of
                 columns without an entry are rendered with `str`
  - source:      nil to read each group from the file named by the segment's
                 `:file`, otherwise an object whose `.getKey` returns the rows
  - exception:   not used by this function"
  [df exception aggre-funcs index formatter source]
  (reset! dataframe df)
  (let [formatters formatter
        ;; a: formatted group-by values, b: one row of aggregation results
        reorder (fn [a b]
                  (u/gets (concat a b) index))
        groupby-keys (.getGroupbyKeys (:row-info df))
        groupby-index (mapv #(nth % 1) groupby-keys)
        ;; re-key the column formatters by position within the group-by key
        org-format (set/rename-keys (.getFormatter (:col-info df))
                                    (zipmap groupby-index (iterate inc 0)))
        pre-index (take (count groupby-index) (iterate inc 0))]
    (defn worker-func
      "refered in preview"
      [seq]
      (let [data (if (= source nil)
                   (read-csv-seq (:file seq))
                   (.getKey source (:file seq)))
            pre (:d seq)
            pre (u/gets-format pre pre-index org-format)
            ;; transpose the rows: column index -> vector of that column's values
            data-map (-> (iterate inc 0)
                         (zipmap (apply map vector data)))]
        (loop [aggre-funcs aggre-funcs
               res []]
          (if (= aggre-funcs [])
            (if (= res [])
              {:d [pre]}
              ;; transpose back: one output row per position in the results
              {:d (mapv reorder (repeat pre) (apply map vector res))})
            (let [func (first (first aggre-funcs))
                  index (nth (first aggre-funcs) 1)
                  res-funcs (rest aggre-funcs)
                  new (func (get data-map index))
                  ;; a scalar result is treated as a one-element collection
                  new (if (coll? new)
                        new
                        (vector new))
                  new (mapv (fn [_] (if-let [formatter (get formatters index)]
                                      (formatter _)
                                      (str _)))
                            new)]
              ;; every aggregation must produce the same number of values
              (if (or (= res []) (= (count new) (count (last res))))
                (recur res-funcs (conj res new))
                (throw (Exception. "aggregation result is not of the same length"))))))))))

(defn catalog-gen
  "Generate the catalog for running Onyx.

  Defines the namespace var `catalog` as a vector holding the `:in` input
  task, one `:sample-workerN` function task per worker (N = 1..num-work) and
  the `:output` task, all using `batch-size`."
  [num-work batch-size]
  (let [input-task {:onyx/name :in
                    :onyx/plugin :clojask.aggregate.aggre-input/input
                    :onyx/type :input
                    :onyx/medium :seq
                    :seq/checkpoint? true
                    :onyx/batch-size batch-size
                    :onyx/max-peers 1
                    :input/doc "Reads segments from a core.async channel"}
        worker-function (keyword "clojask.aggregate.aggre-onyx-comps" "worker-func")
        worker-tasks (mapv (fn [x]
                             {:onyx/name (keyword (str "sample-worker" x))
                              :onyx/fn worker-function
                              :onyx/type :function
                              :onyx/batch-size batch-size
                              :worker/doc "This is a worker node"})
                           (range 1 (inc num-work)))
        output-task {:onyx/name :output
                     :onyx/plugin :clojask.aggregate.aggre-output/output
                     :onyx/type :output
                     :onyx/medium :core.async ;; this is made up
                     :onyx/max-peers 1
                     :onyx/batch-size batch-size
                     :output/doc "Writes segments to the file"}]
    ;; define the var once instead of re-defining it for every entry
    (def catalog
      (-> [input-task]
          (into worker-tasks)
          (conj output-task)))
    ;; (println catalog) ;; !! debugging
    ))


(defn inject-in-reader
  "Lifecycle hook: copies `:buffered-reader/path` from the lifecycle entry
  into the map it returns."
  [event lifecycle]
  {:buffered-reader/path (:buffered-reader/path lifecycle)})


(def in-calls
  {:lifecycle/before-task-start inject-in-reader})


(defn lifecycle-gen
  "Defines the namespace var `lifecycles`: `source` is attached to the `:in`
  task as `:buffered-reader/path`, `dist` to the `:output` task as
  `:buffered-wtr/filename`."
  [source dist]
  (def lifecycles
    [{:lifecycle/task :in
      :buffered-reader/path source
      :lifecycle/calls ::in-calls}
     {:lifecycle/task :in
      :lifecycle/calls :clojask.aggregate.aggre-input/reader-calls}
     {:lifecycle/task :output
      :buffered-wtr/filename dist
      :lifecycle/calls :clojask.aggregate.aggre-output/writer-calls}]))

(def num-workers (atom 1))

;; (defn rem0?
;;   [event old-segment new-segment all-new-segment]
;;   ;; (spit "resources/debug.txt" (str new-segment "\n") :append true)
;;   (= (mod (:id new-segment) (deref num-workers)) 0))

;; (defn rem1?
;;   [event old-segment new-segment all-new-segment]
;;   (= (mod (:id new-segment) (deref num-workers)) 1))

;; (defn rem2?
;;   [event old-segment new-segment all-new-segment]
;;   (= (mod (:id new-segment) (deref num-workers)) 2))

;; (defn rem3?
;;   [event old-segment new-segment all-new-segment]
;;   (= (mod (:id new-segment) (deref num-workers)) 3))

;; (defn rem4?
;;   [event old-segment new-segment all-new-segment]
;;   (= (mod (:id new-segment) (deref num-workers)) 4))

;; (defn rem5?
;;   [event old-segment new-segment all-new-segment]
;;   (= (mod (:id new-segment) (deref num-workers)) 5))

;; (defn rem6?
;;   [event old-segment new-segment all-new-segment]
;;   (= (mod (:id new-segment) (deref num-workers)) 6))

;; (defn rem7?
;;   [event old-segment new-segment all-new-segment]
;;   (= (mod (:id new-segment) (deref num-workers)) 7))

;; (defn rem8?
;;   [event old-segment new-segment all-new-segment]
;;   (= (mod (:id new-segment) (deref num-workers)) 8))


;; [{:flow/from :in
;;   :flow/to [:sample-worker1]
;;   :flow/predicate :clojask.onyx-comps/rem0?
;;   :flow/doc ""}
;;  {:flow/from :in
;;   :flow/to [:sample-worker2]
;;   :flow/predicate :clojask.onyx-comps/rem1?
;;   :flow/doc ""}]

(defn flow-cond-gen
  "Generate the flow conditions for running Onyx.

  For every worker x in 1..num-work this interns a predicate named
  rem<x-1>? into this namespace; the predicate is true when the segment's
  :id modulo `num-work` equals x-1. The matching flow-condition maps are
  stored in the namespace-level var `flow-conditions`. Also resets the
  `num-workers` atom. Returns nil."
  [num-work]
  (reset! num-workers num-work)
  (def flow-conditions
    (mapv (fn [x]
            (let [remainder (dec x)
                  pred-name (str "rem" remainder "?")]
              ;; the predicate is referenced from the flow condition by
              ;; keyword, so it has to exist as a var in this namespace
              (intern 'clojask.aggregate.aggre-onyx-comps
                      (symbol pred-name)
                      (fn [event old-segment new-segment all-new-segment]
                        (= (mod (:id new-segment) num-work) remainder)))
              {:flow/from :in
               :flow/to [(keyword (str "sample-worker" x))]
               :flow/predicate (keyword "clojask.aggregate.aggre-onyx-comps" pred-name)
               :worker/doc "This is a flow condition"}))
          (range 1 (inc num-work))))
  nil)

(defn config-env
  "Define the Onyx env/peer configuration and start the environment.

  Sets the namespace-level vars `env-config`, `peer-config`, `env`,
  `peer-group`, `n-peers` and `v-peers`. Reads the vars `id` and `workflow`,
  so `workflow-gen` has to run first; one peer is started per distinct task
  named in `workflow`."
  []
  (def env-config
    {:zookeeper/address "127.0.0.1:2188"
     :zookeeper/server? true
     :zookeeper.server/port 2188
     :onyx/tenancy-id id
     :onyx.log/file ".clojask/clojask.log"})

  (def peer-config
    {:zookeeper/address "127.0.0.1:2188"
     :onyx/tenancy-id id
     :onyx.peer/job-scheduler :onyx.job-scheduler/balanced
     :onyx.messaging/impl :aeron
     :onyx.messaging/peer-port 40200
     :onyx.messaging/bind-addr "localhost"
     :onyx.log/file ".clojask/clojask.log"})

  (def env (onyx.api/start-env env-config))

  (def peer-group (onyx.api/start-peer-group peer-config))

  (def n-peers (count (set (mapcat identity workflow))))

  (def v-peers (onyx.api/start-peers n-peers peer-group)))

(defn shutdown
  "Shut down every peer in `v-peers`, then the peer group, then the env
  (all three are the vars set by `config-env`)."
  []
  (doseq [v-peer v-peers]
    (onyx.api/shutdown-peer v-peer))
  (onyx.api/shutdown-peer-group peer-group)
  (onyx.api/shutdown-env env))

(defn start-onyx-aggre
  "start the onyx cluster with the specification inside dataframe

  Runs three stages, each wrapping failures in an ExecutionException whose
  message names the stage: preparing (build workflow/catalog/lifecycles/flow
  conditions and inject the dataframe), submit-to-onyx (submit the job and
  wait via feedback-exception!), and terminate-node (shutdown).
  Returns \"success\"."
  [num-work batch-size dataframe source dist exception aggre-func index formatter out]
  (try
    (workflow-gen num-work)
    (config-env)
    (worker-func-gen dataframe exception aggre-func index formatter source) ;;need some work
    (catalog-gen num-work batch-size)
    (lifecycle-gen (when (nil? source) "./.clojask/grouped") dist)
    (flow-cond-gen num-work)
    (input/inject-dataframe dataframe source)
    (output/inject-dataframe dataframe out)
    ;; (insert-mgroup source)
    (catch Exception e
      (throw (ExecutionException. (format "[preparing stage (groupby aggregate)] Refer to .clojask/clojask.log for detailed information. (original error: %s)" (.getMessage e))))))
  (try
    (let [submission (onyx.api/submit-job peer-config
                                          {:workflow workflow
                                           :catalog catalog
                                           :lifecycles lifecycles
                                           :flow-conditions flow-conditions
                                           :task-scheduler :onyx.task-scheduler/balanced})
          job-id (:job-id submission)]
      ;; Throw an Exception rather than using `assert`: assert raises
      ;; AssertionError (an Error), which the catch below would not see, so
      ;; the peers and env would never be shut down.
      (when-not job-id
        (throw (Exception. "Job was not successfully submitted")))
      (feedback-exception! peer-config job-id))
    (catch Exception e
      (shutdown)
      (throw (ExecutionException. (format "[submit-to-onyx stage (groupby aggregate)] Refer to .clojask/clojask.log for detailed information. (original error: %s)" (.getMessage e))))))
  (try
    (shutdown)
    (catch Exception e
      (throw (ExecutionException. (format "[terminate-node stage (groupby aggregate)] Refer to .clojask/clojask.log for detailed information. (original error: %s)" (.getMessage e))))))
  "success")
--------------------------------------------------------------------------------
/src/main/clojure/clojask/utils.clj:
--------------------------------------------------------------------------------
(ns clojask.utils
  (:require [clojure.core.async :refer [chan sliding-buffer >!! close!]]
            [clojure.java.io :refer [resource]]
            [onyx.plugin.core-async :refer [take-segments!]]
            ;; [tech.v3.dataset :as ds]
            [clojure.string :as str]
            [clojure.java.io :as io])
  (:import (java.util Date)
           (java.time LocalDate)
           (java.time LocalDateTime)
           (java.time.format DateTimeFormatter)
           (java.util Base64)))
"Utility function used in dataframe"

(defn gets
  "unlike core/get, get elements from indices

  Returns a vector with `(nth coll i)` for every i in `indices`."
  [coll indices]
  (mapv #(nth coll %) indices))

(defn gets-format
  "gets with format

  Like `gets`, but when `formatters` has an entry for an index the value at
  that index is passed through that formatter first."
  [coll indices formatters]
  (mapv (fn [idx]
          (let [val (nth coll idx)]
            (if-let [formatter (get formatters idx)]
              (formatter val)
              val)))
        indices))

(defn get-key
  "Look up `key` in `key-index` to get a column index and return that column
  of `row` (read with .get), parsed by the entry of `types` for that index
  when there is one."
  [row types key-index key]
  (let [index (get key-index key)]
    (if (contains? types index)
      ((get types index) (.get row index))
      (.get row index))))

(defn get-val
  "Return a lazy seq of the values of `row` at the positions in `index`,
  each parsed with its parser from `types` when one is registered."
  [row types index]
  (map (fn [i]
         (if-let [parser (get types i)]
           (parser (nth row i))
           (nth row i)))
       index))

(defn eval-res
  "Evaluate the operation vector stored under `index` in `operations`.

  The first element of the vector is the list of source column indices; the
  remaining elements are functions. The first function is applied to the
  (parsed) source values, each following function to the previous result.
  With no functions the first source value is returned. `formats` is unused."
  [row types formats operations index]
  (let [opr-vec (get operations index)
        vals (get-val row types (first opr-vec))]
    (loop [res vals
           oprs (rest opr-vec)]
      (if (= (count oprs) 0)
        (first res)
        (recur [(apply (first oprs) res)] (rest oprs))))))

(defn eval-res-ne
  "Same as `eval-res`, but returns nil instead of throwing when evaluation
  raises an Exception."
  [row types formats operations index]
  (try
    (eval-res row types formats operations index)
    (catch Exception e nil)))

(defn filter-check
  "Return true when `row` passes every filter in `filters`, false otherwise.

  Each filter is a pair [predicate column-indices]; the predicate is applied
  to the parsed values of those columns. Checking stops at the first nil
  entry of `filters`; an empty or nil `filters` passes."
  [filters types row]
  (every? (fn [com]
            (apply (first com) (get-val row types (nth com 1))))
          (take-while some? filters)))

;; Parsers and formatters are kept in atoms so that they can be swapped at
;; run time (see `set-format-string`). Parsers return nil on bad input.

(def toInt
  (atom (fn [string]
          (try
            (Integer/parseInt string)
            (catch Exception e nil)))))

(def toDouble
  (atom (fn [string]
          (try
            (Double/parseDouble string)
            (catch Exception e nil)))))

(def toString
  (atom (fn [string]
          string)))

(def fromString
  (atom (fn [_] (str _))))

(defn- date-parser
  "Build a parser that turns a string into a java.util.Date using `pattern`,
  returning nil when the string (or the pattern) cannot be used."
  [pattern]
  (fn [string]
    (try
      ;; a fresh SimpleDateFormat per call: instances are not thread-safe
      (.parse (java.text.SimpleDateFormat. pattern) string)
      (catch Exception e nil))))

(defn- date-formatter
  "Build a formatter that renders java.util.Date values with `pattern` and
  returns any other value unchanged."
  [pattern]
  (fn [date]
    (if (= (type date) java.util.Date)
      (.format (java.text.SimpleDateFormat. pattern) date)
      date)))

(def toDate
  (atom (date-parser "yyyy-MM-dd")))

(def fromDate
  (atom (date-formatter "yyyy-MM-dd")))

(defn set-format-string
  "Reset `toDate` / `fromDate` for a type string.

  \"date:<pattern>\" and \"datetime:<pattern>\" install a parser/formatter for
  <pattern> (everything after the first colon); any other string restores
  the default \"yyyy-MM-dd\". Parsers built here return nil on unparseable
  input, like every other parser in this namespace."
  [string]
  (let [pattern (if (or (str/starts-with? string "date:")
                        (str/starts-with? string "datetime:"))
                  (subs string (inc (str/index-of string ":")))
                  "yyyy-MM-dd")]
    (reset! toDate (date-parser pattern))
    (reset! fromDate (date-formatter pattern))))

;; type name -> [parser-atom formatter-atom]
(def type-operation-map
  {"int" [toInt fromString]
   "double" [toDouble fromString]
   "string" [toString fromString]
   "date" [toDate fromDate]
   "datetime" [toDate fromDate]})

(defn type-detection
  "Placeholder: takes a sample of the first 5 elements of `file` but does
  nothing with it and returns nil."
  [file]
  (let [sample (take 5 file)]))

(defn is-in
  "True when `col` is a key of the dataframe's column index
  (.getKeyIndex of its :col-info)."
  [col dataframe]
  (contains? (.getKeyIndex (:col-info dataframe)) col))

(defn is-out
  "True when `col` is NOT a key of the dataframe's column index."
  [col dataframe]
  (not (contains? (.getKeyIndex (:col-info dataframe)) col)))

(defn are-in
  "return should be [] if all in

  Returns the columns of `cols` that are missing from `dataframe`."
  [cols dataframe]
  (filter (fn [col] (is-out col dataframe)) cols))

(defn are-out
  "return should be [] if all out

  Returns the columns of `cols` that are present in `dataframe`."
  [cols dataframe]
  (filter (fn [col] (is-in col dataframe)) cols))

(defn init-file
  "Prepare the working directories for a run.

  Deletes `out-dir` when it is non-nil (silently if absent), makes a
  best-effort attempt to delete everything under ./.clojask/grouped/ and
  ./.clojask/join/ (failures are ignored), then creates the parent
  directories used for grouped, join/a, join/b and sort files.
  `header` is unused."
  [out-dir header]
  (when (some? out-dir)
    (io/delete-file out-dir true))
  (doseq [dir ["./.clojask/grouped/" "./.clojask/join/"]
          file (rest (file-seq (io/file dir)))]
    (try
      (io/delete-file file)
      (catch Exception e nil)))
  (io/make-parents "./.clojask/grouped/a.txt")
  (io/make-parents "./.clojask/join/a/a.txt")
  (io/make-parents "./.clojask/join/b/a.txt")
  (io/make-parents "./.clojask/sort/a.txt")
  ;; (if (not= header nil)
  ;;   (with-open [wrtr (io/writer out-dir)]
  ;;     (.write wrtr (str (str/join "," header) "\n"))))
  )

(defn get-type-string
  "Return the printed type of `x` without its first 6 characters (the
  \"class \" prefix), or \"nil\" for nil."
  [x]
  (if (some? x)
    (subs (str (type x)) 6)
    "nil"))

(defn get-type-string-vec
  "Return the distinct type strings of the elements of `col`, sorted and
  joined with \" & \"."
  [col]
  (->> col
       (mapv get-type-string)
       set
       sort
       (str/join " & ")))

(defn check-duplicate-col
  "Check for duplicated column names and return a column names list w/o duplicates

  When all names are distinct `colNames` is returned as is. Otherwise a
  warning is printed and every occurrence of a duplicated name gets a
  running 1-based suffix (\"a\" \"b\" \"a\" -> \"a1\" \"b\" \"a2\"); names that
  occur once are left untouched."
  [colNames]
  (if (= (count (distinct colNames)) (count colNames))
    colNames
    (let [duplicated (set (for [[col n] (frequencies colNames)
                                :when (> n 1)]
                            col))]
      (println "WARNING: Duplicated columns found")
      ;; single eager pass; `seen` counts the occurrences met so far
      (seq (first (reduce (fn [[renamed seen] col]
                            (if (contains? duplicated col)
                              (let [k (inc (get seen col 0))]
                                [(conj renamed (str col k)) (assoc seen col k)])
                              [(conj renamed col) seen]))
                          [[] {}]
                          colNames))))))

(defn proc-groupby-key-each
  "Normalise one group-by key to a [function column] pair.

  Accepts [fn \"col\"] (returned unchanged), [\"col\"] or \"col\"; throws an
  Exception for anything else."
  [pair]
  (cond
    (and (coll? pair) (= 2 (count pair)) (fn? (first pair)) (string? (nth pair 1)))
    pair

    (and (coll? pair) (= 1 (count pair)) (string? (first pair)))
    [nil pair]

    (string? pair)
    [nil pair]

    :else
    (throw (Exception.))))

(defn proc-groupby-key
  "Normalise the group-by key argument to a vector of pairs, or return nil
  when the input is not understood.

  A collection starting with a function must have exactly two elements and
  is treated as a single key; any other collection is normalised element by
  element with `proc-groupby-key-each`; a string becomes [[nil string]]."
  [input]
  (try
    (cond
      (and (coll? input) (fn? (first input)))
      (when (= 2 (count input))
        [input])

      (coll? input)
      (mapv proc-groupby-key-each input)

      (string? input)
      [[nil input]]

      :else
      nil)
    (catch Exception e nil)))

(defn get-func-str
  "Return a readable name for `func`: its printed form up to the last \"@\"
  (the whole string when there is none), with \"$\" replaced by \"/\" and
  \"_\" by \"-\"."
  [func]
  (let [func-str (str func)
        at (str/last-index-of func-str "@")
        base (if at (subs func-str 0 at) func-str)]
    (-> base
        (str/replace "$" "/")
        (str/replace "_" "-"))))

(def encoder (Base64/getUrlEncoder))
(def decoder (Base64/getUrlDecoder))

(defn encode-str
  "URL-safe Base64 encoding of the UTF-8 bytes of `s`."
  [s]
  ;; explicit charset so the result does not depend on the JVM default
  (.encodeToString encoder (.getBytes ^String s "UTF-8")))

(defn decode-str
  "Inverse of `encode-str`."
  [s]
  (String. ^bytes (.decode decoder s) "UTF-8"))

;; (def toDate
;;   (atom (fn [string]
;;           (try
;;             (LocalDate/parse string (DateTimeFormatter/ofPattern "yyyy-MM-dd"))
;;             (catch Exception e (throw e))))))

;; (def fromDate
;;   (atom (fn [date]
;;           (if (= (type date) java.time.LocalDate)
;;             (.format date (DateTimeFormatter/ofPattern "yyyy-MM-dd"))
;;             date))))

;; (def toDateTime
;;   (atom (fn [string]
;;           (try
;;             (LocalDateTime/parse string (DateTimeFormatter/ofPattern "yyyy-MM-dd HH:mm:ss"))
;;             (catch Exception e (throw e))))))

;; (def fromDateTime
;;   (atom (fn [date]
;;           (if (= (type date) java.time.LocalDateTime)
;;             (.format date (DateTimeFormatter/ofPattern "yyyy-MM-dd HH:mm:ss"))
;;             date))))

;; (defn set-format-string
;;   [string]
;;   (if (or (str/starts-with? string "date:") (str/starts-with? string "datetime:"))
;;     (let [format-string (subs string (inc (str/index-of string ":")))]
;;       (reset! toDate
;;               (fn [string]
;;                 (try
;;                   (LocalDate/parse string (DateTimeFormatter/ofPattern format-string))
;;                   (catch Exception e (throw e)))))

;;       (reset! fromDate
;;               (fn [date]
;;                 (if (= (type date) java.time.LocalDate)
;;                   (.format date (DateTimeFormatter/ofPattern format-string))
;;                   date)))

;;       (reset! toDateTime
;;               (fn [string]
;;                 (try
;;                   (LocalDateTime/parse string (DateTimeFormatter/ofPattern format-string))
;;                   (catch Exception e (throw e)))))

;;       (reset!
;;               fromDateTime
;;               (fn [date]
;;                 (if (= (type date) java.time.LocalDateTime)
;;                   (.format date (DateTimeFormatter/ofPattern format-string))
;;                   date))))
;;   ))

;; ;; (def operation-type-map
;; ;;   {toInt "int"
;; ;;    toDouble "double"
;; ;;    toString "string"
;; ;;    toDate "date"})

;; (def type-operation-map
;;   {"int" [toInt fromString]
;;    "double" [toDouble fromString]
;;    "string" [toString fromString]
;;    "date" [toDate fromDate]
;;    "datetime" [toDateTime fromDateTime]})
--------------------------------------------------------------------------------
/src/main/clojure/clojask/join.clj:
--------------------------------------------------------------------------------
(ns clojask.join
  (:require [clojure.java.io :as io]
            [clojure.core.async :as async]
            ;; [clojask.onyx-comps :refer [start-onyx-groupby start-onyx-join]]
            [clojask.groupby :refer [read-csv-seq gen-groupby-filenames]]
            [clojure.string :as str]
            [clojask.utils :as u]))

;; In-memory group store for the right-hand table; nil when the groups are
;; kept in files under .clojask/join/b/. Set by `defn-join`.
(def source nil)

(defn gen-join-filenames
  "Build the group name for `a-row`.

  `a-keys` is a seq of [function column-index] pairs; each key value is the
  row value at that index passed through the function (identity when nil).
  With a nil `dist` the printed key vector is returned; otherwise `dist`
  followed by the encoded (u/encode-str) printed key vector."
  [dist a-row a-keys]
  (let [a-val (mapv (fn [[key-fn col]]
                      ((or key-fn identity) (nth a-row col)))
                    a-keys)]
    (if (nil? dist)
      (str a-val)
      (str dist (u/encode-str (str a-val))))))

(defn- write-joined
  "Write one output row: the columns `join-index` of a-part ++ b-part."
  [writer write-func a-part b-part join-index]
  (write-func writer [(u/gets (concat a-part b-part) join-index)]))

(defn output-join-inner
  "File-backed inner join of one left row: writes one output row per row of
  the matching group file, nothing when the file does not exist."
  [writer a-row a-keys a-map b-keys count a-roll b-roll a-format b-format a-index b-index join-index write-func]
  (let [filename (gen-join-filenames ".clojask/join/b/" a-row a-keys)]
    (when (.exists (io/file filename))
      (let [a-part (u/gets-format a-row a-index a-format)]
        ;; with-open closes the reader even when write-func throws
        (with-open [rdr (io/reader filename)]
          (doseq [b-row (read-csv-seq rdr)]
            (write-joined writer write-func a-part (u/gets b-row b-index) join-index)))))))

(defn output-join-inner-mem
  "In-memory inner join of one left row against the group store `source`.
  Stored rows are concatenated as they are (no column selection here)."
  [writer a-row a-keys a-map b-keys count a-roll b-roll a-format b-format a-index b-index join-index write-func]
  (let [filename (gen-join-filenames nil a-row a-keys)]
    (when (.exists source filename)
      (let [a-part (u/gets-format a-row a-index a-format)]
        (doseq [b-row (.getKey source filename)]
          (write-joined writer write-func a-part b-row join-index))))))

(defn output-join-loo
  "used for left join right join or outter join

  File-backed: like `output-join-inner`, but a left row without a group file
  is still written, padded with `count` empty strings."
  [writer a-row a-keys a-map b-keys count a-roll b-roll a-format b-format a-index b-index join-index write-func]
  (let [filename (gen-join-filenames ".clojask/join/b/" a-row a-keys)
        a-part (u/gets-format a-row a-index a-format)]
    (if (.exists (io/file filename))
      (with-open [rdr (io/reader filename)]
        (doseq [b-row (read-csv-seq rdr)]
          (write-joined writer write-func a-part (u/gets b-row b-index) join-index)))
      (write-joined writer write-func a-part (repeat count "") join-index))))

(defn output-join-loo-mem
  "used for left join right join or outter join

  In-memory variant of `output-join-loo` working on the group store `source`."
  [writer a-row a-keys a-map b-keys count a-roll b-roll a-format b-format a-index b-index join-index write-func]
  (let [filename (gen-join-filenames nil a-row a-keys)
        a-part (u/gets-format a-row a-index a-format)]
    (if (.exists source filename)
      (doseq [b-row (.getKey source filename)]
        (write-joined writer write-func a-part b-row join-index))
      (write-joined writer write-func a-part (repeat count "") join-index))))

(defn- roll-select
  "Pick the row of `rows` whose roll value is closest to `bench`.

  forward? true  -> greatest value <= bench
  forward? false -> smallest value >= bench
  Values are ordered with `compare`. A candidate is also rejected when
  `limit` is non-nil and (limit bench value) is falsey. `key-fn` extracts
  the roll value from a row, `result-fn` the value to return for the chosen
  row. Returns nil when no row qualifies. Uses only local state, so it is
  safe to call from several workers at once."
  [rows bench limit key-fn result-fn forward?]
  (let [in-range? (if forward?
                    #(<= (compare % bench) 0)
                    #(>= (compare % bench) 0))
        closer?   (if forward?
                    #(> (compare %1 %2) 0)
                    #(< (compare %1 %2) 0))]
    (second
     (reduce (fn [[best-val _ :as best] row]
               (let [val (key-fn row)]
                 (if (and (in-range? val)
                          (or (nil? limit) (limit bench val))
                          (or (nil? best-val) (closer? val best-val)))
                   [val (result-fn row)]
                   best)))
             [nil nil]
             rows))))

(defn- roll-join-file-fn
  "Build the file-backed rolling-join writer (forward or backward)."
  [limit forward?]
  (fn [writer a-row a-keys a-map b-keys count a-roll b-roll a-format b-format a-index b-index join-index write-func]
    (let [filename (gen-join-filenames ".clojask/join/b/" a-row a-keys)
          a-part (u/gets-format a-row a-index a-format)]
      (if (.exists (io/file filename))
        (with-open [rdr (io/reader filename)]
          ;; bench is a string here: rows come straight from the CSV file
          (if-let [b-row (roll-select (read-csv-seq rdr) (nth a-row a-roll) limit
                                      #(nth % b-roll) identity forward?)]
            (write-joined writer write-func a-part (u/gets-format b-row b-index b-format) join-index)
            (write-joined writer write-func a-part (repeat count "") join-index))
          nil)
        (write-joined writer write-func a-part (repeat count "") join-index)))))

(defn- roll-join-mem-fn
  "Build the in-memory rolling-join writer (forward or backward).

  Each stored row is read as a pair: the roll value is taken from its second
  element and its first element is what gets written."
  [limit forward?]
  (fn [writer a-row a-keys a-map b-keys count a-roll b-roll a-format b-format a-index b-index join-index write-func]
    (let [filename (gen-join-filenames nil a-row a-keys)
          a-part (u/gets-format a-row a-index a-format)
          b-row (when (.exists source filename)
                  (roll-select (.getKey source filename) (nth a-row a-roll) limit
                               #(nth (nth % 1) b-roll) first forward?))]
      (write-joined writer write-func a-part (or b-row (repeat count "")) join-index))))

(defn defn-join
  "Select the join writer and store it in the var `output-join`.

  type 1 -> inner join, 2 -> left/right/outer join, 4 -> rolling join
  forward, 5 -> rolling join backward; any other type stores nil.
  `_source` is stored in the var `source`; when it is nil the file-backed
  implementation is chosen, otherwise the in-memory one. `limit` is the
  optional (fn [bench val]) filter of the rolling joins."
  [type limit _source]
  (def source _source)
  (def output-join
    (let [in-memory? (some? _source)]
      (case type
        1 (if in-memory? output-join-inner-mem output-join-inner)
        2 (if in-memory? output-join-loo-mem output-join-loo)
        ;; 4 output-join-forward
        4 (if in-memory?
            (roll-join-mem-fn limit true)
            (roll-join-file-fn limit true))
        ;; 5 output-join-backward
        5 (if in-memory?
            (roll-join-mem-fn limit false)
            (roll-join-file-fn limit false))
        nil))))
--------------------------------------------------------------------------------
/src/main/clojure/clojask/join/outer_onyx_comps.clj:
--------------------------------------------------------------------------------
(ns clojask.join.outer-onyx-comps
  (:require [clojask.join.outer-input :as input]
            [clojask.join.outer-output :as output]
            [onyx.api :refer :all]
            [clojure.string :as string]
            [onyx.test-helper :refer [with-test-env feedback-exception!]]
            [clojure.data.csv :as csv]
            [clojask.utils :as u]
            [clojure.set :as set]
            [clojure.java.io :as io]
            [clojask.groupby :refer [read-csv-seq]])
  (:import (java.io BufferedReader FileReader BufferedWriter FileWriter)
           [com.clojask.exception ExecutionException]))


;; tenancy id used in the Onyx env/peer configuration
(def id (java.util.UUID/randomUUID))

(defn workflow-gen
  "Generate workflow for running Onyx.

  Stores the edge list in the namespace-level var `workflow`: first an
  [:in worker] edge for every worker :sample-worker1..N, then a
  [worker :output] edge for every worker. Returns nil."
  [num-work]
  (let [workers (map #(keyword (str "sample-worker" %))
                     (range 1 (inc num-work)))]
    (def workflow
      (into (mapv (fn [w] [:in w]) workers)
            (map (fn [w] [w :output]) workers))))
  nil)

(def dataframe (atom nil))


(defn worker-func-gen
  "Define `worker-func` for the left side of the outer join.

  The generated function receives a segment {:id .. :d group-name}, pairs
  every left row of that group with every right row of the same group (or
  pads the left rows with one nil per right column when the right group does
  not exist) and returns {:id .. :d rows}, each row reordered by
  `write-index`. With a nil `mgroup-a` groups are CSV files (the right file
  is the left path with the first \"/a/\" replaced by \"/b/\" and is deleted
  after being read); otherwise they are read from `mgroup-a` / `mgroup-b`."
  [a b mgroup-a mgroup-b exception a-index b-index a-format b-format write-index]
  (let [a-cols    (range (count a-index))
        b-cols    (range (count b-index))
        b-blank   (repeat (count b-index) nil)
        pick-a    (fn [rows] (map #(u/gets % a-cols) rows))
        pick-b    (fn [rows] (mapv #(u/gets % b-cols) rows))
        arrange   (fn [rows] (mapv #(u/gets % write-index) rows))
        matched   (fn [a-rows b-rows]
                    (arrange (for [a-row a-rows
                                   b-row b-rows]
                               (concat a-row b-row))))
        unmatched (fn [a-rows]
                    (arrange (map #(concat % b-blank) a-rows)))]
    (if (nil? mgroup-a)
      (defn worker-func
        "referred to in preview"
        [seq]
        (let [a-path (:d seq)
              a-rows (pick-a (read-csv-seq a-path))
              b-path (string/replace-first a-path "/a/" "/b/")]
          {:id (:id seq)
           :d  (if (.exists (io/file b-path))
                 ;; b rows are fully read (mapv) before the file is removed
                 (let [b-rows (pick-b (read-csv-seq b-path))]
                   (io/delete-file b-path true)
                   (matched a-rows b-rows))
                 (unmatched a-rows))}))
      (defn worker-func
        "referred to in preview"
        [seq]
        (let [group  (:d seq)
              a-rows (pick-a (.getKey mgroup-a group))]
          {:id (:id seq)
           :d  (if (.exists mgroup-b group)
                 (matched a-rows (pick-b (.getKey mgroup-b group)))
                 (unmatched a-rows))})))))

(defn worker-func-gen2
  "Define `worker-func` for the right-only part of the outer join.

  The generated function reads the right rows of the segment's group (from
  the CSV file when `mgroup-a` is nil, from `mgroup-b` otherwise), prepends
  one nil per left column and returns {:id .. :d rows} with each row
  reordered by `write-index`."
  [a b mgroup-a mgroup-b exception a-index b-index a-format b-format write-index]
  (let [a-blank (repeat (count a-index) nil)
        b-cols  (range (count b-index))
        pad     (fn [b-rows]
                  (mapv #(u/gets % write-index)
                        (mapv #(concat a-blank %)
                              (mapv #(u/gets % b-cols) b-rows))))]
    (if (nil? mgroup-a)
      (defn worker-func
        "referred to in preview"
        [seq]
        {:id (:id seq)
         :d  (pad (read-csv-seq (:d seq)))})
      (defn worker-func
        "referred to in preview"
        [seq]
        {:id (:id seq)
         :d  (pad (.getKey mgroup-b (:d seq)))}))))

(defn catalog-gen
  "Generate the catalog for running Onyx.

  Stores a vector in the namespace-level var `catalog` holding, in order:
  the :in input entry, one function entry per worker (:sample-worker1 up to
  :sample-worker<num-work>), and the :output entry. Every entry uses
  `batch-size` as its :onyx/batch-size."
  [num-work batch-size]
  (let [input-entry    {:onyx/name :in
                        :onyx/plugin :clojask.join.outer-input/input
                        :onyx/type :input
                        :onyx/medium :seq
                        :seq/checkpoint? true
                        :onyx/batch-size batch-size
                        :onyx/max-peers 1
                        :input/doc "Reads segments from a core.async channel"}
        worker-fn      (keyword "clojask.join.outer-onyx-comps" "worker-func")
        worker-entries (mapv (fn [n]
                               {:onyx/name (keyword (str "sample-worker" n))
                                :onyx/fn worker-fn
                                :onyx/type :function
                                :onyx/batch-size batch-size
                                :worker/doc "This is a worker node"})
                             (range 1 (inc num-work)))
        output-entry   {:onyx/name :output
                        :onyx/plugin :clojask.join.outer-output/output
                        :onyx/type :output
                        :onyx/medium :core.async ;; this value is made up
                        :onyx/max-peers 1
                        :onyx/batch-size batch-size
                        :output/doc "Writes segments to the file"}]
    (def catalog
      (conj (into [input-entry] worker-entries) output-entry))))


(defn inject-in-reader
  "Lifecycle hook: copies :buffered-reader/path from the lifecycle entry into
  the map returned to Onyx. `event` is not used."
  [event lifecycle]
  {:buffered-reader/path (:buffered-reader/path lifecycle)})


;; lifecycle call map referenced as ::in-calls from `lifecycle-gen`
(def in-calls
  {:lifecycle/before-task-start inject-in-reader})


(defn lifecycle-gen
  "Store the lifecycle entries in the namespace-level var `lifecycles`:
  two entries for the :in task (`source` is passed as :buffered-reader/path)
  and one for the :output task (`dist` is passed as :buffered-wtr/filename)."
  [source dist]
  (def lifecycles
    [{:lifecycle/task :in
      :buffered-reader/path source
      :lifecycle/calls ::in-calls}
     {:lifecycle/task :in
      :lifecycle/calls :clojask.join.outer-input/reader-calls}
     {:lifecycle/task :output
      :buffered-wtr/filename dist
      :lifecycle/calls :clojask.join.outer-output/writer-calls}]))

;; number of workers of the most recent job; reset by `flow-cond-gen`
(def num-workers (atom 1))

;; (defn rem0?
;;   [event old-segment new-segment all-new-segment]
;;   ;; (spit "resources/debug.txt" (str new-segment "\n") :append true)
;;   (= (mod (:id new-segment) (deref num-workers)) 0))

;; (defn rem1?
;;   [event old-segment new-segment all-new-segment]
;;   (= (mod (:id new-segment) (deref num-workers)) 1))

;; (defn rem2?
;;   [event old-segment new-segment all-new-segment]
;;   (= (mod (:id new-segment) (deref num-workers)) 2))

;; (defn rem3?
;;   [event old-segment new-segment all-new-segment]
;;   (= (mod (:id new-segment) (deref num-workers)) 3))

;; (defn rem4?
;;   [event old-segment new-segment all-new-segment]
;;   (= (mod (:id new-segment) (deref num-workers)) 4))

;; (defn rem5?
;;   [event old-segment new-segment all-new-segment]
;;   (= (mod (:id new-segment) (deref num-workers)) 5))

;; (defn rem6?
;;   [event old-segment new-segment all-new-segment]
;;   (= (mod (:id new-segment) (deref num-workers)) 6))

;; (defn rem7?
;;   [event old-segment new-segment all-new-segment]
;;   (= (mod (:id new-segment) (deref num-workers)) 7))

;; (defn rem8?
;;   [event old-segment new-segment all-new-segment]
;;   (= (mod (:id new-segment) (deref num-workers)) 8))


;; [{:flow/from :in
;;   :flow/to [:sample-worker1]
;;   :flow/predicate :clojask.onyx-comps/rem0?
;;   :flow/doc ""}
;;  {:flow/from :in
;;   :flow/to [:sample-worker2]
;;   :flow/predicate :clojask.onyx-comps/rem1?
;;   :flow/doc ""}]

(defn flow-cond-gen
  "Generate the flow conditions for running Onyx.

  Stores `num-work` in the `num-workers` atom, then for every worker
  x in 1..num-work:
    * interns a predicate `rem<x-1>?` into `clojask.join.outer-onyx-comps`
      that is true when `(mod (:id new-segment) num-work)` equals x-1, and
    * adds a flow condition routing `:in` -> `:sample-worker<x>` guarded by
      that predicate.
  The resulting vector is bound to the namespace-level var
  `flow-conditions`. Returns nil."
  [num-work]
  (reset! num-workers num-work)
  (def flow-conditions
    ;; mapv is eager, so every predicate is interned before this returns
    (mapv (fn [x]
            (let [remainder (- x 1)
                  pred-name (str "rem" remainder "?")]
              ;; the predicate closes over num-work, not the num-workers atom
              (intern 'clojask.join.outer-onyx-comps
                      (symbol pred-name)
                      (fn [event old-segment new-segment all-new-segment]
                        (= (mod (:id new-segment) num-work) remainder)))
              {:flow/from :in
               :flow/to [(keyword (str "sample-worker" x))]
               :flow/predicate (keyword "clojask.join.outer-onyx-comps" pred-name)
               :worker/doc "This is a flow condition"}))
          (range 1 (+ num-work 1))))
  ;; (println flow-conditions) ;; !! debugging
  nil)

(defn config-env
  "Start the Onyx environment, peer group and virtual peers.

  Binds the namespace-level vars `env-config`, `peer-config`, `env`,
  `peer-group`, `n-peers` and `v-peers`. `n-peers` is the number of
  distinct task names appearing in `workflow`, so `workflow` must already
  be bound when this is called. Both configs use ZooKeeper address
  127.0.0.1:2188 and the tenancy id `id` defined elsewhere in this file."
  []
  (def env-config
    {:zookeeper/address "127.0.0.1:2188"
     :zookeeper/server? true
     :zookeeper.server/port 2188
     :onyx/tenancy-id id
     :onyx.log/file ".clojask/clojask.log"})

  (def peer-config
    {:zookeeper/address "127.0.0.1:2188"
     :onyx/tenancy-id id
     :onyx.peer/job-scheduler :onyx.job-scheduler/balanced
     :onyx.messaging/impl :aeron
     :onyx.messaging/peer-port 40200
     :onyx.messaging/bind-addr "localhost"
     :onyx.log/file ".clojask/clojask.log"})

  (def env (onyx.api/start-env env-config))

  (def peer-group (onyx.api/start-peer-group peer-config))

  ;; one virtual peer per distinct task in the workflow
  (def n-peers (count (set (mapcat identity workflow))))

  (def v-peers (onyx.api/start-peers n-peers peer-group)))

(defn shutdown
  "Shut down everything `config-env` started: each virtual peer, then the
  peer group, then the environment."
  []
  (doseq [v-peer v-peers]
    (onyx.api/shutdown-peer v-peer))
  (onyx.api/shutdown-peer-group peer-group)
  (onyx.api/shutdown-env env))

(defn- fail-stage!
  "Shut the Onyx environment down, then throw an ExecutionException whose
  message names the failed `stage` and carries the message of `e`."
  [stage ^Throwable e]
  (shutdown)
  (throw (ExecutionException.
          (format "[%s] Refer to .clojask/clojask.log for detailed information. (original error: %s)"
                  stage
                  (.getMessage e)))))

(defn- submit-outer-job
  "Submit the currently bound `workflow`, `catalog`, `lifecycles` and
  `flow-conditions` as one Onyx job and pass its job id to
  `feedback-exception!`.

  Throws IllegalStateException when the submission yields no job id. An
  Exception (not `assert`) is used on purpose: `assert` raises an
  AssertionError, which the callers' `(catch Exception ...)` would not
  intercept, so the cluster would never be shut down."
  []
  (let [submission (onyx.api/submit-job peer-config
                                        {:workflow workflow
                                         :catalog catalog
                                         :lifecycles lifecycles
                                         :flow-conditions flow-conditions
                                         :task-scheduler :onyx.task-scheduler/balanced})
        job-id (:job-id submission)]
    ;; (println submission)
    (when-not job-id
      (throw (IllegalStateException. "Job was not successfully submitted")))
    (feedback-exception! peer-config job-id)))

(defn start-onyx-outer
  "Start the Onyx cluster with the specification inside the dataframe and
  run the outer join as two consecutive jobs.

  Step 1 prepares workflow, environment, worker function, catalog,
  lifecycles (reading from ./.clojask/join/a) and flow conditions, injects
  `mgroup-a`/`mgroup-b` and the `output` write function, and runs the job.
  Step 2 finalises `mgroup-b` (when non-nil), swaps in the second worker
  function and lifecycles reading from ./.clojask/join/b, and runs again.
  The cluster is shut down afterwards.

  Returns the string \"success\". Any Exception raised in a stage shuts the
  cluster down and is rethrown as an ExecutionException naming that stage."
  [num-work batch-size a b mgroup-a mgroup-b dist exception a-index b-index a-format b-format write-index output]
  ;; step 1
  (try
    (workflow-gen num-work)
    (config-env)
    (worker-func-gen a b mgroup-a mgroup-b exception a-index b-index a-format b-format write-index) ;;need some work
    (catalog-gen num-work batch-size)
    (lifecycle-gen "./.clojask/join/a" dist)
    (flow-cond-gen num-work)
    (input/inject-dataframe mgroup-a mgroup-b)
    (output/inject-write-func output)
    (catch Exception e
      (fail-stage! "preparing stage (outer join)" e)))
  (try
    (submit-outer-job)
    (catch Exception e
      (fail-stage! "submit-to-onyx stage (outer join)" e)))

  ;; step 2
  (try
    (when (some? mgroup-b)
      (.final mgroup-b))
    (worker-func-gen2 a b mgroup-a mgroup-b exception a-index b-index a-format b-format write-index) ;;need some work
    (lifecycle-gen "./.clojask/join/b" dist)
    (input/inject-dataframe mgroup-b nil)
    (catch Exception e
      (fail-stage! "preparing stage (outer join 2)" e)))
  (try
    (submit-outer-job)
    (catch Exception e
      (fail-stage! "submit-to-onyx stage (outer join 2)" e)))

  ;; final teardown: nothing left to shut down on failure, so only rethrow
  (try
    (shutdown)
    (catch Exception e
      (throw (ExecutionException.
              (format "[terminate-node stage (outer join)] Refer to .clojask/clojask.log for detailed information. (original error: %s)"
                      (.getMessage e))))))
  "success")