├── test
    └── clojask
    │   ├── correct_outputs
    │       ├── 1-10.csv
    │       ├── 1-11.csv
    │       ├── 1-3.csv
    │       ├── 1-9.csv
    │       ├── 1-2.csv
    │       ├── 1-1.csv
    │       ├── 1-6.csv
    │       ├── 1-5.csv
    │       ├── 1-7.csv
    │       ├── 1-8.csv
    │       └── 1-4.csv
    │   ├── Employees-info-example.csv
    │   ├── melt.csv
    │   ├── Employees-example.csv
    │   ├── dcast.csv
    │   ├── inmemory_test.clj
    │   └── core_test.clj
├── docs
    ├── diagram.jpg
    ├── diagram.png
    ├── clojask_functions.png
    ├── img
    │   ├── image-20220405210757274.png
    │   ├── image-20220405210826777.png
    │   └── image-20220405211348723.png
    ├── intro.md
    ├── clojask types.md
    ├── aggregation functions.md
    └── clojask.extensions.md
├── examples
    ├── readme.md
    └── timezone dataframe
    │   ├── sales.csv
    │   └── timezone.clj
├── .gitignore
├── src
    └── main
    │   ├── java
    │       ├── TypeException.java
    │       ├── ExecutionException.java
    │       └── OperationException.java
    │   └── clojure
    │       └── clojask
    │           ├── terminal.clj
    │           ├── classes
    │               ├── DataStat.clj
    │               ├── RowInfo.clj
    │               ├── MGroup.clj
    │               ├── ColInfo.clj
    │               └── JoinedDataFrame.clj
    │           ├── api
    │               ├── aggregate.clj
    │               └── gb_aggregate.clj
    │           ├── extensions
    │               ├── reshape.clj
    │               └── bind.clj
    │           ├── clojask_input.clj
    │           ├── join
    │               ├── outer_output.clj
    │               ├── outer_input.clj
    │               └── outer_onyx_comps.clj
    │           ├── sort.clj
    │           ├── aggregate
    │               ├── aggre_output.clj
    │               ├── aggre_input.clj
    │               └── aggre_onyx_comps.clj
    │           ├── clojask_groupby.clj
    │           ├── debug.clj
    │           ├── clojask_output.clj
    │           ├── clojask_join.clj
    │           ├── clojask_aggre.clj
    │           ├── preview.clj
    │           ├── groupby.clj
    │           ├── utils.clj
    │           └── join.clj
├── LICENSE
├── project.clj
├── benchmark
    ├── dask-benchmark.ipynb
    ├── .ipynb_checkpoints
    │   └── dask-benchmark-checkpoint.ipynb
    └── clojure-benchmark.clj
└── README.md


/test/clojask/correct_outputs/1-10.csv:
--------------------------------------------------------------------------------
1 | new-Salary
2 | 50000.0
3 | 


--------------------------------------------------------------------------------
/test/clojask/correct_outputs/1-11.csv:
--------------------------------------------------------------------------------
1 | Department
2 | 12
3 | 21
4 | 13
5 | 11
6 | 


--------------------------------------------------------------------------------
/docs/diagram.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/clojure-finance/clojask/HEAD/docs/diagram.jpg


--------------------------------------------------------------------------------
/docs/diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/clojure-finance/clojask/HEAD/docs/diagram.png


--------------------------------------------------------------------------------
/docs/clojask_functions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/clojure-finance/clojask/HEAD/docs/clojask_functions.png


--------------------------------------------------------------------------------
/test/clojask/correct_outputs/1-3.csv:
--------------------------------------------------------------------------------
1 | Department,new-Salary
2 | 13,800.0
3 | 11,50000.0
4 | 12,1000.0
5 | 21,700.0
6 | 


--------------------------------------------------------------------------------
/docs/img/image-20220405210757274.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/clojure-finance/clojask/HEAD/docs/img/image-20220405210757274.png


--------------------------------------------------------------------------------
/docs/img/image-20220405210826777.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/clojure-finance/clojask/HEAD/docs/img/image-20220405210826777.png


--------------------------------------------------------------------------------
/docs/img/image-20220405211348723.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/clojure-finance/clojask/HEAD/docs/img/image-20220405211348723.png


--------------------------------------------------------------------------------
/examples/readme.md:
--------------------------------------------------------------------------------
1 | This folder will be moved out as an independent repository in the future when Clojask has been deployed to Clojars.


--------------------------------------------------------------------------------
/docs/intro.md:
--------------------------------------------------------------------------------
1 | # Introduction to techml_onyx
2 | 
3 | TODO: write [great documentation](http://jacobian.org/writing/what-to-write/)
4 | 


--------------------------------------------------------------------------------
/test/clojask/correct_outputs/1-9.csv:
--------------------------------------------------------------------------------
1 | Employee,EmployeeName
2 | 1,Alice
3 | 2,Bob
4 | 3,Carla
5 | 4,Daniel
6 | 5,Evelyn
7 | 6,Ferdinand
8 | 7,Amy
9 | 


--------------------------------------------------------------------------------
/examples/timezone dataframe/sales.csv:
--------------------------------------------------------------------------------
1 | date,cust,item,sold
2 | 2010-01-19 UTC,101,2,11
3 | 2010-01-22 HKG,102,1,7
4 | 2010-01-24 UK,102,2,9
5 | 2010-01-25 DUB,101,2,9
6 | 2010-01-26 LA,101,1,10


--------------------------------------------------------------------------------
/test/clojask/Employees-info-example.csv:
--------------------------------------------------------------------------------
1 | Employee,EmployeeName,DayOff,UpdateDate
2 | 1,Alice,20,2020/12/10
3 | 2,Bob,15,2020/12/05
4 | 3,Carla,5,2020/12/03
5 | 7,Angel,30,2020/12/11
6 | 8,Jack,4,2019/03/21


--------------------------------------------------------------------------------
/test/clojask/melt.csv:
--------------------------------------------------------------------------------
1 | family_id,age_mother,dob_child1,dob_child2,dob_child3
2 | 1,30,1998-11-26,2000-01-29,
3 | 2,27,1996-06-22,,
4 | 3,26,2002-07-11,2004-04-05,2007-09-02
5 | 4,32,2004-10-10,2009-08-27,2012-07-21
6 | 5,29,2000-12-05,2005-02-28,


--------------------------------------------------------------------------------
/test/clojask/correct_outputs/1-2.csv:
--------------------------------------------------------------------------------
1 | Employee,EmployeeName,Department,Salary,UpdateDate,new-col
2 | 1,Alice,11,300.0,2020/12/12,1300.0
3 | 2,Bob,11,600.0,2020/12/01,2600.0
4 | 5,Evelyn,13,800.0,2020/12/03,5800.0
5 | 6,Ferdinand,21,700.0,2020/12/05,6700.0
6 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .DS_Store
 2 | onyx.log*
 3 | *.log
 4 | resources/
 5 | outputs/
 6 | test/clojask/test_outputs/*
 7 | .lein-failures
 8 | .lein-repl-history
 9 | .nrepl-port
10 | target/
11 | .clojask/
12 | .lsp
13 | .calva
14 | .clj-kondo
15 | api_design.clj
16 | # sqlite.db
17 | # _*.csv
18 | .vscode
19 | *.csv


--------------------------------------------------------------------------------
/test/clojask/Employees-example.csv:
--------------------------------------------------------------------------------
1 | Employee,EmployeeName,Department,Salary,UpdateDate
2 | 1,Alice,11,300,2020/12/12
3 | 2,Bob,11,600,2020/12/01
4 | 3,Carla,12,900,2020/12/03
5 | 4,Daniel,12,1000,2020/12/05
6 | 5,Evelyn,13,800,2020/12/03
7 | 6,Ferdinand,21,700,2020/12/05
8 | 7,Amy,11,50000,2020/11/26
9 | 


--------------------------------------------------------------------------------
/test/clojask/correct_outputs/1-1.csv:
--------------------------------------------------------------------------------
1 | Employee,EmployeeName,Department,Salary,UpdateDate
2 | 1,Alice,11,-300.0!,2020/12/12
3 | 2,Bob,11,-600.0!,2020/12/01
4 | 3,Carla,12,-900.0!,2020/12/03
5 | 4,Daniel,12,-1000.0!,2020/12/05
6 | 5,Evelyn,13,-800.0!,2020/12/03
7 | 6,Ferdinand,21,-700.0!,2020/12/05
8 | 7,Amy,11,-50000.0!,2020/11/26
9 | 


--------------------------------------------------------------------------------
/test/clojask/correct_outputs/1-6.csv:
--------------------------------------------------------------------------------
1 | 2_Employee,2_EmployeeName,2_DayOff,2_UpdateDate,1_Employee,1_EmployeeName,1_Department,1_Salary,1_UpdateDate
2 | 2,Bob,15,2020/12/05,2,Bob,11,600,2020/12/01
3 | 3,Carla,5,2020/12/03,3,Carla,12,900,2020/12/03
4 | 1,Alice,20,2020/12/10,1,Alice,11,300,2020/12/12
5 | 7,Angel,30,2020/12/11,7,Amy,11,50000,2020/11/26
6 | 


--------------------------------------------------------------------------------
/test/clojask/correct_outputs/1-5.csv:
--------------------------------------------------------------------------------
1 | 2_Employee,2_EmployeeName,2_DayOff,2_UpdateDate,1_Employee,1_EmployeeName,1_Department,1_Salary,1_UpdateDate
2 | 7,Angel,30,2020/12/11,7,Amy,11,50000,2020/11/26
3 | 8,Jack,4,2019/03/21,,,,,
4 | 1,Alice,20,2020/12/10,1,Alice,11,300,2020/12/12
5 | 3,Carla,5,2020/12/03,3,Carla,12,900,2020/12/03
6 | 2,Bob,15,2020/12/05,2,Bob,11,600,2020/12/01
7 | 


--------------------------------------------------------------------------------
/test/clojask/correct_outputs/1-7.csv:
--------------------------------------------------------------------------------
1 | 1_Employee,1_EmployeeName,1_Department,1_Salary,1_UpdateDate,2_Employee,2_EmployeeName,2_DayOff,2_UpdateDate
2 | 7,Amy,11,50000,2020/11/26,,,,
3 | 3,Carla,12,900,2020/12/03,3,Carla,5,2020/12/03
4 | 4,Daniel,12,1000,2020/12/05,,,,
5 | 5,Evelyn,13,800,2020/12/03,,,,
6 | 1,Alice,11,300,2020/12/12,1,Alice,20,2020/12/10
7 | 2,Bob,11,600,2020/12/01,,,,
8 | 6,Ferdinand,21,700,2020/12/05,,,,
9 | 


--------------------------------------------------------------------------------
/test/clojask/correct_outputs/1-8.csv:
--------------------------------------------------------------------------------
1 | 1_Employee,1_EmployeeName,1_Department,1_Salary,1_UpdateDate,2_Employee,2_EmployeeName,2_DayOff,2_UpdateDate
2 | 5,Evelyn,13,800,2020/12/03,,,,
3 | 1,Alice,11,300,2020/12/12,,,,
4 | 2,Bob,11,600,2020/12/01,2,Bob,15,2020/12/05
5 | 6,Ferdinand,21,700,2020/12/05,,,,
6 | 7,Amy,11,50000,2020/11/26,,,,
7 | 3,Carla,12,900,2020/12/03,3,Carla,5,2020/12/03
8 | 4,Daniel,12,1000,2020/12/05,,,,
9 | 


--------------------------------------------------------------------------------
/test/clojask/correct_outputs/1-4.csv:
--------------------------------------------------------------------------------
1 | 1_Employee,1_EmployeeName,1_Department,1_Salary,1_UpdateDate,2_Employee,2_EmployeeName,2_DayOff,2_UpdateDate
2 | 5,Evelyn,13,800,2020/12/03,,,,
3 | 4,Daniel,12,1000,2020/12/05,,,,
4 | 1,Alice,11,300,2020/12/12,1,Alice,20,2020/12/10
5 | 3,Carla,12,900,2020/12/03,3,Carla,5,2020/12/03
6 | 6,Ferdinand,21,700,2020/12/05,,,,
7 | 2,Bob,11,600,2020/12/01,2,Bob,15,2020/12/05
8 | 7,Amy,11,50000,2020/11/26,7,Angel,30,2020/12/11
9 | 


--------------------------------------------------------------------------------
/test/clojask/dcast.csv:
--------------------------------------------------------------------------------
 1 | family_id,age_mother,measure,value
 2 | 1,30,dob_child1,1998-11-26
 3 | 1,30,dob_child2,2000-01-29
 4 | 1,30,dob_child3,
 5 | 2,27,dob_child1,1996-06-22
 6 | 2,27,dob_child2,
 7 | 2,27,dob_child3,
 8 | 3,26,dob_child1,2002-07-11
 9 | 3,26,dob_child2,2004-04-05
10 | 3,26,dob_child3,2007-09-02
11 | 4,32,dob_child1,2004-10-10
12 | 4,32,dob_child2,2009-08-27
13 | 4,32,dob_child3,2012-07-21
14 | 5,29,dob_child1,2000-12-05
15 | 5,29,dob_child2,2005-02-28
16 | 5,29,dob_child3,
17 | 


--------------------------------------------------------------------------------
/examples/timezone dataframe/timezone.clj:
--------------------------------------------------------------------------------
 1 | (ns examples.timezone
 2 |   (:require [clojask.dataframe :as clojask]))
 3 | 
 4 | (defn timezone-parser
 5 |   "the input is a datetime string with timezone identifier as suffix"
 6 |   [time-string]
 7 |   )
 8 | 
 9 | (defn timezone-formatter
10 |   "the input is a vector, the first element is a date object, the second is the timezone string"
11 |   [time-vec]
12 |   )
13 | 
(defn main
  "Example entry point: builds a Clojask dataframe from sales.csv.

  Takes no arguments. The dataframe is bound to the namespace-level var `df`,
  exactly as the original example did, and that var is returned."
  []
  ;; This form used to be `(def main [] ...)`, which fails to compile because
  ;; `def` does not accept a parameter vector plus a body; `defn` is required.
  (def df (clojask/dataframe "sales.csv")))


--------------------------------------------------------------------------------
/src/main/java/TypeException.java:
--------------------------------------------------------------------------------
 1 | package com.clojask.exception;
 2 | 
 3 | import java.lang.RuntimeException;
 4 | 
 5 | public class TypeException extends RuntimeException {
 6 | 
 7 |     public TypeException(String s) {
 8 |         super("Type assertion error: " + s);
 9 |     }
10 | 
11 |     public TypeException(String s, Throwable err) {
12 |         super("Type assertion error: " + s, err);
13 |     }
14 | 
15 |     // @Override
16 |     // public String toString() {
17 |     //     return this.getMessage() + "\n" + super.toString();
18 |     // }
19 | }


--------------------------------------------------------------------------------
/src/main/java/ExecutionException.java:
--------------------------------------------------------------------------------
 1 | package com.clojask.exception;
 2 | 
 3 | import java.lang.Exception;
 4 | 
 5 | public class ExecutionException extends Exception {
 6 | 
 7 |     public ExecutionException(String s) {
 8 |         super("Execution Error: " + s);
 9 |     }
10 | 
11 |     public ExecutionException(String s, Throwable err) {
12 |         super("Execution Error: " + s, err);
13 |     }
14 | 
15 |     // @Override
16 |     // public String toString() {
17 |     //     return this.getMessage() + "\n" + super.toString();
18 |     // }
19 | 
20 | }


--------------------------------------------------------------------------------
/src/main/clojure/clojask/terminal.clj:
--------------------------------------------------------------------------------
 1 | (ns clojask.terminal
 2 |   )
 3 | 
 4 | (defn print-progress
 5 |   "Print the progress of perc"
 6 |   [perc & {:keys [total init stage] :or {total 25 init false stage nil}}]
 7 |   (let [total (or total 25)
 8 |         count (int (* perc total))
 9 |         rem (- total count)
10 |         per (* 100 perc)]
11 |     (if (not= init true)
12 |       (do (print "\33[1A\33[2K")
13 |           (flush)))
14 |     (if (not= stage nil)
15 |       (println stage))
16 |     (println (format "[%s%s] %.2f%%" (apply str (repeat count "#")) (apply str (repeat rem " ")) per))
17 |     (flush)))


--------------------------------------------------------------------------------
/src/main/java/OperationException.java:
--------------------------------------------------------------------------------
 1 | package com.clojask.exception;
 2 | 
 3 | import java.lang.RuntimeException;
 4 | 
 5 | public class OperationException extends Exception {
 6 | 
 7 |     public OperationException(String s) {
 8 |         super("Failed in running operation: " + s);
 9 |     }
10 | 
11 |     public OperationException(String s, Throwable err) {
12 |         super("Failed in running operation: " + s, err);
13 |         // super.fillInStackTrace();
14 |     }
15 | 
16 |     // @Override
17 |     // public String toString() {
18 |     //     return this.getMessage() + "\n" + super.toString();
19 |     // }
20 | }


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 clojure-finance
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/src/main/clojure/clojask/classes/DataStat.clj:
--------------------------------------------------------------------------------
 1 | (ns clojask.classes.DataStat
 2 |   (:require [clojure.java.io :as io]))
 3 | 
 4 | (import '[com.clojask.exception TypeException]
 5 |         '[com.clojask.exception OperationException])
 6 | 
;; Interface implemented by DataStat below.
(definterface DataIntf
  ;; record size information from `source`, or from `file` when it is given
  (init [source file])
  ;; record size information from the map returned by calling `io-func`
  (initWithIO [io-func])
  ;; return the recorded size
  (getSize []))
11 | 
12 | 
(deftype DataStat
  ;; mutable size information recorded for one data source
  [^:unsynchronized-mutable file-size
   ^:unsynchronized-mutable num-rows]

  DataIntf

  ;; Fill in file-size from whichever description of the source is available.
  (init
    [this source file]
    (cond
      ;; `file` is invoked with no arguments and the :size of its result is kept
      file
      (do
        (set! file-size (:size (file)))
        (set! num-rows nil))

      ;; a function source has no size that can be read up front
      (fn? source)
      (do
        (set! file-size nil)
        (set! num-rows nil))

      ;; anything else is handed to io/file and its length is stored;
      ;; num-rows is not touched on this path
      :else
      (set! file-size (.length (io/file source)))))

  ;; Fill in file-size from the :size entry of whatever `io-func` returns.
  (initWithIO
    [this io-func]
    (set! file-size (:size (io-func)))
    (set! num-rows nil))

  ;; Read back the stored size (nil when none was recorded).
  (getSize
    [this]
    file-size))
42 | 
(defn compute-stat
  "Create a DataStat and initialise it: through `io-func` when one is
  supplied, otherwise from `source`. Returns the DataStat."
  [source & [io-func]]
  (let [stat (DataStat. nil nil)]
    (if-not io-func
      (.init stat source nil)
      (.initWithIO stat io-func))
    stat))


--------------------------------------------------------------------------------
/docs/clojask types.md:
--------------------------------------------------------------------------------
 1 | ## Clojask Types
 2 | 
 3 | ### Supported Types
 4 | 
 5 | string
 6 | 
 7 | int
 8 | 
 9 | double
10 | 
11 | date
12 | 
13 | datetime
14 | 
15 | ### string
16 | 
17 | The default type for all columns
18 | 
19 | Class: `java.lang.String`
20 | 
21 | #### Examples
22 | 
23 | ```clojure
24 | (set-type dataframe "col-name" "string")
25 | ```
26 | 
27 | ### int
28 | 
29 | Most efficiently stores an integer
30 | 
31 | Class: `java.lang.Integer`
32 | 
33 | #### Examples
34 | 
35 | ```clojure
36 | (set-type dataframe "col-name" "int")
37 | ```
38 | 
39 | ### double
40 | 
41 | Accepts floats and integers
42 | 
43 | Class: `java.lang.Double`
44 | 
45 | #### Examples
46 | 
47 | ```clojure
48 | (set-type dataframe "col-name" "double")
49 | ```
50 | 
51 | ### date
52 | 
53 | Transform a date string (no time field)
54 | 
55 | Class: `java.time.LocalDate` (default format string: `yyyy-MM-dd`)
56 | 
57 | #### Examples
58 | 
59 | ```clojure
60 | ;; if the date looks like this 2020/11/12
61 | (set-type dataframe "col-name" "date:yyyy/MM/dd")
62 | ```
63 | 
64 | ### datetime
65 | 
66 | Transform a datetime string (with both date and time fields)
67 | 
68 | Class: `java.time.LocalDateTime` (default format string: `yyyy-MM-dd HH:mm:ss`)
69 | 
70 | #### Examples
71 | 
72 | ```clojure
73 | ;; if the date looks like this 2020/11/12 12:12:36
74 | (set-type dataframe "col-name" "datetime:yyyy/MM/dd HH:mm:ss")
75 | ```
76 | 
77 | ### 


--------------------------------------------------------------------------------
/src/main/clojure/clojask/api/aggregate.clj:
--------------------------------------------------------------------------------
 1 | (ns clojask.api.aggregate
 2 |   (:refer-clojure :exclude [max min sum count]))
 3 | "Contains implemented simple aggregation functions"
 4 | 
;; Marker for "no previous result yet": every aggregation function in this
;; namespace compares its first argument against `start` to detect the first
;; step. It is declared without an initial value.
(def start)
 6 | 
 7 | ;; (defn aggre-func
 8 | ;;   "prev value could be start"
 9 | ;;   [prev new])
10 | 
11 | ;; single row aggregation functions
12 | 
(defn max
  "Aggregation step keeping the larger of `a` (result so far) and `b` (new
  value) according to `compare`. When `a` is `start`, `b` is returned."
  [a b]
  (cond
    (= a start)          b
    (pos? (compare b a)) b
    :else                a))
18 | 
(defn min
  "Aggregation step keeping the smaller of `a` (result so far) and `b` (new
  value) according to `compare`. When `a` is `start`, `b` is returned."
  [a b]
  (cond
    (= a start)          b
    (neg? (compare b a)) b
    :else                a))
24 | 
(defn sum
  "Aggregation step adding `b` to the running total `a`. When `a` is `start`,
  `b` itself is the first total."
  [a b]
  (if (not= a start)
    (+ a b)
    b))
30 | 
(defn count
  "Aggregation step counting values: 1 when `a` is `start`, otherwise the
  running count `a` plus one. The value `b` itself is ignored."
  [a b]
  (if (not= a start)
    (inc a)
    1))
36 | 
37 | ;; multi-row aggregation functions
38 | 
(defn smallest3
  "Aggregation step keeping the three smallest entries seen so far.
  `a` is the collection kept so far (or `start`), `b` the new entry."
  [a b]
  (if (= start a)
    [b]
    (->> (conj a b)
         sort
         (take 3))))
45 | 
(defn smallestk
  "Aggregation step keeping the k smallest entries seen so far (a smaller k
  performs better). `a` is the collection kept so far (or `start`), `b` the
  new entry."
  [a b k]
  (if (= start a)
    [b]
    (->> (conj a b)
         sort
         (take k))))
52 | 
(defn largest3
  "return the largest three entries

  `a` is the collection kept so far (or `start` on the first step), `b` is
  the new entry. Entries are ordered with `compare`, largest first."
  ;; The docstring used to follow the parameter vector, which made it a
  ;; discarded expression in the body instead of documentation on the var.
  [a b]
  (cond
    (= start a) [b]
    :else (take 3 (sort (fn [x y] (compare y x)) (conj a b)))))
59 | 
(defn largestk
  "return the largest k entries (the performance is better with smaller k)

  `a` is the collection kept so far (or `start` on the first step), `b` is
  the new entry, `k` is how many entries to keep. Entries are ordered with
  `compare`, largest first."
  ;; The docstring used to follow the parameter vector (so it was never
  ;; attached to the var) and said \"three\" although the count is `k`.
  [a b k]
  (cond
    (= start a) [b]
    :else (take k (sort (fn [x y] (compare y x)) (conj a b)))))
66 | 
67 | 


--------------------------------------------------------------------------------
/project.clj:
--------------------------------------------------------------------------------
;; Leiningen build definition for the clojask library.
(defproject com.github.clojure-finance/clojask "2.0.1"
  :description "Data analysis and manipulation library with parallel computing for larger-than-memory datasets"
  :url "https://github.com/clojure-finance/clojask"
  ;; NOTE(review): the license URL points at the 1.x.x branch while the
  ;; project version is 2.0.1 — confirm this is still the intended link.
  :license {:name "MIT"
            :url "https://github.com/clojure-finance/clojask/blob/1.x.x/LICENSE"}
  :dependencies [[org.clojure/clojure "1.10.1"]
                ;;  [org.clojure/math.numeric-tower "0.0.4"]
                 [org.clojure/data.csv "1.0.0"]
                 ^{:voom {:repo "git@github.com:onyx-platform/onyx.git" :branch "master"}}
                 [org.onyxplatform/onyx "0.14.6"]
                 [com.taoensso/timbre "5.2.1"]
                ;;  [techascent/tech.ml.dataset "5.17" :exclusions [[ch.qos.logback/logback-classic][org.slf4j/slf4j-api]]]
                 [com.google.code.externalsortinginjava/externalsortinginjava "0.6.0"]
                 [com.github.clojure-finance/clojask-io "1.0.6"]
                 [com.github.clojure-finance/clojure-heap "1.0.3"]]
  ;; the REPL starts in clojask.debug
  :repl-options {:init-ns clojask.debug
                 :timeout 180000}
  :plugins [[lein-update-dependency "0.1.2"]]
  :main ^:skip-aot clojask.debug/-main
  ;; mixed build: Clojure sources plus the Java sources under src/main/java
  :source-paths      ["src/main/clojure"]
  :java-source-paths ["src/main/java"]
  ;; Java sources are compiled with source and target level 1.8
  :javac-options ["-target" "1.8" "-source" "1.8" "-Xlint:-options"]
  :jvm-opts ["-XX:+UseG1GC" "-server"]
  :test-paths        ["test/clojask"]
  ;:java-test-paths   ["test/java"]
  ;;:injections [(.. System (setProperty "clojure.core.async.pool-size" "8"))]
  )
28 | 


--------------------------------------------------------------------------------
/src/main/clojure/clojask/extensions/reshape.clj:
--------------------------------------------------------------------------------
 1 | (ns clojask.extensions.reshape
 2 |   "Contains functions that extends the power of clojask, while not directly applying to the dataframe class"
 3 |   (:require [clojure.data.csv :as csv]
 4 |             [clojure.java.io :as io]
 5 |             [clojure.string :as str]
 6 |             [clojask.dataframe :as ck]))
 7 | 
 8 | (defn melt
 9 |   "Reshape the clojask dataframe from wide to long."
10 |   [df output-dir id measure & {:keys [measure-name value-name] :or {measure-name "measure" value-name "value"}}]
11 |   (let [id-count (count id)
12 |         mea-count (count measure)
13 |         func (fn [x] (map concat (repeat (take id-count x)) (map vector measure (take-last mea-count x))))]
14 |     (ck/compute df 1 output-dir :select (concat id measure) :melt func :header (concat id [measure-name value-name])))
15 |   )
16 | 
(defn- dcast-second
  "Second element of seq, or nil when there is none."
  [seq]
  (second seq))
20 | 
(defn- dcast-1
  "Turn seq, a collection of [key value] pairs, into one comma-joined string
  holding the value for each key in order; keys with no (or a falsy) value
  contribute an empty field."
  [seq order]
  (let [;; later pairs overwrite earlier ones that share a key
        lookup (reduce (fn [m pair] (assoc m (first pair) (dcast-second pair)))
                       {}
                       seq)
        cell   (fn [k]
                 (let [found (get lookup k)]
                   (if found (str found) "")))]
    (->> order
         (mapv cell)
         (str/join ","))))
28 | 
;; Arguments, as used below:
;;   x            - the dataframe; it is modified step by step in this function
;;   output-dir   - passed through to ck/compute
;;   id           - columns to group by; they also start the output header
;;   measure-name - column holding the measure names
;;   value-name   - column holding the measure values
;;   vals         - measure names to spread out, in output order
;;   :vals-name   - header names for those columns (defaults to vals)
(defn dcast
  "Reshape the clojask dataframe from long to wide."
  [x output-dir id measure-name value-name vals & {:keys [vals-name] :or {vals-name vals}}]
  ;; refuse to run when the dataframe already reports group-by keys
  (assert (= [] (.getGroupbyKeys x)) "dcast is not applicable to this dataframe")
  ;; pair each measure name with its value; "dcast1014" is the name handed to
  ;; ck/operate (presumably the new helper column — confirm against clojask.dataframe)
  (ck/operate x (fn [a b] [a b]) [measure-name value-name] "dcast1014")
  (ck/group-by x id)
  ;; per group, dcast-1 turns the collected pairs into one comma-joined string
  ;; ordered by vals
  (let [func #(dcast-1 % vals)]
    (ck/aggregate x func "dcast1014"))
  (ck/compute x 8 output-dir :header (concat id vals-name))
  )


--------------------------------------------------------------------------------
/src/main/clojure/clojask/api/gb_aggregate.clj:
--------------------------------------------------------------------------------
  1 | (ns clojask.api.gb-aggregate
  2 |   (:require [clojask.api.aggregate :as agg])
  3 |   (:refer-clojure :exclude [max min sum count]))
  4 | "Contains the implemented function for group-by aggregation functions"
  5 | 
  6 | ;; (defn aggre-func
  7 | ;;   "function that can be applied on a collection"
  8 | ;;   [list])
  9 | 
 10 | ;; single row aggregation functions
 11 | 
 12 | (defn max
 13 |   [list]
 14 |   (reduce agg/max list))
 15 | 
 16 | (defn min
 17 |   [list]
 18 |   (reduce agg/min list))
 19 | 
 20 | (defn sum
 21 |   [list]
 22 |   (reduce + list))
 23 | 
 24 | (defn count
 25 |   [list]
 26 |   (clojure.core/count list))
 27 | 
 28 | (defn mean
 29 |   [list]
 30 |   (let [sum (apply + list)
 31 |         count (count list)]
 32 |     (if (pos? count)
 33 |       (/ sum count)
 34 |       0)))
 35 | 
 36 | (defn mode
 37 |   [list]
 38 |   (let [freqs (frequencies list)
 39 |         occurrences (clojure.core/group-by val freqs)
 40 |         modes (last (sort occurrences))
 41 |         modes (->> modes
 42 |                    val
 43 |                    (map key))]
 44 |     modes))
 45 | 
 46 | (defn median
 47 |   [list]
 48 |   (let [sorted (sort list)
 49 |         cnt (count sorted)
 50 |         halfway (quot cnt 2)]
 51 |     (if (odd? cnt)
 52 |       (nth sorted halfway)
 53 |       (let [bottom (dec halfway)
 54 |             bottom-val (nth sorted bottom)
 55 |             top-val (nth sorted halfway)]
 56 |         (mean [bottom-val top-val])))))
 57 | 
 58 | (defn sd
 59 |   [list]
 60 |   (let [avg (mean list)
 61 |         squares (for [x list]
 62 |                   (let [x-avg (- x avg)]
 63 |                     (* x-avg x-avg)))
 64 |         total (count list)]
 65 |     (if (= 1 total)
 66 |       0
 67 |       (-> (/ (apply + squares)
 68 |              (- total 1))
 69 |           (Math/sqrt)))))
 70 | 
 71 | (defn skew
 72 |   [list]
 73 |   (let [mean (mean list)
 74 |         median (median list)
 75 |         sd (sd list)]
 76 |     (* 3 (/ (- mean median) sd))))
 77 | 
 78 | ;; multi-row aggregation functions
 79 | 
 80 | (defn smallest3
 81 |   "return the smallest 3 entries"
 82 |   [list]
 83 |   (reduce agg/smallest3 agg/start list))
 84 | 
 85 | (defn smallestk
 86 |   "return the smallest k entries (the performance is better with smaller k)"
 87 |   [list k]
 88 |   (reduce (fn [a b] (agg/smallestk a b k)) agg/start list))
 89 | 
 90 | (defn largest3
 91 |   "return the largest 3 entries"
 92 |   [list]
 93 |   (reduce agg/largest3 agg/start list))
 94 | 
 95 | (defn largestk
 96 |   "return the largest k entries (the performance is better with smaller k)"
 97 |   [list k]
 98 |   (reduce (fn [a b] (agg/largestk a b k)) agg/start list))
 99 | 
100 | 


--------------------------------------------------------------------------------
/docs/aggregation functions.md:
--------------------------------------------------------------------------------
 1 | ### Aggregation Functions
 2 | 
 3 | In Clojask, you can aggregate on the whole dataframe, or on the group-by dataframe. We call the first case "simple aggregation" and the second "group-by aggregation". Some given functions for simple aggregation are defined in namespace `clojask.api.aggregate`, and the given functions for group-by aggregation are defined in namespace `clojask.api.gb-aggregate`. 
 4 | 
 5 | Below is the full list of the given functions for the two types.
 6 | 
 7 | #### `clojask.api.aggregate`:
 8 | 
 9 | `max`: Find the max value (use `clojure.core/compare` as the comparator)
10 | 
11 | `min`: Find the min value (use `clojure.core/compare` as the comparator)
12 | 
13 | #### `clojask.api.gb-aggregate`:
14 | 
15 | `max`: Find the max value (use `clojure.core/compare` as the comparator)
16 | 
17 | `min`: Find the min value (use `clojure.core/compare` as the comparator)
18 | 
19 | Besides these given functions, you are also welcome to define your own.
20 | 
21 | #### How to define group-by aggregation functions?
22 | 
23 | This is the template:
24 | 
25 | ```clojure
26 | (defn gb-aggre-template
27 |   [col]  ;; take only one argument which is the aggregation column in the format of vector
28 |   ;; ... your implementation
29 |   result    ;; return one variable (could be int / double / string / collection of above)
30 |   )
31 | ```
32 | 
33 | Basically, the function should take one argument only, which is the full aggregation column. ***Here we simply assume this column should be smaller than memory!***
34 | 
35 | You may find that many existing functions also fulfil this requirement, for example `count`, `mean`, and the countless functions that can be constructed from [`reduce`](https://clojuredocs.org/clojure.core/reduce).
36 | 
37 | #### How to define simple aggregation functions?
38 | 
39 | This is the template:
40 | 
41 | ```clojure
42 | (defn aggre-template
43 |   ;; [new-value old-result]
44 |   [old-result new-value]
45 |   ;; old-result: the result returned by the previous call to aggre-template
46 |   ;; new-value: the value for the column on the current row
47 |   ;; ... your implementation
48 |   new-result   ;; return the new result, and this will be passed as old-result for the next aggre-template
49 |   )
50 | ```
51 | 
52 | **Notes:**
53 | 
54 | 1. The old-result for the first `aggre-template` is `clojask.api.aggregate/start`. So your function must be able to deal with cases when the first argument is `clojask.api.aggregate/start`.
55 | 2. Your function should be self-sustainable, meaning that the result of `aggre-template` should be safe as the input for `aggre-template`.
56 |    1. To better understand this template, you may refer to the documentation of [`reduce`](https://clojuredocs.org/clojure.core/reduce): the `aggre-template` function should be usable as the reducing function in `reduce`.
57 | 
58 | 


--------------------------------------------------------------------------------
/benchmark/dask-benchmark.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": null,
 6 |    "metadata": {},
 7 |    "outputs": [],
 8 |    "source": [
 9 |     "%%time\n",
10 |     "\n",
11 |     "from dask.distributed import Client\n",
12 |     "\n",
13 |     "client = Client(n_workers=4)\n",
14 |     "\n",
15 |     "import dask.dataframe as dd\n",
16 |     "import pandas as pd\n",
17 |     "import os\n",
18 |     "import dask\n",
19 |     "\n",
20 |     "filename = os.path.join('../clojure-datasets/data-Compustat-lohi.csv') # 1.8 M dataset\n",
21 |     "#filename = os.path.join('../clojure-datasets/data-Compustat-x2.csv') # 3.6 M dataset\n",
22 |     "#filename = os.path.join('../clojure-datasets/data-CRSP.csv') # 80 M dataset\n",
23 |     "\n",
24 |     "#crsp_filename = os.path.join('../clojure-datasets/CRSP-extract.csv') # 80 M dataset\n",
25 |     "\n",
26 |     "df = dd.read_csv(filename, dtype={'exchg': 'float64', 'sic': 'float64'})\n",
27 |     "#other = dd.read_csv(crsp_filename)\n",
28 |     "\n",
29 |     "#ddf = dd.from_pandas(df, npartitions=10)\n",
30 |     "\n",
31 |     "# =================== Change this part to test time taken ====================== #\n",
32 |     "\n",
33 |     "# element-wise operations\n",
34 |     "df['new_col'] = df['datacqtr'] + 20 # Compustat\n",
35 |     "#df['new_col'] = df['PRC'] + 20 # CRSP\n",
36 |     "\n",
37 |     "# row-wise\n",
38 |     "#df = df[df.datacqtr > 1000.0]\n",
39 |     "\n",
40 |     "# aggregation\n",
41 |     "#df = df.datacqtr.max()\n",
42 |     "\n",
43 |     "# groupby aggregate\n",
44 |     "#df = df.groupby(df.conm).datafqtr.max()\n",
45 |     "\n",
46 |     "# left join\n",
47 |     "#df = df.join(other, how='left')\n",
48 |     "\n",
49 |     "# right join\n",
50 |     "#df = df.join(other, how='right')\n",
51 |     "\n",
52 |     "# inner join\n",
53 |     "#df = df.join(other, how='inner')\n",
54 |     "\n",
55 |     "# ========================================================================= #\n",
56 |     "\n",
57 |     "#df.to_csv('./output/data-Compustat-output-*.csv') # output as separate csv files\n",
58 |     "df.to_csv('dask_output.csv', single_file=True) # output as a single file"
59 |    ]
60 |   },
61 |   {
62 |    "cell_type": "code",
63 |    "execution_count": null,
64 |    "metadata": {},
65 |    "outputs": [],
66 |    "source": []
67 |   }
68 |  ],
69 |  "metadata": {
70 |   "kernelspec": {
71 |    "display_name": "Python 3 (ipykernel)",
72 |    "language": "python",
73 |    "name": "python3"
74 |   },
75 |   "language_info": {
76 |    "codemirror_mode": {
77 |     "name": "ipython",
78 |     "version": 3
79 |    },
80 |    "file_extension": ".py",
81 |    "mimetype": "text/x-python",
82 |    "name": "python",
83 |    "nbconvert_exporter": "python",
84 |    "pygments_lexer": "ipython3",
85 |    "version": "3.9.6"
86 |   }
87 |  },
88 |  "nbformat": 4,
89 |  "nbformat_minor": 4
90 | }
91 | 


--------------------------------------------------------------------------------
/benchmark/.ipynb_checkpoints/dask-benchmark-checkpoint.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": null,
 6 |    "metadata": {},
 7 |    "outputs": [],
 8 |    "source": [
 9 |     "%%time\n",
10 |     "\n",
11 |     "from dask.distributed import Client\n",
12 |     "\n",
13 |     "client = Client(n_workers=4)\n",
14 |     "\n",
15 |     "import dask.dataframe as dd\n",
16 |     "import pandas as pd\n",
17 |     "import os\n",
18 |     "import dask\n",
19 |     "\n",
20 |     "filename = os.path.join('../clojure-datasets/data-Compustat-lohi.csv') # 1.8 M dataset\n",
21 |     "#filename = os.path.join('../clojure-datasets/data-Compustat-x2.csv') # 3.6 M dataset\n",
22 |     "#filename = os.path.join('../clojure-datasets/data-CRSP.csv') # 80 M dataset\n",
23 |     "\n",
24 |     "#crsp_filename = os.path.join('../clojure-datasets/CRSP-extract.csv') # 80 M dataset\n",
25 |     "\n",
26 |     "df = dd.read_csv(filename, dtype={'exchg': 'float64', 'sic': 'float64'})\n",
27 |     "#other = dd.read_csv(crsp_filename)\n",
28 |     "\n",
29 |     "#ddf = dd.from_pandas(df, npartitions=10)\n",
30 |     "\n",
31 |     "# =================== Change this part to test speed ====================== #\n",
32 |     "\n",
33 |     "# element-wise operations\n",
34 |     "df['new_col'] = df['datacqtr'] + 20 # Compustat\n",
35 |     "#df['new_col'] = df['PRC'] + 20 # CRSP\n",
36 |     "\n",
37 |     "# row-wise\n",
38 |     "#df = df[df.datacqtr > 1000.0]\n",
39 |     "\n",
40 |     "# aggregation\n",
41 |     "#df = df.datacqtr.max()\n",
42 |     "\n",
43 |     "# groupby aggregate\n",
44 |     "#df = df.groupby(df.conm).datafqtr.max()\n",
45 |     "\n",
46 |     "# left join\n",
47 |     "#df = df.join(other, how='left')\n",
48 |     "\n",
49 |     "# right join\n",
50 |     "#df = df.join(other, how='right')\n",
51 |     "\n",
52 |     "# inner join\n",
53 |     "#df = df.join(other, how='inner')\n",
54 |     "\n",
55 |     "# ========================================================================= #\n",
56 |     "\n",
57 |     "#df.to_csv('./output/data-Compustat-output-*.csv') # output as separate csv files\n",
58 |     "df.to_csv('dask_output.csv', single_file=True) # output as a single file"
59 |    ]
60 |   },
61 |   {
62 |    "cell_type": "code",
63 |    "execution_count": null,
64 |    "metadata": {},
65 |    "outputs": [],
66 |    "source": []
67 |   }
68 |  ],
69 |  "metadata": {
70 |   "kernelspec": {
71 |    "display_name": "Python 3 (ipykernel)",
72 |    "language": "python",
73 |    "name": "python3"
74 |   },
75 |   "language_info": {
76 |    "codemirror_mode": {
77 |     "name": "ipython",
78 |     "version": 3
79 |    },
80 |    "file_extension": ".py",
81 |    "mimetype": "text/x-python",
82 |    "name": "python",
83 |    "nbconvert_exporter": "python",
84 |    "pygments_lexer": "ipython3",
85 |    "version": "3.9.6"
86 |   }
87 |  },
88 |  "nbformat": 4,
89 |  "nbformat_minor": 4
90 | }
91 | 


--------------------------------------------------------------------------------
/src/main/clojure/clojask/clojask_input.clj:
--------------------------------------------------------------------------------
 1 | (ns clojask.clojask-input
 2 |   (:require [clojure.core.async :refer [poll! timeout chan close!]]
 3 |             [clojure.set :refer [join]]
 4 |             [onyx.plugin.protocols :as p]
 5 |             [clojure.data.csv :as csv]
 6 |             [clojask.utils :refer [filter-check]]
 7 |             [taoensso.timbre :refer [fatal info debug] :as timbre])
 8 |   (:import (java.io BufferedReader)))
 9 | 
;; Onyx input plugin that emits the rows produced by `reader` as batched segments.
;;
;; Fields used by the methods below:
;;   reader      - zero-argument function returning the sequence of rows
;;   batch-size  - number of rows packed into one segment
;;   rst         - volatile holding the segments that have not been emitted yet
;;   offset      - volatile holding the number of segments emitted so far
;;   completed?  - volatile set to true once `rst` is exhausted
;;   checkpoint? - when false, `checkpoint` reports nothing
;; The remaining fields (event, filters, types, have-col) are only stored.
(defrecord AbsSeqReader [event reader filters types have-col rst completed? checkpoint? offset batch-size]
  p/Plugin

  (start [this event]
    this)

  (stop [this event]
    this)

  p/Checkpointed
  (checkpoint [this]
    ;; the checkpoint is the count of segments already handed out
    (when checkpoint? @offset))

  (recover! [this _ checkpoint]
    (vreset! completed? false)
    (let [batches (partition batch-size batch-size [] (reader))
          ;; every segment is {:id <batch index> :d <rows of that batch>}
          data (map-indexed (fn [id rows] {:id id :d rows}) batches)]
      (if (nil? checkpoint)
        (do
          (vreset! rst data)
          (vreset! offset 0))
        (do
          (info "clojask.clojask-input is recovering state by dropping" checkpoint "elements.")
          (vreset! rst (drop checkpoint data))
          (vreset! offset checkpoint)))))

  (checkpointed! [this epoch])

  p/BarrierSynchronization
  (synced? [this epoch]
    true)

  (completed? [this]
    @completed?)

  p/Input
  (poll! [this _ _]
    (if-let [seg (first @rst)]
      (do
        (vswap! rst rest)
        ;; keep the checkpoint in step with what was emitted; without this
        ;; `checkpoint` always reports the value set in `recover!`, so a
        ;; recovery would replay segments that were already processed
        (vswap! offset inc)
        seg)
      (do (vreset! completed? true)
          nil))))
60 | 
(defn inject-dataframe
  "Publishes `dataframe` as the namespace-level var `df`, which `input`
  reads when it builds the plugin record."
  [dataframe]
  ;; `def` rebinds the root value of `df` on every call
  (def df dataframe))
64 | 
(defn input
  "Builder for the input plugin: creates an AbsSeqReader from the dataframe
  previously stored in `df` and from the task map found in `event`.
  Checkpointing stays enabled unless the task map sets :seq/checkpoint? to false."
  [{:keys [onyx.core/task-map] :as event}]
  (let [row-info      (:row-info df)
        col-info      (:col-info df)
        checkpointing (not (false? (:seq/checkpoint? task-map)))]
    (map->AbsSeqReader
     {:event       event
      :reader      (.getFunc df)
      :filters     (.getFilters row-info)
      :types       (.getType col-info)
      :have-col    (:have-col df)
      :rst         (volatile! nil)
      :completed?  (volatile! false)
      :checkpoint? checkpointing
      :offset      (volatile! nil)
      :batch-size  (:batch-size df)})))
78 | 
;; Lifecycle map for the reader; it currently registers no hooks.
(def reader-calls
  {})

(defn inject-lifecycle-seq
  "Lifecycle hook: copies the :seq/sequential entry of `lifecycle` into the
  event map under :seq/seq."
  [_ lifecycle]
  (let [sequential (:seq/sequential lifecycle)]
    {:seq/seq sequential}))

;; Lifecycle map that runs `inject-lifecycle-seq` before the task starts.
(def inject-seq-via-lifecycle
  {:lifecycle/before-task-start inject-lifecycle-seq})


--------------------------------------------------------------------------------
/src/main/clojure/clojask/extensions/bind.clj:
--------------------------------------------------------------------------------
 1 | (ns clojask.extensions.bind
 2 |   "Contains functions that extends the power of clojask, while not directly applying to the dataframe class"
 3 |   (:require [clojure.data.csv :as csv]
 4 |             [clojure.java.io :as io]
 5 |             [clojure.string :as str]
 6 |             [clojask.dataframe :as ck]
 7 |             [clojask-io.input :refer [read-file]]
 8 |             [clojask-io.output :refer [write-csv]]))
 9 | 
(defn _cbind
  "Column-binds row sequences: the i-th result row is the concatenation of the
  i-th rows of every sequence in `seq`. The result is as long as the shortest
  input, and at least two sequences are required."
  [seq]
  (let [glue-rows (fn [a b & cs] (apply concat a b cs))]
    (apply map glue-rows seq)))
14 | 
(defn cbind-csv
  "Joins some csv files into a new dataframe by columns.

  `a`, `b` and the optional `cs` are the csv inputs handed to `read-file`.
  The dataframe is built from a function that, every time it is called, reads
  each file once and returns its rows bound column-wise, the summed :size
  (nil when any file reports no size) and a comma-separated :output writer."
  [a b & cs]
  (let [files (concat [a b] cs)
        func (fn []
               ;; read every file a single time and reuse the result for both
               ;; :data and :size (the same call used to be issued twice per file)
               (let [inputs (mapv (fn [file] (read-file file :format "csv" :stat true)) files)
                     sizes (map :size inputs)]
                 {:clojask-io true
                  :data (_cbind (map :data inputs))
                  :size (reduce (fn [x y] (when (and (some? x) (some? y)) (+ x y))) sizes)
                  :output (fn [wtr seq] (write-csv wtr seq ","))}))]
    (ck/dataframe func)))
26 | 
(defn cbind
  "Joins some dataset files into a new dataframe by columns.

  If one of the files does not use the default separator, please rewrite this
  function! The dataframe is built from a function that, every time it is
  called, reads each file once and returns its rows bound column-wise together
  with the summed :size (nil when any file reports no size)."
  [a b & cs]
  (let [files (concat [a b] cs)
        func (fn []
               ;; read every file a single time and reuse the result for both
               ;; :data and :size (the same call used to be issued twice per file)
               (let [inputs (mapv (fn [file] (read-file file :stat true)) files)
                     sizes (map :size inputs)]
                 {:clojask-io true
                  :data (_cbind (map :data inputs))
                  :size (reduce (fn [x y] (when (and (some? x) (some? y)) (+ x y))) sizes)}))]
    (ck/dataframe func)))
36 | 
(defn rbind-csv
  "Joins some csv files into a new dataframe by rows.

  The first row of `a` is kept; the first row of every other file is dropped,
  so the header names of the first file are used. :size is the sum of the
  sizes reported for all files, or nil when any of them reports none."
  [a b & cs]
  (let [tail-files (cons b cs)
        all-files (list* a b cs)
        add-sizes (fn [x y] (when (and (some? x) (some? y)) (+ x y)))
        body-rows (fn [file] (rest (:data (read-file file :format "csv"))))
        size-of (fn [file] (:size (read-file file :format "csv" :stat true)))
        func (fn []
               {:clojask-io true
                :data (concat (:data (read-file a :format "csv"))
                              (mapcat body-rows tail-files))
                :size (reduce add-sizes (mapv size-of all-files))
                :output (fn [wtr seq] (write-csv wtr seq ","))})]
    (ck/dataframe func)))
48 | 
(defn rbind
  "Joins some dataset files into a new dataframe by rows.

  The first row of `a` is kept; the first row of every other file is dropped,
  so the header names of the first file are used. :size is the sum of the
  sizes reported for all files, or nil when any of them reports none."
  [a b & cs]
  (let [tail-files (cons b cs)
        all-files (list* a b cs)
        add-sizes (fn [x y] (when (and (some? x) (some? y)) (+ x y)))
        body-rows (fn [file] (rest (:data (read-file file))))
        size-of (fn [file] (:size (read-file file :stat true)))
        func (fn []
               {:clojask-io true
                :data (concat (:data (read-file a))
                              (mapcat body-rows tail-files))
                :size (reduce add-sizes (mapv size-of all-files))})]
    (ck/dataframe func)))


--------------------------------------------------------------------------------
/benchmark/clojure-benchmark.clj:
--------------------------------------------------------------------------------
 1 | (ns benchmark.core
 2 |     (:require [clojask.dataframe :refer :all]
 3 |               [clojure.core.async :as async]))
 4 |   
 5 |   (defn main
 6 |     []
 7 |     (def y (dataframe "../clojure-datasets/data-Compustat-lohi.csv")) ; 1.8 M dataset
 8 |     ;(def y (dataframe "../clojure-datasets/data-Compustat-x2.csv")) ; 3.6 M dataset
 9 |     ;(def y (dataframe "../clojure-datasets/data-CRSP.csv")) ; 80 M dataset
10 | 
11 |     ; =================== Change this part to test time taken ====================== ;
12 | 
13 |     ;; Compustat 
14 | 
15 |     ; element-wise
16 |     ; (set-type y "prccq" "double")
17 |     ; (operate y (fn [val] (if val (+ val 10.0) 0.0)) "prccq")
18 |     ; (time (compute y 4 "resources/test.csv" :select ["datadate" "tic" "prccq"] :exception false))
19 |     
20 |     ; row-wise
21 |     ; (operate y str ["datadate" "tic"] "new-col")
22 |     ; (time (compute y 4 "resources/test.csv" :select ["datadate" "tic" "prccq" "new-col"] :exception false))
23 |     
24 |     ; groupby-aggregate
25 |     ; (set-type y "prccq" "double")
26 |     ; (group-by y "tic")
27 |     ; (aggregate y gb-aggre/max ["prccq"] ["prccq-max"])
28 |     ; (time (compute y 4 "resources/test.csv" :select ["tic" "prccq-max"] :exception false))
29 | 
30 |     ; aggregate -> error?
31 |     ; (set-type y "prccq" "double")
32 |     ; (aggregate y aggre/max ["prccq"] ["prccq-max"])
33 |     ; (time (compute y 4 "resources/test.csv" :select ["datadate" "tic" "prccq" "prccq-max"] :exception false))
34 | 
35 |     ;; CRSP
36 | 
37 |     ; element-wise
38 |     ; (set-type y "PRC" "double")
39 |     ; (operate y (fn [val] (if val (+ val 10.0) 0.0)) "prccq")
40 |     ; (time (compute y 4 "resources/test.csv" :select ["date" "TICKER" "PRC"] :exception false))
41 |     
42 |     ; row-wise
43 |     ; (operate y str ["PERMCO" "PERMNO"] "new-col")
44 |     ; (time (compute y 4 "resources/test.csv" :select ["date" "TICKER" "PRC" "new-col"] :exception false))
45 |     
46 |     ; groupby-aggregate
47 |     ; (set-type y "PRC" "double")
48 |     ; (group-by y "tic")
49 |     ; (aggregate y gb-aggre/max ["PRC"] ["PRC-max"])
50 |     ; (time (compute y 4 "resources/test.csv" :select ["TICKER" "PRC-max"] :exception false))
51 |     
52 |     ;; obtain results
53 |     (time (compute y 4 "resources/test.csv" :exception false))
54 | 
55 |     ;; join APIs
56 | 
57 |     ;(def x (dataframe "../clojure-datasets/data-CRSP.csv"))
58 |     ;(def x (dataframe "resources/CRSP-extract.csv"))
59 |     ;(def y (dataframe "../clojure-datasets/data-Compustat-lohi.csv"))
60 | 
61 |     ; (def output-df (left-join x y ["date" "TICKER"] ["datadate" "TICKER"] "date" "datadate"))
62 |     ; (time (compute output-df :select ["1_date" "1_TICKER" "1_prccq" "2_datadate" "2_TICKER" "2_PRC"] 4 "resources/test.csv" :exception false))
63 | 
64 |     ; (def output-df (right-join x y ["date" "TICKER"] ["datadate" "TICKER"] "date" "datadate"))
65 |     ; (time (compute output-df :select ["1_date" "1_TICKER" "1_prccq" "2_datadate" "2_TICKER" "2_PRC"] 4 "resources/test.csv" :exception false))
66 | 
67 |     ; (def output-df (inner-join x y ["date" "TICKER"] ["datadate" "TICKER"] "date" "datadate"))
68 |     ; (time (compute output-df :select ["1_date" "1_TICKER" "1_prccq" "2_datadate" "2_TICKER" "2_PRC"] 4 "resources/test.csv" :exception false))
69 | 
70 |     ; (def output-df (rolling-join-forward x y ["TICKER"] ["tic"] "date" "datadate"))
71 |     ; (time (compute output-df :select ["1_date" "1_TICKER" "1_prccq" "2_datadate" "2_TICKER" "2_PRC"] 4 "resources/test.csv" :exception false))
72 | 
73 |     )


--------------------------------------------------------------------------------
/src/main/clojure/clojask/join/outer_output.clj:
--------------------------------------------------------------------------------
 1 | (ns clojask.join.outer-output
 2 |   (:require [onyx.peer.function :as function]
 3 |             [onyx.plugin.protocols :as p]
 4 |             [clojure.java.io :as io]
 5 |             [taoensso.timbre :refer [debug info] :as timbre]
 6 |             [clojure.string :as string])
 7 |   (:import (java.io BufferedReader FileReader BufferedWriter FileWriter)))
 8 | 
 9 | (def write-func (atom nil))
10 | 
11 | (defn inject-write-func
12 |   [func]
13 |   (reset! write-func func))
14 | 
(defn- inject-into-eventmap
  "Lifecycle hook: opens an appending writer on the lifecycle's
  :buffered-wtr/filename and returns it under :clojask/wtr."
  [event lifecycle]
  {:clojask/wtr (io/writer (:buffered-wtr/filename lifecycle) :append true)})

(defn- close-writer
  "Lifecycle hook: closes the writer stored under :clojask/wtr in `event`."
  [event lifecycle]
  (let [wtr (:clojask/wtr event)]
    (.close wtr)))

;; Map of lifecycle calls that are required to use this plugin.
;; Users will generally always have to include these in their lifecycle calls
;; when submitting the job.
(def writer-calls
  {:lifecycle/before-task-start inject-into-eventmap
   :lifecycle/after-task-stop close-writer})
29 | 
;; Onyx output plugin: hands the :d payload of every outgoing segment to
;; `write-func` together with the writer found under :clojask/wtr in the event.
(defrecord ClojaskOutput [write-func]
  p/Plugin
  ;; no plugin state to set up or tear down
  (start [this event] this)
  (stop [this event] this)

  p/Checkpointed
  ;; checkpointing is left empty for this output plugin
  (checkpoint [this])
  (recover! [this replica-version checkpoint])
  (checkpointed! [this epoch])

  p/BarrierSynchronization
  ;; always reported as synced / completed
  (synced? [this epoch] true)
  (completed? [this] true)

  p/Output
  ;; nothing to prepare before a batch is written
  (prepare-batch [this event replica messenger] true)

  (write-batch [this {:keys [onyx.core/write-batch  clojask/wtr]} replica messenger]
    ;; segments whose :d is nil are skipped
    (doseq [msg write-batch
            :let [rows (:d msg)]
            :when (some? rows)]
      (write-func wtr rows))
    ;; flush once per batch
    (.flush wtr)
    true))
89 | 
90 | ;; Builder function for your output plugin.
91 | ;; Instantiates a record.
92 | ;; It is highly recommended you inject and pre-calculate frequently used data 
93 | ;; from your task-map here, in order to improve the performance of your plugin
94 | ;; Extending the function below is likely good for most use cases.
(defn output
  "Builder for the output plugin: returns a ClojaskOutput that uses the
  function currently stored in the `write-func` atom. `pipeline-data` is unused."
  [pipeline-data]
  (->ClojaskOutput @write-func))


--------------------------------------------------------------------------------
/src/main/clojure/clojask/sort.clj:
--------------------------------------------------------------------------------
 1 | (ns clojask.sort
 2 |   (:require [clojure.java.io :as io]
 3 |             [clojure.data.csv :as csv]
 4 |             [clojask.groupby :as gb])
 5 |   (:import [com.google.code.externalsorting.csv  CsvExternalSort]
 6 |            [com.google.code.externalsorting.csv  CsvSortOptions CsvSortOptions$Builder]
 7 |            [java.io File]))
 8 | 
 9 | (defn template-compare?
10 |   ;; row1 is the first row
11 |   ;; row2 is the second row
12 | 
13 |   ;;return is a int (- / 0 / +)
14 |   [row1 row2]
15 |   )
16 | 
(defn salary-compare?
  "Compares two rows by their :Salary value, parsed as an integer.
  Returns salary(row1) - salary(row2)."
  [row1 row2]
  (let [salary-of (fn [row] (Integer/parseInt (get row :Salary)))]
    (- (salary-of row1) (salary-of row2))))
20 | 
(defn prc-compare?
  "Compares two rows by their :PRC value.

  A blank price (\"\") sorts before every parsed price and two blank prices
  are equal. Non-blank values are parsed as doubles. Returns an int
  (negative / 0 / positive), so the function can be passed to `sort`.

  Throws NumberFormatException when a non-blank value is not a number."
  [row1 row2]
  (let [prc1 (get row1 :PRC)
        prc2 (get row2 :PRC)
        blank1? (= prc1 "")
        blank2? (= prc2 "")]
    (cond
      ;; two blanks must tie, otherwise a blank row never equals itself
      (and blank1? blank2?) 0
      blank1? -1
      blank2? 1
      ;; `compare` yields an int; a raw double difference such as -0.5 is
      ;; truncated to 0 when the function is used as a java.util.Comparator
      :else (compare (Double/parseDouble prc1) (Double/parseDouble prc2)))))
30 | 
31 | 
(defn get-seq
  "Reads the csv at `input-dir` and returns a lazy sequence with one map per
  data row: the first row supplies the keys (as keywords) and every following
  row supplies the values, e.g. {:tic \"AAPL\"}.
  The reader opened here is not closed by this function."
  [input-dir]
  (let [rows (csv/read-csv (io/reader input-dir))
        header (map keyword (first rows))]
    (map (fn [row] (zipmap header row)) (rest rows))))
41 | 
(defn internal-sort-large
  "Sorts the csv at `input-dir` with repeated scans and appends the rows, in
  ascending `comparator` order, to `out-dir` (one printed row map per line).

  Every round scans the input twice: once to find the smallest row that is
  strictly greater than the value written in the previous round, and once to
  write every row that compares equal to it. `comparator` takes two row maps
  and returns a negative / zero / positive number. Returns \"success\".

  The round state lives in the loop instead of namespace-level vars, so calls
  no longer share mutable state."
  [input-dir out-dir comparator]
  (with-open [wtr (io/writer out-dir :append true)]
    (loop [prev nil]
      ;; scan 1: smallest row above `prev` (any row qualifies in the first round)
      (let [curr (reduce (fn [best row]
                           (if (and (or (nil? prev) (> (comparator row prev) 0))
                                    (or (nil? best) (< (comparator row best) 0)))
                             row
                             best))
                         nil
                         (get-seq input-dir))]
        ;; no candidate left means every row has been written
        (when (some? curr)
          ;; scan 2: emit every row that ties with the chosen one
          (doseq [row (get-seq input-dir)]
            (when (zero? (compare 0 (comparator row curr)))
              (.write wtr (str row "\n"))))
          (.flush wtr)
          (recur curr))))
    "success"))
68 | 
69 | 
(defn internal-sort-small
  "Sorts the csv at `input-dir` in memory with `comparator` and writes the
  rows (one printed row map per line) to `out-dir`, replacing its content.

  `comparator` takes two row maps and is handed to `sort`, so it should return
  an int (negative / 0 / positive). Returns \"success\"."
  [input-dir out-dir comparator]
  (with-open [wtr (io/writer out-dir)]
    ;; honour the caller's comparator instead of a hard-coded one
    (doseq [row (sort comparator (get-seq input-dir))]
      ;; write through the writer; `(.wtr <string>)` looked up a member named
      ;; wtr on the string and failed for every row
      (.write wtr (str row "\n"))))
  "success")
77 | 
(defn use-external-sort
  "Sorts the csv file `input` into `output` with CsvExternalSort, using `comp`
  as the comparator and ./.clojask/sort as the directory for temporary files.
  `output` is emptied first. Returns a message with the number of sorted rows."
  [input output comp]
  ;; clean the output file
  (spit output "")
  ;; make sure the temporary directory exists
  (io/make-parents "./.clojask/sort/a.txt")
  (let [in-file (File. input)
        out-file (File. output)
        tmp-dir (File. "./.clojask/sort")
        memory-budget (* 5 (CsvExternalSort/estimateAvailableMemory))
        sort-option (-> (CsvSortOptions$Builder. comp CsvExternalSort/DEFAULTMAXTEMPFILES memory-budget)
                        (doto (.numHeader 1)
                              (.skipHeader false))
                        (.build))
        ;; passed to both sortInBatch and mergeSortedFiles
        header (java.util.ArrayList.)
        file-list (CsvExternalSort/sortInBatch in-file tmp-dir sort-option header)
        row-count (CsvExternalSort/mergeSortedFiles file-list out-file sort-option true header)]
    (str "Sorted in total " row-count " rows.")))
98 | 


--------------------------------------------------------------------------------
/src/main/clojure/clojask/aggregate/aggre_output.clj:
--------------------------------------------------------------------------------
 1 | (ns clojask.aggregate.aggre-output
 2 |   (:require [onyx.peer.function :as function]
 3 |             [onyx.plugin.protocols :as p]
 4 |             [clojure.java.io :as io]
 5 |             [taoensso.timbre :refer [debug info] :as timbre]
 6 |             [clojure.string :as string])
 7 |   (:import (java.io BufferedReader FileReader BufferedWriter FileWriter)))
 8 | 
 9 | (defn- inject-into-eventmap
10 |   [event lifecycle]
11 |   (let [wtr (io/writer (:buffered-wtr/filename lifecycle) :append true)]
12 |    {:clojask/wtr wtr}))
13 | 
14 | (defn- close-writer [event lifecycle]
15 |   (.close (:clojask/wtr event)))
16 | 
17 | ;; Map of lifecycle calls that are required to use this plugin.
18 | ;; Users will generally always have to include these in their lifecycle calls
19 | ;; when submitting the job.
20 | (def writer-calls
21 |   {:lifecycle/before-task-start inject-into-eventmap
22 |    :lifecycle/after-task-stop close-writer})
23 | 
(def df
  "Atom holding the dataframe passed to `inject-dataframe`; nil until then."
  (atom nil))

(def output-func
  "Atom holding the function used to write one batch; nil until
  `inject-dataframe` installs one."
  (atom nil))

(defn inject-dataframe
  "Stores `dataframe` in `df` and `out` in `output-func`. Returns `out`."
  [dataframe out]
  (reset! df dataframe)
  (reset! output-func out))
31 | 
;; Onyx output plugin: hands the :d payload of every outgoing segment to
;; `output-func` together with the writer found under :clojask/wtr in the event.
(defrecord ClojaskOutput [output-func]
  p/Plugin
  ;; no plugin state to set up or tear down
  (start [this event] this)
  (stop [this event] this)

  p/Checkpointed
  ;; checkpointing is left empty for this output plugin
  (checkpoint [this])
  (recover! [this replica-version checkpoint])
  (checkpointed! [this epoch])

  p/BarrierSynchronization
  ;; always reported as synced / completed
  (synced? [this epoch] true)
  (completed? [this] true)

  p/Output
  ;; nothing to prepare before a batch is written
  (prepare-batch [this event replica messenger] true)

  (write-batch [this {:keys [onyx.core/write-batch  clojask/wtr]} replica messenger]
    ;; segments whose :d is nil are skipped
    (doseq [msg write-batch
            :let [payload (:d msg)]
            :when (some? payload)]
      (output-func wtr payload))
    ;; flush once per batch
    (.flush wtr)
    true))
92 | 
93 | ;; Builder function for your output plugin.
94 | ;; Instantiates a record.
95 | ;; It is highly recommended you inject and pre-calculate frequently used data 
96 | ;; from your task-map here, in order to improve the performance of your plugin
97 | ;; Extending the function below is likely good for most use cases.
(defn output
  "Builder for the output plugin: returns a ClojaskOutput that uses the
  function currently stored in the `output-func` atom. `pipeline-data` is unused."
  [pipeline-data]
  (->ClojaskOutput @output-func))


--------------------------------------------------------------------------------
/src/main/clojure/clojask/classes/RowInfo.clj:
--------------------------------------------------------------------------------
  1 | (ns clojask.classes.RowInfo
  2 |   ;; (:require [clojask.utils :refer :all])
  3 |   )
  4 | 
  5 | (import '[com.clojask.exception TypeException]
  6 |         '[com.clojask.exception OperationException])
  7 | 
  8 | (definterface RowIntf
  9 |   (getFilters [])
 10 |   (getAggreOldKeys [])
 11 |   (getAggreNewKeys [])
 12 |   (getAggreFunc [])
 13 |   (getGroupbyKeys [])
 14 |   (filter [cols predicate])
 15 |   (groupby [a])
 16 |   (aggregate [func old-key new-key])
 17 |   (setRowInfo [new-col-desc new-col-set])
 18 |   (renameRowInfo [new-col-names])
 19 |   (copy [])
 20 |   (rollback [])
 21 |   (commit []))
 22 | 
 23 | (deftype RowInfo
 24 |          [^:unsynchronized-mutable filters
 25 |           ^:unsynchronized-mutable groupby-key
 26 |           ^:unsynchronized-mutable aggre-func
 27 |           ;; ^:unsynchronized-mutable aggre-old-key
 28 |           ^:unsynchronized-mutable aggre-new-key
 29 |           ^:unsynchronized-mutable hist]
 30 |   RowIntf
 31 |   (getFilters
 32 |     [this]
 33 |     filters)
 34 | 
 35 |   (getGroupbyKeys
 36 |     [this]
 37 |     groupby-key)
 38 | 
 39 |   ;; (getAggreOldKeys
 40 |   ;;  [this]
 41 |   ;;  aggre-old-key)
 42 |   (getAggreNewKeys
 43 |     [this]
 44 |     aggre-new-key)
 45 | 
 46 |   (getAggreFunc
 47 |     [this]
 48 |     aggre-func)
 49 | 
 50 |   (filter
 51 |     [this cols predicate]
 52 |     (.copy this)
 53 |     (set! filters (conj filters [predicate cols]))
 54 |    ;; "success"
 55 |     nil)
 56 | 
 57 |   (groupby
 58 |     [this key]
 59 |     (.copy this)
 60 |     (set! groupby-key key)
 61 |     ;; "success"
 62 |     nil)
 63 | 
 64 |   (aggregate
 65 |     [this func old-keys new-keys]
 66 |     (.copy this)
 67 |     (if true
 68 |     ;;  (not= groupby-key [])
 69 |       (do
 70 |         (doseq [old-key old-keys]
 71 |           (set! aggre-func (conj aggre-func [func old-key])))
 72 |         ;; (set! aggre-old-key old-key)
 73 |         (doseq [new-key new-keys]
 74 |           (set! aggre-new-key (conj aggre-new-key new-key)))
 75 |         ; "success"
 76 |         nil)
 77 |       (throw (OperationException. "you must first group the dataframe by some keys then aggregate"))))
 78 | 
 79 |   (setRowInfo
 80 |     [this new-col-desc new-col-set]
 81 |     ;; Rewrites the column references held in filters, groupby-key and aggre-func:
 82 |     ;; a reference c becomes (first (first (get new-col-desc c))), while the first
 83 |     ;; element of every stored entry is carried over untouched.
 84 |     ;; Empty collections are left as they are. `new-col-set` is not read.
 85 |     (.copy this)
 86 |     (let [remap (fn [col] (first (first (get new-col-desc col))))
 87 |           old-filters (.getFilters this)
 88 |           old-groupby (.getGroupbyKeys this)
 89 |           old-aggre (.getAggreFunc this)]
 90 |       ;; filter entries: the last element is a collection of column references
 91 |       (when (seq old-filters)
 92 |         (set! filters (mapv (fn [entry] [(first entry) (map remap (last entry))]) old-filters)))
 93 |       ;; group-by entries: the last element is a single column reference
 94 |       (when (seq old-groupby)
 95 |         (set! groupby-key (mapv (fn [entry] [(first entry) (remap (last entry))]) old-groupby)))
 96 |       ;; aggregation entries: the last element is a single column reference
 97 |       (when (seq old-aggre)
 98 |         (set! aggre-func (mapv (fn [entry] [(first entry) (remap (last entry))]) old-aggre)))))
 99 | 
100 |   (copy ; saves filters, groupby-key, aggre-func and aggre-new-key into `hist` for `rollback`
101 |     [this]
102 |     (set! hist {:filters filters
103 |                 :groupby-key groupby-key
104 |                 :aggre-func aggre-func
105 |                 :aggre-new-key aggre-new-key}))
106 | 
107 |   (rollback ; restores the fields saved by `copy`; a no-op while `hist` is the empty map
108 |     [this]
109 |     (when-not (= hist {})
110 |       (set! filters (:filters hist))
111 |       (set! groupby-key (:groupby-key hist))
112 |       (set! aggre-func (:aggre-func hist))
113 |       (set! aggre-new-key (:aggre-new-key hist))))
114 |   
115 |   (commit ; resets `hist` to the empty map, so a later `rollback` does nothing
116 |    [this]
117 |    (set! hist {})))
118 | 


--------------------------------------------------------------------------------
/src/main/clojure/clojask/aggregate/aggre_input.clj:
--------------------------------------------------------------------------------
  1 | (ns clojask.aggregate.aggre-input
  2 |   (:require [clojure.core.async :refer [poll! timeout chan close!]]
  3 |             [clojure.set :refer [join]]
  4 |             [onyx.plugin.protocols :as p]
  5 |             [clojure.data.csv :as csv]
  6 |             [clojask.utils :refer [filter-check]]
  7 |             [taoensso.timbre :refer [fatal info debug] :as timbre]
  8 |             [clojure.java.io :as java.io]
  9 |             [clojask.utils :as u])
 10 |   (:import (java.io BufferedReader)))
 11 | 
 12 | (defrecord AbsSeqReader [event path rst completed? checkpoint? offset source]
 13 |   ;; Onyx input plugin emitting one segment {:id n :file f :d v} per group.
 14 |   ;; Groups come from the files under `path`, or from the keys of `source` when
 15 |   ;; `path` is nil. `rst`, `completed?` and `offset` are used as volatiles.
 16 |   p/Plugin
 17 | 
 18 |   (start [this event]
 19 |     ;; nothing to set up
 20 |     this)
 21 | 
 22 |   (stop [this event]
 23 |     ;; nothing to release
 24 |     this)
 25 | 
 26 |   p/Checkpointed
 27 |   (checkpoint [this]
 28 |     ;; NOTE: poll! does not advance `offset`, so this is the value set by recover!
 29 |     (when checkpoint? @offset))
 30 | 
 31 |   (recover!
 32 |    [this _ checkpoint]
 33 |    ;; Rebuilds the lazy segment sequence and, when a checkpoint is given, skips
 34 |    ;; that many segments.
 35 |    (vreset! completed? false)
 36 | 
 37 |    (let [;; Ids are assigned with map-indexed. The earlier code created its
 38 |          ;; counter with `def` inside this method, i.e. as one namespace-global
 39 |          ;; var: every recover! call re-bound it, so a second call made while an
 40 |          ;; earlier lazy sequence was still being consumed restarted that
 41 |          ;; sequence's numbering.
 42 |          ;; NOTE(review): clojure.core/read-string honours #=() reader eval
 43 |          ;; unless *read-eval* is false. If group keys can carry untrusted
 44 |          ;; text, clojure.edn/read-string would be the safer parser -- confirm.
 45 |          data (if (nil? path)
 46 |                 (map-indexed (fn [idx key]
 47 |                                {:id idx :file key :d (read-string key)})
 48 |                              (.getKeys source))
 49 |                 ;; on disk: the decoded file name is parsed; `rest` skips the
 50 |                 ;; first element of file-seq, which is the directory itself
 51 |                 (map-indexed (fn [idx file]
 52 |                                {:id idx :file file :d (read-string (u/decode-str (.getName file)))})
 53 |                              (rest (file-seq (java.io/file path)))))]
 54 |      (if (nil? checkpoint)
 55 |        (do
 56 |          (vreset! rst data)
 57 |          (vreset! offset 0))
 58 |        (do
 59 |          (info "clojask.aggregate.aggre-input is recovering state by dropping" checkpoint "elements.")
 60 |          (vreset! rst (drop checkpoint data))
 61 |          (vreset! offset checkpoint)))))
 62 | 
 63 |   (checkpointed! [this epoch])
 64 | 
 65 |   p/BarrierSynchronization
 66 |   (synced? [this epoch]
 67 |     true)
 68 | 
 69 |   (completed? [this]
 70 |     ;; true once poll! has run out of segments
 71 |     @completed?)
 72 | 
 73 |   p/Input
 74 |   (poll! [this _ _]
 75 |     ;; Hands out the next segment and advances `rst`. When the sequence is
 76 |     ;; exhausted it marks the reader completed and returns nil.
 77 |     ;; `offset` is left untouched here.
 78 |     (if-let [seg (first @rst)]
 79 |       (do
 80 |         (vswap! rst rest)
 81 |         seg)
 82 |       (do
 83 |         (vreset! completed? true)
 84 |         nil))))
 85 | 
 86 | (defn inject-dataframe ; stores its arguments in the namespace-level vars `df` and `source`
 87 |   [dataframe _source]
 88 |   (def df dataframe)
 89 |   (def source _source)) ; `source` is what `input` passes to the reader as :source
 90 | 
 91 | (defn input [{:keys [onyx.core/task-map] :as event}]
 92 |   ;; Builds an AbsSeqReader for the task described by `event`. Checkpointing is
 93 |   ;; on unless the task map sets :seq/checkpoint? to false; :source is the
 94 |   ;; namespace var assigned by inject-dataframe.
 95 |   (let [checkpointing (not (false? (:seq/checkpoint? task-map)))
 96 |         reader-path (:buffered-reader/path event)]
 97 |     (map->AbsSeqReader {:event event
 98 |                         :path reader-path
 99 |                         :rst (volatile! nil)
100 |                         :completed? (volatile! false)
101 |                         :checkpoint? checkpointing
102 |                         :offset (volatile! nil)
103 |                         :source source})))
104 | 
105 | (def reader-calls ; empty map; presumably the lifecycle-calls map for this reader -- confirm
106 |   {})
107 | 
108 | (defn inject-lifecycle-seq ; returns {:seq/seq v}, v being the lifecycle's :seq/sequential value
109 |   [_ lifecycle]
110 |   {:seq/seq (:seq/sequential lifecycle)})
111 | 
112 | (def inject-seq-via-lifecycle ; registers inject-lifecycle-seq as the :lifecycle/before-task-start hook
113 |   {:lifecycle/before-task-start inject-lifecycle-seq})


--------------------------------------------------------------------------------
/src/main/clojure/clojask/join/outer_input.clj:
--------------------------------------------------------------------------------
  1 | (ns clojask.join.outer-input
  2 |   (:require [clojure.core.async :refer [poll! timeout chan close!]]
  3 |             [clojure.set :refer [join]]
  4 |             [onyx.plugin.protocols :as p]
  5 |             [clojure.data.csv :as csv]
  6 |             [clojask.utils :refer [filter-check]]
  7 |             [taoensso.timbre :refer [fatal info debug] :as timbre]
  8 |             [clojure.java.io :as java.io])
  9 |   (:import (java.io BufferedReader)))
 10 | 
 11 | (def mgroup-a nil) ; rebound by inject-dataframe; when non-nil its keys drive recover!
 12 | (def mgroup-b nil) ; rebound by inject-dataframe; when non-nil each emitted key is deleted from it
 13 | 
 14 | (defrecord AbsSeqReader [event path rst completed? checkpoint? offset]
 15 |   ;; Onyx input plugin emitting one segment {:id n :d s} per group key, where
 16 |   ;; s is (str key). Keys come from mgroup-a when it has been injected,
 17 |   ;; otherwise from the files under `path`.
 18 |   p/Plugin
 19 | 
 20 |   (start [this event]
 21 |     ;; nothing to set up
 22 |     this)
 23 | 
 24 |   (stop [this event]
 25 |     ;; nothing to release
 26 |     this)
 27 | 
 28 |   p/Checkpointed
 29 |   (checkpoint [this]
 30 |     ;; NOTE: poll! does not advance `offset`, so this is the value set by recover!
 31 |     (when checkpoint? @offset))
 32 | 
 33 |   (recover!
 34 |    [this _ checkpoint]
 35 |    ;; Rebuilds the lazy segment sequence and, when a checkpoint is given, skips
 36 |    ;; that many segments.
 37 |    (vreset! completed? false)
 38 | 
 39 |    (let [directory (java.io/file path)
 40 |          in-memory? (some? mgroup-a)
 41 |          files (if in-memory?
 42 |                  (.getKeys mgroup-a)
 43 |                  (rest (file-seq directory)))
 44 |          ;; Ids are assigned with map-indexed. The earlier code created its
 45 |          ;; counter with `def` inside this method, i.e. as one namespace-global
 46 |          ;; var: every recover! call re-bound it, so a second call made while an
 47 |          ;; earlier lazy sequence was still being consumed restarted that
 48 |          ;; sequence's numbering.
 49 |          data (map-indexed
 50 |                (fn [idx file]
 51 |                  ;; Lazy side effect: with both groups injected, a key is removed
 52 |                  ;; from mgroup-b at the moment its segment is realized.
 53 |                  (when (and in-memory? (some? mgroup-b))
 54 |                    (.delete mgroup-b file))
 55 |                  {:id idx :d (str file)})
 56 |                files)]
 57 |      (if (nil? checkpoint)
 58 |        (do
 59 |          (vreset! rst data)
 60 |          (vreset! offset 0))
 61 |        (do
 62 |          (info "clojask.join.outer-input is recovering state by dropping" checkpoint "elements.")
 63 |          (vreset! rst (drop checkpoint data))
 64 |          (vreset! offset checkpoint)))))
 65 | 
 66 |   (checkpointed! [this epoch])
 67 | 
 68 |   p/BarrierSynchronization
 69 |   (synced? [this epoch]
 70 |     true)
 71 | 
 72 |   (completed? [this]
 73 |     ;; true once poll! has run out of segments
 74 |     @completed?)
 75 | 
 76 |   p/Input
 77 |   (poll! [this _ _]
 78 |     ;; Hands out the next segment and advances `rst`; realizing a segment is
 79 |     ;; also what triggers the mgroup-b deletion set up in recover!.
 80 |     ;; When the sequence is exhausted the reader is marked completed and nil
 81 |     ;; is returned.
 82 |     ;; `offset` is left untouched here.
 83 |     (if-let [seg (first @rst)]
 84 |       (do
 85 |         (vswap! rst rest)
 86 |         seg)
 87 |       (do
 88 |         (vreset! completed? true)
 89 |         nil))))
 90 | 
 91 | (defn inject-dataframe ; rebinds the namespace vars mgroup-a / mgroup-b that recover! reads
 92 |   [_mgroup-a _mgroup-b]
 93 |   (def mgroup-a _mgroup-a)
 94 |   (def mgroup-b _mgroup-b))
 95 | 
 96 | (defn input [{:keys [onyx.core/task-map] :as event}]
 97 |   ;; Builds an AbsSeqReader for the task described by `event`.
 98 |   ;; Checkpointing is on unless the task map sets :seq/checkpoint? to false.
 99 |   ;; `rst` and `offset` start as empty volatiles and are filled by recover!.
100 |   (let [checkpointing (not (false? (:seq/checkpoint? task-map)))
101 |         reader-path (:buffered-reader/path event)]
102 |     (map->AbsSeqReader {:event event
103 |                         :path reader-path
104 |                         :rst (volatile! nil)
105 |                         :completed? (volatile! false)
106 |                         :checkpoint? checkpointing
107 |                         :offset (volatile! nil)})))
108 | 
109 | (def reader-calls ; empty map; presumably the lifecycle-calls map for this reader -- confirm
110 |   {})
111 | 
112 | (defn inject-lifecycle-seq ; returns {:seq/seq v}, v being the lifecycle's :seq/sequential value
113 |   [_ lifecycle]
114 |   {:seq/seq (:seq/sequential lifecycle)})
115 | 
116 | (def inject-seq-via-lifecycle ; registers inject-lifecycle-seq as the :lifecycle/before-task-start hook
117 |   {:lifecycle/before-task-start inject-lifecycle-seq})


--------------------------------------------------------------------------------
/src/main/clojure/clojask/clojask_groupby.clj:
--------------------------------------------------------------------------------
  1 | (ns clojask.clojask-groupby
  2 |   (:require [clojask.groupby :refer [output-groupby]]
  3 |             [onyx.peer.function :as function]
  4 |             [onyx.plugin.protocols :as p]
  5 |             [clojure.set :as set]
  6 |             [taoensso.timbre :refer [debug info] :as timbre])
  7 |   (:import (java.io BufferedReader FileReader BufferedWriter FileWriter)))
  8 | 
  9 | (def dataframe (atom nil))    ; dataframe whose col-info is read in inject-into-eventmap
 10 | (def groupby-keys (atom nil)) ; exposed to write-batch as :clojask/groupby-keys
 11 | (def write-index (atom nil))  ; captured into the ClojaskGroupby record by `groupby`
 12 | (def output-func (atom nil))  ; never assigned in this namespace (the reset! is commented out)
 13 | (def dist (atom nil))         ; exposed to write-batch as :clojask/dist
 14 | (def format_ (atom nil))      ; dereferenced for every output-groupby call
 15 | 
 16 | (defn inject-dataframe ; stores the job settings in the namespace atoms read by the hook and the plugin
 17 |   [df groupby-key index _dist _format]
 18 |   (reset! dataframe df)
 19 |   (reset! groupby-keys groupby-key)
 20 |   (reset! write-index index)
 21 |   (reset! dist _dist)
 22 |   (reset! format_ _format) ; last form, so its result (_format) is the return value
 23 |   ;; (reset! output-func out)
 24 |   )
 25 | 
 26 | (defn- inject-into-eventmap
 27 |   [event lifecycle]
 28 |   ;; Hook registered under :lifecycle/before-task-start in writer-aggre-calls.
 29 |   ;; Returns the :clojask/* entries that write-batch destructures from its
 30 |   ;; event argument. Everything is read from the atoms filled by
 31 |   ;; inject-dataframe; neither `event` nor `lifecycle` is consulted.
 32 |   ;; The dataframe's col-info supplies the key index and the formatters.
 33 |   (let [col-info (.col-info @dataframe)]
 34 |     {:clojask/dist @dist
 35 |      :clojask/groupby-keys @groupby-keys
 36 |      :clojask/key-index (.getKeyIndex col-info)
 37 |      :clojask/formatter (.getFormatter col-info)}))
 38 | 
 39 | (defn- close-writer [event lifecycle] ; closes the event's :clojask/wtr; not referenced by writer-aggre-calls
 40 |   (.close (:clojask/wtr event)))
 41 | 
 42 | ;; Map of lifecycle calls that are required to use this plugin.
 43 | ;; Users will generally have to include these in their lifecycle calls
 44 | ;; when submitting the job. Only the before-task-start hook is registered.
 45 | (def writer-aggre-calls
 46 |   {:lifecycle/before-task-start inject-into-eventmap})
 47 | 
 48 | (defrecord ClojaskGroupby [write-index]
 49 |   ;; Onyx output plugin: passes every non-nil row of each incoming message to
 50 |   ;; output-groupby. `write-index` is the value the record was constructed
 51 |   ;; with.
 52 |   p/Plugin
 53 |   (start [this event]
 54 |     ;; No initial state has to be attached.
 55 |     this)
 56 | 
 57 |   (stop [this event]
 58 |     ;; The record holds no resources of its own, so there is nothing to clean
 59 |     ;; up here.
 60 |     this)
 61 | 
 62 |   p/Checkpointed
 63 |   ;; The three checkpointing methods below are deliberately empty and return
 64 |   ;; nil.
 65 |   (checkpoint [this])
 66 | 
 67 |   (recover! [this replica-version checkpoint])
 68 | 
 69 |   (checkpointed! [this epoch])
 70 | 
 71 |   p/BarrierSynchronization
 72 |   (synced? [this epoch]
 73 |     ;; Always reports true.
 74 |     true)
 75 | 
 76 |   (completed? [this]
 77 |     ;; Always reports true.
 78 |     true)
 79 | 
 80 |   p/Output
 81 |   (prepare-batch [this event replica messenger]
 82 |     ;; No preparation happens before write-batch; the method only returns
 83 |     ;; true.
 84 |     true)
 85 | 
 86 |   (write-batch [this {:keys [onyx.core/write-batch clojask/dist clojask/groupby-keys clojask/key-index clojask/formatter]} replica messenger]
 87 |     ;; Sends the batch to output-groupby, one row at a time.
 88 |     ;; The destructured :clojask/* keys are the ones returned by
 89 |     ;; inject-into-eventmap. Inside this method `write-batch` names the
 90 |     ;; destructured batch, `write-index` is the record field and `format_` is
 91 |     ;; the namespace atom, which is dereferenced for every row.
 92 |     ;; Always returns true.
 93 |     (doseq [msg write-batch
 94 |             ;; each message carries its rows under :d
 95 |             data (:d msg)
 96 |             ;; nil rows are skipped
 97 |             :when (some? data)]
 98 |       (output-groupby dist data groupby-keys
 99 |                       key-index formatter
100 |                       write-index
101 |                       @format_))
102 |     ;; the result does not depend on the batch contents
103 |     true))
104 | 
105 | ;; Builder function for the output plugin.
106 | ;; Instantiates a ClojaskGroupby record.
107 | ;; It is highly recommended to inject and pre-calculate frequently used data
108 | ;; from the task map here, in order to improve the performance of the plugin.
109 | ;; Currently `pipeline-data` is ignored and only the write-index atom is read.
110 | (defn groupby [pipeline-data]
111 |   (->ClojaskGroupby (deref write-index)))


--------------------------------------------------------------------------------
/src/main/clojure/clojask/classes/MGroup.clj:
--------------------------------------------------------------------------------
  1 | (ns clojask.classes.MGroup
  2 |   (:require [clojure.set :as set]
  3 |             [clojask.utils :as u])
  4 |   (:import [com.clojask.exception ExecutionException]))
  5 | 
  6 | (definterface MGroupIntf ; implemented by MGroup, MGroupJoin and MGroupJoinOuter in this file
  7 |   (final [])
  8 |   (getKeys [])
  9 |   (exists [key])
 10 |   (write [key msg write-index formatter] "mimic a bufferedwriter, add a row to a group")
 11 |   (getKey [key]))
 12 | 
 13 | (definterface MGroupJoinIntf ; implemented by MGroupJoinOuter, where only `delete` has an active body
 14 |   (getKeyBoth [key])
 15 |   (delete [key]))
 16 | 
 17 | (deftype MGroup
 18 |          [^:unsynchronized-mutable groups]
 19 |   ;; `write` uses assoc!/conj!, so `groups` is handled as a transient map from
 20 |   ;; group key to a transient vector of rows until `final` freezes the map.
 21 |   MGroupIntf
 22 |   (final
 23 |    [this]
 24 |    (set! groups (persistent! groups))) ; group vectors stay transient until getKey
 25 | 
 26 |   (getKeys
 27 |    [this]
 28 |    (keys groups))
 29 | 
 30 |   (exists
 31 |    [this key]
 32 |    ;; True when a group has been written for `key`; the previous (nil? ...) test
 33 |    ;; returned the opposite of what MGroupJoin/MGroupJoinOuter report.
 34 |    (some? (get groups key)))
 35 | 
 36 |   (write
 37 |    [this key msg write-index formatter] ; `formatter` is not used by this type
 38 |    (if-let [group (get groups key)]
 39 |      (set! groups (assoc! groups key (conj! group (u/gets msg write-index))))
 40 |      (set! groups (assoc! groups key (transient [(u/gets msg write-index)])))))
 41 | 
 42 |   (getKey ; NOTE(review): persistent! can run once per transient, so a key can presumably be read only once
 43 |    [this key]
 44 |    (persistent! (get groups key))))
 44 | 
 45 | (deftype MGroupJoin
 46 |          [^:unsynchronized-mutable groups
 47 |           ^:volatile-mutable _keys
 48 |           rolling]
 49 |   ;; Join-side group store. While writing, `groups` (key -> transient vector of
 50 |   ;; rows) and `_keys` (key -> 1) are both updated with transient operations.
 51 |   ;; With `rolling` truthy each stored row is the pair
 52 |   ;; [formatted-fields raw-fields]; otherwise only the formatted fields are kept.
 53 |   MGroupIntf
 54 | 
 55 |   (final
 56 |     [this]
 57 |     ;; Freezes every group vector, then the outer map. `_keys` is frozen only to
 58 |     ;; enumerate it and is turned back into a transient afterwards.
 59 |     (let [frozen-keys (persistent! _keys)]
 60 |       (doseq [k (keys frozen-keys)]
 61 |         (set! groups (assoc! groups k (persistent! (get groups k)))))
 62 |       (set! _keys (transient frozen-keys)))
 63 |     (set! groups (persistent! groups)))
 64 | 
 65 |   (getKeys
 66 |     [this]
 67 |     ;; NOTE(review): `keys` needs a persistent map, so presumably called after `final`
 68 |     (keys groups))
 69 | 
 70 |   (exists
 71 |     [this key]
 72 |     ;; membership is answered from `_keys`, not from `groups`
 73 |     (contains? _keys key))
 74 | 
 75 |   (write
 76 |     [this key msg write-index formatter]
 77 |     ;; Appends one row to the group of `key`; a key seen for the first time is
 78 |     ;; also recorded in `_keys`.
 79 |     (let [formatted (u/gets-format msg write-index formatter)
 80 |           row (if rolling [formatted (u/gets msg write-index)] formatted)]
 81 |       (if-let [group (get groups key)]
 82 |         (set! groups (assoc! groups key (conj! group row)))
 83 |         (do
 84 |           (set! groups (assoc! groups key (transient [row])))
 85 |           (set! _keys (assoc! _keys key 1))))))
 86 | 
 87 |   (getKey
 88 |     [this key]
 89 |     ;; Returns the stored vector as it is; `final` is what makes the vectors
 90 |     ;; persistent.
 91 |     (get groups key))
 92 | )
 93 | 
 94 | 
 95 | (deftype MGroupJoinOuter
 96 |          [^:unsynchronized-mutable groups
 97 |           ;; ^:unsynchronized-mutable unformat-groups
 98 |           ^:volatile-mutable _keys
 99 |           rolling]
100 |   MGroupIntf
101 |   (final
102 |     [this]
103 |     (set! _keys (persistent! _keys)) ; only `_keys` is frozen here; `groups` and its vectors are left as they are
104 |   ;;  (let [tmp-keys (persistent! _keys)]
105 |   ;;     ;; (if rolling
106 |   ;;     ;;   (doseq [key (keys tmp-keys)]
107 |   ;;     ;;     (set! groups (assoc! groups key (persistent! (get groups key))))
108 |   ;;     ;;     (set! unformat-groups (assoc! unformat-groups key (persistent! (get unformat-groups key)))))
109 |   ;;    (doseq [key (keys tmp-keys)]
110 |   ;;      (set! groups (assoc! groups key (persistent! (get groups key)))))
111 |   ;;     ;; )
112 |   ;;    (set! _keys (transient tmp-keys)))
113 |   ;;  (set! groups (persistent! groups))
114 |   ;;  (println rolling)
115 |   ;;  (println groups)
116 |     ;; (set! unformat-groups (persistent! unformat-groups))
117 |     )
118 | 
119 |   (getKeys
120 |     [this]
121 |     (keys _keys)) ; NOTE(review): `keys` needs a persistent map, so presumably called after `final` -- confirm
122 | 
123 |   (exists
124 |     [this key]
125 |     (contains? groups key)) ; answered from `groups`, so unaffected by `delete`
126 | 
127 |   (write
128 |     [this key msg write-index formatter]
129 |     (if-let [group (get groups key)]
130 |       (set! groups (assoc! groups key (conj! group (u/gets-format msg write-index formatter))))
131 |       (set! groups (assoc! groups key (transient [(u/gets-format msg write-index formatter)]))))
132 |     (set! _keys (assoc! _keys key 1))) ; the key is (re)recorded on every write; `rolling` is not read
133 |   
134 |   (getKey
135 |     [this key]
136 |     (persistent! (get groups key))) ; freezes and returns the group's vector
137 | 
138 |   MGroupJoinIntf
139 |   ;; (getKeyBoth
140 |   ;;   [this key]
141 |   ;;   (if (.exists this key) (get unformat-groups key)))
142 | 
143 |   (delete
144 |     [this key]
145 |     (set! _keys (dissoc! _keys key)))) ; NOTE(review): dissoc! is a transient op, so presumably called before `final` -- confirm
146 | 


--------------------------------------------------------------------------------
/src/main/clojure/clojask/debug.clj:
--------------------------------------------------------------------------------
  1 | (ns clojask.debug
  2 |   (:require [clojask.dataframe :refer :all]
  3 |             [clojask.utils :as u]
  4 |             [clojask.groupby :refer :all]
  5 |             [clojask.sort :as sort]
  6 |             [clojask.api.aggregate :as agg]
  7 |             [clojask.api.gb-aggregate :as gb-agg]
  8 |             [clojure.string :as str]
  9 |             [clojask.extensions.bind :refer :all]
 10 |             [clojask.extensions.reshape :refer :all])
 11 |   (:refer-clojure :exclude [group-by filter dedupe sort]))
 12 | "For debugging purposes only, will not be used in production."
 13 | 
 14 | (defn -main ; scratch entry point: runs two hard-coded experiments, both writing to resources/test.csv
 15 |   []
 16 |   ;(def x "Hello world")
 17 |   ;(-> (clojure.core/format "Expression '%s' not defined." x)(MyOwnException.)(throw))
 18 | 
 19 |   (def x (dataframe "./resources/Employees.csv" :have-col true)) ; `def` here creates namespace-level vars (x, output-df, y)
 20 |   ;; (set-type x "Employee" "double")
 21 |   ;; (group-by x ["Department"])
 22 |   (aggregate x agg/min ["Employee"])
 23 |   (print-df x)
 24 |   ;; (def y (dataframe "resources/Employees-info.csv" :have-col true))
 25 |   ;; (def z (left-join x y ["Employee"] ["Employee"]))
 26 |   ;(time (compute z 8 "resources/test.csv" :select ["1_Employee" "2_EmployeeName"] :exception true))
 27 |   (def output-df (compute x 8 "resources/test.csv" :exception true))
 28 |   ;(compute z 8 "resources/test.csv" :exception true)
 29 |   ;(time (compute x 8 "resources/test.csv" :select ["new-employee"] :exception true))
 30 | 
 31 |   ;(time (rolling-join-forward x y ["EmployeeName"] ["EmployeeName"] "Employee" "Employee" 8 "resources/test.csv" :exception false))
 32 | 
 33 |   ;(select-col y ["Salary" "EmployeeName"])
 34 |   ;(delete-col y ["Salary" "EmployeeName"])
 35 |   ;(print-df y)
 36 | 
 37 |   ;(println (.getKeys (.col-info y)))
 38 |   ;(set-type y "Salary" "double")
 39 |   ;(set-type y "EmployeeName" "double") ;; gives exception
 40 | 
 41 |   ;(operate y "Salary" (fn [x] (+ 10 x)))
 42 |   ;(operate y "Salary" (fn [] 2)) ;; gives exception
 43 | 
 44 |   ;(operate y str ["Employee" "Salary"] "new-col")
 45 |   ;(operate y ["Employee" "Salary"] "new-col" (fn [] 2)) ;; gives exception
 46 | 
 47 |   ;(print-df y)
 48 |   ;(filter y "Salary" (fn [salary] (<= salary 800)))
 49 |   ;(set-parser y "Department" #(Double/parseDouble %))
 50 | 
 51 |   ;(delete-col y ["Salary" "Department"])
 52 |   ;(println (col-names y))
 53 | 
 54 |   ;; (group-by y ["Department" "Employee"])
 55 |   ;; (aggregate y min ["Employee"] ["new-employee"])
 56 |   ;; (rename-col y ["Employee" "Department-A" "EmployeeName" "Salary"])
 57 | 
 58 |   ;; (set-type y "Department" "double")
 59 |   ;; (set-parser y "Salary" #(Double/parseDouble %))
 60 |   ;; (operate y - "Department")
 61 |   ;; (operate y str ["Employee" "Salary"] "new-col")
 62 | 
 63 |   ;(time (compute y 8 "resources/test.csv" :exception true :order true))
 64 | 
 65 |   ;; (-> (dataframe "resources/Employees-large.csv" :have-col true)
 66 |   ;;     (set-type "Salary" "double")
 67 |   ;;     (filter "Salary" (fn [salary] (<= salary 800)))
 68 |   ;;     (set-type "Department" "double")
 69 |   ;;     (operate - "Department")
 70 |   ;;     (operate str ["Employee" "Salary"] "new-col")
 71 |   ;;     (group-by ["Department"])
 72 |   ;;     (aggregate min ["Employee"] ["new-employee"])
 73 |   ;;     (compute 4 "resources/test.csv" :exception true :order true)
 74 |   ;;     time)
 75 |   
 76 |   ;; (println (.getKeys (.col-info y)))
 77 |   ;; ;(println "Renaming columns...")
 78 |   ;; (filter y ["Salary" "Department"] (fn [salary] (<= salary 800)))
 79 |   ;; (filter y ["Salary" "Department"] (fn [salary] (<= salary 800)))
 80 |   ;; (group-by y "Department")
 81 |   ;; (aggregate y aggre-avg ["Department" "Salary"] ["dept-avg" "salary-avg"])
 82 |   ;; (reorder-col y ["Employee" "Department-x" "EmployeeName" "Salary"])
 83 |   ;; (.reorderCol (.row-info y) (.getDesc (.col-info y)) ["Employee" "Department" "EmployeeName" "Salary"])
 84 |   ;; (println (.getKeys (.col-info y)))
 85 |   
 86 |   ;; Benchmarking
 87 | 
 88 |   ;(def y (dataframe "../clojure-datasets/data-CRSP.csv"))
 89 |   (def y (dataframe "../clojure-datasets/data-Compustat-lohi.csv")) ; the dataset is read from this path outside the project directory
 90 |   (set-type y "prccq" "double")
 91 |   ;(operate y (fn [val] (if val (+ val 10.0) 0.0)) "prccq")
 92 |   ;(operate y str ["datadate" "tic"] "new-col")
 93 |   (group-by y "tic")
 94 |   (aggregate y gb-agg/max ["prccq"] ["prccq-max"])
 95 |   (time (compute y 4 "resources/test.csv" :select ["tic" "prccq-max"] :exception false)) ; same output file as the first compute above
 96 |   ;(time (compute y 4 "resources/test.csv" :select ["datadate" "TICKER" "prccq"] :exception false))
 97 | 
 98 |   ;; CRSP Benchmarking
 99 | 
100 |   ;(def x (dataframe "../clojure-datasets/data-CRSP.csv"))
101 |   ;(def x (dataframe "resources/CRSP-extract.csv"))
102 |   ;(def y (dataframe "../clojure-datasets/data-Compustat-lohi.csv"))
103 | 
104 |   ; join on (TIC, DATE)
105 |   ;(time (rolling-join-forward x y ["TICKER"] ["tic"] "date" "datadate" 4 "resources/test.csv" :exception false))
106 |   ;(time (inner-join x y ["date" "TICKER"] ["datadate" "TICKER"] 4 "resources/test.csv" :exception false))
107 |   )


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Clojask
  2 | > Clojure data processing framework with parallel computing on larger-than-memory datasets
  3 | 
  4 | ### Features
  5 | 
  6 | - **Unlimited Size**
  7 | 
  8 |   It supports datasets larger than memory.
  9 | 
 10 | - **Various Operations**
 11 | 
 12 |   Although Clojask, like NoSQL databases, is designed for larger-than-memory datasets, it does not sacrifice common operations on relational dataframes, such as [group by](https://clojure-finance.github.io/clojask-website/posts-output/API/#group-by), [aggregate](https://clojure-finance.github.io/clojask-website/posts-output/API/#aggregate), and [join](https://clojure-finance.github.io/clojask-website/posts-output/API/#inner-join--left-join--right-join).
 13 | 
 14 | - **Fast**
 15 | 
 16 |   Faster than Dask in most operations, and the larger the dataframe is, the bigger the advantage. Please find the benchmarks [here](https://clojure-finance.github.io/clojask-website/pages-output/about/#benchmarks).
 17 | 
 18 | - **All Native Types**
 19 | 
 20 |   All the datatypes used to store data are native Clojure (or Java) types.
 21 | 
 22 | - **From File to File**
 23 | 
 24 |   Integrate IO inside the dataframe. No need to write your own read-in and output functions.
 25 | 
 26 | - **Parallel**
 27 | 
 28 |   Most operations can be executed across multiple threads or even multiple machines. See [Onyx](http://www.onyxplatform.org/) for the underlying principle.
 29 | 
 30 | - **Lazy Operations**
 31 | 
 32 |   Most operations are not executed immediately. The dataframe intelligently pipelines all of them together at computation time.
 33 | 
 34 | - **Few Constraints on Programming**
 35 | 
 36 |   Except for some aggregations, where you need to write customized functions that follow simple templates, operations in Clojask accept arbitrary Clojure functions as input.
 37 | 
 38 | ### Installation
 39 | 
 40 | Available on [Clojars](https://clojars.org/com.github.clojure-finance/clojask) ![Clojars Project](https://img.shields.io/clojars/v/com.github.clojure-finance/clojask.svg).
 41 | 
 42 | Insert this line into your `project.clj` if using Leiningen.
 43 | 
 44 | ```
 45 | [com.github.clojure-finance/clojask "2.0.0"]
 46 | ```
 47 | 
 48 | Insert this line into your `deps.edn` if using CLI.
 49 | 
 50 | ```clojure
 51 | com.github.clojure-finance/clojask {:mvn/version "2.0.0"}
 52 | ```
 53 | 
 54 | **Requirements:**
 55 | 
 56 | - MacOS or Linux
 57 | - Java 8 - 11
 58 | 
 59 | ### Example Usage
 60 | 
 61 | 1. Import `Clojask`
 62 | 
 63 |    ```clojure
 64 |    (require '[clojask.dataframe :as ck])
 65 |    ```
 66 | 
 67 | 2. Initialize a dataframe
 68 | 
 69 |    ```clojure
 70 |    (def df (ck/dataframe "Employees-example.csv"))
 71 |    ```
 72 | 
 73 |    The source file can be found [here](https://github.com/clojure-finance/clojask/blob/1.x.x/test/clojask/Employees-example.csv).
 74 | 
 75 |    See [`dataframe`](https://clojure-finance.github.io/clojask-website/posts-output/API/#dataframe)
 76 | 
 77 | 3. Preview the first few lines of the dataframe
 78 | 
 79 |    ```clojure
 80 |    (ck/print-df df)
 81 |    ```
 82 | 
 83 |    ![image-20220405210757274](docs/img/image-20220405210757274.png)
 84 | 
 85 |    See [`print-df`](https://clojure-finance.github.io/clojask-website/posts-output/API/#print-df)
 86 | 
 87 | 4. Change the data type of some columns
 88 | 
 89 |    ```clojure
 90 |    (ck/set-type df "Salary" "double")
 91 |    (ck/set-type df "UpdateDate" "date:yyyy/MM/dd")
 92 |    (ck/print-df df)
 93 |    ```
 94 | 
 95 |    ![image-20220405210826777](docs/img/image-20220405210826777.png)
 96 | 
 97 |    See [`set-type`](https://clojure-finance.github.io/clojask-website/posts-output/API/#set-type)
 98 | 
 99 | 5. Add 100 to Bob's salary and store the result in a new column `NewSalary`
100 | 
101 |    ```clojure
102 |    (ck/operate df (fn [EmployeeName Salary] (if (= EmployeeName "Bob") (+ Salary 100) Salary)) ["EmployeeName" "Salary"] "NewSalary")
103 |    (ck/print-df df)
104 |    ```
105 | 
106 |    ![image-20220405211348723](docs/img/image-20220405211348723.png)
107 | 
108 |    See [`operate`](https://clojure-finance.github.io/clojask-website/posts-output/API/#operate-in-place-modification)
109 | 
110 | 6. Output the resultant dataset to "result.csv" (Use 8 threads)
111 | 
112 |    ```clojure
113 |    (ck/compute df 8 "result.csv" :select ["Employee" "EmployeeName" "Department" "NewSalary" "UpdateDate"])
114 |    ```
115 | 
116 |    See [`compute`](https://clojure-finance.github.io/clojask-website/posts-output/API/#compute)
117 | 
118 | ### Supported Functions and Procedures
119 | 
120 | ![clojask functions](docs/clojask_functions.png)
121 | 
122 | - *The solid arrows point to the fixed next step; dotted arrows point to all possible next steps.*
123 | - *Any step except for Initialization is optional.*
124 | 
125 | ### Documentation
126 | 
127 | The detailed documentation for every API can be found [here](https://clojure-finance.github.io/clojask-website/posts-output/API/).
128 | 
129 | ### Examples
130 | 
131 | A separate repository for some typical usage of Clojask can be found [here](https://github.com/clojure-finance/clojask-examples).
132 | 
133 | ### Problem Feedback
134 | 
135 | If your question is not answered in existing [issues](https://github.com/clojure-finance/clojask/issues), feel free to create a new one.
136 | 


--------------------------------------------------------------------------------
/src/main/clojure/clojask/clojask_output.clj:
--------------------------------------------------------------------------------
  1 | (ns clojask.clojask-output
  2 |   (:require [onyx.peer.function :as function]
  3 |             [onyx.plugin.protocols :as p]
  4 |             [clojure.java.io :as io]
  5 |             [taoensso.timbre :refer [debug info] :as timbre]
  6 |             [clojure.string :as string]
  7 |             [clojure-heap.core :as heap]
  8 |             [clojure.set :as set]
  9 |             [clojask.join.outer-output :as output])
 10 |   (:import (java.io BufferedReader FileReader BufferedWriter FileWriter)))
 11 | 
 12 | (def df (atom nil)) ; dataframe supplied through inject-dataframe
 13 | (def output-func (atom nil)) ; output function supplied through inject-dataframe
 14 | 
 15 | (defn inject-dataframe ; stores the dataframe and the output function in the atoms `df` and `output-func`
 16 |   [dataframe out]
 17 |   (reset! df dataframe)
 18 |   (reset! output-func out)
 19 |   )
 20 | 
 21 | (defn- inject-into-eventmap
 22 |   [event lifecycle]
 23 |   ;; Lifecycle hook (see writer-calls): opens the file named by the lifecycle's
 24 |   ;; :buffered-wtr/filename in append mode and returns the writer together with
 25 |   ;; the lifecycle's :order value. close-writer closes the writer again.
 26 |   ;; The unused `indices` and `formatter` locals were dropped: `formatter` was
 27 |   ;; computed after the writer was opened, so a failure there leaked the writer.
 28 |   {:clojask/wtr (io/writer (:buffered-wtr/filename lifecycle) :append true)
 29 |    :clojask/order (:order lifecycle)})
 30 | 
 31 | (defn- close-writer [event lifecycle]  ; lifecycle hook: releases the task's output writer
 32 |   (-> event :clojask/wtr .close))
 33 | 
 34 | (defn- write-msg
 35 |   "Writes every non-nil row found under :d in `msg`.
 36 |   Each row is passed through `melt` and the result is handed to
 37 |   `output-func` together with the writer `wtr`. Returns nil."
 38 |   [wtr msg melt output-func]
 39 |   (let [rows (:d msg)]
 40 |     (doseq [row rows
 41 |             :when (some? row)]
 42 |       (output-func wtr (melt row)))))
 43 | 
 44 | 
 45 | (defn- order-write
 46 |   "Writes `msg` only when its :id equals the id expected next (`exp-id`, an atom).
 47 |   Otherwise the message is added to `heap`. After an in-order write, messages
 48 |   at the top of `heap` are polled and written for as long as their :id
 49 |   matches the (incremented) expected id."
 50 |   [wtr msg heap exp-id melt output-func]
 51 |   (if-not (= (:id msg) @exp-id)
 52 |     ;; out of order: park the message until its turn comes
 53 |     (heap/add heap msg)
 54 |     (do
 55 |       (write-msg wtr msg melt output-func)
 56 |       (swap! exp-id inc)
 57 |       ;; drain every parked message that is now next in line
 58 |       (while (= (:id (heap/peek heap)) @exp-id)
 59 |         (write-msg wtr (heap/poll heap) melt output-func)
 60 |         (swap! exp-id inc)))))
 61 | 
 62 | ;; Lifecycle calls required by this output plugin: the writer is opened before
 63 | ;; the task starts and closed after it stops. Include this map in the lifecycle
 64 | ;; calls of any job that uses the plugin.
 65 | (def writer-calls
 66 |   {:lifecycle/before-task-start inject-into-eventmap
 67 |    :lifecycle/after-task-stop close-writer})
 68 | 
 69 | (def melt (atom nil)) ; function applied to each row before it is written
 70 | 
 71 | (defn inject-melt
 72 |   "Stores `tmp` as the melt function used when the output plugin is built."
 73 |   [tmp] (reset! melt tmp))
 74 | 
 75 | (defrecord ClojaskOutput [melt heap exp-id output]
 76 |   p/Plugin
 77 |   (start [this event]
 78 |     ;; No start-up work: the record is returned unchanged.
 79 |     this)
 80 | 
 81 |   (stop [this event]
 82 |     ;; Messages still parked in the heap when the task stops were never
 83 |     ;; written, so the job fails loudly instead of dropping them silently.
 84 |     (let [pending (heap/get-size heap)]
 85 |       (when (not= pending 0)
 86 |         (throw (Exception. (str "The order enforcement failed. " pending " rows have been shuffled or missing.")))))
 87 |     this)
 88 | 
 89 |   p/Checkpointed
 90 |   ;; The checkpoint hooks are deliberately empty: this output plugin keeps
 91 |   ;; no checkpoint state.
 92 |   (checkpoint [this])
 93 | 
 94 |   ;; Empty on purpose, see the note on checkpoint above
 95 |   ;; (nothing to restore).
 96 |   (recover! [this replica-version checkpoint])
 97 | 
 98 |   ;; Empty on purpose, see the note on checkpoint above
 99 |   ;; (nothing to acknowledge).
100 |   (checkpointed! [this epoch])
101 | 
102 |   p/BarrierSynchronization
103 |   (synced? [this epoch]
104 |     ;; Unconditionally true: this plugin tracks no asynchronous writes that
105 |     ;; would have to finish first.
106 |     true)
107 | 
108 |   (completed? [this]
109 |     ;; Unconditionally true, for the same reason as synced?: there is no
110 |     ;; pending asynchronous work to wait for.
111 |     true)
112 | 
113 |   p/Output
114 |   (prepare-batch [this event replica messenger]
115 |     ;; Nothing is prepared ahead of write-batch; the hook only signals
116 |     ;; success.
117 |     true)
118 | 
119 |   (write-batch [this {:keys [onyx.core/write-batch  clojask/wtr clojask/order]} replica messenger]
120 |     ;; Every message of the batch is written to the writer taken from the
121 |     ;; event map. With the task's :order flag set, messages go through
122 |     ;; order-write, which emits them by consecutive :id; otherwise they are
123 |     ;; passed straight to write-msg.
124 |     (let [emit (if order
125 |                  (fn [msg] (order-write wtr msg heap exp-id melt output))
126 |                  (fn [msg] (write-msg wtr msg melt output)))]
127 |       (doseq [msg write-batch]
128 |         (emit msg)))
129 |     ;; always report the batch as written
130 |     true))
131 | 
132 | ;; Builder for the output plugin: instantiates a ClojaskOutput record from the
133 | ;; melt and output functions previously injected into this namespace.
134 | (defn output [pipeline-data]
135 |   (let [by-id   (fn [a b] (<= (:id a) (:id b))) ; comparator on message :id
136 |         parked  (heap/heap by-id)               ; holds out-of-order messages
137 |         next-id (atom 0)]                       ; :id expected next; starts at 0
138 |     (->ClojaskOutput (deref melt) parked next-id (deref output-func))))


--------------------------------------------------------------------------------
/src/main/clojure/clojask/clojask_join.clj:
--------------------------------------------------------------------------------
  1 | (ns clojask.clojask-join
  2 |   (:require [clojask.join :as join]
  3 |             [clojure.set :as set]
  4 |             [onyx.peer.function :as function]
  5 |             [clojure.java.io :as io]
  6 |             [onyx.plugin.protocols :as p]
  7 |             [taoensso.timbre :refer [debug info] :as timbre])
  8 |   (:import (java.io BufferedReader FileReader BufferedWriter FileWriter)))
  9 | 
 10 | (def a (atom nil))
 11 | (def b (atom nil))
 12 | (def a-keys (atom nil))
 13 | (def b-keys (atom nil))
 14 | (def a-index (atom nil))
 15 | (def b-index (atom nil))
 16 | (def b-format (atom nil))
 17 | (def join-index (atom nil))
 18 | (def output-func (atom nil))
 19 | 
 20 | (defn inject-dataframe
 21 |   "Stores both dataframes, their join keys, the selected column indices,
 22 |   the join index, b's format and the writer function `out` in this
 23 |   namespace's atoms. Returns `out`."
 24 |   [d-a d-b a-key b-key -a-index -b-index -join-index -b-format out]
 25 |   (doseq [[slot value] [[a d-a] [b d-b] [a-keys a-key] [b-keys b-key]
 26 |                         [a-index -a-index] [b-index -b-index]
 27 |                         [b-format -b-format] [join-index -join-index]]]
 28 |     (reset! slot value))
 29 |   ;; kept as the final form so the function still returns `out`
 30 |   (reset! output-func out))
 31 | 
 32 | (defn- inject-into-eventmap
 33 |   "Lifecycle hook run before the join task starts.
 34 |   Opens an appending writer on the task's output file and returns the
 35 |   entries the join plugin reads from the event map: the writer, both key
 36 |   lists, both formats, and the roll / map / join-type settings."
 37 |   [event lifecycle]
 38 |   (let [sink  (io/writer (:buffered-wtr/filename lifecycle) :append true)
 39 |         fmt-a (.getFormatter (.col-info @a))
 40 |         fmt-b @b-format
 41 |         ;; settings copied verbatim from the lifecycle entry (nil when absent)
 42 |         copied (into {}
 43 |                      (map (fn [k] [k (get lifecycle k)]))
 44 |                      [:clojask/a-roll
 45 |                       :clojask/b-roll
 46 |                       :clojask/a-map
 47 |                       :clojask/b-map
 48 |                       :clojask/join-type])]
 49 |     ;; the join keys and formats come from this namespace's atoms
 50 |     (assoc copied
 51 |            :clojask/wtr sink
 52 |            :clojask/a-keys @a-keys
 53 |            :clojask/b-keys @b-keys
 54 |            :clojask/a-format fmt-a
 55 |            :clojask/b-format fmt-b)))
 56 | 
 57 | (defn- close-writer [event lifecycle]  ; after-task-stop hook: closes the join writer
 58 |   (let [wtr (:clojask/wtr event)] (.close wtr)))
 59 | 
 60 | ;; Lifecycle calls required by the join output plugin: the writer is opened
 61 | ;; before the task starts and closed after it stops. Include this map in the
 62 | ;; lifecycle calls of any job that uses the plugin.
 63 | (def writer-join-calls
 64 |   {:lifecycle/before-task-start inject-into-eventmap
 65 |   :lifecycle/after-task-stop close-writer})
 66 | 
 67 | (defrecord ClojaskJoin [a-index b-index join-index write-func]
 68 |   p/Plugin
 69 |   (start [this event]
 70 |     ;; No start-up work: the record is returned unchanged.
 71 |     this)
 72 | 
 73 |   (stop [this event]
 74 |     ;; Nothing to clean up here: the writer is closed by the
 75 |     ;; after-task-stop lifecycle call (see writer-join-calls), so the
 76 |     ;; record is returned unchanged.
 77 |     this)
 78 | 
 79 |   p/Checkpointed
 80 |   ;; The checkpoint hooks are deliberately empty: this output plugin keeps
 81 |   ;; no checkpoint state.
 82 |   (checkpoint [this])
 83 | 
 84 |   ;; Empty on purpose, see the note on checkpoint above
 85 |   ;; (nothing to restore).
 86 |   (recover! [this replica-version checkpoint])
 87 | 
 88 |   ;; Empty on purpose, see the note on checkpoint above
 89 |   ;; (nothing to acknowledge).
 90 |   (checkpointed! [this epoch])
 91 | 
 92 |   p/BarrierSynchronization
 93 |   (synced? [this epoch]
 94 |     ;; Unconditionally true: this plugin tracks no asynchronous writes that
 95 |     ;; would have to finish first.
 96 |     true)
 97 | 
 98 |   (completed? [this]
 99 |     ;; Unconditionally true, for the same reason as synced?: there is no
100 |     ;; pending asynchronous work to wait for.
101 |     true)
102 | 
103 |   p/Output
104 |   (prepare-batch [this event replica messenger]
105 |     ;; Nothing is prepared ahead of write-batch; the hook only signals
106 |     ;; success.
107 |     true)
108 | 
109 |   (write-batch [this {:keys [onyx.core/write-batch clojask/wtr clojask/a-keys clojask/b-keys clojask/a-roll clojask/b-roll  clojask/a-map clojask/b-map clojask/a-format clojask/b-format clojask/join]} replica messenger]
110 |     ;; Every non-nil element under :d of every message in the batch is
111 |     ;; handed to join/output-join, together with the writer and the join
112 |     ;; settings taken from the event map and from this record's fields.
113 |     ;; NOTE(review): `clojask/join` is destructured but never used, and the
114 |     ;; before-task-start hook stores the join type under :clojask/join-type,
115 |     ;; so this binding is presumably always nil.
116 |     (doseq [msg  write-batch
117 |             data (:d msg)
118 |             :when (some? data)]
119 |       (join/output-join wtr data a-keys a-map b-keys (count b-map)
120 |                         a-roll b-roll a-format b-format
121 |                         a-index b-index join-index write-func))
122 |     true))
123 | 
124 | ;; Builder for the join output plugin: instantiates a ClojaskJoin record from
125 | ;; the values previously stored in this namespace by inject-dataframe.
126 | (defn join [pipeline-data]
127 |   (->ClojaskJoin @a-index
128 |                  @b-index
129 |                  @join-index
130 |                  @output-func))


--------------------------------------------------------------------------------
/src/main/clojure/clojask/clojask_aggre.clj:
--------------------------------------------------------------------------------
  1 | (ns clojask.clojask-aggre
  2 |   (:require [onyx.peer.function :as function]
  3 |             [onyx.plugin.protocols :as p]
  4 |             [clojure.java.io :as io]
  5 |             [taoensso.timbre :refer [debug info] :as timbre]
  6 |             [clojure.string :as string]
  7 |             [clojask.api.aggregate :refer [start]]
  8 |             [clojask.utils :as u])
  9 |   (:import [java.io BufferedReader FileReader BufferedWriter FileWriter]
 10 |            [com.clojask.exception ExecutionException]))
 11 | 
 12 | (def df (atom nil))
 13 | (def aggre-func (atom nil))
 14 | (def select (atom nil))
 15 | (def output-func (atom nil))
 16 | 
 17 | (defn inject-dataframe
 18 |   "Stores the dataframe, the aggregation functions `a`, the selection `b`
 19 |   and the writer function `out` in this namespace's atoms. Returns `out`."
 20 |   [dataframe a b out]
 21 |   (doseq [[slot value] [[df dataframe] [aggre-func a] [select b]]]
 22 |     (reset! slot value))
 23 |   (reset! output-func out))
 24 | 
 25 | (defn c-count
 26 |   "Number of elements when `a` is a collection; 1 for any other value."
 27 |   [a]
 28 |   (cond (coll? a) (count a)
 29 |         :else     1))
 30 | 
 31 | (defn- inject-into-eventmap
 32 |   "Before-task-start hook: returns an appending writer on the task's output
 33 |   file plus the aggregation functions read from the dataframe's row-info."
 34 |   [event lifecycle]
 35 |   (let [path (:buffered-wtr/filename lifecycle)]
 36 |     (array-map :clojask/wtr        (io/writer path :append true)
 37 |                :clojask/aggre-func (.getAggreFunc (:row-info @df)))))
 38 | 
 39 | (defn- close-writer [event lifecycle]  ; after-task-stop hook: closes the aggregation writer
 40 |   (.close (get event :clojask/wtr)))
 41 | 
 42 | ;; Lifecycle calls required by the aggregation output plugin: the writer is
 43 | ;; opened before the task starts and closed after it stops. Include this map
 44 | ;; in the lifecycle calls of any job that uses the plugin.
 45 | (def writer-calls
 46 |   {:lifecycle/before-task-start inject-into-eventmap
 47 |    :lifecycle/after-task-stop close-writer})
 48 | 
 49 | (defrecord ClojaskOutput
 50 |            [memo          ; volatile holding one running aggregate per aggregation function
 51 |             aggre-func    ; sequence of [function column-index] entries
 52 |             select        ; passed to u/gets when the final rows are written
 53 |             output-func]  ; called as (output-func writer [row])
 54 |   p/Plugin
 55 |   (start [this event]
 56 |     ;; No start-up work: the record is returned unchanged.
 57 |     this)
 58 | 
 59 |   (stop [this event]
 60 |     ;; Writes the accumulated aggregation results when the task stops.
 61 |     ;; Non-collection results in memo are wrapped in a one-element vector so
 62 |     ;; that every result can be treated as a column of values.
 63 |         (let [data (mapv (fn [_] (if (coll? _) _ [_])) (deref memo))
 64 |               wtr (:clojask/wtr event)]
 65 |           ;; all result columns must have the same length to be zipped into rows
 66 |           (if (apply = (map count data))
 67 |             (do
 68 |               (mapv
 69 |             ;; one output row per position: the i-th values of all columns, passed through u/gets with select
 70 |                (fn [msg] (output-func wtr [(u/gets msg select)])) (apply map vector data))
 71 |               (.flush wtr))
 72 |             (throw (ExecutionException. "aggregation result is not of the same length"))))
 73 |         this)
 74 | 
 75 |   p/Checkpointed
 76 |   ;; The checkpoint hooks are deliberately empty: this output plugin keeps
 77 |   ;; no checkpoint state.
 78 |   (checkpoint [this])
 79 | 
 80 |   ;; Empty on purpose, see the note on checkpoint above
 81 |   ;; (nothing to restore).
 82 |   (recover! [this replica-version checkpoint])
 83 | 
 84 |   ;; Empty on purpose, see the note on checkpoint above
 85 |   ;; (nothing to acknowledge).
 86 |   (checkpointed! [this epoch])
 87 | 
 88 |   p/BarrierSynchronization
 89 |   (synced? [this epoch]
 90 |     ;; Unconditionally true: this plugin tracks no asynchronous writes that
 91 |     ;; would have to finish first.
 92 |     true)
 93 | 
 94 |   (completed? [this]
 95 |     ;; Unconditionally true, for the same reason as synced?: there is no
 96 |     ;; pending asynchronous work to wait for.
 97 |     true)
 98 | 
 99 |   p/Output
100 |   (prepare-batch [this event replica messenger]
101 |     ;; Nothing is prepared ahead of write-batch; the hook only signals
102 |     ;; success.
103 |     true)
104 | 
105 |   (write-batch [this {:keys [onyx.core/write-batch clojask/wtr]} replica messenger]
106 |     ;; Nothing is written here; the batch only updates the running aggregates.
107 |     ;; For every non-nil row under :d of every message, each slot of memo is
108 |     ;; replaced by its aggregation function applied to (previous value, cell).
109 |     (let []
110 |       (doseq [msg write-batch]
111 |         ;; msg: one message of the batch; its rows are found under :d
112 |         ;; (rows that are nil are skipped below)
113 |         (doseq [data (:d msg)]
114 |           ;; data: a single row, read by position with nth
115 |           (if (not= data nil)
116 |             (let [
117 |                   ;; no local bindings are needed
118 |                   ]
119 |             ;; aggre-func holds one entry per memo slot: element 0 is the
120 | 
121 |             ;; function, element 1 the position of the cell inside the row.
122 |               (vreset! memo (doall (map-indexed (fn [ind prev] ((nth (nth aggre-func ind) 0) prev (nth data (nth (nth aggre-func ind) 1)))) (deref memo))))
123 |             ;; doall forces the lazy sequence before it is stored in the volatile
124 |               )))))
125 |     true))
126 | 
127 | ;; Builder for the aggregation output plugin.
128 | ;; Instantiates a ClojaskOutput record from the values previously stored in
129 | ;; this namespace by inject-dataframe.
130 | (defn output [pipeline-data]
131 |   (let [funcs   (deref aggre-func)
132 |         ;; one accumulator slot per aggregation function, each initialised
133 |         ;; with `start` (referred from clojask.api.aggregate)
134 |         initial (doall (take (count funcs) (repeat start)))]
135 |     (->ClojaskOutput
136 |      (volatile! initial)
137 |      funcs
138 |      (deref select)
139 |      (deref output-func))))


--------------------------------------------------------------------------------
/src/main/clojure/clojask/classes/ColInfo.clj:
--------------------------------------------------------------------------------
  1 | (ns clojask.classes.ColInfo
  2 |   (:require [clojure.set :as set]
  3 |             [clojask.utils :refer []]))
  4 | 
  5 | (import '[com.clojask.exception TypeException]
  6 |         '[com.clojask.exception OperationException])
  7 | 
  8 | (definterface ColIntf
  9 |   (init [colNames])                  ; build the name/index maps from the column names
 10 |   (operate [operation col])          ; append an operation to an existing column
 11 |   (operate [operation col newCol])   ; define newCol as operation over col (one name or several)
 12 |   (setType [operation col])          ; record a type operation for col
 13 |   (getDesc [] "get column description")
 14 |   (getType [] "get column type")
 15 |   (getKeys [] "get collection of keys")
 16 |   (getKeyIndex [] "get map with key = column name, value = index")
 17 |   (getIndexKey [] "get map with key = index, value = column name")
 18 |   (getDeletedCol [] "get indices of deleted columns")
 19 |   (setFormatter [format col])        ; record a formatter for col
 20 |   (getFormatter [])                  ; get the recorded formatters
 21 |   (delCol [col-to-del])              ; mark the given columns as deleted
 22 |   (setColInfo [new-col-set])         ; rebuild the maps for a new list of column names
 23 |   (renameColInfo [old-col new-col])  ; rename one column, keeping its index
 24 |   (copy [] "copy all the information for rollback purpose")
 25 |   (rollback [] "undo the change making use of the copied")
 26 |   (commit [])                        ; discard the copy kept for rollback
 27 |   )
 28 | 
 29 | 
 30 | (deftype ColInfo
 31 |   ;; Mutable column metadata of a dataframe. Every mutator except init snapshots the state with copy first.
 32 |          [^:unsynchronized-mutable col-keys     ; vector of column names (original columns only)
 33 |           ^:unsynchronized-mutable key-index    ; map: column name -> index
 34 |           ^:unsynchronized-mutable index-key    ; map: index -> column name
 35 |           ^:unsynchronized-mutable col-dsp      ; map: index -> [source-indices operation ...]
 36 |           ^:unsynchronized-mutable col-type     ; map: index -> type operation
 37 |           ^:unsynchronized-mutable col-format   ; map: index -> formatter
 38 |           ^:unsynchronized-mutable col-deleted  ; set of indices of deleted columns
 39 |           ^:unsynchronized-mutable hist]        ; snapshot written by copy; {} after commit
 40 | 
 41 |   ;; methods of ColIntf
 42 |   ColIntf
 43 |   ;; init: build the name/index maps; column i starts with the description [[i]].
 44 |   (init
 45 |     [this colNames]
 46 |     (set! col-keys (vec colNames))  ;; contains only the original keys
 47 |     (set! key-index (zipmap col-keys (iterate inc 0)))
 48 |     (set! index-key (zipmap (iterate inc 0) col-keys))
 49 |     (set! col-dsp (zipmap (take (count colNames) (iterate inc 0)) (map vector (map vector (iterate inc 0)))))
 50 |     (set! col-deleted (set nil)))
 51 |   ;; Plain getters: each returns the corresponding field unchanged.
 52 |   (getFormatter
 53 |     [this]
 54 |     col-format)
 55 | 
 56 |   (getDesc
 57 |     [this]
 58 |     col-dsp)
 59 | 
 60 |   (getType
 61 |     [this]
 62 |     col-type)
 63 |   ;; getKeys: column names ordered by index 0..n-1 (derived columns included).
 64 |   (getKeys
 65 |     [this]
 66 |     (mapv (fn [index] (get index-key index))
 67 |           (take (count index-key) (iterate inc 0))))
 68 | 
 69 |   (getKeyIndex
 70 |     [this]
 71 |     key-index)
 72 | 
 73 |   (getIndexKey
 74 |     [this]
 75 |     index-key)
 76 | 
 77 |   (getDeletedCol
 78 |     [this]
 79 |     col-deleted)
 80 |   ;; operate (3 args): append operation to the description of an existing column; throws when col is unknown.
 81 |   (operate
 82 |     [this operation col]
 83 |     (.copy this)
 84 |     (if (contains? key-index col)
 85 |       (do
 86 |         (set! col-dsp (assoc col-dsp (get key-index col) (conj (get col-dsp (get key-index col)) operation)))
 87 |           ;; nil is returned on success
 88 |         nil)
 89 |       (throw (OperationException. "Column name passed to operate not found"))))
 90 |   ;; operate (4 args): register newCol as operation applied to col (one name or a collection of names).
 91 |   (operate
 92 |     [this operation col newCol]
 93 |     (.copy this)
 94 |     (let [col (if (coll? col)
 95 |                 col
 96 |                 [col])
 97 |           external (vec (filter (fn [_] (not (.contains col-keys _))) col))]  ; names that are not original columns
 98 |       (if (= (count external) 0)
 99 |         (if (contains? key-index newCol)
100 |           (str newCol " is already exist")  ; a message string is returned, nothing is thrown
101 |           (do
102 |             ;; col-keys is not extended: it only lists the original columns
103 |             (set! key-index (assoc key-index newCol (count key-index)))
104 |             (set! index-key (assoc index-key (count index-key) newCol))
105 |             (set! col-dsp (assoc col-dsp (get key-index newCol) (conj [(vec (map (fn [_] (get key-index _)) col))] operation)))
106 |             ;; nil is returned on success
107 |             nil))
108 |         (do
109 |           (throw (OperationException. (str external " are not original column names")))))))
110 |   ;; setType: remember the type operation of an original column; throws when col is unknown.
111 |   (setType
112 |     [this operation col]
113 |     (.copy this)
114 |     (if (.contains col-keys col)
115 |       ;; the type operation is stored under the column's index
116 |       (do
117 |         (set! col-type (assoc col-type (get key-index col)  operation))
118 |         ;; (the column description col-dsp is not touched)
119 |         ;; nil is returned on success
120 |         nil)
121 |       (throw (OperationException. "Column name passed to setType not found"))))
122 |   ;; setFormatter: remember format under the column's index (the name is not validated here).
123 |   (setFormatter
124 |     [this format col]
125 |     (.copy this)
126 |     (set! col-format (assoc col-format (get key-index col) format)))
127 |   ;; delCol: add the indices of the given column names to the deleted set.
128 |   (delCol
129 |     [this col-to-delete]
130 |     (.copy this)
131 |     (let [col-indices (set (map key-index col-to-delete))]
132 |       (set! col-deleted (set/union col-deleted col-indices))))
133 |   ;; setColInfo: rebuild all maps for a new list of column names; new index i is described by [[old index]].
134 |   (setColInfo
135 |     [this new-col-set]
136 |     (.copy this)
137 |     (let [original-key-index (.getKeyIndex this)
138 |           new-col-dsp-vals (keep #(get original-key-index %) (distinct new-col-set))  ; old indices in new-col-set order; a select-keys map loses that order beyond 8 entries
139 |           original-type (.getType this)
140 |           original-format (.getFormatter this)]
141 |       (set! col-keys (vec new-col-set))
142 |       (set! key-index (zipmap new-col-set (iterate inc 0)))
143 |       (set! index-key (zipmap (iterate inc 0) new-col-set))
144 |       (set! col-dsp (zipmap (take (count col-keys) (iterate inc 0)) (map vector (map vector new-col-dsp-vals))))
145 |       (if (not (empty? (.getType this)))  ; NOTE(review): keys below are remapped through the new col-dsp; confirm the mapping direction
146 |         (set! col-type (zipmap (map #(first (first (get col-dsp (first %)))) original-type) (map last original-type))))
147 |       (if (not (empty? (.getFormatter this)))  ; same remapping as for col-type
148 |         (set! col-format (zipmap (map #(first (first (get col-dsp (first %)))) original-format) (map last original-format))))))
149 |   ;; renameColInfo: replace old-col by new-col in the three name maps; the column keeps its index.
150 |   (renameColInfo
151 |     [this old-col new-col]
152 |     (.copy this)
153 |     (set! col-keys (mapv (fn [_] (if (= _ old-col) new-col _)) col-keys))
154 |     (let [index (get key-index old-col)]
155 |       (set! key-index (set/rename-keys key-index {old-col new-col}))
156 |       (set! index-key (update index-key index (fn [_] new-col)))))
157 |   ;; copy: snapshot every field into hist so that rollback can restore it.
158 |   (copy
159 |     [this]
160 |     (set! hist {:col-keys col-keys
161 |                 :key-index key-index
162 |                 :index-key index-key
163 |                 :col-dsp col-dsp
164 |                 :col-type col-type
165 |                 :col-format col-format
166 |                 :col-deleted col-deleted}))
167 |   ;; rollback: restore the fields saved by copy; a no-op when nothing is saved.
168 |   (rollback
169 |    [this]
170 |    (if (seq hist)  ;; nil-safe: a nil hist used to pass (not= hist {}) and reset every field to nil
171 |      (do (set! col-keys (:col-keys hist))  ;; contains only the original keys
172 |          (set! key-index (:key-index hist))
173 |          (set! index-key (:index-key hist))
174 |          (set! col-type (:col-type hist))
175 |          (set! col-format (:col-format hist))
176 |          (set! col-dsp (:col-dsp hist))
177 |          (set! col-deleted (:col-deleted hist)))))
178 |   ;; commit: drop the snapshot, so a later rollback does nothing.
179 |   (commit
180 |    [this]
181 |    (set! hist {})))


--------------------------------------------------------------------------------
/src/main/clojure/clojask/classes/JoinedDataFrame.clj:
--------------------------------------------------------------------------------
  1 | (ns clojask.classes.JoinedDataFrame
  2 |   (:require [clojure.set :as set]
  3 |             [clojask.classes.ColInfo :refer [->ColInfo]]
  4 |             [clojask.classes.RowInfo :refer [->RowInfo]]
  5 |             [clojask.classes.DataStat :refer [->DataStat]]
  6 |             [clojask.classes.MGroup :refer [->MGroup ->MGroupJoin ->MGroupJoinOuter]]
  7 |             [clojask.classes.DataFrame :refer [->DataFrame]]
  8 |             [clojask.onyx-comps :refer [start-onyx start-onyx-aggre-only start-onyx-groupby start-onyx-join]]
  9 |             ;; [clojask.aggregate.aggre-onyx-comps :refer [start-onyx-aggre]]
 10 |             [clojask.join.outer-onyx-comps :refer [start-onyx-outer]]
 11 |             [clojure.java.io :as io]
 12 |             [clojask.utils :as u])
 13 |   (:import
 14 |    [clojask.classes.ColInfo ColInfo]
 15 |    [clojask.classes.RowInfo RowInfo]
 16 |    [clojask.classes.DataStat DataStat]
 17 |    [clojask.classes.MGroup MGroup MGroupJoin MGroupJoinOuter]
 18 |    [clojask.classes.DataFrame GenDFIntf DataFrame]
 19 |    [com.clojask.exception TypeException OperationException]))
 20 | 
 21 | ;; ===== JoinedDataFrame: the compute interface below is implemented by the JoinedDataFrame record =====
 22 | (definterface JDFIntf
 23 |   (compute [^int num-worker ^String output-dir ^boolean exception ^boolean order select ifheader out inmemory]))
 24 | 
 25 | (defrecord JoinedDataFrame
 26 |            [^clojask.classes.DataFrame.DataFrame a   ; first dataframe; its columns come first in the output
 27 |             ^clojask.classes.DataFrame.DataFrame b   ; second dataframe
 28 |             a-keys                                   ; join keys of a
 29 |             b-keys                                   ; join keys of b
 30 |             a-roll                                   ; rolling column of a, passed on to the join
 31 |             b-roll                                   ; rolling column index of b (nil when not rolling)
 32 |             type                                     ; 0-2 inner/left/right, 3 outer, 4-5 rolling
 33 |             limit                                    ; passed on to start-onyx-join unchanged
 34 |             prefix                                   ; column-name prefixes: first for a, last for b
 35 |             output-func]                             ; atom holding the default output function
 36 | 
 37 |   GenDFIntf
 38 |   ;; checkInputPathClash: delegate the check to both underlying dataframes.
 39 |   (checkInputPathClash
 40 |     [this path]
 41 |     (.checkInputPathClash a path)
 42 |     (.checkInputPathClash b path))
 43 |   ;; getColNames: a's names followed by b's names, each as "<prefix>_<name>".
 44 |   (getColNames
 45 |     [this]
 46 |     (let [a-col-prefix (first prefix)
 47 |           b-col-prefix (last prefix)
 48 |           a-col-set (.getColNames a)
 49 |           b-col-set (.getColNames b)
 50 |           a-col-header (mapv #(str a-col-prefix "_" %) a-col-set)
 51 |           b-col-header (mapv #(str b-col-prefix "_" %) b-col-set)]
 52 |       (concat a-col-header b-col-header)))
 53 |   ;; setOutput / getOutput: write and read the output function kept in the output-func atom.
 54 |   (setOutput
 55 |     [this output]
 56 |     (reset! output-func output))
 57 | 
 58 |   (getOutput
 59 |     [this]
 60 |     (deref output-func))
 61 | 
 62 |   (printCol
 63 |     ;; print column names, called by compute; the writer is closed even when the output function throws
 64 |     [this output-path selected-index out]
 65 |     (let [col-set (if (= selected-index [nil]) (.getColNames this) (mapv (vec (.getColNames this)) selected-index))
 66 |           wrtr (if output-path (io/writer output-path) nil)]
 67 |       (try ((or out (.getOutput this)) wrtr [col-set]) nil  ; nil keeps the previous return value
 68 |            (finally (if output-path (.close wrtr))))))
 69 |   ;; preview: preview both dataframes, rename their keys to the joined names and merge the rows pairwise.
 70 |   (preview
 71 |     [this sample-size output-size format]
 72 |     (let [data-a (.preview a sample-size output-size format)
 73 |           data-b (.preview b sample-size output-size format)
 74 |           old-a (.getColNames a)
 75 |           old-b (.getColNames b)
 76 |           rep-key-a (zipmap old-a (take (count old-a) (.getColNames this)))
 77 |           rep-key-b (zipmap old-b (take-last (count old-b) (.getColNames this)))
 78 |           data-a (map #(set/rename-keys % rep-key-a) data-a)
 79 |           data-b (map #(set/rename-keys % rep-key-b) data-b)
 80 |           data (map (fn [row-a row-b] (merge row-a row-b)) data-a data-b)]
 81 |       data))
 82 | 
 83 |   JDFIntf
 84 |   ;; compute: run the join described by `type` and write the selected columns to output-dir.
 85 |   (compute
 86 |     [this ^int num-worker ^String output-dir ^boolean exception ^boolean order select ifheader out inmemory]
 87 |     (let [select (if (coll? select) select [select])  ; a single column name is accepted as well
 88 |           select (if (= select [nil])                 ; [nil] means: select every column of a and b
 89 |                    (vec (take (+ (count (.getKeyIndex (.col-info a))) (count (.getKeyIndex (.col-info b)))) (iterate inc 0)))
 90 |                    (mapv (fn [key] (.indexOf (.getColNames this) key)) select))  ; NOTE(review): an unknown name yields -1
 91 |           a-index (vec (apply sorted-set (remove (fn [num] (>= num (count (.getKeyIndex (.col-info a))))) select)))
 92 |           ;; b-index: selected columns that belong to b, shifted so that they index into b alone
 93 |           b-index (mapv #(- % (count (.getKeyIndex (.col-info a)))) (apply sorted-set (remove (fn [num] (< num (count (.getKeyIndex (.col-info a))))) select)))
 94 |           b-index (if b-roll (vec (apply sorted-set (conj b-index b-roll))) b-index)  ; the rolling column of b is always kept
 95 |           b-roll (if b-roll (count (remove #(>= % b-roll) b-index)) nil)              ; its position inside b-index
 96 |           ;; formatters re-keyed from column index to position inside a-index / b-index
 97 |           a-format (set/rename-keys (.getFormatter (.col-info a)) (zipmap a-index (iterate inc 0)))
 98 |           b-format (set/rename-keys (.getFormatter (.col-info b)) (zipmap b-index (iterate inc 0)))
 99 |           write-index (mapv (fn [num] (count (remove #(>= % num) (concat a-index (mapv #(+ % (count (.getKeyIndex (.col-info a)))) b-index))))) select)
100 |           ;; in-memory group containers; their third constructor argument is true only for types 4 and 5
101 |           mgroup-a (MGroupJoinOuter. (transient {}) (transient {}) false)
102 |           mgroup-b (if (not= type 3) (MGroupJoin. (transient {}) (transient {}) (or (= 4 type) (= 5 type))) (MGroupJoinOuter. (transient {}) (transient {}) (or (= 4 type) (= 5 type))))
103 |           ]
104 |       ;; (u/init-file output-dir)
105 |       ;; write the header line first when requested
106 |       (if (= ifheader true) (.printCol this output-dir select out))
107 |       (cond
108 |         (or (= type 0) (= type 1) (= type 2)) ;; inner left right join
109 |         (do
110 |           (if inmemory
111 |             (start-onyx-groupby num-worker 10 b mgroup-b b-keys b-index exception)
112 |             (start-onyx-groupby num-worker 10 b "./.clojask/join/b/" b-keys b-index exception :format true))
113 |           (.final mgroup-b)
114 |           (start-onyx-join num-worker 10 a b (if inmemory mgroup-b nil) output-dir exception a-keys b-keys a-roll b-roll type limit a-index (vec (take (count b-index) (iterate inc 0))) b-format write-index out))
115 |         (= type 3) ;; outer join
116 |         (do
117 |           (if inmemory
118 |             (do
119 |               (start-onyx-groupby num-worker 10 a mgroup-a a-keys a-index exception)
120 |               (start-onyx-groupby num-worker 10 b mgroup-b b-keys b-index exception)
121 |               (.final mgroup-a)
122 |               ;; (.final mgroup-b)
123 |               )
124 |             (do
125 |               (start-onyx-groupby num-worker 10 a "./.clojask/join/a/" a-keys a-index exception :format true)
126 |               (start-onyx-groupby num-worker 10 b "./.clojask/join/b/" b-keys b-index exception :format true)))
127 |           (start-onyx-outer num-worker 10 a b (if inmemory mgroup-a nil) (if inmemory mgroup-b nil) output-dir exception a-index b-index a-format b-format write-index out))
128 |         (or (= type 4) (= type 5))  ;; rolling join
129 |         (do
130 |           (if inmemory
131 |             (start-onyx-groupby num-worker 10 b mgroup-b b-keys b-index exception)
132 |             (start-onyx-groupby num-worker 10 b "./.clojask/join/b/" b-keys b-index exception))  ; NOTE(review): no :format true here, unlike types 0-2; confirm this is intended
133 |           (.final mgroup-b)
134 |           (start-onyx-join num-worker 10 a b (if inmemory mgroup-b nil) output-dir exception a-keys b-keys a-roll b-roll type limit a-index (vec (take (count b-index) (iterate inc 0))) b-format write-index out))))))
135 | 
136 | 


--------------------------------------------------------------------------------
/src/main/clojure/clojask/preview.clj:
--------------------------------------------------------------------------------
  1 | (ns clojask.preview
  2 |   (:require [clojure.set :as set]
  3 |             ;; [clojask.classes.ColInfo :refer [->ColInfo]]
  4 |             ;; [clojask.classes.RowInfo :refer [->RowInfo]]
  5 |             [clojure.data.csv :as csv]
  6 |             [clojure.java.io :as io]
  7 |             [clojask.utils :refer [eval-res eval-res-ne filter-check]]
  8 |             [clojask.groupby :refer [gen-groupby-filenames]]
  9 |             ;; [clojask.onyx-comps :refer [start-onyx start-onyx-groupby start-onyx-join]]
 10 |             ;; [clojask.sort :as sort]
 11 |             ;; [clojask.join :as join]
 12 |             ;; [clojask.aggregate.aggre-onyx-comps :refer [start-onyx-aggre]]
 13 |             [clojure.string :as str]
 14 |             [clojask.preview :as preview]
 15 |             [clojask.api.aggregate :as aggre]))
 16 | 
 17 | (defn preview
 18 |   "Previews the result of `dataframe`: reads at most `sample-size` source rows, applies the filters,
 19 |    operations and (if any) group-by/aggregation, and returns a vector of maps from column name to value
 20 |    built from at most `return-size` computed rows. `formatting` true applies the column formatters."
 21 |   [dataframe sample-size return-size formatting]
 22 |   (let [index-key (.getIndexKey (:col-info dataframe))
 23 |         formatters (.getFormatter (:col-info dataframe))
 24 |         ;; index/header used to be derived here from index-key and the deleted columns:
 25 |         ;; indices-deleted (.getDeletedCol (:col-info dataframe))
 26 |         ;; indices-wo-del (vec (take (count index-key) (iterate inc 0)))
 27 |         ;; indices-not-deleted (set/difference (set indices-wo-del) (set indices-deleted))
 28 |         ;; index (if (empty? indices-deleted) indices-wo-del
 29 |         ;;           (filterv (fn [i] (contains? indices-not-deleted i)) indices-wo-del))
 30 |         ;; header (mapv index-key index)    ;; the header of the result in sequence vector
 31 |         index (.getColIndex dataframe)
 32 |         header (.getColNames dataframe)
 33 |         ;; csv-data (if (fn? (.getFunc dataframe))
 34 |         ;;            ((.getFunc dataframe))
 35 |         ;;            (let [reader (io/reader (:path dataframe))]
 36 |         ;;              (if (:have-col dataframe)
 37 |         ;;                (rest (line-seq reader))
 38 |         ;;                (line-seq  reader))))
 39 |         csv-data ((.getFunc dataframe))
 40 |         data (map zipmap (repeat [:id :d]) (map vector (iterate inc 0) csv-data))
 41 |         sample (take sample-size data)    ;; lazy source data (take sample size)
 42 |         ;; define the variables needed in the following functions
 43 |         operations (.getDesc (:col-info  dataframe))
 44 |         types (.getType (:col-info  dataframe))
 45 |         filters (.getFilters (:row-info dataframe))
 46 |         indices index
 47 |         no-aggre (= (.getAggreFunc (:row-info dataframe)) []) ;; if need to groupby & aggregate
 48 |         no-groupby (= (.getGroupbyKeys (:row-info dataframe)) [])
 49 |         ;; a row rejected by the filters becomes {} in the work function and nil in the output function
 50 |         preview-work-func (fn [seg]
 51 |                             (let [data (:d seg)]
 52 |                               (if (filter-check filters types data)
 53 |                                 {:d (mapv (fn [_] (eval-res data types formatters operations _)) indices)}
 54 |                                 {}))) ;; the function body of operation (take over the work in worker nodes)
 55 |         preview-output-func (if (and formatting no-aggre no-groupby)
 56 |                               (fn [{d :d}] ;; d is nil for a filtered-out row: return nil so the row is skipped
 57 |                                 (when d (mapv (fn [_] (if-let [formatter (get formatters _)]
 58 |                                                         (formatter (nth d _))
 59 |                                                         (nth d _))) index)))
 60 |                               (fn [row]
 61 |                                 (:d row))) ;; the function body of output operation (take over the work in output node) without formatting
 62 | 
 63 |         ;; ========== no need to change ===========
 64 |         compute-res (loop [rows sample res (transient [])]     ;; the result of normal compute
 65 |                       (if (= rows []) ;; exceed sample size
 66 |                         (persistent! res)
 67 |                         (let [row (first rows)
 68 |                               rest (rest rows)
 69 |                               row (preview-work-func row)
 70 |                               row-res (preview-output-func row)
 71 |                               res (if row-res (conj! res row-res) res)]
 72 |                           (if (>= (count res) return-size)
 73 |                             (persistent! res)
 74 |                             (recur rest res)))))]
 75 |     (if (and no-groupby no-aggre)
 76 |       (mapv (fn [row-v] (zipmap header row-v)) compute-res)
 77 |       ;; need to do aggregate
 78 |       (if no-groupby
 79 |         ;; need to do simple aggregate
 80 |         (let [aggre-funcs (.getAggreFunc (.row-info dataframe))
 81 |               keys (.getAggreNewKeys (:row-info dataframe))
 82 |               aggre-res (for [[func index] aggre-funcs]
 83 |                           (let [res
 84 |                                 (reduce func aggre/start (mapv (fn [row] (nth row index)) compute-res))]
 85 |                             (if (coll? res)
 86 |                               res
 87 |                               [res])))]
 88 |           (if (apply = (map count aggre-res))
 89 |             (mapv (fn [row-v] (zipmap keys row-v)) (apply map vector aggre-res))
 90 |             (throw (Exception. "aggregation result is not of the same length"))))
 91 |         ;; need to do groupby aggregate
 92 |         (let [key-index (.getKeyIndex (:col-info dataframe))
 93 |               index-key (.getIndexKey (.col-info dataframe))
 94 |               groupby-keys (.getGroupbyKeys (:row-info dataframe))
 95 |               groupby-res (loop [sample compute-res groupby {}]
 96 |                             (if-let [row (first sample)]
 97 |                               (let [res (rest sample)
 98 |                                     key (gen-groupby-filenames nil row groupby-keys key-index formatters)]
 99 |                                 (recur res (assoc groupby key (conj (or (get groupby key) []) row))))
100 |                               groupby))
101 |               aggre-funcs (.getAggreFunc (.row-info dataframe))
102 |               ;; keys = column names
103 |               keys (.getAggreColNames dataframe)
104 |               preview-aggre-func (fn [key v-of-v]
105 |                                    (let [data v-of-v
106 |                                         ;; pre 
107 |                                          pre (mapv #(let [func (first %)
108 |                                                           index (nth % 1)]
109 |                                                       (if func
110 |                                                         (func (nth (first v-of-v) index))
111 |                                                         (if formatting
112 |                                                           ((or (get formatters index) identity) (nth (first v-of-v) index))
113 |                                                           (nth (first v-of-v) index))))
114 |                                                    groupby-keys)
115 |                                          data-map (-> (iterate inc 0)
116 |                                                       (zipmap (apply map vector data)))]
117 |                                      (loop [aggre-funcs aggre-funcs
118 |                                             res []]
119 |                                        (if (= aggre-funcs [])
120 |                                          (if (= res [])
121 |                                            [pre]
122 |                                            (mapv concat (repeat pre) (apply map vector res)))
123 |                                          (let [func (first (first aggre-funcs))
124 |                                                index (nth (first aggre-funcs) 1)
125 |                                                res-funcs (rest aggre-funcs)
126 |                                                new (func (get data-map index))
127 |                                                new (if (coll? new)
128 |                                                      new
129 |                                                      (vector new))
130 |                                                new (if formatting
131 |                                                      (mapv (fn [_] (if-let [formatter (get formatters index)]
132 |                                                                      (formatter _)
133 |                                                                      _)) new)
134 |                                                      new)]
135 |                                            (if (or (= res []) (= (count new) (count (last res))))
136 |                                              (recur res-funcs (conj res new))
137 |                                              (throw (Exception. "aggregation result is not of the same length"))))))))]
138 |           (loop [groupby-res groupby-res aggre-res []]
139 |             (if-let [key-vv (first groupby-res)]
140 |               (let [res (rest groupby-res)
141 |                     key (nth key-vv 0)
142 |                     vv (nth key-vv 1)]
143 |                 (recur res (into aggre-res (preview-aggre-func key vv)))) ;; `into` keeps one vector instead of nesting a lazy concat per group
144 |               (mapv (fn [row-v] (zipmap keys row-v)) aggre-res))))))))


--------------------------------------------------------------------------------
/docs/clojask.extensions.md:
--------------------------------------------------------------------------------
  1 | ### Directory: clojask.extensions
  2 | 
  3 | Like many popular Python libraries, such as numpy and pandas, Clojask can be extended by third-party users who add new functionality on top of the core source code. Such extensions are welcome and encouraged. Here is an example of creating such extension functions.
  4 | 
  5 | ### ns: clojask.extensions.bind
  6 | 
  7 | Contains functions that bind several dataset files together, either by columns or by rows.
  8 | 
  9 | #### API Foundation
 10 | 
 11 | When defining a clojask.classes.DataFrame.DataFrame using the `dataframe` function, one can pass a function of no arguments instead of the path of the source file. This function should return a sequence. If the sequence is lazy, it can in theory be infinitely long. Otherwise, it must be finite and small enough to fit in memory.
 12 | 
 13 | ```
 14 | (def x (dataframe (fn [] ["col1,col2" "1,2" "3,4"])))
 15 | ```
 16 | 
 17 | Based on this API, we can define `cbind` and `rbind` functions for csv files.
 18 | 
 19 | #### `cbind-csv`
 20 | 
 21 | Joins some csv files into a new dataframe by columns.
 22 | 
 23 | | Argument   | Type   | Function                        | Remarks                                                     |
 24 | | ---------- | ------ | ------------------------------- | ----------------------------------------------------------- |
 25 | | path-a     | String | The path of the first csv file  | Can be absolute or relative path                            |
 26 | | path-b     | String | The path of the second csv file | Can be absolute or relative path                            |
 27 | | [path-c's] | String | Paths of any further csv files  | Can be absolute or relative path; the number is not limited |
 28 | 
 29 | **Example**
 30 | 
 31 | ```clojure
 32 | ;; file a
 33 | ;; date,item,price
 34 | ;; 2010-01-20,1,18.3
 35 | ;; 2010-01-20,2,38.3
 36 | ;; 2010-01-23,1,18.9
 37 | ;; 2010-01-23,2,48.9
 38 | ;; 2010-01-26,1,19.1
 39 | ;; 2010-01-26,2,59.1
 40 | ;; file b
 41 | ;; date,cust,Item,sold
 42 | ;; 2010-01-19,101,2,11
 43 | ;; 2010-01-22,102,1,7
 44 | ;; 2010-01-24,102,2,9
 45 | ;; 2010-01-25,101,2,9
 46 | ;; 2010-01-26,101,1,10
 47 | (def x (cbind-csv "path/to/a" "path/to/b"))
 48 | ;; x
 49 | ;; date1,item,price,date2,cust,Item,sold
 50 | ;; 2010-01-20,1,18.3,2010-01-19,101,2,11
 51 | ;; 2010-01-20,2,38.3,2010-01-22,102,1,7
 52 | ;; 2010-01-23,1,18.9,2010-01-24,102,2,9
 53 | ;; 2010-01-23,2,48.9,2010-01-25,101,2,9
 54 | ;; 2010-01-26,1,19.1,2010-01-26,101,1,10
 55 | ```
 56 | 
 57 | #### `rbind-csv`
 58 | 
 59 | Joins some csv files into a new dataframe by rows.
 60 | 
 61 | | Argument   | Type   | Function                        | Remarks                                                   |
 62 | | ---------- | ------ | ------------------------------- | --------------------------------------------------------- |
 63 | | path-a     | String | The path of the first csv file  | Can be absolute or relative path                          |
 64 | | path-b     | String | The path of the second csv file | Can be absolute or relative path                          |
 65 | | [path-c's] | String | Paths of any further csv files  | Can be absolute or relative path; the number is not fixed |
 66 | 
 67 | **Example**
 68 | 
 69 | ```clojure
 70 | ;; file a
 71 | ;; date,item,price
 72 | ;; 2010-01-20,1,18.3
 73 | ;; 2010-01-20,2,38.3
 74 | ;; 2010-01-23,1,18.9
 75 | ;; 2010-01-23,2,48.9
 76 | ;; 2010-01-26,1,19.1
 77 | ;; 2010-01-26,2,59.1
 78 | ;; file b
 79 | ;; date,cust,Item,sold
 80 | ;; 2010-01-19,101,2,11
 81 | ;; 2010-01-22,102,1,7
 82 | ;; 2010-01-24,102,2,9
 83 | ;; 2010-01-25,101,2,9
 84 | ;; 2010-01-26,101,1,10
 85 | (def x (rbind-csv "path/to/a" "path/to/b"))
 86 | (print-df x)
 87 | |             date |             item |            price |
 88 | |------------------+------------------+------------------|
 89 | | java.lang.String | java.lang.String | java.lang.String |
 90 | |       2010-01-20 |                1 |             18.3 |
 91 | |       2010-01-20 |                2 |             38.3 |
 92 | |       2010-01-23 |                1 |             18.9 |
 93 | |       2010-01-23 |                2 |             48.9 |
 94 | |       2010-01-26 |                1 |             19.1 |
 95 | |       2010-01-26 |                2 |             59.1 |
 96 | |       2010-01-19 |              101 |                2 |
 97 | |       2010-01-22 |              102 |                1 |
 98 | |       2010-01-24 |              102 |                2 |
 99 | |       2010-01-25 |              101 |                2 |
100 | ```
101 | 
102 | #### **It is also possible and encouraged to create more binding functions for other file types.**
103 | 
104 | ### ns: clojask.extensions.reshape
105 | 
106 | Contains functions that can reshape a clojask dataframe from wide to long or from long to wide.
107 | 
108 | #### API Foundation
109 | 
110 | When defining a clojask.classes.DataFrame.DataFrame using the `dataframe` function, you can specify the option `:melt`, which should be a function that is applied to each resulting row vector at the end. The default is `vector`, which leaves the results unchanged. However, if `:melt` is set to
111 | 
112 | ```clojure
113 | (fn [x]
114 |   (repeat 2 x))
115 | ```
116 | 
117 | , then each row will be output twice.
118 | 
119 | #### `melt`
120 | 
121 | Reshape the dataframe from wide to long.
122 | 
123 | | Argument       | Type              | Function                                  | Remarks                                                      |
124 | | -------------- | ----------------- | ----------------------------------------- | ------------------------------------------------------------ |
125 | | dataframe      | clojask.classes.DataFrame.DataFrame | Specify the dataframe                     |                                                              |
126 | | output-path    | String            | The path of the output                    | Can be absolute or relative path with respect to the `project.clj` file. |
127 | | id             | vector of strings | The fixed portion of the columns          | These columns must have a perfect correlation.               |
128 | | measurement    | vector of strings | The measurement columns                   | In the result, the measurement names will become one column and the values will become another. |
129 | | [measure_name] | String            | The name of the measurement in the result | By default "measure"                                         |
130 | | [value_name]   | String            | The name of the value in the result       | By default "value"                                           |
131 | 
132 | **Example**
133 | 
134 | ```clojure
135 | ;; x
136 | ;; family_id,age_mother,dob_child1,dob_child2,dob_child3
137 | ;; 1,30,1998-11-26,2000-01-29,
138 | ;; 2,27,1996-06-22,,
139 | ;; 3,26,2002-07-11,2004-04-05,2007-09-02
140 | ;; 4,32,2004-10-10,2009-08-27,2012-07-21
141 | ;; 5,29,2000-12-05,2005-02-28,
142 | (melt x "path/to/output" ["family_id" "age_mother"] ["dob_child1" "dob_child2" "dob_child3"])
143 | ```
144 | 
145 | #### `dcast`
146 | 
147 | Reshape the dataframe from long to wide. This is the inverse of `melt`.
148 | 
149 | | Argument     | Type                                 | Function                                    | Remarks                                                      |
150 | | ------------ | ------------------------------------ | ------------------------------------------- | ------------------------------------------------------------ |
151 | | dataframe    | clojask.classes.DataFrame.DataFrame                    | Specify the dataframe                       |                                                              |
152 | | output-path  | String                               | The path of the output                      | Can be absolute or relative path with respect to the `project.clj` file. |
153 | | id           | vector of strings                    | The fixed portion of the columns            | These columns must have a perfect correlation.               |
154 | | measure-name | String                               | The name of the measurement                 | By default "measure"                                         |
155 | | value-name   | String                               | The name of the value                       | By default "value"                                           |
156 | | values       | vector of string/int/double/datetime | The value choices of the measurement column | The order matters as in the result file.                     |
157 | | [vals-name]  | vector of string                     | The name of the value columns               | By default, same as `values`                                 |
158 | 
159 | **Example**
160 | 
161 | ``` clojure
162 | ;; x
163 | ;; family_id,age_mother,measure,value
164 | ;; 1,30,dob_child1,1998-11-26
165 | ;; 1,30,dob_child2,2000-01-29
166 | ;; 1,30,dob_child3,
167 | ;; 2,27,dob_child1,1996-06-22
168 | ;; 2,27,dob_child2,
169 | ;; 2,27,dob_child3,
170 | ;; 3,26,dob_child1,2002-07-11
171 | ;; 3,26,dob_child2,2004-04-05
172 | ;; 3,26,dob_child3,2007-09-02
173 | ;; 4,32,dob_child1,2004-10-10
174 | ;; 4,32,dob_child2,2009-08-27
175 | ;; 4,32,dob_child3,2012-07-21
176 | ;; 5,29,dob_child1,2000-12-05
177 | ;; 5,29,dob_child2,2005-02-28
178 | ;; 5,29,dob_child3,
179 | (dcast x "resources/test.csv" ["family_id" "age_mother"] "measure" "value" ["dob_child1" "dob_child2" "dob_child3"])
180 | ```
181 | 
182 | 


--------------------------------------------------------------------------------
/src/main/clojure/clojask/groupby.clj:
--------------------------------------------------------------------------------
  1 | (ns clojask.groupby
  2 |   (:require [clojure.java.io :as io]
  3 |             ;[clojure-csv.core :as csv]
  4 |             [clojask.utils :as u]
  5 |             [clojask.classes.MGroup :refer [->MGroup]])
  6 |   (:import [clojask.classes.MGroup MGroup]))
  7 | "contains the utility functions to group by and aggregate"
  8 | 
  9 | (defn compute-groupby
 10 |   "Placeholder for mapping the result to different files; it has no body yet and returns nil."
 11 |   [dataframe num-worker output-dir exception]
 12 |   nil)
 13 | 
 14 | (defn compute-aggregate
 15 |   "Placeholder for aggregating the output files to the final destination; it has no body yet and returns nil."
 16 |   [dateframe output-dir exp] nil)
 17 | 
 18 | ;; ;; the example of how to write a set of aggregate function
 19 | ;; (defn min-pre
 20 | ;;   []
 21 | ;;   (def memo (atom 1)))
 22 | 
 23 | ;; (defn min
 24 | ;;   [row]
 25 | ;;   (reset! memo (min (deref memo) row)))
 26 | 
 27 | ;; (defn min-result
 28 | ;;   []
 29 | ;;   (deref memo))
 30 | 
 31 | (defn gen-groupby-filenames
 32 |   "internal function that builds the group key of one row.
 33 |    `groupby-keys` is a sequence of [func column-index] pairs: each key value is
 34 |    read from `msg` at column-index and passed through func when func is non-nil.
 35 |    The vector of key values is turned into a string; when `dist` is a string the
 36 |    result is `dist` followed by `u/encode-str` of that key string, otherwise the
 37 |    key string itself. `key-index` and `formatters` are accepted but not used."
 38 |   [dist msg groupby-keys key-index formatters]
 39 |   (let [key-value (fn [pair]
 40 |                     (let [transform (nth pair 0)
 41 |                           column (nth pair 1)
 42 |                           raw (nth msg column)]
 43 |                       (if transform
 44 |                         (transform raw)
 45 |                         raw)))
 46 |         group-key (str (mapv key-value groupby-keys))]
 47 |     ;; a string `dist` is used as a prefix in front of the encoded key
 48 |     (if (string? dist)
 49 |       (str dist (u/encode-str group-key))
 50 |       ;; any other `dist` (nil included): only the key string is returned
 51 |       group-key)))
 52 | 
 53 | (defn output-groupby
 54 |   "internal function called by output when aggregation is applied: sends the row `msg` (a vector)
 55 |    to its group, which is a file when `dist` is a string and `dist` itself (via `.write`) otherwise"
 56 |   [dist msg groupby-keys key-index formatter write-index _format]
 57 |   ;; key-index contains the one to one correspondence of key value to index value, it is a map
 58 |   ;; eg "Salary" -> 3
 59 |   ;; (spit "resources/debug.txt" (str msg "\n" key-index) :append true)
 60 |   (let [output-filename (gen-groupby-filenames dist msg groupby-keys key-index formatter)] ;; generate output filename
 61 |     (if (string? dist)
 62 |       ;; with-open closes (and thereby flushes) the writer on exit, so no explicit .close is needed
 63 |       (with-open [groupby-wrtr (io/writer output-filename :append true)]
 64 |         (.write groupby-wrtr (str (if (= true _format)
 65 |                                     (u/gets-format msg write-index formatter)
 66 |                                     (u/gets msg write-index)) "\n")))
 67 |       (.write dist output-filename msg write-index formatter))
 68 |     ;; write as maps e.g. {:name "Tim", :salary 62, :tax 0.1, :bonus 12}
 69 |     ;; (.write groupby-wrtr (str (u/gets-format msg write-index formatter) "\n"))
 70 | 
 71 | 
 72 |     ;; write as csv format e.g. Tim,62,0.1,12
 73 |     ;(.write groupby-wrtr (str (clojure.string/join "," (map msg (keys msg))) "\n"))
 74 | 
 75 |     ;; close writer
 76 | )
 77 | 
 78 |   ;; !! debugging
 79 |   ;(println (clojure.string/join "," (map msg (keys msg))))
 80 |   ;(println (apply str (map msg (keys msg))))
 81 |   )
 82 | 
 83 | (defn insert-mgroup ;; stores `_mgroup` in the namespace-level var `mgroup`
 84 |   [_mgroup]
 85 |   (def mgroup _mgroup)) ;; `def` inside a fn rebinds the global var on every call and returns that var
 86 | 
 87 | (defn read-csv-seq
 88 |   "Lazily reads `filename` line by line and parses every line with `read-string`.
 89 |    The reader is not closed here because the returned sequence is lazy. NOTE(review):
 90 |    core `read-string` may evaluate code (`*read-eval*`); use on trusted files only."
 91 |   [filename]
 92 |   (map read-string
 93 |        (line-seq (io/reader filename))))
 94 | 
 95 | 
 96 | ;; (defn write-file
 97 | ;;  [dir seq]
 98 | ;;   (with-open [wtr (io/writer dir :append true)]
 99 | ;;     (doseq [row seq]
100 | ;;     (if (not= row nil)
101 | ;;       (.write wtr (str row "\n"))))))
102 | 
103 | ;; (defn internal-aggregate-write
104 | ;;   "called by child thread function"
105 | ;;   [func out-dir groupby-keys keys file & [new-keys]]
106 | ;;   (async/thread
107 | ;;     (write-file out-dir (func (read-csv-seq file) groupby-keys keys new-keys))))
108 | 
109 | ;; (defn internal-aggregate
110 | ;;   "aggregate one group use the function"
111 | ;;   [func out-dir key-index groupby-keys keys & [new-keys]]
112 | ;;   (let [directory (clojure.java.io/file "./.clojask/grouped/")
113 | ;;         files (file-seq directory)]
114 | ;;     (doseq [file (rest files)]
115 | ;;       ;; w/o multi-threading
116 | ;;       (write-file out-dir (func (read-csv-seq file) groupby-keys keys new-keys key-index))
117 | ;;       ;; multi-threading
118 | ;;       ;(async/go (async/<! (internal-aggregate-write func out-dir groupby-keys keys file [new-keys])))
119 | ;;       (io/delete-file file true)
120 | ;;       )
121 | ;;     (doseq [file (rest (file-seq (clojure.java.io/file "./_grouped/")))]
122 | ;;        (io/delete-file file))
123 | ;;     "success"))
124 | 
125 | 
126 | 
127 | ;; ;; below are example aggregate functions
128 | 
129 | ;; (defn aggre-min
130 | ;;   "get the min of some keys"
131 | ;;   [seq groupby-keys keys new-keys key-index]
132 | ;;   (let [_min (atom [])] 
133 | ;;     (assert (= (count keys) (count new-keys)) "number of new keys not equal to number of aggregation keys")
134 | ;;     (doseq [groupby-key keys]
135 | ;;       (let [vec-index (get key-index groupby-key)] ;; get index number in vector
136 | ;;         ;; initialise min with first value
137 | ;;         (swap! _min assoc (.indexOf keys groupby-key) (nth (first seq) vec-index))
138 | ;;         (doseq [row seq]
139 | ;;           ;; do one iteration to find the min
140 | ;;           (let [curr-val (Integer/parseInt (nth row vec-index))
141 | ;;                 curr-min (Integer/parseInt (nth (deref _min) (.indexOf keys groupby-key)))]
142 | ;;             (if (< curr-val curr-min)
143 | ;;               (swap! _min assoc (.indexOf keys groupby-key) (nth row vec-index))))
144 | ;;         )))
145 | ;;     ;(println (deref _min))
146 | ;;     [(deref _min)]
147 | ;;     )
148 | ;; )
149 | 
150 | ;; (defn aggre-max
151 | ;;   "get the max of some keys"
152 | ;;   [seq groupby-keys keys new-keys key-index]
153 | ;;   (let [_max (atom [])] 
154 | ;;     (assert (= (count keys) (count new-keys)) "number of new keys not equal to number of aggregation keys")
155 | ;;     (doseq [groupby-key keys]
156 | ;;       (let [vec-index (get key-index groupby-key)] ;; get index number in vector
157 | ;;         ;; initialise max with first value
158 | ;;         (swap! _max assoc (.indexOf keys groupby-key) (nth (first seq) vec-index))
159 | ;;         (doseq [row seq]
160 | ;;           ;; do one iteration to find the min
161 | ;;           (let [curr-val (Integer/parseInt (nth row vec-index))
162 | ;;                 curr-max (Integer/parseInt (nth (deref _max) (.indexOf keys groupby-key)))]
163 | ;;             (if (> curr-val curr-max)
164 | ;;               (swap! _max assoc (.indexOf keys groupby-key) (nth row vec-index))))
165 | ;;         )))
166 | ;;     ;(println (deref _max))
167 | ;;     [(deref _max)]
168 | ;;     )
169 | ;; )
170 | 
171 | ;; (defn square [n] (* n n))
172 | 
173 | ;; (defn mean [a] (/ (reduce + a) (count a)))
174 | 
175 | ;; (defn standard-deviation
176 | ;;   [a]
177 | ;;   (let [mn (mean a)]
178 | ;;     (Math/sqrt
179 | ;;       (/ (reduce #(+ %1 (square (- %2 mn))) 0 a)
180 | ;;          (dec (count a))))))
181 | 
182 | ;; ;; !! check if new-keys are float/int cols
183 | ;; (defn aggre-sum
184 | ;;   "get the sum of some keys"
185 | ;;   [seq groupby-keys keys new-keys key-index]
186 | ;;     (let [_sum (atom [])] 
187 | ;;       (assert (= (count keys) (count new-keys)) "number of new keys not equal to number of aggregation keys")
188 | ;;       (doseq [groupby-key keys]
189 | ;;         (let [vec-index (get key-index groupby-key)] ;; get index number in vector
190 | ;;           ;; initialise max with zero
191 | ;;           (swap! _sum assoc (.indexOf keys groupby-key) 0.0)
192 | ;;           (doseq [row seq]
193 | ;;             ;; do one iteration to get sum
194 | ;;             (let [curr-val (Float/parseFloat (nth row vec-index))
195 | ;;                   curr-sum (nth (deref _sum) (.indexOf keys groupby-key))]
196 | ;;                 (swap! _sum assoc (.indexOf keys groupby-key) (+ curr-val curr-sum)))
197 | ;;           )))
198 | ;;       (println (deref _sum))
199 | ;;       [(deref _sum)]
200 | ;;       )
201 | ;; )
202 | 
203 | ;; ;; !! check if new-keys are float/int cols
204 | ;; (defn aggre-avg
205 | ;;   "get the average of some keys"
206 | ;;   [seq groupby-keys keys new-keys key-index]
207 | ;;     (let [_avg (atom [])] 
208 | ;;       (assert (= (count keys) (count new-keys)) "number of new keys not equal to number of aggregation keys")
209 | ;;       (doseq [groupby-key keys]
210 | ;;         (let [vec-index (get key-index groupby-key) ;; get index number in vector
211 | ;;               avg-value (/ (reduce + (doall (map #(Float/parseFloat (nth % vec-index)) seq))) (count seq))] 
212 | ;;           (swap! _avg assoc (.indexOf keys groupby-key) avg-value)
213 | ;;           ))
214 | ;;       ;(println (deref _avg))
215 | ;;       [(deref _avg)]
216 | ;;       )
217 | ;; )
218 | 
219 | ;; ;; !! check if new-keys are float/int cols
220 | ;; (defn aggre-sd
221 | ;;   "get the standard deviation (sd) of some keys"
222 | ;;   [seq groupby-keys keys new-keys key-index]
223 | ;;     (let [_sd (atom [])] 
224 | ;;       (assert (= (count keys) (count new-keys)) "number of new keys not equal to number of aggregation keys")
225 | ;;       (doseq [groupby-key keys]
226 | ;;         (let [vec-index (get key-index groupby-key) ;; get index number in vector
227 | ;;               sd-value (standard-deviation (doall (map #(Float/parseFloat (nth % vec-index)) seq)))] 
228 | ;;           (swap! _sd assoc (.indexOf keys groupby-key) sd-value)
229 | ;;           ))
230 | ;;       ;(println (deref _sd))
231 | ;;       [(deref _sd)]
232 | ;;       )
233 | ;; )
234 | 
235 | ;; (defn template
236 | ;;   "The template for aggregate functions"
237 | ;;   ;; seq: is a seq of maps (lazy) of the data from one of the file
238 | ;;   ;; groupby-keys: is a vector of the group by keys
239 | ;;   ;; old-keys: the columns to which this function applies
240 | ;;   ;; new-keys: the new-keys to replace the old-keys and receive the aggregation result
241 | ;;   [seq groupby-keys old-keys new-keys])
242 | ;; ;; the return should be an vector of map (better lazy)
243 | 
244 | 


--------------------------------------------------------------------------------
/test/clojask/inmemory_test.clj:
--------------------------------------------------------------------------------
  1 | (ns inmemory-test
  2 |     (:require [clojure.test :refer :all]
  3 |               [clojask.dataframe :refer :all]
  4 |               [clojask.utils :refer :all]
  5 |               [clojask.groupby :refer :all]
  6 |               [clojask.api.gb-aggregate :as gb-aggre]
  7 |               [clojask.api.aggregate :as aggre]
  8 |               [clojask.sort :refer :all]
  9 |               [clojure.string :as str]))
 10 |         
 11 | (use '[clojure.java.shell :only [sh]])
 12 | 
 13 | ;; an alternative for diff | sort with better compatibility
 14 | (defn _get-diff
 15 |   "Compares the lines of files `a` and `b`; the lines are sorted first unless `order` is false.
 16 |    Returns {:out \"\" :err \"\"} when they match, a message in :out when not, the exception in :err."
 17 |   [a b & [order]]
 18 |   (try
 19 |     (let [sorted? (if (nil? order) true order)
 20 |           load-lines (fn [path] (cond-> (str/split (slurp path) #"\n") sorted? clojure.core/sort))
 21 |           [lines-a lines-b] [(load-lines a) (load-lines b)]]
 22 |       (if (zero? (compare (vec lines-a) (vec lines-b)))
 23 |         {:out "" :err ""}
 24 |         {:out (str "not the same: " lines-a lines-b) :err ""}))
 25 |     (catch Exception e {:out "has exception" :err (str e)})))
 26 | 
 27 | (defn get-diff
 28 |   "Calls `_get-diff` until its :out is empty, at most four times; returns the last result."
 29 |   [a b & [order]]
 30 |   (loop [retries-left 3, outcome (_get-diff a b order)]
 31 |     (if (and (pos? retries-left) (not= (:out outcome) ""))
 32 |       (recur (dec retries-left) (_get-diff a b order))
 33 |       outcome)))
 34 | 
 35 | (enable-debug)
 36 | 
 37 | (deftest df-api-test ;; checks that every dataframe API call below returns a clojask DataFrame
 38 |   (testing "Single dataframe manipulation APIs"
 39 |     (def y (dataframe "test/clojask/Employees-example.csv" :have-col true)) ;; `def` makes y a namespace-level var
 40 |     (is (= clojask.classes.DataFrame.DataFrame (type y)))
 41 |     (is (= clojask.classes.DataFrame.DataFrame (type (set-type y "Salary" "double"))))
 42 |     (is (= clojask.classes.DataFrame.DataFrame (type (set-parser y "Department" #(Double/parseDouble %)))))
 43 |     (is (= clojask.classes.DataFrame.DataFrame (type (filter y "Salary" (fn [salary] (<= salary 800))))))
 44 |     (is (= clojask.classes.DataFrame.DataFrame (type (operate y - "Salary")))) ;; single-column operation
 45 |     (is (= clojask.classes.DataFrame.DataFrame (type (operate y str ["Employee" "Salary"] "new-col")))) ;; two columns into "new-col"
 46 |     (is (= clojask.classes.DataFrame.DataFrame (type (group-by y ["Department"]))))
 47 |     (is (= clojask.classes.DataFrame.DataFrame (type (aggregate y max ["Salary"] ["Salary-max"]))))
 48 |     (is (= clojask.classes.DataFrame.DataFrame (type (compute y 8 "test/clojask/test_outputs/tmp.csv")))))) ;; 8 is presumably the number of workers - TODO confirm
 49 | 
 50 | (deftest df-api-output-test ; only writes CSVs under test_outputs; test-ns-hook compares them with correct_outputs
 51 |   (testing "Single dataframe manipulation APIs"
 52 |     (def y (dataframe "test/clojask/Employees-example.csv" :have-col true))
 53 |     ;; element-wise operation: apply - to Salary and format it with a trailing "!" -> 1-1.csv
 54 |     (set-type y "Salary" "double")
 55 |     (operate y - "Salary")
 56 |     (set-formatter y "Salary" #(str % "!"))
 57 |     (compute y 8 "test/clojask/test_outputs/1-1.csv" :exception false :order true :in-memory true)
 58 |     ;; (let [result (get-diff "./test/clojask/test_outputs/1-1.csv" "./test/clojask/correct_outputs/1-1.csv" false)]
 59 |     ;;     (is (= "" (:out result)))
 60 |     ;;     (is (= "" (:err result))))
 61 |     ;; filter on Salary, then build "new-col" from Employee and Salary with str -> 1-2.csv
 62 |     (def y (dataframe "test/clojask/Employees-example.csv" :have-col true)) ; fresh dataframe for this section
 63 |     (set-type y "Salary" "double")
 64 |     (filter y "Salary" (fn [salary] (<= salary 800)))
 65 |     (operate y str ["Employee" "Salary"] "new-col")
 66 |     (compute y 8 "test/clojask/test_outputs/1-2.csv" :exception false :in-memory true)
 67 |     ;; (let [result (get-diff "./test/clojask/test_outputs/1-2.csv" "./test/clojask/correct_outputs/1-2.csv")]
 68 |     ;;     (is (= "" (:out result)))
 69 |     ;;     (is (= "" (:err result))))
 70 |     ;; group by Department, then aggregate Salary with gb-aggre/max into "new-Salary" -> 1-3.csv
 71 |     (def y (dataframe "test/clojask/Employees-example.csv" :have-col true))
 72 |     (set-type y "Salary" "double")
 73 |     (group-by y ["Department"])
 74 |     (aggregate y gb-aggre/max ["Salary"] ["new-Salary"])
 75 |     (compute y 8 "test/clojask/test_outputs/1-3.csv" :exception false :in-memory true)
 76 |     ;; (let [result (get-diff "test/clojask/test_outputs/1-3.csv" "test/clojask/correct_outputs/1-3.csv")]
 77 |     ;;     (is (= "" (:out result)))
 78 |     ;;     (is (= "" (:err result))))
 79 |     ;; aggregate without a preceding group-by, using aggre/max -> 1-10.csv
 80 |     (def y (dataframe "test/clojask/Employees-example.csv" :have-col true))
 81 |     (set-type y "Salary" "double")
 82 |     (aggregate y aggre/max ["Salary"] ["new-Salary"])
 83 |     (compute y 8 "test/clojask/test_outputs/1-10.csv" :exception false :in-memory true)
 84 |     ;; (let [result (get-diff "test/clojask/test_outputs/1-10.csv" "test/clojask/correct_outputs/1-10.csv")]
 85 |     ;;     (is (= "" (:out result)))
 86 |     ;;     (is (= "" (:err result))))
 87 |     ;; group-by without any aggregation -> 1-11.csv
 88 |     (def y (dataframe "test/clojask/Employees-example.csv" :have-col true))
 89 |     (group-by y ["Department"])
 90 |     (compute y 8 "test/clojask/test_outputs/1-11.csv" :exception false :in-memory true)
 91 |     ;; (let [result (get-diff "test/clojask/test_outputs/1-11.csv" "test/clojask/correct_outputs/1-11.csv")]
 92 |     ;;     (is (= "" (:out result)))
 93 |     ;;     (is (= "" (:err result))))
 94 |     ))
 95 | 
 96 | (deftest col-api-test ; checks get-col-names after reorder-col and after rename-col
 97 |   (testing "Column manipulation APIs"
 98 |     (let [y (dataframe "test/clojask/Employees-example.csv" :have-col true)] ; local binding, not an inline def
 99 |       (reorder-col y ["Employee" "Department" "EmployeeName" "Salary" "UpdateDate"])
100 |       (is (= (get-col-names y) ["Employee" "Department" "EmployeeName" "Salary" "UpdateDate"]))
101 |       (rename-col y "Department" "new-Department")
102 |       ;; (map (fn [a b] (rename-col y a b)) (get-col-names y) ["Employee" "new-Department" "EmployeeName" "Salary" "UpdateDate"])
103 |       (is (= (get-col-names y) ["Employee" "new-Department" "EmployeeName" "Salary" "UpdateDate"])))))
104 | 
105 | (deftest col-select-output-test ; writes 1-9.csv only; the file is compared in test-ns-hook
106 |   (testing "Select column(s) argument"
107 |     (let [y (dataframe "test/clojask/Employees-example.csv" :have-col true)] ; local binding, not an inline def
108 |       (compute y 8 "test/clojask/test_outputs/1-9.csv" :select ["Employee", "EmployeeName"] :exception false :in-memory true))
109 |     ;; (let [result (get-diff "test/clojask/test_outputs/1-9.csv" "test/clojask/correct_outputs/1-9.csv")]
110 |     ;;     (is (= "" (:out result)))
111 |     ;;     (is (= "" (:err result))))
112 |     ))
113 | 
114 | (deftest join-api-test ; each join is computed and only checked for returning a DataFrame
115 |   (testing "Join dataframes APIs"
116 |     (let [x (dataframe "test/clojask/Employees-example.csv") ; local bindings, not inline defs
117 |           y (dataframe "test/clojask/Employees-example.csv")]
118 |       (is (= clojask.classes.DataFrame.DataFrame (type (compute (left-join x y ["Employee"] ["Employee"]) 8 "resources/test.csv" :exception false))))
119 |       (is (= clojask.classes.DataFrame.DataFrame (type (compute (right-join x y ["Employee"] ["Employee"]) 8 "resources/test.csv" :exception false))))
120 |       (is (= clojask.classes.DataFrame.DataFrame (type (compute (inner-join x y ["Employee"] ["Employee"]) 8 "resources/test.csv" :exception false))))
121 |       (is (= clojask.classes.DataFrame.DataFrame (type (compute (rolling-join-forward x y ["Employee"] ["Employee"] "Salary" "Salary") 8 "resources/test.csv" :exception false)))))))
122 | 
123 | (deftest join-api-output-test ; only writes CSVs 1-4 .. 1-8 and 1-12; test-ns-hook compares them with correct_outputs
124 |   (testing "Join dataframes APIs"
125 |     (def x (dataframe "test/clojask/Employees-example.csv"))
126 |     (set-type x "UpdateDate" "date:yyyy/MM/dd")
127 |     (def y (dataframe "test/clojask/Employees-info-example.csv"))
128 |     (set-type y "UpdateDate" "date:yyyy/MM/dd")
129 |     (compute (left-join x y ["Employee"] ["Employee"]) 8 "test/clojask/test_outputs/1-4.csv" :exception false :in-memory true)
130 |       ;; (let [result (sh "zsh" "-c" "diff <(sort test/clojask/test_outputs/1-4.csv) <(sort test/clojask/correct_outputs/1-4.csv)")]
131 |       ;;   (is (= "" (:out result)))
132 |       ;;   (is (= "" (:err result))))
133 |     (compute (right-join x y ["Employee"] ["Employee"]) 8 "test/clojask/test_outputs/1-5.csv" :exception false :in-memory true)
134 |       ;; (let [result (sh "zsh" "-c" "diff <(sort test/clojask/test_outputs/1-5.csv) <(sort test/clojask/correct_outputs/1-5.csv)")]
135 |       ;;   (is (= "" (:out result)))
136 |       ;;   (is (= "" (:err result))))
137 |     (def z (inner-join x y ["Employee"] ["Employee"]))
138 |     (compute z 8 "test/clojask/test_outputs/1-6.csv" :exception false :select ["2_Employee" "2_EmployeeName" "2_DayOff" "2_UpdateDate" "1_Employee" "1_EmployeeName" "1_Department" "1_Salary" "1_UpdateDate"]) ; NOTE(review): the only compute here without :in-memory true - confirm intended
139 |       ;; (let [result (sh "zsh" "-c" "diff <(sort test/clojask/test_outputs/1-6.csv) <(sort test/clojask/correct_outputs/1-6.csv)")]
140 |       ;;   (is (= "" (:out result)))
141 |       ;;   (is (= "" (:err result))))
142 |     (compute (rolling-join-forward x y ["EmployeeName"] ["EmployeeName"] "UpdateDate" "UpdateDate") 8 "test/clojask/test_outputs/1-7.csv" :exception false :in-memory true)
143 |       ;; (let [result (sh "zsh" "-c" "diff <(sort test/clojask/test_outputs/1-7.csv) <(sort test/clojask/correct_outputs/1-7.csv)")]
144 |       ;;   (is (= "" (:out result)))
145 |       ;;   (is (= "" (:err result))))
146 |     (compute (rolling-join-backward x y ["EmployeeName"] ["EmployeeName"] "UpdateDate" "UpdateDate") 8 "test/clojask/test_outputs/1-8.csv" :exception false :in-memory true)
147 |       ;; (let [result (sh "zsh" "-c" "diff <(sort test/clojask/test_outputs/1-8.csv) <(sort test/clojask/correct_outputs/1-8.csv)")]
148 |       ;;   (is (= "" (:out result)))
149 |       ;;   (is (= "" (:err result))))
150 |     (compute (outer-join x y ["Employee"] ["Employee"]) 8 "test/clojask/test_outputs/1-12.csv" :select ["1_Department" "1_Salary" "1_UpdateDate" "2_Employee" "2_EmployeeName" "2_DayOff" "2_UpdateDate"] :in-memory true) ; NOTE(review): no :exception false on this call, unlike the others - confirm intended
151 |       ;; (let [result (sh "zsh" "-c" "diff <(sort test/clojask/test_outputs/1-12.csv) <(sort test/clojask/correct_outputs/1-12.csv)")]
152 |       ;;   (is (= "" (:out result)))
153 |       ;;   (is (= "" (:err result))))
154 |     ))
155 | 
156 | (defn all-in-one
157 |   "Run every test of this namespace in a fixed order and return the last test's result."
158 |   []
159 |   ;; the *-output-test functions write the CSV files that test-ns-hook compares afterwards
160 |   (reduce (fn [_ run-test] (run-test))
161 |           nil
162 |           [df-api-test df-api-output-test col-api-test
163 |            col-select-output-test join-api-test join-api-output-test]))
164 | 
165 | (deftest test-ns-hook
166 |   (testing "Check all the outputs in a nested way"
167 |     (all-in-one)
168 |     ;; One entry per generated CSV, in the order the files were checked before. An optional
169 |     ;; second element is passed to get-diff as `order`; only 1-1 is compared without sorting.
170 |     (doseq [[csv & opts] [["1-1" false] ["1-2"] ["1-3"] ["1-10"] ["1-11"] ["1-9"]
171 |                           ["1-4"] ["1-5"] ["1-6"] ["1-7"] ["1-8"] ["1-12"]]]
172 |       ;; the testing context makes a failure report name the file that differed
173 |       (testing (str csv ".csv")
174 |         (let [result (apply get-diff
175 |                             (str "test/clojask/test_outputs/" csv ".csv")
176 |                             (str "test/clojask/correct_outputs/" csv ".csv")
177 |                             opts)]
178 |           (is (= "" (:out result)))
179 |           (is (= "" (:err result))))))))


--------------------------------------------------------------------------------
/test/clojask/core_test.clj:
--------------------------------------------------------------------------------
  1 | (ns core-test
  2 |     (:require [clojure.test :refer :all]
  3 |               [clojask.dataframe :refer :all]
  4 |               [clojask.utils :refer :all]
  5 |               [clojask.groupby :refer :all]
  6 |               [clojask.api.gb-aggregate :as gb-aggre]
  7 |               [clojask.api.aggregate :as aggre]
  8 |               [clojask.sort :refer :all]
  9 |               [clojure.data.csv :as csv]
 10 |               [clojure.string :as str]))
 11 |         
 12 | (use '[clojure.java.shell :only [sh]])
 13 | 
 14 | ;; an alternative for diff | sort with better compatibility
 15 | (defn _get-diff
 16 |   "Line-wise comparison of files `a` and `b`; both are sorted first unless `order` is false."
 17 |   [a b & [order]]
 18 |   (try
 19 |     (let [sorted? (if (nil? order) true order)
 20 |           load-lines (fn [path]
 21 |                        (cond-> (str/split (slurp path) #"\n") sorted? clojure.core/sort))
 22 |           [lines-a lines-b] [(load-lines a) (load-lines b)]]
 23 |       (if (zero? (compare (vec lines-a) (vec lines-b)))
 24 |         {:out "" :err ""}
 25 |         {:out (str "not the same: " lines-a lines-b) :err ""}))
 26 |     (catch Exception e {:out "has exception" :err (str e)})))
 27 | 
 28 | (defn get-diff
 29 |   "Call `_get-diff`, repeating it up to three more times while its :out is non-empty."
 30 |   [a b & [order]]
 31 |   (loop [retries-left 3, outcome (_get-diff a b order)]
 32 |     (if (and (>= retries-left 1) (not= (:out outcome) ""))
 33 |       (recur (dec retries-left) (_get-diff a b order))
 34 |       outcome)))
 35 | 
 36 | ;; (enable-debug)
 37 | 
 38 | (deftest df-api-test ; every API call below is only checked for returning a DataFrame
 39 |   (testing "Single dataframe manipulation APIs"
 40 |     (let [y (dataframe "test/clojask/Employees-example.csv" :have-col true)] ; local binding, not an inline def
 41 |       (is (= clojask.classes.DataFrame.DataFrame (type y)))
 42 |       (is (= clojask.classes.DataFrame.DataFrame (type (set-type y "Salary" "double"))))
 43 |       (is (= clojask.classes.DataFrame.DataFrame (type (set-parser y "Department" #(Double/parseDouble %)))))
 44 |       (is (= clojask.classes.DataFrame.DataFrame (type (filter y "Salary" (fn [salary] (<= salary 800))))))
 45 |       (is (= clojask.classes.DataFrame.DataFrame (type (operate y - "Salary"))))
 46 |       (is (= clojask.classes.DataFrame.DataFrame (type (operate y str ["Employee" "Salary"] "new-col"))))
 47 |       (is (= clojask.classes.DataFrame.DataFrame (type (group-by y ["Department"]))))
 48 |       (is (= clojask.classes.DataFrame.DataFrame (type (aggregate y max ["Salary"] ["Salary-max"]))))
 49 |       (is (= clojask.classes.DataFrame.DataFrame (type (compute y 8 "test/clojask/test_outputs/tmp.csv")))))
 50 |     ))
 51 | 
 52 | (deftest df-api-output-test ; only writes CSVs under test_outputs; test-ns-hook compares them with correct_outputs
 53 |     (testing "Single dataframe manipulation APIs"
 54 |     (def y (dataframe "test/clojask/Employees-example.csv" :have-col true))
 55 |     ;; element-wise operation: apply - to Salary and format it with a trailing "!" -> 1-1.csv
 56 |     (set-type y "Salary" "double")
 57 |     (operate y - "Salary")
 58 |     (set-formatter y "Salary" #(str % "!"))
 59 |     (compute y 8 "test/clojask/test_outputs/1-1.csv" :exception false :order true)
 60 |     ;; (let [result (get-diff "./test/clojask/test_outputs/1-1.csv" "./test/clojask/correct_outputs/1-1.csv" false)]
 61 |     ;;     (is (= "" (:out result)))
 62 |     ;;     (is (= "" (:err result))))
 63 |     ;; filter on Salary, then build "new-col" from Employee and Salary with str -> 1-2.csv
 64 |     (def y (dataframe "test/clojask/Employees-example.csv" :have-col true)) ; fresh dataframe for this section
 65 |     (set-type y "Salary" "double")
 66 |     (filter y "Salary" (fn [salary] (<= salary 800)))
 67 |     (operate y str ["Employee" "Salary"] "new-col")
 68 |     (compute y 8 "test/clojask/test_outputs/1-2.csv" :exception false)
 69 |     ;; (let [result (get-diff "./test/clojask/test_outputs/1-2.csv" "./test/clojask/correct_outputs/1-2.csv")]
 70 |     ;;     (is (= "" (:out result)))
 71 |     ;;     (is (= "" (:err result))))
 72 |     ;; group by Department, then aggregate Salary with gb-aggre/max into "new-Salary" -> 1-3.csv
 73 |     (def y (dataframe "test/clojask/Employees-example.csv" :have-col true))
 74 |     (set-type y "Salary" "double")
 75 |     (group-by y ["Department"])
 76 |     (aggregate y gb-aggre/max ["Salary"] ["new-Salary"])
 77 |     (compute y 8 "test/clojask/test_outputs/1-3.csv" :exception false)
 78 |     ;; (let [result (get-diff "test/clojask/test_outputs/1-3.csv" "test/clojask/correct_outputs/1-3.csv")]
 79 |     ;;     (is (= "" (:out result)))
 80 |     ;;     (is (= "" (:err result))))
 81 |     ;; aggregate without a preceding group-by, using aggre/max -> 1-10.csv
 82 |     (def y (dataframe "test/clojask/Employees-example.csv" :have-col true))
 83 |     (set-type y "Salary" "double")
 84 |     (aggregate y aggre/max ["Salary"] ["new-Salary"])
 85 |     (compute y 8 "test/clojask/test_outputs/1-10.csv" :exception false)
 86 |     ;; (let [result (get-diff "test/clojask/test_outputs/1-10.csv" "test/clojask/correct_outputs/1-10.csv")]
 87 |     ;;     (is (= "" (:out result)))
 88 |     ;;     (is (= "" (:err result))))
 89 |     ;; group-by without any aggregation -> 1-11.csv
 90 |     (def y (dataframe "test/clojask/Employees-example.csv" :have-col true))
 91 |     (group-by y ["Department"])
 92 |     (compute y 8 "test/clojask/test_outputs/1-11.csv" :exception false)
 93 |     ;; (let [result (get-diff "test/clojask/test_outputs/1-11.csv" "test/clojask/correct_outputs/1-11.csv")]
 94 |     ;;     (is (= "" (:out result)))
 95 |     ;;     (is (= "" (:err result))))
 96 |     ))
 97 | 
 98 | (deftest col-api-test ; checks get-col-names after reorder-col and after rename-col
 99 |   (testing "Column manipulation APIs"
100 |     (let [y (dataframe "test/clojask/Employees-example.csv" :have-col true)] ; local binding, not an inline def
101 |       (reorder-col y ["Employee" "Department" "EmployeeName" "Salary" "UpdateDate"])
102 |       (is (= (get-col-names y) ["Employee" "Department" "EmployeeName" "Salary" "UpdateDate"]))
103 |       (rename-col y "Department" "new-Department")
104 |       ;; (map (fn [a b] (rename-col y a b)) (get-col-names y) ["Employee" "new-Department" "EmployeeName" "Salary" "UpdateDate"])
105 |       (is (= (get-col-names y) ["Employee" "new-Department" "EmployeeName" "Salary" "UpdateDate"])))))
106 | 
107 | (deftest col-select-output-test ; writes 1-9.csv only; the file is compared in test-ns-hook
108 |   (testing "Select column(s) argument"
109 |     (let [y (dataframe "test/clojask/Employees-example.csv" :have-col true)] ; local binding, not an inline def
110 |       (compute y 8 "test/clojask/test_outputs/1-9.csv" :select ["Employee", "EmployeeName"] :exception false))
111 |     ;; (let [result (get-diff "test/clojask/test_outputs/1-9.csv" "test/clojask/correct_outputs/1-9.csv")]
112 |     ;;     (is (= "" (:out result)))
113 |     ;;     (is (= "" (:err result))))
114 |     ))
115 | 
116 | (deftest join-api-test ; each join is computed and only checked for returning a DataFrame
117 |   (testing "Join dataframes APIs"
118 |     (let [x (dataframe "test/clojask/Employees-example.csv") ; local bindings, not inline defs
119 |           y (dataframe "test/clojask/Employees-example.csv")]
120 |       (is (= clojask.classes.DataFrame.DataFrame (type (compute (left-join x y ["Employee"] ["Employee"]) 8 "resources/test.csv" :exception false))))
121 |       (is (= clojask.classes.DataFrame.DataFrame (type (compute (right-join x y ["Employee"] ["Employee"]) 8 "resources/test.csv" :exception false))))
122 |       (is (= clojask.classes.DataFrame.DataFrame (type (compute (inner-join x y ["Employee"] ["Employee"]) 8 "resources/test.csv" :exception false))))
123 |       (is (= clojask.classes.DataFrame.DataFrame (type (compute (rolling-join-forward x y ["Employee"] ["Employee"] "Salary" "Salary") 8 "resources/test.csv" :exception false)))))
124 |     ))
125 | 
126 | (deftest join-api-output-test ; only writes CSVs 1-4 .. 1-8 and 1-12; test-ns-hook compares them with correct_outputs
127 |     (testing "Join dataframes APIs"
128 |       (def x (dataframe "test/clojask/Employees-example.csv"))
129 |       (set-type x "UpdateDate" "date:yyyy/MM/dd")
130 |       (def y (dataframe "test/clojask/Employees-info-example.csv"))
131 |       (set-type y "UpdateDate" "date:yyyy/MM/dd")
132 |       (compute (left-join x y ["Employee"] ["Employee"]) 8 "test/clojask/test_outputs/1-4.csv" :exception false)
133 |       ;; (let [result (sh "zsh" "-c" "diff <(sort test/clojask/test_outputs/1-4.csv) <(sort test/clojask/correct_outputs/1-4.csv)")]
134 |       ;;   (is (= "" (:out result)))
135 |       ;;   (is (= "" (:err result))))
136 |       (compute (right-join x y ["Employee"] ["Employee"]) 8 "test/clojask/test_outputs/1-5.csv" :exception false)
137 |       ;; (let [result (sh "zsh" "-c" "diff <(sort test/clojask/test_outputs/1-5.csv) <(sort test/clojask/correct_outputs/1-5.csv)")]
138 |       ;;   (is (= "" (:out result)))
139 |       ;;   (is (= "" (:err result))))
140 |       (def z (inner-join x y ["Employee"] ["Employee"]))
141 |       (compute z 8 "test/clojask/test_outputs/1-6.csv" :exception false :select ["2_Employee" "2_EmployeeName" "2_DayOff" "2_UpdateDate" "1_Employee" "1_EmployeeName" "1_Department" "1_Salary" "1_UpdateDate"]) ; :select lists the second dataframe's columns before the first's
142 |       ;; (let [result (sh "zsh" "-c" "diff <(sort test/clojask/test_outputs/1-6.csv) <(sort test/clojask/correct_outputs/1-6.csv)")]
143 |       ;;   (is (= "" (:out result)))
144 |       ;;   (is (= "" (:err result))))
145 |       (compute (rolling-join-forward x y ["EmployeeName"] ["EmployeeName"] "UpdateDate" "UpdateDate") 8 "test/clojask/test_outputs/1-7.csv" :exception false)
146 |       ;; (let [result (sh "zsh" "-c" "diff <(sort test/clojask/test_outputs/1-7.csv) <(sort test/clojask/correct_outputs/1-7.csv)")]
147 |       ;;   (is (= "" (:out result)))
148 |       ;;   (is (= "" (:err result))))
149 |       (compute (rolling-join-backward x y ["EmployeeName"] ["EmployeeName"] "UpdateDate" "UpdateDate") 8 "test/clojask/test_outputs/1-8.csv" :exception false)
150 |       ;; (let [result (sh "zsh" "-c" "diff <(sort test/clojask/test_outputs/1-8.csv) <(sort test/clojask/correct_outputs/1-8.csv)")]
151 |       ;;   (is (= "" (:out result)))
152 |       ;;   (is (= "" (:err result))))
153 |       (compute (outer-join x y ["Employee"] ["Employee"]) 8 "test/clojask/test_outputs/1-12.csv" :select ["1_Department" "1_Salary" "1_UpdateDate" "2_Employee" "2_EmployeeName" "2_DayOff" "2_UpdateDate"]) ; NOTE(review): no :exception false on this call, unlike the others - confirm intended
154 |       ;; (let [result (sh "zsh" "-c" "diff <(sort test/clojask/test_outputs/1-12.csv) <(sort test/clojask/correct_outputs/1-12.csv)")]
155 |       ;;   (is (= "" (:out result)))
156 |       ;;   (is (= "" (:err result))))
157 |       ))
158 | 
159 | (deftest test-ns-hook ; runs every test, then compares each generated CSV with its expected file
160 |   (testing "Check all the outputs in a nested way"
161 |     (df-api-test)
162 |     (df-api-output-test)
163 |     (col-api-test)
164 |     (col-select-output-test)
165 |     (join-api-test)
166 |     (join-api-output-test) ; from here on only file comparisons; get-diff sorts lines unless passed false
167 |     (let [result (get-diff "./test/clojask/test_outputs/1-1.csv" "./test/clojask/correct_outputs/1-1.csv" false)]
168 |       (is (= "" (:out result)))
169 |       (is (= "" (:err result))))
170 |     (let [result (get-diff "./test/clojask/test_outputs/1-2.csv" "./test/clojask/correct_outputs/1-2.csv")]
171 |       (is (= "" (:out result)))
172 |       (is (= "" (:err result))))
173 |     (let [result (get-diff "test/clojask/test_outputs/1-3.csv" "test/clojask/correct_outputs/1-3.csv")]
174 |       (is (= "" (:out result)))
175 |       (is (= "" (:err result))))
176 |     (let [result (get-diff "test/clojask/test_outputs/1-10.csv" "test/clojask/correct_outputs/1-10.csv")]
177 |       (is (= "" (:out result)))
178 |       (is (= "" (:err result))))
179 |     (let [result (get-diff "test/clojask/test_outputs/1-11.csv" "test/clojask/correct_outputs/1-11.csv")]
180 |       (is (= "" (:out result)))
181 |       (is (= "" (:err result))))
182 |     (let [result (get-diff "test/clojask/test_outputs/1-9.csv" "test/clojask/correct_outputs/1-9.csv")]
183 |       (is (= "" (:out result)))
184 |       (is (= "" (:err result))))
185 |     (let [result (get-diff "test/clojask/test_outputs/1-4.csv" "test/clojask/correct_outputs/1-4.csv")] ; join outputs: sorted comparison in-process, no zsh/sort/diff needed
186 |       (is (= "" (:out result)))
187 |       (is (= "" (:err result))))
188 |     (let [result (get-diff "test/clojask/test_outputs/1-5.csv" "test/clojask/correct_outputs/1-5.csv")]
189 |       (is (= "" (:out result)))
190 |       (is (= "" (:err result))))
191 |     (let [result (get-diff "test/clojask/test_outputs/1-6.csv" "test/clojask/correct_outputs/1-6.csv")]
192 |       (is (= "" (:out result)))
193 |       (is (= "" (:err result))))
194 |     (let [result (get-diff "test/clojask/test_outputs/1-7.csv" "test/clojask/correct_outputs/1-7.csv")]
195 |       (is (= "" (:out result)))
196 |       (is (= "" (:err result))))
197 |     (let [result (get-diff "test/clojask/test_outputs/1-8.csv" "test/clojask/correct_outputs/1-8.csv")]
198 |       (is (= "" (:out result)))
199 |       (is (= "" (:err result))))
200 |     (let [result (get-diff "test/clojask/test_outputs/1-12.csv" "test/clojask/correct_outputs/1-12.csv")]
201 |       (is (= "" (:out result)))
202 |       (is (= "" (:err result))))))
203 | 
204 | 


--------------------------------------------------------------------------------
/src/main/clojure/clojask/aggregate/aggre_onyx_comps.clj:
--------------------------------------------------------------------------------
  1 | (ns clojask.aggregate.aggre-onyx-comps
  2 |   (:require [clojask.aggregate.aggre-input :as input]
  3 |             [clojask.aggregate.aggre-output :as output]
  4 |             ;; [clojask.clojask-groupby :as groupby]
  5 |             ;; [clojask.clojask-join :as join]
  6 |             [onyx.api :refer :all]
  7 |             [clojure.string :as string]
  8 |             [onyx.test-helper :refer [with-test-env feedback-exception!]]
  9 |             ;; [tech.v3.dataset :as ds]
 10 |             [clojure.data.csv :as csv]
 11 |             [clojask.utils :as u]
 12 |             [clojure.set :as set]
 13 |             [clojask.groupby :refer [read-csv-seq insert-mgroup]])
 14 |   (:import (java.io BufferedReader FileReader BufferedWriter FileWriter)
 15 |            [com.clojask.exception ExecutionException]))
 16 | 
 17 | 
 18 | (def id (java.util.UUID/randomUUID)) ; random UUID created when this namespace loads; config-env uses it as :onyx/tenancy-id
 19 | 
 20 | (defn workflow-gen
 21 |   "Build the Onyx workflow for `num-work` workers and store it in the var `workflow`.
 22 |   The vector lists every [:in worker] edge first, then every [worker :output] edge.
 23 |   Returns nil."
 24 |   [num-work]
 25 |   (let [worker-names (map (fn [n] (keyword (str "sample-worker" n)))
 26 |                           (range 1 (+ num-work 1)))
 27 |         ;; edges from the input task to each worker
 28 |         input-edges (mapv (fn [worker] [:in worker]) worker-names)
 29 |         ;; edges from each worker to the output task
 30 |         output-edges (mapv (fn [worker] [worker :output]) worker-names)]
 31 |     ;; a def rather than a local: config-env in this namespace reads `workflow`
 32 |     (def workflow (into input-edges output-edges))
 33 |     nil))
 37 | 
 38 | (def dataframe (atom nil)) ; holds the dataframe most recently passed to worker-func-gen
 39 | 
 40 | 
 41 | (defn worker-func-gen ; (re)defines the namespace-level fn `worker-func` as a closure over these arguments
 42 |   [df exception aggre-funcs index formatter source] ; note: `exception` is not used in this body
 43 |   (reset! dataframe df) ; keep df in the namespace-level atom
 44 |   (let [
 45 |         ;; aggre-funcs (.getAggreFunc (.row-info (deref dataframe)))
 46 |         formatters formatter ; looked up by column index when formatting aggregation results
 47 |         ;; key-index (.getKeyIndex (.col-info (deref dataframe)))
 48 |         ;; formatters (set/rename-keys formatters key-index)
 49 |         reorder (fn [a b] ; joins a and b, then hands the result and the outer `index` to u/gets
 50 |                       ;; (println [a b])
 51 |                   (u/gets (concat a b) index))
 52 |         groupby-keys (.getGroupbyKeys (:row-info df))
 53 |         ;; tmp (println groupby-keys)
 54 |         groupby-index (mapv #(nth % 1) groupby-keys) ; element 1 of every group-by key entry
 55 |         org-format (set/rename-keys (.getFormatter (:col-info df)) (zipmap groupby-index (iterate inc 0))) ; col-info formatters with each groupby-index key renamed to its position 0,1,2,...
 56 |         pre-index (take (count groupby-index) (iterate inc 0)) ; 0 .. (number of group-by keys - 1)
 57 |         ]
 58 |     (defn worker-func
 59 |       "refered in preview"
 60 |       [seq] ; a map; :file and :d are read from it (the name shadows clojure.core/seq in this body)
 61 |       ;; (println formatters)
 62 |       (let [data (if (= source nil) (read-csv-seq (:file seq)) (.getKey source (:file seq))) ; no source: read via read-csv-seq, otherwise via (.getKey source ...)
 63 |             pre (:d seq)
 64 |             pre (u/gets-format pre pre-index org-format) ; presumably formats the group-by values of :d - confirm against u/gets-format
 65 |             data-map (-> (iterate inc 0) ; position -> vector of the values at that position of every row (rows transposed)
 66 |                          (zipmap (apply map vector data)))]
 67 |         ;; (mapv (fn [_]
 68 |         ;;        (let [func (first _)
 69 |         ;;              index (nth _ 1)]
 70 |         ;;          (func (get data-map index))))
 71 |         ;;      aggre-funcs)
 72 |         ;; (println data)
 73 |         (loop [aggre-funcs aggre-funcs ; functions still to apply
 74 |                res []] ; one vector of formatted results per function already applied
 75 |           (if (= aggre-funcs []) ; nothing left to apply
 76 |             ;; {:d (vec (concat pre res))}
 77 |             (if (= res [])
 78 |               {:d [pre]} ; no aggregation results: emit only the formatted prefix
 79 |               {:d (mapv reorder (repeat pre) (apply map vector res))}) ; transpose res into rows and run reorder on pre plus each row
 80 |             (let [func (first (first aggre-funcs)) ; element 0 of the entry is the function
 81 |                   index (nth (first aggre-funcs) 1) ; element 1 is the column position; shadows the outer `index` within this let
 82 |                   res-funcs (rest aggre-funcs)
 83 |                   ;; tmp (println index)
 84 |                   ;; tmp (println (str data-map))
 85 |                   new (func (get data-map index)) ; apply the function to the whole column
 86 |                   new (if (coll? new) ; wrap a single value so every result is a collection
 87 |                         new
 88 |                         (vector new))
 89 |                   new (mapv (fn [_] (if-let [formatter (get formatters index)] ; use the formatter registered for this position, else str
 90 |                                      (formatter _)
 91 |                                      (str _))) new)]
 92 |               (if (or (= res []) (= (count new) (count (last res)))) ; each function must yield as many values as the previous one
 93 |                 (recur res-funcs (conj res new))
 94 |                 (throw (Exception. "aggregation result is not of the same length")))
 95 |               )))
 96 |         ))))
 97 | 
 98 | (defn catalog-gen
 99 |   "Assemble the Onyx catalog and store it in the var `catalog`: the :in task,
100 |   one :sample-workerN function task for each N in 1..num-work, then the :output task.
101 |   Every entry uses `batch-size` as its :onyx/batch-size."
102 |   [num-work batch-size]
103 |   (let [input-task {:onyx/name :in
104 |                     :onyx/plugin :clojask.aggregate.aggre-input/input
105 |                     :onyx/type :input
106 |                     :onyx/medium :seq
107 |                     :seq/checkpoint? true
108 |                     :onyx/batch-size batch-size
109 |                     :onyx/max-peers 1
110 |                     :input/doc "Reads segments from a core.async channel"}
111 |         ;; every worker task points at the same function, worker-func of this namespace
112 |         worker-task (fn [n]
113 |                       {:onyx/name (keyword (str "sample-worker" n))
114 |                        :onyx/fn (keyword "clojask.aggregate.aggre-onyx-comps" "worker-func")
115 |                        :onyx/type :function
116 |                        :onyx/batch-size batch-size
117 |                        :worker/doc "This is a worker node"})
118 |         output-task {:onyx/name :output
119 |                      :onyx/plugin :clojask.aggregate.aggre-output/output
120 |                      :onyx/type :output
121 |                      :onyx/medium :core.async ;; this medium value is made up
122 |                      :onyx/max-peers 1
123 |                      :onyx/batch-size batch-size
124 |                      :output/doc "Writes segments to the file"}]
125 |     (def catalog
126 |       (-> [input-task]
127 |           (into (map worker-task (range 1 (+ num-work 1))))
128 |           (conj output-task)))))
142 | 
143 | 
144 | (defn inject-in-reader
145 |   "Lifecycle hook: return a map carrying the :buffered-reader/path found on the lifecycle entry."
146 |   [event lifecycle]
147 |   {:buffered-reader/path (get lifecycle :buffered-reader/path)})
148 | 
149 | 
150 | (def in-calls ; lifecycle calls map that lifecycle-gen references as ::in-calls
151 |   {:lifecycle/before-task-start inject-in-reader})
152 | 
153 | 
154 | (defn lifecycle-gen
155 |   "Store in the var `lifecycles` the entries for :in (reader path `source`) and :output (file `dist`)."
156 |   [source dist]
157 |   (let [path-entry {:lifecycle/task :in
158 |                     :buffered-reader/path source
159 |                     :lifecycle/calls ::in-calls}
160 |         reader-entry {:lifecycle/task :in, :lifecycle/calls :clojask.aggregate.aggre-input/reader-calls}
161 |         writer-entry {:lifecycle/task :output
162 |                       :buffered-wtr/filename dist
163 |                       :lifecycle/calls :clojask.aggregate.aggre-output/writer-calls}]
164 |     (def lifecycles [path-entry reader-entry writer-entry])))
165 | 
166 | (def num-workers (atom 1)) ; worker count, starts at 1; flow-cond-gen resets it to its num-work argument
167 | 
168 | ;; (defn rem0?
169 | ;;   [event old-segment new-segment all-new-segment]
170 | ;;   ;; (spit "resources/debug.txt" (str new-segment "\n") :append true)
171 | ;;   (= (mod (:id new-segment) (deref num-workers)) 0))
172 | 
173 | ;; (defn rem1?
174 | ;;   [event old-segment new-segment all-new-segment]
175 | ;;   (= (mod (:id new-segment) (deref num-workers)) 1))
176 | 
177 | ;; (defn rem2?
178 | ;;   [event old-segment new-segment all-new-segment]
179 | ;;   (= (mod (:id new-segment) (deref num-workers)) 2))
180 | 
181 | ;; (defn rem3?
182 | ;;   [event old-segment new-segment all-new-segment]
183 | ;;   (= (mod (:id new-segment) (deref num-workers)) 3))
184 | 
185 | ;; (defn rem4?
186 | ;;   [event old-segment new-segment all-new-segment]
187 | ;;   (= (mod (:id new-segment) (deref num-workers)) 4))
188 | 
189 | ;; (defn rem5?
190 | ;;   [event old-segment new-segment all-new-segment]
191 | ;;   (= (mod (:id new-segment) (deref num-workers)) 5))
192 | 
193 | ;; (defn rem6?
194 | ;;   [event old-segment new-segment all-new-segment]
195 | ;;   (= (mod (:id new-segment) (deref num-workers)) 6))
196 | 
197 | ;; (defn rem7?
198 | ;;   [event old-segment new-segment all-new-segment]
199 | ;;   (= (mod (:id new-segment) (deref num-workers)) 7))
200 | 
201 | ;; (defn rem8?
202 | ;;   [event old-segment new-segment all-new-segment]
203 | ;;   (= (mod (:id new-segment) (deref num-workers)) 8))
204 | 
205 | 
206 | ;; [{:flow/from :in
207 | ;;   :flow/to [:sample-worker1]
208 | ;;   :flow/predicate :clojask.onyx-comps/rem0?
209 | ;;   :flow/doc ""}
210 | ;;  {:flow/from :in
211 | ;;   :flow/to [:sample-worker2]
212 | ;;   :flow/predicate :clojask.onyx-comps/rem1?
213 | ;;   :flow/doc ""}]
214 | 
215 | (defn flow-cond-gen
216 |   "Record `num-work` in the num-workers atom, intern one predicate rem0? .. rem(N-1)? in this
217 |   namespace, and store the matching flow conditions in the var `flow-conditions`. Returns nil."
218 |   [num-work]
219 |   (reset! num-workers num-work)
220 |   (def flow-conditions
221 |     (mapv (fn [x]
222 |             (let [remainder (- x 1)
223 |                   pred-name (str "rem" remainder "?")]
224 |               ;; the predicate for worker x holds when (:id segment) mod num-work equals x - 1
225 |               (intern 'clojask.aggregate.aggre-onyx-comps
226 |                       (symbol pred-name)
227 |                       (fn [event old-segment new-segment all-new-segment]
228 |                         (= (mod (:id new-segment) num-work) remainder)))
229 |               {:flow/from :in
230 |                :flow/to [(keyword (str "sample-worker" x))]
231 |                :flow/predicate (keyword "clojask.aggregate.aggre-onyx-comps" pred-name)
232 |                :worker/doc "This is a flow condition"}))
233 |           (range 1 (+ num-work 1))))
234 |   nil)
237 | 
(defn config-env
  "Defines the Onyx configuration vars and starts the local environment.

  Rebinds `env-config`, `peer-config`, `env`, `peer-group`, `n-peers` and
  `v-peers`. The environment runs its own ZooKeeper server on port 2188, and
  one virtual peer is started for every distinct task named in `workflow`."
  []
  (let [zk-address "127.0.0.1:2188"
        log-file ".clojask/clojask.log"]
    (def env-config
      {:zookeeper/address zk-address
       :zookeeper/server? true
       :zookeeper.server/port 2188
       :onyx/tenancy-id id
       :onyx.log/file log-file})
    (def peer-config
      {:zookeeper/address zk-address
       :onyx/tenancy-id id
       :onyx.peer/job-scheduler :onyx.job-scheduler/balanced
       :onyx.messaging/impl :aeron
       :onyx.messaging/peer-port 40200
       :onyx.messaging/bind-addr "localhost"
       :onyx.log/file log-file}))
  ;; the environment must be up before the peer group, and the group before its peers
  (def env (onyx.api/start-env env-config))
  (def peer-group (onyx.api/start-peer-group peer-config))
  ;; every workflow edge is a [from to] pair; flatten and de-duplicate to count tasks
  (def n-peers (count (into #{} cat workflow)))
  (def v-peers (onyx.api/start-peers n-peers peer-group)))
263 | 
(defn shutdown
  "Stops every virtual peer in `v-peers`, then the peer group, then the Onyx
  environment, in that order."
  []
  (run! onyx.api/shutdown-peer v-peers)
  (onyx.api/shutdown-peer-group peer-group)
  (onyx.api/shutdown-env env))
270 | 
(defn start-onyx-aggre
  "Start the Onyx cluster with the specification inside dataframe.

  Runs three stages, each wrapping any Exception in an ExecutionException
  that names the stage:
    1. preparing      - generate workflow, environment, worker function,
                        catalog, lifecycles and flow conditions, and inject
                        the dataframe into the input/output plugins;
    2. submit-to-onyx - submit the job and wait for it; the cluster is shut
                        down before the error is rethrown;
    3. terminate-node - shut the cluster down.
  Returns \"success\" when all stages complete."
  [num-work batch-size dataframe source dist exception aggre-func index formatter out]
  ;; stage 1: build every piece the job description refers to
  (try
    (workflow-gen num-work)
    (config-env)
    (worker-func-gen dataframe exception aggre-func index formatter source) ;;need some work
    (catalog-gen num-work batch-size)
    ;; without an in-memory source the grouped files on disk are the input
    (lifecycle-gen (when (nil? source) "./.clojask/grouped") dist)
    (flow-cond-gen num-work)
    (input/inject-dataframe dataframe source)
    (output/inject-dataframe dataframe out)
    (catch Exception e
      (throw (ExecutionException. (format "[preparing stage (groupby aggregate)]  Refer to .clojask/clojask.log for detailed information. (original error: %s)" (.getMessage e))))))
  ;; stage 2: submit and block until the job finishes or fails
  (try
    (let [job {:workflow workflow
               :catalog catalog
               :lifecycles lifecycles
               :flow-conditions flow-conditions
               :task-scheduler :onyx.task-scheduler/balanced}
          job-id (:job-id (onyx.api/submit-job peer-config job))]
      (assert job-id "Job was not successfully submitted")
      (feedback-exception! peer-config job-id))
    (catch Exception e
      (shutdown)
      (throw (ExecutionException. (format "[submit-to-onyx stage (groupby aggregate)]  Refer to .clojask/clojask.log for detailed information. (original error: %s)" (.getMessage e))))))
  ;; stage 3: release peers, peer group and environment
  (try
    (shutdown)
    (catch Exception e
      (throw (ExecutionException. (format "[terminate-node stage (groupby aggregate)]  Refer to .clojask/clojask.log for detailed information. (original error: %s)" (.getMessage e))))))
  "success")
304 | 


--------------------------------------------------------------------------------
/src/main/clojure/clojask/utils.clj:
--------------------------------------------------------------------------------
  1 | (ns clojask.utils
  2 |   (:require [clojure.core.async :refer [chan sliding-buffer >!! close!]]
  3 |             [clojure.java.io :refer [resource]]
  4 |             [onyx.plugin.core-async :refer [take-segments!]]
  5 |             ;; [tech.v3.dataset :as ds]
  6 |             [clojure.string :as str]
  7 |             [clojure.java.io :as io])
  8 |   (:import (java.util Date)
  9 |            (java.time LocalDate)
 10 |            (java.time LocalDateTime)
 11 |            (java.time.format DateTimeFormatter)
 12 |            (java.util Base64)))
 13 | "Utility function used in dataframe"
 14 | 
 15 | (defn gets
 16 |   "unlike core/get, get elements from indices"
 17 |   [coll indices]
 18 |   (mapv #(nth coll %) indices)
 19 |   )
 20 | 
 21 | (defn gets-format
 22 |   "gets with format"
 23 |   [coll indices formatters]
 24 |   (mapv (fn [_] 
 25 |           (let [val (nth coll _)]
 26 |             (if-let [formatter (get formatters _)]
 27 |               (formatter val)
 28 |               val))) indices)
 29 |   )
 30 | 
 31 | (defn get-key
 32 |   [row types key-index key]
 33 |   (let [index (get key-index key)]
 34 |     (if (contains? types index)
 35 |       ((get types index) (.get row index))
 36 |       (.get row index))))
 37 | 
 38 | (defn get-val
 39 |   [row types index]
 40 |   (map (fn [_] (if-let [parser (get types _)]
 41 |                  (parser (nth row _))
 42 |                  (nth row _)))
 43 |        index))
 44 | 
 45 | (defn eval-res
 46 |   [row types formats operations index]
 47 |   ;; (spit "resources/debug.txt" (str row "\n") :append true)
 48 |   ;; (spit "resources/debug.txt" (str types) :append true)
 49 |   ;; (spit "resources/debug.txt" operations :append true)
 50 |   ;; (spit "resources/debug.txt" (str index "\n") :append true)
 51 |   ;; (println opr-vec)
 52 |   (let [opr-vec (get operations index)
 53 |         vals (get-val row types (first opr-vec))]
 54 |     ;; (println [vals])
 55 |     (loop [res vals oprs (rest opr-vec)]
 56 |       (if (= (count oprs) 0)
 57 |         (first res)
 58 |         (let [opr (first oprs)
 59 |               rest (rest oprs)]
 60 |           (recur [(apply opr res)] rest))))))
 61 | 
 62 | (defn eval-res-ne
 63 |   [row types formats operations index]
 64 |   ;; (spit "resources/debug.txt" (str row "\n") :append true)
 65 |   ;; (spit "resources/debug.txt" (str types) :append true)
 66 |   ;; (spit "resources/debug.txt" operations :append true)
 67 |   ;; (spit "resources/debug.txt" (str index "\n") :append true)
 68 |   ;; (println opr-vec)
 69 |   (try
 70 |     (let [opr-vec (get operations index)
 71 |           vals (get-val row types (first opr-vec))]
 72 |     ;; (println [vals])
 73 |       (loop [res vals oprs (rest opr-vec)]
 74 |         (if (= (count oprs) 0)
 75 |           (first res)
 76 |           (let [opr (first oprs)
 77 |                 rest (rest oprs)]
 78 |             (recur [(apply opr res)] rest)))))
 79 |     (catch Exception e nil)))
 80 | 
 81 | (defn filter-check
 82 |   [filters types row]
 83 |   ;; (loop [filters filters]
 84 |   ;;   (let [filter (first filters)
 85 |   ;;         rem (rest filters)]
 86 |   ;;     (if (= filter nil)
 87 |   ;;       true
 88 |   ;;       (if (not= (filter row) true)
 89 |   ;;         false
 90 |   ;;         (recur rem)))))
 91 |   (if (= filters [])
 92 |     true
 93 |     (loop [filters filters]
 94 |       (let [com (first filters)
 95 |             rem (rest filters)]
 96 |         ;; (println com)
 97 |         (if (= com nil)
 98 |           true
 99 |           (do
100 |             (if (apply (first com) (get-val row types (nth com 1)))
101 |               (recur rem)
102 |               false)))))))
103 | 
(def toInt
  ;; Atom holding the string -> integer parser; the parser yields nil when the
  ;; text is not a valid integer.
  (atom (fn [text]
          (try
            (Integer/parseInt text)
            (catch Exception _ nil)))))
109 | 
(def toDouble
  ;; Atom holding the string -> double parser; the parser yields nil when the
  ;; text is not a valid number.
  (atom (fn [text]
          (try
            (Double/parseDouble text)
            (catch Exception _ nil)))))
115 | 
(def toString
  ;; Atom holding the parser for string columns: the text is returned untouched.
  (atom (fn [text] text)))
119 | 
(def fromString
  ;; Atom holding the default formatter: renders any value with `str`.
  (atom (fn [value] (str value))))
122 | 
(defn- make-date-parser
  "Returns a fn that parses text with the SimpleDateFormat `pattern` and
  yields nil (instead of throwing) when the text cannot be parsed."
  [pattern]
  (fn [string]
    (try
      ;; a fresh SimpleDateFormat per call: instances are not thread-safe
      (.parse (java.text.SimpleDateFormat. pattern) string)
      (catch Exception e nil))))

(defn- make-date-formatter
  "Returns a fn that renders a java.util.Date with the SimpleDateFormat
  `pattern`; any other value is returned unchanged."
  [pattern]
  (fn [date]
    (if (= (type date) java.util.Date)
      (.format (java.text.SimpleDateFormat. pattern) date)
      date)))

;; Atom holding the current string -> Date parser (default pattern yyyy-MM-dd).
(def toDate
  (atom (make-date-parser "yyyy-MM-dd")))

;; Atom holding the current Date -> string formatter (default pattern yyyy-MM-dd).
(def fromDate
  (atom (make-date-formatter "yyyy-MM-dd")))

(defn set-format-string
  "Re-points `toDate` and `fromDate` at the date pattern named by `string`.

  `string` is a type spec such as \"date:dd/MM/yyyy\" or
  \"datetime:yyyy-MM-dd HH:mm\": everything after the first colon is used as
  the SimpleDateFormat pattern. Any other spec restores the default pattern
  yyyy-MM-dd. In both cases the parser returns nil for unparseable text.
  Returns the newly installed formatter."
  [string]
  (let [pattern (if (or (str/starts-with? string "date:")
                        (str/starts-with? string "datetime:"))
                  (subs string (inc (str/index-of string ":")))
                  "yyyy-MM-dd")]
    (reset! toDate (make-date-parser pattern))
    (reset! fromDate (make-date-formatter pattern))))
168 | 
(def type-operation-map
  ;; type name -> [parser-atom formatter-atom]; both date flavours share the
  ;; same pair of atoms.
  (let [text-out fromString
        date-ops [toDate fromDate]]
    {"int" [toInt text-out]
     "double" [toDouble text-out]
     "string" [toString text-out]
     "date" date-ops
     "datetime" date-ops}))
175 | 
(defn type-detection
  "Unfinished stub: takes a (lazy) sample of the first five items of `file`
  but does nothing with it. Always returns nil."
  [file]
  (let [preview (take 5 file)]
    nil))
179 | 
(defn is-in
  "True when `col` is a key of the dataframe's column key-index, false otherwise."
  [col dataframe]
  (let [key-index (.getKeyIndex (:col-info dataframe))]
    (contains? key-index col)))
185 | 
(defn is-out
  "True when `col` is NOT a key of the dataframe's column key-index, false otherwise."
  [col dataframe]
  (let [key-index (.getKeyIndex (:col-info dataframe))]
    (not (contains? key-index col))))
191 | 
(defn are-in
  "Returns the columns of `cols` that are missing from `dataframe`; the result
  is empty when all of them are in."
  [cols dataframe]
  (for [col cols
        :when (is-out col dataframe)]
    col))
196 | 
(defn are-out
  "Returns the columns of `cols` that already exist in `dataframe`; the result
  is empty when all of them are out."
  [cols dataframe]
  (for [col cols
        :when (is-in col dataframe)]
    col))
201 | 
(defn init-file
  "Prepares the working files for a run.

  Deletes `out-dir` (silently, when given), removes whatever can be removed
  under ./.clojask/grouped/ and ./.clojask/join/, then makes sure the
  grouped, join/a, join/b and sort directories exist. `header` is not used."
  [out-dir header]
  (letfn [(purge! [dir]
            ;; `rest` skips the directory itself; entries that cannot be
            ;; deleted (e.g. non-empty sub-directories) are left in place
            (doseq [entry (rest (file-seq (io/file dir)))]
              (try
                (io/delete-file entry)
                (catch Exception e nil))))]
    (when (some? out-dir)
      (io/delete-file out-dir true))
    (purge! "./.clojask/grouped/")
    (purge! "./.clojask/join/")
    ;; make-parents creates the directories above the (never created) a.txt
    (io/make-parents "./.clojask/grouped/a.txt")
    (io/make-parents "./.clojask/join/a/a.txt")
    (io/make-parents "./.clojask/join/b/a.txt")
    (io/make-parents "./.clojask/sort/a.txt")))
222 | 
(defn get-type-string
  "Returns the type of `x` as a string without the leading \"class \" of its
  printed form, or \"nil\" for nil."
  [x]
  (if (nil? x)
    "nil"
    (let [printed (str (type x))]
      ;; drop the 6-character \"class \" prefix
      (subs printed 6))))
228 | 
(defn get-type-string-vec
  "Returns the distinct type names found in `col`, sorted and joined with \" & \"."
  [col]
  (->> col
       (map get-type-string)
       distinct
       sort
       (str/join " & ")))
234 | 
(defn check-duplicate-col
  "Check for duplicated column names and return the column names with every
  duplicated name made distinct.

  When no name repeats, `colNames` is returned as is. Otherwise a warning is
  printed and the k-th occurrence of a repeated name gets the suffix k
  (\"a\" \"b\" \"a\" -> \"a1\" \"b\" \"a2\"); names that occur once are kept.
  Each name is renamed at most once, in a single pass, so a generated name is
  never renamed again. Generated names are not checked against names already
  present in the input."
  [colNames]
  (if (= (count (distinct colNames)) (count colNames))
    colNames
    (let [occurrences (frequencies colNames)]
      (println "WARNING: Duplicated columns found")
      (->> colNames
           (reduce (fn [[renamed seen] col]
                     (if (> (get occurrences col) 1)
                       ;; `seen` remembers how many times this name was met so far
                       (let [nth-seen (inc (get seen col 0))]
                         [(conj renamed (str col nth-seen)) (assoc seen col nth-seen)])
                       [(conj renamed col) seen]))
                   [[] {}])
           first
           seq))))
255 | 
(defn proc-groupby-key-each
  "Normalises one group-by key into a pair [function-or-nil column-name].

  Accepted shapes:
    \"col\"       -> [nil \"col\"]
    [\"col\"]     -> [nil \"col\"]
    [f \"col\"]   -> returned unchanged (f must be a function)
  Anything else throws an Exception."
  [pair]
  (cond
    (string? pair)
    [nil pair]

    (not (coll? pair))
    (throw (Exception. "group-by key must be a column name or a [function column-name] pair"))

    (and (= 2 (count pair)) (fn? (first pair)) (string? (nth pair 1)))
    pair

    ;; unwrap the name so the result has the same [fn name] shape as the other cases
    (and (= 1 (count pair)) (string? (first pair)))
    [nil (first pair)]

    :else
    (throw (Exception. "group-by key must be a column name or a [function column-name] pair"))))
267 | 
(defn proc-groupby-key
  "Normalises a group-by specification into a vector of
  [function-or-nil column-name] pairs, or nil when it is not valid.

    \"col\"          -> [[nil \"col\"]]
    [f x]          -> [[f x]]        (f a function; other lengths give nil)
    [k1 k2 ...]    -> each key normalised with `proc-groupby-key-each`
  Any Exception raised while normalising yields nil."
  [input]
  (try
    (cond
      (string? input) [[nil input]]
      (not (coll? input)) nil
      (fn? (first input)) (when (= 2 (count input)) [input])
      :else (mapv proc-groupby-key-each input))
    (catch Exception e nil)))
282 | 
(defn get-func-str
  "Turns the printed form of a function object into a readable name: the
  part from the last \"@\" on is dropped, \"$\" becomes \"/\" and \"_\"
  becomes \"-\"."
  [func]
  (let [printed (str func)
        at-pos (str/last-index-of printed "@")]
    (-> printed
        (subs 0 at-pos)
        (str/replace "$" "/")
        (str/replace "_" "-"))))
287 | 
;; URL- and filename-safe Base64 codec shared by `encode-str` / `decode-str`.
(def encoder (Base64/getUrlEncoder))
(def decoder (Base64/getUrlDecoder))

(defn encode-str
  "Encodes the string `s` as URL-safe Base64 text.
  The bytes are always taken as UTF-8, so the result does not depend on the
  platform default charset (a non-Unicode default would otherwise collapse
  different non-ASCII strings onto the same encoding)."
  [s]
  (.encodeToString encoder (.getBytes ^String s java.nio.charset.StandardCharsets/UTF_8)))

(defn decode-str
  "Decodes URL-safe Base64 text produced by `encode-str` back into the
  original string (UTF-8)."
  [s]
  (String. ^bytes (.decode decoder ^String s) java.nio.charset.StandardCharsets/UTF_8))
298 | 
299 | ;; (def toDate
300 | ;;   (atom (fn [string]
301 | ;;           (try
302 | ;;             (LocalDate/parse string (DateTimeFormatter/ofPattern "yyyy-MM-dd"))
303 | ;;             (catch Exception e (throw e))))))
304 | 
305 | ;; (def fromDate
306 | ;;   (atom (fn [date]
307 | ;;           (if (= (type date) java.time.LocalDate)
308 | ;;             (.format date (DateTimeFormatter/ofPattern "yyyy-MM-dd"))
309 | ;;             date))))
310 | 
311 | ;; (def toDateTime
312 | ;;   (atom (fn [string]
313 | ;;           (try
314 | ;;             (LocalDateTime/parse string (DateTimeFormatter/ofPattern "yyyy-MM-dd HH:mm:ss"))
315 | ;;             (catch Exception e (throw e))))))
316 | 
317 | ;; (def fromDateTime
318 | ;;   (atom (fn [date]
319 | ;;           (if (= (type date) java.time.LocalDateTime)
320 | ;;             (.format date (DateTimeFormatter/ofPattern "yyyy-MM-dd HH:mm:ss"))
321 | ;;             date))))
322 | 
323 | ;; (defn set-format-string
324 | ;;   [string]
325 | ;;   (if (or (str/starts-with? string "date:") (str/starts-with? string "datetime:"))
326 | ;;     (let [format-string (subs string (inc (str/index-of string ":")))]
327 | ;;       (reset! toDate
328 | ;;               (fn [string]
329 | ;;                 (try
330 | ;;                   (LocalDate/parse string (DateTimeFormatter/ofPattern format-string))
331 | ;;                   (catch Exception e (throw e)))))
332 | 
333 | ;;       (reset! fromDate
334 | ;;               (fn [date]
335 | ;;                 (if (= (type date) java.time.LocalDate)
336 | ;;                   (.format date (DateTimeFormatter/ofPattern format-string))
337 | ;;                   date)))
338 | 
339 | ;;       (reset! toDateTime
340 | ;;               (fn [string]
341 | ;;                 (try
342 | ;;                   (LocalDateTime/parse string (DateTimeFormatter/ofPattern format-string))
343 | ;;                   (catch Exception e (throw e)))))
344 | 
345 | ;;       (reset! fromDateTime
346 | ;;               (fn [date]
347 | ;;                 (if (= (type date) java.time.LocalDateTime)
348 | ;;                   (.format date (DateTimeFormatter/ofPattern format-string))
349 | ;;                   date))))
350 | ;;     ))
351 | 
352 | ;; ;; (def operation-type-map
353 | ;; ;;   {toInt "int"
354 | ;; ;;    toDouble "double"
355 | ;; ;;    toString "string"
356 | ;; ;;    toDate "date"})
357 | 
358 | ;; (def type-operation-map
359 | ;;   {"int" [toInt fromString]
360 | ;;    "double" [toDouble fromString]
361 | ;;    "string" [toString fromString]
362 | ;;    "date" [toDate fromDate]
363 | ;;    "datetime" [toDateTime fromDateTime]})


--------------------------------------------------------------------------------
/src/main/clojure/clojask/join.clj:
--------------------------------------------------------------------------------
  1 | (ns clojask.join
  2 |   (:require [clojure.java.io :as io]
  3 |             [clojure.core.async :as async]
  4 |             ;; [clojask.onyx-comps :refer [start-onyx-groupby start-onyx-join]]
  5 |             [clojask.groupby :refer [read-csv-seq gen-groupby-filenames]]
  6 |             [clojure.string :as str]
  7 |             [clojask.utils :as u]))
  8 | 
;; In-memory store of the grouped rows, nil until `defn-join` rebinds it to its
;; `_source` argument. The `-mem` join writers call `.exists` and `.getKey` on it.
(def source nil)
 10 | 
 11 | (defn gen-join-filenames
 12 |   [dist a-row a-keys]
 13 |   ;; (def output-filename dist)
 14 |   ;; (doseq [i (take (count a-keys) (iterate inc 0))]
 15 |   ;;   (def output-filename (str output-filename "_" (name (nth b-keys i)) "-" (nth a-row (get a-map (nth a-keys i))))))
 16 |   ;; (str output-filename ".csv")
 17 |   (let [a-val (mapv (fn [_] ((or (nth _ 0) identity) (nth a-row (nth _ 1)))) a-keys)]
 18 |     (if (nil? dist) (str a-val) (str dist (u/encode-str (str a-val))))))
 19 | 
 20 | (defn output-join-inner
 21 |   [writer a-row a-keys a-map b-keys count a-roll b-roll a-format b-format a-index b-index join-index write-func]
 22 |   (let [filename (gen-join-filenames ".clojask/join/b/" a-row a-keys)]
 23 |     ;; (println writer)
 24 |     ;; (spit ".clojask/join/test.txt" (str writer "\n") :append true)
 25 |     ;; (.write writer (str [a-row a-keys a-map b-keys a-format b-format a-index b-index] "\n"))
 26 |     (if (.exists (io/file filename))
 27 |       ;; (.write writer (str (map type a-row) "\n"))
 28 |       ;; (spit ".clojask/join/test.txt" (str (vec (read-csv-seq filename)) "\n") :append true)
 29 |       (let [a-row (u/gets-format a-row a-index a-format)
 30 |             filename (io/reader filename)]
 31 |         (doseq [b-row (read-csv-seq filename)]
 32 |           ;; (.write writer (str (map type b-row) "\n"))
 33 |         ;; (spit ".clojask/join/test.txt" (str a-row b-row "\n") :append true)
 34 |           (let [b-row (u/gets b-row b-index)]
 35 |             ;; (println [(vec a-row) (vec b-row) a-index b-index join-index])
 36 |             (write-func writer (vector (u/gets (concat a-row b-row) join-index)))))
 37 |         (.close filename)))))
 38 | 
 39 | (defn output-join-inner-mem
 40 |   [writer a-row a-keys a-map b-keys count a-roll b-roll a-format b-format a-index b-index join-index write-func]
 41 |   (let [filename (gen-join-filenames nil a-row a-keys)]
 42 |     (if (.exists source filename)
 43 |       (let [a-row (u/gets-format a-row a-index a-format)]
 44 |         (doseq [b-row (.getKey source filename)]
 45 |           (let []
 46 |             (write-func writer (vector (u/gets (concat a-row b-row) join-index)))))))))
 47 | 
 48 | (defn output-join-loo
 49 |   "used for left join right join or outter join"
 50 |   [writer a-row a-keys a-map b-keys count a-roll b-roll a-format b-format a-index b-index join-index write-func]
 51 |   ;; (println a-format)
 52 |   ;; (println b-format)
 53 |   ;; (println a-index)
 54 |   ;; (println b-index)
 55 |   (let [filename (gen-join-filenames ".clojask/join/b/" a-row a-keys)]
 56 |     ;; (println writer)
 57 |     ;; (spit ".clojask/join/test.txt" (str writer "\n") :append true)
 58 |     (if (.exists (io/file filename))
 59 |       ;; (spit ".clojask/join/test.txt" (str (vec (read-csv-seq filename)) "\n") :append true)
 60 |       (let [filename (io/reader filename)]
 61 |         (doseq [b-row (read-csv-seq filename)]
 62 |         ;; (spit ".clojask/join/test.txt" (str a-row b-row "\n") :append true)
 63 |           (let [a-row (u/gets-format a-row a-index a-format)
 64 |                 ;; tmp (println a-row)
 65 |                 ;; a-row (for [index a-index]
 66 |                 ;;         (if-let [format (get a-format index)]
 67 |                 ;;           (format (nth a-row index))
 68 |                 ;;           (nth a-row index)))
 69 |                 b-row (u/gets b-row b-index)
 70 |                 ;; tmp (println b-row)
 71 |                 ;; b-row (for [index b-index]
 72 |                 ;;         (if-let [format (get b-format index)]
 73 |                 ;;           (format (nth b-row index))
 74 |                 ;;           (nth b-row index)))
 75 |                 ]
 76 |             (write-func writer (vector (u/gets (concat a-row b-row) join-index)))))
 77 |         (.close filename))
 78 |       (let [a-row (u/gets-format a-row a-index a-format)]
 79 |        (write-func writer (vector (u/gets (concat a-row (repeat count "")) join-index)))))))
 80 | 
 81 | (defn output-join-loo-mem
 82 |   "used for left join right join or outter join"
 83 |   [writer a-row a-keys a-map b-keys count a-roll b-roll a-format b-format a-index b-index join-index write-func]
 84 |   (let [filename (gen-join-filenames nil a-row a-keys)]
 85 |     ;; (println a-row)
 86 |     (if (.exists source filename)
 87 |       (let [b-rows (.getKey source filename)]
 88 |         ;; (println b-rows)
 89 |         (doseq [b-row b-rows]
 90 |           (let [a-row (u/gets-format a-row a-index a-format)
 91 |                 ]
 92 |             (write-func writer (vector (u/gets (concat a-row b-row) join-index))))))
 93 |       (let [a-row (u/gets-format a-row a-index a-format)]
 94 |         (write-func writer (vector (u/gets (concat a-row (repeat count "")) join-index)))))))
 95 | 
 96 | (defn defn-join
 97 |   [type limit _source]
 98 |   (def source _source)
 99 |   (def output-join
100 |     (case type
101 |       1 (if (nil? _source) output-join-inner output-join-inner-mem)
102 |       2 (if (nil? _source) output-join-loo output-join-loo-mem)
103 |       ;; 4 output-join-forward
104 |       4 (let [roll-join-get-line-forward (fn [bench filename index]
105 |                                            (def memo (volatile! nil))
106 |                                            (def res (volatile! nil))
107 |                                            (doseq [row (read-csv-seq filename)]
108 |                                              (let [val (nth row index)]
109 |                                                (if (and (<= (compare val bench) 0) (limit bench val) (or (= @memo nil) (> (compare val @memo) 0)))
110 |                                                  (do (vreset! memo val)
111 |                                                      (vreset! res row)))))
112 |                                            @res)
113 |               roll-join-get-line-forward-mem (fn [bench filename index]
114 |                                                (def memo (volatile! nil))
115 |                                                (def res (volatile! nil))
116 |                                                (doseq [row (.getKey source filename)]
117 |                                                  (let [unformat (nth row 1)
118 |                                                        val (nth unformat index)]
119 |                                                    (if (and (<= (compare val bench) 0) (limit bench val) (or (= @memo nil) (> (compare val @memo) 0)))
120 |                                                      (do (vreset! memo val)
121 |                                                          (vreset! res (first row))))))
122 |                                                @res)]
123 |           (if (nil? _source)
124 |             (fn [writer a-row a-keys a-map b-keys count a-roll b-roll a-format b-format a-index b-index join-index write-func]
125 |               (let [filename (gen-join-filenames ".clojask/join/b/" a-row a-keys)]
126 |                 (if (.exists (io/file filename))
127 |                   (let [filename (io/reader filename)]
128 |                     (if-let [b-row (roll-join-get-line-forward (nth a-row a-roll) filename b-roll)] ;; bench is a string
129 |                       (let [;; a-row (for [index a-index]
130 |                           ;;         (if-let [format (get a-format index)]
131 |                           ;;           (format (nth a-row index))
132 |                           ;;           (nth a-row index)))
133 |                             a-row (u/gets-format a-row a-index a-format)
134 |                           ;; b-row (for [index b-index]
135 |                           ;;         (if-let [format (get b-format index)]
136 |                           ;;           (format (nth b-row index))
137 |                           ;;           (nth b-row index)))
138 |                             b-row (u/gets-format b-row b-index b-format)]
139 |                         (write-func writer [(u/gets (concat a-row b-row) join-index)]))
140 |                       (let [a-row (for [index a-index]
141 |                                     (if-let [format (get a-format index)]
142 |                                       (format (nth a-row index))
143 |                                       (nth a-row index)))]
144 |                         (write-func writer [(u/gets (concat a-row (repeat count "")) join-index)])))
145 |                     (.close filename))
146 |                   (let [a-row (u/gets-format a-row a-index a-format)]
147 |                     (write-func writer [(u/gets (concat a-row (repeat count "")) join-index)])))))
148 |             (fn [writer a-row a-keys a-map b-keys count a-roll b-roll a-format b-format a-index b-index join-index write-func]
149 |               (let [filename (gen-join-filenames nil a-row a-keys)]
150 |                 (if (.exists source filename)
151 |                   (let []
152 |                     (if-let [b-row (roll-join-get-line-forward-mem (nth a-row a-roll) filename b-roll)] ;; bench is a string
153 |                       (write-func writer [(u/gets (concat (u/gets-format a-row a-index a-format) b-row) join-index)])
154 |                       (write-func writer [(u/gets (concat (u/gets-format a-row a-index a-format) (repeat count "")) join-index)])))
155 |                   (let [a-row (u/gets-format a-row a-index a-format)]
156 |                     (write-func writer [(u/gets (concat a-row (repeat count "")) join-index)])))))))
157 |       ;; 5 output-join-backward
158 |       5 (let [roll-join-get-line-backward (fn [bench filename index]
159 |                                             (def memo (volatile! nil))
160 |                                             (def res (volatile! nil))
161 |                                             (doseq [row (read-csv-seq filename)]
162 |                                               (let [val (nth row index)]
163 |       ;;        todo does here need to be =?
164 |                                                 (if (and (>= (compare val bench) 0) (or (= @memo nil) (< (compare val @memo) 0)))
165 |                                                   (do (vreset! memo val)
166 |                                                       (vreset! res row)))))
167 |                                             @res)
168 |               roll-join-get-line-backward-mem (fn [bench filename index]
169 |                                                (def memo (volatile! nil))
170 |                                                (def res (volatile! nil))
171 |                                                (doseq [row (.getKey source filename)]
172 |                                                  (let [unformat (nth row 1)
173 |                                                        val (nth unformat index)]
174 |                                                    (if (and (>= (compare val bench) 0) (limit bench val) (or (= @memo nil) (> (compare val @memo) 0)))
175 |                                                      (do (vreset! memo val)
176 |                                                          (vreset! res (first row))))))
177 |                                                @res)]
178 |           (if (nil? source)
179 |            (fn
180 |             [writer a-row a-keys a-map b-keys count a-roll b-roll a-format b-format a-index b-index join-index write-func]
181 |             (let [filename (gen-join-filenames ".clojask/join/b/" a-row a-keys)]
182 |     ;; (println writer)
183 |     ;; (spit ".clojask/join/test.txt" (str writer "\n") :append true)
184 |               (if (.exists (io/file filename))
185 |       ;; (spit ".clojask/join/test.txt" (str (vec (read-csv-seq filename)) "\n") :append true)
186 |                 (let [filename (io/reader filename)]
187 |                   (if-let [b-row (roll-join-get-line-backward (nth a-row a-roll) filename b-roll)] ;; bench is a string
188 |                     (let [a-row (u/gets-format a-row a-index a-format)
189 |                           b-row (u/gets-format b-row b-index b-format)]
190 |                       (write-func writer [(u/gets (concat a-row b-row) join-index)]))
191 |                     (let [a-row (u/gets-format a-row a-index a-format)]
192 |                       (write-func writer [(u/gets (concat a-row (repeat count "")) join-index)])))
193 |                   (.close filename))
194 |                 (let [a-row (u/gets-format a-row a-index a-format)]
195 |                   (write-func writer [(u/gets (concat a-row (repeat count "")) join-index)])))))
196 |             (fn [writer a-row a-keys a-map b-keys count a-roll b-roll a-format b-format a-index b-index join-index write-func]
197 |               (let [filename (gen-join-filenames nil a-row a-keys)]
198 |                 (if (.exists source filename)
199 |                   (let []
200 |                     (if-let [b-row (roll-join-get-line-backward-mem (nth a-row a-roll) filename b-roll)] ;; bench is a string
201 |                       (write-func writer [(u/gets (concat (u/gets-format a-row a-index a-format) b-row) join-index)])
202 |                       (write-func writer [(u/gets (concat (u/gets-format a-row a-index a-format) (repeat count "")) join-index)])))
203 |                   (let [a-row (u/gets-format a-row a-index a-format)]
204 |                     (write-func writer [(u/gets (concat a-row (repeat count "")) join-index)])))))))
205 |       nil)))


--------------------------------------------------------------------------------
/src/main/clojure/clojask/join/outer_onyx_comps.clj:
--------------------------------------------------------------------------------
  1 | (ns clojask.join.outer-onyx-comps
  2 |   (:require [clojask.join.outer-input :as input]
  3 |             [clojask.join.outer-output :as output]
  4 |             [onyx.api :refer :all]
  5 |             [clojure.string :as string]
  6 |             [onyx.test-helper :refer [with-test-env feedback-exception!]]
  7 |             [clojure.data.csv :as csv]
  8 |             [clojask.utils :as u]
  9 |             [clojure.set :as set]
 10 |             [clojure.java.io :as io]
 11 |             [clojask.groupby :refer [read-csv-seq]])
 12 |   (:import (java.io BufferedReader FileReader BufferedWriter FileWriter)
 13 |            [com.clojask.exception ExecutionException]))
 14 | 
 15 | ;; Random UUID generated once when this namespace loads; `config-env` uses it as :onyx/tenancy-id in both configs.
 16 | (def id (java.util.UUID/randomUUID))
 17 | 
 18 | (defn workflow-gen
 19 |   "Generate the Onyx workflow for `num-work` workers and bind it to the
 20 |   namespace var `workflow`.
 21 | 
 22 |   The workflow is a vector of edges: first one [:in :sample-workerK] edge per
 23 |   worker (K = 1..num-work), then one [:sample-workerK :output] edge per worker.
 24 |   Returns nil; callers read the result from `workflow`."
 25 |   [num-work]
 26 |   (let [worker-names (mapv #(keyword (str "sample-worker" %))
 27 |                            (range 1 (inc num-work)))]
 28 |     ;; Build the whole vector first and rebind the var once, instead of
 29 |     ;; re-running `def` on every loop iteration.
 30 |     (def workflow
 31 |       (into (mapv (fn [worker] [:in worker]) worker-names)
 32 |             (map (fn [worker] [worker :output]))
 33 |             worker-names))
 34 |     nil))
 35 | ;; Atom initialised to nil. NOTE(review): nothing else in this file reads or resets it; confirm external use before removing.
 36 | (def dataframe (atom nil))
 37 | 
 38 | 
 39 | (defn worker-func-gen
 40 |   "Define the namespace-level `worker-func` run by the Onyx workers during the
 41 |   first pass of the outer join, which walks the groups of dataframe a.
 42 | 
 43 |   The generated function takes a segment map with :id and :d (:d names one
 44 |   group of a) and returns {:id id :d rows}. `rows` holds every a-row paired
 45 |   with every b-row of the same group, or the a-rows followed by one nil per
 46 |   b column when b has no such group; each row is then passed through
 47 |   `u/gets` with `write-index`.
 48 | 
 49 |   When `mgroup-a` is nil the groups are CSV files: the b file is located by
 50 |   replacing the first \"/a/\" of the path with \"/b/\" and is deleted once read.
 51 |   Otherwise groups are fetched from `mgroup-a` / `mgroup-b` by key.
 52 |   `a`, `b`, `exception`, `a-format` and `b-format` are accepted but not used."
 53 |   [a b mgroup-a mgroup-b exception a-index b-index a-format b-format write-index]
 54 |   (let [a-cols  (take (count a-index) (iterate inc 0))
 55 |         b-cols  (take (count b-index) (iterate inc 0))
 56 |         b-blank (repeat (count b-index) nil)
 57 |         pick-a  (fn [rows] (map #(u/gets % a-cols) rows))   ;; stays lazy
 58 |         pick-b  (fn [rows] (mapv #(u/gets % b-cols) rows))  ;; eager: realised before the file is deleted
 59 |         cross   (fn [a-rows b-rows] (for [a-row a-rows b-row b-rows] (concat a-row b-row)))
 60 |         pad     (fn [a-rows] (map #(concat % b-blank) a-rows))
 61 |         emit    (fn [id rows] {:id id :d (mapv #(u/gets % write-index) rows)})]
 62 |     (if (nil? mgroup-a)
 63 |       ;; on-disk variant: :d is the path of the a-side group file
 64 |       (defn worker-func
 65 |         "Join one on-disk group of a against the same group of b."
 66 |         [seq]
 67 |         (let [id         (:id seq)
 68 |               a-filename (:d seq)
 69 |               a-rows     (pick-a (read-csv-seq a-filename))
 70 |               b-filename (string/replace-first a-filename "/a/" "/b/")]
 71 |           (if-not (.exists (io/file b-filename))
 72 |             (emit id (pad a-rows))
 73 |             (let [b-rows (pick-b (read-csv-seq b-filename))]
 74 |               (io/delete-file b-filename true)
 75 |               (emit id (cross a-rows b-rows))))))
 76 |       ;; in-memory variant: :d is the key of the group inside mgroup-a / mgroup-b
 77 |       (defn worker-func
 78 |         "Join one in-memory group of a against the same group of b."
 79 |         [seq]
 80 |         (let [id        (:id seq)
 81 |               group-key (:d seq)
 82 |               a-rows    (pick-a (.getKey mgroup-a group-key))]
 83 |           (if-not (.exists mgroup-b group-key)
 84 |             (emit id (pad a-rows))
 85 |             ;; nothing is deleted here, unlike the on-disk variant
 86 |             (emit id (cross a-rows (pick-b (.getKey mgroup-b group-key))))))))))
 87 | 
 88 | (defn worker-func-gen2
 89 |   "Define the namespace-level `worker-func` for the second pass of the outer
 90 |   join, which walks the groups of dataframe b.
 91 | 
 92 |   The generated function takes a segment map with :id and :d (:d names one
 93 |   group of b: a CSV path when `mgroup-a` is nil, otherwise a key in
 94 |   `mgroup-b`) and returns {:id id :d rows}, where each b-row is prefixed
 95 |   with one nil per a column and then passed through `u/gets` with
 96 |   `write-index`. `a`, `b`, `exception`, `a-format` and `b-format` are unused."
 97 |   [a b mgroup-a mgroup-b exception a-index b-index a-format b-format write-index]
 98 |   (let [a-blank (repeat (count a-index) nil)
 99 |         b-cols  (take (count b-index) (iterate inc 0))
100 |         emit    (fn [id b-rows]
101 |                   {:id id
102 |                    :d (->> b-rows
103 |                            (mapv #(u/gets % b-cols))
104 |                            (mapv #(concat a-blank %))
105 |                            (mapv #(u/gets % write-index)))})]
106 |     (if (nil? mgroup-a)
107 |       ;; on-disk variant reads the group file; the other looks the key up in mgroup-b
108 |       (defn worker-func "Emit one on-disk group of b with nil a columns." [seq]
109 |         (emit (:id seq) (read-csv-seq (:d seq))))
110 |       (defn worker-func "Emit one in-memory group of b with nil a columns." [seq]
111 |         (emit (:id seq) (.getKey mgroup-b (:d seq)))))))
112 | 
113 | (defn catalog-gen
114 |   "Generate the catalog for running Onyx and bind it to the namespace var
115 |   `catalog`.
116 | 
117 |   The catalog is a vector holding, in order:
118 |   - the :in input task (plugin :clojask.join.outer-input/input, one peer);
119 |   - one function task :sample-workerK per worker (K = 1..num-work), each
120 |     pointing at this namespace's `worker-func`;
121 |   - the :output task (plugin :clojask.join.outer-output/output, one peer).
122 |   Every entry uses `batch-size` as its :onyx/batch-size.
123 | 
124 |   Returns the var #'catalog; callers read the vector from `catalog`."
125 |   [num-work batch-size]
126 |   (let [;; single input task
127 |         input-entry {:onyx/name :in
128 |                      :onyx/plugin :clojask.join.outer-input/input
129 |                      :onyx/type :input
130 |                      :onyx/medium :seq
131 |                      :seq/checkpoint? true
132 |                      :onyx/batch-size batch-size
133 |                      :onyx/max-peers 1
134 |                      :input/doc "Reads segments from a core.async channel"}
135 |         ;; one function task per worker
136 |         worker-entries (mapv (fn [x]
137 |                                {:onyx/name (keyword (str "sample-worker" x))
138 |                                 :onyx/fn :clojask.join.outer-onyx-comps/worker-func
139 |                                 :onyx/type :function
140 |                                 :onyx/batch-size batch-size
141 |                                 :worker/doc "This is a worker node"})
142 |                              (range 1 (inc num-work)))
143 |         ;; single output task
144 |         output-entry {:onyx/name :output
145 |                       :onyx/plugin :clojask.join.outer-output/output
146 |                       :onyx/type :output
147 |                       :onyx/medium :core.async  ;; this is made up
148 |                       :onyx/max-peers 1
149 |                       :onyx/batch-size batch-size
150 |                       :output/doc "Writes segments to the file"}]
151 |     ;; Assemble the whole vector and rebind the var once, instead of
152 |     ;; re-running `def` after every single entry.
153 |     (def catalog
154 |       (-> [input-entry]
155 |           (into worker-entries)
156 |           (conj output-entry)))))
157 | 
158 | 
159 | (defn inject-in-reader
160 |   "Lifecycle hook: return a one-key map holding the lifecycle entry's :buffered-reader/path (`event` is ignored)."
161 |   [event lifecycle]
162 |   {:buffered-reader/path (get lifecycle :buffered-reader/path)})
163 | 
164 | ;; Lifecycle call map: runs `inject-in-reader` at :lifecycle/before-task-start; `lifecycle-gen` refers to it as ::in-calls.
165 | (def in-calls
166 |   {:lifecycle/before-task-start inject-in-reader})
167 | 
168 | 
169 | (defn lifecycle-gen
170 |   "Bind the namespace var `lifecycles` to the three lifecycle entries of a join
171 |   job: two on :in (reader path `source`, then the outer-input reader calls) and
172 |   one on :output (outer-output writer calls, file name `dist`). Returns the var."
173 |   [source dist]
174 |   (let [on-in  (fn [entry] (assoc entry :lifecycle/task :in))
175 |         path   (on-in {:buffered-reader/path source :lifecycle/calls ::in-calls})
176 |         reader (on-in {:lifecycle/calls :clojask.join.outer-input/reader-calls})
177 |         writer {:lifecycle/task :output :buffered-wtr/filename dist
178 |                 :lifecycle/calls :clojask.join.outer-output/writer-calls}]
179 |     (def lifecycles [path reader writer])))
180 | ;; Worker count stored by the latest `flow-cond-gen` call; in this file only the commented-out rem*? predicates below read it.
181 | (def num-workers (atom 1))
182 | 
183 | ;; (defn rem0?
184 | ;;   [event old-segment new-segment all-new-segment]
185 | ;;   ;; (spit "resources/debug.txt" (str new-segment "\n") :append true)
186 | ;;   (= (mod (:id new-segment) (deref num-workers)) 0))
187 | 
188 | ;; (defn rem1?
189 | ;;   [event old-segment new-segment all-new-segment]
190 | ;;   (= (mod (:id new-segment) (deref num-workers)) 1))
191 | 
192 | ;; (defn rem2?
193 | ;;   [event old-segment new-segment all-new-segment]
194 | ;;   (= (mod (:id new-segment) (deref num-workers)) 2))
195 | 
196 | ;; (defn rem3?
197 | ;;   [event old-segment new-segment all-new-segment]
198 | ;;   (= (mod (:id new-segment) (deref num-workers)) 3))
199 | 
200 | ;; (defn rem4?
201 | ;;   [event old-segment new-segment all-new-segment]
202 | ;;   (= (mod (:id new-segment) (deref num-workers)) 4))
203 | 
204 | ;; (defn rem5?
205 | ;;   [event old-segment new-segment all-new-segment]
206 | ;;   (= (mod (:id new-segment) (deref num-workers)) 5))
207 | 
208 | ;; (defn rem6?
209 | ;;   [event old-segment new-segment all-new-segment]
210 | ;;   (= (mod (:id new-segment) (deref num-workers)) 6))
211 | 
212 | ;; (defn rem7?
213 | ;;   [event old-segment new-segment all-new-segment]
214 | ;;   (= (mod (:id new-segment) (deref num-workers)) 7))
215 | 
216 | ;; (defn rem8?
217 | ;;   [event old-segment new-segment all-new-segment]
218 | ;;   (= (mod (:id new-segment) (deref num-workers)) 8))
219 | 
220 | 
221 | ;; [{:flow/from :in
222 | ;;   :flow/to [:sample-worker1]
223 | ;;   :flow/predicate :clojask.onyx-comps/rem0?
224 | ;;   :flow/doc ""}
225 | ;;  {:flow/from :in
226 | ;;   :flow/to [:sample-worker2]
227 | ;;   :flow/predicate :clojask.onyx-comps/rem1?
228 | ;;   :flow/doc ""}]
229 | 
230 | (defn flow-cond-gen
231 |   "Generate the flow conditions for running Onyx and bind them to the
232 |   namespace var `flow-conditions`; also stores `num-work` in `num-workers`.
233 | 
234 |   For each worker K (1..num-work) a predicate remN? (N = K-1) is interned in
235 |   this namespace; it is true when a segment's :id mod `num-work` equals N.
236 |   The matching flow condition routes :in -> :sample-workerK. Returns nil."
237 |   [num-work]
238 |   (reset! num-workers num-work)
239 |   ;; mapv is eager, so every `intern` runs; the var is rebound once, not per loop step.
240 |   (def flow-conditions
241 |     (mapv (fn [remainder]
242 |             (let [pred-name (str "rem" remainder "?")]
243 |               (intern 'clojask.join.outer-onyx-comps (symbol pred-name)
244 |                       (fn [event old-segment new-segment all-new-segment]
245 |                         (= (mod (:id new-segment) num-work) remainder)))
246 |               {:flow/from :in
247 |                :flow/to [(keyword (str "sample-worker" (inc remainder)))]
248 |                :flow/predicate (keyword "clojask.join.outer-onyx-comps" pred-name)
249 |                :worker/doc "This is a flow condition"}))
250 |           (range num-work)))
251 |   nil)
252 | 
253 | (defn config-env
254 |   "Start the local Onyx env, peer group and peers, binding `env-config`,
255 |   `peer-config`, `env`, `peer-group`, `n-peers` (count of distinct task names
256 |   in the already-bound `workflow`) and `v-peers` as namespace vars."
257 |   []
258 |   (let [zk-address "127.0.0.1:2188"
259 |         log-file   ".clojask/clojask.log"]
260 |     (def env-config
261 |       {:zookeeper/address zk-address
262 |        :zookeeper/server? true
263 |        :zookeeper.server/port 2188
264 |        :onyx/tenancy-id id
265 |        :onyx.log/file log-file})
266 |     (def peer-config
267 |       {:zookeeper/address zk-address
268 |        :onyx/tenancy-id id
269 |        :onyx.peer/job-scheduler :onyx.job-scheduler/balanced
270 |        :onyx.messaging/impl :aeron
271 |        :onyx.messaging/peer-port 40200
272 |        :onyx.messaging/bind-addr "localhost"
273 |        :onyx.log/file log-file})
274 |     (def env (onyx.api/start-env env-config))
275 |     (def peer-group (onyx.api/start-peer-group peer-config))
276 |     (def n-peers (count (into #{} cat workflow)))
277 |     (def v-peers (onyx.api/start-peers n-peers peer-group))))
278 | 
279 | (defn shutdown
280 |   "Stop every peer in `v-peers`, then the peer group, then the Onyx env."
281 |   []
282 |   (run! onyx.api/shutdown-peer v-peers)
283 |   (onyx.api/shutdown-peer-group peer-group)
284 |   (onyx.api/shutdown-env env))
285 | 
286 | (defn- stage-error
287 |   "Build the ExecutionException reported when `stage` fails with cause `e`."
288 |   [stage ^Throwable e]
289 |   (ExecutionException. (format "[%s]  Refer to .clojask/clojask.log for detailed information. (original error: %s)"
290 |                                stage (.getMessage e))))
291 | 
292 | (defn- abort-stage!
293 |   "Error path of a failed stage: shut the cluster down (best effort), then throw `stage-error`."
294 |   [stage e]
295 |   ;; `shutdown` can itself throw, e.g. when the failure happened before env,
296 |   ;; peer group and peers were bound; that must not replace the original error.
297 |   (try (shutdown) (catch Exception _ nil))
298 |   (throw (stage-error stage e)))
299 | 
300 | (defn- submit-join-job!
301 |   "Submit a job built from the current `workflow`, `catalog`, `lifecycles` and
302 |   `flow-conditions` vars, then pass its job id to `feedback-exception!`."
303 |   []
304 |   (let [job {:workflow workflow :catalog catalog :lifecycles lifecycles
305 |              :flow-conditions flow-conditions
306 |              :task-scheduler :onyx.task-scheduler/balanced}
307 |         job-id (:job-id (onyx.api/submit-job peer-config job))]
308 |     (assert job-id "Job was not successfully submitted")
309 |     (feedback-exception! peer-config job-id)))
310 | 
311 | (defn start-onyx-outer
312 |   "Start the Onyx cluster and run the outer join as two consecutive jobs:
313 |   step 1 installs `worker-func-gen` and reads from ./.clojask/join/a, step 2
314 |   installs `worker-func-gen2` and reads from ./.clojask/join/b; both use `dist`
315 |   as the output file name. The cluster is shut down afterwards.
316 |   Returns \"success\"; throws ExecutionException naming the stage that failed."
317 |   [num-work batch-size a b mgroup-a mgroup-b dist exception a-index b-index a-format b-format write-index output]
318 |   ;; step 1
319 |   (try
320 |     (workflow-gen num-work)
321 |     (config-env)
322 |     (worker-func-gen a b mgroup-a mgroup-b exception a-index b-index a-format b-format write-index) ;;need some work
323 |     (catalog-gen num-work batch-size)
324 |     (lifecycle-gen "./.clojask/join/a" dist)
325 |     (flow-cond-gen num-work)
326 |     (input/inject-dataframe mgroup-a mgroup-b)
327 |     (output/inject-write-func output)
328 |     (catch Exception e (abort-stage! "preparing stage (outer join)" e)))
329 |   (try
330 |     (submit-join-job!)
331 |     (catch Exception e (abort-stage! "submit-to-onyx stage (outer join)" e)))
332 |   ;; step 2
333 |   (try
334 |     (when (some? mgroup-b) (.final mgroup-b))
335 |     (worker-func-gen2 a b mgroup-a mgroup-b exception a-index b-index a-format b-format write-index) ;;need some work
336 |     (lifecycle-gen "./.clojask/join/b" dist)
337 |     (input/inject-dataframe mgroup-b nil)
338 |     (catch Exception e (abort-stage! "preparing stage (outer join 2)" e)))
339 |   (try
340 |     (submit-join-job!)
341 |     (catch Exception e (abort-stage! "submit-to-onyx stage (outer join 2)" e)))
342 |   (try
343 |     (shutdown)
344 |     (catch Exception e (throw (stage-error "terminate-node stage (outer join)" e))))
345 |   "success")
346 | 


--------------------------------------------------------------------------------