├── .gitignore ├── LICENSE ├── README.md ├── sawzall-doc ├── info.rkt └── sawzall.scrbl ├── sawzall-examples ├── 5.2.rkt ├── 5.9.rkt ├── anscombe.rkt ├── data │ ├── .gitignore │ ├── airlines.csv │ ├── airports.csv │ ├── all_gapminder.csv │ ├── billboard.csv │ ├── chicago-nmmaps.csv │ ├── gss_sm.csv │ ├── midwest.csv │ ├── oecd.csv │ ├── organdata.csv │ ├── planes.csv │ ├── stop-words.csv │ ├── taylor_swift_lyrics.csv │ ├── weather.csv │ └── who.csv ├── r4ds-ch12.rkt ├── r4ds-ch5.rkt ├── racketcon-2021 │ ├── Rplots.pdf │ ├── dplyr-logo.png │ ├── gapminder-graphite.rkt │ ├── gapminder-plot.rkt │ ├── gapminder-r.r │ ├── ggplot2-logo.png │ ├── gss-logo.png │ ├── nickelback.png │ ├── outline.org │ ├── pencils-lol.png │ ├── racket-logo.png │ ├── reciprocating_saw.jpg │ ├── talk.rkt │ ├── tidy-0-0.png │ ├── tidy-1-0.png │ ├── tidy-2-0.png │ ├── tidy.png │ └── tidyr-logo.png └── taylor-swift.rkt ├── sawzall-lib ├── aggregate.rkt ├── bsearch.rkt ├── combining-join.rkt ├── constructors.rkt ├── create.rkt ├── display.rkt ├── filtering-join.rkt ├── generic-join.rkt ├── grouped-df.rkt ├── grouping.rkt ├── helpers.rkt ├── info.rkt ├── main.rkt ├── missing-values.rkt ├── pivot.rkt ├── rectangling.rkt ├── rename.rkt ├── reorder-df.rkt ├── reorder.rkt ├── separate.rkt ├── slice-spec.rkt ├── slice.rkt ├── split.rkt ├── syntax.rkt └── where.rkt ├── sawzall-test ├── aggregating-test.rkt ├── combining-join-test.rkt ├── create-test.rkt ├── data │ ├── billboard.csv │ ├── gss_sm.csv │ ├── iris.csv │ ├── organdata.csv │ └── relig_income.csv ├── filtering-test.rkt ├── info.rkt ├── pivot-test.rkt ├── rectangling-test.rkt ├── results │ ├── aggregate_gss_1.csv │ ├── billboard_pivot_longer_1.csv │ ├── relig_income_pivot_longer_1.csv │ ├── reorder_gss_1.csv │ ├── reorder_gss_2.csv │ ├── reorder_gss_3.csv │ ├── reorder_gss_4.csv │ ├── slice_iris_1.csv │ ├── slice_iris_2.csv │ ├── slice_iris_3.csv │ ├── where_gss_1.csv │ ├── where_gss_2.csv │ ├── where_gss_3.csv │ ├── where_organdata_1.csv │ ├── 
where_organdata_2.csv │ └── where_organdata_3.csv ├── separate-test.rkt ├── slice-test.rkt ├── sorting-test.rkt ├── test-data.rkt └── util.rkt └── sawzall └── info.rkt /.gitignore: -------------------------------------------------------------------------------- 1 | compiled/ 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Sawzall 2 | 3 | A grammar of data manipulation for Racket, inspired by dplyr. 4 | 5 | *This software is under rapid development. While the majority of code will look similar across versions, 6 | the API is not stable.* 7 | 8 | # Installation 9 | 10 | This package is available on the Racket package server: 11 | https://pkgs.racket-lang.org/package/sawzall 12 | 13 | You can install it with `raco pkg install sawzall`. 14 | 15 | # Documentation 16 | 17 | - API reference: https://docs.racket-lang.org/sawzall/index.html 18 | -------------------------------------------------------------------------------- /sawzall-doc/info.rkt: -------------------------------------------------------------------------------- 1 | #lang info 2 | 3 | (define collection "sawzall-doc") 4 | (define scribblings '(("sawzall.scrbl" (multi-page)))) 5 | 6 | (define pkg-desc "Documentation for Sawzall") 7 | (define version "1.0") 8 | (define deps '("base")) 9 | (define build-deps '("data-frame" 10 | "racket-doc" 11 | "sawzall-lib" 12 | "scribble-lib" 13 | "threading-lib" 14 | "threading-doc")) 15 | -------------------------------------------------------------------------------- /sawzall-examples/5.2.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | (require data-frame 3 | graphite 4 | threading 5 | racket/vector 6 | sawzall) 7 | 8 | (define (v/ vec c) (vector-map (λ (v) (/ v c)) vec)) 9 | (define (sum vec) (for/sum ([v (in-vector vec)] #:when (number? 
v)) v)) 10 | 11 | (define gss-sm (df-read/csv "data/gss_sm.csv")) 12 | 13 | ;; rel_by_region <- gss_sm %>% 14 | ;; group_by(bigregion, religion) %>% 15 | ;; summarize(N = n()) %>% 16 | ;; mutate(freq = N / sum(N), 17 | ;; pct = round((freq*100), 0)) 18 | (define rel-by-region 19 | (~> gss-sm 20 | (group-with "bigregion" "religion") 21 | (aggregate [N (religion) (vector-length religion)]) 22 | introspect 23 | (create [freq ([N : vector]) (v/ N (sum N))] 24 | [pct (freq) (round (* freq 100))]) 25 | ungroup)) 26 | 27 | ;; ;; p <- ggplot(rel_by_region, aes(x = religion, y = pct, fill = religion)) 28 | ;; ;; p + geom_col(position = "dodge2") + 29 | ;; ;; labs(x = NULL, y = "Percent", fill = "Religion") + 30 | ;; ;; guides(fill = FALSE) + 31 | ;; ;; coord_flip() + 32 | ;; ;; facet_grid(~ bigregion) 33 | (graph #:data rel-by-region 34 | #:mapping (aes #:x "religion" #:y "pct" #:facet "bigregion") 35 | #:facet-wrap 2 36 | #:title "Religious preferences by region" 37 | #:x-label "Religion" #:y-label "Percent" 38 | #:height 800 39 | #:width 800 40 | (col)) 41 | -------------------------------------------------------------------------------- /sawzall-examples/5.9.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | (require data-frame math/statistics graphite threading sawzall) 3 | 4 | (define organdata (df-read/csv "data/organdata.csv" #:na "NA")) 5 | (define sorted-countries 6 | (~> organdata 7 | (group-with "country") 8 | (aggregate [med (donors) (median < (vector-filter identity donors))]) 9 | (reorder "med") 10 | (df-select "country"))) 11 | (define sorted 12 | (~> organdata 13 | (reorder (cons "country" (by-vector sorted-countries))))) 14 | 15 | (graph #:data sorted 16 | #:mapping (aes #:x "donors" #:y "country") 17 | #:width 700 18 | (boxplot #:invert? 
#t)) 19 | -------------------------------------------------------------------------------- /sawzall-examples/anscombe.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | (require graphite 3 | sawzall 4 | threading) 5 | 6 | ;; anscombe's quartet, from R 7 | ;; each "x1" "y1" pair has extremely similar summary statistics 8 | (define anscombe 9 | (row-df [x1 x2 x3 x4 y1 y2 y3 y4] 10 | 10 10 10 8 8.04 9.14 7.46 6.58 11 | 8 8 8 8 6.95 8.14 6.77 5.76 12 | 13 13 13 8 7.58 8.74 12.74 7.71 13 | 9 9 9 8 8.81 8.77 7.11 8.84 14 | 11 11 11 8 8.33 9.26 7.81 8.47 15 | 14 14 14 8 9.96 8.10 8.84 7.04 16 | 6 6 6 8 7.24 6.13 6.08 5.25 17 | 4 4 4 19 4.26 3.10 5.39 12.50 18 | 12 12 12 8 10.84 9.13 8.15 5.56 19 | 7 7 7 8 4.82 7.26 6.42 7.91 20 | 5 5 5 8 5.68 4.74 5.73 6.89)) 21 | 22 | ;; this data is fine, but we want to show everything with one graphite facet! 23 | ;; what we want is a variable to facet on... but we need to induce one first... 24 | (define facetable 25 | (~> anscombe 26 | ;; create a column so we don't have any duplicates, and pivot-wider knows 27 | ;; what to bind on 28 | ;; 29 | ;; if we don't do this, pivot-wider will complain about insufficient information 30 | ;; to join on 31 | (create [nrow ([x1 : vector]) (build-vector (vector-length x1) (λ (x) x))]) 32 | ;; take every column but nrow, 33 | ;; names to a new column called "name" and the values to a new column called "val" 34 | (pivot-longer (not "nrow") #:names-to "name" #:values-to "val") 35 | ;; take "name", which is comprised of "x1" "x2" "y1" et al, 36 | ;; and turn it into two columns by splitting on the first character, 37 | ;; one representing the variable and the other representing the quadrant 38 | (separate "name" #:into '("x-or-y" "quadrant") #:separator 1) 39 | ;; take the column "x-or-y", and take all the "x" values and make them a column, 40 | ;; then the "y" values and make them a column 41 | (pivot-wider #:names-from "x-or-y" #:values-from 
"val") 42 | (introspect everything) 43 | ;; remove the column we built at the start 44 | (slice (not "nrow")))) 45 | 46 | ;; so the data now looks like: 47 | ;; ┌──┬────┬────────┐ 48 | ;; │x │y │quadrant│ 49 | ;; ├──┼────┼────────┤ 50 | ;; │10│8.04│1 │ 51 | ;; ├──┼────┼────────┤ 52 | ;; │8 │6.95│1 │ 53 | ;; ├──┼────┼────────┤ 54 | ;; │13│7.58│1 │ 55 | ;; ├──┼────┼────────┤ 56 | ;; │9 │8.81│1 │ 57 | ;; ├──┼────┼────────┤ 58 | ;; │11│8.33│1 │ 59 | ;; ├──┼────┼────────┤ 60 | ;; │14│9.96│1 │ 61 | ;; └──┴────┴────────┘ 62 | ;; so we have a variable we can facet on! 63 | 64 | ;; read in "facetable", 65 | (graph #:data facetable 66 | ;; use "x" for x and "y" for y, facet on "quadrant", 67 | #:mapping (aes #:x "x" #:y "y" #:facet "quadrant") 68 | #:width 800 #:height 800 69 | #:title "Anscombe's Quartet" 70 | ;; and draw points and a linear fit line 71 | (points) 72 | (fit #:width 3)) 73 | -------------------------------------------------------------------------------- /sawzall-examples/data/.gitignore: -------------------------------------------------------------------------------- 1 | flights.csv 2 | -------------------------------------------------------------------------------- /sawzall-examples/data/airlines.csv: -------------------------------------------------------------------------------- 1 | carrier,name 2 | 9E,Endeavor Air Inc. 3 | AA,American Airlines Inc. 4 | AS,Alaska Airlines Inc. 5 | B6,JetBlue Airways 6 | DL,Delta Air Lines Inc. 7 | EV,ExpressJet Airlines Inc. 8 | F9,Frontier Airlines Inc. 9 | FL,AirTran Airways Corporation 10 | HA,Hawaiian Airlines Inc. 11 | MQ,Envoy Air 12 | OO,SkyWest Airlines Inc. 13 | UA,United Air Lines Inc. 14 | US,US Airways Inc. 15 | VX,Virgin America 16 | WN,Southwest Airlines Co. 17 | YV,Mesa Airlines Inc. 
18 | -------------------------------------------------------------------------------- /sawzall-examples/data/oecd.csv: -------------------------------------------------------------------------------- 1 | "","year","other","usa","diff","hi_lo" 2 | "1",1960,68.6,69.9,1.30000000000001,"Below" 3 | "2",1961,69.2,70.4,1.2,"Below" 4 | "3",1962,68.9,70.2,1.3,"Below" 5 | "4",1963,69.1,70,0.900000000000006,"Below" 6 | "5",1964,69.5,70.3,0.799999999999997,"Below" 7 | "6",1965,69.6,70.3,0.700000000000003,"Below" 8 | "7",1966,69.9,70.3,0.399999999999991,"Below" 9 | "8",1967,70.1,70.7,0.600000000000009,"Below" 10 | "9",1968,70.1,70.4,0.300000000000011,"Below" 11 | "10",1969,70.1,70.6,0.5,"Below" 12 | "11",1970,69.8,70.9,1.10000000000001,"Below" 13 | "12",1971,70.4,71.2,0.799999999999997,"Below" 14 | "13",1972,70.6,71.3,0.700000000000003,"Below" 15 | "14",1973,70.7,71.5,0.799999999999997,"Below" 16 | "15",1974,71,72.1,1.09999999999999,"Below" 17 | "16",1975,71.3,72.7,1.40000000000001,"Below" 18 | "17",1976,71.6,73,1.40000000000001,"Below" 19 | "18",1977,72,73.4,1.40000000000001,"Below" 20 | "19",1978,72.1,73.5,1.40000000000001,"Below" 21 | "20",1979,72.3,73.9,1.60000000000001,"Below" 22 | "21",1980,72.6,73.7,1.10000000000001,"Below" 23 | "22",1981,72.9,74.1,1.19999999999999,"Below" 24 | "23",1982,73.1,74.5,1.40000000000001,"Below" 25 | "24",1983,73.2,74.6,1.39999999999999,"Below" 26 | "25",1984,73.5,74.7,1.2,"Below" 27 | "26",1985,73.7,74.7,1,"Below" 28 | "27",1986,74,74.7,0.700000000000003,"Below" 29 | "28",1987,74.3,74.9,0.600000000000009,"Below" 30 | "29",1988,74.5,74.9,0.400000000000006,"Below" 31 | "30",1989,74.7,75.1,0.399999999999991,"Below" 32 | "31",1990,74.8,75.3,0.5,"Below" 33 | "32",1991,75,75.5,0.5,"Below" 34 | "33",1992,75.2,75.7,0.5,"Below" 35 | "34",1993,75.4,75.5,0.0999999999999943,"Below" 36 | "35",1994,75.7,75.7,0,"Below" 37 | "36",1995,75.8,75.7,-0.0999999999999943,"Above" 38 | "37",1996,76.2,76.1,-0.100000000000009,"Above" 39 | 
"38",1997,76.4,76.5,0.0999999999999943,"Below" 40 | "39",1998,76.6,76.7,0.100000000000009,"Below" 41 | "40",1999,76.8,76.7,-0.0999999999999943,"Above" 42 | "41",2000,77.2,76.7,-0.5,"Above" 43 | "42",2001,77.5,76.9,-0.599999999999994,"Above" 44 | "43",2002,77.5,77,-0.5,"Above" 45 | "44",2003,77.6,77.1,-0.5,"Above" 46 | "45",2004,78.1,77.6,-0.5,"Above" 47 | "46",2005,78.3,77.6,-0.700000000000003,"Above" 48 | "47",2006,78.6,77.8,-0.799999999999997,"Above" 49 | "48",2007,78.8,78.1,-0.700000000000003,"Above" 50 | "49",2008,79.1,78.1,-1,"Above" 51 | "50",2009,79.3,78.5,-0.799999999999997,"Above" 52 | "51",2010,79.6,78.6,-1,"Above" 53 | "52",2011,79.9,78.7,-1.2,"Above" 54 | "53",2012,80,78.8,-1.2,"Above" 55 | "54",2013,80.4,78.8,-1.60000000000001,"Above" 56 | "55",2014,80.7,78.9,-1.8,"Above" 57 | "56",2015,80.6,78.8,-1.8,"Above" 58 | "57",2016,77.1,NA,NA,NA 59 | -------------------------------------------------------------------------------- /sawzall-examples/data/stop-words.csv: -------------------------------------------------------------------------------- 1 | a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your 2 | -------------------------------------------------------------------------------- /sawzall-examples/data/taylor_swift_lyrics.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ralsei/sawzall/c9a41fe3d639b880c36379f1cb7a1b1cd077e5ea/sawzall-examples/data/taylor_swift_lyrics.csv 
-------------------------------------------------------------------------------- /sawzall-examples/r4ds-ch12.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | (require data-frame 3 | graphite 4 | sawzall 5 | threading) 6 | 7 | ;; read in some dirty data 8 | (define who-dirty (df-read/csv "data/who.csv" #:na "NA")) 9 | ;; bug (?) in df-read/csv 10 | (df-del-series! who-dirty "") 11 | 12 | (define who 13 | (~> who-dirty 14 | ;; give it a generic name, because we don't know what these values 15 | ;; mean yet. everything else is of the form new_rel or newrel or something 16 | ;; like that. 17 | (pivot-longer (not ["country" "iso2" "iso3" "year"]) 18 | #:names-to "key" #:values-to "cases") 19 | ;; NA values probably aren't important. 20 | (drop-na "cases") 21 | ;; annoyingly, we have newrel in some keys aside from new_rel, so replace 22 | ;; those 23 | (create [key (key) (string-replace key "newrel" "new_rel")]) 24 | ;; split up the keys into something more meaningful 25 | (separate "key" 26 | #:into '("new" "type" "sex-age") 27 | #:separator "_") 28 | ;; drop useless columns 29 | (slice (not ["new" "iso2" "iso3"])) 30 | ;; separate "sex-age" into sex and age on the first character 31 | (separate "sex-age" 32 | #:into '("sex" "age") 33 | #:separator 1))) 34 | 35 | (~> who 36 | (where* (country) ("Afghanistan")) 37 | (group-with "year" "age") 38 | (aggregate [cases-sum (cases) (for/sum ([v (in-vector cases)]) v)]) 39 | ungroup 40 | (graph #:data _ 41 | #:mapping (aes #:x "year" #:y "cases-sum" #:discrete-color "age") 42 | (lines))) 43 | -------------------------------------------------------------------------------- /sawzall-examples/r4ds-ch5.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | (require colormaps 3 | data-frame 4 | file/gunzip 5 | math/statistics 6 | graphite 7 | sawzall 8 | threading) 9 | (provide (all-defined-out)) 10 | 11 | ;; load in data 
from the R nycflights13 package 12 | ;; really big, so it's stored gzipped 13 | (define flights 14 | (let () 15 | (define data 16 | (call-with-output-string 17 | (λ (out) 18 | (call-with-input-file "data/flights.csv.gz" 19 | (λ (in) (gunzip-through-ports in out)))))) 20 | (call-with-input-string data (curry df-read/csv #:na "NA")))) 21 | 22 | ;; exploring the relationship between the distance and average delay for each location 23 | ;; note the implicit ungroup for aggregate 24 | (define delays 25 | (~> flights 26 | (group-with "dest") 27 | (aggregate [count (dep_delay) (vector-length dep_delay)] 28 | [dist (distance) (mean (vector-filter identity distance))] 29 | [delay (arr_delay) (mean (vector-filter identity arr_delay))]) 30 | (where (count dest) (and (> count 20) (not (equal? dest "HNL")))))) 31 | 32 | (graph #:data delays 33 | #:mapping (aes #:x "dist" #:y "delay") 34 | #:theme (theme-override theme-default #:color-map 'tol-sd) 35 | (points #:mapping (aes #:continuous-color "count")) 36 | (fit #:method 'loess #:width 3)) 37 | 38 | ;; get all the flights that haven't been cancelled (which is a NA delay) 39 | (define not-cancelled 40 | (~> flights 41 | (where (arr_delay dep_delay) (and arr_delay dep_delay)))) 42 | 43 | ;; planes identified by tail number with the highest average delays 44 | ;; filter out groups with small numbers of observations, to get to more trends 45 | (define delays-by-tailnum 46 | (~> not-cancelled 47 | (group-with "tailnum") 48 | (aggregate [delay (arr_delay) (mean arr_delay)] 49 | [N (arr_delay) (vector-length arr_delay)]) 50 | (where (N) (> N 25)))) 51 | 52 | ;; we don't have geom_freqpoly() in graphite. maybe we should? 
53 | (graph #:data delays-by-tailnum 54 | #:mapping (aes #:x "N" #:y "delay") 55 | #:x-min -2 #:width 700 56 | (points #:alpha 1/10 #:color "black")) 57 | -------------------------------------------------------------------------------- /sawzall-examples/racketcon-2021/Rplots.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ralsei/sawzall/c9a41fe3d639b880c36379f1cb7a1b1cd077e5ea/sawzall-examples/racketcon-2021/Rplots.pdf -------------------------------------------------------------------------------- /sawzall-examples/racketcon-2021/dplyr-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ralsei/sawzall/c9a41fe3d639b880c36379f1cb7a1b1cd077e5ea/sawzall-examples/racketcon-2021/dplyr-logo.png -------------------------------------------------------------------------------- /sawzall-examples/racketcon-2021/gapminder-graphite.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | (require data-frame graphite) 3 | 4 | (define gapminder 5 | (df-read/csv "../data/all_gapminder.csv")) 6 | 7 | (graph #:data gapminder 8 | #:mapping (aes #:x "gdpPercap" 9 | #:y "lifeExp") 10 | #:x-label "GDP per capita (USD)" 11 | #:y-label "Life expectancy (years)" 12 | #:x-transform logarithmic-transform 13 | (points #:alpha 0.4 14 | #:mapping (aes #:discrete-color 15 | "continent")) 16 | (fit #:width 3 #:method 'loess)) 17 | -------------------------------------------------------------------------------- /sawzall-examples/racketcon-2021/gapminder-plot.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | (require data-frame 3 | fancy-app 4 | plot/pict 5 | plot/utils) 6 | 7 | (define gapminder (df-read/csv "../data/all_gapminder.csv")) 8 | 9 | (define fit 10 | (df-least-squares-fit gapminder "gdpPercap" 11 | "lifeExp" #:mode 'log)) 12 | 13 | 14 | 
(parameterize ([plot-x-label "GDP per capita (USD)"] 15 | [plot-y-label "Life expectancy (years)"] 16 | [plot-font-family 'swiss] 17 | [plot-x-transform log-transform] 18 | [plot-x-ticks (log-ticks #:scientific? #f)] 19 | [plot-x-far-ticks no-ticks] 20 | [plot-y-far-ticks no-ticks] 21 | [point-sym 'bullet] 22 | [point-alpha 0.4] 23 | [plot-pen-color-map 'set1]) 24 | (define tbl (make-hash)) 25 | (for ([(x y con) (in-data-frame gapminder 26 | "gdpPercap" 27 | "lifeExp" 28 | "continent")] 29 | #:when (and x y)) 30 | (hash-update! tbl con (cons (vector x y) _) null)) 31 | 32 | (plot 33 | (cons (function fit #:width 3 #:color 'blue) 34 | (let ([color-n -1]) 35 | (hash-map tbl 36 | (lambda (con pts) 37 | (set! color-n (add1 color-n)) 38 | (points pts 39 | #:color (->pen-color color-n) 40 | #:label con))))))) 41 | -------------------------------------------------------------------------------- /sawzall-examples/racketcon-2021/gapminder-r.r: -------------------------------------------------------------------------------- 1 | 2 | library(gapminder) 3 | library(tidyverse) 4 | 5 | 6 | 7 | ggplot(data = gapminder, 8 | mapping = aes(x = gdpPercap, 9 | y = lifeExp)) + 10 | labs(x = "GDP per capita (USD)", 11 | y = "Life expectancy (years)") + 12 | scale_x_log10() + 13 | geom_point(alpha = 0.4, 14 | aes(color = continent)) + 15 | 16 | geom_smooth() 17 | -------------------------------------------------------------------------------- /sawzall-examples/racketcon-2021/ggplot2-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ralsei/sawzall/c9a41fe3d639b880c36379f1cb7a1b1cd077e5ea/sawzall-examples/racketcon-2021/ggplot2-logo.png -------------------------------------------------------------------------------- /sawzall-examples/racketcon-2021/gss-logo.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ralsei/sawzall/c9a41fe3d639b880c36379f1cb7a1b1cd077e5ea/sawzall-examples/racketcon-2021/gss-logo.png -------------------------------------------------------------------------------- /sawzall-examples/racketcon-2021/nickelback.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ralsei/sawzall/c9a41fe3d639b880c36379f1cb7a1b1cd077e5ea/sawzall-examples/racketcon-2021/nickelback.png -------------------------------------------------------------------------------- /sawzall-examples/racketcon-2021/outline.org: -------------------------------------------------------------------------------- 1 | #+TITLE: Outline 2 | 3 | * Introduction 4 | ** Hi, I'm Hazel 5 | ** Sawzall is... 6 | ** Tidy data is... 7 | *** This is the key thing we abstract over to make Sawzall work 8 | * GSS example 9 | ** General Social Survey, 2016 data, individual-level preferences 10 | ** Idea of what we want to do: 11 | *** Get percent religious preference by census region 12 | **** What % of people in the Midwest are Catholic (in this data)? 
13 | *** Accomplishing this 14 | **** Start with individual level data 15 | **** Then, get the count of each religion within each region 16 | **** Finally, turn those counts into percentages 17 | ** Doing that with Sawzall 18 | *** Start with the threading library 19 | *** Group with respect to bigregion, and religion 20 | *** Then, summarize each religion into just the count of the number of observations 21 | *** Finally, turn those into frequencies, and turn the resulting frequencies into percentages 22 | *** Graph it 23 | * Basic wrangling operators 24 | ** aggregate [fold], create [map], where [filter], slice [also filter] 25 | ** All of these seamlessly compose with ~> 26 | ** It's natural to want to express operations in groups 27 | *** This works in threading by transporting to a grouped data structure 28 | * Billboard example 29 | ** A la Scheme 2021 30 | ** Why do we care about untidy data? 31 | * Basic tidying operators 32 | ** pivots, nesting, separating 33 | ** These compose with wrangling operators to make a seamless pipeline 34 | * The approach (implementation) 35 | ** Macros! 
36 | ** Racket is really good at figuring out what you want to write, and then making it work 37 | *** A lot of these operators have their own DSLs (like slice) 38 | *** These DSLs are implemented using syntax classes, which whip ass 39 | * Current uses, and future directions 40 | ** Processing small, in-memory datasets is relatively ergonomic 41 | *** Most of the book /R for Data Science/, by Hadley Wickham, can be completed 42 | *** ...but performance could be better 43 | ** I made this whole other library for making visualizations 44 | *** There's a Scheme talk and a tutorial about that 45 | ** "Abstracting" from a data-frame 46 | *** Could use a different style of storing data, or a different library 47 | *** Could interface with a real database (though this would be inefficient) 48 | -------------------------------------------------------------------------------- /sawzall-examples/racketcon-2021/pencils-lol.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ralsei/sawzall/c9a41fe3d639b880c36379f1cb7a1b1cd077e5ea/sawzall-examples/racketcon-2021/pencils-lol.png -------------------------------------------------------------------------------- /sawzall-examples/racketcon-2021/racket-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ralsei/sawzall/c9a41fe3d639b880c36379f1cb7a1b1cd077e5ea/sawzall-examples/racketcon-2021/racket-logo.png -------------------------------------------------------------------------------- /sawzall-examples/racketcon-2021/reciprocating_saw.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ralsei/sawzall/c9a41fe3d639b880c36379f1cb7a1b1cd077e5ea/sawzall-examples/racketcon-2021/reciprocating_saw.jpg -------------------------------------------------------------------------------- /sawzall-examples/racketcon-2021/tidy-0-0.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ralsei/sawzall/c9a41fe3d639b880c36379f1cb7a1b1cd077e5ea/sawzall-examples/racketcon-2021/tidy-0-0.png -------------------------------------------------------------------------------- /sawzall-examples/racketcon-2021/tidy-1-0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ralsei/sawzall/c9a41fe3d639b880c36379f1cb7a1b1cd077e5ea/sawzall-examples/racketcon-2021/tidy-1-0.png -------------------------------------------------------------------------------- /sawzall-examples/racketcon-2021/tidy-2-0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ralsei/sawzall/c9a41fe3d639b880c36379f1cb7a1b1cd077e5ea/sawzall-examples/racketcon-2021/tidy-2-0.png -------------------------------------------------------------------------------- /sawzall-examples/racketcon-2021/tidy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ralsei/sawzall/c9a41fe3d639b880c36379f1cb7a1b1cd077e5ea/sawzall-examples/racketcon-2021/tidy.png -------------------------------------------------------------------------------- /sawzall-examples/racketcon-2021/tidyr-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ralsei/sawzall/c9a41fe3d639b880c36379f1cb7a1b1cd077e5ea/sawzall-examples/racketcon-2021/tidyr-logo.png -------------------------------------------------------------------------------- /sawzall-examples/taylor-swift.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | (require data-frame 3 | csv-reading 4 | graphite 5 | sawzall 6 | threading) 7 | 8 | ;; read in all the data 9 | (define raw-lyrics (df-read/csv "data/taylor_swift_lyrics.csv")) 10 | 11 | 
(define stop-words (apply set (first (call-with-input-file "data/stop-words.csv" csv->list)))) 12 | 13 | ;; currently, we have each song line-by-line. we want *all* the lyrics 14 | ;; for each song. 15 | (define full-lyrics 16 | (~> raw-lyrics 17 | ;; group in album and track title, 18 | ;; both because we actually want to group by track title 19 | ;; and to retain the album information after aggregating 20 | (group-with "album" "track_title") 21 | ;; append all of the strings with spaces 22 | ;; map ~a (format) to avoid parsing the lyric "22" in the song "22" as an integer 23 | (aggregate [lyrics (lyric) (string-join (vector->list (vector-map ~a lyric)) " ")]) 24 | ungroup)) 25 | 26 | ;; strips punctuation from a string 27 | (define (strip-punctuation str) 28 | (string-normalize-spaces (regexp-replace* #px"[.,/#!$%\\^&\\*;:{}=\\-_`~()\"\"\\?]" str " "))) 29 | 30 | (define (get-frequency lst) 31 | (define frequencies (make-hash)) 32 | (for ([v (in-list lst)]) 33 | (hash-update! frequencies v add1 0)) 34 | frequencies) 35 | 36 | (define by-word 37 | (~> full-lyrics 38 | ;; remove all punctuation (except ones that modify the meaning) 39 | ;; and also remove uppercase 40 | (create [lyrics (lyrics) (string-downcase (strip-punctuation lyrics))]) 41 | ;; then turn them into hashes 42 | ;; this could be combined into the above operation, but I'm not for clarity 43 | (create [lyrics (lyrics) (get-frequency (string-split lyrics))]) 44 | ;; then turn those hash tables into regular columnar variables 45 | (unnest-longer "lyrics" 46 | #:keys-to "word" 47 | #:values-to "count" 48 | #:remove? #t) 49 | ;; remove "stop words" (really common english) 50 | (where (word) (not (set-member? 
stop-words word))))) 51 | 52 | (define (sum vec) (for/sum ([v (in-vector vec)]) v)) 53 | 54 | (~> by-word 55 | (group-with "word") 56 | (aggregate [count-sum (count) (sum count)]) 57 | (reorder (cons "count-sum" >)) 58 | (take-rows 0 20) 59 | (graph #:data _ 60 | #:mapping (aes #:x "word" #:y "count-sum") 61 | #:width 1000 #:height 600 62 | (col))) 63 | -------------------------------------------------------------------------------- /sawzall-lib/aggregate.rkt: -------------------------------------------------------------------------------- 1 | #lang racket/base 2 | (require (for-syntax racket/base) 3 | data-frame 4 | fancy-app 5 | racket/match 6 | racket/vector 7 | "grouped-df.rkt" 8 | "grouping.rkt" 9 | "syntax.rkt") 10 | (provide aggregate) 11 | 12 | (define-syntax (aggregate stx) 13 | (column-syntax-form stx #'aggregate/int #f)) 14 | 15 | (define (aggregate/int df proc) 16 | (ungroup-once (grouped-df-apply (aggregate-already-split _ _ proc) df #:pass-groups? #t))) 17 | 18 | ; after already having split the data-frame up, aggregate the results 19 | (define (aggregate-already-split df retain proc) 20 | (match-define (column-proc new-cols binders procs) proc) 21 | (define empty-df? (sub-df-empty? df)) 22 | 23 | (define return-df (make-data-frame)) 24 | (define retain-series 25 | (for/list ([v (in-list retain)]) 26 | ; should be homogenous 27 | (make-series v 28 | #:data (if empty-df? 29 | (vector) 30 | (vector-take (df-select/sub df v) 1))))) 31 | (define new-series 32 | (for/list ([new-col (in-list new-cols)] 33 | [binder (in-list binders)] 34 | [to-apply (in-list procs)]) 35 | (make-series new-col 36 | #:data (if empty-df? 37 | (vector) 38 | (vector 39 | (apply to-apply (map (compose (df-select/sub df _) car) 40 | binder))))))) 41 | 42 | (for ([s (in-list (append retain-series new-series))]) 43 | (df-add-series! 
return-df s)) 44 | return-df) 45 | -------------------------------------------------------------------------------- /sawzall-lib/bsearch.rkt: -------------------------------------------------------------------------------- 1 | #lang racket/base 2 | 3 | ;; bsearch.rkt -- binary search in a sorted vector 4 | ;; 5 | ;; This file is part of data-frame -- https://github.com/alex-hhh/data-frame 6 | ;; Copyright (c) 2018 Alex Harsányi 7 | ;; 8 | ;; This program is free software: you can redistribute it and/or modify it 9 | ;; under the terms of the GNU Lesser General Public License as published by 10 | ;; the Free Software Foundation, either version 3 of the License, or (at your 11 | ;; option) any later version. 12 | ;; 13 | ;; This program is distributed in the hope that it will be useful, but WITHOUT 14 | ;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 15 | ;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public 16 | ;; License for more details. 17 | ;; 18 | ;; You should have received a copy of the GNU Lesser General Public License 19 | ;; along with this program. If not, see <https://www.gnu.org/licenses/>. 20 | 21 | (require racket/contract 22 | racket/math) 23 | 24 | ;; Both `lower-bound` and `upper-bound` will search a sorted vector, VEC, for a 25 | ;; value VAL and return an index into the vector. The vector is assumed to 26 | ;; contain sorted values, as defined by CMP-FN (which must define a strict 27 | ;; ordering, that is a "less than" operation, BUT NOT a "less than or equal" 28 | ;; operation). KEY, if present, selects the value to compare (useful if the 29 | ;; vector contains structures and we want to search on a structure slot). 30 | ;; START and END define the sub-range of the vector to search.
31 | ;; 32 | ;; If unsure, you need to use `lower-bound` 33 | ;; 34 | ;; `lower-bound` will return an index identifying the earliest position where 35 | ;; VAL could be inserted to keep the range sorted, while `upper-bound` will 36 | ;; return the last position where an insertion would keep the range sorted. 37 | ;; 38 | ;; * if VAL exists one or more times in the vector, `lower-bound` will return 39 | ;; the first occurrence, but `upper-bound` will return one after the last 40 | ;; occurrence. For example: 41 | ;; 42 | ;; (define data (vector 1 2 3 4 5)) 43 | ;; (lower-bound data 3) => 2 (position of '3' is at index 2) 44 | ;; (upper-bound data 3) => 3 45 | ;; 46 | ;; (define data (vector 1 2 3 3 3 4 5)) 47 | ;; (lower-bound data 3) => 2 (first '3' is at index 2) 48 | ;; (upper-bound data 3) => 5 (NOTE: at index 5 we have the value '4') 49 | ;; 50 | ;; * if VAL is smaller than the first value in the range, both functions 51 | ;; return START 52 | ;; 53 | ;; * if VAL is greater than the last value in the range, both functions return 54 | ;; END (this is considered out of range for the vector) 55 | ;; 56 | ;; * If VAL does not exist, an index is returned representing the location of 57 | ;; VAL in the vector (or the "best" location, if VAL is not found). The same 58 | ;; index is returned by both functions. 59 | ;; 60 | ;; To determine if a value actually exists in a vector you also need to check 61 | ;; the actual value at the position returned by `lower-bound`, like so: 62 | ;; 63 | ;; (define (exists? data v) 64 | ;; (define index (lower-bound data v)) 65 | ;; (and (< index (vector-length data)) 66 | ;; (equal? v (vector-ref data index)))) 67 | ;; 68 | ;; NOTE: this works like the std::lower_bound() and std::upper_bound() 69 | ;; functions in C++. 
70 | 71 | (define (lower-bound vec val 72 | #:cmp (cmp-fn <) 73 | #:key (key-fn values) 74 | #:start (start 0) 75 | #:stop (end (vector-length vec))) 76 | 77 | (let ((vlen (vector-length vec))) 78 | (cond ((or (< start 0) (> start vlen)) 79 | (raise-range-error 'lower-bound "vector" "starting " start vec 0 vlen)) 80 | ((or (< end 0) (> end vlen)) 81 | (raise-range-error 'lower-bound "vector" "ending " end vec 0 vlen)) 82 | ((> start end) 83 | (raise-range-error 84 | 'lower-bound "vector" "ending " end vec start vlen 0)))) 85 | 86 | (let loop ([start start] 87 | [end end]) 88 | (if (= start end) 89 | start 90 | (let* ((mid (exact-truncate (/ (+ start end) 2))) 91 | (mid-val (key-fn (vector-ref vec mid)))) 92 | (if (cmp-fn mid-val val) 93 | (loop (add1 mid) end) 94 | (loop start mid)))))) 95 | 96 | (define (upper-bound vec val 97 | #:cmp (cmp-fn <) 98 | #:key (key-fn values) 99 | #:start (start 0) 100 | #:stop (end (vector-length vec))) 101 | 102 | (let ((vlen (vector-length vec))) 103 | (cond ((or (< start 0) (> start vlen)) 104 | (raise-range-error 'upper-bound "vector" "starting " start vec 0 vlen)) 105 | ((or (< end 0) (> end vlen)) 106 | (raise-range-error 'upper-bound "vector" "ending " end vec 0 vlen)) 107 | ((> start end) 108 | (raise-range-error 109 | 'upper-bound "vector" "ending " end vec start vlen 0)))) 110 | 111 | (let loop ([start start] 112 | [end end]) 113 | (if (= start end) 114 | start 115 | (let* ((mid (exact-truncate (/ (+ start end) 2))) 116 | (mid-val (key-fn (vector-ref vec mid)))) 117 | (if (cmp-fn val mid-val) 118 | (loop start mid) 119 | (loop (add1 mid) end)))))) 120 | 121 | ;; Return two values representing the start and one-plus end ranges where VAL 122 | ;; is present in the sorted vector VEC. 
This is equivalent to calling 123 | ;; lower-bound and upper-bound with the same parameters, but will run somewhat 124 | ;; faster 125 | (define (equal-range vec val 126 | #:cmp (cmp-fn <) 127 | #:key (key-fn values) 128 | #:start (start 0) 129 | #:stop (end (vector-length vec))) 130 | 131 | (let ((vlen (vector-length vec))) 132 | (cond ((or (< start 0) (> start vlen)) 133 | (raise-range-error 'equal-range "vector" "starting " start vec 0 vlen)) 134 | ((or (< end 0) (> end vlen)) 135 | (raise-range-error 'equal-range "vector" "ending " end vec 0 vlen)) 136 | ((> start end) 137 | (raise-range-error 138 | 'equal-range "vector" "ending " end vec start vlen 0)))) 139 | 140 | (define lb (lower-bound vec val #:cmp cmp-fn #:key key-fn #:start start #:stop end)) 141 | (cond ((>= lb end) (values lb lb)) 142 | ((equal? (vector-ref vec lb) val) 143 | (values lb 144 | (upper-bound vec val #:cmp cmp-fn #:key key-fn #:start lb #:stop end))) 145 | (#t 146 | (values lb lb)))) 147 | 148 | 149 | ;;............................................................. provides .... 150 | 151 | (provide/contract 152 | (lower-bound (->* ((vectorof any/c) any/c) 153 | (#:cmp (-> any/c any/c boolean?) 154 | #:key (-> any/c any/c) 155 | #:start integer? 156 | #:stop integer?) 157 | integer?)) 158 | 159 | (upper-bound (->* ((vectorof any/c) any/c) 160 | (#:cmp (-> any/c any/c boolean?) 161 | #:key (-> any/c any/c) 162 | #:start integer? 163 | #:stop integer?) 164 | integer?)) 165 | 166 | (equal-range (->* ((vectorof any/c) any/c) 167 | (#:cmp (-> any/c any/c boolean?) 168 | #:key (-> any/c any/c) 169 | #:start integer? 170 | #:stop integer?) 171 | (values integer? 
integer?))) 172 | 173 | ) 174 | -------------------------------------------------------------------------------- /sawzall-lib/combining-join.rkt: -------------------------------------------------------------------------------- 1 | #lang racket/base 2 | (require data-frame 3 | fancy-app 4 | racket/contract/base 5 | racket/list 6 | "generic-join.rkt" 7 | "grouped-df.rkt" 8 | "grouping.rkt") 9 | (provide (contract-out [left-join (-> (or/c data-frame? grouped-data-frame?) 10 | (or/c data-frame? grouped-data-frame?) 11 | string? ... 12 | (or/c data-frame? grouped-data-frame?))] 13 | [right-join (-> (or/c data-frame? grouped-data-frame?) 14 | (or/c data-frame? grouped-data-frame?) 15 | string? ... 16 | (or/c data-frame? grouped-data-frame?))] 17 | [inner-join (-> (or/c data-frame? grouped-data-frame?) 18 | (or/c data-frame? grouped-data-frame?) 19 | string? ... 20 | (or/c data-frame? grouped-data-frame?))] 21 | [full-join (-> (or/c data-frame? grouped-data-frame?) 22 | (or/c data-frame? grouped-data-frame?) 23 | string? ... 24 | (or/c data-frame? grouped-data-frame?))]) 25 | join-matches join-no-matches) 26 | 27 | (define (left-join df1 df2 . by) 28 | (ignore-groups-apply (left-join-dfs _ (ungroup df2) by) df1)) 29 | (define (right-join df1 df2 . by) 30 | (ignore-groups-apply (left-join-dfs (ungroup df2) _ by) df1)) 31 | (define (inner-join df1 df2 . by) 32 | (ignore-groups-apply (inner-join-dfs _ (ungroup df2) by) df1)) 33 | (define (full-join df1 df2 . by) 34 | (ignore-groups-apply (full-join-dfs _ (ungroup df2) by) df1)) 35 | 36 | ; pad any missing data that isn't matched in any column in df2 with #f 37 | (define (join-no-matches df1 df2-series) 38 | (define df1-int (sub-data-frame-delegate-frame df1)) 39 | (define return-df (make-data-frame)) 40 | (define df1-size (df-row-count/sub df1)) 41 | 42 | (for ([name (in-list df2-series)]) 43 | (df-add-series! 
return-df (make-series name #:data (make-vector df1-size #f)))) 44 | (for ([name (in-list (df-series-names df1-int))]) 45 | (df-add-series! return-df (make-series name #:data (df-select/sub df1 name)))) 46 | 47 | return-df) 48 | 49 | ; combine on all shared matches 50 | (define (join-matches df1 df2 by) 51 | (define df1-int (sub-data-frame-delegate-frame df1)) 52 | (define df2-int (sub-data-frame-delegate-frame df2)) 53 | 54 | (define (permute-data series df2?) 55 | (for*/vector ([df1-val (in-data-frame/sub df1 (if df2? (first by) series))] 56 | [df2-val (in-data-frame/sub df2 (if df2? series (first by)))]) 57 | (if df2? df2-val df1-val))) 58 | 59 | (define return-df (make-data-frame)) 60 | (for ([name (in-list (df-series-names df2-int))]) 61 | (df-add-series! return-df (make-series name #:data (permute-data name #t)))) 62 | (for ([name (in-list (df-series-names df1-int))]) 63 | (df-add-series! return-df (make-series name #:data (permute-data name #f)))) 64 | 65 | return-df) 66 | 67 | (define left-join-dfs 68 | (generic-join 69 | #:on-= (λ (df1 df2 by acc) (cons (join-matches df1 df2 by) acc)) 70 | ; if df2 ends, keep adding #f 71 | #:on-end 72 | (λ (df1 df2-names acc) 73 | (cons (join-no-matches df1 df2-names) acc)) 74 | ; if df1 < df2, add #f 75 | #:on-< 76 | (λ (df1 df2-names acc) 77 | (cons (join-no-matches df1 df2-names) acc)) 78 | ; if df1 > df2, do nothing 79 | #:on-> 80 | (λ (df1-names df2 acc) acc))) 81 | 82 | (define inner-join-dfs 83 | ; only do something if we're equal 84 | (generic-join 85 | #:on-= (λ (df1 df2 by acc) (cons (join-matches df1 df2 by) acc)) 86 | #:on-end (λ (df1 df2-names acc) acc) 87 | #:on-< (λ (df1 df2-names acc) acc) 88 | #:on-> (λ (df1-names df2 acc) acc))) 89 | 90 | (define full-join-dfs 91 | ; keep adding #f no matter what 92 | (generic-join 93 | #:on-= (λ (df1 df2 by acc) (cons (join-matches df1 df2 by) acc)) 94 | #:on-end 95 | (λ (df1 df2-names acc) 96 | (cons (join-no-matches df1 df2-names) acc)) 97 | #:on-< 98 | (λ (df1 
df2-names acc) 99 | (cons (join-no-matches df1 df2-names) acc)) 100 | #:on-> 101 | (λ (df1-names df2 acc) 102 | (cons (join-no-matches df2 df1-names) acc)))) 103 | -------------------------------------------------------------------------------- /sawzall-lib/constructors.rkt: -------------------------------------------------------------------------------- 1 | #lang racket/base 2 | (require (for-syntax racket/base 3 | racket/sequence) 4 | data-frame 5 | syntax/parse/define) 6 | (provide column-df row-df) 7 | 8 | (begin-for-syntax 9 | (define-syntax-class column-spec 10 | [pattern col:id 11 | #:with name #'(symbol->string 'col)] 12 | [pattern col 13 | #:declare col (expr/c #'string?) 14 | #:with name #'col.c]) 15 | 16 | (define (slice n lst) 17 | (sequence->list (in-slice n (in-list lst)))) 18 | 19 | (define (syntax-vector . args) 20 | #`(vector #,@args))) 21 | 22 | (define-syntax-parse-rule (column-df [col:column-spec col-data:expr] ...) 23 | (let () 24 | (define df (make-data-frame)) 25 | (df-add-series! df (make-series col.name #:data col-data)) ... 26 | df)) 27 | 28 | (define-syntax-parse-rule (row-df [col:column-spec ...] value:expr ...) 29 | #:with (column-vec ...) 
30 | (apply map syntax-vector 31 | (slice (length (attribute col.name)) (attribute value))) 32 | (let () 33 | (column-df [col.name column-vec] ...))) 34 | -------------------------------------------------------------------------------- /sawzall-lib/create.rkt: -------------------------------------------------------------------------------- 1 | #lang racket/base 2 | (require (for-syntax racket/base) 3 | data-frame 4 | fancy-app 5 | racket/function 6 | racket/match 7 | racket/vector 8 | "grouped-df.rkt" 9 | "grouping.rkt" 10 | "syntax.rkt") 11 | (provide create) 12 | 13 | (define-syntax (create stx) 14 | (column-syntax-form stx #'create/int #t)) 15 | 16 | (define (create/int df proc) 17 | (grouped-df-apply (create-on-df _ proc) df)) 18 | 19 | (define (create-on-df df proc) 20 | (match-define (column-proc new-cols binders procs) proc) 21 | (define return-df (df-dumb-copy/sub df)) 22 | 23 | ; we have to support sequential saw-λ 24 | (for ([col-name (in-list new-cols)] 25 | [binder (in-list binders)] 26 | [to-apply (in-list procs)]) 27 | ; we need to map if there is a single element being bound 28 | (define all-vector? (andmap (λ (x) (eq? (cdr x) 'vector)) binder)) 29 | 30 | (define func 31 | (if all-vector? 32 | to-apply 33 | (curry vector-map to-apply))) 34 | 35 | (define len (df-row-count return-df)) 36 | (define args 37 | (if all-vector? 38 | (map (compose (df-select return-df _) car) binder) 39 | (for/list ([binding (in-list binder)]) 40 | (define var (car binding)) 41 | (define ty (cdr binding)) 42 | (if (eq? ty 'vector) 43 | (make-vector len (df-select return-df var)) 44 | (df-select return-df var))))) 45 | 46 | (df-add-series! 
47 | return-df 48 | (make-series col-name #:data (apply func args)))) 49 | 50 | return-df) 51 | -------------------------------------------------------------------------------- /sawzall-lib/display.rkt: -------------------------------------------------------------------------------- 1 | #lang racket/base 2 | (require (for-syntax racket/base) 3 | data-frame 4 | fancy-app 5 | text-table 6 | racket/contract/base 7 | racket/format 8 | racket/list 9 | syntax/parse/define 10 | "grouped-df.rkt" 11 | "grouping.rkt" 12 | "slice-spec.rkt") 13 | (provide (contract-out [sawzall-show-formatter (parameter/c (-> any/c string?))]) 14 | show introspect) 15 | 16 | (define sawzall-show-formatter (make-parameter ~a)) 17 | 18 | (define *show-rows-default* 6) 19 | (define *show-cols-default* 6) 20 | 21 | (define (take* lst n) 22 | (cond [(zero? n) empty] 23 | [(empty? lst) empty] 24 | [else (cons (first lst) 25 | (take* (rest lst) (sub1 n)))])) 26 | 27 | (define (gdf-series-names df) 28 | (df-series-names 29 | (cond [(grouped-data-frame? df) (grouped-data-frame-delegate-frame df)] 30 | [(sub-data-frame? df) (sub-data-frame-delegate-frame df)] 31 | [(data-frame? df) df]))) 32 | 33 | (define-for-syntax (show-syntax-form stx return?) 34 | (syntax-parse stx 35 | [(_ df {~alt 36 | {~optional spec:slice-spec 37 | #:defaults ([spec.parsed #'(all-in$ 38 | (take* 39 | (gdf-series-names df) 40 | *show-cols-default*))])} 41 | {~optional {~seq #:n-rows n-rows} 42 | #:defaults ([n-rows.c #'*show-rows-default*])}} 43 | ...) 44 | #:declare df (expr/c #'(or/c data-frame? grouped-data-frame?)) 45 | #:declare n-rows (expr/c #'(or/c exact-nonnegative-integer? 'all)) 46 | #`(begin 47 | (ignore-groups-apply (show-internal spec.parsed n-rows.c) df.c 48 | #:pass-groups? #t #:regroup? #f) 49 | #,(if return? 
#'df #'(void)))])) 50 | 51 | (define-syntax (show stx) 52 | (show-syntax-form stx #f)) 53 | (define-syntax (introspect stx) 54 | (show-syntax-form stx #t)) 55 | 56 | (define ((show-internal parsed-spec row-cap/i) df grps) 57 | (define all-series (exec-spec-on-df df parsed-spec)) 58 | 59 | (define all? (eq? row-cap/i 'all)) 60 | (define n-rows (df-row-count df)) 61 | (define n-cols (length (df-series-names df))) 62 | (define row-cap (and (not all?) (min n-rows row-cap/i))) 63 | (define col-cap (length all-series)) 64 | 65 | (printf "data-frame: ~a rows x ~a columns~n" n-rows n-cols) 66 | (when (not (null? grps)) 67 | (printf "groups: ~a~n" grps)) 68 | 69 | (displayln 70 | (table->string 71 | #:->string (sawzall-show-formatter) 72 | (let ([series (if all? all-series (take all-series col-cap))]) 73 | (cons series 74 | (for/list ([v (apply in-data-frame/list df series)] 75 | [_ (if all? n-rows row-cap)]) 76 | v))))) 77 | 78 | (when (not (or all? (and (= n-rows row-cap) (= n-cols col-cap)))) 79 | (printf "~a rows, ~a cols elided 80 | (use (show df everything #:n-rows 'all) for full frame)~n" 81 | (- n-rows row-cap) 82 | (- n-cols col-cap)))) 83 | -------------------------------------------------------------------------------- /sawzall-lib/filtering-join.rkt: -------------------------------------------------------------------------------- 1 | #lang racket/base 2 | (require data-frame 3 | fancy-app 4 | racket/contract/base 5 | "generic-join.rkt" 6 | "grouped-df.rkt" 7 | "grouping.rkt") 8 | (provide (contract-out [semi-join (-> (or/c data-frame? grouped-data-frame?) 9 | (or/c data-frame? grouped-data-frame?) 10 | string? ... 11 | (or/c data-frame? grouped-data-frame?))] 12 | [anti-join (-> (or/c data-frame? grouped-data-frame?) 13 | (or/c data-frame? grouped-data-frame?) 14 | string? ... 15 | (or/c data-frame? grouped-data-frame?))])) 16 | 17 | (define (semi-join df1 df2 . 
by) 18 | (ignore-groups-apply (semi-join-dfs _ (ungroup df2) by) df1)) 19 | (define (anti-join df1 df2 . by) 20 | (ignore-groups-apply (anti-join-dfs _ (ungroup df2) by) df1)) 21 | 22 | (define (join-matches df1 df2 by) 23 | (define df1-int (sub-data-frame-delegate-frame df1)) 24 | 25 | (define return-df (make-data-frame)) 26 | (for ([name (in-list (df-series-names df1-int))]) 27 | (df-add-series! return-df (make-series name #:data (df-select/sub df1 name)))) 28 | 29 | return-df) 30 | 31 | (define (join-no-matches df1 df2-series) 32 | (define df1-int (sub-data-frame-delegate-frame df1)) 33 | 34 | (define return-df (make-data-frame)) 35 | (for ([name (in-list (df-series-names df1-int))]) 36 | (df-add-series! return-df (make-series name #:data (df-select/sub df1 name)))) 37 | 38 | return-df) 39 | 40 | (define semi-join-dfs 41 | (generic-join 42 | #:on-= (λ (df1 df2 by acc) (cons (join-matches df1 df2 by) acc)) 43 | #:on-end (λ (df1 df2-names acc) acc) 44 | #:on-< (λ (df1 df2-names acc) acc) 45 | #:on-> (λ (df1-names df2 acc) acc))) 46 | 47 | (define anti-join-dfs 48 | (generic-join 49 | #:on-= (λ (df1 df2 by acc) acc) 50 | #:on-end (λ (df1 df2-names acc) (cons (join-no-matches df1 df2-names) acc)) 51 | #:on-< (λ (df1 df2-names acc) (cons (join-no-matches df1 df2-names) acc)) 52 | #:on-> (λ (df1-names df2 acc) acc))) 53 | -------------------------------------------------------------------------------- /sawzall-lib/generic-join.rkt: -------------------------------------------------------------------------------- 1 | #lang racket/base 2 | (require data-frame 3 | fancy-app 4 | racket/list 5 | "helpers.rkt" 6 | "grouped-df.rkt" 7 | "grouping.rkt" 8 | "split.rkt") 9 | (provide generic-join) 10 | 11 | ;; defines a generic join on two data-frames. 12 | (define ((generic-join #:on-end on-end 13 | #:on-= on-eq 14 | #:on-< on-lt 15 | #:on-> on-else) 16 | df1 df2 [by-int '()]) 17 | (define by 18 | (if (null? 
by-int) 19 | (shared-series (list df1 df2)) 20 | by-int)) 21 | 22 | (when (null? by) 23 | (error 'join "no shared series between merge 24 | please use rename to make some shared columns first")) 25 | 26 | (define df1-grouped (apply group-with df1 by)) 27 | (define df2-grouped (apply group-with df2 by)) 28 | (define df1-sorted (grouped-data-frame-delegate-frame df1-grouped)) 29 | (define df2-sorted (grouped-data-frame-delegate-frame df2-grouped)) 30 | (define df1-group-ivls (first (grouped-data-frame-group-indices df1-grouped))) 31 | (define df2-group-ivls (first (grouped-data-frame-group-indices df2-grouped))) 32 | 33 | (define df1-by (get-grouped-by df1-grouped)) 34 | (define df2-by (get-grouped-by df2-grouped)) 35 | 36 | (define df1-len (vector-length df1-by)) 37 | (define df2-len (vector-length df2-by)) 38 | 39 | ; the return of merge from the hit series merge sort 40 | (let loop ([df1-idx 0] [df2-idx 0] [dfs '()]) 41 | (cond [(>= df1-idx df1-len) 42 | ; we've run out of vector to use, so return the final df 43 | (when (null? dfs) 44 | (error 'join "no data-frames to combine (are you using a semi-join with no matches?)")) 45 | (apply combine (reverse dfs))] 46 | [(>= df2-idx df2-len) 47 | ; we've run out of the second vector. this varies between joins 48 | (loop (add1 df1-idx) df2-idx 49 | (on-end (df-with-ivl df1-sorted (vector-ref df1-group-ivls df1-idx)) 50 | (df-series-names df2) 51 | dfs))] 52 | [(equal? 
(vector-ref df1-by df1-idx) 53 | (vector-ref df2-by df2-idx)) 54 | ; the rows share the same key, so merge them with combining 55 | (loop (add1 df1-idx) (add1 df2-idx) 56 | (on-eq (df-with-ivl df1-sorted (vector-ref df1-group-ivls df1-idx)) 57 | (df-with-ivl df2-sorted (vector-ref df2-group-ivls df2-idx)) 58 | by 59 | dfs))] 60 | [(lexicographic-vector df2, so keep incrementing the df2 index until they match, and 70 | ; update the accumulator (again, varies) 71 | (loop df1-idx (add1 df2-idx) 72 | (on-else (df-series-names df1) 73 | (df-with-ivl df2-sorted (vector-ref df2-group-ivls df2-idx)) 74 | dfs))]))) 75 | 76 | ;; gets what the given grouped data frame is grouped by at the top level, based on what 77 | ;; we already have grouped 78 | ;; 79 | ;; by the implementation of a grouped data frame this set is already lexicographically sorted 80 | (define (get-grouped-by gdf) 81 | (for/vector ([iv (in-vector (first (grouped-data-frame-group-indices gdf)))]) 82 | (apply df-ref* 83 | (grouped-data-frame-delegate-frame gdf) 84 | (ivl-beg iv) 85 | (reverse (grouped-data-frame-groups gdf))))) 86 | -------------------------------------------------------------------------------- /sawzall-lib/grouped-df.rkt: -------------------------------------------------------------------------------- 1 | #lang racket/base 2 | (require data-frame 3 | racket/contract/base 4 | racket/match) 5 | (provide (struct-out ivl) 6 | df-select/sub 7 | df-select*/sub 8 | in-data-frame/sub 9 | in-data-frame/list/sub 10 | df-dumb-copy/sub 11 | sub-df-empty? 12 | df-row-count/sub 13 | df-ref/sub 14 | df-with-ivl 15 | 16 | (contract-out 17 | [struct sub-data-frame ((delegate-frame data-frame?) 18 | (ivl ivl?))] 19 | [struct grouped-data-frame ((delegate-frame data-frame?) 
20 | (groups (listof string?)) 21 | (group-indices (listof (vectorof ivl?))))])) 22 | 23 | ;;;; subframes 24 | ;; like a data-frame, except only a contiguous slice of it 25 | (struct ivl (beg end) #:transparent) 26 | (struct sub-data-frame (delegate-frame ivl) #:transparent) 27 | 28 | ;; like their regular data-frame functions, but operating on sub-data-frames 29 | (define (df-select/sub dfl series) 30 | (match-define (sub-data-frame df (ivl beg end)) dfl) 31 | (df-select df series #:start beg #:stop end)) 32 | 33 | (define (df-select*/sub dfl . series) 34 | (match-define (sub-data-frame df (ivl beg end)) dfl) 35 | (apply df-select* df #:start beg #:stop end series)) 36 | 37 | (define (in-data-frame/sub dfl . series) 38 | (match-define (sub-data-frame df (ivl beg end)) dfl) 39 | (apply in-data-frame df #:start beg #:stop end series)) 40 | 41 | (define (in-data-frame/list/sub dfl . series) 42 | (match-define (sub-data-frame df (ivl beg end)) dfl) 43 | (apply in-data-frame/list df #:start beg #:stop end series)) 44 | 45 | (define (df-ref/sub dfl idx series) 46 | (match-define (sub-data-frame df (ivl beg end)) dfl) 47 | (df-ref df (+ beg idx) series)) 48 | 49 | (define (df-dumb-copy/sub dfl) 50 | (match-define (sub-data-frame df (ivl beg end)) dfl) 51 | (define return-df (make-data-frame)) 52 | (for ([s (in-list (df-series-names df))]) 53 | (df-add-series! return-df 54 | (make-series s #:data (df-select df s #:start beg #:stop end)))) 55 | return-df) 56 | 57 | (define (sub-df-empty? 
dfl) 58 | (match-define (sub-data-frame _ (ivl beg end)) dfl) 59 | (= (- end beg) 0)) 60 | 61 | (define (df-row-count/sub dfl) 62 | (define iv (sub-data-frame-ivl dfl)) 63 | (- (ivl-end iv) (ivl-beg iv))) 64 | 65 | ;; add an interval to a data-frame, or alternatively add an interval that comprises 66 | ;; the entire df 67 | (define (df-with-ivl df [int #f]) 68 | (sub-data-frame df (if int int (ivl 0 (df-row-count df))))) 69 | 70 | ;;;; grouped data frames 71 | ;; a data-frame, with some additional information: 72 | ;; - the groups 73 | ;; - for each group, a set of intervals that correspond to an ivl (for use with 74 | ;; a `sub-data-frame`) 75 | ;; 76 | ;; unlike a regular data-frame, this is immutable to avoid destroying invariants 77 | (struct grouped-data-frame (delegate-frame groups group-indices) #:transparent) 78 | -------------------------------------------------------------------------------- /sawzall-lib/grouping.rkt: -------------------------------------------------------------------------------- 1 | #lang racket/base 2 | (require data-frame 3 | fancy-app 4 | racket/contract/base 5 | racket/list 6 | racket/match 7 | "bsearch.rkt" 8 | "grouped-df.rkt" 9 | "helpers.rkt" 10 | "reorder-df.rkt" 11 | "split.rkt") 12 | (provide 13 | (contract-out [group-with (->* (data-frame?) 14 | #:rest (non-empty-listof string?) 15 | grouped-data-frame?)] 16 | [ungroup-once (-> (or/c data-frame? grouped-data-frame?) 17 | (or/c data-frame? grouped-data-frame?))] 18 | [ungroup (-> (or/c data-frame? grouped-data-frame?) 19 | data-frame?)]) 20 | grouped-df-apply ignore-groups-apply) 21 | 22 | ;;;; constructing grouped data frames 23 | (define (group-with df . 
groups) 24 | (define sorted (reorder-default df groups)) 25 | 26 | (define (build-group-vector grp [in-ivls (vector (ivl 0 (df-row-count df)))]) 27 | (define vec (df-select sorted grp)) 28 | (for*/vector ([interval (in-vector in-ivls)] 29 | [p (in-vector (possibilities sorted grp #:ivl interval))]) 30 | (match-define (ivl beg end) interval) 31 | (call-with-values 32 | (λ () (equal-range vec p #:cmp orderable data-frame?) to a grouped data frame 58 | (define (grouped-df-apply fn df #:pass-groups? [pass-groups? #f]) 59 | (cond [(data-frame? df) (if pass-groups? (fn (df-with-ivl df) null) (fn (df-with-ivl df)))] 60 | [(sub-data-frame? df) (if pass-groups? (fn df null) (fn df))] 61 | [(grouped-data-frame? df) 62 | (match-define (grouped-data-frame int-df grps grp-idxes) df) 63 | (define (call ivl) 64 | (if pass-groups? 65 | (fn (df-with-ivl int-df ivl) grps) 66 | (fn (df-with-ivl int-df ivl)))) 67 | (apply group-with 68 | (apply combine 69 | (for/list ([i (in-vector (first grp-idxes))]) 70 | (call i))) 71 | (reverse grps))])) 72 | 73 | ;; applies a function (data-frame? -> data-frame?) to a grouped data frame, 74 | ;; ignoring its grouping 75 | (define (ignore-groups-apply fn df #:pass-groups? [pass-groups? #f] #:regroup? [regroup? #t]) 76 | (define real-df 77 | (cond [(grouped-data-frame? df) (grouped-data-frame-delegate-frame df)] 78 | [(sub-data-frame? df) (sub-data-frame-delegate-frame df)] 79 | [(data-frame? df) df])) 80 | (define groups 81 | (cond [(grouped-data-frame? df) (grouped-data-frame-groups df)] 82 | [else null])) 83 | (define res (if pass-groups? (fn real-df groups) (fn real-df))) 84 | ((if (and regroup? (grouped-data-frame? 
df)) 85 | (apply group-with _ (reverse groups)) 86 | (λ (x) x)) 87 | res)) 88 | -------------------------------------------------------------------------------- /sawzall-lib/helpers.rkt: -------------------------------------------------------------------------------- 1 | #lang racket/base 2 | (require data-frame 3 | racket/contract/base 4 | racket/match 5 | racket/set 6 | "grouped-df.rkt") 7 | (provide ? λ? 8 | possibilities 9 | vector-has-duplicates? 10 | vector-reorder 11 | vector-reorder! 12 | shared-series 13 | lexicographic-vector orderable? orderable? boolean?)])) 16 | 17 | (define (? fn . args) 18 | (if (andmap (λ (x) (and x #t)) args) 19 | (apply fn args) 20 | #f)) 21 | 22 | (define ((λ? fn) . args) 23 | (apply ? fn args)) 24 | 25 | ; removes duplicates from a given vector 26 | (define (vector-remove-duplicates vec) 27 | (define seen (mutable-set)) 28 | (for/vector ([v (in-vector vec)] 29 | #:unless (set-member? seen v)) 30 | (set-add! seen v) 31 | v)) 32 | 33 | ; determines if a vector has duplicates 34 | (define (vector-has-duplicates? vec) 35 | (define seen (mutable-set)) 36 | (not 37 | (for/and ([v (in-vector vec)]) 38 | (and (not (set-member? seen v)) 39 | (set-add! 
seen v))))) 40 | 41 | ; determines the possible values that a given data-frame has in a column 42 | (define (possibilities data group #:ivl [iv #f]) 43 | (match-define (ivl beg end) 44 | (if (not iv) 45 | (ivl 0 (df-row-count data)) 46 | iv)) 47 | (vector-remove-duplicates (df-select data group #:start beg #:stop end))) 48 | 49 | ; reorders a vector based on the given indices 50 | ; example: 51 | ; (vector-reorder (vector 1 2 3) (vector 2 1 0)) 52 | ; => (vector 3 2 1) 53 | (define (vector-reorder vec indices) 54 | (when (not (= (vector-length indices) (vector-length vec))) 55 | (error 'vector-reorder "index list not same length as vector")) 56 | (for/vector ([idx (in-vector indices)]) 57 | (vector-ref vec idx))) 58 | 59 | ; takes the input vector and swaps the value at index A with that at index B, 60 | ; mutably 61 | (define (vector-swap! vec a-idx b-idx) 62 | (define temp (vector-ref vec a-idx)) 63 | (vector-set! vec a-idx (vector-ref vec b-idx)) 64 | (vector-set! vec b-idx temp)) 65 | 66 | ; like the above, but mutably with regards to the input 67 | (define (vector-reorder! vec indices) 68 | (define src-idx 0) 69 | 70 | (for ([tar-idx (in-range (vector-length vec))]) 71 | (set! src-idx (vector-ref indices tar-idx)) 72 | (let loop () 73 | (when (< src-idx tar-idx) 74 | (set! src-idx (vector-ref indices src-idx)) 75 | (loop))) 76 | (vector-swap! 
vec src-idx tar-idx))) 77 | 78 | ; shared series between data-frames 79 | (define (shared-series dfs) 80 | (apply set-intersect (map df-series-names dfs))) 81 | 82 | ; lexicographic ordering for vectors 83 | (define (lexicographic-vectorlist (exec-spec-on-df df parsed-spec))) 49 | (where/int df (row-proc (set->list columns) (λ vals (andmap identity vals))))) 50 | -------------------------------------------------------------------------------- /sawzall-lib/pivot.rkt: -------------------------------------------------------------------------------- 1 | #lang racket/base 2 | (require (for-syntax racket/base) 3 | data-frame 4 | fancy-app 5 | racket/contract/base 6 | racket/format 7 | racket/list 8 | racket/set 9 | racket/vector 10 | syntax/parse/define 11 | "combining-join.rkt" 12 | "generic-join.rkt" 13 | "grouped-df.rkt" 14 | "helpers.rkt" 15 | "rename.rkt" 16 | "slice-spec.rkt" 17 | "split.rkt") 18 | (provide pivot-longer 19 | (contract-out [pivot-wider (-> data-frame? 20 | #:names-from string? #:values-from string? 21 | data-frame?)])) 22 | 23 | (define-syntax-parse-rule (pivot-longer df spec:slice-spec 24 | #:names-to name #:values-to value) 25 | #:declare df (expr/c #'data-frame?) 26 | #:declare name (expr/c #'string?) 27 | #:declare value (expr/c #'string?) 
28 | (pivot-longer/int df.c spec.parsed #:names-to name.c #:values-to value.c)) 29 | 30 | ; lengthens data, increasing the number of rows and decreasing the number of columns 31 | (define (pivot-longer/int df parsed-spec 32 | #:names-to name 33 | #:values-to value) 34 | (define cols (exec-spec-on-df df parsed-spec)) 35 | 36 | (define ~cols (set-subtract (df-series-names df) cols)) 37 | (define n-new-cols (set-count cols)) 38 | ; each column becomes a set of rows = to the length of the df 39 | ; so replicate retained series based on the number of columns 40 | (define new-~col-series 41 | (for/list ([name (in-list ~cols)]) 42 | (make-series name 43 | #:data (apply vector-append 44 | (make-list n-new-cols (df-select df name)))))) 45 | 46 | ; turn each series name into a new series that's comprised of its names repeated by the number 47 | ; of rows 48 | (define n-rows (df-row-count df)) 49 | (define list-cols (set->list cols)) 50 | (define new-name-series 51 | (make-series name #:data (apply vector-append (map (make-vector n-rows _) list-cols)))) 52 | ; and append all the values into their own series, which match with the names by virtue of ordering 53 | (define new-val-series 54 | (make-series value #:data (apply vector-append (map (df-select df _) list-cols)))) 55 | 56 | (define return-df (make-data-frame)) 57 | (for ([s (in-list new-~col-series)]) 58 | (df-add-series! return-df s)) 59 | (df-add-series! return-df new-name-series) 60 | (df-add-series! return-df new-val-series) 61 | 62 | return-df) 63 | 64 | ;; like a left join, except it's less permissive with duplicates 65 | (define (pivot-wider-join-matches df1 df2 by) 66 | (define df1-int (sub-data-frame-delegate-frame df1)) 67 | (define df2-int (sub-data-frame-delegate-frame df2)) 68 | 69 | (when (or (vector-has-duplicates? (apply df-select*/sub df1 (df-series-names df1-int))) 70 | (vector-has-duplicates? 
(apply df-select*/sub df2 (df-series-names df2-int)))) 71 | (error 'pivot-wider "duplicate identifiers for rows: need more information")) 72 | 73 | (join-matches df1 df2 by)) 74 | 75 | (define pivot-wider-join 76 | (generic-join 77 | #:on-= (λ (df1 df2 by acc) (cons (pivot-wider-join-matches df1 df2 by) acc)) 78 | ; if df2 ends, keep adding #f 79 | #:on-end 80 | (λ (df1 df2-names acc) 81 | (cons (join-no-matches df1 df2-names) acc)) 82 | ; if df1 < df2, pad with #f 83 | #:on-< 84 | (λ (df1 df2-names acc) 85 | (cons (join-no-matches df1 df2-names) acc)) 86 | ; if df1 > df2, do nothing 87 | #:on-> 88 | (λ (df1-names df2 acc) acc))) 89 | 90 | ; widens data, decreasing the number of rows and increasing the number of columns 91 | ; XXX: this is not particularly efficient, but intuitive. should see what dplyr does 92 | (define (pivot-wider df #:names-from name-from #:values-from value-from) 93 | (define split (split-with-possibility df name-from)) 94 | 95 | (define to-ljoin 96 | (for/list ([on-possibility (in-vector split)]) 97 | (define val (car on-possibility)) 98 | (define int-df (cdr on-possibility)) 99 | 100 | (define ret (rename int-df value-from (~a val))) 101 | (df-del-series! ret name-from) 102 | ret)) 103 | 104 | (define return-df 105 | (for/fold ([d (first to-ljoin)]) 106 | ([v (in-list (rest to-ljoin))]) 107 | (pivot-wider-join v d))) 108 | 109 | return-df) 110 | -------------------------------------------------------------------------------- /sawzall-lib/rectangling.rkt: -------------------------------------------------------------------------------- 1 | #lang racket/base 2 | (require data-frame 3 | fancy-app 4 | racket/contract/base 5 | racket/format 6 | racket/sequence 7 | racket/vector) 8 | (provide (contract-out [unnest-longer (->* (data-frame? string?) 9 | (#:keys-to (or/c string? #f) 10 | #:values-to (or/c string? #f) 11 | #:remove? boolean?) 12 | data-frame?)] 13 | [unnest-wider (->* (data-frame? string?) 14 | (#:index-prefix string? 15 | #:remove? 
boolean?) 16 | data-frame?)])) 17 |
;; number of values `seq` produces per element (1 for list-like sequences,
;; 2 for dictionary-like ones), probed on its first element.
;; `seq` must be non-empty: `sequence-ref` raises otherwise.
(define (sequence-return-count seq)
  (call-with-values (λ () (sequence-ref seq 0)) (compose length list)))

;; decides whether the sequences stored in the vector `data` are list-like
;; (returns 1) or dictionary-like (returns 2), judging by the first entry
;; that is neither missing (#f) nor empty.
;; a column with nothing to inspect is treated as list-like instead of
;; failing on `(= #f 1)`.
;; `who` is the symbol used to report an unsupported arity.
(define (column-return-count who data)
  (define n-return
    (or (for/first ([seq (in-vector data)]
                    #:when (and seq (positive? (sequence-length seq))))
          (sequence-return-count seq))
        1))
  (when (not (or (= n-return 1) (= n-return 2)))
    (error who "sequence is not list-like or dictionary-like"))
  n-return)

;; like in-parallel, but:
;; - if a sequence is #f, make it an endless sequence of #f
;; - if a sequence is shorter than the other sequences specified,
;;   it will pad the end of it with #f
;; expects at least one argument (`max` is applied to the lengths)
(define (in-parallel* . seqs)
  ; -1 marks a missing sequence, so it never wins the `max`
  (define lengths
    (for/list ([v (in-list seqs)])
      (if v (sequence-length v) -1)))
  (define max-len (apply max lengths))
  (define to-add
    (for/list ([len (in-list lengths)])
      (- max-len len)))
  (apply in-parallel
         (for/list ([v (in-list seqs)]
                    [add (in-list to-add)])
           (if v
               (in-sequences v (make-vector add #f))
               (in-cycle (in-value #f))))))

;; takes a column of sequences, and converts it into either one column (if it is list-like),
;; or two columns (if dictionary-like), based on its keys and values
;;   keys-to   : name of the keys column, defaults to column-name + "-keys"
;;   values-to : name of the values column, defaults to column-name
;;   remove?   : drop the original column (unless it was overwritten by values-to)
;; rows whose entry is #f are kept as one row with #f for key and value
(define (unnest-longer df column-name
                       #:keys-to [keys-to/int #f]
                       #:values-to [values-to/int #f]
                       #:remove? [remove? #t])
  ;; default to column-name + "-keys"
  (define keys-to (or keys-to/int (string-append column-name "-keys")))
  ;; default to column-name
  (define values-to (or values-to/int column-name))

  (define data (df-select df column-name))

  ;; determine if the sequence is list-like or dictionary-like
  (define has-keys? (= (column-return-count 'unnest-longer data) 2))

  ;; due to arity nonsense, we want to be able to treat it like a dict even
  ;; if it's not one.
  ;; if there's no sequence, convert it to key #f, value #f -- this has to
  ;; happen for dictionary-like columns too, otherwise iterating a #f entry
  ;; below raises
  ;; if there is a sequence and we're not dict-like already, make its keys
  ;; an infinite sequence of #f (they don't get used anyway)
  (define padded-data
    (for/vector #:length (vector-length data) ([seq (in-vector data)])
      (cond [(not seq) (in-parallel '(#f) '(#f))]
            [has-keys? seq]
            [else (in-parallel (in-cycle (in-value #f)) seq)])))

  ;; get the lengths of each sequence so we know how many times to duplicate
  ;; the old data
  (define lengths
    (for/list ([seq (in-vector data)])
      (if seq (sequence-length seq) 1)))

  ;; everything but the column values-to, so if it's the same as an input column,
  ;; that input column gets overwritten
  (define everything-else (remove values-to (df-series-names df)))
  (define keys-series
    (and has-keys?
         (make-series
          keys-to
          #:data (for*/vector ([seq (in-vector padded-data)]
                               [(key _) seq])
                   key))))
  (define values-series
    (make-series
     values-to
     #:data (for*/vector ([seq (in-vector padded-data)]
                          [(_ val) seq])
              val)))

  ;; take the lengths, and then pad each of the data to be that length, to match with
  ;; the newly expanded sequences
  (define old-series
    (for/list ([name (in-list everything-else)])
      (make-series name
                   #:data (apply vector-append
                                 (for/list ([val (in-data-frame df name)]
                                            [len (in-list lengths)])
                                   (make-vector len val))))))

  ;; add everything
  (define return-df (make-data-frame))
  (when has-keys?
    (df-add-series! return-df keys-series))
  (df-add-series! return-df values-series)
  (for ([v (in-list old-series)])
    (df-add-series! return-df v))

  (when (and remove? (not (equal? column-name values-to)))
    (df-del-series! return-df column-name))
  return-df)

;; takes a column of sequences, and turns it into multiple columns
;;   index-prefix : prefix of the generated names for list-like data
;;   remove?      : drop the original column afterwards
;; NOTE(review): for dictionary-like data the values are lined up by position,
;; not by key, so rows whose keys come in a different order end up under the
;; wrong column name -- confirm whether callers rely on this.
(define (unnest-wider df column-name
                      #:index-prefix [index-prefix "idx-"]
                      #:remove? [remove? #t])
  ;; determine if we are list-like or dictionary-like
  (define data (df-select df column-name))
  (define has-keys? (= (column-return-count 'unnest-wider data) 2))

  ;; if we are list-like, then just return the sequence
  ;; if we are dict-like, then get all the values
  ;; a missing entry (#f) is passed through untouched so that in-parallel*
  ;; can pad it; iterating it here would raise
  (define (get-values sequence)
    (if (and has-keys? sequence)
        (for/vector ([(_ v) sequence])
          v)
        sequence))

  ;; get the maximum length, so we know how many column names to generate
  ;; if we don't have keys
  (define len
    (for/fold ([cur-len 0])
              ([v (in-vector data)])
      (if v
          (max cur-len (sequence-length v))
          cur-len)))
  ;; if we don't have keys, make columns "idx-1", "idx-2", ...
  ;; if we do, the names are the keys in the order they are encountered
  ;; (missing entries contribute none)
  (define new-column-names
    (if has-keys?
        (for*/vector ([seq (in-vector data)]
                      #:when seq
                      [(k _) seq])
          k)
        (build-vector len (λ (x) (string-append index-prefix (number->string (add1 x)))))))

  ;; df-shallow-copy because the internals do not change
  (define return-df (df-shallow-copy df))
  ;; nothing to add when there are no names (no rows, or only missing/empty
  ;; entries); skipping also keeps in-parallel* from being applied to zero
  ;; sequences
  (unless (zero? (vector-length new-column-names))
    ;; get all values in parallel
    (define parallel-data
      (in-values-sequence (apply in-parallel* (map get-values (vector->list data)))))
    ;; for each sequence in the data and each column name, add that column
    (for ([name (in-vector new-column-names)]
          [data parallel-data])
      (df-add-series! return-df (make-series (~a name) #:data (apply vector data)))))
  (when remove?
    (df-del-series! return-df column-name))

  return-df)
176 | -------------------------------------------------------------------------------- /sawzall-lib/rename.rkt: -------------------------------------------------------------------------------- 1 | #lang racket/base 2 | (require data-frame 3 | fancy-app 4 | racket/sequence 5 | "grouped-df.rkt" 6 | "grouping.rkt") 7 | (provide rename) 8 |
;; renames columns of `df`; `args` alternates old and new names:
;;   (rename df "old-1" "new-1" "old-2" "new-2" ...)
(define (rename df . args)
  (grouped-df-apply (rename-df _ args) df))

;; performs the renames on a single (sub) data-frame and returns the copy.
;; raises if `args` does not pair up.
(define (rename-df df args)
  (when (not (even? (length args)))
    (error 'rename "column specified with nothing to rename to"))

  (define return-df (df-dumb-copy/sub df))
  (for ([rename-clause (in-slice 2 (in-list args))])
    (define from (car rename-clause))
    (define to (cadr rename-clause))

    ;; renaming a column to itself must be a no-op: adding the derived series
    ;; and then deleting `from` would otherwise delete the column outright
    (unless (equal? from to)
      (df-add-derived! return-df to (list from) (λ (x) (car x)))
      (df-del-series! return-df from)))
  return-df)
24 | -------------------------------------------------------------------------------- /sawzall-lib/reorder-df.rkt: -------------------------------------------------------------------------------- 1 | #lang racket/base 2 | (require data-frame 3 | fancy-app 4 | racket/vector 5 | "helpers.rkt") 6 | (provide reorder-df 7 | reorder-default) 8 | 9 | ;; sorts a data-frame with the given columns and comparators 10 | ;; breaks ties using subsequent columns 11 | ;; 12 | ;; XXX: this function is used internally a lot. maybe it shouldn't be! 13 | ;; Julia uses an index vector for grouping as metadata 14 | (define (reorder-df df pairs) 15 | (define row-count (df-row-count df)) 16 | 17 | (define index-vector (build-vector row-count (λ (x) x))) 18 | (for ([p (in-list (reverse pairs))]) 19 | (define col (car p)) 20 | (define cmp? (cdr p)) 21 | 22 | (define data (df-select df col)) 23 | (vector-sort! index-vector cmp? 
#:key (vector-ref data _))) 24 | 25 | ;; this is non-deterministic, but since we define it here, we can use it as a commonality 26 | ;; to zip the vectors later 27 | ;; 28 | ;; MOST TIME here is spent in `vector-reorder!` 29 | (define series (df-series-names df)) 30 | (define reordered-vecs 31 | (for/vector ([col (in-list series)]) 32 | (define data (df-select df col)) 33 | (vector-reorder! data index-vector) 34 | data)) 35 | 36 | (define return-df (make-data-frame)) 37 | (for ([name (in-list series)] 38 | [data (in-vector reordered-vecs)]) 39 | (df-add-series! return-df (make-series name #:data data))) 40 | 41 | return-df) 42 | 43 | ;; like the above, but defaults to orderable vector? (-> any/c any/c boolean?))] 10 | [reorder (-> (or/c data-frame? grouped-data-frame?) 11 | (or/c string? 12 | (cons/c string? (-> any/c any/c boolean?))) ... 13 | (or/c data-frame? grouped-data-frame?))])) 14 | 15 | (define (reorder df . to-sort) 16 | (define pairs 17 | (for/list ([v (in-list to-sort)]) 18 | (if (pair? v) 19 | v 20 | (cons v orderable* (data-frame? 10 | string? 11 | #:into (non-empty-listof (or/c string? #f))) 12 | (#:separator (or/c string? 13 | regexp? 14 | exact-nonnegative-integer? 15 | (listof 16 | exact-nonnegative-integer?)) 17 | #:remove? boolean? 18 | #:fill (or/c 'right 'left)) 19 | data-frame?)] 20 | [extract (->* (data-frame? 21 | string? 22 | #:into (non-empty-listof (or/c string? #f))) 23 | (#:regex regexp? 24 | #:remove? boolean?) 25 | data-frame?)] 26 | [unite (->* (data-frame? 27 | string? 28 | #:from (non-empty-listof string?)) 29 | (#:combine (-> any/c ... any/c) 30 | #:remove? boolean?) 
31 | data-frame?)])) 32 |
;; creates substrings from a given list of indices
;; example:
;;   (substrings "asdfghjkl" (list 1 3 5 7))
;;   => (vector "a" "sd" "fg" "hj" "kl")
;; offsets past the end of `s` are clamped to its length, so short strings
;; yield empty trailing pieces instead of making `substring` raise
(define (substrings s offset-list)
  (define len (string-length s))
  (define offsets
    (for/list ([off offset-list])
      (min off len)))
  (for/vector ([start (in-sequences (list 0) offsets)]
               [end (in-sequences offsets (list len))])
    (substring s start end)))

;; turns a column of strings into multiple columns of strings,
;; on some separator
;;   new-column-names : names of the output columns; #f skips that position
;;   separator        : regexp/string to split on, a single offset, or a list of offsets
;;   remove?          : drop the input column afterwards
;;   fill             : 'right pads short rows at the end, 'left at the start
(define (separate df column-name
                  #:into new-column-names
                  #:separator [separator #px"[^[:alnum:]]+"]
                  #:remove? [remove? #t]
                  #:fill [fill 'right])
  ;; splits one cell into a vector of pieces; missing cells stay #f
  (define (split str)
    (cond [(not str) #f]
          [(or (regexp? separator) (string? separator))
           (apply vector (regexp-split separator str))]
          [(number? separator) (substrings str (list separator))]
          [(list? separator) (substrings str separator)]
          [else #f]))
  ;; what to do with matches that don't match the rest
  (define (pad-or-truncate-vector vec target-len)
    (define input-len (vector-length vec))
    (cond [(> target-len input-len)
           (define return-vec (make-vector target-len #f))
           (define beg-idx
             (match fill
               ['left (- target-len input-len)]
               ['right 0]))
           (vector-copy! return-vec beg-idx vec)
           return-vec]
          [(< target-len input-len)
           (define return-vec (make-vector target-len #f))
           (vector-copy! return-vec 0 vec target-len)
           return-vec]
          [else vec]))

  (define data (df-select df column-name))
  (define split-up (vector-map split data))
  (define max-len
    (for/fold ([cur-max 0]) ; we are only dealing with positive exact integers
              ([v (in-vector split-up)]
               #:when v)
      (max cur-max (vector-length v))))
  ;; every row must be at least as wide as the number of requested columns,
  ;; otherwise the `vector-ref` below indexes past the end of the row when
  ;; #:into names more columns than any row splits into
  (define target-len (max max-len (length new-column-names)))
  (define padded (vector-map (? pad-or-truncate-vector _ target-len) split-up))

  (define return-df (df-shallow-copy df))
  (for ([(name idx) (in-indexed (in-list new-column-names))]
        #:when name)
    (df-add-series! return-df (make-series name #:data (vector-map (? vector-ref _ idx) padded))))
  ;; if an output column reuses the input column's name, the input has already
  ;; been overwritten; deleting it now would throw the new column away
  (when (and remove? (not (member column-name new-column-names)))
    (df-del-series! return-df column-name))
  return-df)

;; turns a column of strings into multiple columns of strings,
;; by using regular expression capturing groups
;;   new-column-names : one name per capturing group; #f skips that group
;;   regex            : pattern whose groups supply the values
;;   remove?          : drop the input column afterwards
;; raises when a match's number of groups differs from the number of names
(define (extract df column-name
                 #:into new-column-names
                 #:regex [regex #px"([[:alnum:]]+)"]
                 #:remove? [remove? #t])
  (define new-column-len (length new-column-names))
  (define (capture str)
    (define res (? cdr (regexp-match regex str)))
    (cond [res
           (when (not (= (length res) new-column-len))
             (error 'extract "too many/too few matches for input ~a under pattern ~a" str regex))
           (apply vector res)]
          [else #f]))

  (define data (df-select df column-name))
  (define split-up (vector-map (λ? capture) data))

  (define return-df (df-shallow-copy df))
  (for ([(name idx) (in-indexed (in-list new-column-names))]
        #:when name)
    (df-add-series! return-df (make-series name #:data (vector-map (? vector-ref _ idx) split-up))))
  ;; same guard as in `separate`: never delete a column that was just written
  (when (and remove? (not (member column-name new-column-names)))
    (df-del-series! return-df column-name))
  return-df)

;; turns multiple columns as specified by `to-combine` into one column,
;; by applying `combine-fn`
;;   column-name : name of the combined column
;;   to-combine  : names of the columns fed, in order, to `combine-fn`
;;   combine-fn  : by default joins the non-#f values with "_"
;;   remove?     : drop the input columns afterwards
(define (unite df column-name
               #:from to-combine
               #:combine [combine-fn (λ args (string-join (filter (λ (x) x) args) "_"))]
               #:remove? [remove? #t])
  (define new-series
    (make-series column-name
                 #:data (for/vector ([data (apply in-data-frame/list df to-combine)])
                          (apply combine-fn data))))

  (define return-df (df-shallow-copy df))
  (df-add-series! return-df new-series)
  (when remove?
    ;; the combined column may reuse the name of one of its inputs; that name
    ;; now refers to the new series and must survive
    (for ([v (in-list to-combine)]
          #:unless (equal? v column-name))
      (df-del-series! return-df v)))
  return-df)
134 | -------------------------------------------------------------------------------- /sawzall-lib/slice-spec.rkt: -------------------------------------------------------------------------------- 1 | #lang racket/base 2 | (require (for-syntax racket/base) 3 | data-frame 4 | fancy-app 5 | racket/contract/base 6 | racket/function 7 | racket/match 8 | racket/set 9 | racket/sequence 10 | racket/string 11 | syntax/parse/define) 12 | (provide (for-syntax slice-spec) 13 | exec-spec exec-spec-on-df 14 | everything starting-with ending-with containing 15 | and or not 16 | all-in any-in 17 | 18 | everything$ starting-with$ ending-with$ containing$ 19 | and$ or$ not$ 20 | all-in$ any-in$) 21 | 22 | ;; a Slice-Spec is one of: 23 | ;; - String 24 | ;; - Regex 25 | ;; - everything ; all variables 26 | ;; - [String ...] ; multiple variables 27 | ;; - (all-in [Sequenceof String]) ; all from string sequence 28 | ;; - (any-in [Sequenceof String]) ; any from string sequence 29 | ;; - (or Slice-Spec ...) ; union 30 | ;; - (and Slice-Spec ...) 
; intersection 31 | ;; - (not Slice-Spec) ; complement 32 | ;; - (starting-with String) 33 | ;; - (ending-with String) 34 | ;; - (containing String)
;; run-time representation of a parsed Slice-Spec: one struct per compound
;; form of the grammar above (plain strings and regexps represent themselves)
(struct everything$ () #:transparent)
(struct multi-var$ (names) #:transparent)
(struct any-in$ (sequence) #:transparent)
(struct all-in$ (sequence) #:transparent)
(struct or$ (slice-specs) #:transparent)
(struct and$ (slice-specs) #:transparent)
(struct not$ (slice-spec) #:transparent)
(struct starting-with$ (prefix) #:transparent)
(struct ending-with$ (suffix) #:transparent)
(struct containing$ (substring) #:transparent)

;; defines each `name` as syntax that is an error wherever it is expanded;
;; the names exist so the syntax class below can match them as literals
(define-syntax-parse-rule (define-dummy-stxes name:id ...+)
  (begin
    (define-syntax (name stx)
      (raise-syntax-error #f "cannot be used outside of a slice specification" stx))
    ...))

(define-dummy-stxes everything starting-with ending-with containing all-in any-in)

(begin-for-syntax
  ;; syntax class recognising a Slice-Spec; its `parsed` attribute is the
  ;; expression that builds the matching run-time value. sub-expressions are
  ;; wrapped with `expr/c`, so the contract written in each #:declare applies
  ;; to them
  (define-syntax-class slice-spec
    #:attributes (parsed)
    #:literals (everything
                starting-with ending-with containing
                or and not
                all-in any-in)
    [pattern everything
             #:with parsed #'(everything$)]
    [pattern (starting-with prefix)
             #:declare prefix (expr/c #'string?)
             #:with parsed #'(starting-with$ prefix.c)]
    [pattern (ending-with suffix)
             #:declare suffix (expr/c #'string?)
             #:with parsed #'(ending-with$ suffix.c)]
    [pattern (containing substr)
             #:declare substr (expr/c #'string?)
             #:with parsed #'(containing$ substr.c)]
    [pattern (or spec:slice-spec ...+)
             #:with parsed #'(or$ (list spec.parsed ...))]
    [pattern (and spec:slice-spec ...+)
             #:with parsed #'(and$ (list spec.parsed ...))]
    [pattern (not spec:slice-spec)
             #:with parsed #'(not$ spec.parsed)]
    [pattern (all-in sequence)
             #:declare sequence (expr/c #'(sequence/c string?))
             #:with parsed #'(all-in$ sequence.c)]
    [pattern (any-in sequence)
             #:declare sequence (expr/c #'(sequence/c string?))
             #:with parsed #'(any-in$ sequence.c)]
    ;; a parenthesised run of string literals selects exactly those variables
    [pattern [var:string ...+]
             #:with parsed #'(multi-var$ (list var ...))]
    ;; anything else is an expression that must produce a string or a regexp
    [pattern var
             #:declare var (expr/c #'(or/c string? regexp?))
             #:with parsed #'var.c]))

;; evaluates `parsed-spec` against `universe` (the available variable names)
;; and returns the selected names.
;; raises when a string, multi-variable or all-in spec names something outside
;; the universe, and when `parsed-spec` is not a recognised specification;
;; any-in silently ignores names outside the universe.
(define (exec-spec universe parsed-spec)
  (define (in-universe? var)
    (set-member? universe var))
  (match parsed-spec
    [(? string? var)
     (when (not (in-universe? var))
       (error 'exec-spec "selection not in universe: ~a" var))
     (list var)]
    [(? regexp? rx) (filter (curry regexp-match? rx) universe)]
    [(everything$) universe]
    [(starting-with$ pref) (filter (string-prefix? _ pref) universe)]
    [(ending-with$ suff) (filter (string-suffix? _ suff) universe)]
    [(containing$ substr) (filter (string-contains? _ substr) universe)]
    ;; compound specs recur on their parts and combine the results as sets
    [(or$ specs) (apply set-union (map (curry exec-spec universe) specs))]
    [(and$ specs) (apply set-intersect (map (curry exec-spec universe) specs))]
    [(not$ spec) (set-subtract universe (exec-spec universe spec))]
    [(all-in$ sequence)
     (define lst (sequence->list sequence))
     (when (not (andmap in-universe? lst))
       (error 'exec-spec "some selection(s) not in universe: ~a" lst))
     lst]
    [(any-in$ sequence)
     (sequence->list (sequence-filter in-universe? sequence))]
    [(multi-var$ vars)
     (when (not (andmap in-universe? vars))
       (error 'exec-spec "some selection(s) not in universe: ~a" vars))
     (apply list vars)]
    [_ (error 'exec-spec "invalid slice specification: ~a" parsed-spec)]))

;; convenience wrapper: the universe is the set of series names of `df`
(define (exec-spec-on-df df parsed-spec)
  (exec-spec (df-series-names df) parsed-spec))
121 | -------------------------------------------------------------------------------- /sawzall-lib/slice.rkt: -------------------------------------------------------------------------------- 1 | #lang racket/base 2 | (require (for-syntax racket/base) 3 | data-frame 4 | fancy-app 5 | racket/contract/base 6 | racket/match 7 | racket/set 8 | syntax/parse/define 9 | "grouped-df.rkt" 10 | "grouping.rkt" 11 | "slice-spec.rkt") 12 | (provide slice 13 | everything starting-with ending-with containing 14 | and or not 15 | all-in any-in 16 | 17 | take-rows) 18 |
;; builds a new data-frame holding copies of the series of `df` selected by
;; `parsed-spec`. `groups` is the list of grouping variables handed over by
;; the caller.
;; NOTE(review): the check raises when the selection *contains* a grouping
;; variable, while the message talks about removing one -- presumably the
;; caller strips grouping columns before calling and re-attaches them
;; afterwards; confirm against ignore-groups-apply in grouping.rkt.
(define (slice-df df parsed-spec groups)
  (define to-copy (exec-spec-on-df df parsed-spec))
  (when (not (set-empty? (set-intersect to-copy groups)))
    (error 'slice "cannot remove grouping variable from grouped data-frame"))

  (define return-df (make-data-frame))

  (for ([s (in-list to-copy)])
    (df-add-series! return-df (df-duplicate-series df s)))
  return-df)
29 | 30 | (define-syntax-parse-rule (slice df spec:slice-spec) 31 | #:declare df (expr/c #'(or/c data-frame? grouped-data-frame?)) 32 | (ignore-groups-apply (λ (x grps) (slice-df x spec.parsed grps)) df.c #:pass-groups? 
#t)) 33 | 34 |
;; keeps the rows in the range `beg`..`end` of `df`; on a grouped data-frame
;; the range is applied to every group separately
(define (take-rows df beg end)
  (grouped-df-apply (λ (x) (take-rows-df x beg end)) df))

;; takes the rows `beg`..`end`, counted from the start of the sub-frame's own
;; interval, and copies them out.
;; both bounds are clamped to the end of that interval: without the clamp a
;; range longer than the group silently read rows belonging to whatever
;; follows it in the underlying frame.
(define (take-rows-df df beg end)
  (match-define (sub-data-frame int-df (ivl b e)) df)
  (define new-beg (min (+ beg b) e))
  (define new-end (min (+ end b) e))
  (df-dumb-copy/sub (df-with-ivl int-df (ivl new-beg new-end))))
40 | -------------------------------------------------------------------------------- /sawzall-lib/split.rkt: -------------------------------------------------------------------------------- 1 | #lang racket/base 2 | (require data-frame 3 | fancy-app 4 | racket/contract/base 5 | racket/set 6 | racket/vector 7 | "helpers.rkt" 8 | "reorder-df.rkt") 9 | (provide 10 | (contract-out [split-with (-> data-frame? string? (listof data-frame?))] 11 | [combine (->* () #:rest (non-empty-listof data-frame?) data-frame?)]) 12 | split-with-possibility) 13 | 14 | ; splitting, and recording the possibility 15 | (define (split-with-possibility df group) 16 | (define sorted-df (reorder-df df (list (cons group orderabledatum x) #t)) (flatten (attribute binding.ty)))) 30 | (raise-syntax-error (syntax->datum (attribute internal-function)) 31 | "types should not be specified here")) 32 | #'(internal-function 33 | frame.c 34 | (column-proc (list (symbol->string 'col) ...) 35 | (list (list (cons (symbol->string 'binding.var) 36 | 'binding.ty) 37 | ...) ...) 38 | (list (λ (binding.var ...) 39 | body ...) 40 | ...)))])) 41 |
;; a row-wise procedure: `bindings` is the list of column names (strings)
;; whose values are passed, in order, to `proc`
(struct row-proc (bindings proc))

;; shared expander for row-wise forms of the shape
;;   (form frame (bound ...) body ...)
;; it expands into a call of `internal-function-stx` with the
;; contract-checked frame and a `row-proc` whose bindings are the bound
;; identifiers' names
(define-for-syntax (row-syntax-form stx internal-function-stx)
  (syntax-parse stx
    [(_ frame (bound:id ...) body:expr ...)
     #:declare frame (expr/c #'(or/c data-frame? grouped-data-frame?))
     #:with internal-function internal-function-stx
     #'(internal-function
        frame.c
        (row-proc (list (symbol->string 'bound) ...)
                  (λ (bound ...)
                    body ...)))]))
54 | -------------------------------------------------------------------------------- /sawzall-lib/where.rkt: -------------------------------------------------------------------------------- 1 | #lang racket/base 2 | (require (for-syntax racket/base) 3 | data-frame 4 | fancy-app 5 | racket/contract/base 6 | racket/match 7 | racket/set 8 | syntax/parse/define 9 | "grouped-df.rkt" 10 | "grouping.rkt" 11 | "syntax.rkt") 12 | (provide where where* deduplicate 13 | where/int) 14 |
;; (where df (col ...) body ...): keeps the rows for which `body` is true,
;; with each `col` bound to that row's value of the column of the same name
(define-syntax (where stx)
  (row-syntax-form stx #'where/int))

;; procedural back end of `where`; `proc` is a row-proc
(define (where/int df proc)
  (grouped-df-apply (where-df _ proc) df))

;; filters a single sub-data-frame: collects the indices of the rows accepted
;; by the predicate, then rebuilds every series from just those rows
(define (where-df df proc)
  (define internal-df (sub-data-frame-delegate-frame df))
  (match-define (row-proc binder f?) proc)

  (define return-df (make-data-frame))
  (define indices
    (for*/list ([(vs idx) (in-indexed (apply in-data-frame/list/sub df binder))]
                #:when (apply f? vs))
      idx))
  (for ([name (in-list (df-series-names internal-df))])
    (df-add-series! return-df
                    (make-series name #:data (for/vector ([idx (in-list indices)])
                                               (df-ref/sub df idx name)))))
  return-df)

;; (where* df (name ...) (pat ...)): keeps the rows whose values for the
;; named columns match the corresponding `match` patterns
(define-syntax-parse-rule (where* df (name:id ...) (pat:expr ...))
  #:declare df (expr/c #'(or/c data-frame? grouped-data-frame?))
  #:fail-when (not (= (length (attribute name)) (length (attribute pat))))
  "number of names must be the same as the number of match patterns"
  (where df.c (name ...)
         (match (list name ...)
           [(list pat ...) #t]
           [_ #f])))
44 | 45 | (define-syntax-parse-rule (deduplicate df fld:id ...+) 46 | #:declare df (expr/c #'(or/c data-frame? grouped-data-frame?)) 47 | (grouped-df-apply 48 | (λ (sub-df) 49 | (define seen-set (mutable-set)) 50 | (define (seen? v) (set-member? seen-set v)) 51 | (define (add-seen v) (set-add! seen-set v)) 52 | (define (filter-seen v) 53 | (cond 54 | [(seen? 
v) #f] 55 | [else (add-seen v) #t])) 56 | (where/int sub-df (row-proc (list (symbol->string 'fld) ...) 57 | (λ (fld ...) (filter-seen (list fld ...)))))) 58 | df.c)) 59 | -------------------------------------------------------------------------------- /sawzall-test/aggregating-test.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | (require data-frame 3 | racket/runtime-path 4 | rackunit 5 | sawzall 6 | threading 7 | "test-data.rkt" 8 | "util.rkt") 9 |
;; each fixture below is paired with a `-result` frame (or a CSV file) that
;; the test submodule at the end of the file compares it against

;; aggregate to a scalar
(define aggregate-1 (aggregate woodland1 [N (site) (vector-length site)]))
(define aggregate-1-result
  (row-df [N]
          3))

;; aggregate by folding the whole column into one string
(define aggregate-2
  (aggregate woodland1
             [combined (habitat)
                       (for/fold ([str ""])
                                 ([h (in-vector habitat)])
                         (string-append str " " h))]))
(define aggregate-2-result
  (row-df [combined]
          " grassland meadow woodland"))

;; aggregating with groups
;; sums a vector, skipping #f entries
(define (sum vec) (for/sum ([v (in-vector vec)] #:when v) v))

;; implicit ungroup
(define aggregate-3
  (~> docs1
      (group-with "grp")
      (aggregate [adult-sum (adult) (sum adult)]
                 [juv-sum (juv) (sum juv)])))
(define aggregate-3-result
  (row-df [grp adult-sum juv-sum]
          "a" 3 30
          "b" 12 120))

;; needs explicit ungroup
(define aggregate-4
  (~> docs1
      (group-with "grp" "trt")
      (aggregate [adult-sum (adult) (sum adult)]
                 [juv-sum (juv) (sum juv)])
      ungroup))
(define aggregate-4-result
  (row-df [grp trt adult-sum juv-sum]
          "a" "a" 1 10
          "a" "b" 2 20
          "b" "a" 3 30
          "b" "b" 9 90))

;; gss-sm aggregation
(define-runtime-path aggregate-gss-1-data "./results/aggregate_gss_1.csv")
(define aggregate-gss-1
  (~> gss-sm
      (group-with "bigregion" "religion")
      (aggregate [N (religion) (vector-length religion)])
      ungroup))

(module+ test
  (check data-frame~=? aggregate-1 aggregate-1-result)
  (check data-frame~=? aggregate-2 aggregate-2-result)
  (check data-frame~=? aggregate-3 aggregate-3-result)
  (check data-frame~=? aggregate-4 aggregate-4-result)

  (check-csv aggregate-gss-1 aggregate-gss-1-data))
69 | -------------------------------------------------------------------------------- /sawzall-test/combining-join-test.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | (require rackunit 3 | sawzall 4 | "test-data.rkt" 5 | "util.rkt") 6 |
;; left join: the expected frame keeps a "site" with no counterpart and
;; fills its other columns with #f
(define left-join-1 (left-join woodland1 woodland2 "site"))
(define left-join-1-result
  (row-df [day catch site habitat]
          #f #f "a" "meadow"
          1 12 "b" "grassland"
          2 24 "b" "grassland"
          1 10 "c" "woodland"
          2 20 "c" "woodland"))

;; the same join with the arguments swapped
(define left-join-2 (left-join woodland2 woodland1 "site"))
(define left-join-2-result
  (row-df [habitat catch site day]
          "grassland" 12 "b" 1
          "grassland" 24 "b" 2
          "woodland" 10 "c" 1
          "woodland" 20 "c" 2))

;; duplicate keys on the left, given out of order
(define left-join-3
  (left-join (row-df [key val-x]
                     1 "x1"
                     2 "x2"
                     2 "x3"
                     1 "x4")
             (row-df [key val-y]
                     1 "y1"
                     2 "y2")
             "key"))
(define left-join-3-result
  (row-df [key val-x val-y]
          1 "x1" "y1"
          1 "x4" "y1"
          2 "x2" "y2"
          2 "x3" "y2"))

;; duplicate keys on both sides: every pairing is expected
(define left-join-4
  (left-join (row-df [key val-x]
                     1 "x1"
                     2 "x2"
                     2 "x3"
                     3 "x4")
             (row-df [key val-y]
                     1 "y1"
                     2 "y2"
                     2 "y3"
                     3 "y4")
             "key"))
(define left-join-4-result
  (row-df [key val-x val-y]
          1 "x1" "y1"
          2 "x2" "y2"
          2 "x2" "y3"
          2 "x3" "y2"
          2 "x3" "y3"
          3 "x4" "y4"))

;; join by multiple variables
(define left-join-5 (left-join ball1 ball2 "first" "last"))
(define left-join-5-result
  (row-df [first last age game goals]
          "bob" "ert" 20 1 1
          "bob" "ert" 20 2 3
          "dan" "man" 40 1 2
          "sam" "jam" 30 #f #f
          "sam" "son" 10 1 0))

;; left-join-6a omits the join columns and is checked against the same
;; expected frame as left-join-6
(define left-join-6 (left-join ball2 ball1 "first" "last"))
(define left-join-6a (left-join ball2 ball1))
(define left-join-6-result
  (row-df [first last game goals age]
          "bob" "ert" 1 1 20
          "bob" "ert" 2 3 20
          "dan" "man" 1 2 40
          "sam" "son" 1 0 10))

;; right joins are checked against the left join with swapped arguments
(define right-join-1 (right-join woodland2 woodland1 "site"))
(define right-join-1-result left-join-1-result)

(define right-join-2 (right-join woodland1 woodland2 "site"))
(define right-join-2-result left-join-2-result)

;; inner join: only keys present on both sides are expected
(define inner-join-1 (inner-join woodland1 woodland2 "site"))
(define inner-join-1-result
  (row-df [day catch site habitat]
          1 12 "b" "grassland"
          2 24 "b" "grassland"
          1 10 "c" "woodland"
          2 20 "c" "woodland"))

(define inner-join-2
  (inner-join (row-df [key val-x]
                      1 "x1"
                      2 "x2"
                      3 "x3")
              (row-df [key val-y]
                      1 "y1"
                      2 "y2"
                      4 "y3")
              "key"))
(define inner-join-2-result
  (row-df [key val-x val-y]
          1 "x1" "y1"
          2 "x2" "y2"))

;; full join: keys from either side are expected
(define full-join-1 (full-join woodland2 woodland1 "site"))
(define full-join-1-result
  (row-df [day catch site habitat]
          #f #f "a" "meadow"
          1 12 "b" "grassland"
          2 24 "b" "grassland"
          1 10 "c" "woodland"
          2 20 "c" "woodland"))

(module+ test
  (check data-frame~=? left-join-1 left-join-1-result)
  (check data-frame~=? left-join-2 left-join-2-result)
  (check data-frame~=? left-join-3 left-join-3-result)
  (check data-frame~=? left-join-4 left-join-4-result)
  (check data-frame~=? left-join-5 left-join-5-result)
  (check data-frame~=? left-join-6 left-join-6-result)
  (check data-frame~=? left-join-6a left-join-6-result)

  (check data-frame~=? right-join-1 right-join-1-result)
  (check data-frame~=? right-join-2 right-join-2-result)

  (check data-frame~=? inner-join-1 inner-join-1-result)
  (check data-frame~=? inner-join-2 inner-join-2-result)

  (check data-frame~=? full-join-1 full-join-1-result))
135 | -------------------------------------------------------------------------------- /sawzall-test/create-test.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | (require rackunit 3 | sawzall 4 | "test-data.rkt" 5 | "util.rkt") 6 |
;; these really should be provided by sawzall, tbh
;; v/ divides every element of a vector by c; sum adds up a vector
(define (v/ vec c) (vector-map (λ (v) (/ v c)) vec))
(define (sum vec) (for/sum ([v (in-vector vec)]) v))

;; simple element mapping
(define create-1 (create docs1 [egrp (grp) (string-append "e" grp)]))
(define create-1-result
  (column-df [grp #("a" "a" "b" "b" "b")]
             [egrp #("ea" "ea" "eb" "eb" "eb")]
             [trt #("a" "b" "a" "b" "b")]
             [adult #(1 2 3 4 5)]
             [juv #(10 20 30 40 50)]))

;; overwriting an existing column
(define create-2 (create docs1 [grp (grp) (string-append "e" grp)]))
(define create-2-result
  (column-df [grp #("ea" "ea" "eb" "eb" "eb")]
             [trt #("a" "b" "a" "b" "b")]
             [adult #(1 2 3 4 5)]
             [juv #(10 20 30 40 50)]))

;; using a vector as an input, and returning a vector as output
(define create-3 (create docs1 [norm-juv ([juv : vector]) (v/ juv (sum juv))]))
(define create-3-result
  (column-df [grp #("a" "a" "b" "b" "b")]
             [trt #("a" "b" "a" "b" "b")]
             [adult #(1 2 3 4 5)]
             [juv #(10 20 30 40 50)]
             [norm-juv #(1/15 2/15 1/5 4/15 1/3)]))

;; using a vector and a scalar as input, and mapping over, keeping the vector constant
(define create-4
  (create docs1 [what ([adult : element] [juv : vector]) (/ adult (sum juv))]))
(define create-4-result
  (column-df [grp #("a" "a" "b" "b" "b")]
             [trt #("a" "b" "a" "b" "b")]
             [adult #(1 2 3 4 5)]
             [juv #(10 20 30 40 50)]
             [what #(1/150 1/75 1/50 2/75 1/30)]))

;; multiple clauses, mixing up all of the above
(define create-5
  (create docs1
          [grp (grp) (string-append "e" grp)]
          [norm-juv ([juv : vector]) (v/ juv (sum juv))]
          [what ([adult : element] [juv : vector]) (/ adult (sum juv))]))
(define create-5-result
  (column-df [grp #("ea" "ea" "eb" "eb" "eb")]
             [trt #("a" "b" "a" "b" "b")]
             [adult #(1 2 3 4 5)]
             [juv #(10 20 30 40 50)]
             [norm-juv #(1/15 2/15 1/5 4/15 1/3)]
             [what #(1/150 1/75 1/50 2/75 1/30)]))

;; clauses can depend on ones before them
(define create-6
  (create docs1
          [grp (grp) (string-append "e" grp)]
          [trt (trt grp) (string-append trt grp)]))
(define create-6-result
  (column-df [grp #("ea" "ea" "eb" "eb" "eb")]
             [trt #("aea" "bea" "aeb" "beb" "beb")]
             [adult #(1 2 3 4 5)]
             [juv #(10 20 30 40 50)]))
71 | 72 | (module+ test 73 | (check data-frame~=? create-1 create-1-result) 74 | (check data-frame~=? create-2 create-2-result) 75 | (check data-frame~=? create-3 create-3-result) 76 | (check data-frame~=? create-4 create-4-result) 77 | (check data-frame~=? create-5 create-5-result) 78 | (check data-frame~=? 
create-6 create-6-result)) 79 | -------------------------------------------------------------------------------- /sawzall-test/data/iris.csv: -------------------------------------------------------------------------------- 1 | "","Sepal.Length","Sepal.Width","Petal.Length","Petal.Width","Species" 2 | "1",5.1,3.5,1.4,0.2,"setosa" 3 | "2",4.9,3,1.4,0.2,"setosa" 4 | "3",4.7,3.2,1.3,0.2,"setosa" 5 | "4",4.6,3.1,1.5,0.2,"setosa" 6 | "5",5,3.6,1.4,0.2,"setosa" 7 | "6",5.4,3.9,1.7,0.4,"setosa" 8 | "7",4.6,3.4,1.4,0.3,"setosa" 9 | "8",5,3.4,1.5,0.2,"setosa" 10 | "9",4.4,2.9,1.4,0.2,"setosa" 11 | "10",4.9,3.1,1.5,0.1,"setosa" 12 | "11",5.4,3.7,1.5,0.2,"setosa" 13 | "12",4.8,3.4,1.6,0.2,"setosa" 14 | "13",4.8,3,1.4,0.1,"setosa" 15 | "14",4.3,3,1.1,0.1,"setosa" 16 | "15",5.8,4,1.2,0.2,"setosa" 17 | "16",5.7,4.4,1.5,0.4,"setosa" 18 | "17",5.4,3.9,1.3,0.4,"setosa" 19 | "18",5.1,3.5,1.4,0.3,"setosa" 20 | "19",5.7,3.8,1.7,0.3,"setosa" 21 | "20",5.1,3.8,1.5,0.3,"setosa" 22 | "21",5.4,3.4,1.7,0.2,"setosa" 23 | "22",5.1,3.7,1.5,0.4,"setosa" 24 | "23",4.6,3.6,1,0.2,"setosa" 25 | "24",5.1,3.3,1.7,0.5,"setosa" 26 | "25",4.8,3.4,1.9,0.2,"setosa" 27 | "26",5,3,1.6,0.2,"setosa" 28 | "27",5,3.4,1.6,0.4,"setosa" 29 | "28",5.2,3.5,1.5,0.2,"setosa" 30 | "29",5.2,3.4,1.4,0.2,"setosa" 31 | "30",4.7,3.2,1.6,0.2,"setosa" 32 | "31",4.8,3.1,1.6,0.2,"setosa" 33 | "32",5.4,3.4,1.5,0.4,"setosa" 34 | "33",5.2,4.1,1.5,0.1,"setosa" 35 | "34",5.5,4.2,1.4,0.2,"setosa" 36 | "35",4.9,3.1,1.5,0.2,"setosa" 37 | "36",5,3.2,1.2,0.2,"setosa" 38 | "37",5.5,3.5,1.3,0.2,"setosa" 39 | "38",4.9,3.6,1.4,0.1,"setosa" 40 | "39",4.4,3,1.3,0.2,"setosa" 41 | "40",5.1,3.4,1.5,0.2,"setosa" 42 | "41",5,3.5,1.3,0.3,"setosa" 43 | "42",4.5,2.3,1.3,0.3,"setosa" 44 | "43",4.4,3.2,1.3,0.2,"setosa" 45 | "44",5,3.5,1.6,0.6,"setosa" 46 | "45",5.1,3.8,1.9,0.4,"setosa" 47 | "46",4.8,3,1.4,0.3,"setosa" 48 | "47",5.1,3.8,1.6,0.2,"setosa" 49 | "48",4.6,3.2,1.4,0.2,"setosa" 50 | "49",5.3,3.7,1.5,0.2,"setosa" 51 | "50",5,3.3,1.4,0.2,"setosa" 
52 | "51",7,3.2,4.7,1.4,"versicolor" 53 | "52",6.4,3.2,4.5,1.5,"versicolor" 54 | "53",6.9,3.1,4.9,1.5,"versicolor" 55 | "54",5.5,2.3,4,1.3,"versicolor" 56 | "55",6.5,2.8,4.6,1.5,"versicolor" 57 | "56",5.7,2.8,4.5,1.3,"versicolor" 58 | "57",6.3,3.3,4.7,1.6,"versicolor" 59 | "58",4.9,2.4,3.3,1,"versicolor" 60 | "59",6.6,2.9,4.6,1.3,"versicolor" 61 | "60",5.2,2.7,3.9,1.4,"versicolor" 62 | "61",5,2,3.5,1,"versicolor" 63 | "62",5.9,3,4.2,1.5,"versicolor" 64 | "63",6,2.2,4,1,"versicolor" 65 | "64",6.1,2.9,4.7,1.4,"versicolor" 66 | "65",5.6,2.9,3.6,1.3,"versicolor" 67 | "66",6.7,3.1,4.4,1.4,"versicolor" 68 | "67",5.6,3,4.5,1.5,"versicolor" 69 | "68",5.8,2.7,4.1,1,"versicolor" 70 | "69",6.2,2.2,4.5,1.5,"versicolor" 71 | "70",5.6,2.5,3.9,1.1,"versicolor" 72 | "71",5.9,3.2,4.8,1.8,"versicolor" 73 | "72",6.1,2.8,4,1.3,"versicolor" 74 | "73",6.3,2.5,4.9,1.5,"versicolor" 75 | "74",6.1,2.8,4.7,1.2,"versicolor" 76 | "75",6.4,2.9,4.3,1.3,"versicolor" 77 | "76",6.6,3,4.4,1.4,"versicolor" 78 | "77",6.8,2.8,4.8,1.4,"versicolor" 79 | "78",6.7,3,5,1.7,"versicolor" 80 | "79",6,2.9,4.5,1.5,"versicolor" 81 | "80",5.7,2.6,3.5,1,"versicolor" 82 | "81",5.5,2.4,3.8,1.1,"versicolor" 83 | "82",5.5,2.4,3.7,1,"versicolor" 84 | "83",5.8,2.7,3.9,1.2,"versicolor" 85 | "84",6,2.7,5.1,1.6,"versicolor" 86 | "85",5.4,3,4.5,1.5,"versicolor" 87 | "86",6,3.4,4.5,1.6,"versicolor" 88 | "87",6.7,3.1,4.7,1.5,"versicolor" 89 | "88",6.3,2.3,4.4,1.3,"versicolor" 90 | "89",5.6,3,4.1,1.3,"versicolor" 91 | "90",5.5,2.5,4,1.3,"versicolor" 92 | "91",5.5,2.6,4.4,1.2,"versicolor" 93 | "92",6.1,3,4.6,1.4,"versicolor" 94 | "93",5.8,2.6,4,1.2,"versicolor" 95 | "94",5,2.3,3.3,1,"versicolor" 96 | "95",5.6,2.7,4.2,1.3,"versicolor" 97 | "96",5.7,3,4.2,1.2,"versicolor" 98 | "97",5.7,2.9,4.2,1.3,"versicolor" 99 | "98",6.2,2.9,4.3,1.3,"versicolor" 100 | "99",5.1,2.5,3,1.1,"versicolor" 101 | "100",5.7,2.8,4.1,1.3,"versicolor" 102 | "101",6.3,3.3,6,2.5,"virginica" 103 | "102",5.8,2.7,5.1,1.9,"virginica" 104 | 
"103",7.1,3,5.9,2.1,"virginica" 105 | "104",6.3,2.9,5.6,1.8,"virginica" 106 | "105",6.5,3,5.8,2.2,"virginica" 107 | "106",7.6,3,6.6,2.1,"virginica" 108 | "107",4.9,2.5,4.5,1.7,"virginica" 109 | "108",7.3,2.9,6.3,1.8,"virginica" 110 | "109",6.7,2.5,5.8,1.8,"virginica" 111 | "110",7.2,3.6,6.1,2.5,"virginica" 112 | "111",6.5,3.2,5.1,2,"virginica" 113 | "112",6.4,2.7,5.3,1.9,"virginica" 114 | "113",6.8,3,5.5,2.1,"virginica" 115 | "114",5.7,2.5,5,2,"virginica" 116 | "115",5.8,2.8,5.1,2.4,"virginica" 117 | "116",6.4,3.2,5.3,2.3,"virginica" 118 | "117",6.5,3,5.5,1.8,"virginica" 119 | "118",7.7,3.8,6.7,2.2,"virginica" 120 | "119",7.7,2.6,6.9,2.3,"virginica" 121 | "120",6,2.2,5,1.5,"virginica" 122 | "121",6.9,3.2,5.7,2.3,"virginica" 123 | "122",5.6,2.8,4.9,2,"virginica" 124 | "123",7.7,2.8,6.7,2,"virginica" 125 | "124",6.3,2.7,4.9,1.8,"virginica" 126 | "125",6.7,3.3,5.7,2.1,"virginica" 127 | "126",7.2,3.2,6,1.8,"virginica" 128 | "127",6.2,2.8,4.8,1.8,"virginica" 129 | "128",6.1,3,4.9,1.8,"virginica" 130 | "129",6.4,2.8,5.6,2.1,"virginica" 131 | "130",7.2,3,5.8,1.6,"virginica" 132 | "131",7.4,2.8,6.1,1.9,"virginica" 133 | "132",7.9,3.8,6.4,2,"virginica" 134 | "133",6.4,2.8,5.6,2.2,"virginica" 135 | "134",6.3,2.8,5.1,1.5,"virginica" 136 | "135",6.1,2.6,5.6,1.4,"virginica" 137 | "136",7.7,3,6.1,2.3,"virginica" 138 | "137",6.3,3.4,5.6,2.4,"virginica" 139 | "138",6.4,3.1,5.5,1.8,"virginica" 140 | "139",6,3,4.8,1.8,"virginica" 141 | "140",6.9,3.1,5.4,2.1,"virginica" 142 | "141",6.7,3.1,5.6,2.4,"virginica" 143 | "142",6.9,3.1,5.1,2.3,"virginica" 144 | "143",5.8,2.7,5.1,1.9,"virginica" 145 | "144",6.8,3.2,5.9,2.3,"virginica" 146 | "145",6.7,3.3,5.7,2.5,"virginica" 147 | "146",6.7,3,5.2,2.3,"virginica" 148 | "147",6.3,2.5,5,1.9,"virginica" 149 | "148",6.5,3,5.2,2,"virginica" 150 | "149",6.2,3.4,5.4,2.3,"virginica" 151 | "150",5.9,3,5.1,1.8,"virginica" 152 | -------------------------------------------------------------------------------- /sawzall-test/data/relig_income.csv: 
-------------------------------------------------------------------------------- 1 | "","religion","<$10k","$10-20k","$20-30k","$30-40k","$40-50k","$50-75k","$75-100k","$100-150k",">150k","Don't know/refused" 2 | "1","Agnostic",27,34,60,81,76,137,122,109,84,96 3 | "2","Atheist",12,27,37,52,35,70,73,59,74,76 4 | "3","Buddhist",27,21,30,34,33,58,62,39,53,54 5 | "4","Catholic",418,617,732,670,638,1116,949,792,633,1489 6 | "5","Don’t know/refused",15,14,15,11,10,35,21,17,18,116 7 | "6","Evangelical Prot",575,869,1064,982,881,1486,949,723,414,1529 8 | "7","Hindu",1,9,7,9,11,34,47,48,54,37 9 | "8","Historically Black Prot",228,244,236,238,197,223,131,81,78,339 10 | "9","Jehovah's Witness",20,27,24,24,21,30,15,11,6,37 11 | "10","Jewish",19,19,25,25,30,95,69,87,151,162 12 | "11","Mainline Prot",289,495,619,655,651,1107,939,753,634,1328 13 | "12","Mormon",29,40,48,51,56,112,85,49,42,69 14 | "13","Muslim",6,7,9,10,9,23,16,8,6,22 15 | "14","Orthodox",13,17,23,32,32,47,38,42,46,73 16 | "15","Other Christian",9,7,11,13,13,14,18,14,12,18 17 | "16","Other Faiths",20,33,40,46,49,63,46,40,41,71 18 | "17","Other World Religions",5,2,3,4,2,7,3,4,4,8 19 | "18","Unaffiliated",217,299,374,365,341,528,407,321,258,597 20 | -------------------------------------------------------------------------------- /sawzall-test/filtering-test.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | (require data-frame 3 | racket/runtime-path 4 | rackunit 5 | sawzall 6 | threading 7 | "test-data.rkt" 8 | "util.rkt") 9 | 10 | ;; super basic filter 11 | (define where-1 (where woodland2 (site) (string=? site "c"))) 12 | (define where-1-result 13 | (row-df [site day catch] 14 | "c" 1 10 15 | "c" 2 20)) 16 | 17 | ;; binding multiple variables 18 | (define where-2 (where woodland2 (site day catch) 19 | (and (= day 2) (= catch 20) (string=? 
site "c")))) 20 | (define where-2-result 21 | (row-df [site day catch] 22 | "c" 2 20)) 23 | 24 | ;; no matches 25 | (define where-3 (where woodland2 (site) (string=? site "canada"))) ; not a real place 26 | (define where-3-result 27 | (column-df [site #()] 28 | [day #()] 29 | [catch #()])) 30 | 31 | ;; yes 32 | (define where-4 (where woodland2 (site) #t)) 33 | (define where-4-result woodland2) 34 | 35 | ;; more compound booleans 36 | (define where-5 (where ball2 (first last) (or (string=? first "sam") (string=? last "ert")))) 37 | (define where-5-result 38 | (row-df [first last game goals] 39 | "sam" "son" 1 0 40 | "bob" "ert" 1 1 41 | "bob" "ert" 2 3)) 42 | 43 | ;; grouping and filtering 44 | (define where-6 45 | (~> ball1 46 | (group-with "last") 47 | (where (first) (string=? first "sam")) 48 | ungroup)) 49 | (define where-6-result 50 | (row-df [first last age] 51 | "sam" "jam" 30 52 | "sam" "son" 10)) 53 | 54 | ;; deduplicating 55 | (define deduplicate-1 (deduplicate ball1 first)) 56 | (define deduplicate-1-result 57 | (row-df [first last age] 58 | "sam" "son" 10 59 | "bob" "ert" 20 60 | "dan" "man" 40)) 61 | 62 | (define deduplicate-2 63 | (~> docs1 64 | (group-with "grp") 65 | (deduplicate trt) 66 | ungroup)) 67 | (define deduplicate-2-result 68 | (row-df [grp trt adult juv] 69 | "a" "a" 1 10 70 | "a" "b" 2 20 71 | "b" "a" 3 30 72 | "b" "b" 4 40)) 73 | 74 | ;; filtering gss 75 | (define-runtime-path where-gss-1-data "./results/where_gss_1.csv") 76 | (define where-gss-1 (where gss-sm (bigregion) (string=? bigregion "Northeast"))) 77 | 78 | (define-runtime-path where-gss-2-data "./results/where_gss_2.csv") 79 | (define where-gss-2 (where gss-sm (bigregion) (not (string=? bigregion "Midwest")))) 80 | 81 | (define-runtime-path where-gss-3-data "./results/where_gss_3.csv") 82 | (define where-gss-3 83 | (~> gss-sm 84 | (group-with "bigregion" "religion") 85 | (where (bigregion) (string=? 
bigregion "South")) 86 | ungroup)) 87 | 88 | ;; filtering organdata 89 | (define-runtime-path where-organdata-1-data "./results/where_organdata_1.csv") 90 | (define where-organdata-1 (where organdata (consent_practice) (string=? consent_practice "Informed"))) 91 | 92 | (define-runtime-path where-organdata-2-data "./results/where_organdata_2.csv") 93 | (define where-organdata-2 (where organdata (country) (char=? (string-ref country 0) #\I))) 94 | 95 | (define-runtime-path where-organdata-3-data "./results/where_organdata_3.csv") 96 | (define where-organdata-3 97 | (~> organdata 98 | (group-with "country") 99 | (where (consent_practice) (string=? consent_practice "Presumed")) 100 | ungroup)) 101 | 102 | (module+ test 103 | ;; I think the error message data-frame provides is good enough here 104 | ;; But we always want to stick with this error message if we aren't rolling our own 105 | (check-exn 106 | exn:fail:data-frame? 107 | (thunk (where woodland2 (non-existent site) #t))) 108 | 109 | (check data-frame~=? where-1 where-1-result) 110 | (check data-frame~=? where-2 where-2-result) 111 | (check data-frame~=? where-3 where-3-result) 112 | (check data-frame~=? where-4 where-4-result) 113 | (check data-frame~=? where-5 where-5-result) 114 | (check data-frame~=? where-6 where-6-result) 115 | 116 | (check data-frame~=? deduplicate-1 deduplicate-1-result) 117 | (check data-frame~=? deduplicate-2 deduplicate-2-result) 118 | 119 | (check-csv where-gss-1 where-gss-1-data) 120 | (check-true (df-contains-only? where-gss-1 "bigregion" "Northeast")) 121 | (check-csv where-gss-2 where-gss-2-data) 122 | (check-true (df-does-not-contain? where-gss-2 "bigregion" "Midwest")) 123 | (check-csv where-gss-3 where-gss-3-data) 124 | (check-true (df-contains-only? where-gss-3 "bigregion" "South")) 125 | 126 | (check-csv where-organdata-1 where-organdata-1-data) 127 | (check-true (df-contains-only? 
where-organdata-1 "consent_practice" "Informed")) 128 | (check-csv where-organdata-2 where-organdata-2-data) 129 | (check-true (df-does-not-contain? where-organdata-2 "country" "Netherlands")) 130 | (check-csv where-organdata-3 where-organdata-3-data) 131 | (check-true (df-contains-only? where-organdata-3 "consent_practice" "Presumed"))) 132 | -------------------------------------------------------------------------------- /sawzall-test/info.rkt: -------------------------------------------------------------------------------- 1 | #lang info 2 | 3 | (define collection "sawzall-test") 4 | 5 | (define test-omit-paths '("./info.rkt" "./test-data.rkt")) 6 | (define test-responsibles '((all hazel@knightsofthelambdacalcul.us))) 7 | 8 | (define pkg-desc "Tests for Sawzall") 9 | (define version "1.0") 10 | (define deps '("base" 11 | "data-frame" 12 | "rackunit-lib" 13 | "sawzall-lib" 14 | "threading-lib")) 15 | -------------------------------------------------------------------------------- /sawzall-test/pivot-test.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | (require data-frame 3 | racket/runtime-path 4 | rackunit 5 | sawzall 6 | threading 7 | "test-data.rkt" 8 | "util.rkt") 9 | 10 | (define wide-df 11 | (row-df [day hour a b c] 12 | 1 10 97 84 55 13 | 2 11 78 47 54)) 14 | 15 | (define pivot-longer-1 (pivot-longer wide-df ["a" "b" "c"] #:names-to "site" #:values-to "catch")) 16 | (define pivot-longer-1-result 17 | (row-df [day hour site catch] 18 | 1 10 "a" 97 19 | 2 11 "a" 78 20 | 1 10 "b" 84 21 | 2 11 "b" 47 22 | 1 10 "c" 55 23 | 2 11 "c" 54)) 24 | 25 | (define-runtime-path relig-income-pivot-longer-1-data 26 | "./results/relig_income_pivot_longer_1.csv") 27 | (define relig-income-pivot-longer-1 28 | (pivot-longer relig-income (not "religion") #:names-to "income" #:values-to "count")) 29 | 30 | (define-runtime-path billboard-pivot-longer-1-data 31 | "./results/billboard_pivot_longer_1.csv") 32 | (define 
billboard-pivot-longer-1 33 | (~> billboard 34 | (pivot-longer (starting-with "wk") 35 | #:names-to "week" #:values-to "rank") 36 | (create [week (week) (string->number (string-replace week "wk" ""))]))) 37 | 38 | (define long-df1 39 | (row-df [day grp val] 40 | 1 "A" 10 41 | 1 "B" 20 42 | 2 "B" 30)) 43 | 44 | (define long-df2 45 | (row-df [day hour grp val] 46 | 1 10 "a" 83 47 | 1 10 "b" 78 48 | 1 11 "a" 80 49 | 1 11 "b" 105 50 | 2 10 "a" 95 51 | 2 10 "b" 77 52 | 2 11 "a" 96 53 | 2 11 "b" 99)) 54 | 55 | (define pivot-wider-1 (pivot-wider long-df1 #:names-from "grp" #:values-from "val")) 56 | (define pivot-wider-1-result 57 | (row-df [day A B] 58 | 1 10 20 59 | 2 #f 30)) 60 | 61 | (define pivot-wider-2 (pivot-wider long-df2 #:names-from "grp" #:values-from "val")) 62 | (define pivot-wider-2-result 63 | (row-df [day hour a b] 64 | 1 10 83 78 65 | 2 10 95 77 66 | 1 11 80 105 67 | 2 11 96 99)) 68 | 69 | (module+ test 70 | (check data-frame~=? pivot-longer-1 pivot-longer-1-result) 71 | 72 | (check-csv relig-income-pivot-longer-1 relig-income-pivot-longer-1-data) 73 | (check-csv billboard-pivot-longer-1 billboard-pivot-longer-1-data) 74 | (check-true (for/and ([v (in-data-frame billboard-pivot-longer-1 "week")]) 75 | (number? v))) 76 | 77 | (check data-frame~=? pivot-wider-1 pivot-wider-1-result) 78 | (check data-frame~=? 
pivot-wider-2 pivot-wider-2-result)) 79 | -------------------------------------------------------------------------------- /sawzall-test/rectangling-test.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | (require rackunit 3 | sawzall 4 | "util.rkt") 5 | 6 | (define deep-df 7 | (column-df 8 | [character #("Toothless" "Dory" "Holly")] 9 | [metadata (vector (hash 'species "dragon" 10 | 'color "black" 11 | 'films (vector "How to Train Your Dragon" 12 | "How to Train Your Dragon 2" 13 | "How to Train Your Dragon: The Hidden World")) 14 | (hash 'species "blue tang" 15 | 'color "blue" 16 | 'films (vector "Finding Nemo" 17 | "Finding Dory")) 18 | (hash 'species "glaceon" 19 | 'color "also blue" 20 | 'films #f))])) 21 | 22 | (define unnest-1 (unnest-wider deep-df "metadata")) 23 | (define unnest-1-result 24 | (row-df [character species color films] 25 | "Toothless" "dragon" "black" (vector "How to Train Your Dragon" 26 | "How to Train Your Dragon 2" 27 | "How to Train Your Dragon: The Hidden World") 28 | "Dory" "blue tang" "blue" (vector "Finding Nemo" 29 | "Finding Dory") 30 | "Holly" "glaceon" "also blue" #f)) 31 | 32 | (define unnest-2 (unnest-longer unnest-1-result "films")) 33 | (define unnest-2-result 34 | (row-df [character species color films] 35 | "Toothless" "dragon" "black" "How to Train Your Dragon" 36 | "Toothless" "dragon" "black" "How to Train Your Dragon 2" 37 | "Toothless" "dragon" "black" "How to Train Your Dragon: The Hidden World" 38 | "Dory" "blue tang" "blue" "Finding Nemo" 39 | "Dory" "blue tang" "blue" "Finding Dory" 40 | "Holly" "glaceon" "also blue" #f)) 41 | 42 | (define unnest-3 (unnest-longer deep-df "metadata")) 43 | (define unnest-3-result 44 | (row-df [character metadata-keys metadata] 45 | "Toothless" 'color "black" 46 | "Toothless" 'species "dragon" 47 | "Toothless" 'films (vector "How to Train Your Dragon" 48 | "How to Train Your Dragon 2" 49 | "How to Train Your Dragon: The Hidden 
World") 50 | "Dory" 'color "blue" 51 | "Dory" 'species "blue tang" 52 | "Dory" 'films (vector "Finding Nemo" 53 | "Finding Dory") 54 | "Holly" 'color "also blue" 55 | "Holly" 'species "glaceon" 56 | "Holly" 'films #f)) 57 | 58 | (define unnest-4 (unnest-wider unnest-1-result "films")) 59 | (define unnest-4-result 60 | (row-df [character species color idx-1 idx-2 idx-3] 61 | "Toothless" "dragon" "black" "How to Train Your Dragon" 62 | "How to Train Your Dragon 2" 63 | "How to Train Your Dragon: The Hidden World" 64 | "Dory" "blue tang" "blue" "Finding Nemo" 65 | "Finding Dory" 66 | #f 67 | "Holly" "glaceon" "also blue" #f #f #f)) 68 | 69 | (module+ test 70 | (check data-frame~=? unnest-1 unnest-1-result) 71 | (check data-frame~=? unnest-2 unnest-2-result) 72 | (check data-frame~=? unnest-3 unnest-3-result) 73 | (check data-frame~=? unnest-4 unnest-4-result)) 74 | -------------------------------------------------------------------------------- /sawzall-test/results/aggregate_gss_1.csv: -------------------------------------------------------------------------------- 1 | "N","bigregion","religion" 2 | 172,"Midwest","Catholic" 3 | 3,"Midwest","Jewish" 4 | 5,"Midwest","NA" 5 | 157,"Midwest","None" 6 | 33,"Midwest","Other" 7 | 325,"Midwest","Protestant" 8 | 162,"Northeast","Catholic" 9 | 27,"Northeast","Jewish" 10 | 1,"Northeast","NA" 11 | 112,"Northeast","None" 12 | 28,"Northeast","Other" 13 | 158,"Northeast","Protestant" 14 | 160,"South","Catholic" 15 | 11,"South","Jewish" 16 | 11,"South","NA" 17 | 170,"South","None" 18 | 50,"South","Other" 19 | 650,"South","Protestant" 20 | 155,"West","Catholic" 21 | 10,"West","Jewish" 22 | 1,"West","NA" 23 | 180,"West","None" 24 | 48,"West","Other" 25 | 238,"West","Protestant" 26 | -------------------------------------------------------------------------------- /sawzall-test/results/relig_income_pivot_longer_1.csv: -------------------------------------------------------------------------------- 1 | "religion","count","income" 2 | 
"Agnostic",122,"$75-100k" 3 | "Atheist",73,"$75-100k" 4 | "Buddhist",62,"$75-100k" 5 | "Catholic",949,"$75-100k" 6 | "Don’t know/refused",21,"$75-100k" 7 | "Evangelical Prot",949,"$75-100k" 8 | "Hindu",47,"$75-100k" 9 | "Historically Black Prot",131,"$75-100k" 10 | "Jehovah's Witness",15,"$75-100k" 11 | "Jewish",69,"$75-100k" 12 | "Mainline Prot",939,"$75-100k" 13 | "Mormon",85,"$75-100k" 14 | "Muslim",16,"$75-100k" 15 | "Orthodox",38,"$75-100k" 16 | "Other Christian",18,"$75-100k" 17 | "Other Faiths",46,"$75-100k" 18 | "Other World Religions",3,"$75-100k" 19 | "Unaffiliated",407,"$75-100k" 20 | "Agnostic",34,"$10-20k" 21 | "Atheist",27,"$10-20k" 22 | "Buddhist",21,"$10-20k" 23 | "Catholic",617,"$10-20k" 24 | "Don’t know/refused",14,"$10-20k" 25 | "Evangelical Prot",869,"$10-20k" 26 | "Hindu",9,"$10-20k" 27 | "Historically Black Prot",244,"$10-20k" 28 | "Jehovah's Witness",27,"$10-20k" 29 | "Jewish",19,"$10-20k" 30 | "Mainline Prot",495,"$10-20k" 31 | "Mormon",40,"$10-20k" 32 | "Muslim",7,"$10-20k" 33 | "Orthodox",17,"$10-20k" 34 | "Other Christian",7,"$10-20k" 35 | "Other Faiths",33,"$10-20k" 36 | "Other World Religions",2,"$10-20k" 37 | "Unaffiliated",299,"$10-20k" 38 | "Agnostic",76,"$40-50k" 39 | "Atheist",35,"$40-50k" 40 | "Buddhist",33,"$40-50k" 41 | "Catholic",638,"$40-50k" 42 | "Don’t know/refused",10,"$40-50k" 43 | "Evangelical Prot",881,"$40-50k" 44 | "Hindu",11,"$40-50k" 45 | "Historically Black Prot",197,"$40-50k" 46 | "Jehovah's Witness",21,"$40-50k" 47 | "Jewish",30,"$40-50k" 48 | "Mainline Prot",651,"$40-50k" 49 | "Mormon",56,"$40-50k" 50 | "Muslim",9,"$40-50k" 51 | "Orthodox",32,"$40-50k" 52 | "Other Christian",13,"$40-50k" 53 | "Other Faiths",49,"$40-50k" 54 | "Other World Religions",2,"$40-50k" 55 | "Unaffiliated",341,"$40-50k" 56 | "Agnostic","1","#f" 57 | "Atheist","2","#f" 58 | "Buddhist","3","#f" 59 | "Catholic","4","#f" 60 | "Don’t know/refused","5","#f" 61 | "Evangelical Prot","6","#f" 62 | "Hindu","7","#f" 63 | "Historically Black 
Prot","8","#f" 64 | "Jehovah's Witness","9","#f" 65 | "Jewish","10","#f" 66 | "Mainline Prot","11","#f" 67 | "Mormon","12","#f" 68 | "Muslim","13","#f" 69 | "Orthodox","14","#f" 70 | "Other Christian","15","#f" 71 | "Other Faiths","16","#f" 72 | "Other World Religions","17","#f" 73 | "Unaffiliated","18","#f" 74 | "Agnostic",137,"$50-75k" 75 | "Atheist",70,"$50-75k" 76 | "Buddhist",58,"$50-75k" 77 | "Catholic",1116,"$50-75k" 78 | "Don’t know/refused",35,"$50-75k" 79 | "Evangelical Prot",1486,"$50-75k" 80 | "Hindu",34,"$50-75k" 81 | "Historically Black Prot",223,"$50-75k" 82 | "Jehovah's Witness",30,"$50-75k" 83 | "Jewish",95,"$50-75k" 84 | "Mainline Prot",1107,"$50-75k" 85 | "Mormon",112,"$50-75k" 86 | "Muslim",23,"$50-75k" 87 | "Orthodox",47,"$50-75k" 88 | "Other Christian",14,"$50-75k" 89 | "Other Faiths",63,"$50-75k" 90 | "Other World Religions",7,"$50-75k" 91 | "Unaffiliated",528,"$50-75k" 92 | "Agnostic",81,"$30-40k" 93 | "Atheist",52,"$30-40k" 94 | "Buddhist",34,"$30-40k" 95 | "Catholic",670,"$30-40k" 96 | "Don’t know/refused",11,"$30-40k" 97 | "Evangelical Prot",982,"$30-40k" 98 | "Hindu",9,"$30-40k" 99 | "Historically Black Prot",238,"$30-40k" 100 | "Jehovah's Witness",24,"$30-40k" 101 | "Jewish",25,"$30-40k" 102 | "Mainline Prot",655,"$30-40k" 103 | "Mormon",51,"$30-40k" 104 | "Muslim",10,"$30-40k" 105 | "Orthodox",32,"$30-40k" 106 | "Other Christian",13,"$30-40k" 107 | "Other Faiths",46,"$30-40k" 108 | "Other World Religions",4,"$30-40k" 109 | "Unaffiliated",365,"$30-40k" 110 | "Agnostic",84,">150k" 111 | "Atheist",74,">150k" 112 | "Buddhist",53,">150k" 113 | "Catholic",633,">150k" 114 | "Don’t know/refused",18,">150k" 115 | "Evangelical Prot",414,">150k" 116 | "Hindu",54,">150k" 117 | "Historically Black Prot",78,">150k" 118 | "Jehovah's Witness",6,">150k" 119 | "Jewish",151,">150k" 120 | "Mainline Prot",634,">150k" 121 | "Mormon",42,">150k" 122 | "Muslim",6,">150k" 123 | "Orthodox",46,">150k" 124 | "Other Christian",12,">150k" 125 | "Other 
Faiths",41,">150k" 126 | "Other World Religions",4,">150k" 127 | "Unaffiliated",258,">150k" 128 | "Agnostic",27,"<$10k" 129 | "Atheist",12,"<$10k" 130 | "Buddhist",27,"<$10k" 131 | "Catholic",418,"<$10k" 132 | "Don’t know/refused",15,"<$10k" 133 | "Evangelical Prot",575,"<$10k" 134 | "Hindu",1,"<$10k" 135 | "Historically Black Prot",228,"<$10k" 136 | "Jehovah's Witness",20,"<$10k" 137 | "Jewish",19,"<$10k" 138 | "Mainline Prot",289,"<$10k" 139 | "Mormon",29,"<$10k" 140 | "Muslim",6,"<$10k" 141 | "Orthodox",13,"<$10k" 142 | "Other Christian",9,"<$10k" 143 | "Other Faiths",20,"<$10k" 144 | "Other World Religions",5,"<$10k" 145 | "Unaffiliated",217,"<$10k" 146 | "Agnostic",60,"$20-30k" 147 | "Atheist",37,"$20-30k" 148 | "Buddhist",30,"$20-30k" 149 | "Catholic",732,"$20-30k" 150 | "Don’t know/refused",15,"$20-30k" 151 | "Evangelical Prot",1064,"$20-30k" 152 | "Hindu",7,"$20-30k" 153 | "Historically Black Prot",236,"$20-30k" 154 | "Jehovah's Witness",24,"$20-30k" 155 | "Jewish",25,"$20-30k" 156 | "Mainline Prot",619,"$20-30k" 157 | "Mormon",48,"$20-30k" 158 | "Muslim",9,"$20-30k" 159 | "Orthodox",23,"$20-30k" 160 | "Other Christian",11,"$20-30k" 161 | "Other Faiths",40,"$20-30k" 162 | "Other World Religions",3,"$20-30k" 163 | "Unaffiliated",374,"$20-30k" 164 | "Agnostic",109,"$100-150k" 165 | "Atheist",59,"$100-150k" 166 | "Buddhist",39,"$100-150k" 167 | "Catholic",792,"$100-150k" 168 | "Don’t know/refused",17,"$100-150k" 169 | "Evangelical Prot",723,"$100-150k" 170 | "Hindu",48,"$100-150k" 171 | "Historically Black Prot",81,"$100-150k" 172 | "Jehovah's Witness",11,"$100-150k" 173 | "Jewish",87,"$100-150k" 174 | "Mainline Prot",753,"$100-150k" 175 | "Mormon",49,"$100-150k" 176 | "Muslim",8,"$100-150k" 177 | "Orthodox",42,"$100-150k" 178 | "Other Christian",14,"$100-150k" 179 | "Other Faiths",40,"$100-150k" 180 | "Other World Religions",4,"$100-150k" 181 | "Unaffiliated",321,"$100-150k" 182 | "Agnostic",96,"Don't know/refused" 183 | "Atheist",76,"Don't know/refused" 184 
| "Buddhist",54,"Don't know/refused" 185 | "Catholic",1489,"Don't know/refused" 186 | "Don’t know/refused",116,"Don't know/refused" 187 | "Evangelical Prot",1529,"Don't know/refused" 188 | "Hindu",37,"Don't know/refused" 189 | "Historically Black Prot",339,"Don't know/refused" 190 | "Jehovah's Witness",37,"Don't know/refused" 191 | "Jewish",162,"Don't know/refused" 192 | "Mainline Prot",1328,"Don't know/refused" 193 | "Mormon",69,"Don't know/refused" 194 | "Muslim",22,"Don't know/refused" 195 | "Orthodox",73,"Don't know/refused" 196 | "Other Christian",18,"Don't know/refused" 197 | "Other Faiths",71,"Don't know/refused" 198 | "Other World Religions",8,"Don't know/refused" 199 | "Unaffiliated",597,"Don't know/refused" 200 | -------------------------------------------------------------------------------- /sawzall-test/results/slice_iris_1.csv: -------------------------------------------------------------------------------- 1 | "Petal.Width","#f","Sepal.Width","Species" 2 | 0.2,"1",3.5,"setosa" 3 | 0.2,"2",3,"setosa" 4 | 0.2,"3",3.2,"setosa" 5 | 0.2,"4",3.1,"setosa" 6 | 0.2,"5",3.6,"setosa" 7 | 0.4,"6",3.9,"setosa" 8 | 0.3,"7",3.4,"setosa" 9 | 0.2,"8",3.4,"setosa" 10 | 0.2,"9",2.9,"setosa" 11 | 0.1,"10",3.1,"setosa" 12 | 0.2,"11",3.7,"setosa" 13 | 0.2,"12",3.4,"setosa" 14 | 0.1,"13",3,"setosa" 15 | 0.1,"14",3,"setosa" 16 | 0.2,"15",4,"setosa" 17 | 0.4,"16",4.4,"setosa" 18 | 0.4,"17",3.9,"setosa" 19 | 0.3,"18",3.5,"setosa" 20 | 0.3,"19",3.8,"setosa" 21 | 0.3,"20",3.8,"setosa" 22 | 0.2,"21",3.4,"setosa" 23 | 0.4,"22",3.7,"setosa" 24 | 0.2,"23",3.6,"setosa" 25 | 0.5,"24",3.3,"setosa" 26 | 0.2,"25",3.4,"setosa" 27 | 0.2,"26",3,"setosa" 28 | 0.4,"27",3.4,"setosa" 29 | 0.2,"28",3.5,"setosa" 30 | 0.2,"29",3.4,"setosa" 31 | 0.2,"30",3.2,"setosa" 32 | 0.2,"31",3.1,"setosa" 33 | 0.4,"32",3.4,"setosa" 34 | 0.1,"33",4.1,"setosa" 35 | 0.2,"34",4.2,"setosa" 36 | 0.2,"35",3.1,"setosa" 37 | 0.2,"36",3.2,"setosa" 38 | 0.2,"37",3.5,"setosa" 39 | 0.1,"38",3.6,"setosa" 40 | 
0.2,"39",3,"setosa" 41 | 0.2,"40",3.4,"setosa" 42 | 0.3,"41",3.5,"setosa" 43 | 0.3,"42",2.3,"setosa" 44 | 0.2,"43",3.2,"setosa" 45 | 0.6,"44",3.5,"setosa" 46 | 0.4,"45",3.8,"setosa" 47 | 0.3,"46",3,"setosa" 48 | 0.2,"47",3.8,"setosa" 49 | 0.2,"48",3.2,"setosa" 50 | 0.2,"49",3.7,"setosa" 51 | 0.2,"50",3.3,"setosa" 52 | 1.4,"51",3.2,"versicolor" 53 | 1.5,"52",3.2,"versicolor" 54 | 1.5,"53",3.1,"versicolor" 55 | 1.3,"54",2.3,"versicolor" 56 | 1.5,"55",2.8,"versicolor" 57 | 1.3,"56",2.8,"versicolor" 58 | 1.6,"57",3.3,"versicolor" 59 | 1,"58",2.4,"versicolor" 60 | 1.3,"59",2.9,"versicolor" 61 | 1.4,"60",2.7,"versicolor" 62 | 1,"61",2,"versicolor" 63 | 1.5,"62",3,"versicolor" 64 | 1,"63",2.2,"versicolor" 65 | 1.4,"64",2.9,"versicolor" 66 | 1.3,"65",2.9,"versicolor" 67 | 1.4,"66",3.1,"versicolor" 68 | 1.5,"67",3,"versicolor" 69 | 1,"68",2.7,"versicolor" 70 | 1.5,"69",2.2,"versicolor" 71 | 1.1,"70",2.5,"versicolor" 72 | 1.8,"71",3.2,"versicolor" 73 | 1.3,"72",2.8,"versicolor" 74 | 1.5,"73",2.5,"versicolor" 75 | 1.2,"74",2.8,"versicolor" 76 | 1.3,"75",2.9,"versicolor" 77 | 1.4,"76",3,"versicolor" 78 | 1.4,"77",2.8,"versicolor" 79 | 1.7,"78",3,"versicolor" 80 | 1.5,"79",2.9,"versicolor" 81 | 1,"80",2.6,"versicolor" 82 | 1.1,"81",2.4,"versicolor" 83 | 1,"82",2.4,"versicolor" 84 | 1.2,"83",2.7,"versicolor" 85 | 1.6,"84",2.7,"versicolor" 86 | 1.5,"85",3,"versicolor" 87 | 1.6,"86",3.4,"versicolor" 88 | 1.5,"87",3.1,"versicolor" 89 | 1.3,"88",2.3,"versicolor" 90 | 1.3,"89",3,"versicolor" 91 | 1.3,"90",2.5,"versicolor" 92 | 1.2,"91",2.6,"versicolor" 93 | 1.4,"92",3,"versicolor" 94 | 1.2,"93",2.6,"versicolor" 95 | 1,"94",2.3,"versicolor" 96 | 1.3,"95",2.7,"versicolor" 97 | 1.2,"96",3,"versicolor" 98 | 1.3,"97",2.9,"versicolor" 99 | 1.3,"98",2.9,"versicolor" 100 | 1.1,"99",2.5,"versicolor" 101 | 1.3,"100",2.8,"versicolor" 102 | 2.5,"101",3.3,"virginica" 103 | 1.9,"102",2.7,"virginica" 104 | 2.1,"103",3,"virginica" 105 | 1.8,"104",2.9,"virginica" 106 | 2.2,"105",3,"virginica" 107 | 
2.1,"106",3,"virginica" 108 | 1.7,"107",2.5,"virginica" 109 | 1.8,"108",2.9,"virginica" 110 | 1.8,"109",2.5,"virginica" 111 | 2.5,"110",3.6,"virginica" 112 | 2,"111",3.2,"virginica" 113 | 1.9,"112",2.7,"virginica" 114 | 2.1,"113",3,"virginica" 115 | 2,"114",2.5,"virginica" 116 | 2.4,"115",2.8,"virginica" 117 | 2.3,"116",3.2,"virginica" 118 | 1.8,"117",3,"virginica" 119 | 2.2,"118",3.8,"virginica" 120 | 2.3,"119",2.6,"virginica" 121 | 1.5,"120",2.2,"virginica" 122 | 2.3,"121",3.2,"virginica" 123 | 2,"122",2.8,"virginica" 124 | 2,"123",2.8,"virginica" 125 | 1.8,"124",2.7,"virginica" 126 | 2.1,"125",3.3,"virginica" 127 | 1.8,"126",3.2,"virginica" 128 | 1.8,"127",2.8,"virginica" 129 | 1.8,"128",3,"virginica" 130 | 2.1,"129",2.8,"virginica" 131 | 1.6,"130",3,"virginica" 132 | 1.9,"131",2.8,"virginica" 133 | 2,"132",3.8,"virginica" 134 | 2.2,"133",2.8,"virginica" 135 | 1.5,"134",2.8,"virginica" 136 | 1.4,"135",2.6,"virginica" 137 | 2.3,"136",3,"virginica" 138 | 2.4,"137",3.4,"virginica" 139 | 1.8,"138",3.1,"virginica" 140 | 1.8,"139",3,"virginica" 141 | 2.1,"140",3.1,"virginica" 142 | 2.4,"141",3.1,"virginica" 143 | 2.3,"142",3.1,"virginica" 144 | 1.9,"143",2.7,"virginica" 145 | 2.3,"144",3.2,"virginica" 146 | 2.5,"145",3.3,"virginica" 147 | 2.3,"146",3,"virginica" 148 | 1.9,"147",2.5,"virginica" 149 | 2,"148",3,"virginica" 150 | 2.3,"149",3.4,"virginica" 151 | 1.8,"150",3,"virginica" 152 | -------------------------------------------------------------------------------- /sawzall-test/results/slice_iris_2.csv: -------------------------------------------------------------------------------- 1 | "Sepal.Length","#f","Petal.Length","Species" 2 | 5.1,"1",1.4,"setosa" 3 | 4.9,"2",1.4,"setosa" 4 | 4.7,"3",1.3,"setosa" 5 | 4.6,"4",1.5,"setosa" 6 | 5,"5",1.4,"setosa" 7 | 5.4,"6",1.7,"setosa" 8 | 4.6,"7",1.4,"setosa" 9 | 5,"8",1.5,"setosa" 10 | 4.4,"9",1.4,"setosa" 11 | 4.9,"10",1.5,"setosa" 12 | 5.4,"11",1.5,"setosa" 13 | 4.8,"12",1.6,"setosa" 14 | 4.8,"13",1.4,"setosa" 15 | 
4.3,"14",1.1,"setosa" 16 | 5.8,"15",1.2,"setosa" 17 | 5.7,"16",1.5,"setosa" 18 | 5.4,"17",1.3,"setosa" 19 | 5.1,"18",1.4,"setosa" 20 | 5.7,"19",1.7,"setosa" 21 | 5.1,"20",1.5,"setosa" 22 | 5.4,"21",1.7,"setosa" 23 | 5.1,"22",1.5,"setosa" 24 | 4.6,"23",1,"setosa" 25 | 5.1,"24",1.7,"setosa" 26 | 4.8,"25",1.9,"setosa" 27 | 5,"26",1.6,"setosa" 28 | 5,"27",1.6,"setosa" 29 | 5.2,"28",1.5,"setosa" 30 | 5.2,"29",1.4,"setosa" 31 | 4.7,"30",1.6,"setosa" 32 | 4.8,"31",1.6,"setosa" 33 | 5.4,"32",1.5,"setosa" 34 | 5.2,"33",1.5,"setosa" 35 | 5.5,"34",1.4,"setosa" 36 | 4.9,"35",1.5,"setosa" 37 | 5,"36",1.2,"setosa" 38 | 5.5,"37",1.3,"setosa" 39 | 4.9,"38",1.4,"setosa" 40 | 4.4,"39",1.3,"setosa" 41 | 5.1,"40",1.5,"setosa" 42 | 5,"41",1.3,"setosa" 43 | 4.5,"42",1.3,"setosa" 44 | 4.4,"43",1.3,"setosa" 45 | 5,"44",1.6,"setosa" 46 | 5.1,"45",1.9,"setosa" 47 | 4.8,"46",1.4,"setosa" 48 | 5.1,"47",1.6,"setosa" 49 | 4.6,"48",1.4,"setosa" 50 | 5.3,"49",1.5,"setosa" 51 | 5,"50",1.4,"setosa" 52 | 7,"51",4.7,"versicolor" 53 | 6.4,"52",4.5,"versicolor" 54 | 6.9,"53",4.9,"versicolor" 55 | 5.5,"54",4,"versicolor" 56 | 6.5,"55",4.6,"versicolor" 57 | 5.7,"56",4.5,"versicolor" 58 | 6.3,"57",4.7,"versicolor" 59 | 4.9,"58",3.3,"versicolor" 60 | 6.6,"59",4.6,"versicolor" 61 | 5.2,"60",3.9,"versicolor" 62 | 5,"61",3.5,"versicolor" 63 | 5.9,"62",4.2,"versicolor" 64 | 6,"63",4,"versicolor" 65 | 6.1,"64",4.7,"versicolor" 66 | 5.6,"65",3.6,"versicolor" 67 | 6.7,"66",4.4,"versicolor" 68 | 5.6,"67",4.5,"versicolor" 69 | 5.8,"68",4.1,"versicolor" 70 | 6.2,"69",4.5,"versicolor" 71 | 5.6,"70",3.9,"versicolor" 72 | 5.9,"71",4.8,"versicolor" 73 | 6.1,"72",4,"versicolor" 74 | 6.3,"73",4.9,"versicolor" 75 | 6.1,"74",4.7,"versicolor" 76 | 6.4,"75",4.3,"versicolor" 77 | 6.6,"76",4.4,"versicolor" 78 | 6.8,"77",4.8,"versicolor" 79 | 6.7,"78",5,"versicolor" 80 | 6,"79",4.5,"versicolor" 81 | 5.7,"80",3.5,"versicolor" 82 | 5.5,"81",3.8,"versicolor" 83 | 5.5,"82",3.7,"versicolor" 84 | 5.8,"83",3.9,"versicolor" 85 | 
6,"84",5.1,"versicolor" 86 | 5.4,"85",4.5,"versicolor" 87 | 6,"86",4.5,"versicolor" 88 | 6.7,"87",4.7,"versicolor" 89 | 6.3,"88",4.4,"versicolor" 90 | 5.6,"89",4.1,"versicolor" 91 | 5.5,"90",4,"versicolor" 92 | 5.5,"91",4.4,"versicolor" 93 | 6.1,"92",4.6,"versicolor" 94 | 5.8,"93",4,"versicolor" 95 | 5,"94",3.3,"versicolor" 96 | 5.6,"95",4.2,"versicolor" 97 | 5.7,"96",4.2,"versicolor" 98 | 5.7,"97",4.2,"versicolor" 99 | 6.2,"98",4.3,"versicolor" 100 | 5.1,"99",3,"versicolor" 101 | 5.7,"100",4.1,"versicolor" 102 | 6.3,"101",6,"virginica" 103 | 5.8,"102",5.1,"virginica" 104 | 7.1,"103",5.9,"virginica" 105 | 6.3,"104",5.6,"virginica" 106 | 6.5,"105",5.8,"virginica" 107 | 7.6,"106",6.6,"virginica" 108 | 4.9,"107",4.5,"virginica" 109 | 7.3,"108",6.3,"virginica" 110 | 6.7,"109",5.8,"virginica" 111 | 7.2,"110",6.1,"virginica" 112 | 6.5,"111",5.1,"virginica" 113 | 6.4,"112",5.3,"virginica" 114 | 6.8,"113",5.5,"virginica" 115 | 5.7,"114",5,"virginica" 116 | 5.8,"115",5.1,"virginica" 117 | 6.4,"116",5.3,"virginica" 118 | 6.5,"117",5.5,"virginica" 119 | 7.7,"118",6.7,"virginica" 120 | 7.7,"119",6.9,"virginica" 121 | 6,"120",5,"virginica" 122 | 6.9,"121",5.7,"virginica" 123 | 5.6,"122",4.9,"virginica" 124 | 7.7,"123",6.7,"virginica" 125 | 6.3,"124",4.9,"virginica" 126 | 6.7,"125",5.7,"virginica" 127 | 7.2,"126",6,"virginica" 128 | 6.2,"127",4.8,"virginica" 129 | 6.1,"128",4.9,"virginica" 130 | 6.4,"129",5.6,"virginica" 131 | 7.2,"130",5.8,"virginica" 132 | 7.4,"131",6.1,"virginica" 133 | 7.9,"132",6.4,"virginica" 134 | 6.4,"133",5.6,"virginica" 135 | 6.3,"134",5.1,"virginica" 136 | 6.1,"135",5.6,"virginica" 137 | 7.7,"136",6.1,"virginica" 138 | 6.3,"137",5.6,"virginica" 139 | 6.4,"138",5.5,"virginica" 140 | 6,"139",4.8,"virginica" 141 | 6.9,"140",5.4,"virginica" 142 | 6.7,"141",5.6,"virginica" 143 | 6.9,"142",5.1,"virginica" 144 | 5.8,"143",5.1,"virginica" 145 | 6.8,"144",5.9,"virginica" 146 | 6.7,"145",5.7,"virginica" 147 | 6.7,"146",5.2,"virginica" 148 | 
6.3,"147",5,"virginica" 149 | 6.5,"148",5.2,"virginica" 150 | 6.2,"149",5.4,"virginica" 151 | 5.9,"150",5.1,"virginica" 152 | -------------------------------------------------------------------------------- /sawzall-test/results/slice_iris_3.csv: -------------------------------------------------------------------------------- 1 | "Petal.Length" 2 | 1.4 3 | 1.4 4 | 1.3 5 | 1.5 6 | 1.4 7 | 1.7 8 | 1.4 9 | 1.5 10 | 1.4 11 | 1.5 12 | 1.5 13 | 1.6 14 | 1.4 15 | 1.1 16 | 1.2 17 | 1.5 18 | 1.3 19 | 1.4 20 | 1.7 21 | 1.5 22 | 1.7 23 | 1.5 24 | 1 25 | 1.7 26 | 1.9 27 | 1.6 28 | 1.6 29 | 1.5 30 | 1.4 31 | 1.6 32 | 1.6 33 | 1.5 34 | 1.5 35 | 1.4 36 | 1.5 37 | 1.2 38 | 1.3 39 | 1.4 40 | 1.3 41 | 1.5 42 | 1.3 43 | 1.3 44 | 1.3 45 | 1.6 46 | 1.9 47 | 1.4 48 | 1.6 49 | 1.4 50 | 1.5 51 | 1.4 52 | 4.7 53 | 4.5 54 | 4.9 55 | 4 56 | 4.6 57 | 4.5 58 | 4.7 59 | 3.3 60 | 4.6 61 | 3.9 62 | 3.5 63 | 4.2 64 | 4 65 | 4.7 66 | 3.6 67 | 4.4 68 | 4.5 69 | 4.1 70 | 4.5 71 | 3.9 72 | 4.8 73 | 4 74 | 4.9 75 | 4.7 76 | 4.3 77 | 4.4 78 | 4.8 79 | 5 80 | 4.5 81 | 3.5 82 | 3.8 83 | 3.7 84 | 3.9 85 | 5.1 86 | 4.5 87 | 4.5 88 | 4.7 89 | 4.4 90 | 4.1 91 | 4 92 | 4.4 93 | 4.6 94 | 4 95 | 3.3 96 | 4.2 97 | 4.2 98 | 4.2 99 | 4.3 100 | 3 101 | 4.1 102 | 6 103 | 5.1 104 | 5.9 105 | 5.6 106 | 5.8 107 | 6.6 108 | 4.5 109 | 6.3 110 | 5.8 111 | 6.1 112 | 5.1 113 | 5.3 114 | 5.5 115 | 5 116 | 5.1 117 | 5.3 118 | 5.5 119 | 6.7 120 | 6.9 121 | 5 122 | 5.7 123 | 4.9 124 | 6.7 125 | 4.9 126 | 5.7 127 | 6 128 | 4.8 129 | 4.9 130 | 5.6 131 | 5.8 132 | 6.1 133 | 6.4 134 | 5.6 135 | 5.1 136 | 5.6 137 | 6.1 138 | 5.6 139 | 5.5 140 | 4.8 141 | 5.4 142 | 5.6 143 | 5.1 144 | 5.1 145 | 5.9 146 | 5.7 147 | 5.2 148 | 5 149 | 5.2 150 | 5.4 151 | 5.1 152 | -------------------------------------------------------------------------------- /sawzall-test/results/where_organdata_1.csv: -------------------------------------------------------------------------------- 1 | 
"ccode","gdp","roads","consistent","#f","donors","consent_practice","consent_law","country","pop","txp_pop","year","pubhealth","assault","health_lag","pop_dens","cerebvas","external","world","opt","gdp_lag","health" 2 | "Oz",16774,136.595370641664,"Yes","1","NA","Informed","Informed","Australia",17065,0.937591561675945,"NA",4.8,21,1224,0.220443289300653,682,444,"Liberal","In",16591,1300 3 | "Oz",17171,122.251793566304,"Yes","2",12.09,"Informed","Informed","Australia",17284,0.925711640823883,"1991-01-01",5.4,19,1300,0.223272300748461,647,425,"Liberal","In",16774,1379 4 | "Oz",17914,112.832237782224,"Yes","3",12.35,"Informed","Informed","Australia",17495,0.914547013432409,"1992-01-01",5.4,17,1379,0.225997969312331,630,406,"Liberal","In",17171,1455 5 | "Oz",18883,110.545084055018,"Yes","4",12.51,"Informed","Informed","Australia",17667,0.90564328974925,"1993-01-01",5.4,18,1455,0.228219841317002,611,376,"Liberal","In",17914,1540 6 | "Oz",19849,107.980957714926,"Yes","5",10.25,"Informed","Informed","Australia",17855,0.896107532903948,"1994-01-01",5.4,17,1540,0.230648399089549,631,387,"Liberal","In",18883,1626 7 | "Oz",21079,111.609119079239,"Yes","6",10.18,"Informed","Informed","Australia",18072,0.885347498893316,"1995-01-01",5.5,16,1626,0.233451574816373,592,371,"Liberal","In",19849,1737 8 | "Oz",21923,107.585604281579,"Yes","7",10.59,"Informed","Informed","Australia",18311,0.873791709901152,"1996-01-01",5.6,17,1737,0.236538943474026,576,395,"Liberal","In",21079,1846 9 | "Oz",22961,95.4206717788098,"Yes","8",10.26,"Informed","Informed","Australia",18518,0.864024192677395,"1997-01-01",5.7,17,1846,0.239212940595927,525,385,"Liberal","In",21923,1948 10 | "Oz",24148,93.7950937950938,"Yes","9",10.48,"Informed","Informed","Australia",18711,0.855111966223077,"1998-01-01",5.9,16,1948,0.241706087670936,516,410,"Liberal","In",22961,2077 11 | 
"Oz",25445,93.2051146570855,"Yes","10",8.67,"Informed","Informed","Australia",18926,0.84539786537039,"1999-01-01",6.1,15,2077,0.244483427676774,493,409,"Liberal","In",24148,2231 12 | "Oz",26545,94.8676447553908,"Yes","11",10.23,"Informed","Informed","Australia",19153,0.835378269722759,"2000-01-01",6.2,16,2231,0.247415782008521,474,393,"Liberal","In",25445,2379 13 | "Oz",27461,89.4761242466388,"Yes","12",9.53,"Informed","Informed","Australia",19413,0.824189975789419,"2001-01-01",6.2,15,2379,0.250774425736512,449,367,"Liberal","In",26545,2504 14 | "Oz",28168,87.2196511213955,"Yes","13",10.48,"Informed","Informed","Australia",19663,0.813711030870162,"2002-01-01",6.2,14,2504,0.25400389085958,424,341,"Liberal","In",27461,2629 15 | "Oz","NA","NA","Yes","14","NA","Informed","Informed","Australia","NA","NA","NA","NA","NA",2629,"NA","NA","NA","Liberal","In",28168,2754 16 | "Can",19044,143.063427313093,"Yes","43","NA","Informed","Informed","Canada",27701,1.11909317353164,"NA",6.7,20,1600,0.277826532178071,476,469,"Liberal","In",18699,1714 17 | "Can",19101,131.639970033178,"Yes","44",14.7,"Informed","Informed","Canada",28031,1.1059184474332,"1991-01-01",7.2,22,1714,0.281136259466572,462,463,"Liberal","In",19044,1855 18 | "Can",19590,123.37456390739,"Yes","45",12.6,"Informed","Informed","Canada",28377,1.09243401346161,"1992-01-01",7.4,20,1855,0.284606458381182,449,436,"Liberal","In",19101,1959 19 | "Can",20269,125.94502316831,"Yes","46",14.1,"Informed","Informed","Canada",28703,1.08002647806849,"1993-01-01",7.2,17,1959,0.287876067763156,463,440,"Liberal","In",19590,2002 20 | "Can",21428,112.377737980438,"Yes","47",13.9,"Informed","Informed","Canada",29036,1.06764017082243,"1994-01-01",6.9,17,2002,0.291215883481552,444,422,"Liberal","In",20269,2039 21 | "Can",22292,114.158206717994,"Yes","48",14.8,"Informed","Informed","Canada",29354,1.05607412959052,"1995-01-01",6.5,16,2039,0.294405257050471,434,424,"Liberal","In",21428,2044 22 | 
"Can",22764,104.172283634403,"Yes","49",14,"Informed","Informed","Canada",29672,1.04475599892154,"1996-01-01",6.3,17,2044,0.29759463061939,432,417,"Liberal","In",22292,2039 23 | "Can",23949,102.177610297796,"Yes","50",14.2,"Informed","Informed","Canada",29987,1.03378130523227,"1997-01-01",6.2,14,2039,0.300753915758414,430,399,"Liberal","In",22764,2130 24 | "Can",25013,97.4940491933351,"Yes","51",13.7,"Informed","Informed","Canada",30248,1.02486114784449,"1998-01-01",6.5,15,2130,0.303371609159319,407,398,"Liberal","In",23949,2291 25 | "Can",26658,97.8720613790616,"Yes","52",13.8,"Informed","Informed","Canada",30499,1.01642676809076,"1999-01-01",6.3,15,2291,0.305889007793906,387,411,"Liberal","In",25013,2400 26 | "Can",28472,95.0926226844329,"Yes","53",15.3,"Informed","Informed","Canada",30770,1.00747481312967,"2000-01-01",6.3,15,2400,0.30860699596113,378,382,"Liberal","In",26658,2541 27 | "Can",29235,89.2931760470573,"Yes","54",13.5,"Informed","Informed","Canada",31111,0.99643213011475,"2001-01-01",6.6,15,2541,0.312027047492581,369,353,"Liberal","In",28472,2743 28 | "Can",30429,83.7206341121793,"Yes","55",13,"Informed","Informed","Canada",31414,0.986821162538995,"2002-01-01",6.7,15,2743,0.315065978912022,360,324,"Liberal","In",29235,2931 29 | "Can","NA","NA","Yes","56","NA","Informed","Informed","Canada","NA","NA","NA","NA","NA",2931,"NA","NA","NA","Liberal","In",30429,3119 30 | "Den",18285,123.322310834468,"Yes","57","NA","Informed","Informed","Denmark",5141,0.778058743435129,"NA",7,10,1506,11.9308424228359,699,582,"SocDem","In",17460,1554 31 | "Den",19126,117.578579743888,"Yes","58",11.7,"Informed","Informed","Denmark",5154,0.776096235933256,"1991-01-01",7,13,1554,11.9610118356927,688,572,"SocDem","In",18285,1603 32 | "Den",19644,111.58383291433,"Yes","59",16.1,"Informed","Informed","Denmark",5171,0.7735447689035,"1992-01-01",7,13,1603,12.0004641448132,691,565,"SocDem","In",19126,1660 33 | 
"Den",20056,107.727885912507,"Yes","60",14.7,"Informed","Informed","Denmark",5189,0.770861437656581,"1993-01-01",7.2,12,1660,12.0422371779995,714,574,"SocDem","In",19644,1757 34 | "Den",21494,104.878985785632,"Yes","61",12.9,"Informed","Informed","Denmark",5206,0.768344218209758,"1994-01-01",7,13,1757,12.08168948712,646,548,"SocDem","In",20056,1834 35 | "Den",22462,111.217274985668,"Yes","62",12.9,"Informed","Informed","Denmark",5233,0.764379896808714,"1995-01-01",6.8,11,1834,12.1443490368995,655,535,"SocDem","In",21494,1843 36 | "Den",23548,97.6629298878966,"Yes","63",14,"Informed","Informed","Denmark",5263,0.760022800684021,"1996-01-01",6.8,11,1843,12.2139707588768,650,499,"SocDem","In",22462,1949 37 | "Den",24676,92.5260170293283,"Yes","64",11.4,"Informed","Informed","Denmark",5285,0.75685903500473,"1997-01-01",6.8,11,1949,12.2650266883268,606,506,"SocDem","In",23548,2028 38 | "Den",25537,94.079939668175,"Yes","65",11,"Informed","Informed","Denmark",5304,0.754147812971342,"1998-01-01",6.9,9,2028,12.309120445579,580,482,"SocDem","In",24676,2141 39 | "Den",26985,96.5802329951146,"Yes","66",14.3,"Informed","Informed","Denmark",5322,0.751597143930853,"1999-01-01",7,11,2141,12.3508934787654,588,495,"SocDem","In",25537,2297 40 | "Den",28146,97.5655430711611,"Yes","67",12.5,"Informed","Informed","Denmark",5340,0.749063670411985,"2000-01-01",6.9,13,2297,12.3926665119517,596,508,"SocDem","In",26985,2351 41 | "Den",29203,80.4254525097966,"Yes","68",12.9,"Informed","Informed","Denmark",5359,0.746407911923866,"2001-01-01",7.1,15,2353,12.436760269204,604,521,"SocDem","In",28146,2523 42 | "Den",29228,86.1235119047619,"Yes","69",12.7,"Informed","Informed","Denmark",5376,0.744047619047619,"2002-01-01",7.3,17,2520,12.4762125783244,612,534,"SocDem","In",29203,2580 43 | "Den","NA","NA","Yes","70","NA","Informed","Informed","Denmark","NA","NA","NA","NA","NA",2583,"NA","NA","NA","SocDem","In",29228,2637 44 | 
"Fin",18025,130.16446048937,"No","71","NA","Informed","Presumed","Finland",4986,0.601684717208183,"NA",6.3,30,1268,1.47449356794322,957,853,"SocDem","NA",17436,1414 45 | "Fin",17281,126.047068209015,"No","72",16.8,"Informed","Presumed","Finland",5014,0.598324690865576,"1991-01-01",7.3,28,1414,1.48277391690078,939,823,"SocDem","NA",18025,1547 46 | "Fin",16943,119.198730662436,"No","73",19.4,"Informed","Presumed","Finland",5042,0.595001983339944,"1992-01-01",7.2,32,1547,1.49105426585835,886,792,"SocDem","NA",17281,1542 47 | "Fin",17082,95.5388866956178,"No","74",19.6,"Informed","Presumed","Finland",5066,0.592183181997631,"1993-01-01",6.3,31,1542,1.49815170782197,932,741,"SocDem","NA",16943,1421 48 | "Fin",17993,94.3396226415094,"No","75",20,"Informed","Presumed","Finland",5088,0.589622641509434,"1994-01-01",5.8,29,1421,1.50465769628863,830,738,"SocDem","NA",17082,1390 49 | "Fin",19031,86.335160532498,"No","76",19.4,"Informed","Presumed","Finland",5108,0.587314017227878,"1995-01-01",5.7,28,1390,1.51057223125832,875,737,"SocDem","NA",17993,1428 50 | "Fin",19842,78.8292682926829,"No","77",19.5,"Informed","Presumed","Finland",5125,0.585365853658537,"1996-01-01",5.8,32,1428,1.51559958598255,736,686,"SocDem","NA",19031,1517 51 | "Fin",21691,85.2140077821012,"No","78",16.3,"Informed","Presumed","Finland",5140,0.583657587548638,"1997-01-01",5.5,25,1517,1.52003548720982,721,701,"SocDem","NA",19842,1580 52 | "Fin",23267,77.6096235933256,"No","79",19.8,"Informed","Presumed","Finland",5154,0.582072176949942,"1998-01-01",5.3,22,1580,1.5241756616886,670,686,"SocDem","NA",21691,1607 53 | "Fin",23702,83.4301200154859,"No","80",16.5,"Informed","Presumed","Finland",5166,0.580720092915215,"1999-01-01",5.2,24,1607,1.52772438267041,652,673,"SocDem","NA",23267,1641 54 | "Fin",25397,76.5069551777434,"No","81",19.9,"Informed","Presumed","Finland",5176,0.579598145285935,"2000-01-01",5,25,1641,1.53068165015526,639,657,"SocDem","NA",23702,1698 55 | 
"Fin",26376,83.4618350038551,"No","82",17,"Informed","Presumed","Finland",5188,0.578257517347726,"2001-01-01",5.3,26,1698,1.53423037113707,604,660,"SocDem","NA",25397,1841 56 | "Fin",26616,79.7923476254566,"No","83",17.1,"Informed","Presumed","Finland",5201,0.576812151509325,"2002-01-01",5.5,25,1841,1.53807481886737,587,638,"SocDem","NA",26376,1943 57 | "Fin","NA","NA","No","84","NA","Informed","Presumed","Finland","NA","NA","NA","NA","NA",1943,"NA","NA","NA","SocDem","NA",26616,2045 58 | "Fra",18162,197.764023347264,"No","85","NA","Informed","Presumed","France",56709,0.722989296231639,"NA",6.6,10,1555,10.2826835902085,529,689,"Corporatist","Out",17113,1555 59 | "Fra",18989,183.989750070205,"No","86",19.1,"Informed","Presumed","France",56976,0.719601235607975,"1991-01-01",6.7,10,1555,10.3310970081596,520,668,"Corporatist","Out",18162,1668 60 | "Fra",19566,172.955974842767,"No","87",16.8,"Informed","Presumed","France",57240,0.716282320055905,"1992-01-01",6.9,10,1668,10.3789664551224,487,650,"Corporatist","Out",18989,1769 61 | "Fra",19763,171.69854003167,"No","88",17.1,"Informed","Presumed","France",57467,0.713452938208015,"1993-01-01",7.2,11,1769,10.4201269265639,473,645,"Corporatist","Out",19566,1866 62 | "Fra",20655,156.419639605265,"No","89",15.4,"Informed","Presumed","France",57659,0.711077195234049,"1994-01-01",7.1,11,1866,10.4549410698096,437,615,"Corporatist","Out",19763,1937 63 | "Fra",21283,153.706520987484,"No","90",15.1,"Informed","Presumed","France",57844,0.708802987345273,"1995-01-01",7.3,10,1937,10.4884859474161,431,602,"Corporatist","Out",20655,2025 64 | "Fra",21990,147.192637783063,"No","91",15.1,"Informed","Presumed","France",58026,0.706579809051115,"1996-01-01",7.2,10,2025,10.5214868540345,424,580,"Corporatist","Out",21283,2091 65 | "Fra",23113,145.065970313359,"No","92",15,"Informed","Presumed","France",58208,0.704370533260033,"1997-01-01",7.1,9,2091,10.5544877606528,409,571,"Corporatist","Out",21990,2163 66 | 
"Fra",24044,152.710709270865,"No","93",16.5,"Informed","Presumed","France",58398,0.702078838316381,"1998-01-01",7.1,7,2163,10.588939256573,405,581,"Corporatist","Out",23113,2231 67 | "Fra",24856,144.772529553247,"No","94",16.2,"Informed","Presumed","France",58623,0.699384200740324,"1999-01-01",7.1,7,2231,10.629737080689,394,572,"Corporatist","Out",24044,2306 68 | "Fra",25928,137.174001629992,"No","95",17,"Informed","Presumed","France",58896,0.69614235262157,"2000-01-01",7.1,7,2306,10.6792384406165,383,563,"Corporatist","Out",24856,2416 69 | "Fra",27394,137.856467090147,"No","96",17.8,"Informed","Presumed","France",59192,0.692661170428436,"2001-01-01",7.2,7,2416,10.7329102447869,372,554,"Corporatist","Out",25928,2588 70 | "Fra",28094,128.685741182799,"No","97",20,"Informed","Presumed","France",59486,0.689237803853007,"2002-01-01",7.4,7,2588,10.7862194016319,361,545,"Corporatist","Out",27394,2736 71 | "Fra","NA","NA","No","98","NA","Informed","Presumed","France","NA","NA","NA","NA","NA",2760,"NA","NA","NA","Corporatist","Out",28094,2884 72 | "Ger",20359,174.629272457078,"Yes","99","NA","Informed","Informed","Germany",63254,0.695608182881715,"NA",6.5,10,1630,17.7167184830406,846,477,"Corporatist","In",19015,1729 73 | "Ger",17511,141.225285575024,"Yes","100",13.3,"Informed","Informed","Germany",80014,0.549903766840803,"1991-01-01",7.25,11,1729,22.411001876593,880,480,"Corporatist","In",20359,1845.5 74 | "Ger",19811,131.857364341085,"Yes","101",14.2,"Informed","Informed","Germany",80625,0.545736434108527,"1992-01-01",8,11,1845.5,22.5821359549618,834,456,"Corporatist","In",17511,1962 75 | "Ger",19983,122.591059194637,"Yes","102",13.9,"Informed","Informed","Germany",81156,0.542165705554734,"1993-01-01",8,11,1962,22.7308629526931,817,432,"Corporatist","In",19811,1988 76 | "Ger",20690,120.508853360839,"Yes","103",12.3,"Informed","Informed","Germany",81438,0.540288317493062,"1994-01-01",8.2,11,1988,22.8098479119402,791,417,"Corporatist","In",19983,2106 77 | 
"Ger",21411,115.747202429051,"Yes","104",12.8,"Informed","Informed","Germany",81678,0.538700751732413,"1995-01-01",8.5,11,2106,22.8770691538526,753,399,"Corporatist","In",20690,2263 78 | "Ger",22164,106.915705304279,"Yes","105",12.7,"Informed","Informed","Germany",81915,0.537142159555637,"1996-01-01",8.8,11,2263,22.9434501302412,725,385,"Corporatist","In",21411,2410 79 | "Ger",22589,104.211616992747,"Yes","106",13.2,"Informed","Informed","Germany",82035,0.536356433229719,"1997-01-01",8.5,9,2410,22.9770607511974,679,370,"Corporatist","In",22164,2416 80 | "Ger",23283,94.9699562445915,"Yes","107",13.4,"Informed","Informed","Germany",82047,0.536277987007447,"1998-01-01",8.3,9,2416,22.980421813293,650,345,"Corporatist","In",22589,2470 81 | "Ger",24100,94.6650426309379,"Yes","108",13.2,"Informed","Informed","Germany",82100,0.535931790499391,"1999-01-01",8.4,9,2470,22.9952665042153,608,339,"Corporatist","In",23283,2563 82 | "Ger",24942,91.2640490439352,"Yes","109",12.5,"Informed","Informed","Germany",82212,0.535201673721598,"2000-01-01",8.3,7,2563,23.0266364171078,558,338,"Corporatist","In",24100,2640 83 | "Ger",25436,84.7237401335762,"Yes","110",12.8,"Informed","Informed","Germany",82350,0.534304796599879,"2001-01-01",8.5,7,2640,23.0652886312075,535,329,"Corporatist","In",24942,2735 84 | "Ger",25843,82.9443925856781,"Yes","111",12.2,"Informed","Informed","Germany",82489,0.53340445392719,"2002-01-01",8.6,7,2735,23.1042209338151,512,320,"Corporatist","In",25436,2830 85 | "Ger","NA","NA","Yes","112","NA","Informed","Informed","Germany","NA","NA","NA","NA","NA",2830,"NA","NA","NA","Corporatist","In",25843,2925 86 | "Ita",17430,125.83085033234,"No","113","NA","Informed","Presumed","Italy",56719,0.458400183360073,"NA",6.4,25,1274,18.8222605694564,886,422,"Corporatist","In",16525,1397 87 | "Ita",18209,142.429208295889,"No","114",5.2,"Informed","Presumed","Italy",56751,0.458141706754066,"1991-01-01",6.6,27,1397,18.8328798035442,876,435,"Corporatist","In",17430,1520 88 | 
"Ita",18883,140.945145007826,"No","115",5.8,"Informed","Presumed","Italy",56859,0.457271496157161,"1992-01-01",6.5,21,1520,18.8687197185903,832,419,"Corporatist","In",18209,1584 89 | "Ita",19124,125.558730214377,"No","116",6.2,"Informed","Presumed","Italy",57049,0.455748567021333,"1993-01-01",6.2,16,1584,18.9317714209863,821,396,"Corporatist","In",18883,1554 90 | "Ita",19903,123.959862946647,"No","117",7.9,"Informed","Presumed","Italy",57204,0.454513670372701,"1994-01-01",5.9,15,1554,18.9832083360988,791,381,"Corporatist","In",19124,1557 91 | "Ita",20652,122.510950943265,"No","118",10.1,"Informed","Presumed","Italy",57301,0.453744262752832,"1995-01-01",5.3,14,1557,19.0153978894272,715,376,"Corporatist","In",19903,1524 92 | "Ita",21396,116.312699269997,"No","119",11,"Informed","Presumed","Italy",57397,0.452985347666254,"1996-01-01",5.4,14,1524,19.0472555916904,670,366,"Corporatist","In",20652,1605 93 | "Ita",22030,116.706078731395,"No","120",11.6,"Informed","Presumed","Italy",57512,0.452079566003617,"1997-01-01",5.6,12,1605,19.0854184641933,669,365,"Corporatist","In",21396,1705 94 | "Ita",23291,109.640897409182,"No","121",12.3,"Informed","Presumed","Italy",57588,0.451482947836355,"1998-01-01",5.6,12,1705,19.1106391451517,670,359,"Corporatist","In",22030,1800 95 | "Ita",23729,115.064358324949,"No","122",13.7,"Informed","Presumed","Italy",57646,0.451028692363737,"1999-01-01",5.6,11,1800,19.1298865069357,627,343,"Corporatist","In",23291,1853 96 | "Ita",24629,110.972611751671,"No","123",14.2,"Informed","Presumed","Italy",57762,0.450122918181503,"2000-01-01",6,10,1853,19.1683812305038,597,327,"Corporatist","In",23729,2001 97 | "Ita",25359,115.417832590597,"No","124",17.1,"Informed","Presumed","Italy",57894,0.449096624866135,"2001-01-01",6.3,9,2001,19.2121855711157,567,311,"Corporatist","In",24629,2107 98 | 
"Ita",25569,119.908956098907,"No","125",18.1,"Informed","Presumed","Italy",57994,0.448322240231748,"2002-01-01",6.4,8,2107,19.2453706776399,537,295,"Corporatist","In",25359,2166 99 | "Ita","NA","NA","No","126","NA","Informed","Presumed","Italy","NA","NA","NA","NA","NA",2166,"NA","NA","NA","Corporatist","In",25569,2225 100 | "Irl",12917,136.027319294252,"Yes","127","NA","Informed","Informed","Ireland",3514,0.853727945361411,"NA",4.4,7,727,5.00071154119824,825,432,"Liberal","In",11434,791 101 | "Irl",13495,125.919637804188,"Yes","128",19,"Informed","Informed","Ireland",3534,0.848896434634975,"1991-01-01",4.8,6,791,5.02917318912765,807,410,"Liberal","In",12917,884 102 | "Irl",14241,116.63856098932,"Yes","129",19.5,"Informed","Informed","Ireland",3558,0.843170320404722,"1992-01-01",5,8,884,5.06332716664295,787,379,"Liberal","In",13495,1005 103 | "Irl",14927,120.525727069351,"Yes","130",17.1,"Informed","Informed","Ireland",3576,0.838926174496644,"1993-01-01",5.1,7,1005,5.08894264977942,802,374,"Liberal","In",14241,1041 104 | "Irl",15990,112.534818941504,"Yes","131",20.3,"Informed","Informed","Ireland",3590,0.835654596100278,"1994-01-01",5,7,1041,5.10886580333001,749,392,"Liberal","In",14927,1119 105 | "Irl",17789,121.086173455251,"Yes","132",24.6,"Informed","Informed","Ireland",3609,0.831255195344971,"1995-01-01",4.9,8,1119,5.13590436886296,733,387,"Liberal","In",15990,1208 106 | "Irl",19245,124.587458745875,"Yes","133",16.8,"Informed","Informed","Ireland",3636,0.825082508250825,"1996-01-01",4.7,9,1208,5.17432759356767,719,391,"Liberal","In",17789,1269 107 | "Irl",22017,128.777566022325,"Yes","134",20.9,"Informed","Informed","Ireland",3673,0.81677103185407,"1997-01-01",4.8,8,1269,5.22698164223709,665,394,"Liberal","In",19245,1417 108 | "Irl",23995,123.28398384926,"Yes","135",23.8,"Informed","Informed","Ireland",3715,0.807537012113055,"1998-01-01",4.7,11,1417,5.28675110288886,614,417,"Liberal","In",22017,1487 109 | 
"Irl",25936,110.223642172524,"Yes","136",18.7,"Informed","Informed","Ireland",3756,0.798722044728434,"1999-01-01",4.6,10,1487,5.34509748114416,663,409,"Liberal","In",23995,1623 110 | "Irl",27891,109.181794264667,"Yes","137",17.6,"Informed","Informed","Ireland",3801,0.789265982636148,"2000-01-01",4.7,10,1623,5.40913618898534,631,394,"Liberal","In",25936,1774 111 | "Irl",29703,106.652860471136,"Yes","138",18.2,"Informed","Informed","Ireland",3863,0.776598498576236,"2001-01-01",5.2,10,1774,5.49736729756653,599,379,"Liberal","In",27891,2059 112 | "Irl",32571,95.6256358087487,"Yes","139",21,"Informed","Informed","Ireland",3932,0.762970498474059,"2002-01-01",5.5,10,2059,5.59555998292301,567,364,"Liberal","In",29703,2367 113 | "Irl","NA","NA","Yes","140","NA","Informed","Informed","Ireland","NA","NA","NA","NA","NA",2367,"NA","NA","NA","Liberal","In",32571,2675 114 | "Neth",17707,92.027822364901,"Yes","141","NA","Informed","Informed","Netherlands",14952,0.735687533440342,"NA",5.4,9,1320,36.0028894774862,649,310,"SocDem","In",16580,1419 115 | "Neth",18708,85.0033178500332,"Yes","142",14.9,"Informed","Informed","Netherlands",15070,0.72992700729927,"1991-01-01",5.7,11,1419,36.2870214302914,651,314,"SocDem","In",17707,1532 116 | "Neth",19285,82.5210748155954,"Yes","143",15.1,"Informed","Informed","Netherlands",15184,0.724446786090622,"1992-01-01",6.1,12,1532,36.561521791476,650,309,"SocDem","In",18708,1623 117 | "Neth",19856,80.7717462393721,"Yes","144",15.4,"Informed","Informed","Netherlands",15290,0.719424460431655,"1993-01-01",6.3,12,1623,36.8167589694197,638,295,"SocDem","In",19285,1700 118 | "Neth",20768,84.3788597802769,"Yes","145",13.1,"Informed","Informed","Netherlands",15383,0.715075082883703,"1994-01-01",6.1,10,1700,37.0406934745967,616,294,"SocDem","In",19856,1745 119 | "Neth",21723,86.2927744356039,"Yes","146",15.2,"Informed","Informed","Netherlands",15459,0.711559609289087,"1995-01-01",6,12,1745,37.2236937153865,594,286,"SocDem","In",20768,1827 120 | 
"Neth",22541,75.9770781018608,"Yes","147",15.1,"Informed","Informed","Netherlands",15531,0.708260897559719,"1996-01-01",5.5,13,1827,37.3970623645557,577,293,"SocDem","In",21723,1878 121 | "Neth",23753,74.4987508807892,"Yes","148",14.4,"Informed","Informed","Netherlands",15611,0.704631349689322,"1997-01-01",5.5,13,1878,37.5896941969661,563,282,"SocDem","In",22541,1936 122 | "Neth",24780,67.8678296301012,"Yes","149",13,"Informed","Informed","Netherlands",15707,0.700324695995416,"1998-01-01",5.5,11,1936,37.8208523958584,552,265,"SocDem","In",23753,2016 123 | "Neth",25438,68.9349860865166,"Yes","150",10.9,"Informed","Informed","Netherlands",15812,0.695674171515305,"1999-01-01",5.5,12,2016,38.0736816758969,557,276,"SocDem","In",24780,2098 124 | "Neth",26873,67.939218887354,"Yes","151",12.6,"Informed","Informed","Netherlands",15926,0.690694461886224,"2000-01-01",5.5,11,2098,38.3481820370816,538,270,"SocDem","In",25438,2196 125 | "Neth",28756,61.8845818272467,"Yes","152",11.6,"Informed","Informed","Netherlands",16046,0.685529103826499,"2001-01-01",5.5,10,2196,38.6371297856971,519,264,"SocDem","In",26873,2455 126 | "Neth",28983,61.1183355006502,"Yes","153",12.6,"Informed","Informed","Netherlands",16149,0.681156727970772,"2002-01-01",5.5,9,2455,38.8851432699254,500,258,"SocDem","In",28756,2643 127 | "Neth","NA","NA","Yes","154","NA","Informed","Informed","Netherlands","NA","NA","NA","NA","NA",2643,"NA","NA","NA","SocDem","In",28983,2831 128 | "Nor",17905,78.2649693540783,"No","155","NA","Informed","Presumed","Norway",4242,0.235737859500236,"NA",6.4,11,1297,1.30974434975917,816,518,"SocDem","Out",16942,1385 129 | "Nor",19134,75.7860159549507,"No","156",15.2,"Informed","Presumed","Norway",4262,0.234631628343501,"1991-01-01",6.8,14,1385,1.31591947634927,758,482,"SocDem","Out",17905,1542 130 | "Nor",20128,75.8282781147923,"No","157",14.3,"Informed","Presumed","Norway",4286,0.233317778814746,"1992-01-01",6.9,11,1542,1.32332962825738,760,463,"SocDem","Out",19134,1643 131 | 
"Nor",21047,65.1669758812616,"No","158",17.1,"Informed","Presumed","Norway",4312,0.23191094619666,"1993-01-01",6.8,10,1643,1.3313572928245,767,442,"SocDem","Out",20128,1688 132 | "Nor",22503,65.252478671893,"No","159",16,"Informed","Presumed","Norway",4337,0.230574129582661,"1994-01-01",6.7,7,1688,1.33907620106212,696,405,"SocDem","Out",21047,1777 133 | "Nor",23868,69.9701766460197,"No","160",15.7,"Informed","Presumed","Norway",4359,0.229410415232852,"1995-01-01",6.7,10,1777,1.34586884031123,668,412,"SocDem","Out",22503,1892 134 | "Nor",26218,58.2058890664232,"No","161",15.1,"Informed","Presumed","Norway",4381,0.228258388495777,"1996-01-01",6.6,10,1892,1.35266147956033,689,394,"SocDem","Out",23868,2064 135 | "Nor",27784,68.7854710556186,"No","162",15.1,"Informed","Presumed","Norway",4405,0.227014755959137,"1997-01-01",6.6,9,2064,1.36007163146845,646,406,"SocDem","Out",26218,2179 136 | "Nor",27323,79.4223826714802,"No","163",15.6,"Informed","Presumed","Norway",4432,0.225631768953069,"1998-01-01",7.2,10,2179,1.36840805236507,639,408,"SocDem","Out",27784,2314 137 | "Nor",30005,68.1308830121022,"No","164",15.5,"Informed","Presumed","Norway",4462,0.224114746750336,"1999-01-01",7.3,8,2314,1.37767074225022,593,424,"SocDem","Out",27323,2561 138 | "Nor",35829,75.9296370518815,"No","165",17.6,"Informed","Presumed","Norway",4491,0.222667557336896,"2000-01-01",6.5,12,2561,1.38662467580585,554,412,"SocDem","Out",30005,2747 139 | "Nor",36554,60.9215773150199,"No","166",14.4,"Informed","Presumed","Norway",4514,0.221533008418254,"2001-01-01",6.9,8,2747,1.39372607138446,523,383,"SocDem","Out",35829,2946 140 | "Nor",35531,68.3120317320406,"No","167",13.7,"Informed","Presumed","Norway",4538,0.220361392684002,"2002-01-01",7.4,4,3258,1.40113622329258,492,354,"SocDem","Out",36554,3083 141 | "Nor","NA","NA","No","168","NA","Informed","Presumed","Norway","NA","NA","NA","NA","NA",3409,"NA","NA","NA","SocDem","Out",35531,3220 142 | 
"Spa",12971,232.483912483912,"No","169","NA","Informed","Presumed","Spain",38850,0.720720720720721,"NA",5.3,9,759,7.67801735212158,862,453,"NA","Out",12051,865 143 | "Spa",13754,227.029804727646,"No","170",20.2,"Informed","Presumed","Spain",38920,0.719424460431655,"1991-01-01",5.3,9,865,7.6918516176209,856,441,"NA","Out",12971,946 144 | "Spa",14331,200.405013970419,"No","171",21.7,"Informed","Presumed","Spain",39011,0.71774627669119,"1992-01-01",5.6,8,946,7.70983616277001,786,405,"NA","Out",13754,1035 145 | "Spa",14359,163.085737671373,"No","172",22.6,"Informed","Presumed","Spain",39096,0.716185799058727,"1993-01-01",5.8,9,1035,7.72663491373347,752,381,"NA","Out",14331,1084 146 | "Spa",15024,143.338610018894,"No","173",25,"Informed","Presumed","Spain",39166,0.714905785630394,"1994-01-01",5.6,8,1084,7.74046917923279,719,368,"NA","Out",14359,1116 147 | "Spa",15720,146.57216429136,"No","174",27,"Informed","Presumed","Spain",39223,0.713866863829896,"1995-01-01",5.5,8,1116,7.75173422399652,681,367,"NA","Out",15024,1195 148 | "Spa",16416,139.565671223809,"No","175",26.8,"Informed","Presumed","Spain",39279,0.712849105119784,"1996-01-01",5.5,8,1195,7.76280163639598,644,362,"NA","Out",15720,1250 149 | "Spa",17203,142.421469960354,"No","176",29,"Informed","Presumed","Spain",39348,0.711599064755515,"1997-01-01",5.4,8,1250,7.77643826953102,609,361,"NA","Out",16416,1287 150 | "Spa",18332,150.989785314171,"No","177",31.5,"Informed","Presumed","Spain",39453,0.70970521886802,"1998-01-01",5.4,8,1287,7.79718966778,597,366,"NA","Out",17203,1371 151 | "Spa",19546,144.803916620401,"No","178",33.4,"Informed","Presumed","Spain",39626,0.706606773330641,"1999-01-01",5.4,8,1371,7.83138006679974,587,351,"NA","Out",18332,1467 152 | "Spa",20017,142.2169695179,"No","179",33.9,"Informed","Presumed","Spain",40614,0.689417442261289,"2000-01-01",5.3,9,1467,8.0266408427044,530,349,"NA","Out",19546,1493 153 | 
"Spa",20864,133.880245625106,"No","180",32.5,"Informed","Presumed","Spain",41201,0.679595155457392,"2001-01-01",5.4,10,1493,8.1426510405344,473,347,"NA","Out",20017,1567 154 | "Spa",21592,127.692601614367,"No","181",33.7,"Informed","Presumed","Spain",41874,0.668672684720829,"2002-01-01",5.4,11,1567,8.27565762169213,416,345,"NA","Out",20864,1646 155 | "Spa","NA","NA","No","182","NA","Informed","Presumed","Spain","NA","NA","NA","NA","NA",1646,"NA","NA","NA","NA","Out",21592,1725 156 | "Swe",18660,99.0769949760486,"No","183","NA","Informed","Presumed","Sweden",8559,0.701016473887136,"NA",7.5,12,1497,1.90216908169615,669,468,"SocDem","Out",17915,1566 157 | "Swe",19000,89.5903446675177,"No","184",16.4,"Informed","Presumed","Sweden",8617,0.696298015550656,"1991-01-01",7.2,14,1566,1.9150591163659,670,454,"SocDem","Out",18660,1552 158 | "Swe",19116,85.9483156437471,"No","185",14.9,"Informed","Presumed","Sweden",8668,0.692201199815413,"1992-01-01",7.3,13,1552,1.9263934571962,648,419,"SocDem","Out",19000,1594 159 | "Swe",19063,87.0512673471729,"No","186",15.2,"Informed","Presumed","Sweden",8719,0.688152311044845,"1993-01-01",7.5,12,1594,1.93772779802649,632,407,"SocDem","Out",19116,1637 160 | "Swe",20132,71.9735793189842,"No","187",12.8,"Informed","Presumed","Sweden",8781,0.683293474547318,"1994-01-01",7.1,12,1637,1.9515068006045,605,421,"SocDem","Out",19063,1648 161 | "Swe",21290,66.7270873456441,"No","188",13,"Informed","Presumed","Sweden",8827,0.679732638495525,"1995-01-01",7.1,10,1648,1.96172993154947,587,379,"SocDem","Out",20132,1733 162 | "Swe",22029,64.6985635109151,"No","189",11.9,"Informed","Presumed","Sweden",8841,0.678656260604004,"1996-01-01",7.3,12,1733,1.96484131922838,585,363,"SocDem","Out",21290,1845 163 | "Swe",22756,60.705403572236,"No","190",12.6,"Informed","Presumed","Sweden",8846,0.678272665611576,"1997-01-01",7.1,10,1845,1.9659525291137,591,366,"SocDem","Out",22029,1870 164 | 
"Swe",23525,61.123036944978,"No","191",14.6,"Informed","Presumed","Sweden",8851,0.677889504010846,"1998-01-01",7.2,11,1870,1.96706373899902,588,368,"SocDem","Out",22756,1961 165 | "Swe",25099,59.9458116956424,"No","192",12.1,"Informed","Presumed","Sweden",8858,0.677353804470535,"1999-01-01",7.2,12,1961,1.96861943283847,571,355,"SocDem","Out",23525,2119 166 | "Swe",26574,65.3742110009017,"No","193",10.9,"Informed","Presumed","Sweden",8872,0.676284941388638,"2000-01-01",7.2,10,2119,1.97173082051738,555,352,"SocDem","Out",25099,2243 167 | "Swe",26902,65.535071942446,"No","194",12.1,"Informed","Presumed","Sweden",8896,0.674460431654676,"2001-01-01",7.5,9,2243,1.97706462796693,531,382,"SocDem","Out",26574,2370 168 | "Swe",27255,62.7450980392157,"No","195",11,"Informed","Presumed","Sweden",8925,0.672268907563025,"2002-01-01",7.9,8,2370,1.9835096453018,507,412,"SocDem","Out",26902,2517 169 | "Swe","NA","NA","No","196","NA","Informed","Presumed","Sweden","NA","NA","NA","NA","NA",2517,"NA","NA","NA","SocDem","Out",27255,2664 170 | "Swiz",24648,142.133492252682,"No","197","NA","Informed","Presumed","Switzerland",6712,1.04290822407628,"NA",4.3,13,1915,16.2557519980625,545,638,"Corporatist","NA",23009,2040 171 | "Swiz",24879,126.526408709725,"No","198",15.6,"Informed","Presumed","Switzerland",6797,1.02986611740474,"1991-01-01",4.7,13,2040,16.4616129813514,507,637,"Corporatist","NA",24648,2220 172 | "Swiz",25135,121.309090909091,"No","199",14.8,"Informed","Presumed","Switzerland",6875,1.01818181818182,"1992-01-01",5,14,2220,16.650520707193,492,605,"Corporatist","NA",24879,2345 173 | "Swiz",25316,104.20870567887,"No","200",16.6,"Informed","Presumed","Switzerland",6938,1.00893629287979,"1993-01-01",5.1,16,2345,16.8031000242189,477,558,"Corporatist","NA",25135,2384 174 | "Swiz",25901,97.0832141835859,"No","201",15.9,"Informed","Presumed","Switzerland",6994,1.0008578781813,"1994-01-01",5.1,13,2384,16.9387260837975,466,562,"Corporatist","NA",25316,2457 175 | 
"Swiz",26304,98.2814941059509,"No","202",13,"Informed","Presumed","Switzerland",7041,0.994176963499503,"1995-01-01",5.2,10,2457,17.0525550980867,441,443,"Corporatist","NA",25901,2555 176 | "Swiz",26180,87.10407239819,"No","203",12.57,"Informed","Presumed","Switzerland",7072,0.989819004524887,"1996-01-01",5.5,11,2555,17.1276338096391,414,417,"Corporatist","NA",26304,2654 177 | "Swiz",27675,82.8043447594865,"No","204",14.28,"Informed","Presumed","Switzerland",7089,0.987445337847369,"1997-01-01",5.6,14,2654,17.1688060062969,412,407,"Corporatist","NA",26180,2812 178 | "Swiz",28733,83.9662447257384,"No","205",15.43,"Informed","Presumed","Switzerland",7110,0.984528832630098,"1998-01-01",5.7,8,2812,17.2196657786389,390,400,"Corporatist","NA",27675,2967 179 | "Swiz",28562,81.6069428891377,"No","206",14.42,"Informed","Presumed","Switzerland",7144,0.979843225083987,"1999-01-01",5.8,10,2967,17.3020101719545,381,387,"Corporatist","NA",28733,2985 180 | "Swiz",29837,82.4053452115813,"No","207",14,"Informed","Presumed","Switzerland",7184,0.974387527839644,"2000-01-01",5.8,8,2985,17.3988859287963,354,409,"Corporatist","NA",28562,3111 181 | "Swiz",30134,75.2108392091802,"No","208",13.19,"Informed","Presumed","Switzerland",7233,0.967786533941656,"2001-01-01",6.2,6,3111,17.5175587309276,327,431,"Corporatist","NA",29837,3288 182 | "Swiz",30725,70.3703703703704,"No","209",10.4,"Informed","Presumed","Switzerland",7290,0.960219478737997,"2002-01-01",6.5,4,3288,17.6556066844272,300,453,"Corporatist","NA",30134,3445 183 | "Swiz","NA","NA","No","210","NA","Informed","Presumed","Switzerland","NA","NA","NA","NA","NA",3446,"NA","NA","NA","Corporatist","NA",30725,3602 184 | "UK",16228,94.3778608616653,"Yes","211","NA","Informed","Informed","United Kingdom",57238,0.716307348265139,"NA",5,7,940,23.5634597175909,858,326,"Liberal","In",15804,977 185 | "UK",16729,82.7486550949703,"Yes","212",14.2,"Informed","Informed","United 
Kingdom",57439,0.713800727728547,"1991-01-01",5.4,7,977,23.6462064138982,860,311,"Liberal","In",16228,1079 186 | "UK",17110,76.0480705776111,"Yes","213",14.4,"Informed","Informed","United Kingdom",57582,0.712028064325657,"1992-01-01",5.8,9,1079,23.7050759540571,818,298,"Liberal","In",16729,1184 187 | "UK",17952,68.5657846857618,"Yes","214",13.9,"Informed","Informed","United Kingdom",57711,0.710436485245447,"1993-01-01",5.9,12,1184,23.7581820427319,759,288,"Liberal","In",17110,1246 188 | "UK",18994,65.802437127301,"Yes","215",14.2,"Informed","Informed","United Kingdom",57855,0.708668222279838,"1994-01-01",5.9,10,1246,23.8174632579968,721,283,"Liberal","In",17952,1331 189 | "UK",19998,64.9081975691751,"Yes","216",14.4,"Informed","Informed","United Kingdom",58005,0.706835617619171,"1995-01-01",5.8,10,1331,23.8792145238977,718,279,"Liberal","In",18994,1393 190 | "UK",20839,64.5005934054593,"Yes","217",13.6,"Informed","Informed","United Kingdom",58139,0.705206487899689,"1996-01-01",5.8,9,1393,23.9343789881026,698,279,"Liberal","In",19998,1461 191 | "UK",22442,64.2211279446837,"Yes","218",13.4,"Informed","Informed","United Kingdom",58283,0.703464131908104,"1997-01-01",5.5,7,1461,23.9936602033675,671,279,"Liberal","In",20839,1535 192 | "UK",23343,61.2765229295003,"Yes","219",12.3,"Informed","Informed","United Kingdom",58440,0.701574264202601,"1998-01-01",5.5,7,1535,24.0582931950105,660,274,"Liberal","In",22442,1607 193 | "UK",24086,58.3781018163213,"Yes","220",12.1,"Informed","Informed","United Kingdom",58635,0.699241067621728,"1999-01-01",5.8,8,1607,24.1385698406817,640,277,"Liberal","In",23343,1725 194 | "UK",25271,60.8667562099393,"Yes","221",13.2,"Informed","Informed","United Kingdom",58817,0.69707737558869,"2000-01-01",5.9,9,1725,24.2134947099749,620,280,"Liberal","In",24086,1839 195 | "UK",26720,60.9303822119862,"Yes","222",13.2,"Informed","Informed","United Kingdom",59051,0.694315083571828,"2001-01-01",6.2,10,1839,24.3098266847804,600,283,"Liberal","In",25271,2012 
196 | "UK",27959,60.4571853052404,"Yes","223",13,"Informed","Informed","United Kingdom",59232,0.692193408968125,"2002-01-01",6.4,11,2012,24.3843398789675,580,286,"Liberal","In",26720,2160 197 | "UK","NA","NA","Yes","224","NA","Informed","Informed","United Kingdom","NA","NA","NA","NA","NA",2160,"NA","NA","NA","Liberal","In",27959,2308 198 | "USA",23038,178.665427464617,"Yes","225","NA","Informed","Informed","United States",249623,1.09765526413832,"NA",4.7,97,2475,2.59238411937161,474,576,"Liberal","In",22039,2738 199 | "USA",23443,164.075562986944,"Yes","226",17.89,"Informed","Informed","United States",252981,1.08308529099023,"1991-01-01",5.2,103,2738,2.62725761209003,457,565,"Liberal","In",23038,2957 200 | "USA",24411,153.01309090342,"Yes","227",17.62,"Informed","Informed","United States",256514,1.06816781930031,"1992-01-01",5.5,99,2957,2.66394851434559,447,545,"Liberal","In",23443,3165 201 | "USA",25327,154.471200643277,"Yes","228",18.7,"Informed","Informed","United States",259919,1.05417456976981,"1993-01-01",5.7,101,3165,2.6993101113397,454,559,"Liberal","In",24411,3357 202 | "USA",26578,154.739554433997,"Yes","229",19.38,"Informed","Informed","United States",263126,1.0413262087365,"1994-01-01",5.9,96,3357,2.7326154392575,453,553,"Liberal","In",25327,3500 203 | "USA",27559,157.042639647286,"Yes","230",20.14,"Informed","Informed","United States",266278,1.02899976716064,"1995-01-01",6,88,3500,2.76534958132077,457,546,"Liberal","In",26578,3655 204 | "USA",28772,156.146759022101,"Yes","231",20.1,"Informed","Informed","United States",269394,1.01709763394879,"1996-01-01",6,80,3655,2.79770985627925,452,535,"Liberal","In",27559,3792 205 | "USA",30283,154.093021379293,"Yes","232",20.09,"Informed","Informed","United States",272647,1.00496246061758,"1997-01-01",5.9,75,3792,2.83149290327539,443,526,"Liberal","In",28772,3939 206 | "USA",31612,150.445525531622,"Yes","233",21,"Informed","Informed","United 
States",275854,0.993279053412312,"1998-01-01",5.8,68,3939,2.86479823119319,428,519,"Liberal","In",30283,4096 207 | "USA",33016,149.50186353211,"Yes","234",20.87,"Informed","Informed","United States",279040,0.981938073394495,"1999-01-01",5.8,64,4096,2.89788546996653,440,514,"Liberal","In",31612,4298 208 | "USA",34590,148.623079539656,"Yes","235",21.21,"Informed","Informed","United States",282224,0.970860026078576,"2000-01-01",5.8,61,4298,2.93095193834516,432,499,"Liberal","In",33016,4538 209 | "USA",35118,147.891124990362,"Yes","236",21.31,"Informed","Informed","United States",285318,0.960331980456894,"2001-01-01",6.2,58,4538,2.96308373896183,424,484,"Liberal","In",34590,4869 210 | "USA",36006,148.472963460011,"Yes","237",21.47,"Informed","Informed","United States",288369,0.950171481677989,"2002-01-01",6.6,55,4869,2.99476897609224,416,469,"Liberal","In",35118,5267 211 | "USA","NA","NA","Yes","238","NA","Informed","Informed","United States","NA","NA","NA","NA","NA",5267,"NA","NA","NA","Liberal","In",36006,5665 212 | -------------------------------------------------------------------------------- /sawzall-test/results/where_organdata_2.csv: -------------------------------------------------------------------------------- 1 | "ccode","gdp","roads","consistent","#f","donors","consent_practice","consent_law","country","pop","txp_pop","year","pubhealth","assault","health_lag","pop_dens","cerebvas","external","world","opt","gdp_lag","health" 2 | "Ita",17430,125.83085033234,"No","113","NA","Informed","Presumed","Italy",56719,0.458400183360073,"NA",6.4,25,1274,18.8222605694564,886,422,"Corporatist","In",16525,1397 3 | "Ita",18209,142.429208295889,"No","114",5.2,"Informed","Presumed","Italy",56751,0.458141706754066,"1991-01-01",6.6,27,1397,18.8328798035442,876,435,"Corporatist","In",17430,1520 4 | "Ita",18883,140.945145007826,"No","115",5.8,"Informed","Presumed","Italy",56859,0.457271496157161,"1992-01-01",6.5,21,1520,18.8687197185903,832,419,"Corporatist","In",18209,1584 5 | 
"Ita",19124,125.558730214377,"No","116",6.2,"Informed","Presumed","Italy",57049,0.455748567021333,"1993-01-01",6.2,16,1584,18.9317714209863,821,396,"Corporatist","In",18883,1554 6 | "Ita",19903,123.959862946647,"No","117",7.9,"Informed","Presumed","Italy",57204,0.454513670372701,"1994-01-01",5.9,15,1554,18.9832083360988,791,381,"Corporatist","In",19124,1557 7 | "Ita",20652,122.510950943265,"No","118",10.1,"Informed","Presumed","Italy",57301,0.453744262752832,"1995-01-01",5.3,14,1557,19.0153978894272,715,376,"Corporatist","In",19903,1524 8 | "Ita",21396,116.312699269997,"No","119",11,"Informed","Presumed","Italy",57397,0.452985347666254,"1996-01-01",5.4,14,1524,19.0472555916904,670,366,"Corporatist","In",20652,1605 9 | "Ita",22030,116.706078731395,"No","120",11.6,"Informed","Presumed","Italy",57512,0.452079566003617,"1997-01-01",5.6,12,1605,19.0854184641933,669,365,"Corporatist","In",21396,1705 10 | "Ita",23291,109.640897409182,"No","121",12.3,"Informed","Presumed","Italy",57588,0.451482947836355,"1998-01-01",5.6,12,1705,19.1106391451517,670,359,"Corporatist","In",22030,1800 11 | "Ita",23729,115.064358324949,"No","122",13.7,"Informed","Presumed","Italy",57646,0.451028692363737,"1999-01-01",5.6,11,1800,19.1298865069357,627,343,"Corporatist","In",23291,1853 12 | "Ita",24629,110.972611751671,"No","123",14.2,"Informed","Presumed","Italy",57762,0.450122918181503,"2000-01-01",6,10,1853,19.1683812305038,597,327,"Corporatist","In",23729,2001 13 | "Ita",25359,115.417832590597,"No","124",17.1,"Informed","Presumed","Italy",57894,0.449096624866135,"2001-01-01",6.3,9,2001,19.2121855711157,567,311,"Corporatist","In",24629,2107 14 | "Ita",25569,119.908956098907,"No","125",18.1,"Informed","Presumed","Italy",57994,0.448322240231748,"2002-01-01",6.4,8,2107,19.2453706776399,537,295,"Corporatist","In",25359,2166 15 | "Ita","NA","NA","No","126","NA","Informed","Presumed","Italy","NA","NA","NA","NA","NA",2166,"NA","NA","NA","Corporatist","In",25569,2225 16 | 
"Irl",12917,136.027319294252,"Yes","127","NA","Informed","Informed","Ireland",3514,0.853727945361411,"NA",4.4,7,727,5.00071154119824,825,432,"Liberal","In",11434,791 17 | "Irl",13495,125.919637804188,"Yes","128",19,"Informed","Informed","Ireland",3534,0.848896434634975,"1991-01-01",4.8,6,791,5.02917318912765,807,410,"Liberal","In",12917,884 18 | "Irl",14241,116.63856098932,"Yes","129",19.5,"Informed","Informed","Ireland",3558,0.843170320404722,"1992-01-01",5,8,884,5.06332716664295,787,379,"Liberal","In",13495,1005 19 | "Irl",14927,120.525727069351,"Yes","130",17.1,"Informed","Informed","Ireland",3576,0.838926174496644,"1993-01-01",5.1,7,1005,5.08894264977942,802,374,"Liberal","In",14241,1041 20 | "Irl",15990,112.534818941504,"Yes","131",20.3,"Informed","Informed","Ireland",3590,0.835654596100278,"1994-01-01",5,7,1041,5.10886580333001,749,392,"Liberal","In",14927,1119 21 | "Irl",17789,121.086173455251,"Yes","132",24.6,"Informed","Informed","Ireland",3609,0.831255195344971,"1995-01-01",4.9,8,1119,5.13590436886296,733,387,"Liberal","In",15990,1208 22 | "Irl",19245,124.587458745875,"Yes","133",16.8,"Informed","Informed","Ireland",3636,0.825082508250825,"1996-01-01",4.7,9,1208,5.17432759356767,719,391,"Liberal","In",17789,1269 23 | "Irl",22017,128.777566022325,"Yes","134",20.9,"Informed","Informed","Ireland",3673,0.81677103185407,"1997-01-01",4.8,8,1269,5.22698164223709,665,394,"Liberal","In",19245,1417 24 | "Irl",23995,123.28398384926,"Yes","135",23.8,"Informed","Informed","Ireland",3715,0.807537012113055,"1998-01-01",4.7,11,1417,5.28675110288886,614,417,"Liberal","In",22017,1487 25 | "Irl",25936,110.223642172524,"Yes","136",18.7,"Informed","Informed","Ireland",3756,0.798722044728434,"1999-01-01",4.6,10,1487,5.34509748114416,663,409,"Liberal","In",23995,1623 26 | "Irl",27891,109.181794264667,"Yes","137",17.6,"Informed","Informed","Ireland",3801,0.789265982636148,"2000-01-01",4.7,10,1623,5.40913618898534,631,394,"Liberal","In",25936,1774 27 | 
"Irl",29703,106.652860471136,"Yes","138",18.2,"Informed","Informed","Ireland",3863,0.776598498576236,"2001-01-01",5.2,10,1774,5.49736729756653,599,379,"Liberal","In",27891,2059 28 | "Irl",32571,95.6256358087487,"Yes","139",21,"Informed","Informed","Ireland",3932,0.762970498474059,"2002-01-01",5.5,10,2059,5.59555998292301,567,364,"Liberal","In",29703,2367 29 | "Irl","NA","NA","Yes","140","NA","Informed","Informed","Ireland","NA","NA","NA","NA","NA",2367,"NA","NA","NA","Liberal","In",32571,2675 30 | -------------------------------------------------------------------------------- /sawzall-test/results/where_organdata_3.csv: -------------------------------------------------------------------------------- 1 | "gdp_lag","gdp","roads","consistent","#f","donors","country","consent_law","consent_practice","txp_pop","pop","year","pubhealth","assault","health_lag","pop_dens","cerebvas","external","world","opt","ccode","health" 2 | 17425,18914,202.917426413128,"Yes","15","NA","Austria","Presumed","Presumed",0.651211252930451,7678,"NA",5.2,16,1255,9.15573575005962,953,614,"Corporatist","Out","Aus",1344 3 | 18914,19860,200,"Yes","16",27.6,"Austria","Presumed","Presumed",0.644745325596389,7755,"1991-01-01",5.2,12,1344,9.24755544955879,942,590,"Corporatist","Out","Aus",1419 4 | 19860,20601,178.931258768014,"Yes","17",23.1,"Austria","Presumed","Presumed",0.637673766101263,7841,"1992-01-01",5.5,14,1419,9.35010732172669,880,549,"Corporatist","Out","Aus",1551 5 | 20601,21119,162.28181128257,"Yes","18",26.2,"Austria","Presumed","Presumed",0.632431065013913,7906,"1993-01-01",5.9,13,1551,9.42761745766754,828,551,"Corporatist","Out","Aus",1674 6 | 21119,21940,168.598790322581,"Yes","19",21.4,"Austria","Presumed","Presumed",0.630040322580645,7936,"1994-01-01",5.9,11,1674,9.46339136656332,803,561,"Corporatist","Out","Aus",1739 7 | 
21940,22817,152.239557121288,"Yes","20",21.5,"Austria","Presumed","Presumed",0.629089079013588,7948,"1995-01-01",5.8,10,1739,9.47770093012163,774,527,"Corporatist","Out","Aus",1865 8 | 22817,23798,129.036311094359,"Yes","21",24.7,"Austria","Presumed","Presumed",0.628219625581103,7959,"1996-01-01",5.8,11,1865,9.49081803005008,801,499,"Corporatist","Out","Aus",1986 9 | 23798,24364,138.679718875502,"Yes","22",19.5,"Austria","Presumed","Presumed",0.627510040160643,7968,"1997-01-01",5.3,9,1986,9.50155020271882,770,470,"Corporatist","Out","Aus",1848 10 | 24364,25423,120.722075968409,"Yes","23",20.7,"Austria","Presumed","Presumed",0.626802055910743,7977,"1998-01-01",5.4,11,1848,9.51228237538755,732,439,"Corporatist","Out","Aus",1953 11 | 25423,26513,135.01001001001,"Yes","24",25.9,"Austria","Presumed","Presumed",0.625625625625626,7992,"1999-01-01",5.4,8,1953,9.53016932983544,699,458,"Corporatist","Out","Aus",2069 12 | 26513,27738,121.817274088867,"Yes","25",24,"Austria","Presumed","Presumed",0.624063904143784,8012,"2000-01-01",5.4,9,2069,9.55401860243263,644,460,"Corporatist","Out","Aus",2147 13 | 27738,28457,119.302615193026,"Yes","26",23.9,"Austria","Presumed","Presumed",0.62266500622665,8030,"2001-01-01",5.2,9,2147,9.57548294777009,583,429,"Corporatist","Out","Aus",2174 14 | 28457,28842,118.713522910717,"Yes","27",23.8,"Austria","Presumed","Presumed",0.620886626102074,8053,"2002-01-01",5.4,9,2174,9.60290961125686,586,442,"Corporatist","Out","Aus",2220 15 | 28842,"NA","NA","Yes","28","NA","Austria","Presumed","Presumed","NA","NA","NA","NA","NA",2220,"NA","NA","NA","Corporatist","Out","Aus",2266 16 | 16848,18008,198.254238988663,"No","29","NA","Belgium","Presumed","Presumed",0.802648740844788,9967,"NA","NA",13,1246,30.1117824773414,688,565,"Corporatist","Out","Bel",1340 17 | 18008,18796,187.206396801599,"No","30",21,"Belgium","Presumed","Presumed",0.79960019990005,10005,"1991-01-01","NA",14,1340,30.226586102719,662,561,"Corporatist","Out","Bel",1460 18 | 
18796,19444,166.450970632155,"No","31",20.6,"Belgium","Presumed","Presumed",0.79641612742658,10045,"1992-01-01","NA",16,1460,30.3474320241692,646,547,"Corporatist","Out","Bel",1547 19 | 19444,19733,164.600892414477,"No","32",21,"Belgium","Presumed","Presumed",0.793257312840853,10085,"1993-01-01","NA",19,1547,30.4682779456193,659,574,"Corporatist","Out","Bel",1595 20 | 19733,20732,167.259786476868,"No","33",22.8,"Belgium","Presumed","Presumed",0.790826413602214,10116,"1994-01-01",6,17,1595,30.5619335347432,615,584,"Corporatist","Out","Bel",1632 21 | 20732,21679,142.941698727434,"No","34",19.6,"Belgium","Presumed","Presumed",0.789188122718753,10137,"1995-01-01",6,16,1632,30.6253776435045,586,549,"Corporatist","Out","Bel",1882 22 | 21679,22152,133.503987397854,"No","35",21.2,"Belgium","Presumed","Presumed",0.78763414394014,10157,"1996-01-01",6.4,19,1882,30.6858006042296,585,523,"Corporatist","Out","Bel",1981 23 | 22152,22936,133.975051566644,"No","36",22.5,"Belgium","Presumed","Presumed",0.785777428543365,10181,"1997-01-01",6,17,1981,30.7583081570997,574,523,"Corporatist","Out","Bel",1967 24 | 22936,23738,147.015583651867,"No","37",19.4,"Belgium","Presumed","Presumed",0.784083112809958,10203,"1998-01-01",6,15,1967,30.8247734138973,563,523,"Corporatist","Out","Bel",2041 25 | 23738,24521,136.61255622922,"No","38",25.2,"Belgium","Presumed","Presumed",0.782319577547428,10226,"1999-01-01",6.2,13,2041,30.8942598187311,552,523,"Corporatist","Out","Bel",2139 26 | 24521,25991,143.400643839625,"No","39",25.6,"Belgium","Presumed","Presumed",0.780411667154424,10251,"2000-01-01",6.2,11,2139,30.9697885196375,541,523,"Corporatist","Out","Bel",2288 27 | 25991,27113,144.454165451541,"No","40",22.2,"Belgium","Presumed","Presumed",0.777680567706814,10287,"2001-01-01",6.4,9,2288,31.0785498489426,530,523,"Corporatist","Out","Bel",2441 28 | 
27113,27652,145.359527726701,"No","41",21.7,"Belgium","Presumed","Presumed",0.774218523178167,10333,"2002-01-01",6.5,7,2441,31.2175226586103,519,523,"Corporatist","Out","Bel",2515 29 | 27652,"NA","NA","No","42","NA","Belgium","Presumed","Presumed","NA","NA","NA","NA","NA",2515,"NA","NA","NA","Corporatist","Out","Bel",2589 30 | -------------------------------------------------------------------------------- /sawzall-test/separate-test.rkt: -------------------------------------------------------------------------------- 1 | #lang racket/base 2 | (require rackunit 3 | sawzall 4 | "util.rkt") 5 | 6 | (define has-no-na 7 | (column-df [col #("a-b" "a-d" "b-c" "d-e")])) 8 | (define has-some-na 9 | (column-df [col #(#f "a-b" "a-d" "b-c" "d-e")])) 10 | 11 | (define extract-1 (extract has-no-na "col" #:into '("A"))) 12 | (define extract-1-result 13 | (column-df [A #("a" "a" "b" "d")])) 14 | 15 | (define extract-2 (extract has-no-na "col" 16 | #:into '("A" "B") 17 | #:regex #px"([[:alnum:]]+)-([[:alnum:]]+)")) 18 | (define extract-2-result 19 | (column-df [A #("a" "a" "b" "d")] 20 | [B #("b" "d" "c" "e")])) 21 | 22 | ;; NA values remain that way 23 | (define extract-3 (extract has-some-na "col" #:into '("A"))) 24 | (define extract-3-result 25 | (column-df [A #(#f "a" "a" "b" "d")])) 26 | 27 | (define extract-4 (extract has-some-na "col" 28 | #:into '("A" "B") 29 | #:regex #px"([[:alnum:]]+)-([[:alnum:]]+)")) 30 | (define extract-4-result 31 | (column-df [A #(#f "a" "a" "b" "d")] 32 | [B #(#f "b" "d" "c" "e")])) 33 | 34 | ;; non-matches become NA 35 | (define extract-5 (extract has-some-na "col" 36 | #:into '("A" "B") 37 | #:regex #px"([a-d]+)-([a-d]+)")) 38 | (define extract-5-result 39 | (column-df [A #(#f "a" "a" "b" #f)] 40 | [B #(#f "b" "d" "c" #f)])) 41 | 42 | ;; skip variables with #f 43 | (define extract-6 (extract has-some-na "col" 44 | #:into '("A" #f) 45 | #:regex #px"([a-d]+)-([a-d]+)")) 46 | (define extract-6-result 47 | (column-df [A #(#f "a" "a" "b" #f)])) 48 | 
49 | ;; don't remove original column 50 | (define extract-7 (extract has-some-na "col" 51 | #:into '("A" #f) 52 | #:regex #px"([a-d]+)-([a-d]+)" 53 | #:remove? #f)) 54 | (define extract-7-result 55 | (column-df [col #(#f "a-b" "a-d" "b-c" "d-e")] 56 | [A #(#f "a" "a" "b" #f)])) 57 | 58 | (module+ test 59 | (check data-frame~=? extract-1 extract-1-result) 60 | (check data-frame~=? extract-2 extract-2-result) 61 | (check data-frame~=? extract-3 extract-3-result) 62 | (check data-frame~=? extract-4 extract-4-result) 63 | (check data-frame~=? extract-5 extract-5-result) 64 | (check data-frame~=? extract-6 extract-6-result) 65 | (check data-frame~=? extract-7 extract-7-result)) 66 | -------------------------------------------------------------------------------- /sawzall-test/slice-test.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | (require data-frame 3 | racket/runtime-path 4 | rackunit 5 | sawzall 6 | threading 7 | "test-data.rkt" 8 | "util.rkt") 9 | 10 | ;; single variable 11 | (define slice-1 (slice woodland1 "site")) 12 | (define slice-1-result 13 | (column-df [site #("b" "a" "c")])) 14 | 15 | ;; multiple variables 16 | (define slice-2 (slice woodland1 ["site" "habitat"])) 17 | (define slice-2-result woodland1) 18 | 19 | (define slice-3 (slice ball2 (starting-with "g"))) 20 | (define slice-3-result 21 | (column-df [game #(1 1 1 2)] 22 | [goals #(0 1 2 3)])) 23 | 24 | (define slice-4 (slice ball2 (ending-with "t"))) 25 | (define slice-4-result 26 | (column-df [first #("sam" "bob" "dan" "bob")] 27 | [last #("son" "ert" "man" "ert")])) 28 | 29 | ;; union 30 | (define slice-5 (slice ball2 (or (starting-with "g") (ending-with "t")))) 31 | (define slice-5-result ball2) 32 | 33 | ;; intersection 34 | (define slice-6 (slice ball2 (and (starting-with "g") (ending-with "t")))) 35 | (define slice-6-result (make-data-frame)) 36 | 37 | ;; basic regex 38 | (define slice-7 (slice docs1 #px"r")) 39 | (define 
slice-7-result 40 | (column-df [grp #("a" "a" "b" "b" "b")] 41 | [trt #("a" "b" "a" "b" "b")])) 42 | 43 | ;; using a vector, and a list, and a set 44 | (define slice-8 (slice docs1 (all-in (vector "trt")))) 45 | (define slice-8a (slice docs1 (all-in (list "trt")))) 46 | (define slice-8b (slice docs1 (all-in (set "trt")))) 47 | (define slice-8c (slice docs1 (any-in (set "trt" "thisdoesntexist")))) 48 | (define slice-8-result (column-df [trt #("a" "b" "a" "b" "b")])) 49 | 50 | ;; slicing up iris, as in dplyr::select examples 51 | (define-runtime-path slice-iris-1-data "./results/slice_iris_1.csv") 52 | (define slice-iris-1 (slice iris (not ["Sepal.Length" "Petal.Length"]))) 53 | 54 | (define-runtime-path slice-iris-2-data "./results/slice_iris_2.csv") 55 | (define slice-iris-2 (slice iris (not (ending-with "Width")))) 56 | 57 | (define-runtime-path slice-iris-3-data "./results/slice_iris_3.csv") 58 | (define slice-iris-3 (slice iris (and (starting-with "Petal") (not (ending-with "Width"))))) 59 | 60 | (module+ test 61 | ;; cannot remove a group variable from a grouped data frame 62 | (check-exn exn? (thunk (~> woodland2 63 | (group-with "day") 64 | (slice (not "day")) 65 | ungroup))) 66 | ;; ensure that this doesn't work even though the grammar accepts it 67 | (check-exn exn? (thunk (slice woodland1 "not-in-woodland1"))) 68 | 69 | (check data-frame~=? slice-1 slice-1-result) 70 | (check data-frame~=? slice-2 slice-2-result) 71 | (check data-frame~=? slice-3 slice-3-result) 72 | (check data-frame~=? slice-4 slice-4-result) 73 | (check data-frame~=? slice-5 slice-5-result) 74 | (check data-frame~=? slice-6 slice-6-result) 75 | (check data-frame~=? slice-7 slice-7-result) 76 | (check data-frame~=? slice-8 slice-8-result) 77 | (check data-frame~=? slice-8a slice-8-result) 78 | (check data-frame~=? slice-8b slice-8-result) 79 | (check data-frame~=? slice-8c slice-8-result) 80 | 81 | (check-csv slice-iris-1 slice-iris-1-data) 82 | (check-equal?
(sort (df-series-names slice-iris-1) string-ci))) 29 | (define reorder-3-result 30 | (row-df [site day catch] 31 | "b" 2 24 32 | "b" 1 12 33 | "c" 2 20 34 | "c" 1 10)) 35 | 36 | ;; sorting with groups 37 | (define reorder-4 38 | (~> woodland2 39 | (group-with "site") 40 | (reorder "day") 41 | ungroup)) 42 | (define reorder-4-result 43 | (row-df [site day catch] 44 | "b" 1 12 45 | "b" 2 24 46 | "c" 1 10 47 | "c" 2 20)) 48 | 49 | ;; sorting gss-sm 50 | (define-runtime-path reorder-gss-1-data "./results/reorder_gss_1.csv") 51 | (define reorder-gss-1 (reorder gss-sm "bigregion")) 52 | 53 | (define-runtime-path reorder-gss-2-data "./results/reorder_gss_2.csv") 54 | (define reorder-gss-2 (reorder gss-sm "bigregion" "religion")) 55 | 56 | (define-runtime-path reorder-gss-3-data "./results/reorder_gss_3.csv") 57 | (define reorder-gss-3 58 | (~> gss-sm 59 | (group-with "bigregion" "religion") 60 | (reorder "obama") 61 | ungroup)) 62 | 63 | (module+ test 64 | (check data-frame~=? reorder-1 reorder-1-result) 65 | (check data-frame~=? reorder-2 reorder-2-result) 66 | (check data-frame~=? reorder-3 reorder-3-result) 67 | (check data-frame~=? reorder-4 reorder-4-result) 68 | 69 | (check df-sorted-by? reorder-gss-1 "bigregion") 70 | (check-csv reorder-gss-1 reorder-gss-1-data) 71 | (check df-sorted-by? reorder-gss-2 "bigregion") 72 | (check-csv reorder-gss-2 reorder-gss-2-data) 73 | (check-csv reorder-gss-3 reorder-gss-3-data)) 74 | -------------------------------------------------------------------------------- /sawzall-test/test-data.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | (require data-frame sawzall) 3 | (provide (all-defined-out)) 4 | 5 | (define woodland1 (make-data-frame)) 6 | (define woodland2 (make-data-frame)) 7 | 8 | (df-add-series! woodland1 (make-series "site" #:data (vector "b" "a" "c"))) 9 | (df-add-series! 
woodland1 (make-series "habitat" #:data (vector "grassland" "meadow" "woodland"))) 10 | 11 | (df-add-series! woodland2 (make-series "site" #:data (vector "c" "b" "c" "b"))) 12 | (df-add-series! woodland2 (make-series "day" #:data (vector 1 1 2 2))) 13 | (df-add-series! woodland2 (make-series "catch" #:data (vector 10 12 20 24))) 14 | 15 | (define ball1 (make-data-frame)) 16 | (define ball2 (make-data-frame)) 17 | 18 | (df-add-series! ball1 (make-series "first" #:data (vector "sam" "bob" "sam" "dan"))) 19 | (df-add-series! ball1 (make-series "last" #:data (vector "son" "ert" "jam" "man"))) 20 | (df-add-series! ball1 (make-series "age" #:data (vector 10 20 30 40))) 21 | 22 | (df-add-series! ball2 (make-series "first" #:data (vector "sam" "bob" "dan" "bob"))) 23 | (df-add-series! ball2 (make-series "last" #:data (vector "son" "ert" "man" "ert"))) 24 | (df-add-series! ball2 (make-series "game" #:data (vector 1 1 1 2))) 25 | (df-add-series! ball2 (make-series "goals" #:data (vector 0 1 2 3))) 26 | 27 | (define docs1 28 | (column-df [grp #("a" "a" "b" "b" "b")] 29 | [trt #("a" "b" "a" "b" "b")] 30 | [adult #(1 2 3 4 5)] 31 | [juv #(10 20 30 40 50)])) 32 | 33 | (define anscombe 34 | (row-df [x1 x2 x3 x4 y1 y2 y3 y4] 35 | 10 10 10 8 8.04 9.14 7.46 6.58 36 | 8 8 8 8 6.95 8.14 6.77 5.76 37 | 13 13 13 8 7.58 8.74 12.74 7.71 38 | 9 9 9 8 8.81 8.77 7.11 8.84 39 | 11 11 11 8 8.33 9.26 7.81 8.47 40 | 14 14 14 8 9.96 8.10 8.84 7.04 41 | 6 6 6 8 7.24 6.13 6.08 5.25 42 | 4 4 4 19 4.26 3.10 5.39 12.50 43 | 12 12 12 8 10.84 9.13 8.15 5.56 44 | 7 7 7 8 4.82 7.26 6.42 7.91 45 | 5 5 5 8 5.68 4.74 5.73 6.89)) 46 | 47 | (define gss-sm (df-read/csv "data/gss_sm.csv")) 48 | (define organdata (df-read/csv "data/organdata.csv")) 49 | (define iris (df-read/csv "data/iris.csv")) 50 | (define relig-income (df-read/csv "data/relig_income.csv")) 51 | (define billboard (df-read/csv "data/billboard.csv")) 52 | -------------------------------------------------------------------------------- 
/sawzall-test/util.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | (require data-frame 3 | rackunit 4 | sawzall) 5 | (provide data-frame~=? 6 | df-sorted-by? 7 | df-contains-only? 8 | df-does-not-contain? 9 | check-csv) 10 | 11 | ; checks if two data-frames are "equivalent". 12 | ; conditions, checked sequentially: 13 | ; - they have the same series names 14 | ; - they have the same row count 15 | ; - they have the same data in each series (by equal?) 16 | (define (data-frame~=? df1 df2) 17 | (and df1 df2 18 | (apply df-contains? df1 (df-series-names df2)) 19 | (apply df-contains? df2 (df-series-names df1)) 20 | 21 | (= (df-row-count df1) (df-row-count df2)) 22 | 23 | (for*/and ([name (in-list (df-series-names df1))] 24 | [(val-df1 val-df2) (in-parallel (in-data-frame df1 name) 25 | (in-data-frame df2 name))]) 26 | (equal? val-df1 val-df2)))) 27 | 28 | ; checks if a data-frame is sorted by the given column 29 | (define (df-sorted-by? df by #:cmp? [cmp? orderablestring name)))) 48 | (df-write/csv df data-file) 49 | (fail-check (format "csv not the same as df, new set written to ~a" data-file)))) 50 | 51 | (module+ test 52 | (require rackunit) 53 | 54 | (define df1 55 | (for*/data-frame (al bl) 56 | ([as (in-range 6)] 57 | [bs (in-range 8)]) 58 | (values as bs))) 59 | (define df2 60 | (for*/data-frame (bl al) 61 | ([as (in-range 6)] 62 | [bs (in-range 8)]) 63 | (values bs as))) 64 | (check data-frame~=? df1 df2) 65 | (check df-sorted-by? df1 "al") 66 | 67 | (define df3 68 | (row-df [a b c] 69 | 1 2 3 70 | 4 5 6 71 | 7 8 9)) 72 | (define df4 73 | (row-df [a b c] 74 | 9 9 9 75 | 8 8 8 76 | 7 7 7)) 77 | (check-true (df-sorted-by? df3 "a")) 78 | (check-false (df-sorted-by? df4 "a")) 79 | (check-true (df-sorted-by? df4 "a" #:cmp? 
>))) 80 | -------------------------------------------------------------------------------- /sawzall/info.rkt: -------------------------------------------------------------------------------- 1 | #lang info 2 | 3 | (define collection 'multi) 4 | (define deps '("sawzall-lib" 5 | "sawzall-doc")) 6 | (define implies '("sawzall-lib" 7 | "sawzall-doc")) 8 | 9 | (define pkg-desc "A grammar for data wrangling") 10 | (define version "1.0") 11 | --------------------------------------------------------------------------------