├── HADOOP-VERSION ├── VERSION ├── MODULES ├── cascalog-core ├── test │ └── cascalog │ │ ├── logic │ │ ├── zip_test.clj │ │ ├── def_test.clj │ │ ├── defops_test.clj │ │ └── vars_test.clj │ │ ├── cascading │ │ ├── io_test.clj │ │ ├── flow_test.clj │ │ ├── util_test.clj │ │ └── conf_test.clj │ │ ├── in_memory_api_test.clj │ │ ├── jcascalog_cascading_test.clj │ │ └── jcascalog_test.clj ├── src │ ├── java │ │ ├── jcascalog │ │ │ ├── PredicateMacro.java │ │ │ ├── op │ │ │ │ ├── GT.java │ │ │ │ ├── LT.java │ │ │ │ ├── Div.java │ │ │ │ ├── GTE.java │ │ │ │ ├── LTE.java │ │ │ │ ├── Plus.java │ │ │ │ ├── Avg.java │ │ │ │ ├── Equals.java │ │ │ │ ├── Max.java │ │ │ │ ├── Min.java │ │ │ │ ├── Minus.java │ │ │ │ ├── Sum.java │ │ │ │ ├── Count.java │ │ │ │ ├── Multiply.java │ │ │ │ ├── DistinctCount.java │ │ │ │ ├── Limit.java │ │ │ │ ├── LimitRank.java │ │ │ │ ├── FixedSample.java │ │ │ │ └── ReParse.java │ │ │ ├── Option.java │ │ │ ├── Fields.java │ │ │ ├── example │ │ │ │ ├── Split.java │ │ │ │ └── Examples.java │ │ │ ├── ClojureOp.java │ │ │ ├── Playground.java │ │ │ ├── Predicate.java │ │ │ ├── Subquery.java │ │ │ └── PredicateMacroTemplate.java │ │ ├── cascalog │ │ │ ├── aggregator │ │ │ │ ├── ClojureAggregateBy.java │ │ │ │ ├── ClojureMonoidFunctor.java │ │ │ │ ├── FoldFunctor.java │ │ │ │ ├── CombinerSpec.java │ │ │ │ ├── ClojureAggregator.java │ │ │ │ ├── ClojureParallelAggregator.java │ │ │ │ └── ClojureMonoidAggregator.java │ │ │ ├── CascalogFilter.java │ │ │ ├── test │ │ │ │ ├── DoubleOp.java │ │ │ │ ├── RangeOp.java │ │ │ │ ├── MultiplyAgg.java │ │ │ │ ├── OneBuffer.java │ │ │ │ ├── KeepEven.java │ │ │ │ ├── CountAgg.java │ │ │ │ └── SumAgg.java │ │ │ ├── ops │ │ │ │ ├── RandLong.java │ │ │ │ ├── IdentityBuffer.java │ │ │ │ └── KryoInsert.java │ │ │ ├── hadoop │ │ │ │ └── DefaultComparator.java │ │ │ ├── SimplePrintDirectedGraph.java │ │ │ ├── ParallelAgg.java │ │ │ ├── CascalogBuffer.java │ │ │ ├── CascalogAggregatorExecutor.java │ │ │ ├── CascalogFunction.java │ │ 
│ ├── kryo │ │ │ │ ├── KryoService.java │ │ │ │ └── ClojureKryoInstantiator.java │ │ │ ├── RegularTupleSeqConverter.java │ │ │ ├── TupleSeqConverter.java │ │ │ ├── FastFirst.java │ │ │ ├── ClojureFilter.java │ │ │ ├── FilterFunctionCall.java │ │ │ ├── CascalogAggregator.java │ │ │ ├── WriterOutputStream.java │ │ │ ├── ClojureMap.java │ │ │ ├── ClojureCombiner.java │ │ │ ├── CascalogBufferExecutor.java │ │ │ ├── CascalogFunctionExecutor.java │ │ │ ├── ClojureParallelAgg.java │ │ │ ├── ClojureBufferIter.java │ │ │ ├── ClojureMapcat.java │ │ │ ├── CascadingFilterToFunction.java │ │ │ ├── CascadingFunctionWrapper.java │ │ │ ├── ClojureBuffer.java │ │ │ ├── ClojureMultibuffer.java │ │ │ ├── ClojureCombinedAggregator.java │ │ │ ├── ClojureBufferCombiner.java │ │ │ └── ClojureCascadingBase.java │ │ └── cascading │ │ │ └── pipe │ │ │ └── joiner │ │ │ └── CascalogJoiner.java │ └── clj │ │ └── cascalog │ │ ├── in_memory │ │ ├── util.clj │ │ ├── testing.clj │ │ ├── tuple.clj │ │ └── join.clj │ │ ├── logic │ │ ├── algebra.clj │ │ ├── testing.clj │ │ ├── platform.clj │ │ ├── options.clj │ │ ├── ops_impl.clj │ │ └── zip.clj │ │ └── cascading │ │ ├── def.clj │ │ ├── testing.clj │ │ ├── types.clj │ │ └── io.clj ├── gendoc.sh ├── dev │ └── logback.xml ├── example.job-conf.clj └── project.clj ├── cascalog-more-taps ├── README.md ├── src │ └── java │ │ └── cascalog │ │ └── moreTaps │ │ ├── WholeFileInputFormat.java │ │ ├── WholeFileRecordReader.java │ │ └── WholeFile.java └── project.clj ├── .gitignore ├── cascalog-lzo ├── test │ └── cascalog │ │ └── lzo_test.clj ├── src │ └── cascalog │ │ ├── lzo │ │ └── impl.clj │ │ └── lzo.clj ├── project.clj └── README.md ├── cascalog-checkpoint ├── test │ └── cascalog │ │ └── checkpoint_test.clj ├── project.clj └── README.md ├── LICENSE.txt ├── cascalog-math ├── project.clj ├── src │ └── cascalog │ │ └── math │ │ ├── stats.clj │ │ └── contrib │ │ ├── def.clj │ │ ├── types.clj │ │ └── accumulators.clj └── test │ └── cascalog │ └── math │ └── 
stats_test.clj ├── midje-cascalog ├── project.clj ├── src │ └── midje │ │ ├── cascalog │ │ └── impl.clj │ │ └── cascalog.clj ├── README.md └── test │ └── midje │ └── cascalog_test.clj ├── project.clj ├── .travis.yml └── README.md /HADOOP-VERSION: -------------------------------------------------------------------------------- 1 | 1.2.1 -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | 3.0.1-SNAPSHOT -------------------------------------------------------------------------------- /MODULES: -------------------------------------------------------------------------------- 1 | cascalog-core 2 | cascalog-checkpoint 3 | cascalog-lzo 4 | cascalog-more-taps 5 | cascalog-math 6 | midje-cascalog -------------------------------------------------------------------------------- /cascalog-core/test/cascalog/logic/zip_test.clj: -------------------------------------------------------------------------------- 1 | (ns cascalog.logic.zip-test 2 | (:require [clojure.zip :as zip] 3 | [cascalog.logic.zip :refer :all])) 4 | -------------------------------------------------------------------------------- /cascalog-core/src/java/jcascalog/PredicateMacro.java: -------------------------------------------------------------------------------- 1 | package jcascalog; 2 | 3 | import java.util.List; 4 | 5 | public interface PredicateMacro { 6 | List getPredicates(Fields inFields, Fields outFields); 7 | } 8 | -------------------------------------------------------------------------------- /cascalog-core/src/java/jcascalog/op/GT.java: -------------------------------------------------------------------------------- 1 | package jcascalog.op; 2 | 3 | import jcascalog.ClojureOp; 4 | 5 | public class GT extends ClojureOp { 6 | public GT() { 7 | super("clojure.core", ">"); 8 | } 9 | } 10 | -------------------------------------------------------------------------------- 
/cascalog-core/src/java/jcascalog/op/LT.java: -------------------------------------------------------------------------------- 1 | package jcascalog.op; 2 | 3 | import jcascalog.ClojureOp; 4 | 5 | public class LT extends ClojureOp { 6 | public LT() { 7 | super("clojure.core", "<"); 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /cascalog-core/src/java/jcascalog/op/Div.java: -------------------------------------------------------------------------------- 1 | package jcascalog.op; 2 | 3 | import jcascalog.ClojureOp; 4 | 5 | public class Div extends ClojureOp { 6 | public Div() { 7 | super("cascalog.api", "div"); 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /cascalog-core/src/java/jcascalog/op/GTE.java: -------------------------------------------------------------------------------- 1 | package jcascalog.op; 2 | 3 | import jcascalog.ClojureOp; 4 | 5 | public class GTE extends ClojureOp { 6 | public GTE() { 7 | super("clojure.core", ">="); 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /cascalog-core/src/java/jcascalog/op/LTE.java: -------------------------------------------------------------------------------- 1 | package jcascalog.op; 2 | 3 | import jcascalog.ClojureOp; 4 | 5 | public class LTE extends ClojureOp { 6 | public LTE() { 7 | super("clojure.core", "<="); 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /cascalog-core/src/java/jcascalog/op/Plus.java: -------------------------------------------------------------------------------- 1 | package jcascalog.op; 2 | 3 | import jcascalog.ClojureOp; 4 | 5 | public class Plus extends ClojureOp { 6 | public Plus() { 7 | super("clojure.core", "+"); 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /cascalog-core/src/java/jcascalog/op/Avg.java: 
-------------------------------------------------------------------------------- 1 | package jcascalog.op; 2 | 3 | import jcascalog.ClojureOp; 4 | 5 | public class Avg extends ClojureOp { 6 | public Avg() { 7 | super("cascalog.logic.ops", "avg"); 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /cascalog-core/src/java/jcascalog/op/Equals.java: -------------------------------------------------------------------------------- 1 | package jcascalog.op; 2 | 3 | import jcascalog.ClojureOp; 4 | 5 | public class Equals extends ClojureOp { 6 | public Equals() { 7 | super("clojure.core", "="); 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /cascalog-core/src/java/jcascalog/op/Max.java: -------------------------------------------------------------------------------- 1 | package jcascalog.op; 2 | 3 | import jcascalog.ClojureOp; 4 | 5 | public class Max extends ClojureOp { 6 | public Max() { 7 | super("cascalog.logic.ops", "max"); 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /cascalog-core/src/java/jcascalog/op/Min.java: -------------------------------------------------------------------------------- 1 | package jcascalog.op; 2 | 3 | import jcascalog.ClojureOp; 4 | 5 | public class Min extends ClojureOp { 6 | public Min() { 7 | super("cascalog.logic.ops", "min"); 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /cascalog-core/src/java/jcascalog/op/Minus.java: -------------------------------------------------------------------------------- 1 | package jcascalog.op; 2 | 3 | import jcascalog.ClojureOp; 4 | 5 | public class Minus extends ClojureOp { 6 | public Minus() { 7 | super("clojure.core", "-"); 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /cascalog-core/src/java/jcascalog/op/Sum.java: 
-------------------------------------------------------------------------------- 1 | package jcascalog.op; 2 | 3 | import jcascalog.ClojureOp; 4 | 5 | public class Sum extends ClojureOp { 6 | public Sum() { 7 | super("cascalog.logic.ops", "sum"); 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /cascalog-core/src/java/jcascalog/op/Count.java: -------------------------------------------------------------------------------- 1 | package jcascalog.op; 2 | 3 | import jcascalog.ClojureOp; 4 | 5 | public class Count extends ClojureOp { 6 | public Count() { 7 | super("cascalog.logic.ops", "count"); 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /cascalog-core/src/java/jcascalog/op/Multiply.java: -------------------------------------------------------------------------------- 1 | package jcascalog.op; 2 | 3 | import jcascalog.ClojureOp; 4 | 5 | public class Multiply extends ClojureOp { 6 | public Multiply() { 7 | super("clojure.core", "*"); 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /cascalog-core/src/java/jcascalog/op/DistinctCount.java: -------------------------------------------------------------------------------- 1 | package jcascalog.op; 2 | 3 | import jcascalog.ClojureOp; 4 | 5 | public class DistinctCount extends ClojureOp { 6 | public DistinctCount() { 7 | super("cascalog.logic.ops", "distinct-count"); 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /cascalog-core/src/clj/cascalog/in_memory/util.clj: -------------------------------------------------------------------------------- 1 | (ns cascalog.in-memory.util) 2 | 3 | (defn smallest-arity 4 | "Returns the smallest number of arguments the function takes. 5 | Reads the first entry of the :arglists metadata attached to fun, so a 6 | multi-arity function must list its smallest arity first. Yields 0 when 7 | fun carries no :arglists metadata." 8 | [fun] 9 | (->> fun meta :arglists first count)) 10 | 11 | (defn system-println [s] 12 | (.println System/out s)) 13 | 
-------------------------------------------------------------------------------- /cascalog-more-taps/README.md: -------------------------------------------------------------------------------- 1 | # Cascalog-More-Taps 2 | 3 | Additional taps for Cascalog. 4 | 5 | ## Available Taps 6 | 7 | - delimited files 8 | - writable sequence files 9 | 10 | ## Usage 11 | 12 | Add the following to project.clj: 13 | 14 | ```clj 15 | [cascalog-more-taps "0.3.0"] 16 | ``` -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | _deps 2 | build 3 | target 4 | out 5 | lib 6 | pom.xml 7 | *.class 8 | *.jar 9 | *~ 10 | .DS_Store 11 | *.#* 12 | *#* 13 | *.classpath 14 | *.project 15 | *.settings 16 | *.dot 17 | .lein-failures 18 | .lein-deps-sum 19 | doc 20 | *.ipr 21 | *.iws 22 | *.iml 23 | checkouts 24 | .nrepl-port -------------------------------------------------------------------------------- /cascalog-core/src/java/jcascalog/op/Limit.java: -------------------------------------------------------------------------------- 1 | package jcascalog.op; 2 | 3 | import java.util.Arrays; 4 | 5 | import jcascalog.ClojureOp; 6 | 7 | public class Limit extends ClojureOp { 8 | public Limit(int amt) { 9 | super("cascalog.logic.ops", "limit", Arrays.asList((Object) amt)); 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /cascalog-core/src/java/jcascalog/op/LimitRank.java: -------------------------------------------------------------------------------- 1 | package jcascalog.op; 2 | 3 | import java.util.Arrays; 4 | 5 | import jcascalog.ClojureOp; 6 | 7 | public class LimitRank extends ClojureOp { 8 | public LimitRank(int amt) { 9 | super("cascalog.logic.ops", "limit-rank", Arrays.asList((Object) amt)); 10 | } 11 | } 12 | -------------------------------------------------------------------------------- 
/cascalog-core/src/java/jcascalog/op/FixedSample.java: -------------------------------------------------------------------------------- 1 | package jcascalog.op; 2 | 3 | import java.util.Arrays; 4 | 5 | import jcascalog.ClojureOp; 6 | 7 | public class FixedSample extends ClojureOp { 8 | public FixedSample(int amt) { 9 | super("cascalog.logic.ops", "fixed-sample-agg", Arrays.asList((Object) amt)); 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /cascalog-core/gendoc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Make sure you've installed codox! 4 | # 5 | # lein plugin install codox "0.3.2" 6 | # Abort if a step fails so git never runs outside doc/ or on the wrong branch. 7 | lein doc || exit 1 8 | cd doc || exit 1 9 | git checkout gh-pages || exit 1 10 | git add . 11 | git commit -am "new documentation push." 12 | echo "Pushing to gh-pages branch on github..." 13 | git push -u origin gh-pages 14 | cd .. 15 | 16 | -------------------------------------------------------------------------------- /cascalog-core/src/java/jcascalog/op/ReParse.java: -------------------------------------------------------------------------------- 1 | package jcascalog.op; 2 | 3 | import java.util.Arrays; 4 | import java.util.regex.Pattern; 5 | 6 | import jcascalog.ClojureOp; 7 | 8 | public class ReParse extends ClojureOp { 9 | public ReParse(Pattern pattern) { 10 | super("cascalog.logic.ops", "re-parse", Arrays.asList((Object) pattern)); 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /cascalog-core/src/java/jcascalog/Option.java: -------------------------------------------------------------------------------- 1 | package jcascalog; 2 | 3 | import clojure.lang.Keyword; 4 | 5 | public class Option { 6 | public static Object SORT = Keyword.intern("sort"); 7 | public static Object REVERSE = Keyword.intern("reverse"); 8 | public static Object TRAP = Keyword.intern("trap"); 9 | public static Object DISTINCT = 
Keyword.intern("distinct"); 10 | } 11 | -------------------------------------------------------------------------------- /cascalog-core/src/java/jcascalog/Fields.java: -------------------------------------------------------------------------------- 1 | package jcascalog; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | public class Fields extends ArrayList { 7 | public Fields(Object... fields) { 8 | for (Object field : fields) { 9 | add(field); 10 | } 11 | } 12 | 13 | public Fields(List fields) { 14 | for (Object field : fields) { 15 | add(field); 16 | } 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /cascalog-core/dev/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /cascalog-core/src/java/cascalog/aggregator/ClojureAggregateBy.java: -------------------------------------------------------------------------------- 1 | package cascalog.aggregator; 2 | 3 | import cascading.operation.Aggregator; 4 | import cascading.pipe.assembly.AggregateBy; 5 | import cascading.tuple.Fields; 6 | 7 | public class ClojureAggregateBy extends AggregateBy { 8 | public ClojureAggregateBy(Fields argumentFields, Functor functor, Aggregator aggregator) { 9 | super(argumentFields, functor, aggregator); 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /cascalog-lzo/test/cascalog/lzo_test.clj: -------------------------------------------------------------------------------- 1 | (ns cascalog.lzo-test 2 | (:use [cascalog lzo api] 3 | [midje sweet cascalog]) 4 | (:require [cascalog.cascading.io :as io])) 5 | 6 | (fact "Test round tripping." 7 | (io/with-fs-tmp [_ tmp] 8 | "Set up the job..." 
9 | (?- (hfs-lzo-textline tmp) [["a line of text!"]]) 10 | (with-job-conf lzo-settings 11 | "The same line of text should come back out." 12 | (fact 13 | (hfs-lzo-textline tmp) => (produces [["a line of text!"]]))))) 14 | -------------------------------------------------------------------------------- /cascalog-core/src/java/cascalog/CascalogFilter.java: -------------------------------------------------------------------------------- 1 | package cascalog; 2 | 3 | import cascading.flow.FlowProcess; 4 | import cascading.operation.BaseOperation; 5 | import cascading.operation.Filter; 6 | import cascading.operation.FilterCall; 7 | 8 | public abstract class CascalogFilter extends BaseOperation implements Filter { 9 | 10 | public boolean isRemove(FlowProcess process, FilterCall filterCall) { 11 | return !isKeep(process, filterCall); 12 | } 13 | 14 | public abstract boolean isKeep(FlowProcess process, FilterCall filterCall); 15 | } 16 | -------------------------------------------------------------------------------- /cascalog-core/src/java/jcascalog/example/Split.java: -------------------------------------------------------------------------------- 1 | package jcascalog.example; 2 | 3 | import cascading.flow.FlowProcess; 4 | import cascading.operation.FunctionCall; 5 | import cascading.tuple.Tuple; 6 | import cascalog.CascalogFunction; 7 | 8 | public class Split extends CascalogFunction { 9 | public void operate(FlowProcess flowProcess, FunctionCall fnCall) { 10 | String sentence = fnCall.getArguments().getString(0); 11 | for (String word : sentence.split(" ")) { 12 | fnCall.getOutputCollector().add(new Tuple(word)); 13 | } 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /cascalog-checkpoint/test/cascalog/checkpoint_test.clj: -------------------------------------------------------------------------------- 1 | (ns cascalog.checkpoint-test 2 | (:use cascalog.checkpoint 3 | clojure.test)) 4 | 5 | (def sprint identity) 6 | 7 
| (deftest workflow-test 8 | (workflow ["/tmp/lalala"] 9 | aaa ([] (sprint "aaa") 10 | (sprint "aaa") 11 | (sprint "aaa") 12 | (sprint "aaa")) 13 | bbb ([:deps nil] 14 | (sprint "bbb")) 15 | ccc ([:tmp-dirs ccc-path] 16 | (sprint "ccc")) 17 | ddd ([:deps :all] 18 | (sprint "ddd")))) 19 | -------------------------------------------------------------------------------- /cascalog-core/src/java/cascalog/test/DoubleOp.java: -------------------------------------------------------------------------------- 1 | package cascalog.test; 2 | 3 | import cascading.flow.FlowProcess; 4 | import cascading.operation.FunctionCall; 5 | import cascading.tuple.Tuple; 6 | import cascading.tuple.TupleEntry; 7 | import cascalog.CascalogFunction; 8 | import clojure.lang.Numbers; 9 | 10 | 11 | public class DoubleOp extends CascalogFunction { 12 | 13 | @Override 14 | public void operate(FlowProcess flowProcess, FunctionCall fnCall) { 15 | TupleEntry args = fnCall.getArguments(); 16 | Number n = (Number) args.get(0); 17 | fnCall.getOutputCollector().add(new Tuple(Numbers.multiply(n, 2))); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /cascalog-core/test/cascalog/cascading/io_test.clj: -------------------------------------------------------------------------------- 1 | (ns cascalog.cascading.io-test 2 | (:use cascalog.cascading.io 3 | midje.sweet 4 | clojure.test) 5 | (:require [cascalog.api :as api])) 6 | 7 | (deftest configurable-with-fs-tmp 8 | (is (.startsWith (with-fs-tmp [_ foo] foo) 9 | "/tmp/cascalog_reserved/")) 10 | (is (.startsWith (api/with-job-conf 11 | {tmp-dir-property 12 | ;; deliberately using lein's build directory 13 | "target/bar"} 14 | (with-fs-tmp [_ foo] foo)) 15 | "target/bar/cascalog_reserved/"))) 16 | -------------------------------------------------------------------------------- /cascalog-core/src/java/cascalog/test/RangeOp.java: -------------------------------------------------------------------------------- 1 | 
package cascalog.test; 2 | 3 | import cascading.flow.FlowProcess; 4 | import cascading.operation.FunctionCall; 5 | import cascading.tuple.Tuple; 6 | import cascading.tuple.TupleEntry; 7 | import cascalog.CascalogFunction; 8 | 9 | 10 | public class RangeOp extends CascalogFunction { 11 | 12 | @Override 13 | public void operate(FlowProcess flowProcess, FunctionCall fnCall) { 14 | TupleEntry args = fnCall.getArguments(); 15 | Number n = (Number) args.get(0); 16 | for (int i = 1; i <= n.intValue(); i++) { 17 | fnCall.getOutputCollector().add(new Tuple(i)); 18 | } 19 | } 20 | 21 | } 22 | -------------------------------------------------------------------------------- /cascalog-core/src/java/cascalog/test/MultiplyAgg.java: -------------------------------------------------------------------------------- 1 | package cascalog.test; 2 | 3 | import java.util.Arrays; 4 | import java.util.List; 5 | 6 | import cascading.flow.FlowProcess; 7 | import cascalog.ParallelAgg; 8 | import clojure.lang.Numbers; 9 | 10 | public class MultiplyAgg implements ParallelAgg { 11 | 12 | @Override 13 | public void prepare(FlowProcess flowProcess) { } 14 | 15 | @Override 16 | public List init(List input) { 17 | return input; 18 | } 19 | 20 | @Override 21 | public List combine(List val1, List val2) { 22 | Number n1 = (Number) val1.get(0); 23 | Number n2 = (Number) val2.get(0); 24 | return Arrays.asList((Object) Numbers.multiply(n1, n2)); 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /cascalog-core/test/cascalog/logic/def_test.clj: -------------------------------------------------------------------------------- 1 | (ns cascalog.logic.def-test 2 | (:use midje.sweet 3 | cascalog.logic.def)) 4 | 5 | (defn square [x] 6 | (* x x)) 7 | 8 | (defn sum [& xs] 9 | (reduce + xs)) 10 | 11 | (defmapfn plus-two [x] 12 | (+ 2 x)) 13 | 14 | (defn times 15 | [y] 16 | (mapfn [x] (* x y))) 17 | 18 | (facts 19 | "Normal squaring function works." 
20 | (square 10) => 100 21 | 22 | "And still works as a mapop. The behavior is unchanged." 23 | ((mapop square) 3) => 9 24 | 25 | "anonymous mapops work as functions" 26 | ((mapfn [x] (* x 5)) 4) => 20 27 | 28 | "operations defined with def*fn work as normal functions." 29 | (plus-two 2) => 4 30 | 31 | "Higher order mapfns work normally" 32 | ((times 2) 4) => 8) 33 | -------------------------------------------------------------------------------- /cascalog-core/src/java/cascalog/ops/RandLong.java: -------------------------------------------------------------------------------- 1 | package cascalog.ops; 2 | 3 | import java.util.Random; 4 | 5 | import cascading.flow.FlowProcess; 6 | import cascading.operation.FunctionCall; 7 | import cascading.operation.OperationCall; 8 | import cascading.tuple.Tuple; 9 | import cascalog.CascalogFunction; 10 | 11 | public class RandLong extends CascalogFunction { 12 | long seed; 13 | Random rand; 14 | 15 | public RandLong() { 16 | this.seed = new Random().nextLong(); 17 | } 18 | 19 | public void prepare(FlowProcess flowProcess, OperationCall operationCall) { 20 | this.rand = new Random(seed + flowProcess.getCurrentSliceNum()); 21 | } 22 | 23 | public void operate(FlowProcess flow_process, FunctionCall fn_call) { 24 | fn_call.getOutputCollector().add(new Tuple(rand.nextLong())); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /cascalog-lzo/src/cascalog/lzo/impl.clj: -------------------------------------------------------------------------------- 1 | (ns cascalog.lzo.impl 2 | (:require [cascalog.cascading.util :as util]) 3 | (:import [com.twitter.elephantbird.cascading2.scheme 4 | LzoTextLine LzoTextDelimited LzoThriftScheme 5 | LzoProtobufScheme])) 6 | 7 | (defn text-line 8 | ([] (LzoTextLine.)) 9 | ([field-names] 10 | (text-line field-names field-names)) 11 | ([source-fields sink-fields] 12 | (LzoTextLine. 
(util/fields source-fields) 13 | (util/fields sink-fields)))) 14 | 15 | (defn delimited [field-names klasses] 16 | (let [klasses (when klasses (into-array klasses))] 17 | (-> (util/fields field-names) 18 | (LzoTextDelimited. "\t")))) 19 | 20 | (defn thrift-b64-line [klass] 21 | (LzoThriftScheme. klass)) 22 | 23 | (defn proto-b64-line [klass] 24 | (LzoProtobufScheme. klass)) 25 | -------------------------------------------------------------------------------- /cascalog-core/src/java/jcascalog/ClojureOp.java: -------------------------------------------------------------------------------- 1 | package jcascalog; 2 | 3 | import java.util.Collections; 4 | import java.util.List; 5 | 6 | import cascalog.Util; 7 | import clojure.lang.IFn; 8 | import clojure.lang.Var; 9 | 10 | public class ClojureOp { 11 | String _namespace; 12 | String _name; 13 | final List hofArgs; 14 | 15 | public ClojureOp(String namespace, String name) { 16 | this(namespace, name, Collections.emptyList()); 17 | } 18 | 19 | public ClojureOp(String namespace, String name, List hofArgs) { 20 | _namespace = namespace; 21 | _name = name; 22 | this.hofArgs = hofArgs; 23 | } 24 | 25 | public Object toVar() { 26 | Var ret = Util.getVar(_namespace, _name); 27 | if (!hofArgs.isEmpty()) 28 | return ret.applyTo(Util.coerceToSeq(hofArgs)); 29 | else 30 | return ret.deref(); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | License: 2 | 3 | Copyright (c) 2010-2014 Nathan Marz. All Rights Reserved. 4 | 5 | Project and contact information: http://www.cascalog.org/ 6 | 7 | This file is part of the Cascalog project. 8 | 9 | Licensed under the Apache License, Version 2.0 (the "License"); 10 | you may not use this file except in compliance with the License. 
11 | You may obtain a copy of the License at 12 | 13 | http://www.apache.org/licenses/LICENSE-2.0 14 | 15 | Unless required by applicable law or agreed to in writing, software 16 | distributed under the License is distributed on an "AS IS" BASIS, 17 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | See the License for the specific language governing permissions and 19 | limitations under the License. 20 | 21 | Third-party Licenses: 22 | 23 | All third-party dependencies are listed in project.clj. 24 | -------------------------------------------------------------------------------- /cascalog-core/test/cascalog/in_memory_api_test.clj: -------------------------------------------------------------------------------- 1 | (ns cascalog.in-memory-api-test 2 | (:use clojure.test 3 | [midje sweet cascalog] 4 | cascalog.logic.testing 5 | cascalog.in-memory.testing 6 | cascalog.api) 7 | (:require [cascalog.in-memory.platform :as p] 8 | [cascalog.in-memory.tuple :as t])) 9 | 10 | (use-fixtures :once 11 | (fn [f] 12 | (set-in-memory-platform!) 13 | (f))) 14 | 15 | (deftest test-atom-sink 16 | (let [results (atom [])] 17 | (?<- results [?n] ([[1] [2] [3]] ?n)) 18 | (is (= [{"?n" 1} {"?n" 2} {"?n" 3}] 19 | @results)))) 20 | 21 | (deftest test-fn-sink 22 | (let [results (atom [])] 23 | (letfn [(reset-atom [tuples fields] 24 | (reset! results tuples))] 25 | (?<- reset-atom [?n] ([[1] [2] [3]] ?n)) 26 | (is (= [{"?n" 1} {"?n" 2} {"?n" 3}] 27 | @results))))) 28 | 29 | -------------------------------------------------------------------------------- /cascalog-core/src/java/jcascalog/Playground.java: -------------------------------------------------------------------------------- 1 | package jcascalog; 2 | 3 | import java.util.List; 4 | 5 | import cascalog.Util; 6 | import clojure.lang.Var; 7 | 8 | /** 9 | * Some in-memory datasets to use to play around with Cascalog. 
You can see the contents of the 10 | * datasets in the src/clj/cascalog/playground.clj file. 11 | */ 12 | public class Playground { 13 | public static List AGE = getDataset("age"); 14 | public static List GENDER = getDataset("gender"); 15 | public static List FOLLOWS = getDataset("follows"); 16 | public static List FULL_NAMES = getDataset("full-names"); 17 | public static List LOCATION = getDataset("location"); 18 | public static List INTEGER = getDataset("integer"); 19 | public static List SENTENCE = getDataset("sentence"); 20 | 21 | 22 | private static List getDataset(String name) { 23 | Var v = Util.getVar("cascalog.playground", name); 24 | return (List) v.deref(); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /cascalog-core/src/java/jcascalog/Predicate.java: -------------------------------------------------------------------------------- 1 | package jcascalog; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import clojure.lang.Keyword; 7 | 8 | public class Predicate { 9 | List _initialFields; 10 | List _outFields = null; 11 | Object _op; 12 | 13 | public Predicate(Object op, List initialFields) { 14 | _op = op; 15 | _initialFields = initialFields; 16 | } 17 | 18 | public Predicate(Object op, List inFields, List outFields) { 19 | _op = op; 20 | _initialFields = inFields; 21 | _outFields = outFields; 22 | } 23 | 24 | public List toRawCascalogPredicate() { 25 | List pred = new ArrayList(); 26 | pred.add(_op); 27 | pred.addAll(_initialFields); 28 | if (_outFields != null) { 29 | pred.add(Keyword.intern(">")); 30 | pred.addAll(_outFields); 31 | } 32 | return pred; 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /cascalog-core/src/clj/cascalog/in_memory/testing.clj: -------------------------------------------------------------------------------- 1 | (ns cascalog.in-memory.testing 2 | (:require [cascalog.api :refer :all] 3 | 
[cascalog.logic.testing :refer (ITestable)] 4 | [cascalog.in-memory.tuple :refer (map-select-values)] 5 | [jackknife.seq :refer (unweave)]) 6 | (:import [cascalog.in_memory.platform InMemoryPlatform])) 7 | 8 | (extend-protocol ITestable 9 | InMemoryPlatform 10 | (process?- [_ [ll :as bindings]] 11 | (let [bindings (if (keyword? ll) 12 | (rest bindings) 13 | bindings) 14 | [specs rules] (unweave bindings) 15 | out-tuples (map 16 | #(let [results (atom []) 17 | fields (get-out-fields %)] 18 | (?- results %) 19 | (map-select-values fields @results)) 20 | rules)] 21 | [specs out-tuples]))) 22 | -------------------------------------------------------------------------------- /cascalog-core/src/java/cascalog/hadoop/DefaultComparator.java: -------------------------------------------------------------------------------- 1 | package cascalog.hadoop; 2 | 3 | import java.io.Serializable; 4 | import java.util.Comparator; 5 | 6 | import cascading.tuple.Hasher; 7 | import clojure.lang.Util; 8 | 9 | /** User: sritchie Date: 12/12/11 Time: 3:23 PM */ 10 | public class DefaultComparator implements Comparator, Hasher, Serializable { 11 | 12 | public int compare(Object o1, Object o2) { 13 | return Util.compare(o1, o2); 14 | } 15 | 16 | private int numericHash(Number x) { 17 | Class xc = x.getClass(); 18 | 19 | if (xc == Long.class || xc == Integer.class || xc == Short.class || xc == Byte.class) { 20 | long lpart = x.longValue(); 21 | return (int) (lpart ^ (lpart >>> 32)); 22 | } 23 | return x.hashCode(); 24 | } 25 | 26 | public int hashCode(Object o) { 27 | if (o instanceof Number) { return numericHash((Number) o); } 28 | 29 | return o.hashCode(); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /cascalog-core/src/clj/cascalog/logic/algebra.clj: -------------------------------------------------------------------------------- 1 | (ns cascalog.logic.algebra) 2 | 3 | (defprotocol Semigroup 4 | "First step toward an abstract algebra 
(defn sum
  "Combines `items` left to right with `plus`. Returns nil when `items`
  is empty."
  [items]
  ;; Seed the fold with nil: nil's Semigroup implementation returns its right
  ;; operand, so results for non-empty input are unchanged, while an empty
  ;; input no longer makes reduce call (plus) with zero arguments and throw.
  (reduce plus nil items))
7 | :license {:name "Eclipse Public License" 8 | :url "http://www.eclipse.org/legal/epl-v10.html"} 9 | :repositories {"conjars.org" "http://conjars.org/repo"} 10 | :dependencies [[jackknife "0.1.7"] 11 | [hadoop-util "0.3.0"]] 12 | :profiles {:1.3 {:dependencies [[org.clojure/clojure "1.3.0"]]} 13 | :1.4 {:dependencies [[org.clojure/clojure "1.4.0"]]} 14 | :dev {:plugins [[lein-midje "3.1.3"]]} 15 | :provided {:dependencies [[cascalog/cascalog-core ~VERSION] 16 | [org.apache.hadoop/hadoop-core ~HADOOP-VERSION]]}}) 17 | -------------------------------------------------------------------------------- /cascalog-math/project.clj: -------------------------------------------------------------------------------- 1 | (def ROOT-DIR (subs *file* 0 (- (count *file*) (count "project.clj")))) 2 | (def HADOOP-VERSION (-> ROOT-DIR (str "/../HADOOP-VERSION") slurp)) 3 | (def VERSION (-> ROOT-DIR (str "/../VERSION") slurp)) 4 | 5 | (defproject cascalog/cascalog-math VERSION 6 | :description "Math modules for Cascalog." 
/**
 * File input format that refuses to split its input files and reads each
 * split through a WholeFileRecordReader.
 *
 * NOTE(review): Text and BytesWritable are imported by this file but unused in
 * the visible code; the type parameters on FileInputFormat / RecordReader were
 * presumably lost in extraction -- confirm against the real source.
 */
public class WholeFileInputFormat extends FileInputFormat {
  /** Always false, so a file is never divided into several splits. */
  @Override
  protected boolean isSplitable(FileSystem fs, Path filename) {
    return false;
  }

  /**
   * Creates the reader for one split. The split is cast to FileSplit; the
   * reporter argument is not used.
   */
  @Override
  public RecordReader getRecordReader(
      InputSplit split, JobConf job, Reporter reporter) throws IOException {
    return new WholeFileRecordReader((FileSplit) split, job);
  }
}
"project.clj")))) 2 | (def HADOOP-VERSION (-> ROOT-DIR (str "/../HADOOP-VERSION") slurp)) 3 | (def VERSION (-> ROOT-DIR (str "/../VERSION") slurp)) 4 | 5 | (defproject cascalog/midje-cascalog VERSION 6 | :description "Cascalog functions for Midje." 7 | :license {:name "Eclipse Public License" 8 | :url "http://www.eclipse.org/legal/epl-v10.html"} 9 | :repositories {"conjars.org" "http://conjars.org/repo"} 10 | :dependencies [[midje "1.7.0" :exclusions [org.clojure/clojure]]] 11 | :profiles {:1.3 {:dependencies [[org.clojure/clojure "1.3.0"]]} 12 | :1.4 {:dependencies [[org.clojure/clojure "1.4.0"]]} 13 | :1.5 {:dependencies [[org.clojure/clojure "1.5.1"]]} 14 | :1.6 {:dependencies [[org.clojure/clojure "1.6.0"]]} 15 | :1.7 {:dependencies [[org.clojure/clojure "1.7.0"]]} 16 | :dev {:plugins [[lein-midje "3.1.3"]]} 17 | :provided {:dependencies [[cascalog/cascalog-core ~VERSION] 18 | [org.apache.hadoop/hadoop-core ~HADOOP-VERSION]]}}) 19 | -------------------------------------------------------------------------------- /cascalog-core/src/java/cascalog/SimplePrintDirectedGraph.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2010 Nathan Marz 3 | 4 | Project and contact information: http://www.cascalog.org/ 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
/**
 * DefaultDirectedGraph whose toString() is a fixed short label instead of a
 * dump of the graph contents.
 *
 * NOTE(review): the superclass and EdgeFactory appear without type parameters
 * here; they may have been stripped in extraction -- confirm.
 */
public class SimplePrintDirectedGraph extends org.jgrapht.graph.DefaultDirectedGraph {
  /** Passes the edge factory straight through to the superclass. */
  public SimplePrintDirectedGraph(org.jgrapht.EdgeFactory ef) {
    super(ef);
  }

  /** Returns the constant string "Graph". */
  public String toString() {
    return "Graph"; //this avoids infinite printing issue in Clojure node -> graph -> node -> ...
  }
}
/**
 * Serializable contract for an aggregator expressed as an init step plus a
 * pairwise combine step.
 */
public interface ParallelAgg extends Serializable {
  /** Called with the current FlowProcess before the aggregator is used. */
  void prepare(FlowProcess flowProcess);

  /** Turns one input value list into an aggregation value list. */
  List init(List input);

  /**
   * Merges two aggregation value lists into one.
   * NOTE(review): presumably expected to be associative and commutative so
   * partial results can be merged in any order -- confirm with callers.
   */
  List combine(List val1, List val2);
}
License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | package cascalog.test; 20 | 21 | import cascading.flow.FlowProcess; 22 | import cascading.operation.BufferCall; 23 | import cascading.tuple.Tuple; 24 | import cascalog.CascalogBuffer; 25 | 26 | public class OneBuffer extends CascalogBuffer { 27 | 28 | public void operate(FlowProcess flowProcess, BufferCall bufCall) { 29 | bufCall.getOutputCollector().add(new Tuple(1)); 30 | } 31 | } 32 | 33 | -------------------------------------------------------------------------------- /cascalog-core/src/java/cascalog/test/KeepEven.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2010 Nathan Marz 3 | 4 | Project and contact information: http://www.cascalog.org/ 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | */ 18 | 19 | package cascalog.test; 20 | 21 | import cascading.flow.FlowProcess; 22 | import cascading.operation.BaseOperation; 23 | import cascading.operation.Filter; 24 | import cascading.operation.FilterCall; 25 | 26 | public class KeepEven extends BaseOperation implements Filter { 27 | 28 | public boolean isRemove(FlowProcess process, FilterCall call) { 29 | return call.getArguments().getInteger(0) % 2 == 1; 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /cascalog-core/src/clj/cascalog/cascading/def.clj: -------------------------------------------------------------------------------- 1 | (ns cascalog.cascading.def 2 | (:require [cascalog.logic.fn :as s] 3 | [cascalog.logic.def :as d] 4 | [jackknife.meta :refer (meta-update)])) 5 | 6 | (defn prepared 7 | "Marks the supplied operation as needing to be prepared by 8 | Cascading. The supplied op should take two arguments and return 9 | another IFn for use by Cascading." 10 | [afn] 11 | (meta-update afn #(merge % {::prepared true}))) 12 | 13 | ;; TODO: This runs into trouble if you want to return a map to use as 14 | ;; a function. Make an interface that we can reify to make a prepared 15 | ;; operation if we want a cleanup. 16 | 17 | (defmacro prepfn 18 | "Defines a prepared operation. Pass in an argument vector of two 19 | items and return either a function or a Map with two 20 | keywords; :operate and :cleanup" 21 | [args & body] {:pre [(= 2 (count args))]} 22 | `(prepared (s/fn ~args ~@body))) 23 | 24 | (defn prepared? 25 | "Returns true if the supplied operation needs to be supplied the 26 | FlowProcess and operation call by Cascading on instantiation, false 27 | otherwise." 28 | [op] 29 | (= true (-> op meta ::prepared))) 30 | 31 | (d/defdefop defprepfn 32 | "Defines a prepared operation." 
;; Version string for the umbrella project and every module dependency,
;; read from the VERSION file (path is relative to the working directory).
(def VERSION (slurp "VERSION"))
;; Module names: the MODULES file split on newlines.
(def MODULES (-> "MODULES" slurp (.split "\n")))
;; One [cascalog/<module> VERSION] dependency vector per module.
(def DEPENDENCIES (for [m MODULES] [(symbol (str "cascalog/" m)) VERSION]))

;; The defproject form is built with syntax-quote and passed to eval so the
;; values computed above can be unquoted into it.
(eval `(defproject cascalog/cascalog ~VERSION
         :description "Hadoop without the Hassle."
         :url "http://www.cascalog.org"
         :license {:name "Eclipse Public License"
                   :url "http://www.eclipse.org/legal/epl-v10.html"}
         :mailing-list {:name "Cascalog user mailing list"
                        :archive "https://groups.google.com/d/forum/cascalog-user"
                        :post "cascalog-user@googlegroups.com"}
         :dependencies [~@DEPENDENCIES]
         ;; ~'sym keeps plugin names unqualified; plain symbols inside a
         ;; syntax-quote would be namespace-qualified.
         :plugins [[~'lein-sub "0.3.0"]
                   [~'codox "0.6.6"]]
         ;; lein-sub runs tasks in each listed module directory.
         :sub [~@MODULES]
         :codox {:src-dir-uri "http://github.com/nathanmarz/cascalog/blob/master"
                 :src-linenum-anchor-prefix "L"
                 :sources ["cascalog-core/src"
                           "cascalog-checkpoint/src"
                           "cascalog-more-taps/src"
                           "cascalog-math/src"
                           "cascalog-lzo/src"
                           "midje-cascalog/src"]}))
/**
 * Serializable base class for buffer operations. Subclasses must implement
 * operate(); prepare() and cleanup() are optional hooks that do nothing here.
 */
public abstract class CascalogBuffer implements Serializable {
  /** Optional setup hook; the default implementation is empty. */
  public void prepare(FlowProcess flowProcess, OperationCall operationCall) {
  }

  /** Performs the buffer's work for one call; implemented by subclasses. */
  public abstract void operate(FlowProcess flowProcess, BufferCall bufCall);

  /** Optional teardown hook; the default implementation is empty. */
  public void cleanup(FlowProcess flowProcess, OperationCall operationCall) {

  }
}
17 | */ 18 | 19 | package cascalog.ops; 20 | 21 | import java.util.Iterator; 22 | 23 | import cascading.flow.FlowProcess; 24 | import cascading.operation.BufferCall; 25 | import cascading.tuple.TupleEntry; 26 | import cascalog.CascalogBuffer; 27 | 28 | public class IdentityBuffer extends CascalogBuffer { 29 | 30 | public void operate(FlowProcess flowProcess, BufferCall bufCall) { 31 | Iterator it = bufCall.getArgumentsIterator(); 32 | while (it.hasNext()) { 33 | bufCall.getOutputCollector().add(it.next().getTuple()); 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /cascalog-core/src/java/cascalog/CascalogAggregatorExecutor.java: -------------------------------------------------------------------------------- 1 | package cascalog; 2 | 3 | import cascading.flow.FlowProcess; 4 | import cascading.operation.Aggregator; 5 | import cascading.operation.AggregatorCall; 6 | import cascading.operation.BaseOperation; 7 | import cascading.operation.OperationCall; 8 | import cascading.tuple.Fields; 9 | 10 | public class CascalogAggregatorExecutor extends BaseOperation implements Aggregator { 11 | CascalogAggregator agg; 12 | 13 | public CascalogAggregatorExecutor(Fields outFields, CascalogAggregator agg) { 14 | super(outFields); 15 | this.agg = agg; 16 | } 17 | 18 | @Override 19 | public void prepare(FlowProcess flowProcess, OperationCall operationCall) { 20 | agg.prepare(flowProcess, operationCall); 21 | } 22 | 23 | @Override 24 | public void cleanup(FlowProcess flowProcess, OperationCall operationCall) { 25 | agg.prepare(flowProcess, operationCall); 26 | } 27 | 28 | public void start(FlowProcess flowProcess, AggregatorCall aggCall) { 29 | agg.start(flowProcess, aggCall); 30 | } 31 | 32 | public void aggregate(FlowProcess flowProcess, AggregatorCall aggCall) { 33 | agg.aggregate(flowProcess, aggCall); 34 | } 35 | 36 | public void complete(FlowProcess flowProcess, AggregatorCall aggCall) { 37 | agg.complete(flowProcess, 
aggCall); 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /cascalog-core/src/java/cascalog/CascalogFunction.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2010 Nathan Marz 3 | 4 | Project and contact information: http://www.cascalog.org/ 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | package cascalog; 20 | 21 | import java.io.Serializable; 22 | 23 | import cascading.flow.FlowProcess; 24 | import cascading.operation.FunctionCall; 25 | import cascading.operation.OperationCall; 26 | 27 | public abstract class CascalogFunction implements Serializable { 28 | public void prepare(FlowProcess flowProcess, OperationCall operationCall) { 29 | 30 | } 31 | 32 | public abstract void operate(FlowProcess flow_process, FunctionCall fn_call); 33 | 34 | public void cleanup(FlowProcess flowProcess, OperationCall operationCall) { 35 | 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /cascalog-core/src/java/cascalog/kryo/KryoService.java: -------------------------------------------------------------------------------- 1 | package cascalog.kryo; 2 | 3 | import com.twitter.chill.KryoInstantiator; 4 | import com.twitter.chill.KryoPool; 5 | import com.twitter.chill.hadoop.HadoopConfig; 6 | import com.twitter.chill.config.ConfiguredInstantiator; 7 | import 
com.twitter.chill.config.ConfigurationException; 8 | 9 | import static cascalog.Util.clojureConf; 10 | 11 | public class KryoService { 12 | static int GUESS_THREADS_PER_CORE = 4; 13 | static int MAX_CACHED_KRYO = GUESS_THREADS_PER_CORE * Runtime.getRuntime().availableProcessors(); 14 | 15 | static final Object mutex = new Object(); 16 | static KryoPool kpool = null; 17 | 18 | public static KryoPool defaultPool() { 19 | synchronized(mutex) { 20 | if (kpool == null) { 21 | try { 22 | KryoInstantiator kryoInst = new ConfiguredInstantiator(new HadoopConfig(clojureConf())); 23 | kpool = KryoPool.withByteArrayOutputStream(MAX_CACHED_KRYO, kryoInst); 24 | } catch (ConfigurationException cx) { 25 | throw new RuntimeException(cx); 26 | } 27 | } 28 | return kpool; 29 | } 30 | } 31 | 32 | public static byte[] serialize(Object obj) { 33 | return defaultPool().toBytesWithClass(obj); 34 | } 35 | 36 | public static Object deserialize(byte[] bytes) { 37 | return defaultPool().fromBytes(bytes); 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /cascalog-core/src/java/cascalog/RegularTupleSeqConverter.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2010 Nathan Marz 3 | 4 | Project and contact information: http://www.cascalog.org/ 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
/**
 * Iterator adapter that passes each element of the wrapped iterator through
 * Util.coerceFromTuple and returns the resulting ISeq.
 *
 * NOTE(review): the Iterator types appear without type parameters here; they
 * were presumably stripped in extraction (the Tuple import suggests the
 * wrapped iterator yields Tuple) -- confirm against the real source.
 */
public class RegularTupleSeqConverter implements Iterator {
  private Iterator _tuples;

  public RegularTupleSeqConverter(Iterator tuples) {
    _tuples = tuples;
  }

  /** Delegates to the wrapped iterator. */
  public boolean hasNext() {
    return _tuples.hasNext();
  }

  /** Advances the wrapped iterator and converts the element it returns. */
  public ISeq next() {
    return Util.coerceFromTuple(_tuples.next());
  }

  /** Delegates to the wrapped iterator's remove(). */
  public void remove() {
    _tuples.remove();
  }
}
/**
 * Iterator adapter that converts a copy of each entry's tuple to an ISeq via
 * Util.coerceFromTuple.
 *
 * NOTE(review): the Iterator types appear without type parameters here; they
 * were presumably stripped in extraction (the TupleEntry import suggests the
 * wrapped iterator yields TupleEntry) -- confirm against the real source.
 */
public class TupleSeqConverter implements Iterator {
  private Iterator _tuples;

  public TupleSeqConverter(Iterator tuples) {
    _tuples = tuples;
  }

  /** Delegates to the wrapped iterator. */
  public boolean hasNext() {
    return _tuples.hasNext();
  }

  /**
   * Advances the wrapped iterator and converts the result of getTupleCopy()
   * on the returned entry, rather than the entry's own tuple.
   */
  public ISeq next() {
    return Util.coerceFromTuple(_tuples.next().getTupleCopy());
  }

  /** Delegates to the wrapped iterator's remove(). */
  public void remove() {
    _tuples.remove();
  }
}
/**
 * Buffer that emits only the first element of the arguments iterator for each
 * call. Declared fields are Fields.ARGS.
 */
public class FastFirst extends BaseOperation implements Buffer {

  public FastFirst() {
    super(Fields.ARGS);
  }

  /**
   * Adds the first element of the arguments iterator to the output collector.
   * next() is called without a hasNext() check.
   * NOTE(review): assumes the iterator is never empty when operate() runs, and
   * the (Iterator) cast looks like it lost a type parameter in extraction --
   * confirm both against the real source.
   */
  public void operate(FlowProcess flowProcess, BufferCall bufCall) {
    bufCall.getOutputCollector()
      .add(((Iterator) bufCall.getArgumentsIterator()).next());
  }
}
(defn hfs-lzo-protobuf
  "Builds a scheme for `klass` with lzo/proto-b64-line and hands it to
  hfs-tap together with `path` and any remaining `opts`."
  [path klass & opts]
  (apply hfs-tap (lzo/proto-b64-line klass) path opts))
may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | package cascalog; 20 | 21 | import cascading.flow.FlowProcess; 22 | import cascading.operation.Filter; 23 | import cascading.operation.FilterCall; 24 | import clojure.lang.IFn; 25 | import clojure.lang.ISeq; 26 | import clojure.lang.Var; 27 | 28 | public class ClojureFilter extends ClojureCascadingBase implements Filter { 29 | public ClojureFilter(IFn fn) { 30 | super(fn); 31 | } 32 | 33 | public boolean isRemove(FlowProcess fp, FilterCall call) { 34 | Var.pushThreadBindings(bindingMap); 35 | try { 36 | ISeq fnArgs = Util.coerceFromTuple(call.getArguments().getTuple()); 37 | return !Util.truthy(applyFunction(fnArgs)); 38 | } finally { 39 | Var.popThreadBindings(); 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /cascalog-core/src/java/cascalog/FilterFunctionCall.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2010 Nathan Marz 3 | 4 | Project and contact information: http://www.cascalog.org/ 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
/**
 * Presents a FunctionCall through the FilterCall interface by delegating
 * every method to the wrapped call.
 */
public class FilterFunctionCall implements FilterCall {
  // The wrapped call; all methods below forward to it.
  FunctionCall func;

  public FilterFunctionCall(FunctionCall func) {
    this.func = func;
  }

  /** Returns the wrapped call's context. */
  public Object getContext() {
    return func.getContext();
  }

  /** Stores the context on the wrapped call. */
  public void setContext(Object o) {
    func.setContext(o);
  }

  /** Returns the wrapped call's argument fields. */
  public Fields getArgumentFields() {
    return func.getArgumentFields();
  }

  /** Returns the wrapped call's arguments. */
  public TupleEntry getArguments() {
    return func.getArguments();
  }


}
16 | */ 17 | 18 | package cascalog; 19 | 20 | import java.io.Serializable; 21 | 22 | import cascading.flow.FlowProcess; 23 | import cascading.operation.AggregatorCall; 24 | import cascading.operation.OperationCall; 25 | 26 | public abstract class CascalogAggregator implements Serializable { 27 | 28 | public void prepare(FlowProcess flowProcess, OperationCall operationCall) { 29 | } 30 | 31 | public void cleanup(FlowProcess flowProcess, OperationCall operationCall) { 32 | } 33 | 34 | public abstract void start(FlowProcess flowProcess, AggregatorCall aggregatorCall); 35 | 36 | public abstract void aggregate(FlowProcess fp, AggregatorCall ac); 37 | 38 | public abstract void complete(FlowProcess flowProcess, AggregatorCall aggregatorCall); 39 | } 40 | -------------------------------------------------------------------------------- /cascalog-core/src/java/cascalog/aggregator/ClojureMonoidFunctor.java: -------------------------------------------------------------------------------- 1 | package cascalog.aggregator; 2 | 3 | import cascading.tuple.Fields; 4 | import cascading.tuple.Tuple; 5 | import cascading.tuple.TupleEntry; 6 | import cascalog.Util; 7 | import clojure.lang.IFn; 8 | import clojure.lang.ISeq; 9 | import clojure.lang.RT; 10 | 11 | public class ClojureMonoidFunctor extends FoldFunctor { 12 | final CombinerSpec combinerSpec; 13 | transient IFn prepareFn; 14 | transient IFn combineFn; 15 | transient boolean isPrepared = false; 16 | 17 | public ClojureMonoidFunctor(Fields fields, CombinerSpec combinerSpec) { 18 | super(fields); 19 | this.combinerSpec = combinerSpec; 20 | } 21 | 22 | @Override 23 | public ISeq prepare(TupleEntry args) { 24 | if (!isPrepared) { 25 | prepareFn = combinerSpec.getPrepareFn(); 26 | combineFn = combinerSpec.getCombineFn(); 27 | isPrepared = true; 28 | } 29 | return applyPrepareFn(args); 30 | } 31 | 32 | public ISeq applyPrepareFn(TupleEntry args) { 33 | ISeq input = RT.seq(Util.tupleToList(args)); 34 | if (null != prepareFn) { 35 | 
return RT.seq(Util.coerceToList(prepareFn.applyTo(input))); 36 | } else { 37 | return input; 38 | } 39 | } 40 | 41 | @Override 42 | public ISeq fold(ISeq acc, TupleEntry newArgs) { 43 | return RT.seq(Util.coerceToList( 44 | combineFn.applyTo( 45 | Util.cat(acc, applyPrepareFn(newArgs))))); 46 | } 47 | 48 | @Override 49 | public Tuple present(ISeq finalValue) { 50 | return Util.coerceToTuple(finalValue); 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /cascalog-core/src/java/cascalog/WriterOutputStream.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2010 Nathan Marz 3 | 4 | Project and contact information: http://www.cascalog.org/ 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | package cascalog; 20 | 21 | import java.io.IOException; 22 | import java.io.OutputStream; 23 | import java.io.Writer; 24 | 25 | /** This is for helping with running hadoop jobs from within emacs repl. 
*/ 26 | public class WriterOutputStream extends OutputStream { 27 | 28 | private final Writer writer; 29 | 30 | public WriterOutputStream(Writer writer) { 31 | this.writer = writer; 32 | } 33 | 34 | public void write(int b) throws IOException { 35 | write(new byte[]{(byte) b}, 0, 1); 36 | } 37 | 38 | public void write(byte b[], int off, int len) throws IOException { 39 | writer.write(new String(b, off, len)); 40 | } 41 | 42 | public void flush() throws IOException { 43 | writer.flush(); 44 | } 45 | 46 | public void close() throws IOException { 47 | writer.close(); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /cascalog-core/src/java/jcascalog/Subquery.java: -------------------------------------------------------------------------------- 1 | package jcascalog; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Arrays; 5 | import java.util.List; 6 | 7 | import cascalog.Util; 8 | 9 | public class Subquery { 10 | List _preds = new ArrayList(); 11 | Fields _outFields; 12 | Predicate _currPred = null; 13 | 14 | public Subquery(String... fields) { 15 | this(Arrays.asList(fields)); 16 | } 17 | 18 | public Subquery(List fields) { 19 | _outFields = new Fields((List) fields); 20 | } 21 | 22 | public Fields getOutputFields() { 23 | return _outFields; 24 | } 25 | 26 | public Object getCompiledSubquery() { 27 | return Util.bootSimpleFn("cascalog.logic.parse", "parse-subquery") 28 | .invoke(_outFields, _preds); 29 | } 30 | 31 | public Subquery predicate(Object op, Object... fields) { 32 | return predicate(op, Arrays.asList(fields)); 33 | } 34 | 35 | public Subquery predicate(Object op, List fields) { 36 | _currPred = new Predicate(op, fields); 37 | _preds.add(_currPred); 38 | return this; 39 | } 40 | 41 | public Subquery predicate(Predicate p) { 42 | _preds.add(p); 43 | return this; 44 | } 45 | 46 | public Subquery out(Object... 
fields) { 47 | return out(Arrays.asList(fields)); 48 | } 49 | 50 | public Subquery out(List fields) { 51 | if (_currPred == null) { 52 | throw new RuntimeException("Cannot declare outfields for no predicate"); 53 | } else { 54 | _currPred._outFields = fields; 55 | _currPred = null; 56 | return this; 57 | } 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /cascalog-core/src/clj/cascalog/logic/testing.clj: -------------------------------------------------------------------------------- 1 | (ns cascalog.logic.testing 2 | (:require [clojure.test :refer :all] 3 | [cascalog.api :refer :all] 4 | [jackknife.seq :refer (collectify multi-set)] 5 | [cascalog.logic.platform :as platform])) 6 | 7 | (defn doublify 8 | "Takes a sequence of tuples and converts all numbers to doubles. 9 | For example: 10 | (doublify [[1 :a] [2 :b]]) 11 | ;; [[1.0 :a] [2.0 :b]]" 12 | [tuples] 13 | (vec (for [t tuples] 14 | (into [] (map (fn [v] (if (number? v) (double v) v)) 15 | (collectify t)))))) 16 | 17 | (defn is-specs= [set1 set2] 18 | (every? true? (doall 19 | (map (fn [input output] 20 | (let [input (multi-set (doublify input)) 21 | output (multi-set (doublify output))] 22 | (is (= input output)))) 23 | set1 set2)))) 24 | 25 | (defn is-tuplesets= [set1 set2] 26 | (is-specs= [set1] [set2])) 27 | 28 | (defprotocol ITestable 29 | (process?- [_ bindings] 30 | "Used in testing, returns the result from processing the bindings")) 31 | 32 | (defn test?- [& bindings] 33 | (let [[specs out-tuples] (process?- platform/*platform* bindings)] 34 | (is-specs= specs out-tuples))) 35 | 36 | (defmacro test?<- [& args] 37 | (let [[begin body] (if (keyword? (first args)) 38 | (split-at 2 args) 39 | (split-at 1 args))] 40 | `(test?- ~@begin (<- ~@body)))) 41 | 42 | (defmacro thrown?<- [error & body] 43 | `(is (~'thrown? 
~error (<- ~@body)))) 44 | -------------------------------------------------------------------------------- /cascalog-core/src/java/cascalog/ClojureMap.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2010 Nathan Marz 3 | 4 | Project and contact information: http://www.cascalog.org/ 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | package cascalog; 20 | 21 | import cascading.flow.FlowProcess; 22 | import cascading.operation.Function; 23 | import cascading.operation.FunctionCall; 24 | import cascading.tuple.Fields; 25 | import clojure.lang.IFn; 26 | import clojure.lang.ISeq; 27 | import clojure.lang.Var; 28 | 29 | public class ClojureMap extends ClojureCascadingBase implements Function { 30 | public ClojureMap(Fields outputFields, IFn fn) { 31 | super(outputFields, fn); 32 | } 33 | 34 | public void operate(FlowProcess fp, final FunctionCall call) { 35 | Var.pushThreadBindings(bindingMap); 36 | try { 37 | ISeq fnArgs = Util.coerceFromTuple(call.getArguments().getTuple()); 38 | Object res = applyFunction(fnArgs); 39 | call.getOutputCollector().add(Util.coerceToTuple(res)); 40 | } finally { 41 | Var.popThreadBindings(); 42 | } 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /cascalog-core/src/java/cascalog/kryo/ClojureKryoInstantiator.java: -------------------------------------------------------------------------------- 1 | 
package cascalog.kryo; 2 | 3 | import com.esotericsoftware.kryo.Kryo; 4 | import com.twitter.chill.KryoInstantiator; 5 | import com.twitter.chill.config.Config; 6 | import com.twitter.chill.config.ReflectingInstantiator; 7 | import com.twitter.chill.config.ConfigurationException; 8 | import org.objenesis.strategy.StdInstantiatorStrategy; 9 | 10 | import static carbonite.JavaBridge.enhanceRegistry; 11 | 12 | import java.util.ArrayList; 13 | import java.util.HashMap; 14 | import java.util.HashSet; 15 | /** KryoInstantiator that obtains each Kryo from a ReflectingInstantiator built on the supplied Config, then adds the carbonite serializers and registers ArrayList, HashMap and HashSet. */ 16 | public class ClojureKryoInstantiator extends KryoInstantiator { 17 | 18 | final Config config; 19 | final ReflectingInstantiator reflectingInst; 20 | /** Builds the backing ReflectingInstantiator from config; a ConfigurationException is rethrown as a RuntimeException with the original as its cause. */ 21 | public ClojureKryoInstantiator(Config config) { 22 | try { 23 | this.config = config; 24 | reflectingInst = new ReflectingInstantiator(config); 25 | } catch (ConfigurationException cx) { 26 | throw new RuntimeException(cx); 27 | } 28 | } 29 | /** Creates and configures a new Kryo. Reference tracking follows the "cascalog.kryo.setreferences" config flag (default false), and the current thread's context class loader is installed. Any failure is rethrown as a RuntimeException whose cause is the original exception. */ 30 | @Override 31 | public Kryo newKryo() { 32 | try { 33 | Kryo k = reflectingInst.newKryo(); 34 | // register all the carbonite serializers 35 | enhanceRegistry(k); 36 | 37 | k.register(ArrayList.class); 38 | k.register(HashMap.class); 39 | k.register(HashSet.class); 40 | 41 | k.setReferences(config.getBoolean("cascalog.kryo.setreferences", false)); 42 | k.setClassLoader(Thread.currentThread().getContextClassLoader()); 43 | 44 | return k; 45 | 46 | } catch (Exception e) { 47 | /* pass e as the cause so the original stack trace is not lost */ throw new RuntimeException("unable to create new Kryo: " + e, e); 48 | } 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /cascalog-core/example.job-conf.clj: -------------------------------------------------------------------------------- 1 | ;; ## job-conf.clj 2 | ;; 3 | ;; This is example job-conf.clj file, meant to provide default 4 | ;;settings to all queries executed inside this project. To get 5 | ;;started, create a file called "job-conf.clj" inside the "resources" 6 | ;;directory at your project's root.
This is called "resources" by 7 | ;;default, though you can customize this in project.clj with the 8 | ;;following k-v pair: 9 | ;; 10 | ;; :resources-path "confdir" 11 | ;; 12 | ;; job-conf.clj must end with a job-conf map. Feel free to define 13 | ;; functions, import namespaces and evaluate code above the final 14 | ;; return form. 15 | ;; 16 | ;; Here's an import of Hadoop's Java serialization interface: 17 | (import 'org.apache.hadoop.io.serializer.JavaSerialization) 18 | 19 | ;; And here's Backtype's Thrift serialization. Get this by including 20 | ;; 21 | ;; [backtype/cascading-thrift "0.1.0"] 22 | ;; 23 | ;; as a dependency. 24 | 25 | (import 'backtype.hadoop.ThriftSerialization) 26 | 27 | ;; Now, the job-conf map: 28 | {"io.serializations" JavaSerialization} 29 | 30 | ;; To provide multiple arguments, skip the usual comma separation and 31 | ;; wrap multiple arguments in a vector: 32 | ;; {"io.serializations" [ThriftSerialization JavaSerialization]} 33 | 34 | ;; The above examples use class symbols directly. You can also use 35 | ;; string versions of the fully qualified class names. 36 | 37 | {"io.serializations" ["backtype.hadoop.ThriftSerialization" 38 | "org.apache.hadoop.io.serializer.JavaSerialization"]} 39 | 40 | ;; That's it! The above map will get returned, as it's the last form 41 | ;; in the file. 42 | -------------------------------------------------------------------------------- /cascalog-core/src/java/cascalog/test/CountAgg.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2010 Nathan Marz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version.
8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see <http://www.gnu.org/licenses/>. 16 | */ 17 | 18 | package cascalog.test; 19 | 20 | import cascading.flow.FlowProcess; 21 | import cascading.operation.AggregatorCall; 22 | import cascading.tuple.Tuple; 23 | import cascalog.CascalogAggregator; 24 | 25 | public class CountAgg extends CascalogAggregator { 26 | 27 | @Override 28 | public void start(FlowProcess flowProcess, AggregatorCall aggregatorCall) { 29 | aggregatorCall.setContext(0); 30 | } 31 | 32 | @Override 33 | public void aggregate(FlowProcess flowProcess, AggregatorCall aggregatorCall) { 34 | int count = (Integer) aggregatorCall.getContext(); 35 | aggregatorCall.setContext(count + 1); 36 | } 37 | 38 | @Override 39 | public void complete(FlowProcess flowProcess, AggregatorCall aggregatorCall) { 40 | int count = (Integer) aggregatorCall.getContext(); 41 | aggregatorCall.getOutputCollector().add(new Tuple(count)); 42 | } 43 | 44 | } 45 | -------------------------------------------------------------------------------- /cascalog-core/src/java/cascalog/ClojureCombiner.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2010 Nathan Marz 3 | 4 | Project and contact information: http://www.cascalog.org/ 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | package cascalog; 20 | 21 | import java.util.List; 22 | 23 | import cascading.operation.FunctionCall; 24 | import cascading.operation.OperationCall; 25 | import cascading.tuple.Fields; 26 | import cascading.tuple.Tuple; 27 | import cascading.tuple.TupleEntryCollector; 28 | 29 | public class ClojureCombiner extends ClojureCombinerBase { 30 | 31 | public ClojureCombiner(Fields groupFields, List argFields, Fields outFields, 32 | List agg_specs) { 33 | super(groupFields, false, null, argFields, outFields, agg_specs, "cascalog.combiner.aggregator.size", 10000); 34 | } 35 | 36 | @Override 37 | protected void write(Tuple group, List val, OperationCall opCall) { 38 | TupleEntryCollector output = ((FunctionCall) opCall).getOutputCollector(); 39 | Tuple t = new Tuple(group); 40 | for (Object o : val) { 41 | t.add(o); 42 | } 43 | output.add(t); 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /cascalog-core/src/java/cascalog/aggregator/FoldFunctor.java: -------------------------------------------------------------------------------- 1 | package cascalog.aggregator; 2 | 3 | import cascading.flow.FlowProcess; 4 | import cascading.pipe.assembly.AggregateBy; 5 | import cascading.tuple.Fields; 6 | import cascading.tuple.Tuple; 7 | import cascading.tuple.TupleEntry; 8 | 9 | /** 10 | * Straight-up port of Scalding's FoldFunctor. 
11 | */ 12 | public abstract class FoldFunctor<T> implements AggregateBy.Functor { 13 | protected final Fields fields; 14 | protected Tuple nextContext = null; 15 | /** @param fields returned unchanged by getDeclaredFields() */ 16 | public FoldFunctor(Fields fields) { 17 | this.fields = fields; 18 | } 19 | /* Subclass hooks: prepare builds the accumulator from the first arguments, fold merges further arguments into it, present converts the final accumulator into the result Tuple. */ 20 | public abstract T prepare(TupleEntry args); 21 | public abstract T fold(T acc, TupleEntry newArgs); 22 | public abstract Tuple present(T finalValue); 23 | 24 | @Override 25 | public Fields getDeclaredFields() { 26 | return fields; 27 | } 28 | /** Keeps the accumulator in slot 0 of a one-element context tuple: a null context gets a fresh tuple seeded with prepare(args); otherwise the existing tuple is reused and its value replaced by fold(oldValue, args). Returns that tuple. */ 29 | @Override 30 | public Tuple aggregate(FlowProcess flowProcess, TupleEntry args, Tuple context) { 31 | T nextContextObj; 32 | if (null == context) { 33 | nextContext = Tuple.size(1); 34 | nextContextObj = prepare(args); 35 | } else { 36 | T oldValue = (T) context.getObject(0); 37 | nextContext = context; 38 | nextContextObj = fold(oldValue, args); 39 | } 40 | nextContext.set(0, nextContextObj); 41 | return nextContext; 42 | } 43 | /** Reads the accumulator from slot 0, clears that slot, and returns present(result). Throws RuntimeException if context is null (no prior aggregate call produced one); the message names the concrete subclass. */ 44 | @Override 45 | public Tuple complete(FlowProcess flowProcess, Tuple context) { 46 | if (null == context) { 47 | throw new RuntimeException(getClass().getSimpleName() + " completed without any aggregate calls"); 48 | } else { 49 | T result = (T) context.getObject(0); 50 | context.set(0, null); 51 | return present(result); 52 | } 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /cascalog-core/src/java/cascalog/test/SumAgg.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2010 Nathan Marz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see <http://www.gnu.org/licenses/>. 16 | */ 17 | 18 | package cascalog.test; 19 | 20 | import cascading.flow.FlowProcess; 21 | import cascading.operation.AggregatorCall; 22 | import cascading.tuple.Tuple; 23 | import cascalog.CascalogAggregator; 24 | 25 | public class SumAgg extends CascalogAggregator { 26 | 27 | @Override 28 | public void start(FlowProcess flowProcess, AggregatorCall aggregatorCall) { 29 | aggregatorCall.setContext(0); 30 | } 31 | 32 | @Override 33 | public void aggregate(FlowProcess flowProcess, AggregatorCall aggregatorCall) { 34 | int sum = (Integer) aggregatorCall.getContext(); 35 | sum += aggregatorCall.getArguments().getInteger(0); 36 | aggregatorCall.setContext(sum); 37 | } 38 | 39 | @Override 40 | public void complete(FlowProcess flowProcess, AggregatorCall aggregatorCall) { 41 | int sum = (Integer) aggregatorCall.getContext(); 42 | aggregatorCall.getOutputCollector().add(new Tuple(sum)); 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /cascalog-math/src/cascalog/math/stats.clj: -------------------------------------------------------------------------------- 1 | (ns cascalog.math.stats 2 | (:use cascalog.api) 3 | (:require [cascalog.logic.ops :as c] 4 | [cascalog.logic.def :as d] 5 | [cascalog.math.contrib [accumulators :as acc]])) 6 | 7 | (defn initialize-mean-variance-parallel [& X] 8 | (map (fn [x] (acc/mean-variance {:mean x :variance 0 :n 1})) X)) 9 | 10 | (d/defparallelagg mean-variance-parallel 11 | :init-var #'initialize-mean-variance-parallel 12 | :combine-var #'acc/combine) 13 | 14 | (defn get-variance [mvp-struct] 15 | (double (mvp-struct :variance))) 16 | 17 | (def sample-variance-parallel 18 | "Predicate macro that calculates the sample variance of the supplied input 19 | var, in a parallel, numerically stable way."
20 | (<- [!val :> !var] 21 | (mean-variance-parallel :< !val :> !ret) 22 | (get-variance :< !ret :> !var))) 23 | 24 | (def variance 25 | "Predicate macro that calculates the variance of the supplied input 26 | var." 27 | (<- [!val :> !var] 28 | (* !val !val :> !squared) 29 | (c/sum !squared :> !square-sum) 30 | (c/count !count) 31 | (c/avg !val :> !mean) 32 | (* !mean !mean :> !mean-squared) 33 | (div !square-sum !count :> !i) 34 | (- !i !mean-squared :> !var))) 35 | 36 | (def sample-variance 37 | "Predicate macro that calculates the sample variance of the supplied input 38 | var." 39 | (<- [!val :> !var] 40 | (* !val !val :> !squared) 41 | (c/sum !squared :> !squared-sum) 42 | (c/count !count) 43 | (c/sum !val :> !sum) 44 | (c/avg !val :> !mean) 45 | (* !sum !mean :> !i) 46 | (- !squared-sum !i :> !num) 47 | (- !count 1 :> !denom) 48 | (div !num !denom :> !var))) 49 | 50 | -------------------------------------------------------------------------------- /cascalog-core/src/java/cascalog/CascalogBufferExecutor.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2010 Nathan Marz 3 | 4 | Project and contact information: http://www.cascalog.org/ 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | */ 18 | 19 | package cascalog; 20 | 21 | import cascading.flow.FlowProcess; 22 | import cascading.operation.BaseOperation; 23 | import cascading.operation.Buffer; 24 | import cascading.operation.BufferCall; 25 | import cascading.operation.OperationCall; 26 | import cascading.tuple.Fields; 27 | 28 | public class CascalogBufferExecutor extends BaseOperation implements Buffer { 29 | CascalogBuffer buf; 30 | 31 | public CascalogBufferExecutor(Fields outFields, CascalogBuffer buf) { 32 | super(outFields); 33 | this.buf = buf; 34 | } 35 | 36 | public void prepare(FlowProcess flowProcess, OperationCall operationCall) { 37 | buf.prepare(flowProcess, operationCall); 38 | } 39 | 40 | public void operate(FlowProcess flowProcess, BufferCall bufCall) { 41 | buf.operate(flowProcess, bufCall); 42 | } 43 | 44 | public void cleanup(FlowProcess flowProcess, OperationCall operationCall) { 45 | buf.cleanup(flowProcess, operationCall); 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /cascalog-core/src/java/cascalog/CascalogFunctionExecutor.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2010 Nathan Marz 3 | 4 | Project and contact information: http://www.cascalog.org/ 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | */ 18 | 19 | package cascalog; 20 | 21 | import cascading.flow.FlowProcess; 22 | import cascading.operation.BaseOperation; 23 | import cascading.operation.Function; 24 | import cascading.operation.FunctionCall; 25 | import cascading.operation.OperationCall; 26 | import cascading.tuple.Fields; 27 | /** Cascading Function that declares the given output fields and forwards prepare, operate and cleanup to the wrapped CascalogFunction. */ 28 | public class CascalogFunctionExecutor extends BaseOperation implements Function { 29 | CascalogFunction fn; 30 | 31 | public CascalogFunctionExecutor(Fields out_fields, CascalogFunction fn) { 32 | super(out_fields); 33 | this.fn = fn; 34 | } 35 | 36 | public void prepare(FlowProcess flowProcess, OperationCall operationCall) { 37 | fn.prepare(flowProcess, operationCall); 38 | } 39 | 40 | public void operate(FlowProcess flow_process, FunctionCall fn_call) { 41 | fn.operate(flow_process, fn_call); 42 | } 43 | 44 | public void cleanup(FlowProcess flowProcess, OperationCall operationCall) { 45 | fn.cleanup(flowProcess, operationCall); 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /cascalog-core/src/java/cascalog/ClojureParallelAgg.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2010 Nathan Marz 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see <http://www.gnu.org/licenses/>.
16 | */ 17 | 18 | package cascalog; 19 | 20 | import java.util.List; 21 | 22 | import cascading.flow.FlowProcess; 23 | import cascalog.aggregator.CombinerSpec; 24 | import clojure.lang.IFn; 25 | import clojure.lang.RT; 26 | 27 | public class ClojureParallelAgg implements ParallelAgg { 28 | CombinerSpec _spec; 29 | IFn _initFn; 30 | IFn _combinerFn; 31 | 32 | public ClojureParallelAgg(CombinerSpec spec) { 33 | _spec = spec; 34 | } 35 | 36 | // TODO: Remove this once we have a functor properly in place. 37 | public void prepare(FlowProcess flowProcess) { 38 | _initFn = Util.deserializeFn(_spec.prepareFn); 39 | _combinerFn = Util.deserializeFn(_spec.combineFn); 40 | } 41 | 42 | public List init(List input) { 43 | return Util.coerceToList(_initFn.applyTo(RT.seq(input))); 44 | } 45 | 46 | public List combine(List val1, List val2) { 47 | return Util.coerceToList(_combinerFn.applyTo(Util.cat(RT.seq(val1), RT.seq(val2)))); 48 | } 49 | 50 | } 51 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # See this guide for more settings: 2 | # 3 | # http://about.travis-ci.org/docs/user/build-configuration/ 4 | 5 | language: clojure 6 | lein: lein2 7 | script: JVM_OPTS="-Djava.library.path=$PWD/hadoop-lzo-native/lib" lein2 sub with-profile dev,provided test 8 | before_install: 9 | - cat /etc/hosts # optionally check the content *before* 10 | - sudo hostname "$(hostname | cut -c1-63)" 11 | - sed -e "s/^\\(127\\.0\\.0\\.1.*\\)/\\1 $(hostname | cut -c1-63)/" /etc/hosts | sudo tee /etc/hosts 12 | - cat /etc/hosts # optionally check the content *after* 13 | - lein2 sub install 14 | - sudo apt-get update -qq 15 | - sudo apt-get install -qq protobuf-compiler 16 | - sudo apt-get install -qq libboost-dev libboost-test-dev libboost-program-options-dev libevent-dev automake libtool flex bison pkg-config g++ libssl-dev 17 | - wget -nv 
http://archive.apache.org/dist/thrift/0.7.0/thrift-0.7.0.tar.gz 18 | - tar zxf thrift-0.7.0.tar.gz 19 | - cd thrift-0.7.0 20 | - chmod +x ./configure 21 | - ./configure --disable-gen-erl --disable-gen-hs --without-ruby --without-haskell --without-erlang 22 | - sudo make install 23 | - cd .. 24 | - sudo apt-get -qq install lzop liblzo2-dev # libzo2-dev for compiling hadoop-lzo 25 | - git clone git://github.com/twitter/hadoop-lzo.git # for native libgplcompression 26 | - cd hadoop-lzo 27 | - git checkout master 28 | - mvn compile 29 | - mv target/native/Linux-* ../hadoop-lzo-native 30 | - cd .. 31 | env: 32 | - C_INCLUDE_PATH=/usr/include/lzo LIBRARY_PATH=/usr/lib/x86_64-linux-gnu 33 | branches: 34 | only: 35 | - develop 36 | - master 37 | jdk: 38 | - openjdk6 39 | cache: 40 | directories: 41 | - $HOME/.m2 42 | notifications: 43 | email: 44 | - sritchie09@gmail.com 45 | - paul@quantisan.com 46 | -------------------------------------------------------------------------------- /cascalog-core/test/cascalog/cascading/flow_test.clj: -------------------------------------------------------------------------------- 1 | (ns cascalog.cascading.flow-test 2 | (:use [midje sweet cascalog] 3 | clojure.test 4 | cascalog.logic.testing 5 | cascalog.cascading.testing 6 | cascalog.api) 7 | (:require [cascalog.cascading.operations :as ops] 8 | [cascalog.logic.platform :as p] 9 | [cascalog.cascading.flow :as f])) 10 | 11 | (background 12 | (before :facts 13 | (set-cascading-platform!))) 14 | 15 | (defn square [x] 16 | (* x x)) 17 | 18 | (deftest to-memory-test 19 | (let [gen (-> (p/generator [1 2 3 4]) 20 | (ops/rename* "?x") 21 | (ops/map* square "?x" "?x2"))] 22 | (fact 23 | (f/to-memory gen) 24 | => [[1 1] [2 4] [3 9] [4 16]]))) 25 | 26 | (comment 27 | "Turn these into valid tests." 
28 | (require '[cascalog.logic.parse :refer (<-)] 29 | '[cascalog.cascading.flow :refer (all-to-memory to-memory graph)]) 30 | 31 | (def cross-join 32 | (<- [:>] (identity 1 :> _))) 33 | 34 | (let [sq (<- [?squared ?squared-minus ?x ?sum] 35 | ([1 2 3] ?x) 36 | (* ?x ?x :> ?squared) 37 | (- ?squared 1 :> ?squared-minus) 38 | ((d/parallelagg* +) ?squared :> ?sum))] 39 | (to-memory sq)) 40 | 41 | (let [sq (<- [?x ?y] 42 | ([1 2 3] ?x) 43 | ([1 2 3] ?y) 44 | (cross-join) 45 | (* ?x ?y :> ?z))] 46 | (to-memory sq)) 47 | 48 | (let [x (<- [?x ?y :> ?z] 49 | (* ?x ?x :> 10) 50 | (* ?x ?y :> ?z)) 51 | sq (<- [?a ?b ?z] 52 | ([[1 2 3]] ?a) 53 | (x ?a ?a :> 4) 54 | ((d/bufferop* +) ?a :> ?z) 55 | ((d/mapcatop* +) ?a 10 :> ?b))] 56 | (clojure.pprint/pprint (build-rule sq)))) 57 | -------------------------------------------------------------------------------- /cascalog-math/test/cascalog/math/stats_test.clj: -------------------------------------------------------------------------------- 1 | (ns cascalog.math.stats-test 2 | (:use [cascalog.math.stats] 3 | [cascalog.api] 4 | [clojure.test] 5 | [midje sweet cascalog]) 6 | (:import (cern.jet.random.tdouble DoubleUniform))) 7 | 8 | (background 9 | (before :facts 10 | (set-cascading-platform!))) 11 | 12 | (defn sample-uniform [size min-val max-val seed] 13 | (let [dist (DoubleUniform. (double min-val) (double max-val) seed)] 14 | (for [_ (range size)] (. 
dist nextDouble)))) 15 | 16 | ;; TODO add test 17 | (fact 18 | (<- [?x] ([[1]] ?x)) => (produces [[1]])) 19 | 20 | ;; variance 21 | (fact "" 22 | (let [source [[0]]] 23 | (<- [!var] (source !val) (variance :< !val :> !var))) => 24 | (produces [[0.0]])) 25 | 26 | (fact "" 27 | (let [source [[0] [1]]] 28 | (<- [!var] (source !val) (variance :< !val :> !var))) => 29 | (produces [[0.25]])) 30 | 31 | (fact "variance is numerically unstable, resulting in a very wrong answer" 32 | (let [n 100 33 | lo 1000000000 34 | hi (+ 1 lo) 35 | seed 1234 36 | source (sample-uniform n lo hi seed)] 37 | (<- [!var] (source !val) (variance :< !val :> !var))) => 38 | (produces [[256.0]])) 39 | 40 | ;; sample-variance-parallel 41 | (fact "" 42 | (let [source [[0]]] 43 | (<- [!var] (source !val) (sample-variance-parallel :< !val :> !var))) => 44 | (produces [[0.0]])) 45 | 46 | (fact "" 47 | (let [source [[0] [1]]] 48 | (<- [!var] (source !val) (sample-variance-parallel :< !val :> !var))) => 49 | (produces [[0.5]])) 50 | 51 | (fact "variance-parallel is stable, resulting in nearly the right answer" 52 | (let [n 100 53 | lo 1000000000 54 | hi (+ 1 lo) 55 | seed 1234 56 | source (sample-uniform n lo hi seed)] 57 | (<- [!var] (source !val) (sample-variance-parallel :< !val :> !var))) => 58 | (produces [[0.09958331251840505]])) 59 | -------------------------------------------------------------------------------- /cascalog-core/src/java/cascalog/ClojureBufferIter.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2010 Nathan Marz 3 | 4 | Project and contact information: http://www.cascalog.org/ 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 
8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | package cascalog; 20 | 21 | import cascading.flow.FlowProcess; 22 | import cascading.operation.Buffer; 23 | import cascading.operation.BufferCall; 24 | import cascading.tuple.Fields; 25 | import cascading.tuple.TupleEntryCollector; 26 | import clojure.lang.IFn; 27 | import clojure.lang.ISeq; 28 | import clojure.lang.RT; 29 | import clojure.lang.Var; 30 | 31 | public class ClojureBufferIter extends ClojureCascadingBase implements Buffer { 32 | 33 | public ClojureBufferIter(Fields outputFields, IFn fn) { 34 | super(outputFields, fn); 35 | } 36 | 37 | public void operate(FlowProcess flow_process, BufferCall call) { 38 | Var.pushThreadBindings(bindingMap); 39 | try { 40 | ISeq resultSeq = 41 | RT.seq(invokeFunction(new TupleSeqConverter(call.getArgumentsIterator()))); 42 | TupleEntryCollector collector = call.getOutputCollector(); 43 | while (resultSeq != null) { 44 | Object obj = resultSeq.first(); 45 | collector.add(Util.coerceToTuple(obj)); 46 | resultSeq = resultSeq.next(); 47 | } 48 | } finally { 49 | Var.popThreadBindings(); 50 | } 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /cascalog-core/src/java/cascalog/ClojureMapcat.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2010 Nathan Marz 3 | 4 | Project and contact information: http://www.cascalog.org/ 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 
8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | package cascalog; 20 | 21 | import cascading.flow.FlowProcess; 22 | import cascading.operation.Function; 23 | import cascading.operation.FunctionCall; 24 | import cascading.tuple.Fields; 25 | import cascading.tuple.TupleEntryCollector; 26 | import clojure.lang.IFn; 27 | import clojure.lang.ISeq; 28 | import clojure.lang.RT; 29 | import clojure.lang.Var; 30 | 31 | public class ClojureMapcat extends ClojureCascadingBase implements Function { 32 | 33 | public ClojureMapcat(Fields outputFields, IFn fn) { 34 | super(outputFields, fn); 35 | } 36 | 37 | public void operate(FlowProcess fp, FunctionCall call) { 38 | Var.pushThreadBindings(bindingMap); 39 | try { 40 | ISeq fnArgs = Util.coerceFromTuple(call.getArguments().getTuple()); 41 | ISeq resultSeq = RT.seq(applyFunction(fnArgs)); 42 | TupleEntryCollector collector = call.getOutputCollector(); 43 | while (resultSeq != null) { 44 | Object obj = resultSeq.first(); 45 | collector.add(Util.coerceToTuple(obj)); 46 | resultSeq = resultSeq.next(); 47 | } 48 | } finally { 49 | Var.popThreadBindings(); 50 | } 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /cascalog-core/src/java/cascalog/CascadingFilterToFunction.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2010 Nathan Marz 3 | 4 | Project and contact information: http://www.cascalog.org/ 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the 
License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | package cascalog; 20 | 21 | import cascading.flow.FlowProcess; 22 | import cascading.operation.BaseOperation; 23 | import cascading.operation.Filter; 24 | import cascading.operation.Function; 25 | import cascading.operation.FunctionCall; 26 | import cascading.operation.OperationCall; 27 | import cascading.tuple.Fields; 28 | import cascading.tuple.Tuple; 29 | 30 | public class CascadingFilterToFunction extends BaseOperation implements Function { 31 | Filter filter; 32 | 33 | public CascadingFilterToFunction(String outfield, Filter filter) { 34 | super(new Fields(outfield)); 35 | this.filter = filter; 36 | } 37 | 38 | public void prepare(FlowProcess flowProcess, OperationCall operationCall) { 39 | filter.prepare(flowProcess, operationCall); 40 | } 41 | 42 | public void operate(FlowProcess process, FunctionCall call) { 43 | boolean ret = !filter.isRemove(process, new FilterFunctionCall(call)); 44 | call.getOutputCollector().add(new Tuple(ret)); 45 | } 46 | 47 | public void cleanup(FlowProcess flowProcess, OperationCall operationCall) { 48 | filter.cleanup(flowProcess, operationCall); 49 | } 50 | 51 | } 52 | -------------------------------------------------------------------------------- /cascalog-core/src/java/cascalog/CascadingFunctionWrapper.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package cascalog; 18 | 19 | import cascading.flow.FlowProcess; 20 | import cascading.operation.BaseOperation; 21 | import cascading.operation.Function; 22 | import cascading.operation.FunctionCall; 23 | import cascading.operation.OperationCall; 24 | import cascading.tuple.Fields; 25 | 26 | /** 27 | * 28 | */ 29 | public class CascadingFunctionWrapper extends BaseOperation implements Function { 30 | private final Function func; 31 | 32 | public CascadingFunctionWrapper(Fields fields, Function func) { 33 | super(func.getNumArgs(), fields); 34 | this.func = func; 35 | } 36 | 37 | public void operate(FlowProcess flowProcess, FunctionCall tFunctionCall) { 38 | func.operate(flowProcess, tFunctionCall); 39 | } 40 | 41 | public void prepare(FlowProcess flowProcess, OperationCall tOperationCall) { 42 | func.prepare(flowProcess, tOperationCall); 43 | } 44 | 45 | public void flush(FlowProcess flowProcess, OperationCall tOperationCall) { 46 | func.flush(flowProcess, tOperationCall); 47 | } 48 | 49 | public void cleanup(FlowProcess flowProcess, OperationCall tOperationCall) { 50 | func.cleanup(flowProcess, tOperationCall); 51 | } 52 | 53 | public boolean isSafe() { 54 | return func.isSafe(); 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /cascalog-lzo/README.md: -------------------------------------------------------------------------------- 1 | # Cascalog-Lzo 2 | 3 | Based on the excellent work in Elephant-Bird: 4 | 5 | https://github.com/dvryaboy/elephant-bird/tree/eb-dev 6 | 7 | 
NOTE: If you just want to read .lzo files, you only need to set up Hadoop to do so. Then the normal `(hfs-textline "my_lzo_file.lzo")` will work. 8 | AWS EMR is set up to include hadoop-lzo by default. 9 | 10 | ### Configuring Hadoop 11 | 12 | You can find more information about Hadoop-LZO [on Cloudera](http://www.cloudera.com/blog/2009/11/hadoop-at-twitter-part-1-splittable-lzo-compression/). 13 | 14 | [Quick install guide](http://docs.hortonworks.com/HDPDocuments/HDP2/HDP-2.0.9.0/bk_installing_manually_book/content/rpm-chap2-3.html) 15 | 16 | ## Usage 17 | 18 | Add the following to `project.clj`: 19 | 20 | [cascalog/cascalog-lzo "3.0.0-SNAPSHOT"] 21 | 22 | To use: `(:require [cascalog.lzo :as lzo])` and then create sinks or sources as `(lzo/hfs-lzo-textline directory)` 23 | 24 | Tested with Hadoop 2.4 and 2.6. 25 | 26 | Stay tuned for updates! 27 | 28 | ### Installing Local Dependencies 29 | 30 | On OS X: 31 | 32 | 1. Install MacPorts 33 | 2. sudo port install lzo 34 | 3. If you're on Lion, you'll have to re-install your Java development headers [here](http://connect.apple.com/cgi-bin/WebObjects/MemberSite.woa/wa/download?path=%2FDeveloper_Tools%2Fjava_for_mac_os_x_10.7_update_1_developer_package%2Fjavadeveloper_for_mac_os_x_10.7__11m3527.dmg&wosid=Mo5ndLZsjioK2DIXcKKGLmyLffK). 35 | 4. Download the [lzo native libs](https://github.com/nathanmarz/cascalog-contrib/downloads) and place them in `/opt/local/lib`. 36 | 37 | ### Building Hadoop-Lzo 38 | 39 | This is only necessary if you're trying to rebuild this project. 
40 | 41 | ```bash 42 | git clone https://github.com/twitter/hadoop-lzo 43 | cd hadoop-lzo 44 | 45 | JAVA_HOME=$(/usr/libexec/java_home) \ 46 | C_INCLUDE_PATH=/opt/local/include LIBRARY_PATH=/opt/local/lib \ 47 | CFLAGS="-arch x86_64" mvn clean test install 48 | ``` 49 | -------------------------------------------------------------------------------- /cascalog-core/src/java/cascalog/aggregator/CombinerSpec.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2010 Nathan Marz 3 | 4 | Project and contact information: http://www.cascalog.org/ 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | package cascalog.aggregator; 20 | 21 | import java.io.Serializable; 22 | 23 | import cascalog.Util; 24 | import clojure.lang.IFn; 25 | 26 | public class CombinerSpec implements Serializable { 27 | public byte[] prepareFn; 28 | public final byte[] combineFn; 29 | public byte[] presentFn; 30 | 31 | public CombinerSpec(IFn combineFn) { 32 | this.combineFn = Util.serializeFn(combineFn); 33 | } 34 | 35 | public CombinerSpec setPrepareFn(IFn prepareFn) { 36 | this.prepareFn = (null == prepareFn) ? null : Util.serializeFn(prepareFn); 37 | return this; 38 | } 39 | 40 | public CombinerSpec setPresentFn(IFn presentFn) { 41 | this.presentFn = (null == presentFn) ? 
null : Util.serializeFn(presentFn); 42 | return this; 43 | } 44 | 45 | public IFn getPrepareFn() { 46 | if (null == prepareFn) 47 | return null; 48 | else 49 | return Util.deserializeFn(prepareFn); 50 | } 51 | public IFn getCombineFn() { 52 | return Util.deserializeFn(combineFn); 53 | } 54 | 55 | public IFn getPresentFn() { 56 | if (null == presentFn) 57 | return null; 58 | else 59 | return Util.deserializeFn(presentFn); 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /cascalog-core/src/java/cascalog/ClojureBuffer.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2010 Nathan Marz 3 | 4 | Project and contact information: http://www.cascalog.org/ 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | */ 18 | 19 | package cascalog; 20 | 21 | import cascading.flow.FlowProcess; 22 | import cascading.operation.Buffer; 23 | import cascading.operation.BufferCall; 24 | import cascading.tuple.Fields; 25 | import cascading.tuple.TupleEntryCollector; 26 | import clojure.lang.IFn; 27 | import clojure.lang.ISeq; 28 | import clojure.lang.IteratorSeq; 29 | import clojure.lang.RT; 30 | import clojure.lang.Var; 31 | 32 | public class ClojureBuffer extends ClojureCascadingBase implements Buffer { 33 | 34 | public ClojureBuffer(Fields outputFields, IFn fn) { 35 | super(outputFields, fn); 36 | } 37 | 38 | public void operate(FlowProcess flow_process, BufferCall call) { 39 | Var.pushThreadBindings(bindingMap); 40 | try { 41 | ISeq resultSeq = RT.seq(invokeFunction(IteratorSeq 42 | .create(new TupleSeqConverter(call.getArgumentsIterator())))); 43 | TupleEntryCollector collector = call.getOutputCollector(); 44 | while (resultSeq != null) { 45 | Object obj = resultSeq.first(); 46 | collector.add(Util.coerceToTuple(obj)); 47 | resultSeq = resultSeq.next(); 48 | } 49 | } finally { 50 | Var.popThreadBindings(); 51 | } 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /cascalog-core/src/java/cascalog/ops/KryoInsert.java: -------------------------------------------------------------------------------- 1 | package cascalog.ops; 2 | 3 | import cascading.flow.FlowProcess; 4 | import cascading.operation.BaseOperation; 5 | import cascading.operation.Function; 6 | import cascading.operation.FunctionCall; 7 | import cascading.tuple.Fields; 8 | import cascading.tuple.Tuple; 9 | import cascalog.kryo.KryoService; 10 | 11 | public class KryoInsert extends BaseOperation implements Function { 12 | /** Field values */ 13 | private final byte[] serialized; 14 | private transient Tuple values; 15 | 16 | public KryoInsert(Fields fieldDeclaration, Object... 
values) { 17 | super(0, fieldDeclaration); 18 | this.serialized = KryoService.serialize(values); 19 | 20 | if (!fieldDeclaration.isSubstitution() && fieldDeclaration.size() != values.length) { 21 | throw new IllegalArgumentException("fieldDeclaration must be the same size as the given values"); 22 | } 23 | } 24 | 25 | public Tuple getTuple() { 26 | if (this.values == null) { 27 | Object[] values = (Object[]) KryoService.deserialize(this.serialized); 28 | this.values = new Tuple(values); 29 | } 30 | return this.values; 31 | } 32 | 33 | public void operate(FlowProcess flowProcess, FunctionCall functionCall) { 34 | functionCall.getOutputCollector().add(new Tuple(getTuple())); 35 | } 36 | 37 | @Override 38 | public boolean equals(Object object) { 39 | if (this == object) { return true; } 40 | if (!(object instanceof KryoInsert)) { return false; } 41 | if (!super.equals(object)) { return false; } 42 | 43 | KryoInsert insert = (KryoInsert) object; 44 | 45 | Tuple tuple = getTuple(); 46 | 47 | return !(tuple != null ? !tuple.equals(insert.getTuple()) : insert.getTuple() != null); 48 | } 49 | 50 | @Override public int hashCode() { 51 | int result = super.hashCode(); 52 | result = 31 * result + (getTuple() != null ? getTuple().hashCode() : 0); 53 | return result; 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /cascalog-core/src/java/cascalog/ClojureMultibuffer.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2010 Nathan Marz 3 | 4 | Project and contact information: http://www.cascalog.org/ 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 
8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | package cascalog; 20 | 21 | import java.util.ArrayList; 22 | import java.util.List; 23 | 24 | import cascading.tuple.Fields; 25 | import cascalog.MultiGroupBy.MultiBuffer; 26 | import cascalog.MultiGroupBy.MultiBufferContext; 27 | import clojure.lang.IFn; 28 | import clojure.lang.ISeq; 29 | import clojure.lang.IteratorSeq; 30 | import clojure.lang.RT; 31 | import clojure.lang.Var; 32 | 33 | public class ClojureMultibuffer extends ClojureCascadingBase implements MultiBuffer { 34 | 35 | public ClojureMultibuffer(Fields outputFields, IFn fn) { 36 | super(outputFields, fn); 37 | } 38 | 39 | public void operate(MultiBufferContext context) { 40 | List inputTuples = new ArrayList(); 41 | 42 | for (int i = 0; i < context.size(); i++) { 43 | inputTuples 44 | .add(IteratorSeq.create(new RegularTupleSeqConverter(context.getArgumentsIterator(i)))); 45 | } 46 | 47 | Var.pushThreadBindings(bindingMap); 48 | try { 49 | ISeq resultSeq = RT.seq(applyFunction(RT.seq(inputTuples))); 50 | while (resultSeq != null) { 51 | Object obj = resultSeq.first(); 52 | context.emit(Util.coerceToTuple(obj)); 53 | resultSeq = resultSeq.next(); 54 | } 55 | 56 | 57 | } finally { 58 | Var.popThreadBindings(); 59 | } 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /cascalog-core/test/cascalog/jcascalog_cascading_test.clj: -------------------------------------------------------------------------------- 1 | (ns cascalog.jcascalog-cascading-test 2 | (:use clojure.test 3 | cascalog.api 4 | cascalog.logic.testing 5 
| cascalog.cascading.testing) 6 | (:require [cascalog.cascading.tap :as tap] 7 | [cascalog.cascading.io :as io]) 8 | (:import [cascalog.test MultiplyAgg RangeOp DoubleOp] 9 | [jcascalog Api Subquery] 10 | [jcascalog.op Count Sum Multiply])) 11 | 12 | (use-fixtures :once 13 | (fn [f] 14 | (Api/setCascadingPlatform) 15 | (f))) 16 | 17 | (deftest test-vanilla 18 | (let [value [["a" 1] ["a" 2] ["b" 10] 19 | ["c" 3] ["b" 2] ["a" 6]]] 20 | (test?- [[(* 1 2 3628800 6 2 720) 24]] 21 | (-> (Subquery. ["?result" "?count"]) 22 | (.predicate value ["_" "?v"]) 23 | (.predicate (RangeOp.) ["?v"]) (.out ["?v2"]) 24 | (.predicate (MultiplyAgg.) ["?v2"]) (.out ["?result"]) 25 | (.predicate (Count.) ["?count"]))))) 26 | 27 | (deftest test-java-each 28 | (let [data [[1 2 3] [4 5 6]]] 29 | (test?- [[2 4 6] [8 10 12]] 30 | (-> (Subquery. ["?x" "?y" "?z"]) 31 | (.predicate data ["?a" "?b" "?c"]) 32 | (.predicate (Api/each (DoubleOp.)) 33 | ["?a" "?b" "?c"]) (.out ["?x" "?y" "?z"]))))) 34 | 35 | (deftest test-compile-flow 36 | (io/with-fs-tmp [_ sink-path] 37 | (let [sink (mk-test-sink ["?letter" "?doublesum"] sink-path) 38 | value [["a" 1] ["a" 2] ["b" 10] 39 | ["c" 3] ["b" 2] ["a" 6]] 40 | expected [["a" 18] ["b" 24] ["c" 6]] 41 | flow (Api/compileFlow "testFlow" sink 42 | (-> (Subquery. ["?letter" "?doublesum"]) 43 | (.predicate value ["?letter" "?v"]) 44 | (.predicate (Multiply.) ["?v" 2]) (.out ["?double"]) 45 | (.predicate (Sum.) ["?double"]) (.out ["?doublesum"])))] 46 | (.complete flow) 47 | (is-tuplesets= expected (tap/get-sink-tuples sink))))) 48 | -------------------------------------------------------------------------------- /cascalog-core/src/clj/cascalog/logic/platform.clj: -------------------------------------------------------------------------------- 1 | (ns cascalog.logic.platform 2 | "The execution platform class." 
3 | (:refer-clojure :exclude [run!]) 4 | (:require [cascalog.logic.zip :as zip] 5 | [jackknife.core :as u])) 6 | 7 | ;; ## Platform Protocol 8 | (defprotocol IPlatform 9 | (generator? [p x] 10 | "Returns true if the supplied x is a generator, false 11 | otherwise.") 12 | 13 | (generator-builder [p gen fields options] 14 | "Returns some source representation.") 15 | 16 | (run! [p name bindings]) 17 | 18 | (run-to-memory! [p name queries])) 19 | 20 | ;; This is required so that the *platform* var isn't nil 21 | (defrecord EmptyPlatform [] 22 | IPlatform 23 | (generator? [p _] 24 | (u/throw-illegal (str p " isn't a valid platform."))) 25 | 26 | (generator-builder [p _ _ _] 27 | (u/throw-illegal (str p " isn't a valid platform."))) 28 | 29 | (run! [p _ _] 30 | (u/throw-illegal (str p " isn't a valid platform."))) 31 | 32 | (run-to-memory! [p _ _] 33 | (u/throw-illegal (str p " isn't a valid platform.")))) 34 | 35 | (def ^:dynamic *platform* (EmptyPlatform.)) 36 | 37 | (defn set-platform! [c] 38 | (alter-var-root #'*platform* (constantly c))) 39 | 40 | (defmacro with-platform 41 | [platform & body] 42 | `(binding [*platform* ~platform] 43 | ~@body)) 44 | 45 | (defn gen-dispatch 46 | "Dispatch for the generator multimethod." 47 | [gen] 48 | [(type *platform*) (type gen)]) 49 | 50 | (defmulti generator 51 | "Accepts some type and returns a platform specific representation 52 | that can be used as a generator." 53 | gen-dispatch) 54 | 55 | (defn platform-generator? 56 | "Evaluates whether there is a method to dispatch to for the 57 | generator multimethod." 58 | [g] 59 | (not (nil? 
60 | (.getMethod generator (gen-dispatch g))))) 61 | 62 | (defmulti to-generator 63 | (fn [item] 64 | [(type *platform*) (type item)])) 65 | 66 | (defn compile-query [query] 67 | (zip/postwalk-edit 68 | (zip/cascalog-zip query) 69 | identity 70 | (fn [x _] (to-generator x)) 71 | :encoder (fn [x] 72 | (or (:identifier x) x)))) 73 | -------------------------------------------------------------------------------- /cascalog-core/src/java/cascalog/aggregator/ClojureAggregator.java: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | Copyright 2010 Nathan Marz 4 | 5 | Project and contact information: http://www.cascalog.org/ 6 | 7 | Licensed under the Apache License, Version 2.0 (the "License"); 8 | you may not use this file except in compliance with the License. 9 | You may obtain a copy of the License at 10 | 11 | http://www.apache.org/licenses/LICENSE-2.0 12 | 13 | Unless required by applicable law or agreed to in writing, software 14 | distributed under the License is distributed on an "AS IS" BASIS, 15 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | See the License for the specific language governing permissions and 17 | limitations under the License. 
18 | */ 19 | 20 | package cascalog.aggregator; 21 | 22 | import java.util.Collection; 23 | 24 | import cascading.flow.FlowProcess; 25 | import cascading.operation.Aggregator; 26 | import cascading.operation.AggregatorCall; 27 | import cascading.tuple.Fields; 28 | import cascading.tuple.TupleEntryCollector; 29 | import cascalog.ClojureCascadingBase; 30 | import cascalog.Util; 31 | import clojure.lang.IFn; 32 | import clojure.lang.ISeq; 33 | import clojure.lang.RT; 34 | 35 | public class ClojureAggregator extends ClojureCascadingBase implements Aggregator { 36 | 37 | public ClojureAggregator(Fields outputFields, IFn fn) { 38 | super(outputFields, fn); 39 | } 40 | 41 | public void start(FlowProcess flow_process, AggregatorCall ag_call) { 42 | ag_call.setContext(invokeFunction()); 43 | } 44 | 45 | public void aggregate(FlowProcess flow_process, AggregatorCall ag_call) { 46 | ISeq fn_args_seq = Util.coerceFromTuple(ag_call.getArguments().getTuple()); 47 | ag_call.setContext(applyFunction(RT.cons(ag_call.getContext(), fn_args_seq))); 48 | } 49 | 50 | public void complete(FlowProcess flow_process, AggregatorCall ag_call) { 51 | Collection coll = (Collection) invokeFunction(ag_call.getContext()); 52 | 53 | TupleEntryCollector collector = ag_call.getOutputCollector(); 54 | 55 | if (coll != null) { 56 | for (Object o : coll) { 57 | collector.add(Util.coerceToTuple(o)); 58 | } 59 | } 60 | } 61 | } -------------------------------------------------------------------------------- /cascalog-more-taps/src/java/cascalog/moreTaps/WholeFileRecordReader.java: -------------------------------------------------------------------------------- 1 | package cascalog.moreTaps; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.FSDataInputStream; 5 | import org.apache.hadoop.fs.FileSystem; 6 | import org.apache.hadoop.fs.Path; 7 | import org.apache.hadoop.io.BytesWritable; 8 | import org.apache.hadoop.io.IOUtils; 9 | import org.apache.hadoop.io.Text; 10 | 
import org.apache.hadoop.mapred.FileSplit; 11 | import org.apache.hadoop.mapred.RecordReader; 12 | 13 | import java.io.IOException; 14 | 15 | class WholeFileRecordReader implements RecordReader { 16 | 17 | private FileSplit fileSplit; 18 | private Configuration conf; 19 | private boolean processed = false; 20 | 21 | public WholeFileRecordReader(FileSplit fileSplit, Configuration conf) throws IOException { 22 | this.fileSplit = fileSplit; 23 | this.conf = conf; 24 | } 25 | 26 | public boolean next(Text key, BytesWritable value) throws IOException { 27 | if (!processed) { 28 | byte[] contents = new byte[(int) fileSplit.getLength()]; 29 | Path file = fileSplit.getPath(); 30 | 31 | String fileName = file.getName(); 32 | key.set(fileName); 33 | 34 | FileSystem fs = file.getFileSystem(conf); 35 | FSDataInputStream in = null; 36 | try { 37 | in = fs.open(file); 38 | IOUtils.readFully(in, contents, 0, contents.length); 39 | value.set(contents, 0, contents.length); 40 | } finally { 41 | IOUtils.closeStream(in); 42 | } 43 | processed = true; 44 | return true; 45 | } 46 | return false; 47 | } 48 | 49 | public Text createKey() { 50 | return new Text(); 51 | } 52 | 53 | public BytesWritable createValue() { 54 | return new BytesWritable(); 55 | } 56 | 57 | public long getPos() throws IOException { 58 | return processed ? fileSplit.getLength() : 0; 59 | } 60 | 61 | public float getProgress() throws IOException { 62 | return processed ? 
1.0f : 0.0f; 63 | } 64 | 65 | public void close() throws IOException { 66 | // do nothing 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /cascalog-core/project.clj: -------------------------------------------------------------------------------- 1 | (def ROOT-DIR (subs *file* 0 (- (count *file*) (count "project.clj")))) 2 | (def HADOOP-VERSION (-> ROOT-DIR (str "/../HADOOP-VERSION") slurp)) 3 | (def VERSION (-> ROOT-DIR (str "/../VERSION") slurp)) 4 | (def CC-VERSION (or (System/getenv "CASCALOG_CASCADING_VERSION") "2.5.3")) 5 | 6 | (defproject cascalog/cascalog-core VERSION 7 | :description "Cascalog core libraries." 8 | :url "http://www.cascalog.org" 9 | :license {:name "Eclipse Public License" 10 | :url "http://www.eclipse.org/legal/epl-v10.html"} 11 | :jvm-opts ["-Xmx768m" 12 | "-server" 13 | "-agentlib:jdwp=transport=dt_socket,server=y,suspend=n"] 14 | :javac-options ["-target" "1.6" "-source" "1.6"] 15 | :source-paths ["src/clj"] 16 | :java-source-paths ["src/java"] 17 | :jar-exclusions [#"\.java$"] 18 | :repositories {"conjars" "http://conjars.org/repo/"} 19 | :exclusions [log4j/log4j org.slf4j/slf4j-log4j12] 20 | :dependencies [[org.clojure/clojure "1.6.0"] 21 | [org.clojure/tools.macro "0.1.2"] 22 | [log4j "1.2.16"] 23 | [org.slf4j/slf4j-log4j12 "1.6.6"] 24 | [cascading/cascading-hadoop ~CC-VERSION 25 | :exclusions [org.codehaus.janino/janino 26 | org.apache.hadoop/hadoop-core]] 27 | [com.twitter/chill-hadoop "0.3.5"] 28 | [com.twitter/carbonite "1.4.0"] 29 | [com.twitter/maple "0.2.2"] 30 | [prismatic/schema "0.3.7" 31 | :exclusions [org.clojure/clojurescript]] 32 | [jackknife "0.1.7"] 33 | [hadoop-util "0.3.0"]] 34 | :profiles {:1.3 {:dependencies [[org.clojure/clojure "1.3.0"]]} 35 | :1.4 {:dependencies [[org.clojure/clojure "1.4.0"]]} 36 | :1.5 {:dependencies [[org.clojure/clojure "1.5.1"]]} 37 | :1.6 {:dependencies [[org.clojure/clojure "1.6.0"]]} 38 | :1.7 {:dependencies [[org.clojure/clojure 
"1.7.0"]]} 39 | :provided {:dependencies [[org.apache.hadoop/hadoop-core ~HADOOP-VERSION]]} 40 | :dev {:resource-paths ["dev"] 41 | :plugins [[lein-midje "3.1.3"]] 42 | :injections [(require 'schema.core) 43 | (schema.core/set-fn-validation! true)] 44 | :dependencies 45 | [[cascalog/midje-cascalog ~VERSION]]}}) 46 | -------------------------------------------------------------------------------- /cascalog-core/src/java/cascalog/ClojureCombinedAggregator.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2010 Nathan Marz 3 | 4 | Project and contact information: http://www.cascalog.org/ 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | */ 18 | 19 | package cascalog; 20 | 21 | import java.util.List; 22 | 23 | import cascading.flow.FlowProcess; 24 | import cascading.operation.Aggregator; 25 | import cascading.operation.AggregatorCall; 26 | import cascading.operation.BaseOperation; 27 | import cascading.operation.OperationCall; 28 | import cascading.tuple.Fields; 29 | 30 | public class ClojureCombinedAggregator extends BaseOperation implements Aggregator { 31 | private ParallelAgg _agg; 32 | 33 | public ClojureCombinedAggregator(Fields outfields, ParallelAgg agg) { 34 | super(outfields); 35 | _agg = agg; 36 | } 37 | 38 | @Override 39 | public void prepare(FlowProcess flowProcess, OperationCall opCall) { 40 | _agg.prepare(flowProcess); 41 | } 42 | 43 | public void start(FlowProcess flowProcess, AggregatorCall aggCall) { 44 | aggCall.setContext(null); 45 | } 46 | 47 | public void aggregate(FlowProcess flowProcess, AggregatorCall aggCall) { 48 | try { 49 | List args = Util.tupleToList(aggCall.getArguments()); 50 | List currContext = (List) aggCall.getContext(); 51 | if (currContext == null) { 52 | aggCall.setContext(args); 53 | } else { 54 | aggCall.setContext(_agg.combine(currContext, args)); 55 | } 56 | } catch (Exception e) { 57 | throw new RuntimeException(e); 58 | } 59 | } 60 | 61 | public void complete(FlowProcess flowProcess, AggregatorCall aggCall) { 62 | try { 63 | aggCall.getOutputCollector().add(Util.coerceToTuple(aggCall.getContext())); 64 | } catch (Exception e) { 65 | throw new RuntimeException(e); 66 | } 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /cascalog-core/src/java/jcascalog/PredicateMacroTemplate.java: -------------------------------------------------------------------------------- 1 | package jcascalog; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Arrays; 5 | import java.util.List; 6 | 7 | import cascalog.Util; 8 | import clojure.lang.Keyword; 9 | 10 | public class PredicateMacroTemplate { 11 | public static 
PredicateMacroTemplateBuilder build(String... inFields) { 12 | return build(Arrays.asList(inFields)); 13 | } 14 | 15 | public static PredicateMacroTemplateBuilder build(List inFields) { 16 | return new PredicateMacroTemplateBuilder(inFields); 17 | } 18 | 19 | public static class PredicateMacroTemplateBuilder { 20 | List _inFields; 21 | 22 | public PredicateMacroTemplateBuilder(List inFields) { 23 | _inFields = inFields; 24 | } 25 | 26 | public PredicateMacroTemplate out(String... outFields) { 27 | return out(Arrays.asList(outFields)); 28 | } 29 | 30 | public PredicateMacroTemplate out(List outFields) { 31 | return new PredicateMacroTemplate(_inFields, outFields); 32 | } 33 | } 34 | 35 | List _inFields; 36 | List _outFields; 37 | List _preds = new ArrayList(); 38 | Predicate _currPred = null; 39 | 40 | public PredicateMacroTemplate(List inFields, List outFields) { 41 | _inFields = inFields; 42 | _outFields = outFields; 43 | } 44 | 45 | public PredicateMacroTemplate predicate(Object op, Object... fields) { 46 | return predicate(op, Arrays.asList(fields)); 47 | } 48 | 49 | public PredicateMacroTemplate predicate(Object op, List fields) { 50 | _currPred = new Predicate(op, fields); 51 | _preds.add(_currPred); 52 | return this; 53 | } 54 | 55 | public PredicateMacroTemplate predicate(Predicate p) { 56 | _preds.add(p); 57 | return this; 58 | } 59 | 60 | public PredicateMacroTemplate out(Object... 
fields) { 61 | return out(Arrays.asList(fields)); 62 | } 63 | 64 | public PredicateMacroTemplate out(List fields) { 65 | if (_currPred == null) { 66 | throw new RuntimeException("Cannot declare outfields for no predicate"); 67 | } else { 68 | _currPred._outFields = fields; 69 | _currPred = null; 70 | return this; 71 | } 72 | } 73 | 74 | public Object getCompiledPredMacro() { 75 | return Util.bootSimpleFn("cascalog.logic.predmacro", "build-predmacro") 76 | .invoke(_inFields, _outFields, _preds); 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /cascalog-core/test/cascalog/logic/defops_test.clj: -------------------------------------------------------------------------------- 1 | (ns cascalog.logic.defops-test 2 | (:use cascalog.api 3 | clojure.test 4 | [midje sweet cascalog])) 5 | 6 | (use-fixtures :once 7 | (fn [f] 8 | (set-cascading-platform!) 9 | (f) 10 | (set-in-memory-platform!) 11 | (f))) 12 | 13 | (defmapop ident [x] x) 14 | 15 | (defmapop ident-doc 16 | "Identity operation." 17 | [x] x) 18 | 19 | (defmapop ident-meta 20 | {:great-meta "yes!"} 21 | [x] x) 22 | 23 | (defmapop ident-both 24 | "Identity operation." 25 | {:great-meta "yes!"} 26 | [x] x) 27 | 28 | (defn ident-stateful 29 | {:great-meta "yes!"} 30 | [y] 31 | "Identity operation." 32 | (let [state 3] 33 | (mapfn [x] (+ x y state)))) 34 | 35 | (deftest defops-arg-parsing-test 36 | (let [src [[1] [2]] 37 | mk-query (fn [afn] 38 | (<- [?y] (src ?x) (afn ?x :> ?y)))] 39 | 40 | "This query should add 3 plus the param to each input var from 41 | src." 42 | (fact (<- [?y] 43 | (src ?x) 44 | ((ident-stateful 1) ?x :> ?y)) 45 | => (produces [[5] [6]])) 46 | (tabular 47 | (fact 48 | "Each function will be applied to `mk-query` in turn; all of 49 | these functions act as identity transformations, so each query 50 | should produce the original source without modification." 
51 | (mk-query ?func) => (produces src)) 52 | ?func 53 | ident 54 | ident-doc 55 | ident-meta 56 | ident-both))) 57 | 58 | (deftest metadata-test 59 | (facts "Metadata testing." 60 | "var should have custom metadata." 61 | (meta #'ident-stateful) => (contains {:great-meta "yes!"}) 62 | 63 | "var should have a docstring." 64 | (meta #'ident-doc) => (contains {:doc "Identity operation."}) 65 | 66 | "ident-meta shouldn't have a docstring in its metadata." 67 | (meta #'ident-meta) =not=> (contains {:doc anything}))) 68 | 69 | (defn five->two [a b c d e] 70 | [(+ a b c) (+ d e)]) 71 | 72 | (defn four->one [a b c d] 73 | (+ a b c d)) 74 | 75 | (defparallelagg multi-combine 76 | :init-var #'five->two 77 | :combine-var #'four->one) 78 | 79 | (deftest agg-test 80 | (fact "Test of aggregators with multiple arguments." 81 | (let [src [[1 2 3 4 5] [5 6 7 8 9]]] 82 | "init-var takes n args, outputs x. combine-var takes 2*x args, 83 | outputs x." 84 | (<- [?sum] 85 | (src ?a ?b ?c ?d ?e) 86 | (multi-combine ?a ?b ?c ?d ?e :> ?sum)) 87 | => (produces [[50]])))) 88 | -------------------------------------------------------------------------------- /cascalog-core/src/java/cascalog/aggregator/ClojureParallelAggregator.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2010 Nathan Marz 3 | 4 | Project and contact information: http://www.cascalog.org/ 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | */ 18 | 19 | package cascalog.aggregator; 20 | 21 | import java.util.List; 22 | 23 | import cascading.flow.FlowProcess; 24 | import cascading.operation.Aggregator; 25 | import cascading.operation.AggregatorCall; 26 | import cascading.operation.BaseOperation; 27 | import cascading.operation.OperationCall; 28 | import cascading.tuple.Fields; 29 | import cascalog.ParallelAgg; 30 | import cascalog.Util; 31 | 32 | public class ClojureParallelAggregator extends BaseOperation implements Aggregator { 33 | ParallelAgg agg; 34 | 35 | public ClojureParallelAggregator(Fields outfields, ParallelAgg agg) { 36 | super(outfields); 37 | this.agg = agg; 38 | } 39 | 40 | public void prepare(FlowProcess flowProcess, OperationCall opCall) { 41 | this.agg.prepare(flowProcess); 42 | } 43 | 44 | public void start(FlowProcess flowProcess, AggregatorCall aggCall) { 45 | aggCall.setContext(null); 46 | } 47 | 48 | public void aggregate(FlowProcess flowProcess, AggregatorCall aggCall) { 49 | try { 50 | List initted = agg.init(Util.tupleToList(aggCall.getArguments().getTuple())); 51 | 52 | List currContext = (List) aggCall.getContext(); 53 | if (currContext == null) { 54 | aggCall.setContext(initted); 55 | } else { 56 | aggCall.setContext(agg.combine(currContext, initted)); 57 | } 58 | } catch (Exception e) { 59 | throw new RuntimeException(e); 60 | } 61 | } 62 | 63 | public void complete(FlowProcess flowProcess, AggregatorCall aggCall) { 64 | try { 65 | aggCall.getOutputCollector().add(Util.coerceToTuple(aggCall.getContext())); 66 | } catch (Exception e) { 67 | throw new RuntimeException(e); 68 | } 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /cascalog-core/test/cascalog/logic/vars_test.clj: -------------------------------------------------------------------------------- 1 | (ns cascalog.logic.vars-test 2 | (:use midje.sweet 3 | cascalog.logic.vars)) 4 | 5 | (facts 6 | "Strings and symbols work as vars" 7 | ["?a" '?a '?face_two '!!two 
'!a] => (has every? cascalog-var?) 8 | 9 | "The underscore isn't a cascalog-var," 10 | ['_ "_"] =not=> (has some cascalog-var?) 11 | 12 | "But it is reserved, along with & (for predmacros)." 13 | ['_ "_" '& "&"] => (has every? reserved?) 14 | 15 | "Unground vars begin with !!" 16 | '!!a => unground-var? 17 | 18 | "! and ? vars are ground." 19 | ['!a '?a] => (has every? ground-var?) 20 | 21 | "Adding !!a causes the test to fail." 22 | ['!!a '!a '?a] =not=> (has every? ground-var?) 23 | 24 | "A sequence of vars is only fully ground if every var is ground." 25 | ['?b '!a '?a] => fully-ground? 26 | 27 | "As before, the addition of an unground var ungrounds the sequence." 28 | ['!!b '!a '?a] =not=> fully-ground?) 29 | 30 | 31 | (fact 32 | "with-logic-vars allows logic symbols to be used without quoting." 33 | (with-logic-vars 34 | (str !!d ?a ?b "see!") => "!!d?a?bsee!")) 35 | 36 | (let [non-nullables (gen-non-nullable-vars 10) 37 | nullables (gen-nullable-vars 10)] 38 | (fact 39 | "The non-nullable generator generates non-nullable vars..." 40 | non-nullables => (has every? non-nullable-var?) 41 | 42 | "And no nullables." 43 | non-nullables =not=> (has some nullable-var?) 44 | 45 | "The nullable generator generates nullable vars..." 46 | nullables => (has every? nullable-var?) 47 | 48 | "And no non-nullables." 49 | nullables =not=> (has some non-nullable-var?))) 50 | 51 | (fact 52 | "Sanitize replaces cascalog variables with strings and munges 53 | underscores. The replaced underscore is unground if any of the 54 | replaced variables are unground." 55 | (sanitize [* '!!a '?b '_ '& :> 10]) => (fn [result] 56 | (let [ignored (nth result 3)] 57 | (and (cascalog-var? ignored) 58 | (unground-var? ignored) 59 | (= result [* "!!a" "?b" ignored "&" :> 10])))) 60 | 61 | "Sanitize also works with deeply nested structures."
62 | (sanitize 63 | {:key1 [['?a] '_ '&] 64 | :key2 ['othersym [] '?b]}) 65 | => (fn [result] 66 | (let [ignored (-> result :key1 second)] 67 | (and (cascalog-var? ignored) 68 | (ground-var? ignored) 69 | (= result {:key1 [["?a"] ignored "&"] 70 | :key2 ['othersym [] "?b"]}))))) 71 | -------------------------------------------------------------------------------- /cascalog-core/src/clj/cascalog/logic/options.clj: -------------------------------------------------------------------------------- 1 | (ns cascalog.logic.options 2 | (:require [jackknife.core :refer (throw-illegal uuid)] 3 | [jackknife.seq :as s])) 4 | 5 | ;; ## Option Parsing 6 | ;; 7 | ;; The following code deals with parsing of Cascalog's option 8 | ;; predicates. The goal is to accept the options for a particular 9 | ;; subquery and generate a map of supported option -> the option's 10 | ;; value. 11 | ;; 12 | ;; A couple of nice extensions would be: 13 | ;; 14 | ;; * Validation for each option type. 15 | ;; * Can the user add an option type on the fly? Is there some generic 16 | ;; way to specify an option's meaning within the predicate? Not sure 17 | ;; how we can make this pluggable, other than just allowing all options. 18 | 19 | (def DEFAULT-OPTIONS 20 | "The set of options supported by Cascalog, mapped to default values." 21 | {:distinct false 22 | :sort nil 23 | :reverse nil 24 | :trap nil 25 | :spill-threshold nil 26 | :reducers nil 27 | :name "" 28 | :stats-fn nil}) 29 | 30 | (defn careful-merge 31 | "Semigroup that keeps the right value when the left value is nil 32 | or equal to the right value. If neither condition holds, the merge 33 | will throw an exception." 34 | [l r] 35 | (if-not (or (nil? l) (= l r)) 36 | (throw-illegal (format "Same option set to conflicting values: %s vs %s." 37 | l r)) 38 | r)) 39 | 40 | (def option? 41 | "A predicate is an option if it begins with a keyword." 42 | (comp keyword?
:op)) 43 | 44 | (defn generate-option-map 45 | "Accepts a sequence of option predicates and generates a map of 46 | option -> value." 47 | [opt-predicates] 48 | (->> opt-predicates 49 | (map (fn [{:keys [op input output]}] 50 | (assert (contains? DEFAULT-OPTIONS op) 51 | (str op " is not a valid option predicate")) 52 | {op (condp = op 53 | ;; Flatten sorting fields. 54 | :sort (flatten input) 55 | ;; TODO: validation. 56 | :trap {:tap (first input) :name (uuid)} 57 | ;; Otherwise, take the first item. TODO: Throw if 58 | ;; more than one item exists for non-sorting 59 | ;; fields. 60 | (first input))})) 61 | (apply merge-with careful-merge) 62 | (merge DEFAULT-OPTIONS))) 63 | 64 | (defn extract-options 65 | "Accepts a sequence of raw predicates and returns a 2-vector of 66 | [option-map, rest-of-preds]." 67 | [preds] 68 | (let [[raw-options preds] (s/separate option? preds)] 69 | [(generate-option-map raw-options) preds])) 70 | -------------------------------------------------------------------------------- /cascalog-more-taps/src/java/cascalog/moreTaps/WholeFile.java: -------------------------------------------------------------------------------- 1 | package cascalog.moreTaps; 2 | 3 | import cascading.flow.FlowProcess; 4 | import cascading.scheme.Scheme; 5 | import cascading.scheme.SinkCall; 6 | import cascading.scheme.SourceCall; 7 | import cascading.tap.Tap; 8 | import cascading.tuple.Fields; 9 | import cascading.tuple.Tuple; 10 | import org.apache.hadoop.io.BytesWritable; 11 | import org.apache.hadoop.io.Text; 12 | import org.apache.hadoop.mapred.JobConf; 13 | import org.apache.hadoop.mapred.OutputCollector; 14 | import org.apache.hadoop.mapred.RecordReader; 15 | 16 | import java.io.IOException; 17 | import java.lang.Override; 18 | 19 | public class WholeFile extends 20 | Scheme, OutputCollector, Object[], Object[]> { 21 | 22 | public WholeFile( Fields fields ) { 23 | super(fields); 24 | } 25 | 26 | @Override 27 | public void sourceConfInit(FlowProcess 
flowProcess, 28 | Tap, OutputCollector> tap, 29 | JobConf conf) { 30 | conf.setInputFormat( WholeFileInputFormat.class ); 31 | } 32 | 33 | @Override 34 | public void sinkConfInit(FlowProcess flowProcess, 35 | Tap, OutputCollector> tap, 36 | JobConf conf) { 37 | throw new UnsupportedOperationException("Not supported yet."); 38 | } 39 | 40 | @Override 41 | public void sourcePrepare(FlowProcess flowProcess, 42 | SourceCall> sourceCall) { 43 | sourceCall.setContext(new Object[2]); 44 | 45 | sourceCall.getContext()[0] = sourceCall.getInput().createKey(); 46 | sourceCall.getContext()[1] = sourceCall.getInput().createValue(); 47 | } 48 | 49 | @Override 50 | public boolean source(FlowProcess flowProcess, 51 | SourceCall> sourceCall) throws IOException { 52 | 53 | 54 | Text key = (Text) sourceCall.getContext()[0]; 55 | BytesWritable value = (BytesWritable) sourceCall.getContext()[1]; 56 | 57 | boolean result = sourceCall.getInput().next(key, value); 58 | 59 | if (!result) 60 | return false; 61 | 62 | sourceCall.getIncomingEntry().setTuple(new Tuple(key.toString(), value)); 63 | return true; 64 | } 65 | 66 | @Override 67 | public void sink(FlowProcess flowProcess, 68 | SinkCall outputCollectorSinkCall) throws IOException { 69 | throw new UnsupportedOperationException("Not supported yet."); 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /cascalog-math/src/cascalog/math/contrib/def.clj: -------------------------------------------------------------------------------- 1 | ;; Copyright (c) Stephen C. Gilardi. All rights reserved. The use and 2 | ;; distribution terms for this software are covered by the Eclipse Public 3 | ;; License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) which can 4 | ;; be found in the file epl-v10.html at the root of this distribution. By 5 | ;; using this software in any fashion, you are agreeing to be bound by the 6 | ;; terms of this license. 
You must not remove this notice, or any other, 7 | ;; from this software. 8 | ;; 9 | ;; File: def.clj 10 | ;; 11 | ;; def.clj provides variants of def that make including doc strings and 12 | ;; making private definitions more succinct. 13 | ;; 14 | ;; scgilardi (gmail) 15 | ;; 17 May 2008 16 | 17 | (ns 18 | #^{:author "Stephen C. Gilardi", 19 | :doc "def.clj provides variants of def that make including doc strings and 20 | making private definitions more succinct."} 21 | cascalog.math.contrib.def) 22 | 23 | (defmacro defvar 24 | "Defines a var with an optional initializer and doc string" 25 | ([name] 26 | (list `def name)) 27 | ([name init] 28 | (list `def name init)) 29 | ([name init doc] 30 | (list `def (with-meta name (assoc (meta name) :doc doc)) init))) 31 | 32 | ; name-with-attributes by Konrad Hinsen: 33 | (defn name-with-attributes 34 | "To be used in macro definitions. 35 | Handles optional docstrings and attribute maps for a name to be defined 36 | in a list of macro arguments. If the first macro argument is a string, 37 | it is added as a docstring to name and removed from the macro argument 38 | list. If afterwards the first macro argument is a map, its entries are 39 | added to the name's metadata map and the map is removed from the 40 | macro argument list. The return value is a vector containing the name 41 | with its extended metadata map and the list of unprocessed macro 42 | arguments." 43 | [name macro-args] 44 | (let [[docstring macro-args] (if (string? (first macro-args)) 45 | [(first macro-args) (next macro-args)] 46 | [nil macro-args]) 47 | [attr macro-args] (if (map?
(first macro-args)) 48 | [(first macro-args) (next macro-args)] 49 | [{} macro-args]) 50 | attr (if docstring 51 | (assoc attr :doc docstring) 52 | attr) 53 | attr (if (meta name) 54 | (conj (meta name) attr) 55 | attr)] 56 | [(with-meta name attr) macro-args])) 57 | 58 | -------------------------------------------------------------------------------- /cascalog-core/test/cascalog/cascading/util_test.clj: -------------------------------------------------------------------------------- 1 | (ns cascalog.cascading.util-test 2 | (:use midje.sweet 3 | cascalog.cascading.util) 4 | (:import [cascading.tuple Fields] 5 | [cascading.pipe Pipe] 6 | [cascalog.aggregator ClojureAggregator] 7 | [cascalog ClojureFilter ClojureMap ClojureMapcat])) 8 | 9 | (defn plus-one [in] 10 | [(+ in 1)]) 11 | 12 | (defn inc-wrapped [num] 13 | [(inc num)]) 14 | 15 | (defn inc-both [num1 num2] 16 | [(inc num1) (inc num2)]) 17 | 18 | (defn is-type 19 | "Accepts a class and returns a checker that tests whether or not its 20 | input is an instance of the supplied class." 21 | [^Class expected] 22 | (chatty-checker 23 | [actual] 24 | (instance? expected actual))) 25 | 26 | (facts "Fields tests." 27 | (let [f1 (fields "foo") 28 | f2 (fields ["foo" "bar"])] 29 | (facts "Single fields should resolve properly." 30 | f1 => (is-type Fields) 31 | (seq f1) => ["foo"]) 32 | 33 | (facts "Double fields should resolve properly." 34 | f2 => (is-type Fields) 35 | (seq f2) => ["foo" "bar"]))) 36 | 37 | (tabular 38 | (fact "Pipes without names use UUID." 39 | ?pipe => (is-type Pipe) 40 | (.getName ?pipe) => ?check) 41 | ?pipe ?check 42 | (pipe) #(= 36 (count %)) 43 | (pipe "name") "name") 44 | 45 | (fact "Clojure Filter test." 46 | (let [fil (ClojureFilter. odd?)] 47 | (invoke-filter fil [1]) => false 48 | (invoke-filter fil [2]) => true)) 49 | 50 | (tabular 51 | (fact "ClojureMap test, single field." 52 | (invoke-function ?clj-map [1]) => [[2]]) 53 | ?clj-map 54 | (ClojureMap. 
(fields "num") 55 | inc-wrapped) 56 | (ClojureMap. (fields "num") 57 | inc)) 58 | 59 | (facts "ClojureMap test, multiple fields." 60 | (let [m (ClojureMap. (fields ["num1" "num2"]) inc-both)] 61 | (invoke-function m [1 2]) => [[2 3]])) 62 | 63 | (defn iterate-inc-wrapped [num] 64 | (list [(+ num 1)] 65 | [(+ num 2)] 66 | [(+ num 3)])) 67 | 68 | (defn iterate-inc [num] 69 | (list (+ num 1) 70 | (+ num 2) 71 | (+ num 3))) 72 | 73 | (tabular 74 | (fact 75 | "ClojureMapCat test, single field. Wrapped vs non-wrapped should 76 | have the same result, when a single field is involved." 77 | (invoke-function ?clj-mapcat [1]) => [[2] [3] [4]]) 78 | ?clj-mapcat 79 | (ClojureMapcat. (fields "num") iterate-inc-wrapped) 80 | (ClojureMapcat. (fields "num") iterate-inc)) 81 | 82 | (defn sum 83 | ([] 0) 84 | ([mem v] (+ mem v)) 85 | ([mem] [mem])) 86 | 87 | (fact "ClojureAggregator test." 88 | (let [a (ClojureAggregator. (fields "sum") sum)] 89 | (invoke-aggregator a [[1] [2] [3]]) => [[6]])) 90 | -------------------------------------------------------------------------------- /cascalog-math/src/cascalog/math/contrib/types.clj: -------------------------------------------------------------------------------- 1 | ;; Data types 2 | 3 | ;; by Konrad Hinsen 4 | ;; last updated May 3, 2009 5 | 6 | ;; Copyright (c) Konrad Hinsen, 2009. All rights reserved. The use 7 | ;; and distribution terms for this software are covered by the Eclipse 8 | ;; Public License 1.0 (http://opensource.org/licenses/eclipse-1.0.php) 9 | ;; which can be found in the file epl-v10.html at the root of this 10 | ;; distribution. By using this software in any fashion, you are 11 | ;; agreeing to be bound by the terms of this license. You must not 12 | ;; remove this notice, or any other, from this software. 
13 | 14 | (ns 15 | ^{:author "Konrad Hinsen" 16 | :doc "General and algebraic data types"} 17 | cascalog.math.contrib.types 18 | (:refer-clojure :exclude (deftype)) 19 | (:use [cascalog.math.contrib.def :only (name-with-attributes)])) 20 | 21 | ; 22 | ; Utility functions 23 | ; 24 | (defn- qualified-symbol 25 | [s] 26 | (symbol (str *ns*) (str s))) 27 | 28 | ; 29 | ; Data type definition 30 | ; 31 | (defmulti deconstruct type) 32 | 33 | (defmacro deftype 34 | "Define a data type by a type tag (a namespace-qualified keyword) 35 | and a symbol naming the constructor function. Optionally, a 36 | constructor and a deconstructor function can be given as well, 37 | the defaults being clojure.core/identity and clojure.core/list. 38 | The full constructor associated with constructor-name calls the 39 | constructor function and attaches the type tag to its result 40 | as metadata. The deconstructor function must return the arguments 41 | to be passed to the constructor in order to create an equivalent 42 | object. It is used for printing and matching." 43 | {:arglists 44 | '([type-tag constructor-name docstring? attr-map?] 45 | [type-tag constructor-name docstring? attr-map? constructor] 46 | [type-tag constructor-name docstring? attr-map? constructor deconstructor])} 47 | [type-tag constructor-name & options] 48 | (let [[constructor-name options] (name-with-attributes 49 | constructor-name options) 50 | [constructor deconstructor] options 51 | constructor (if (nil? constructor) 52 | 'clojure.core/identity 53 | constructor) 54 | deconstructor (if (nil? 
deconstructor) 55 | 'clojure.core/list 56 | deconstructor)] 57 | `(do 58 | (derive ~type-tag ::type) 59 | (let [meta-map# {:type ~type-tag 60 | ::constructor 61 | (quote ~(qualified-symbol constructor-name))}] 62 | (def ~constructor-name 63 | (comp (fn [~'x] (with-meta ~'x meta-map#)) ~constructor)) 64 | (defmethod deconstruct ~type-tag [~'x] 65 | (~deconstructor (with-meta ~'x {}))))))) 66 | 67 | -------------------------------------------------------------------------------- /cascalog-core/src/clj/cascalog/logic/ops_impl.clj: -------------------------------------------------------------------------------- 1 | (ns cascalog.logic.ops-impl 2 | (:use cascalog.api) 3 | (:require [cascalog.logic.vars :as v])) 4 | 5 | (defn one [] 1) 6 | 7 | (defn identity-tuple [& tuple] tuple) 8 | 9 | (defn existence-int [v] (if v 1 0)) 10 | 11 | (defparallelagg sum-parallel 12 | :init-var #'identity 13 | :combine-var #'+) 14 | 15 | (defparallelagg min-parallel 16 | :init-var #'identity 17 | :combine-var #'min) 18 | 19 | (defparallelagg max-parallel 20 | :init-var #'identity 21 | :combine-var #'max) 22 | 23 | (defparallelagg !count-parallel 24 | :init-var #'existence-int 25 | :combine-var #'+) 26 | 27 | (defn limit-init [options _] 28 | (fn [sort-tuple & tuple] 29 | ;; this is b/c CombinerBase does coerceToSeq on everything and 30 | ;; applies when combining, since this returns a seq we need an 31 | ;; extra level of nesting should have a different combiner base 32 | ;; for buffer combiners 33 | [[[(vec sort-tuple) (vec tuple)]]])) 34 | 35 | (defn- mk-limit-comparator [options] 36 | (fn [[^Comparable o1 _] [^Comparable o2 _]] 37 | (if (:sort options) 38 | (* (.compareTo o1 o2) (if (boolean (:reverse options)) -1 1)) 39 | 0))) 40 | 41 | (defn limit-combine [options limit] 42 | (let [compare-fn (mk-limit-comparator options)] 43 | (fn [list1 list2] 44 | (let [res (concat list1 list2)] 45 | ;; see note in limit-init 46 | [(if (> (count res) (* 2 limit)) 47 | (take limit (sort compare-fn 
res)) 48 | res)])))) 49 | 50 | (defn limit-extract [options limit] 51 | (let [compare-fn (mk-limit-comparator options)] 52 | (fn [alist] 53 | (let [alist (if (<= (count alist) limit) 54 | alist 55 | (take limit (sort compare-fn alist)))] 56 | (map (partial apply concat) alist))))) 57 | 58 | (defn limit-buffer [_ limit] 59 | (fn [tuples] 60 | (take limit tuples))) 61 | 62 | (defn limit-rank-buffer [_ limit] 63 | (fn [tuples] 64 | (take limit (map (fn [x y] (conj (vec x) y)) 65 | tuples 66 | (iterate inc 1))))) 67 | 68 | (defaggregatefn distinct-count-agg 69 | ([] [nil 0]) 70 | ([[prev cnt] & tuple] 71 | [tuple (if (= tuple prev) cnt (inc cnt))]) 72 | ([state] [(second state)])) 73 | 74 | (defn bool-or [& vars] 75 | (boolean (some identity vars))) 76 | 77 | (defn bool-and [& vars] 78 | (every? identity vars)) 79 | 80 | (defn logical-comp [ops logic-fn-var] 81 | (let [outvars (v/gen-nullable-vars (clojure.core/count ops))] 82 | (construct 83 | [:<< "!invars" :> "!true?"] 84 | (conj 85 | (map (fn [o v] [o :<< "!invars" :> v]) ops outvars) 86 | [logic-fn-var :<< outvars :> "!true?"])))) 87 | -------------------------------------------------------------------------------- /cascalog-core/src/java/cascading/pipe/joiner/CascalogJoiner.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2010 Nathan Marz 3 | 4 | Project and contact information: http://www.cascalog.org/ 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | package cascading.pipe.joiner; 20 | 21 | import java.util.Iterator; 22 | import java.util.List; 23 | 24 | import cascading.tuple.Tuple; 25 | 26 | 27 | public class CascalogJoiner implements Joiner { 28 | public static enum JoinType { 29 | INNER, 30 | OUTER, 31 | EXISTS; 32 | } 33 | 34 | private final List joins; 35 | 36 | public CascalogJoiner(List joins) { 37 | this.joins = joins; 38 | } 39 | 40 | @Override public Iterator getIterator(JoinerClosure closure) { 41 | return new JoinIterator(closure); 42 | } 43 | 44 | public int numJoins() { 45 | return joins.size() - 1; 46 | } 47 | 48 | protected class JoinIterator extends OuterJoin.JoinIterator { 49 | public JoinIterator(JoinerClosure closure) { 50 | super(closure); 51 | } 52 | 53 | @Override protected boolean isOuter(int i) { 54 | return joins.get(i) != JoinType.INNER && super.isOuter(i); 55 | } 56 | 57 | @Override protected Iterator getIterator(int i) { 58 | if (joins.get(i) == JoinType.EXISTS) { 59 | final boolean isEmpty = closure.isEmpty(i); 60 | final Iterator wrapped = super.getIterator(i); 61 | return new Iterator() { 62 | private boolean emittedOne = false; 63 | 64 | public boolean hasNext() { 65 | return !emittedOne && wrapped.hasNext(); 66 | } 67 | 68 | public Object next() { 69 | if (emittedOne) { 70 | throw new RuntimeException("Shouldn't be accessing outerjoin_first more than once"); 71 | } 72 | emittedOne = true; 73 | Tuple t = (Tuple) wrapped.next(); 74 | Tuple ret = new Tuple(); 75 | for (int i = 0; i < t.size(); i++) { 76 | ret.add(!isEmpty); 77 | } 78 | return ret; 79 | } 80 | 81 | public void remove() { 82 | //not implemented 83 | } 84 | 85 | }; 86 | } else { 87 | return super.getIterator(i); 88 | } 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /cascalog-core/src/java/cascalog/ClojureBufferCombiner.java: 
-------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2010 Nathan Marz 3 | 4 | Project and contact information: http://www.cascalog.org/ 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | package cascalog; 20 | 21 | import java.util.Arrays; 22 | import java.util.List; 23 | 24 | import cascading.flow.FlowProcess; 25 | import cascading.operation.FunctionCall; 26 | import cascading.operation.OperationCall; 27 | import cascading.tuple.Fields; 28 | import cascading.tuple.Tuple; 29 | import cascading.tuple.TupleEntryCollector; 30 | import cascalog.aggregator.CombinerSpec; 31 | import clojure.lang.IFn; 32 | import clojure.lang.ISeq; 33 | import clojure.lang.RT; 34 | 35 | public class ClojureBufferCombiner extends ClojureCombinerBase { 36 | 37 | private CombinerSpec spec; 38 | 39 | public ClojureBufferCombiner(Fields groupFields, Fields sortFields, Fields args, Fields outFields, 40 | CombinerSpec spec) { 41 | super(groupFields, true, sortFields, Arrays.asList(args), outFields, Arrays 42 | .asList((ParallelAgg) new ClojureParallelAgg(spec)), "cascalog.combiner.buffer.size", 200); 43 | this.spec = spec; 44 | } 45 | 46 | private IFn extract_fn = null; 47 | 48 | @Override 49 | public void prepare(FlowProcess flowProcess, OperationCall operationCall) { 50 | super.prepare(flowProcess, operationCall); 51 | extract_fn = Util.deserializeFn(spec.presentFn); 52 | } 53 | 54 | @Override 55 | protected void 
write(Tuple group, List vals, OperationCall opCall) { 56 | TupleEntryCollector output = ((FunctionCall) opCall).getOutputCollector(); 57 | 58 | if (vals.size() != 1) { 59 | throw new RuntimeException( 60 | "Should only have one object in buffer combiner before extraction " + vals.size() + ":" 61 | + vals.toString()); 62 | } 63 | Object val = vals.get(0); 64 | try { 65 | ISeq result_seq = RT.seq(extract_fn.invoke(val)); 66 | while (result_seq != null) { 67 | Tuple t = Util.coerceToTuple(result_seq.first()); 68 | Tuple emit = new Tuple(group); 69 | emit.addAll(t); 70 | output.add(emit); 71 | result_seq = result_seq.next(); 72 | } 73 | } catch (Exception e) { 74 | throw new RuntimeException(e); 75 | } 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /cascalog-core/src/clj/cascalog/in_memory/tuple.clj: -------------------------------------------------------------------------------- 1 | (ns cascalog.in-memory.tuple 2 | "Tuples encapsulate the data that Cascalog queries. An individual 3 | tuple contains a field name and a value. It is represented as a map. 4 | So tuples are just a sequence of maps, and an example is: 5 | [{'?num' 1} {'?num' 2}]." 6 | (:refer-clojure :exclude [sort]) 7 | (:require [cascalog.logic.vars :as v] 8 | [jackknife.core :as u])) 9 | 10 | (defn to-tuple 11 | [names v] 12 | (if (= (count names) (count v)) 13 | (zipmap names v) 14 | (u/throw-illegal "Output variables arity and function output arity do not match"))) 15 | 16 | (defn to-tuples 17 | "turns [\"n\"] and [[1] [2]] into [{\"n\" 1} {\"n\" 2}]" 18 | [names coll-of-seqs] 19 | (map #(to-tuple names %) coll-of-seqs)) 20 | 21 | (defn valid? 22 | "Verifies that non-nullable vars aren't null." 23 | [tuple] 24 | (not-any? 25 | (fn [[k v]] 26 | (and (v/non-nullable-var? k) 27 | (nil? 
v))) 28 | tuple)) 29 | 30 | (defn to-tuples-filter-nullable 31 | "Turns [\"n\"] and [[1] [2]] into [{\"n\" 1} {\"n\" 2}], dropping any tuple in which a non-nullable var is nil." 32 | [names coll-of-seqs] 33 | (->> coll-of-seqs 34 | (map 35 | (fn [s] 36 | (let [tuple (to-tuple names s)] 37 | (if (valid? tuple) 38 | tuple)))) 39 | (remove nil?))) 40 | 41 | (defn empty-tuple 42 | "Creates a tuple with a nil value for all of the fields" 43 | [fields] 44 | (to-tuple fields (repeat (count fields) nil))) 45 | 46 | (defn select-values 47 | "Creates a list of the values of the tuples you want and if the field isn't 48 | found, its value is the name of the field. 49 | For example: (select-values [:b :a 100] {:a 1 :b 2 :c 3}) => (2 1 100)" 50 | [fields tuple] 51 | (map #(get tuple % %) fields)) 52 | 53 | (defn map-select-values 54 | "Creates a collection of vectors for the values of the fields 55 | you have selected" 56 | [fields tuples] 57 | (map #(select-values fields %) tuples)) 58 | 59 | (defn sort 60 | [tuples sort-fields reverse?] 61 | (if sort-fields 62 | (let [sorted (sort-by #(vec (select-values sort-fields %)) tuples)] 63 | (if reverse? 64 | (reverse sorted) 65 | sorted)) 66 | tuples)) 67 | 68 | (defn cross-join 69 | "Input a collection of collections of tuples like [[{:b 2}] [{:a 1} {:a 3}]] 70 | and you'll get a result like: [{:a 1 :b 2} {:a 3 :b 2}]" 71 | [coll-of-tuples] 72 | (loop [[s1 s2 & s-rest] coll-of-tuples] 73 | (if (or (empty? s1) (empty? s2)) 74 | (concat s1 s2) 75 | (let [s-merge (for [x s1 y s2] (merge x y))] 76 | (if (empty?
s-rest) 77 | s-merge 78 | (recur (cons s-merge s-rest))))))) 79 | 80 | (defn project 81 | ([tuples fields] (project tuples fields fields)) 82 | ([tuples input-fields output-fields] 83 | (map 84 | #(->> % 85 | (select-values input-fields) 86 | (to-tuple output-fields)) 87 | tuples))) 88 | -------------------------------------------------------------------------------- /midje-cascalog/src/midje/cascalog/impl.clj: -------------------------------------------------------------------------------- 1 | (ns midje.cascalog.impl 2 | (:use midje.sweet 3 | [clojure.set :only (difference)] 4 | [cascalog.api :only (with-job-conf <- ??-)]) 5 | (:require cascalog.cascading.types 6 | [cascalog.cascading.io :as io] 7 | [cascalog.cascading.flow :as flow] 8 | [midje.checking.core :as checking]) 9 | (:import [cascalog.cascading.types ClojureFlow])) 10 | 11 | (defn- multifn? [x] 12 | (instance? clojure.lang.MultiFn x)) 13 | 14 | (def ^{:private true} mocking-forms 15 | #{'against-background 'provided}) 16 | 17 | (defn- mocking-form? 18 | "Returns true if the supplied form (or sequence) is a midje 19 | `provided` or `against-background` clause, false otherwise." 20 | [x] 21 | (when (coll? x) 22 | (contains? mocking-forms (first x)))) 23 | 24 | (defn- extract-mockers 25 | "Returns a vector of two sequences, obtained by splitting the 26 | supplied `coll` into midje forms and rest." 27 | [coll] 28 | ((juxt filter remove) mocking-form? coll)) 29 | 30 | (def ^{:private true} default-log-level :fatal) 31 | 32 | (defn pop-log-level 33 | "Accepts a sequence with an optional log level as its first argument 34 | and returns a 2-vector with the log level (or nil if it wasn't 35 | present) and the non-log-level elements of the sequence." 
(def ^{:doc "Accepts a sequence of arguments to a
  collection-checker-generator and returns a vector containing two
  sequences:

  [<fn-arguments> <keyword-arguments>]

  fn-arguments are non-keywords meant to pass through unmolested into
  the checker. keyword arguments are optionally parsed by the wrapping
  checker."}
  split-forms
  ;; split-with cuts at the first keyword: everything before it is the
  ;; fn-arguments sequence; the first keyword and everything after it
  ;; (keyword or not) is the keyword-arguments sequence.
  (partial split-with (complement keyword?)))
(defmethod combine ::mean-variance
  ;; A single accumulator is already combined.
  ([mv]
     mv)

  ;; Pairwise merge: the combined mean is the count-weighted mean of the
  ;; two inputs; the combined sample variance pools both variances plus
  ;; the spread of each input mean around the combined mean.
  ([mv1 mv2]
     (let [{n1 :n mean1 :mean var1 :variance} mv1
           {n2 :n mean2 :mean var2 :variance} mv2
           n (+ n1 n2)]
       (if (zero? n)
         ;; Both accumulators are empty. The weighted mean below would
         ;; divide by zero, so the result is simply an empty accumulator.
         mv1
         (let [mean (/ (+ (* n1 mean1) (* n2 mean2)) n)
               sq #(* % %)
               c (+ (* n1 (sq (- mean mean1))) (* n2 (sq (- mean mean2))))
               ;; Sample variance is undefined for fewer than two items;
               ;; the accumulator reports 0 in that case.
               var (if (< n 2)
                     0
                     (/ (+ c (* (dec n1) var1) (* (dec n2) var2)) (dec n)))]
           (mean-variance {:n n :mean mean :variance var})))))

  ;; Variadic form: fold the remaining accumulators in from the left.
  ([mv1 mv2 & mvs]
     (reduce combine (combine mv1 mv2) mvs)))
(defn leftmost-descendant
  "Starting at the zipper loc, steps down to the first child for as long as
  the current loc is a branch with at least one child, then returns the
  loc it stopped on (the loc itself when it cannot descend at all)."
  [loc]
  (loop [current loc]
    ;; zip/down yields nil for a branch without children, which ends the walk.
    (if-let [child (when (zip/branch? current)
                     (zip/down current))]
      (recur child)
      current)))
loc) 52 | (zip/root loc) 53 | (if-let [res (visited (encoder (zip/node loc)))] 54 | (recur visited (my-next (zip/replace loc res))) 55 | (if-let [matcher-result (matcher (zip/node loc))] 56 | (let [res (editor matcher-result (zip/node loc))] 57 | (recur (assoc visited (encoder (zip/node loc)) res) 58 | (my-next (zip/replace loc res)))) 59 | (recur visited (my-next loc))))))) 60 | 61 | (comment 62 | "Example of how zippers can be used to walk a map:" 63 | (extend-protocol TreeNode 64 | clojure.lang.IPersistentMap 65 | (branch? [node] true) 66 | (children [node] 67 | (collectify (:children node))) 68 | (make-node [node children] 69 | (with-meta children (meta node)))) 70 | 71 | (let [a {:children [1 2 3]} 72 | b {:children [a {:children [4 5]}]} 73 | c {:children [a {:children [8 9]}]}] 74 | (postwalk-edit (cascalog-zip {:children [b c]}) 75 | identity 76 | (fn [x _] (do (println x) (if (number? x) (inc x) x))))) 77 | 78 | {:children [{:children [1 2 3]} 79 | {:children [4 5]}]} 80 | (-> (cascalog-zip {:children [{:children [1 2 3]} 81 | {:children [4 5]}]}) 82 | zip/down 83 | zip/up 84 | zip/down 85 | zip/right 86 | zip/node)) 87 | -------------------------------------------------------------------------------- /cascalog-core/src/java/cascalog/aggregator/ClojureMonoidAggregator.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2010 Nathan Marz 3 | 4 | Project and contact information: http://www.cascalog.org/ 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | package cascalog.aggregator; 20 | 21 | import cascading.flow.FlowProcess; 22 | import cascading.operation.Aggregator; 23 | import cascading.operation.AggregatorCall; 24 | import cascading.operation.BaseOperation; 25 | import cascading.operation.OperationCall; 26 | import cascading.tuple.Fields; 27 | import cascading.tuple.Tuple; 28 | import cascalog.Util; 29 | import clojure.lang.IFn; 30 | import clojure.lang.ISeq; 31 | import clojure.lang.RT; 32 | 33 | public class ClojureMonoidAggregator extends BaseOperation implements Aggregator { 34 | private final CombinerSpec combinerSpec; 35 | private transient IFn prepareFn; 36 | private transient IFn combineFn; 37 | private transient IFn presentFn; 38 | 39 | public ClojureMonoidAggregator(Fields fields, CombinerSpec combinerSpec) { 40 | super(fields); 41 | this.combinerSpec = combinerSpec; 42 | } 43 | 44 | @Override 45 | public void prepare(FlowProcess flowProcess, OperationCall operationCall) { 46 | prepareFn = combinerSpec.getPrepareFn(); 47 | combineFn = combinerSpec.getCombineFn(); 48 | presentFn = combinerSpec.getPresentFn(); 49 | } 50 | 51 | public void start(FlowProcess fp, AggregatorCall call) { 52 | call.setContext(null); 53 | } 54 | 55 | public void aggregate(FlowProcess fp, AggregatorCall call) { 56 | ISeq fnArgs = RT.seq(Util.tupleToList(call.getArguments())); 57 | if (null != prepareFn) { 58 | fnArgs = RT.seq(Util.coerceToList(prepareFn.applyTo(fnArgs))); 59 | } 60 | Tuple context = call.getContext(); 61 | 62 | if (null == context) { 63 | Tuple newContext = Tuple.size(1); 64 | newContext.set(0, fnArgs); 65 | call.setContext(newContext); 66 | } else { 67 | ISeq acc = (ISeq) context.getObject(0); 68 | ISeq ret = RT.seq(Util.coerceToList(combineFn.applyTo(Util.cat(acc, fnArgs)))); 69 | context.set(0, ret); 70 | } 71 | } 72 | 73 | public void complete(FlowProcess flowProcess, 
AggregatorCall call) { 74 | Tuple context = call.getContext(); 75 | 76 | if (null == context) { 77 | throw new RuntimeException("ClojureMonoidAggregator completed with any aggregate calls"); 78 | } else { 79 | ISeq finalValue = (ISeq) context.getObject(0); 80 | call.setContext(null); 81 | if (null != presentFn) { 82 | call.getOutputCollector().add( 83 | Util.coerceToTuple(presentFn.applyTo(finalValue))); 84 | } else { 85 | call.getOutputCollector().add( 86 | Util.coerceToTuple(finalValue)); 87 | } 88 | } 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /cascalog-core/src/java/cascalog/ClojureCascadingBase.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2010 Nathan Marz 3 | 4 | Project and contact information: http://www.cascalog.org/ 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | */ 18 | 19 | package cascalog; 20 | 21 | import java.util.Map; 22 | 23 | import cascading.flow.FlowProcess; 24 | import cascading.operation.BaseOperation; 25 | import cascading.operation.OperationCall; 26 | import cascading.tuple.Fields; 27 | import clojure.lang.Associative; 28 | import clojure.lang.IFn; 29 | import clojure.lang.ISeq; 30 | import clojure.lang.Keyword; 31 | import clojure.lang.PersistentHashMap; 32 | import clojure.lang.Var; 33 | 34 | public class ClojureCascadingBase extends BaseOperation { 35 | private byte[] serializedFn; 36 | protected IFn fn; 37 | protected IFn cleanupFn; 38 | protected Associative bindingMap; 39 | 40 | public void initialize(IFn fn) { 41 | serializedFn = Util.serializeFn(fn); 42 | } 43 | 44 | public ClojureCascadingBase(IFn fn) { 45 | initialize(fn); 46 | } 47 | 48 | public ClojureCascadingBase(Fields fields, IFn fn) { 49 | super(fields); 50 | initialize(fn); 51 | } 52 | 53 | @Override 54 | public void prepare(FlowProcess fp, OperationCall call) { 55 | this.bindingMap = PersistentHashMap 56 | .create(Util.getVar("cascalog.cascading.stats", "*flow-process*"), fp, 57 | Util.getVar("cascalog.cascading.stats", "*op-call*"), call); 58 | 59 | IFn fn = Util.deserializeFn(serializedFn); 60 | 61 | Boolean isPrepared = 62 | (Boolean) Util.bootSimpleFn("cascalog.cascading.def", "prepared?").invoke(fn); 63 | 64 | Var.pushThreadBindings(bindingMap); 65 | try { 66 | if (isPrepared) { 67 | Object res = fn.invoke(fp, call); 68 | 69 | if(res instanceof Map) { 70 | Map resmap = (Map) res; 71 | this.fn = (IFn) resmap.get(Keyword.intern("operate")); 72 | this.cleanupFn = (IFn) resmap.get(Keyword.intern("cleanup")); 73 | } else { 74 | this.fn = (IFn) res; 75 | this.cleanupFn = null; 76 | } 77 | } else { 78 | this.fn = fn; 79 | this.cleanupFn = null; 80 | } 81 | } finally { 82 | Var.popThreadBindings(); 83 | } 84 | } 85 | 86 | protected Object applyFunction(ISeq seq) { 87 | return this.fn.applyTo(seq); 88 | } 89 | 90 | protected Object 
invokeFunction(Object arg) { 91 | return this.fn.invoke(arg); 92 | } 93 | 94 | protected Object invokeFunction() { 95 | return this.fn.invoke(); 96 | } 97 | 98 | 99 | @Override 100 | public void cleanup(FlowProcess flowProcess, OperationCall call) { 101 | super.cleanup(flowProcess, call); 102 | 103 | if(cleanupFn != null) { 104 | Var.pushThreadBindings(bindingMap); 105 | try { 106 | cleanupFn.invoke(); 107 | } finally { 108 | Var.popThreadBindings(); 109 | } 110 | } 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /cascalog-core/test/cascalog/jcascalog_test.clj: -------------------------------------------------------------------------------- 1 | (ns cascalog.jcascalog-test 2 | (:use clojure.test 3 | cascalog.api 4 | cascalog.logic.testing 5 | cascalog.cascading.testing 6 | cascalog.in-memory.testing) 7 | (:import [jcascalog Api Option Predicate PredicateMacroTemplate 8 | PredicateMacro Subquery Api$FirstNArgs] 9 | [jcascalog.op Count Div Sum Multiply Equals])) 10 | 11 | (use-fixtures :once 12 | (fn [f] 13 | (Api/setCascadingPlatform) 14 | (f) 15 | (Api/setInMemoryPlatform) 16 | (f))) 17 | 18 | (deftest test-vanilla 19 | (let [value [["a" 1] ["a" 2] ["b" 10] 20 | ["c" 3] ["b" 2] ["a" 6]]] 21 | (test?- [["a" 18] ["b" 24] ["c" 6]] 22 | (-> (Subquery. ["?letter" "?doublesum"]) 23 | (.predicate value ["?letter" "?v"]) 24 | (.predicate (Multiply.) ["?v" 2]) (.out ["?double"]) 25 | (.predicate (Sum.) ["?double"]) (.out ["?doublesum"]))) 26 | 27 | (test?- [["a"] ["a"] ["a"]] 28 | (-> (Subquery. ["?letter"]) 29 | (.predicate value ["?letter" "_"]) 30 | (.predicate (Equals.) ["?letter" "a"]))) 31 | 32 | (test?- [["a"]] 33 | (-> (Subquery. 
["?letter"]) 34 | (.predicate value ["?letter" "_"]) 35 | (.predicate #'= ["?letter" "a"]) 36 | (.predicate Option/DISTINCT [true]))))) 37 | 38 | 39 | (def my-avg 40 | (reify PredicateMacro 41 | (getPredicates [this [val] [avg]] 42 | (let [count-var (Api/genNullableVar) 43 | sum-var (Api/genNullableVar)] 44 | [(Predicate. (Count.) [count-var]) 45 | (Predicate. (Sum.) [val] [sum-var]) 46 | (Predicate. (Div.) [sum-var count-var] [avg])])))) 47 | 48 | (deftest test-java-predicate-macro 49 | (let [nums [[1] [2] [3] [4] [5]]] 50 | (test?- [[3]] 51 | (-> (Subquery. ["?avg"]) 52 | (.predicate nums ["?v"]) 53 | (.predicate my-avg ["?v"]) (.out ["?avg"]))))) 54 | 55 | (def my-avg-template 56 | (-> (PredicateMacroTemplate/build ["?v"]) (.out ["?avg"]) 57 | (.predicate (Count.) ["?count"]) 58 | (.predicate (Sum.) ["?v"]) (.out ["?sum"]) 59 | (.predicate (Div.) ["?sum" "?count"]) (.out ["?avg"]))) 60 | 61 | (deftest test-java-predicate-macro-template 62 | (let [nums [[1] [2] [3] [4] [5]]] 63 | (test?- [[3]] 64 | (-> (Subquery. ["?avg"]) 65 | ;; use ?sum name here to try to confuse it (test that 66 | ;; it renames intermediate vars) 67 | (.predicate nums ["?sum"]) 68 | (.predicate my-avg-template ["?sum"]) (.out ["?avg"]))))) 69 | 70 | (deftest test-first-n 71 | (let [data [["a" 1] ["a" 1] ["b" 1] ["c" 1] ["c" 1] ["a" 1] 72 | ["d" 1]] 73 | sq (-> (Subquery. ["?l" "?count"]) 74 | (.predicate data ["?l" "_"]) 75 | (.predicate (Count.) ["?count"])) 76 | firstn (Api/firstN sq 2 77 | (-> (Api$FirstNArgs.) 78 | (.sort "?count") 79 | (.reverse true)))] 80 | (test?- [["c"]] 81 | (-> (Subquery. ["?l"]) 82 | (.predicate firstn ["?l" 2]))))) 83 | 84 | (deftest test-java-each 85 | (let [data [[1 2 3] [4 5 6]]] 86 | (test?- [[5 7 9]] 87 | (-> (Subquery. 
(defn wrap-checker
  "Accepts up to three arguments:

  `checker-fn`: a midje collection checker (just or contains, for
  example). This checker-fn will be primed with the expected set of
  tuples for the query being tested.

  `opt-map` (optional): map of some set of allowed keyword arguments
  for the checker we're generating to the corresponding arguments to
  the supplied `checker-fn`. Every key must be a set (enforced by the
  precondition).

  `valid-set` (optional): set of options that should be allowed to
  pass on through to the wrapper checker.

  Returns a function that accepts a sequence of result tuples and
  optional arguments and returns a chatty checker tuned for said
  arguments. The returned function carries `{:cascalog-checker true}`
  as metadata. See `produces` and `produces-some` for usage examples."
  ([checker-fn]
     (wrap-checker checker-fn {}))
  ([checker-fn opt-map]
     (wrap-checker checker-fn opt-map #{}))
  ([checker-fn opt-map valid-set]
     {:pre [(every? set? (keys opt-map))]}
     ;; Permitted options: valid-set plus every keyword mentioned in any
     ;; key of opt-map.
     (let [valid-opts (apply union valid-set (keys opt-map))]
       (-> (fn [& forms]
             (let [;; non-keyword leading args vs. everything from the
                   ;; first keyword onwards
                   [forms opts] (split-forms forms)
                   ;; pull an optional log-level keyword out of the options
                   [ll opts]    (pop-log-level opts)
                   opt-set      (mk-opt-set opts)
                   ;; checker-fn arguments for exactly this option set;
                   ;; an option set that is not a key of opt-map adds none
                   options      (opt-map opt-set [])
                   check-fn     (apply checker-fn (concat forms options))]
               ;; Note: validation runs after check-fn has been built.
               (assert (valid-options? valid-opts opt-set))
               ;; The checker runs the query and checks the resulting tuples.
               (chatty-checker
                [query]
                (check-fn (execute query :log-level ll)))))
           (with-meta {:cascalog-checker true})))))
(defn mk-test-sink
  "Builds a test tap at `path` for `spec` via `mk-test-tap`. A map spec
  contributes its :fields entry as the tap's field definition; any other
  spec (for example a bare vector of tuples) uses Fields/ALL."
  [spec path]
  (let [fields (if (map? spec)
                 (:fields spec)
                 Fields/ALL)]
    (mk-test-tap fields path)))
(defn with-expected-sinks-helper
  "Builds the expansion shared by `with-expected-sinks` and
  `with-expected-sink-sets`.

  `checker` is spliced into the expansion and called as
  (checker tap spec) for every binding; `bindings` is a flat sequence of
  name/spec pairs; `body` is the sequence of forms to run with the names
  bound to test sinks.

  The returned form allocates one temp file per name, binds each name to
  a sink created from its spec at that file, evaluates `body`, and then
  runs the checker over every name/spec pair."
  [checker bindings body]
  (let [;; NOTE(review): assumes unweave separates the alternating
        ;; name/spec elements into two sequences -- confirm in jackknife.seq
        [names specs] (map vec (unweave bindings))
        ;; gensym'd symbols plus the binding forms that create their files
        [tmpfiles tmpforms] (mk-tmpfiles+forms (count names))
        ;; flat let-bindings: name -> (mk-test-sink spec tmpfile)
        tmptaps (mapcat (fn [n t s]
                          [n `(cascalog.cascading.testing/mk-test-sink ~s ~t)])
                        names tmpfiles specs)]
    `(cascalog.cascading.io/with-tmp-files ~tmpforms
       (let [~@tmptaps]
         ~@body
         ;; dorun forces the lazy map so every check actually executes
         (dorun (map ~checker ~names ~specs))))))
/**
 * Example JCascalog queries over the datasets exposed by {@link Playground}.
 * Each method builds a {@link Subquery} and runs it with {@code Api.execute};
 * all but the last sink into a new {@code StdoutTap}.
 */
public class Examples {
  /** Selects ?person where Playground.AGE pairs the person with the constant 25. */
  public static void twentyFiveYearOlds() {
    Api.execute(new StdoutTap(), new Subquery("?person").predicate(Playground.AGE, "?person", 25));
  }

  /** Selects ?person from Playground.AGE, filtered with LT on ?age and 30. */
  public static void lessThanThirtyYearsOld() {
    Api.execute(new StdoutTap(), new Subquery("?person")
        .predicate(Playground.AGE, "?person", "?age").predicate(new LT(), "?age", 30));
  }

  /** Same filter as {@link #lessThanThirtyYearsOld()}, but also outputs ?age. */
  public static void lessThanThirtyYearsOldWithAge() {
    Api.execute(new StdoutTap(), new Subquery("?person", "?age")
        .predicate(Playground.AGE, "?person", "?age").predicate(new LT(), "?age", 30));
  }

  /** Outputs ?person with ?double-age, the output of Multiply applied to ?age and 2. */
  public static void doubleAges() {
    Api.execute(new StdoutTap(), new Subquery("?person", "?double-age")
        .predicate(Playground.AGE, "?person", "?age").predicate(new Multiply(), "?age", 2)
        .out("?double-age"));
  }

  /** Selects the first field of Playground.FOLLOWS with Option.DISTINCT set to true. */
  public static void distinctPeopleFromFollows() {
    Api.execute(new StdoutTap(), new Subquery("?person")
        .predicate(Playground.FOLLOWS, "?person", "_").predicate(Option.DISTINCT, true));
  }

  /** Same as {@link #distinctPeopleFromFollows()} without the DISTINCT option. */
  public static void nonDistinctPeopleFromFollows() {
    Api.execute(new StdoutTap(), new Subquery("?person")
        .predicate(Playground.FOLLOWS, "?person", "_"));
  }

  /**
   * Selects ?person where Playground.FOLLOWS pairs "emily" with the person
   * and Playground.GENDER pairs the person with "m".
   */
  public static void malePeopleEmilyFollows() {
    Api.execute(new StdoutTap(), new Subquery("?person")
        .predicate(Playground.FOLLOWS, "emily", "?person")
        .predicate(Playground.GENDER, "?person", "m"));
  }

  /**
   * Builds a subquery of people whose Count over Playground.FOLLOWS passes
   * GT with 2, then selects FOLLOWS pairs in which both ?person1 and
   * ?person2 come from that subquery.
   */
  public static void followsManyFollows() {
    Subquery manyFollows = new Subquery("?person").predicate(Playground.FOLLOWS, "?person", "_")
        .predicate(new Count(), "?count").predicate(new GT(), "?count", 2);
    Api.execute(new StdoutTap(), new Subquery("?person1", "?person2")
        .predicate(manyFollows, "?person1").predicate(manyFollows, "?person2")
        .predicate(Playground.FOLLOWS, "?person1", "?person2"));
  }

  /**
   * Variant of {@link #followsManyFollows()} that applies the subquery to
   * both variables through a single {@code Api.each} predicate.
   */
  public static void followsManyFollowsConcise() {
    // this implementation uses Api.each to shorten the implementation
    Subquery manyFollows = new Subquery("?person").predicate(Playground.FOLLOWS, "?person", "_")
        .predicate(new Count(), "?count").predicate(new GT(), "?count", 2);
    Api.execute(new StdoutTap(), new Subquery("?person1", "?person2")
        .predicate(Api.each(manyFollows), "?person1", "?person2")
        .predicate(Playground.FOLLOWS, "?person1", "?person2"));
  }

  /**
   * Passes each ?sentence of Playground.SENTENCE through {@link Split},
   * outputs the result as ?word, and sets Option.DISTINCT to true.
   */
  public static void sentenceUniqueWords() {
    Api.execute(new StdoutTap(), new Subquery("?word").predicate(Playground.SENTENCE, "?sentence")
        .predicate(new Split(), "?sentence").out("?word").predicate(Option.DISTINCT, true));
  }

  /** Splits sentences into ?word as above and adds a Count predicate bound to ?count. */
  public static void wordCount() {
    Api.execute(new StdoutTap(), new Subquery("?word", "?count")
        .predicate(Playground.SENTENCE, "?sentence").predicate(new Split(), "?sentence")
        .out("?word").predicate(new Count(), "?count"));
  }

  /**
   * Applies Count to the hfsTextline source at "src/jvm/jcascalog/example"
   * and sinks ?count into the hfsTextline tap at "/tmp/myresults".
   * NOTE(review): the source path is relative and uses "src/jvm" -- confirm
   * it still matches the project layout.
   */
  public static void lineCountWithFiles() {
    Api.execute(Api.hfsTextline("/tmp/myresults"), new Subquery("?count")
        .predicate(Api.hfsTextline("src/jvm/jcascalog/example"), "_")
        .predicate(new Count(), "?count"));
  }
}
[Cascalog](https://github.com/nathanmarz/cascalog) queries! Scroll down for an in-depth example. 4 | 5 | [Cascalog Testing 2.0](http://sritchie.github.com/2012/01/22/cascalog-testing-20/) gives a long discussion on various midje-cascalog idioms. 6 | 7 | ## Usage Instructions 8 | 9 | To use midje-cascalog in your own project, add the following two entries to `:dev-dependencies` inside of your `project.clj` file: 10 | 11 | [lein-midje "3.0.1"] 12 | [cascalog/midje-cascalog "3.0.0"] 13 | 14 | Midje-Cascalog supports Clojure 1.3+ and Cascalog 1.8+. Add `(:use [midje sweet cascalog])` to your testing namespace to get started. 15 | 16 | When you're all finished writing tests, `lein midje` at the command line will run all Midje tests and generate a summary. 17 | 18 | ## Example Query Test 19 | 20 | Let's say you want to test a Cascalog workflow that examines your user datastore and returns the user with the greatest number of followers. Your workflow's top level query will generate a single tuple containing that user's name and follower-count. Here's the code: 21 | 22 | (defn max-followers-query [datastore-path] 23 | (let [src (name-vars (complex-subquery datastore-path) 24 | ["?user" "?follower-count"])] 25 | (cascalog.ops/first-n src 1 :sort ["?follower-count"] :reverse true))) 26 | 27 | `max-followers-query` is a function that returns a Cascalog subquery. It works like this: 28 | 29 | * The function accepts a path, (`datastore-path`) and passes it into a function called `complex-subquery`. 30 | * `complex-subquery` returns a subquery that generates 2-tuples; this subquery is passed into `name-vars`. 31 | * `name-vars` binds this subquery to `src` after naming its output variables `?user` and `?follower-count`. 32 | * `first-n` returns a subquery that 33 | * sorts tuples from `src` in reverse order by follower count, and 34 | * returns a single 2-tuple with the name and follower-count of our most popular user. 
35 | 36 | At a high level, the subquery returned by `max-followers-query` is responsible for a single piece of application logic: 37 | 38 | * extracting the tuple with max `?follower-count` from the tuples returned by `(complex-subquery datastore-path)`. 39 | 40 | A correct test of `max-followers-query` will test this piece of logic in isolation. 41 | 42 | ```clj 43 | (fact "Query should return a single tuple containing 44 | [most-popular-user, follower-count]." 45 | (max-followers-query :path) => (produces [["richhickey" 2961]]) 46 | (provided 47 | (complex-subquery :path) => [["sritchie09" 180] 48 | ["richhickey" 2961]])) 49 | ``` 50 | 51 | Midje circumvents all extra complexity by mocking out the result of `(complex-subquery datastore-path)` and forcing it to return a specific Clojure sequence of `[?user ?follower-count]` tuples. 52 | 53 | `produces` checks the results of queries. The fact passes if these statements are true and fails otherwise. The above fact states that 54 | 55 | * when `max-followers-query` is called with the argument `:path`, 56 | * it will produce `[["richhickey" 2961]]`, 57 | * provided `(complex-subquery :path)` produces `[["sritchie09" 180] ["richhickey" 2961]]`. 58 | 59 | Fact-based testing separates application logic from the way data is stored. By mocking out `complex-subquery`, our fact tests `max-followers-query` in isolation and proves it correct for all expected inputs. 60 | 61 | This approach is not just better than the "state of the art" of MapReduce testing, [as defined by Cloudera](http://www.cloudera.com/blog/2009/07/debugging-mapreduce-programs-with-mrunit/); it completely obliterates the old way of thinking, and makes it possible to build very complex workflows with a minimum of uncertainty. 62 | 63 | Fact-based tests are the building blocks of rock-solid production workflows. 
64 | -------------------------------------------------------------------------------- /cascalog-core/src/clj/cascalog/cascading/types.clj: -------------------------------------------------------------------------------- 1 | (ns cascalog.cascading.types 2 | (:require [jackknife.core :as u] 3 | [cascalog.logic.algebra :refer (plus Semigroup)] 4 | [cascalog.logic.platform :refer (generator compile-query)] 5 | [cascalog.logic.parse :refer (build-rule)] 6 | [cascalog.cascading.tap :as tap]) 7 | (:import [cascalog Util] 8 | [cascalog.cascading.tap CascalogTap] 9 | [cascading.pipe Pipe Merge] 10 | [cascading.tap Tap] 11 | [cascading.tuple Fields Tuple] 12 | [cascalog.logic.parse TailStruct] 13 | [cascalog.logic.predicate RawSubquery] 14 | [cascading.flow.hadoop HadoopFlow] 15 | [com.twitter.maple.tap MemorySourceTap] 16 | [jcascalog Subquery])) 17 | 18 | ;; ## Platform 19 | 20 | (defrecord CascadingPlatform []) 21 | 22 | ;; ## Tuple Conversion 23 | ;; 24 | ;; TODO: We should probably have our tuple converter typeclass follow 25 | ;; the same pattern as scalding. Go to the tuple, come back from the 26 | ;; tuple. Accomplish this with a from-tuple method. 27 | 28 | (defprotocol ITuple 29 | (to-tuple [this] 30 | "Returns a tupled representation of the supplied thing.")) 31 | 32 | (extend-protocol ITuple 33 | Tuple 34 | (to-tuple [t] t) 35 | 36 | clojure.lang.IPersistentVector 37 | (to-tuple [v] (Util/coerceToTuple v)) ;; TODO: do this in clojure. 38 | 39 | Object 40 | (to-tuple [v] (to-tuple [v]))) 41 | 42 | ;; ## Generators 43 | 44 | ;; Note that we need to use getIdentifier on the taps. 45 | 46 | ;; source-map is a map of identifier to tap, or source. Pipe is the 47 | ;; current pipe that the user needs to operate on. 
48 | 49 | (defrecord ClojureFlow [source-map sink-map trap-map tails pipe name]) 50 | 51 | (defmethod generator [CascadingPlatform ClojureFlow] 52 | [x] x) 53 | 54 | (defmethod generator [CascadingPlatform Subquery] 55 | [sq] 56 | (generator (.getCompiledSubquery sq))) 57 | 58 | (defmethod generator [CascadingPlatform CascalogTap] 59 | [tap] 60 | (generator (:source tap))) 61 | 62 | (defmethod generator [CascadingPlatform clojure.lang.IPersistentVector] 63 | [v] 64 | (generator (or (seq v) ()))) 65 | 66 | (defmethod generator [CascadingPlatform clojure.lang.ISeq] 67 | [v] 68 | (generator 69 | (MemorySourceTap. (map to-tuple v) Fields/ALL))) 70 | 71 | (defmethod generator [CascadingPlatform java.util.ArrayList] 72 | [coll] 73 | (generator (into [] coll))) 74 | 75 | (defmethod generator [CascadingPlatform Tap] 76 | [tap] 77 | (let [id (u/uuid)] 78 | (ClojureFlow. {id tap} nil nil nil (Pipe. id) nil))) 79 | 80 | (defmethod generator [CascadingPlatform TailStruct] 81 | [sq] 82 | (compile-query sq)) 83 | 84 | (defmethod generator [CascadingPlatform RawSubquery] 85 | [sq] 86 | (generator (build-rule sq))) 87 | 88 | ;; ## Sink Typeclasses 89 | 90 | (defprotocol ISink 91 | (to-sink [this] 92 | "Returns a Cascading tap into which Cascalog can sink the supplied 93 | data.")) 94 | 95 | ;; => Tap, Tap => T 96 | 97 | (extend-protocol ISink 98 | Tap 99 | (to-sink [tap] tap) 100 | 101 | CascalogTap 102 | (to-sink [tap] (to-sink (:sink tap)))) 103 | 104 | (defn array-of [t] 105 | (.getClass 106 | (java.lang.reflect.Array/newInstance t 0))) 107 | 108 | (extend-protocol Semigroup 109 | (array-of Pipe) 110 | (plus [l r] 111 | (into-array Pipe (concat l r))) 112 | 113 | Pipe 114 | (plus [l r] 115 | (Merge. (into-array Pipe [(Pipe. (u/uuid) l) 116 | (Pipe. (u/uuid) r)]))) 117 | 118 | ClojureFlow 119 | (plus [l r] 120 | (letfn [(merge-k [k] (merge (k l) (k r))) 121 | (plus-k [k] (plus (k l) (k r)))] 122 | (ClojureFlow. 
(merge-k :source-map) 123 | (plus-k :sink-map) 124 | (merge-k :trap-map) 125 | (plus-k (comp vec :tails)) 126 | (plus-k :pipe) 127 | (:name l))))) 128 | -------------------------------------------------------------------------------- /cascalog-core/src/clj/cascalog/in_memory/join.clj: -------------------------------------------------------------------------------- 1 | (ns cascalog.in-memory.join 2 | (:require [cascalog.in-memory.tuple :refer (to-tuple empty-tuple)])) 3 | 4 | (defn inner-join 5 | "Inner joins two maps that both have been grouped by the same function. 6 | This is an inner join, so nils are discarded." 7 | [l-grouped r-grouped] 8 | (->> l-grouped 9 | (map 10 | (fn [l-group] 11 | (let [[k l-tuples] l-group] 12 | (if-let [r-tuples (get r-grouped k)] 13 | (for [x l-tuples y r-tuples] 14 | (merge x y)))))) 15 | (remove nil?) 16 | flatten)) 17 | 18 | (defn left-join 19 | "Joins two maps (a left and a right) that have been grouped by 20 | the same function. Keeps only values found on the left and 21 | returns nil for values not found on the right." 22 | [l-grouped r-grouped r-fields] 23 | (->> l-grouped 24 | (map 25 | (fn [l-group] 26 | (let [[k l-tuples] l-group 27 | r-empty-tuples [(empty-tuple r-fields)] 28 | r-tuples (get r-grouped k r-empty-tuples)] 29 | (for [x l-tuples y r-tuples] 30 | ;; merge is specifically ordered, because the left 31 | ;; tuple takes precedence over the right one (which 32 | ;; could be nil) 33 | (merge y x))))) 34 | (remove nil?) 35 | flatten)) 36 | 37 | (defn left-existence-join 38 | "Similar to a left-join except it includes an additional argument, 39 | existence-field, that captures the boolean about whether a join was 40 | found or not. True if a left value was found. False if not." 
41 | [l-grouped r-grouped r-fields existence-field] 42 | (->> l-grouped 43 | (map 44 | (fn [l-group] 45 | (let [[k l-tuples] l-group 46 | r-empty-tuples [(empty-tuple r-fields)] 47 | r-tuple (first (get r-grouped k r-empty-tuples)) 48 | existence-tuple (if (contains? r-grouped k) 49 | (to-tuple [existence-field] [true]) 50 | (to-tuple [existence-field] [false]))] 51 | (for [x l-tuples] 52 | ;; merge is specifically ordered, because the left 53 | ;; tuple takes precedence over the right one (which 54 | ;; could be nil) 55 | (merge existence-tuple r-tuple x))))) 56 | (remove nil?) 57 | flatten)) 58 | 59 | (defn left-excluding-join 60 | "A left join that only returns values where the right side is nil" 61 | [l-grouped r-grouped r-fields] 62 | (->> l-grouped 63 | (map 64 | (fn [l-group] 65 | (let [[k l-tuples] l-group 66 | r-empty-tuple (empty-tuple r-fields)] 67 | (if (not (find r-grouped k)) 68 | (map #(merge r-empty-tuple %) l-tuples))))) 69 | (remove nil?) 70 | flatten)) 71 | 72 | (defn outer-join 73 | "A join that contains all of the values between the two maps, 74 | but none duplicated" 75 | [l-grouped r-grouped l-fields r-fields] 76 | (let [inner (inner-join l-grouped r-grouped) 77 | left (left-excluding-join l-grouped r-grouped r-fields) 78 | right (left-excluding-join r-grouped l-grouped l-fields)] 79 | (concat inner left right))) 80 | 81 | (defn join 82 | "Dispatches to all the different join types and returns a vector 83 | with the collection of joined tuples and the type of join" 84 | [l-grouped r-grouped l-type r-type l-fields r-fields] 85 | (cond 86 | (and (= :inner l-type) (= :inner r-type)) 87 | [(inner-join l-grouped r-grouped) :inner] 88 | (and (= :inner l-type) (= :outer r-type)) 89 | [(left-join l-grouped r-grouped r-fields) :outer] 90 | (= :inner l-type) 91 | [(left-existence-join l-grouped r-grouped r-fields r-type) :outer] 92 | (and (= :outer l-type) (= :inner r-type)) 93 | [(left-join r-grouped l-grouped l-fields) :outer] 94 | (and (= :outer 
l-type) (= :outer r-type)) 95 | [(outer-join l-grouped r-grouped l-fields r-fields) :outer] 96 | :else 97 | [(left-existence-join l-grouped r-grouped r-fields r-type) :outer])) 98 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Cascalog 2 | 3 | [![Build Status](https://secure.travis-ci.org/nathanmarz/cascalog.png?branch=develop)](http://travis-ci.org/nathanmarz/cascalog) 4 | 5 | [Cascalog](http://cascalog.org/) is a fully-featured data processing and querying library for Clojure or Java. The main use cases for Cascalog are processing "Big Data" on top of Hadoop or doing analysis on your local computer. Cascalog is a replacement for tools like Pig, Hive, and Cascading and operates at a significantly higher level of abstraction than those tools. 6 | 7 | Follow the getting started steps, check out the tutorial, and you'll be running Cascalog queries on your local computer within 5 minutes. 8 | 9 | # Getting Started with JCascalog 10 | 11 | To get started with JCascalog, Cascalog's pure-Java API, see [this wiki page](https://github.com/nathanmarz/cascalog/wiki/JCascalog). The jcascalog.Playground class has in-memory datasets that you can play with to learn the basics. 12 | 13 | # Latest Version 14 | 15 | The latest release version of Cascalog is hosted on [Clojars](https://clojars.org): 16 | 17 | [![Current Version](https://img.shields.io/clojars/v/cascalog/cascalog-core.svg)](https://clojars.org/cascalog/cascalog-core) 18 | 19 | # Getting started with Clojure Cascalog 20 | 21 | The best way to get started with Cascalog is experiment with the toy datasets that ship with the project. These datasets are served from memory and can be played with purely from the REPL. Just follow these steps and you'll be on your way: 22 | 23 | 1. Install [leiningen](http://github.com/technomancy/leiningen) 24 | 2. 
Make sure you have Java 1.6 (run `java -version`) 25 | 3. Start a new leiningen project with `lein new <project name>`, replacing `<project name>` with the name of your project 26 | 4. Include a dependency on Cascalog in your project by adding `[cascalog/cascalog-core "3.0.0"]` into your project's `project.clj` file. 27 | 5. Work through the examples in the [Getting Started Guide](http://cascalog.org/articles/getting_started.html). 28 | 29 | # Using Cascalog within a project 30 | 31 | Cascalog is hosted at [Clojars](http://clojars.org/cascalog), and some of its dependencies are hosted at [Conjars](http://conjars.org/). Both Clo/Con-jars are maven repos that are easy to use with maven or leiningen. 32 | 33 | To include Cascalog in your leiningen or cake project, add the following to your `project.clj`: 34 | 35 | General 36 | 37 | [cascalog/cascalog-core "3.0.0"] ;; under :dependencies 38 | [org.apache.hadoop/hadoop-core "1.2.1"] ;; under :dev-dependencies 39 | 40 | Leiningen 2.0 41 | 42 | :repositories {"conjars" "http://conjars.org/repo"} 43 | :dependencies [[cascalog/cascalog-core "3.0.0"]] 44 | :profiles { :provided {:dependencies [[org.apache.hadoop/hadoop-core "1.2.1"]]}} 45 | 46 | Leiningen < 2.0 47 | 48 | :dependencies [[cascalog/cascalog-core "3.0.0"]] 49 | :dev-dependencies [[org.apache.hadoop/hadoop-core "1.2.1"]] 50 | 51 | Note that Cascalog is compatible with Clojure 1.2.0, 1.2.1, 1.3.0, 1.4.0, and 1.5.1. 52 | 53 | # Documentation and Issue Tracker 54 | 55 | - The [Cascalog website](http://cascalog.org/) contains more information and links to various articles and tutorials. 56 | - API documentation can be found at http://nathanmarz.github.io/cascalog/. 57 | - [Issue Tracker on Github](https://github.com/nathanmarz/cascalog/issues). 58 | 59 | Come chat with us in the Google group: [cascalog-user](http://groups.google.com/group/cascalog-user) 60 | 61 | Or in the #cascalog or #cascading rooms on freenode! 62 | 63 | # Priorities for Cascalog development 64 | 65 | 1. Replicated and bloom joins 66 | 2.
Cross query optimization: push constants and filters down into subqueries when possible 67 | 68 | # Acknowledgements 69 | 70 | YourKit is kindly supporting open source projects with its full-featured Java Profiler. YourKit, LLC is the creator of innovative and intelligent tools for profiling Java and .NET applications. Take a look at YourKit's leading software products: [YourKit Java Profiler](http://www.yourkit.com/java/profiler/index.jsp) and [YourKit .NET Profiler](http://www.yourkit.com/.net/profiler/index.jsp). 71 | 72 | Cascalog is based off of a very early branch of cascading-clojure project (http://github.com/clj-sys/cascading-clojure). Special thanks to Bradford Cross and Mark McGranaghan for their work on that project. Much of that code appears within Cascalog in either its original form or a modified form. 73 | -------------------------------------------------------------------------------- /cascalog-core/test/cascalog/cascading/conf_test.clj: -------------------------------------------------------------------------------- 1 | (ns cascalog.cascading.conf-test 2 | (:use midje.sweet 3 | cascalog.cascading.conf) 4 | (:require [clojure.string :as s] 5 | [cascalog.api :refer (??<- set-cascading-platform!)]) 6 | (:import [cascalog.hadoop DefaultComparator] 7 | [cascading.flow.planner PlannerException])) 8 | 9 | (background 10 | (before :facts 11 | (set-cascading-platform!))) 12 | 13 | (def comma 14 | (partial s/join ",")) 15 | 16 | (def defaults 17 | (comma default-serializations)) 18 | 19 | (with-job-conf {"key" "val"} 20 | (fact "The first binding level should set the JobConf equal to the 21 | supplied conf map." 22 | *JOB-CONF* => {"key" "val"})) 23 | 24 | (with-job-conf {"key" ["val1" "val2"] 25 | "other-key" "other-val"} 26 | (fact "Vectors of strings will be joined w/ commas. This binding 27 | level knocks out the previous, as the keys are identical." 
28 | *JOB-CONF* => {"key" "val1,val2" 29 | "other-key" "other-val"}) 30 | 31 | (with-job-conf {"key" ["val3"]} 32 | (fact "other-key from above should be preserved." 33 | *JOB-CONF* => {"key" "val3" 34 | "other-key" "other-val"}))) 35 | 36 | (with-job-conf {"io.serializations" "java.lang.String"} 37 | *JOB-CONF* => {"io.serializations" "java.lang.String"} 38 | (fact 39 | "Calling project-merge on the JobConf will prepend default 40 | serializations onto the supplied list of serializations." 41 | (project-merge *JOB-CONF*) => {"io.serializations" 42 | (comma [defaults "java.lang.String"])}) 43 | (with-serializations [String] 44 | (fact 45 | "You can specify serialiations using the `with-serializations` 46 | form. This works w/ class objects or strings. Without 47 | project-merge, the *JOB-CONF* variable is unaffected. (Note that 48 | classes are resolved properly.)" 49 | *JOB-CONF* => {"io.serializations" "java.lang.String"}) 50 | 51 | (fact "Again, project-merging with w/ Class objects, vs Strings." 52 | (project-merge *JOB-CONF*) => {"io.serializations" 53 | (comma [defaults "java.lang.String"])}))) 54 | 55 | (with-serializations [String] 56 | (with-job-conf {"io.serializations" "java.lang.String,SomeSerialization"} 57 | (fact "with-serializations nests properly with with-job-conf." 58 | (project-merge *JOB-CONF*) => {"io.serializations" 59 | (comma [defaults 60 | "java.lang.String" 61 | "SomeSerialization"])}))) 62 | 63 | (facts "Tests of various aspects of Kryo serialization." 
64 | (with-job-conf 65 | {"com.twitter.chill.config.reflectinginstantiator.registrations" "java.util.DoesntExist,someSerializer" 66 | "com.twitter.chill.config.reflectinginstantiator.skipmissing" true 67 | "com.twitter.chill.config.reflectinginstantiator.registrationrequired" false} 68 | (let [cal-tuple [[(java.util.GregorianCalendar.)]]] 69 | (??<- [?a] (cal-tuple ?a)) => cal-tuple)) 70 | 71 | (with-job-conf 72 | {"com.twitter.chill.config.reflectinginstantiator.registrationrequired" true} 73 | (let [cal-tuple [[(java.util.GregorianCalendar.)]]] 74 | (fact 75 | "Attempting to serialize an unregistered object when 76 | accept.all is set to false should throw a flow exception." 77 | (??<- [?a] (cal-tuple ?a))) => (throws PlannerException)))) 78 | 79 | (tabular 80 | (fact "Test of various comparators." 81 | (let [comp (DefaultComparator.)] 82 | (.compare comp ?left ?right) => ?expected)) 83 | ?left ?right ?expected 84 | 0 0 0 85 | 1 1 0 86 | 1 1M 0 87 | 1M 1 0 88 | 2M 1 1 89 | (Long. 4) (Integer. 3) 1 90 | (Long. 3) (Integer. 4) -1 91 | 1 2M -1) 92 | 93 | (fact "Conf-merging test." 94 | (let [m1 {"key" "foo" 95 | "key2" ["bar" "baz"]} 96 | m2 {"key" ["cake" "salad"]}] 97 | (conf-merge m1) => {"key" "foo", "key2" "bar,baz"} 98 | (conf-merge m1 m2) => {"key" "cake,salad", "key2" "bar,baz"})) 99 | 100 | (fact "Stringify test." 101 | (stringify-keys 102 | {:key "val" "key2" "val2"}) => {"key" "val" "key2" "val2"}) 103 | 104 | (future-fact 105 | "Test that stringify-keys can handle clashes between, 106 | say, \"key\" and :key.") 107 | -------------------------------------------------------------------------------- /cascalog-core/src/clj/cascalog/cascading/io.clj: -------------------------------------------------------------------------------- 1 | (ns cascalog.cascading.io 2 | "TODO: Some of these things need to move into hadoop-util." 
3 | (:require [clojure.java.io :as io] 4 | [hadoop-util.core :as hadoop] 5 | [cascalog.cascading.conf :as conf] 6 | [jackknife.core :refer (uuid)] 7 | [schema.core :as s]) 8 | (:import [java.io File PrintWriter] 9 | [org.apache.log4j Logger Level] 10 | [org.apache.hadoop.io BytesWritable])) 11 | 12 | ;; While a BytesWritable object wraps a byte array, not all of the 13 | ;; bytes returned by the getBytes method are valid. As mentioned in 14 | ;; the [documentation](http://goo.gl/3qzyc), "The data is only valid 15 | ;; between 0 and getLength() - 1." 16 | 17 | (defn get-bytes 18 | "Extracts a byte array from a Hadoop BytesWritable object. As 19 | mentioned in the [BytesWritable javadoc](http://goo.gl/cjjlD), only 20 | the first N bytes are valid, where N = `(.getLength byteswritable)`." 21 | [^BytesWritable bytes] 22 | (byte-array (.getLength bytes) 23 | (.getBytes bytes))) 24 | 25 | (defn write-lines 26 | "Writes lines (a seq) to f, separated by newlines. f is opened with 27 | writer, and automatically closed at the end of the sequence." 28 | [f lines] 29 | (with-open [^PrintWriter writer (io/writer f)] 30 | (loop [lines lines] 31 | (when-let [line (first lines)] 32 | (.write writer (str line)) 33 | (.println writer) 34 | (recur (rest lines)))))) 35 | 36 | (defn delete-file-recursively 37 | "Delete file f. If it's a directory, recursively delete all its contents. 38 | Raise an exception if any deletion fails unless silently is true." 39 | [f & [silently]] 40 | (let [f (io/file f)] 41 | (if (.isDirectory f) 42 | (doseq [child (.listFiles f)] 43 | (delete-file-recursively child silently))) 44 | (io/delete-file f silently))) 45 | 46 | (defn temp-path [sub-path] 47 | (io/file (System/getProperty "java.io.tmpdir") sub-path)) 48 | 49 | (defn temp-dir 50 | "1) creates a directory in System.getProperty(\"java.io.tmpdir\") 51 | 2) calls tempDir.deleteOn Exit() so the file is deleted by the jvm. 
52 | reference: ;http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4735419 53 | deleteOnExit is last resort cleanup on jvm exit." 54 | [sub-path] 55 | (let [tmp-dir (temp-path sub-path)] 56 | (or (.exists tmp-dir) (.mkdir tmp-dir)) 57 | (.deleteOnExit tmp-dir) 58 | tmp-dir)) 59 | 60 | (s/defn unique-tmp-file :- File 61 | [s :- s/Str] 62 | (File. (str (temp-dir s) "/" (uuid)))) 63 | 64 | (defn delete-all 65 | "delete-file-recursively is preemptive delete on exiting the code 66 | block for repl and tests run in the same process." 67 | [bindings] 68 | (doseq [file (reverse (map second (partition 2 bindings)))] 69 | (if (.exists file) 70 | (delete-file-recursively file)))) 71 | 72 | (defmacro with-tmp-files [bindings & body] 73 | `(let ~bindings 74 | (try ~@body 75 | (finally (delete-all ~bindings))))) 76 | 77 | (def log-levels 78 | {:fatal Level/FATAL 79 | :warn Level/WARN 80 | :info Level/INFO 81 | :debug Level/DEBUG 82 | :off Level/OFF}) 83 | 84 | (defmacro with-log-level [level & body] 85 | `(let [with-lev# (log-levels ~level) 86 | logger# (Logger/getRootLogger) 87 | prev-lev# (.getLevel logger#)] 88 | (try 89 | (.setLevel logger# with-lev#) 90 | ~@body 91 | (finally 92 | (.setLevel logger# prev-lev#))))) 93 | 94 | (defn delete-all-fs [fs paths] 95 | (dorun 96 | (for [t paths] 97 | (.delete fs (hadoop/path t) true)))) 98 | 99 | (def tmp-dir-property 100 | "Use this variable as key in JobConf if you want to override the 101 | root of temporary paths. See with-fs-tmp." 102 | "cascalog.tmpdir") 103 | 104 | (defmacro with-fs-tmp 105 | "Generates unique, temporary path names as subfolders of /cascalog_reserved. 106 | by default will be '/tmp', but you can configure it via the 107 | JobConf property `cascalog.io/tmp-dir-property`." 
108 | [[fs-sym & tmp-syms] & body] 109 | (let [tmp-root (gensym "tmp-root")] 110 | `(let [~fs-sym (hadoop/filesystem) 111 | ~tmp-root (str (get (conf/project-conf) tmp-dir-property "/tmp") 112 | "/cascalog_reserved") 113 | ~@(mapcat (fn [t] 114 | [t `(str ~tmp-root "/" (uuid))]) 115 | tmp-syms)] 116 | (.mkdirs ~fs-sym (hadoop/path ~tmp-root)) 117 | (try 118 | ~@body 119 | (finally 120 | (delete-all-fs ~fs-sym ~(vec tmp-syms))))))) 121 | -------------------------------------------------------------------------------- /cascalog-checkpoint/README.md: -------------------------------------------------------------------------------- 1 | # Cascalog-Checkpoint 2 | 3 | Workflow checkpointing for Cascalog. 4 | 5 | ## Usage 6 | 7 | Add the following to `project.clj`: 8 | 9 | ```clojure 10 | [cascalog/cascalog-checkpoint "0.2.0"] 11 | ``` 12 | 13 | ## Tutorial 14 | 15 | The `workflow` macro in the checkpoint module allows you to break complicated workflows out into small, checkpointed steps. If one of these steps causes a job to fail and you restart the job, the workflow macro will skip every step up to the previous point of failure. Fault-tolerant MapReduce topologies ftw! 16 | 17 | Let's look at the workflow macro in action.
The following function takes an input-path to some existing Twitter data and an output-path, and executes a tweet-processing workflow with five steps: 18 | 19 | ```clojure 20 | (defn -main 21 | [input-path output-path] 22 | (workflow ["/tmp/example-checkpoint"] 23 | step-1 ([:tmp-dirs [staging-path]] 24 | (transfer-tweets input-path staging-path)) 25 | 26 | step-2 ([:deps :last :tmp-dirs user-path] 27 | (harvest-users staging-path user-path)) 28 | 29 | step-3a ([:deps step-2 :tmp-dirs [cluster-path friend-path]] 30 | (cluster-users user-path cluster-path) 31 | (count-friends user-path friend-path)) 32 | 33 | step-3b ([:deps step-2 :tmp-dirs age-path] 34 | (examine-ages user-path age-path)) 35 | 36 | final-step ([:deps :all] 37 | (big-analysis cluster-path 38 | friend-path 39 | age-path 40 | output-path)))) 41 | ``` 42 | 43 | Let's look at this one piece at a time. The first argument to `workflow` is a vector with some path that the workflow can use to stage temporary files. 44 | 45 | 46 | ```clojure 47 | (workflow ["/tmp/example-checkpoint"] ...) 48 | ``` 49 | 50 | It doesn't matter what path you choose; just make sure that Hadoop has access and can write data to the folder. 51 | 52 | Following this vector, `workflow` expects pairs of the form 53 | 54 | ```clojure 55 | step-name ([:deps <dependencies> 56 | :tmp-dirs [<tmp-dirs>]] 57 | <body forms> ...) 58 | ``` 59 | 60 | Steps can identify other steps as dependencies by referencing their step-names with the `:deps` keyword argument. 61 | 62 | The first step creates a temporary directory by supplying the symbol `staging-path` to the `:tmp-dirs` keyword argument. It then transfers tweets from the input directory into this staging directory, where they will remain available for future steps to consume. 63 | 64 | ```clojure 65 | step-1 ([:tmp-dirs [staging-path]] 66 | (transfer-tweets input-path staging-path)) 67 | 68 | ``` 69 | 70 | Step 2 marks `:last` as a dependency.
`:last` is the default, and marks the step as dependent only on the step directly above. A step will not execute until all of its dependencies have completed successfully. 71 | 72 | `step-2` uses `staging-path` defined in `step-1` and creates a new temp directory (`user-path`) for its results. 73 | 74 | If `step-2` fails for any reason and you restart the workflow, the workflow macro will skip `step-1`, destroy any temporary directories created in the previous run of `step-2`, and start `step-2` afresh. 75 | 76 | ```clojure 77 | step-2 ([:deps :last :tmp-dirs user-path] 78 | (harvest-users staging-path user-path)) 79 | ``` 80 | 81 | The next two steps, `step-3a` and `step-3b`, each mark `step-2` as a dependency. Once `step-2` completes, `step-3a` and `step-3b` will run in parallel. 82 | 83 | ```clojure 84 | step-3a ([:deps step-2 :tmp-dirs [cluster-path friend-path]] 85 | (cluster-users user-path cluster-path) 86 | (count-friends user-path friend-path)) 87 | 88 | step-3b ([:deps step-2 :tmp-dirs age-path] 89 | (examine-ages user-path age-path)) 90 | ``` 91 | 92 | The final step marks its dependencies as `:all`. This signifies that the step must wait for every step defined above it to complete before running. Again, if `final-step` fails and the workflow restarts, all previously successful steps will be skipped.
93 | 94 | ```clojure 95 | final-step ([:deps :all] 96 | (big-analysis cluster-path 97 | friend-path 98 | age-path 99 | output-path)) 100 | ``` 101 | -------------------------------------------------------------------------------- /midje-cascalog/test/midje/cascalog_test.clj: -------------------------------------------------------------------------------- 1 | (ns midje.cascalog-test 2 | (:use midje.sweet 3 | clojure.test 4 | cascalog.api 5 | midje.cascalog 6 | [midje.cascalog.impl :only [execute]] 7 | [clojure.math.combinatorics :only [permutations]]) 8 | (:require [cascalog.logic.ops :as c])) 9 | 10 | ;; ## Testing Battery 11 | 12 | (background 13 | (before :facts 14 | (set-cascading-platform!))) 15 | 16 | (defn whoop [x] [[x]]) 17 | (defn bang [x y] [[x y]]) 18 | 19 | (defn my-query [x y z] 20 | (let [foo (whoop x) 21 | bar (bang y z)] 22 | (<- [?a ?b] 23 | (foo ?a) 24 | (bar ?a ?b)))) 25 | 26 | (defn a-query [x] (<- [?a] (x ?a))) 27 | 28 | (deftest against-background-test 29 | (fact (whoop :a) => 10 30 | (provided (whoop :a) => 10) 31 | (against-background (whoop :a) => 2)) 32 | (against-background [(whoop :a) => 10] 33 | (fact (whoop :a) => 10))) 34 | 35 | ;; Similar to clojure.test's "are". 36 | (deftest tabular-test 37 | (tabular 38 | (fact 39 | (apply ?func ?args) => (produces ?res)) 40 | ?res ?func ?args 41 | [[3 5]] my-query [3 3 5] 42 | [[1]] a-query [[1]])) 43 | 44 | ;; ## Standard Checker Tests 45 | 46 | (deftest produces-test 47 | (fact 48 | "The produces checker allows for more midje-like syntax in cascalog 49 | tests." 50 | (a-query [[10]]) => (produces [[10]]) 51 | (<- [?a ?b] 52 | ([[10 11] [12 13]] ?a ?b) 53 | ([[11]] ?b)) => (produces [[10 11]]) 54 | (against-background 55 | (whoop) => [[10 11] [12 13]] 56 | (bang) => [[11]])) 57 | (let [some-seq [[10]]] 58 | (fact 59 | "use `produces` to check that the supplied query, when executed, 60 | produces exactly the supplied set of tuples -- no more, no less -- 61 | in any order." 
62 | (<- [?a] ((whoop :a) ?a)) => (produces some-seq) 63 | (provided (whoop :a) => [[10]])))) 64 | 65 | ;; The following facts demonstrate the power of midje-cascalog's 66 | ;; chatty checkers. Note that each of the forms (`produces`, 67 | ;; `produces-some`, `produces-prefix` and `produces-suffix`) can be 68 | ;; provided with a log-level keyword as their first argument after the 69 | ;; sequence of result tuples. 70 | (deftest chatty-checkers-test 71 | (let [src [[1 2] [1 3] 72 | [3 4] [3 6] 73 | [5 2] [5 9]] 74 | query (<- [?x ?sum] 75 | (src ?x ?y) 76 | (c/sum ?y :> ?sum))] 77 | (facts 78 | "Executing the query produces proper sums in either order." 79 | query => (produces [[3 10] [1 5] [5 11]]) 80 | query => (produces [[1 5] [3 10] [5 11]]) 81 | 82 | "the `:in-order` keyword makes ordering important, helpful in 83 | cases where output is sorted." 84 | query =not=> (produces [[3 10] [5 11] [1 5]] :in-order) 85 | query => (produces [[1 5] [3 10] [5 11]] :in-order) 86 | 87 | "`produces-some` allows for checking against a subset of tuples" 88 | query => (produces-some [[5 11] [1 5]]) 89 | 90 | "`:in-order` makes ordering important, but gaps are all right." 91 | query =not=> (produces-some [[5 11] [1 5]] :in-order) 92 | query => (produces-some [[1 5] [5 11]] :in-order) 93 | 94 | "Adding `:no-gaps` causes gapped tuples to fail." 95 | query =not=> (produces-some [[1 5] [5 11]] :in-order :no-gaps) 96 | query => (produces-some [[1 5] [3 10]] :in-order :no-gaps) 97 | 98 | "`produce-prefix` mimics the `has-prefix` collection checker." 99 | query => (produces-prefix [[1 5]]) 100 | query => (produces-prefix [[1 5] [3 10]]) 101 | 102 | "`produce-suffix` mimics the `has-suffix` collection checker." 
103 | query => (produces-suffix [[5 11]])))) 104 | 105 | (defn- mk-query [src] 106 | (<- [?a] (src ?a))) 107 | 108 | ;; This syntax makes it possible to wrap tests in an external 109 | ;; `against-background` form, like so: 110 | 111 | (deftest external-against-background-test 112 | (against-background 113 | [(whoop :a) => [[1] [2] [3]]] 114 | (fact "the background above applies to each fact." 115 | (mk-query (whoop :a)) => (produces [[1] [2] [3]]) 116 | 117 | "Internal calls to provide will override the background." 118 | (mk-query (whoop :a)) => (produces [["STRING!"]]) 119 | (provided 120 | (whoop :a) => [["STRING!"]])))) 121 | 122 | (deftest log-level-test 123 | (doseq [?options (permutations [:in-order :no-gaps :info ])] 124 | (fact "log-level option is used when executing query - 125 | regardless of location in options order" 126 | ((apply produces-some ..query.. ?options) ..query..) => true 127 | (provided 128 | (execute ..query.. :log-level :info) => ..query..)))) 129 | --------------------------------------------------------------------------------