├── setm ├── test ├── project │ └── assembly.sbt ├── build.sbt ├── src │ └── main │ │ └── scala │ │ └── dbis │ │ └── setm │ │ ├── Main.scala │ │ └── SETM.scala └── .gitignore ├── ceplib ├── .gitignore ├── build.sbt └── src │ ├── main │ └── scala │ │ └── dbis │ │ └── piglet │ │ └── cep │ │ ├── ops │ │ ├── Outputter.scala │ │ ├── MatchCollector.scala │ │ ├── EngineConf.scala │ │ └── Strategies.scala │ │ ├── nfa │ │ └── RelatedValue.scala │ │ ├── spark │ │ ├── CustomRDDMatcher.scala │ │ ├── CustomDStreamMatcher.scala │ │ └── RDDMatcher.scala │ │ ├── flink │ │ ├── CustomDataSetMatcher.scala │ │ ├── CustomDataStreamMatcher.scala │ │ ├── DataSetMatcher.scala │ │ └── DataStreamMatcher.scala │ │ └── engines │ │ ├── NextMatch.scala │ │ ├── FirstMatch.scala │ │ ├── ContiguityMatch.scala │ │ ├── AnyMatch.scala │ │ └── CEPEngine.scala │ └── test │ └── resources │ └── logback-test.xml ├── common ├── .gitignore ├── build.sbt └── src │ └── main │ └── scala │ └── dbis │ └── piglet │ ├── tools │ ├── HdfsCommand.scala │ └── logging │ │ └── PigletLogging.scala │ └── backends │ ├── BackendConf.scala │ └── CppConfig.scala ├── flinklib ├── .gitignore ├── src │ ├── main │ │ ├── resources │ │ │ ├── application.conf │ │ │ └── log4j.properties │ │ └── scala │ │ │ └── dbis │ │ │ └── piglet │ │ │ └── backends │ │ │ └── flink │ │ │ ├── streaming │ │ │ ├── FlinkExtensions.scala │ │ │ ├── FlinksConf.scala │ │ │ └── UTF8StringSchema.scala │ │ │ ├── FlinkConf.scala │ │ │ └── PigFuncs.scala │ └── test │ │ └── resources │ │ └── logback-test.xml └── build.sbt ├── mapreducelib ├── .gitignore ├── build.sbt └── src │ └── main │ └── scala │ └── dbis │ └── piglet │ └── backends │ └── mapreduce │ └── PigRun.scala ├── sparklib ├── .gitignore ├── src │ ├── test │ │ ├── resources │ │ │ ├── person.csv │ │ │ ├── values.csv │ │ │ └── logback-test.xml │ │ └── scala │ │ │ └── dbis │ │ │ └── piglet │ │ │ └── backends │ │ │ └── spark │ │ │ └── Person.scala │ └── main │ │ ├── resources │ │ └── application.conf │ │ └── scala │ │ └── dbis │ │ └── piglet │ │ └── backends │ │ └── spark │ │ ├── SparkStream.scala │ │ ├── PigFuncs.scala │ │ ├── SparkSRun.scala │ │ └── FileStreamReader.scala └── build.sbt ├── zeppelin ├── .gitignore └── build.sbt ├── src ├── it │ └── resources │ │ ├── truth │ │ ├── result3.data │ │ ├── splitY.data │ │ ├── accumulate.data │ │ ├── filtered.data │ │ ├── aggrwogrouping.data │ │ ├── nested.data │ │ ├── splitX.data │ │ ├── splitZ.data │ │ ├── embedded.data │ │ ├── unique.data │ │ ├── sampling.data │ │ ├── top.data │ │ ├── distances.data │ │ ├── grouping2.data │ │ ├── macro1.data │ │ ├── result2.data │ │ ├── aggregate.data │ │ ├── spatialjoin.data │ │ ├── sorted.data │ │ ├── spatialfilter.data │ │ ├── twojoins.data │ │ ├── result1.data │ │ ├── jdbc-data.data │ │ ├── sorted_multiple_directions.data │ │ ├── joined_filtered.data │ │ ├── aggregate2.data │ │ ├── groupall.data │ │ ├── construct.data │ │ ├── grouping.data │ │ ├── united.data │ │ ├── bag.data │ │ ├── joined.data │ │ ├── simple-matrix-res.data │ │ ├── joined_ambiguous_fieldnames.data │ │ ├── marycount.data │ │ ├── rdf_starjoin_plain.data │ │ ├── rdf_pathjoin_plain.data │ │ ├── bgpfilter.data │ │ ├── crossed.data │ │ └── cross2.csv │ │ ├── json.data │ │ ├── import2.pig │ │ ├── skyline.data │ │ ├── input │ │ ├── split.csv │ │ ├── events.csv │ │ ├── file.csv │ │ ├── aggregate.csv │ │ ├── construct.csv │ │ ├── file.txt │ │ ├── unsorted.csv │ │ ├── unsorted_top.csv │ │ ├── file.json │ │ ├── joinInput.csv │ │ ├── nested.csv │ │ ├── grouping.txt │ │ ├── test.mv.db │ │ ├── 
duplicates.csv │ │ ├── mary.txt │ │ └── matrix_data.csv │ │ ├── load.pig │ │ ├── import1.pig │ │ ├── stream_load.pig │ │ ├── load2.pig │ │ ├── groupall.pig │ │ ├── bag.pig │ │ ├── load3.pig │ │ ├── stream_load2.pig │ │ ├── grouping.pig │ │ ├── socket_write.pig │ │ ├── sampling.pig │ │ ├── filter.pig │ │ ├── top.pig │ │ ├── socket_read.pig │ │ ├── stream_filter.pig │ │ ├── sort.pig │ │ ├── grouping2.pig │ │ ├── aggrwogrouping.pig │ │ ├── jdbc.pig │ │ ├── sort_multiple_directions.pig │ │ ├── accumulate.pig │ │ ├── aggregate.pig │ │ ├── top_schema.pig │ │ ├── construct.pig │ │ ├── windowDistinct.pig │ │ ├── foreach1.pig │ │ ├── windowFilter.pig │ │ ├── json.pig │ │ ├── streaming │ │ ├── aggregate.pig │ │ ├── construct.pig │ │ ├── accumulate.pig │ │ └── union.pig │ │ ├── windowGrouping.pig │ │ ├── embedded.pig │ │ ├── stream_foreach1.pig │ │ ├── windowSort.pig │ │ ├── cross.pig │ │ ├── selfjoin.pig │ │ ├── bgpfilter.pig │ │ ├── simple_matrix.pig │ │ ├── splitInto.pig │ │ ├── selfjoin_filtered.pig │ │ ├── selfjoin_ambiguous_fieldnames.pig │ │ ├── union.pig │ │ ├── groupforeach.pig │ │ ├── rdf_pathjoin_plain.pig │ │ ├── spatialfilter.pig │ │ ├── nforeach.pig │ │ ├── rdf_starjoin_plain.pig │ │ ├── windowCount.pig │ │ ├── spatialfilterwithindex.pig │ │ ├── windowCross.pig │ │ ├── windowJoin.pig │ │ ├── rscript.pig │ │ ├── nforeach2.pig │ │ ├── spatialpartitioning.pig │ │ ├── windowNforeach.pig │ │ ├── crossmany.pig │ │ ├── macro1.pig │ │ ├── skyline.pig │ │ ├── two_joins.pig │ │ ├── wordcount.pig │ │ ├── spatialjoin.pig │ │ └── spatialjoinwithindex.pig ├── test │ ├── scala │ │ └── dbis │ │ │ └── piglet │ │ │ ├── tools │ │ │ ├── TestTools.scala │ │ │ ├── CodeMatcherSpec.scala │ │ │ └── RingBufferSpec.scala │ │ │ └── CompilerSpec.scala │ └── resources │ │ └── logback-test.xml └── main │ ├── scala │ └── dbis │ │ └── piglet │ │ ├── plan │ │ ├── rewriting │ │ │ ├── internals │ │ │ │ ├── package.scala │ │ │ │ ├── MutingSupport.scala │ │ │ │ └── EmbedSupport.scala │ │ │ ├── rulesets │ │ │ │ └── Ruleset.scala │ │ │ └── dsl │ │ │ │ ├── words │ │ │ │ ├── CheckWord.scala │ │ │ │ ├── ImmediateEndWord.scala │ │ │ │ ├── ReplaceWord.scala │ │ │ │ └── MergeWord.scala │ │ │ │ ├── traits │ │ │ │ └── EndWordT.scala │ │ │ │ └── builders │ │ │ │ └── ReplacementBuilder.scala │ │ └── PrettyPrinter.scala │ │ ├── tools │ │ ├── ProductTools.scala │ │ ├── UpdateMap.scala │ │ └── RingBuffer.scala │ │ ├── codegen │ │ ├── flink │ │ │ └── emitter │ │ │ │ ├── DumpEmitter.scala │ │ │ │ ├── StreamDumpEmitter.scala │ │ │ │ ├── StreamSampleEmitter.scala │ │ │ │ ├── LoadEmitter.scala │ │ │ │ ├── StreamStoreEmitter.scala │ │ │ │ ├── StreamLoadEmitter.scala │ │ │ │ ├── StoreEmitter.scala │ │ │ │ ├── StreamOpEmitter.scala │ │ │ │ ├── StreamDistinctEmitter.scala │ │ │ │ ├── LimitEmitter.scala │ │ │ │ ├── OrderByEmitter.scala │ │ │ │ ├── SocketWriteEmitter.scala │ │ │ │ ├── StreamFilterEmitter.scala │ │ │ │ └── SocketReadEmitter.scala │ │ ├── spark │ │ │ ├── StreamDumpEmitter.scala │ │ │ ├── StreamStoreEmitter.scala │ │ │ ├── StreamLoadEmitter.scala │ │ │ ├── CacheEmitter.scala │ │ │ ├── VisualizeEmitter.scala │ │ │ ├── StreamDistinctEmitter.scala │ │ │ ├── SpatialIndexEmitter.scala │ │ │ ├── StreamOrderByEmitter.scala │ │ │ ├── SpatialEmitterHelper.scala │ │ │ ├── StreamGroupingEmitter.scala │ │ │ ├── PartitionerEmitter.scala │ │ │ └── DelayEmitter.scala │ │ └── scala_lang │ │ │ ├── EmptyEmitter.scala │ │ │ ├── HdfsCmdEmitter.scala │ │ │ ├── IntersectionEmitter.scala │ │ │ ├── UnionEmitter.scala │ │ │ ├── SampleEmitter.scala │ │ │ ├── 
TimingEmitter.scala │ │ │ ├── DistinctEmitter.scala │ │ │ ├── FilterEmitter.scala │ │ │ ├── DumpEmitter.scala │ │ │ ├── StoreEmitter.scala │ │ │ ├── DifferenceEmitter.scala │ │ │ ├── LimitEmitter.scala │ │ │ ├── LoadEmitter.scala │ │ │ └── StreamOpEmitter.scala │ │ ├── op │ │ ├── Intersection.scala │ │ ├── Materialize.scala │ │ ├── TimingOp.scala │ │ ├── Visualize.scala │ │ ├── Partition.scala │ │ ├── Difference.scala │ │ ├── Empty.scala │ │ ├── Cache.scala │ │ ├── cmd │ │ │ ├── RegisterCmd.scala │ │ │ ├── SetCmd.scala │ │ │ ├── DefineCmd.scala │ │ │ └── HdfsCmd.scala │ │ ├── Describe.scala │ │ ├── Limit.scala │ │ ├── Display.scala │ │ ├── RScript.scala │ │ ├── Dump.scala │ │ ├── Top.scala │ │ ├── Distinct.scala │ │ ├── Tuplify.scala │ │ ├── WindowApply.scala │ │ ├── IndexOp.scala │ │ ├── Delay.scala │ │ └── SplitInto.scala │ │ ├── expr │ │ └── Traverser.scala │ │ ├── mm │ │ ├── CacheEntry.scala │ │ └── MaterializationPoint.scala │ │ └── api │ │ └── PigletInterpreterAPI.scala │ └── resources │ └── logback.xml ├── project ├── build.properties ├── assembly.sbt └── plugins.sbt ├── lib_unmanaged ├── stark.jar └── jvmr_2.11-2.11.2.1.jar ├── script └── simplestatserver.sh ├── .dockerignore ├── Dockerfile ├── materialization_scripts ├── gdelt_gold_tone_roi.pig ├── taxi_tip_avg.pig ├── gdelt_url_eventcode.pig └── taxi_high_tip_block.pig ├── .gitignore ├── make-distribution.sh └── Zeppelin.md /setm/test: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ceplib/.gitignore: -------------------------------------------------------------------------------- 1 | /bin/ 2 | -------------------------------------------------------------------------------- /common/.gitignore: -------------------------------------------------------------------------------- 1 | /bin/ 2 | -------------------------------------------------------------------------------- /flinklib/.gitignore: -------------------------------------------------------------------------------- 1 | /bin/ 2 | -------------------------------------------------------------------------------- /mapreducelib/.gitignore: -------------------------------------------------------------------------------- 1 | /bin/ 2 | -------------------------------------------------------------------------------- /sparklib/.gitignore: -------------------------------------------------------------------------------- 1 | /bin/ 2 | -------------------------------------------------------------------------------- /zeppelin/.gitignore: -------------------------------------------------------------------------------- 1 | /bin/ 2 | -------------------------------------------------------------------------------- /src/it/resources/truth/result3.data: -------------------------------------------------------------------------------- 1 | small -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.1.4 2 | -------------------------------------------------------------------------------- /src/it/resources/json.data: -------------------------------------------------------------------------------- 1 | Hage,(Ilmenau,98693) -------------------------------------------------------------------------------- /src/it/resources/truth/splitY.data: -------------------------------------------------------------------------------- 1 | 4,5,6 2 | 
-------------------------------------------------------------------------------- /src/it/resources/import2.pig: -------------------------------------------------------------------------------- 1 | A = LOAD 'input'; 2 | -------------------------------------------------------------------------------- /src/it/resources/truth/accumulate.data: -------------------------------------------------------------------------------- 1 | 1,3,9,5,1.8 -------------------------------------------------------------------------------- /src/it/resources/truth/filtered.data: -------------------------------------------------------------------------------- 1 | 2,2 2 | 3,3 3 | -------------------------------------------------------------------------------- /src/it/resources/skyline.data: -------------------------------------------------------------------------------- 1 | event#1,1.0 2 | event#2,1.0 -------------------------------------------------------------------------------- /src/it/resources/truth/aggrwogrouping.data: -------------------------------------------------------------------------------- 1 | 5,21,4.2 2 | -------------------------------------------------------------------------------- /src/it/resources/truth/nested.data: -------------------------------------------------------------------------------- 1 | BB,2 2 | CC,2 3 | AA,3 -------------------------------------------------------------------------------- /src/it/resources/truth/splitX.data: -------------------------------------------------------------------------------- 1 | 1,2,3 2 | 4,5,6 3 | -------------------------------------------------------------------------------- /src/it/resources/truth/splitZ.data: -------------------------------------------------------------------------------- 1 | 1,2,3 2 | 7,8,9 3 | -------------------------------------------------------------------------------- /src/it/resources/input/split.csv: -------------------------------------------------------------------------------- 1 | 1,2,3 2 | 4,5,6 3 | 7,8,9 4 | -------------------------------------------------------------------------------- /src/it/resources/truth/embedded.data: -------------------------------------------------------------------------------- 1 | 2 2 | 4 3 | 3 4 | 6 5 | 4 6 | -------------------------------------------------------------------------------- /src/it/resources/truth/unique.data: -------------------------------------------------------------------------------- 1 | 1,1 2 | 2,2 3 | 3,3 4 | 4,4 5 | -------------------------------------------------------------------------------- /sparklib/src/test/resources/person.csv: -------------------------------------------------------------------------------- 1 | Anna,21 2 | John,53 3 | Mike,32 -------------------------------------------------------------------------------- /src/it/resources/input/events.csv: -------------------------------------------------------------------------------- 1 | event#1,50.0,10.1 2 | event#2,50.1,10.2 -------------------------------------------------------------------------------- /src/it/resources/input/file.csv: -------------------------------------------------------------------------------- 1 | 1,1 2 | 2,2 3 | 1,2 4 | 3,3 5 | 3,1 6 | -------------------------------------------------------------------------------- /src/it/resources/truth/sampling.data: -------------------------------------------------------------------------------- 1 | 1,1 2 | 2,2 3 | 1,2 4 | 3,3 5 | 3,1 -------------------------------------------------------------------------------- 
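How these integration-test resources evidently fit together: each *.pig script under src/it/resources reads from input/, the test harness apparently fills in the $inbase and $outfile placeholders at run time, and the produced output is compared against the matching file under truth/. A minimal sketch of the pattern, assuming that pairing (the script mirrors filter.pig, which appears in full later in this dump):

A = LOAD '$inbase/input/file.csv' USING PigStorage(',') AS (f1:int, f2:int);  -- input/file.csv above
B = FILTER A BY f1>1 AND f2>1;
STORE B INTO '$outfile';  -- presumably compared against truth/filtered.data

Run against input/file.csv, only the tuples (2,2) and (3,3) satisfy f1>1 AND f2>1, which is exactly the content of truth/filtered.data above.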
/src/it/resources/truth/top.data: -------------------------------------------------------------------------------- 1 | b,A,1 2 | c,B,9 3 | b,D,7 4 | a,D,3 5 | -------------------------------------------------------------------------------- /src/it/resources/input/aggregate.csv: -------------------------------------------------------------------------------- 1 | 1,3 2 | 1,3 3 | 2,3 4 | 4,5 5 | 2,7 6 | -------------------------------------------------------------------------------- /src/it/resources/truth/distances.data: -------------------------------------------------------------------------------- 1 | event#1,50.01,13.6 2 | event#2,50.11,13.7 -------------------------------------------------------------------------------- /src/it/resources/truth/grouping2.data: -------------------------------------------------------------------------------- 1 | 1,2.0 2 | 4,2.5 3 | 7,2.0 4 | 8,3.5 5 | -------------------------------------------------------------------------------- /src/it/resources/truth/macro1.data: -------------------------------------------------------------------------------- 1 | 43,0 2 | 44,1 3 | 43,1 4 | 45,2 5 | 45,0 -------------------------------------------------------------------------------- /src/it/resources/truth/result2.data: -------------------------------------------------------------------------------- 1 | 1,1 2 | 2,2 3 | 1,2 4 | 3,3 5 | 3,1 6 | -------------------------------------------------------------------------------- /project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.5") 2 | -------------------------------------------------------------------------------- /sparklib/src/test/resources/values.csv: -------------------------------------------------------------------------------- 1 | 0.001,21.5 2 | 0.00004,53.9 3 | 0.023456,32.3 -------------------------------------------------------------------------------- /src/it/resources/input/construct.csv: -------------------------------------------------------------------------------- 1 | 100,101,aaaa 2 | 200,202,bbbb 3 | 300,302,cccc -------------------------------------------------------------------------------- /src/it/resources/truth/aggregate.data: -------------------------------------------------------------------------------- 1 | 1,2,6,3.0 2 | 2,2,10,5.0 3 | 4,1,5,5.0 4 | -------------------------------------------------------------------------------- /src/it/resources/truth/spatialjoin.data: -------------------------------------------------------------------------------- 1 | event#1,event#1 2 | event#2,event#2 3 | -------------------------------------------------------------------------------- /setm/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.0") 2 | -------------------------------------------------------------------------------- /src/it/resources/input/file.txt: -------------------------------------------------------------------------------- 1 | Hallo 2 | this 3 | is 4 | a 5 | small 6 | text 7 | file. 
-------------------------------------------------------------------------------- /src/it/resources/input/unsorted.csv: -------------------------------------------------------------------------------- 1 | c,B,9 2 | b,D,7 3 | a,C,3 4 | d,F,4 5 | b,A,1 6 | -------------------------------------------------------------------------------- /src/it/resources/truth/sorted.data: -------------------------------------------------------------------------------- 1 | a,C,3 2 | b,A,1 3 | b,D,7 4 | c,B,9 5 | d,F,4 6 | -------------------------------------------------------------------------------- /src/it/resources/truth/spatialfilter.data: -------------------------------------------------------------------------------- 1 | event#2,STObject(POINT (50.1 10.2),None) 2 | -------------------------------------------------------------------------------- /src/it/resources/truth/twojoins.data: -------------------------------------------------------------------------------- 1 | 4,4 2 | 4,4 3 | 4,4 4 | 4,4 5 | 4,4 6 | 4,4 7 | -------------------------------------------------------------------------------- /src/it/resources/input/unsorted_top.csv: -------------------------------------------------------------------------------- 1 | c,B,9 2 | b,D,7 3 | a,D,3 4 | d,F,4 5 | b,A,1 6 | -------------------------------------------------------------------------------- /src/it/resources/load.pig: -------------------------------------------------------------------------------- 1 | A = LOAD '$inbase/input/file.txt'; 2 | STORE A INTO '$outfile'; 3 | -------------------------------------------------------------------------------- /src/it/resources/truth/result1.data: -------------------------------------------------------------------------------- 1 | Hallo 2 | this 3 | is 4 | a 5 | small 6 | text 7 | file. 
-------------------------------------------------------------------------------- /lib_unmanaged/stark.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dbis-ilm/piglet/HEAD/lib_unmanaged/stark.jar -------------------------------------------------------------------------------- /src/it/resources/input/file.json: -------------------------------------------------------------------------------- 1 | {"name":"Hage","address":{"city":"Ilmenau","zipcode":"98693"}} -------------------------------------------------------------------------------- /src/it/resources/input/joinInput.csv: -------------------------------------------------------------------------------- 1 | 1,2,3 2 | 4,2,1 3 | 8,3,4 4 | 4,3,3 5 | 7,2,5 6 | 8,4,3 7 | -------------------------------------------------------------------------------- /src/it/resources/input/nested.csv: -------------------------------------------------------------------------------- 1 | AA,bb 2 | AA,cc 3 | AA,dd 4 | BB,cc 5 | BB,dd 6 | CC,aa 7 | CC,dd -------------------------------------------------------------------------------- /src/it/resources/import1.pig: -------------------------------------------------------------------------------- 1 | IMPORT 'src/it/resources/import2.pig'; 2 | B = FILTER A BY $0 > 10; 3 | -------------------------------------------------------------------------------- /src/it/resources/stream_load.pig: -------------------------------------------------------------------------------- 1 | A = LOAD '$inbase/input/file.txt'; 2 | STORE A INTO '$outfile'; 3 | -------------------------------------------------------------------------------- /src/it/resources/truth/jdbc-data.data: -------------------------------------------------------------------------------- 1 | 1,One 2 | 2,Two 3 | 3,Three 4 | 4,Four 5 | 5,Five 6 | 6,Six 7 | -------------------------------------------------------------------------------- /src/it/resources/truth/sorted_multiple_directions.data: -------------------------------------------------------------------------------- 1 | a,C,3 2 | b,D,7 3 | b,A,1 4 | c,B,9 5 | d,F,4 -------------------------------------------------------------------------------- /src/it/resources/truth/joined_filtered.data: -------------------------------------------------------------------------------- 1 | 4,2,1,4,2,1 2 | 4,2,1,4,3,3 3 | 4,3,3,4,2,1 4 | 4,3,3,4,3,3 5 | -------------------------------------------------------------------------------- /src/it/resources/truth/aggregate2.data: -------------------------------------------------------------------------------- 1 | 1,1,3,3.0 2 | 1,2,6,3.0 3 | 2,1,3,3.0 4 | 2,2,10,5.0 5 | 4,1,5,5.0 6 | -------------------------------------------------------------------------------- /src/it/resources/truth/groupall.data: -------------------------------------------------------------------------------- 1 | all,{(1,2,3),(4,2,1),(8,3,4),(4,3,3),(7,2,5),(8,4,3),(1,2,5),(7,2,8)} 2 | -------------------------------------------------------------------------------- /src/it/resources/input/grouping.txt: -------------------------------------------------------------------------------- 1 | 1 2 3 2 | 4 2 1 3 | 8 3 4 4 | 4 3 3 5 | 7 2 5 6 | 8 4 3 7 | 1 2 5 8 | 7 2 8 9 | -------------------------------------------------------------------------------- /src/it/resources/input/test.mv.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dbis-ilm/piglet/HEAD/src/it/resources/input/test.mv.db 
-------------------------------------------------------------------------------- /lib_unmanaged/jvmr_2.11-2.11.2.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dbis-ilm/piglet/HEAD/lib_unmanaged/jvmr_2.11-2.11.2.1.jar -------------------------------------------------------------------------------- /src/it/resources/input/duplicates.csv: -------------------------------------------------------------------------------- 1 | 1,1 2 | 2,2 3 | 2,2 4 | 3,3 5 | 3,3 6 | 3,3 7 | 4,4 8 | 4,4 9 | 4,4 10 | 4,4 11 | -------------------------------------------------------------------------------- /src/it/resources/load2.pig: -------------------------------------------------------------------------------- 1 | A = LOAD '$inbase/input/file.csv' USING PigStorage(',') AS (f1:chararray, f2: int); 2 | STORE A INTO '$outfile'; 3 | -------------------------------------------------------------------------------- /script/simplestatserver.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | FILE=$1 4 | PORT=$2 5 | 6 | nc -p $PORT -l -o $FILE --append-output --recv-only --keep-open 7 | -------------------------------------------------------------------------------- /src/it/resources/groupall.pig: -------------------------------------------------------------------------------- 1 | A = LOAD '$inbase/input/grouping.txt' AS (f1:int, f2:int, f3:int); 2 | B = GROUP A ALL; 3 | STORE B INTO '$outfile'; -------------------------------------------------------------------------------- /src/it/resources/bag.pig: -------------------------------------------------------------------------------- 1 | A = LOAD '$inbase/input/mary.txt' AS (f1:chararray); 2 | X = FOREACH A GENERATE TOKENIZE(f1); 3 | STORE X INTO '$outfile'; 4 | -------------------------------------------------------------------------------- /src/it/resources/input/mary.txt: -------------------------------------------------------------------------------- 1 | Mary had a little lamb 2 | its fleece was white as snow 3 | and everywhere that Mary went 4 | the lamb was sure to go. 
-------------------------------------------------------------------------------- /src/it/resources/load3.pig: -------------------------------------------------------------------------------- 1 | a = load '$inbase/input/file.txt' using PigStorage(':'); 2 | b = filter a by $0 == "small"; 3 | store b into '$outfile'; 4 | -------------------------------------------------------------------------------- /src/it/resources/stream_load2.pig: -------------------------------------------------------------------------------- 1 | A = LOAD '$inbase/input/file.csv' USING PigStream(',') AS (f1:chararray, f2: int); 2 | STORE A INTO '$outfile'; 3 | -------------------------------------------------------------------------------- /src/it/resources/truth/construct.data: -------------------------------------------------------------------------------- 1 | (100,101),{(100),(101)},[aaaa#100] 2 | (200,202),{(200),(202)},[bbbb#200] 3 | (300,302),{(300),(302)},[cccc#300] -------------------------------------------------------------------------------- /src/it/resources/grouping.pig: -------------------------------------------------------------------------------- 1 | A = LOAD '$inbase/input/grouping.txt' AS (f1:int, f2:int, f3:int); 2 | B = GROUP A BY (f1,f2); 3 | STORE B INTO '$outfile'; 4 | -------------------------------------------------------------------------------- /src/it/resources/socket_write.pig: -------------------------------------------------------------------------------- 1 | A = LOAD '$inbase/input/file.csv' USING PigStream(',') AS (f1:chararray,f2:int); 2 | SOCKET_WRITE A TO 'localhost:9999'; 3 | -------------------------------------------------------------------------------- /setm/build.sbt: -------------------------------------------------------------------------------- 1 | name := "setm" 2 | 3 | version := "0.1" 4 | 5 | scalaVersion := "2.11.8" 6 | 7 | libraryDependencies += "fm.void.jetm" % "jetm" % "1.2.3" 8 | -------------------------------------------------------------------------------- /src/it/resources/input/matrix_data.csv: -------------------------------------------------------------------------------- 1 | 1.0,2.0,3.0,1.5,2.5,3.5 2 | 10.0,20.0,30.0,10.5,20.5,30.5 3 | 11.0,12.0,13.0,11.5,12.5,13.5 4 | 21.0,22.0,23.0,21.5,22.5,23.5 -------------------------------------------------------------------------------- /src/it/resources/sampling.pig: -------------------------------------------------------------------------------- 1 | A = LOAD '$inbase/input/file.csv' USING PigStorage(',') AS (f1:int, f2: int); 2 | B = SAMPLE A 1.0; 3 | STORE B INTO '$outfile'; 4 | -------------------------------------------------------------------------------- /src/it/resources/filter.pig: -------------------------------------------------------------------------------- 1 | A = LOAD '$inbase/input/file.csv' USING PigStorage(',') AS (f1:int, f2: int); 2 | B = FILTER A BY f1>1 AND f2>1; 3 | STORE B INTO '$outfile'; 4 | -------------------------------------------------------------------------------- /src/it/resources/truth/grouping.data: -------------------------------------------------------------------------------- 1 | (1,2),{(1,2,3),(1,2,5)} 2 | (4,2),{(4,2,1)} 3 | (4,3),{(4,3,3)} 4 | (7,2),{(7,2,5),(7,2,8)} 5 | (8,3),{(8,3,4)} 6 | (8,4),{(8,4,3)} 7 | -------------------------------------------------------------------------------- /src/it/resources/truth/united.data: -------------------------------------------------------------------------------- 1 | 1,1 2 | 2,2 3 | 1,2 4 | 3,3 5 | 3,1 6 | 1,1 7 | 2,2 8 | 1,2 9 | 3,3 
10 | 3,1 11 | 1,1 12 | 2,2 13 | 1,2 14 | 3,3 15 | 3,1 16 | -------------------------------------------------------------------------------- /src/it/resources/top.pig: -------------------------------------------------------------------------------- 1 | A = LOAD '$inbase/input/unsorted_top.csv' USING PigStorage(','); 2 | B = order A by $1 asc, $2 desc; 3 | C = limit B 4; 4 | STORE C INTO '$outfile'; 5 | -------------------------------------------------------------------------------- /src/it/resources/socket_read.pig: -------------------------------------------------------------------------------- 1 | A = SOCKET_READ 'tcp://localhost:9999' MODE ZMQ USING PigStream(',') AS (f1:double, f2:double, f3:double); 2 | B = FILTER A BY f2>0; 3 | DUMP B; 4 | -------------------------------------------------------------------------------- /src/it/resources/stream_filter.pig: -------------------------------------------------------------------------------- 1 | A = LOAD '$inbase/input/file.csv' USING PigStream(',') AS (f1:int, f2: int); 2 | B = FILTER A BY f1>1 AND f2>1; 3 | STORE B INTO '$outfile'; 4 | -------------------------------------------------------------------------------- /src/it/resources/sort.pig: -------------------------------------------------------------------------------- 1 | A = LOAD '$inbase/input/unsorted.csv' USING PigStorage(',') AS (f1:chararray, f2: chararray, f3: int); 2 | B = ORDER A BY f1, f2, f3; 3 | STORE B INTO '$outfile'; 4 | -------------------------------------------------------------------------------- /src/it/resources/grouping2.pig: -------------------------------------------------------------------------------- 1 | A = LOAD '$inbase/input/grouping.txt' AS (f1:int, f2:int, f3:int); 2 | B = GROUP A BY f1; 3 | C = FOREACH B GENERATE A.f1, AVG(A.f2); 4 | STORE C INTO '$outfile'; 5 | -------------------------------------------------------------------------------- /src/it/resources/aggrwogrouping.pig: -------------------------------------------------------------------------------- 1 | a = load '$inbase/input/aggregate.csv' using PigStorage(',') as (x:int, y:int); 2 | b = foreach a generate COUNT(y), SUM(y), AVG(y); 3 | store b into '$outfile'; 4 | -------------------------------------------------------------------------------- /src/it/resources/jdbc.pig: -------------------------------------------------------------------------------- 1 | A = LOAD 'data' USING JdbcStorage('org.h2.Driver', 'jdbc:h2:file:$inbase/input/test?user=sa&ACCESS_MODE_DATA=r') AS (col1: int, col2:chararray); 2 | STORE A INTO '$outfile'; 3 | -------------------------------------------------------------------------------- /src/it/resources/truth/bag.data: -------------------------------------------------------------------------------- 1 | {(Mary),(had),(a),(little),(lamb)} 2 | {(its),(fleece),(was),(white),(as),(snow)} 3 | {(and),(everywhere),(that),(Mary),(went)} 4 | {(the),(lamb),(was),(sure),(to),(go.)} -------------------------------------------------------------------------------- /src/it/resources/truth/joined.data: -------------------------------------------------------------------------------- 1 | 4,2,1,4,2,1 2 | 4,2,1,4,3,3 3 | 4,3,3,4,2,1 4 | 4,3,3,4,3,3 5 | 8,3,4,8,3,4 6 | 8,3,4,8,4,3 7 | 8,4,3,8,3,4 8 | 8,4,3,8,4,3 9 | 1,2,3,1,2,3 10 | 7,2,5,7,2,5 11 | -------------------------------------------------------------------------------- /src/test/scala/dbis/piglet/tools/TestTools.scala: -------------------------------------------------------------------------------- 1 | 2 | 3 | package 
dbis.piglet.tools 4 | 5 | import java.net.URI 6 | 7 | object TestTools { 8 | implicit def strToUri(str: String): URI = new URI(str) 9 | } -------------------------------------------------------------------------------- /src/it/resources/sort_multiple_directions.pig: -------------------------------------------------------------------------------- 1 | A = LOAD '$inbase/input/unsorted.csv' USING PigStorage(',') AS (f1:chararray, f2: chararray, f3: int); 2 | B = ORDER A BY f1 asc, f2 desc; 3 | STORE B INTO '$outfile'; 4 | -------------------------------------------------------------------------------- /src/it/resources/accumulate.pig: -------------------------------------------------------------------------------- 1 | A = LOAD '$inbase/input/file.csv' USING PigStorage(',') AS (f1: int, f2: int); 2 | B = ACCUMULATE A GENERATE min(f1), max(f1), sum(f2), count(f2), avg(f2); 3 | STORE B INTO '$outfile'; 4 | -------------------------------------------------------------------------------- /src/it/resources/truth/simple-matrix-res.data: -------------------------------------------------------------------------------- 1 | 1.0 3.0 2.5 2 | 2.0 1.5 3.5 3 | 10.0 30.0 20.5 4 | 20.0 10.5 30.5 5 | 11.0 13.0 12.5 6 | 12.0 11.5 13.5 7 | 21.0 23.0 22.5 8 | 22.0 21.5 23.5 9 | -------------------------------------------------------------------------------- /src/it/resources/aggregate.pig: -------------------------------------------------------------------------------- 1 | a = load '$inbase/input/aggregate.csv' using PigStorage(',') as (x:int, y:int); 2 | b = group a by x ; 3 | c = foreach b generate group, COUNT(a.y), SUM(a.y), AVG(a.y); 4 | store c into '$outfile'; -------------------------------------------------------------------------------- /src/it/resources/top_schema.pig: -------------------------------------------------------------------------------- 1 | A = LOAD '$inbase/input/unsorted_top.csv' USING PigStorage(',') as (a: chararray, b: chararray, c: int); 2 | B = order A by $1 asc, $2 desc; 3 | C = limit B 4; 4 | STORE C INTO '$outfile'; 5 | -------------------------------------------------------------------------------- /src/it/resources/truth/joined_ambiguous_fieldnames.data: -------------------------------------------------------------------------------- 1 | 1,2,3,1,2,3 2 | 4,2,1,4,2,1 3 | 4,2,1,4,3,3 4 | 4,3,3,4,2,1 5 | 4,3,3,4,3,3 6 | 7,2,5,7,2,5 7 | 8,3,4,8,3,4 8 | 8,3,4,8,4,3 9 | 8,4,3,8,3,4 10 | 8,4,3,8,4,3 11 | -------------------------------------------------------------------------------- /src/it/resources/construct.pig: -------------------------------------------------------------------------------- 1 | data = load '$inbase/input/construct.csv' using PigStorage(',') as (f1: int, f2: int, name:chararray); 2 | out = foreach data generate (f1, f2), {f1, f2}, [name, f1]; 3 | STORE out INTO '$outfile'; 4 | -------------------------------------------------------------------------------- /src/it/resources/windowDistinct.pig: -------------------------------------------------------------------------------- 1 | A = LOAD '$inbase/input/duplicates.csv' USING PigStream(',') AS (f1:int, f2: int); 2 | B = WINDOW A RANGE 10 SECONDS SLIDE RANGE 10 SECONDS; 3 | C = DISTINCT B; 4 | STORE C INTO '$outfile'; 5 | -------------------------------------------------------------------------------- /src/it/resources/foreach1.pig: -------------------------------------------------------------------------------- 1 | A = LOAD '$inbase/input/events.csv' USING PigStorage(',') AS (id:chararray, longitude: double, latitude: 
double); 2 | B = FOREACH A GENERATE id, longitude + 0.01, latitude + 3.5; 3 | STORE B INTO '$outfile'; -------------------------------------------------------------------------------- /src/it/resources/windowFilter.pig: -------------------------------------------------------------------------------- 1 | A = LOAD '$inbase/input/file.csv' USING PigStream(',') AS (f1:int, f2: int); 2 | B = WINDOW A RANGE 5 SECONDS SLIDE RANGE 5 SECONDS; 3 | C = FILTER B BY f1>1 AND f2>1; 4 | STORE C INTO '$outfile'; 5 | -------------------------------------------------------------------------------- /src/it/resources/json.pig: -------------------------------------------------------------------------------- 1 | A = LOAD '$inbase/input/file.json' USING JsonStorage() AS (address:(city:chararray, zipcode:chararray),name:chararray); 2 | B = FOREACH A GENERATE address.city, address.zipcode, name; 3 | STORE B INTO '$outfile'; 4 | -------------------------------------------------------------------------------- /src/it/resources/streaming/aggregate.pig: -------------------------------------------------------------------------------- 1 | a = load '$inbase/input/aggregate.csv' using PigStream(',') as (x:int, y:int); 2 | b = group a by x ; 3 | c = foreach b generate group, COUNT(a.y), SUM(a.y), AVG(a.y); 4 | store c into '$outfile'; 5 | -------------------------------------------------------------------------------- /src/it/resources/streaming/construct.pig: -------------------------------------------------------------------------------- 1 | data = load '$inbase/input/construct.csv' using PigStream(',') as (f1: int, f2: int, name:chararray); 2 | out = foreach data generate (f1, f2), {f1, f2}, [name, f1]; 3 | STORE out INTO '$outfile'; 4 | -------------------------------------------------------------------------------- /src/it/resources/windowGrouping.pig: -------------------------------------------------------------------------------- 1 | A = LOAD '$inbase/input/grouping.txt' USING PigStream('\t') AS (f1:int, f2:int, f3:int); 2 | B = WINDOW A RANGE 5 SECONDS SLIDE RANGE 5 SECONDS; 3 | C = GROUP B BY (f1,f2); 4 | STORE C INTO '$outfile'; 5 | -------------------------------------------------------------------------------- /src/it/resources/embedded.pig: -------------------------------------------------------------------------------- 1 | <% 2 | def myFunc(i1: Int, i2: Int): Int = i1 + i2 3 | %> 4 | A = LOAD '$inbase/input/file.csv' USING PigStorage(',') AS (f1:int, f2: int); 5 | B = FOREACH A GENERATE myFunc(f1, f2); 6 | STORE B INTO '$outfile'; 7 | -------------------------------------------------------------------------------- /src/it/resources/stream_foreach1.pig: -------------------------------------------------------------------------------- 1 | A = LOAD '$inbase/input/events.csv' USING PigStream(',') AS (id:chararray, longitude: double, latitude: double); 2 | B = FOREACH A GENERATE id, longitude + 0.01, latitude + 3.5; 3 | STORE B INTO '$outfile'; 4 | -------------------------------------------------------------------------------- /src/it/resources/windowSort.pig: -------------------------------------------------------------------------------- 1 | A = LOAD '$inbase/input/unsorted.csv' USING PigStream(',') AS (f1:chararray, f2: chararray, f3: int); 2 | B = WINDOW A RANGE 5 SECONDS SLIDE RANGE 5 SECONDS; 3 | C = ORDER B BY f1, f2, f3; 4 | STORE C INTO '$outfile'; 5 | -------------------------------------------------------------------------------- /src/it/resources/cross.pig: 
-------------------------------------------------------------------------------- 1 | A = LOAD '$inbase/input/file.txt' USING PigStorage(',') AS (f1:chararray); --, f2:int 2 | B = LOAD '$inbase/input/file.txt' USING PigStorage(',') AS (f1:chararray); 3 | D = CROSS A,B; 4 | STORE D INTO '$outfile'; 5 | -- DUMP D; 6 | -------------------------------------------------------------------------------- /src/it/resources/truth/marycount.data: -------------------------------------------------------------------------------- 1 | Mary,2 2 | had,1 3 | a,1 4 | little,1 5 | lamb,2 6 | its,1 7 | fleece,1 8 | was,2 9 | white,1 10 | as,1 11 | snow,1 12 | and,1 13 | everywhere,1 14 | that,1 15 | went,1 16 | the,1 17 | sure,1 18 | to,1 19 | go.,1 -------------------------------------------------------------------------------- /src/it/resources/streaming/accumulate.pig: -------------------------------------------------------------------------------- 1 | A = LOAD '$inbase/input/file.csv' USING PigStream(',') AS (f1: int, f2: int); 2 | B = GROUP A BY f1; 3 | C = ACCUMULATE B GENERATE min(A.f1), max(A.f1), sum(A.f2), count(A.f2), avg(A.f2); 4 | STORE C INTO '$outfile'; 5 | -------------------------------------------------------------------------------- /mapreducelib/build.sbt: -------------------------------------------------------------------------------- 1 | import Dependencies._ 2 | 3 | name := "mapreduce" 4 | 5 | libraryDependencies ++= Seq( 6 | scalaTest % "test" withSources(), 7 | pig % "provided", 8 | hadoop, 9 | typesafe 10 | ) 11 | 12 | test in assembly := {} 13 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # Currently, it's easier to exclude all files and 2 | # to specify exceptions to these excludes. 
:) 3 | 4 | * 5 | 6 | !Dockerfile 7 | !script/piglet 8 | !sparklib/target/scala-2.11/sparklib_2.11-*.jar 9 | !target/scala-2.11/PigCompiler.jar 10 | -------------------------------------------------------------------------------- /common/build.sbt: -------------------------------------------------------------------------------- 1 | import Dependencies._ 2 | 3 | name := "common" 4 | 5 | libraryDependencies ++= Seq( 6 | "ch.qos.logback" % "logback-classic" % "1.2.3", 7 | "org.slf4j" % "slf4j-api" % "1.7.25" % "provided", 8 | hadoop % "provided", 9 | json4s 10 | ) 11 | -------------------------------------------------------------------------------- /src/it/resources/selfjoin.pig: -------------------------------------------------------------------------------- 1 | A = LOAD '$inbase/input/joinInput.csv' USING PigStorage(',') AS (a1:int,a2:int,a3:int); 2 | B = LOAD '$inbase/input/joinInput.csv' USING PigStorage(',') AS (b1:int,b2:int,b3:int); 3 | X = JOIN A BY a1, B BY b1; 4 | STORE X INTO '$outfile'; 5 | -------------------------------------------------------------------------------- /src/it/resources/bgpfilter.pig: -------------------------------------------------------------------------------- 1 | a = LOAD '$inbase/input/sibdataset.nt' using PigStorage(' ') as (subject: chararray, predicate: chararray, object:chararray); 2 | b = BGP_FILTER a BY { 3 | ?user "" ?person 4 | }; 5 | STORE b INTO '$outfile'; -------------------------------------------------------------------------------- /src/it/resources/simple_matrix.pig: -------------------------------------------------------------------------------- 1 | A = LOAD '$inbase/input/matrix_data.csv' USING PigStorage(',') AS (v11: double, v12: double, v21: double, v22: double, v31: double, v32: double); 2 | B = FOREACH A GENERATE ddmatrix(2, 3, {v11, v12, v21, v22, v31, v32}); 3 | STORE B INTO '$outfile'; 4 | -------------------------------------------------------------------------------- /src/it/resources/splitInto.pig: -------------------------------------------------------------------------------- 1 | -- Based on: http://pig.apache.org/docs/r0.14.0/basic.html 2 | A = LOAD '$inbase/input/split.csv' USING PigStream(',') AS (f1:int,f2:int,f3:int); 3 | SPLIT A INTO X IF f1<7, Y IF f2==5, Z IF (f3<6 OR f3>6); 4 | STORE X INTO '$outfile'; 5 | DUMP Y; 6 | DUMP Z; 7 | -------------------------------------------------------------------------------- /src/it/resources/truth/rdf_starjoin_plain.data: -------------------------------------------------------------------------------- 1 | "Ling","Chen", 2 | "Laurent","Ciss\u0329", 3 | "Jean-Pierre","Hnatow", 4 | "Julia","Hooda", 5 | -------------------------------------------------------------------------------- /src/it/resources/selfjoin_filtered.pig: -------------------------------------------------------------------------------- 1 | A = LOAD '$inbase/input/joinInput.csv' USING PigStorage(',') AS (a1:int,a2:int,a3:int); 2 | B = LOAD '$inbase/input/joinInput.csv' USING PigStorage(',') AS (b1:int,b2:int,b3:int); 3 | X = JOIN A BY a1, B BY b1; 4 | Y = FILTER X BY a1 == 4; 5 | STORE Y INTO '$outfile'; 6 | -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/plan/rewriting/internals/package.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.plan.rewriting 2 | 3 | /** This package is not meant for public use; it only provides functions for various parts of the 
[[dbis.piglet.plan.rewriting.Rewriter]] object. 5 | */ 6 | package object internals { 7 | } 8 | -------------------------------------------------------------------------------- /flinklib/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | backends { 2 | flink { 3 | name = "flinklib" 4 | template = "flink-template.stg" 5 | connector = "PigStorage" 6 | } 7 | 8 | flinks { 9 | name = "flinks" 10 | template = "flinks-template.stg" 11 | connector = "PigStream" 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /sparklib/src/test/scala/dbis/piglet/backends/spark/Person.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.backends.spark 2 | 3 | import dbis.piglet.backends.SchemaClass 4 | 5 | case class Person(name: String, age: Int) extends java.io.Serializable with SchemaClass { 6 | override def mkString(delim: String) = s"$name$delim$age" 7 | } 8 | -------------------------------------------------------------------------------- /src/it/resources/selfjoin_ambiguous_fieldnames.pig: -------------------------------------------------------------------------------- 1 | A = LOAD '$inbase/input/joinInput.csv' USING PigStorage(',') AS (a1:int,a2:int,a3:int); 2 | B = LOAD '$inbase/input/joinInput.csv' USING PigStorage(',') AS (a1:int,a2:int,a3:int); 3 | X = JOIN A BY a1, B BY a1; 4 | Y = ORDER X BY B::a1 ASC; 5 | STORE Y INTO '$outfile'; 6 | -------------------------------------------------------------------------------- /src/it/resources/union.pig: -------------------------------------------------------------------------------- 1 | A = LOAD '$inbase/input/file.csv' USING PigStorage(',') AS (f1:int, f2: int); 2 | B = LOAD '$inbase/input/file.csv' USING PigStorage(',') AS (f1:int, f2: int); 3 | C = LOAD '$inbase/input/file.csv' USING PigStorage(',') AS (f1:int, f2: int); 4 | D = UNION A, B, C; 5 | STORE D INTO '$outfile'; 6 | -------------------------------------------------------------------------------- /src/it/resources/groupforeach.pig: -------------------------------------------------------------------------------- 1 | -- triples = LOAD '$inbase/input/rdf-data.nt' USING RDFFileStorage AS (subject: chararray, predicate: chararray, object: chararray); 2 | triples = RDFLOAD('$inbase/input/rdf-data.nt'); 3 | stmts = GROUP triples BY subject; 4 | tmp = FOREACH stmts GENERATE *; 5 | STORE tmp INTO '$outfile'; 6 | -------------------------------------------------------------------------------- /src/it/resources/streaming/union.pig: -------------------------------------------------------------------------------- 1 | A = LOAD '$inbase/input/file.csv' USING PigStream(',') AS (f1:int, f2: int); 2 | B = LOAD '$inbase/input/file.csv' USING PigStream(',') AS (f1:int, f2: int); 3 | C = LOAD '$inbase/input/file.csv' USING PigStream(',') AS (f1:int, f2: int); 4 | D = UNION A, B, C; 5 | STORE D INTO '$outfile'; 6 | -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/tools/ProductTools.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.tools 2 | 3 | case class ProductTools(p: Product) { 4 | def mkString(sep: String = ",") = p.productIterator.mkString(sep) 5 | } 6 | 7 | object ProductTools { 8 | implicit def productMkString(p: Product): ProductTools = ProductTools(p) 9 | } 10 | 
-------------------------------------------------------------------------------- /sparklib/src/main/resources/application.conf: -------------------------------------------------------------------------------- 1 | backends { 2 | name = "sparklib" 3 | spark { 4 | # if in src/main/resources, file name is enough 5 | template = "spark-template.stg" 6 | connector = "PigStorage" 7 | } 8 | sparks { 9 | template = "sparks-template.stg" 10 | connector = "PigStream" 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /common/src/main/scala/dbis/piglet/tools/HdfsCommand.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.tools 2 | 3 | object HdfsCommand extends Enumeration { 4 | type HdfsCommand = Value 5 | val COPYTOLOCAL, 6 | COPYTOREMOTE, 7 | RM, 8 | RMDIR, 9 | MKDIR, 10 | LS, 11 | CAT, 12 | GETMERGE = Value 13 | 14 | } -------------------------------------------------------------------------------- /src/it/resources/rdf_pathjoin_plain.pig: -------------------------------------------------------------------------------- 1 | a = LOAD '$inbase/input/sibdataset.nt' using PigStorage(' ') as (subject: chararray, predicate: chararray, object:chararray); 2 | b = BGP_FILTER a BY { 3 | ?user "" ?person . 4 | ?person "" ?email 5 | }; 6 | STORE b INTO '$outfile'; -------------------------------------------------------------------------------- /src/it/resources/spatialfilter.pig: -------------------------------------------------------------------------------- 1 | a = load '$inbase/input/events.csv' using PigStorage(',') as (name: chararray, lat: double, lon: chararray); 2 | b = foreach a GENERATE name, geometry("POINT("+lat+" "+lon+")") as loc; 3 | c = SPATIAL_FILTER b BY containedby(loc, geometry("POINT(50.1 10.2)")); 4 | STORE c INTO '$outfile'; 5 | -- DUMP c; 6 | -------------------------------------------------------------------------------- /src/it/resources/nforeach.pig: -------------------------------------------------------------------------------- 1 | daily = load '$inbase/input/nested.csv' using PigStorage(',') as (exchange, symbol); 2 | grpd = group daily by exchange; 3 | uniqcnt = foreach grpd { 4 | sym = daily.symbol; 5 | uniq_sym = distinct sym; 6 | generate group, COUNT(uniq_sym); 7 | }; 8 | store uniqcnt into '$outfile'; -------------------------------------------------------------------------------- /src/it/resources/rdf_starjoin_plain.pig: -------------------------------------------------------------------------------- 1 | a = LOAD '$inbase/input/sibdataset.nt' using PigStorage(' ') as (subject: chararray, predicate: chararray, object:chararray); 2 | b = BGP_FILTER a BY { 3 | ?person "" ?f . 
4 | ?person "" ?l 5 | }; 6 | STORE b INTO '$outfile'; -------------------------------------------------------------------------------- /src/it/resources/truth/rdf_pathjoin_plain.data: -------------------------------------------------------------------------------- 1 | "Ling671@gmail.com",, 2 | "Jean-Pierre149@gmail.com",, 3 | "Julia228@yahoo.com",, -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/codegen/flink/emitter/DumpEmitter.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.codegen.flink.emitter 2 | 3 | class DumpEmitter extends dbis.piglet.codegen.scala_lang.DumpEmitter { 4 | override def template: String = """ .map(_.mkString()).print""".stripMargin 5 | } 6 | 7 | object DumpEmitter { 8 | lazy val instance = new DumpEmitter 9 | } -------------------------------------------------------------------------------- /src/it/resources/windowCount.pig: -------------------------------------------------------------------------------- 1 | input = load '$inbase/input/mary.txt' using TextLoader() as (line); 2 | words = foreach input generate flatten(TOKENIZE(line)) as word; 3 | win = window words range 10 seconds slide range 10 seconds; 4 | grpd = group win by word; 5 | cntd = foreach grpd generate group, COUNT(win); 6 | -- dump cntd; 7 | store cntd into '$outfile'; 8 | -------------------------------------------------------------------------------- /flinklib/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | hadoop.root.logger=WARN, console 2 | log4j.rootLogger =WARN, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.target=System.out 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n 7 | -------------------------------------------------------------------------------- /src/it/resources/spatialfilterwithindex.pig: -------------------------------------------------------------------------------- 1 | a = load '$inbase/input/events.csv' using PigStorage(',') as (name: chararray, lat: double, lon: chararray); 2 | b = foreach a GENERATE name, geometry("POINT("+lat+" "+lon+")") as loc; 3 | c = SPATIAL_FILTER b BY containedby(loc, geometry("POINT(50.1 10.2)")) using index rtree(order=2); 4 | STORE c INTO '$outfile'; 5 | -- DUMP c; 6 | -------------------------------------------------------------------------------- /src/it/resources/windowCross.pig: -------------------------------------------------------------------------------- 1 | A = LOAD '$inbase/input/joinInput.csv' USING PigStream(',') AS (a1:int,a2:int,a3:int); 2 | B = LOAD '$inbase/input/joinInput.csv' USING PigStream(',') AS (b1:int,b2:int,b3:int); 3 | C = WINDOW A RANGE 10 seconds SLIDE RANGE 10 seconds; 4 | D = WINDOW B RANGE 10 seconds SLIDE RANGE 10 seconds; 5 | X = CROSS C, D; 6 | STORE X INTO '$outfile'; 7 | -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/codegen/flink/emitter/StreamDumpEmitter.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.codegen.flink.emitter 2 | 3 | class StreamDumpEmitter extends dbis.piglet.codegen.scala_lang.DumpEmitter { 4 | override def template: String = """ .map(_.mkString()).print""".stripMargin 5 | } 6 | 7 | object StreamDumpEmitter { 8 | lazy val 
instance = new StreamDumpEmitter 9 | } -------------------------------------------------------------------------------- /src/it/resources/windowJoin.pig: -------------------------------------------------------------------------------- 1 | A = LOAD '$inbase/input/joinInput.csv' USING PigStream(',') AS (a1:int,a2:int,a3:int); 2 | B = LOAD '$inbase/input/joinInput.csv' USING PigStream(',') AS (b1:int,b2:int,b3:int); 3 | C = WINDOW A RANGE 10 seconds SLIDE RANGE 10 seconds; 4 | D = WINDOW B RANGE 10 seconds SLIDE RANGE 10 seconds; 5 | X = JOIN C BY a1, D BY b1; 6 | STORE X INTO '$outfile'; 7 | -------------------------------------------------------------------------------- /ceplib/build.sbt: -------------------------------------------------------------------------------- 1 | import Dependencies._ 2 | 3 | name := "ceplib" 4 | 5 | libraryDependencies ++= Seq( 6 | scalaCompiler, 7 | scalaTest % "test" withSources(), 8 | sparkCore % "provided", 9 | sparkStreaming % "provided", 10 | flinkScala % "provided", 11 | flinkStreaming % "provided", 12 | typesafe, 13 | log4j 14 | ) 15 | 16 | test in assembly := {} 17 | -------------------------------------------------------------------------------- /src/it/resources/rscript.pig: -------------------------------------------------------------------------------- 1 | A = LOAD 'src/it/resources/input/cluster-data.csv' USING PigStorage(',') AS (x: double, y: double); 2 | B = RSCRIPT A USING 'library(fpc);db = dbscan($_, eps=.3, MinPts=5);cluster = cbind(inp, data.frame(db$cluster + 1L)); res = data.matrix(cluster)'; 3 | RES = FOREACH B GENERATE $0 AS x: double, $1 AS y: double, $2 AS cluster: int; 4 | STORE RES INTO 'cluster.out'; 5 | -------------------------------------------------------------------------------- /src/it/resources/nforeach2.pig: -------------------------------------------------------------------------------- 1 | triples = RDFLOAD('$inbase/input/rdf-data.nt'); 2 | stmts = GROUP triples BY subject; 3 | tmp = FOREACH stmts { 4 | r1 = FILTER triples BY (predicate == ""); 5 | r2 = FILTER triples BY (predicate == ""); 6 | GENERATE *, COUNT(r1) AS cnt1, COUNT(r2) AS cnt2; 7 | }; 8 | STORE tmp INTO '$outfile'; 9 | -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/codegen/flink/emitter/StreamSampleEmitter.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.codegen.flink.emitter 2 | 3 | class StreamSampleEmitter extends dbis.piglet.codegen.scala_lang.SampleEmitter { 4 | override def template: String = """ val = .filter(t => util.Random.nextDouble \<= )""".stripMargin 5 | } 6 | 7 | object StreamSampleEmitter { 8 | lazy val instance = new StreamSampleEmitter 9 | } -------------------------------------------------------------------------------- /src/it/resources/spatialpartitioning.pig: -------------------------------------------------------------------------------- 1 | a = load '$inbase/input/events.csv' using PigStorage(',') as (name: chararray, lat: double, lon: chararray); 2 | b = foreach a GENERATE name, geometry("POINT("+lat+" "+lon+")") as loc; 3 | c = partition b on loc using grid(partitionsPerDimension=4, withExtent=false); 4 | d = SPATIAL_FILTER c BY containedby(loc, geometry("POINT(50.1 10.2)")); 5 | STORE d INTO '$outfile'; 6 | -- DUMP c; 7 | -------------------------------------------------------------------------------- /src/it/resources/windowNforeach.pig: 
-------------------------------------------------------------------------------- 1 | daily = load '$inbase/input/nested.csv' using PigStream(',') as (exchange, symbol); 2 | win = window daily range 10 seconds slide range 10 seconds; 3 | grpd = group win by exchange; 4 | uniqcnt = foreach grpd { 5 | sym = win.symbol; 6 | uniq_sym = distinct sym; 7 | generate group, COUNT(uniq_sym); 8 | }; 9 | store uniqcnt into '$outfile'; 10 | -------------------------------------------------------------------------------- /src/it/resources/crossmany.pig: -------------------------------------------------------------------------------- 1 | A = LOAD '$inbase/input/file.csv' USING PigStorage(',') AS (f1:chararray, f2: int); 2 | B = LOAD '$inbase/input/file.csv' USING PigStorage(',') AS (f1:int, f2: int); 3 | C = LOAD '$inbase/input/file.csv' USING PigStorage(',') AS (f1:int, f2: chararray); 4 | D = LOAD '$inbase/input/file.txt' USING PigStorage(',') AS (f1: chararray); 5 | E = CROSS A, B, C, D; 6 | STORE E INTO '$outfile'; 7 | -- DUMP E; 8 | -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/codegen/flink/emitter/LoadEmitter.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.codegen.flink.emitter 2 | 3 | class LoadEmitter extends dbis.piglet.codegen.scala_lang.LoadEmitter { 4 | override def template: String = """val = []().load(env, "", , )""".stripMargin 5 | 6 | } 7 | 8 | object LoadEmitter { 9 | lazy val instance = new LoadEmitter 10 | } -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/codegen/flink/emitter/StreamStoreEmitter.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.codegen.flink.emitter 2 | 3 | class StreamStoreEmitter extends dbis.piglet.codegen.scala_lang.StoreEmitter { 4 | override def template: String = """ []().writeStream("", , )""".stripMargin 5 | } 6 | 7 | object StreamStoreEmitter { 8 | lazy val instance = new StreamStoreEmitter 9 | } -------------------------------------------------------------------------------- /zeppelin/build.sbt: -------------------------------------------------------------------------------- 1 | import Dependencies._ 2 | 3 | name := "piglet-interpreter" 4 | 5 | 6 | libraryDependencies ++= Seq( 7 | sparkCore % "provided", 8 | sparkSql % "provided", 9 | "org.apache.spark" %% "spark-repl" % "1.5.0", 10 | "org.apache.zeppelin" % "zeppelin-interpreter" % "0.5.0-incubating" 11 | ) 12 | 13 | dependencyOverrides += "org.slf4j" % "slf4j-log4j12" % "1.7.5" 14 | 15 | test in assembly := {} 16 | -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/codegen/flink/emitter/StreamLoadEmitter.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.codegen.flink.emitter 2 | 3 | class StreamLoadEmitter extends dbis.piglet.codegen.scala_lang.LoadEmitter { 4 | override def template: String = """ val = []().loadStream(env, "", , )""".stripMargin 5 | } 6 | 7 | object StreamLoadEmitter { 8 | lazy val instance = new StreamLoadEmitter 9 | } -------------------------------------------------------------------------------- /src/it/resources/macro1.pig: -------------------------------------------------------------------------------- 1 | DEFINE my_macro(in_alias, p) RETURNS out_alias { 2 | $out_alias = FOREACH $in_alias GENERATE $0 + $p, $1; 3 | }; 4 
| 5 | DEFINE my_macro2(in_alias, p) RETURNS out_alias { 6 | $out_alias = FOREACH $in_alias GENERATE $0, $1 - $p; 7 | }; 8 | 9 | in = LOAD '$inbase/input/file.csv' USING PigStorage(',') AS (f1:int, f2: int); 10 | out = my_macro(in, 42); 11 | out2 = my_macro2(out, 1); 12 | 13 | STORE out2 INTO '$outfile'; 14 | -------------------------------------------------------------------------------- /src/it/resources/skyline.pig: -------------------------------------------------------------------------------- 1 | REGISTER 'eventlib/target/scala-2.11/eventlib_2.11-1.0.jar'; 2 | A = LOAD 'src/it/resources/events.csv' USING PigStorage(',') AS (id: chararray, longitude: double, latitude: double); 3 | B = FOREACH A GENERATE id, dbis.events.Distances.spatialDistance(longitude, latitude, 50.0, 10.0) AS dist: double; 4 | C = STREAM B THROUGH dbis.events.Skyline.process(2, 5, "eventDominates") AS (id: chararray, dist: double); 5 | STORE C INTO 'skyline.out'; 6 | -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/codegen/flink/emitter/StoreEmitter.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.codegen.flink.emitter 2 | 3 | class StoreEmitter extends dbis.piglet.codegen.scala_lang.StoreEmitter { 4 | override def template: String = """ []().write("", , ) 5 | | env.execute("Starting Query")""".stripMargin 6 | } 7 | 8 | object StoreEmitter { 9 | lazy val instance = new StoreEmitter 10 | } -------------------------------------------------------------------------------- /src/it/resources/two_joins.pig: -------------------------------------------------------------------------------- 1 | A = LOAD '$inbase/input/joinInput.csv' USING PigStorage(',') AS (a1:int,a2:int,a3:int); 2 | B = LOAD '$inbase/input/joinInput.csv' USING PigStorage(',') AS (b1:int,b2:int,b3:int); 3 | 4 | BF1 = FILTER B BY b1 == 4; 5 | BF2 = FILTER B BY b3 == 1; 6 | 7 | X = JOIN A BY a1, BF1 BY b1; 8 | X2 = JOIN A BY a1, BF2 BY b1; 9 | 10 | x0 = FOREACH X GENERATE a1, b1; 11 | x1 = FOREACH X2 GENERATE a1, b1; 12 | u = UNION x0, x1; 13 | 14 | STORE u INTO '$outfile'; 15 | -------------------------------------------------------------------------------- /ceplib/src/main/scala/dbis/piglet/cep/ops/Outputter.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.cep.ops 2 | import scala.reflect.ClassTag 3 | import dbis.piglet.backends.{SchemaClass => Event} 4 | package object Outputter { 5 | def convertEventsToArray[T <: Event: ClassTag](collector: MatchCollector[T]): Any = { 6 | collector.convertEventsToArray() 7 | } 8 | def convertEventsToBoolean[T <: Event: ClassTag](collector: MatchCollector[T]): Any = { 9 | collector.convertEventsToBoolean() 10 | } 11 | } -------------------------------------------------------------------------------- /sparklib/src/main/scala/dbis/piglet/backends/spark/SparkStream.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.backends.spark 2 | 3 | import org.apache.spark._ 4 | import org.apache.spark.streaming._ 5 | 6 | object SparkStream { 7 | lazy val conf = new SparkConf() 8 | lazy val cx = new SparkContext(conf) 9 | lazy val ssc = new StreamingContext(cx, Seconds(1)) 10 | 11 | def setAppName(appName: String) = conf.setAppName(appName) 12 | def setMaster(master: String) = conf.setMaster(master) 13 | } 14 | -------------------------------------------------------------------------------- 
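A short usage sketch for the SparkStream helper above (illustrative only, not a file of this repository): a generated Spark Streaming driver would set the application name and master first and only then force the lazily built StreamingContext, because touching ssc also freezes conf and cx. The job name, host, and port below are hypothetical.

// Hypothetical driver; SparkStream is the sparklib object shown above.
SparkStream.setAppName("piglet-job")       // configure before ssc is forced
SparkStream.setMaster("local[2]")
val ssc = SparkStream.ssc                  // forces conf -> cx -> ssc (1-second batches)
ssc.socketTextStream("localhost", 9999).print()
ssc.start()
ssc.awaitTermination()
--------------------------------------------------------------------------------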
/src/it/resources/truth/bgpfilter.data: -------------------------------------------------------------------------------- 1 | ,, 2 | ,, 3 | ,, 4 | ,, -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/codegen/spark/StreamDumpEmitter.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.codegen.spark 2 | 3 | import dbis.piglet.codegen.scala_lang.DumpEmitter 4 | 5 | /** 6 | * Created by kai on 12.12.16. 7 | */ 8 | 9 | class StreamDumpEmitter extends DumpEmitter { 10 | override def template: String = """ .foreachRDD(rdd => rdd.foreach(elem => println(elem.mkString())))""".stripMargin 11 | } 12 | 13 | object StreamDumpEmitter { 14 | lazy val instance = new StreamDumpEmitter 15 | } -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/codegen/spark/StreamStoreEmitter.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.codegen.spark 2 | 3 | import dbis.piglet.codegen.scala_lang.StoreEmitter 4 | 5 | /** 6 | * Created by kai on 12.12.16. 7 | */ 8 | 9 | class StreamStoreEmitter extends StoreEmitter { 10 | override def template: String = """ []().writeStream("", , )""".stripMargin 11 | } 12 | 13 | object StreamStoreEmitter { 14 | lazy val instance = new StreamStoreEmitter 15 | } -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/codegen/spark/StreamLoadEmitter.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.codegen.spark 2 | 3 | import dbis.piglet.codegen.scala_lang.LoadEmitter 4 | 5 | /** 6 | * Created by kai on 12.12.16. 
7 | */ 8 | class StreamLoadEmitter extends LoadEmitter { 9 | override def template: String = """ val = []().loadStream(ssc, "", , )""".stripMargin 10 | } 11 | 12 | object StreamLoadEmitter { 13 | lazy val instance = new StreamLoadEmitter 14 | } -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/codegen/flink/emitter/StreamOpEmitter.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.codegen.flink.emitter 2 | 3 | class StreamOpEmitter extends dbis.piglet.codegen.scala_lang.StreamOpEmitter { 4 | override def template: String = """ val _helper = .map(t => List()) 5 | | val = (env, _helper).map(t => ())""".stripMargin 6 | 7 | } 8 | 9 | object StreamOpEmitter { 10 | lazy val instance = new StreamOpEmitter 11 | } -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/tools/UpdateMap.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.tools 2 | 3 | import scala.collection.mutable.{Map => MutableMap} 4 | 5 | class UpdateMap[K,V](m: MutableMap[K,V]) { 6 | 7 | def insertOrUpdate(k: K)( f: Option[V] => V): Unit = { 8 | 9 | if(m.contains(k)) { 10 | m(k) = f(Some(m(k))) 11 | } else { 12 | m(k) = f(None) 13 | } 14 | } 15 | } 16 | 17 | object UpdateMap { 18 | implicit def createUpdateMap[K,V](m: MutableMap[K,V]): UpdateMap[K,V] = new UpdateMap[K,V](m) 19 | } 20 | -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/codegen/scala_lang/EmptyEmitter.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.codegen.scala_lang 2 | 3 | import dbis.piglet.codegen.{CodeEmitter, CodeGenContext, CodeGenException} 4 | import dbis.piglet.op.Empty 5 | 6 | /** 7 | * Created by kai on 09.12.16. 
8 | */ 9 | class EmptyEmitter extends CodeEmitter[Empty] { 10 | override def template: String = "" 11 | 12 | override def code(ctx: CodeGenContext, node: Empty): String = template 13 | } 14 | 15 | object EmptyEmitter { 16 | lazy val instance = new EmptyEmitter 17 | } -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/op/Intersection.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.op 2 | 3 | case class Intersection(out: Pipe, in1: Pipe, in2: Pipe) extends PigOperator(List(out), List(in1, in2)) { 4 | 5 | override def lineageString: String = { 6 | s"""INTERSECTION%""" + super.lineageString 7 | } 8 | 9 | override def toString = 10 | s"""INTERSECTION 11 | | out = $outPipeName 12 | | ins = ${inPipeNames.mkString(",")} 13 | | inSchema = $inputSchema 14 | | outSchema = $schema""".stripMargin 15 | 16 | } 17 | -------------------------------------------------------------------------------- /flinklib/build.sbt: -------------------------------------------------------------------------------- 1 | import Dependencies._ 2 | 3 | name := "flinklib" 4 | 5 | libraryDependencies ++= Seq( 6 | scalaCompiler, 7 | scalaTest % "test" withSources(), 8 | jeromq, 9 | flinkScala % "provided", 10 | flinkStreaming % "provided", 11 | typesafe, 12 | log4j, 13 | akkaLogging 14 | ) 15 | 16 | resolvers += "Sonatype (releases)" at "https://oss.sonatype.org/content/repositories/releases/" 17 | 18 | scalacOptions ++= Seq("-feature","-language:implicitConversions") 19 | 20 | test in assembly := {} 21 | logLevel in assembly := Level.Error 22 | // 23 | -------------------------------------------------------------------------------- /setm/src/main/scala/dbis/setm/Main.scala: -------------------------------------------------------------------------------- 1 | package dbis.setm 2 | 3 | import dbis.setm.SETM._ 4 | 5 | object Main { 6 | 7 | def myFunction(s: String) = timing("greeting func") { 8 | // complex operations, e.g. 
9 | (0 until 100).foreach(i => println(s"Hello $s")) 10 | } 11 | 12 | def main(args: Array[String]) { 13 | 14 | timing("program total") { 15 | 16 | val names = timing("create names") { Array("Tick","Trick","Track") } 17 | 18 | for(name <- names) 19 | myFunction(name) 20 | 21 | } 22 | 23 | collect() 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /sparklib/build.sbt: -------------------------------------------------------------------------------- 1 | import Dependencies._ 2 | 3 | name := "sparklib" 4 | 5 | libraryDependencies ++= Seq( 6 | scalaCompiler, 7 | scalaTest % "test" withSources(), 8 | sparkCore % "provided", 9 | //sparkREPL % "provided", // doesn't work yet due to some incompatibilities with jetty 10 | sparkSql % "provided", 11 | sparkStreaming % "provided", 12 | typesafe, 13 | //scalikejdbc, 14 | //scalikejdbc_config, 15 | jdbc, 16 | json4s 17 | ) 18 | 19 | test in assembly := {} 20 | 21 | scalacOptions ++= Seq("-feature","-language:implicitConversions") 22 | 23 | sourcesInBase := false 24 | -------------------------------------------------------------------------------- /src/it/resources/truth/crossed.data: -------------------------------------------------------------------------------- 1 | 1,2,3,1,2,3 2 | 1,2,3,4,2,1 3 | 1,2,3,8,3,4 4 | 1,2,3,4,3,3 5 | 1,2,3,7,2,5 6 | 1,2,3,8,4,3 7 | 4,2,1,1,2,3 8 | 4,2,1,4,2,1 9 | 4,2,1,8,3,4 10 | 4,2,1,4,3,3 11 | 4,2,1,7,2,5 12 | 4,2,1,8,4,3 13 | 8,3,4,1,2,3 14 | 8,3,4,4,2,1 15 | 8,3,4,8,3,4 16 | 8,3,4,4,3,3 17 | 8,3,4,7,2,5 18 | 8,3,4,8,4,3 19 | 4,3,3,1,2,3 20 | 4,3,3,4,2,1 21 | 4,3,3,8,3,4 22 | 4,3,3,4,3,3 23 | 4,3,3,7,2,5 24 | 4,3,3,8,4,3 25 | 7,2,5,1,2,3 26 | 7,2,5,4,2,1 27 | 7,2,5,8,3,4 28 | 7,2,5,4,3,3 29 | 7,2,5,7,2,5 30 | 7,2,5,8,4,3 31 | 8,4,3,1,2,3 32 | 8,4,3,4,2,1 33 | 8,4,3,8,3,4 34 | 8,4,3,4,3,3 35 | 8,4,3,7,2,5 36 | 8,4,3,8,4,3 37 | -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/op/Materialize.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.op 2 | 3 | /** 4 | * This represent a MATERIALIZE operator in Pig 5 | * 6 | */ 7 | case class Materialize(private val in: Pipe) extends PigOperator(List(), List(in)) { 8 | 9 | /** 10 | * Returns the lineage string describing the sub-plan producing the input for this operator. 11 | * 12 | * @return a string representation of the sub-plan. 
13 | */ 14 | override def lineageString: String = { 15 | s"""MATERIALIZE%""" + super.lineageString 16 | } 17 | 18 | override def toString = 19 | s"""MATERIALIZE 20 | | in = $inPipeName 21 | """.stripMargin 22 | } -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | logLevel := Level.Warn 2 | 3 | // resolvers += Resolver.url( 4 | // "bintray-sbt-plugin-releases", 5 | // url("http://dl.bintray.com/content/sbt/sbt-plugin-releases"))( 6 | // Resolver.ivyStylePatterns) 7 | // 8 | // addSbtPlugin("me.lessis" % "bintray-sbt" % "0.3.0") 9 | addSbtPlugin("com.eed3si9n" % "sbt-buildinfo" % "0.7.0") 10 | 11 | addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.5.1") 12 | 13 | addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0") 14 | 15 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "5.2.4") 16 | 17 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.9.0") 18 | -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/codegen/flink/emitter/StreamDistinctEmitter.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.codegen.flink.emitter 2 | 3 | import dbis.piglet.codegen.CodeGenContext 4 | import dbis.piglet.op.Distinct 5 | import dbis.piglet.codegen.CodeEmitter 6 | 7 | class StreamDistinctEmitter extends dbis.piglet.codegen.scala_lang.DistinctEmitter { 8 | override def template: String = """""".stripMargin 9 | def templateHelper: String = " .toList.distinct" 10 | 11 | def windowApply(ctx: CodeGenContext, op: Distinct): String = { 12 | CodeEmitter.render(templateHelper, Map()) 13 | } 14 | } 15 | 16 | object StreamDistinctEmitter { 17 | lazy val instance = new StreamDistinctEmitter 18 | } -------------------------------------------------------------------------------- /src/it/resources/wordcount.pig: -------------------------------------------------------------------------------- 1 | -- Load input from the file named mary.txt and call the single 2 | -- field in the record 'line'. 3 | input = load '$inbase/input/mary.txt' as (line: chararray); 4 | 5 | -- TOKENIZE splits the line into a field for each word. 6 | -- flatten will take the collection of records returned by 7 | -- TOKENIZE and produce a separate record for each one, calling the single 8 | -- field in the record word. 9 | words = foreach input generate flatten(TOKENIZE(line)) as word; 10 | 11 | -- Now group them together by each word. 12 | grpd = group words by word; 13 | 14 | -- Count them. 15 | cntd = foreach grpd generate group, COUNT(words); 16 | 17 | store cntd into '$outfile'; 18 | -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/codegen/scala_lang/HdfsCmdEmitter.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.codegen.scala_lang 2 | 3 | import dbis.piglet.codegen.{CodeEmitter, CodeGenContext, CodeGenException} 4 | import dbis.piglet.op.PigOperator 5 | import dbis.piglet.op.cmd.HdfsCmd 6 | 7 | /** 8 | * Created by kai on 12.12.16.
9 | */ 10 | class HdfsCmdEmitter extends CodeEmitter[HdfsCmd] { 11 | override def template: String = """HDFSService.process("", )""".stripMargin 12 | 13 | override def code(ctx: CodeGenContext, op: HdfsCmd): String = render(Map("cmd" -> op.cmd, "params" -> s"List(${op.paramString})")) 14 | } 15 | 16 | object HdfsCmdEmitter { 17 | lazy val instance = new HdfsCmdEmitter 18 | } -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/codegen/scala_lang/IntersectionEmitter.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.codegen.scala_lang 2 | 3 | import dbis.piglet.codegen.{CodeEmitter, CodeGenContext} 4 | import dbis.piglet.op.Intersection 5 | 6 | class IntersectionEmitter extends CodeEmitter[Intersection] { 7 | override def template: String = """val = .intersection()""".stripMargin 8 | 9 | 10 | override def code(ctx: CodeGenContext, op: Intersection): String = render(Map("out" -> op.outPipeName, 11 | "in1" -> op.inPipeNames.head, 12 | "in2" -> op.inPipeNames.last 13 | )) 14 | 15 | } 16 | 17 | object IntersectionEmitter { 18 | lazy val instance = new IntersectionEmitter 19 | } -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/op/TimingOp.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.op 2 | 3 | case class TimingOp ( 4 | private[op] val out: Pipe, 5 | private[op] val in: Pipe, 6 | operatorId: String) extends PigOperator(out, in, in.producer.schema ) { 7 | 8 | 9 | 10 | override def equals(other: Any) = other match { 11 | case o: TimingOp => operatorId == o.operatorId && outPipeName == o.outPipeName 12 | case _ => false 13 | } 14 | 15 | override def hashCode() = (operatorId+outPipeName).hashCode() 16 | 17 | override def toString = 18 | s"""TIMING 19 | | out = $outPipeName 20 | | in = $inPipeName 21 | | schema = $schema""".stripMargin 22 | 23 | 24 | } 25 | -------------------------------------------------------------------------------- /src/it/resources/truth/cross2.csv: -------------------------------------------------------------------------------- 1 | Hallo,Hallo 2 | Hallo,this 3 | Hallo,is 4 | Hallo,a 5 | Hallo,small 6 | Hallo,text 7 | Hallo,file. 8 | this,Hallo 9 | this,this 10 | this,is 11 | this,a 12 | this,small 13 | this,text 14 | this,file. 15 | is,Hallo 16 | is,this 17 | is,is 18 | is,a 19 | is,small 20 | is,text 21 | is,file. 22 | a,Hallo 23 | a,this 24 | a,is 25 | a,a 26 | a,small 27 | a,text 28 | a,file. 29 | small,Hallo 30 | small,this 31 | small,is 32 | small,a 33 | small,small 34 | small,text 35 | small,file. 36 | text,Hallo 37 | text,this 38 | text,is 39 | text,a 40 | text,small 41 | text,text 42 | text,file. 43 | file.,Hallo 44 | file.,this 45 | file.,is 46 | file.,a 47 | file.,small 48 | file.,text 49 | file.,file. 50 | -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/codegen/scala_lang/UnionEmitter.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.codegen.scala_lang 2 | 3 | import dbis.piglet.codegen.{CodeEmitter, CodeGenContext, CodeGenException} 4 | import dbis.piglet.op.{Union, PigOperator} 5 | 6 | /** 7 | * Created by kai on 03.12.16. 
8 | */ 9 | class UnionEmitter extends CodeEmitter[Union] { 10 | override def template: String = """val = )}>""".stripMargin 11 | 12 | 13 | override def code(ctx: CodeGenContext, op: Union): String = render(Map("out" -> op.outPipeName, 14 | "in" -> op.inPipeName, 15 | "others" -> op.inPipeNames.tail)) 16 | 17 | } 18 | 19 | object UnionEmitter { 20 | lazy val instance = new UnionEmitter 21 | } -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/codegen/flink/emitter/LimitEmitter.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.codegen.flink.emitter 2 | 3 | import dbis.piglet.codegen.CodeGenContext 4 | import dbis.piglet.op.Limit 5 | 6 | class LimitEmitter extends dbis.piglet.codegen.scala_lang.LimitEmitter { 7 | override def template: String = """ val = .first()""".stripMargin 8 | 9 | override def code(ctx: CodeGenContext, op: Limit): String = { 10 | 11 | val params = Map( 12 | "out" -> op.outPipeName, 13 | "in" -> op.inPipeName, 14 | "num" -> op.num, 15 | "lineage" -> op.lineageSignature) 16 | 17 | render(params) 18 | 19 | } 20 | 21 | } 22 | 23 | object LimitEmitter { 24 | lazy val instance = new LimitEmitter 25 | } -------------------------------------------------------------------------------- /setm/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | logs 3 | project/project 4 | project/target 5 | target 6 | tmp 7 | .history 8 | dist 9 | /.idea 10 | /*.iml 11 | /out 12 | .idea_modules 13 | .classpath 14 | .project 15 | /RUNNING_PID 16 | .settings 17 | .target 18 | /bin 19 | *.jpage 20 | lodhub_data 21 | .cache 22 | .worksheet/ 23 | *.sc 24 | .sbt_completion_cache 25 | .tags 26 | *.bak 27 | *.class 28 | *.log 29 | 30 | __my_script/ 31 | 32 | ##### 33 | # ignore database 34 | db 35 | 36 | # sbt specific 37 | .cache 38 | .cache-* 39 | .history 40 | .lib/ 41 | .scalastyle 42 | dist/* 43 | target/ 44 | lib_managed/ 45 | src_managed/ 46 | project/boot/ 47 | project/plugins/project/ 48 | buildinfo.properties 49 | 50 | # Scala-IDE specific 51 | .scala_dependencies 52 | .worksheet 53 | -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/codegen/scala_lang/SampleEmitter.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.codegen.scala_lang 2 | 3 | import dbis.piglet.codegen.{CodeEmitter, CodeGenContext, CodeGenException} 4 | import dbis.piglet.op.{Sample, PigOperator} 5 | 6 | /** 7 | * Created by kai on 03.12.16. 8 | */ 9 | class SampleEmitter extends CodeEmitter[Sample] { 10 | override def template: String = """ val = .sample(false, )""".stripMargin 11 | 12 | 13 | override def code(ctx: CodeGenContext, op: Sample): String = render(Map("out" -> op.outPipeName, 14 | "in" -> op.inPipeName, 15 | "expr" -> ScalaEmitter.emitExpr(ctx, op.expr))) 16 | 17 | } 18 | 19 | 20 | object SampleEmitter { 21 | lazy val instance = new SampleEmitter 22 | } -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/codegen/spark/CacheEmitter.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.codegen.spark 2 | 3 | import dbis.piglet.codegen.{CodeEmitter, CodeGenContext} 4 | import dbis.piglet.op.Cache 5 | 6 | /** 7 | * Created by hage on 11.07.17. 
8 | */ 9 | class CacheEmitter extends CodeEmitter[Cache] { 10 | override def template: String = "val = .persist(org.apache.spark.storage.StorageLevel.)" 11 | 12 | override def code(ctx: CodeGenContext, node: Cache): String = { 13 | 14 | val mode = node.cacheMode.toString 15 | 16 | val map = Map("out" -> node.outPipeName, "in" -> node.inPipeName, "mode" -> mode) 17 | 18 | render(map) 19 | } 20 | } 21 | 22 | object CacheEmitter { 23 | lazy val instance = new CacheEmitter 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/codegen/scala_lang/TimingEmitter.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.codegen.scala_lang 2 | 3 | import dbis.piglet.codegen.{CodeEmitter, CodeGenContext} 4 | import dbis.piglet.op.TimingOp 5 | 6 | class TimingEmitter extends CodeEmitter[TimingOp] { 7 | override def template = """val = .mapPartitionsWithIndex({case (idx,iter) => 8 | | PerfMonitor.notify(url, "", ,idx, System.currentTimeMillis) 9 | | iter 10 | \},true)""".stripMargin 11 | 12 | override def code(ctx: CodeGenContext, op: TimingOp): String = render(Map( 13 | "out"-> op.outPipeName, 14 | "in" -> op.inPipeName, 15 | "lineage" -> op.operatorId)) 16 | } 17 | 18 | object TimingEmitter { 19 | lazy val instance = new TimingEmitter 20 | } -------------------------------------------------------------------------------- /src/it/resources/spatialjoin.pig: -------------------------------------------------------------------------------- 1 | a1 = load '$inbase/input/events.csv' using PigStorage(',') as (name: chararray, lat: double, lon: chararray); 2 | b1 = foreach a1 GENERATE name, geometry("POINT("+lat+" "+lon+")") as loc; 3 | 4 | a2 = load '$inbase/input/events.csv' using PigStorage(',') as (name: chararray, lat: double, lon: chararray); 5 | b2 = foreach a2 GENERATE name, geometry("POINT("+lat+" "+lon+")") as loc; 6 | 7 | -- loc fields are automatically resolved as we expect the first one to be from the left input 8 | -- and the second one to be from the right input relation. 9 | -- Hence, no need for (b1::loc, b2::loc) disambiguation or renaming 10 | c = SPATIAL_JOIN b1, b2 ON containedby(loc, loc); 11 | 12 | d = foreach c GENERATE b1::name, b2::name; 13 | 14 | --DUMP d; 15 | STORE d INTO '$outfile'; 16 | -------------------------------------------------------------------------------- /src/it/resources/spatialjoinwithindex.pig: -------------------------------------------------------------------------------- 1 | a1 = load '$inbase/input/events.csv' using PigStorage(',') as (name: chararray, lat: double, lon: chararray); 2 | b1 = foreach a1 GENERATE name, geometry("POINT("+lat+" "+lon+")") as loc; 3 | 4 | a2 = load '$inbase/input/events.csv' using PigStorage(',') as (name: chararray, lat: double, lon: chararray); 5 | b2 = foreach a2 GENERATE name, geometry("POINT("+lat+" "+lon+")") as loc; 6 | 7 | -- loc fields are automatically resolved as we expect the first one to be from the left input 8 | -- and the second one to be from the right input relation. 
9 | -- Hence, no need for (b1::loc, b2::loc) disambiguation or renaming 10 | c = SPATIAL_JOIN b1, b2 ON containedby(loc, loc) using index rtree(order = 2); 11 | 12 | d = foreach c GENERATE b1::name, b2::name; 13 | 14 | --DUMP d; 15 | STORE d INTO '$outfile'; 16 | -------------------------------------------------------------------------------- /ceplib/src/main/scala/dbis/piglet/cep/nfa/RelatedValue.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.cep.nfa 2 | import scala.reflect.ClassTag 3 | import dbis.piglet.backends.{SchemaClass => Event} 4 | 5 | case class NotInitializedException(private val msg: String) extends Exception(msg) 6 | 7 | trait RelatedValue[T]{ 8 | def updateValue(event: T): Unit 9 | def initValue(event: T): Unit 10 | def getValue(): Double 11 | } 12 | 13 | abstract class PreviousRelatedValue[T <: Event: ClassTag] extends RelatedValue[T]{ 14 | var value: Option[Double] = None 15 | override def initValue(event: T): Unit = updateValue(event) 16 | override def updateValue(event: T): Unit 17 | override def getValue(): Double = { 18 | value match { 19 | case Some(v) => v 20 | case None => throw NotInitializedException("Related value is not initialized") 21 | } 22 | } 23 | } -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/op/Visualize.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.op 2 | 3 | import dbis.piglet.expr.Ref 4 | 5 | case class Visualize(private val in: Pipe, field: Ref, path: String, width: Int, height: Int, pointSize: Option[Int] = None) extends PigOperator(List(), List(in)) { 6 | 7 | 8 | lazy val (pathNoExt,fileType) = { 9 | val i = path.lastIndexOf(".") 10 | if(i > 0) { 11 | val p = path.substring(0,i) 12 | val ext = path.substring(i+1) 13 | (p,ext) 14 | } 15 | else 16 | (path,"") 17 | } 18 | 19 | 20 | override def lineageString: String = { 21 | s"""VISUALIZE%""" + super.lineageString 22 | } 23 | 24 | override def toString = 25 | s"""VISUALIZE 26 | | in = $inPipeName 27 | | type = $path 28 | | size = $width x $height 29 | | pointSize = $pointSize""".stripMargin 30 | 31 | } 32 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM java:latest 2 | MAINTAINER stefan.hagedorn@tu-ilmenau.de 3 | 4 | COPY /script/piglet /piglet/ 5 | 6 | COPY target/scala-2.11/PigCompiler.jar /piglet/ 7 | ENV PIG_LIB /piglet/PigCompiler.jar 8 | 9 | COPY /sparklib/target/scala-2.11/sparklib_2.11-*.jar /sparklib/target/scala-2.11/ 10 | ENV BACKEND_DIR /sparklib/target/scala-2.11/* 11 | 12 | # enable these to support other backends. 
13 | #COPY /flinklib/target/scala-2.11/flinklib_2.11-*.jar /flinklib/target/scala-2.11/ 14 | #COPY /mapreduce/target/scala-2.11/mapreduce_2.11-*.jar /mapreduce/target/scala-2.11/ 15 | 16 | ENV SPARK_JAR /piglet/spark-assembly-1.5.1-hadoop2.4.0.jar 17 | 18 | 19 | RUN wget -q -P /piglet http://moria.prakinf.tu-ilmenau.de/spark-assembly-1.5.1-hadoop2.4.0.jar 20 | #RUN mv spark-assembly-1.5.1-hadoop2.4.0.jar /piglet/ 21 | 22 | 23 | ENTRYPOINT ["/piglet/piglet"] 24 | CMD ["--help"] 25 | -------------------------------------------------------------------------------- /ceplib/src/main/scala/dbis/piglet/cep/ops/MatchCollector.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.cep.ops 2 | import dbis.piglet.cep.nfa.NFAStructure 3 | import scala.reflect.ClassTag 4 | import scala.collection.mutable.ListBuffer 5 | import scala.collection.mutable.ArrayBuffer 6 | import dbis.piglet.backends.{SchemaClass => Event} 7 | 8 | class MatchCollector[T <: Event: ClassTag] extends Serializable { 9 | var matchSequences: ListBuffer[NFAStructure[T]] = new ListBuffer() 10 | def +(that: NFAStructure[T]): Unit = matchSequences += that 11 | def size: Int = matchSequences.size 12 | def convertEventsToArray(): ArrayBuffer[T] = { 13 | var events: ArrayBuffer[T] = new ArrayBuffer() 14 | matchSequences.foreach (seq => events ++= seq.events) 15 | events 16 | } 17 | def convertEventsToBoolean(): ArrayBuffer[Boolean] = { 18 | ArrayBuffer(matchSequences.size > 0) 19 | } 20 | } -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/op/Partition.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.op 2 | 3 | object PartitionMethod extends Enumeration { 4 | type PartitionMethod = Value 5 | val GRID, BSP, Hash = Value 6 | } 7 | 8 | import PartitionMethod.PartitionMethod 9 | import dbis.piglet.expr.Ref 10 | 11 | case class Partition( 12 | private val out: Pipe, 13 | private val in: Pipe, 14 | field: Ref, 15 | method: PartitionMethod, 16 | params: Seq[String] 17 | ) extends PigOperator(out, in) { 18 | 19 | override def lineageString = 20 | s"""PARTITION%$method%$field%${params.mkString}"""+super.lineageString 21 | 22 | override def toString = 23 | s"""PARTITION 24 | | out = $outPipeName 25 | | in = $inPipeName 26 | | schema = $schema 27 | | field = $field 28 | | method = $method 29 | | params = ${params.mkString(",")}""".stripMargin 30 | } -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/op/Difference.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.op 2 | 3 | import dbis.piglet.expr.Ref 4 | 5 | case class Difference(private val out: Pipe, private val in1: Pipe, private val in2: Pipe, 6 | refs1: Option[List[Ref]] = None, 7 | refs2: Option[List[Ref]] = None 8 | ) extends PigOperator(List(out), List(in1, in2)) { 9 | 10 | override def lineageString: String = { 11 | s"""DIFFERENCE%""" + super.lineageString 12 | } 13 | 14 | override def toString = 15 | s"""DIFFERENCE 16 | | out = $outPipeName 17 | | ins = ${inPipeNames.mkString(",")}, 18 | | refs1 = ${refs1.map(_.mkString(",")).getOrElse("--")}, 19 | | refs2 = ${refs2.map(_.mkString(",")).getOrElse("--")}, 20 | | inSchema = $inputSchema 21 | | outSchema = $schema""".stripMargin 22 | 23 | } 24 | --------------------------------------------------------------------------------
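A minimal usage sketch for the MatchCollector class above (illustrative only, not a file of this repository): a CEP engine appends each completed NFA run to the collector, and the Outputter helpers later drain it either as the flat sequence of matched events or as a single match/no-match flag. MyEvent and completedRun below are hypothetical stand-ins.

// Hypothetical code; MatchCollector and NFAStructure come from ceplib above.
val collector = new MatchCollector[MyEvent]()
collector + completedRun                          // record one finished NFAStructure[MyEvent]
val events = collector.convertEventsToArray()     // ArrayBuffer[MyEvent]: all matched events, concatenated
val matched = collector.convertEventsToBoolean()  // ArrayBuffer with a single flag: was anything matched?
--------------------------------------------------------------------------------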
/src/main/scala/dbis/piglet/codegen/scala_lang/DistinctEmitter.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.codegen.scala_lang 2 | 3 | import dbis.piglet.codegen.{CodeEmitter, CodeGenContext, CodeGenException} 4 | import dbis.piglet.op.{Distinct, PigOperator} 5 | 6 | /** 7 | * Created by kai on 03.12.16. 8 | */ 9 | class DistinctEmitter extends CodeEmitter[Distinct] { 10 | override def template: String = """val = .distinct.map{t => 11 | | PerfMonitor.sampleSize(t,"", accum, randFactor) 12 | | t 13 | |}""".stripMargin 14 | 15 | 16 | override def code(ctx: CodeGenContext, op: Distinct): String = 17 | render(Map("out" -> op.outPipeName, "in" -> op.inPipeName, "lineage" -> op.lineageSignature)) 18 | 19 | } 20 | 21 | object DistinctEmitter { 22 | lazy val instance = new DistinctEmitter 23 | } -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/codegen/spark/VisualizeEmitter.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.codegen.spark 2 | 3 | import dbis.piglet.codegen.{CodeEmitter, CodeGenContext} 4 | import dbis.piglet.op.Visualize 5 | 6 | class VisualizeEmitter extends CodeEmitter[Visualize] { 7 | override def template: String = s""".visualize(,, "", "")""" 8 | 9 | override def code(ctx: CodeGenContext, op: Visualize): String = { 10 | val m = Map( 11 | "in" -> op.inPipeName, 12 | "width" -> op.width, 13 | "height" -> op.height, 14 | "path" -> op.pathNoExt, 15 | "ext" -> op.fileType, 16 | "keyby" -> SpatialEmitterHelper.keyByCode(op.schema,op.field, ctx), 17 | "pointsize" -> op.pointSize.map(p => s",pointSize = $p").getOrElse("") 18 | ) 19 | 20 | render(m) 21 | } 22 | } 23 | 24 | object VisualizeEmitter { 25 | lazy val instance = new VisualizeEmitter 26 | } 27 | -------------------------------------------------------------------------------- /ceplib/src/main/scala/dbis/piglet/cep/spark/CustomRDDMatcher.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.cep.spark 2 | 3 | import org.apache.spark.SparkContext 4 | import dbis.piglet.cep.ops.SelectionStrategy._ 5 | import dbis.piglet.cep.ops.OutputStrategy._ 6 | import org.apache.spark.rdd._ 7 | import scala.reflect.ClassTag 8 | import dbis.piglet.cep.nfa.NFAController 9 | import dbis.piglet.backends.{SchemaClass => Event} 10 | 11 | class CustomRDDMatcher[T <: Event: ClassTag](rdd: RDD[T]) { 12 | 13 | def matchNFA(nfa: NFAController[T], sstr: SelectionStrategy = FirstMatch, out: OutputStrategy = Combined) = { 14 | // println("create a new RDD matcher") 15 | val newRDD = rdd.coalesce(1, true) 16 | new RDDMatcher(newRDD, nfa, sstr, out) 17 | } 18 | 19 | } 20 | 21 | object CustomRDDMatcher { 22 | 23 | implicit def addRDDMatcher[T <: Event: ClassTag](rdd: RDD[T]) = { 24 | // println("add a custom RDD function") 25 | new CustomRDDMatcher(rdd) 26 | } 27 | } -------------------------------------------------------------------------------- /ceplib/src/main/scala/dbis/piglet/cep/spark/CustomDStreamMatcher.scala: -------------------------------------------------------------------------------- 1 | 2 | package dbis.piglet.cep.spark 3 | 4 | import org.apache.spark.SparkContext._ 5 | import org.apache.spark.streaming.dstream._ 6 | import scala.reflect.ClassTag 7 | import dbis.piglet.cep.ops.SelectionStrategy._ 8 | import dbis.piglet.cep.ops.OutputStrategy._ 9 | import dbis.piglet.cep.nfa.NFAController 10 | import 
dbis.piglet.backends.{SchemaClass => Event} 11 | 12 | class CustomDStreamMatcher[T <: Event: ClassTag](dstream: DStream[T]) { 13 | 14 | def matchNFA(nfa: NFAController[T], sstr: SelectionStrategy = FirstMatch, out: OutputStrategy = Combined) = { 15 | // println("create a new DStream matcher") 16 | new DStreamMatcher(dstream, nfa, sstr, out) 17 | } 18 | 19 | } 20 | 21 | object CustomDStreamMatcher { 22 | 23 | implicit def addDStreamMatcher[T <: Event: ClassTag](dstream: DStream[T]) = { 24 | // println("add a custom DStream function") 25 | new CustomDStreamMatcher(dstream) 26 | } 27 | } -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/plan/rewriting/rulesets/Ruleset.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package dbis.piglet.plan.rewriting.rulesets 18 | 19 | trait Ruleset { 20 | def registerRules(): Unit 21 | } 22 | -------------------------------------------------------------------------------- /flinklib/src/main/scala/dbis/piglet/backends/flink/streaming/FlinkExtensions.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.backends.flink.streaming 2 | 3 | /* 4 | import org.apache.flink.streaming.api.scala.WindowedDataStream 5 | import org.apache.flink.streaming.api.scala.createTypeInformation 6 | import org.apache.flink.util.Collector 7 | 8 | 9 | object FlinkExtensions { 10 | implicit class EnhancedWindowedDataStream(val w: WindowedDataStream[List[Any]]) { 11 | 12 | def distinct = w.mapWindow(distinctMapFunction _) 13 | 14 | private def distinctMapFunction(ts: Iterable[List[Any]], out: Collector[List[Any]]) ={ 15 | ts.toList.distinct.foreach{ x => out.collect(x) } 16 | } 17 | } 18 | 19 | implicit class EnhancedWindowedDataStreamString(val w: WindowedDataStream[List[String]]) { 20 | 21 | def distinct = w.mapWindow(distinctMapFunction _) 22 | 23 | private def distinctMapFunction(ts: Iterable[List[String]], out: Collector[List[String]]) ={ 24 | ts.toList.distinct.foreach{ x => out.collect(x) } 25 | } 26 | } 27 | }*/ 28 | -------------------------------------------------------------------------------- /materialization_scripts/gdelt_gold_tone_roi.pig: -------------------------------------------------------------------------------- 1 | gdelt = LOAD '$gdelt' using PigStorage(); 2 | fields = FOREACH gdelt GENERATE $0 as eventid, $1 as day, $4 as a1code, $5 as a1countrycode, $8 as a1ethniccode, $14 as a2code, $15 as a2countrycode, $18 as a2ethniccode, $29 as goldstein, $33 as avgtone, $39 as a1lat, $40 as a1lon, $47 as a2lat, $48 as a2lon; 3 | withLoc = FILTER fields BY NONEMPTY(a1lat) and
NONEMPTY(a1lon) and NONEMPTY(goldstein) and nonempty(avgtone); 4 | gdeltGeo = FOREACH withLoc GENERATE geometry("POINT("+a1lat+" "+a1lon+")"), (double)goldstein as gold, (double)avgtone as tone; 5 | roi = LOAD '$rgdelt' USING PigStorage(';') as (id: int, wkt: chararray); 6 | roiGeo = FOREACH roi GENERATE geometry(wkt) as geo, id; 7 | toneregion = SPATIAL_JOIN gdeltGeo, roiGeo ON CONTAINEDBY using index rtree(order=5); 8 | toneregionid = FOREACH toneregion GENERATE id, gold, tone; 9 | toneByRegion = GROUP toneregionid BY id; 10 | tonePerRegion = FOREACH toneByRegion GENERATE group as regionId, avg(toneregionid.gold), avg(toneregionid.tone); 11 | dump tonePerRegion mute; -------------------------------------------------------------------------------- /sparklib/src/main/scala/dbis/piglet/backends/spark/PigFuncs.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package dbis.piglet.backends.spark 19 | 20 | import dbis.piglet.CommonPigFuncs 21 | 22 | object PigFuncs extends CommonPigFuncs { 23 | 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/codegen/scala_lang/FilterEmitter.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.codegen.scala_lang 2 | 3 | import dbis.piglet.codegen.{CodeEmitter, CodeGenContext} 4 | import dbis.piglet.op.Filter 5 | 6 | /** 7 | * Created by kai on 01.12.16.
8 | */ 9 | class FilterEmitter extends CodeEmitter[Filter] { 10 | override def template: String = 11 | """val = .filter{t => 12 | | val res = 13 | | 14 | | if(res) { 15 | | PerfMonitor.sampleSize(t, "", accum, randFactor) 16 | | } 17 | | 18 | | res 19 | |\}""".stripMargin 20 | 21 | 22 | override def code(ctx: CodeGenContext, op: Filter): String = { 23 | val m = Map("out" -> op.outPipeName, 24 | "in" -> op.inPipeName, 25 | "lineage" -> op.lineageSignature, 26 | "pred" -> ScalaEmitter.emitPredicate(CodeGenContext(ctx, Map[String,Any]("schema" -> op.schema)), op.pred)) 27 | 28 | render(m) 29 | } 30 | } 31 | 32 | object FilterEmitter { 33 | lazy val instance = new FilterEmitter 34 | } -------------------------------------------------------------------------------- /materialization_scripts/taxi_tip_avg.pig: -------------------------------------------------------------------------------- 1 | <% 2 | def dateToMonth(date: String): Int = { 3 | val formatter = java.time.format.DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss") 4 | java.time.LocalDate.parse(date,formatter).getMonthValue() 5 | } 6 | %> 7 | 8 | raw = load '$taxi' using PigStorage(',',skipEmpty=true) as 9 | (vendor_id:chararray,pickup_datetime:chararray,dropoff_datetime:chararray,passenger_count:chararray, 10 | trip_distance:chararray, pickup_longitude:chararray,pickup_latitude:chararray,rate_code:chararray, 11 | store_and_fwd_flag:chararray,dropoff_longitude:chararray,dropoff_latitude:chararray,payment_type:chararray, 12 | fare_amount:chararray,surcharge:chararray,mta_tax:chararray,tip_amount:chararray,tolls_amount:chararray,total_amount:chararray); 13 | 14 | noHeader = filter raw by not STARTSWITH(lower(vendor_id),"vendor"); 15 | month_tip = FOREACH noHeader GENERATE dateToMonth(pickup_datetime) as month:int, (double)tip_amount as tip; 16 | 17 | grp = GROUP month_tip by month; 18 | avg = FOREACH grp GENERATE group, AVG(month_tip.tip); 19 | dump avg mute; 20 | -------------------------------------------------------------------------------- /ceplib/src/main/scala/dbis/piglet/cep/ops/EngineConf.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.cep.ops 2 | 3 | import dbis.piglet.cep.engines._ 4 | import scala.reflect.ClassTag 5 | import dbis.piglet.cep.ops.SelectionStrategy._ 6 | import dbis.piglet.cep.nfa.NFAController 7 | import dbis.piglet.backends.{SchemaClass => Event} 8 | 9 | abstract class EngineConf[T <: Event: ClassTag](nfa: NFAController[T], sstr: SelectionStrategy) { 10 | val collector: MatchCollector[T] = new MatchCollector() 11 | var engine: CEPEngine[T] = sstr match { 12 | case SelectionStrategy.FirstMatch => new FirstMatch(nfa, collector) 13 | case SelectionStrategy.AllMatches => new AnyMatch(nfa, collector) 14 | case SelectionStrategy.NextMatches => new NextMatch(nfa, collector) 15 | case SelectionStrategy.ContiguityMatches => new ContiguityMatch(nfa, collector) 16 | case _ => throw new Exception("The Strategy is not supported") 17 | 18 | } 19 | } 20 | /* 21 | trait EngineConfig [T] extends EngineConf[T] { 22 | implicit def event: Event 23 | }*/ -------------------------------------------------------------------------------- /src/test/scala/dbis/piglet/CompilerSpec.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet 2 | 3 | import dbis.piglet.codegen.PigletCompiler 4 | import org.scalatest.{Matchers, FlatSpec} 5 | 6 | /** 7 | * Created by kai on 13.07.15.
8 | */ 9 | class CompilerSpec extends FlatSpec with Matchers { 10 | "The compiler" should "substitute parameters in a source line" in { 11 | val source = """a = FOREACH b GENERATE $0 AS $P1, myFunc($1) AS $PARAM2;""" 12 | val substitutedLine = PigletCompiler.replaceParameters(source, Map("P1" -> "column", "PARAM2" -> "funcResult")) 13 | substitutedLine should be ("""a = FOREACH b GENERATE $0 AS column, myFunc($1) AS funcResult;""") 14 | } 15 | 16 | it should "resolve IMPORT statements recursively" in { 17 | val source = List("IMPORT 'src/it/resources/import1.pig';", "C = FOREACH B GENERATE $0;") 18 | val (output, p) = PigletCompiler.resolveImports(source.toIterator) 19 | output.mkString("\n") should be ( 20 | """A = LOAD 'input'; 21 | |B = FILTER A BY $0 > 10; 22 | |C = FOREACH B GENERATE $0;""".stripMargin) 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /flinklib/src/main/scala/dbis/piglet/backends/flink/FlinkConf.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.backends.flink 2 | 3 | import dbis.piglet.backends.BackendConf 4 | import com.typesafe.config.ConfigFactory 5 | import dbis.piglet.backends.PigletBackend 6 | 7 | /** 8 | * @author hage 9 | */ 10 | class FlinkConf extends BackendConf { 11 | 12 | // loads the default configuration file in resources/application.conf 13 | private val appconf = ConfigFactory.load() 14 | 15 | /** 16 | * Get the name of this backend 17 | * 18 | * @return Returns the name of this backend 19 | */ 20 | override def name: String = appconf.getString("backends.flink.name") 21 | 22 | /** 23 | * Get the path to the runner class that implements the PigletBackend interface 24 | */ 25 | override def runnerClass: PigletBackend = { 26 | new FlinkRun 27 | } 28 | 29 | override def templateFile: String = appconf.getString("backends.flink.template") 30 | 31 | override def defaultConnector: String = appconf.getString("backends.flink.connector") 32 | 33 | override def raw = false 34 | } 35 | -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/expr/Traverser.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.expr 2 | 3 | //import dbis.piglet.op.{Func, RefExpr, Expr, NamedField} 4 | import dbis.piglet.schema.Schema 5 | 6 | import scala.collection.mutable.ListBuffer 7 | 8 | 9 | class NamedFieldExtractor { 10 | val fields = ListBuffer[NamedField]() 11 | 12 | def collectNamedFields(schema: Schema, ex: Expr): Boolean = ex match { 13 | case RefExpr(r) => r match { 14 | case NamedField(n, _) => fields += r.asInstanceOf[NamedField]; true 15 | case _ => true 16 | } 17 | case _ => true 18 | } 19 | } 20 | 21 | class RefExprExtractor { 22 | val exprs = ListBuffer[RefExpr]() 23 | 24 | def collectRefExprs(schema: Schema, ex: Expr): Boolean = ex match { 25 | case RefExpr(r) => exprs += ex.asInstanceOf[RefExpr]; true 26 | case _ => true 27 | } 28 | } 29 | 30 | class FuncExtractor { 31 | val funcs = ListBuffer[Func]() 32 | 33 | def collectFuncExprs(schema: Schema, ex: Expr): Boolean = ex match { 34 | case Func(f, params) => funcs += ex.asInstanceOf[Func]; true 35 | case _ => true 36 | } 37 | } 38 | 39 | -------------------------------------------------------------------------------- /flinklib/src/main/scala/dbis/piglet/backends/flink/streaming/FlinksConf.scala: -------------------------------------------------------------------------------- 1 | package 
dbis.piglet.backends.flink.streaming 2 | 3 | import dbis.piglet.backends.BackendConf 4 | import dbis.piglet.backends.flink.FlinkRun 5 | import dbis.piglet.backends.PigletBackend 6 | import com.typesafe.config.ConfigFactory 7 | 8 | /** 9 | * @author hage 10 | */ 11 | class FlinksConf extends BackendConf { 12 | // loads the default configuration file in resources/application.conf 13 | private val appconf = ConfigFactory.load() 14 | 15 | /** 16 | * Get the name of this backend 17 | * 18 | * @return Returns the name of this backend 19 | */ 20 | override def name: String = appconf.getString("backends.flinks.name") 21 | 22 | /** 23 | * Get the path to the runner class that implements the PigletBackend interface 24 | */ 25 | override def runnerClass: PigletBackend = { 26 | new FlinkRun 27 | } 28 | 29 | override def templateFile: String = appconf.getString("backends.flinks.template") 30 | 31 | override def defaultConnector: String = appconf.getString("backends.flinks.connector") 32 | 33 | override def raw = false 34 | } 35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ######################################### 2 | # File with contents to be ignored by git 3 | ######################################### 4 | logs 5 | project/project 6 | project/target 7 | target 8 | tmp 9 | .history 10 | dist 11 | /.idea 12 | /*.iml 13 | /out 14 | .idea_modules 15 | .classpath 16 | .project 17 | /RUNNING_PID 18 | .settings 19 | .target 20 | /bin 21 | *.jpage 22 | lodhub_data 23 | .cache 24 | .worksheet/ 25 | *.sc 26 | .sbt_completion_cache 27 | .tags 28 | *.bak 29 | *.class 30 | *.log 31 | 32 | __my_script*/ 33 | 34 | ##### 35 | 36 | piglet-dist* 37 | 38 | 39 | ################# 40 | # ignore database 41 | ################# 42 | db 43 | 44 | ############## 45 | # sbt specific 46 | ############## 47 | .cache 48 | .cache-* 49 | .history 50 | .lib/ 51 | .scalastyle 52 | dist/* 53 | target/ 54 | lib_managed/ 55 | src_managed/ 56 | project/boot/ 57 | project/plugins/project/ 58 | buildinfo.properties 59 | 60 | # Scala-IDE specific 61 | .scala_dependencies 62 | .worksheet 63 | 64 | .LICENSE-COPY.crc 65 | 66 | ################################ 67 | # Operating Systems 68 | ################################ 69 | *~ 70 | *.swp 71 | .DS_Store 72 | -------------------------------------------------------------------------------- /common/src/main/scala/dbis/piglet/backends/BackendConf.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.backends 2 | 3 | /** 4 | * @author hage 5 | */ 6 | trait BackendConf { 7 | /** 8 | * Get the name of this backend 9 | * 10 | * @return Returns the name of this backend 11 | */ 12 | def name: String 13 | 14 | /** 15 | * Get an instance of runner that will be used to run the jobs 16 | * 17 | * @return Returns the full qualified name of the runner class 18 | */ 19 | def runnerClass: PigletBackend 20 | 21 | /** 22 | * Get the full path to the template file to use for the backend 23 | * 24 | * @return the name of the template file 25 | */ 26 | def templateFile: String 27 | 28 | /** 29 | * Get the default connection function used for source and sink nodes 30 | * 31 | * @return the name of the function 32 | */ 33 | def defaultConnector: String 34 | 35 | /** 36 | * Defines that a backends needs the raw Pig script 37 | * rather than the generated code 38 | * 39 | * @return True if the backends wants the original script, 
otherwise false 40 | */ 41 | def raw: Boolean 42 | } 43 | -------------------------------------------------------------------------------- /sparklib/src/main/scala/dbis/piglet/backends/spark/SparkSRun.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package dbis.piglet.backends.spark 19 | 20 | class SparkSRun extends SparkRun { 21 | override def templateFile = appconf.getString("backends.sparks.template") 22 | override def defaultConnector = appconf.getString("backends.sparks.connector") 23 | } 24 | -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/op/Empty.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package dbis.piglet.op 18 | 19 | /** Empty represents PigOperators or an entire structure of them that has been removed 20 | * 21 | * @param in 22 | */ 23 | //noinspection ScalaDocMissingParameterDescription 24 | case class Empty(private val in: Pipe) extends PigOperator(List(), List(in)) -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/mm/CacheEntry.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.mm 2 | 3 | import dbis.piglet.Piglet.Lineage 4 | 5 | import scala.concurrent.duration._ 6 | 7 | case class CacheEntry(lineage: Lineage, uri: String, _benefit: Long, bytes: Long, var lastLoaded: Option[Long] = None, var written: Option[Long] = None, 8 | var useCount: Int = 0, var fixed: Boolean = false) { 9 | 10 | 11 | def benefit: Duration = _benefit.milliseconds 12 | 13 | def markWritten() = written = Some(System.currentTimeMillis()) 14 | 15 | def markLoaded() = { 16 | lastLoaded = Some(System.currentTimeMillis()) 17 | useCount += 1 18 | } 19 | 20 | override def toString = 21 | s"""CacheEntry 22 | | lineage: $lineage file: $uri benefit: ${benefit.toSeconds} (${_benefit} ms) bytes: $bytes lastLoaded: ${lastLoaded.getOrElse("-")} written: ${written.getOrElse("-")} 23 | | use count: $useCount fixed: $fixed""".stripMargin 24 | 25 | override def equals(obj: scala.Any): Boolean = obj match { 26 | case o: CacheEntry => 27 | o.lineage equals lineage 28 | case _ => false 29 | } 30 | 31 | override def hashCode(): Int = lineage.hashCode 32 | } 33 | -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/plan/rewriting/dsl/words/CheckWord.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
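A short usage sketch for the CacheEntry class above (illustrative, not part of the sources; it assumes Piglet.Lineage is a String-like identifier and uses made-up values):

val entry = CacheEntry(lineage = "a1b2c3", uri = "hdfs:///piglet/cache/a1b2c3",
                       _benefit = 4200L, bytes = 1024L * 1024L)

entry.markWritten()   // records the write timestamp
entry.markLoaded()    // records the load timestamp and bumps useCount to 1

// _benefit is stored in milliseconds and exposed as a Duration
assert(entry.benefit.toSeconds == 4)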
16 | */ 17 | package dbis.piglet.plan.rewriting.dsl.words 18 | 19 | import dbis.piglet.op.PigOperator 20 | import dbis.piglet.plan.rewriting.dsl.traits.{BuilderT, CheckWordT} 21 | 22 | class CheckWord[FROM <: PigOperator, TO](override val b: BuilderT[FROM, TO]) extends CheckWordT[FROM, TO] { 23 | 24 | } -------------------------------------------------------------------------------- /materialization_scripts/gdelt_url_eventcode.pig: -------------------------------------------------------------------------------- 1 | <% 2 | def extractDomain(url: String): String = { 3 | if(!url.startsWith("http")) 4 | url 5 | else { 6 | val startPos = url.indexOf("//")+2 7 | val endPos = if(url.indexOf("/",startPos) < 0) { url.size } else { url.indexOf("/",startPos) } 8 | url.substring(startPos, endPos) 9 | } 10 | } 11 | def diff(d1: Double, d2: Double): Double = { 12 | math.abs(d1 - d2) 13 | } 14 | def isnum(s: String): Boolean = { 15 | scala.util.Try { 16 | s.toDouble 17 | }.map(_ => true).getOrElse(false) 18 | } 19 | %> 20 | gdelt = LOAD '$gdelt' using PigStorage(); 21 | fields = FOREACH gdelt GENERATE $26 as eventcode, (double)$34 as avgtone, $57 as url; 22 | withURL = FILTER fields BY nonempty(eventcode) and isnum(eventcode) and nonempty(url); 23 | domain = FOREACH withURL GENERATE extractDomain(url) as site, (int)eventcode as ecode, avgtone; 24 | grp = GROUP domain BY (site, ecode); 25 | avgtones1 = FOREACH grp GENERATE group as siteecode, avg(domain.avgtone) as avgtone; 26 | avgtones = FILTER avgtones1 BY avgtone != 0; 27 | f = FOREACH avgtones GENERATE siteecode.site as site, siteecode.ecode as code, avgtone; 28 | ordered = ORDER f BY site, code; 29 | dump ordered mute; -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/codegen/flink/emitter/OrderByEmitter.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.codegen.flink.emitter 2 | 3 | import dbis.piglet.codegen.{ CodeEmitter, CodeGenContext, CodeGenException } 4 | import dbis.piglet.op.{ OrderBy, OrderByDirection, OrderBySpec, PigOperator } 5 | import dbis.piglet.schema.Types 6 | import dbis.piglet.expr.NamedField 7 | import dbis.piglet.expr.PositionalField 8 | import dbis.piglet.schema.Schema 9 | import dbis.piglet.expr.Ref 10 | import dbis.piglet.codegen.flink.FlinkHelper 11 | 12 | class OrderByEmitter extends dbis.piglet.codegen.scala_lang.OrderByEmitter { 13 | override def template: String = """ val <out> = <in>.setParallelism(1)<key, asc:{k, a|.sortPartition(<k>, Order.<a>)}>""".stripMargin 14 | 15 | override def code(ctx: CodeGenContext, op: OrderBy): String = { 16 | val key = op.orderSpec.map(spec => FlinkHelper.getOrderIndex(op.schema, spec.field)) 17 | val orders = op.orderSpec.map(spec => if (spec.dir == OrderByDirection.AscendingOrder) "ASCENDING" else "DESCENDING") 18 | render(Map("out" -> op.outPipeName, "in" -> op.inPipeName, "key" -> key, "asc" -> orders)) 19 | } 20 | } 21 | 22 | object OrderByEmitter { 23 | lazy val instance = new OrderByEmitter 24 | } -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/plan/rewriting/dsl/words/ImmediateEndWord.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership.
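The helper functions embedded in the <% ... %> block of the Pig script above are plain Scala and can be sanity-checked outside of Piglet; a small illustrative check:

def extractDomain(url: String): String =
  if (!url.startsWith("http")) url
  else {
    val startPos = url.indexOf("//") + 2
    val endPos = if (url.indexOf("/", startPos) < 0) url.length else url.indexOf("/", startPos)
    url.substring(startPos, endPos)
  }

assert(extractDomain("http://example.org/news/article.html") == "example.org")
assert(extractDomain("example.org/news") == "example.org/news") // non-http values pass through unchanged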
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package dbis.piglet.plan.rewriting.dsl.words 18 | 19 | import dbis.piglet.op.PigOperator 20 | import dbis.piglet.plan.rewriting.dsl.traits.{BuilderT, EndWordT} 21 | 22 | class ImmediateEndWord[FROM <: PigOperator, TO](override val b: BuilderT[FROM, TO]) extends EndWordT[FROM, TO]{ 23 | 24 | } 25 | -------------------------------------------------------------------------------- /common/src/main/scala/dbis/piglet/tools/logging/PigletLogging.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.tools.logging 2 | 3 | import java.io.PrintStream 4 | 5 | import org.slf4j.LoggerFactory 6 | 7 | trait PigletLogging { 8 | 9 | /* 10 | * This ugly hack is used to suppress the annoying warning of multiple bindings in slf4j. 11 | */ 12 | val filteredErr = new PrintStream(System.err) { 13 | override def println(l: String) = if (!l.startsWith("SLF4J") && !l.startsWith("[INFO ] [EtmMonitor]")) super.println(l) 14 | } 15 | System.setErr(filteredErr) 16 | 17 | val filteredOut = new PrintStream(System.out) { 18 | override def println(l: String) = if (!l.startsWith("SLF4J") && !l.startsWith("[INFO ] [EtmMonitor]")) super.println(l) 19 | } 20 | System.setOut(filteredOut) 21 | 22 | 23 | 24 | 25 | protected val logger: PigletLogger = { 26 | val baseLogger = LoggerFactory.getLogger(getClass.getName) 27 | 28 | if(baseLogger.isInstanceOf[ch.qos.logback.classic.Logger]) 29 | PigletLogger(baseLogger.asInstanceOf[ch.qos.logback.classic.Logger]) 30 | else { 31 | Console.err.println(s"Could not bind logger: $baseLogger") 32 | new PigletLogger(None) 33 | } 34 | } 35 | 36 | 37 | } -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/codegen/scala_lang/DumpEmitter.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.codegen.scala_lang 2 | 3 | import dbis.piglet.codegen.{CodeEmitter, CodeGenContext} 4 | import dbis.piglet.op.Dump 5 | 6 | /** 7 | * Created by kai on 05.12.16. 
8 | */ 9 | class DumpEmitter extends CodeEmitter[Dump] { 10 | override def template: String = """<if (mute)><in>.foreach{t=> 11 | PerfMonitor.sampleSize(t,"<lineage>", accum, randFactor) 12 |}<else> 13 |<in>.map{t => 14 | PerfMonitor.sampleSize(t,"<lineage>", accum, randFactor) 15 | t 16 |}.collect.foreach(t => println(t.toString()))<endif>""".stripMargin 17 | 18 | 19 | override def code(ctx: CodeGenContext, op: Dump): String = { 20 | val map = collection.mutable.Map("in" -> op.inPipeName, "lineage" -> op.lineageSignature) 21 | if(op.mute) 22 | map += ("mute" -> op.mute.toString) 23 | render(map.toMap) 24 | } 25 | 26 | } 27 | 28 | object DumpEmitter { 29 | lazy val instance = new DumpEmitter 30 | } 31 | -------------------------------------------------------------------------------- /ceplib/src/main/scala/dbis/piglet/cep/flink/CustomDataSetMatcher.scala: -------------------------------------------------------------------------------- 1 | 2 | package dbis.piglet.cep.flink 3 | 4 | import scala.reflect.ClassTag 5 | import dbis.piglet.cep.ops.SelectionStrategy._ 6 | import dbis.piglet.cep.ops.OutputStrategy._ 7 | import dbis.piglet.cep.nfa.NFAController 8 | import dbis.piglet.backends.{SchemaClass => Event} 9 | import org.apache.flink.api.common.typeinfo.TypeInformation 10 | //import org.apache.flink.api.java.ExecutionEnvironment 11 | //import org.apache.flink.api.java.DataSet 12 | import scala.collection.JavaConversions._ 13 | import org.apache.flink.api.scala._ 14 | 15 | class CustomDataSetMatcher[T <: Event: ClassTag: TypeInformation](dataSet: DataSet[T]) { 16 | 17 | def matchNFA(nfa: NFAController[T], sstr: SelectionStrategy = FirstMatch, out: OutputStrategy = Combined) = { 18 | // println("create a new DataSet matcher") 19 | val flinkEnv = dataSet.getExecutionEnvironment 20 | new DataSetMatcher(dataSet, nfa, flinkEnv, sstr, out).compute() 21 | } 22 | 23 | } 24 | 25 | object CustomDataSetMatcher { 26 | 27 | implicit def addDataSetMatcher[T <: Event: ClassTag: TypeInformation](dataSet: DataSet[T]) = { 28 | // println("add a custom DataSet function") 29 | new CustomDataSetMatcher(dataSet) 30 | } 31 | } -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/op/Cache.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.op 2 | 3 | object CacheMode extends Enumeration { 4 | type CacheMode = Value 5 | val NONE, 6 | MEMORY_ONLY, 7 | MEMORY_AND_DISK, 8 | MEMORY_ONLY_SER, 9 | MEMORY_AND_DISK_SER, 10 | DISK_ONLY, 11 | MEMORY_ONLY_2, 12 | MEMORY_AND_DISK_2 = Value 13 | } 14 | 15 | import CacheMode.CacheMode 16 | 17 | case class Cache (private[op] val out: Pipe, 18 | private[op] val in: Pipe, 19 | operatorId: String, 20 | cacheMode: CacheMode) extends PigOperator(out, in) { 21 | 22 | if(in.producer != null) { 23 | schema = in.producer.schema 24 | } 25 | 26 | 27 | override def equals(other: Any) = other match { 28 | case o: Cache => operatorId == o.operatorId && outPipeName == o.outPipeName 29 | case _ => false 30 | } 31 | 32 | override def hashCode() = (operatorId+outPipeName).hashCode() 33 | 34 | override def toString = 35 | s"""CACHE 36 | | out = $outPipeName 37 | | in = $inPipeName 38 | | operatorId = $operatorId 39 | | mode = $cacheMode 40 | """.stripMargin 41 | 42 | override def lineageString = s"CACHE%$operatorId%$cacheMode%${super.lineageString}" 43 | } 44 | -------------------------------------------------------------------------------- /ceplib/src/main/scala/dbis/piglet/cep/engines/NextMatch.scala:
-------------------------------------------------------------------------------- 1 | package dbis.piglet.cep.engines 2 | 3 | import dbis.piglet.backends.{SchemaClass => Event} 4 | import scala.reflect.ClassTag 5 | import dbis.piglet.cep.nfa.NFAStructure 6 | import dbis.piglet.cep.nfa.NFAController 7 | import dbis.piglet.cep.nfa.NormalState 8 | import dbis.piglet.cep.ops.MatchCollector 9 | class NextMatch[T <: Event: ClassTag](nfaController: NFAController[T], collector: MatchCollector[T]) extends CEPEngine(nfaController, collector) with Serializable { 10 | var statics: Long = 0 11 | override def runEngine(event: T): Unit = { 12 | runningStructursPool.foreach ( str => engineProcess(event, str)) 13 | createNewStructue(event) 14 | runGCStructures() 15 | } 16 | private[NextMatch] def engineProcess(event: T, strInfo: (Long, NFAStructure[T])) { 17 | val currenStr = strInfo._2 18 | val result: Int = checkPredicate(event, currenStr) 19 | if (result != -1) { // the predicate is ok. 20 | currenStr.addEvent(event, currenStr.getCurrentState.asInstanceOf[NormalState[T]].getEdgeByIndex(result)) 21 | if (currenStr.complete) { //final state 22 | statics += 1 23 | collector + currenStr 24 | wantToDeletedStructurs += strInfo._1 25 | } 26 | } 27 | } 28 | } -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/codegen/spark/StreamDistinctEmitter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License.
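All selection-strategy engines (NextMatch above, and FirstMatch, ContiguityMatch, AnyMatch below) share the same event-at-a-time contract inherited from CEPEngine; a minimal driver sketch, assuming CEPEngine[T] exposes runEngine as shown in its subclasses:

import dbis.piglet.backends.{SchemaClass => Event}
import dbis.piglet.cep.engines.CEPEngine

def feed[T <: Event](events: Iterable[T], engine: CEPEngine[T]): Unit =
  events.foreach(engine.runEngine)   // complete matches accumulate in the engine's MatchCollector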
16 | */ 17 | package dbis.piglet.codegen.spark 18 | 19 | import dbis.piglet.codegen.scala_lang.DistinctEmitter 20 | 21 | class StreamDistinctEmitter extends DistinctEmitter { 22 | override def template: String = """ val <out> = <in>.transform(rdd => rdd.distinct)""".stripMargin 23 | } 24 | 25 | object StreamDistinctEmitter { 26 | lazy val instance = new StreamDistinctEmitter 27 | } -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/codegen/spark/SpatialIndexEmitter.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.codegen.spark 2 | 3 | import dbis.piglet.codegen.{CodeEmitter, CodeGenContext} 4 | import dbis.piglet.op.{IndexMethod, IndexOp} 5 | 6 | class SpatialIndexEmitter extends CodeEmitter[IndexOp] { 7 | // new dbis.stark.spatial.partitioner.SpatialGridPartitioner(<out>KeyBy, partitionsPerDimension=20, pointsOnly=false) 8 | override def template = 9 | """val <out>KeyBy = <in><keyby> 10 | |val <out>idxParti = new dbis.stark.spatial.partitioner.BSPartitioner(<out>KeyBy, 1, 1000, false) 11 | |val <out> = <out>KeyBy.index(Some(<out>idxParti), <method>(<params>)).map{ idx => 12 | | PerfMonitor.sampleSize(idx, "<lineage>", accum, randFactor) 13 | | idx 14 | |}""".stripMargin 15 | 16 | override def code(ctx: CodeGenContext, op: IndexOp): String = render(Map( 17 | "out" -> op.outPipeName, 18 | "in" -> op.inPipeName, 19 | "method" -> IndexMethod.methodName(op.method), 20 | "params" -> op.params.mkString(","), 21 | "keyby" -> SpatialEmitterHelper.keyByCode(op.inputSchema, op.field, ctx), 22 | "lineage" -> op.lineageSignature 23 | ) ) 24 | } 25 | 26 | object SpatialIndexEmitter { 27 | lazy val instance = new SpatialIndexEmitter 28 | } -------------------------------------------------------------------------------- /ceplib/src/main/scala/dbis/piglet/cep/engines/FirstMatch.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.cep.engines 2 | 3 | import dbis.piglet.backends.{SchemaClass => Event} 4 | import scala.reflect.ClassTag 5 | import dbis.piglet.cep.nfa.NFAStructure 6 | import dbis.piglet.cep.nfa.NFAController 7 | import dbis.piglet.cep.nfa.NormalState 8 | import dbis.piglet.cep.ops.MatchCollector 9 | class FirstMatch[T <: Event: ClassTag](nfaController: NFAController[T], collector: MatchCollector[T]) extends CEPEngine(nfaController, collector) with Serializable { 10 | var statics: Long = 0 11 | 12 | override def runEngine(event: T): Unit = { 13 | if (runningStructursPool.size == 0) 14 | createNewStructue(event); 15 | else { 16 | engineProcess(event, runningStructursPool.head._2); 17 | } 18 | } 19 | private[FirstMatch] def engineProcess(event: T, currenStr: NFAStructure[T]) { 20 | val result: Int = checkPredicate(event, currenStr) 21 | if (result != -1) { // the predicate is ok.
22 | currenStr.addEvent(event, currenStr.getCurrentState.asInstanceOf[NormalState[T]].getEdgeByIndex(result)) 23 | if (currenStr.complete) { //final state 24 | statics += 1 25 | //println("complete") 26 | collector + currenStr 27 | runningStructursPool.clear() 28 | } 29 | } 30 | } 31 | } -------------------------------------------------------------------------------- /ceplib/src/main/scala/dbis/piglet/cep/flink/CustomDataStreamMatcher.scala: -------------------------------------------------------------------------------- 1 | 2 | package dbis.piglet.cep.flink 3 | 4 | import scala.reflect.ClassTag 5 | import dbis.piglet.cep.ops.SelectionStrategy._ 6 | import dbis.piglet.cep.ops.OutputStrategy._ 7 | import dbis.piglet.cep.nfa.NFAController 8 | import dbis.piglet.backends.{SchemaClass => Event} 9 | import org.apache.flink.api.common.typeinfo.TypeInformation 10 | //import org.apache.flink.api.java.ExecutionEnvironment 11 | //import org.apache.flink.api.java.DataSet 12 | import scala.collection.JavaConversions._ 13 | import org.apache.flink.streaming.api.scala._ 14 | 15 | class CustomDataStreamMatcher[T <: Event: ClassTag: TypeInformation](@transient val dataStream: DataStream[T]) { 16 | 17 | def matchNFA(nfa: NFAController[T], flinkEnv: StreamExecutionEnvironment, sstr: SelectionStrategy = FirstMatch, out: OutputStrategy = Combined) = { 18 | // println("create a new DataStream matcher") 19 | new DataStreamMatcher(dataStream, nfa, flinkEnv, sstr, out).compute() 20 | } 21 | 22 | } 23 | 24 | object CustomDataStreamMatcher { 25 | 26 | implicit def addDataStreamMatcher[T <: Event: ClassTag: TypeInformation](@transient dataStream: DataStream[T]) = { 27 | // println("add a custom DataStream function") 28 | new CustomDataStreamMatcher(dataStream) 29 | } 30 | } -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/codegen/spark/StreamOrderByEmitter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership.
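With the implicit conversion above in scope, matchNFA becomes available directly on a Flink DataStream; a usage sketch (the NFA construction is elided because NFAController's builder API is not part of this excerpt, and MyEvent is a hypothetical SchemaClass):

import org.apache.flink.streaming.api.scala._
import dbis.piglet.cep.flink.CustomDataStreamMatcher._
import dbis.piglet.cep.ops.SelectionStrategy.FirstMatch
import dbis.piglet.cep.ops.OutputStrategy.Combined

// val env = StreamExecutionEnvironment.getExecutionEnvironment
// val events: DataStream[MyEvent] = ...
// val matches = events.matchNFA(myNfa, env, FirstMatch, Combined)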
16 | */ 17 | package dbis.piglet.codegen.spark 18 | 19 | import dbis.piglet.codegen.scala_lang.OrderByEmitter 20 | 21 | class StreamOrderByEmitter extends OrderByEmitter { 22 | override def template: String = """ val <out> = <in>.transform(rdd => rdd.repartition(1).sortBy(t => <key>, <asc>))""".stripMargin 23 | } 24 | 25 | object StreamOrderByEmitter { 26 | lazy val instance = new StreamOrderByEmitter 27 | } -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/op/cmd/RegisterCmd.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package dbis.piglet.op.cmd 19 | 20 | import dbis.piglet.op.PigOperator 21 | 22 | 23 | /** 24 | * RegisterCmd represents a pseudo operator for the REGISTER statement. This "operator" will 25 | * be eliminated during building the dataflow plan. 26 | * 27 | * @param jarFile the URI of the Jar file to be registered 28 | */ 29 | case class RegisterCmd(jarFile: String) extends PigOperator(List(), List()) 30 | 31 | -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/op/cmd/SetCmd.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package dbis.piglet.op.cmd 19 | 20 | import dbis.piglet.expr.Value 21 | import dbis.piglet.op.PigOperator 22 | 23 | 24 | /** 25 | * SetCmd represents a pseudo operator for the SET statement.
26 | * 27 | * @param param the parameter name 28 | * @param value the value of the parameter set by this statement 29 | */ 30 | case class SetCmd(param: String, value: Value) extends PigOperator(List(), List()) 31 | 32 | 33 | -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/codegen/spark/SpatialEmitterHelper.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.codegen.spark 2 | 3 | import dbis.piglet.codegen.scala_lang.ScalaEmitter 4 | import dbis.piglet.op.PigOperator 5 | import dbis.piglet.codegen.{CodeEmitter, CodeGenContext} 6 | import dbis.piglet.expr.NamedField 7 | import dbis.piglet.expr.PositionalField 8 | import dbis.piglet.expr.Ref 9 | import dbis.piglet.schema.Schema 10 | 11 | object SpatialEmitterHelper { 12 | 13 | 14 | def geomIsFirstPos[T <: PigOperator](ref: Ref, op: T): Boolean = { 15 | 16 | val pos = ref match { 17 | case nf : NamedField => 18 | op.inputSchema.get.indexOfField(nf) 19 | case pf : PositionalField => 20 | pf.pos 21 | case _ => throw new IllegalArgumentException(s"expected field reference, got: $ref") 22 | } 23 | 24 | pos == 0 25 | } 26 | 27 | 28 | def keyByCode(schema: Option[Schema], ref: Ref, ctx: CodeGenContext): String = 29 | s".keyBy(${ctx.asString("tuplePrefix")} => ${ScalaEmitter.emitRef(CodeGenContext(ctx,Map("schema"->schema)), ref)})" 30 | 31 | 32 | def keyByCode(schema: Option[Schema], refs: Iterable[Ref], ctx: CodeGenContext): String = 33 | s".keyBy(${ctx.asString("tuplePrefix")} => (${refs.map(ref => ScalaEmitter.emitRef(CodeGenContext(ctx,Map("schema"->schema)), ref)).mkString(",")}))" 34 | } -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/plan/rewriting/dsl/words/ReplaceWord.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package dbis.piglet.plan.rewriting.dsl.words 18 | 19 | import dbis.piglet.op.PigOperator 20 | import dbis.piglet.plan.rewriting.dsl.builders.PigOperatorBuilder 21 | import dbis.piglet.plan.rewriting.dsl.traits.{CheckWordT, BuilderT, EndWordT} 22 | 23 | class ReplaceWord[FROM <: PigOperator](override val b: BuilderT[FROM, PigOperator]) 24 | extends EndWordT[FROM, PigOperator] with CheckWordT[FROM, PigOperator] { 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/tools/RingBuffer.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.tools 2 | 3 | import scala.reflect.ClassTag 4 | 5 | trait RingLike[T] extends Seq[T] { 6 | def put(o: T) 7 | } 8 | 9 | class RingBuffer[T: ClassTag](capacity: Int) extends RingLike[T] { 10 | 11 | require(capacity > 0, s"capacity must be > 0 , but is $capacity") 12 | 13 | private val ring = Array.fill(capacity){Option.empty[T]} 14 | 15 | private var curr = 0 16 | 17 | override def put(o: T): Unit = { 18 | require(o != null) 19 | 20 | ring(curr) = Some(o) 21 | 22 | curr = (curr + 1) % capacity 23 | } 24 | 25 | override def length: Int = ring.count(_.isDefined) 26 | 27 | override def foreach[U](f: (T) => U): Unit = iterator.foreach(f) 28 | 29 | override def apply(idx: Int): T = { 30 | require(idx >= 0, s"idx must be >= 0, but is $idx") 31 | val a = ring.apply(idx % capacity) 32 | 33 | if(a.isDefined) 34 | a.get 35 | else 36 | throw new ArrayIndexOutOfBoundsException(s"no such index $idx") 37 | } 38 | 39 | override def iterator: Iterator[T] = ring.iterator.filter(_.isDefined).map(_.get) 40 | } 41 | 42 | object RingBuffer { 43 | def apply[T:ClassTag](elements: T*): RingBuffer[T] = { 44 | val b = new RingBuffer[T](elements.length) 45 | for(e <- elements) 46 | b.put(e) 47 | 48 | b 49 | } 50 | } -------------------------------------------------------------------------------- /ceplib/src/main/scala/dbis/piglet/cep/engines/ContiguityMatch.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.cep.engines 2 | 3 | import dbis.piglet.backends.{SchemaClass => Event} 4 | import scala.reflect.ClassTag 5 | import dbis.piglet.cep.nfa.NFAStructure 6 | import dbis.piglet.cep.nfa.NFAController 7 | import dbis.piglet.cep.nfa.NormalState 8 | import dbis.piglet.cep.ops.MatchCollector 9 | class ContiguityMatch[T <: Event: ClassTag](nfaController: NFAController[T], collector: MatchCollector[T]) extends CEPEngine(nfaController, collector) with Serializable { 10 | var statics: Long = 0 11 | override def runEngine(event: T): Unit = { 12 | runningStructursPool.foreach ( str => engineProcess(event, str)) 13 | createNewStructue(event) 14 | runGCStructures() 15 | } 16 | private[ContiguityMatch] def engineProcess(event: T, strInfo: (Long, NFAStructure[T])) { 17 | val currenStr = strInfo._2 18 | val result: Int = checkPredicate(event, currenStr) 19 | if (result != -1) { // the predicate is ok.
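A usage sketch for the RingBuffer above (illustrative only): once the capacity is exceeded, put() overwrites the oldest slot, and iteration follows slot order rather than insertion order.

val buf = new RingBuffer[Int](3)
Seq(1, 2, 3, 4).foreach(buf.put)    // 4 wraps around and overwrites 1

assert(buf.length == 3)
assert(buf.toList == List(4, 2, 3)) // slot order: slot 0 now holds the newest element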
20 | currenStr.addEvent(event, currenStr.getCurrentState.asInstanceOf[NormalState[T]].getEdgeByIndex(result)) 21 | if (currenStr.complete) { //final state 22 | statics += 1 23 | collector + currenStr 24 | wantToDeletedStructurs += strInfo._1 25 | } 26 | } 27 | else 28 | wantToDeletedStructurs += strInfo._1 29 | } 30 | } -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/codegen/scala_lang/StoreEmitter.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.codegen.scala_lang 2 | 3 | import dbis.piglet.backends.BackendManager 4 | import dbis.piglet.codegen.{CodeEmitter, CodeGenContext, CodeGenException} 5 | import dbis.piglet.op.{PigOperator, Store} 6 | 7 | /** 8 | * Created by kai on 05.12.16. 9 | */ 10 | class StoreEmitter extends CodeEmitter[Store] { 11 | override def template: String = 12 | // """<in>.write("<file>", <func><if (params)>, <params><endif>)""".stripMargin 13 | """ <func>[<class>]().write("<file>", <in><if (params)>, <params><endif>)""".stripMargin 14 | 15 | 16 | override def code(ctx: CodeGenContext, op: Store): String = { 17 | var paramMap = Map("in" -> op.inPipeName, 18 | "file" -> op.file.toString, 19 | "func" -> op.func.getOrElse(BackendManager.backend.defaultConnector)) 20 | op.schema match { 21 | case Some(s) => 22 | val cName = ScalaEmitter.schemaClassName(s) 23 | 24 | paramMap += ("class" -> cName) 25 | case None => paramMap += ("class" -> "Record") 26 | } 27 | 28 | if (op.params != null && op.params.nonEmpty) 29 | paramMap += ("params" -> op.params.mkString(",")) 30 | render(paramMap) 31 | } 32 | 33 | } 34 | 35 | object StoreEmitter { 36 | lazy val instance = new StoreEmitter 37 | } -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/codegen/scala_lang/DifferenceEmitter.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.codegen.scala_lang 2 | 3 | import dbis.piglet.codegen.spark.SpatialEmitterHelper 4 | import dbis.piglet.codegen.{CodeEmitter, CodeGenContext} 5 | import dbis.piglet.op.Difference 6 | 7 | class DifferenceEmitter extends CodeEmitter[Difference] { 8 | override def template: String = """val <out> = <in1>.subtract(<in2>)""".stripMargin 9 | 10 | def templateKeyed = """val <out> = <in1><keyby1>.subtractByKey(<in2><keyby2>).map(_._2)""" 11 | 12 | override def code(ctx: CodeGenContext, op: Difference): String = { 13 | 14 | val (templ,params) = if(op.refs1.isDefined) { 15 | val m = Map("out" -> op.outPipeName, 16 | "in1" -> op.inPipeNames.head, 17 | "in2" -> op.inPipeNames.last, 18 | "keyby1" -> SpatialEmitterHelper.keyByCode(op.inputs.head.producer.schema, op.refs1.get,ctx), 19 | "keyby2" -> SpatialEmitterHelper.keyByCode(op.inputs.last.producer.schema, op.refs2.get,ctx) 20 | ) 21 | (templateKeyed, m) 22 | } else { 23 | val m = Map("out" -> op.outPipeName, 24 | "in1" -> op.inPipeNames.head, 25 | "in2" -> op.inPipeNames.last 26 | ) 27 | (template, m) 28 | } 29 | 30 | CodeEmitter.render(templ, params) 31 | } 32 | 33 | } 34 | 35 | object DifferenceEmitter { 36 | lazy val instance = new DifferenceEmitter 37 | } -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/plan/rewriting/dsl/words/MergeWord.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership.
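The two code shapes DifferenceEmitter chooses between can be mimicked on plain collections to see the semantics (illustrative; the real templates target Spark RDDs via subtract and subtractByKey):

val a = Seq(("k1", 1), ("k2", 2), ("k3", 3))
val b = Seq(("k1", 9))

// template: whole-record difference, analogous to rdd1.subtract(rdd2)
val byRecord = a.diff(b)            // keeps ("k1",1) too, since it differs from ("k1",9)

// templateKeyed: difference on an extracted key, analogous to
// rdd1.keyBy(...).subtractByKey(rdd2.keyBy(...)).map(_._2)
val byKey = a.filterNot { case (k, _) => b.exists(_._1 == k) }
assert(byKey == Seq(("k2", 2), ("k3", 3)))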
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package dbis.piglet.plan.rewriting.dsl.words 18 | 19 | import dbis.piglet.op.PigOperator 20 | import dbis.piglet.plan.rewriting.dsl.traits.{CheckWordT, EndWordT, BuilderT} 21 | 22 | import scala.reflect.ClassTag 23 | 24 | class MergeWord[FROM1 <: PigOperator : ClassTag, FROM2 <: PigOperator : ClassTag] 25 | (override val b: BuilderT[(FROM1, FROM2), PigOperator]) 26 | extends EndWordT[(FROM1, FROM2), PigOperator] with CheckWordT[(FROM1, FROM2), PigOperator] { 27 | } 28 | -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/mm/MaterializationPoint.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.mm 2 | 3 | import dbis.piglet.Piglet.Lineage 4 | 5 | import scala.concurrent.duration.Duration 6 | 7 | /** 8 | * A MaterializationPoint object represents information about a possible materialization of the result 9 | * of a dataflow operator. It is identified by a hash of the lineage string of the operator and collects 10 | * profile information. 11 | * 12 | * @param lineage the MD5 hash of the lineage string of the operator 13 | * @param benefit the cumulative benefit of this materialization point compared to the root operator 14 | * @param prob The probability for re-using this operator 15 | * @param cost The duration that this operator takes 16 | */ 17 | case class MaterializationPoint(lineage: Lineage, prob: Double, cost: Long, bytes: Long, benefit: Duration = Duration.Undefined) { 18 | override def hashCode(): Int = lineage.hashCode 19 | 20 | override def equals(obj: scala.Any): Boolean = obj match { 21 | case m:MaterializationPoint => m.lineage equals lineage 22 | case _ => false 23 | } 24 | 25 | override def toString = s"MaterializationPoint($lineage, prob=$prob, cost=$cost ms, benefit=${benefit.toMillis} ms)" 26 | } 27 | 28 | 29 | object MaterializationPoint { 30 | def dummy(lineage: Lineage): MaterializationPoint = MaterializationPoint(lineage, -1,-1, -1, Duration.Undefined) 31 | } -------------------------------------------------------------------------------- /ceplib/src/main/scala/dbis/piglet/cep/flink/DataSetMatcher.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.cep.flink 2 | 3 | import scala.reflect.ClassTag 4 | import dbis.piglet.cep.nfa.NFAController 5 | import dbis.piglet.cep.engines._ 6 | import dbis.piglet.cep.ops.SelectionStrategy._ 7 | import dbis.piglet.cep.ops.OutputStrategy._ 8 | import dbis.piglet.backends.{SchemaClass => Event} 9 | import dbis.piglet.cep.ops.MatchCollector 10 | import org.apache.flink.api.common.typeinfo.TypeInformation 11 | import dbis.piglet.cep.ops.SelectionStrategy 12 | //import org.apache.flink.api.java.operators.CustomUnaryOperation 13 | //import scala.collection.mutable.ArrayBuffer 14 | import scala.collection.mutable.ListBuffer 15 | //import 
org.apache.flink.api.java.DataSet 16 | //import org.apache.flink.api.java.ExecutionEnvironment 17 | import scala.collection.JavaConversions._ 18 | import org.apache.flink.api.scala._ 19 | import dbis.piglet.cep.ops.EngineConf 20 | 21 | class DataSetMatcher[T <: Event: ClassTag: TypeInformation](input: DataSet[T], nfa: NFAController[T], flinkEnv: ExecutionEnvironment, sstr: SelectionStrategy = SelectionStrategy.FirstMatch, out: OutputStrategy = Combined) extends EngineConf[T](nfa, sstr) with java.io.Serializable { 22 | def compute(): DataSet[T] = { 23 | input.collect().foreach ( event => engine.runEngine(event) ) // materializes the DataSet on the driver and runs the NFA locally 24 | flinkEnv.fromCollection(collector.convertEventsToArray().toSeq) 25 | } 26 | 27 | } -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/plan/PrettyPrinter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package dbis.piglet.plan 18 | 19 | import dbis.piglet.op.PigOperator 20 | 21 | object PrettyPrinter extends org.kiama.output.PrettyPrinter{ 22 | def pretty(op: PigOperator): String = { 23 | super.pretty(show(op)) 24 | } 25 | 26 | def show(op: PigOperator): Doc = { 27 | val prettyInputs = op.inputs.map(p => show(p.producer)) 28 | parens ( 29 | value(op) 30 | <> nest( 31 | line 32 | <> ssep(prettyInputs, line))) 33 | } 34 | 35 | def show(p: List[PigOperator]): Doc = any(p) 36 | } 37 | -------------------------------------------------------------------------------- /ceplib/src/main/scala/dbis/piglet/cep/engines/AnyMatch.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.cep.engines 2 | 3 | import dbis.piglet.backends.{SchemaClass => Event} 4 | import scala.reflect.ClassTag 5 | import dbis.piglet.cep.nfa.NFAStructure 6 | import dbis.piglet.cep.nfa.NFAController 7 | import dbis.piglet.cep.nfa.NormalState 8 | import dbis.piglet.cep.ops.MatchCollector 9 | 10 | class AnyMatch[T <: Event: ClassTag](nfaController: NFAController[T], collector: MatchCollector[T]) extends CEPEngine(nfaController, collector) with Serializable { 11 | var statics: Long = 0 12 | override def runEngine(event: T): Unit = { 13 | runningStructursPool.foreach ( str => engineProcess(event, str)) 14 | createNewStructue(event) 15 | runGCStructures() 16 | } 17 | private[AnyMatch] def engineProcess(event: T, strInfo: (Long, NFAStructure[T])) { 18 | val currenStr = strInfo._2 19 | val result: Int = checkPredicate(event, currenStr) 20 | if (result != -1) { // the predicate is ok.
21 | val cloneStr = currenStr.clone 22 | runningStructursPool+= (structureID() -> cloneStr) 23 | currenStr.addEvent(event, currenStr.getCurrentState.asInstanceOf[NormalState[T]].getEdgeByIndex(result)) 24 | if (currenStr.complete) { //final state 25 | statics += 1 26 | collector + currenStr 27 | wantToDeletedStructurs += strInfo._1 28 | } 29 | } 30 | else 31 | wantToDeletedStructurs += strInfo._1 32 | } 33 | } -------------------------------------------------------------------------------- /src/test/scala/dbis/piglet/tools/CodeMatcherSpec.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.tools 2 | 3 | import org.scalatest.{Matchers, FlatSpec} 4 | 5 | /** 6 | * Created by kai on 18.11.15. 7 | */ 8 | class CodeMatcherSpec extends FlatSpec with Matchers { 9 | "The SnippetMatcher" should "match two equal strings" in { 10 | SnippetMatcher.matches("abc 12 def", "abc 12 def") should be (true) 11 | } 12 | 13 | it should "not match two different strings" in { 14 | SnippetMatcher.matches("abc 12 def", "abc 12") should be (false) 15 | } 16 | 17 | it should "match a string to a corresponding template" in { 18 | SnippetMatcher.matches("abc x_8_ def x_8_", "abc x_$1_ def x_$1_") should be (true) 19 | } 20 | 21 | it should "match a string to another corresponding template" in { 22 | SnippetMatcher.matches("abc x_6_ def x_7_", "abc x_$2_ def x_$1_") should be (true) 23 | } 24 | 25 | it should "match a string with longer ids to a corresponding template" in { 26 | SnippetMatcher.matches("abc x_82_ def x_82_", "abc x_$1_ def x_$1_") should be (true) 27 | } 28 | 29 | it should "match a string with different longer ids to a corresponding template" in { 30 | SnippetMatcher.matches("abc x_82_ def x_83_", "abc x_$1_ def x_$2_") should be (true) 31 | } 32 | 33 | it should "not match a string to a wrong template" in { 34 | SnippetMatcher.matches("abc x_8_ def x_9_", "abc x_$1_ def x_$1 _") should be (false) 35 | } 36 | } 37 | 38 | -------------------------------------------------------------------------------- /setm/src/main/scala/dbis/setm/SETM.scala: -------------------------------------------------------------------------------- 1 | package dbis.setm 2 | 3 | import etm.core.configuration.BasicEtmConfigurator 4 | import etm.core.configuration.EtmManager 5 | import etm.core.renderer.{MeasurementRenderer, SimpleTextRenderer} 6 | 7 | /** 8 | * SETM is a simple wrapper for JETM to provide a 9 | * more Scala-like usage. 
10 | 11 | * See http://jetm.void.fm/ 12 | */ 13 | object SETM { 14 | BasicEtmConfigurator.configure(true) // nested 15 | private val monitor = EtmManager.getEtmMonitor() 16 | 17 | // Start monitoring 18 | monitor.start() 19 | 20 | var quiet: Boolean = false 21 | def enable = monitor.enableCollection() 22 | def disable = monitor.disableCollection() 23 | /** 24 | * Stop monitoring, collect results and render them 25 | * 26 | * Results are rendered with a SimpleTextRenderer. 27 | */ 28 | def collect() = { 29 | monitor.render(new SimpleTextRenderer()) 30 | monitor.stop() 31 | } 32 | 33 | /** 34 | * Measure execution time of the given function 35 | * 36 | * @param name A human readable name to identify this timing measurement 37 | * @param f The function to measure execution time of 38 | */ 39 | def timing[T](name: String)(f: => T) = { 40 | val p = monitor.createPoint(name) 41 | if(!quiet) 42 | print(s"==> $name \r") 43 | 44 | try { 45 | f 46 | } finally { 47 | p.collect 48 | } 49 | } 50 | } 51 | 52 | -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/op/Describe.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package dbis.piglet.op 18 | 19 | /** 20 | * Describe represents the DESCRIBE operator of Pig. 21 | * 22 | * @param in the input pipe 23 | */ 24 | case class Describe(private val in: Pipe) extends PigOperator(List(), List(in)) { 25 | 26 | /** 27 | * Returns the lineage string describing the sub-plan producing the input for this operator. 28 | * 29 | * @return a string representation of the sub-plan. 30 | */ 31 | override def lineageString: String = { 32 | s"""DESCRIBE%""" + super.lineageString 33 | } 34 | 35 | override def toString = 36 | s"""DESCRIBE 37 | | in = $inPipeName""".stripMargin 38 | } 39 | -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/codegen/scala_lang/LimitEmitter.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.codegen.scala_lang 2 | 3 | import dbis.piglet.codegen.{CodeEmitter, CodeGenContext} 4 | import dbis.piglet.op.Limit 5 | 6 | /** 7 | * Created by kai on 03.12.16.
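A usage sketch for SETM (illustrative): wrap arbitrary expressions in timing(...) and render the aggregated measurements once at the end; this assumes JETM is on the classpath, as in this project.

import dbis.setm.SETM

val sum = SETM.timing("expensive step") {
  (1 to 1000000).sum   // the measured expression's value is returned unchanged
}
SETM.collect()          // prints all measurement points and stops the monitor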
8 | */ 9 | class LimitEmitter extends CodeEmitter[Limit] { 10 | // val <out> = sc.parallelize(<in>.take(<num>)) 11 | override def template: String = smallLimitTemplate 12 | 13 | lazy val smallLimitTemplate = s"""val <out> = sc.parallelize(<in>.take(<num>)).map{e => 14 | | PerfMonitor.sampleSize(e,"<lineage>", accum, randFactor) 15 | | e 16 | |}""".stripMargin 17 | 18 | 19 | lazy val largeLimitTemplate = """val <out> = <in>.zipWithIndex.filter{case (_,idx) => idx \< <num>}.map{t => 20 | | val res = t._1 21 | | 22 | | PerfMonitor.sampleSize(res,"<lineage>", accum, randFactor) 23 | | 24 | | res 25 | |}""".stripMargin 26 | 27 | override def code(ctx: CodeGenContext, op: Limit): String = { 28 | 29 | val params = Map( 30 | "out" -> op.outPipeName, 31 | "in" -> op.inPipeName, 32 | "num" -> op.num, 33 | "lineage" -> op.lineageSignature) 34 | 35 | if(op.num > 1000) 36 | CodeEmitter.render(largeLimitTemplate,params) 37 | else 38 | render(params) 39 | 40 | } 41 | 42 | } 43 | 44 | object LimitEmitter { 45 | lazy val instance = new LimitEmitter 46 | } -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/codegen/scala_lang/LoadEmitter.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.codegen.scala_lang 2 | 3 | import dbis.piglet.backends.BackendManager 4 | import dbis.piglet.codegen.{CodeEmitter, CodeGenContext} 5 | import dbis.piglet.op.Load 6 | 7 | /** 8 | * Created by kai on 03.12.16. 9 | */ 10 | class LoadEmitter extends CodeEmitter[Load] { 11 | override def template: String = 12 | // """val <out> = <func>.load[<class>](sc,"<file>", <extractor>, <params>)""" 13 | """ val <out> = <func>[<class>](<if (profiling)>randFactor<endif>).load(sc, "<file>", <extractor><if (params)>, <params><endif><if (profiling)>, lineageAndAccum = Some(("<lineage>",accum))<else>, lineageAndAccum = None<endif>)""".stripMargin 14 | 15 | 16 | 17 | override def code(ctx: CodeGenContext, op: Load): String = { 18 | var paramMap = ScalaEmitter.emitExtractorFunc(op, op.loaderFunc) 19 | paramMap += ("out" -> op.outPipeName) 20 | paramMap += ("file" -> op.file.toString) 21 | paramMap += ("lineage" -> op.lineageSignature) 22 | if (op.loaderFunc.isEmpty) 23 | paramMap += ("func" -> BackendManager.backend.defaultConnector) 24 | else { 25 | paramMap += ("func" -> op.loaderFunc.get) 26 | if (op.loaderParams != null && op.loaderParams.nonEmpty) 27 | paramMap += ("params" -> op.loaderParams.mkString(",")) 28 | } 29 | render(paramMap) 30 | } 31 | } 32 | 33 | object LoadEmitter { 34 | lazy val instance = new LoadEmitter 35 | } 36 | -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/op/Limit.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License.
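The two limit strategies the emitter switches between at num = 1000 can be mimicked on plain collections (illustrative; the generated code targets RDDs, where take() materializes on the driver while zipWithIndex stays distributed):

val data = (1 to 10).toVector

// small limits: materialize via take() and re-parallelize
val small = data.take(3)

// large limits: stay distributed and filter on a zipWithIndex index
val large = data.zipWithIndex.collect { case (v, idx) if idx < 3 => v }

assert(small == large)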
16 | */ 17 | package dbis.piglet.op 18 | 19 | /** 20 | * Limit represents the LIMIT operator of Pig. 21 | * 22 | * @param out the output pipe (relation). 23 | * @param in the input pipe. 24 | * @param num the maximum number of tuples produced by this operator 25 | */ 26 | case class Limit(private val out: Pipe, private val in: Pipe, num: Int) extends PigOperator(out, in) { 27 | 28 | override def lineageString: String = { 29 | s"""LIMIT%$num%""" + super.lineageString 30 | } 31 | 32 | override def toString = 33 | s"""LIMIT 34 | | out = $outPipeName 35 | | in = $inPipeName 36 | | num = $num""".stripMargin 37 | 38 | } 39 | -------------------------------------------------------------------------------- /src/main/resources/logback.xml: -------------------------------------------------------------------------------- (the XML markup of this 46-line file was lost in the export; only the console encoder pattern survived: %d{HH:mm:ss.SSS} %-5level %logger{5}: %msg%n) -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/op/cmd/DefineCmd.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package dbis.piglet.op.cmd 19 | 20 | import dbis.piglet.op.PigOperator 21 | import dbis.piglet.expr.Value 22 | 23 | 24 | /** 25 | * DefineCmd represents a pseudo operator for the DEFINE statement. This "operator" will 26 | * be eliminated during building the dataflow plan. 27 | * 28 | * @param alias the alias name of the UDF 29 | * @param scalaName the fully qualified Scala name of the function 30 | * @param paramList a list of values used as the first standard parameters in the function call 31 | */ 32 | case class DefineCmd(alias: String, scalaName: String, paramList: List[Value]) extends PigOperator(List(), List()) 33 | 34 | 35 | -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/op/Display.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License.
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package dbis.piglet.op 18 | 19 | /** 20 | * Display represents the DISPLAY operator used to produce data for Zeppelin. 21 | * 22 | * @param in the input pipe 23 | */ 24 | case class Display(private val in: Pipe) extends PigOperator(List(), List(in)) { 25 | 26 | /** 27 | * Returns the lineage string describing the sub-plan producing the input for this operator. 28 | * 29 | * @return a string representation of the sub-plan. 30 | */ 31 | override def lineageString: String = { 32 | s"""DISPLAY%""" + super.lineageString 33 | } 34 | 35 | override def toString = 36 | s"""DISPLAY 37 | | in = $inPipeName""".stripMargin 38 | } 39 | -------------------------------------------------------------------------------- /make-distribution.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | TRUE=0 4 | FALSE=1 5 | 6 | PIGLET_HOME=. 7 | 8 | FILES=(target/scala-2.11/piglet.jar 9 | sparklib/target/scala-2.11/sparklib_2.11-*.jar 10 | flinklib/target/scala-2.11/flinklib_2.11-*.jar 11 | common/target/scala-2.11/common_2.11-*.jar 12 | ceplib/target/scala-2.11/ceplib_2.11-*.jar 13 | mapreducelib/target/scala-2.11/mapreduce_2.11-*.jar 14 | script/piglet) 15 | 16 | TARGET_DIR=$PIGLET_HOME/piglet-dist 17 | 18 | 19 | function checkfile { 20 | # echo "checking file $1" 21 | if [ -r $1 ]; then 22 | FEXISTS=$TRUE 23 | else 24 | FEXISTS=$FALSE 25 | fi 26 | 27 | } 28 | 29 | if [ -z "$PIGLET_HOME" ]; then 30 | echo "Please set PIGLET_HOME" 31 | exit 1 32 | fi 33 | 34 | rm -rf $TARGET_DIR 35 | mkdir $TARGET_DIR 36 | 37 | for f in ${FILES[@]} 38 | do 39 | echo -ne "\r copying $f " 40 | sourcefile=$PIGLET_HOME/$f 41 | checkfile $sourcefile 42 | if [ $FEXISTS -eq $TRUE ]; then 43 | # targetfile=$TARGET_DIR/$f 44 | cp --parents $sourcefile $TARGET_DIR 45 | else 46 | echo "File $f does not exist - aborting" 47 | rm -rf $TARGET_DIR 48 | exit 1 49 | fi 50 | done 51 | echo -e "\rcopied files " 52 | 53 | echo -n "creating archive..."
54 | tar jcf ${TARGET_DIR}.tar.bz2 ${TARGET_DIR} 55 | echo -e "\rcreated archive at ${TARGET_DIR}.tar.bz2" 56 | 57 | echo "cleanup" 58 | rm -rf $TARGET_DIR 59 | -------------------------------------------------------------------------------- /src/test/resources/logback-test.xml: -------------------------------------------------------------------------------- (XML markup lost in the export; only the encoder pattern survived: %d{HH:mm:ss.SSS} [%thread] %-5level %logger{5} - %msg%n) -------------------------------------------------------------------------------- /ceplib/src/test/resources/logback-test.xml: -------------------------------------------------------------------------------- (XML markup lost in the export; only the encoder pattern survived: %d{HH:mm:ss.SSS} [%thread] %-5level %logger{5} - %msg%n) -------------------------------------------------------------------------------- /flinklib/src/test/resources/logback-test.xml: -------------------------------------------------------------------------------- (XML markup lost in the export; only the encoder pattern survived: %d{HH:mm:ss.SSS} [%thread] %-5level %logger{5} - %msg%n) -------------------------------------------------------------------------------- /sparklib/src/test/resources/logback-test.xml: -------------------------------------------------------------------------------- (XML markup lost in the export; only the encoder pattern survived: %d{HH:mm:ss.SSS} [%thread] %-5level %logger{5} - %msg%n) -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/op/RScript.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package dbis.piglet.op 18 | 19 | import dbis.piglet.schema.Schema 20 | 21 | /** 22 | * Created by kai on 13.07.15.
23 | */ 24 | case class RScript( 25 | private val out: Pipe, 26 | private val in: Pipe, 27 | script: String, 28 | loadSchema: Option[Schema] = None 29 | ) extends PigOperator(List(out), List(in), loadSchema) { 30 | 31 | override def lineageString: String = s"""STREAM%""" + super.lineageString 32 | 33 | override def toString = 34 | s"""RScript 35 | | out = $outPipeName 36 | | in = $inPipeName 37 | | script = $script 38 | | load schema = $loadSchema 39 | """.stripMargin 40 | 41 | } 42 | 43 | -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/plan/rewriting/internals/MutingSupport.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.plan.rewriting.internals 2 | 3 | import dbis.piglet.tools.logging.PigletLogging 4 | import dbis.piglet.plan.DataflowPlan 5 | import dbis.piglet.op.Store 6 | import dbis.piglet.op.Dump 7 | import dbis.piglet.plan.rewriting.Rewriter 8 | import org.kiama.rewriting.Rewriter._ 9 | import org.kiama.rewriting.Strategy 10 | 11 | trait MutingSupport extends PigletLogging { 12 | 13 | def mute(plan: DataflowPlan): DataflowPlan = { 14 | 15 | val strategy = (op: Any) => op match { 16 | case s: Store => 17 | val dump = Dump(s.inputs.head, mute = true) 18 | Rewriter.replace(plan, s, dump) 19 | logger.debug(s"replaced $s with $dump") 20 | Some(dump) 21 | case d: Dump if !d.mute => 22 | logger.debug(s"muting $d") 23 | d.mute = true 24 | Some(d) 25 | 26 | case _ => None 27 | } 28 | 29 | 30 | Rewriter.rewritePlan(plan, manybu(strategyf(t => strategy(t)))) 31 | // var newPlan = plan 32 | // 33 | // val sinks = newPlan.sinkNodes 34 | // 35 | // sinks.foreach { sink => sink match { 36 | // case s: Store => 37 | // val dump = Dump(s.inputs.head, quietMode = true) 38 | // newPlan = newPlan.replace(s, dump) 39 | // case d: Dump if !d.quietMode => 40 | // d.quietMode = true 41 | // case _ => // ignore other consumers (such as display, empty) 42 | // 43 | // } 44 | // } 45 | // 46 | // newPlan 47 | // 48 | } 49 | } -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/codegen/flink/emitter/SocketWriteEmitter.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.codegen.flink.emitter 2 | 3 | import dbis.piglet.op.SocketWrite 4 | import dbis.piglet.codegen.CodeGenContext 5 | import dbis.piglet.backends.BackendManager 6 | import dbis.piglet.codegen.CodeEmitter 7 | import dbis.piglet.codegen.scala_lang.ScalaEmitter 8 | 9 | 10 | 11 | class SocketWriteEmitter extends CodeEmitter[SocketWrite] { 12 | override def template: String = """ 13 | | []().zmqPublish(":", , ) 14 | | 15 | | []().bind("", , , ) 16 | |""".stripMargin 17 | 18 | override def code(ctx: CodeGenContext, op: SocketWrite): String = { 19 | var paramMap = Map("in" -> op.inPipeName, "addr" -> op.addr, 20 | "func" -> op.func.getOrElse(BackendManager.backend.defaultConnector)) 21 | op.schema match { 22 | case Some(s) => paramMap += ("class" -> ScalaEmitter.schemaClassName(s.className)) 23 | case None => paramMap += ("class" -> "Record") 24 | } 25 | if (op.mode != "") paramMap += ("mode" -> op.mode) 26 | if (op.params != null && op.params.nonEmpty) paramMap += ("params" -> op.params.mkString(",")) 27 | render(paramMap) 28 | } 29 | } 30 | 31 | object SocketWriteEmitter { 32 | lazy val instance = new SocketWriteEmitter 33 | } -------------------------------------------------------------------------------- 
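The same pattern recurs in every emitter in this code base: a `template` string with named slots and a `code()` method that fills the slots from the operator and calls `render()`. A minimal sketch with a hypothetical pass-through operator (EchoOp and EchoEmitter are illustrations, not part of Piglet; the `<out>`/`<in>` placeholder delimiters are an assumption, since the dumped templates do not preserve the original delimiter characters):

import dbis.piglet.codegen.{CodeEmitter, CodeGenContext}
import dbis.piglet.op.{Pipe, PigOperator}

// Hypothetical one-input/one-output operator, mirroring the operators in this dump.
case class EchoOp(private val out: Pipe, private val in: Pipe)
    extends PigOperator(out, in) {
  override def lineageString: String = s"""ECHO%""" + super.lineageString
}

// The emitter contract: a template with named slots, filled in code().
class EchoEmitter extends CodeEmitter[EchoOp] {
  override def template: String = """val <out> = <in>.map(identity)"""

  override def code(ctx: CodeGenContext, op: EchoOp): String =
    render(Map("out" -> op.outPipeName, "in" -> op.inPipeName))
}

object EchoEmitter {
  lazy val instance = new EchoEmitter
}

--------------------------------------------------------------------------------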
/src/main/scala/dbis/piglet/op/Dump.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package dbis.piglet.op 18 | 19 | /** 20 | * Dump represents the DUMP operator of Pig. 21 | * 22 | * @param in the input pipe 23 | */ 24 | case class Dump(private val in: Pipe, var mute: Boolean = false) extends PigOperator(List(), List(in)) { 25 | 26 | /** 27 | * Returns the lineage string describing the sub-plan producing the input for this operator. 28 | * 29 | * @return a string representation of the sub-plan. 30 | */ 31 | override def lineageString: String = { 32 | s"""DUMP%""" + super.lineageString 33 | } 34 | 35 | override def toString = 36 | s"""DUMP 37 | | in = $inPipeName 38 | | ${if(mute) "muted" else ""}""".stripMargin 39 | 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/codegen/spark/StreamGroupingEmitter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package dbis.piglet.codegen.spark 18 | 19 | import dbis.piglet.codegen.scala_lang.GroupingEmitter 20 | 21 | class StreamGroupingEmitter extends GroupingEmitter { 22 | override def template: String = """ 23 | | val = .transform(rdd => rdd.groupBy(t => {}).map{case (k,v) => (,v)}) 24 | | 25 | | val = .transform(rdd => rdd.coalesce(1).glom.map(t => ("all", t))) 26 | |""".stripMargin 27 | } 28 | 29 | object StreamGroupingEmitter { 30 | lazy val instance = new StreamGroupingEmitter 31 | } -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/op/Top.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. 
See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package dbis.piglet.op 18 | 19 | /** An operator for top-k queries. 20 | * 21 | * It can also be used for bottom-k by changing the orderSpec 22 | * 23 | * @param out 24 | * @param in 25 | * @param orderSpec 26 | * @param num 27 | */ 28 | case class Top( 29 | private val out: Pipe, 30 | private val in: Pipe, 31 | orderSpec: List[OrderBySpec], 32 | num: Int 33 | ) extends PigOperator(out, in) { 34 | 35 | override def lineageString: String = s"""TOP$num""" + super.lineageString 36 | 37 | override def toString = 38 | s"""TOP 39 | | out = $outPipeName 40 | | in = $inPipeName 41 | | order = ${orderSpec.mkString(",")} 42 | | num = $num""".stripMargin 43 | } 44 | -------------------------------------------------------------------------------- /ceplib/src/main/scala/dbis/piglet/cep/spark/RDDMatcher.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.cep.spark 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.{Partition, TaskContext} 5 | import scala.reflect.ClassTag 6 | import dbis.piglet.cep.nfa.NFAController 7 | import dbis.piglet.cep.engines._ 8 | import dbis.piglet.cep.ops.SelectionStrategy._ 9 | import dbis.piglet.cep.ops.OutputStrategy._ 10 | import dbis.piglet.backends.{SchemaClass => Event} 11 | import dbis.piglet.cep.ops.MatchCollector 12 | import dbis.piglet.cep.ops.SelectionStrategy 13 | 14 | class RDDMatcher[T <: Event: ClassTag](parent: RDD[T], nfa: NFAController[T], sstr: SelectionStrategy = SelectionStrategy.FirstMatch, out: OutputStrategy = Combined) extends RDD[T](parent){ 15 | val collector: MatchCollector[T] = new MatchCollector() 16 | val engine: CEPEngine[T] = sstr match { 17 | case SelectionStrategy.FirstMatch => new FirstMatch(nfa, collector) 18 | case SelectionStrategy.AllMatches => new AnyMatch(nfa, collector) 19 | case SelectionStrategy.NextMatches => new NextMatch(nfa, collector) 20 | case SelectionStrategy.ContiguityMatches => new ContiguityMatch(nfa, collector) 21 | case _ => throw new Exception("The Strategy is not supported") 22 | 23 | } 24 | override def compute(split: Partition, context: TaskContext): Iterator[T] = { 25 | firstParent[T].iterator(split, context).foreach (event => engine.runEngine(event)) 26 | collector.convertEventsToArray().iterator 27 | } 28 | 29 | 30 | override protected def getPartitions: Array[Partition] = firstParent[Event].partitions 31 | } -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/codegen/scala_lang/StreamOpEmitter.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.codegen.scala_lang 2 | 3 | import dbis.piglet.codegen.{CodeEmitter, CodeGenContext, CodeGenException} 4 | import dbis.piglet.expr.Ref 5 | import 
dbis.piglet.op.StreamOp 6 | 7 | /** 8 | * Created by kai on 01.12.16. 9 | */ 10 | class StreamOpEmitter extends CodeEmitter[StreamOp] { 11 | override def template: String = """ val _helper = .map(t => List()) 12 | | val = (sc, _helper).map(t => ()) 13 | |""".stripMargin 14 | 15 | override def code(ctx: CodeGenContext, op: StreamOp): String = { 16 | if(op.schema.isEmpty) { 17 | throw CodeGenException("Schema must be set for STREAM THROUGH operator") 18 | } 19 | 20 | val className = ScalaEmitter.schemaClassName(op.schema.get.className) 21 | 22 | val inFields = op.inputSchema.get.fields.zipWithIndex.map{ case (f, i) => s"t._$i"}.mkString(", ") 23 | val outFields = op.schema.get.fields.zipWithIndex.map{ case (f, i) => s"t($i).asInstanceOf[${ScalaEmitter.scalaTypeMappingTable(f.fType)}]"}.mkString(", ") 24 | 25 | render(Map("out" -> op.outPipeName, 26 | "op" -> op.opName, 27 | "in" -> op.inPipeName, 28 | "class" -> className, 29 | "in_fields" -> inFields, 30 | "out_fields" -> outFields, 31 | "params" -> ScalaEmitter.emitParamList(CodeGenContext(ctx, Map("schema" -> op.schema)), op.params))) 32 | } 33 | } 34 | 35 | object StreamOpEmitter { 36 | lazy val instance = new StreamOpEmitter 37 | } -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/plan/rewriting/dsl/traits/EndWordT.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package dbis.piglet.plan.rewriting.dsl.traits 18 | 19 | /** A trait supplying methods to set the function in a [[dbis.piglet.plan.rewriting.dsl.traits.BuilderT]] and call its 20 | * apply method. 21 | */ 22 | trait EndWordT[FROM, TO] { 23 | val b: BuilderT[FROM, TO] 24 | 25 | /** Apply ``f`` (a total function) when rewriting. 26 | * 27 | * @param f 28 | */ 29 | def applyRule(f: (FROM => Option[TO])): Unit = { 30 | b.func = f 31 | b.build() 32 | } 33 | 34 | /** Apply ``f`` (a partial function) when rewriting. 35 | * 36 | */ 37 | def applyPattern(f: scala.PartialFunction[FROM, TO]): Unit = { 38 | val lifted = f.lift 39 | 40 | b.func = lifted 41 | b.build() 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/op/Distinct.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package dbis.piglet.op 18 | 19 | /** 20 | * Distinct represents the DISTINCT operator of Pig. 21 | * 22 | * @param out the output pipe (relation). 23 | * @param in the input pipe. 24 | * @param windowMode true if processed on a window on a data stream 25 | */ 26 | case class Distinct( 27 | private val out: Pipe, 28 | private val in: Pipe, 29 | var windowMode: Boolean = false 30 | ) extends PigOperator(out, in) { 31 | 32 | override def lineageString: String = { 33 | s"""DISTINCT%""" + super.lineageString 34 | } 35 | 36 | override def toString = 37 | s"""DISTINCT 38 | | out = $outPipeName 39 | | in = $inPipeName 40 | | inSchema = $inputSchema 41 | | outSchema = $schema""".stripMargin 42 | 43 | } 44 | -------------------------------------------------------------------------------- /Zeppelin.md: -------------------------------------------------------------------------------- 1 | ## Zeppelin integration 2 | 3 | We provide an integration with Apache Zeppelin - a web-based notebook 4 | for data analytics. It allows you to write and execute Piglet scripts in 5 | notebooks and to visualize the results directly. For installation you 6 | need 7 | * the Zeppelin project from [here](https://zeppelin.incubator.apache.org/) 8 | * Spark 1.5 built with Scala 2.11 9 | * the zeppelin branch of Piglet 10 | 11 | Setting up the Piglet interpreter requires the following steps: 12 | 1. Build the zeppelin interpreter with sbt: 13 | ``` 14 | sbt> package 15 | sbt> assembly 16 | sbt> project zeppelin 17 | sbt> package 18 | ``` 19 | 20 | 1. Copy the following Jar files to ZEPPELIN_HOME/interpreter/piglet 21 | * PIGLET_HOME/common/target/scala-2.11/common_2.11-0.3.jar 22 | * PIGLET_HOME/sparklib/target/scala-2.11/sparklib_2.11-0.3.jar 23 | * PIGLET_HOME/target/scala-2.11/PigCompiler.jar 24 | * PIGLET_HOME/zeppelin/target/scala-2.11/piglet-interpreter_2.11-0.3.jar 25 | * spark-assembly-1.5.2-hadoop2.6.0.jar 26 | 27 | 1. Register the Piglet interpreter in ZEPPELIN_HOME/conf/zeppelin-site.xml 28 | by adding `dbis.piglet.PigletInterpreter` to the property value 29 | `zeppelin.interpreters`. 30 | 31 | 1. In Zeppelin, go to `INTERPRETER` and click `+Create`. Enter a name (piglet) 32 | and select the Piglet interpreter from the drop-down menu. Currently, no 33 | additional properties have to be set. 34 | 35 | 1. Create a new notebook, go to "Interpreter binding", and activate 36 | `piglet %piglet`. 37 | 38 | 1. Enter your script and mark it as Piglet using `%piglet`. 39 | 40 | 1. Note that you have to use `DISPLAY relation` instead of `DUMP` to visualize the result.
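The interpreter delegates to `PigletInterpreterAPI.createCodeFromInput`, which is also included later in this dump and can be called directly, e.g. to inspect the Scala code generated for a script. A minimal sketch (the backend name `"spark"` and the script itself are made-up examples):

```scala
import dbis.piglet.api.PigletInterpreterAPI

object GenerateCodeDemo {
  def main(args: Array[String]): Unit = {
    // a tiny Piglet script; DISPLAY (instead of DUMP) produces Zeppelin output
    val script =
      """a = LOAD 'file.csv' USING PigStorage(',') AS (x: int, y: chararray);
        |b = FILTER a BY x > 0;
        |DISPLAY b;
        |""".stripMargin
    // returns the generated Scala program, or "" if the plan is not connected
    val code = PigletInterpreterAPI.createCodeFromInput(script, "spark")
    println(code)
  }
}
```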
-------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/codegen/flink/emitter/StreamFilterEmitter.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.codegen.flink.emitter 2 | 3 | import dbis.piglet.codegen.{ CodeEmitter, CodeGenContext, CodeGenException } 4 | import dbis.piglet.op.Filter 5 | import dbis.piglet.codegen.scala_lang.ScalaEmitter 6 | 7 | class StreamFilterEmitter extends CodeEmitter[Filter] { 8 | override def template: String = """ 9 | | val = .mapWindow(customFilter _) 10 | | 11 | | val = .filter(t => {}) 12 | | 13 | |""".stripMargin 14 | def templateHelper: String = """ .filter(t => {})""".stripMargin 15 | 16 | def windowApply(ctx: CodeGenContext, op: Filter): String = { 17 | CodeEmitter.render(templateHelper, Map("pred" -> ScalaEmitter.emitPredicate(CodeGenContext(ctx, Map("schema" -> op.schema)), op.pred))) 18 | } 19 | 20 | override def code(ctx: CodeGenContext, op: Filter): String = { 21 | if (op.windowMode) return "" 22 | if (!op.schema.isDefined) 23 | throw CodeGenException("FILTER requires a schema definition") 24 | 25 | val className = ScalaEmitter.schemaClassName(op.schema.get.className) 26 | render(Map("out" -> op.outPipeName, 27 | "in" -> op.inPipeName, 28 | "class" -> className, 29 | "pred" -> ScalaEmitter.emitPredicate(CodeGenContext(ctx, Map[String, Any]("schema" -> op.schema)), op.pred))) 30 | } 31 | } 32 | 33 | object StreamFilterEmitter { 34 | lazy val instance = new StreamFilterEmitter 35 | } -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/op/Tuplify.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package dbis.piglet.op 18 | 19 | import dbis.piglet.expr.Ref 20 | 21 | /** 22 | * 23 | * @param initialOutPipeName the name of the initial output pipe (relation) which is needed to construct the plan, but 24 | * can be changed later. 
25 | * @param initialInPipeName 26 | * @param ref a reference des 27 | */ 28 | case class Tuplify(private val out: Pipe, private val in: Pipe, ref: Ref) extends PigOperator(out, in) { 29 | 30 | override def lineageString: String = s"""TUPLIFY%""" + super.lineageString 31 | 32 | // TODO 33 | override def checkSchemaConformance: Boolean = true 34 | 35 | override def toString = 36 | s"""TUPLIFY 37 | | out = $outPipeName 38 | | in = $inPipeName 39 | | ref = $ref""".stripMargin 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/plan/rewriting/dsl/builders/ReplacementBuilder.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package dbis.piglet.plan.rewriting.dsl.builders 18 | 19 | import dbis.piglet.op.PigOperator 20 | import dbis.piglet.plan.rewriting.Rewriter 21 | import dbis.piglet.plan.rewriting.dsl.traits.BuilderT 22 | 23 | import scala.reflect.ClassTag 24 | 25 | /** A builder for applying a rewriting method that rewrites a single [[dbis.piglet.op.PigOperator]] to another one. 26 | * 27 | * @tparam FROM 28 | * @tparam TO 29 | */ 30 | class ReplacementBuilder[FROM <: PigOperator : ClassTag, TO <: PigOperator : ClassTag] extends 31 | PigOperatorBuilder[FROM, TO] { 32 | override def wrapInFixer(func: (FROM => Option[TO])): (FROM => Option[TO]) = func 33 | 34 | override def addAsStrategy(func: (FROM => Option[TO])) = { 35 | Rewriter.addTypedStrategy(func) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /mapreducelib/src/main/scala/dbis/piglet/backends/mapreduce/PigRun.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.backends.mapreduce 2 | 3 | import dbis.piglet.backends.BackendConf 4 | import dbis.piglet.backends.PigletBackend 5 | import java.nio.file.Path 6 | import org.apache.pig.PigServer 7 | import org.apache.pig.ExecType 8 | import org.apache.pig.tools.pigstats.PigProgressNotificationListener 9 | import org.apache.pig.PigRunner 10 | 11 | /** 12 | * @author hage 13 | */ 14 | class PigRun extends PigletBackend with BackendConf { 15 | 16 | override def execute(master: String, className: String, jarFile: Path, backendArgs: Map[String,String], profiling: Boolean) = ??? 
17 | 18 | override def executeRaw(program: Path, master: String, backendArgs: Map[String,String]) { 19 | 20 | val ba = backendArgs.flatMap{ case (k,v) => Array(k,v)} 21 | 22 | val args = Array("-x", execType(master), program.toAbsolutePath().toString() ) ++ ba 23 | 24 | val stats = PigRunner.run(args, null) 25 | 26 | } 27 | 28 | /** 29 | * Get the name of this backend 30 | * 31 | * @return Returns the name of this backend 32 | */ 33 | override def name: String = "MapReduce - Pig" 34 | 35 | /** 36 | * Get the path to the runner class that implements the PigletBackend interface 37 | */ 38 | override def runnerClass: PigletBackend = this 39 | 40 | override def templateFile = null 41 | 42 | override def defaultConnector = "PigStorage" 43 | 44 | override def raw = true 45 | 46 | private def execType(master: String) = if(master.startsWith("local")) "local" else "mapreduce" 47 | // implicit private def execType(master: String) = if(master.toLowerCase().startsWith("local")) ExecType.LOCAL else ExecType.MAPREDUCE 48 | } 49 | -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/codegen/flink/emitter/SocketReadEmitter.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.codegen.flink.emitter 2 | 3 | import dbis.piglet.codegen.CodeEmitter 4 | import dbis.piglet.op.SocketRead 5 | import dbis.piglet.codegen.CodeGenContext 6 | import dbis.piglet.backends.BackendManager 7 | import dbis.piglet.codegen.scala_lang.ScalaEmitter 8 | 9 | class SocketReadEmitter extends CodeEmitter[SocketRead] { 10 | override def template: String = """ 11 | | val = []().zmqSubscribe(env, ":", ) 12 | | 13 | | val = []().connect(env, "", , ) 14 | |""".stripMargin 15 | 16 | override def code(ctx: CodeGenContext, op: SocketRead): String = { 17 | var paramMap = ScalaEmitter.emitExtractorFunc(op, op.streamFunc) 18 | op.schema match { 19 | case Some(s) => paramMap += ("class" -> ScalaEmitter.schemaClassName(s.className)) 20 | case None => paramMap += ("class" -> "Record") 21 | } 22 | val params = if (op.streamParams != null && op.streamParams.nonEmpty) ", " + op.streamParams.mkString(",") else "" 23 | val func = op.streamFunc.getOrElse(BackendManager.backend.defaultConnector) 24 | paramMap ++= Map( 25 | "out" -> op.outPipeName, 26 | "addr" -> op.addr, 27 | "func" -> func, 28 | "params" -> params) 29 | if (op.mode != "") paramMap += ("mode" -> op.mode) 30 | render(paramMap) 31 | } 32 | } 33 | 34 | object SocketReadEmitter { 35 | lazy val instance = new SocketReadEmitter 36 | } -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/plan/rewriting/internals/EmbedSupport.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package dbis.piglet.plan.rewriting.internals 18 | 19 | import com.twitter.util.Eval 20 | import org.kiama.rewriting.Strategy 21 | 22 | /** Provides methods for evaluating embedded code. 23 | * 24 | */ 25 | trait EmbedSupport { 26 | /** The imports that are automatically added to eval'd code 27 | * 28 | */ 29 | private val imports = """ 30 | |import dbis.piglet.op._ 31 | |import dbis.piglet.plan.rewriting.Extractors._ 32 | |import dbis.piglet.plan.rewriting.Rewriter._ 33 | """.stripMargin 34 | 35 | /** Evals each String in ``ruleCode`` 36 | */ 37 | protected def evalExtraRuleCode(ruleCode: Seq[String]): Unit = 38 | ruleCode map { imports ++ _ } map { c => (new Eval).apply[scala.runtime.BoxedUnit](c) } 39 | } 40 | -------------------------------------------------------------------------------- /ceplib/src/main/scala/dbis/piglet/cep/ops/Strategies.scala: -------------------------------------------------------------------------------- 1 | 2 | package dbis.piglet.cep.ops 3 | import dbis.piglet.backends.{SchemaClass => Event} 4 | 5 | package object OutputTypes { 6 | type PossibleTypes = Event with Boolean 7 | } 8 | /** 9 | * @brief This enumeration represents the available selection strategies 10 | * according to which the matching is done. 11 | */ 12 | object SelectionStrategy extends Enumeration { 13 | type SelectionStrategy = Value 14 | val NextMatches, AllMatches, ContiguityMatches, FirstMatch, RecentMatch = Value 15 | } 16 | 17 | /** 18 | * @brief This enumeration represents the available output strategies 19 | * according to which the result output is generated. 20 | * The output of this operator is a complex event, i.e. a combination of tuples: 21 | * OneByOne generates the tuples one after another, so the resulting tuples 22 | * have a fixed schema, whereas Combined merges all tuples of a complex event 23 | * into one big tuple with a variable schema. 24 | * 25 | */ 26 | object OutputStrategy extends Enumeration { 27 | type OutputStrategy = Value 28 | val OneByOne, Combined, TrueValues = Value 29 | } 30 | 31 | /** 32 | * @brief This enumeration represents the available evaluation models 33 | * for processing or detecting complex events. 34 | * This engine uses a non-deterministic finite automaton (NFA) based approach; 35 | * in addition, a tree-based evaluation model for pattern queries is used. 36 | * Each approach has its advantages and drawbacks in terms of performance, 37 | * optimization and expressiveness. 38 | * 39 | */ 40 | object MatchingStrategy extends Enumeration { 41 | type MatchingStrategy = Value 42 | val TreeBased, NFABased = Value 43 | } 44 | 45 | -------------------------------------------------------------------------------- /flinklib/src/main/scala/dbis/piglet/backends/flink/PigFuncs.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License.
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package dbis.piglet.backends.flink 19 | 20 | import java.util.Random 21 | 22 | import dbis.piglet.CommonPigFuncs 23 | import dbis.piglet.backends._ 24 | import org.apache.flink.api.common.typeinfo.TypeInformation 25 | import org.apache.flink.api.java.functions._ 26 | import org.apache.flink.api.scala._ 27 | 28 | import scala.reflect.ClassTag 29 | 30 | class CustomSampler[T <: SchemaClass: ClassTag: TypeInformation](dataSet: DataSet[T]) { 31 | def sample(withReplacement: Boolean, fraction: Double, seed: Long = new Random().nextLong()) = { 32 | dataSet.mapPartition(new SampleWithFraction[T](withReplacement, fraction, seed)) 33 | } 34 | 35 | } 36 | 37 | object Sampler { 38 | implicit def addSampler[T <: SchemaClass: ClassTag: TypeInformation](dataSet: DataSet[T]) = { 39 | new CustomSampler(dataSet) 40 | } 41 | } 42 | 43 | object PigFuncs extends CommonPigFuncs { 44 | } 45 | -------------------------------------------------------------------------------- /flinklib/src/main/scala/dbis/piglet/backends/flink/streaming/UTF8StringSchema.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package dbis.piglet.backends.flink.streaming 19 | 20 | import org.apache.commons.lang3.SerializationUtils 21 | import org.apache.flink.streaming.util.serialization._ 22 | import org.apache.flink.api.common.typeinfo.TypeInformation 23 | import org.apache.flink.api.java.typeutils.TypeExtractor 24 | 25 | class UTF8StringSchema extends DeserializationSchema[String] with SerializationSchema[String] { 26 | 27 | override def deserialize(message: Array[Byte]): String = { 28 | new String(message, "UTF-8") 29 | } 30 | 31 | override def isEndOfStream(nextElement: String): Boolean = { 32 | false 33 | } 34 | 35 | override def serialize(element: String): Array[Byte] = { 36 | element.getBytes("UTF-8") 37 | } 38 | 39 | override def getProducedType(): TypeInformation[String] = { 40 | TypeExtractor.getForClass(classOf[String]) 41 | } 42 | } 43 | 44 | -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/op/WindowApply.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package dbis.piglet.op 18 | 19 | import dbis.piglet.schema.Schema 20 | 21 | /** 22 | * WindowApply is used to transform Windows back to a single continuous Stream. 23 | * 24 | * @param out the name of the output pipe. 25 | * @param in the name of the input pipe. 26 | * @param fname the name of the function which will be applied to the input window operator. 27 | */ 28 | case class WindowApply( 29 | private val out: Pipe, 30 | private val in: Pipe, 31 | fname: String 32 | ) extends PigOperator(out, in) { 33 | 34 | override def constructSchema: Option[Schema] = { 35 | schema 36 | } 37 | override def lineageString: String = { 38 | s"""WINDOWAPPLY%$fname%""" + super.lineageString 39 | } 40 | 41 | override def toString = 42 | s"""WINDOWAPPLY 43 | | out = $outPipeName 44 | | in = $inPipeName 45 | | fname = $fname 46 | """.stripMargin 47 | 48 | } 49 | -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/op/cmd/HdfsCmd.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package dbis.piglet.op.cmd 19 | 20 | import dbis.piglet.op.PigOperator 21 | import dbis.piglet.tools.HDFSService 22 | import dbis.piglet.tools.HdfsCommand 23 | 24 | 25 | /** 26 | * HdfsCmd represents a pseudo operator for HDFS commands. 27 | */ 28 | case class HdfsCmd(cmd: HdfsCommand.HdfsCommand, params: List[String]) extends PigOperator(List(), List()) 29 | { 30 | 31 | // if (!isValid) 32 | // throw new java.lang.IllegalArgumentException("unknown fs command '" + cmd + "'") 33 | 34 | 35 | override def outPipeNames: List[String] = List() 36 | 37 | // def isValid = HdfsCommand.values.map{v => v.toString().toLowerCase()}.exists { s => s.equalsIgnoreCase(cmd) } 38 | 39 | def paramString(): String = params.map(p => s""""$p"""").mkString(",") 40 | 41 | override def toString = 42 | s"""HDFS COMMAND 43 | | cmd = $cmd 44 | | params = ${params.mkString(",")}""".stripMargin 45 | 46 | } 47 | 48 | -------------------------------------------------------------------------------- /ceplib/src/main/scala/dbis/piglet/cep/flink/DataStreamMatcher.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.cep.flink 2 | 3 | import scala.reflect.ClassTag 4 | import dbis.piglet.cep.nfa.NFAController 5 | import dbis.piglet.cep.engines._ 6 | import dbis.piglet.cep.ops.SelectionStrategy._ 7 | import dbis.piglet.cep.ops.OutputStrategy._ 8 | import dbis.piglet.backends.{SchemaClass => Event} 9 | import org.apache.flink.api.common.typeinfo.TypeInformation 10 | import org.apache.flink.streaming.api.windowing.windows.GlobalWindow 11 | import org.apache.flink.streaming.api.windowing.assigners.GlobalWindows 12 | import dbis.piglet.cep.ops.MatchCollector 13 | import dbis.piglet.cep.ops.SelectionStrategy 14 | //import org.apache.flink.api.java.operators.CustomUnaryOperation 15 | import scala.collection.mutable.ListBuffer 16 | //import org.apache.flink.api.java.DataSet 17 | //import org.apache.flink.api.java.ExecutionEnvironment 18 | import scala.collection.JavaConversions._ 19 | import org.apache.flink.streaming.api.scala._ 20 | import dbis.piglet.cep.ops.EngineConf 21 | import org.apache.flink.util.Collector 22 | 23 | 24 | class DataStreamMatcher[T <: Event: ClassTag: TypeInformation](@transient val input: DataStream[T], nfa: NFAController[T], flinkEnv: StreamExecutionEnvironment, sstr: SelectionStrategy = SelectionStrategy.FirstMatch, out: OutputStrategy = Combined) extends EngineConf[T](nfa, sstr) with java.io.Serializable { 25 | object DataStreamProcess { 26 | def customRun(gw: GlobalWindow, ts: Iterable[T], out: Collector[T]) = { 27 | ts.foreach { event => engine.runEngine(event)} 28 | val result = collector.convertEventsToArray() 29 | result.foreach { res => out.collect(res) } 30 | } 31 | } 32 | def compute(): DataStream[T] = { 33 | input.windowAll(GlobalWindows.create()).apply(DataStreamProcess.customRun _) 34 | } 35 | 36 | } -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/codegen/spark/PartitionerEmitter.scala: 
-------------------------------------------------------------------------------- 1 | package dbis.piglet.codegen.spark 2 | 3 | import dbis.piglet.codegen.{CodeEmitter, CodeGenContext} 4 | import dbis.piglet.op.{Partition, PartitionMethod} 5 | 6 | class PartitionerEmitter extends CodeEmitter[Partition] { 7 | 8 | private val partitionerTemplate = "new ()" 9 | private val spatialPartitionerTemplate = "new (_helper,)" 10 | 11 | override def template = """val = { 12 | | val _helper = 13 | | _helper.partitionBy().map{case (_,v)=> 14 | | 15 | | PerfMonitor.sampleSize(v,"", accum, randFactor) 16 | | 17 | | 18 | | v 19 | | \} 20 | |\}""".stripMargin 21 | 22 | override def code(ctx: CodeGenContext, op: Partition): String = { 23 | 24 | 25 | val partitioner = { 26 | 27 | val (template, methodClass) = op.method match { 28 | case PartitionMethod.GRID => (spatialPartitionerTemplate, "SpatialGridPartitioner") 29 | case PartitionMethod.BSP => (spatialPartitionerTemplate, "BSPartitioner") 30 | case PartitionMethod.Hash => (partitionerTemplate, "org.apache.spark.HashPartitioner") 31 | } 32 | 33 | CodeEmitter.render(template, Map( 34 | "method" -> methodClass, 35 | "params" -> op.params.mkString(","))) 36 | 37 | } 38 | 39 | render(Map( 40 | "out" -> op.outPipeName, 41 | "in" -> op.inPipeName, 42 | "partitioner" -> partitioner, 43 | "keyby" -> SpatialEmitterHelper.keyByCode(op.inputSchema, op.field, ctx), 44 | "lineage" -> op.lineageSignature 45 | )) 46 | 47 | } 48 | } 49 | 50 | object PartitionerEmitter { 51 | lazy val instance = new PartitionerEmitter 52 | } -------------------------------------------------------------------------------- /src/test/scala/dbis/piglet/tools/RingBufferSpec.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.tools 2 | 3 | import org.scalatest.{FlatSpec, Matchers} 4 | 5 | /** 6 | * Created by hage on 31.05.17. 
7 | */ 8 | class RingBufferSpec extends FlatSpec with Matchers { 9 | 10 | "A RingBuffer" should "return the correct length for empty buffer" in { 11 | val b = new RingBuffer[Int](3) 12 | 13 | b.length shouldBe 0 14 | } 15 | 16 | it should "return the correct length for single element in buffer" in { 17 | val b = RingBuffer(5) 18 | 19 | b.length shouldBe 1 20 | } 21 | 22 | it should "return the correct length for more elements than capacity" in { 23 | val b = new RingBuffer[Int](3) 24 | 25 | Array(1,2,3,4,5,6,7,8).foreach(b.put) 26 | 27 | b.length shouldBe 3 28 | } 29 | 30 | it should "accept fewer elements than capacity" in { 31 | val b = new RingBuffer[Int](3) 32 | 33 | b.put(2) 34 | b.put(1) 35 | 36 | b should contain theSameElementsAs List(1, 2) 37 | } 38 | 39 | it should "accept the same number of elements as capacity" in { 40 | val b = new RingBuffer[Int](3) 41 | 42 | b.put(2) 43 | b.put(1) 44 | b.put(3) 45 | 46 | b should contain theSameElementsAs List(3, 1, 2) 47 | } 48 | 49 | it should "remove the oldest entry when inserting the (cap + 1)st element" in { 50 | val b = RingBuffer(1,2,3) 51 | 52 | b.put(4) 53 | 54 | b should contain theSameElementsAs List(4,2,3) 55 | } 56 | 57 | it should "accept more elements than capacity" in { 58 | 59 | val b = new RingBuffer[Int](3) 60 | 61 | Array(1,2,3,4,5,6,7,8).foreach(b.put) 62 | 63 | b should contain theSameElementsAs List(6,7,8) 64 | } 65 | 66 | it should "create a buffer with apply method" in { 67 | val b = RingBuffer(1,2,3,4,5,6,7,8) 68 | b should contain theSameElementsAs List(1,2,3,4,5,6,7,8) 69 | } 70 | } 71 | --------------------------------------------------------------------------------
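The RingBuffer class under test is not part of this dump; a minimal sketch that would satisfy the behaviour exercised by the spec above (a reconstruction for illustration, not the project's actual implementation) could look like this:

import scala.reflect.ClassTag

// Fixed-capacity buffer that overwrites its oldest slot once full.
class RingBuffer[T: ClassTag](capacity: Int) extends Iterable[T] {
  private val buf = new Array[T](capacity)
  private var pos = 0     // next slot to (over)write
  private var filled = 0  // number of valid entries, at most `capacity`

  def put(elem: T): Unit = {
    buf(pos) = elem                 // overwrites the oldest entry when full
    pos = (pos + 1) % capacity
    if (filled < capacity) filled += 1
  }

  def length: Int = filled

  // iterate over the valid slots; the spec only compares element sets,
  // so slot order (rather than insertion order) is sufficient here
  override def iterator: Iterator[T] = buf.iterator.take(filled)
}

object RingBuffer {
  // RingBuffer(1,2,3) creates a buffer whose capacity is the argument count
  def apply[T: ClassTag](elems: T*): RingBuffer[T] = {
    val b = new RingBuffer[T](elems.size)
    elems.foreach(b.put)
    b
  }
}

--------------------------------------------------------------------------------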
/common/src/main/scala/dbis/piglet/backends/CppConfig.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package dbis.piglet.backends 19 | import scala.collection.immutable.List 20 | 21 | 22 | /** 23 | * Defines the interface to the C++ compiler 24 | */ 25 | trait CppConfig { 26 | 27 | /** 28 | * Get the C++ compiler to use, e.g. g++, clang++, .... 29 | */ 30 | def getCompiler: String 31 | 32 | /** 33 | * Get the libraries which are used during compiling. The compiler has to link against these 34 | * libraries; otherwise, linking errors will occur. 35 | */ 36 | def getLibraries: List[String] 37 | 38 | /** 39 | * Get options for compiling the code, such as the optimization level, enabling some 40 | * macros, etc. 41 | */ 42 | def getOptions: List[String] 43 | /** 44 | * Get directories for libraries which are essential during linking 45 | */ 46 | def getLibDirs: List[String] 47 | /** 48 | * Get include directories for finding the header files. 49 | */ 50 | def getIncludeDirs: List[String] 51 | } 52 | -------------------------------------------------------------------------------- /materialization_scripts/taxi_high_tip_block.pig: -------------------------------------------------------------------------------- 1 | <% 2 | 3 | def dateToMonth(date: String): Int = { 4 | val formatter = java.time.format.DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss") 5 | java.time.LocalDate.parse(date,formatter).getMonthValue() 6 | } 7 | %> 8 | 9 | raw = load '$taxi' using PigStorage(',',skipEmpty=true) as 10 | (vendor_id:chararray,pickup_datetime:chararray,dropoff_datetime:chararray,passenger_count:chararray, 11 | trip_distance:chararray, pickup_longitude:chararray,pickup_latitude:chararray,rate_code:chararray, 12 | store_and_fwd_flag:chararray,dropoff_longitude:chararray,dropoff_latitude:chararray,payment_type:chararray, 13 | fare_amount:chararray,surcharge:chararray,mta_tax:chararray,tip_amount:chararray,tolls_amount:chararray,total_amount:chararray); 14 | 15 | noHeader = filter raw by not STARTSWITH(LOWER(vendor_id),"vendor") and dropoff_longitude != "" and dropoff_latitude != "" 16 | and total_amount != "" and tip_amount != ""; 17 | 18 | month_total = FOREACH noHeader GENERATE geometry("POINT("+ dropoff_latitude +" "+ dropoff_longitude +")") as dropoffloc, 19 | (double)total_amount as total, (double)tip_amount as tip; 20 | 21 | allBlocks = load '$blocks' using PigStorage(';') as (blockid: int, wkt: chararray); 22 | blocks = FOREACH allBlocks GENERATE geometry(wkt) as blockbounds, blockid; 23 | 24 | dropoff = SPATIAL_JOIN month_total, blocks ON CONTAINEDBY using index RTree(order=10); 25 | dropoff_block = FOREACH dropoff GENERATE blockid, total, tip; 26 | 27 | 28 | grp = GROUP dropoff_block by blockid; 29 | avgs = FOREACH grp GENERATE group as blockid, AVG(dropoff_block.tip) * 100 / AVG(dropoff_block.total) as p:double ; 30 | 31 | hightip = FILTER avgs BY p >= 20; 32 | 33 | sorted = ORDER hightip BY p DESC; 34 | 35 | DUMP sorted; 36 | -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/op/IndexOp.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.op 2 | 3 | import dbis.piglet.expr.{NamedField, Ref, RefExpr} 4 | import dbis.piglet.schema._ 5 | 6 | object IndexMethod extends Enumeration { 7 | type IndexMethod = Value 8 | val RTREE = Value 9 | 10 | def methodName(method: IndexMethod.IndexMethod): String = method match { 11 | case RTREE => "RTree" 12 | case _ => throw new IllegalArgumentException(s"unknown index method: $method") 13 | } 14 | } 15 | 16 | import dbis.piglet.op.IndexMethod.IndexMethod 17 | 18 | case class IndexOp( 19 | private val out: Pipe, 20 | private val in: Pipe, 21 | field: Ref, 22 | method: IndexMethod, 23 | params: Seq[String] 24 | ) extends PigOperator(out, in) { 25 | 26 | 27 | override def constructSchema = { 28 | val inSchema = inputs.head.producer.schema 29 | 30 | val inputType = inSchema match { 31 | case Some(s) => s.element.valueType 32 | case None => TupleType(Array(Field("", Types.ByteArrayType))) 33 | } 34 | 35 | val keyField = field match { 36 | case nf:NamedField => 37 | Field(nf.name, RefExpr(field).resultType(inSchema), nf.lineage) 38 | case _ => 39 | Field("", RefExpr(field).resultType(inSchema)) 40 | } 41 | 42 | val nested = Field(in.name, inputType) 43 | val fields = Array(keyField, nested) 44 | 45 | val iBag = BagType(IndexType(TupleType(fields), IndexMethod.methodName(method))) 46 | 47 |
schema = Some(Schema(iBag)) 48 | schema 49 | } 50 | 51 | 52 | override def lineageString = 53 | s"""INDEX%$method%$field%${params.mkString}"""+super.lineageString 54 | 55 | override def toString = 56 | s"""INDEX 57 | | out = $outPipeName 58 | | in = $inPipeName 59 | | field = $field 60 | | index method = $method 61 | | params = ${params.mkString(",")} 62 | """.stripMargin 63 | 64 | } -------------------------------------------------------------------------------- /sparklib/src/main/scala/dbis/piglet/backends/spark/FileStreamReader.scala: -------------------------------------------------------------------------------- 1 | 2 | package dbis.piglet.backends.spark 3 | import org.apache.spark.storage.StorageLevel 4 | import org.apache.spark.streaming.receiver.Receiver 5 | import scala.io.Source 6 | import java.io.{ FileNotFoundException, IOException } 7 | import org.apache.spark.streaming.scheduler._ 8 | import org.apache.spark.streaming.StreamingContext 9 | 10 | class FileStreamReader(file: String, @transient val ssc: StreamingContext) extends Receiver[String](StorageLevel.MEMORY_AND_DISK_2) { 11 | 12 | def onStart() { 13 | // Start the thread that reads data from a file 14 | new Thread("FileStreamReader") { 15 | override def run() { receive() } 16 | }.start() 17 | } 18 | 19 | def onStop() { 20 | // There is nothing to do here 21 | } 22 | 23 | /** Create a reader to read data from the file till EOF */ 24 | private def receive() { 25 | try { 26 | for (line <- Source.fromFile(file).getLines()) { 27 | store(line) 28 | //Thread sleep 1000 // for testing 29 | } 30 | //stop("stopped ...") // stop receiver 31 | //ssc.stop() 32 | //SparkStream.ssc.stop(true, true) // stop streaming context gracefully 33 | } catch { 34 | case ex: FileNotFoundException => println(s"Could not find $file file.") 35 | case ex: IOException => println(s"Had an IOException during reading $file file") 36 | } finally { 37 | stop("Stopped Receiver") 38 | ssc.stop(true, true) 39 | SparkStream.ssc.stop(true, true) 40 | //sys.exit() 41 | 42 | 43 | } 44 | } 45 | } 46 | class FileReader(ssc: StreamingContext) { 47 | def readFile(file: String) = ssc.receiverStream(new FileStreamReader(file, ssc)) 48 | } 49 | object FileStreamReader { 50 | implicit def customFileStreamReader(ssc: StreamingContext) = 51 | new FileReader(ssc) 52 | } 53 | 54 | -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/codegen/spark/DelayEmitter.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.codegen.spark 2 | 3 | import dbis.piglet.codegen.{CodeEmitter, CodeGenContext} 4 | import dbis.piglet.op.Delay 5 | 6 | class DelayEmitter extends CodeEmitter[Delay] { 7 | override def template: String = 8 | """val = .mapPartitions({ iter => 9 | | Thread.sleep() 10 | | 11 | |},true)""".stripMargin 12 | 13 | 14 | lazy val processorFilterTemplate = s"""iter.filter{ t => 15 | | 16 | | val decision = scala.util.Random.nextInt() == 0 17 | | 18 | | if(decision) 19 | | PerfMonitor.sampleSize(t,"", accum, randFactor) 20 | | 21 | | decision 22 | |}""".stripMargin 23 | 24 | lazy val processorDuplTemplate = s"""iter.flatMap{ t => 25 | | (0 until ).iterator.map{_ => 26 | | 27 | | PerfMonitor.sampleSize(t,"", accum, randFactor) 28 | | 29 | | t 30 | | } 31 | |}""".stripMargin 32 | 33 | 34 | override def code(ctx: CodeGenContext, op: Delay): String = { 35 | 36 | val processorParams = Map( 37 | "sampleFactor" -> math.abs(op.sampleFactor), // always use positive value 38 | 
"lineage" -> op.lineageSignature 39 | ) 40 | 41 | // if sampleFactor is negative, use a filter to reduce tuples, otherwise duplicate them 42 | val processorCode = if(op.sampleFactor < 0 ) 43 | CodeEmitter.render(processorFilterTemplate, processorParams) 44 | else CodeEmitter.render(processorDuplTemplate, processorParams) 45 | 46 | val m = Map("out" -> op.outPipeName, 47 | "in" -> op.inPipeName, 48 | "wtime" -> op.wtime.toMillis, 49 | "processor" -> processorCode 50 | ) 51 | 52 | render(m) 53 | } 54 | } 55 | 56 | object DelayEmitter { 57 | lazy val instance = new DelayEmitter 58 | } -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/api/PigletInterpreterAPI.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.api 2 | 3 | import dbis.piglet.plan.rewriting.Rewriter._ 4 | import dbis.piglet.plan.DataflowPlan 5 | import dbis.piglet.parser.PigParser 6 | import dbis.piglet.backends.BackendManager 7 | import dbis.piglet.tools.Conf 8 | import dbis.piglet.codegen.CodeGenerator 9 | import dbis.piglet.tools.logging.PigletLogging 10 | import dbis.piglet.schema.Schema 11 | 12 | object PigletInterpreterAPI extends PigletLogging { 13 | 14 | /** 15 | * Create Scala code for the given backend from the source string. 16 | * This method is provided mainly for Zeppelin. 17 | * 18 | * @param source the Piglet script 19 | * @param backend the backend used to compile and execute 20 | * @return the generated Scala code 21 | */ 22 | def createCodeFromInput(source: String, backend: String): String = { 23 | import scala.collection.JavaConverters._ 24 | 25 | Schema.init() 26 | var plan = new DataflowPlan(PigParser.parseScript(source)) 27 | 28 | if (!plan.checkConnectivity) { 29 | logger.error(s"dataflow plan not connected") 30 | return "" 31 | } 32 | 33 | logger.debug(s"successfully created dataflow plan") 34 | plan = rewritePlan(plan) 35 | 36 | // compile it into Scala code for Spark 37 | val generatorClass = Conf.backendGenerator(backend) 38 | val extension = Conf.backendExtension(backend) 39 | val backendConf = BackendManager.init(backend) 40 | // BackendManager.backend = backendConf 41 | val templateFile = backendConf.templateFile 42 | val args = Array(templateFile).asInstanceOf[Array[AnyRef]] 43 | val compiler = Class.forName(generatorClass).getConstructors()(0).newInstance(args: _*).asInstanceOf[CodeGenerator] 44 | 45 | // 5. generate the Scala code 46 | val code = compiler.generate("blubs", plan, profiling = None, forREPL = true) 47 | logger.debug("successfully generated scala program") 48 | code 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/op/Delay.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package dbis.piglet.op 18 | 19 | import scala.concurrent.duration.FiniteDuration 20 | 21 | /** 22 | * Delay represents the DELAY operator of Pig. 23 | * 24 | * @param out the output pipe (relation). 25 | * @param in the input pipe. 26 | * @param sampleFactor the percentage of input tuples that is passed to the output pipe 27 | * @param wtime the time for delaying the processing 28 | * 29 | */ 30 | case class Delay( 31 | private val out: Pipe, 32 | private val in: Pipe, 33 | sampleFactor: Int, 34 | wtime: FiniteDuration 35 | ) extends PigOperator(out, in) { 36 | 37 | private val r = 0 //System.currentTimeMillis() 38 | 39 | override def lineageString: String = { 40 | s"""DELAY%$sampleFactor%$wtime%$r%""" + super.lineageString 41 | } 42 | 43 | override def toString = 44 | s"""DELAY 45 | | out = $outPipeName 46 | | in = $inPipeName 47 | | sample factor = $sampleFactor 48 | | waiting time = ${wtime._1} - ${wtime._2}""".stripMargin 49 | 50 | 51 | 52 | } 53 | -------------------------------------------------------------------------------- /src/main/scala/dbis/piglet/op/SplitInto.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package dbis.piglet.op 18 | 19 | import dbis.piglet.expr.Predicate 20 | 21 | case class SplitBranch(output: Pipe, expr: Predicate) { 22 | protected[op] def lineageSignature = s"""SPLITBRANCH($expr)""" 23 | 24 | override def toString = s"${output.name} $expr" 25 | } 26 | 27 | /** 28 | * SplitInto represents the SPLIT INTO operator of Pig. 
29 | * 30 | * @param splits a list of split branches (output pipe + condition) 31 | */ 32 | case class SplitInto(private val in: Pipe, splits: List[SplitBranch]) extends PigOperator(splits.map(s => s.output), List(in)) { 33 | 34 | // override def initialOutPipeNames: List[String] = splits.map{ branch => branch.output.name } 35 | 36 | override def lineageString: String = { 37 | s"""SPLIT%${splits.map(_.lineageSignature).mkString("%")}%""" + super.lineageString 38 | } 39 | 40 | override def toString = 41 | s"""SPLITINTO 42 | | out = ${outPipeNames.mkString(",")} 43 | | in = $inPipeName 44 | | inSchema = $inputSchema 45 | | splits = ${splits.mkString(",")} 46 | """.stripMargin 47 | } 48 | -------------------------------------------------------------------------------- /ceplib/src/main/scala/dbis/piglet/cep/engines/CEPEngine.scala: -------------------------------------------------------------------------------- 1 | package dbis.piglet.cep.engines 2 | 3 | import dbis.piglet.backends.{SchemaClass => Event} 4 | import scala.reflect.ClassTag 5 | import scala.collection.mutable.ListBuffer 6 | import dbis.piglet.cep.nfa.NFAStructure 7 | import dbis.piglet.cep.nfa.NFAController 8 | import dbis.piglet.cep.nfa.NormalState 9 | import scala.collection.mutable.Map 10 | import dbis.piglet.cep.ops.MatchCollector 11 | 12 | abstract class CEPEngine[T <: Event: ClassTag](nfaController: NFAController[T], collector: MatchCollector[T]) extends Serializable { 13 | val structureID = { var sid: Long = 0; () => { sid += 1; sid } } 14 | var runningStructursPool: Map[Long, NFAStructure[T]] = Map() 15 | var wantToDeletedStructurs: ListBuffer[Long] = new ListBuffer() 16 | def createNewStructue(event: T): Unit = { 17 | val start = nfaController.getStartState 18 | start.edges.foreach { e => 19 | if (e.evaluate(event)) { 20 | val newStr = new NFAStructure[T](nfaController) 21 | newStr.addEvent(event, e) 22 | runningStructursPool += (structureID() -> newStr) 23 | } 24 | } 25 | } 26 | def runGCStructures(): Unit = { 27 | if(runningStructursPool.size > 0) { 28 | runningStructursPool --= wantToDeletedStructurs 29 | //wantToDeletedStructurs.foreach { x => runningStructursPool -= x } 30 | wantToDeletedStructurs.clear() 31 | } 32 | } 33 | 34 | def checkPredicate(event: T, currenStr: NFAStructure[T]): Int = { 35 | var result: Int = -1 36 | if (currenStr.getCurrentState.isInstanceOf[NormalState[T]]) { 37 | val currentState = currenStr.getCurrentState.asInstanceOf[NormalState[T]] 38 | currentState.edges.zipWithIndex.foreach { 39 | case (e, i) => 40 | if (e.evaluate(event, currenStr)) { 41 | result = i 42 | } 43 | } 44 | } 45 | result 46 | } 47 | def runEngine(event: T): Unit 48 | //def printNumMatches(): Unit 49 | } --------------------------------------------------------------------------------
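Taken together, the concrete engines above are selected through the SelectionStrategy dispatch shown in RDDMatcher and DataStreamMatcher earlier in this dump. A minimal sketch of wiring an event RDD through the Spark matcher (building the NFAController, i.e. the actual pattern, is application-specific and left abstract here):

import scala.reflect.ClassTag
import org.apache.spark.rdd.RDD
import dbis.piglet.backends.{SchemaClass => Event}
import dbis.piglet.cep.nfa.NFAController
import dbis.piglet.cep.ops.{OutputStrategy, SelectionStrategy}
import dbis.piglet.cep.spark.RDDMatcher

object CepSketch {
  // Wrap an event RDD in the matcher: the result is again an RDD whose
  // partitions are computed by feeding every event into the chosen engine
  // (here FirstMatch) and collecting the matched sequences.
  def matchFirst[T <: Event: ClassTag](events: RDD[T], nfa: NFAController[T]): RDD[T] =
    new RDDMatcher[T](events, nfa, SelectionStrategy.FirstMatch, OutputStrategy.Combined)
}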