├── scalding-thrift-macros ├── NOTICE └── src │ └── main │ └── scala │ └── com │ └── twitter │ └── scalding │ └── thrift │ └── macros │ ├── RequiredBinaryComparators.scala │ └── Macros.scala ├── project ├── build.properties ├── travis-log4j.properties ├── scalding-dagon.scala └── plugins.sbt ├── tutorial ├── data │ ├── graph2.tsv │ ├── hello.txt │ ├── words.txt │ ├── graph.tsv │ ├── word_scores.tsv │ ├── helloDoc.txt │ ├── phones.txt │ ├── docBOW.tsv │ └── session.json ├── CodeSnippets.md ├── .scalding_repl ├── JsonTutorial0.scala ├── AvroTutorial0.scala ├── MatrixTutorial1.scala ├── MatrixTutorial0.scala ├── MatrixTutorial4.scala ├── MatrixTutorial3.scala ├── MatrixTutorial6.scala ├── MatrixTutorial5.scala ├── MatrixTutorial2.scala ├── Tutorial1.scala └── ReplTutorial1.scala ├── logo └── scalding.png ├── scalding-core ├── src │ ├── test │ │ ├── resources │ │ │ └── com │ │ │ │ └── twitter │ │ │ │ └── scalding │ │ │ │ └── test_filesystem │ │ │ │ └── test_data │ │ │ │ └── 2013 │ │ │ │ ├── 04 │ │ │ │ ├── _SUCCESS │ │ │ │ └── 2013-04.txt │ │ │ │ ├── 05 │ │ │ │ ├── _SUCCESS │ │ │ │ └── _ignored │ │ │ │ ├── 06 │ │ │ │ └── _SUCCESS │ │ │ │ ├── 08 │ │ │ │ ├── _SUCCESS │ │ │ │ └── 2013-08.txt │ │ │ │ ├── 03 │ │ │ │ └── 2013-03.txt │ │ │ │ └── 07 │ │ │ │ ├── 2013-07.txt │ │ │ │ └── _SUCCESS │ │ │ │ └── _ignored │ │ └── scala │ │ │ └── com │ │ │ └── twitter │ │ │ └── scalding │ │ │ ├── typed │ │ │ └── InAnotherPackage.scala │ │ │ ├── FlowStateMapTest.scala │ │ │ ├── RegressionTests.scala │ │ │ ├── TypedPipeCheckerTest.scala │ │ │ ├── source │ │ │ └── TypedTextTest.scala │ │ │ ├── estimation │ │ │ └── memory │ │ │ │ └── MemoryEstimatorStepStrategyTest.scala │ │ │ ├── IterableExecutionSerializationTest.scala │ │ │ ├── TypedSketchJoinJobForEmptyKeysTest.scala │ │ │ ├── TestTapFactoryTest.scala │ │ │ ├── TimePathedSourceTest.scala │ │ │ ├── ExecutionUtilTest.scala │ │ │ ├── PathFilterTest.scala │ │ │ └── ScanLeftTest.scala │ └── main │ │ ├── scala │ │ └── com │ │ │ └── twitter │ │ │ └── scalding │ │ │ ├── reducer_estimation │ │ │ ├── package.scala │ │ │ ├── ReducerHistoryEstimator.scala │ │ │ └── ReducerEstimatorConfig.scala │ │ │ ├── serialization │ │ │ ├── RequiredBinaryComparatorsConfig.scala │ │ │ ├── SerializeAsUnit.scala.scala │ │ │ ├── MultiJoinExternalizer.scala │ │ │ ├── Externalizer.scala │ │ │ └── RequiredBinaryComparators.scala │ │ │ ├── typed │ │ │ ├── cascading_backend │ │ │ │ └── DistinctCoGroupJoiner.scala │ │ │ ├── BijectedSourceSink.scala │ │ │ └── MemorySink.scala │ │ │ ├── source │ │ │ ├── NullSink.scala │ │ │ ├── CheckedInversion.scala │ │ │ └── MaxFailuresCheck.scala │ │ │ ├── mathematics │ │ │ ├── Poisson.scala │ │ │ └── Histogram.scala │ │ │ ├── TupleConversions.scala │ │ │ ├── Sortable.scala │ │ │ ├── CascadeJob.scala │ │ │ ├── OptionalSource.scala │ │ │ ├── TypedPipeChecker.scala │ │ │ ├── BijectedOrderedSerialization.scala │ │ │ ├── estimation │ │ │ ├── Common.scala │ │ │ └── HistoryService.scala │ │ │ ├── macros │ │ │ ├── MacroImplicits.scala │ │ │ └── impl │ │ │ │ └── CaseClassFieldSetter.scala │ │ │ ├── Dsl.scala │ │ │ └── TupleArity.scala │ │ └── java │ │ └── com │ │ └── twitter │ │ └── scalding │ │ └── tap │ │ └── GlobHfs.java └── codegen │ ├── function_implicits_generator.rb │ ├── mappable_generator.rb │ ├── typed_source_generator.rb │ └── tuple_adder_generator.rb ├── scalding-parquet ├── src │ ├── test │ │ └── resources │ │ │ └── names.txt │ └── main │ │ ├── scala │ │ └── com │ │ │ └── twitter │ │ │ └── scalding │ │ │ └── parquet │ │ │ └── HasFilterPredicate.scala │ │ 
└── java │ │ └── com │ │ └── twitter │ │ └── scalding │ │ └── parquet │ │ └── tuple │ │ ├── TupleRecordMaterializer.java │ │ └── SchemaIntersection.java └── README.md ├── scalding-commons ├── src │ ├── test │ │ ├── resources │ │ │ └── com │ │ │ │ └── twitter │ │ │ │ └── scalding │ │ │ │ └── test_filesystem │ │ │ │ └── test_data │ │ │ │ └── 2013 │ │ │ │ ├── 10 │ │ │ │ ├── _SUCCESS │ │ │ │ ├── part-00000 │ │ │ │ ├── part-00001 │ │ │ │ └── part-00002 │ │ │ │ └── 09 │ │ │ │ ├── _SUCCESS │ │ │ │ ├── part-00000 │ │ │ │ └── part-00001 │ │ ├── scala │ │ │ └── com │ │ │ │ └── twitter │ │ │ │ └── scalding │ │ │ │ ├── commons │ │ │ │ └── source │ │ │ │ │ ├── typedtext │ │ │ │ │ └── TypedTextTest.scala │ │ │ │ │ └── LzoGenericSourceSpec.scala │ │ │ │ └── WordCountTest.scala │ │ └── java │ │ │ └── com │ │ │ └── twitter │ │ │ └── scalding │ │ │ └── commons │ │ │ └── datastores │ │ │ ├── FSTestCase.java │ │ │ └── TestUtils.java │ └── main │ │ ├── scala │ │ └── com │ │ │ └── twitter │ │ │ └── scalding │ │ │ ├── examples │ │ │ ├── WordCountJob.scala │ │ │ └── MergeTest.scala │ │ │ └── commons │ │ │ ├── scheme │ │ │ └── CombinedSequenceFileScheme.scala │ │ │ └── source │ │ │ ├── FixedPathSources.scala │ │ │ ├── LzoCodecSource.scala │ │ │ └── LzoGenericSource.scala │ │ └── java │ │ └── com │ │ └── twitter │ │ └── scalding │ │ └── commons │ │ └── datastores │ │ └── Utils.java └── codegen │ └── lzotypedtsv_generator.rb ├── scalding-parquet-scrooge ├── src │ ├── test │ │ └── resources │ │ │ └── names.txt │ └── main │ │ ├── scala │ │ └── com │ │ │ └── twitter │ │ │ └── scalding │ │ │ └── parquet │ │ │ └── scrooge │ │ │ └── ParquetScrooge.scala │ │ └── java │ │ └── com │ │ └── twitter │ │ └── scalding │ │ └── parquet │ │ └── scrooge │ │ ├── ParquetScroogeInputFormat.java │ │ ├── ScroogeSchemaConversionException.java │ │ └── ParquetScroogeOutputFormat.java └── README.md ├── docs └── src │ └── main │ ├── tut │ ├── cookbook │ │ ├── cookbook.md │ │ └── hbase.md │ └── cookbook.md │ └── resources │ └── microsite │ └── img │ ├── favicon.png │ ├── navbar_brand.png │ ├── sidebar_brand.png │ ├── navbar_brand2x.png │ └── sidebar_brand2x.png ├── maple └── src │ └── main │ └── java │ └── com │ └── twitter │ └── maple │ └── tap │ ├── TupleWrapper.java │ └── StdoutTap.java ├── scalding-dagon └── src │ ├── main │ ├── scala │ │ └── com │ │ │ └── twitter │ │ │ └── scalding │ │ │ └── dagon │ │ │ ├── package.scala │ │ │ ├── PartialRule.scala │ │ │ ├── FunctionK.scala │ │ │ ├── RefPair.scala │ │ │ ├── Id.scala │ │ │ └── Rule.scala │ ├── scala-2.12- │ │ └── com │ │ │ └── twitter │ │ │ └── scalding │ │ │ └── dagon │ │ │ └── ScalaVersionCompat.scala │ └── scala-2.13+ │ │ └── com │ │ └── twitter │ │ └── scalding │ │ └── dagon │ │ └── ScalaVersionCompat.scala │ └── test │ └── scala │ └── com │ └── twitter │ └── scalding │ └── dagon │ ├── MemoizeTests.scala │ ├── CacheTests.scala │ └── HCacheTests.scala ├── scalding-estimators-test └── src │ └── test │ └── resources │ └── scores.tsv ├── NOTICE ├── scalding-base └── src │ ├── main │ └── scala │ │ └── com │ │ └── twitter │ │ └── scalding │ │ ├── typed │ │ ├── functions │ │ │ ├── ScaldingPriorityQueueMonoid.scala │ │ │ ├── FlatMapping.scala │ │ │ └── EqTypes.scala │ │ ├── OptimizationPhases.scala │ │ ├── memory_backend │ │ │ └── AtomicBox.scala │ │ ├── KeyedPipe.scala │ │ └── WithDescription.scala │ │ ├── StatKey.scala │ │ ├── CPromise.scala │ │ ├── StringUtility.scala │ │ ├── CancellationHandler.scala │ │ ├── UniqueID.scala │ │ └── Mode.scala │ └── test │ └── scala │ └── com │ └── twitter │ 
└── scalding │ └── typed │ ├── TypedPipeMonoidTest.scala │ └── CoGroupableTest.scala ├── scalding-quotation └── src │ ├── test │ └── scala │ │ └── com │ │ └── twitter │ │ └── scalding │ │ └── quotation │ │ ├── package.scala │ │ ├── Person.scala │ │ └── LimitationsTest.scala │ └── main │ └── scala │ └── com │ └── twitter │ └── scalding │ └── quotation │ ├── TreeOps.scala │ ├── Quoted.scala │ └── Liftables.scala ├── scalding-parquet-fixtures └── src │ └── test │ └── resources │ └── test.thrift ├── scripts ├── build_assembly_no_test.sh ├── common.sh ├── test_typed_tutorials.sh ├── test_execution_tutorial.sh ├── test_tutorials.sh ├── README.md ├── test_matrix_tutorials.sh ├── test_repl_tutorial.sh └── testValidator.sh ├── scalding-serialization └── src │ ├── main │ ├── scala │ │ └── com │ │ │ └── twitter │ │ │ └── scalding │ │ │ └── serialization │ │ │ ├── RequireOrderedSerializationMode.scala │ │ │ ├── macros │ │ │ └── impl │ │ │ │ ├── BinaryOrdering.scala │ │ │ │ └── ordered_serialization │ │ │ │ ├── providers │ │ │ │ └── StableKnownDirectSubclasses.scala │ │ │ │ └── runtime_helpers │ │ │ │ ├── MacroEqualityOrderedSerialization.scala │ │ │ │ └── LengthCalculations.scala │ │ │ ├── Laws.scala │ │ │ └── UnsignedComparisons.scala │ └── java │ │ └── com │ │ └── twitter │ │ └── scalding │ │ └── serialization │ │ └── Undeprecated.java │ └── test │ └── scala │ └── com │ └── twitter │ └── scalding │ └── serialization │ ├── macros │ └── ZDifficultTypes.scala │ └── UnsignedComparisonLaws.scala ├── scalding-repl └── src │ └── main │ └── scala │ └── com │ └── twitter │ └── scalding │ └── ILoopCompat.scala ├── scalding-db └── src │ ├── main │ └── scala │ │ └── com │ │ └── twitter │ │ └── scalding │ │ └── db │ │ ├── JdbcStatementSetter.scala │ │ ├── macros │ │ ├── impl │ │ │ ├── handler │ │ │ │ ├── BlobTypeHandler.scala │ │ │ │ ├── DateTypeHandler.scala │ │ │ │ ├── ColumnFormat.scala │ │ │ │ └── NumericTypeHandler.scala │ │ │ ├── DBTypeDescriptorImpl.scala │ │ │ └── JdbcStatementSetterImpl.scala │ │ └── DBMacro.scala │ │ ├── DBTypeDescriptor.scala │ │ ├── package.scala │ │ ├── extensions │ │ └── VerticaExtensions.scala │ │ └── ColumnDefinition.scala │ └── test │ └── scala │ └── com │ └── twitter │ └── scalding │ └── db │ └── DBOptionsTest.scala ├── scalding-hraven ├── README.md └── src │ ├── main │ └── scala │ │ └── com │ │ └── twitter │ │ └── scalding │ │ └── hraven │ │ └── reducer_estimation │ │ └── HRavenBasedEstimator.scala │ └── test │ └── resources │ ├── jobResponse_job_1470171371859_6607542.json │ └── jobResponse_job_1470171371859_6608570.json ├── .travis.blacklist ├── .scalafmt.conf ├── scalding-avro ├── README.md └── src │ └── main │ └── scala │ └── com │ └── twitter │ └── scalding │ └── avro │ └── package.scala ├── codecov.yml ├── COMMITTERS.md ├── scalding-parquet-scrooge-fixtures └── src │ └── test │ └── resources │ └── binary.thrift ├── .gitignore ├── scalding-date └── src │ └── main │ └── scala │ └── com │ └── twitter │ └── scalding │ └── CalendarOps.scala ├── .github └── workflows │ └── publish.yml ├── scalding-hadoop-test └── src │ └── test │ └── scala │ └── com │ └── twitter │ └── scalding │ └── platform │ └── TestJobsWithDescriptions.scala └── scalding-args └── src ├── test └── scala │ └── com │ └── twitter │ └── scalding │ └── RangedArgsSpec.scala └── main └── scala └── com └── twitter └── scalding └── RangedArgs.scala /scalding-thrift-macros/NOTICE: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.5.4 2 | -------------------------------------------------------------------------------- /tutorial/data/graph2.tsv: -------------------------------------------------------------------------------- 1 | 1 2 1 2 | 1 3 1 3 | 2 3 1 4 | -------------------------------------------------------------------------------- /tutorial/data/hello.txt: -------------------------------------------------------------------------------- 1 | Hello world 2 | Goodbye world 3 | -------------------------------------------------------------------------------- /tutorial/data/words.txt: -------------------------------------------------------------------------------- 1 | hello 2 | world 3 | goodbye 4 | -------------------------------------------------------------------------------- /tutorial/data/graph.tsv: -------------------------------------------------------------------------------- 1 | 1 2 1 2 | 1 3 1 3 | 3 2 1 4 | 4 2 2 5 | -------------------------------------------------------------------------------- /tutorial/data/word_scores.tsv: -------------------------------------------------------------------------------- 1 | hello 1.0 2 | world 2.0 3 | goodbye 3.0 -------------------------------------------------------------------------------- /logo/scalding.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/twitter/scalding/HEAD/logo/scalding.png -------------------------------------------------------------------------------- /tutorial/data/helloDoc.txt: -------------------------------------------------------------------------------- 1 | 1 Hello world 2 | 2 See ya soon world 3 | 3 Hello again world 4 | -------------------------------------------------------------------------------- /scalding-core/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/04/_SUCCESS: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scalding-core/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/05/_SUCCESS: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scalding-core/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/05/_ignored: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scalding-core/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/06/_SUCCESS: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scalding-core/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/08/_SUCCESS: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scalding-parquet/src/test/resources/names.txt: -------------------------------------------------------------------------------- 1 | Alice Practice 2 | Bob Hope 3 | Charlie Horse 4 | --------------------------------------------------------------------------------
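A note on the tutorial data above: `graph.tsv` and `graph2.tsv` are tab-separated `(from, to, weight)` edge triples consumed by the matrix tutorials. A minimal sketch of reading one with the typed API (the val name is illustrative; the path is relative to the repo root):

```scala
import com.twitter.scalding._

// Each row of tutorial/data/graph.tsv is an edge: (from, to, weight).
val edges: TypedPipe[(Int, Int, Int)] =
  TypedPipe.from(TypedTsv[(Int, Int, Int)]("tutorial/data/graph.tsv"))
```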
/scalding-commons/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/09/_SUCCESS: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scalding-commons/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/10/_SUCCESS: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scalding-core/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/03/2013-03.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scalding-core/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/04/2013-04.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scalding-core/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/07/2013-07.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scalding-core/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/08/2013-08.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scalding-parquet-scrooge/src/test/resources/names.txt: -------------------------------------------------------------------------------- 1 | Alice Practice 2 | Bob Hope 3 | Charlie Horse 4 | -------------------------------------------------------------------------------- /scalding-core/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/07/_SUCCESS/_ignored: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tutorial/data/phones.txt: -------------------------------------------------------------------------------- 1 | john smith 5551212 30 US 2 | harry bovik 4122680000 55 US 3 | jane doe 4125551212 40 CN 4 | -------------------------------------------------------------------------------- /tutorial/CodeSnippets.md: -------------------------------------------------------------------------------- 1 | Please see the [API reference](https://github.com/twitter/scalding/wiki/API-Reference) on the wiki. 2 | -------------------------------------------------------------------------------- /docs/src/main/tut/cookbook/cookbook.md: -------------------------------------------------------------------------------- 1 | # Cookbook 2 | 3 | In Progress - a cookbook of things you might like to do with Scalding. 
4 | -------------------------------------------------------------------------------- /tutorial/.scalding_repl: -------------------------------------------------------------------------------- 1 | // for use in testing to verify that '.scalding_repl' files are loaded 2 | val scaldingReplInitWasLoaded = true 3 | -------------------------------------------------------------------------------- /docs/src/main/resources/microsite/img/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/twitter/scalding/HEAD/docs/src/main/resources/microsite/img/favicon.png -------------------------------------------------------------------------------- /docs/src/main/resources/microsite/img/navbar_brand.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/twitter/scalding/HEAD/docs/src/main/resources/microsite/img/navbar_brand.png -------------------------------------------------------------------------------- /docs/src/main/resources/microsite/img/sidebar_brand.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/twitter/scalding/HEAD/docs/src/main/resources/microsite/img/sidebar_brand.png -------------------------------------------------------------------------------- /docs/src/main/resources/microsite/img/navbar_brand2x.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/twitter/scalding/HEAD/docs/src/main/resources/microsite/img/navbar_brand2x.png -------------------------------------------------------------------------------- /docs/src/main/resources/microsite/img/sidebar_brand2x.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/twitter/scalding/HEAD/docs/src/main/resources/microsite/img/sidebar_brand2x.png -------------------------------------------------------------------------------- /tutorial/data/docBOW.tsv: -------------------------------------------------------------------------------- 1 | 1 hello 2 2 | 1 twitter 1 3 | 2 conversation 1 4 | 2 celebrities 1 5 | 2 twitter 1 6 | 3 elections 1 7 | 3 debate 1 8 | 3 twitter 1 9 | 3 political 1 10 | -------------------------------------------------------------------------------- /maple/src/main/java/com/twitter/maple/tap/TupleWrapper.java: -------------------------------------------------------------------------------- 1 | package com.twitter.maple.tap; 2 | 3 | import cascading.tuple.Tuple; 4 | 5 | public class TupleWrapper { 6 | public Tuple tuple; 7 | } 8 | -------------------------------------------------------------------------------- /scalding-parquet-scrooge/README.md: -------------------------------------------------------------------------------- 1 | # Parquet-Scrooge support for Scalding 2 | 3 | This module has sources for reading scrooge-generated thrift structs. See the scalding-parquet module for reading apache-thrift (TBase) generated thrift structs. 
4 | -------------------------------------------------------------------------------- /scalding-commons/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/09/part-00000: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/twitter/scalding/HEAD/scalding-commons/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/09/part-00000 -------------------------------------------------------------------------------- /scalding-commons/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/09/part-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/twitter/scalding/HEAD/scalding-commons/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/09/part-00001 -------------------------------------------------------------------------------- /scalding-commons/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/10/part-00000: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/twitter/scalding/HEAD/scalding-commons/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/10/part-00000 -------------------------------------------------------------------------------- /scalding-commons/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/10/part-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/twitter/scalding/HEAD/scalding-commons/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/10/part-00001 -------------------------------------------------------------------------------- /scalding-commons/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/10/part-00002: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/twitter/scalding/HEAD/scalding-commons/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/10/part-00002 -------------------------------------------------------------------------------- /scalding-dagon/src/main/scala/com/twitter/scalding/dagon/package.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding 2 | 3 | /** Collection of graph algorithms */ 4 | package object dagon { 5 | type BoolT[T] = Boolean 6 | type NeighborFn[T] = T => Iterable[T] 7 | } 8 | -------------------------------------------------------------------------------- /scalding-parquet/src/main/scala/com/twitter/scalding/parquet/HasFilterPredicate.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.parquet 2 | 3 | import org.apache.parquet.filter2.predicate.FilterPredicate 4 | 5 | trait HasFilterPredicate { 6 | def withFilter: Option[FilterPredicate] = None 7 | } 8 | -------------------------------------------------------------------------------- /scalding-estimators-test/src/test/resources/scores.tsv: -------------------------------------------------------------------------------- 1 | iphone 0.5 2 | mixtape 0.2 3 | helvetica 0.1 4 | gastropub 0.1 5 | raw 0.05 6 | sustainable 0.01 7 | stumptown 0.75 8 | postironic 0.3 9 | ironic 0.9 10 | pintrest 0.05 11 | selfies 0.2 12 | dreamcatcher 0.65 13 | twitter 0.0 14 | -------------------------------------------------------------------------------- /NOTICE: 
-------------------------------------------------------------------------------- 1 | scalding is a Scala API for Cascading. 2 | Copyright 2012 Twitter, Inc. 3 | 4 | Third Party Dependencies: 5 | 6 | Cascading 2.0 7 | Apache Public License 2.0 8 | http://www.cascading.org 9 | 10 | Hadoop 0.20.2 11 | Apache Public License 2.0 12 | http://hadoop.apache.org 13 | 14 | -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/package.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding 2 | 3 | package object reducer_estimation { 4 | def median(xs: Seq[Double]): Option[Double] = xs.sorted.lift(xs.length / 2) 5 | def mean(xs: Seq[Double]): Option[Double] = if (xs.isEmpty) None else Some(xs.sum / xs.length) 6 | } 7 | -------------------------------------------------------------------------------- /docs/src/main/tut/cookbook.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: docs 3 | title: "Cookbook" 4 | section: "cookbook" 5 | position: 1 6 | --- 7 | 8 | {% include_relative cookbook/cookbook.md %} 9 | 10 | ## Index 11 | 12 | {% for x in site.pages %} 13 | {% if x.section == 'cookbook' %} 14 | - [{{x.title}}]({{site.baseurl}}{{x.url}}) 15 | {% endif %} 16 | {% endfor %} 17 | -------------------------------------------------------------------------------- /tutorial/data/session.json: -------------------------------------------------------------------------------- 1 | {"sessionId": "efdc0698-66ea-4d3a-ac6e-fab9fd97a78b"} 2 | {"sessionId": "efdc0698-66ea-4d3a-ac6e-fab9fd97a78b"} 3 | {"sessionId": "efdc0698-66ea-4d3a-ac6e-fab9fd97a78b"} 4 | {"sessionId": "efdc0698-66ea-4d3a-ac6e-fab9fd97a78b"} 5 | {"sessionId": "efdc0698-66ea-4d3a-ac6e-fab9fd97a78b"} 6 | {"sessionId": "efdc0698-66ea-4d3a-ac6e-fab9fd97a78b"} 7 | -------------------------------------------------------------------------------- /scalding-base/src/main/scala/com/twitter/scalding/typed/functions/ScaldingPriorityQueueMonoid.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.typed.functions 2 | 3 | import com.twitter.algebird.mutable.PriorityQueueMonoid 4 | 5 | class ScaldingPriorityQueueMonoid[K]( 6 | val count: Int 7 | )(implicit val ordering: Ordering[K]) 8 | extends PriorityQueueMonoid[K](count)(ordering) 9 | -------------------------------------------------------------------------------- /scalding-quotation/src/test/scala/com/twitter/scalding/quotation/package.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding 2 | 3 | import org.scalatest.MustMatchers 4 | import org.scalatest.FreeSpec 5 | 6 | package object quotation { 7 | def typeName[T](implicit ct: reflect.ClassTag[T]) = TypeName(ct.runtimeClass.getName) 8 | trait Test extends FreeSpec with MustMatchers 9 | } 10 | -------------------------------------------------------------------------------- /scalding-parquet-fixtures/src/test/resources/test.thrift: -------------------------------------------------------------------------------- 1 | namespace java com.twitter.scalding.parquet.thrift_java.test 2 | #@namespace scala com.twitter.scalding.parquet.thrift_scala.test 3 | 4 | struct Name { 5 | 1: required string first_name, 6 | 2: optional string last_name 7 | } 8 | 9 | struct Address { 10 | 1: string street, 11 | 2: required string zip 12 | } 
13 | -------------------------------------------------------------------------------- /scripts/build_assembly_no_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -exv 2 | 3 | # Identify the bin dir in the distribution, and source the common include script 4 | BASE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/.. && pwd )" 5 | TARGET=$1 6 | 7 | cd $BASE_DIR 8 | sed -i'' -e 's/\/\/ test in assembly/test in assembly/g' build.sbt 9 | 10 | time ./sbt ++$TRAVIS_SCALA_VERSION $TARGET/assembly 11 | -------------------------------------------------------------------------------- /scalding-serialization/src/main/scala/com/twitter/scalding/serialization/RequireOrderedSerializationMode.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.serialization 2 | 3 | sealed trait RequireOrderedSerializationMode extends Serializable 4 | object RequireOrderedSerializationMode { 5 | case object Fail extends RequireOrderedSerializationMode 6 | case object Log extends RequireOrderedSerializationMode 7 | } 8 | -------------------------------------------------------------------------------- /scalding-repl/src/main/scala/com/twitter/scalding/ILoopCompat.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding 2 | 3 | import java.io.BufferedReader 4 | 5 | import scala.tools.nsc.interpreter.ILoop 6 | import scala.tools.nsc.interpreter.JPrintWriter 7 | 8 | class ILoopCompat(in: Option[BufferedReader], out: JPrintWriter) extends ILoop(in, out) { 9 | def addThunk(f: => Unit): Unit = intp.initialize(f) 10 | } 11 | -------------------------------------------------------------------------------- /scalding-dagon/src/main/scala/com/twitter/scalding/dagon/PartialRule.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.dagon 2 | 3 | /** 4 | * Often a partial function is an easier way to express rules 5 | */ 6 | trait PartialRule[N[_]] extends Rule[N] { 7 | final def apply[T](on: Dag[N]): N[T] => Option[N[T]] = 8 | applyWhere[T](on).lift 9 | 10 | def applyWhere[T](on: Dag[N]): PartialFunction[N[T], N[T]] 11 | } 12 | -------------------------------------------------------------------------------- /scalding-db/src/main/scala/com/twitter/scalding/db/JdbcStatementSetter.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.db 2 | 3 | import java.sql.PreparedStatement 4 | import scala.util.Try 5 | 6 | /** 7 | * Case class to JDBC statement setter used for database writes 8 | */ 9 | trait JdbcStatementSetter[T] extends java.io.Serializable { self => 10 | def apply(t: T, s: PreparedStatement): Try[PreparedStatement] 11 | } 12 | -------------------------------------------------------------------------------- /scalding-base/src/main/scala/com/twitter/scalding/typed/OptimizationPhases.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.typed 2 | 3 | import com.twitter.scalding.dagon.Rule 4 | 5 | /** 6 | * This is a class to allow customization of how we plan typed pipes 7 | */ 8 | abstract class OptimizationPhases { 9 | def phases: Seq[Rule[TypedPipe]] 10 | } 11 | 12 | final class EmptyOptimizationPhases extends OptimizationPhases { 13 | def phases = Nil 14 | } 15 | -------------------------------------------------------------------------------- 
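`OptimizationPhases` above is the extension point for customizing how typed pipes are planned. A hedged sketch of a non-empty implementation, assuming the `OptimizationRules.standardMapReduceRules` rule list that ships in scalding-base (the class name is illustrative):

```scala
import com.twitter.scalding.dagon.Rule
import com.twitter.scalding.typed.{OptimizationPhases, OptimizationRules, TypedPipe}

// Reuse the stock rule set as the phase list instead of Nil.
final class StandardPhases extends OptimizationPhases {
  def phases: Seq[Rule[TypedPipe]] = OptimizationRules.standardMapReduceRules
}
```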
/scripts/common.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -exv 2 | 3 | # Not sure what the right default is here: trying nonzero. 4 | SCALA_EXIT_STATUS=127 5 | SAVED_STTY="" 6 | 7 | SCALD="${BASE_DIR}/scripts/scald.rb --local" 8 | SCALD_REPL="${BASE_DIR}/scripts/scald.rb --repl --local" 9 | 10 | echo "using TRAVIS_SCALA_VERSION ${TRAVIS_SCALA_VERSION}" 11 | SCALD="$SCALD --scalaversion ${TRAVIS_SCALA_VERSION}" 12 | SCALD_REPL="$SCALD_REPL --scalaversion ${TRAVIS_SCALA_VERSION}" 13 | -------------------------------------------------------------------------------- /scalding-db/src/test/scala/com/twitter/scalding/db/DBOptionsTest.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.db 2 | 3 | import org.scalacheck.Properties 4 | import org.scalacheck.Prop._ 5 | 6 | object DBOptionsTest extends Properties("DBOptions") { 7 | property("password") = forAll { x: String => 8 | ("Password toString should not be equal to x" |: Password(x).toString != x) && 9 | ("Password toStr should be equal to x" |: Password(x).toStr == x) 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /scalding-core/src/test/scala/com/twitter/scalding/typed/InAnotherPackage.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.example.scalding.typed 2 | 3 | import com.twitter.scalding._ 4 | import scala.concurrent.{ExecutionContext => SExecutionContext, _} 5 | import SExecutionContext.Implicits.global 6 | 7 | object InAnotherPackage { 8 | def buildF: Future[TypedPipe[(Int, Int)]] = 9 | Future { 10 | TypedPipe 11 | .from(List(1, 2, 3, 4, 555, 3)) 12 | .map { case x => (x, x) } 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /scalding-hraven/README.md: -------------------------------------------------------------------------------- 1 | # hRaven Extensions 2 | This module includes additions to Scalding that make use of [hRaven](https://github.com/twitter/hraven) for querying job history. 3 | 4 | ## Reducer Estimation 5 | Reducer estimators can include the `HRavenHistory` trait to get additional functionality for querying hRaven for past jobs. 6 | 7 | For example, `RatioBasedReducerEstimator`, also in this module, uses hRaven job history to better estimate reducers based on the ratio of mapper to reducer input data. 
8 | -------------------------------------------------------------------------------- /project/travis-log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootCategory=DEBUG, console 2 | log4j.threshold=ALL 3 | 4 | log4j.category.cascading=WARN 5 | log4j.category.com.twitter=INFO 6 | log4j.logger.org.apache.hadoop=ERROR 7 | log4j.logger.cascading.flow=WARN 8 | log4j.logger.cascading.tap=WARN 9 | 10 | 11 | log4j.appender.console=org.apache.log4j.ConsoleAppender 12 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 13 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n 14 | -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/serialization/RequiredBinaryComparatorsConfig.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.serialization 2 | 3 | import com.twitter.scalding.{Config, Job} 4 | 5 | trait RequiredBinaryComparatorsConfig extends Job { 6 | def requireOrderedSerializationMode: RequireOrderedSerializationMode = RequireOrderedSerializationMode.Fail 7 | override def config = 8 | super.config + (Config.ScaldingRequireOrderedSerialization -> requireOrderedSerializationMode.toString) 9 | } 10 | -------------------------------------------------------------------------------- /scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/BinaryOrdering.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.serialization.macros.impl 2 | 3 | import com.twitter.scalding.serialization.OrderedSerialization 4 | 5 | import scala.language.experimental.macros 6 | 7 | trait BinaryOrdering { 8 | implicit def ordSer[T]: OrderedSerialization[T] = 9 | macro com.twitter.scalding.serialization.macros.impl.OrderedSerializationProviderImpl[T] 10 | } 11 | 12 | object BinaryOrdering extends BinaryOrdering 13 | -------------------------------------------------------------------------------- /.travis.blacklist: -------------------------------------------------------------------------------- 1 | #This describes extra builds our validator will pretend to run in CI but won't 2 | # Remember we run most builds twice, so if you want it disabled for both 2.10 and 2.11 it needs to be here twice 3 | scalding-benchmarks 4 | scalding-benchmarks 5 | # These are just for fixtures, so blacklist for 2.10 and 2.11 6 | scalding-thrift-macros-fixtures 7 | scalding-thrift-macros-fixtures 8 | scalding-parquet-fixtures 9 | scalding-parquet-fixtures 10 | scalding-parquet-scrooge-fixtures 11 | scalding-parquet-scrooge-fixtures 12 | -------------------------------------------------------------------------------- /tutorial/JsonTutorial0.scala: -------------------------------------------------------------------------------- 1 | /** 2 | Scalding with Json tutorial part 0. 
3 | 4 | To run this job: 5 | scripts/scald.rb --local --json tutorial/JsonTutorial0.scala 6 | 7 | Check the output: 8 | cat tutorial/data/jsonoutput0.tsv 9 | 10 | **/ 11 | 12 | import com.twitter.scalding.{Job, Args, JsonLine, Tsv} 13 | 14 | class JsonTutorial0(args: Args) extends Job(args) { 15 | JsonLine("tutorial/data/session.json", ('sessionId)).read 16 | .groupBy('sessionId){_.size} 17 | .write(Tsv("tutorial/data/jsonoutput0.tsv")) 18 | } 19 | -------------------------------------------------------------------------------- /scalding-commons/src/main/scala/com/twitter/scalding/examples/WordCountJob.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.examples 2 | 3 | import com.twitter.scalding._ 4 | 5 | class WordCountJob(args: Args) extends Job(args) { 6 | TypedPipe 7 | .from(TextLine(args("input"))) 8 | .flatMap(line => line.split("\\s+")) 9 | .map(word => (word, 1L)) 10 | .sumByKey 11 | // The compiler will enforce the type coming out of the sumByKey is the same as the type we have for our sink 12 | .write(TypedTsv[(String, Long)](args("output"))) 13 | } 14 | -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/typed/cascading_backend/DistinctCoGroupJoiner.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.typed.cascading_backend 2 | 3 | import com.twitter.scalding.TupleGetter 4 | import com.twitter.scalding.typed.MultiJoinFunction 5 | 6 | // If all the input pipes are unique, this works: 7 | class DistinctCoGroupJoiner[K]( 8 | count: Int, 9 | getter: TupleGetter[K], 10 | @transient joinF: MultiJoinFunction[K, Any] 11 | ) extends CoGroupedJoiner[K](count, getter, joinF) { 12 | val distinctSize = count 13 | def distinctIndexOf(idx: Int) = idx 14 | } 15 | -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/serialization/SerializeAsUnit.scala.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.serialization 2 | 3 | import com.esotericsoftware.kryo.Kryo 4 | import com.esotericsoftware.kryo.{Serializer => KSerializer} 5 | import com.esotericsoftware.kryo.io.{Input, Output} 6 | 7 | // We use this for TypedPipe subclasses which should never be needed when we run 8 | class SerializeAsUnit[T >: Null] extends KSerializer[T] { 9 | override def write(kryo: Kryo, output: Output, t: T): Unit = () 10 | override def read(kryo: Kryo, input: Input, t: Class[T]): T = null 11 | } 12 | -------------------------------------------------------------------------------- /scalding-quotation/src/test/scala/com/twitter/scalding/quotation/Person.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.quotation 2 | 3 | case class Contact(phone: String) 4 | case class Person(name: String, contact: Contact, alternativeContact: Option[Contact]) 5 | 6 | object Person { 7 | val typeReference = TypeReference(typeName[Person]) 8 | val nameProjection = typeReference.andThen(Accessor("name"), typeName[String]) 9 | val contactProjection = typeReference.andThen(Accessor("contact"), typeName[Contact]) 10 | val phoneProjection = contactProjection.andThen(Accessor("phone"), typeName[String]) 11 | } 12 | -------------------------------------------------------------------------------- 
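`SerializeAsUnit` above is a Kryo serializer that writes nothing and reads back `null`. A sketch of how it might be registered, using the `TypedPipe` case its comment mentions (the registration site is an assumption; scalding's actual Kryo setup may wire this differently):

```scala
import com.esotericsoftware.kryo.Kryo
import com.twitter.scalding.serialization.SerializeAsUnit
import com.twitter.scalding.typed.TypedPipe

// Any TypedPipe captured in a closure is written as nothing and read back as
// null, since it should never be needed once the job is actually running.
val kryo = new Kryo()
kryo.addDefaultSerializer(classOf[TypedPipe[Any]], new SerializeAsUnit[TypedPipe[Any]])
```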
/scalding-commons/src/test/scala/com/twitter/scalding/commons/source/typedtext/TypedTextTest.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.commons.source 2 | 3 | import com.twitter.scalding._ 4 | 5 | import org.scalatest.FunSuite 6 | 7 | case class Test1(a: Int, b: Long, c: Option[Double]) 8 | case class Test2(one: Test1, d: String) 9 | 10 | class TypedTextTest extends FunSuite { 11 | test("Test with a nested tuple: Daily") { 12 | val source = 13 | LzoTypedText.dailyLzoTsv[Test2]("myPath")(DateRange(RichDate.now, RichDate.now + Hours(1)), implicitly) 14 | assert(source.sourceFields.size == 4) 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /scalding-base/src/main/scala/com/twitter/scalding/StatKey.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding 2 | 3 | case class StatKey(counter: String, group: String) extends java.io.Serializable 4 | 5 | object StatKey { 6 | // This is implicit to allow Stat("c", "g") to work. 7 | implicit def fromCounterGroup(counterGroup: (String, String)): StatKey = counterGroup match { 8 | case (c, g) => StatKey(c, g) 9 | } 10 | // Create a Stat in the ScaldingGroup 11 | implicit def fromCounterDefaultGroup(counter: String): StatKey = 12 | StatKey(counter, ScaldingGroup) 13 | 14 | val ScaldingGroup = "Scalding Custom" 15 | } 16 | -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/source/NullSink.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.source 2 | 3 | import com.twitter.scalding.typed.TypedSink 4 | import com.twitter.scalding.{BaseNullSource, TupleSetter} 5 | 6 | /** 7 | * This can be used to cause cascading to run a flow, but discard the output. The only place this is likely of 8 | use is to accomplish some task in a non-recommended, but sometimes most expedient, way. 
9 | */ 10 | object NullSink extends BaseNullSource with TypedSink[Any] { 11 | def setter[U <: Any] = TupleSetter.asSubSetter[Any, U](TupleSetter.singleSetter) 12 | } 13 | -------------------------------------------------------------------------------- /.scalafmt.conf: -------------------------------------------------------------------------------- 1 | version = 3.5.1 2 | maxColumn = 110 3 | docstrings.style = Asterisk 4 | newlines.alwaysBeforeMultilineDef = false 5 | newlines.penalizeSingleSelectMultiArgList = false 6 | align.openParenCallSite = false 7 | rewrite.rules = [AvoidInfix, SortImports, RedundantBraces, RedundantParens, PreferCurlyFors] 8 | rewrite.redundantBraces.generalExpressions = false 9 | 10 | # scalafmt can only choose one scala version target per file to format 11 | # we have to use 212 for build.sbt or else we get failures 12 | runner.dialect = scala211 13 | fileOverride { 14 | "glob:**build.sbt" { 15 | runner.dialect = scala212 16 | } 17 | } -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/mathematics/Poisson.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.mathematics 2 | 3 | import scala.util.Random 4 | 5 | /** 6 | * Generating Poisson-distributed random variables according to Donald Knuth's algorithm as shown on 7 | * Wikipedia's Poisson Distribution page 8 | */ 9 | 10 | class Poisson(fraction: Double, seed: Int) { 11 | 12 | val L = math.exp(-fraction) 13 | val randomGenerator = new Random(seed) 14 | 15 | def nextInt = { 16 | var k = 0 17 | var p = 1.0 18 | do { 19 | k = k + 1 20 | p = p * randomGenerator.nextDouble 21 | } while (p > L) 22 | k - 1 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /scalding-commons/src/test/java/com/twitter/scalding/commons/datastores/FSTestCase.java: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.commons.datastores; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.FileSystem; 5 | 6 | import java.io.IOException; 7 | 8 | public class FSTestCase { 9 | public FileSystem local; 10 | public FileSystem fs; 11 | 12 | public FSTestCase() { 13 | try { 14 | local = FileSystem.getLocal(new Configuration()); 15 | fs = FileSystem.get(new Configuration()); 16 | } catch(IOException e) { 17 | throw new RuntimeException(e); 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /scalding-avro/README.md: -------------------------------------------------------------------------------- 1 | This module contains Avro support for Scalding. It's based on the cascading.avro project, 2 | https://github.com/ScaleUnlimited/cascading.avro . 3 | 4 | In some cases Kryo (the default serializer used by Scalding) doesn't work well with Avro objects. If you run into 5 | serialization errors, or if you want to preempt any trouble, you should add the following to your Job class: 6 | ```scala 7 | override def ioSerializations = 8 | super.ioSerializations :+ classOf[cascading.avro.serialization.AvroSpecificRecordSerialization[_]] 9 | ``` 10 | 11 | This will use cascading.avro's Avro SpecificRecord serialization for Avro objects in place of the Kryo serialization. 
12 | 13 | -------------------------------------------------------------------------------- /scalding-commons/src/test/java/com/twitter/scalding/commons/datastores/TestUtils.java: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.commons.datastores; 2 | 3 | import org.apache.hadoop.fs.FileSystem; 4 | import org.apache.hadoop.fs.Path; 5 | 6 | import java.io.IOException; 7 | 8 | public class TestUtils { 9 | 10 | private static final String TMP_ROOT = "/tmp/unittests"; 11 | 12 | public static String getTmpPath(FileSystem fs, String name) throws IOException { 13 | fs.mkdirs(new Path(TMP_ROOT)); 14 | String full = TMP_ROOT + "/" + name; 15 | if (fs.exists(new Path(full))) { 16 | fs.delete(new Path(full), true); 17 | } 18 | return full; 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /scalding-dagon/src/main/scala/com/twitter/scalding/dagon/FunctionK.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.dagon 2 | 3 | import java.io.Serializable 4 | 5 | /** 6 | * This is a Natural transformation. 7 | * 8 | * For any type X, this type can produce a function from T[X] to R[X]. 9 | */ 10 | trait FunctionK[T[_], R[_]] extends Serializable { 11 | def apply[U](tu: T[U]): R[U] = 12 | toFunction[U](tu) 13 | 14 | def toFunction[U]: T[U] => R[U] 15 | } 16 | 17 | object FunctionK { 18 | def andThen[A[_], B[_], C[_]](first: FunctionK[A, B], second: FunctionK[B, C]): FunctionK[A, C] = 19 | new FunctionK[A, C] { 20 | def toFunction[U] = first.toFunction[U].andThen(second.toFunction[U]) 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /scripts/test_typed_tutorials.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -exv 2 | 3 | # Identify the bin dir in the distribution, and source the common include script 4 | BASE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/.. 
&& pwd )" 5 | source ${BASE_DIR}/scripts/common.sh 6 | 7 | # Test TypedTutorial cases 8 | for t in 1 2 3 4 5 pipes block; do 9 | echo "--------------------" 10 | echo "TypedTutorial: $t" 11 | echo "--------------------" 12 | time $SCALD tutorial/TypedTutorial.scala \ 13 | --tutorial $t \ 14 | --input tutorial/data/hello.txt \ 15 | --output tutorial/data/output0.txt \ 16 | --words tutorial/data/word_scores.tsv 17 | echo "--------------------" 18 | cat tutorial/data/output0.txt 19 | done 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | # note that if an org-wide global config is configured, it will be merged (with duplicate settings taking priority from this file) 2 | # it's better to explicitly set all configs if you want consistency 3 | 4 | codecov: 5 | require_ci_to_pass: yes 6 | 7 | coverage: 8 | precision: 2 9 | round: down 10 | range: "0...100" # acceptable coverage range 11 | 12 | # default behaviour 13 | parsers: 14 | gcov: 15 | branch_detection: 16 | conditional: yes 17 | loop: yes 18 | method: no 19 | macro: no 20 | 21 | # can be configured in https://docs.codecov.com/docs/pull-request-comments 22 | comment: 23 | layout: "reach,diff,flags,files,footer" 24 | behavior: default 25 | require_changes: no 26 | -------------------------------------------------------------------------------- /scalding-core/src/test/scala/com/twitter/scalding/FlowStateMapTest.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding 2 | 3 | import org.scalatest.FunSuite 4 | 5 | import cascading.flow.FlowDef 6 | import com.twitter.scalding.source.{NullSink, TypedText} 7 | import com.twitter.scalding.typed.cascading_backend.{CascadingBackend, CascadingExtensions} 8 | 9 | import CascadingExtensions._ 10 | 11 | class FlowStateMapTest extends FunSuite { 12 | test("make sure sourcemap isn't empty after planning") { 13 | implicit val fd = new FlowDef 14 | implicit val m = Local(false) 15 | val t = TypedPipe.from(TypedText.tsv[String]("")).write(NullSink) 16 | CascadingBackend.planTypedWrites(fd, m) 17 | val state = FlowStateMap(fd) 18 | assert(state.sourceMap.nonEmpty) 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /scalding-dagon/src/main/scala-2.12-/com/twitter/scalding/dagon/ScalaVersionCompat.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.dagon 2 | 3 | object ScalaVersionCompat { 4 | type LazyList[+A] = scala.collection.immutable.Stream[A] 5 | val LazyList = scala.collection.immutable.Stream 6 | 7 | type IterableOnce[+A] = scala.collection.TraversableOnce[A] 8 | 9 | def iterateOnce[A](as: IterableOnce[A]): Iterator[A] = 10 | as.toIterator 11 | 12 | def lazyList[A](as: A*): LazyList[A] = 13 | Stream(as: _*) 14 | 15 | def lazyListToIterator[A](lst: LazyList[A]): Iterator[A] = 16 | lst.iterator 17 | 18 | def lazyListFromIterator[A](it: Iterator[A]): LazyList[A] = 19 | it.toStream 20 | 21 | implicit val ieeeDoubleOrdering: Ordering[Double] = 22 | Ordering.Double 23 | } 24 | -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/TupleConversions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 
3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.twitter.scalding 17 | 18 | @deprecated("This trait does nothing now", "0.9.0") 19 | trait TupleConversions 20 | -------------------------------------------------------------------------------- /scalding-dagon/src/main/scala-2.13+/com/twitter/scalding/dagon/ScalaVersionCompat.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.dagon 2 | 3 | object ScalaVersionCompat { 4 | type LazyList[+A] = scala.collection.immutable.LazyList[A] 5 | val LazyList = scala.collection.immutable.LazyList 6 | 7 | type IterableOnce[+A] = scala.collection.IterableOnce[A] 8 | 9 | def iterateOnce[A](as: IterableOnce[A]): Iterator[A] = 10 | as.iterator 11 | 12 | def lazyList[A](as: A*): LazyList[A] = 13 | LazyList(as: _*) 14 | 15 | def lazyListToIterator[A](lst: LazyList[A]): Iterator[A] = 16 | lst.iterator 17 | 18 | def lazyListFromIterator[A](it: Iterator[A]): LazyList[A] = 19 | LazyList.from(it) 20 | 21 | implicit val ieeeDoubleOrdering: Ordering[Double] = 22 | Ordering.Double.IeeeOrdering 23 | } 24 | -------------------------------------------------------------------------------- /scalding-core/codegen/function_implicits_generator.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | # Run it like this: 4 | # 5 | # ruby codegen/function_implicits_generator.rb > src/main/scala/com/twitter/scalding/FunctionImplicits.scala 6 | 7 | $indent = " " 8 | 9 | def puts_function_to_tupled(cnt) 10 | gen = (1 .. cnt).map{ |i| "T#{i}" }.join(", ") 11 | puts "#{$indent}implicit def function#{cnt}ToTupledFunction1[#{gen}, R](f: Function#{cnt}[#{gen}, R]): Function1[(#{gen}), R] = f.tupled" 12 | end 13 | 14 | puts "// following were autogenerated by #{__FILE__} at #{Time.now} do not edit" 15 | puts "package com.twitter.scalding" 16 | puts 17 | puts"object FunctionImplicits {" 18 | puts 19 | 20 | (2 .. 22).each { |c| 21 | puts_function_to_tupled(c) 22 | puts 23 | } 24 | 25 | puts "}" 26 | puts "// end of autogenerated" 27 | -------------------------------------------------------------------------------- /scalding-thrift-macros/src/main/scala/com/twitter/scalding/thrift/macros/RequiredBinaryComparators.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.thrift.macros 2 | 3 | import com.twitter.scalding.serialization.{OrderedSerialization, RequiredBinaryComparatorsConfig} 4 | import com.twitter.scalding.thrift.macros.impl.ScroogeInternalOrderedSerializationImpl 5 | import scala.language.experimental.{macros => smacros} 6 | 7 | /** 8 | * Provides support for Scrooge classes in addition to primitives, case classes, tuples etc. Use this if you 9 | * use Scrooge classes as `key` in your scalding job. 10 | * @author 11 | * Mansur Ashraf. 
12 | */ 13 | trait RequiredBinaryComparators extends RequiredBinaryComparatorsConfig { 14 | implicit def ordSer[T]: OrderedSerialization[T] = macro ScroogeInternalOrderedSerializationImpl[T] 15 | } 16 | -------------------------------------------------------------------------------- /scalding-parquet/src/main/java/com/twitter/scalding/parquet/tuple/TupleRecordMaterializer.java: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.parquet.tuple; 2 | 3 | import cascading.tuple.Tuple; 4 | 5 | import org.apache.parquet.io.api.GroupConverter; 6 | import org.apache.parquet.io.api.RecordMaterializer; 7 | import org.apache.parquet.schema.GroupType; 8 | 9 | public class TupleRecordMaterializer extends RecordMaterializer { 10 | 11 | private ParquetTupleConverter root; 12 | 13 | public TupleRecordMaterializer(GroupType parquetSchema) { 14 | this.root = new ParquetTupleConverter(parquetSchema); 15 | } 16 | 17 | @Override 18 | public Tuple getCurrentRecord() { 19 | return root.getCurrentTuple(); 20 | } 21 | 22 | @Override 23 | public GroupConverter getRootConverter() { 24 | return root; 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /scripts/test_execution_tutorial.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -exv 2 | 3 | # Identify the bin dir in the distribution, and source the common include script 4 | BASE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/.. && pwd )" 5 | source ${BASE_DIR}/scripts/common.sh 6 | SHORT_SCALA_VERSION=${TRAVIS_SCALA_VERSION%.*} 7 | SCALDING_VERSION=`./sbt -Dsbt.log.noformat=true -Dsbt.supershell=false "print scalding-core / version" -error` 8 | 9 | export CLASSPATH=tutorial/execution-tutorial/target/scala-${SHORT_SCALA_VERSION}/execution-tutorial-assembly-${SCALDING_VERSION}.jar 10 | time java -jar tutorial/execution-tutorial/target/scala-${SHORT_SCALA_VERSION}/execution-tutorial-assembly-${SCALDING_VERSION}.jar \ 11 | com.twitter.scalding.tutorial.MyExecJob --local \ 12 | --input tutorial/data/hello.txt \ 13 | --output tutorial/data/execution_output.txt 14 | -------------------------------------------------------------------------------- /scalding-dagon/src/main/scala/com/twitter/scalding/dagon/RefPair.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.dagon 2 | 3 | import scala.util.hashing.MurmurHash3 4 | 5 | /** 6 | * A tuple2 that uses reference equality on its items for equality, which is useful for caching the results 7 | * of pair-wise functions on DAGs. 8 | * 9 | * Without this, you can easily get exponential complexity on recursion on DAGs. 
10 |  */
11 | case class RefPair[A <: AnyRef, B <: AnyRef](_1: A, _2: B) {
12 |
13 |   override lazy val hashCode: Int = MurmurHash3.productHash(this)
14 |
15 |   override def equals(that: Any): Boolean = that match {
16 |     case RefPair(thatA, thatB) => (_1 eq thatA) && (_2 eq thatB)
17 |     case _                     => false
18 |   }
19 |
20 |   /**
21 |    * true if the left is referentially equal to the right
22 |    */
23 |   def itemsEq: Boolean = _1 eq _2
24 | }
--------------------------------------------------------------------------------
/scalding-commons/codegen/lzotypedtsv_generator.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 |
3 | # Run it like this:
4 | #
5 | #   ./codegen/lzotypedtsv_generator.rb > src/main/scala/com/twitter/scalding/commons/source/GeneratedLzoTypedTsv.scala
6 |
7 | $indent = " "
8 |
9 | TYPES = ('A'..'Z').to_a
10 |
11 | def make_typed_tsv(cnt)
12 |   typeString = TYPES[0..(cnt - 1)].join(",")
13 |   puts "trait LzoTypedTsv#{cnt}[#{typeString}] extends LzoTypedTsv[Tuple#{cnt}[#{typeString}]] with Mappable#{cnt}[#{typeString}]"
14 | end
15 |
16 | puts "// following were autogenerated by #{__FILE__} at #{Time.now} do not edit"
17 | puts %q|package com.twitter.scalding.commons.source
18 |
19 | import com.twitter.scalding._
20 |
21 | |
22 |
23 | (1..22).each { |c|
24 |   make_typed_tsv(c)
25 |   puts
26 | }
27 |
28 | puts "// end of autogenerated"
--------------------------------------------------------------------------------
/scalding-core/src/test/scala/com/twitter/scalding/RegressionTests.scala:
--------------------------------------------------------------------------------
1 | package com.twitter.scalding
2 |
3 | import org.scalatest.FunSuite
4 |
5 | class RegressionTests extends FunSuite {
6 |   test("hashJoins + merges that fail in cascading 3") {
7 |     val p1 =
8 |       TypedPipe
9 |         .from(List(1, 2))
10 |         .cross(TypedPipe.from(List(3, 4)))
11 |
12 |     val p2 =
13 |       TypedPipe
14 |         .from(List(5, 6))
15 |         .cross(TypedPipe.from(List(8, 9)))
16 |
17 |     val p3 = p1 ++ p2
18 |     val p4 = TypedPipe.from(List((8, 1), (10, 2))) ++ p3
19 |
20 |     val expected = List((1, 3), (1, 4), (2, 3), (2, 4), (5, 8), (5, 9), (6, 8), (6, 9), (8, 1), (10, 2))
21 |     val values = p4.toIterableExecution
22 |       .waitFor(Config.empty, Local(true))
23 |       .get
24 |     assert(values.toList.sorted == expected)
25 |   }
26 | }
--------------------------------------------------------------------------------
/project/scalding-dagon.scala:
--------------------------------------------------------------------------------
1 | import sbt.CrossVersion
2 |
3 | import java.io.File
4 | import java.nio.file.Paths
5 |
6 | object scaldingDagonSettings {
7 |
8 |   // load either the scala-2.12- or the scala-2.13+ dagon src dir, depending on the Scala version
9 |   def scalaVersionSpecificFolders(srcName: String, srcBaseDir: File, scalaVersion: String) = {
10 |
11 |     def extraDirs(suffix: String) = {
12 |       val scalaCompat = Paths
13 |         .get(srcBaseDir.toString)
14 |         .resolve("src")
15 |         .resolve(srcName)
16 |         .resolve("scala" + suffix)
17 |         .toFile
18 |       Seq(scalaCompat)
19 |     }
20 |
21 |     CrossVersion.partialVersion(scalaVersion) match {
22 |       case Some((2, y)) if y <= 12 =>
23 |         extraDirs("-2.12-")
24 |       case Some((2, y)) if y >= 13 =>
25 |         extraDirs("-2.13+")
26 |       case _ => Nil
27 |     }
28 |   }
29 |
30 | }
--------------------------------------------------------------------------------
/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/providers/StableKnownDirectSubclasses.scala:
--------------------------------------------------------------------------------
1 | package com.twitter.scalding.serialization.macros.impl.ordered_serialization.providers
2 |
3 | import scala.reflect.macros.whitebox.Context
4 |
5 | /**
6 |  * The `knownDirectSubclasses` method doesn't provide stable ordering since it returns an unordered `Set` and
7 |  * the `Type` AST nodes don't override the `hashCode` method, relying on the default identity `hashCode`.
8 |  *
9 |  * This function makes the ordering stable by returning a list sorted by the full names of the types.
10 |  */
11 | object StableKnownDirectSubclasses {
12 |
13 |   def apply(c: Context)(tpe: c.Type): List[c.universe.TypeSymbol] = // linter:ignore:UnusedParameter
14 |     tpe.typeSymbol.asClass.knownDirectSubclasses.map(_.asType).toList.sortBy(_.fullName)
15 | }
--------------------------------------------------------------------------------
/scalding-core/codegen/mappable_generator.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 |
3 | # Run it like this:
4 | #
5 | #   ./codegen/mappable_generator.rb > src/main/scala/com/twitter/scalding/GeneratedMappable.scala
6 |
7 | $indent = " "
8 |
9 | TYPES = ('A'..'Z').to_a
10 |
11 | def make_mappable(cnt)
12 |   typeString = TYPES[0..(cnt - 1)].join(",")
13 |   puts "trait Mappable#{cnt}[#{typeString}] extends Mappable[Tuple#{cnt}[#{typeString}]] {"
14 |   puts "#{$indent}def converter[Z >: Tuple#{cnt}[#{typeString}]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple#{cnt}[#{typeString}]])"
15 |   puts "}"
16 | end
17 |
18 | puts "// following were autogenerated by #{__FILE__} at #{Time.now} do not edit"
19 | puts %q|package com.twitter.scalding
20 |
21 | |
22 |
23 | (1..22).each { |c|
24 |   make_mappable(c)
25 |   puts
26 | }
27 |
28 | puts "// end of autogenerated"
--------------------------------------------------------------------------------
/tutorial/AvroTutorial0.scala:
--------------------------------------------------------------------------------
1 | /**
2 | Scalding with Avro (and JSON) tutorial part 0.
3 |
4 | To run this job:
5 |   scripts/scald.rb --local --avro --json tutorial/AvroTutorial0.scala
6 |
7 | Check the output:
8 |   java -jar avro-tools-1.7.6.jar tojson tutorial/data/avrooutput0.avro
9 |
10 | **/
11 |
12 | import com.twitter.scalding.{Job, Args, JsonLine}
13 | import com.twitter.scalding.avro.UnpackedAvroSource
14 | import org.apache.avro.Schema
15 |
16 | class AvroTutorial0(args: Args) extends Job(args) {
17 |   val schema = """{
18 |     "type": "record", "name": "parseJson", "fields": [
19 |       { "name": "sessionId", "type": "string" },
20 |       { "name": "optionalField", "type": ["string", "null"] }
21 |     ] }"""
22 |
23 |   JsonLine("tutorial/data/session.json", ('sessionId, 'optionalField)).read
24 |     .write(UnpackedAvroSource("tutorial/data/avrooutput0.avro", new Schema.Parser().parse(schema)))
25 | }
--------------------------------------------------------------------------------
/scalding-base/src/main/scala/com/twitter/scalding/typed/memory_backend/AtomicBox.scala:
--------------------------------------------------------------------------------
1 | package com.twitter.scalding.typed.memory_backend
2 |
3 | import java.util.concurrent.atomic.AtomicReference
4 |
5 | class AtomicBox[T <: AnyRef](init: T) {
6 |   private[this] val ref = new AtomicReference[T](init)
7 |
8 |   def lazySet(t: T): Unit =
9 |     ref.lazySet(t)
10 |
11 |   def set(t: T): Unit =
12 |     ref.set(t)
13 |
14 |   def swap(t: T): T =
15 |     ref.getAndSet(t)
16 |
17 |   /**
18 |    * Use a pure function to update the state. `fn` may be called more than once under contention
19 |    * (the compare-and-set loop below retries), so it must be side-effect free.
20 |    */
21 |   def update[R](fn: T => (T, R)): R = {
22 |
23 |     @annotation.tailrec
24 |     def loop(): R = {
25 |       val init = ref.get
26 |       val (next, res) = fn(init)
27 |       if (ref.compareAndSet(init, next)) res
28 |       else loop()
29 |     }
30 |
31 |     loop()
32 |   }
33 |
34 |   def get(): T = ref.get
35 | }
--------------------------------------------------------------------------------
/scalding-core/src/main/scala/com/twitter/scalding/Sortable.scala:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright 2012 Twitter, Inc.
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | package com.twitter.scalding
17 |
18 | import cascading.tuple.Fields
19 |
20 | trait Sortable[+Self] {
21 |   // Perform an inner secondary sort
22 |   def sortBy(innerSort: Fields): Self
23 |   def sorting: Option[Fields]
24 | }
--------------------------------------------------------------------------------
/tutorial/MatrixTutorial1.scala:
--------------------------------------------------------------------------------
1 | package com.twitter.scalding.examples
2 |
3 | import com.twitter.scalding._
4 | import com.twitter.scalding.mathematics.Matrix
5 |
6 | /*
7 |  * MatrixTutorial1.scala
8 |  *
9 |  * Loads a directed graph adjacency matrix where a[i,j] = 1 if there is an edge from node i to node j,
10 |  * and computes the co-follows between every pair of nodes.
11 |  *
12 |  * ../scripts/scald.rb --local MatrixTutorial1.scala --input data/graph.tsv --output data/cofollows.tsv
13 |  */
14 |
15 | class CofollowsJob(args: Args) extends Job(args) {
16 |
17 |   import Matrix._
18 |
19 |   val adjacencyMatrix = Tsv(args("input"), ('user1, 'user2, 'rel))
20 |     .read
21 |     .toMatrix[Long, Long, Double]('user1, 'user2, 'rel)
22 |
23 |   // compute the inner product of the adjacency matrix with its transpose
24 |   (adjacencyMatrix * adjacencyMatrix.transpose).write(Tsv(args("output")))
25 | }
--------------------------------------------------------------------------------
/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/ReducerHistoryEstimator.scala:
--------------------------------------------------------------------------------
1 | package com.twitter.scalding.reducer_estimation
2 |
3 | import com.twitter.scalding.estimation.{HistoryEstimator, Task}
4 | import org.apache.hadoop.mapred.JobConf
5 |
6 | object ReducerHistoryEstimator {
7 |   val Status = "status"
8 |   val StartTime = "startTime"
9 |   val FinishTime = "finishTime"
10 |
11 |   implicit class ReducerRichTask(val task: Task) {
12 |     def status: Option[String] = task.details.get(Status).map(_.asInstanceOf[String])
13 |     def startTime: Option[Long] = task.details.get(StartTime).map(_.asInstanceOf[Long])
14 |     def finishTime: Option[Long] = task.details.get(FinishTime).map(_.asInstanceOf[Long])
15 |   }
16 | }
17 |
18 | trait ReducerHistoryEstimator extends HistoryEstimator[Int] {
19 |   override def maxHistoryItems(conf: JobConf): Int = ReducerEstimatorConfig.getMaxHistory(conf)
20 | }
--------------------------------------------------------------------------------
/scalding-serialization/src/test/scala/com/twitter/scalding/serialization/macros/ZDifficultTypes.scala:
--------------------------------------------------------------------------------
1 | package com.twitter.scalding.some.other.space.space
2 |
3 | sealed trait ContainerX
4 | object ContainerX {
5 |   case class ElementY(x: String) extends ContainerX
6 |   case class ElementZ(x: String) extends ContainerX
7 | }
8 |
9 | // This is intentionally not sealed. Users can supply their own implementations.
10 | trait ContainerP {
11 |   def id: String
12 | }
13 | object ContainerP {
14 |   case object ElementA extends ContainerP {
15 |     def id: String = "a"
16 |   }
17 |   case object ElementB extends ContainerP {
18 |     def id: String = "b"
19 |   }
20 |   def fromId(id: String): ContainerP = id match {
21 |     case _ if id == ElementA.id => ElementA
22 |     case _ if id == ElementB.id => ElementB
23 |   }
24 | }
25 |
26 | case class TestCaseHardA(e: ContainerX, y: String)
27 | case class TestCaseHardB(e: ContainerP, y: String)
--------------------------------------------------------------------------------
/tutorial/MatrixTutorial0.scala:
--------------------------------------------------------------------------------
1 | package com.twitter.scalding.examples
2 |
3 | import com.twitter.scalding._
4 | import com.twitter.scalding.mathematics.Matrix
5 |
6 | /*
7 |  * MatrixTutorial0.scala
8 |  *
9 |  * Loads a directed graph adjacency matrix where a[i,j] = 1 if there is an edge from node i to node j,
10 |  * and computes the outdegree of each node i.
11 |  *
12 |  * ../scripts/scald.rb --local MatrixTutorial0.scala --input data/graph.tsv --output data/outdegree.tsv
13 |  */
14 |
15 | class GraphOutDegreeJob(args: Args) extends Job(args) {
16 |
17 |   import Matrix._
18 |
19 |   val adjacencyMatrix = Tsv(args("input"), ('user1, 'user2, 'rel))
20 |     .read
21 |     .toMatrix[Long, Long, Double]('user1, 'user2, 'rel)
22 |
23 |   // each row i represents all of the outgoing edges from i;
24 |   // by summing out all of the columns we get the outdegree of i
25 |   adjacencyMatrix.sumColVectors.write(Tsv(args("output")))
26 | }
--------------------------------------------------------------------------------
/scalding-base/src/main/scala/com/twitter/scalding/CPromise.scala:
--------------------------------------------------------------------------------
1 | package com.twitter.scalding
2 |
3 | import scala.concurrent.{ExecutionContext => ConcurrentExecutionContext, Future, Promise}
4 |
5 | /**
6 |  * Represents a cancellable promise.
7 |  */
8 | case class CPromise[T](promise: Promise[T], cancellationHandler: Promise[CancellationHandler]) {
9 |
10 |   /**
11 |    * Creates a CFuture using the given promises.
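   *
   * Illustrative sketch of the relationship (not part of the formal API contract):
   * {{{
   * val cp = CPromise[Int]()
   * val cf = cp.cfuture // completing cp.promise completes cf.future
   * }}}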
12 |    */
13 |   def cfuture: CFuture[T] =
14 |     CFuture(promise.future, CancellationHandler.fromFuture(cancellationHandler.future))
15 |
16 |   def completeWith(other: CFuture[T]): this.type = {
17 |     // fulfill the main and cancellation handler promises
18 |     promise.completeWith(other.future)
19 |     cancellationHandler.completeWith(Future.successful(other.cancellationHandler))
20 |     this
21 |   }
22 | }
23 | object CPromise {
24 |   def apply[T](): CPromise[T] = CPromise(Promise[T](), Promise[CancellationHandler]())
25 | }
--------------------------------------------------------------------------------
/scalding-base/src/main/scala/com/twitter/scalding/typed/functions/FlatMapping.scala:
--------------------------------------------------------------------------------
1 | package com.twitter.scalding.typed.functions
2 |
3 | import java.io.Serializable
4 |
5 | /**
6 |  * This is one of four core, non-composed operations: identity, filter, map, and flatMap.
7 |  */
8 | sealed abstract class FlatMapping[-A, +B] extends Serializable
9 |
10 | object FlatMapping extends Serializable {
11 |   def filter[A](fn: A => Boolean): FlatMapping[A, A] =
12 |     Filter[A, A](fn, implicitly)
13 |
14 |   def filterKeys[K, V](fn: K => Boolean): FlatMapping[(K, V), (K, V)] =
15 |     filter[(K, V)](FilterKeysToFilter(fn))
16 |
17 |   final case class Identity[A, B](ev: EqTypes[A, B]) extends FlatMapping[A, B]
18 |   final case class Filter[A, B](fn: A => Boolean, ev: EqTypes[A, B]) extends FlatMapping[A, B]
19 |   final case class Map[A, B](fn: A => B) extends FlatMapping[A, B]
20 |   final case class FlatM[A, B](fn: A => TraversableOnce[B]) extends FlatMapping[A, B]
21 | }
--------------------------------------------------------------------------------
/scalding-commons/src/main/scala/com/twitter/scalding/commons/scheme/CombinedSequenceFileScheme.scala:
--------------------------------------------------------------------------------
1 | package com.twitter.scalding.commons.scheme
2 |
3 | import cascading.scheme.Scheme
4 | import com.twitter.elephantbird.cascading2.scheme.{CombinedSequenceFile, CombinedWritableSequenceFile}
5 | import com.twitter.scalding.{HadoopSchemeInstance, SequenceFileScheme, WritableSequenceFileScheme}
6 |
7 | trait CombinedSequenceFileScheme extends SequenceFileScheme {
8 |   // TODO Cascading doesn't support local mode yet
9 |   override def hdfsScheme = HadoopSchemeInstance(
10 |     new CombinedSequenceFile(fields).asInstanceOf[Scheme[_, _, _, _, _]]
11 |   )
12 | }
13 |
14 | trait CombinedWritableSequenceFileScheme extends WritableSequenceFileScheme {
15 |   // TODO Cascading doesn't support local mode yet
16 |   override def hdfsScheme =
17 |     HadoopSchemeInstance(
18 |       new CombinedWritableSequenceFile(fields, keyType, valueType).asInstanceOf[Scheme[_, _, _, _, _]]
19 |     )
20 | }
--------------------------------------------------------------------------------
/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/BlobTypeHandler.scala:
--------------------------------------------------------------------------------
1 | package com.twitter.scalding.db.macros.impl.handler
2 |
3 | import com.twitter.scalding.db.macros.impl.FieldName
4 | import scala.reflect.macros.Context
5 | import scala.util.{Failure, Success}
6 |
7 | object BlobTypeHandler {
8 |   def apply[T](c: Context)(implicit
9 |       accessorTree: List[c.universe.MethodSymbol],
10 |       fieldName: FieldName,
11 |       defaultValue: Option[c.Expr[String]],
12 |       annotationInfo: List[(c.universe.Type, Option[Int])],
13 |       nullable: Boolean
14 |   ): scala.util.Try[List[ColumnFormat[c.type]]] =
15 |     if (defaultValue.nonEmpty || annotationInfo.nonEmpty)
16 |       Failure(
17 |         new Exception(
18 |           s"Default values and annotation info are not supported: defaultValue = $defaultValue annotationInfo = $annotationInfo"
19 |         )
20 |       )
21 |     else
22 |       Success(List(ColumnFormat(c)(accessorTree, "BLOB", None)))
23 | }
--------------------------------------------------------------------------------
/scalding-base/src/main/scala/com/twitter/scalding/StringUtility.scala:
--------------------------------------------------------------------------------
1 | package com.twitter.scalding
2 |
3 | object StringUtility {
4 |
5 |   /**
6 |    * Splits `text` on every occurrence of `key` (a literal separator, not a regex), always keeping
7 |    * trailing empty strings. Implemented iteratively so the stack depth does not grow with the
8 |    * number of separators in the input.
9 |    */
10 |   def fastSplit(text: String, key: String): List[String] = {
11 |     val result = List.newBuilder[String]
12 |     var from = 0
13 |     var index = text.indexOf(key, from)
14 |     while (index != -1) {
15 |       // the text up to the separator should be kept in any case
16 |       result += text.substring(from, index)
17 |       from = index + key.length
18 |       index = text.indexOf(key, from)
19 |     }
20 |     // the remainder after the last separator; "" when the text ends with the separator
21 |     result += text.substring(from)
22 |     result.result()
23 |   }
24 | }
--------------------------------------------------------------------------------
/scalding-base/src/main/scala/com/twitter/scalding/typed/KeyedPipe.scala:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright 2014 Twitter, Inc.
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | package com.twitter.scalding.typed
17 |
18 | /**
19 |  * Represents anything that starts as a TypedPipe of key/value pairs, where the value type has been erased.
20 |  * Also acts as proof that the key type K in the tuple has an Ordering.
21 |  */
22 | trait KeyedPipe[K] {
23 |   def keyOrdering: Ordering[K]
24 |   def mapped: TypedPipe[(K, Any)]
25 | }
--------------------------------------------------------------------------------
/scalding-commons/src/main/java/com/twitter/scalding/commons/datastores/Utils.java:
--------------------------------------------------------------------------------
1 | package com.twitter.scalding.commons.datastores;
2 |
3 | import org.apache.hadoop.conf.Configuration;
4 | import org.apache.hadoop.fs.FileSystem;
5 | import org.apache.hadoop.fs.Path;
6 |
7 | import java.io.IOException;
8 |
9 | public class Utils {
10 |
11 |   public static FileSystem getFS(String path) throws IOException {
12 |     return getFS(path, new Configuration());
13 |   }
14 |
15 |   public static FileSystem getFS(String path, Configuration conf) throws IOException {
16 |     return new Path(path).getFileSystem(conf);
17 |   }
18 |
19 |   /**
20 |    * Returns whether the input parses as a long.
21 |    * @param input the string to test
22 |    * @return true if {@code input} is a valid long, false otherwise
23 |    */
24 |   public static boolean isLong(String input) {
25 |     try {
26 |       Long.parseLong(input);
27 |       return true;
28 |     } catch (Exception e) {
29 |       return false;
30 |     }
31 |   }
32 | }
--------------------------------------------------------------------------------
/scalding-core/src/test/scala/com/twitter/scalding/TypedPipeCheckerTest.scala:
--------------------------------------------------------------------------------
1 | package com.twitter.scalding
2 |
3 | import org.scalatest.{Matchers, WordSpec}
4 |
5 | class TypedPipeCheckerTest extends WordSpec with Matchers {
6 |   import TypedPipeChecker._
7 |
8 |   "TypedPipeChecker" should {
9 |     "run asserts on pipe" in {
10 |       checkOutput(TypedPipe.from(List(1, 2, 3, 4))) { rows =>
11 |         assert(rows.size == 4)
12 |         assert(rows == List(1, 2, 3, 4))
13 |       }
14 |     }
15 |   }
16 |
17 |   it should {
18 |     "give back a list" in {
19 |       val list = inMemoryToList(TypedPipe.from(List(1, 2, 3, 4)))
20 |       assert(list == List(1, 2, 3, 4))
21 |     }
22 |   }
23 |
24 |   it should {
25 |     "allow for a list of input to be run through a transform function" in {
26 |       def transform(pipe: TypedPipe[Int]) = pipe.map(identity)
27 |
28 |       checkOutputTransform(List(1, 2, 3))(transform) { rows =>
29 |         assert(rows == List(1, 2, 3))
30 |       }
31 |     }
32 |   }
33 | }
--------------------------------------------------------------------------------
/scripts/test_tutorials.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -exv
2 |
3 | # Identify the bin dir in the distribution, and source the common include script
4 | BASE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/..
&& pwd )" 5 | source ${BASE_DIR}/scripts/common.sh 6 | 7 | time $SCALD tutorial/Tutorial0.scala 8 | time $SCALD tutorial/Tutorial1.scala 9 | time $SCALD tutorial/Tutorial2.scala 10 | 11 | time $SCALD tutorial/Tutorial3.scala \ 12 | --input tutorial/data/hello.txt 13 | 14 | time $SCALD tutorial/Tutorial4.scala \ 15 | --input tutorial/data/hello.txt \ 16 | --output tutorial/data/output4.txt 17 | 18 | time $SCALD tutorial/Tutorial5.scala \ 19 | --input tutorial/data/hello.txt \ 20 | --output tutorial/data/output5.txt \ 21 | --words tutorial/data/words.txt 22 | 23 | time $SCALD tutorial/MatrixTutorial5.scala \ 24 | --input tutorial/data/graph.tsv \ 25 | --output tutorial/data/cosineSim.tsv 26 | 27 | time $SCALD --json tutorial/JsonTutorial0.scala 28 | 29 | time $SCALD --avro --json tutorial/AvroTutorial0.scala 30 | 31 | 32 | -------------------------------------------------------------------------------- /scalding-db/src/main/scala/com/twitter/scalding/db/DBTypeDescriptor.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2015 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package com.twitter.scalding.db 18 | import com.twitter.scalding._ 19 | import cascading.tuple.Fields 20 | 21 | trait DBTypeDescriptor[T] { 22 | def columnDefn: ColumnDefinitionProvider[T] 23 | def converter: TupleConverter[T] 24 | def setter: TupleSetter[T] 25 | def fields: Fields 26 | def jdbcSetter: JdbcStatementSetter[T] 27 | } 28 | -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/CascadeJob.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding 2 | 3 | import cascading.cascade.CascadeConnector 4 | import cascading.cascade.Cascade 5 | 6 | abstract class CascadeJob(args: Args) extends Job(args) { 7 | 8 | def jobs: Seq[Job] 9 | 10 | override def run = { 11 | val flows = jobs.map(_.buildFlow) 12 | val cascade = new CascadeConnector().connect(flows: _*) 13 | preProcessCascade(cascade) 14 | cascade.complete() 15 | postProcessCascade(cascade) 16 | val statsData = cascade.getCascadeStats 17 | 18 | handleStats(statsData) 19 | statsData.isSuccessful 20 | } 21 | 22 | override def validate(): Unit = 23 | jobs.foreach(_.validate()) 24 | 25 | /* 26 | * Good for printing a dot file, setting the flow skip strategy, etc 27 | */ 28 | def preProcessCascade(cascade: Cascade) = {} // linter:ignore 29 | 30 | /* 31 | * Good for checking the cascade stats 32 | */ 33 | def postProcessCascade(cascade: Cascade) = {} // linter:ignore 34 | 35 | } 36 | -------------------------------------------------------------------------------- /docs/src/main/tut/cookbook/hbase.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: docs 3 | title: "Scalding and HBase" 4 | section: "cookbook" 5 | --- 6 | 7 | # Using Scalding with HBase 8 | 9 | ## Resources 10 | 11 | 
- [Running Scalding with HBase support](https://github.com/kianwilcox/hbase-scalding) - a GitHub example project.
12 | - [Spy Glass](https://github.com/ParallelAI/SpyGlass) - a feature-rich HBase wrapper for Cascading and Scalding.
13 | - [Maple](https://github.com/Cascading/maple) - a collection of Cascading Taps, including a simple HBase tap. Spy Glass appears to be the more advanced option.
14 | - [KijiExpress](https://github.com/kijiproject/kiji-express) - provides a full lifecycle for building predictive models using Scalding and HBase.
15 |
16 | ## Example Code
17 |
18 | TODO: Please add links to example code here.
19 |
20 | ### Documentation Help
21 |
22 | We'd love your help fleshing out this documentation! You can edit this page in your browser by clicking [this link](https://github.com/twitter/scalding/edit/develop/docs/src/main/tut/cookbook/hbase.md).
--------------------------------------------------------------------------------
/scalding-quotation/src/test/scala/com/twitter/scalding/quotation/LimitationsTest.scala:
--------------------------------------------------------------------------------
1 | package com.twitter.scalding.quotation
2 |
3 | class LimitationsTest extends Test {
4 |
5 |   class TestClass {
6 |     def function[T, U](f: T => U)(implicit q: Quoted) = (q, f)
7 |   }
8 |
9 |   val test = new TestClass
10 |
11 |   "nested transitive projection" in pendingUntilFixed {
12 |     test.function[Person, Option[String]](_.alternativeContact.map(_.phone))._1.projections.set mustEqual
13 |       Set(
14 |         Person.typeReference
15 |           .andThen(Accessor("alternativeContact"), typeName[Option[Contact]])
16 |           .andThen(Accessor("phone"), typeName[String])
17 |       )
18 |   }
19 |
20 |   "nested quoted function projection" in pendingUntilFixed {
21 |     val contactFunction = Quoted.function { (p: Person) =>
22 |       p.contact
23 |     }
24 |     val phoneFunction = Quoted.function { (p: Person) =>
25 |       contactFunction(p).phone
26 |     }
27 |     phoneFunction.quoted.projections.set mustEqual Set(Person.phoneProjection)
28 |   }
29 | }
--------------------------------------------------------------------------------
/scripts/README.md:
--------------------------------------------------------------------------------
1 | Testing info:
2 |
3 | .travis.yml:
4 |
5 | To get better build times on the weaker travis machines, we split the build into several targets;
6 | .travis.yml defines those targets.
7 |
8 | run_test.sh:
9 |
10 | Here we unpack a cache (falling back to a static link, if we can, to try to get some bootstrap jars).
11 | We run the compile step with standard out/standard error directed to /dev/null (this is to stop travis
12 | from giving up because our test logging is too verbose). Then we run our test suite on the pre-built
13 | classes. Finally, if anything has changed in our cache folders, we pack them away into a special folder
14 | for travis to cache.
15 |
16 | testValidator.sh:
17 |
18 | Here we attempt, in a bash script, to ensure we have full coverage of our build targets in our travis
19 | yml, failing the build if that is not the case.
20 |
21 | NB: At some point, on more powerful machines with a proper maven cache, none of these strategies may be
22 | worthwhile. Splitting up some of the test running may still be, but the code around caches and such can
23 | all probably be deleted.
--------------------------------------------------------------------------------
/scalding-dagon/src/main/scala/com/twitter/scalding/dagon/Id.scala:
--------------------------------------------------------------------------------
1 | package com.twitter.scalding.dagon
2 |
3 | import java.io.Serializable
4 | import java.util.concurrent.atomic.AtomicLong
5 |
6 | /**
7 |  * The Expressions are assigned Ids. Each Id is associated with an expression of inner type T.
8 |  *
9 |  * This is done to put an indirection in the Dag that allows us to rewrite nodes by simply replacing the
10 |  * expressions associated with given Ids.
11 |  *
12 |  * T is a phantom type used by the type system.
13 |  */
14 | final class Id[T] private (val serial: Long) extends Serializable {
15 |   require(serial >= 0, s"counter overflow has occurred: $serial")
16 |   override def toString: String = s"Id($serial)"
17 | }
18 |
19 | object Id {
20 |
21 |   @transient private[this] val counter = new AtomicLong(0)
22 |
23 |   def next[T](): Id[T] =
24 |     new Id[T](counter.getAndIncrement())
25 |
26 |   implicit def idOrdering[T]: Ordering[Id[T]] =
27 |     new Ordering[Id[T]] {
28 |       def compare(a: Id[T], b: Id[T]) =
29 |         java.lang.Long.compare(a.serial, b.serial)
30 |     }
31 | }
--------------------------------------------------------------------------------
/tutorial/MatrixTutorial4.scala:
--------------------------------------------------------------------------------
1 | package com.twitter.scalding.examples
2 |
3 | import com.twitter.scalding._
4 | import com.twitter.scalding.mathematics.Matrix
5 |
6 | /*
7 |  * MatrixTutorial4.scala
8 |  *
9 |  * Loads a directed graph adjacency matrix where a[i,j] = 1 if there is an edge from node i to node j,
10 |  * and computes the cosine of the angle between every pair of row vectors.
11 |  *
12 |  * ../scripts/scald.rb --local MatrixTutorial4.scala --input data/graph.tsv --output data/cosineSim.tsv
13 |  */
14 |
15 | class ComputeCosineJob(args: Args) extends Job(args) {
16 |
17 |   import Matrix._
18 |
19 |   val adjacencyMatrix = Tsv(args("input"), ('user1, 'user2, 'rel))
20 |     .read
21 |     .toMatrix[Long, Long, Double]('user1, 'user2, 'rel)
22 |
23 |   // we compute the L2-normalized adjacency graph
24 |   val normMatrix = adjacencyMatrix.rowL2Normalize
25 |
26 |   // the inner product of the normalized matrix with its transpose is the cosine similarity:
27 |   // (A A^T)_{ij} / (||A_i|| * ||A_j||)
28 |   (normMatrix * normMatrix.transpose).write(Tsv(args("output")))
29 | }
--------------------------------------------------------------------------------
/scalding-thrift-macros/src/main/scala/com/twitter/scalding/thrift/macros/Macros.scala:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright 2014 Twitter, Inc.
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | package com.twitter.scalding.thrift.macros
17 |
18 | import com.twitter.scalding.serialization.OrderedSerialization
19 | import com.twitter.scalding.thrift.macros.impl.ScroogeInternalOrderedSerializationImpl
20 |
21 | import scala.language.experimental.{macros => sMacros}
22 |
23 | object Macros {
24 |   implicit def scroogeOrdSer[T]: OrderedSerialization[T] = macro ScroogeInternalOrderedSerializationImpl[T]
25 | }
--------------------------------------------------------------------------------
/scripts/test_matrix_tutorials.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -exv
2 |
3 | # Identify the bin dir in the distribution, and source the common include script
4 | BASE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/.. && pwd )"
5 | source ${BASE_DIR}/scripts/common.sh
6 |
7 | time $SCALD tutorial/MatrixTutorial0.scala \
8 |   --input tutorial/data/graph.tsv \
9 |   --output tutorial/data/outdegree.tsv
10 |
11 | time $SCALD tutorial/MatrixTutorial1.scala \
12 |   --input tutorial/data/graph.tsv \
13 |   --output tutorial/data/cofollows.tsv
14 |
15 | time $SCALD tutorial/MatrixTutorial2.scala \
16 |   --input tutorial/data/graph.tsv \
17 |   --maxOutdegree 1000 \
18 |   --output tutorial/data/graphFiltered.tsv
19 |
20 | time $SCALD tutorial/MatrixTutorial3.scala \
21 |   --input1 tutorial/data/graph.tsv \
22 |   --input2 tutorial/data/graph2.tsv \
23 |   --intersection tutorial/data/intersection.tsv \
24 |   --leftDiff tutorial/data/leftDiff.tsv \
25 |   --rightDiff tutorial/data/rightDiff.tsv
26 |
27 | time $SCALD tutorial/MatrixTutorial5.scala \
28 |   --input tutorial/data/graph.tsv \
29 |   --output tutorial/data/cosineSim.tsv
30 |
31 |
--------------------------------------------------------------------------------
/scalding-core/src/main/scala/com/twitter/scalding/source/CheckedInversion.scala:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright 2012 Twitter, Inc.
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 |
17 | package com.twitter.scalding.source
18 |
19 | import com.twitter.bijection.Injection
20 | import java.io.Serializable
21 |
22 | /**
23 |  * Handles the error checking for Injection inversion. If the check fails, it will throw an unrecoverable
24 |  * exception, stopping the job. TODO: this probably belongs in Bijection.
25 |  */
26 | trait CheckedInversion[T, U] extends Serializable {
27 |   def injection: Injection[T, U]
28 |   def apply(input: U): Option[T]
29 | }
--------------------------------------------------------------------------------
/scripts/test_repl_tutorial.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -exv
2 |
3 | # Identify the bin dir in the distribution, and source the common include script
4 | BASE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/.. && pwd )"
5 | source ${BASE_DIR}/scripts/common.sh
6 |
7 | # Now run a basic test for the REPL
8 | # If the content of the output is different, diff will fail with a non-zero exit code
9 | time $SCALD_REPL < tutorial/ReplTutorial1.scala
10 | diff tutorial/data/hello.txt tutorial/data/output1.txt
11 |
12 | # Run from inside the tutorial directory so we pick up the definition
13 | # of 'scaldingReplInitWasLoaded' from 'tutorial/.scalding_repl'.
14 | # If it was loaded, then this 'script' exits early with success.
15 | # Otherwise it continues and exits with an error.
16 | cd tutorial; echo "
17 | if (scaldingReplInitWasLoaded) System.exit(0)
18 | System.exit(1)
19 | " | $SCALD_REPL
20 |
21 | # Test running the repl from sbt.
22 | cd $BASE_DIR
23 | # We need to clean SBT_OPTS, because on travis the default SBT_OPTS enables sbt batch mode, which ends the repl process immediately, before `System.exit` is reached.
24 | echo 'System.exit(0)' | SBT_OPTS='' ./sbt ++$TRAVIS_SCALA_VERSION 'scalding-repl/run --local'
--------------------------------------------------------------------------------
/COMMITTERS.md:
--------------------------------------------------------------------------------
1 | # Committers
2 |
3 | Please see our [Project Governance](https://github.com/twitter/analytics-infra-governance) page for more details.
4 |
5 | ## Active
6 |
7 | | Name                    | Handle                                                     |
8 | |-------------------------|------------------------------------------------------------|
9 | | Alex Levenson           | [@isnotinvain](https://github.com/isnotinvain)             |
10 | | Ben Pence               | [@benpence](https://github.com/benpence)                   |
11 | | Ian O'Connell           | [@ianoc](https://github.com/ianoc)                         |
12 | | Joe Nievelt             | [@jnievelt](https://github.com/jnievelt)                   |
13 | | Oscar Boykin            | [@johnynek](https://github.com/johnynek)                   |
14 | | Pankaj Gupta            | [@pankajroark](https://github.com/pankajroark)             |
15 | | Piyush Narang           | [@piyushnarang](https://github.com/piyushnarang)           |
16 | | Ruban Monu              | [@rubanm](https://github.com/rubanm)                       |
17 | | Sriram Krishnan         | [@sriramkrishnan](https://github.com/sriramkrishnan)       |
18 |
19 | ## Emeritus
--------------------------------------------------------------------------------
/scalding-parquet-scrooge-fixtures/src/test/resources/binary.thrift:
--------------------------------------------------------------------------------
1 | /**
2 |  * Licensed to the Apache Software Foundation (ASF) under one
3 |  * or more contributor license agreements.  See the NOTICE file
4 |  * distributed with this work for additional information
5 |  * regarding copyright ownership.  The ASF licenses this file
6 |  * to you under the Apache License, Version 2.0 (the
7 |  * "License"); you may not use this file except in compliance
8 |  * with the License.  You may obtain a copy of the License at
9 |  *
10 |  *   http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing,
13 |  * software distributed under the License is distributed on an
14 |  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 |  * KIND, either express or implied.  See the License for the
16 |  * specific language governing permissions and limitations
17 |  * under the License.
18 |  */
19 |
20 | namespace java com.twitter.scalding.parquet.scrooge.thrift_java.test.binary
21 | #@namespace scala com.twitter.scalding.parquet.scrooge.thrift_scala.test.binary
22 |
23 | struct StringAndBinary {
24 |   1: required string s;
25 |   2: required binary b;
26 | }
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .bsp
2 | .cache
3 | .project
4 | .settings
5 | .classpath
6 | *.swp
7 | BUILD
8 | target/
9 | lib_managed/
10 | project/boot/
11 | project/build/target/
12 | project/plugins/target/
13 | project/plugins/lib_managed/
14 | project/plugins/src_managed/
15 | /.idea/
16 | /.idea_modules/
17 | .project
18 | .classpath
19 | .cache-main
20 | .cache-tests
21 | .tmpBin
22 | bin
23 | *.iml
24 | sonatype.sbt
25 | # build.sbt-e: not sure where this comes from, some kind of backup?
26 | build.sbt-e
27 | tutorial/data/execution_output.txt
28 | tutorial/data/cofollows.tsv
29 | tutorial/data/cosineSim.tsv
30 | tutorial/data/graphFiltered.tsv
31 | tutorial/data/intersection.tsv
32 | tutorial/data/jaccardSim.tsv
33 | tutorial/data/leftDiff.tsv
34 | tutorial/data/outdegree.tsv
35 | tutorial/data/output0.txt
36 | tutorial/data/output1.txt
37 | tutorial/data/output2.txt
38 | tutorial/data/output3.txt
39 | tutorial/data/output4.txt
40 | tutorial/data/output5.txt
41 | tutorial/data/rightDiff.tsv
42 | tutorial/data/tmp3.tsv
43 | tutorial/data/jsonoutput0.tsv
44 | tutorial/data/avrooutput0.avro
45 | .scalding_repl
46 | scalding-hadoop-test/NOTICE
47 | NOTICE
48 |
49 | # Auto-copied by sbt-microsites
50 | docs/src/main/tut/contributing.md
51 | .DS_Store
--------------------------------------------------------------------------------
/scalding-core/src/test/scala/com/twitter/scalding/source/TypedTextTest.scala:
--------------------------------------------------------------------------------
1 | package com.twitter.scalding.source
2 |
3 | import org.scalatest.FunSuite
4 |
5 | case class Test1(a: Int, b: Long, c: Option[Double])
6 | case class Test2(one: Test1, d: String)
7 |
8 | class TypedTextTest extends FunSuite {
9 |   test("Test with a flat tuple") {
10 |     val source = TypedText.tsv[Test1]("myPath")
11 |     assert(source.sourceFields.size == 3)
12 |   }
13 |
14 |   test("Test with a nested tuple") {
15 |     val source = TypedText.tsv[Test2]("myPath")
16 |     assert(source.sourceFields.size == 4)
17 |   }
18 |
19 |   test("Test with a raw type") {
20 |     val source = TypedText.tsv[String]("myPath")
21 |     assert(source.sourceFields.size == 1)
22 |   }
23 |
24 |   test("Test with a tuple") {
25 |     val source = TypedText.tsv[(Int, Int)]("myPath")
26 |     assert(source.sourceFields.size == 2)
27 |   }
28 |
29 |   test("Test with an Optional Int") {
30 |     val source = TypedText.tsv[Option[Int]]("myPath")
31 |     assert(source.sourceFields.size == 1)
32 |   }
33 |
34 |   test("Test with an Int") {
35 |     val source = TypedText.tsv[Int]("myPath")
36 |     assert(source.sourceFields.size == 1)
37 |   }
38 | }
--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
1 | resolvers ++= Seq(
2 |   "jgit-repo".at("https://download.eclipse.org/jgit/maven"),
3 |   "sonatype-releases".at("https://oss.sonatype.org/content/repositories/releases"),
4 |   "Twitter Maven".at("https://maven.twttr.com")
5 | )
6 |
7 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.6")
8 | addSbtPlugin("com.eed3si9n" % "sbt-buildinfo" %
"0.11.0") 9 | addSbtPlugin("com.eed3si9n" % "sbt-unidoc" % "0.4.3") 10 | addSbtPlugin("com.47deg" % "sbt-microsites" % "1.3.4") 11 | addSbtPlugin("com.github.sbt" % "sbt-ci-release" % "1.5.10") 12 | addSbtPlugin("com.dwijnand" % "sbt-dynver" % "4.1.1") 13 | addSbtPlugin("com.github.sbt" % "sbt-pgp" % "2.1.2") 14 | addSbtPlugin("com.twitter" %% "scrooge-sbt-plugin" % "18.9.0") 15 | addSbtPlugin("com.typesafe" % "sbt-mima-plugin" % "0.1.14") 16 | addSbtPlugin("com.typesafe.sbt" % "sbt-ghpages" % "0.6.3") 17 | addSbtPlugin("com.typesafe.sbt" % "sbt-git" % "1.0.0") 18 | addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.4.6") 19 | addSbtPlugin("com.github.sbt" % "sbt-jacoco" % "3.4.0") 20 | addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "3.9.7") 21 | addSbtPlugin("org.wartremover" % "sbt-wartremover" % "2.4.16") 22 | addSbtPlugin("org.scalameta" % "sbt-mdoc" % "2.2.22") 23 | -------------------------------------------------------------------------------- /scalding-base/src/main/scala/com/twitter/scalding/CancellationHandler.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding 2 | 3 | import scala.concurrent.{ExecutionContext => ConcurrentExecutionContext, Future} 4 | 5 | sealed trait CancellationHandler { outer => 6 | def stop()(implicit ec: ConcurrentExecutionContext): Future[Unit] 7 | def compose(other: CancellationHandler): CancellationHandler = new CancellationHandler { 8 | override def stop()(implicit ec: ConcurrentExecutionContext): Future[Unit] = 9 | other.stop().zip(outer.stop()).map(_ => ()) 10 | } 11 | } 12 | 13 | object CancellationHandler { 14 | val empty: CancellationHandler = new CancellationHandler { 15 | def stop()(implicit ec: ConcurrentExecutionContext): Future[Unit] = Future.successful(()) 16 | } 17 | 18 | def fromFn(fn: ConcurrentExecutionContext => Future[Unit]): CancellationHandler = new CancellationHandler { 19 | override def stop()(implicit ec: ConcurrentExecutionContext): Future[Unit] = fn(ec) 20 | } 21 | 22 | def fromFuture(f: Future[CancellationHandler]): CancellationHandler = new CancellationHandler { 23 | override def stop()(implicit ec: ConcurrentExecutionContext): Future[Unit] = 24 | f.flatMap(_.stop()) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/DateTypeHandler.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.db.macros.impl.handler 2 | 3 | import scala.reflect.macros.Context 4 | import scala.util.Success 5 | 6 | import com.twitter.scalding.db.macros.impl.FieldName 7 | 8 | object DateTypeHandler { 9 | 10 | def apply[T](c: Context)(implicit 11 | accessorTree: List[c.universe.MethodSymbol], 12 | fieldName: FieldName, 13 | defaultValue: Option[c.Expr[String]], 14 | annotationInfo: List[(c.universe.Type, Option[Int])], 15 | nullable: Boolean 16 | ): scala.util.Try[List[ColumnFormat[c.type]]] = { 17 | 18 | val helper = new { 19 | val ctx: c.type = c 20 | val cfieldName = fieldName 21 | val cannotationInfo = annotationInfo 22 | } with AnnotationHelper 23 | 24 | val extracted = for { 25 | (nextHelper, dateAnno) <- helper.dateAnnotation 26 | _ <- nextHelper.validateFinished 27 | } yield dateAnno 28 | 29 | extracted.flatMap { 30 | case WithDate => Success(List(ColumnFormat(c)(accessorTree, "DATE", None))) 31 | case WithoutDate => Success(List(ColumnFormat(c)(accessorTree, "DATETIME", None))) 32 | } 33 | } 
34 | } 35 | -------------------------------------------------------------------------------- /scalding-date/src/main/scala/com/twitter/scalding/CalendarOps.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding 2 | 3 | import java.util.{Calendar, Date} 4 | import scala.annotation.tailrec 5 | 6 | /** 7 | */ 8 | object CalendarOps { 9 | def truncate(date: Calendar, field: Int): Calendar = { 10 | @tailrec 11 | def truncateIter(cal: Calendar, field: Int, currentField: Int): Calendar = 12 | if (currentField > field) { 13 | currentField match { 14 | case Calendar.DAY_OF_MONTH => cal.set(currentField, 1) 15 | case Calendar.DAY_OF_WEEK_IN_MONTH | Calendar.DAY_OF_WEEK | Calendar.DAY_OF_YEAR | 16 | Calendar.WEEK_OF_MONTH | Calendar.WEEK_OF_YEAR | Calendar.HOUR_OF_DAY => 17 | () // Skip 18 | case _ => cal.set(currentField, 0) 19 | } 20 | 21 | truncateIter(cal, field, currentField - 1) 22 | } else { 23 | cal 24 | } 25 | 26 | val cloned = date.clone().asInstanceOf[Calendar] 27 | 28 | truncateIter(cloned, field, Calendar.MILLISECOND) 29 | } 30 | 31 | def truncate(date: Date, field: Int): Date = { 32 | val cal = Calendar.getInstance() 33 | cal.setTime(date) 34 | 35 | truncate(cal, field).getTime() 36 | } 37 | 38 | } 39 | -------------------------------------------------------------------------------- /scalding-serialization/src/main/scala/com/twitter/scalding/serialization/Laws.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2015 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.twitter.scalding.serialization 17 | 18 | /** 19 | * This is a simple trait for describing laws on single parameter type classes (Serialization, Monoid, 20 | * Ordering, etc...) 21 | */ 22 | sealed trait Law[T] { 23 | def name: String 24 | } 25 | final case class Law1[T](override val name: String, check: T => Boolean) extends Law[T] 26 | final case class Law2[T](override val name: String, check: (T, T) => Boolean) extends Law[T] 27 | final case class Law3[T](override val name: String, check: (T, T, T) => Boolean) extends Law[T] 28 | -------------------------------------------------------------------------------- /scalding-quotation/src/main/scala/com/twitter/scalding/quotation/TreeOps.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.quotation 2 | 3 | import scala.reflect.macros.blackbox.Context 4 | 5 | trait TreeOps { 6 | val c: Context 7 | import c.universe._ 8 | 9 | /** 10 | * Finds the first tree that satisfies the condition. 
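   *
   * For example (a sketch, assuming some `tree: Tree` from the enclosing macro context):
   * {{{
   * // the first function literal in the tree, if any
   * val firstFn = find(tree)(_.isInstanceOf[Function])
   * }}}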
11 |    */
12 |   def find(tree: Tree)(f: Tree => Boolean): Option[Tree] = {
13 |     var res: Option[Tree] = None
14 |     val t = new Traverser {
15 |       override def traverse(t: Tree) =
16 |         if (res.isEmpty)
17 |           if (f(t))
18 |             res = Some(t)
19 |           else
20 |             super.traverse(t)
21 |     }
22 |     t.traverse(tree)
23 |     res
24 |   }
25 |
26 |   /**
27 |    * Similar to tree.collect, but it doesn't collect the children of a collected tree.
28 |    */
29 |   def collect[T](tree: Tree)(f: PartialFunction[Tree, T]): List[T] = {
30 |     val res = List.newBuilder[T]
31 |     val t = new Traverser {
32 |       override def traverse(t: Tree) =
33 |         f.lift(t) match {
34 |           case Some(v) =>
35 |             res += v
36 |           case None =>
37 |             super.traverse(t)
38 |         }
39 |     }
40 |     t.traverse(tree)
41 |     res.result()
42 |   }
43 | }
--------------------------------------------------------------------------------
/scalding-base/src/main/scala/com/twitter/scalding/UniqueID.scala:
--------------------------------------------------------------------------------
1 | package com.twitter.scalding
2 |
3 | /**
4 |  * Used to inject a typed unique identifier to uniquely name each scalding flow. This is here mostly to deal
5 |  * with the case of testing, where there are many concurrent threads running Flows. Users should never have
6 |  * to worry about these.
7 |  */
8 | case class UniqueID(get: String) {
9 |   require(get.indexOf(',') == -1, s"UniqueID cannot contain ,: $get")
10 | }
11 |
12 | object UniqueID {
13 |   val UNIQUE_JOB_ID = "scalding.job.uniqueId"
14 |   private val id = new java.util.concurrent.atomic.AtomicInteger(0)
15 |
16 |   def getRandom: UniqueID = {
17 |     // This number is unique as long as we don't create more than 10^6 per millisecond
18 |     // across separate jobs, which seems very unlikely.
19 |     val unique = (System.currentTimeMillis << 20) ^ (id.getAndIncrement.toLong)
20 |     UniqueID(unique.toString)
21 |   }
22 |
23 |   /**
24 |    * This is only safe if you use something known to have a single instance in the relevant scope.
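   * For example (illustrative; `flowDef` stands for whatever shared instance you key on):
   * {{{
   * val id = UniqueID.fromSystemHashCode(flowDef)
   * }}}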
25 |    *
26 |    * In Cascading, the FlowDef has been used for this.
27 |    */
28 |   def fromSystemHashCode(ar: AnyRef): UniqueID =
29 |     UniqueID(System.identityHashCode(ar).toString)
30 | }
--------------------------------------------------------------------------------
/scalding-base/src/main/scala/com/twitter/scalding/typed/functions/EqTypes.scala:
--------------------------------------------------------------------------------
1 | package com.twitter.scalding.typed.functions
2 |
3 | /**
4 |  * A more powerful version of =:= that allows us to remove casts and to avoid any runtime cost for
5 |  * function calls in some cases of trivial functions.
6 |  */
7 | sealed abstract class EqTypes[A, B] extends java.io.Serializable {
8 |   def apply(a: A): B
9 |   def subst[F[_]](f: F[A]): F[B]
10 |
11 |   final def reverse: EqTypes[B, A] = {
12 |     val aa = EqTypes.reflexive[A]
13 |     type F[T] = EqTypes[T, A]
14 |     subst[F](aa)
15 |   }
16 |
17 |   def toEv: A =:= B = {
18 |     val aa = implicitly[A =:= A]
19 |     type F[T] = A =:= T
20 |     subst[F](aa)
21 |   }
22 | }
23 |
24 | object EqTypes extends java.io.Serializable {
25 |   private[this] final case class ReflexiveEquality[A]() extends EqTypes[A, A] {
26 |     def apply(a: A): A = a
27 |     def subst[F[_]](f: F[A]): F[A] = f
28 |   }
29 |
30 |   implicit def reflexive[A]: EqTypes[A, A] = ReflexiveEquality()
31 |
32 |   def fromEv[A, B](ev: A =:= B): EqTypes[A, B] = // linter:disable:UnusedParameter
33 |     // in scala 2.13, this won't need a cast, but the cast is safe
34 |     reflexive[A].asInstanceOf[EqTypes[A, B]]
35 | }
--------------------------------------------------------------------------------
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
1 | name: Publish
2 |
3 | on:
4 |   push:
5 |     branches:
6 |       - "develop"
7 |     tags:
8 |       - "v*"
9 |
10 | jobs:
11 |   publish:
12 |     runs-on: ubuntu-latest
13 |
14 |     steps:
15 |       - name: Checkout Repo
16 |         uses: actions/checkout@v2
17 |         with:
18 |           fetch-depth: 0 # fetch all tags for sbt-dynver to properly resolve scalding version
19 |
20 |       - uses: actions/setup-java@v2
21 |         with:
22 |           distribution: "adopt-openj9"
23 |           java-version: '8.0.322+6' # builds break on non-3.3 hadoop versions, see https://issues.apache.org/jira/browse/HADOOP-16590
24 |
25 |       - name: Set up Ruby
26 |         uses: ruby/setup-ruby@v1
27 |         with:
28 |           ruby-version: 2.4
29 |
30 |       - name: Install Ruby Gems
31 |         run: |
32 |           gem install sass -v 3.7.4
33 |           gem install jekyll -v 3.2.1
34 |
35 |       - name: "Publish"
36 |         env:
37 |           PGP_PASSPHRASE: ${{ secrets.PGP_PASSPHRASE }}
38 |           PGP_SECRET: ${{ secrets.PGP_SECRET }}
39 |           SONATYPE_PASSWORD: ${{ secrets.SONATYPE_PASSWORD }}
40 |           SONATYPE_USERNAME: ${{ secrets.SONATYPE_USERNAME }}
41 |         run: |
42 |           ./sbt "ci-release"
--------------------------------------------------------------------------------
/scalding-commons/src/test/scala/com/twitter/scalding/commons/source/LzoGenericSourceSpec.scala:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright 2015 Twitter, Inc.
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.twitter.scalding.commons.source 17 | 18 | import com.twitter.bijection.JavaSerializationInjection 19 | import org.scalatest.{Matchers, WordSpec} 20 | import scala.util.Success 21 | 22 | class LzoGenericSourceSpec extends WordSpec with Matchers { 23 | "LzoGenericScheme" should { 24 | "be serializable" in { 25 | val scheme = LzoGenericScheme[Array[Byte]](IdentityBinaryConverter) 26 | val inj = JavaSerializationInjection[LzoGenericScheme[Array[Byte]]] 27 | inj.invert(inj.apply(scheme)) shouldBe Success(scheme) 28 | } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /scalding-commons/src/main/scala/com/twitter/scalding/commons/source/FixedPathSources.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package com.twitter.scalding.commons.source 18 | 19 | import com.google.protobuf.Message 20 | import com.twitter.scalding._ 21 | import org.apache.thrift.TBase 22 | 23 | abstract class FixedPathLzoThrift[T <: TBase[_, _]: Manifest](path: String*) 24 | extends FixedPathSource(path: _*) 25 | with LzoThrift[T] { 26 | def column = manifest[T].runtimeClass 27 | } 28 | 29 | abstract class FixedPathLzoProtobuf[T <: Message: Manifest](path: String) 30 | extends FixedPathSource(path) 31 | with LzoProtobuf[T] { 32 | def column = manifest[T].runtimeClass 33 | } 34 | -------------------------------------------------------------------------------- /scalding-core/src/test/scala/com/twitter/scalding/estimation/memory/MemoryEstimatorStepStrategyTest.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.estimation.memory 2 | 3 | import org.apache.hadoop.mapred.JobConf 4 | import org.scalatest.{Matchers, WordSpec} 5 | 6 | class MemoryEstimatorStepStrategyTest extends WordSpec with Matchers { 7 | "A Memory estimator step strategy" should { 8 | "set xmx settings correctly" in { 9 | val conf = confWith("test.opts", "-Xmx3500m -Djava.net.preferIPv4Stack=true -Xms34m") 10 | 11 | MemoryEstimatorStepStrategy.setXmxMemory("test.opts", 1024, conf) 12 | 13 | conf.get("test.opts") shouldBe "-Djava.net.preferIPv4Stack=true -Xmx1024m" 14 | } 15 | 16 | "set xmx settings correctly with empty original config" in { 17 | val conf = confWith(Map.empty) 18 | 19 | MemoryEstimatorStepStrategy.setXmxMemory("test.opts", 1024, conf) 20 | 21 | conf.get("test.opts") shouldBe " -Xmx1024m" 22 | } 23 | } 24 | 25 | def confWith(key: String, value: String): JobConf = 26 | confWith(Map(key -> value)) 27 | 28 | def confWith(values: Map[String, String]): JobConf = { 29 | val conf = new JobConf(false) 30 | 31 | values.foreach { case (k, v) => 32 | conf.set(k, v) 33 | } 34 | 35 | conf 36 | } 37 | } 38 | 
-------------------------------------------------------------------------------- /scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/ColumnFormat.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.db.macros.impl.handler 2 | 3 | import scala.reflect.macros.Context 4 | 5 | import com.twitter.scalding.db.macros.impl.FieldName 6 | 7 | object ColumnFormat { 8 | def apply(c: Context)(fAccessor: List[c.universe.MethodSymbol], fType: String, size: Option[Int])(implicit 9 | fName: FieldName, 10 | isNullable: Boolean, 11 | defaultV: Option[c.Expr[String]] 12 | ): ColumnFormat[c.type] = 13 | new ColumnFormat[c.type](c) { 14 | val fieldAccessor = fAccessor 15 | val fieldType = fType 16 | val fieldName = fName 17 | val nullable = isNullable 18 | val sizeOpt = size 19 | val defaultValue = defaultV 20 | } 21 | } 22 | 23 | /** 24 | * Contains data format information for a column as defined in the case class. 25 | * 26 | * Used by the ColumnDefinitionProvider macro to generate column definitions and the JDBC ResultSet extractor. 27 | */ 28 | abstract class ColumnFormat[C <: Context](val ctx: C) { 29 | def fieldAccessor: List[ctx.universe.MethodSymbol] 30 | def fieldType: String 31 | def fieldName: FieldName 32 | def nullable: Boolean 33 | def sizeOpt: Option[Int] 34 | def defaultValue: Option[ctx.Expr[String]] 35 | } 36 | -------------------------------------------------------------------------------- /scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LzoCodecSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package com.twitter.scalding.commons.source 18 | 19 | import com.twitter.chill.Externalizer 20 | import com.twitter.bijection.Injection 21 | 22 | /** 23 | * Source used to write some type T into an LZO-compressed SequenceFile using a codec on T for serialization.
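 *
 * A hedged usage sketch; the path is illustrative and this assumes bijection's Injection.long2BigEndian:
 * {{{
 * import com.twitter.bijection.Injection
 * val longs = LzoCodecSource[Long]("hdfs://logs/part-1")(Injection.long2BigEndian)
 * }}}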
24 | */ 25 | 26 | object LzoCodecSource { 27 | def apply[T](paths: String*)(implicit passedInjection: Injection[T, Array[Byte]]) = 28 | new LzoCodec[T] { 29 | val hdfsPaths = paths 30 | val localPaths = paths 31 | val boxed = Externalizer(passedInjection) 32 | override def injection = boxed.get 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /scalding-core/src/test/scala/com/twitter/scalding/IterableExecutionSerializationTest.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding 2 | 3 | import com.twitter.bijection.JavaSerializationInjection 4 | import com.twitter.chill.KryoPool 5 | import com.twitter.chill.config.ScalaAnyRefMapConfig 6 | import com.twitter.scalding.serialization.{Externalizer, KryoHadoop} 7 | import com.twitter.scalding.source.TypedText 8 | import org.scalatest.FunSuite 9 | 10 | class ToIterableSerializationTest extends FunSuite { 11 | 12 | class Foo { 13 | val field = 42 14 | } 15 | 16 | val myFoo = new Foo 17 | val testIterableExecution = 18 | Execution.toIterable(TypedPipe.from(TypedText.tsv[Int]("foo")).map(_ * myFoo.field)) 19 | 20 | test("toIterableExecution should roundtrip") { 21 | 22 | val jInjection = JavaSerializationInjection[Externalizer[Execution[Iterable[Int]]]] 23 | val externalizer = Externalizer(testIterableExecution) 24 | 25 | assert(jInjection.invert(jInjection(externalizer)).isSuccess) 26 | } 27 | test("testing kryo") { 28 | val kryo = new KryoHadoop(ScalaAnyRefMapConfig(Map("scalding.kryo.setreferences" -> "true"))) 29 | val kryoPool = KryoPool.withByteArrayOutputStream(1, kryo) 30 | assert(scala.util.Try(kryoPool.deepCopy(testIterableExecution)).isSuccess) 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /scalding-dagon/src/test/scala/com/twitter/scalding/dagon/MemoizeTests.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.dagon 2 | 3 | import org.scalatest.FunSuite 4 | 5 | class MemoizeTests extends FunSuite { 6 | test("fibonacci is linear in time") { 7 | 8 | var calls = 0 9 | 10 | val fib = 11 | Memoize.function[Int, Long] { (i, f) => 12 | calls += 1 13 | 14 | i match { 15 | case 0 => 0 16 | case 1 => 1 17 | case i => f(i - 1) + f(i - 2) 18 | } 19 | } 20 | 21 | def fib2(n: Int, x: Long, y: Long): Long = 22 | if (n == 0) x 23 | else fib2(n - 1, y, x + y) 24 | 25 | assert(fib(100) == fib2(100, 0L, 1L)) 26 | assert(calls == 101) 27 | } 28 | 29 | test("functionK repeated calls only evaluate once") { 30 | 31 | var calls = 0 32 | val fn = 33 | Memoize.functionK[BoolT, BoolT](new Memoize.RecursiveK[BoolT, BoolT] { 34 | def toFunction[T] = { case (b, rec) => 35 | calls += 1 36 | 37 | !b 38 | } 39 | }) 40 | 41 | assert(fn(true) == false) 42 | assert(calls == 1) 43 | assert(fn(true) == false) 44 | assert(calls == 1) 45 | 46 | assert(fn(false) == true) 47 | assert(calls == 2) 48 | assert(fn(false) == true) 49 | assert(calls == 2) 50 | 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /scalding-db/src/main/scala/com/twitter/scalding/db/package.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2015 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.twitter.scalding.db 17 | 18 | import scala.language.experimental.{macros => sMacros} 19 | 20 | import com.twitter.scalding.db.macros.impl.{ColumnDefinitionProviderImpl, DBTypeDescriptorImpl} 21 | 22 | // The implicits in the jdbc macros package. 23 | // These allow us to automatically provide our type classes without requiring the user to know 24 | // all of the various ways we could build them. 25 | package object macros { 26 | implicit def toColumnDefinitionProvider[T]: ColumnDefinitionProvider[T] = 27 | macro ColumnDefinitionProviderImpl[T] 28 | implicit def toDBTypeDescriptor[T]: DBTypeDescriptor[T] = macro DBTypeDescriptorImpl[T] 29 | } 30 | -------------------------------------------------------------------------------- /scalding-serialization/src/main/java/com/twitter/scalding/serialization/Undeprecated.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2015 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.twitter.scalding.serialization; 17 | 18 | public class Undeprecated { 19 | /** 20 | * This method is faster for ASCII data, but unsafe otherwise. 21 | * It is used by our macros AFTER checking that the string is ASCII, 22 | * following a pattern seen in Kryo, which benchmarking showed helped. 23 | * Scala cannot suppress warnings like this, so we do it here. 24 | */ 25 | @SuppressWarnings("deprecation") 26 | public static void getAsciiBytes(String element, int charStart, int charLen, byte[] bytes, int byteOffset) { 27 | element.getBytes(charStart, charLen, bytes, byteOffset); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/OptionalSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License.
15 | */ 16 | package com.twitter.scalding 17 | 18 | import scala.util.{Failure, Success, Try} 19 | import cascading.tap.Tap 20 | 21 | case class OptionalSource[T](src: Mappable[T]) extends Source with Mappable[T] { 22 | override def converter[U >: T] = TupleConverter.asSuperConverter(src.converter) 23 | 24 | def createTap(readOrWrite: AccessMode)(implicit mode: Mode): Tap[_, _, _] = 25 | Try(src.validateTaps(mode)) match { 26 | case Success(_) => 27 | src.createTap(readOrWrite) 28 | case Failure(_) => 29 | IterableSource[T](Nil)(TupleSetter.singleSetter[T], src.converter) 30 | .createTap(readOrWrite) 31 | .asInstanceOf[Tap[_, _, _]] 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/TypedPipeChecker.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding 2 | 3 | /** 4 | * This object is used to assist with testing a TypedPipe 5 | */ 6 | object TypedPipeChecker { 7 | 8 | /* 9 | * Takes a List and a transform function. 10 | * The resulting TypedPipe from the transform will be run through the assertions 11 | */ 12 | def checkOutputTransform[T, U, R](input: List[T])(transform: TypedPipe[T] => TypedPipe[U])( 13 | assertions: List[U] => R 14 | ): R = 15 | assertions(inMemoryToList(transform(TypedPipe.from(input)))) 16 | 17 | /* 18 | * Execute a TypedPipe in memory, convert the resulting Iterator to 19 | * a list and run it through a function that makes arbitrary 20 | * assertions on it. 21 | */ 22 | def checkOutput[T, R](output: TypedPipe[T])(assertions: List[T] => R): R = 23 | assertions(inMemoryToList(output)) 24 | 25 | /** 26 | * Execute a TypedPipe in memory and return the result as a List 27 | */ 28 | def inMemoryToList[T](output: TypedPipe[T]): List[T] = 29 | output.toIterableExecution 30 | .waitFor(Config.unitTestDefault, Local(strictSources = true)) 31 | .get 32 | .toList 33 | 34 | implicit class InMemoryToListEnrichment[T](val pipe: TypedPipe[T]) extends AnyVal { 35 | def inMemoryToList: List[T] = TypedPipeChecker.inMemoryToList(pipe) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /scalding-core/codegen/typed_source_generator.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | # Run it like this: 4 | # 5 | # ./codegen/typed_source_generator.rb > src/main/scala/com/twitter/scalding/typed/GeneratedTypedSource.scala 6 | 7 | $indent = " " 8 | 9 | TYPES = ('A'..'Z').to_a 10 | 11 | def make_typed_source(cnt) 12 | other_cnts = (1..(22-cnt)).to_a 13 | typeString = TYPES[0..(cnt - 1)].join(",") 14 | puts "trait TypedSource#{cnt}[#{typeString}] extends TypedSource[Tuple#{cnt}[#{typeString}]] {" 15 | puts "#{$indent}def converter[Z >: Tuple#{cnt}[#{typeString}]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple#{cnt}[#{typeString}]])" 16 | puts "}" 17 | end 18 | 19 | def make_typed_sink(cnt) 20 | other_cnts = (1..(22-cnt)).to_a 21 | typeString = TYPES[0..(cnt - 1)].join(",") 22 | puts "trait TypedSink#{cnt}[#{typeString}] extends TypedSink[Tuple#{cnt}[#{typeString}]] {" 23 | puts "#{$indent}final def setter[Z <: Tuple#{cnt}[#{typeString}]] = TupleSetter.asSubSetter(TupleSetter.of[Tuple#{cnt}[#{typeString}]])" 24 | puts "}" 25 | end 26 | 27 | puts "// following were autogenerated by #{__FILE__} at #{Time.now} do not edit" 28 | puts %q|package com.twitter.scalding 29 | 30 | | 31 | 32 | (1..22).each { |c|
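  # A hedged illustration of the emitted output (not printed by this script):
  # for c = 2, the call below prints
  #
  #   trait TypedSource2[A,B] extends TypedSource[Tuple2[A,B]] {
  #     def converter[Z >: Tuple2[A,B]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple2[A,B]])
  #   }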
make_typed_source(c) 34 | puts 35 | } 36 | 37 | (1..22).each { |c| 38 | make_typed_sink(c) 39 | puts 40 | } 41 | 42 | puts "// end of autogenerated" 43 | -------------------------------------------------------------------------------- /scalding-core/src/test/scala/com/twitter/scalding/TypedSketchJoinJobForEmptyKeysTest.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding 2 | 3 | import org.scalatest.{Matchers, WordSpec} 4 | 5 | class TypedSketchJoinJobForEmptyKeys(args: Args) extends Job(args) { 6 | // Deal with when a key appears in left but not right 7 | val leftTypedPipe = TypedPipe.from(List((1, 1111))) 8 | val rightTypedPipe = TypedPipe.from(List((3, 3333), (4, 4444))) 9 | 10 | implicit def serialize(k: Int): Array[Byte] = k.toString.getBytes 11 | 12 | val sketched = leftTypedPipe 13 | .sketch(1) 14 | .leftJoin(rightTypedPipe) 15 | 16 | // this tests that a TypedPipe.Keyed method works: 17 | sketched.values 18 | 19 | sketched 20 | .map { case (a, (b, c)) => 21 | (a, b, c.getOrElse(-1)) 22 | } 23 | .write(TypedTsv("output")) 24 | } 25 | 26 | class TypedSketchJoinJobForEmptyKeysTest extends WordSpec with Matchers { 27 | "A TypedSketchJoinJobForEmptyKeysTest" should { 28 | "Sketch leftJoin with a single left key should be correct" in { 29 | JobTest(new TypedSketchJoinJobForEmptyKeys(_)) 30 | .sink[(Int, Int, Int)](TypedTsv[(Int, Int, Int)]("output")) { outBuf => 31 | outBuf should have size 1 32 | val unordered = outBuf.toSet 33 | unordered should contain(1, 1111, -1) 34 | } 35 | .run 36 | .finish() 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/NumericTypeHandler.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.db.macros.impl.handler 2 | 3 | import scala.reflect.macros.Context 4 | import scala.util.{Failure, Success} 5 | 6 | import com.twitter.scalding.db.macros.impl.FieldName 7 | 8 | object NumericTypeHandler { 9 | def apply[T](c: Context)(implicit 10 | accessorTree: List[c.universe.MethodSymbol], 11 | fieldName: FieldName, 12 | defaultValue: Option[c.Expr[String]], 13 | annotationInfo: List[(c.universe.Type, Option[Int])], 14 | nullable: Boolean, 15 | numericType: String 16 | ): scala.util.Try[List[ColumnFormat[c.type]]] = { 17 | 18 | val helper = new { 19 | val ctx: c.type = c 20 | val cfieldName = fieldName 21 | val cannotationInfo = annotationInfo 22 | } with AnnotationHelper 23 | 24 | val extracted = for { 25 | (nextHelper, sizeAnno) <- helper.sizeAnnotation 26 | _ <- nextHelper.validateFinished 27 | } yield sizeAnno 28 | 29 | extracted.flatMap { 30 | case WithSize(s) if s > 0 => Success(List(ColumnFormat(c)(accessorTree, numericType, Some(s)))) 31 | case WithSize(s) => Failure(new Exception(s"Int field $fieldName has a size defined that is <= 0.")) 32 | case WithoutSize => Success(List(ColumnFormat(c)(accessorTree, numericType, None))) 33 | } 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/BijectedOrderedSerialization.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2014 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.twitter.scalding 17 | 18 | import com.twitter.scalding.serialization.OrderedSerialization 19 | import com.twitter.bijection.{ImplicitBijection, Injection} 20 | 21 | object BijectedOrderedSerialization { 22 | implicit def fromBijection[T, U](implicit 23 | bij: ImplicitBijection[T, U], 24 | ordSer: OrderedSerialization[U] 25 | ): OrderedSerialization[T] = 26 | OrderedSerialization.viaTransform[T, U](bij.apply(_), bij.invert(_)) 27 | 28 | implicit def fromInjection[T, U](implicit 29 | bij: Injection[T, U], 30 | ordSer: OrderedSerialization[U] 31 | ): OrderedSerialization[T] = 32 | OrderedSerialization.viaTryTransform[T, U](bij.apply(_), bij.invert(_)) 33 | } 34 | -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/estimation/Common.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.estimation 2 | 3 | import cascading.flow.FlowStep 4 | import cascading.tap.hadoop.Hfs 5 | import cascading.tap.{CompositeTap, Tap} 6 | import com.twitter.scalding.tap.GlobHfs 7 | import org.apache.hadoop.mapred.JobConf 8 | import org.slf4j.LoggerFactory 9 | import scala.collection.JavaConverters._ 10 | 11 | object Common { 12 | private[this] val LOG = LoggerFactory.getLogger(this.getClass) 13 | 14 | private def unrollTaps(taps: Seq[Tap[_, _, _]]): Seq[Tap[_, _, _]] = 15 | taps.flatMap { 16 | case multi: CompositeTap[_] => 17 | unrollTaps(multi.getChildTaps.asScala.toSeq) 18 | case t => Seq(t) 19 | } 20 | 21 | def unrollTaps(step: FlowStep[JobConf]): Seq[Tap[_, _, _]] = 22 | unrollTaps(step.getSources.asScala.toSeq) 23 | 24 | def inputSizes(step: FlowStep[JobConf]): Seq[(String, Long)] = { 25 | val conf = step.getConfig 26 | unrollTaps(step).flatMap { 27 | case tap: GlobHfs => Some(tap.toString -> tap.getSize(conf)) 28 | case tap: Hfs => Some(tap.toString -> GlobHfs.getSize(tap.getPath, conf)) 29 | case tap => 30 | LOG.warn("InputSizeReducerEstimator unable to calculate size: " + tap) 31 | None 32 | } 33 | } 34 | 35 | def totalInputSize(step: FlowStep[JobConf]): Long = inputSizes(step).map(_._2).sum 36 | } 37 | -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/serialization/MultiJoinExternalizer.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.serialization 2 | 3 | import com.twitter.scalding.typed.MultiJoinFunction 4 | 5 | object MultiJoinExternalizer { 6 | import MultiJoinFunction.Transformer 7 | 8 | final case class ExternalizeMapGroup[A, B, C](@transient fn: (A, Iterator[B]) => Iterator[C]) 9 | extends Function2[A, Iterator[B], Iterator[C]] { 10 | private[this] val fnEx = Externalizer(fn) 11 | 12 | def apply(a: A, it: Iterator[B]) = fnEx.get(a, it) 13 | } 14 | 15 | final case class ExternalizeJoin[A, B, C, D](@transient fn: (A, Iterator[B], Iterable[C]) => Iterator[D]) 16 | extends Function3[A, Iterator[B], Iterable[C], Iterator[D]] { 17 | private[this] val fnEx = 
Externalizer(fn) 18 | 19 | def apply(a: A, bs: Iterator[B], cs: Iterable[C]) = fnEx.get(a, bs, cs) 20 | } 21 | 22 | private[this] object ExtTrans extends Transformer { 23 | def transformJoin[A, B, C, D]( 24 | fn: (A, Iterator[B], Iterable[C]) => Iterator[D] 25 | ): (A, Iterator[B], Iterable[C]) => Iterator[D] = 26 | ExternalizeJoin(fn) 27 | def transformMap[A, B, C](fn: (A, Iterator[B]) => Iterator[C]): (A, Iterator[B]) => Iterator[C] = 28 | ExternalizeMapGroup(fn) 29 | } 30 | 31 | def externalize[A, B](mjf: MultiJoinFunction[A, B]): MultiJoinFunction[A, B] = 32 | ExtTrans(mjf) 33 | } 34 | -------------------------------------------------------------------------------- /scalding-commons/src/test/scala/com/twitter/scalding/WordCountTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.twitter.scalding 17 | 18 | import org.scalatest.{Matchers, WordSpec} 19 | 20 | class WordCountTest extends WordSpec with Matchers { 21 | "A WordCount job" should { 22 | JobTest(new com.twitter.scalding.examples.WordCountJob(_)) 23 | .arg("input", "inputFile") 24 | .arg("output", "outputFile") 25 | .source(TextLine("inputFile"), List((0, "hack hack hack and hack"))) 26 | .sink[(String, Int)](TypedTsv[(String, Long)]("outputFile")) { outputBuffer => 27 | val outMap = outputBuffer.toMap 28 | "count words correctly" in { 29 | outMap("hack") shouldBe 4 30 | outMap("and") shouldBe 1 31 | } 32 | } 33 | .run 34 | .finish() 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetScrooge.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.parquet.scrooge 2 | 3 | import cascading.scheme.Scheme 4 | import com.twitter.scalding._ 5 | import com.twitter.scalding.parquet.thrift.ParquetThriftBaseFileSource 6 | import com.twitter.scalding.source.{DailySuffixSource, HourlySuffixSource} 7 | import com.twitter.scrooge.ThriftStruct 8 | 9 | import scala.reflect.ClassTag 10 | 11 | trait ParquetScrooge[T <: ThriftStruct] extends ParquetThriftBaseFileSource[T] { 12 | 13 | override def hdfsScheme = { 14 | // See docs in Parquet346ScroogeScheme 15 | val scheme = new Parquet346ScroogeScheme[T](this.config) 16 | HadoopSchemeInstance(scheme.asInstanceOf[Scheme[_, _, _, _, _]]) 17 | } 18 | 19 | } 20 | 21 | class DailySuffixParquetScrooge[T <: ThriftStruct](path: String, dateRange: DateRange)(implicit 22 | override val ct: ClassTag[T] 23 | ) extends DailySuffixSource(path, dateRange) 24 | with ParquetScrooge[T] 25 | 26 | class HourlySuffixParquetScrooge[T <: ThriftStruct](path: String, dateRange: DateRange)(implicit 27 | override val ct: ClassTag[T] 28 | ) extends HourlySuffixSource(path, dateRange) 29 | with ParquetScrooge[T] 30 | 31 | class FixedPathParquetScrooge[T <: 
ThriftStruct](paths: String*)(implicit override val ct: ClassTag[T]) 32 | extends FixedPathSource(paths: _*) 33 | with ParquetScrooge[T] 34 | -------------------------------------------------------------------------------- /scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ParquetScroogeInputFormat.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | package com.twitter.scalding.parquet.scrooge; 20 | 21 | import org.apache.parquet.hadoop.thrift.ParquetThriftInputFormat; 22 | 23 | /** 24 | * Use this class to read Scrooge records from a Parquet file 25 | * @param <T> Type of Scrooge records to read 26 | */ 27 | public class ParquetScroogeInputFormat<T> extends ParquetThriftInputFormat<T> { 28 | public ParquetScroogeInputFormat() { 29 | super(ScroogeReadSupport.class); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/macros/MacroImplicits.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.twitter.scalding.macros 17 | 18 | import scala.language.experimental.macros 19 | 20 | import com.twitter.scalding._ 21 | import com.twitter.scalding.macros.impl._ 22 | 23 | object MacroImplicits { 24 | 25 | /** 26 | * These methods provide proof that the given type is a case class.
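 *
 * A hedged sketch of the effect; Person is a hypothetical case class, not part of this file:
 * {{{
 * import com.twitter.scalding.macros.MacroImplicits._
 * case class Person(name: String, age: Int)
 * val setter = implicitly[TupleSetter[Person]]       // materialized by the macro below
 * val converter = implicitly[TupleConverter[Person]] // likewise
 * }}}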
27 | */ 28 | implicit def materializeCaseClassTupleSetter[T]: TupleSetter[T] = 29 | macro TupleSetterImpl.caseClassTupleSetterImpl[T] 30 | implicit def materializeCaseClassTupleConverter[T]: TupleConverter[T] = 31 | macro TupleConverterImpl.caseClassTupleConverterImpl[T] 32 | implicit def materializeCaseClassTypeDescriptor[T]: TypeDescriptor[T] = 33 | macro TypeDescriptorProviderImpl.caseClassTypeDescriptorImpl[T] 34 | } 35 | -------------------------------------------------------------------------------- /scalding-core/src/test/scala/com/twitter/scalding/TestTapFactoryTest.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding 2 | 3 | import cascading.tap.Tap 4 | import cascading.tuple.{Fields, Tuple} 5 | import scala.collection.mutable.Buffer 6 | import org.scalatest.{Matchers, WordSpec} 7 | 8 | class TestTapFactoryTest extends WordSpec with Matchers { 9 | "A test tap created by TestTapFactory" should { 10 | "error helpfully when a source is not in the map for test buffers" in { 11 | // Source to use for this test. 12 | val testSource = Tsv("path") 13 | 14 | // Map of sources to use when creating the tap; it does not contain testSource 15 | val emptySourceMap = Map[Source, Buffer[Tuple]]() 16 | 17 | val testMode = Test(emptySourceMap.get(_)) 18 | val testTapFactory = TestTapFactory(testSource, new Fields()) 19 | 20 | def createIllegalTap(accessMode: AccessMode): Tap[Any, Any, Any] = 21 | testTapFactory.createTap(accessMode)(testMode).asInstanceOf[Tap[Any, Any, Any]] 22 | 23 | (the[IllegalArgumentException] thrownBy { 24 | createIllegalTap(Read) 25 | } should have).message("requirement failed: " + TestTapFactory.sourceNotFoundError.format(testSource)) 26 | 27 | (the[IllegalArgumentException] thrownBy { 28 | createIllegalTap(Write) 29 | } should have).message("requirement failed: " + TestTapFactory.sinkNotFoundError.format(testSource)) 30 | } 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/serialization/Externalizer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License.
15 | */ 16 | package com.twitter.scalding.serialization 17 | 18 | import com.twitter.chill.{Externalizer => ChillExtern} 19 | 20 | import com.esotericsoftware.kryo.DefaultSerializer 21 | import com.esotericsoftware.kryo.serializers.JavaSerializer 22 | 23 | import com.twitter.chill.config.ScalaAnyRefMapConfig 24 | 25 | /** 26 | * We need to control the Kryo instance that gets created 27 | */ 28 | object Externalizer { 29 | def apply[T](t: T): Externalizer[T] = { 30 | val e = new Externalizer[T] 31 | e.set(t) 32 | e 33 | } 34 | } 35 | 36 | @DefaultSerializer(classOf[JavaSerializer]) 37 | class Externalizer[T] extends ChillExtern[T] { 38 | protected override def kryo = 39 | new KryoHadoop(ScalaAnyRefMapConfig(Map("scalding.kryo.setreferences" -> "true"))) 40 | } 41 | -------------------------------------------------------------------------------- /scalding-parquet/src/main/java/com/twitter/scalding/parquet/tuple/SchemaIntersection.java: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.parquet.tuple; 2 | 3 | import org.apache.parquet.schema.MessageType; 4 | import org.apache.parquet.schema.Type; 5 | 6 | import cascading.tuple.Fields; 7 | 8 | import java.util.List; 9 | import java.util.ArrayList; 10 | 11 | public class SchemaIntersection { 12 | 13 | private final MessageType requestedSchema; 14 | private final Fields sourceFields; 15 | 16 | public SchemaIntersection(MessageType fileSchema, Fields requestedFields) { 17 | if(requestedFields == Fields.UNKNOWN) 18 | requestedFields = Fields.ALL; 19 | 20 | Fields newFields = Fields.NONE; 21 | List<Type> newSchemaFields = new ArrayList<Type>(); 22 | int schemaSize = fileSchema.getFieldCount(); 23 | 24 | for (int i = 0; i < schemaSize; i++) { 25 | Type type = fileSchema.getType(i); 26 | Fields name = new Fields(type.getName()); 27 | 28 | if(requestedFields.contains(name)) { 29 | newFields = newFields.append(name); 30 | newSchemaFields.add(type); 31 | } 32 | } 33 | 34 | this.sourceFields = newFields; 35 | this.requestedSchema = new MessageType(fileSchema.getName(), newSchemaFields); 36 | } 37 | 38 | public MessageType getRequestedSchema() { 39 | return requestedSchema; 40 | } 41 | 42 | public Fields getSourceFields() { 43 | return sourceFields; 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /scalding-serialization/src/main/scala/com/twitter/scalding/serialization/UnsignedComparisons.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2015 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License.
15 | */ 16 | 17 | package com.twitter.scalding.serialization 18 | 19 | object UnsignedComparisons { 20 | final def unsignedLongCompare(a: Long, b: Long): Int = if (a == b) 0 21 | else { 22 | val xor = a ^ b 23 | // If xor >= 0, then a and b are on the same side of zero 24 | if (xor >= 0L) java.lang.Long.compare(a, b) 25 | else if (b >= 0L) 1 26 | else -1 27 | } 28 | final def unsignedIntCompare(a: Int, b: Int): Int = 29 | java.lang.Long.compare(a.toLong & 0xffffffffL, b.toLong & 0xffffffffL) 30 | 31 | final def unsignedShortCompare(a: Short, b: Short): Int = 32 | Integer.compare(a & 0xffff, b & 0xffff) 33 | 34 | final def unsignedByteCompare(a: Byte, b: Byte): Int = 35 | Integer.compare(a & 0xff, b & 0xff) 36 | } 37 | -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/source/MaxFailuresCheck.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package com.twitter.scalding.source 18 | 19 | import com.twitter.bijection.Injection 20 | import java.util.concurrent.atomic.AtomicInteger 21 | 22 | // TODO: this should actually increment and read a Hadoop counter 23 | class MaxFailuresCheck[T, U](val maxFailures: Int)(implicit override val injection: Injection[T, U]) 24 | extends CheckedInversion[T, U] { 25 | 26 | private val failures = new AtomicInteger(0) 27 | def apply(input: U): Option[T] = 28 | try { 29 | Some(injection.invert(input).get) 30 | } catch { 31 | case e: Exception => 32 | // TODO: use proper logging 33 | e.printStackTrace() 34 | assert(failures.incrementAndGet <= maxFailures, "maximum decoding errors exceeded") 35 | None 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /scalding-base/src/main/scala/com/twitter/scalding/typed/WithDescription.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2015 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.twitter.scalding.typed 17 | 18 | /** 19 | * Used for objects that may have a description set to be used in .dot and MR step names. 20 | */ 21 | trait HasDescription { 22 | def descriptions: Seq[String] 23 | } 24 | 25 | /** 26 | * Used for objects that may _set_ a description to be used in .dot and MR step names.
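 *
 * A sketch of the contract, per the default implementation below: withDescription(Some("step"))
 * delegates to withDescription("step"), while withDescription(None) returns this object unchanged.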
27 | */ 28 | trait WithDescription[+This <: WithDescription[This]] extends HasDescription { self: This => 29 | 30 | /** never mutates this, instead returns a new item. */ 31 | def withDescription(description: String): This 32 | 33 | def withDescription(descriptionOpt: Option[String]): This = 34 | descriptionOpt match { 35 | case Some(description) => withDescription(description) 36 | case None => self 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /scalding-hadoop-test/src/test/scala/com/twitter/scalding/platform/TestJobsWithDescriptions.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.platform 2 | 3 | import com.twitter.scalding._ 4 | 5 | /* 6 | * These jobs are used in PlatformTests that test correct line numbers in descriptions. 7 | * Placing them in a separate file means we don't have to update the tests that care about 8 | * line numbers when PlatformTest.scala changes for unrelated reasons. 9 | */ 10 | 11 | class TypedPipeJoinWithDescriptionJob(args: Args) extends Job(args) { 12 | PlatformTest.setAutoForceRight(mode, true) 13 | 14 | val x = TypedPipe.from[(Int, Int)](List((1, 1))) 15 | val y = TypedPipe.from[(Int, String)](List((1, "first"))) 16 | val z = TypedPipe.from[(Int, Boolean)](List((2, true))).group 17 | 18 | x.hashJoin(y) // this triggers an implicit that somehow pushes the line number to the next one 19 | .withDescription("hashJoin") 20 | .leftJoin(z) 21 | .withDescription("leftJoin") 22 | .values 23 | .write(TypedTsv[((Int, String), Option[Boolean])]("output")) 24 | } 25 | 26 | class TypedPipeWithDescriptionJob(args: Args) extends Job(args) { 27 | TypedPipe 28 | .from[String](List("word1", "word1", "word2")) 29 | .withDescription("map stage - assign words to 1") 30 | .map(w => (w, 1L)) 31 | .group 32 | .withDescription("reduce stage - sum") 33 | .sum 34 | .withDescription("write") 35 | .write(TypedTsv[(String, Long)]("output")) 36 | } 37 | -------------------------------------------------------------------------------- /scalding-db/src/main/scala/com/twitter/scalding/db/extensions/VerticaExtensions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2015 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | package com.twitter.scalding.db.extensions 18 | 19 | import com.twitter.scalding.db._ 20 | 21 | object VerticaExtensions { 22 | def verticaMutator: PartialFunction[DBColumnDefinition, DBColumnDefinition] = { 23 | case t @ DBColumnDefinition(BIGINT, _, _, None, _, _) => t.copy(sizeOpt = None) 24 | case t @ DBColumnDefinition(INT, _, _, None, _, _) => t.copy(sizeOpt = None) 25 | case t @ DBColumnDefinition(SMALLINT, _, _, None, _, _) => t.copy(sizeOpt = None) 26 | case t @ DBColumnDefinition(BOOLEAN, _, _, None, _, _) => t.copy(sizeOpt = None) 27 | case t @ DBColumnDefinition(TINYINT, _, _, None, _, _) => t.copy(sizeOpt = None) 28 | case t @ DBColumnDefinition(DOUBLE, _, _, _, _, _) => t.copy(sqlType = SqlTypeName("DOUBLE PRECISION")) 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /scalding-quotation/src/main/scala/com/twitter/scalding/quotation/Quoted.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.quotation 2 | 3 | import java.io.File 4 | 5 | /** 6 | * Meta information about a method call. 7 | */ 8 | case class Quoted(position: Source, text: Option[String], projections: Projections) { 9 | override def toString = s"$position ${text.getOrElse("")}" 10 | } 11 | 12 | object Quoted { 13 | import language.experimental.macros 14 | implicit def method: Quoted = macro QuotedMacro.method 15 | 16 | private[scalding] def internal: Quoted = macro QuotedMacro.internal 17 | 18 | def function[T1, U](f: T1 => U): Function1[T1, U] with QuotedFunction = macro QuotedMacro.function 19 | def function[T1, T2, U](f: (T1, T2) => U): Function2[T1, T2, U] with QuotedFunction = 20 | macro QuotedMacro.function 21 | def function[T1, T2, T3, U](f: (T1, T2, T3) => U): Function3[T1, T2, T3, U] with QuotedFunction = 22 | macro QuotedMacro.function 23 | def function[T1, T2, T3, T4, U](f: (T1, T2, T3, T4) => U): Function4[T1, T2, T3, T4, U] 24 | with QuotedFunction = macro QuotedMacro.function 25 | def function[T1, T2, T3, T4, T5, U](f: (T1, T2, T3, T4, T5) => U): Function5[T1, T2, T3, T4, T5, U] 26 | with QuotedFunction = macro QuotedMacro.function 27 | } 28 | 29 | case class Source(path: String, line: Int) { 30 | def classFile = path.split(File.separator).last 31 | override def toString = s"$classFile:$line" 32 | } 33 | 34 | trait QuotedFunction { 35 | def quoted: Quoted 36 | } 37 | -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/estimation/HistoryService.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.estimation 2 | 3 | import scala.util.Try 4 | 5 | /** 6 | * Info about a prior FlowStep, provided by implementers of HistoryService 7 | */ 8 | final case class FlowStepHistory( 9 | keys: FlowStepKeys, 10 | submitTimeMillis: Long, 11 | launchTimeMillis: Long, 12 | finishTimeMillis: Long, 13 | totalMaps: Long, 14 | totalReduces: Long, 15 | finishedMaps: Long, 16 | finishedReduces: Long, 17 | failedMaps: Long, 18 | failedReduces: Long, 19 | mapFileBytesRead: Long, 20 | mapFileBytesWritten: Long, 21 | mapOutputBytes: Long, 22 | reduceFileBytesRead: Long, 23 | hdfsBytesRead: Long, 24 | hdfsBytesWritten: Long, 25 | mapperTimeMillis: Long, 26 | reducerTimeMillis: Long, 27 | reduceShuffleBytes: Long, 28 | cost: Double, 29 | tasks: Seq[Task] 30 | ) 31 | 32 | final case class FlowStepKeys( 33 | jobName: String, 34 | user: String, 35 | 
priority: String, 36 | status: String, 37 | version: String, 38 | queue: String 39 | ) 40 | 41 | final case class Task(details: Map[String, Any], counters: Map[String, Long]) { 42 | def taskType: Option[String] = details.get(Task.TaskType).map(_.asInstanceOf[String]) 43 | } 44 | 45 | object Task { 46 | val TaskType = "taskType" 47 | } 48 | 49 | trait HistoryService { 50 | def fetchHistory(info: FlowStrategyInfo, maxHistory: Int): Try[Seq[FlowStepHistory]] 51 | } 52 | -------------------------------------------------------------------------------- /scalding-dagon/src/test/scala/com/twitter/scalding/dagon/CacheTests.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.dagon 2 | 3 | import org.scalacheck.Prop._ 4 | import org.scalacheck.{Arbitrary, Cogen, Properties} 5 | 6 | abstract class CacheTests[K: Cogen: Arbitrary, V: Arbitrary](name: String) extends Properties(name) { 7 | 8 | def buildMap(c: Cache[K, V], ks: Iterable[K], f: K => V): Map[K, V] = 9 | ks.iterator.foldLeft(Map.empty[K, V]) { (m, k) => 10 | m.updated(k, c.getOrElseUpdate(k, f(k))) 11 | } 12 | 13 | property("getOrElseUpdate") = forAll { (f: K => V, k: K, v1: V, v2: V) => 14 | val c = Cache.empty[K, V] 15 | var count = 0 16 | val x = c.getOrElseUpdate(k, { count += 1; v1 }) 17 | val y = c.getOrElseUpdate(k, { count += 1; v2 }) 18 | x == v1 && y == v1 && count == 1 19 | } 20 | 21 | property("toMap") = forAll { (f: K => V, ks: Set[K]) => 22 | val c = Cache.empty[K, V] 23 | val m = buildMap(c, ks, f) 24 | c.toMap == m 25 | } 26 | 27 | property("duplicate") = forAll { (f: K => V, ks: Set[K]) => 28 | val c = Cache.empty[K, V] 29 | val d = c.duplicate 30 | buildMap(c, ks, f) 31 | d.toMap.isEmpty 32 | } 33 | 34 | property("reset works") = forAll { (f: K => V, ks: Set[K]) => 35 | val c = Cache.empty[K, V] 36 | buildMap(c, ks, f) 37 | val d = c.duplicate 38 | c.reset() 39 | c.toMap.isEmpty && d.toMap.size == ks.size 40 | } 41 | } 42 | 43 | object CacheTestsSL extends CacheTests[String, Long]("CacheTests[String, Long]") 44 | -------------------------------------------------------------------------------- /scalding-base/src/main/scala/com/twitter/scalding/Mode.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding 2 | 3 | trait Mode extends java.io.Serializable { 4 | 5 | /** 6 | * Make the Execution.Writer for this platform 7 | */ 8 | def newWriter(): Execution.Writer 9 | 10 | /** 11 | * Config.defaultForMode converts this map into a Config (we don't use Config here to avoid a circular 12 | * dependency) 13 | */ 14 | def defaultConfig: Map[String, String] = Map.empty 15 | } 16 | 17 | object Mode { 18 | 19 | /** 20 | * This is an Args and a Mode together. It is used purely as a work-around for the fact that Job only accepts 21 | * an Args object, but needs a Mode inside.
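 *
 * A sketch of the round-trip implemented by putMode/getMode below:
 * {{{
 * val decorated = Mode.putMode(mode, args)
 * Mode.getMode(decorated) // Some(mode)
 * Mode.getMode(args)      // None, for a plain Args
 * }}}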
22 | */ 23 | private class ArgsWithMode(argsMap: Map[String, List[String]], val mode: Mode) extends Args(argsMap) { 24 | override def +(keyvals: (String, Iterable[String])): Args = 25 | new ArgsWithMode(super.+(keyvals).m, mode) 26 | } 27 | 28 | /** Attach a mode to these Args and return the new Args */ 29 | def putMode(mode: Mode, args: Args): Args = new ArgsWithMode(args.m, mode) 30 | 31 | /** Get a Mode if this Args was the result of a putMode */ 32 | def getMode(args: Args): Option[Mode] = args match { 33 | case withMode: ArgsWithMode => Some(withMode.mode) 34 | case _ => None 35 | } 36 | } 37 | 38 | case class ModeException(message: String) extends RuntimeException(message) 39 | case class ModeLoadException(message: String, origin: ClassNotFoundException) extends RuntimeException(origin) 40 | -------------------------------------------------------------------------------- /scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeSchemaConversionException.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 
18 | */ 19 | package com.twitter.scalding.parquet.scrooge; 20 | 21 | import org.apache.parquet.ParquetRuntimeException; 22 | 23 | /** 24 | * Throw this exception when there is an error converting a Scrooge class to 25 | * a Thrift schema 26 | */ 27 | class ScroogeSchemaConversionException extends ParquetRuntimeException { 28 | public ScroogeSchemaConversionException(String message, Throwable cause) { 29 | super(message, cause); 30 | } 31 | 32 | public ScroogeSchemaConversionException(String message) { 33 | super(message); 34 | } 35 | } 36 | 37 | -------------------------------------------------------------------------------- /tutorial/MatrixTutorial3.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.examples 2 | 3 | import com.twitter.scalding._ 4 | import com.twitter.scalding.mathematics.Matrix 5 | 6 | 7 | /* 8 | * MatrixTutorial3.scala 9 | * 10 | * Loads two directed graph adjacency matrices where a[i,j] = 1 if there is an edge from a[i] to b[j] 11 | * and computes the intersection and the differences between the two 12 | * 13 | * ../scripts/scald.rb --local MatrixTutorial3.scala --input1 data/graph.tsv --input2 data/graph2.tsv --intersection data/intersection.tsv --leftDiff data/leftDiff.tsv --rightDiff data/rightDiff.tsv 14 | * 15 | */ 16 | 17 | 18 | class ComputeMatrixIntersectionJob(args : Args) extends Job(args) { 19 | 20 | import Matrix._ 21 | 22 | val adjacencyMatrix1 = Tsv( args("input1"), ('user1, 'user2, 'rel) ) 23 | .read 24 | .toMatrix[Long,Long,Double]('user1, 'user2, 'rel) 25 | 26 | val adjacencyMatrix2 = Tsv( args("input2"), ('user1, 'user2, 'rel) ) 27 | .read 28 | .toMatrix[Long,Long,Double]('user1, 'user2, 'rel) 29 | 30 | // zip creates a pair element out of corresponding elements in the two matrices 31 | val intersection = adjacencyMatrix1 32 | .zip(adjacencyMatrix2) 33 | .mapValues( pair => if (pair._1 > 0 && pair._2 > 0) 1.0 else 0.0 ) 34 | .write(Tsv(args("intersection"))) 35 | (adjacencyMatrix1 - intersection).write(Tsv(args("leftDiff"))) 36 | (adjacencyMatrix2 - intersection).write(Tsv(args("rightDiff"))) 37 | 38 | } 39 | 40 | -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/Dsl.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.twitter.scalding 17 | 18 | import cascading.pipe.Pipe 19 | import cascading.flow.FlowDef 20 | 21 | /** 22 | * This object has all the implicit functions and values that are used to make the scalding DSL, which 23 | * includes the functions for automatically creating cascading.tuple.Fields objects from scala tuples of 24 | * Strings, Symbols or Ints, as well as the cascading.pipe.Pipe enrichment to RichPipe which adds the 25 | * scala.collections-like API to Pipe.
26 | * 27 | * It's useful to import Dsl._ when you are writing scalding code outside of a Job. 28 | */ 29 | object Dsl extends FieldConversions with java.io.Serializable { 30 | implicit def pipeToRichPipe(pipe: Pipe): RichPipe = new RichPipe(pipe) 31 | 32 | /** 33 | * Enrichment on FlowDef 34 | */ 35 | implicit def flowDefToRichFlowDef(fd: FlowDef): RichFlowDef = new RichFlowDef(fd) 36 | 37 | } 38 | -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/ReducerEstimatorConfig.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.reducer_estimation 2 | 3 | import org.apache.hadoop.mapred.JobConf 4 | 5 | object ReducerEstimatorConfig { 6 | 7 | /** Output param: what the Reducer Estimator recommended, regardless of if it was used. */ 8 | val estimatedNumReducers = "scalding.reducer.estimator.result" 9 | 10 | /** 11 | * Output param: same as estimatedNumReducers but with the cap specified by maxEstimatedReducersKey applied. 12 | * Can be used to determine whether a cap was applied to the estimated number of reducers and potentially to 13 | * trigger alerting / logging. 14 | */ 15 | val cappedEstimatedNumReducersKey = "scalding.reducer.estimator.result.capped" 16 | 17 | /** Output param: what the original job config was. */ 18 | val originalNumReducers = "scalding.reducer.estimator.original.mapred.reduce.tasks" 19 | 20 | /** 21 | * If we estimate more than this number of reducers, we will use this number instead of the estimated value 22 | */ 23 | val maxEstimatedReducersKey = "scalding.reducer.estimator.max.estimated.reducers" 24 | 25 | /* fairly arbitrary choice here -- you will probably want to configure this in your cluster defaults */ 26 | val defaultMaxEstimatedReducers = 5000 27 | 28 | /** Maximum number of history items to use for reducer estimation. */ 29 | val maxHistoryKey = "scalding.reducer.estimator.max.history" 30 | 31 | def getMaxHistory(conf: JobConf): Int = conf.getInt(maxHistoryKey, 1) 32 | } 33 | -------------------------------------------------------------------------------- /scalding-db/src/main/scala/com/twitter/scalding/db/ColumnDefinition.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2015 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | package com.twitter.scalding.db 18 | 19 | import com.twitter.scalding.TupleConverter 20 | 21 | case class ColumnName(toStr: String) extends AnyVal 22 | case class SqlTypeName(toStr: String) extends AnyVal 23 | 24 | case class ColumnDefinition( 25 | jdbcType: SqlType, 26 | name: ColumnName, 27 | nullable: IsNullable, 28 | sizeOpt: Option[Int], 29 | defaultValue: Option[String] 30 | ) extends Serializable 31 | 32 | trait ColumnDefinitionProvider[T] extends Serializable { 33 | def columns: Iterable[ColumnDefinition] 34 | def resultSetExtractor: ResultSetExtractor[T] 35 | } 36 | 37 | class JdbcValidationException(msg: String) extends RuntimeException(msg) 38 | 39 | trait ResultSetExtractor[T] { 40 | def validate(rsmd: java.sql.ResultSetMetaData): scala.util.Try[Unit] 41 | def toCaseClass(rs: java.sql.ResultSet, c: TupleConverter[T]): T 42 | } 43 | -------------------------------------------------------------------------------- /scalding-parquet/README.md: -------------------------------------------------------------------------------- 1 | # Parquet support for Scalding 2 | 3 | The implementation is ported from code used internally at Twitter, written by Sam Ritchie, Ian O'Connell, Oscar Boykin and Tianshuo Deng 4 | ## Use com.twitter.scalding.parquet.thrift for reading Apache Thrift (TBase) records 5 | ## Use com.twitter.scalding.parquet.scrooge for reading Scrooge Thrift (ThriftStruct) records 6 | Located in the scalding-parquet-scrooge module 7 | ## Use com.twitter.scalding.parquet.tuple for reading Tuple records 8 | ## Use com.twitter.scalding.parquet.tuple.TypedParquet for reading or writing case classes: 9 | You can use the macros in com.twitter.scalding.parquet.tuple.macros.Macros to generate Parquet read/write support. Here's an example: 10 | ```scala 11 | import com.twitter.scalding.parquet.tuple.macros.Macros._ 12 | 13 | case class SampleClass(x: Int, y: String) 14 | 15 | class WriteToTypedParquetTupleJob(args: Args) extends Job(args) { 16 | val outputPath = args.required("output") 17 | val sink = TypedParquetSink[SampleClass](outputPath) 18 | 19 | TypedPipe.from(List(SampleClass(0, "foo"), SampleClass(1, "bar"))).write(sink) 20 | } 21 | 22 | class ReadWithFilterPredicateJob(args: Args) extends Job(args) { 23 | val fp: FilterPredicate = FilterApi.eq(binaryColumn("y"), Binary.fromString("foo")) 24 | 25 | val inputPath = args.required("input") 26 | val outputPath = args.required("output") 27 | 28 | val input = TypedParquet[SampleClass](inputPath, fp) 29 | 30 | TypedPipe.from(input).map(_.x).write(TypedTsv[Int](outputPath)) 31 | } 32 | ``` -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/macros/impl/CaseClassFieldSetter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2015 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License.
15 | */ 16 | 17 | package com.twitter.scalding.macros.impl 18 | 19 | import scala.reflect.macros.Context 20 | import scala.util.Try 21 | 22 | /** 23 | * Helper to set fields from a case class into other "container" types, e.g. a cascading Tuple or a JDBC 24 | * PreparedStatement 25 | */ 26 | trait CaseClassFieldSetter { 27 | 28 | // mark the field as absent/null 29 | def absent(c: Context)(idx: Int, container: c.TermName): c.Tree 30 | 31 | // use the default field setter (for when there is no type-specific setter) 32 | def default(c: Context)(idx: Int, container: c.TermName, fieldValue: c.Tree): c.Tree 33 | 34 | // use the field setter specific to the given field type; 35 | // returns scala.util.Failure if the container has no type-specific setter 36 | def from(c: Context)(fieldType: c.Type, idx: Int, container: c.TermName, fieldValue: c.Tree): Try[c.Tree] 37 | } 38 | -------------------------------------------------------------------------------- /scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/runtime_helpers/MacroEqualityOrderedSerialization.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2016 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License.
15 | */ 16 | package com.twitter.scalding.serialization.macros.impl.ordered_serialization.runtime_helpers 17 | 18 | import com.twitter.scalding.serialization.{EquivSerialization, OrderedSerialization} 19 | 20 | object MacroEqualityOrderedSerialization { 21 | private val seed = "MacroEqualityOrderedSerialization".hashCode 22 | } 23 | 24 | abstract class MacroEqualityOrderedSerialization[T] 25 | extends OrderedSerialization[T] 26 | with EquivSerialization[T] { 27 | def uniqueId: String 28 | override def hashCode = MacroEqualityOrderedSerialization.seed ^ uniqueId.hashCode 29 | override def equals(other: Any): Boolean = other match { 30 | case o: MacroEqualityOrderedSerialization[_] => o.uniqueId == uniqueId 31 | case _ => false 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/mathematics/Histogram.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.mathematics 2 | 3 | class Histogram(map: Map[Double, Long], binWidth: Double) { 4 | lazy val size = map.values.sum 5 | lazy val sum = map.foldLeft(0.0) { case (acc, (bin, count)) => acc + bin * count } 6 | lazy val keys = map.keys.toList.sorted 7 | 8 | lazy val min = keys.head 9 | lazy val max = keys.last 10 | 11 | lazy val stdDev = { 12 | val squaredDiff = map.foldLeft(0.0) { case (acc, (bin, count)) => 13 | acc + count * math.pow(bin - mean, 2.0) 14 | } 15 | math.sqrt(squaredDiff / size) 16 | } 17 | 18 | lazy val cdf = { 19 | var cumulative = 0L 20 | var result = Map[Double, Double]() 21 | keys.foreach { bin => 22 | cumulative += map(bin) 23 | result += (bin -> (cumulative.toDouble / size)) 24 | } 25 | result 26 | } 27 | 28 | lazy val lorenz = { 29 | var cumulativeUnique = 0.0 30 | var cumulativeTotal = 0.0 31 | var result = Map[Double, Double]() 32 | keys.foreach { bin => 33 | cumulativeUnique += map(bin) 34 | cumulativeTotal += bin * map(bin) 35 | result += (cumulativeUnique / size -> cumulativeTotal / sum) 36 | } 37 | result 38 | } 39 | 40 | def percentile(p: Int) = keys.find(bin => cdf(bin) * 100 >= p).getOrElse(-1d) 41 | 42 | lazy val median = percentile(50) 43 | lazy val q1 = percentile(25) 44 | lazy val q3 = percentile(75) 45 | 46 | def mean = sum / size 47 | def innerQuartileRange = q3 - q1 48 | def coefficientOfDispersion = innerQuartileRange / (q3 + q1) 49 | } 50 | -------------------------------------------------------------------------------- /tutorial/MatrixTutorial6.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.examples 2 | 3 | import com.twitter.scalding._ 4 | import com.twitter.scalding.mathematics.Matrix 5 | 6 | /* 7 | * MatrixTutorial6.scala 8 | * 9 | * Loads a document to word matrix where a[i,j] = freq of the word j in the document i 10 | * computes the Tf-Idf score of each word w.r.t. 
each document and keeps the top nrWords in each document 11 | * (see http://en.wikipedia.org/wiki/Tf*idf for more info) 12 | * 13 | * ../scripts/scald.rb --local MatrixTutorial6.scala --input data/docBOW.tsv --nrWords 300 --output data/featSelectedMatrix.tsv 14 | * 15 | */ 16 | 17 | class TfIdfJob(args : Args) extends Job(args) { 18 | 19 | import Matrix._ 20 | 21 | val docWordMatrix = Tsv( args("input"), ('doc, 'word, 'count) ) 22 | .read 23 | .toMatrix[Long,String,Double]('doc, 'word, 'count) 24 | 25 | // compute the overall document frequency of each word 26 | val docFreq = docWordMatrix.binarizeAs[Double].sumRowVectors 27 | 28 | // compute the inverse document frequency vector 29 | val invDocFreqVct = docFreq.toMatrix(1).rowL1Normalize.mapValues( x => log2(1/x) ) 30 | 31 | // zip the row vector along the entire document - word matrix 32 | val invDocFreqMat = docWordMatrix.zip(invDocFreqVct.getRow(1)).mapValues( pair => pair._2 ) 33 | 34 | // multiply the term frequency with the inverse document frequency and keep the top nrWords 35 | docWordMatrix.hProd(invDocFreqMat).topRowElems( args("nrWords").toInt ).write(Tsv( args("output") )) 36 | 37 | def log2(x : Double) = scala.math.log(x)/scala.math.log(2.0) 38 | 39 | } 40 | 41 | -------------------------------------------------------------------------------- /scalding-core/src/test/scala/com/twitter/scalding/TimePathedSourceTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2016 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License.
15 | */ 16 | package com.twitter.scalding 17 | 18 | import java.util.TimeZone 19 | 20 | import org.scalatest.{Matchers, WordSpec} 21 | 22 | class TimePathedSourceTest extends WordSpec with Matchers { 23 | "TimePathedSource.hdfsWritePath" should { 24 | val dateRange = DateRange(RichDate(0L), RichDate(0L)) 25 | val utcTZ = DateOps.UTC 26 | 27 | "crib if path == /*" in { 28 | intercept[AssertionError](TestTimePathedSource("/*", dateRange, utcTZ).hdfsWritePath) 29 | } 30 | 31 | "crib if path doesn't end with /*" in { 32 | intercept[AssertionError](TestTimePathedSource("/my/invalid/path", dateRange, utcTZ).hdfsWritePath) 33 | } 34 | 35 | "work for path ending with /*" in { 36 | TestTimePathedSource("/my/path/*", dateRange, utcTZ).hdfsWritePath.startsWith("/my/path") shouldBe true 37 | } 38 | } 39 | } 40 | 41 | case class TestTimePathedSource(p: String, dr: DateRange, t: TimeZone) extends TimePathedSource(p, dr, t) 42 | -------------------------------------------------------------------------------- /tutorial/MatrixTutorial5.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.examples 2 | 3 | import com.twitter.scalding._ 4 | import com.twitter.scalding.mathematics.Matrix 5 | 6 | 7 | /* 8 | * MatrixTutorial5.scala 9 | * 10 | * Loads a directed graph adjacency matrix where a[i,j] = 1 if there is an edge from a[i] to b[j] 11 | * and computes the Jaccard similarity between every pair of row vectors 12 | * 13 | * ../scripts/scald.rb --local MatrixTutorial5.scala --input data/graph.tsv --output data/jaccardSim.tsv 14 | * 15 | */ 16 | 17 | class ComputeJaccardJob(args : Args) extends Job(args) { 18 | 19 | import Matrix._ 20 | 21 | val adjacencyMatrix = Tsv( args("input"), ('user1, 'user2, 'rel) ) 22 | .read 23 | .toMatrix[Long,Long,Double]('user1, 'user2, 'rel) 24 | 25 | val aBinary = adjacencyMatrix.binarizeAs[Double] 26 | 27 | // intersectMat holds the size of the intersection row(a)_i ∩ row(b)_j 28 | val intersectMat = aBinary * aBinary.transpose 29 | val aSumVct = aBinary.sumColVectors 30 | val bSumVct = aBinary.sumRowVectors 31 | 32 | // Using zip to repeat the row and column vector values on the right hand side 33 | // for all non-zeroes in the left hand matrix 34 | val xMat = intersectMat.zip(aSumVct).mapValues( pair => pair._2 ) 35 | val yMat = intersectMat.zip(bSumVct).mapValues( pair => pair._2 ) 36 | 37 | val unionMat = xMat + yMat - intersectMat 38 | // We are guaranteed to have Doubles both in the intersection and in the union matrix 39 | intersectMat.zip(unionMat) 40 | .mapValues( pair => pair._1 / pair._2 ) 41 | .write(Tsv( args("output") )) 42 | 43 | } 44 | 45 | -------------------------------------------------------------------------------- /scalding-core/src/test/scala/com/twitter/scalding/ExecutionUtilTest.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding 2 | 3 | import org.scalatest.{Matchers, WordSpec} 4 | 5 | class ExecutionUtilTest extends WordSpec with Matchers { 6 | import ExecutionUtil._ 7 | 8 | implicit val tz: java.util.TimeZone = DateOps.UTC 9 | implicit val dp: DateParser = DateParser.default 10 | implicit val dateRange: DateRange = DateRange.parse("2015-01-01", "2015-01-10") 11 | 12 | def run[T](e: Execution[T]) = { 13 | val mode = Local(true) 14 | e.waitFor(Config.defaultFrom(mode), mode) 15 | } 16 | 17 | def testJob(dr: DateRange) = { 18 | assert(dr != null) 19 | TypedPipe 20 | .from[Int](Seq(1, 2, 3)) 21 | .toIterableExecution 22 | .map(_.head) 23 | 
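// the pipe above always contains Seq(1, 2, 3), so each date's Execution yields its head, 1 (the assertions below rely on this)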
} 24 | 25 | def testJobFailure(dr: DateRange) = 26 | throw new Exception(s"failed: $dr") 27 | 28 | "ExecutionUtil" should { 29 | "run multiple jobs" in { 30 | val days = dateRange.each(Days(1)).toSeq 31 | val result = runDatesWithParallelism(Days(1))(testJob) 32 | assert(run(result).get == days.map(d => (d, 1))) 33 | } 34 | 35 | "run multiple jobs with executions" in { 36 | val days = dateRange.each(Days(1)).toSeq 37 | val result = runDateRangeWithParallelism(Days(1))(testJob) 38 | assert(run(result).get == days.map(d => 1)) 39 | } 40 | 41 | "run multiple jobs with executions and sum results" in { 42 | val days = dateRange.each(Days(1)).toSeq 43 | val result = runDateRangeWithParallelismSum(Days(1))(testJob) 44 | assert(run(result).get == days.map(d => 1).sum) 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /scalding-hraven/src/main/scala/com/twitter/scalding/hraven/reducer_estimation/HRavenBasedEstimator.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.hraven.reducer_estimation 2 | 3 | import com.twitter.hraven.{CounterMap, TaskDetails} 4 | import com.twitter.scalding.estimation.Task 5 | import com.twitter.scalding.hraven.estimation.HRavenHistoryService 6 | import com.twitter.scalding.reducer_estimation.{RatioBasedEstimator, RuntimeReducerEstimator} 7 | 8 | trait HRavenReducerHistoryService extends HRavenHistoryService { 9 | override protected val counterFields: List[String] = List() 10 | override protected val detailFields: List[String] = List(Task.TaskType, "status", "startTime", "finishTime") 11 | 12 | override protected def counters(taskCounters: CounterMap): Option[Map[String, Long]] = Some(Map.empty) 13 | 14 | override protected def details(taskDetails: TaskDetails): Option[Map[String, Any]] = 15 | if (taskDetails.getType.nonEmpty) { 16 | Some( 17 | Map( 18 | Task.TaskType -> taskDetails.getType, 19 | "status" -> taskDetails.getStatus, 20 | "startTime" -> taskDetails.getStartTime, 21 | "finishTime" -> taskDetails.getFinishTime 22 | ) 23 | ) 24 | } else { 25 | None 26 | } 27 | } 28 | 29 | object HRavenReducerHistoryService extends HRavenReducerHistoryService 30 | 31 | class HRavenRatioBasedEstimator extends RatioBasedEstimator { 32 | override val historyService = HRavenReducerHistoryService 33 | } 34 | 35 | class HRavenRuntimeBasedEstimator extends RuntimeReducerEstimator { 36 | override val historyService = HRavenReducerHistoryService 37 | } 38 | -------------------------------------------------------------------------------- /tutorial/MatrixTutorial2.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.examples 2 | 3 | import com.twitter.scalding._ 4 | import com.twitter.scalding.mathematics.Matrix 5 | 6 | 7 | /* 8 | * MatrixTutorial2.scala 9 | * 10 | * Loads a directed graph adjacency matrix where a[i,j] = 1 if there is an edge from a[i] to b[j] 11 | * and returns a graph containing only the nodes with outdegree smaller than a given value 12 | * 13 | * ../scripts/scald.rb --local MatrixTutorial2.scala --input data/graph.tsv --maxOutdegree 1000 --output data/graphFiltered.tsv 14 | * 15 | */ 16 | 17 | 18 | class FilterOutdegreeJob(args : Args) extends Job(args) { 19 | 20 | import Matrix._ 21 | 22 | val adjacencyMatrix = Tsv( args("input"), ('user1, 'user2, 'rel) ) 23 | .read 24 | .toMatrix[Long,Long,Double]('user1, 'user2, 'rel) 25 | 26 | // Each row corresponds to the outgoing edges so 
to compute the outdegree we sum out the columns 27 | val outdegree = adjacencyMatrix.sumColVectors 28 | 29 | // We convert the column vector to a matrix object to be able to use the matrix method filterValues 30 | // we make all non-zero values into ones and then convert it back to a column vector 31 | val outdegreeFiltered = outdegree.toMatrix[Int](1) 32 | .filterValues{ _ < args("maxOutdegree").toDouble } 33 | .binarizeAs[Double].getCol(1) 34 | 35 | // We multiply on the left hand side with the diagonal matrix created from the column vector 36 | // to keep only the rows with outdegree smaller than maxOutdegree 37 | (outdegreeFiltered.diag * adjacencyMatrix).write(Tsv( args("output") ) ) 38 | 39 | } 40 | 41 | -------------------------------------------------------------------------------- /scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ParquetScroogeOutputFormat.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2012 Twitter, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.twitter.scalding.parquet.scrooge; 17 | 18 | import com.twitter.scrooge.ThriftStruct; 19 | import org.apache.hadoop.conf.Configuration; 20 | import org.apache.parquet.hadoop.ParquetOutputFormat; 21 | 22 | /** 23 | * Use this class to write Scrooge records to parquet 24 | * @param <T> Type of Scrooge records to write 25 | */ 26 | public class ParquetScroogeOutputFormat<T extends ThriftStruct> extends ParquetOutputFormat<T> { 27 | 28 | public static <U extends ThriftStruct> void setScroogeClass(Configuration configuration, Class<U> thriftClass) { 29 | ScroogeWriteSupport.setScroogeClass(configuration, thriftClass); 30 | } 31 | 32 | public static Class<? extends ThriftStruct> getScroogeClass(Configuration configuration) { 33 | return ScroogeWriteSupport.getScroogeClass(configuration); 34 | } 35 | 36 | public ParquetScroogeOutputFormat() { 37 | super(new ScroogeWriteSupport<T>()); 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /maple/src/main/java/com/twitter/maple/tap/StdoutTap.java: -------------------------------------------------------------------------------- 1 | package com.twitter.maple.tap; 2 | 3 | import cascading.flow.hadoop.HadoopFlowProcess; 4 | import cascading.scheme.hadoop.SequenceFile; 5 | import cascading.tap.hadoop.Lfs; 6 | import cascading.tuple.Fields; 7 | import cascading.tuple.TupleEntryIterator; 8 | import org.apache.hadoop.mapred.JobConf; 9 | 10 | import java.io.File; 11 | import java.io.IOException; 12 | 13 | public class StdoutTap extends Lfs { 14 | 15 | public StdoutTap() { 16 | super(new SequenceFile(Fields.ALL), getTempDir()); 17 | } 18 | 19 | public static String getTempDir() { 20 | final File temp; 21 | try { 22 | temp = File.createTempFile("temp", Long.toString(System.nanoTime())); 23 | } catch (IOException e) { 24 | throw new RuntimeException(e); 25 | } 26 | temp.deleteOnExit(); 27 | if (!(temp.delete())) { 28 | throw new RuntimeException("Could not delete temp file: " + 
temp.getAbsolutePath()); 29 | } 30 | 31 | return temp.getAbsoluteFile().getPath(); 32 | } 33 | 34 | @Override 35 | public boolean commitResource(JobConf conf) throws java.io.IOException { 36 | TupleEntryIterator it = new HadoopFlowProcess(conf).openTapForRead(this); 37 | System.out.println(""); 38 | System.out.println(""); 39 | System.out.println("RESULTS"); 40 | System.out.println("-----------------------"); 41 | while (it.hasNext()) { 42 | System.out.println(it.next().getTuple()); 43 | } 44 | System.out.println("-----------------------"); 45 | it.close(); 46 | return true; 47 | } 48 | } -------------------------------------------------------------------------------- /scalding-dagon/src/test/scala/com/twitter/scalding/dagon/HCacheTests.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.dagon 2 | 3 | import org.scalacheck.Prop._ 4 | import org.scalacheck.{Arbitrary, Cogen, Properties} 5 | 6 | abstract class HCacheTests[K[_], V[_]](name: String)(implicit 7 | ka: Arbitrary[K[Int]], 8 | kc: Cogen[K[Int]], 9 | va: Arbitrary[V[Int]] 10 | ) extends Properties(name) { 11 | 12 | def buildHMap(c: HCache[K, V], ks: Iterable[K[Int]], f: K[Int] => V[Int]): HMap[K, V] = 13 | ks.iterator.foldLeft(HMap.empty[K, V]) { (m, k) => 14 | m.updated(k, c.getOrElseUpdate(k, f(k))) 15 | } 16 | 17 | property("getOrElseUpdate") = forAll { (f: K[Int] => V[Int], k: K[Int], v1: V[Int], v2: V[Int]) => 18 | val c = HCache.empty[K, V] 19 | var count = 0 20 | val x = c.getOrElseUpdate(k, { count += 1; v1 }) 21 | val y = c.getOrElseUpdate(k, { count += 1; v2 }) 22 | x == v1 && y == v1 && count == 1 23 | } 24 | 25 | property("toHMap") = forAll { (f: K[Int] => V[Int], ks: Set[K[Int]]) => 26 | val c = HCache.empty[K, V] 27 | val m = buildHMap(c, ks, f) 28 | c.toHMap == m 29 | } 30 | 31 | property("duplicate") = forAll { (f: K[Int] => V[Int], ks: Set[K[Int]]) => 32 | val c = HCache.empty[K, V] 33 | val d = c.duplicate 34 | buildHMap(c, ks, f) 35 | d.toHMap.isEmpty 36 | } 37 | 38 | property("reset works") = forAll { (f: K[Int] => V[Int], ks: Set[K[Int]]) => 39 | val c = HCache.empty[K, V] 40 | buildHMap(c, ks, f) 41 | val d = c.duplicate 42 | c.reset() 43 | c.toHMap.isEmpty && d.toHMap.size == ks.size 44 | } 45 | } 46 | 47 | object HCacheTestsLL extends HCacheTests[List, List]("HCacheTests[List, List]") 48 | -------------------------------------------------------------------------------- /scalding-core/src/test/scala/com/twitter/scalding/PathFilterTest.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding 2 | 3 | import org.scalatest.{Matchers, WordSpec} 4 | import org.apache.hadoop.fs.{Path => HadoopPath, PathFilter} 5 | 6 | class PathFilterTest extends WordSpec with Matchers { 7 | "RichPathFilter" should { 8 | import RichPathFilter.toRichPathFilter 9 | val p = new HadoopPath("/nowhere") 10 | 11 | "compose ands" in { 12 | AlwaysTrue.and(AlwaysTrue).accept(p) shouldBe true 13 | AlwaysTrue.and(AlwaysFalse).accept(p) shouldBe false 14 | AlwaysFalse.and(AlwaysTrue).accept(p) shouldBe false 15 | AlwaysFalse.and(AlwaysFalse).accept(p) shouldBe false 16 | 17 | AlwaysTrue.and(AlwaysTrue, AlwaysTrue).accept(p) shouldBe true 18 | AlwaysTrue.and(AlwaysTrue, AlwaysFalse).accept(p) shouldBe false 19 | } 20 | 21 | "compose ors" in { 22 | AlwaysTrue.or(AlwaysTrue).accept(p) shouldBe true 23 | AlwaysTrue.or(AlwaysFalse).accept(p) shouldBe true 24 | AlwaysFalse.or(AlwaysTrue).accept(p) shouldBe true 25 | 
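// or accepts the path when either side accepts it; only the false/false case below is rejected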
AlwaysFalse.or(AlwaysFalse).accept(p) shouldBe false 26 | 27 | AlwaysFalse.or(AlwaysTrue, AlwaysTrue).accept(p) shouldBe true 28 | AlwaysTrue.or(AlwaysFalse, AlwaysFalse).accept(p) shouldBe true 29 | } 30 | 31 | "negate nots" in { 32 | AlwaysTrue.not.accept(p) shouldBe false 33 | AlwaysFalse.not.accept(p) shouldBe true 34 | AlwaysTrue.not.not.accept(p) shouldBe true 35 | } 36 | 37 | } 38 | } 39 | 40 | object AlwaysTrue extends PathFilter { 41 | override def accept(p: HadoopPath): Boolean = true 42 | } 43 | 44 | object AlwaysFalse extends PathFilter { 45 | override def accept(p: HadoopPath): Boolean = false 46 | } 47 | -------------------------------------------------------------------------------- /scalding-dagon/src/main/scala/com/twitter/scalding/dagon/Rule.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.dagon 2 | 3 | import java.io.Serializable 4 | 5 | /** 6 | * This implements a simplification rule on Dags 7 | */ 8 | trait Rule[N[_]] extends Serializable { self => 9 | 10 | /** 11 | * If the given Id can be replaced with a simpler expression, return Some(expr) else None. 12 | * 13 | * If it is convenient, you might write a partial function and then call .lift to get the correct Function 14 | * type 15 | */ 16 | def apply[T](on: Dag[N]): N[T] => Option[N[T]] 17 | 18 | /** 19 | * If the current rule cannot apply, then try the argument here. Note, this applies in series at a given 20 | * node, not on the whole Dag after the first rule has run. For that, see Dag.applySeq. 21 | */ 22 | def orElse(that: Rule[N]): Rule[N] = 23 | new Rule[N] { 24 | def apply[T](on: Dag[N]) = { n => 25 | self.apply(on)(n) match { 26 | case Some(n1) if n1 == n => 27 | // If the rule emits the same as input fall through 28 | that.apply(on)(n) 29 | case None => 30 | that.apply(on)(n) 31 | case s @ Some(_) => s 32 | } 33 | } 34 | 35 | override def toString: String = 36 | s"$self.orElse($that)" 37 | } 38 | } 39 | 40 | object Rule { 41 | 42 | /** 43 | * A Rule that never applies 44 | */ 45 | def empty[N[_]]: Rule[N] = 46 | new Rule[N] { 47 | def apply[T](on: Dag[N]) = { _ => None } 48 | } 49 | 50 | /** 51 | * Build a new Rule out of several using orElse to compose 52 | */ 53 | def orElse[N[_]](it: Iterable[Rule[N]]): Rule[N] = 54 | it.reduceOption(_ orElse _).getOrElse(empty) 55 | } 56 | -------------------------------------------------------------------------------- /scripts/testValidator.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -exv 2 | 3 | # Identify the bin dir in the distribution, and source the common include script 4 | BASE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/.. 
&& pwd )" 5 | cd $BASE_DIR 6 | 7 | 8 | # Which things do we want to build: anything that starts with scalding- or maple is a build target 9 | # This will produce a long string of targets separated by spaces 10 | TARGET_NAMES=$(ls -d scalding-* maple) 11 | 12 | # Cat the travis build file, ignoring the assembly lines 13 | # Reformatting any quotes to new lines so things get split up more nicely 14 | # Then grep this for the scalding- and maple targets from above 15 | BUILDS_WE_HAVE=$(cat .travis.yml | grep -v scripts/build_assembly_no_test | tr '"' '\n' | tr ' ' '\n' | grep -e scalding- -e maple ) 16 | 17 | # Grab the blacklist, dropping comment lines (those starting with #) 18 | BLACKLIST_BUILDS=$(cat .travis.blacklist | egrep -v '^\s*#') 19 | 20 | 21 | TEST_ID=$(date '+%s') 22 | GOAL_PATH="/tmp/scalding_goal.$TEST_ID.txt" 23 | HAVE_PATH="/tmp/scalding_gHAVE.$TEST_ID.txt" 24 | # Ideally we want to have each target twice, once each for Scala 2.10 and 2.11 25 | # So echo them twice, counting their frequency into the goal path 26 | echo $TARGET_NAMES $TARGET_NAMES | tr ' ' '\n' | sort | uniq -c > $GOAL_PATH 27 | 28 | # Now we take the builds we have, appending the 29 | # blacklist builds 30 | echo $BUILDS_WE_HAVE $BLACKLIST_BUILDS | tr ' ' '\n' | sort | uniq -c > $HAVE_PATH 31 | 32 | # Once we've done this both lists should be identical 33 | DIFF=$(diff $GOAL_PATH $HAVE_PATH) 34 | RET=$? 35 | rm -f $GOAL_PATH 36 | rm -f $HAVE_PATH 37 | 38 | if [ $RET -eq 0 ]; then 39 | echo "All builds running" 40 | exit 0 41 | else 42 | echo -e "Missing some builds, diff $DIFF" 43 | exit 1 44 | fi -------------------------------------------------------------------------------- /scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/DBTypeDescriptorImpl.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.db.macros.impl 2 | 3 | import scala.reflect.macros.Context 4 | 5 | import com.twitter.bijection.macros.impl.IsCaseClassImpl 6 | import com.twitter.scalding.macros.impl.{FieldsProviderImpl, TupleConverterImpl, TupleSetterImpl} 7 | import com.twitter.scalding.db.DBTypeDescriptor 8 | 9 | object DBTypeDescriptorImpl { 10 | 11 | def apply[T](c: Context)(implicit T: c.WeakTypeTag[T]): c.Expr[DBTypeDescriptor[T]] = { 12 | import c.universe._ 13 | 14 | if (!IsCaseClassImpl.isCaseClassType(c)(T.tpe)) 15 | c.abort( 16 | c.enclosingPosition, 17 | s"""We cannot enforce ${T.tpe} is a case class: either it is not a case class, or this macro call is possibly enclosed in a class. 
18 | This will mean the macro is operating on a non-resolved type.""" 19 | ) 20 | 21 | val columnDefn = ColumnDefinitionProviderImpl[T](c) 22 | val converter = TupleConverterImpl.caseClassTupleConverterWithUnknownImpl[T](c) 23 | val setter = TupleSetterImpl.caseClassTupleSetterWithUnknownImpl[T](c) 24 | val jdbcSetter = JdbcStatementSetterImpl.caseClassJdbcSetterCommonImpl[T](c, true) 25 | val fields = FieldsProviderImpl.toFieldsWithUnknownNoPrefixImpl[T](c) 26 | 27 | val res = q""" 28 | new _root_.com.twitter.scalding.db.DBTypeDescriptor[$T] with _root_.com.twitter.bijection.macros.MacroGenerated { 29 | override val columnDefn = $columnDefn 30 | override val converter = $converter 31 | override val setter = $setter 32 | override val fields = $fields 33 | override val jdbcSetter = $jdbcSetter 34 | } 35 | """ 36 | c.Expr[DBTypeDescriptor[T]](res) 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/serialization/RequiredBinaryComparators.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.serialization 2 | 3 | import com.twitter.scalding._ 4 | 5 | import scala.language.experimental.{macros => smacros} 6 | 7 | /** 8 | * RequiredBinaryComparators provide comparators (or Ordering in Scala) that are capable of comparing keys in 9 | * their serialized form reducing the amount of time spent in serialization/deserialization. These comparators 10 | * are implemented using Scala macros, and currently provide binary comparators for primitives, strings, 11 | * Options, tuples, collections, case classes and Scrooge objects. 12 | */ 13 | trait RequiredBinaryComparators extends RequiredBinaryComparatorsConfig { 14 | 15 | implicit def ordSer[T]: OrderedSerialization[T] = 16 | macro com.twitter.scalding.serialization.macros.impl.OrderedSerializationProviderImpl[T] 17 | 18 | } 19 | 20 | object RequiredBinaryComparators { 21 | 22 | implicit def orderedSerialization[T]: OrderedSerialization[T] = 23 | macro com.twitter.scalding.serialization.macros.impl.OrderedSerializationProviderImpl[T] 24 | } 25 | 26 | /** 27 | * Use this for an ExecutionApp. 
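* Mixing it in brings the macro-generated OrderedSerialization into implicit scope, and config() sets RequireOrderedSerializationMode.Fail so the job fails fast rather than silently falling back to a slower, non-binary comparator.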
28 | */ 29 | trait RequiredBinaryComparatorsExecutionApp extends ExecutionApp { 30 | implicit def ordSer[T]: OrderedSerialization[T] = 31 | macro com.twitter.scalding.serialization.macros.impl.OrderedSerializationProviderImpl[T] 32 | def requireOrderedSerializationMode: RequireOrderedSerializationMode = RequireOrderedSerializationMode.Fail 33 | override def config(inputArgs: Array[String]): (Config, Mode) = { 34 | val (conf, m) = super.config(inputArgs) 35 | (conf.setRequireOrderedSerializationMode(Some(requireOrderedSerializationMode)), m) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /scalding-base/src/test/scala/com/twitter/scalding/typed/TypedPipeMonoidTest.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding 2 | package typed 3 | 4 | import memory_backend.MemoryMode 5 | import com.twitter.algebird.Monoid.{plus, sum, zero} 6 | import org.scalatest.FunSuite 7 | import org.scalatest.prop.PropertyChecks 8 | 9 | class TypedPipeMonoidTest extends FunSuite with PropertyChecks { 10 | 11 | def run[A](t: TypedPipe[A]): List[A] = 12 | t.toIterableExecution.map(_.toList).waitFor(Config.empty, MemoryMode.empty).get 13 | 14 | def sortedEq[A: Ordering](a: List[A], b: List[A]): Boolean = 15 | a.sorted == b.sorted 16 | 17 | def eqvPipe[A: Ordering](a: TypedPipe[A], b: TypedPipe[A]): Boolean = 18 | sortedEq(run(a), run(b)) 19 | 20 | test("typedPipeMonoid.zero should be equal to TypePipe.empty") { 21 | assert(zero[TypedPipe[Int]] == TypedPipe.empty) 22 | } 23 | 24 | test("monoid is associative") { 25 | forAll { (a: List[Int], b: List[Int], c: List[Int]) => 26 | val left = plus(plus(TypedPipe.from(a), TypedPipe.from(b)), TypedPipe.from(c)) 27 | val right = plus(TypedPipe.from(a), plus(TypedPipe.from(b), TypedPipe.from(c))) 28 | assert(eqvPipe(left, right)) 29 | } 30 | } 31 | 32 | test("monoid is commutative") { 33 | forAll { (a: List[Int], b: List[Int]) => 34 | val left = plus(TypedPipe.from(a), TypedPipe.from(b)) 35 | val right = plus(TypedPipe.from(b), TypedPipe.from(a)) 36 | assert(eqvPipe(left, right)) 37 | } 38 | } 39 | 40 | test("monoid sum is equivalent to a union") { 41 | forAll { (as: List[List[Int]]) => 42 | val pipes = as.map(TypedPipe.from(_)) 43 | val bigPipe = TypedPipe.from(as.flatten) 44 | assert(eqvPipe(sum(pipes), bigPipe)) 45 | } 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/runtime_helpers/LengthCalculations.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2014 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.twitter.scalding.serialization.macros.impl.ordered_serialization.runtime_helpers 17 | 18 | /** 19 | * There is a Monoid on MaybeLength, with ConstLen(0) being the zero. 
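* Concretely, mirroring the cases below: ConstLen(a) + ConstLen(b) == ConstLen(a + b), combining a ConstLen with a DynamicLen yields a DynamicLen (e.g. ConstLen(4) + DynamicLen(8) == DynamicLen(12)), and NoLengthCalculation absorbs anything it is combined with.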
20 | */ 21 | sealed trait MaybeLength { 22 | def +(that: MaybeLength): MaybeLength 23 | } 24 | 25 | case object NoLengthCalculation extends MaybeLength { 26 | def +(that: MaybeLength): MaybeLength = this 27 | } 28 | final case class ConstLen(toInt: Int) extends MaybeLength { 29 | def +(that: MaybeLength): MaybeLength = that match { 30 | case ConstLen(c) => ConstLen(toInt + c) 31 | case DynamicLen(d) => DynamicLen(toInt + d) 32 | case NoLengthCalculation => NoLengthCalculation 33 | } 34 | } 35 | final case class DynamicLen(toInt: Int) extends MaybeLength { 36 | def +(that: MaybeLength): MaybeLength = that match { 37 | case ConstLen(c) => DynamicLen(toInt + c) 38 | case DynamicLen(d) => DynamicLen(toInt + d) 39 | case NoLengthCalculation => NoLengthCalculation 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/JdbcStatementSetterImpl.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2015 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.twitter.scalding.db.macros.impl 17 | 18 | import scala.reflect.macros.Context 19 | 20 | import com.twitter.scalding.macros.impl.CaseClassBasedSetterImpl 21 | import com.twitter.scalding.db.JdbcStatementSetter 22 | 23 | /** 24 | * Generates JDBC PreparedStatement data from case class 25 | */ 26 | private[macros] object JdbcStatementSetterImpl { 27 | 28 | def caseClassJdbcSetterCommonImpl[T](c: Context, allowUnknownTypes: Boolean)(implicit 29 | T: c.WeakTypeTag[T] 30 | ): c.Expr[JdbcStatementSetter[T]] = { 31 | import c.universe._ 32 | 33 | val stmtTerm = newTermName(c.fresh("stmt")) 34 | val (_, setterTerm) = CaseClassBasedSetterImpl(c)(stmtTerm, allowUnknownTypes, JdbcFieldSetter) 35 | val res = q""" 36 | new _root_.com.twitter.scalding.db.JdbcStatementSetter[$T] with _root_.com.twitter.bijection.macros.MacroGenerated { 37 | override def apply(t: $T, $stmtTerm: _root_.java.sql.PreparedStatement) = _root_.scala.util.Try { 38 | $setterTerm 39 | $stmtTerm 40 | } 41 | } 42 | """ 43 | c.Expr[JdbcStatementSetter[T]](res) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LzoGenericSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2015 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package com.twitter.scalding.commons.source 18 | 19 | import com.twitter.elephantbird.mapreduce.io.BinaryConverter 20 | import com.twitter.scalding._ 21 | 22 | import cascading.scheme.Scheme 23 | 24 | /** 25 | * Generic source with an underlying GenericScheme that uses the supplied BinaryConverter. 26 | */ 27 | abstract class LzoGenericSource[T] 28 | extends FileSource 29 | with SingleMappable[T] 30 | with TypedSink[T] 31 | with LocalTapSource { 32 | def clazz: Class[T] 33 | def conv: BinaryConverter[T] 34 | override def setter[U <: T] = TupleSetter.asSubSetter[T, U](TupleSetter.singleSetter[T]) 35 | override def hdfsScheme = HadoopSchemeInstance( 36 | LzoGenericScheme[T](conv, clazz).asInstanceOf[Scheme[_, _, _, _, _]] 37 | ) 38 | } 39 | 40 | object LzoGenericSource { 41 | def apply[T](passedConv: BinaryConverter[T], passedClass: Class[T], paths: String*) = 42 | new LzoGenericSource[T] { 43 | override val conv: BinaryConverter[T] = passedConv 44 | override val clazz = passedClass 45 | override val hdfsPaths = paths 46 | override val localPaths = paths 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /tutorial/Tutorial1.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | import com.twitter.scalding._ 17 | 18 | /** 19 | Scalding tutorial part 1. 20 | 21 | In part 0, we made a copy of hello.txt, but it wasn't a perfect copy: 22 | it was annotated with line numbers. 23 | 24 | That's because the data stream coming out of a TextLine source actually 25 | has two fields: one, called "line", has the actual line of text. The other, 26 | called "num", has the line number in the file. When you write these 27 | tuples to a TextLine, it naively outputs them both on each line. 28 | 29 | We can ask scalding to select just the "line" field from the pipe, using the 30 | project() method. When we refer to a data stream's fields, we use Scala symbols, 31 | like this: 'line. 32 | 33 | To run this job: 34 | scripts/scald.rb --local tutorial/Tutorial1.scala 35 | 36 | Check the output: 37 | cat tutorial/data/output1.txt 38 | 39 | **/ 40 | 41 | class Tutorial1(args : Args) extends Job(args) { 42 | 43 | val input = TextLine("tutorial/data/hello.txt") 44 | val output = TextLine("tutorial/data/output1.txt") 45 | 46 | /** 47 | We generally write each step of the pipeline on a separate line. 48 | **/ 49 | input 50 | .read 51 | .project('line) 52 | .write(output) 53 | } 54 | -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/TupleArity.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 
3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package com.twitter.scalding 18 | 19 | import cascading.tuple.Fields 20 | 21 | /** 22 | * Mixed in to both TupleConverter and TupleSetter to improve arity safety of cascading jobs before we run 23 | * anything on Hadoop. 24 | */ 25 | trait TupleArity { 26 | 27 | /** 28 | * Return the arity of product types; should probably only be used implicitly. The use case here is to see 29 | * how many fake field names we need in Cascading to hold an intermediate value for mapReduceMap 30 | */ 31 | def arity: Int 32 | 33 | /** 34 | * assert that the arity of this setter matches the fields given. if arity == -1, we can't check, and if 35 | * Fields is not a definite size, (such as Fields.ALL), we also cannot check, so this should only be 36 | * considered a weak check. 37 | */ 38 | def assertArityMatches(f: Fields): Unit = 39 | // Fields.size == 0 for the indefinite Fields: ALL, GROUP, VALUES, UNKNOWN, etc.. 40 | if (f.size > 0 && arity >= 0) { 41 | assert( 42 | arity == f.size, 43 | "Arity of (" + super.getClass + ") is " 44 | + arity + ", which doesn't match: (" + f.toString + ")" 45 | ) 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /scalding-core/src/main/java/com/twitter/scalding/tap/GlobHfs.java: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.tap; 2 | 3 | import java.io.FileNotFoundException; 4 | import java.io.IOException; 5 | 6 | import org.apache.hadoop.fs.FileStatus; 7 | import org.apache.hadoop.fs.FileSystem; 8 | import org.apache.hadoop.fs.Path; 9 | import org.apache.hadoop.mapred.JobConf; 10 | import org.apache.hadoop.mapred.OutputCollector; 11 | import org.apache.hadoop.mapred.RecordReader; 12 | 13 | import cascading.scheme.Scheme; 14 | 15 | /** 16 | * The default implementation of getSize in {@link cascading.tap.hadoop.Hfs} doesn't respect paths with glob patterns; 17 | * it will throw an IOException where we actually can calculate the size of the source. 18 | */ 19 | public class GlobHfs extends ScaldingHfs { 20 | public GlobHfs(Scheme scheme) { 21 | super(scheme); 22 | } 23 | 24 | public GlobHfs(Scheme scheme, String stringPath) { 25 | super(scheme, stringPath); 26 | } 27 | 28 | @Override 29 | public long getSize(JobConf conf) throws IOException { 30 | return getSize(getPath(), conf); 31 | } 32 | 33 | /** 34 | * Get the total size of the file(s) specified by the Hfs, which may contain a glob 35 | * pattern in its path, so we must be ready to handle that case. 
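* For example, with a glob like /logs/2013/* (an illustrative path), this sums the content-summary lengths of every matching file, and throws FileNotFoundException when the glob matches nothing (globStatus returns null).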
36 | */ 37 | public static long getSize(Path path, JobConf conf) throws IOException { 38 | FileSystem fs = path.getFileSystem(conf); 39 | FileStatus[] statuses = fs.globStatus(path); 40 | 41 | if (statuses == null) { 42 | throw new FileNotFoundException(String.format("File not found: %s", path)); 43 | } 44 | 45 | long size = 0; 46 | for (FileStatus status : statuses) { 47 | size += fs.getContentSummary(status.getPath()).getLength(); 48 | } 49 | return size; 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /scalding-args/src/test/scala/com/twitter/scalding/RangedArgsSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package com.twitter.scalding 18 | 19 | import org.scalatest.WordSpec 20 | 21 | class RangeSpecs extends WordSpec { 22 | "A Range" should { 23 | val testRange = Range(4, 5) 24 | 25 | "contain its endpoints" in { 26 | assert(testRange.lower === 4) 27 | assert(testRange.upper === 5) 28 | } 29 | 30 | "throw errors for misordered ranges" in { 31 | Range(4, 4) 32 | intercept[AssertionError](Range(5, 4)) 33 | } 34 | 35 | "assert lower bounds" in { 36 | testRange.assertLowerBound(3) 37 | testRange.assertLowerBound(4) 38 | intercept[AssertionError](testRange.assertLowerBound(5)) 39 | } 40 | 41 | "assert upper bounds" in { 42 | testRange.assertUpperBound(6) 43 | testRange.assertUpperBound(5) 44 | intercept[AssertionError](testRange.assertUpperBound(4)) 45 | } 46 | 47 | "print nicely with mkString" should { 48 | "for trivial ranges" in { 49 | assert(Range(4, 4).mkString("_") === "4") 50 | } 51 | "for proper ranges" in { 52 | assert(testRange.mkString("_") === "4_5") 53 | assert(testRange.mkString("-") === "4-5") 54 | } 55 | } 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/typed/BijectedSourceSink.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | package com.twitter.scalding.typed 17 | 18 | import cascading.flow.FlowDef 19 | import cascading.pipe.Pipe 20 | 21 | import com.twitter.bijection.ImplicitBijection 22 | import com.twitter.scalding._ 23 | import serialization.Externalizer 24 | 25 | object BijectedSourceSink { 26 | type SourceSink[T] = TypedSource[T] with TypedSink[T] 27 | def apply[T, U](parent: SourceSink[T])(implicit 28 | transformer: ImplicitBijection[T, U] 29 | ): BijectedSourceSink[T, U] = 30 | new BijectedSourceSink(parent)(transformer) 31 | } 32 | 33 | class BijectedSourceSink[T, U](parent: BijectedSourceSink.SourceSink[T])(implicit 34 | @transient transformer: ImplicitBijection[T, U] 35 | ) extends TypedSource[U] 36 | with TypedSink[U] { 37 | 38 | val lockedBij = Externalizer(transformer) 39 | 40 | def setter[V <: U] = parent.setter.contraMap(lockedBij.get.invert(_)) 41 | 42 | override def converter[W >: U] = parent.converter.andThen { t: T => lockedBij.get(t) }: TupleConverter[W] 43 | 44 | override def read(implicit flowDef: FlowDef, mode: Mode): Pipe = parent.read 45 | override def writeFrom(pipe: Pipe)(implicit flowDef: FlowDef, mode: Mode) = parent.writeFrom(pipe) 46 | } 47 | -------------------------------------------------------------------------------- /scalding-base/src/test/scala/com/twitter/scalding/typed/CoGroupableTest.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.typed 2 | 3 | import org.scalatest.FunSuite 4 | 5 | class CoGroupableTest extends FunSuite { 6 | test("CoGroupable.atMostOneValue is consistent") { 7 | val init = TypedPipe.from(List((1, 2))) 8 | 9 | assert(CoGroupable.atMostOneValue(init.sumByKey)) 10 | assert(CoGroupable.atMostOneValue(init.group.sum)) 11 | assert(CoGroupable.atMostOneValue(init.group.mapValues(_ + 100).sum)) 12 | assert(CoGroupable.atMostOneValue(init.group.forceToReducers.mapValues(_ + 100).sum)) 13 | assert(CoGroupable.atMostOneValue(init.group.forceToReducers.mapValues(_ + 100).sum.mapValues(_ - 100))) 14 | assert(CoGroupable.atMostOneValue(init.group.forceToReducers.mapValues(_ + 100).sum.filter { 15 | case (k, v) => k > v 16 | })) 17 | assert(CoGroupable.atMostOneValue(init.group.mapValues(_ * 2).sum.join(init.group.sum))) 18 | 19 | assert(!CoGroupable.atMostOneValue(init.group)) 20 | assert(!CoGroupable.atMostOneValue(init.group.scanLeft(0)(_ + _))) 21 | assert(!CoGroupable.atMostOneValue(init.join(init.group.mapValues(_ * 2)))) 22 | assert(!CoGroupable.atMostOneValue(init.group.sum.flatMapValues(List(_)))) 23 | 24 | val sum1 = init.sumByKey 25 | 26 | assert(CoGroupable.atMostOneValue(sum1.join(sum1.join(sum1)))) 27 | assert(CoGroupable.atMostOneValue(sum1.join(sum1).join(sum1))) 28 | 29 | assert(!CoGroupable.atMostOneValue(init.join(sum1.join(sum1)))) 30 | assert(!CoGroupable.atMostOneValue(init.join(sum1).join(sum1))) 31 | assert(!CoGroupable.atMostOneValue(sum1.join(init.join(sum1)))) 32 | assert(!CoGroupable.atMostOneValue(sum1.join(init).join(sum1))) 33 | assert(!CoGroupable.atMostOneValue(sum1.join(sum1.join(init)))) 34 | assert(!CoGroupable.atMostOneValue(sum1.join(sum1).join(init))) 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /scalding-db/src/main/scala/com/twitter/scalding/db/macros/DBMacro.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.db.macros 2 | 3 | import scala.language.experimental.macros 4 | import com.twitter.scalding.db.macros.impl._ 5 
| import com.twitter.scalding.db.{ColumnDefinitionProvider, DBTypeDescriptor} 6 | 7 | // This is the sealed base trait for the Scala runtime annotations used by the JDBC macros. 8 | // The macros read these annotations on fields to make up for the extra type information 9 | // JDBC wants but that is not in the JVM types. 10 | sealed trait ScaldingDBAnnotation 11 | 12 | // This is the size in characters for a char field. 13 | // For integers it's really for display purposes 14 | @scala.annotation.meta.getter 15 | final class size(val size: Int) extends annotation.StaticAnnotation with ScaldingDBAnnotation 16 | 17 | // JDBC TEXT type, this forces the String field in question to be a text type 18 | @scala.annotation.meta.getter 19 | final class text() extends annotation.StaticAnnotation with ScaldingDBAnnotation 20 | 21 | // JDBC VARCHAR type, this forces the String field in question to be a varchar type 22 | @scala.annotation.meta.getter 23 | final class varchar() extends annotation.StaticAnnotation with ScaldingDBAnnotation 24 | 25 | // JDBC DATE type, this toggles a java.util.Date field to be JDBC Date. 26 | // Without this annotation the field defaults to DATETIME to preserve the full resolution of java.util.Date 27 | @scala.annotation.meta.getter 28 | final class date() extends annotation.StaticAnnotation with ScaldingDBAnnotation 29 | 30 | // This is the entry point to explicitly calling the JDBC macros. 31 | // Most often, however, the implicits provided in the package will be used. 32 | object DBMacro { 33 | def toColumnDefinitionProvider[T]: ColumnDefinitionProvider[T] = macro ColumnDefinitionProviderImpl[T] 34 | def toDBTypeDescriptor[T]: DBTypeDescriptor[T] = macro DBTypeDescriptorImpl[T] 35 | } 36 | -------------------------------------------------------------------------------- /scalding-serialization/src/test/scala/com/twitter/scalding/serialization/UnsignedComparisonLaws.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.serialization 2 | 3 | import org.scalacheck.Properties 4 | import org.scalacheck.Prop.forAll 5 | import org.scalacheck.Prop._ 6 | 7 | object UnsignedComparisonLaws extends Properties("UnsignedComparisonLaws") { 8 | 9 | property("UnsignedLongCompare works") = forAll { (l1: Long, l2: Long) => 10 | val cmp = UnsignedComparisons.unsignedLongCompare(l1, l2) 11 | (l1 >= 0, l2 >= 0) match { 12 | case (true, true) => cmp == java.lang.Long.compare(l1, l2) 13 | case (true, false) => cmp < 0 // negative is bigger 14 | case (false, true) => cmp > 0 15 | case (false, false) => cmp == java.lang.Long.compare(l1 & Long.MaxValue, l2 & Long.MaxValue) 16 | } 17 | } 18 | property("UnsignedIntCompare works") = forAll { (l1: Int, l2: Int) => 19 | val cmp = UnsignedComparisons.unsignedIntCompare(l1, l2) 20 | (l1 >= 0, l2 >= 0) match { 21 | case (true, true) => cmp == java.lang.Integer.compare(l1, l2) 22 | case (true, false) => cmp < 0 // negative is bigger 23 | case (false, true) => cmp > 0 24 | case (false, false) => cmp == java.lang.Integer.compare(l1 & Int.MaxValue, l2 & Int.MaxValue) 25 | } 26 | } 27 | property("UnsignedByteCompare works") = forAll { (l1: Byte, l2: Byte) => 28 | def clamp(i: Int) = if (i > 0) 1 else if (i < 0) -1 else 0 29 | val cmp = clamp(UnsignedComparisons.unsignedByteCompare(l1, l2)) 30 | (l1 >= 0, l2 >= 0) match { 31 | case (true, true) => cmp == clamp(java.lang.Byte.compare(l1, l2)) 32 | case (true, false) => cmp < 0 // negative is bigger 33 | case (false, true) => cmp > 0 34 | // Convert to positive ints 35 | 
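// masking with Byte.MaxValue maps a negative byte b to b + 128, which preserves the unsigned order, e.g. (-1: Byte) & Byte.MaxValue == 127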
case (false, false) => cmp == java.lang.Integer.compare(l1 & Byte.MaxValue, l2 & Byte.MaxValue) 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /tutorial/ReplTutorial1.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | import com.twitter.scalding._ 17 | 18 | /** 19 | Scalding Tutorial1 in REPL form. 20 | 21 | To test it, first make sure you've built the target/scalding-assembly-XXX.jar: 22 | from the base directory type: 23 | sbt assembly 24 | 25 | Now run the REPL in local mode. 26 | scripts/scald-repl.sh --local 27 | 28 | Run the Tutorial by typing the following. 29 | :load tutorial/ReplTutorial1.scala 30 | 31 | You can check the input: 32 | cat tutorial/data/hello.txt 33 | 34 | And the output: 35 | cat tutorial/data/output1.txt 36 | 37 | The output should look exactly like the input. 38 | **/ 39 | 40 | /** 41 | Both input and output data sources are represented by instances of 42 | com.twitter.scalding.Source. 43 | 44 | Scalding comes with some basic source types like TextLine and Tsv. 45 | There are also many twitter-specific types like MergedAdRequestSource. 46 | **/ 47 | 48 | val input = TextLine("tutorial/data/hello.txt") 49 | val output = TextLine("tutorial/data/output1.txt") 50 | 51 | /** 52 | You can then define a pipe that reads the source and writes to the sink. 53 | The "project" just fetches the content of the line, and not the line number. 54 | **/ 55 | input.read.project('line).write(output) 56 | 57 | /** 58 | And then run it! 
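(The run on the last line executes the pipe we just defined, in the mode given on the command line, here --local.)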
59 | **/ 60 | run 61 | -------------------------------------------------------------------------------- /scalding-hraven/src/test/resources/jobResponse_job_1470171371859_6607542.json: -------------------------------------------------------------------------------- 1 | [ { 2 | "taskType" : "", 3 | "counters" : { 4 | } 5 | }, { 6 | "taskType" : "MAP", 7 | "counters" : { 8 | "org.apache.hadoop.mapreduce.TaskCounter" : { 9 | "PHYSICAL_MEMORY_BYTES" : 751378432, 10 | "GC_TIME_MILLIS" : 310, 11 | "CPU_MILLISECONDS" : 38570, 12 | "COMMITTED_HEAP_BYTES" : 810524672 13 | } 14 | } 15 | }, { 16 | "taskType" : "MAP", 17 | "counters" : { 18 | "org.apache.hadoop.mapreduce.TaskCounter" : { 19 | "PHYSICAL_MEMORY_BYTES" : 751378432, 20 | "GC_TIME_MILLIS" : 310, 21 | "CPU_MILLISECONDS" : 38570, 22 | "COMMITTED_HEAP_BYTES" : 810524672 23 | } 24 | } 25 | }, { 26 | "taskType" : "MAP", 27 | "counters" : { 28 | "org.apache.hadoop.mapreduce.TaskCounter" : { 29 | "PHYSICAL_MEMORY_BYTES" : 759648256, 30 | "GC_TIME_MILLIS" : 313, 31 | "CPU_MILLISECONDS" : 38620, 32 | "COMMITTED_HEAP_BYTES" : 810520576 33 | } 34 | } 35 | }, { 36 | "taskType" : "REDUCE", 37 | "counters" : { 38 | "org.apache.hadoop.mapreduce.TaskCounter" : { 39 | "PHYSICAL_MEMORY_BYTES" : 449499136, 40 | "GC_TIME_MILLIS" : 444, 41 | "CPU_MILLISECONDS" : 53720, 42 | "COMMITTED_HEAP_BYTES" : 506986496 43 | } 44 | } 45 | }, { 46 | "taskType" : "REDUCE", 47 | "counters" : { 48 | "org.apache.hadoop.mapreduce.TaskCounter" : { 49 | "PHYSICAL_MEMORY_BYTES" : 449499136, 50 | "GC_TIME_MILLIS" : 444, 51 | "CPU_MILLISECONDS" : 53720, 52 | "COMMITTED_HEAP_BYTES" : 506986496 53 | } 54 | } 55 | }, { 56 | "taskType" : "REDUCE", 57 | "counters" : { 58 | "org.apache.hadoop.mapreduce.TaskCounter" : { 59 | "PHYSICAL_MEMORY_BYTES" : 465207296, 60 | "GC_TIME_MILLIS" : 529, 61 | "CPU_MILLISECONDS" : 57210, 62 | "COMMITTED_HEAP_BYTES" : 506986496 63 | } 64 | } 65 | } ] 66 | -------------------------------------------------------------------------------- /scalding-hraven/src/test/resources/jobResponse_job_1470171371859_6608570.json: -------------------------------------------------------------------------------- 1 | [ { 2 | "taskType" : "", 3 | "counters" : { 4 | } 5 | }, { 6 | "taskType" : "MAP", 7 | "counters" : { 8 | "org.apache.hadoop.mapreduce.TaskCounter" : { 9 | "PHYSICAL_MEMORY_BYTES" : 768618496, 10 | "GC_TIME_MILLIS" : 371, 11 | "CPU_MILLISECONDS" : 45260, 12 | "COMMITTED_HEAP_BYTES" : 814776320 13 | } 14 | } 15 | }, { 16 | "taskType" : "MAP", 17 | "counters" : { 18 | "org.apache.hadoop.mapreduce.TaskCounter" : { 19 | "PHYSICAL_MEMORY_BYTES" : 768618496, 20 | "GC_TIME_MILLIS" : 371, 21 | "CPU_MILLISECONDS" : 45260, 22 | "COMMITTED_HEAP_BYTES" : 814776320 23 | } 24 | } 25 | }, { 26 | "taskType" : "MAP", 27 | "counters" : { 28 | "org.apache.hadoop.mapreduce.TaskCounter" : { 29 | "PHYSICAL_MEMORY_BYTES" : 758517760, 30 | "GC_TIME_MILLIS" : 355, 31 | "CPU_MILLISECONDS" : 43950, 32 | "COMMITTED_HEAP_BYTES" : 814280704 33 | } 34 | } 35 | }, { 36 | "taskType" : "REDUCE", 37 | "counters" : { 38 | "org.apache.hadoop.mapreduce.TaskCounter" : { 39 | "PHYSICAL_MEMORY_BYTES" : 433074176, 40 | "GC_TIME_MILLIS" : 671, 41 | "CPU_MILLISECONDS" : 74270, 42 | "COMMITTED_HEAP_BYTES" : 506986496 43 | } 44 | } 45 | }, { 46 | "taskType" : "REDUCE", 47 | "counters" : { 48 | "org.apache.hadoop.mapreduce.TaskCounter" : { 49 | "PHYSICAL_MEMORY_BYTES" : 421924864, 50 | "GC_TIME_MILLIS" : 596, 51 | "CPU_MILLISECONDS" : 64390, 52 | "COMMITTED_HEAP_BYTES" : 506986496 53 | } 54 | } 55 | }, { 56 | 
"taskType" : "REDUCE", 57 | "counters" : { 58 | "org.apache.hadoop.mapreduce.TaskCounter" : { 59 | "PHYSICAL_MEMORY_BYTES" : 421924864, 60 | "GC_TIME_MILLIS" : 596, 61 | "CPU_MILLISECONDS" : 64390, 62 | "COMMITTED_HEAP_BYTES" : 506986496 63 | } 64 | } 65 | } ] 66 | -------------------------------------------------------------------------------- /scalding-commons/src/main/scala/com/twitter/scalding/examples/MergeTest.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.examples 2 | 3 | import scala.annotation.tailrec 4 | 5 | import com.twitter.scalding._ 6 | 7 | /** 8 | * This example job does not yet work. It is a test for Kyro serialization 9 | */ 10 | class MergeTest(args: Args) extends Job(args) { 11 | TextLine(args("input")) 12 | .flatMapTo('word)(_.split("""\s+""")) 13 | .groupBy('word)(_.size) 14 | // Now, let's get the top 10 words: 15 | .groupAll { 16 | _.mapReduceMap(('word, 'size) -> 'list) /* map1 */ { tup: (String, Long) => List(tup) } /* reduce */ { 17 | (l1: List[(String, Long)], l2: List[(String, Long)]) => 18 | mergeSort2(l1, l2, 10, cmpTup) 19 | } /* map2 */ { lout: List[(String, Long)] => 20 | lout 21 | } 22 | } 23 | // Now expand out the list. 24 | .flatMap('list -> ('word, 'cnt)) { list: List[(String, Long)] => list } 25 | .project('word, 'cnt) 26 | .write(Tsv(args("output"))) 27 | 28 | // Reverse sort to get the top items 29 | def cmpTup(t1: (String, Long), t2: (String, Long)) = t2._2.compareTo(t1._2) 30 | 31 | def mergeSort2[T](v1: List[T], v2: List[T], k: Int, cmp: Function2[T, T, Int]) = { 32 | @tailrec 33 | def mergeSortR(acc: List[T], list1: List[T], list2: List[T], k: Int): List[T] = 34 | (list1, list2, k) match { 35 | case (_, _, 0) => acc 36 | case (x1 :: t1, x2 :: t2, _) => { 37 | if (cmp(x1, x2) < 0) { 38 | mergeSortR(x1 :: acc, t1, list2, k - 1) 39 | } else { 40 | mergeSortR(x2 :: acc, list1, t2, k - 1) 41 | } 42 | } 43 | case (x1 :: t1, Nil, _) => mergeSortR(x1 :: acc, t1, Nil, k - 1) 44 | case (Nil, x2 :: t2, _) => mergeSortR(x2 :: acc, Nil, t2, k - 1) 45 | case (Nil, Nil, _) => acc 46 | } 47 | mergeSortR(Nil, v1, v2, k).reverse 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /scalding-core/codegen/tuple_adder_generator.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | # Run it like this: 4 | # 5 | # ruby scripts/tuple_adder_generator.rb > src/main/scala/com/twitter/scalding/GeneratedTupleAdders.scala 6 | 7 | $indent = " " 8 | 9 | TYPES = ('A'..'Z').to_a 10 | 11 | def make_tuple_adder(cnt) 12 | other_cnts = (1..(22-cnt)).to_a 13 | puts "#{$indent}class Tuple#{cnt}Adder[#{TYPES[0..(cnt - 1)].join(",")}](tup : #{get_tuple(0, cnt)}) {" 14 | 15 | # Add the :+ method 16 | puts "#{$indent}#{$indent}def :+[#{TYPES[cnt]}](other : #{TYPES[cnt]}) = {" 17 | puts "#{$indent}#{$indent}#{$indent}(#{tup_get("tup", cnt)},other)" 18 | puts "#{$indent}#{$indent}}" 19 | 20 | # Add the +: method 21 | puts "#{$indent}#{$indent}def +:[#{TYPES[cnt]}](other : #{TYPES[cnt]}) = {" 22 | puts "#{$indent}#{$indent}#{$indent}(other,#{tup_get("tup", cnt)})" 23 | puts "#{$indent}#{$indent}}" 24 | 25 | other_cnts.each do |ocnt| 26 | puts 27 | puts "#{$indent}#{$indent}def ++[#{TYPES[cnt..(cnt + ocnt - 1)].join(",")}](other : #{get_tuple(cnt, ocnt)}) = {" 28 | puts "#{$indent}#{$indent}#{$indent}(#{tup_get("tup", cnt)},#{tup_get("other", ocnt)})" 29 | puts "#{$indent}#{$indent}}" 30 | end 31 | puts 
"#{$indent}}" 32 | puts 33 | puts "#{$indent}implicit def tup#{cnt}ToAdder[#{TYPES[0..(cnt - 1)].join(",")}](tup : Tuple#{cnt}[#{TYPES[0..(cnt - 1)].join(",")}]) = new Tuple#{cnt}Adder(tup)" 34 | end 35 | 36 | def get_tuple(cnt1, cnt2) 37 | "Tuple#{cnt2}[#{TYPES[cnt1..(cnt1 + cnt2 - 1)].join(",")}]" 38 | end 39 | 40 | def tup_get(name, cnt) 41 | (1..cnt).map{ |i| "#{name}._#{i}" }.join(",") 42 | end 43 | 44 | puts "// following were autogenerated by #{__FILE__} at #{Time.now} do not edit" 45 | puts %q|package com.twitter.scalding 46 | 47 | trait GeneratedTupleAdders { 48 | | 49 | 50 | (1..21).each { |c| 51 | make_tuple_adder(c) 52 | puts 53 | } 54 | 55 | puts "}" 56 | puts "// end of autogenerated" 57 | -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/typed/MemorySink.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2014 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.twitter.scalding.typed 17 | 18 | import com.twitter.scalding._ 19 | 20 | import scala.collection.mutable.Buffer 21 | 22 | import java.util.UUID 23 | 24 | import cascading.pipe.Pipe 25 | import cascading.flow.FlowDef 26 | import cascading.scheme.NullScheme 27 | import cascading.tuple.Tuple 28 | 29 | /* 30 | * This is useful for in-memory testing with Execution 31 | * It only works for CascadingLocal mode. 32 | */ 33 | class MemorySink[T] extends TypedSink[T] { 34 | private[this] val buf = Buffer[Tuple]() 35 | private[this] val name: String = UUID.randomUUID.toString 36 | 37 | // takes a copy as of NOW. Don't call this before the job has run 38 | def readResults: Iterable[T] = 39 | buf.iterator.map(_.getObject(0).asInstanceOf[T]).toList 40 | 41 | def setter[U <: T] = TupleSetter.asSubSetter(TupleSetter.singleSetter[T]) 42 | def writeFrom(pipe: Pipe)(implicit flowDef: FlowDef, mode: Mode): Pipe = 43 | mode match { 44 | case cl: CascadingLocal => 45 | val tap = new MemoryTap(new NullScheme(sinkFields, sinkFields), buf) 46 | flowDef.addSink(name, tap) 47 | flowDef.addTail(new Pipe(name, pipe)) 48 | pipe 49 | case _ => sys.error("MemorySink only usable with cascading local") 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /scalding-core/src/test/scala/com/twitter/scalding/ScanLeftTest.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding 2 | 3 | import org.scalatest.{Matchers, WordSpec} 4 | 5 | /** 6 | * Simple Example: First group data by gender and then sort by height reverse order. Then add another column 7 | * for each group which is the rank order of the height. 
8 | */ 9 | class AddRankingWithScanLeft(args: Args) extends Job(args) { 10 | Tsv("input1", ('gender, 'height)).read 11 | .groupBy('gender) { group => 12 | group.sortBy('height).reverse // GroupBuilder is mutable: this sets the group's sort order in place 13 | group.scanLeft('height -> 'rank)(0L) { (rank: Long, height: Double) => 14 | (rank + 1L) 15 | } 16 | } 17 | // scanLeft emits an extra row per group (the seed value), so filter it out 18 | .filter('height) { x: String => x != null } 19 | .debug 20 | .write(Tsv("result1")) 21 | } 22 | 23 | class ScanLeftTest extends WordSpec with Matchers { 24 | import Dsl._ 25 | 26 | // --- A simple ranking job 27 | val sampleInput1 = 28 | List(("male", "165.2"), ("female", "172.2"), ("male", "184.1"), ("male", "125.4"), ("female", "128.6")) 29 | 30 | // Each group sorted and ranked, tallest person to shortest 31 | val expectedOutput1 = Set( 32 | ("male", 184.1, 1), 33 | ("male", 165.2, 2), 34 | ("male", 125.4, 3), 35 | ("female", 172.2, 1), 36 | ("female", 128.6, 2) 37 | ) 38 | 39 | "A simple ranking scanleft job" should { 40 | JobTest(new AddRankingWithScanLeft(_)) 41 | .source(Tsv("input1", ('gender, 'height)), sampleInput1) 42 | .sink[(String, Double, Long)](Tsv("result1")) { outBuf1 => 43 | "produce correct number of records when filtering out null values" in { 44 | outBuf1 should have size 5 45 | } 46 | "create correct ranking per group, 1st being the tallest person of that group" in { 47 | outBuf1.toSet shouldBe expectedOutput1 48 | } 49 | } 50 | .run 51 | .finish() 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /scalding-quotation/src/main/scala/com/twitter/scalding/quotation/Liftables.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.quotation 2 | 3 | import scala.reflect.macros.blackbox.Context 4 | 5 | /** 6 | * These Liftables allow us to lift values into quasiquote trees.
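(In scala.reflect terms, a Liftable[T] describes how to turn a runtime value of type T into a tree, so the value can be spliced into a quasiquote.)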
For example: 7 | * 8 | * def test(v: Source) = q"$v" 9 | * 10 | * uses `sourceLiftable` 11 | */ 12 | trait Liftables { 13 | val c: Context 14 | import c.universe.{TypeName => _, _} 15 | 16 | protected implicit val sourceLiftable: Liftable[Source] = Liftable { case Source(path, line) => 17 | q"_root_.com.twitter.scalding.quotation.Source($path, $line)" 18 | } 19 | 20 | protected implicit val projectionsLiftable: Liftable[Projections] = Liftable { case p => 21 | q"_root_.com.twitter.scalding.quotation.Projections(${p.set})" 22 | } 23 | 24 | protected implicit val typeNameLiftable: Liftable[TypeName] = Liftable { case TypeName(name) => 25 | q"_root_.com.twitter.scalding.quotation.TypeName($name)" 26 | } 27 | 28 | protected implicit val accessorLiftable: Liftable[Accessor] = Liftable { case Accessor(name) => 29 | q"_root_.com.twitter.scalding.quotation.Accessor($name)" 30 | } 31 | 32 | protected implicit val quotedLiftable: Liftable[Quoted] = Liftable { case Quoted(source, call, fa) => 33 | q"_root_.com.twitter.scalding.quotation.Quoted($source, $call, $fa)" 34 | } 35 | 36 | protected implicit val projectionLiftable: Liftable[Projection] = Liftable { 37 | case p: Property => q"$p" 38 | case p: TypeReference => q"$p" 39 | } 40 | 41 | protected implicit val propertyLiftable: Liftable[Property] = Liftable { 42 | case Property(path, accessor, tpe) => 43 | q"_root_.com.twitter.scalding.quotation.Property($path, $accessor, $tpe)" 44 | } 45 | 46 | protected implicit val typeReferenceLiftable: Liftable[TypeReference] = Liftable { 47 | case TypeReference(name) => q"_root_.com.twitter.scalding.quotation.TypeReference($name)" 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /scalding-avro/src/main/scala/com/twitter/scalding/avro/package.scala: -------------------------------------------------------------------------------- 1 | /* Copyright 2013 eBay, inc. 2 | * 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License.
14 | */ 15 | 16 | package com.twitter.scalding 17 | 18 | import cascading.flow.FlowDef 19 | import org.apache.avro.Schema 20 | import collection.JavaConverters._ 21 | import cascading.tuple.Fields 22 | 23 | import com.twitter.scalding.typed.cascading_backend.CascadingExtensions._ 24 | 25 | package object avro { 26 | def writePackedAvro[T](pipe: TypedPipe[T], path: String)(implicit 27 | mf: Manifest[T], 28 | st: AvroSchemaType[T], 29 | conv: TupleConverter[T], 30 | set: TupleSetter[T], 31 | flow: FlowDef, 32 | mode: Mode 33 | ): Unit = { 34 | val sink = PackedAvroSource[T](path) 35 | pipe.write(sink) 36 | } 37 | 38 | def writeUnpackedAvro[T <: Product](pipe: TypedPipe[T], path: String, schema: Schema)(implicit 39 | mf: Manifest[T], 40 | conv: TupleConverter[T], 41 | set: TupleSetter[T], 42 | flow: FlowDef, 43 | mode: Mode 44 | ): Unit = { 45 | import Dsl._ 46 | val sink = UnpackedAvroSource[T](path, Some(schema)) 47 | val outFields = { 48 | val schemaFields = schema.getFields 49 | schemaFields.asScala.foldLeft(new Fields())((cFields, sField) => 50 | cFields.append(new Fields(sField.name())) 51 | ) 52 | } 53 | pipe.toPipe(outFields).write(sink) 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /scalding-args/src/main/scala/com/twitter/scalding/RangedArgs.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package com.twitter.scalding 18 | 19 | object RangedArgs { 20 | implicit def rangedFromArgs(args: Args): RangedArgs = new RangedArgs(args) 21 | } 22 | 23 | case class Range[T](lower: T, upper: T)(implicit ord: Ordering[T]) { 24 | assert(ord.lteq(lower, upper), "Bad range: " + lower + " > " + upper) 25 | 26 | def assertLowerBound(min: T): Unit = 27 | assert(ord.lteq(min, lower), "Range out of bounds: " + lower + " < " + min) 28 | 29 | def assertUpperBound(max: T): Unit = 30 | assert(ord.gteq(max, upper), "Range out of bounds: " + upper + " > " + max) 31 | 32 | def assertBounds(min: T, max: T): Unit = { 33 | assertLowerBound(min) 34 | assertUpperBound(max) 35 | } 36 | 37 | def mkString(sep: String) = 38 | if (ord.equiv(lower, upper)) { 39 | lower.toString 40 | } else { 41 | lower.toString + sep + upper.toString 42 | } 43 | } 44 | 45 | class RangedArgs(args: Args) { 46 | def range[T](argName: String)(cnv: String => T)(implicit ord: Ordering[T]): Range[T] = 47 | args.list(argName) match { 48 | case List(v) => 49 | Range(cnv(v), cnv(v)) 50 | case List(v1, v2) => 51 | Range(cnv(v1), cnv(v2)) 52 | case _ => 53 | throw new IllegalArgumentException(argName + " must have either 1 or 2 values specified") 54 | } 55 | } 56 | --------------------------------------------------------------------------------
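A hedged usage sketch for the RangedArgs API above (not part of the repository; the flag name "--version" and the .toInt conversion are illustrative):

import com.twitter.scalding.Args
import com.twitter.scalding.RangedArgs._

// "--version 1 3" parses to Range(1, 3); a single value yields a degenerate range.
val args = Args(Array("--version", "1", "3"))
val versions = args.range("version")(_.toInt) // via the implicit rangedFromArgs conversion
versions.assertBounds(0, 10)                  // throws AssertionError if the range falls outside [0, 10]
println(versions.mkString("-"))               // prints "1-3"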