├── scalding-thrift-macros ├── NOTICE └── src │ └── main │ └── scala │ └── com │ └── twitter │ └── scalding │ └── thrift │ └── macros │ ├── RequiredBinaryComparators.scala │ └── Macros.scala ├── project ├── build.properties ├── travis-log4j.properties ├── scalding-dagon.scala └── plugins.sbt ├── tutorial ├── data │ ├── graph2.tsv │ ├── hello.txt │ ├── words.txt │ ├── graph.tsv │ ├── word_scores.tsv │ ├── helloDoc.txt │ ├── phones.txt │ ├── docBOW.tsv │ └── session.json ├── CodeSnippets.md ├── .scalding_repl ├── JsonTutorial0.scala ├── AvroTutorial0.scala ├── MatrixTutorial1.scala ├── MatrixTutorial0.scala ├── MatrixTutorial4.scala ├── MatrixTutorial3.scala ├── MatrixTutorial6.scala ├── MatrixTutorial5.scala ├── MatrixTutorial2.scala ├── Tutorial1.scala └── ReplTutorial1.scala ├── logo └── scalding.png ├── scalding-core ├── src │ ├── test │ │ ├── resources │ │ │ └── com │ │ │ │ └── twitter │ │ │ │ └── scalding │ │ │ │ └── test_filesystem │ │ │ │ └── test_data │ │ │ │ └── 2013 │ │ │ │ ├── 04 │ │ │ │ ├── _SUCCESS │ │ │ │ └── 2013-04.txt │ │ │ │ ├── 05 │ │ │ │ ├── _SUCCESS │ │ │ │ └── _ignored │ │ │ │ ├── 06 │ │ │ │ └── _SUCCESS │ │ │ │ ├── 08 │ │ │ │ ├── _SUCCESS │ │ │ │ └── 2013-08.txt │ │ │ │ ├── 03 │ │ │ │ └── 2013-03.txt │ │ │ │ └── 07 │ │ │ │ ├── 2013-07.txt │ │ │ │ └── _SUCCESS │ │ │ │ └── _ignored │ │ └── scala │ │ │ └── com │ │ │ └── twitter │ │ │ └── scalding │ │ │ ├── typed │ │ │ └── InAnotherPackage.scala │ │ │ ├── FlowStateMapTest.scala │ │ │ ├── RegressionTests.scala │ │ │ ├── TypedPipeCheckerTest.scala │ │ │ ├── source │ │ │ └── TypedTextTest.scala │ │ │ ├── estimation │ │ │ └── memory │ │ │ │ └── MemoryEstimatorStepStrategyTest.scala │ │ │ ├── IterableExecutionSerializationTest.scala │ │ │ ├── TypedSketchJoinJobForEmptyKeysTest.scala │ │ │ ├── TestTapFactoryTest.scala │ │ │ ├── TimePathedSourceTest.scala │ │ │ ├── ExecutionUtilTest.scala │ │ │ ├── PathFilterTest.scala │ │ │ └── ScanLeftTest.scala │ └── main │ │ ├── scala │ │ └── com │ │ │ └── twitter │ │ │ └── scalding │ │ │ ├── reducer_estimation │ │ │ ├── package.scala │ │ │ ├── ReducerHistoryEstimator.scala │ │ │ └── ReducerEstimatorConfig.scala │ │ │ ├── serialization │ │ │ ├── RequiredBinaryComparatorsConfig.scala │ │ │ ├── SerializeAsUnit.scala.scala │ │ │ ├── MultiJoinExternalizer.scala │ │ │ ├── Externalizer.scala │ │ │ └── RequiredBinaryComparators.scala │ │ │ ├── typed │ │ │ ├── cascading_backend │ │ │ │ └── DistinctCoGroupJoiner.scala │ │ │ ├── BijectedSourceSink.scala │ │ │ └── MemorySink.scala │ │ │ ├── source │ │ │ ├── NullSink.scala │ │ │ ├── CheckedInversion.scala │ │ │ └── MaxFailuresCheck.scala │ │ │ ├── mathematics │ │ │ ├── Poisson.scala │ │ │ └── Histogram.scala │ │ │ ├── TupleConversions.scala │ │ │ ├── Sortable.scala │ │ │ ├── CascadeJob.scala │ │ │ ├── OptionalSource.scala │ │ │ ├── TypedPipeChecker.scala │ │ │ ├── BijectedOrderedSerialization.scala │ │ │ ├── estimation │ │ │ ├── Common.scala │ │ │ └── HistoryService.scala │ │ │ ├── macros │ │ │ ├── MacroImplicits.scala │ │ │ └── impl │ │ │ │ └── CaseClassFieldSetter.scala │ │ │ ├── Dsl.scala │ │ │ └── TupleArity.scala │ │ └── java │ │ └── com │ │ └── twitter │ │ └── scalding │ │ └── tap │ │ └── GlobHfs.java └── codegen │ ├── function_implicits_generator.rb │ ├── mappable_generator.rb │ ├── typed_source_generator.rb │ └── tuple_adder_generator.rb ├── scalding-parquet ├── src │ ├── test │ │ └── resources │ │ │ └── names.txt │ └── main │ │ ├── scala │ │ └── com │ │ │ └── twitter │ │ │ └── scalding │ │ │ └── parquet │ │ │ └── HasFilterPredicate.scala │ │ 
└── java │ │ └── com │ │ └── twitter │ │ └── scalding │ │ └── parquet │ │ └── tuple │ │ ├── TupleRecordMaterializer.java │ │ └── SchemaIntersection.java └── README.md ├── scalding-commons ├── src │ ├── test │ │ ├── resources │ │ │ └── com │ │ │ │ └── twitter │ │ │ │ └── scalding │ │ │ │ └── test_filesystem │ │ │ │ └── test_data │ │ │ │ └── 2013 │ │ │ │ ├── 10 │ │ │ │ ├── _SUCCESS │ │ │ │ ├── part-00000 │ │ │ │ ├── part-00001 │ │ │ │ └── part-00002 │ │ │ │ └── 09 │ │ │ │ ├── _SUCCESS │ │ │ │ ├── part-00000 │ │ │ │ └── part-00001 │ │ ├── scala │ │ │ └── com │ │ │ │ └── twitter │ │ │ │ └── scalding │ │ │ │ ├── commons │ │ │ │ └── source │ │ │ │ │ ├── typedtext │ │ │ │ │ └── TypedTextTest.scala │ │ │ │ │ └── LzoGenericSourceSpec.scala │ │ │ │ └── WordCountTest.scala │ │ └── java │ │ │ └── com │ │ │ └── twitter │ │ │ └── scalding │ │ │ └── commons │ │ │ └── datastores │ │ │ ├── FSTestCase.java │ │ │ └── TestUtils.java │ └── main │ │ ├── scala │ │ └── com │ │ │ └── twitter │ │ │ └── scalding │ │ │ ├── examples │ │ │ ├── WordCountJob.scala │ │ │ └── MergeTest.scala │ │ │ └── commons │ │ │ ├── scheme │ │ │ └── CombinedSequenceFileScheme.scala │ │ │ └── source │ │ │ ├── FixedPathSources.scala │ │ │ ├── LzoCodecSource.scala │ │ │ └── LzoGenericSource.scala │ │ └── java │ │ └── com │ │ └── twitter │ │ └── scalding │ │ └── commons │ │ └── datastores │ │ └── Utils.java └── codegen │ └── lzotypedtsv_generator.rb ├── scalding-parquet-scrooge ├── src │ ├── test │ │ └── resources │ │ │ └── names.txt │ └── main │ │ ├── scala │ │ └── com │ │ │ └── twitter │ │ │ └── scalding │ │ │ └── parquet │ │ │ └── scrooge │ │ │ └── ParquetScrooge.scala │ │ └── java │ │ └── com │ │ └── twitter │ │ └── scalding │ │ └── parquet │ │ └── scrooge │ │ ├── ParquetScroogeInputFormat.java │ │ ├── ScroogeSchemaConversionException.java │ │ └── ParquetScroogeOutputFormat.java └── README.md ├── docs └── src │ └── main │ ├── tut │ ├── cookbook │ │ ├── cookbook.md │ │ └── hbase.md │ └── cookbook.md │ └── resources │ └── microsite │ └── img │ ├── favicon.png │ ├── navbar_brand.png │ ├── sidebar_brand.png │ ├── navbar_brand2x.png │ └── sidebar_brand2x.png ├── maple └── src │ └── main │ └── java │ └── com │ └── twitter │ └── maple │ └── tap │ ├── TupleWrapper.java │ └── StdoutTap.java ├── scalding-dagon └── src │ ├── main │ ├── scala │ │ └── com │ │ │ └── twitter │ │ │ └── scalding │ │ │ └── dagon │ │ │ ├── package.scala │ │ │ ├── PartialRule.scala │ │ │ ├── FunctionK.scala │ │ │ ├── RefPair.scala │ │ │ ├── Id.scala │ │ │ └── Rule.scala │ ├── scala-2.12- │ │ └── com │ │ │ └── twitter │ │ │ └── scalding │ │ │ └── dagon │ │ │ └── ScalaVersionCompat.scala │ └── scala-2.13+ │ │ └── com │ │ └── twitter │ │ └── scalding │ │ └── dagon │ │ └── ScalaVersionCompat.scala │ └── test │ └── scala │ └── com │ └── twitter │ └── scalding │ └── dagon │ ├── MemoizeTests.scala │ ├── CacheTests.scala │ └── HCacheTests.scala ├── scalding-estimators-test └── src │ └── test │ └── resources │ └── scores.tsv ├── NOTICE ├── scalding-base └── src │ ├── main │ └── scala │ │ └── com │ │ └── twitter │ │ └── scalding │ │ ├── typed │ │ ├── functions │ │ │ ├── ScaldingPriorityQueueMonoid.scala │ │ │ ├── FlatMapping.scala │ │ │ └── EqTypes.scala │ │ ├── OptimizationPhases.scala │ │ ├── memory_backend │ │ │ └── AtomicBox.scala │ │ ├── KeyedPipe.scala │ │ └── WithDescription.scala │ │ ├── StatKey.scala │ │ ├── CPromise.scala │ │ ├── StringUtility.scala │ │ ├── CancellationHandler.scala │ │ ├── UniqueID.scala │ │ └── Mode.scala │ └── test │ └── scala │ └── com │ └── twitter │ 
└── scalding │ └── typed │ ├── TypedPipeMonoidTest.scala │ └── CoGroupableTest.scala ├── scalding-quotation └── src │ ├── test │ └── scala │ │ └── com │ │ └── twitter │ │ └── scalding │ │ └── quotation │ │ ├── package.scala │ │ ├── Person.scala │ │ └── LimitationsTest.scala │ └── main │ └── scala │ └── com │ └── twitter │ └── scalding │ └── quotation │ ├── TreeOps.scala │ ├── Quoted.scala │ └── Liftables.scala ├── scalding-parquet-fixtures └── src │ └── test │ └── resources │ └── test.thrift ├── scripts ├── build_assembly_no_test.sh ├── common.sh ├── test_typed_tutorials.sh ├── test_execution_tutorial.sh ├── test_tutorials.sh ├── README.md ├── test_matrix_tutorials.sh ├── test_repl_tutorial.sh └── testValidator.sh ├── scalding-serialization └── src │ ├── main │ ├── scala │ │ └── com │ │ │ └── twitter │ │ │ └── scalding │ │ │ └── serialization │ │ │ ├── RequireOrderedSerializationMode.scala │ │ │ ├── macros │ │ │ └── impl │ │ │ │ ├── BinaryOrdering.scala │ │ │ │ └── ordered_serialization │ │ │ │ ├── providers │ │ │ │ └── StableKnownDirectSubclasses.scala │ │ │ │ └── runtime_helpers │ │ │ │ ├── MacroEqualityOrderedSerialization.scala │ │ │ │ └── LengthCalculations.scala │ │ │ ├── Laws.scala │ │ │ └── UnsignedComparisons.scala │ └── java │ │ └── com │ │ └── twitter │ │ └── scalding │ │ └── serialization │ │ └── Undeprecated.java │ └── test │ └── scala │ └── com │ └── twitter │ └── scalding │ └── serialization │ ├── macros │ └── ZDifficultTypes.scala │ └── UnsignedComparisonLaws.scala ├── scalding-repl └── src │ └── main │ └── scala │ └── com │ └── twitter │ └── scalding │ └── ILoopCompat.scala ├── scalding-db └── src │ ├── main │ └── scala │ │ └── com │ │ └── twitter │ │ └── scalding │ │ └── db │ │ ├── JdbcStatementSetter.scala │ │ ├── macros │ │ ├── impl │ │ │ ├── handler │ │ │ │ ├── BlobTypeHandler.scala │ │ │ │ ├── DateTypeHandler.scala │ │ │ │ ├── ColumnFormat.scala │ │ │ │ └── NumericTypeHandler.scala │ │ │ ├── DBTypeDescriptorImpl.scala │ │ │ └── JdbcStatementSetterImpl.scala │ │ └── DBMacro.scala │ │ ├── DBTypeDescriptor.scala │ │ ├── package.scala │ │ ├── extensions │ │ └── VerticaExtensions.scala │ │ └── ColumnDefinition.scala │ └── test │ └── scala │ └── com │ └── twitter │ └── scalding │ └── db │ └── DBOptionsTest.scala ├── scalding-hraven ├── README.md └── src │ ├── main │ └── scala │ │ └── com │ │ └── twitter │ │ └── scalding │ │ └── hraven │ │ └── reducer_estimation │ │ └── HRavenBasedEstimator.scala │ └── test │ └── resources │ ├── jobResponse_job_1470171371859_6607542.json │ └── jobResponse_job_1470171371859_6608570.json ├── .travis.blacklist ├── .scalafmt.conf ├── scalding-avro ├── README.md └── src │ └── main │ └── scala │ └── com │ └── twitter │ └── scalding │ └── avro │ └── package.scala ├── codecov.yml ├── COMMITTERS.md ├── scalding-parquet-scrooge-fixtures └── src │ └── test │ └── resources │ └── binary.thrift ├── .gitignore ├── scalding-date └── src │ └── main │ └── scala │ └── com │ └── twitter │ └── scalding │ └── CalendarOps.scala ├── .github └── workflows │ └── publish.yml ├── scalding-hadoop-test └── src │ └── test │ └── scala │ └── com │ └── twitter │ └── scalding │ └── platform │ └── TestJobsWithDescriptions.scala └── scalding-args └── src ├── test └── scala │ └── com │ └── twitter │ └── scalding │ └── RangedArgsSpec.scala └── main └── scala └── com └── twitter └── scalding └── RangedArgs.scala /scalding-thrift-macros/NOTICE: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.5.4 2 | -------------------------------------------------------------------------------- /tutorial/data/graph2.tsv: -------------------------------------------------------------------------------- 1 | 1 2 1 2 | 1 3 1 3 | 2 3 1 4 | -------------------------------------------------------------------------------- /tutorial/data/hello.txt: -------------------------------------------------------------------------------- 1 | Hello world 2 | Goodbye world 3 | -------------------------------------------------------------------------------- /tutorial/data/words.txt: -------------------------------------------------------------------------------- 1 | hello 2 | world 3 | goodbye 4 | -------------------------------------------------------------------------------- /tutorial/data/graph.tsv: -------------------------------------------------------------------------------- 1 | 1 2 1 2 | 1 3 1 3 | 3 2 1 4 | 4 2 2 5 | -------------------------------------------------------------------------------- /tutorial/data/word_scores.tsv: -------------------------------------------------------------------------------- 1 | hello 1.0 2 | world 2.0 3 | goodbye 3.0 -------------------------------------------------------------------------------- /logo/scalding.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/twitter/scalding/HEAD/logo/scalding.png -------------------------------------------------------------------------------- /tutorial/data/helloDoc.txt: -------------------------------------------------------------------------------- 1 | 1 Hello world 2 | 2 See ya soon world 3 | 3 Hello again world 4 | -------------------------------------------------------------------------------- /scalding-core/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/04/_SUCCESS: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scalding-core/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/05/_SUCCESS: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scalding-core/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/05/_ignored: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scalding-core/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/06/_SUCCESS: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scalding-core/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/08/_SUCCESS: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scalding-parquet/src/test/resources/names.txt: -------------------------------------------------------------------------------- 1 | Alice Practice 2 | Bob Hope 3 | Charlie Horse 4 | --------------------------------------------------------------------------------
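A note on the tutorial data above: `graph.tsv` and `graph2.tsv` are tab-separated `(from, to, weight)` edge triples consumed by the matrix tutorials. A minimal sketch of reading one with the typed API (the val name is illustrative; the path is relative to the repo root):

```scala
import com.twitter.scalding._

// Each row of tutorial/data/graph.tsv is an edge: (from, to, weight).
val edges: TypedPipe[(Int, Int, Int)] =
  TypedPipe.from(TypedTsv[(Int, Int, Int)]("tutorial/data/graph.tsv"))
```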
/scalding-commons/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/09/_SUCCESS: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scalding-commons/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/10/_SUCCESS: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scalding-core/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/03/2013-03.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scalding-core/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/04/2013-04.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scalding-core/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/07/2013-07.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scalding-core/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/08/2013-08.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scalding-parquet-scrooge/src/test/resources/names.txt: -------------------------------------------------------------------------------- 1 | Alice Practice 2 | Bob Hope 3 | Charlie Horse 4 | -------------------------------------------------------------------------------- /scalding-core/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/07/_SUCCESS/_ignored: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tutorial/data/phones.txt: -------------------------------------------------------------------------------- 1 | john smith 5551212 30 US 2 | harry bovik 4122680000 55 US 3 | jane doe 4125551212 40 CN 4 | -------------------------------------------------------------------------------- /tutorial/CodeSnippets.md: -------------------------------------------------------------------------------- 1 | Please see the [API reference](https://github.com/twitter/scalding/wiki/API-Reference) on the wiki. 2 | -------------------------------------------------------------------------------- /docs/src/main/tut/cookbook/cookbook.md: -------------------------------------------------------------------------------- 1 | # Cookbook 2 | 3 | In Progress - a cookbook of things you might like to do with Scalding. 
4 | -------------------------------------------------------------------------------- /tutorial/.scalding_repl: -------------------------------------------------------------------------------- 1 | // for use in testing to verify that '.scalding_repl' files are loaded 2 | val scaldingReplInitWasLoaded = true 3 | -------------------------------------------------------------------------------- /docs/src/main/resources/microsite/img/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/twitter/scalding/HEAD/docs/src/main/resources/microsite/img/favicon.png -------------------------------------------------------------------------------- /docs/src/main/resources/microsite/img/navbar_brand.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/twitter/scalding/HEAD/docs/src/main/resources/microsite/img/navbar_brand.png -------------------------------------------------------------------------------- /docs/src/main/resources/microsite/img/sidebar_brand.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/twitter/scalding/HEAD/docs/src/main/resources/microsite/img/sidebar_brand.png -------------------------------------------------------------------------------- /docs/src/main/resources/microsite/img/navbar_brand2x.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/twitter/scalding/HEAD/docs/src/main/resources/microsite/img/navbar_brand2x.png -------------------------------------------------------------------------------- /docs/src/main/resources/microsite/img/sidebar_brand2x.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/twitter/scalding/HEAD/docs/src/main/resources/microsite/img/sidebar_brand2x.png -------------------------------------------------------------------------------- /tutorial/data/docBOW.tsv: -------------------------------------------------------------------------------- 1 | 1 hello 2 2 | 1 twitter 1 3 | 2 conversation 1 4 | 2 celebrities 1 5 | 2 twitter 1 6 | 3 elections 1 7 | 3 debate 1 8 | 3 twitter 1 9 | 3 political 1 10 | -------------------------------------------------------------------------------- /maple/src/main/java/com/twitter/maple/tap/TupleWrapper.java: -------------------------------------------------------------------------------- 1 | package com.twitter.maple.tap; 2 | 3 | import cascading.tuple.Tuple; 4 | 5 | public class TupleWrapper { 6 | public Tuple tuple; 7 | } 8 | -------------------------------------------------------------------------------- /scalding-parquet-scrooge/README.md: -------------------------------------------------------------------------------- 1 | # Parquet-Scrooge support for Scalding 2 | 3 | This module has sources for reading scrooge-generated thrift structs. See the scalding-parquet module for reading apache-thrift (TBase) generated thrift structs. 
4 | -------------------------------------------------------------------------------- /scalding-commons/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/09/part-00000: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/twitter/scalding/HEAD/scalding-commons/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/09/part-00000 -------------------------------------------------------------------------------- /scalding-commons/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/09/part-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/twitter/scalding/HEAD/scalding-commons/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/09/part-00001 -------------------------------------------------------------------------------- /scalding-commons/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/10/part-00000: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/twitter/scalding/HEAD/scalding-commons/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/10/part-00000 -------------------------------------------------------------------------------- /scalding-commons/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/10/part-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/twitter/scalding/HEAD/scalding-commons/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/10/part-00001 -------------------------------------------------------------------------------- /scalding-commons/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/10/part-00002: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/twitter/scalding/HEAD/scalding-commons/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/10/part-00002 -------------------------------------------------------------------------------- /scalding-dagon/src/main/scala/com/twitter/scalding/dagon/package.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding 2 | 3 | /** Collection of graph algorithms */ 4 | package object dagon { 5 | type BoolT[T] = Boolean 6 | type NeighborFn[T] = T => Iterable[T] 7 | } 8 | -------------------------------------------------------------------------------- /scalding-parquet/src/main/scala/com/twitter/scalding/parquet/HasFilterPredicate.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.parquet 2 | 3 | import org.apache.parquet.filter2.predicate.FilterPredicate 4 | 5 | trait HasFilterPredicate { 6 | def withFilter: Option[FilterPredicate] = None 7 | } 8 | -------------------------------------------------------------------------------- /scalding-estimators-test/src/test/resources/scores.tsv: -------------------------------------------------------------------------------- 1 | iphone 0.5 2 | mixtape 0.2 3 | helvetica 0.1 4 | gastropub 0.1 5 | raw 0.05 6 | sustainable 0.01 7 | stumptown 0.75 8 | postironic 0.3 9 | ironic 0.9 10 | pintrest 0.05 11 | selfies 0.2 12 | dreamcatcher 0.65 13 | twitter 0.0 14 | -------------------------------------------------------------------------------- /NOTICE: 
-------------------------------------------------------------------------------- 1 | scalding is a Scala API for Cascading. 2 | Copyright 2012 Twitter, Inc. 3 | 4 | Third Party Dependencies: 5 | 6 | Cascading 2.0 7 | Apache Public License 2.0 8 | http://www.cascading.org 9 | 10 | Hadoop 0.20.2 11 | Apache Public License 2.0 12 | http://hadoop.apache.org 13 | 14 | -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/package.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding 2 | 3 | package object reducer_estimation { 4 | def median(xs: Seq[Double]): Option[Double] = xs.sorted.lift(xs.length / 2) 5 | def mean(xs: Seq[Double]): Option[Double] = if (xs.isEmpty) None else Some(xs.sum / xs.length) 6 | } 7 | -------------------------------------------------------------------------------- /docs/src/main/tut/cookbook.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: docs 3 | title: "Cookbook" 4 | section: "cookbook" 5 | position: 1 6 | --- 7 | 8 | {% include_relative cookbook/cookbook.md %} 9 | 10 | ## Index 11 | 12 | {% for x in site.pages %} 13 | {% if x.section == 'cookbook' %} 14 | - [{{x.title}}]({{site.baseurl}}{{x.url}}) 15 | {% endif %} 16 | {% endfor %} 17 | -------------------------------------------------------------------------------- /tutorial/data/session.json: -------------------------------------------------------------------------------- 1 | {"sessionId": "efdc0698-66ea-4d3a-ac6e-fab9fd97a78b"} 2 | {"sessionId": "efdc0698-66ea-4d3a-ac6e-fab9fd97a78b"} 3 | {"sessionId": "efdc0698-66ea-4d3a-ac6e-fab9fd97a78b"} 4 | {"sessionId": "efdc0698-66ea-4d3a-ac6e-fab9fd97a78b"} 5 | {"sessionId": "efdc0698-66ea-4d3a-ac6e-fab9fd97a78b"} 6 | {"sessionId": "efdc0698-66ea-4d3a-ac6e-fab9fd97a78b"} 7 | -------------------------------------------------------------------------------- /scalding-base/src/main/scala/com/twitter/scalding/typed/functions/ScaldingPriorityQueueMonoid.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.typed.functions 2 | 3 | import com.twitter.algebird.mutable.PriorityQueueMonoid 4 | 5 | class ScaldingPriorityQueueMonoid[K]( 6 | val count: Int 7 | )(implicit val ordering: Ordering[K]) 8 | extends PriorityQueueMonoid[K](count)(ordering) 9 | -------------------------------------------------------------------------------- /scalding-quotation/src/test/scala/com/twitter/scalding/quotation/package.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding 2 | 3 | import org.scalatest.MustMatchers 4 | import org.scalatest.FreeSpec 5 | 6 | package object quotation { 7 | def typeName[T](implicit ct: reflect.ClassTag[T]) = TypeName(ct.runtimeClass.getName) 8 | trait Test extends FreeSpec with MustMatchers 9 | } 10 | -------------------------------------------------------------------------------- /scalding-parquet-fixtures/src/test/resources/test.thrift: -------------------------------------------------------------------------------- 1 | namespace java com.twitter.scalding.parquet.thrift_java.test 2 | #@namespace scala com.twitter.scalding.parquet.thrift_scala.test 3 | 4 | struct Name { 5 | 1: required string first_name, 6 | 2: optional string last_name 7 | } 8 | 9 | struct Address { 10 | 1: string street, 11 | 2: required string zip 12 | } 
13 | -------------------------------------------------------------------------------- /scripts/build_assembly_no_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -exv 2 | 3 | # Identify the bin dir in the distribution, and source the common include script 4 | BASE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/.. && pwd )" 5 | TARGET=$1 6 | 7 | cd $BASE_DIR 8 | sed -i'' -e 's/\/\/ test in assembly/test in assembly/g' build.sbt 9 | 10 | time ./sbt ++$TRAVIS_SCALA_VERSION $TARGET/assembly 11 | -------------------------------------------------------------------------------- /scalding-serialization/src/main/scala/com/twitter/scalding/serialization/RequireOrderedSerializationMode.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.serialization 2 | 3 | sealed trait RequireOrderedSerializationMode extends Serializable 4 | object RequireOrderedSerializationMode { 5 | case object Fail extends RequireOrderedSerializationMode 6 | case object Log extends RequireOrderedSerializationMode 7 | } 8 | -------------------------------------------------------------------------------- /scalding-repl/src/main/scala/com/twitter/scalding/ILoopCompat.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding 2 | 3 | import java.io.BufferedReader 4 | 5 | import scala.tools.nsc.interpreter.ILoop 6 | import scala.tools.nsc.interpreter.JPrintWriter 7 | 8 | class ILoopCompat(in: Option[BufferedReader], out: JPrintWriter) extends ILoop(in, out) { 9 | def addThunk(f: => Unit): Unit = intp.initialize(f) 10 | } 11 | -------------------------------------------------------------------------------- /scalding-dagon/src/main/scala/com/twitter/scalding/dagon/PartialRule.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.dagon 2 | 3 | /** 4 | * Often a partial function is an easier way to express rules 5 | */ 6 | trait PartialRule[N[_]] extends Rule[N] { 7 | final def apply[T](on: Dag[N]): N[T] => Option[N[T]] = 8 | applyWhere[T](on).lift 9 | 10 | def applyWhere[T](on: Dag[N]): PartialFunction[N[T], N[T]] 11 | } 12 | -------------------------------------------------------------------------------- /scalding-db/src/main/scala/com/twitter/scalding/db/JdbcStatementSetter.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.db 2 | 3 | import java.sql.PreparedStatement 4 | import scala.util.Try 5 | 6 | /** 7 | * Case class to JDBC statement setter used for database writes 8 | */ 9 | trait JdbcStatementSetter[T] extends java.io.Serializable { self => 10 | def apply(t: T, s: PreparedStatement): Try[PreparedStatement] 11 | } 12 | -------------------------------------------------------------------------------- /scalding-base/src/main/scala/com/twitter/scalding/typed/OptimizationPhases.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.typed 2 | 3 | import com.twitter.scalding.dagon.Rule 4 | 5 | /** 6 | * This is a class to allow customization of how we plan typed pipes 7 | */ 8 | abstract class OptimizationPhases { 9 | def phases: Seq[Rule[TypedPipe]] 10 | } 11 | 12 | final class EmptyOptimizationPhases extends OptimizationPhases { 13 | def phases = Nil 14 | } 15 | -------------------------------------------------------------------------------- 
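`OptimizationPhases` above is the extension point for customizing how typed pipes are planned. A hedged sketch of a non-empty implementation, assuming the `OptimizationRules.standardMapReduceRules` rule list that ships in scalding-base (the class name is illustrative):

```scala
import com.twitter.scalding.dagon.Rule
import com.twitter.scalding.typed.{OptimizationPhases, OptimizationRules, TypedPipe}

// Reuse the stock rule set as the phase list instead of Nil.
final class StandardPhases extends OptimizationPhases {
  def phases: Seq[Rule[TypedPipe]] = OptimizationRules.standardMapReduceRules
}
```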
/scripts/common.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -exv 2 | 3 | # Not sure what the right default is here: trying nonzero. 4 | SCALA_EXIT_STATUS=127 5 | SAVED_STTY="" 6 | 7 | SCALD="${BASE_DIR}/scripts/scald.rb --local" 8 | SCALD_REPL="${BASE_DIR}/scripts/scald.rb --repl --local" 9 | 10 | echo "using TRAVIS_SCALA_VERSION ${TRAVIS_SCALA_VERSION}" 11 | SCALD="$SCALD --scalaversion ${TRAVIS_SCALA_VERSION}" 12 | SCALD_REPL="$SCALD_REPL --scalaversion ${TRAVIS_SCALA_VERSION}" 13 | -------------------------------------------------------------------------------- /scalding-db/src/test/scala/com/twitter/scalding/db/DBOptionsTest.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.db 2 | 3 | import org.scalacheck.Properties 4 | import org.scalacheck.Prop._ 5 | 6 | object DBOptionsTest extends Properties("DBOptions") { 7 | property("password") = forAll { x: String => 8 | ("Password toString should not be equal to x" |: Password(x).toString != x) && 9 | ("Password toStr should be equal to x" |: Password(x).toStr == x) 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /scalding-core/src/test/scala/com/twitter/scalding/typed/InAnotherPackage.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.example.scalding.typed 2 | 3 | import com.twitter.scalding._ 4 | import scala.concurrent.{ExecutionContext => SExecutionContext, _} 5 | import SExecutionContext.Implicits.global 6 | 7 | object InAnotherPackage { 8 | def buildF: Future[TypedPipe[(Int, Int)]] = 9 | Future { 10 | TypedPipe 11 | .from(List(1, 2, 3, 4, 555, 3)) 12 | .map { case x => (x, x) } 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /scalding-hraven/README.md: -------------------------------------------------------------------------------- 1 | # hRaven Extensions 2 | This module includes additions to Scalding that make use of [hRaven](https://github.com/twitter/hraven) for querying job history. 3 | 4 | ## Reducer Estimation 5 | Reducer estimators can include the `HRavenHistory` trait to get additional functionality for querying hRaven for past jobs. 6 | 7 | For example, `RatioBasedReducerEstimator`, also in this module, uses hRaven job history to better estimate reducers based on the ratio of mapper to reducer input data. 
8 | -------------------------------------------------------------------------------- /project/travis-log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootCategory=DEBUG, console 2 | log4j.threshold=ALL 3 | 4 | log4j.category.cascading=WARN 5 | log4j.category.com.twitter=INFO 6 | log4j.logger.org.apache.hadoop=ERROR 7 | log4j.logger.cascading.flow=WARN 8 | log4j.logger.cascading.tap=WARN 9 | 10 | 11 | log4j.appender.console=org.apache.log4j.ConsoleAppender 12 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 13 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n 14 | -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/serialization/RequiredBinaryComparatorsConfig.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.serialization 2 | 3 | import com.twitter.scalding.{Config, Job} 4 | 5 | trait RequiredBinaryComparatorsConfig extends Job { 6 | def requireOrderedSerializationMode: RequireOrderedSerializationMode = RequireOrderedSerializationMode.Fail 7 | override def config = 8 | super.config + (Config.ScaldingRequireOrderedSerialization -> requireOrderedSerializationMode.toString) 9 | } 10 | -------------------------------------------------------------------------------- /scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/BinaryOrdering.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.serialization.macros.impl 2 | 3 | import com.twitter.scalding.serialization.OrderedSerialization 4 | 5 | import scala.language.experimental.macros 6 | 7 | trait BinaryOrdering { 8 | implicit def ordSer[T]: OrderedSerialization[T] = 9 | macro com.twitter.scalding.serialization.macros.impl.OrderedSerializationProviderImpl[T] 10 | } 11 | 12 | object BinaryOrdering extends BinaryOrdering 13 | -------------------------------------------------------------------------------- /.travis.blacklist: -------------------------------------------------------------------------------- 1 | #This describes extra builds our validator will pretend to run in CI but won't 2 | # Remember we run most builds twice, so if you want it disabled for both 2.10 and 2.11 it needs to be here twice 3 | scalding-benchmarks 4 | scalding-benchmarks 5 | # These are just for fixtures, so blacklist for 2.10 and 2.11 6 | scalding-thrift-macros-fixtures 7 | scalding-thrift-macros-fixtures 8 | scalding-parquet-fixtures 9 | scalding-parquet-fixtures 10 | scalding-parquet-scrooge-fixtures 11 | scalding-parquet-scrooge-fixtures 12 | -------------------------------------------------------------------------------- /tutorial/JsonTutorial0.scala: -------------------------------------------------------------------------------- 1 | /** 2 | Scalding with Json tutorial part 0. 
3 | 4 | To run this job: 5 | scripts/scald.rb --local --json tutorial/JsonTutorial0.scala 6 | 7 | Check the output: 8 | cat tutorial/data/jsonoutput0.tsv 9 | 10 | **/ 11 | 12 | import com.twitter.scalding.{Job, Args, JsonLine, Tsv} 13 | 14 | class JsonTutorial0(args: Args) extends Job(args) { 15 | JsonLine("tutorial/data/session.json", ('sessionId)).read 16 | .groupBy('sessionId){_.size} 17 | .write(Tsv("tutorial/data/jsonoutput0.tsv")) 18 | } 19 | -------------------------------------------------------------------------------- /scalding-commons/src/main/scala/com/twitter/scalding/examples/WordCountJob.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.examples 2 | 3 | import com.twitter.scalding._ 4 | 5 | class WordCountJob(args: Args) extends Job(args) { 6 | TypedPipe 7 | .from(TextLine(args("input"))) 8 | .flatMap(line => line.split("\\s+")) 9 | .map(word => (word, 1L)) 10 | .sumByKey 11 | // The compiler will enforce the type coming out of the sumByKey is the same as the type we have for our sink 12 | .write(TypedTsv[(String, Long)](args("output"))) 13 | } 14 | -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/typed/cascading_backend/DistinctCoGroupJoiner.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.typed.cascading_backend 2 | 3 | import com.twitter.scalding.TupleGetter 4 | import com.twitter.scalding.typed.MultiJoinFunction 5 | 6 | // If all the input pipes are unique, this works: 7 | class DistinctCoGroupJoiner[K]( 8 | count: Int, 9 | getter: TupleGetter[K], 10 | @transient joinF: MultiJoinFunction[K, Any] 11 | ) extends CoGroupedJoiner[K](count, getter, joinF) { 12 | val distinctSize = count 13 | def distinctIndexOf(idx: Int) = idx 14 | } 15 | -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/serialization/SerializeAsUnit.scala.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.serialization 2 | 3 | import com.esotericsoftware.kryo.Kryo 4 | import com.esotericsoftware.kryo.{Serializer => KSerializer} 5 | import com.esotericsoftware.kryo.io.{Input, Output} 6 | 7 | // We use this for TypedPipe subclasses which should never be needed when we run 8 | class SerializeAsUnit[T >: Null] extends KSerializer[T] { 9 | override def write(kryo: Kryo, output: Output, t: T): Unit = () 10 | override def read(kryo: Kryo, input: Input, t: Class[T]): T = null 11 | } 12 | -------------------------------------------------------------------------------- /scalding-quotation/src/test/scala/com/twitter/scalding/quotation/Person.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.quotation 2 | 3 | case class Contact(phone: String) 4 | case class Person(name: String, contact: Contact, alternativeContact: Option[Contact]) 5 | 6 | object Person { 7 | val typeReference = TypeReference(typeName[Person]) 8 | val nameProjection = typeReference.andThen(Accessor("name"), typeName[String]) 9 | val contactProjection = typeReference.andThen(Accessor("contact"), typeName[Contact]) 10 | val phoneProjection = contactProjection.andThen(Accessor("phone"), typeName[String]) 11 | } 12 | -------------------------------------------------------------------------------- 
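`SerializeAsUnit` above is a Kryo serializer that writes nothing and reads back `null`. A sketch of how it might be registered, using the `TypedPipe` case its comment mentions (the registration site is an assumption; scalding's actual Kryo setup may wire this differently):

```scala
import com.esotericsoftware.kryo.Kryo
import com.twitter.scalding.serialization.SerializeAsUnit
import com.twitter.scalding.typed.TypedPipe

// Any TypedPipe captured in a closure is written as nothing and read back as
// null, since it should never be needed once the job is actually running.
val kryo = new Kryo()
kryo.addDefaultSerializer(classOf[TypedPipe[Any]], new SerializeAsUnit[TypedPipe[Any]])
```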
/scalding-commons/src/test/scala/com/twitter/scalding/commons/source/typedtext/TypedTextTest.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.commons.source 2 | 3 | import com.twitter.scalding._ 4 | 5 | import org.scalatest.FunSuite 6 | 7 | case class Test1(a: Int, b: Long, c: Option[Double]) 8 | case class Test2(one: Test1, d: String) 9 | 10 | class TypedTextTest extends FunSuite { 11 | test("Test with a nested tuple: Daily") { 12 | val source = 13 | LzoTypedText.dailyLzoTsv[Test2]("myPath")(DateRange(RichDate.now, RichDate.now + Hours(1)), implicitly) 14 | assert(source.sourceFields.size == 4) 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /scalding-base/src/main/scala/com/twitter/scalding/StatKey.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding 2 | 3 | case class StatKey(counter: String, group: String) extends java.io.Serializable 4 | 5 | object StatKey { 6 | // This is implicit to allow Stat("c", "g") to work. 7 | implicit def fromCounterGroup(counterGroup: (String, String)): StatKey = counterGroup match { 8 | case (c, g) => StatKey(c, g) 9 | } 10 | // Create a Stat in the ScaldingGroup 11 | implicit def fromCounterDefaultGroup(counter: String): StatKey = 12 | StatKey(counter, ScaldingGroup) 13 | 14 | val ScaldingGroup = "Scalding Custom" 15 | } 16 | -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/source/NullSink.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.source 2 | 3 | import com.twitter.scalding.typed.TypedSink 4 | import com.twitter.scalding.{BaseNullSource, TupleSetter} 5 | 6 | /** 7 | * This can be used to cause cascading to run a flow, but discard the output. The only place this is likely of 8 | use is to accomplish some task in a non-recommended, but sometimes most expedient, way. 
9 | */ 10 | object NullSink extends BaseNullSource with TypedSink[Any] { 11 | def setter[U <: Any] = TupleSetter.asSubSetter[Any, U](TupleSetter.singleSetter) 12 | } 13 | -------------------------------------------------------------------------------- /.scalafmt.conf: -------------------------------------------------------------------------------- 1 | version = 3.5.1 2 | maxColumn = 110 3 | docstrings.style = Asterisk 4 | newlines.alwaysBeforeMultilineDef = false 5 | newlines.penalizeSingleSelectMultiArgList = false 6 | align.openParenCallSite = false 7 | rewrite.rules = [AvoidInfix, SortImports, RedundantBraces, RedundantParens, PreferCurlyFors] 8 | rewrite.redundantBraces.generalExpressions = false 9 | 10 | # scalafmt can only choose one scala version target per file to format 11 | # we have to use 212 for build.sbt or else we get failures 12 | runner.dialect = scala211 13 | fileOverride { 14 | "glob:**build.sbt" { 15 | runner.dialect = scala212 16 | } 17 | } -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/mathematics/Poisson.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.mathematics 2 | 3 | import scala.util.Random 4 | 5 | /** 6 | * Generating Poisson-distributed random variables according to Donald Knuth's algorithm as shown on 7 | * Wikipedia's Poisson Distribution page 8 | */ 9 | 10 | class Poisson(fraction: Double, seed: Int) { 11 | 12 | val L = math.exp(-fraction) 13 | val randomGenerator = new Random(seed) 14 | 15 | def nextInt = { 16 | var k = 0 17 | var p = 1.0 18 | do { 19 | k = k + 1 20 | p = p * randomGenerator.nextDouble 21 | } while (p > L) 22 | k - 1 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /scalding-commons/src/test/java/com/twitter/scalding/commons/datastores/FSTestCase.java: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.commons.datastores; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.FileSystem; 5 | 6 | import java.io.IOException; 7 | 8 | public class FSTestCase { 9 | public FileSystem local; 10 | public FileSystem fs; 11 | 12 | public FSTestCase() { 13 | try { 14 | local = FileSystem.getLocal(new Configuration()); 15 | fs = FileSystem.get(new Configuration()); 16 | } catch(IOException e) { 17 | throw new RuntimeException(e); 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /scalding-avro/README.md: -------------------------------------------------------------------------------- 1 | This module contains Avro support for Scalding. It's based on the cascading.avro project, 2 | https://github.com/ScaleUnlimited/cascading.avro . 3 | 4 | In some cases Kryo (the default serializer used by Scalding) doesn't work well with Avro objects. If you run into 5 | serialization errors, or if you want to preempt any trouble, you should add the following to your Job class: 6 | ```scala 7 | override def ioSerializations = 8 | super.ioSerializations :+ classOf[cascading.avro.serialization.AvroSpecificRecordSerialization[_]] 9 | ``` 10 | 11 | This will use cascading.avro's Avro SpecificRecord serialization for Avro objects in place of the Kryo serialization. 
12 | 13 | -------------------------------------------------------------------------------- /scalding-commons/src/test/java/com/twitter/scalding/commons/datastores/TestUtils.java: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.commons.datastores; 2 | 3 | import org.apache.hadoop.fs.FileSystem; 4 | import org.apache.hadoop.fs.Path; 5 | 6 | import java.io.IOException; 7 | 8 | public class TestUtils { 9 | 10 | private static final String TMP_ROOT = "/tmp/unittests"; 11 | 12 | public static String getTmpPath(FileSystem fs, String name) throws IOException { 13 | fs.mkdirs(new Path(TMP_ROOT)); 14 | String full = TMP_ROOT + "/" + name; 15 | if (fs.exists(new Path(full))) { 16 | fs.delete(new Path(full), true); 17 | } 18 | return full; 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /scalding-dagon/src/main/scala/com/twitter/scalding/dagon/FunctionK.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.dagon 2 | 3 | import java.io.Serializable 4 | 5 | /** 6 | * This is a Natural transformation. 7 | * 8 | * For any type X, this type can produce a function from T[X] to R[X]. 9 | */ 10 | trait FunctionK[T[_], R[_]] extends Serializable { 11 | def apply[U](tu: T[U]): R[U] = 12 | toFunction[U](tu) 13 | 14 | def toFunction[U]: T[U] => R[U] 15 | } 16 | 17 | object FunctionK { 18 | def andThen[A[_], B[_], C[_]](first: FunctionK[A, B], second: FunctionK[B, C]): FunctionK[A, C] = 19 | new FunctionK[A, C] { 20 | def toFunction[U] = first.toFunction[U].andThen(second.toFunction[U]) 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /scripts/test_typed_tutorials.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -exv 2 | 3 | # Identify the bin dir in the distribution, and source the common include script 4 | BASE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/.. 
&& pwd )" 5 | source ${BASE_DIR}/scripts/common.sh 6 | 7 | # Test TypedTutorial cases 8 | for t in 1 2 3 4 5 pipes block; do 9 | echo "--------------------" 10 | echo "TypedTutorial: $t" 11 | echo "--------------------" 12 | time $SCALD tutorial/TypedTutorial.scala \ 13 | --tutorial $t \ 14 | --input tutorial/data/hello.txt \ 15 | --output tutorial/data/output0.txt \ 16 | --words tutorial/data/word_scores.tsv 17 | echo "--------------------" 18 | cat tutorial/data/output0.txt 19 | done 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | # note that if an org-wide global config is configured, it will be merged (with duplicate settings taking priority from this file) 2 | # it's better to explicitly set all configs if you want consistency 3 | 4 | codecov: 5 | require_ci_to_pass: yes 6 | 7 | coverage: 8 | precision: 2 9 | round: down 10 | range: "0...100" # acceptable coverage range 11 | 12 | # default behaviour 13 | parsers: 14 | gcov: 15 | branch_detection: 16 | conditional: yes 17 | loop: yes 18 | method: no 19 | macro: no 20 | 21 | # can be configured in https://docs.codecov.com/docs/pull-request-comments 22 | comment: 23 | layout: "reach,diff,flags,files,footer" 24 | behavior: default 25 | require_changes: no 26 | -------------------------------------------------------------------------------- /scalding-core/src/test/scala/com/twitter/scalding/FlowStateMapTest.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding 2 | 3 | import org.scalatest.FunSuite 4 | 5 | import cascading.flow.FlowDef 6 | import com.twitter.scalding.source.{NullSink, TypedText} 7 | import com.twitter.scalding.typed.cascading_backend.{CascadingBackend, CascadingExtensions} 8 | 9 | import CascadingExtensions._ 10 | 11 | class FlowStateMapTest extends FunSuite { 12 | test("make sure sourcemap isn't empty after planning") { 13 | implicit val fd = new FlowDef 14 | implicit val m = Local(false) 15 | val t = TypedPipe.from(TypedText.tsv[String]("")).write(NullSink) 16 | CascadingBackend.planTypedWrites(fd, m) 17 | val state = FlowStateMap(fd) 18 | assert(state.sourceMap.nonEmpty) 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /scalding-dagon/src/main/scala-2.12-/com/twitter/scalding/dagon/ScalaVersionCompat.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.dagon 2 | 3 | object ScalaVersionCompat { 4 | type LazyList[+A] = scala.collection.immutable.Stream[A] 5 | val LazyList = scala.collection.immutable.Stream 6 | 7 | type IterableOnce[+A] = scala.collection.TraversableOnce[A] 8 | 9 | def iterateOnce[A](as: IterableOnce[A]): Iterator[A] = 10 | as.toIterator 11 | 12 | def lazyList[A](as: A*): LazyList[A] = 13 | Stream(as: _*) 14 | 15 | def lazyListToIterator[A](lst: LazyList[A]): Iterator[A] = 16 | lst.iterator 17 | 18 | def lazyListFromIterator[A](it: Iterator[A]): LazyList[A] = 19 | it.toStream 20 | 21 | implicit val ieeeDoubleOrdering: Ordering[Double] = 22 | Ordering.Double 23 | } 24 | -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/TupleConversions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 
3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.twitter.scalding 17 | 18 | @deprecated("This trait does nothing now", "0.9.0") 19 | trait TupleConversions 20 | -------------------------------------------------------------------------------- /scalding-dagon/src/main/scala-2.13+/com/twitter/scalding/dagon/ScalaVersionCompat.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.dagon 2 | 3 | object ScalaVersionCompat { 4 | type LazyList[+A] = scala.collection.immutable.LazyList[A] 5 | val LazyList = scala.collection.immutable.LazyList 6 | 7 | type IterableOnce[+A] = scala.collection.IterableOnce[A] 8 | 9 | def iterateOnce[A](as: IterableOnce[A]): Iterator[A] = 10 | as.iterator 11 | 12 | def lazyList[A](as: A*): LazyList[A] = 13 | LazyList(as: _*) 14 | 15 | def lazyListToIterator[A](lst: LazyList[A]): Iterator[A] = 16 | lst.iterator 17 | 18 | def lazyListFromIterator[A](it: Iterator[A]): LazyList[A] = 19 | LazyList.from(it) 20 | 21 | implicit val ieeeDoubleOrdering: Ordering[Double] = 22 | Ordering.Double.IeeeOrdering 23 | } 24 | -------------------------------------------------------------------------------- /scalding-core/codegen/function_implicits_generator.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | # Run it like this: 4 | # 5 | # ruby codegen/function_implicits_generator.rb > src/main/scala/com/twitter/scalding/FunctionImplicits.scala 6 | 7 | $indent = " " 8 | 9 | def puts_function_to_tupled(cnt) 10 | gen = (1 .. cnt).map{ |i| "T#{i}" }.join(", ") 11 | puts "#{$indent}implicit def function#{cnt}ToTupledFunction1[#{gen}, R](f: Function#{cnt}[#{gen}, R]): Function1[(#{gen}), R] = f.tupled" 12 | end 13 | 14 | puts "// following were autogenerated by #{__FILE__} at #{Time.now} do not edit" 15 | puts "package com.twitter.scalding" 16 | puts 17 | puts"object FunctionImplicits {" 18 | puts 19 | 20 | (2 .. 22).each { |c| 21 | puts_function_to_tupled(c) 22 | puts 23 | } 24 | 25 | puts "}" 26 | puts "// end of autogenerated" 27 | -------------------------------------------------------------------------------- /scalding-thrift-macros/src/main/scala/com/twitter/scalding/thrift/macros/RequiredBinaryComparators.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.thrift.macros 2 | 3 | import com.twitter.scalding.serialization.{OrderedSerialization, RequiredBinaryComparatorsConfig} 4 | import com.twitter.scalding.thrift.macros.impl.ScroogeInternalOrderedSerializationImpl 5 | import scala.language.experimental.{macros => smacros} 6 | 7 | /** 8 | * Provides support for Scrooge classes in addition to primitives, case classes, tuples etc. Use this if you 9 | * use Scrooge classes as `key` in your scalding job. 10 | * @author 11 | * Mansur Ashraf. 
12 | */ 13 | trait RequiredBinaryComparators extends RequiredBinaryComparatorsConfig { 14 | implicit def ordSer[T]: OrderedSerialization[T] = macro ScroogeInternalOrderedSerializationImpl[T] 15 | } 16 | -------------------------------------------------------------------------------- /scalding-parquet/src/main/java/com/twitter/scalding/parquet/tuple/TupleRecordMaterializer.java: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.parquet.tuple; 2 | 3 | import cascading.tuple.Tuple; 4 | 5 | import org.apache.parquet.io.api.GroupConverter; 6 | import org.apache.parquet.io.api.RecordMaterializer; 7 | import org.apache.parquet.schema.GroupType; 8 | 9 | public class TupleRecordMaterializer extends RecordMaterializer { 10 | 11 | private ParquetTupleConverter root; 12 | 13 | public TupleRecordMaterializer(GroupType parquetSchema) { 14 | this.root = new ParquetTupleConverter(parquetSchema); 15 | } 16 | 17 | @Override 18 | public Tuple getCurrentRecord() { 19 | return root.getCurrentTuple(); 20 | } 21 | 22 | @Override 23 | public GroupConverter getRootConverter() { 24 | return root; 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /scripts/test_execution_tutorial.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -exv 2 | 3 | # Identify the bin dir in the distribution, and source the common include script 4 | BASE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/.. && pwd )" 5 | source ${BASE_DIR}/scripts/common.sh 6 | SHORT_SCALA_VERSION=${TRAVIS_SCALA_VERSION%.*} 7 | SCALDING_VERSION=`./sbt -Dsbt.log.noformat=true -Dsbt.supershell=false "print scalding-core / version" -error` 8 | 9 | export CLASSPATH=tutorial/execution-tutorial/target/scala-${SHORT_SCALA_VERSION}/execution-tutorial-assembly-${SCALDING_VERSION}.jar 10 | time java -jar tutorial/execution-tutorial/target/scala-${SHORT_SCALA_VERSION}/execution-tutorial-assembly-${SCALDING_VERSION}.jar \ 11 | com.twitter.scalding.tutorial.MyExecJob --local \ 12 | --input tutorial/data/hello.txt \ 13 | --output tutorial/data/execution_output.txt 14 | -------------------------------------------------------------------------------- /scalding-dagon/src/main/scala/com/twitter/scalding/dagon/RefPair.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.dagon 2 | 3 | import scala.util.hashing.MurmurHash3 4 | 5 | /** 6 | * A tuple2 that uses reference equality on its items for equality, which is useful for caching the results 7 | * of pair-wise functions on DAGs. 8 | * 9 | * Without this, you can easily get exponential complexity on recursion on DAGs. 
10 |  */
11 | case class RefPair[A <: AnyRef, B <: AnyRef](_1: A, _2: B) {
12 |
13 |   override lazy val hashCode: Int = MurmurHash3.productHash(this)
14 |
15 |   override def equals(that: Any): Boolean = that match {
16 |     case RefPair(thatA, thatB) => (_1 eq thatA) && (_2 eq thatB)
17 |     case _                     => false
18 |   }
19 |
20 |   /**
21 |    * true if the left is referentially equal to the right
22 |    */
23 |   def itemsEq: Boolean = _1 eq _2
24 | }
--------------------------------------------------------------------------------
/scalding-commons/codegen/lzotypedtsv_generator.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 |
3 | # Run it like this:
4 | #
5 | #   ./codegen/lzotypedtsv_generator.rb > src/main/scala/com/twitter/scalding/commons/source/GeneratedLzoTypedTsv.scala
6 |
7 | $indent = " "
8 |
9 | TYPES = ('A'..'Z').to_a
10 |
11 | def make_typed_tsv(cnt)
12 |   typeString = TYPES[0..(cnt - 1)].join(",")
13 |   puts "trait LzoTypedTsv#{cnt}[#{typeString}] extends LzoTypedTsv[Tuple#{cnt}[#{typeString}]] with Mappable#{cnt}[#{typeString}]"
14 | end
15 |
16 | puts "// following were autogenerated by #{__FILE__} at #{Time.now} do not edit"
17 | puts %q|package com.twitter.scalding.commons.source
18 |
19 | import com.twitter.scalding._
20 |
21 | |
22 |
23 | (1..22).each { |c|
24 |   make_typed_tsv(c)
25 |   puts
26 | }
27 |
28 | puts "// end of autogenerated"
--------------------------------------------------------------------------------
/scalding-core/src/test/scala/com/twitter/scalding/RegressionTests.scala:
--------------------------------------------------------------------------------
1 | package com.twitter.scalding
2 |
3 | import org.scalatest.FunSuite
4 |
5 | class RegressionTests extends FunSuite {
6 |   test("hashJoins + merges that fail in cascading 3") {
7 |     val p1 =
8 |       TypedPipe
9 |         .from(List(1, 2))
10 |         .cross(TypedPipe.from(List(3, 4)))
11 |
12 |     val p2 =
13 |       TypedPipe
14 |         .from(List(5, 6))
15 |         .cross(TypedPipe.from(List(8, 9)))
16 |
17 |     val p3 = p1 ++ p2
18 |     val p4 = TypedPipe.from(List((8, 1), (10, 2))) ++ p3
19 |
20 |     val expected = List((1, 3), (1, 4), (2, 3), (2, 4), (5, 8), (5, 9), (6, 8), (6, 9), (8, 1), (10, 2))
21 |     val values = p4.toIterableExecution
22 |       .waitFor(Config.empty, Local(true))
23 |       .get
24 |     assert(values.toList.sorted == expected)
25 |   }
26 | }
--------------------------------------------------------------------------------
/project/scalding-dagon.scala:
--------------------------------------------------------------------------------
1 | import sbt.CrossVersion
2 |
3 | import java.io.File
4 | import java.nio.file.Paths
5 |
6 | object scaldingDagonSettings {
7 |
8 |   // load either the scala-2.12- or the scala-2.13+ dagon src dir, depending on the Scala version
9 |   def scalaVersionSpecificFolders(srcName: String, srcBaseDir: File, scalaVersion: String) = {
10 |
11 |     def extraDirs(suffix: String) = {
12 |       val scalaCompat = Paths
13 |         .get(srcBaseDir.toString)
14 |         .resolve("src")
15 |         .resolve(srcName)
16 |         .resolve("scala" + suffix)
17 |         .toFile
18 |       Seq(scalaCompat)
19 |     }
20 |
21 |     CrossVersion.partialVersion(scalaVersion) match {
22 |       case Some((2, y)) if y <= 12 =>
23 |         extraDirs("-2.12-")
24 |       case Some((2, y)) if y >= 13 =>
25 |         extraDirs("-2.13+")
26 |       case _ => Nil
27 |     }
28 |   }
29 |
30 | }
--------------------------------------------------------------------------------
/scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/providers/StableKnownDirectSubclasses.scala:
--------------------------------------------------------------------------------
1 | package com.twitter.scalding.serialization.macros.impl.ordered_serialization.providers
2 |
3 | import scala.reflect.macros.whitebox.Context
4 |
5 | /**
6 |  * The `knownDirectSubclasses` method doesn't provide stable ordering since it returns an unordered `Set` and
7 |  * the `Type` AST nodes don't override the `hashCode` method, relying on the default identity `hashCode`.
8 |  *
9 |  * This function makes the ordering stable by returning a list sorted by the full names of the types.
10 |  */
11 | object StableKnownDirectSubclasses {
12 |
13 |   def apply(c: Context)(tpe: c.Type): List[c.universe.TypeSymbol] = // linter:ignore:UnusedParameter
14 |     tpe.typeSymbol.asClass.knownDirectSubclasses.map(_.asType).toList.sortBy(_.fullName)
15 | }
--------------------------------------------------------------------------------
/scalding-core/codegen/mappable_generator.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 |
3 | # Run it like this:
4 | #
5 | #   ./codegen/mappable_generator.rb > src/main/scala/com/twitter/scalding/GeneratedMappable.scala
6 |
7 | $indent = " "
8 |
9 | TYPES = ('A'..'Z').to_a
10 |
11 | def make_mappable(cnt)
12 |   typeString = TYPES[0..(cnt - 1)].join(",")
13 |   puts "trait Mappable#{cnt}[#{typeString}] extends Mappable[Tuple#{cnt}[#{typeString}]] {"
14 |   puts "#{$indent}def converter[Z >: Tuple#{cnt}[#{typeString}]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple#{cnt}[#{typeString}]])"
15 |   puts "}"
16 | end
17 |
18 | puts "// following were autogenerated by #{__FILE__} at #{Time.now} do not edit"
19 | puts %q|package com.twitter.scalding
20 |
21 | |
22 |
23 | (1..22).each { |c|
24 |   make_mappable(c)
25 |   puts
26 | }
27 |
28 | puts "// end of autogenerated"
--------------------------------------------------------------------------------
/tutorial/AvroTutorial0.scala:
--------------------------------------------------------------------------------
1 | /**
2 | Scalding with Avro (and JSON) tutorial part 0.
3 |
4 | To run this job:
5 |   scripts/scald.rb --local --avro --json tutorial/AvroTutorial0.scala
6 |
7 | Check the output:
8 |   java -jar avro-tools-1.7.6.jar tojson tutorial/data/avrooutput0.avro
9 |
10 | **/
11 |
12 | import com.twitter.scalding.{Job, Args, JsonLine}
13 | import com.twitter.scalding.avro.UnpackedAvroSource
14 | import org.apache.avro.Schema
15 |
16 | class AvroTutorial0(args: Args) extends Job(args) {
17 |   val schema = """{
18 |     "type": "record", "name": "parseJson", "fields": [
19 |       { "name": "sessionId", "type": "string" },
20 |       { "name": "optionalField", "type": ["string", "null"] }
21 |     ] }"""
22 |
23 |   JsonLine("tutorial/data/session.json", ('sessionId, 'optionalField)).read
24 |     .write(UnpackedAvroSource("tutorial/data/avrooutput0.avro", new Schema.Parser().parse(schema)))
25 | }
--------------------------------------------------------------------------------
/scalding-base/src/main/scala/com/twitter/scalding/typed/memory_backend/AtomicBox.scala:
--------------------------------------------------------------------------------
1 | package com.twitter.scalding.typed.memory_backend
2 |
3 | import java.util.concurrent.atomic.AtomicReference
4 |
5 | class AtomicBox[T <: AnyRef](init: T) {
6 |   private[this] val ref = new AtomicReference[T](init)
7 |
8 |   def lazySet(t: T): Unit =
9 |     ref.lazySet(t)
10 |
11 |   def set(t: T): Unit =
12 |     ref.set(t)
13 |
14 |   def swap(t: T): T =
15 |     ref.getAndSet(t)
16 |
17 |   /**
18 |    * Use a pure function to update the state. `fn` may be called more than once under contention
19 |    * (the compare-and-set loop below retries), so it must be side-effect free.
20 |    */
21 |   def update[R](fn: T => (T, R)): R = {
22 |
23 |     @annotation.tailrec
24 |     def loop(): R = {
25 |       val init = ref.get
26 |       val (next, res) = fn(init)
27 |       if (ref.compareAndSet(init, next)) res
28 |       else loop()
29 |     }
30 |
31 |     loop()
32 |   }
33 |
34 |   def get(): T = ref.get
35 | }
--------------------------------------------------------------------------------
/scalding-core/src/main/scala/com/twitter/scalding/Sortable.scala:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright 2012 Twitter, Inc.
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | package com.twitter.scalding
17 |
18 | import cascading.tuple.Fields
19 |
20 | trait Sortable[+Self] {
21 |   // Perform an inner secondary sort
22 |   def sortBy(innerSort: Fields): Self
23 |   def sorting: Option[Fields]
24 | }
--------------------------------------------------------------------------------
/tutorial/MatrixTutorial1.scala:
--------------------------------------------------------------------------------
1 | package com.twitter.scalding.examples
2 |
3 | import com.twitter.scalding._
4 | import com.twitter.scalding.mathematics.Matrix
5 |
6 | /*
7 |  * MatrixTutorial1.scala
8 |  *
9 |  * Loads a directed graph adjacency matrix where a[i,j] = 1 if there is an edge from node i to node j,
10 |  * and computes the co-follows between every pair of nodes.
11 |  *
12 |  * ../scripts/scald.rb --local MatrixTutorial1.scala --input data/graph.tsv --output data/cofollows.tsv
13 |  */
14 |
15 | class CofollowsJob(args: Args) extends Job(args) {
16 |
17 |   import Matrix._
18 |
19 |   val adjacencyMatrix = Tsv(args("input"), ('user1, 'user2, 'rel))
20 |     .read
21 |     .toMatrix[Long, Long, Double]('user1, 'user2, 'rel)
22 |
23 |   // compute the inner product of the adjacency matrix with its transpose
24 |   (adjacencyMatrix * adjacencyMatrix.transpose).write(Tsv(args("output")))
25 | }
--------------------------------------------------------------------------------
/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/ReducerHistoryEstimator.scala:
--------------------------------------------------------------------------------
1 | package com.twitter.scalding.reducer_estimation
2 |
3 | import com.twitter.scalding.estimation.{HistoryEstimator, Task}
4 | import org.apache.hadoop.mapred.JobConf
5 |
6 | object ReducerHistoryEstimator {
7 |   val Status = "status"
8 |   val StartTime = "startTime"
9 |   val FinishTime = "finishTime"
10 |
11 |   implicit class ReducerRichTask(val task: Task) {
12 |     def status: Option[String] = task.details.get(Status).map(_.asInstanceOf[String])
13 |     def startTime: Option[Long] = task.details.get(StartTime).map(_.asInstanceOf[Long])
14 |     def finishTime: Option[Long] = task.details.get(FinishTime).map(_.asInstanceOf[Long])
15 |   }
16 | }
17 |
18 | trait ReducerHistoryEstimator extends HistoryEstimator[Int] {
19 |   override def maxHistoryItems(conf: JobConf): Int = ReducerEstimatorConfig.getMaxHistory(conf)
20 | }
--------------------------------------------------------------------------------
/scalding-serialization/src/test/scala/com/twitter/scalding/serialization/macros/ZDifficultTypes.scala:
--------------------------------------------------------------------------------
1 | package com.twitter.scalding.some.other.space.space
2 |
3 | sealed trait ContainerX
4 | object ContainerX {
5 |   case class ElementY(x: String) extends ContainerX
6 |   case class ElementZ(x: String) extends ContainerX
7 | }
8 |
9 | // This is intentionally not sealed. Users can supply their own implementations.
10 | trait ContainerP {
11 |   def id: String
12 | }
13 | object ContainerP {
14 |   case object ElementA extends ContainerP {
15 |     def id: String = "a"
16 |   }
17 |   case object ElementB extends ContainerP {
18 |     def id: String = "b"
19 |   }
20 |   def fromId(id: String): ContainerP = id match {
21 |     case _ if id == ElementA.id => ElementA
22 |     case _ if id == ElementB.id => ElementB
23 |   }
24 | }
25 |
26 | case class TestCaseHardA(e: ContainerX, y: String)
27 | case class TestCaseHardB(e: ContainerP, y: String)
--------------------------------------------------------------------------------
/tutorial/MatrixTutorial0.scala:
--------------------------------------------------------------------------------
1 | package com.twitter.scalding.examples
2 |
3 | import com.twitter.scalding._
4 | import com.twitter.scalding.mathematics.Matrix
5 |
6 | /*
7 |  * MatrixTutorial0.scala
8 |  *
9 |  * Loads a directed graph adjacency matrix where a[i,j] = 1 if there is an edge from node i to node j,
10 |  * and computes the outdegree of each node i.
11 |  *
12 |  * ../scripts/scald.rb --local MatrixTutorial0.scala --input data/graph.tsv --output data/outdegree.tsv
13 |  */
14 |
15 | class GraphOutDegreeJob(args: Args) extends Job(args) {
16 |
17 |   import Matrix._
18 |
19 |   val adjacencyMatrix = Tsv(args("input"), ('user1, 'user2, 'rel))
20 |     .read
21 |     .toMatrix[Long, Long, Double]('user1, 'user2, 'rel)
22 |
23 |   // each row i represents all of the outgoing edges from i;
24 |   // by summing out all of the columns we get the outdegree of i
25 |   adjacencyMatrix.sumColVectors.write(Tsv(args("output")))
26 | }
--------------------------------------------------------------------------------
/scalding-base/src/main/scala/com/twitter/scalding/CPromise.scala:
--------------------------------------------------------------------------------
1 | package com.twitter.scalding
2 |
3 | import scala.concurrent.{ExecutionContext => ConcurrentExecutionContext, Future, Promise}
4 |
5 | /**
6 |  * Represents a cancellable promise.
7 |  */
8 | case class CPromise[T](promise: Promise[T], cancellationHandler: Promise[CancellationHandler]) {
9 |
10 |   /**
11 |    * Creates a CFuture using the given promises.
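   *
   * Illustrative sketch of the relationship (not part of the formal API contract):
   * {{{
   * val cp = CPromise[Int]()
   * val cf = cp.cfuture // completing cp.promise completes cf.future
   * }}}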
12 |    */
13 |   def cfuture: CFuture[T] =
14 |     CFuture(promise.future, CancellationHandler.fromFuture(cancellationHandler.future))
15 |
16 |   def completeWith(other: CFuture[T]): this.type = {
17 |     // fulfill the main and cancellation handler promises
18 |     promise.completeWith(other.future)
19 |     cancellationHandler.completeWith(Future.successful(other.cancellationHandler))
20 |     this
21 |   }
22 | }
23 | object CPromise {
24 |   def apply[T](): CPromise[T] = CPromise(Promise[T](), Promise[CancellationHandler]())
25 | }
--------------------------------------------------------------------------------
/scalding-base/src/main/scala/com/twitter/scalding/typed/functions/FlatMapping.scala:
--------------------------------------------------------------------------------
1 | package com.twitter.scalding.typed.functions
2 |
3 | import java.io.Serializable
4 |
5 | /**
6 |  * This is one of four core, non-composed operations: identity, filter, map, and flatMap.
7 |  */
8 | sealed abstract class FlatMapping[-A, +B] extends Serializable
9 |
10 | object FlatMapping extends Serializable {
11 |   def filter[A](fn: A => Boolean): FlatMapping[A, A] =
12 |     Filter[A, A](fn, implicitly)
13 |
14 |   def filterKeys[K, V](fn: K => Boolean): FlatMapping[(K, V), (K, V)] =
15 |     filter[(K, V)](FilterKeysToFilter(fn))
16 |
17 |   final case class Identity[A, B](ev: EqTypes[A, B]) extends FlatMapping[A, B]
18 |   final case class Filter[A, B](fn: A => Boolean, ev: EqTypes[A, B]) extends FlatMapping[A, B]
19 |   final case class Map[A, B](fn: A => B) extends FlatMapping[A, B]
20 |   final case class FlatM[A, B](fn: A => TraversableOnce[B]) extends FlatMapping[A, B]
21 | }
--------------------------------------------------------------------------------
/scalding-commons/src/main/scala/com/twitter/scalding/commons/scheme/CombinedSequenceFileScheme.scala:
--------------------------------------------------------------------------------
1 | package com.twitter.scalding.commons.scheme
2 |
3 | import cascading.scheme.Scheme
4 | import com.twitter.elephantbird.cascading2.scheme.{CombinedSequenceFile, CombinedWritableSequenceFile}
5 | import com.twitter.scalding.{HadoopSchemeInstance, SequenceFileScheme, WritableSequenceFileScheme}
6 |
7 | trait CombinedSequenceFileScheme extends SequenceFileScheme {
8 |   // TODO Cascading doesn't support local mode yet
9 |   override def hdfsScheme = HadoopSchemeInstance(
10 |     new CombinedSequenceFile(fields).asInstanceOf[Scheme[_, _, _, _, _]]
11 |   )
12 | }
13 |
14 | trait CombinedWritableSequenceFileScheme extends WritableSequenceFileScheme {
15 |   // TODO Cascading doesn't support local mode yet
16 |   override def hdfsScheme =
17 |     HadoopSchemeInstance(
18 |       new CombinedWritableSequenceFile(fields, keyType, valueType).asInstanceOf[Scheme[_, _, _, _, _]]
19 |     )
20 | }
--------------------------------------------------------------------------------
/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/BlobTypeHandler.scala:
--------------------------------------------------------------------------------
1 | package com.twitter.scalding.db.macros.impl.handler
2 |
3 | import com.twitter.scalding.db.macros.impl.FieldName
4 | import scala.reflect.macros.Context
5 | import scala.util.{Failure, Success}
6 |
7 | object BlobTypeHandler {
8 |   def apply[T](c: Context)(implicit
9 |       accessorTree: List[c.universe.MethodSymbol],
10 |       fieldName: FieldName,
11 |       defaultValue: Option[c.Expr[String]],
12 |       annotationInfo: List[(c.universe.Type, Option[Int])],
13 |       nullable: Boolean
14 |   ): scala.util.Try[List[ColumnFormat[c.type]]] =
15 |     if (defaultValue.nonEmpty || annotationInfo.nonEmpty)
16 |       Failure(
17 |         new Exception(
18 |           s"Default values and annotation info are not supported: defaultValue = $defaultValue annotationInfo = $annotationInfo"
19 |         )
20 |       )
21 |     else
22 |       Success(List(ColumnFormat(c)(accessorTree, "BLOB", None)))
23 | }
--------------------------------------------------------------------------------
/scalding-base/src/main/scala/com/twitter/scalding/StringUtility.scala:
--------------------------------------------------------------------------------
1 | package com.twitter.scalding
2 |
3 | object StringUtility {
4 |
5 |   /**
6 |    * Splits `text` on every occurrence of `key` (a literal separator, not a regex), always keeping
7 |    * trailing empty strings. Implemented iteratively so the stack depth does not grow with the
8 |    * number of separators in the input.
9 |    */
10 |   def fastSplit(text: String, key: String): List[String] = {
11 |     val result = List.newBuilder[String]
12 |     var from = 0
13 |     var index = text.indexOf(key, from)
14 |     while (index != -1) {
15 |       // the text up to the separator should be kept in any case
16 |       result += text.substring(from, index)
17 |       from = index + key.length
18 |       index = text.indexOf(key, from)
19 |     }
20 |     // the remainder after the last separator; "" when the text ends with the separator
21 |     result += text.substring(from)
22 |     result.result()
23 |   }
24 | }
--------------------------------------------------------------------------------
/scalding-base/src/main/scala/com/twitter/scalding/typed/KeyedPipe.scala:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright 2014 Twitter, Inc.
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | package com.twitter.scalding.typed
17 |
18 | /**
19 |  * Represents anything that starts as a TypedPipe of key/value pairs, where the value type has been erased.
20 |  * Also acts as proof that the key type K in the tuple has an Ordering.
21 |  */
22 | trait KeyedPipe[K] {
23 |   def keyOrdering: Ordering[K]
24 |   def mapped: TypedPipe[(K, Any)]
25 | }
--------------------------------------------------------------------------------
/scalding-commons/src/main/java/com/twitter/scalding/commons/datastores/Utils.java:
--------------------------------------------------------------------------------
1 | package com.twitter.scalding.commons.datastores;
2 |
3 | import org.apache.hadoop.conf.Configuration;
4 | import org.apache.hadoop.fs.FileSystem;
5 | import org.apache.hadoop.fs.Path;
6 |
7 | import java.io.IOException;
8 |
9 | public class Utils {
10 |
11 |   public static FileSystem getFS(String path) throws IOException {
12 |     return getFS(path, new Configuration());
13 |   }
14 |
15 |   public static FileSystem getFS(String path, Configuration conf) throws IOException {
16 |     return new Path(path).getFileSystem(conf);
17 |   }
18 |
19 |   /**
20 |    * Returns whether the input parses as a long.
21 |    * @param input the string to test
22 |    * @return true if {@code input} is a valid long, false otherwise
23 |    */
24 |   public static boolean isLong(String input) {
25 |     try {
26 |       Long.parseLong(input);
27 |       return true;
28 |     } catch (Exception e) {
29 |       return false;
30 |     }
31 |   }
32 | }
--------------------------------------------------------------------------------
/scalding-core/src/test/scala/com/twitter/scalding/TypedPipeCheckerTest.scala:
--------------------------------------------------------------------------------
1 | package com.twitter.scalding
2 |
3 | import org.scalatest.{Matchers, WordSpec}
4 |
5 | class TypedPipeCheckerTest extends WordSpec with Matchers {
6 |   import TypedPipeChecker._
7 |
8 |   "TypedPipeChecker" should {
9 |     "run asserts on pipe" in {
10 |       checkOutput(TypedPipe.from(List(1, 2, 3, 4))) { rows =>
11 |         assert(rows.size == 4)
12 |         assert(rows == List(1, 2, 3, 4))
13 |       }
14 |     }
15 |   }
16 |
17 |   it should {
18 |     "give back a list" in {
19 |       val list = inMemoryToList(TypedPipe.from(List(1, 2, 3, 4)))
20 |       assert(list == List(1, 2, 3, 4))
21 |     }
22 |   }
23 |
24 |   it should {
25 |     "allow for a list of input to be run through a transform function" in {
26 |       def transform(pipe: TypedPipe[Int]) = pipe.map(identity)
27 |
28 |       checkOutputTransform(List(1, 2, 3))(transform) { rows =>
29 |         assert(rows == List(1, 2, 3))
30 |       }
31 |     }
32 |   }
33 | }
--------------------------------------------------------------------------------
/scripts/test_tutorials.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -exv
2 |
3 | # Identify the bin dir in the distribution, and source the common include script
4 | BASE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/..
&& pwd )" 5 | source ${BASE_DIR}/scripts/common.sh 6 | 7 | time $SCALD tutorial/Tutorial0.scala 8 | time $SCALD tutorial/Tutorial1.scala 9 | time $SCALD tutorial/Tutorial2.scala 10 | 11 | time $SCALD tutorial/Tutorial3.scala \ 12 | --input tutorial/data/hello.txt 13 | 14 | time $SCALD tutorial/Tutorial4.scala \ 15 | --input tutorial/data/hello.txt \ 16 | --output tutorial/data/output4.txt 17 | 18 | time $SCALD tutorial/Tutorial5.scala \ 19 | --input tutorial/data/hello.txt \ 20 | --output tutorial/data/output5.txt \ 21 | --words tutorial/data/words.txt 22 | 23 | time $SCALD tutorial/MatrixTutorial5.scala \ 24 | --input tutorial/data/graph.tsv \ 25 | --output tutorial/data/cosineSim.tsv 26 | 27 | time $SCALD --json tutorial/JsonTutorial0.scala 28 | 29 | time $SCALD --avro --json tutorial/AvroTutorial0.scala 30 | 31 | 32 | -------------------------------------------------------------------------------- /scalding-db/src/main/scala/com/twitter/scalding/db/DBTypeDescriptor.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2015 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package com.twitter.scalding.db 18 | import com.twitter.scalding._ 19 | import cascading.tuple.Fields 20 | 21 | trait DBTypeDescriptor[T] { 22 | def columnDefn: ColumnDefinitionProvider[T] 23 | def converter: TupleConverter[T] 24 | def setter: TupleSetter[T] 25 | def fields: Fields 26 | def jdbcSetter: JdbcStatementSetter[T] 27 | } 28 | -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/CascadeJob.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding 2 | 3 | import cascading.cascade.CascadeConnector 4 | import cascading.cascade.Cascade 5 | 6 | abstract class CascadeJob(args: Args) extends Job(args) { 7 | 8 | def jobs: Seq[Job] 9 | 10 | override def run = { 11 | val flows = jobs.map(_.buildFlow) 12 | val cascade = new CascadeConnector().connect(flows: _*) 13 | preProcessCascade(cascade) 14 | cascade.complete() 15 | postProcessCascade(cascade) 16 | val statsData = cascade.getCascadeStats 17 | 18 | handleStats(statsData) 19 | statsData.isSuccessful 20 | } 21 | 22 | override def validate(): Unit = 23 | jobs.foreach(_.validate()) 24 | 25 | /* 26 | * Good for printing a dot file, setting the flow skip strategy, etc 27 | */ 28 | def preProcessCascade(cascade: Cascade) = {} // linter:ignore 29 | 30 | /* 31 | * Good for checking the cascade stats 32 | */ 33 | def postProcessCascade(cascade: Cascade) = {} // linter:ignore 34 | 35 | } 36 | -------------------------------------------------------------------------------- /docs/src/main/tut/cookbook/hbase.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: docs 3 | title: "Scalding and HBase" 4 | section: "cookbook" 5 | --- 6 | 7 | # Using Scalding with HBase 8 | 9 | ## Resources 10 | 11 | 
- [Running Scalding with HBase support](https://github.com/kianwilcox/hbase-scalding) - a GitHub example project.
12 | - [Spy Glass](https://github.com/ParallelAI/SpyGlass) - a feature-rich HBase wrapper for Cascading and Scalding.
13 | - [Maple](https://github.com/Cascading/maple) - a collection of Cascading Taps, including a simple HBase tap. Spy Glass appears to be the more advanced option.
14 | - [KijiExpress](https://github.com/kijiproject/kiji-express) - provides a full lifecycle for building predictive models using Scalding and HBase.
15 |
16 | ## Example Code
17 |
18 | TODO: Please add links to example code here.
19 |
20 | ### Documentation Help
21 |
22 | We'd love your help fleshing out this documentation! You can edit this page in your browser by clicking [this link](https://github.com/twitter/scalding/edit/develop/docs/src/main/tut/cookbook/hbase.md).
--------------------------------------------------------------------------------
/scalding-quotation/src/test/scala/com/twitter/scalding/quotation/LimitationsTest.scala:
--------------------------------------------------------------------------------
1 | package com.twitter.scalding.quotation
2 |
3 | class LimitationsTest extends Test {
4 |
5 |   class TestClass {
6 |     def function[T, U](f: T => U)(implicit q: Quoted) = (q, f)
7 |   }
8 |
9 |   val test = new TestClass
10 |
11 |   "nested transitive projection" in pendingUntilFixed {
12 |     test.function[Person, Option[String]](_.alternativeContact.map(_.phone))._1.projections.set mustEqual
13 |       Set(
14 |         Person.typeReference
15 |           .andThen(Accessor("alternativeContact"), typeName[Option[Contact]])
16 |           .andThen(Accessor("phone"), typeName[String])
17 |       )
18 |   }
19 |
20 |   "nested quoted function projection" in pendingUntilFixed {
21 |     val contactFunction = Quoted.function { (p: Person) =>
22 |       p.contact
23 |     }
24 |     val phoneFunction = Quoted.function { (p: Person) =>
25 |       contactFunction(p).phone
26 |     }
27 |     phoneFunction.quoted.projections.set mustEqual Set(Person.phoneProjection)
28 |   }
29 | }
--------------------------------------------------------------------------------
/scripts/README.md:
--------------------------------------------------------------------------------
1 | Testing info:
2 |
3 | .travis.yml:
4 |
5 | To get better build times on the weaker travis machines, we split the build into several targets;
6 | .travis.yml defines those targets.
7 |
8 | run_test.sh:
9 |
10 | Here we unpack a cache (falling back to a static link, if we can, to try to get some bootstrap jars).
11 | We run the compile step with standard out/standard error directed to /dev/null (this is to stop travis
12 | from giving up because our test logging is too verbose). Then we run our test suite on the pre-built
13 | classes. Finally, if anything has changed in our cache folders, we pack them away into a special folder
14 | for travis to cache.
15 |
16 | testValidator.sh:
17 |
18 | Here we attempt, in a bash script, to ensure we have full coverage of our build targets in our travis
19 | yml, failing the build if that is not the case.
20 |
21 | NB: At some point, on more powerful machines with a proper maven cache, none of these strategies may be
22 | worthwhile. Splitting up some of the test running may still be, but the code around caches and such can
23 | all probably be deleted.
--------------------------------------------------------------------------------
/scalding-dagon/src/main/scala/com/twitter/scalding/dagon/Id.scala:
--------------------------------------------------------------------------------
1 | package com.twitter.scalding.dagon
2 |
3 | import java.io.Serializable
4 | import java.util.concurrent.atomic.AtomicLong
5 |
6 | /**
7 |  * The Expressions are assigned Ids. Each Id is associated with an expression of inner type T.
8 |  *
9 |  * This is done to put an indirection in the Dag that allows us to rewrite nodes by simply replacing the
10 |  * expressions associated with given Ids.
11 |  *
12 |  * T is a phantom type used by the type system.
13 |  */
14 | final class Id[T] private (val serial: Long) extends Serializable {
15 |   require(serial >= 0, s"counter overflow has occurred: $serial")
16 |   override def toString: String = s"Id($serial)"
17 | }
18 |
19 | object Id {
20 |
21 |   @transient private[this] val counter = new AtomicLong(0)
22 |
23 |   def next[T](): Id[T] =
24 |     new Id[T](counter.getAndIncrement())
25 |
26 |   implicit def idOrdering[T]: Ordering[Id[T]] =
27 |     new Ordering[Id[T]] {
28 |       def compare(a: Id[T], b: Id[T]) =
29 |         java.lang.Long.compare(a.serial, b.serial)
30 |     }
31 | }
--------------------------------------------------------------------------------
/tutorial/MatrixTutorial4.scala:
--------------------------------------------------------------------------------
1 | package com.twitter.scalding.examples
2 |
3 | import com.twitter.scalding._
4 | import com.twitter.scalding.mathematics.Matrix
5 |
6 | /*
7 |  * MatrixTutorial4.scala
8 |  *
9 |  * Loads a directed graph adjacency matrix where a[i,j] = 1 if there is an edge from node i to node j,
10 |  * and computes the cosine of the angle between every pair of row vectors.
11 |  *
12 |  * ../scripts/scald.rb --local MatrixTutorial4.scala --input data/graph.tsv --output data/cosineSim.tsv
13 |  */
14 |
15 | class ComputeCosineJob(args: Args) extends Job(args) {
16 |
17 |   import Matrix._
18 |
19 |   val adjacencyMatrix = Tsv(args("input"), ('user1, 'user2, 'rel))
20 |     .read
21 |     .toMatrix[Long, Long, Double]('user1, 'user2, 'rel)
22 |
23 |   // we compute the L2-normalized adjacency graph
24 |   val normMatrix = adjacencyMatrix.rowL2Normalize
25 |
26 |   // the inner product of the normalized matrix with its transpose is the cosine similarity:
27 |   // (A A^T)_{ij} / (||A_i|| * ||A_j||)
28 |   (normMatrix * normMatrix.transpose).write(Tsv(args("output")))
29 | }
--------------------------------------------------------------------------------
/scalding-thrift-macros/src/main/scala/com/twitter/scalding/thrift/macros/Macros.scala:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright 2014 Twitter, Inc.
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 | package com.twitter.scalding.thrift.macros
17 |
18 | import com.twitter.scalding.serialization.OrderedSerialization
19 | import com.twitter.scalding.thrift.macros.impl.ScroogeInternalOrderedSerializationImpl
20 |
21 | import scala.language.experimental.{macros => sMacros}
22 |
23 | object Macros {
24 |   implicit def scroogeOrdSer[T]: OrderedSerialization[T] = macro ScroogeInternalOrderedSerializationImpl[T]
25 | }
--------------------------------------------------------------------------------
/scripts/test_matrix_tutorials.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -exv
2 |
3 | # Identify the bin dir in the distribution, and source the common include script
4 | BASE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/.. && pwd )"
5 | source ${BASE_DIR}/scripts/common.sh
6 |
7 | time $SCALD tutorial/MatrixTutorial0.scala \
8 |   --input tutorial/data/graph.tsv \
9 |   --output tutorial/data/outdegree.tsv
10 |
11 | time $SCALD tutorial/MatrixTutorial1.scala \
12 |   --input tutorial/data/graph.tsv \
13 |   --output tutorial/data/cofollows.tsv
14 |
15 | time $SCALD tutorial/MatrixTutorial2.scala \
16 |   --input tutorial/data/graph.tsv \
17 |   --maxOutdegree 1000 \
18 |   --output tutorial/data/graphFiltered.tsv
19 |
20 | time $SCALD tutorial/MatrixTutorial3.scala \
21 |   --input1 tutorial/data/graph.tsv \
22 |   --input2 tutorial/data/graph2.tsv \
23 |   --intersection tutorial/data/intersection.tsv \
24 |   --leftDiff tutorial/data/leftDiff.tsv \
25 |   --rightDiff tutorial/data/rightDiff.tsv
26 |
27 | time $SCALD tutorial/MatrixTutorial5.scala \
28 |   --input tutorial/data/graph.tsv \
29 |   --output tutorial/data/cosineSim.tsv
30 |
31 |
--------------------------------------------------------------------------------
/scalding-core/src/main/scala/com/twitter/scalding/source/CheckedInversion.scala:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright 2012 Twitter, Inc.
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 |
17 | package com.twitter.scalding.source
18 |
19 | import com.twitter.bijection.Injection
20 | import java.io.Serializable
21 |
22 | /**
23 |  * Handles the error checking for Injection inversion. If the check fails, it will throw an unrecoverable
24 |  * exception, stopping the job. TODO: this probably belongs in Bijection.
25 |  */
26 | trait CheckedInversion[T, U] extends Serializable {
27 |   def injection: Injection[T, U]
28 |   def apply(input: U): Option[T]
29 | }
--------------------------------------------------------------------------------
/scripts/test_repl_tutorial.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -exv
2 |
3 | # Identify the bin dir in the distribution, and source the common include script
4 | BASE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/.. && pwd )"
5 | source ${BASE_DIR}/scripts/common.sh
6 |
7 | # Now run a basic test for the REPL
8 | # If the content of the output is different, diff will fail with a non-zero exit code
9 | time $SCALD_REPL < tutorial/ReplTutorial1.scala
10 | diff tutorial/data/hello.txt tutorial/data/output1.txt
11 |
12 | # Run from inside the tutorial directory so we pick up the definition
13 | # of 'scaldingReplInitWasLoaded' from 'tutorial/.scalding_repl'.
14 | # If it was loaded, then this 'script' exits early with success.
15 | # Otherwise it continues and exits with an error.
16 | cd tutorial; echo "
17 | if (scaldingReplInitWasLoaded) System.exit(0)
18 | System.exit(1)
19 | " | $SCALD_REPL
20 |
21 | # Test running the repl from sbt.
22 | cd $BASE_DIR
23 | # We need to clean SBT_OPTS, because on travis the default SBT_OPTS enables sbt batch mode, which ends the repl process immediately, before `System.exit` is reached.
24 | echo 'System.exit(0)' | SBT_OPTS='' ./sbt ++$TRAVIS_SCALA_VERSION 'scalding-repl/run --local'
--------------------------------------------------------------------------------
/COMMITTERS.md:
--------------------------------------------------------------------------------
1 | # Committers
2 |
3 | Please see our [Project Governance](https://github.com/twitter/analytics-infra-governance) page for more details.
4 |
5 | ## Active
6 |
7 | | Name                    | Handle                                                     |
8 | |-------------------------|------------------------------------------------------------|
9 | | Alex Levenson           | [@isnotinvain](https://github.com/isnotinvain)             |
10 | | Ben Pence               | [@benpence](https://github.com/benpence)                   |
11 | | Ian O'Connell           | [@ianoc](https://github.com/ianoc)                         |
12 | | Joe Nievelt             | [@jnievelt](https://github.com/jnievelt)                   |
13 | | Oscar Boykin            | [@johnynek](https://github.com/johnynek)                   |
14 | | Pankaj Gupta            | [@pankajroark](https://github.com/pankajroark)             |
15 | | Piyush Narang           | [@piyushnarang](https://github.com/piyushnarang)           |
16 | | Ruban Monu              | [@rubanm](https://github.com/rubanm)                       |
17 | | Sriram Krishnan         | [@sriramkrishnan](https://github.com/sriramkrishnan)       |
18 |
19 | ## Emeritus
--------------------------------------------------------------------------------
/scalding-parquet-scrooge-fixtures/src/test/resources/binary.thrift:
--------------------------------------------------------------------------------
1 | /**
2 |  * Licensed to the Apache Software Foundation (ASF) under one
3 |  * or more contributor license agreements.  See the NOTICE file
4 |  * distributed with this work for additional information
5 |  * regarding copyright ownership.  The ASF licenses this file
6 |  * to you under the Apache License, Version 2.0 (the
7 |  * "License"); you may not use this file except in compliance
8 |  * with the License.  You may obtain a copy of the License at
9 |  *
10 |  *   http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing,
13 |  * software distributed under the License is distributed on an
14 |  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 |  * KIND, either express or implied.  See the License for the
16 |  * specific language governing permissions and limitations
17 |  * under the License.
18 |  */
19 |
20 | namespace java com.twitter.scalding.parquet.scrooge.thrift_java.test.binary
21 | #@namespace scala com.twitter.scalding.parquet.scrooge.thrift_scala.test.binary
22 |
23 | struct StringAndBinary {
24 |   1: required string s;
25 |   2: required binary b;
26 | }
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .bsp
2 | .cache
3 | .project
4 | .settings
5 | .classpath
6 | *.swp
7 | BUILD
8 | target/
9 | lib_managed/
10 | project/boot/
11 | project/build/target/
12 | project/plugins/target/
13 | project/plugins/lib_managed/
14 | project/plugins/src_managed/
15 | /.idea/
16 | /.idea_modules/
17 | .project
18 | .classpath
19 | .cache-main
20 | .cache-tests
21 | .tmpBin
22 | bin
23 | *.iml
24 | sonatype.sbt
25 | # build.sbt-e: not sure where this comes from, some kind of backup?
26 | build.sbt-e
27 | tutorial/data/execution_output.txt
28 | tutorial/data/cofollows.tsv
29 | tutorial/data/cosineSim.tsv
30 | tutorial/data/graphFiltered.tsv
31 | tutorial/data/intersection.tsv
32 | tutorial/data/jaccardSim.tsv
33 | tutorial/data/leftDiff.tsv
34 | tutorial/data/outdegree.tsv
35 | tutorial/data/output0.txt
36 | tutorial/data/output1.txt
37 | tutorial/data/output2.txt
38 | tutorial/data/output3.txt
39 | tutorial/data/output4.txt
40 | tutorial/data/output5.txt
41 | tutorial/data/rightDiff.tsv
42 | tutorial/data/tmp3.tsv
43 | tutorial/data/jsonoutput0.tsv
44 | tutorial/data/avrooutput0.avro
45 | .scalding_repl
46 | scalding-hadoop-test/NOTICE
47 | NOTICE
48 |
49 | # Auto-copied by sbt-microsites
50 | docs/src/main/tut/contributing.md
51 | .DS_Store
--------------------------------------------------------------------------------
/scalding-core/src/test/scala/com/twitter/scalding/source/TypedTextTest.scala:
--------------------------------------------------------------------------------
1 | package com.twitter.scalding.source
2 |
3 | import org.scalatest.FunSuite
4 |
5 | case class Test1(a: Int, b: Long, c: Option[Double])
6 | case class Test2(one: Test1, d: String)
7 |
8 | class TypedTextTest extends FunSuite {
9 |   test("Test with a flat tuple") {
10 |     val source = TypedText.tsv[Test1]("myPath")
11 |     assert(source.sourceFields.size == 3)
12 |   }
13 |
14 |   test("Test with a nested tuple") {
15 |     val source = TypedText.tsv[Test2]("myPath")
16 |     assert(source.sourceFields.size == 4)
17 |   }
18 |
19 |   test("Test with a raw type") {
20 |     val source = TypedText.tsv[String]("myPath")
21 |     assert(source.sourceFields.size == 1)
22 |   }
23 |
24 |   test("Test with a tuple") {
25 |     val source = TypedText.tsv[(Int, Int)]("myPath")
26 |     assert(source.sourceFields.size == 2)
27 |   }
28 |
29 |   test("Test with an Optional Int") {
30 |     val source = TypedText.tsv[Option[Int]]("myPath")
31 |     assert(source.sourceFields.size == 1)
32 |   }
33 |
34 |   test("Test with an Int") {
35 |     val source = TypedText.tsv[Int]("myPath")
36 |     assert(source.sourceFields.size == 1)
37 |   }
38 | }
--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
1 | resolvers ++= Seq(
2 |   "jgit-repo".at("https://download.eclipse.org/jgit/maven"),
3 |   "sonatype-releases".at("https://oss.sonatype.org/content/repositories/releases"),
4 |   "Twitter Maven".at("https://maven.twttr.com")
5 | )
6 |
7 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.6")
8 | addSbtPlugin("com.eed3si9n" % "sbt-buildinfo" %
"0.11.0") 9 | addSbtPlugin("com.eed3si9n" % "sbt-unidoc" % "0.4.3") 10 | addSbtPlugin("com.47deg" % "sbt-microsites" % "1.3.4") 11 | addSbtPlugin("com.github.sbt" % "sbt-ci-release" % "1.5.10") 12 | addSbtPlugin("com.dwijnand" % "sbt-dynver" % "4.1.1") 13 | addSbtPlugin("com.github.sbt" % "sbt-pgp" % "2.1.2") 14 | addSbtPlugin("com.twitter" %% "scrooge-sbt-plugin" % "18.9.0") 15 | addSbtPlugin("com.typesafe" % "sbt-mima-plugin" % "0.1.14") 16 | addSbtPlugin("com.typesafe.sbt" % "sbt-ghpages" % "0.6.3") 17 | addSbtPlugin("com.typesafe.sbt" % "sbt-git" % "1.0.0") 18 | addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.4.6") 19 | addSbtPlugin("com.github.sbt" % "sbt-jacoco" % "3.4.0") 20 | addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "3.9.7") 21 | addSbtPlugin("org.wartremover" % "sbt-wartremover" % "2.4.16") 22 | addSbtPlugin("org.scalameta" % "sbt-mdoc" % "2.2.22") 23 | -------------------------------------------------------------------------------- /scalding-base/src/main/scala/com/twitter/scalding/CancellationHandler.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding 2 | 3 | import scala.concurrent.{ExecutionContext => ConcurrentExecutionContext, Future} 4 | 5 | sealed trait CancellationHandler { outer => 6 | def stop()(implicit ec: ConcurrentExecutionContext): Future[Unit] 7 | def compose(other: CancellationHandler): CancellationHandler = new CancellationHandler { 8 | override def stop()(implicit ec: ConcurrentExecutionContext): Future[Unit] = 9 | other.stop().zip(outer.stop()).map(_ => ()) 10 | } 11 | } 12 | 13 | object CancellationHandler { 14 | val empty: CancellationHandler = new CancellationHandler { 15 | def stop()(implicit ec: ConcurrentExecutionContext): Future[Unit] = Future.successful(()) 16 | } 17 | 18 | def fromFn(fn: ConcurrentExecutionContext => Future[Unit]): CancellationHandler = new CancellationHandler { 19 | override def stop()(implicit ec: ConcurrentExecutionContext): Future[Unit] = fn(ec) 20 | } 21 | 22 | def fromFuture(f: Future[CancellationHandler]): CancellationHandler = new CancellationHandler { 23 | override def stop()(implicit ec: ConcurrentExecutionContext): Future[Unit] = 24 | f.flatMap(_.stop()) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/DateTypeHandler.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.db.macros.impl.handler 2 | 3 | import scala.reflect.macros.Context 4 | import scala.util.Success 5 | 6 | import com.twitter.scalding.db.macros.impl.FieldName 7 | 8 | object DateTypeHandler { 9 | 10 | def apply[T](c: Context)(implicit 11 | accessorTree: List[c.universe.MethodSymbol], 12 | fieldName: FieldName, 13 | defaultValue: Option[c.Expr[String]], 14 | annotationInfo: List[(c.universe.Type, Option[Int])], 15 | nullable: Boolean 16 | ): scala.util.Try[List[ColumnFormat[c.type]]] = { 17 | 18 | val helper = new { 19 | val ctx: c.type = c 20 | val cfieldName = fieldName 21 | val cannotationInfo = annotationInfo 22 | } with AnnotationHelper 23 | 24 | val extracted = for { 25 | (nextHelper, dateAnno) <- helper.dateAnnotation 26 | _ <- nextHelper.validateFinished 27 | } yield dateAnno 28 | 29 | extracted.flatMap { 30 | case WithDate => Success(List(ColumnFormat(c)(accessorTree, "DATE", None))) 31 | case WithoutDate => Success(List(ColumnFormat(c)(accessorTree, "DATETIME", None))) 32 | } 33 | } 
34 | } 35 | -------------------------------------------------------------------------------- /scalding-date/src/main/scala/com/twitter/scalding/CalendarOps.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding 2 | 3 | import java.util.{Calendar, Date} 4 | import scala.annotation.tailrec 5 | 6 | /** 7 | */ 8 | object CalendarOps { 9 | def truncate(date: Calendar, field: Int): Calendar = { 10 | @tailrec 11 | def truncateIter(cal: Calendar, field: Int, currentField: Int): Calendar = 12 | if (currentField > field) { 13 | currentField match { 14 | case Calendar.DAY_OF_MONTH => cal.set(currentField, 1) 15 | case Calendar.DAY_OF_WEEK_IN_MONTH | Calendar.DAY_OF_WEEK | Calendar.DAY_OF_YEAR | 16 | Calendar.WEEK_OF_MONTH | Calendar.WEEK_OF_YEAR | Calendar.HOUR_OF_DAY => 17 | () // Skip 18 | case _ => cal.set(currentField, 0) 19 | } 20 | 21 | truncateIter(cal, field, currentField - 1) 22 | } else { 23 | cal 24 | } 25 | 26 | val cloned = date.clone().asInstanceOf[Calendar] 27 | 28 | truncateIter(cloned, field, Calendar.MILLISECOND) 29 | } 30 | 31 | def truncate(date: Date, field: Int): Date = { 32 | val cal = Calendar.getInstance() 33 | cal.setTime(date) 34 | 35 | truncate(cal, field).getTime() 36 | } 37 | 38 | } 39 | -------------------------------------------------------------------------------- /scalding-serialization/src/main/scala/com/twitter/scalding/serialization/Laws.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2015 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.twitter.scalding.serialization 17 | 18 | /** 19 | * This is a simple trait for describing laws on single parameter type classes (Serialization, Monoid, 20 | * Ordering, etc...) 21 | */ 22 | sealed trait Law[T] { 23 | def name: String 24 | } 25 | final case class Law1[T](override val name: String, check: T => Boolean) extends Law[T] 26 | final case class Law2[T](override val name: String, check: (T, T) => Boolean) extends Law[T] 27 | final case class Law3[T](override val name: String, check: (T, T, T) => Boolean) extends Law[T] 28 | -------------------------------------------------------------------------------- /scalding-quotation/src/main/scala/com/twitter/scalding/quotation/TreeOps.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.quotation 2 | 3 | import scala.reflect.macros.blackbox.Context 4 | 5 | trait TreeOps { 6 | val c: Context 7 | import c.universe._ 8 | 9 | /** 10 | * Finds the first tree that satisfies the condition. 
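   *
   * For example (a sketch, assuming some `tree: Tree` from the enclosing macro context):
   * {{{
   * // the first function literal in the tree, if any
   * val firstFn = find(tree)(_.isInstanceOf[Function])
   * }}}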
11 |    */
12 |   def find(tree: Tree)(f: Tree => Boolean): Option[Tree] = {
13 |     var res: Option[Tree] = None
14 |     val t = new Traverser {
15 |       override def traverse(t: Tree) =
16 |         if (res.isEmpty)
17 |           if (f(t))
18 |             res = Some(t)
19 |           else
20 |             super.traverse(t)
21 |     }
22 |     t.traverse(tree)
23 |     res
24 |   }
25 |
26 |   /**
27 |    * Similar to tree.collect, but it doesn't collect the children of a collected tree.
28 |    */
29 |   def collect[T](tree: Tree)(f: PartialFunction[Tree, T]): List[T] = {
30 |     val res = List.newBuilder[T]
31 |     val t = new Traverser {
32 |       override def traverse(t: Tree) =
33 |         f.lift(t) match {
34 |           case Some(v) =>
35 |             res += v
36 |           case None =>
37 |             super.traverse(t)
38 |         }
39 |     }
40 |     t.traverse(tree)
41 |     res.result()
42 |   }
43 | }
--------------------------------------------------------------------------------
/scalding-base/src/main/scala/com/twitter/scalding/UniqueID.scala:
--------------------------------------------------------------------------------
1 | package com.twitter.scalding
2 |
3 | /**
4 |  * Used to inject a typed unique identifier to uniquely name each scalding flow. This is here mostly to deal
5 |  * with the case of testing, where there are many concurrent threads running Flows. Users should never have
6 |  * to worry about these.
7 |  */
8 | case class UniqueID(get: String) {
9 |   require(get.indexOf(',') == -1, s"UniqueID cannot contain ,: $get")
10 | }
11 |
12 | object UniqueID {
13 |   val UNIQUE_JOB_ID = "scalding.job.uniqueId"
14 |   private val id = new java.util.concurrent.atomic.AtomicInteger(0)
15 |
16 |   def getRandom: UniqueID = {
17 |     // This number is unique as long as we don't create more than 10^6 per millisecond
18 |     // across separate jobs, which seems very unlikely.
19 |     val unique = (System.currentTimeMillis << 20) ^ (id.getAndIncrement.toLong)
20 |     UniqueID(unique.toString)
21 |   }
22 |
23 |   /**
24 |    * This is only safe if you use something known to have a single instance in the relevant scope.
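   * For example (illustrative; `flowDef` stands for whatever shared instance you key on):
   * {{{
   * val id = UniqueID.fromSystemHashCode(flowDef)
   * }}}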
25 |    *
26 |    * In Cascading, the FlowDef has been used for this.
27 |    */
28 |   def fromSystemHashCode(ar: AnyRef): UniqueID =
29 |     UniqueID(System.identityHashCode(ar).toString)
30 | }
--------------------------------------------------------------------------------
/scalding-base/src/main/scala/com/twitter/scalding/typed/functions/EqTypes.scala:
--------------------------------------------------------------------------------
1 | package com.twitter.scalding.typed.functions
2 |
3 | /**
4 |  * A more powerful version of =:= that allows us to remove casts and to avoid any runtime cost for
5 |  * function calls in some cases of trivial functions.
6 |  */
7 | sealed abstract class EqTypes[A, B] extends java.io.Serializable {
8 |   def apply(a: A): B
9 |   def subst[F[_]](f: F[A]): F[B]
10 |
11 |   final def reverse: EqTypes[B, A] = {
12 |     val aa = EqTypes.reflexive[A]
13 |     type F[T] = EqTypes[T, A]
14 |     subst[F](aa)
15 |   }
16 |
17 |   def toEv: A =:= B = {
18 |     val aa = implicitly[A =:= A]
19 |     type F[T] = A =:= T
20 |     subst[F](aa)
21 |   }
22 | }
23 |
24 | object EqTypes extends java.io.Serializable {
25 |   private[this] final case class ReflexiveEquality[A]() extends EqTypes[A, A] {
26 |     def apply(a: A): A = a
27 |     def subst[F[_]](f: F[A]): F[A] = f
28 |   }
29 |
30 |   implicit def reflexive[A]: EqTypes[A, A] = ReflexiveEquality()
31 |
32 |   def fromEv[A, B](ev: A =:= B): EqTypes[A, B] = // linter:disable:UnusedParameter
33 |     // in scala 2.13, this won't need a cast, but the cast is safe
34 |     reflexive[A].asInstanceOf[EqTypes[A, B]]
35 | }
--------------------------------------------------------------------------------
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
1 | name: Publish
2 |
3 | on:
4 |   push:
5 |     branches:
6 |       - "develop"
7 |     tags:
8 |       - "v*"
9 |
10 | jobs:
11 |   publish:
12 |     runs-on: ubuntu-latest
13 |
14 |     steps:
15 |       - name: Checkout Repo
16 |         uses: actions/checkout@v2
17 |         with:
18 |           fetch-depth: 0 # fetch all tags for sbt-dynver to properly resolve scalding version
19 |
20 |       - uses: actions/setup-java@v2
21 |         with:
22 |           distribution: "adopt-openj9"
23 |           java-version: '8.0.322+6' # builds break on non-3.3 hadoop versions, see https://issues.apache.org/jira/browse/HADOOP-16590
24 |
25 |       - name: Set up Ruby
26 |         uses: ruby/setup-ruby@v1
27 |         with:
28 |           ruby-version: 2.4
29 |
30 |       - name: Install Ruby Gems
31 |         run: |
32 |           gem install sass -v 3.7.4
33 |           gem install jekyll -v 3.2.1
34 |
35 |       - name: "Publish"
36 |         env:
37 |           PGP_PASSPHRASE: ${{ secrets.PGP_PASSPHRASE }}
38 |           PGP_SECRET: ${{ secrets.PGP_SECRET }}
39 |           SONATYPE_PASSWORD: ${{ secrets.SONATYPE_PASSWORD }}
40 |           SONATYPE_USERNAME: ${{ secrets.SONATYPE_USERNAME }}
41 |         run: |
42 |           ./sbt "ci-release"
--------------------------------------------------------------------------------
/scalding-commons/src/test/scala/com/twitter/scalding/commons/source/LzoGenericSourceSpec.scala:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright 2015 Twitter, Inc.
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.twitter.scalding.commons.source 17 | 18 | import com.twitter.bijection.JavaSerializationInjection 19 | import org.scalatest.{Matchers, WordSpec} 20 | import scala.util.Success 21 | 22 | class LzoGenericSourceSpec extends WordSpec with Matchers { 23 | "LzoGenericScheme" should { 24 | "be serializable" in { 25 | val scheme = LzoGenericScheme[Array[Byte]](IdentityBinaryConverter) 26 | val inj = JavaSerializationInjection[LzoGenericScheme[Array[Byte]]] 27 | inj.invert(inj.apply(scheme)) shouldBe Success(scheme) 28 | } 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /scalding-commons/src/main/scala/com/twitter/scalding/commons/source/FixedPathSources.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package com.twitter.scalding.commons.source 18 | 19 | import com.google.protobuf.Message 20 | import com.twitter.scalding._ 21 | import org.apache.thrift.TBase 22 | 23 | abstract class FixedPathLzoThrift[T <: TBase[_, _]: Manifest](path: String*) 24 | extends FixedPathSource(path: _*) 25 | with LzoThrift[T] { 26 | def column = manifest[T].runtimeClass 27 | } 28 | 29 | abstract class FixedPathLzoProtobuf[T <: Message: Manifest](path: String) 30 | extends FixedPathSource(path) 31 | with LzoProtobuf[T] { 32 | def column = manifest[T].runtimeClass 33 | } 34 | -------------------------------------------------------------------------------- /scalding-core/src/test/scala/com/twitter/scalding/estimation/memory/MemoryEstimatorStepStrategyTest.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.estimation.memory 2 | 3 | import org.apache.hadoop.mapred.JobConf 4 | import org.scalatest.{Matchers, WordSpec} 5 | 6 | class MemoryEstimatorStepStrategyTest extends WordSpec with Matchers { 7 | "A Memory estimator step strategy" should { 8 | "set xmx settings correctly" in { 9 | val conf = confWith("test.opts", "-Xmx3500m -Djava.net.preferIPv4Stack=true -Xms34m") 10 | 11 | MemoryEstimatorStepStrategy.setXmxMemory("test.opts", 1024, conf) 12 | 13 | conf.get("test.opts") shouldBe "-Djava.net.preferIPv4Stack=true -Xmx1024m" 14 | } 15 | 16 | "set xmx settings correctly with empty original config" in { 17 | val conf = confWith(Map.empty) 18 | 19 | MemoryEstimatorStepStrategy.setXmxMemory("test.opts", 1024, conf) 20 | 21 | conf.get("test.opts") shouldBe " -Xmx1024m" 22 | } 23 | } 24 | 25 | def confWith(key: String, value: String): JobConf = 26 | confWith(Map(key -> value)) 27 | 28 | def confWith(values: Map[String, String]): JobConf = { 29 | val conf = new JobConf(false) 30 | 31 | values.foreach { case (k, v) => 32 | conf.set(k, v) 33 | } 34 | 35 | conf 36 | } 37 | } 38 | 
-------------------------------------------------------------------------------- /scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/ColumnFormat.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.db.macros.impl.handler 2 | 3 | import scala.reflect.macros.Context 4 | 5 | import com.twitter.scalding.db.macros.impl.FieldName 6 | 7 | object ColumnFormat { 8 | def apply(c: Context)(fAccessor: List[c.universe.MethodSymbol], fType: String, size: Option[Int])(implicit 9 | fName: FieldName, 10 | isNullable: Boolean, 11 | defaultV: Option[c.Expr[String]] 12 | ): ColumnFormat[c.type] = 13 | new ColumnFormat[c.type](c) { 14 | val fieldAccessor = fAccessor 15 | val fieldType = fType 16 | val fieldName = fName 17 | val nullable = isNullable 18 | val sizeOpt = size 19 | val defaultValue = defaultV 20 | } 21 | } 22 | 23 | /** 24 | * Contains data format information for a column as defined in the case class. 25 | * 26 | * Used by the ColumnDefinitionProvider macro to generate column definitions and the JDBC ResultSet extractor. 27 | */ 28 | abstract class ColumnFormat[C <: Context](val ctx: C) { 29 | def fieldAccessor: List[ctx.universe.MethodSymbol] 30 | def fieldType: String 31 | def fieldName: FieldName 32 | def nullable: Boolean 33 | def sizeOpt: Option[Int] 34 | def defaultValue: Option[ctx.Expr[String]] 35 | } 36 | -------------------------------------------------------------------------------- /scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LzoCodecSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package com.twitter.scalding.commons.source 18 | 19 | import com.twitter.chill.Externalizer 20 | import com.twitter.bijection.Injection 21 | 22 | /** 23 | * Source used to write some type T into an LZO-compressed SequenceFile using a codec on T for serialization.
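 *
 * A hedged usage sketch; the path is illustrative and this assumes bijection's Injection.long2BigEndian:
 * {{{
 * import com.twitter.bijection.Injection
 * val longs = LzoCodecSource[Long]("hdfs://logs/part-1")(Injection.long2BigEndian)
 * }}}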
24 | */ 25 | 26 | object LzoCodecSource { 27 | def apply[T](paths: String*)(implicit passedInjection: Injection[T, Array[Byte]]) = 28 | new LzoCodec[T] { 29 | val hdfsPaths = paths 30 | val localPaths = paths 31 | val boxed = Externalizer(passedInjection) 32 | override def injection = boxed.get 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /scalding-core/src/test/scala/com/twitter/scalding/IterableExecutionSerializationTest.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding 2 | 3 | import com.twitter.bijection.JavaSerializationInjection 4 | import com.twitter.chill.KryoPool 5 | import com.twitter.chill.config.ScalaAnyRefMapConfig 6 | import com.twitter.scalding.serialization.{Externalizer, KryoHadoop} 7 | import com.twitter.scalding.source.TypedText 8 | import org.scalatest.FunSuite 9 | 10 | class ToIterableSerializationTest extends FunSuite { 11 | 12 | class Foo { 13 | val field = 42 14 | } 15 | 16 | val myFoo = new Foo 17 | val testIterableExecution = 18 | Execution.toIterable(TypedPipe.from(TypedText.tsv[Int]("foo")).map(_ * myFoo.field)) 19 | 20 | test("toIterableExecution should roundtrip") { 21 | 22 | val jInjection = JavaSerializationInjection[Externalizer[Execution[Iterable[Int]]]] 23 | val externalizer = Externalizer(testIterableExecution) 24 | 25 | assert(jInjection.invert(jInjection(externalizer)).isSuccess) 26 | } 27 | test("testing kryo") { 28 | val kryo = new KryoHadoop(ScalaAnyRefMapConfig(Map("scalding.kryo.setreferences" -> "true"))) 29 | val kryoPool = KryoPool.withByteArrayOutputStream(1, kryo) 30 | assert(scala.util.Try(kryoPool.deepCopy(testIterableExecution)).isSuccess) 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /scalding-dagon/src/test/scala/com/twitter/scalding/dagon/MemoizeTests.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.dagon 2 | 3 | import org.scalatest.FunSuite 4 | 5 | class MemoizeTests extends FunSuite { 6 | test("fibonacci is linear in time") { 7 | 8 | var calls = 0 9 | 10 | val fib = 11 | Memoize.function[Int, Long] { (i, f) => 12 | calls += 1 13 | 14 | i match { 15 | case 0 => 0 16 | case 1 => 1 17 | case i => f(i - 1) + f(i - 2) 18 | } 19 | } 20 | 21 | def fib2(n: Int, x: Long, y: Long): Long = 22 | if (n == 0) x 23 | else fib2(n - 1, y, x + y) 24 | 25 | assert(fib(100) == fib2(100, 0L, 1L)) 26 | assert(calls == 101) 27 | } 28 | 29 | test("functionK repeated calls only evaluate once") { 30 | 31 | var calls = 0 32 | val fn = 33 | Memoize.functionK[BoolT, BoolT](new Memoize.RecursiveK[BoolT, BoolT] { 34 | def toFunction[T] = { case (b, rec) => 35 | calls += 1 36 | 37 | !b 38 | } 39 | }) 40 | 41 | assert(fn(true) == false) 42 | assert(calls == 1) 43 | assert(fn(true) == false) 44 | assert(calls == 1) 45 | 46 | assert(fn(false) == true) 47 | assert(calls == 2) 48 | assert(fn(false) == true) 49 | assert(calls == 2) 50 | 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /scalding-db/src/main/scala/com/twitter/scalding/db/package.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2015 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.twitter.scalding.db 17 | 18 | import scala.language.experimental.{macros => sMacros} 19 | 20 | import com.twitter.scalding.db.macros.impl.{ColumnDefinitionProviderImpl, DBTypeDescriptorImpl} 21 | 22 | // The implicits in the jdbc macros package. 23 | // These allow us to automatically provide our type classes without requiring the user to know 24 | // all of the various ways we could build them. 25 | package object macros { 26 | implicit def toColumnDefinitionProvider[T]: ColumnDefinitionProvider[T] = 27 | macro ColumnDefinitionProviderImpl[T] 28 | implicit def toDBTypeDescriptor[T]: DBTypeDescriptor[T] = macro DBTypeDescriptorImpl[T] 29 | } 30 | -------------------------------------------------------------------------------- /scalding-serialization/src/main/java/com/twitter/scalding/serialization/Undeprecated.java: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2015 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.twitter.scalding.serialization; 17 | 18 | public class Undeprecated { 19 | /** 20 | * This method is faster for ASCII data, but unsafe otherwise. 21 | * It is used by our macros AFTER checking that the string is ASCII, 22 | * following a pattern seen in Kryo, which benchmarking showed helped. 23 | * Scala cannot suppress warnings like this, so we do it here. 24 | */ 25 | @SuppressWarnings("deprecation") 26 | public static void getAsciiBytes(String element, int charStart, int charLen, byte[] bytes, int byteOffset) { 27 | element.getBytes(charStart, charLen, bytes, byteOffset); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/OptionalSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License.
15 | */ 16 | package com.twitter.scalding 17 | 18 | import scala.util.{Failure, Success, Try} 19 | import cascading.tap.Tap 20 | 21 | case class OptionalSource[T](src: Mappable[T]) extends Source with Mappable[T] { 22 | override def converter[U >: T] = TupleConverter.asSuperConverter(src.converter) 23 | 24 | def createTap(readOrWrite: AccessMode)(implicit mode: Mode): Tap[_, _, _] = 25 | Try(src.validateTaps(mode)) match { 26 | case Success(_) => 27 | src.createTap(readOrWrite) 28 | case Failure(_) => 29 | IterableSource[T](Nil)(TupleSetter.singleSetter[T], src.converter) 30 | .createTap(readOrWrite) 31 | .asInstanceOf[Tap[_, _, _]] 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/TypedPipeChecker.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding 2 | 3 | /** 4 | * This object is used to assist with testing a TypedPipe 5 | */ 6 | object TypedPipeChecker { 7 | 8 | /* 9 | * Takes a List and a transform function. 10 | * The resulting TypedPipe from the transform will be run through the assertions 11 | */ 12 | def checkOutputTransform[T, U, R](input: List[T])(transform: TypedPipe[T] => TypedPipe[U])( 13 | assertions: List[U] => R 14 | ): R = 15 | assertions(inMemoryToList(transform(TypedPipe.from(input)))) 16 | 17 | /* 18 | * Execute a TypedPipe in memory, convert the resulting Iterator to 19 | * a list and run it through a function that makes arbitrary 20 | * assertions on it. 21 | */ 22 | def checkOutput[T, R](output: TypedPipe[T])(assertions: List[T] => R): R = 23 | assertions(inMemoryToList(output)) 24 | 25 | /** 26 | * Execute a TypedPipe in memory and return the result as a List 27 | */ 28 | def inMemoryToList[T](output: TypedPipe[T]): List[T] = 29 | output.toIterableExecution 30 | .waitFor(Config.unitTestDefault, Local(strictSources = true)) 31 | .get 32 | .toList 33 | 34 | implicit class InMemoryToListEnrichment[T](val pipe: TypedPipe[T]) extends AnyVal { 35 | def inMemoryToList: List[T] = TypedPipeChecker.inMemoryToList(pipe) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /scalding-core/codegen/typed_source_generator.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | # Run it like this: 4 | # 5 | # ./codegen/typed_source_generator.rb > src/main/scala/com/twitter/scalding/typed/GeneratedTypedSource.scala 6 | 7 | $indent = " " 8 | 9 | TYPES = ('A'..'Z').to_a 10 | 11 | def make_typed_source(cnt) 12 | other_cnts = (1..(22-cnt)).to_a 13 | typeString = TYPES[0..(cnt - 1)].join(",") 14 | puts "trait TypedSource#{cnt}[#{typeString}] extends TypedSource[Tuple#{cnt}[#{typeString}]] {" 15 | puts "#{$indent}def converter[Z >: Tuple#{cnt}[#{typeString}]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple#{cnt}[#{typeString}]])" 16 | puts "}" 17 | end 18 | 19 | def make_typed_sink(cnt) 20 | other_cnts = (1..(22-cnt)).to_a 21 | typeString = TYPES[0..(cnt - 1)].join(",") 22 | puts "trait TypedSink#{cnt}[#{typeString}] extends TypedSink[Tuple#{cnt}[#{typeString}]] {" 23 | puts "#{$indent}final def setter[Z <: Tuple#{cnt}[#{typeString}]] = TupleSetter.asSubSetter(TupleSetter.of[Tuple#{cnt}[#{typeString}]])" 24 | puts "}" 25 | end 26 | 27 | puts "// following were autogenerated by #{__FILE__} at #{Time.now} do not edit" 28 | puts %q|package com.twitter.scalding 29 | 30 | | 31 | 32 | (1..22).each { |c|
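  # A hedged illustration of the emitted output (not printed by this script):
  # for c = 2, the call below prints
  #
  #   trait TypedSource2[A,B] extends TypedSource[Tuple2[A,B]] {
  #     def converter[Z >: Tuple2[A,B]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple2[A,B]])
  #   }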
make_typed_source(c) 34 | puts 35 | } 36 | 37 | (1..22).each { |c| 38 | make_typed_sink(c) 39 | puts 40 | } 41 | 42 | puts "// end of autogenerated" 43 | -------------------------------------------------------------------------------- /scalding-core/src/test/scala/com/twitter/scalding/TypedSketchJoinJobForEmptyKeysTest.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding 2 | 3 | import org.scalatest.{Matchers, WordSpec} 4 | 5 | class TypedSketchJoinJobForEmptyKeys(args: Args) extends Job(args) { 6 | // Deal with when a key appears in left but not right 7 | val leftTypedPipe = TypedPipe.from(List((1, 1111))) 8 | val rightTypedPipe = TypedPipe.from(List((3, 3333), (4, 4444))) 9 | 10 | implicit def serialize(k: Int): Array[Byte] = k.toString.getBytes 11 | 12 | val sketched = leftTypedPipe 13 | .sketch(1) 14 | .leftJoin(rightTypedPipe) 15 | 16 | // this tests that a TypedPipe.Keyed method works: 17 | sketched.values 18 | 19 | sketched 20 | .map { case (a, (b, c)) => 21 | (a, b, c.getOrElse(-1)) 22 | } 23 | .write(TypedTsv("output")) 24 | } 25 | 26 | class TypedSketchJoinJobForEmptyKeysTest extends WordSpec with Matchers { 27 | "A TypedSketchJoinJobForEmptyKeysTest" should { 28 | "Sketch leftJoin with a single left key should be correct" in { 29 | JobTest(new TypedSketchJoinJobForEmptyKeys(_)) 30 | .sink[(Int, Int, Int)](TypedTsv[(Int, Int, Int)]("output")) { outBuf => 31 | outBuf should have size 1 32 | val unordered = outBuf.toSet 33 | unordered should contain(1, 1111, -1) 34 | } 35 | .run 36 | .finish() 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/NumericTypeHandler.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.db.macros.impl.handler 2 | 3 | import scala.reflect.macros.Context 4 | import scala.util.{Failure, Success} 5 | 6 | import com.twitter.scalding.db.macros.impl.FieldName 7 | 8 | object NumericTypeHandler { 9 | def apply[T](c: Context)(implicit 10 | accessorTree: List[c.universe.MethodSymbol], 11 | fieldName: FieldName, 12 | defaultValue: Option[c.Expr[String]], 13 | annotationInfo: List[(c.universe.Type, Option[Int])], 14 | nullable: Boolean, 15 | numericType: String 16 | ): scala.util.Try[List[ColumnFormat[c.type]]] = { 17 | 18 | val helper = new { 19 | val ctx: c.type = c 20 | val cfieldName = fieldName 21 | val cannotationInfo = annotationInfo 22 | } with AnnotationHelper 23 | 24 | val extracted = for { 25 | (nextHelper, sizeAnno) <- helper.sizeAnnotation 26 | _ <- nextHelper.validateFinished 27 | } yield sizeAnno 28 | 29 | extracted.flatMap { 30 | case WithSize(s) if s > 0 => Success(List(ColumnFormat(c)(accessorTree, numericType, Some(s)))) 31 | case WithSize(s) => Failure(new Exception(s"Int field $fieldName has a size defined that is <= 0.")) 32 | case WithoutSize => Success(List(ColumnFormat(c)(accessorTree, numericType, None))) 33 | } 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/BijectedOrderedSerialization.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2014 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.twitter.scalding 17 | 18 | import com.twitter.scalding.serialization.OrderedSerialization 19 | import com.twitter.bijection.{ImplicitBijection, Injection} 20 | 21 | object BijectedOrderedSerialization { 22 | implicit def fromBijection[T, U](implicit 23 | bij: ImplicitBijection[T, U], 24 | ordSer: OrderedSerialization[U] 25 | ): OrderedSerialization[T] = 26 | OrderedSerialization.viaTransform[T, U](bij.apply(_), bij.invert(_)) 27 | 28 | implicit def fromInjection[T, U](implicit 29 | bij: Injection[T, U], 30 | ordSer: OrderedSerialization[U] 31 | ): OrderedSerialization[T] = 32 | OrderedSerialization.viaTryTransform[T, U](bij.apply(_), bij.invert(_)) 33 | } 34 | -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/estimation/Common.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.estimation 2 | 3 | import cascading.flow.FlowStep 4 | import cascading.tap.hadoop.Hfs 5 | import cascading.tap.{CompositeTap, Tap} 6 | import com.twitter.scalding.tap.GlobHfs 7 | import org.apache.hadoop.mapred.JobConf 8 | import org.slf4j.LoggerFactory 9 | import scala.collection.JavaConverters._ 10 | 11 | object Common { 12 | private[this] val LOG = LoggerFactory.getLogger(this.getClass) 13 | 14 | private def unrollTaps(taps: Seq[Tap[_, _, _]]): Seq[Tap[_, _, _]] = 15 | taps.flatMap { 16 | case multi: CompositeTap[_] => 17 | unrollTaps(multi.getChildTaps.asScala.toSeq) 18 | case t => Seq(t) 19 | } 20 | 21 | def unrollTaps(step: FlowStep[JobConf]): Seq[Tap[_, _, _]] = 22 | unrollTaps(step.getSources.asScala.toSeq) 23 | 24 | def inputSizes(step: FlowStep[JobConf]): Seq[(String, Long)] = { 25 | val conf = step.getConfig 26 | unrollTaps(step).flatMap { 27 | case tap: GlobHfs => Some(tap.toString -> tap.getSize(conf)) 28 | case tap: Hfs => Some(tap.toString -> GlobHfs.getSize(tap.getPath, conf)) 29 | case tap => 30 | LOG.warn("InputSizeReducerEstimator unable to calculate size: " + tap) 31 | None 32 | } 33 | } 34 | 35 | def totalInputSize(step: FlowStep[JobConf]): Long = inputSizes(step).map(_._2).sum 36 | } 37 | -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/serialization/MultiJoinExternalizer.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.serialization 2 | 3 | import com.twitter.scalding.typed.MultiJoinFunction 4 | 5 | object MultiJoinExternalizer { 6 | import MultiJoinFunction.Transformer 7 | 8 | final case class ExternalizeMapGroup[A, B, C](@transient fn: (A, Iterator[B]) => Iterator[C]) 9 | extends Function2[A, Iterator[B], Iterator[C]] { 10 | private[this] val fnEx = Externalizer(fn) 11 | 12 | def apply(a: A, it: Iterator[B]) = fnEx.get(a, it) 13 | } 14 | 15 | final case class ExternalizeJoin[A, B, C, D](@transient fn: (A, Iterator[B], Iterable[C]) => Iterator[D]) 16 | extends Function3[A, Iterator[B], Iterable[C], Iterator[D]] { 17 | private[this] val fnEx = 
Externalizer(fn) 18 | 19 | def apply(a: A, bs: Iterator[B], cs: Iterable[C]) = fnEx.get(a, bs, cs) 20 | } 21 | 22 | private[this] object ExtTrans extends Transformer { 23 | def transformJoin[A, B, C, D]( 24 | fn: (A, Iterator[B], Iterable[C]) => Iterator[D] 25 | ): (A, Iterator[B], Iterable[C]) => Iterator[D] = 26 | ExternalizeJoin(fn) 27 | def transformMap[A, B, C](fn: (A, Iterator[B]) => Iterator[C]): (A, Iterator[B]) => Iterator[C] = 28 | ExternalizeMapGroup(fn) 29 | } 30 | 31 | def externalize[A, B](mjf: MultiJoinFunction[A, B]): MultiJoinFunction[A, B] = 32 | ExtTrans(mjf) 33 | } 34 | -------------------------------------------------------------------------------- /scalding-commons/src/test/scala/com/twitter/scalding/WordCountTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.twitter.scalding 17 | 18 | import org.scalatest.{Matchers, WordSpec} 19 | 20 | class WordCountTest extends WordSpec with Matchers { 21 | "A WordCount job" should { 22 | JobTest(new com.twitter.scalding.examples.WordCountJob(_)) 23 | .arg("input", "inputFile") 24 | .arg("output", "outputFile") 25 | .source(TextLine("inputFile"), List((0, "hack hack hack and hack"))) 26 | .sink[(String, Int)](TypedTsv[(String, Long)]("outputFile")) { outputBuffer => 27 | val outMap = outputBuffer.toMap 28 | "count words correctly" in { 29 | outMap("hack") shouldBe 4 30 | outMap("and") shouldBe 1 31 | } 32 | } 33 | .run 34 | .finish() 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetScrooge.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.parquet.scrooge 2 | 3 | import cascading.scheme.Scheme 4 | import com.twitter.scalding._ 5 | import com.twitter.scalding.parquet.thrift.ParquetThriftBaseFileSource 6 | import com.twitter.scalding.source.{DailySuffixSource, HourlySuffixSource} 7 | import com.twitter.scrooge.ThriftStruct 8 | 9 | import scala.reflect.ClassTag 10 | 11 | trait ParquetScrooge[T <: ThriftStruct] extends ParquetThriftBaseFileSource[T] { 12 | 13 | override def hdfsScheme = { 14 | // See docs in Parquet346ScroogeScheme 15 | val scheme = new Parquet346ScroogeScheme[T](this.config) 16 | HadoopSchemeInstance(scheme.asInstanceOf[Scheme[_, _, _, _, _]]) 17 | } 18 | 19 | } 20 | 21 | class DailySuffixParquetScrooge[T <: ThriftStruct](path: String, dateRange: DateRange)(implicit 22 | override val ct: ClassTag[T] 23 | ) extends DailySuffixSource(path, dateRange) 24 | with ParquetScrooge[T] 25 | 26 | class HourlySuffixParquetScrooge[T <: ThriftStruct](path: String, dateRange: DateRange)(implicit 27 | override val ct: ClassTag[T] 28 | ) extends HourlySuffixSource(path, dateRange) 29 | with ParquetScrooge[T] 30 | 31 | class FixedPathParquetScrooge[T <: 
ThriftStruct](paths: String*)(implicit override val ct: ClassTag[T]) 32 | extends FixedPathSource(paths: _*) 33 | with ParquetScrooge[T] 34 | -------------------------------------------------------------------------------- /scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ParquetScroogeInputFormat.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | package com.twitter.scalding.parquet.scrooge; 20 | 21 | import org.apache.parquet.hadoop.thrift.ParquetThriftInputFormat; 22 | 23 | /** 24 | * Use this class to read Scrooge records from a Parquet file 25 | * @param <T> Type of Scrooge records to read 26 | */ 27 | public class ParquetScroogeInputFormat<T> extends ParquetThriftInputFormat<T> { 28 | public ParquetScroogeInputFormat() { 29 | super(ScroogeReadSupport.class); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/macros/MacroImplicits.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.twitter.scalding.macros 17 | 18 | import scala.language.experimental.macros 19 | 20 | import com.twitter.scalding._ 21 | import com.twitter.scalding.macros.impl._ 22 | 23 | object MacroImplicits { 24 | 25 | /** 26 | * These methods provide proof that the given type is a case class.
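 *
 * A hedged sketch of the effect; Person is a hypothetical case class, not part of this file:
 * {{{
 * import com.twitter.scalding.macros.MacroImplicits._
 * case class Person(name: String, age: Int)
 * val setter = implicitly[TupleSetter[Person]]       // materialized by the macro below
 * val converter = implicitly[TupleConverter[Person]] // likewise
 * }}}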
27 | */ 28 | implicit def materializeCaseClassTupleSetter[T]: TupleSetter[T] = 29 | macro TupleSetterImpl.caseClassTupleSetterImpl[T] 30 | implicit def materializeCaseClassTupleConverter[T]: TupleConverter[T] = 31 | macro TupleConverterImpl.caseClassTupleConverterImpl[T] 32 | implicit def materializeCaseClassTypeDescriptor[T]: TypeDescriptor[T] = 33 | macro TypeDescriptorProviderImpl.caseClassTypeDescriptorImpl[T] 34 | } 35 | -------------------------------------------------------------------------------- /scalding-core/src/test/scala/com/twitter/scalding/TestTapFactoryTest.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding 2 | 3 | import cascading.tap.Tap 4 | import cascading.tuple.{Fields, Tuple} 5 | import scala.collection.mutable.Buffer 6 | import org.scalatest.{Matchers, WordSpec} 7 | 8 | class TestTapFactoryTest extends WordSpec with Matchers { 9 | "A test tap created by TestTapFactory" should { 10 | "error helpfully when a source is not in the map for test buffers" in { 11 | // Source to use for this test. 12 | val testSource = Tsv("path") 13 | 14 | // Map of sources to use when creating the tap; it does not contain testSource 15 | val emptySourceMap = Map[Source, Buffer[Tuple]]() 16 | 17 | val testMode = Test(emptySourceMap.get(_)) 18 | val testTapFactory = TestTapFactory(testSource, new Fields()) 19 | 20 | def createIllegalTap(accessMode: AccessMode): Tap[Any, Any, Any] = 21 | testTapFactory.createTap(accessMode)(testMode).asInstanceOf[Tap[Any, Any, Any]] 22 | 23 | (the[IllegalArgumentException] thrownBy { 24 | createIllegalTap(Read) 25 | } should have).message("requirement failed: " + TestTapFactory.sourceNotFoundError.format(testSource)) 26 | 27 | (the[IllegalArgumentException] thrownBy { 28 | createIllegalTap(Write) 29 | } should have).message("requirement failed: " + TestTapFactory.sinkNotFoundError.format(testSource)) 30 | } 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/serialization/Externalizer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License.
15 | */ 16 | package com.twitter.scalding.serialization 17 | 18 | import com.twitter.chill.{Externalizer => ChillExtern} 19 | 20 | import com.esotericsoftware.kryo.DefaultSerializer 21 | import com.esotericsoftware.kryo.serializers.JavaSerializer 22 | 23 | import com.twitter.chill.config.ScalaAnyRefMapConfig 24 | 25 | /** 26 | * We need to control the Kryo instance that gets created 27 | */ 28 | object Externalizer { 29 | def apply[T](t: T): Externalizer[T] = { 30 | val e = new Externalizer[T] 31 | e.set(t) 32 | e 33 | } 34 | } 35 | 36 | @DefaultSerializer(classOf[JavaSerializer]) 37 | class Externalizer[T] extends ChillExtern[T] { 38 | protected override def kryo = 39 | new KryoHadoop(ScalaAnyRefMapConfig(Map("scalding.kryo.setreferences" -> "true"))) 40 | } 41 | -------------------------------------------------------------------------------- /scalding-parquet/src/main/java/com/twitter/scalding/parquet/tuple/SchemaIntersection.java: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.parquet.tuple; 2 | 3 | import org.apache.parquet.schema.MessageType; 4 | import org.apache.parquet.schema.Type; 5 | 6 | import cascading.tuple.Fields; 7 | 8 | import java.util.List; 9 | import java.util.ArrayList; 10 | 11 | public class SchemaIntersection { 12 | 13 | private final MessageType requestedSchema; 14 | private final Fields sourceFields; 15 | 16 | public SchemaIntersection(MessageType fileSchema, Fields requestedFields) { 17 | if(requestedFields == Fields.UNKNOWN) 18 | requestedFields = Fields.ALL; 19 | 20 | Fields newFields = Fields.NONE; 21 | List<Type> newSchemaFields = new ArrayList<Type>(); 22 | int schemaSize = fileSchema.getFieldCount(); 23 | 24 | for (int i = 0; i < schemaSize; i++) { 25 | Type type = fileSchema.getType(i); 26 | Fields name = new Fields(type.getName()); 27 | 28 | if(requestedFields.contains(name)) { 29 | newFields = newFields.append(name); 30 | newSchemaFields.add(type); 31 | } 32 | } 33 | 34 | this.sourceFields = newFields; 35 | this.requestedSchema = new MessageType(fileSchema.getName(), newSchemaFields); 36 | } 37 | 38 | public MessageType getRequestedSchema() { 39 | return requestedSchema; 40 | } 41 | 42 | public Fields getSourceFields() { 43 | return sourceFields; 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /scalding-serialization/src/main/scala/com/twitter/scalding/serialization/UnsignedComparisons.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2015 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License.
15 | */ 16 | 17 | package com.twitter.scalding.serialization 18 | 19 | object UnsignedComparisons { 20 | final def unsignedLongCompare(a: Long, b: Long): Int = if (a == b) 0 21 | else { 22 | val xor = a ^ b 23 | // If xor >= 0, then a and b are on the same side of zero 24 | if (xor >= 0L) java.lang.Long.compare(a, b) 25 | else if (b >= 0L) 1 26 | else -1 27 | } 28 | final def unsignedIntCompare(a: Int, b: Int): Int = 29 | java.lang.Long.compare(a.toLong & 0xffffffffL, b.toLong & 0xffffffffL) 30 | 31 | final def unsignedShortCompare(a: Short, b: Short): Int = 32 | Integer.compare(a & 0xffff, b & 0xffff) 33 | 34 | final def unsignedByteCompare(a: Byte, b: Byte): Int = 35 | Integer.compare(a & 0xff, b & 0xff) 36 | } 37 | -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/source/MaxFailuresCheck.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package com.twitter.scalding.source 18 | 19 | import com.twitter.bijection.Injection 20 | import java.util.concurrent.atomic.AtomicInteger 21 | 22 | // TODO: this should actually increment and read a Hadoop counter 23 | class MaxFailuresCheck[T, U](val maxFailures: Int)(implicit override val injection: Injection[T, U]) 24 | extends CheckedInversion[T, U] { 25 | 26 | private val failures = new AtomicInteger(0) 27 | def apply(input: U): Option[T] = 28 | try { 29 | Some(injection.invert(input).get) 30 | } catch { 31 | case e: Exception => 32 | // TODO: use proper logging 33 | e.printStackTrace() 34 | assert(failures.incrementAndGet <= maxFailures, "maximum decoding errors exceeded") 35 | None 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /scalding-base/src/main/scala/com/twitter/scalding/typed/WithDescription.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2015 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.twitter.scalding.typed 17 | 18 | /** 19 | * Used for objects that may have a description set to be used in .dot and MR step names. 20 | */ 21 | trait HasDescription { 22 | def descriptions: Seq[String] 23 | } 24 | 25 | /** 26 | * Used for objects that may _set_ a description to be used in .dot and MR step names.
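 *
 * A sketch of the contract, per the default implementation below: withDescription(Some("step"))
 * delegates to withDescription("step"), while withDescription(None) returns this object unchanged.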
27 | */ 28 | trait WithDescription[+This <: WithDescription[This]] extends HasDescription { self: This => 29 | 30 | /** never mutates this, instead returns a new item. */ 31 | def withDescription(description: String): This 32 | 33 | def withDescription(descriptionOpt: Option[String]): This = 34 | descriptionOpt match { 35 | case Some(description) => withDescription(description) 36 | case None => self 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /scalding-hadoop-test/src/test/scala/com/twitter/scalding/platform/TestJobsWithDescriptions.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.platform 2 | 3 | import com.twitter.scalding._ 4 | 5 | /* 6 | * These jobs are used in PlatformTests that test correct line numbers in descriptions. 7 | * Placing them in a separate file means we don't have to update the tests that care about 8 | * line numbers when PlatformTest.scala changes for unrelated reasons. 9 | */ 10 | 11 | class TypedPipeJoinWithDescriptionJob(args: Args) extends Job(args) { 12 | PlatformTest.setAutoForceRight(mode, true) 13 | 14 | val x = TypedPipe.from[(Int, Int)](List((1, 1))) 15 | val y = TypedPipe.from[(Int, String)](List((1, "first"))) 16 | val z = TypedPipe.from[(Int, Boolean)](List((2, true))).group 17 | 18 | x.hashJoin(y) // this triggers an implicit that somehow pushes the line number to the next one 19 | .withDescription("hashJoin") 20 | .leftJoin(z) 21 | .withDescription("leftJoin") 22 | .values 23 | .write(TypedTsv[((Int, String), Option[Boolean])]("output")) 24 | } 25 | 26 | class TypedPipeWithDescriptionJob(args: Args) extends Job(args) { 27 | TypedPipe 28 | .from[String](List("word1", "word1", "word2")) 29 | .withDescription("map stage - assign words to 1") 30 | .map(w => (w, 1L)) 31 | .group 32 | .withDescription("reduce stage - sum") 33 | .sum 34 | .withDescription("write") 35 | .write(TypedTsv[(String, Long)]("output")) 36 | } 37 | -------------------------------------------------------------------------------- /scalding-db/src/main/scala/com/twitter/scalding/db/extensions/VerticaExtensions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2015 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | package com.twitter.scalding.db.extensions 18 | 19 | import com.twitter.scalding.db._ 20 | 21 | object VerticaExtensions { 22 | def verticaMutator: PartialFunction[DBColumnDefinition, DBColumnDefinition] = { 23 | case t @ DBColumnDefinition(BIGINT, _, _, None, _, _) => t.copy(sizeOpt = None) 24 | case t @ DBColumnDefinition(INT, _, _, None, _, _) => t.copy(sizeOpt = None) 25 | case t @ DBColumnDefinition(SMALLINT, _, _, None, _, _) => t.copy(sizeOpt = None) 26 | case t @ DBColumnDefinition(BOOLEAN, _, _, None, _, _) => t.copy(sizeOpt = None) 27 | case t @ DBColumnDefinition(TINYINT, _, _, None, _, _) => t.copy(sizeOpt = None) 28 | case t @ DBColumnDefinition(DOUBLE, _, _, _, _, _) => t.copy(sqlType = SqlTypeName("DOUBLE PRECISION")) 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /scalding-quotation/src/main/scala/com/twitter/scalding/quotation/Quoted.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.quotation 2 | 3 | import java.io.File 4 | 5 | /** 6 | * Meta information about a method call. 7 | */ 8 | case class Quoted(position: Source, text: Option[String], projections: Projections) { 9 | override def toString = s"$position ${text.getOrElse("")}" 10 | } 11 | 12 | object Quoted { 13 | import language.experimental.macros 14 | implicit def method: Quoted = macro QuotedMacro.method 15 | 16 | private[scalding] def internal: Quoted = macro QuotedMacro.internal 17 | 18 | def function[T1, U](f: T1 => U): Function1[T1, U] with QuotedFunction = macro QuotedMacro.function 19 | def function[T1, T2, U](f: (T1, T2) => U): Function2[T1, T2, U] with QuotedFunction = 20 | macro QuotedMacro.function 21 | def function[T1, T2, T3, U](f: (T1, T2, T3) => U): Function3[T1, T2, T3, U] with QuotedFunction = 22 | macro QuotedMacro.function 23 | def function[T1, T2, T3, T4, U](f: (T1, T2, T3, T4) => U): Function4[T1, T2, T3, T4, U] 24 | with QuotedFunction = macro QuotedMacro.function 25 | def function[T1, T2, T3, T4, T5, U](f: (T1, T2, T3, T4, T5) => U): Function5[T1, T2, T3, T4, T5, U] 26 | with QuotedFunction = macro QuotedMacro.function 27 | } 28 | 29 | case class Source(path: String, line: Int) { 30 | def classFile = path.split(File.separator).last 31 | override def toString = s"$classFile:$line" 32 | } 33 | 34 | trait QuotedFunction { 35 | def quoted: Quoted 36 | } 37 | -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/estimation/HistoryService.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.estimation 2 | 3 | import scala.util.Try 4 | 5 | /** 6 | * Info about a prior FlowStep, provided by implementers of HistoryService 7 | */ 8 | final case class FlowStepHistory( 9 | keys: FlowStepKeys, 10 | submitTimeMillis: Long, 11 | launchTimeMillis: Long, 12 | finishTimeMillis: Long, 13 | totalMaps: Long, 14 | totalReduces: Long, 15 | finishedMaps: Long, 16 | finishedReduces: Long, 17 | failedMaps: Long, 18 | failedReduces: Long, 19 | mapFileBytesRead: Long, 20 | mapFileBytesWritten: Long, 21 | mapOutputBytes: Long, 22 | reduceFileBytesRead: Long, 23 | hdfsBytesRead: Long, 24 | hdfsBytesWritten: Long, 25 | mapperTimeMillis: Long, 26 | reducerTimeMillis: Long, 27 | reduceShuffleBytes: Long, 28 | cost: Double, 29 | tasks: Seq[Task] 30 | ) 31 | 32 | final case class FlowStepKeys( 33 | jobName: String, 34 | user: String, 35 | 
priority: String, 36 | status: String, 37 | version: String, 38 | queue: String 39 | ) 40 | 41 | final case class Task(details: Map[String, Any], counters: Map[String, Long]) { 42 | def taskType: Option[String] = details.get(Task.TaskType).map(_.asInstanceOf[String]) 43 | } 44 | 45 | object Task { 46 | val TaskType = "taskType" 47 | } 48 | 49 | trait HistoryService { 50 | def fetchHistory(info: FlowStrategyInfo, maxHistory: Int): Try[Seq[FlowStepHistory]] 51 | } 52 | -------------------------------------------------------------------------------- /scalding-dagon/src/test/scala/com/twitter/scalding/dagon/CacheTests.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.dagon 2 | 3 | import org.scalacheck.Prop._ 4 | import org.scalacheck.{Arbitrary, Cogen, Properties} 5 | 6 | abstract class CacheTests[K: Cogen: Arbitrary, V: Arbitrary](name: String) extends Properties(name) { 7 | 8 | def buildMap(c: Cache[K, V], ks: Iterable[K], f: K => V): Map[K, V] = 9 | ks.iterator.foldLeft(Map.empty[K, V]) { (m, k) => 10 | m.updated(k, c.getOrElseUpdate(k, f(k))) 11 | } 12 | 13 | property("getOrElseUpdate") = forAll { (f: K => V, k: K, v1: V, v2: V) => 14 | val c = Cache.empty[K, V] 15 | var count = 0 16 | val x = c.getOrElseUpdate(k, { count += 1; v1 }) 17 | val y = c.getOrElseUpdate(k, { count += 1; v2 }) 18 | x == v1 && y == v1 && count == 1 19 | } 20 | 21 | property("toMap") = forAll { (f: K => V, ks: Set[K]) => 22 | val c = Cache.empty[K, V] 23 | val m = buildMap(c, ks, f) 24 | c.toMap == m 25 | } 26 | 27 | property("duplicate") = forAll { (f: K => V, ks: Set[K]) => 28 | val c = Cache.empty[K, V] 29 | val d = c.duplicate 30 | buildMap(c, ks, f) 31 | d.toMap.isEmpty 32 | } 33 | 34 | property("reset works") = forAll { (f: K => V, ks: Set[K]) => 35 | val c = Cache.empty[K, V] 36 | buildMap(c, ks, f) 37 | val d = c.duplicate 38 | c.reset() 39 | c.toMap.isEmpty && d.toMap.size == ks.size 40 | } 41 | } 42 | 43 | object CacheTestsSL extends CacheTests[String, Long]("CacheTests[String, Long]") 44 | -------------------------------------------------------------------------------- /scalding-base/src/main/scala/com/twitter/scalding/Mode.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding 2 | 3 | trait Mode extends java.io.Serializable { 4 | 5 | /** 6 | * Make the Execution.Writer for this platform 7 | */ 8 | def newWriter(): Execution.Writer 9 | 10 | /** 11 | * Config.defaultForMode converts this map into a Config (we don't use Config here to avoid a circular 12 | * dependency) 13 | */ 14 | def defaultConfig: Map[String, String] = Map.empty 15 | } 16 | 17 | object Mode { 18 | 19 | /** 20 | * This is an Args and a Mode together. It is used purely as a work-around for the fact that Job only accepts 21 | * an Args object, but needs a Mode inside.
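 *
 * A sketch of the round-trip implemented by putMode/getMode below:
 * {{{
 * val decorated = Mode.putMode(mode, args)
 * Mode.getMode(decorated) // Some(mode)
 * Mode.getMode(args)      // None, for a plain Args
 * }}}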
22 | */ 23 | private class ArgsWithMode(argsMap: Map[String, List[String]], val mode: Mode) extends Args(argsMap) { 24 | override def +(keyvals: (String, Iterable[String])): Args = 25 | new ArgsWithMode(super.+(keyvals).m, mode) 26 | } 27 | 28 | /** Attach a mode to these Args and return the new Args */ 29 | def putMode(mode: Mode, args: Args): Args = new ArgsWithMode(args.m, mode) 30 | 31 | /** Get a Mode if this Args was the result of a putMode */ 32 | def getMode(args: Args): Option[Mode] = args match { 33 | case withMode: ArgsWithMode => Some(withMode.mode) 34 | case _ => None 35 | } 36 | } 37 | 38 | case class ModeException(message: String) extends RuntimeException(message) 39 | case class ModeLoadException(message: String, origin: ClassNotFoundException) extends RuntimeException(origin) 40 | -------------------------------------------------------------------------------- /scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeSchemaConversionException.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 
18 | */ 19 | package com.twitter.scalding.parquet.scrooge; 20 | 21 | import org.apache.parquet.ParquetRuntimeException; 22 | 23 | /** 24 | * Throw this exception when there is an error converting a Scrooge class to 25 | * a Thrift schema 26 | */ 27 | class ScroogeSchemaConversionException extends ParquetRuntimeException { 28 | public ScroogeSchemaConversionException(String message, Throwable cause) { 29 | super(message, cause); 30 | } 31 | 32 | public ScroogeSchemaConversionException(String message) { 33 | super(message); 34 | } 35 | } 36 | 37 | -------------------------------------------------------------------------------- /tutorial/MatrixTutorial3.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.examples 2 | 3 | import com.twitter.scalding._ 4 | import com.twitter.scalding.mathematics.Matrix 5 | 6 | 7 | /* 8 | * MatrixTutorial3.scala 9 | * 10 | * Loads two directed graph adjacency matrices where a[i,j] = 1 if there is an edge from a[i] to b[j] 11 | * and computes the intersection and the differences between the two 12 | * 13 | * ../scripts/scald.rb --local MatrixTutorial3.scala --input1 data/graph.tsv --input2 data/graph2.tsv --intersection data/intersection.tsv --leftDiff data/leftDiff.tsv --rightDiff data/rightDiff.tsv 14 | * 15 | */ 16 | 17 | 18 | class ComputeMatrixIntersectionJob(args : Args) extends Job(args) { 19 | 20 | import Matrix._ 21 | 22 | val adjacencyMatrix1 = Tsv( args("input1"), ('user1, 'user2, 'rel) ) 23 | .read 24 | .toMatrix[Long,Long,Double]('user1, 'user2, 'rel) 25 | 26 | val adjacencyMatrix2 = Tsv( args("input2"), ('user1, 'user2, 'rel) ) 27 | .read 28 | .toMatrix[Long,Long,Double]('user1, 'user2, 'rel) 29 | 30 | // zip creates a pair element out of corresponding elements in the two matrices 31 | val intersection = adjacencyMatrix1 32 | .zip(adjacencyMatrix2) 33 | .mapValues( pair => if (pair._1 > 0 && pair._2 > 0) 1.0 else 0.0 ) 34 | .write(Tsv(args("intersection"))) 35 | (adjacencyMatrix1 - intersection).write(Tsv(args("leftDiff"))) 36 | (adjacencyMatrix2 - intersection).write(Tsv(args("rightDiff"))) 37 | 38 | } 39 | 40 | -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/Dsl.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.twitter.scalding 17 | 18 | import cascading.pipe.Pipe 19 | import cascading.flow.FlowDef 20 | 21 | /** 22 | * This object has all the implicit functions and values that are used to make the scalding DSL, which 23 | * includes the functions for automatically creating cascading.tuple.Fields objects from scala tuples of 24 | * Strings, Symbols or Ints, as well as the cascading.pipe.Pipe enrichment to RichPipe which adds the 25 | * scala.collections-like API to Pipe.
26 | * 27 | * It's useful to import Dsl._ when you are writing scalding code outside of a Job. 28 | */ 29 | object Dsl extends FieldConversions with java.io.Serializable { 30 | implicit def pipeToRichPipe(pipe: Pipe): RichPipe = new RichPipe(pipe) 31 | 32 | /** 33 | * Enrichment on FlowDef 34 | */ 35 | implicit def flowDefToRichFlowDef(fd: FlowDef): RichFlowDef = new RichFlowDef(fd) 36 | 37 | } 38 | -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/ReducerEstimatorConfig.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.reducer_estimation 2 | 3 | import org.apache.hadoop.mapred.JobConf 4 | 5 | object ReducerEstimatorConfig { 6 | 7 | /** Output param: what the Reducer Estimator recommended, regardless of if it was used. */ 8 | val estimatedNumReducers = "scalding.reducer.estimator.result" 9 | 10 | /** 11 | * Output param: same as estimatedNumReducers but with the cap specified by maxEstimatedReducersKey applied. 12 | * Can be used to determine whether a cap was applied to the estimated number of reducers and potentially to 13 | * trigger alerting / logging. 14 | */ 15 | val cappedEstimatedNumReducersKey = "scalding.reducer.estimator.result.capped" 16 | 17 | /** Output param: what the original job config was. */ 18 | val originalNumReducers = "scalding.reducer.estimator.original.mapred.reduce.tasks" 19 | 20 | /** 21 | * If we estimate more than this number of reducers, we will use this number instead of the estimated value 22 | */ 23 | val maxEstimatedReducersKey = "scalding.reducer.estimator.max.estimated.reducers" 24 | 25 | /* fairly arbitrary choice here -- you will probably want to configure this in your cluster defaults */ 26 | val defaultMaxEstimatedReducers = 5000 27 | 28 | /** Maximum number of history items to use for reducer estimation. */ 29 | val maxHistoryKey = "scalding.reducer.estimator.max.history" 30 | 31 | def getMaxHistory(conf: JobConf): Int = conf.getInt(maxHistoryKey, 1) 32 | } 33 | -------------------------------------------------------------------------------- /scalding-db/src/main/scala/com/twitter/scalding/db/ColumnDefinition.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2015 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | 17 | package com.twitter.scalding.db 18 | 19 | import com.twitter.scalding.TupleConverter 20 | 21 | case class ColumnName(toStr: String) extends AnyVal 22 | case class SqlTypeName(toStr: String) extends AnyVal 23 | 24 | case class ColumnDefinition( 25 | jdbcType: SqlType, 26 | name: ColumnName, 27 | nullable: IsNullable, 28 | sizeOpt: Option[Int], 29 | defaultValue: Option[String] 30 | ) extends Serializable 31 | 32 | trait ColumnDefinitionProvider[T] extends Serializable { 33 | def columns: Iterable[ColumnDefinition] 34 | def resultSetExtractor: ResultSetExtractor[T] 35 | } 36 | 37 | class JdbcValidationException(msg: String) extends RuntimeException(msg) 38 | 39 | trait ResultSetExtractor[T] { 40 | def validate(rsmd: java.sql.ResultSetMetaData): scala.util.Try[Unit] 41 | def toCaseClass(rs: java.sql.ResultSet, c: TupleConverter[T]): T 42 | } 43 | -------------------------------------------------------------------------------- /scalding-parquet/README.md: -------------------------------------------------------------------------------- 1 | # Parquet support for Scalding 2 | 3 | The implementation is ported from code used internally at Twitter, written by Sam Ritchie, Ian O'Connell, Oscar Boykin and Tianshuo Deng 4 | ## Use com.twitter.scalding.parquet.thrift for reading Apache Thrift (TBase) records 5 | ## Use com.twitter.scalding.parquet.scrooge for reading Scrooge Thrift (ThriftStruct) records 6 | Located in the scalding-parquet-scrooge module 7 | ## Use com.twitter.scalding.parquet.tuple for reading Tuple records 8 | ## Use com.twitter.scalding.parquet.tuple.TypedParquet for reading or writing case classes: 9 | You can use the macros in com.twitter.scalding.parquet.tuple.macros.Macros to generate Parquet read/write support. Here's an example: 10 | ```scala 11 | import com.twitter.scalding.parquet.tuple.macros.Macros._ 12 | 13 | case class SampleClass(x: Int, y: String) 14 | 15 | class WriteToTypedParquetTupleJob(args: Args) extends Job(args) { 16 | val outputPath = args.required("output") 17 | val sink = TypedParquetSink[SampleClass](outputPath) 18 | 19 | TypedPipe.from(List(SampleClass(0, "foo"), SampleClass(1, "bar"))).write(sink) 20 | } 21 | 22 | class ReadWithFilterPredicateJob(args: Args) extends Job(args) { 23 | val fp: FilterPredicate = FilterApi.eq(binaryColumn("y"), Binary.fromString("foo")) 24 | 25 | val inputPath = args.required("input") 26 | val outputPath = args.required("output") 27 | 28 | val input = TypedParquet[SampleClass](inputPath, fp) 29 | 30 | TypedPipe.from(input).map(_.x).write(TypedTsv[Int](outputPath)) 31 | } 32 | ``` -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/macros/impl/CaseClassFieldSetter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2015 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License.
15 | */ 16 | 17 | package com.twitter.scalding.macros.impl 18 | 19 | import scala.reflect.macros.Context 20 | import scala.util.Try 21 | 22 | /** 23 | * Helper to set fields from a case class into other "container" types, e.g. a cascading Tuple or a JDBC 24 | * PreparedStatement 25 | */ 26 | trait CaseClassFieldSetter { 27 | 28 | // mark the field as absent/null 29 | def absent(c: Context)(idx: Int, container: c.TermName): c.Tree 30 | 31 | // use the default field setter (for when there is no type-specific setter) 32 | def default(c: Context)(idx: Int, container: c.TermName, fieldValue: c.Tree): c.Tree 33 | 34 | // use the field setter specific to the given field type; 35 | // returns scala.util.Failure if the container has no type-specific setter 36 | def from(c: Context)(fieldType: c.Type, idx: Int, container: c.TermName, fieldValue: c.Tree): Try[c.Tree] 37 | } 38 | -------------------------------------------------------------------------------- /scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/runtime_helpers/MacroEqualityOrderedSerialization.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2016 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License.
15 | */ 16 | package com.twitter.scalding.serialization.macros.impl.ordered_serialization.runtime_helpers 17 | 18 | import com.twitter.scalding.serialization.{EquivSerialization, OrderedSerialization} 19 | 20 | object MacroEqualityOrderedSerialization { 21 | private val seed = "MacroEqualityOrderedSerialization".hashCode 22 | } 23 | 24 | abstract class MacroEqualityOrderedSerialization[T] 25 | extends OrderedSerialization[T] 26 | with EquivSerialization[T] { 27 | def uniqueId: String 28 | override def hashCode = MacroEqualityOrderedSerialization.seed ^ uniqueId.hashCode 29 | override def equals(other: Any): Boolean = other match { 30 | case o: MacroEqualityOrderedSerialization[_] => o.uniqueId == uniqueId 31 | case _ => false 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/mathematics/Histogram.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.mathematics 2 | 3 | class Histogram(map: Map[Double, Long], binWidth: Double) { 4 | lazy val size = map.values.sum 5 | lazy val sum = map.foldLeft(0.0) { case (acc, (bin, count)) => acc + bin * count } 6 | lazy val keys = map.keys.toList.sorted 7 | 8 | lazy val min = keys.head 9 | lazy val max = keys.last 10 | 11 | lazy val stdDev = { 12 | val squaredDiff = map.foldLeft(0.0) { case (acc, (bin, count)) => 13 | acc + count * math.pow(bin - mean, 2.0) 14 | } 15 | math.sqrt(squaredDiff / size) 16 | } 17 | 18 | lazy val cdf = { 19 | var cumulative = 0L 20 | var result = Map[Double, Double]() 21 | keys.foreach { bin => 22 | cumulative += map(bin) 23 | result += (bin -> (cumulative.toDouble / size)) 24 | } 25 | result 26 | } 27 | 28 | lazy val lorenz = { 29 | var cumulativeUnique = 0.0 30 | var cumulativeTotal = 0.0 31 | var result = Map[Double, Double]() 32 | keys.foreach { bin => 33 | cumulativeUnique += map(bin) 34 | cumulativeTotal += bin * map(bin) 35 | result += (cumulativeUnique / size -> cumulativeTotal / sum) 36 | } 37 | result 38 | } 39 | 40 | def percentile(p: Int) = keys.find(bin => cdf(bin) * 100 >= p).getOrElse(-1d) 41 | 42 | lazy val median = percentile(50) 43 | lazy val q1 = percentile(25) 44 | lazy val q3 = percentile(75) 45 | 46 | def mean = sum / size 47 | def innerQuartileRange = q3 - q1 48 | def coefficientOfDispersion = innerQuartileRange / (q3 + q1) 49 | } 50 | -------------------------------------------------------------------------------- /tutorial/MatrixTutorial6.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.examples 2 | 3 | import com.twitter.scalding._ 4 | import com.twitter.scalding.mathematics.Matrix 5 | 6 | /* 7 | * MatrixTutorial6.scala 8 | * 9 | * Loads a document to word matrix where a[i,j] = freq of the word j in the document i 10 | * computes the Tf-Idf score of each word w.r.t. 
each document and keeps the top nrWords in each document 11 | * (see http://en.wikipedia.org/wiki/Tf*idf for more info) 12 | * 13 | * ../scripts/scald.rb --local MatrixTutorial6.scala --input data/docBOW.tsv --nrWords 300 --output data/featSelectedMatrix.tsv 14 | * 15 | */ 16 | 17 | class TfIdfJob(args : Args) extends Job(args) { 18 | 19 | import Matrix._ 20 | 21 | val docWordMatrix = Tsv( args("input"), ('doc, 'word, 'count) ) 22 | .read 23 | .toMatrix[Long,String,Double]('doc, 'word, 'count) 24 | 25 | // compute the overall document frequency of each word 26 | val docFreq = docWordMatrix.binarizeAs[Double].sumRowVectors 27 | 28 | // compute the inverse document frequency vector 29 | val invDocFreqVct = docFreq.toMatrix(1).rowL1Normalize.mapValues( x => log2(1/x) ) 30 | 31 | // zip the row vector along the entire document - word matrix 32 | val invDocFreqMat = docWordMatrix.zip(invDocFreqVct.getRow(1)).mapValues( pair => pair._2 ) 33 | 34 | // multiply the term frequency with the inverse document frequency and keep the top nrWords 35 | docWordMatrix.hProd(invDocFreqMat).topRowElems( args("nrWords").toInt ).write(Tsv( args("output") )) 36 | 37 | def log2(x : Double) = scala.math.log(x)/scala.math.log(2.0) 38 | 39 | } 40 | 41 | -------------------------------------------------------------------------------- /scalding-core/src/test/scala/com/twitter/scalding/TimePathedSourceTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2016 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License.
15 | */ 16 | package com.twitter.scalding 17 | 18 | import java.util.TimeZone 19 | 20 | import org.scalatest.{Matchers, WordSpec} 21 | 22 | class TimePathedSourceTest extends WordSpec with Matchers { 23 | "TimePathedSource.hdfsWritePath" should { 24 | val dateRange = DateRange(RichDate(0L), RichDate(0L)) 25 | val utcTZ = DateOps.UTC 26 | 27 | "crib if path == /*" in { 28 | intercept[AssertionError](TestTimePathedSource("/*", dateRange, utcTZ).hdfsWritePath) 29 | } 30 | 31 | "crib if path doesn't end with /*" in { 32 | intercept[AssertionError](TestTimePathedSource("/my/invalid/path", dateRange, utcTZ).hdfsWritePath) 33 | } 34 | 35 | "work for path ending with /*" in { 36 | TestTimePathedSource("/my/path/*", dateRange, utcTZ).hdfsWritePath.startsWith("/my/path") shouldBe true 37 | } 38 | } 39 | } 40 | 41 | case class TestTimePathedSource(p: String, dr: DateRange, t: TimeZone) extends TimePathedSource(p, dr, t) 42 | -------------------------------------------------------------------------------- /tutorial/MatrixTutorial5.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.examples 2 | 3 | import com.twitter.scalding._ 4 | import com.twitter.scalding.mathematics.Matrix 5 | 6 | 7 | /* 8 | * MatrixTutorial5.scala 9 | * 10 | * Loads a directed graph adjacency matrix where a[i,j] = 1 if there is an edge from a[i] to b[j] 11 | * and computes the Jaccard similarity between every pair of row vectors 12 | * 13 | * ../scripts/scald.rb --local MatrixTutorial5.scala --input data/graph.tsv --output data/jaccardSim.tsv 14 | * 15 | */ 16 | 17 | class ComputeJaccardJob(args : Args) extends Job(args) { 18 | 19 | import Matrix._ 20 | 21 | val adjacencyMatrix = Tsv( args("input"), ('user1, 'user2, 'rel) ) 22 | .read 23 | .toMatrix[Long,Long,Double]('user1, 'user2, 'rel) 24 | 25 | val aBinary = adjacencyMatrix.binarizeAs[Double] 26 | 27 | // intersectMat holds the size of the intersection row(a)_i ∩ row(b)_j 28 | val intersectMat = aBinary * aBinary.transpose 29 | val aSumVct = aBinary.sumColVectors 30 | val bSumVct = aBinary.sumRowVectors 31 | 32 | // Using zip to repeat the row and column vector values on the right hand side 33 | // for all non-zeroes in the left hand matrix 34 | val xMat = intersectMat.zip(aSumVct).mapValues( pair => pair._2 ) 35 | val yMat = intersectMat.zip(bSumVct).mapValues( pair => pair._2 ) 36 | 37 | val unionMat = xMat + yMat - intersectMat 38 | // We are guaranteed to have Doubles both in the intersection and in the union matrix 39 | intersectMat.zip(unionMat) 40 | .mapValues( pair => pair._1 / pair._2 ) 41 | .write(Tsv( args("output") )) 42 | 43 | } 44 | 45 | -------------------------------------------------------------------------------- /scalding-core/src/test/scala/com/twitter/scalding/ExecutionUtilTest.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding 2 | 3 | import org.scalatest.{Matchers, WordSpec} 4 | 5 | class ExecutionUtilTest extends WordSpec with Matchers { 6 | import ExecutionUtil._ 7 | 8 | implicit val tz: java.util.TimeZone = DateOps.UTC 9 | implicit val dp: DateParser = DateParser.default 10 | implicit val dateRange: DateRange = DateRange.parse("2015-01-01", "2015-01-10") 11 | 12 | def run[T](e: Execution[T]) = { 13 | val mode = Local(true) 14 | e.waitFor(Config.defaultFrom(mode), mode) 15 | } 16 | 17 | def testJob(dr: DateRange) = { 18 | assert(dr != null) 19 | TypedPipe 20 | .from[Int](Seq(1, 2, 3)) 21 | .toIterableExecution 22 | .map(_.head) 23 | 
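// the pipe above always contains Seq(1, 2, 3), so each date's Execution yields its head, 1 (the assertions below rely on this)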
} 24 | 25 | def testJobFailure(dr: DateRange) = 26 | throw new Exception(s"failed: $dr") 27 | 28 | "ExecutionUtil" should { 29 | "run multiple jobs" in { 30 | val days = dateRange.each(Days(1)).toSeq 31 | val result = runDatesWithParallelism(Days(1))(testJob) 32 | assert(run(result).get == days.map(d => (d, 1))) 33 | } 34 | 35 | "run multiple jobs with executions" in { 36 | val days = dateRange.each(Days(1)).toSeq 37 | val result = runDateRangeWithParallelism(Days(1))(testJob) 38 | assert(run(result).get == days.map(d => 1)) 39 | } 40 | 41 | "run multiple jobs with executions and sum results" in { 42 | val days = dateRange.each(Days(1)).toSeq 43 | val result = runDateRangeWithParallelismSum(Days(1))(testJob) 44 | assert(run(result).get == days.map(d => 1).sum) 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /scalding-hraven/src/main/scala/com/twitter/scalding/hraven/reducer_estimation/HRavenBasedEstimator.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.hraven.reducer_estimation 2 | 3 | import com.twitter.hraven.{CounterMap, TaskDetails} 4 | import com.twitter.scalding.estimation.Task 5 | import com.twitter.scalding.hraven.estimation.HRavenHistoryService 6 | import com.twitter.scalding.reducer_estimation.{RatioBasedEstimator, RuntimeReducerEstimator} 7 | 8 | trait HRavenReducerHistoryService extends HRavenHistoryService { 9 | override protected val counterFields: List[String] = List() 10 | override protected val detailFields: List[String] = List(Task.TaskType, "status", "startTime", "finishTime") 11 | 12 | override protected def counters(taskCounters: CounterMap): Option[Map[String, Long]] = Some(Map.empty) 13 | 14 | override protected def details(taskDetails: TaskDetails): Option[Map[String, Any]] = 15 | if (taskDetails.getType.nonEmpty) { 16 | Some( 17 | Map( 18 | Task.TaskType -> taskDetails.getType, 19 | "status" -> taskDetails.getStatus, 20 | "startTime" -> taskDetails.getStartTime, 21 | "finishTime" -> taskDetails.getFinishTime 22 | ) 23 | ) 24 | } else { 25 | None 26 | } 27 | } 28 | 29 | object HRavenReducerHistoryService extends HRavenReducerHistoryService 30 | 31 | class HRavenRatioBasedEstimator extends RatioBasedEstimator { 32 | override val historyService = HRavenReducerHistoryService 33 | } 34 | 35 | class HRavenRuntimeBasedEstimator extends RuntimeReducerEstimator { 36 | override val historyService = HRavenReducerHistoryService 37 | } 38 | -------------------------------------------------------------------------------- /tutorial/MatrixTutorial2.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.examples 2 | 3 | import com.twitter.scalding._ 4 | import com.twitter.scalding.mathematics.Matrix 5 | 6 | 7 | /* 8 | * MatrixTutorial2.scala 9 | * 10 | * Loads a directed graph adjacency matrix where a[i,j] = 1 if there is an edge from a[i] to b[j] 11 | * and returns a graph containing only the nodes with outdegree smaller than a given value 12 | * 13 | * ../scripts/scald.rb --local MatrixTutorial2.scala --input data/graph.tsv --maxOutdegree 1000 --output data/graphFiltered.tsv 14 | * 15 | */ 16 | 17 | 18 | class FilterOutdegreeJob(args : Args) extends Job(args) { 19 | 20 | import Matrix._ 21 | 22 | val adjacencyMatrix = Tsv( args("input"), ('user1, 'user2, 'rel) ) 23 | .read 24 | .toMatrix[Long,Long,Double]('user1, 'user2, 'rel) 25 | 26 | // Each row corresponds to the outgoing edges so 
to compute the outdegree we sum out the columns 27 | val outdegree = adjacencyMatrix.sumColVectors 28 | 29 | // We convert the column vector to a matrix object to be able to use the matrix method filterValues 30 | // we make all non-zero values into ones and then convert it back to a column vector 31 | val outdegreeFiltered = outdegree.toMatrix[Int](1) 32 | .filterValues{ _ < args("maxOutdegree").toDouble } 33 | .binarizeAs[Double].getCol(1) 34 | 35 | // We multiply on the left hand side with the diagonal matrix created from the column vector 36 | // to keep only the rows with outdegree smaller than maxOutdegree 37 | (outdegreeFiltered.diag * adjacencyMatrix).write(Tsv( args("output") ) ) 38 | 39 | } 40 | 41 | -------------------------------------------------------------------------------- /scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ParquetScroogeOutputFormat.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2012 Twitter, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.twitter.scalding.parquet.scrooge; 17 | 18 | import com.twitter.scrooge.ThriftStruct; 19 | import org.apache.hadoop.conf.Configuration; 20 | import org.apache.parquet.hadoop.ParquetOutputFormat; 21 | 22 | /** 23 | * Use this class to write Scrooge records to parquet 24 | * @param <T> Type of Scrooge records to write 25 | */ 26 | public class ParquetScroogeOutputFormat<T extends ThriftStruct> extends ParquetOutputFormat<T> { 27 | 28 | public static <U extends ThriftStruct> void setScroogeClass(Configuration configuration, Class<U> thriftClass) { 29 | ScroogeWriteSupport.setScroogeClass(configuration, thriftClass); 30 | } 31 | 32 | public static Class<? extends ThriftStruct> getScroogeClass(Configuration configuration) { 33 | return ScroogeWriteSupport.getScroogeClass(configuration); 34 | } 35 | 36 | public ParquetScroogeOutputFormat() { 37 | super(new ScroogeWriteSupport<T>()); 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /maple/src/main/java/com/twitter/maple/tap/StdoutTap.java: -------------------------------------------------------------------------------- 1 | package com.twitter.maple.tap; 2 | 3 | import cascading.flow.hadoop.HadoopFlowProcess; 4 | import cascading.scheme.hadoop.SequenceFile; 5 | import cascading.tap.hadoop.Lfs; 6 | import cascading.tuple.Fields; 7 | import cascading.tuple.TupleEntryIterator; 8 | import org.apache.hadoop.mapred.JobConf; 9 | 10 | import java.io.File; 11 | import java.io.IOException; 12 | 13 | public class StdoutTap extends Lfs { 14 | 15 | public StdoutTap() { 16 | super(new SequenceFile(Fields.ALL), getTempDir()); 17 | } 18 | 19 | public static String getTempDir() { 20 | final File temp; 21 | try { 22 | temp = File.createTempFile("temp", Long.toString(System.nanoTime())); 23 | } catch (IOException e) { 24 | throw new RuntimeException(e); 25 | } 26 | temp.deleteOnExit(); 27 | if (!(temp.delete())) { 28 | throw new RuntimeException("Could not delete temp file: " + 
temp.getAbsolutePath()); 29 | } 30 | 31 | return temp.getAbsoluteFile().getPath(); 32 | } 33 | 34 | @Override 35 | public boolean commitResource(JobConf conf) throws java.io.IOException { 36 | TupleEntryIterator it = new HadoopFlowProcess(conf).openTapForRead(this); 37 | System.out.println(""); 38 | System.out.println(""); 39 | System.out.println("RESULTS"); 40 | System.out.println("-----------------------"); 41 | while (it.hasNext()) { 42 | System.out.println(it.next().getTuple()); 43 | } 44 | System.out.println("-----------------------"); 45 | it.close(); 46 | return true; 47 | } 48 | } -------------------------------------------------------------------------------- /scalding-dagon/src/test/scala/com/twitter/scalding/dagon/HCacheTests.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.dagon 2 | 3 | import org.scalacheck.Prop._ 4 | import org.scalacheck.{Arbitrary, Cogen, Properties} 5 | 6 | abstract class HCacheTests[K[_], V[_]](name: String)(implicit 7 | ka: Arbitrary[K[Int]], 8 | kc: Cogen[K[Int]], 9 | va: Arbitrary[V[Int]] 10 | ) extends Properties(name) { 11 | 12 | def buildHMap(c: HCache[K, V], ks: Iterable[K[Int]], f: K[Int] => V[Int]): HMap[K, V] = 13 | ks.iterator.foldLeft(HMap.empty[K, V]) { (m, k) => 14 | m.updated(k, c.getOrElseUpdate(k, f(k))) 15 | } 16 | 17 | property("getOrElseUpdate") = forAll { (f: K[Int] => V[Int], k: K[Int], v1: V[Int], v2: V[Int]) => 18 | val c = HCache.empty[K, V] 19 | var count = 0 20 | val x = c.getOrElseUpdate(k, { count += 1; v1 }) 21 | val y = c.getOrElseUpdate(k, { count += 1; v2 }) 22 | x == v1 && y == v1 && count == 1 23 | } 24 | 25 | property("toHMap") = forAll { (f: K[Int] => V[Int], ks: Set[K[Int]]) => 26 | val c = HCache.empty[K, V] 27 | val m = buildHMap(c, ks, f) 28 | c.toHMap == m 29 | } 30 | 31 | property("duplicate") = forAll { (f: K[Int] => V[Int], ks: Set[K[Int]]) => 32 | val c = HCache.empty[K, V] 33 | val d = c.duplicate 34 | buildHMap(c, ks, f) 35 | d.toHMap.isEmpty 36 | } 37 | 38 | property("reset works") = forAll { (f: K[Int] => V[Int], ks: Set[K[Int]]) => 39 | val c = HCache.empty[K, V] 40 | buildHMap(c, ks, f) 41 | val d = c.duplicate 42 | c.reset() 43 | c.toHMap.isEmpty && d.toHMap.size == ks.size 44 | } 45 | } 46 | 47 | object HCacheTestsLL extends HCacheTests[List, List]("HCacheTests[List, List]") 48 | -------------------------------------------------------------------------------- /scalding-core/src/test/scala/com/twitter/scalding/PathFilterTest.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding 2 | 3 | import org.scalatest.{Matchers, WordSpec} 4 | import org.apache.hadoop.fs.{Path => HadoopPath, PathFilter} 5 | 6 | class PathFilterTest extends WordSpec with Matchers { 7 | "RichPathFilter" should { 8 | import RichPathFilter.toRichPathFilter 9 | val p = new HadoopPath("/nowhere") 10 | 11 | "compose ands" in { 12 | AlwaysTrue.and(AlwaysTrue).accept(p) shouldBe true 13 | AlwaysTrue.and(AlwaysFalse).accept(p) shouldBe false 14 | AlwaysFalse.and(AlwaysTrue).accept(p) shouldBe false 15 | AlwaysFalse.and(AlwaysFalse).accept(p) shouldBe false 16 | 17 | AlwaysTrue.and(AlwaysTrue, AlwaysTrue).accept(p) shouldBe true 18 | AlwaysTrue.and(AlwaysTrue, AlwaysFalse).accept(p) shouldBe false 19 | } 20 | 21 | "compose ors" in { 22 | AlwaysTrue.or(AlwaysTrue).accept(p) shouldBe true 23 | AlwaysTrue.or(AlwaysFalse).accept(p) shouldBe true 24 | AlwaysFalse.or(AlwaysTrue).accept(p) shouldBe true 25 | 
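// or accepts the path when either side accepts it; only the false/false case below is rejected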
AlwaysFalse.or(AlwaysFalse).accept(p) shouldBe false 26 | 27 | AlwaysFalse.or(AlwaysTrue, AlwaysTrue).accept(p) shouldBe true 28 | AlwaysTrue.or(AlwaysFalse, AlwaysFalse).accept(p) shouldBe true 29 | } 30 | 31 | "negate nots" in { 32 | AlwaysTrue.not.accept(p) shouldBe false 33 | AlwaysFalse.not.accept(p) shouldBe true 34 | AlwaysTrue.not.not.accept(p) shouldBe true 35 | } 36 | 37 | } 38 | } 39 | 40 | object AlwaysTrue extends PathFilter { 41 | override def accept(p: HadoopPath): Boolean = true 42 | } 43 | 44 | object AlwaysFalse extends PathFilter { 45 | override def accept(p: HadoopPath): Boolean = false 46 | } 47 | -------------------------------------------------------------------------------- /scalding-dagon/src/main/scala/com/twitter/scalding/dagon/Rule.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.dagon 2 | 3 | import java.io.Serializable 4 | 5 | /** 6 | * This implements a simplification rule on Dags 7 | */ 8 | trait Rule[N[_]] extends Serializable { self => 9 | 10 | /** 11 | * If the given Id can be replaced with a simpler expression, return Some(expr) else None. 12 | * 13 | * If it is convenient, you might write a partial function and then call .lift to get the correct Function 14 | * type 15 | */ 16 | def apply[T](on: Dag[N]): N[T] => Option[N[T]] 17 | 18 | /** 19 | * If the current rule cannot apply, then try the argument here. Note, this applies in series at a given 20 | * node, not on the whole Dag after the first rule has run. For that, see Dag.applySeq. 21 | */ 22 | def orElse(that: Rule[N]): Rule[N] = 23 | new Rule[N] { 24 | def apply[T](on: Dag[N]) = { n => 25 | self.apply(on)(n) match { 26 | case Some(n1) if n1 == n => 27 | // If the rule emits the same as input fall through 28 | that.apply(on)(n) 29 | case None => 30 | that.apply(on)(n) 31 | case s @ Some(_) => s 32 | } 33 | } 34 | 35 | override def toString: String = 36 | s"$self.orElse($that)" 37 | } 38 | } 39 | 40 | object Rule { 41 | 42 | /** 43 | * A Rule that never applies 44 | */ 45 | def empty[N[_]]: Rule[N] = 46 | new Rule[N] { 47 | def apply[T](on: Dag[N]) = { _ => None } 48 | } 49 | 50 | /** 51 | * Build a new Rule out of several using orElse to compose 52 | */ 53 | def orElse[N[_]](it: Iterable[Rule[N]]): Rule[N] = 54 | it.reduceOption(_ orElse _).getOrElse(empty) 55 | } 56 | -------------------------------------------------------------------------------- /scripts/testValidator.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -exv 2 | 3 | # Identify the bin dir in the distribution, and source the common include script 4 | BASE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/.. 
&& pwd )" 5 | cd $BASE_DIR 6 | 7 | 8 | # Which things do we want to build: anything that starts with scalding- or maple is a build target 9 | # This will produce a long string of targets separated by spaces 10 | TARGET_NAMES=$(ls -d scalding-* maple) 11 | 12 | # Cat the travis build file, ignoring the assembly lines 13 | # Reformatting any quotes to new lines so things get split up more nicely 14 | # Then grep this for the scalding- and maple targets from above 15 | BUILDS_WE_HAVE=$(cat .travis.yml | grep -v scripts/build_assembly_no_test | tr '"' '\n' | tr ' ' '\n' | grep -e scalding- -e maple ) 16 | 17 | # Grab the blacklist, dropping comment lines (those starting with #) 18 | BLACKLIST_BUILDS=$(cat .travis.blacklist | egrep -v '^\s*#') 19 | 20 | 21 | TEST_ID=$(date '+%s') 22 | GOAL_PATH="/tmp/scalding_goal.$TEST_ID.txt" 23 | HAVE_PATH="/tmp/scalding_gHAVE.$TEST_ID.txt" 24 | # Ideally we want to have each target twice, once each for Scala 2.10 and 2.11 25 | # So echo them twice, counting their frequency into the goal path 26 | echo $TARGET_NAMES $TARGET_NAMES | tr ' ' '\n' | sort | uniq -c > $GOAL_PATH 27 | 28 | # Now we take the builds we have, appending the 29 | # blacklist builds 30 | echo $BUILDS_WE_HAVE $BLACKLIST_BUILDS | tr ' ' '\n' | sort | uniq -c > $HAVE_PATH 31 | 32 | # Once we've done this both lists should be identical 33 | DIFF=$(diff $GOAL_PATH $HAVE_PATH) 34 | RET=$? 35 | rm -f $GOAL_PATH 36 | rm -f $HAVE_PATH 37 | 38 | if [ $RET -eq 0 ]; then 39 | echo "All builds running" 40 | exit 0 41 | else 42 | echo -e "Missing some builds, diff $DIFF" 43 | exit 1 44 | fi -------------------------------------------------------------------------------- /scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/DBTypeDescriptorImpl.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.db.macros.impl 2 | 3 | import scala.reflect.macros.Context 4 | 5 | import com.twitter.bijection.macros.impl.IsCaseClassImpl 6 | import com.twitter.scalding.macros.impl.{FieldsProviderImpl, TupleConverterImpl, TupleSetterImpl} 7 | import com.twitter.scalding.db.DBTypeDescriptor 8 | 9 | object DBTypeDescriptorImpl { 10 | 11 | def apply[T](c: Context)(implicit T: c.WeakTypeTag[T]): c.Expr[DBTypeDescriptor[T]] = { 12 | import c.universe._ 13 | 14 | if (!IsCaseClassImpl.isCaseClassType(c)(T.tpe)) 15 | c.abort( 16 | c.enclosingPosition, 17 | s"""We cannot enforce ${T.tpe} is a case class: either it is not a case class, or this macro call is possibly enclosed in a class. 
18 | This will mean the macro is operating on a non-resolved type.""" 19 | ) 20 | 21 | val columnDefn = ColumnDefinitionProviderImpl[T](c) 22 | val converter = TupleConverterImpl.caseClassTupleConverterWithUnknownImpl[T](c) 23 | val setter = TupleSetterImpl.caseClassTupleSetterWithUnknownImpl[T](c) 24 | val jdbcSetter = JdbcStatementSetterImpl.caseClassJdbcSetterCommonImpl[T](c, true) 25 | val fields = FieldsProviderImpl.toFieldsWithUnknownNoPrefixImpl[T](c) 26 | 27 | val res = q""" 28 | new _root_.com.twitter.scalding.db.DBTypeDescriptor[$T] with _root_.com.twitter.bijection.macros.MacroGenerated { 29 | override val columnDefn = $columnDefn 30 | override val converter = $converter 31 | override val setter = $setter 32 | override val fields = $fields 33 | override val jdbcSetter = $jdbcSetter 34 | } 35 | """ 36 | c.Expr[DBTypeDescriptor[T]](res) 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/serialization/RequiredBinaryComparators.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.serialization 2 | 3 | import com.twitter.scalding._ 4 | 5 | import scala.language.experimental.{macros => smacros} 6 | 7 | /** 8 | * RequiredBinaryComparators provide comparators (or Ordering in Scala) that are capable of comparing keys in 9 | * their serialized form reducing the amount of time spent in serialization/deserialization. These comparators 10 | * are implemented using Scala macros, and currently provide binary comparators for primitives, strings, 11 | * Options, tuples, collections, case classes and Scrooge objects. 12 | */ 13 | trait RequiredBinaryComparators extends RequiredBinaryComparatorsConfig { 14 | 15 | implicit def ordSer[T]: OrderedSerialization[T] = 16 | macro com.twitter.scalding.serialization.macros.impl.OrderedSerializationProviderImpl[T] 17 | 18 | } 19 | 20 | object RequiredBinaryComparators { 21 | 22 | implicit def orderedSerialization[T]: OrderedSerialization[T] = 23 | macro com.twitter.scalding.serialization.macros.impl.OrderedSerializationProviderImpl[T] 24 | } 25 | 26 | /** 27 | * Use this for an ExecutionApp. 
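* Mixing it in brings the macro-generated OrderedSerialization into implicit scope, and config() sets RequireOrderedSerializationMode.Fail so the job fails fast rather than silently falling back to a slower, non-binary comparator.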
28 | */ 29 | trait RequiredBinaryComparatorsExecutionApp extends ExecutionApp { 30 | implicit def ordSer[T]: OrderedSerialization[T] = 31 | macro com.twitter.scalding.serialization.macros.impl.OrderedSerializationProviderImpl[T] 32 | def requireOrderedSerializationMode: RequireOrderedSerializationMode = RequireOrderedSerializationMode.Fail 33 | override def config(inputArgs: Array[String]): (Config, Mode) = { 34 | val (conf, m) = super.config(inputArgs) 35 | (conf.setRequireOrderedSerializationMode(Some(requireOrderedSerializationMode)), m) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /scalding-base/src/test/scala/com/twitter/scalding/typed/TypedPipeMonoidTest.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding 2 | package typed 3 | 4 | import memory_backend.MemoryMode 5 | import com.twitter.algebird.Monoid.{plus, sum, zero} 6 | import org.scalatest.FunSuite 7 | import org.scalatest.prop.PropertyChecks 8 | 9 | class TypedPipeMonoidTest extends FunSuite with PropertyChecks { 10 | 11 | def run[A](t: TypedPipe[A]): List[A] = 12 | t.toIterableExecution.map(_.toList).waitFor(Config.empty, MemoryMode.empty).get 13 | 14 | def sortedEq[A: Ordering](a: List[A], b: List[A]): Boolean = 15 | a.sorted == b.sorted 16 | 17 | def eqvPipe[A: Ordering](a: TypedPipe[A], b: TypedPipe[A]): Boolean = 18 | sortedEq(run(a), run(b)) 19 | 20 | test("typedPipeMonoid.zero should be equal to TypePipe.empty") { 21 | assert(zero[TypedPipe[Int]] == TypedPipe.empty) 22 | } 23 | 24 | test("monoid is associative") { 25 | forAll { (a: List[Int], b: List[Int], c: List[Int]) => 26 | val left = plus(plus(TypedPipe.from(a), TypedPipe.from(b)), TypedPipe.from(c)) 27 | val right = plus(TypedPipe.from(a), plus(TypedPipe.from(b), TypedPipe.from(c))) 28 | assert(eqvPipe(left, right)) 29 | } 30 | } 31 | 32 | test("monoid is commutative") { 33 | forAll { (a: List[Int], b: List[Int]) => 34 | val left = plus(TypedPipe.from(a), TypedPipe.from(b)) 35 | val right = plus(TypedPipe.from(b), TypedPipe.from(a)) 36 | assert(eqvPipe(left, right)) 37 | } 38 | } 39 | 40 | test("monoid sum is equivalent to a union") { 41 | forAll { (as: List[List[Int]]) => 42 | val pipes = as.map(TypedPipe.from(_)) 43 | val bigPipe = TypedPipe.from(as.flatten) 44 | assert(eqvPipe(sum(pipes), bigPipe)) 45 | } 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /scalding-serialization/src/main/scala/com/twitter/scalding/serialization/macros/impl/ordered_serialization/runtime_helpers/LengthCalculations.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2014 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.twitter.scalding.serialization.macros.impl.ordered_serialization.runtime_helpers 17 | 18 | /** 19 | * There is a Monoid on MaybeLength, with ConstLen(0) being the zero. 
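* Concretely, mirroring the cases below: ConstLen(a) + ConstLen(b) == ConstLen(a + b), combining a ConstLen with a DynamicLen yields a DynamicLen (e.g. ConstLen(4) + DynamicLen(8) == DynamicLen(12)), and NoLengthCalculation absorbs anything it is combined with.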
20 | */ 21 | sealed trait MaybeLength { 22 | def +(that: MaybeLength): MaybeLength 23 | } 24 | 25 | case object NoLengthCalculation extends MaybeLength { 26 | def +(that: MaybeLength): MaybeLength = this 27 | } 28 | final case class ConstLen(toInt: Int) extends MaybeLength { 29 | def +(that: MaybeLength): MaybeLength = that match { 30 | case ConstLen(c) => ConstLen(toInt + c) 31 | case DynamicLen(d) => DynamicLen(toInt + d) 32 | case NoLengthCalculation => NoLengthCalculation 33 | } 34 | } 35 | final case class DynamicLen(toInt: Int) extends MaybeLength { 36 | def +(that: MaybeLength): MaybeLength = that match { 37 | case ConstLen(c) => DynamicLen(toInt + c) 38 | case DynamicLen(d) => DynamicLen(toInt + d) 39 | case NoLengthCalculation => NoLengthCalculation 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/JdbcStatementSetterImpl.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2015 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.twitter.scalding.db.macros.impl 17 | 18 | import scala.reflect.macros.Context 19 | 20 | import com.twitter.scalding.macros.impl.CaseClassBasedSetterImpl 21 | import com.twitter.scalding.db.JdbcStatementSetter 22 | 23 | /** 24 | * Generates JDBC PreparedStatement data from case class 25 | */ 26 | private[macros] object JdbcStatementSetterImpl { 27 | 28 | def caseClassJdbcSetterCommonImpl[T](c: Context, allowUnknownTypes: Boolean)(implicit 29 | T: c.WeakTypeTag[T] 30 | ): c.Expr[JdbcStatementSetter[T]] = { 31 | import c.universe._ 32 | 33 | val stmtTerm = newTermName(c.fresh("stmt")) 34 | val (_, setterTerm) = CaseClassBasedSetterImpl(c)(stmtTerm, allowUnknownTypes, JdbcFieldSetter) 35 | val res = q""" 36 | new _root_.com.twitter.scalding.db.JdbcStatementSetter[$T] with _root_.com.twitter.bijection.macros.MacroGenerated { 37 | override def apply(t: $T, $stmtTerm: _root_.java.sql.PreparedStatement) = _root_.scala.util.Try { 38 | $setterTerm 39 | $stmtTerm 40 | } 41 | } 42 | """ 43 | c.Expr[JdbcStatementSetter[T]](res) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LzoGenericSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2015 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package com.twitter.scalding.commons.source 18 | 19 | import com.twitter.elephantbird.mapreduce.io.BinaryConverter 20 | import com.twitter.scalding._ 21 | 22 | import cascading.scheme.Scheme 23 | 24 | /** 25 | * Generic source with an underlying GenericScheme that uses the supplied BinaryConverter. 26 | */ 27 | abstract class LzoGenericSource[T] 28 | extends FileSource 29 | with SingleMappable[T] 30 | with TypedSink[T] 31 | with LocalTapSource { 32 | def clazz: Class[T] 33 | def conv: BinaryConverter[T] 34 | override def setter[U <: T] = TupleSetter.asSubSetter[T, U](TupleSetter.singleSetter[T]) 35 | override def hdfsScheme = HadoopSchemeInstance( 36 | LzoGenericScheme[T](conv, clazz).asInstanceOf[Scheme[_, _, _, _, _]] 37 | ) 38 | } 39 | 40 | object LzoGenericSource { 41 | def apply[T](passedConv: BinaryConverter[T], passedClass: Class[T], paths: String*) = 42 | new LzoGenericSource[T] { 43 | override val conv: BinaryConverter[T] = passedConv 44 | override val clazz = passedClass 45 | override val hdfsPaths = paths 46 | override val localPaths = paths 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /tutorial/Tutorial1.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | import com.twitter.scalding._ 17 | 18 | /** 19 | Scalding tutorial part 1. 20 | 21 | In part 0, we made a copy of hello.txt, but it wasn't a perfect copy: 22 | it was annotated with line numbers. 23 | 24 | That's because the data stream coming out of a TextLine source actually 25 | has two fields: one, called "line", has the actual line of text. The other, 26 | called "num", has the line number in the file. When you write these 27 | tuples to a TextLine, it naively outputs them both on each line. 28 | 29 | We can ask scalding to select just the "line" field from the pipe, using the 30 | project() method. When we refer to a data stream's fields, we use Scala symbols, 31 | like this: 'line. 32 | 33 | To run this job: 34 | scripts/scald.rb --local tutorial/Tutorial1.scala 35 | 36 | Check the output: 37 | cat tutorial/data/output1.txt 38 | 39 | **/ 40 | 41 | class Tutorial1(args : Args) extends Job(args) { 42 | 43 | val input = TextLine("tutorial/data/hello.txt") 44 | val output = TextLine("tutorial/data/output1.txt") 45 | 46 | /** 47 | We generally write each step of the pipeline on a separate line. 48 | **/ 49 | input 50 | .read 51 | .project('line) 52 | .write(output) 53 | } 54 | -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/TupleArity.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 
3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package com.twitter.scalding 18 | 19 | import cascading.tuple.Fields 20 | 21 | /** 22 | * Mixed in to both TupleConverter and TupleSetter to improve arity safety of cascading jobs before we run 23 | * anything on Hadoop. 24 | */ 25 | trait TupleArity { 26 | 27 | /** 28 | * Return the arity of product types; should probably only be used implicitly. The use case here is to see 29 | * how many fake field names we need in Cascading to hold an intermediate value for mapReduceMap 30 | */ 31 | def arity: Int 32 | 33 | /** 34 | * assert that the arity of this setter matches the fields given. if arity == -1, we can't check, and if 35 | * Fields is not a definite size, (such as Fields.ALL), we also cannot check, so this should only be 36 | * considered a weak check. 37 | */ 38 | def assertArityMatches(f: Fields): Unit = 39 | // Fields.size == 0 for the indefinite Fields: ALL, GROUP, VALUES, UNKNOWN, etc.. 40 | if (f.size > 0 && arity >= 0) { 41 | assert( 42 | arity == f.size, 43 | "Arity of (" + super.getClass + ") is " 44 | + arity + ", which doesn't match: (" + f.toString + ")" 45 | ) 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /scalding-core/src/main/java/com/twitter/scalding/tap/GlobHfs.java: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.tap; 2 | 3 | import java.io.FileNotFoundException; 4 | import java.io.IOException; 5 | 6 | import org.apache.hadoop.fs.FileStatus; 7 | import org.apache.hadoop.fs.FileSystem; 8 | import org.apache.hadoop.fs.Path; 9 | import org.apache.hadoop.mapred.JobConf; 10 | import org.apache.hadoop.mapred.OutputCollector; 11 | import org.apache.hadoop.mapred.RecordReader; 12 | 13 | import cascading.scheme.Scheme; 14 | 15 | /** 16 | * The default implementation of getSize in {@link cascading.tap.hadoop.Hfs} doesn't respect paths with glob patterns; 17 | * it will throw an IOException where we actually can calculate the size of the source. 18 | */ 19 | public class GlobHfs extends ScaldingHfs { 20 | public GlobHfs(Scheme scheme) { 21 | super(scheme); 22 | } 23 | 24 | public GlobHfs(Scheme scheme, String stringPath) { 25 | super(scheme, stringPath); 26 | } 27 | 28 | @Override 29 | public long getSize(JobConf conf) throws IOException { 30 | return getSize(getPath(), conf); 31 | } 32 | 33 | /** 34 | * Get the total size of the file(s) specified by the Hfs, which may contain a glob 35 | * pattern in its path, so we must be ready to handle that case. 
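* For example, with a glob like /logs/2013/* (an illustrative path), this sums the content-summary lengths of every matching file, and throws FileNotFoundException when the glob matches nothing (globStatus returns null).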
36 | */ 37 | public static long getSize(Path path, JobConf conf) throws IOException { 38 | FileSystem fs = path.getFileSystem(conf); 39 | FileStatus[] statuses = fs.globStatus(path); 40 | 41 | if (statuses == null) { 42 | throw new FileNotFoundException(String.format("File not found: %s", path)); 43 | } 44 | 45 | long size = 0; 46 | for (FileStatus status : statuses) { 47 | size += fs.getContentSummary(status.getPath()).getLength(); 48 | } 49 | return size; 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /scalding-args/src/test/scala/com/twitter/scalding/RangedArgsSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package com.twitter.scalding 18 | 19 | import org.scalatest.WordSpec 20 | 21 | class RangeSpecs extends WordSpec { 22 | "A Range" should { 23 | val testRange = Range(4, 5) 24 | 25 | "contain its endpoints" in { 26 | assert(testRange.lower === 4) 27 | assert(testRange.upper === 5) 28 | } 29 | 30 | "throw errors for misordered ranges" in { 31 | Range(4, 4) 32 | intercept[AssertionError](Range(5, 4)) 33 | } 34 | 35 | "assert lower bounds" in { 36 | testRange.assertLowerBound(3) 37 | testRange.assertLowerBound(4) 38 | intercept[AssertionError](testRange.assertLowerBound(5)) 39 | } 40 | 41 | "assert upper bounds" in { 42 | testRange.assertUpperBound(6) 43 | testRange.assertUpperBound(5) 44 | intercept[AssertionError](testRange.assertUpperBound(4)) 45 | } 46 | 47 | "print nicely with mkString" should { 48 | "for trivial ranges" in { 49 | assert(Range(4, 4).mkString("_") === "4") 50 | } 51 | "for proper ranges" in { 52 | assert(testRange.mkString("_") === "4_5") 53 | assert(testRange.mkString("-") === "4-5") 54 | } 55 | } 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/typed/BijectedSourceSink.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | */ 16 | package com.twitter.scalding.typed 17 | 18 | import cascading.flow.FlowDef 19 | import cascading.pipe.Pipe 20 | 21 | import com.twitter.bijection.ImplicitBijection 22 | import com.twitter.scalding._ 23 | import serialization.Externalizer 24 | 25 | object BijectedSourceSink { 26 | type SourceSink[T] = TypedSource[T] with TypedSink[T] 27 | def apply[T, U](parent: SourceSink[T])(implicit 28 | transformer: ImplicitBijection[T, U] 29 | ): BijectedSourceSink[T, U] = 30 | new BijectedSourceSink(parent)(transformer) 31 | } 32 | 33 | class BijectedSourceSink[T, U](parent: BijectedSourceSink.SourceSink[T])(implicit 34 | @transient transformer: ImplicitBijection[T, U] 35 | ) extends TypedSource[U] 36 | with TypedSink[U] { 37 | 38 | val lockedBij = Externalizer(transformer) 39 | 40 | def setter[V <: U] = parent.setter.contraMap(lockedBij.get.invert(_)) 41 | 42 | override def converter[W >: U] = parent.converter.andThen { t: T => lockedBij.get(t) }: TupleConverter[W] 43 | 44 | override def read(implicit flowDef: FlowDef, mode: Mode): Pipe = parent.read 45 | override def writeFrom(pipe: Pipe)(implicit flowDef: FlowDef, mode: Mode) = parent.writeFrom(pipe) 46 | } 47 | -------------------------------------------------------------------------------- /scalding-base/src/test/scala/com/twitter/scalding/typed/CoGroupableTest.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.typed 2 | 3 | import org.scalatest.FunSuite 4 | 5 | class CoGroupableTest extends FunSuite { 6 | test("CoGroupable.atMostOneValue is consistent") { 7 | val init = TypedPipe.from(List((1, 2))) 8 | 9 | assert(CoGroupable.atMostOneValue(init.sumByKey)) 10 | assert(CoGroupable.atMostOneValue(init.group.sum)) 11 | assert(CoGroupable.atMostOneValue(init.group.mapValues(_ + 100).sum)) 12 | assert(CoGroupable.atMostOneValue(init.group.forceToReducers.mapValues(_ + 100).sum)) 13 | assert(CoGroupable.atMostOneValue(init.group.forceToReducers.mapValues(_ + 100).sum.mapValues(_ - 100))) 14 | assert(CoGroupable.atMostOneValue(init.group.forceToReducers.mapValues(_ + 100).sum.filter { 15 | case (k, v) => k > v 16 | })) 17 | assert(CoGroupable.atMostOneValue(init.group.mapValues(_ * 2).sum.join(init.group.sum))) 18 | 19 | assert(!CoGroupable.atMostOneValue(init.group)) 20 | assert(!CoGroupable.atMostOneValue(init.group.scanLeft(0)(_ + _))) 21 | assert(!CoGroupable.atMostOneValue(init.join(init.group.mapValues(_ * 2)))) 22 | assert(!CoGroupable.atMostOneValue(init.group.sum.flatMapValues(List(_)))) 23 | 24 | val sum1 = init.sumByKey 25 | 26 | assert(CoGroupable.atMostOneValue(sum1.join(sum1.join(sum1)))) 27 | assert(CoGroupable.atMostOneValue(sum1.join(sum1).join(sum1))) 28 | 29 | assert(!CoGroupable.atMostOneValue(init.join(sum1.join(sum1)))) 30 | assert(!CoGroupable.atMostOneValue(init.join(sum1).join(sum1))) 31 | assert(!CoGroupable.atMostOneValue(sum1.join(init.join(sum1)))) 32 | assert(!CoGroupable.atMostOneValue(sum1.join(init).join(sum1))) 33 | assert(!CoGroupable.atMostOneValue(sum1.join(sum1.join(init)))) 34 | assert(!CoGroupable.atMostOneValue(sum1.join(sum1).join(init))) 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /scalding-db/src/main/scala/com/twitter/scalding/db/macros/DBMacro.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.db.macros 2 | 3 | import scala.language.experimental.macros 4 | import com.twitter.scalding.db.macros.impl._ 5 
| import com.twitter.scalding.db.{ColumnDefinitionProvider, DBTypeDescriptor} 6 | 7 | // This is the sealed base trait for the Scala runtime annotations used by the JDBC macros. 8 | // The macros read these annotations on fields to make up for the extra type information 9 | // JDBC wants but that is not in the JVM types. 10 | sealed trait ScaldingDBAnnotation 11 | 12 | // This is the size in characters for a char field. 13 | // For integers it's really for display purposes 14 | @scala.annotation.meta.getter 15 | final class size(val size: Int) extends annotation.StaticAnnotation with ScaldingDBAnnotation 16 | 17 | // JDBC TEXT type, this forces the String field in question to be a text type 18 | @scala.annotation.meta.getter 19 | final class text() extends annotation.StaticAnnotation with ScaldingDBAnnotation 20 | 21 | // JDBC VARCHAR type, this forces the String field in question to be a varchar type 22 | @scala.annotation.meta.getter 23 | final class varchar() extends annotation.StaticAnnotation with ScaldingDBAnnotation 24 | 25 | // JDBC DATE type, this toggles a java.util.Date field to be JDBC Date. 26 | // Without this annotation the field defaults to DATETIME to preserve the full resolution of java.util.Date 27 | @scala.annotation.meta.getter 28 | final class date() extends annotation.StaticAnnotation with ScaldingDBAnnotation 29 | 30 | // This is the entry point to explicitly calling the JDBC macros. 31 | // Most often, however, the implicits provided in the package will be used. 32 | object DBMacro { 33 | def toColumnDefinitionProvider[T]: ColumnDefinitionProvider[T] = macro ColumnDefinitionProviderImpl[T] 34 | def toDBTypeDescriptor[T]: DBTypeDescriptor[T] = macro DBTypeDescriptorImpl[T] 35 | } 36 | -------------------------------------------------------------------------------- /scalding-serialization/src/test/scala/com/twitter/scalding/serialization/UnsignedComparisonLaws.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.serialization 2 | 3 | import org.scalacheck.Properties 4 | import org.scalacheck.Prop.forAll 5 | import org.scalacheck.Prop._ 6 | 7 | object UnsignedComparisonLaws extends Properties("UnsignedComparisonLaws") { 8 | 9 | property("UnsignedLongCompare works") = forAll { (l1: Long, l2: Long) => 10 | val cmp = UnsignedComparisons.unsignedLongCompare(l1, l2) 11 | (l1 >= 0, l2 >= 0) match { 12 | case (true, true) => cmp == java.lang.Long.compare(l1, l2) 13 | case (true, false) => cmp < 0 // negative is bigger 14 | case (false, true) => cmp > 0 15 | case (false, false) => cmp == java.lang.Long.compare(l1 & Long.MaxValue, l2 & Long.MaxValue) 16 | } 17 | } 18 | property("UnsignedIntCompare works") = forAll { (l1: Int, l2: Int) => 19 | val cmp = UnsignedComparisons.unsignedIntCompare(l1, l2) 20 | (l1 >= 0, l2 >= 0) match { 21 | case (true, true) => cmp == java.lang.Integer.compare(l1, l2) 22 | case (true, false) => cmp < 0 // negative is bigger 23 | case (false, true) => cmp > 0 24 | case (false, false) => cmp == java.lang.Integer.compare(l1 & Int.MaxValue, l2 & Int.MaxValue) 25 | } 26 | } 27 | property("UnsignedByteCompare works") = forAll { (l1: Byte, l2: Byte) => 28 | def clamp(i: Int) = if (i > 0) 1 else if (i < 0) -1 else 0 29 | val cmp = clamp(UnsignedComparisons.unsignedByteCompare(l1, l2)) 30 | (l1 >= 0, l2 >= 0) match { 31 | case (true, true) => cmp == clamp(java.lang.Byte.compare(l1, l2)) 32 | case (true, false) => cmp < 0 // negative is bigger 33 | case (false, true) => cmp > 0 34 | // Convert to positive ints 35 | 
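// masking with Byte.MaxValue maps a negative byte b to b + 128, which preserves the unsigned order, e.g. (-1: Byte) & Byte.MaxValue == 127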
case (false, false) => cmp == java.lang.Integer.compare(l1 & Byte.MaxValue, l2 & Byte.MaxValue) 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /tutorial/ReplTutorial1.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | import com.twitter.scalding._ 17 | 18 | /** 19 | Scalding Tutorial1 in REPL form. 20 | 21 | To test it, first make sure you've built the target/scalding-assembly-XXX.jar: 22 | from the base directory type: 23 | sbt assembly 24 | 25 | Now run the REPL in local mode. 26 | scripts/scald-repl.sh --local 27 | 28 | Run the Tutorial by typing the following. 29 | :load tutorial/ReplTutorial1.scala 30 | 31 | You can check the input: 32 | cat tutorial/data/hello.txt 33 | 34 | And the output: 35 | cat tutorial/data/output1.txt 36 | 37 | The output should look exactly like the input. 38 | **/ 39 | 40 | /** 41 | Both input and output data sources are represented by instances of 42 | com.twitter.scalding.Source. 43 | 44 | Scalding comes with some basic source types like TextLine and Tsv. 45 | There are also many twitter-specific types like MergedAdRequestSource. 46 | **/ 47 | 48 | val input = TextLine("tutorial/data/hello.txt") 49 | val output = TextLine("tutorial/data/output1.txt") 50 | 51 | /** 52 | You can then define a pipe that reads the source and writes to the sink. 53 | The "project" just fetches the content of the line, and not the line number. 54 | **/ 55 | input.read.project('line).write(output) 56 | 57 | /** 58 | And then run it! 
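(The run on the last line executes the pipe we just defined, in the mode given on the command line, here --local.)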
59 | **/ 60 | run 61 | -------------------------------------------------------------------------------- /scalding-hraven/src/test/resources/jobResponse_job_1470171371859_6607542.json: -------------------------------------------------------------------------------- 1 | [ { 2 | "taskType" : "", 3 | "counters" : { 4 | } 5 | }, { 6 | "taskType" : "MAP", 7 | "counters" : { 8 | "org.apache.hadoop.mapreduce.TaskCounter" : { 9 | "PHYSICAL_MEMORY_BYTES" : 751378432, 10 | "GC_TIME_MILLIS" : 310, 11 | "CPU_MILLISECONDS" : 38570, 12 | "COMMITTED_HEAP_BYTES" : 810524672 13 | } 14 | } 15 | }, { 16 | "taskType" : "MAP", 17 | "counters" : { 18 | "org.apache.hadoop.mapreduce.TaskCounter" : { 19 | "PHYSICAL_MEMORY_BYTES" : 751378432, 20 | "GC_TIME_MILLIS" : 310, 21 | "CPU_MILLISECONDS" : 38570, 22 | "COMMITTED_HEAP_BYTES" : 810524672 23 | } 24 | } 25 | }, { 26 | "taskType" : "MAP", 27 | "counters" : { 28 | "org.apache.hadoop.mapreduce.TaskCounter" : { 29 | "PHYSICAL_MEMORY_BYTES" : 759648256, 30 | "GC_TIME_MILLIS" : 313, 31 | "CPU_MILLISECONDS" : 38620, 32 | "COMMITTED_HEAP_BYTES" : 810520576 33 | } 34 | } 35 | }, { 36 | "taskType" : "REDUCE", 37 | "counters" : { 38 | "org.apache.hadoop.mapreduce.TaskCounter" : { 39 | "PHYSICAL_MEMORY_BYTES" : 449499136, 40 | "GC_TIME_MILLIS" : 444, 41 | "CPU_MILLISECONDS" : 53720, 42 | "COMMITTED_HEAP_BYTES" : 506986496 43 | } 44 | } 45 | }, { 46 | "taskType" : "REDUCE", 47 | "counters" : { 48 | "org.apache.hadoop.mapreduce.TaskCounter" : { 49 | "PHYSICAL_MEMORY_BYTES" : 449499136, 50 | "GC_TIME_MILLIS" : 444, 51 | "CPU_MILLISECONDS" : 53720, 52 | "COMMITTED_HEAP_BYTES" : 506986496 53 | } 54 | } 55 | }, { 56 | "taskType" : "REDUCE", 57 | "counters" : { 58 | "org.apache.hadoop.mapreduce.TaskCounter" : { 59 | "PHYSICAL_MEMORY_BYTES" : 465207296, 60 | "GC_TIME_MILLIS" : 529, 61 | "CPU_MILLISECONDS" : 57210, 62 | "COMMITTED_HEAP_BYTES" : 506986496 63 | } 64 | } 65 | } ] 66 | -------------------------------------------------------------------------------- /scalding-hraven/src/test/resources/jobResponse_job_1470171371859_6608570.json: -------------------------------------------------------------------------------- 1 | [ { 2 | "taskType" : "", 3 | "counters" : { 4 | } 5 | }, { 6 | "taskType" : "MAP", 7 | "counters" : { 8 | "org.apache.hadoop.mapreduce.TaskCounter" : { 9 | "PHYSICAL_MEMORY_BYTES" : 768618496, 10 | "GC_TIME_MILLIS" : 371, 11 | "CPU_MILLISECONDS" : 45260, 12 | "COMMITTED_HEAP_BYTES" : 814776320 13 | } 14 | } 15 | }, { 16 | "taskType" : "MAP", 17 | "counters" : { 18 | "org.apache.hadoop.mapreduce.TaskCounter" : { 19 | "PHYSICAL_MEMORY_BYTES" : 768618496, 20 | "GC_TIME_MILLIS" : 371, 21 | "CPU_MILLISECONDS" : 45260, 22 | "COMMITTED_HEAP_BYTES" : 814776320 23 | } 24 | } 25 | }, { 26 | "taskType" : "MAP", 27 | "counters" : { 28 | "org.apache.hadoop.mapreduce.TaskCounter" : { 29 | "PHYSICAL_MEMORY_BYTES" : 758517760, 30 | "GC_TIME_MILLIS" : 355, 31 | "CPU_MILLISECONDS" : 43950, 32 | "COMMITTED_HEAP_BYTES" : 814280704 33 | } 34 | } 35 | }, { 36 | "taskType" : "REDUCE", 37 | "counters" : { 38 | "org.apache.hadoop.mapreduce.TaskCounter" : { 39 | "PHYSICAL_MEMORY_BYTES" : 433074176, 40 | "GC_TIME_MILLIS" : 671, 41 | "CPU_MILLISECONDS" : 74270, 42 | "COMMITTED_HEAP_BYTES" : 506986496 43 | } 44 | } 45 | }, { 46 | "taskType" : "REDUCE", 47 | "counters" : { 48 | "org.apache.hadoop.mapreduce.TaskCounter" : { 49 | "PHYSICAL_MEMORY_BYTES" : 421924864, 50 | "GC_TIME_MILLIS" : 596, 51 | "CPU_MILLISECONDS" : 64390, 52 | "COMMITTED_HEAP_BYTES" : 506986496 53 | } 54 | } 55 | }, { 56 | 
"taskType" : "REDUCE", 57 | "counters" : { 58 | "org.apache.hadoop.mapreduce.TaskCounter" : { 59 | "PHYSICAL_MEMORY_BYTES" : 421924864, 60 | "GC_TIME_MILLIS" : 596, 61 | "CPU_MILLISECONDS" : 64390, 62 | "COMMITTED_HEAP_BYTES" : 506986496 63 | } 64 | } 65 | } ] 66 | -------------------------------------------------------------------------------- /scalding-commons/src/main/scala/com/twitter/scalding/examples/MergeTest.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.examples 2 | 3 | import scala.annotation.tailrec 4 | 5 | import com.twitter.scalding._ 6 | 7 | /** 8 | * This example job does not yet work. It is a test for Kyro serialization 9 | */ 10 | class MergeTest(args: Args) extends Job(args) { 11 | TextLine(args("input")) 12 | .flatMapTo('word)(_.split("""\s+""")) 13 | .groupBy('word)(_.size) 14 | // Now, let's get the top 10 words: 15 | .groupAll { 16 | _.mapReduceMap(('word, 'size) -> 'list) /* map1 */ { tup: (String, Long) => List(tup) } /* reduce */ { 17 | (l1: List[(String, Long)], l2: List[(String, Long)]) => 18 | mergeSort2(l1, l2, 10, cmpTup) 19 | } /* map2 */ { lout: List[(String, Long)] => 20 | lout 21 | } 22 | } 23 | // Now expand out the list. 24 | .flatMap('list -> ('word, 'cnt)) { list: List[(String, Long)] => list } 25 | .project('word, 'cnt) 26 | .write(Tsv(args("output"))) 27 | 28 | // Reverse sort to get the top items 29 | def cmpTup(t1: (String, Long), t2: (String, Long)) = t2._2.compareTo(t1._2) 30 | 31 | def mergeSort2[T](v1: List[T], v2: List[T], k: Int, cmp: Function2[T, T, Int]) = { 32 | @tailrec 33 | def mergeSortR(acc: List[T], list1: List[T], list2: List[T], k: Int): List[T] = 34 | (list1, list2, k) match { 35 | case (_, _, 0) => acc 36 | case (x1 :: t1, x2 :: t2, _) => { 37 | if (cmp(x1, x2) < 0) { 38 | mergeSortR(x1 :: acc, t1, list2, k - 1) 39 | } else { 40 | mergeSortR(x2 :: acc, list1, t2, k - 1) 41 | } 42 | } 43 | case (x1 :: t1, Nil, _) => mergeSortR(x1 :: acc, t1, Nil, k - 1) 44 | case (Nil, x2 :: t2, _) => mergeSortR(x2 :: acc, Nil, t2, k - 1) 45 | case (Nil, Nil, _) => acc 46 | } 47 | mergeSortR(Nil, v1, v2, k).reverse 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /scalding-core/codegen/tuple_adder_generator.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | # Run it like this: 4 | # 5 | # ruby scripts/tuple_adder_generator.rb > src/main/scala/com/twitter/scalding/GeneratedTupleAdders.scala 6 | 7 | $indent = " " 8 | 9 | TYPES = ('A'..'Z').to_a 10 | 11 | def make_tuple_adder(cnt) 12 | other_cnts = (1..(22-cnt)).to_a 13 | puts "#{$indent}class Tuple#{cnt}Adder[#{TYPES[0..(cnt - 1)].join(",")}](tup : #{get_tuple(0, cnt)}) {" 14 | 15 | # Add the :+ method 16 | puts "#{$indent}#{$indent}def :+[#{TYPES[cnt]}](other : #{TYPES[cnt]}) = {" 17 | puts "#{$indent}#{$indent}#{$indent}(#{tup_get("tup", cnt)},other)" 18 | puts "#{$indent}#{$indent}}" 19 | 20 | # Add the +: method 21 | puts "#{$indent}#{$indent}def +:[#{TYPES[cnt]}](other : #{TYPES[cnt]}) = {" 22 | puts "#{$indent}#{$indent}#{$indent}(other,#{tup_get("tup", cnt)})" 23 | puts "#{$indent}#{$indent}}" 24 | 25 | other_cnts.each do |ocnt| 26 | puts 27 | puts "#{$indent}#{$indent}def ++[#{TYPES[cnt..(cnt + ocnt - 1)].join(",")}](other : #{get_tuple(cnt, ocnt)}) = {" 28 | puts "#{$indent}#{$indent}#{$indent}(#{tup_get("tup", cnt)},#{tup_get("other", ocnt)})" 29 | puts "#{$indent}#{$indent}}" 30 | end 31 | puts 
"#{$indent}}" 32 | puts 33 | puts "#{$indent}implicit def tup#{cnt}ToAdder[#{TYPES[0..(cnt - 1)].join(",")}](tup : Tuple#{cnt}[#{TYPES[0..(cnt - 1)].join(",")}]) = new Tuple#{cnt}Adder(tup)" 34 | end 35 | 36 | def get_tuple(cnt1, cnt2) 37 | "Tuple#{cnt2}[#{TYPES[cnt1..(cnt1 + cnt2 - 1)].join(",")}]" 38 | end 39 | 40 | def tup_get(name, cnt) 41 | (1..cnt).map{ |i| "#{name}._#{i}" }.join(",") 42 | end 43 | 44 | puts "// following were autogenerated by #{__FILE__} at #{Time.now} do not edit" 45 | puts %q|package com.twitter.scalding 46 | 47 | trait GeneratedTupleAdders { 48 | | 49 | 50 | (1..21).each { |c| 51 | make_tuple_adder(c) 52 | puts 53 | } 54 | 55 | puts "}" 56 | puts "// end of autogenerated" 57 | -------------------------------------------------------------------------------- /scalding-core/src/main/scala/com/twitter/scalding/typed/MemorySink.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2014 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | package com.twitter.scalding.typed 17 | 18 | import com.twitter.scalding._ 19 | 20 | import scala.collection.mutable.Buffer 21 | 22 | import java.util.UUID 23 | 24 | import cascading.pipe.Pipe 25 | import cascading.flow.FlowDef 26 | import cascading.scheme.NullScheme 27 | import cascading.tuple.Tuple 28 | 29 | /* 30 | * This is useful for in-memory testing with Execution 31 | * It only works for CascadingLocal mode. 32 | */ 33 | class MemorySink[T] extends TypedSink[T] { 34 | private[this] val buf = Buffer[Tuple]() 35 | private[this] val name: String = UUID.randomUUID.toString 36 | 37 | // takes a copy as of NOW. Don't call this before the job has run 38 | def readResults: Iterable[T] = 39 | buf.iterator.map(_.getObject(0).asInstanceOf[T]).toList 40 | 41 | def setter[U <: T] = TupleSetter.asSubSetter(TupleSetter.singleSetter[T]) 42 | def writeFrom(pipe: Pipe)(implicit flowDef: FlowDef, mode: Mode): Pipe = 43 | mode match { 44 | case cl: CascadingLocal => 45 | val tap = new MemoryTap(new NullScheme(sinkFields, sinkFields), buf) 46 | flowDef.addSink(name, tap) 47 | flowDef.addTail(new Pipe(name, pipe)) 48 | pipe 49 | case _ => sys.error("MemorySink only usable with cascading local") 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /scalding-core/src/test/scala/com/twitter/scalding/ScanLeftTest.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding 2 | 3 | import org.scalatest.{Matchers, WordSpec} 4 | 5 | /** 6 | * Simple Example: First group data by gender and then sort by height reverse order. Then add another column 7 | * for each group which is the rank order of the height. 
8 | */ 9 | class AddRankingWithScanLeft(args: Args) extends Job(args) { 10 | Tsv("input1", ('gender, 'height)).read 11 | .groupBy('gender) { group => 12 | group.sortBy('height).reverse // GroupBuilder is mutable: this sets the group's sort order in place 13 | group.scanLeft('height -> 'rank)(0L) { (rank: Long, height: Double) => 14 | (rank + 1L) 15 | } 16 | } 17 | // scanLeft emits an extra row per group (the seed value), so filter it out 18 | .filter('height) { x: String => x != null } 19 | .debug 20 | .write(Tsv("result1")) 21 | } 22 | 23 | class ScanLeftTest extends WordSpec with Matchers { 24 | import Dsl._ 25 | 26 | // --- A simple ranking job 27 | val sampleInput1 = 28 | List(("male", "165.2"), ("female", "172.2"), ("male", "184.1"), ("male", "125.4"), ("female", "128.6")) 29 | 30 | // Each group sorted and ranked, tallest person to shortest 31 | val expectedOutput1 = Set( 32 | ("male", 184.1, 1), 33 | ("male", 165.2, 2), 34 | ("male", 125.4, 3), 35 | ("female", 172.2, 1), 36 | ("female", 128.6, 2) 37 | ) 38 | 39 | "A simple ranking scanleft job" should { 40 | JobTest(new AddRankingWithScanLeft(_)) 41 | .source(Tsv("input1", ('gender, 'height)), sampleInput1) 42 | .sink[(String, Double, Long)](Tsv("result1")) { outBuf1 => 43 | "produce correct number of records when filtering out null values" in { 44 | outBuf1 should have size 5 45 | } 46 | "create correct ranking per group, 1st being the tallest person of that group" in { 47 | outBuf1.toSet shouldBe expectedOutput1 48 | } 49 | } 50 | .run 51 | .finish() 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /scalding-quotation/src/main/scala/com/twitter/scalding/quotation/Liftables.scala: -------------------------------------------------------------------------------- 1 | package com.twitter.scalding.quotation 2 | 3 | import scala.reflect.macros.blackbox.Context 4 | 5 | /** 6 | * These Liftables allow us to lift values into quasiquote trees.
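(In scala.reflect terms, a Liftable[T] describes how to turn a runtime value of type T into a tree, so the value can be spliced into a quasiquote.)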
For example: 7 | * 8 | * def test(v: Source) = q"$v" 9 | * 10 | * uses `sourceLiftable` 11 | */ 12 | trait Liftables { 13 | val c: Context 14 | import c.universe.{TypeName => _, _} 15 | 16 | protected implicit val sourceLiftable: Liftable[Source] = Liftable { case Source(path, line) => 17 | q"_root_.com.twitter.scalding.quotation.Source($path, $line)" 18 | } 19 | 20 | protected implicit val projectionsLiftable: Liftable[Projections] = Liftable { case p => 21 | q"_root_.com.twitter.scalding.quotation.Projections(${p.set})" 22 | } 23 | 24 | protected implicit val typeNameLiftable: Liftable[TypeName] = Liftable { case TypeName(name) => 25 | q"_root_.com.twitter.scalding.quotation.TypeName($name)" 26 | } 27 | 28 | protected implicit val accessorLiftable: Liftable[Accessor] = Liftable { case Accessor(name) => 29 | q"_root_.com.twitter.scalding.quotation.Accessor($name)" 30 | } 31 | 32 | protected implicit val quotedLiftable: Liftable[Quoted] = Liftable { case Quoted(source, call, fa) => 33 | q"_root_.com.twitter.scalding.quotation.Quoted($source, $call, $fa)" 34 | } 35 | 36 | protected implicit val projectionLiftable: Liftable[Projection] = Liftable { 37 | case p: Property => q"$p" 38 | case p: TypeReference => q"$p" 39 | } 40 | 41 | protected implicit val propertyLiftable: Liftable[Property] = Liftable { 42 | case Property(path, accessor, tpe) => 43 | q"_root_.com.twitter.scalding.quotation.Property($path, $accessor, $tpe)" 44 | } 45 | 46 | protected implicit val typeReferenceLiftable: Liftable[TypeReference] = Liftable { 47 | case TypeReference(name) => q"_root_.com.twitter.scalding.quotation.TypeReference($name)" 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /scalding-avro/src/main/scala/com/twitter/scalding/avro/package.scala: -------------------------------------------------------------------------------- 1 | /* Copyright 2013 eBay, inc. 2 | * 3 | * Licensed under the Apache License, Version 2.0 (the "License"); 4 | * you may not use this file except in compliance with the License. 5 | * You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software 10 | * distributed under the License is distributed on an "AS IS" BASIS, 11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | * See the License for the specific language governing permissions and 13 | * limitations under the License.
14 | */ 15 | 16 | package com.twitter.scalding 17 | 18 | import cascading.flow.FlowDef 19 | import org.apache.avro.Schema 20 | import collection.JavaConverters._ 21 | import cascading.tuple.Fields 22 | 23 | import com.twitter.scalding.typed.cascading_backend.CascadingExtensions._ 24 | 25 | package object avro { 26 | def writePackedAvro[T](pipe: TypedPipe[T], path: String)(implicit 27 | mf: Manifest[T], 28 | st: AvroSchemaType[T], 29 | conv: TupleConverter[T], 30 | set: TupleSetter[T], 31 | flow: FlowDef, 32 | mode: Mode 33 | ): Unit = { 34 | val sink = PackedAvroSource[T](path) 35 | pipe.write(sink) 36 | } 37 | 38 | def writeUnpackedAvro[T <: Product](pipe: TypedPipe[T], path: String, schema: Schema)(implicit 39 | mf: Manifest[T], 40 | conv: TupleConverter[T], 41 | set: TupleSetter[T], 42 | flow: FlowDef, 43 | mode: Mode 44 | ): Unit = { 45 | import Dsl._ 46 | val sink = UnpackedAvroSource[T](path, Some(schema)) 47 | val outFields = { 48 | val schemaFields = schema.getFields 49 | schemaFields.asScala.foldLeft(new Fields())((cFields, sField) => 50 | cFields.append(new Fields(sField.name())) 51 | ) 52 | } 53 | pipe.toPipe(outFields).write(sink) 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /scalding-args/src/main/scala/com/twitter/scalding/RangedArgs.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package com.twitter.scalding 18 | 19 | object RangedArgs { 20 | implicit def rangedFromArgs(args: Args): RangedArgs = new RangedArgs(args) 21 | } 22 | 23 | case class Range[T](lower: T, upper: T)(implicit ord: Ordering[T]) { 24 | assert(ord.lteq(lower, upper), "Bad range: " + lower + " > " + upper) 25 | 26 | def assertLowerBound(min: T): Unit = 27 | assert(ord.lteq(min, lower), "Range out of bounds: " + lower + " < " + min) 28 | 29 | def assertUpperBound(max: T): Unit = 30 | assert(ord.gteq(max, upper), "Range out of bounds: " + upper + " > " + max) 31 | 32 | def assertBounds(min: T, max: T): Unit = { 33 | assertLowerBound(min) 34 | assertUpperBound(max) 35 | } 36 | 37 | def mkString(sep: String) = 38 | if (ord.equiv(lower, upper)) { 39 | lower.toString 40 | } else { 41 | lower.toString + sep + upper.toString 42 | } 43 | } 44 | 45 | class RangedArgs(args: Args) { 46 | def range[T](argName: String)(cnv: String => T)(implicit ord: Ordering[T]): Range[T] = 47 | args.list(argName) match { 48 | case List(v) => 49 | Range(cnv(v), cnv(v)) 50 | case List(v1, v2) => 51 | Range(cnv(v1), cnv(v2)) 52 | case _ => 53 | throw new IllegalArgumentException(argName + " must have either 1 or 2 values specified") 54 | } 55 | } 56 | --------------------------------------------------------------------------------
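A hedged usage sketch for the RangedArgs API above (not part of the repository; the flag name "--version" and the .toInt conversion are illustrative):

import com.twitter.scalding.Args
import com.twitter.scalding.RangedArgs._

// "--version 1 3" parses to Range(1, 3); a single value yields a degenerate range.
val args = Args(Array("--version", "1", "3"))
val versions = args.range("version")(_.toInt) // via the implicit rangedFromArgs conversion
versions.assertBounds(0, 10)                  // throws AssertionError if the range falls outside [0, 10]
println(versions.mkString("-"))               // prints "1-3"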