├── .gitignore ├── LICENSE-2.0.txt ├── README.md ├── build.sbt ├── data ├── docBOW.tsv ├── graph.tsv ├── graph2.tsv ├── hello.txt ├── helloDoc.txt ├── phones.txt ├── session.json ├── word_scores.tsv └── words.txt ├── project ├── BuildSettings.scala ├── Dependencies.scala ├── ScaldingTutorialBuild.scala ├── build.properties └── plugins.sbt └── src └── main └── scala └── tutorial ├── AvroTutorial0.scala ├── CodeSnippets.md ├── JobRunner.scala ├── JsonTutorial0.scala ├── MatrixTutorial0.scala ├── MatrixTutorial1.scala ├── MatrixTutorial2.scala ├── MatrixTutorial3.scala ├── MatrixTutorial4.scala ├── MatrixTutorial5.scala ├── MatrixTutorial6.scala ├── Tutorial0.scala ├── Tutorial1.scala ├── Tutorial2.scala ├── Tutorial3.scala ├── Tutorial4.scala ├── Tutorial5.scala ├── Tutorial6.scala └── TypedTutorial.scala /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | 4 | # sbt specific 5 | dist/* 6 | target/ 7 | lib_managed/ 8 | src_managed/ 9 | project/boot/ 10 | project/plugins/project/ 11 | -------------------------------------------------------------------------------- /LICENSE-2.0.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. 
For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Scalding Tutorial Project 2 | 3 | ## Introduction 4 | 5 | This is Twitter's [tutorial] [tutorial] for [Scalding] [scalding] adapted to run 6 | on Hadoop as a standalone job - i.e. without requiring `scald.rb` etc. 7 | 8 | This was built as a Scala SBT project by the [Concurrent Inc] [concurrent] team, 9 | in order to integrate the scalding tutorial into the [Cascading SDK][sdk]. It 10 | is based on the excellent work done by [Snowplow Analytics][snowplow] for 11 | porting the [`Wordcount example`][wordcount] to SBT. 12 | 13 | The versioning of the project follows the versions of the scalding release on 14 | which it is based. 15 | 16 | Please note that this tutorial uses scala 2.10 and not 2.9. 17 | 18 | ## Prerequisites 19 | 20 | In order to use this tutorial, you need to have `SBT` and the `hadoop` command 21 | installed. Cascading and therefore scalding is compatible with a number of 22 | hadoop distributions. 
If you are unsure whether your distribution is compatible, 23 | please check the [compatibility][compatibility] page. 24 | 25 | You do not need to have a full hadoop cluster in order to run this tutorial. 26 | The local mode of hadoop is sufficient. 27 | 28 | 29 | ## Building 30 | 31 | Assuming you already have SBT installed: 32 | 33 | $ git clone git://github.com/Cascading/scalding-tutorial.git 34 | $ cd scalding-tutorial 35 | $ sbt assembly 36 | 37 | The 'fat jar' is now available as: 38 | 39 | target/scalding-tutorial-0.14.0.jar 40 | 41 | ## Project structure 42 | 43 | Some modifications have been done to the code, in order to properly work in an SBT 44 | based build. 45 | 46 | * all code is now in `src/main/scala/tutorial` 47 | * the data files for the different parts live now in `data` 48 | * the classes in the matrix tutorial have been renamed to match the file names, 49 | so that the commandline invocation is similar to the original tutorial 50 | * the documentation of the examples has been adapted to match the new structure 51 | 52 | ## Running the examples 53 | 54 | Each part of the tutorial explains how to run it properly. However, the general 55 | way is always 56 | 57 | $ yarn jar target/scalding-tutorial-0.14.0.jar --local 58 | 59 | ## Copyright and license 60 | 61 | Copyright 2012-2014 Concurrent Inc, with significant portions copyright 2012 Twitter, Inc. and Snowplow Analytics Inc. 62 | 63 | Licensed under the [Apache License, Version 2.0] [license] (the "License"); 64 | you may not use this software except in compliance with the License. 65 | 66 | Unless required by applicable law or agreed to in writing, software 67 | distributed under the License is distributed on an "AS IS" BASIS, 68 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 69 | See the License for the specific language governing permissions and 70 | limitations under the License. 
71 | 72 | [tutorial]: https://github.com/twitter/scalding/tree/develop/tutorial 73 | [sdk]: http://cascading.org/sdk 74 | [scalding]: https://github.com/twitter/scalding/ 75 | [concurrent]: http://concurrentinc.com 76 | [snowplow]: http://snowplowanalytics.com 77 | [wordcount]: http://github.com/snowplow/scalding-example-project 78 | [license]: http://www.apache.org/licenses/LICENSE-2.0 79 | [compatibility]: http://www.cascading.org/support/compatibility/ 80 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | net.virtualvoid.sbt.graph.Plugin.graphSettings 2 | -------------------------------------------------------------------------------- /data/docBOW.tsv: -------------------------------------------------------------------------------- 1 | 1 hello 2 2 | 1 twitter 1 3 | 2 conversation 1 4 | 2 celebrities 1 5 | 2 twitter 1 6 | 3 elections 1 7 | 3 debate 1 8 | 3 twitter 1 9 | 3 political 1 10 | -------------------------------------------------------------------------------- /data/graph.tsv: -------------------------------------------------------------------------------- 1 | 1 2 1 2 | 1 3 1 3 | 3 2 1 4 | 4 2 2 5 | -------------------------------------------------------------------------------- /data/graph2.tsv: -------------------------------------------------------------------------------- 1 | 1 2 1 2 | 1 3 1 3 | 2 3 1 4 | -------------------------------------------------------------------------------- /data/hello.txt: -------------------------------------------------------------------------------- 1 | Hello world 2 | Goodbye world 3 | -------------------------------------------------------------------------------- /data/helloDoc.txt: -------------------------------------------------------------------------------- 1 | 1 Hello world 2 | 2 See ya soon world 3 | 3 Hello again world 4 | 
-------------------------------------------------------------------------------- /data/phones.txt: -------------------------------------------------------------------------------- 1 | john smith 5551212 30 US 2 | harry bovik 4122680000 55 US 3 | jane doe 4125551212 40 CN 4 | -------------------------------------------------------------------------------- /data/session.json: -------------------------------------------------------------------------------- 1 | {"sessionId": "efdc0698-66ea-4d3a-ac6e-fab9fd97a78b"} 2 | {"sessionId": "efdc0698-66ea-4d3a-ac6e-fab9fd97a78b"} 3 | {"sessionId": "efdc0698-66ea-4d3a-ac6e-fab9fd97a78b"} 4 | {"sessionId": "efdc0698-66ea-4d3a-ac6e-fab9fd97a78b"} 5 | {"sessionId": "efdc0698-66ea-4d3a-ac6e-fab9fd97a78b"} 6 | {"sessionId": "efdc0698-66ea-4d3a-ac6e-fab9fd97a78b"} 7 | -------------------------------------------------------------------------------- /data/word_scores.tsv: -------------------------------------------------------------------------------- 1 | hello 1.0 2 | world 2.0 3 | goodbye 3.0 -------------------------------------------------------------------------------- /data/words.txt: -------------------------------------------------------------------------------- 1 | hello 2 | world 3 | goodbye 4 | -------------------------------------------------------------------------------- /project/BuildSettings.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012 SnowPlow Analytics Ltd. All rights reserved. 3 | * 4 | * This program is licensed to you under the Apache License Version 2.0, 5 | * and you may not use this file except in compliance with the Apache License Version 2.0. 6 | * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. 
7 | * 8 | * Unless required by applicable law or agreed to in writing, 9 | * software distributed under the Apache License Version 2.0 is distributed on an 10 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 12 | */ 13 | import sbt._ 14 | import Keys._ 15 | 16 | object BuildSettings { 17 | 18 | // Basic settings for our app 19 | lazy val basicSettings = Seq[Setting[_]]( 20 | organization := "Concurrent Inc.", 21 | version := "0.14.0", // -> follow the release numbers of scalding 22 | description := "The scalding tutorial as an SBT project", 23 | scalaVersion := "2.10.0", 24 | scalacOptions := Seq("-deprecation", "-encoding", "utf8"), 25 | resolvers ++= Dependencies.resolutionRepos 26 | ) 27 | 28 | // sbt-assembly settings for building a fat jar 29 | import sbtassembly.Plugin._ 30 | import AssemblyKeys._ 31 | lazy val sbtAssemblySettings = assemblySettings ++ Seq( 32 | 33 | // Slightly cleaner jar name 34 | jarName in assembly <<= (name, version) { (name, version) => name + "-" + version + ".jar" }, 35 | 36 | // Drop these jars 37 | excludedJars in assembly <<= (fullClasspath in assembly) map { cp => 38 | val excludes = Set( 39 | "jsp-api-2.1-6.1.14.jar", 40 | "jsp-2.1-6.1.14.jar", 41 | "jasper-compiler-5.5.12.jar", 42 | "minlog-1.2.jar", // Otherwise causes conflicts with Kyro (which bundles it) 43 | "janino-2.5.16.jar", // Janino includes a broken signature, and is not needed anyway 44 | "commons-beanutils-core-1.8.0.jar", // Clash with each other and with commons-collections 45 | "commons-beanutils-1.7.0.jar", // " 46 | "hadoop-core-1.1.2.jar", 47 | "hadoop-tools-1.1.2.jar" // " 48 | ) 49 | cp filter { jar => excludes(jar.data.getName) } 50 | }, 51 | 52 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { 53 | (old) => { 54 | case "project.clj" => MergeStrategy.discard // Leiningen build files 
55 | case x => old(x) 56 | } 57 | } 58 | ) 59 | 60 | lazy val buildSettings = basicSettings ++ sbtAssemblySettings 61 | } 62 | -------------------------------------------------------------------------------- /project/Dependencies.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012 SnowPlow Analytics Ltd. All rights reserved. 3 | * 4 | * This program is licensed to you under the Apache License Version 2.0, 5 | * and you may not use this file except in compliance with the Apache License Version 2.0. 6 | * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. 7 | * 8 | * Unless required by applicable law or agreed to in writing, 9 | * software distributed under the Apache License Version 2.0 is distributed on an 10 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 12 | */ 13 | import sbt._ 14 | 15 | object Dependencies { 16 | val resolutionRepos = Seq( 17 | ScalaToolsSnapshots, 18 | "Concurrent Maven Repo" at "http://conjars.org/repo" // For Scalding, Cascading etc 19 | ) 20 | 21 | object V { 22 | val scalding = "0.14.0" 23 | val hadoop = "2.6.0" 24 | val specs2 = "1.13" // -> "1.13" when we bump to Scala 2.10.0 25 | // Add versions for your additional libraries here... 
26 | val cascading = "2.7.0" 27 | } 28 | 29 | object Libraries { 30 | val cascadingCore = "cascading" % "cascading-core" % V.cascading 31 | val cascadingLocal = "cascading" % "cascading-local" % V.cascading 32 | val cascadingHadoop = "cascading" % "cascading-hadoop2-mr1" % V.cascading 33 | val scaldingCore = "com.twitter" %% "scalding-core" % V.scalding exclude( "cascading", "cascading-local" ) exclude( "cascading", "cascading-hadoop" ) 34 | val scaldingJson = "com.twitter" %% "scalding-json" % V.scalding exclude( "cascading", "cascading-local" ) exclude( "cascading", "cascading-hadoop" ) 35 | val scaldingAvro = "com.twitter" %% "scalding-avro" % V.scalding exclude( "cascading", "cascading-local" ) exclude( "cascading", "cascading-hadoop" ) 36 | val hadoopCore = "org.apache.hadoop" % "hadoop-common" % V.hadoop % "provided" 37 | val hadoopClientCore = "org.apache.hadoop" % "hadoop-mapreduce-client-core" % V.hadoop % "provided" 38 | // Add additional libraries from mvnrepository.com (SBT syntax) here... 39 | 40 | // Scala (test only) 41 | val specs2 = "org.specs2" %% "specs2" % V.specs2 % "test" 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /project/ScaldingTutorialBuild.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012 SnowPlow Analytics Ltd. All rights reserved. 3 | * 4 | * This program is licensed to you under the Apache License Version 2.0, 5 | * and you may not use this file except in compliance with the Apache License Version 2.0. 6 | * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. 7 | * 8 | * Unless required by applicable law or agreed to in writing, 9 | * software distributed under the Apache License Version 2.0 is distributed on an 10 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
11 | * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 12 | */ 13 | import sbt._ 14 | import Keys._ 15 | 16 | object ScaldingTutorialProjectBuild extends Build { 17 | 18 | import Dependencies._ 19 | import BuildSettings._ 20 | 21 | // Configure prompt to show current project 22 | override lazy val settings = super.settings :+ { 23 | shellPrompt := { s => Project.extract(s).currentProject.id + " > " } 24 | } 25 | 26 | // Define our project, with basic project information and library dependencies 27 | lazy val project = Project("scalding-tutorial", file(".")) 28 | .settings(buildSettings: _*) 29 | .settings( 30 | libraryDependencies ++= Seq( 31 | Libraries.cascadingCore, 32 | Libraries.cascadingLocal, 33 | Libraries.cascadingHadoop, 34 | Libraries.scaldingCore, 35 | Libraries.scaldingJson, 36 | Libraries.scaldingAvro, 37 | Libraries.hadoopCore, 38 | Libraries.hadoopClientCore, 39 | Libraries.specs2 40 | // Add your additional libraries here (comma-separated)... 
41 | ) 42 | ) 43 | } 44 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=0.12.3 2 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Resolver.url("plugins-artifactory", url("http://scalasbt.artifactoryonline.com/scalasbt/sbt-plugin-releases"))(Resolver.ivyStylePatterns) 2 | 3 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.8.5") 4 | 5 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.7.4") 6 | -------------------------------------------------------------------------------- /src/main/scala/tutorial/AvroTutorial0.scala: -------------------------------------------------------------------------------- 1 | /** 2 | Scalding with Avro (and Json) tutorial part 0. 3 | 4 | To run this job: 5 | yarn jar target/scalding-tutorial-0.14.0.jar AvroTutorial0 --local --avro --json 6 | 7 | Check the output: 8 | java -jar avro-tools-1.7.6.jar tojson tutorial/data/avrooutput0.avro 9 | 10 | **/ 11 | 12 | import com.twitter.scalding.{Job, Args, JsonLine} 13 | import com.twitter.scalding.avro.UnpackedAvroSource 14 | import org.apache.avro.Schema 15 | 16 | class AvroTutorial0(args: Args) extends Job(args) { 17 | val schema = """{ 18 | "type": "record", "name": "parseJson", "fields": [ 19 | { "name": "sessionId", "type": "string" }, 20 | { "name": "optionalField", "type": ["string", "null"] } 21 | ] }""" 22 | 23 | JsonLine("data/session.json", ('sessionId, 'optionalField)).read 24 | .write(UnpackedAvroSource("target/data/avrooutput0.avro", new Schema.Parser().parse(schema))) 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala/tutorial/CodeSnippets.md: 
-------------------------------------------------------------------------------- 1 | Please see the [API reference](https://github.com/twitter/scalding/wiki/API-Reference) on the wiki. 2 | -------------------------------------------------------------------------------- /src/main/scala/tutorial/JobRunner.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012 SnowPlow Analytics Ltd. All rights reserved. 3 | * 4 | * This program is licensed to you under the Apache License Version 2.0, 5 | * and you may not use this file except in compliance with the Apache License Version 2.0. 6 | * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. 7 | * 8 | * Unless required by applicable law or agreed to in writing, 9 | * software distributed under the Apache License Version 2.0 is distributed on an 10 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 12 | */ 13 | 14 | // Hadoop 15 | import org.apache.hadoop 16 | 17 | // Scalding 18 | import com.twitter.scalding.Tool 19 | 20 | /** 21 | * Entrypoint for Hadoop to kick off the job. 22 | * 23 | * Borrowed from com.twitter.scalding.Tool 24 | */ 25 | object JobRunner { 26 | def main(args : Array[String]) { 27 | hadoop.util.ToolRunner.run(new hadoop.conf.Configuration, new Tool, args); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/main/scala/tutorial/JsonTutorial0.scala: -------------------------------------------------------------------------------- 1 | /** 2 | Scalding with Json tutorial part 0. 
3 | 4 | To run this job: 5 | yarn jar target/scalding-tutorial-0.14.0.jar JsonTutorial0 --local --json 6 | 7 | Check the output: 8 | cat target/data/jsonoutput0.tsv 9 | 10 | **/ 11 | 12 | import com.twitter.scalding.{Job, Args, JsonLine, Tsv} 13 | 14 | class JsonTutorial0(args: Args) extends Job(args) { 15 | JsonLine("data/session.json", ('sessionId)).read 16 | .groupBy('sessionId){_.size} 17 | .write(Tsv("target/data/jsonoutput0.tsv")) 18 | } 19 | -------------------------------------------------------------------------------- /src/main/scala/tutorial/MatrixTutorial0.scala: -------------------------------------------------------------------------------- 1 | import com.twitter.scalding._ 2 | import com.twitter.scalding.mathematics.Matrix 3 | 4 | /* 5 | * MatrixTutorial0.scala 6 | * 7 | * Loads a directed graph adjacency matrix where a[i,j] = 1 if there is an edge from a[i] to b[j] 8 | * and compute the outdegree of each node i 9 | * 10 | yarn jar target/scalding-tutorial-0.14.0.jar MatrixTutorial0 --local\ 11 | --input data/graph.tsv \ 12 | --output target/data/outdegree.tsv 13 | * 14 | */ 15 | 16 | 17 | class MatrixTutorial0(args : Args) extends Job(args) { 18 | 19 | import Matrix._ 20 | 21 | val adjacencyMatrix = Tsv( args("input"), ('user1, 'user2, 'rel) ) 22 | .read 23 | .toMatrix[Long,Long,Double]('user1, 'user2, 'rel) 24 | 25 | // each row i represents all of the outgoing edges from i 26 | // by summing out all of the columns we get the outdegree of i 27 | adjacencyMatrix.sumColVectors.write( Tsv( args("output") ) ) 28 | } 29 | 30 | -------------------------------------------------------------------------------- /src/main/scala/tutorial/MatrixTutorial1.scala: -------------------------------------------------------------------------------- 1 | import com.twitter.scalding._ 2 | import com.twitter.scalding.mathematics.Matrix 3 | 4 | 5 | /* 6 | * MatrixTutorial1.scala 7 | * 8 | * Loads a directed graph adjacency matrix where a[i,j] = 1 if there is an edge from 
a[i] to b[j] 9 | * and compute the co-follows between any two nodes 10 | * 11 | yarn jar target/scalding-tutorial-0.14.0.jar MatrixTutorial1 --local\ 12 | --input data/graph.tsv --output target/data/cofollows.tsv 13 | * 14 | */ 15 | 16 | 17 | class MatrixTutorial1(args : Args) extends Job(args) { 18 | 19 | import Matrix._ 20 | 21 | val adjacencyMatrix = Tsv( args("input"), ('user1, 'user2, 'rel) ) 22 | .read 23 | .toMatrix[Long,Long,Double]('user1, 'user2, 'rel) 24 | 25 | // compute the innerproduct of the adjacency matrix with itself 26 | (adjacencyMatrix * adjacencyMatrix.transpose).write( Tsv( args("output") ) ) 27 | } 28 | 29 | -------------------------------------------------------------------------------- /src/main/scala/tutorial/MatrixTutorial2.scala: -------------------------------------------------------------------------------- 1 | import com.twitter.scalding._ 2 | import com.twitter.scalding.mathematics.Matrix 3 | 4 | 5 | /* 6 | * MatrixTutorial2.scala 7 | * 8 | * Loads a directed graph adjacency matrix where a[i,j] = 1 if there is an edge from a[i] to b[j] 9 | * and returns a graph containing only the nodes with outdegree smaller than a given value 10 | * 11 | yarn jar target/scalding-tutorial-0.14.0.jar MatrixTutorial2 --local\ 12 | --input data/graph.tsv --maxOutdegree 1000 --output target/data/graphFiltered.tsv 13 | * 14 | */ 15 | 16 | 17 | class MatrixTutorial2(args : Args) extends Job(args) { 18 | 19 | import Matrix._ 20 | 21 | val adjacencyMatrix = Tsv( args("input"), ('user1, 'user2, 'rel) ) 22 | .read 23 | .toMatrix[Long,Long,Double]('user1, 'user2, 'rel) 24 | 25 | // Each row corresponds to the outgoing edges so to compute the outdegree we sum out the columns 26 | val outdegree = adjacencyMatrix.sumColVectors 27 | 28 | // We convert the column vector to a matrix object to be able to use the matrix method filterValues 29 | // we make all non zero values into ones and then convert it back to column vector 30 | val outdegreeFiltered = 
outdegree.toMatrix[Int](1) 31 | .filterValues{ _ < args("maxOutdegree").toDouble } 32 | .binarizeAs[Double].getCol(1) 33 | 34 | // We multiply on the left hand side with the diagonal matrix created from the column vector 35 | // to keep only the rows with outdegree smaller than maxOutdegree 36 | (outdegreeFiltered.diag * adjacencyMatrix).write(Tsv( args("output") ) ) 37 | 38 | } 39 | 40 | -------------------------------------------------------------------------------- /src/main/scala/tutorial/MatrixTutorial3.scala: -------------------------------------------------------------------------------- 1 | import com.twitter.scalding._ 2 | import com.twitter.scalding.mathematics.Matrix 3 | 4 | 5 | /* 6 | * MatrixTutorial3.scala 7 | * 8 | * Loads two directed graph adjacency matrices where a[i,j] = 1 if there is an edge from a[i] to b[j] 9 | * and computes the intersection and the differences between the two 10 | * 11 | yarn jar target/scalding-tutorial-0.14.0.jar MatrixTutorial3 --local\ 12 | --input1 data/graph.tsv --input2 data/graph2.tsv --intersection data/intersection.tsv\ 13 | --leftDiff target/data/leftDiff.tsv --rightDiff target/data/rightDiff.tsv 14 | * 15 | */ 16 | 17 | 18 | class MatrixTutorial3(args : Args) extends Job(args) { 19 | 20 | import Matrix._ 21 | 22 | val adjacencyMatrix1 = Tsv( args("input1"), ('user1, 'user2, 'rel) ) 23 | .read 24 | .toMatrix[Long,Long,Double]('user1, 'user2, 'rel) 25 | 26 | val adjacencyMatrix2 = Tsv( args("input2"), ('user1, 'user2, 'rel) ) 27 | .read 28 | .toMatrix[Long,Long,Double]('user1, 'user2, 'rel) 29 | 30 | //zip creates a pair element out of corresponding elements in the two matrices 31 | val intersection = adjacencyMatrix1 32 | .zip(adjacencyMatrix2) 33 | .mapValues( pair => if (pair._1 > 0 && pair._2 > 0) 1.0 else 0.0 ) 34 | .write(Tsv(args("intersection"))) 35 | (adjacencyMatrix1 - intersection).write(Tsv(args("leftDiff"))) 36 | (adjacencyMatrix2 - intersection).write(Tsv(args("rightDiff"))) 37 | 38 | } 39 | 40 
| -------------------------------------------------------------------------------- /src/main/scala/tutorial/MatrixTutorial4.scala: -------------------------------------------------------------------------------- 1 | import com.twitter.scalding._ 2 | import com.twitter.scalding.mathematics.Matrix 3 | 4 | 5 | /* 6 | * MatrixTutorial4.scala 7 | * 8 | * Loads a directed graph adjacency matrix where a[i,j] = 1 if there is an edge from a[i] to b[j] 9 | * and computes the cosine of the angle between every two pairs of vectors 10 | * 11 | yarn jar target/scalding-tutorial-0.14.0.jar MatrixTutorial4 --local\ 12 | --input data/graph.tsv --output target/data/cosineSim.tsv 13 | * 14 | */ 15 | 16 | class MatrixTutorial4(args : Args) extends Job(args) { 17 | 18 | import Matrix._ 19 | 20 | val adjacencyMatrix = Tsv( args("input"), ('user1, 'user2, 'rel) ) 21 | .read 22 | .toMatrix[Long,Long,Double]('user1, 'user2, 'rel) 23 | 24 | // we compute the L2 normalized adjacency graph 25 | val normMatrix = adjacencyMatrix.rowL2Normalize 26 | 27 | // we compute the innerproduct of the normalized matrix with itself 28 | // which is equivalent with computing cosine: AA^T / ||A|| * ||A|| 29 | (normMatrix * normMatrix.transpose).write( Tsv( args("output") ) ) 30 | 31 | } 32 | 33 | -------------------------------------------------------------------------------- /src/main/scala/tutorial/MatrixTutorial5.scala: -------------------------------------------------------------------------------- 1 | import com.twitter.scalding._ 2 | import com.twitter.scalding.mathematics.Matrix 3 | 4 | 5 | /* 6 | * MatrixTutorial5.scala 7 | * 8 | * Loads a directed graph adjacency matrix where a[i,j] = 1 if there is an edge from a[i] to b[j] 9 | * and computes the jaccard similarity between any two pairs of vectors 10 | * 11 | yarn jar target/scalding-tutorial-0.14.0.jar MatrixTutorial5 --local\ 12 | --input data/graph.tsv --output target/data/jaccardSim.tsv 13 | * 14 | */ 15 | 16 | class MatrixTutorial5(args : Args) 
extends Job(args) { 17 | 18 | import Matrix._ 19 | 20 | val adjacencyMatrix = Tsv( args("input"), ('user1, 'user2, 'rel) ) 21 | .read 22 | .toMatrix[Long,Long,Double]('user1, 'user2, 'rel) 23 | 24 | val aBinary = adjacencyMatrix.binarizeAs[Double] 25 | 26 | // intersectMat holds the size of the intersection of row(a)_i n row (b)_j 27 | val intersectMat = aBinary * aBinary.transpose 28 | val aSumVct = aBinary.sumColVectors 29 | val bSumVct = aBinary.sumRowVectors 30 | 31 | //Using zip to repeat the row and column vectors values on the right hand 32 | //for all non-zeroes on the left hand matrix 33 | val xMat = intersectMat.zip(aSumVct).mapValues( pair => pair._2 ) 34 | val yMat = intersectMat.zip(bSumVct).mapValues( pair => pair._2 ) 35 | 36 | val unionMat = xMat + yMat - intersectMat 37 | //We are guaranteed to have Double both in the intersection and in the union matrix 38 | intersectMat.zip(unionMat) 39 | .mapValues( pair => pair._1 / pair._2 ) 40 | .write(Tsv( args("output") )) 41 | 42 | } 43 | 44 | -------------------------------------------------------------------------------- /src/main/scala/tutorial/MatrixTutorial6.scala: -------------------------------------------------------------------------------- 1 | import com.twitter.scalding._ 2 | import com.twitter.scalding.mathematics.Matrix 3 | 4 | /* 5 | * MatrixTutorial6.scala 6 | * 7 | * Loads a document to word matrix where a[i,j] = freq of the word j in the document i 8 | * computes the Tf-Idf score of each word w.r.t. 
to each document and keeps the top nrWords in each document 9 | * (see http://en.wikipedia.org/wiki/Tf*idf for more info) 10 | * 11 | yarn jar target/scalding-tutorial-0.14.0.jar MatrixTutorial6 --local\ 12 | --input data/docBOW.tsv --nrWords 300 --output target/data/featSelectedMatrix.tsv 13 | * 14 | */ 15 | 16 | class MatrixTutorial6(args : Args) extends Job(args) { 17 | 18 | import Matrix._ 19 | 20 | val docWordMatrix = Tsv( args("input"), ('doc, 'word, 'count) ) 21 | .read 22 | .toMatrix[Long,String,Double]('doc, 'word, 'count) 23 | 24 | // compute the overall document frequency of each row 25 | val docFreq = docWordMatrix.binarizeAs[Double].sumRowVectors 26 | 27 | // compute the inverse document frequency vector 28 | val invDocFreqVct = docFreq.toMatrix(1).rowL1Normalize.mapValues( x => log2(1/x) ) 29 | 30 | // zip the row vector along the entire document - word matrix 31 | val invDocFreqMat = docWordMatrix.zip(invDocFreqVct.getRow(1)).mapValues( pair => pair._2 ) 32 | 33 | // multiply the term frequency with the inverse document frequency and keep the top nrWords 34 | docWordMatrix.hProd(invDocFreqMat).topRowElems( args("nrWords").toInt ).write(Tsv( args("output") )) 35 | 36 | def log2(x : Double) = scala.math.log(x)/scala.math.log(2.0) 37 | 38 | } 39 | 40 | -------------------------------------------------------------------------------- /src/main/scala/tutorial/Tutorial0.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | import com.twitter.scalding._ 17 | 18 | /** 19 | Scalding tutorial part 0. 20 | 21 | This is the simplest possible scalding job: it reads from one data source and writes the data, 22 | unchanged, to another. 23 | 24 | To test it, from the science directory, first make sure you've built the target/scalding-assembly-0.2.0.jar: 25 | from the base directory type: 26 | sbt assembly 27 | 28 | yarn jar target/scalding-tutorial-0.14.0.jar Tutorial0 --local 29 | 30 | You can check the input: 31 | cat data/hello.txt 32 | 33 | And the output: 34 | cat target/data/output0.txt 35 | 36 | The output should look just like the input, but with line numbers. 37 | More on this in part 1 of the tutorial. 38 | **/ 39 | 40 | 41 | /** 42 | All jobs in scalding are represented by a subclass of com.twitter.scalding.Job. 43 | The constructor must take a single com.twitter.scalding.Args, even if, as here, 44 | we don't use it. 45 | 46 | For the scald.rb script to work, name the class to match the file, 47 | and don't use a package. 48 | **/ 49 | class Tutorial0(args : Args) extends Job(args) { 50 | 51 | /** 52 | Both input and output data sources are represented by instances of 53 | com.twitter.scalding.Source. 54 | 55 | Scalding comes with some basic source types like TextLine and Tsv. 56 | There are also many twitter-specific types like MergedAdRequestSource. 57 | **/ 58 | val input = TextLine("data/hello.txt") 59 | val output = TextLine("target/data/output0.txt") 60 | 61 | /** 62 | This is the minimal pipeline. Source.read returns a cascading.pipe.Pipe, which represents 63 | a stream of data. We can transform this stream in many ways, but here we're simply 64 | asking it to write itself to the output source. 65 | **/ 66 | input.read.write(output) 67 | 68 | /** 69 | By the way, if you look at the docs for Pipe, you won't find write there. 
That's 70 | because it's actually defined on com.twitter.scalding.RichPipe. Most of the methods 71 | we call on Pipes will actually be found on RichPipe; in typical scala style, 72 | the conversion between them is implicit. 73 | **/ 74 | } 75 | -------------------------------------------------------------------------------- /src/main/scala/tutorial/Tutorial1.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | import com.twitter.scalding._ 17 | 18 | /** 19 | Scalding tutorial part 1. 20 | 21 | In part 0, we made a copy of hello.txt, but it wasn't a perfect copy: 22 | it was annotated with line numbers. 23 | 24 | That's because the data stream coming out of a TextLine source actually 25 | has two fields: one, called "line", has the actual line of text. The other, 26 | called "offset", has the line number in the file. When you write these 27 | tuples to a TextLine, it naively outputs them both on each line. 28 | 29 | We can ask scalding to select just the "line" field from the pipe, using the 30 | project() method. When we refer to a data stream's fields, we use Scala symbols, 31 | like this: 'line. 
32 | 33 | To run this job: 34 | yarn jar target/scalding-tutorial-0.14.0.jar Tutorial1 --local 35 | 36 | Check the output: 37 | cat target/data/output1.txt 38 | 39 | **/ 40 | 41 | class Tutorial1(args : Args) extends Job(args) { 42 | 43 | val input = TextLine("data/hello.txt") 44 | val output = TextLine("target/data/output1.txt") 45 | 46 | /** 47 | We generally write each step of the pipeline on a separate line. 48 | **/ 49 | input 50 | .read 51 | .project('line) 52 | .write(output) 53 | } 54 | -------------------------------------------------------------------------------- /src/main/scala/tutorial/Tutorial2.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | import com.twitter.scalding._ 17 | 18 | /** 19 | Scalding tutorial part 2. 20 | 21 | In parts 0 and 1, we made copies of hello.txt. Now let's try to 22 | modify the copies by reversing each line. 
23 | 24 | To run this job: 25 | yarn jar target/scalding-tutorial-0.14.0.jar Tutorial2 --local 26 | 27 | Check the output: 28 | cat target/data/output2.txt 29 | 30 | **/ 31 | 32 | class Tutorial2(args : Args) extends Job(args) { 33 | 34 | val input = TextLine("data/hello.txt") 35 | val output = TextLine("target/data/output2.txt") 36 | 37 | input 38 | .read 39 | 40 | /** 41 | As with a scala collection, you can map over a pipe, where each 42 | item gets passed into an anonymous function, and we create a new 43 | pipe with the results. 44 | 45 | In scalding, we need to also annotate the call to map with the names of the 46 | fields it operates on. In this case, we want to take the 'line field 47 | as input, and we want to output a new field named 'reversed. 48 | 49 | Unlike with a normal scala map{}, we always need to specify the 50 | types of the arguments to the anonymous function. 51 | **/ 52 | 53 | .map('line -> 'reversed){ line : String => line.reverse} 54 | 55 | /** 56 | The map transformation in scalding is additive: the 'offset and 'line 57 | fields haven't gone away, we've just added a new 'reversed field to each 58 | entry. If we only want to write the 'reversed version, we need to use 59 | project. 60 | **/ 61 | 62 | .project('reversed) 63 | .write(output) 64 | } 65 | -------------------------------------------------------------------------------- /src/main/scala/tutorial/Tutorial3.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | import com.twitter.scalding._ 17 | 18 | /** 19 | Scalding tutorial part 3. 20 | 21 | So far, we've been hardcoding the input file. Let's make that an argument, 22 | which changes how we run the job: 23 | 24 | yarn jar target/scalding-tutorial-0.14.0.jar \ 25 | Tutorial3 --local\ 26 | --input data/hello.txt 27 | 28 | We're also going to use a new transformation: flatMap. 29 | 30 | Check the output: 31 | cat target/data/output3.txt 32 | 33 | You can also of course try this with other input parameters. For example: 34 | 35 | yarn jar target/scalding-tutorial-0.14.0.jar \ 36 | Tutorial3 --local\ 37 | --input target/data/output2.txt 38 | 39 | **/ 40 | 41 | class Tutorial3(args : Args) extends Job(args) { 42 | 43 | /** 44 | We can ask args for the --input argument from the command line. 45 | If it's missing, we'll get an error. 46 | **/ 47 | val input = TextLine(args("input")) 48 | val output = TextLine("target/data/output3.txt") 49 | 50 | input 51 | .read 52 | 53 | /** 54 | flatMap is like map, but instead of returning a single item from the 55 | function, we return a collection of items. Each of these items will create 56 | a new entry in the data stream; here, we'll end up with a new entry for each word. 57 | **/ 58 | 59 | .flatMap('line -> 'word){ line : String => line.split("\\s")} 60 | 61 | /** 62 | We still want to project just the 'word field for our final output. 63 | For interest, though, let's stash a copy of the data before we do that. 64 | write() returns the pipe, so we can keep chaining our pipeline. 
65 | **/ 66 | 67 | .write(Tsv("target/data/tmp3.tsv")) 68 | .project('word) 69 | .write(output) 70 | } 71 | -------------------------------------------------------------------------------- /src/main/scala/tutorial/Tutorial4.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | import com.twitter.scalding._ 17 | 18 | /** 19 | Scalding tutorial part 4. 20 | 21 | You might have noticed that in part 3, we ended up with a list of words. 22 | Clearly we're ready for that most exciting of MapReduce examples, the word count. 23 | 24 | Also, let's go ahead and make this fully general by parameterizing the output location. 25 | 26 | Run: 27 | yarn jar target/scalding-tutorial-0.14.0.jar \ 28 | Tutorial4 --local\ 29 | --input data/hello.txt \ 30 | --output target/data/output4.txt 31 | 32 | Check the output: 33 | cat target/data/output4.txt 34 | 35 | **/ 36 | 37 | class Tutorial4(args : Args) extends Job(args) { 38 | 39 | //we probably don't need to bother with vals for input/output anymore 40 | TextLine(args("input")) 41 | .read 42 | .flatMap('line -> 'word){ line : String => line.split("\\s")} 43 | 44 | /** 45 | To count the words, first we need to group by word. 46 | groupBy takes any number of fields as the group key. In this 47 | case we just want 'word. 
48 | 49 | groupBy also takes an anonymous function, to which it will pass a 50 | com.twitter.scalding.GroupBuilder. 51 | 52 | Each method call to GroupBuilder will specify an aggregation we want to 53 | perform on the group. In general, the resulting data stream will have all 54 | of the group fields (with one entry for each set of unique values), plus 55 | one new field for each aggregation. 56 | 57 | In this case, the only aggregation we care about is size: how many values are 58 | in the group. 59 | **/ 60 | 61 | .groupBy('word){group => group.size} 62 | 63 | /** 64 | No project is needed here because the groupBy has eliminated everything but 'word 65 | and the size field. 66 | **/ 67 | 68 | .write(Tsv(args("output"))) 69 | } 70 | -------------------------------------------------------------------------------- /src/main/scala/tutorial/Tutorial5.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | import com.twitter.scalding._ 17 | 18 | /** 19 | Scalding tutorial part 5. 20 | 21 | This example is a little bit contrived so that we can play with joins. 22 | 23 | Let's define a metric for a line of text which is the sum of the rank 24 | of each of its words in the words input file - the word "hello" comes 25 | first (rank 0) whereas the second word is "world", with a rank of 1. 
26 | 27 | So, the line "hello world" would have a total score of 0+1 = 1 28 | 29 | We'll read in an input file, split it into words, join those words 30 | with the words input file to get their individual ranks, then 31 | group by line to get a total score and output each line/score pair. 32 | 33 | Run: 34 | yarn jar target/scalding-tutorial-0.14.0.jar \ 35 | Tutorial5 --local\ 36 | --input data/hello.txt \ 37 | --output target/data/output5.txt \ 38 | --words data/words.txt 39 | 40 | Check the output: 41 | cat target/data/output5.txt 42 | 43 | Note that the line order may no longer be the same as the input file. 44 | That's parallelism, man. 45 | 46 | **/ 47 | 48 | class Tutorial5(args : Args) extends Job(args) { 49 | 50 | /** 51 | We'll start with the dict data source. 52 | 53 | When we join, we'll need unique field names, so we'll rename 54 | the 'offset' field to be score. Also, we want to normalize 55 | the words to be lowercase. 56 | **/ 57 | 58 | val scores = TextLine(args("words")) 59 | .read 60 | .rename('offset, 'score) 61 | .map('line -> 'dictWord){line : String => line.toLowerCase} 62 | .project('score, 'dictWord) 63 | 64 | TextLine(args("input")) 65 | .read 66 | 67 | //split and normalize to lowercase 68 | .flatMap('line -> 'word){ line : String => line.split("\\s").map{_.toLowerCase}} 69 | 70 | /** 71 | When we join, we need to specify which fields from each side of the join should match. 72 | This is like a SQL inner join: we end up with a new row that combines each possible 73 | matching pair, with all of the fields of both the left and right side. 74 | **/ 75 | 76 | .joinWithLarger('word -> 'dictWord, scores) 77 | 78 | /** 79 | Now that we have a score for each word, we can group back to the original lines 80 | and sum up the word scores. Sum is another common aggregation that GroupBuilder 81 | provides; we just need to specify which field to sum by. 
82 | **/ 83 | 84 | .groupBy('line){group => group.sum[Double]('score)} 85 | .write(Tsv(args("output"))) 86 | } 87 | -------------------------------------------------------------------------------- /src/main/scala/tutorial/Tutorial6.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | import com.twitter.scalding._ 17 | 18 | /** 19 | Scalding tutorial part 6. 20 | 21 | This is similar to Tutorial1 except that we show the use of Scala Enumerations to specify fields. 22 | 23 | To run this job: 24 | yarn jar target/scalding-tutorial-0.14.0.jar Tutorial6 --local 25 | 26 | Check the output: 27 | cat target/data/output6.tsv 28 | 29 | **/ 30 | 31 | class Tutorial6(args : Args) extends Job(args) { 32 | /** When a data set has a large number of fields, and we want to specify those fields conveniently 33 | in code, we can use, for example, a Tuple of Symbols (as most of the other tutorials show), or a List of Symbols. 34 | Note that Tuples can only be used if the number of fields is at most 22, since Scala Tuples cannot have more 35 | than 22 elements. 
Another alternative is to use Enumerations, which we show here **/ 36 | 37 | object Schema extends Enumeration { 38 | val first, last, phone, age, country = Value // arbitrary number of fields 39 | } 40 | 41 | import Schema._ 42 | 43 | Csv("data/phones.txt", separator = " ", fields = Schema) 44 | .read 45 | .project(first,age) 46 | .write(Tsv("target/data/output6.tsv")) 47 | } 48 | 49 | -------------------------------------------------------------------------------- /src/main/scala/tutorial/TypedTutorial.scala: -------------------------------------------------------------------------------- 1 | import cascading.pipe.Pipe 2 | import com.twitter.scalding._ 3 | 4 | /** 5 | Scalding Tutorial ported to use the Type-safe API (TDsl) 6 | (rather than Cascading's Fields API). The examples here roughly correspond 7 | to those in `tutorial/Tutorial{0..5}.scala`. 8 | 9 | These tutorials are all run from this single file; which one is run can 10 | be chosen with a command-line flag "--tutorial". For instance, to run the 11 | first tutorial example: 12 | 13 | yarn jar target/scalding-tutorial-0.14.0.jar TypedTutorial --local \ 14 | --tutorial 0 \ 15 | --input data/hello.txt \ 16 | --output target/data/output0.txt \ 17 | --words data/word_scores.tsv 18 | 19 | (Note: only tutorial 5 uses "word_scores.tsv") 20 | **/ 21 | class TypedTutorial(args : Args) extends Job(args) { 22 | 23 | args("tutorial") match { 24 | 25 | /** 26 | Tutorial {0,1}: Write out to a TSV file. 27 | ---------------------------------------- 28 | In this first version we will be as explicit as possible to show all 29 | the steps required to go from a raw text file to a typed stream. 30 | **/ 31 | case "0" | "1" => { 32 | 33 | // The TextLine source splits the input by lines. 34 | val textSource = TextLine(args("input")) 35 | 36 | // Create a type-safe pipe from the TextLine. 37 | val lines: TypedPipe[String] = 38 | TypedPipe.from[String](textSource) 39 | 40 | // Write the typed pipe out to a tab-delimited file. 
41 | lines.write(TypedTsv[String](args("output"))) 42 | } 43 | 44 | /** 45 | Tutorial 2: Simple map 46 | ---------------------- 47 | Reverse all the strings. Notice that we've now left off the [String] type. 48 | Scala can generally infer these types for us, making the code cleaner. 49 | **/ 50 | case "2" | "map" => { 51 | // Create a typed pipe from the TextLine (of type TypedPipe[String] still) 52 | TypedPipe.from(TextLine(args("input"))) 53 | // Transform each line, reversing it. Output is a new TypedPipe, still of String. 54 | .map(_.reverse) 55 | // Note, the types for the TypedTsv *can* be inferred by Scala here. 56 | // However, it's best to specify them explicitly so that if the 57 | // output type changes, it is detected and doesn't break the next 58 | // thing to read from the output file. 59 | .write(TypedTsv[String](args("output"))) 60 | } 61 | 62 | /** 63 | Tutorial 3: Flat Map 64 | --------------------- 65 | Dump all the words. 66 | **/ 67 | case "3" | "flatmap" => { 68 | TypedPipe.from(TextLine(args("input"))) 69 | // flatMap is like map, but instead of returning a single item 70 | // from the function, we return a collection of items. Each of 71 | // these items will create a new entry in the data stream; here, 72 | // we'll end up with a new entry for each word. 73 | .flatMap(_.split("\\s")) 74 | // output of flatMap is still a collection of String 75 | .write(TypedTsv[String](args("output"))) 76 | } 77 | 78 | /** 79 | Tutorial 4: Word Count 80 | ---------------------- 81 | Now that we have a stream of words, clearly we're ready for 82 | that most exciting of MapReduce examples: the Word Count. 83 | **/ 84 | case "4" | "wordcount" => { 85 | // Get the words (just like above in case "3") 86 | val words = TypedPipe.from(TextLine(args("input"))) 87 | .flatMap(_.split("\\s")) 88 | 89 | // To count the words, we use TypedPipe's `groupBy` method. 
90 | // However, this no longer returns a `TypedPipe[T]`, but rather 91 | // a `Grouped[K,T]` based on the type of the key used to group by. 92 | // 93 | // groupBy accepts a function to determine the key for grouping. 94 | // In the case of word count, let's imagine we want to make sure 95 | // capitalization doesn't matter, so to come up with the key, 96 | // we normalize it to lower case. 97 | val groups : Grouped[String,String] = words.groupBy(_.toLowerCase) 98 | 99 | // Next we specify what to do with each aggregation. In the case 100 | // of word count, we simply want the size of each group. This 101 | // operation results in a new `Grouped` that has the key (String, 102 | // the lower case words), and the counts (Long). 103 | // 104 | // Note: To do more interesting aggregations, Scalding supports 105 | // a variety of operations, such as `sum`, `reduce`, `foldLeft`, 106 | // `mapGroup`, etc, that can all be applied efficiently on Monoids 107 | // (primitives like Long, container types like `Map`, or custom 108 | // monoids you define yourself). See the wiki for more details: 109 | // https://github.com/twitter/scalding/wiki/Type-safe-api-reference 110 | val counts = groups.size 111 | 112 | // And finally, we dump these results to a TypedTsv with the 113 | // correct Tuple type. 114 | counts.write(TypedTsv[(String,Long)](args("output"))) 115 | } 116 | 117 | /** 118 | Tutorial 5: Demonstrate joins 119 | ----------------------------- 120 | Associate a score with each word and compute a score for each line. 121 | 122 | Note: this example is a bit contrived, but serves to demonstrate 123 | how to combine multiple input sources. 124 | **/ 125 | case "5" | "join" => { 126 | // Load the scores for each word from TSV file and group by word. 127 | val scores: Grouped[String,Double] = 128 | // For TypedTsv, Scalding coerces the fields to the specified types, 129 | // throwing an exception if any line fails. 
130 | TypedPipe.from(TypedTsv[(String,Double)](args("words"))) 131 | // group by word so we can join it 132 | .group 133 | 134 | // get the lines, this time from an 'OffsetTextLine' which is a 135 | // typed wrapper on 'TextLine' that contains the 'byte offset' and 136 | // text of each line in the file. 137 | val lines: TypedPipe[(Long,String)] = TypedPipe.from(OffsetTextLine(args("input"))) 138 | 139 | // Split lines into words, but keep their original line offset with them. 140 | val wordsWithLine : Grouped[String,Long] = 141 | lines 142 | .flatMap{ case (offset, line) => 143 | // split into words 144 | line.split("\\s") 145 | // keep the line offset with them 146 | .map(word => (word.toLowerCase, offset)) 147 | } 148 | // make the 'word' field the key 149 | .group 150 | 151 | // Associate scores with each word; merges the two value types into 152 | // a tuple: [String,Long] join [String,Double] -> [String,(Long,Double)] 153 | val scoredWords = wordsWithLine.join(scores) 154 | 155 | // get scores for each line (indexed by line number) 156 | val scoredLinesByNumber = 157 | scoredWords 158 | // select the line offset and score fields 159 | .map{ case (word,(offset,score)) => (offset,score) } 160 | // group by line offset (groups all the words for a line together) 161 | .group 162 | // compute total score per line 163 | .sum 164 | // Group and sum are often run together in this way. 165 | // The `sumByKey` operation performs both. 
166 | 167 | // Associate the original line text with the computed score, 168 | // discard the 'offset' field 169 | val scoredLines: TypedPipe[(String,Double)] = 170 | lines 171 | // index lines by 'offset' 172 | .group 173 | // associate scores with lines (by offset) 174 | .join(scoredLinesByNumber) 175 | // take just the value fields (discard the 'line offset') 176 | .values 177 | 178 | // write out the final result 179 | scoredLines.write(TypedTsv[(String,Double)](args("output"))) 180 | 181 | } 182 | 183 | /** 184 | Interoperability with Fields API 185 | -------------------------------- 186 | Scalding also provides a thinner, un-type-safe wrapper over Cascading 187 | which is known as the Fields API because each record has a number of 188 | named "fields". 189 | 190 | Most jobs can be done completely in the Typed API, but for compatibility, 191 | there are ways to go back and forth between the two schemes, which the 192 | next couple cases demonstrate. 193 | **/ 194 | 195 | /** 196 | Pipe vs. TypedPipe 197 | ------------------ 198 | TypedPipes can be easily converted to Pipes and vice-versa. 199 | **/ 200 | case "pipes" => { 201 | // calling 'read' on a source returns an un-typed Pipe 202 | // TextLine, by default, contains two fields: 'offset, and 'line. 
203 | val rawPipe: Pipe = TextLine(args("input")).read 204 | 205 | // To convert to a typed pipe, we must specify the fields we want 206 | // and their types: 207 | val lines: TypedPipe[(Long,String)] = 208 | TypedPipe.from[(Long,String)](rawPipe, ('offset,'line)) 209 | 210 | // We can operate on this typed pipe as above, and come up with a 211 | // different set of fields 212 | val lineSizes: TypedPipe[Long] = lines.map{ case (offset,line) => line.length } 213 | 214 | // To convert back to a Fields Pipe, we must specify the names of the fields: 215 | val lineSizesField: Pipe = lineSizes.toPipe('size) 216 | 217 | // finally, we can write out this untyped pipe with an untyped sink: 218 | lineSizesField.write(Tsv(args("output"))) 219 | } 220 | 221 | /** 222 | Bonus: Typed blocks 223 | ------------------- 224 | An alternative to working completely in typed mode is to use 225 | `typed` blocks, which create a TypedPipe within the scope, and then 226 | map the output back into an untyped Pipe. You specify the fields to 227 | map in and out using the `->` pair passed to `typed()`. 228 | **/ 229 | case "block" => { 230 | // Get the .typed enrichment 231 | import TDsl._ 232 | 233 | TextLine(args("input")).read 234 | .typed('line -> 'size) { tp: TypedPipe[String] => 235 | // now operate on the typed pipe 236 | tp.map(_.length) 237 | } 238 | // the final output will have just the 'size field 239 | // and can be dumped using the un-typed Tsv source. 240 | .write(Tsv(args("output"))) 241 | } 242 | } 243 | } 244 | --------------------------------------------------------------------------------