├── .gitignore ├── LICENSE-2.0.txt ├── README.md ├── build.sbt ├── data ├── docBOW.tsv ├── graph.tsv ├── graph2.tsv ├── hello.txt ├── helloDoc.txt ├── phones.txt ├── session.json ├── word_scores.tsv └── words.txt ├── project ├── BuildSettings.scala ├── Dependencies.scala ├── ScaldingTutorialBuild.scala ├── build.properties └── plugins.sbt └── src └── main └── scala └── tutorial ├── AvroTutorial0.scala ├── CodeSnippets.md ├── JobRunner.scala ├── JsonTutorial0.scala ├── MatrixTutorial0.scala ├── MatrixTutorial1.scala ├── MatrixTutorial2.scala ├── MatrixTutorial3.scala ├── MatrixTutorial4.scala ├── MatrixTutorial5.scala ├── MatrixTutorial6.scala ├── Tutorial0.scala ├── Tutorial1.scala ├── Tutorial2.scala ├── Tutorial3.scala ├── Tutorial4.scala ├── Tutorial5.scala ├── Tutorial6.scala └── TypedTutorial.scala /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | 4 | # sbt specific 5 | dist/* 6 | target/ 7 | lib_managed/ 8 | src_managed/ 9 | project/boot/ 10 | project/plugins/project/ 11 | -------------------------------------------------------------------------------- /LICENSE-2.0.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. 
For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Scalding Tutorial Project 2 | 3 | ## Introduction 4 | 5 | This is Twitter's [tutorial] [tutorial] for [Scalding] [scalding] adapted to run 6 | on Hadoop as a standalone job - i.e. without requiring `scald.rb` etc. 7 | 8 | This was built as a Scala SBT project by the [Concurrent Inc] [concurrent] team, 9 | in order to integrate the scalding tutorial into the [Cascading SDK][sdk]. It 10 | is based on the excellent work done by [Snowplow Analytics][snowplow] for 11 | porting the [`Wordcount example`][wordcount] to SBT. 12 | 13 | The versioning of the project follows the versions of the scalding release on 14 | which it is based. 15 | 16 | Please note that this tutorial uses scala 2.10 and not 2.9. 17 | 18 | ## Prerequisites 19 | 20 | In order to use this tutorial, you need to have `SBT` and the `hadoop` command 21 | installed. Cascading and therefore scalding is compatible with a number of 22 | hadoop distributions. 
If you are unsure whether your distribution is compatible, 23 | please check the [compatibility][compatibility] page. 24 | 25 | You do not need to have a full hadoop cluster in order to run this tutorial. 26 | The local mode of hadoop is sufficient. 27 | 28 | 29 | ## Building 30 | 31 | Assuming you already have SBT installed: 32 | 33 | $ git clone git://github.com/Cascading/scalding-tutorial.git 34 | $ cd scalding-tutorial 35 | $ sbt assembly 36 | 37 | The 'fat jar' is now available as: 38 | 39 | target/scalding-tutorial-0.14.0.jar 40 | 41 | ## Project structure 42 | 43 | Some modifications have been done to the code, in order to properly work in an SBT 44 | based build. 45 | 46 | * all code is now in `src/main/scala/tutorial` 47 | * the data files for the different parts live now in `data` 48 | * the classes in the matrix tutorial have been renamed to match the file names, 49 | so that the commandline invocation is similar to the original tutorial 50 | * the documentation of the examples has been adapted to match the new structure 51 | 52 | ## Running the examples 53 | 54 | Each part of the tutorial explains how to run it properly. However, the general 55 | way is always 56 | 57 | $ yarn jar target/scalding-tutorial-0.14.0.jar --local 58 | 59 | ## Copyright and license 60 | 61 | Copyright 2012-2014 Concurrent Inc, with significant portions copyright 2012 Twitter, Inc. and Snowplow Analytics Inc. 62 | 63 | Licensed under the [Apache License, Version 2.0] [license] (the "License"); 64 | you may not use this software except in compliance with the License. 65 | 66 | Unless required by applicable law or agreed to in writing, software 67 | distributed under the License is distributed on an "AS IS" BASIS, 68 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 69 | See the License for the specific language governing permissions and 70 | limitations under the License. 
71 | 72 | [tutorial]: https://github.com/twitter/scalding/tree/develop/tutorial 73 | [sdk]: http://cascading.org/sdk 74 | [scalding]: https://github.com/twitter/scalding/ 75 | [concurrent]: http://concurrentinc.com 76 | [snowplow]: http://snowplowanalytics.com 77 | [wordcount]: http://github.com/snowplow/scalding-example-project 78 | [license]: http://www.apache.org/licenses/LICENSE-2.0 79 | [compatibility]: http://www.cascading.org/support/compatibility/ 80 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | net.virtualvoid.sbt.graph.Plugin.graphSettings 2 | -------------------------------------------------------------------------------- /data/docBOW.tsv: -------------------------------------------------------------------------------- 1 | 1 hello 2 2 | 1 twitter 1 3 | 2 conversation 1 4 | 2 celebrities 1 5 | 2 twitter 1 6 | 3 elections 1 7 | 3 debate 1 8 | 3 twitter 1 9 | 3 political 1 10 | -------------------------------------------------------------------------------- /data/graph.tsv: -------------------------------------------------------------------------------- 1 | 1 2 1 2 | 1 3 1 3 | 3 2 1 4 | 4 2 2 5 | -------------------------------------------------------------------------------- /data/graph2.tsv: -------------------------------------------------------------------------------- 1 | 1 2 1 2 | 1 3 1 3 | 2 3 1 4 | -------------------------------------------------------------------------------- /data/hello.txt: -------------------------------------------------------------------------------- 1 | Hello world 2 | Goodbye world 3 | -------------------------------------------------------------------------------- /data/helloDoc.txt: -------------------------------------------------------------------------------- 1 | 1 Hello world 2 | 2 See ya soon world 3 | 3 Hello again world 4 | 
-------------------------------------------------------------------------------- /data/phones.txt: -------------------------------------------------------------------------------- 1 | john smith 5551212 30 US 2 | harry bovik 4122680000 55 US 3 | jane doe 4125551212 40 CN 4 | -------------------------------------------------------------------------------- /data/session.json: -------------------------------------------------------------------------------- 1 | {"sessionId": "efdc0698-66ea-4d3a-ac6e-fab9fd97a78b"} 2 | {"sessionId": "efdc0698-66ea-4d3a-ac6e-fab9fd97a78b"} 3 | {"sessionId": "efdc0698-66ea-4d3a-ac6e-fab9fd97a78b"} 4 | {"sessionId": "efdc0698-66ea-4d3a-ac6e-fab9fd97a78b"} 5 | {"sessionId": "efdc0698-66ea-4d3a-ac6e-fab9fd97a78b"} 6 | {"sessionId": "efdc0698-66ea-4d3a-ac6e-fab9fd97a78b"} 7 | -------------------------------------------------------------------------------- /data/word_scores.tsv: -------------------------------------------------------------------------------- 1 | hello 1.0 2 | world 2.0 3 | goodbye 3.0 -------------------------------------------------------------------------------- /data/words.txt: -------------------------------------------------------------------------------- 1 | hello 2 | world 3 | goodbye 4 | -------------------------------------------------------------------------------- /project/BuildSettings.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012 SnowPlow Analytics Ltd. All rights reserved. 3 | * 4 | * This program is licensed to you under the Apache License Version 2.0, 5 | * and you may not use this file except in compliance with the Apache License Version 2.0. 6 | * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. 
7 | * 8 | * Unless required by applicable law or agreed to in writing, 9 | * software distributed under the Apache License Version 2.0 is distributed on an 10 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 12 | */ 13 | import sbt._ 14 | import Keys._ 15 | 16 | object BuildSettings { 17 | 18 | // Basic settings for our app 19 | lazy val basicSettings = Seq[Setting[_]]( 20 | organization := "Concurrent Inc.", 21 | version := "0.14.0", // -> follow the release numbers of scalding 22 | description := "The scalding tutorial as an SBT project", 23 | scalaVersion := "2.10.0", 24 | scalacOptions := Seq("-deprecation", "-encoding", "utf8"), 25 | resolvers ++= Dependencies.resolutionRepos 26 | ) 27 | 28 | // sbt-assembly settings for building a fat jar 29 | import sbtassembly.Plugin._ 30 | import AssemblyKeys._ 31 | lazy val sbtAssemblySettings = assemblySettings ++ Seq( 32 | 33 | // Slightly cleaner jar name 34 | jarName in assembly <<= (name, version) { (name, version) => name + "-" + version + ".jar" }, 35 | 36 | // Drop these jars 37 | excludedJars in assembly <<= (fullClasspath in assembly) map { cp => 38 | val excludes = Set( 39 | "jsp-api-2.1-6.1.14.jar", 40 | "jsp-2.1-6.1.14.jar", 41 | "jasper-compiler-5.5.12.jar", 42 | "minlog-1.2.jar", // Otherwise causes conflicts with Kyro (which bundles it) 43 | "janino-2.5.16.jar", // Janino includes a broken signature, and is not needed anyway 44 | "commons-beanutils-core-1.8.0.jar", // Clash with each other and with commons-collections 45 | "commons-beanutils-1.7.0.jar", // " 46 | "hadoop-core-1.1.2.jar", 47 | "hadoop-tools-1.1.2.jar" // " 48 | ) 49 | cp filter { jar => excludes(jar.data.getName) } 50 | }, 51 | 52 | mergeStrategy in assembly <<= (mergeStrategy in assembly) { 53 | (old) => { 54 | case "project.clj" => MergeStrategy.discard // Leiningen build files 
55 | case x => old(x) 56 | } 57 | } 58 | ) 59 | 60 | lazy val buildSettings = basicSettings ++ sbtAssemblySettings 61 | } 62 | -------------------------------------------------------------------------------- /project/Dependencies.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012 SnowPlow Analytics Ltd. All rights reserved. 3 | * 4 | * This program is licensed to you under the Apache License Version 2.0, 5 | * and you may not use this file except in compliance with the Apache License Version 2.0. 6 | * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. 7 | * 8 | * Unless required by applicable law or agreed to in writing, 9 | * software distributed under the Apache License Version 2.0 is distributed on an 10 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 12 | */ 13 | import sbt._ 14 | 15 | object Dependencies { 16 | val resolutionRepos = Seq( 17 | ScalaToolsSnapshots, 18 | "Concurrent Maven Repo" at "http://conjars.org/repo" // For Scalding, Cascading etc 19 | ) 20 | 21 | object V { 22 | val scalding = "0.14.0" 23 | val hadoop = "2.6.0" 24 | val specs2 = "1.13" // -> "1.13" when we bump to Scala 2.10.0 25 | // Add versions for your additional libraries here... 
26 | val cascading = "2.7.0" 27 | } 28 | 29 | object Libraries { 30 | val cascadingCore = "cascading" % "cascading-core" % V.cascading 31 | val cascadingLocal = "cascading" % "cascading-local" % V.cascading 32 | val cascadingHadoop = "cascading" % "cascading-hadoop2-mr1" % V.cascading 33 | val scaldingCore = "com.twitter" %% "scalding-core" % V.scalding exclude( "cascading", "cascading-local" ) exclude( "cascading", "cascading-hadoop" ) 34 | val scaldingJson = "com.twitter" %% "scalding-json" % V.scalding exclude( "cascading", "cascading-local" ) exclude( "cascading", "cascading-hadoop" ) 35 | val scaldingAvro = "com.twitter" %% "scalding-avro" % V.scalding exclude( "cascading", "cascading-local" ) exclude( "cascading", "cascading-hadoop" ) 36 | val hadoopCore = "org.apache.hadoop" % "hadoop-common" % V.hadoop % "provided" 37 | val hadoopClientCore = "org.apache.hadoop" % "hadoop-mapreduce-client-core" % V.hadoop % "provided" 38 | // Add additional libraries from mvnrepository.com (SBT syntax) here... 39 | 40 | // Scala (test only) 41 | val specs2 = "org.specs2" %% "specs2" % V.specs2 % "test" 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /project/ScaldingTutorialBuild.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012 SnowPlow Analytics Ltd. All rights reserved. 3 | * 4 | * This program is licensed to you under the Apache License Version 2.0, 5 | * and you may not use this file except in compliance with the Apache License Version 2.0. 6 | * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. 7 | * 8 | * Unless required by applicable law or agreed to in writing, 9 | * software distributed under the Apache License Version 2.0 is distributed on an 10 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
11 | * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 12 | */ 13 | import sbt._ 14 | import Keys._ 15 | 16 | object ScaldingTutorialProjectBuild extends Build { 17 | 18 | import Dependencies._ 19 | import BuildSettings._ 20 | 21 | // Configure prompt to show current project 22 | override lazy val settings = super.settings :+ { 23 | shellPrompt := { s => Project.extract(s).currentProject.id + " > " } 24 | } 25 | 26 | // Define our project, with basic project information and library dependencies 27 | lazy val project = Project("scalding-tutorial", file(".")) 28 | .settings(buildSettings: _*) 29 | .settings( 30 | libraryDependencies ++= Seq( 31 | Libraries.cascadingCore, 32 | Libraries.cascadingLocal, 33 | Libraries.cascadingHadoop, 34 | Libraries.scaldingCore, 35 | Libraries.scaldingJson, 36 | Libraries.scaldingAvro, 37 | Libraries.hadoopCore, 38 | Libraries.hadoopClientCore, 39 | Libraries.specs2 40 | // Add your additional libraries here (comma-separated)... 
41 | ) 42 | ) 43 | } 44 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=0.12.3 2 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Resolver.url("plugins-artifactory", url("http://scalasbt.artifactoryonline.com/scalasbt/sbt-plugin-releases"))(Resolver.ivyStylePatterns) 2 | 3 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.8.5") 4 | 5 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.7.4") 6 | -------------------------------------------------------------------------------- /src/main/scala/tutorial/AvroTutorial0.scala: -------------------------------------------------------------------------------- 1 | /** 2 | Scalding with Avro (and Json) tutorial part 0. 3 | 4 | To run this job: 5 | yarn jar target/scalding-tutorial-0.14.0.jar AvroTutorial0 --local --avro --json 6 | 7 | Check the output: 8 | java -jar avro-tools-1.7.6.jar tojson tutorial/data/avrooutput0.avro 9 | 10 | **/ 11 | 12 | import com.twitter.scalding.{Job, Args, JsonLine} 13 | import com.twitter.scalding.avro.UnpackedAvroSource 14 | import org.apache.avro.Schema 15 | 16 | class AvroTutorial0(args: Args) extends Job(args) { 17 | val schema = """{ 18 | "type": "record", "name": "parseJson", "fields": [ 19 | { "name": "sessionId", "type": "string" }, 20 | { "name": "optionalField", "type": ["string", "null"] } 21 | ] }""" 22 | 23 | JsonLine("data/session.json", ('sessionId, 'optionalField)).read 24 | .write(UnpackedAvroSource("target/data/avrooutput0.avro", new Schema.Parser().parse(schema))) 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala/tutorial/CodeSnippets.md: 
-------------------------------------------------------------------------------- 1 | Please see the [API reference](https://github.com/twitter/scalding/wiki/API-Reference) on the wiki. 2 | -------------------------------------------------------------------------------- /src/main/scala/tutorial/JobRunner.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012 SnowPlow Analytics Ltd. All rights reserved. 3 | * 4 | * This program is licensed to you under the Apache License Version 2.0, 5 | * and you may not use this file except in compliance with the Apache License Version 2.0. 6 | * You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. 7 | * 8 | * Unless required by applicable law or agreed to in writing, 9 | * software distributed under the Apache License Version 2.0 is distributed on an 10 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. 12 | */ 13 | 14 | // Hadoop 15 | import org.apache.hadoop 16 | 17 | // Scalding 18 | import com.twitter.scalding.Tool 19 | 20 | /** 21 | * Entrypoint for Hadoop to kick off the job. 22 | * 23 | * Borrowed from com.twitter.scalding.Tool 24 | */ 25 | object JobRunner { 26 | def main(args : Array[String]) { 27 | hadoop.util.ToolRunner.run(new hadoop.conf.Configuration, new Tool, args); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/main/scala/tutorial/JsonTutorial0.scala: -------------------------------------------------------------------------------- 1 | /** 2 | Scalding with Json tutorial part 0. 
3 | 4 | To run this job: 5 | yarn jar target/scalding-tutorial-0.14.0.jar JsonTutorial0 --local --json 6 | 7 | Check the output: 8 | cat target/data/jsonoutput0.tsv 9 | 10 | **/ 11 | 12 | import com.twitter.scalding.{Job, Args, JsonLine, Tsv} 13 | 14 | class JsonTutorial0(args: Args) extends Job(args) { 15 | JsonLine("data/session.json", ('sessionId)).read 16 | .groupBy('sessionId){_.size} 17 | .write(Tsv("target/data/jsonoutput0.tsv")) 18 | } 19 | -------------------------------------------------------------------------------- /src/main/scala/tutorial/MatrixTutorial0.scala: -------------------------------------------------------------------------------- 1 | import com.twitter.scalding._ 2 | import com.twitter.scalding.mathematics.Matrix 3 | 4 | /* 5 | * MatrixTutorial0.scala 6 | * 7 | * Loads a directed graph adjacency matrix where a[i,j] = 1 if there is an edge from a[i] to b[j] 8 | * and compute the outdegree of each node i 9 | * 10 | yarn jar target/scalding-tutorial-0.14.0.jar MatrixTutorial0 --local\ 11 | --input data/graph.tsv \ 12 | --output target/data/outdegree.tsv 13 | * 14 | */ 15 | 16 | 17 | class MatrixTutorial0(args : Args) extends Job(args) { 18 | 19 | import Matrix._ 20 | 21 | val adjacencyMatrix = Tsv( args("input"), ('user1, 'user2, 'rel) ) 22 | .read 23 | .toMatrix[Long,Long,Double]('user1, 'user2, 'rel) 24 | 25 | // each row i represents all of the outgoing edges from i 26 | // by summing out all of the columns we get the outdegree of i 27 | adjacencyMatrix.sumColVectors.write( Tsv( args("output") ) ) 28 | } 29 | 30 | -------------------------------------------------------------------------------- /src/main/scala/tutorial/MatrixTutorial1.scala: -------------------------------------------------------------------------------- 1 | import com.twitter.scalding._ 2 | import com.twitter.scalding.mathematics.Matrix 3 | 4 | 5 | /* 6 | * MatrixTutorial1.scala 7 | * 8 | * Loads a directed graph adjacency matrix where a[i,j] = 1 if there is an edge from 
a[i] to b[j] 9 | * and compute the co-follows between any two nodes 10 | * 11 | yarn jar target/scalding-tutorial-0.14.0.jar MatrixTutorial1 --local\ 12 | --input data/graph.tsv --output target/data/cofollows.tsv 13 | * 14 | */ 15 | 16 | 17 | class MatrixTutorial1(args : Args) extends Job(args) { 18 | 19 | import Matrix._ 20 | 21 | val adjacencyMatrix = Tsv( args("input"), ('user1, 'user2, 'rel) ) 22 | .read 23 | .toMatrix[Long,Long,Double]('user1, 'user2, 'rel) 24 | 25 | // compute the innerproduct of the adjacency matrix with itself 26 | (adjacencyMatrix * adjacencyMatrix.transpose).write( Tsv( args("output") ) ) 27 | } 28 | 29 | -------------------------------------------------------------------------------- /src/main/scala/tutorial/MatrixTutorial2.scala: -------------------------------------------------------------------------------- 1 | import com.twitter.scalding._ 2 | import com.twitter.scalding.mathematics.Matrix 3 | 4 | 5 | /* 6 | * MatrixTutorial2.scala 7 | * 8 | * Loads a directed graph adjacency matrix where a[i,j] = 1 if there is an edge from a[i] to b[j] 9 | * and returns a graph containing only the nodes with outdegree smaller than a given value 10 | * 11 | yarn jar target/scalding-tutorial-0.14.0.jar MatrixTutorial2 --local\ 12 | --input data/graph.tsv --maxOutdegree 1000 --output target/data/graphFiltered.tsv 13 | * 14 | */ 15 | 16 | 17 | class MatrixTutorial2(args : Args) extends Job(args) { 18 | 19 | import Matrix._ 20 | 21 | val adjacencyMatrix = Tsv( args("input"), ('user1, 'user2, 'rel) ) 22 | .read 23 | .toMatrix[Long,Long,Double]('user1, 'user2, 'rel) 24 | 25 | // Each row corresponds to the outgoing edges so to compute the outdegree we sum out the columns 26 | val outdegree = adjacencyMatrix.sumColVectors 27 | 28 | // We convert the column vector to a matrix object to be able to use the matrix method filterValues 29 | // we make all non zero values into ones and then convert it back to column vector 30 | val outdegreeFiltered = 
outdegree.toMatrix[Int](1) 31 | .filterValues{ _ < args("maxOutdegree").toDouble } 32 | .binarizeAs[Double].getCol(1) 33 | 34 | // We multiply on the left hand side with the diagonal matrix created from the column vector 35 | // to keep only the rows with outdegree smaller than maxOutdegree 36 | (outdegreeFiltered.diag * adjacencyMatrix).write(Tsv( args("output") ) ) 37 | 38 | } 39 | 40 | -------------------------------------------------------------------------------- /src/main/scala/tutorial/MatrixTutorial3.scala: -------------------------------------------------------------------------------- 1 | import com.twitter.scalding._ 2 | import com.twitter.scalding.mathematics.Matrix 3 | 4 | 5 | /* 6 | * MatrixTutorial3.scala 7 | * 8 | * Loads two directed graph adjacency matrices where a[i,j] = 1 if there is an edge from a[i] to b[j] 9 | * and computes the intersection and the differences between the two 10 | * 11 | yarn jar target/scalding-tutorial-0.14.0.jar MatrixTutorial3 --local\ 12 | --input1 data/graph.tsv --input2 data/graph2.tsv --intersection data/intersection.tsv\ 13 | --leftDiff target/data/leftDiff.tsv --rightDiff target/data/rightDiff.tsv 14 | * 15 | */ 16 | 17 | 18 | class MatrixTutorial3(args : Args) extends Job(args) { 19 | 20 | import Matrix._ 21 | 22 | val adjacencyMatrix1 = Tsv( args("input1"), ('user1, 'user2, 'rel) ) 23 | .read 24 | .toMatrix[Long,Long,Double]('user1, 'user2, 'rel) 25 | 26 | val adjacencyMatrix2 = Tsv( args("input2"), ('user1, 'user2, 'rel) ) 27 | .read 28 | .toMatrix[Long,Long,Double]('user1, 'user2, 'rel) 29 | 30 | //zip creates a pair element out of corresponding elements in the two matrices 31 | val intersection = adjacencyMatrix1 32 | .zip(adjacencyMatrix2) 33 | .mapValues( pair => if (pair._1 > 0 && pair._2 > 0) 1.0 else 0.0 ) 34 | .write(Tsv(args("intersection"))) 35 | (adjacencyMatrix1 - intersection).write(Tsv(args("leftDiff"))) 36 | (adjacencyMatrix2 - intersection).write(Tsv(args("rightDiff"))) 37 | 38 | } 39 | 40 
| -------------------------------------------------------------------------------- /src/main/scala/tutorial/MatrixTutorial4.scala: -------------------------------------------------------------------------------- 1 | import com.twitter.scalding._ 2 | import com.twitter.scalding.mathematics.Matrix 3 | 4 | 5 | /* 6 | * MatrixTutorial4.scala 7 | * 8 | * Loads a directed graph adjacency matrix where a[i,j] = 1 if there is an edge from a[i] to b[j] 9 | * and computes the cosine of the angle between every two pairs of vectors 10 | * 11 | yarn jar target/scalding-tutorial-0.14.0.jar MatrixTutorial4 --local\ 12 | --input data/graph.tsv --output target/data/cosineSim.tsv 13 | * 14 | */ 15 | 16 | class MatrixTutorial4(args : Args) extends Job(args) { 17 | 18 | import Matrix._ 19 | 20 | val adjacencyMatrix = Tsv( args("input"), ('user1, 'user2, 'rel) ) 21 | .read 22 | .toMatrix[Long,Long,Double]('user1, 'user2, 'rel) 23 | 24 | // we compute the L2 normalized adjacency graph 25 | val normMatrix = adjacencyMatrix.rowL2Normalize 26 | 27 | // we compute the innerproduct of the normalized matrix with itself 28 | // which is equivalent with computing cosine: AA^T / ||A|| * ||A|| 29 | (normMatrix * normMatrix.transpose).write( Tsv( args("output") ) ) 30 | 31 | } 32 | 33 | -------------------------------------------------------------------------------- /src/main/scala/tutorial/MatrixTutorial5.scala: -------------------------------------------------------------------------------- 1 | import com.twitter.scalding._ 2 | import com.twitter.scalding.mathematics.Matrix 3 | 4 | 5 | /* 6 | * MatrixTutorial5.scala 7 | * 8 | * Loads a directed graph adjacency matrix where a[i,j] = 1 if there is an edge from a[i] to b[j] 9 | * and computes the jaccard similarity between any two pairs of vectors 10 | * 11 | yarn jar target/scalding-tutorial-0.14.0.jar MatrixTutorial5 --local\ 12 | --input data/graph.tsv --output target/data/jaccardSim.tsv 13 | * 14 | */ 15 | 16 | class MatrixTutorial5(args : Args) 
extends Job(args) { 17 | 18 | import Matrix._ 19 | 20 | val adjacencyMatrix = Tsv( args("input"), ('user1, 'user2, 'rel) ) 21 | .read 22 | .toMatrix[Long,Long,Double]('user1, 'user2, 'rel) 23 | 24 | val aBinary = adjacencyMatrix.binarizeAs[Double] 25 | 26 | // intersectMat holds the size of the intersection of row(a)_i n row (b)_j 27 | val intersectMat = aBinary * aBinary.transpose 28 | val aSumVct = aBinary.sumColVectors 29 | val bSumVct = aBinary.sumRowVectors 30 | 31 | //Using zip to repeat the row and column vectors values on the right hand 32 | //for all non-zeroes on the left hand matrix 33 | val xMat = intersectMat.zip(aSumVct).mapValues( pair => pair._2 ) 34 | val yMat = intersectMat.zip(bSumVct).mapValues( pair => pair._2 ) 35 | 36 | val unionMat = xMat + yMat - intersectMat 37 | //We are guaranteed to have Double both in the intersection and in the union matrix 38 | intersectMat.zip(unionMat) 39 | .mapValues( pair => pair._1 / pair._2 ) 40 | .write(Tsv( args("output") )) 41 | 42 | } 43 | 44 | -------------------------------------------------------------------------------- /src/main/scala/tutorial/MatrixTutorial6.scala: -------------------------------------------------------------------------------- 1 | import com.twitter.scalding._ 2 | import com.twitter.scalding.mathematics.Matrix 3 | 4 | /* 5 | * MatrixTutorial6.scala 6 | * 7 | * Loads a document to word matrix where a[i,j] = freq of the word j in the document i 8 | * computes the Tf-Idf score of each word w.r.t. 
to each document and keeps the top nrWords in each document 9 | * (see http://en.wikipedia.org/wiki/Tf*idf for more info) 10 | * 11 | yarn jar target/scalding-tutorial-0.14.0.jar MatrixTutorial6 --local\ 12 | --input data/docBOW.tsv --nrWords 300 --output target/data/featSelectedMatrix.tsv 13 | * 14 | */ 15 | 16 | class MatrixTutorial6(args : Args) extends Job(args) { 17 | 18 | import Matrix._ 19 | 20 | val docWordMatrix = Tsv( args("input"), ('doc, 'word, 'count) ) 21 | .read 22 | .toMatrix[Long,String,Double]('doc, 'word, 'count) 23 | 24 | // compute the overall document frequency of each row 25 | val docFreq = docWordMatrix.binarizeAs[Double].sumRowVectors 26 | 27 | // compute the inverse document frequency vector 28 | val invDocFreqVct = docFreq.toMatrix(1).rowL1Normalize.mapValues( x => log2(1/x) ) 29 | 30 | // zip the row vector along the entire document - word matrix 31 | val invDocFreqMat = docWordMatrix.zip(invDocFreqVct.getRow(1)).mapValues( pair => pair._2 ) 32 | 33 | // multiply the term frequency with the inverse document frequency and keep the top nrWords 34 | docWordMatrix.hProd(invDocFreqMat).topRowElems( args("nrWords").toInt ).write(Tsv( args("output") )) 35 | 36 | def log2(x : Double) = scala.math.log(x)/scala.math.log(2.0) 37 | 38 | } 39 | 40 | -------------------------------------------------------------------------------- /src/main/scala/tutorial/Tutorial0.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | import com.twitter.scalding._ 17 | 18 | /** 19 | Scalding tutorial part 0. 20 | 21 | This is the simplest possible scalding job: it reads from one data source and writes the data, 22 | unchanged, to another. 23 | 24 | To test it, from the science directory, first make sure you've built the target/scalding-assembly-0.2.0.jar: 25 | from the base directory type: 26 | sbt assembly 27 | 28 | yarn jar target/scalding-tutorial-0.14.0.jar Tutorial0 --local 29 | 30 | You can check the input: 31 | cat data/hello.txt 32 | 33 | And the output: 34 | cat target/data/output0.txt 35 | 36 | The output should look just like the input, but with line numbers. 37 | More on this in part 1 of the tutorial. 38 | **/ 39 | 40 | 41 | /** 42 | All jobs in scalding are represented by a subclass of com.twitter.scalding.Job. 43 | The constructor must take a single com.twitter.scalding.Args, even if, as here, 44 | we don't use it. 45 | 46 | For the scald.rb script to work, name the class to match the file, 47 | and don't use a package. 48 | **/ 49 | class Tutorial0(args : Args) extends Job(args) { 50 | 51 | /** 52 | Both input and output data sources are represented by instances of 53 | com.twitter.scalding.Source. 54 | 55 | Scalding comes with some basic source types like TextLine and Tsv. 56 | There are also many twitter-specific types like MergedAdRequestSource. 57 | **/ 58 | val input = TextLine("data/hello.txt") 59 | val output = TextLine("target/data/output0.txt") 60 | 61 | /** 62 | This is the minimal pipeline. Source.read returns a cascading.pipe.Pipe, which represents 63 | a stream of data. We can transform this stream in many ways, but here we're simply 64 | asking it to write itself to the output source. 65 | **/ 66 | input.read.write(output) 67 | 68 | /** 69 | By the way, if you look at the docs for Pipe, you won't find write there. 
That's 70 | because it's actually defined on com.twitter.scalding.RichPipe. Most of the methods 71 | we call on Pipes will actually be found on RichPipe; in typical scala style, 72 | the conversion between them is implicit. 73 | **/ 74 | } 75 | -------------------------------------------------------------------------------- /src/main/scala/tutorial/Tutorial1.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | import com.twitter.scalding._ 17 | 18 | /** 19 | Scalding tutorial part 1. 20 | 21 | In part 0, we made a copy of hello.txt, but it wasn't a perfect copy: 22 | it was annotated with line numbers. 23 | 24 | That's because the data stream coming out of a TextLine source actually 25 | has two fields: one, called "line", has the actual line of text. The other, 26 | called "offset", has the line number in the file. When you write these 27 | tuples to a TextLine, it naively outputs them both on each line. 28 | 29 | We can ask scalding to select just the "line" field from the pipe, using the 30 | project() method. When we refer to a data stream's fields, we use Scala symbols, 31 | like this: 'line. 
32 | 33 | To run this job: 34 | yarn jar target/scalding-tutorial-0.14.0.jar Tutorial1 --local 35 | 36 | Check the output: 37 | cat target/data/output1.txt 38 | 39 | **/ 40 | 41 | class Tutorial1(args : Args) extends Job(args) { 42 | 43 | val input = TextLine("data/hello.txt") 44 | val output = TextLine("target/data/output1.txt") 45 | 46 | /** 47 | We generally write each step of the pipeline on a separate line. 48 | **/ 49 | input 50 | .read 51 | .project('line) 52 | .write(output) 53 | } 54 | -------------------------------------------------------------------------------- /src/main/scala/tutorial/Tutorial2.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | import com.twitter.scalding._ 17 | 18 | /** 19 | Scalding tutorial part 2. 20 | 21 | In parts 0 and 1, we made copies of hello.txt. Now let's try to 22 | modify the copies by reversing each line. 
23 | 24 | To run this job: 25 | yarn jar target/scalding-tutorial-0.14.0.jar Tutorial2 --local 26 | 27 | Check the output: 28 | cat target/data/output2.txt 29 | 30 | **/ 31 | 32 | class Tutorial2(args : Args) extends Job(args) { 33 | 34 | val input = TextLine("data/hello.txt") 35 | val output = TextLine("target/data/output2.txt") 36 | 37 | input 38 | .read 39 | 40 | /** 41 | As with a scala collection, you can map over a pipe, where each 42 | item gets passed into an anonymous function, and we create a new 43 | pipe with the results. 44 | 45 | In scalding, we need to also annotate the call to map with the names of the 46 | fields it operates on. In this case, we want to take the 'line field 47 | as input, and we want to output a new field named 'reversed. 48 | 49 | Unlike with a normal scala map{}, we always need to specify the 50 | types of the arguments to the anonymous function. 51 | **/ 52 | 53 | .map('line -> 'reversed){ line : String => line.reverse} 54 | 55 | /** 56 | The map transformation in scalding is additive: the 'offset and 'line 57 | fields haven't gone away, we've just added a new 'reversed field to each 58 | entry. If we only want to write the 'reversed version, we need to use 59 | project. 60 | **/ 61 | 62 | .project('reversed) 63 | .write(output) 64 | } 65 | -------------------------------------------------------------------------------- /src/main/scala/tutorial/Tutorial3.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | import com.twitter.scalding._ 17 | 18 | /** 19 | Scalding tutorial part 3. 20 | 21 | So far, we've been hardcoding the input file. Let's make that an argument, 22 | which changes how we run the job: 23 | 24 | yarn jar target/scalding-tutorial-0.14.0.jar \ 25 | Tutorial3 --local\ 26 | --input data/hello.txt 27 | 28 | We're also going to use a new transformation: flatMap. 29 | 30 | Check the output: 31 | cat target/data/output3.txt 32 | 33 | You can also of course try this with other input parameters. For example: 34 | 35 | yarn jar target/scalding-tutorial-0.14.0.jar \ 36 | Tutorial3 --local\ 37 | --input target/data/output2.txt 38 | 39 | **/ 40 | 41 | class Tutorial3(args : Args) extends Job(args) { 42 | 43 | /** 44 | We can ask args for the --input argument from the command line. 45 | If it's missing, we'll get an error. 46 | **/ 47 | val input = TextLine(args("input")) 48 | val output = TextLine("target/data/output3.txt") 49 | 50 | input 51 | .read 52 | 53 | /** 54 | flatMap is like map, but instead of returning a single item from the 55 | function, we return a collection of items. Each of these items will create 56 | a new entry in the data stream; here, we'll end up with a new entry for each word. 57 | **/ 58 | 59 | .flatMap('line -> 'word){ line : String => line.split("\\s")} 60 | 61 | /** 62 | We still want to project just the 'word field for our final output. 63 | For interest, though, let's stash a copy of the data before we do that. 64 | write() returns the pipe, so we can keep chaining our pipeline. 
65 | **/ 66 | 67 | .write(Tsv("target/data/tmp3.tsv")) 68 | .project('word) 69 | .write(output) 70 | } 71 | -------------------------------------------------------------------------------- /src/main/scala/tutorial/Tutorial4.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | import com.twitter.scalding._ 17 | 18 | /** 19 | Scalding tutorial part 4. 20 | 21 | You might have noticed that in part 3, we ended up with a list of words. 22 | Clearly we're ready for that most exciting of MapReduce examples, the word count. 23 | 24 | Also, let's go ahead and make this fully general by parameterizing the output location. 25 | 26 | Run: 27 | yarn jar target/scalding-tutorial-0.14.0.jar \ 28 | Tutorial4 --local\ 29 | --input data/hello.txt \ 30 | --output target/data/output4.txt 31 | 32 | Check the output: 33 | cat target/data/output4.txt 34 | 35 | **/ 36 | 37 | class Tutorial4(args : Args) extends Job(args) { 38 | 39 | //we probably don't need to bother with vals for input/output anymore 40 | TextLine(args("input")) 41 | .read 42 | .flatMap('line -> 'word){ line : String => line.split("\\s")} 43 | 44 | /** 45 | To count the words, first we need to group by word. 46 | groupBy takes any number of fields as the group key. In this 47 | case we just want 'word. 
48 | 49 | groupBy also takes an anonymous function, to which it will pass a 50 | com.twitter.scalding.GroupBuilder. 51 | 52 | Each method call to GroupBuilder will specify an aggregation we want to 53 | perform on the group. In general, the resulting data stream will have all 54 | of the group fields (with one entry for each set of unique values), plus 55 | one new field for each aggregation. 56 | 57 | In this case, the only aggregation we care about is size: how many values are 58 | in the group. 59 | **/ 60 | 61 | .groupBy('word){group => group.size} 62 | 63 | /** 64 | No project is needed here because the groupBy has eliminated everything but 'word 65 | and the size field. 66 | **/ 67 | 68 | .write(Tsv(args("output"))) 69 | } 70 | -------------------------------------------------------------------------------- /src/main/scala/tutorial/Tutorial5.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | import com.twitter.scalding._ 17 | 18 | /** 19 | Scalding tutorial part 5. 20 | 21 | This example is a little bit contrived so that we can play with joins. 22 | 23 | Let's define a metric for a line of text which is the sum of the rank 24 | of each of its words in the words input file - the word "hello" comes 25 | first (rank 0) whereas the second word is "world", with a rank of 1. 
26 | 27 | So, the line "hello world" would have a total score of 0+1 = 1 28 | 29 | We'll read in an input file, split it into words, join those words 30 | with the words input file to get their individual ranks, then 31 | group by line to get a total score and output each line/score pair. 32 | 33 | Run: 34 | yarn jar target/scalding-tutorial-0.14.0.jar \ 35 | Tutorial5 --local\ 36 | --input data/hello.txt \ 37 | --output target/data/output5.txt \ 38 | --words data/words.txt 39 | 40 | Check the output: 41 | cat target/data/output5.txt 42 | 43 | Note that the line order may no longer be the same as the input file. 44 | That's parallelism, man. 45 | 46 | **/ 47 | 48 | class Tutorial5(args : Args) extends Job(args) { 49 | 50 | /** 51 | We'll start with the dict data source. 52 | 53 | When we join, we'll need unique field names, so we'll rename 54 | the 'offset' field to be score. Also, we want to normalize 55 | the words to be lowercase. 56 | **/ 57 | 58 | val scores = TextLine(args("words")) 59 | .read 60 | .rename('offset, 'score) 61 | .map('line -> 'dictWord){line : String => line.toLowerCase} 62 | .project('score, 'dictWord) 63 | 64 | TextLine(args("input")) 65 | .read 66 | 67 | //split and normalize to lowercase 68 | .flatMap('line -> 'word){ line : String => line.split("\\s").map{_.toLowerCase}} 69 | 70 | /** 71 | When we join, we need to specify which fields from each side of the join should match. 72 | This is like a SQL inner join: we end up with a new row that combines each possible 73 | matching pair, with all of the fields of both the left and right side. 74 | **/ 75 | 76 | .joinWithLarger('word -> 'dictWord, scores) 77 | 78 | /** 79 | Now that we have a score for each word, we can group back to the original lines 80 | and sum up the word scores. Sum is another common aggregation that GroupBuilder 81 | provides; we just need to specify which field to sum by. 
82 | **/ 83 | 84 | .groupBy('line){group => group.sum[Double]('score)} 85 | .write(Tsv(args("output"))) 86 | } 87 | -------------------------------------------------------------------------------- /src/main/scala/tutorial/Tutorial6.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2012 Twitter, Inc. 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | import com.twitter.scalding._ 17 | 18 | /** 19 | Scalding tutorial part 6. 20 | 21 | This is similar to Tutorial1 except that we show the use of Scala Enumerations to specify fields. 22 | 23 | To run this job: 24 | yarn jar target/scalding-tutorial-0.14.0.jar Tutorial6 --local 25 | 26 | Check the output: 27 | cat target/data/output6.tsv 28 | 29 | **/ 30 | 31 | class Tutorial6(args : Args) extends Job(args) { 32 | /** When a data set has a large number of fields, and we want to specify those fields conveniently 33 | in code, we can use, for example, a Tuple of Symbols (as most of the other tutorials show), or a List of Symbols. 34 | Note that Tuples can only be used if the number of fields is at most 22, since Scala Tuples cannot have more 35 | than 22 elements. 
Another alternative is to use Enumerations, which we show here **/ 36 | 37 | object Schema extends Enumeration { 38 | val first, last, phone, age, country = Value // arbitrary number of fields 39 | } 40 | 41 | import Schema._ 42 | 43 | Csv("data/phones.txt", separator = " ", fields = Schema) 44 | .read 45 | .project(first,age) 46 | .write(Tsv("target/data/output6.tsv")) 47 | } 48 | 49 | -------------------------------------------------------------------------------- /src/main/scala/tutorial/TypedTutorial.scala: -------------------------------------------------------------------------------- 1 | import cascading.pipe.Pipe 2 | import com.twitter.scalding._ 3 | 4 | /** 5 | Scalding Tutorial ported to use the Type-safe API (TDsl) 6 | (rather than Cascading's Fields API). The examples here roughly correspond 7 | to those in `tutorial/Tutorial{0..5}.scala`. 8 | 9 | These tutorials are all run from this single file; which one is run can 10 | be chosen with a command-line flag "--tutorial". For instance, to run the 11 | first tutorial example: 12 | 13 | yarn jar target/scalding-tutorial-0.14.0.jar TypedTutorial --local \ 14 | --tutorial 0 \ 15 | --input data/hello.txt \ 16 | --output target/data/output0.txt \ 17 | --words data/word_scores.tsv 18 | 19 | (Note: only tutorial 5 uses "word_scores.tsv") 20 | **/ 21 | class TypedTutorial(args : Args) extends Job(args) { 22 | 23 | args("tutorial") match { 24 | 25 | /** 26 | Tutorial {0,1}: Write out to a TSV file. 27 | ---------------------------------------- 28 | In this first version we will be as explicit as possible to show all 29 | the steps required to go from a raw text file to a typed stream. 30 | **/ 31 | case "0" | "1" => { 32 | 33 | // The TextLine source splits the input by lines. 34 | val textSource = TextLine(args("input")) 35 | 36 | // Create a type-safe pipe from the TextLine. 37 | val lines: TypedPipe[String] = 38 | TypedPipe.from[String](textSource) 39 | 40 | // Write the typed pipe out to a tab-delimited file. 
41 | lines.write(TypedTsv[String](args("output"))) 42 | } 43 | 44 | /** 45 | Tutorial 2: Simple map 46 | ---------------------- 47 | Reverse all the strings. Notice that we've now left off the [String] type. 48 | Scala can generally infer these types for us, making the code cleaner. 49 | **/ 50 | case "2" | "map" => { 51 | // Create a typed pipe from the TextLine (of type TypedPipe[String] still) 52 | TypedPipe.from(TextLine(args("input"))) 53 | // Transform each line, reversing it. Output is a new TypedPipe, still of String. 54 | .map(_.reverse) 55 | // Note, the types for the TypedTsv *can* be inferred by Scala here. 56 | // However, it's best to specify them explicitly so that if the 57 | // output type changes, it is detected and doesn't break the next 58 | // thing to read from the output file. 59 | .write(TypedTsv[String](args("output"))) 60 | } 61 | 62 | /** 63 | Tutorial 3: Flat Map 64 | --------------------- 65 | Dump all the words. 66 | **/ 67 | case "3" | "flatmap" => { 68 | TypedPipe.from(TextLine(args("input"))) 69 | // flatMap is like map, but instead of returning a single item 70 | // from the function, we return a collection of items. Each of 71 | // these items will create a new entry in the data stream; here, 72 | // we'll end up with a new entry for each word. 73 | .flatMap(_.split("\\s")) 74 | // output of flatMap is still a collection of String 75 | .write(TypedTsv[String](args("output"))) 76 | } 77 | 78 | /** 79 | Tutorial 4: Word Count 80 | ---------------------- 81 | Now that we have a stream of words, clearly we're ready for 82 | that most exciting of MapReduce examples: the Word Count. 83 | **/ 84 | case "4" | "wordcount" => { 85 | // Get the words (just like above in case "3") 86 | val words = TypedPipe.from(TextLine(args("input"))) 87 | .flatMap(_.split("\\s")) 88 | 89 | // To count the words, we use TypedPipe's `groupBy` method. 
90 | // However, this no longer returns a `TypedPipe[T]`, but rather 91 | // a `Grouped[K,T]` based on the type of the key used to group by. 92 | // 93 | // groupBy accepts a function to determine the key for grouping. 94 | // In the case of word count, let's imagine we want to make sure 95 | // capitalization doesn't matter, so to come up with the key, 96 | // we normalize it to lower case. 97 | val groups : Grouped[String,String] = words.groupBy(_.toLowerCase) 98 | 99 | // Next we specify what to do with each aggregation. In the case 100 | // of word count, we simply want the size of each group. This 101 | // operation results in a new `Grouped` that has the key (String, 102 | // the lower case words), and the counts (Long). 103 | // 104 | // Note: To do more interesting aggregations, Scalding supports 105 | // a variety of operations, such as `sum`, `reduce`, `foldLeft`, 106 | // `mapGroup`, etc, that can all be applied efficiently on Monoids 107 | // (primitives like Long, container types like `Map`, or custom 108 | // monoids you define yourself). See the wiki for more details: 109 | // https://github.com/twitter/scalding/wiki/Type-safe-api-reference 110 | val counts = groups.size 111 | 112 | // And finally, we dump these results to a TypedTsv with the 113 | // correct Tuple type. 114 | counts.write(TypedTsv[(String,Long)](args("output"))) 115 | } 116 | 117 | /** 118 | Tutorial 5: Demonstrate joins 119 | ----------------------------- 120 | Associate a score with each word and compute a score for each line. 121 | 122 | Note: this example is a bit contrived, but serves to demonstrate 123 | how to combine multiple input sources. 124 | **/ 125 | case "5" | "join" => { 126 | // Load the scores for each word from TSV file and group by word. 127 | val scores: Grouped[String,Double] = 128 | // For TypedTsv, Scalding coerces the fields to the specified types, 129 | // throwing an exception if any line fails. 
130 | TypedPipe.from(TypedTsv[(String,Double)](args("words"))) 131 | // group by word so we can join it 132 | .group 133 | 134 | // get the lines, this time from an 'OffsetTextLine' which is a 135 | // typed wrapper on 'TextLine' that contains the 'byte offset' and 136 | // text of each line in the file. 137 | val lines: TypedPipe[(Long,String)] = TypedPipe.from(OffsetTextLine(args("input"))) 138 | 139 | // Split lines into words, but keep their original line offset with them. 140 | val wordsWithLine : Grouped[String,Long] = 141 | lines 142 | .flatMap{ case (offset, line) => 143 | // split into words 144 | line.split("\\s") 145 | // keep the line offset with them 146 | .map(word => (word.toLowerCase, offset)) 147 | } 148 | // make the 'word' field the key 149 | .group 150 | 151 | // Associate scores with each word; merges the two value types into 152 | // a tuple: [String,Long] join [String,Double] -> [String,(Long,Double)] 153 | val scoredWords = wordsWithLine.join(scores) 154 | 155 | // get scores for each line (indexed by line number) 156 | val scoredLinesByNumber = 157 | scoredWords 158 | // select the line offset and score fields 159 | .map{ case (word,(offset,score)) => (offset,score) } 160 | // group by line offset (groups all the words for a line together) 161 | .group 162 | // compute total score per line 163 | .sum 164 | // Group and sum are often run together in this way. 165 | // The `sumByKey` operation performs both. 
166 | 167 | // Associate the original line text with the computed score, 168 | // discard the 'offset' field 169 | val scoredLines: TypedPipe[(String,Double)] = 170 | lines 171 | // index lines by 'offset' 172 | .group 173 | // associate scores with lines (by offset) 174 | .join(scoredLinesByNumber) 175 | // take just the value fields (discard the 'line offset') 176 | .values 177 | 178 | // write out the final result 179 | scoredLines.write(TypedTsv[(String,Double)](args("output"))) 180 | 181 | } 182 | 183 | /** 184 | Interoperability with Fields API 185 | -------------------------------- 186 | Scalding also provides a thinner, un-type-safe wrapper over Cascading 187 | which is known as the Fields API because each record has a number of 188 | named "fields". 189 | 190 | Most jobs can be done completely in the Typed API, but for compatibility, 191 | there are ways to go back and forth between the two schemes, which the 192 | next couple cases demonstrate. 193 | **/ 194 | 195 | /** 196 | Pipe vs. TypedPipe 197 | ------------------ 198 | TypedPipes can be easily converted to Pipes and vice-versa. 199 | **/ 200 | case "pipes" => { 201 | // calling 'read' on a source returns an un-typed Pipe 202 | // TextLine, by default, contains two fields: 'offset, and 'line. 
203 | val rawPipe: Pipe = TextLine(args("input")).read 204 | 205 | // To convert to a typed pipe, we must specify the fields we want 206 | // and their types: 207 | val lines: TypedPipe[(Long,String)] = 208 | TypedPipe.from[(Long,String)](rawPipe, ('offset,'line)) 209 | 210 | // We can operate on this typed pipe as above, and come up with a 211 | // different set of fields 212 | val lineSizes: TypedPipe[Long] = lines.map{ case (offset,line) => line.length } 213 | 214 | // To convert back to a Fields Pipe, we must specify the names of the fields: 215 | val lineSizesField: Pipe = lineSizes.toPipe('size) 216 | 217 | // finally, we can write out this untyped pipe with an untyped sink: 218 | lineSizesField.write(Tsv(args("output"))) 219 | } 220 | 221 | /** 222 | Bonus: Typed blocks 223 | ------------------- 224 | An alternative to working completely in typed mode is to use 225 | `typed` blocks, which create a TypedPipe within the scope, and then 226 | map the output back into an untyped Pipe. You specify the fields to 227 | map in and out using the `->` pair passed to `typed()`. 228 | **/ 229 | case "block" => { 230 | // Get the .typed enrichment 231 | import TDsl._ 232 | 233 | TextLine(args("input")).read 234 | .typed('line -> 'size) { tp: TypedPipe[String] => 235 | // now operate on the typed pipe 236 | tp.map(_.length) 237 | } 238 | // the final output will have just the 'size field 239 | // and can be dumped using the un-typed Tsv source. 240 | .write(Tsv(args("output"))) 241 | } 242 | } 243 | } 244 | --------------------------------------------------------------------------------