├── project └── build.properties ├── src ├── test │ ├── resources │ │ ├── combine │ │ │ ├── file_1.csv │ │ │ └── file_2.csv │ │ └── archive │ │ │ ├── test_bzip2.txt.bz2 │ │ │ ├── test_gzip.txt.gz │ │ │ └── test_lzma.txt.xz │ └── scala │ │ └── ru │ │ └── retailrocket │ │ └── spark │ │ └── multitool │ │ ├── AlgsSuite.scala │ │ ├── LoadersSuite.scala │ │ └── FunctionsSuite.scala └── main │ └── scala │ └── ru │ └── retailrocket │ └── spark │ └── multitool │ ├── HashFNV.scala │ ├── Config.scala │ ├── algs │ └── package.scala │ ├── DataFrameFunctions.scala │ ├── fs │ └── package.scala │ ├── RDDFunctions.scala │ ├── Loaders.scala │ └── package.scala ├── example ├── src │ └── main │ │ ├── resource │ │ ├── file_1.txt │ │ └── file_2.txt │ │ └── scala │ │ └── MyProject.scala └── build.sbt ├── .gitignore ├── LICENSE └── README.md /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version = 1.3.12 2 | -------------------------------------------------------------------------------- /src/test/resources/combine/file_1.csv: -------------------------------------------------------------------------------- 1 | 1 2 | 2 3 | -------------------------------------------------------------------------------- /src/test/resources/combine/file_2.csv: -------------------------------------------------------------------------------- 1 | 3 2 | 4 3 | -------------------------------------------------------------------------------- /example/src/main/resource/file_1.txt: -------------------------------------------------------------------------------- 1 | 1 2 | 2 3 | 3 4 | -------------------------------------------------------------------------------- /example/src/main/resource/file_2.txt: -------------------------------------------------------------------------------- 1 | 4 2 | 5 3 | 6 4 | -------------------------------------------------------------------------------- /src/test/resources/archive/test_bzip2.txt.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RetailRocket/SparkMultiTool/HEAD/src/test/resources/archive/test_bzip2.txt.bz2 -------------------------------------------------------------------------------- /src/test/resources/archive/test_gzip.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RetailRocket/SparkMultiTool/HEAD/src/test/resources/archive/test_gzip.txt.gz -------------------------------------------------------------------------------- /src/test/resources/archive/test_lzma.txt.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RetailRocket/SparkMultiTool/HEAD/src/test/resources/archive/test_lzma.txt.xz -------------------------------------------------------------------------------- /example/build.sbt: -------------------------------------------------------------------------------- 1 | name := "MyProject" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.10.4" 6 | 7 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.1.0" 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | 4 | # sbt specific 5 | .cache/ 6 | .history/ 7 | .lib/ 8 | dist/* 9 | target/ 10 | lib_managed/ 11 | src_managed/ 12 | project/boot/ 13 | project/plugins/project/ 14 | 15 | # Scala-IDE specific 16 | 
.scala_dependencies 17 | .worksheet 18 | -------------------------------------------------------------------------------- /src/main/scala/ru/retailrocket/spark/multitool/HashFNV.scala: -------------------------------------------------------------------------------- 1 | package ru.retailrocket.spark.multitool 2 | 3 | 4 | object HashFNV { 5 | val FNV_32_INIT: Int = 33554467 6 | val FNV_32_PRIME: Int = 0x01000193 7 | 8 | def hash(s: String, init: Int=FNV_32_INIT): Int = { 9 | var hval = init 10 | val bytes = s.getBytes 11 | for(i <- bytes) { 12 | hval *= FNV_32_PRIME 13 | hval ^= i 14 | } 15 | hval 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /example/src/main/scala/MyProject.scala: -------------------------------------------------------------------------------- 1 | import org.apache.spark.SparkContext 2 | import org.apache.spark.SparkContext._ 3 | import org.apache.spark.SparkConf 4 | 5 | import ru.retailrocket.spark.multitool.Loaders._ 6 | 7 | object MyProject { 8 | def main(args: Array[String]) { 9 | val sc = new SparkContext("local", "MyProject") 10 | val sessions = sc.combineTextFile("file://" + getClass.getResource("src/main/resource").getFile) 11 | println(s"sessions count ${sessions.count()}") 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /src/main/scala/ru/retailrocket/spark/multitool/Config.scala: -------------------------------------------------------------------------------- 1 | package ru.retailrocket.spark.multitool 2 | 3 | import com.typesafe.config.{ Config => TypeSafeConfig } 4 | import org.apache.spark.{SparkContext, SparkConf} 5 | 6 | object Config { 7 | def flatConfig(config: TypeSafeConfig): Seq[(String, AnyRef)] = { 8 | import scala.collection.convert.WrapAsScala._ 9 | 10 | config.entrySet().map { entry => 11 | val k = entry.getKey 12 | val v = entry.getValue.unwrapped() 13 | (k, v) 14 | }.toSeq 15 | } 16 | 17 | def asSparkConfig(config: TypeSafeConfig): SparkConf = { 18 | val sc = new SparkConf() 19 | 20 | for ((key, value) <- flatConfig(config)) { 21 | sc.set(key, value.toString) 22 | } 23 | 24 | sc 25 | } 26 | } -------------------------------------------------------------------------------- /src/test/scala/ru/retailrocket/spark/multitool/AlgsSuite.scala: -------------------------------------------------------------------------------- 1 | package ru.retailrocket.spark.multitool 2 | 3 | import org.scalatest._ 4 | 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.SparkContext._ 7 | import org.apache.spark.SparkConf 8 | import org.apache.spark.rdd._ 9 | 10 | 11 | class AlgsSuite extends FunSuite with BeforeAndAfterAll { 12 | lazy val sc: SparkContext = new SparkContext("local", getClass.getSimpleName) 13 | implicit val parallel = 5 14 | 15 | test("cosine") { 16 | val data = sc.parallelize(List[(Int, Long, Double)]( 17 | (1, 1L, 0.5), 18 | (1, 2L, 0.3), 19 | (2, 1L, 0.6), 20 | (2, 2L, 0.2), 21 | (3, 1L, 0.5), 22 | (3, 3L, 0.2), 23 | (4, 1L, 0.1))) 24 | 25 | val res = algs.cosine(data).collect.sorted 26 | 27 | assert(res(0) === (1L,2L,0.8028463951575711), "((0.5*0.3)+(0.6*0.2)) / (sqrt(0.5^2+0.6^2+0.5^2+0.1^2)*sqrt(0.3^2+0.2^2))") 28 | assert(res(1) === (1L,3L,0.5360562674188974)) 29 | } 30 | 31 | override def afterAll() { 32 | sc.stop() 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 
(MIT) 2 | 3 | Copyright (c) 2014 Retail Rocket 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /src/main/scala/ru/retailrocket/spark/multitool/algs/package.scala: -------------------------------------------------------------------------------- 1 | package ru.retailrocket.spark.multitool 2 | 3 | import org.apache.spark.SparkContext._ 4 | import org.apache.spark.storage.StorageLevel 5 | import org.apache.spark.rdd.RDD 6 | 7 | import scala.util._ 8 | import scala.reflect.ClassTag 9 | 10 | package object algs { 11 | def cosine[Session : ClassTag, Item <% Ordered[Item] : ClassTag](src: RDD[(Session, Item, Double)])(implicit parallel: Int): RDD[(Item, Item, Double)] = { 12 | val ab = src 13 | .map{case(session, item, weight) => (session, (item, weight))} 14 | .groupByKey(parallel) 15 | .flatMap{case(prop, items) => 16 | for((itemA, weightA) <- items; (itemB, weightB) <- items if itemA < itemB) 17 | yield ((itemA, itemB), weightA * weightB)} 18 | .reduceByKey(_+_, parallel) 19 | .map{case((itemA, itemB), weight) => (itemA, itemB, weight)} 20 | 21 | val a = src 22 | .map{case(session, item, weight) => (item, weight * weight)} 23 | .reduceByKey(_+_, parallel) 24 | .map{case(item, weight) => (item, math.sqrt(weight))} 25 | .persist(StorageLevel.MEMORY_AND_DISK_SER) 26 | 27 | val cosineL = ab 28 | .map{case(itemA, itemB, weightAB) => (itemA, (itemB, weightAB))} 29 | .join(a, parallel) 30 | .map{case(itemA, ((itemB, weightAB), weightA)) => (itemB, (itemA, weightAB, weightA))} 31 | .join(a, parallel) 32 | .map{case(itemB, ((itemA, weightAB, weightA), weightB)) => (itemA, itemB, weightAB / (weightA * weightB))} 33 | 34 | val cosineR = cosineL 35 | .map{case(itemA, itemB, weight) => (itemB, itemA, weight)} 36 | 37 | cosineL union cosineR 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/main/scala/ru/retailrocket/spark/multitool/DataFrameFunctions.scala: -------------------------------------------------------------------------------- 1 | package ru.retailrocket.spark.multitool 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.rdd._ 7 | import org.apache.spark.sql._ 8 | 9 | import org.apache.hadoop.mapreduce.RecordReader 10 | import org.apache.hadoop.mapreduce.TaskAttemptContext 11 | import 
org.apache.hadoop.mapreduce.InputSplit 12 | import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat 13 | import org.apache.hadoop.mapred.JobConf 14 | import org.apache.hadoop.mapred.FileOutputFormat 15 | import org.apache.hadoop.io.compress.CompressionCodecFactory 16 | import org.apache.hadoop.util.LineReader 17 | import org.apache.hadoop.io._ 18 | import org.apache.hadoop.fs._ 19 | import org.apache.hadoop.io.compress._ 20 | import org.apache.hadoop.mapreduce.lib.input._ 21 | import org.apache.hadoop.conf.Configuration 22 | import org.apache.hadoop.conf.Configured 23 | 24 | import scala.reflect.ClassTag 25 | import scala.reflect._ 26 | import scala.util._ 27 | 28 | 29 | object DataFrameFunctions { 30 | def transform[R:ClassTag](f: Row=>Option[R])(src: DataFrame): RDDFunctions.TransformResult[Row,R] = { 31 | val dst = src.rdd.map{s => (s, Try{f(s)})} 32 | val output = dst.flatMap{case (_, Success(d)) => d; case _ => None} 33 | val error = dst.flatMap{case (_, Failure(t)) => Some(t); case _ => None} 34 | val ignore = dst.flatMap{case (s, Failure(_)) => Some(s); case _ => None} 35 | RDDFunctions.TransformResult(output, error, ignore) 36 | } 37 | 38 | def transform[R:ClassTag](f: Row=>R)(src: DataFrame)(implicit d: DummyImplicit): RDDFunctions.TransformResult[Row,R] = { 39 | val dst = src.rdd.map{s => (s, Try{f(s)})} 40 | val output = dst.flatMap{case (_, Success(d)) => Some(d); case _ => None} 41 | val error = dst.flatMap{case (_, Failure(t)) => Some(t); case _ => None} 42 | val ignore = dst.flatMap{case (s, Failure(_)) => Some(s); case _ => None} 43 | RDDFunctions.TransformResult(output, error, ignore) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/test/scala/ru/retailrocket/spark/multitool/LoadersSuite.scala: -------------------------------------------------------------------------------- 1 | package ru.retailrocket.spark.multitool 2 | 3 | import org.scalatest._ 4 | 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.SparkContext._ 7 | import org.apache.spark.SparkConf 8 | import org.apache.spark.rdd._ 9 | 10 | import Loaders._ 11 | import Loaders.Filter 12 | 13 | 14 | class FileNameEqualityFilter extends Filter { 15 | def check(rules: Traversable[Filter.Rule], path: Array[String]) = { 16 | rules.forall{ 17 | case(k, Array(eq)) => 18 | k match { 19 | case "file" => eq == path.last 20 | case _ => false 21 | } 22 | } 23 | } 24 | } 25 | 26 | class LoadersSuite extends FunSuite with BeforeAndAfterAll { 27 | lazy val sc: SparkContext = new SparkContext("local", getClass.getSimpleName) 28 | def path(file: String) = getClass.getResource("/" + file).getFile 29 | 30 | test("forPathAndCombine") { 31 | val output = sc.forPath(path("combine")).combine().collect.sorted 32 | assert(output.deep == Array("1","2","3","4").deep) 33 | } 34 | 35 | test("forPathAndCombineWithPath") { 36 | val output = sc.forPath(path("combine")).combineWithPath().collect.sorted 37 | assert(output(1)._1.endsWith("file_1.csv")) 38 | } 39 | 40 | test("forPathWithFilter") { 41 | val output = sc.forPath(path("combine")+"/*") 42 | .addFilter(classOf[FileNameEqualityFilter], Seq("file" -> Array("file_2.csv"))) 43 | .combine().collect.sorted 44 | assert(output.deep == Array("3","4").deep) 45 | } 46 | 47 | test("compression") { 48 | { 49 | val actual = sc.forPath(path("archive")+"/test_gzip.txt.gz").combine().collect().head 50 | assert(actual === "gzip") 51 | } 52 | 53 | { 54 | val actual = 
sc.forPath(path("archive")+"/test_bzip2.txt.bz2").combine().collect().head 55 | assert(actual === "bzip2") 56 | } 57 | 58 | { 59 | val actual = sc.forPath(path("archive")+"/test_lzma.txt.xz").combine().collect().head 60 | assert(actual === "lzma") 61 | } 62 | } 63 | 64 | override def afterAll() { 65 | sc.stop() 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | SparkMultiTool 2 | ============== 3 | 4 | Tools for spark which we use on the daily basis. 5 | It contains: 6 | * Loader of HDFS files with combining small files (uses Hadoop CombineTextInputFormat/CombineFileInputFormat) 7 | * Future: cosine calculation 8 | * Future: quantile calculation 9 | 10 | #Requirements 11 | This library was succeffully tested with Scala 2.11.8 and Spark 2.3.1. 12 | You should install SBT: 13 | * [SBT tool](www.scala-sbt.org/download.html) 14 | 15 | 16 | #Build 17 | This build based on Scala 2.11.8 and Spark 2.3.1. Edit build.sbt If you have another environment. 18 | 19 | For building install sbt, launch a terminal, change current to sparkmultitool directory and launch a command: 20 | 21 | ``` 22 | sbt package 23 | sbt test 24 | ``` 25 | Next copy spark-multitool*.jar from ./target/scala-2.11/... to the lib folder of your sbt project. 26 | 27 | #Usage 28 | Include spark-multitool*.jar in --jars path in spark-submit like this: 29 | ``` 30 | spark-submit --master local --executor-memory 2G --class "Tst" --num-executors 1 --executor-cores 1 --jars lib/spark-multitool_2.11-0.9.jar target/scala-2.11/tst_2.11-0.1.jar 31 | 32 | ``` 33 | See examples folder. 34 | 35 | ##Loaders 36 | **ru.retailrocket.spark.multitool.Loaders** - combine input files before mappers by means of Hadoop CombineTextInputFormat/CombineFileInputFormat. In our case it reduced the number of mappers from 100000 to approx 3000 and made job significantly faster. 37 | Parameters: 38 | * **path** - path to the files (as in spark.textFile) 39 | * **size** - size of target partition in Megabytes. Optimal value equals to a HDFS block size 40 | * **delim** - line delimiters 41 | 42 | This example loads files from "/test/*" and combine them in mappers. 43 | ``` 44 | import org.apache.spark.SparkConf 45 | import org.apache.spark.SparkContext 46 | import org.apache.spark.SparkContext._ 47 | 48 | import ru.retailrocket.spark.multitool.Loaders._ 49 | 50 | object Tst { 51 | def main(args: Array[String]) = { 52 | val conf = new SparkConf().setMaster("local").setAppName("My App") 53 | val sc = new SparkContext("local", "My App") 54 | 55 | val path = "file:///test/*" 56 | 57 | { 58 | val sessions = sc 59 | .forPath(path) 60 | .setSplitSize(256) // optional 61 | .setRecordDelim("\n") // optional 62 | .combine() 63 | println(sessions.count()) 64 | } 65 | 66 | { 67 | // you can also get RDD[(String, String)] with (file, line) 68 | val sessions = sc 69 | .forPath(path) 70 | .combineWithPath() 71 | println(sessions.count()) 72 | 73 | { 74 | // or add path filter, e.g. 
for partitioning 75 | class FileNameEqualityFilter extends Filter { 76 | def check(rules: Traversable[Filter.Rule], path: Array[String]) = { 77 | rules.forall { 78 | case(k, Array(eq)) => 79 | k match { 80 | case "file" => eq == path.last 81 | case _ => false 82 | } 83 | } 84 | } 85 | } 86 | val sessions = sc 87 | .forPath(path) 88 | .addFilter(classOf[FileNameEqualityFilter], Seq("file" -> Array("file.name"))) 89 | .combine() 90 | println(sessions.count()) 91 | } 92 | } 93 | } 94 | } 95 | ``` 96 | 97 | ##Algorithms 98 | 99 | **ru.retailrocket.spark.multitool.algs.cosine** - cosine similarity function. 100 | 101 | ##Utility 102 | 103 | **ru.retailrocket.spark.multitool.HashFNV** - simple, but useful hash function. Original idea from org.apache.pig.piggybank.evaluation.string.HashFNV 104 | -------------------------------------------------------------------------------- /src/test/scala/ru/retailrocket/spark/multitool/FunctionsSuite.scala: -------------------------------------------------------------------------------- 1 | package ru.retailrocket.spark.multitool 2 | 3 | import org.scalatest._ 4 | import java.nio.file.FileAlreadyExistsException 5 | 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.SparkContext._ 8 | import org.apache.spark.SparkConf 9 | import org.apache.spark.rdd._ 10 | import scala.util._ 11 | 12 | import Implicits._ 13 | 14 | 15 | object Helpers { 16 | def f(x:Int): Int = 8 / x 17 | 18 | val serializer = new StringSerializer[Int] { 19 | override def apply(src: Int) = s"i: ${src.toString}" 20 | } 21 | } 22 | 23 | class FunctionsSuite extends FunSuite with BeforeAndAfterAll { 24 | lazy val sc: SparkContext = new SparkContext("local", getClass.getSimpleName) 25 | implicit val parallel = 5 26 | 27 | test("transform func") { 28 | val src = sc.parallelize(List(1,2,4,0)) 29 | val dst = src.transform(Helpers.f _) 30 | assert(dst.output.count() === 3) 31 | assert(dst.error.count() === 1) 32 | assert(dst.ignore.count() === 1) 33 | } 34 | 35 | test("transform partial") { 36 | val src = sc.parallelize(List(1,2,4,0)) 37 | val dst = src.transform{case x => 8 / x} 38 | assert(dst.output.count() === 3) 39 | assert(dst.error.count() === 1) 40 | assert(dst.ignore.count() === 1) 41 | } 42 | 43 | test("flat transform") { 44 | val src = sc.parallelize(List(1,2,4,0)) 45 | 46 | val dst1 = src.flatTransform{x => Try{8/x}.toOption} 47 | assert(dst1.output.count() === 3) 48 | assert(dst1.error.count() === 0) 49 | assert(dst1.ignore.count() === 0) 50 | 51 | val dst2 = src.flatTransform{x => Seq(x,x)} 52 | assert(dst2.output.count() === 8) 53 | assert(dst2.error.count() === 0) 54 | assert(dst2.ignore.count() === 0) 55 | } 56 | 57 | test("save via temp and archive") { 58 | val root = fs.createTempDirectoryLocal("model_test") 59 | val data = sc.parallelize(Seq(1,2,3)) 60 | val temp = s"${root}/model_test_temp" 61 | val output = s"${root}/model_test_data" 62 | 63 | 64 | fs.delete(output) 65 | data.saveViaTempWithRename(Helpers.serializer)(output, tempPath=Option(temp)) 66 | assert(fs.exists(output)) 67 | 68 | val dst = sc.textFile(output).collect().toSet 69 | assert(dst === Set("i: 1", "i: 2", "i: 3")) 70 | 71 | intercept[FileAlreadyExistsException] { data.saveViaTempWithRename(Helpers.serializer)(output, tempPath=Option(temp)) } 72 | Thread.sleep(1000) 73 | 74 | data.saveViaTempWithReplace(Helpers.serializer)(output, tempPath=Option(temp)) 75 | assert(fs.exists(output)) 76 | 77 | fs.delete(output) 78 | data.saveViaTempWithReplace(Helpers.serializer)(output, tempPath=Option(temp)) 79 | 
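// replace also works when the output path does not exist yet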
assert(fs.exists(output)) 80 | fs.delete(output) 81 | } 82 | 83 | test("functions") { 84 | import Functions._ 85 | 86 | { 87 | val seq = Seq(1->2,4->3,2->4,4->1) 88 | val max = seq.maxBy { _._2 } 89 | val min = seq.minBy { _._2 } 90 | assert(max === 2->4) 91 | assert(min === 4->1) 92 | } 93 | 94 | { 95 | val src = Seq(1->2, 2->3).reduce(sumTuple2[Int] _) 96 | assert(src === 3->5) 97 | } 98 | 99 | { 100 | val src = Seq(1.0->2, 2.0->3).reduce(sumTuple2[Double, Int] _) 101 | assert(src === 3.0->5) 102 | } 103 | 104 | { 105 | val src = Seq((1,2,3), (3,4,5)).reduce(sumTuple3[Int] _) 106 | assert(src === (4,6,8)) 107 | } 108 | 109 | { 110 | val src = Seq((1,2.0,3L), (3,4.0,5L)).reduce(sumTuple3[Int, Double, Long] _) 111 | assert(src === (4,6.0,8L)) 112 | } 113 | 114 | { 115 | assert(Seq(1,2,3).contains(1) === true) 116 | assert(Seq(1,2,3).contains(22) === false) 117 | assert(Seq(1,2,3).contains("s") === false) 118 | 119 | assert(Seq(1,2,3).has(1) === true) 120 | assert(Seq(1,2,3).has(22) === false) 121 | 122 | assert(Array(1,2,3).contains(1) === true) 123 | assert(Array(1,2,3).contains(22) === false) 124 | assert(Array(1,2,3).contains("s") === false) 125 | 126 | assert(Array(1,2,3).has(1) === true) 127 | assert(Array(1,2,3).has(22) === false) 128 | assertTypeError("""Array(1,2,3).has("s")""") 129 | } 130 | 131 | { 132 | assert("qq ww ee".nthIndexOf(" ", 0) == 2) 133 | assert("qq ww ee rr tt".nthIndexOf(" ", 3) == 11) 134 | assert("qq ww ee rr tt".nthSplit(" ", 3) == ("qq ww ee rr", "tt")) 135 | } 136 | 137 | { 138 | import Implicits.Ops 139 | assert("1" === "1") 140 | assert("1" !== "2") 141 | } 142 | } 143 | 144 | override def afterAll() { 145 | sc.stop() 146 | } 147 | } 148 | -------------------------------------------------------------------------------- /src/main/scala/ru/retailrocket/spark/multitool/fs/package.scala: -------------------------------------------------------------------------------- 1 | package ru.retailrocket.spark.multitool 2 | 3 | import org.apache.hadoop.fs._ 4 | import org.apache.hadoop.conf.Configuration 5 | import org.apache.hadoop.io.compress.CompressionCodec 6 | import org.apache.spark.rdd.RDD 7 | import java.io._ 8 | import java.nio.file.{FileAlreadyExistsException} 9 | 10 | 11 | package object fs { 12 | val DefaultTempPath = "/tmp/spark" 13 | 14 | val DefaultCodec = classOf[org.apache.hadoop.io.compress.GzipCodec] 15 | 16 | def actionViaTemp(output: String, tempPath: Option[String]=None)(action: String => Unit)(store: (String, String) => Unit): Unit = { 17 | val tempRoot = tempPath getOrElse DefaultTempPath 18 | val temp = "%s_%d".format(tempRoot, System.currentTimeMillis) 19 | action(temp) 20 | store(temp, output) 21 | } 22 | 23 | def saveRddViaTemp[T](serializer: StringSerializer[T])(output: String, tempPath: Option[String]=None, codec: Option[Class[_ <: CompressionCodec]]=None)(store: (String, String) => Unit)(src: RDD[T]): Unit = { 24 | actionViaTemp(output, tempPath) { path => src.map(serializer.apply _).saveAsTextFile(path, codec getOrElse DefaultCodec) } (store) 25 | } 26 | 27 | def saveRddViaTempWithReplace[T](serializer: StringSerializer[T])(output: String, tempPath: Option[String]=None, codec: Option[Class[_ <: CompressionCodec]]=None)(src: RDD[T]): Unit = { 28 | saveRddViaTemp(serializer)(output, tempPath, codec)(replace _)(src) 29 | } 30 | 31 | def saveRddViaTempWithRename[T](serializer: StringSerializer[T])(output: String, tempPath: Option[String]=None, codec: Option[Class[_ <: CompressionCodec]]=None)(src: RDD[T]): Unit = { 32 | 
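// unlike replace, rename throws FileAlreadyExistsException if the output path already exists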
saveRddViaTemp(serializer)(output, tempPath, codec)(rename _)(src) 33 | } 34 | 35 | def saveStringViaTemp(output: String, tempPath: Option[String]=None, overwrite: Boolean = false )(store: (String, String) => Unit)(src: String): Unit = { 36 | actionViaTemp(output, tempPath) { path => storeHdfs(src, path, overwrite) } (store) 37 | } 38 | 39 | def saveStringViaTempWithReplace(output: String, tempPath: Option[String]=None, overwrite: Boolean = false )(src: String): Unit = { 40 | saveStringViaTemp(output, tempPath, overwrite)(replace _)(src) 41 | } 42 | 43 | def saveStringViaTempWithRename(output: String, tempPath: Option[String]=None, overwrite: Boolean = false )(src: String): Unit = { 44 | saveStringViaTemp(output, tempPath, overwrite)(rename _)(src) 45 | } 46 | 47 | def exists(dst: String): Boolean = { 48 | val fs = FileSystem.get(new Configuration()) 49 | val dstPath = new Path(dst) 50 | fs.exists(dstPath) 51 | } 52 | 53 | def delete(dst: String, recursive: Boolean=true): Unit = { 54 | val fs = FileSystem.get(new Configuration()) 55 | val dstPath = new Path(dst) 56 | if(fs.exists(dstPath)) fs.delete(dstPath, recursive) 57 | } 58 | 59 | def checkParentAndCreate(dst: Path): Unit = { 60 | val fs = FileSystem.get(new Configuration()) 61 | val parent = dst.getParent 62 | if(!fs.exists(parent)) fs.mkdirs(parent) 63 | } 64 | 65 | def checkParentAndRename(src: Path, dst: Path) { 66 | val fs = FileSystem.get(new Configuration()) 67 | checkParentAndCreate(dst) 68 | fs.rename(src, dst) 69 | } 70 | 71 | def rename(src: String, dst: String) = { 72 | val fs = FileSystem.get(new Configuration()) 73 | val dstPath = new Path(dst) 74 | val srcPath = new Path(src) 75 | if(fs.exists(dstPath)) throw new FileAlreadyExistsException(s"path already exists - ${dst}") 76 | checkParentAndRename(srcPath, dstPath) 77 | } 78 | 79 | def replace(src: String, dst: String) = { 80 | val fs = FileSystem.get(new Configuration()) 81 | val srcPath = new Path(src) 82 | val dstPath = new Path(dst) 83 | if(fs.exists(dstPath)) fs.delete(dstPath, true) 84 | checkParentAndRename(srcPath, dstPath) 85 | } 86 | 87 | def storeLocal(data: String, path: String) { 88 | val out = new FileOutputStream(path) 89 | val bytes = data.getBytes 90 | out.write(bytes, 0, bytes.size) 91 | out.close() 92 | } 93 | 94 | def storeHdfs(data: String, path: String, overwrite: Boolean = false) { 95 | val fs = FileSystem.get(new Configuration()) 96 | val out = fs.create(new Path(path), overwrite) 97 | val bytes = data.getBytes 98 | out.write(bytes, 0, bytes.size) 99 | out.close() 100 | } 101 | 102 | def storeIterableToHdfs[T](serializer: StringSerializer[T])(path: String, overwrite: Boolean = false)(data: Iterable[T]) { 103 | val fs = FileSystem.get(new Configuration()) 104 | val file = fs.create(new Path(path), overwrite) 105 | 106 | val writer = new BufferedWriter(new OutputStreamWriter(file)) 107 | data.foreach { src => 108 | writer.write(serializer(src)) 109 | writer.newLine() 110 | } 111 | 112 | writer.close() 113 | file.close() 114 | } 115 | 116 | def createTempDirectoryLocal(prefix: String): String = { 117 | val temp = File.createTempFile(prefix, "") 118 | temp.delete() 119 | temp.mkdir() 120 | temp.getPath 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /src/main/scala/ru/retailrocket/spark/multitool/RDDFunctions.scala: -------------------------------------------------------------------------------- 1 | package ru.retailrocket.spark.multitool 2 | 3 | import org.apache.spark.SparkContext 4 | import 
org.apache.spark.SparkContext._ 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.Accumulator 7 | import org.apache.spark.storage.StorageLevel 8 | import org.apache.spark.rdd._ 9 | 10 | import org.apache.hadoop.mapreduce.RecordReader 11 | import org.apache.hadoop.mapreduce.TaskAttemptContext 12 | import org.apache.hadoop.mapreduce.InputSplit 13 | import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat 14 | import org.apache.hadoop.mapred.JobConf 15 | import org.apache.hadoop.mapred.FileOutputFormat 16 | import org.apache.hadoop.io.compress.CompressionCodecFactory 17 | import org.apache.hadoop.util.LineReader 18 | import org.apache.hadoop.io._ 19 | import org.apache.hadoop.fs._ 20 | import org.apache.hadoop.io.compress._ 21 | import org.apache.hadoop.mapreduce.lib.input._ 22 | import org.apache.hadoop.conf.Configuration 23 | import org.apache.hadoop.conf.Configured 24 | 25 | import scala.reflect.ClassTag 26 | import scala.reflect._ 27 | import scala.util._ 28 | 29 | 30 | object RDDFunctions { 31 | val DefaultPersistLevel = StorageLevel.MEMORY_AND_DISK_SER 32 | 33 | case class TransformResult[T, R: ClassTag](output: RDD[R], error: RDD[Throwable], ignore: RDD[T]) { 34 | def name = classTag[R].toString 35 | def summary: String = s"${name} output ${output.count()} ignore ${ignore.count()}" 36 | def cache(): TransformResult[T,R] = 37 | TransformResult(output.cache(), error.cache(), ignore.cache()) 38 | def persist(level: StorageLevel=DefaultPersistLevel): TransformResult[T,R] = 39 | TransformResult(output.persist(level), error.persist(level), ignore.persist(level)) 40 | } 41 | 42 | def transform[T:ClassTag, R:ClassTag](f: T=>R)(src: RDD[T]): TransformResult[T,R] = { 43 | val dst = src.map{s => (s, Try{f(s)})} 44 | val output = dst.flatMap{case (_, Success(d)) => Some(d); case _ => None} 45 | val error = dst.flatMap{case (_, Failure(t)) => Some(t); case _ => None} 46 | val ignore = dst.flatMap{case (s, Failure(_)) => Some(s); case _ => None} 47 | TransformResult(output, error, ignore) 48 | } 49 | 50 | def flatTransform[T:ClassTag, R:ClassTag, C<%TraversableOnce[R]](f: T=>C)(src: RDD[T]): TransformResult[T,R] = { 51 | val dst = src.map{s => (s, Try{f(s)})} 52 | val output = dst.flatMap{case (_, Success(d)) => d; case _ => None} 53 | val error = dst.flatMap{case (_, Failure(t)) => Some(t); case _ => None} 54 | val ignore = dst.flatMap{case (s, Failure(_)) => Some(s); case _ => None} 55 | TransformResult(output, error, ignore) 56 | } 57 | 58 | case class TransformResultWithAccums[T, R: ClassTag](output: RDD[R], error: RDD[Throwable], ignore: RDD[T], inputAccum: Accumulator[Long], outputAccum: Accumulator[Long], errorAccum: Accumulator[Long]) { 59 | def name = classTag[R].toString 60 | def summary: String = s"${name} input ${inputAccum.value} output ${outputAccum.value} error ${errorAccum.value}" 61 | def errorRatio: Double = errorAccum.value.toDouble / (outputAccum.value + errorAccum.value) 62 | def cache(): TransformResult[T,R] = 63 | TransformResult(output.cache(), error.cache(), ignore.cache()) 64 | def persist(level: StorageLevel=DefaultPersistLevel): TransformResult[T,R] = 65 | TransformResult(output.persist(level), error.persist(level), ignore.persist(level)) 66 | } 67 | 68 | def transformWithAccums[T:ClassTag, R:ClassTag](f: T=>R)(src: RDD[T])(implicit sc: SparkContext): TransformResultWithAccums[T,R] = { 69 | val inputAccum = sc.accumulator(0L, "input") 70 | val outputAccum = sc.accumulator(0L, "output") 71 | val errorAccum = sc.accumulator(0L, "error") 72 | val dst = 
src.map{s => (s, Try{f(s)})} 73 | val output = dst.flatMap{ 74 | case (_, Success(d)) => 75 | inputAccum += 1 76 | outputAccum += 1 77 | Some(d) 78 | case _ => 79 | inputAccum += 1 80 | errorAccum += 1 81 | None} 82 | val error = dst.flatMap{ 83 | case (_, Failure(t)) => 84 | Some(t) 85 | case _ => 86 | None} 87 | val ignore = dst.flatMap{ 88 | case (s, Failure(_)) => 89 | Some(s) 90 | case _ => 91 | None} 92 | TransformResultWithAccums(output, error, ignore, inputAccum, outputAccum, errorAccum) 93 | } 94 | 95 | def flatTransformWithAccums[T:ClassTag, R:ClassTag, C<%TraversableOnce[R]](f: T=>C)(src: RDD[T])(implicit sc: SparkContext): TransformResultWithAccums[T,R] = { 96 | val inputAccum = sc.accumulator(0L, "input") 97 | val outputAccum = sc.accumulator(0L, "output") 98 | val errorAccum = sc.accumulator(0L, "error") 99 | val dst = src.map{s => (s, Try{f(s)})} 100 | val output = dst.flatMap{ 101 | case (_, Success(d)) => 102 | inputAccum += 1 103 | outputAccum += d.size 104 | d 105 | case _ => 106 | inputAccum += 1 107 | errorAccum += 1 108 | None} 109 | val error = dst.flatMap{ 110 | case (_, Failure(t)) => 111 | Some(t) 112 | case _ => 113 | None} 114 | val ignore = dst.flatMap{ 115 | case (s, Failure(_)) => 116 | Some(s) 117 | case _ => 118 | None} 119 | TransformResultWithAccums(output, error, ignore, inputAccum, outputAccum, errorAccum) 120 | } 121 | 122 | class KeyBasedMultipleTextOutputFormat extends MultipleTextOutputFormat[Text, Text] { 123 | override def generateFileNameForKeyValue(key: Text, value: Text, name: String): String = { 124 | key.toString + "/" + name 125 | } 126 | 127 | override def generateActualKey(key: Text, value: Text) = null 128 | } 129 | 130 | def saveAsMultipleTextFiles[T:ClassTag](src: RDD[T], root: String)(getPath: T => String)(getData: T => String): Unit = { 131 | saveAsMultipleTextFiles(src, root, None)(getPath)(getData) 132 | } 133 | 134 | def saveAsMultipleTextFiles[T:ClassTag](src: RDD[T], root: String, codec: Class[_ <: CompressionCodec])(getPath: T => String)(getData: T => String): Unit = { 135 | saveAsMultipleTextFiles(src, root, Option(codec))(getPath)(getData) 136 | } 137 | 138 | def saveAsMultipleTextFiles[T:ClassTag](src: RDD[T], root: String, codec: Option[Class[_ <: CompressionCodec]])(getPath: T => String)(getData: T => String): Unit = { 139 | val hadoopConf = new Configuration() 140 | val jobConf = new JobConf(hadoopConf) 141 | 142 | jobConf.setOutputFormat(classOf[KeyBasedMultipleTextOutputFormat]) 143 | 144 | if(codec.isDefined) { 145 | jobConf.setBoolean("mapred.output.compress", true) 146 | jobConf.setClass("mapred.output.compression.codec", codec.get, classOf[CompressionCodec]) 147 | } 148 | 149 | FileOutputFormat.setOutputPath(jobConf, new Path(root)) 150 | 151 | src 152 | .map { v => (new Text(getPath(v)), new Text(getData(v))) } 153 | .saveAsHadoopDataset(jobConf) 154 | } 155 | } 156 | -------------------------------------------------------------------------------- /src/main/scala/ru/retailrocket/spark/multitool/Loaders.scala: -------------------------------------------------------------------------------- 1 | package ru.retailrocket.spark.multitool 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.rdd._ 7 | 8 | import org.apache.hadoop.mapreduce.RecordReader 9 | import org.apache.hadoop.mapreduce.TaskAttemptContext 10 | import org.apache.hadoop.mapreduce.InputSplit 11 | import org.apache.hadoop.io.compress.CompressionCodecFactory 
12 | import org.apache.hadoop.util.LineReader 13 | import org.apache.hadoop.io._ 14 | import org.apache.hadoop.fs._ 15 | import org.apache.hadoop.mapreduce.lib.input._ 16 | import org.apache.hadoop.conf.Configuration 17 | import org.apache.hadoop.conf.Configured 18 | 19 | import scala.reflect.ClassTag 20 | import scala.reflect._ 21 | 22 | object Loaders { 23 | abstract class Filter extends Configured with PathFilter { 24 | import Filter._ 25 | 26 | private[this] var filter: Option[Traversable[Rule]] = None 27 | 28 | def check(rules: Traversable[Rule], path: Array[String]): Boolean 29 | 30 | override def accept(path: Path): Boolean = filter 31 | .map{f => check(f, path.toString.split(Path.SEPARATOR))} 32 | .getOrElse(true) 33 | 34 | override def setConf(conf: Configuration) { 35 | filter = Option(conf) 36 | .map(_.get(RulesPropName)) 37 | .map(parseRules) 38 | } 39 | } 40 | 41 | object Filter { 42 | type Rule = (String, Array[String]) 43 | 44 | val Pattern = """([^=]+)=(.+)""".r 45 | val RulesPropName = "ru.retailrocket.loaders.filter.rules" 46 | 47 | def storeRules(src: Traversable[Rule]) = src.map{ 48 | case(k, eqs) => "%s=%s".format(k, eqs.mkString(",")) 49 | }.mkString("&") 50 | 51 | def parseRules(src: String) = src.split("&").map{ 52 | case Pattern(k, eqs) => (k, eqs.split(",")) 53 | } 54 | } 55 | 56 | private class CombineTextFileWithOffsetInputFormat extends CombineFileInputFormat[LongWritable, Text] { 57 | override def createRecordReader( 58 | split: InputSplit, 59 | context: TaskAttemptContext): RecordReader[LongWritable, Text] = 60 | new CombineFileRecordReader(split.asInstanceOf[CombineFileSplit], context, classOf[CombineTextFileWithOffsetRecordReader]) 61 | } 62 | 63 | private class CombineTextFileWithOffsetRecordReader( 64 | split: CombineFileSplit, 65 | context: TaskAttemptContext, 66 | index: Integer) extends CombineTextFileRecordReader[LongWritable](split, context, index) { 67 | 68 | override def generateKey(split: CombineFileSplit, index: Integer): LongWritable = new LongWritable(split.getOffset(index)) 69 | } 70 | 71 | private class CombineTextFileWithPathInputFormat extends CombineFileInputFormat[Text, Text] { 72 | override def createRecordReader( 73 | split: InputSplit, 74 | context: TaskAttemptContext): RecordReader[Text, Text] = 75 | new CombineFileRecordReader(split.asInstanceOf[CombineFileSplit], context, classOf[CombineTextFileWithPathRecordReader]) 76 | } 77 | 78 | private class CombineTextFileWithPathRecordReader( 79 | split: CombineFileSplit, 80 | context: TaskAttemptContext, 81 | index: Integer) extends CombineTextFileRecordReader[Text](split, context, index) { 82 | 83 | override def generateKey(split: CombineFileSplit, index: Integer): Text = new Text(split.getPath(index).toString) 84 | } 85 | 86 | private abstract class CombineTextFileRecordReader[K]( 87 | split: CombineFileSplit, 88 | context: TaskAttemptContext, 89 | index: Integer) extends RecordReader[K, Text] { 90 | 91 | val conf = context.getConfiguration 92 | val path = split.getPath(index) 93 | val fs = path.getFileSystem(conf) 94 | val codec = Option(new CompressionCodecFactory(conf).getCodec(path)) 95 | 96 | val start = split.getOffset(index) 97 | val length = if(codec.isEmpty) split.getLength(index) else Long.MaxValue 98 | val end = start + length 99 | 100 | val fd = fs.open(path) 101 | if(start > 0) fd.seek(start) 102 | 103 | val fileIn = codec match { 104 | case Some(codec) => codec.createInputStream(fd) 105 | case None => fd 106 | } 107 | 108 | var reader = new LineReader(fileIn) 109 | var pos = 
start 110 | 111 | def generateKey(split: CombineFileSplit, index: Integer): K 112 | 113 | protected val key = generateKey(split, index) 114 | protected val value = new Text 115 | 116 | override def initialize(split: InputSplit, ctx: TaskAttemptContext) {} 117 | 118 | override def nextKeyValue(): Boolean = { 119 | if (pos < end) { 120 | val newSize = reader.readLine(value) 121 | pos += newSize 122 | newSize != 0 123 | } else { 124 | false 125 | } 126 | } 127 | 128 | override def close(): Unit = if (reader != null) { reader.close(); reader = null } 129 | override def getCurrentKey: K = key 130 | override def getCurrentValue: Text = value 131 | override def getProgress: Float = if (start == end) 0.0f else math.min(1.0f, (pos - start).toFloat / (end - start)) 132 | } 133 | 134 | private val defaultCombineSize = 256 135 | private val defaultCombineDelim = "\n" 136 | 137 | def generateOffsetKey(split: CombineFileSplit, index: Integer) = split.getOffset(index) 138 | 139 | class Context(val sc: SparkContext, val path: String) { 140 | 141 | val conf = new Configuration() 142 | conf.set("textinputformat.record.delimiter", defaultCombineDelim) 143 | conf.set("mapreduce.input.fileinputformat.input.dir.recursive", "true") 144 | conf.set("mapred.input.dir", path) 145 | conf.setLong("mapred.max.split.size", defaultCombineSize*1024*1024) 146 | 147 | conf.set("io.compression.codecs", s"io.sensesecure.hadoop.xz.XZCodec") 148 | 149 | def addFilterClass[T <: Filter](filterClass: Class[T]): Context = { 150 | conf.set("mapreduce.input.pathFilter.class", filterClass.getName) 151 | this 152 | } 153 | 154 | def addFilterRules(filterRules: String) = { 155 | conf.set(Filter.RulesPropName, filterRules) 156 | this 157 | } 158 | 159 | def addFilterRules(filterRules: Traversable[Filter.Rule]) = { 160 | conf.set(Filter.RulesPropName, Filter.storeRules(filterRules)) 161 | this 162 | } 163 | 164 | def addFilter[T <: Filter](filter: (Class[T], Traversable[Filter.Rule])) = this 165 | .addFilterClass(filter._1) 166 | .addFilterRules(filter._2) 167 | 168 | def setSplitSize(size: Long) = { 169 | conf.setLong("mapred.min.split.size", size*1024*1024) 170 | conf.setLong("mapred.max.split.size", size*1024*1024) 171 | this 172 | } 173 | 174 | def setRecordDelim(delim: String) = { 175 | conf.set("textinputformat.record.delimiter", delim) 176 | this 177 | } 178 | 179 | def combine(): RDD[String] = sc 180 | .newAPIHadoopRDD(conf, classOf[CombineTextInputFormat], classOf[LongWritable], classOf[Text]) 181 | .map { case (k, v) => v.toString } 182 | 183 | def combineWithPath(): RDD[(String, String)] = sc 184 | .newAPIHadoopRDD(conf, classOf[CombineTextFileWithPathInputFormat], classOf[Text], classOf[Text]) 185 | .map{case(path, data) => (path.toString, data.toString)} 186 | } 187 | 188 | def forPath(sc: SparkContext, path: String) = { 189 | new Context(sc, path) 190 | } 191 | 192 | implicit class SparkContextFunctions(val self: SparkContext) extends AnyVal { 193 | def forPath(path: String): Loaders.Context = Loaders.forPath(self, path) 194 | } 195 | } 196 | -------------------------------------------------------------------------------- /src/main/scala/ru/retailrocket/spark/multitool/package.scala: -------------------------------------------------------------------------------- 1 | package ru.retailrocket.spark 2 | 3 | import scala.reflect.ClassTag 4 | import scala.annotation.tailrec 5 | import scala.util._ 6 | 7 | import org.apache.spark.SparkContext 8 | import org.apache.spark.rdd._ 9 | import org.apache.spark.sql._ 10 | import 
org.apache.hadoop.io.compress._ 11 | 12 | 13 | package object multitool { 14 | object Functions { 15 | def tap[T:ClassTag](f: T => Unit)(o: T) = {f(o); o} 16 | def applyIf[T:ClassTag](p: Boolean)(f: T => T)(o: T): T = {if(p) f(o) else o} 17 | def applyIf[T:ClassTag](p: T=>Boolean)(f: T => T)(o: T): T = {if(p(o)) f(o) else o} 18 | def applyOption[T:ClassTag,V:ClassTag](v: Option[V])(f: (T,V) => T)(o: T): T = {if(v.isDefined) f(o, v.get) else o} 19 | def tapIf[T:ClassTag](p: Boolean)(f: T => Unit)(o: T) = {if(p) f(o); o} 20 | def use[T:ClassTag, R:ClassTag](f: T => R)(o: T): R = f(o) 21 | 22 | def maxBy[T,O<%Ordered[O]](f:T=>O)(a:T, b:T) = if(f(a)>f(b)) a else b 23 | def minBy[T,O<%Ordered[O]](f:T=>O)(a:T, b:T) = if(f(a) i 67 | case _ => f(n-1, s.indexOf(d, i+1)) 68 | } 69 | f(n, 0) 70 | } 71 | 72 | def nthSplit(s: String, d: String, n: Int): (String, String) = { 73 | val (s1, s2) = s.splitAt(nthIndexOf(s, d, n)+d.length) 74 | (s1.take(s1.length-d.length), s2) 75 | } 76 | } 77 | 78 | object PairFunctions { 79 | def flatMapValues[K,V,T](src: Traversable[(K,V)])(f: (V) => TraversableOnce[T]): Traversable[(K,T)] = 80 | src.flatMap { case (k,v) => f(v).map { r => (k, r) } } 81 | 82 | def mapValues[K,V,T](src: Traversable[(K,V)])(f: (V) => T): Traversable[(K,T)] = 83 | src.map {case (k,v) => (k, f(v)) } 84 | 85 | def groupByKey[K,V](src: Traversable[(K,V)]) = 86 | mapValues(src.groupBy { _._1 } ) { _.map { _._2 } } 87 | 88 | def reduceByKey[K,V](src: Traversable[(K,V)])(f: (V,V) => V): Traversable[(K,V)] = 89 | groupByKey(src).map { case (k, vs) => (k, vs.reduce(f)) } 90 | 91 | def countByKey[K,V](src: Traversable[(K,V)]): Map[K, Long] = 92 | reduceByKey(src.map { case (k, v) => (k, 1L) } ) { _+_ }.toMap 93 | 94 | def cogroup[K,V1,V2](src1: Traversable[(K,V1)], src2: Traversable[(K,V2)]): Traversable[(K, (Traversable[V1], Traversable[V2]))] = { 95 | val g1 = groupByKey(src1).toMap 96 | val g2 = groupByKey(src2).toMap 97 | val ks = g1.keys.toSet | g2.keys.toSet 98 | for { 99 | k <- ks.toSeq 100 | vs1 = g1.get(k).toList.flatten 101 | vs2 = g2.get(k).toList.flatten 102 | } yield (k, (vs1, vs2)) 103 | } 104 | 105 | def join[K,V1,V2](src1: Traversable[(K,V1)], src2: Traversable[(K,V2)]): Traversable[(K, (V1, V2))] = { 106 | for { 107 | (k, (vs1, vs2)) <- cogroup(src1, src2) 108 | v1 <- vs1 109 | v2 <- vs2 110 | } yield (k, (v1, v2)) 111 | } 112 | 113 | def leftOuterJoin[K,V1,V2](src1: Traversable[(K,V1)], src2: Traversable[(K,V2)]): Traversable[(K, (V1, Option[V2]))] = { 114 | for { 115 | (k, (vs1, vs2)) <- cogroup(src1, src2) 116 | v1 <- vs1 117 | v2 <- if(vs2.isEmpty) Seq(None) else vs2.map { Option(_) } 118 | } yield (k, (v1, v2)) 119 | } 120 | 121 | def rightOuterJoin[K,V1,V2](src1: Traversable[(K,V1)], src2: Traversable[(K,V2)]): Traversable[(K, (Option[V1], V2))] = { 122 | for { 123 | (k, (vs1, vs2)) <- cogroup(src1, src2) 124 | v1 <- if(vs1.isEmpty) Seq(None) else vs1.map { Option(_) } 125 | v2 <- vs2 126 | } yield (k, (v1, v2)) 127 | } 128 | 129 | def fullOuterJoin[K,V1,V2](src1: Traversable[(K,V1)], src2: Traversable[(K,V2)]): Traversable[(K, (Option[V1], Option[V2]))] = { 130 | for { 131 | (k, (vs1, vs2)) <- multitool.PairFunctions.cogroup(src1, src2) 132 | v1 <- if(vs1.isEmpty) Seq(None) else vs1.map { Option(_) } 133 | v2 <- if(vs2.isEmpty) Seq(None) else vs2.map { Option(_) } 134 | } yield (k, (v1, v2)) 135 | } 136 | } 137 | 138 | object TraversableFunctions { 139 | def countByValue[T](src: Traversable[T]): Map[T, Long] = PairFunctions.countByKey(src.map { (_, 1L) } ) 140 | } 141 | 142 | 
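// Illustrative usage of the collection helpers above (hypothetical data; results noted inline):
//   PairFunctions.reduceByKey(Seq("a" -> 1, "a" -> 2, "b" -> 3)) { _ + _ }  // ("a", 3), ("b", 3) in some order
//   PairFunctions.countByKey(Seq("a" -> 1, "a" -> 2, "b" -> 3))             // Map("a" -> 2L, "b" -> 1L)
//   PairFunctions.join(Seq(1 -> "x"), Seq(1 -> "y", 2 -> "z"))              // (1, ("x", "y"))
//   TraversableFunctions.countByValue(Seq("a", "a", "b"))                   // Map("a" -> 2L, "b" -> 1L)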
object Implicits { 143 | object Ops { 144 | implicit class MultitoolOpsImplicits[T](val self: T) { 145 | def ===(that: T): Boolean = self == that 146 | def !==(that: T): Boolean = self != that 147 | } 148 | } 149 | 150 | implicit class MultitoolFunctionsImplicits[T:ClassTag](val self: T) { 151 | def tap(f: T => Unit) = Functions.tap(f)(self) 152 | def tapIf(p: Boolean)(f: T => Unit) = Functions.tapIf(p)(f)(self) 153 | def applyIf(p: Boolean)(f: T => T): T = Functions.applyIf(p)(f)(self) 154 | def applyOption[V:ClassTag](v: Option[V])(f: (T,V) => T): T = Functions.applyOption(v)(f)(self) 155 | def use[R: ClassTag](f: T => R): R = Functions.use(f)(self) 156 | } 157 | 158 | implicit class MultitoolPairFunctionsImplicits[K:ClassTag, V:ClassTag](val self: Traversable[(K,V)]) { 159 | def flatMapValues[T](f: (V) => TraversableOnce[T]) = PairFunctions.flatMapValues(self)(f) 160 | def mapValues[T](f: (V) => T) = PairFunctions.mapValues(self)(f) 161 | def groupByKey() = PairFunctions.groupByKey(self) 162 | def reduceByKey(f: (V,V) => V) = PairFunctions.reduceByKey(self)(f) 163 | def countByKey() = PairFunctions.countByKey(self) 164 | def cogroup[V2](src2: Traversable[(K,V2)]) = PairFunctions.cogroup(self, src2) 165 | def join[V2](src2: Traversable[(K,V2)]) = PairFunctions.join(self, src2) 166 | def leftOuterJoin[V2](src2: Traversable[(K,V2)]) = PairFunctions.leftOuterJoin(self, src2) 167 | def rightOuterJoin[V2](src2: Traversable[(K,V2)]) = PairFunctions.rightOuterJoin(self, src2) 168 | def fullOuterJoin[V2](src2: Traversable[(K,V2)]) = PairFunctions.fullOuterJoin(self, src2) 169 | } 170 | 171 | implicit class MultitoolTraversableFunctionsImplicits[T](val self: Traversable[T]) extends AnyVal { 172 | def countByValue() = TraversableFunctions.countByValue(self) 173 | } 174 | 175 | implicit class MultitoolSetFunctionsImplicits[T](val self: Set[T]) extends AnyVal { 176 | def has(t: T) = self.contains(t) 177 | } 178 | 179 | implicit class MultitoolSeqFunctionsImplicits[T](val self: Seq[T]) extends AnyVal { 180 | def has(t: T) = self.contains(t) 181 | } 182 | 183 | implicit class MultitoolArrayFunctionsImplicits[T](val self: Array[T]) extends AnyVal { 184 | def has(t: T) = self.contains(t) 185 | } 186 | 187 | implicit class MultitoolRDDFunctionsImplicits[T:ClassTag](val self: RDD[T]) { 188 | def transform[R:ClassTag](f: T=>R): RDDFunctions.TransformResult[T,R] = { 189 | RDDFunctions.transform(f)(self) 190 | } 191 | def flatTransform[R:ClassTag, C<%TraversableOnce[R]](f: T=>C): RDDFunctions.TransformResult[T,R] = { 192 | RDDFunctions.flatTransform(f)(self) 193 | } 194 | def transformWithAccums[R:ClassTag](f: T=>R)(implicit sc: SparkContext): RDDFunctions.TransformResultWithAccums[T,R] = { 195 | RDDFunctions.transformWithAccums(f)(self) 196 | } 197 | def flatTransformWithAccums[R:ClassTag, C<%TraversableOnce[R]](f: T=>C)(implicit sc: SparkContext): RDDFunctions.TransformResultWithAccums[T,R] = { 198 | RDDFunctions.flatTransformWithAccums(f)(self) 199 | } 200 | def saveViaTemp(serializer: StringSerializer[T])(output: String, tempPath: Option[String]=None, codec: Option[Class[_ <: CompressionCodec]]=None)(store: (String, String) => Unit): Unit = { 201 | fs.saveRddViaTemp(serializer)(output, tempPath, codec)(store)(self) 202 | } 203 | def saveViaTempWithReplace(serializer: StringSerializer[T])(output: String, tempPath: Option[String]=None, codec: Option[Class[_ <: CompressionCodec]]=None): Unit = { 204 | fs.saveRddViaTempWithReplace(serializer)(output, tempPath, codec)(self) 205 | } 206 | def 
saveViaTempWithRename(serializer: StringSerializer[T])(output: String, tempPath: Option[String]=None, codec: Option[Class[_ <: CompressionCodec]]=None): Unit = { 207 | fs.saveRddViaTempWithRename(serializer)(output, tempPath, codec)(self) 208 | } 209 | def saveAsMultipleTextFiles(root: String)(getPath: T => String)(getData: T => String): Unit = { 210 | RDDFunctions.saveAsMultipleTextFiles(self, root)(getPath)(getData) 211 | } 212 | def saveAsMultipleTextFiles(root: String, codec: Class[_ <: CompressionCodec])(getPath: T => String)(getData: T => String): Unit = { 213 | RDDFunctions.saveAsMultipleTextFiles(self, root, codec)(getPath)(getData) 214 | } 215 | def saveAsMultipleTextFiles(root: String, codec: Option[Class[_ <: CompressionCodec]])(getPath: T => String)(getData: T => String): Unit = { 216 | RDDFunctions.saveAsMultipleTextFiles(self, root, codec)(getPath)(getData) 217 | } 218 | } 219 | 220 | implicit class MultitoolDataFrameFunctionsImplicits(val self: DataFrame) { 221 | def transform[R:ClassTag](f: Row=>Option[R]): RDDFunctions.TransformResult[Row,R] = { 222 | DataFrameFunctions.transform(f)(self) 223 | } 224 | def transform[R:ClassTag](f: Row=>R)(implicit d: DummyImplicit): RDDFunctions.TransformResult[Row,R] = { 225 | DataFrameFunctions.transform(f)(self) 226 | } 227 | } 228 | 229 | implicit class RichBoolean(val self: Boolean) extends AnyVal { 230 | def toInt = if(self) 1 else 0 231 | def toDouble = if(self) 1.0 else 0.0 232 | } 233 | 234 | implicit class RichInt(val self: Int) extends AnyVal { 235 | def toBoolean = self == 1 236 | } 237 | 238 | implicit class RichString(val self: String) extends AnyVal { 239 | def nthIndexOf(d: String, n: Int): Int = { 240 | Functions.nthIndexOf(self, d, n) 241 | } 242 | def nthSplit(d: String, n: Int): (String, String) = { 243 | Functions.nthSplit(self, d, n) 244 | } 245 | } 246 | } 247 | 248 | trait StringSerializer[T] extends Serializable { 249 | def apply(src: T): String 250 | } 251 | } 252 | --------------------------------------------------------------------------------
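A minimal usage sketch for the algs.cosine and HashFNV helpers above, mirroring AlgsSuite; the local master and the sample (session, item, weight) triples are illustrative, not part of the library:
```
import org.apache.spark.SparkContext

import ru.retailrocket.spark.multitool.{algs, HashFNV}

object CosineExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext("local", "CosineExample")
    implicit val parallel = 5 // partition count required by algs.cosine

    // (session, item, weight) triples; cosine returns (itemA, itemB, similarity) in both directions
    val data = sc.parallelize(Seq((1, 1L, 0.5), (1, 2L, 0.3), (2, 1L, 0.6), (2, 2L, 0.2)))
    algs.cosine(data).collect().foreach(println)

    // FNV hash of a string, e.g. for stable bucketing of keys
    println(HashFNV.hash("some-key"))

    sc.stop()
  }
}
```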