├── project └── build.properties ├── src ├── test │ ├── resources │ │ ├── combine │ │ │ ├── file_1.csv │ │ │ └── file_2.csv │ │ └── archive │ │ │ ├── test_bzip2.txt.bz2 │ │ │ ├── test_gzip.txt.gz │ │ │ └── test_lzma.txt.xz │ └── scala │ │ └── ru │ │ └── retailrocket │ │ └── spark │ │ └── multitool │ │ ├── AlgsSuite.scala │ │ ├── LoadersSuite.scala │ │ └── FunctionsSuite.scala └── main │ └── scala │ └── ru │ └── retailrocket │ └── spark │ └── multitool │ ├── HashFNV.scala │ ├── Config.scala │ ├── algs │ └── package.scala │ ├── DataFrameFunctions.scala │ ├── fs │ └── package.scala │ ├── RDDFunctions.scala │ ├── Loaders.scala │ └── package.scala ├── example ├── src │ └── main │ │ ├── resource │ │ ├── file_1.txt │ │ └── file_2.txt │ │ └── scala │ │ └── MyProject.scala └── build.sbt ├── .gitignore ├── LICENSE └── README.md /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version = 1.3.12 2 | -------------------------------------------------------------------------------- /src/test/resources/combine/file_1.csv: -------------------------------------------------------------------------------- 1 | 1 2 | 2 3 | -------------------------------------------------------------------------------- /src/test/resources/combine/file_2.csv: -------------------------------------------------------------------------------- 1 | 3 2 | 4 3 | -------------------------------------------------------------------------------- /example/src/main/resource/file_1.txt: -------------------------------------------------------------------------------- 1 | 1 2 | 2 3 | 3 4 | -------------------------------------------------------------------------------- /example/src/main/resource/file_2.txt: -------------------------------------------------------------------------------- 1 | 4 2 | 5 3 | 6 4 | -------------------------------------------------------------------------------- /src/test/resources/archive/test_bzip2.txt.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RetailRocket/SparkMultiTool/HEAD/src/test/resources/archive/test_bzip2.txt.bz2 -------------------------------------------------------------------------------- /src/test/resources/archive/test_gzip.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RetailRocket/SparkMultiTool/HEAD/src/test/resources/archive/test_gzip.txt.gz -------------------------------------------------------------------------------- /src/test/resources/archive/test_lzma.txt.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RetailRocket/SparkMultiTool/HEAD/src/test/resources/archive/test_lzma.txt.xz -------------------------------------------------------------------------------- /example/build.sbt: -------------------------------------------------------------------------------- 1 | name := "MyProject" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.10.4" 6 | 7 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.1.0" 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | 4 | # sbt specific 5 | .cache/ 6 | .history/ 7 | .lib/ 8 | dist/* 9 | target/ 10 | lib_managed/ 11 | src_managed/ 12 | project/boot/ 13 | project/plugins/project/ 14 | 15 | # Scala-IDE specific 16 | 
.scala_dependencies 17 | .worksheet 18 | -------------------------------------------------------------------------------- /src/main/scala/ru/retailrocket/spark/multitool/HashFNV.scala: -------------------------------------------------------------------------------- 1 | package ru.retailrocket.spark.multitool 2 | 3 | 4 | object HashFNV { 5 | val FNV_32_INIT: Int = 33554467 6 | val FNV_32_PRIME: Int = 0x01000193 7 | 8 | def hash(s: String, init: Int=FNV_32_INIT): Int = { 9 | var hval = init 10 | val bytes = s.getBytes 11 | for(i <- bytes) { 12 | hval *= FNV_32_PRIME 13 | hval ^= i 14 | } 15 | hval 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /example/src/main/scala/MyProject.scala: -------------------------------------------------------------------------------- 1 | import org.apache.spark.SparkContext 2 | import org.apache.spark.SparkContext._ 3 | import org.apache.spark.SparkConf 4 | 5 | import ru.retailrocket.spark.multitool.Loaders._ 6 | 7 | object MyProject { 8 | def main(args: Array[String]) { 9 | val sc = new SparkContext("local", "MyProject") 10 | val sessions = sc.combineTextFile("file://" + getClass.getResource("src/main/resource").getFile) 11 | println(s"sessions count ${sessions.count()}") 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /src/main/scala/ru/retailrocket/spark/multitool/Config.scala: -------------------------------------------------------------------------------- 1 | package ru.retailrocket.spark.multitool 2 | 3 | import com.typesafe.config.{ Config => TypeSafeConfig } 4 | import org.apache.spark.{SparkContext, SparkConf} 5 | 6 | object Config { 7 | def flatConfig(config: TypeSafeConfig): Seq[(String, AnyRef)] = { 8 | import scala.collection.convert.WrapAsScala._ 9 | 10 | config.entrySet().map { entry => 11 | val k = entry.getKey 12 | val v = entry.getValue.unwrapped() 13 | (k, v) 14 | }.toSeq 15 | } 16 | 17 | def asSparkConfig(config: TypeSafeConfig): SparkConf = { 18 | val sc = new SparkConf() 19 | 20 | for ((key, value) <- flatConfig(config)) { 21 | sc.set(key, value.toString) 22 | } 23 | 24 | sc 25 | } 26 | } -------------------------------------------------------------------------------- /src/test/scala/ru/retailrocket/spark/multitool/AlgsSuite.scala: -------------------------------------------------------------------------------- 1 | package ru.retailrocket.spark.multitool 2 | 3 | import org.scalatest._ 4 | 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.SparkContext._ 7 | import org.apache.spark.SparkConf 8 | import org.apache.spark.rdd._ 9 | 10 | 11 | class AlgsSuite extends FunSuite with BeforeAndAfterAll { 12 | lazy val sc: SparkContext = new SparkContext("local", getClass.getSimpleName) 13 | implicit val parallel = 5 14 | 15 | test("cosine") { 16 | val data = sc.parallelize(List[(Int, Long, Double)]( 17 | (1, 1L, 0.5), 18 | (1, 2L, 0.3), 19 | (2, 1L, 0.6), 20 | (2, 2L, 0.2), 21 | (3, 1L, 0.5), 22 | (3, 3L, 0.2), 23 | (4, 1L, 0.1))) 24 | 25 | val res = algs.cosine(data).collect.sorted 26 | 27 | assert(res(0) === (1L,2L,0.8028463951575711), "((0.5*0.3)+(0.6*0.2)) / (sqrt(0.5^2+0.6^2+0.5^2+0.1^2)*sqrt(0.3^2+0.2^2))") 28 | assert(res(1) === (1L,3L,0.5360562674188974)) 29 | } 30 | 31 | override def afterAll() { 32 | sc.stop() 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 
(MIT) 2 | 3 | Copyright (c) 2014 Retail Rocket 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /src/main/scala/ru/retailrocket/spark/multitool/algs/package.scala: -------------------------------------------------------------------------------- 1 | package ru.retailrocket.spark.multitool 2 | 3 | import org.apache.spark.SparkContext._ 4 | import org.apache.spark.storage.StorageLevel 5 | import org.apache.spark.rdd.RDD 6 | 7 | import scala.util._ 8 | import scala.reflect.ClassTag 9 | 10 | package object algs { 11 | def cosine[Session : ClassTag, Item <% Ordered[Item] : ClassTag](src: RDD[(Session, Item, Double)])(implicit parallel: Int): RDD[(Item, Item, Double)] = { 12 | val ab = src 13 | .map{case(session, item, weight) => (session, (item, weight))} 14 | .groupByKey(parallel) 15 | .flatMap{case(prop, items) => 16 | for((itemA, weightA) <- items; (itemB, weightB) <- items if itemA < itemB) 17 | yield ((itemA, itemB), weightA * weightB)} 18 | .reduceByKey(_+_, parallel) 19 | .map{case((itemA, itemB), weight) => (itemA, itemB, weight)} 20 | 21 | val a = src 22 | .map{case(session, item, weight) => (item, weight * weight)} 23 | .reduceByKey(_+_, parallel) 24 | .map{case(item, weight) => (item, math.sqrt(weight))} 25 | .persist(StorageLevel.MEMORY_AND_DISK_SER) 26 | 27 | val cosineL = ab 28 | .map{case(itemA, itemB, weightAB) => (itemA, (itemB, weightAB))} 29 | .join(a, parallel) 30 | .map{case(itemA, ((itemB, weightAB), weightA)) => (itemB, (itemA, weightAB, weightA))} 31 | .join(a, parallel) 32 | .map{case(itemB, ((itemA, weightAB, weightA), weightB)) => (itemA, itemB, weightAB / (weightA * weightB))} 33 | 34 | val cosineR = cosineL 35 | .map{case(itemA, itemB, weight) => (itemB, itemA, weight)} 36 | 37 | cosineL union cosineR 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/main/scala/ru/retailrocket/spark/multitool/DataFrameFunctions.scala: -------------------------------------------------------------------------------- 1 | package ru.retailrocket.spark.multitool 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.rdd._ 7 | import org.apache.spark.sql._ 8 | 9 | import org.apache.hadoop.mapreduce.RecordReader 10 | import org.apache.hadoop.mapreduce.TaskAttemptContext 11 | import 
org.apache.hadoop.mapreduce.InputSplit 12 | import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat 13 | import org.apache.hadoop.mapred.JobConf 14 | import org.apache.hadoop.mapred.FileOutputFormat 15 | import org.apache.hadoop.io.compress.CompressionCodecFactory 16 | import org.apache.hadoop.util.LineReader 17 | import org.apache.hadoop.io._ 18 | import org.apache.hadoop.fs._ 19 | import org.apache.hadoop.io.compress._ 20 | import org.apache.hadoop.mapreduce.lib.input._ 21 | import org.apache.hadoop.conf.Configuration 22 | import org.apache.hadoop.conf.Configured 23 | 24 | import scala.reflect.ClassTag 25 | import scala.reflect._ 26 | import scala.util._ 27 | 28 | 29 | object DataFrameFunctions { 30 | def transform[R:ClassTag](f: Row=>Option[R])(src: DataFrame): RDDFunctions.TransformResult[Row,R] = { 31 | val dst = src.rdd.map{s => (s, Try{f(s)})} 32 | val output = dst.flatMap{case (_, Success(d)) => d; case _ => None} 33 | val error = dst.flatMap{case (_, Failure(t)) => Some(t); case _ => None} 34 | val ignore = dst.flatMap{case (s, Failure(_)) => Some(s); case _ => None} 35 | RDDFunctions.TransformResult(output, error, ignore) 36 | } 37 | 38 | def transform[R:ClassTag](f: Row=>R)(src: DataFrame)(implicit d: DummyImplicit): RDDFunctions.TransformResult[Row,R] = { 39 | val dst = src.rdd.map{s => (s, Try{f(s)})} 40 | val output = dst.flatMap{case (_, Success(d)) => Some(d); case _ => None} 41 | val error = dst.flatMap{case (_, Failure(t)) => Some(t); case _ => None} 42 | val ignore = dst.flatMap{case (s, Failure(_)) => Some(s); case _ => None} 43 | RDDFunctions.TransformResult(output, error, ignore) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/test/scala/ru/retailrocket/spark/multitool/LoadersSuite.scala: -------------------------------------------------------------------------------- 1 | package ru.retailrocket.spark.multitool 2 | 3 | import org.scalatest._ 4 | 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.SparkContext._ 7 | import org.apache.spark.SparkConf 8 | import org.apache.spark.rdd._ 9 | 10 | import Loaders._ 11 | import Loaders.Filter 12 | 13 | 14 | class FileNameEqualityFilter extends Filter { 15 | def check(rules: Traversable[Filter.Rule], path: Array[String]) = { 16 | rules.forall{ 17 | case(k, Array(eq)) => 18 | k match { 19 | case "file" => eq == path.last 20 | case _ => false 21 | } 22 | } 23 | } 24 | } 25 | 26 | class LoadersSuite extends FunSuite with BeforeAndAfterAll { 27 | lazy val sc: SparkContext = new SparkContext("local", getClass.getSimpleName) 28 | def path(file: String) = getClass.getResource("/" + file).getFile 29 | 30 | test("forPathAndCombine") { 31 | val output = sc.forPath(path("combine")).combine().collect.sorted 32 | assert(output.deep == Array("1","2","3","4").deep) 33 | } 34 | 35 | test("forPathAndCombineWithPath") { 36 | val output = sc.forPath(path("combine")).combineWithPath().collect.sorted 37 | assert(output(1)._1.endsWith("file_1.csv")) 38 | } 39 | 40 | test("forPathWithFilter") { 41 | val output = sc.forPath(path("combine")+"/*") 42 | .addFilter(classOf[FileNameEqualityFilter], Seq("file" -> Array("file_2.csv"))) 43 | .combine().collect.sorted 44 | assert(output.deep == Array("3","4").deep) 45 | } 46 | 47 | test("compression") { 48 | { 49 | val actual = sc.forPath(path("archive")+"/test_gzip.txt.gz").combine().collect().head 50 | assert(actual === "gzip") 51 | } 52 | 53 | { 54 | val actual = 
sc.forPath(path("archive")+"/test_bzip2.txt.bz2").combine().collect().head 55 | assert(actual === "bzip2") 56 | } 57 | 58 | { 59 | val actual = sc.forPath(path("archive")+"/test_lzma.txt.xz").combine().collect().head 60 | assert(actual === "lzma") 61 | } 62 | } 63 | 64 | override def afterAll() { 65 | sc.stop() 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | SparkMultiTool 2 | ============== 3 | 4 | Tools for spark which we use on the daily basis. 5 | It contains: 6 | * Loader of HDFS files with combining small files (uses Hadoop CombineTextInputFormat/CombineFileInputFormat) 7 | * Future: cosine calculation 8 | * Future: quantile calculation 9 | 10 | #Requirements 11 | This library was succeffully tested with Scala 2.11.8 and Spark 2.3.1. 12 | You should install SBT: 13 | * [SBT tool](www.scala-sbt.org/download.html) 14 | 15 | 16 | #Build 17 | This build based on Scala 2.11.8 and Spark 2.3.1. Edit build.sbt If you have another environment. 18 | 19 | For building install sbt, launch a terminal, change current to sparkmultitool directory and launch a command: 20 | 21 | ``` 22 | sbt package 23 | sbt test 24 | ``` 25 | Next copy spark-multitool*.jar from ./target/scala-2.11/... to the lib folder of your sbt project. 26 | 27 | #Usage 28 | Include spark-multitool*.jar in --jars path in spark-submit like this: 29 | ``` 30 | spark-submit --master local --executor-memory 2G --class "Tst" --num-executors 1 --executor-cores 1 --jars lib/spark-multitool_2.11-0.9.jar target/scala-2.11/tst_2.11-0.1.jar 31 | 32 | ``` 33 | See examples folder. 34 | 35 | ##Loaders 36 | **ru.retailrocket.spark.multitool.Loaders** - combine input files before mappers by means of Hadoop CombineTextInputFormat/CombineFileInputFormat. In our case it reduced the number of mappers from 100000 to approx 3000 and made job significantly faster. 37 | Parameters: 38 | * **path** - path to the files (as in spark.textFile) 39 | * **size** - size of target partition in Megabytes. Optimal value equals to a HDFS block size 40 | * **delim** - line delimiters 41 | 42 | This example loads files from "/test/*" and combine them in mappers. 43 | ``` 44 | import org.apache.spark.SparkConf 45 | import org.apache.spark.SparkContext 46 | import org.apache.spark.SparkContext._ 47 | 48 | import ru.retailrocket.spark.multitool.Loaders._ 49 | 50 | object Tst { 51 | def main(args: Array[String]) = { 52 | val conf = new SparkConf().setMaster("local").setAppName("My App") 53 | val sc = new SparkContext("local", "My App") 54 | 55 | val path = "file:///test/*" 56 | 57 | { 58 | val sessions = sc 59 | .forPath(path) 60 | .setSplitSize(256) // optional 61 | .setRecordDelim("\n") // optional 62 | .combine() 63 | println(sessions.count()) 64 | } 65 | 66 | { 67 | // you can also get RDD[(String, String)] with (file, line) 68 | val sessions = sc 69 | .forPath(path) 70 | .combineWithPath() 71 | println(sessions.count()) 72 | 73 | { 74 | // or add path filter, e.g. 
for partitioning 75 | class FileNameEqualityFilter extends Filter { 76 | def check(rules: Traversable[Filter.Rule], path: Array[String]) = { 77 | rules.forall { 78 | case(k, Array(eq)) => 79 | k match { 80 | case "file" => eq == path.last 81 | case _ => false 82 | } 83 | } 84 | } 85 | } 86 | val sessions = sc 87 | .forPath(path) 88 | .addFilter(classOf[FileNameEqualityFilter], Seq("file" -> Array("file.name"))) 89 | .combine() 90 | println(sessions.count()) 91 | } 92 | } 93 | } 94 | } 95 | ``` 96 | 97 | ##Algorithms 98 | 99 | **ru.retailrocket.spark.multitool.algs.cosine** - cosine similarity function. 100 | 101 | ##Utility 102 | 103 | **ru.retailrocket.spark.multitool.HashFNV** - simple, but useful hash function. Original idea from org.apache.pig.piggybank.evaluation.string.HashFNV 104 | -------------------------------------------------------------------------------- /src/test/scala/ru/retailrocket/spark/multitool/FunctionsSuite.scala: -------------------------------------------------------------------------------- 1 | package ru.retailrocket.spark.multitool 2 | 3 | import org.scalatest._ 4 | import java.nio.file.FileAlreadyExistsException 5 | 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.SparkContext._ 8 | import org.apache.spark.SparkConf 9 | import org.apache.spark.rdd._ 10 | import scala.util._ 11 | 12 | import Implicits._ 13 | 14 | 15 | object Helpers { 16 | def f(x:Int): Int = 8 / x 17 | 18 | val serializer = new StringSerializer[Int] { 19 | override def apply(src: Int) = s"i: ${src.toString}" 20 | } 21 | } 22 | 23 | class FunctionsSuite extends FunSuite with BeforeAndAfterAll { 24 | lazy val sc: SparkContext = new SparkContext("local", getClass.getSimpleName) 25 | implicit val parallel = 5 26 | 27 | test("transform func") { 28 | val src = sc.parallelize(List(1,2,4,0)) 29 | val dst = src.transform(Helpers.f _) 30 | assert(dst.output.count() === 3) 31 | assert(dst.error.count() === 1) 32 | assert(dst.ignore.count() === 1) 33 | } 34 | 35 | test("transform partial") { 36 | val src = sc.parallelize(List(1,2,4,0)) 37 | val dst = src.transform{case x => 8 / x} 38 | assert(dst.output.count() === 3) 39 | assert(dst.error.count() === 1) 40 | assert(dst.ignore.count() === 1) 41 | } 42 | 43 | test("flat transform") { 44 | val src = sc.parallelize(List(1,2,4,0)) 45 | 46 | val dst1 = src.flatTransform{x => Try{8/x}.toOption} 47 | assert(dst1.output.count() === 3) 48 | assert(dst1.error.count() === 0) 49 | assert(dst1.ignore.count() === 0) 50 | 51 | val dst2 = src.flatTransform{x => Seq(x,x)} 52 | assert(dst2.output.count() === 8) 53 | assert(dst2.error.count() === 0) 54 | assert(dst2.ignore.count() === 0) 55 | } 56 | 57 | test("save via temp and archive") { 58 | val root = fs.createTempDirectoryLocal("model_test") 59 | val data = sc.parallelize(Seq(1,2,3)) 60 | val temp = s"${root}/model_test_temp" 61 | val output = s"${root}/model_test_data" 62 | 63 | 64 | fs.delete(output) 65 | data.saveViaTempWithRename(Helpers.serializer)(output, tempPath=Option(temp)) 66 | assert(fs.exists(output)) 67 | 68 | val dst = sc.textFile(output).collect().toSet 69 | assert(dst === Set("i: 1", "i: 2", "i: 3")) 70 | 71 | intercept[FileAlreadyExistsException] { data.saveViaTempWithRename(Helpers.serializer)(output, tempPath=Option(temp)) } 72 | Thread.sleep(1000) 73 | 74 | data.saveViaTempWithReplace(Helpers.serializer)(output, tempPath=Option(temp)) 75 | assert(fs.exists(output)) 76 | 77 | fs.delete(output) 78 | data.saveViaTempWithReplace(Helpers.serializer)(output, tempPath=Option(temp)) 79 | 
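// replace also works when the output path does not exist yet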
assert(fs.exists(output)) 80 | fs.delete(output) 81 | } 82 | 83 | test("functions") { 84 | import Functions._ 85 | 86 | { 87 | val seq = Seq(1->2,4->3,2->4,4->1) 88 | val max = seq.maxBy { _._2 } 89 | val min = seq.minBy { _._2 } 90 | assert(max === 2->4) 91 | assert(min === 4->1) 92 | } 93 | 94 | { 95 | val src = Seq(1->2, 2->3).reduce(sumTuple2[Int] _) 96 | assert(src === 3->5) 97 | } 98 | 99 | { 100 | val src = Seq(1.0->2, 2.0->3).reduce(sumTuple2[Double, Int] _) 101 | assert(src === 3.0->5) 102 | } 103 | 104 | { 105 | val src = Seq((1,2,3), (3,4,5)).reduce(sumTuple3[Int] _) 106 | assert(src === (4,6,8)) 107 | } 108 | 109 | { 110 | val src = Seq((1,2.0,3L), (3,4.0,5L)).reduce(sumTuple3[Int, Double, Long] _) 111 | assert(src === (4,6.0,8L)) 112 | } 113 | 114 | { 115 | assert(Seq(1,2,3).contains(1) === true) 116 | assert(Seq(1,2,3).contains(22) === false) 117 | assert(Seq(1,2,3).contains("s") === false) 118 | 119 | assert(Seq(1,2,3).has(1) === true) 120 | assert(Seq(1,2,3).has(22) === false) 121 | 122 | assert(Array(1,2,3).contains(1) === true) 123 | assert(Array(1,2,3).contains(22) === false) 124 | assert(Array(1,2,3).contains("s") === false) 125 | 126 | assert(Array(1,2,3).has(1) === true) 127 | assert(Array(1,2,3).has(22) === false) 128 | assertTypeError("""Array(1,2,3).has("s")""") 129 | } 130 | 131 | { 132 | assert("qq ww ee".nthIndexOf(" ", 0) == 2) 133 | assert("qq ww ee rr tt".nthIndexOf(" ", 3) == 11) 134 | assert("qq ww ee rr tt".nthSplit(" ", 3) == ("qq ww ee rr", "tt")) 135 | } 136 | 137 | { 138 | import Implicits.Ops 139 | assert("1" === "1") 140 | assert("1" !== "2") 141 | } 142 | } 143 | 144 | override def afterAll() { 145 | sc.stop() 146 | } 147 | } 148 | -------------------------------------------------------------------------------- /src/main/scala/ru/retailrocket/spark/multitool/fs/package.scala: -------------------------------------------------------------------------------- 1 | package ru.retailrocket.spark.multitool 2 | 3 | import org.apache.hadoop.fs._ 4 | import org.apache.hadoop.conf.Configuration 5 | import org.apache.hadoop.io.compress.CompressionCodec 6 | import org.apache.spark.rdd.RDD 7 | import java.io._ 8 | import java.nio.file.{FileAlreadyExistsException} 9 | 10 | 11 | package object fs { 12 | val DefaultTempPath = "/tmp/spark" 13 | 14 | val DefaultCodec = classOf[org.apache.hadoop.io.compress.GzipCodec] 15 | 16 | def actionViaTemp(output: String, tempPath: Option[String]=None)(action: String => Unit)(store: (String, String) => Unit): Unit = { 17 | val tempRoot = tempPath getOrElse DefaultTempPath 18 | val temp = "%s_%d".format(tempRoot, System.currentTimeMillis) 19 | action(temp) 20 | store(temp, output) 21 | } 22 | 23 | def saveRddViaTemp[T](serializer: StringSerializer[T])(output: String, tempPath: Option[String]=None, codec: Option[Class[_ <: CompressionCodec]]=None)(store: (String, String) => Unit)(src: RDD[T]): Unit = { 24 | actionViaTemp(output, tempPath) { path => src.map(serializer.apply _).saveAsTextFile(path, codec getOrElse DefaultCodec) } (store) 25 | } 26 | 27 | def saveRddViaTempWithReplace[T](serializer: StringSerializer[T])(output: String, tempPath: Option[String]=None, codec: Option[Class[_ <: CompressionCodec]]=None)(src: RDD[T]): Unit = { 28 | saveRddViaTemp(serializer)(output, tempPath, codec)(replace _)(src) 29 | } 30 | 31 | def saveRddViaTempWithRename[T](serializer: StringSerializer[T])(output: String, tempPath: Option[String]=None, codec: Option[Class[_ <: CompressionCodec]]=None)(src: RDD[T]): Unit = { 32 | 
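// unlike replace, rename throws FileAlreadyExistsException if the output path already exists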
saveRddViaTemp(serializer)(output, tempPath, codec)(rename _)(src) 33 | } 34 | 35 | def saveStringViaTemp(output: String, tempPath: Option[String]=None, overwrite: Boolean = false )(store: (String, String) => Unit)(src: String): Unit = { 36 | actionViaTemp(output, tempPath) { path => storeHdfs(src, path, overwrite) } (store) 37 | } 38 | 39 | def saveStringViaTempWithReplace(output: String, tempPath: Option[String]=None, overwrite: Boolean = false )(src: String): Unit = { 40 | saveStringViaTemp(output, tempPath, overwrite)(replace _)(src) 41 | } 42 | 43 | def saveStringViaTempWithRename(output: String, tempPath: Option[String]=None, overwrite: Boolean = false )(src: String): Unit = { 44 | saveStringViaTemp(output, tempPath, overwrite)(rename _)(src) 45 | } 46 | 47 | def exists(dst: String): Boolean = { 48 | val fs = FileSystem.get(new Configuration()) 49 | val dstPath = new Path(dst) 50 | fs.exists(dstPath) 51 | } 52 | 53 | def delete(dst: String, recursive: Boolean=true): Unit = { 54 | val fs = FileSystem.get(new Configuration()) 55 | val dstPath = new Path(dst) 56 | if(fs.exists(dstPath)) fs.delete(dstPath, recursive) 57 | } 58 | 59 | def checkParentAndCreate(dst: Path): Unit = { 60 | val fs = FileSystem.get(new Configuration()) 61 | val parent = dst.getParent 62 | if(!fs.exists(parent)) fs.mkdirs(parent) 63 | } 64 | 65 | def checkParentAndRename(src: Path, dst: Path) { 66 | val fs = FileSystem.get(new Configuration()) 67 | checkParentAndCreate(dst) 68 | fs.rename(src, dst) 69 | } 70 | 71 | def rename(src: String, dst: String) = { 72 | val fs = FileSystem.get(new Configuration()) 73 | val dstPath = new Path(dst) 74 | val srcPath = new Path(src) 75 | if(fs.exists(dstPath)) throw new FileAlreadyExistsException(s"path already exists - ${dst}") 76 | checkParentAndRename(srcPath, dstPath) 77 | } 78 | 79 | def replace(src: String, dst: String) = { 80 | val fs = FileSystem.get(new Configuration()) 81 | val srcPath = new Path(src) 82 | val dstPath = new Path(dst) 83 | if(fs.exists(dstPath)) fs.delete(dstPath, true) 84 | checkParentAndRename(srcPath, dstPath) 85 | } 86 | 87 | def storeLocal(data: String, path: String) { 88 | val out = new FileOutputStream(path) 89 | val bytes = data.getBytes 90 | out.write(bytes, 0, bytes.size) 91 | out.close() 92 | } 93 | 94 | def storeHdfs(data: String, path: String, overwrite: Boolean = false) { 95 | val fs = FileSystem.get(new Configuration()) 96 | val out = fs.create(new Path(path), overwrite) 97 | val bytes = data.getBytes 98 | out.write(bytes, 0, bytes.size) 99 | out.close() 100 | } 101 | 102 | def storeIterableToHdfs[T](serializer: StringSerializer[T])(path: String, overwrite: Boolean = false)(data: Iterable[T]) { 103 | val fs = FileSystem.get(new Configuration()) 104 | val file = fs.create(new Path(path), overwrite) 105 | 106 | val writer = new BufferedWriter(new OutputStreamWriter(file)) 107 | data.foreach { src => 108 | writer.write(serializer(src)) 109 | writer.newLine() 110 | } 111 | 112 | writer.close() 113 | file.close() 114 | } 115 | 116 | def createTempDirectoryLocal(prefix: String): String = { 117 | val temp = File.createTempFile(prefix, "") 118 | temp.delete() 119 | temp.mkdir() 120 | temp.getPath 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /src/main/scala/ru/retailrocket/spark/multitool/RDDFunctions.scala: -------------------------------------------------------------------------------- 1 | package ru.retailrocket.spark.multitool 2 | 3 | import org.apache.spark.SparkContext 4 | import 
org.apache.spark.SparkContext._ 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.Accumulator 7 | import org.apache.spark.storage.StorageLevel 8 | import org.apache.spark.rdd._ 9 | 10 | import org.apache.hadoop.mapreduce.RecordReader 11 | import org.apache.hadoop.mapreduce.TaskAttemptContext 12 | import org.apache.hadoop.mapreduce.InputSplit 13 | import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat 14 | import org.apache.hadoop.mapred.JobConf 15 | import org.apache.hadoop.mapred.FileOutputFormat 16 | import org.apache.hadoop.io.compress.CompressionCodecFactory 17 | import org.apache.hadoop.util.LineReader 18 | import org.apache.hadoop.io._ 19 | import org.apache.hadoop.fs._ 20 | import org.apache.hadoop.io.compress._ 21 | import org.apache.hadoop.mapreduce.lib.input._ 22 | import org.apache.hadoop.conf.Configuration 23 | import org.apache.hadoop.conf.Configured 24 | 25 | import scala.reflect.ClassTag 26 | import scala.reflect._ 27 | import scala.util._ 28 | 29 | 30 | object RDDFunctions { 31 | val DefaultPersistLevel = StorageLevel.MEMORY_AND_DISK_SER 32 | 33 | case class TransformResult[T, R: ClassTag](output: RDD[R], error: RDD[Throwable], ignore: RDD[T]) { 34 | def name = classTag[R].toString 35 | def summary: String = s"${name} output ${output.count()} ignore ${ignore.count()}" 36 | def cache(): TransformResult[T,R] = 37 | TransformResult(output.cache(), error.cache(), ignore.cache()) 38 | def persist(level: StorageLevel=DefaultPersistLevel): TransformResult[T,R] = 39 | TransformResult(output.persist(level), error.persist(level), ignore.persist(level)) 40 | } 41 | 42 | def transform[T:ClassTag, R:ClassTag](f: T=>R)(src: RDD[T]): TransformResult[T,R] = { 43 | val dst = src.map{s => (s, Try{f(s)})} 44 | val output = dst.flatMap{case (_, Success(d)) => Some(d); case _ => None} 45 | val error = dst.flatMap{case (_, Failure(t)) => Some(t); case _ => None} 46 | val ignore = dst.flatMap{case (s, Failure(_)) => Some(s); case _ => None} 47 | TransformResult(output, error, ignore) 48 | } 49 | 50 | def flatTransform[T:ClassTag, R:ClassTag, C<%TraversableOnce[R]](f: T=>C)(src: RDD[T]): TransformResult[T,R] = { 51 | val dst = src.map{s => (s, Try{f(s)})} 52 | val output = dst.flatMap{case (_, Success(d)) => d; case _ => None} 53 | val error = dst.flatMap{case (_, Failure(t)) => Some(t); case _ => None} 54 | val ignore = dst.flatMap{case (s, Failure(_)) => Some(s); case _ => None} 55 | TransformResult(output, error, ignore) 56 | } 57 | 58 | case class TransformResultWithAccums[T, R: ClassTag](output: RDD[R], error: RDD[Throwable], ignore: RDD[T], inputAccum: Accumulator[Long], outputAccum: Accumulator[Long], errorAccum: Accumulator[Long]) { 59 | def name = classTag[R].toString 60 | def summary: String = s"${name} input ${inputAccum.value} output ${outputAccum.value} error ${errorAccum.value}" 61 | def errorRatio: Double = errorAccum.value.toDouble / (outputAccum.value + errorAccum.value) 62 | def cache(): TransformResult[T,R] = 63 | TransformResult(output.cache(), error.cache(), ignore.cache()) 64 | def persist(level: StorageLevel=DefaultPersistLevel): TransformResult[T,R] = 65 | TransformResult(output.persist(level), error.persist(level), ignore.persist(level)) 66 | } 67 | 68 | def transformWithAccums[T:ClassTag, R:ClassTag](f: T=>R)(src: RDD[T])(implicit sc: SparkContext): TransformResultWithAccums[T,R] = { 69 | val inputAccum = sc.accumulator(0L, "input") 70 | val outputAccum = sc.accumulator(0L, "output") 71 | val errorAccum = sc.accumulator(0L, "error") 72 | val dst = 
src.map{s => (s, Try{f(s)})} 73 | val output = dst.flatMap{ 74 | case (_, Success(d)) => 75 | inputAccum += 1 76 | outputAccum += 1 77 | Some(d) 78 | case _ => 79 | inputAccum += 1 80 | errorAccum += 1 81 | None} 82 | val error = dst.flatMap{ 83 | case (_, Failure(t)) => 84 | Some(t) 85 | case _ => 86 | None} 87 | val ignore = dst.flatMap{ 88 | case (s, Failure(_)) => 89 | Some(s) 90 | case _ => 91 | None} 92 | TransformResultWithAccums(output, error, ignore, inputAccum, outputAccum, errorAccum) 93 | } 94 | 95 | def flatTransformWithAccums[T:ClassTag, R:ClassTag, C<%TraversableOnce[R]](f: T=>C)(src: RDD[T])(implicit sc: SparkContext): TransformResultWithAccums[T,R] = { 96 | val inputAccum = sc.accumulator(0L, "input") 97 | val outputAccum = sc.accumulator(0L, "output") 98 | val errorAccum = sc.accumulator(0L, "error") 99 | val dst = src.map{s => (s, Try{f(s)})} 100 | val output = dst.flatMap{ 101 | case (_, Success(d)) => 102 | inputAccum += 1 103 | outputAccum += d.size 104 | d 105 | case _ => 106 | inputAccum += 1 107 | errorAccum += 1 108 | None} 109 | val error = dst.flatMap{ 110 | case (_, Failure(t)) => 111 | Some(t) 112 | case _ => 113 | None} 114 | val ignore = dst.flatMap{ 115 | case (s, Failure(_)) => 116 | Some(s) 117 | case _ => 118 | None} 119 | TransformResultWithAccums(output, error, ignore, inputAccum, outputAccum, errorAccum) 120 | } 121 | 122 | class KeyBasedMultipleTextOutputFormat extends MultipleTextOutputFormat[Text, Text] { 123 | override def generateFileNameForKeyValue(key: Text, value: Text, name: String): String = { 124 | key.toString + "/" + name 125 | } 126 | 127 | override def generateActualKey(key: Text, value: Text) = null 128 | } 129 | 130 | def saveAsMultipleTextFiles[T:ClassTag](src: RDD[T], root: String)(getPath: T => String)(getData: T => String): Unit = { 131 | saveAsMultipleTextFiles(src, root, None)(getPath)(getData) 132 | } 133 | 134 | def saveAsMultipleTextFiles[T:ClassTag](src: RDD[T], root: String, codec: Class[_ <: CompressionCodec])(getPath: T => String)(getData: T => String): Unit = { 135 | saveAsMultipleTextFiles(src, root, Option(codec))(getPath)(getData) 136 | } 137 | 138 | def saveAsMultipleTextFiles[T:ClassTag](src: RDD[T], root: String, codec: Option[Class[_ <: CompressionCodec]])(getPath: T => String)(getData: T => String): Unit = { 139 | val hadoopConf = new Configuration() 140 | val jobConf = new JobConf(hadoopConf) 141 | 142 | jobConf.setOutputFormat(classOf[KeyBasedMultipleTextOutputFormat]) 143 | 144 | if(codec.isDefined) { 145 | jobConf.setBoolean("mapred.output.compress", true) 146 | jobConf.setClass("mapred.output.compression.codec", codec.get, classOf[CompressionCodec]) 147 | } 148 | 149 | FileOutputFormat.setOutputPath(jobConf, new Path(root)) 150 | 151 | src 152 | .map { v => (new Text(getPath(v)), new Text(getData(v))) } 153 | .saveAsHadoopDataset(jobConf) 154 | } 155 | } 156 | -------------------------------------------------------------------------------- /src/main/scala/ru/retailrocket/spark/multitool/Loaders.scala: -------------------------------------------------------------------------------- 1 | package ru.retailrocket.spark.multitool 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.rdd._ 7 | 8 | import org.apache.hadoop.mapreduce.RecordReader 9 | import org.apache.hadoop.mapreduce.TaskAttemptContext 10 | import org.apache.hadoop.mapreduce.InputSplit 11 | import org.apache.hadoop.io.compress.CompressionCodecFactory 
12 | import org.apache.hadoop.util.LineReader 13 | import org.apache.hadoop.io._ 14 | import org.apache.hadoop.fs._ 15 | import org.apache.hadoop.mapreduce.lib.input._ 16 | import org.apache.hadoop.conf.Configuration 17 | import org.apache.hadoop.conf.Configured 18 | 19 | import scala.reflect.ClassTag 20 | import scala.reflect._ 21 | 22 | object Loaders { 23 | abstract class Filter extends Configured with PathFilter { 24 | import Filter._ 25 | 26 | private[this] var filter: Option[Traversable[Rule]] = None 27 | 28 | def check(rules: Traversable[Rule], path: Array[String]): Boolean 29 | 30 | override def accept(path: Path): Boolean = filter 31 | .map{f => check(f, path.toString.split(Path.SEPARATOR))} 32 | .getOrElse(true) 33 | 34 | override def setConf(conf: Configuration) { 35 | filter = Option(conf) 36 | .map(_.get(RulesPropName)) 37 | .map(parseRules) 38 | } 39 | } 40 | 41 | object Filter { 42 | type Rule = (String, Array[String]) 43 | 44 | val Pattern = """([^=]+)=(.+)""".r 45 | val RulesPropName = "ru.retailrocket.loaders.filter.rules" 46 | 47 | def storeRules(src: Traversable[Rule]) = src.map{ 48 | case(k, eqs) => "%s=%s".format(k, eqs.mkString(",")) 49 | }.mkString("&") 50 | 51 | def parseRules(src: String) = src.split("&").map{ 52 | case Pattern(k, eqs) => (k, eqs.split(",")) 53 | } 54 | } 55 | 56 | private class CombineTextFileWithOffsetInputFormat extends CombineFileInputFormat[LongWritable, Text] { 57 | override def createRecordReader( 58 | split: InputSplit, 59 | context: TaskAttemptContext): RecordReader[LongWritable, Text] = 60 | new CombineFileRecordReader(split.asInstanceOf[CombineFileSplit], context, classOf[CombineTextFileWithOffsetRecordReader]) 61 | } 62 | 63 | private class CombineTextFileWithOffsetRecordReader( 64 | split: CombineFileSplit, 65 | context: TaskAttemptContext, 66 | index: Integer) extends CombineTextFileRecordReader[LongWritable](split, context, index) { 67 | 68 | override def generateKey(split: CombineFileSplit, index: Integer): LongWritable = new LongWritable(split.getOffset(index)) 69 | } 70 | 71 | private class CombineTextFileWithPathInputFormat extends CombineFileInputFormat[Text, Text] { 72 | override def createRecordReader( 73 | split: InputSplit, 74 | context: TaskAttemptContext): RecordReader[Text, Text] = 75 | new CombineFileRecordReader(split.asInstanceOf[CombineFileSplit], context, classOf[CombineTextFileWithPathRecordReader]) 76 | } 77 | 78 | private class CombineTextFileWithPathRecordReader( 79 | split: CombineFileSplit, 80 | context: TaskAttemptContext, 81 | index: Integer) extends CombineTextFileRecordReader[Text](split, context, index) { 82 | 83 | override def generateKey(split: CombineFileSplit, index: Integer): Text = new Text(split.getPath(index).toString) 84 | } 85 | 86 | private abstract class CombineTextFileRecordReader[K]( 87 | split: CombineFileSplit, 88 | context: TaskAttemptContext, 89 | index: Integer) extends RecordReader[K, Text] { 90 | 91 | val conf = context.getConfiguration 92 | val path = split.getPath(index) 93 | val fs = path.getFileSystem(conf) 94 | val codec = Option(new CompressionCodecFactory(conf).getCodec(path)) 95 | 96 | val start = split.getOffset(index) 97 | val length = if(codec.isEmpty) split.getLength(index) else Long.MaxValue 98 | val end = start + length 99 | 100 | val fd = fs.open(path) 101 | if(start > 0) fd.seek(start) 102 | 103 | val fileIn = codec match { 104 | case Some(codec) => codec.createInputStream(fd) 105 | case None => fd 106 | } 107 | 108 | var reader = new LineReader(fileIn) 109 | var pos = 
start 110 | 111 | def generateKey(split: CombineFileSplit, index: Integer): K 112 | 113 | protected val key = generateKey(split, index) 114 | protected val value = new Text 115 | 116 | override def initialize(split: InputSplit, ctx: TaskAttemptContext) {} 117 | 118 | override def nextKeyValue(): Boolean = { 119 | if (pos < end) { 120 | val newSize = reader.readLine(value) 121 | pos += newSize 122 | newSize != 0 123 | } else { 124 | false 125 | } 126 | } 127 | 128 | override def close(): Unit = if (reader != null) { reader.close(); reader = null } 129 | override def getCurrentKey: K = key 130 | override def getCurrentValue: Text = value 131 | override def getProgress: Float = if (start == end) 0.0f else math.min(1.0f, (pos - start).toFloat / (end - start)) 132 | } 133 | 134 | private val defaultCombineSize = 256 135 | private val defaultCombineDelim = "\n" 136 | 137 | def generateOffsetKey(split: CombineFileSplit, index: Integer) = split.getOffset(index) 138 | 139 | class Context(val sc: SparkContext, val path: String) { 140 | 141 | val conf = new Configuration() 142 | conf.set("textinputformat.record.delimiter", defaultCombineDelim) 143 | conf.set("mapreduce.input.fileinputformat.input.dir.recursive", "true") 144 | conf.set("mapred.input.dir", path) 145 | conf.setLong("mapred.max.split.size", defaultCombineSize*1024*1024) 146 | 147 | conf.set("io.compression.codecs", s"io.sensesecure.hadoop.xz.XZCodec") 148 | 149 | def addFilterClass[T <: Filter](filterClass: Class[T]): Context = { 150 | conf.set("mapreduce.input.pathFilter.class", filterClass.getName) 151 | this 152 | } 153 | 154 | def addFilterRules(filterRules: String) = { 155 | conf.set(Filter.RulesPropName, filterRules) 156 | this 157 | } 158 | 159 | def addFilterRules(filterRules: Traversable[Filter.Rule]) = { 160 | conf.set(Filter.RulesPropName, Filter.storeRules(filterRules)) 161 | this 162 | } 163 | 164 | def addFilter[T <: Filter](filter: (Class[T], Traversable[Filter.Rule])) = this 165 | .addFilterClass(filter._1) 166 | .addFilterRules(filter._2) 167 | 168 | def setSplitSize(size: Long) = { 169 | conf.setLong("mapred.min.split.size", size*1024*1024) 170 | conf.setLong("mapred.max.split.size", size*1024*1024) 171 | this 172 | } 173 | 174 | def setRecordDelim(delim: String) = { 175 | conf.set("textinputformat.record.delimiter", delim) 176 | this 177 | } 178 | 179 | def combine(): RDD[String] = sc 180 | .newAPIHadoopRDD(conf, classOf[CombineTextInputFormat], classOf[LongWritable], classOf[Text]) 181 | .map { case (k, v) => v.toString } 182 | 183 | def combineWithPath(): RDD[(String, String)] = sc 184 | .newAPIHadoopRDD(conf, classOf[CombineTextFileWithPathInputFormat], classOf[Text], classOf[Text]) 185 | .map{case(path, data) => (path.toString, data.toString)} 186 | } 187 | 188 | def forPath(sc: SparkContext, path: String) = { 189 | new Context(sc, path) 190 | } 191 | 192 | implicit class SparkContextFunctions(val self: SparkContext) extends AnyVal { 193 | def forPath(path: String): Loaders.Context = Loaders.forPath(self, path) 194 | } 195 | } 196 | -------------------------------------------------------------------------------- /src/main/scala/ru/retailrocket/spark/multitool/package.scala: -------------------------------------------------------------------------------- 1 | package ru.retailrocket.spark 2 | 3 | import scala.reflect.ClassTag 4 | import scala.annotation.tailrec 5 | import scala.util._ 6 | 7 | import org.apache.spark.SparkContext 8 | import org.apache.spark.rdd._ 9 | import org.apache.spark.sql._ 10 | import 
org.apache.hadoop.io.compress._ 11 | 12 | 13 | package object multitool { 14 | object Functions { 15 | def tap[T:ClassTag](f: T => Unit)(o: T) = {f(o); o} 16 | def applyIf[T:ClassTag](p: Boolean)(f: T => T)(o: T): T = {if(p) f(o) else o} 17 | def applyIf[T:ClassTag](p: T=>Boolean)(f: T => T)(o: T): T = {if(p(o)) f(o) else o} 18 | def applyOption[T:ClassTag,V:ClassTag](v: Option[V])(f: (T,V) => T)(o: T): T = {if(v.isDefined) f(o, v.get) else o} 19 | def tapIf[T:ClassTag](p: Boolean)(f: T => Unit)(o: T) = {if(p) f(o); o} 20 | def use[T:ClassTag, R:ClassTag](f: T => R)(o: T): R = f(o) 21 | 22 | def maxBy[T,O<%Ordered[O]](f:T=>O)(a:T, b:T) = if(f(a)>f(b)) a else b 23 | def minBy[T,O<%Ordered[O]](f:T=>O)(a:T, b:T) = if(f(a) i 67 | case _ => f(n-1, s.indexOf(d, i+1)) 68 | } 69 | f(n, 0) 70 | } 71 | 72 | def nthSplit(s: String, d: String, n: Int): (String, String) = { 73 | val (s1, s2) = s.splitAt(nthIndexOf(s, d, n)+d.length) 74 | (s1.take(s1.length-d.length), s2) 75 | } 76 | } 77 | 78 | object PairFunctions { 79 | def flatMapValues[K,V,T](src: Traversable[(K,V)])(f: (V) => TraversableOnce[T]): Traversable[(K,T)] = 80 | src.flatMap { case (k,v) => f(v).map { r => (k, r) } } 81 | 82 | def mapValues[K,V,T](src: Traversable[(K,V)])(f: (V) => T): Traversable[(K,T)] = 83 | src.map {case (k,v) => (k, f(v)) } 84 | 85 | def groupByKey[K,V](src: Traversable[(K,V)]) = 86 | mapValues(src.groupBy { _._1 } ) { _.map { _._2 } } 87 | 88 | def reduceByKey[K,V](src: Traversable[(K,V)])(f: (V,V) => V): Traversable[(K,V)] = 89 | groupByKey(src).map { case (k, vs) => (k, vs.reduce(f)) } 90 | 91 | def countByKey[K,V](src: Traversable[(K,V)]): Map[K, Long] = 92 | reduceByKey(src.map { case (k, v) => (k, 1L) } ) { _+_ }.toMap 93 | 94 | def cogroup[K,V1,V2](src1: Traversable[(K,V1)], src2: Traversable[(K,V2)]): Traversable[(K, (Traversable[V1], Traversable[V2]))] = { 95 | val g1 = groupByKey(src1).toMap 96 | val g2 = groupByKey(src2).toMap 97 | val ks = g1.keys.toSet | g2.keys.toSet 98 | for { 99 | k <- ks.toSeq 100 | vs1 = g1.get(k).toList.flatten 101 | vs2 = g2.get(k).toList.flatten 102 | } yield (k, (vs1, vs2)) 103 | } 104 | 105 | def join[K,V1,V2](src1: Traversable[(K,V1)], src2: Traversable[(K,V2)]): Traversable[(K, (V1, V2))] = { 106 | for { 107 | (k, (vs1, vs2)) <- cogroup(src1, src2) 108 | v1 <- vs1 109 | v2 <- vs2 110 | } yield (k, (v1, v2)) 111 | } 112 | 113 | def leftOuterJoin[K,V1,V2](src1: Traversable[(K,V1)], src2: Traversable[(K,V2)]): Traversable[(K, (V1, Option[V2]))] = { 114 | for { 115 | (k, (vs1, vs2)) <- cogroup(src1, src2) 116 | v1 <- vs1 117 | v2 <- if(vs2.isEmpty) Seq(None) else vs2.map { Option(_) } 118 | } yield (k, (v1, v2)) 119 | } 120 | 121 | def rightOuterJoin[K,V1,V2](src1: Traversable[(K,V1)], src2: Traversable[(K,V2)]): Traversable[(K, (Option[V1], V2))] = { 122 | for { 123 | (k, (vs1, vs2)) <- cogroup(src1, src2) 124 | v1 <- if(vs1.isEmpty) Seq(None) else vs1.map { Option(_) } 125 | v2 <- vs2 126 | } yield (k, (v1, v2)) 127 | } 128 | 129 | def fullOuterJoin[K,V1,V2](src1: Traversable[(K,V1)], src2: Traversable[(K,V2)]): Traversable[(K, (Option[V1], Option[V2]))] = { 130 | for { 131 | (k, (vs1, vs2)) <- multitool.PairFunctions.cogroup(src1, src2) 132 | v1 <- if(vs1.isEmpty) Seq(None) else vs1.map { Option(_) } 133 | v2 <- if(vs2.isEmpty) Seq(None) else vs2.map { Option(_) } 134 | } yield (k, (v1, v2)) 135 | } 136 | } 137 | 138 | object TraversableFunctions { 139 | def countByValue[T](src: Traversable[T]): Map[T, Long] = PairFunctions.countByKey(src.map { (_, 1L) } ) 140 | } 141 | 142 | 
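// Illustrative usage of the collection helpers above (hypothetical data; results noted inline):
//   PairFunctions.reduceByKey(Seq("a" -> 1, "a" -> 2, "b" -> 3)) { _ + _ }  // ("a", 3), ("b", 3) in some order
//   PairFunctions.countByKey(Seq("a" -> 1, "a" -> 2, "b" -> 3))             // Map("a" -> 2L, "b" -> 1L)
//   PairFunctions.join(Seq(1 -> "x"), Seq(1 -> "y", 2 -> "z"))              // (1, ("x", "y"))
//   TraversableFunctions.countByValue(Seq("a", "a", "b"))                   // Map("a" -> 2L, "b" -> 1L)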
object Implicits { 143 | object Ops { 144 | implicit class MultitoolOpsImplicits[T](val self: T) { 145 | def ===(that: T): Boolean = self == that 146 | def !==(that: T): Boolean = self != that 147 | } 148 | } 149 | 150 | implicit class MultitoolFunctionsImplicits[T:ClassTag](val self: T) { 151 | def tap(f: T => Unit) = Functions.tap(f)(self) 152 | def tapIf(p: Boolean)(f: T => Unit) = Functions.tapIf(p)(f)(self) 153 | def applyIf(p: Boolean)(f: T => T): T = Functions.applyIf(p)(f)(self) 154 | def applyOption[V:ClassTag](v: Option[V])(f: (T,V) => T): T = Functions.applyOption(v)(f)(self) 155 | def use[R: ClassTag](f: T => R): R = Functions.use(f)(self) 156 | } 157 | 158 | implicit class MultitoolPairFunctionsImplicits[K:ClassTag, V:ClassTag](val self: Traversable[(K,V)]) { 159 | def flatMapValues[T](f: (V) => TraversableOnce[T]) = PairFunctions.flatMapValues(self)(f) 160 | def mapValues[T](f: (V) => T) = PairFunctions.mapValues(self)(f) 161 | def groupByKey() = PairFunctions.groupByKey(self) 162 | def reduceByKey(f: (V,V) => V) = PairFunctions.reduceByKey(self)(f) 163 | def countByKey() = PairFunctions.countByKey(self) 164 | def cogroup[V2](src2: Traversable[(K,V2)]) = PairFunctions.cogroup(self, src2) 165 | def join[V2](src2: Traversable[(K,V2)]) = PairFunctions.join(self, src2) 166 | def leftOuterJoin[V2](src2: Traversable[(K,V2)]) = PairFunctions.leftOuterJoin(self, src2) 167 | def rightOuterJoin[V2](src2: Traversable[(K,V2)]) = PairFunctions.rightOuterJoin(self, src2) 168 | def fullOuterJoin[V2](src2: Traversable[(K,V2)]) = PairFunctions.fullOuterJoin(self, src2) 169 | } 170 | 171 | implicit class MultitoolTraversableFunctionsImplicits[T](val self: Traversable[T]) extends AnyVal { 172 | def countByValue() = TraversableFunctions.countByValue(self) 173 | } 174 | 175 | implicit class MultitoolSetFunctionsImplicits[T](val self: Set[T]) extends AnyVal { 176 | def has(t: T) = self.contains(t) 177 | } 178 | 179 | implicit class MultitoolSeqFunctionsImplicits[T](val self: Seq[T]) extends AnyVal { 180 | def has(t: T) = self.contains(t) 181 | } 182 | 183 | implicit class MultitoolArrayFunctionsImplicits[T](val self: Array[T]) extends AnyVal { 184 | def has(t: T) = self.contains(t) 185 | } 186 | 187 | implicit class MultitoolRDDFunctionsImplicits[T:ClassTag](val self: RDD[T]) { 188 | def transform[R:ClassTag](f: T=>R): RDDFunctions.TransformResult[T,R] = { 189 | RDDFunctions.transform(f)(self) 190 | } 191 | def flatTransform[R:ClassTag, C<%TraversableOnce[R]](f: T=>C): RDDFunctions.TransformResult[T,R] = { 192 | RDDFunctions.flatTransform(f)(self) 193 | } 194 | def transformWithAccums[R:ClassTag](f: T=>R)(implicit sc: SparkContext): RDDFunctions.TransformResultWithAccums[T,R] = { 195 | RDDFunctions.transformWithAccums(f)(self) 196 | } 197 | def flatTransformWithAccums[R:ClassTag, C<%TraversableOnce[R]](f: T=>C)(implicit sc: SparkContext): RDDFunctions.TransformResultWithAccums[T,R] = { 198 | RDDFunctions.flatTransformWithAccums(f)(self) 199 | } 200 | def saveViaTemp(serializer: StringSerializer[T])(output: String, tempPath: Option[String]=None, codec: Option[Class[_ <: CompressionCodec]]=None)(store: (String, String) => Unit): Unit = { 201 | fs.saveRddViaTemp(serializer)(output, tempPath, codec)(store)(self) 202 | } 203 | def saveViaTempWithReplace(serializer: StringSerializer[T])(output: String, tempPath: Option[String]=None, codec: Option[Class[_ <: CompressionCodec]]=None): Unit = { 204 | fs.saveRddViaTempWithReplace(serializer)(output, tempPath, codec)(self) 205 | } 206 | def 
saveViaTempWithRename(serializer: StringSerializer[T])(output: String, tempPath: Option[String]=None, codec: Option[Class[_ <: CompressionCodec]]=None): Unit = { 207 | fs.saveRddViaTempWithRename(serializer)(output, tempPath, codec)(self) 208 | } 209 | def saveAsMultipleTextFiles(root: String)(getPath: T => String)(getData: T => String): Unit = { 210 | RDDFunctions.saveAsMultipleTextFiles(self, root)(getPath)(getData) 211 | } 212 | def saveAsMultipleTextFiles(root: String, codec: Class[_ <: CompressionCodec])(getPath: T => String)(getData: T => String): Unit = { 213 | RDDFunctions.saveAsMultipleTextFiles(self, root, codec)(getPath)(getData) 214 | } 215 | def saveAsMultipleTextFiles(root: String, codec: Option[Class[_ <: CompressionCodec]])(getPath: T => String)(getData: T => String): Unit = { 216 | RDDFunctions.saveAsMultipleTextFiles(self, root, codec)(getPath)(getData) 217 | } 218 | } 219 | 220 | implicit class MultitoolDataFrameFunctionsImplicits(val self: DataFrame) { 221 | def transform[R:ClassTag](f: Row=>Option[R]): RDDFunctions.TransformResult[Row,R] = { 222 | DataFrameFunctions.transform(f)(self) 223 | } 224 | def transform[R:ClassTag](f: Row=>R)(implicit d: DummyImplicit): RDDFunctions.TransformResult[Row,R] = { 225 | DataFrameFunctions.transform(f)(self) 226 | } 227 | } 228 | 229 | implicit class RichBoolean(val self: Boolean) extends AnyVal { 230 | def toInt = if(self) 1 else 0 231 | def toDouble = if(self) 1.0 else 0.0 232 | } 233 | 234 | implicit class RichInt(val self: Int) extends AnyVal { 235 | def toBoolean = self == 1 236 | } 237 | 238 | implicit class RichString(val self: String) extends AnyVal { 239 | def nthIndexOf(d: String, n: Int): Int = { 240 | Functions.nthIndexOf(self, d, n) 241 | } 242 | def nthSplit(d: String, n: Int): (String, String) = { 243 | Functions.nthSplit(self, d, n) 244 | } 245 | } 246 | } 247 | 248 | trait StringSerializer[T] extends Serializable { 249 | def apply(src: T): String 250 | } 251 | } 252 | --------------------------------------------------------------------------------
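A minimal usage sketch for the algs.cosine and HashFNV helpers above, mirroring AlgsSuite; the local master and the sample (session, item, weight) triples are illustrative, not part of the library:
```
import org.apache.spark.SparkContext

import ru.retailrocket.spark.multitool.{algs, HashFNV}

object CosineExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext("local", "CosineExample")
    implicit val parallel = 5 // partition count required by algs.cosine

    // (session, item, weight) triples; cosine returns (itemA, itemB, similarity) in both directions
    val data = sc.parallelize(Seq((1, 1L, 0.5), (1, 2L, 0.3), (2, 1L, 0.6), (2, 2L, 0.2)))
    algs.cosine(data).collect().foreach(println)

    // FNV hash of a string, e.g. for stable bucketing of keys
    println(HashFNV.hash("some-key"))

    sc.stop()
  }
}
```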