├── OWNER
├── misc
│   └── scalac-plugin.xml
├── .settings
│   └── org.eclipse.jdt.core.prefs
├── scripts
│   ├── testrepeat.scala
│   ├── test.scala
│   ├── check.scala
│   ├── testdistribute.scala
│   ├── hadoop.scala
│   └── remotetest.scala
├── .classpath
├── .project
└── src
    └── smr
        ├── Order.scala
        ├── examples
        │   └── hadoop
        │       └── Basic.scala
        ├── ThreadLocal.scala
        ├── hadoop
        │   ├── AnyWritable.scala
        │   ├── Reduceable.scala
        │   ├── Mapper.scala
        │   ├── SwapMapper.scala
        │   ├── TransformValMapper.scala
        │   ├── PairTransformMapper.scala
        │   ├── TransformMapper.scala
        │   ├── CollectorMapper.scala
        │   ├── PairCollectorMapper.scala
        │   ├── Reduce.scala
        │   ├── ReduceWrapper.scala
        │   ├── ClosureMapper.scala
        │   ├── Implicits.scala
        │   ├── Magic.scala
        │   ├── PathIterable.scala
        │   ├── Hadoop.scala
        │   └── PathPairs.scala
        ├── plugin
        │   └── SerOverride.scala
        ├── SerFunction.scala
        ├── actors
        │   ├── TransActor.scala
        │   ├── Hub.scala
        │   ├── Worker.scala
        │   └── Distributor.scala
        ├── Functions.scala
        ├── DistributedIterable.scala
        ├── DistributedPairs.scala
        ├── Util.scala
        └── Defaults.scala

/OWNER:
--------------------------------------------------------------------------------
1 | dlwh@stanford.edu
2 | 
--------------------------------------------------------------------------------
/misc/scalac-plugin.xml:
--------------------------------------------------------------------------------
1 | <plugin>
2 |   <name>seroverride</name>
3 |   <classname>smr.plugin.SerOverride</classname>
4 | </plugin>
5 | 
--------------------------------------------------------------------------------
/.settings/org.eclipse.jdt.core.prefs:
--------------------------------------------------------------------------------
1 | #Tue Aug 05 17:25:51 PDT 2008
2 | eclipse.preferences.version=1
3 | org.eclipse.jdt.core.builder.cleanOutputFolder=ignore
4 | org.eclipse.jdt.core.builder.resourceCopyExclusionFilter=*.scala
--------------------------------------------------------------------------------
/scripts/testrepeat.scala:
--------------------------------------------------------------------------------
1 | import smr._;
2 | import smr.Defaults._;
3 | 
4 | //scala.actors.Debug.level = 10;
5 | val x = new ActorDistributor(3,9010);
6 | val c = x.distribute(1 to 10000);
7 | println(c.map(2*).reduce(_+_));
8 | println(c.map(2*).reduce(_+_));
9 | println(c.reduce(_+_));
10 | x.close();
11 | System.exit(0);
--------------------------------------------------------------------------------
/.classpath:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | 
5 | 
6 | 
7 | 
8 | 
--------------------------------------------------------------------------------
/scripts/test.scala:
--------------------------------------------------------------------------------
1 | import smr._;
2 | import smr.Defaults._;
3 | 
4 | //scala.actors.Debug.level = 10;
5 | val x = new ActorDistributor(4,9010);
6 | println("Go!");
7 | println((1 to 10000000).map(BigInt(_)).reduceLeft(_+_))
8 | println(x.distribute(1 to 10000000).map(BigInt(_)).reduce(_+_))
9 | println(x.distribute( (1 to 100).toList).map(6*).map(BigInt(_)).reduce(_+_))
10 | x.close();
11 | System.exit(0);
--------------------------------------------------------------------------------
/scripts/check.scala:
--------------------------------------------------------------------------------
1 | import smr._;
2 | import smr.Defaults._;
3 | import org.scalacheck._;
4 | import Arbitrary._;
5 | import Prop._;
6 | 
7 | val x = new ActorDistributor(3,9010);
8 | val prop_SumReduceIdentity = property { (l : List[Int]) => l.size == 0 || x.distribute(l).reduce(_+_) == l.reduceLeft(_+_) }
9 | Test.check(prop_SumReduceIdentity);
10 | x.close();
11 | System.exit(0);
12 | 
--------------------------------------------------------------------------------
/scripts/testdistribute.scala:
--------------------------------------------------------------------------------
1 | import smr._
2 | import smr.Defaults._
3 | import scala.actors.remote.Node;
4 | import scala.actors.remote.RemoteActor._;
5 | 
6 | //scala.actors.Debug.level = 10;
7 | val y = new ActorDistributor(0,9000)
8 | val w = Worker(9000,'worker);
9 | y.addWorker(select(Node("localhost",9000),'worker));
10 | println(y.distribute( (1 to 1000)).map(_ * 2).reduce(_+_));
11 | y.close();
12 | System.exit(0);
13 | 
--------------------------------------------------------------------------------
/scripts/hadoop.scala:
--------------------------------------------------------------------------------
1 | import smr.hadoop._;
2 | import org.apache.hadoop.fs._;
3 | 
4 | val h = Hadoop(Array(""),new Path("output"));
5 | println(h.distribute(1 to 1000,3) reduce ( _+_));
6 | println(h.distribute(1 to 1000,3) map (2*) reduce ( _+_));
7 | 
8 | val words = for( (off,line) <- h.loadLines(new Path("build.xml"));
9 |                  word <- line.split(" "))
10 |             yield(word,1);
11 | 
12 | val counts = words.reduce{ (word,it) =>
13 |   (word,it.reduceLeft(_+_));
14 | }
15 | 
16 | counts.elements foreach println;
17 | counts.elements foreach println;
--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <projectDescription>
3 | 	<name>SMR</name>
4 | 	<comment></comment>
5 | 	<projects>
6 | 	</projects>
7 | 	<buildSpec>
8 | 		<buildCommand>
9 | 			<name>ch.epfl.lamp.sdt.core.scalabuilder</name>
10 | 			<arguments>
11 | 			</arguments>
12 | 		</buildCommand>
13 | 		<buildCommand>
14 | 			<name>org.eclipse.jdt.core.javabuilder</name>
15 | 			<arguments>
16 | 			</arguments>
17 | 		</buildCommand>
18 | 	</buildSpec>
19 | 	<natures>
20 | 		<nature>ch.epfl.lamp.sdt.core.scalanature</nature>
21 | 		<nature>org.eclipse.jdt.core.javanature</nature>
22 | 	</natures>
23 | </projectDescription>
--------------------------------------------------------------------------------
/src/smr/Order.scala:
--------------------------------------------------------------------------------
1 | package smr;
2 | 
3 | /**
4 |  * Class to make doing compareTo a little less painful.
5 |  * 
6 |  */
7 | @serializable
8 | abstract class Order[T](elems : (T => Comparable[_])*) extends Ordered[T] with Comparable[T] { this : T=>
9 |   override def compare(o : T) = {
10 |     Order.recursiveCompare( (0 until elems.length) map ( i => (elems(i)(this),elems(i)(o))));
11 |   }
12 | }
13 | 
14 | object Order {
15 |   private def recursiveCompare(o : Seq[(Any,Any)]):Int = {
16 |     if(o.length == 0) 0
17 |     else o(0)._1.asInstanceOf[Comparable[Any]].compareTo(o(0)._2) match {
18 |       case 0 => recursiveCompare(o.drop(1));
19 |       case x => x;
20 |     }
21 |   }
22 | }
23 | 
--------------------------------------------------------------------------------
/scripts/remotetest.scala:
--------------------------------------------------------------------------------
1 | import scala.actors.Actor._;
2 | import scala.actors.remote.RemoteActor._;
3 | import scala.actors.remote.Node;
4 | val x = actor {
5 |   scala.actors.Debug.level = 10;
6 |   try {
7 |     alive(9010)
8 |     register('myName,self);
9 |   } catch {
10 |     case x => println(x);
11 |   }
12 |   react {
13 |     case _ => reply { 1}
14 |     react {
15 |       case x => println(x); reply {'done}
16 |     }}
17 | }
18 | classLoader = x.getClass.getClassLoader
19 | 
20 | try {
21 |   x !? None // make sure we're ready
22 |   val c = select(Node("128.12.89.161",9010),'myName)
23 |   c ! 'test // this line doesn't seem to return.
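// note: `!` is an asynchronous, fire-and-forget send in scala.actors, so the
// call itself returns immediately; a hang here more likely means the lazily
// created remote proxy is blocking while it connects (for instance, a
// hostname or classLoader mismatch), not the send itself.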
24 | } catch { 25 | case e => println(e); 26 | } 27 | -------------------------------------------------------------------------------- /src/smr/examples/hadoop/Basic.scala: -------------------------------------------------------------------------------- 1 | package smr.examples.hadoop; 2 | import smr.hadoop._; 3 | import org.apache.hadoop.fs._; 4 | 5 | object Basic { 6 | def main(args :Array[String]) { 7 | 8 | val (h,remainingArgs) = Hadoop.fromArgs(args.drop(1).force, args(0), new Path("output")); 9 | println(h.distribute(1 to 1000,3) reduce ( _+_)); 10 | println(h.distribute(1 to 1000,3) map (2*) reduce ( _+_)); 11 | 12 | val words = 13 | for( (off,line) <- h.loadLines(new Path("file:///u/dlwh/src/scalanlp/build.xml")); 14 | word <- line.split(" ")) 15 | yield(word,1); 16 | 17 | val counts = words.reduce{ (word,it) => 18 | (word,it.reduceLeft(_+_)); 19 | } 20 | 21 | counts.elements foreach println; 22 | counts.elements foreach println; 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/smr/ThreadLocal.scala: -------------------------------------------------------------------------------- 1 | package smr; 2 | 3 | /** 4 | * More scala like ThreadLocal storage. Also, it's serializable, to be more smr friendly. 5 | * 6 | * @author(dlwh) 7 | */ 8 | @serializable 9 | abstract class ThreadLocal[T] extends Function0[T] { 10 | @transient // var because of serialization constraints 11 | private var tl = new java.lang.ThreadLocal[T] { 12 | override def initialValue = default(); 13 | } 14 | 15 | @throws(classOf[java.io.IOException]) 16 | @throws(classOf[ClassNotFoundException]) 17 | private def readObject(in : java.io.ObjectInputStream) { 18 | tl = new java.lang.ThreadLocal[T] { 19 | override def initialValue = default(); 20 | } 21 | } 22 | 23 | // must be overridden 24 | protected def default(): T; 25 | 26 | def apply() = tl.get(); 27 | def get() = tl.get(); 28 | 29 | def value = tl.get(); 30 | def value_=(v : T) = tl.set(v); 31 | } 32 | -------------------------------------------------------------------------------- /src/smr/hadoop/AnyWritable.scala: -------------------------------------------------------------------------------- 1 | package smr.hadoop; 2 | 3 | import org.apache.hadoop.io._; 4 | import java.io._; 5 | 6 | class AnyWritable[T](var value : T) extends WritableComparable[AnyWritable[T]] { 7 | 8 | def get() = value; 9 | 10 | def this() = this(null.asInstanceOf[T]); 11 | 12 | @throws(classOf[java.io.IOException]) 13 | override def write(out : DataOutput) { 14 | val bytesOut = new ByteArrayOutputStream(); 15 | val bOut = new ObjectOutputStream(bytesOut); 16 | bOut.writeObject(value); 17 | bOut.close(); 18 | val arr = bytesOut.toByteArray; 19 | out.writeInt(arr.size); 20 | out.write(arr); 21 | } 22 | 23 | @throws(classOf[java.io.IOException]) 24 | override def readFields(in : DataInput) { 25 | val size = in.readInt(); 26 | val barr = new Array[Byte](size); 27 | in.readFully(barr); 28 | val bIn = new ObjectInputStream(new ByteArrayInputStream(barr)); 29 | value = bIn.readObject.asInstanceOf[T] 30 | } 31 | 32 | def compareTo(o : AnyWritable[T]) :Int = { 33 | value.asInstanceOf[{ def compareTo(o : Any): Int}].compareTo(o.value); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/smr/hadoop/Reduceable.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2008, David Hall 3 | * All rights reserved. 
4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * 13 | * THIS SOFTWARE IS PROVIDED BY DAVID HALL ``AS IS'' AND ANY 14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | * DISCLAIMED. IN NO EVENT SHALL DAVID HALL BE LIABLE FOR ANY 17 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | */ 24 | package smr.hadoop; 25 | import smr._; 26 | import org.apache.hadoop.io._; 27 | import org.apache.hadoop.conf._; 28 | import org.apache.hadoop.fs._; 29 | import org.apache.hadoop.util._; 30 | 31 | private trait Reduceable { 32 | 33 | } 34 | -------------------------------------------------------------------------------- /src/smr/plugin/SerOverride.scala: -------------------------------------------------------------------------------- 1 | package smr.plugin; 2 | import scala.tools.nsc 3 | import nsc.Global 4 | import nsc.Phase 5 | import nsc.plugins.Plugin 6 | import nsc.plugins.PluginComponent 7 | import nsc.transform._ 8 | import nsc.symtab.Flags._ 9 | 10 | class SerOverride(val global: Global) extends Plugin { 11 | import global._ 12 | 13 | val name = "seroverride" 14 | val description = "Makes all closures serializable" 15 | val components = List[PluginComponent](Component); 16 | 17 | private object Component extends PluginComponent { 18 | val global = SerOverride.this.global 19 | val runsAfter = "explicitouter" 20 | val phaseName = SerOverride.this.name 21 | def newPhase(prev: Phase) = new SerOverridePhase(prev) 22 | } 23 | 24 | private class SerTransformer extends Transformer { 25 | override def transform(t : Tree):Tree = t match { 26 | case cdef@ ClassDef(mods,name,tparams,impl) => 27 | val sym = cdef.symbol 28 | val serType = definitions.SerializableAttr.tpe 29 | if( sym.hasFlag(SYNTHETIC) && sym.name.toString.contains("anonfun") && !sym.attributes.exists(serType==_.atp)) { 30 | sym.attributes= AnnotationInfo(serType, List(), List()) :: sym.attributes 31 | copy.ClassDef(t, mods, name, transformTypeDefs(tparams), transformTemplate(impl)) ; 32 | } else { 33 | super.transform(t); 34 | } 35 | case _ => super.transform(t); 36 | } 37 | } 38 | 39 | private class SerOverridePhase(prev: Phase) extends Phase(prev) { 40 | def name = SerOverride.this.name 41 | def run { 42 | val trans = new SerTransformer; 43 | for(unit <- currentRun.units) { 44 | unit.body = trans.transform(unit.body) 45 | } 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/smr/hadoop/Mapper.scala: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2008, David Hall 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * 13 | * THIS SOFTWARE IS PROVIDED BY DAVID HALL ``AS IS'' AND ANY 14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | * DISCLAIMED. IN NO EVENT SHALL DAVID HALL BE LIABLE FOR ANY 17 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | */ 24 | package smr.hadoop; 25 | import smr._; 26 | import org.apache.hadoop.io._; 27 | import org.apache.hadoop.conf._; 28 | import org.apache.hadoop.fs._; 29 | import org.apache.hadoop.util._; 30 | 31 | /** 32 | * Slightly more attractive scala interface 33 | */ 34 | @serializable 35 | trait Mapper[-K1,-V1,+K2,+V2] { 36 | def map(it : Iterator[(K1,V1)]) : Iterator[(K2,V2)]; 37 | def getFunClass(): Class[_] 38 | } 39 | -------------------------------------------------------------------------------- /src/smr/hadoop/SwapMapper.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2008, David Hall 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * 13 | * THIS SOFTWARE IS PROVIDED BY DAVID HALL ``AS IS'' AND ANY 14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | * DISCLAIMED. IN NO EVENT SHALL DAVID HALL BE LIABLE FOR ANY 17 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
23 | */ 24 | package smr.hadoop; 25 | import smr._; 26 | import org.apache.hadoop.io._; 27 | import org.apache.hadoop.conf._; 28 | import org.apache.hadoop.fs._; 29 | import org.apache.hadoop.util._; 30 | 31 | /** 32 | * (K1,V1) -->(V1,K1) 33 | */ 34 | @serializable 35 | class SwapMapper[K,V] extends Mapper[K,V,V,K] { 36 | def map(it : Iterator[(K,V)]) : Iterator[(V,K)] = { 37 | it.map(t => (t._2,t._1)) 38 | } 39 | def getFunClass(): Class[_] = this.getClass; 40 | } 41 | -------------------------------------------------------------------------------- /src/smr/SerFunction.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2008, David Hall 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * 13 | * THIS SOFTWARE IS PROVIDED BY DAVID HALL ``AS IS'' AND ANY 14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | * DISCLAIMED. IN NO EVENT SHALL DAVID HALL BE LIABLE FOR ANY 17 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | */ 24 | package smr; 25 | 26 | 27 | @serializable trait SerFunction0[+B] extends Function0[B]; 28 | @serializable trait SerFunction1[-A,+B] extends Function1[A,B] { outer => 29 | override def andThen[C](f : B=>C) = new SerFunction1[A,C] { 30 | def apply(a : A) = f(outer.apply(a)); 31 | } 32 | } 33 | @serializable trait SerFunction2[-A,-B,+C] extends Function2[A,B,C]; 34 | @serializable trait SerFunction3[-A,-B,-C,+D] extends Function3[A,B,C,D]; 35 | 36 | 37 | // please Ant 38 | private object SerFunction; 39 | -------------------------------------------------------------------------------- /src/smr/hadoop/TransformValMapper.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2008, David Hall 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 
12 | * 13 | * THIS SOFTWARE IS PROVIDED BY DAVID HALL ``AS IS'' AND ANY 14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | * DISCLAIMED. IN NO EVENT SHALL DAVID HALL BE LIABLE FOR ANY 17 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | */ 24 | package smr.hadoop; 25 | import smr._; 26 | import org.apache.hadoop.io._; 27 | import org.apache.hadoop.conf._; 28 | import org.apache.hadoop.fs._; 29 | import org.apache.hadoop.util._; 30 | 31 | import Hadoop._; 32 | 33 | /** 34 | * (K1,V1) -->(V2,DefaultKey) 35 | */ 36 | @serializable 37 | class TransformValMapper[K,V1,V2](f: Iterator[V1]=>Iterator[V2]) extends Mapper[K,V1,V2,DefaultKey] { 38 | def map(it : Iterator[(K,V1)]) : Iterator[(V2,DefaultKey)] = { 39 | f(it.map(_._2)).map(x => (x,mkDefaultKey(x))); 40 | } 41 | def getFunClass(): Class[_] = this.getClass; 42 | } 43 | -------------------------------------------------------------------------------- /src/smr/hadoop/PairTransformMapper.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2008, David Hall 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * 13 | * THIS SOFTWARE IS PROVIDED BY DAVID HALL ``AS IS'' AND ANY 14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | * DISCLAIMED. IN NO EVENT SHALL DAVID HALL BE LIABLE FOR ANY 17 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | */ 24 | package smr.hadoop; 25 | import smr._; 26 | import org.apache.hadoop.io._; 27 | import org.apache.hadoop.conf._; 28 | import org.apache.hadoop.fs._; 29 | import org.apache.hadoop.util._; 30 | 31 | import Hadoop.DefaultKey; 32 | /** 33 | * A fairly default mapper that only modifies values. 34 | * Ignores input key, and uses the default key for output. 
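 *
 * Unlike TransformMapper below, the function here sees, and emits, whole
 * key/value pairs. A minimal usage sketch (the types and the word-count body
 * are illustrative, not taken from this repository):
 *
 *   val wc = new PairTransformMapper[Long,String,String,Int](
 *     it => it.flatMap { case (offset,line) =>
 *       line.split(" ").map(w => (w,1)).elements });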
35 | */ 36 | class PairTransformMapper[K1,V1,K2,V2](f:Iterator[(K1,V1)]=>Iterator[(K2,V2)]) extends Mapper[K1,V1,K2,V2] { 37 | override def map(it : Iterator[(K1,V1)]):Iterator[(K2,V2)] = f(it) 38 | 39 | override def getFunClass = f.getClass; 40 | } 41 | -------------------------------------------------------------------------------- /src/smr/hadoop/TransformMapper.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2008, David Hall 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * 13 | * THIS SOFTWARE IS PROVIDED BY DAVID HALL ``AS IS'' AND ANY 14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | * DISCLAIMED. IN NO EVENT SHALL DAVID HALL BE LIABLE FOR ANY 17 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | */ 24 | package smr.hadoop; 25 | import smr._; 26 | import org.apache.hadoop.io._; 27 | import org.apache.hadoop.conf._; 28 | import org.apache.hadoop.fs._; 29 | import org.apache.hadoop.util._; 30 | 31 | import Hadoop.DefaultKey; 32 | /** 33 | * A fairly default mapper that only modifies values. 34 | * Ignores input key, and uses the default key for output. 35 | */ 36 | class TransformMapper[T,U](f:Iterator[T]=>Iterator[U]) extends Mapper[Any,T,DefaultKey,U] { 37 | override def map(it : Iterator[(Any,T)]):Iterator[(DefaultKey,U)] = { 38 | f(it.map(_._2)).map( x => (Hadoop.mkDefaultKey(x),x)); 39 | } 40 | 41 | override def getFunClass = f.getClass; 42 | } 43 | -------------------------------------------------------------------------------- /src/smr/hadoop/CollectorMapper.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2008, David Hall 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * 13 | * THIS SOFTWARE IS PROVIDED BY DAVID HALL ``AS IS'' AND ANY 14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | * DISCLAIMED. 
IN NO EVENT SHALL DAVID HALL BE LIABLE FOR ANY 17 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | */ 24 | package smr.hadoop; 25 | import smr._; 26 | import org.apache.hadoop.io._; 27 | import org.apache.hadoop.conf._; 28 | import org.apache.hadoop.fs._; 29 | import org.apache.hadoop.util._; 30 | 31 | import Hadoop.DefaultKey; 32 | /** 33 | * Mapper than ensures all output values goes to the same Reducer by using a default key;. 34 | * Ignores input key. 35 | */ 36 | class CollectorMapper[T,U](f:Iterator[T]=>Iterator[U]) extends Mapper[Any,T,DefaultKey,U] { 37 | override def map(it : Iterator[(Any,T)]):Iterator[(DefaultKey,U)] = { 38 | val w = Hadoop.mkDefaultKey(); 39 | f(it.map(_._2)).map( (w,_)); 40 | } 41 | 42 | override def getFunClass = f.getClass; 43 | } 44 | -------------------------------------------------------------------------------- /src/smr/hadoop/PairCollectorMapper.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2008, David Hall 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * 13 | * THIS SOFTWARE IS PROVIDED BY DAVID HALL ``AS IS'' AND ANY 14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | * DISCLAIMED. IN NO EVENT SHALL DAVID HALL BE LIABLE FOR ANY 17 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | */ 24 | package smr.hadoop; 25 | import smr._; 26 | import org.apache.hadoop.io._; 27 | import org.apache.hadoop.conf._; 28 | import org.apache.hadoop.fs._; 29 | import org.apache.hadoop.util._; 30 | 31 | import Hadoop.DefaultKey; 32 | /** 33 | * Mapper than ensures all output values goes to the same Reducer by using a default key;. 34 | * Ignores input key. 
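 *
 * In this pair variant, f still receives the whole (key,value) pairs; each
 * output pair is then emitted under one shared default key, so a single
 * reduce call sees them all. A minimal sketch (illustrative types) that
 * funnels every pair to one reducer unchanged:
 *
 *   val funnel = new PairCollectorMapper[String,Int,String,Int](it => it);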
35 | */ 36 | class PairCollectorMapper[K1,V1,K2,V2](f:Iterator[(K1,V1)]=>Iterator[(K2,V2)]) extends Mapper[K1,V1,DefaultKey,(K2,V2)] { 37 | override def map(it : Iterator[(K1,V1)]):Iterator[(DefaultKey,(K2,V2))] = { 38 | val w = Hadoop.mkDefaultKey(); 39 | f(it).map( (w,_)); 40 | } 41 | 42 | override def getFunClass = f.getClass; 43 | } 44 | -------------------------------------------------------------------------------- /src/smr/actors/TransActor.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2008, David Hall 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * 13 | * THIS SOFTWARE IS PROVIDED BY DAVID HALL ``AS IS'' AND ANY 14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | * DISCLAIMED. IN NO EVENT SHALL DAVID HALL BE LIABLE FOR ANY 17 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | */ 24 | package smr.actors; 25 | import scala.actors._; 26 | import scala.actors.remote.RemoteActor._; 27 | import scala.actors.remote.Node; 28 | import java.net.InetAddress; 29 | 30 | /** 31 | * Trait to help enable serialization of actors. 32 | * Should only be used with actors, but whatever. 33 | * 34 | * @author(dlwh) 35 | */ 36 | trait TransActor extends Actor { 37 | val port : Int; 38 | val sym : Symbol; 39 | } 40 | 41 | sealed case class SerializedActor(node : Node, sym : Symbol); 42 | 43 | object TransActor { 44 | def transActor(port_ : Int, sym_ : Symbol)( body: =>Unit) = new Actor with TransActor { 45 | val port = port_; 46 | val sym = sym_; 47 | start(); 48 | override def act() { 49 | alive(port); 50 | register(sym,Actor.self); 51 | body 52 | } 53 | } 54 | 55 | implicit def TransActorToSerializedActor(a : TransActor) : SerializedActor= { 56 | SerializedActor(new Node(InetAddress.getLocalHost.getHostName(),a.port),a.sym); 57 | } 58 | 59 | /** 60 | * Converts the serialized version of remote actors into a proxy. Note that this is not a 61 | * TransActor, so it cannot be sent down the wire. 62 | */ 63 | implicit def SerializedActorToActor(a :SerializedActor) : OutputChannel[Any] = select(a.node,a.sym); 64 | 65 | } 66 | 67 | -------------------------------------------------------------------------------- /src/smr/Functions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2008, David Hall 3 | * All rights reserved. 
4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * 13 | * THIS SOFTWARE IS PROVIDED BY DAVID HALL ``AS IS'' AND ANY 14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | * DISCLAIMED. IN NO EVENT SHALL DAVID HALL BE LIABLE FOR ANY 17 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | */ 24 | package smr; 25 | 26 | import scala.util.matching._ 27 | 28 | /** 29 | * Provides a number of useful utility functions for common maps and reduces. 30 | */ 31 | object Functions { 32 | 33 | // primarily maps 34 | /** 35 | * For every string in the input that matches the regex, output (match,1) 36 | * Ignores key. 37 | */ 38 | def countMatches[K](r : Regex) : ((K,String))=>Iterator[(String,Int)] = { case (k, s) => 39 | for(m <- r.findAllIn(s)) yield (m,1) 40 | } 41 | 42 | /** 43 | * Tokenizes the outputs by deliminators as in {@link java.util.StringTokenizer} 44 | */ 45 | def countTokens[K](delim : String) : ((K,String))=>Iterator[(String,Int)] = { case (k, s) => 46 | tokIterator(s,delim) map ( (_,1)); 47 | } 48 | 49 | /** 50 | * Swaps the key and the value. 51 | */ 52 | def swap[K,V]( pair : (K,V)) = pair match { case (k,v) => (v,k) } 53 | 54 | // Functions intended for reduce mostly: 55 | def sum(it : Iterator[Int]) = it.reduceLeft(_+_); 56 | def sum(it : Iterator[Float]) = it.reduceLeft(_+_); 57 | def sum(it : Iterator[Double]) = it.reduceLeft(_+_); 58 | def sum(it : Iterator[Long]) = it.reduceLeft(_+_); 59 | 60 | 61 | private def tokIterator(s : String, delim : String) = new Iterator[String] { 62 | private val mine = new java.util.StringTokenizer(s,delim); 63 | def hasNext = mine.hasMoreTokens; 64 | def next = mine.nextToken; 65 | } 66 | 67 | def identityReduce[K,V]( k : K, v: Iterator[V]) = v.map( (k,_)); 68 | } 69 | -------------------------------------------------------------------------------- /src/smr/hadoop/Reduce.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2008, David Hall 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 
12 | * 13 | * THIS SOFTWARE IS PROVIDED BY DAVID HALL ``AS IS'' AND ANY 14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | * DISCLAIMED. IN NO EVENT SHALL DAVID HALL BE LIABLE FOR ANY 17 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | */ 24 | package smr.hadoop; 25 | import smr._; 26 | 27 | import org.apache.hadoop.io._; 28 | import org.apache.hadoop.conf._; 29 | import org.apache.hadoop.fs._; 30 | import org.apache.hadoop.util._; 31 | 32 | @serializable 33 | trait Reduce[-K1,-V1,+K2,+V2] { 34 | def reduce(key : K1, it: Iterator[V1]): Iterator[(K2,V2)]; 35 | } 36 | 37 | import Hadoop._; 38 | class RealReduce[T](f : (T,T)=>T) extends Reduce[DefaultKey,T,DefaultKey,T] { 39 | override def reduce(k: DefaultKey, it :Iterator[T]) : Iterator[(DefaultKey,T)] = { 40 | Iterator.single((k,it.reduceLeft(f))); 41 | } 42 | } 43 | 44 | /** 45 | * Reduce that takes (K,[V]) and returns (car[V],K) 46 | */ 47 | class KeyToValReduce[K,V] extends Reduce[K,V,V,K] { 48 | override def reduce(k : K, it : Iterator[V]) : Iterator[(V,K)] = { 49 | it.take(1).map( (_,k)); 50 | } 51 | } 52 | 53 | class IdentityReduce[K,V] extends Reduce[K,V,K,V] { 54 | override def reduce(k : K, it :Iterator[V]): Iterator[(K,V)] = { 55 | it.map((k,_)); 56 | } 57 | } 58 | 59 | /** 60 | * Very simple wrapper class that exposes Hadoop's Reduce more or less exactly. 61 | */ 62 | class FlatReduce[K1,V1,K2,V2](f : (K1,Iterator[V1])=>Iterator[(K2,V2)]) extends Reduce[K1,V1,K2,V2] { 63 | def reduce(key : K1, it : Iterator[V1]) = f(key,it); 64 | } 65 | 66 | /** 67 | * Very simple wrapper class that's like FlatReduce, but only one value per key. 68 | */ 69 | class PairReduce[K1,V1,K2,V2](f : (K1,Iterator[V1])=>(K2,V2)) extends Reduce[K1,V1,K2,V2] { 70 | def reduce(key : K1, it : Iterator[V1]) = Iterator.single(f(key,it)); 71 | } 72 | -------------------------------------------------------------------------------- /src/smr/hadoop/ReduceWrapper.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2008, David Hall 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * 13 | * THIS SOFTWARE IS PROVIDED BY DAVID HALL ``AS IS'' AND ANY 14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | * DISCLAIMED. 
IN NO EVENT SHALL DAVID HALL BE LIABLE FOR ANY 17 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | */ 24 | package smr.hadoop; 25 | import smr._; 26 | 27 | import org.apache.hadoop.io._; 28 | import org.apache.hadoop.conf._; 29 | import org.apache.hadoop.fs._; 30 | import org.apache.hadoop.util._; 31 | import org.apache.hadoop.mapred._; 32 | import org.apache.hadoop.filecache._; 33 | 34 | 35 | private class ReduceWrapper[K1,V1,K2,V2] extends Reducer[Writable,Writable,Writable,Writable] { 36 | import Hadoop._; 37 | var r : Reduce[K1,V1,K2,V2] = _; 38 | override def configure(conf : JobConf) { 39 | val mapString = conf.get("smr.job.reducer.file"); 40 | val localFiles = DistributedCache.getCacheFiles(conf); 41 | val mapFile = new Path(localFiles.filter(_.toString==mapString)(0).toString); 42 | val inputStream = new java.io.ObjectInputStream(mapFile.getFileSystem(conf).open(mapFile)); 43 | r = inputStream.readObject().asInstanceOf[Reduce[K1,V1,K2,V2]]; 44 | inputStream.close(); 45 | } 46 | 47 | private def xClass(x : Any) = x.asInstanceOf[AnyRef].getClass 48 | 49 | override def reduce(k : Writable, 50 | it: java.util.Iterator[Writable], 51 | output : OutputCollector[Writable,Writable], 52 | rp: Reporter) { 53 | val newK = Magic.wireToReal(k).asInstanceOf[K1]; 54 | 55 | val it2 = new Iterator[V1] { 56 | def hasNext = it.hasNext; 57 | def next = { 58 | i+=1; 59 | if(i%100 ==0) rp.progress(); 60 | Magic.wireToReal(it.next).asInstanceOf[V1]; 61 | } 62 | var i = 0; 63 | } 64 | r.reduce(newK,it2).foreach { case (k,v)=> 65 | output.collect(Magic.realToWire(k),Magic.realToWire(v)); 66 | } 67 | } 68 | def close() {} 69 | } 70 | -------------------------------------------------------------------------------- /src/smr/hadoop/ClosureMapper.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2008, David Hall 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * 13 | * THIS SOFTWARE IS PROVIDED BY DAVID HALL ``AS IS'' AND ANY 14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | * DISCLAIMED. 
IN NO EVENT SHALL DAVID HALL BE LIABLE FOR ANY 17 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | */ 24 | package smr.hadoop; 25 | import smr._; 26 | 27 | import org.apache.hadoop.io._; 28 | import org.apache.hadoop.conf._; 29 | import org.apache.hadoop.fs._; 30 | import org.apache.hadoop.util._; 31 | import org.apache.hadoop.mapred._; 32 | import org.apache.hadoop.filecache._; 33 | 34 | class ClosureMapper[K1,V1,K2,V2] extends MapRunnable[Writable,Writable,Writable,Writable] { 35 | import Hadoop._; 36 | var m :Mapper[K1,V1,K2,V2] = _; 37 | 38 | override def configure(conf : JobConf) { 39 | val mapString = conf.get("smr.job.mapper.file"); 40 | val localFiles = DistributedCache.getCacheFiles(conf); 41 | val mapFile = new Path(localFiles.filter(_.toString==mapString)(0).toString); 42 | val inputStream = new java.io.ObjectInputStream(mapFile.getFileSystem(conf).open(mapFile)); 43 | m = inputStream.readObject().asInstanceOf[Mapper[K1,V1,K2,V2]]; 44 | inputStream.close(); 45 | } 46 | final override def run( 47 | input: RecordReader[Writable,Writable], 48 | output : OutputCollector[Writable,Writable], 49 | r : Reporter) { 50 | var i = 0; 51 | val it = Util.iteratorFromProducer { () => 52 | i+=1 53 | if(i%100==0) r.progress(); 54 | val k = input.createKey().asInstanceOf[Writable]; 55 | val v = input.createValue().asInstanceOf[Writable]; 56 | input.next(k,v) match { 57 | case true => Some((Magic.wireToReal(k).asInstanceOf[K1],Magic.wireToReal(v).asInstanceOf[V1])) 58 | case false => None 59 | } 60 | } 61 | m.map(it).foreach { case (k2,v2) => 62 | output.collect(Magic.realToWire(k2),Magic.realToWire(v2)); 63 | } 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/smr/DistributedIterable.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2008, David Hall 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * 13 | * THIS SOFTWARE IS PROVIDED BY DAVID HALL ``AS IS'' AND ANY 14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | * DISCLAIMED. 
IN NO EVENT SHALL DAVID HALL BE LIABLE FOR ANY 17 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | */ 24 | package smr; 25 | 26 | import scala.reflect.Manifest; 27 | 28 | /** 29 | * A variant of {@link scala.Iterable} that's more amenable to distribution. 30 | * The design goal is to make these lazy by default, but the ActorDistributor 31 | * returns eager iterables at the moment. 32 | */ 33 | trait DistributedIterable[+T] { self => 34 | def elements : Iterator[T]; 35 | def map[B](f : T=>B)(implicit m : Manifest[B]): DistributedIterable[B]; 36 | def flatMap[U](f : T=>Iterable[U])(implicit m: Manifest[U]) : DistributedIterable[U]; 37 | def filter(f : T=>Boolean) : DistributedIterable[T]; 38 | 39 | /** 40 | * Process any computations that have been cached and return a new 41 | * DistributedIterable with those results. 42 | */ 43 | def force() : DistributedIterable[T]; 44 | 45 | /** 46 | * Sadly, both versions of reduce in the Scala libs are not fully associative, 47 | * which is required for a parallel reduce. This version of reduce demands 48 | * that the operators are associative. 49 | */ 50 | def reduce[B >: T](f : (B,B)=>B): B; 51 | 52 | /** 53 | * for each element, reshard the data by group(t)'s hashcode and create a new 54 | * Iterable with those elements. 55 | */ 56 | //def groupBy[U](group : T=>U) : DistributedIterable[(U,Iterable[T])]; 57 | 58 | /** 59 | * Removes all copies of the elements. 60 | */ 61 | def distinct() : DistributedIterable[T]; 62 | 63 | def toIterable : Iterable[T] = new Iterable[T] { 64 | def elements = self.elements; 65 | } 66 | 67 | // compatibility: will be removed soon: 68 | @deprecated 69 | def mapReduce[U,B>:U](f : T=>U)(r : (B,B)=>B)(implicit mU:Manifest[U]) = map(f).reduce(r); 70 | @deprecated 71 | def lazyMap[U](f : T=>U)(implicit mU:Manifest[U])= map(f); 72 | } 73 | 74 | 75 | -------------------------------------------------------------------------------- /src/smr/DistributedPairs.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2008, David Hall 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * 13 | * THIS SOFTWARE IS PROVIDED BY DAVID HALL ``AS IS'' AND ANY 14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | * DISCLAIMED. 
IN NO EVENT SHALL DAVID HALL BE LIABLE FOR ANY 17 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | */ 24 | package smr; 25 | 26 | import scala.reflect.Manifest; 27 | 28 | /** 29 | * Represents a Distributed Iterable over Pairs. Distinct from a 30 | * DistributedIterable[(K,V)] because it's designed for the MapReduce framework 31 | * and hence only supports operations that yield pairs. 32 | */ 33 | trait DistributedPairs[+K,+V] { self => 34 | def elements : Iterator[(K,V)]; 35 | def map[J,U](f : ((K,V))=>(J,U))(implicit m : Manifest[J], mU:Manifest[U]): DistributedPairs[J,U]; 36 | def flatMap[J,U](f : ((K,V))=>Iterable[(J,U)])(implicit m: Manifest[J], mU:Manifest[U]) : DistributedPairs[J,U] 37 | def filter(f : ((K,V))=>Boolean) : DistributedPairs[K,V]; 38 | 39 | /** 40 | * Process any computations that have been cached and return a new 41 | * DistributedPairs with those results. 42 | */ 43 | def force() : DistributedPairs[K,V]; 44 | 45 | /** 46 | * Models MapReduce style reduce more exactly. 47 | */ 48 | def flatReduce[J,U](f : (K,Iterator[V])=>Iterator[(J,U)])(implicit m : Manifest[J], mU:Manifest[U]): DistributedPairs[J,U]; 49 | 50 | /** 51 | * For a slightly more "classic" reduce that outputs exactly one item for each input. Still not Scala's reduce. 52 | */ 53 | def reduce[J,U](f : (K,Iterator[V])=>(J,U))(implicit m : Manifest[J], mU:Manifest[U]): DistributedPairs[J,U]; 54 | 55 | def mapFirst[J](f: K=>J)(implicit m:Manifest[J]) : DistributedPairs[J,V]; 56 | def mapSecond[U](f: V=>U)(implicit m:Manifest[U]) : DistributedPairs[K,U]; 57 | 58 | def toIterable : Iterable[(K,V)] = new Iterable[(K,V)] { 59 | def elements = self.elements; 60 | } 61 | 62 | /** 63 | * Checkpoints a chain of operations, saving the output for later use (in say, a future run) 64 | */ 65 | def asStage(name : String) : DistributedPairs[K,V]; 66 | 67 | /** 68 | * Appends two pairs together. 69 | */ 70 | def ++[SK>:K,SV>:V](other : DistributedPairs[SK,SV])(implicit mSK : Manifest[SK], mSV:Manifest[SV]) : DistributedPairs[SK,SV]; 71 | } 72 | 73 | 74 | -------------------------------------------------------------------------------- /src/smr/hadoop/Implicits.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2008, David Hall 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * 13 | * THIS SOFTWARE IS PROVIDED BY DAVID HALL ``AS IS'' AND ANY 14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | * DISCLAIMED. 
IN NO EVENT SHALL DAVID HALL BE LIABLE FOR ANY 17 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | */ 24 | package smr.hadoop; 25 | import smr._; 26 | 27 | import org.apache.hadoop.io._; 28 | import org.apache.hadoop.conf._; 29 | import org.apache.hadoop.fs._; 30 | import org.apache.hadoop.util._; 31 | import org.apache.hadoop.mapred._; 32 | 33 | /** 34 | * See {@link Implicits$} 35 | */ 36 | trait Implicits { 37 | 38 | implicit def stringToPath(s : String) = new Path(s); 39 | /** 40 | * An awful lot of Path operations take the form p.getFileSystem(conf).doFoo(p). 41 | * This makes some of them look like "p.doFoo" 42 | */ 43 | implicit def pathToUsefulPath(p : Path)(implicit conf : Configuration) = new UsefulPath(p,conf); 44 | 45 | class UsefulPath(p : Path, conf: Configuration) { 46 | def mkdirs() { p.getFileSystem(conf).mkdirs(p);} 47 | def createNewFile() = { p.getFileSystem(conf).create(p);} 48 | def exists() = { p.getFileSystem(conf).exists(p);} 49 | def listFiles() = {p.getFileSystem(conf).globStatus(new Path(p,"*")).map(_.getPath);} 50 | def length() = {p.getFileSystem(conf).getLength(p)} 51 | def moveTo(dst: Path) = { p.getFileSystem(conf).rename(p,dst)} 52 | def deleteOnExit() = {p.getFileSystem(conf).deleteOnExit(p);} 53 | def delete() = {p.getFileSystem(conf).delete(p)} 54 | } 55 | 56 | implicit def recordReaderToIterator[K,V](r : RecordReader[K,V]) = new Iterator[(K,V)] { 57 | def hasNext = kv match { 58 | case None => readNext(); hasN; 59 | case Some(_) => true; 60 | } 61 | 62 | def next = { 63 | val nx = kv.get 64 | kv = None; 65 | nx; 66 | } 67 | 68 | private def readNext() { 69 | val v = r.createValue; 70 | val k = r.createKey; 71 | try { 72 | hasN = r.next(k,v); 73 | if(!hasN) r.close(); 74 | else kv = Some((k,v)) 75 | } catch { 76 | case e => r.close(); 77 | } 78 | } 79 | 80 | private var kv :Option[(K,V)] = None; 81 | private var hasN = true; 82 | } 83 | 84 | implicit def fromWritable(w : IntWritable) = w.get(); 85 | implicit def fromWritable(w : LongWritable) = w.get(); 86 | implicit def fromWritable(w : DoubleWritable) = w.get(); 87 | implicit def fromWritable(w : ByteWritable) = w.get(); 88 | implicit def fromWritable(w : FloatWritable) = w.get(); 89 | implicit def fromText(t : Text) = t.toString(); 90 | def fromWritable[T](w : ObjectWritable) : Any = w.get() 91 | 92 | } 93 | 94 | /** 95 | * Provides a number of implicit conversions for using the lower level Hadoop API. 96 | * Not really necessary if you stick with SMR's magic. 97 | */ 98 | object Implicits extends Implicits{ 99 | } 100 | -------------------------------------------------------------------------------- /src/smr/actors/Hub.scala: -------------------------------------------------------------------------------- 1 | // 2 | // an smr hub keeps track of a bunch of workers 3 | // 4 | // dramage 2008 5 | // 6 | 7 | package smr.actors; 8 | 9 | import scala.actors.Actor 10 | import scala.actors.OutputChannel 11 | import scala.actors.Actor._ 12 | import scala.actors.remote.Node 13 | import scala.actors.remote.RemoteActor._ 14 | 15 | /** 16 | * A hub stores a list of active Workers. 
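 *
 * The protocol, as implemented below: a worker announces itself with
 * HubRegister(name, machine, port); clients send HubListRequest and get back
 * HubListResponse carrying the current worker list. Client-side sketch
 * (host and port are illustrative):
 *
 *   val dist = Hub.distributor("hubhost", 9000); // an ActorDistributor over
 *                                                // every registered worker
 *   println(dist.distribute(1 to 100).reduce(_+_));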
17 | */ 18 | class Hub(port : Int) extends Actor { 19 | import Hub._ 20 | def this() = this(Util.freePort); 21 | 22 | private var workers : List[(Symbol,String,Int)] = Nil 23 | 24 | println("Hub: registering as hub on port " + port) 25 | 26 | start() 27 | 28 | override def act() { 29 | alive(port) 30 | register('hub, self) 31 | 32 | loop { 33 | println("Hub: ready") 34 | react { 35 | case HubRegister(name,machine,port) => 36 | println("Hub: registering "+(name,machine,port)) 37 | workers = (name,machine,port) :: workers 38 | case r:HubListRequest => 39 | println("Hub: listing to "+sender) 40 | reply { HubListResponse(workers) } 41 | case x:Any => 42 | println("Hub: other message "+x) 43 | } 44 | } 45 | } 46 | 47 | println("access"); 48 | scala.actors.remote.RemoteActor.classLoader = classOf[Hub].getClassLoader 49 | } 50 | 51 | /** 52 | * Provides a mechanism to get a distributor that works across the registered 53 | * classes. 54 | */ 55 | object Hub { 56 | case class HubRegister(name : Symbol, machine : String, port : Int) 57 | case class HubListRequest 58 | case class HubListResponse(workers : List[(Symbol,String,Int)]) 59 | 60 | import Util._; 61 | 62 | def apply(machine : String, port : Int) : OutputChannel[Any] = { 63 | scala.actors.remote.RemoteActor.classLoader = classOf[Hub].getClassLoader 64 | return select(Node(machine,port),'hub) 65 | } 66 | 67 | def workers(machine : String, port : Int) = { 68 | (Hub(machine,port) ! Hub.HubListRequest()); 69 | val workers : List[(Symbol,String,Int)] = self.?.asInstanceOf[HubListResponse].workers; 70 | workers map ((x:(Symbol,String,Int)) => select(Node(x._2,x._3),x._1)) 71 | } 72 | 73 | def distributor(machine : String, port : Int) : ActorDistributor = { 74 | val distributor = new ActorDistributor(0, freePort()) 75 | workers(machine,port) foreach distributor.addWorker 76 | distributor 77 | } 78 | 79 | def list(machine : String, port : Int) { 80 | (Hub(machine,port) ! Hub.HubListRequest()); 81 | val workers : List[(Symbol,String,Int)] = self.?.asInstanceOf[HubListResponse].workers 82 | 83 | workers foreach println 84 | } 85 | } 86 | 87 | /** 88 | * Spawns a new hub 89 | */ 90 | object SpawnHub { 91 | def main(argv : Array[String]) { 92 | scala.actors.remote.RemoteActor.classLoader = classOf[Hub].getClassLoader 93 | scala.actors.Debug.level = 10; 94 | 95 | new Hub() 96 | } 97 | } 98 | 99 | /** 100 | * Spawns a worker thread, registering it with the hub named in argv. 101 | * TODO: error message argv != Array("host","port"). 102 | */ 103 | object SpawnWorker { 104 | def main(argv : Array[String]) { 105 | scala.actors.Debug.level = 10; 106 | if(argv.length < 2) { 107 | println("Syntax: SpawnWorker [numWorkers] [classLoader class]"); 108 | } 109 | 110 | scala.actors.remote.RemoteActor.classLoader = if(argv.length > 3) Class.forName(argv(3)).getClassLoader else classOf[Hub].getClassLoader 111 | 112 | val numWorkers = if(argv.length > 2) java.lang.Integer.parseInt(argv(2)) else Runtime.getRuntime.availableProcessors; 113 | 114 | val hub = select(Node(argv(0),java.lang.Integer.parseInt(argv(1))), 'hub) 115 | 116 | val host = java.net.InetAddress.getLocalHost().getHostName() 117 | 118 | for(i <- 1 to numWorkers) { 119 | var port = Util.freePort() 120 | val worker = Worker(port,'worker); 121 | hub ! 
Hub.HubRegister('worker, host, port) 122 | } 123 | 124 | println("Worker: starting") 125 | } 126 | } 127 | 128 | -------------------------------------------------------------------------------- /src/smr/Util.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2008, David Hall 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * 13 | * THIS SOFTWARE IS PROVIDED BY DAVID HALL ``AS IS'' AND ANY 14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | * DISCLAIMED. IN NO EVENT SHALL DAVID HALL BE LIABLE FOR ANY 17 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | */ 24 | package smr; 25 | import scala.actors.Actor._; 26 | import scala.actors.Actor; 27 | 28 | object Util { 29 | def identity[T] = new SerFunction1[T,T] { 30 | def apply(x : T) = x; 31 | }; 32 | def fMap[T,U](f : T=>U) = new SerFunction1[Iterable[T],Iterable[U]] { 33 | def apply(x : Iterable[T]) = x.map(f); 34 | }; 35 | 36 | def fFlatMap[T,U](f : T=>Iterable[U]) = new SerFunction1[Iterable[T],Iterable[U]] { 37 | def apply(x : Iterable[T]) = x.flatMap(f); 38 | }; 39 | 40 | def fFilter[T](f : T=>Boolean) = new SerFunction1[Iterable[T],Iterable[T]] { 41 | def apply(x : Iterable[T]) = x.filter(f); 42 | }; 43 | 44 | def itMap[T,U](f : T=>U) = new SerFunction1[Iterator[T],Iterator[U]] { 45 | def apply(x : Iterator[T]) = x.map(f); 46 | }; 47 | 48 | def itFlatMap[T,U](f : T=>Iterable[U]) = new SerFunction1[Iterator[T],Iterator[U]] { 49 | def apply(it : Iterator[T]) = for(x <- it; y <- f(x).elements) yield y; 50 | }; 51 | 52 | def itFilter[T](f : T=>Boolean) = new SerFunction1[Iterator[T],Iterator[T]] { 53 | def apply(x : Iterator[T]) = x.filter(f); 54 | }; 55 | 56 | // g(f(x)) 57 | def andThen[A,B,C](f: A=>B, g:B=>C) = new SerFunction1[A,C] { 58 | def apply(a : A) = g(f(a)); 59 | } 60 | 61 | def freePort() : Int = { 62 | val server = new java.net.ServerSocket(0); 63 | val port = server.getLocalPort(); 64 | server.close(); 65 | return port; 66 | } 67 | 68 | /** 69 | * Iterator that reacts to get the next element. 70 | * Used internally to 71 | */ 72 | class ActorIterator[T] extends Iterator[T] { 73 | def hasNext() = !nulled && (cache match { 74 | case Some(x) => true; 75 | case _ => 76 | (receiver !? Poll) match { 77 | case None => nulled = true; receiver ! 
Close; false; 78 | case opt @ Some(x) => cache = opt.asInstanceOf[Option[T]]; true; 79 | } 80 | }) 81 | def next() = {hasNext(); val x = cache.get; cache = None; x} 82 | 83 | val receiver = actor { 84 | loop { 85 | react { 86 | case Poll => reply {Actor.?} 87 | case Close => exit(); 88 | } 89 | } 90 | } 91 | private var nulled = false; 92 | private var cache : Option[T] = None; 93 | private case class Poll; 94 | private case class Close; 95 | } 96 | 97 | def iteratorFromProducer[T](p : ()=>Option[T]) = new Iterator[T] { 98 | private var nxt : Option[T] = None; 99 | private var hasN = true; 100 | 101 | private def readNext() = p() match { 102 | case o @ Some(t) => nxt = o; 103 | case None=>nxt=None; hasN = false; 104 | } 105 | 106 | def hasNext = hasN && (nxt match { 107 | case Some(t) => true; 108 | case None => readNext(); hasN; 109 | }) 110 | 111 | def next : T = nxt match { 112 | case Some(t) => nxt=None; t; 113 | case None=> readNext(); if(hasN) next else throw new NoSuchElementException(); 114 | } 115 | } 116 | 117 | } 118 | -------------------------------------------------------------------------------- /src/smr/hadoop/Magic.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2008, David Hall 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * 13 | * THIS SOFTWARE IS PROVIDED BY DAVID HALL ``AS IS'' AND ANY 14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | * DISCLAIMED. IN NO EVENT SHALL DAVID HALL BE LIABLE FOR ANY 17 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
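// Usage sketch for Util.iteratorFromProducer above (illustrative): it adapts a
// pull function returning Option into an Iterator and lets the producer clean
// up when it runs dry.
import smr.Util;
val rdr = new java.io.BufferedReader(new java.io.FileReader("input.txt"));
val lines = Util.iteratorFromProducer { () =>
  val line = rdr.readLine();
  if(line == null) { rdr.close(); None } else Some(line);
}
lines foreach println;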
23 | */ 24 | package smr.hadoop; 25 | import smr._; 26 | 27 | import java.io._; 28 | 29 | import org.apache.hadoop.io._; 30 | import org.apache.hadoop.conf._; 31 | import org.apache.hadoop.fs._; 32 | import org.apache.hadoop.util._; 33 | import org.apache.hadoop.mapred._; 34 | import org.apache.hadoop.filecache._; 35 | 36 | import scala.reflect.Manifest; 37 | 38 | // You know it's bad when you have a class called magic 39 | private object Magic { 40 | def wireToReal(t : Writable) :Any = t match { 41 | case t :Text => t.toString; 42 | case arr : ArrayWritable => arr.get().map(wireToReal); 43 | case t => try { 44 | t.asInstanceOf[{def get():Any;}].get(); 45 | } catch { 46 | case e => t; 47 | } 48 | } 49 | 50 | implicit def realToWire(t : Any):Writable = t match { 51 | case t : Writable => t; 52 | case t : Int => new IntWritable(t); 53 | case t : Long => new LongWritable(t); 54 | //case t : Byte => new ByteWritable(t); 55 | case t : Float => new FloatWritable(t); 56 | //case t : Double => new DoubleWritable(t); 57 | case t : Boolean => new BooleanWritable(t); 58 | case t : String => new Text(t); 59 | case t : Array[Byte] => new BytesWritable(t); 60 | case x : AnyRef if x.getClass.isArray => { 61 | val t = x.asInstanceOf[Array[Any]]; 62 | if(t.length == 0) new AnyWritable(t); 63 | else { 64 | val mapped = t.map(realToWire); 65 | val classes = mapped.map(_.getClass); 66 | if(classes.forall(classes(0)==_)) { 67 | // can only use ArrayWritable if all Writables are the same. 68 | new ArrayWritable(classes(0),mapped); 69 | } else { 70 | // fall back on AnyWritable 71 | val mapped = t.map(new AnyWritable[Any](_).asInstanceOf[Writable]); 72 | new ArrayWritable(classOf[AnyWritable[_]],mapped); 73 | } 74 | } 75 | } 76 | case _ => new AnyWritable(t); 77 | } 78 | 79 | private val CInt = classOf[Int]; 80 | private val CLong = classOf[Long]; 81 | private val CByte = classOf[Byte]; 82 | private val CDouble = classOf[Double]; 83 | private val CFloat = classOf[Float]; 84 | private val CBoolean = classOf[Boolean]; 85 | private val CString = classOf[String]; 86 | private val CArrayByte = classOf[Array[Byte]]; 87 | private val CArray = classOf[Array[_]]; 88 | 89 | def mkManifest[T](c:Class[T]) = new Manifest[T] { 90 | def erasure = c; 91 | } 92 | 93 | private val CWritable = mkManifest(classOf[Writable]); 94 | 95 | def classToWritableClass[T](c: Class[T]):Class[Writable] = c match { 96 | case c if mkManifest(c) <:< CWritable => c.asInstanceOf[Class[Writable]]; 97 | case CInt => classOf[IntWritable].asInstanceOf[Class[Writable]]; 98 | case CLong => classOf[LongWritable].asInstanceOf[Class[Writable]]; 99 | // case CByte => classOf[ByteWritable].asInstanceOf[Class[Writable]]; 100 | //case CDouble => classOf[DoubleWritable].asInstanceOf[Class[Writable]]; 101 | case CFloat => classOf[FloatWritable].asInstanceOf[Class[Writable]]; 102 | case CBoolean => classOf[BooleanWritable].asInstanceOf[Class[Writable]]; 103 | case CString => classOf[Text].asInstanceOf[Class[Writable]]; 104 | case CArrayByte => classOf[BytesWritable].asInstanceOf[Class[Writable]]; 105 | case CArray => classOf[ArrayWritable].asInstanceOf[Class[Writable]]; 106 | case _ => classOf[AnyWritable[_]].asInstanceOf[Class[Writable]]; 107 | } 108 | 109 | } 110 | -------------------------------------------------------------------------------- /src/smr/Defaults.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2008, David Hall 3 | * All rights reserved. 
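// Behavior sketch for Magic above (Magic is package-private, so this is a
// description of its conversions rather than a public API):
// realToWire(42)             // => IntWritable
// realToWire("hello")        // => Text
// realToWire(Array(1,2,3))   // => ArrayWritable of IntWritable (uniform element class)
// realToWire(Array(1,"a"))   // => ArrayWritable of AnyWritable (mixed classes fall back)
// wireToReal(new Text("hi")) // => "hi"; other Writables are unwrapped through the
//                            //    structural `def get():Any` when one is available.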
4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * 13 | * THIS SOFTWARE IS PROVIDED BY DAVID HALL ``AS IS'' AND ANY 14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | * DISCLAIMED. IN NO EVENT SHALL DAVID HALL BE LIABLE FOR ANY 17 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | */ 24 | package smr; 25 | import scala.collection.mutable._; 26 | import scala.reflect.Manifest; 27 | 28 | /** 29 | * Object to hold various sensible Defaults for SMR. Expected use: 30 | *
 31 |  * import smr.Defaults._;
 32 |  * 
33 | * 34 | @author dlwh 35 | */ 36 | object Defaults { 37 | 38 | /** 39 | * Implicit shard function that provides a reasonable default in most cases. Special treatment for 40 | * Ranges and for Seqs. 41 | */ 42 | implicit def shard[T] (it : Iterable[T], numShards : Int) : List[Iterable[T]] = it match { 43 | case x : scala.Range.Inclusive => shardIRange(x,numShards).asInstanceOf[List[Iterable[T]]]; 44 | case x : scala.Range=> shardRange(x,numShards).asInstanceOf[List[Iterable[T]]]; 45 | case x : Seq[_] => 46 | if(x.size < numShards) { 47 | List(x) 48 | } else { 49 | val sz = x.size / numShards; 50 | val arrs = new ArrayBuffer[Iterable[T]] 51 | arrs ++= (for(val i <- 0 until numShards ) yield (if(i == numShards-1) x.drop(sz * i).toList else x.drop(sz * i).take(sz).toList)); // last shard keeps the remainder 52 | arrs.toList 53 | } 54 | case _ => 55 | val arrs = new ArrayBuffer[ArrayBuffer[T]] 56 | arrs ++= (for(val i <- 1 to numShards) yield new ArrayBuffer[T]); 57 | val elems = it.elements 58 | var i = 0; 59 | while(elems.hasNext) { arrs(i%numShards) += elems.next; i += 1} 60 | arrs.toList 61 | } 62 | implicit def fakeDistributedIterable[T](it : Iterable[T]):DistributedIterable[T] = new DistributedIterable[T] { 63 | override def map[U](f : T=>U)(implicit mU : Manifest[U]) = fakeDistributedIterable(it.map(f)); 64 | override def flatMap[U](f : T=>Iterable[U])(implicit mU : Manifest[U])= fakeDistributedIterable(it.flatMap(f)); 65 | override def filter(f : T=>Boolean) = fakeDistributedIterable(it.filter(f)); 66 | override def reduce[U>:T](f : (U,U)=>U) = it.reduceLeft(f); 67 | def groupBy[U](grp : T=>U) = { 68 | val map = Map[U,ArrayBuffer[T]](); 69 | for( t <- elements) { 70 | map.getOrElseUpdate(grp(t),new ArrayBuffer[T]) += t; 71 | } 72 | fakeDistributedIterable(map.asInstanceOf[Map[U,Iterable[T]]].toList); 73 | } 74 | override def distinct() = (Set() ++ elements).toSeq 75 | 76 | override def force() = try { 77 | it.asInstanceOf[Iterable.Projection[T]].force 78 | } catch { 79 | case e : ClassCastException => it 80 | } 81 | 82 | def elements = it.elements; 83 | } 84 | 85 | private def shardRange (r : scala.Range, numShards : Int) : List[Iterable[Int]]= { 86 | val arrs = new ArrayBuffer[Range] 87 | val n = numShards; 88 | arrs ++= (for(val i<- 0 until n) yield new Range(r.start + i * r.step,r.end,n * r.step)); 89 | arrs.toList 90 | } 91 | 92 | private def shardIRange (r : scala.Range.Inclusive, numShards : Int) : List[Iterable[Int]]= { 93 | val arrs = new ArrayBuffer[Range.Inclusive] 94 | val n = numShards; 95 | arrs ++= (for(val i<- 0 until n) yield new Range.Inclusive(r.start + i * r.step ,r.end,n * r.step)); 96 | arrs.toList 97 | } 98 | 99 | /** 100 | * Borrowed from scala source. Just add the annotation tag. 101 | * Part of the Scala API. 102 | * original author: Stephane Micheloud 103 | */ 104 | @serializable 105 | private[Defaults] class Range(val start: Int, val end: Int, val step: Int) extends RandomAccessSeq.Projection[Int] { 106 | if (step == 0) throw new Predef.IllegalArgumentException 107 | 108 | /** Create a new range with the start and end values of this range and 109 | * a new step.
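// Behavior sketch for the implicit shard above (illustrative): Ranges split
// into strided sub-ranges without copying; other Seqs into contiguous chunks,
// the last chunk keeping any remainder; other Iterables are dealt round-robin.
// shard(1 to 10, 3)         // => List(1,4,7,10), List(2,5,8), List(3,6,9)
// shard(List(1,2,3,4,5), 2) // => List(1,2), List(3,4,5)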
110 | */ 111 | def by(step: Int): Range = new Range(start, end, step) 112 | 113 | lazy val length: Int = { 114 | if (start < end && this.step < 0) 0 115 | else if (start > end && this.step > 0) 0 116 | else { 117 | val base = if (start < end) end - start 118 | else start - end 119 | assert(base >= 0) 120 | val step = if (this.step < 0) -this.step else this.step 121 | assert(step >= 0) 122 | base / step + last(base, step) 123 | } 124 | } 125 | 126 | protected def last(base: Int, step: Int): Int = 127 | if (base % step != 0) 1 else 0 128 | 129 | def apply(idx: Int): Int = { 130 | if (idx < 0 || idx >= length) throw new Predef.IndexOutOfBoundsException 131 | start + (step * idx) 132 | } 133 | 134 | /** a Seq.contains, not a Iterator.contains! */ 135 | def contains(x: Int): Boolean = 136 | if (step > 0) 137 | x >= start && x < end && (((x - start) % step) == 0) 138 | else 139 | x <= start && x > end && (((x - end) % step) == 0) 140 | 141 | def inclusive = new Range.Inclusive(start,end,step) 142 | } 143 | 144 | private[Defaults] object Range { 145 | @serializable 146 | private[Defaults] class Inclusive(start: Int, end: Int, step: Int) extends Range(start, end, step) { 147 | override def apply(idx: Int): Int = super.apply(idx) 148 | override protected def last(base: Int, step: Int): Int = 1 149 | override def by(step: Int): Range = new Inclusive(start, end, step) 150 | } 151 | } 152 | 153 | } 154 | -------------------------------------------------------------------------------- /src/smr/hadoop/PathIterable.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2008, David Hall 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * 13 | * THIS SOFTWARE IS PROVIDED BY DAVID HALL ``AS IS'' AND ANY 14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | * DISCLAIMED. IN NO EVENT SHALL DAVID HALL BE LIABLE FOR ANY 17 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | */ 24 | package smr.hadoop; 25 | import smr._; 26 | import org.apache.hadoop.io._; 27 | import org.apache.hadoop.conf._; 28 | import org.apache.hadoop.fs._; 29 | import org.apache.hadoop.util._; 30 | import org.apache.hadoop.mapred._; 31 | import scala.reflect.Manifest; 32 | 33 | /** 34 | * Represents SequenceFiles of (Hadoop.DefaultKey,T) pairs on disk. 35 | * All operations are scheduled as MapReduces using Hadoop.runMapReduce. 36 | * The DefaultKey is inaccessible. 
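// Usage sketch for PathIterable (illustrative: assumes a Hadoop instance `h`
// and SequenceFile output already on disk; the path is made up):
import org.apache.hadoop.fs.Path;
val nums = h.load[Int](new Path("stage1/part-0-of-3"));
val total = nums.filter(_ % 2 == 0).map(_ + 1).reduce(_ + _);  // transforms fuse into one job
println(total);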
37 | */ 38 | class PathIterable[T](h: Hadoop, val paths: Array[Path])(implicit m: Manifest[T]) extends DistributedIterable[T] { 39 | import Magic._; 40 | def elements = { 41 | if(paths.length == 0) 42 | new Iterator[T] { 43 | def hasNext = false; 44 | def next = throw new IllegalArgumentException("No elements were found!") 45 | } 46 | else paths.map(loadIterator).reduceLeft(_++_) 47 | } 48 | 49 | def force = this; 50 | 51 | import Hadoop._; 52 | def reduce[B>:T](f: (B,B)=>B) : B = { 53 | implicit val b = m.asInstanceOf[Manifest[B]]; 54 | implicit val klass = inputFormatClass.asInstanceOf[Class[InputFormat[Any,B]]]; 55 | val output = h.runMapReduce(paths, new CollectorMapper(identity[Iterator[B]]), new RealReduce(f), Set(ReduceCombine)); 56 | val path = output(0); 57 | 58 | val result = new SequenceFile.Reader(path.getFileSystem(h.conf),path,h.conf); 59 | val v = result.getValueClass.asSubclass(classOf[Writable]).newInstance(); 60 | val k = result.getKeyClass.asSubclass(classOf[Writable]).newInstance(); 61 | result.next(k,v); 62 | result.close(); 63 | Magic.wireToReal(v).asInstanceOf[B]; 64 | } 65 | 66 | /** 67 | * Equivalent to Set() ++ it.elements, but distributed. 68 | */ 69 | def distinct() = { 70 | implicit val klass = inputFormatClass.asInstanceOf[Class[InputFormat[DefaultKey,T]]]; 71 | val output = h.runMapReduce(paths,new SwapMapper[DefaultKey,T],new KeyToValReduce[T,DefaultKey]); 72 | new PathIterable(h,output); 73 | } 74 | 75 | /** 76 | * Lazy 77 | */ 78 | override def map[U](f : T=>U)(implicit m : Manifest[U]): DistributedIterable[U] = new ProjectedIterable[U](Util.itMap(f)); 79 | /** 80 | * Lazy 81 | */ 82 | override def flatMap[U](f : T=>Iterable[U]) (implicit m : Manifest[U]): DistributedIterable[U] = new ProjectedIterable[U](Util.itFlatMap(f)); 83 | /** 84 | * Lazy 85 | */ 86 | override def filter(f : T=>Boolean) : DistributedIterable[T] = new ProjectedIterable(Util.itFilter[T](f)); 87 | 88 | // Begin protected definitions 89 | /** 90 | * Loads the given path and returns and iterator that can read off objects. Defaults to SequenceFile's. 91 | */ 92 | protected def loadIterator(p : Path): Iterator[T] = { 93 | val rdr = new SequenceFile.Reader(p.getFileSystem(h.conf),p,h.conf); 94 | val keyType = rdr.getKeyClass().asSubclass(classOf[Writable]); 95 | val valType = rdr.getValueClass().asSubclass(classOf[Writable]); 96 | Util.iteratorFromProducer {() => 97 | val k = keyType.newInstance(); 98 | val v = valType.newInstance(); 99 | if(rdr.next(k,v)) { 100 | Some(wireToReal(v).asInstanceOf[T]); 101 | } else { 102 | rdr.close(); 103 | None; 104 | } 105 | } 106 | } 107 | 108 | /** 109 | * Returns the InputFormat needed to read a file 110 | */ 111 | protected implicit def inputFormatClass : Class[C] forSome{ type C <: InputFormat[_,_]} = { 112 | classOf[SequenceFileInputFormat[_,_]] 113 | } 114 | 115 | /** 116 | * Represents a transformation on the data. 117 | * Caches transform when "force" or "elements" is called. 118 | */ 119 | private class ProjectedIterable[U](transform:Iterator[T]=>Iterator[U])(implicit mU: Manifest[U]) extends DistributedIterable[U] { 120 | def elements = force.elements; 121 | 122 | // TODO: better to slow down one machine than repeat unnecessary work on the cluster? 123 | // seems reasonable. 
124 | def force(): DistributedIterable[U] = synchronized { 125 | cache match { 126 | case Some(output)=> (new PathIterable(h,output)(mU)) 127 | case None => 128 | val output = h.runMapReduce(paths, 129 | new TransformMapper(transform), 130 | new IdentityReduce[DefaultKey,U]()); 131 | cache = Some(output); 132 | (new PathIterable(h,output)(mU)) 133 | } 134 | } 135 | 136 | /// So we don't repeat a computation unnecessarily 137 | private var _cache : Option[Array[Path]] = None; 138 | 139 | // must be synchronized 140 | private def cache = synchronized { _cache }; 141 | private def cache_=(c : Option[Array[Path]]) = synchronized { _cache = c }; 142 | 143 | override def map[V](f : U=>V)(implicit m: Manifest[V]): DistributedIterable[V] = cache match { 144 | case Some(path) => new PathIterable[U](h,path).map(f); 145 | case None => new ProjectedIterable[V](Util.andThen(transform, Util.itMap(f))); 146 | } 147 | 148 | override def flatMap[V](f : U=>Iterable[V])(implicit m: Manifest[V]) : DistributedIterable[V] = cache match { 149 | case Some(path) => new PathIterable[U](h,path).flatMap(f); 150 | case _ => new ProjectedIterable[V](Util.andThen(transform,Util.itFlatMap(f))); 151 | } 152 | 153 | override def filter(f : U=>Boolean) : DistributedIterable[U] = cache match { 154 | case Some(path) => new PathIterable[U](h,path).filter(f); 155 | case None => new ProjectedIterable[U](Util.andThen(transform,Util.itFilter(f))); 156 | } 157 | 158 | def distinct() = cache match { 159 | case Some(path) => new PathIterable[U](h,path).distinct(); 160 | case None => 161 | val output = h.runMapReduce(paths, 162 | new TransformValMapper[DefaultKey,T,U](transform), 163 | new KeyToValReduce[U,DefaultKey]); 164 | new PathIterable(h,output); 165 | } 166 | 167 | def reduce[B>:U](f: (B,B)=>B) : B = cache match { 168 | case Some(path) => new PathIterable[U](h,path).reduce(f); 169 | case None => 170 | implicit val b = m.asInstanceOf[Manifest[B]]; 171 | val output = h.runMapReduce(paths, 172 | new CollectorMapper(transform), 173 | new RealReduce(f), 174 | Set(ReduceCombine)); 175 | val path = output(0); 176 | 177 | val result = new SequenceFile.Reader(path.getFileSystem(h.conf),path,h.conf); 178 | val v = result.getValueClass.asSubclass(classOf[Writable]).newInstance(); 179 | val k = result.getKeyClass.asSubclass(classOf[Writable]).newInstance(); 180 | result.next(k,v); 181 | result.close(); 182 | wireToReal(v).asInstanceOf[B]; 183 | } 184 | } 185 | 186 | } 187 | 188 | 189 | -------------------------------------------------------------------------------- /src/smr/actors/Worker.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2008, David Hall 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * 13 | * THIS SOFTWARE IS PROVIDED BY DAVID HALL ``AS IS'' AND ANY 14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | * DISCLAIMED.
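// Behavior sketch for the caching above (illustrative, reusing `nums` from the
// earlier sketch):
// val a = nums.map(_ + 1).filter(_ > 10);  // still lazy: no job has run
// val b = a.force();                       // exactly one MapReduce runs here
// b.elements foreach println;              // re-reads the cached output files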
IN NO EVENT SHALL DAVID HALL BE LIABLE FOR ANY 17 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | */ 24 | package smr.actors; 25 | import scala.actors.Actor; 26 | import scala.actors.Actor._; 27 | import scala.actors.Exit; 28 | import scala.collection.mutable.ArrayBuffer; 29 | import scala.collection._; 30 | import TransActor._; 31 | import scala.actors.remote.RemoteActor._; 32 | import scala.actors.remote.Node; 33 | 34 | import Distributor._; 35 | import Priv._; 36 | 37 | class Worker(port : Int, sym : Symbol) extends Actor { 38 | import Worker._; 39 | 40 | def this() = this(Util.freePort,'worker); 41 | 42 | start(); 43 | 44 | def act() { 45 | alive(port); 46 | register(sym,Actor.self); 47 | 48 | trapExit = true; 49 | 50 | val actual_worker = new RealWorker(); 51 | val accumulators = mutable.Map[JobID,Accumulator](); 52 | def getAcc(id : JobID) ={ 53 | accumulators.getOrElseUpdate(id,new Accumulator(id)); 54 | } 55 | 56 | loop { 57 | react { 58 | case msg @ Do(in,f,out) => 59 | //Debug.info(msg + ""); 60 | val outAcc = getAcc(out); 61 | getAcc(in).forwardShardNums(outAcc); 62 | getAcc(in).addShardListener { case (shard,data) => 63 | actual_worker.enqueue { x:Unit => 64 | val outData = f(data); 65 | outAcc.completeShard(shard,outData); 66 | } 67 | } 68 | case InPlaceDo(in,f) => 69 | getAcc(in).addShardListener { case (s,data) => 70 | actual_worker enqueue { x:Unit => 71 | f(data); 72 | } 73 | } 74 | case GetOutputActor(isLocal, out, shard, retr) => 75 | def getOutputActor[U,V](retr : Iterator[U]=>V) { 76 | val actorIterator = new Util.ActorIterator[U]; 77 | val a = Actor.actor { 78 | getAcc(out).completeShard(shard,retr(actorIterator)); 79 | } 80 | getAcc(out).reserveShard(shard); 81 | val actor = transActor(port,Symbol(":output-" + out + "-"+shard)) { 82 | Actor.loop { 83 | Actor.react { 84 | case msg@ Some(x) => actorIterator.receiver ! msg; 85 | case None => actorIterator.receiver ! None; exit(); 86 | } 87 | } 88 | } 89 | if(isLocal) { 90 | reply { (Some(actor),TransActorToSerializedActor(actor))} 91 | } else { 92 | reply { (None,TransActorToSerializedActor(actor))} 93 | } 94 | } 95 | getOutputActor(retr); 96 | case Done(id,s,r)=>getAcc(id).completeShard(s,r); 97 | case Reserve(id,shard) => getAcc(id).reserveShard(shard); 98 | case DoneAdding(id) => getAcc(id).doneReserving(); 99 | case rtr @ Retrieve(id,f,out,a) => 100 | val realActor = a match { 101 | case Right(a) => SerializedActorToActor(a); 102 | case Left(a) => a 103 | } 104 | //Debug.info(rtr + ""); 105 | // Push it off to the accumulator, have it forward things to the job runner 106 | getAcc(id).addShardListener{ case (shard,data) => 107 | actual_worker.enqueue { x :Unit => 108 | realActor ! 
Retrieved(out,shard,f(data)); 109 | } 110 | } 111 | case Close=> 112 | Debug.info("Worker " + self + " shutting down"); 113 | actual_worker.close(); 114 | accumulators.values.foreach(_.close()); 115 | exit(); 116 | case Remove(id) => 117 | val a = accumulators.get(id); 118 | accumulators -= id; 119 | Debug.info("Worker " + self + " removing job " + id); 120 | a.foreach( _.close()); 121 | case x => 122 | Debug.error( "Wrong input to worker! " + x); 123 | } 124 | } 125 | } 126 | 127 | } 128 | 129 | object Worker { 130 | def apply() = new Worker(); 131 | def apply(port : Int, sym : Symbol) = new Worker(port,sym); 132 | 133 | /* 134 | def setClassLoaderFromClass(c : Class[_]) { 135 | scala.actors.remote.RemoteActor.classLoader = classLoaderToUse 136 | classLoaderToUse = c.getClassLoader(); 137 | } 138 | 139 | private var classLoaderToUse = this.getClass.getClassLoader(); 140 | */ 141 | private class Accumulator(id : JobID) { 142 | private case class Forward(out : Accumulator); 143 | private case class Add(shard : Int); 144 | private case class Retr(f : ((Int,Any))=>Unit); 145 | 146 | def forwardShardNums(out : Accumulator) = inner ! Forward(out); 147 | def completeShard(shard : Int, data : Any) = inner ! Done(id,shard,data); 148 | def addShardListener(f : ((Int,Any))=>Unit) = inner ! Retr(f); 149 | def reserveShard(shard : Int) = inner ! Add(shard); 150 | def doneReserving() = inner ! DoneAdding(0); 151 | def close() = inner ! Close 152 | 153 | private val inner : Actor =actor { 154 | val active = mutable.Set[Int](); 155 | val done = mutable.Map[Int,Any](); 156 | val awaiting = new ArrayBuffer[((Int,Any))=>Unit](); 157 | val waitingForDoneReservation = new ArrayBuffer[Unit=>Unit](); 158 | var doneAdding = false; 159 | var shouldExit = false; 160 | 161 | def checkFinished() { 162 | if(doneAdding && active.size == 0) { 163 | awaiting.foreach{f => done.foreach(f)} 164 | awaiting.clear(); 165 | if(shouldExit) exit(); 166 | } 167 | } 168 | 169 | loop { 170 | react { 171 | case Add(s) => 172 | if(doneAdding) Debug.warning("Got a late add"); 173 | if(!done.contains(s)) active += s 174 | case Close => 175 | if(awaiting.isEmpty) { 176 | done.clear(); 177 | waitingForDoneReservation.clear(); 178 | exit(); 179 | } 180 | shouldExit = true; 181 | case Forward(out) => 182 | val f = { x: Unit => 183 | active.foreach { sh => out.reserveShard(sh)} 184 | done.keys.foreach { sh => out.reserveShard(sh)} 185 | out.doneReserving(); 186 | } 187 | if(doneAdding) f(); 188 | else waitingForDoneReservation += f; 189 | 190 | case Retr(f) => 191 | if(doneAdding && active.size==0) { 192 | done.foreach(f); 193 | } else { 194 | awaiting += f; 195 | } 196 | case msg @ DoneAdding(dbg) => 197 | doneAdding = true; 198 | waitingForDoneReservation foreach { f => f()} 199 | waitingForDoneReservation.clear(); 200 | checkFinished(); 201 | 202 | case Done(i,s,r) => 203 | active -= s; 204 | done += (s->r); 205 | checkFinished(); 206 | case x => Debug.error( "Wrong input to accumulator!" + x); 207 | } 208 | } 209 | } 210 | } 211 | 212 | private class RealWorker { 213 | private case class Enqueue(f : Unit=>Unit); 214 | def enqueue(f : Unit=>Unit) = inner ! Enqueue(f); 215 | def close() = inner ! Exit(Actor.self,'closed); 216 | 217 | private val inner = actor { 218 | loop { 219 | react { 220 | case Exit(_,_) => exit(); 221 | case Enqueue(f) => try { 222 | f(); 223 | } catch { 224 | case x => 225 | // todo: better error reporting 226 | x.printStackTrace(); 227 | } 228 | case x => Debug.error( "Wrong input to realWorker!" 
+ x); 229 | } 230 | } 231 | } 232 | } 233 | } 234 | -------------------------------------------------------------------------------- /src/smr/hadoop/Hadoop.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2008, David Hall 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * 13 | * THIS SOFTWARE IS PROVIDED BY DAVID HALL ``AS IS'' AND ANY 14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | * DISCLAIMED. IN NO EVENT SHALL DAVID HALL BE LIABLE FOR ANY 17 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | */ 24 | package smr.hadoop; 25 | import smr._; 26 | 27 | import java.io._; 28 | 29 | import org.apache.hadoop.io._; 30 | import org.apache.hadoop.conf._; 31 | import org.apache.hadoop.fs._; 32 | import org.apache.hadoop.util._; 33 | import org.apache.hadoop.mapred._; 34 | import org.apache.hadoop.filecache._; 35 | 36 | import scala.reflect.Manifest; 37 | 38 | 39 | /** 40 | * Supports Hadoop operations. 
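// Usage sketch tying the actor layer above together (illustrative: assumes a
// hub already running on hubhost:9000 with workers registered via SpawnHub
// and SpawnWorker; the host name is made up):
import smr.actors._;
import smr.Defaults._;   // implicit shard function
val dist = Hub.distributor("hubhost", 9000);
val evens = dist.distribute(List(1,2,3,4,5,6)).filter(_ % 2 == 0);
println(evens.reduce(_ + _));
dist.close();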
41 | * @see Hadoop$ 42 | */ 43 | class Hadoop(val conf : Configuration, userJar :String, private[hadoop] val dirGenerator : (String)=>Path) { 44 | // enable path conversions, and other goodies 45 | implicit private val cf = conf; 46 | import Implicits._; 47 | import Hadoop._; 48 | 49 | /** 50 | * Constructs a Hadoop instance with the given configuration and working directory (for files) 51 | */ 52 | def this(conf : Configuration, userJar : String, workDir : Path) = this(conf,userJar, {(pref:String) => 53 | new Path(workDir,pref); 54 | }); 55 | 56 | private[smr] val cacheDir = dirGenerator("tmp/cache"); 57 | 58 | conf.set("smr.cache.dir",cacheDir.toString); 59 | // cacheDir.mkdirs(); 60 | if(!conf.getBoolean(CONFIG_KEEP_FILES,false)) 61 | dirGenerator("tmp").deleteOnExit(); 62 | 63 | def load[T](p : Array[Path])(implicit m : Manifest[T])= new PathIterable[T](this,p); 64 | def load[T](p : Path)(implicit m : Manifest[T]):PathIterable[T]= load[T](Array(p)); 65 | 66 | def loadPairs[K,V](p : Path*)(implicit mK : Manifest[K], mV: Manifest[V]) = { 67 | new PathPairs[K,V](this,p.toArray); 68 | } 69 | 70 | def loadPairs[K,V](p : Seq[Path]) = new PathPairs(this,p.toArray); 71 | def loadLines(p : Path*) = new PathPairs[Long,String](this,p.toArray) with Lines; 72 | 73 | import Magic._; 74 | 75 | def distributePairs[K,V](ibl: Iterable[(K,V)], numShards : Int)(implicit mK:Manifest[K], mV:Manifest[V]) = { 76 | val paths = pathGenerator(numShards); 77 | val elems = ibl.elements.map{ case(k,v) => (realToWire(k),realToWire(v))} 78 | 79 | if(!elems.hasNext) 80 | throw new IllegalArgumentException("Empty iterable"); 81 | val first = elems.next; 82 | 83 | val writers = 84 | for(p <- paths; 85 | fs = p.getFileSystem(conf); 86 | wrtr = new SequenceFile.Writer(fs,conf,p,first._1.getClass,first._2.getClass)) 87 | yield wrtr; 88 | var i = 0; 89 | writers(i%numShards).append(first._1,first._2); 90 | while(elems.hasNext) { 91 | i+=1; 92 | val nxt = elems.next(); 93 | writers(i%numShards).append(nxt._1,nxt._2); 94 | } 95 | writers.foreach{_.close()}; 96 | loadPairs[K,V](paths).asInstanceOf[PathPairs[K,V]]; 97 | } 98 | 99 | def distribute[T](ibl : Iterable[T], numShards :Int)(implicit m : Manifest[T]) :PathIterable[T] = { 100 | val paths = pathGenerator(numShards); 101 | 102 | val elems = ibl.elements.map(Magic.realToWire); 103 | 104 | if(!elems.hasNext) 105 | throw new IllegalArgumentException("Empty iterable"); 106 | val first = elems.next; 107 | 108 | val writers = 109 | for(p <- paths; 110 | fs = p.getFileSystem(conf); 111 | wrtr = new SequenceFile.Writer(fs,conf,p,classOf[Hadoop.DefaultKeyWritable],first.getClass)) 112 | yield wrtr; 113 | var i = 0; 114 | writers(i%numShards).append(Magic.realToWire(mkDefaultKey(first)),first); 115 | while(elems.hasNext) { 116 | i+=1; 117 | val nxt = elems.next(); 118 | writers(i%numShards).append(Magic.realToWire(mkDefaultKey(nxt)),nxt); 119 | } 120 | writers.foreach{_.close()}; 121 | load[T](paths); 122 | } 123 | 124 | private def serializeClass(jobConf : JobConf, name : String, c : AnyRef) = { 125 | implicit val jc = jobConf; 126 | val path = new Path(cacheDir,name); 127 | val stream = new ObjectOutputStream(path.getFileSystem(jc).create(path)); 128 | stream.writeObject(c); 129 | stream.close(); 130 | DistributedCache.addCacheFile(path.toUri,jobConf); 131 | path; 132 | } 133 | 134 | private[hadoop] def runMapReduce[K1,V1,K2,V2,K3,V3](paths : Array[Path], 135 | m: Mapper[K1,V1,K2,V2], 136 | r: Reduce[K2,V2,K3,V3]) 137 | (implicit mk2:Manifest[K2], mv2:Manifest[V2], 138 | 
mk3:Manifest[K3], mv3:Manifest[V3], 139 | inputFormat : Class[T] forSome {type T <: InputFormat[_,_]}) : Array[Path]= { 140 | runMapReduce(paths,m,r,Set()); 141 | } 142 | 143 | private[hadoop] def runMapReduce[K1,V1,K2,V2,K3,V3](paths : Array[Path], 144 | m: Mapper[K1,V1,K2,V2], 145 | r: Reduce[K2,V2,K3,V3], 146 | options : Set[Hadoop.Options]) 147 | (implicit mk2:Manifest[K2], mv2:Manifest[V2], 148 | mk3:Manifest[K3], mv3:Manifest[V3], 149 | inputFormat : Class[T] forSome {type T <: InputFormat[_, _]}) = { 150 | implicit val jobConf = new JobConf(conf); 151 | jobConf.setJar(userJar); 152 | var outputOption : Option[Path] = None; 153 | options foreach { 154 | case ReduceCombine => jobConf.setCombinerClass(classOf[ReduceWrapper[_,_,_,_]]); 155 | case OutputDir(dir) => outputOption = Some(dirGenerator(dir)); 156 | case x => throw new IllegalArgumentException("Illegal MapReduce Option: " + x); 157 | } 158 | 159 | val outputPath = outputOption.getOrElse(genDir); 160 | jobConf.setJobName("SMR-"+outputPath.getName); 161 | jobConf.setInputFormat(inputFormat); 162 | jobConf.setOutputFormat(classOf[SequenceFileOutputFormat[_,_]]); 163 | 164 | val mPath = serializeClass(jobConf,outputPath.getName+"-Map.ser",m); 165 | val rPath = serializeClass(jobConf,outputPath.getName+"-Reduce.ser",r); 166 | jobConf.set("smr.job.mapper.file",mPath.toString); 167 | jobConf.set("smr.job.reducer.file",rPath.toString); 168 | 169 | jobConf.setMapRunnerClass(classOf[ClosureMapper[_,_,_,_]]); 170 | jobConf.setReducerClass(classOf[ReduceWrapper[_,_,_,_]]); 171 | jobConf.setNumReduceTasks(conf.getInt("smr.reduce.tasks.default",paths.length)); 172 | 173 | 174 | jobConf.setMapOutputKeyClass(Magic.classToWritableClass(mk2.erasure)); 175 | jobConf.setMapOutputValueClass(Magic.classToWritableClass(mv2.erasure)); 176 | jobConf.setOutputKeyClass(Magic.classToWritableClass(mk3.erasure)); 177 | jobConf.setOutputValueClass(Magic.classToWritableClass(mv3.erasure)); 178 | 179 | FileInputFormat.setInputPaths(jobConf, paths:_*); 180 | FileOutputFormat.setOutputPath(jobConf,outputPath); 181 | 182 | JobClient.runJob(jobConf); 183 | 184 | outputPath.listFiles(); 185 | } 186 | 187 | private var jobNum = 0; 188 | protected def nextName = synchronized { 189 | jobNum+=1; 190 | "job"+jobNum; 191 | } 192 | 193 | private def genDir() = { 194 | dirGenerator("tmp/"+nextName); 195 | } 196 | 197 | private def pathGenerator(numShards : Int) = { 198 | val dir = genDir(); 199 | // dir.mkdirs(); 200 | 201 | Array.fromFunction { i => 202 | new Path(dir,"part-"+i+"-of-"+numShards); 203 | } (numShards); 204 | } 205 | } 206 | 207 | object Hadoop { 208 | /** 209 | * Create a {@link Hadoop} instance from command line args and a working directory. 210 | */ 211 | def apply(args : Array[String], userJar : String, workDir : Path) = fromArgs(args, userJar, workDir)._1; 212 | 213 | /** 214 | * Create a {@link Hadoop} instance from command line args and a working directory. 
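// Usage sketch for the constructors described here (illustrative: assumes
// args : Array[String] from main; the jar name and working directory are made
// up, and the user jar must carry the closures shipped to the cluster):
val h = Hadoop(args, "myapp.jar", new Path("work"));
val lens = h.distribute(List("alpha","beta","gamma"), 2).map(_.length);
println(lens.reduce(_ max _));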
215 | * @return hadoop instance and remaining args 216 | */ 217 | def fromArgs(args: Array[String], userJar : String, workDir : Path) = { 218 | var restArgs : Array[String] = null; 219 | var conf : Configuration = null; 220 | val tool = new Configured with Tool { 221 | @throws(classOf[Exception]) 222 | def run(args : Array[String]) : Int = { 223 | restArgs = args; 224 | conf = getConf(); 225 | 0; 226 | } 227 | } 228 | ToolRunner.run(tool,args); 229 | (new Hadoop(conf,userJar, workDir),restArgs); 230 | } 231 | 232 | private[hadoop] sealed case class Options; 233 | case object ReduceCombine extends Options; 234 | case class OutputDir(s : String) extends Options; 235 | 236 | private[hadoop] type DefaultKeyWritable = IntWritable; 237 | private[hadoop] type DefaultKey= Int; 238 | private[hadoop] def mkDefaultKey() : DefaultKey= 0.asInstanceOf[DefaultKey]; 239 | private[hadoop] def mkDefaultKey[V](v: V): DefaultKey = v.asInstanceOf[AnyRef].hashCode(); 240 | 241 | val CONFIG_KEEP_FILES = "smr.files.keep"; 242 | 243 | private def copyFile(inFile : Path, outFile : Path)(implicit conf : Configuration) { 244 | val fs = inFile.getFileSystem(conf); 245 | // Read from and write to new file 246 | val in = fs.open(inFile); 247 | val out = fs.create(outFile); 248 | val COPY_BUFFER_SIZE = 4096; 249 | val buffer = new Array[Byte](COPY_BUFFER_SIZE); 250 | try { 251 | var bytesRead = in.read(buffer); 252 | while (bytesRead > 0) { 253 | out.write(buffer, 0, bytesRead); 254 | bytesRead = in.read(buffer); 255 | } 256 | } finally { 257 | in.close(); 258 | out.close(); 259 | } 260 | } 261 | } 262 | 263 | 264 | -------------------------------------------------------------------------------- /src/smr/hadoop/PathPairs.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2008, David Hall 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * 13 | * THIS SOFTWARE IS PROVIDED BY DAVID HALL ``AS IS'' AND ANY 14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | * DISCLAIMED. IN NO EVENT SHALL DAVID HALL BE LIABLE FOR ANY 17 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23 | */ 24 | package smr.hadoop; 25 | import smr._; 26 | import org.apache.hadoop.io._; 27 | import org.apache.hadoop.conf._; 28 | import org.apache.hadoop.fs._; 29 | import org.apache.hadoop.util._; 30 | import org.apache.hadoop.mapred._; 31 | import scala.reflect.Manifest; 32 | 33 | import Magic._; 34 | import Hadoop._; 35 | 36 | abstract class AbstractPairs[K,V](val h: Hadoop)(implicit mK: Manifest[K], mV:Manifest[V]) extends DistributedPairs[K,V] with FileFormat[K,V] { self => 37 | protected[hadoop] def paths : Array[Path]; 38 | 39 | def elements = { 40 | if(paths.length == 0) 41 | new Iterator[(K,V)] { 42 | def hasNext = false; 43 | def next = throw new IllegalArgumentException("No elements were found!") 44 | } 45 | else paths.map(loadIterator).reduceLeft(_++_); 46 | } 47 | 48 | def force = new PathPairs[K,V](h,paths); 49 | 50 | /** 51 | * Models MapReduce/Hadoop-style reduce more exactly. 52 | */ 53 | def flatReduce[K2,V2](f : (K,Iterator[V])=>Iterator[(K2,V2)])(implicit m : Manifest[K2], mU:Manifest[V2]): DistributedPairs[K2,V2] = { 54 | new MapReducePairs(h, self.paths, new PairTransformMapper(identity[Iterator[(K,V)]]), new FlatReduce(f), inputFormatClass); 55 | } 56 | 57 | /** 58 | * Models MapReduce/Hadoop-style reduce more exactly. 59 | */ 60 | def reduce[K2,V2](f: (K,Iterator[V])=>(K2,V2))(implicit mL: Manifest[K2], mW:Manifest[V2]): DistributedPairs[K2,V2] = { 61 | new MapReducePairs(h, self.paths, new PairTransformMapper(identity[Iterator[(K,V)]]), new PairReduce(f), inputFormatClass); 62 | } 63 | 64 | /** 65 | * Lazy 66 | */ 67 | override def map[K2,V2](f : ((K,V))=>(K2,V2))(implicit mJ : Manifest[K2], mU : Manifest[V2]): DistributedPairs[K2,V2] = { 68 | new ProjectedPairs[K,V,K2,V2](this,Util.itMap(f)); 69 | } 70 | 71 | /** 72 | * Lazy 73 | */ 74 | override def flatMap[K2,V2](f : ((K,V))=>Iterable[(K2,V2)])(implicit mJ : Manifest[K2], mU : Manifest[V2]): DistributedPairs[K2,V2] = { 75 | new ProjectedPairs[K,V,K2,V2](this,Util.itFlatMap(f)); 76 | } 77 | 78 | /** 79 | * Lazy 80 | */ 81 | override def filter(f : ((K,V))=>Boolean) : DistributedPairs[K,V] = new ProjectedPairs[K,V,K,V](this,Util.itFilter[(K,V)](f)); 82 | 83 | /** 84 | * Lazy 85 | */ 86 | override def mapFirst[K2](f : K=>K2)(implicit mJ: Manifest[K2]) : DistributedPairs[K2,V] = { 87 | new ProjectedPairs[K,V,K2,V](this,Util.itMap { case (k,v) => (f(k),v)}); 88 | } 89 | 90 | /** 91 | * Lazy 92 | */ 93 | override def mapSecond[V2](f : V=>V2)(implicit mJ: Manifest[V2]) : DistributedPairs[K,V2] = { 94 | new ProjectedPairs[K,V,K,V2](this,Util.itMap{ case (k,v) => (k,f(v))}); 95 | } 96 | 97 | // Begin protected definitions 98 | /** 99 | * Loads the given path and returns and iterator that can read off objects. Defaults to SequenceFile's. 
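// Usage sketch for the pair-style reduces above (illustrative: assumes
// pairs : DistributedPairs[String,Int]):
val sums = pairs.reduce { (k, vs) => (k, vs.reduceLeft(_ + _)) };   // one output pair per key
val squares = pairs.flatReduce { (k, vs) =>
  vs.map(v => (k, v * v));   // zero or more output pairs per key
}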
100 | */ 101 | override protected[hadoop] def loadIterator(p : Path): Iterator[(K,V)] = { 102 | val rdr = new SequenceFile.Reader(p.getFileSystem(h.conf),p,h.conf); 103 | val keyType = rdr.getKeyClass().asSubclass(classOf[Writable]); 104 | val valType = rdr.getValueClass().asSubclass(classOf[Writable]); 105 | Util.iteratorFromProducer {() => 106 | val k = keyType.newInstance(); 107 | val v = valType.newInstance(); 108 | if(rdr.next(k,v)) { 109 | Some((wireToReal(k).asInstanceOf[K],wireToReal(v).asInstanceOf[V])); 110 | } else { 111 | rdr.close(); 112 | None; 113 | } 114 | } 115 | } 116 | 117 | /** 118 | * Returns the InputFormat needed to read a file 119 | */ 120 | override protected[hadoop] implicit def inputFormatClass : Class[T] forSome{ type T <: InputFormat[_,_]} = { 121 | classOf[SequenceFileInputFormat[_,_]].asInstanceOf[Class[InputFormat[_,_]]]; 122 | } 123 | 124 | /** 125 | * Joins two PathPairs together. 126 | */ 127 | def ++[SK>:K,SV>:V](other : DistributedPairs[SK,SV])(implicit mSK:Manifest[SK], mSV:Manifest[SV]) = other match { 128 | case aOther : AbstractPairs[_,_] => new AbstractPairs[SK,SV](h) { 129 | protected[hadoop] override def paths = self.paths ++ aOther.paths; 130 | protected[hadoop] override implicit val inputFormatClass : Class[T] forSome{ type T <: InputFormat[_,_]} = { 131 | self.inputFormatClass; 132 | } 133 | 134 | def asStage(name:String) : PathPairs[SK,SV] = new PathPairs[SK,SV](h,paths).asStage(name); 135 | } 136 | case _ => throw new IllegalArgumentException("++ only valid for PathPairs and cousins"); 137 | } 138 | 139 | 140 | } 141 | 142 | /** 143 | * Represents pairs that have will be mapped and reduced. A complete cycle. 144 | */ 145 | // TODO: tighter integration between paths and asStage 146 | private class MapReducePairs[K1,V1,K2,V2,K3,V3](h : Hadoop, 147 | input: =>Array[Path], 148 | m : Mapper[K1,V1,K2,V2], 149 | r : Reduce[K2,V2,K3,V3], 150 | val inputFormat : Class[T] forSome {type T <: InputFormat[_,_]}) 151 | (implicit mk1 : Manifest[K1], mk2 : Manifest[K2], mk3:Manifest[K3], 152 | mv1:Manifest[V1], mv2:Manifest[V2], mv3 : Manifest[V3]) extends AbstractPairs[K3,V3](h) { 153 | 154 | import Implicits._; 155 | private implicit val conf = h.conf; 156 | 157 | // a little ugly. 158 | private var pathsRun = false; 159 | override lazy val paths = { 160 | synchronized {pathsRun = true; } 161 | h.runMapReduce(input, m,r); 162 | } 163 | 164 | override def asStage(dir : String) : DistributedPairs[K3,V3] = { 165 | val outDir = h.dirGenerator(dir); 166 | if(outDir.exists) { 167 | new PathPairs(h,outDir.listFiles); 168 | } else synchronized { 169 | if(pathsRun) { 170 | new PathPairs[K3,V3](h,paths).asStage(dir); 171 | } else { 172 | val outFiles = h.runMapReduce(input, m,r, Set(OutputDir(dir))); 173 | (new PathPairs[K3,V3](h,outFiles)) 174 | } 175 | } 176 | } 177 | 178 | override implicit def inputFormatClass : Class[_ <: InputFormat[_,_]] = inputFormat; 179 | } 180 | 181 | /** 182 | * Represents a set of Paths on disk. 183 | */ 184 | class PathPairs[K,V](h: Hadoop, val paths : Array[Path], keepFiles :Boolean)(implicit mK: Manifest[K], mV:Manifest[V]) extends AbstractPairs[K,V](h) { 185 | import Implicits._; 186 | 187 | def this(h: Hadoop, paths: Array[Path])(implicit mk:Manifest[K], mv:Manifest[V]) = this(h,paths,true); 188 | 189 | implicit val conf = h.conf; 190 | 191 | /** 192 | * Copies the files represented by the pathpairs to the stage directory. 
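// Usage sketch for asStage (illustrative, reusing the hypothetical `pairs`):
// checkpoint a pipeline stage under a stable name so a later run can reuse it
// instead of recomputing.
val doubled = pairs.mapSecond(_ * 2).asStage("doubled");
// A re-run that reaches asStage("doubled") finds the directory and skips the job.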
193 | */ 194 | def asStage(output: String) = { 195 | val outputDir = h.dirGenerator(output); 196 | outputDir.mkdirs(); 197 | val outPaths = for(p <- paths) yield new Path(outputDir,p.getName); 198 | for( (src,dst) <- paths.zip(outPaths)) { 199 | src.moveTo(dst); 200 | } 201 | new PathPairs[K,V](h,outPaths); 202 | } 203 | } 204 | 205 | /** 206 | * Used to override the default behavior of Lines 207 | */ 208 | trait FileFormat[K,V] { 209 | protected[hadoop] def loadIterator(p: Path): Iterator[(K,V)] 210 | protected[hadoop] def inputFormatClass : Class[T] forSome { type T <: InputFormat[_,_]} 211 | } 212 | 213 | /** 214 | * Used with PathPairs, reads files line by line. Key is the offset in bytes 215 | */ 216 | trait Lines extends FileFormat[Long,String]{ this : PathPairs[Long,String] => 217 | import Implicits._; 218 | override protected[hadoop] def loadIterator(p: Path) = { 219 | implicit val conf = h.conf; 220 | 221 | val rdr = new LineRecordReader(p.getFileSystem(h.conf).open(p),0,p.length); 222 | val k = new LongWritable; 223 | val v = new Text; 224 | Util.iteratorFromProducer { () => 225 | if(rdr.next(k,v)) { 226 | Some((k.get,v.toString)); 227 | } else { 228 | rdr.close; 229 | None; 230 | } 231 | } 232 | } 233 | 234 | override protected[hadoop] def inputFormatClass = { 235 | classOf[TextInputFormat].asInstanceOf[Class[InputFormat[_,_]]]; 236 | } 237 | } 238 | 239 | /** 240 | * Represents a transformation on the data. 241 | * Caches transform when "force" or "elements" is called. 242 | */ 243 | class ProjectedPairs[K,V,K2,V2](parent : AbstractPairs[K,V], transform:Iterator[(K,V)]=>Iterator[(K2,V2)])(implicit mK:Manifest[K], mV:Manifest[V], mJ:Manifest[K2], mU: Manifest[V2]) extends AbstractPairs[K2,V2](parent.h) { 244 | import Implicits._; 245 | override def elements = force.elements; 246 | 247 | override protected[hadoop] def paths = force.paths; 248 | 249 | // TODO: better to slow down one machine than repeat unnecessary work on the cluster? 250 | // seems reasonable. 
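// Usage sketch for the Lines format above (illustrative: assumes a Hadoop
// instance `h`; the file name is made up, and keys are byte offsets):
val lines = h.loadLines(new Path("data.txt"));
val longLines = lines filter { case (off,line) => line.length > 72 };
longLines.elements foreach println;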
251 | override def force() : PathPairs[K2,V2] = synchronized { 252 | cache match { 253 | case Some(output)=> (new PathPairs(h,output)) 254 | case None => 255 | val output = h.runMapReduce(parent.paths, 256 | new PairTransformMapper(transform), 257 | new IdentityReduce[K2,V2]()); 258 | cache = Some(output); 259 | (new PathPairs(h,output)) 260 | } 261 | } 262 | 263 | def asStage(output : String):DistributedPairs[K2,V2] = { 264 | implicit val conf = h.conf; 265 | val outDir = h.dirGenerator(output); 266 | if(outDir.exists) { 267 | cache = Some(outDir.listFiles); 268 | this; 269 | } else synchronized { 270 | cache match { 271 | case Some(o)=> new PathPairs[K2,V2](h,o).asStage(output); 272 | case None=> 273 | val outFiles = h.runMapReduce(parent.paths, 274 | new PairTransformMapper(transform), 275 | new IdentityReduce[K2,V2](), 276 | Set(OutputDir(output))); 277 | synthetic = false; 278 | cache = Some(outFiles); 279 | (new PathPairs[K2,V2](h,outFiles)) 280 | } 281 | } 282 | } 283 | 284 | /// So we don't repeat a computation unncessarily 285 | private var _cache : Option[Array[Path]] = None; 286 | 287 | private var synthetic = true; 288 | 289 | // must be synchronized 290 | private def cache = synchronized { _cache }; 291 | private def cache_=(c : Option[Array[Path]]) = synchronized { 292 | _cache = c; 293 | if(synthetic) { 294 | c match { 295 | case _ => 296 | } 297 | } 298 | } 299 | 300 | implicit val conf = h.conf; 301 | 302 | override def map[K3,V3](f : ((K2,V2))=>(K3,V3))(implicit mL: Manifest[K3], mW: Manifest[V3]): DistributedPairs[K3,V3] = cache match { 303 | case Some(path) => new PathPairs[K2,V2](h,path).map(f); 304 | case None => new ProjectedPairs[K,V,K3,V3](parent,Util.andThen(transform, Util.itMap(f))); 305 | } 306 | 307 | override def flatMap[K3,V3](f : ((K2,V2))=>Iterable[(K3,V3)])(implicit mL: Manifest[K3], mW: Manifest[V3]) : DistributedPairs[K3,V3] = cache match { 308 | case Some(path) => new PathPairs[K2,V2](h,path).flatMap(f); 309 | case _ => new ProjectedPairs[K,V,K3,V3](parent,Util.andThen(transform,Util.itFlatMap(f))); 310 | } 311 | 312 | override def filter(f : ((K2,V2))=>Boolean) : DistributedPairs[K2,V2] = cache match { 313 | case Some(path) => new PathPairs[K2,V2](h,path).filter(f); 314 | case None => new ProjectedPairs[K,V,K2,V2](parent,Util.andThen(transform,Util.itFilter(f))); 315 | } 316 | 317 | /** 318 | * Lazy 319 | */ 320 | override def mapFirst[K3](f : K2=>K3)(implicit mL: Manifest[K3]) : DistributedPairs[K3,V2] = { 321 | new ProjectedPairs(parent,Util.andThen(transform,Util.itMap[(K2,V2),(K3,V2)]{ case (k,v) => (f(k),v)})); 322 | } 323 | 324 | /** 325 | * Lazy 326 | */ 327 | override def mapSecond[V3](f : V2=>V3)(implicit mW: Manifest[V3]) : DistributedPairs[K2,V3] = { 328 | new ProjectedPairs(parent,Util.andThen(transform,Util.itMap[(K2,V2),(K2,V3)]{ case (k,v) => (k,f(v))})); 329 | } 330 | 331 | /** 332 | * Models MapReduce/Hadoop-style reduce more exactly. 333 | */ 334 | override def flatReduce[K3,V3](f : (K2,Iterator[V2])=>Iterator[(K3,V3)])(implicit mK3 : Manifest[K3], mV3:Manifest[V3]): DistributedPairs[K3,V3] = { 335 | new MapReducePairs(h, parent.paths, new PairTransformMapper(transform), new FlatReduce(f), inputFormatClass); 336 | } 337 | 338 | /** 339 | * Models MapReduce/Hadoop-style reduce more exactly. 
340 | */ 341 | override def reduce[K3,V3](f: (K2,Iterator[V2])=>(K3,V3))(implicit mL: Manifest[K3], mW:Manifest[V3]): DistributedPairs[K3,V3] = { 342 | new MapReducePairs(h, parent.paths, new PairTransformMapper(transform), new PairReduce(f), inputFormatClass); 343 | } 344 | 345 | override protected[hadoop] implicit def inputFormatClass : Class[T] forSome{ type T <: InputFormat[_,_]} = { 346 | parent.inputFormatClass; 347 | } 348 | } 349 | 350 | 351 | -------------------------------------------------------------------------------- /src/smr/actors/Distributor.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2008, David Hall 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * 13 | * THIS SOFTWARE IS PROVIDED BY DAVID HALL ``AS IS'' AND ANY 14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | * DISCLAIMED. IN NO EVENT SHALL DAVID HALL BE LIABLE FOR ANY 17 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | */ 24 | package smr.actors; 25 | import scala.actors.Actor; 26 | import scala.actors.OutputChannel; 27 | import scala.actors.Exit; 28 | import scala.actors.Actor._; 29 | import scala.collection.mutable.ArrayBuffer; 30 | import scala.collection._; 31 | import scala.actors.remote.RemoteActor._; 32 | import scala.actors.remote._; 33 | import scala.reflect.Manifest; 34 | import TransActor._; 35 | 36 | object Distributor { 37 | // Every job needs an id: 38 | type JobID = Int; 39 | } 40 | import Distributor._; 41 | 42 | /** 43 | * Trait for defining objects that can automatically distribute tasks to perform on iterables. 44 | * @author dlwh 45 | */ 46 | trait Distributor { 47 | /** 48 | * Generates a DistributedIterable based on the sharding function. Shards from the list are 49 | * automatically divvied out to the workers. 50 | */ 51 | def distribute[T] (it : Iterable[T])(implicit shard : (Iterable[T],Int)=>List[Iterable[T]]) : DistributedIterable[T]; 52 | 53 | /** 54 | * Low level operation: should generally not be used, but made public for completeness. 55 | * Given a U, automatically shard it out using the shard function to workers. 56 | * @return a handle to the shards 57 | */ 58 | def shard[U,V](it : U)(implicit myShard : (U,Int)=>List[V]) : JobID; 59 | 60 | /** 61 | * Low level operation: should generally not be used, but made public for completeness. 62 | * Convert all sharded elements to U's. More or less a map operation. 63 | * @return a handle to the changed shards.
64 | */ 65 | def schedule[T,U](id : JobID, f: T=>U) : JobID; 66 | 67 | /** 68 | * Low level operation: should generally not be used, but made public for completeness. 69 | * Retrieve all shards, first applying f to each one. Each shard is sent as Some(t) to the Actor. 70 | * When finished, None is sent. 71 | * Results are delivered to the gather actor; nothing is returned. 72 | */ 73 | def gather[T,U](job : JobID, f: T=>U, gather : Actor) :Unit; 74 | 75 | /** 76 | * Takes input shards of type T and creates a new set of shards U, which are then processed. 77 | * This can support a Google-style MapReduce. 78 | * @return a handle to the new shards. 79 | */ 80 | def groupBy[T,U,V](job : JobID, f: (T,((Int,U)=>Unit))=>Unit, received: Iterator[U]=>V): JobID; 81 | 82 | /** 83 | * Low level operation: should generally not be used, but made public for completeness. 84 | * Delete all shards with this id. 85 | * Removal is asynchronous; nothing is returned. 86 | */ 87 | def remove(job : JobID) : Unit; 88 | 89 | /** 90 | * Close the distributor and all workers. 91 | */ 92 | def close() {} 93 | 94 | } 95 | 96 | private object Priv { 97 | 98 | // Messages to the scheduler from the distributor 99 | sealed case class SchedMsg; 100 | case class Shard[U,V](it : U, shard : (U,Int)=>List[V]) extends SchedMsg; 101 | case class GroupBy[T,U,V](job: JobID, f: (T,((Int,U)=>Unit))=>Unit, received: Iterator[U]=>V) extends SchedMsg; 102 | case class Sched(in : JobID, f : Any=>Any) extends SchedMsg; 103 | case class Get[T,U](job : JobID, f : T => U, gather : Actor) extends SchedMsg; 104 | case class Remove[U](job : JobID) extends SchedMsg; 105 | case class AddWorker[U](a :OutputChannel[Any]) extends SchedMsg; 106 | 107 | sealed case class WorkerMsg; 108 | case class Do(id : JobID, f : Any=>Any, out : JobID) extends WorkerMsg; 109 | case class InPlaceDo(id : JobID, f : Any=>Unit) extends WorkerMsg; 110 | case class Retrieve[T,U](in : JobID, f : Any=>Any, out : JobID, actor : Either[Actor,SerializedActor]) extends WorkerMsg; 111 | case class GetOutputActor[U,V](isLocal : Boolean, out : JobID, shard : Int, process : Iterator[U]=>V) extends WorkerMsg; 112 | case class DoneAdding(id : JobID) extends WorkerMsg; 113 | case class Reserve(id : JobID, shard : Int) extends WorkerMsg; 114 | case class Done[U](id : JobID, shard : Int, result : U) extends WorkerMsg; 115 | case object Close extends WorkerMsg; 116 | 117 | case class StartGet(out: JobID, numShards : Int, gather : Actor); 118 | case class Retrieved(out : JobID, shard : Int, result : Any); 119 | } 120 | import Priv._; 121 | 122 | object Debug extends scala.actors.Debug("smr:: ") { 123 | level = 4; 124 | } 125 | 126 | /** 127 | * Class most users will use. Example use: 128 | *
129 |  * val dist = new ActorDistributor(4,4000);
130 |  * dist.distribute(myIterable).map(f).reduce(g);
131 |  * 
132 | */ 133 | class ActorDistributor(numWorkers : Int, port : Int) extends Distributor { 134 | override def distribute[T] (it : Iterable[T]) 135 | (implicit myShard : (Iterable[T],Int)=>List[Iterable[T]]) : DistributedIterable[T] = new InternalIterable[T] { 136 | protected lazy val id : JobID = shard(it)(myShard); 137 | protected lazy val scheduler = ActorDistributor.this; 138 | }; 139 | 140 | // pushes data onto the grid 141 | def shard[U,V](it : U)(implicit myShard : (U,Int)=>List[V]) = (scheduler !? Shard(it,myShard)).asInstanceOf[JobID]; 142 | // runs a task on some data on the grid 143 | def schedule[T,U](id : JobID, f: T=>U) = (scheduler !? Sched(id,f.asInstanceOf[Any=>Any])).asInstanceOf[JobID]; 144 | // gets it back using some function. Returns immediately; expect output at the gather actor. 145 | def gather[T,U](job : JobID, f: T=>U, gather : Actor) :Unit = (scheduler ! Get(job,f,gather)); 146 | // gets rid of it: 147 | def remove(job : JobID) : Unit = (scheduler ! Remove(job)); 148 | 149 | def groupBy[T,U,V](job : JobID, f: (T,((Int,U)=>Unit))=>Unit, received: Iterator[U]=>V) = 150 | (scheduler !? GroupBy(job,f,received)).asInstanceOf[JobID]; 151 | /** 152 | * Adds a (possibly remote) Worker to the workers list. 153 | */ 154 | def addWorker(w :OutputChannel[Any]) : Unit = (scheduler ! AddWorker(w)); 155 | 156 | override def close = { 157 | scheduler ! Exit(self,'close); 158 | workers.foreach(_._2 ! Close); 159 | } 160 | 161 | private val gatherer = actor { 162 | val gatherers = mutable.Map[JobID,Actor](); 163 | val shardsLeft = mutable.Map[JobID,Int](); 164 | loop { 165 | react { 166 | case StartGet(out, numShards, gather) => 167 | gatherers(out) = gather; 168 | shardsLeft(out) = numShards; 169 | reply{ None} 170 | case Retrieved(out, shard, result)=> 171 | gatherers(out) ! Some((shard,result)); 172 | shardsLeft(out) -= 1; 173 | if(shardsLeft(out) == 0) { 174 | gatherers(out) ! None; 175 | shardsLeft -= out; 176 | gatherers -= out; 177 | } 178 | } 179 | } 180 | } 181 | 182 | // Accumulator is a remote actor, so it just acts as a middleman for the gatherer. 183 | // Otherwise, potentially large amounts of data would get serialized in the gather closure for no reason. 184 | private val remoteAccumulator = transActor(port,'accumulator) { 185 | loop { 186 | react { 187 | case x => gatherer ! x 188 | } 189 | } 190 | } 191 | 192 | private val localAccumulator = actor { 193 | loop { 194 | react { 195 | case x => gatherer ! x 196 | } 197 | } 198 | } 199 | 200 | // central dispatcher for ActorDistributor 201 | private val scheduler = actor { 202 | val numShards = mutable.Map[JobID,Int](); 203 | var nextJob : JobID = 0 204 | def getNextJob() = { 205 | val job = nextJob; 206 | nextJob += 1; 207 | job; 208 | } 209 | loop { 210 | react { 211 | case scala.actors.Exit(_,_) => exit(); 212 | case Shard(it,shard)=> 213 | val job = getNextJob(); 214 | val shards = shard(it,workers.length) 215 | numShards += (job -> shards.length); 216 | shards.zipWithIndex.foreach { 217 | x => 218 | Debug.info( "sending shard " + x._2 + " to Worker " + x._2 % workers.length); 219 | workers(x._2 % workers.length)._2 ! Done(job,x._2,x._1) 220 | } 221 | workers.foreach { _._2 ! DoneAdding(job) } 222 | reply { job } 223 | case Sched(in,f)=> 224 | val job = getNextJob(); 225 | val oldNumShards = numShards(in); 226 | numShards += (job->oldNumShards); 227 | Debug.info( "Running " + f.getClass.getName() + " on job " + in + "'s output as job " + job); 228 | workers.foreach { a => 229 | a._2 ! Do(in, f, job)
230 | } 231 | reply { job } 232 | case GroupBy(in, f, r) => 233 | // nested def to get type inference... 234 | def handleGroupBy[T,U,V](in : JobID, f : ( (T,(Int,U)=>Unit)=>Unit), r : Iterator[U]=>V) { 235 | val out = getNextJob(); 236 | val oldNumShards = numShards(in); 237 | numShards += (out->oldNumShards); 238 | // set up forwarding actors for the hashed outputs 239 | val outActors = getOutActors(out, oldNumShards, r); 240 | for( (isLocal,w) <- workers) { 241 | w ! DoneAdding(out); 242 | } 243 | 244 | val localActors = outActors.map(getLocalActors); 245 | def localOut(x : Any) { 246 | def output(idx : Int, u : U) { 247 | localActors(idx%localActors.length) ! Some(u); 248 | } 249 | f(x.asInstanceOf[T],output); 250 | localAccumulator ! Retrieved(out,1,None); 251 | } 252 | 253 | val remoteActors = outActors.map(getRemoteActors); 254 | def remoteOut(x :Any) { 255 | def output(idx : Int, u : U) { 256 | remoteActors(idx%remoteActors.length) ! Some(u); 257 | } 258 | f(x.asInstanceOf[T],output); 259 | remoteAccumulator ! Retrieved(out,1,None); 260 | } 261 | val rendezvous = actor { 262 | loop { 263 | react { 264 | case None => 265 | localActors.foreach{ _ ! None}; 266 | case _ => // don't care about results, just want to know when we're done. 267 | } 268 | } 269 | } 270 | gatherer ! StartGet(out, oldNumShards, rendezvous); 271 | for( (isLocal,w) <- workers) { 272 | w ! InPlaceDo(in,if(!isLocal) remoteOut else localOut); 273 | } 274 | reply {out}; 275 | } 276 | handleGroupBy(in,f,r); 277 | case Get(in,f,gather)=> 278 | val out = getNextJob(); 279 | Debug.info( "Getting job " + in + " with function " + f.getClass.getName() + " as job id " + out); 280 | gatherer !? StartGet(out,numShards(in),gather); 281 | workers.foreach{ a => a._2 ! Retrieve(in,f.asInstanceOf[Any=>Any],out,if(a._1) Left(localAccumulator) else Right(remoteAccumulator))} 282 | case AddWorker(a)=> 283 | Debug.info("Added a worker."); 284 | workers += new Tuple2(false,a); // TODO: improve 285 | 286 | case Remove(id) => 287 | Debug.info("Master removing job " + id); 288 | workers.foreach{ _._2 ! Remove(id)}; 289 | numShards -= id; 290 | } 291 | } 292 | } 293 | private def getOutActors[U,V](out : JobID, numShards : Int, r : Iterator[U]=>V) = { 294 | for(i <- 0 until numShards; 295 | (isLocal,w) = workers(i % workers.length)) { 296 | w ! GetOutputActor(isLocal, out, i, r);
297 | } 298 | val buff = new ArrayBuffer[(Option[Actor],SerializedActor)]; 299 | for( i <- 1 to numShards) { 300 | buff += (Actor.?).asInstanceOf[(Option[Actor],SerializedActor)]; 301 | } 302 | buff.toSeq; 303 | } 304 | 305 | private def getLocalActors(a : (Option[Actor],SerializedActor)):OutputChannel[Any] = a._1 match { 306 | case Some(a) => a; 307 | case None => a._2 308 | } 309 | private def getRemoteActors(a : (Option[Actor],SerializedActor)): OutputChannel[Any] = a._2; 310 | // the boolean says the worker is local, so we don't need to serialize things 311 | private val workers = new ArrayBuffer[(Boolean,OutputChannel[Any])]; 312 | for (i <- List.range(0,numWorkers)) 313 | workers += new Tuple2(true,Worker()); 314 | } 315 | 316 | private[smr] trait InternalIterable[T] extends DistributedIterable[T] { 317 | protected val id : JobID; 318 | protected val scheduler : Distributor; 319 | import InternalIterable._; 320 | 321 | def elements = { 322 | val list : List[(Int,Iterable[T])] = handleGather(this,Util.identity[Iterable[T]]).toList; 323 | list.sort(_._1 < _._1).map(_._2.projection).reduceLeft(_ append _).elements 324 | } 325 | 326 | def map[U](f : T=>U)(implicit mU : Manifest[U]) : DistributedIterable[U] = handleMap(this,f); 327 | def flatMap[U](f : T=>Iterable[U]) (implicit mU : Manifest[U]): DistributedIterable[U] = handleFlatMap(this,f); 328 | def filter(f : T=>Boolean) : DistributedIterable[T] = handleFilter(this,f); 329 | def reduce[B >: T](f : (B,B)=>B) : B = handleReduce(this,f) 330 | override def mapReduce[U,B >: U](m : T=>U)(r : (B,B)=>B)(implicit mU:Manifest[U]) = { 331 | handleMapReduce(this,m,r); 332 | } 333 | def groupBy[U](group: T=>U):DistributedIterable[(U,Seq[T])] = handleGroupBy(this,group); 334 | def distinct() = handleDistinct(this); 335 | def force = this; 336 | 337 | override protected def finalize() { 338 | try { 339 | scheduler.remove(id); 340 | } finally { 341 | super.finalize(); 342 | } 343 | } 344 | } 345 | 346 | /** 347 | * This object wouldn't exist, except that Scala closures pass in the this pointer 348 | * even if you don't use any state. Objects don't have that restriction. 349 | */ 350 | private[smr] object InternalIterable { 351 | private def handleGather[T,C,U](self : InternalIterable[T], f : SerFunction1[C,U]) = { 352 | val recv = actor { 353 | val b = new ArrayBuffer[(Int,U)]; 354 | react { 355 | case 'start => 356 | val replyTo = Actor.sender; 357 | loop { 358 | react{ 359 | case Some(x) => 360 | b += x.asInstanceOf[(Int,U)]; 361 | Debug.info("Got shard " + x.asInstanceOf[(Int,U)]._1); 362 | case None => replyTo ! b ; exit(); 363 | } 364 | } 365 | } 366 | } 367 | self.scheduler.gather(self.id, f, recv); 368 | (recv !? 'start).asInstanceOf[ArrayBuffer[(Int,U)]];
369 | } 370 | 371 | private def handleMap[T,U](self : InternalIterable[T], f : T=>U) = { 372 | new InternalIterable[U] { 373 | protected val scheduler = self.scheduler; 374 | Debug.info("Map with " + f.getClass.getName); 375 | protected val id = scheduler.schedule(self.id,Util.fMap(f)); 376 | } 377 | } 378 | private def handleFlatMap[T,U](self : InternalIterable[T], f : T=>Iterable[U]) = { 379 | new InternalIterable[U] { 380 | protected val scheduler = self.scheduler; 381 | protected val id = scheduler.schedule(self.id,Util.fFlatMap(f)); 382 | } 383 | } 384 | private def handleFilter[T](self : InternalIterable[T], f : T=>Boolean) = { 385 | new InternalIterable[T] { 386 | protected val scheduler = self.scheduler; 387 | protected val id = scheduler.schedule(self.id,Util.fFilter(f)); 388 | } 389 | } 390 | 391 | private def handleReduce[T,B>:T](self : InternalIterable[T], f : (B,B)=>B) = { 392 | val b = handleGather[T,Iterable[T],Option[B]](self,new SerFunction1[Iterable[T],Option[B]]{ 393 | def apply(x : Iterable[T])= if (x.isEmpty) None else Some(x.reduceLeft(f)); 394 | }); 395 | b.filter(None!=).map{ (x : (Int,Option[B])) => x._2.get}.reduceLeft(f); 396 | } 397 | 398 | private def handleMapReduce[T,U,B>:U](self :InternalIterable[T], m : T=>U, r : (B,B)=>B) = { 399 | Debug.info("MapReduce with " + m.getClass.getName + " and reduce " + r.getClass.getName); 400 | 401 | val doMapReduce = new SerFunction1[Iterable[T],Option[B]] { 402 | def apply(x : Iterable[T]) = { 403 | if (x.isEmpty) None 404 | else { 405 | val elems = x.elements; 406 | var acc : B = m(elems.next); 407 | while(elems.hasNext) acc = r(acc,m(elems.next)); 408 | Some(acc); 409 | } 410 | } 411 | } 412 | val b = handleGather[T,Iterable[T],Option[B]](self,doMapReduce); 413 | b.filter(None!=).map{ (x : (Int,Option[B])) => x._2.get}.reduceLeft(r); 414 | } 415 | 416 | private def handleGroupBy[T,U](self : InternalIterable[T], group : T=>U) = { 417 | val innerGroupBy = { (it : Iterable[T],out : (Int,(U,T)) =>Unit) => 418 | for(x <- it) { 419 | val ARBITRARY_PRIME=47; 420 | val u = group(x); 421 | out(u.hashCode+ARBITRARY_PRIME,(u,x)); 422 | } 423 | } 424 | 425 | val receiver = { (it: Iterator[(U,T)]) => 426 | val map = scala.collection.mutable.Map[U,ArrayBuffer[T]](); 427 | for( (u,t) <- it) { 428 | map.getOrElseUpdate(u,new ArrayBuffer[T]) += t; 429 | } 430 | map.toSeq; 431 | } 432 | new InternalIterable[(U,Seq[T])] { 433 | protected val scheduler = self.scheduler; 434 | protected val id = scheduler.groupBy(self.id,innerGroupBy, receiver); 435 | } 436 | } 437 | 438 | private def handleDistinct[T](self : InternalIterable[T]) = { 439 | val innerGroupBy = { (it : Iterable[T],out : (Int,T) =>Unit) => 440 | for(x <- it) { 441 | val ARBITRARY_PRIME=47; 442 | out(x.hashCode+ARBITRARY_PRIME,x); 443 | } 444 | } 445 | 446 | val receiver = { (it: Iterator[T]) => 447 | val set = scala.collection.mutable.Set[T]() ++ it; 448 | set.toSeq; 449 | } 450 | new InternalIterable[T] { 451 | protected val scheduler = self.scheduler; 452 | protected val id = scheduler.groupBy(self.id,innerGroupBy, receiver); 453 | } 454 | } 455 | 456 | } 457 | --------------------------------------------------------------------------------
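
A note on Lines.loadIterator in PathPairs.scala above: it relies on Util.iteratorFromProducer to adapt the pull-style LineRecordReader into a Scala Iterator that closes the reader at end of input. The sketch below is a minimal, self-contained reconstruction of that adapter pattern; the name iteratorFromProducer matches the call site, but this body is an assumption, not smr's actual Util code.

object ProducerIteratorSketch {
  // Adapt a producer that yields Some(a) until exhausted (then None) into an Iterator[A].
  def iteratorFromProducer[A](prod: () => Option[A]): Iterator[A] = new Iterator[A] {
    private var pending: Option[A] = prod();
    def hasNext = pending.isDefined;
    def next(): A = {
      val a = pending.getOrElse(throw new NoSuchElementException("producer exhausted"));
      pending = prod(); // pull the next element eagerly so hasNext stays accurate
      a;
    }
  }

  def main(args: Array[String]) {
    var i = 0;
    val it = iteratorFromProducer { () => i += 1; if (i <= 3) Some(i) else None };
    it.foreach(println); // prints 1, 2, 3
  }
}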
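The payoff of ProjectedPairs is that chained map/filter/mapSecond calls fuse into a single iterator transform, so only one map job runs when the result is finally demanded, and force() caches the output paths so repeated traversals don't resubmit it. A hedged usage sketch: `pairs` is a hypothetical stand-in for any DistributedPairs[Long,String] (for instance a PathPairs with the Lines format), with the usual manifests in scope.

// Each step below only composes iterator transforms; no job runs yet.
val cleaned = pairs.mapSecond(_.trim)
                   .filter { case (_, line) => line.length > 0 };
// The first demand runs one job (PairTransformMapper + IdentityReduce) and caches the result:
cleaned.elements.take(5).foreach(println);
// Later traversals reuse the cached PathPairs instead of resubmitting:
cleaned.elements.foreach(println);
// Or pin the output under a stable directory name ("cleaned-lines" is hypothetical):
val staged = cleaned.asStage("cleaned-lines");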
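For the actor backend, the scaladoc example in Distributor.scala ("dist.distribute(myIterable).map(f).reduce(g)") decomposes into the trait's primitives: distribute calls shard to push the data out, map calls schedule to transform shards in place on the workers, and reduce calls gather to pull per-shard results back before a final local reduceLeft. A minimal end-to-end sketch, assuming smr.Defaults supplies the implicit sharding function as the repository's test scripts do:

import smr._;
import smr.Defaults._;

val dist = new ActorDistributor(4, 9010);    // 4 local workers; the port serves remote ones
val data = dist.distribute(1 to 1000);       // shard(): splits the range across the workers
val result = data.map(_ * 2).reduce(_ + _);  // schedule() then gather(), reduced per shard first
println(result);                             // 1001000
dist.close();                                // sends Exit to the scheduler, Close to each worker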
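handleGroupBy and handleDistinct both route elements by hashed key so equal keys always land on the same output shard, and the receiver then buckets the shuffled stream; the observable semantics are those of an ordinary groupBy, modulo element and shard ordering. A small sketch of what a caller sees, under the same assumptions as the previous example (ordering within and across the pairs is not guaranteed):

val xs = dist.distribute(1 to 10);
val byParity = xs.groupBy(_ % 2);   // DistributedIterable[(Int, Seq[Int])]
byParity.elements.foreach(println); // roughly: (1, [1,3,5,7,9]) and (0, [2,4,6,8,10])
val uniq = dist.distribute(List(1, 1, 2, 3, 3)).distinct(); // same machinery, Set-based receiver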
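One caveat worth spelling out about handleReduce and handleMapReduce: the combining function is applied with reduceLeft within each shard and then again across the shard results in arrival order, so a deterministic answer requires it to be associative and, in practice, commutative. A self-contained worked check:

// handleReduce computes: reduceLeft over each shard, then reduceLeft over the shard results.
val data = (1 to 10).toList;
val shards = List(data.take(3), data.slice(3, 7), data.drop(7)); // one possible sharding
val perShard = shards.map(_.reduceLeft(_ + _)); // List(6, 22, 27)
val total = perShard.reduceLeft(_ + _);         // 55, same as data.reduceLeft(_ + _)
// A non-associative op such as (_ - _) gives answers that depend on the shard boundaries,
// and a non-commutative one can vary with the order in which shard results arrive.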