[numWorkers] [classLoader class]");
108 | }
109 |
110 | scala.actors.remote.RemoteActor.classLoader = if(argv.length > 3) Class.forName(argv(3)).getClassLoader else classOf[Hub].getClassLoader
111 |
112 | val numWorkers = if(argv.length > 2) java.lang.Integer.parseInt(argv(2)) else Runtime.getRuntime.availableProcessors;
113 |
114 | val hub = select(Node(argv(0),java.lang.Integer.parseInt(argv(1))), 'hub)
115 |
116 | val host = java.net.InetAddress.getLocalHost().getHostName()
117 |
118 | for(i <- 1 to numWorkers) {
119 |     val port = Util.freePort()
120 |     Worker(port,'worker);
121 | hub ! Hub.HubRegister('worker, host, port)
122 | }
123 |
124 | println("Worker: starting")
125 | }
126 | }
127 |
128 |
--------------------------------------------------------------------------------
/src/smr/Util.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2008, David Hall
3 | * All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | *
13 | * THIS SOFTWARE IS PROVIDED BY DAVID HALL ``AS IS'' AND ANY
14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
15 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
16 | * DISCLAIMED. IN NO EVENT SHALL DAVID HALL BE LIABLE FOR ANY
17 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
18 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
19 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
20 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
21 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
22 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23 | */
24 | package smr;
25 | import scala.actors.Actor._;
26 | import scala.actors.Actor;
27 |
28 | object Util {
29 | def identity[T] = new SerFunction1[T,T] {
30 | def apply(x : T) = x;
31 | };
32 | def fMap[T,U](f : T=>U) = new SerFunction1[Iterable[T],Iterable[U]] {
33 | def apply(x : Iterable[T]) = x.map(f);
34 | };
35 |
36 | def fFlatMap[T,U](f : T=>Iterable[U]) = new SerFunction1[Iterable[T],Iterable[U]] {
37 | def apply(x : Iterable[T]) = x.flatMap(f);
38 | };
39 |
40 | def fFilter[T](f : T=>Boolean) = new SerFunction1[Iterable[T],Iterable[T]] {
41 | def apply(x : Iterable[T]) = x.filter(f);
42 | };
43 |
44 | def itMap[T,U](f : T=>U) = new SerFunction1[Iterator[T],Iterator[U]] {
45 | def apply(x : Iterator[T]) = x.map(f);
46 | };
47 |
48 | def itFlatMap[T,U](f : T=>Iterable[U]) = new SerFunction1[Iterator[T],Iterator[U]] {
49 | def apply(it : Iterator[T]) = for(x <- it; y <- f(x).elements) yield y;
50 | };
51 |
52 | def itFilter[T](f : T=>Boolean) = new SerFunction1[Iterator[T],Iterator[T]] {
53 | def apply(x : Iterator[T]) = x.filter(f);
54 | };
55 |
56 | // g(f(x))
57 | def andThen[A,B,C](f: A=>B, g:B=>C) = new SerFunction1[A,C] {
58 | def apply(a : A) = g(f(a));
59 | }
60 |
61 | def freePort() : Int = {
62 | val server = new java.net.ServerSocket(0);
63 | val port = server.getLocalPort();
64 | server.close();
65 | return port;
66 | }
67 |
68 | /**
69 | * Iterator that reacts to get the next element.
70 |    * Used internally to stream shard data from another actor into an ordinary Iterator.
71 | */
72 | class ActorIterator[T] extends Iterator[T] {
73 | def hasNext() = !nulled && (cache match {
74 | case Some(x) => true;
75 | case _ =>
76 | (receiver !? Poll) match {
77 | case None => nulled = true; receiver ! Close; false;
78 | case opt @ Some(x) => cache = opt.asInstanceOf[Option[T]]; true;
79 | }
80 | })
81 | def next() = {hasNext(); val x = cache.get; cache = None; x}
82 |
83 | val receiver = actor {
84 | loop {
85 | react {
86 | case Poll => reply {Actor.?}
87 | case Close => exit();
88 | }
89 | }
90 | }
91 | private var nulled = false;
92 | private var cache : Option[T] = None;
93 |   private case object Poll;
94 |   private case object Close;
95 | }
96 |
97 | def iteratorFromProducer[T](p : ()=>Option[T]) = new Iterator[T] {
98 | private var nxt : Option[T] = None;
99 | private var hasN = true;
100 |
101 | private def readNext() = p() match {
102 | case o @ Some(t) => nxt = o;
103 | case None=>nxt=None; hasN = false;
104 | }
105 |
106 | def hasNext = hasN && (nxt match {
107 | case Some(t) => true;
108 | case None => readNext(); hasN;
109 | })
110 |
111 | def next : T = nxt match {
112 | case Some(t) => nxt=None; t;
113 | case None=> readNext(); if(hasN) next else throw new NoSuchElementException();
114 | }
115 | }
116 |
117 | }
118 |
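
A minimal usage sketch of Util.iteratorFromProducer, which wraps a pull-style producer as a lazy Iterator; the BufferedReader and file name are illustrative, not part of SMR:

    import java.io.{BufferedReader, FileReader};

    val rdr = new BufferedReader(new FileReader("data.txt"));  // hypothetical input
    val lines = Util.iteratorFromProducer { () =>
      val line = rdr.readLine();
      if (line == null) { rdr.close(); None }  // None signals exhaustion
      else Some(line);
    }
    lines.foreach(println);
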
--------------------------------------------------------------------------------
/src/smr/hadoop/Magic.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2008, David Hall
3 | * All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | *
13 | * THIS SOFTWARE IS PROVIDED BY DAVID HALL ``AS IS'' AND ANY
14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
15 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
16 | * DISCLAIMED. IN NO EVENT SHALL DAVID HALL BE LIABLE FOR ANY
17 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
18 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
19 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
20 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
21 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
22 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23 | */
24 | package smr.hadoop;
25 | import smr._;
26 |
27 | import java.io._;
28 |
29 | import org.apache.hadoop.io._;
30 | import org.apache.hadoop.conf._;
31 | import org.apache.hadoop.fs._;
32 | import org.apache.hadoop.util._;
33 | import org.apache.hadoop.mapred._;
34 | import org.apache.hadoop.filecache._;
35 |
36 | import scala.reflect.Manifest;
37 |
38 | // You know it's bad when you have a class called magic
39 | private object Magic {
40 | def wireToReal(t : Writable) :Any = t match {
41 | case t :Text => t.toString;
42 | case arr : ArrayWritable => arr.get().map(wireToReal);
43 | case t => try {
44 | t.asInstanceOf[{def get():Any;}].get();
45 | } catch {
46 | case e => t;
47 | }
48 | }
49 |
50 | implicit def realToWire(t : Any):Writable = t match {
51 | case t : Writable => t;
52 | case t : Int => new IntWritable(t);
53 | case t : Long => new LongWritable(t);
54 | //case t : Byte => new ByteWritable(t);
55 | case t : Float => new FloatWritable(t);
56 | //case t : Double => new DoubleWritable(t);
57 | case t : Boolean => new BooleanWritable(t);
58 | case t : String => new Text(t);
59 | case t : Array[Byte] => new BytesWritable(t);
60 | case x : AnyRef if x.getClass.isArray => {
61 | val t = x.asInstanceOf[Array[Any]];
62 | if(t.length == 0) new AnyWritable(t);
63 | else {
64 | val mapped = t.map(realToWire);
65 | val classes = mapped.map(_.getClass);
66 | if(classes.forall(classes(0)==_)) {
67 | // can only use ArrayWritable if all Writables are the same.
68 | new ArrayWritable(classes(0),mapped);
69 | } else {
70 | // fall back on AnyWritable
71 | val mapped = t.map(new AnyWritable[Any](_).asInstanceOf[Writable]);
72 | new ArrayWritable(classOf[AnyWritable[_]],mapped);
73 | }
74 | }
75 | }
76 | case _ => new AnyWritable(t);
77 | }
78 |
79 | private val CInt = classOf[Int];
80 | private val CLong = classOf[Long];
81 | private val CByte = classOf[Byte];
82 | private val CDouble = classOf[Double];
83 | private val CFloat = classOf[Float];
84 | private val CBoolean = classOf[Boolean];
85 | private val CString = classOf[String];
86 | private val CArrayByte = classOf[Array[Byte]];
87 | private val CArray = classOf[Array[_]];
88 |
89 | def mkManifest[T](c:Class[T]) = new Manifest[T] {
90 | def erasure = c;
91 | }
92 |
93 | private val CWritable = mkManifest(classOf[Writable]);
94 |
95 | def classToWritableClass[T](c: Class[T]):Class[Writable] = c match {
96 | case c if mkManifest(c) <:< CWritable => c.asInstanceOf[Class[Writable]];
97 | case CInt => classOf[IntWritable].asInstanceOf[Class[Writable]];
98 | case CLong => classOf[LongWritable].asInstanceOf[Class[Writable]];
99 | // case CByte => classOf[ByteWritable].asInstanceOf[Class[Writable]];
100 | //case CDouble => classOf[DoubleWritable].asInstanceOf[Class[Writable]];
101 | case CFloat => classOf[FloatWritable].asInstanceOf[Class[Writable]];
102 | case CBoolean => classOf[BooleanWritable].asInstanceOf[Class[Writable]];
103 | case CString => classOf[Text].asInstanceOf[Class[Writable]];
104 | case CArrayByte => classOf[BytesWritable].asInstanceOf[Class[Writable]];
105 | case CArray => classOf[ArrayWritable].asInstanceOf[Class[Writable]];
106 | case _ => classOf[AnyWritable[_]].asInstanceOf[Class[Writable]];
107 | }
108 |
109 | }
110 |
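
Magic is private to smr.hadoop, but within the package its two conversions round-trip plain Scala values through Hadoop Writables. A small sketch of the expected behavior (assuming package-internal scope; AnyWritable is defined elsewhere in the project):

    import org.apache.hadoop.io.Writable;

    val i: Writable = Magic.realToWire(42);       // boxed as an IntWritable
    val s: Writable = Magic.realToWire("hello");  // boxed as a Text
    assert(Magic.wireToReal(i) == 42);            // unboxed via the structural get() call
    assert(Magic.wireToReal(s) == "hello");       // Text is special-cased to String
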
--------------------------------------------------------------------------------
/src/smr/Defaults.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2008, David Hall
3 | * All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | *
13 | * THIS SOFTWARE IS PROVIDED BY DAVID HALL ``AS IS'' AND ANY
14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
15 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
16 | * DISCLAIMED. IN NO EVENT SHALL DAVID HALL BE LIABLE FOR ANY
17 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
18 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
19 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
20 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
21 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
22 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23 | */
24 | package smr;
25 | import scala.collection.mutable._;
26 | import scala.reflect.Manifest;
27 |
28 | /**
29 | * Object to hold various sensible Defaults for SMR. Expected use:
30 | *
31 | * import smr.Defaults._;
32 | *
33 | *
34 | * @author dlwh
35 | */
36 | object Defaults {
37 |
38 | /**
39 | * Implicit shard function that provides a reasonable default in most cases. Special treatment for
40 | * Ranges and for Seqs
41 | */
42 | implicit def shard[T] (it : Iterable[T], numShards : Int) : List[Iterable[T]] = it match {
43 | case x : scala.Range.Inclusive => shardIRange(x,numShards).asInstanceOf[List[Iterable[T]]];
44 | case x : scala.Range=> shardRange(x,numShards).asInstanceOf[List[Iterable[T]]];
45 | case x : Seq[_] =>
46 | if(x.size < numShards) {
47 | List(x)
48 | } else {
49 | val sz = x.size / numShards;
50 | val arrs = new ArrayBuffer[Iterable[T]]
51 |         arrs ++= (for(i <- 0 until numShards) yield x.drop(sz * i).take(sz).toList);
52 | arrs.toList
53 | }
54 | case _ =>
55 | val arrs = new ArrayBuffer[ArrayBuffer[T]]
56 |       arrs ++= (for(i <- 1 to numShards) yield new ArrayBuffer[T]);
57 | val elems = it.elements
58 | var i = 0;
59 | while(elems.hasNext) { arrs(i%numShards) += elems.next; i += 1}
60 | arrs.toList
61 | }
62 | implicit def fakeDistributedIterable[T](it : Iterable[T]):DistributedIterable[T] = new DistributedIterable[T] {
63 | override def map[U](f : T=>U)(implicit mU : Manifest[U]) = fakeDistributedIterable(it.map(f));
64 | override def flatMap[U](f : T=>Iterable[U])(implicit mU : Manifest[U])= fakeDistributedIterable(it.flatMap(f));
65 | override def filter(f : T=>Boolean) = fakeDistributedIterable(it.filter(f));
66 | override def reduce[U>:T](f : (U,U)=>U) = it.reduceLeft(f);
67 | def groupBy[U](grp : T=>U) = {
68 | val map = Map[U,ArrayBuffer[T]]();
69 | for( t <- elements) {
70 | map.getOrElseUpdate(grp(t),new ArrayBuffer[T]) += t;
71 | }
72 | fakeDistributedIterable(map.asInstanceOf[Map[U,Iterable[T]]].toList);
73 | }
74 | override def distinct() = (Set() ++ elements).toSeq
75 |
76 | override def force() = try {
77 | it.asInstanceOf[Iterable.Projection[T]].force
78 | } catch {
79 | case e : ClassCastException => it
80 | }
81 |
82 | def elements = it.elements;
83 | }
84 |
85 | private def shardRange (r : scala.Range, numShards : Int) : List[Iterable[Int]]= {
86 | val arrs = new ArrayBuffer[Range]
87 | val n = numShards;
88 |     arrs ++= (for(i <- 0 until n) yield new Range(r.start + i * r.step,r.end,n * r.step));
89 | arrs.toList
90 | }
91 |
92 | private def shardIRange (r : scala.Range.Inclusive, numShards : Int) : List[Iterable[Int]]= {
93 | val arrs = new ArrayBuffer[Range.Inclusive]
94 | val n = numShards;
95 |     arrs ++= (for(i <- 0 until n) yield new Range.Inclusive(r.start + i * r.step, r.end, n * r.step));
96 | arrs.toList
97 | }
98 |
99 | /**
100 | * Borrowed from scala source. Just add the annotation tag.
101 | * Part of the Scala API.
102 | * original author: @author Stephane Micheloud
103 | */
104 | @serializable
105 | private[Defaults] class Range(val start: Int, val end: Int, val step: Int) extends RandomAccessSeq.Projection[Int] {
106 | if (step == 0) throw new Predef.IllegalArgumentException
107 |
108 | /** Create a new range with the start and end values of this range and
109 | * a new step.
110 | */
111 | def by(step: Int): Range = new Range(start, end, step)
112 |
113 | lazy val length: Int = {
114 | if (start < end && this.step < 0) 0
115 | else if (start > end && this.step > 0) 0
116 | else {
117 | val base = if (start < end) end - start
118 | else start - end
119 | assert(base >= 0)
120 | val step = if (this.step < 0) -this.step else this.step
121 | assert(step >= 0)
122 | base / step + last(base, step)
123 | }
124 | }
125 |
126 | protected def last(base: Int, step: Int): Int =
127 | if (base % step != 0) 1 else 0
128 |
129 | def apply(idx: Int): Int = {
130 | if (idx < 0 || idx >= length) throw new Predef.IndexOutOfBoundsException
131 | start + (step * idx)
132 | }
133 |
134 | /** a Seq.contains, not a Iterator.contains! */
135 | def contains(x: Int): Boolean =
136 | if (step > 0)
137 | x >= start && x < end && (((x - start) % step) == 0)
138 | else
139 | x <= start && x > end && (((x - end) % step) == 0)
140 |
141 | def inclusive = new Range.Inclusive(start,end,step)
142 | }
143 |
144 | private[Defaults] object Range {
145 | @serializable
146 | private[Defaults] class Inclusive(start: Int, end: Int, step: Int) extends Range(start, end, step) {
147 | override def apply(idx: Int): Int = super.apply(idx)
148 | override protected def last(base: Int, step: Int): Int = 1
149 | override def by(step: Int): Range = new Inclusive(start, end, step)
150 | }
151 | }
152 |
153 | }
154 |
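
A sketch of the default sharding on a Range: it slices by stride, so each shard is itself a compact, serializable Range rather than a materialized list:

    import smr.Defaults._;

    val shards = shard(0 until 9, 3);
    // shards(0) iterates 0, 3, 6; shards(1) iterates 1, 4, 7; shards(2) iterates 2, 5, 8.
    // A Seq is instead cut into numShards contiguous chunks, and any other Iterable
    // is dealt out round-robin into ArrayBuffers.
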
--------------------------------------------------------------------------------
/src/smr/hadoop/PathIterable.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2008, David Hall
3 | * All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | *
13 | * THIS SOFTWARE IS PROVIDED BY DAVID HALL ``AS IS'' AND ANY
14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
15 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
16 | * DISCLAIMED. IN NO EVENT SHALL DAVID HALL BE LIABLE FOR ANY
17 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
18 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
19 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
20 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
21 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
22 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23 | */
24 | package smr.hadoop;
25 | import smr._;
26 | import org.apache.hadoop.io._;
27 | import org.apache.hadoop.conf._;
28 | import org.apache.hadoop.fs._;
29 | import org.apache.hadoop.util._;
30 | import org.apache.hadoop.mapred._;
31 | import scala.reflect.Manifest;
32 |
33 | /**
34 | * Represents SequenceFiles of (Hadoop.DefaultKey,T) pairs on disk.
35 | * All operations are scheduled as MapReduces using Hadoop.runMapReduce.
36 | * The DefaultKey is inaccessible.
37 | */
38 | class PathIterable[T](h: Hadoop, val paths: Array[Path])(implicit m: Manifest[T]) extends DistributedIterable[T] {
39 | import Magic._;
40 | def elements = {
41 | if(paths.length == 0)
42 | new Iterator[T] {
43 | def hasNext = false;
44 |         def next = throw new NoSuchElementException("No elements were found!")
45 | }
46 | else paths.map(loadIterator).reduceLeft(_++_)
47 | }
48 |
49 | def force = this;
50 |
51 | import Hadoop._;
52 | def reduce[B>:T](f: (B,B)=>B) : B = {
53 | implicit val b = m.asInstanceOf[Manifest[B]];
54 | implicit val klass = inputFormatClass.asInstanceOf[Class[InputFormat[Any,B]]];
55 | val output = h.runMapReduce(paths, new CollectorMapper(identity[Iterator[B]]), new RealReduce(f), Set(ReduceCombine));
56 | val path = output(0);
57 |
58 | val result = new SequenceFile.Reader(path.getFileSystem(h.conf),path,h.conf);
59 | val v = result.getValueClass.asSubclass(classOf[Writable]).newInstance();
60 | val k = result.getKeyClass.asSubclass(classOf[Writable]).newInstance();
61 | result.next(k,v);
62 | result.close();
63 | Magic.wireToReal(v).asInstanceOf[B];
64 | }
65 |
66 | /**
67 | * Equivalent to Set() ++ it.elements, but distributed.
68 | */
69 | def distinct() = {
70 | implicit val klass = inputFormatClass.asInstanceOf[Class[InputFormat[DefaultKey,T]]];
71 | val output = h.runMapReduce(paths,new SwapMapper[DefaultKey,T],new KeyToValReduce[T,DefaultKey]);
72 | new PathIterable(h,output);
73 | }
74 |
75 | /**
76 | * Lazy
77 | */
78 | override def map[U](f : T=>U)(implicit m : Manifest[U]): DistributedIterable[U] = new ProjectedIterable[U](Util.itMap(f));
79 | /**
80 | * Lazy
81 | */
82 | override def flatMap[U](f : T=>Iterable[U]) (implicit m : Manifest[U]): DistributedIterable[U] = new ProjectedIterable[U](Util.itFlatMap(f));
83 | /**
84 | * Lazy
85 | */
86 | override def filter(f : T=>Boolean) : DistributedIterable[T] = new ProjectedIterable(Util.itFilter[T](f));
87 |
88 | // Begin protected definitions
89 | /**
90 |    * Loads the given path and returns an iterator that can read off objects. Defaults to SequenceFiles.
91 | */
92 | protected def loadIterator(p : Path): Iterator[T] = {
93 | val rdr = new SequenceFile.Reader(p.getFileSystem(h.conf),p,h.conf);
94 | val keyType = rdr.getKeyClass().asSubclass(classOf[Writable]);
95 | val valType = rdr.getValueClass().asSubclass(classOf[Writable]);
96 | Util.iteratorFromProducer {() =>
97 | val k = keyType.newInstance();
98 | val v = valType.newInstance();
99 | if(rdr.next(k,v)) {
100 | Some(wireToReal(v).asInstanceOf[T]);
101 | } else {
102 | rdr.close();
103 | None;
104 | }
105 | }
106 | }
107 |
108 | /**
109 | * Returns the InputFormat needed to read a file
110 | */
111 | protected implicit def inputFormatClass : Class[C] forSome{ type C <: InputFormat[_,_]} = {
112 | classOf[SequenceFileInputFormat[_,_]]
113 | }
114 |
115 | /**
116 | * Represents a transformation on the data.
117 | * Caches transform when "force" or "elements" is called.
118 | */
119 | private class ProjectedIterable[U](transform:Iterator[T]=>Iterator[U])(implicit mU: Manifest[U]) extends DistributedIterable[U] {
120 | def elements = force.elements;
121 |
122 | // TODO: better to slow down one machine than repeat unnecessary work on the cluster?
123 | // seems reasonable.
124 | def force(): DistributedIterable[U] = synchronized {
125 | cache match {
126 | case Some(output)=> (new PathIterable(h,output)(mU))
127 | case None =>
128 | val output = h.runMapReduce(paths,
129 | new TransformMapper(transform),
130 | new IdentityReduce[DefaultKey,U]());
131 | cache = Some(output);
132 | (new PathIterable(h,output)(mU))
133 | }
134 | }
135 |
136 |     /// So we don't repeat a computation unnecessarily
137 | private var _cache : Option[Array[Path]] = None;
138 |
139 | // must be synchronized
140 | private def cache = synchronized { _cache };
141 |     private def cache_=(c : Option[Array[Path]]) = synchronized { _cache = c; }
142 |
143 | override def map[V](f : U=>V)(implicit m: Manifest[V]): DistributedIterable[V] = cache match {
144 | case Some(path) => new PathIterable[U](h,path).map(f);
145 | case None => new ProjectedIterable[V](Util.andThen(transform, Util.itMap(f)));
146 | }
147 |
148 | override def flatMap[V](f : U=>Iterable[V])(implicit m: Manifest[V]) : DistributedIterable[V] = cache match {
149 | case Some(path) => new PathIterable[U](h,path).flatMap(f);
150 | case _ => new ProjectedIterable[V](Util.andThen(transform,Util.itFlatMap(f)));
151 | }
152 |
153 | override def filter(f : U=>Boolean) : DistributedIterable[U] = cache match {
154 | case Some(path) => new PathIterable[U](h,path).filter(f);
155 | case None => new ProjectedIterable[U](Util.andThen(transform,Util.itFilter(f)));
156 | }
157 |
158 | def distinct() = cache match {
159 | case Some(path) => new PathIterable[U](h,path).distinct();
160 | case None =>
161 | val output = h.runMapReduce(paths,
162 | new TransformValMapper[DefaultKey,T,U](transform),
163 | new KeyToValReduce[U,DefaultKey]);
164 | new PathIterable(h,output);
165 | }
166 |
167 | def reduce[B>:U](f: (B,B)=>B) : B = cache match {
168 | case Some(path) => new PathIterable[U](h,path).reduce(f);
169 | case None =>
170 |       implicit val b = mU.asInstanceOf[Manifest[B]];
171 | val output = h.runMapReduce(paths,
172 | new CollectorMapper(transform),
173 | new RealReduce(f),
174 | Set(ReduceCombine));
175 | val path = output(0);
176 |
177 | val result = new SequenceFile.Reader(path.getFileSystem(h.conf),path,h.conf);
178 | val v = result.getValueClass.asSubclass(classOf[Writable]).newInstance();
179 | val k = result.getKeyClass.asSubclass(classOf[Writable]).newInstance();
180 | result.next(k,v);
181 | result.close();
182 | wireToReal(v).asInstanceOf[B];
183 | }
184 | }
185 |
186 | }
187 |
188 |
189 |
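
A sketch of the laziness contract above: map and filter only compose an Iterator transform inside a ProjectedIterable, and a single MapReduce runs when force, elements, or reduce is finally called (h is a Hadoop instance as constructed in Hadoop.scala below):

    val nums = h.distribute(1 to 100, 4);  // writes 4 SequenceFile shards
    val result = nums.map(_ * 2)           // lazy: no job scheduled yet
                     .filter(_ % 3 == 0)   // still lazy: transforms compose
                     .reduce(_ + _);       // one MapReduce; one value read back
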
--------------------------------------------------------------------------------
/src/smr/actors/Worker.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2008, David Hall
3 | * All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | *
13 | * THIS SOFTWARE IS PROVIDED BY DAVID HALL ``AS IS'' AND ANY
14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
15 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
16 | * DISCLAIMED. IN NO EVENT SHALL DAVID HALL BE LIABLE FOR ANY
17 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
18 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
19 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
20 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
21 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
22 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23 | */
24 | package smr.actors;
25 | import scala.actors.Actor;
26 | import scala.actors.Actor._;
27 | import scala.actors.Exit;
28 | import scala.collection.mutable.ArrayBuffer;
29 | import scala.collection._;
30 | import TransActor._;
31 | import scala.actors.remote.RemoteActor._;
32 | import scala.actors.remote.Node;
33 |
34 | import Distributor._;
35 | import Priv._;
36 |
37 | class Worker(port : Int, sym : Symbol) extends Actor {
38 | import Worker._;
39 |
40 | def this() = this(Util.freePort,'worker);
41 |
42 | start();
43 |
44 | def act() {
45 | alive(port);
46 | register(sym,Actor.self);
47 |
48 | trapExit = true;
49 |
50 | val actual_worker = new RealWorker();
51 | val accumulators = mutable.Map[JobID,Accumulator]();
52 | def getAcc(id : JobID) ={
53 | accumulators.getOrElseUpdate(id,new Accumulator(id));
54 | }
55 |
56 | loop {
57 | react {
58 | case msg @ Do(in,f,out) =>
59 | //Debug.info(msg + "");
60 | val outAcc = getAcc(out);
61 | getAcc(in).forwardShardNums(outAcc);
62 | getAcc(in).addShardListener { case (shard,data) =>
63 | actual_worker.enqueue { x:Unit =>
64 | val outData = f(data);
65 | outAcc.completeShard(shard,outData);
66 | }
67 | }
68 | case InPlaceDo(in,f) =>
69 | getAcc(in).addShardListener { case (s,data) =>
70 | actual_worker enqueue { x:Unit =>
71 | f(data);
72 | }
73 | }
74 | case GetOutputActor(isLocal, out, shard, retr) =>
75 | def getOutputActor[U,V](retr : Iterator[U]=>V) {
76 | val actorIterator = new Util.ActorIterator[U];
77 | val a = Actor.actor {
78 | getAcc(out).completeShard(shard,retr(actorIterator));
79 | }
80 | getAcc(out).reserveShard(shard);
81 | val actor = transActor(port,Symbol(":output-" + out + "-"+shard)) {
82 | Actor.loop {
83 | Actor.react {
84 | case msg@ Some(x) => actorIterator.receiver ! msg;
85 | case None => actorIterator.receiver ! None; exit();
86 | }
87 | }
88 | }
89 | if(isLocal) {
90 | reply { (Some(actor),TransActorToSerializedActor(actor))}
91 | } else {
92 | reply { (None,TransActorToSerializedActor(actor))}
93 | }
94 | }
95 | getOutputActor(retr);
96 | case Done(id,s,r)=>getAcc(id).completeShard(s,r);
97 | case Reserve(id,shard) => getAcc(id).reserveShard(shard);
98 | case DoneAdding(id) => getAcc(id).doneReserving();
99 | case rtr @ Retrieve(id,f,out,a) =>
100 | val realActor = a match {
101 | case Right(a) => SerializedActorToActor(a);
102 | case Left(a) => a
103 | }
104 | //Debug.info(rtr + "");
105 | // Push it off to the accumulator, have it forward things to the job runner
106 | getAcc(id).addShardListener{ case (shard,data) =>
107 | actual_worker.enqueue { x :Unit =>
108 | realActor ! Retrieved(out,shard,f(data));
109 | }
110 | }
111 | case Close=>
112 | Debug.info("Worker " + self + " shutting down");
113 | actual_worker.close();
114 | accumulators.values.foreach(_.close());
115 | exit();
116 | case Remove(id) =>
117 | val a = accumulators.get(id);
118 | accumulators -= id;
119 | Debug.info("Worker " + self + " removing job " + id);
120 | a.foreach( _.close());
121 | case x =>
122 | Debug.error( "Wrong input to worker! " + x);
123 | }
124 | }
125 | }
126 |
127 | }
128 |
129 | object Worker {
130 | def apply() = new Worker();
131 | def apply(port : Int, sym : Symbol) = new Worker(port,sym);
132 |
133 | /*
134 | def setClassLoaderFromClass(c : Class[_]) {
135 | scala.actors.remote.RemoteActor.classLoader = classLoaderToUse
136 | classLoaderToUse = c.getClassLoader();
137 | }
138 |
139 | private var classLoaderToUse = this.getClass.getClassLoader();
140 | */
141 | private class Accumulator(id : JobID) {
142 | private case class Forward(out : Accumulator);
143 | private case class Add(shard : Int);
144 | private case class Retr(f : ((Int,Any))=>Unit);
145 |
146 | def forwardShardNums(out : Accumulator) = inner ! Forward(out);
147 | def completeShard(shard : Int, data : Any) = inner ! Done(id,shard,data);
148 | def addShardListener(f : ((Int,Any))=>Unit) = inner ! Retr(f);
149 | def reserveShard(shard : Int) = inner ! Add(shard);
150 | def doneReserving() = inner ! DoneAdding(0);
151 | def close() = inner ! Close
152 |
153 |     private val inner : Actor = actor {
154 | val active = mutable.Set[Int]();
155 | val done = mutable.Map[Int,Any]();
156 | val awaiting = new ArrayBuffer[((Int,Any))=>Unit]();
157 | val waitingForDoneReservation = new ArrayBuffer[Unit=>Unit]();
158 | var doneAdding = false;
159 | var shouldExit = false;
160 |
161 | def checkFinished() {
162 | if(doneAdding && active.size == 0) {
163 | awaiting.foreach{f => done.foreach(f)}
164 | awaiting.clear();
165 | if(shouldExit) exit();
166 | }
167 | }
168 |
169 | loop {
170 | react {
171 | case Add(s) =>
172 | if(doneAdding) Debug.warning("Got a late add");
173 | if(!done.contains(s)) active += s
174 | case Close =>
175 | if(awaiting.isEmpty) {
176 | done.clear();
177 | waitingForDoneReservation.clear();
178 | exit();
179 | }
180 | shouldExit = true;
181 | case Forward(out) =>
182 | val f = { x: Unit =>
183 | active.foreach { sh => out.reserveShard(sh)}
184 | done.keys.foreach { sh => out.reserveShard(sh)}
185 | out.doneReserving();
186 | }
187 | if(doneAdding) f();
188 | else waitingForDoneReservation += f;
189 |
190 | case Retr(f) =>
191 | if(doneAdding && active.size==0) {
192 | done.foreach(f);
193 | } else {
194 | awaiting += f;
195 | }
196 | case msg @ DoneAdding(dbg) =>
197 | doneAdding = true;
198 | waitingForDoneReservation foreach { f => f()}
199 | waitingForDoneReservation.clear();
200 | checkFinished();
201 |
202 | case Done(i,s,r) =>
203 | active -= s;
204 | done += (s->r);
205 | checkFinished();
206 | case x => Debug.error( "Wrong input to accumulator!" + x);
207 | }
208 | }
209 | }
210 | }
211 |
212 | private class RealWorker {
213 | private case class Enqueue(f : Unit=>Unit);
214 | def enqueue(f : Unit=>Unit) = inner ! Enqueue(f);
215 | def close() = inner ! Exit(Actor.self,'closed);
216 |
217 | private val inner = actor {
218 | loop {
219 | react {
220 | case Exit(_,_) => exit();
221 | case Enqueue(f) => try {
222 | f();
223 | } catch {
224 | case x =>
225 | // todo: better error reporting
226 | x.printStackTrace();
227 | }
228 | case x => Debug.error( "Wrong input to realWorker!" + x);
229 | }
230 | }
231 | }
232 | }
233 | }
234 |
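
A sketch of bringing a worker up by hand: the constructor calls start() itself, so constructing one is enough for it to register under the given Symbol and begin serving (hostname is a placeholder):

    val port = Util.freePort();
    val w = Worker(port, 'worker);  // alive(port) and register('worker, ...) happen in act()
    // A remote hub can now reach it with select(Node(hostname, port), 'worker)
    // and shut it down by sending Close.
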
--------------------------------------------------------------------------------
/src/smr/hadoop/Hadoop.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2008, David Hall
3 | * All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | *
13 | * THIS SOFTWARE IS PROVIDED BY DAVID HALL ``AS IS'' AND ANY
14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
15 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
16 | * DISCLAIMED. IN NO EVENT SHALL DAVID HALL BE LIABLE FOR ANY
17 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
18 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
19 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
20 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
21 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
22 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23 | */
24 | package smr.hadoop;
25 | import smr._;
26 |
27 | import java.io._;
28 |
29 | import org.apache.hadoop.io._;
30 | import org.apache.hadoop.conf._;
31 | import org.apache.hadoop.fs._;
32 | import org.apache.hadoop.util._;
33 | import org.apache.hadoop.mapred._;
34 | import org.apache.hadoop.filecache._;
35 |
36 | import scala.reflect.Manifest;
37 |
38 |
39 | /**
40 | * Supports Hadoop operations.
41 | * @see Hadoop$
42 | */
43 | class Hadoop(val conf : Configuration, userJar :String, private[hadoop] val dirGenerator : (String)=>Path) {
44 | // enable path conversions, and other goodies
45 | implicit private val cf = conf;
46 | import Implicits._;
47 | import Hadoop._;
48 |
49 | /**
50 | * Constructs a Hadoop instance with the given configuration and working directory (for files)
51 | */
52 | def this(conf : Configuration, userJar : String, workDir : Path) = this(conf,userJar, {(pref:String) =>
53 | new Path(workDir,pref);
54 | });
55 |
56 | private[smr] val cacheDir = dirGenerator("tmp/cache");
57 |
58 | conf.set("smr.cache.dir",cacheDir.toString);
59 | // cacheDir.mkdirs();
60 | if(!conf.getBoolean(CONFIG_KEEP_FILES,false))
61 | dirGenerator("tmp").deleteOnExit();
62 |
63 | def load[T](p : Array[Path])(implicit m : Manifest[T])= new PathIterable[T](this,p);
64 | def load[T](p : Path)(implicit m : Manifest[T]):PathIterable[T]= load[T](Array(p));
65 |
66 | def loadPairs[K,V](p : Path*)(implicit mK : Manifest[K], mV: Manifest[V]) = {
67 | new PathPairs[K,V](this,p.toArray);
68 | }
69 |
70 | def loadPairs[K,V](p : Seq[Path]) = new PathPairs(this,p.toArray);
71 | def loadLines(p : Path*) = new PathPairs[Long,String](this,p.toArray) with Lines;
72 |
73 | import Magic._;
74 |
75 | def distributePairs[K,V](ibl: Iterable[(K,V)], numShards : Int)(implicit mK:Manifest[K], mV:Manifest[V]) = {
76 | val paths = pathGenerator(numShards);
77 | val elems = ibl.elements.map{ case(k,v) => (realToWire(k),realToWire(v))}
78 |
79 | if(!elems.hasNext)
80 | throw new IllegalArgumentException("Empty iterable");
81 | val first = elems.next;
82 |
83 | val writers =
84 | for(p <- paths;
85 | fs = p.getFileSystem(conf);
86 | wrtr = new SequenceFile.Writer(fs,conf,p,first._1.getClass,first._2.getClass))
87 | yield wrtr;
88 | var i = 0;
89 | writers(i%numShards).append(first._1,first._2);
90 | while(elems.hasNext) {
91 | i+=1;
92 | val nxt = elems.next();
93 | writers(i%numShards).append(nxt._1,nxt._2);
94 | }
95 | writers.foreach{_.close()};
96 | loadPairs[K,V](paths).asInstanceOf[PathPairs[K,V]];
97 | }
98 |
99 | def distribute[T](ibl : Iterable[T], numShards :Int)(implicit m : Manifest[T]) :PathIterable[T] = {
100 | val paths = pathGenerator(numShards);
101 |
102 | val elems = ibl.elements.map(Magic.realToWire);
103 |
104 | if(!elems.hasNext)
105 | throw new IllegalArgumentException("Empty iterable");
106 | val first = elems.next;
107 |
108 | val writers =
109 | for(p <- paths;
110 | fs = p.getFileSystem(conf);
111 | wrtr = new SequenceFile.Writer(fs,conf,p,classOf[Hadoop.DefaultKeyWritable],first.getClass))
112 | yield wrtr;
113 | var i = 0;
114 | writers(i%numShards).append(Magic.realToWire(mkDefaultKey(first)),first);
115 | while(elems.hasNext) {
116 | i+=1;
117 | val nxt = elems.next();
118 | writers(i%numShards).append(Magic.realToWire(mkDefaultKey(nxt)),nxt);
119 | }
120 | writers.foreach{_.close()};
121 | load[T](paths);
122 | }
123 |
124 | private def serializeClass(jobConf : JobConf, name : String, c : AnyRef) = {
125 | implicit val jc = jobConf;
126 | val path = new Path(cacheDir,name);
127 | val stream = new ObjectOutputStream(path.getFileSystem(jc).create(path));
128 | stream.writeObject(c);
129 | stream.close();
130 | DistributedCache.addCacheFile(path.toUri,jobConf);
131 | path;
132 | }
133 |
134 | private[hadoop] def runMapReduce[K1,V1,K2,V2,K3,V3](paths : Array[Path],
135 | m: Mapper[K1,V1,K2,V2],
136 | r: Reduce[K2,V2,K3,V3])
137 | (implicit mk2:Manifest[K2], mv2:Manifest[V2],
138 | mk3:Manifest[K3], mv3:Manifest[V3],
139 | inputFormat : Class[T] forSome {type T <: InputFormat[_,_]}) : Array[Path]= {
140 | runMapReduce(paths,m,r,Set());
141 | }
142 |
143 | private[hadoop] def runMapReduce[K1,V1,K2,V2,K3,V3](paths : Array[Path],
144 | m: Mapper[K1,V1,K2,V2],
145 | r: Reduce[K2,V2,K3,V3],
146 | options : Set[Hadoop.Options])
147 | (implicit mk2:Manifest[K2], mv2:Manifest[V2],
148 | mk3:Manifest[K3], mv3:Manifest[V3],
149 | inputFormat : Class[T] forSome {type T <: InputFormat[_, _]}) = {
150 | implicit val jobConf = new JobConf(conf);
151 | jobConf.setJar(userJar);
152 | var outputOption : Option[Path] = None;
153 | options foreach {
154 | case ReduceCombine => jobConf.setCombinerClass(classOf[ReduceWrapper[_,_,_,_]]);
155 | case OutputDir(dir) => outputOption = Some(dirGenerator(dir));
156 | case x => throw new IllegalArgumentException("Illegal MapReduce Option: " + x);
157 | }
158 |
159 | val outputPath = outputOption.getOrElse(genDir);
160 | jobConf.setJobName("SMR-"+outputPath.getName);
161 | jobConf.setInputFormat(inputFormat);
162 | jobConf.setOutputFormat(classOf[SequenceFileOutputFormat[_,_]]);
163 |
164 | val mPath = serializeClass(jobConf,outputPath.getName+"-Map.ser",m);
165 | val rPath = serializeClass(jobConf,outputPath.getName+"-Reduce.ser",r);
166 | jobConf.set("smr.job.mapper.file",mPath.toString);
167 | jobConf.set("smr.job.reducer.file",rPath.toString);
168 |
169 | jobConf.setMapRunnerClass(classOf[ClosureMapper[_,_,_,_]]);
170 | jobConf.setReducerClass(classOf[ReduceWrapper[_,_,_,_]]);
171 | jobConf.setNumReduceTasks(conf.getInt("smr.reduce.tasks.default",paths.length));
172 |
173 |
174 | jobConf.setMapOutputKeyClass(Magic.classToWritableClass(mk2.erasure));
175 | jobConf.setMapOutputValueClass(Magic.classToWritableClass(mv2.erasure));
176 | jobConf.setOutputKeyClass(Magic.classToWritableClass(mk3.erasure));
177 | jobConf.setOutputValueClass(Magic.classToWritableClass(mv3.erasure));
178 |
179 | FileInputFormat.setInputPaths(jobConf, paths:_*);
180 | FileOutputFormat.setOutputPath(jobConf,outputPath);
181 |
182 | JobClient.runJob(jobConf);
183 |
184 | outputPath.listFiles();
185 | }
186 |
187 | private var jobNum = 0;
188 | protected def nextName = synchronized {
189 | jobNum+=1;
190 | "job"+jobNum;
191 | }
192 |
193 | private def genDir() = {
194 | dirGenerator("tmp/"+nextName);
195 | }
196 |
197 | private def pathGenerator(numShards : Int) = {
198 | val dir = genDir();
199 | // dir.mkdirs();
200 |
201 | Array.fromFunction { i =>
202 | new Path(dir,"part-"+i+"-of-"+numShards);
203 | } (numShards);
204 | }
205 | }
206 |
207 | object Hadoop {
208 | /**
209 | * Create a {@link Hadoop} instance from command line args and a working directory.
210 | */
211 | def apply(args : Array[String], userJar : String, workDir : Path) = fromArgs(args, userJar, workDir)._1;
212 |
213 | /**
214 | * Create a {@link Hadoop} instance from command line args and a working directory.
215 | * @return hadoop instance and remaining args
216 | */
217 | def fromArgs(args: Array[String], userJar : String, workDir : Path) = {
218 | var restArgs : Array[String] = null;
219 | var conf : Configuration = null;
220 | val tool = new Configured with Tool {
221 | @throws(classOf[Exception])
222 | def run(args : Array[String]) : Int = {
223 | restArgs = args;
224 | conf = getConf();
225 | 0;
226 | }
227 | }
228 | ToolRunner.run(tool,args);
229 |     (new Hadoop(conf,userJar, workDir),restArgs);
230 | }
231 |
232 |   private[hadoop] sealed abstract class Options;
233 | case object ReduceCombine extends Options;
234 | case class OutputDir(s : String) extends Options;
235 |
236 | private[hadoop] type DefaultKeyWritable = IntWritable;
237 | private[hadoop] type DefaultKey= Int;
238 |   private[hadoop] def mkDefaultKey() : DefaultKey = 0.asInstanceOf[DefaultKey];
239 | private[hadoop] def mkDefaultKey[V](v: V): DefaultKey = v.asInstanceOf[AnyRef].hashCode();
240 |
241 | val CONFIG_KEEP_FILES = "smr.files.keep";
242 |
243 | private def copyFile(inFile : Path, outFile : Path)(implicit conf : Configuration) {
244 | val fs = inFile.getFileSystem(conf);
245 | // Read from and write to new file
246 | val in = fs.open(inFile);
247 | val out = fs.create(outFile);
248 | val COPY_BUFFER_SIZE = 4096;
249 |     val buffer = new Array[Byte](COPY_BUFFER_SIZE);
250 | try {
251 | var bytesRead = in.read(buffer);
252 | while (bytesRead > 0) {
253 | out.write(buffer, 0, bytesRead);
254 | bytesRead = in.read(buffer);
255 | }
256 | } finally {
257 | in.close();
258 | out.close();
259 | }
260 | }
261 | }
262 |
263 |
264 |
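
A sketch of wiring Hadoop into a main method: fromArgs lets ToolRunner strip generic Hadoop flags before SMR sees the remaining arguments (the jar name and working directory are placeholders):

    import org.apache.hadoop.fs.Path;

    def main(args: Array[String]) {
      val (h, rest) = Hadoop.fromArgs(args, "smr-app.jar", new Path("/tmp/smr-work"));
      val data = h.distribute(1 to 1000, 8);
      println(data.reduce(_ + _));  // schedules one combining MapReduce
    }
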
--------------------------------------------------------------------------------
/src/smr/hadoop/PathPairs.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2008, David Hall
3 | * All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | *
13 | * THIS SOFTWARE IS PROVIDED BY DAVID HALL ``AS IS'' AND ANY
14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
15 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
16 | * DISCLAIMED. IN NO EVENT SHALL DAVID HALL BE LIABLE FOR ANY
17 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
18 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
19 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
20 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
21 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
22 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23 | */
24 | package smr.hadoop;
25 | import smr._;
26 | import org.apache.hadoop.io._;
27 | import org.apache.hadoop.conf._;
28 | import org.apache.hadoop.fs._;
29 | import org.apache.hadoop.util._;
30 | import org.apache.hadoop.mapred._;
31 | import scala.reflect.Manifest;
32 |
33 | import Magic._;
34 | import Hadoop._;
35 |
36 | abstract class AbstractPairs[K,V](val h: Hadoop)(implicit mK: Manifest[K], mV:Manifest[V]) extends DistributedPairs[K,V] with FileFormat[K,V] { self =>
37 | protected[hadoop] def paths : Array[Path];
38 |
39 | def elements = {
40 | if(paths.length == 0)
41 | new Iterator[(K,V)] {
42 | def hasNext = false;
43 |         def next = throw new NoSuchElementException("No elements were found!")
44 | }
45 | else paths.map(loadIterator).reduceLeft(_++_);
46 | }
47 |
48 | def force = new PathPairs[K,V](h,paths);
49 |
50 | /**
51 | * Models MapReduce/Hadoop-style reduce more exactly.
52 | */
53 | def flatReduce[K2,V2](f : (K,Iterator[V])=>Iterator[(K2,V2)])(implicit m : Manifest[K2], mU:Manifest[V2]): DistributedPairs[K2,V2] = {
54 | new MapReducePairs(h, self.paths, new PairTransformMapper(identity[Iterator[(K,V)]]), new FlatReduce(f), inputFormatClass);
55 | }
56 |
57 | /**
58 | * Models MapReduce/Hadoop-style reduce more exactly.
59 | */
60 | def reduce[K2,V2](f: (K,Iterator[V])=>(K2,V2))(implicit mL: Manifest[K2], mW:Manifest[V2]): DistributedPairs[K2,V2] = {
61 | new MapReducePairs(h, self.paths, new PairTransformMapper(identity[Iterator[(K,V)]]), new PairReduce(f), inputFormatClass);
62 | }
63 |
64 | /**
65 | * Lazy
66 | */
67 | override def map[K2,V2](f : ((K,V))=>(K2,V2))(implicit mJ : Manifest[K2], mU : Manifest[V2]): DistributedPairs[K2,V2] = {
68 | new ProjectedPairs[K,V,K2,V2](this,Util.itMap(f));
69 | }
70 |
71 | /**
72 | * Lazy
73 | */
74 | override def flatMap[K2,V2](f : ((K,V))=>Iterable[(K2,V2)])(implicit mJ : Manifest[K2], mU : Manifest[V2]): DistributedPairs[K2,V2] = {
75 | new ProjectedPairs[K,V,K2,V2](this,Util.itFlatMap(f));
76 | }
77 |
78 | /**
79 | * Lazy
80 | */
81 | override def filter(f : ((K,V))=>Boolean) : DistributedPairs[K,V] = new ProjectedPairs[K,V,K,V](this,Util.itFilter[(K,V)](f));
82 |
83 | /**
84 | * Lazy
85 | */
86 | override def mapFirst[K2](f : K=>K2)(implicit mJ: Manifest[K2]) : DistributedPairs[K2,V] = {
87 | new ProjectedPairs[K,V,K2,V](this,Util.itMap { case (k,v) => (f(k),v)});
88 | }
89 |
90 | /**
91 | * Lazy
92 | */
93 | override def mapSecond[V2](f : V=>V2)(implicit mJ: Manifest[V2]) : DistributedPairs[K,V2] = {
94 | new ProjectedPairs[K,V,K,V2](this,Util.itMap{ case (k,v) => (k,f(v))});
95 | }
96 |
97 | // Begin protected definitions
98 | /**
99 |    * Loads the given path and returns an iterator that can read off objects. Defaults to SequenceFiles.
100 | */
101 | override protected[hadoop] def loadIterator(p : Path): Iterator[(K,V)] = {
102 | val rdr = new SequenceFile.Reader(p.getFileSystem(h.conf),p,h.conf);
103 | val keyType = rdr.getKeyClass().asSubclass(classOf[Writable]);
104 | val valType = rdr.getValueClass().asSubclass(classOf[Writable]);
105 | Util.iteratorFromProducer {() =>
106 | val k = keyType.newInstance();
107 | val v = valType.newInstance();
108 | if(rdr.next(k,v)) {
109 | Some((wireToReal(k).asInstanceOf[K],wireToReal(v).asInstanceOf[V]));
110 | } else {
111 | rdr.close();
112 | None;
113 | }
114 | }
115 | }
116 |
117 | /**
118 | * Returns the InputFormat needed to read a file
119 | */
120 | override protected[hadoop] implicit def inputFormatClass : Class[T] forSome{ type T <: InputFormat[_,_]} = {
121 | classOf[SequenceFileInputFormat[_,_]].asInstanceOf[Class[InputFormat[_,_]]];
122 | }
123 |
124 | /**
125 | * Joins two PathPairs together.
126 | */
127 | def ++[SK>:K,SV>:V](other : DistributedPairs[SK,SV])(implicit mSK:Manifest[SK], mSV:Manifest[SV]) = other match {
128 | case aOther : AbstractPairs[_,_] => new AbstractPairs[SK,SV](h) {
129 | protected[hadoop] override def paths = self.paths ++ aOther.paths;
130 | protected[hadoop] override implicit val inputFormatClass : Class[T] forSome{ type T <: InputFormat[_,_]} = {
131 | self.inputFormatClass;
132 | }
133 |
134 | def asStage(name:String) : PathPairs[SK,SV] = new PathPairs[SK,SV](h,paths).asStage(name);
135 | }
136 | case _ => throw new IllegalArgumentException("++ only valid for PathPairs and cousins");
137 | }
138 |
139 |
140 | }
141 |
142 | /**
143 |  * Represents pairs that will be mapped and reduced: a complete MapReduce cycle.
144 | */
145 | // TODO: tighter integration between paths and asStage
146 | private class MapReducePairs[K1,V1,K2,V2,K3,V3](h : Hadoop,
147 | input: =>Array[Path],
148 | m : Mapper[K1,V1,K2,V2],
149 | r : Reduce[K2,V2,K3,V3],
150 | val inputFormat : Class[T] forSome {type T <: InputFormat[_,_]})
151 | (implicit mk1 : Manifest[K1], mk2 : Manifest[K2], mk3:Manifest[K3],
152 | mv1:Manifest[V1], mv2:Manifest[V2], mv3 : Manifest[V3]) extends AbstractPairs[K3,V3](h) {
153 |
154 | import Implicits._;
155 | private implicit val conf = h.conf;
156 |
157 | // a little ugly.
158 | private var pathsRun = false;
159 | override lazy val paths = {
160 | synchronized {pathsRun = true; }
161 | h.runMapReduce(input, m,r);
162 | }
163 |
164 | override def asStage(dir : String) : DistributedPairs[K3,V3] = {
165 | val outDir = h.dirGenerator(dir);
166 | if(outDir.exists) {
167 | new PathPairs(h,outDir.listFiles);
168 | } else synchronized {
169 | if(pathsRun) {
170 | new PathPairs[K3,V3](h,paths).asStage(dir);
171 | } else {
172 | val outFiles = h.runMapReduce(input, m,r, Set(OutputDir(dir)));
173 | (new PathPairs[K3,V3](h,outFiles))
174 | }
175 | }
176 | }
177 |
178 | override implicit def inputFormatClass : Class[_ <: InputFormat[_,_]] = inputFormat;
179 | }
180 |
181 | /**
182 | * Represents a set of Paths on disk.
183 | */
184 | class PathPairs[K,V](h: Hadoop, val paths : Array[Path], keepFiles :Boolean)(implicit mK: Manifest[K], mV:Manifest[V]) extends AbstractPairs[K,V](h) {
185 | import Implicits._;
186 |
187 | def this(h: Hadoop, paths: Array[Path])(implicit mk:Manifest[K], mv:Manifest[V]) = this(h,paths,true);
188 |
189 | implicit val conf = h.conf;
190 |
191 | /**
192 |    * Moves the files represented by the PathPairs into the stage directory.
193 | */
194 | def asStage(output: String) = {
195 | val outputDir = h.dirGenerator(output);
196 | outputDir.mkdirs();
197 | val outPaths = for(p <- paths) yield new Path(outputDir,p.getName);
198 | for( (src,dst) <- paths.zip(outPaths)) {
199 | src.moveTo(dst);
200 | }
201 | new PathPairs[K,V](h,outPaths);
202 | }
203 | }
204 |
205 | /**
206 | * Used to override the default behavior of Lines
207 | */
208 | trait FileFormat[K,V] {
209 | protected[hadoop] def loadIterator(p: Path): Iterator[(K,V)]
210 | protected[hadoop] def inputFormatClass : Class[T] forSome { type T <: InputFormat[_,_]}
211 | }
212 |
213 | /**
214 | * Used with PathPairs, reads files line by line. Key is the offset in bytes
215 | */
216 | trait Lines extends FileFormat[Long,String]{ this : PathPairs[Long,String] =>
217 | import Implicits._;
218 | override protected[hadoop] def loadIterator(p: Path) = {
219 | implicit val conf = h.conf;
220 |
221 | val rdr = new LineRecordReader(p.getFileSystem(h.conf).open(p),0,p.length);
222 | val k = new LongWritable;
223 | val v = new Text;
224 | Util.iteratorFromProducer { () =>
225 | if(rdr.next(k,v)) {
226 | Some((k.get,v.toString));
227 | } else {
228 | rdr.close;
229 | None;
230 | }
231 | }
232 | }
233 |
234 | override protected[hadoop] def inputFormatClass = {
235 | classOf[TextInputFormat].asInstanceOf[Class[InputFormat[_,_]]];
236 | }
237 | }
238 |
239 | /**
240 | * Represents a transformation on the data.
241 | * Caches transform when "force" or "elements" is called.
242 | */
243 | class ProjectedPairs[K,V,K2,V2](parent : AbstractPairs[K,V], transform:Iterator[(K,V)]=>Iterator[(K2,V2)])(implicit mK:Manifest[K], mV:Manifest[V], mJ:Manifest[K2], mU: Manifest[V2]) extends AbstractPairs[K2,V2](parent.h) {
244 | import Implicits._;
245 | override def elements = force.elements;
246 |
247 | override protected[hadoop] def paths = force.paths;
248 |
249 | // TODO: better to slow down one machine than repeat unnecessary work on the cluster?
250 | // seems reasonable.
251 | override def force() : PathPairs[K2,V2] = synchronized {
252 | cache match {
253 | case Some(output)=> (new PathPairs(h,output))
254 | case None =>
255 | val output = h.runMapReduce(parent.paths,
256 | new PairTransformMapper(transform),
257 | new IdentityReduce[K2,V2]());
258 | cache = Some(output);
259 | (new PathPairs(h,output))
260 | }
261 | }
262 |
263 | def asStage(output : String):DistributedPairs[K2,V2] = {
264 | implicit val conf = h.conf;
265 | val outDir = h.dirGenerator(output);
266 | if(outDir.exists) {
267 | cache = Some(outDir.listFiles);
268 | this;
269 | } else synchronized {
270 | cache match {
271 | case Some(o)=> new PathPairs[K2,V2](h,o).asStage(output);
272 | case None=>
273 | val outFiles = h.runMapReduce(parent.paths,
274 | new PairTransformMapper(transform),
275 | new IdentityReduce[K2,V2](),
276 | Set(OutputDir(output)));
277 | synthetic = false;
278 | cache = Some(outFiles);
279 | (new PathPairs[K2,V2](h,outFiles))
280 | }
281 | }
282 | }
283 |
284 |   /// So we don't repeat a computation unnecessarily
285 | private var _cache : Option[Array[Path]] = None;
286 |
287 | private var synthetic = true;
288 |
289 | // must be synchronized
290 | private def cache = synchronized { _cache };
291 |   private def cache_=(c : Option[Array[Path]]) = synchronized {
292 |     _cache = c;
293 |   }
299 |
300 | implicit val conf = h.conf;
301 |
302 | override def map[K3,V3](f : ((K2,V2))=>(K3,V3))(implicit mL: Manifest[K3], mW: Manifest[V3]): DistributedPairs[K3,V3] = cache match {
303 | case Some(path) => new PathPairs[K2,V2](h,path).map(f);
304 | case None => new ProjectedPairs[K,V,K3,V3](parent,Util.andThen(transform, Util.itMap(f)));
305 | }
306 |
307 | override def flatMap[K3,V3](f : ((K2,V2))=>Iterable[(K3,V3)])(implicit mL: Manifest[K3], mW: Manifest[V3]) : DistributedPairs[K3,V3] = cache match {
308 | case Some(path) => new PathPairs[K2,V2](h,path).flatMap(f);
309 | case _ => new ProjectedPairs[K,V,K3,V3](parent,Util.andThen(transform,Util.itFlatMap(f)));
310 | }
311 |
312 | override def filter(f : ((K2,V2))=>Boolean) : DistributedPairs[K2,V2] = cache match {
313 | case Some(path) => new PathPairs[K2,V2](h,path).filter(f);
314 | case None => new ProjectedPairs[K,V,K2,V2](parent,Util.andThen(transform,Util.itFilter(f)));
315 | }
316 |
317 | /**
318 | * Lazy
319 | */
320 | override def mapFirst[K3](f : K2=>K3)(implicit mL: Manifest[K3]) : DistributedPairs[K3,V2] = {
321 | new ProjectedPairs(parent,Util.andThen(transform,Util.itMap[(K2,V2),(K3,V2)]{ case (k,v) => (f(k),v)}));
322 | }
323 |
324 | /**
325 | * Lazy
326 | */
327 | override def mapSecond[V3](f : V2=>V3)(implicit mW: Manifest[V3]) : DistributedPairs[K2,V3] = {
328 | new ProjectedPairs(parent,Util.andThen(transform,Util.itMap[(K2,V2),(K2,V3)]{ case (k,v) => (k,f(v))}));
329 | }
330 |
331 | /**
332 | * Models MapReduce/Hadoop-style reduce more exactly.
333 | */
334 | override def flatReduce[K3,V3](f : (K2,Iterator[V2])=>Iterator[(K3,V3)])(implicit mK3 : Manifest[K3], mV3:Manifest[V3]): DistributedPairs[K3,V3] = {
335 | new MapReducePairs(h, parent.paths, new PairTransformMapper(transform), new FlatReduce(f), inputFormatClass);
336 | }
337 |
338 | /**
339 | * Models MapReduce/Hadoop-style reduce more exactly.
340 | */
341 | override def reduce[K3,V3](f: (K2,Iterator[V2])=>(K3,V3))(implicit mL: Manifest[K3], mW:Manifest[V3]): DistributedPairs[K3,V3] = {
342 | new MapReducePairs(h, parent.paths, new PairTransformMapper(transform), new PairReduce(f), inputFormatClass);
343 | }
344 |
345 | override protected[hadoop] implicit def inputFormatClass : Class[T] forSome{ type T <: InputFormat[_,_]} = {
346 | parent.inputFormatClass;
347 | }
348 | }
349 |
350 |
351 |
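
A word-count sketch over loadLines, using the Hadoop-style reduce above in which each key arrives with an iterator over its values (h and the input path are placeholders):

    val lines = h.loadLines(new Path("input.txt"));  // (byte offset, line) pairs
    val counts = lines
      .flatMap { case (off, line) => line.split(" ").map(w => (w, 1)) }
      .reduce { (word: String, ones: Iterator[Int]) => (word, ones.foldLeft(0)(_ + _)) };
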
--------------------------------------------------------------------------------
/src/smr/actors/Distributor.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2008, David Hall
3 | * All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | *
13 | * THIS SOFTWARE IS PROVIDED BY DAVID HALL ``AS IS'' AND ANY
14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
15 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
16 | * DISCLAIMED. IN NO EVENT SHALL DAVID HALL BE LIABLE FOR ANY
17 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
18 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
19 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
20 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
21 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
22 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23 | */
24 | package smr.actors;
25 | import scala.actors.Actor;
26 | import scala.actors.OutputChannel;
27 | import scala.actors.Exit;
28 | import scala.actors.Actor._;
29 | import scala.collection.mutable.ArrayBuffer;
30 | import scala.collection._;
31 | import scala.actors.remote.RemoteActor._;
32 | import scala.actors.remote._;
33 | import scala.reflect.Manifest;
34 | import TransActor._;
35 |
36 | object Distributor {
37 | // Every job needs an id:
38 | type JobID = Int;
39 | }
40 | import Distributor._;
41 |
42 | /**
43 | * Trait for defining objects that can automatically distribute tasks to perform on iterables.
44 | * @author dlwh
45 | */
46 | trait Distributor {
47 | /**
48 | * Generates a DistributedIterable based on the sharding function. Shards from the list are
49 | * automatically divvied out to the workers.
50 | */
51 | def distribute[T] (it : Iterable[T])(implicit shard : (Iterable[T],Int)=>List[Iterable[T]]) : DistributedIterable[T];
52 |
53 | /**
54 | * Low level operation: should generally not be used, but made public for completeness.
55 | * Given a U, automatically shard it out using the shard function to workers.
56 | * @return a handle to the shards
57 | */
58 | def shard[U,V](it : U)(implicit myShard : (U,Int)=>List[V]) : JobID;
59 |
60 | /**
61 | * Low level operation: should generally not be used, but made public for completeness.
62 | * Convert all sharded elements to U's. More or less a map operation.
63 | * @return a handle to the changed shards.
64 | */
65 | def schedule[T,U](id : JobID, f: T=>U) : JobID;
66 |
67 | /**
68 | * Low level operation: should generally not be used, but made public for completeness.
69 | * Retrieve all shards, first applying f to each one. Each shard is sent to the Actor as Some(t);
70 | * once every shard has been delivered, None is sent.
71 | * Results arrive asynchronously via the gather actor; nothing is returned.
72 | */
73 | def gather[T,U](job : JobID, f: T=>U, gather : Actor) :Unit;
74 |
75 | /**
76 | * Takes input shards of type T, rehashes their elements into a new set of shards of U,
77 | * and processes each new shard with `received`. This can support a Google-style MapReduce.
78 | * @return a handle to the new shards.
79 | */
80 | def groupBy[T,U,V](job : JobID, f: (T,((Int,U)=>Unit))=>Unit, received: Iterator[U]=>V): JobID;
81 |
82 | /**
83 | * Low level operation: should generally not be used, but made public for completeness.
84 | * Delete all shards with this id.
85 | * The deletion is asynchronous.
86 | */
87 | def remove(job : JobID) : Unit;
88 |
89 | /**
90 | * Close the distributor and all workers.
91 | */
92 | def close() {}
93 |
94 | }
95 |
96 | private object Priv {
97 |
98 | // Messages to the scheduler from the distributor
99 | sealed abstract class SchedMsg; // abstract: only the case classes below are instantiated
100 | case class Shard[U,V](it : U, shard : (U,Int)=>List[V]) extends SchedMsg;
101 | case class GroupBy[T,U,V](job: JobID, f: (T,((Int,U)=>Unit))=>Unit, received: Iterator[U]=>V) extends SchedMsg;
102 | case class Sched(in : JobID, f : Any=>Any) extends SchedMsg;
103 | case class Get[T,U](job : JobID, f : T => U, gather : Actor) extends SchedMsg;
104 | case class Remove[U](job : JobID) extends SchedMsg;
105 | case class AddWorker[U](a :OutputChannel[Any]) extends SchedMsg;
106 |
107 | sealed abstract class WorkerMsg;
108 | case class Do(id : JobID, f : Any=>Any, out : JobID) extends WorkerMsg;
109 | case class InPlaceDo(id : JobID, f : Any=>Unit) extends WorkerMsg;
110 | case class Retrieve[T,U](in : JobID, f : Any=>Any, out : JobID, actor : Either[Actor,SerializedActor]) extends WorkerMsg;
111 | case class GetOutputActor[U,V](isLocal : Boolean, out : JobID, shard : Int, process : Iterator[U]=>V) extends WorkerMsg;
112 | case class DoneAdding(id : JobID) extends WorkerMsg;
113 | case class Reserve(id : JobID, shard : Int) extends WorkerMsg;
114 | case class Done[U](id : JobID, shard : Int, result : U) extends WorkerMsg;
115 | case object Close extends WorkerMsg;
116 |
117 | case class StartGet(out: JobID, numShards : Int, gather : Actor);
118 | case class Retrieved(out : JobID, shard : Int, result : Any);
119 | }
120 | import Priv._;
121 |
122 | object Debug extends scala.actors.Debug("smr:: ") {
123 | level = 4;
124 | }
125 |
126 | /**
127 | * Class most users will use. Example use:
128 | *
129 | * val dist = new ActorDistributor(4,4000);
130 | * dist.distribute(myIterable).map(f).reduce(g);
131 | *
132 | */
133 | class ActorDistributor(numWorkers : Int, port : Int) extends Distributor {
134 | override def distribute[T] (it : Iterable[T])
135 | (implicit myShard : (Iterable[T],Int)=>List[Iterable[T]]) : DistributedIterable[T] = new InternalIterable[T] {
136 | protected lazy val id : JobID = shard(it)(myShard);
137 | protected lazy val scheduler = ActorDistributor.this;
138 | };
139 |
140 | // pushes data onto the grid
141 | def shard[U,V](it : U)(implicit myShard : (U,Int)=>List[V]) = (scheduler !? Shard(it,myShard)).asInstanceOf[JobID];
142 | // runs a task on some data on the grid
143 | def schedule[T,U](id : JobID, f: T=>U) = (scheduler !? Sched(id,f.asInstanceOf[Any=>Any])).asInstanceOf[JobID];
144 | // gets it back using some function. Returns immediately. expect output from gather
145 | def gather[T,U](job : JobID, f: T=>U, gather : Actor) :Unit = (scheduler ! Get(job,f,gather));
146 | // gets rid of it:
147 | def remove(job : JobID) : Unit = (scheduler ! Remove(job));
148 |
149 | def groupBy[T,U,V](job : JobID, f: (T,((Int,U)=>Unit))=>Unit, received: Iterator[U]=>V) =
150 | (scheduler !? GroupBy(job,f,received)).asInstanceOf[JobID];
151 | /**
152 | * Adds a (possibly remote) Worker to the workers list.
153 | */
154 | def addWorker(w :OutputChannel[Any]) : Unit = (scheduler ! AddWorker(w));
155 |
156 | override def close() = {
157 | scheduler ! Exit(self,'close);
158 | workers.foreach(_._2 ! Close);
159 | }
160 |
161 | private val gatherer = actor {
162 | val gatherers = mutable.Map[JobID,Actor]();
163 | val shardsLeft = mutable.Map[JobID,Int]();
164 | loop {
165 | react {
166 | case StartGet(out, numShards, gather) =>
167 | gatherers(out) = gather;
168 | shardsLeft(out) = numShards;
169 | reply{ None}
170 | case Retrieved(out, shard, result)=>
171 | gatherers(out) ! Some((shard,result));
172 | shardsLeft(out) -= 1;
173 | if(shardsLeft(out) == 0) {
174 | gatherers(out) ! None;
175 | shardsLeft -= out;
176 | gatherers -= out;
177 | }
178 | }
179 | }
180 | }
181 |
182 | // The accumulator is a remote actor that just acts as a middleman for the gatherer.
183 | // Otherwise, potentially large amounts of data would get serialized into the gather closure for no reason.
184 | private val remoteAccumulator = transActor(port,'accumulator) {
185 | loop {
186 | react {
187 | case x => gatherer ! x
188 | }
189 | }
190 | }
191 |
192 | private val localAccumulator = actor {
193 | loop {
194 | react {
195 | case x => gatherer ! x
196 | }
197 | }
198 | }
199 |
200 | // central dispatcher for ActorDistributor
201 | private val scheduler = actor {
202 | val numShards = mutable.Map[JobID,Int]();
203 | var nextJob : JobID =0
204 | def getNextJob() = {
205 | val job = nextJob;
206 | nextJob +=1;
207 | job;
208 | }
209 | loop {
210 | react {
211 | case scala.actors.Exit(_,_) => exit();
212 | case Shard(it,shard)=>
213 | val job = getNextJob();
214 | val shards = shard(it,workers.length)
215 | numShards += (job -> shards.length);
216 | shards.zipWithIndex.foreach {
217 | x =>
218 | Debug.info( "sending shard " + x._2 + " to Worker " + x._2 %workers.length);
219 | workers(x._2 % workers.length)._2 ! Done(job,x._2,x._1)
220 | }
221 | workers.foreach { _._2 ! DoneAdding(job) }
222 | reply { job }
223 | case Sched(in,f)=>
224 | val job = getNextJob();
225 | val oldNumShards = numShards(in);
226 | numShards += (job->oldNumShards);
227 | Debug.info( "Running " + f.getClass.getName() + " on job " + in + "'s output as job " + job);
228 | workers.foreach { a =>
229 | a._2 ! Do(in, f, job)
230 | }
231 | reply { job }
232 | case GroupBy(in, f, r) =>
233 | // inner def gives the compiler named type parameters to infer against
234 | def handleGroupBy[T,U,V](in : JobID, f : ( (T,(Int,U)=>Unit)=>Unit), r : Iterator[U]=>V) {
235 | val out = getNextJob();
236 | val oldNumShards = numShards(in);
237 | numShards += (out->oldNumShards);
238 | // set up forwarding actors for the hashed outputs
239 | val outActors = getOutActors(out, oldNumShards, r);
240 | for( (isLocal,w) <- workers) {
241 | w ! DoneAdding(out);
242 | }
243 |
244 | val localActors = outActors.map(getLocalActors);
245 | def localOut(x : Any) {
246 | def output(idx : Int, u : U) {
247 | localActors(idx%localActors.length) ! Some(u);
248 | }
249 | f(x.asInstanceOf[T],output);
250 | localAccumulator ! Retrieved(out,1,None);
251 | }
252 |
253 | val remoteActors = outActors.map(getRemoteActors);
254 | def remoteOut(x :Any) {
255 | def output(idx : Int, u : U) {
256 | remoteActors(idx%remoteActors.length) ! Some(u);
257 | }
258 | f(x.asInstanceOf[T],output);
259 | remoteAccumulator ! Retrieved(out,1,None);
260 | }
261 | val rendezvous = actor {
262 | loop {
263 | react {
264 | case None =>
265 | localActors.foreach{ _ ! None};
266 | case _ => // don't care about the results, just want to know when we're done.
267 | }
268 | }
269 | }
270 | gatherer ! StartGet(out, oldNumShards, rendezvous);
271 | for( (isLocal,w) <- workers) {
272 | w ! InPlaceDo(in,if(!isLocal) remoteOut else localOut);
273 | }
274 | reply {out};
275 | }
276 | handleGroupBy(in,f,r);
277 | case Get(in,f,gather)=>
278 | val out = getNextJob();
279 | Debug.info( "Getting job " + in + " with function " + f.getClass.getName() + " as job id " + out);
280 | gatherer !? StartGet(out,numShards(in),gather);
281 | workers.foreach{ a => a._2 ! Retrieve(in,f.asInstanceOf[Any=>Any],out,if(a._1) Left(localAccumulator) else Right(remoteAccumulator))}
282 | case AddWorker(a)=>
283 | Debug.info("Added a worker.");
284 | workers += new Tuple2(false,a); // TODO:improve
285 |
286 | case Remove(id) =>
287 | Debug.info("Master removing job " + id);
288 | workers.foreach{ _._2 ! Remove(id)};
289 | numShards -= id;
290 | }
291 | }
292 | }
293 | private def getOutActors[U,V](out : JobID, numShards : Int, r : Iterator[U]=>V) = {
294 | for(i <- 0 until numShards;
295 | (isLocal,w) = workers(i % workers.length)) { // round-robin over workers; i % numShards would overrun when numShards > workers.length
296 | w ! GetOutputActor(isLocal, out, i, r);
297 | }
298 | val buff = new ArrayBuffer[(Option[Actor],SerializedActor)];
299 | for( i <- 1 to numShards) {
300 | buff += (Actor.?).asInstanceOf[(Option[Actor],SerializedActor)];
301 | }
302 | buff.toSeq;
303 | }
304 |
305 | private def getLocalActors(a : (Option[Actor],SerializedActor)):OutputChannel[Any] = a._1 match {
306 | case Some(local) => local;
307 | case None => a._2
308 | }
309 | private def getRemoteActors(a : (Option[Actor],SerializedActor)): OutputChannel[Any] = a._2;
310 | // the Boolean marks a local worker, whose messages need not be serialized
311 | private val workers = new ArrayBuffer[(Boolean,OutputChannel[Any])];
312 | for (i <- 0 until numWorkers)
313 | workers += new Tuple2(true,Worker());
314 | }
315 |
316 | private[smr] trait InternalIterable[T] extends DistributedIterable[T] {
317 | protected val id : JobID;
318 | protected val scheduler : Distributor;
319 | import InternalIterable._;
320 |
321 | def elements = {
322 | val list : List[(Int,Iterable[T])] = handleGather(this,Util.identity[Iterable[T]]).toList;
323 | list.sort(_._1 < _._1).map(_._2.projection).reduceLeft(_ append _).elements
324 | }
325 |
326 | def map[U](f : T=>U)(implicit mU : Manifest[U]) : DistributedIterable[U] = handleMap(this,f);
327 | def flatMap[U](f : T=>Iterable[U]) (implicit mU : Manifest[U]): DistributedIterable[U] = handleFlatMap(this,f);
328 | def filter(f : T=>Boolean) : DistributedIterable[T] = handleFilter(this,f);
329 | def reduce[B >: T](f : (B,B)=>B) : B = handleReduce(this,f)
330 | override def mapReduce[U,B >: U](m : T=>U)(r : (B,B)=>B)(implicit mU:Manifest[U]) = {
331 | handleMapReduce(this,m,r);
332 | }
333 | def groupBy[U](group: T=>U):DistributedIterable[(U,Seq[T])] = handleGroupBy(this,group);
334 | def distinct() = handleDistinct(this);
335 | def force = this;
336 |
337 | override protected def finalize() {
338 | try {
339 | scheduler.remove(id);
340 | } finally {
341 | super.finalize();
342 | }
343 | }
344 | }
345 |
346 | /**
347 | * This object exists only because Scala closures capture the enclosing `this` pointer even
348 | * when they use no instance state; methods on an object avoid serializing the whole iterable.
349 | */
350 | private[smr] object InternalIterable {
351 | private def handleGather[T,C,U](self : InternalIterable[T], f : SerFunction1[C,U]) = {
352 | val recv = actor {
353 | val b = new ArrayBuffer[(Int,U)];
354 | react {
355 | case 'start =>
356 | val replyTo = Actor.sender;
357 | loop {
358 | react{
359 | case Some(x) =>
360 | b += x.asInstanceOf[(Int,U)];
361 | Debug.info("Got shard " + x.asInstanceOf[(Int,U)]._1);
362 | case None => replyTo ! b ; exit();
363 | }
364 | }
365 | }
366 | }
367 | self.scheduler.gather(self.id, f, recv);
368 | (recv !? 'start).asInstanceOf[ArrayBuffer[(Int,U)]];
369 | }
370 |
371 | private def handleMap[T,U](self : InternalIterable[T], f : T=>U) = {
372 | new InternalIterable[U] {
373 | protected val scheduler = self.scheduler;
374 | Debug.info("Map with " + f.getClass.getName);
375 | protected val id = scheduler.schedule(self.id,Util.fMap(f));
376 | }
377 | }
378 | private def handleFlatMap[T,U](self : InternalIterable[T], f : T=>Iterable[U]) = {
379 | new InternalIterable[U] {
380 | protected val scheduler = self.scheduler;
381 | protected val id = scheduler.schedule(self.id,Util.fFlatMap(f));
382 | }
383 | }
384 | private def handleFilter[T](self : InternalIterable[T], f : T=>Boolean) = {
385 | new InternalIterable[T] {
386 | protected val scheduler = self.scheduler;
387 | protected val id = scheduler.schedule(self.id,Util.fFilter(f));
388 | }
389 | }
390 |
391 | private def handleReduce[T,B>:T](self : InternalIterable[T], f : (B,B)=>B) = {
392 | val b = handleGather[T,Iterable[T],Option[B]](self,new SerFunction1[Iterable[T],Option[B]]{
393 | def apply(x : Iterable[T])= if (x.isEmpty) None else Some(x.reduceLeft(f));
394 | });
395 | b.filter(_._2.isDefined).map{ (x : (Int,Option[B])) => x._2.get}.reduceLeft(f); // drop empty shards before the final fold
396 | }
397 |
398 | private def handleMapReduce[T,U,B>:U](self :InternalIterable[T], m : T=>U, r : (B,B)=>B) = {
399 | Debug.info("MapReduce with " + m.getClass.getName + " and reduce " + r.getClass.getName);
400 |
401 | val doMapReduce = new SerFunction1[Iterable[T],Option[B]] {
402 | def apply(x : Iterable[T]) = {
403 | if (x.isEmpty) None
404 | else {
405 | val elems = x.elements;
406 | var acc : B = m(elems.next);
407 | while(elems.hasNext) acc = r(acc,m(elems.next));
408 | Some(acc);
409 | }
410 | }
411 | }
412 | val b = handleGather[T,Iterable[T],Option[B]](self,doMapReduce);
413 | b.filter(_._2.isDefined).map{ (x : (Int,Option[B])) => x._2.get}.reduceLeft(r); // drop empty shards before the final fold
414 | }
415 |
416 | private def handleGroupBy[T,U](self : InternalIterable[T], group : T=>U) = {
417 | val innerGroupBy = { (it : Iterable[T],out : (Int,(U,T)) =>Unit) =>
418 | for(x <- it) {
419 | val ARBITRARY_PRIME=47;
420 | val u = group(x);
421 | out((u.hashCode + ARBITRARY_PRIME) & 0x7FFFFFFF,(u,x)); // mask keeps the shard index non-negative
422 | }
423 | }
424 |
425 | val receiver = { (it: Iterator[(U,T)]) =>
426 | val map = scala.collection.mutable.Map[U,ArrayBuffer[T]]();
427 | for( (u,t) <- it) {
428 | map.getOrElseUpdate(u,new ArrayBuffer[T]) += t;
429 | }
430 | map.toSeq;
431 | }
432 | new InternalIterable[(U,Seq[T])] {
433 | protected val scheduler = self.scheduler;
434 | protected val id = scheduler.groupBy(self.id,innerGroupBy, receiver);
435 | }
436 | }
437 |
438 | private def handleDistinct[T](self : InternalIterable[T]) = {
439 | val innerGroupBy = { (it : Iterable[T],out : (Int,T) =>Unit) =>
440 | for(x <- it) {
441 | val ARBITRARY_PRIME=47;
442 | out((x.hashCode + ARBITRARY_PRIME) & 0x7FFFFFFF,x); // mask keeps the shard index non-negative
443 | }
444 | }
445 |
446 | val receiver = { (it: Iterator[T]) =>
447 | val set = scala.collection.mutable.Set[T]() ++ it;
448 | set.toSeq;
449 | }
450 | new InternalIterable[T] {
451 | protected val scheduler = self.scheduler;
452 | protected val id = scheduler.groupBy(self.id,innerGroupBy, receiver);
453 | }
454 | }
455 |
456 | }
457 |
--------------------------------------------------------------------------------