├── .gitignore ├── Gomfile ├── gossamr.go ├── sort_writer_test.go ├── examples └── wordcount │ └── wordcount.go ├── job.go ├── runner_test.go ├── sort_writer.go ├── gossamr_test.go ├── README.md ├── task_test.go ├── task.go ├── io_test.go ├── io.go ├── runner.go └── LICENSE /.gitignore: -------------------------------------------------------------------------------- 1 | gossamr 2 | _vendor 3 | ./wordcount 4 | -------------------------------------------------------------------------------- /Gomfile: -------------------------------------------------------------------------------- 1 | gom 'github.com/markchadwick/sortedpairs' 2 | gom 'github.com/markchadwick/spec' 3 | gom 'github.com/markchadwick/typedbytes' 4 | -------------------------------------------------------------------------------- /gossamr.go: -------------------------------------------------------------------------------- 1 | package gossamr 2 | 3 | import ( 4 | "os" 5 | ) 6 | 7 | func Run(tasks ...*Task) error { 8 | job := NewJob(tasks...) 
9 | runner, err := GetRunner(os.Args) 10 | if err != nil { 11 | return err 12 | } 13 | return runner.Run(job) 14 | } 15 | -------------------------------------------------------------------------------- /sort_writer_test.go: -------------------------------------------------------------------------------- 1 | package gossamr 2 | 3 | import ( 4 | "fmt" 5 | "github.com/markchadwick/spec" 6 | "log" 7 | "math/rand" 8 | ) 9 | 10 | var _ = spec.Suite("Sort Writer", func(c *spec.C) { 11 | c.It("should flipping run", func(c *spec.C) { 12 | buf := NewBufCloser() 13 | sw, err := NewSortWriter(buf, 10) 14 | c.Assert(err).IsNil() 15 | 16 | for i := 0; i < 25; i++ { 17 | key := fmt.Sprintf("rec-%05d", rand.Int31n(100)) 18 | c.Assert(sw.Write(key, int32(i))).IsNil() 19 | } 20 | c.Assert(sw.Close()).IsNil() 21 | 22 | pr := NewPairReader(buf) 23 | for { 24 | k, v, err := pr.Next() 25 | if err != nil { 26 | return 27 | } 28 | log.Printf("%v = %v", k, v) 29 | } 30 | }) 31 | 32 | }) 33 | -------------------------------------------------------------------------------- /examples/wordcount/wordcount.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "log" 5 | "strings" 6 | 7 | "github.com/vistarmedia/gossamr" 8 | ) 9 | 10 | type WordCount struct{} 11 | 12 | func (wc *WordCount) Map(p int64, line string, c gossamr.Collector) error { 13 | for _, word := range strings.Fields(line) { 14 | c.Collect(strings.ToLower(word), int64(1)) 15 | } 16 | return nil 17 | } 18 | 19 | func (wc *WordCount) Reduce(word string, counts chan int64, c gossamr.Collector) error { 20 | var sum int64 = 0 21 | for v := range counts { 22 | sum += v 23 | } 24 | c.Collect(sum, word) 25 | return nil 26 | } 27 | 28 | func main() { 29 | wordcount := gossamr.NewTask(&WordCount{}) 30 | 31 | err := gossamr.Run(wordcount) 32 | if err != nil { 33 | log.Fatal(err) 34 | } 35 | } 36 | 
-------------------------------------------------------------------------------- /job.go: -------------------------------------------------------------------------------- 1 | package gossamr 2 | 3 | import ( 4 | "log" 5 | ) 6 | 7 | type Collector interface { 8 | Collect(k, v interface{}) error 9 | } 10 | 11 | type writerCollector struct { 12 | writer Writer 13 | } 14 | 15 | var _ Collector = new(writerCollector) 16 | 17 | func NewWriterCollector(writer Writer) *writerCollector { 18 | return &writerCollector{writer} 19 | } 20 | 21 | func (wc *writerCollector) Collect(k, v interface{}) (err error) { 22 | err = wc.writer.Write(k, v) 23 | if err != nil { 24 | log.Printf("error writing to collector: %s", err.Error()) 25 | } 26 | return 27 | } 28 | 29 | type Job struct { 30 | // reader Reader 31 | // writer Writer 32 | tasks []*Task 33 | } 34 | 35 | func NewJob(tasks ...*Task) *Job { 36 | return &Job{ 37 | // reader: r, 38 | // writer: w, 39 | tasks: tasks, 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /runner_test.go: -------------------------------------------------------------------------------- 1 | package gossamr 2 | 3 | import ( 4 | "github.com/markchadwick/spec" 5 | ) 6 | 7 | var _ = spec.Suite("Task Phase Runner", func(c *spec.C) { 8 | c.It("should bail with missing args", func(c *spec.C) { 9 | args := []string{ 10 | "./myprog", 11 | } 12 | _, err := TaskPhaseRunnerFromArgs(args) 13 | c.Assert(err).NotNil() 14 | }) 15 | 16 | c.It("should bail on invalid phase", func(c *spec.C) { 17 | args := []string{ 18 | "./myprog", 19 | "-phase", "rock it", 20 | } 21 | _, err := TaskPhaseRunnerFromArgs(args) 22 | c.Assert(err).NotNil() 23 | c.Assert(err.Error()).Equals("Unknown phase rock it") 24 | }) 25 | 26 | c.It("should parse task # and phase", func(c *spec.C) { 27 | args := []string{ 28 | "./myprog", 29 | "-task", "2", 30 | "-phase", "combine", 31 | } 32 | r, err := TaskPhaseRunnerFromArgs(args) 33 | 
c.Assert(err).IsNil() 34 | c.Assert(r.taskNo).Equals(2) 35 | c.Assert(r.phase).Equals(CombinePhase) 36 | }) 37 | }) 38 | 39 | var _ = spec.Suite("Local runner", func(c *spec.C) { 40 | c.Skip("-pending-") 41 | }) 42 | -------------------------------------------------------------------------------- /sort_writer.go: -------------------------------------------------------------------------------- 1 | package gossamr 2 | 3 | import ( 4 | "github.com/markchadwick/sortedpairs" 5 | "github.com/markchadwick/typedbytes" 6 | "io" 7 | ) 8 | 9 | type pairWriter struct { 10 | w io.Writer 11 | } 12 | 13 | func (pw *pairWriter) Write(k, v []byte) (err error) { 14 | if _, err = pw.w.Write(k); err != nil { 15 | return 16 | } 17 | _, err = pw.w.Write(v) 18 | return 19 | } 20 | 21 | type SortWriter struct { 22 | w io.WriteCloser 23 | spw *sortedpairs.Writer 24 | } 25 | 26 | func NewSortWriter(w io.WriteCloser, capacity int) (*SortWriter, error) { 27 | pw := &pairWriter{w} 28 | spw, err := sortedpairs.NewWriter(pw, capacity) 29 | if err != nil { 30 | return nil, err 31 | } 32 | sw := &SortWriter{ 33 | w: w, 34 | spw: spw, 35 | } 36 | return sw, nil 37 | } 38 | 39 | func (sw *SortWriter) Write(k, v interface{}) (err error) { 40 | var kb, vb []byte 41 | if kb, err = typedbytes.Encode(k); err != nil { 42 | return 43 | } 44 | if vb, err = typedbytes.Encode(v); err != nil { 45 | return 46 | } 47 | return sw.spw.Write(kb, vb) 48 | } 49 | 50 | func (sw *SortWriter) Close() (err error) { 51 | err = sw.spw.Close() 52 | sw.w.Close() 53 | return 54 | } 55 | -------------------------------------------------------------------------------- /gossamr_test.go: -------------------------------------------------------------------------------- 1 | package gossamr 2 | 3 | import ( 4 | "bufio" 5 | "bytes" 6 | "github.com/markchadwick/spec" 7 | "io" 8 | "log" 9 | "testing" 10 | ) 11 | 12 | func init() { 13 | log.SetFlags(log.Lshortfile | log.Ldate | log.Ltime) 14 | } 15 | 16 | func Test(t *testing.T) { 17 | 
// TestBuffer is an in-memory pipe with buffering on both ends. Reads block
// while no data is pending and the buffer has not been closed.
type TestBuffer struct {
	r  *io.PipeReader
	w  *io.PipeWriter
	br *bufio.Reader
	bw *bufio.Writer
}

// NewTestBuffer connects a fresh pipe and wraps each end in a bufio layer.
func NewTestBuffer() *TestBuffer {
	tb := new(TestBuffer)
	tb.r, tb.w = io.Pipe()
	tb.br = bufio.NewReader(tb.r)
	tb.bw = bufio.NewWriter(tb.w)
	return tb
}

// Read reads from the buffered read end of the pipe.
func (tb *TestBuffer) Read(p []byte) (int, error) {
	n, err := tb.br.Read(p)
	return n, err
}

// Write stores p in the write-side buffer; it reaches the pipe on flush.
func (tb *TestBuffer) Write(p []byte) (int, error) {
	n, err := tb.bw.Write(p)
	return n, err
}

// Close flushes buffered writes into the pipe and then closes the write end
// with io.EOF. If the flush fails its error is returned and the pipe is left
// open.
func (tb *TestBuffer) Close() error {
	if err := tb.bw.Flush(); err != nil {
		return err
	}
	return tb.w.CloseWithError(io.EOF)
}
6 | 7 | ```go 8 | package main 9 | 10 | import ( 11 | "log" 12 | "strings" 13 | 14 | "github.com/vistarmedia/gossamr" 15 | ) 16 | 17 | type WordCount struct{} 18 | 19 | func (wc *WordCount) Map(p int64, line string, c gossamr.Collector) error { 20 | for _, word := range strings.Fields(line) { 21 | c.Collect(strings.ToLower(word), int64(1)) 22 | } 23 | return nil 24 | } 25 | 26 | func (wc *WordCount) Reduce(word string, counts chan int64, c gossamr.Collector) error { 27 | var sum int64 28 | for v := range counts { 29 | sum += v 30 | } 31 | c.Collect(sum, word) 32 | return nil 33 | } 34 | 35 | func main() { 36 | wordcount := gossamr.NewTask(&WordCount{}) 37 | 38 | err := gossamr.Run(wordcount) 39 | if err != nil { 40 | log.Fatal(err) 41 | } 42 | } 43 | ``` 44 | 45 | ## Running with Hadoop 46 | 47 | ./bin/hadoop jar ./contrib/streaming/hadoop-streaming-1.2.1.jar \ 48 | -input /mytext.txt \ 49 | -output /output.15 \ 50 | -mapper "gossamr -task 0 -phase map" \ 51 | -reducer "gossamr -task 0 -phase reduce" \ 52 | -io typedbytes \ 53 | -file ./wordcount \ 54 | -numReduceTasks 6 55 | -------------------------------------------------------------------------------- /task_test.go: -------------------------------------------------------------------------------- 1 | package gossamr 2 | 3 | import ( 4 | "fmt" 5 | "github.com/markchadwick/spec" 6 | "io" 7 | "sync" 8 | ) 9 | 10 | type Echo struct { 11 | } 12 | 13 | func (e *Echo) Map(k, v string, c Collector) error { 14 | return c.Collect(fmt.Sprintf("%s said", k), fmt.Sprintf("Hello, %s", v)) 15 | } 16 | 17 | func (e *Echo) String() string { 18 | return "Echo" 19 | } 20 | 21 | var _ = spec.Suite("Task", func(c *spec.C) { 22 | echo := new(Echo) 23 | echoTask := NewTask(echo) 24 | w := NewTestBuffer() 25 | r := NewTestBuffer() 26 | defer r.Close() 27 | defer w.Close() 28 | 29 | c.It("should know when a method is missing", func(c *spec.C) { 30 | _, ok := echoTask.methodByName("Missing") 31 | c.Assert(ok).IsFalse() 32 | }) 33 | 34 |
// Phase identifies which stage of a task is to be run.
type Phase uint8

// The known phases. The numeric values follow declaration order (MapPhase is
// 0, CombinePhase is 1, ReducePhase is 2) and appear as such in messages that
// format a Phase with %d.
const (
	MapPhase Phase = iota
	CombinePhase
	ReducePhase
)
{ 19 | switch name { 20 | default: 21 | return 0, fmt.Errorf("Unknown phase %s", name) 22 | case "": 23 | return 0, fmt.Errorf("Missing phase") 24 | case "map": 25 | return MapPhase, nil 26 | case "combine": 27 | return CombinePhase, nil 28 | case "reduce": 29 | return ReducePhase, nil 30 | } 31 | } 32 | 33 | type Task struct { 34 | instance interface{} 35 | value reflect.Value 36 | } 37 | 38 | func NewTask(instance interface{}) *Task { 39 | value := reflect.ValueOf(instance) 40 | return &Task{ 41 | instance: instance, 42 | value: value, 43 | } 44 | } 45 | 46 | func (t *Task) Run(phase Phase, r io.Reader, w io.WriteCloser) (err error) { 47 | var input Reader 48 | pairs := NewPairReader(r) 49 | output := NewPairWriter(w) 50 | 51 | var m reflect.Value 52 | var ok bool 53 | switch phase { 54 | default: 55 | return fmt.Errorf("Invalid phase %d", phase) 56 | case MapPhase: 57 | input = pairs 58 | m, ok = t.mapper() 59 | case CombinePhase: 60 | input = NewGroupedReader(pairs) 61 | m, ok = t.combiner() 62 | case ReducePhase: 63 | input = NewGroupedReader(pairs) 64 | m, ok = t.reducer() 65 | } 66 | if !ok { 67 | return fmt.Errorf("No phase %d for %s", phase, t.instance) 68 | } 69 | err = t.run(m, input, output) 70 | return 71 | } 72 | 73 | func (t *Task) run(m reflect.Value, input Reader, output Writer) (err error) { 74 | collector := NewWriterCollector(output) 75 | colValue := reflect.ValueOf(collector) 76 | 77 | defer func() { 78 | if e := output.Close(); e != nil && err == nil { 79 | err = e 80 | } 81 | }() 82 | 83 | var k, v interface{} 84 | for { 85 | k, v, err = input.Next() 86 | if err != nil { 87 | if err == io.EOF { 88 | return nil 89 | } 90 | log.Printf("Read error: %s", err) 91 | return 92 | } 93 | m.Call([]reflect.Value{ 94 | reflect.ValueOf(k), 95 | reflect.ValueOf(v), 96 | colValue, 97 | }) 98 | } 99 | } 100 | 101 | func (t *Task) mapper() (reflect.Value, bool) { 102 | return t.methodByName("Map") 103 | } 104 | 105 | func (t *Task) combiner() (reflect.Value, 
bool) { 106 | return t.methodByName("Combine") 107 | } 108 | 109 | func (t *Task) reducer() (reflect.Value, bool) { 110 | return t.methodByName("Reduce") 111 | } 112 | 113 | func (t *Task) methodByName(name string) (v reflect.Value, ok bool) { 114 | v = t.value.MethodByName(name) 115 | ok = v.Kind() == reflect.Func 116 | return 117 | } 118 | -------------------------------------------------------------------------------- /io_test.go: -------------------------------------------------------------------------------- 1 | package gossamr 2 | 3 | import ( 4 | "github.com/markchadwick/spec" 5 | "io" 6 | ) 7 | 8 | type TestReader struct { 9 | i int 10 | rows [][]interface{} 11 | } 12 | 13 | func NewTestReader(rows [][]interface{}) *TestReader { 14 | return &TestReader{ 15 | i: 0, 16 | rows: rows, 17 | } 18 | } 19 | 20 | func (tr *TestReader) Next() (k, v interface{}, err error) { 21 | if tr.i > len(tr.rows)-1 { 22 | return nil, nil, io.EOF 23 | } 24 | 25 | row := tr.rows[tr.i] 26 | tr.i++ 27 | return row[0], row[1], nil 28 | } 29 | 30 | var _ = spec.Suite("Grouped Reader", func(c *spec.C) { 31 | c.It("should know when its input is closed", func(c *spec.C) { 32 | tr := &TestReader{} 33 | gr := NewGroupedReader(tr) 34 | 35 | _, _, err := gr.Next() 36 | c.Assert(err).Equals(io.EOF) 37 | }) 38 | 39 | c.It("should group adjacent keys", func(c *spec.C) { 40 | tr := NewTestReader([][]interface{}{ 41 | {"seen", 12}, 42 | {"seen", 82}, 43 | }) 44 | gr := NewGroupedReader(tr) 45 | 46 | key, vs, err := gr.Next() 47 | c.Assert(err).IsNil() 48 | c.Assert(key).Equals("seen") 49 | 50 | ch, ok := vs.(chan int) 51 | c.Assert(ok).IsTrue() 52 | 53 | observed := make([]int, 0) 54 | for o := range ch { 55 | observed = append(observed, o) 56 | } 57 | c.Assert(observed).HasLen(2) 58 | c.Assert(observed[0]).Equals(12) 59 | c.Assert(observed[1]).Equals(82) 60 | 61 | key, vs, err = gr.Next() 62 | c.Assert(err).Equals(io.EOF) 63 | c.Assert(key).IsNil() 64 | c.Assert(vs).IsNil() 65 | }) 66 | 67 | 
// Copy moves every pair from r to w until r reports io.EOF, which counts as
// success. Any other read error, or the first write error, is returned as-is.
// Copy does not close w.
func Copy(r Reader, w Writer) error {
	for {
		k, v, err := r.Next()
		switch {
		case err == io.EOF:
			return nil
		case err != nil:
			return err
		}
		if err := w.Write(k, v); err != nil {
			return err
		}
	}
}

// Reader yields key/value pairs one at a time.
type Reader interface {
	Next() (k, v interface{}, err error)
}

// Writer accepts key/value pairs and can be closed.
type Writer interface {
	Write(k, v interface{}) error
	Close() error
}
36 | type GroupedReader struct { 37 | nextKey interface{} 38 | nextValue interface{} 39 | nextError error 40 | reader Reader 41 | } 42 | 43 | func NewGroupedReader(reader Reader) Reader { 44 | return &GroupedReader{ 45 | nextKey: nil, 46 | nextValue: nil, 47 | reader: reader, 48 | } 49 | } 50 | 51 | func (gr *GroupedReader) Next() (k, v interface{}, err error) { 52 | if gr.nextError != nil { 53 | err = gr.nextError 54 | return 55 | } 56 | 57 | if gr.nextKey == nil && gr.nextValue == nil { 58 | gr.nextKey, gr.nextValue, err = gr.reader.Next() 59 | if err != nil { 60 | return 61 | } 62 | } 63 | 64 | key := gr.nextKey 65 | t := reflect.ChanOf(reflect.BothDir, reflect.TypeOf(gr.nextValue)) 66 | ch := reflect.MakeChan(t, 0) 67 | 68 | go func() { 69 | defer ch.Close() 70 | ch.Send(reflect.ValueOf(gr.nextValue)) 71 | for { 72 | k, v, err = gr.reader.Next() 73 | if err != nil { 74 | gr.nextError = err 75 | return 76 | } 77 | if k != key { 78 | gr.nextKey = k 79 | gr.nextValue = v 80 | return 81 | } 82 | ch.Send(reflect.ValueOf(v)) 83 | } 84 | }() 85 | return key, ch.Interface(), nil 86 | } 87 | 88 | // Read pairs serialized with Hadoop's typedbytes. It is assumed that in 89 | // non-local mode, this will always be the wire format for reading and writing. 90 | func NewPairReader(r io.Reader) Reader { 91 | byteReader := typedbytes.NewReader(r) 92 | return typedbytes.NewPairReader(byteReader) 93 | } 94 | 95 | // Write pairs to an underlying writer in Hadoop's typedbytes format. As above, 96 | // it is assumed all non-local IO will happen in this format 97 | func NewPairWriter(w io.WriteCloser) Writer { 98 | byteWriter := typedbytes.NewWriter(w) 99 | return typedbytes.NewPairWriter(byteWriter) 100 | } 101 | 102 | // Line Reader is used by basic streaming jobs. It yields a line number and the 103 | // raw line delimited by \n. The consumer must accept the arguments (int64, 104 | // string). 
type LineReader struct {
	n      int64 // byte offset of the next line to be read
	reader *bufio.Reader
}

// NewLineReader returns a LineReader that reads '\n'-delimited lines from r,
// starting at offset 0.
func NewLineReader(r io.Reader) *LineReader {
	return &LineReader{
		n:      0,
		reader: bufio.NewReader(r),
	}
}

// Next returns the next line of input. The key is the int64 byte offset at
// which the line starts and the value is the line as a string, including its
// trailing '\n' when one was present.
//
// Two defects of the earlier version are fixed here. The offset used to stay
// at 0 because the line's length was measured on a variable that was never
// assigned. And a final line without a trailing newline used to be returned
// together with io.EOF, so callers that stop on the first error dropped it.
// Such a line is now returned with a nil error and io.EOF is reported by the
// following call.
func (lr *LineReader) Next() (k, v interface{}, err error) {
	line, err := lr.reader.ReadString('\n')
	k, v = lr.n, line
	lr.n += int64(len(line))

	if err == io.EOF && len(line) > 0 {
		// Unterminated last line: hand the data out now, EOF comes next.
		err = nil
	}
	return k, v, err
}
23 | func GetRunner(args []string) (Runner, error) { 24 | if argsContain(args, "-task") { 25 | return TaskPhaseRunnerFromArgs(args) 26 | } 27 | 28 | return new(LocalRunner), nil 29 | } 30 | 31 | func argsContain(args []string, s string) bool { 32 | for _, arg := range args { 33 | if arg == s { 34 | return true 35 | } 36 | } 37 | return false 38 | } 39 | 40 | type Runner interface { 41 | Run(job *Job) error 42 | } 43 | 44 | // LocalRunner 45 | type LocalRunner struct { 46 | root string 47 | } 48 | 49 | func (lr *LocalRunner) Run(j *Job) (err error) { 50 | if lr.root, err = ioutil.TempDir("", "gossamr-"); err != nil { 51 | return 52 | } 53 | log.Printf("Working in %s", lr.root) 54 | defer os.RemoveAll(lr.root) 55 | return lr.runJob(j) 56 | } 57 | 58 | func (lr *LocalRunner) runJob(j *Job) (err error) { 59 | input := NewLineReader(os.Stdin) 60 | var fname string 61 | var output *os.File 62 | 63 | for i, task := range j.tasks { 64 | if fname, err = lr.runTask(i, task, input); err != nil { 65 | return 66 | } 67 | } 68 | 69 | if fname == "" { 70 | return nil 71 | } 72 | 73 | if output, err = os.Open(fname); err != nil { 74 | return 75 | } 76 | reader := NewPairReader(output) 77 | writer := NewStringWriter(os.Stdout) 78 | return Copy(reader, writer) 79 | } 80 | 81 | func (lr *LocalRunner) runTask(i int, t *Task, in Reader) (output string, err error) { 82 | var f *os.File 83 | mapper, hasMapper := t.mapper() 84 | combiner, hasCombiner := t.combiner() 85 | reducer, hasReducer := t.reducer() 86 | 87 | // A task must have a mapper 88 | if !hasMapper { 89 | return "", fmt.Errorf("Task[%d] has no mapper", i) 90 | } 91 | mapOutput, err := lr.open(i, "mapper") 92 | if err != nil { 93 | return "", err 94 | } 95 | output = mapOutput.Name() 96 | 97 | if hasCombiner || hasReducer { 98 | if err = lr.execSorted(t, mapper, in, mapOutput); err != nil { 99 | return 100 | } 101 | } else { 102 | if err = lr.exec(t, mapper, in, mapOutput); err != nil { 103 | return 104 | } 105 | } 106 | 107 
| if hasCombiner { 108 | if f, err = os.Open(output); err != nil { 109 | return output, err 110 | } 111 | in = NewGroupedReader(NewPairReader(f)) 112 | 113 | combineOutput, err := lr.open(i, "combiner") 114 | if err != nil { 115 | return "", err 116 | } 117 | output = combineOutput.Name() 118 | 119 | if err = lr.execSorted(t, combiner, in, combineOutput); err != nil { 120 | return output, err 121 | } 122 | } 123 | 124 | if hasReducer { 125 | if f, err = os.Open(output); err != nil { 126 | return output, err 127 | } 128 | in = NewGroupedReader(NewPairReader(f)) 129 | 130 | reduceOutput, err := lr.open(i, "reducer") 131 | if err != nil { 132 | return "", err 133 | } 134 | output = reduceOutput.Name() 135 | if err = lr.execSorted(t, reducer, in, reduceOutput); err != nil { 136 | return output, err 137 | } 138 | } 139 | 140 | return 141 | } 142 | 143 | func (lr *LocalRunner) execSorted(t *Task, f reflect.Value, r Reader, out *os.File) error { 144 | w, err := NewSortWriter(out, 1024*1024) 145 | if err != nil { 146 | return err 147 | } 148 | return t.run(f, r, w) 149 | } 150 | 151 | func (lr *LocalRunner) exec(t *Task, f reflect.Value, r Reader, out *os.File) error { 152 | w := NewPairWriter(out) 153 | return t.run(f, r, w) 154 | } 155 | 156 | func (lr *LocalRunner) open(i int, name string) (f *os.File, err error) { 157 | fname := path.Join(lr.root, fmt.Sprintf("%03d-%s", i, name)) 158 | return os.OpenFile(fname, os.O_RDWR|os.O_CREATE, 0644) 159 | } 160 | 161 | // TaskPhaseRunner 162 | // Runs a single phase of a task forked from Hadoop. It is assumed that all 163 | // input and output will be typed bytes at this point. 
164 | type TaskPhaseRunner struct { 165 | taskNo int 166 | phase Phase 167 | } 168 | 169 | func TaskPhaseRunnerFromArgs(args []string) (tpr *TaskPhaseRunner, err error) { 170 | fs := flag.NewFlagSet(args[0], flag.ContinueOnError) 171 | taskNo := fs.Int("task", 0, "task # to run") 172 | phaseName := fs.String("phase", "", "phase of task to run") 173 | 174 | if err = fs.Parse(args[1:]); err != nil { 175 | return 176 | } 177 | 178 | phase, err := GetPhase(*phaseName) 179 | if err != nil { 180 | return nil, err 181 | } 182 | 183 | tpr = &TaskPhaseRunner{ 184 | taskNo: *taskNo, 185 | phase: phase, 186 | } 187 | return 188 | } 189 | 190 | func (tpr *TaskPhaseRunner) Run(j *Job) error { 191 | if tpr.taskNo > len(j.tasks)-1 { 192 | return fmt.Errorf("No task %d", tpr.taskNo) 193 | } 194 | task := j.tasks[tpr.taskNo] 195 | log.Printf("Running phase %d with TaskPhaseRunner", tpr.phase) 196 | return task.Run(tpr.phase, os.Stdin, os.Stdout) 197 | } 198 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. 
For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | --------------------------------------------------------------------------------