├── README.md ├── .gitignore ├── LICENSE ├── main_test.go └── main.go /README.md: -------------------------------------------------------------------------------- 1 | # sstable 2 | bigdata processing in golang 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries for programs and plugins 2 | *.exe 3 | *.exe~ 4 | *.dll 5 | *.so 6 | *.dylib 7 | 8 | # Test binary, build with `go test -c` 9 | *.test 10 | 11 | # Output of the go coverage tool, specifically when used with LiteIDE 12 | *.out 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 xtaci 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /main_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "io" 6 | "log" 7 | "math/rand" 8 | "net/http" 9 | _ "net/http/pprof" 10 | "os" 11 | "testing" 12 | "time" 13 | ) 14 | 15 | const testfile = "10G.data" 16 | const Mega = 1024 * 1024 17 | 18 | func init() { 19 | go http.ListenAndServe(":6060", nil) 20 | f, err := os.Open(testfile) 21 | if err != nil { 22 | log.Println("generating", testfile) 23 | generate10G() 24 | } else { 25 | f.Close() 26 | } 27 | } 28 | 29 | type dummyReader struct { 30 | count int 31 | max int 32 | rnd *rand.Rand 33 | } 34 | 35 | func (dr *dummyReader) Read(p []byte) (n int, err error) { 36 | var alpha = []byte("ABCDEFGHIJKLMNOPQRSTUVWXYZ ") 37 | if dr.count == dr.max { 38 | return 0, io.EOF 39 | } 40 | 41 | remain := len(p) 42 | idx := 0 43 | for remain > 0 { 44 | p[idx] = alpha[dr.rnd.Intn(len(alpha))] 45 | idx++ 46 | remain-- 47 | dr.count++ 48 | if dr.count == dr.max { 49 | return idx, io.EOF 50 | } 51 | } 52 | 53 | return idx, nil 54 | } 55 | 56 | func newDummyReader(cap int) *dummyReader { 57 | dr := new(dummyReader) 58 | dr.max = cap 59 | dr.rnd = rand.New(rand.NewSource(time.Now().UnixNano())) 60 | return dr 61 | } 62 | 63 | func generate10G() { 64 | dr := newDummyReader(10 * 1024 * Mega) 65 | f, err := os.OpenFile(testfile, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0755) 66 | if err != nil { 67 | log.Fatal(err) 68 | } 69 | 70 | io.Copy(f, dr) 71 | 72 | if err := f.Close(); err != nil { 73 | log.Fatal(err) 74 | } 75 | } 76 | 77 | func TestReduce(t *testing.T) { 78 | reducer := new(uniqueReducer) 79 | Reduce(22, reducer) 80 | if reducer.hasUnique { 81 | log.Println("Found the first unique element:", reducer.target) 82 | } else { 83 | log.Println("Unique element not found!") 84 | } 85 | } 86 | 87 | func TestFindUniqueString(t *testing.T) { 88 | t0 := 
bytes.NewBufferString(" ") 89 | findUnique(t0, 128*Mega) 90 | t1 := bytes.NewBufferString("a a b b c b") // 4 91 | findUnique(t1, 128*Mega) 92 | t2 := bytes.NewBufferString("a a a a a a") // no 93 | findUnique(t2, 128*Mega) 94 | t3 := bytes.NewBufferString("a b c d e a") //1 95 | findUnique(t3, 128*Mega) 96 | t4 := bytes.NewBufferString("a a a a a b") // 5 97 | findUnique(t4, 128*Mega) 98 | } 99 | 100 | func TestFindUnique100M(t *testing.T) { 101 | file, err := os.Open(testfile) 102 | if err != nil { 103 | log.Fatal(err) 104 | } 105 | findUnique(io.LimitReader(file, 100*Mega), 50*Mega) 106 | } 107 | func TestFindUnique1G(t *testing.T) { 108 | file, err := os.Open(testfile) 109 | if err != nil { 110 | log.Fatal(err) 111 | } 112 | findUnique(io.LimitReader(file, 1000*Mega), 500*Mega) 113 | } 114 | 115 | func TestFindUnique10G(t *testing.T) { 116 | file, err := os.Open(testfile) 117 | if err != nil { 118 | log.Fatal(err) 119 | } 120 | findUnique(io.LimitReader(file, 10000*Mega), 2000*Mega) 121 | } 122 | -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "bytes" 6 | "container/heap" 7 | "encoding/binary" 8 | "fmt" 9 | "io" 10 | "log" 11 | "math" 12 | "os" 13 | "runtime" 14 | "sort" 15 | "sync" 16 | ) 17 | 18 | //////////////////////////////////////////////////////////////////////////////// 19 | // pre-processing stage 20 | // an in-memory sorter 21 | 22 | // key-value construction 23 | // rawEntry binary format: 24 | // bytesSize uint32 25 | // bytesPtr uint32 26 | type rawEntry []byte 27 | 28 | const entrySize = 8 29 | 30 | func (e rawEntry) sz() uint32 { return binary.LittleEndian.Uint32(e[:]) } 31 | func (e rawEntry) ptr() uint32 { return binary.LittleEndian.Uint32(e[4:]) } 32 | 33 | // value binds to specific bytes buffer 34 | func (e rawEntry) value(buf []byte) rawValue { return 
buf[e.ptr():][:e.sz()] }

// rawValue binary layout:
//	ord  uint64  (original record order of the line)
//	line []byte
type rawValue []byte

func (v rawValue) ord() uint64   { return binary.LittleEndian.Uint64(v[:]) }
func (v rawValue) bytes() []byte { return v[8:] }

// entry is the decoded, object form of a rawValue.
type entry struct {
	bts []byte
	ord uint64 // data order
}

// dataSet packs many variable-sized records into a single fixed buffer,
// bounding memory usage. Index entries grow upward from the front while
// record payloads grow downward from the back:
//	[key0,key1,...keyN, ......, valueN, ...., value0]
type dataSet struct {
	buf []byte

	idxWritten  int
	dataWritten int
	dataPtr     int // first byte of the data region; dataPtr-1 is the next writable byte
	idxPtr      int // offset of the next free index slot

	swapbuf [entrySize]byte
}

// newDataSet returns a dataSet backed by a buffer of sz bytes.
func newDataSet(sz int) *dataSet {
	return &dataSet{buf: make([]byte, sz), dataPtr: sz}
}

// Add stores bts with its record order, reporting false when the set has
// no room left for the payload plus its index entry.
func (s *dataSet) Add(bts []byte, ord uint64) bool {
	need := len(bts) + 8 // 8 extra bytes carry ord
	if s.idxWritten+s.dataWritten+need+entrySize > len(s.buf) {
		return false
	}

	// payload: written downward from the end of buf
	s.dataPtr -= need
	s.dataWritten += need
	binary.LittleEndian.PutUint64(s.buf[s.dataPtr:], ord)
	copy(s.buf[s.dataPtr+8:], bts)

	// index: written upward from the start of buf
	binary.LittleEndian.PutUint32(s.buf[s.idxPtr:], uint32(need))
	binary.LittleEndian.PutUint32(s.buf[s.idxPtr+4:], uint32(s.dataPtr))
	s.idxPtr += entrySize
	s.idxWritten += entrySize

	return true
}

// Reset returns the set to its initial, empty state so the buffer can be
// reused.
func (s *dataSet) Reset() {
	s.dataPtr = len(s.buf)
	s.dataWritten = 0
	s.idxPtr = 0
	s.idxWritten = 0
}

// e returns the ith index entry in binary form.
func (s *dataSet) e(i int) rawEntry {
	return rawEntry(s.buf[i*entrySize:][:entrySize])
}

// get returns the ith element decoded into object form.
func (s *dataSet) get(i int) entry {
	v := s.e(i).value(s.buf)
	return entry{v.bytes(), v.ord()}
}

// sort.Interface over the index entries; swapping moves only the 8-byte
// index records, never the payloads.
func (s *dataSet) Len() int { return s.idxWritten / entrySize }

func (s *dataSet) Less(i, j int) bool {
	a := s.e(i).value(s.buf)
	b := s.e(j).value(s.buf)
	return bytes.Compare(a.bytes(), b.bytes()) < 0
}

func (s *dataSet) Swap(i, j int) {
	copy(s.swapbuf[:], s.e(i))
	copy(s.e(i), s.e(j))
	copy(s.e(j), s.swapbuf[:])
}

// dataSetReader is a cursor over one sorted dataSet, used during heap
// aggregation.
type dataSetReader struct {
	set  *dataSet
	head int
	elem entry
}

// newDataSetReader returns a cursor positioned at the first element, or
// nil when the set is empty.
func newDataSetReader(set *dataSet) *dataSetReader {
	if set.Len() == 0 {
		return nil
	}
	return &dataSetReader{set: set, elem: set.get(0)}
}

// next advances the cursor, reporting false once the set is exhausted.
func (dr *dataSetReader) next() bool {
	dr.head++
	if dr.head >= dr.set.Len() {
		return false
	}
	dr.elem = dr.set.get(dr.head)
	return true
}

// memSortAggregator is a min-heap of dataSetReaders ordered by the bytes
// of their current element, used to k-way merge sorted sets in memory.
type memSortAggregator struct {
	sets []*dataSetReader
}

func (a *memSortAggregator) Len() int { return len(a.sets) }
func (a *memSortAggregator) Less(i, j int) bool {
	return bytes.Compare(a.sets[i].elem.bts, a.sets[j].elem.bts) < 0
}
func (a *memSortAggregator) Swap(i, j int)      { a.sets[i], a.sets[j] = a.sets[j], a.sets[i] }
func (a *memSortAggregator) Push(x interface{}) { a.sets = append(a.sets, x.(*dataSetReader)) }
func (a *memSortAggregator) Pop() interface{} {
	last := len(a.sets) - 1
	x := a.sets[last]
	a.sets = a.sets[:last]
	return x
}

// memory bounded sorter for big data
type sorter struct {
	sets    []*dataSet
	free    []*dataSet
	setSize int
	limit   int // max total memory usage for sorting
}

// a mapper defines a mapping for `entry` to
bytes
// Map transforms one entry to its serialized output (nil to emit
// nothing); End flushes any pending state at the end of the stream.
type Mapper interface {
	Map(entry) []byte
	End() []byte
}

// Len reports the total number of elements buffered across all sets.
func (h *sorter) Len() int {
	n := 0
	for k := range h.sets {
		n += h.sets[k].Len()
	}
	return n
}

// Map sorts every buffered set in parallel, k-way merges them, feeds the
// merged stream through mapper, writes the results to w, and recycles the
// sets for reuse.
func (h *sorter) Map(w io.Writer, mapper Mapper) {
	if len(h.sets) == 0 {
		return
	}

	// sort the sets in parallel
	wg := new(sync.WaitGroup)
	for k := range h.sets {
		log.Println("sorting sets#", k, "element count:", h.sets[k].Len())
		wg.Add(1)
		go func(set *dataSet) {
			sort.Sort(set)
			wg.Done()
		}(h.sets[k])
	}
	wg.Wait()

	log.Println("merging sorted sets to file")
	agg := new(memSortAggregator)
	for k := range h.sets {
		// BUGFIX: newDataSetReader returns nil for an empty set; pushing
		// nil into the heap would panic inside Less.
		if esr := newDataSetReader(h.sets[k]); esr != nil {
			heap.Push(agg, esr)
		}
	}

	written := 0
	for agg.Len() > 0 {
		esr := heap.Pop(agg).(*dataSetReader)
		if r := mapper.Map(esr.elem); r != nil {
			w.Write(r)
			written++
		}
		if esr.next() {
			heap.Push(agg, esr)
		}
	}
	if r := mapper.End(); r != nil {
		w.Write(r)
		written++
	}

	log.Println("written", written, "elements")
	for k := range h.sets {
		h.sets[k].Reset()
	}
	h.free = h.sets
	h.sets = nil
}

// allocateNewSet takes a recycled set from the free list when available,
// otherwise allocates a fresh one, and appends it to the active sets.
func (h *sorter) allocateNewSet() *dataSet {
	var newSet *dataSet
	if last := len(h.free) - 1; last >= 0 {
		newSet = h.free[last]
		h.free = h.free[:last]
	} else {
		newSet = newDataSet(h.setSize)
	}
	h.sets = append(h.sets, newSet)
	return newSet
}

// Add buffers one record, reporting false when the memory limit is
// reached (caller should dump to disk and retry) or when a single record
// cannot fit into an empty set.
func (h *sorter) Add(bts []byte, ord uint64) bool {
	if len(h.sets) == 0 {
		h.allocateNewSet()
	}
	set := h.sets[len(h.sets)-1]
	if !set.Add(bts, ord) {
		if h.setSize*(len(h.sets)+1) > h.limit { // limit reached
			return false
		}
		// BUGFIX: the result of Add on the fresh set was ignored; a record
		// larger than setSize would have been silently dropped.
		return h.allocateNewSet().Add(bts, ord)
	}
	return true
}

// init configures the sorter for a total memory budget, splitting it into
// one set per CPU so sets can be sorted in parallel.
func (h *sorter) init(limit int) {
	h.limit = limit
	h.setSize = limit / runtime.NumCPU()

	// make sure one set is not larger than MaxUint32, since dataSet uses
	// 32-bit sizes and offsets in its index entries
	if h.setSize > math.MaxUint32 {
		h.setSize = math.MaxUint32
	}
}

// sort2Disk streams r through the sorter with a bounded memory budget,
// mapping each sorted chunk to a file "partN.dat"; it returns the number
// of part files written.
func sort2Disk(r io.Reader, memLimit int, mapper Mapper) int {
	h := new(sorter)
	h.init(memLimit)
	var ord uint64
	parts := 0

	log.Println("beginning sort with memory limited to:", memLimit, "bytes")
	// file based serialization of the current in-memory state
	fileDump := func(hp *sorter, path string) {
		f, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0755)
		if err != nil {
			log.Fatal(err)
		}
		bufw := bufio.NewWriterSize(f, 1<<20)
		hp.Map(bufw, mapper)
		// BUGFIX: the Flush error was ignored, so a full disk could
		// silently truncate a part file.
		if err := bufw.Flush(); err != nil {
			log.Fatal(err)
		}
		if err := f.Close(); err != nil {
			log.Fatal(err)
		}
	}

	scanner := bufio.NewScanner(r)
	scanner.Split(bufio.ScanWords)
	for scanner.Scan() {
		if !h.Add(scanner.Bytes(), ord) {
			fileDump(h, fmt.Sprintf("part%v.dat", parts))
			log.Println("chunk#", parts, "written")
			parts++
			// BUGFIX: the retry's result was ignored; fail loudly when a
			// single record cannot fit even into an empty sorter.
			if !h.Add(scanner.Bytes(), ord) {
				log.Fatal("record too large for memory limit:", len(scanner.Bytes()), "bytes")
			}
		}
		ord++
	}

	if err := scanner.Err(); err != nil {
		// BUGFIX: the underlying error was dropped from the message
		log.Fatal("error reading from source:", err)
	}

	if h.Len() > 0 {
		fileDump(h, fmt.Sprintf("part%v.dat", parts))
		log.Println("chunk#", parts, "written")
		parts++
	}
	return parts
}

////////////////////////////////////////////////////////////////////////////////
// disk streaming stage

// countedEntry binary layout (after the 4-byte size prefix is stripped by
// streamReader):
//	ord uint64
//	cnt uint64
//	bts []byte
type countedEntry []byte

func (c countedEntry) bytes() []byte { return c[16:] }
func (c countedEntry) ord() uint64   { return binary.LittleEndian.Uint64(c) }
func (c countedEntry) cnt() uint64   { return binary.LittleEndian.Uint64(c[8:]) }
329 | 330 | type streamReader struct { 331 | r io.Reader 332 | szbuf [4]byte 333 | buf []byte 334 | } 335 | 336 | func (sr *streamReader) head() []byte { return sr.buf } 337 | 338 | func (sr *streamReader) next() bool { 339 | _, err := io.ReadFull(sr.r, sr.szbuf[:]) 340 | if err != nil { 341 | return false 342 | } 343 | sz := binary.LittleEndian.Uint32(sr.szbuf[:]) 344 | if cap(sr.buf) < int(sz) { 345 | sr.buf = make([]byte, sz) 346 | } else { 347 | sr.buf = sr.buf[:sz] 348 | } 349 | _, err = io.ReadFull(sr.r, sr.buf) 350 | if err != nil { 351 | return false 352 | } 353 | 354 | return true 355 | } 356 | 357 | func newStreamReader(r io.Reader) *streamReader { 358 | sr := new(streamReader) 359 | sr.r = bufio.NewReader(r) 360 | if sr.next() { 361 | return sr 362 | } 363 | return nil 364 | } 365 | 366 | // streamAggregator always pop the min string 367 | type lessComparator func([]byte, []byte) bool 368 | type streamAggregator struct { 369 | entries []*streamReader 370 | less lessComparator 371 | } 372 | 373 | func newStreamAggregator(less lessComparator) *streamAggregator { 374 | agg := new(streamAggregator) 375 | agg.less = less 376 | return agg 377 | } 378 | 379 | func (h *streamAggregator) Len() int { return len(h.entries) } 380 | func (h *streamAggregator) Less(i, j int) bool { 381 | return h.less(h.entries[i].head(), h.entries[j].head()) 382 | } 383 | func (h *streamAggregator) Swap(i, j int) { h.entries[i], h.entries[j] = h.entries[j], h.entries[i] } 384 | func (h *streamAggregator) Push(x interface{}) { h.entries = append(h.entries, x.(*streamReader)) } 385 | func (h *streamAggregator) Pop() interface{} { 386 | n := len(h.entries) 387 | x := h.entries[n-1] 388 | h.entries = h.entries[0 : n-1] 389 | return x 390 | } 391 | 392 | // define a mapping function for counting 393 | type countMapper struct { 394 | last entry 395 | lastCnt uint64 396 | hasLast bool 397 | buf []byte 398 | } 399 | 400 | func (m *countMapper) prepareBuffer(sz int) { 401 | if cap(m.buf) < sz 
{ 402 | m.buf = make([]byte, sz) 403 | } else { 404 | m.buf = m.buf[:sz] 405 | } 406 | } 407 | 408 | func (m *countMapper) writeLast() { 409 | // output format 410 | // size - 32bit 411 | // ord 64bit 412 | // cnt 64bit 413 | // bts (size - 16) 414 | sz := len(m.last.bts) + 8 + 8 415 | m.prepareBuffer(sz + 4) 416 | binary.LittleEndian.PutUint32(m.buf, uint32(sz)) 417 | binary.LittleEndian.PutUint64(m.buf[4:], m.last.ord) 418 | binary.LittleEndian.PutUint64(m.buf[12:], m.lastCnt) 419 | copy(m.buf[20:], m.last.bts) 420 | } 421 | 422 | func (m *countMapper) Map(e entry) (ret []byte) { 423 | if !m.hasLast { 424 | m.lastCnt = 1 425 | m.hasLast = true 426 | m.last = e 427 | return nil 428 | } 429 | 430 | if bytes.Compare(e.bts, m.last.bts) == 0 { // counting 431 | m.lastCnt++ 432 | } else { 433 | m.writeLast() 434 | m.last = e 435 | m.lastCnt = 1 436 | return m.buf 437 | } 438 | return nil 439 | } 440 | 441 | func (m *countMapper) End() (ret []byte) { 442 | if !m.hasLast { 443 | return nil 444 | } 445 | m.writeLast() 446 | return m.buf 447 | } 448 | 449 | // Reducer interface 450 | type Reducer interface { 451 | Reduce(countedEntry) 452 | End() 453 | } 454 | 455 | type uniqueReducer struct { 456 | target countedEntry 457 | last countedEntry 458 | count uint64 459 | hasUnique bool 460 | hasLast bool 461 | } 462 | 463 | func (r *uniqueReducer) checkTarget() { 464 | if r.count == 1 { 465 | if !r.hasUnique { 466 | r.target = r.deepcopy(r.last) 467 | r.hasUnique = true 468 | } else if r.last.ord() < r.target.ord() { 469 | r.target = r.deepcopy(r.last) 470 | } 471 | } 472 | } 473 | 474 | func (r *uniqueReducer) deepcopy(e1 countedEntry) countedEntry { 475 | e2 := make([]byte, len(e1)) 476 | copy(e2, e1) 477 | return e2 478 | } 479 | 480 | func (r *uniqueReducer) updateLast(e countedEntry) { 481 | sz := len(e) 482 | if sz > cap(r.last) { 483 | r.last = make([]byte, sz) 484 | } else { 485 | r.last = r.last[:sz] 486 | } 487 | copy(r.last, e) 488 | } 489 | 490 | func (r 
*uniqueReducer) Reduce(e countedEntry) {
	if !r.hasLast {
		r.updateLast(e)
		r.hasLast = true
		// BUGFIX: r.count was left at 0 for the first record, so its own
		// count was dropped and a unique string sorting first (e.g. input
		// "a b b") was never detected.
		r.count = e.cnt()
	} else if bytes.Equal(r.last.bytes(), e.bytes()) {
		// same string seen in another part: accumulate its count
		r.count += e.cnt()
	} else {
		r.checkTarget()
		r.updateLast(e)
		r.count = e.cnt()
	}
}

// End finalizes the reduction, checking the last accumulated group.
func (r *uniqueReducer) End() {
	r.checkTarget()
}

// Reduce k-way merges the sorted part files "part0.dat"..."part{N-1}.dat"
// and feeds the merged, globally sorted stream of countedEntry records to
// reducer r.
func Reduce(parts int, r Reducer) {
	files := make([]*os.File, parts)

	less := func(left []byte, right []byte) bool {
		return bytes.Compare(countedEntry(left).bytes(), countedEntry(right).bytes()) < 0
	}
	h := newStreamAggregator(less)
	for i := 0; i < parts; i++ {
		f, err := os.Open(fmt.Sprintf("part%v.dat", i))
		if err != nil {
			log.Fatal(err)
		}
		files[i] = f
		// newStreamReader returns nil for an empty part; skip it
		if sr := newStreamReader(bufio.NewReaderSize(f, 1<<20)); sr != nil {
			heap.Push(h, sr)
		}
	}

	for h.Len() > 0 {
		sr := heap.Pop(h).(*streamReader)
		r.Reduce(sr.head())
		if sr.next() {
			heap.Push(h, sr)
		}
	}
	r.End()

	for _, f := range files {
		if err := f.Close(); err != nil {
			log.Fatal(err)
		}
	}
}

// findUnique reads from r with the given memory limit and tries to find
// the first unique string in the input.
func findUnique(r io.Reader, memLimit int) {
	// step 1: sort into on-disk chunks (mapping stage)
	parts := sort2Disk(r, memLimit, new(countMapper))
	log.Println("Generated", parts, "parts")
	// step 2: merge all sstables into one continuous sorted stream
	log.Println("Reducing from#", parts, "sstable(s)")
	reducer := new(uniqueReducer)
	Reduce(parts, reducer)

	if reducer.hasUnique {
		log.Println("Found the first unique element:", string(reducer.target.bytes()), reducer.target.ord())
	} else {
		log.Println("Unique element not found!")
	}
}