├── README.md ├── .gitignore ├── LICENSE ├── main_test.go └── main.go /README.md: -------------------------------------------------------------------------------- 1 | # sstable 2 | bigdata processing in golang 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries for programs and plugins 2 | *.exe 3 | *.exe~ 4 | *.dll 5 | *.so 6 | *.dylib 7 | 8 | # Test binary, build with `go test -c` 9 | *.test 10 | 11 | # Output of the go coverage tool, specifically when used with LiteIDE 12 | *.out 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 xtaci 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /main_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "io" 6 | "log" 7 | "math/rand" 8 | "net/http" 9 | _ "net/http/pprof" 10 | "os" 11 | "testing" 12 | "time" 13 | ) 14 | 15 | const testfile = "10G.data" 16 | const Mega = 1024 * 1024 17 | 18 | func init() { 19 | go http.ListenAndServe(":6060", nil) 20 | f, err := os.Open(testfile) 21 | if err != nil { 22 | log.Println("generating", testfile) 23 | generate10G() 24 | } else { 25 | f.Close() 26 | } 27 | } 28 | 29 | type dummyReader struct { 30 | count int 31 | max int 32 | rnd *rand.Rand 33 | } 34 | 35 | func (dr *dummyReader) Read(p []byte) (n int, err error) { 36 | var alpha = []byte("ABCDEFGHIJKLMNOPQRSTUVWXYZ ") 37 | if dr.count == dr.max { 38 | return 0, io.EOF 39 | } 40 | 41 | remain := len(p) 42 | idx := 0 43 | for remain > 0 { 44 | p[idx] = alpha[dr.rnd.Intn(len(alpha))] 45 | idx++ 46 | remain-- 47 | dr.count++ 48 | if dr.count == dr.max { 49 | return idx, io.EOF 50 | } 51 | } 52 | 53 | return idx, nil 54 | } 55 | 56 | func newDummyReader(cap int) *dummyReader { 57 | dr := new(dummyReader) 58 | dr.max = cap 59 | dr.rnd = rand.New(rand.NewSource(time.Now().UnixNano())) 60 | return dr 61 | } 62 | 63 | func generate10G() { 64 | dr := newDummyReader(10 * 1024 * Mega) 65 | f, err := os.OpenFile(testfile, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0755) 66 | if err != nil { 67 | log.Fatal(err) 68 | } 69 | 70 | io.Copy(f, dr) 71 | 72 | if err := f.Close(); err != nil { 73 | log.Fatal(err) 74 | } 75 | } 76 | 77 | func TestReduce(t *testing.T) { 78 | reducer := new(uniqueReducer) 79 | Reduce(22, reducer) 80 | if reducer.hasUnique { 81 | log.Println("Found the first unique element:", reducer.target) 82 | } else { 83 | log.Println("Unique element not found!") 84 | } 85 | } 86 | 87 | func TestFindUniqueString(t *testing.T) { 88 | t0 := 
bytes.NewBufferString(" ") 89 | findUnique(t0, 128*Mega) 90 | t1 := bytes.NewBufferString("a a b b c b") // 4 91 | findUnique(t1, 128*Mega) 92 | t2 := bytes.NewBufferString("a a a a a a") // no 93 | findUnique(t2, 128*Mega) 94 | t3 := bytes.NewBufferString("a b c d e a") //1 95 | findUnique(t3, 128*Mega) 96 | t4 := bytes.NewBufferString("a a a a a b") // 5 97 | findUnique(t4, 128*Mega) 98 | } 99 | 100 | func TestFindUnique100M(t *testing.T) { 101 | file, err := os.Open(testfile) 102 | if err != nil { 103 | log.Fatal(err) 104 | } 105 | findUnique(io.LimitReader(file, 100*Mega), 50*Mega) 106 | } 107 | func TestFindUnique1G(t *testing.T) { 108 | file, err := os.Open(testfile) 109 | if err != nil { 110 | log.Fatal(err) 111 | } 112 | findUnique(io.LimitReader(file, 1000*Mega), 500*Mega) 113 | } 114 | 115 | func TestFindUnique10G(t *testing.T) { 116 | file, err := os.Open(testfile) 117 | if err != nil { 118 | log.Fatal(err) 119 | } 120 | findUnique(io.LimitReader(file, 10000*Mega), 2000*Mega) 121 | } 122 | -------------------------------------------------------------------------------- /main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "bytes" 6 | "container/heap" 7 | "encoding/binary" 8 | "fmt" 9 | "io" 10 | "log" 11 | "math" 12 | "os" 13 | "runtime" 14 | "sort" 15 | "sync" 16 | ) 17 | 18 | //////////////////////////////////////////////////////////////////////////////// 19 | // pre-processing stage 20 | // an in-memory sorter 21 | 22 | // key-value construction 23 | // rawEntry binary format: 24 | // bytesSize uint32 25 | // bytesPtr uint32 26 | type rawEntry []byte 27 | 28 | const entrySize = 8 29 | 30 | func (e rawEntry) sz() uint32 { return binary.LittleEndian.Uint32(e[:]) } 31 | func (e rawEntry) ptr() uint32 { return binary.LittleEndian.Uint32(e[4:]) } 32 | 33 | // value binds to specific bytes buffer 34 | func (e rawEntry) value(buf []byte) rawValue { return 
buf[e.ptr():][:e.sz()] }

// rawValue binary layout:
//	ord  uint64  (original record order of the line)
//	line []byte
type rawValue []byte

func (v rawValue) ord() uint64   { return binary.LittleEndian.Uint64(v[:]) }
func (v rawValue) bytes() []byte { return v[8:] }

// entry is the decoded, object form of a rawValue.
type entry struct {
	bts []byte
	ord uint64 // data order
}

// dataSet packs many variable-sized records into a single fixed buffer,
// bounding memory usage. Index entries grow upward from the front while
// record payloads grow downward from the back:
//	[key0,key1,...keyN, ......, valueN, ...., value0]
type dataSet struct {
	buf []byte

	idxWritten  int
	dataWritten int
	dataPtr     int // first byte of the data region; dataPtr-1 is the next writable byte
	idxPtr      int // offset of the next free index slot

	swapbuf [entrySize]byte
}

// newDataSet returns a dataSet backed by a buffer of sz bytes.
func newDataSet(sz int) *dataSet {
	return &dataSet{buf: make([]byte, sz), dataPtr: sz}
}

// Add stores bts with its record order, reporting false when the set has
// no room left for the payload plus its index entry.
func (s *dataSet) Add(bts []byte, ord uint64) bool {
	need := len(bts) + 8 // 8 extra bytes carry ord
	if s.idxWritten+s.dataWritten+need+entrySize > len(s.buf) {
		return false
	}

	// payload: written downward from the end of buf
	s.dataPtr -= need
	s.dataWritten += need
	binary.LittleEndian.PutUint64(s.buf[s.dataPtr:], ord)
	copy(s.buf[s.dataPtr+8:], bts)

	// index: written upward from the start of buf
	binary.LittleEndian.PutUint32(s.buf[s.idxPtr:], uint32(need))
	binary.LittleEndian.PutUint32(s.buf[s.idxPtr+4:], uint32(s.dataPtr))
	s.idxPtr += entrySize
	s.idxWritten += entrySize

	return true
}

// Reset returns the set to its initial, empty state so the buffer can be
// reused.
func (s *dataSet) Reset() {
	s.dataPtr = len(s.buf)
	s.dataWritten = 0
	s.idxPtr = 0
	s.idxWritten = 0
}

// e returns the ith index entry in binary form.
func (s *dataSet) e(i int) rawEntry {
	return rawEntry(s.buf[i*entrySize:][:entrySize])
}

// get returns the ith element decoded into object form.
func (s *dataSet) get(i int) entry {
	v := s.e(i).value(s.buf)
	return entry{v.bytes(), v.ord()}
}

// sort.Interface over the index entries; swapping moves only the 8-byte
// index records, never the payloads.
func (s *dataSet) Len() int { return s.idxWritten / entrySize }

func (s *dataSet) Less(i, j int) bool {
	a := s.e(i).value(s.buf)
	b := s.e(j).value(s.buf)
	return bytes.Compare(a.bytes(), b.bytes()) < 0
}

func (s *dataSet) Swap(i, j int) {
	copy(s.swapbuf[:], s.e(i))
	copy(s.e(i), s.e(j))
	copy(s.e(j), s.swapbuf[:])
}

// dataSetReader is a cursor over one sorted dataSet, used during heap
// aggregation.
type dataSetReader struct {
	set  *dataSet
	head int
	elem entry
}

// newDataSetReader returns a cursor positioned at the first element, or
// nil when the set is empty.
func newDataSetReader(set *dataSet) *dataSetReader {
	if set.Len() == 0 {
		return nil
	}
	return &dataSetReader{set: set, elem: set.get(0)}
}

// next advances the cursor, reporting false once the set is exhausted.
func (dr *dataSetReader) next() bool {
	dr.head++
	if dr.head >= dr.set.Len() {
		return false
	}
	dr.elem = dr.set.get(dr.head)
	return true
}

// memSortAggregator is a min-heap of dataSetReaders ordered by the bytes
// of their current element, used to k-way merge sorted sets in memory.
type memSortAggregator struct {
	sets []*dataSetReader
}

func (a *memSortAggregator) Len() int { return len(a.sets) }
func (a *memSortAggregator) Less(i, j int) bool {
	return bytes.Compare(a.sets[i].elem.bts, a.sets[j].elem.bts) < 0
}
func (a *memSortAggregator) Swap(i, j int)      { a.sets[i], a.sets[j] = a.sets[j], a.sets[i] }
func (a *memSortAggregator) Push(x interface{}) { a.sets = append(a.sets, x.(*dataSetReader)) }
func (a *memSortAggregator) Pop() interface{} {
	last := len(a.sets) - 1
	x := a.sets[last]
	a.sets = a.sets[:last]
	return x
}

// memory bounded sorter for big data
type sorter struct {
	sets    []*dataSet
	free    []*dataSet
	setSize int
	limit   int // max total memory usage for sorting
}

// a mapper defines a mapping for `entry` to
bytes
// Map transforms one entry to its serialized output (nil to emit
// nothing); End flushes any pending state at the end of the stream.
type Mapper interface {
	Map(entry) []byte
	End() []byte
}

// Len reports the total number of elements buffered across all sets.
func (h *sorter) Len() int {
	n := 0
	for k := range h.sets {
		n += h.sets[k].Len()
	}
	return n
}

// Map sorts every buffered set in parallel, k-way merges them, feeds the
// merged stream through mapper, writes the results to w, and recycles the
// sets for reuse.
func (h *sorter) Map(w io.Writer, mapper Mapper) {
	if len(h.sets) == 0 {
		return
	}

	// sort the sets in parallel
	wg := new(sync.WaitGroup)
	for k := range h.sets {
		log.Println("sorting sets#", k, "element count:", h.sets[k].Len())
		wg.Add(1)
		go func(set *dataSet) {
			sort.Sort(set)
			wg.Done()
		}(h.sets[k])
	}
	wg.Wait()

	log.Println("merging sorted sets to file")
	agg := new(memSortAggregator)
	for k := range h.sets {
		// BUGFIX: newDataSetReader returns nil for an empty set; pushing
		// nil into the heap would panic inside Less.
		if esr := newDataSetReader(h.sets[k]); esr != nil {
			heap.Push(agg, esr)
		}
	}

	written := 0
	for agg.Len() > 0 {
		esr := heap.Pop(agg).(*dataSetReader)
		if r := mapper.Map(esr.elem); r != nil {
			w.Write(r)
			written++
		}
		if esr.next() {
			heap.Push(agg, esr)
		}
	}
	if r := mapper.End(); r != nil {
		w.Write(r)
		written++
	}

	log.Println("written", written, "elements")
	for k := range h.sets {
		h.sets[k].Reset()
	}
	h.free = h.sets
	h.sets = nil
}

// allocateNewSet takes a recycled set from the free list when available,
// otherwise allocates a fresh one, and appends it to the active sets.
func (h *sorter) allocateNewSet() *dataSet {
	var newSet *dataSet
	if last := len(h.free) - 1; last >= 0 {
		newSet = h.free[last]
		h.free = h.free[:last]
	} else {
		newSet = newDataSet(h.setSize)
	}
	h.sets = append(h.sets, newSet)
	return newSet
}

// Add buffers one record, reporting false when the memory limit is
// reached (caller should dump to disk and retry) or when a single record
// cannot fit into an empty set.
func (h *sorter) Add(bts []byte, ord uint64) bool {
	if len(h.sets) == 0 {
		h.allocateNewSet()
	}
	set := h.sets[len(h.sets)-1]
	if !set.Add(bts, ord) {
		if h.setSize*(len(h.sets)+1) > h.limit { // limit reached
			return false
		}
		// BUGFIX: the result of Add on the fresh set was ignored; a record
		// larger than setSize would have been silently dropped.
		return h.allocateNewSet().Add(bts, ord)
	}
	return true
}

// init configures the sorter for a total memory budget, splitting it into
// one set per CPU so sets can be sorted in parallel.
func (h *sorter) init(limit int) {
	h.limit = limit
	h.setSize = limit / runtime.NumCPU()

	// make sure one set is not larger than MaxUint32, since dataSet uses
	// 32-bit sizes and offsets in its index entries
	if h.setSize > math.MaxUint32 {
		h.setSize = math.MaxUint32
	}
}

// sort2Disk streams r through the sorter with a bounded memory budget,
// mapping each sorted chunk to a file "partN.dat"; it returns the number
// of part files written.
func sort2Disk(r io.Reader, memLimit int, mapper Mapper) int {
	h := new(sorter)
	h.init(memLimit)
	var ord uint64
	parts := 0

	log.Println("beginning sort with memory limited to:", memLimit, "bytes")
	// file based serialization of the current in-memory state
	fileDump := func(hp *sorter, path string) {
		f, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0755)
		if err != nil {
			log.Fatal(err)
		}
		bufw := bufio.NewWriterSize(f, 1<<20)
		hp.Map(bufw, mapper)
		// BUGFIX: the Flush error was ignored, so a full disk could
		// silently truncate a part file.
		if err := bufw.Flush(); err != nil {
			log.Fatal(err)
		}
		if err := f.Close(); err != nil {
			log.Fatal(err)
		}
	}

	scanner := bufio.NewScanner(r)
	scanner.Split(bufio.ScanWords)
	for scanner.Scan() {
		if !h.Add(scanner.Bytes(), ord) {
			fileDump(h, fmt.Sprintf("part%v.dat", parts))
			log.Println("chunk#", parts, "written")
			parts++
			// BUGFIX: the retry's result was ignored; fail loudly when a
			// single record cannot fit even into an empty sorter.
			if !h.Add(scanner.Bytes(), ord) {
				log.Fatal("record too large for memory limit:", len(scanner.Bytes()), "bytes")
			}
		}
		ord++
	}

	if err := scanner.Err(); err != nil {
		// BUGFIX: the underlying error was dropped from the message
		log.Fatal("error reading from source:", err)
	}

	if h.Len() > 0 {
		fileDump(h, fmt.Sprintf("part%v.dat", parts))
		log.Println("chunk#", parts, "written")
		parts++
	}
	return parts
}

////////////////////////////////////////////////////////////////////////////////
// disk streaming stage

// countedEntry binary layout (after the 4-byte size prefix is stripped by
// streamReader):
//	ord uint64
//	cnt uint64
//	bts []byte
type countedEntry []byte

func (c countedEntry) bytes() []byte { return c[16:] }
func (c countedEntry) ord() uint64   { return binary.LittleEndian.Uint64(c) }
func (c countedEntry) cnt() uint64   { return binary.LittleEndian.Uint64(c[8:]) }
329 | 330 | type streamReader struct { 331 | r io.Reader 332 | szbuf [4]byte 333 | buf []byte 334 | } 335 | 336 | func (sr *streamReader) head() []byte { return sr.buf } 337 | 338 | func (sr *streamReader) next() bool { 339 | _, err := io.ReadFull(sr.r, sr.szbuf[:]) 340 | if err != nil { 341 | return false 342 | } 343 | sz := binary.LittleEndian.Uint32(sr.szbuf[:]) 344 | if cap(sr.buf) < int(sz) { 345 | sr.buf = make([]byte, sz) 346 | } else { 347 | sr.buf = sr.buf[:sz] 348 | } 349 | _, err = io.ReadFull(sr.r, sr.buf) 350 | if err != nil { 351 | return false 352 | } 353 | 354 | return true 355 | } 356 | 357 | func newStreamReader(r io.Reader) *streamReader { 358 | sr := new(streamReader) 359 | sr.r = bufio.NewReader(r) 360 | if sr.next() { 361 | return sr 362 | } 363 | return nil 364 | } 365 | 366 | // streamAggregator always pop the min string 367 | type lessComparator func([]byte, []byte) bool 368 | type streamAggregator struct { 369 | entries []*streamReader 370 | less lessComparator 371 | } 372 | 373 | func newStreamAggregator(less lessComparator) *streamAggregator { 374 | agg := new(streamAggregator) 375 | agg.less = less 376 | return agg 377 | } 378 | 379 | func (h *streamAggregator) Len() int { return len(h.entries) } 380 | func (h *streamAggregator) Less(i, j int) bool { 381 | return h.less(h.entries[i].head(), h.entries[j].head()) 382 | } 383 | func (h *streamAggregator) Swap(i, j int) { h.entries[i], h.entries[j] = h.entries[j], h.entries[i] } 384 | func (h *streamAggregator) Push(x interface{}) { h.entries = append(h.entries, x.(*streamReader)) } 385 | func (h *streamAggregator) Pop() interface{} { 386 | n := len(h.entries) 387 | x := h.entries[n-1] 388 | h.entries = h.entries[0 : n-1] 389 | return x 390 | } 391 | 392 | // define a mapping function for counting 393 | type countMapper struct { 394 | last entry 395 | lastCnt uint64 396 | hasLast bool 397 | buf []byte 398 | } 399 | 400 | func (m *countMapper) prepareBuffer(sz int) { 401 | if cap(m.buf) < sz 
{ 402 | m.buf = make([]byte, sz) 403 | } else { 404 | m.buf = m.buf[:sz] 405 | } 406 | } 407 | 408 | func (m *countMapper) writeLast() { 409 | // output format 410 | // size - 32bit 411 | // ord 64bit 412 | // cnt 64bit 413 | // bts (size - 16) 414 | sz := len(m.last.bts) + 8 + 8 415 | m.prepareBuffer(sz + 4) 416 | binary.LittleEndian.PutUint32(m.buf, uint32(sz)) 417 | binary.LittleEndian.PutUint64(m.buf[4:], m.last.ord) 418 | binary.LittleEndian.PutUint64(m.buf[12:], m.lastCnt) 419 | copy(m.buf[20:], m.last.bts) 420 | } 421 | 422 | func (m *countMapper) Map(e entry) (ret []byte) { 423 | if !m.hasLast { 424 | m.lastCnt = 1 425 | m.hasLast = true 426 | m.last = e 427 | return nil 428 | } 429 | 430 | if bytes.Compare(e.bts, m.last.bts) == 0 { // counting 431 | m.lastCnt++ 432 | } else { 433 | m.writeLast() 434 | m.last = e 435 | m.lastCnt = 1 436 | return m.buf 437 | } 438 | return nil 439 | } 440 | 441 | func (m *countMapper) End() (ret []byte) { 442 | if !m.hasLast { 443 | return nil 444 | } 445 | m.writeLast() 446 | return m.buf 447 | } 448 | 449 | // Reducer interface 450 | type Reducer interface { 451 | Reduce(countedEntry) 452 | End() 453 | } 454 | 455 | type uniqueReducer struct { 456 | target countedEntry 457 | last countedEntry 458 | count uint64 459 | hasUnique bool 460 | hasLast bool 461 | } 462 | 463 | func (r *uniqueReducer) checkTarget() { 464 | if r.count == 1 { 465 | if !r.hasUnique { 466 | r.target = r.deepcopy(r.last) 467 | r.hasUnique = true 468 | } else if r.last.ord() < r.target.ord() { 469 | r.target = r.deepcopy(r.last) 470 | } 471 | } 472 | } 473 | 474 | func (r *uniqueReducer) deepcopy(e1 countedEntry) countedEntry { 475 | e2 := make([]byte, len(e1)) 476 | copy(e2, e1) 477 | return e2 478 | } 479 | 480 | func (r *uniqueReducer) updateLast(e countedEntry) { 481 | sz := len(e) 482 | if sz > cap(r.last) { 483 | r.last = make([]byte, sz) 484 | } else { 485 | r.last = r.last[:sz] 486 | } 487 | copy(r.last, e) 488 | } 489 | 490 | func (r 
*uniqueReducer) Reduce(e countedEntry) {
	if !r.hasLast {
		r.updateLast(e)
		r.hasLast = true
		// BUGFIX: r.count was left at 0 for the first record, so its own
		// count was dropped and a unique string sorting first (e.g. input
		// "a b b") was never detected.
		r.count = e.cnt()
	} else if bytes.Equal(r.last.bytes(), e.bytes()) {
		// same string seen in another part: accumulate its count
		r.count += e.cnt()
	} else {
		r.checkTarget()
		r.updateLast(e)
		r.count = e.cnt()
	}
}

// End finalizes the reduction, checking the last accumulated group.
func (r *uniqueReducer) End() {
	r.checkTarget()
}

// Reduce k-way merges the sorted part files "part0.dat"..."part{N-1}.dat"
// and feeds the merged, globally sorted stream of countedEntry records to
// reducer r.
func Reduce(parts int, r Reducer) {
	files := make([]*os.File, parts)

	less := func(left []byte, right []byte) bool {
		return bytes.Compare(countedEntry(left).bytes(), countedEntry(right).bytes()) < 0
	}
	h := newStreamAggregator(less)
	for i := 0; i < parts; i++ {
		f, err := os.Open(fmt.Sprintf("part%v.dat", i))
		if err != nil {
			log.Fatal(err)
		}
		files[i] = f
		// newStreamReader returns nil for an empty part; skip it
		if sr := newStreamReader(bufio.NewReaderSize(f, 1<<20)); sr != nil {
			heap.Push(h, sr)
		}
	}

	for h.Len() > 0 {
		sr := heap.Pop(h).(*streamReader)
		r.Reduce(sr.head())
		if sr.next() {
			heap.Push(h, sr)
		}
	}
	r.End()

	for _, f := range files {
		if err := f.Close(); err != nil {
			log.Fatal(err)
		}
	}
}

// findUnique reads from r with the given memory limit and tries to find
// the first unique string in the input.
func findUnique(r io.Reader, memLimit int) {
	// step 1: sort into on-disk chunks (mapping stage)
	parts := sort2Disk(r, memLimit, new(countMapper))
	log.Println("Generated", parts, "parts")
	// step 2: merge all sstables into one continuous sorted stream
	log.Println("Reducing from#", parts, "sstable(s)")
	reducer := new(uniqueReducer)
	Reduce(parts, reducer)

	if reducer.hasUnique {
		log.Println("Found the first unique element:", string(reducer.target.bytes()), reducer.target.ord())
	} else {
		log.Println("Unique element not found!")
	}
}