├── .gitignore
├── 1984.txt
├── debug.go
├── decodeWorker.go
├── decodeWorker_test.go
├── encodeWorker.go
├── encodeWorker_test.go
├── main.go
├── main_test.go
├── readme.md
├── util.go
├── util_test.go
├── wordsMap.go
└── wordsMap_test.go


/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | *.txt
3 | tmp-*
4 | !1984.txt


--------------------------------------------------------------------------------
/debug.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"log"
 5 | 	"runtime"
 6 | )
 7 | 
 8 | var DEBUG bool
 9 | 
10 | func debug(f func()) {
11 | 	if DEBUG {
12 | 		f()
13 | 	}
14 | }
15 | 
// lastTotalFreed remembers the value of TotalAlloc - Alloc observed by the
// previous printMemStats call.
var lastTotalFreed uint64

// printMemStats logs a snapshot of the runtime memory statistics, scaled to
// MB (NumGC is logged as a plain count). "Just Freed" is the growth of
// TotalAlloc - Alloc since the previous call.
func printMemStats() {
	const bytesPerMB uint64 = 1024 * 1024

	var stats runtime.MemStats
	runtime.ReadMemStats(&stats)

	freedSoFar := stats.TotalAlloc - stats.Alloc
	log.Printf("Alloc = %v TotalAlloc = %v  Just Freed = %v Sys = %v NumGC = %v\n",
		stats.Alloc/bytesPerMB,
		stats.TotalAlloc/bytesPerMB,
		(freedSoFar-lastTotalFreed)/bytesPerMB,
		stats.Sys/bytesPerMB,
		stats.NumGC)

	lastTotalFreed = freedSoFar
}
27 | 


--------------------------------------------------------------------------------
/decodeWorker.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"bufio"
 5 | 	"encoding/gob"
 6 | 	"log"
 7 | 	"os"
 8 | 	"strconv"
 9 | )
10 | 
// DecodeWorker holds the opened slice files together with one gob decoder
// per file; entries at the same index belong to the same file.
type DecodeWorker struct {
	// files are the opened slice files; CloseAllFiles closes them.
	files    []*os.File
	// decoders read gob records from the file at the same index.
	decoders []*gob.Decoder
}
15 | 
16 | func NewDecodeWorker(nSlice int) *DecodeWorker {
17 | 	files := make([]*os.File, nSlice)
18 | 	decoders := make([]*gob.Decoder, nSlice)
19 | 	for i := 0; i < nSlice; i++ {
20 | 		f, err := os.Open("tmp-" + strconv.Itoa(i))
21 | 		if err != nil {
22 | 			log.Fatal(err)
23 | 		}
24 | 		files[i] = f
25 | 		decoders[i] = gob.NewDecoder(bufio.NewReader(f))
26 | 	}
27 | 	return &DecodeWorker{
28 | 		files,
29 | 		decoders,
30 | 	}
31 | }
32 | 
33 | func (dw *DecodeWorker) CloseAllFiles() error {
34 | 	for _, f := range dw.files {
35 | 		err := f.Close()
36 | 		if err != nil {
37 | 			return err
38 | 		}
39 | 	}
40 | 	return nil
41 | }
42 | 


--------------------------------------------------------------------------------
/decodeWorker_test.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"os"
 5 | 	"testing"
 6 | )
 7 | 
 8 | func TestNewDecodeWorker(t *testing.T) {
 9 | 	os.Create("tmp-0")
10 | 	decWorker := NewDecodeWorker(1)
11 | 	decCount := len(decWorker.decoders)
12 | 	if decCount != 1 {
13 | 		t.Errorf("wrong decoder count, got %d, want %d.\n", decCount, 1)
14 | 	}
15 | }
16 | 
17 | func TestDecodeWorker(t *testing.T) {
18 | 	wm := WordsMap{}
19 | 	wm.Add("bb", 1)
20 | 	wm.Add("bc", 2)
21 | 	wm.Add("dd", 3)
22 | 	encWorker := NewEncodeWorker(2)
23 | 	must(encWorker.SaveWordsMap(&wm))
24 | 	must(encWorker.FlushAll())
25 | 	decWorker := NewDecodeWorker(2)
26 | 	err := decWorker.CloseAllFiles()
27 | 	if err != nil {
28 | 		t.Error(err)
29 | 	}
30 | }
31 | 


--------------------------------------------------------------------------------
/encodeWorker.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"bufio"
 5 | 	"encoding/gob"
 6 | 	"fmt"
 7 | 	"log"
 8 | 	"os"
 9 | 	"strconv"
10 | )
11 | 
// EncodeWorker bundles the slice files with the buffered writers and gob
// encoders layered on top of them; entries at the same index belong together.
type EncodeWorker struct {
	// files are the created slice files; CloseAllFile closes them.
	files    []*os.File
	// writers buffer output for the file at the same index; FlushAll flushes them.
	writers  []*bufio.Writer
	// encoders write gob records into the writer at the same index.
	encoders []*gob.Encoder
	// nSlice is the slice count used as the modulus when picking an encoder.
	nSlice   int
}
18 | 
19 | func NewEncodeWorker(nSlice int) *EncodeWorker {
20 | 	// Initialize all required resources
21 | 	files := make([]*os.File, nSlice)
22 | 	writers := make([]*bufio.Writer, nSlice)
23 | 	encoders := make([]*gob.Encoder, nSlice)
24 | 	for i := 0; i < nSlice; i++ {
25 | 		f, err := os.Create("tmp-" + strconv.Itoa(i))
26 | 		if err != nil {
27 | 			log.Fatal(err)
28 | 		}
29 | 		files[i] = f
30 | 		writers[i] = bufio.NewWriter(f)
31 | 		encoders[i] = gob.NewEncoder(writers[i])
32 | 	}
33 | 	return &EncodeWorker{
34 | 		files,
35 | 		writers,
36 | 		encoders,
37 | 		nSlice,
38 | 	}
39 | }
40 | 
41 | // SaveWordsMap map and save current wordsMap to different file slices
42 | // through hash function and remainder operation
43 | func (ew *EncodeWorker) SaveWordsMap(wordsMap *WordsMap) error {
44 | 	for word, ci := range *wordsMap {
45 | 		idx := ihash(word) % ew.nSlice
46 | 		wordDict := WordDict{word, ci}
47 | 		err := ew.encoders[idx].Encode(wordDict)
48 | 		if err != nil {
49 | 			return fmt.Errorf("Failed to encode WordDict. %T:%v\n", err, err)
50 | 		}
51 | 	}
52 | 	return nil
53 | }
54 | 
55 | func (ew *EncodeWorker) FlushAll() error {
56 | 	for _, w := range ew.writers {
57 | 		if err := w.Flush(); err != nil {
58 | 			return err
59 | 		}
60 | 	}
61 | 	return nil
62 | }
63 | 
64 | func (ew *EncodeWorker) CloseAllFile() error {
65 | 	for _, f := range ew.files {
66 | 		err := f.Close()
67 | 		if err != nil {
68 | 			return err
69 | 		}
70 | 	}
71 | 	return nil
72 | }
73 | 


--------------------------------------------------------------------------------
/encodeWorker_test.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"os"
 5 | 	"testing"
 6 | )
 7 | 
 8 | func TestNewEncodeWorker(t *testing.T) {
 9 | 	os.Create("input_test.txt")
10 | 	NewEncodeWorker(1)
11 | 	if !isFileExist("tmp-0") {
12 | 		t.Errorf("Failed to create a temp file")
13 | 	}
14 | }
15 | 
16 | func TestEncodeWorker(t *testing.T) {
17 | 	wm := WordsMap{}
18 | 	wm.Add("bb", 1)
19 | 	wm.Add("bc", 2)
20 | 	wm.Add("dd", 3)
21 | 	encWorker := NewEncodeWorker(2)
22 | 	t.Run("save WordsMap", func(t *testing.T) {
23 | 		err := encWorker.SaveWordsMap(&wm)
24 | 		if err != nil {
25 | 			t.Error(err)
26 | 		}
27 | 	})
28 | 	t.Run("flush all", func(t *testing.T) {
29 | 		err := encWorker.FlushAll()
30 | 		if err != nil {
31 | 			t.Error(err)
32 | 		}
33 | 	})
34 | }
35 | 
// isFileExist reports whether os.Stat on filename fails with anything other
// than a "does not exist" error; any other Stat outcome counts as existing.
func isFileExist(filename string) bool {
	_, err := os.Stat(filename)
	return !os.IsNotExist(err)
}
42 | 


--------------------------------------------------------------------------------
/main.go:
--------------------------------------------------------------------------------
  1 | package main
  2 | 
  3 | import (
  4 | 	"bufio"
  5 | 	"encoding/gob"
  6 | 	"flag"
  7 | 	"fmt"
  8 | 	"github.com/pkg/errors"
  9 | 	"io"
 10 | 	"log"
 11 | 	_ "net/http/pprof"
 12 | 	"os"
 13 | )
 14 | 
 15 | var inputFlag = flag.String("input", "input.txt", "path to the input file")
 16 | var nSlice = flag.Int("count", 10, "number of the slice files")
 17 | var defaultMapLen = flag.Int("maplen", 10000, "default length of Map")
 18 | 
 19 | func main() {
 20 | 	// For debugging, you can set an environment variable before running:
 21 | 	// GODEBUG="gctrace=1"
 22 | 	// or uncomment the following line (see debug.go for more detail):
 23 | 	// DEBUG = true
 24 | 
 25 | 	flag.Parse()
 26 | 	fmt.Println("------------------ step 1: split input file -------------------------")
 27 | 
 28 | 	seqTotal, err := SplitInput(*inputFlag, *nSlice)
 29 | 	if err != nil {
 30 | 		log.Fatalf("Failed to split input file. %T:%v\n", err, err)
 31 | 	}
 32 | 
 33 | 	fmt.Println("------------ step 2: find the first non-repeating word --------------")
 34 | 
 35 | 	decWorker := NewDecodeWorker(*nSlice)
 36 | 	defer decWorker.CloseAllFiles()
 37 | 
 38 | 	globalFirstWord := WordDict{"", CountIndex{0, seqTotal}}
 39 | 
 40 | 	// Read each slice file in turn and rebuild hashmap
 41 | 	for _, decoder := range decWorker.decoders {
 42 | 		UniqueWordsMap := BuildUniqueWordsMap(decoder)
 43 | 
 44 | 		// find the word with minimum sequence number in a slice file
 45 | 		firstWord := UniqueWordsMap.FindMinSeqWord(seqTotal)
 46 | 
 47 | 		// find the word with minimum sequence number in each slice file
 48 | 		if firstWord.Seq < globalFirstWord.Seq {
 49 | 			globalFirstWord = firstWord
 50 | 		}
 51 | 
 52 | 		UniqueWordsMap = nil
 53 | 		debug(printMemStats)
 54 | 	}
 55 | 
 56 | 	fmt.Println(globalFirstWord)
 57 | }
 58 | 
 59 | // SplitInput read each word in the file,
 60 | // store it in hashmap,
 61 | // and write it into different slice file.
 62 | func SplitInput(filename string, nSlice int) (int, error) {
 63 | 	encWorker := NewEncodeWorker(nSlice)
 64 | 	defer encWorker.CloseAllFile()
 65 | 	defer encWorker.FlushAll()
 66 | 
 67 | 	f, err := os.Open(filename)
 68 | 	if err != nil {
 69 | 		return -1, errors.New("Failed to read file")
 70 | 	}
 71 | 	defer closeFile(f)
 72 | 	r := bufio.NewReader(f)
 73 | 
 74 | 	fi, err := f.Stat()
 75 | 	totalSize := int(fi.Size())
 76 | 	sliceSize := totalSize / nSlice
 77 | 	currentSize := 0
 78 | 
 79 | 	wordsMap := make(WordsMap, *defaultMapLen)
 80 | 	byts := make([]byte, 0, 32)
 81 | 	seq := 0 // sequence number of each word
 82 | 
 83 | 	for {
 84 | 		b, err := r.ReadByte()
 85 | 		if err != nil {
 86 | 			if err == io.EOF {
 87 | 				// the input file has been read out
 88 | 				break
 89 | 			} else {
 90 | 				return -1, err
 91 | 			}
 92 | 		}
 93 | 		if isLetter(b) {
 94 | 			byts = append(byts, b)
 95 | 		} else {
 96 | 			if len(byts) != 0 {
 97 | 				// Save the word to WordMap
 98 | 				word := string(byts)
 99 | 				seq++
100 | 				wordsMap.Add(word, seq)
101 | 				byts = byts[:0]
102 | 			}
103 | 		}
104 | 		if currentSize > sliceSize {
105 | 			// for avoiding memory limit exceeded,
106 | 			// save WordsMap to disk and free up memory as needed
107 | 			must(encWorker.SaveWordsMap(&wordsMap))
108 | 			debug(printMemStats)
109 | 			wordsMap = make(WordsMap, *defaultMapLen)
110 | 			currentSize = 0
111 | 		}
112 | 		currentSize++
113 | 	}
114 | 	must(encWorker.SaveWordsMap(&wordsMap))
115 | 	debug(printMemStats)
116 | 	return seq, nil
117 | }
118 | 
119 | // BuildUniqueWordsMap read all data in a slice file,
120 | // merge duplicates and rebuild the WordsMap with unique words
121 | func BuildUniqueWordsMap(dec *gob.Decoder) *WordsMap {
122 | 	uniqueWordsMap := make(WordsMap, *defaultMapLen)
123 | 	for {
124 | 		wd := WordDict{}
125 | 		err := dec.Decode(&wd)
126 | 		if err != nil {
127 | 			if err == io.EOF {
128 | 				break
129 | 			} else {
130 | 				log.Fatalf("Failed to read tmp file.%T:%v/n", err, err)
131 | 			}
132 | 		}
133 | 
134 | 		if ci, ok := uniqueWordsMap[wd.Word]; ok {
135 | 			uniqueWordsMap[wd.Word] = CountIndex{wd.Count + ci.Count, ci.Seq}
136 | 		} else {
137 | 			uniqueWordsMap[wd.Word] = CountIndex{wd.Count, wd.Seq}
138 | 		}
139 | 	}
140 | 	return &uniqueWordsMap
141 | }
142 | 


--------------------------------------------------------------------------------
/main_test.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"bufio"
 5 | 	"bytes"
 6 | 	"encoding/gob"
 7 | 	"log"
 8 | 	"os"
 9 | 	"reflect"
10 | 	"testing"
11 | )
12 | 
// permute writes every permutation of chars to w, each one prefixed with curr
// and followed by a single space. Permutations come out in the order obtained
// by picking the remaining characters from left to right.
// Write errors are not checked here.
func permute(chars []byte, curr []byte, w *bufio.Writer) {
	if len(chars) == 1 {
		w.Write(append(curr, chars[0]))
		w.WriteByte(' ')
		return
	}

	for i := range chars {
		// rest is a private copy of chars without element i, so the
		// recursion never touches the caller's slice.
		rest := make([]byte, 0, len(chars)-1)
		rest = append(rest, chars[:i]...)
		rest = append(rest, chars[i+1:]...)
		permute(rest, append(curr, chars[i]), w)
	}
}
29 | 
30 | // Outputs the full arrangement of the specified
31 | // character slice into the input file
32 | func createTestInput(chars []byte) {
33 | 	f, err := os.Create("input_test.txt")
34 | 	defer f.Close()
35 | 	if err != nil {
36 | 		log.Fatal(err)
37 | 	}
38 | 	w := bufio.NewWriter(f)
39 | 	permute(chars, []byte{}, w)
40 | 	must(w.Flush())
41 | }
42 | 
43 | func clearTestInput() {
44 | 	must(os.Remove("input_test.txt"))
45 | }
46 | 
47 | func TestSplitInput(t *testing.T) {
48 | 	createTestInput([]byte("ABCDEFGHI"))
49 | 	defer clearTestInput()
50 | 	nSlice := 3
51 | 	seqTotal, err := SplitInput("input_test.txt", nSlice)
52 | 	if err != nil {
53 | 		t.Error(err)
54 | 	}
55 | 	expectSeqTotal := 362880
56 | 	if seqTotal != expectSeqTotal {
57 | 		t.Errorf("seq total wrong, got %d, want %d", seqTotal, expectSeqTotal)
58 | 	}
59 | }
60 | 
61 | func TestBuildUniqueWordsMap(t *testing.T) {
62 | 	b := bytes.NewBuffer([]byte{})
63 | 	w := bufio.NewWriter(b)
64 | 	enc := gob.NewEncoder(w)
65 | 	must(enc.Encode(WordDict{"bb", CountIndex{1, 1}}))
66 | 	must(enc.Encode(WordDict{"bc", CountIndex{2, 2}}))
67 | 	must(enc.Encode(WordDict{"bb", CountIndex{1, 3}}))
68 | 	must(enc.Encode(WordDict{"ca", CountIndex{1, 4}}))
69 | 	must(w.Flush())
70 | 	dec := gob.NewDecoder(bufio.NewReader(b))
71 | 	uniqueWordsMap := BuildUniqueWordsMap(dec)
72 | 	expect := WordsMap{}
73 | 	expect["bb"] = CountIndex{2, 1}
74 | 	expect["ca"] = CountIndex{1, 4}
75 | 	expect["bc"] = CountIndex{2, 2}
76 | 
77 | 	if !reflect.DeepEqual(*uniqueWordsMap, expect) {
78 | 		t.Errorf("got %+v, want %+v", *uniqueWordsMap, expect)
79 | 	}
80 | }
81 | 


--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
  1 | ## PingCAP 小作业 - 大文件中查找第一个非重复的单词
  2 | 
  3 | 开始时间 | 北京时间：2019 年 3 月 27 日，
  4 | 
  5 | 截止时间 | 北京时间：2019 年 4 月 3 日，
  6 | 
  7 | 作业要求：在 GitHub 上实现，截止时间前提交项目链接
  8 | 
  9 | 题目:
 10 | 
 11 | 有一个 100GB 的文件，里面内容是文本，要求：
 12 | 1. 找出第一个不重复的词
 13 | 2. 只允许扫一遍原文件
 14 | 3. 尽量少的 IO
 15 | 4. 内存限制 16G
 16 | 
 17 | 提示：
 18 | 
 19 | - 注意代码可读性，添加必要的注释（英文）
 20 | - 注意代码风格与规范，添加必要的单元测试和文档
 21 | - 注意异常处理，尝试优化性能
 22 | 
 23 | ### 解题思路
 24 | 
 25 | 1. 首先以字节流的方式读取 input 文件，将读到的每个词通过哈希运算和取余操作保存到不同的切片文件中。保存的同时附上单词的序号。
 26 | 2. 依次读取每个切片文件到内存中，利用 hashmap 统计每个词的出现频率。
 27 | 3. 在当前切片文件的出现次数为 1 的词中，找出最早出现的那一个词，即序号最小的词。
 28 | 4. 待所有切片文件读取完成后，比较并找出不同切片文件之间序号最小的词。
 29 | 
 30 | ### 算法分析
 31 | 
 32 | 算法需要在第一步读取整个 input 文件，然后写入到不同的切片文件中，在第二步依次读取所有切片文件，然后统计频率和找出序号最小的词。
 33 | 整体需要 2 次完整的文件读取，和 1 次完整的文件写入，一共三次 IO 操作。
 34 | 
 35 | ### 算法优化
 36 | 
 37 | 为了减少文件 IO，首先想到的方法是压缩切片文件，我使用了 golang 标准库中的 gob 序列化单词数据到切片文件中，可以获得较快的序列化速度和较好的压缩效果。
 38 | 
 39 | 在读取 input 文件的过程中，可以建立一个 hashmap，进行频率统计和单词去重的工作。当读了很多的内容时（快要超过内存限制时），将 hashmap 中的单词通过哈希函数序列化到磁盘上的不同切片文件中，再释放其占用的内存。重复这个过程直到读完。读取分片文件时，一次性读进内存，重新构建一个 hashmap，合并读到的相同的词。和原来直接写入每个单词的做法相比，多了在写入磁盘前使用 hashmap 进行去重的步骤，会极大地减少写入磁盘的数据量。
 40 | 
 41 | 最后，可以在一些细节上进行优化，比如创建的 Map 设定较大的 `len`，避免迭代过程中进行动态扩容。对迭代过程中临时变量（`[]byte` 类型）设定较大的 `cap` ，并且每次拿出字符串后就地清空，而不是重新赋值，这样做同样可以减少新申请内存和动态扩容带来的开销。
 42 | 
 43 | > PS: 新创建的 `Map` 所设定的 `len` 默认是 10000。我对一本文学著作（[《1984》乔治·奥威尔](https://zh.wikipedia.org/wiki/%E4%B8%80%E4%B9%9D%E5%85%AB%E5%9B%9B)）进行分析，发现**自然语言**组成的英文文本数据中，真正出现的英文单词一般不会超过 10000 个，因此取了 10000 作为 Map 默认长度。
 44 | 
 45 | ### 实际测试
 46 | 
 47 | #### 测试环境
 48 | 硬件： Intel i7-4700MQ / 8G RAM
 49 | 
 50 | 系统： Windows 10 / Golang 1.11
 51 | 
 52 | 测试集采用字母表前 11 个字母生成的全排列，总计约 4x10^7 个（11! = 39,916,800）不同的单词，每个单词长度为 11，文件大小约 400M。
 53 | 通过函数 [createTestInput](https://github.com/Deardrops/pingcapAssignment/blob/a504a8540b4d79711738c09b18cac19de9da4f8a/main_test.go#L32)  产生测试集文件。
 54 | 
 55 | 文件切片数量设置为 10。测试的方法见下方 debug 部分说明。
 56 | 
 57 | #### 测试结果
 58 | 
 59 | 下图显示了每次 GC 过程中内存的变化情况：
 60 | 
 61 | ![GC 过程中的内存分配情况](https://i.loli.net/2019/03/29/5c9e2be4a6042.png)
 62 | 
 63 | 从上图可以看出，整体堆大小一直维持在 2G 以内。每次 GC 都能回收掉一半左右的堆空间。为新建的对象腾出很多空间来。
 64 | 
 65 | 下图展示了每次**即将清理**超大对象时的内存分配情况（内存单位为 MB）：
 66 | 
 67 | ![完成大文件读写时的内存分配情况](https://i.loli.net/2019/03/29/5c9e2e86d6ea0.png)
 68 | 
 69 | 上图显示了程序的内存使用峰值情况，可以看出，程序的内存并不会无限增长，在堆上实际分配的内存最大在 1G 左右。
 70 | 每次 GC，几乎都能回收 500-1500MB 不等的空闲内存。向系统申请的总内存，不会超过 2.8GB。
 71 | 
 72 | 综上，通过分而治之的方法解决这个问题，效果拔群。
 73 | 
 74 | ### 进一步优化
 75 | 
 76 | 可以通过 pprof 工具对程序的 cpu 占用和 heap 使用情况进行详细分析，找到性能瓶颈进行进一步的优化。可能的几个优化方向有：
 77 | 
 78 | - 使用 goroutine 并发处理，可以充分利用多线程 CPU 资源。
 79 | - 采用 MapReduce 的思想，先将 input 文件分成几块，传到不同的机器上，在不同机器上进行 hash 映射分割，将切片文件传到不同的机器中，找各自切片中最早出现的非重复元素，最后合并结果即可。这样可以成倍地提升 CPU 资源和磁盘 IO 能力，但网络带宽可能成为新的瓶颈。
 80 | - 采取整存整取的策略（对于HDD磁盘），例如 Block 大小为 64MB，我们写入磁盘时尽量让数据量保持为 64MB 的整数倍，这样避免数据划分太细，被分散在磁盘的各个位置，读写时跨磁盘扇区导致的耗时过长问题。
 81 | - 重写一些占用内存较多的或者 CPU 开销较大的标准库函数，以提升性能。比如 strings.ToLower() 函数，参数为字符串变量（string），可以重写一个功能一样的函数，参数改为 `[]byte` 类型的变量，避免了进行类型转换，可以减少 CPU 和内存开销。
 82 | - 用 cgo 等手段，手动进行内存管理，可以避免 golang GC带来的开销（GC时会 `stop the world`）。例如，可以用 `unsafe.Pointer` 和 `reflect.MapHeader` 自定义一个类似 Map 的数据结构，并且手动为其分配内存和释放内存，达到最高的效率。
 83 | 
 84 | ### 使用指南
 85 | 
 86 | #### 下载仓库到本地
 87 | ```bash
 88 | go get github.com/Deardrops/pingcapAssignment
 89 | ```
 90 | #### 进入项目文件夹
 91 | ```bash
 92 | cd $GOPATH/src/github.com/Deardrops/pingcapAssignment
 93 | ```
 94 | #### 运行程序
 95 | ```bash
 96 | go run . --input=1984.txt --count=10 --maplen=10000
 97 | ```
 98 | ##### 参数说明（可以通过 `--help` 查看参数说明）
 99 | - `input`：input 文件的路径，默认为 `input.txt`
100 | - `count`：切片文件的数目，默认为 `10`
101 | - `maplen`：为每个新建的 Map 对象设定的 len，默认值为 `10000`
102 | #### 运行测试
103 | ```bash
104 | go test -v
105 | ```
106 | 本项目附有完整的单元测试。
107 | #### Debug
108 | 主要通过两种方式查看内存分配情况：
109 | 
110 | 一种是设定环境变量 `GODEBUG="gctrace=1"`，然后运行程序，可以看到每次 GC 时堆上的空间变化情况。
111 | 
112 | 另一种是在 main 函数中令全局变量 `DEBUG=true`，使用 golang 的 runtime 库查看内存分配情况。
113 | 
114 | ## 参考资料
115 | 
116 | 1. [通过 Go 语言学习测试驱动开发](https://studygolang.gitbook.io/learn-go-with-tests/)
117 | 2. [golang 如何排查和定位 GC 问题](https://my.oschina.net/u/3470972/blog/1609721)
118 | 3. [Golang 之 bytes.buffer](https://www.kancloud.cn/digest/batu-go/153538)
119 | 4. [Leetcode - 全排列问题](https://github.com/Deardrops/leetcode/tree/master/top-interview-questions-medium/backtracking/permutations)
120 | 5. [知乎 - golang 的 gc 如何处理 map](https://www.zhihu.com/question/65426766)
121 | 6. [golang 内存分析 / 动态追踪](https://lrita.github.io/2017/05/26/golang-memory-pprof/#go-tool)
122 | 7. [go pprof 性能分析](http://wudaijun.com/2018/04/go-pprof/)
123 | 8. [Golang 大杀器之性能剖析 PProf](https://segmentfault.com/a/1190000016412013)
124 | 9. [golang 开启 GODEBUG=gctrace=1 显示信息的含义](https://sheepbao.github.io/post/golang_debug_gctrace/)
125 | 10. [golang bufio、ioutil 读文件的速度比较（性能测试）和影响因素分析](https://segmentfault.com/a/1190000011680507)
126 | 


--------------------------------------------------------------------------------
/util.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"hash/fnv"
 5 | 	"log"
 6 | 	"os"
 7 | )
 8 | 
 9 | // Check if a byte is legal letter, which means in [a-zA-Z]
10 | func isLetter(b byte) bool {
11 | 	if b >= 'A' && b <= 'Z' {
12 | 		return true
13 | 	}
14 | 	if b >= 'a' && b <= 'z' {
15 | 		return true
16 | 	}
17 | 	return false
18 | }
19 | 
// ihash returns the 32-bit FNV-1a hash of s with the top bit cleared, so the
// result is never negative.
// The function is copied from lab1 (MapReduce) of MIT 6.824.
func ihash(s string) int {
	const positiveMask = 0x7fffffff

	hasher := fnv.New32a()
	hasher.Write([]byte(s))
	sum := hasher.Sum32()
	return int(sum & positiveMask)
}
26 | 
// must ends the process through log.Fatal when err is non-nil and does
// nothing otherwise.
func must(err error) {
	if err == nil {
		return
	}
	log.Fatal(err)
}
32 | 
// closeFile closes f and ends the process through log.Fatalf if Close
// reports an error.
func closeFile(f *os.File) {
	if err := f.Close(); err != nil {
		log.Fatalf("Failed to close file. %T:%v\n", err, err)
	}
}
39 | 


--------------------------------------------------------------------------------
/util_test.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import "testing"
 4 | 
 5 | func TestIsLetter(t *testing.T) {
 6 | 	areaTests := []struct {
 7 | 		in  uint8
 8 | 		out bool
 9 | 	}{
10 | 		{'3', false},
11 | 		{'z', true},
12 | 		{'+', false},
13 | 	}
14 | 
15 | 	for _, tt := range areaTests {
16 | 		got := isLetter(tt.in)
17 | 		if got != tt.out {
18 | 			t.Errorf("with %c, got %v, want %v\n", tt.in, got, tt.out)
19 | 		}
20 | 	}
21 | }
22 | 


--------------------------------------------------------------------------------
/wordsMap.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import "strings"
 4 | 
// WordDict pairs a word with its statistics. It is the record type that is
// gob-encoded into, and decoded from, the slice files.
type WordDict struct {
	// Word is the word itself.
	Word string
	// CountIndex is embedded, so Count and Seq are promoted to WordDict.
	CountIndex
}
11 | 
// CountIndex holds the two statistics tracked per word: how many times it
// occurred and the sequence number recorded for it.
type CountIndex struct {
	Count int // total number of occurrences
	Seq   int // sequence number stored when the word was first added
}
18 | 
// WordsMap maps each word to its statistics.
// Key: the word (Add stores keys lower-cased).
// Value: its total number of occurrences and its sequence number.
type WordsMap map[string]CountIndex
23 | 
24 | // Add a/an new/exist word to WordsMap
25 | func (wm *WordsMap) Add(word string, seq int) {
26 | 	word = strings.ToLower(word)
27 | 	if ci, ok := (*wm)[word]; ok {
28 | 		(*wm)[word] = CountIndex{ci.Count + 1, ci.Seq}
29 | 	} else {
30 | 		(*wm)[word] = CountIndex{1, seq}
31 | 	}
32 | }
33 | 
34 | // FindMinSeqWord return the word with minimum sequence number
35 | func (wm *WordsMap) FindMinSeqWord(seqTotal int) WordDict {
36 | 	firstWord := WordDict{"", CountIndex{0, seqTotal}}
37 | 	for word, ci := range *wm {
38 | 		if ci.Count == 1 && ci.Seq < firstWord.Seq {
39 | 			firstWord.Word = word
40 | 			firstWord.Count = 1
41 | 			firstWord.Seq = ci.Seq
42 | 		}
43 | 	}
44 | 	return firstWord
45 | }
46 | 


--------------------------------------------------------------------------------
/wordsMap_test.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"reflect"
 5 | 	"testing"
 6 | )
 7 | 
 8 | func TestWordsMap_Add(t *testing.T) {
 9 | 	t.Run("add a new Key", func(t *testing.T) {
10 | 		wm := WordsMap{}
11 | 		wm.Add("ovo", 1)
12 | 		expect := WordsMap{}
13 | 		expect["ovo"] = CountIndex{1, 1}
14 | 		if !reflect.DeepEqual(wm, expect) {
15 | 			t.Errorf("got %+v, want %+v\n", wm, expect)
16 | 		}
17 | 	})
18 | 
19 | 	t.Run("add an exist Key", func(t *testing.T) {
20 | 		wm := WordsMap{}
21 | 		wm["ovo"] = CountIndex{1, 10}
22 | 		wm.Add("ovo", 20)
23 | 		expect := WordsMap{}
24 | 		expect["ovo"] = CountIndex{2, 10}
25 | 		if !reflect.DeepEqual(wm, expect) {
26 | 			t.Errorf("got %+v, want %+v\n", wm, expect)
27 | 		}
28 | 	})
29 | }
30 | 
31 | func TestWordsMap_FindMinSeqWord(t *testing.T) {
32 | 	wm := WordsMap{}
33 | 	wm.Add("eins", 1)
34 | 	wm.Add("zwei", 2)
35 | 	wm.Add("drei", 3)
36 | 	wm.Add("drei", 4)
37 | 	wm.Add("eins", 5)
38 | 	got := wm.FindMinSeqWord(5)
39 | 	expect := WordDict{
40 | 		"zwei",
41 | 		CountIndex{1, 2},
42 | 	}
43 | 	if !reflect.DeepEqual(got, expect) {
44 | 		t.Errorf("got %+v, want %+v\n", got, expect)
45 | 	}
46 | }
47 | 


--------------------------------------------------------------------------------