├── .gitignore
├── LICENSE
├── README.md
├── bleve_indexer.go
└── cmd
    └── bench
        └── main.go


/.gitignore:
--------------------------------------------------------------------------------
1 | bench
2 | **/bench
3 | 
4 | # vim temp files
5 | *.swp
6 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2016-2018 Philip O'Toole http://www.philipotoole.com
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | bleve-bench
 2 | ======
 3 | 
 4 | *Detailed background on this code can be found on [this blog post](http://www.philipotoole.com/increasing-bleve-performance-sharding/).*
 5 | 
 6 | bleve-bench is a program to test the impact of batch size and sharding on the indexing performance of the [bleve library](https://github.com/blevesearch/bleve).
 7 | 
 8 | ## Building and Running
 9 | *Building bleve-bench requires Go 1.3 or later. [gvm](https://github.com/moovweb/gvm) is a great tool for managing your version of Go.*
10 | 
11 | Download and run bleve-bench like so (tested on 64-bit Kubuntu 14.04):
12 | 
13 |     mkdir bleve-bench # Or any directory of your choice.
14 |     cd bleve-bench/
15 |     export GOPATH=$PWD
16 |     go get -v github.com/otoolep/bleve-bench
17 |     go install github.com/otoolep/bleve-bench/cmd/bench/.
18 |     $GOPATH/bin/bench -h
19 | 
20 | Executing the last command will show the various options. An example run is shown below.
21 | 
22 |     $ $GOPATH/bin/bench -docs testdata.txt -maxprocs 8 -shards 50 -batchSize 100
23 |     Opening docs file testdata.txt
24 |     100000 documents read for indexing.
25 |     Commencing indexing. GOMAXPROCS: 8, batch size: 100, shards: 50.
26 |     Indexing operation took 3.479690221s
27 |     100000 documents indexed.
28 |     Indexing rate: 28738 docs/sec.
29 |     
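30 | Each line in the test data file is read and indexed as a distinct document. Any previously indexed data is deleted before indexing begins. The number of documents must be evenly divisible by the shard count, and each shard indexes only whole batches, so documents that do not fill a final batch are skipped.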
31 | 
32 | 
33 | 
34 | 
35 | 


--------------------------------------------------------------------------------
/bleve_indexer.go:
--------------------------------------------------------------------------------
  1 | package indexer
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"os"
  6 | 	"path/filepath"
  7 | 	"strconv"
  8 | 	"sync"
  9 | 
 10 | 	"github.com/blevesearch/bleve"
 11 | )
 12 | 
 13 | // Indexer represents the indexing engine.
 14 | type Indexer struct {
 15 | 	path    string // Path to bleve storage
 16 | 	batchSz int    // Indexing batch size
 17 | 
 18 | 	shards []bleve.Index    // Index shards i.e. bleve indexes
 19 | 	alias  bleve.IndexAlias // All bleve indexes as one reference, for search
 20 | }
 21 | 
 22 | // New returns a new indexer.
 23 | func New(path string, nShards, batchSz int) *Indexer {
 24 | 	return &Indexer{
 25 | 		path:    path,
 26 | 		batchSz: batchSz,
 27 | 		shards:  make([]bleve.Index, 0, nShards),
 28 | 		alias:   bleve.NewIndexAlias(),
 29 | 	}
 30 | }
 31 | 
 32 | // Open opens the indexer, preparing it for indexing.
 33 | func (i *Indexer) Open() error {
 34 | 	if err := os.MkdirAll(i.path, 0755); err != nil {
 35 | 		return fmt.Errorf("unable to create index directory %s", i.path)
 36 | 	}
 37 | 
 38 | 	for s := 0; s < cap(i.shards); s++ {
 39 | 		path := filepath.Join(i.path, strconv.Itoa(s))
 40 | 		b, err := bleve.New(path, mapping())
 41 | 		if err != nil {
 42 | 			return fmt.Errorf("index %d at %s: %s", s, path, err.Error())
 43 | 		}
 44 | 
 45 | 		i.shards = append(i.shards, b)
 46 | 		i.alias.Add(b)
 47 | 	}
 48 | 
 49 | 	return nil
 50 | }
 51 | 
 52 | // Index indexes the given docs, dividing the docs evenly across the shards.
 53 | // Blocks until all documents have been indexed.
 54 | func (i *Indexer) Index(docs [][]byte) error {
 55 | 	base := 0
 56 | 	docsPerShard := (len(docs) / len(i.shards))
 57 | 	var wg sync.WaitGroup
 58 | 
 59 | 	wg.Add(len(i.shards))
 60 | 	for _, s := range i.shards {
 61 | 		go func(b bleve.Index, ds [][]byte) {
 62 | 			defer wg.Done()
 63 | 
 64 | 			batch := b.NewBatch()
 65 | 			n := 0
 66 | 
 67 | 			// Just index whole batches.
 68 | 			for n = 0; n < len(ds)-(len(ds)%i.batchSz); n++ {
 69 | 				data := struct {
 70 | 					Body string
 71 | 				}{
 72 | 					Body: string(ds[n]),
 73 | 				}
 74 | 
 75 | 				if err := batch.Index(strconv.Itoa(n), data); err != nil {
 76 | 					panic(fmt.Sprintf("failed to index doc: %s", err.Error()))
 77 | 				}
 78 | 
 79 | 				if batch.Size() == i.batchSz {
 80 | 					if err := b.Batch(batch); err != nil {
 81 | 						panic(fmt.Sprintf("failed to index batch: %s", err.Error()))
 82 | 					}
 83 | 					batch = b.NewBatch()
 84 | 				}
 85 | 			}
 86 | 		}(s, docs[base:base+docsPerShard])
 87 | 		base = base + docsPerShard
 88 | 	}
 89 | 
 90 | 	wg.Wait()
 91 | 	return nil
 92 | }
 93 | 
 94 | // Count returns the total number of documents indexed.
 95 | func (i *Indexer) Count() (uint64, error) {
 96 | 	return i.alias.DocCount()
 97 | }
 98 | 
 99 | func mapping() *bleve.IndexMapping {
100 | 	// a generic reusable mapping for english text
101 | 	standardJustIndexed := bleve.NewTextFieldMapping()
102 | 	standardJustIndexed.Store = false
103 | 	standardJustIndexed.IncludeInAll = false
104 | 	standardJustIndexed.IncludeTermVectors = false
105 | 	standardJustIndexed.Analyzer = "standard"
106 | 
107 | 	articleMapping := bleve.NewDocumentMapping()
108 | 
109 | 	// body
110 | 	articleMapping.AddFieldMappingsAt("Body", standardJustIndexed)
111 | 
112 | 	indexMapping := bleve.NewIndexMapping()
113 | 	indexMapping.DefaultMapping = articleMapping
114 | 	indexMapping.DefaultAnalyzer = "standard"
115 | 	return indexMapping
116 | }
117 | 


--------------------------------------------------------------------------------
/cmd/bench/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"bufio"
 5 | 	"flag"
 6 | 	"fmt"
 7 | 	"os"
 8 | 	"runtime"
 9 | 	"time"
10 | 
11 | 	"github.com/otoolep/bleve-bench"
12 | )
13 | 
// Command-line flags controlling the benchmark run.
var (
	batchSize = flag.Int("batchSize", 100, "batch size for indexing")
	nShards   = flag.Int("shards", 1, "number of indexing shards")
	maxprocs  = flag.Int("maxprocs", 1, "GOMAXPROCS")
	indexPath = flag.String("index", "indexes", "index storage path")
	docsPath  = flag.String("docs", "docs", "path to docs file")
	csv       = flag.Bool("csv", false, "summary CSV output")
)
20 | 
21 | func main() {
22 | 	flag.Parse()
23 | 
24 | 	runtime.GOMAXPROCS(*maxprocs)
25 | 
26 | 	// Remove any existing indexes.
27 | 	if err := os.RemoveAll(*indexPath); err != nil {
28 | 		fmt.Println("failed to remove %s.", *indexPath)
29 | 		os.Exit(1)
30 | 	}
31 | 
32 | 	// Attempt to open the file.
33 | 	fmt.Printf("Opening docs file %s\n", *docsPath)
34 | 	f, err := os.Open(*docsPath)
35 | 	if err != nil {
36 | 		fmt.Printf("failed to open docs file: %s\n", err.Error())
37 | 		os.Exit(1)
38 | 	}
39 | 
40 | 	// Read the lines into memory.
41 | 	docs := make([][]byte, 0, 100000)
42 | 	reader := bufio.NewReader(f)
43 | 
44 | 	var l []byte
45 | 	l, err = reader.ReadBytes(byte('\n'))
46 | 	for err == nil {
47 | 		docs = append(docs, l)
48 | 		l, err = reader.ReadBytes(byte('\n'))
49 | 	}
50 | 	fmt.Printf("%d documents read for indexing.\n", len(docs))
51 | 
52 | 	if len(docs)%(*nShards) != 0 {
53 | 		fmt.Println("Document count must be evenly divisible by shard count")
54 | 		os.Exit(1)
55 | 	}
56 | 
57 | 	i := indexer.New(*indexPath, *nShards, *batchSize)
58 | 	if err := i.Open(); err != nil {
59 | 		fmt.Println("failed to open indexer:", err)
60 | 		os.Exit(1)
61 | 	}
62 | 
63 | 	startTime := time.Now()
64 | 	if err := i.Index(docs); err != nil {
65 | 		fmt.Println("failed to index documents:", err)
66 | 		os.Exit(1)
67 | 	}
68 | 	duration := time.Now().Sub(startTime)
69 | 
70 | 	count, err := i.Count()
71 | 	if err != nil {
72 | 		fmt.Println("failed to determine total document count")
73 | 		os.Exit(1)
74 | 	}
75 | 	rate := int(float64(count) / duration.Seconds())
76 | 
77 | 	fmt.Printf("Commencing indexing. GOMAXPROCS: %d, batch size: %d, shards: %d.\n",
78 | 		runtime.GOMAXPROCS(-1), *batchSize, *nShards)
79 | 
80 | 	fmt.Println("Indexing operation took", duration)
81 | 	fmt.Printf("%d documents indexed.\n", count)
82 | 	fmt.Printf("Indexing rate: %d docs/sec.\n", rate)
83 | 
84 | 	if *csv {
85 | 		fmt.Printf("csv,%d,%d,%d,%d,%d,%d\n", len(docs), count, runtime.GOMAXPROCS(-1), *batchSize, *nShards, rate)
86 | 	}
87 | }
88 | 


--------------------------------------------------------------------------------