├── .gitignore ├── LICENSE ├── README.md ├── bleve_indexer.go └── cmd └── bench └── main.go /.gitignore: -------------------------------------------------------------------------------- 1 | bench 2 | **/bench 3 | 4 | # vim temp files 5 | *.swp 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016-2018 Philip O'Toole http://www.philipotoole.com 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | bleve-bench 2 | ====== 3 | 4 | *Detailed background on this code can be found on [this blog post](http://www.philipotoole.com/increasing-bleve-performance-sharding/).* 5 | 6 | bleve-bench is a program to test the impact of batch size and sharding on indexing performance of the [bleve library](https://github.com/blevesearch/bleve). 7 | 8 | ## Building and Running 9 | *Building bleve-bench requires Go 1.3 or later. [gvm](https://github.com/moovweb/gvm) is a great tool for managing your version of Go.* 10 | 11 | Download and run bleve-bench like so (tested on 64-bit Kubuntu 14.04): 12 | 13 | mkdir bleve-bench # Or any directory of your choice. 14 | cd bleve-bench/ 15 | export GOPATH=$PWD 16 | go get -v github.com/otoolep/bleve-bench 17 | go install github.com/otoolep/bleve-bench/cmd/bench/. 18 | $GOPATH/bin/bench -h 19 | 20 | Executing the last command will show the various options. An example run is shown below. 21 | 22 | $ $GOPATH/bin/bench -docs testdata.txt -maxprocs 8 -shards 50 -batchSize 100 23 | Opening docs file testdata.txt 24 | 100000 documents read for indexing. 25 | Commencing indexing. GOMAXPROCS: 8, batch size: 100, shards: 50. 26 | Indexing operation took 3.479690221s 27 | 100000 documents indexed. 28 | Indexing rate: 28738 docs/sec. 29 | 30 | Each line in the test data file is read and indexed as a distinct document. Any previously indexed data is deleted before indexing begins. 31 | 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /bleve_indexer.go: -------------------------------------------------------------------------------- 1 | package indexer 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "path/filepath" 7 | "strconv" 8 | "sync" 9 | 10 | "github.com/blevesearch/bleve" 11 | ) 12 | 13 | // Indexer represents the indexing engine. 14 | type Indexer struct { 15 | path string // Path to bleve storage 16 | batchSz int // Indexing batch size 17 | 18 | shards []bleve.Index // Index shards i.e. bleve indexes 19 | alias bleve.IndexAlias // All bleve indexes as one reference, for search 20 | } 21 | 22 | // New returns a new indexer. 23 | func New(path string, nShards, batchSz int) *Indexer { 24 | return &Indexer{ 25 | path: path, 26 | batchSz: batchSz, 27 | shards: make([]bleve.Index, 0, nShards), 28 | alias: bleve.NewIndexAlias(), 29 | } 30 | } 31 | 32 | // Open opens the indexer, preparing it for indexing. 33 | func (i *Indexer) Open() error { 34 | if err := os.MkdirAll(i.path, 0755); err != nil { 35 | return fmt.Errorf("unable to create index directory %s", i.path) 36 | } 37 | 38 | for s := 0; s < cap(i.shards); s++ { 39 | path := filepath.Join(i.path, strconv.Itoa(s)) 40 | b, err := bleve.New(path, mapping()) 41 | if err != nil { 42 | return fmt.Errorf("index %d at %s: %s", s, path, err.Error()) 43 | } 44 | 45 | i.shards = append(i.shards, b) 46 | i.alias.Add(b) 47 | } 48 | 49 | return nil 50 | } 51 | 52 | // Index indexes the given docs, dividing the docs evenly across the shards. 53 | // Blocks until all documents have been indexed. 54 | func (i *Indexer) Index(docs [][]byte) error { 55 | base := 0 56 | docsPerShard := (len(docs) / len(i.shards)) 57 | var wg sync.WaitGroup 58 | 59 | wg.Add(len(i.shards)) 60 | for _, s := range i.shards { 61 | go func(b bleve.Index, ds [][]byte) { 62 | defer wg.Done() 63 | 64 | batch := b.NewBatch() 65 | n := 0 66 | 67 | // Just index whole batches. 68 | for n = 0; n < len(ds)-(len(ds)%i.batchSz); n++ { 69 | data := struct { 70 | Body string 71 | }{ 72 | Body: string(ds[n]), 73 | } 74 | 75 | if err := batch.Index(strconv.Itoa(n), data); err != nil { 76 | panic(fmt.Sprintf("failed to index doc: %s", err.Error())) 77 | } 78 | 79 | if batch.Size() == i.batchSz { 80 | if err := b.Batch(batch); err != nil { 81 | panic(fmt.Sprintf("failed to index batch: %s", err.Error())) 82 | } 83 | batch = b.NewBatch() 84 | } 85 | } 86 | }(s, docs[base:base+docsPerShard]) 87 | base = base + docsPerShard 88 | } 89 | 90 | wg.Wait() 91 | return nil 92 | } 93 | 94 | // Count returns the total number of documents indexed. 95 | func (i *Indexer) Count() (uint64, error) { 96 | return i.alias.DocCount() 97 | } 98 | 99 | func mapping() *bleve.IndexMapping { 100 | // a generic reusable mapping for english text 101 | standardJustIndexed := bleve.NewTextFieldMapping() 102 | standardJustIndexed.Store = false 103 | standardJustIndexed.IncludeInAll = false 104 | standardJustIndexed.IncludeTermVectors = false 105 | standardJustIndexed.Analyzer = "standard" 106 | 107 | articleMapping := bleve.NewDocumentMapping() 108 | 109 | // body 110 | articleMapping.AddFieldMappingsAt("Body", standardJustIndexed) 111 | 112 | indexMapping := bleve.NewIndexMapping() 113 | indexMapping.DefaultMapping = articleMapping 114 | indexMapping.DefaultAnalyzer = "standard" 115 | return indexMapping 116 | } 117 | -------------------------------------------------------------------------------- /cmd/bench/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "flag" 6 | "fmt" 7 | "os" 8 | "runtime" 9 | "time" 10 | 11 | "github.com/otoolep/bleve-bench" 12 | ) 13 | 14 | var batchSize = flag.Int("batchSize", 100, "batch size for indexing") 15 | var nShards = flag.Int("shards", 1, "number of indexing shards") 16 | var maxprocs = flag.Int("maxprocs", 1, "GOMAXPROCS") 17 | var indexPath = flag.String("index", "indexes", "index storage path") 18 | var docsPath = flag.String("docs", "docs", "path to docs file") 19 | var csv = flag.Bool("csv", false, "summary CSV output") 20 | 21 | func main() { 22 | flag.Parse() 23 | 24 | runtime.GOMAXPROCS(*maxprocs) 25 | 26 | // Remove any existing indexes. 27 | if err := os.RemoveAll(*indexPath); err != nil { 28 | fmt.Println("failed to remove %s.", *indexPath) 29 | os.Exit(1) 30 | } 31 | 32 | // Attempt to open the file. 33 | fmt.Printf("Opening docs file %s\n", *docsPath) 34 | f, err := os.Open(*docsPath) 35 | if err != nil { 36 | fmt.Printf("failed to open docs file: %s\n", err.Error()) 37 | os.Exit(1) 38 | } 39 | 40 | // Read the lines into memory. 41 | docs := make([][]byte, 0, 100000) 42 | reader := bufio.NewReader(f) 43 | 44 | var l []byte 45 | l, err = reader.ReadBytes(byte('\n')) 46 | for err == nil { 47 | docs = append(docs, l) 48 | l, err = reader.ReadBytes(byte('\n')) 49 | } 50 | fmt.Printf("%d documents read for indexing.\n", len(docs)) 51 | 52 | if len(docs)%(*nShards) != 0 { 53 | fmt.Println("Document count must be evenly divisible by shard count") 54 | os.Exit(1) 55 | } 56 | 57 | i := indexer.New(*indexPath, *nShards, *batchSize) 58 | if err := i.Open(); err != nil { 59 | fmt.Println("failed to open indexer:", err) 60 | os.Exit(1) 61 | } 62 | 63 | startTime := time.Now() 64 | if err := i.Index(docs); err != nil { 65 | fmt.Println("failed to index documents:", err) 66 | os.Exit(1) 67 | } 68 | duration := time.Now().Sub(startTime) 69 | 70 | count, err := i.Count() 71 | if err != nil { 72 | fmt.Println("failed to determine total document count") 73 | os.Exit(1) 74 | } 75 | rate := int(float64(count) / duration.Seconds()) 76 | 77 | fmt.Printf("Commencing indexing. GOMAXPROCS: %d, batch size: %d, shards: %d.\n", 78 | runtime.GOMAXPROCS(-1), *batchSize, *nShards) 79 | 80 | fmt.Println("Indexing operation took", duration) 81 | fmt.Printf("%d documents indexed.\n", count) 82 | fmt.Printf("Indexing rate: %d docs/sec.\n", rate) 83 | 84 | if *csv { 85 | fmt.Printf("csv,%d,%d,%d,%d,%d,%d\n", len(docs), count, runtime.GOMAXPROCS(-1), *batchSize, *nShards, rate) 86 | } 87 | } 88 | --------------------------------------------------------------------------------