├── .gitignore
├── LICENSE
├── README.md
├── fulltext.go
├── indexer.go
├── indexer_test.go
├── searcher.go
├── searcher_test.go
├── stopwords.go
├── testdata
├── searchform.html
├── searchresults.html
└── shakespeare.mit.edu.zip
└── util.go
/.gitignore:
--------------------------------------------------------------------------------
1 | # Compiled Object files, Static and Dynamic libs (Shared Objects)
2 | *.o
3 | *.a
4 | *.so
5 |
6 | # Folders
7 | _obj
8 | _test
9 |
10 | # Architecture specific extensions/prefixes
11 | *.[568vq]
12 | [568vq].out
13 |
14 | *.cgo1.go
15 | *.cgo2.c
16 | _cgo_defun.c
17 | _cgo_gotypes.go
18 | _cgo_export.*
19 |
20 | _testmain.go
21 |
22 | *.exe
23 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2013-2014 Brad Peabody
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of
6 | this software and associated documentation files (the "Software"), to deal in
7 | the Software without restriction, including without limitation the rights to
8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
9 | the Software, and to permit persons to whom the Software is furnished to do so,
10 | subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Overview
2 | ========
3 |
4 | This is a simple, pure-Go, full text indexing and search library.
5 |
6 | I made it for use on small to medium websites, although there is nothing web-specific about its API or operation.
7 |
8 | Cdb (http://github.com/jbarham/go-cdb) is used to perform the indexing and lookups.
9 |
10 | Status
11 | ------
12 |
13 | This project is more or less stable.
14 |
15 | Notes on Building
16 | --------
17 |
18 | fulltext requires CDB:
19 |
20 | go get github.com/jbarham/go-cdb
21 |
22 | Usage
23 | ------
24 |
25 | First, you must create an index. Like this:
26 |
27 | import "github.com/bradleypeabody/fulltext"
28 |
29 | // create new index with temp dir (usually "" is fine)
30 | idx, err := fulltext.NewIndexer(""); if err != nil { panic(err) }
31 | defer idx.Close()
32 |
33 | // provide stop words if desired
34 | idx.StopWordCheck = fulltext.EnglishStopWordChecker
35 |
36 | // for each document you want to add, you do something like this:
37 | doc := fulltext.IndexDoc{
38 | Id: []byte(uuid), // unique identifier (the path to a webpage works...)
39 | StoreValue: []byte(title), // bytes you want to be able to retrieve from search results
40 | IndexValue: []byte(data), // bytes you want to be split into words and indexed
41 | }
42 | idx.AddDoc(doc) // add it
43 |
44 | // when done, write out to final index
45 | err = idx.FinalizeAndWrite(f); if err != nil { panic(err) }
46 |
47 | Once you have an index file, you can search it like this:
48 |
49 | s, err := fulltext.NewSearcher("/path/to/index/file"); if err != nil { panic(err) }
50 | defer s.Close()
51 | sr, err := s.SimpleSearch("Horatio", 20); if err != nil { panic(err) }
52 | for k, v := range sr.Items {
53 | fmt.Printf("----------- #:%d\n", k)
54 | fmt.Printf("Id: %s\n", v.Id)
55 | fmt.Printf("Score: %d\n", v.Score)
56 | fmt.Printf("StoreValue: %s\n", v.StoreValue)
57 | }
58 |
59 | It's rather simplistic. But it's fast and it works.
60 |
61 | Thoughts in Comparison to blevesearch
62 | -------------------------------------
63 |
64 | I wrote this project before blevesearch was released. I've done a number of implementations now of website search engines using fulltext and also a number of others using blevesearch. My general experience has been that blevesearch is better suited for projects where you are really doing significant development on your search results and need the ability to customize things for various locales, etc. Fulltext on the other hand is much simpler and is better for projects that either a) have simpler search requirements or b) prefer speed of indexing over quality of results.
65 |
66 | Adding a fulltext search engine to a website with a few hundred pages is a simple task and the indexing is fast enough that you can just run it as part of your pre-publish build process. So while there is a lot more development on blevesearch happening - and hats off to them, it's a great product - fulltext still seems to have its place for these simpler scenarios.
67 |
68 | TODOs
69 | -----
70 |
71 | * ~~Will likely need some sort of "stop word" functionality.~~
72 |
73 | * ~~Wordize(), IndexizeWord()~~ and the scoring aggregation logic should be extracted to callback functions with the existing functionality as default.
74 |
75 | * The search logic is currently very naive. Ideally this project would have something as sophisticated as Lucene's query parser. But in reality what I'll likely do is a simple survey of which common features are actually used on any on-site search engines I can get my hands on. Quoting ("black cat"), and logical operators (Jim OR James) would likely be at the top of the list and implementing that sort of thing would be higher priority than trying to duplicate Lucene.
76 |
77 | I've considered using boltdb for storage as an alternative to CDB, but I haven't found the time to work on it. This approach would provide the ability to update the index, reduce memory consumption during index building, and potentially allow for wildcard suffixes.
78 |
79 | Implementation Notes
80 | --------------------
81 |
82 | I originally tried doing this on top of Sqlite. It was dreadfully slow. Cdb is orders of magnitude faster.
83 |
84 | Two main disadvantages from going the Cdb route are that the index cannot be edited once it is built (you have to recreate it in full), and since it's hash-based it will not support any sort of fuzzy matching unless those variations are included in the index (which they are not, in the current implementation.) For my purposes these two disadvantages are overshadowed by the fact that it's blinding fast, easy to use, portable (pure-Go), and its interface allowed me to build the indexes I needed into a single file.
85 |
86 | In the test suite is included a copy of the complete works of William Shakespeare (thanks to Jeremy Hylton's http://shakespeare.mit.edu/) and this library is used to create a simple search engine on top of that corpus. By default it only runs for 10 seconds, but you can run it for longer by doing something like:
87 |
88 | SEARCHER_WEB_TIMEOUT_SECONDS=120 go test fulltext -v
89 |
90 | Works on Windows.
91 |
92 | Future Work
93 | -----------
94 |
95 | It might be feasible to supplant this project with something using suffix arrays ( http://golang.org/pkg/index/suffixarray/ ). The main down side would be the requirement of a lot more storage space (and memory to load and search it). Retooling the index/suffixarray package so it can work against the disk is an idea, but is not necessarily simple. The upside of an approach like that would be full regex support for searches with decent performance - which would rock. The index could potentially be sharded by the first character or two of the search - but that's still not as good as something with sensible caching where the whole set can be kept on disk and the "hot" parts cached in memory, etc.
96 |
--------------------------------------------------------------------------------
/fulltext.go:
--------------------------------------------------------------------------------
1 | /*
2 |
3 | A simple cross-platform, full-text search engine, backed by cdb.
4 | Intended for use on small- to medium-sized websites.
5 |
6 | See README.md for usage.
7 |
8 | */
9 | package fulltext
10 |
--------------------------------------------------------------------------------
/indexer.go:
--------------------------------------------------------------------------------
1 | package fulltext
2 |
3 | import (
4 | "bytes"
5 | "encoding/gob"
6 | "fmt"
7 | "github.com/jbarham/go-cdb"
8 | "io"
9 | "io/ioutil"
10 | "os"
11 | "syscall"
12 | )
13 |
// Size of header block to prepend to the final index file - make it 4k to
// align the cdb data that follows for disk reads.
const HEADER_SIZE = 4096

// Produces a set of cdb files from a series of AddDoc() calls.
// Intermediate data is written to temp files in cdb's text format and
// kept in memory (wordMap) until FinalizeAndWrite packages everything
// into a single index file.
type Indexer struct {
	docTxtFile    *os.File                  // cdb text-format lines: docId -> StoreValue
	wordTxtFile   *os.File                  // cdb text-format lines: word -> gob-encoded docId->count map
	docCdbFile    *os.File                  // binary cdb built from docTxtFile
	wordCdbFile   *os.File                  // binary cdb built from wordTxtFile
	wordMap       map[string]map[string]int // map of [word][docId]count
	WordSplit     WordSplitter              // splits raw text into words (default: Wordize)
	WordClean     WordCleaner               // normalizes each word before indexing (default: IndexizeWord)
	StopWordCheck StopWordChecker           // optional; returning true skips the word
}

// Contents of a single document to be indexed
type IndexDoc struct {
	Id         []byte // the id, this is usually the path to the document
	IndexValue []byte // index this data
	StoreValue []byte // store this data
}
35 |
36 | // Creates a new indexer, using the given temp dir while building
37 | // the index.
38 | func NewIndexer(tempDir string) (*Indexer, error) {
39 | idx := &Indexer{}
40 | var err error
41 | idx.docTxtFile, err = ioutil.TempFile(tempDir, "doctmp")
42 | if err != nil {
43 | return nil, err
44 | }
45 | idx.wordTxtFile, err = ioutil.TempFile(tempDir, "wordtmp")
46 | if err != nil {
47 | return nil, err
48 | }
49 | idx.docCdbFile, err = ioutil.TempFile(tempDir, "doccdb")
50 | if err != nil {
51 | return nil, err
52 | }
53 | idx.wordCdbFile, err = ioutil.TempFile(tempDir, "wordcdb")
54 | if err != nil {
55 | return nil, err
56 | }
57 | idx.wordMap = make(map[string]map[string]int)
58 | idx.WordSplit = Wordize
59 | idx.WordClean = IndexizeWord
60 | return idx, nil
61 | }
62 |
63 | // Add a document to the index - writes to temporary files and stores some data in memory while building the index.
64 | func (idx *Indexer) AddDoc(idoc IndexDoc) error {
65 | // add to docs
66 | docId := string(idoc.Id)
67 | writeTextLine(idx.docTxtFile, []byte(docId), idoc.StoreValue)
68 | words := append(idx.WordSplit(string(idoc.IndexValue)), idx.WordSplit(string(idoc.StoreValue))...)
69 | for _, word := range words {
70 | word = idx.WordClean(word)
71 |
72 | // skip if stop word
73 | if idx.StopWordCheck != nil {
74 | if idx.StopWordCheck(word) {
75 | continue
76 | }
77 | }
78 |
79 | // ensure nested map exists
80 | if idx.wordMap[word] == nil {
81 | idx.wordMap[word] = make(map[string]int)
82 | }
83 | // increment count by one for this combination
84 | c := idx.wordMap[word][docId] + 1
85 | idx.wordMap[word][docId] = c
86 | }
87 | return nil
88 | }
89 |
90 | // Builds a final single index file, which consists of some simple header info,
91 | // followed by the cdb binary files that comprise the full index.
92 | func (idx *Indexer) FinalizeAndWrite(w io.Writer) error {
93 |
94 | var buf bytes.Buffer
95 |
96 | // write out the word data
97 | for word, m := range idx.wordMap {
98 | enc := gob.NewEncoder(&buf)
99 | enc.Encode(m)
100 | writeTextLine(idx.wordTxtFile, []byte(word), buf.Bytes())
101 | buf.Reset()
102 | }
103 |
104 | var err error
105 |
106 | idx.docTxtFile.Write([]byte("\n"))
107 | idx.wordTxtFile.Write([]byte("\n"))
108 |
109 | _, err = idx.docTxtFile.Seek(0, 0)
110 | if err != nil {
111 | return err
112 | }
113 | _, err = idx.wordTxtFile.Seek(0, 0)
114 | if err != nil {
115 | return err
116 | }
117 |
118 | // make cdb files
119 | err = cdb.Make(idx.docCdbFile, idx.docTxtFile)
120 | if err != nil {
121 | return err
122 | }
123 | err = cdb.Make(idx.wordCdbFile, idx.wordTxtFile)
124 | if err != nil {
125 | return err
126 | }
127 |
128 | // make sure the contents are all settled
129 | idx.docCdbFile.Sync()
130 | idx.wordCdbFile.Sync()
131 | _, err = idx.docCdbFile.Seek(0, 0)
132 | if err != nil {
133 | return err
134 | }
135 | _, err = idx.wordCdbFile.Seek(0, 0)
136 | if err != nil {
137 | return err
138 | }
139 |
140 | docStat, err := idx.docCdbFile.Stat()
141 | if err != nil {
142 | return err
143 | }
144 | wordStat, err := idx.wordCdbFile.Stat()
145 | if err != nil {
146 | return err
147 | }
148 |
149 | // now package it all up
150 | buf.Reset()
151 | enc := gob.NewEncoder(&buf)
152 | bhead := []int{int(docStat.Size()), int(wordStat.Size())}
153 | enc.Encode(bhead)
154 |
155 | // extend buffer to be HEADER_SIZE len
156 | bpadsize := HEADER_SIZE - buf.Len()
157 | buf.Write(make([]byte, bpadsize, bpadsize))
158 | b := buf.Bytes()
159 |
160 | _, err = w.Write(b)
161 | if err != nil {
162 | return err
163 | }
164 |
165 | _, err = io.Copy(w, idx.docCdbFile)
166 | if err != nil {
167 | return err
168 | }
169 | _, err = io.Copy(w, idx.wordCdbFile)
170 | if err != nil {
171 | return err
172 | }
173 |
174 | return nil
175 | }
176 |
177 | // Dump some human readable status information
178 | func (idx *Indexer) DumpStatus(w io.Writer) {
179 | fmt.Fprintf(w, "files used:\n\t%s\n\t%s\n\t%s\n\t%s\n", idx.docTxtFile.Name(), idx.wordTxtFile.Name(), idx.docCdbFile.Name(), idx.wordCdbFile.Name())
180 | // fmt.Fprintf(w, "wordMap: %+v\n", idx.wordMap)
181 | }
182 |
183 | // close and remove all resources
184 | func (idx *Indexer) Close() {
185 | syscall.Unlink(idx.docTxtFile.Name())
186 | idx.docTxtFile.Close()
187 | syscall.Unlink(idx.wordTxtFile.Name())
188 | idx.wordTxtFile.Close()
189 | syscall.Unlink(idx.docCdbFile.Name())
190 | idx.docCdbFile.Close()
191 | syscall.Unlink(idx.wordCdbFile.Name())
192 | idx.wordCdbFile.Close()
193 | idx.wordMap = nil
194 | }
195 |
196 | // Write a single line of data in cdb's text format
197 | func writeTextLine(w io.Writer, key []byte, data []byte) (err error) {
198 | _, err = fmt.Fprintf(w, "+%d,%d:%s->%s\n", len(key), len(data), key, data)
199 | return
200 | }
201 |
--------------------------------------------------------------------------------
/indexer_test.go:
--------------------------------------------------------------------------------
1 | package fulltext
2 |
3 | import (
4 | "fmt"
5 | "io/ioutil"
6 | "os"
7 | "path/filepath"
8 | re "regexp"
9 | "testing"
10 | )
11 |
12 | func TestIndexer(t *testing.T) {
13 | fmt.Printf("TestIndexer\n")
14 |
15 | idx, err := NewIndexer("")
16 | if err != nil {
17 | panic(err)
18 | }
19 |
20 | idx.AddDoc(IndexDoc{Id: []byte(`blah1`), StoreValue: []byte(`store this`), IndexValue: []byte(`test of the emergency broadcast system`)})
21 | idx.AddDoc(IndexDoc{Id: []byte(`blah2`), StoreValue: []byte(`store this stuff too, yeah store it`), IndexValue: []byte(`every good boy does fine`)})
22 | idx.AddDoc(IndexDoc{Id: []byte(`blah3`), StoreValue: []byte(`more storage here`), IndexValue: []byte(`a taco in the hand is worth two in the truck`)})
23 |
24 | idx.DumpStatus(os.Stdout)
25 |
26 | f, err := ioutil.TempFile("", "idxout")
27 | if err != nil {
28 | panic(err)
29 | }
30 | err = idx.FinalizeAndWrite(f)
31 | if err != nil {
32 | panic(err)
33 | }
34 | f.Close()
35 |
36 | fmt.Printf("Wrote index file: %s\n", f.Name())
37 |
38 | }
39 |
40 | // A more extensive test - index the complete works of William Shakespeare
41 | func NoTestTheBardIndexing(t *testing.T) {
42 |
43 | fmt.Println("TestTheBardIndexing")
44 |
45 | idx, err := NewIndexer("")
46 | if err != nil {
47 | panic(err)
48 | }
49 | defer idx.Close()
50 |
51 | titlere := re.MustCompile("(?i)
([^<]+)")
52 |
53 | n := 0
54 |
55 | filepath.Walk("testdata/shakespeare.mit.edu/", func(path string, f os.FileInfo, err error) error {
56 | if !f.IsDir() /*&& n < 5*/ {
57 | n++
58 | fmt.Printf("indexing: %s\n", path)
59 | b, err := ioutil.ReadFile(path)
60 | if err != nil {
61 | panic(err)
62 | }
63 | title := string(titlere.Find(b))
64 | body := HTMLStripTags(string(b))
65 | doc := IndexDoc{
66 | Id: []byte(path),
67 | StoreValue: []byte(title),
68 | IndexValue: []byte(title + " " + title + " " + body),
69 | }
70 | idx.AddDoc(doc)
71 | }
72 | return nil
73 | })
74 |
75 | // idx.DebugDump(os.Stdout)
76 |
77 | fmt.Println("Writing final index...")
78 | f, err := ioutil.TempFile("", "idxout")
79 | if err != nil {
80 | panic(err)
81 | }
82 | err = idx.FinalizeAndWrite(f)
83 | if err != nil {
84 | panic(err)
85 | }
86 | f.Close()
87 |
88 | fmt.Printf("Wrote index file: %s\n", f.Name())
89 |
90 | }
91 |
--------------------------------------------------------------------------------
/searcher.go:
--------------------------------------------------------------------------------
1 | package fulltext
2 |
3 | import (
4 | "bytes"
5 | "encoding/gob"
6 | "github.com/jbarham/go-cdb"
7 | "io"
8 | "io/ioutil"
9 | "os"
10 | "sort"
11 | )
12 |
// Interface for search. Not thread-safe, but low overhead
// so having a separate one per thread should be workable.
type Searcher struct {
	file    *os.File // the single index file produced by Indexer.FinalizeAndWrite
	docCdb  *cdb.Cdb // docId -> stored value
	wordCdb *cdb.Cdb // word -> gob-encoded map of docId -> count
}

// Wraps a ReaderAt and adjusts (tweaks) its offset by the specified
// amount, so a cdb section embedded mid-file can be read as if it
// started at offset 0.
type tweakedReaderAt struct {
	readerAt io.ReaderAt
	tweak    int64 // bytes added to every requested offset
}

// ReadAt implements io.ReaderAt, shifting the requested offset by tweak.
func (t *tweakedReaderAt) ReadAt(p []byte, off int64) (n int, err error) {
	n, err = t.readerAt.ReadAt(p, off+t.tweak)
	return
}
31 |
// A single item in a search result
type SearchResultItem struct {
	Id         []byte // id of this item (document)
	StoreValue []byte // the stored value of this document
	Score      int64  // the total score
}

// Implement sort.Interface, ordering by ascending score
type SearchResultItems []SearchResultItem

func (s SearchResultItems) Len() int      { return len(s) }
func (s SearchResultItems) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
func (s SearchResultItems) Less(i, j int) bool {
	// if same score, then fall back to a raw bytes comparison of Id -
	// so we get consistently ordered results, even when score is same
	// (NOTE: the comparison is on Id, not StoreValue)
	if s[i].Score == s[j].Score {
		return bytes.Compare(s[i].Id, s[j].Id) < 0
	}
	return s[i].Score < s[j].Score
}

// What happened during the search
type SearchResults struct {
	Items SearchResultItems
}
57 |
58 | // Make a new searcher using the file at the specified path
59 | // TODO: Make a variation that accepts a ReaderAt
60 | func NewSearcher(fpath string) (*Searcher, error) {
61 |
62 | s := &Searcher{}
63 |
64 | f, err := os.Open(fpath)
65 | if err != nil {
66 | return s, err
67 | }
68 | s.file = f
69 |
70 | // write out the word data
71 | dec := gob.NewDecoder(f)
72 | lens := make([]int64, 2, 2)
73 | dec.Decode(&lens)
74 |
75 | s.docCdb = cdb.New(&tweakedReaderAt{f, HEADER_SIZE})
76 | s.wordCdb = cdb.New(&tweakedReaderAt{f, HEADER_SIZE + lens[0]})
77 |
78 | return s, nil
79 | }
80 |
81 | // Close and release resources
82 | func (s *Searcher) Close() error {
83 | s.docCdb = nil
84 | s.wordCdb = nil
85 | return s.file.Close()
86 | }
87 |
// Perform a search.  The query is split into words with the same
// Wordize/IndexizeWord pipeline used at index time; each word is looked
// up in the word index and the per-document counts are summed into
// scores (a naive OR query).  The top maxn documents by descending score
// are returned with their stored values attached.
func (s *Searcher) SimpleSearch(search string, maxn int) (SearchResults, error) {

	sr := SearchResults{}

	// break the search query into words
	searchWords := Wordize(search)

	// accumulates one result item per matching document, keyed by docId
	itemMap := make(map[string]SearchResultItem)

	// read word data for each word that was provided
	for _, w := range searchWords {
		w = IndexizeWord(w)
		// find the docs for this word; cdb reports "key not found" as io.EOF
		mapGob, err := s.wordCdb.Find([]byte(w))
		if err == io.EOF {
			continue
		}
		if err != nil {
			return sr, err
		}

		// gob-encoded map of docId -> occurrence count for this word
		m := make(map[string]int)

		dec := gob.NewDecoder(mapGob)
		err = dec.Decode(&m)
		if err != nil {
			return sr, err
		}

		// for each doc, increase score; Id is set the first time the doc
		// is seen (a zero-value item has Score 0)
		for docId, cnt := range m {
			sri := itemMap[docId]
			if sri.Score < 1 {
				sri.Id = []byte(docId)
			}
			sri.Score += int64(cnt)
			itemMap[docId] = sri
		}

	}

	// convert to slice
	items := make(SearchResultItems, 0, maxn)
	for _, item := range itemMap {
		items = append(items, item)
	}

	// sort by score descending
	sort.Sort(sort.Reverse(items))

	// limit to maxn
	if len(items) > maxn {
		items = items[:maxn]
	}

	// pull document contents from doc cdb
	for i := range items {
		item := &items[i]
		v, err := s.docCdb.Find(item.Id)
		if err == io.EOF {
			// every scored docId came out of the word index, so it must
			// also exist in the doc index
			panic("doc id " + string(item.Id) + " not found in index, this should never happen")
		}
		if err != nil {
			return sr, err
		}
		v1, err := ioutil.ReadAll(v)
		if err != nil {
			return sr, err
		}
		item.StoreValue = v1
	}

	sr.Items = items

	return sr, nil

}
166 |
--------------------------------------------------------------------------------
/searcher_test.go:
--------------------------------------------------------------------------------
1 | package fulltext
2 |
3 | import (
4 | "archive/zip"
5 | "bytes"
6 | "fmt"
7 | "html/template"
8 | "io"
9 | "io/ioutil"
10 | "net"
11 | "net/http"
12 | "os"
13 | "path/filepath"
14 | re "regexp"
15 | "strconv"
16 | "strings"
17 | "testing"
18 | "time"
19 | )
20 |
21 | // Extract a single file from a zip and return it's contents
22 | func zipExtract(zfpath string, fpath string) ([]byte, error) {
23 |
24 | zr, err := zip.OpenReader(zfpath)
25 | if err != nil {
26 | return nil, err
27 | }
28 | defer zr.Close()
29 |
30 | fpath = strings.Trim(filepath.Clean(filepath.ToSlash(fpath)), "/")
31 |
32 | for _, f := range zr.File {
33 |
34 | fn := strings.Trim(filepath.Clean(filepath.ToSlash(f.Name)), "/")
35 |
36 | // keep going until we find it
37 | if fn != fpath {
38 | continue
39 | }
40 |
41 | rc, err := f.Open()
42 | if err != nil {
43 | panic(err)
44 | }
45 | b, err := ioutil.ReadAll(rc)
46 | if err != nil {
47 | return nil, err
48 | }
49 | rc.Close()
50 |
51 | return b, nil
52 |
53 | }
54 |
55 | return nil, io.EOF
56 |
57 | }
58 |
59 | // Index and search the complete works of William Shakespeare
60 | func TestTheBardSearch(t *testing.T) {
61 |
62 | fmt.Println("TestTheBardIndexing")
63 |
64 | idx, err := NewIndexer("")
65 | if err != nil {
66 | panic(err)
67 | }
68 | defer idx.Close()
69 |
70 | // use English stop words
71 | idx.StopWordCheck = EnglishStopWordChecker
72 |
73 | titlere := re.MustCompile("(?i)([^<]+)")
74 |
75 | zr, err := zip.OpenReader("testdata/shakespeare.mit.edu.zip")
76 | if err != nil {
77 | panic(err)
78 | }
79 | defer zr.Close()
80 |
81 | for _, f := range zr.File {
82 | fmt.Printf("indexing: %s\n", f.Name)
83 |
84 | rc, err := f.Open()
85 | if err != nil {
86 | panic(err)
87 | }
88 | b, err := ioutil.ReadAll(rc)
89 | if err != nil {
90 | panic(err)
91 | }
92 |
93 | // extract title tag
94 | tret := titlere.FindSubmatch(b)
95 | title := ""
96 | if len(tret) > 1 {
97 | title = strings.TrimSpace(string(tret[1]))
98 | }
99 |
100 | // strip html from entire doc and get text
101 | body := HTMLStripTags(string(b))
102 |
103 | // make a doc out of it
104 | doc := IndexDoc{
105 | Id: []byte(f.Name),
106 | StoreValue: []byte(title),
107 | IndexValue: []byte(title + " " + title + " " + body),
108 | }
109 | idx.AddDoc(doc)
110 |
111 | rc.Close()
112 | }
113 |
114 | fmt.Println("Writing final index...")
115 | f, err := ioutil.TempFile("", "idxout")
116 | if err != nil {
117 | panic(err)
118 | }
119 | err = idx.FinalizeAndWrite(f)
120 | if err != nil {
121 | panic(err)
122 | }
123 |
124 | fmt.Println("Debug data: \n")
125 | idx.DumpStatus(os.Stdout)
126 |
127 | // panic("DONE")
128 |
129 | f.Close()
130 |
131 | fmt.Printf("Wrote index file: %s\n", f.Name())
132 |
133 | /////////////////////////////////
134 |
135 | start := time.Now()
136 |
137 | s, err := NewSearcher(f.Name())
138 | if err != nil {
139 | panic(err)
140 | }
141 |
142 | fmt.Printf("Opening searcher took: %s\n", time.Since(start).String())
143 |
144 | start = time.Now()
145 |
146 | sr, err := s.SimpleSearch("king", 20)
147 | if err != nil {
148 | panic(err)
149 | }
150 |
151 | if len(sr.Items) == 0 {
152 | t.Fatalf("Search for 'king' returned 0 results, but should have gotten something")
153 | }
154 |
155 | fmt.Printf("Searching took: %s\n", time.Since(start).String())
156 |
157 | fmt.Printf("Total Results for 'king': %d\n", len(sr.Items))
158 | for k, v := range sr.Items {
159 | fmt.Printf("----------- #:%d\n", k)
160 | fmt.Printf("Id: %s\n", v.Id)
161 | fmt.Printf("Score: %d\n", v.Score)
162 | fmt.Printf("StoreValue: %s\n", v.StoreValue)
163 | }
164 |
165 | fmt.Printf("Raw dump: %+v\n", sr)
166 |
167 | // look for a stop word and make sure it's not there
168 |
169 | sr, err = s.SimpleSearch("the", 20)
170 | if err != nil {
171 | panic(err)
172 | }
173 | if len(sr.Items) != 0 {
174 | t.Fatalf("Search for 'the' returned %d results when it should have been 0 because it's a stop word", len(sr.Items))
175 | }
176 | fmt.Printf("Check for stop word passed\n")
177 |
178 | ///////////////////////////////////////////////////
179 |
180 | fmt.Printf("Starting Shakespeare's very own search interface at :1414 ...")
181 |
182 | ln, err := net.Listen("tcp", ":1414")
183 | if err != nil {
184 | panic(err)
185 | }
186 |
187 | timeoutStr := os.Getenv("SEARCHER_WEB_TIMEOUT_SECONDS")
188 |
189 | timeout, err := strconv.Atoi(timeoutStr)
190 | if err != nil {
191 | timeout = 10
192 | }
193 |
194 | zfpath := "testdata/shakespeare.mit.edu.zip"
195 |
196 | // wait for specified time
197 | go func() { time.Sleep(time.Duration(timeout) * time.Second); ln.Close() }()
198 |
199 | // main request handler
200 | err = http.Serve(ln, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
201 |
202 | // home page redirect
203 | if r.URL.Path == "/" || r.URL.Path == "/Shakespeare" {
204 | http.Redirect(w, r, "/shakespeare.mit.edu/index.html", 302)
205 | return
206 | }
207 |
208 | // handle search result page
209 | if r.URL.Path == "/searchresults.html" {
210 |
211 | w.Header().Set("Content-type", "text/html")
212 |
213 | q := r.FormValue("q")
214 |
215 | // do search
216 | sr, err := s.SimpleSearch(q, 20)
217 | if err != nil {
218 | panic(err)
219 | }
220 |
221 | // render results page
222 | sres, err := ioutil.ReadFile("testdata/searchresults.html")
223 | if err != nil {
224 | panic(err)
225 | }
226 | t := template.Must(template.New("main").Parse(string(sres)))
227 | var buf bytes.Buffer
228 | t.Execute(&buf, &map[string]interface{}{
229 | "q": q,
230 | "sr": sr,
231 | })
232 | sresbytes := buf.Bytes()
233 |
234 | w.Write(sresbytes)
235 |
236 | return
237 | }
238 |
239 | // by default look through zip file
240 | b, err := zipExtract(zfpath, r.URL.Path)
241 | if err != nil {
242 | http.Error(w, "File not found", 404)
243 | }
244 | if strings.HasSuffix(r.URL.Path, ".css") {
245 | w.Header().Set("Content-type", "text/css")
246 | }
247 | if strings.HasSuffix(r.URL.Path, ".gif") {
248 | w.Header().Set("Content-type", "image/gif")
249 | }
250 | if strings.HasSuffix(r.URL.Path, ".jpg") {
251 | w.Header().Set("Content-type", "image/jpeg")
252 | }
253 |
254 | // for html files we inject a search box
255 | if strings.HasSuffix(r.URL.Path, ".html") {
256 | w.Header().Set("Content-type", "text/html")
257 |
258 | // render search form
259 | sf, err := ioutil.ReadFile("testdata/searchform.html")
260 | if err != nil {
261 | panic(err)
262 | }
263 | t := template.Must(template.New("main").Parse(string(sf)))
264 | var buf bytes.Buffer
265 | t.Execute(&buf, r.FormValue("q"))
266 | sfbytes := buf.Bytes()
267 |
268 | // inject into page
269 |
270 | pagebytes := re.MustCompile("(]*>)").ReplaceAllLiteral(b, []byte(""+string(sfbytes)))
271 | w.Write(pagebytes)
272 | return
273 |
274 | }
275 |
276 | w.Write(b)
277 |
278 | }))
279 |
280 | if err != nil {
281 | fmt.Printf("err from listen: %s\n", err)
282 | }
283 |
284 | s.Close()
285 |
286 | }
287 |
--------------------------------------------------------------------------------
/stopwords.go:
--------------------------------------------------------------------------------
1 | package fulltext
2 |
// EnglishStopWordChecker reports whether s is an English stop word.
// Assign it to Indexer.StopWordCheck to exclude common English words
// from the index.
var EnglishStopWordChecker = func(s string) bool {
	return STOPWORDS_EN[s]
}

// English stop words.
// FIX: the former entry "ours ourselves" was a single malformed key, so
// neither "ours" nor "ourselves" was actually treated as a stop word.
var STOPWORDS_EN = map[string]bool{
	"a":          true,
	"about":      true,
	"above":      true,
	"after":      true,
	"again":      true,
	"against":    true,
	"all":        true,
	"am":         true,
	"an":         true,
	"and":        true,
	"any":        true,
	"are":        true,
	"aren't":     true,
	"as":         true,
	"at":         true,
	"be":         true,
	"because":    true,
	"been":       true,
	"before":     true,
	"being":      true,
	"below":      true,
	"between":    true,
	"both":       true,
	"but":        true,
	"by":         true,
	"can't":      true,
	"cannot":     true,
	"could":      true,
	"couldn't":   true,
	"did":        true,
	"didn't":     true,
	"do":         true,
	"does":       true,
	"doesn't":    true,
	"doing":      true,
	"don't":      true,
	"down":       true,
	"during":     true,
	"each":       true,
	"few":        true,
	"for":        true,
	"from":       true,
	"further":    true,
	"had":        true,
	"hadn't":     true,
	"has":        true,
	"hasn't":     true,
	"have":       true,
	"haven't":    true,
	"having":     true,
	"he":         true,
	"he'd":       true,
	"he'll":      true,
	"he's":       true,
	"her":        true,
	"here":       true,
	"here's":     true,
	"hers":       true,
	"herself":    true,
	"him":        true,
	"himself":    true,
	"his":        true,
	"how":        true,
	"how's":      true,
	"i":          true,
	"i'd":        true,
	"i'll":       true,
	"i'm":        true,
	"i've":       true,
	"if":         true,
	"in":         true,
	"into":       true,
	"is":         true,
	"isn't":      true,
	"it":         true,
	"it's":       true,
	"its":        true,
	"itself":     true,
	"let's":      true,
	"me":         true,
	"more":       true,
	"most":       true,
	"mustn't":    true,
	"my":         true,
	"myself":     true,
	"no":         true,
	"nor":        true,
	"not":        true,
	"of":         true,
	"off":        true,
	"on":         true,
	"once":       true,
	"only":       true,
	"or":         true,
	"other":      true,
	"ought":      true,
	"our":        true,
	"ours":       true,
	"ourselves":  true,
	"out":        true,
	"over":       true,
	"own":        true,
	"same":       true,
	"shan't":     true,
	"she":        true,
	"she'd":      true,
	"she'll":     true,
	"she's":      true,
	"should":     true,
	"shouldn't":  true,
	"so":         true,
	"some":       true,
	"such":       true,
	"than":       true,
	"that":       true,
	"that's":     true,
	"the":        true,
	"their":      true,
	"theirs":     true,
	"them":       true,
	"themselves": true,
	"then":       true,
	"there":      true,
	"there's":    true,
	"these":      true,
	"they":       true,
	"they'd":     true,
	"they'll":    true,
	"they're":    true,
	"they've":    true,
	"this":       true,
	"those":      true,
	"through":    true,
	"to":         true,
	"too":        true,
	"under":      true,
	"until":      true,
	"up":         true,
	"very":       true,
	"was":        true,
	"wasn't":     true,
	"we":         true,
	"we'd":       true,
	"we'll":      true,
	"we're":      true,
	"we've":      true,
	"were":       true,
	"weren't":    true,
	"what":       true,
	"what's":     true,
	"when":       true,
	"when's":     true,
	"where":      true,
	"where's":    true,
	"which":      true,
	"while":      true,
	"who":        true,
	"who's":      true,
	"whom":       true,
	"why":        true,
	"why's":      true,
	"with":       true,
	"won't":      true,
	"would":      true,
	"wouldn't":   true,
	"you":        true,
	"you'd":      true,
	"you'll":     true,
	"you're":     true,
	"you've":     true,
	"your":       true,
	"yours":      true,
	"yourself":   true,
	"yourselves": true,
}
183 |
--------------------------------------------------------------------------------
/testdata/searchform.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
6 |
--------------------------------------------------------------------------------
/testdata/searchresults.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | Search Results
4 |
5 |
6 |
7 |
11 |
12 |
13 |
14 |
Search Results:
15 |
16 |
17 |
18 | {{range .sr.Items}}
19 |
20 |
24 |
25 | {{else}}
26 |
27 | No results found.
28 | Now is the winter of our discontent.
29 |
30 | {{end}}
31 |
32 |
33 |
34 |
35 |
36 |
37 |
--------------------------------------------------------------------------------
/testdata/shakespeare.mit.edu.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bradleypeabody/fulltext/a28063e64b3da56602233d36cd7e9a147f41b0ac/testdata/shakespeare.mit.edu.zip
--------------------------------------------------------------------------------
/util.go:
--------------------------------------------------------------------------------
1 | package fulltext
2 |
3 | import (
4 | "bytes"
5 | "html/template"
6 | re "regexp"
7 | "strings"
8 | )
9 |
// wordizeRe matches runs of whitespace and common punctuation; it is the
// delimiter pattern Wordize splits on. Compiled once at package load —
// a direct var initializer replaces the former init() per Go idiom.
var wordizeRe = re.MustCompile(`[\s,.;:!?[\]()'"]+`)

// WordSplitter is the signature of a function that breaks text into words.
type WordSplitter func(string) []string

// Wordize splits a string up into words on whitespace and punctuation.
// Note: leading/trailing delimiters yield empty-string entries (standard
// regexp Split behavior); callers are expected to filter those out.
func Wordize(t string) []string {
	return wordizeRe.Split(t, -1)
}
22 |
// WordCleaner is the signature of a function that normalizes a word
// before it is added to the index.
type WordCleaner func(string) string

// IndexizeWord normalizes a word for indexing: surrounding whitespace is
// removed and the result is lowercased.
func IndexizeWord(w string) string {
	trimmed := strings.TrimSpace(w)
	return strings.ToLower(trimmed)
}

// StopWordChecker is the signature of a function reporting whether a word
// is a stop word (and should therefore be skipped by the indexer).
type StopWordChecker func(string) bool
31 |
// This function copied from here: https://github.com/kennygrant/sanitize/blob/master/sanitize.go
// License is: https://github.com/kennygrant/sanitize/blob/master/License-BSD.txt
// Strip html tags, replace common entities, and escape <>&;'" in the result.
// Note the returned text may contain entities as it is escaped by HTMLEscapeString,
// and most entities are not translated.
//
// NOTE(review): the HTML tag and entity string literals below had been
// destroyed by an extraction artifact (e.g. Replace(s, "", "\n", -1), which
// would insert a newline between every rune); they are restored here from
// the upstream sanitize.go cited above.
func HTMLStripTags(s string) (output string) {

	// Shortcut strings with no tags in them
	if !strings.ContainsAny(s, "<>") {
		output = s
	} else {

		// First remove line breaks etc as these have no meaning outside html tags (except pre)
		// this means pre sections will lose formatting... but will result in less unintentional paras.
		s = strings.Replace(s, "\n", "", -1)

		// Then replace line breaks with newlines, to preserve that formatting
		s = strings.Replace(s, "<br>", "\n", -1)
		s = strings.Replace(s, "<br/>", "\n", -1)
		s = strings.Replace(s, "</p>", "\n", -1)

		// Walk through the string removing all tags
		b := bytes.NewBufferString("")
		inTag := false
		for _, r := range s {
			switch r {
			case '<':
				inTag = true
			case '>':
				inTag = false
			default:
				if !inTag {
					b.WriteRune(r)
				}
			}
		}
		output = b.String()
	}

	// In case we have missed any tags above, escape the text - removes <, >, &, ' and ".
	output = template.HTMLEscapeString(output)

	// Remove a few common harmless entities, to arrive at something more like plain text
	// This relies on having removed *all* tags above
	output = strings.Replace(output, "&nbsp;", " ", -1)
	output = strings.Replace(output, "&quot;", "\"", -1)
	output = strings.Replace(output, "&apos;", "'", -1)
	output = strings.Replace(output, "&#34;", "\"", -1)
	output = strings.Replace(output, "&#39;", "'", -1)
	// NB spaces here are significant - we only allow & not part of entity
	output = strings.Replace(output, "&amp; ", "& ", -1)
	output = strings.Replace(output, "&amp;amp; ", "& ", -1)

	return output
}
89 |
// Regexps for pulling the title and meta-description out of an HTML
// document. Compiled once at package load via direct var initializers
// (replacing the former init()).
//
// NOTE(review): the original patterns were corrupted by an extraction
// artifact that stripped the HTML tags out of the string literals
// (descre had become "(?i)" with no capture group at all); these are
// reconstructed patterns — confirm against the documents being indexed.
var (
	titlere = re.MustCompile(`(?i)<title[^>]*>([^<]+)</title>`)
	descre  = re.MustCompile(`(?i)<meta[^>]+name=["']description["'][^>]+content=["']([^"']*)["']`)
)

// HTMLExtractTitle returns the trimmed contents of the first <title>
// element in html, or "" if none is found.
func HTMLExtractTitle(html string) string {
	m := titlere.FindStringSubmatch(html)
	if len(m) > 1 {
		return strings.TrimSpace(m[1])
	}
	return ""
}

// HTMLExtractDescription returns the trimmed content attribute of the
// first <meta name="description"> tag in html, or "" if none is found.
func HTMLExtractDescription(html string) string {
	m := descre.FindStringSubmatch(html)
	if len(m) > 1 {
		return strings.TrimSpace(m[1])
	}
	return ""
}
117 |
--------------------------------------------------------------------------------