├── .gitignore ├── LICENSE ├── README.md ├── fulltext.go ├── indexer.go ├── indexer_test.go ├── searcher.go ├── searcher_test.go ├── stopwords.go ├── testdata ├── searchform.html ├── searchresults.html └── shakespeare.mit.edu.zip └── util.go /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 2 | *.o 3 | *.a 4 | *.so 5 | 6 | # Folders 7 | _obj 8 | _test 9 | 10 | # Architecture specific extensions/prefixes 11 | *.[568vq] 12 | [568vq].out 13 | 14 | *.cgo1.go 15 | *.cgo2.c 16 | _cgo_defun.c 17 | _cgo_gotypes.go 18 | _cgo_export.* 19 | 20 | _testmain.go 21 | 22 | *.exe 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2013-2014 Brad Peabody 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Overview 2 | ======== 3 | 4 | This is a simple, pure-Go, full text indexing and search library. 5 | 6 | I made it for use on small to medium websites, although there is nothing web-specific about it's API or operation. 7 | 8 | Cdb (http://github.com/jbarham/go-cdb) is used to perform the indexing and lookups. 9 | 10 | Status 11 | ------ 12 | 13 | This project is more or less stable. 14 | 15 | Notes on Building 16 | -------- 17 | 18 | fulltext requires CDB: 19 | 20 | go get github.com/jbarham/go-cdb 21 | 22 | Usage 23 | ------ 24 | 25 | First, you must create an index. Like this: 26 | 27 | import "github.com/bradleypeabody/fulltext" 28 | 29 | // create new index with temp dir (usually "" is fine) 30 | idx, err := fulltext.NewIndexer(""); if err != nil { panic(err) } 31 | defer idx.Close() 32 | 33 | // provide stop words if desired 34 | idx.StopWordCheck = fulltext.EnglishStopWordChecker 35 | 36 | // for each document you want to add, you do something like this: 37 | doc := fulltext.IndexDoc{ 38 | Id: []byte(uuid), // unique identifier (the path to a webpage works...) 
39 | StoreValue: []byte(title), // bytes you want to be able to retrieve from search results 40 | IndexValue: []byte(data), // bytes you want to be split into words and indexed 41 | } 42 | idx.AddDoc(doc) // add it 43 | 44 | // when done, write out to final index 45 | err = idx.FinalizeAndWrite(f); if err != nil { panic(err) } 46 | 47 | Once you have an index file, you can search it like this: 48 | 49 | s, err := fulltext.NewSearcher("/path/to/index/file"); if err != nil { panic(err) } 50 | defer s.Close() 51 | sr, err := s.SimpleSearch("Horatio", 20); if err != nil { panic(err) } 52 | for k, v := range sr.Items { 53 | fmt.Printf("----------- #:%d\n", k) 54 | fmt.Printf("Id: %s\n", v.Id) 55 | fmt.Printf("Score: %d\n", v.Score) 56 | fmt.Printf("StoreValue: %s\n", v.StoreValue) 57 | } 58 | 59 | It's rather simplistic. But it's fast and it works. 60 | 61 | Thoughts in Comparison to blevesearch 62 | ------------------------------------- 63 | 64 | I wrote this project before blevesearch was released. I've now done a number of implementations of website search engines using fulltext, and a number of others using blevesearch. My general experience has been that blevesearch is better suited for projects where you are really doing significant development on your search results and need the ability to customize things for various locales, etc. Fulltext, on the other hand, is much simpler and is better for projects that either a) have simpler search requirements or b) prefer speed of indexing over quality of results. 65 | 66 | Adding a fulltext search engine to a website with a few hundred pages is a simple task, and the indexing is fast enough that you can just run it as part of your pre-publish build process. So while there is a lot more development on blevesearch happening - and hats off to them, it's a great product - fulltext still seems to have its place for these simpler scenarios. 67 | 68 | TODOs 69 | ----- 70 | 71 | * ~~Will likely need some sort of "stop word" functionality.~~ 72 | 73 | * ~~Wordize(), IndexizeWord()~~ and the scoring aggregation logic should be extracted to callback functions with the existing functionality as default. 74 | 75 | * The search logic is currently very naive. Ideally this project would have something as sophisticated as Lucene's query parser. But in reality what I'll likely do is a simple survey of which common features are actually used on any on-site search engines I can get my hands on. Quoting ("black cat") and logical operators (Jim OR James) would likely be at the top of the list, and implementing that sort of thing would be a higher priority than trying to duplicate Lucene. 76 | 77 | * I've considered using boltdb for storage as an alternative to CDB, but I haven't found the time to work on it. This approach would provide the ability to update the index, reduce memory consumption during index building, and potentially allow for wildcard suffixes. 78 | 79 | Implementation Notes 80 | -------------------- 81 | 82 | I originally tried doing this on top of Sqlite. It was dreadfully slow. Cdb is orders of magnitude faster. 83 | 84 | The two main disadvantages of going the Cdb route are that the index cannot be edited once it is built (you have to recreate it in full), and that, since it's hash-based, it will not support any sort of fuzzy matching unless those variations are included in the index (which they are not, in the current implementation). For my purposes these two disadvantages are overshadowed by the fact that it's blindingly fast, easy to use, portable (pure-Go), and its interface allowed me to build the indexes I needed into a single file.
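To make the single-file layout concrete, here's a minimal sketch (not part of the library, and the path is hypothetical) of reading that header back, based on what FinalizeAndWrite writes and NewSearcher expects: a 4096-byte header containing a gob-encoded pair of sizes, followed by the doc cdb and then the word cdb:

    package main

    import (
        "encoding/gob"
        "fmt"
        "os"
    )

    const headerSize = 4096 // same value as fulltext.HEADER_SIZE

    func main() {
        // hypothetical path to an index produced by Indexer.FinalizeAndWrite
        f, err := os.Open("/path/to/index/file"); if err != nil { panic(err) }
        defer f.Close()

        // the header starts with gob([]int{docCdbSize, wordCdbSize}) and is padded out to 4096 bytes
        var sizes []int64
        if err := gob.NewDecoder(f).Decode(&sizes); err != nil { panic(err) }

        fmt.Printf("doc cdb:  starts at byte %d, length %d\n", headerSize, sizes[0])
        fmt.Printf("word cdb: starts at byte %d, length %d\n", headerSize+sizes[0], sizes[1])
    }

Since the two cdb sections sit back-to-back behind a fixed-size header, changing anything means rewriting the whole file - which is why the index gets rebuilt rather than edited in place.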
85 | 86 | The test suite includes a copy of the complete works of William Shakespeare (thanks to Jeremy Hylton's http://shakespeare.mit.edu/), and this library is used to create a simple search engine on top of that corpus. By default it only runs for 10 seconds, but you can run it for longer by doing something like: 87 | 88 | SEARCHER_WEB_TIMEOUT_SECONDS=120 go test fulltext -v 89 | 90 | Works on Windows. 91 | 92 | Future Work 93 | ----------- 94 | 95 | It might be feasible to supplant this project with something using suffix arrays ( http://golang.org/pkg/index/suffixarray/ ). The main downside would be the requirement of a lot more storage space (and memory to load and search it). Retooling the index/suffixarray package so it can work against the disk is an idea, but is not necessarily simple. The upside of an approach like that would be full regex support for searches with decent performance - which would rock. The index could potentially be sharded by the first character or two of the search - but that's still not as good as something with sensible caching, where the whole set can be kept on disk and the "hot" parts cached in memory, etc. 96 | -------------------------------------------------------------------------------- /fulltext.go: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | A simple, cross-platform, full-text search engine, backed by cdb. 4 | Intended for use on small- to medium-sized websites. 5 | 6 | See README.md for usage. 7 | 8 | */ 9 | package fulltext 10 | -------------------------------------------------------------------------------- /indexer.go: -------------------------------------------------------------------------------- 1 | package fulltext 2 | 3 | import ( 4 | "bytes" 5 | "encoding/gob" 6 | "fmt" 7 | "github.com/jbarham/go-cdb" 8 | "io" 9 | "io/ioutil" 10 | "os" 11 | "syscall" 12 | ) 13 | 14 | // Size of header block to prepend - make it 4k to align disk reads 15 | const HEADER_SIZE = 4096 16 | 17 | // Produces a set of cdb files from a series of AddDoc() calls 18 | type Indexer struct { 19 | docTxtFile *os.File 20 | wordTxtFile *os.File 21 | docCdbFile *os.File 22 | wordCdbFile *os.File 23 | wordMap map[string]map[string]int // map of [word][docId]count 24 | WordSplit WordSplitter 25 | WordClean WordCleaner 26 | StopWordCheck StopWordChecker 27 | } 28 | 29 | // Contents of a single document to be indexed 30 | type IndexDoc struct { 31 | Id []byte // the id, this is usually the path to the document 32 | IndexValue []byte // index this data 33 | StoreValue []byte // store this data 34 | } 35 | 36 | // Creates a new indexer, using the given temp dir while building 37 | // the index.
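// Pass "" as tempDir to use the system default temporary directory.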
38 | func NewIndexer(tempDir string) (*Indexer, error) { 39 | idx := &Indexer{} 40 | var err error 41 | idx.docTxtFile, err = ioutil.TempFile(tempDir, "doctmp") 42 | if err != nil { 43 | return nil, err 44 | } 45 | idx.wordTxtFile, err = ioutil.TempFile(tempDir, "wordtmp") 46 | if err != nil { 47 | return nil, err 48 | } 49 | idx.docCdbFile, err = ioutil.TempFile(tempDir, "doccdb") 50 | if err != nil { 51 | return nil, err 52 | } 53 | idx.wordCdbFile, err = ioutil.TempFile(tempDir, "wordcdb") 54 | if err != nil { 55 | return nil, err 56 | } 57 | idx.wordMap = make(map[string]map[string]int) 58 | idx.WordSplit = Wordize 59 | idx.WordClean = IndexizeWord 60 | return idx, nil 61 | } 62 | 63 | // Add a document to the index - writes to temporary files and stores some data in memory while building the index. 64 | func (idx *Indexer) AddDoc(idoc IndexDoc) error { 65 | // add to docs 66 | docId := string(idoc.Id) 67 | writeTextLine(idx.docTxtFile, []byte(docId), idoc.StoreValue) 68 | words := append(idx.WordSplit(string(idoc.IndexValue)), idx.WordSplit(string(idoc.StoreValue))...) 69 | for _, word := range words { 70 | word = idx.WordClean(word) 71 | 72 | // skip if stop word 73 | if idx.StopWordCheck != nil { 74 | if idx.StopWordCheck(word) { 75 | continue 76 | } 77 | } 78 | 79 | // ensure nested map exists 80 | if idx.wordMap[word] == nil { 81 | idx.wordMap[word] = make(map[string]int) 82 | } 83 | // increment count by one for this combination 84 | c := idx.wordMap[word][docId] + 1 85 | idx.wordMap[word][docId] = c 86 | } 87 | return nil 88 | } 89 | 90 | // Builds a final single index file, which consists of some simple header info, 91 | // followed by the cdb binary files that comprise the full index. 92 | func (idx *Indexer) FinalizeAndWrite(w io.Writer) error { 93 | 94 | var buf bytes.Buffer 95 | 96 | // write out the word data 97 | for word, m := range idx.wordMap { 98 | enc := gob.NewEncoder(&buf) 99 | enc.Encode(m) 100 | writeTextLine(idx.wordTxtFile, []byte(word), buf.Bytes()) 101 | buf.Reset() 102 | } 103 | 104 | var err error 105 | 106 | idx.docTxtFile.Write([]byte("\n")) 107 | idx.wordTxtFile.Write([]byte("\n")) 108 | 109 | _, err = idx.docTxtFile.Seek(0, 0) 110 | if err != nil { 111 | return err 112 | } 113 | _, err = idx.wordTxtFile.Seek(0, 0) 114 | if err != nil { 115 | return err 116 | } 117 | 118 | // make cdb files 119 | err = cdb.Make(idx.docCdbFile, idx.docTxtFile) 120 | if err != nil { 121 | return err 122 | } 123 | err = cdb.Make(idx.wordCdbFile, idx.wordTxtFile) 124 | if err != nil { 125 | return err 126 | } 127 | 128 | // make sure the contents are all settled 129 | idx.docCdbFile.Sync() 130 | idx.wordCdbFile.Sync() 131 | _, err = idx.docCdbFile.Seek(0, 0) 132 | if err != nil { 133 | return err 134 | } 135 | _, err = idx.wordCdbFile.Seek(0, 0) 136 | if err != nil { 137 | return err 138 | } 139 | 140 | docStat, err := idx.docCdbFile.Stat() 141 | if err != nil { 142 | return err 143 | } 144 | wordStat, err := idx.wordCdbFile.Stat() 145 | if err != nil { 146 | return err 147 | } 148 | 149 | // now package it all up 150 | buf.Reset() 151 | enc := gob.NewEncoder(&buf) 152 | bhead := []int{int(docStat.Size()), int(wordStat.Size())} 153 | enc.Encode(bhead) 154 | 155 | // extend buffer to be HEADER_SIZE len 156 | bpadsize := HEADER_SIZE - buf.Len() 157 | buf.Write(make([]byte, bpadsize, bpadsize)) 158 | b := buf.Bytes() 159 | 160 | _, err = w.Write(b) 161 | if err != nil { 162 | return err 163 | } 164 | 165 | _, err = io.Copy(w, idx.docCdbFile) 166 | if err != nil { 167 | return 
err 168 | } 169 | _, err = io.Copy(w, idx.wordCdbFile) 170 | if err != nil { 171 | return err 172 | } 173 | 174 | return nil 175 | } 176 | 177 | // Dump some human readable status information 178 | func (idx *Indexer) DumpStatus(w io.Writer) { 179 | fmt.Fprintf(w, "files used:\n\t%s\n\t%s\n\t%s\n\t%s\n", idx.docTxtFile.Name(), idx.wordTxtFile.Name(), idx.docCdbFile.Name(), idx.wordCdbFile.Name()) 180 | // fmt.Fprintf(w, "wordMap: %+v\n", idx.wordMap) 181 | } 182 | 183 | // close and remove all resources 184 | func (idx *Indexer) Close() { 185 | syscall.Unlink(idx.docTxtFile.Name()) 186 | idx.docTxtFile.Close() 187 | syscall.Unlink(idx.wordTxtFile.Name()) 188 | idx.wordTxtFile.Close() 189 | syscall.Unlink(idx.docCdbFile.Name()) 190 | idx.docCdbFile.Close() 191 | syscall.Unlink(idx.wordCdbFile.Name()) 192 | idx.wordCdbFile.Close() 193 | idx.wordMap = nil 194 | } 195 | 196 | // Write a single line of data in cdb's text format 197 | func writeTextLine(w io.Writer, key []byte, data []byte) (err error) { 198 | _, err = fmt.Fprintf(w, "+%d,%d:%s->%s\n", len(key), len(data), key, data) 199 | return 200 | } 201 | -------------------------------------------------------------------------------- /indexer_test.go: -------------------------------------------------------------------------------- 1 | package fulltext 2 | 3 | import ( 4 | "fmt" 5 | "io/ioutil" 6 | "os" 7 | "path/filepath" 8 | re "regexp" 9 | "testing" 10 | ) 11 | 12 | func TestIndexer(t *testing.T) { 13 | fmt.Printf("TestIndexer\n") 14 | 15 | idx, err := NewIndexer("") 16 | if err != nil { 17 | panic(err) 18 | } 19 | 20 | idx.AddDoc(IndexDoc{Id: []byte(`blah1`), StoreValue: []byte(`store this`), IndexValue: []byte(`test of the emergency broadcast system`)}) 21 | idx.AddDoc(IndexDoc{Id: []byte(`blah2`), StoreValue: []byte(`store this stuff too, yeah store it`), IndexValue: []byte(`every good boy does fine`)}) 22 | idx.AddDoc(IndexDoc{Id: []byte(`blah3`), StoreValue: []byte(`more storage here`), IndexValue: []byte(`a taco in the hand is worth two in the truck`)}) 23 | 24 | idx.DumpStatus(os.Stdout) 25 | 26 | f, err := ioutil.TempFile("", "idxout") 27 | if err != nil { 28 | panic(err) 29 | } 30 | err = idx.FinalizeAndWrite(f) 31 | if err != nil { 32 | panic(err) 33 | } 34 | f.Close() 35 | 36 | fmt.Printf("Wrote index file: %s\n", f.Name()) 37 | 38 | } 39 | 40 | // A more extensive test - index the complete works of William Shakespeare 41 | func NoTestTheBardIndexing(t *testing.T) { 42 | 43 | fmt.Println("TestTheBardIndexing") 44 | 45 | idx, err := NewIndexer("") 46 | if err != nil { 47 | panic(err) 48 | } 49 | defer idx.Close() 50 | 51 | titlere := re.MustCompile("(?i)([^<]+)") 52 | 53 | n := 0 54 | 55 | filepath.Walk("testdata/shakespeare.mit.edu/", func(path string, f os.FileInfo, err error) error { 56 | if !f.IsDir() /*&& n < 5*/ { 57 | n++ 58 | fmt.Printf("indexing: %s\n", path) 59 | b, err := ioutil.ReadFile(path) 60 | if err != nil { 61 | panic(err) 62 | } 63 | title := string(titlere.Find(b)) 64 | body := HTMLStripTags(string(b)) 65 | doc := IndexDoc{ 66 | Id: []byte(path), 67 | StoreValue: []byte(title), 68 | IndexValue: []byte(title + " " + title + " " + body), 69 | } 70 | idx.AddDoc(doc) 71 | } 72 | return nil 73 | }) 74 | 75 | // idx.DebugDump(os.Stdout) 76 | 77 | fmt.Println("Writing final index...") 78 | f, err := ioutil.TempFile("", "idxout") 79 | if err != nil { 80 | panic(err) 81 | } 82 | err = idx.FinalizeAndWrite(f) 83 | if err != nil { 84 | panic(err) 85 | } 86 | f.Close() 87 | 88 | fmt.Printf("Wrote index file: %s\n", 
f.Name()) 89 | 90 | } 91 | -------------------------------------------------------------------------------- /searcher.go: -------------------------------------------------------------------------------- 1 | package fulltext 2 | 3 | import ( 4 | "bytes" 5 | "encoding/gob" 6 | "github.com/jbarham/go-cdb" 7 | "io" 8 | "io/ioutil" 9 | "os" 10 | "sort" 11 | ) 12 | 13 | // Interface for search. Not thread-safe, but low overhead 14 | // so having a separate one per thread should be workable. 15 | type Searcher struct { 16 | file *os.File 17 | docCdb *cdb.Cdb 18 | wordCdb *cdb.Cdb 19 | } 20 | 21 | // Wraps a ReaderAt and adjusts (tweaks) it's offset by the specified amount 22 | type tweakedReaderAt struct { 23 | readerAt io.ReaderAt 24 | tweak int64 25 | } 26 | 27 | func (t *tweakedReaderAt) ReadAt(p []byte, off int64) (n int, err error) { 28 | n, err = t.readerAt.ReadAt(p, off+t.tweak) 29 | return 30 | } 31 | 32 | // A single item in a search result 33 | type SearchResultItem struct { 34 | Id []byte // id of this item (document) 35 | StoreValue []byte // the stored value of this document 36 | Score int64 // the total score 37 | } 38 | 39 | // Implement sort.Interface 40 | type SearchResultItems []SearchResultItem 41 | 42 | func (s SearchResultItems) Len() int { return len(s) } 43 | func (s SearchResultItems) Swap(i, j int) { s[i], s[j] = s[j], s[i] } 44 | func (s SearchResultItems) Less(i, j int) bool { 45 | // if same score, then sort by raw bytes comparison of store value - 46 | // so we get consistently ordered results, even when score is same 47 | if s[i].Score == s[j].Score { 48 | return bytes.Compare(s[i].Id, s[j].Id) < 0 49 | } 50 | return s[i].Score < s[j].Score 51 | } 52 | 53 | // What happened during the search 54 | type SearchResults struct { 55 | Items SearchResultItems 56 | } 57 | 58 | // Make a new searcher using the file at the specified path 59 | // TODO: Make a variation that accepts a ReaderAt 60 | func NewSearcher(fpath string) (*Searcher, error) { 61 | 62 | s := &Searcher{} 63 | 64 | f, err := os.Open(fpath) 65 | if err != nil { 66 | return s, err 67 | } 68 | s.file = f 69 | 70 | // write out the word data 71 | dec := gob.NewDecoder(f) 72 | lens := make([]int64, 2, 2) 73 | dec.Decode(&lens) 74 | 75 | s.docCdb = cdb.New(&tweakedReaderAt{f, HEADER_SIZE}) 76 | s.wordCdb = cdb.New(&tweakedReaderAt{f, HEADER_SIZE + lens[0]}) 77 | 78 | return s, nil 79 | } 80 | 81 | // Close and release resources 82 | func (s *Searcher) Close() error { 83 | s.docCdb = nil 84 | s.wordCdb = nil 85 | return s.file.Close() 86 | } 87 | 88 | // Perform a search 89 | func (s *Searcher) SimpleSearch(search string, maxn int) (SearchResults, error) { 90 | 91 | sr := SearchResults{} 92 | 93 | // break search into words_word 94 | searchWords := Wordize(search) 95 | 96 | itemMap := make(map[string]SearchResultItem) 97 | 98 | // read word data for each word that was provided 99 | for _, w := range searchWords { 100 | w = IndexizeWord(w) 101 | // find the docs for this word 102 | mapGob, err := s.wordCdb.Find([]byte(w)) 103 | if err == io.EOF { 104 | continue 105 | } 106 | if err != nil { 107 | return sr, err 108 | } 109 | 110 | m := make(map[string]int) 111 | 112 | dec := gob.NewDecoder(mapGob) 113 | err = dec.Decode(&m) 114 | if err != nil { 115 | return sr, err 116 | } 117 | 118 | // for each doc, increase score 119 | for docId, cnt := range m { 120 | sri := itemMap[docId] 121 | if sri.Score < 1 { 122 | sri.Id = []byte(docId) 123 | } 124 | sri.Score += int64(cnt) 125 | itemMap[docId] = sri 126 | } 127 | 128 | } 
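// At this point itemMap holds, for each matching document, the sum of the raw occurrence counts of every query word in that document; ranking below uses that raw sum (no tf-idf weighting or document-length normalization).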
129 | 130 | // convert to slice 131 | items := make(SearchResultItems, 0, maxn) 132 | for _, item := range itemMap { 133 | items = append(items, item) 134 | } 135 | 136 | // sort by score descending 137 | sort.Sort(sort.Reverse(items)) 138 | 139 | // limit to maxn 140 | if len(items) > maxn { 141 | items = items[:maxn] 142 | } 143 | 144 | // pull document contents from doc cdb 145 | for i := range items { 146 | item := &items[i] 147 | v, err := s.docCdb.Find(item.Id) 148 | if err == io.EOF { 149 | panic("doc id " + string(item.Id) + " not found in index, this should never happen") 150 | } 151 | if err != nil { 152 | return sr, err 153 | } 154 | v1, err := ioutil.ReadAll(v) 155 | if err != nil { 156 | return sr, err 157 | } 158 | item.StoreValue = v1 159 | } 160 | 161 | sr.Items = items 162 | 163 | return sr, nil 164 | 165 | } 166 | -------------------------------------------------------------------------------- /searcher_test.go: -------------------------------------------------------------------------------- 1 | package fulltext 2 | 3 | import ( 4 | "archive/zip" 5 | "bytes" 6 | "fmt" 7 | "html/template" 8 | "io" 9 | "io/ioutil" 10 | "net" 11 | "net/http" 12 | "os" 13 | "path/filepath" 14 | re "regexp" 15 | "strconv" 16 | "strings" 17 | "testing" 18 | "time" 19 | ) 20 | 21 | // Extract a single file from a zip and return it's contents 22 | func zipExtract(zfpath string, fpath string) ([]byte, error) { 23 | 24 | zr, err := zip.OpenReader(zfpath) 25 | if err != nil { 26 | return nil, err 27 | } 28 | defer zr.Close() 29 | 30 | fpath = strings.Trim(filepath.Clean(filepath.ToSlash(fpath)), "/") 31 | 32 | for _, f := range zr.File { 33 | 34 | fn := strings.Trim(filepath.Clean(filepath.ToSlash(f.Name)), "/") 35 | 36 | // keep going until we find it 37 | if fn != fpath { 38 | continue 39 | } 40 | 41 | rc, err := f.Open() 42 | if err != nil { 43 | panic(err) 44 | } 45 | b, err := ioutil.ReadAll(rc) 46 | if err != nil { 47 | return nil, err 48 | } 49 | rc.Close() 50 | 51 | return b, nil 52 | 53 | } 54 | 55 | return nil, io.EOF 56 | 57 | } 58 | 59 | // Index and search the complete works of William Shakespeare 60 | func TestTheBardSearch(t *testing.T) { 61 | 62 | fmt.Println("TestTheBardIndexing") 63 | 64 | idx, err := NewIndexer("") 65 | if err != nil { 66 | panic(err) 67 | } 68 | defer idx.Close() 69 | 70 | // use English stop words 71 | idx.StopWordCheck = EnglishStopWordChecker 72 | 73 | titlere := re.MustCompile("(?i)([^<]+)") 74 | 75 | zr, err := zip.OpenReader("testdata/shakespeare.mit.edu.zip") 76 | if err != nil { 77 | panic(err) 78 | } 79 | defer zr.Close() 80 | 81 | for _, f := range zr.File { 82 | fmt.Printf("indexing: %s\n", f.Name) 83 | 84 | rc, err := f.Open() 85 | if err != nil { 86 | panic(err) 87 | } 88 | b, err := ioutil.ReadAll(rc) 89 | if err != nil { 90 | panic(err) 91 | } 92 | 93 | // extract title tag 94 | tret := titlere.FindSubmatch(b) 95 | title := "" 96 | if len(tret) > 1 { 97 | title = strings.TrimSpace(string(tret[1])) 98 | } 99 | 100 | // strip html from entire doc and get text 101 | body := HTMLStripTags(string(b)) 102 | 103 | // make a doc out of it 104 | doc := IndexDoc{ 105 | Id: []byte(f.Name), 106 | StoreValue: []byte(title), 107 | IndexValue: []byte(title + " " + title + " " + body), 108 | } 109 | idx.AddDoc(doc) 110 | 111 | rc.Close() 112 | } 113 | 114 | fmt.Println("Writing final index...") 115 | f, err := ioutil.TempFile("", "idxout") 116 | if err != nil { 117 | panic(err) 118 | } 119 | err = idx.FinalizeAndWrite(f) 120 | if err != nil { 121 | panic(err) 122 
| } 123 | 124 | fmt.Println("Debug data: \n") 125 | idx.DumpStatus(os.Stdout) 126 | 127 | // panic("DONE") 128 | 129 | f.Close() 130 | 131 | fmt.Printf("Wrote index file: %s\n", f.Name()) 132 | 133 | ///////////////////////////////// 134 | 135 | start := time.Now() 136 | 137 | s, err := NewSearcher(f.Name()) 138 | if err != nil { 139 | panic(err) 140 | } 141 | 142 | fmt.Printf("Opening searcher took: %s\n", time.Since(start).String()) 143 | 144 | start = time.Now() 145 | 146 | sr, err := s.SimpleSearch("king", 20) 147 | if err != nil { 148 | panic(err) 149 | } 150 | 151 | if len(sr.Items) == 0 { 152 | t.Fatalf("Search for 'king' returned 0 results, but should have gotten something") 153 | } 154 | 155 | fmt.Printf("Searching took: %s\n", time.Since(start).String()) 156 | 157 | fmt.Printf("Total Results for 'king': %d\n", len(sr.Items)) 158 | for k, v := range sr.Items { 159 | fmt.Printf("----------- #:%d\n", k) 160 | fmt.Printf("Id: %s\n", v.Id) 161 | fmt.Printf("Score: %d\n", v.Score) 162 | fmt.Printf("StoreValue: %s\n", v.StoreValue) 163 | } 164 | 165 | fmt.Printf("Raw dump: %+v\n", sr) 166 | 167 | // look for a stop word and make sure it's not there 168 | 169 | sr, err = s.SimpleSearch("the", 20) 170 | if err != nil { 171 | panic(err) 172 | } 173 | if len(sr.Items) != 0 { 174 | t.Fatalf("Search for 'the' returned %d results when it should have been 0 because it's a stop word", len(sr.Items)) 175 | } 176 | fmt.Printf("Check for stop word passed\n") 177 | 178 | /////////////////////////////////////////////////// 179 | 180 | fmt.Printf("Starting Shakespeare's very own search interface at :1414 ...") 181 | 182 | ln, err := net.Listen("tcp", ":1414") 183 | if err != nil { 184 | panic(err) 185 | } 186 | 187 | timeoutStr := os.Getenv("SEARCHER_WEB_TIMEOUT_SECONDS") 188 | 189 | timeout, err := strconv.Atoi(timeoutStr) 190 | if err != nil { 191 | timeout = 10 192 | } 193 | 194 | zfpath := "testdata/shakespeare.mit.edu.zip" 195 | 196 | // wait for specified time 197 | go func() { time.Sleep(time.Duration(timeout) * time.Second); ln.Close() }() 198 | 199 | // main request handler 200 | err = http.Serve(ln, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 201 | 202 | // home page redirect 203 | if r.URL.Path == "/" || r.URL.Path == "/Shakespeare" { 204 | http.Redirect(w, r, "/shakespeare.mit.edu/index.html", 302) 205 | return 206 | } 207 | 208 | // handle search result page 209 | if r.URL.Path == "/searchresults.html" { 210 | 211 | w.Header().Set("Content-type", "text/html") 212 | 213 | q := r.FormValue("q") 214 | 215 | // do search 216 | sr, err := s.SimpleSearch(q, 20) 217 | if err != nil { 218 | panic(err) 219 | } 220 | 221 | // render results page 222 | sres, err := ioutil.ReadFile("testdata/searchresults.html") 223 | if err != nil { 224 | panic(err) 225 | } 226 | t := template.Must(template.New("main").Parse(string(sres))) 227 | var buf bytes.Buffer 228 | t.Execute(&buf, &map[string]interface{}{ 229 | "q": q, 230 | "sr": sr, 231 | }) 232 | sresbytes := buf.Bytes() 233 | 234 | w.Write(sresbytes) 235 | 236 | return 237 | } 238 | 239 | // by default look through zip file 240 | b, err := zipExtract(zfpath, r.URL.Path) 241 | if err != nil { 242 | http.Error(w, "File not found", 404) 243 | } 244 | if strings.HasSuffix(r.URL.Path, ".css") { 245 | w.Header().Set("Content-type", "text/css") 246 | } 247 | if strings.HasSuffix(r.URL.Path, ".gif") { 248 | w.Header().Set("Content-type", "image/gif") 249 | } 250 | if strings.HasSuffix(r.URL.Path, ".jpg") { 251 | w.Header().Set("Content-type", 
"image/jpeg") 252 | } 253 | 254 | // for html files we inject a search box 255 | if strings.HasSuffix(r.URL.Path, ".html") { 256 | w.Header().Set("Content-type", "text/html") 257 | 258 | // render search form 259 | sf, err := ioutil.ReadFile("testdata/searchform.html") 260 | if err != nil { 261 | panic(err) 262 | } 263 | t := template.Must(template.New("main").Parse(string(sf))) 264 | var buf bytes.Buffer 265 | t.Execute(&buf, r.FormValue("q")) 266 | sfbytes := buf.Bytes() 267 | 268 | // inject into page 269 | 270 | pagebytes := re.MustCompile("(]*>)").ReplaceAllLiteral(b, []byte(""+string(sfbytes))) 271 | w.Write(pagebytes) 272 | return 273 | 274 | } 275 | 276 | w.Write(b) 277 | 278 | })) 279 | 280 | if err != nil { 281 | fmt.Printf("err from listen: %s\n", err) 282 | } 283 | 284 | s.Close() 285 | 286 | } 287 | -------------------------------------------------------------------------------- /stopwords.go: -------------------------------------------------------------------------------- 1 | package fulltext 2 | 3 | var EnglishStopWordChecker = func(s string) bool { 4 | return STOPWORDS_EN[s] 5 | } 6 | 7 | // English stop words 8 | var STOPWORDS_EN = map[string]bool{ 9 | "a": true, 10 | "about": true, 11 | "above": true, 12 | "after": true, 13 | "again": true, 14 | "against": true, 15 | "all": true, 16 | "am": true, 17 | "an": true, 18 | "and": true, 19 | "any": true, 20 | "are": true, 21 | "aren't": true, 22 | "as": true, 23 | "at": true, 24 | "be": true, 25 | "because": true, 26 | "been": true, 27 | "before": true, 28 | "being": true, 29 | "below": true, 30 | "between": true, 31 | "both": true, 32 | "but": true, 33 | "by": true, 34 | "can't": true, 35 | "cannot": true, 36 | "could": true, 37 | "couldn't": true, 38 | "did": true, 39 | "didn't": true, 40 | "do": true, 41 | "does": true, 42 | "doesn't": true, 43 | "doing": true, 44 | "don't": true, 45 | "down": true, 46 | "during": true, 47 | "each": true, 48 | "few": true, 49 | "for": true, 50 | "from": true, 51 | "further": true, 52 | "had": true, 53 | "hadn't": true, 54 | "has": true, 55 | "hasn't": true, 56 | "have": true, 57 | "haven't": true, 58 | "having": true, 59 | "he": true, 60 | "he'd": true, 61 | "he'll": true, 62 | "he's": true, 63 | "her": true, 64 | "here": true, 65 | "here's": true, 66 | "hers": true, 67 | "herself": true, 68 | "him": true, 69 | "himself": true, 70 | "his": true, 71 | "how": true, 72 | "how's": true, 73 | "i": true, 74 | "i'd": true, 75 | "i'll": true, 76 | "i'm": true, 77 | "i've": true, 78 | "if": true, 79 | "in": true, 80 | "into": true, 81 | "is": true, 82 | "isn't": true, 83 | "it": true, 84 | "it's": true, 85 | "its": true, 86 | "itself": true, 87 | "let's": true, 88 | "me": true, 89 | "more": true, 90 | "most": true, 91 | "mustn't": true, 92 | "my": true, 93 | "myself": true, 94 | "no": true, 95 | "nor": true, 96 | "not": true, 97 | "of": true, 98 | "off": true, 99 | "on": true, 100 | "once": true, 101 | "only": true, 102 | "or": true, 103 | "other": true, 104 | "ought": true, 105 | "our": true, 106 | "ours ourselves": true, 107 | "out": true, 108 | "over": true, 109 | "own": true, 110 | "same": true, 111 | "shan't": true, 112 | "she": true, 113 | "she'd": true, 114 | "she'll": true, 115 | "she's": true, 116 | "should": true, 117 | "shouldn't": true, 118 | "so": true, 119 | "some": true, 120 | "such": true, 121 | "than": true, 122 | "that": true, 123 | "that's": true, 124 | "the": true, 125 | "their": true, 126 | "theirs": true, 127 | "them": true, 128 | "themselves": true, 129 | "then": true, 130 | 
"there": true, 131 | "there's": true, 132 | "these": true, 133 | "they": true, 134 | "they'd": true, 135 | "they'll": true, 136 | "they're": true, 137 | "they've": true, 138 | "this": true, 139 | "those": true, 140 | "through": true, 141 | "to": true, 142 | "too": true, 143 | "under": true, 144 | "until": true, 145 | "up": true, 146 | "very": true, 147 | "was": true, 148 | "wasn't": true, 149 | "we": true, 150 | "we'd": true, 151 | "we'll": true, 152 | "we're": true, 153 | "we've": true, 154 | "were": true, 155 | "weren't": true, 156 | "what": true, 157 | "what's": true, 158 | "when": true, 159 | "when's": true, 160 | "where": true, 161 | "where's": true, 162 | "which": true, 163 | "while": true, 164 | "who": true, 165 | "who's": true, 166 | "whom": true, 167 | "why": true, 168 | "why's": true, 169 | "with": true, 170 | "won't": true, 171 | "would": true, 172 | "wouldn't": true, 173 | "you": true, 174 | "you'd": true, 175 | "you'll": true, 176 | "you're": true, 177 | "you've": true, 178 | "your": true, 179 | "yours": true, 180 | "yourself": true, 181 | "yourselves": true, 182 | } 183 | -------------------------------------------------------------------------------- /testdata/searchform.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 |
6 | -------------------------------------------------------------------------------- /testdata/searchresults.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Search Results 4 | 5 | 6 | 7 |
8 | Search Again: 9 | 10 |
11 | 12 |
13 | 14 | Search Results: 15 | 16 | [return home]
17 | 18 | {{range .sr.Items}} 19 | 20 |
21 | {{printf "%s" .StoreValue}} (Score: {{printf "%d" .Score}}) 22 |
  23 |
24 | 25 | {{else}} 26 | 27 | No results found. 28 | Now is the winter of our discontent. 29 | 30 | {{end}} 31 | 32 |
33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /testdata/shakespeare.mit.edu.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bradleypeabody/fulltext/a28063e64b3da56602233d36cd7e9a147f41b0ac/testdata/shakespeare.mit.edu.zip -------------------------------------------------------------------------------- /util.go: -------------------------------------------------------------------------------- 1 | package fulltext 2 | 3 | import ( 4 | "bytes" 5 | "html/template" 6 | re "regexp" 7 | "strings" 8 | ) 9 | 10 | var wordizeRe *re.Regexp 11 | 12 | func init() { 13 | wordizeRe = re.MustCompile("[\\s,.;:!?[\\]()'\"]+") 14 | } 15 | 16 | type WordSplitter func(string) []string 17 | 18 | // Split a string up into words 19 | func Wordize(t string) []string { 20 | return wordizeRe.Split(t, -1) 21 | } 22 | 23 | type WordCleaner func(string) string 24 | 25 | // Make word appropriate for indexing 26 | func IndexizeWord(w string) string { 27 | return strings.TrimSpace(strings.ToLower(w)) 28 | } 29 | 30 | type StopWordChecker func(string) bool
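// Example (illustrative): Indexer.AddDoc applies WordSplit (Wordize by default),
// then WordClean (IndexizeWord), then the optional StopWordCheck, so with
// EnglishStopWordChecker installed:
//
//	for _, w := range Wordize("To be, or not to be") {
//		w = IndexizeWord(w) // "to", "be", "or", "not", "to", "be"
//		if EnglishStopWordChecker(w) {
//			continue // every word here is a stop word, so nothing gets indexed
//		}
//	}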
31 | 32 | // This function copied from here: https://github.com/kennygrant/sanitize/blob/master/sanitize.go 33 | // License is: https://github.com/kennygrant/sanitize/blob/master/License-BSD.txt 34 | // Strip html tags, replace common entities, and escape <>&;'" in the result. 35 | // Note the returned text may contain entities as it is escaped by HTMLEscapeString, 36 | // and most entities are not translated. 37 | func HTMLStripTags(s string) (output string) { 38 | 39 | output = "" 40 | 41 | // Shortcut strings with no tags in them 42 | if !strings.ContainsAny(s, "<>") { 43 | output = s 44 | } else { 45 | 46 | // First remove line breaks etc as these have no meaning outside html tags (except pre) 47 | // this means pre sections will lose formatting... but will result in less unintentional paras. 48 | s = strings.Replace(s, "\n", "", -1) 49 | 50 | // Then replace line breaks with newlines, to preserve that formatting 51 | s = strings.Replace(s, "</p>", "\n", -1) 52 | s = strings.Replace(s, "<br>", "\n", -1) 53 | s = strings.Replace(s, "<br/>", "\n", -1) 54 | 55 | // Walk through the string removing all tags 56 | b := bytes.NewBufferString("") 57 | inTag := false 58 | for _, r := range s { 59 | switch r { 60 | case '<': 61 | inTag = true 62 | case '>': 63 | inTag = false 64 | default: 65 | if !inTag { 66 | b.WriteRune(r) 67 | } 68 | } 69 | } 70 | output = b.String() 71 | } 72 | 73 | // In case we have missed any tags above, escape the text - removes <, >, &, ' and ". 74 | output = template.HTMLEscapeString(output) 75 | 76 | // Remove a few common harmless entities, to arrive at something more like plain text 77 | // This relies on having removed *all* tags above 78 | output = strings.Replace(output, "&nbsp;", " ", -1) 79 | output = strings.Replace(output, "&quot;", "\"", -1) 80 | output = strings.Replace(output, "&apos;", "'", -1) 81 | output = strings.Replace(output, "&#34;", "\"", -1) 82 | output = strings.Replace(output, "&#39;", "'", -1) 83 | // NB spaces here are significant - we only allow & not part of entity 84 | output = strings.Replace(output, "&amp; ", "& ", -1) 85 | output = strings.Replace(output, "&amp;amp; ", "& ", -1) 86 | 87 | return output 88 | } 89 | 90 | var titlere *re.Regexp 91 | var descre *re.Regexp 92 | 93 | func init() { 94 | titlere = re.MustCompile("(?i)<title>([^<]+)</title>") 95 | descre = re.MustCompile("(?i)<meta\\s+name=\"description\"\\s+content=\"([^\"]*)\"") 96 | } 97 | 98 | // Helper to extract an HTML title from the title tag 99 | func HTMLExtractTitle(html string) string { 100 | tret := titlere.FindSubmatch([]byte(html)) 101 | title := "" 102 | if len(tret) > 1 { 103 | title = strings.TrimSpace(string(tret[1])) 104 | } 105 | return title 106 | } 107 | 108 | // Helper to extract an HTML description from the meta[name=description] tag 109 | func HTMLExtractDescription(html string) string { 110 | tret := descre.FindSubmatch([]byte(html)) 111 | desc := "" 112 | if len(tret) > 1 { 113 | desc = strings.TrimSpace(string(tret[1])) 114 | } 115 | return desc 116 | } 117 | --------------------------------------------------------------------------------