├── .gitignore
├── LICENSE
├── README.md
├── fulltext.go
├── indexer.go
├── indexer_test.go
├── searcher.go
├── searcher_test.go
├── stopwords.go
├── testdata
├── searchform.html
├── searchresults.html
└── shakespeare.mit.edu.zip
└── util.go
/.gitignore:
--------------------------------------------------------------------------------
1 | # Compiled Object files, Static and Dynamic libs (Shared Objects)
2 | *.o
3 | *.a
4 | *.so
5 |
6 | # Folders
7 | _obj
8 | _test
9 |
10 | # Architecture specific extensions/prefixes
11 | *.[568vq]
12 | [568vq].out
13 |
14 | *.cgo1.go
15 | *.cgo2.c
16 | _cgo_defun.c
17 | _cgo_gotypes.go
18 | _cgo_export.*
19 |
20 | _testmain.go
21 |
22 | *.exe
23 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2013-2014 Brad Peabody
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of
6 | this software and associated documentation files (the "Software"), to deal in
7 | the Software without restriction, including without limitation the rights to
8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
9 | the Software, and to permit persons to whom the Software is furnished to do so,
10 | subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Overview
2 | ========
3 |
4 | This is a simple, pure-Go, full text indexing and search library.
5 |
6 | I made it for use on small to medium websites, although there is nothing web-specific about its API or operation.
7 |
8 | Cdb (http://github.com/jbarham/go-cdb) is used to perform the indexing and lookups.
9 |
10 | Status
11 | ------
12 |
13 | This project is more or less stable.
14 |
15 | Notes on Building
16 | --------
17 |
18 | fulltext requires CDB:
19 |
20 | go get github.com/jbarham/go-cdb
21 |
22 | Usage
23 | ------
24 |
25 | First, you must create an index. Like this:
26 |
27 | import "github.com/bradleypeabody/fulltext"
28 |
29 | // create new index with temp dir (usually "" is fine)
30 | idx, err := fulltext.NewIndexer(""); if err != nil { panic(err) }
31 | defer idx.Close()
32 |
33 | // provide stop words if desired
34 | idx.StopWordCheck = fulltext.EnglishStopWordChecker
35 |
36 | // for each document you want to add, you do something like this:
37 | doc := fulltext.IndexDoc{
38 | Id: []byte(uuid), // unique identifier (the path to a webpage works...)
39 | StoreValue: []byte(title), // bytes you want to be able to retrieve from search results
40 | IndexValue: []byte(data), // bytes you want to be split into words and indexed
41 | }
42 | idx.AddDoc(doc) // add it
43 |
44 | // when done, write out to final index
45 | err = idx.FinalizeAndWrite(f); if err != nil { panic(err) }
46 |
47 | Once you have an index file, you can search it like this:
48 |
49 | s, err := fulltext.NewSearcher("/path/to/index/file"); if err != nil { panic(err) }
50 | defer s.Close()
51 | sr, err := s.SimpleSearch("Horatio", 20); if err != nil { panic(err) }
52 | for k, v := range sr.Items {
53 | fmt.Printf("----------- #:%d\n", k)
54 | fmt.Printf("Id: %s\n", v.Id)
55 | fmt.Printf("Score: %d\n", v.Score)
56 | fmt.Printf("StoreValue: %s\n", v.StoreValue)
57 | }
58 |
59 | It's rather simplistic. But it's fast and it works.
60 |
61 | Thoughts in Comparison to blevesearch
62 | -------------------------------------
63 |
64 | I wrote this project before blevesearch was released. I've done a number of implementations now of website search engines using fulltext and also a number of others using blevesearch. My general experience has been that blevesearch is better suited for projects where you are really doing significant development on your search results and need the ability to customize things for various locales, etc. Fulltext on the other hand is much simpler and is better for projects that either a) have simpler search requirements or b) prefer speed of indexing over quality of results.
65 |
66 | Adding a fulltext search engine to a website with a few hundred pages is a simple task and the indexing is fast enough that you can just run it as part of your pre-publish build process. So while there is a lot more development on blevesearch happening - and hats off to them, it's a great product - fulltext still seems to have its place for these simpler scenarios.
67 |
68 | TODOs
69 | -----
70 |
71 | * ~~Will likely need some sort of "stop word" functionality.~~
72 |
73 | * ~~Wordize(), IndexizeWord()~~ and the scoring aggregation logic should be extracted to callback functions with the existing functionality as default.
74 |
75 | * The search logic is currently very naive. Ideally this project would have something as sophisticated as Lucene's query parser. But in reality what I'll likely do is a simple survey of which common features are actually used on any on-site search engines I can get my hands on. Quoting ("black cat"), and logical operators (Jim OR James) would likely be at the top of the list and implementing that sort of thing would be higher priority than trying to duplicate Lucene.
76 |
77 | I've considered using boltdb for storage as an alternative to CDB, but I haven't found the time to work on it. This approach would provide the ability to update the index, reduce memory consumption during index building, and potentially allow for wildcard suffixes.
78 |
79 | Implementation Notes
80 | --------------------
81 |
82 | I originally tried doing this on top of Sqlite. It was dreadfully slow. Cdb is orders of magnitude faster.
83 |
84 | Two main disadvantages from going the Cdb route are that the index cannot be edited once it is built (you have to recreate it in full), and since it's hash-based it will not support any sort of fuzzy matching unless those variations are included in the index (which they are not, in the current implementation.) For my purposes these two disadvantages are overshadowed by the fact that it's blinding fast, easy to use, portable (pure-Go), and its interface allowed me to build the indexes I needed into a single file.
85 |
86 | In the test suite is included a copy of the complete works of William Shakespeare (thanks to Jeremy Hylton's http://shakespeare.mit.edu/) and this library is used to create a simple search engine on top of that corpus. By default it only runs for 10 seconds, but you can run it for longer by doing something like:
87 |
88 | SEARCHER_WEB_TIMEOUT_SECONDS=120 go test fulltext -v
89 |
90 | Works on Windows.
91 |
92 | Future Work
93 | -----------
94 |
95 | It might be feasible to supplant this project with something using suffix arrays ( http://golang.org/pkg/index/suffixarray/ ). The main down side would be the requirement of a lot more storage space (and memory to load and search it). Retooling the index/suffixarray package so it can work against the disk is an idea, but is not necessarily simple. The upside of an approach like that would be full regex support for searches with decent performance - which would rock. The index could potentially be sharded by the first character or two of the search - but that's still not as good as something with sensible caching where the whole set can be kept on disk and the "hot" parts cached in memory, etc.
96 |
--------------------------------------------------------------------------------
/fulltext.go:
--------------------------------------------------------------------------------
1 | /*
2 |
3 | A simple cross-platform, full-text search engine, backed by cdb.
4 | Intended for use on small- to medium-sized websites.
5 |
6 | See README.md for usage.
7 |
8 | */
9 | package fulltext
10 |
--------------------------------------------------------------------------------
/indexer.go:
--------------------------------------------------------------------------------
1 | package fulltext
2 |
3 | import (
4 | "bytes"
5 | "encoding/gob"
6 | "fmt"
7 | "github.com/jbarham/go-cdb"
8 | "io"
9 | "io/ioutil"
10 | "os"
11 | "syscall"
12 | )
13 |
// Size of header block to prepend to the final index file - make it 4k to
// align the cdb data that follows for disk reads.
const HEADER_SIZE = 4096

// Produces a set of cdb files from a series of AddDoc() calls.
// Intermediate data is written to temp files in cdb's text format and
// kept in memory (wordMap) until FinalizeAndWrite packages everything
// into a single index file.
type Indexer struct {
	docTxtFile    *os.File                  // cdb text-format lines: docId -> StoreValue
	wordTxtFile   *os.File                  // cdb text-format lines: word -> gob-encoded docId->count map
	docCdbFile    *os.File                  // binary cdb built from docTxtFile
	wordCdbFile   *os.File                  // binary cdb built from wordTxtFile
	wordMap       map[string]map[string]int // map of [word][docId]count
	WordSplit     WordSplitter              // splits raw text into words (default: Wordize)
	WordClean     WordCleaner               // normalizes each word before indexing (default: IndexizeWord)
	StopWordCheck StopWordChecker           // optional; returning true skips the word
}

// Contents of a single document to be indexed
type IndexDoc struct {
	Id         []byte // the id, this is usually the path to the document
	IndexValue []byte // index this data
	StoreValue []byte // store this data
}
35 |
36 | // Creates a new indexer, using the given temp dir while building
37 | // the index.
38 | func NewIndexer(tempDir string) (*Indexer, error) {
39 | idx := &Indexer{}
40 | var err error
41 | idx.docTxtFile, err = ioutil.TempFile(tempDir, "doctmp")
42 | if err != nil {
43 | return nil, err
44 | }
45 | idx.wordTxtFile, err = ioutil.TempFile(tempDir, "wordtmp")
46 | if err != nil {
47 | return nil, err
48 | }
49 | idx.docCdbFile, err = ioutil.TempFile(tempDir, "doccdb")
50 | if err != nil {
51 | return nil, err
52 | }
53 | idx.wordCdbFile, err = ioutil.TempFile(tempDir, "wordcdb")
54 | if err != nil {
55 | return nil, err
56 | }
57 | idx.wordMap = make(map[string]map[string]int)
58 | idx.WordSplit = Wordize
59 | idx.WordClean = IndexizeWord
60 | return idx, nil
61 | }
62 |
63 | // Add a document to the index - writes to temporary files and stores some data in memory while building the index.
64 | func (idx *Indexer) AddDoc(idoc IndexDoc) error {
65 | // add to docs
66 | docId := string(idoc.Id)
67 | writeTextLine(idx.docTxtFile, []byte(docId), idoc.StoreValue)
68 | words := append(idx.WordSplit(string(idoc.IndexValue)), idx.WordSplit(string(idoc.StoreValue))...)
69 | for _, word := range words {
70 | word = idx.WordClean(word)
71 |
72 | // skip if stop word
73 | if idx.StopWordCheck != nil {
74 | if idx.StopWordCheck(word) {
75 | continue
76 | }
77 | }
78 |
79 | // ensure nested map exists
80 | if idx.wordMap[word] == nil {
81 | idx.wordMap[word] = make(map[string]int)
82 | }
83 | // increment count by one for this combination
84 | c := idx.wordMap[word][docId] + 1
85 | idx.wordMap[word][docId] = c
86 | }
87 | return nil
88 | }
89 |
90 | // Builds a final single index file, which consists of some simple header info,
91 | // followed by the cdb binary files that comprise the full index.
92 | func (idx *Indexer) FinalizeAndWrite(w io.Writer) error {
93 |
94 | var buf bytes.Buffer
95 |
96 | // write out the word data
97 | for word, m := range idx.wordMap {
98 | enc := gob.NewEncoder(&buf)
99 | enc.Encode(m)
100 | writeTextLine(idx.wordTxtFile, []byte(word), buf.Bytes())
101 | buf.Reset()
102 | }
103 |
104 | var err error
105 |
106 | idx.docTxtFile.Write([]byte("\n"))
107 | idx.wordTxtFile.Write([]byte("\n"))
108 |
109 | _, err = idx.docTxtFile.Seek(0, 0)
110 | if err != nil {
111 | return err
112 | }
113 | _, err = idx.wordTxtFile.Seek(0, 0)
114 | if err != nil {
115 | return err
116 | }
117 |
118 | // make cdb files
119 | err = cdb.Make(idx.docCdbFile, idx.docTxtFile)
120 | if err != nil {
121 | return err
122 | }
123 | err = cdb.Make(idx.wordCdbFile, idx.wordTxtFile)
124 | if err != nil {
125 | return err
126 | }
127 |
128 | // make sure the contents are all settled
129 | idx.docCdbFile.Sync()
130 | idx.wordCdbFile.Sync()
131 | _, err = idx.docCdbFile.Seek(0, 0)
132 | if err != nil {
133 | return err
134 | }
135 | _, err = idx.wordCdbFile.Seek(0, 0)
136 | if err != nil {
137 | return err
138 | }
139 |
140 | docStat, err := idx.docCdbFile.Stat()
141 | if err != nil {
142 | return err
143 | }
144 | wordStat, err := idx.wordCdbFile.Stat()
145 | if err != nil {
146 | return err
147 | }
148 |
149 | // now package it all up
150 | buf.Reset()
151 | enc := gob.NewEncoder(&buf)
152 | bhead := []int{int(docStat.Size()), int(wordStat.Size())}
153 | enc.Encode(bhead)
154 |
155 | // extend buffer to be HEADER_SIZE len
156 | bpadsize := HEADER_SIZE - buf.Len()
157 | buf.Write(make([]byte, bpadsize, bpadsize))
158 | b := buf.Bytes()
159 |
160 | _, err = w.Write(b)
161 | if err != nil {
162 | return err
163 | }
164 |
165 | _, err = io.Copy(w, idx.docCdbFile)
166 | if err != nil {
167 | return err
168 | }
169 | _, err = io.Copy(w, idx.wordCdbFile)
170 | if err != nil {
171 | return err
172 | }
173 |
174 | return nil
175 | }
176 |
177 | // Dump some human readable status information
178 | func (idx *Indexer) DumpStatus(w io.Writer) {
179 | fmt.Fprintf(w, "files used:\n\t%s\n\t%s\n\t%s\n\t%s\n", idx.docTxtFile.Name(), idx.wordTxtFile.Name(), idx.docCdbFile.Name(), idx.wordCdbFile.Name())
180 | // fmt.Fprintf(w, "wordMap: %+v\n", idx.wordMap)
181 | }
182 |
183 | // close and remove all resources
184 | func (idx *Indexer) Close() {
185 | syscall.Unlink(idx.docTxtFile.Name())
186 | idx.docTxtFile.Close()
187 | syscall.Unlink(idx.wordTxtFile.Name())
188 | idx.wordTxtFile.Close()
189 | syscall.Unlink(idx.docCdbFile.Name())
190 | idx.docCdbFile.Close()
191 | syscall.Unlink(idx.wordCdbFile.Name())
192 | idx.wordCdbFile.Close()
193 | idx.wordMap = nil
194 | }
195 |
196 | // Write a single line of data in cdb's text format
197 | func writeTextLine(w io.Writer, key []byte, data []byte) (err error) {
198 | _, err = fmt.Fprintf(w, "+%d,%d:%s->%s\n", len(key), len(data), key, data)
199 | return
200 | }
201 |
--------------------------------------------------------------------------------
/indexer_test.go:
--------------------------------------------------------------------------------
1 | package fulltext
2 |
3 | import (
4 | "fmt"
5 | "io/ioutil"
6 | "os"
7 | "path/filepath"
8 | re "regexp"
9 | "testing"
10 | )
11 |
12 | func TestIndexer(t *testing.T) {
13 | fmt.Printf("TestIndexer\n")
14 |
15 | idx, err := NewIndexer("")
16 | if err != nil {
17 | panic(err)
18 | }
19 |
20 | idx.AddDoc(IndexDoc{Id: []byte(`blah1`), StoreValue: []byte(`store this`), IndexValue: []byte(`test of the emergency broadcast system`)})
21 | idx.AddDoc(IndexDoc{Id: []byte(`blah2`), StoreValue: []byte(`store this stuff too, yeah store it`), IndexValue: []byte(`every good boy does fine`)})
22 | idx.AddDoc(IndexDoc{Id: []byte(`blah3`), StoreValue: []byte(`more storage here`), IndexValue: []byte(`a taco in the hand is worth two in the truck`)})
23 |
24 | idx.DumpStatus(os.Stdout)
25 |
26 | f, err := ioutil.TempFile("", "idxout")
27 | if err != nil {
28 | panic(err)
29 | }
30 | err = idx.FinalizeAndWrite(f)
31 | if err != nil {
32 | panic(err)
33 | }
34 | f.Close()
35 |
36 | fmt.Printf("Wrote index file: %s\n", f.Name())
37 |
38 | }
39 |
40 | // A more extensive test - index the complete works of William Shakespeare
41 | func NoTestTheBardIndexing(t *testing.T) {
42 |
43 | fmt.Println("TestTheBardIndexing")
44 |
45 | idx, err := NewIndexer("")
46 | if err != nil {
47 | panic(err)
48 | }
49 | defer idx.Close()
50 |
51 | titlere := re.MustCompile("(?i)
([^<]+)")
52 |
53 | n := 0
54 |
55 | filepath.Walk("testdata/shakespeare.mit.edu/", func(path string, f os.FileInfo, err error) error {
56 | if !f.IsDir() /*&& n < 5*/ {
57 | n++
58 | fmt.Printf("indexing: %s\n", path)
59 | b, err := ioutil.ReadFile(path)
60 | if err != nil {
61 | panic(err)
62 | }
63 | title := string(titlere.Find(b))
64 | body := HTMLStripTags(string(b))
65 | doc := IndexDoc{
66 | Id: []byte(path),
67 | StoreValue: []byte(title),
68 | IndexValue: []byte(title + " " + title + " " + body),
69 | }
70 | idx.AddDoc(doc)
71 | }
72 | return nil
73 | })
74 |
75 | // idx.DebugDump(os.Stdout)
76 |
77 | fmt.Println("Writing final index...")
78 | f, err := ioutil.TempFile("", "idxout")
79 | if err != nil {
80 | panic(err)
81 | }
82 | err = idx.FinalizeAndWrite(f)
83 | if err != nil {
84 | panic(err)
85 | }
86 | f.Close()
87 |
88 | fmt.Printf("Wrote index file: %s\n", f.Name())
89 |
90 | }
91 |
--------------------------------------------------------------------------------
/searcher.go:
--------------------------------------------------------------------------------
1 | package fulltext
2 |
3 | import (
4 | "bytes"
5 | "encoding/gob"
6 | "github.com/jbarham/go-cdb"
7 | "io"
8 | "io/ioutil"
9 | "os"
10 | "sort"
11 | )
12 |
// Interface for search. Not thread-safe, but low overhead
// so having a separate one per thread should be workable.
type Searcher struct {
	file    *os.File // the single index file produced by Indexer.FinalizeAndWrite
	docCdb  *cdb.Cdb // docId -> stored value
	wordCdb *cdb.Cdb // word -> gob-encoded map of docId -> count
}

// Wraps a ReaderAt and adjusts (tweaks) its offset by the specified
// amount, so a cdb section embedded mid-file can be read as if it
// started at offset 0.
type tweakedReaderAt struct {
	readerAt io.ReaderAt
	tweak    int64 // bytes added to every requested offset
}

// ReadAt implements io.ReaderAt, shifting the requested offset by tweak.
func (t *tweakedReaderAt) ReadAt(p []byte, off int64) (n int, err error) {
	n, err = t.readerAt.ReadAt(p, off+t.tweak)
	return
}
31 |
// A single item in a search result
type SearchResultItem struct {
	Id         []byte // id of this item (document)
	StoreValue []byte // the stored value of this document
	Score      int64  // the total score
}

// Implement sort.Interface, ordering by ascending score
type SearchResultItems []SearchResultItem

func (s SearchResultItems) Len() int      { return len(s) }
func (s SearchResultItems) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
func (s SearchResultItems) Less(i, j int) bool {
	// if same score, then fall back to a raw bytes comparison of Id -
	// so we get consistently ordered results, even when score is same
	// (NOTE: the comparison is on Id, not StoreValue)
	if s[i].Score == s[j].Score {
		return bytes.Compare(s[i].Id, s[j].Id) < 0
	}
	return s[i].Score < s[j].Score
}

// What happened during the search
type SearchResults struct {
	Items SearchResultItems
}
57 |
58 | // Make a new searcher using the file at the specified path
59 | // TODO: Make a variation that accepts a ReaderAt
60 | func NewSearcher(fpath string) (*Searcher, error) {
61 |
62 | s := &Searcher{}
63 |
64 | f, err := os.Open(fpath)
65 | if err != nil {
66 | return s, err
67 | }
68 | s.file = f
69 |
70 | // write out the word data
71 | dec := gob.NewDecoder(f)
72 | lens := make([]int64, 2, 2)
73 | dec.Decode(&lens)
74 |
75 | s.docCdb = cdb.New(&tweakedReaderAt{f, HEADER_SIZE})
76 | s.wordCdb = cdb.New(&tweakedReaderAt{f, HEADER_SIZE + lens[0]})
77 |
78 | return s, nil
79 | }
80 |
81 | // Close and release resources
82 | func (s *Searcher) Close() error {
83 | s.docCdb = nil
84 | s.wordCdb = nil
85 | return s.file.Close()
86 | }
87 |
// Perform a search.  The query is split into words with the same
// Wordize/IndexizeWord pipeline used at index time; each word is looked
// up in the word index and the per-document counts are summed into
// scores (a naive OR query).  The top maxn documents by descending score
// are returned with their stored values attached.
func (s *Searcher) SimpleSearch(search string, maxn int) (SearchResults, error) {

	sr := SearchResults{}

	// break the search query into words
	searchWords := Wordize(search)

	// accumulates one result item per matching document, keyed by docId
	itemMap := make(map[string]SearchResultItem)

	// read word data for each word that was provided
	for _, w := range searchWords {
		w = IndexizeWord(w)
		// find the docs for this word; cdb reports "key not found" as io.EOF
		mapGob, err := s.wordCdb.Find([]byte(w))
		if err == io.EOF {
			continue
		}
		if err != nil {
			return sr, err
		}

		// gob-encoded map of docId -> occurrence count for this word
		m := make(map[string]int)

		dec := gob.NewDecoder(mapGob)
		err = dec.Decode(&m)
		if err != nil {
			return sr, err
		}

		// for each doc, increase score; Id is set the first time the doc
		// is seen (a zero-value item has Score 0)
		for docId, cnt := range m {
			sri := itemMap[docId]
			if sri.Score < 1 {
				sri.Id = []byte(docId)
			}
			sri.Score += int64(cnt)
			itemMap[docId] = sri
		}

	}

	// convert to slice
	items := make(SearchResultItems, 0, maxn)
	for _, item := range itemMap {
		items = append(items, item)
	}

	// sort by score descending
	sort.Sort(sort.Reverse(items))

	// limit to maxn
	if len(items) > maxn {
		items = items[:maxn]
	}

	// pull document contents from doc cdb
	for i := range items {
		item := &items[i]
		v, err := s.docCdb.Find(item.Id)
		if err == io.EOF {
			// every scored docId came out of the word index, so it must
			// also exist in the doc index
			panic("doc id " + string(item.Id) + " not found in index, this should never happen")
		}
		if err != nil {
			return sr, err
		}
		v1, err := ioutil.ReadAll(v)
		if err != nil {
			return sr, err
		}
		item.StoreValue = v1
	}

	sr.Items = items

	return sr, nil

}
166 |
--------------------------------------------------------------------------------
/searcher_test.go:
--------------------------------------------------------------------------------
1 | package fulltext
2 |
3 | import (
4 | "archive/zip"
5 | "bytes"
6 | "fmt"
7 | "html/template"
8 | "io"
9 | "io/ioutil"
10 | "net"
11 | "net/http"
12 | "os"
13 | "path/filepath"
14 | re "regexp"
15 | "strconv"
16 | "strings"
17 | "testing"
18 | "time"
19 | )
20 |
21 | // Extract a single file from a zip and return it's contents
22 | func zipExtract(zfpath string, fpath string) ([]byte, error) {
23 |
24 | zr, err := zip.OpenReader(zfpath)
25 | if err != nil {
26 | return nil, err
27 | }
28 | defer zr.Close()
29 |
30 | fpath = strings.Trim(filepath.Clean(filepath.ToSlash(fpath)), "/")
31 |
32 | for _, f := range zr.File {
33 |
34 | fn := strings.Trim(filepath.Clean(filepath.ToSlash(f.Name)), "/")
35 |
36 | // keep going until we find it
37 | if fn != fpath {
38 | continue
39 | }
40 |
41 | rc, err := f.Open()
42 | if err != nil {
43 | panic(err)
44 | }
45 | b, err := ioutil.ReadAll(rc)
46 | if err != nil {
47 | return nil, err
48 | }
49 | rc.Close()
50 |
51 | return b, nil
52 |
53 | }
54 |
55 | return nil, io.EOF
56 |
57 | }
58 |
59 | // Index and search the complete works of William Shakespeare
60 | func TestTheBardSearch(t *testing.T) {
61 |
62 | fmt.Println("TestTheBardIndexing")
63 |
64 | idx, err := NewIndexer("")
65 | if err != nil {
66 | panic(err)
67 | }
68 | defer idx.Close()
69 |
70 | // use English stop words
71 | idx.StopWordCheck = EnglishStopWordChecker
72 |
73 | titlere := re.MustCompile("(?i)([^<]+)")
74 |
75 | zr, err := zip.OpenReader("testdata/shakespeare.mit.edu.zip")
76 | if err != nil {
77 | panic(err)
78 | }
79 | defer zr.Close()
80 |
81 | for _, f := range zr.File {
82 | fmt.Printf("indexing: %s\n", f.Name)
83 |
84 | rc, err := f.Open()
85 | if err != nil {
86 | panic(err)
87 | }
88 | b, err := ioutil.ReadAll(rc)
89 | if err != nil {
90 | panic(err)
91 | }
92 |
93 | // extract title tag
94 | tret := titlere.FindSubmatch(b)
95 | title := ""
96 | if len(tret) > 1 {
97 | title = strings.TrimSpace(string(tret[1]))
98 | }
99 |
100 | // strip html from entire doc and get text
101 | body := HTMLStripTags(string(b))
102 |
103 | // make a doc out of it
104 | doc := IndexDoc{
105 | Id: []byte(f.Name),
106 | StoreValue: []byte(title),
107 | IndexValue: []byte(title + " " + title + " " + body),
108 | }
109 | idx.AddDoc(doc)
110 |
111 | rc.Close()
112 | }
113 |
114 | fmt.Println("Writing final index...")
115 | f, err := ioutil.TempFile("", "idxout")
116 | if err != nil {
117 | panic(err)
118 | }
119 | err = idx.FinalizeAndWrite(f)
120 | if err != nil {
121 | panic(err)
122 | }
123 |
124 | fmt.Println("Debug data: \n")
125 | idx.DumpStatus(os.Stdout)
126 |
127 | // panic("DONE")
128 |
129 | f.Close()
130 |
131 | fmt.Printf("Wrote index file: %s\n", f.Name())
132 |
133 | /////////////////////////////////
134 |
135 | start := time.Now()
136 |
137 | s, err := NewSearcher(f.Name())
138 | if err != nil {
139 | panic(err)
140 | }
141 |
142 | fmt.Printf("Opening searcher took: %s\n", time.Since(start).String())
143 |
144 | start = time.Now()
145 |
146 | sr, err := s.SimpleSearch("king", 20)
147 | if err != nil {
148 | panic(err)
149 | }
150 |
151 | if len(sr.Items) == 0 {
152 | t.Fatalf("Search for 'king' returned 0 results, but should have gotten something")
153 | }
154 |
155 | fmt.Printf("Searching took: %s\n", time.Since(start).String())
156 |
157 | fmt.Printf("Total Results for 'king': %d\n", len(sr.Items))
158 | for k, v := range sr.Items {
159 | fmt.Printf("----------- #:%d\n", k)
160 | fmt.Printf("Id: %s\n", v.Id)
161 | fmt.Printf("Score: %d\n", v.Score)
162 | fmt.Printf("StoreValue: %s\n", v.StoreValue)
163 | }
164 |
165 | fmt.Printf("Raw dump: %+v\n", sr)
166 |
167 | // look for a stop word and make sure it's not there
168 |
169 | sr, err = s.SimpleSearch("the", 20)
170 | if err != nil {
171 | panic(err)
172 | }
173 | if len(sr.Items) != 0 {
174 | t.Fatalf("Search for 'the' returned %d results when it should have been 0 because it's a stop word", len(sr.Items))
175 | }
176 | fmt.Printf("Check for stop word passed\n")
177 |
178 | ///////////////////////////////////////////////////
179 |
180 | fmt.Printf("Starting Shakespeare's very own search interface at :1414 ...")
181 |
182 | ln, err := net.Listen("tcp", ":1414")
183 | if err != nil {
184 | panic(err)
185 | }
186 |
187 | timeoutStr := os.Getenv("SEARCHER_WEB_TIMEOUT_SECONDS")
188 |
189 | timeout, err := strconv.Atoi(timeoutStr)
190 | if err != nil {
191 | timeout = 10
192 | }
193 |
194 | zfpath := "testdata/shakespeare.mit.edu.zip"
195 |
196 | // wait for specified time
197 | go func() { time.Sleep(time.Duration(timeout) * time.Second); ln.Close() }()
198 |
199 | // main request handler
200 | err = http.Serve(ln, http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
201 |
202 | // home page redirect
203 | if r.URL.Path == "/" || r.URL.Path == "/Shakespeare" {
204 | http.Redirect(w, r, "/shakespeare.mit.edu/index.html", 302)
205 | return
206 | }
207 |
208 | // handle search result page
209 | if r.URL.Path == "/searchresults.html" {
210 |
211 | w.Header().Set("Content-type", "text/html")
212 |
213 | q := r.FormValue("q")
214 |
215 | // do search
216 | sr, err := s.SimpleSearch(q, 20)
217 | if err != nil {
218 | panic(err)
219 | }
220 |
221 | // render results page
222 | sres, err := ioutil.ReadFile("testdata/searchresults.html")
223 | if err != nil {
224 | panic(err)
225 | }
226 | t := template.Must(template.New("main").Parse(string(sres)))
227 | var buf bytes.Buffer
228 | t.Execute(&buf, &map[string]interface{}{
229 | "q": q,
230 | "sr": sr,
231 | })
232 | sresbytes := buf.Bytes()
233 |
234 | w.Write(sresbytes)
235 |
236 | return
237 | }
238 |
239 | // by default look through zip file
240 | b, err := zipExtract(zfpath, r.URL.Path)
241 | if err != nil {
242 | http.Error(w, "File not found", 404)
243 | }
244 | if strings.HasSuffix(r.URL.Path, ".css") {
245 | w.Header().Set("Content-type", "text/css")
246 | }
247 | if strings.HasSuffix(r.URL.Path, ".gif") {
248 | w.Header().Set("Content-type", "image/gif")
249 | }
250 | if strings.HasSuffix(r.URL.Path, ".jpg") {
251 | w.Header().Set("Content-type", "image/jpeg")
252 | }
253 |
254 | // for html files we inject a search box
255 | if strings.HasSuffix(r.URL.Path, ".html") {
256 | w.Header().Set("Content-type", "text/html")
257 |
258 | // render search form
259 | sf, err := ioutil.ReadFile("testdata/searchform.html")
260 | if err != nil {
261 | panic(err)
262 | }
263 | t := template.Must(template.New("main").Parse(string(sf)))
264 | var buf bytes.Buffer
265 | t.Execute(&buf, r.FormValue("q"))
266 | sfbytes := buf.Bytes()
267 |
268 | // inject into page
269 |
270 | pagebytes := re.MustCompile("(]*>)").ReplaceAllLiteral(b, []byte(""+string(sfbytes)))
271 | w.Write(pagebytes)
272 | return
273 |
274 | }
275 |
276 | w.Write(b)
277 |
278 | }))
279 |
280 | if err != nil {
281 | fmt.Printf("err from listen: %s\n", err)
282 | }
283 |
284 | s.Close()
285 |
286 | }
287 |
--------------------------------------------------------------------------------
/stopwords.go:
--------------------------------------------------------------------------------
1 | package fulltext
2 |
// EnglishStopWordChecker reports whether s is an English stop word.
// Assign it to Indexer.StopWordCheck to exclude common English words
// from the index.
var EnglishStopWordChecker = func(s string) bool {
	return STOPWORDS_EN[s]
}

// English stop words.
// FIX: the former entry "ours ourselves" was a single malformed key, so
// neither "ours" nor "ourselves" was actually treated as a stop word.
var STOPWORDS_EN = map[string]bool{
	"a":          true,
	"about":      true,
	"above":      true,
	"after":      true,
	"again":      true,
	"against":    true,
	"all":        true,
	"am":         true,
	"an":         true,
	"and":        true,
	"any":        true,
	"are":        true,
	"aren't":     true,
	"as":         true,
	"at":         true,
	"be":         true,
	"because":    true,
	"been":       true,
	"before":     true,
	"being":      true,
	"below":      true,
	"between":    true,
	"both":       true,
	"but":        true,
	"by":         true,
	"can't":      true,
	"cannot":     true,
	"could":      true,
	"couldn't":   true,
	"did":        true,
	"didn't":     true,
	"do":         true,
	"does":       true,
	"doesn't":    true,
	"doing":      true,
	"don't":      true,
	"down":       true,
	"during":     true,
	"each":       true,
	"few":        true,
	"for":        true,
	"from":       true,
	"further":    true,
	"had":        true,
	"hadn't":     true,
	"has":        true,
	"hasn't":     true,
	"have":       true,
	"haven't":    true,
	"having":     true,
	"he":         true,
	"he'd":       true,
	"he'll":      true,
	"he's":       true,
	"her":        true,
	"here":       true,
	"here's":     true,
	"hers":       true,
	"herself":    true,
	"him":        true,
	"himself":    true,
	"his":        true,
	"how":        true,
	"how's":      true,
	"i":          true,
	"i'd":        true,
	"i'll":       true,
	"i'm":        true,
	"i've":       true,
	"if":         true,
	"in":         true,
	"into":       true,
	"is":         true,
	"isn't":      true,
	"it":         true,
	"it's":       true,
	"its":        true,
	"itself":     true,
	"let's":      true,
	"me":         true,
	"more":       true,
	"most":       true,
	"mustn't":    true,
	"my":         true,
	"myself":     true,
	"no":         true,
	"nor":        true,
	"not":        true,
	"of":         true,
	"off":        true,
	"on":         true,
	"once":       true,
	"only":       true,
	"or":         true,
	"other":      true,
	"ought":      true,
	"our":        true,
	"ours":       true,
	"ourselves":  true,
	"out":        true,
	"over":       true,
	"own":        true,
	"same":       true,
	"shan't":     true,
	"she":        true,
	"she'd":      true,
	"she'll":     true,
	"she's":      true,
	"should":     true,
	"shouldn't":  true,
	"so":         true,
	"some":       true,
	"such":       true,
	"than":       true,
	"that":       true,
	"that's":     true,
	"the":        true,
	"their":      true,
	"theirs":     true,
	"them":       true,
	"themselves": true,
	"then":       true,
	"there":      true,
	"there's":    true,
	"these":      true,
	"they":       true,
	"they'd":     true,
	"they'll":    true,
	"they're":    true,
	"they've":    true,
	"this":       true,
	"those":      true,
	"through":    true,
	"to":         true,
	"too":        true,
	"under":      true,
	"until":      true,
	"up":         true,
	"very":       true,
	"was":        true,
	"wasn't":     true,
	"we":         true,
	"we'd":       true,
	"we'll":      true,
	"we're":      true,
	"we've":      true,
	"were":       true,
	"weren't":    true,
	"what":       true,
	"what's":     true,
	"when":       true,
	"when's":     true,
	"where":      true,
	"where's":    true,
	"which":      true,
	"while":      true,
	"who":        true,
	"who's":      true,
	"whom":       true,
	"why":        true,
	"why's":      true,
	"with":       true,
	"won't":      true,
	"would":      true,
	"wouldn't":   true,
	"you":        true,
	"you'd":      true,
	"you'll":     true,
	"you're":     true,
	"you've":     true,
	"your":       true,
	"yours":      true,
	"yourself":   true,
	"yourselves": true,
}
183 |
--------------------------------------------------------------------------------
/testdata/searchform.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
6 |
--------------------------------------------------------------------------------
/testdata/searchresults.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | Search Results
4 |
5 |
6 |
7 |
11 |
12 |
13 |
14 |
Search Results:
15 |
16 |
17 |
18 | {{range .sr.Items}}
19 |
20 |
24 |
25 | {{else}}
26 |
27 | No results found.
28 | Now is the winter of our discontent.
29 |
30 | {{end}}
31 |
32 |
33 |
34 |
35 |
36 |
37 |
--------------------------------------------------------------------------------
/testdata/shakespeare.mit.edu.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bradleypeabody/fulltext/a28063e64b3da56602233d36cd7e9a147f41b0ac/testdata/shakespeare.mit.edu.zip
--------------------------------------------------------------------------------
/util.go:
--------------------------------------------------------------------------------
1 | package fulltext
2 |
3 | import (
4 | "bytes"
5 | "html/template"
6 | re "regexp"
7 | "strings"
8 | )
9 |
// wordizeRe matches runs of whitespace and common punctuation; it is the
// delimiter pattern Wordize splits on. Compiled once at package load —
// a direct var initializer replaces the former init() per Go idiom.
var wordizeRe = re.MustCompile(`[\s,.;:!?[\]()'"]+`)

// WordSplitter is the signature of a function that breaks text into words.
type WordSplitter func(string) []string

// Wordize splits a string up into words on whitespace and punctuation.
// Note: leading/trailing delimiters yield empty-string entries (standard
// regexp Split behavior); callers are expected to filter those out.
func Wordize(t string) []string {
	return wordizeRe.Split(t, -1)
}
22 |
// WordCleaner is the signature of a function that normalizes a word
// before it is added to the index.
type WordCleaner func(string) string

// IndexizeWord normalizes a word for indexing: surrounding whitespace is
// removed and the result is lowercased.
func IndexizeWord(w string) string {
	trimmed := strings.TrimSpace(w)
	return strings.ToLower(trimmed)
}

// StopWordChecker is the signature of a function reporting whether a word
// is a stop word (and should therefore be skipped by the indexer).
type StopWordChecker func(string) bool
31 |
// This function copied from here: https://github.com/kennygrant/sanitize/blob/master/sanitize.go
// License is: https://github.com/kennygrant/sanitize/blob/master/License-BSD.txt
// Strip html tags, replace common entities, and escape <>&;'" in the result.
// Note the returned text may contain entities as it is escaped by HTMLEscapeString,
// and most entities are not translated.
//
// NOTE(review): the HTML tag and entity string literals below had been
// destroyed by an extraction artifact (e.g. Replace(s, "", "\n", -1), which
// would insert a newline between every rune); they are restored here from
// the upstream sanitize.go cited above.
func HTMLStripTags(s string) (output string) {

	// Shortcut strings with no tags in them
	if !strings.ContainsAny(s, "<>") {
		output = s
	} else {

		// First remove line breaks etc as these have no meaning outside html tags (except pre)
		// this means pre sections will lose formatting... but will result in less unintentional paras.
		s = strings.Replace(s, "\n", "", -1)

		// Then replace line breaks with newlines, to preserve that formatting
		s = strings.Replace(s, "<br>", "\n", -1)
		s = strings.Replace(s, "<br/>", "\n", -1)
		s = strings.Replace(s, "</p>", "\n", -1)

		// Walk through the string removing all tags
		b := bytes.NewBufferString("")
		inTag := false
		for _, r := range s {
			switch r {
			case '<':
				inTag = true
			case '>':
				inTag = false
			default:
				if !inTag {
					b.WriteRune(r)
				}
			}
		}
		output = b.String()
	}

	// In case we have missed any tags above, escape the text - removes <, >, &, ' and ".
	output = template.HTMLEscapeString(output)

	// Remove a few common harmless entities, to arrive at something more like plain text
	// This relies on having removed *all* tags above
	output = strings.Replace(output, "&nbsp;", " ", -1)
	output = strings.Replace(output, "&quot;", "\"", -1)
	output = strings.Replace(output, "&apos;", "'", -1)
	output = strings.Replace(output, "&#34;", "\"", -1)
	output = strings.Replace(output, "&#39;", "'", -1)
	// NB spaces here are significant - we only allow & not part of entity
	output = strings.Replace(output, "&amp; ", "& ", -1)
	output = strings.Replace(output, "&amp;amp; ", "& ", -1)

	return output
}
89 |
// Regexps for pulling the title and meta-description out of an HTML
// document. Compiled once at package load via direct var initializers
// (replacing the former init()).
//
// NOTE(review): the original patterns were corrupted by an extraction
// artifact that stripped the HTML tags out of the string literals
// (descre had become "(?i)" with no capture group at all); these are
// reconstructed patterns — confirm against the documents being indexed.
var (
	titlere = re.MustCompile(`(?i)<title[^>]*>([^<]+)</title>`)
	descre  = re.MustCompile(`(?i)<meta[^>]+name=["']description["'][^>]+content=["']([^"']*)["']`)
)

// HTMLExtractTitle returns the trimmed contents of the first <title>
// element in html, or "" if none is found.
func HTMLExtractTitle(html string) string {
	m := titlere.FindStringSubmatch(html)
	if len(m) > 1 {
		return strings.TrimSpace(m[1])
	}
	return ""
}

// HTMLExtractDescription returns the trimmed content attribute of the
// first <meta name="description"> tag in html, or "" if none is found.
func HTMLExtractDescription(html string) string {
	m := descre.FindStringSubmatch(html)
	if len(m) > 1 {
		return strings.TrimSpace(m[1])
	}
	return ""
}
117 |
--------------------------------------------------------------------------------