├── .travis.yml ├── doc.go ├── tokenize_test.go ├── bayes_test.go ├── sparse_test.go ├── encoding_test.go ├── testutil.go ├── example_test.go ├── encoding.go ├── sparse.go ├── README.md ├── bayes.go ├── stopbytes.go └── tokenize.go /.travis.yml: -------------------------------------------------------------------------------- 1 | language: go 2 | script: go test -race -cpu 1,2,4 -v ./... -------------------------------------------------------------------------------- /doc.go: -------------------------------------------------------------------------------- 1 | // Multiclass naive Bayesian document classification. 2 | // 3 | // While multinomial Bayesian classification offers 4 | // one-of-many classification, multibayes offers tools 5 | // for many-of-many classification. The multibayes 6 | // library strives to offer efficient storage and 7 | // calculation of multiple Bayesian posterior classification 8 | // probabilities. 9 | package multibayes 10 | -------------------------------------------------------------------------------- /tokenize_test.go: -------------------------------------------------------------------------------- 1 | package multibayes 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/bmizerany/assert" 7 | ) 8 | 9 | func TestTokenizer(t *testing.T) { 10 | testdata := getTestData() 11 | 12 | tokenize, err := newTokenizer(&tokenizerConf{ 13 | NGramSize: 1, 14 | }) 15 | 16 | assert.Equalf(t, nil, err, "Error creating tokenizer: %v", err) 17 | 18 | for _, doc := range testdata { 19 | _ = tokenize.Parse(doc.Text) 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /bayes_test.go: -------------------------------------------------------------------------------- 1 | package multibayes 2 | 3 | import ( 4 | "github.com/bmizerany/assert" 5 | "testing" 6 | ) 7 | 8 | func TestPosterior(t *testing.T) { 9 | classifier := NewClassifier() 10 | classifier.MinClassSize = 0 11 | classifier.trainWithTestData() 12 | 13 | probs := classifier.Posterior("Aaron's dog has tons of fleas") 14 | 15 | assert.Equalf(t, len(classifier.Matrix.Classes), len(probs), "Posterior returned incorrect number of classes") 16 | } 17 | -------------------------------------------------------------------------------- /sparse_test.go: -------------------------------------------------------------------------------- 1 | package multibayes 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/bmizerany/assert" 7 | ) 8 | 9 | func TestSparseMatrix(t *testing.T) { 10 | testdata := getTestData() 11 | tokenizer, err := newTokenizer(&tokenizerConf{ 12 | NGramSize: 1, 13 | }) 14 | assert.Equalf(t, err, nil, "Error creating new tokenizer") 15 | 16 | sparse := newSparseMatrix() 17 | 18 | for _, document := range testdata { 19 | ngrams := tokenizer.Parse(document.Text) 20 | sparse.Add(ngrams, document.Classes) 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /encoding_test.go: -------------------------------------------------------------------------------- 1 | package multibayes 2 | 3 | import ( 4 | "github.com/bmizerany/assert" 5 | "testing" 6 | ) 7 | 8 | func TestClassifierJSON(t *testing.T) { 9 | classifier := NewClassifier() 10 | classifier.trainWithTestData() 11 | 12 | b, err := classifier.MarshalJSON() 13 | assert.Equalf(t, nil, err, "Error marshaling JSON: %v\n", err) 14 | 15 | newclass, err := NewClassifierFromJSON(b) 16 | assert.Equalf(t, nil, err, "Error unmarshaling JSON: %v\n", err) 17 | 18 | assert.Equalf(t, 5, 
len(newclass.Matrix.Tokens), "Incorrect token length") 19 | assert.Equalf(t, 2, len(newclass.Matrix.Classes), "Incorrect class length") 20 | } 21 | -------------------------------------------------------------------------------- /testutil.go: -------------------------------------------------------------------------------- 1 | package multibayes 2 | 3 | type document struct { 4 | Text string 5 | Classes []string 6 | } 7 | 8 | func getTestData() []document { 9 | 10 | documents := []document{ 11 | { 12 | Text: "My dog has fleas.", 13 | Classes: []string{"vet"}, 14 | }, 15 | { 16 | Text: "My cat has ebola.", 17 | Classes: []string{"vet", "cdc"}, 18 | }, 19 | { 20 | Text: "Aaron has ebola.", 21 | Classes: []string{"cdc"}, 22 | }, 23 | } 24 | 25 | return documents 26 | } 27 | 28 | func (c *Classifier) trainWithTestData() { 29 | testdata := getTestData() 30 | for _, document := range testdata { 31 | c.Add(document.Text, document.Classes) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /example_test.go: -------------------------------------------------------------------------------- 1 | package multibayes 2 | 3 | import ( 4 | "fmt" 5 | ) 6 | 7 | func Example() { 8 | documents := []struct { 9 | Text string 10 | Classes []string 11 | }{ 12 | { 13 | Text: "My dog has fleas.", 14 | Classes: []string{"vet"}, 15 | }, 16 | { 17 | Text: "My cat has ebola.", 18 | Classes: []string{"vet", "cdc"}, 19 | }, 20 | { 21 | Text: "Aaron has ebola.", 22 | Classes: []string{"cdc"}, 23 | }, 24 | } 25 | 26 | classifier := NewClassifier() 27 | classifier.MinClassSize = 0 28 | 29 | // train the classifier 30 | for _, document := range documents { 31 | classifier.Add(document.Text, document.Classes) 32 | } 33 | 34 | // predict new classes 35 | probs := classifier.Posterior("Aaron's dog has fleas.") 36 | fmt.Printf("Posterior Probabilities: vet: %.4f, cdc: %.4f\n", probs["vet"], probs["cdc"]) 37 | 38 | // Output: Posterior Probabilities: vet: 0.8571, cdc: 0.2727 39 | } 40 | -------------------------------------------------------------------------------- /encoding.go: -------------------------------------------------------------------------------- 1 | package multibayes 2 | 3 | import ( 4 | "encoding/json" 5 | "io/ioutil" 6 | ) 7 | 8 | type jsonableClassifier struct { 9 | Matrix *sparseMatrix `json:"matrix"` 10 | } 11 | 12 | func (c *Classifier) MarshalJSON() ([]byte, error) { 13 | return json.Marshal(&jsonableClassifier{c.Matrix}) 14 | } 15 | 16 | func (c *Classifier) UnmarshalJSON(buf []byte) error { 17 | j := jsonableClassifier{} 18 | 19 | err := json.Unmarshal(buf, &j) 20 | if err != nil { 21 | return err 22 | } 23 | 24 | *c = *NewClassifier() 25 | c.Matrix = j.Matrix 26 | 27 | return nil 28 | } 29 | 30 | // NewClassifierFromJSON initializes a new classifier from a JSON byte slice.
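//
// An illustrative round-trip using only this package's API -- the names
// below (b, restored) are placeholders, not part of the library:
//
//	b, err := classifier.MarshalJSON()
//	if err != nil {
//		return err
//	}
//	restored, err := NewClassifierFromJSON(b)
//	if err != nil {
//		return err
//	}
//	_ = restored // an equivalent, ready-to-use classifier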
31 | func NewClassifierFromJSON(buf []byte) (*Classifier, error) { 32 | classifier := &Classifier{} 33 | 34 | err := classifier.UnmarshalJSON(buf) 35 | if err != nil { 36 | return nil, err 37 | } 38 | 39 | return classifier, nil 40 | } 41 | // LoadClassifierFromFile initializes a new classifier from a JSON file on disk. 42 | func LoadClassifierFromFile(filename string) (*Classifier, error) { 43 | buf, err := ioutil.ReadFile(filename) 44 | if err != nil { 45 | return nil, err 46 | } 47 | 48 | return NewClassifierFromJSON(buf) 49 | } 50 | 51 | func (s *sparseColumn) MarshalJSON() ([]byte, error) { 52 | return json.Marshal(s.Data) 53 | } 54 | 55 | func (s *sparseColumn) UnmarshalJSON(buf []byte) error { 56 | var data []int 57 | 58 | err := json.Unmarshal(buf, &data) 59 | if err != nil { 60 | return err 61 | } 62 | 63 | s.Data = data 64 | 65 | return nil 66 | } 67 | -------------------------------------------------------------------------------- /sparse.go: -------------------------------------------------------------------------------- 1 | package multibayes 2 | 3 | type sparseMatrix struct { 4 | Tokens map[string]*sparseColumn `json:"tokens"` // maps each token to the indices of the rows (documents) that contain it 5 | Classes map[string]*sparseColumn `json:"classes"` // maps each class to the indices of the rows (documents) labeled with it 6 | N int `json:"n"` // number of rows currently in the matrix 7 | } 8 | 9 | type sparseColumn struct { 10 | Data []int `json:"data"` 11 | } 12 | 13 | func newSparseColumn() *sparseColumn { 14 | return &sparseColumn{ 15 | Data: make([]int, 0, 1000), 16 | } 17 | } 18 | 19 | func (s *sparseColumn) Add(index int) { 20 | s.Data = append(s.Data, index) 21 | } 22 | 23 | // Count returns the number of rows in which the column is set 24 | func (s *sparseColumn) Count() int { 25 | return len(s.Data) 26 | } 27 | 28 | // Expand converts the sparse column to a dense 0/1 vector of length n 29 | func (s *sparseColumn) Expand(n int) []float64 { 30 | expanded := make([]float64, n) 31 | for _, index := range s.Data { 32 | expanded[index] = 1.0 33 | } 34 | return expanded 35 | } 36 | 37 | func newSparseMatrix() *sparseMatrix { 38 | return &sparseMatrix{ 39 | Tokens: make(map[string]*sparseColumn), 40 | Classes: make(map[string]*sparseColumn), 41 | N: 0, 42 | } 43 | } 44 | 45 | func (s *sparseMatrix) Add(ngrams []ngram, classes []string) { 46 | if len(ngrams) == 0 || len(classes) == 0 { 47 | return 48 | } 49 | for _, class := range classes { 50 | if _, ok := s.Classes[class]; !ok { 51 | s.Classes[class] = newSparseColumn() 52 | } 53 | 54 | s.Classes[class].Add(s.N) 55 | } 56 | 57 | // add ngrams uniquely 58 | added := make(map[string]int) 59 | for _, ngram := range ngrams { 60 | gramString := ngram.String() 61 | if _, ok := s.Tokens[gramString]; !ok { 62 | s.Tokens[gramString] = newSparseColumn() 63 | } 64 | 65 | // only add the document index once for the ngram 66 | if _, ok := added[gramString]; !ok { 67 | added[gramString] = 1 68 | s.Tokens[gramString].Add(s.N) 69 | } 70 | } 71 | // increment the row counter 72 | s.N++ 73 | } 74 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Multibayes 2 | ========== 3 | 4 | [![Build Status](https://travis-ci.org/lytics/multibayes.svg?branch=master)](https://travis-ci.org/lytics/multibayes) [![GoDoc](https://godoc.org/github.com/lytics/multibayes?status.svg)](https://godoc.org/github.com/lytics/multibayes) 5 | 6 | Multiclass naive Bayesian document classification. 
7 | 8 | Often in document classification, a document may have more than one relevant classification -- a question on [stackoverflow](http://stackoverflow.com) might have tags "go", "map", and "interface". 9 | 10 | While multinomial Bayesian classification offers one-of-many classification, multibayes offers tools for many-of-many classification. The multibayes library strives to offer efficient storage and calculation of multiple Bayesian posterior classification probabilities. 11 | 12 | ## Usage 13 | 14 | A new classifier is created with the `NewClassifier` function and trained by adding documents and their classes via the `Add` method: 15 | 16 | ```go 17 | classifier.Add("A new document", []string{"class1", "class2"}) 18 | ``` 19 | 20 | Posterior probabilities for a new document are calculated by calling the `Posterior` method: 21 | 22 | ```go 23 | classifier.Posterior("Another new document") 24 | ``` 25 | 26 | A posterior probability is returned for each class observed in the training set. The user can then assign classes according to their own heuristics -- for example, by taking every class whose posterior probability exceeds 0.8. 27 | 28 | 29 | ## Example 30 | 31 | ```go 32 | documents := []struct { 33 | Text string 34 | Classes []string 35 | }{ 36 | { 37 | Text: "My dog has fleas.", 38 | Classes: []string{"vet"}, 39 | }, 40 | { 41 | Text: "My cat has ebola.", 42 | Classes: []string{"vet", "cdc"}, 43 | }, 44 | { 45 | Text: "Aaron has ebola.", 46 | Classes: []string{"cdc"}, 47 | }, 48 | } 49 | 50 | classifier := NewClassifier() 51 | classifier.MinClassSize = 0 52 | 53 | // train the classifier 54 | for _, document := range documents { 55 | classifier.Add(document.Text, document.Classes) 56 | } 57 | 58 | // predict new classes 59 | probs := classifier.Posterior("Aaron's dog has fleas.") 60 | fmt.Printf("Posterior Probabilities: vet: %.4f, cdc: %.4f\n", probs["vet"], probs["cdc"]) 61 | 62 | // Posterior Probabilities: vet: 0.8571, cdc: 0.2727 63 | ``` 64 | -------------------------------------------------------------------------------- /bayes.go: -------------------------------------------------------------------------------- 1 | package multibayes 2 | 3 | import ( 4 | "math" 5 | ) 6 | 7 | var ( 8 | smoother = 1 // Laplace smoothing constant 9 | defaultMinClassSize = 5 10 | ) 11 | 12 | type Classifier struct { 13 | Tokenizer *tokenizer `json:"-"` 14 | Matrix *sparseMatrix `json:"matrix"` 15 | MinClassSize int // classes with fewer training rows than this are skipped by Posterior 16 | } 17 | 18 | // NewClassifier creates a new multibayes classifier. 19 | func NewClassifier() *Classifier { 20 | tokenize, _ := newTokenizer(&tokenizerConf{ 21 | NGramSize: 1, 22 | }) 23 | 24 | sparse := newSparseMatrix() 25 | 26 | return &Classifier{ 27 | Tokenizer: tokenize, 28 | Matrix: sparse, 29 | MinClassSize: defaultMinClassSize, 30 | } 31 | } 32 | 33 | // Add trains the classifier on a new document and its classes. 34 | func (c *Classifier) Add(document string, classes []string) { 35 | ngrams := c.Tokenizer.Parse(document) 36 | c.Matrix.Add(ngrams, classes) 37 | } 38 | 39 | // Posterior calculates the posterior probability for a new document against 40 | // each class observed in the training set. 
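//
// Class assignment is left to the caller, who typically thresholds the
// returned map (see the README); the 0.8 cutoff and the names below are
// illustrative only:
//
//	probs := classifier.Posterior("Another new document")
//	var assigned []string
//	for class, p := range probs {
//		if p > 0.8 {
//			assigned = append(assigned, class)
//		}
//	}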
41 | func (c *Classifier) Posterior(document string) map[string]float64 { 42 | tokens := c.Tokenizer.Parse(document) 43 | predictions := make(map[string]float64) 44 | 45 | for class, classcolumn := range c.Matrix.Classes { 46 | if len(classcolumn.Data) < c.MinClassSize { 47 | continue 48 | } 49 | 50 | n := classcolumn.Count() 51 | smoothN := n + (smoother * 2) 52 | 53 | priors := []float64{ 54 | float64(n+smoother) / float64(c.Matrix.N+(smoother*2)), // P(C=Y) 55 | float64(c.Matrix.N-n+smoother) / float64(c.Matrix.N+(smoother*2)), // P(C=N) 56 | } 57 | 58 | loglikelihood := []float64{0.0, 0.0} // log-space accumulators for P(F|C=Y) and P(F|C=N) 59 | 60 | // check if each token is in our token sparse matrix 61 | for _, token := range tokens { 62 | if tokencolumn, ok := c.Matrix.Tokens[token.String()]; ok { 63 | // conditional probability the token occurs for the class 64 | joint := intersection(tokencolumn.Data, classcolumn.Data) 65 | conditional := float64(joint+smoother) / float64(smoothN) // P(F|C=Y) 66 | loglikelihood[0] += math.Log(conditional) 67 | 68 | // conditional probability the token occurs if the class doesn't apply 69 | not := len(tokencolumn.Data) - joint 70 | notconditional := float64(not+smoother) / float64(smoothN) // P(F|C=N) 71 | loglikelihood[1] += math.Log(notconditional) 72 | } 73 | } 74 | 75 | likelihood := []float64{ 76 | math.Exp(loglikelihood[0]), 77 | math.Exp(loglikelihood[1]), 78 | } 79 | 80 | prob := bayesRule(priors, likelihood) // P(C|F) 81 | predictions[class] = prob[0] 82 | } 83 | 84 | return predictions 85 | } 86 | 87 | func bayesRule(prior, likelihood []float64) []float64 { 88 | 89 | posterior := make([]float64, len(prior)) 90 | 91 | sum := 0.0 92 | for i := range prior { 93 | combined := prior[i] * likelihood[i] 94 | 95 | posterior[i] = combined 96 | sum += combined 97 | } 98 | 99 | // normalize so the posterior probabilities sum to one 100 | for i := range posterior { 101 | posterior[i] /= sum 102 | } 103 | 104 | return posterior 105 | } 106 | 107 | // intersection counts the elements that appear in both array1 and array2 108 | func intersection(array1, array2 []int) int { 109 | var count int 110 | for _, elem1 := range array1 { 111 | for _, elem2 := range array2 { 112 | if elem1 == elem2 { 113 | count++ 114 | break 115 | } 116 | } 117 | } 118 | return count 119 | } 120 | -------------------------------------------------------------------------------- /stopbytes.go: -------------------------------------------------------------------------------- 1 | package multibayes 2 | 3 | var ( 4 | stopbytes = [][]byte{ 5 | []byte(`i`), 6 | []byte(`me`), 7 | []byte(`my`), 8 | []byte(`myself`), 9 | []byte(`we`), 10 | []byte(`our`), 11 | []byte(`ours`), 12 | []byte(`ourselves`), 13 | []byte(`you`), 14 | []byte(`your`), 15 | []byte(`yours`), 16 | []byte(`yourself`), 17 | []byte(`yourselves`), 18 | []byte(`he`), 19 | []byte(`him`), 20 | []byte(`his`), 21 | []byte(`himself`), 22 | []byte(`she`), 23 | []byte(`her`), 24 | []byte(`hers`), 25 | []byte(`herself`), 26 | []byte(`it`), 27 | []byte(`its`), 28 | []byte(`itself`), 29 | []byte(`they`), 30 | []byte(`them`), 31 | []byte(`their`), 32 | []byte(`theirs`), 33 | []byte(`themselves`), 34 | []byte(`what`), 35 | []byte(`which`), 36 | []byte(`who`), 37 | []byte(`whom`), 38 | []byte(`this`), 39 | []byte(`that`), 40 | []byte(`these`), 41 | []byte(`those`), 42 | []byte(`am`), 43 | []byte(`is`), 44 | []byte(`are`), 45 | []byte(`was`), 46 | []byte(`were`), 47 | []byte(`be`), 48 | []byte(`been`), 49 | []byte(`being`), 50 | []byte(`have`), 51 | []byte(`has`), 52 | []byte(`had`), 53 | []byte(`having`), 54 | []byte(`do`), 55 | 
[]byte(`does`), 56 | []byte(`did`), 57 | []byte(`doing`), 58 | []byte(`would`), 59 | []byte(`should`), 60 | []byte(`could`), 61 | []byte(`ought`), 62 | []byte(`i'm`), 63 | []byte(`you're`), 64 | []byte(`he's`), 65 | []byte(`she's`), 66 | []byte(`it's`), 67 | []byte(`we're`), 68 | []byte(`they're`), 69 | []byte(`i've`), 70 | []byte(`you've`), 71 | []byte(`we've`), 72 | []byte(`they've`), 73 | []byte(`i'd`), 74 | []byte(`you'd`), 75 | []byte(`he'd`), 76 | []byte(`she'd`), 77 | []byte(`we'd`), 78 | []byte(`they'd`), 79 | []byte(`i'll`), 80 | []byte(`you'll`), 81 | []byte(`he'll`), 82 | []byte(`she'll`), 83 | []byte(`we'll`), 84 | []byte(`they'll`), 85 | []byte(`isn't`), 86 | []byte(`aren't`), 87 | []byte(`wasn't`), 88 | []byte(`weren't`), 89 | []byte(`hasn't`), 90 | []byte(`haven't`), 91 | []byte(`hadn't`), 92 | []byte(`doesn't`), 93 | []byte(`don't`), 94 | []byte(`didn't`), 95 | []byte(`won't`), 96 | []byte(`wouldn't`), 97 | []byte(`shan't`), 98 | []byte(`shouldn't`), 99 | []byte(`can't`), 100 | []byte(`cannot`), 101 | []byte(`couldn't`), 102 | []byte(`mustn't`), 103 | []byte(`let's`), 104 | []byte(`that's`), 105 | []byte(`who's`), 106 | []byte(`what's`), 107 | []byte(`here's`), 108 | []byte(`there's`), 109 | []byte(`when's`), 110 | []byte(`where's`), 111 | []byte(`why's`), 112 | []byte(`how's`), 113 | []byte(`a`), 114 | []byte(`an`), 115 | []byte(`the`), 116 | []byte(`and`), 117 | []byte(`but`), 118 | []byte(`if`), 119 | []byte(`or`), 120 | []byte(`because`), 121 | []byte(`as`), 122 | []byte(`until`), 123 | []byte(`while`), 124 | []byte(`of`), 125 | []byte(`at`), 126 | []byte(`by`), 127 | []byte(`for`), 128 | []byte(`with`), 129 | []byte(`about`), 130 | []byte(`against`), 131 | []byte(`between`), 132 | []byte(`into`), 133 | []byte(`through`), 134 | []byte(`during`), 135 | []byte(`before`), 136 | []byte(`after`), 137 | []byte(`above`), 138 | []byte(`below`), 139 | []byte(`to`), 140 | []byte(`from`), 141 | []byte(`up`), 142 | []byte(`down`), 143 | []byte(`in`), 144 | []byte(`out`), 145 | []byte(`on`), 146 | []byte(`off`), 147 | []byte(`over`), 148 | []byte(`under`), 149 | []byte(`again`), 150 | []byte(`further`), 151 | []byte(`then`), 152 | []byte(`once`), 153 | []byte(`here`), 154 | []byte(`there`), 155 | []byte(`when`), 156 | []byte(`where`), 157 | []byte(`why`), 158 | []byte(`how`), 159 | []byte(`all`), 160 | []byte(`any`), 161 | []byte(`both`), 162 | []byte(`each`), 163 | []byte(`few`), 164 | []byte(`more`), 165 | []byte(`most`), 166 | []byte(`other`), 167 | []byte(`some`), 168 | []byte(`such`), 169 | []byte(`no`), 170 | []byte(`nor`), 171 | []byte(`not`), 172 | []byte(`only`), 173 | []byte(`own`), 174 | []byte(`same`), 175 | []byte(`so`), 176 | []byte(`than`), 177 | []byte(`too`), 178 | []byte(`very`), 179 | []byte(`-`), 180 | } 181 | ) 182 | -------------------------------------------------------------------------------- /tokenize.go: -------------------------------------------------------------------------------- 1 | package multibayes 2 | 3 | import ( 4 | "bytes" 5 | "encoding/base64" 6 | "regexp" 7 | "strings" 8 | 9 | "github.com/blevesearch/bleve/analysis" 10 | regexp_tokenizer "github.com/blevesearch/bleve/analysis/tokenizer/regexp" 11 | "github.com/blevesearch/go-porterstemmer" 12 | ) 13 | 14 | const ( 15 | tokenSeparator = "_" 16 | ) 17 | 18 | type ngram struct { 19 | Tokens [][]byte 20 | } 21 | 22 | // encodes in base64 for safe comparison 23 | func (ng *ngram) String() string { 24 | encoded := make([]string, len(ng.Tokens)) 25 | 26 | for i, token := range ng.Tokens { 27 | 
encoded[i] = base64.StdEncoding.EncodeToString(token) // base64 keeps tokens containing the separator unambiguous 28 | //encoded[i] = string(token) // raw variant: human-readable, but not decodable by decodeNGram 29 | } 30 | 31 | return strings.Join(encoded, tokenSeparator) 32 | } 33 | // decodeNGram inverts ngram.String by base64-decoding each token. 34 | func decodeNGram(s string) (*ngram, error) { 35 | encodedTokens := strings.Split(s, tokenSeparator) 36 | 37 | tokens := make([][]byte, len(encodedTokens)) 38 | 39 | var err error 40 | for i, encodedToken := range encodedTokens { 41 | tokens[i], err = base64.StdEncoding.DecodeString(encodedToken) 42 | if err != nil { 43 | return nil, err 44 | } 45 | } 46 | return &ngram{tokens}, nil 47 | } 48 | 49 | type tokenizerConf struct { 50 | regexp *regexp.Regexp 51 | NGramSize int64 52 | } 53 | 54 | type tokenizer struct { 55 | regexp_tokenizer.RegexpTokenizer 56 | Conf *tokenizerConf 57 | } 58 | 59 | func validateConf(tc *tokenizerConf) { 60 | tc.regexp = regexp.MustCompile(`[0-9a-zA-Z_'\-]+|\%|\$`) 61 | 62 | // TODO: We force NGramSize = 1 so as to create disjoint ngrams, 63 | // which is necessary for the naive assumption of conditional 64 | // independence among tokens. It would be great to allow ngrams 65 | // to be greater than 1 and select only disjoint ngrams from the 66 | // tokenizer. 67 | tc.NGramSize = 1 68 | } 69 | 70 | func newTokenizer(tc *tokenizerConf) (*tokenizer, error) { 71 | validateConf(tc) 72 | 73 | return &tokenizer{*regexp_tokenizer.NewRegexpTokenizer(tc.regexp), tc}, nil 74 | } 75 | 76 | // Tokenize and Gramify 77 | func (t *tokenizer) Parse(doc string) []ngram { 78 | // maybe use token types for datetimes or something instead of 79 | // the actual byte slice 80 | alltokens := t.Tokenize([]byte(strings.ToLower(doc))) 81 | filtered := make(map[int][]byte) // original token index -> stemmed token; missing indices mark stopwords 82 | for i, token := range alltokens { 83 | exclude := false 84 | for _, stop := range stopbytes { 85 | if bytes.Equal(token.Term, stop) { 86 | exclude = true 87 | break 88 | } 89 | } 90 | 91 | if exclude { 92 | continue 93 | } 94 | 95 | tokenString := porterstemmer.StemString(string(token.Term)) 96 | //tokenBytes := porterstemmer.Stem(token.Term) // takes runes, not bytes 97 | 98 | if token.Type == analysis.Numeric { 99 | tokenString = "NUMBER" 100 | } else if token.Type == analysis.DateTime { 101 | tokenString = "DATE" 102 | } 103 | 104 | filtered[i] = []byte(tokenString) 105 | } 106 | 107 | // only consider sequential terms as candidates for ngrams; 108 | // terms separated by stopwords are ineligible 109 | allNGrams := make([]ngram, 0, 100) 110 | currentTokens := make([][]byte, 0, 100) 111 | // walk indices in order: ranging over the filtered map directly would visit tokens in random order and break the adjacency check below 112 | lastObserved := -1 113 | for i := 0; i < len(alltokens); i++ { 114 | token, ok := filtered[i] 115 | if !ok { 116 | continue 117 | } 118 | if (i - 1) != lastObserved { 119 | ngrams := t.tokensToNGrams(currentTokens) 120 | allNGrams = append(allNGrams, ngrams...) 121 | currentTokens = make([][]byte, 0, 100) 122 | } 123 | currentTokens = append(currentTokens, token) 124 | lastObserved = i 125 | } 126 | // bring in the last one 127 | if len(currentTokens) > 0 { 128 | ngrams := t.tokensToNGrams(currentTokens) 129 | allNGrams = append(allNGrams, ngrams...)
130 | } 131 | 132 | return allNGrams 133 | } 134 | 135 | func (t *tokenizer) tokensToNGrams(tokens [][]byte) []ngram { 136 | nTokens := int64(len(tokens)) 137 | 138 | nNGrams := int64(0) 139 | for i := int64(1); i <= t.Conf.NGramSize; i++ { 140 | chosen := choose(nTokens, i) 141 | nNGrams += chosen 142 | } 143 | 144 | ngrams := make([]ngram, 0, nNGrams) 145 | for ngramSize := int64(1); ngramSize <= t.Conf.NGramSize; ngramSize++ { 146 | nNGramsOfSize := choose(nTokens, ngramSize) 147 | 148 | for i := int64(0); i < nNGramsOfSize; i++ { 149 | ngrams = append(ngrams, ngram{tokens[i:(i + ngramSize)]}) 150 | } 151 | } 152 | 153 | return ngrams 154 | } 155 | 156 | // not a binomial coefficient -- combinations must be sequential 157 | func choose(n, k int64) int64 { 158 | return max(n-k+int64(1), 0) 159 | } 160 | 161 | func max(x, y int64) int64 { 162 | if x > y { 163 | return x 164 | } 165 | return y 166 | } 167 | --------------------------------------------------------------------------------