├── .travis.yml ├── doc.go ├── tokenize_test.go ├── bayes_test.go ├── sparse_test.go ├── encoding_test.go ├── testutil.go ├── example_test.go ├── encoding.go ├── sparse.go ├── README.md ├── bayes.go ├── stopbytes.go └── tokenize.go /.travis.yml: -------------------------------------------------------------------------------- 1 | language: go 2 | script: go test -race -cpu 1,2,4 -v ./... -------------------------------------------------------------------------------- /doc.go: -------------------------------------------------------------------------------- 1 | // Multiclass naive Bayesian document classification. 2 | // 3 | // While multinomial Bayesian classification offers 4 | // one-of-many classification, multibayes offers tools 5 | // for many-of-many classification. The multibayes 6 | // library strives to offer efficient storage and 7 | // calculation of multiple Bayesian posterior classification 8 | // probabilities. 9 | package multibayes 10 | -------------------------------------------------------------------------------- /tokenize_test.go: -------------------------------------------------------------------------------- 1 | package multibayes 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/bmizerany/assert" 7 | ) 8 | 9 | func TestTokenizer(t *testing.T) { 10 | testdata := getTestData() 11 | 12 | tokenize, err := newTokenizer(&tokenizerConf{ 13 | NGramSize: 1, 14 | }) 15 | 16 | assert.Equalf(t, nil, err, "Error creating tokenizer: %v", err) 17 | 18 | for _, doc := range testdata { 19 | _ = tokenize.Parse(doc.Text) 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /bayes_test.go: -------------------------------------------------------------------------------- 1 | package multibayes 2 | 3 | import ( 4 | "github.com/bmizerany/assert" 5 | "testing" 6 | ) 7 | 8 | func TestPosterior(t *testing.T) { 9 | classifier := NewClassifier() 10 | classifier.MinClassSize = 0 11 | classifier.trainWithTestData() 12 | 13 | probs := classifier.Posterior("Aaron's dog has tons of fleas") 14 | 15 | assert.Equalf(t, len(classifier.Matrix.Classes), len(probs), "Posterior returned incorrect number of classes") 16 | } 17 | -------------------------------------------------------------------------------- /sparse_test.go: -------------------------------------------------------------------------------- 1 | package multibayes 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/bmizerany/assert" 7 | ) 8 | 9 | func TestSparseMatrix(t *testing.T) { 10 | testdata := getTestData() 11 | tokenizer, err := newTokenizer(&tokenizerConf{ 12 | NGramSize: 1, 13 | }) 14 | assert.Equalf(t, err, nil, "Error creating new tokenizer") 15 | 16 | sparse := newSparseMatrix() 17 | 18 | for _, document := range testdata { 19 | ngrams := tokenizer.Parse(document.Text) 20 | sparse.Add(ngrams, document.Classes) 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /encoding_test.go: -------------------------------------------------------------------------------- 1 | package multibayes 2 | 3 | import ( 4 | "github.com/bmizerany/assert" 5 | "testing" 6 | ) 7 | 8 | func TestClassifierJSON(t *testing.T) { 9 | classifier := NewClassifier() 10 | classifier.trainWithTestData() 11 | 12 | b, err := classifier.MarshalJSON() 13 | assert.Equalf(t, nil, err, "Error marshaling JSON: %v\n", err) 14 | 15 | newclass, err := NewClassifierFromJSON(b) 16 | assert.Equalf(t, nil, err, "Error unmarshaling JSON: %v\n", err) 17 | 18 | assert.Equalf(t, 5, 
len(newclass.Matrix.Tokens), "Incorrect token length") 19 | assert.Equalf(t, 2, len(newclass.Matrix.Classes), "Incorrect class length") 20 | } 21 | -------------------------------------------------------------------------------- /testutil.go: -------------------------------------------------------------------------------- 1 | package multibayes 2 | 3 | type document struct { 4 | Text string 5 | Classes []string 6 | } 7 | 8 | func getTestData() []document { 9 | 10 | documents := []document{ 11 | { 12 | Text: "My dog has fleas.", 13 | Classes: []string{"vet"}, 14 | }, 15 | { 16 | Text: "My cat has ebola.", 17 | Classes: []string{"vet", "cdc"}, 18 | }, 19 | { 20 | Text: "Aaron has ebola.", 21 | Classes: []string{"cdc"}, 22 | }, 23 | } 24 | 25 | return documents 26 | } 27 | 28 | func (c *Classifier) trainWithTestData() { 29 | testdata := getTestData() 30 | for _, document := range testdata { 31 | c.Add(document.Text, document.Classes) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /example_test.go: -------------------------------------------------------------------------------- 1 | package multibayes 2 | 3 | import ( 4 | "fmt" 5 | ) 6 | 7 | func Example() { 8 | documents := []struct { 9 | Text string 10 | Classes []string 11 | }{ 12 | { 13 | Text: "My dog has fleas.", 14 | Classes: []string{"vet"}, 15 | }, 16 | { 17 | Text: "My cat has ebola.", 18 | Classes: []string{"vet", "cdc"}, 19 | }, 20 | { 21 | Text: "Aaron has ebola.", 22 | Classes: []string{"cdc"}, 23 | }, 24 | } 25 | 26 | classifier := NewClassifier() 27 | classifier.MinClassSize = 0 28 | 29 | // train the classifier 30 | for _, document := range documents { 31 | classifier.Add(document.Text, document.Classes) 32 | } 33 | 34 | // predict new classes 35 | probs := classifier.Posterior("Aaron's dog has fleas.") 36 | fmt.Printf("Posterior Probabilities: vet: %.4f, cdc: %.4f\n", probs["vet"], probs["cdc"]) 37 | 38 | // Output: Posterior Probabilities: vet: 0.8571, cdc: 0.2727 39 | } 40 | -------------------------------------------------------------------------------- /encoding.go: -------------------------------------------------------------------------------- 1 | package multibayes 2 | 3 | import ( 4 | "encoding/json" 5 | "io/ioutil" 6 | ) 7 | 8 | type jsonableClassifier struct { 9 | Matrix *sparseMatrix `json:"matrix"` 10 | } 11 | 12 | func (c *Classifier) MarshalJSON() ([]byte, error) { 13 | return json.Marshal(&jsonableClassifier{c.Matrix}) 14 | } 15 | 16 | func (c *Classifier) UnmarshalJSON(buf []byte) error { 17 | j := jsonableClassifier{} 18 | 19 | err := json.Unmarshal(buf, &j) 20 | if err != nil { 21 | return err 22 | } 23 | 24 | *c = *NewClassifier() 25 | c.Matrix = j.Matrix 26 | 27 | return nil 28 | } 29 | 30 | // NewClassifierFromJSON initializes a new classifier from a JSON byte slice.
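//
// An illustrative round-trip using only this package's API -- the names
// below (b, restored) are placeholders, not part of the library:
//
//	b, err := classifier.MarshalJSON()
//	if err != nil {
//		return err
//	}
//	restored, err := NewClassifierFromJSON(b)
//	if err != nil {
//		return err
//	}
//	_ = restored // an equivalent, ready-to-use classifier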
31 | func NewClassifierFromJSON(buf []byte) (*Classifier, error) { 32 | classifier := &Classifier{} 33 | 34 | err := classifier.UnmarshalJSON(buf) 35 | if err != nil { 36 | return nil, err 37 | } 38 | 39 | return classifier, nil 40 | } 41 | // LoadClassifierFromFile initializes a new classifier from a JSON file on disk. 42 | func LoadClassifierFromFile(filename string) (*Classifier, error) { 43 | buf, err := ioutil.ReadFile(filename) 44 | if err != nil { 45 | return nil, err 46 | } 47 | 48 | return NewClassifierFromJSON(buf) 49 | } 50 | 51 | func (s *sparseColumn) MarshalJSON() ([]byte, error) { 52 | return json.Marshal(s.Data) 53 | } 54 | 55 | func (s *sparseColumn) UnmarshalJSON(buf []byte) error { 56 | var data []int 57 | 58 | err := json.Unmarshal(buf, &data) 59 | if err != nil { 60 | return err 61 | } 62 | 63 | s.Data = data 64 | 65 | return nil 66 | } 67 | -------------------------------------------------------------------------------- /sparse.go: -------------------------------------------------------------------------------- 1 | package multibayes 2 | 3 | type sparseMatrix struct { 4 | Tokens map[string]*sparseColumn `json:"tokens"` // maps each token to the indices of the rows (documents) that contain it 5 | Classes map[string]*sparseColumn `json:"classes"` // maps each class to the indices of the rows (documents) labeled with it 6 | N int `json:"n"` // number of rows currently in the matrix 7 | } 8 | 9 | type sparseColumn struct { 10 | Data []int `json:"data"` 11 | } 12 | 13 | func newSparseColumn() *sparseColumn { 14 | return &sparseColumn{ 15 | Data: make([]int, 0, 1000), 16 | } 17 | } 18 | 19 | func (s *sparseColumn) Add(index int) { 20 | s.Data = append(s.Data, index) 21 | } 22 | 23 | // Count returns the number of rows in which the column is set 24 | func (s *sparseColumn) Count() int { 25 | return len(s.Data) 26 | } 27 | 28 | // Expand converts the sparse column to a dense 0/1 vector of length n 29 | func (s *sparseColumn) Expand(n int) []float64 { 30 | expanded := make([]float64, n) 31 | for _, index := range s.Data { 32 | expanded[index] = 1.0 33 | } 34 | return expanded 35 | } 36 | 37 | func newSparseMatrix() *sparseMatrix { 38 | return &sparseMatrix{ 39 | Tokens: make(map[string]*sparseColumn), 40 | Classes: make(map[string]*sparseColumn), 41 | N: 0, 42 | } 43 | } 44 | 45 | func (s *sparseMatrix) Add(ngrams []ngram, classes []string) { 46 | if len(ngrams) == 0 || len(classes) == 0 { 47 | return 48 | } 49 | for _, class := range classes { 50 | if _, ok := s.Classes[class]; !ok { 51 | s.Classes[class] = newSparseColumn() 52 | } 53 | 54 | s.Classes[class].Add(s.N) 55 | } 56 | 57 | // add ngrams uniquely 58 | added := make(map[string]int) 59 | for _, ngram := range ngrams { 60 | gramString := ngram.String() 61 | if _, ok := s.Tokens[gramString]; !ok { 62 | s.Tokens[gramString] = newSparseColumn() 63 | } 64 | 65 | // only add the document index once for the ngram 66 | if _, ok := added[gramString]; !ok { 67 | added[gramString] = 1 68 | s.Tokens[gramString].Add(s.N) 69 | } 70 | } 71 | // increment the row counter 72 | s.N++ 73 | } 74 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Multibayes 2 | ========== 3 | 4 | [![Build Status](https://travis-ci.org/lytics/multibayes.svg?branch=master)](https://travis-ci.org/lytics/multibayes) [![GoDoc](https://godoc.org/github.com/lytics/multibayes?status.svg)](https://godoc.org/github.com/lytics/multibayes) 5 | 6 | Multiclass naive Bayesian document classification. 
7 | 8 | Often in document classification, a document may have more than one relevant classification -- a question on [stackoverflow](http://stackoverflow.com) might have tags "go", "map", and "interface". 9 | 10 | While multinomial Bayesian classification offers one-of-many classification, multibayes offers tools for many-of-many classification. The multibayes library strives to offer efficient storage and calculation of multiple Bayesian posterior classification probabilities. 11 | 12 | ## Usage 13 | 14 | A new classifier is created with the `NewClassifier` function and trained by adding documents and their classes via the `Add` method: 15 | 16 | ```go 17 | classifier.Add("A new document", []string{"class1", "class2"}) 18 | ``` 19 | 20 | Posterior probabilities for a new document are calculated by calling the `Posterior` method: 21 | 22 | ```go 23 | classifier.Posterior("Another new document") 24 | ``` 25 | 26 | A posterior probability is returned for each class observed in the training set. The user can then assign classes according to their own heuristics -- for example, by taking every class whose posterior probability exceeds 0.8. 27 | 28 | 29 | ## Example 30 | 31 | ```go 32 | documents := []struct { 33 | Text string 34 | Classes []string 35 | }{ 36 | { 37 | Text: "My dog has fleas.", 38 | Classes: []string{"vet"}, 39 | }, 40 | { 41 | Text: "My cat has ebola.", 42 | Classes: []string{"vet", "cdc"}, 43 | }, 44 | { 45 | Text: "Aaron has ebola.", 46 | Classes: []string{"cdc"}, 47 | }, 48 | } 49 | 50 | classifier := NewClassifier() 51 | classifier.MinClassSize = 0 52 | 53 | // train the classifier 54 | for _, document := range documents { 55 | classifier.Add(document.Text, document.Classes) 56 | } 57 | 58 | // predict new classes 59 | probs := classifier.Posterior("Aaron's dog has fleas.") 60 | fmt.Printf("Posterior Probabilities: vet: %.4f, cdc: %.4f\n", probs["vet"], probs["cdc"]) 61 | 62 | // Posterior Probabilities: vet: 0.8571, cdc: 0.2727 63 | ``` 64 | -------------------------------------------------------------------------------- /bayes.go: -------------------------------------------------------------------------------- 1 | package multibayes 2 | 3 | import ( 4 | "math" 5 | ) 6 | 7 | var ( 8 | smoother = 1 // Laplace smoothing constant 9 | defaultMinClassSize = 5 10 | ) 11 | 12 | type Classifier struct { 13 | Tokenizer *tokenizer `json:"-"` 14 | Matrix *sparseMatrix `json:"matrix"` 15 | MinClassSize int // classes with fewer training rows than this are skipped by Posterior 16 | } 17 | 18 | // NewClassifier creates a new multibayes classifier. 19 | func NewClassifier() *Classifier { 20 | tokenize, _ := newTokenizer(&tokenizerConf{ 21 | NGramSize: 1, 22 | }) 23 | 24 | sparse := newSparseMatrix() 25 | 26 | return &Classifier{ 27 | Tokenizer: tokenize, 28 | Matrix: sparse, 29 | MinClassSize: defaultMinClassSize, 30 | } 31 | } 32 | 33 | // Add trains the classifier on a new document and its classes. 34 | func (c *Classifier) Add(document string, classes []string) { 35 | ngrams := c.Tokenizer.Parse(document) 36 | c.Matrix.Add(ngrams, classes) 37 | } 38 | 39 | // Posterior calculates the posterior probability for a new document against 40 | // each class observed in the training set. 
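//
// Class assignment is left to the caller, who typically thresholds the
// returned map (see the README); the 0.8 cutoff and the names below are
// illustrative only:
//
//	probs := classifier.Posterior("Another new document")
//	var assigned []string
//	for class, p := range probs {
//		if p > 0.8 {
//			assigned = append(assigned, class)
//		}
//	}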
41 | func (c *Classifier) Posterior(document string) map[string]float64 { 42 | tokens := c.Tokenizer.Parse(document) 43 | predictions := make(map[string]float64) 44 | 45 | for class, classcolumn := range c.Matrix.Classes { 46 | if len(classcolumn.Data) < c.MinClassSize { 47 | continue 48 | } 49 | 50 | n := classcolumn.Count() 51 | smoothN := n + (smoother * 2) 52 | 53 | priors := []float64{ 54 | float64(n+smoother) / float64(c.Matrix.N+(smoother*2)), // P(C=Y) 55 | float64(c.Matrix.N-n+smoother) / float64(c.Matrix.N+(smoother*2)), // P(C=N) 56 | } 57 | 58 | loglikelihood := []float64{0.0, 0.0} // log-space accumulators for P(F|C=Y) and P(F|C=N) 59 | 60 | // check if each token is in our token sparse matrix 61 | for _, token := range tokens { 62 | if tokencolumn, ok := c.Matrix.Tokens[token.String()]; ok { 63 | // conditional probability the token occurs for the class 64 | joint := intersection(tokencolumn.Data, classcolumn.Data) 65 | conditional := float64(joint+smoother) / float64(smoothN) // P(F|C=Y) 66 | loglikelihood[0] += math.Log(conditional) 67 | 68 | // conditional probability the token occurs if the class doesn't apply 69 | not := len(tokencolumn.Data) - joint 70 | notconditional := float64(not+smoother) / float64(smoothN) // P(F|C=N) 71 | loglikelihood[1] += math.Log(notconditional) 72 | } 73 | } 74 | 75 | likelihood := []float64{ 76 | math.Exp(loglikelihood[0]), 77 | math.Exp(loglikelihood[1]), 78 | } 79 | 80 | prob := bayesRule(priors, likelihood) // P(C|F) 81 | predictions[class] = prob[0] 82 | } 83 | 84 | return predictions 85 | } 86 | 87 | func bayesRule(prior, likelihood []float64) []float64 { 88 | 89 | posterior := make([]float64, len(prior)) 90 | 91 | sum := 0.0 92 | for i := range prior { 93 | combined := prior[i] * likelihood[i] 94 | 95 | posterior[i] = combined 96 | sum += combined 97 | } 98 | 99 | // normalize so the posterior probabilities sum to one 100 | for i := range posterior { 101 | posterior[i] /= sum 102 | } 103 | 104 | return posterior 105 | } 106 | 107 | // intersection counts the elements that appear in both array1 and array2 108 | func intersection(array1, array2 []int) int { 109 | var count int 110 | for _, elem1 := range array1 { 111 | for _, elem2 := range array2 { 112 | if elem1 == elem2 { 113 | count++ 114 | break 115 | } 116 | } 117 | } 118 | return count 119 | } 120 | -------------------------------------------------------------------------------- /stopbytes.go: -------------------------------------------------------------------------------- 1 | package multibayes 2 | 3 | var ( 4 | stopbytes = [][]byte{ 5 | []byte(`i`), 6 | []byte(`me`), 7 | []byte(`my`), 8 | []byte(`myself`), 9 | []byte(`we`), 10 | []byte(`our`), 11 | []byte(`ours`), 12 | []byte(`ourselves`), 13 | []byte(`you`), 14 | []byte(`your`), 15 | []byte(`yours`), 16 | []byte(`yourself`), 17 | []byte(`yourselves`), 18 | []byte(`he`), 19 | []byte(`him`), 20 | []byte(`his`), 21 | []byte(`himself`), 22 | []byte(`she`), 23 | []byte(`her`), 24 | []byte(`hers`), 25 | []byte(`herself`), 26 | []byte(`it`), 27 | []byte(`its`), 28 | []byte(`itself`), 29 | []byte(`they`), 30 | []byte(`them`), 31 | []byte(`their`), 32 | []byte(`theirs`), 33 | []byte(`themselves`), 34 | []byte(`what`), 35 | []byte(`which`), 36 | []byte(`who`), 37 | []byte(`whom`), 38 | []byte(`this`), 39 | []byte(`that`), 40 | []byte(`these`), 41 | []byte(`those`), 42 | []byte(`am`), 43 | []byte(`is`), 44 | []byte(`are`), 45 | []byte(`was`), 46 | []byte(`were`), 47 | []byte(`be`), 48 | []byte(`been`), 49 | []byte(`being`), 50 | []byte(`have`), 51 | []byte(`has`), 52 | []byte(`had`), 53 | []byte(`having`), 54 | []byte(`do`), 55 | 
[]byte(`does`), 56 | []byte(`did`), 57 | []byte(`doing`), 58 | []byte(`would`), 59 | []byte(`should`), 60 | []byte(`could`), 61 | []byte(`ought`), 62 | []byte(`i'm`), 63 | []byte(`you're`), 64 | []byte(`he's`), 65 | []byte(`she's`), 66 | []byte(`it's`), 67 | []byte(`we're`), 68 | []byte(`they're`), 69 | []byte(`i've`), 70 | []byte(`you've`), 71 | []byte(`we've`), 72 | []byte(`they've`), 73 | []byte(`i'd`), 74 | []byte(`you'd`), 75 | []byte(`he'd`), 76 | []byte(`she'd`), 77 | []byte(`we'd`), 78 | []byte(`they'd`), 79 | []byte(`i'll`), 80 | []byte(`you'll`), 81 | []byte(`he'll`), 82 | []byte(`she'll`), 83 | []byte(`we'll`), 84 | []byte(`they'll`), 85 | []byte(`isn't`), 86 | []byte(`aren't`), 87 | []byte(`wasn't`), 88 | []byte(`weren't`), 89 | []byte(`hasn't`), 90 | []byte(`haven't`), 91 | []byte(`hadn't`), 92 | []byte(`doesn't`), 93 | []byte(`don't`), 94 | []byte(`didn't`), 95 | []byte(`won't`), 96 | []byte(`wouldn't`), 97 | []byte(`shan't`), 98 | []byte(`shouldn't`), 99 | []byte(`can't`), 100 | []byte(`cannot`), 101 | []byte(`couldn't`), 102 | []byte(`mustn't`), 103 | []byte(`let's`), 104 | []byte(`that's`), 105 | []byte(`who's`), 106 | []byte(`what's`), 107 | []byte(`here's`), 108 | []byte(`there's`), 109 | []byte(`when's`), 110 | []byte(`where's`), 111 | []byte(`why's`), 112 | []byte(`how's`), 113 | []byte(`a`), 114 | []byte(`an`), 115 | []byte(`the`), 116 | []byte(`and`), 117 | []byte(`but`), 118 | []byte(`if`), 119 | []byte(`or`), 120 | []byte(`because`), 121 | []byte(`as`), 122 | []byte(`until`), 123 | []byte(`while`), 124 | []byte(`of`), 125 | []byte(`at`), 126 | []byte(`by`), 127 | []byte(`for`), 128 | []byte(`with`), 129 | []byte(`about`), 130 | []byte(`against`), 131 | []byte(`between`), 132 | []byte(`into`), 133 | []byte(`through`), 134 | []byte(`during`), 135 | []byte(`before`), 136 | []byte(`after`), 137 | []byte(`above`), 138 | []byte(`below`), 139 | []byte(`to`), 140 | []byte(`from`), 141 | []byte(`up`), 142 | []byte(`down`), 143 | []byte(`in`), 144 | []byte(`out`), 145 | []byte(`on`), 146 | []byte(`off`), 147 | []byte(`over`), 148 | []byte(`under`), 149 | []byte(`again`), 150 | []byte(`further`), 151 | []byte(`then`), 152 | []byte(`once`), 153 | []byte(`here`), 154 | []byte(`there`), 155 | []byte(`when`), 156 | []byte(`where`), 157 | []byte(`why`), 158 | []byte(`how`), 159 | []byte(`all`), 160 | []byte(`any`), 161 | []byte(`both`), 162 | []byte(`each`), 163 | []byte(`few`), 164 | []byte(`more`), 165 | []byte(`most`), 166 | []byte(`other`), 167 | []byte(`some`), 168 | []byte(`such`), 169 | []byte(`no`), 170 | []byte(`nor`), 171 | []byte(`not`), 172 | []byte(`only`), 173 | []byte(`own`), 174 | []byte(`same`), 175 | []byte(`so`), 176 | []byte(`than`), 177 | []byte(`too`), 178 | []byte(`very`), 179 | []byte(`-`), 180 | } 181 | ) 182 | -------------------------------------------------------------------------------- /tokenize.go: -------------------------------------------------------------------------------- 1 | package multibayes 2 | 3 | import ( 4 | "bytes" 5 | "encoding/base64" 6 | "regexp" 7 | "strings" 8 | 9 | "github.com/blevesearch/bleve/analysis" 10 | regexp_tokenizer "github.com/blevesearch/bleve/analysis/tokenizer/regexp" 11 | "github.com/blevesearch/go-porterstemmer" 12 | ) 13 | 14 | const ( 15 | tokenSeparator = "_" 16 | ) 17 | 18 | type ngram struct { 19 | Tokens [][]byte 20 | } 21 | 22 | // encodes in base64 for safe comparison 23 | func (ng *ngram) String() string { 24 | encoded := make([]string, len(ng.Tokens)) 25 | 26 | for i, token := range ng.Tokens { 27 | 
encoded[i] = base64.StdEncoding.EncodeToString(token) // base64 keeps tokens containing the separator unambiguous 28 | //encoded[i] = string(token) // raw variant: human-readable, but not decodable by decodeNGram 29 | } 30 | 31 | return strings.Join(encoded, tokenSeparator) 32 | } 33 | // decodeNGram inverts ngram.String by base64-decoding each token. 34 | func decodeNGram(s string) (*ngram, error) { 35 | encodedTokens := strings.Split(s, tokenSeparator) 36 | 37 | tokens := make([][]byte, len(encodedTokens)) 38 | 39 | var err error 40 | for i, encodedToken := range encodedTokens { 41 | tokens[i], err = base64.StdEncoding.DecodeString(encodedToken) 42 | if err != nil { 43 | return nil, err 44 | } 45 | } 46 | return &ngram{tokens}, nil 47 | } 48 | 49 | type tokenizerConf struct { 50 | regexp *regexp.Regexp 51 | NGramSize int64 52 | } 53 | 54 | type tokenizer struct { 55 | regexp_tokenizer.RegexpTokenizer 56 | Conf *tokenizerConf 57 | } 58 | 59 | func validateConf(tc *tokenizerConf) { 60 | tc.regexp = regexp.MustCompile(`[0-9a-zA-Z_'\-]+|\%|\$`) 61 | 62 | // TODO: We force NGramSize = 1 so as to create disjoint ngrams, 63 | // which is necessary for the naive assumption of conditional 64 | // independence among tokens. It would be great to allow ngrams 65 | // to be greater than 1 and select only disjoint ngrams from the 66 | // tokenizer. 67 | tc.NGramSize = 1 68 | } 69 | 70 | func newTokenizer(tc *tokenizerConf) (*tokenizer, error) { 71 | validateConf(tc) 72 | 73 | return &tokenizer{*regexp_tokenizer.NewRegexpTokenizer(tc.regexp), tc}, nil 74 | } 75 | 76 | // Tokenize and Gramify 77 | func (t *tokenizer) Parse(doc string) []ngram { 78 | // maybe use token types for datetimes or something instead of 79 | // the actual byte slice 80 | alltokens := t.Tokenize([]byte(strings.ToLower(doc))) 81 | filtered := make(map[int][]byte) // original token index -> stemmed token; missing indices mark stopwords 82 | for i, token := range alltokens { 83 | exclude := false 84 | for _, stop := range stopbytes { 85 | if bytes.Equal(token.Term, stop) { 86 | exclude = true 87 | break 88 | } 89 | } 90 | 91 | if exclude { 92 | continue 93 | } 94 | 95 | tokenString := porterstemmer.StemString(string(token.Term)) 96 | //tokenBytes := porterstemmer.Stem(token.Term) // takes runes, not bytes 97 | 98 | if token.Type == analysis.Numeric { 99 | tokenString = "NUMBER" 100 | } else if token.Type == analysis.DateTime { 101 | tokenString = "DATE" 102 | } 103 | 104 | filtered[i] = []byte(tokenString) 105 | } 106 | 107 | // only consider sequential terms as candidates for ngrams; 108 | // terms separated by stopwords are ineligible 109 | allNGrams := make([]ngram, 0, 100) 110 | currentTokens := make([][]byte, 0, 100) 111 | // walk indices in order: ranging over the filtered map directly would visit tokens in random order and break the adjacency check below 112 | lastObserved := -1 113 | for i := 0; i < len(alltokens); i++ { 114 | token, ok := filtered[i] 115 | if !ok { 116 | continue 117 | } 118 | if (i - 1) != lastObserved { 119 | ngrams := t.tokensToNGrams(currentTokens) 120 | allNGrams = append(allNGrams, ngrams...) 121 | currentTokens = make([][]byte, 0, 100) 122 | } 123 | currentTokens = append(currentTokens, token) 124 | lastObserved = i 125 | } 126 | // bring in the last one 127 | if len(currentTokens) > 0 { 128 | ngrams := t.tokensToNGrams(currentTokens) 129 | allNGrams = append(allNGrams, ngrams...)
130 | } 131 | 132 | return allNGrams 133 | } 134 | 135 | func (t *tokenizer) tokensToNGrams(tokens [][]byte) []ngram { 136 | nTokens := int64(len(tokens)) 137 | 138 | nNGrams := int64(0) 139 | for i := int64(1); i <= t.Conf.NGramSize; i++ { 140 | chosen := choose(nTokens, i) 141 | nNGrams += chosen 142 | } 143 | 144 | ngrams := make([]ngram, 0, nNGrams) 145 | for ngramSize := int64(1); ngramSize <= t.Conf.NGramSize; ngramSize++ { 146 | nNGramsOfSize := choose(nTokens, ngramSize) 147 | 148 | for i := int64(0); i < nNGramsOfSize; i++ { 149 | ngrams = append(ngrams, ngram{tokens[i:(i + ngramSize)]}) 150 | } 151 | } 152 | 153 | return ngrams 154 | } 155 | 156 | // not a binomial coefficient -- combinations must be sequential 157 | func choose(n, k int64) int64 { 158 | return max(n-k+int64(1), 0) 159 | } 160 | 161 | func max(x, y int64) int64 { 162 | if x > y { 163 | return x 164 | } 165 | return y 166 | } 167 | --------------------------------------------------------------------------------