├── .github └── workflows │ └── build.yaml ├── .gitignore ├── LICENSE ├── README.md ├── VERSION ├── classifier.go ├── classifier_test.go ├── func.go ├── func_test.go ├── go.mod ├── go.sum ├── index ├── index.go └── index_test.go ├── knn ├── knn.go ├── knn_test.go ├── matrix.go ├── similarity.go ├── similarity_test.go ├── sort.go └── testdata │ ├── README.md │ ├── business │ ├── 1.txt │ ├── 2.txt │ ├── 3.txt │ ├── 4.txt │ ├── 5.txt │ └── 6.txt │ └── sports │ ├── 1.txt │ ├── 2.txt │ ├── 3.txt │ ├── 4.txt │ ├── 5.txt │ └── 6.txt ├── naive ├── naive.go └── naive_test.go ├── stopwords.go ├── stopwords_test.go ├── tokens.go ├── tokens_test.go └── weight.go /.github/workflows/build.yaml: -------------------------------------------------------------------------------- 1 | name: build pipeline 2 | on: 3 | push: 4 | branches: 5 | - master 6 | jobs: 7 | build: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@v3 11 | 12 | - name: Setup 13 | uses: actions/setup-go@v3 14 | with: 15 | go-version: 1.19.4 16 | 17 | - name: Deps 18 | run: | 19 | go version 20 | 21 | - name: Build 22 | run: go build -v 23 | 24 | - name: Lint 25 | run: | 26 | go vet . 27 | 28 | - name: Test 29 | run: go test -v -cover $(go list ./...) 30 | 31 | - name: Coverage 32 | run: | 33 | for pkg in $(go list ./...); do go test -v -coverprofile=coverage_tmp.txt -covermode=atomic $pkg || ERROR="Error testing $pkg"; tail -n +2 coverage_tmp.txt >> coverage.txt || die "Unable to append coverage for $pkg"; done 34 | bash <(curl -s https://codecov.io/bash) 35 | 36 | - name: Release 37 | env: 38 | GITHUB_TOKEN: ${{ github.token }} 39 | run: | 40 | VERSION=$(cat VERSION | grep "^version" | sed -e 's/version=//') 41 | go get github.com/aktau/github-release 42 | go install github.com/aktau/github-release 43 | $(go env GOPATH)/bin/github-release release --user n3integration --repo classifier --tag v$VERSION || echo "duplicate release" 44 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 2 | *.o 3 | *.a 4 | *.so 5 | 6 | # Folders 7 | _obj 8 | _test 9 | vendor/ 10 | 11 | # Architecture specific extensions/prefixes 12 | *.[568vq] 13 | [568vq].out 14 | 15 | *.cgo1.go 16 | *.cgo2.c 17 | _cgo_defun.c 18 | _cgo_gotypes.go 19 | _cgo_export.* 20 | 21 | _testmain.go 22 | 23 | *.exe 24 | *.test 25 | *.prof 26 | classifier 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # classifier 2 | General purpose text classifier (naïve bayes, k-nearest neighbors) 3 | 4 | [![codecov](https://codecov.io/gh/n3integration/classifier/branch/master/graph/badge.svg)](https://codecov.io/gh/n3integration/classifier) 5 | [![Go Report Card](https://goreportcard.com/badge/github.com/n3integration/classifier)](https://goreportcard.com/report/github.com/n3integration/classifier) 6 | [![Documentation](https://godoc.org/github.com/n3integration/classifier?status.svg)](http://godoc.org/github.com/n3integration/classifier) 7 | 8 | ## Installation 9 | 10 | ```bash 11 | go get github.com/n3integration/classifier 12 | ``` 13 | 14 | ## Usage 15 | 16 | ### Classification 17 | 18 | There are two methods of classifying text data: `io.Reader` or `string`. To classify strings, use the `TrainString` 19 | or `ClassifyString` functions. To classify larger sources, use the `Train` and `Classify` functions that 20 | take an `io.Reader` as input. 21 | 22 | ```go 23 | package main 24 | 25 | import ( 26 | "fmt" 27 | 28 | "github.com/n3integration/classifier/naive" 29 | ) 30 | 31 | func main() { 32 | classifier := naive.New() 33 | classifier.TrainString("The quick brown fox jumped over the lazy dog", "ham") 34 | classifier.TrainString("Earn a degree online", "ham") 35 | classifier.TrainString("Earn cash quick online", "spam") 36 | 37 | if classification, err := classifier.ClassifyString("Earn your masters degree online"); err == nil { 38 | fmt.Println("Classification => ", classification) // ham 39 | } else { 40 | fmt.Println("error: ", err) 41 | } 42 | } 43 | ``` 44 | 45 | ## Contributing 46 | 47 | - Fork the repository 48 | - Create a local feature branch 49 | - Run `gofmt` 50 | - Bump the `VERSION` file using [semantic versioning](https://semver.org/) 51 | - Submit a pull request 52 | 53 | ## License 54 | 55 | Copyright 2023 n3integration@gmail.com 56 | 57 | Licensed under the Apache License, Version 2.0 (the "License"); 58 | you may not use this file except in compliance with the License. 59 | You may obtain a copy of the License at 60 | 61 | http://www.apache.org/licenses/LICENSE-2.0 62 | 63 | Unless required by applicable law or agreed to in writing, software 64 | distributed under the License is distributed on an "AS IS" BASIS, 65 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 66 | See the License for the specific language governing permissions and 67 | limitations under the License. 
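As a complement to the naïve Bayes example above, the `knn` package exposes a k-nearest neighbors classifier with the same train/classify flow. The sketch below is illustrative only: the documents are reused from the Usage example, `k` is an arbitrary choice, and the remaining options are left at their defaults (binary term weights, cosine similarity).

```go
package main

import (
	"fmt"

	"github.com/n3integration/classifier/knn"
)

func main() {
	classifier := knn.New(knn.K(3))
	classifier.TrainString("The quick brown fox jumped over the lazy dog", "ham")
	classifier.TrainString("Earn a degree online", "ham")
	classifier.TrainString("Earn cash quick online", "spam")

	if classification, err := classifier.ClassifyString("Earn your masters degree online"); err == nil {
		fmt.Println("Classification => ", classification) // ham
	} else {
		fmt.Println("error: ", err)
	}
}
```

Additional options such as `knn.WeightScheme`, `knn.Similarity`, and `knn.Tokenizer` are exercised in `knn/knn_test.go`.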
68 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | version=0.5.0 2 | -------------------------------------------------------------------------------- /classifier.go: -------------------------------------------------------------------------------- 1 | package classifier 2 | 3 | import "io" 4 | 5 | // Classifier provides a simple interface for different text classifiers 6 | type Classifier interface { 7 | // Train allows clients to train the classifier 8 | Train(io.Reader, string) error 9 | // TrainString allows clients to train the classifier using a string 10 | TrainString(string, string) error 11 | // Classify performs a classification on the input corpus and assumes that 12 | // the underlying classifier has been trained. 13 | Classify(io.Reader) (string, error) 14 | // ClassifyString performs text classification using a string 15 | ClassifyString(string) (string, error) 16 | } 17 | 18 | // WordCounts extracts term frequencies from a text corpus 19 | func WordCounts(r io.Reader) (map[string]int, error) { 20 | instream := NewTokenizer().Tokenize(r) 21 | wc := make(map[string]int) 22 | for token := range instream { 23 | wc[token] = wc[token] + 1 24 | } 25 | return wc, nil 26 | } 27 | -------------------------------------------------------------------------------- /classifier_test.go: -------------------------------------------------------------------------------- 1 | package classifier 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestWordCounts(t *testing.T) { 8 | wc, err := WordCounts(toReader(text)) 9 | 10 | if err != nil { 11 | t.Error("failed to get word counts:", err) 12 | } 13 | 14 | if len(wc) != expected { 15 | t.Errorf("Expected %d; actual %d", expected, len(wc)) 16 | } 17 | 18 | for key, value := range wc { 19 | if value != 1 { 20 | t.Errorf("Incorrect term frequency for %s: %d", key, value) 21 | } 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /func.go: -------------------------------------------------------------------------------- 1 | package classifier 2 | 3 | const defaultBufferSize = 50 4 | 5 | // Predicate provides a predicate function 6 | type Predicate func(string) bool 7 | 8 | // Mapper provides a map function 9 | type Mapper func(string) string 10 | 11 | // Map applies f to each element of the supplied input channel 12 | func Map(vs chan string, f ...Mapper) chan string { 13 | stream := make(chan string, defaultBufferSize) 14 | 15 | go func() { 16 | for v := range vs { 17 | for _, fn := range f { 18 | v = fn(v) 19 | } 20 | stream <- v 21 | } 22 | close(stream) 23 | }() 24 | 25 | return stream 26 | } 27 | 28 | // Filter drops elements from the input channel that fail to satisfy every 29 | // supplied Predicate; only values accepted by all predicates are emitted 30 | // Filter is a Predicate aggregation 31 | func Filter(vs chan string, filters ...Predicate) chan string { 32 | stream := make(chan string, defaultBufferSize) 33 | apply := func(text string) bool { 34 | for _, f := range filters { 35 | if !f(text) { 36 | return false 37 | } 38 | } 39 | return true 40 | } 41 | 42 | go func() { 43 | for text := range vs { 44 | if apply(text) { 45 | stream <- text 46 | } 47 | } 48 | close(stream) 49 | }() 50 | 51 | return stream 52 | } -------------------------------------------------------------------------------- /func_test.go: -------------------------------------------------------------------------------- 1 | package classifier 2 | 3 | import
( 4 | "strings" 5 | "testing" 6 | ) 7 | 8 | var words = []string{ 9 | "hello", "world", 10 | } 11 | 12 | func streamWords() chan string { 13 | stream := make(chan string) 14 | go func() { 15 | for _, word := range words { 16 | stream <- word 17 | } 18 | close(stream) 19 | }() 20 | return stream 21 | } 22 | 23 | func TestMap(t *testing.T) { 24 | i := 0 25 | results := Map(streamWords(), strings.ToUpper) 26 | for word := range results { 27 | expected := strings.ToUpper(words[i]) 28 | if expected != word { 29 | t.Errorf("did not match expected result %v <> %v", expected, word) 30 | } 31 | i++ 32 | } 33 | } 34 | 35 | func TestFilter(t *testing.T) { 36 | results := Filter(streamWords(), func(s string) bool { 37 | return s != words[0] 38 | }) 39 | 40 | i := 0 41 | for word := range results { 42 | i++ 43 | if word != words[1] { 44 | t.Error("incorrect result:", word) 45 | } 46 | } 47 | if i != 1 { 48 | t.Error("incorrect number of results:", i) 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/n3integration/classifier 2 | 3 | go 1.19 4 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/n3integration/classifier/f8630b69279e8eae662ff56114a722497c4bd19d/go.sum -------------------------------------------------------------------------------- /index/index.go: -------------------------------------------------------------------------------- 1 | package index 2 | 3 | import ( 4 | "fmt" 5 | "sync" 6 | ) 7 | 8 | // TermIndex provides a term frequency index 9 | type TermIndex struct { 10 | index int 11 | terms map[string]*termRef 12 | sync.RWMutex 13 | } 14 | 15 | // NewTermIndex initializes an empty term frequency index 16 | func NewTermIndex(capacity int) *TermIndex { 17 | return &TermIndex{ 18 | terms: make(map[string]*termRef, capacity), 19 | } 20 | } 21 | 22 | // Add a term to the index 23 | func (i *TermIndex) Add(t string) int { 24 | i.Lock() 25 | defer i.Unlock() 26 | if _, ok := i.terms[t]; ok { 27 | i.terms[t].incr() 28 | return i.terms[t].index 29 | } 30 | i.terms[t] = &termRef{ 31 | 1, 32 | i.index, 33 | } 34 | i.index++ 35 | return i.terms[t].index 36 | } 37 | 38 | // IndexOf returns the index of the provided term, or -1 if not found 39 | func (i *TermIndex) IndexOf(term string) int { 40 | i.RLock() 41 | defer i.RUnlock() 42 | if t, ok := i.terms[term]; ok { 43 | return t.index 44 | } 45 | return -1 46 | } 47 | 48 | // Frequency returns the term frequency within the index 49 | func (i *TermIndex) Frequency(term string) float64 { 50 | i.RLock() 51 | defer i.RUnlock() 52 | if t, ok := i.terms[term]; ok { 53 | return t.freq 54 | } 55 | return 0 56 | } 57 | 58 | // Count returns the number of terms within the index 59 | func (i *TermIndex) Count() int { 60 | i.RLock() 61 | defer i.RUnlock() 62 | return len(i.terms) 63 | } 64 | 65 | func (i *TermIndex) String() string { 66 | i.RLock() 67 | defer i.RUnlock() 68 | return fmt.Sprintf("%v", i.terms) 69 | } 70 | 71 | // termRef provides a given term's frequency and ref index 72 | type termRef struct { 73 | freq float64 74 | index int 75 | } 76 | 77 | func (t *termRef) incr() float64 { 78 | t.freq++ 79 | return t.freq 80 | } 81 | 82 | func (t *termRef) String() string { 83 | return fmt.Sprintf("%v", t.freq) 84 | } 85 | 
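// Illustrative usage sketch: a minimal example of the exported TermIndex API
// above. The capacity and terms are arbitrary placeholders.
//
//	idx := NewTermIndex(4)
//	idx.Add("fox")           // first occurrence: ref index 0, frequency 1
//	idx.Add("fox")           // repeat occurrence: frequency becomes 2
//	idx.Add("dog")           // next distinct term: ref index 1
//	_ = idx.IndexOf("dog")   // 1
//	_ = idx.IndexOf("cat")   // -1 for unknown terms
//	_ = idx.Frequency("fox") // 2
//	_ = idx.Count()          // 2 distinct terms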
-------------------------------------------------------------------------------- /index/index_test.go: -------------------------------------------------------------------------------- 1 | package index 2 | 3 | import ( 4 | "strings" 5 | "testing" 6 | ) 7 | 8 | var ( 9 | text = "The quick brown fox jumped over the lazy dog" 10 | expected = 7 11 | ) 12 | 13 | func TestTermIndex(t *testing.T) { 14 | allTermsExpected := expected + 1 15 | index := NewTermIndex(allTermsExpected) 16 | for _, txt := range strings.Split(text, " ") { 17 | index.Add(strings.ToLower(txt)) 18 | } 19 | if index.Count() != allTermsExpected { 20 | t.Errorf("incorrect index size; expected %v, but got %v", expected, index.Count()) 21 | } 22 | for term := range index.terms { 23 | if index.Frequency(term) < 1 { 24 | t.Errorf("incorrect frequency; expected %v, but got %v", expected, index.Frequency(term)) 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /knn/knn.go: -------------------------------------------------------------------------------- 1 | package knn 2 | 3 | import ( 4 | "bytes" 5 | "errors" 6 | "fmt" 7 | "io" 8 | "math" 9 | "sort" 10 | "sync" 11 | 12 | "github.com/n3integration/classifier" 13 | "github.com/n3integration/classifier/index" 14 | ) 15 | 16 | const ( 17 | defaultKVal = 1 18 | defaultIndexCapacity = 10_000 19 | ) 20 | 21 | // Option provides a functional setting for the Classifier 22 | type Option func(c *Classifier) error 23 | 24 | // Classifier provides k-nearest neighbor classification 25 | type Classifier struct { 26 | mu sync.RWMutex 27 | 28 | k int 29 | categories []string 30 | index *index.TermIndex 31 | matrix *sparse 32 | similarity SimilarityScore 33 | tokenizer classifier.Tokenizer 34 | weightScheme classifier.WeightSchemeStrategy 35 | } 36 | 37 | // New initializes a new k-nearest neighbor classifier unless overridden, 38 | // binary term weights and k=1 will be used for the created instance 39 | func New(opts ...Option) *Classifier { 40 | c := &Classifier{ 41 | k: defaultKVal, 42 | categories: make([]string, 0), 43 | index: index.NewTermIndex(defaultIndexCapacity), 44 | matrix: newSparseMatrix(), 45 | similarity: CosineSimilarity, 46 | tokenizer: classifier.NewTokenizer(), 47 | weightScheme: classifier.Binary, 48 | } 49 | for _, opt := range opts { 50 | opt(c) 51 | } 52 | return c 53 | } 54 | 55 | // K provides the value of 'k' 56 | func K(k int) Option { 57 | return func(c *Classifier) error { 58 | if k < 1 { 59 | return errors.New("the value of k must be a positive integer") 60 | } 61 | c.k = k 62 | return nil 63 | } 64 | } 65 | 66 | // WeightScheme provides the term weight scheme 67 | func WeightScheme(s classifier.WeightSchemeStrategy) Option { 68 | return func(c *Classifier) error { 69 | c.weightScheme = s 70 | return nil 71 | } 72 | } 73 | 74 | // Similarity provides an alternate similarity scoring strategy 75 | func Similarity(s SimilarityScore) Option { 76 | return func(c *Classifier) error { 77 | c.similarity = s 78 | return nil 79 | } 80 | } 81 | 82 | // Tokenizer provides an alternate document Tokenizer 83 | func Tokenizer(t classifier.Tokenizer) Option { 84 | return func(c *Classifier) error { 85 | c.tokenizer = t 86 | return nil 87 | } 88 | } 89 | 90 | // TermIndex provides an alternate TermIndex 91 | func TermIndex(i *index.TermIndex) Option { 92 | return func(c *Classifier) error { 93 | c.index = i 94 | return nil 95 | } 96 | } 97 | 98 | func (c *Classifier) TrainString(doc string, category string) error { 99 | return 
c.Train(asReader(doc), category) 100 | } 101 | 102 | func (c *Classifier) Train(r io.Reader, category string) error { 103 | wordFreq := make(map[string]float64) 104 | for text := range c.tokenizer.Tokenize(r) { 105 | count := wordFreq[text] 106 | wordFreq[text] = count + 1 107 | 108 | if count == 0 { 109 | c.index.Add(text) 110 | } 111 | } 112 | 113 | c.mu.Lock() 114 | defer c.mu.Unlock() 115 | c.categories = append(c.categories, category) 116 | c.matrix.Add(c.index, c.weightScheme(wordFreq), wordFreq) 117 | return nil 118 | } 119 | 120 | func (c *Classifier) ClassifyString(doc string) (string, error) { 121 | return c.Classify(asReader(doc)) 122 | } 123 | 124 | func (c *Classifier) Classify(r io.Reader) (string, error) { 125 | wordFreq := make(map[string]float64) 126 | for text := range c.tokenizer.Tokenize(r) { 127 | count := wordFreq[text] 128 | wordFreq[text] = count + 1 129 | } 130 | 131 | c.mu.RLock() 132 | defer c.mu.RUnlock() 133 | this := c.matrix.MakeRow(c.index, c.weightScheme, wordFreq) 134 | next := c.matrix.Rows() 135 | results := make(topResults, 0) 136 | 137 | for row := next(); row != nil; row = next() { 138 | results = append(results, &topResult{ 139 | Score: c.similarity(row, this), 140 | Category: c.categories[row.Index()], 141 | }) 142 | } 143 | 144 | sort.Sort(results) 145 | return results.query(c.k), nil 146 | } 147 | 148 | type topResults []*topResult 149 | 150 | func (r topResults) Len() int { 151 | return len(r) 152 | } 153 | 154 | func (r topResults) Less(i, j int) bool { 155 | return r[i].Score < r[j].Score 156 | } 157 | 158 | func (r topResults) Swap(i, j int) { 159 | r[i], r[j] = r[j], r[i] 160 | } 161 | 162 | func (r topResults) topK(k int) map[string]int { 163 | count := 0 164 | topk := make(map[string]int) 165 | for i := 1; i <= k; i++ { 166 | count = topk[r[len(r)-i].Category] 167 | topk[r[len(r)-i].Category] = count + 1 168 | } 169 | return topk 170 | } 171 | 172 | func (r topResults) query(k int) string { 173 | max := 0 174 | var category string 175 | topk := r.topK(int(math.Min(float64(k), float64(len(r))))) 176 | 177 | for cat, count := range topk { 178 | if count > max { 179 | max = count 180 | category = cat 181 | } 182 | } 183 | 184 | return category 185 | } 186 | 187 | type topResult struct { 188 | Score float64 189 | Category string 190 | } 191 | 192 | func (t *topResult) String() string { 193 | return fmt.Sprintf("%.2f", t.Score) 194 | } 195 | 196 | func asReader(text string) io.Reader { 197 | return bytes.NewBufferString(text) 198 | } 199 | -------------------------------------------------------------------------------- /knn/knn_test.go: -------------------------------------------------------------------------------- 1 | package knn 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "os" 7 | "testing" 8 | 9 | "github.com/n3integration/classifier" 10 | ) 11 | 12 | func TestClassifier(t *testing.T) { 13 | knn := New( 14 | K(4), 15 | Similarity(EuclideanDistance), 16 | WeightScheme(classifier.TermFrequency), 17 | Tokenizer(classifier.NewTokenizer( 18 | classifier.Filters(classifier.IsNotStopWord, classifier.IsWord), 19 | classifier.SplitFunc(classifier.ScanAlphaWords), 20 | )), 21 | ) 22 | 23 | dataDir, err := os.ReadDir("testdata") 24 | if err != nil { 25 | log.Fatal(err) 26 | } 27 | 28 | for _, file := range dataDir { 29 | if file.IsDir() { 30 | dir := file 31 | files, rErr := os.ReadDir(fmt.Sprintf("testdata/%s", dir.Name())) 32 | if rErr != nil { 33 | log.Fatal(rErr) 34 | } 35 | for _, f := range files { 36 | if lErr := load(knn, dir.Name(), 
fmt.Sprintf("testdata/%s/%s", dir.Name(), f.Name())); lErr != nil { 37 | t.Fatal(lErr) 38 | } 39 | } 40 | } 41 | } 42 | 43 | testdata := []struct { 44 | Name string 45 | Headline string 46 | ExpectedCategory string 47 | }{ 48 | { 49 | Name: "Business Headline", 50 | Headline: `Small Businesses Keep Hiring as Fed Raises Rates to Cool Economy`, 51 | ExpectedCategory: "business", 52 | }, 53 | { 54 | Name: "Sports Headline", 55 | Headline: `How Eagles can win 2023 Super Bowl: Jalen Hurts, dominant offensive line pave the way for championship run`, 56 | ExpectedCategory: "sports", 57 | }, 58 | } 59 | 60 | for _, data := range testdata { 61 | category, err := knn.ClassifyString(data.Headline) 62 | if err != nil { 63 | t.Fatalf("failed to classify %s dataDir: %s", data.Name, err) 64 | } 65 | 66 | if category != data.ExpectedCategory { 67 | log.Println(knn.matrix) 68 | t.Fatalf("incorrectly classified %s; expected %s, but got %s", data.Name, data.ExpectedCategory, category) 69 | } 70 | } 71 | } 72 | 73 | func load(knn *Classifier, category, filename string) error { 74 | f, err := os.Open(filename) 75 | if err != nil { 76 | return fmt.Errorf("failed to load test data: %w", err) 77 | } 78 | defer f.Close() 79 | return knn.Train(f, category) 80 | } 81 | -------------------------------------------------------------------------------- /knn/matrix.go: -------------------------------------------------------------------------------- 1 | package knn 2 | 3 | import ( 4 | "fmt" 5 | "math" 6 | 7 | "github.com/n3integration/classifier" 8 | "github.com/n3integration/classifier/index" 9 | ) 10 | 11 | // sparse matrix implementation 12 | type sparse struct { 13 | ind []int 14 | val []float64 15 | ptr []int 16 | } 17 | 18 | // newSparseMatrix initializes an empty sparse matrix 19 | func newSparseMatrix() *sparse { 20 | return &sparse{ 21 | ind: make([]int, 0), 22 | val: make([]float64, 0), 23 | ptr: make([]int, 1), 24 | } 25 | } 26 | 27 | // Add a new row to the underlying matrix 28 | func (m *sparse) Add(index *index.TermIndex, weight classifier.WeightScheme, docWordFreq map[string]float64) { 29 | prev := len(m.ind) 30 | for term := range docWordFreq { 31 | m.ind = append(m.ind, index.IndexOf(term)) 32 | m.val = append(m.val, weight(term)) 33 | } 34 | 35 | cur := prev + len(docWordFreq) 36 | quickSort(m, prev, cur-1) 37 | m.ptr = append(m.ptr, cur) 38 | } 39 | 40 | // MakeRow creates and returns a new sparseRow without adding it to the underlying matrix 41 | func (m *sparse) MakeRow(index *index.TermIndex, weight classifier.WeightSchemeStrategy, wordFreq map[string]float64) *sparseRow { 42 | i := 0 43 | var idx int 44 | this := newSparseRow(len(wordFreq)) 45 | 46 | for term := range wordFreq { 47 | idx = index.IndexOf(term) 48 | if idx < 0 { 49 | idx = index.Add(term) 50 | } 51 | this.ind[i] = idx 52 | this.val[i] = weight(wordFreq)(term) 53 | i++ 54 | } 55 | 56 | quickSort(this, 0, len(wordFreq)-1) 57 | return this 58 | } 59 | 60 | // Rows returns an iterator over the matrix 61 | func (m *sparse) Rows() func() *sparseRow { 62 | i := 0 63 | r := &sparseRow{} 64 | 65 | return func() *sparseRow { 66 | if i == (len(m.ptr) - 1) { 67 | return nil 68 | } 69 | 70 | start := m.ptr[i] 71 | end := m.ptr[i+1] 72 | 73 | r.index = i 74 | r.ind = m.ind[start:end] 75 | r.val = m.val[start:end] 76 | i++ 77 | 78 | return r 79 | } 80 | } 81 | 82 | // Head returns the first 10 rows in the underlying matrix 83 | func (m *sparse) Head() []*sparseRow { 84 | iterator := m.Rows() 85 | count := int(math.Min(10, m.Size())) 86 | rows := 
make([]*sparseRow, count) 87 | 88 | for i := 0; i <= count; i++ { 89 | row := iterator() 90 | if row == nil { 91 | break 92 | } 93 | rows[i] = row 94 | } 95 | 96 | return rows 97 | } 98 | 99 | func (m *sparse) Shape() string { 100 | return fmt.Sprintf("%v x %v", len(m.ind), len(m.ptr)-1) 101 | } 102 | 103 | func (m *sparse) Size() float64 { 104 | return float64(len(m.ptr)) - 1 105 | } 106 | 107 | func (m *sparse) Partition(low int, high int) int { 108 | x := m.ind[high] 109 | i := low - 1 110 | 111 | for j := low; j <= high-1; j++ { 112 | if m.ind[j] <= x { 113 | i++ 114 | swap(&m.ind[i], &m.ind[j]) 115 | swap(&m.val[i], &m.val[j]) 116 | } 117 | } 118 | swap(&m.ind[i+1], &m.ind[high]) 119 | swap(&m.val[i+1], &m.val[high]) 120 | return i + 1 121 | } 122 | 123 | func (m *sparse) String() string { 124 | return fmt.Sprintf("%v\n%v\n%v", m.ind, m.val, m.ptr) 125 | } 126 | 127 | type sparseRow struct { 128 | ind []int 129 | val []float64 130 | index int 131 | } 132 | 133 | func newSparseRow(size int) *sparseRow { 134 | return &sparseRow{ 135 | ind: make([]int, size), 136 | val: make([]float64, size), 137 | } 138 | } 139 | 140 | // Column returns the feature and value at index i 141 | func (r *sparseRow) Column(i int) (int, float64) { 142 | return r.ind[i], r.val[i] 143 | } 144 | 145 | // Feature returns the feature at index i 146 | func (r *sparseRow) Feature(i int) int { 147 | return r.ind[i] 148 | } 149 | 150 | // Sum the sparseRow 151 | func (r *sparseRow) Sum() float64 { 152 | sum := 0.0 153 | for _, val := range r.val { 154 | sum += val 155 | } 156 | return sum 157 | } 158 | 159 | // Square the row 160 | func (r *sparseRow) Square() float64 { 161 | sum := 0.0 162 | for _, val := range r.val { 163 | sum += math.Pow(val, 2) 164 | } 165 | return sum 166 | } 167 | 168 | // L2Norm returns the euclidean distance 169 | func (r *sparseRow) L2Norm() float64 { 170 | return math.Sqrt(r.Square()) 171 | } 172 | 173 | // Dot returns the dot product 174 | func (r *sparseRow) Dot(other *sparseRow) float64 { 175 | sum := 0.0 176 | if r.Size() <= other.Size() { 177 | for i := 0; i < r.Len(); i++ { 178 | feature, val := r.Column(i) 179 | sum += val * other.Value(feature) 180 | } 181 | } else { 182 | for i := 0; i < other.Len(); i++ { 183 | feature, val := other.Column(i) 184 | sum += val * r.Value(feature) 185 | } 186 | } 187 | return sum 188 | } 189 | 190 | // Value returns the value of feature 191 | func (r *sparseRow) Value(feature int) float64 { 192 | i := search(r.ind, feature) 193 | if i >= 0 { 194 | return r.val[i] 195 | } 196 | return 0 197 | } 198 | 199 | // Values constructs a new sparse row from the provided features 200 | func (r *sparseRow) Values(features ...int) *sparseRow { 201 | other := newSparseRow(len(features)) 202 | for i := 0; i < len(features); i++ { 203 | other.ind[i] = features[i] 204 | other.val[i] = r.Value(features[i]) 205 | } 206 | return other 207 | } 208 | 209 | // Contains to check if row contains the provided feature 210 | func (r *sparseRow) Contains(feature int) bool { 211 | for _, val := range r.ind { 212 | if val == feature { 213 | return true 214 | } 215 | } 216 | return false 217 | } 218 | 219 | // Index returns the index pointer 220 | func (r *sparseRow) Index() int { 221 | return r.index 222 | } 223 | 224 | // Len returns the number of columns 225 | func (r *sparseRow) Len() int { 226 | return len(r.ind) 227 | } 228 | 229 | func (r *sparseRow) Less(i, j int) bool { 230 | return r.ind[i] < r.ind[j] 231 | } 232 | 233 | func (r *sparseRow) Swap(i, j int) { 234 | ind := 
r.ind[i] 235 | r.ind[i] = r.ind[j] 236 | r.ind[j] = ind 237 | 238 | val := r.val[i] 239 | r.val[i] = r.val[j] 240 | r.val[j] = val 241 | } 242 | 243 | func (r *sparseRow) Size() float64 { 244 | return float64(len(r.val)) 245 | } 246 | 247 | func (r *sparseRow) Partition(low int, high int) int { 248 | x := r.ind[high] 249 | i := low - 1 250 | 251 | for j := low; j <= high-1; j++ { 252 | if r.ind[j] <= x { 253 | i++ 254 | swap(&r.ind[i], &r.ind[j]) 255 | swap(&r.val[i], &r.val[j]) 256 | } 257 | } 258 | swap(&r.ind[i+1], &r.ind[high]) 259 | swap(&r.val[i+1], &r.val[high]) 260 | return i + 1 261 | } 262 | 263 | func (r *sparseRow) String() string { 264 | return fmt.Sprintf("%v\n%v", r.ind, r.val) 265 | } 266 | -------------------------------------------------------------------------------- /knn/similarity.go: -------------------------------------------------------------------------------- 1 | package knn 2 | 3 | import ( 4 | "math" 5 | ) 6 | 7 | // SimilarityScore provides pluggable support for row similarity 8 | type SimilarityScore func(left, right *sparseRow) float64 9 | 10 | // EuclideanDistance between rows 11 | func EuclideanDistance(left, right *sparseRow) float64 { 12 | distanceTo := func(left, right *sparseRow) float64 { 13 | score := 0.0 14 | terms := make(map[int]float64) 15 | for i := 0; i < left.Len(); i++ { 16 | term, val := left.Column(i) 17 | terms[term] = val 18 | score += math.Pow(val-right.Value(term), 2) 19 | } 20 | 21 | for i := 0; i < right.Len(); i++ { 22 | term, _ := right.Column(i) 23 | if _, ok := terms[term]; !ok { 24 | score += math.Pow(0-right.Value(term), 2) 25 | } 26 | } 27 | return 1 / (1 + math.Sqrt(score)) 28 | } 29 | 30 | if left.Len() >= right.Len() { 31 | return distanceTo(left, right) 32 | } 33 | return distanceTo(right, left) 34 | } 35 | 36 | // CosineSimilarity between rows 37 | func CosineSimilarity(left, right *sparseRow) float64 { 38 | return left.Dot(right) / (left.L2Norm() * right.L2Norm()) 39 | } 40 | 41 | // PearsonCorrelation between rows 42 | func PearsonCorrelation(left, right *sparseRow) float64 { 43 | score := func(left, right *sparseRow) float64 { 44 | n := left.Size() 45 | leftSum := left.Sum() 46 | rightSum := right.Sum() 47 | denom := math.Sqrt((left.Square() - math.Pow(leftSum, 2)/n) * (right.Square() - math.Pow(rightSum, 2)/n)) 48 | 49 | if denom == 0 { 50 | return 0 51 | } 52 | return (left.Dot(right) - ((leftSum * rightSum) / n)) / denom 53 | } 54 | 55 | similar := make([]int, 0) 56 | for i := 0; i < left.Len(); i++ { 57 | term, _ := left.Column(i) 58 | if right.Contains(term) { 59 | similar = append(similar, term) 60 | } 61 | } 62 | 63 | if len(similar) == 0 { 64 | return 0 65 | } 66 | return score(left.Values(similar...), right.Values(similar...)) 67 | } 68 | -------------------------------------------------------------------------------- /knn/similarity_test.go: -------------------------------------------------------------------------------- 1 | package knn 2 | 3 | import ( 4 | "math" 5 | "testing" 6 | ) 7 | 8 | func TestSimilarity(t *testing.T) { 9 | allowedVariance := .01 10 | row1 := newSparseRow(2) 11 | row1.ind = []int{0, 1} 12 | row1.val = []float64{2, -1} 13 | row2 := newSparseRow(2) 14 | row2.ind = []int{0, 1} 15 | row2.val = []float64{-2, 1} 16 | 17 | t.Run("Euclidean Distance", func(t *testing.T) { 18 | expected := 0.18 19 | actual := EuclideanDistance(row1, row2) 20 | assertEquivalent(t, expected, actual, allowedVariance) 21 | 22 | if actual := EuclideanDistance(row1, row1); actual != 1 { 23 | t.Fatalf("expected 
identical row to equal one. got %.2f", actual) 24 | } 25 | }) 26 | 27 | t.Run("Pearson Correlation", func(t *testing.T) { 28 | if actual := PearsonCorrelation(row1, row1); actual != 1 { 29 | t.Fatalf("expected strong positive correlation. got %.2f", actual) 30 | } 31 | 32 | if actual := PearsonCorrelation(row1, row2); actual != -1 { 33 | t.Fatalf("expected strong inverse correlation. got %.2f", actual) 34 | } 35 | 36 | row3 := newSparseRow(2) 37 | row3.ind = []int{2, 3} 38 | row3.val = []float64{4, 5} 39 | if actual := PearsonCorrelation(row1, row3); actual != 0 { 40 | t.Fatalf("expected dissimilar rows to equal zero; got %.2f", actual) 41 | } 42 | }) 43 | 44 | t.Run("Cosine Similarity", func(t *testing.T) { 45 | assertEquivalent(t, CosineSimilarity(row1, row1), 1.0, allowedVariance) 46 | assertEquivalent(t, CosineSimilarity(row1, row2), -1.0, allowedVariance) 47 | }) 48 | } 49 | 50 | func assertEquivalent(t *testing.T, actual, expected, threshold float64) { 51 | if math.Abs(actual-expected) > threshold { 52 | t.Fatalf("expected %.2f to be equivalent to %.2f within +/- %.2f", actual, expected, threshold) 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /knn/sort.go: -------------------------------------------------------------------------------- 1 | package knn 2 | 3 | // Partitioning indicates that a type can be partitioned and reordered 4 | type Partitioning interface { 5 | // Partition between low and high elements 6 | Partition(low, high int) int 7 | } 8 | 9 | func search(values []int, v int) int { 10 | low := 0 11 | high := len(values) - 1 12 | for low <= high { 13 | mid := (low + high) / 2 14 | if v == values[mid] { 15 | return mid 16 | } else if v > values[mid] { 17 | low = mid + 1 18 | } else { 19 | high = mid - 1 20 | } 21 | } 22 | return -1 23 | } 24 | 25 | func quickSort(m Partitioning, low int, high int) { 26 | stack := make(Stack, 0) 27 | 28 | stack.push(low) 29 | stack.push(high) 30 | for stack.len() > 0 { 31 | high = stack.pop() 32 | low = stack.pop() 33 | 34 | pivot := m.Partition(low, high) 35 | if pivot-1 > low { 36 | stack.push(low) 37 | stack.push(pivot - 1) 38 | } 39 | 40 | if pivot+1 < high { 41 | stack.push(pivot + 1) 42 | stack.push(high) 43 | } 44 | } 45 | } 46 | 47 | func swap[V int | float64](a, b *V) { 48 | t := *a 49 | *a = *b 50 | *b = t 51 | } 52 | 53 | type Stack []int 54 | 55 | func (s *Stack) push(v int) { 56 | *s = append(*s, v) 57 | } 58 | 59 | func (s *Stack) pop() int { 60 | v := (*s)[len(*s)-1] 61 | (*s)[len(*s)-1] = 0 62 | *s = (*s)[:len(*s)-1] 63 | return v 64 | } 65 | 66 | func (s *Stack) len() int { 67 | return len(*s) 68 | } 69 | -------------------------------------------------------------------------------- /knn/testdata/README.md: -------------------------------------------------------------------------------- 1 | ## Test Data Set 2 | 3 | Data extracted from [Kaggle](https://www.kaggle.com/datasets/jensenbaxter/10dataset-text-document-classification) 4 | -------------------------------------------------------------------------------- /knn/testdata/business/1.txt: -------------------------------------------------------------------------------- 1 | Winn-Dixie files for bankruptcy 2 | 3 | US supermarket group Winn-Dixie has filed for bankruptcy protection after succumbing to stiff competition in a market dominated by Wal-Mart. 4 | 5 | Winn-Dixie, once among the most profitable of US grocers, said Chapter 11 protection would enable it to successfully restructure.
It said its 920 stores would remain open, but analysts said it would most likely off-load a number of sites. The Jacksonville, Florida-based firm has total debts of $1.87bn (£980m). In its bankruptcy petition it listed its biggest creditor as US foods giant Kraft Foods, which it owes $15.1m. 6 | 7 | Analysts say Winn-Dixie had not kept up with consumers' demands and had also been burdened by a number of stores in need of upgrading. A 10-month restructuring plan was deemed a failure, and following a larger-than-expected quarterly loss earlier this month, Winn-Dixie's slide into bankruptcy was widely expected. The company's new chief executive Peter Lynch said Winn-Dixie would use the Chapter 11 breathing space to take the necessary action to turn itself around. "This includes achieving significant cost reductions, improving the merchandising and customer service in all locations and generating a sense of excitement in the stores," he said. Yet Evan Mann, a senior bond analyst at Gimme Credit, said Mr Lynch's job would not be easy, as the bankruptcy would inevitably put off some customers. "The real big issue is what's going to happen over the next one or two quarters now that they are in bankruptcy and all their customers see this in their local newspapers," he said. 8 | -------------------------------------------------------------------------------- /knn/testdata/business/2.txt: -------------------------------------------------------------------------------- 1 | Japanese growth grinds to a halt 2 | 3 | Growth in Japan evaporated in the three months to September, sparking renewed concern about an economy not long out of a decade-long trough. 4 | 5 | Output in the period grew just 0.1%, an annual rate of 0.3%. Exports - the usual engine of recovery - faltered, while domestic demand stayed subdued and corporate investment also fell short. The growth falls well short of expectations, but does mark a sixth straight quarter of expansion. 6 | 7 | The economy had stagnated throughout the 1990s, experiencing only brief spurts of expansion amid long periods in the doldrums. One result was deflation - prices falling rather than rising - which made Japanese shoppers cautious and kept them from spending. 8 | 9 | The effect was to leave the economy more dependent than ever on exports for its recent recovery. But high oil prices have knocked 0.2% off the growth rate, while the falling dollar means products shipped to the US are becoming relatively more expensive. 10 | 11 | The performance for the third quarter marks a sharp downturn from earlier in the year. The first quarter showed annual growth of 6.3%, with the second showing 1.1%, and economists had been predicting as much as 2% this time around. "Exports slowed while capital spending became weaker," said Hiromichi Shirakawa, chief economist at UBS Securities in Tokyo. "Personal consumption looks good, but it was mainly due to temporary factors such as the Olympics. "The amber light is flashing." The government may now find it more difficult to raise taxes, a policy it will have to implement when the economy picks up to help deal with Japan's massive public debt. 12 | -------------------------------------------------------------------------------- /knn/testdata/business/3.txt: -------------------------------------------------------------------------------- 1 | WorldCom director admits lying 2 | 3 | The former chief financial officer at US telecoms firm WorldCom has admitted before a New York court that he used to lie to fellow board members. 
4 | 5 | Speaking at the trial of his former boss Bernard Ebbers, Scott Sullivan said he lied to the board to cover up the hole in WorldCom's finances. Mr Ebbers is on trial for fraud and conspiracy in relation to WorldCom's collapse in 2002. He pleads not guilty. The firm had been overstating its accounts by $11bn (£8.5bn). Mr Sullivan, 42, has already pleaded guilty to fraud and will be sentenced following Mr Ebbers' trial, where he is appearing as a prosecution witness. Mr Ebbers, 63, has always insisted that he was unaware of any hidden shortfalls in WorldCom's finances. 6 | 7 | In the New York court on Wednesday, Mr Ebbers' lawyer Reid Weingarten asked Mr Sullivan: "If you believe something is in your interest, you are willing and able to lie to accomplish it, isn't that right?" 8 | 9 | "On that date, yes. I was lying," replied Mr Sullivan. Mr Weingarten has suggested that Mr Sullivan is implicating Mr Ebbers only to win a lighter sentence, something Mr Sullivan denies. Mr Sullivan also rejects a suggestion that he had once told fellow WorldCom board member Bert Roberts that Mr Ebbers was unaware of the accounting fraud at WorldCom. The trial of Mr Ebbers is now into its third week. 10 | 11 | Under 23 hours of questioning from a federal prosecutor, Mr Sullivan has previously told the court that he repeatedly warned Mr Ebbers that falsifying the books would be the only way to meet Wall Street revenue and earnings expectations. Mr Sullivan claims that Mr Ebbers refused to stop the fraud. Mr Ebbers could face a sentence of 85 years if convicted of all the charges he is facing. WorldCom's problems appear to have begun with the collapse of the dotcom boom which cut its business from internet companies. Prosecutors allege that the company's top executives responded by orchestrating massive fraud over a two-year period. WorldCom emerged from bankruptcy protection in 2004, and is now known as MCI. 12 | -------------------------------------------------------------------------------- /knn/testdata/business/4.txt: -------------------------------------------------------------------------------- 1 | Bank voted 8-1 for no rate change 2 | 3 | The decision to keep interest rates on hold at 4.75% earlier this month was passed 8-1 by the Bank of England's rate-setting body, minutes have shown. 4 | 5 | One member of the Bank's Monetary Policy Committee (MPC) - Paul Tucker - voted to raise rates to 5%. The news surprised some analysts who had expected the latest minutes to show another unanimous decision. Worries over growth rates and consumer spending were behind the decision to freeze rates, the minutes showed. The Bank's latest inflation report, released last week, had noted that the main reason inflation might fall was weaker consumer spending. 6 | 7 | However, MPC member Paul Tucker voted for a quarter point rise in interest rates to 5%. He argued that economic growth was picking up, and that the equity, credit and housing markets had been stronger than expected. 8 | 9 | The Bank's minutes said that risks to the inflation forecast were "sufficiently to the downside" to keep rates on hold at its latest meeting. However, the minutes added: "Some members noted that an increase might be warranted in due course if the economy evolved in line with the central projection". Ross Walker, UK economist at Royal Bank of Scotland, said he was surprised that a dissenting vote had been made so soon. He said the minutes appeared to be "trying to get the market to focus on the possibility of a rise in rates". 
"If the economy pans out as they expect then they are probably going to have to hike rates." However, he added, any rate increase is not likely to happen until later this year, with MPC members likely to look for a more sustainable pick up in consumer spending before acting. 10 | -------------------------------------------------------------------------------- /knn/testdata/business/5.txt: -------------------------------------------------------------------------------- 1 | Brewers' profits lose their fizz 2 | 3 | Heineken and Carlsberg, two of the world's largest brewers, have reported falling profits after beer sales in western Europe fell flat. 4 | 5 | Dutch firm Heineken saw its annual profits drop 33% and warned that earnings in 2005 may also slide. Danish brewer Carlsberg suffered a 3% fall in profits due to waning demand and increased marketing costs. Both are looking to Russia and China to provide future growth as western European markets are largely mature. 6 | 7 | Heineken's net income fell to 537m euros ($701m; £371m) during 2004, from 798m euro a year ago. It blamed weak demand in western Europe and currency losses. It had warned in September that the weakening US dollar, which has cut the value of foreign sales, would knock 125m euros off its operating profits. Despite the dip in profits, Heineken's sales have been improving and total revenue for the year was 10bn euros, up 8.1% from 9.26bn euros in 2003. Heineken said it now plans to invest 100m euros in "aggressive" and "high-impact" marketing in Europe and the US in 2005. Heineken, which also owns the Amstel and Murphy's stout brands, said it would also seek to cut costs. This may involve closing down breweries. 8 | 9 | Heineken increased its dividend payment by 25% to 40 euro cents, but warned that the continued impact of a weaker dollar and an increased marketing spend may lead to a drop in 2005 net profit. 10 | 11 | Carlsberg, the world's fifth-largest brewer, saw annual pre-tax profits fall to 3.4bn Danish kroner (456m euros). Its beer sales have been affected by the sluggish European economy and by the banning of smoking in pubs in several European countries. Nevertheless, total sales increased 4% to 36bn kroner, thanks to strong sales of Carlsberg lager in Russia and Poland. Carlsberg is more optimistic than Heineken about 2005, projecting a 15% rise in net profits for the year. However, it also plans to cut 200 jobs in Sweden, where sales have been hit by demand for cheap, imported brands. "We remain cautious about the medium-to-long term outlook for revenue growth across western Europe for a host of economic, social and structural reasons," investment bank Merrill Lynch said of Carlsberg. 12 | -------------------------------------------------------------------------------- /knn/testdata/business/6.txt: -------------------------------------------------------------------------------- 1 | Saab to build Cadillacs in Sweden 2 | 3 | General Motors, the world's largest car maker, has confirmed that it will build a new medium-sized Cadillac BLS at its loss-making Saab factory in Sweden. 4 | 5 | The car, unveiled at the Geneva motor show, is intended to compete in the medium-sized luxury car market. It will not be sold in the US, said GM Europe president Carl-Peter Forster. As part of its efforts to make the US marque appeal to European drivers, the car will be the first Cadillac with a diesel engine. 6 | 7 | GM's announcement should go some way to allay fears of the Saab factory's closure. 
The factory in Trollhaettan has been at the centre of rumours about GM's planned severe cutbacks in its troubled European operations. But the group's new commitment to the Swedish factory may not be welcomed by the group's Opel workers in Ruesselsheim, Germany. They may now have to face a larger proportion of GM's cuts. 8 | 9 | Neither will the announcement be seen as unalloyed good news in Sweden, since it reflects Saab's failure to make significant inroads into the lucrative European luxury car market. For years, Saab has consistently said it is competing head-on with BMW, Mercedes and Jaguar. The segment's leaders do not agree. 10 | 11 | GM's plans to build the American marque in Sweden is part of its efforts to push it as an alternative luxury brand for European drivers. In the US, it has long been established as an upmarket brand - even the presidential limousine carries the badge. Yet it could prove tough for Cadillac to steal market share from the majors in Europe. Other luxury car makers, most notably the Toyota subsidiary Lexus, have enjoyed tremendous success in the US without managing to make significant inroads in Europe. There, German marques Mercedes Benz and BMW have retained their stranglehold on the luxury market. 12 | 13 | Bringing Cadillac production to Sweden should help introduce desperately-needed scale to the Saab factory, which currently produces fewer than 130,000 cars per year. That is about half of what major car makers consider sufficient numbers for profitable operations, and Saab is losing money fast - albeit with losses halved in 2004 to $200m (£104m; 151m euros) from $500m the previous year. Beyond the 12,000 job cuts announced last year at its European operations, GM is reducing expenditure by building Saabs, Opels - badged as Vauxhalls in the UK - and now Cadillacs on the same framework, and by allowing the different brands to share parts. Another way to further reduce Saab's losses could be to shift some of the production of Saabs to the US, a market where drivers have adopted it as an upmarket European car. Doing so would remove the exposure to the weak US dollar, which is making Saabs more expensive to US consumers. But not everyone in the industry agree that it would be the best way forward. "We know that in five years the US dollar will be stronger than it is today," the chief executive of a leading European car maker told BBC News. The current trend towards US production was "stupid", he said. 14 | 15 | In a separate announcement, GM unveiled a new scheme to allow European consumers the chance to test drive its Opel and Vauxhall models. It is to deploy a fleet of 35,000 test cars across 40 countries, inviting potential buyers to try out a vehicle for 24-hours. It follows a similar initiative by GM in the US. GM said it wanted to change "customers' perceptions" about Opel and Vauxhall cars, showing them that the quality had improved in recent years. 16 | -------------------------------------------------------------------------------- /knn/testdata/sports/1.txt: -------------------------------------------------------------------------------- 1 | Claxton hunting first major medal 2 | 3 | British hurdler Sarah Claxton is confident she can win her first major medal at next month's European Indoor Championships in Madrid. 4 | 5 | The 25-year-old has already smashed the British record over 60m hurdles twice this season, setting a new mark of 7.96 seconds to win the AAAs title. "I am quite confident," said Claxton. "But I take each race as it comes. 
"As long as I keep up my training but not do too much I think there is a chance of a medal." Claxton has won the national 60m hurdles title for the past three years but has struggled to translate her domestic success to the international stage. Now, the Scotland-born athlete owns the equal fifth-fastest time in the world this year. And at last week's Birmingham Grand Prix, Claxton left European medal favourite Russian Irina Shevchenko trailing in sixth spot. 6 | 7 | For the first time, Claxton has only been preparing for a campaign over the hurdles - which could explain her leap in form. In previous seasons, the 25-year-old also contested the long jump but since moving from Colchester to London she has re-focused her attentions. Claxton will see if her new training regime pays dividends at the European Indoors which take place on 5-6 March. 8 | -------------------------------------------------------------------------------- /knn/testdata/sports/2.txt: -------------------------------------------------------------------------------- 1 | Greene sets sights on world title 2 | 3 | Maurice Greene aims to wipe out the pain of losing his Olympic 100m title in Athens by winning a fourth World Championship crown this summer. 4 | 5 | He had to settle for bronze in Greece behind fellow American Justin Gatlin and Francis Obikwelu of Portugal. "It really hurts to look at that medal. It was my mistake. I lost because of the things I did," said Greene, who races in Birmingham on Friday. "It's never going to happen again. My goal - I'm going to win the worlds." Greene crossed the line just 0.02 seconds behind Gatlin, who won in 9.87 seconds in one of the closest and fastest sprints of all time. But Greene believes he lost the race and his title in the semi-finals. "In my semi-final race, I should have won the race but I was conserving energy. "That's when Francis Obikwelu came up and I took third because I didn't know he was there. "I believe that's what put me in lane seven in the final and, while I was in lane seven, I couldn't feel anything in the race. 6 | 7 | "I just felt like I was running all alone. "I believe if I was in the middle of the race I would have been able to react to people that came ahead of me." Greene was also denied Olympic gold in the 4x100m men's relay when he could not catch Britain's Mark Lewis-Francis on the final leg. The Kansas star is set to go head-to-head with Lewis-Francis again at Friday's Norwich Union Grand Prix. The pair contest the 60m, the distance over which Greene currently holds the world record of 6.39 seconds. He then has another indoor meeting in France before resuming training for the outdoor season and the task of recapturing his world title in Helsinki in August. Greene believes Gatlin will again prove the biggest threat to his ambitions in Finland. But he also admits he faces more than one rival for the world crown. "There's always someone else coming. I think when I was coming up I would say there was me and Ato (Boldon) in the young crowd," Greene said. "Now you've got about five or six young guys coming up at the same time." 8 | -------------------------------------------------------------------------------- /knn/testdata/sports/3.txt: -------------------------------------------------------------------------------- 1 | Edwards tips Idowu for Euro gold 2 | 3 | World outdoor triple jump record holder and BBC pundit Jonathan Edwards believes Phillips Idowu can take gold at the European Indoor Championships. 
4 | 5 | Idowu landed 17.30m at the British trials in Sheffield last month to lead the world triple jump rankings. "It's all down to him, but if he jumps as well as he did in Sheffield he could win the gold medal," said Edwards. "His ability is undoubted but all his best performances seem to happen in domestic meetings." 6 | 7 | Idowu made his breakthrough five years ago but so far has only a Commonwealth silver medal to his name. Edwards himself kept Idowu off top spot at the Manchester Games. But he believes the European Indoors in Madrid represent a chance for the 26-year-old to prove his credentials as Britain's top triple jumper. "He has to start producing at international level and here is the beginning," said Edwards. "Phillips still needs to be much more consistent. I'm sure a victory in Madrid will build up his confidence and self-belief that he can be best in the world." The qualifying round of the men's triple jump in Madrid takes place on Friday with the final scheduled for Saturday. Olympic champion Christian Olsson will not be taking part as he is out for the entire indoor season with an ankle injury. 8 | -------------------------------------------------------------------------------- /knn/testdata/sports/4.txt: -------------------------------------------------------------------------------- 1 | Mido makes third apology 2 | 3 | Ahmed 'Mido' Hossam has made another apology to the Egyptian people in an attempt to rejoin the national team. 4 | 5 | The 21-year-old told a news conference in Cairo on Sunday that he is sorry for the problems that have led to his exclusion from the Pharaohs since July last year. Mido said: "There isn't much I have to say today, all there is to say is that I came specially from England to Egypt to rejoin the national team and to apologise for all my mistakes." Mido was axed by former coach Marco Tardelli after failing to answer a national call-up, claiming he had a groin injury. But he then played in a friendly for his club AS Roma within 24 hours of a World Cup qualifying match at home to Cameroon last September. Mido added: "It's not my right to give orders and say when I want to play ... at the same time I will always make sure that I put the national's team's matches as my top priority. "I feel that the national players are playing with a new spirit as I saw them play against Belgium (Egypt won 4-0 on Wednesday) and I simply want to add to their success. "I do confess that I was rude to the Egyptian press at times but now I have gained more experience and know that I will never go anywhere without the press's support. "Many of the international stars like David Beckham and (Zinedine) Zidane had the press opposing them. "So I'm now used to the fact that the press can be against me at times and I don't have to overreact when this happens. Meanwhile, Egypt FA spokesman Methat Shalaby welcomed the apology and said no one had exerted pressure on Mido to apologise. "Mido's apology today does not negatively affect Mido in anyway, on the contrary it makes him a bigger star and a role model for all football players," Shalaby said. Shalaby earlier said that after an apology Mido would be available for the national side if coach Hassan Shehata chose him. Mido joined Tottenham in an 18-month loan deal near the end of the January transfer window, scoring twice on his debut against Portsmouth. 
6 | -------------------------------------------------------------------------------- /knn/testdata/sports/5.txt: -------------------------------------------------------------------------------- 1 | McIlroy aiming for Madrid title 2 | 3 | Northern Ireland man James McIlroy is confident he can win his first major title at this weekend's Spar European Indoor Championships in Madrid. 4 | 5 | The 28-year-old has been in great form in recent weeks and will go in as one of the 800 metres favourites. "I believe after my wins abroad and in our trial race in Sheffield, I can run my race from the front, back or middle," said McIlroy. New coach Tony Lester has helped get McIlroy's career back on track. The 28-year-old 800 metres runner has not always matched his promise with performances but believes his decision to change coaches and move base will bring the rewards. McIlroy now lives in Windsor and feels his career has been transformed by the no-nonsense leadership style of former Army sergeant Lester. Lester is better known for his work with 400m runners Roger Black and Mark Richardson in the past but under his guidance McIlroy has secured five wins this indoor season. 6 | 7 | McIlroy now claims he is in his best shape since finishing fourth for Ireland at the outdoor European Championships in 1998. "That was my last decent year," said McIlroy, who temporarily retired last August before returning to the sport under Lester's shrewd guidance. "Before, every race was like trying to climb Mount Everest and I now know you can't do it on your own. "Trying to succeed saw me sometimes standing half-dead and terrified on the starting line, which became a bit too much." McIlroy, who was compared to the likes of Sebastian Coe, Steve Cram and Steve Ovett in his younger days, is now competing without the benefit of National Lottery funding. That situation could change if he maintains his current form and repeats the world-class times he produced in the 800m and 1000m at major races in Erfurt and Stuttgart earlier this season. Russian Dmitriy Bogdanov won at the same Madrid venue last week and then claimed the European Championship race would be between himself, Dutchman Arnoud Okken and Antonio Reina of Spain but McIlroy is unfazed. 8 | 9 | He admitted: "He looked quite good in his win and fair enough everyone has the right to their own opinion. "I never write myself off and let's face it, I haven't or looked like being beaten this season." And McIlroy, whose time of one minute 46.68seconds in Erfurt elevated him to sixth place on the UK All-Time list, is also already looking beyond Madrid. He said: "I've been much more focused this year about my career and having such a good team around me has been very important. "Ultimately of course, this weekend is a means to an end and that is getting prepared for the summer's world championships. "That ambition has meant that I've had only two nights out since last August. The rest of my time has seen me just concentrating on rebuilding my career." 10 | -------------------------------------------------------------------------------- /knn/testdata/sports/6.txt: -------------------------------------------------------------------------------- 1 | 2004: An Irish Athletics Year 2 | 3 | 2004 won't be remembered as one of Irish athletics' great years. 4 | 5 | The year began with that optimism which invariably and unaccountably, seems to herald an upcoming Olympiad. 
But come late August, a few hot days in the magnificent stadium in Athens told us of the true strength of Irish athletics - or to be more accurate, the lack of it. Sonia O'Sullivan's Olympic farewell apart, there was little to stir the emotions of Irish athletics watchers. But after the disastrous build-up to the games, we shouldn't have been surprised. At the start of the year, an O'Sullivan had been earmarked as Ireland's best medal prospect but as it turned out, walker Gillian never even made it to the start line because of injury. Less than a week before the Olympics, the sport was rocked by news that 10,000m hope Cathal Lombard had tested for the banned substance EPO. Lombard's shattering of Mark Carroll's national 10,000m record in April had already set tongues wagging but even the most cynical of observers, were surprised when he was rumbled after an Irish Sports Council sting operation. The Corkman quickly held his hands up in admission and was promptly handed a two-year ban from the sport. 6 | 7 | Back at pre-Olympic ranch in Greece, it must have seemed that things couldn't have got any worse but they very nearly did with walker Jamie Costin lucky to escape with his life after being involved in a car crash near Athens. Once the track and field action began in Athens, a familiar pattern of underachievement emerged although Alistair Cragg's performance in being the only athlete from a European nation to qualify for the 5,000m final did offer hope for the future. Our beloved Sonia scraped into the women's 5K final as a fastest loser and for a couple of days, the country attempted to delude itself into believing that she might be in the medal shake-up. As it happened, she went out the back door early in the final although there was nothing undignified about the way that she insisted on finishing the race over a minute behind winner Meseret Defar. It later transpired that Sonia had been suffering from a stomach bug in the 48 hours before the final although typically, the Cobhwoman played down the effects of the illness. Amazingly, she was back in action a couple of weeks later when beating a world-class field at the Flora Lite 5K road race in London and while her major championship days may be over, it's unlikely that we have seen the last of her in competition. At least Sonia managed to make it to Athens. At the start of the year, several Northern Ireland athletes had genuine hopes of qualifying for the Games but come August, an out-of-form and injured Paul Brizzel was the lone standard bearer for the province. The Ballymena man gave it a lash but his achilles problem, and a bad lane draw, meant a time of 21.00 and an early exit. 8 | 9 | James McIlroy, Gareth Turnbull, Zoe Brown and Paul McKee all had to be content with watching the Athens action on their television screens. 800m hope McIlroy never got near his best during the summer and a fourth place in the British trials effectively ended his hopes of making the plane. The injury-plagued Turnbull gamely travelled round Europe in search of the 1500m qualifying mark but 3:39 was the best he could achieve, after missing several months training during the previous winter. A lingering hamstring probem and a virus wrecked McKee's Athens ambitions and both he and Turnbull deserve a slice of better fortune in 2005. Pole vaulter Brown had hoped for a vote of confidence from the British selectors after she had achieved the Athens B standard but the call never came. 
As the summer ended, stalwarts Catherina McKiernan and Dermot Donnelly hung up their competitive spikes. McKiernan had to candidly acknowledge that time had crept up on her after several injury-ravaged years. Donnelly and his Annadale Striders team-mates later suffered tragedy when their friend and clubman Andy Campbell was found dead at his home on 18 December. A large turnout of athletics-loving folk turned out in west Belfast to offer their respects to the Campbell family and Andy's many friends. As only death can, it put the year's athletics happenings in a sharp perspective. 10 | -------------------------------------------------------------------------------- /naive/naive.go: -------------------------------------------------------------------------------- 1 | package naive 2 | 3 | import ( 4 | "bytes" 5 | "errors" 6 | "io" 7 | "sync" 8 | 9 | "github.com/n3integration/classifier" 10 | ) 11 | 12 | // ErrNotClassified indicates that a document could not be classified 13 | var ErrNotClassified = errors.New("unable to classify document") 14 | 15 | // Option provides a functional setting for the Classifier 16 | type Option func(c *Classifier) error 17 | 18 | // Classifier implements a naive bayes classifier 19 | type Classifier struct { 20 | feat2cat map[string]map[string]int 21 | catCount map[string]int 22 | tokenizer classifier.Tokenizer 23 | mu sync.RWMutex 24 | } 25 | 26 | // New initializes a new naive Classifier using the standard tokenizer 27 | func New(opts ...Option) *Classifier { 28 | c := &Classifier{ 29 | feat2cat: make(map[string]map[string]int), 30 | catCount: make(map[string]int), 31 | tokenizer: classifier.NewTokenizer(), 32 | } 33 | for _, opt := range opts { 34 | opt(c) 35 | } 36 | return c 37 | } 38 | 39 | // Tokenizer overrides the classifier's default Tokenizer 40 | func Tokenizer(t classifier.Tokenizer) Option { 41 | return func(c *Classifier) error { 42 | c.tokenizer = t 43 | return nil 44 | } 45 | } 46 | 47 | // Train provides supervisory training to the classifier 48 | func (c *Classifier) Train(r io.Reader, category string) error { 49 | c.mu.Lock() 50 | defer c.mu.Unlock() 51 | 52 | for feature := range c.tokenizer.Tokenize(r) { 53 | c.addFeature(feature, category) 54 | } 55 | 56 | c.addCategory(category) 57 | return nil 58 | } 59 | 60 | // TrainString provides supervisory training to the classifier 61 | func (c *Classifier) TrainString(doc string, category string) error { 62 | return c.Train(asReader(doc), category) 63 | } 64 | 65 | // Classify attempts to classify a document. If the document cannot be classified 66 | // (eg. because the classifier has not been trained), an error is returned. 
67 | func (c *Classifier) Classify(r io.Reader) (string, error) { 68 | max := 0.0 69 | var err error 70 | classification := "" 71 | probabilities := make(map[string]float64) 72 | 73 | c.mu.RLock() 74 | defer c.mu.RUnlock() 75 | 76 | for _, category := range c.categories() { 77 | probabilities[category] = c.probability(r, category) 78 | if probabilities[category] > max { 79 | max = probabilities[category] 80 | classification = category 81 | } 82 | } 83 | 84 | if classification == "" { 85 | return "", ErrNotClassified 86 | } 87 | return classification, err 88 | } 89 | 90 | // Probabilities runs the provided string through the model and returns 91 | // the potential probability for each classification 92 | func (c *Classifier) Probabilities(str string) (map[string]float64, string) { 93 | probabilities := make(map[string]float64) 94 | 95 | c.mu.RLock() 96 | defer c.mu.RUnlock() 97 | 98 | best := 0.0 99 | cat := `` 100 | 101 | for _, category := range c.categories() { 102 | prob := c.probability(asReader(str), category) 103 | if prob > 0 { 104 | probabilities[category] = prob 105 | } 106 | if prob > best { 107 | best = prob 108 | cat = category 109 | } 110 | } 111 | 112 | return probabilities, cat 113 | } 114 | 115 | // ClassifyString provides convenience classification for strings 116 | func (c *Classifier) ClassifyString(doc string) (string, error) { 117 | return c.Classify(asReader(doc)) 118 | } 119 | 120 | func (c *Classifier) addFeature(feature string, category string) { 121 | if _, ok := c.feat2cat[feature]; !ok { 122 | c.feat2cat[feature] = make(map[string]int) 123 | } 124 | c.feat2cat[feature][category]++ 125 | } 126 | 127 | func (c *Classifier) featureCount(feature string, category string) float64 { 128 | if _, ok := c.feat2cat[feature]; ok { 129 | return float64(c.feat2cat[feature][category]) 130 | } 131 | return 0.0 132 | } 133 | 134 | func (c *Classifier) addCategory(category string) { 135 | c.catCount[category]++ 136 | } 137 | 138 | func (c *Classifier) categoryCount(category string) float64 { 139 | if _, ok := c.catCount[category]; ok { 140 | return float64(c.catCount[category]) 141 | } 142 | return 0.0 143 | } 144 | 145 | func (c *Classifier) count() int { 146 | sum := 0 147 | for _, value := range c.catCount { 148 | sum += value 149 | } 150 | return sum 151 | } 152 | 153 | func (c *Classifier) categories() []string { 154 | var keys []string 155 | for k := range c.catCount { 156 | keys = append(keys, k) 157 | } 158 | return keys 159 | } 160 | 161 | func (c *Classifier) featureProbability(feature string, category string) float64 { 162 | if c.categoryCount(category) == 0 { 163 | return 0.0 164 | } 165 | return c.featureCount(feature, category) / c.categoryCount(category) 166 | } 167 | 168 | func (c *Classifier) weightedProbability(feature string, category string) float64 { 169 | return c.variableWeightedProbability(feature, category, 1.0, 0.5) 170 | } 171 | 172 | func (c *Classifier) variableWeightedProbability(feature string, category string, weight float64, assumedProb float64) float64 { 173 | sum := 0.0 174 | probability := c.featureProbability(feature, category) 175 | for _, category := range c.categories() { 176 | sum += c.featureCount(feature, category) 177 | } 178 | return ((weight * assumedProb) + (sum * probability)) / (weight + sum) 179 | } 180 | 181 | func (c *Classifier) probability(r io.Reader, category string) float64 { 182 | categoryProbability := c.categoryCount(category) / float64(c.count()) 183 | docProbability := c.docProbability(r, category) 184 | return 
docProbability * categoryProbability 185 | } 186 | 187 | func (c *Classifier) docProbability(r io.Reader, category string) float64 { 188 | probability := 1.0 189 | for feature := range c.tokenizer.Tokenize(r) { 190 | probability *= c.weightedProbability(feature, category) 191 | } 192 | return probability 193 | } 194 | 195 | func asReader(text string) io.Reader { 196 | return bytes.NewBufferString(text) 197 | } 198 | -------------------------------------------------------------------------------- /naive/naive_test.go: -------------------------------------------------------------------------------- 1 | package naive 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | var ( 8 | ham = "The quick brown fox jumps over the lazy dog" 9 | spam = "Earn cash quick online" 10 | ) 11 | 12 | func TestProbability(t *testing.T) { 13 | classifier := New() 14 | 15 | t.Run(`Probabilities`, func(t *testing.T) { 16 | for z := 0; z < 1; z++ { 17 | classifier.TrainString(`aaa bbb ccc ddd`, "A") 18 | classifier.TrainString(`111 222 333 444 zzz`, "X") 19 | classifier.TrainString(`bbb ccc ddd eee`, "A") 20 | classifier.TrainString(`222 333 444 555 zzz`, "X") 21 | classifier.TrainString(`bbb ccc ddd eee fff`, "A") 22 | classifier.TrainString(`222 333 444 555 666 zzz`, "X") 23 | } 24 | 25 | if m, _ := classifier.Probabilities(`bbb ccc ddd`); m[`A`] <= m[`X`] { 26 | t.Errorf(`A=%.2f value should be greater than X=%.2f`, m[`X`], m[`A`]) 27 | } 28 | 29 | if m, _ := classifier.Probabilities(`222 333 zzz`); m[`X`] <= m[`A`] { 30 | t.Errorf(`X=%.2f value should be greater than A=%.2f`, m[`X`], m[`A`]) 31 | } 32 | }) 33 | } 34 | func TestAddFeature(t *testing.T) { 35 | classifier := New() 36 | classifier.addFeature("quick", "good") 37 | assertFeatureCount(t, classifier, "quick", "good", 1.0) 38 | assertFeatureCount(t, classifier, "quick", "bad", 0.0) 39 | classifier.addFeature("quick", "bad") 40 | assertFeatureCount(t, classifier, "quick", "bad", 1.0) 41 | } 42 | 43 | func TestAddCategory(t *testing.T) { 44 | classifier := New() 45 | 46 | assertCategoryCount(t, classifier, "good", 0.0) 47 | classifier.addCategory("good") 48 | assertCategoryCount(t, classifier, "good", 1.0) 49 | categories := classifier.categories() 50 | 51 | assertEqual(t, float64(classifier.count()), float64(len(categories))) 52 | } 53 | 54 | func TestTrain(t *testing.T) { 55 | classifier := New() 56 | 57 | if err := classifier.TrainString(ham, "good"); err != nil { 58 | t.Error("classifier training failed") 59 | } 60 | 61 | if err := classifier.TrainString(spam, "bad"); err != nil { 62 | t.Error("classifier training failed") 63 | } 64 | 65 | assertFeatureCount(t, classifier, "quick", "good", 1.0) 66 | assertFeatureCount(t, classifier, "quick", "bad", 1.0) 67 | assertCategoryCount(t, classifier, "good", 1) 68 | assertCategoryCount(t, classifier, "bad", 1) 69 | } 70 | 71 | func TestClassify(t *testing.T) { 72 | classifier := New() 73 | text := "Quick way to make cash" 74 | 75 | t.Run("Empty classifier", func(t *testing.T) { 76 | if _, err := classifier.ClassifyString(text); err != ErrNotClassified { 77 | t.Errorf("expected classification error; received: %v", err) 78 | } 79 | }) 80 | 81 | t.Run("Trained classifier", func(t *testing.T) { 82 | classifier.TrainString(ham, "good") 83 | classifier.TrainString(spam, "bad") 84 | 85 | if _, err := classifier.ClassifyString(text); err != nil { 86 | t.Error("document incorrectly classified") 87 | } 88 | }) 89 | } 90 | 91 | func assertCategoryCount(t *testing.T, classifier *Classifier, category string, count float64) { 92 | 
v := classifier.categoryCount(category) 93 | assertEqual(t, count, v) 94 | } 95 | 96 | func assertFeatureCount(t *testing.T, classifier *Classifier, feature string, category string, count float64) { 97 | v := classifier.featureCount(feature, category) 98 | assertEqual(t, count, v) 99 | } 100 | 101 | func assertEqual(t *testing.T, expected, actual float64) { 102 | if actual != expected { 103 | t.Errorf("Expectation mismatch. Expected(%f) <=> Actual (%f)", expected, actual) 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /stopwords.go: -------------------------------------------------------------------------------- 1 | package classifier 2 | 3 | import ( 4 | "strings" 5 | ) 6 | 7 | var ( 8 | stopwords = map[string]struct{}{ 9 | "a": {}, "able": {}, "about": {}, "across": {}, "after": {}, "all": {}, "almost": {}, "also": {}, "am": {}, "among": {}, "an": {}, "and": {}, "any": {}, "are": {}, "as": {}, "at": {}, 10 | "be": {}, "because": {}, "been": {}, "but": {}, "by": {}, "can": {}, "cannot": {}, "could": {}, "dear": {}, "did": {}, "do": {}, "does": {}, "either": {}, "else": {}, "ever": {}, 11 | "every": {}, "for": {}, "from": {}, "get": {}, "got": {}, "had": {}, "has": {}, "have": {}, "he": {}, "her": {}, "hers": {}, "him": {}, "his": {}, "how": {}, "however": {}, "i": {}, 12 | "if": {}, "in": {}, "into": {}, "is": {}, "it": {}, "its": {}, "just": {}, "least": {}, "let": {}, "like": {}, "likely": {}, "may": {}, "me": {}, "might": {}, "most": {}, "must": {}, 13 | "my": {}, "neither": {}, "no": {}, "nor": {}, "not": {}, "of": {}, "off": {}, "often": {}, "on": {}, "only": {}, "or": {}, "other": {}, "our": {}, "own": {}, "rather": {}, "said": {}, 14 | "say": {}, "says": {}, "she": {}, "should": {}, "since": {}, "so": {}, "some": {}, "than": {}, "that": {}, "the": {}, "their": {}, "them": {}, "then": {}, "there": {}, "these": {}, 15 | "they": {}, "this": {}, "tis": {}, "to": {}, "too": {}, "twas": {}, "us": {}, "wants": {}, "was": {}, "we": {}, "were": {}, "what": {}, "when": {}, "where": {}, "which": {}, "while": {}, 16 | "who": {}, "whom": {}, "why": {}, "will": {}, "with": {}, "would": {}, "yet": {}, "you": {}, "your": {}, 17 | } 18 | ) 19 | 20 | // IsStopWord checks against a list of known english stop words and returns true if v is a 21 | // stop word; false otherwise 22 | func IsStopWord(v string) bool { 23 | if _, ok := stopwords[strings.ToLower(v)]; ok { 24 | return true 25 | } 26 | return false 27 | } 28 | 29 | // IsNotStopWord is the inverse function of IsStopWord 30 | func IsNotStopWord(v string) bool { 31 | return !IsStopWord(v) 32 | } 33 | -------------------------------------------------------------------------------- /stopwords_test.go: -------------------------------------------------------------------------------- 1 | package classifier 2 | 3 | import "testing" 4 | 5 | func TestStopWords(t *testing.T) { 6 | t.Run("Stopword", func(t *testing.T) { 7 | sample := []string{"a", "is", "the"} 8 | for _, v := range sample { 9 | if IsNotStopWord(v) { 10 | t.Errorf("%s was not identified as a stop word", v) 11 | } 12 | } 13 | }) 14 | t.Run("Other", func(t *testing.T) { 15 | sample := []string{"hello", "world"} 16 | for _, v := range sample { 17 | if IsStopWord(v) { 18 | t.Errorf("%s was incorrectly identified as a stop word", v) 19 | } 20 | } 21 | }) 22 | } 23 | -------------------------------------------------------------------------------- /tokens.go: -------------------------------------------------------------------------------- 1 | 
package classifier 2 | 3 | import ( 4 | "bufio" 5 | "io" 6 | "strings" 7 | "unicode" 8 | "unicode/utf8" 9 | ) 10 | 11 | // Tokenizer provides a common interface to tokenize documents 12 | type Tokenizer interface { 13 | // Tokenize breaks the provided document into a channel of tokens 14 | Tokenize(io.Reader) chan string 15 | } 16 | 17 | // IsWord is a predicate to determine if a string contains at least two 18 | // characters and doesn't contain any numbers 19 | func IsWord(v string) bool { 20 | return len(v) > 2 && !strings.ContainsAny(v, "01234556789") 21 | } 22 | 23 | // StdOption provides configuration settings for a StdTokenizer 24 | type StdOption func(*StdTokenizer) 25 | 26 | // StdTokenizer provides a common document tokenizer that splits a 27 | // document by word boundaries 28 | type StdTokenizer struct { 29 | transforms []Mapper 30 | splitFn bufio.SplitFunc 31 | filters []Predicate 32 | bufferSize int 33 | } 34 | 35 | // NewTokenizer initializes a new standard Tokenizer instance 36 | func NewTokenizer(opts ...StdOption) *StdTokenizer { 37 | tokenizer := &StdTokenizer{ 38 | bufferSize: 100, 39 | splitFn: bufio.ScanWords, 40 | transforms: []Mapper{ 41 | strings.ToLower, 42 | }, 43 | filters: []Predicate{ 44 | IsNotStopWord, 45 | }, 46 | } 47 | for _, opt := range opts { 48 | opt(tokenizer) 49 | } 50 | return tokenizer 51 | } 52 | 53 | // Tokenize words and return streaming results 54 | func (t *StdTokenizer) Tokenize(r io.Reader) chan string { 55 | tokenizer := bufio.NewScanner(r) 56 | tokenizer.Split(t.splitFn) 57 | tokens := make(chan string, t.bufferSize) 58 | 59 | go func() { 60 | for tokenizer.Scan() { 61 | tokens <- tokenizer.Text() 62 | } 63 | close(tokens) 64 | }() 65 | 66 | return t.pipeline(tokens) 67 | } 68 | 69 | func (t *StdTokenizer) pipeline(in chan string) chan string { 70 | return Map(Filter(in, t.filters...), t.transforms...) 71 | } 72 | 73 | // BufferSize adjusts the size of the buffered channel 74 | func BufferSize(size int) StdOption { 75 | return func(t *StdTokenizer) { 76 | t.bufferSize = size 77 | } 78 | } 79 | 80 | // SplitFunc overrides the default word split function, based on whitespace 81 | func SplitFunc(fn bufio.SplitFunc) StdOption { 82 | return func(t *StdTokenizer) { 83 | t.splitFn = fn 84 | } 85 | } 86 | 87 | // Transforms overrides the list of mappers 88 | func Transforms(m ...Mapper) StdOption { 89 | return func(t *StdTokenizer) { 90 | t.transforms = m 91 | } 92 | } 93 | 94 | // Filters overrides the list of predicates 95 | func Filters(f ...Predicate) StdOption { 96 | return func(t *StdTokenizer) { 97 | t.filters = f 98 | } 99 | } 100 | 101 | // ScanAlphaWords is a function that splits text on whitespace, punctuation, and symbols; 102 | // derived bufio.ScanWords 103 | func ScanAlphaWords(data []byte, atEOF bool) (advance int, token []byte, err error) { 104 | // Skip leading spaces and symbols 105 | start := 0 106 | for width := 0; start < len(data); start += width { 107 | var r rune 108 | r, width = utf8.DecodeRune(data[start:]) 109 | 110 | if !unicode.IsSpace(r) && !unicode.IsPunct(r) && !unicode.IsSymbol(r) { 111 | break 112 | } 113 | } 114 | 115 | // Scan until space or symbol, marking end of word. 116 | for width, i := 0, start; i < len(data); i += width { 117 | var r rune 118 | r, width = utf8.DecodeRune(data[i:]) 119 | if unicode.IsSpace(r) || unicode.IsPunct(r) || unicode.IsSymbol(r) { 120 | return i + width, data[start:i], nil 121 | } 122 | } 123 | 124 | // If we're at EOF, we have a final, non-empty, non-terminated word. Return it. 
125 | if atEOF && len(data) > start { 126 | return len(data), data[start:], nil 127 | } 128 | // Request more data. 129 | return start, nil, nil 130 | } 131 | -------------------------------------------------------------------------------- /tokens_test.go: -------------------------------------------------------------------------------- 1 | package classifier 2 | 3 | import ( 4 | "bytes" 5 | "io" 6 | "strings" 7 | "testing" 8 | "unicode" 9 | ) 10 | 11 | var ( 12 | text = "The quick brown fox jumped over the lazy dog" 13 | expected = 7 14 | ) 15 | 16 | type assertion func(t *testing.T, v string) 17 | 18 | func TestTokenize(t *testing.T) { 19 | tests := []struct { 20 | Name string 21 | Opts []StdOption 22 | Assertions []assertion 23 | }{ 24 | {"Standard Tokenizer", options(), assertions()}, 25 | {"Buffered Tokenizer", options(BufferSize(1)), assertions()}, 26 | {"ToUpper Tokenizer", options(Transforms(toUpper)), assertions(isUpper)}, 27 | {"Stopword Tokenizer", options(Filters(IsNotStopWord)), assertions(isStopWord)}, 28 | } 29 | 30 | for _, test := range tests { 31 | t.Run(test.Name, func(t *testing.T) { 32 | tokens := NewTokenizer(test.Opts...).Tokenize(toReader(text)) 33 | doTokenizeTest(t, tokens) 34 | }) 35 | } 36 | } 37 | 38 | func isStopWord(t *testing.T, v string) { 39 | if IsStopWord(v) { 40 | t.Errorf("value is a stopword") 41 | } 42 | } 43 | 44 | func isUpper(t *testing.T, v string) { 45 | for _, c := range v { 46 | if !unicode.IsUpper(c) { 47 | t.Errorf("value is not in uppercase") 48 | return 49 | } 50 | } 51 | } 52 | 53 | func toUpper(s string) string { 54 | return strings.ToUpper(s) 55 | } 56 | 57 | func toReader(text string) io.Reader { 58 | return bytes.NewBuffer([]byte(text)) 59 | } 60 | 61 | func doTokenizeTest(t *testing.T, tokens chan string, assertions ...assertion) { 62 | actual := 0 63 | for v := range tokens { 64 | for _, assert := range assertions { 65 | assert(t, v) 66 | } 67 | actual++ 68 | } 69 | if actual != expected { 70 | t.Errorf("Expected %d tokens; actual: %d", expected, actual) 71 | } 72 | } 73 | 74 | func options(opts ...StdOption) []StdOption { 75 | return opts 76 | } 77 | 78 | func assertions(assertions ...assertion) []assertion { 79 | return assertions 80 | } 81 | -------------------------------------------------------------------------------- /weight.go: -------------------------------------------------------------------------------- 1 | package classifier 2 | 3 | import ( 4 | "math" 5 | ) 6 | 7 | // WeightSchemeStrategy provides support for pluggable weight schemes 8 | type WeightSchemeStrategy func(doc map[string]float64) WeightScheme 9 | 10 | // WeightScheme provides a contract for term frequency weight schemes 11 | type WeightScheme func(term string) float64 12 | 13 | // Binary weight scheme: 1 if present; 0 otherwise 14 | func Binary(doc map[string]float64) WeightScheme { 15 | return func(term string) float64 { 16 | if _, ok := doc[term]; ok { 17 | return 1 18 | } 19 | return 0 20 | } 21 | } 22 | 23 | // BagOfWords weight scheme: counts the number of occurrences 24 | func BagOfWords(doc map[string]float64) WeightScheme { 25 | return func(term string) float64 { 26 | return doc[term] 27 | } 28 | } 29 | 30 | // TermFrequency weight scheme; counts the number of occurrences divided by 31 | // the number of terms within a document 32 | func TermFrequency(doc map[string]float64) WeightScheme { 33 | return func(term string) float64 { 34 | return math.Sqrt(doc[term] / float64(len(doc))) 35 | } 36 | } 37 | 38 | // LogNorm weight scheme: returns the natural 
log of one plus the number of occurrences of a term 39 | func LogNorm(doc map[string]float64) WeightScheme { 40 | 	return func(term string) float64 { 41 | 		return math.Log(1 + doc[term]) 42 | 	} 43 | } 44 | --------------------------------------------------------------------------------
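
Example usage (illustrative sketch, not a file in the tree above): the snippet below shows one way the naive Bayes classifier and the standard tokenizer defined in the listings might be wired together. It assumes the module path github.com/n3integration/classifier (the import already used by naive/naive.go) and the derived package path github.com/n3integration/classifier/naive; the training strings and category labels are invented for illustration.

package main

import (
	"fmt"
	"log"

	"github.com/n3integration/classifier"
	"github.com/n3integration/classifier/naive"
)

func main() {
	// Build a tokenizer that splits on whitespace, punctuation, and symbols
	// (ScanAlphaWords) instead of the default bufio.ScanWords, then wire it
	// into a naive Bayes classifier via the Tokenizer option.
	tok := classifier.NewTokenizer(
		classifier.SplitFunc(classifier.ScanAlphaWords),
	)
	c := naive.New(naive.Tokenizer(tok))

	// Supervised training: label a handful of short documents per category.
	if err := c.TrainString("stocks slid as quarterly profits fell", "business"); err != nil {
		log.Fatal(err)
	}
	if err := c.TrainString("the sprinter won gold in the 100m final", "sports"); err != nil {
		log.Fatal(err)
	}

	// Classify an unseen document; ErrNotClassified is returned when no
	// category scores above zero (for example, on an untrained model).
	category, err := c.ClassifyString("profits fell after weak quarterly sales")
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("category:", category)

	// Probabilities exposes the per-category scores behind the decision,
	// along with the best-scoring category.
	scores, best := c.Probabilities("a sprint for gold in the final")
	fmt.Println("best:", best, "scores:", scores)
}

Because Tokenize streams tokens through a buffered channel, each document is read exactly once and never fully buffered in memory, which is why Train and Classify accept an io.Reader and the TrainString/ClassifyString helpers simply wrap one around the input string.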