├── .github └── workflows │ └── build.yaml ├── .gitignore ├── LICENSE ├── README.md ├── VERSION ├── classifier.go ├── classifier_test.go ├── func.go ├── func_test.go ├── go.mod ├── go.sum ├── index ├── index.go └── index_test.go ├── knn ├── knn.go ├── knn_test.go ├── matrix.go ├── similarity.go ├── similarity_test.go ├── sort.go └── testdata │ ├── README.md │ ├── business │ ├── 1.txt │ ├── 2.txt │ ├── 3.txt │ ├── 4.txt │ ├── 5.txt │ └── 6.txt │ └── sports │ ├── 1.txt │ ├── 2.txt │ ├── 3.txt │ ├── 4.txt │ ├── 5.txt │ └── 6.txt ├── naive ├── naive.go └── naive_test.go ├── stopwords.go ├── stopwords_test.go ├── tokens.go ├── tokens_test.go └── weight.go /.github/workflows/build.yaml: -------------------------------------------------------------------------------- 1 | name: build pipeline 2 | on: 3 | push: 4 | branches: 5 | - master 6 | jobs: 7 | build: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@v3 11 | 12 | - name: Setup 13 | uses: actions/setup-go@v3 14 | with: 15 | go-version: 1.19.4 16 | 17 | - name: Deps 18 | run: | 19 | go version 20 | 21 | - name: Build 22 | run: go build -v 23 | 24 | - name: Lint 25 | run: | 26 | go vet . 27 | 28 | - name: Test 29 | run: go test -v -cover $(go list ./...) 30 | 31 | - name: Coverage 32 | run: | 33 | for pkg in $(go list ./...); do go test -v -coverprofile=coverage_tmp.txt -covermode=atomic $pkg || ERROR="Error testing $pkg"; tail -n +2 coverage_tmp.txt >> coverage.txt || die "Unable to append coverage for $pkg"; done 34 | bash <(curl -s https://codecov.io/bash) 35 | 36 | - name: Release 37 | env: 38 | GITHUB_TOKEN: ${{ github.token }} 39 | run: | 40 | VERSION=$(cat VERSION | grep "^version" | sed -e 's/version=//') 41 | go get github.com/aktau/github-release 42 | go install github.com/aktau/github-release 43 | $(go env GOPATH)/bin/github-release release --user n3integration --repo classifier --tag v$VERSION || echo "duplicate release" 44 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 2 | *.o 3 | *.a 4 | *.so 5 | 6 | # Folders 7 | _obj 8 | _test 9 | vendor/ 10 | 11 | # Architecture specific extensions/prefixes 12 | *.[568vq] 13 | [568vq].out 14 | 15 | *.cgo1.go 16 | *.cgo2.c 17 | _cgo_defun.c 18 | _cgo_gotypes.go 19 | _cgo_export.* 20 | 21 | _testmain.go 22 | 23 | *.exe 24 | *.test 25 | *.prof 26 | classifier 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # classifier 2 | General purpose text classifier (naïve bayes, k-nearest neighbors) 3 | 4 | [![codecov](https://codecov.io/gh/n3integration/classifier/branch/master/graph/badge.svg)](https://codecov.io/gh/n3integration/classifier) 5 | [![Go Report Card](https://goreportcard.com/badge/github.com/n3integration/classifier)](https://goreportcard.com/report/github.com/n3integration/classifier) 6 | [![Documentation](https://godoc.org/github.com/n3integration/classifier?status.svg)](http://godoc.org/github.com/n3integration/classifier) 7 | 8 | ## Installation 9 | 10 | ```bash 11 | go get github.com/n3integration/classifier 12 | ``` 13 | 14 | ## Usage 15 | 16 | ### Classification 17 | 18 | There are two methods of classifying text data: `io.Reader` or `string`. To classify strings, use the `TrainString` 19 | or `ClassifyString` functions. To classify larger sources, use the `Train` and `Classify` functions that 20 | take an `io.Reader` as input. 21 | 22 | ```go 23 | package main 24 | 25 | import ( 26 | "fmt" 27 | 28 | "github.com/n3integration/classifier/naive" 29 | ) 30 | 31 | func main() { 32 | classifier := naive.New() 33 | classifier.TrainString("The quick brown fox jumped over the lazy dog", "ham") 34 | classifier.TrainString("Earn a degree online", "ham") 35 | classifier.TrainString("Earn cash quick online", "spam") 36 | 37 | if classification, err := classifier.ClassifyString("Earn your masters degree online"); err == nil { 38 | fmt.Println("Classification => ", classification) // ham 39 | } else { 40 | fmt.Println("error: ", err) 41 | } 42 | } 43 | ``` 44 | 45 | ## Contributing 46 | 47 | - Fork the repository 48 | - Create a local feature branch 49 | - Run `gofmt` 50 | - Bump the `VERSION` file using [semantic versioning](https://semver.org/) 51 | - Submit a pull request 52 | 53 | ## License 54 | 55 | Copyright 2023 n3integration@gmail.com 56 | 57 | Licensed under the Apache License, Version 2.0 (the "License"); 58 | you may not use this file except in compliance with the License. 59 | You may obtain a copy of the License at 60 | 61 | http://www.apache.org/licenses/LICENSE-2.0 62 | 63 | Unless required by applicable law or agreed to in writing, software 64 | distributed under the License is distributed on an "AS IS" BASIS, 65 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 66 | See the License for the specific language governing permissions and 67 | limitations under the License. 
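As a complement to the naïve Bayes example above, the `knn` package exposes a k-nearest neighbors classifier with the same train/classify flow. The sketch below is illustrative only: the documents are reused from the Usage example, `k` is an arbitrary choice, and the remaining options are left at their defaults (binary term weights, cosine similarity).

```go
package main

import (
	"fmt"

	"github.com/n3integration/classifier/knn"
)

func main() {
	classifier := knn.New(knn.K(3))
	classifier.TrainString("The quick brown fox jumped over the lazy dog", "ham")
	classifier.TrainString("Earn a degree online", "ham")
	classifier.TrainString("Earn cash quick online", "spam")

	if classification, err := classifier.ClassifyString("Earn your masters degree online"); err == nil {
		fmt.Println("Classification => ", classification) // ham
	} else {
		fmt.Println("error: ", err)
	}
}
```

Additional options such as `knn.WeightScheme`, `knn.Similarity`, and `knn.Tokenizer` are exercised in `knn/knn_test.go`.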
68 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | version=0.5.0 2 | -------------------------------------------------------------------------------- /classifier.go: -------------------------------------------------------------------------------- 1 | package classifier 2 | 3 | import "io" 4 | 5 | // Classifier provides a simple interface for different text classifiers 6 | type Classifier interface { 7 | // Train allows clients to train the classifier 8 | Train(io.Reader, string) error 9 | // TrainString allows clients to train the classifier using a string 10 | TrainString(string, string) error 11 | // Classify performs a classification on the input corpus and assumes that 12 | // the underlying classifier has been trained. 13 | Classify(io.Reader) (string, error) 14 | // ClassifyString performs text classification using a string 15 | ClassifyString(string) (string, error) 16 | } 17 | 18 | // WordCounts extracts term frequencies from a text corpus 19 | func WordCounts(r io.Reader) (map[string]int, error) { 20 | instream := NewTokenizer().Tokenize(r) 21 | wc := make(map[string]int) 22 | for token := range instream { 23 | wc[token] = wc[token] + 1 24 | } 25 | return wc, nil 26 | } 27 | -------------------------------------------------------------------------------- /classifier_test.go: -------------------------------------------------------------------------------- 1 | package classifier 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestWordCounts(t *testing.T) { 8 | wc, err := WordCounts(toReader(text)) 9 | 10 | if err != nil { 11 | t.Error("failed to get word counts:", err) 12 | } 13 | 14 | if len(wc) != expected { 15 | t.Errorf("Expected %d; actual %d", expected, len(wc)) 16 | } 17 | 18 | for key, value := range wc { 19 | if value != 1 { 20 | t.Errorf("Incorrect term frequency for %s: %d", key, value) 21 | } 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /func.go: -------------------------------------------------------------------------------- 1 | package classifier 2 | 3 | const defaultBufferSize = 50 4 | 5 | // Predicate provides a predicate function 6 | type Predicate func(string) bool 7 | 8 | // Mapper provides a map function 9 | type Mapper func(string) string 10 | 11 | // Map applies f to each element of the supplied input channel 12 | func Map(vs chan string, f ...Mapper) chan string { 13 | stream := make(chan string, defaultBufferSize) 14 | 15 | go func() { 16 | for v := range vs { 17 | for _, fn := range f { 18 | v = fn(v) 19 | } 20 | stream <- v 21 | } 22 | close(stream) 23 | }() 24 | 25 | return stream 26 | } 27 | 28 | // Filter drops elements from the input channel that fail to satisfy every 29 | // supplied Predicate; only values accepted by all predicates are emitted 30 | // Filter is a Predicate aggregation 31 | func Filter(vs chan string, filters ...Predicate) chan string { 32 | stream := make(chan string, defaultBufferSize) 33 | apply := func(text string) bool { 34 | for _, f := range filters { 35 | if !f(text) { 36 | return false 37 | } 38 | } 39 | return true 40 | } 41 | 42 | go func() { 43 | for text := range vs { 44 | if apply(text) { 45 | stream <- text 46 | } 47 | } 48 | close(stream) 49 | }() 50 | 51 | return stream 52 | } -------------------------------------------------------------------------------- /func_test.go: -------------------------------------------------------------------------------- 1 | package classifier 2 | 3 | import
( 4 | "strings" 5 | "testing" 6 | ) 7 | 8 | var words = []string{ 9 | "hello", "world", 10 | } 11 | 12 | func streamWords() chan string { 13 | stream := make(chan string) 14 | go func() { 15 | for _, word := range words { 16 | stream <- word 17 | } 18 | close(stream) 19 | }() 20 | return stream 21 | } 22 | 23 | func TestMap(t *testing.T) { 24 | i := 0 25 | results := Map(streamWords(), strings.ToUpper) 26 | for word := range results { 27 | expected := strings.ToUpper(words[i]) 28 | if expected != word { 29 | t.Errorf("did not match expected result %v <> %v", expected, word) 30 | } 31 | i++ 32 | } 33 | } 34 | 35 | func TestFilter(t *testing.T) { 36 | results := Filter(streamWords(), func(s string) bool { 37 | return s != words[0] 38 | }) 39 | 40 | i := 0 41 | for word := range results { 42 | i++ 43 | if word != words[1] { 44 | t.Error("incorrect result:", word) 45 | } 46 | } 47 | if i != 1 { 48 | t.Error("incorrect number of results:", i) 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/n3integration/classifier 2 | 3 | go 1.19 4 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/n3integration/classifier/f8630b69279e8eae662ff56114a722497c4bd19d/go.sum -------------------------------------------------------------------------------- /index/index.go: -------------------------------------------------------------------------------- 1 | package index 2 | 3 | import ( 4 | "fmt" 5 | "sync" 6 | ) 7 | 8 | // TermIndex provides a term frequency index 9 | type TermIndex struct { 10 | index int 11 | terms map[string]*termRef 12 | sync.RWMutex 13 | } 14 | 15 | // NewTermIndex initializes an empty term frequency index 16 | func NewTermIndex(capacity int) *TermIndex { 17 | return &TermIndex{ 18 | terms: make(map[string]*termRef, capacity), 19 | } 20 | } 21 | 22 | // Add a term to the index 23 | func (i *TermIndex) Add(t string) int { 24 | i.Lock() 25 | defer i.Unlock() 26 | if _, ok := i.terms[t]; ok { 27 | i.terms[t].incr() 28 | return i.terms[t].index 29 | } 30 | i.terms[t] = &termRef{ 31 | 1, 32 | i.index, 33 | } 34 | i.index++ 35 | return i.terms[t].index 36 | } 37 | 38 | // IndexOf returns the index of the provided term, or -1 if not found 39 | func (i *TermIndex) IndexOf(term string) int { 40 | i.RLock() 41 | defer i.RUnlock() 42 | if t, ok := i.terms[term]; ok { 43 | return t.index 44 | } 45 | return -1 46 | } 47 | 48 | // Frequency returns the term frequency within the index 49 | func (i *TermIndex) Frequency(term string) float64 { 50 | i.RLock() 51 | defer i.RUnlock() 52 | if t, ok := i.terms[term]; ok { 53 | return t.freq 54 | } 55 | return 0 56 | } 57 | 58 | // Count returns the number of terms within the index 59 | func (i *TermIndex) Count() int { 60 | i.RLock() 61 | defer i.RUnlock() 62 | return len(i.terms) 63 | } 64 | 65 | func (i *TermIndex) String() string { 66 | i.RLock() 67 | defer i.RUnlock() 68 | return fmt.Sprintf("%v", i.terms) 69 | } 70 | 71 | // termRef provides a given term's frequency and ref index 72 | type termRef struct { 73 | freq float64 74 | index int 75 | } 76 | 77 | func (t *termRef) incr() float64 { 78 | t.freq++ 79 | return t.freq 80 | } 81 | 82 | func (t *termRef) String() string { 83 | return fmt.Sprintf("%v", t.freq) 84 | } 85 | 
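// Illustrative usage sketch: a minimal example of the exported TermIndex API
// above. The capacity and terms are arbitrary placeholders.
//
//	idx := NewTermIndex(4)
//	idx.Add("fox")           // first occurrence: ref index 0, frequency 1
//	idx.Add("fox")           // repeat occurrence: frequency becomes 2
//	idx.Add("dog")           // next distinct term: ref index 1
//	_ = idx.IndexOf("dog")   // 1
//	_ = idx.IndexOf("cat")   // -1 for unknown terms
//	_ = idx.Frequency("fox") // 2
//	_ = idx.Count()          // 2 distinct terms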
-------------------------------------------------------------------------------- /index/index_test.go: -------------------------------------------------------------------------------- 1 | package index 2 | 3 | import ( 4 | "strings" 5 | "testing" 6 | ) 7 | 8 | var ( 9 | text = "The quick brown fox jumped over the lazy dog" 10 | expected = 7 11 | ) 12 | 13 | func TestTermIndex(t *testing.T) { 14 | allTermsExpected := expected + 1 15 | index := NewTermIndex(allTermsExpected) 16 | for _, txt := range strings.Split(text, " ") { 17 | index.Add(strings.ToLower(txt)) 18 | } 19 | if index.Count() != allTermsExpected { 20 | t.Errorf("incorrect index size; expected %v, but got %v", expected, index.Count()) 21 | } 22 | for term := range index.terms { 23 | if index.Frequency(term) < 1 { 24 | t.Errorf("incorrect frequency; expected %v, but got %v", expected, index.Frequency(term)) 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /knn/knn.go: -------------------------------------------------------------------------------- 1 | package knn 2 | 3 | import ( 4 | "bytes" 5 | "errors" 6 | "fmt" 7 | "io" 8 | "math" 9 | "sort" 10 | "sync" 11 | 12 | "github.com/n3integration/classifier" 13 | "github.com/n3integration/classifier/index" 14 | ) 15 | 16 | const ( 17 | defaultKVal = 1 18 | defaultIndexCapacity = 10_000 19 | ) 20 | 21 | // Option provides a functional setting for the Classifier 22 | type Option func(c *Classifier) error 23 | 24 | // Classifier provides k-nearest neighbor classification 25 | type Classifier struct { 26 | mu sync.RWMutex 27 | 28 | k int 29 | categories []string 30 | index *index.TermIndex 31 | matrix *sparse 32 | similarity SimilarityScore 33 | tokenizer classifier.Tokenizer 34 | weightScheme classifier.WeightSchemeStrategy 35 | } 36 | 37 | // New initializes a new k-nearest neighbor classifier unless overridden, 38 | // binary term weights and k=1 will be used for the created instance 39 | func New(opts ...Option) *Classifier { 40 | c := &Classifier{ 41 | k: defaultKVal, 42 | categories: make([]string, 0), 43 | index: index.NewTermIndex(defaultIndexCapacity), 44 | matrix: newSparseMatrix(), 45 | similarity: CosineSimilarity, 46 | tokenizer: classifier.NewTokenizer(), 47 | weightScheme: classifier.Binary, 48 | } 49 | for _, opt := range opts { 50 | opt(c) 51 | } 52 | return c 53 | } 54 | 55 | // K provides the value of 'k' 56 | func K(k int) Option { 57 | return func(c *Classifier) error { 58 | if k < 1 { 59 | return errors.New("the value of k must be a positive integer") 60 | } 61 | c.k = k 62 | return nil 63 | } 64 | } 65 | 66 | // WeightScheme provides the term weight scheme 67 | func WeightScheme(s classifier.WeightSchemeStrategy) Option { 68 | return func(c *Classifier) error { 69 | c.weightScheme = s 70 | return nil 71 | } 72 | } 73 | 74 | // Similarity provides an alternate similarity scoring strategy 75 | func Similarity(s SimilarityScore) Option { 76 | return func(c *Classifier) error { 77 | c.similarity = s 78 | return nil 79 | } 80 | } 81 | 82 | // Tokenizer provides an alternate document Tokenizer 83 | func Tokenizer(t classifier.Tokenizer) Option { 84 | return func(c *Classifier) error { 85 | c.tokenizer = t 86 | return nil 87 | } 88 | } 89 | 90 | // TermIndex provides an alternate TermIndex 91 | func TermIndex(i *index.TermIndex) Option { 92 | return func(c *Classifier) error { 93 | c.index = i 94 | return nil 95 | } 96 | } 97 | 98 | func (c *Classifier) TrainString(doc string, category string) error { 99 | return 
c.Train(asReader(doc), category) 100 | } 101 | 102 | func (c *Classifier) Train(r io.Reader, category string) error { 103 | wordFreq := make(map[string]float64) 104 | for text := range c.tokenizer.Tokenize(r) { 105 | count := wordFreq[text] 106 | wordFreq[text] = count + 1 107 | 108 | if count == 0 { 109 | c.index.Add(text) 110 | } 111 | } 112 | 113 | c.mu.Lock() 114 | defer c.mu.Unlock() 115 | c.categories = append(c.categories, category) 116 | c.matrix.Add(c.index, c.weightScheme(wordFreq), wordFreq) 117 | return nil 118 | } 119 | 120 | func (c *Classifier) ClassifyString(doc string) (string, error) { 121 | return c.Classify(asReader(doc)) 122 | } 123 | 124 | func (c *Classifier) Classify(r io.Reader) (string, error) { 125 | wordFreq := make(map[string]float64) 126 | for text := range c.tokenizer.Tokenize(r) { 127 | count := wordFreq[text] 128 | wordFreq[text] = count + 1 129 | } 130 | 131 | c.mu.RLock() 132 | defer c.mu.RUnlock() 133 | this := c.matrix.MakeRow(c.index, c.weightScheme, wordFreq) 134 | next := c.matrix.Rows() 135 | results := make(topResults, 0) 136 | 137 | for row := next(); row != nil; row = next() { 138 | results = append(results, &topResult{ 139 | Score: c.similarity(row, this), 140 | Category: c.categories[row.Index()], 141 | }) 142 | } 143 | 144 | sort.Sort(results) 145 | return results.query(c.k), nil 146 | } 147 | 148 | type topResults []*topResult 149 | 150 | func (r topResults) Len() int { 151 | return len(r) 152 | } 153 | 154 | func (r topResults) Less(i, j int) bool { 155 | return r[i].Score < r[j].Score 156 | } 157 | 158 | func (r topResults) Swap(i, j int) { 159 | r[i], r[j] = r[j], r[i] 160 | } 161 | 162 | func (r topResults) topK(k int) map[string]int { 163 | count := 0 164 | topk := make(map[string]int) 165 | for i := 1; i <= k; i++ { 166 | count = topk[r[len(r)-i].Category] 167 | topk[r[len(r)-i].Category] = count + 1 168 | } 169 | return topk 170 | } 171 | 172 | func (r topResults) query(k int) string { 173 | max := 0 174 | var category string 175 | topk := r.topK(int(math.Min(float64(k), float64(len(r))))) 176 | 177 | for cat, count := range topk { 178 | if count > max { 179 | max = count 180 | category = cat 181 | } 182 | } 183 | 184 | return category 185 | } 186 | 187 | type topResult struct { 188 | Score float64 189 | Category string 190 | } 191 | 192 | func (t *topResult) String() string { 193 | return fmt.Sprintf("%.2f", t.Score) 194 | } 195 | 196 | func asReader(text string) io.Reader { 197 | return bytes.NewBufferString(text) 198 | } 199 | -------------------------------------------------------------------------------- /knn/knn_test.go: -------------------------------------------------------------------------------- 1 | package knn 2 | 3 | import ( 4 | "fmt" 5 | "log" 6 | "os" 7 | "testing" 8 | 9 | "github.com/n3integration/classifier" 10 | ) 11 | 12 | func TestClassifier(t *testing.T) { 13 | knn := New( 14 | K(4), 15 | Similarity(EuclideanDistance), 16 | WeightScheme(classifier.TermFrequency), 17 | Tokenizer(classifier.NewTokenizer( 18 | classifier.Filters(classifier.IsNotStopWord, classifier.IsWord), 19 | classifier.SplitFunc(classifier.ScanAlphaWords), 20 | )), 21 | ) 22 | 23 | dataDir, err := os.ReadDir("testdata") 24 | if err != nil { 25 | log.Fatal(err) 26 | } 27 | 28 | for _, file := range dataDir { 29 | if file.IsDir() { 30 | dir := file 31 | files, rErr := os.ReadDir(fmt.Sprintf("testdata/%s", dir.Name())) 32 | if rErr != nil { 33 | log.Fatal(rErr) 34 | } 35 | for _, f := range files { 36 | if lErr := load(knn, dir.Name(), 
fmt.Sprintf("testdata/%s/%s", dir.Name(), f.Name())); lErr != nil { 37 | t.Fatal(lErr) 38 | } 39 | } 40 | } 41 | } 42 | 43 | testdata := []struct { 44 | Name string 45 | Headline string 46 | ExpectedCategory string 47 | }{ 48 | { 49 | Name: "Business Headline", 50 | Headline: `Small Businesses Keep Hiring as Fed Raises Rates to Cool Economy`, 51 | ExpectedCategory: "business", 52 | }, 53 | { 54 | Name: "Sports Headline", 55 | Headline: `How Eagles can win 2023 Super Bowl: Jalen Hurts, dominant offensive line pave the way for championship run`, 56 | ExpectedCategory: "sports", 57 | }, 58 | } 59 | 60 | for _, data := range testdata { 61 | category, err := knn.ClassifyString(data.Headline) 62 | if err != nil { 63 | t.Fatalf("failed to classify %s dataDir: %s", data.Name, err) 64 | } 65 | 66 | if category != data.ExpectedCategory { 67 | log.Println(knn.matrix) 68 | t.Fatalf("incorrectly classified %s; expected %s, but got %s", data.Name, data.ExpectedCategory, category) 69 | } 70 | } 71 | } 72 | 73 | func load(knn *Classifier, category, filename string) error { 74 | f, err := os.Open(filename) 75 | if err != nil { 76 | return fmt.Errorf("failed to load test data: %w", err) 77 | } 78 | defer f.Close() 79 | return knn.Train(f, category) 80 | } 81 | -------------------------------------------------------------------------------- /knn/matrix.go: -------------------------------------------------------------------------------- 1 | package knn 2 | 3 | import ( 4 | "fmt" 5 | "math" 6 | 7 | "github.com/n3integration/classifier" 8 | "github.com/n3integration/classifier/index" 9 | ) 10 | 11 | // sparse matrix implementation 12 | type sparse struct { 13 | ind []int 14 | val []float64 15 | ptr []int 16 | } 17 | 18 | // newSparseMatrix initializes an empty sparse matrix 19 | func newSparseMatrix() *sparse { 20 | return &sparse{ 21 | ind: make([]int, 0), 22 | val: make([]float64, 0), 23 | ptr: make([]int, 1), 24 | } 25 | } 26 | 27 | // Add a new row to the underlying matrix 28 | func (m *sparse) Add(index *index.TermIndex, weight classifier.WeightScheme, docWordFreq map[string]float64) { 29 | prev := len(m.ind) 30 | for term := range docWordFreq { 31 | m.ind = append(m.ind, index.IndexOf(term)) 32 | m.val = append(m.val, weight(term)) 33 | } 34 | 35 | cur := prev + len(docWordFreq) 36 | quickSort(m, prev, cur-1) 37 | m.ptr = append(m.ptr, cur) 38 | } 39 | 40 | // MakeRow creates and returns a new sparseRow without adding it to the underlying matrix 41 | func (m *sparse) MakeRow(index *index.TermIndex, weight classifier.WeightSchemeStrategy, wordFreq map[string]float64) *sparseRow { 42 | i := 0 43 | var idx int 44 | this := newSparseRow(len(wordFreq)) 45 | 46 | for term := range wordFreq { 47 | idx = index.IndexOf(term) 48 | if idx < 0 { 49 | idx = index.Add(term) 50 | } 51 | this.ind[i] = idx 52 | this.val[i] = weight(wordFreq)(term) 53 | i++ 54 | } 55 | 56 | quickSort(this, 0, len(wordFreq)-1) 57 | return this 58 | } 59 | 60 | // Rows returns an iterator over the matrix 61 | func (m *sparse) Rows() func() *sparseRow { 62 | i := 0 63 | r := &sparseRow{} 64 | 65 | return func() *sparseRow { 66 | if i == (len(m.ptr) - 1) { 67 | return nil 68 | } 69 | 70 | start := m.ptr[i] 71 | end := m.ptr[i+1] 72 | 73 | r.index = i 74 | r.ind = m.ind[start:end] 75 | r.val = m.val[start:end] 76 | i++ 77 | 78 | return r 79 | } 80 | } 81 | 82 | // Head returns the first 10 rows in the underlying matrix 83 | func (m *sparse) Head() []*sparseRow { 84 | iterator := m.Rows() 85 | count := int(math.Min(10, m.Size())) 86 | rows := 
make([]*sparseRow, count) 87 | 88 | for i := 0; i <= count; i++ { 89 | row := iterator() 90 | if row == nil { 91 | break 92 | } 93 | rows[i] = row 94 | } 95 | 96 | return rows 97 | } 98 | 99 | func (m *sparse) Shape() string { 100 | return fmt.Sprintf("%v x %v", len(m.ind), len(m.ptr)-1) 101 | } 102 | 103 | func (m *sparse) Size() float64 { 104 | return float64(len(m.ptr)) - 1 105 | } 106 | 107 | func (m *sparse) Partition(low int, high int) int { 108 | x := m.ind[high] 109 | i := low - 1 110 | 111 | for j := low; j <= high-1; j++ { 112 | if m.ind[j] <= x { 113 | i++ 114 | swap(&m.ind[i], &m.ind[j]) 115 | swap(&m.val[i], &m.val[j]) 116 | } 117 | } 118 | swap(&m.ind[i+1], &m.ind[high]) 119 | swap(&m.val[i+1], &m.val[high]) 120 | return i + 1 121 | } 122 | 123 | func (m *sparse) String() string { 124 | return fmt.Sprintf("%v\n%v\n%v", m.ind, m.val, m.ptr) 125 | } 126 | 127 | type sparseRow struct { 128 | ind []int 129 | val []float64 130 | index int 131 | } 132 | 133 | func newSparseRow(size int) *sparseRow { 134 | return &sparseRow{ 135 | ind: make([]int, size), 136 | val: make([]float64, size), 137 | } 138 | } 139 | 140 | // Column returns the feature and value at index i 141 | func (r *sparseRow) Column(i int) (int, float64) { 142 | return r.ind[i], r.val[i] 143 | } 144 | 145 | // Feature returns the feature at index i 146 | func (r *sparseRow) Feature(i int) int { 147 | return r.ind[i] 148 | } 149 | 150 | // Sum the sparseRow 151 | func (r *sparseRow) Sum() float64 { 152 | sum := 0.0 153 | for _, val := range r.val { 154 | sum += val 155 | } 156 | return sum 157 | } 158 | 159 | // Square the row 160 | func (r *sparseRow) Square() float64 { 161 | sum := 0.0 162 | for _, val := range r.val { 163 | sum += math.Pow(val, 2) 164 | } 165 | return sum 166 | } 167 | 168 | // L2Norm returns the euclidean distance 169 | func (r *sparseRow) L2Norm() float64 { 170 | return math.Sqrt(r.Square()) 171 | } 172 | 173 | // Dot returns the dot product 174 | func (r *sparseRow) Dot(other *sparseRow) float64 { 175 | sum := 0.0 176 | if r.Size() <= other.Size() { 177 | for i := 0; i < r.Len(); i++ { 178 | feature, val := r.Column(i) 179 | sum += val * other.Value(feature) 180 | } 181 | } else { 182 | for i := 0; i < other.Len(); i++ { 183 | feature, val := other.Column(i) 184 | sum += val * r.Value(feature) 185 | } 186 | } 187 | return sum 188 | } 189 | 190 | // Value returns the value of feature 191 | func (r *sparseRow) Value(feature int) float64 { 192 | i := search(r.ind, feature) 193 | if i >= 0 { 194 | return r.val[i] 195 | } 196 | return 0 197 | } 198 | 199 | // Values constructs a new sparse row from the provided features 200 | func (r *sparseRow) Values(features ...int) *sparseRow { 201 | other := newSparseRow(len(features)) 202 | for i := 0; i < len(features); i++ { 203 | other.ind[i] = features[i] 204 | other.val[i] = r.Value(features[i]) 205 | } 206 | return other 207 | } 208 | 209 | // Contains to check if row contains the provided feature 210 | func (r *sparseRow) Contains(feature int) bool { 211 | for _, val := range r.ind { 212 | if val == feature { 213 | return true 214 | } 215 | } 216 | return false 217 | } 218 | 219 | // Index returns the index pointer 220 | func (r *sparseRow) Index() int { 221 | return r.index 222 | } 223 | 224 | // Len returns the number of columns 225 | func (r *sparseRow) Len() int { 226 | return len(r.ind) 227 | } 228 | 229 | func (r *sparseRow) Less(i, j int) bool { 230 | return r.ind[i] < r.ind[j] 231 | } 232 | 233 | func (r *sparseRow) Swap(i, j int) { 234 | ind := 
r.ind[i] 235 | r.ind[i] = r.ind[j] 236 | r.ind[j] = ind 237 | 238 | val := r.val[i] 239 | r.val[i] = r.val[j] 240 | r.val[j] = val 241 | } 242 | 243 | func (r *sparseRow) Size() float64 { 244 | return float64(len(r.val)) 245 | } 246 | 247 | func (r *sparseRow) Partition(low int, high int) int { 248 | x := r.ind[high] 249 | i := low - 1 250 | 251 | for j := low; j <= high-1; j++ { 252 | if r.ind[j] <= x { 253 | i++ 254 | swap(&r.ind[i], &r.ind[j]) 255 | swap(&r.val[i], &r.val[j]) 256 | } 257 | } 258 | swap(&r.ind[i+1], &r.ind[high]) 259 | swap(&r.val[i+1], &r.val[high]) 260 | return i + 1 261 | } 262 | 263 | func (r *sparseRow) String() string { 264 | return fmt.Sprintf("%v\n%v", r.ind, r.val) 265 | } 266 | -------------------------------------------------------------------------------- /knn/similarity.go: -------------------------------------------------------------------------------- 1 | package knn 2 | 3 | import ( 4 | "math" 5 | ) 6 | 7 | // SimilarityScore provides pluggable support for row similarity 8 | type SimilarityScore func(left, right *sparseRow) float64 9 | 10 | // EuclideanDistance between rows 11 | func EuclideanDistance(left, right *sparseRow) float64 { 12 | distanceTo := func(left, right *sparseRow) float64 { 13 | score := 0.0 14 | terms := make(map[int]float64) 15 | for i := 0; i < left.Len(); i++ { 16 | term, val := left.Column(i) 17 | terms[term] = val 18 | score += math.Pow(val-right.Value(term), 2) 19 | } 20 | 21 | for i := 0; i < right.Len(); i++ { 22 | term, _ := right.Column(i) 23 | if _, ok := terms[term]; !ok { 24 | score += math.Pow(0-right.Value(term), 2) 25 | } 26 | } 27 | return 1 / (1 + math.Sqrt(score)) 28 | } 29 | 30 | if left.Len() >= right.Len() { 31 | return distanceTo(left, right) 32 | } 33 | return distanceTo(right, left) 34 | } 35 | 36 | // CosineSimilarity between rows 37 | func CosineSimilarity(left, right *sparseRow) float64 { 38 | return left.Dot(right) / (left.L2Norm() * right.L2Norm()) 39 | } 40 | 41 | // PearsonCorrelation between rows 42 | func PearsonCorrelation(left, right *sparseRow) float64 { 43 | score := func(left, right *sparseRow) float64 { 44 | n := left.Size() 45 | leftSum := left.Sum() 46 | rightSum := right.Sum() 47 | denom := math.Sqrt((left.Square() - math.Pow(leftSum, 2)/n) * (right.Square() - math.Pow(rightSum, 2)/n)) 48 | 49 | if denom == 0 { 50 | return 0 51 | } 52 | return (left.Dot(right) - ((leftSum * rightSum) / n)) / denom 53 | } 54 | 55 | similar := make([]int, 0) 56 | for i := 0; i < left.Len(); i++ { 57 | term, _ := left.Column(i) 58 | if right.Contains(term) { 59 | similar = append(similar, term) 60 | } 61 | } 62 | 63 | if len(similar) == 0 { 64 | return 0 65 | } 66 | return score(left.Values(similar...), right.Values(similar...)) 67 | } 68 | -------------------------------------------------------------------------------- /knn/similarity_test.go: -------------------------------------------------------------------------------- 1 | package knn 2 | 3 | import ( 4 | "math" 5 | "testing" 6 | ) 7 | 8 | func TestSimilarity(t *testing.T) { 9 | allowedVariance := .01 10 | row1 := newSparseRow(2) 11 | row1.ind = []int{0, 1} 12 | row1.val = []float64{2, -1} 13 | row2 := newSparseRow(2) 14 | row2.ind = []int{0, 1} 15 | row2.val = []float64{-2, 1} 16 | 17 | t.Run("Euclidean Distance", func(t *testing.T) { 18 | expected := 0.18 19 | actual := EuclideanDistance(row1, row2) 20 | assertEquivalent(t, expected, actual, allowedVariance) 21 | 22 | if actual := EuclideanDistance(row1, row1); actual != 1 { 23 | t.Fatalf("expected 
identical row to equal one. got %.2f", actual) 24 | } 25 | }) 26 | 27 | t.Run("Pearson Correlation", func(t *testing.T) { 28 | if actual := PearsonCorrelation(row1, row1); actual != 1 { 29 | t.Fatalf("expected strong positive correlation. got %.2f", actual) 30 | } 31 | 32 | if actual := PearsonCorrelation(row1, row2); actual != -1 { 33 | t.Fatalf("expected strong inverse correlation. got %.2f", actual) 34 | } 35 | 36 | row3 := newSparseRow(2) 37 | row3.ind = []int{2, 3} 38 | row3.val = []float64{4, 5} 39 | if actual := PearsonCorrelation(row1, row3); actual != 0 { 40 | t.Fatalf("expected dissimilar rows to equal zero; got %.2f", actual) 41 | } 42 | }) 43 | 44 | t.Run("Cosine Similarity", func(t *testing.T) { 45 | assertEquivalent(t, CosineSimilarity(row1, row1), 1.0, allowedVariance) 46 | assertEquivalent(t, CosineSimilarity(row1, row2), -1.0, allowedVariance) 47 | }) 48 | } 49 | 50 | func assertEquivalent(t *testing.T, actual, expected, threshold float64) { 51 | if math.Abs(actual-expected) > threshold { 52 | t.Fatalf("expected %.2f to be equivalent to %.2f within +/- %.2f", actual, expected, threshold) 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /knn/sort.go: -------------------------------------------------------------------------------- 1 | package knn 2 | 3 | // Partitioning indicates that a type can be partitioned and reordered 4 | type Partitioning interface { 5 | // Partition between low and high elements 6 | Partition(low, high int) int 7 | } 8 | 9 | func search(values []int, v int) int { 10 | low := 0 11 | high := len(values) - 1 12 | for low <= high { 13 | mid := (low + high) / 2 14 | if v == values[mid] { 15 | return mid 16 | } else if v > values[mid] { 17 | low = mid + 1 18 | } else { 19 | high = mid - 1 20 | } 21 | } 22 | return -1 23 | } 24 | 25 | func quickSort(m Partitioning, low int, high int) { 26 | stack := make(Stack, 0) 27 | 28 | stack.push(low) 29 | stack.push(high) 30 | for stack.len() > 0 { 31 | high = stack.pop() 32 | low = stack.pop() 33 | 34 | pivot := m.Partition(low, high) 35 | if pivot-1 > low { 36 | stack.push(low) 37 | stack.push(pivot - 1) 38 | } 39 | 40 | if pivot+1 < high { 41 | stack.push(pivot + 1) 42 | stack.push(high) 43 | } 44 | } 45 | } 46 | 47 | func swap[V int | float64](a, b *V) { 48 | t := *a 49 | *a = *b 50 | *b = t 51 | } 52 | 53 | type Stack []int 54 | 55 | func (s *Stack) push(v int) { 56 | *s = append(*s, v) 57 | } 58 | 59 | func (s *Stack) pop() int { 60 | v := (*s)[len(*s)-1] 61 | (*s)[len(*s)-1] = 0 62 | *s = (*s)[:len(*s)-1] 63 | return v 64 | } 65 | 66 | func (s *Stack) len() int { 67 | return len(*s) 68 | } 69 | -------------------------------------------------------------------------------- /knn/testdata/README.md: -------------------------------------------------------------------------------- 1 | ## Test Data Set 2 | 3 | Data extracted from [Kaggle](https://www.kaggle.com/datasets/jensenbaxter/10dataset-text-document-classification) 4 | -------------------------------------------------------------------------------- /knn/testdata/business/1.txt: -------------------------------------------------------------------------------- 1 | Winn-Dixie files for bankruptcy 2 | 3 | US supermarket group Winn-Dixie has filed for bankruptcy protection after succumbing to stiff competition in a market dominated by Wal-Mart. 4 | 5 | Winn-Dixie, once among the most profitable of US grocers, said Chapter 11 protection would enable it to successfully restructure.
It said its 920 stores would remain open, but analysts said it would most likely off-load a number of sites. The Jacksonville, Florida-based firm has total debts of $1.87bn (£980m). In its bankruptcy petition it listed its biggest creditor as US foods giant Kraft Foods, which it owes $15.1m. 6 | 7 | Analysts say Winn-Dixie had not kept up with consumers' demands and had also been burdened by a number of stores in need of upgrading. A 10-month restructuring plan was deemed a failure, and following a larger-than-expected quarterly loss earlier this month, Winn-Dixie's slide into bankruptcy was widely expected. The company's new chief executive Peter Lynch said Winn-Dixie would use the Chapter 11 breathing space to take the necessary action to turn itself around. "This includes achieving significant cost reductions, improving the merchandising and customer service in all locations and generating a sense of excitement in the stores," he said. Yet Evan Mann, a senior bond analyst at Gimme Credit, said Mr Lynch's job would not be easy, as the bankruptcy would inevitably put off some customers. "The real big issue is what's going to happen over the next one or two quarters now that they are in bankruptcy and all their customers see this in their local newspapers," he said. 8 | -------------------------------------------------------------------------------- /knn/testdata/business/2.txt: -------------------------------------------------------------------------------- 1 | Japanese growth grinds to a halt 2 | 3 | Growth in Japan evaporated in the three months to September, sparking renewed concern about an economy not long out of a decade-long trough. 4 | 5 | Output in the period grew just 0.1%, an annual rate of 0.3%. Exports - the usual engine of recovery - faltered, while domestic demand stayed subdued and corporate investment also fell short. The growth falls well short of expectations, but does mark a sixth straight quarter of expansion. 6 | 7 | The economy had stagnated throughout the 1990s, experiencing only brief spurts of expansion amid long periods in the doldrums. One result was deflation - prices falling rather than rising - which made Japanese shoppers cautious and kept them from spending. 8 | 9 | The effect was to leave the economy more dependent than ever on exports for its recent recovery. But high oil prices have knocked 0.2% off the growth rate, while the falling dollar means products shipped to the US are becoming relatively more expensive. 10 | 11 | The performance for the third quarter marks a sharp downturn from earlier in the year. The first quarter showed annual growth of 6.3%, with the second showing 1.1%, and economists had been predicting as much as 2% this time around. "Exports slowed while capital spending became weaker," said Hiromichi Shirakawa, chief economist at UBS Securities in Tokyo. "Personal consumption looks good, but it was mainly due to temporary factors such as the Olympics. "The amber light is flashing." The government may now find it more difficult to raise taxes, a policy it will have to implement when the economy picks up to help deal with Japan's massive public debt. 12 | -------------------------------------------------------------------------------- /knn/testdata/business/3.txt: -------------------------------------------------------------------------------- 1 | WorldCom director admits lying 2 | 3 | The former chief financial officer at US telecoms firm WorldCom has admitted before a New York court that he used to lie to fellow board members. 
4 | 5 | Speaking at the trial of his former boss Bernard Ebbers, Scott Sullivan said he lied to the board to cover up the hole in WorldCom's finances. Mr Ebbers is on trial for fraud and conspiracy in relation to WorldCom's collapse in 2002. He pleads not guilty. The firm had been overstating its accounts by $11bn (£8.5bn). Mr Sullivan, 42, has already pleaded guilty to fraud and will be sentenced following Mr Ebbers' trial, where he is appearing as a prosecution witness. Mr Ebbers, 63, has always insisted that he was unaware of any hidden shortfalls in WorldCom's finances. 6 | 7 | In the New York court on Wednesday, Mr Ebbers' lawyer Reid Weingarten asked Mr Sullivan: "If you believe something is in your interest, you are willing and able to lie to accomplish it, isn't that right?" 8 | 9 | "On that date, yes. I was lying," replied Mr Sullivan. Mr Weingarten has suggested that Mr Sullivan is implicating Mr Ebbers only to win a lighter sentence, something Mr Sullivan denies. Mr Sullivan also rejects a suggestion that he had once told fellow WorldCom board member Bert Roberts that Mr Ebbers was unaware of the accounting fraud at WorldCom. The trial of Mr Ebbers is now into its third week. 10 | 11 | Under 23 hours of questioning from a federal prosecutor, Mr Sullivan has previously told the court that he repeatedly warned Mr Ebbers that falsifying the books would be the only way to meet Wall Street revenue and earnings expectations. Mr Sullivan claims that Mr Ebbers refused to stop the fraud. Mr Ebbers could face a sentence of 85 years if convicted of all the charges he is facing. WorldCom's problems appear to have begun with the collapse of the dotcom boom which cut its business from internet companies. Prosecutors allege that the company's top executives responded by orchestrating massive fraud over a two-year period. WorldCom emerged from bankruptcy protection in 2004, and is now known as MCI. 12 | -------------------------------------------------------------------------------- /knn/testdata/business/4.txt: -------------------------------------------------------------------------------- 1 | Bank voted 8-1 for no rate change 2 | 3 | The decision to keep interest rates on hold at 4.75% earlier this month was passed 8-1 by the Bank of England's rate-setting body, minutes have shown. 4 | 5 | One member of the Bank's Monetary Policy Committee (MPC) - Paul Tucker - voted to raise rates to 5%. The news surprised some analysts who had expected the latest minutes to show another unanimous decision. Worries over growth rates and consumer spending were behind the decision to freeze rates, the minutes showed. The Bank's latest inflation report, released last week, had noted that the main reason inflation might fall was weaker consumer spending. 6 | 7 | However, MPC member Paul Tucker voted for a quarter point rise in interest rates to 5%. He argued that economic growth was picking up, and that the equity, credit and housing markets had been stronger than expected. 8 | 9 | The Bank's minutes said that risks to the inflation forecast were "sufficiently to the downside" to keep rates on hold at its latest meeting. However, the minutes added: "Some members noted that an increase might be warranted in due course if the economy evolved in line with the central projection". Ross Walker, UK economist at Royal Bank of Scotland, said he was surprised that a dissenting vote had been made so soon. He said the minutes appeared to be "trying to get the market to focus on the possibility of a rise in rates". 
"If the economy pans out as they expect then they are probably going to have to hike rates." However, he added, any rate increase is not likely to happen until later this year, with MPC members likely to look for a more sustainable pick up in consumer spending before acting. 10 | -------------------------------------------------------------------------------- /knn/testdata/business/5.txt: -------------------------------------------------------------------------------- 1 | Brewers' profits lose their fizz 2 | 3 | Heineken and Carlsberg, two of the world's largest brewers, have reported falling profits after beer sales in western Europe fell flat. 4 | 5 | Dutch firm Heineken saw its annual profits drop 33% and warned that earnings in 2005 may also slide. Danish brewer Carlsberg suffered a 3% fall in profits due to waning demand and increased marketing costs. Both are looking to Russia and China to provide future growth as western European markets are largely mature. 6 | 7 | Heineken's net income fell to 537m euros ($701m; £371m) during 2004, from 798m euro a year ago. It blamed weak demand in western Europe and currency losses. It had warned in September that the weakening US dollar, which has cut the value of foreign sales, would knock 125m euros off its operating profits. Despite the dip in profits, Heineken's sales have been improving and total revenue for the year was 10bn euros, up 8.1% from 9.26bn euros in 2003. Heineken said it now plans to invest 100m euros in "aggressive" and "high-impact" marketing in Europe and the US in 2005. Heineken, which also owns the Amstel and Murphy's stout brands, said it would also seek to cut costs. This may involve closing down breweries. 8 | 9 | Heineken increased its dividend payment by 25% to 40 euro cents, but warned that the continued impact of a weaker dollar and an increased marketing spend may lead to a drop in 2005 net profit. 10 | 11 | Carlsberg, the world's fifth-largest brewer, saw annual pre-tax profits fall to 3.4bn Danish kroner (456m euros). Its beer sales have been affected by the sluggish European economy and by the banning of smoking in pubs in several European countries. Nevertheless, total sales increased 4% to 36bn kroner, thanks to strong sales of Carlsberg lager in Russia and Poland. Carlsberg is more optimistic than Heineken about 2005, projecting a 15% rise in net profits for the year. However, it also plans to cut 200 jobs in Sweden, where sales have been hit by demand for cheap, imported brands. "We remain cautious about the medium-to-long term outlook for revenue growth across western Europe for a host of economic, social and structural reasons," investment bank Merrill Lynch said of Carlsberg. 12 | -------------------------------------------------------------------------------- /knn/testdata/business/6.txt: -------------------------------------------------------------------------------- 1 | Saab to build Cadillacs in Sweden 2 | 3 | General Motors, the world's largest car maker, has confirmed that it will build a new medium-sized Cadillac BLS at its loss-making Saab factory in Sweden. 4 | 5 | The car, unveiled at the Geneva motor show, is intended to compete in the medium-sized luxury car market. It will not be sold in the US, said GM Europe president Carl-Peter Forster. As part of its efforts to make the US marque appeal to European drivers, the car will be the first Cadillac with a diesel engine. 6 | 7 | GM's announcement should go some way to allay fears of the Saab factory's closure. 
The factory in Trollhaettan has been at the centre of rumours about GM's planned severe cutbacks in its troubled European operations. But the group's new commitment to the Swedish factory may not be welcomed by the group's Opel workers in Ruesselsheim, Germany. They may now have to face a larger proportion of GM's cuts. 8 | 9 | Neither will the announcement be seen as unalloyed good news in Sweden, since it reflects Saab's failure to make significant inroads into the lucrative European luxury car market. For years, Saab has consistently said it is competing head-on with BMW, Mercedes and Jaguar. The segment's leaders do not agree. 10 | 11 | GM's plans to build the American marque in Sweden is part of its efforts to push it as an alternative luxury brand for European drivers. In the US, it has long been established as an upmarket brand - even the presidential limousine carries the badge. Yet it could prove tough for Cadillac to steal market share from the majors in Europe. Other luxury car makers, most notably the Toyota subsidiary Lexus, have enjoyed tremendous success in the US without managing to make significant inroads in Europe. There, German marques Mercedes Benz and BMW have retained their stranglehold on the luxury market. 12 | 13 | Bringing Cadillac production to Sweden should help introduce desperately-needed scale to the Saab factory, which currently produces fewer than 130,000 cars per year. That is about half of what major car makers consider sufficient numbers for profitable operations, and Saab is losing money fast - albeit with losses halved in 2004 to $200m (£104m; 151m euros) from $500m the previous year. Beyond the 12,000 job cuts announced last year at its European operations, GM is reducing expenditure by building Saabs, Opels - badged as Vauxhalls in the UK - and now Cadillacs on the same framework, and by allowing the different brands to share parts. Another way to further reduce Saab's losses could be to shift some of the production of Saabs to the US, a market where drivers have adopted it as an upmarket European car. Doing so would remove the exposure to the weak US dollar, which is making Saabs more expensive to US consumers. But not everyone in the industry agree that it would be the best way forward. "We know that in five years the US dollar will be stronger than it is today," the chief executive of a leading European car maker told BBC News. The current trend towards US production was "stupid", he said. 14 | 15 | In a separate announcement, GM unveiled a new scheme to allow European consumers the chance to test drive its Opel and Vauxhall models. It is to deploy a fleet of 35,000 test cars across 40 countries, inviting potential buyers to try out a vehicle for 24-hours. It follows a similar initiative by GM in the US. GM said it wanted to change "customers' perceptions" about Opel and Vauxhall cars, showing them that the quality had improved in recent years. 16 | -------------------------------------------------------------------------------- /knn/testdata/sports/1.txt: -------------------------------------------------------------------------------- 1 | Claxton hunting first major medal 2 | 3 | British hurdler Sarah Claxton is confident she can win her first major medal at next month's European Indoor Championships in Madrid. 4 | 5 | The 25-year-old has already smashed the British record over 60m hurdles twice this season, setting a new mark of 7.96 seconds to win the AAAs title. "I am quite confident," said Claxton. "But I take each race as it comes. 
"As long as I keep up my training but not do too much I think there is a chance of a medal." Claxton has won the national 60m hurdles title for the past three years but has struggled to translate her domestic success to the international stage. Now, the Scotland-born athlete owns the equal fifth-fastest time in the world this year. And at last week's Birmingham Grand Prix, Claxton left European medal favourite Russian Irina Shevchenko trailing in sixth spot. 6 | 7 | For the first time, Claxton has only been preparing for a campaign over the hurdles - which could explain her leap in form. In previous seasons, the 25-year-old also contested the long jump but since moving from Colchester to London she has re-focused her attentions. Claxton will see if her new training regime pays dividends at the European Indoors which take place on 5-6 March. 8 | -------------------------------------------------------------------------------- /knn/testdata/sports/2.txt: -------------------------------------------------------------------------------- 1 | Greene sets sights on world title 2 | 3 | Maurice Greene aims to wipe out the pain of losing his Olympic 100m title in Athens by winning a fourth World Championship crown this summer. 4 | 5 | He had to settle for bronze in Greece behind fellow American Justin Gatlin and Francis Obikwelu of Portugal. "It really hurts to look at that medal. It was my mistake. I lost because of the things I did," said Greene, who races in Birmingham on Friday. "It's never going to happen again. My goal - I'm going to win the worlds." Greene crossed the line just 0.02 seconds behind Gatlin, who won in 9.87 seconds in one of the closest and fastest sprints of all time. But Greene believes he lost the race and his title in the semi-finals. "In my semi-final race, I should have won the race but I was conserving energy. "That's when Francis Obikwelu came up and I took third because I didn't know he was there. "I believe that's what put me in lane seven in the final and, while I was in lane seven, I couldn't feel anything in the race. 6 | 7 | "I just felt like I was running all alone. "I believe if I was in the middle of the race I would have been able to react to people that came ahead of me." Greene was also denied Olympic gold in the 4x100m men's relay when he could not catch Britain's Mark Lewis-Francis on the final leg. The Kansas star is set to go head-to-head with Lewis-Francis again at Friday's Norwich Union Grand Prix. The pair contest the 60m, the distance over which Greene currently holds the world record of 6.39 seconds. He then has another indoor meeting in France before resuming training for the outdoor season and the task of recapturing his world title in Helsinki in August. Greene believes Gatlin will again prove the biggest threat to his ambitions in Finland. But he also admits he faces more than one rival for the world crown. "There's always someone else coming. I think when I was coming up I would say there was me and Ato (Boldon) in the young crowd," Greene said. "Now you've got about five or six young guys coming up at the same time." 8 | -------------------------------------------------------------------------------- /knn/testdata/sports/3.txt: -------------------------------------------------------------------------------- 1 | Edwards tips Idowu for Euro gold 2 | 3 | World outdoor triple jump record holder and BBC pundit Jonathan Edwards believes Phillips Idowu can take gold at the European Indoor Championships. 
4 | 5 | Idowu landed 17.30m at the British trials in Sheffield last month to lead the world triple jump rankings. "It's all down to him, but if he jumps as well as he did in Sheffield he could win the gold medal," said Edwards. "His ability is undoubted but all his best performances seem to happen in domestic meetings." 6 | 7 | Idowu made his breakthrough five years ago but so far has only a Commonwealth silver medal to his name. Edwards himself kept Idowu off top spot at the Manchester Games. But he believes the European Indoors in Madrid represent a chance for the 26-year-old to prove his credentials as Britain's top triple jumper. "He has to start producing at international level and here is the beginning," said Edwards. "Phillips still needs to be much more consistent. I'm sure a victory in Madrid will build up his confidence and self-belief that he can be best in the world." The qualifying round of the men's triple jump in Madrid takes place on Friday with the final scheduled for Saturday. Olympic champion Christian Olsson will not be taking part as he is out for the entire indoor season with an ankle injury. 8 | -------------------------------------------------------------------------------- /knn/testdata/sports/4.txt: -------------------------------------------------------------------------------- 1 | Mido makes third apology 2 | 3 | Ahmed 'Mido' Hossam has made another apology to the Egyptian people in an attempt to rejoin the national team. 4 | 5 | The 21-year-old told a news conference in Cairo on Sunday that he is sorry for the problems that have led to his exclusion from the Pharaohs since July last year. Mido said: "There isn't much I have to say today, all there is to say is that I came specially from England to Egypt to rejoin the national team and to apologise for all my mistakes." Mido was axed by former coach Marco Tardelli after failing to answer a national call-up, claiming he had a groin injury. But he then played in a friendly for his club AS Roma within 24 hours of a World Cup qualifying match at home to Cameroon last September. Mido added: "It's not my right to give orders and say when I want to play ... at the same time I will always make sure that I put the national's team's matches as my top priority. "I feel that the national players are playing with a new spirit as I saw them play against Belgium (Egypt won 4-0 on Wednesday) and I simply want to add to their success. "I do confess that I was rude to the Egyptian press at times but now I have gained more experience and know that I will never go anywhere without the press's support. "Many of the international stars like David Beckham and (Zinedine) Zidane had the press opposing them. "So I'm now used to the fact that the press can be against me at times and I don't have to overreact when this happens. Meanwhile, Egypt FA spokesman Methat Shalaby welcomed the apology and said no one had exerted pressure on Mido to apologise. "Mido's apology today does not negatively affect Mido in anyway, on the contrary it makes him a bigger star and a role model for all football players," Shalaby said. Shalaby earlier said that after an apology Mido would be available for the national side if coach Hassan Shehata chose him. Mido joined Tottenham in an 18-month loan deal near the end of the January transfer window, scoring twice on his debut against Portsmouth. 
6 | -------------------------------------------------------------------------------- /knn/testdata/sports/5.txt: -------------------------------------------------------------------------------- 1 | McIlroy aiming for Madrid title 2 | 3 | Northern Ireland man James McIlroy is confident he can win his first major title at this weekend's Spar European Indoor Championships in Madrid. 4 | 5 | The 28-year-old has been in great form in recent weeks and will go in as one of the 800 metres favourites. "I believe after my wins abroad and in our trial race in Sheffield, I can run my race from the front, back or middle," said McIlroy. New coach Tony Lester has helped get McIlroy's career back on track. The 28-year-old 800 metres runner has not always matched his promise with performances but believes his decision to change coaches and move base will bring the rewards. McIlroy now lives in Windsor and feels his career has been transformed by the no-nonsense leadership style of former Army sergeant Lester. Lester is better known for his work with 400m runners Roger Black and Mark Richardson in the past but under his guidance McIlroy has secured five wins this indoor season. 6 | 7 | McIlroy now claims he is in his best shape since finishing fourth for Ireland at the outdoor European Championships in 1998. "That was my last decent year," said McIlroy, who temporarily retired last August before returning to the sport under Lester's shrewd guidance. "Before, every race was like trying to climb Mount Everest and I now know you can't do it on your own. "Trying to succeed saw me sometimes standing half-dead and terrified on the starting line, which became a bit too much." McIlroy, who was compared to the likes of Sebastian Coe, Steve Cram and Steve Ovett in his younger days, is now competing without the benefit of National Lottery funding. That situation could change if he maintains his current form and repeats the world-class times he produced in the 800m and 1000m at major races in Erfurt and Stuttgart earlier this season. Russian Dmitriy Bogdanov won at the same Madrid venue last week and then claimed the European Championship race would be between himself, Dutchman Arnoud Okken and Antonio Reina of Spain but McIlroy is unfazed. 8 | 9 | He admitted: "He looked quite good in his win and fair enough everyone has the right to their own opinion. "I never write myself off and let's face it, I haven't or looked like being beaten this season." And McIlroy, whose time of one minute 46.68seconds in Erfurt elevated him to sixth place on the UK All-Time list, is also already looking beyond Madrid. He said: "I've been much more focused this year about my career and having such a good team around me has been very important. "Ultimately of course, this weekend is a means to an end and that is getting prepared for the summer's world championships. "That ambition has meant that I've had only two nights out since last August. The rest of my time has seen me just concentrating on rebuilding my career." 10 | -------------------------------------------------------------------------------- /knn/testdata/sports/6.txt: -------------------------------------------------------------------------------- 1 | 2004: An Irish Athletics Year 2 | 3 | 2004 won't be remembered as one of Irish athletics' great years. 4 | 5 | The year began with that optimism which invariably and unaccountably, seems to herald an upcoming Olympiad. 
But come late August, a few hot days in the magnificent stadium in Athens told us of the true strength of Irish athletics - or to be more accurate, the lack of it. Sonia O'Sullivan's Olympic farewell apart, there was little to stir the emotions of Irish athletics watchers. But after the disastrous build-up to the games, we shouldn't have been surprised. At the start of the year, an O'Sullivan had been earmarked as Ireland's best medal prospect but as it turned out, walker Gillian never even made it to the start line because of injury. Less than a week before the Olympics, the sport was rocked by news that 10,000m hope Cathal Lombard had tested for the banned substance EPO. Lombard's shattering of Mark Carroll's national 10,000m record in April had already set tongues wagging but even the most cynical of observers, were surprised when he was rumbled after an Irish Sports Council sting operation. The Corkman quickly held his hands up in admission and was promptly handed a two-year ban from the sport. 6 | 7 | Back at pre-Olympic ranch in Greece, it must have seemed that things couldn't have got any worse but they very nearly did with walker Jamie Costin lucky to escape with his life after being involved in a car crash near Athens. Once the track and field action began in Athens, a familiar pattern of underachievement emerged although Alistair Cragg's performance in being the only athlete from a European nation to qualify for the 5,000m final did offer hope for the future. Our beloved Sonia scraped into the women's 5K final as a fastest loser and for a couple of days, the country attempted to delude itself into believing that she might be in the medal shake-up. As it happened, she went out the back door early in the final although there was nothing undignified about the way that she insisted on finishing the race over a minute behind winner Meseret Defar. It later transpired that Sonia had been suffering from a stomach bug in the 48 hours before the final although typically, the Cobhwoman played down the effects of the illness. Amazingly, she was back in action a couple of weeks later when beating a world-class field at the Flora Lite 5K road race in London and while her major championship days may be over, it's unlikely that we have seen the last of her in competition. At least Sonia managed to make it to Athens. At the start of the year, several Northern Ireland athletes had genuine hopes of qualifying for the Games but come August, an out-of-form and injured Paul Brizzel was the lone standard bearer for the province. The Ballymena man gave it a lash but his achilles problem, and a bad lane draw, meant a time of 21.00 and an early exit. 8 | 9 | James McIlroy, Gareth Turnbull, Zoe Brown and Paul McKee all had to be content with watching the Athens action on their television screens. 800m hope McIlroy never got near his best during the summer and a fourth place in the British trials effectively ended his hopes of making the plane. The injury-plagued Turnbull gamely travelled round Europe in search of the 1500m qualifying mark but 3:39 was the best he could achieve, after missing several months training during the previous winter. A lingering hamstring probem and a virus wrecked McKee's Athens ambitions and both he and Turnbull deserve a slice of better fortune in 2005. Pole vaulter Brown had hoped for a vote of confidence from the British selectors after she had achieved the Athens B standard but the call never came. 
As the summer ended, stalwarts Catherina McKiernan and Dermot Donnelly hung up their competitive spikes. McKiernan had to candidly acknowledge that time had crept up on her after several injury-ravaged years. Donnelly and his Annadale Striders team-mates later suffered tragedy when their friend and clubman Andy Campbell was found dead at his home on 18 December. A large turnout of athletics-loving folk turned out in west Belfast to offer their respects to the Campbell family and Andy's many friends. As only death can, it put the year's athletics happenings in a sharp perspective. 10 | -------------------------------------------------------------------------------- /naive/naive.go: -------------------------------------------------------------------------------- 1 | package naive 2 | 3 | import ( 4 | "bytes" 5 | "errors" 6 | "io" 7 | "sync" 8 | 9 | "github.com/n3integration/classifier" 10 | ) 11 | 12 | // ErrNotClassified indicates that a document could not be classified 13 | var ErrNotClassified = errors.New("unable to classify document") 14 | 15 | // Option provides a functional setting for the Classifier 16 | type Option func(c *Classifier) error 17 | 18 | // Classifier implements a naive bayes classifier 19 | type Classifier struct { 20 | feat2cat map[string]map[string]int 21 | catCount map[string]int 22 | tokenizer classifier.Tokenizer 23 | mu sync.RWMutex 24 | } 25 | 26 | // New initializes a new naive Classifier using the standard tokenizer 27 | func New(opts ...Option) *Classifier { 28 | c := &Classifier{ 29 | feat2cat: make(map[string]map[string]int), 30 | catCount: make(map[string]int), 31 | tokenizer: classifier.NewTokenizer(), 32 | } 33 | for _, opt := range opts { 34 | opt(c) 35 | } 36 | return c 37 | } 38 | 39 | // Tokenizer overrides the classifier's default Tokenizer 40 | func Tokenizer(t classifier.Tokenizer) Option { 41 | return func(c *Classifier) error { 42 | c.tokenizer = t 43 | return nil 44 | } 45 | } 46 | 47 | // Train provides supervisory training to the classifier 48 | func (c *Classifier) Train(r io.Reader, category string) error { 49 | c.mu.Lock() 50 | defer c.mu.Unlock() 51 | 52 | for feature := range c.tokenizer.Tokenize(r) { 53 | c.addFeature(feature, category) 54 | } 55 | 56 | c.addCategory(category) 57 | return nil 58 | } 59 | 60 | // TrainString provides supervisory training to the classifier 61 | func (c *Classifier) TrainString(doc string, category string) error { 62 | return c.Train(asReader(doc), category) 63 | } 64 | 65 | // Classify attempts to classify a document. If the document cannot be classified 66 | // (eg. because the classifier has not been trained), an error is returned. 
67 | func (c *Classifier) Classify(r io.Reader) (string, error) { 68 | max := 0.0 69 | var err error 70 | classification := "" 71 | probabilities := make(map[string]float64) 72 | 73 | c.mu.RLock() 74 | defer c.mu.RUnlock() 75 | 76 | for _, category := range c.categories() { 77 | probabilities[category] = c.probability(r, category) 78 | if probabilities[category] > max { 79 | max = probabilities[category] 80 | classification = category 81 | } 82 | } 83 | 84 | if classification == "" { 85 | return "", ErrNotClassified 86 | } 87 | return classification, err 88 | } 89 | 90 | // Probabilities runs the provided string through the model and returns 91 | // the potential probability for each classification 92 | func (c *Classifier) Probabilities(str string) (map[string]float64, string) { 93 | probabilities := make(map[string]float64) 94 | 95 | c.mu.RLock() 96 | defer c.mu.RUnlock() 97 | 98 | best := 0.0 99 | cat := `` 100 | 101 | for _, category := range c.categories() { 102 | prob := c.probability(asReader(str), category) 103 | if prob > 0 { 104 | probabilities[category] = prob 105 | } 106 | if prob > best { 107 | best = prob 108 | cat = category 109 | } 110 | } 111 | 112 | return probabilities, cat 113 | } 114 | 115 | // ClassifyString provides convenience classification for strings 116 | func (c *Classifier) ClassifyString(doc string) (string, error) { 117 | return c.Classify(asReader(doc)) 118 | } 119 | 120 | func (c *Classifier) addFeature(feature string, category string) { 121 | if _, ok := c.feat2cat[feature]; !ok { 122 | c.feat2cat[feature] = make(map[string]int) 123 | } 124 | c.feat2cat[feature][category]++ 125 | } 126 | 127 | func (c *Classifier) featureCount(feature string, category string) float64 { 128 | if _, ok := c.feat2cat[feature]; ok { 129 | return float64(c.feat2cat[feature][category]) 130 | } 131 | return 0.0 132 | } 133 | 134 | func (c *Classifier) addCategory(category string) { 135 | c.catCount[category]++ 136 | } 137 | 138 | func (c *Classifier) categoryCount(category string) float64 { 139 | if _, ok := c.catCount[category]; ok { 140 | return float64(c.catCount[category]) 141 | } 142 | return 0.0 143 | } 144 | 145 | func (c *Classifier) count() int { 146 | sum := 0 147 | for _, value := range c.catCount { 148 | sum += value 149 | } 150 | return sum 151 | } 152 | 153 | func (c *Classifier) categories() []string { 154 | var keys []string 155 | for k := range c.catCount { 156 | keys = append(keys, k) 157 | } 158 | return keys 159 | } 160 | 161 | func (c *Classifier) featureProbability(feature string, category string) float64 { 162 | if c.categoryCount(category) == 0 { 163 | return 0.0 164 | } 165 | return c.featureCount(feature, category) / c.categoryCount(category) 166 | } 167 | 168 | func (c *Classifier) weightedProbability(feature string, category string) float64 { 169 | return c.variableWeightedProbability(feature, category, 1.0, 0.5) 170 | } 171 | 172 | func (c *Classifier) variableWeightedProbability(feature string, category string, weight float64, assumedProb float64) float64 { 173 | sum := 0.0 174 | probability := c.featureProbability(feature, category) 175 | for _, category := range c.categories() { 176 | sum += c.featureCount(feature, category) 177 | } 178 | return ((weight * assumedProb) + (sum * probability)) / (weight + sum) 179 | } 180 | 181 | func (c *Classifier) probability(r io.Reader, category string) float64 { 182 | categoryProbability := c.categoryCount(category) / float64(c.count()) 183 | docProbability := c.docProbability(r, category) 184 | return 
docProbability * categoryProbability 185 | } 186 | 187 | func (c *Classifier) docProbability(r io.Reader, category string) float64 { 188 | probability := 1.0 189 | for feature := range c.tokenizer.Tokenize(r) { 190 | probability *= c.weightedProbability(feature, category) 191 | } 192 | return probability 193 | } 194 | 195 | func asReader(text string) io.Reader { 196 | return bytes.NewBufferString(text) 197 | } 198 | -------------------------------------------------------------------------------- /naive/naive_test.go: -------------------------------------------------------------------------------- 1 | package naive 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | var ( 8 | ham = "The quick brown fox jumps over the lazy dog" 9 | spam = "Earn cash quick online" 10 | ) 11 | 12 | func TestProbability(t *testing.T) { 13 | classifier := New() 14 | 15 | t.Run(`Probabilities`, func(t *testing.T) { 16 | for z := 0; z < 1; z++ { 17 | classifier.TrainString(`aaa bbb ccc ddd`, "A") 18 | classifier.TrainString(`111 222 333 444 zzz`, "X") 19 | classifier.TrainString(`bbb ccc ddd eee`, "A") 20 | classifier.TrainString(`222 333 444 555 zzz`, "X") 21 | classifier.TrainString(`bbb ccc ddd eee fff`, "A") 22 | classifier.TrainString(`222 333 444 555 666 zzz`, "X") 23 | } 24 | 25 | if m, _ := classifier.Probabilities(`bbb ccc ddd`); m[`A`] <= m[`X`] { 26 | t.Errorf(`A=%.2f value should be greater than X=%.2f`, m[`X`], m[`A`]) 27 | } 28 | 29 | if m, _ := classifier.Probabilities(`222 333 zzz`); m[`X`] <= m[`A`] { 30 | t.Errorf(`X=%.2f value should be greater than A=%.2f`, m[`X`], m[`A`]) 31 | } 32 | }) 33 | } 34 | func TestAddFeature(t *testing.T) { 35 | classifier := New() 36 | classifier.addFeature("quick", "good") 37 | assertFeatureCount(t, classifier, "quick", "good", 1.0) 38 | assertFeatureCount(t, classifier, "quick", "bad", 0.0) 39 | classifier.addFeature("quick", "bad") 40 | assertFeatureCount(t, classifier, "quick", "bad", 1.0) 41 | } 42 | 43 | func TestAddCategory(t *testing.T) { 44 | classifier := New() 45 | 46 | assertCategoryCount(t, classifier, "good", 0.0) 47 | classifier.addCategory("good") 48 | assertCategoryCount(t, classifier, "good", 1.0) 49 | categories := classifier.categories() 50 | 51 | assertEqual(t, float64(classifier.count()), float64(len(categories))) 52 | } 53 | 54 | func TestTrain(t *testing.T) { 55 | classifier := New() 56 | 57 | if err := classifier.TrainString(ham, "good"); err != nil { 58 | t.Error("classifier training failed") 59 | } 60 | 61 | if err := classifier.TrainString(spam, "bad"); err != nil { 62 | t.Error("classifier training failed") 63 | } 64 | 65 | assertFeatureCount(t, classifier, "quick", "good", 1.0) 66 | assertFeatureCount(t, classifier, "quick", "bad", 1.0) 67 | assertCategoryCount(t, classifier, "good", 1) 68 | assertCategoryCount(t, classifier, "bad", 1) 69 | } 70 | 71 | func TestClassify(t *testing.T) { 72 | classifier := New() 73 | text := "Quick way to make cash" 74 | 75 | t.Run("Empty classifier", func(t *testing.T) { 76 | if _, err := classifier.ClassifyString(text); err != ErrNotClassified { 77 | t.Errorf("expected classification error; received: %v", err) 78 | } 79 | }) 80 | 81 | t.Run("Trained classifier", func(t *testing.T) { 82 | classifier.TrainString(ham, "good") 83 | classifier.TrainString(spam, "bad") 84 | 85 | if _, err := classifier.ClassifyString(text); err != nil { 86 | t.Error("document incorrectly classified") 87 | } 88 | }) 89 | } 90 | 91 | func assertCategoryCount(t *testing.T, classifier *Classifier, category string, count float64) { 92 | 
v := classifier.categoryCount(category) 93 | assertEqual(t, count, v) 94 | } 95 | 96 | func assertFeatureCount(t *testing.T, classifier *Classifier, feature string, category string, count float64) { 97 | v := classifier.featureCount(feature, category) 98 | assertEqual(t, count, v) 99 | } 100 | 101 | func assertEqual(t *testing.T, expected, actual float64) { 102 | if actual != expected { 103 | t.Errorf("Expectation mismatch. Expected(%f) <=> Actual (%f)", expected, actual) 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /stopwords.go: -------------------------------------------------------------------------------- 1 | package classifier 2 | 3 | import ( 4 | "strings" 5 | ) 6 | 7 | var ( 8 | stopwords = map[string]struct{}{ 9 | "a": {}, "able": {}, "about": {}, "across": {}, "after": {}, "all": {}, "almost": {}, "also": {}, "am": {}, "among": {}, "an": {}, "and": {}, "any": {}, "are": {}, "as": {}, "at": {}, 10 | "be": {}, "because": {}, "been": {}, "but": {}, "by": {}, "can": {}, "cannot": {}, "could": {}, "dear": {}, "did": {}, "do": {}, "does": {}, "either": {}, "else": {}, "ever": {}, 11 | "every": {}, "for": {}, "from": {}, "get": {}, "got": {}, "had": {}, "has": {}, "have": {}, "he": {}, "her": {}, "hers": {}, "him": {}, "his": {}, "how": {}, "however": {}, "i": {}, 12 | "if": {}, "in": {}, "into": {}, "is": {}, "it": {}, "its": {}, "just": {}, "least": {}, "let": {}, "like": {}, "likely": {}, "may": {}, "me": {}, "might": {}, "most": {}, "must": {}, 13 | "my": {}, "neither": {}, "no": {}, "nor": {}, "not": {}, "of": {}, "off": {}, "often": {}, "on": {}, "only": {}, "or": {}, "other": {}, "our": {}, "own": {}, "rather": {}, "said": {}, 14 | "say": {}, "says": {}, "she": {}, "should": {}, "since": {}, "so": {}, "some": {}, "than": {}, "that": {}, "the": {}, "their": {}, "them": {}, "then": {}, "there": {}, "these": {}, 15 | "they": {}, "this": {}, "tis": {}, "to": {}, "too": {}, "twas": {}, "us": {}, "wants": {}, "was": {}, "we": {}, "were": {}, "what": {}, "when": {}, "where": {}, "which": {}, "while": {}, 16 | "who": {}, "whom": {}, "why": {}, "will": {}, "with": {}, "would": {}, "yet": {}, "you": {}, "your": {}, 17 | } 18 | ) 19 | 20 | // IsStopWord checks against a list of known english stop words and returns true if v is a 21 | // stop word; false otherwise 22 | func IsStopWord(v string) bool { 23 | if _, ok := stopwords[strings.ToLower(v)]; ok { 24 | return true 25 | } 26 | return false 27 | } 28 | 29 | // IsNotStopWord is the inverse function of IsStopWord 30 | func IsNotStopWord(v string) bool { 31 | return !IsStopWord(v) 32 | } 33 | -------------------------------------------------------------------------------- /stopwords_test.go: -------------------------------------------------------------------------------- 1 | package classifier 2 | 3 | import "testing" 4 | 5 | func TestStopWords(t *testing.T) { 6 | t.Run("Stopword", func(t *testing.T) { 7 | sample := []string{"a", "is", "the"} 8 | for _, v := range sample { 9 | if IsNotStopWord(v) { 10 | t.Errorf("%s was not identified as a stop word", v) 11 | } 12 | } 13 | }) 14 | t.Run("Other", func(t *testing.T) { 15 | sample := []string{"hello", "world"} 16 | for _, v := range sample { 17 | if IsStopWord(v) { 18 | t.Errorf("%s was incorrectly identified as a stop word", v) 19 | } 20 | } 21 | }) 22 | } 23 | -------------------------------------------------------------------------------- /tokens.go: -------------------------------------------------------------------------------- 1 | 
package classifier 2 | 3 | import ( 4 | "bufio" 5 | "io" 6 | "strings" 7 | "unicode" 8 | "unicode/utf8" 9 | ) 10 | 11 | // Tokenizer provides a common interface to tokenize documents 12 | type Tokenizer interface { 13 | // Tokenize breaks the provided document into a channel of tokens 14 | Tokenize(io.Reader) chan string 15 | } 16 | 17 | // IsWord is a predicate to determine if a string contains at least two 18 | // characters and doesn't contain any numbers 19 | func IsWord(v string) bool { 20 | return len(v) > 2 && !strings.ContainsAny(v, "01234556789") 21 | } 22 | 23 | // StdOption provides configuration settings for a StdTokenizer 24 | type StdOption func(*StdTokenizer) 25 | 26 | // StdTokenizer provides a common document tokenizer that splits a 27 | // document by word boundaries 28 | type StdTokenizer struct { 29 | transforms []Mapper 30 | splitFn bufio.SplitFunc 31 | filters []Predicate 32 | bufferSize int 33 | } 34 | 35 | // NewTokenizer initializes a new standard Tokenizer instance 36 | func NewTokenizer(opts ...StdOption) *StdTokenizer { 37 | tokenizer := &StdTokenizer{ 38 | bufferSize: 100, 39 | splitFn: bufio.ScanWords, 40 | transforms: []Mapper{ 41 | strings.ToLower, 42 | }, 43 | filters: []Predicate{ 44 | IsNotStopWord, 45 | }, 46 | } 47 | for _, opt := range opts { 48 | opt(tokenizer) 49 | } 50 | return tokenizer 51 | } 52 | 53 | // Tokenize words and return streaming results 54 | func (t *StdTokenizer) Tokenize(r io.Reader) chan string { 55 | tokenizer := bufio.NewScanner(r) 56 | tokenizer.Split(t.splitFn) 57 | tokens := make(chan string, t.bufferSize) 58 | 59 | go func() { 60 | for tokenizer.Scan() { 61 | tokens <- tokenizer.Text() 62 | } 63 | close(tokens) 64 | }() 65 | 66 | return t.pipeline(tokens) 67 | } 68 | 69 | func (t *StdTokenizer) pipeline(in chan string) chan string { 70 | return Map(Filter(in, t.filters...), t.transforms...) 71 | } 72 | 73 | // BufferSize adjusts the size of the buffered channel 74 | func BufferSize(size int) StdOption { 75 | return func(t *StdTokenizer) { 76 | t.bufferSize = size 77 | } 78 | } 79 | 80 | // SplitFunc overrides the default word split function, based on whitespace 81 | func SplitFunc(fn bufio.SplitFunc) StdOption { 82 | return func(t *StdTokenizer) { 83 | t.splitFn = fn 84 | } 85 | } 86 | 87 | // Transforms overrides the list of mappers 88 | func Transforms(m ...Mapper) StdOption { 89 | return func(t *StdTokenizer) { 90 | t.transforms = m 91 | } 92 | } 93 | 94 | // Filters overrides the list of predicates 95 | func Filters(f ...Predicate) StdOption { 96 | return func(t *StdTokenizer) { 97 | t.filters = f 98 | } 99 | } 100 | 101 | // ScanAlphaWords is a function that splits text on whitespace, punctuation, and symbols; 102 | // derived bufio.ScanWords 103 | func ScanAlphaWords(data []byte, atEOF bool) (advance int, token []byte, err error) { 104 | // Skip leading spaces and symbols 105 | start := 0 106 | for width := 0; start < len(data); start += width { 107 | var r rune 108 | r, width = utf8.DecodeRune(data[start:]) 109 | 110 | if !unicode.IsSpace(r) && !unicode.IsPunct(r) && !unicode.IsSymbol(r) { 111 | break 112 | } 113 | } 114 | 115 | // Scan until space or symbol, marking end of word. 116 | for width, i := 0, start; i < len(data); i += width { 117 | var r rune 118 | r, width = utf8.DecodeRune(data[i:]) 119 | if unicode.IsSpace(r) || unicode.IsPunct(r) || unicode.IsSymbol(r) { 120 | return i + width, data[start:i], nil 121 | } 122 | } 123 | 124 | // If we're at EOF, we have a final, non-empty, non-terminated word. Return it. 
125 | if atEOF && len(data) > start { 126 | return len(data), data[start:], nil 127 | } 128 | // Request more data. 129 | return start, nil, nil 130 | } 131 | -------------------------------------------------------------------------------- /tokens_test.go: -------------------------------------------------------------------------------- 1 | package classifier 2 | 3 | import ( 4 | "bytes" 5 | "io" 6 | "strings" 7 | "testing" 8 | "unicode" 9 | ) 10 | 11 | var ( 12 | text = "The quick brown fox jumped over the lazy dog" 13 | expected = 7 14 | ) 15 | 16 | type assertion func(t *testing.T, v string) 17 | 18 | func TestTokenize(t *testing.T) { 19 | tests := []struct { 20 | Name string 21 | Opts []StdOption 22 | Assertions []assertion 23 | }{ 24 | {"Standard Tokenizer", options(), assertions()}, 25 | {"Buffered Tokenizer", options(BufferSize(1)), assertions()}, 26 | {"ToUpper Tokenizer", options(Transforms(toUpper)), assertions(isUpper)}, 27 | {"Stopword Tokenizer", options(Filters(IsNotStopWord)), assertions(isStopWord)}, 28 | } 29 | 30 | for _, test := range tests { 31 | t.Run(test.Name, func(t *testing.T) { 32 | tokens := NewTokenizer(test.Opts...).Tokenize(toReader(text)) 33 | doTokenizeTest(t, tokens) 34 | }) 35 | } 36 | } 37 | 38 | func isStopWord(t *testing.T, v string) { 39 | if IsStopWord(v) { 40 | t.Errorf("value is a stopword") 41 | } 42 | } 43 | 44 | func isUpper(t *testing.T, v string) { 45 | for _, c := range v { 46 | if !unicode.IsUpper(c) { 47 | t.Errorf("value is not in uppercase") 48 | return 49 | } 50 | } 51 | } 52 | 53 | func toUpper(s string) string { 54 | return strings.ToUpper(s) 55 | } 56 | 57 | func toReader(text string) io.Reader { 58 | return bytes.NewBuffer([]byte(text)) 59 | } 60 | 61 | func doTokenizeTest(t *testing.T, tokens chan string, assertions ...assertion) { 62 | actual := 0 63 | for v := range tokens { 64 | for _, assert := range assertions { 65 | assert(t, v) 66 | } 67 | actual++ 68 | } 69 | if actual != expected { 70 | t.Errorf("Expected %d tokens; actual: %d", expected, actual) 71 | } 72 | } 73 | 74 | func options(opts ...StdOption) []StdOption { 75 | return opts 76 | } 77 | 78 | func assertions(assertions ...assertion) []assertion { 79 | return assertions 80 | } 81 | -------------------------------------------------------------------------------- /weight.go: -------------------------------------------------------------------------------- 1 | package classifier 2 | 3 | import ( 4 | "math" 5 | ) 6 | 7 | // WeightSchemeStrategy provides support for pluggable weight schemes 8 | type WeightSchemeStrategy func(doc map[string]float64) WeightScheme 9 | 10 | // WeightScheme provides a contract for term frequency weight schemes 11 | type WeightScheme func(term string) float64 12 | 13 | // Binary weight scheme: 1 if present; 0 otherwise 14 | func Binary(doc map[string]float64) WeightScheme { 15 | return func(term string) float64 { 16 | if _, ok := doc[term]; ok { 17 | return 1 18 | } 19 | return 0 20 | } 21 | } 22 | 23 | // BagOfWords weight scheme: counts the number of occurrences 24 | func BagOfWords(doc map[string]float64) WeightScheme { 25 | return func(term string) float64 { 26 | return doc[term] 27 | } 28 | } 29 | 30 | // TermFrequency weight scheme; counts the number of occurrences divided by 31 | // the number of terms within a document 32 | func TermFrequency(doc map[string]float64) WeightScheme { 33 | return func(term string) float64 { 34 | return math.Sqrt(doc[term] / float64(len(doc))) 35 | } 36 | } 37 | 38 | // LogNorm weight scheme: returns the natural 
log of one plus the number of occurrences of a term 39 | func LogNorm(doc map[string]float64) WeightScheme { 40 | 	return func(term string) float64 { 41 | 		return math.Log(1 + doc[term]) 42 | 	} 43 | } 44 | --------------------------------------------------------------------------------
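
Example usage (illustrative sketch, not a file in the tree above): the snippet below shows one way the naive Bayes classifier and the standard tokenizer defined in the listings might be wired together. It assumes the module path github.com/n3integration/classifier (the import already used by naive/naive.go) and the derived package path github.com/n3integration/classifier/naive; the training strings and category labels are invented for illustration.

package main

import (
	"fmt"
	"log"

	"github.com/n3integration/classifier"
	"github.com/n3integration/classifier/naive"
)

func main() {
	// Build a tokenizer that splits on whitespace, punctuation, and symbols
	// (ScanAlphaWords) instead of the default bufio.ScanWords, then wire it
	// into a naive Bayes classifier via the Tokenizer option.
	tok := classifier.NewTokenizer(
		classifier.SplitFunc(classifier.ScanAlphaWords),
	)
	c := naive.New(naive.Tokenizer(tok))

	// Supervised training: label a handful of short documents per category.
	if err := c.TrainString("stocks slid as quarterly profits fell", "business"); err != nil {
		log.Fatal(err)
	}
	if err := c.TrainString("the sprinter won gold in the 100m final", "sports"); err != nil {
		log.Fatal(err)
	}

	// Classify an unseen document; ErrNotClassified is returned when no
	// category scores above zero (for example, on an untrained model).
	category, err := c.ClassifyString("profits fell after weak quarterly sales")
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("category:", category)

	// Probabilities exposes the per-category scores behind the decision,
	// along with the best-scoring category.
	scores, best := c.Probabilities("a sprint for gold in the final")
	fmt.Println("best:", best, "scores:", scores)
}

Because Tokenize streams tokens through a buffered channel, each document is read exactly once and never fully buffered in memory, which is why Train and Classify accept an io.Reader and the TrainString/ClassifyString helpers simply wrap one around the input string.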