├── Gophers.008.crop.png
├── GOPHER
├── .travis.yml
├── LICENSE
├── utils.go
├── hashing.go
├── doc.go
├── dimreduction_test.go
├── measures
└── pairwise
│ └── comparisons.go
├── index_test.go
├── weightings_test.go
├── weightings.go
├── example_test.go
├── vectorisers_test.go
├── README.md
├── dimreduction.go
├── index.go
├── lsh.go
├── lda_test.go
├── randomprojection_test.go
├── vectorisers.go
├── randomprojection.go
└── lda.go
/Gophers.008.crop.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/james-bowman/nlp/HEAD/Gophers.008.crop.png
--------------------------------------------------------------------------------
/GOPHER:
--------------------------------------------------------------------------------
1 | The Go gopher was designed by Renee French and is licensed under the Creative Commons Attribution 3.0 licence.
2 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: go
2 |
3 | go:
4 | - 1.13.x
5 | - 1.14.x
6 | - tip
7 |
8 | before_install:
9 | - go get -t -v ./...
10 |
11 | script:
12 | - go test -coverprofile=coverage.txt -covermode=atomic
13 |
14 | after_success:
15 | - bash <(curl -s https://codecov.io/bash)
16 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 James Bowman
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/utils.go:
--------------------------------------------------------------------------------
1 | package nlp
2 |
3 | import (
4 | "github.com/james-bowman/sparse"
5 | "gonum.org/v1/gonum/mat"
6 | )
7 |
8 | // ColDo executes fn for each column j in m. If the matrix implements the mat.ColViewer
9 | // interface then this interface will be used to iterate over the column vectors more
10 | // efficiently. If the matrix implements the sparse.TypeConverter interface then the
11 | // matrix will be converted to a CSC matrix (which implements the mat.ColViewer
12 | // interface) so that it can benefit from the same optimisation.
13 | func ColDo(m mat.Matrix, fn func(j int, vec mat.Vector)) {
14 | if v, isOk := m.(mat.Vector); isOk {
15 | fn(0, v)
16 | return
17 | }
18 |
19 | if cv, isOk := m.(mat.ColViewer); isOk {
20 | _, c := m.Dims()
21 | for j := 0; j < c; j++ {
22 | fn(j, cv.ColView(j))
23 | }
24 | return
25 | }
26 |
27 | if sv, isOk := m.(sparse.TypeConverter); isOk {
28 | csc := sv.ToCSC()
29 | _, c := csc.Dims()
30 | for j := 0; j < c; j++ {
31 | fn(j, csc.ColView(j))
32 | }
33 | return
34 | }
35 |
36 | r, c := m.Dims()
37 | for j := 0; j < c; j++ {
38 | fn(j, mat.NewVecDense(r, mat.Col(nil, j, m)))
39 | }
40 | }
41 |
42 | // ColNonZeroElemDo executes fn for each non-zero element in column j of matrix m.
43 | // If m implements mat.ColNonZeroDoer then this interface will be used to perform
44 | // the iteration.
45 | func ColNonZeroElemDo(m mat.Matrix, j int, fn func(i, j int, v float64)) {
46 | colNonZeroDoer, isSparse := m.(mat.ColNonZeroDoer)
47 | r, _ := m.Dims()
48 |
49 | if isSparse {
50 | colNonZeroDoer.DoColNonZero(j, fn)
51 | } else {
52 | for i := 0; i < r; i++ {
53 | v := m.At(i, j)
54 | if v != 0 {
55 | fn(i, j, v)
56 | }
57 | }
58 | }
59 | }
60 |
--------------------------------------------------------------------------------
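
A minimal usage sketch for ColDo, assuming the package is imported as github.com/james-bowman/nlp alongside Gonum's mat package and using a small hand-built dense matrix purely for illustration. It prints each column; ColDo picks the most efficient iteration strategy available for the concrete matrix type (ColView, CSC conversion or a dense copy of each column).

    package main

    import (
        "fmt"

        "github.com/james-bowman/nlp"
        "gonum.org/v1/gonum/mat"
    )

    func main() {
        // A 2x3 dense matrix standing in for a term document matrix.
        m := mat.NewDense(2, 3, []float64{
            1, 0, 2,
            0, 3, 0,
        })

        // The callback receives the column index and the column as a vector.
        nlp.ColDo(m, func(j int, v mat.Vector) {
            fmt.Printf("column %d: %v\n", j, mat.Formatted(v.T()))
        })
    }
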
/hashing.go:
--------------------------------------------------------------------------------
1 | package nlp
2 |
3 | import (
4 | "math/rand"
5 |
6 | "github.com/james-bowman/sparse"
7 | "gonum.org/v1/gonum/mat"
8 | )
9 |
10 | // SimHash implements the SimHash Locality Sensitive Hashing (LSH) algorithm for
11 | // angular distance using sign random projections based on the work of Moses S. Charikar.
12 | // The distance between the original vectors is preserved through the hashing process
13 | // such that hashed vectors can be compared using Hamming Similarity for a faster,
14 | // more space-efficient approximation of Cosine Similarity for the original vectors.
15 | //
16 | // Charikar, Moses S. "Similarity Estimation Techniques from Rounding Algorithms"
17 | // in Proceedings of the thirty-fourth annual ACM symposium on Theory of computing -
18 | // STOC ’02, 2002, p. 380.
19 | // https://www.cs.princeton.edu/courses/archive/spr04/cos598B/bib/CharikarEstim.pdf
20 | type SimHash struct {
21 | hyperplanes []*mat.VecDense
22 | }
23 |
24 | // NewSimHash constructs a new SimHash creating a set of locality sensitive
25 | // hash functions which are combined to accept input vectors of length dim
26 | // and produce hashed binary vector fingerprints of length bits. This method
27 | // creates a series of random hyperplanes which are then compared to each
28 | // input vector to produce the output hashed binary vector encoding the input
29 | // vector's location in vector space relative to the hyperplanes. Each bit in
30 | // the output vector corresponds to the sign (1/0 for +/-) of the result of
31 | // the dot product comparison with each random hyperplane.
32 | func NewSimHash(bits int, dim int) *SimHash {
33 | // Generate random hyperplanes
34 | hyperplanes := make([]*mat.VecDense, bits)
35 |
36 | for j := 0; j < bits; j++ {
37 | p := make([]float64, dim)
38 | for i := 0; i < dim; i++ {
39 | p[i] = rand.NormFloat64()
40 | }
41 | hyperplanes[j] = mat.NewVecDense(dim, p)
42 | }
43 | return &SimHash{hyperplanes: hyperplanes}
44 | }
45 |
46 | // Hash accepts a Vector and outputs a BinaryVec (which also implements the
47 | // Gonum Vector interface). This method will panic if the input vector is of a
48 | // different length than the dim parameter used when constructing the SimHash.
49 | func (h *SimHash) Hash(v mat.Vector) *sparse.BinaryVec {
50 | bits := len(h.hyperplanes)
51 | dim := h.hyperplanes[0].Len()
52 | if dim != v.Len() {
53 | panic("The supplied vector has a different number of dimensions from the projected hyperplanes")
54 | }
55 | sig := sparse.NewBinaryVec(bits)
56 | for i := 0; i < bits; i++ {
57 | if sparse.Dot(v, h.hyperplanes[i]) >= 0 {
58 | sig.SetBit(i)
59 | }
60 | }
61 | return sig
62 | }
63 |
--------------------------------------------------------------------------------
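
A minimal sketch of SimHash in use, assuming random dense vectors stand in for real feature vectors produced by a vectoriser: two inputs are hashed into binary fingerprints and the Hamming similarity of the fingerprints approximates the angular similarity of the originals.

    package main

    import (
        "fmt"
        "math/rand"

        "github.com/james-bowman/nlp"
        "github.com/james-bowman/nlp/measures/pairwise"
        "gonum.org/v1/gonum/mat"
    )

    func main() {
        dim := 100
        hasher := nlp.NewSimHash(256, dim) // 256-bit fingerprints for 100-dimensional input

        // Two random vectors standing in for feature vectors.
        a := make([]float64, dim)
        b := make([]float64, dim)
        for i := range a {
            a[i] = rand.NormFloat64()
            b[i] = rand.NormFloat64()
        }

        ha := hasher.Hash(mat.NewVecDense(dim, a))
        hb := hasher.Hash(mat.NewVecDense(dim, b))

        // The fingerprints implement mat.Vector so they can be compared directly.
        fmt.Printf("approximate angular similarity: %f\n", pairwise.HammingSimilarity(ha, hb))
    }
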
/doc.go:
--------------------------------------------------------------------------------
1 | /*
2 | Package nlp provides implementations of selected machine learning algorithms for natural language processing of text corpora. The primary focus is the statistical semantics of plain-text documents supporting semantic analysis and retrieval of semantically similar documents.
3 |
4 | The package makes use of the Gonum (https://www.gonum.org/) library for linear algebra and scientific computing, with some inspiration taken from Python's scikit-learn (http://scikit-learn.org/stable/) and Gensim (https://radimrehurek.com/gensim/).
5 |
6 | Overview
7 |
8 | The primary intended use case is to support document input as text strings encoded as a matrix of numerical feature vectors called a `term document matrix`. Each column in the matrix corresponds to a document in the corpus and each row corresponds to a unique term occurring in the corpus. The individual elements within the matrix contain the frequency with which each term occurs within each document (referred to as `term frequency`). Whilst textual data from document corpora are the primary intended use case, the algorithms can be used with other types of data from other sources once encoded (vectorised) into a suitable matrix e.g. image data, sound data, users/products, etc.
9 |
10 | These matrices can be processed and manipulated through the application of additional transformations for weighting features, identifying relationships or optimising the data for analysis, information retrieval and/or predictions.
11 |
12 | Typically the algorithms in this package implement one of three primary interfaces:
13 |
14 | Vectoriser - Takes document input as strings and outputs matrices of numerical features e.g. term frequency.
15 | Transformer - Takes matrices of numerical features and applies some logic/transformation to output a new matrix.
16 | Comparer - Functions taking two vectors (columns from a matrix) and outputting a distance/similarity measure.
17 |
18 | One of the implementations of Vectoriser is Pipeline which can be used to wire together pipelines composed of a Vectoriser and one or more Transformers arranged in serial so that the output from each stage forms the input of the next. This can be used to construct a classic LSI (Latent Semantic Indexing) pipeline (vectoriser -> TF.IDF weighting -> Truncated SVD):
19 |
20 | pipeline := nlp.NewPipeline(
21 | nlp.NewCountVectoriser(),
22 | nlp.NewTfidfTransformer(),
23 | nlp.NewTruncatedSVD(100),
24 | )
25 |
26 | Whilst they take different inputs, both Vectorisers and Transformers have 3 primary methods:
27 |
28 | Fit() - Trains the model based upon the supplied input training data.
29 | Transform() - Transforms the input into the output matrix (requires the model to be already fitted by a previous call to Fit() or FitTransform()).
30 | FitTransform() - Convenience method combining Fit() and Transform() methods to transform input data, fitting the model to the input data in the process.
31 | */
32 | package nlp
33 |
--------------------------------------------------------------------------------
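
A short sketch of the Fit()/Transform() split described above, assuming a CountVectoriser fitted to a small, illustrative training corpus and then reused to encode a previously unseen document against the learned vocabulary:

    package main

    import (
        "fmt"

        "github.com/james-bowman/nlp"
    )

    func main() {
        trainingCorpus := []string{
            "The quick brown fox jumped over the lazy dog",
            "the cow jumped over the moon",
        }

        vectoriser := nlp.NewCountVectoriser()

        // Fit() learns the vocabulary from the training corpus.
        vectoriser.Fit(trainingCorpus...)

        // Transform() encodes new documents against the fitted vocabulary,
        // producing a term document matrix with one column per document.
        tdm, err := vectoriser.Transform("the fox and the dog")
        if err != nil {
            fmt.Printf("Failed to vectorise document because %v", err)
            return
        }

        terms, docs := tdm.Dims()
        fmt.Printf("%d terms x %d documents\n", terms, docs)
    }
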
/dimreduction_test.go:
--------------------------------------------------------------------------------
1 | package nlp
2 |
3 | import (
4 | "bytes"
5 | "testing"
6 |
7 | "gonum.org/v1/gonum/mat"
8 | )
9 |
10 | func TestTruncatedSVDFitTransform(t *testing.T) {
11 | var tests = []struct {
12 | m int
13 | n int
14 | input []float64
15 | k int
16 | r int
17 | c int
18 | result []float64
19 | }{
20 | {
21 | m: 6, n: 4,
22 | input: []float64{
23 | 1, 3, 5, 2,
24 | 8, 1, 0, 0,
25 | 2, 1, 0, 1,
26 | 0, 0, 0, 0,
27 | 0, 0, 0, 1,
28 | 0, 1, 0, 0,
29 | },
30 | k: 2,
31 | r: 2, c: 4,
32 | result: []float64{
33 | -8.090, -2.212, -1.695, -0.955,
34 | 1.888, -2.524, -4.649, -1.930,
35 | },
36 | },
37 | }
38 |
39 | for _, test := range tests {
40 | transformer := NewTruncatedSVD(test.k)
41 | input := mat.NewDense(test.m, test.n, test.input)
42 | expResult := mat.NewDense(test.r, test.c, test.result)
43 |
44 | result, err := transformer.FitTransform(input)
45 |
46 | if err != nil {
47 | t.Errorf("Failed Truncated SVD transform caused by %v", err)
48 | }
49 |
50 | if !mat.EqualApprox(expResult, result, 0.01) {
51 | t.Logf("Expected matrix: \n%v\n but found: \n%v\n",
52 | mat.Formatted(expResult),
53 | mat.Formatted(result))
54 | t.Fail()
55 | }
56 |
57 | result2, err := transformer.Transform(input)
58 |
59 | if err != nil {
60 | t.Errorf("Failed Truncated SVD transform caused by %v", err)
61 | }
62 |
63 | if !mat.EqualApprox(result, result2, 0.001) {
64 | t.Logf("First matrix: \n%v\n but second matrix: \n%v\n",
65 | mat.Formatted(result),
66 | mat.Formatted(result2))
67 | t.Fail()
68 | }
69 | }
70 | }
71 |
72 | func TestPCAFitTransform(t *testing.T) {
73 | var tests = []struct {
74 | m int
75 | n int
76 | input []float64
77 | k int
78 | r int
79 | c int
80 | result []float64
81 | }{
82 | {
83 | m: 6, n: 4,
84 | input: []float64{
85 | 1, 3, 5, 2,
86 | 8, 1, 0, 0,
87 | 2, 1, 0, 1,
88 | 0, 0, 0, 0,
89 | 0, 0, 0, 1,
90 | 0, 1, 0, 0,
91 | },
92 | k: 2,
93 | r: 2, c: 4,
94 | result: []float64{
95 | -7.478, -0.128, 1.591, 0.496,
96 | 2.937, 2.581, 4.240, 1.110,
97 | },
98 | },
99 | }
100 |
101 | for _, test := range tests {
102 | transformer := NewPCA(test.k)
103 | input := mat.NewDense(test.m, test.n, test.input)
104 | expResult := mat.NewDense(test.r, test.c, test.result)
105 |
106 | result, err := transformer.FitTransform(input)
107 |
108 | if err != nil {
109 | t.Errorf("Failed Truncated SVD transform caused by %v", err)
110 | }
111 |
112 | if !mat.EqualApprox(expResult, result, 0.01) {
113 | t.Logf("Expected matrix: \n%v\n but found: \n%v\n",
114 | mat.Formatted(expResult),
115 | mat.Formatted(result))
116 | t.Fail()
117 | }
118 |
119 | result2, err := transformer.Transform(input)
120 |
121 | if err != nil {
122 | t.Errorf("Failed Truncated SVD transform caused by %v", err)
123 | }
124 |
125 | if !mat.EqualApprox(result, result2, 0.001) {
126 | t.Logf("First matrix: \n%v\n but second matrix: \n%v\n",
127 | mat.Formatted(result),
128 | mat.Formatted(result2))
129 | t.Fail()
130 | }
131 | }
132 | }
133 |
134 | func TestTruncatedSVDSaveLoad(t *testing.T) {
135 | var transforms = []struct {
136 | wanted *TruncatedSVD
137 | }{
138 | {
139 | wanted: &TruncatedSVD{
140 | Components: mat.NewDense(4, 2, []float64{
141 | 1, 5,
142 | 3, 2,
143 | 9, 0,
144 | 8, 4,
145 | }),
146 | K: 2,
147 | },
148 | },
149 | }
150 |
151 | for ti, test := range transforms {
152 | t.Logf("**** TestTruncatedSVDSaveLoad - Test Run %d.\n", ti+1)
153 |
154 | buf := new(bytes.Buffer)
155 | if err := test.wanted.Save(buf); err != nil {
156 | t.Errorf("Error encoding: %v\n", err)
157 | continue
158 | }
159 |
160 | var b TruncatedSVD
161 | if err := b.Load(buf); err != nil {
162 | t.Errorf("Error unencoding: %v\n", err)
163 | continue
164 | }
165 |
166 | if !mat.Equal(test.wanted.Components, b.Components) {
167 | t.Logf("Components mismatch: Wanted %v but got %v\n", mat.Formatted(test.wanted.Components), mat.Formatted(b.Components))
168 | t.Fail()
169 | }
170 | if test.wanted.K != b.K {
171 | t.Logf("K value mismatch: Wanted %d but got %d\n", test.wanted.K, b.K)
172 | t.Fail()
173 | }
174 | }
175 | }
176 |
--------------------------------------------------------------------------------
/measures/pairwise/comparisons.go:
--------------------------------------------------------------------------------
1 | package pairwise
2 |
3 | import (
4 | "math"
5 |
6 | "github.com/james-bowman/sparse"
7 | "gonum.org/v1/gonum/mat"
8 | )
9 |
10 | // Comparer is a type of function that compares two mat.Vector types and
11 | // returns a value indicating how similar they are.
12 | type Comparer func(a, b mat.Vector) float64
13 |
14 | // CosineSimilarity calculates the cosine of the angle between 2 vectors i.e. how
15 | // similar they are. Possible values range from -1 up to 1 (an exact match). NaN will be
16 | // returned if either vector is zero length or contains only 0s.
17 | func CosineSimilarity(a, b mat.Vector) float64 {
18 | // Cosine angle between two vectors is equal to their dot product divided by
19 | // the product of their L2 norms
20 | dotProduct := sparse.Dot(a, b)
21 | norma := sparse.Norm(a, 2.0)
22 | normb := sparse.Norm(b, 2.0)
23 |
24 | if norma == 0 || normb == 0 {
25 | return math.NaN()
26 | }
27 |
28 | return (dotProduct / (norma * normb))
29 | }
30 |
31 | // CosineDistance is the complement of CosineSimilarity in the positive space:
32 | // CosineDistance = 1.0 - CosineSimilarity.
33 | // It should be noted that CosineDistance is not strictly a valid distance measure
34 | // as it does not obey the triangle inequality. For applications requiring a distance
35 | // measure that conforms to the strict definition, AngularDistance or Euclidean
36 | // distance (with all vectors L2 normalised first) should be used instead.
37 | // Whilst these distance measures may give different values, they will rank the same
38 | // as CosineDistance.
39 | func CosineDistance(a, b mat.Vector) float64 {
40 | return 1.0 - CosineSimilarity(a, b)
41 | }
42 |
43 | // AngularDistance is a distance measure closely related to CosineSimilarity.
44 | // It measures the difference between the angles of 2 vectors by taking
45 | // the inverse cosine (acos) of the CosineSimilarity and dividing by Pi.
46 | // Unlike CosineDistance, this measure is a valid distance measure as it obeys
47 | // the triangle inequality.
48 | // See https://en.wikipedia.org/wiki/Cosine_similarity#Angular_distance_and_similarity
49 | func AngularDistance(a, b mat.Vector) float64 {
50 | cos := CosineSimilarity(a, b)
51 | if cos > 1 {
52 | cos = 1.0
53 | }
54 | theta := math.Acos(cos)
55 | return theta / math.Pi
56 | }
57 |
58 | // AngularSimilarity is the complement of AngularDistance:
59 | // AngularSimilarity = 1.0 - AngularDistance.
60 | func AngularSimilarity(a, b mat.Vector) float64 {
61 | return 1.0 - AngularDistance(a, b)
62 | }
63 |
64 | // HammingDistance is a distance measure sometimes referred to as the
65 | // `Matching Distance` and measures how different the 2 vectors are
66 | // in terms of the number of non-matching elements. This measurement
67 | // is normalised so that the distance is expressed as a proportion of the total
68 | // number of elements in the vectors. If a and b are not the same
69 | // shape then the function will panic.
70 | func HammingDistance(a, b mat.Vector) float64 {
71 | ba, aok := a.(*sparse.BinaryVec)
72 | bb, bok := b.(*sparse.BinaryVec)
73 |
74 | if aok && bok {
75 | return float64(ba.DistanceFrom(bb)) / float64(ba.Len())
76 | }
77 |
78 | var count float64
79 | for i := 0; i < a.Len(); i++ {
80 | if a.AtVec(i) != b.AtVec(i) {
81 | count++
82 | }
83 | }
84 | return count / float64(a.Len())
85 | }
86 |
87 | // HammingSimilarity is the complement of HammingDistance (1 - HammingDistance)
88 | // and represents the proportion of elements within the 2 vectors that
89 | // exactly match.
90 | func HammingSimilarity(a, b mat.Vector) float64 {
91 | return 1.0 - HammingDistance(a, b)
92 | }
93 |
94 | // EuclideanDistance calculates the Euclidean distance
95 | // (l2 distance) between vectors a and b or more specifically
96 | // \sqrt{\sum_{i=1}^n (a_i - b_i)^2}
97 | func EuclideanDistance(a, b mat.Vector) float64 {
98 | var v mat.VecDense
99 | v.SubVec(a, b)
100 | return math.Sqrt(mat.Dot(&v, &v))
101 | }
102 |
103 | // ManhattenDistance calculates the Manhattan distance (L1 distance), otherwise
104 | // known as the taxicab distance, between two vectors a and b.
105 | func ManhattenDistance(a, b mat.Vector) float64 {
106 | var v mat.VecDense
107 | v.SubVec(a, b)
108 | return mat.Norm(&v, 1)
109 | }
110 |
111 | // VectorLenSimilarity returns the square root of the dot product of vectors a and b (their length when a == b), or NaN if the dot product is 0.
112 | func VectorLenSimilarity(a, b mat.Vector) float64 {
113 | dotProduct := sparse.Dot(a, b)
114 | if dotProduct == 0 {
115 | return math.NaN()
116 | }
117 | return math.Sqrt(dotProduct)
118 | }
119 |
--------------------------------------------------------------------------------
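
A small sketch exercising the pairwise measures above on hand-built dense vectors (the values are chosen only for illustration):

    package main

    import (
        "fmt"

        "github.com/james-bowman/nlp/measures/pairwise"
        "gonum.org/v1/gonum/mat"
    )

    func main() {
        a := mat.NewVecDense(3, []float64{1, 2, 3})
        b := mat.NewVecDense(3, []float64{2, 4, 6})
        c := mat.NewVecDense(3, []float64{3, -1, 0})

        // b is a scaled copy of a, so the cosine similarity is 1 and the
        // cosine and angular distances are both 0.
        fmt.Printf("CosineSimilarity(a, b)  = %.3f\n", pairwise.CosineSimilarity(a, b))
        fmt.Printf("CosineDistance(a, b)    = %.3f\n", pairwise.CosineDistance(a, b))

        // c points in a different direction, giving non-zero distances.
        fmt.Printf("AngularDistance(a, c)   = %.3f\n", pairwise.AngularDistance(a, c))
        fmt.Printf("EuclideanDistance(a, c) = %.3f\n", pairwise.EuclideanDistance(a, c))
    }
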
/index_test.go:
--------------------------------------------------------------------------------
1 | package nlp
2 |
3 | import (
4 | "sort"
5 | "testing"
6 |
7 | "github.com/james-bowman/nlp/measures/pairwise"
8 | "github.com/james-bowman/sparse"
9 | "gonum.org/v1/gonum/floats"
10 | "gonum.org/v1/gonum/mat"
11 | )
12 |
13 | func TestIndexerIndex(t *testing.T) {
14 | m := sparse.Random(sparse.DenseFormat, 100, 10, 1.0)
15 |
16 | tests := []struct {
17 | index Indexer
18 | }{
19 | {index: NewLinearScanIndex(pairwise.CosineDistance)},
20 | {index: NewLSHIndex(false, NewSimHash(1000, 100), NewClassicLSH(50, 20), pairwise.CosineDistance)},
21 | {index: NewLSHIndex(true, NewSimHash(1000, 100), NewClassicLSH(50, 20), pairwise.HammingDistance)},
22 | {index: NewLSHIndex(false, NewSimHash(1000, 100), NewLSHForest(50, 20), pairwise.CosineDistance)},
23 | }
24 |
25 | for ti, test := range tests {
26 | ColDo(m, func(j int, v mat.Vector) {
27 | test.index.Index(v, j)
28 | })
29 |
30 | ColDo(m, func(j int, v mat.Vector) {
31 | matches := test.index.Search(v, 1)
32 |
33 | if len(matches) != 1 {
34 | t.Errorf("Test %d: Search expected 1 result but received %d", ti+1, len(matches))
35 | }
36 | if matches[0].ID != j {
37 | t.Errorf("Test %d: Search expected to find %d but found %d", ti+1, j, matches[0].ID)
38 | }
39 | if matches[0].Distance < -0.0000001 || matches[0].Distance > 0.0000001 {
40 | t.Errorf("Test %d: Search match distance expected 0.0 but received %f", ti+1, matches[0].Distance)
41 | }
42 | })
43 | }
44 | }
45 |
46 | func TestIndexerSearch(t *testing.T) {
47 | numCols := 10
48 | m := sparse.Random(sparse.DenseFormat, 100, numCols, 1.0)
49 |
50 | // build similarity matrix
51 | similarityMatrix := make([]float64, numCols*numCols)
52 | inds := make([][]int, numCols)
53 | ColDo(m, func(j int, v1 mat.Vector) {
54 | ColDo(m, func(i int, v2 mat.Vector) {
55 | similarityMatrix[j*numCols+i] = pairwise.CosineDistance(v1, v2)
56 | })
57 | inds[j] = make([]int, numCols)
58 | floats.Argsort(similarityMatrix[j*numCols:(j+1)*numCols], inds[j])
59 | for left, right := 0, len(inds[j])-1; left < right; left, right = left+1, right-1 {
60 | inds[j][left], inds[j][right] = inds[j][right], inds[j][left]
61 | similarityMatrix[j*numCols+left], similarityMatrix[j*numCols+right] = similarityMatrix[j*numCols+right], similarityMatrix[j*numCols+left]
62 | }
63 | })
64 |
65 | tests := []struct {
66 | k int
67 | index Indexer
68 | }{
69 | {k: numCols, index: NewLinearScanIndex(pairwise.CosineDistance)},
70 | {k: numCols, index: NewLSHIndex(false, NewSimHash(700, 100), NewClassicLSH(7, 100), pairwise.CosineDistance)},
71 | {k: numCols, index: NewLSHIndex(false, NewSimHash(1000, 100), NewLSHForest(50, 20), pairwise.CosineDistance)},
72 | }
73 |
74 | for ti, test := range tests {
75 | ColDo(m, func(j int, v mat.Vector) {
76 | test.index.Index(v, j)
77 | })
78 |
79 | ColDo(m, func(j int, v mat.Vector) {
80 | matches := test.index.Search(v, test.k)
81 |
82 | if len(matches) != test.k {
83 | t.Errorf("Test %d: Search expected %d result but received %d", ti+1, test.k, len(matches))
84 | }
85 | heap := resultHeap{matches: matches}
86 | sort.Sort(heap)
87 |
88 | for i, match := range matches {
89 | if match.ID != inds[j][i] {
90 | t.Errorf("Test %d: For col #%d, Rank #%d - expected %v but found %v", ti+1, j, i, inds[j], matches)
91 | return
92 | }
93 | }
94 | })
95 | }
96 | }
97 |
98 | func TestIndexerRemove(t *testing.T) {
99 | m := sparse.Random(sparse.DenseFormat, 100, 10, 1.0)
100 |
101 | tests := []struct {
102 | index Indexer
103 | }{
104 | {index: NewLinearScanIndex(pairwise.CosineDistance)},
105 | {index: NewLSHIndex(false, NewSimHash(1000, 100), NewClassicLSH(50, 20), pairwise.CosineDistance)},
106 | {index: NewLSHIndex(true, NewSimHash(1000, 100), NewClassicLSH(50, 20), pairwise.HammingDistance)},
107 | {index: NewLSHIndex(false, NewSimHash(1000, 100), NewLSHForest(50, 20), pairwise.CosineDistance)},
108 | }
109 |
110 | for ti, test := range tests {
111 | ColDo(m, func(j int, v mat.Vector) {
112 | test.index.Index(v, j)
113 | })
114 |
115 | ColDo(m, func(j int, v mat.Vector) {
116 | test.index.Remove(j)
117 | matches := test.index.Search(v, 1)
118 |
119 | if len(matches) > 1 {
120 | t.Errorf("Test %d: Search expected less than 1 result but received %d", ti+1, len(matches))
121 | }
122 | if len(matches) == 1 {
123 | if matches[0].ID == j {
124 | t.Errorf("Test %d: Search expected not to find %d but found %d", ti+1, j, matches[0].ID)
125 | }
126 | }
127 | })
128 | }
129 | }
130 |
--------------------------------------------------------------------------------
/weightings_test.go:
--------------------------------------------------------------------------------
1 | package nlp
2 |
3 | import (
4 | "bytes"
5 | "testing"
6 |
7 | "github.com/james-bowman/sparse"
8 | "gonum.org/v1/gonum/mat"
9 | )
10 |
11 | func TestTfidfTransformerFit(t *testing.T) {
12 | var tests = []struct {
13 | m int
14 | n int
15 | input []float64
16 | dim int
17 | transform []float64
18 | }{
19 | {
20 | m: 6, n: 4,
21 | input: []float64{
22 | 1, 3, 5, 2,
23 | 8, 1, 0, 0,
24 | 2, 1, 0, 1,
25 | 0, 0, 0, 0,
26 | 0, 0, 0, 1,
27 | 0, 1, 0, 0,
28 | },
29 | dim: 6,
30 | transform: []float64{
31 | 0,
32 | 0.5108256237659907,
33 | 0.22314355131420976,
34 | 1.6094379124341003,
35 | 0.9162907318741551,
36 | 0.9162907318741551,
37 | },
38 | },
39 | }
40 |
41 | for _, test := range tests {
42 | transformer := NewTfidfTransformer()
43 | input := mat.NewDense(test.m, test.n, test.input)
44 |
45 | transformer.Fit(input)
46 |
47 | weights := transformer.transform.Diagonal()
48 | for i, v := range weights {
49 | if v != test.transform[i] {
50 | t.Logf("Expected weights: \n%v\n but found: \n%v\n",
51 | test.transform, weights)
52 | t.Fail()
53 | }
54 | }
55 | }
56 | }
57 |
58 | func TestTfidfTransformerTransform(t *testing.T) {
59 | var tests = []struct {
60 | m int
61 | n int
62 | input []float64
63 | tm int
64 | tn int
65 | output []float64
66 | }{
67 | {
68 | m: 6, n: 4,
69 | input: []float64{
70 | 1, 3, 5, 2,
71 | 8, 1, 0, 0,
72 | 2, 1, 0, 1,
73 | 0, 0, 0, 0,
74 | 0, 0, 0, 1,
75 | 0, 1, 0, 0,
76 | },
77 | tm: 6, tn: 4,
78 | output: []float64{
79 | 0.000, 0.000, 0.000, 0.000,
80 | 4.087, 0.511, 0.000, 0.000,
81 | 0.446, 0.223, 0.000, 0.223,
82 | 0.000, 0.000, 0.000, 0.000,
83 | 0.000, 0.000, 0.000, 0.916,
84 | 0.000, 0.916, 0.000, 0.000,
85 | },
86 | },
87 | }
88 |
89 | for _, test := range tests {
90 | transformer := NewTfidfTransformer()
91 | input := mat.NewDense(test.m, test.n, test.input)
92 | output := mat.NewDense(test.tm, test.tn, test.output)
93 |
94 | result, err := transformer.FitTransform(input)
95 |
96 | if err != nil {
97 | t.Errorf("Failed tfidf fit transform caused by %v", err)
98 | }
99 |
100 | if !mat.EqualApprox(output, result, 0.001) {
101 | t.Logf("Expected matrix: \n%v\n but found: \n%v\n",
102 | mat.Formatted(output),
103 | mat.Formatted(result))
104 | t.Fail()
105 | }
106 |
107 | // test that subsequent transforms produce same result as initial
108 | result2, err := transformer.Transform(input)
109 |
110 | if err != nil {
111 | t.Errorf("Failed tfidf fit transform caused by %v", err)
112 | }
113 |
114 | if !mat.Equal(result, result2) {
115 | t.Logf("Expected matrix: \n%v\n but found: \n%v\n",
116 | mat.Formatted(result),
117 | mat.Formatted(result2))
118 | t.Fail()
119 | }
120 | }
121 | }
122 |
123 | func TestTfidfTransformerSaveLoad(t *testing.T) {
124 | var transforms = []struct {
125 | wantedTransform *sparse.DIA
126 | }{
127 | {
128 | wantedTransform: sparse.NewDIA(2, 2, []float64{1, 5}),
129 | },
130 | }
131 |
132 | for ti, test := range transforms {
133 | t.Logf("**** TestTfidfTransformerSave - Test Run %d.\n", ti+1)
134 |
135 | a := NewTfidfTransformer()
136 | a.transform = test.wantedTransform
137 |
138 | buf := new(bytes.Buffer)
139 | if err := a.Save(buf); err != nil {
140 | t.Errorf("Error encoding: %v\n", err)
141 | continue
142 | }
143 |
144 | b := NewTfidfTransformer()
145 | if err := b.Load(buf); err != nil {
146 | t.Errorf("Error unencoding: %v\n", err)
147 | continue
148 | }
149 |
150 | if !mat.Equal(a.transform, b.transform) {
151 | t.Logf("Wanted %v but got %v\n", mat.Formatted(a.transform), mat.Formatted(b.transform))
152 | t.Fail()
153 | }
154 | }
155 | }
156 |
157 | func benchmarkTFIDFFitTransform(t Transformer, m, n int, b *testing.B) {
158 | mat := mat.NewDense(m, n, nil)
159 |
160 | for n := 0; n < b.N; n++ {
161 | t.FitTransform(mat)
162 | }
163 | }
164 |
165 | func BenchmarkTFIDFFitTransform20x10(b *testing.B) {
166 | benchmarkTFIDFFitTransform(NewTfidfTransformer(), 20, 10, b)
167 | }
168 | func BenchmarkTFIDFFitTransform200x100(b *testing.B) {
169 | benchmarkTFIDFFitTransform(NewTfidfTransformer(), 200, 100, b)
170 | }
171 | func BenchmarkTFIDFFitTransform2000x1000(b *testing.B) {
172 | benchmarkTFIDFFitTransform(NewTfidfTransformer(), 2000, 1000, b)
173 | }
174 | func BenchmarkTFIDFFitTransform20000x10000(b *testing.B) {
175 | benchmarkTFIDFFitTransform(NewTfidfTransformer(), 20000, 10000, b)
176 | }
177 |
--------------------------------------------------------------------------------
/weightings.go:
--------------------------------------------------------------------------------
1 | package nlp
2 |
3 | import (
4 | "io"
5 | "math"
6 |
7 | "github.com/james-bowman/sparse"
8 | "gonum.org/v1/gonum/mat"
9 | )
10 |
11 | // TfidfTransformer takes a raw term document matrix and weights each raw term frequency
12 | // value depending upon how commonly it occurs across all documents within the corpus.
13 | // For example a very commonly occurring word like `the` is likely to occur in all documents
14 | // and so would be weighted down.
15 | // More precisely, TfidfTransformer applies a tf-idf algorithm to the matrix where each
16 | // term frequency is multiplied by the inverse document frequency. Inverse document
17 | // frequency is calculated as log(n/df) where df is the number of documents in which the
18 | // term occurs and n is the total number of documents within the corpus. We add 1 to both n
19 | // and df before division to prevent division by zero.
20 | type TfidfTransformer struct {
21 | transform *sparse.DIA
22 | }
23 |
24 | // NewTfidfTransformer constructs a new TfidfTransformer.
25 | func NewTfidfTransformer() *TfidfTransformer {
26 | return &TfidfTransformer{}
27 | }
28 |
29 | // Fit takes a training term document matrix, counts term occurrences across all documents
30 | // and constructs an inverse document frequency transform to apply to matrices in subsequent
31 | // calls to Transform().
32 | func (t *TfidfTransformer) Fit(matrix mat.Matrix) Transformer {
33 | if t, isTypeConv := matrix.(sparse.TypeConverter); isTypeConv {
34 | matrix = t.ToCSR()
35 | }
36 | m, n := matrix.Dims()
37 |
38 | weights := make([]float64, m)
39 | var df int
40 | if csr, ok := matrix.(*sparse.CSR); ok {
41 | for i := 0; i < m; i++ {
42 | weights[i] = math.Log(float64(1+n) / float64(1+csr.RowNNZ(i)))
43 | }
44 | } else {
45 | for i := 0; i < m; i++ {
46 | df = 0
47 | for j := 0; j < n; j++ {
48 | if matrix.At(i, j) != 0 {
49 | df++
50 | }
51 | }
52 | weights[i] = math.Log(float64(1+n) / float64(1+df))
53 | }
54 | }
55 |
56 | // build a diagonal matrix from array of term weighting values for subsequent
57 | // multiplication with term document matrices
58 | t.transform = sparse.NewDIA(m, m, weights)
59 |
60 | return t
61 | }
62 |
63 | // Transform applies the inverse document frequency (IDF) transform by multiplying
64 | // each term frequency by its corresponding IDF value. This has the effect of weighting
65 | // each term frequency according to how often it appears across the whole document corpus
66 | // so that naturally frequently occurring words are given less weight than uncommon ones.
67 | // The returned matrix is a sparse matrix type.
68 | func (t *TfidfTransformer) Transform(matrix mat.Matrix) (mat.Matrix, error) {
69 | if t, isTypeConv := matrix.(sparse.TypeConverter); isTypeConv {
70 | matrix = t.ToCSR()
71 | }
72 | var product sparse.CSR
73 |
74 | // simply multiply the matrix by our idf transform (the diagonal matrix of term weights)
75 | product.Mul(t.transform, matrix)
76 |
77 | // todo: possibly L2 norm matrix to remove any bias caused by documents of different
78 | // lengths where longer documents naturally have more words and so higher word counts
79 |
80 | return &product, nil
81 | }
82 |
83 | // FitTransform is exactly equivalent to calling Fit() followed by Transform() on the
84 | // same matrix. This is a convenience where separate training data is not being
85 | // used to fit the model i.e. the model is fitted on the fly to the test data.
86 | // The returned matrix is a sparse matrix type.
87 | func (t *TfidfTransformer) FitTransform(matrix mat.Matrix) (mat.Matrix, error) {
88 | if t, isTypeConv := matrix.(sparse.TypeConverter); isTypeConv {
89 | matrix = t.ToCSR()
90 | }
91 | return t.Fit(matrix).Transform(matrix)
92 | }
93 |
94 | // Save binary serialises the model and writes it into w. This is useful for persisting
95 | // a trained model to disk so that it may be loaded (using the Load() method) in another
96 | // context (e.g. production) for reproducible results.
97 | func (t TfidfTransformer) Save(w io.Writer) error {
98 | _, err := t.transform.MarshalBinaryTo(w)
99 |
100 | return err
101 | }
102 |
103 | // Load binary deserialises the previously serialised model into the receiver. This is
104 | // useful for loading a previously trained and saved model from another context
105 | // (e.g. offline training) for use within another context (e.g. production) for
106 | // reproducible results. Load should only be performed with trusted data.
107 | func (t *TfidfTransformer) Load(r io.Reader) error {
108 | var model sparse.DIA
109 |
110 | if _, err := model.UnmarshalBinaryFrom(r); err != nil {
111 | return err
112 | }
113 | t.transform = &model
114 |
115 | return nil
116 | }
117 |
--------------------------------------------------------------------------------
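
A brief sketch of the tf-idf weighting applied to a hand-built term document matrix (values are illustrative only). With 4 documents, a term occurring in every document receives a weight of log((1+4)/(1+4)) = 0 and is effectively removed, while a term occurring in a single document receives log(5/2) ≈ 0.916:

    package main

    import (
        "fmt"

        "github.com/james-bowman/nlp"
        "gonum.org/v1/gonum/mat"
    )

    func main() {
        // 3 terms x 4 documents.
        tdm := mat.NewDense(3, 4, []float64{
            1, 2, 1, 3, // occurs in every document -> weighted to 0
            0, 1, 0, 1, // occurs in half the documents
            4, 0, 0, 0, // occurs in a single document -> weighted up
        })

        transformer := nlp.NewTfidfTransformer()
        weighted, err := transformer.FitTransform(tdm)
        if err != nil {
            fmt.Printf("Failed to apply tf-idf weighting because %v", err)
            return
        }

        fmt.Printf("%v\n", mat.Formatted(weighted))
    }
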
/example_test.go:
--------------------------------------------------------------------------------
1 | package nlp_test
2 |
3 | import (
4 | "fmt"
5 |
6 | "github.com/james-bowman/nlp"
7 | "github.com/james-bowman/nlp/measures/pairwise"
8 | "gonum.org/v1/gonum/mat"
9 | )
10 |
11 | func Example() {
12 | testCorpus := []string{
13 | "The quick brown fox jumped over the lazy dog",
14 | "hey diddle diddle, the cat and the fiddle",
15 | "the cow jumped over the moon",
16 | "the little dog laughed to see such fun",
17 | "and the dish ran away with the spoon",
18 | }
19 |
20 | var stopWords = []string{"a", "about", "above", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "another", "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are", "around", "as", "at", "back", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name", "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "thickv", "thin", "third", "this", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves"}
21 |
22 | query := "the brown fox ran around the dog"
23 |
24 | vectoriser := nlp.NewCountVectoriser(stopWords...)
25 | transformer := nlp.NewTfidfTransformer()
26 |
27 | // set k (the number of dimensions following truncation) to 4
28 | reducer := nlp.NewTruncatedSVD(4)
29 |
30 | lsiPipeline := nlp.NewPipeline(vectoriser, transformer, reducer)
31 |
32 | // Transform the corpus into an LSI fitting the model to the documents in the process
33 | lsi, err := lsiPipeline.FitTransform(testCorpus...)
34 | if err != nil {
35 | fmt.Printf("Failed to process documents because %v", err)
36 | return
37 | }
38 |
39 | // run the query through the same pipeline that was fitted to the corpus in order
40 | // to project it into the same dimensional space
41 | queryVector, err := lsiPipeline.Transform(query)
42 | if err != nil {
43 | fmt.Printf("Failed to process documents because %v", err)
44 | return
45 | }
46 |
47 | // iterate over document feature vectors (columns) in the LSI matrix and compare
48 | // with the query vector for similarity. Similarity is measured as the cosine of
49 | // the angle between the vectors, known as the cosine similarity
50 | highestSimilarity := -1.0
51 | var matched int
52 | _, docs := lsi.Dims()
53 | for i := 0; i < docs; i++ {
54 | similarity := pairwise.CosineSimilarity(queryVector.(mat.ColViewer).ColView(0), lsi.(mat.ColViewer).ColView(i))
55 | if similarity > highestSimilarity {
56 | matched = i
57 | highestSimilarity = similarity
58 | }
59 | }
60 |
61 | fmt.Printf("Matched '%s'", testCorpus[matched])
62 | // Output: Matched 'The quick brown fox jumped over the lazy dog'
63 | }
64 |
--------------------------------------------------------------------------------
/vectorisers_test.go:
--------------------------------------------------------------------------------
1 | package nlp
2 |
3 | import (
4 | "testing"
5 |
6 | "github.com/james-bowman/sparse"
7 | )
8 |
9 | var stopWords = []string{"a", "about", "above", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "another", "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are", "around", "as", "at", "back", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name", "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "thickv", "thin", "third", "this", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves"}
10 |
11 | var trainSet = []string{
12 | "The quick brown fox jumped over the. Lazy dog",
13 | "the brown Cat sat on the mat",
14 | "the little dog laughed to see such fun",
15 | "laughing cow",
16 | "the cow ran around the dog",
17 | "spoon dish and plate",
18 | }
19 |
20 | var testSet = []string{
21 | "hey diddle diddle",
22 | "the cat and the fiddle",
23 | "the cow jumped over the moon",
24 | "the quick brown fox jumped over the. Lazy dog",
25 | "The little dog laughed to see such fun",
26 | "The dish ran away with the spoon",
27 | }
28 |
29 | func TestCountVectoriserFit(t *testing.T) {
30 | var tests = []struct {
31 | train []string
32 | stop []string
33 | vocabSize int
34 | }{
35 | {trainSet, []string{}, 26},
36 | {trainSet[0:1], []string{}, 8},
37 | {trainSet, stopWords, 18},
38 | }
39 |
40 | for testRun, test := range tests {
41 | t.Logf("**** Test Run %d.\n", testRun+1)
42 | vectoriser := NewCountVectoriser(test.stop...)
43 |
44 | vectoriser.Fit(test.train...)
45 |
46 | if len(vectoriser.Vocabulary) != test.vocabSize {
47 | t.Logf("Expected training dataset %v of size %d but found vocabulary %v of size %d",
48 | test.train, test.vocabSize, vectoriser.Vocabulary, len(vectoriser.Vocabulary))
49 | t.Fail()
50 | }
51 | }
52 | }
53 | func TestCountVectoriserTransform(t *testing.T) {
54 | var tests = []struct {
55 | train []string
56 | vocabSize int
57 | stop []string
58 | test []string
59 | }{
60 | {trainSet, 26, []string{}, testSet},
61 | {trainSet[0:1], 8, []string{}, testSet[0:3]},
62 | {testSet, 26, []string{}, testSet},
63 | {testSet, 19, stopWords, testSet},
64 | }
65 |
66 | for testRun, test := range tests {
67 | t.Logf("**** Test Run %d.\n", testRun+1)
68 |
69 | vectoriser := NewCountVectoriser(test.stop...)
70 | vectoriser.Fit(test.train...)
71 |
72 | vec, err := vectoriser.Transform(test.test...)
73 |
74 | if err != nil {
75 | t.Errorf("Error fitting and applying vectoriser caused by %v", err)
76 | }
77 |
78 | m, n := vec.Dims()
79 |
80 | if m != test.vocabSize || n != len(test.test) {
81 | t.Logf("Expected matrix %d x %d but found %d x %d", test.vocabSize, len(test.test), m, n)
82 | t.Fail()
83 | }
84 | }
85 | }
86 |
87 | func TestHashingVectoriserTransform(t *testing.T) {
88 | var tests = []struct {
89 | train []string
90 | nnz int
91 | features int
92 | stop []string
93 | test []string
94 | }{
95 | {trainSet, 33, 260000, []string{}, testSet},
96 | {trainSet[0:1], 11, 260000, []string{}, testSet[0:3]},
97 | {testSet, 33, 260001, []string{}, testSet},
98 | {testSet, 21, 260000, stopWords, testSet},
99 | }
100 |
101 | for testRun, test := range tests {
102 | t.Logf("**** Test Run %d.\n", testRun+1)
103 | vectoriser := NewHashingVectoriser(test.features, test.stop...)
104 | vectoriser.Fit(test.train...)
105 |
106 | vec, err := vectoriser.Transform(test.test...)
107 |
108 | if err != nil {
109 | t.Errorf("Error fitting and applying vectoriser caused by %v", err)
110 | }
111 |
112 | m, n := vec.Dims()
113 |
114 | if m != test.features || n != len(test.test) || vec.(sparse.Sparser).NNZ() != test.nnz {
115 | t.Logf("Expected matrix %d x %d with NNZ = %d but found %d x %d with NNZ = %d",
116 | test.features,
117 | len(test.test),
118 | test.nnz,
119 | m, n,
120 | vec.(sparse.Sparser).NNZ())
121 | t.Fail()
122 | }
123 | }
124 | }
125 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Natural Language Processing
2 | [](https://opensource.org/licenses/MIT)
3 | [](https://godoc.org/github.com/james-bowman/nlp)
4 | [](https://travis-ci.org/james-bowman/nlp)
5 | [](https://goreportcard.com/report/github.com/james-bowman/nlp)
6 | [](https://codecov.io/gh/james-bowman/nlp)
7 | [](https://github.com/avelino/awesome-go)
8 | [](https://sourcegraph.com/github.com/james-bowman/nlp?badge)
9 |
10 |
11 |
12 |
13 | Implementations of selected machine learning algorithms for natural language processing in golang. The primary focus for the package is the statistical semantics of plain-text documents supporting semantic analysis and retrieval of semantically similar documents.
14 |
15 | Built upon the [Gonum](https://www.gonum.org/) package for linear algebra and scientific computing with some inspiration taken from Python's [scikit-learn](http://scikit-learn.org/stable/) and [Gensim](https://radimrehurek.com/gensim/).
16 |
17 | Check out [the companion blog post](http://www.jamesbowman.me/post/semantic-analysis-of-webpages-with-machine-learning-in-go/) or [the Go documentation page](https://godoc.org/github.com/james-bowman/nlp) for full usage and examples.
18 |
19 |
20 |
21 | ## Features
22 |
23 | * [LSA (Latent Semantic Analysis aka Latent Semantic Indexing (LSI))][LSA] implementation using truncated [SVD (Singular Value Decomposition)](https://en.wikipedia.org/wiki/Singular-value_decomposition) for dimensionality reduction.
24 | * Fast comparison and retrieval of semantically similar documents using the [SimHash](https://en.wikipedia.org/wiki/SimHash) (random hyperplanes/[sign random projection](https://en.wikipedia.org/wiki/Locality-sensitive_hashing#Random_projection)) algorithm with multi-index and Forest schemes for [LSH (Locality Sensitive Hashing)](https://en.wikipedia.org/wiki/Locality-sensitive_hashing) to support fast, approximate cosine similarity/angular distance comparisons and approximate nearest neighbour search using significantly less memory and processing time.
25 | * [Random Indexing (RI)](https://en.wikipedia.org/wiki/Random_indexing) and Reflective Random Indexing (RRI) (which extends RI to support indirect inference) for scalable [Latent Semantic Analysis (LSA)][LSA] over large, web-scale corpora.
26 | * [Latent Dirichlet Allocation (LDA)](https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation) using a parallelised implementation of the fast [SCVB0 (Stochastic Collapsed Variational Bayesian inference)][SCVB0] algorithm for unsupervised topic extraction.
27 | * [PCA (Principal Component Analysis)](https://en.wikipedia.org/wiki/Principal_component_analysis)
28 | * [TF-IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf) weighting to account for frequently occurring words
29 | * [Sparse matrix](http://github.com/james-bowman/sparse) implementations used for more efficient memory usage and processing over large document corpora.
30 | * Stop word removal to remove frequently occurring English words e.g. "the", "and"
31 | * [Feature hashing](https://en.wikipedia.org/wiki/Feature_hashing) ('the hashing trick') implementation (using [MurmurHash3](http://github.com/spaolacci/murmur3)) for reduced memory requirements and reduced reliance on training data
32 | * Similarity/distance measures to calculate the similarity/distance between feature vectors.
33 |
34 | ## Planned
35 |
36 | * Expanded persistence support
37 | * Stemming to treat words with common root as the same e.g. "go" and "going"
38 | * Clustering algorithms e.g. Hierarchical, K-means, etc.
39 | * Classification algorithms e.g. SVM, KNN, random forest, etc.
40 |
41 | ## References
42 |
43 | 1. [Rosario, Barbara. Latent Semantic Indexing: An overview. INFOSYS 240 Spring 2000](http://people.ischool.berkeley.edu/~rosario/projects/LSI.pdf)
44 | 1. [Latent Semantic Analysis, a scholarpedia article on LSA written by Tom Landauer, one of the creators of LSA.](http://www.scholarpedia.org/article/Latent_semantic_analysis)
45 | 1. [Thomo, Alex. Latent Semantic Analysis (Tutorial).](http://webhome.cs.uvic.ca/~thomo/svd.pdf)
46 | 1. [Latent Semantic Indexing. Stanford NLP Course](http://nlp.stanford.edu/IR-book/html/htmledition/latent-semantic-indexing-1.html)
47 | 1. [Charikar, Moses S. "Similarity Estimation Techniques from Rounding Algorithms" in Proceedings of the thirty-fourth annual ACM symposium on Theory of computing - STOC ’02, 2002, p. 380.](https://www.cs.princeton.edu/courses/archive/spr04/cos598B/bib/CharikarEstim.pdf)
48 | 1. [M. Bawa, T. Condie, and P. Ganesan, “LSH forest: self-tuning indexes for similarity search,” Proc. 14th Int. Conf. World Wide Web - WWW ’05, p. 651, 2005.](http://dl.acm.org/citation.cfm?id=1060745.1060840)
49 | 1. [A. Gionis, P. Indyk, and R. Motwani, “Similarity Search in High Dimensions via Hashing,” VLDB ’99 Proc. 25th Int. Conf. Very Large Data Bases, vol. 99, no. 1, pp. 518–529, 1999.](http://www.cs.princeton.edu/courses/archive/spring13/cos598C/Gionis.pdf%5Cnhttp://portal.acm.org/citation.cfm?id=671516)
50 | 1. [Kanerva, Pentti, Kristoferson, Jan and Holst, Anders (2000). Random Indexing of Text Samples for Latent Semantic Analysis](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.4.6523&rep=rep1&type=pdf)
51 | 1. [Rangan, Venkat. Discovery of Related Terms in a corpus using Reflective Random Indexing](https://www.umiacs.umd.edu/~oard/desi4/papers/rangan.pdf)
52 | 1. [Vasuki, Vidya and Cohen, Trevor. Reflective random indexing for semi-automatic indexing of the biomedical literature](https://ac.els-cdn.com/S1532046410000481/1-s2.0-S1532046410000481-main.pdf?_tid=f31f92e8-028a-11e8-8c31-00000aab0f6c&acdnat=1516965824_e24a804445fff1744281ca6f5898a3a4)
53 | 1. [QasemiZadeh, Behrang and Handschuh, Siegfried. Random Indexing Explained with High Probability](http://pars.ie/publications/papers/pre-prints/random-indexing-dr-explained.pdf)
54 | 1. [Foulds, James; Boyles, Levi; Dubois, Christopher; Smyth, Padhraic; Welling, Max (2013). Stochastic Collapsed Variational Bayesian Inference for Latent Dirichlet Allocation][SCVB0]
55 |
56 |
59 |
60 | [LSA]: https://en.wikipedia.org/wiki/Latent_semantic_analysis
61 | [SCVB0]: https://arxiv.org/pdf/1305.2452
62 |
--------------------------------------------------------------------------------
/dimreduction.go:
--------------------------------------------------------------------------------
1 | package nlp
2 |
3 | import (
4 | "encoding/binary"
5 | "fmt"
6 | "io"
7 |
8 | "github.com/james-bowman/sparse"
9 | "gonum.org/v1/gonum/mat"
10 | "gonum.org/v1/gonum/stat"
11 | )
12 |
13 | // TruncatedSVD implements the Singular Value Decomposition factorisation of matrices.
14 | // This produces an approximation of the input matrix at a lower rank. This is a core
15 | // component of LSA (Latent Semantic Analysis)
16 | type TruncatedSVD struct {
17 | // Components is the truncated term matrix (matrix U of the Singular Value Decomposition
18 | // (A=USV^T)). The matrix will be of size m, k where m = the number of unique terms
19 | // in the training data and k = the number of elements to truncate to (specified by
20 | // attribute K) or m or n (the number of documents in the training data) whichever of
21 | // the 3 values is smaller.
22 | Components *mat.Dense
23 |
24 | // K is the number of dimensions to which the output, transformed, matrix should be
25 | // truncated to. The matrix output by the FitTransform() and Transform() methods will
26 | // be n rows by min(m, n, K) columns, where n is the number of columns in the original,
27 | // input matrix and min(m, n, K) is the lowest value of m, n, K where m is the number of
28 | // rows in the original, input matrix.
29 | K int
30 | }
31 |
32 | // NewTruncatedSVD creates a new TruncatedSVD transformer with K (the truncated
33 | // dimensionality) being set to the specified value k
34 | func NewTruncatedSVD(k int) *TruncatedSVD {
35 | return &TruncatedSVD{K: k}
36 | }
37 |
38 | // Fit performs the SVD factorisation on the input training data matrix, mat, and
39 | // stores the output term matrix as a transform to apply to matrices passed to the Transform() method.
40 | func (t *TruncatedSVD) Fit(mat mat.Matrix) Transformer {
41 | if _, err := t.FitTransform(mat); err != nil {
42 | panic("nlp: Failed to fit truncated SVD because " + err.Error())
43 | }
44 | return t
45 | }
46 |
47 | // Transform applies the transform decomposed from the training data matrix in Fit()
48 | // to the input matrix. The resulting output matrix will be the closest approximation
49 | // to the input matrix at a reduced rank. The returned matrix is a dense matrix type.
50 | func (t *TruncatedSVD) Transform(m mat.Matrix) (mat.Matrix, error) {
51 | var product mat.Dense
52 |
53 | product.Mul(t.Components.T(), m)
54 |
55 | return &product, nil
56 | }
57 |
58 | // FitTransform is approximately equivalent to calling Fit() followed by Transform()
59 | // on the same matrix. This is a useful shortcut where separate training data is not being
60 | // used to fit the model i.e. the model is fitted on the fly to the test data.
61 | // The returned matrix is a dense matrix type.
62 | func (t *TruncatedSVD) FitTransform(m mat.Matrix) (mat.Matrix, error) {
63 | var svd mat.SVD
64 | if ok := svd.Factorize(m, mat.SVDThin); !ok {
65 | return nil, fmt.Errorf("Failed SVD Factorisation of working matrix")
66 | }
67 | s, u, v := t.extractSVD(&svd)
68 |
69 | r, c := m.Dims()
70 | min := minimum(t.K, r, c)
71 |
72 | // truncate U and V matrices to k << min(m, n)
73 | uk := u.Slice(0, r, 0, min)
74 | vk := v.Slice(0, c, 0, min)
75 |
76 | t.Components = uk.(*mat.Dense)
77 |
78 | // multiply Sigma by transpose of V. As sigma is a symmetrical (square) diagonal matrix it is
79 | // more efficient to simply multiply each element from the array of diagonal values with each
80 | // element from the matrix V rather than multiplying out the non-zero values from off the diagonal.
81 | var product mat.Dense
82 | product.Apply(func(i, j int, v float64) float64 {
83 | return (v * s[i])
84 | }, vk.T())
85 |
86 | return &product, nil
87 | }
88 |
89 | func minimum(k, m, n int) int {
90 | return min(k, min(m, n))
91 | }
92 |
93 | func min(m, n int) int {
94 | if m < n {
95 | return m
96 | }
97 | return n
98 | }
99 |
100 | func (t *TruncatedSVD) extractSVD(svd *mat.SVD) (s []float64, u, v *mat.Dense) {
101 | var um, vm mat.Dense
102 | svd.UTo(&um)
103 | svd.VTo(&vm)
104 | s = svd.Values(nil)
105 | return s, &um, &vm
106 | }
107 |
108 | // Save binary serialises the model and writes it into w. This is useful for persisting
109 | // a trained model to disk so that it may be loaded (using the Load() method) in another
110 | // context (e.g. production) for reproducible results.
111 | func (t TruncatedSVD) Save(w io.Writer) error {
112 | var buf [8]byte
113 | binary.LittleEndian.PutUint64(buf[:], uint64(t.K))
114 | if _, err := w.Write(buf[:]); err != nil {
115 | return err
116 | }
117 |
118 | _, err := t.Components.MarshalBinaryTo(w)
119 |
120 | return err
121 | }
122 |
123 | // Load binary deserialises the previously serialised model into the receiver. This is
124 | // useful for loading a previously trained and saved model from another context
125 | // (e.g. offline training) for use within another context (e.g. production) for
126 | // reproducible results. Load should only be performed with trusted data.
127 | func (t *TruncatedSVD) Load(r io.Reader) error {
128 | var n int
129 | var buf [8]byte
130 | var err error
131 | for n < len(buf) && err == nil {
132 | var nn int
133 | nn, err = r.Read(buf[n:])
134 | n += nn
135 | }
136 | if err == io.EOF {
137 | return io.ErrUnexpectedEOF
138 | }
139 | if err != nil {
140 | return err
141 | }
142 | k := int(binary.LittleEndian.Uint64(buf[:]))
143 |
144 | var model mat.Dense
145 | if _, err := model.UnmarshalBinaryFrom(r); err != nil {
146 | return err
147 | }
148 |
149 | t.K = k
150 | t.Components = &model
151 |
152 | return nil
153 | }
154 |
155 | // PCA calculates the principal components of a matrix, i.e. the axes of greatest variance,
156 | // and then projects matrices onto those axes.
157 | // See https://en.wikipedia.org/wiki/Principal_component_analysis for further details.
158 | type PCA struct {
159 | // K is the number of components
160 | K int
161 | pc *stat.PC
162 | }
163 |
164 | // NewPCA constructs a new Principal Component Analysis transformer to reduce dimensionality by
165 | // projecting matrices onto the axes of greatest variance.
166 | func NewPCA(k int) *PCA {
167 | return &PCA{K: k, pc: &stat.PC{}}
168 | }
169 |
170 | // Fit calculates the principal component directions (axes of greatest variance) within the
171 | // training data which can then be used to project matrices onto those principal components using
172 | // the Transform() method.
173 | func (p *PCA) Fit(m mat.Matrix) Transformer {
174 | if ok := p.pc.PrincipalComponents(m.T(), nil); !ok {
175 | panic("nlp: PCA analysis failed during fitting")
176 | }
177 |
178 | return p
179 | }
180 |
181 | // Transform projects the matrix onto the first K principal components calculated during training
182 | // (the Fit() method). The returned matrix will be of reduced dimensionality compared to the input
183 | // (K x c compared to r x c of the input).
184 | func (p *PCA) Transform(m mat.Matrix) (mat.Matrix, error) {
185 | r, _ := m.Dims()
186 |
187 | 	// project the input matrix onto the first K principal components
188 | var proj sparse.CSR
189 | var dst mat.Dense
190 | p.pc.VectorsTo(&dst)
191 | proj.Mul(m.T(), dst.Slice(0, r, 0, p.K))
192 |
193 | 	// The input matrix m is r x c so m.T() is c x r. The principal component
194 | 	// vectors matrix has r rows (one per input row/variable) and is sliced to
195 | 	// its first K columns giving an r x K matrix. The product is therefore
196 | 	// (c x r) * (r x K) = c x K which is transposed to K x c on return so
197 | 	// that, as with the input, each column of the output continues to
198 | 	// represent a single observation (e.g. document).
199 | return proj.T(), nil
200 | }
201 |
202 | // FitTransform is approximately equivalent to calling Fit() followed by Transform()
203 | // on the same matrix. This is a useful shortcut where separate training data is not being
204 | // used to fit the model i.e. the model is fitted on the fly to the test data.
205 | func (p *PCA) FitTransform(m mat.Matrix) (mat.Matrix, error) {
206 | return p.Fit(m).Transform(m)
207 | }
208 |
209 | // ExplainedVariance returns a slice of float64 values representing the variances of the
210 | // principal component scores.
211 | func (p *PCA) ExplainedVariance() []float64 {
212 | return p.pc.VarsTo(nil)
213 | }
214 |
--------------------------------------------------------------------------------
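A minimal usage sketch for the TruncatedSVD persistence methods above, assuming the package's NewTruncatedSVD(k) constructor referenced later in the Pipeline documentation; the file name and matrix values are illustrative only:

package main

import (
	"os"

	"github.com/james-bowman/nlp"
	"gonum.org/v1/gonum/mat"
)

func main() {
	// fit the model to a small, illustrative term document matrix
	tdm := mat.NewDense(4, 3, []float64{
		1, 0, 0,
		0, 2, 0,
		1, 1, 0,
		0, 0, 3,
	})
	svd := nlp.NewTruncatedSVD(2)
	if _, err := svd.FitTransform(tdm); err != nil {
		panic(err)
	}

	// persist the fitted model (K and the component matrix) to disk
	f, err := os.Create("svd.model")
	if err != nil {
		panic(err)
	}
	if err := svd.Save(f); err != nil {
		panic(err)
	}
	f.Close()

	// later, possibly in a different process, restore the model and reuse it
	f, err = os.Open("svd.model")
	if err != nil {
		panic(err)
	}
	defer f.Close()
	restored := &nlp.TruncatedSVD{}
	if err := restored.Load(f); err != nil {
		panic(err)
	}

	// project matrices with the same number of rows (terms) into the reduced
	// 2 dimensional space learnt during fitting
	if _, err := restored.Transform(tdm); err != nil {
		panic(err)
	}
}

Note that Load only needs an empty receiver, as it restores both K and the component matrix from the stream.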
/index.go:
--------------------------------------------------------------------------------
1 | package nlp
2 |
3 | import (
4 | "container/heap"
5 | "sync"
6 |
7 | "github.com/james-bowman/nlp/measures/pairwise"
8 | "github.com/james-bowman/sparse"
9 | "gonum.org/v1/gonum/mat"
10 | )
11 |
12 | // Match represents a matching item for nearest neighbour similarity searches.
13 | // It contains both the ID of the matching item and the distance from the queried item.
14 | // The distance is represented as a score from 0 (exact match) to 1 (orthogonal)
15 | // depending upon the metric used.
16 | type Match struct {
17 | Distance float64
18 | ID interface{}
19 | }
20 |
21 | // resultHeap is a max-heap (priority queue) keyed on distance, used to compile the top-k matches whilst
22 | // performing nearest neighbour similarity searches.
23 | type resultHeap struct {
24 | matches []Match
25 | }
26 |
27 | func (r resultHeap) Len() int { return len(r.matches) }
28 |
29 | func (r resultHeap) Less(i, j int) bool { return r.matches[i].Distance > r.matches[j].Distance }
30 |
31 | func (r resultHeap) Swap(i, j int) { r.matches[i], r.matches[j] = r.matches[j], r.matches[i] }
32 |
33 | func (r *resultHeap) Push(x interface{}) {
34 | r.matches = append(r.matches, x.(Match))
35 | }
36 |
37 | func (r *resultHeap) Pop() interface{} {
38 | old := r.matches
39 | n := len(old)
40 | x := old[n-1]
41 | r.matches = old[0 : n-1]
42 | return x
43 | }
44 |
45 | // Indexer indexes vectors to support Nearest Neighbour (NN) similarity searches across
46 | // the indexed vectors.
47 | type Indexer interface {
48 | Index(v mat.Vector, id interface{})
49 | Search(q mat.Vector, k int) []Match
50 | 	Remove(id interface{})
51 | }
52 |
53 | // LinearScanIndex supports Nearest Neighbour (NN) similarity searches across indexed
54 | // vectors performing queries in O(n) and requiring O(n) storage. As the name implies,
55 | // LinearScanIndex performs a linear scan across all indexed vectors comparing them
56 | // each in turn with the specified query vector using the configured pairwise distance
57 | // metric. LinearScanIndex is accurate and will always return the true top-k nearest
58 | // neighbours as opposed to some other types of index, like LSHIndex,
59 | // which perform Approximate Nearest Neighbour (ANN) searches and trade some recall
60 | // accuracy for performance over large scale datasets.
61 | type LinearScanIndex struct {
62 | lock sync.RWMutex
63 | signatures []mat.Vector
64 | ids []interface{}
65 | distance pairwise.Comparer
66 | }
67 |
68 | // NewLinearScanIndex constructs a new empty LinearScanIndex which will use the specified
69 | // pairwise distance metric to determine nearest neighbours based on similarity.
70 | func NewLinearScanIndex(compareFN pairwise.Comparer) *LinearScanIndex {
71 | return &LinearScanIndex{distance: compareFN}
72 | }
73 |
74 | // Index adds the specified vector v with associated id to the index.
75 | func (b *LinearScanIndex) Index(v mat.Vector, id interface{}) {
76 | b.lock.Lock()
77 | b.signatures = append(b.signatures, v)
78 | b.ids = append(b.ids, id)
79 | b.lock.Unlock()
80 | }
81 |
82 | // Search searches for the top-k nearest neighbours in the index. The method
83 | // returns up to the top-k most similar items in unsorted order. The method may
84 | // return fewer than k items if fewer than k neighbours are found.
85 | func (b *LinearScanIndex) Search(qv mat.Vector, k int) []Match {
86 | b.lock.RLock()
87 | defer b.lock.RUnlock()
88 |
89 | size := len(b.signatures)
90 |
91 | var point int
92 | var results resultHeap
93 | results.matches = make([]Match, 0, k)
94 |
95 | for point = 0; point < k && point < size; point++ {
96 | mv := b.signatures[point]
97 | match := Match{Distance: b.distance(qv, mv), ID: b.ids[point]}
98 | results.matches = append(results.matches, match)
99 | }
100 | if len(results.matches) < k {
101 | return results.matches
102 | }
103 | heap.Init(&results)
104 | var dist float64
105 | for i := point; i < size; i++ {
106 | mv := b.signatures[i]
107 | dist = b.distance(qv, mv)
108 | if dist <= results.matches[0].Distance {
109 | heap.Pop(&results)
110 | heap.Push(&results, Match{Distance: dist, ID: b.ids[i]})
111 | }
112 | }
113 |
114 | return results.matches
115 | }
116 |
117 | // Remove removes the vector with the specified id from the index. If no vector
118 | // is found with the specified id the method will simply do nothing.
119 | func (b *LinearScanIndex) Remove(id interface{}) {
120 | b.lock.Lock()
121 | defer b.lock.Unlock()
122 |
123 | for i, v := range b.ids {
124 | if v == id {
125 | copy(b.signatures[i:], b.signatures[i+1:])
126 | b.signatures[len(b.signatures)-1] = nil
127 | b.signatures = b.signatures[:len(b.signatures)-1]
128 |
129 | copy(b.ids[i:], b.ids[i+1:])
130 | b.ids[len(b.ids)-1] = nil
131 | b.ids = b.ids[:len(b.ids)-1]
132 |
133 | return
134 | }
135 | }
136 | }
137 |
138 | // Hasher interface represents a Locality Sensitive Hashing algorithm whereby
139 | // the proximity of data points is preserved in the hash space i.e. similar data
140 | // points will be hashed to values close together in the hash space.
141 | type Hasher interface {
142 | // Hash hashes the input vector into a BinaryVector hash representation
143 | Hash(mat.Vector) *sparse.BinaryVec
144 | }
145 |
146 | // LSHScheme interface represents LSH indexing schemes to support Approximate Nearest
147 | // Neighbour (ANN) search.
148 | type LSHScheme interface {
149 | // Put stores the specified LSH signature and associated ID in the LSH index
150 | Put(id interface{}, signature *sparse.BinaryVec)
151 |
152 | // GetCandidates returns the IDs of candidate nearest neighbours. It is up to
153 | // the calling code to further filter these candidates based on distance to arrive
154 | // at the top-k approximate nearest neighbours. The number of candidates returned
155 | // may be smaller or larger than k.
156 | GetCandidates(query *sparse.BinaryVec, k int) []interface{}
157 |
158 | // Remove removes the specified item from the LSH index
159 | Remove(id interface{})
160 | }
161 |
162 | // LSHIndex is an LSH (Locality Sensitive Hashing) based index supporting Approximate
163 | // Nearest Neighbour (ANN) search in O(log n). The storage required by the index will
164 | // depend upon the underlying LSH scheme used but will typically be higher than O(n).
165 | // In use cases where accurate Nearest Neighbour search is required other types of
166 | // index should be considered like LinearScanIndex.
167 | type LSHIndex struct {
168 | lock sync.RWMutex
169 | isApprox bool
170 | hasher Hasher
171 | scheme LSHScheme
172 | signatures map[interface{}]mat.Vector
173 | distance pairwise.Comparer
174 | }
175 |
176 | // NewLSHIndex creates a new LSHIndex. When queried, the initial candidate
177 | // nearest neighbours returned by the underlying LSH indexing algorithm
178 | // are further filtered by comparing distances to the query vector using the supplied
179 | // distance metric. If approx is true, the filtering comparison is performed on the
180 | // hashes and if approx is false, then the comparison is performed on the original
181 | // vectors instead. This will have time and storage implications as comparing the
182 | // original vectors will be more accurate but slower and require the original vectors
183 | // be stored for the comparison. The LSH hashing algorithm and the underlying LSH indexing
184 | // scheme are specified via the hasher and store parameters respectively.
185 | func NewLSHIndex(approx bool, hasher Hasher, store LSHScheme, distance pairwise.Comparer) *LSHIndex {
186 | index := LSHIndex{
187 | isApprox: approx,
188 | hasher: hasher,
189 | scheme: store,
190 | signatures: make(map[interface{}]mat.Vector),
191 | distance: distance,
192 | }
193 |
194 | return &index
195 | }
196 |
197 | // Index indexes the supplied vector along with its associated ID.
198 | func (l *LSHIndex) Index(v mat.Vector, id interface{}) {
199 | h := l.hasher.Hash(v)
200 |
201 | l.lock.Lock()
202 | defer l.lock.Unlock()
203 |
204 | l.scheme.Put(id, h)
205 | if l.isApprox {
206 | l.signatures[id] = h
207 | } else {
208 | l.signatures[id] = v
209 | }
210 | }
211 |
212 | // Search searches for the top-k approximate nearest neighbours in the index. The
213 | // method returns up to the top-k most similar items in unsorted order. The method may
214 | // return fewer than k items if fewer than k neighbours are found.
215 | func (l *LSHIndex) Search(q mat.Vector, k int) []Match {
216 | hv := l.hasher.Hash(q)
217 |
218 | l.lock.RLock()
219 | defer l.lock.RUnlock()
220 |
221 | candidateIDs := l.scheme.GetCandidates(hv, k)
222 | size := len(candidateIDs)
223 |
224 | var qv mat.Vector
225 | if l.isApprox {
226 | qv = hv
227 | } else {
228 | qv = q
229 | }
230 |
231 | var point int
232 | var results resultHeap
233 | results.matches = make([]Match, 0, k)
234 |
235 | for point = 0; point < k && point < size; point++ {
236 | mv := l.signatures[candidateIDs[point]]
237 | match := Match{Distance: l.distance(qv, mv), ID: candidateIDs[point]}
238 | results.matches = append(results.matches, match)
239 | }
240 | if len(results.matches) < k {
241 | return results.matches
242 | }
243 | heap.Init(&results)
244 | var dist float64
245 | for i := point; i < size; i++ {
246 | mv := l.signatures[candidateIDs[i]]
247 | dist = l.distance(qv, mv)
248 | if dist <= results.matches[0].Distance {
249 | heap.Pop(&results)
250 | heap.Push(&results, Match{Distance: dist, ID: candidateIDs[i]})
251 | }
252 | }
253 |
254 | return results.matches
255 | }
256 |
257 | // Remove removes the vector with the specified id from the index. If no vector
258 | // is found with the specified id the method will simply do nothing.
259 | func (l *LSHIndex) Remove(id interface{}) {
260 | l.lock.Lock()
261 | defer l.lock.Unlock()
262 |
263 | delete(l.signatures, id)
264 | l.scheme.Remove(id)
265 | }
266 |
--------------------------------------------------------------------------------
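A minimal sketch of LinearScanIndex in use, assuming pairwise.Comparer has the signature func(a, b mat.Vector) float64; a plain Euclidean distance is used here in place of one of the measures from the pairwise package, and the vectors and IDs are illustrative only:

package main

import (
	"fmt"
	"math"

	"github.com/james-bowman/nlp"
	"gonum.org/v1/gonum/mat"
)

func main() {
	// Euclidean distance: 0 for identical vectors, growing as vectors diverge,
	// which matches the index's convention of keeping the k smallest distances
	euclidean := func(a, b mat.Vector) float64 {
		var sum float64
		for i := 0; i < a.Len(); i++ {
			d := a.AtVec(i) - b.AtVec(i)
			sum += d * d
		}
		return math.Sqrt(sum)
	}

	index := nlp.NewLinearScanIndex(euclidean)

	// index three vectors, each associated with an ID
	index.Index(mat.NewVecDense(3, []float64{1, 0, 0}), "doc1")
	index.Index(mat.NewVecDense(3, []float64{0, 1, 0}), "doc2")
	index.Index(mat.NewVecDense(3, []float64{1, 1, 0}), "doc3")

	// retrieve up to the 2 nearest neighbours to the query (unsorted)
	for _, m := range index.Search(mat.NewVecDense(3, []float64{1, 0.1, 0}), 2) {
		fmt.Printf("%v (distance %.3f)\n", m.ID, m.Distance)
	}
}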
/lsh.go:
--------------------------------------------------------------------------------
1 | package nlp
2 |
3 | import (
4 | "fmt"
5 | "strings"
6 |
7 | radix "github.com/armon/go-radix"
8 | "github.com/james-bowman/sparse"
9 | )
10 |
11 | // lshTableBucket represents a hash table bucket used for ClassicLSH. The bucket
12 | // is a slice of IDs relating to items whose hash maps to the bucket.
13 | type lshTableBucket []interface{}
14 |
15 | // lshTable is a hash table used for ClassicLSH. It is simply a map of hash codes
16 | // (one uint64 key per table) to lshTableBuckets containing the IDs of the items
17 | // whose signatures hash to that bucket.
18 | type lshTable map[uint64]lshTableBucket
19 |
20 | // remove removes the specified item from the LSH table
21 | func (t lshTable) remove(id interface{}) {
22 | for key, bucketContents := range t {
23 | for j, indexedID := range bucketContents {
24 | if id == indexedID {
25 | bucketContents[j] = bucketContents[len(bucketContents)-1]
26 | t[key] = bucketContents[:len(bucketContents)-1]
27 | if len(t[key]) == 0 {
28 | delete(t, key)
29 | }
30 | return
31 | }
32 | }
33 | }
34 | }
35 |
36 | // ClassicLSH supports finding top-k Approximate Nearest Neighbours (ANN) using Locality
37 | // Sensitive Hashing (LSH). Classic LSH scheme is based on using hash tables to store
38 | // items by their locality sensitive hash code based on the work of A. Gionis et al.
39 | // Items that map to the same bucket (their hash codes collide) are similar. Multiple
40 | // hash tables are used to improve recall where some similar items would otherwise
41 | // hash to separate, neighbouring buckets in only a single table.
42 | //
43 | // A. Gionis, P. Indyk, and R. Motwani, “Similarity Search in High Dimensions via
44 | // Hashing,” VLDB ’99 Proc. 25th Int. Conf. Very Large Data Bases, vol. 99, no. 1,
45 | // pp. 518–529, 1999.
46 | // http://www.cs.princeton.edu/courses/archive/spring13/cos598C/Gionis.pdf and http://portal.acm.org/citation.cfm?id=671516
47 | type ClassicLSH struct {
48 | numHashtables int
49 | numHashfunctions int
50 | reqLen int
51 | hashTables []lshTable
52 | }
53 |
54 | // NewClassicLSH creates a new ClassicLSH with the configured number of hash tables
55 | // and hash functions per table. The length of hash signatures used in this type's
56 | // methods (Put() and GetCandidates()) must be exactly equal to functions * tables.
57 | // The Classic LSH algorithm uses multiple hash tables to improve recall for similar
58 | // items that hash to nearby buckets within a specific hash table.
59 | func NewClassicLSH(functions, tables int) *ClassicLSH {
60 | hashtables := make([]lshTable, tables)
61 | for i := range hashtables {
62 | hashtables[i] = make(map[uint64]lshTableBucket)
63 | }
64 |
65 | return &ClassicLSH{
66 | reqLen: tables * functions,
67 | numHashtables: tables,
68 | numHashfunctions: functions,
69 | hashTables: hashtables,
70 | }
71 | }
72 |
73 | // Put stores the specified LSH signature and associated ID in the LSH index.
74 | // The method panics if the signature is not the same length as tables * functions.
75 | func (l *ClassicLSH) Put(id interface{}, signature *sparse.BinaryVec) {
76 | keys := l.hashKeysForSignature(signature)
77 | for i := range l.hashTables {
78 | l.hashTables[i][keys[i]] = append(l.hashTables[i][keys[i]], id)
79 | }
80 | }
81 |
82 | // GetCandidates returns the IDs of candidate nearest neighbours. It is up to
83 | // the calling code to further filter these candidates based on distance to arrive
84 | // at the top-k approximate nearest neighbours. The number of candidates returned
85 | // may be smaller or larger than k. The method panics if the signature is not the
86 | // same length as tables * functions.
87 | func (l *ClassicLSH) GetCandidates(query *sparse.BinaryVec, k int) []interface{} {
88 | keys := l.hashKeysForSignature(query)
89 |
90 | seen := make(map[interface{}]struct{})
91 | for i, table := range l.hashTables {
92 | if bucketEntries, exist := table[keys[i]]; exist {
93 | for _, id := range bucketEntries {
94 | seen[id] = struct{}{}
95 | }
96 | }
97 | }
98 |
99 | // Collect results
100 | ids := make([]interface{}, len(seen))
101 | var i int
102 | for index := range seen {
103 | ids[i] = index
104 | i++
105 | }
106 |
107 | return ids
108 | }
109 |
110 | // Remove removes the specified item from the LSH index
111 | func (l *ClassicLSH) Remove(id interface{}) {
112 | for _, table := range l.hashTables {
113 | table.remove(id)
114 | }
115 | }
116 |
117 | // hashKeysForSignature chunks the hash into a number of smaller hash codes (one per
118 | // table) each the length of the configured number of hash functions per table.
119 | // The method panics if the signature is not the same length as tables * functions.
120 | func (l *ClassicLSH) hashKeysForSignature(signature *sparse.BinaryVec) []uint64 {
121 | // TODO: rather than simply chunking up the hash signature into k/l chunks
122 | // possibly select hash functions (digits) uniformly at random (with replacement?)
123 | if signature.Len() != l.reqLen {
124 | panic(fmt.Sprintf("nlp: Specified signature is not the correct length. Needed %d but received %d", l.reqLen, signature.Len()))
125 | }
126 | keys := make([]uint64, l.numHashtables)
127 | for i := range keys {
128 | 		// take the i-th chunk of numHashfunctions bits from the signature as the key for table i
129 | 		keys[i] = signature.SliceToUint64(i*l.numHashfunctions, (i+1)*l.numHashfunctions)
130 | }
131 | return keys
132 | }
133 |
134 | // hashKeysForSignature chunks the hash into a number of smaller hash codes (one per
135 | // table) each the length of the configured number of hash functions per table.
136 | // The method panics if the signature is not the same length as tables * functions.
137 | // func (l *ClassicLSH) hashKeysForSignature(signature *sparse.BinaryVec) []string {
138 | // // TODO: rather than simply chunking up the hash signature into k/l chunks
139 | // // possibly select hash functions (digits) uniformly at random (with replacement?)
140 | // if signature.Len() != l.reqLen {
141 | // panic(fmt.Sprintf("nlp: Specified signature is not the correct length. Needed %d but received %d", l.reqLen, signature.Len()))
142 | // }
143 | // keys := make([]string, l.numHashtables)
144 | // key := signature.String()
145 | // for i := range keys {
146 | // keys[i] = key[i*l.numHashfunctions : (i+1)*l.numHashfunctions]
147 | // }
148 | // return keys
149 | // }
150 |
151 | // LSHForest is an implementation of the LSH Forest Locality Sensitive Hashing scheme
152 | // based on the work of M. Bawa et al.
153 | //
154 | // M. Bawa, T. Condie, and P. Ganesan, “LSH forest: self-tuning indexes for
155 | // similarity search,” Proc. 14th Int. Conf. World Wide Web - WWW ’05, p. 651, 2005.
156 | // http://dl.acm.org/citation.cfm?id=1060745.1060840
157 | type LSHForest struct {
158 | trees []*radix.Tree
159 | numHashfunctions int
160 | reqLen int
161 | }
162 |
163 | // NewLSHForest creates a new LSHForest Locality Sensitive Hashing scheme with the
164 | // specified number of hash tables and hash functions per table.
165 | func NewLSHForest(functions int, tables int) *LSHForest {
166 | trees := make([]*radix.Tree, tables)
167 | for i := range trees {
168 | trees[i] = radix.New()
169 | }
170 | return &LSHForest{
171 | trees: trees,
172 | numHashfunctions: functions,
173 | reqLen: functions * tables,
174 | }
175 | }
176 |
177 | // Put stores the specified LSH signature and associated ID in the LSH index
178 | func (l *LSHForest) Put(id interface{}, signature *sparse.BinaryVec) {
179 | keys := l.hashKeysForSignature(signature)
180 | for i, tree := range l.trees {
181 | 		// fetch the existing bucket for this key, creating a new one if it does not yet exist
182 | bucket, ok := tree.Get(keys[i])
183 | if !ok {
184 | bucket = make([]interface{}, 0)
185 | }
186 | tree.Insert(keys[i], append(bucket.([]interface{}), id))
187 | }
188 | }
189 |
190 | // GetCandidates returns the IDs of candidate nearest neighbours. It is up to
191 | // the calling code to further filter these candidates based on distance to arrive
192 | // at the top-k approximate nearest neighbours. The number of candidates returned
193 | // may be smaller or larger than k.
194 | func (l *LSHForest) GetCandidates(query *sparse.BinaryVec, k int) []interface{} {
195 | keys := l.hashKeysForSignature(query)
196 |
197 | m := k
198 | seen := make(map[interface{}]struct{})
199 |
200 | for i, tree := range l.trees {
201 | if bucketEntries, exist := tree.Get(keys[i]); exist {
202 | for _, id := range bucketEntries.([]interface{}) {
203 | seen[id] = struct{}{}
204 | }
205 | }
206 | }
207 |
208 | // if we have not found enough candidates then walk back up the trees for
209 | // similar items in neighbouring buckets with shared prefixes
210 | x := l.numHashfunctions
211 | for len(seen) < m && x > 0 {
212 | for i, tree := range l.trees {
213 | var k string
214 | if keys[i][x-1] == '1' {
215 | k = "0"
216 | } else {
217 | k = "1"
218 | }
219 |
220 | altKey := strings.Join([]string{keys[i][0 : x-1], k}, "")
221 | tree.WalkPrefix(altKey, func(s string, v interface{}) bool {
222 | for _, id := range v.([]interface{}) {
223 | seen[id] = struct{}{}
224 | }
225 | return false
226 | })
227 | }
228 | x--
229 | }
230 |
231 | // Collect results
232 | candidates := make([]interface{}, len(seen))
233 | var i int
234 | for index := range seen {
235 | candidates[i] = index
236 | i++
237 | }
238 |
239 | return candidates
240 | }
241 |
242 | // Remove removes the specified item from the LSH index
243 | func (l *LSHForest) Remove(id interface{}) {
244 | for _, tree := range l.trees {
245 | tree.Walk(func(s string, v interface{}) bool {
246 | bucketContents := v.([]interface{})
247 | for i, indexedID := range bucketContents {
248 | if id == indexedID {
249 | bucketContents[i] = bucketContents[len(bucketContents)-1]
250 | bucketContents = bucketContents[:len(bucketContents)-1]
251 | if len(bucketContents) == 0 {
252 | tree.Delete(s)
253 | } else {
254 | tree.Insert(s, bucketContents)
255 | }
256 | return true
257 | }
258 | }
259 | return false
260 | })
261 | }
262 | }
263 |
264 | // hashKeysForSignature chunks the hash into a number of smaller hash codes (one per
265 | // table) each the length of the configured number of hash functions per table.
266 | // The method panics if the signature is not the same length as tables * functions.
267 | func (l *LSHForest) hashKeysForSignature(signature *sparse.BinaryVec) []string {
268 | // TODO: rather than simply chunking up the hash signature into k/l chunks
269 | // possibly select hash functions (digits) uniformly at random (with replacement?)
270 | if signature.Len() != l.reqLen {
271 | panic(fmt.Sprintf("nlp: Specified signature is not the correct length. Needed %d but received %d", l.reqLen, signature.Len()))
272 | }
273 | keys := make([]string, len(l.trees))
274 | key := signature.String()
275 | for i := range keys {
276 | keys[i] = key[i*l.numHashfunctions : (i+1)*l.numHashfunctions]
277 | }
278 | return keys
279 | }
280 |
--------------------------------------------------------------------------------
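A minimal sketch wiring ClassicLSH into an LSHIndex as described above; the Hasher implementation and the distance metric are deliberately left as parameters rather than assumed from elsewhere in the package, and buildANNIndex is a hypothetical helper used purely for illustration:

package sketch

import (
	"github.com/james-bowman/nlp"
	"github.com/james-bowman/nlp/measures/pairwise"
)

// buildANNIndex composes an approximate nearest neighbour index from a locality
// sensitive Hasher, a ClassicLSH store and a pairwise distance metric.
func buildANNIndex(hasher nlp.Hasher, distance pairwise.Comparer) nlp.Indexer {
	// 8 hash functions per table across 32 tables: signatures produced by the
	// hasher must be exactly 8*32 = 256 bits long or Put() and GetCandidates()
	// will panic
	store := nlp.NewClassicLSH(8, 32)

	// approx=true filters candidate neighbours by comparing their hashes rather
	// than the original vectors, trading some accuracy for speed and storage
	return nlp.NewLSHIndex(true, hasher, store, distance)
}

An LSHForest store created with NewLSHForest(8, 32) could be substituted for the ClassicLSH store without changing anything else, since both implement the LSHScheme interface.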
/lda_test.go:
--------------------------------------------------------------------------------
1 | package nlp_test
2 |
3 | import (
4 | "fmt"
5 | "math"
6 | "testing"
7 |
8 | "golang.org/x/exp/rand"
9 |
10 | "github.com/james-bowman/nlp"
11 | "gonum.org/v1/gonum/mat"
12 | )
13 |
14 | var stopWords = []string{"a", "about", "above", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "another", "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are", "around", "as", "at", "back", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name", "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "thickv", "thin", "third", "this", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves"}
15 |
16 | func TestLDAFit(t *testing.T) {
17 | tests := []struct {
18 | topics int
19 | r, c int
20 | data []float64
21 | expectedTopics [][]float64
22 | }{
23 | {
24 | topics: 3,
25 | r: 9, c: 9,
26 | data: []float64{
27 | 3, 3, 3, 0, 0, 0, 0, 0, 0,
28 | 3, 3, 3, 0, 0, 0, 0, 0, 0,
29 | 3, 3, 3, 0, 0, 0, 0, 0, 0,
30 | 0, 0, 0, 3, 3, 3, 0, 0, 0,
31 | 0, 0, 0, 3, 3, 3, 0, 0, 0,
32 | 0, 0, 0, 3, 3, 3, 0, 0, 0,
33 | 0, 0, 0, 0, 0, 0, 4, 4, 4,
34 | 0, 0, 0, 0, 0, 0, 4, 4, 4,
35 | 0, 0, 0, 0, 0, 0, 4, 4, 4,
36 | },
37 | expectedTopics: [][]float64{
38 | {0.33, 0.33, 0.33, 0, 0, 0, 0, 0, 0},
39 | {0, 0, 0, 0, 0, 0, 0.33, 0.33, 0.33},
40 | {0, 0, 0, 0.33, 0.33, 0.33, 0, 0, 0},
41 | },
42 | },
43 | {
44 | topics: 3,
45 | r: 9, c: 9,
46 | data: []float64{
47 | 3, 3, 3, 0, 0, 0, 0, 0, 0,
48 | 3, 3, 3, 0, 0, 0, 0, 0, 0,
49 | 3, 3, 3, 0, 0, 0, 0, 0, 0,
50 | 0, 0, 0, 3, 5, 1, 0, 0, 0,
51 | 0, 0, 0, 3, 5, 0, 0, 0, 0,
52 | 0, 0, 0, 3, 5, 0, 0, 0, 0,
53 | 0, 0, 0, 0, 0, 0, 4, 4, 4,
54 | 0, 0, 0, 0, 0, 0, 4, 4, 4,
55 | 0, 0, 0, 0, 0, 0, 4, 4, 4,
56 | },
57 | expectedTopics: [][]float64{
58 | {0.33, 0.33, 0.33, 0, 0, 0, 0, 0, 0},
59 | {0, 0, 0, 0, 0, 0, 0.33, 0.33, 0.33},
60 | {0, 0, 0, 0.428, 0.285, 0.285, 0, 0, 0},
61 | },
62 | },
63 | }
64 |
65 | for ti, test := range tests {
66 | // set Rnd to fixed constant seed for deterministic results
67 | lda := nlp.NewLatentDirichletAllocation(test.topics)
68 | lda.Rnd = rand.New(rand.NewSource(uint64(0)))
69 |
70 | in := mat.NewDense(test.r, test.c, test.data)
71 | lda.Fit(in)
72 |
73 | components := lda.Components()
74 |
75 | for i := 0; i < test.topics; i++ {
76 | var sum float64
77 | for ri, v := range test.expectedTopics[i] {
78 | cv := components.At(i, ri)
79 | sum += cv
80 | if math.Abs(cv-v) > 0.01 {
81 | t.Errorf("Test %d: Topic (%d) over word (%d) distribution incorrect. Expected %f but received %f\n", ti, i, ri, v, cv)
82 | }
83 | }
84 | if math.Abs(1-sum) > 0.00000001 {
85 | t.Errorf("Test %d: values in topic (%d) over word distributions should sum to 1 but summed to %f\n", ti, i, sum)
86 | }
87 | }
88 | }
89 | }
90 |
91 | func TestLDAFitTransform(t *testing.T) {
92 | tests := []struct {
93 | topics int
94 | r, c int
95 | data []float64
96 | expectedDocs [][]float64
97 | }{
98 | {
99 | topics: 3,
100 | r: 9, c: 9,
101 | data: []float64{
102 | 3, 3, 3, 0, 0, 0, 0, 0, 0,
103 | 3, 3, 3, 0, 0, 0, 0, 0, 0,
104 | 3, 3, 3, 0, 0, 0, 0, 0, 0,
105 | 0, 0, 0, 3, 3, 3, 0, 0, 0,
106 | 0, 0, 0, 3, 3, 3, 0, 0, 0,
107 | 0, 0, 0, 3, 3, 3, 0, 0, 0,
108 | 0, 0, 0, 0, 0, 0, 4, 4, 4,
109 | 0, 0, 0, 0, 0, 0, 4, 4, 4,
110 | 0, 0, 0, 0, 0, 0, 4, 4, 4,
111 | },
112 | expectedDocs: [][]float64{
113 | {1, 0, 0},
114 | {1, 0, 0},
115 | {1, 0, 0},
116 | {0, 0, 1},
117 | {0, 0, 1},
118 | {0, 0, 1},
119 | {0, 1, 0},
120 | {0, 1, 0},
121 | {0, 1, 0},
122 | },
123 | },
124 | {
125 | topics: 3,
126 | r: 9, c: 9,
127 | data: []float64{
128 | 3, 3, 3, 0, 0, 0, 0, 0, 0,
129 | 3, 3, 3, 0, 0, 0, 0, 0, 0,
130 | 3, 3, 3, 0, 0, 0, 0, 0, 0,
131 | 0, 0, 0, 3, 5, 1, 0, 0, 0,
132 | 0, 0, 0, 3, 5, 0, 0, 0, 0,
133 | 0, 0, 0, 3, 5, 0, 0, 0, 0,
134 | 0, 0, 0, 0, 0, 0, 4, 4, 4,
135 | 0, 0, 0, 0, 0, 0, 4, 4, 4,
136 | 0, 0, 0, 0, 0, 0, 4, 4, 4,
137 | },
138 | expectedDocs: [][]float64{
139 | {1, 0, 0},
140 | {1, 0, 0},
141 | {1, 0, 0},
142 | {0, 0, 1},
143 | {0, 0, 1},
144 | {0, 0, 1},
145 | {0, 1, 0},
146 | {0, 1, 0},
147 | {0, 1, 0},
148 | },
149 | },
150 | }
151 |
152 | for ti, test := range tests {
153 | // set Rnd to fixed constant seed for deterministic results
154 | lda := nlp.NewLatentDirichletAllocation(test.topics)
155 | lda.Rnd = rand.New(rand.NewSource(uint64(0)))
156 |
157 | in := mat.NewDense(test.r, test.c, test.data)
158 | theta, err := lda.FitTransform(in)
159 | if err != nil {
160 | t.Error(err)
161 | }
162 |
163 | for j := 0; j < test.c; j++ {
164 | var sum float64
165 | for ri, v := range test.expectedDocs[j] {
166 | cv := theta.At(ri, j)
167 | sum += cv
168 | if math.Abs(cv-v) > 0.01 {
169 | t.Errorf("Test %d: Document (%d) over topic (%d) distribution incorrect. Expected %f but received %f\n", ti, j, ri, v, cv)
170 | }
171 | }
172 | if math.Abs(1-sum) > 0.00000001 {
173 | t.Errorf("Test %d: values in document (%d) over topic distributions should sum to 1 but summed to %f\n", ti, j, sum)
174 | }
175 | }
176 | }
177 | }
178 |
179 | func TestLDATransform(t *testing.T) {
180 | tests := []struct {
181 | topics int
182 | r, c int
183 | data []float64
184 | }{
185 | {
186 | topics: 3,
187 | r: 9, c: 9,
188 | data: []float64{
189 | 3, 3, 3, 0, 0, 0, 0, 0, 0,
190 | 3, 3, 3, 0, 0, 0, 0, 0, 0,
191 | 3, 3, 3, 0, 0, 0, 0, 0, 0,
192 | 0, 0, 0, 3, 3, 3, 0, 0, 0,
193 | 0, 0, 0, 3, 3, 3, 0, 0, 0,
194 | 0, 0, 0, 3, 3, 3, 0, 0, 0,
195 | 0, 0, 0, 0, 0, 0, 4, 4, 4,
196 | 0, 0, 0, 0, 0, 0, 4, 4, 4,
197 | 0, 0, 0, 0, 0, 0, 4, 4, 4,
198 | },
199 | },
200 | {
201 | topics: 3,
202 | r: 9, c: 9,
203 | data: []float64{
204 | 3, 3, 3, 0, 0, 0, 0, 0, 0,
205 | 3, 3, 3, 0, 0, 0, 0, 0, 0,
206 | 3, 3, 3, 0, 0, 0, 0, 0, 0,
207 | 0, 0, 0, 3, 5, 1, 0, 0, 0,
208 | 0, 0, 0, 3, 5, 0, 0, 0, 0,
209 | 0, 0, 0, 3, 5, 0, 0, 0, 0,
210 | 0, 0, 0, 0, 0, 0, 4, 4, 4,
211 | 0, 0, 0, 0, 0, 0, 4, 4, 4,
212 | 0, 0, 0, 0, 0, 0, 4, 4, 4,
213 | },
214 | },
215 | }
216 |
217 | for ti, test := range tests {
218 | // set Rnd to fixed constant seed for deterministic results
219 | lda := nlp.NewLatentDirichletAllocation(test.topics)
220 | lda.Rnd = rand.New(rand.NewSource(uint64(0)))
221 | lda.PerplexityEvaluationFrequency = 2
222 |
223 | in := mat.NewDense(test.r, test.c, test.data)
224 | theta, err := lda.FitTransform(in)
225 | if err != nil {
226 | t.Error(err)
227 | }
228 |
229 | tTheta, err := lda.Transform(in)
230 |
231 | if !mat.EqualApprox(theta, tTheta, 0.035) {
232 | t.Errorf("Test %d: Transformed matrix not equal to FitTransformed\nExpected:\n %v\nbut received:\n %v\n", ti, mat.Formatted(theta), mat.Formatted(tTheta))
233 | }
234 | }
235 | }
236 |
237 | func ExampleLatentDirichletAllocation() {
238 | corpus := []string{
239 | "The quick brown fox jumped over the lazy dog",
240 | "The cow jumped over the moon",
241 | "The little dog laughed to see such fun",
242 | }
243 |
244 | // Create a pipeline with a count vectoriser and LDA transformer for 2 topics
245 | vectoriser := nlp.NewCountVectoriser(stopWords...)
246 | lda := nlp.NewLatentDirichletAllocation(2)
247 | pipeline := nlp.NewPipeline(vectoriser, lda)
248 |
249 | docsOverTopics, err := pipeline.FitTransform(corpus...)
250 | if err != nil {
251 | fmt.Printf("Failed to model topics for documents because %v", err)
252 | return
253 | }
254 |
255 | // Examine Document over topic probability distribution
256 | dr, dc := docsOverTopics.Dims()
257 | for doc := 0; doc < dc; doc++ {
258 | fmt.Printf("\nTopic distribution for document: '%s' -", corpus[doc])
259 | for topic := 0; topic < dr; topic++ {
260 | if topic > 0 {
261 | fmt.Printf(",")
262 | }
263 | fmt.Printf(" Topic #%d=%f", topic, docsOverTopics.At(topic, doc))
264 | }
265 | }
266 |
267 | // Examine Topic over word probability distribution
268 | topicsOverWords := lda.Components()
269 | tr, tc := topicsOverWords.Dims()
270 |
271 | vocab := make([]string, len(vectoriser.Vocabulary))
272 | for k, v := range vectoriser.Vocabulary {
273 | vocab[v] = k
274 | }
275 | for topic := 0; topic < tr; topic++ {
276 | fmt.Printf("\nWord distribution for Topic #%d -", topic)
277 | for word := 0; word < tc; word++ {
278 | if word > 0 {
279 | fmt.Printf(",")
280 | }
281 | fmt.Printf(" '%s'=%f", vocab[word], topicsOverWords.At(topic, word))
282 | }
283 | }
284 | }
285 |
--------------------------------------------------------------------------------
/randomprojection_test.go:
--------------------------------------------------------------------------------
1 | package nlp
2 |
3 | import (
4 | "math"
5 | "testing"
6 |
7 | "github.com/james-bowman/nlp/measures/pairwise"
8 | "github.com/james-bowman/sparse"
9 | "golang.org/x/exp/rand"
10 | "gonum.org/v1/gonum/mat"
11 | )
12 |
13 | func TestSignRandomProjection(t *testing.T) {
14 | tests := []struct {
15 | rows int
16 | cols int
17 | bits int
18 | }{
19 | {rows: 100, cols: 1000, bits: 1024},
20 | {rows: 100, cols: 1000, bits: 256},
21 | }
22 |
23 | for ti, test := range tests {
24 | // Given an input matrix and a query matching one column
25 | matrix := mat.NewDense(test.rows, test.cols, nil)
26 | for i := 0; i < test.rows; i++ {
27 | for j := 0; j < test.cols; j++ {
28 | matrix.Set(i, j, rand.Float64())
29 | }
30 | }
31 |
32 | query := matrix.ColView(0)
33 |
34 | // When transformed using sign random projections
35 | transformer := NewSignRandomProjection(test.bits)
36 | reducedDimMatrix, err := transformer.FitTransform(matrix)
37 | if err != nil {
38 | t.Errorf("Failed to transform matrix because %v\n", err)
39 | }
40 | m := reducedDimMatrix.(*sparse.Binary)
41 |
42 | reducedDimQuery, err := transformer.Transform(query)
43 | if err != nil {
44 | t.Errorf("Failed to transform query because %v\n", err)
45 | }
46 | q := reducedDimQuery.(*sparse.Binary).ColView(0)
47 |
48 | var culmDiff float64
49 | for i := 0; i < test.cols; i++ {
50 | angSim := pairwise.AngularSimilarity(query, matrix.ColView(i))
51 | lshSim := pairwise.HammingSimilarity(q, m.ColView(i))
52 |
53 | if i == 0 {
54 | if math.Abs(angSim-lshSim) >= 0.0000001 {
55 | t.Errorf("Test %d: Expected matching similarity but found %.10f (Ang) and %.10f (LSH)\n", ti, angSim, lshSim)
56 | }
57 | }
58 |
59 | diff := math.Abs(lshSim-angSim) / angSim
60 | culmDiff += diff
61 | }
62 | avgDiff := culmDiff / float64(test.cols)
63 |
64 | // Then output matrix should be of specified length,
65 | // matching column should still have similarity of ~1.0 and
66 | 		// avg difference between angular and hamming similarities should
67 | // be less than 0.03
68 | r, c := m.Dims()
69 | if r != test.bits || c != test.cols {
70 | t.Errorf("Test %d: Expected output matrix to be %dx%d but was %dx%d\n", ti, test.bits, test.cols, r, c)
71 | }
72 | if avgDiff >= 0.03 {
73 | t.Errorf("Test %d: Expected difference between vector spaces %f but was %f\n", ti, 0.03, avgDiff)
74 | }
75 | }
76 | }
77 |
78 | func TestRandomProjection(t *testing.T) {
79 | tests := []struct {
80 | k int
81 | rows int
82 | cols int
83 | density float32
84 | }{
85 | {k: 400, rows: 700, cols: 600, density: 0.02},
86 | {k: 400, rows: 800, cols: 800, density: 0.02},
87 | }
88 |
89 | for ti, test := range tests {
90 | matrix := sparse.Random(sparse.CSRFormat, test.rows, test.cols, test.density).(sparse.TypeConverter).ToCSR()
91 | query := matrix.ToCSC().ColView(0)
92 |
93 | 		// When transformed using random projection
94 | transformer := NewRandomProjection(test.k, float64(test.density))
95 | transformer.rnd = rand.New(rand.NewSource(uint64(0)))
96 | reducedDimMatrix, err := transformer.FitTransform(matrix)
97 | if err != nil {
98 | t.Errorf("Failed to transform matrix because %v\n", err)
99 | }
100 | m := reducedDimMatrix.(*sparse.CSR).ToCSC()
101 |
102 | reducedDimQuery, err := transformer.Transform(query)
103 | if err != nil {
104 | t.Errorf("Failed to transform query because %v\n", err)
105 | }
106 | q := reducedDimQuery.(*sparse.CSR).ToCSC().ColView(0)
107 |
108 | var culmDiff float64
109 | ColDo(matrix, func(j int, v mat.Vector) {
110 | angSim := pairwise.CosineSimilarity(query, v)
111 | lshSim := pairwise.CosineSimilarity(q, m.ColView(j))
112 |
113 | if j == 0 {
114 | if math.Abs(angSim-lshSim) >= 0.0000001 {
115 | t.Errorf("Test %d: Expected matching similarity but found %.10f (Ang) and %.10f (LSH)\n", ti, angSim, lshSim)
116 | }
117 | }
118 |
119 | 			// accumulate the absolute difference between the original and reduced-space similarities
120 | diff := math.Abs(lshSim - angSim)
121 | culmDiff += diff
122 | })
123 | t.Logf("CulmDiff = %f\n", culmDiff)
124 | avgDiff := culmDiff / float64(test.cols)
125 |
126 | // Then output matrix should be of specified length,
127 | // matching column should still have similarity of ~1.0 and
128 | 		// avg difference between the original and reduced-space cosine similarities should
129 | 		// be less than 0.05
130 | r, c := reducedDimMatrix.Dims()
131 | if r != test.k || c != test.cols {
132 | t.Errorf("Test %d: Expected output matrix to be %dx%d but was %dx%d\n", ti, test.k, test.cols, r, c)
133 | }
134 | if avgDiff >= 0.05 {
135 | t.Errorf("Test %d: Expected difference between vector spaces %f but was %f\n", ti, 0.05, avgDiff)
136 | }
137 | }
138 | }
139 |
140 | func TestRandomIndexingFit(t *testing.T) {
141 | tests := []struct {
142 | k int
143 | rows int
144 | cols int
145 | density float32
146 | }{
147 | {k: 400, rows: 700, cols: 600, density: 0.02},
148 | {k: 400, rows: 800, cols: 800, density: 0.02},
149 | }
150 |
151 | for ti, test := range tests {
152 | matrix := sparse.Random(sparse.CSRFormat, test.rows, test.cols, test.density).(sparse.TypeConverter).ToCSR()
153 | query := matrix.ToCSC().ColView(0)
154 |
155 | 		// When transformed using random indexing
156 | transformer := NewRandomIndexing(test.k, float64(test.density))
157 | transformer.rnd = rand.New(rand.NewSource(uint64(0)))
158 | reducedDimMatrix, err := transformer.FitTransform(matrix)
159 | if err != nil {
160 | t.Errorf("Failed to transform matrix because %v\n", err)
161 | }
162 | m := reducedDimMatrix.(sparse.TypeConverter).ToCSC()
163 |
164 | reducedDimQuery, err := transformer.Transform(query)
165 | if err != nil {
166 | t.Errorf("Failed to transform query because %v\n", err)
167 | }
168 | q := reducedDimQuery.(sparse.TypeConverter).ToCSC().ColView(0)
169 |
170 | var culmDiff float64
171 | ColDo(matrix, func(j int, v mat.Vector) {
172 | angSim := pairwise.CosineSimilarity(query, v)
173 | lshSim := pairwise.CosineSimilarity(q, m.ColView(j))
174 |
175 | if j == 0 {
176 | if math.Abs(angSim-lshSim) >= 0.05 {
177 | t.Errorf("Test %d: Expected matching similarity but found %.10f (Ang) and %.10f (LSH)\n", ti, angSim, lshSim)
178 | }
179 | }
180 |
181 | 			// accumulate the absolute difference between the original and reduced-space similarities
182 | diff := math.Abs(lshSim - angSim)
183 | culmDiff += diff
184 | })
185 | t.Logf("CulmDiff = %f\n", culmDiff)
186 | avgDiff := culmDiff / float64(test.cols)
187 |
188 | // Then output matrix should be of specified length,
189 | // matching column should still have similarity of ~1.0 and
190 | 		// avg difference between the original and reduced-space cosine similarities should
191 | 		// be less than 0.12
192 | r, c := reducedDimMatrix.Dims()
193 | if r != test.k || c != test.cols {
194 | t.Errorf("Test %d: Expected output matrix to be %dx%d but was %dx%d\n", ti, test.k, test.cols, r, c)
195 | }
196 | if avgDiff >= 0.12 {
197 | t.Errorf("Test %d: Expected difference between vector spaces %f but was %f\n", ti, 0.12, avgDiff)
198 | }
199 | }
200 | }
201 |
202 | func TestRandomIndexingPartialFit(t *testing.T) {
203 | tests := []struct {
204 | k int
205 | rows int
206 | cols int
207 | density float32
208 | }{
209 | {k: 400, rows: 700, cols: 600, density: 0.02},
210 | {k: 400, rows: 800, cols: 800, density: 0.02},
211 | }
212 |
213 | for ti, test := range tests {
214 | matrix := sparse.Random(sparse.CSRFormat, test.rows, test.cols, test.density).(sparse.TypeConverter).ToCSR()
215 | query := matrix.ToCSC().ColView(0)
216 |
217 | 		// When transformed using random indexing fitted incrementally with PartialFit
218 | transformer := NewRandomIndexing(test.k, float64(test.density))
219 | transformer.rnd = rand.New(rand.NewSource(uint64(0)))
220 |
221 | ColDo(matrix, func(j int, v mat.Vector) {
222 | transformer.PartialFit(v)
223 | })
224 |
225 | reducedDimMatrix, err := transformer.Transform(matrix)
226 | if err != nil {
227 | t.Errorf("Failed to transform matrix because %v\n", err)
228 | }
229 | m := reducedDimMatrix.(sparse.TypeConverter).ToCSC()
230 |
231 | reducedDimQuery, err := transformer.Transform(query)
232 | if err != nil {
233 | t.Errorf("Failed to transform query because %v\n", err)
234 | }
235 | q := reducedDimQuery.(sparse.TypeConverter).ToCSC().ColView(0)
236 |
237 | var culmDiff float64
238 | ColDo(matrix, func(j int, v mat.Vector) {
239 | angSim := pairwise.CosineSimilarity(query, v)
240 | lshSim := pairwise.CosineSimilarity(q, m.ColView(j))
241 |
242 | if j == 0 {
243 | if math.Abs(angSim-lshSim) >= 0.05 {
244 | t.Errorf("Test %d: Expected matching similarity but found %.10f (Ang) and %.10f (LSH)\n", ti, angSim, lshSim)
245 | }
246 | }
247 |
248 | 			// accumulate the absolute difference between the original and reduced-space similarities
249 | diff := math.Abs(lshSim - angSim)
250 | culmDiff += diff
251 | })
252 | t.Logf("CulmDiff = %f\n", culmDiff)
253 | avgDiff := culmDiff / float64(test.cols)
254 |
255 | // Then output matrix should be of specified length,
256 | // matching column should still have similarity of ~1.0 and
257 | 		// avg difference between the original and reduced-space cosine similarities should
258 | 		// be less than 0.12
259 | r, c := reducedDimMatrix.Dims()
260 | if r != test.k || c != test.cols {
261 | t.Errorf("Test %d: Expected output matrix to be %dx%d but was %dx%d\n", ti, test.k, test.cols, r, c)
262 | }
263 | if avgDiff >= 0.12 {
264 | t.Errorf("Test %d: Expected difference between vector spaces %f but was %f\n", ti, 0.12, avgDiff)
265 | }
266 | }
267 | }
268 |
269 | func TestReflectiveRandomIndexing(t *testing.T) {
270 | tests := []struct {
271 | k int
272 | rows int
273 | cols int
274 | density float32
275 | }{
276 | {k: 400, rows: 700, cols: 600, density: 0.02},
277 | {k: 400, rows: 800, cols: 800, density: 0.02},
278 | }
279 |
280 | for ti, test := range tests {
281 | matrix := sparse.Random(sparse.CSRFormat, test.rows, test.cols, test.density).(sparse.TypeConverter).ToCSR()
282 | query := matrix.ToCSC().ColView(0)
283 |
284 | // When transformed using Reflective Random Indexing
285 | transformer := NewReflectiveRandomIndexing(test.k, TermBasedRRI, 0, float64(test.density))
286 | transformer.rnd = rand.New(rand.NewSource(uint64(0)))
287 | reducedDimMatrix, err := transformer.FitTransform(matrix)
288 | if err != nil {
289 | t.Errorf("Failed to transform matrix because %v\n", err)
290 | }
291 | m := reducedDimMatrix.(sparse.TypeConverter).ToCSC()
292 |
293 | reducedDimQuery, err := transformer.Transform(query)
294 | if err != nil {
295 | t.Errorf("Failed to transform query because %v\n", err)
296 | }
297 | q := reducedDimQuery.(sparse.TypeConverter).ToCSC().ColView(0)
298 |
299 | var culmDiff float64
300 | ColDo(matrix, func(j int, v mat.Vector) {
301 | origSim := pairwise.CosineSimilarity(query, v)
302 | redSim := pairwise.CosineSimilarity(q, m.ColView(j))
303 |
304 | if j == 0 {
305 | if math.Abs(origSim-redSim) >= 0.0000001 {
306 | t.Errorf("Test %d: Expected matching similarity but found %.10f (Original) and %.10f (Reduced)\n", ti, origSim, redSim)
307 | }
308 | }
309 |
310 | diff := math.Abs(redSim - origSim)
311 | culmDiff += diff
312 | })
313 | t.Logf("CulmDiff = %f\n", culmDiff)
314 | avgDiff := culmDiff / float64(test.cols)
315 |
316 | // Then output matrix should be of specified length,
317 | // matching column should still have similarity of ~1.0 and
318 | 		// avg difference between the original and reduced-space cosine similarities should
319 | 		// be less than 0.12
320 | r, c := reducedDimMatrix.Dims()
321 | if r != test.k || c != test.cols {
322 | t.Errorf("Test %d: Expected output matrix to be %dx%d but was %dx%d\n", ti, test.k, test.cols, r, c)
323 | }
324 | if avgDiff >= 0.12 {
325 | t.Errorf("Test %d: Expected difference between vector spaces %f but was %f\n", ti, 0.12, avgDiff)
326 | }
327 | }
328 | }
329 |
--------------------------------------------------------------------------------
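A minimal sketch of the RandomProjection transformer exercised by the tests above, reducing a sparse term document matrix to a lower dimensional space; the matrix dimensions and density values are illustrative only:

package main

import (
	"fmt"

	"github.com/james-bowman/nlp"
	"github.com/james-bowman/sparse"
)

func main() {
	// a random 1000 term x 200 document matrix with roughly 2% non-zero entries
	tdm := sparse.Random(sparse.CSRFormat, 1000, 200, 0.02)

	// project the 1000 dimensional term space down to 300 dimensions
	transformer := nlp.NewRandomProjection(300, 0.02)
	reduced, err := transformer.FitTransform(tdm)
	if err != nil {
		fmt.Printf("failed to reduce dimensionality: %v\n", err)
		return
	}

	r, c := reduced.Dims()
	fmt.Printf("reduced matrix is %d x %d\n", r, c) // 300 x 200
}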
/vectorisers.go:
--------------------------------------------------------------------------------
1 | package nlp
2 |
3 | import (
4 | "regexp"
5 | "strings"
6 |
7 | "github.com/james-bowman/sparse"
8 | "github.com/spaolacci/murmur3"
9 | "gonum.org/v1/gonum/mat"
10 | )
11 |
12 | // Vectoriser provides a common interface for vectorisers that take a variable
13 | // set of string arguments and produce a numerical matrix of features.
14 | type Vectoriser interface {
15 | Fit(...string) Vectoriser
16 | Transform(...string) (mat.Matrix, error)
17 | FitTransform(...string) (mat.Matrix, error)
18 | }
19 |
20 | // OnlineVectoriser is an extension to the Vectoriser interface that supports
21 | // online (streaming/mini-batch) training as opposed to just batch.
22 | type OnlineVectoriser interface {
23 | Vectoriser
24 | PartialFit(...string) OnlineVectoriser
25 | }
26 |
27 | // Transformer provides a common interface for transformer steps.
28 | type Transformer interface {
29 | Fit(mat.Matrix) Transformer
30 | Transform(mat mat.Matrix) (mat.Matrix, error)
31 | FitTransform(mat mat.Matrix) (mat.Matrix, error)
32 | }
33 |
34 | // OnlineTransformer is an extension to the Transformer interface that
35 | // supports online (streaming/mini-batch) training as opposed to just batch.
36 | type OnlineTransformer interface {
37 | Transformer
38 | PartialFit(mat.Matrix) OnlineTransformer
39 | }
40 |
41 | // Tokeniser interface for tokenisers allowing substitution of different
42 | // tokenisation strategies e.g. Regexp, and also supporting different
43 | // token types such as n-grams and different languages.
44 | type Tokeniser interface {
45 | // ForEachIn iterates over each token within text and invokes function
46 | // f with the token as parameter
47 | ForEachIn(text string, f func(token string))
48 |
49 | // Tokenise returns a slice of all the tokens contained in string
50 | // text
51 | Tokenise(text string) []string
52 | }
53 |
54 | // RegExpTokeniser implements Tokeniser interface using a basic RegExp
55 | // pattern for unary-gram word tokeniser supporting optional stop word
56 | // removal
57 | type RegExpTokeniser struct {
58 | RegExp *regexp.Regexp
59 | StopWords map[string]bool
60 | }
61 |
62 | // NewTokeniser returns a new, default Tokeniser implementation.
63 | // stopWords is a potentially empty string slice
64 | // that contains the words that should be removed from the corpus
65 | // The default RegExpTokeniser extracts runs of Unicode letters ([\p{L}]+), effectively splitting on any non-letter character.
66 | func NewTokeniser(stopWords ...string) Tokeniser {
67 | var stop map[string]bool
68 |
69 | stop = make(map[string]bool)
70 | for _, word := range stopWords {
71 | stop[word] = true
72 | }
73 | return &RegExpTokeniser{
74 | RegExp: regexp.MustCompile("[\\p{L}]+"),
75 | StopWords: stop,
76 | }
77 | }
78 |
79 | // ForEachIn iterates over each token within text and invokes function
80 | // f with the token as parameter. If StopWords is not nil then any
81 | // tokens from text present in StopWords will be ignored.
82 | func (t *RegExpTokeniser) ForEachIn(text string, f func(token string)) {
83 | tokens := t.tokenise(text)
84 | for _, token := range tokens {
85 | if t.StopWords != nil {
86 | if t.StopWords[token] {
87 | continue
88 | }
89 | }
90 | f(token)
91 | }
92 | }
93 |
94 | // Tokenise returns a slice of all the tokens contained in string
95 | // text. If StopWords is not nil then any tokens from text present in
96 | // StopWords will be removed from the slice.
97 | func (t *RegExpTokeniser) Tokenise(text string) []string {
98 | words := t.tokenise(text)
99 |
100 | // filter out stop words
101 | if t.StopWords != nil {
102 | b := words[:0]
103 | for _, w := range words {
104 | if !t.StopWords[w] {
105 | b = append(b, w)
106 | }
107 | }
108 | return b
109 | }
110 |
111 | return words
112 | }
113 |
114 | // tokenise returns a slice of all the tokens contained in string
115 | // text.
116 | func (t *RegExpTokeniser) tokenise(text string) []string {
117 | // convert content to lower case
118 | c := strings.ToLower(text)
119 |
120 | // match whole words, removing any punctuation/whitespace
121 | words := t.RegExp.FindAllString(c, -1)
122 |
123 | return words
124 | }
125 |
126 | // CountVectoriser can be used to encode one or more text documents into a term document
127 | // matrix where each column represents a document within the corpus and each row represents
128 | // a term present in the training data set. Each element represents the frequency the
129 | // corresponding term appears in the corresponding document e.g. tf(t, d) = 5 would mean
130 | // that term t (perhaps the word "dog") appears 5 times in the document d.
131 | type CountVectoriser struct {
132 | // Vocabulary is a map of words to indices that point to the row number representing
133 | // that word in the term document matrix output from the Transform() and FitTransform()
134 | // methods. The Vocabulary map is populated by the Fit() or FitTransform() methods
135 | // based upon the words occurring in the datasets supplied to those methods. Within
136 | // Transform(), any words found in the test data set that were not present in the
137 | // training data set supplied to Fit() will not have an entry in the Vocabulary
138 | // and will be ignored.
139 | Vocabulary map[string]int
140 |
141 | // Tokeniser is used to tokenise input text into features.
142 | Tokeniser Tokeniser
143 | }
144 |
145 | // NewCountVectoriser creates a new CountVectoriser.
146 | // stopWords is a potentially empty slice of words to be removed from the corpus
147 | func NewCountVectoriser(stopWords ...string) *CountVectoriser {
148 | return &CountVectoriser{
149 | Vocabulary: make(map[string]int),
150 | Tokeniser: NewTokeniser(stopWords...),
151 | }
152 | }
153 |
154 | // Fit processes the supplied training data (a variable number of strings representing
155 | // documents). Each word appearing inside the training data will be added to the
156 | // Vocabulary. The Fit() method is intended to be called once to train the model
157 | // in a batch context. Calling the Fit() method a sceond time have the effect of
158 | // re-training the model from scratch (discarding the previously learnt vocabulary).
159 | func (v *CountVectoriser) Fit(train ...string) Vectoriser {
160 | i := 0
161 | if len(v.Vocabulary) != 0 {
162 | v.Vocabulary = make(map[string]int)
163 | }
164 | v.fitVocab(i, train...)
165 |
166 | return v
167 | }
168 |
169 | // fitVocab learns the vocabulary contained within the supplied training documents
170 | func (v *CountVectoriser) fitVocab(start int, train ...string) {
171 | i := start
172 | for _, doc := range train {
173 | v.Tokeniser.ForEachIn(doc, func(word string) {
174 | _, exists := v.Vocabulary[word]
175 | if !exists {
176 | v.Vocabulary[word] = i
177 | i++
178 | }
179 | })
180 | }
181 | }
182 |
183 | // Transform transforms the supplied documents into a term document matrix where each
184 | // column is a feature vector representing one of the supplied documents. Each element
185 | // represents the frequency with which the associated term for that row occurred within
186 | // that document. The returned matrix is a sparse matrix type.
187 | func (v *CountVectoriser) Transform(docs ...string) (mat.Matrix, error) {
188 | mat := sparse.NewDOK(len(v.Vocabulary), len(docs))
189 |
190 | for d, doc := range docs {
191 | v.Tokeniser.ForEachIn(doc, func(word string) {
192 | i, exists := v.Vocabulary[word]
193 |
194 | if exists {
195 | mat.Set(i, d, mat.At(i, d)+1)
196 | }
197 | })
198 | }
199 | return mat, nil
200 | }
201 |
202 | // FitTransform is exactly equivalent to calling Fit() followed by Transform() on the
203 | // same matrix. This is a convenience where separate training data is not being
204 | // used to fit the model i.e. the model is fitted on the fly to the test data.
205 | // The returned matrix is a sparse matrix type.
206 | func (v *CountVectoriser) FitTransform(docs ...string) (mat.Matrix, error) {
207 | return v.Fit(docs...).Transform(docs...)
208 | }
209 |
210 | // HashingVectoriser can be used to encode one or more text documents into a term document
211 | // matrix where each column represents a document within the corpus and each row represents
212 | // a term. Each element represents the frequency the corresponding term appears in the
213 | // corresponding document e.g. tf(t, d) = 5 would mean that term t (perhaps the word "dog")
214 | // appears 5 times in the document d.
215 | type HashingVectoriser struct {
216 | NumFeatures int
217 | Tokeniser Tokeniser
218 | }
219 |
220 | // NewHashingVectoriser creates a new HashingVectoriser. stopWords is a potentially empty
221 | // slice of words to be removed from the corpus. numFeatures specifies the number of features
222 | // that should be present in produced vectors. Each word in a document is hashed and
223 | // the mod of the hash and numFeatures gives the row in the matrix corresponding to that
224 | // word.
225 | func NewHashingVectoriser(numFeatures int, stopWords ...string) *HashingVectoriser {
226 | return &HashingVectoriser{
227 | NumFeatures: numFeatures,
228 | Tokeniser: NewTokeniser(stopWords...),
229 | }
230 | }
231 |
232 | // Fit does nothing for a HashingVectoriser. As the HashingVectoriser vectorises features
233 | // based on their hash, it does not require a pre-determined vocabulary to map features to their
234 | // correct row in the vector. It is effectively stateless and does not require fitting to
235 | // training data. The method is included for compatibility with other vectorisers.
236 | func (v *HashingVectoriser) Fit(train ...string) Vectoriser {
237 | // The hashing vectoriser is stateless and does not require pre-training so this
238 | // method does nothing.
239 | return v
240 | }
241 |
242 | // PartialFit does nothing for a HashingVectoriser. As the HashingVectoriser vectorises
243 | // features based on their hash, it does not require a pre-learnt vocabulary to map
244 | // features to the correct row in the feature vector. This method is included
245 | // for compatibility with other vectorisers.
246 | func (v *HashingVectoriser) PartialFit(train ...string) Vectoriser {
247 | 	// The hashing vectoriser is stateless and does not require training so this method
248 | // does nothing.
249 | return v
250 | }
251 |
252 | // Transform transforms the supplied documents into a term document matrix where each
253 | // column is a feature vector representing one of the supplied documents. Each element
254 | // represents the frequency with which the associated term for that row occurred within
255 | // that document. The returned matrix is a sparse matrix type.
256 | func (v *HashingVectoriser) Transform(docs ...string) (mat.Matrix, error) {
257 | 	dok := sparse.NewDOK(v.NumFeatures, len(docs))
258 | 
259 | 	for d, doc := range docs {
260 | 		v.Tokeniser.ForEachIn(doc, func(word string) {
261 | 			h := murmur3.Sum32([]byte(word))
262 | 			i := int(h % uint32(v.NumFeatures)) // mod in uint32 avoids a negative index on 32 bit platforms
263 | 
264 | 			dok.Set(i, d, dok.At(i, d)+1)
265 | 		})
266 | 	}
267 | 	return dok, nil
268 | }
269 |
270 | // FitTransform for a HashingVectoriser is exactly equivalent to calling
271 | // Transform() with the same documents. For most vectorisers, Fit() must be called
272 | // prior to Transform() and so this method is a convenience where separate
273 | // training data is not used to fit the model. For a HashingVectoriser, fitting is
274 | // not required and so this method is exactly equivalent to Transform(). As with
275 | // Fit(), this method is included with the HashingVectoriser for compatibility
276 | // with other vectorisers. The returned matrix is a sparse matrix type.
277 | func (v *HashingVectoriser) FitTransform(docs ...string) (mat.Matrix, error) {
278 | return v.Transform(docs...)
279 | }
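
// The following is a minimal, standalone usage sketch (not part of this file).
// It assumes the package is importable as github.com/james-bowman/nlp and uses
// only the HashingVectoriser API defined above; the corpus, stop words and
// feature count are illustrative only.
package main

import (
	"fmt"

	"github.com/james-bowman/nlp"
)

func main() {
	corpus := []string{
		"the quick brown fox jumped over the lazy dog",
		"the cow jumped over the moon",
	}

	// A large feature count keeps the chance of two distinct terms
	// hashing to the same row acceptably low.
	vectoriser := nlp.NewHashingVectoriser(1<<20, "the", "over")

	// Fitting is a no-op for the HashingVectoriser, so FitTransform
	// simply delegates to Transform.
	matrix, err := vectoriser.FitTransform(corpus...)
	if err != nil {
		panic(err)
	}

	rows, cols := matrix.Dims()
	fmt.Printf("%d x %d term document matrix\n", rows, cols)
}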
280 |
281 | // Pipeline is a mechanism for composing processing pipelines out of vectorisation
282 | // and transformation steps. For example to compose a classic LSA/LSI pipeline
283 | // (vectorisation -> TFIDF transformation -> Truncated SVD) one could use a
284 | // Pipeline as follows:
285 | // lsaPipeline := NewPipeline(NewCountVectoriser(false), NewTfidfTransformer(), NewTruncatedSVD(100))
286 | //
287 | type Pipeline struct {
288 | Vectoriser Vectoriser
289 | Transformers []Transformer
290 | }
291 |
292 | // NewPipeline constructs a new processing pipeline with the supplied Vectoriser
293 | // and one or more transformers.
294 | func NewPipeline(vectoriser Vectoriser, transformers ...Transformer) *Pipeline {
295 | pipeline := Pipeline{
296 | Vectoriser: vectoriser,
297 | Transformers: transformers,
298 | }
299 |
300 | return &pipeline
301 | }
302 |
303 | // Fit fits the model(s) to the supplied training data
304 | func (p *Pipeline) Fit(docs ...string) Vectoriser {
305 | if _, err := p.FitTransform(docs...); err != nil {
306 | panic("nlp: Failed to Fit pipeline because " + err.Error())
307 | }
308 |
309 | return p
310 | }
311 |
312 | // Transform transforms the supplied documents into a matrix representation
313 | // of numerical feature vectors using the model(s) previously fitted to the
314 | // supplied training data.
315 | func (p *Pipeline) Transform(docs ...string) (mat.Matrix, error) {
316 | matrix, err := p.Vectoriser.Transform(docs...)
317 | if err != nil {
318 | return matrix, err
319 | }
320 | for _, t := range p.Transformers {
321 | matrix, err = t.Transform(matrix)
322 | if err != nil {
323 | return matrix, err
324 | }
325 | }
326 | return matrix, nil
327 | }
328 |
329 | // FitTransform transforms the supplied documents into a matrix representation
330 | // of numerical feature vectors fitting the model to the supplied data in the
331 | // process.
332 | func (p *Pipeline) FitTransform(docs ...string) (mat.Matrix, error) {
333 | matrix, err := p.Vectoriser.FitTransform(docs...)
334 | if err != nil {
335 | return matrix, err
336 | }
337 | for _, t := range p.Transformers {
338 | matrix, err = t.FitTransform(matrix)
339 | if err != nil {
340 | return matrix, err
341 | }
342 | }
343 | return matrix, nil
344 | }
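
// A standalone sketch (not part of this file) composing an LSA-style pipeline
// as described in the Pipeline comment above. NewTfidfTransformer and
// NewTruncatedSVD are assumed to exist with the signatures implied by that
// comment; the corpus and dimensions are illustrative only.
package main

import (
	"fmt"

	"github.com/james-bowman/nlp"
)

func main() {
	corpus := []string{
		"The quick brown fox jumped over the lazy dog",
		"hey diddle diddle, the cat and the fiddle",
		"the cow jumped over the moon",
	}

	// vectorise -> TFIDF weighting -> truncated SVD (LSA)
	pipeline := nlp.NewPipeline(
		nlp.NewHashingVectoriser(1<<20),
		nlp.NewTfidfTransformer(),
		nlp.NewTruncatedSVD(2),
	)

	lsa, err := pipeline.FitTransform(corpus...)
	if err != nil {
		panic(err)
	}

	rows, cols := lsa.Dims()
	fmt.Printf("LSA matrix is %d x %d\n", rows, cols)
}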
345 |
--------------------------------------------------------------------------------
/randomprojection.go:
--------------------------------------------------------------------------------
1 | package nlp
2 |
3 | import (
4 | "math"
5 | "time"
6 |
7 | "golang.org/x/exp/rand"
8 |
9 | "github.com/james-bowman/sparse"
10 | "gonum.org/v1/gonum/mat"
11 | "gonum.org/v1/gonum/stat/distuv"
12 | "gonum.org/v1/gonum/stat/sampleuv"
13 | )
14 |
15 | // SignRandomProjection represents a transform of a matrix into a lower
16 | // dimensional space. Sign Random Projection is a method of Locality
17 | // Sensitive Hashing (LSH) sometimes referred to as the random hyperplane method.
18 | // A set of random hyperplanes are created in the original dimensional
19 | // space and then input matrices are expressed relative to the random
20 | // hyperplanes as follows:
21 | // For each column vector in the input matrix, construct a corresponding output
22 | // bit vector with each bit (i) calculated as follows:
23 | // if dot(vector, hyperplane[i]) > 0
24 | // bit[i] = 1
25 | // else
26 | // bit[i] = 0
27 | // Whilst similar to other methods of random projection this method is unique in that
28 | // it uses only a single bit in the output matrix to represent the sign of the result
29 | // of the comparison (Dot product) with each hyperplane so encodes vector
30 | // representations with very low memory and processor requirements whilst preserving
31 | // relative distance between vectors from the original space.
32 | // Hamming similarity (and distance) between the transformed vectors in the
33 | // subspace can approximate Angular similarity (and distance) (which is strongly
34 | // related to Cosine similarity) of the associated vectors from the original space.
35 | type SignRandomProjection struct {
36 | // Bits represents the number of bits the output vectors should
37 | // be in length and hence the number of random hyperplanes needed
38 | // for the transformation
39 | Bits int
40 |
41 | // simhash is the simhash LSH (Locality Sensitive Hashing) algorithm
42 | // used to perform the sign random projection
43 | simHash *SimHash
44 | }
45 |
46 | // NewSignRandomProjection constructs a new SignRandomProjection transformer
47 | // to reduce dimensionality. The transformer uses `bits` random hyperplanes and
48 | // so `bits` is also the dimensionality (in bits) of the output, transformed
49 | // matrices.
50 | func NewSignRandomProjection(bits int) *SignRandomProjection {
51 | return &SignRandomProjection{Bits: bits}
52 | }
53 |
54 | // Fit creates the random hyperplanes from the input training data matrix, m, and
55 | // stores the hyperplanes as a transform to apply to matrices.
56 | func (s *SignRandomProjection) Fit(m mat.Matrix) Transformer {
57 | rows, _ := m.Dims()
58 | s.simHash = NewSimHash(s.Bits, rows)
59 | return s
60 | }
61 |
62 | // Transform applies the transform decomposed from the training data matrix in Fit()
63 | // to the input matrix. The columns in the resulting output matrix will be a low
64 | // dimensional binary representation of the columns within the original matrix
65 | // i.e. a hash or fingerprint that can be quickly and efficiently compared with other
66 | // similar vectors. Hamming similarity in the new dimensional space can be
67 | // used to approximate Cosine similarity between the vectors of the original space.
68 | // The returned matrix is a Binary matrix or BinaryVec type depending
69 | // upon whether m is Matrix or Vector.
70 | func (s *SignRandomProjection) Transform(m mat.Matrix) (mat.Matrix, error) {
71 | _, cols := m.Dims()
72 |
73 | sigs := make([]sparse.BinaryVec, cols)
74 | ColDo(m, func(j int, v mat.Vector) {
75 | sigs[j] = *s.simHash.Hash(v)
76 | })
77 | return sparse.NewBinary(s.Bits, cols, sigs), nil
78 | }
79 |
80 | // FitTransform is approximately equivalent to calling Fit() followed by Transform()
81 | // on the same matrix. This is a useful shortcut where separate training data is not being
82 | // used to fit the model i.e. the model is fitted on the fly to the test data.
83 | // The returned matrix is a Binary matrix or BinaryVec type depending upon
84 | // whether m is Matrix or Vector.
85 | func (s *SignRandomProjection) FitTransform(m mat.Matrix) (mat.Matrix, error) {
86 | return s.Fit(m).Transform(m)
87 | }
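
// A standalone sketch (not part of this file) showing how SignRandomProjection
// might be used to hash column vectors into short binary signatures. It assumes
// the package is importable as github.com/james-bowman/nlp; the toy input
// matrix and the bit length are illustrative only.
package main

import (
	"fmt"

	"github.com/james-bowman/nlp"
	"gonum.org/v1/gonum/mat"
)

func main() {
	// 6 x 3 matrix: each column is a vector in the original 6 dimensional space.
	data := mat.NewDense(6, 3, []float64{
		1, 0, 1,
		0, 1, 0,
		1, 0, 1,
		0, 1, 1,
		1, 0, 0,
		0, 1, 1,
	})

	// Project onto 64 random hyperplanes, producing one 64 bit signature per
	// column whose Hamming similarity approximates angular similarity.
	transformer := nlp.NewSignRandomProjection(64)

	signatures, err := transformer.FitTransform(data)
	if err != nil {
		panic(err)
	}

	rows, cols := signatures.Dims()
	fmt.Printf("signature matrix is %d x %d\n", rows, cols)
}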
88 |
89 | // RandomProjection is a method of dimensionality reduction based upon
90 | // the Johnson–Lindenstrauss lemma stating that a small set of points
91 | // in a high-dimensional space can be embedded into a space of much
92 | // lower dimension in such a way that distances between the points
93 | // are nearly preserved.
94 | //
95 | // The technique projects the original
96 | // matrix orthogonally onto a random subspace, transforming the
97 | // elements of the original matrix into a lower dimensional representation.
98 | // Computing orthogonal matrices is expensive and so this technique
99 | // uses specially generated random matrices (hence the name) following
100 | // the principle that in high dimensional spaces, there are lots of
101 | // nearly orthogonal matrices.
102 | type RandomProjection struct {
103 | K int
104 | Density float64
105 | rnd *rand.Rand
106 | projections mat.Matrix
107 | }
108 |
109 | // NewRandomProjection creates and returns a new RandomProjection
110 | // transformer. The RandomProjection will use a specially generated
111 | // random matrix of the specified density and dimensionality k to
112 | // perform the transform to k dimensional space.
113 | func NewRandomProjection(k int, density float64) *RandomProjection {
114 | r := RandomProjection{
115 | K: k,
116 | Density: density,
117 | }
118 |
119 | return &r
120 | }
121 |
122 | // Fit creates the random (almost) orthogonal matrix used to project
123 | // input matrices into the new reduced dimensional subspace.
124 | func (r *RandomProjection) Fit(m mat.Matrix) Transformer {
125 | rows, _ := m.Dims()
126 | r.projections = CreateRandomProjectionTransform(r.K, rows, r.Density, r.rnd)
127 | return r
128 | }
129 |
130 | // Transform applies the transformation, projecting the input matrix
131 | // into the reduced dimensional subspace. The transformed matrix
132 | // will be a sparse CSR format matrix of shape k x c.
133 | func (r *RandomProjection) Transform(m mat.Matrix) (mat.Matrix, error) {
134 | var product sparse.CSR
135 |
136 | // projections will be dimensions k x r (k x t)
137 | // m will be dimensions r x c (t x d)
138 | // product will be of reduced dimensions k x c (k x d)
139 | if t, isTypeConv := m.(sparse.TypeConverter); isTypeConv {
140 | m = t.ToCSR()
141 | }
142 |
143 | product.Mul(r.projections, m)
144 |
145 | return &product, nil
146 | }
147 |
148 | // FitTransform is approximately equivalent to calling Fit() followed by Transform()
149 | // on the same matrix. This is a useful shortcut where separate training data is not being
150 | // used to fit the model i.e. the model is fitted on the fly to the test data.
151 | // The returned matrix is a sparse CSR format matrix of shape k x c.
152 | func (r *RandomProjection) FitTransform(m mat.Matrix) (mat.Matrix, error) {
153 | return r.Fit(m).Transform(m)
154 | }
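
// A standalone sketch (not part of this file) showing RandomProjection reducing
// the dimensionality of a term document matrix. It assumes the package is
// importable as github.com/james-bowman/nlp; the corpus, target dimensionality
// and density values are illustrative only.
package main

import (
	"fmt"

	"github.com/james-bowman/nlp"
)

func main() {
	corpus := []string{
		"The quick brown fox jumped over the lazy dog",
		"hey diddle diddle, the cat and the fiddle",
		"the cow jumped over the moon",
	}

	// Encode the corpus as a term document matrix.
	vectoriser := nlp.NewHashingVectoriser(1000)
	tdm, err := vectoriser.FitTransform(corpus...)
	if err != nil {
		panic(err)
	}

	// Project down to k = 100 dimensions using a sparse random matrix in
	// which roughly 1 element in 10 is non-zero.
	projector := nlp.NewRandomProjection(100, 0.1)
	reduced, err := projector.FitTransform(tdm)
	if err != nil {
		panic(err)
	}

	rows, cols := reduced.Dims()
	fmt.Printf("reduced matrix is %d x %d\n", rows, cols)
}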
155 |
156 | // RRIBasis represents the initial basis for the index/elemental vectors
157 | // used for Reflective Random Indexing
158 | type RRIBasis int
159 |
160 | const (
161 | // DocBasedRRI represents columns (documents/contexts in a term-document
162 | // matrix) forming the initial basis for index/elemental vectors in Random Indexing
163 | DocBasedRRI RRIBasis = iota
164 |
165 | // TermBasedRRI indicates rows (terms in a term-document matrix)
166 | // form the initial basis for index/elemental vectors in Reflective Random Indexing.
167 | TermBasedRRI
168 | )
169 |
170 | // RandomIndexing is a method of dimensionality reduction used for Latent Semantic
171 | // Analysis in a similar way to TruncatedSVD and PCA. Random
172 | // Indexing is designed to address the scalability limitations of very high
173 | // dimensional vector space models of term co-occurrence in language
174 | // processing, such as the SVD typically used for LSA/LSI (Latent
175 | // Semantic Analysis/Latent Semantic Indexing). In implementation
176 | // it bears some similarity to other random projection techniques
177 | // such as those implemented in RandomProjection and SignRandomProjection
178 | // within this package.
179 | // The RandomIndexing type can also be used to perform Reflective
180 | // Random Indexing which extends the Random Indexing model with additional
181 | // training cycles to better support indirect inference i.e. finding synonyms
182 | // where the words do not appear together in documents.
183 | type RandomIndexing struct {
184 | // K specifies the number of dimensions for the semantic space
185 | K int
186 |
187 | // Density specifies the proportion of non-zero elements in the
188 | // elemental vectors
189 | Density float64
190 |
191 | // Type specifies the initial basis for the elemental vectors
192 | // i.e. whether they initially represent the rows or columns
193 | 	// This is only relevant for Reflective Random Indexing
194 | Type RRIBasis
195 |
196 | // Reflections specifies the number of reflective training cycles
197 | // to run during fitting for RRI (Reflective Random Indexing). For
198 | 	// Random Indexing (non-reflective) this is 0.
199 | Reflections int
200 |
201 | rnd *rand.Rand
202 |
203 | // components is a k x t matrix where `t` is the number of terms
204 | // (rows) in the training data matrix. The columns in this matrix
205 | // contain the `context` vectors for RI where each column represents
206 | // a semantic representation of a term based upon the contexts
207 | // in which it has appeared within the training data.
208 | components mat.Matrix
209 | }
210 |
211 | // NewRandomIndexing returns a new RandomIndexing transformer
212 | // configured to transform term document matrices into k dimensional
213 | // space. The density parameter specifies the density of the index/elemental
214 | // vectors used to project the input matrix into lower dimensional
215 | // space i.e. the proportion of elements that are non-zero.
216 | func NewRandomIndexing(k int, density float64) *RandomIndexing {
217 | return &RandomIndexing{
218 | K: k,
219 | Density: density,
220 | }
221 | }
222 |
223 | // NewReflectiveRandomIndexing returns a new RandomIndexing type
224 | // configured for Reflective Random Indexing. Reflective Random
225 | // Indexing applies additional (reflective) training cycles on top
226 | // of Random Indexing to capture indirect inferences (synonyms),
227 | // i.e. similarity between terms that do not directly co-occur
228 | // within the same context/document.
229 | // basis specifies the basis for the reflective random indexing i.e.
230 | // whether the initial, random index/elemental vectors should represent
231 | // documents (columns) or terms (rows).
232 | // reflections is the number of additional training cycles to apply
233 | // to build the elemental vectors.
234 | // Specifying basis == DocBasedRRI and reflections == 0 is equivalent
235 | // to conventional Random Indexing.
236 | func NewReflectiveRandomIndexing(k int, basis RRIBasis, reflections int, density float64) *RandomIndexing {
237 | return &RandomIndexing{
238 | K: k,
239 | Type: basis,
240 | Reflections: reflections,
241 | Density: density,
242 | }
243 | }
244 |
245 | // PartialFit extends the model to take account of the specified matrix m. The
246 | // context vectors are learnt and stored to be used for future transformations
247 | // and analysis. PartialFit performs Random Indexing even if the Transformer is
248 | // configured for Reflective Random Indexing so if RRI is required please train
249 | // using the Fit() method as a batch operation. Unlike the Fit() method, the
250 | // PartialFit() method is designed to be called multiple times to support online
251 | // and mini-batch learning whereas the Fit() method is only intended to be called
252 | // once for batch learning.
253 | func (r *RandomIndexing) PartialFit(m mat.Matrix) OnlineTransformer {
254 | rows, cols := m.Dims()
255 |
256 | if r.components == nil || r.components.(*sparse.CSR).IsZero() {
257 | r.components = sparse.NewCSR(r.K, rows, make([]int, r.K+1), []int{}, []float64{})
258 | }
259 | current := r.components
260 |
261 | // Create transform in transpose to get better randomised sparsity patterns
262 | // when partial fitting with small mini-batches e.g. single column/streaming
263 | idxVecs := CreateRandomProjectionTransform(cols, r.K, r.Density, r.rnd).T()
264 | ctxVecs := r.contextualise(m.T(), idxVecs)
265 |
266 | current.(*sparse.CSR).Add(current, ctxVecs)
267 | r.components = current
268 |
269 | return r
270 | }
271 |
272 | // Components returns a t x k matrix where `t` is the number of terms
273 | // (rows) in the training data matrix. The rows in this matrix
274 | // are the `context` vectors for RI each one representing
275 | // a semantic representation of a term based upon the contexts
276 | // in which it has appeared within the training data.
277 | func (r *RandomIndexing) Components() mat.Matrix {
278 | return r.components.T()
279 | }
280 |
281 | // SetComponents sets a t x k matrix where `t` is the number of terms
282 | // (rows) in the training data matrix.
283 | func (r *RandomIndexing) SetComponents(m mat.Matrix) {
284 | r.components = m
285 | }
286 |
287 | // Fit trains the model, creating random index/elemental vectors to
288 | // be used to construct the new projected feature vectors ('context'
289 | // vectors) in the reduced semantic dimensional space. If configured for
290 | // Reflective Random Indexing then Fit may actually run multiple
291 | // training cycles as specified during construction. The Fit method
292 | // trains the model in batch mode so is intended to be called once, for
293 | // online/streaming or mini-batch training please consider the
294 | // PartialFit method instead.
295 | func (r *RandomIndexing) Fit(m mat.Matrix) Transformer {
296 | rows, cols := m.Dims()
297 | var idxVecs mat.Matrix
298 |
299 | if r.Type == TermBasedRRI {
300 | idxVecs = CreateRandomProjectionTransform(r.K, rows, r.Density, r.rnd)
301 | } else {
302 | idxVecs = CreateRandomProjectionTransform(r.K, cols, r.Density, r.rnd)
303 | idxVecs = r.contextualise(m.T(), idxVecs)
304 | }
305 |
306 | for i := 0; i < r.Reflections; i++ {
307 | idxVecs = r.contextualise(m, idxVecs)
308 | idxVecs = r.contextualise(m.T(), idxVecs)
309 | }
310 |
311 | r.components = idxVecs
312 | return r
313 | }
314 |
315 | // FitTransform is approximately equivalent to calling Fit() followed by Transform()
316 | // on the same matrix. This is a useful shortcut where separate training data is not being
317 | // used to fit the model i.e. the model is fitted on the fly to the test data.
318 | // The returned matrix is a sparse CSR format matrix of shape k x c.
319 | func (r *RandomIndexing) FitTransform(m mat.Matrix) (mat.Matrix, error) {
320 | return r.Fit(m).Transform(m)
321 | }
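
// A standalone sketch (not part of this file) showing Random Indexing and
// Reflective Random Indexing applied to a small term document matrix. It
// assumes the package is importable as github.com/james-bowman/nlp; the
// corpus, dimensionality and density values are illustrative only.
package main

import (
	"fmt"

	"github.com/james-bowman/nlp"
)

func main() {
	corpus := []string{
		"The quick brown fox jumped over the lazy dog",
		"hey diddle diddle, the cat and the fiddle",
		"the cow jumped over the moon",
	}

	vectoriser := nlp.NewHashingVectoriser(1000)
	tdm, err := vectoriser.FitTransform(corpus...)
	if err != nil {
		panic(err)
	}

	// Plain Random Indexing into a 100 dimensional semantic space using
	// sparse index vectors in which roughly 1 in 10 elements is non-zero.
	ri := nlp.NewRandomIndexing(100, 0.1)
	reduced, err := ri.FitTransform(tdm)
	if err != nil {
		panic(err)
	}
	rows, cols := reduced.Dims()
	fmt.Printf("Random Indexing: %d x %d\n", rows, cols)

	// Reflective Random Indexing adds extra training cycles (reflections)
	// to capture indirect, second order co-occurrence between terms.
	rri := nlp.NewReflectiveRandomIndexing(100, nlp.TermBasedRRI, 2, 0.1)
	reflected, err := rri.FitTransform(tdm)
	if err != nil {
		panic(err)
	}
	rows, cols = reflected.Dims()
	fmt.Printf("Reflective Random Indexing: %d x %d\n", rows, cols)
}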
322 |
323 | // Transform applies the transform, projecting matrix m into the
324 | // lower dimensional semantic space. The output matrix will be of
325 | // shape k x c and will be a sparse CSR format matrix. The transformation
326 | // for each document vector is simply the accumulation of all trained context
327 | // vectors relating to terms appearing in the document. These are weighted by
328 | // the frequency with which each term appears in the document.
329 | func (r *RandomIndexing) Transform(m mat.Matrix) (mat.Matrix, error) {
330 | return r.contextualise(m, r.components), nil
331 | }
332 |
333 | // contextualise accumulates the vectors from `vectors` for each column in matrix m, weighting
334 | // each vector by its corresponding value within that column of the matrix
335 | func (r *RandomIndexing) contextualise(m mat.Matrix, vectors mat.Matrix) mat.Matrix {
336 | var product sparse.CSR
337 |
338 | product.Mul(vectors, m)
339 |
340 | return &product
341 | }
342 |
343 | // CreateRandomProjectionTransform returns a new random matrix for
344 | // Random Projections of shape newDims x origDims. The matrix will
345 | // be randomly populated using probability distributions where density
346 | // is used as the probability that each element will be populated.
347 | // Populated values will be randomly selected from {-1, 1} and scaled
348 | // according to the density and dimensions of the matrix. If rnd is
349 | // nil then a new random number generator will be created and used.
350 | func CreateRandomProjectionTransform(newDims, origDims int, density float64, rnd *rand.Rand) mat.Matrix {
351 | if rnd == nil {
352 | rnd = rand.New(rand.NewSource(uint64(time.Now().UnixNano())))
353 | }
354 | // TODO Possibly return a mat.Dense instead of sparse.CSR if
355 | // density == 1
356 |
357 | var ptr int
358 | var ind []int
359 | indptr := make([]int, newDims+1)
360 |
361 | for i := 0; i < newDims; i++ {
362 | nnz := binomial(origDims, density, rnd)
363 | if nnz > 0 {
364 | idx := make([]int, nnz)
365 | sampleuv.WithoutReplacement(idx, origDims, rnd)
366 | //sort.Ints(idx)
367 | ind = append(ind, idx...)
368 | ptr += nnz
369 | }
370 | indptr[i+1] = ptr
371 | }
372 |
373 | vals := make([]float64, len(ind))
374 | values(vals, newDims, density, rnd)
375 |
376 | return sparse.NewCSR(newDims, origDims, indptr, ind, vals)
377 | }
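
// A standalone sketch (not part of this file) using CreateRandomProjectionTransform
// directly. Given the values() helper above, non-zero entries are
// +/- sqrt(1/density)/sqrt(newDims), so each of the newDims rows acts as a sparse,
// approximately orthogonal basis vector. It assumes the package is importable as
// github.com/james-bowman/nlp; the dimensions and density below are illustrative only.
package main

import (
	"fmt"

	"github.com/james-bowman/nlp"
	"gonum.org/v1/gonum/mat"
)

func main() {
	// Project vectors from 1000 dimensions down to 50 using a matrix in
	// which roughly 10% of the elements are populated. A nil rnd means a
	// new random number generator is created internally.
	projection := nlp.CreateRandomProjectionTransform(50, 1000, 0.1, nil)

	// A single (dense) column vector in the original 1000 dimensional space.
	v := mat.NewVecDense(1000, nil)
	v.SetVec(3, 1)
	v.SetVec(42, 2)

	var reduced mat.Dense
	reduced.Mul(projection, v)

	rows, cols := reduced.Dims()
	fmt.Printf("projected vector is %d x %d\n", rows, cols)
}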
378 |
379 | func binomial(n int, p float64, rnd *rand.Rand) int {
380 | dist := distuv.Bernoulli{
381 | P: p,
382 | // Should this be Source (Gonum code and docs seem out of sync)
383 | Src: rnd,
384 | }
385 |
386 | var x int
387 | for i := 0; i < n; i++ {
388 | x += int(dist.Rand())
389 | }
390 | return x
391 | }
392 |
393 | func values(idx []float64, dims int, density float64, rnd *rand.Rand) {
394 | dist := distuv.Bernoulli{
395 | P: 0.5,
396 | // Should this be Source (Gonum code and docs seem out of sync)
397 | Src: rnd,
398 | }
399 |
400 | factor := math.Sqrt(1.0/density) / math.Sqrt(float64(dims))
401 | for i := range idx {
402 | idx[i] = factor * (dist.Rand()*2 - 1)
403 | }
404 | }
405 |
--------------------------------------------------------------------------------
/lda.go:
--------------------------------------------------------------------------------
1 | package nlp
2 |
3 | import (
4 | "math"
5 | "runtime"
6 | "sync"
7 | "time"
8 |
9 | "github.com/james-bowman/sparse"
10 | "golang.org/x/exp/rand"
11 | "gonum.org/v1/gonum/mat"
12 | )
13 |
14 | // LearningSchedule is used to calculate the learning rate for each iteration using a natural
15 | // gradient descent algorithm.
16 | type LearningSchedule struct {
17 | // S is the scale of the step size for the learning rate.
18 | S float64
19 |
20 | // Tau is the learning offset. The learning offset downweights the
21 | // learning rate from early iterations.
22 | Tau float64
23 |
24 | 	// Kappa is the exponent controlling the learning decay i.e. how quickly the
25 | 	// learning rate shrinks over iterations. This is typically a value between 0.5 and 1.0.
26 | Kappa float64
27 | }
28 |
29 | // Calc returns the learning rate for the specified iteration
30 | func (l LearningSchedule) Calc(iteration float64) float64 {
31 | return l.S / math.Pow(l.Tau+iteration, l.Kappa)
32 | }
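
// A standalone sketch (not part of this file) illustrating the learning
// schedule above: the rate for iteration t is S / (Tau + t)^Kappa, so with the
// values below the step size decays monotonically towards zero as the
// iteration count grows. It assumes the package is importable as
// github.com/james-bowman/nlp; the schedule parameters are illustrative only.
package main

import (
	"fmt"

	"github.com/james-bowman/nlp"
)

func main() {
	schedule := nlp.LearningSchedule{S: 1, Tau: 10, Kappa: 0.9}

	for _, t := range []float64{1, 10, 100, 1000} {
		fmt.Printf("iteration %4.0f: learning rate %.4f\n", t, schedule.Calc(t))
	}
}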
33 |
34 | type ldaMiniBatch struct {
35 | start, end int
36 | nPhiHat []float64
37 | nZHat []float64
38 | gamma []float64
39 | }
40 |
41 | func newLdaMiniBatch(topics int, words int) *ldaMiniBatch {
42 | l := ldaMiniBatch{
43 | nPhiHat: make([]float64, topics*words),
44 | nZHat: make([]float64, topics),
45 | gamma: make([]float64, topics),
46 | }
47 | return &l
48 | }
49 |
50 | func (l *ldaMiniBatch) reset() {
51 | for i := range l.nPhiHat {
52 | l.nPhiHat[i] = 0
53 | }
54 | for i := range l.nZHat {
55 | l.nZHat[i] = 0
56 | }
57 | // assume gamma does not need to be zeroed between mini batches
58 | }
59 |
60 | // LatentDirichletAllocation (LDA) for fast unsupervised topic extraction. LDA processes
61 | // documents and learns their latent topic model estimating the posterior document over topic
62 | // probability distribution (the probabilities of each document being allocated to each
63 | // topic) and the posterior topic over word probability distribution.
64 | //
65 | // This transformer uses a parallel implementation of the
66 | // SCVB0 (Stochastic Collapsed Variational Bayes) Algorithm (https://arxiv.org/pdf/1305.2452.pdf)
67 | // by Jimmy Foulds with optional `clumping` optimisations.
68 | type LatentDirichletAllocation struct {
69 | // Iterations is the maximum number of training iterations
70 | Iterations int
71 |
72 | // PerplexityTolerance is the tolerance of perplexity below which the Fit method will stop iterating
73 | 	// and complete. If the evaluated perplexity is below the tolerance, fitting will terminate successfully
74 | // without necessarily completing all of the configured number of training iterations.
75 | PerplexityTolerance float64
76 |
77 | 	// PerplexityEvaluationFrequency is the frequency with which to test Perplexity against PerplexityTolerance inside
78 | // Fit. A value <= 0 will not evaluate Perplexity at all and simply iterate for `Iterations` iterations.
79 | PerplexityEvaluationFrequency int
80 |
81 | // BatchSize is the size of mini batches used during training
82 | BatchSize int
83 |
84 | // K is the number of topics
85 | K int
86 |
87 | 	// BurnInPasses is the number of `burn-in` passes across the documents in the
88 | // training data to learn the document statistics before we start collecting topic statistics.
89 | BurnInPasses int
90 |
91 | // TransformationPasses is the number of passes to transform new documents given a previously
92 | // fitted topic model
93 | TransformationPasses int
94 |
95 | // MeanChangeTolerance is the tolerance of change to Theta between burn in passes.
96 | // If the level of change between passes is below the tolerance, the burn in will complete
97 | // without necessarily completing the configured number of passes.
98 | MeanChangeTolerance float64
99 |
100 | 	// ChangeEvaluationFrequency is the frequency with which to test the mean change in theta against
101 | 	// MeanChangeTolerance during burn-in and transformation. A value <= 0 will not evaluate
102 | // the mean change at all and simply iterate for `BurnInPasses` iterations.
103 | ChangeEvaluationFrequency int
104 |
105 | // Alpha is the prior of theta (the documents over topics distribution)
106 | Alpha float64
107 |
108 | // Eta is the prior of phi (the topics over words distribution)
109 | Eta float64
110 |
111 | // RhoPhi is the learning rate for phi (the topics over words distribution)
112 | RhoPhi LearningSchedule
113 |
114 | // RhoTheta is the learning rate for theta (the documents over topics distribution)
115 | RhoTheta LearningSchedule
116 |
117 | rhoPhiT float64
118 | rhoThetaT float64
119 |
120 | wordsInCorpus float64
121 | w, d int
122 |
123 | // Rnd is the random number generator used to generate the initial distributions
124 | // for nTheta (the document over topic distribution), nPhi (the topic over word
125 | // distribution) and nZ (the topic assignments).
126 | Rnd *rand.Rand
127 |
128 | // mutexes for updating global topic statistics
129 | phiMutex sync.Mutex
130 | zMutex sync.Mutex
131 |
132 | // Processes is the degree of parallelisation, or more specifically, the number of
133 | // concurrent go routines to use during fitting.
134 | Processes int
135 |
136 | // nPhi is the topics over words distribution
137 | nPhi []float64
138 |
139 | // nZ is the topic assignments
140 | nZ []float64
141 | }
142 |
143 | // NewLatentDirichletAllocation returns a new LatentDirichletAllocation type initialised
144 | // with default values for k topics.
145 | func NewLatentDirichletAllocation(k int) *LatentDirichletAllocation {
146 | // TODO:
147 | // - Add FitPartial (and FitPartialTransform?) methods
148 | // - refactor word counting
149 | // - rename and check rhoTheta_t and rhoPhi_t
150 | 	// - Check visibility of member variables
151 | // - Try parallelising:
152 | // - minibatches
153 | // - individual docs within minibatches
154 | // - M step
155 | // - other areas
156 | 	// - investigate whether fitMiniBatch and burnIn can be combined/consolidated
157 | // - Check whether nPhi could be sparse
158 | // - Add persistence methods
159 |
160 | l := LatentDirichletAllocation{
161 | Iterations: 1000,
162 | PerplexityTolerance: 1e-2,
163 | PerplexityEvaluationFrequency: 30,
164 | BatchSize: 100,
165 | K: k,
166 | BurnInPasses: 1,
167 | TransformationPasses: 500,
168 | MeanChangeTolerance: 1e-5,
169 | ChangeEvaluationFrequency: 30,
170 | Alpha: 0.1,
171 | Eta: 0.01,
172 | RhoPhi: LearningSchedule{
173 | S: 10,
174 | Tau: 1000,
175 | Kappa: 0.9,
176 | },
177 | RhoTheta: LearningSchedule{
178 | S: 1,
179 | Tau: 10,
180 | Kappa: 0.9,
181 | },
182 | rhoPhiT: 1,
183 | rhoThetaT: 1,
184 | Rnd: rand.New(rand.NewSource(uint64(time.Now().UnixNano()))),
185 | Processes: runtime.GOMAXPROCS(0),
186 | }
187 |
188 | return &l
189 | }
190 |
191 | // init initialises model for fitting allocating memory for distributions and
192 | // randomising initial values.
193 | func (l *LatentDirichletAllocation) init(m mat.Matrix) {
194 | r, c := m.Dims()
195 | l.w, l.d = r, c
196 | l.nPhi = make([]float64, l.K*r)
197 | l.nZ = make([]float64, l.K)
198 | var v float64
199 | for i := 0; i < r; i++ {
200 | for k := 0; k < l.K; k++ {
201 | v = float64((l.Rnd.Int() % (r * l.K))) / float64(r*l.K)
202 | l.nPhi[i*l.K+k] = v
203 | l.nZ[k] += v
204 | }
205 | }
206 | }
207 |
208 | // Fit fits the model to the specified matrix m. The latent topics, and probability
209 | // distribution of topics over words, are learnt and stored to be used for future transformations
210 | // and analysis.
211 | func (l *LatentDirichletAllocation) Fit(m mat.Matrix) Transformer {
212 | l.FitTransform(m)
213 | return l
214 | }
215 |
216 | // burnInDoc calculates document statistics as part of fitting and transforming new
217 | // documents
218 | func (l *LatentDirichletAllocation) burnInDoc(j int, iterations int, m mat.Matrix, wc float64, gamma *[]float64, nTheta []float64) {
219 | var rhoTheta float64
220 | var sum, prevSum float64
221 | var thetaInd int
222 |
223 | for counter := 1; counter <= iterations; counter++ {
224 | if l.ChangeEvaluationFrequency > 0 && counter%l.ChangeEvaluationFrequency == 0 && 1 < iterations {
225 | // take a copy of current column j
226 | prevSum = 0
227 | for k := 0; k < l.K; k++ {
228 | prevSum += nTheta[j*l.K+k]
229 | }
230 | }
231 | rhoTheta = l.RhoTheta.Calc(l.rhoThetaT + float64(counter))
232 | ColNonZeroElemDo(m, j, func(i, j int, v float64) {
233 | var gammaSum float64
234 | for k := 0; k < l.K; k++ {
235 | // Eqn. 5.
236 | (*gamma)[k] = ((l.nPhi[i*l.K+k] + l.Eta) * (nTheta[j*l.K+k] + l.Alpha) / (l.nZ[k] + l.Eta*float64(l.w)))
237 | gammaSum += (*gamma)[k]
238 | }
239 |
240 | for k := 0; k < l.K; k++ {
241 | (*gamma)[k] /= gammaSum
242 | }
243 |
244 | for k := 0; k < l.K; k++ {
245 | // Eqn. 9.
246 | thetaInd = j*l.K + k
247 | nTheta[thetaInd] = ((math.Pow((1.0-rhoTheta), v) * nTheta[thetaInd]) +
248 | ((1 - math.Pow((1.0-rhoTheta), v)) * wc * (*gamma)[k]))
249 | }
250 | })
251 | if l.ChangeEvaluationFrequency > 0 && counter%l.ChangeEvaluationFrequency == 0 && counter < iterations {
252 | sum = 0
253 | for k := 0; k < l.K; k++ {
254 | sum += nTheta[j*l.K+k]
255 | }
256 | if math.Abs(sum-prevSum)/float64(l.K) < l.MeanChangeTolerance {
257 | break
258 | }
259 | }
260 | }
261 | }
262 |
263 | // fitMiniBatch fits a proportion of the matrix as specified by miniBatch. The
264 | // algorithm is stochastic: it computes estimates across the minibatch and then applies
265 | // those estimates to the global statistics.
266 | func (l *LatentDirichletAllocation) fitMiniBatch(miniBatch *ldaMiniBatch, wc []float64, nTheta []float64, m mat.Matrix) {
267 | var rhoTheta float64
268 | batchSize := miniBatch.end - miniBatch.start
269 | var phiInd, thetaInd int
270 |
271 | for j := miniBatch.start; j < miniBatch.end; j++ {
272 | l.burnInDoc(j, l.BurnInPasses, m, wc[j], &miniBatch.gamma, nTheta)
273 |
274 | rhoTheta = l.RhoTheta.Calc(l.rhoThetaT + float64(l.BurnInPasses))
275 | ColNonZeroElemDo(m, j, func(i, j int, v float64) {
276 | var gammaSum float64
277 | for k := 0; k < l.K; k++ {
278 | // Eqn. 5.
279 | miniBatch.gamma[k] = ((l.nPhi[i*l.K+k] + l.Eta) * (nTheta[j*l.K+k] + l.Alpha) / (l.nZ[k] + l.Eta*float64(l.w)))
280 | gammaSum += miniBatch.gamma[k]
281 | }
282 | for k := 0; k < l.K; k++ {
283 | miniBatch.gamma[k] /= gammaSum
284 | }
285 |
286 | for k := 0; k < l.K; k++ {
287 | // Eqn. 9.
288 | thetaInd = j*l.K + k
289 | nTheta[thetaInd] = ((math.Pow((1.0-rhoTheta), v) * nTheta[thetaInd]) +
290 | ((1 - math.Pow((1.0-rhoTheta), v)) * wc[j] * miniBatch.gamma[k]))
291 |
292 | // calculate sufficient stats
293 | nv := l.wordsInCorpus * miniBatch.gamma[k] / float64(batchSize)
294 | miniBatch.nPhiHat[i*l.K+k] += nv
295 | miniBatch.nZHat[k] += nv
296 | }
297 | })
298 | }
299 | rhoPhi := l.RhoPhi.Calc(l.rhoPhiT)
300 | l.rhoPhiT++
301 |
302 | // Eqn. 7.
303 | l.phiMutex.Lock()
304 | for w := 0; w < l.w; w++ {
305 | for k := 0; k < l.K; k++ {
306 | phiInd = w*l.K + k
307 | l.nPhi[phiInd] = ((1.0 - rhoPhi) * l.nPhi[phiInd]) + (rhoPhi * miniBatch.nPhiHat[phiInd])
308 | }
309 | }
310 | l.phiMutex.Unlock()
311 |
312 | // Eqn. 8.
313 | l.zMutex.Lock()
314 | for k := 0; k < l.K; k++ {
315 | l.nZ[k] = ((1.0 - rhoPhi) * l.nZ[k]) + (rhoPhi * miniBatch.nZHat[k])
316 | }
317 | l.zMutex.Unlock()
318 | }
319 |
320 | // normaliseTheta normalises theta to derive the posterior probability estimates for
321 | // documents over topics. All values for each document are divided by the sum of all
322 | // values for the document.
323 | func (l *LatentDirichletAllocation) normaliseTheta(theta []float64, result []float64) []float64 {
324 | //adjustment := l.Alpha
325 | adjustment := 0.0
326 | c := len(theta) / l.K
327 | if result == nil {
328 | result = make([]float64, l.K*c)
329 | }
330 | for j := 0; j < c; j++ {
331 | var sum float64
332 | for k := 0; k < l.K; k++ {
333 | sum += theta[j*l.K+k] + adjustment
334 | }
335 | for k := 0; k < l.K; k++ {
336 | result[j*l.K+k] = (theta[j*l.K+k] + adjustment) / sum
337 | }
338 | }
339 | return result
340 | }
341 |
342 | // normalisePhi normalises phi to derive the posterior probability estimates for
343 | // topics over words. All values for each topic are divided by the sum of all values
344 | // for the topic.
345 | func (l *LatentDirichletAllocation) normalisePhi(phi []float64, result []float64) []float64 {
346 | //adjustment := l.Eta
347 | adjustment := 0.0
348 | if result == nil {
349 | result = make([]float64, l.K*l.w)
350 | }
351 | sum := make([]float64, l.K)
352 | for i := 0; i < l.w; i++ {
353 | for k := 0; k < l.K; k++ {
354 | sum[k] += phi[i*l.K+k] + adjustment
355 | }
356 | }
357 | for i := 0; i < l.w; i++ {
358 | for k := 0; k < l.K; k++ {
359 | result[i*l.K+k] = (phi[i*l.K+k] + adjustment) / sum[k]
360 | }
361 | }
362 | return result
363 | }
364 |
365 | // Perplexity calculates the perplexity of the matrix m against the trained model.
366 | // m is first transformed into corresponding posterior estimates for document over topic
367 | // distributions and then used to calculate the perplexity.
368 | func (l *LatentDirichletAllocation) Perplexity(m mat.Matrix) float64 {
369 | if t, isTypeConv := m.(sparse.TypeConverter); isTypeConv {
370 | m = t.ToCSC()
371 | }
372 | var wordCount float64
373 | r, c := m.Dims()
374 |
375 | if s, isSparse := m.(sparse.Sparser); isSparse {
376 | s.DoNonZero(func(i, j int, v float64) {
377 | wordCount += v
378 | })
379 | } else {
380 | for i := 0; i < r; i++ {
381 | for j := 0; j < c; j++ {
382 | wordCount += m.At(i, j)
383 | }
384 | }
385 | }
386 |
387 | theta := l.unNormalisedTransform(m)
388 | return l.perplexity(m, wordCount, l.normaliseTheta(theta, theta), l.normalisePhi(l.nPhi, nil))
389 | }
390 |
391 | // perplexity returns the perplexity of the matrix against the model.
392 | func (l *LatentDirichletAllocation) perplexity(m mat.Matrix, sum float64, nTheta []float64, nPhi []float64) float64 {
393 | _, c := m.Dims()
394 | var perplexity float64
395 | var ttlLogWordProb float64
396 |
397 | for j := 0; j < c; j++ {
398 | ColNonZeroElemDo(m, j, func(i, j int, v float64) {
399 | var dot float64
400 | for k := 0; k < l.K; k++ {
401 | dot += nPhi[i*l.K+k] * nTheta[j*l.K+k]
402 | }
403 | ttlLogWordProb += math.Log2(dot) * v
404 | })
405 | }
406 | perplexity = math.Exp2(-ttlLogWordProb / sum)
407 | return perplexity
408 | }
409 |
410 | // Components returns the topic over words probability distribution. The returned
411 | // matrix is of dimensions K x W where W is the number of rows in the training matrix,
412 | // each column represents a unique word in the vocabulary and K is the number of
413 | // topics.
414 | func (l *LatentDirichletAllocation) Components() mat.Matrix {
415 | return mat.DenseCopyOf(mat.NewDense(l.w, l.K, l.normalisePhi(l.nPhi, nil)).T())
416 | }
417 |
418 | // unNormalisedTransform performs an unNormalisedTransform - the output
419 | // needs to be normalised using normaliseTheta before use.
420 | func (l *LatentDirichletAllocation) unNormalisedTransform(m mat.Matrix) []float64 {
421 | _, c := m.Dims()
422 | theta := make([]float64, l.K*c)
423 | for i := range theta {
424 | //data[i] = rnd.Float64() + 0.5
425 | theta[i] = float64((l.Rnd.Int() % (c * l.K))) / float64(c*l.K)
426 | }
427 | gamma := make([]float64, l.K)
428 |
429 | for j := 0; j < c; j++ {
430 | var wc float64
431 | ColNonZeroElemDo(m, j, func(i, j int, v float64) {
432 | wc += v
433 | })
434 | l.burnInDoc(j, l.TransformationPasses, m, wc, &gamma, theta)
435 | }
436 | return theta
437 | }
438 |
439 | // Transform transforms the input matrix into a matrix representing the distribution
440 | // of the documents over topics.
441 | // The returned matrix contains the document over topic distributions where each element
442 | // is the probability of the corresponding document being related to the corresponding
443 | // topic. The returned matrix is a Dense matrix of shape K x C where K is the number
444 | // of topics and C is the number of columns in the input matrix (representing the
445 | // documents).
446 | func (l *LatentDirichletAllocation) Transform(m mat.Matrix) (mat.Matrix, error) {
447 | if t, isTypeConv := m.(sparse.TypeConverter); isTypeConv {
448 | m = t.ToCSC()
449 | }
450 | _, c := m.Dims()
451 | theta := l.unNormalisedTransform(m)
452 | return mat.DenseCopyOf(mat.NewDense(c, l.K, l.normaliseTheta(theta, theta)).T()), nil
453 | }
454 |
455 | // FitTransform is approximately equivalent to calling Fit() followed by Transform()
456 | // on the same matrix. This is a useful shortcut where separate training data is not being
457 | // used to fit the model i.e. the model is fitted on the fly to the test data.
458 | // The returned matrix contains the document over topic distributions where each element
459 | // is the probability of the corresponding document being related to the corresponding
460 | // topic. The returned matrix is a Dense matrix of shape K x C where K is the number
461 | // of topics and C is the number of columns in the input matrix (representing the
462 | // documents).
463 | func (l *LatentDirichletAllocation) FitTransform(m mat.Matrix) (mat.Matrix, error) {
464 | if t, isTypeConv := m.(sparse.TypeConverter); isTypeConv {
465 | m = t.ToCSC()
466 | }
467 |
468 | l.init(m)
469 |
470 | _, c := m.Dims()
471 |
472 | nTheta := make([]float64, l.K*c)
473 | for i := 0; i < l.K*c; i++ {
474 | nTheta[i] = float64((l.Rnd.Int() % (c * l.K))) / float64(c*l.K)
475 | }
476 | wc := make([]float64, c)
477 | for j := 0; j < c; j++ {
478 | ColNonZeroElemDo(m, j, func(i, j int, v float64) {
479 | wc[j] += v
480 | })
481 | l.wordsInCorpus += wc[j]
482 | }
483 |
484 | var phiProb []float64
485 | var thetaProb []float64
486 |
487 | numMiniBatches := int(math.Ceil(float64(c) / float64(l.BatchSize)))
488 | processes := l.Processes
489 | if numMiniBatches < l.Processes {
490 | processes = numMiniBatches
491 | }
492 | miniBatches := make([]*ldaMiniBatch, processes)
493 | for i := range miniBatches {
494 | miniBatches[i] = newLdaMiniBatch(l.K, l.w)
495 | }
496 |
497 | l.rhoPhiT = 1
498 | var perplexity float64
499 | var prevPerplexity float64
500 |
501 | for it := 0; it < l.Iterations; it++ {
502 | l.rhoThetaT++
503 |
504 | mb := make(chan int)
505 | var wg sync.WaitGroup
506 |
507 | for process := 0; process < processes; process++ {
508 | wg.Add(1)
509 | go func(miniBatch *ldaMiniBatch) {
510 | defer wg.Done()
511 | for j := range mb {
512 | miniBatch.reset()
513 | miniBatch.start = j * l.BatchSize
514 | if j < numMiniBatches-1 {
515 | miniBatch.end = miniBatch.start + l.BatchSize
516 | } else {
517 | miniBatch.end = c
518 | }
519 | l.fitMiniBatch(miniBatch, wc, nTheta, m)
520 | }
521 | }(miniBatches[process])
522 | }
523 |
524 | for j := 0; j < numMiniBatches; j++ {
525 | mb <- j
526 | }
527 | close(mb)
528 | wg.Wait()
529 |
530 | if l.PerplexityEvaluationFrequency > 0 && (it+1)%l.PerplexityEvaluationFrequency == 0 {
531 | phiProb = l.normalisePhi(l.nPhi, phiProb)
532 | thetaProb = l.normaliseTheta(nTheta, thetaProb)
533 | perplexity = l.perplexity(m, l.wordsInCorpus, thetaProb, phiProb)
534 |
535 | if prevPerplexity != 0 && math.Abs(prevPerplexity-perplexity) < l.PerplexityTolerance {
536 | break
537 | }
538 | prevPerplexity = perplexity
539 | }
540 | }
541 | return mat.DenseCopyOf(mat.NewDense(c, l.K, l.normaliseTheta(nTheta, thetaProb)).T()), nil
542 | }
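
// A standalone sketch (not part of this file) showing LatentDirichletAllocation
// fitted to a small corpus and the resulting document over topic and topic over
// word distributions. It assumes the package is importable as
// github.com/james-bowman/nlp; the corpus and the number of topics are
// illustrative only.
package main

import (
	"fmt"

	"github.com/james-bowman/nlp"
)

func main() {
	corpus := []string{
		"the cat sat on the mat",
		"the dog chased the cat",
		"stocks fell sharply on wall street",
		"investors sold shares as markets fell",
	}

	vectoriser := nlp.NewHashingVectoriser(1000)
	tdm, err := vectoriser.FitTransform(corpus...)
	if err != nil {
		panic(err)
	}

	// Model the corpus with 2 latent topics.
	lda := nlp.NewLatentDirichletAllocation(2)
	docsOverTopics, err := lda.FitTransform(tdm)
	if err != nil {
		panic(err)
	}

	// K x C matrix: element (k, c) is the probability of document c
	// being related to topic k.
	k, c := docsOverTopics.Dims()
	fmt.Printf("document over topic distribution: %d x %d\n", k, c)

	// K x W matrix: element (k, w) is the probability of word w given topic k.
	topicsOverWords := lda.Components()
	k, w := topicsOverWords.Dims()
	fmt.Printf("topic over word distribution: %d x %d\n", k, w)
}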
543 |
--------------------------------------------------------------------------------