├── .gitattributes
├── .gitignore
├── README.md
└── classifier.go

/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 | 
4 | # Custom for Visual Studio
5 | *.cs diff=csharp
6 | *.sln merge=union
7 | *.csproj merge=union
8 | *.vbproj merge=union
9 | *.fsproj merge=union
10 | *.dbproj merge=union
11 | 
12 | # Standard to msysgit
13 | *.doc diff=astextplain
14 | *.DOC diff=astextplain
15 | *.docx diff=astextplain
16 | *.DOCX diff=astextplain
17 | *.dot diff=astextplain
18 | *.DOT diff=astextplain
19 | *.pdf diff=astextplain
20 | *.PDF diff=astextplain
21 | *.rtf diff=astextplain
22 | *.RTF diff=astextplain
23 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Windows image file caches
2 | Thumbs.db
3 | ehthumbs.db
4 | 
5 | # Folder config file
6 | Desktop.ini
7 | 
8 | # Recycle Bin used on file shares
9 | $RECYCLE.BIN/
10 | 
11 | # Windows Installer files
12 | *.cab
13 | *.msi
14 | *.msm
15 | *.msp
16 | 
17 | # =========================
18 | # Operating System Files
19 | # =========================
20 | 
21 | # OSX
22 | # =========================
23 | 
24 | .DS_Store
25 | .AppleDouble
26 | .LSOverride
27 | 
28 | # Icon must end with two \r.
29 | Icon
30 | 
31 | # Thumbnails
32 | ._*
33 | 
34 | # Files that might appear on external disk
35 | .Spotlight-V100
36 | .Trashes
37 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Classifier
2 | 
3 | This is a very fast and very memory-efficient text classifier for [Go](http://golang.org/). It can train and classify thousands of documents in seconds. The resulting classifier can be saved to and loaded from file very quickly, using its own custom file format designed for high-speed applications. The classifier itself uses my [BinSearch](http://github.com/AlasdairF/BinSearch) package as its structural backend, which is faster than a hashtable while using only 8 - 16 bytes of memory per token, with 5KB overhead (every word in the English language could be included in the classifier and the entire classifier would still fit into 7MB of memory).
4 | 
5 | This classifier was written after much experience trying many different classification techniques for the problem of document categorization, and it is my own implementation of what I have found works best. It uses an ensemble method to increase accuracy, similar to what is more commonly known as a 'Random Forest' classifier. This classifier is made specifically for document classification; it classifies based on token frequency and rarity, whereby if category_1 has a frequency of 0.01 for a particular token and the overall average frequency is 0.005, then this token is given a score of Log(0.01 / 0.005) = 0.693 for this category. Twenty different ensembles of each category are generated, pruned and then combined. Additionally, this classifier is adaptive in that it can self-optimize through the `Test` function. I attempted many other techniques that did not make it into the final version because they were unsuccessful; this classifier is based on experience and practice, not only theory - it is accurate, fast, efficient and made for production use in high-bandwidth applications.
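To make the scoring rule concrete, here is a minimal sketch of the log-ratio calculation described above (plain Go rather than the package's internal code; the frequencies are the ones from the example):

    // import "math"
    categoryFreq := 0.01  // frequency of the token within category_1
    averageFreq := 0.005  // average frequency of the token across all categories
    score := math.Log(categoryFreq / averageFreq) // ≈ 0.693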
6 | 
7 | For people who are not familiar with classifiers: you start with your list of categories and several (more is better) "training files" for each category, which have been hand-picked to be good representatives of this category. You then load these categories and training files into the classifier and it magically makes a classifier object which can then be used to classify new documents into these categories.
8 | 
9 | Due to the use of [BinSearch](http://github.com/AlasdairF/BinSearch) as the backend, the maximum length of any individual category name or token is 64 bytes.
10 | 
11 | 
12 | ## Importing
13 | 
14 |     import "github.com/AlasdairF/Classifier"
15 | 
16 | ## Training
17 | 
18 | Start the trainer:
19 | 
20 |     obj := new(classifier.Trainer)
21 | 
22 | Define your categories; this must be a slice of a slice of bytes: `[][]byte`.
23 | 
24 |     obj.DefineCategories(categories)
25 | 
26 | Add training documents; `category` is a slice of bytes `[]byte` and `tokens` is a slice of a slice of bytes `[][]byte` (or a slice of uint64, as imported).
27 | 
28 |     err := obj.AddTrainingDoc(category, tokens)
29 |     // and again for each training document
30 | 
31 | If you are going to use the `Test` feature to optimize the training variables then you need to add test files. If you don't have any test files then you can add the training files as test files too; this will report a higher-than-true accuracy, but it will still help the `Test` function determine the best variables for the classifier.
32 | 
33 |     err := obj.AddTestDoc(category, tokens)
34 |     // keep doing it for each one
35 | 
36 | The classifier uses two variables called `allowance` and `maxscore` to optimize itself. Both are `float32`. `allowance` means that any word with a score below this value will not be included in the classifier. `maxscore` means that no word can be given a score of more than this value in the classifier. It is best to let the `Test` function choose these for you.
37 | 
38 | To use the `Test` function (once you've added training and test files) you only need to do the following. Note that if `verbose` is set to true then you will get thousands of lines output to Stdout telling you the accuracy level of each test and which one was best; if it is set to false then it is silent. `Test` returns the best values for `allowance` and `maxscore`.
39 | 
40 |     verbose := true
41 |     allowance, maxscore, err := obj.Test(verbose)
42 | 
43 | You can now create your classifier:
44 | 
45 |     obj.Create(allowance, maxscore)
46 | 
47 | Then save it to a file:
48 | 
49 |     err := obj.Save(`somedir/myshiz.classifier`)
50 | 
51 | You can also use any of the classification functions below on your `Trainer` object if you want to start classifying right away. You only need to create a new `Classifier` object if you are loading a classifier from a file, since the `Trainer` object inherits all of the functions of the `Classifier` object.
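Putting the training steps together, a complete session might look like the following sketch (the category names, tokens and file path are placeholders; real training needs many documents per category):

    // import "github.com/AlasdairF/Classifier"
    obj := new(classifier.Trainer)

    categories := [][]byte{[]byte(`history`), []byte(`science`)}
    if err := obj.DefineCategories(categories); err != nil {
        panic(err)
    }

    // In practice the tokens come from your tokenizer (see Tokenization below),
    // and every category gets many training and test documents.
    if err := obj.AddTrainingDoc([]byte(`history`), [][]byte{[]byte(`ancient`), []byte(`rome`)}); err != nil {
        panic(err)
    }
    if err := obj.AddTestDoc([]byte(`history`), [][]byte{[]byte(`roman`), []byte(`empire`)}); err != nil {
        panic(err)
    }

    allowance, maxscore, err := obj.Test(false) // silent
    if err != nil {
        panic(err)
    }
    obj.Create(allowance, maxscore)
    if err = obj.Save(`somedir/myshiz.classifier`); err != nil {
        panic(err)
    }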
52 | 
53 | ## Classification
54 | 
55 | Load the classifier you previously saved:
56 | 
57 |     obj, err := classifier.Load(`somedir/myshiz.classifier`)
58 |     // *OR*
59 |     obj := classifier.MustLoad(`somedir/myshiz.classifier`)
60 | 
61 | If you want to retrieve a list of the categories for printing then they are available as a slice of a slice of bytes:
62 | 
63 |     categories := obj.Categories // [][]byte
64 | 
65 | Classify something:
66 | 
67 |     scores := obj.Classify(tokens) // tokens is [][]byte
68 | 
69 | The above will give you a slice of `uint64` where each index represents the index of the category in `obj.Categories` (which is exactly the same as what you originally gave to `DefineCategories`) and the `uint64` is the score for this category (only meaningful relative to the other scores). You may need to sort this list, for which I would recommend my optimized sorting function [Sort/Uint16Uint64](http://github.com/AlasdairF/Sort), like this:
70 | 
71 |     // import "github.com/AlasdairF/Sort/Uint16Uint64"
72 |     sorted := sortUint16Uint64.New(scores)
73 |     sortUint16Uint64.Desc(sorted)
74 |     // You could then output this as follows
75 |     cats := obj.Categories
76 |     for i, score := range sorted {
77 |         fmt.Println(i, `Category`, string(cats[score.K]), `Score`, score.V)
78 |     }
79 | 
80 | To make things easy, if you want *only* the best matching category and score, and not the results for each category, then you can do the following, which returns the `[]byte` of the category that this document best matches and its score as `uint64`:
81 | 
82 |     category, score := obj.ClassifySimple(tokens)
83 |     fmt.Println(`Best category was`, string(category), `with score`, score)
84 | 
85 | 
86 | ## Tokenization / Feature Extraction
87 | 
88 | You do need to tokenize each document before training on it or classifying it, which means extracting `tokens` (usually words) from the document ready for classifying. How you tokenize depends on what you are trying to classify. However you choose to tokenize, you must be sure to do it *exactly the same way* for the training documents, the test documents, and the documents you eventually classify.
89 | 
90 | I have written a [Tokenize](http://github.com/AlasdairF/Tokenize) package that works perfectly with this Classifier. I suggest you check that out.
91 | 
92 | Following are some recommendations if you choose to do your own tokenization.
93 | 
94 | 1. Make the text all lowercase.
95 | 2. If the text was generated by OCR then remove accents from characters to normalize mistakes.
96 | 3. Strip punctuation, numbers and special characters.
97 | 4. If you know the language then you can use a stemmer to reduce the words to their stems, [like this one I collected](http://github.com/AlasdairF/Stemmer).
98 | 5. Remove stopwords, which are common words such as 'and', 'or', 'then', etc.
99 | 
100 | If you have additional features for the document, such as title, author, keywords, etc. then these can be added to the tokens. You may want to make them special by adding a prefix (which could be a capital letter if you lowercased everything) so that they only match against other instances of the same feature (e.g. prefix 'T' onto the beginning of each title token). You may want to add these as normal tokens *and* add them in again with the prefix (which works well for titles and keywords).
101 | 
102 | Tokens don't have to be split on individual words; you can split them on anything you want, such as bigrams (double words, e.g. 'ancient history'), or add single words and then add bigrams as well.
Often, though, bigrams do not work as well as one might expect: they can easily become too powerful a scorer and throw a document into the wrong category whenever it happens to contain that bigram, and they can turn thousands of tokens into millions. Long story short: bigrams can be tricky; they can increase your accuracy, but only if you test properly with them, select them well, and ensure they are suitable in your case.
103 | 
104 | 
105 | ~ Alasdair Forsythe
106 | 
--------------------------------------------------------------------------------
/classifier.go:
--------------------------------------------------------------------------------
1 | package classifier
2 | 
3 | import (
4 | 	"github.com/AlasdairF/BinSearch"
5 | 	"github.com/AlasdairF/Custom"
6 | 	"os"
7 | 	"math"
8 | 	"math/rand"
9 | 	"errors"
10 | 	"fmt"
11 | )
12 | 
13 | /*
14 | 
15 | NOTE:
16 | Maximum byte length of any token or category name is 64 bytes (UTF8).
17 | Maximum number of categories is 65,536.
18 | 
19 | */
20 | 
21 | // --------------- CONSTANTS ---------------
22 | 
23 | /*
24 | 
25 | The number of ensembles can be changed here from 20 to any other number.
26 | It works best at 20, which is why it is hardcoded.
27 | 
28 | */
29 | const number_of_ensembles = 20
30 | 
31 | 
32 | // --------------- STRUCTS ---------------
33 | 
34 | type Trainer struct {
35 | 	Classifier // inherits Classifier struct
36 | 	testDocs [][]binsearch.CounterBytes
37 | 	numTestDocs int
38 | 	trainingTokens [][][]byte
39 | 	categoryIndex binsearch.KeyValBytes
40 | 	ensembleContent [][]word
41 | 	ensembled bool
42 | }
43 | 
44 | type Classifier struct {
45 | 	Categories [][]byte
46 | 	rules binsearch.KeyBytes
47 | 	res [][]scorer
48 | }
49 | 
50 | type word struct {
51 | 	tok []byte
52 | 	score int
53 | }
54 | 
55 | type scorer struct {
56 | 	category uint16
57 | 	score uint64
58 | }
59 | 
60 | // --------------- FUNCTIONS ---------------
61 | 
62 | // randomList is a helper function that generates random lists of integers for the ensemble function. It does not need to be seeded since it is good for the random numbers to be the same for the same content.
63 | func randomList(num int, wanted int) []int {
64 | 	output := make([]int, wanted)
65 | 	used := make([]bool, num)
66 | 	var n int
67 | 	for got := 0; got < wanted; {
68 | 		n = rand.Intn(num)
69 | 		if !used[n] {
70 | 			used[n] = true
71 | 			output[got] = n
72 | 			got++
73 | 		}
74 | 	}
75 | 	return output
76 | }
77 | 
78 | // DefineCategories defines the categories used by the classifier.
79 | // It maps categories <-> ensembles <-> indices.
80 | func (t *Trainer) DefineCategories(categories [][]byte) error {
81 | 	t.Categories = categories
82 | 	for i, category := range categories {
83 | 		if t.categoryIndex.AddUnsorted(category, i) != nil {
84 | 			return errors.New(`Category name must be no more than 64 bytes.`)
85 | 		}
86 | 	}
87 | 	t.categoryIndex.Build()
88 | 	t.testDocs = make([][]binsearch.CounterBytes, len(categories))
89 | 	t.trainingTokens = make([][][]byte, len(categories))
90 | 	return nil
91 | }
92 | 
93 | // AddTrainingDoc adds a training document to the classifier.
94 | func (t *Trainer) AddTrainingDoc(category []byte, tokens [][]byte) error {
95 | 	t.ensembled = false // Needs to be ensembled again whenever a training doc is added
96 | 	// Check that the category has been defined
97 | 	indx, ok := t.categoryIndex.Find(category)
98 | 	if !ok {
99 | 		return errors.New(`AddTrainingDoc: Category '` + string(category) + `' not defined`)
100 | 	}
101 | 	// Add tokens
102 | 	t.trainingTokens[indx] = append(t.trainingTokens[indx], tokens...)
103 | 	return nil
104 | }
105 | 
106 | // AddTestDoc adds a document for testing under the Test function.
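// For illustration (the category and tokens are placeholders; the tokens must come
// from the same tokenizer used for the training docs):
//
//	err := trainer.AddTestDoc([]byte(`history`), tokens)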
107 | func (t *Trainer) AddTestDoc(category []byte, tokens [][]byte) error {
108 | 	// Check that the category has been defined
109 | 	indx, ok := t.categoryIndex.Find(category)
110 | 	if !ok {
111 | 		return errors.New(`AddTestDoc: Category '` + string(category) + `' not defined`)
112 | 	}
113 | 	// Append a new token counter for this test document
114 | 	t.testDocs[indx] = append(t.testDocs[indx], binsearch.CounterBytes{})
115 | 	obj := &t.testDocs[indx][len(t.testDocs[indx])-1]
116 | 	for _, word := range tokens {
117 | 		obj.Add(word, 1)
118 | 	}
119 | 	obj.Build()
120 | 
121 | 	t.numTestDocs++
122 | 	return nil
123 | }
124 | 
125 | // ensemble does most of the calculations and pruning for the classifier, which is then finished off by Create.
126 | func (t *Trainer) ensemble() {
127 | 	// Initialize
128 | 	nlist := make([]int, len(t.Categories) * number_of_ensembles)
129 | 	tokmap := make([]binsearch.CounterBytes, len(t.Categories) * number_of_ensembles)
130 | 	ensembleTokAvg := new(binsearch.CounterBytes)
131 | 	var i, i2, indx, ensembleindx, num_tokens, per_ensemble, total int
132 | 	var tokloop []int
133 | 	var tok []byte
134 | 	numcats := len(t.Categories)
135 | 	// Loop through all categories of training docs
136 | 	for indx=0; indx len(ensembleContent) {
170 | 				ensembleContent = make([]word, l)
171 | 			}
172 | 			// Loop through all tokens in this ensemble
173 | 			i2 = 0
174 | 			if tokmap[ensembleindx].Reset() {
175 | 				for eof = false; !eof; {
176 | 					tok, count, eof = tokmap[ensembleindx].Next() // get the next one
177 | 					if count >= 2 { // there must be at least 2 occurrences of this token in this ensemble
178 | 						av = (count * 10000000) / nlist[ensembleindx] // Calculate the frequency of this token within this ensemble
179 | 						v, ok = ensembleTokAvg.Find(tok) // what's the average for this token overall?
180 | 						if av > v && ok { // if this token's frequency in this ensemble is greater than the average across all categories and ensembles
181 | 							ensembleContent[i2] = word{tok, (av * 1000) / v} // the result is the ratio over the average at which this token occurs in this ensemble, multiplied by 1000 so it fits into an int. It will always be >1 (or in this case >1000)
182 | 							i2++
183 | 						}
184 | 					}
185 | 				}
186 | 			}
187 | 			// And save the pruned ensembleContent into the struct
188 | 			t.ensembleContent[ensembleindx] = make([]word, i2)
189 | 			copy(t.ensembleContent[ensembleindx], ensembleContent[0:i2])
190 | 			ensembleindx++
191 | 		}
192 | 	}
193 | 	return
194 | }
195 | 
196 | // Create builds the classifier using the two variables allowance & maxscore. Set allowance & maxscore to 0 for no limits.
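// For illustration, Create is normally fed the values found by Test (the names
// and literals below are placeholders, not recommendations):
//
//	allowance, maxscore, _ := trainer.Test(false)
//	trainer.Create(allowance, maxscore)
//	// or with hand-picked limits, e.g. trainer.Create(1.5, 500)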
197 | func (t *Trainer) Create(allowance float32, maxscore float32) {
198 | 	// First run ensemble if it hasn't been run already
199 | 	if !t.ensembled {
200 | 		t.ensemble()
201 | 		t.ensembled = true
202 | 	}
203 | 	// Now build the classifier
204 | 	allowanceint, maxscoreint := int(allowance * 1000), int(maxscore * 1000)
205 | 	var i, ensembleindx, score int
206 | 	var indx16 uint16
207 | 	var scorelog uint64
208 | 	var eof bool
209 | 	var tok []byte
210 | 
211 | 	// First loop through and determine exactly which words will be included in the classifier
212 | 	dupfinder := new(binsearch.CounterBytes) // tally of every token that passes the allowance in any ensemble
213 | 	for indx := range t.Categories { // loop through categories
214 | 		for i = 0; i < number_of_ensembles; i++ { // loop through the ensembles of this category
215 | 			for _, obj := range t.ensembleContent[(indx * number_of_ensembles) + i] { // loop through tokens in this ensemble
216 | 				if obj.score >= allowanceint { // If the score is greater than the allowance
217 | 					dupfinder.Add(obj.tok, 0)
218 | 				}
219 | 			}
220 | 		}
221 | 	}
222 | 
223 | 	// Convert the tally into a KeyBytes structure, which is the dictionary of tokens
224 | 	dupfinder.Build()
225 | 	rules := dupfinder.KeyBytes()
226 | 	dupfinder = nil
227 | 	res := make([][]scorer, rules.Len())
228 | 
229 | 	// Now calculate the score for each dictionary token for each category
230 | 	for indx := range t.Categories { // loop through categories
231 | 		tally := new(binsearch.CounterBytes) // create tally for scores from this category
232 | 		for i = 0; i < number_of_ensembles; i++ { // loop through the ensembles of this category
233 | 			ensembleindx = (indx * number_of_ensembles) + i
234 | 			for _, obj := range t.ensembleContent[ensembleindx] { // loop through tokens in this ensemble
235 | 				if obj.score >= allowanceint { // If the score is greater than the allowance
236 | 					if maxscoreint > 0 && obj.score > maxscoreint { // if score is greater than the maximum allowed score for one token then reduce it to the maximum
237 | 						tally.Add(obj.tok, maxscoreint)
238 | 					} else {
239 | 						tally.Add(obj.tok, obj.score)
240 | 					}
241 | 				}
242 | 			}
243 | 		}
244 | 		tally.Build()
245 | 		// Enter tallies into the classifier
246 | 		indx16 = uint16(indx)
247 | 
248 | 		if tally.Reset() {
249 | 			for eof = false; !eof; {
250 | 				tok, score, eof = tally.Next() // get the next one
251 | 				scorelog = uint64(math.Log(float64(score) / 1000) * 1000)
252 | 				if scorelog > 0 {
253 | 					i, _ = rules.Find(tok)
254 | 					res[i] = append(res[i], scorer{indx16, scorelog})
255 | 				}
256 | 			}
257 | 		}
258 | 	}
259 | 
260 | 	t.res = res
261 | 	t.rules = *rules
262 | }
263 | 
264 | // Classify classifies tokens and returns a slice of uint64 where each index is the same as the index of the category name in classifier.Categories, which is the same as the [][]byte of categories originally passed to DefineCategories.
265 | func (t *Classifier) Classify(tokens [][]byte) []uint64 {
266 | 	var tok []byte
267 | 	var ok bool
268 | 	var i int
269 | 	var obj scorer
270 | 	scoreboard := make([]uint64, len(t.Categories))
271 | 	for _, tok = range tokens {
272 | 		if i, ok = t.rules.Find(tok); ok {
273 | 			for _, obj = range t.res[i] {
274 | 				scoreboard[obj.category] += obj.score
275 | 			}
276 | 		}
277 | 	}
278 | 	return scoreboard
279 | }
280 | 
281 | // ClassifySimple is a wrapper for Classify; it returns the name of the best category as a []byte, and the score of the best category as a uint64.
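// For illustration (`tokens` must come from the same tokenizer that produced the
// training tokens):
//
//	category, score := obj.ClassifySimple(tokens)
//	fmt.Println(`Best category was`, string(category), `with score`, score)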
282 | func (t *Classifier) ClassifySimple(tokens [][]byte) ([]byte, uint64) {
283 | 	scoreboard := t.Classify(tokens)
284 | 	var bestscore uint64
285 | 	var bestcat int
286 | 	for cat, score := range scoreboard {
287 | 		if score > bestscore {
288 | 			bestscore = score
289 | 			bestcat = cat
290 | 		}
291 | 	}
292 | 	return t.Categories[bestcat], bestscore
293 | }
294 | 
295 | func (t *Trainer) classifyTestDoc(test *binsearch.CounterBytes) int {
296 | 	var tok []byte
297 | 	var v, i int
298 | 	var v64 uint64
299 | 	var eof, ok bool
300 | 	var obj scorer
301 | 	scoreboard := make([]uint64, len(t.Categories))
302 | 	if test.Reset() {
303 | 		for !eof {
304 | 			tok, v, eof = test.Next() // get the next one
305 | 			if i, ok = t.rules.Find(tok); ok {
306 | 				v64 = uint64(v)
307 | 				for _, obj = range t.res[i] {
308 | 					scoreboard[obj.category] += obj.score * v64
309 | 				}
310 | 			}
311 | 		}
312 | 	}
313 | 	var bestscore uint64
314 | 	i = 0
315 | 	for cat, score := range scoreboard {
316 | 		if score > bestscore {
317 | 			bestscore = score
318 | 			i = cat
319 | 		}
320 | 	}
321 | 	return i
322 | }
323 | 
324 | // Test tries every combination of the built-in lists of candidate values for allowance & maxscore below (several thousand combinations) and returns the values of allowance & maxscore which perform best. Test requires an argument of true or false for verbose; if true, Test will print all results to Stdout.
325 | func (t *Trainer) Test(verbose bool) (float32, float32, error) {
326 | 	// Check there are test files
327 | 	if t.numTestDocs == 0 {
328 | 		return 0, 0, errors.New(`Test: Add test files`)
329 | 	}
330 | 	num_test_docs := float32(t.numTestDocs)
331 | 	// Set some variables
332 | 	var bestaccuracy, bestallowance, bestmaxscore, accuracy, allowance, maxscore float32
333 | 	var i, indx, correct, l, compare int
334 | 	// auto is the list of numbers to try for allowance and maxscore
335 | 	var auto_allowance = [...]float32{0,1.05,1.1,1.15,1.2,1.25,1.3,1.4,1.5,1.6,1.7,1.8,1.9,2,2.5,3,4,5,6,7,8,9,10,15,20,25,30,40,50,75,100,150,200,300,400,500,600,700,800,900,1000,1500,2000,3000,4000,5000,10000,20000,50000,100000,1000000}
336 | 	var auto_maxscore = [...]float32{0,10000000,1000000,100000,50000,20000,10000,5000,4000,3000,2000,1500,1200,1000,900,800,700,600,550,500,475,450,425,400,375,350,325,300,275,250,225,200,150,100,75,50,40,30,25,20,15,10,8,6,4,2}
337 | 	for _, allowance = range auto_allowance { // loop through auto for allowance
338 | 		for _, maxscore = range auto_maxscore { // loop through auto for maxscore
339 | 			t.Create(allowance, maxscore) // build the classifier for allowance & maxscore
340 | 			correct = 0
341 | 			// Count the number of correct results from testDocs under this classifier
342 | 			for indx = range t.Categories {
343 | 				l = len(t.testDocs[indx])
344 | 				for i = 0; i < l; i++ {
345 | 					compare = t.classifyTestDoc(&t.testDocs[indx][i]) // classify this test doc
346 | 					if compare == indx { // correctly classified?
347 | 						correct++
348 | 					}
349 | 				}
350 | 			}
351 | 			accuracy = (float32(correct) / num_test_docs) * 100
352 | 			if verbose {
353 | 				fmt.Printf("allowance %g, maxscore %g = %f\n", allowance, maxscore, accuracy)
354 | 			}
355 | 			// Keep the best result
356 | 			if accuracy > bestaccuracy {
357 | 				bestaccuracy = accuracy
358 | 				bestallowance = allowance
359 | 				bestmaxscore = maxscore
360 | 			}
361 | 		}
362 | 	}
363 | 	if verbose {
364 | 		fmt.Println(`BEST RESULT`)
365 | 		fmt.Printf("allowance %g, maxscore %g = %f\n", bestallowance, bestmaxscore, bestaccuracy)
366 | 	}
367 | 	return bestallowance, bestmaxscore, nil
368 | }
369 | 
370 | func MustLoad(filename string) *Classifier {
371 | 	t, err := Load(filename)
372 | 	if err != nil {
373 | 		panic(err)
374 | 	}
375 | 	return t
376 | }
377 | 
378 | // Load loads a classifier from a file previously saved with Save.
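// For illustration (the path is a placeholder):
//
//	obj, err := classifier.Load(`somedir/myshiz.classifier`)
//	// *OR*, panicking instead of returning an error:
//	obj := classifier.MustLoad(`somedir/myshiz.classifier`)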
379 | func Load(filename string) (*Classifier, error) {
380 | 	// Open file for reading
381 | 	fi, err := os.Open(filename)
382 | 	if err != nil {
383 | 		return nil, err
384 | 	}
385 | 	defer fi.Close()
386 | 
387 | 	// Attach reader
388 | 	r := custom.NewZlibReader(fi)
389 | 	defer r.Close()
390 | 
391 | 	var i uint16
392 | 	numcats := r.ReadUint16()
393 | 	categories := make([][]byte, numcats)
394 | 	for i=0; i<numcats; i++ {