├── .gitignore
├── lapis
    ├── bayes
    │   ├── classifiers
    │   │   ├── default.moon
    │   │   ├── default.lua
    │   │   ├── test.moon
    │   │   ├── fisher.moon
    │   │   ├── bayes_multi.moon
    │   │   ├── fisher.lua
    │   │   ├── test.lua
    │   │   ├── bayes.moon
    │   │   ├── base.moon
    │   │   ├── bayes_multi.lua
    │   │   ├── bayes.lua
    │   │   └── base.lua
    │   ├── models.moon
    │   ├── models.lua
    │   ├── schema.moon
    │   ├── schema.lua
    │   ├── tokenizers
    │   │   ├── base.moon
    │   │   ├── base.lua
    │   │   ├── ngram.moon
    │   │   ├── url_domains.moon
    │   │   ├── postgres_text.moon
    │   │   ├── postgres_text.lua
    │   │   ├── ngram.lua
    │   │   ├── url_domains.lua
    │   │   └── spam.moon
    │   ├── model.moon
    │   ├── migrations.moon
    │   ├── model.lua
    │   ├── migrations.lua
    │   ├── models
    │   │   ├── word_classifications.moon
    │   │   ├── categories.moon
    │   │   ├── word_classifications.lua
    │   │   └── categories.lua
    │   └── text
    │   │   ├── utf8.lua
    │   │   ├── utf8.moon
    │   │   ├── punycode.moon
    │   │   ├── punycode.lua
    │   │   ├── stem.lua
    │   │   └── stem.moon
    ├── bayes.lua
    └── bayes.moon
├── migrations.moon
├── lint_config.moon
├── config.moon
├── Makefile
├── .github
    └── workflows
    │   └── test.yml
├── spec
    ├── url_tokenizer_spec.moon
    ├── utf8_spec.moon
    ├── postgres_text_tokenizer_spec.moon
    ├── punycode_spec.moon
    ├── unaccent_spec.moon
    ├── stem_spec.moon
    ├── bayes_spec.moon
    └── ngram_tokenizer_spec.moon
├── lapis-bayes-dev-1.rockspec
└── examples
    └── detect_language.lua


/.gitignore:
--------------------------------------------------------------------------------
1 | config.lua
2 | lint_config.lua
3 | migrations.lua
4 | *.rock
5 | 


--------------------------------------------------------------------------------
/lapis/bayes/classifiers/default.moon:
--------------------------------------------------------------------------------
1 | require "lapis.bayes.classifiers.bayes"
2 | 


--------------------------------------------------------------------------------
/lapis/bayes/classifiers/default.lua:
--------------------------------------------------------------------------------
1 | return require("lapis.bayes.classifiers.bayes")
2 | 


--------------------------------------------------------------------------------
/lapis/bayes/models.moon:
--------------------------------------------------------------------------------
1 | import autoload from require "lapis.util"
2 | autoload "lapis.bayes.models"
3 | 


--------------------------------------------------------------------------------
/migrations.moon:
--------------------------------------------------------------------------------
1 | 
2 | import run_migrations from require "lapis.bayes.schema"
3 | 
4 | {
5 |   run_migrations
6 | }
7 | 
8 | 


--------------------------------------------------------------------------------
/lapis/bayes/models.lua:
--------------------------------------------------------------------------------
1 | local autoload
2 | autoload = require("lapis.util").autoload
3 | return autoload("lapis.bayes.models")
4 | 


--------------------------------------------------------------------------------
/lapis/bayes/schema.moon:
--------------------------------------------------------------------------------
1 | run_migrations = ->
2 |   m = require "lapis.db.migrations"
3 |   m.run_migrations require("lapis.bayes.migrations"), "lapis_bayes"
4 | 
5 | { :run_migrations }
6 | 


--------------------------------------------------------------------------------
/lint_config.moon:
--------------------------------------------------------------------------------
1 | {
2 |   whitelist_globals: {
3 |     ["spec/"]: {
4 |       "it", "describe", "before_each", "after_each", "setup", "teardown", "pending"
5 |     }
6 |   }
7 | }
8 | 
9 | 


--------------------------------------------------------------------------------
/lapis/bayes/schema.lua:
--------------------------------------------------------------------------------
1 | local run_migrations
2 | run_migrations = function()
3 |   local m = require("lapis.db.migrations")
4 |   return m.run_migrations(require("lapis.bayes.migrations"), "lapis_bayes")
5 | end
6 | return {
7 |   run_migrations = run_migrations
8 | }
9 | 


--------------------------------------------------------------------------------
/config.moon:
--------------------------------------------------------------------------------
 1 | config = require "lapis.config"
 2 | 
 3 | config {"development", "test"}, ->
 4 |   logging false -- hide query logs
 5 | 
 6 |   postgres {
 7 |     database: "lapis_bayes"
 8 | 
 9 |     host: os.getenv "PGHOST"
10 |     user: os.getenv "PGUSER"
11 |     password: os.getenv "PGPASSWORD"
12 |   }
13 | 
14 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | 
 2 | migrate: build
 3 | 	make test_db > /dev/null
 4 | 	lapis migrate
 5 | 
 6 | local: build
 7 | 	luarocks --lua-version=5.1 make --local *-dev-1.rockspec
 8 | 
 9 | build:
10 | 	-rm $$(find lapis -type f | grep '\.lua$$')
11 | 	moonc lapis
12 | 	moonc *.moon
13 | 
14 | test_db:
15 | 	-dropdb -U postgres lapis_bayes
16 | 	createdb -U postgres lapis_bayes
17 | 
18 | lint::
19 | 	moonc lint_config.moon
20 | 	git ls-files | grep '\.moon$$' | grep -v config.moon | xargs -n 100 moonc -l
21 | 
22 | tags::
23 | 	moon-tags --lapis $$(git ls-files lapis/) > $@
24 | 


--------------------------------------------------------------------------------
/lapis/bayes/tokenizers/base.moon:
--------------------------------------------------------------------------------
 1 | -- Provides a common interface contract for tokenizers. Subclasses should
 2 | -- extend this class and override the `tokenize_text` method with their
 3 | -- implementation.
 4 | --
 5 | -- Required override:
 6 | --   * `tokenize_text(text)` - accept raw text input and return an array-like table
 7 | --     of token strings suitable for classification.
 8 | 
 9 | class BaseTokenizer
10 |   tokenize_text: (...) =>
11 |     class_name = @__class and @__class.__name or "TokenizerBase"
12 |     error "#{class_name} must implement tokenize_text(...)", 2
13 | 


--------------------------------------------------------------------------------
/lapis/bayes/tokenizers/base.lua:
--------------------------------------------------------------------------------
 1 | local BaseTokenizer
 2 | do
 3 |   local _class_0
 4 |   local _base_0 = {
 5 |     tokenize_text = function(self, ...)
 6 |       local class_name = self.__class and self.__class.__name or "TokenizerBase"
 7 |       return error(tostring(class_name) .. " must implement tokenize_text(...)", 2)
 8 |     end
 9 |   }
10 |   _base_0.__index = _base_0
11 |   _class_0 = setmetatable({
12 |     __init = function() end,
13 |     __base = _base_0,
14 |     __name = "BaseTokenizer"
15 |   }, {
16 |     __index = _base_0,
17 |     __call = function(cls, ...)
18 |       local _self_0 = setmetatable({}, _base_0)
19 |       cls.__init(_self_0, ...)
20 |       return _self_0
21 |     end
22 |   })
23 |   _base_0.__class = _class_0
24 |   BaseTokenizer = _class_0
25 |   return _class_0
26 | end
27 | 


--------------------------------------------------------------------------------
/lapis/bayes/model.moon:
--------------------------------------------------------------------------------
 1 | 
 2 | prefix = "lapis_bayes_"
 3 | 
 4 | import Model from require "lapis.db.model"
 5 | 
 6 | db = require "lapis.db"
 7 | 
 8 | -- all tuples should be same size
 9 | encode_tuples = (tuples) ->
10 |   buffer = { "VALUES" }
11 | 
12 |   {insert: i} = table
13 |   n_tuples = #tuples
14 |   for t_idx=1,n_tuples
15 |     tuple = tuples[t_idx]
16 |     i buffer, " ("
17 |     k = #tuple
18 |     for idx=1,k
19 |       i buffer, db.escape_literal tuple[idx]
20 |       unless idx == k
21 |         i buffer, ", "
22 | 
23 |     if t_idx == n_tuples
24 |       i buffer, ")"
25 |     else
26 |       i buffer, "), "
27 | 
28 |   table.concat buffer
29 | 
30 | {
31 |   Model: Model\scoped_model prefix, "lapis.bayes.models"
32 |   prefix_table: (name) -> "#{prefix}#{name}"
33 |   :encode_tuples
34 | }
35 | 


--------------------------------------------------------------------------------
/lapis/bayes/migrations.moon:
--------------------------------------------------------------------------------
 1 | schema = require "lapis.db.schema"
 2 | 
 3 | import add_column, create_index, drop_index, drop_column, create_table from schema
 4 | 
 5 | {
 6 |   :serial, :boolean, :varchar, :integer, :text, :foreign_key, :double, :time,
 7 |   :numeric, :enum
 8 | } = schema.types
 9 | 
10 | import prefix_table from require "lapis.bayes.model"
11 | 
12 | {
13 |   [1439610038]: =>
14 |     create_table prefix_table("categories"), {
15 |       {"id", serial}
16 |       {"name", text}
17 | 
18 |       {"total_count", integer}
19 | 
20 |       {"created_at", time}
21 |       {"updated_at", time}
22 | 
23 |       "PRIMARY KEY (id)"
24 |     }
25 | 
26 |     create_table prefix_table("word_classifications"), {
27 |       {"category_id", foreign_key}
28 |       {"word", text}
29 |       {"count", integer}
30 | 
31 |       "PRIMARY KEY (category_id, word)"
32 |     }
33 | 
34 |   [1474434614]: =>
35 |     create_index prefix_table("categories"), "name"
36 | }
37 | 
38 | 


--------------------------------------------------------------------------------
/lapis/bayes/model.lua:
--------------------------------------------------------------------------------
 1 | local prefix = "lapis_bayes_"
 2 | local Model
 3 | Model = require("lapis.db.model").Model
 4 | local db = require("lapis.db")
 5 | local encode_tuples
 6 | encode_tuples = function(tuples)
 7 |   local buffer = {
 8 |     "VALUES"
 9 |   }
10 |   local i
11 |   i = table.insert
12 |   local n_tuples = #tuples
13 |   for t_idx = 1, n_tuples do
14 |     local tuple = tuples[t_idx]
15 |     i(buffer, " (")
16 |     local k = #tuple
17 |     for idx = 1, k do
18 |       i(buffer, db.escape_literal(tuple[idx]))
19 |       if not (idx == k) then
20 |         i(buffer, ", ")
21 |       end
22 |     end
23 |     if t_idx == n_tuples then
24 |       i(buffer, ")")
25 |     else
26 |       i(buffer, "), ")
27 |     end
28 |   end
29 |   return table.concat(buffer)
30 | end
31 | return {
32 |   Model = Model:scoped_model(prefix, "lapis.bayes.models"),
33 |   prefix_table = function(name)
34 |     return tostring(prefix) .. tostring(name)
35 |   end,
36 |   encode_tuples = encode_tuples
37 | }
38 | 


--------------------------------------------------------------------------------
/lapis/bayes.lua:
--------------------------------------------------------------------------------
 1 | local VERSION = "1.4.0"
 2 | local text_probabilities
 3 | text_probabilities = function(categories, text, opts)
 4 |   if opts == nil then
 5 |     opts = { }
 6 |   end
 7 |   local DefaultClassifier = require("lapis.bayes.classifiers.default")
 8 |   return DefaultClassifier(opts):text_probabilities(categories, text, opts)
 9 | end
10 | local classify_text
11 | classify_text = function(categories, text, opts)
12 |   if opts == nil then
13 |     opts = { }
14 |   end
15 |   local DefaultClassifier = require("lapis.bayes.classifiers.default")
16 |   return DefaultClassifier(opts):classify_text(categories, text, opts)
17 | end
18 | local train_text
19 | train_text = function(category, text, opts, ...)
20 |   if opts == nil then
21 |     opts = { }
22 |   end
23 |   local DefaultClassifier = require("lapis.bayes.classifiers.default")
24 |   return DefaultClassifier(opts):train_text(category, text, ...)
25 | end
26 | return {
27 |   classify_text = classify_text,
28 |   train_text = train_text,
29 |   text_probabilities = text_probabilities,
30 |   VERSION = VERSION
31 | }
32 | 


--------------------------------------------------------------------------------
/lapis/bayes.moon:
--------------------------------------------------------------------------------
 1 | VERSION = "1.4.0"
 2 | 
 3 | -- calculate the probabilities of text using default classifier
 4 | -- categories: array of category names
 5 | -- text: the text to calculate probabilities for
 6 | text_probabilities = (categories, text, opts={}) ->
 7 |   DefaultClassifier = require "lapis.bayes.classifiers.default"
 8 |   DefaultClassifier(opts)\text_probabilities categories, text, opts
 9 | 
10 | -- return the best matching category for the given text using the default
11 | -- classifier
12 | classify_text = (categories, text, opts={}) ->
13 |   DefaultClassifier = require "lapis.bayes.classifiers.default"
14 |   DefaultClassifier(opts)\classify_text categories, text, opts
15 | 
16 | -- train text using default classifier's tokenizer
17 | -- category: string name of category
18 | -- text: the text (or array of words) to train
19 | -- opts: options to pass to the classifier
20 | train_text = (category, text, opts={}, ...) ->
21 |   DefaultClassifier = require "lapis.bayes.classifiers.default"
22 |   DefaultClassifier(opts)\train_text category, text, ...
23 | 
24 | { :classify_text, :train_text, :text_probabilities, :VERSION }
25 | 


--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
 1 | name: "test"
 2 | 
 3 | on: [push, pull_request]
 4 | 
 5 | jobs:
 6 |   test:
 7 |     runs-on: ubuntu-latest
 8 | 
 9 |     env:
10 |       PGUSER: postgres
11 |       PGPASSWORD: postgres
12 |       PGHOST: 127.0.0.1
13 | 
14 |     services:
15 |       postgres:
16 |         image: postgres:12
17 |         env:
18 |           POSTGRES_PASSWORD: postgres
19 |         options: --health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 5
20 |         ports:
21 |           - 5432:5432
22 | 
23 |     steps:
24 |     - uses: actions/checkout@master
25 |     - uses: leafo/gh-actions-lua@master
26 |       with:
27 |         luaVersion: "luajit-openresty"
28 | 
29 |     - uses: leafo/gh-actions-luarocks@master
30 | 
31 |     - name: build
32 |       run: |
33 |         luarocks install busted
34 |         luarocks install moonscript
35 |         luarocks make
36 |         luarocks install web_sanitize
37 |         luarocks install tableshape
38 | 
39 |     - name: setup db
40 |       run: |
41 |         psql -c 'create database lapis_bayes'
42 |         moonc *.moon
43 |         lapis migrate
44 | 
45 |     - name: test
46 |       run: |
47 |         busted -o utfTerminal
48 | 


--------------------------------------------------------------------------------
/lapis/bayes/classifiers/test.moon:
--------------------------------------------------------------------------------
 1 | average = (nums) ->
 2 |   sum = 0
 3 |   for n in *nums
 4 |     sum += n
 5 | 
 6 |   return sum / #nums
 7 | 
 8 | weighted_avg = (tuples) ->
 9 |   num_tuples = #tuples
10 |   sum = 0
11 |   sum_weight = 0
12 | 
13 |   for {num, weight} in *tuples
14 |     sum += num
15 |     sum_weight += weight
16 | 
17 |   avg_weight = sum_weight/num_tuples
18 | 
19 |   avg = 0
20 |   for {num, weight} in *tuples
21 |     avg += (num/num_tuples) * (weight/avg_weight)
22 | 
23 |   avg
24 | 
25 | class TestClassifier extends require "lapis.bayes.classifiers.base"
26 |   word_probabilities: (categories, available_words) =>
27 |     total_counts = {}
28 |     for c in *categories
29 |       continue unless c.word_counts
30 |       for word, count in pairs c.word_counts
31 |         total_counts[word] or= 0
32 |         total_counts[word] += count
33 | 
34 |     probs = for c in *categories
35 |       tuples = for word in *available_words
36 |         total_count = total_counts[word]
37 |         cat_count = c.word_counts and c.word_counts[word] or 0
38 |         {cat_count/total_count, total_count}
39 | 
40 |       {c.name, weighted_avg tuples}
41 | 
42 |     table.sort probs, (a,b) ->
43 |       a[2] > b[2]
44 | 
45 |     probs
46 | 
47 | 
48 | 


--------------------------------------------------------------------------------
/lapis/bayes/classifiers/fisher.moon:
--------------------------------------------------------------------------------
 1 | -- http://www.linuxjournal.com/article/6467
 2 | 
 3 | inv_chi2 = (chi, df) ->
 4 |   assert df % 2 == 0, "df must be even"
 5 |   m = chi / 2.0
 6 |   sum = math.exp -m
 7 |   term = sum
 8 |   for i=1, math.floor df/2
 9 |     term *= m / i
10 |     sum += term
11 | 
12 |   math.min sum, 1
13 | 
14 | class FisherClassifier extends require "lapis.bayes.classifiers.base"
15 |   @default_options: {
16 |     robs: 1
17 |     robx: 0.5
18 |     min_dev: 0.3
19 |   }
20 | 
21 |   word_probabilities: (categories, available_words) =>
22 |     return nil, "only two categories supported at once" unless #categories == 2
23 | 
24 |     {a, b} = categories
25 | 
26 |     s = @opts.robs
27 |     x = @opts.robx
28 |     min_dev = @opts.min_dev
29 | 
30 |     mul_a = 0
31 |     mul_b = 0
32 | 
33 |     kept_tokens = 0
34 | 
35 |     for word in *available_words
36 |       a_count = a.word_counts and a.word_counts[word] or 0
37 |       b_count = b.word_counts and b.word_counts[word] or 0
38 | 
39 |       p = a_count / (a_count + b_count)
40 |       n = a_count + b_count
41 |       val = ((s * x) + (n * p)) / (s + n)
42 | 
43 |       if not min_dev or math.abs(val - 0.5) > min_dev
44 |         mul_a += math.log val
45 |         mul_b += math.log 1 - val
46 |         kept_tokens += 1
47 | 
48 |     if kept_tokens == 0
49 |       return nil, "not enough strong signals to decide"
50 | 
51 |     pa = inv_chi2 -2 * mul_a, 2 * kept_tokens
52 |     pb = inv_chi2 -2 * mul_b, 2 * kept_tokens
53 | 
54 |     p = (1 + pa - pb) / 2
55 | 
56 |     tuples = {
57 |       {a.name, p}
58 |       {b.name, 1 - p}
59 |     }
60 | 
61 |     table.sort tuples, (a,b) -> a[2] > b[2]
62 | 
63 |     tuples
64 | 
65 | 
66 | 


--------------------------------------------------------------------------------
/lapis/bayes/migrations.lua:
--------------------------------------------------------------------------------
 1 | local schema = require("lapis.db.schema")
 2 | local add_column, create_index, drop_index, drop_column, create_table
 3 | add_column, create_index, drop_index, drop_column, create_table = schema.add_column, schema.create_index, schema.drop_index, schema.drop_column, schema.create_table
 4 | local serial, boolean, varchar, integer, text, foreign_key, double, time, numeric, enum
 5 | do
 6 |   local _obj_0 = schema.types
 7 |   serial, boolean, varchar, integer, text, foreign_key, double, time, numeric, enum = _obj_0.serial, _obj_0.boolean, _obj_0.varchar, _obj_0.integer, _obj_0.text, _obj_0.foreign_key, _obj_0.double, _obj_0.time, _obj_0.numeric, _obj_0.enum
 8 | end
 9 | local prefix_table
10 | prefix_table = require("lapis.bayes.model").prefix_table
11 | return {
12 |   [1439610038] = function(self)
13 |     create_table(prefix_table("categories"), {
14 |       {
15 |         "id",
16 |         serial
17 |       },
18 |       {
19 |         "name",
20 |         text
21 |       },
22 |       {
23 |         "total_count",
24 |         integer
25 |       },
26 |       {
27 |         "created_at",
28 |         time
29 |       },
30 |       {
31 |         "updated_at",
32 |         time
33 |       },
34 |       "PRIMARY KEY (id)"
35 |     })
36 |     return create_table(prefix_table("word_classifications"), {
37 |       {
38 |         "category_id",
39 |         foreign_key
40 |       },
41 |       {
42 |         "word",
43 |         text
44 |       },
45 |       {
46 |         "count",
47 |         integer
48 |       },
49 |       "PRIMARY KEY (category_id, word)"
50 |     })
51 |   end,
52 |   [1474434614] = function(self)
53 |     return create_index(prefix_table("categories"), "name")
54 |   end
55 | }
56 | 


--------------------------------------------------------------------------------
/spec/url_tokenizer_spec.moon:
--------------------------------------------------------------------------------
 1 | 
 2 | UrlDomainsTokenizer = require "lapis.bayes.tokenizers.url_domains"
 3 | 
 4 | describe "lapis.bayes.tokenizer.url_tokenizer", ->
 5 |   it "builds grammar", ->
 6 |     tokenizer = UrlDomainsTokenizer!
 7 |     p = tokenizer\build_grammar!
 8 |     p\match "https"
 9 | 
10 |   describe "with grammar", ->
11 |     local grammar
12 | 
13 |     before_each ->
14 |       grammar = UrlDomainsTokenizer!\build_grammar!
15 | 
16 |     it "detects some urls", ->
17 |       assert.same {
18 |         "http://leafo.net& "
19 |         "http://google.com/p8sslord"
20 |         "www.leafodad.com"
21 |       }, grammar\match "href='http://leafo.net&amp; ' http://google.com/p8sslord please help the good one www.leafodad.com yeah what the freak"
22 | 
23 |   describe "with tonenizer", ->
24 |     local tokenize_text
25 |     before_each ->
26 |       tokenize_text = UrlDomainsTokenizer!\tokenize_text
27 | 
28 |     it "extracts tokens from string", ->
29 |       assert.same {
30 |         "leafo.net&"
31 |         "google.com"
32 |         "leafodad.com"
33 |       }, tokenize_text "href='http://leafo.net&amp; ' http://google.com/p8sslord/da?what please help the good one www.leafodad.com yeah what the freak"
34 | 
35 |     it "gets domain from iframe", ->
36 |       assert.same {
37 |         'youtube.com'
38 |       }, tokenize_text [[<iframe src="http://youtube.com/hello-world" frameborder="0"></iframe>]]
39 | 
40 |     it "ignore domains", ->
41 |       tokens = UrlDomainsTokenizer({
42 |         ignore_domains: {
43 |           "leafo.net": true
44 |           "*.google.com": true
45 |         }
46 |       })\tokenize_text [[
47 |         http://leafo.net
48 |         http://good.leafo.net
49 |         http://google.com
50 |         http://butt.google.com
51 |         http://plus.good.google.com
52 |       ]]
53 | 
54 |       assert.same {"good.leafo.net", "google.com"}, tokens
55 | 


--------------------------------------------------------------------------------
/lapis-bayes-dev-1.rockspec:
--------------------------------------------------------------------------------
 1 | package = "lapis-bayes"
 2 | version = "dev-1"
 3 | 
 4 | source = {
 5 |   url = "git+https://github.com/leafo/lapis-bayes.git"
 6 | }
 7 | 
 8 | description = {
 9 |   summary = "Naive Bayes classifier for use in Lua",
10 |   license = "MIT",
11 |   maintainer = "Leaf Corcoran <leafot@gmail.com>",
12 | }
13 | 
14 | dependencies = {
15 |   "lua == 5.1",
16 |   "lapis >= 1.16.0"
17 | }
18 | 
19 | build = {
20 |   type = "builtin",
21 |   modules = {
22 |     ["lapis.bayes"] = "lapis/bayes.lua",
23 |     ["lapis.bayes.classifiers.base"] = "lapis/bayes/classifiers/base.lua",
24 |     ["lapis.bayes.classifiers.bayes"] = "lapis/bayes/classifiers/bayes.lua",
25 |     ["lapis.bayes.classifiers.bayes_multi"] = "lapis/bayes/classifiers/bayes_multi.lua",
26 |     ["lapis.bayes.classifiers.default"] = "lapis/bayes/classifiers/default.lua",
27 |     ["lapis.bayes.classifiers.fisher"] = "lapis/bayes/classifiers/fisher.lua",
28 |     ["lapis.bayes.classifiers.test"] = "lapis/bayes/classifiers/test.lua",
29 |     ["lapis.bayes.migrations"] = "lapis/bayes/migrations.lua",
30 |     ["lapis.bayes.model"] = "lapis/bayes/model.lua",
31 |     ["lapis.bayes.models"] = "lapis/bayes/models.lua",
32 |     ["lapis.bayes.models.categories"] = "lapis/bayes/models/categories.lua",
33 |     ["lapis.bayes.models.word_classifications"] = "lapis/bayes/models/word_classifications.lua",
34 |     ["lapis.bayes.schema"] = "lapis/bayes/schema.lua",
35 |     ["lapis.bayes.text.punycode"] = "lapis/bayes/text/punycode.lua",
36 |     ["lapis.bayes.text.stem"] = "lapis/bayes/text/stem.lua",
37 |     ["lapis.bayes.text.unaccent"] = "lapis/bayes/text/unaccent.lua",
38 |     ["lapis.bayes.text.utf8"] = "lapis/bayes/text/utf8.lua",
39 |     ["lapis.bayes.tokenizers.base"] = "lapis/bayes/tokenizers/base.lua",
40 |     ["lapis.bayes.tokenizers.ngram"] = "lapis/bayes/tokenizers/ngram.lua",
41 |     ["lapis.bayes.tokenizers.postgres_text"] = "lapis/bayes/tokenizers/postgres_text.lua",
42 |     ["lapis.bayes.tokenizers.spam"] = "lapis/bayes/tokenizers/spam.lua",
43 |     ["lapis.bayes.tokenizers.url_domains"] = "lapis/bayes/tokenizers/url_domains.lua",
44 |   }
45 | }
46 | 


--------------------------------------------------------------------------------
/lapis/bayes/models/word_classifications.moon:
--------------------------------------------------------------------------------
 1 | 
 2 | db = require "lapis.db"
 3 | import Model from require "lapis.bayes.model"
 4 | 
 5 | -- Generated schema dump: (do not edit)
 6 | --
 7 | -- CREATE TABLE lapis_bayes_word_classifications (
 8 | --   category_id integer NOT NULL,
 9 | --   word text NOT NULL,
10 | --   count integer DEFAULT 0 NOT NULL
11 | -- );
12 | -- ALTER TABLE ONLY lapis_bayes_word_classifications
13 | --   ADD CONSTRAINT lapis_bayes_word_classifications_pkey PRIMARY KEY (category_id, word);
14 | --
15 | class WordClassifications extends Model
16 |   @primary_key: {"category_id", "word"}
17 | 
18 |   @relations: {
19 |     {"category", belongs_to: "Categories"}
20 |   }
21 | 
22 |   @find_or_create: (opts={}) =>
23 |     @find(opts) or @create(opts)
24 | 
25 |   @purge_word: (word, categories) =>
26 |     import Categories from require "lapis.bayes.models"
27 | 
28 |     categories = { categories } unless type(categories) == "table"
29 |     original_count = #categories
30 |     assert original_count > 0, "missing categories"
31 |     categories = Categories\find_all categories, key: "name"
32 |     assert #categories == original_count, "failed to find all categories specified"
33 | 
34 |     wcs = @select "where word = ? and category_id in ?",
35 |       word, db.list [c.id for c in *categories]
36 | 
37 |     count = 0
38 |     for wc in *wcs
39 |       if wc\delete!
40 |         count += 1
41 | 
42 |     count > 0, count
43 | 
44 |   delete: =>
45 |     deleted, res = super db.raw "*"
46 | 
47 |     if deleted
48 |       removed_row = @@load (unpack res)
49 | 
50 |       import Categories from require "lapis.bayes.models"
51 |       db.update Categories\table_name!, {
52 |         total_count: db.raw db.interpolate_query " total_count - ?", removed_row.count
53 |       }, {
54 |         id: @category_id
55 |       }
56 | 
57 |       true
58 | 
59 | 
60 |   -- note: this should not be called directly, use the associated method on the category model
61 |   _increment: (amount) =>
62 |     amount = assert tonumber(amount), "expecting number"
63 |     @update {
64 |       count: db.raw "count + #{amount}"
65 |     }
66 | 
67 |     if @count == 0
68 |       db.delete @@table_name!, {
69 |         category_id: @category_id
70 |         word: @word
71 |         count: 0
72 |       }
73 | 
74 | 
75 | 


--------------------------------------------------------------------------------
/lapis/bayes/tokenizers/ngram.moon:
--------------------------------------------------------------------------------
 1 | class NgramTokenizer extends require "lapis.bayes.tokenizers.base"
 2 |   new: (@opts = {}) =>
 3 | 
 4 |   build_grammar: =>
 5 |     import C, Ct from require "lpeg"
 6 |     utf8 = require "lapis.util.utf8"
 7 | 
 8 |     whitespace = utf8.whitespace
 9 |     printable = utf8.printable_character
10 |     word_chars = printable - whitespace
11 |     word = C word_chars^1
12 | 
13 |     Ct (word + whitespace^1)^0
14 | 
15 |   normalize_word: (word) =>
16 |     return unless word and word != ""
17 | 
18 |     normalized = tostring(word)\lower!
19 |     normalized = normalized\gsub("[%p]", "")
20 |     normalized = normalized\gsub("%s+", "")
21 | 
22 |     return unless normalized != ""
23 |     normalized
24 | 
25 |   ngram_size: =>
26 |     n = tonumber(@opts.n) or 2
27 |     n = math.floor n
28 |     n = 1 if n < 1
29 |     n
30 | 
31 |   word_ngrams: (word, n) =>
32 |     -- Split word into UTF-8 characters using LPEG
33 |     import C, Ct from require "lpeg"
34 |     utf8 = require "lapis.util.utf8"
35 |     printable = utf8.printable_character
36 | 
37 |     char_pattern = Ct (C printable)^0
38 |     chars = char_pattern\match word
39 | 
40 |     return { word } unless chars
41 | 
42 |     len = #chars
43 |     return { word } if len == 0
44 |     return { word } if len < n
45 | 
46 |     out = {}
47 |     for i = 1, len - n + 1
48 |       ngram = table.concat chars, "", i, i + n - 1
49 |       table.insert out, ngram
50 | 
51 |     out
52 | 
53 |   tokenize_text: (text) =>
54 |     return {} unless text and text != ""
55 | 
56 |     if pre_filter = @opts.filter_text
57 |       text = pre_filter text
58 |       return {} unless text and text != ""
59 | 
60 |     @grammar or= @build_grammar!
61 |     words = @grammar\match text
62 |     return {} unless words
63 | 
64 |     n = @ngram_size!
65 |     ignore_numbers = @opts.ignore_numbers
66 |     ignore_numbers = true if ignore_numbers == nil
67 | 
68 |     tokens = {}
69 |     for raw_word in *words
70 |       cleaned = @normalize_word raw_word
71 |       continue unless cleaned
72 | 
73 |       if ignore_numbers and cleaned\match "^%d+$"
74 |         continue
75 | 
76 |       for token in *@word_ngrams cleaned, n
77 |         table.insert tokens, token
78 | 
79 |     if @opts.filter_tokens
80 |       tokens = @opts.filter_tokens tokens, @opts
81 | 
82 |     tokens
83 | 


--------------------------------------------------------------------------------
/lapis/bayes/text/utf8.lua:
--------------------------------------------------------------------------------
 1 | local P, R
 2 | do
 3 |   local _obj_0 = require("lpeg")
 4 |   P, R = _obj_0.P, _obj_0.R
 5 | end
 6 | local cont = R("\128\191")
 7 | local han_ext_a = P("\227") * R("\144\191") * cont + P("\228") * R("\128\182") * cont
 8 | local han_unified = P("\228") * R("\184\191") * cont + R("\229\232") * cont * cont + P("\233") * R("\128\191") * cont
 9 | local han_compat = P("\239") * R("\164\171") * cont
10 | local han_supplement = P("\240") * R("\160\178") * cont * cont
11 | local han_character = han_ext_a + han_unified + han_compat + han_supplement
12 | local hiragana_block = P("\227\129") * cont + P("\227\130") * R("\128\159")
13 | local kana_supplement = P("\240\155") * R("\128\133") * cont
14 | local hiragana_character = hiragana_block + kana_supplement
15 | local katakana_main = P("\227\130") * R("\160\191") + P("\227\131") * cont
16 | local katakana_phonetic_ext = P("\227\135") * R("\176\191")
17 | local katakana_halfwidth = P("\239\189") * R("\166\191") + P("\239\190") * R("\128\159")
18 | local katakana_character = katakana_main + katakana_phonetic_ext + katakana_halfwidth + kana_supplement
19 | local kana_character = hiragana_block + katakana_main + katakana_phonetic_ext + katakana_halfwidth + kana_supplement
20 | local hangul_jamo = P("\225") * R("\132\135") * cont
21 | local hangul_jamo_ext_a = P("\234\165") * R("\160\191")
22 | local hangul_compat_jamo = P("\227\132") * R("\176\191") + P("\227\133") * cont + P("\227\134") * cont + P("\227\135") * R("\128\143")
23 | local hangul_syllables = P("\234") * R("\176\191") * cont + R("\235\236") * cont * cont + P("\237") * (R("\128\157") * cont + P("\158") * R("\128\163"))
24 | local hangul_jamo_ext_b = P("\237\158") * R("\176\191") + P("\237\159") * cont
25 | local hangul_halfwidth = P("\239\190") * R("\160\191") + P("\239\191") * R("\128\156")
26 | local hangul_character = hangul_jamo + hangul_jamo_ext_a + hangul_compat_jamo + hangul_syllables + hangul_jamo_ext_b + hangul_halfwidth
27 | local zero_width_character = P("\226") * (P("\128") * R("\139\141") + P("\129\160")) + P("\239\187\191")
28 | local cjk_character = han_character + kana_character + hangul_character
29 | return {
30 |   cont = cont,
31 |   han_character = han_character,
32 |   hiragana_character = hiragana_character,
33 |   katakana_character = katakana_character,
34 |   kana_character = kana_character,
35 |   hangul_character = hangul_character,
36 |   cjk_character = cjk_character,
37 |   zero_width_character = zero_width_character
38 | }
39 | 


--------------------------------------------------------------------------------
/lapis/bayes/text/utf8.moon:
--------------------------------------------------------------------------------
 1 | import P, R from require "lpeg"
 2 | 
 3 | cont = R "\128\191"
 4 | 
 5 | -- Han ideographs (basic, extensions, compatibility, supplementary planes)
 6 | han_ext_a = P"\227" * R("\144\191") * cont + P"\228" * R("\128\182") * cont
 7 | han_unified = P"\228" * R("\184\191") * cont + R("\229\232") * cont * cont + P"\233" * R("\128\191") * cont
 8 | han_compat = P"\239" * R("\164\171") * cont
 9 | han_supplement = P"\240" * R("\160\178") * cont * cont
10 | han_character = han_ext_a + han_unified + han_compat + han_supplement
11 | 
12 | -- Japanese Hiragana
13 | hiragana_block = P"\227\129" * cont + P"\227\130" * R("\128\159")
14 | 
15 | -- Kana supplement & historic kana (hentaigana, archaic forms)
16 | kana_supplement = P"\240\155" * R("\128\133") * cont
17 | 
18 | hiragana_character = hiragana_block + kana_supplement
19 | 
20 | -- Japanese Katakana (standard, extensions, halfwidth)
21 | katakana_main = P"\227\130" * R("\160\191") + P"\227\131" * cont
22 | katakana_phonetic_ext = P"\227\135" * R("\176\191")
23 | katakana_halfwidth = P"\239\189" * R("\166\191") + P"\239\190" * R("\128\159")
24 | katakana_character = katakana_main + katakana_phonetic_ext + katakana_halfwidth + kana_supplement
25 | 
26 | kana_character = hiragana_block + katakana_main + katakana_phonetic_ext + katakana_halfwidth + kana_supplement
27 | 
28 | -- Korean Hangul (jamo, syllables, compatibility/halfwidth)
29 | hangul_jamo = P"\225" * R("\132\135") * cont
30 | hangul_jamo_ext_a = P"\234\165" * R("\160\191")
31 | hangul_compat_jamo = P"\227\132" * R("\176\191") + P"\227\133" * cont + P"\227\134" * cont + P"\227\135" * R("\128\143")
32 | hangul_syllables = P"\234" * R("\176\191") * cont + R("\235\236") * cont * cont + P"\237" * (R("\128\157") * cont + P"\158" * R("\128\163"))
33 | hangul_jamo_ext_b = P"\237\158" * R("\176\191") + P"\237\159" * cont
34 | hangul_halfwidth = P"\239\190" * R("\160\191") + P"\239\191" * R("\128\156")
35 | hangul_character = hangul_jamo + hangul_jamo_ext_a + hangul_compat_jamo + hangul_syllables + hangul_jamo_ext_b + hangul_halfwidth
36 | 
37 | -- Zero-width characters (invisible formatting characters)
38 | -- Tree-structured: branch by first byte, then second, then third
39 | zero_width_character = P"\226" * (P"\128" * R"\139\141" + P"\129\160") + P"\239\187\191"
40 | 
41 | cjk_character = han_character + kana_character + hangul_character
42 | 
43 | {
44 |   :cont
45 |   :han_character
46 |   :hiragana_character
47 |   :katakana_character
48 |   :kana_character
49 |   :hangul_character
50 |   :cjk_character
51 |   :zero_width_character
52 | }
53 | 


--------------------------------------------------------------------------------
/examples/detect_language.lua:
--------------------------------------------------------------------------------
 1 | local NgramTokenizer = require("lapis.bayes.tokenizers.ngram")
 2 | local BayesMultiClassifier = require("lapis.bayes.classifiers.bayes_multi")
 3 | 
 4 | -- generates character ngrames of length 2
 5 | local tokenizer = NgramTokenizer({n = 2})
 6 | 
 7 | -- A BayesMultiClassifier supports classifying to any number of categories
 8 | local classifier = BayesMultiClassifier({tokenizer = tokenizer})
 9 | 
10 | local training_data = {
11 |   {"english", "The quick brown fox jumps over the lazy dog"},
12 |   {"english", "Hello world this is a test of the system"},
13 |   {"english", "Programming and software development with modern technology"},
14 | 
15 |   {"spanish", "El rápido zorro marrón salta sobre el perro perezoso"},
16 |   {"spanish", "Hola mundo esta es una prueba del sistema"},
17 |   {"spanish", "Los lenguajes de programación son herramientas importantes"},
18 | 
19 |   {"french", "Le rapide renard brun saute pardessus le chien paresseux"},
20 |   {"french", "Bonjour le monde ceci est un test du système"},
21 |   {"french", "Les langages de programmation sont des outils importants"},
22 | 
23 |   {"german", "Der schnelle braune Fuchs springt über den faulen Hund"},
24 |   {"german", "Hallo Welt dies ist ein Test des Systems"},
25 |   {"german", "Programmiersprachen sind wichtige Werkzeuge für die Entwicklung"},
26 | 
27 |   {"chinese", "敏捷的棕色狐狸跳过懒狗"},
28 |   {"chinese", "你好世界这是一个系统的测试"},
29 |   {"chinese", "编程语言是表达算法的重要工具"},
30 | }
31 | 
32 | -- Train the classifier
33 | print("Training classifier...")
34 | for _, entry in ipairs(training_data) do
35 |   local language, text = entry[1], entry[2]
36 |   classifier:train_text(language, text)
37 | end
38 | print("Training complete.\n")
39 | 
40 | -- Classify new text
41 | local test_cases = {
42 |   "Welcome to our website",
43 |   "Bienvenido a nuestro sitio",
44 |   "Bienvenue sur notre site",
45 |   "Willkommen auf unserer Website",
46 |   "欢迎来到我们的网站",
47 | }
48 | 
49 | print("Classifying test sentences:\n")
50 | for _, test in ipairs(test_cases) do
51 |   local text = test[1]
52 | 
53 |   -- Get probability distribution across all languages
54 |   local probs = classifier:text_probabilities({
55 |     "english",
56 |     "spanish",
57 |     "french",
58 |     "german",
59 |     "chinese"
60 |   }, text)
61 | 
62 |   -- The result is sorted by probability, first entry is the detected language
63 |   local detected_language = probs[1][1]
64 |   local confidence = probs[1][2]
65 | 
66 |   print(string.format('Text: "%s"', text))
67 |   print(string.format("Detected: %s (%.1f%% confidence)\n", detected_language, confidence * 100))
68 | end
69 | 


--------------------------------------------------------------------------------
/lapis/bayes/tokenizers/url_domains.moon:
--------------------------------------------------------------------------------
 1 | import trim from require "lapis.util"
 2 | 
 3 | class UrlDomainsTokenizer extends require "lapis.bayes.tokenizers.base"
 4 |   new: (@opts = {}) =>
 5 | 
 6 |   ignore_domain: (domain) =>
 7 |     return unless @opts and @opts.ignore_domains
 8 |     return true if @opts.ignore_domains[domain]
 9 | 
10 |     while true
11 |       sub = domain\gsub("^%**%.?[^%.]+", "*")
12 |       return false if sub == domain
13 |       return true if @opts.ignore_domains[sub]
14 |       domain = sub
15 | 
16 |   -- strip urls to just domains
17 |   filter_tokens: (urls) =>
18 |     return for url in *urls
19 |       url = url\lower!
20 |       url = trim url
21 |       url = url\gsub "^%w+://", ""
22 |       url = url\gsub "^www%.", ""
23 |       url = url\gsub "/.*$", ""
24 |       url = trim url
25 | 
26 |       url\gsub "<$", ""
27 |       url\gsub "^>", ""
28 | 
29 |       continue if url == ""
30 |       continue if url\match "^%w+:" -- mailto and co
31 |       continue if url\match [=[[<>="' ]]=]
32 |       continue unless url\match "%."
33 | 
34 |       continue if @ignore_domain url
35 | 
36 |       url
37 | 
38 |   build_grammar: =>
39 |     import P, S, R, C, Ct, Cs from require "lpeg"
40 | 
41 |     case_insensitive = (text) ->
42 |       out = nil
43 |       for char in text\gmatch "."
44 |         p = S"#{char\lower!}#{char\upper!}"
45 |         if out
46 |           out *= p
47 |         else
48 |           out = p
49 | 
50 |       out
51 | 
52 |     -- this is far from comprehensive
53 |     unescape_char = P"&gt;" / ">" +
54 |       P"&lt;" / "<" +
55 |       P"&amp;" / "&" +
56 |       P"&nbsp;" / " " +
57 |       P"&#x27;" / "'" +
58 |       P"&#x2F;" / "/" +
59 |       P"&quot;" / '"'
60 | 
61 |     unescape_text = Cs (unescape_char + 1)^1
62 | 
63 |     some_space = S" \t\n"
64 |     space = some_space^0
65 |     alphanum = R "az", "AZ", "09"
66 | 
67 |     scheme = case_insensitive"http" * case_insensitive"s"^-1 * P"://"
68 |     raw_url = C scheme * (P(1) - S" \t\n")^1
69 | 
70 |     word = (alphanum + S"._-")^1
71 |     attr_value = C(word) + P'"' * C((1 - P'"')^0) * P'"' + P"'" * C((1 - P"'")^0) * P"'"
72 | 
73 |     href = (case_insensitive"href" + case_insensitive"src") * space * P"=" * space * attr_value / (v) -> unescape_text\match(v) or ""
74 | 
75 |     simple = C case_insensitive"www" * (P"." * (1 - (S"./" + some_space))^1)^1
76 | 
77 |     Ct (raw_url + href + simple + 1)^0
78 | 
79 |   tokenize_text: (text) =>
80 |     @grammar or= @build_grammar!
81 |     matches = @grammar\match text
82 |     return nil, "failed to parse text" unless matches
83 |     @filter_tokens matches
84 | 


--------------------------------------------------------------------------------
/lapis/bayes/tokenizers/postgres_text.moon:
--------------------------------------------------------------------------------
 1 | db = require "lapis.db"
 2 | 
 3 | -- postgres based tokenizer
 4 | -- opts = {
 5 | --   filter_text: function -- function to pre-filter text, returns new text
 6 | --   strip_tags: bool -- remove html tags from input in default
 7 | --   symbols_split_tokens: bool -- symbols split apart tokens
 8 | --   min_token_length: number -- min length of token (default 2)
 9 | --   max_token_length: number -- max length of token (default 12)
10 | --   strip_numbers: bool -- remove tokens that are a number (including decimal, default true)
11 | --   ignore_words: table -- table of words to ignore (keys are words, values should be truthy)
12 | --   filter_tokens: function -- custom function to filter tokens, receives tokens and opts
13 | --   legacy_tokenizer: bool -- use slower ts_debug tokenizer that keeps duplicates
14 | --   regconfig: string -- PostgreSQL text search configuration (default "english")
15 | -- }
16 | class PostgresTextTokenizer extends require "lapis.bayes.tokenizers.base"
17 |   new: (@opts = {}) =>
18 | 
19 |   filter_tokens: (tokens) =>
20 |     opts = @opts
21 |     min_len = opts and opts.min_token_length or 2
22 |     max_len = opts and opts.max_token_length or 12
23 | 
24 |     strip_numbers = opts and opts.strip_numbers
25 |     strip_numbers = true if strip_numbers == nil
26 | 
27 |     return for t in *tokens
28 |       t_len = #t
29 |       continue if t_len > max_len
30 |       continue if t_len < min_len
31 | 
32 |       if strip_numbers and t\match "^[%d%.%/%-]+$"
33 |         continue
34 | 
35 |       continue if @opts and @opts.ignore_words and @opts.ignore_words[t]
36 |       t
37 | 
38 |   slow_pg_tokenize: (text) =>
39 |     regconfig = @opts.regconfig or "english"
40 |     -- this slower form will keep duplicate words
41 |     db.query [[SELECT unnest(lexemes) AS word FROM ts_debug(?, ?)]], regconfig, text
42 | 
43 |   -- much faster (50x), but loses duplicates. Needs newer version of postgres
44 |   pg_tokenize: (text) =>
45 |     regconfig = @opts.regconfig or "english"
46 |     db.query [[SELECT unnest(tsvector_to_array(to_tsvector(?, ?))) AS word]], regconfig, text
47 | 
48 |   tokenize_text: (text) =>
49 |     if pre_filter = @opts.filter_text
50 |       text = pre_filter text
51 | 
52 |     if @opts.strip_tags
53 |       import extract_text from require "web_sanitize"
54 |       text = extract_text text
55 | 
56 |     if @opts.symbols_split_tokens
57 |       text = text\gsub "[%!%@%#%$%%%^%&%*%(%)%[%]%{%}%|%\\%/%`%~%-%_%<%>%,%.]", " "
58 | 
59 |     res = if @opts.legacy_tokenizer
60 |       @slow_pg_tokenize text
61 |     else
62 |       @pg_tokenize text
63 | 
64 |     tokens = @filter_tokens [r.word for r in *res]
65 | 
66 |     if @opts.filter_tokens
67 |       tokens = @opts.filter_tokens tokens, @opts
68 | 
69 |     tokens
70 | 


--------------------------------------------------------------------------------
/lapis/bayes/models/categories.moon:
--------------------------------------------------------------------------------
 1 | 
 2 | db = require "lapis.db"
 3 | import Model, encode_tuples from require "lapis.bayes.model"
 4 | 
 5 | -- Generated schema dump: (do not edit)
 6 | --
 7 | -- CREATE TABLE lapis_bayes_categories (
 8 | --   id integer NOT NULL,
 9 | --   name text NOT NULL,
10 | --   total_count integer DEFAULT 0 NOT NULL,
11 | --   created_at timestamp without time zone NOT NULL,
12 | --   updated_at timestamp without time zone NOT NULL
13 | -- );
14 | -- ALTER TABLE ONLY lapis_bayes_categories
15 | --   ADD CONSTRAINT lapis_bayes_categories_pkey PRIMARY KEY (id);
16 | --
17 | class Categories extends Model
18 |   @timestamp: true
19 | 
20 |   @relations: {
21 |     {"word_classifications", has_many: "WordClassifications"}
22 |   }
23 | 
24 |   @find_or_create: (name) =>
25 |     @find(:name) or @create(:name)
26 | 
27 |   delete: =>
28 |     if super!
29 |       import WordClassifications from require "lapis.bayes.models"
30 |       db.delete WordClassifications\table_name!, {
31 |         category_id: @id
32 |       }
33 | 
34 |   increment: (amount) =>
35 |     amount = assert tonumber(amount), "expecting number"
36 |     @update {
37 |       total_count: db.raw "total_count + #{amount}"
38 |     }
39 | 
40 |   -- NOTE: this was removed since it was tied to a specific tokenizer
41 |   increment_text: (text, opts={}) =>
42 |     error "This method has been removed, use increment_words instead"
43 | 
44 |   -- increment a single word by count
45 |   increment_word: (word, count) =>
46 |     import WordClassifications from require "lapis.bayes.models"
47 |     w = WordClassifications\find_or_create {
48 |       category_id: @id
49 |       :word
50 |     }
51 |     w\_increment count
52 |     @increment count
53 | 
54 |   -- issue a single query to increment all WordClassifications for this
55 |   -- category with the list of words
56 |   -- counts: table in the format {word = count, ... word1, word2, ...}
57 |   increment_words: (counts) =>
58 |     return nil, "missing counts" unless counts
59 | 
60 |     -- combine hash and array words into summed count
61 |     merged_counts = {}
62 |     for k,v in pairs counts
63 |       word, count = if type(k) == "string"
64 |         k, v
65 |       else
66 |         v, 1
67 | 
68 |       merged_counts[word] or= 0
69 |       merged_counts[word] += count
70 | 
71 |     total_count = 0
72 |     tuples = for word, count in pairs merged_counts
73 |       total_count += count
74 |       {@id, word, count}
75 | 
76 |     unless next tuples
77 |       return total_count
78 | 
79 |     import WordClassifications from require "lapis.bayes.models"
80 |     tbl = db.escape_identifier WordClassifications\table_name!
81 | 
82 |     db.query "
83 |     INSERT INTO #{tbl} (category_id, word, count) #{encode_tuples tuples}
84 |     ON CONFLICT (category_id, word) DO UPDATE SET count = #{tbl}.count + EXCLUDED.count
85 |     "
86 | 
87 |     @increment total_count
88 |     total_count
89 | 
90 | 


--------------------------------------------------------------------------------
/lapis/bayes/classifiers/bayes_multi.moon:
--------------------------------------------------------------------------------
 1 | -- Multiclass naive Bayes classifier with Laplace-style smoothing
 2 | class BayesMultiClassifier extends require "lapis.bayes.classifiers.base"
 3 |   @default_options: {
 4 |     max_words: 40
 5 |     default_prob: 0.1
 6 |   }
 7 | 
 8 |   candidate_words: (categories, available_words, count) =>
 9 |     return available_words unless count and count < #available_words
10 | 
11 |     tuples = for word in *available_words
12 |       totals = 0
13 |       counts = {}
14 |       for category in *categories
15 |         word_counts = category.word_counts
16 |         c = word_counts and word_counts[word] or 0
17 |         table.insert counts, c
18 |         totals += c
19 | 
20 |       score = if totals == 0
21 |         0
22 |       else
23 |         mean = totals / #counts
24 |         variance = 0
25 |         for c in *counts
26 |           variance += (c - mean) ^ 2
27 |         variance / #counts
28 | 
29 |       score += math.random! / 1000
30 | 
31 |       { word, score }
32 | 
33 |     table.sort tuples, (a, b) -> a[2] > b[2]
34 |     [t[1] for t in *tuples[,count]]
35 | 
36 |   word_probabilities: (categories, available_words) =>
37 |     return nil, "at least two categories required" unless #categories >= 2
38 | 
39 |     available_words = @candidate_words categories, available_words, @opts.max_words
40 |     vocab_size = #available_words
41 | 
42 |     return nil, "no words to score" unless vocab_size > 0
43 | 
44 |     smoothing = if @opts.default_prob and @opts.default_prob > 0
45 |       @opts.default_prob
46 |     else
47 |       1e-6
48 | 
49 |     sum_counts = 0
50 |     for category in *categories
51 |       sum_counts += category.total_count or 0
52 | 
53 |     prior_smoothing = smoothing * #categories
54 | 
55 |     local max_log
56 |     log_scores = for category in *categories
57 |       cat_total = math.max (category.total_count or 0), 0
58 |       prior = (cat_total + smoothing) / (sum_counts + prior_smoothing)
59 |       log_score = math.log prior
60 | 
61 |       denominator = cat_total + (smoothing * vocab_size)
62 |       denominator = smoothing * vocab_size if denominator <= 0
63 | 
64 |       for word in *available_words
65 |         word_count = category.word_counts and category.word_counts[word] or 0
66 |         log_score += math.log ((word_count + smoothing) / denominator)
67 | 
68 |       max_log = if max_log
69 |         math.max max_log, log_score
70 |       else
71 |         log_score
72 | 
73 |       { category, log_score }
74 | 
75 |     weights = {}
76 |     total_weight = 0
77 |     for {category, log_score} in *log_scores
78 |       weight = math.exp (log_score - max_log)
79 |       total_weight += weight
80 |       table.insert weights, { category.name, weight }
81 | 
82 |     return nil, "unable to normalise probabilities" unless total_weight > 0
83 | 
84 |     for tuple in *weights
85 |       tuple[2] /= total_weight
86 | 
87 |     table.sort weights, (a, b) -> a[2] > b[2]
88 |     weights
89 | 


--------------------------------------------------------------------------------
/spec/utf8_spec.moon:
--------------------------------------------------------------------------------
 1 | scripts = require "lapis.bayes.text.utf8"
 2 | import C, P from require "lpeg"
 3 | 
 4 | capture = (pattern, text) ->
 5 |   (C(pattern) * -P(1))\match text
 6 | 
 7 | matches = (pattern, text) ->
 8 |   not not ((pattern * -P(1))\match text)
 9 | 
10 | describe "lapis.bayes.text.utf8", ->
11 |   describe "han_character", ->
12 |     it "matches a basic Han ideograph", ->
13 |       assert.same "漢", capture scripts.han_character, "漢"
14 | 
15 |     it "matches a supplementary plane character", ->
16 |       assert.same "𠀋", capture scripts.han_character, "𠀋"
17 | 
18 |     it "does not match kana characters", ->
19 |       assert.falsy matches scripts.han_character, "あ"
20 |       assert.falsy matches scripts.han_character, "ア"
21 | 
22 |   describe "kana_character", ->
23 |     it "matches hiragana and katakana", ->
24 |       assert.same "あ", capture scripts.kana_character, "あ"
25 |       assert.same "ア", capture scripts.kana_character, "ア"
26 | 
27 |     it "matches halfwidth katakana", ->
28 |       assert.same "ｱ", capture scripts.kana_character, "ｱ"
29 | 
30 |     it "does not match Han or Latin letters", ->
31 |       assert.falsy matches scripts.kana_character, "漢"
32 |       assert.falsy matches scripts.kana_character, "A"
33 | 
34 |   describe "hangul_character", ->
35 |     it "matches modern syllables and jamo", ->
36 |       assert.same "한", capture scripts.hangul_character, "한"
37 |       assert.same "ᄀ", capture scripts.hangul_character, "ᄀ"
38 | 
39 |     it "matches halfwidth Hangul letters", ->
40 |       assert.same "ﾡ", capture scripts.hangul_character, "ﾡ"
41 | 
42 |     it "does not match kana", ->
43 |       assert.falsy matches scripts.hangul_character, "ア"
44 | 
45 |   describe "cjk_character", ->
46 |     it "matches characters across Han, Kana, and Hangul", ->
47 |       assert.same "漢", capture scripts.cjk_character, "漢"
48 |       assert.same "あ", capture scripts.cjk_character, "あ"
49 |       assert.same "한", capture scripts.cjk_character, "한"
50 | 
51 |     it "rejects non-CJK characters", ->
52 |       assert.falsy matches scripts.cjk_character, "A"
53 |       assert.falsy matches scripts.cjk_character, "1"
54 | 
55 |   describe "zero_width_character", ->
56 |     it "matches zero width space (U+200B)", ->
57 |       assert.same "\226\128\139", capture scripts.zero_width_character, "\226\128\139"
58 | 
59 |     it "matches zero width non-joiner (U+200C)", ->
60 |       assert.same "\226\128\140", capture scripts.zero_width_character, "\226\128\140"
61 | 
62 |     it "matches zero width joiner (U+200D)", ->
63 |       assert.same "\226\128\141", capture scripts.zero_width_character, "\226\128\141"
64 | 
65 |     it "matches word joiner (U+2060)", ->
66 |       assert.same "\226\129\160", capture scripts.zero_width_character, "\226\129\160"
67 | 
68 |     it "matches byte order mark (U+FEFF)", ->
69 |       assert.same "\239\187\191", capture scripts.zero_width_character, "\239\187\191"
70 | 
71 |     it "rejects visible characters", ->
72 |       assert.falsy matches scripts.zero_width_character, "A"
73 |       assert.falsy matches scripts.zero_width_character, " "
74 |       assert.falsy matches scripts.zero_width_character, "漢"
75 | 


--------------------------------------------------------------------------------
/lapis/bayes/classifiers/fisher.lua:
--------------------------------------------------------------------------------
  1 | local inv_chi2
  2 | inv_chi2 = function(chi, df)
  3 |   assert(df % 2 == 0, "df must be even")
  4 |   local m = chi / 2.0
  5 |   local sum = math.exp(-m)
  6 |   local term = sum
  7 |   for i = 1, math.floor(df / 2) do
  8 |     term = term * (m / i)
  9 |     sum = sum + term
 10 |   end
 11 |   return math.min(sum, 1)
 12 | end
 13 | local FisherClassifier
 14 | do
 15 |   local _class_0
 16 |   local _parent_0 = require("lapis.bayes.classifiers.base")
 17 |   local _base_0 = {
 18 |     word_probabilities = function(self, categories, available_words)
 19 |       if not (#categories == 2) then
 20 |         return nil, "only two categories supported at once"
 21 |       end
 22 |       local a, b
 23 |       a, b = categories[1], categories[2]
 24 |       local s = self.opts.robs
 25 |       local x = self.opts.robx
 26 |       local min_dev = self.opts.min_dev
 27 |       local mul_a = 0
 28 |       local mul_b = 0
 29 |       local kept_tokens = 0
 30 |       for _index_0 = 1, #available_words do
 31 |         local word = available_words[_index_0]
 32 |         local a_count = a.word_counts and a.word_counts[word] or 0
 33 |         local b_count = b.word_counts and b.word_counts[word] or 0
 34 |         local p = a_count / (a_count + b_count)
 35 |         local n = a_count + b_count
 36 |         local val = ((s * x) + (n * p)) / (s + n)
 37 |         if not min_dev or math.abs(val - 0.5) > min_dev then
 38 |           mul_a = mul_a + math.log(val)
 39 |           mul_b = mul_b + math.log(1 - val)
 40 |           kept_tokens = kept_tokens + 1
 41 |         end
 42 |       end
 43 |       if kept_tokens == 0 then
 44 |         return nil, "not enough strong signals to decide"
 45 |       end
 46 |       local pa = inv_chi2(-2 * mul_a, 2 * kept_tokens)
 47 |       local pb = inv_chi2(-2 * mul_b, 2 * kept_tokens)
 48 |       local p = (1 + pa - pb) / 2
 49 |       local tuples = {
 50 |         {
 51 |           a.name,
 52 |           p
 53 |         },
 54 |         {
 55 |           b.name,
 56 |           1 - p
 57 |         }
 58 |       }
 59 |       table.sort(tuples, function(a, b)
 60 |         return a[2] > b[2]
 61 |       end)
 62 |       return tuples
 63 |     end
 64 |   }
 65 |   _base_0.__index = _base_0
 66 |   setmetatable(_base_0, _parent_0.__base)
 67 |   _class_0 = setmetatable({
 68 |     __init = function(self, ...)
 69 |       return _class_0.__parent.__init(self, ...)
 70 |     end,
 71 |     __base = _base_0,
 72 |     __name = "FisherClassifier",
 73 |     __parent = _parent_0
 74 |   }, {
 75 |     __index = function(cls, name)
 76 |       local val = rawget(_base_0, name)
 77 |       if val == nil then
 78 |         local parent = rawget(cls, "__parent")
 79 |         if parent then
 80 |           return parent[name]
 81 |         end
 82 |       else
 83 |         return val
 84 |       end
 85 |     end,
 86 |     __call = function(cls, ...)
 87 |       local _self_0 = setmetatable({}, _base_0)
 88 |       cls.__init(_self_0, ...)
 89 |       return _self_0
 90 |     end
 91 |   })
 92 |   _base_0.__class = _class_0
 93 |   local self = _class_0
 94 |   self.default_options = {
 95 |     robs = 1,
 96 |     robx = 0.5,
 97 |     min_dev = 0.3
 98 |   }
 99 |   if _parent_0.__inherited then
100 |     _parent_0.__inherited(_parent_0, _class_0)
101 |   end
102 |   FisherClassifier = _class_0
103 |   return _class_0
104 | end
105 | 


--------------------------------------------------------------------------------
/lapis/bayes/models/word_classifications.lua:
--------------------------------------------------------------------------------
  1 | local db = require("lapis.db")
  2 | local Model
  3 | Model = require("lapis.bayes.model").Model
  4 | local WordClassifications
  5 | do
  6 |   local _class_0
  7 |   local _parent_0 = Model
  8 |   local _base_0 = {
  9 |     delete = function(self)
 10 |       local deleted, res = _class_0.__parent.__base.delete(self, db.raw("*"))
 11 |       if deleted then
 12 |         local removed_row = self.__class:load((unpack(res)))
 13 |         local Categories
 14 |         Categories = require("lapis.bayes.models").Categories
 15 |         db.update(Categories:table_name(), {
 16 |           total_count = db.raw(db.interpolate_query(" total_count - ?", removed_row.count))
 17 |         }, {
 18 |           id = self.category_id
 19 |         })
 20 |         return true
 21 |       end
 22 |     end,
 23 |     _increment = function(self, amount)
 24 |       amount = assert(tonumber(amount), "expecting number")
 25 |       self:update({
 26 |         count = db.raw("count + " .. tostring(amount))
 27 |       })
 28 |       if self.count == 0 then
 29 |         return db.delete(self.__class:table_name(), {
 30 |           category_id = self.category_id,
 31 |           word = self.word,
 32 |           count = 0
 33 |         })
 34 |       end
 35 |     end
 36 |   }
 37 |   _base_0.__index = _base_0
 38 |   setmetatable(_base_0, _parent_0.__base)
 39 |   _class_0 = setmetatable({
 40 |     __init = function(self, ...)
 41 |       return _class_0.__parent.__init(self, ...)
 42 |     end,
 43 |     __base = _base_0,
 44 |     __name = "WordClassifications",
 45 |     __parent = _parent_0
 46 |   }, {
 47 |     __index = function(cls, name)
 48 |       local val = rawget(_base_0, name)
 49 |       if val == nil then
 50 |         local parent = rawget(cls, "__parent")
 51 |         if parent then
 52 |           return parent[name]
 53 |         end
 54 |       else
 55 |         return val
 56 |       end
 57 |     end,
 58 |     __call = function(cls, ...)
 59 |       local _self_0 = setmetatable({}, _base_0)
 60 |       cls.__init(_self_0, ...)
 61 |       return _self_0
 62 |     end
 63 |   })
 64 |   _base_0.__class = _class_0
 65 |   local self = _class_0
 66 |   self.primary_key = {
 67 |     "category_id",
 68 |     "word"
 69 |   }
 70 |   self.relations = {
 71 |     {
 72 |       "category",
 73 |       belongs_to = "Categories"
 74 |     }
 75 |   }
 76 |   self.find_or_create = function(self, opts)
 77 |     if opts == nil then
 78 |       opts = { }
 79 |     end
 80 |     return self:find(opts) or self:create(opts)
 81 |   end
 82 |   self.purge_word = function(self, word, categories)
 83 |     local Categories
 84 |     Categories = require("lapis.bayes.models").Categories
 85 |     if not (type(categories) == "table") then
 86 |       categories = {
 87 |         categories
 88 |       }
 89 |     end
 90 |     local original_count = #categories
 91 |     assert(original_count > 0, "missing categories")
 92 |     categories = Categories:find_all(categories, {
 93 |       key = "name"
 94 |     })
 95 |     assert(#categories == original_count, "failed to find all categories specified")
 96 |     local wcs = self:select("where word = ? and category_id in ?", word, db.list((function()
 97 |       local _accum_0 = { }
 98 |       local _len_0 = 1
 99 |       for _index_0 = 1, #categories do
100 |         local c = categories[_index_0]
101 |         _accum_0[_len_0] = c.id
102 |         _len_0 = _len_0 + 1
103 |       end
104 |       return _accum_0
105 |     end)()))
106 |     local count = 0
107 |     for _index_0 = 1, #wcs do
108 |       local wc = wcs[_index_0]
109 |       if wc:delete() then
110 |         count = count + 1
111 |       end
112 |     end
113 |     return count > 0, count
114 |   end
115 |   if _parent_0.__inherited then
116 |     _parent_0.__inherited(_parent_0, _class_0)
117 |   end
118 |   WordClassifications = _class_0
119 |   return _class_0
120 | end
121 | 


--------------------------------------------------------------------------------
/lapis/bayes/classifiers/test.lua:
--------------------------------------------------------------------------------
  1 | local average
  2 | average = function(nums)
  3 |   local sum = 0
  4 |   for _index_0 = 1, #nums do
  5 |     local n = nums[_index_0]
  6 |     sum = sum + n
  7 |   end
  8 |   return sum / #nums
  9 | end
 10 | local weighted_avg
 11 | weighted_avg = function(tuples)
 12 |   local num_tuples = #tuples
 13 |   local sum = 0
 14 |   local sum_weight = 0
 15 |   for _index_0 = 1, #tuples do
 16 |     local _des_0 = tuples[_index_0]
 17 |     local num, weight
 18 |     num, weight = _des_0[1], _des_0[2]
 19 |     sum = sum + num
 20 |     sum_weight = sum_weight + weight
 21 |   end
 22 |   local avg_weight = sum_weight / num_tuples
 23 |   local avg = 0
 24 |   for _index_0 = 1, #tuples do
 25 |     local _des_0 = tuples[_index_0]
 26 |     local num, weight
 27 |     num, weight = _des_0[1], _des_0[2]
 28 |     avg = avg + ((num / num_tuples) * (weight / avg_weight))
 29 |   end
 30 |   return avg
 31 | end
 32 | local TestClassifier
 33 | do
 34 |   local _class_0
 35 |   local _parent_0 = require("lapis.bayes.classifiers.base")
 36 |   local _base_0 = {
 37 |     word_probabilities = function(self, categories, available_words)
 38 |       local total_counts = { }
 39 |       for _index_0 = 1, #categories do
 40 |         local _continue_0 = false
 41 |         repeat
 42 |           local c = categories[_index_0]
 43 |           if not (c.word_counts) then
 44 |             _continue_0 = true
 45 |             break
 46 |           end
 47 |           for word, count in pairs(c.word_counts) do
 48 |             local _update_0 = word
 49 |             total_counts[_update_0] = total_counts[_update_0] or 0
 50 |             local _update_1 = word
 51 |             total_counts[_update_1] = total_counts[_update_1] + count
 52 |           end
 53 |           _continue_0 = true
 54 |         until true
 55 |         if not _continue_0 then
 56 |           break
 57 |         end
 58 |       end
 59 |       local probs
 60 |       do
 61 |         local _accum_0 = { }
 62 |         local _len_0 = 1
 63 |         for _index_0 = 1, #categories do
 64 |           local c = categories[_index_0]
 65 |           local tuples
 66 |           do
 67 |             local _accum_1 = { }
 68 |             local _len_1 = 1
 69 |             for _index_1 = 1, #available_words do
 70 |               local word = available_words[_index_1]
 71 |               local total_count = total_counts[word]
 72 |               local cat_count = c.word_counts and c.word_counts[word] or 0
 73 |               local _value_0 = {
 74 |                 cat_count / total_count,
 75 |                 total_count
 76 |               }
 77 |               _accum_1[_len_1] = _value_0
 78 |               _len_1 = _len_1 + 1
 79 |             end
 80 |             tuples = _accum_1
 81 |           end
 82 |           local _value_0 = {
 83 |             c.name,
 84 |             weighted_avg(tuples)
 85 |           }
 86 |           _accum_0[_len_0] = _value_0
 87 |           _len_0 = _len_0 + 1
 88 |         end
 89 |         probs = _accum_0
 90 |       end
 91 |       table.sort(probs, function(a, b)
 92 |         return a[2] > b[2]
 93 |       end)
 94 |       return probs
 95 |     end
 96 |   }
 97 |   _base_0.__index = _base_0
 98 |   setmetatable(_base_0, _parent_0.__base)
 99 |   _class_0 = setmetatable({
100 |     __init = function(self, ...)
101 |       return _class_0.__parent.__init(self, ...)
102 |     end,
103 |     __base = _base_0,
104 |     __name = "TestClassifier",
105 |     __parent = _parent_0
106 |   }, {
107 |     __index = function(cls, name)
108 |       local val = rawget(_base_0, name)
109 |       if val == nil then
110 |         local parent = rawget(cls, "__parent")
111 |         if parent then
112 |           return parent[name]
113 |         end
114 |       else
115 |         return val
116 |       end
117 |     end,
118 |     __call = function(cls, ...)
119 |       local _self_0 = setmetatable({}, _base_0)
120 |       cls.__init(_self_0, ...)
121 |       return _self_0
122 |     end
123 |   })
124 |   _base_0.__class = _class_0
125 |   if _parent_0.__inherited then
126 |     _parent_0.__inherited(_parent_0, _class_0)
127 |   end
128 |   TestClassifier = _class_0
129 |   return _class_0
130 | end
131 | 


--------------------------------------------------------------------------------
/spec/postgres_text_tokenizer_spec.moon:
--------------------------------------------------------------------------------
  1 | import use_test_env from require "lapis.spec"
  2 | 
  3 | describe "lapis.bayes.tokenizers.postgres_text", ->
  4 |   use_test_env!
  5 | 
  6 |   it "skips words in ignore list", ->
  7 |     PostgresTextTokenizer = require "lapis.bayes.tokenizers.postgres_text"
  8 | 
  9 |     t = PostgresTextTokenizer {
 10 |       ignore_words: {
 11 |         hodoc: true
 12 |       }
 13 |     }
 14 | 
 15 |     assert.same {"delisho"}, t\tokenize_text "12 delisho hodocs for $5.99"
 16 | 
 17 | 
 18 |   it "splits on symbols with option", ->
 19 |     PostgresTextTokenizer = require "lapis.bayes.tokenizers.postgres_text"
 20 | 
 21 |     t = PostgresTextTokenizer {
 22 |       symbols_split_tokens: true
 23 |     }
 24 | 
 25 |     assert.same {
 26 |       "buttz"
 27 |       "com"
 28 |       "disgust"
 29 |       "power"
 30 |       "super"
 31 |       "wow"
 32 |     },
 33 |       t\tokenize_text "wow that was super-disgusting buttz.com power/up"
 34 | 
 35 |   it "adds a custom prefilter", ->
 36 |     PostgresTextTokenizer = require "lapis.bayes.tokenizers.postgres_text"
 37 | 
 38 |     t = PostgresTextTokenizer {
 39 |       filter_text: (text) ->
 40 |         text\gsub "[%w]+", "%1zoo"
 41 |     }
 42 | 
 43 |     assert.same {"goodzoo", "greatzoo", "stuffzoo", "wowzoo"},
 44 |       t\tokenize_text "good great stuff wow"
 45 | 
 46 |   it "adds a custom token filter", ->
 47 |     PostgresTextTokenizer = require "lapis.bayes.tokenizers.postgres_text"
 48 | 
 49 |     t = PostgresTextTokenizer {
 50 |       filter_tokens: (tokens) ->
 51 |         [t\reverse! for t in *tokens]
 52 |     }
 53 | 
 54 |     assert.same {"doog", "taerg", "ffuts", "wow"},
 55 |       t\tokenize_text "good great stuff wow"
 56 | 
 57 |   it "respects min_token_length", ->
 58 |     PostgresTextTokenizer = require "lapis.bayes.tokenizers.postgres_text"
 59 | 
 60 |     t = PostgresTextTokenizer {
 61 |       min_token_length: 5
 62 |     }
 63 | 
 64 |     assert.same {"great", "stuff"},
 65 |       t\tokenize_text "hi wow great stuff"
 66 | 
 67 |   it "respects max_token_length", ->
 68 |     PostgresTextTokenizer = require "lapis.bayes.tokenizers.postgres_text"
 69 | 
 70 |     t = PostgresTextTokenizer {
 71 |       max_token_length: 4
 72 |     }
 73 | 
 74 |     assert.same {"good", "wow"},
 75 |       t\tokenize_text "good great stuff wow"
 76 | 
 77 |   it "strips numbers by default", ->
 78 |     PostgresTextTokenizer = require "lapis.bayes.tokenizers.postgres_text"
 79 | 
 80 |     t = PostgresTextTokenizer!
 81 | 
 82 |     tokens = t\tokenize_text "cost 99 dollars 5.99"
 83 |     table.sort tokens
 84 |     assert.same {"cost", "dollar"},
 85 |       tokens
 86 | 
 87 |   it "keeps numbers when strip_numbers is false", ->
 88 |     PostgresTextTokenizer = require "lapis.bayes.tokenizers.postgres_text"
 89 | 
 90 |     t = PostgresTextTokenizer {
 91 |       strip_numbers: false
 92 |     }
 93 | 
 94 |     tokens = t\tokenize_text "cost 99 dollars 5.99"
 95 |     table.sort tokens
 96 |     assert.same {"5.99", "99", "cost", "dollar"},
 97 |       tokens
 98 | 
 99 |   it "strips HTML tags with strip_tags option", ->
100 |     PostgresTextTokenizer = require "lapis.bayes.tokenizers.postgres_text"
101 | 
102 |     t = PostgresTextTokenizer {
103 |       strip_tags: true
104 |     }
105 | 
106 |     assert.same {"hello", "link", "world"},
107 |       t\tokenize_text [[<div>hello world</div><a href="test">link</a>]]
108 | 
109 |   it "uses legacy tokenizer that keeps duplicates", ->
110 |     PostgresTextTokenizer = require "lapis.bayes.tokenizers.postgres_text"
111 | 
112 |     t = PostgresTextTokenizer {
113 |       legacy_tokenizer: true
114 |     }
115 | 
116 |     tokens = t\tokenize_text "burgers are burgers"
117 |     table.sort tokens
118 |     assert.same {"burger", "burger"},
119 |       tokens
120 | 
121 |   it "uses custom regconfig", ->
122 |     PostgresTextTokenizer = require "lapis.bayes.tokenizers.postgres_text"
123 | 
124 |     -- Test with french config
125 |     t = PostgresTextTokenizer {
126 |       regconfig: "french"
127 |     }
128 | 
129 |     -- This should tokenize using French rules
130 |     tokens = t\tokenize_text "les maisons"
131 |     assert.truthy tokens
132 |     assert.truthy #tokens > 0
133 | 


--------------------------------------------------------------------------------
/lapis/bayes/tokenizers/postgres_text.lua:
--------------------------------------------------------------------------------
  1 | local db = require("lapis.db")
  2 | local PostgresTextTokenizer
  3 | do
  4 |   local _class_0
  5 |   local _parent_0 = require("lapis.bayes.tokenizers.base")
  6 |   local _base_0 = {
  7 |     filter_tokens = function(self, tokens)
  8 |       local opts = self.opts
  9 |       local min_len = opts and opts.min_token_length or 2
 10 |       local max_len = opts and opts.max_token_length or 12
 11 |       local strip_numbers = opts and opts.strip_numbers
 12 |       if strip_numbers == nil then
 13 |         strip_numbers = true
 14 |       end
 15 |       return (function()
 16 |         local _accum_0 = { }
 17 |         local _len_0 = 1
 18 |         for _index_0 = 1, #tokens do
 19 |           local _continue_0 = false
 20 |           repeat
 21 |             local t = tokens[_index_0]
 22 |             local t_len = #t
 23 |             if t_len > max_len then
 24 |               _continue_0 = true
 25 |               break
 26 |             end
 27 |             if t_len < min_len then
 28 |               _continue_0 = true
 29 |               break
 30 |             end
 31 |             if strip_numbers and t:match("^[%d%.%/%-]+$") then
 32 |               _continue_0 = true
 33 |               break
 34 |             end
 35 |             if self.opts and self.opts.ignore_words and self.opts.ignore_words[t] then
 36 |               _continue_0 = true
 37 |               break
 38 |             end
 39 |             local _value_0 = t
 40 |             _accum_0[_len_0] = _value_0
 41 |             _len_0 = _len_0 + 1
 42 |             _continue_0 = true
 43 |           until true
 44 |           if not _continue_0 then
 45 |             break
 46 |           end
 47 |         end
 48 |         return _accum_0
 49 |       end)()
 50 |     end,
 51 |     slow_pg_tokenize = function(self, text)
 52 |       local regconfig = self.opts.regconfig or "english"
 53 |       return db.query([[SELECT unnest(lexemes) AS word FROM ts_debug(?, ?)]], regconfig, text)
 54 |     end,
 55 |     pg_tokenize = function(self, text)
 56 |       local regconfig = self.opts.regconfig or "english"
 57 |       return db.query([[SELECT unnest(tsvector_to_array(to_tsvector(?, ?))) AS word]], regconfig, text)
 58 |     end,
 59 |     tokenize_text = function(self, text)
 60 |       do
 61 |         local pre_filter = self.opts.filter_text
 62 |         if pre_filter then
 63 |           text = pre_filter(text)
 64 |         end
 65 |       end
 66 |       if self.opts.strip_tags then
 67 |         local extract_text
 68 |         extract_text = require("web_sanitize").extract_text
 69 |         text = extract_text(text)
 70 |       end
 71 |       if self.opts.symbols_split_tokens then
 72 |         text = text:gsub("[%!%@%#%$%%%^%&%*%(%)%[%]%{%}%|%\\%/%`%~%-%_%<%>%,%.]", " ")
 73 |       end
 74 |       local res
 75 |       if self.opts.legacy_tokenizer then
 76 |         res = self:slow_pg_tokenize(text)
 77 |       else
 78 |         res = self:pg_tokenize(text)
 79 |       end
 80 |       local tokens = self:filter_tokens((function()
 81 |         local _accum_0 = { }
 82 |         local _len_0 = 1
 83 |         for _index_0 = 1, #res do
 84 |           local r = res[_index_0]
 85 |           _accum_0[_len_0] = r.word
 86 |           _len_0 = _len_0 + 1
 87 |         end
 88 |         return _accum_0
 89 |       end)())
 90 |       if self.opts.filter_tokens then
 91 |         tokens = self.opts.filter_tokens(tokens, self.opts)
 92 |       end
 93 |       return tokens
 94 |     end
 95 |   }
 96 |   _base_0.__index = _base_0
 97 |   setmetatable(_base_0, _parent_0.__base)
 98 |   _class_0 = setmetatable({
 99 |     __init = function(self, opts)
100 |       if opts == nil then
101 |         opts = { }
102 |       end
103 |       self.opts = opts
104 |     end,
105 |     __base = _base_0,
106 |     __name = "PostgresTextTokenizer",
107 |     __parent = _parent_0
108 |   }, {
109 |     __index = function(cls, name)
110 |       local val = rawget(_base_0, name)
111 |       if val == nil then
112 |         local parent = rawget(cls, "__parent")
113 |         if parent then
114 |           return parent[name]
115 |         end
116 |       else
117 |         return val
118 |       end
119 |     end,
120 |     __call = function(cls, ...)
121 |       local _self_0 = setmetatable({}, _base_0)
122 |       cls.__init(_self_0, ...)
123 |       return _self_0
124 |     end
125 |   })
126 |   _base_0.__class = _class_0
127 |   if _parent_0.__inherited then
128 |     _parent_0.__inherited(_parent_0, _class_0)
129 |   end
130 |   PostgresTextTokenizer = _class_0
131 |   return _class_0
132 | end
133 | 


--------------------------------------------------------------------------------
/lapis/bayes/text/punycode.moon:
--------------------------------------------------------------------------------
  1 | -- Punycode implementation for internationalized domain names
  2 | -- Based on RFC 3492: https://tools.ietf.org/html/rfc3492
  3 | 
  4 | -- Punycode parameters
  5 | base = 36
  6 | tmin = 1
  7 | tmax = 26
  8 | skew = 38
  9 | damp = 700
 10 | initial_bias = 72
 11 | initial_n = 128
 12 | delimiter = 0x2D  -- hyphen-minus
 13 | 
 14 | -- Adapt bias after each delta
 15 | adapt = (delta, numpoints, firsttime) ->
 16 |   delta = if firsttime
 17 |     math.floor delta / damp
 18 |   else
 19 |     math.floor delta / 2
 20 | 
 21 |   delta = delta + math.floor delta / numpoints
 22 |   k = 0
 23 | 
 24 |   while delta > math.floor((base - tmin) * tmax / 2)
 25 |     delta = math.floor delta / (base - tmin)
 26 |     k = k + base
 27 | 
 28 |   k + math.floor ((base - tmin + 1) * delta) / (delta + skew)
 29 | 
 30 | -- Encode a single digit (0-35) to character
 31 | encode_digit = (d) ->
 32 |   if d < 26
 33 |     string.char d + 0x61  -- a-z
 34 |   else
 35 |     string.char d - 26 + 0x30  -- 0-9
 36 | 
 37 | -- Calculate threshold for digit
 38 | threshold = (k, bias) ->
 39 |   if k <= bias + tmin
 40 |     tmin
 41 |   elseif k >= bias + tmax
 42 |     tmax
 43 |   else
 44 |     k - bias
 45 | 
 46 | -- Check if character is basic (ASCII)
 47 | is_basic = (cp) ->
 48 |   cp < 0x80
 49 | 
 50 | -- Get UTF8 codepoints from string
 51 | utf8_codepoints = (str) ->
 52 |   codepoints = {}
 53 |   i = 1
 54 |   while i <= #str
 55 |     b = string.byte str, i
 56 |     cp = nil
 57 |     len = 1
 58 | 
 59 |     if b < 0x80
 60 |       cp = b
 61 |       len = 1
 62 |     elseif b >= 0xC0 and b < 0xE0
 63 |       b2 = string.byte(str, i + 1) or 0
 64 |       cp = ((b - 0xC0) * 0x40) + (b2 - 0x80)
 65 |       len = 2
 66 |     elseif b >= 0xE0 and b < 0xF0
 67 |       b2 = string.byte(str, i + 1) or 0
 68 |       b3 = string.byte(str, i + 2) or 0
 69 |       cp = ((b - 0xE0) * 0x1000) + ((b2 - 0x80) * 0x40) + (b3 - 0x80)
 70 |       len = 3
 71 |     elseif b >= 0xF0 and b < 0xF8
 72 |       b2 = string.byte(str, i + 1) or 0
 73 |       b3 = string.byte(str, i + 2) or 0
 74 |       b4 = string.byte(str, i + 3) or 0
 75 |       cp = ((b - 0xF0) * 0x40000) + ((b2 - 0x80) * 0x1000) + ((b3 - 0x80) * 0x40) + (b4 - 0x80)
 76 |       len = 4
 77 |     else
 78 |       -- Invalid UTF8, skip
 79 |       cp = b
 80 |       len = 1
 81 | 
 82 |     table.insert codepoints, cp
 83 |     i = i + len
 84 | 
 85 |   codepoints
 86 | 
 87 | -- Encode a domain label using Punycode
 88 | punycode_encode = (label) ->
 89 |   return label unless label and label != ""
 90 | 
 91 |   -- short circuit
 92 |   if label\match "^[%w%-]+$"
 93 |     return label
 94 | 
 95 |   -- Get codepoints
 96 |   codepoints = utf8_codepoints label
 97 |   input_length = #codepoints
 98 | 
 99 |   -- Check if all characters are basic (ASCII)
100 |   has_nonbasic = false
101 |   for cp in *codepoints
102 |     if not is_basic cp
103 |       has_nonbasic = true
104 |       break
105 | 
106 |   return label unless has_nonbasic
107 | 
108 |   -- Extract basic characters
109 |   output = {}
110 |   basic_length = 0
111 | 
112 |   for cp in *codepoints
113 |     if is_basic cp
114 |       table.insert output, string.char(cp)
115 |       basic_length = basic_length + 1
116 | 
117 |   -- Add delimiter if we had basic characters
118 |   handled = basic_length
119 |   if basic_length > 0
120 |     table.insert output, string.char(delimiter)
121 | 
122 |   -- Encode non-basic characters
123 |   n = initial_n
124 |   bias = initial_bias
125 |   delta = 0
126 | 
127 |   while handled < input_length
128 |     -- Find next unhandled codepoint
129 |     m = 0x10FFFF + 1
130 |     for cp in *codepoints
131 |       if cp >= n and cp < m
132 |         m = cp
133 | 
134 |     -- Increase delta
135 |     delta = delta + (m - n) * (handled + 1)
136 |     n = m
137 | 
138 |     -- Encode all codepoints up to m
139 |     for cp in *codepoints
140 |       if cp < n
141 |         delta = delta + 1
142 |       elseif cp == n
143 |         -- Encode delta
144 |         q = delta
145 |         k = base
146 | 
147 |         while true
148 |           t = threshold k, bias
149 |           if q < t
150 |             break
151 | 
152 |           table.insert output, encode_digit(t + ((q - t) % (base - t)))
153 |           q = math.floor (q - t) / (base - t)
154 |           k = k + base
155 | 
156 |         table.insert output, encode_digit(q)
157 |         bias = adapt delta, handled + 1, handled == basic_length
158 |         delta = 0
159 |         handled = handled + 1
160 | 
161 |     delta = delta + 1
162 |     n = n + 1
163 | 
164 |   "xn--" .. table.concat output
165 | 
166 | {
167 |   :punycode_encode
168 | }
169 | 


--------------------------------------------------------------------------------
/lapis/bayes/models/categories.lua:
--------------------------------------------------------------------------------
  1 | local db = require("lapis.db")
  2 | local Model, encode_tuples
  3 | do
  4 |   local _obj_0 = require("lapis.bayes.model")
  5 |   Model, encode_tuples = _obj_0.Model, _obj_0.encode_tuples
  6 | end
  7 | local Categories
  8 | do
  9 |   local _class_0
 10 |   local _parent_0 = Model
 11 |   local _base_0 = {
 12 |     delete = function(self)
 13 |       if _class_0.__parent.__base.delete(self) then
 14 |         local WordClassifications
 15 |         WordClassifications = require("lapis.bayes.models").WordClassifications
 16 |         return db.delete(WordClassifications:table_name(), {
 17 |           category_id = self.id
 18 |         })
 19 |       end
 20 |     end,
 21 |     increment = function(self, amount)
 22 |       amount = assert(tonumber(amount), "expecting number")
 23 |       return self:update({
 24 |         total_count = db.raw("total_count + " .. tostring(amount))
 25 |       })
 26 |     end,
 27 |     increment_text = function(self, text, opts)
 28 |       if opts == nil then
 29 |         opts = { }
 30 |       end
 31 |       return error("This method has been removed, use increment_words instead")
 32 |     end,
 33 |     increment_word = function(self, word, count)
 34 |       local WordClassifications
 35 |       WordClassifications = require("lapis.bayes.models").WordClassifications
 36 |       local w = WordClassifications:find_or_create({
 37 |         category_id = self.id,
 38 |         word = word
 39 |       })
 40 |       w:_increment(count)
 41 |       return self:increment(count)
 42 |     end,
 43 |     increment_words = function(self, counts)
 44 |       if not (counts) then
 45 |         return nil, "missing counts"
 46 |       end
 47 |       local merged_counts = { }
 48 |       for k, v in pairs(counts) do
 49 |         local word, count
 50 |         if type(k) == "string" then
 51 |           word, count = k, v
 52 |         else
 53 |           word, count = v, 1
 54 |         end
 55 |         local _update_0 = word
 56 |         merged_counts[_update_0] = merged_counts[_update_0] or 0
 57 |         local _update_1 = word
 58 |         merged_counts[_update_1] = merged_counts[_update_1] + count
 59 |       end
 60 |       local total_count = 0
 61 |       local tuples
 62 |       do
 63 |         local _accum_0 = { }
 64 |         local _len_0 = 1
 65 |         for word, count in pairs(merged_counts) do
 66 |           total_count = total_count + count
 67 |           local _value_0 = {
 68 |             self.id,
 69 |             word,
 70 |             count
 71 |           }
 72 |           _accum_0[_len_0] = _value_0
 73 |           _len_0 = _len_0 + 1
 74 |         end
 75 |         tuples = _accum_0
 76 |       end
 77 |       if not (next(tuples)) then
 78 |         return total_count
 79 |       end
 80 |       local WordClassifications
 81 |       WordClassifications = require("lapis.bayes.models").WordClassifications
 82 |       local tbl = db.escape_identifier(WordClassifications:table_name())
 83 |       db.query("\n    INSERT INTO " .. tostring(tbl) .. " (category_id, word, count) " .. tostring(encode_tuples(tuples)) .. "\n    ON CONFLICT (category_id, word) DO UPDATE SET count = " .. tostring(tbl) .. ".count + EXCLUDED.count\n    ")
 84 |       self:increment(total_count)
 85 |       return total_count
 86 |     end
 87 |   }
 88 |   _base_0.__index = _base_0
 89 |   setmetatable(_base_0, _parent_0.__base)
 90 |   _class_0 = setmetatable({
 91 |     __init = function(self, ...)
 92 |       return _class_0.__parent.__init(self, ...)
 93 |     end,
 94 |     __base = _base_0,
 95 |     __name = "Categories",
 96 |     __parent = _parent_0
 97 |   }, {
 98 |     __index = function(cls, name)
 99 |       local val = rawget(_base_0, name)
100 |       if val == nil then
101 |         local parent = rawget(cls, "__parent")
102 |         if parent then
103 |           return parent[name]
104 |         end
105 |       else
106 |         return val
107 |       end
108 |     end,
109 |     __call = function(cls, ...)
110 |       local _self_0 = setmetatable({}, _base_0)
111 |       cls.__init(_self_0, ...)
112 |       return _self_0
113 |     end
114 |   })
115 |   _base_0.__class = _class_0
116 |   local self = _class_0
117 |   self.timestamp = true
118 |   self.relations = {
119 |     {
120 |       "word_classifications",
121 |       has_many = "WordClassifications"
122 |     }
123 |   }
124 |   self.find_or_create = function(self, name)
125 |     return self:find({
126 |       name = name
127 |     }) or self:create({
128 |       name = name
129 |     })
130 |   end
131 |   if _parent_0.__inherited then
132 |     _parent_0.__inherited(_parent_0, _class_0)
133 |   end
134 |   Categories = _class_0
135 |   return _class_0
136 | end
137 | 


--------------------------------------------------------------------------------
/lapis/bayes/classifiers/bayes.moon:
--------------------------------------------------------------------------------
  1 | -- implements naive bayes with assumed probability
  2 | class BayesClassifier extends require "lapis.bayes.classifiers.base"
  3 |   @default_options: {
  4 |     max_words: 40
  5 |     default_prob: 0.1
  6 |     log: false
  7 |     token_weight_patterns: nil
  8 |     uncertainty_weight: 1.0
  9 |   }
 10 | 
 11 |   get_token_weight: (word) =>
 12 |     return 1.0 unless @opts.token_weight_patterns
 13 | 
 14 |     for pattern, weight in pairs @opts.token_weight_patterns
 15 |       if word\match pattern
 16 |         return weight
 17 | 
 18 |     1.0
 19 | 
 20 |   word_probabilities: (categories, available_words, opts={}) =>
 21 |     opts or= {}
 22 |     return nil, "only two categories supported at once" unless #categories == 2
 23 | 
 24 |     a, b = unpack categories
 25 | 
 26 |     sum_counts = 0
 27 |     for c in *categories
 28 |       sum_counts += c.total_count
 29 | 
 30 |     available_words = @candidate_words categories, available_words, @opts.max_words
 31 |     available_words_count = #available_words
 32 | 
 33 |     unclassified_counts = opts.unclassified_counts or @opts.unclassified_counts
 34 |     uncertainty_weight = if opts.uncertainty_weight != nil
 35 |       opts.uncertainty_weight
 36 |     else
 37 |       @opts.uncertainty_weight or 1.0
 38 |     uncertainty_weight = math.max uncertainty_weight, 0
 39 | 
 40 |     token_weights = {}
 41 |     for word in *available_words
 42 |       weight = @get_token_weight word
 43 | 
 44 |       if unclassified_counts
 45 |         unc = unclassified_counts[word]
 46 |         if unc and unc > 0
 47 |           classified_total = 0
 48 |           classified_total += (a.word_counts and a.word_counts[word]) or 0
 49 |           classified_total += (b.word_counts and b.word_counts[word]) or 0
 50 | 
 51 |           total = classified_total + unc
 52 |           if total > 0 and uncertainty_weight != 0
 53 |             confidence = classified_total / total
 54 |             weight *= confidence ^ uncertainty_weight
 55 | 
 56 |       token_weights[word] = weight
 57 | 
 58 |     default_prob = @opts.default_prob / sum_counts
 59 | 
 60 |     default_a = default_prob * a.total_count
 61 |     default_b = default_prob * b.total_count
 62 | 
 63 |     -- NOTE: you should use log mode if you have a large number of tokens
 64 |     -- because the numbers get really small
 65 |     prob = if @opts.log
 66 |       ai_log_sum = 0
 67 |       bi_log_sum = 0
 68 | 
 69 |       for word in *available_words
 70 |         ai_count = (a.word_counts and a.word_counts[word] or 0) + default_a
 71 |         bi_count = (b.word_counts and b.word_counts[word] or 0) + default_b
 72 | 
 73 |         weight = token_weights[word] or @get_token_weight word
 74 | 
 75 |         ai_log_sum += weight * math.log ai_count
 76 |         bi_log_sum += weight * math.log bi_count
 77 | 
 78 |       ai_log_sum += math.log a.total_count
 79 |       bi_log_sum += math.log b.total_count
 80 | 
 81 |       ai_log_sum -= math.log (default_a + a.total_count)
 82 |       bi_log_sum -= math.log (default_b + b.total_count)
 83 | 
 84 |       ai_log_sum -= math.log available_words_count
 85 |       bi_log_sum -= math.log available_words_count
 86 | 
 87 |       max_log_sum = math.max ai_log_sum, bi_log_sum
 88 | 
 89 |       ai_prob = math.exp(ai_log_sum - max_log_sum)
 90 |       bi_prob = math.exp(bi_log_sum - max_log_sum)
 91 | 
 92 |       ai_prob / (ai_prob + bi_prob)
 93 |     else
 94 |       local ai_mul, bi_mul
 95 | 
 96 |       for word in *available_words
 97 |         ai_count = (a.word_counts and a.word_counts[word] or 0) + default_a
 98 |         bi_count = (b.word_counts and b.word_counts[word] or 0) + default_b
 99 | 
100 |         weight = token_weights[word] or @get_token_weight word
101 | 
102 |         if ai_mul
103 |           ai_mul *= ai_count ^ weight
104 |         else
105 |           ai_mul = ai_count ^ weight
106 | 
107 |         if bi_mul
108 |           bi_mul *= bi_count ^ weight
109 |         else
110 |           bi_mul = bi_count ^ weight
111 | 
112 |       ai_prob = a.total_count * ai_mul / ((a.total_count + default_a) * available_words_count)
113 |       bi_prob = b.total_count * bi_mul / ((b.total_count + default_b) * available_words_count)
114 | 
115 |       ai_prob = 0 if ai_prob != ai_prob
116 |       bi_prob = 0 if bi_prob != bi_prob
117 | 
118 |       ai_prob / (ai_prob + bi_prob)
119 | 
120 |     if prob != prob
121 |       return nil, "Got nan when calculating prob"
122 | 
123 |     if prob == math.huge or prob == -math.huge
124 |       return nil, "Got inf when calculating prob"
125 | 
126 |     tuples = {
127 |       { a.name, prob }
128 |       { b.name, 1 - prob }
129 |     }
130 | 
131 |     table.sort tuples, (a, b) -> a[2] > b[2]
132 |     tuples
133 | 


--------------------------------------------------------------------------------
/lapis/bayes/text/punycode.lua:
--------------------------------------------------------------------------------
  1 | local base = 36
  2 | local tmin = 1
  3 | local tmax = 26
  4 | local skew = 38
  5 | local damp = 700
  6 | local initial_bias = 72
  7 | local initial_n = 128
  8 | local delimiter = 0x2D
  9 | local adapt
 10 | adapt = function(delta, numpoints, firsttime)
 11 |   if firsttime then
 12 |     delta = math.floor(delta / damp)
 13 |   else
 14 |     delta = math.floor(delta / 2)
 15 |   end
 16 |   delta = delta + math.floor(delta / numpoints)
 17 |   local k = 0
 18 |   while delta > math.floor((base - tmin) * tmax / 2) do
 19 |     delta = math.floor(delta / (base - tmin))
 20 |     k = k + base
 21 |   end
 22 |   return k + math.floor(((base - tmin + 1) * delta) / (delta + skew))
 23 | end
 24 | local encode_digit
 25 | encode_digit = function(d)
 26 |   if d < 26 then
 27 |     return string.char(d + 0x61)
 28 |   else
 29 |     return string.char(d - 26 + 0x30)
 30 |   end
 31 | end
 32 | local threshold
 33 | threshold = function(k, bias)
 34 |   if k <= bias + tmin then
 35 |     return tmin
 36 |   elseif k >= bias + tmax then
 37 |     return tmax
 38 |   else
 39 |     return k - bias
 40 |   end
 41 | end
 42 | local is_basic
 43 | is_basic = function(cp)
 44 |   return cp < 0x80
 45 | end
 46 | local utf8_codepoints
 47 | utf8_codepoints = function(str)
 48 |   local codepoints = { }
 49 |   local i = 1
 50 |   while i <= #str do
 51 |     local b = string.byte(str, i)
 52 |     local cp = nil
 53 |     local len = 1
 54 |     if b < 0x80 then
 55 |       cp = b
 56 |       len = 1
 57 |     elseif b >= 0xC0 and b < 0xE0 then
 58 |       local b2 = string.byte(str, i + 1) or 0
 59 |       cp = ((b - 0xC0) * 0x40) + (b2 - 0x80)
 60 |       len = 2
 61 |     elseif b >= 0xE0 and b < 0xF0 then
 62 |       local b2 = string.byte(str, i + 1) or 0
 63 |       local b3 = string.byte(str, i + 2) or 0
 64 |       cp = ((b - 0xE0) * 0x1000) + ((b2 - 0x80) * 0x40) + (b3 - 0x80)
 65 |       len = 3
 66 |     elseif b >= 0xF0 and b < 0xF8 then
 67 |       local b2 = string.byte(str, i + 1) or 0
 68 |       local b3 = string.byte(str, i + 2) or 0
 69 |       local b4 = string.byte(str, i + 3) or 0
 70 |       cp = ((b - 0xF0) * 0x40000) + ((b2 - 0x80) * 0x1000) + ((b3 - 0x80) * 0x40) + (b4 - 0x80)
 71 |       len = 4
 72 |     else
 73 |       cp = b
 74 |       len = 1
 75 |     end
 76 |     table.insert(codepoints, cp)
 77 |     i = i + len
 78 |   end
 79 |   return codepoints
 80 | end
 81 | local punycode_encode
 82 | punycode_encode = function(label)
 83 |   if not (label and label ~= "") then
 84 |     return label
 85 |   end
 86 |   if label:match("^[%w%-]+$") then
 87 |     return label
 88 |   end
 89 |   local codepoints = utf8_codepoints(label)
 90 |   local input_length = #codepoints
 91 |   local has_nonbasic = false
 92 |   for _index_0 = 1, #codepoints do
 93 |     local cp = codepoints[_index_0]
 94 |     if not is_basic(cp) then
 95 |       has_nonbasic = true
 96 |       break
 97 |     end
 98 |   end
 99 |   if not (has_nonbasic) then
100 |     return label
101 |   end
102 |   local output = { }
103 |   local basic_length = 0
104 |   for _index_0 = 1, #codepoints do
105 |     local cp = codepoints[_index_0]
106 |     if is_basic(cp) then
107 |       table.insert(output, string.char(cp))
108 |       basic_length = basic_length + 1
109 |     end
110 |   end
111 |   local handled = basic_length
112 |   if basic_length > 0 then
113 |     table.insert(output, string.char(delimiter))
114 |   end
115 |   local n = initial_n
116 |   local bias = initial_bias
117 |   local delta = 0
118 |   while handled < input_length do
119 |     local m = 0x10FFFF + 1
120 |     for _index_0 = 1, #codepoints do
121 |       local cp = codepoints[_index_0]
122 |       if cp >= n and cp < m then
123 |         m = cp
124 |       end
125 |     end
126 |     delta = delta + (m - n) * (handled + 1)
127 |     n = m
128 |     for _index_0 = 1, #codepoints do
129 |       local cp = codepoints[_index_0]
130 |       if cp < n then
131 |         delta = delta + 1
132 |       elseif cp == n then
133 |         local q = delta
134 |         local k = base
135 |         while true do
136 |           local t = threshold(k, bias)
137 |           if q < t then
138 |             break
139 |           end
140 |           table.insert(output, encode_digit(t + ((q - t) % (base - t))))
141 |           q = math.floor((q - t) / (base - t))
142 |           k = k + base
143 |         end
144 |         table.insert(output, encode_digit(q))
145 |         bias = adapt(delta, handled + 1, handled == basic_length)
146 |         delta = 0
147 |         handled = handled + 1
148 |       end
149 |     end
150 |     delta = delta + 1
151 |     n = n + 1
152 |   end
153 |   return "xn--" .. table.concat(output)
154 | end
155 | return {
156 |   punycode_encode = punycode_encode
157 | }
158 | 


--------------------------------------------------------------------------------
/lapis/bayes/tokenizers/ngram.lua:
--------------------------------------------------------------------------------
  1 | local NgramTokenizer
  2 | do
  3 |   local _class_0
  4 |   local _parent_0 = require("lapis.bayes.tokenizers.base")
  5 |   local _base_0 = {
  6 |     build_grammar = function(self)
  7 |       local C, Ct
  8 |       do
  9 |         local _obj_0 = require("lpeg")
 10 |         C, Ct = _obj_0.C, _obj_0.Ct
 11 |       end
 12 |       local utf8 = require("lapis.util.utf8")
 13 |       local whitespace = utf8.whitespace
 14 |       local printable = utf8.printable_character
 15 |       local word_chars = printable - whitespace
 16 |       local word = C(word_chars ^ 1)
 17 |       return Ct((word + whitespace ^ 1) ^ 0)
 18 |     end,
 19 |     normalize_word = function(self, word)
 20 |       if not (word and word ~= "") then
 21 |         return 
 22 |       end
 23 |       local normalized = tostring(word):lower()
 24 |       normalized = normalized:gsub("[%p]", "")
 25 |       normalized = normalized:gsub("%s+", "")
 26 |       if not (normalized ~= "") then
 27 |         return 
 28 |       end
 29 |       return normalized
 30 |     end,
 31 |     ngram_size = function(self)
 32 |       local n = tonumber(self.opts.n) or 2
 33 |       n = math.floor(n)
 34 |       if n < 1 then
 35 |         n = 1
 36 |       end
 37 |       return n
 38 |     end,
 39 |     word_ngrams = function(self, word, n)
 40 |       local C, Ct
 41 |       do
 42 |         local _obj_0 = require("lpeg")
 43 |         C, Ct = _obj_0.C, _obj_0.Ct
 44 |       end
 45 |       local utf8 = require("lapis.util.utf8")
 46 |       local printable = utf8.printable_character
 47 |       local char_pattern = Ct((C(printable)) ^ 0)
 48 |       local chars = char_pattern:match(word)
 49 |       if not (chars) then
 50 |         return {
 51 |           word
 52 |         }
 53 |       end
 54 |       local len = #chars
 55 |       if len == 0 then
 56 |         return {
 57 |           word
 58 |         }
 59 |       end
 60 |       if len < n then
 61 |         return {
 62 |           word
 63 |         }
 64 |       end
 65 |       local out = { }
 66 |       for i = 1, len - n + 1 do
 67 |         local ngram = table.concat(chars, "", i, i + n - 1)
 68 |         table.insert(out, ngram)
 69 |       end
 70 |       return out
 71 |     end,
 72 |     tokenize_text = function(self, text)
 73 |       if not (text and text ~= "") then
 74 |         return { }
 75 |       end
 76 |       do
 77 |         local pre_filter = self.opts.filter_text
 78 |         if pre_filter then
 79 |           text = pre_filter(text)
 80 |           if not (text and text ~= "") then
 81 |             return { }
 82 |           end
 83 |         end
 84 |       end
 85 |       self.grammar = self.grammar or self:build_grammar()
 86 |       local words = self.grammar:match(text)
 87 |       if not (words) then
 88 |         return { }
 89 |       end
 90 |       local n = self:ngram_size()
 91 |       local ignore_numbers = self.opts.ignore_numbers
 92 |       if ignore_numbers == nil then
 93 |         ignore_numbers = true
 94 |       end
 95 |       local tokens = { }
 96 |       for _index_0 = 1, #words do
 97 |         local _continue_0 = false
 98 |         repeat
 99 |           local raw_word = words[_index_0]
100 |           local cleaned = self:normalize_word(raw_word)
101 |           if not (cleaned) then
102 |             _continue_0 = true
103 |             break
104 |           end
105 |           if ignore_numbers and cleaned:match("^%d+$") then
106 |             _continue_0 = true
107 |             break
108 |           end
109 |           local _list_0 = self:word_ngrams(cleaned, n)
110 |           for _index_1 = 1, #_list_0 do
111 |             local token = _list_0[_index_1]
112 |             table.insert(tokens, token)
113 |           end
114 |           _continue_0 = true
115 |         until true
116 |         if not _continue_0 then
117 |           break
118 |         end
119 |       end
120 |       if self.opts.filter_tokens then
121 |         tokens = self.opts.filter_tokens(tokens, self.opts)
122 |       end
123 |       return tokens
124 |     end
125 |   }
126 |   _base_0.__index = _base_0
127 |   setmetatable(_base_0, _parent_0.__base)
128 |   _class_0 = setmetatable({
129 |     __init = function(self, opts)
130 |       if opts == nil then
131 |         opts = { }
132 |       end
133 |       self.opts = opts
134 |     end,
135 |     __base = _base_0,
136 |     __name = "NgramTokenizer",
137 |     __parent = _parent_0
138 |   }, {
139 |     __index = function(cls, name)
140 |       local val = rawget(_base_0, name)
141 |       if val == nil then
142 |         local parent = rawget(cls, "__parent")
143 |         if parent then
144 |           return parent[name]
145 |         end
146 |       else
147 |         return val
148 |       end
149 |     end,
150 |     __call = function(cls, ...)
151 |       local _self_0 = setmetatable({}, _base_0)
152 |       cls.__init(_self_0, ...)
153 |       return _self_0
154 |     end
155 |   })
156 |   _base_0.__class = _class_0
157 |   if _parent_0.__inherited then
158 |     _parent_0.__inherited(_parent_0, _class_0)
159 |   end
160 |   NgramTokenizer = _class_0
161 |   return _class_0
162 | end
163 | 


--------------------------------------------------------------------------------
/lapis/bayes/tokenizers/url_domains.lua:
--------------------------------------------------------------------------------
  1 | local trim
  2 | trim = require("lapis.util").trim
  3 | local UrlDomainsTokenizer
  4 | do
  5 |   local _class_0
  6 |   local _parent_0 = require("lapis.bayes.tokenizers.base")
  7 |   local _base_0 = {
  8 |     ignore_domain = function(self, domain)
  9 |       if not (self.opts and self.opts.ignore_domains) then
 10 |         return 
 11 |       end
 12 |       if self.opts.ignore_domains[domain] then
 13 |         return true
 14 |       end
 15 |       while true do
 16 |         local sub = domain:gsub("^%**%.?[^%.]+", "*")
 17 |         if sub == domain then
 18 |           return false
 19 |         end
 20 |         if self.opts.ignore_domains[sub] then
 21 |           return true
 22 |         end
 23 |         domain = sub
 24 |       end
 25 |     end,
 26 |     filter_tokens = function(self, urls)
 27 |       return (function()
 28 |         local _accum_0 = { }
 29 |         local _len_0 = 1
 30 |         for _index_0 = 1, #urls do
 31 |           local _continue_0 = false
 32 |           repeat
 33 |             local url = urls[_index_0]
 34 |             url = url:lower()
 35 |             url = trim(url)
 36 |             url = url:gsub("^%w+://", "")
 37 |             url = url:gsub("^www%.", "")
 38 |             url = url:gsub("/.*$", "")
 39 |             url = trim(url)
 40 |             url:gsub("<$", "")
 41 |             url:gsub("^>", "")
 42 |             if url == "" then
 43 |               _continue_0 = true
 44 |               break
 45 |             end
 46 |             if url:match("^%w+:") then
 47 |               _continue_0 = true
 48 |               break
 49 |             end
 50 |             if url:match([=[[<>="' ]]=]) then
 51 |               _continue_0 = true
 52 |               break
 53 |             end
 54 |             if not (url:match("%.")) then
 55 |               _continue_0 = true
 56 |               break
 57 |             end
 58 |             if self:ignore_domain(url) then
 59 |               _continue_0 = true
 60 |               break
 61 |             end
 62 |             local _value_0 = url
 63 |             _accum_0[_len_0] = _value_0
 64 |             _len_0 = _len_0 + 1
 65 |             _continue_0 = true
 66 |           until true
 67 |           if not _continue_0 then
 68 |             break
 69 |           end
 70 |         end
 71 |         return _accum_0
 72 |       end)()
 73 |     end,
 74 |     build_grammar = function(self)
 75 |       local P, S, R, C, Ct, Cs
 76 |       do
 77 |         local _obj_0 = require("lpeg")
 78 |         P, S, R, C, Ct, Cs = _obj_0.P, _obj_0.S, _obj_0.R, _obj_0.C, _obj_0.Ct, _obj_0.Cs
 79 |       end
 80 |       local case_insensitive
 81 |       case_insensitive = function(text)
 82 |         local out = nil
 83 |         for char in text:gmatch(".") do
 84 |           local p = S(tostring(char:lower()) .. tostring(char:upper()))
 85 |           if out then
 86 |             out = out * p
 87 |           else
 88 |             out = p
 89 |           end
 90 |         end
 91 |         return out
 92 |       end
 93 |       local unescape_char = P("&gt;") / ">" + P("&lt;") / "<" + P("&amp;") / "&" + P("&nbsp;") / " " + P("&#x27;") / "'" + P("&#x2F;") / "/" + P("&quot;") / '"'
 94 |       local unescape_text = Cs((unescape_char + 1) ^ 1)
 95 |       local some_space = S(" \t\n")
 96 |       local space = some_space ^ 0
 97 |       local alphanum = R("az", "AZ", "09")
 98 |       local scheme = case_insensitive("http") * case_insensitive("s") ^ -1 * P("://")
 99 |       local raw_url = C(scheme * (P(1) - S(" \t\n")) ^ 1)
100 |       local word = (alphanum + S("._-")) ^ 1
101 |       local attr_value = C(word) + P('"') * C((1 - P('"')) ^ 0) * P('"') + P("'") * C((1 - P("'")) ^ 0) * P("'")
102 |       local href = (case_insensitive("href") + case_insensitive("src")) * space * P("=") * space * attr_value / function(v)
103 |         return unescape_text:match(v) or ""
104 |       end
105 |       local simple = C(case_insensitive("www") * (P(".") * (1 - (S("./") + some_space)) ^ 1) ^ 1)
106 |       return Ct((raw_url + href + simple + 1) ^ 0)
107 |     end,
108 |     tokenize_text = function(self, text)
109 |       self.grammar = self.grammar or self:build_grammar()
110 |       local matches = self.grammar:match(text)
111 |       if not (matches) then
112 |         return nil, "failed to parse text"
113 |       end
114 |       return self:filter_tokens(matches)
115 |     end
116 |   }
117 |   _base_0.__index = _base_0
118 |   setmetatable(_base_0, _parent_0.__base)
119 |   _class_0 = setmetatable({
120 |     __init = function(self, opts)
121 |       if opts == nil then
122 |         opts = { }
123 |       end
124 |       self.opts = opts
125 |     end,
126 |     __base = _base_0,
127 |     __name = "UrlDomainsTokenizer",
128 |     __parent = _parent_0
129 |   }, {
130 |     __index = function(cls, name)
131 |       local val = rawget(_base_0, name)
132 |       if val == nil then
133 |         local parent = rawget(cls, "__parent")
134 |         if parent then
135 |           return parent[name]
136 |         end
137 |       else
138 |         return val
139 |       end
140 |     end,
141 |     __call = function(cls, ...)
142 |       local _self_0 = setmetatable({}, _base_0)
143 |       cls.__init(_self_0, ...)
144 |       return _self_0
145 |     end
146 |   })
147 |   _base_0.__class = _class_0
148 |   if _parent_0.__inherited then
149 |     _parent_0.__inherited(_parent_0, _class_0)
150 |   end
151 |   UrlDomainsTokenizer = _class_0
152 |   return _class_0
153 | end
154 | 


--------------------------------------------------------------------------------
/lapis/bayes/classifiers/base.moon:
--------------------------------------------------------------------------------
  1 | import uniquify from require "lapis.util"
  2 | 
  3 | class BaseClassifier
  4 |   default_tokenizer: "lapis.bayes.tokenizers.postgres_text"
  5 | 
  6 |   new: (@opts={}) =>
  7 |     if @@default_options
  8 |       @opts = setmetatable {k,v for k,v in pairs @opts}, __index: @@default_options
  9 | 
 10 |   word_probabilities: (categories, words) =>
 11 |     error "word_probabilities: subclass must implement"
 12 | 
 13 |   classify_text: (...) =>
 14 |     counts, word_rate_or_err = @text_probabilities ...
 15 |     unless counts
 16 |       return nil, word_rate_or_err
 17 | 
 18 |     counts[1][1], counts[1][2], word_rate_or_err
 19 | 
 20 |   tokenize_text: (text) =>
 21 |     assert text, "missing text to tokenize"
 22 | 
 23 |     -- text is some object that is already tokenized
 24 |     unless type(text) == "string"
 25 |       return text
 26 | 
 27 |     -- custom tokenizer function passed
 28 |     if @opts.tokenize_text
 29 |       return @opts.tokenize_text text, @opts
 30 | 
 31 |     -- tokenizer instance passed
 32 |     tokenizer = if @opts.tokenizer
 33 |       @opts.tokenizer
 34 |     else
 35 |       Tokenizer = require @default_tokenizer
 36 |       Tokenizer(@opts)
 37 | 
 38 |     tokenizer\tokenize_text text
 39 | 
 40 |   train_text: (category, text, opts) =>
 41 |     tokens = @tokenize_text text
 42 | 
 43 |     if opts and opts.filter_tokens
 44 |       tokens = opts.filter_tokens opts, text
 45 | 
 46 |     import Categories from require "lapis.bayes.models"
 47 |     category = Categories\find_or_create category
 48 |     category\increment_words tokens
 49 | 
 50 |   -- categories: a lua array of categories names
 51 |   -- text: string of text to classify, or an array of tokens to classify
 52 |   text_probabilities: (category_names, text, opts) =>
 53 |     opts or= {}
 54 | 
 55 |     categories, err = @find_categories category_names
 56 | 
 57 |     unless categories
 58 |       return nil, err
 59 | 
 60 |     words = @tokenize_text text
 61 | 
 62 |     unless words and next words
 63 |       return nil, "failed to generate tokens for text"
 64 | 
 65 |     available_words, err = @count_words categories, words
 66 | 
 67 |     unless available_words
 68 |       return nil, err
 69 | 
 70 |     available_words_set = {word, true for word in *available_words}
 71 |     count = 0
 72 |     for word in *words
 73 |       count +=1 if available_words_set[word]
 74 | 
 75 |     token_ratio = count / #words
 76 | 
 77 |     probs, err = @word_probabilities categories, available_words, opts
 78 |     unless probs
 79 |       return nil, err
 80 | 
 81 |     -- put probs in hash table part of result
 82 |     for {c, p} in *probs
 83 |       probs[c] = p
 84 | 
 85 |     probs, token_ratio
 86 | 
 87 |   -- query the category objects by category name
 88 |   -- returns an array of category records in the same order as the input
 89 |   find_categories: (category_names) =>
 90 |     import Categories from require "lapis.bayes.models"
 91 |     db = Categories.db
 92 | 
 93 |     categories = Categories\select "where name in ?", db.list category_names
 94 |     by_name = {c.name, c for c in *categories}
 95 | 
 96 |     local missing
 97 | 
 98 |     result = for name in *category_names
 99 |       c = by_name[name]
100 | 
101 |       unless c
102 |         missing or= {}
103 |         table.insert missing, name
104 |         continue
105 | 
106 |       c
107 | 
108 |     if missing and next missing
109 |       return nil, "find_categories: missing categories (#{table.concat missing, ", "})"
110 | 
111 |     result
112 | 
113 |   -- query for WordClassifications for the requested category ids
114 |   -- both arguments are arrays
115 |   -- returns WordClassifications in no particular order
116 |   find_word_classifications: (words, category_ids) =>
117 |     return {} unless next(words) and next category_ids
118 | 
119 |     import WordClassifications from require "lapis.bayes.models"
120 |     db = WordClassifications.db
121 |     WordClassifications\select "where word in ? and category_id in ?", db.list(words), db.list(category_ids)
122 | 
123 |   -- reduce the set of available words by looking for polarizing words
124 |   -- categories: array of category objects
125 |   -- available_words: array of available words
126 |   -- count: the max length of returned words array
127 |   candidate_words: (categories, available_words, count) =>
128 |     return available_words if #available_words <= count
129 | 
130 |     assert #categories == 2, "can only do two categories"
131 | 
132 |     a,b = unpack categories
133 |     -- calculate conflict words
134 |     tuples = for word in *available_words
135 |       a_count = a.word_counts and a.word_counts[word] or 0
136 |       b_count = b.word_counts and b.word_counts[word] or 0
137 | 
138 |       {
139 |         word
140 |         math.random! / 100 + math.abs (a_count - b_count) / math.sqrt a_count + b_count
141 |         a_count
142 |         b_count
143 |       }
144 | 
145 |     table.sort tuples, (a,b) ->
146 |       a[2] > b[2]
147 | 
148 |     [t[1] for t in *tuples[,count]]
149 | 
150 |   -- load the categories with the counts from the words text, return the list
151 |   -- of words that appear in at least one category
152 |   --
153 |   -- categories: array of categories
154 |   -- words: array of tokens
155 |   count_words: (categories, words) =>
156 |     categories_by_id = {c.id, c for c in *categories}
157 |     words = uniquify words
158 | 
159 |     wcs = @find_word_classifications words, [c.id for c in *categories]
160 | 
161 |     available_words = [word for word in pairs {wc.word, true for wc in *wcs}]
162 | 
163 |     if #available_words == 0
164 |       return nil, "no words in text are classifyable"
165 | 
166 |     for wc in *wcs
167 |       category = categories_by_id[wc.category_id]
168 |       category.word_counts or= {}
169 |       category.word_counts[wc.word] = wc.count
170 | 
171 |     available_words
172 | 


--------------------------------------------------------------------------------
/lapis/bayes/classifiers/bayes_multi.lua:
--------------------------------------------------------------------------------
  1 | local BayesMultiClassifier
  2 | do
  3 |   local _class_0
  4 |   local _parent_0 = require("lapis.bayes.classifiers.base")
  5 |   local _base_0 = {
  6 |     candidate_words = function(self, categories, available_words, count)
  7 |       if not (count and count < #available_words) then
  8 |         return available_words
  9 |       end
 10 |       local tuples
 11 |       do
 12 |         local _accum_0 = { }
 13 |         local _len_0 = 1
 14 |         for _index_0 = 1, #available_words do
 15 |           local word = available_words[_index_0]
 16 |           local totals = 0
 17 |           local counts = { }
 18 |           for _index_1 = 1, #categories do
 19 |             local category = categories[_index_1]
 20 |             local word_counts = category.word_counts
 21 |             local c = word_counts and word_counts[word] or 0
 22 |             table.insert(counts, c)
 23 |             totals = totals + c
 24 |           end
 25 |           local score
 26 |           if totals == 0 then
 27 |             score = 0
 28 |           else
 29 |             local mean = totals / #counts
 30 |             local variance = 0
 31 |             for _index_1 = 1, #counts do
 32 |               local c = counts[_index_1]
 33 |               variance = variance + ((c - mean) ^ 2)
 34 |             end
 35 |             score = variance / #counts
 36 |           end
 37 |           score = score + (math.random() / 1000)
 38 |           local _value_0 = {
 39 |             word,
 40 |             score
 41 |           }
 42 |           _accum_0[_len_0] = _value_0
 43 |           _len_0 = _len_0 + 1
 44 |         end
 45 |         tuples = _accum_0
 46 |       end
 47 |       table.sort(tuples, function(a, b)
 48 |         return a[2] > b[2]
 49 |       end)
 50 |       local _accum_0 = { }
 51 |       local _len_0 = 1
 52 |       local _max_0 = count
 53 |       for _index_0 = 1, _max_0 < 0 and #tuples + _max_0 or _max_0 do
 54 |         local t = tuples[_index_0]
 55 |         _accum_0[_len_0] = t[1]
 56 |         _len_0 = _len_0 + 1
 57 |       end
 58 |       return _accum_0
 59 |     end,
 60 |     word_probabilities = function(self, categories, available_words)
 61 |       if not (#categories >= 2) then
 62 |         return nil, "at least two categories required"
 63 |       end
 64 |       available_words = self:candidate_words(categories, available_words, self.opts.max_words)
 65 |       local vocab_size = #available_words
 66 |       if not (vocab_size > 0) then
 67 |         return nil, "no words to score"
 68 |       end
 69 |       local smoothing
 70 |       if self.opts.default_prob and self.opts.default_prob > 0 then
 71 |         smoothing = self.opts.default_prob
 72 |       else
 73 |         smoothing = 1e-6
 74 |       end
 75 |       local sum_counts = 0
 76 |       for _index_0 = 1, #categories do
 77 |         local category = categories[_index_0]
 78 |         sum_counts = sum_counts + (category.total_count or 0)
 79 |       end
 80 |       local prior_smoothing = smoothing * #categories
 81 |       local max_log
 82 |       local log_scores
 83 |       do
 84 |         local _accum_0 = { }
 85 |         local _len_0 = 1
 86 |         for _index_0 = 1, #categories do
 87 |           local category = categories[_index_0]
 88 |           local cat_total = math.max((category.total_count or 0), 0)
 89 |           local prior = (cat_total + smoothing) / (sum_counts + prior_smoothing)
 90 |           local log_score = math.log(prior)
 91 |           local denominator = cat_total + (smoothing * vocab_size)
 92 |           if denominator <= 0 then
 93 |             denominator = smoothing * vocab_size
 94 |           end
 95 |           for _index_1 = 1, #available_words do
 96 |             local word = available_words[_index_1]
 97 |             local word_count = category.word_counts and category.word_counts[word] or 0
 98 |             log_score = log_score + math.log(((word_count + smoothing) / denominator))
 99 |           end
100 |           if max_log then
101 |             max_log = math.max(max_log, log_score)
102 |           else
103 |             max_log = log_score
104 |           end
105 |           local _value_0 = {
106 |             category,
107 |             log_score
108 |           }
109 |           _accum_0[_len_0] = _value_0
110 |           _len_0 = _len_0 + 1
111 |         end
112 |         log_scores = _accum_0
113 |       end
114 |       local weights = { }
115 |       local total_weight = 0
116 |       for _index_0 = 1, #log_scores do
117 |         local _des_0 = log_scores[_index_0]
118 |         local category, log_score
119 |         category, log_score = _des_0[1], _des_0[2]
120 |         local weight = math.exp((log_score - max_log))
121 |         total_weight = total_weight + weight
122 |         table.insert(weights, {
123 |           category.name,
124 |           weight
125 |         })
126 |       end
127 |       if not (total_weight > 0) then
128 |         return nil, "unable to normalise probabilities"
129 |       end
130 |       for _index_0 = 1, #weights do
131 |         local tuple = weights[_index_0]
132 |         local _update_0 = 2
133 |         tuple[_update_0] = tuple[_update_0] / total_weight
134 |       end
135 |       table.sort(weights, function(a, b)
136 |         return a[2] > b[2]
137 |       end)
138 |       return weights
139 |     end
140 |   }
141 |   _base_0.__index = _base_0
142 |   setmetatable(_base_0, _parent_0.__base)
143 |   _class_0 = setmetatable({
144 |     __init = function(self, ...)
145 |       return _class_0.__parent.__init(self, ...)
146 |     end,
147 |     __base = _base_0,
148 |     __name = "BayesMultiClassifier",
149 |     __parent = _parent_0
150 |   }, {
151 |     __index = function(cls, name)
152 |       local val = rawget(_base_0, name)
153 |       if val == nil then
154 |         local parent = rawget(cls, "__parent")
155 |         if parent then
156 |           return parent[name]
157 |         end
158 |       else
159 |         return val
160 |       end
161 |     end,
162 |     __call = function(cls, ...)
163 |       local _self_0 = setmetatable({}, _base_0)
164 |       cls.__init(_self_0, ...)
165 |       return _self_0
166 |     end
167 |   })
168 |   _base_0.__class = _class_0
169 |   local self = _class_0
170 |   self.default_options = {
171 |     max_words = 40,
172 |     default_prob = 0.1
173 |   }
174 |   if _parent_0.__inherited then
175 |     _parent_0.__inherited(_parent_0, _class_0)
176 |   end
177 |   BayesMultiClassifier = _class_0
178 |   return _class_0
179 | end
180 | 


--------------------------------------------------------------------------------
/spec/punycode_spec.moon:
--------------------------------------------------------------------------------
 1 | punycode = require "lapis.bayes.text.punycode"
 2 | 
 3 | describe "lapis.bayes.text.punycode", ->
 4 |   describe "punycode_encode", ->
 5 |     fixtures = {
 6 |       { description: "German umlaut: münchen", label: "münchen", expected: "xn--mnchen-3ya" }
 7 |       { description: "German umlaut: müller", label: "müller", expected: "xn--mller-kva" }
 8 |       { description: "German umlaut: bücher", label: "bücher", expected: "xn--bcher-kva" }
 9 |       { description: "French accent: français", label: "français", expected: "xn--franais-xxa" }
10 |       { description: "French accent: café", label: "café", expected: "xn--caf-dma" }
11 |       { description: "Spanish tilde: español", label: "español", expected: "xn--espaol-zwa" }
12 |       { description: "Spanish tilde: mañana", label: "mañana", expected: "xn--maana-pta" }
13 |       { description: "Japanese kanji: 日本", label: "日本", expected: "xn--wgv71a" }
14 |       { description: "Japanese hiragana: こんにちは", label: "こんにちは", expected: "xn--28j2a3ar1p" }
15 |       { description: "Japanese katakana: テスト", label: "テスト", expected: "xn--zckzah" }
16 |       { description: "Chinese simplified: 中国", label: "中国", expected: "xn--fiqs8s" }
17 |       { description: "Chinese traditional: 中國", label: "中國", expected: "xn--fiqz9s" }
18 |       { description: "Korean hangul: 한국", label: "한국", expected: "xn--3e0b707e" }
19 |       { description: "Arabic: العربية", label: "العربية", expected: "xn--mgbcd4a2b0d2b" }
20 |       { description: "Russian cyrillic: россия", label: "россия", expected: "xn--h1alffa9f" }
21 |       { description: "Greek: ελληνικά", label: "ελληνικά", expected: "xn--hxargifdar" }
22 |       { description: "Hebrew: עברית", label: "עברית", expected: "xn--5dbqzzl" }
23 |       { description: "Thai: ไทย", label: "ไทย", expected: "xn--o3cw4h" }
24 |       { description: "Mixed ASCII & Unicode: bücher-buch", label: "bücher-buch", expected: "xn--bcher-buch-9db" }
25 |       { description: "Mixed ASCII & Unicode: hello世界", label: "hello世界", expected: "xn--hello-ck1hg65u" }
26 |       { description: "Single Unicode codepoint: ü", label: "ü", expected: "xn--tda" }
27 |       { description: "Single Unicode codepoint: ñ", label: "ñ", expected: "xn--ida" }
28 |       { description: "Numeric suffix: 123ü", label: "123ü", expected: "xn--123-joa" }
29 |       { description: "Leading hyphen: -ü", label: "-ü", expected: "xn----eha" }
30 |       { description: "Swiss city: zürich", label: "zürich", expected: "xn--zrich-kva" }
31 |       { description: "Russian city: москва", label: "москва", expected: "xn--80adxhks" }
32 |       { description: "Arabic city: القاهرة", label: "القاهرة", expected: "xn--mgbag5a2flx" }
33 |       { description: "Hyphen only label", label: "---", expected: "---" }
34 |       { description: "German compound: bücher-bücherei", label: "bücher-bücherei", expected: "xn--bcher-bcherei-wobg" }
35 |       { description: "Czech example", label: "Pročprostěnemluvíčesky", expected: "xn--Proprostnemluvesky-uyb24dma41a" }
36 |       { description: "Chinese (simplified) example", label: "他们为什么不说中文", expected: "xn--ihqwcrb4cv8a8dqg056pqjye" }
37 |       { description: "Chinese (traditional) example", label: "他們爲什麽不說中文", expected: "xn--ihqwctvzc91f659drss3x8bo0yb" }
38 |       { description: "Arabic example", label: "ليهمابتكلموشعربي؟", expected: "xn--egbpdaj6bu4bxfgehfvwxn" }
39 |       { description: "Hebrew example", label: "למההםפשוטלאמדבריםעברית", expected: "xn--4dbcagdahymbxekheh6e0a7fei0b" }
40 |       { description: "Hindi example", label: "यहलोगहिन्दीक्योंनहींबोलसकतेहैं", expected: "xn--i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd" }
41 |       { description: "Japanese sentence", label: "なぜみんな日本語を話してくれないのか", expected: "xn--n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa" }
42 |       { description: "Korean example", label: "세계의모든사람들이한국어를이해한다면얼마나좋을까", expected: "xn--989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5jpsd879ccm6fea98c" }
43 |       { description: "Russian example", label: "почемужеонинеговорятпорусски", expected: "xn--b1abfaaepdrnnbgefbadotcwatmq2g4l" }
44 |       { description: "Spanish sentence", label: "PorquénopuedensimplementehablarenEspañol", expected: "xn--PorqunopuedensimplementehablarenEspaol-fmd56a" }
45 |       { description: "Vietnamese example", label: "TạisaohọkhôngthểchỉnóitiếngViệt", expected: "xn--TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g" }
46 |       { description: "Mixed example: 3年B組金八先生", label: "3年B組金八先生", expected: "xn--3B-ww4c5e180e575a65lsy2b" }
47 |       { description: "Mixed example: 安室奈美恵-with-SUPER-MONKEYS", label: "安室奈美恵-with-SUPER-MONKEYS", expected: "xn---with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n" }
48 |       { description: "Mixed example: Hello-Another-Way-それぞれの場所", label: "Hello-Another-Way-それぞれの場所", expected: "xn--Hello-Another-Way--fc4qua05auwb3674vfr0b" }
49 |       { description: "Mixed example: ひとつ屋根の下2", label: "ひとつ屋根の下2", expected: "xn--2-u9tlzr9756bt3uc0v" }
50 |       { description: "Mixed example: MajiでKoiする5秒前", label: "MajiでKoiする5秒前", expected: "xn--MajiKoi5-783gue6qz075azm5e" }
51 |       { description: "Mixed example: パフィーdeルンバ", label: "パフィーdeルンバ", expected: "xn--de-jg4avhby1noc0d" }
52 |       { description: "Mixed example: そのスピードで", label: "そのスピードで", expected: "xn--d9juau41awczczp" }
53 |     }
54 | 
55 |     it "passes through ASCII-only strings unchanged", ->
56 |       assert.same "example", punycode.punycode_encode "example"
57 |       assert.same "test", punycode.punycode_encode "test"
58 |       assert.same "hello-world", punycode.punycode_encode "hello-world"
59 |       assert.same "abc123", punycode.punycode_encode "abc123"
60 | 
61 |     it "handles empty string", ->
62 |       assert.same "", punycode.punycode_encode ""
63 | 
64 |     describe "fixture encodings", ->
65 |       for case in *fixtures
66 |         it "encodes #{case.description}", ->
67 |           assert.same case.expected, punycode.punycode_encode case.label
68 | 
69 |     describe "ASCII boundary behaviour", ->
70 |       it "preserves leading ASCII characters", ->
71 |         result = punycode.punycode_encode "test日本"
72 |         assert.true (result\match "^xn%-%-test") != nil
73 | 
74 |       it "handles trailing hyphen with Unicode", ->
75 |         result = punycode.punycode_encode "test-ü"
76 |         assert.true (result\match "^xn%-%-") != nil
77 | 
78 |       it "preserves case for ASCII characters", ->
79 |         result = punycode.punycode_encode "Test日本"
80 |         assert.true (result\match "Test") != nil
81 | 
82 |     it "handles emoji", ->
83 |       result = punycode.punycode_encode "💩"
84 |       assert.is_string result
85 | 


--------------------------------------------------------------------------------
/lapis/bayes/classifiers/bayes.lua:
--------------------------------------------------------------------------------
  1 | local BayesClassifier
  2 | do
  3 |   local _class_0
  4 |   local _parent_0 = require("lapis.bayes.classifiers.base")
  5 |   local _base_0 = {
  6 |     get_token_weight = function(self, word)
  7 |       if not (self.opts.token_weight_patterns) then
  8 |         return 1.0
  9 |       end
 10 |       for pattern, weight in pairs(self.opts.token_weight_patterns) do
 11 |         if word:match(pattern) then
 12 |           return weight
 13 |         end
 14 |       end
 15 |       return 1.0
 16 |     end,
 17 |     word_probabilities = function(self, categories, available_words, opts)
 18 |       if opts == nil then
 19 |         opts = { }
 20 |       end
 21 |       opts = opts or { }
 22 |       if not (#categories == 2) then
 23 |         return nil, "only two categories supported at once"
 24 |       end
 25 |       local a, b = unpack(categories)
 26 |       local sum_counts = 0
 27 |       for _index_0 = 1, #categories do
 28 |         local c = categories[_index_0]
 29 |         sum_counts = sum_counts + c.total_count
 30 |       end
 31 |       available_words = self:candidate_words(categories, available_words, self.opts.max_words)
 32 |       local available_words_count = #available_words
 33 |       local unclassified_counts = opts.unclassified_counts or self.opts.unclassified_counts
 34 |       local uncertainty_weight
 35 |       if opts.uncertainty_weight ~= nil then
 36 |         uncertainty_weight = opts.uncertainty_weight
 37 |       else
 38 |         uncertainty_weight = self.opts.uncertainty_weight or 1.0
 39 |       end
 40 |       uncertainty_weight = math.max(uncertainty_weight, 0)
 41 |       local token_weights = { }
 42 |       for _index_0 = 1, #available_words do
 43 |         local word = available_words[_index_0]
 44 |         local weight = self:get_token_weight(word)
 45 |         if unclassified_counts then
 46 |           local unc = unclassified_counts[word]
 47 |           if unc and unc > 0 then
 48 |             local classified_total = 0
 49 |             classified_total = classified_total + ((a.word_counts and a.word_counts[word]) or 0)
 50 |             classified_total = classified_total + ((b.word_counts and b.word_counts[word]) or 0)
 51 |             local total = classified_total + unc
 52 |             if total > 0 and uncertainty_weight ~= 0 then
 53 |               local confidence = classified_total / total
 54 |               weight = weight * (confidence ^ uncertainty_weight)
 55 |             end
 56 |           end
 57 |         end
 58 |         token_weights[word] = weight
 59 |       end
 60 |       local default_prob = self.opts.default_prob / sum_counts
 61 |       local default_a = default_prob * a.total_count
 62 |       local default_b = default_prob * b.total_count
 63 |       local prob
 64 |       if self.opts.log then
 65 |         local ai_log_sum = 0
 66 |         local bi_log_sum = 0
 67 |         for _index_0 = 1, #available_words do
 68 |           local word = available_words[_index_0]
 69 |           local ai_count = (a.word_counts and a.word_counts[word] or 0) + default_a
 70 |           local bi_count = (b.word_counts and b.word_counts[word] or 0) + default_b
 71 |           local weight = token_weights[word] or self:get_token_weight(word)
 72 |           ai_log_sum = ai_log_sum + (weight * math.log(ai_count))
 73 |           bi_log_sum = bi_log_sum + (weight * math.log(bi_count))
 74 |         end
 75 |         ai_log_sum = ai_log_sum + math.log(a.total_count)
 76 |         bi_log_sum = bi_log_sum + math.log(b.total_count)
 77 |         ai_log_sum = ai_log_sum - math.log((default_a + a.total_count))
 78 |         bi_log_sum = bi_log_sum - math.log((default_b + b.total_count))
 79 |         ai_log_sum = ai_log_sum - math.log(available_words_count)
 80 |         bi_log_sum = bi_log_sum - math.log(available_words_count)
 81 |         local max_log_sum = math.max(ai_log_sum, bi_log_sum)
 82 |         local ai_prob = math.exp(ai_log_sum - max_log_sum)
 83 |         local bi_prob = math.exp(bi_log_sum - max_log_sum)
 84 |         prob = ai_prob / (ai_prob + bi_prob)
 85 |       else
 86 |         local ai_mul, bi_mul
 87 |         for _index_0 = 1, #available_words do
 88 |           local word = available_words[_index_0]
 89 |           local ai_count = (a.word_counts and a.word_counts[word] or 0) + default_a
 90 |           local bi_count = (b.word_counts and b.word_counts[word] or 0) + default_b
 91 |           local weight = token_weights[word] or self:get_token_weight(word)
 92 |           if ai_mul then
 93 |             ai_mul = ai_mul * (ai_count ^ weight)
 94 |           else
 95 |             ai_mul = ai_count ^ weight
 96 |           end
 97 |           if bi_mul then
 98 |             bi_mul = bi_mul * (bi_count ^ weight)
 99 |           else
100 |             bi_mul = bi_count ^ weight
101 |           end
102 |         end
103 |         local ai_prob = a.total_count * ai_mul / ((a.total_count + default_a) * available_words_count)
104 |         local bi_prob = b.total_count * bi_mul / ((b.total_count + default_b) * available_words_count)
105 |         if ai_prob ~= ai_prob then
106 |           ai_prob = 0
107 |         end
108 |         if bi_prob ~= bi_prob then
109 |           bi_prob = 0
110 |         end
111 |         prob = ai_prob / (ai_prob + bi_prob)
112 |       end
113 |       if prob ~= prob then
114 |         return nil, "Got nan when calculating prob"
115 |       end
116 |       if prob == math.huge or prob == -math.huge then
117 |         return nil, "Got inf when calculating prob"
118 |       end
119 |       local tuples = {
120 |         {
121 |           a.name,
122 |           prob
123 |         },
124 |         {
125 |           b.name,
126 |           1 - prob
127 |         }
128 |       }
129 |       table.sort(tuples, function(a, b)
130 |         return a[2] > b[2]
131 |       end)
132 |       return tuples
133 |     end
134 |   }
135 |   _base_0.__index = _base_0
136 |   setmetatable(_base_0, _parent_0.__base)
137 |   _class_0 = setmetatable({
138 |     __init = function(self, ...)
139 |       return _class_0.__parent.__init(self, ...)
140 |     end,
141 |     __base = _base_0,
142 |     __name = "BayesClassifier",
143 |     __parent = _parent_0
144 |   }, {
145 |     __index = function(cls, name)
146 |       local val = rawget(_base_0, name)
147 |       if val == nil then
148 |         local parent = rawget(cls, "__parent")
149 |         if parent then
150 |           return parent[name]
151 |         end
152 |       else
153 |         return val
154 |       end
155 |     end,
156 |     __call = function(cls, ...)
157 |       local _self_0 = setmetatable({}, _base_0)
158 |       cls.__init(_self_0, ...)
159 |       return _self_0
160 |     end
161 |   })
162 |   _base_0.__class = _class_0
163 |   local self = _class_0
164 |   self.default_options = {
165 |     max_words = 40,
166 |     default_prob = 0.1,
167 |     log = false,
168 |     token_weight_patterns = nil,
169 |     uncertainty_weight = 1.0
170 |   }
171 |   if _parent_0.__inherited then
172 |     _parent_0.__inherited(_parent_0, _class_0)
173 |   end
174 |   BayesClassifier = _class_0
175 |   return _class_0
176 | end
177 | 


--------------------------------------------------------------------------------
/spec/unaccent_spec.moon:
--------------------------------------------------------------------------------
  1 | 
  2 | unaccent = require "lapis.bayes.text.unaccent"
  3 | 
  4 | describe "lapis.bayes.text.unaccent", ->
  5 |   describe "unaccent_string", ->
  6 |     it "passes through basic ASCII unchanged", ->
  7 |       assert.same "hello world", unaccent.unaccent_string "hello world"
  8 |       assert.same "abc123", unaccent.unaccent_string "abc123"
  9 |       assert.same "test", unaccent.unaccent_string "test"
 10 | 
 11 |     it "handles empty string", ->
 12 |       assert.same "", unaccent.unaccent_string ""
 13 | 
 14 |     it "converts fullwidth characters to ASCII", ->
 15 |       assert.same "abc", unaccent.unaccent_string "ａｂｃ"
 16 |       assert.same "ABC", unaccent.unaccent_string "ＡＢＣ"
 17 |       assert.same "123", unaccent.unaccent_string "１２３"
 18 | 
 19 |     it "converts mathematical alphanumerics", ->
 20 |       assert.same "abc", unaccent.unaccent_string "𝕒𝕓𝕔"
 21 |       assert.same "xyz", unaccent.unaccent_string "𝚡𝚢𝚣"
 22 |       assert.same "ABC", unaccent.unaccent_string "𝓐𝓑𝓒"
 23 | 
 24 |     it "converts mathematical bold letters", ->
 25 |       assert.same "SaleIsLiveCheckNow", unaccent.unaccent_string "𝐒𝐚𝐥𝐞𝐈𝐬𝐋𝐢𝐯𝐞𝐂𝐡𝐞𝐜𝐤𝐍𝐨𝐰"
 26 |       assert.same "ABC", unaccent.unaccent_string "𝐀𝐁𝐂"
 27 |       assert.same "xyz", unaccent.unaccent_string "𝐱𝐲𝐳"
 28 | 
 29 |     it "removes accents from Latin characters", ->
 30 |       assert.same "aeiou", unaccent.unaccent_string "àéíóú"
 31 |       assert.same "AEIOU", unaccent.unaccent_string "ÀÉÍÓÚ"
 32 |       assert.same "nca", unaccent.unaccent_string "ñçä"
 33 | 
 34 |     it "converts Greek letters to Latin", ->
 35 |       assert.same "a", unaccent.unaccent_string "α"
 36 |       assert.same "y", unaccent.unaccent_string "γ"
 37 |       assert.same "n", unaccent.unaccent_string "π"
 38 |       assert.same "o", unaccent.unaccent_string "ο"
 39 | 
 40 |     it "converts Cyrillic letters to Latin", ->
 41 |       assert.same "a", unaccent.unaccent_string "а"
 42 |       assert.same "e", unaccent.unaccent_string "е"
 43 |       assert.same "o", unaccent.unaccent_string "о"
 44 | 
 45 |     it "normalizes special punctuation", ->
 46 |       assert.same ".", unaccent.unaccent_string "。"
 47 |       assert.same ",", unaccent.unaccent_string "，"
 48 |       assert.same ":", unaccent.unaccent_string "："
 49 |       assert.same "!", unaccent.unaccent_string "！"
 50 | 
 51 |     it "normalizes mathematical operators", ->
 52 |       assert.same "==", unaccent.unaccent_string "⩵"
 53 |       assert.same "===", unaccent.unaccent_string "⩶"
 54 |       assert.same "::=", unaccent.unaccent_string "⩴"
 55 | 
 56 |     it "normalizes brackets", ->
 57 |       assert.same "[", unaccent.unaccent_string "［"
 58 |       assert.same "]", unaccent.unaccent_string "］"
 59 |       assert.same "{", unaccent.unaccent_string "｛"
 60 |       assert.same "}", unaccent.unaccent_string "｝"
 61 | 
 62 |     it "converts special number forms", ->
 63 |       assert.same "0", unaccent.unaccent_string "０"
 64 |       assert.same " 1/2", unaccent.unaccent_string "½"
 65 |       assert.same " 1/4", unaccent.unaccent_string "¼"
 66 |       assert.same " 3/4", unaccent.unaccent_string "¾"
 67 | 
 68 |     it "converts Roman numerals", ->
 69 |       assert.same "1", unaccent.unaccent_string "Ⅰ"
 70 |       assert.same "IV", unaccent.unaccent_string "Ⅳ"
 71 |       assert.same "XII", unaccent.unaccent_string "Ⅻ"
 72 | 
 73 |     it "converts circled numbers", ->
 74 |       assert.same "1", unaccent.unaccent_string "①"
 75 |       assert.same "10", unaccent.unaccent_string "⑩"
 76 |       assert.same "20", unaccent.unaccent_string "⑳"
 77 | 
 78 |     it "converts enclosed alphanumerics", ->
 79 |       assert.same "(1)", unaccent.unaccent_string "⑴"
 80 |       assert.same "(a)", unaccent.unaccent_string "⒜"
 81 |       assert.same "1.", unaccent.unaccent_string "⒈"
 82 | 
 83 |     it "handles mixed character types", ->
 84 |       assert.same "hello123", unaccent.unaccent_string "ｈｅｌｌｏ１２３"
 85 |       assert.same "test.com", unaccent.unaccent_string "ｔｅｓｔ。ｃｏｍ"
 86 | 
 87 |     it "handles characters that should pass through", ->
 88 |       result = unaccent.unaccent_string "hello-world_test"
 89 |       assert.same "hello-world_test", result
 90 | 
 91 |     it "handles ligatures", ->
 92 |       assert.same "fi", unaccent.unaccent_string "ﬁ"
 93 |       assert.same "fl", unaccent.unaccent_string "ﬂ"
 94 |       assert.same "ffi", unaccent.unaccent_string "ﬃ"
 95 |       assert.same "ffl", unaccent.unaccent_string "ﬄ"
 96 |       assert.same "st", unaccent.unaccent_string "ﬆ"
 97 | 
 98 |     it "handles special letter forms", ->
 99 |       assert.same "ss", unaccent.unaccent_string "ß"
100 |       assert.same "SS", unaccent.unaccent_string "ẞ"
101 |       assert.same "ae", unaccent.unaccent_string "æ"
102 |       assert.same "AE", unaccent.unaccent_string "Æ"
103 |       assert.same "oe", unaccent.unaccent_string "œ"
104 |       assert.same "OE", unaccent.unaccent_string "Œ"
105 | 
106 |     describe "comprehensive normalization tests from test.moon", ->
107 |       -- Note: unaccent_string only does character transliteration, not case normalization
108 |       -- Expected values show what unaccent_string outputs (with spaces removed)
109 |       normalizes = {
110 |         {"hello world", "helloworld"}
111 |         {"baｍWaＲ7°ＣｏМ", "bamWaR7.CoM"}
112 |         {"BaМwＡｒ7．СοM", "BaMwAr7.CoM"}
113 |         {"b Ａ ｍ w A ｒ 7 ° ｃ Ｏ М", "bAmwAr7.coM"}
114 |         {"B A Μ W а Ｒ 7 ㆍc o m", "BAMWaR7.com"}
115 |         {"b ＡΜ ｗ А Ｒ 7．ｃＯм", "bAMwAR7.com"}
116 |         {"ｂａｍｗａｒ７.ｃｏｍ", "bamwar7.com"}
117 |         {"ＢＡＭ〉ＷＡＲ７.ｃｏｍ", "BAM>WAR7.com"}
118 |         {"B A M W A R 7ㆍCOM", "BAMWAR7.COM"}
119 |         {"ＢＡＭＷＡＲ７.ＣＯＭ", "BAMWAR7.CoM"}
120 |         {"〚ｂａｍ〛ｗａｒ７.〚ｃｏｍ〛", "[bam]war7.[com]"}
121 |         {"⒲⒲⒲.⒝⒜⒨⒲⒜⒭⑺.⒞⒪⒨", "(w)(w)(w).(b)(a)(m)(w)(a)(r)(7).(c)(o)(m)"}
122 |         {" ⓦⓦⓦ.ⓑⓐⓜⓦⓐⓡ⑦.ⓒⓞⓜ", "www.bamwar7.com"}
123 |         {"🇱🅔🅰🄵", "leaf"}
124 |         {"ero588，C0M", "ero588,C0M"}
125 |         {"RK772。CoM", "RK772.CoM"}
126 |         {"MIO652。CoM", "MIO652.CoM"}
127 |         {"ＫＢＳ４５４。ＣＯＭ", "KBS454.CoM"}
128 |         {"MI738。CoM", "MI738.CoM"}
129 |         {"mkmk35。COM", "mkmk35.COM"}
130 |         {"79ESA。CｏM", "79ESA.CoM"}
131 |         {"APA82。CoM", "APA82.CoM"}
132 |         {"𝚟𝚘𝚙.𝚜𝚞", "vop.su"}
133 |         {"ＭＭＯ77。ＣＯＭ", "MMo77.CoM"}
134 |         {"ＭＩＯ６５２。ＣＯＭ", "Mio652.CoM"}
135 |         {"kakao: dnj2016", "kakao:dnj2016"}
136 |       }
137 | 
138 |       for {before, after} in *normalizes
139 |         it "normalizes '#{before}'", ->
140 |           result = unaccent.unaccent_string before
141 |           -- Remove spaces for comparison since the test.moon examples show this
142 |           result_normalized = result\gsub "%s", ""
143 |           assert.same after, result_normalized
144 | 
145 |   describe "unaccent_table", ->
146 |     it "exists and is a table", ->
147 |       assert.is_table unaccent.unaccent_table
148 | 
149 |     it "has expected number of entries", ->
150 |       count = 0
151 |       for k, v in pairs unaccent.unaccent_table
152 |         count += 1
153 |       assert.true count > 2000, "Expected over 2000 mappings"
154 | 
155 |     it "contains specific mappings", ->
156 |       assert.same "a", unaccent.unaccent_table["à"]
157 |       assert.same "e", unaccent.unaccent_table["é"]
158 |       assert.same "A", unaccent.unaccent_table["Ａ"]
159 |       assert.same "0", unaccent.unaccent_table["０"]
160 |       assert.same ".", unaccent.unaccent_table["。"]
161 | 
162 |     it "maps fullwidth characters", ->
163 |       assert.same "a", unaccent.unaccent_table["ａ"]
164 |       assert.same "z", unaccent.unaccent_table["ｚ"]
165 |       assert.same "0", unaccent.unaccent_table["０"]
166 |       assert.same "9", unaccent.unaccent_table["９"]
167 | 
168 |     it "maps Greek letters", ->
169 |       assert.same "a", unaccent.unaccent_table["α"]
170 |       assert.same "y", unaccent.unaccent_table["γ"]
171 |       assert.same "n", unaccent.unaccent_table["π"]
172 | 
173 |     it "maps mathematical alphanumerics", ->
174 |       assert.true unaccent.unaccent_table["𝕒"] != nil
175 |       assert.true unaccent.unaccent_table["𝓐"] != nil
176 |       assert.true unaccent.unaccent_table["𝚊"] != nil
177 | 


--------------------------------------------------------------------------------
/lapis/bayes/classifiers/base.lua:
--------------------------------------------------------------------------------
  1 | local uniquify
  2 | uniquify = require("lapis.util").uniquify
  3 | local BaseClassifier
  4 | do
  5 |   local _class_0
  6 |   local _base_0 = {
  7 |     default_tokenizer = "lapis.bayes.tokenizers.postgres_text",
  8 |     word_probabilities = function(self, categories, words)
  9 |       return error("word_probabilities: subclass must implement")
 10 |     end,
 11 |     classify_text = function(self, ...)
 12 |       local counts, word_rate_or_err = self:text_probabilities(...)
 13 |       if not (counts) then
 14 |         return nil, word_rate_or_err
 15 |       end
 16 |       return counts[1][1], counts[1][2], word_rate_or_err
 17 |     end,
 18 |     tokenize_text = function(self, text)
 19 |       assert(text, "missing text to tokenize")
 20 |       if not (type(text) == "string") then
 21 |         return text
 22 |       end
 23 |       if self.opts.tokenize_text then
 24 |         return self.opts.tokenize_text(text, self.opts)
 25 |       end
 26 |       local tokenizer
 27 |       if self.opts.tokenizer then
 28 |         tokenizer = self.opts.tokenizer
 29 |       else
 30 |         local Tokenizer = require(self.default_tokenizer)
 31 |         tokenizer = Tokenizer(self.opts)
 32 |       end
 33 |       return tokenizer:tokenize_text(text)
 34 |     end,
 35 |     train_text = function(self, category, text, opts)
 36 |       local tokens = self:tokenize_text(text)
 37 |       if opts and opts.filter_tokens then
 38 |         tokens = opts.filter_tokens(opts, text)
 39 |       end
 40 |       local Categories
 41 |       Categories = require("lapis.bayes.models").Categories
 42 |       category = Categories:find_or_create(category)
 43 |       return category:increment_words(tokens)
 44 |     end,
 45 |     text_probabilities = function(self, category_names, text, opts)
 46 |       opts = opts or { }
 47 |       local categories, err = self:find_categories(category_names)
 48 |       if not (categories) then
 49 |         return nil, err
 50 |       end
 51 |       local words = self:tokenize_text(text)
 52 |       if not (words and next(words)) then
 53 |         return nil, "failed to generate tokens for text"
 54 |       end
 55 |       local available_words
 56 |       available_words, err = self:count_words(categories, words)
 57 |       if not (available_words) then
 58 |         return nil, err
 59 |       end
 60 |       local available_words_set
 61 |       do
 62 |         local _tbl_0 = { }
 63 |         for _index_0 = 1, #available_words do
 64 |           local word = available_words[_index_0]
 65 |           _tbl_0[word] = true
 66 |         end
 67 |         available_words_set = _tbl_0
 68 |       end
 69 |       local count = 0
 70 |       for _index_0 = 1, #words do
 71 |         local word = words[_index_0]
 72 |         if available_words_set[word] then
 73 |           count = count + 1
 74 |         end
 75 |       end
 76 |       local token_ratio = count / #words
 77 |       local probs
 78 |       probs, err = self:word_probabilities(categories, available_words, opts)
 79 |       if not (probs) then
 80 |         return nil, err
 81 |       end
 82 |       for _index_0 = 1, #probs do
 83 |         local _des_0 = probs[_index_0]
 84 |         local c, p
 85 |         c, p = _des_0[1], _des_0[2]
 86 |         probs[c] = p
 87 |       end
 88 |       return probs, token_ratio
 89 |     end,
 90 |     find_categories = function(self, category_names)
 91 |       local Categories
 92 |       Categories = require("lapis.bayes.models").Categories
 93 |       local db = Categories.db
 94 |       local categories = Categories:select("where name in ?", db.list(category_names))
 95 |       local by_name
 96 |       do
 97 |         local _tbl_0 = { }
 98 |         for _index_0 = 1, #categories do
 99 |           local c = categories[_index_0]
100 |           _tbl_0[c.name] = c
101 |         end
102 |         by_name = _tbl_0
103 |       end
104 |       local missing
105 |       local result
106 |       do
107 |         local _accum_0 = { }
108 |         local _len_0 = 1
109 |         for _index_0 = 1, #category_names do
110 |           local _continue_0 = false
111 |           repeat
112 |             local name = category_names[_index_0]
113 |             local c = by_name[name]
114 |             if not (c) then
115 |               missing = missing or { }
116 |               table.insert(missing, name)
117 |               _continue_0 = true
118 |               break
119 |             end
120 |             local _value_0 = c
121 |             _accum_0[_len_0] = _value_0
122 |             _len_0 = _len_0 + 1
123 |             _continue_0 = true
124 |           until true
125 |           if not _continue_0 then
126 |             break
127 |           end
128 |         end
129 |         result = _accum_0
130 |       end
131 |       if missing and next(missing) then
132 |         return nil, "find_categories: missing categories (" .. tostring(table.concat(missing, ", ")) .. ")"
133 |       end
134 |       return result
135 |     end,
136 |     find_word_classifications = function(self, words, category_ids)
137 |       if not (next(words) and next(category_ids)) then
138 |         return { }
139 |       end
140 |       local WordClassifications
141 |       WordClassifications = require("lapis.bayes.models").WordClassifications
142 |       local db = WordClassifications.db
143 |       return WordClassifications:select("where word in ? and category_id in ?", db.list(words), db.list(category_ids))
144 |     end,
145 |     candidate_words = function(self, categories, available_words, count)
146 |       if #available_words <= count then
147 |         return available_words
148 |       end
149 |       assert(#categories == 2, "can only do two categories")
150 |       local a, b = unpack(categories)
151 |       local tuples
152 |       do
153 |         local _accum_0 = { }
154 |         local _len_0 = 1
155 |         for _index_0 = 1, #available_words do
156 |           local word = available_words[_index_0]
157 |           local a_count = a.word_counts and a.word_counts[word] or 0
158 |           local b_count = b.word_counts and b.word_counts[word] or 0
159 |           local _value_0 = {
160 |             word,
161 |             math.random() / 100 + math.abs((a_count - b_count) / math.sqrt(a_count + b_count)),
162 |             a_count,
163 |             b_count
164 |           }
165 |           _accum_0[_len_0] = _value_0
166 |           _len_0 = _len_0 + 1
167 |         end
168 |         tuples = _accum_0
169 |       end
170 |       table.sort(tuples, function(a, b)
171 |         return a[2] > b[2]
172 |       end)
173 |       local _accum_0 = { }
174 |       local _len_0 = 1
175 |       local _max_0 = count
176 |       for _index_0 = 1, _max_0 < 0 and #tuples + _max_0 or _max_0 do
177 |         local t = tuples[_index_0]
178 |         _accum_0[_len_0] = t[1]
179 |         _len_0 = _len_0 + 1
180 |       end
181 |       return _accum_0
182 |     end,
183 |     count_words = function(self, categories, words)
184 |       local categories_by_id
185 |       do
186 |         local _tbl_0 = { }
187 |         for _index_0 = 1, #categories do
188 |           local c = categories[_index_0]
189 |           _tbl_0[c.id] = c
190 |         end
191 |         categories_by_id = _tbl_0
192 |       end
193 |       words = uniquify(words)
194 |       local wcs = self:find_word_classifications(words, (function()
195 |         local _accum_0 = { }
196 |         local _len_0 = 1
197 |         for _index_0 = 1, #categories do
198 |           local c = categories[_index_0]
199 |           _accum_0[_len_0] = c.id
200 |           _len_0 = _len_0 + 1
201 |         end
202 |         return _accum_0
203 |       end)())
204 |       local available_words
205 |       do
206 |         local _accum_0 = { }
207 |         local _len_0 = 1
208 |         for word in pairs((function()
209 |           local _tbl_0 = { }
210 |           for _index_0 = 1, #wcs do
211 |             local wc = wcs[_index_0]
212 |             _tbl_0[wc.word] = true
213 |           end
214 |           return _tbl_0
215 |         end)()) do
216 |           _accum_0[_len_0] = word
217 |           _len_0 = _len_0 + 1
218 |         end
219 |         available_words = _accum_0
220 |       end
221 |       if #available_words == 0 then
222 |         return nil, "no words in text are classifyable"
223 |       end
224 |       for _index_0 = 1, #wcs do
225 |         local wc = wcs[_index_0]
226 |         local category = categories_by_id[wc.category_id]
227 |         category.word_counts = category.word_counts or { }
228 |         category.word_counts[wc.word] = wc.count
229 |       end
230 |       return available_words
231 |     end
232 |   }
233 |   _base_0.__index = _base_0
234 |   _class_0 = setmetatable({
235 |     __init = function(self, opts)
236 |       if opts == nil then
237 |         opts = { }
238 |       end
239 |       self.opts = opts
240 |       if self.__class.default_options then
241 |         self.opts = setmetatable((function()
242 |           local _tbl_0 = { }
243 |           for k, v in pairs(self.opts) do
244 |             _tbl_0[k] = v
245 |           end
246 |           return _tbl_0
247 |         end)(), {
248 |           __index = self.__class.default_options
249 |         })
250 |       end
251 |     end,
252 |     __base = _base_0,
253 |     __name = "BaseClassifier"
254 |   }, {
255 |     __index = _base_0,
256 |     __call = function(cls, ...)
257 |       local _self_0 = setmetatable({}, _base_0)
258 |       cls.__init(_self_0, ...)
259 |       return _self_0
260 |     end
261 |   })
262 |   _base_0.__class = _class_0
263 |   BaseClassifier = _class_0
264 |   return _class_0
265 | end
266 | 


--------------------------------------------------------------------------------
/spec/stem_spec.moon:
--------------------------------------------------------------------------------
  1 | stem = require "lapis.bayes.text.stem"
  2 | 
  3 | test_word = (input, expected) ->
  4 |   assert.same expected, stem.stem_word input
  5 | 
  6 | describe "lapis.bayes.text.stem", ->
  7 |   describe "stem_word", ->
  8 |     it "handles nil and empty strings", ->
  9 |       assert.same nil, stem.stem_word nil
 10 |       assert.same "", stem.stem_word ""
 11 | 
 12 |     it "handles short words (< 3 chars)", ->
 13 |       test_word "a", "a"
 14 |       test_word "ab", "ab"
 15 |       test_word "at", "at"
 16 | 
 17 |     it "handles words that don't need stemming", ->
 18 |       test_word "cat", "cat"
 19 |       test_word "dog", "dog"
 20 |       test_word "tree", "tree"
 21 | 
 22 |     it "converts to lowercase", ->
 23 |       test_word "HELLO", "hello"
 24 |       test_word "WoRlD", "world"
 25 |       test_word "TEST", "test"
 26 | 
 27 |     describe "exception words", ->
 28 |       it "handles skis/skies", ->
 29 |         test_word "skis", "ski"
 30 |         test_word "skies", "sky"
 31 |         test_word "sky", "sky"
 32 | 
 33 |       it "handles special -ly cases", ->
 34 |         test_word "idly", "idl"
 35 |         test_word "gently", "gentl"
 36 |         test_word "ugly", "ugli"
 37 |         test_word "early", "earli"
 38 |         test_word "only", "onli"
 39 |         test_word "singly", "singl"
 40 | 
 41 |       it "handles invariant forms", ->
 42 |         test_word "news", "news"
 43 |         test_word "howe", "howe"
 44 |         test_word "atlas", "atlas"
 45 |         test_word "cosmos", "cosmos"
 46 |         test_word "bias", "bias"
 47 |         test_word "andes", "andes"
 48 | 
 49 |     describe "Step 1a - plurals and possessives", ->
 50 |       it "removes apostrophes", ->
 51 |         test_word "dog's", "dog"
 52 |         test_word "cat's'", "cat"
 53 | 
 54 |       it "handles sses -> ss", ->
 55 |         test_word "blesses", "bless"
 56 |         test_word "stresses", "stress"
 57 | 
 58 |       it "handles ied/ies", ->
 59 |         test_word "tied", "tie"
 60 |         test_word "pies", "pie"
 61 |         test_word "cries", "cri"
 62 |         test_word "studies", "studi"
 63 | 
 64 |       it "removes trailing s when appropriate", ->
 65 |         test_word "cats", "cat"
 66 |         test_word "dogs", "dog"
 67 |         test_word "gas", "ga"  -- has vowel so s is removed
 68 |         test_word "this", "thi"  -- has vowel so s is removed
 69 |         test_word "class", "class"  -- ss ending
 70 | 
 71 |     describe "Step 1b - ed, ing suffixes", ->
 72 |       it "handles eed/eedly in R1", ->
 73 |         test_word "agreed", "agre"
 74 |         test_word "feed", "feed"  -- R1 is null, so eed not replaced
 75 | 
 76 |       it "handles ed/edly", ->
 77 |         test_word "plastered", "plaster"
 78 |         test_word "bled", "bled"
 79 |         test_word "motivated", "motiv"
 80 | 
 81 |       it "handles ing/ingly", ->
 82 |         test_word "sing", "sing"
 83 |         test_word "motivating", "motiv"
 84 |         test_word "running", "run"
 85 |         test_word "hopping", "hop"
 86 | 
 87 |       it "adds e after at/bl/iz", ->
 88 |         test_word "luxuriated", "luxuri"  -- removes 'ated', no e added
 89 |         test_word "troubled", "troubl"
 90 | 
 91 |       it "removes double consonants", ->
 92 |         test_word "hopped", "hop"
 93 |         test_word "fitted", "fit"
 94 |         test_word "planned", "plan"
 95 | 
 96 |       it "handles special ing cases", ->
 97 |         test_word "inning", "inning"
 98 |         test_word "outing", "outing"
 99 |         test_word "canning", "canning"
100 | 
101 |     describe "Step 1c - y suffix", ->
102 |       it "replaces suffix y with i", ->
103 |         test_word "happy", "happi"
104 |         test_word "sky", "sky"  -- exception word, not changed
105 | 
106 |       it "does not replace y at start or after vowel", ->
107 |         test_word "say", "say"
108 |         test_word "boy", "boy"
109 | 
110 |     describe "Step 2 - derivational suffixes", ->
111 |       it "handles tional -> tion", ->
112 |         test_word "relational", "relat"
113 |         test_word "conditional", "condit"
114 |         test_word "rational", "ration"
115 | 
116 |       it "handles enci -> ence", ->
117 |         test_word "valenci", "valenc"
118 | 
119 |       it "handles anci -> ance", ->
120 |         test_word "hesitanci", "hesit"
121 | 
122 |       it "handles izer -> ize", ->
123 |         test_word "digitizer", "digit"
124 | 
125 |       it "handles ational -> ate", ->
126 |         test_word "operational", "oper"
127 | 
128 |       it "handles ation/ator -> ate", ->
129 |         test_word "predication", "predic"
130 |         test_word "operator", "oper"
131 | 
132 |       it "handles alism -> al", ->
133 |         test_word "feudalism", "feudal"
134 | 
135 |       it "handles fulness -> ful", ->
136 |         test_word "hopefulness", "hope"
137 | 
138 |       it "handles ousness -> ous", ->
139 |         test_word "callousness", "callous"
140 | 
141 |       it "handles iveness -> ive", ->
142 |         test_word "decisiveness", "decis"
143 | 
144 |       it "handles biliti -> ble", ->
145 |         test_word "sensibiliti", "sensibl"
146 | 
147 |       it "handles li deletion", ->
148 |         test_word "formalli", "formal"
149 | 
150 |     describe "Step 3 - more derivational suffixes", ->
151 |       it "handles icate -> ic", ->
152 |         test_word "duplicate", "duplic"
153 | 
154 |       it "handles ative deletion in R2", ->
155 |         test_word "demonstrative", "demonstr"
156 | 
157 |       it "handles alize -> al", ->
158 |         test_word "normalize", "normal"
159 | 
160 |       it "handles ful/ness deletion", ->
161 |         test_word "hopeful", "hope"
162 |         test_word "goodness", "good"
163 | 
164 |     describe "Step 4 - suffix deletion", ->
165 |       it "handles al", ->
166 |         test_word "radical", "radic"
167 | 
168 |       it "handles ance/ence", ->
169 |         test_word "dependence", "depend"
170 | 
171 |       it "handles er", ->
172 |         test_word "computer", "comput"
173 | 
174 |       it "handles able/ible", ->
175 |         test_word "adjustable", "adjust"
176 |         test_word "divisible", "divis"
177 | 
178 |       it "handles ant/ent/ment", ->
179 |         test_word "irritant", "irrit"
180 |         test_word "different", "differ"
181 |         test_word "adjustment", "adjust"
182 | 
183 |       it "handles ion after s or t", ->
184 |         test_word "adoption", "adopt"
185 |         test_word "decision", "decis"
186 | 
187 |       it "handles ism/iti/ous/ive/ize", ->
188 |         test_word "communism", "communism"  -- ism in R2 only
189 |         test_word "sensitivity", "sensit"
190 |         test_word "continuous", "continu"
191 |         test_word "effective", "effect"
192 |         test_word "realize", "realiz"
193 | 
194 |     describe "Step 5 - final cleanup", ->
195 |       it "removes trailing e in R2", ->
196 |         test_word "debate", "debat"
197 |         test_word "create", "creat"
198 | 
199 |       it "removes trailing e in R1 if not short syllable", ->
200 |         test_word "hope", "hope"
201 | 
202 |       it "keeps trailing e after short syllable in R1", ->
203 |         test_word "centre", "centr"
204 | 
205 |       it "removes double l in R2", ->
206 |         test_word "controll", "control"
207 | 
208 |     describe "word families", ->
209 |       it "stems connection family to connect", ->
210 |         test_word "connection", "connect"
211 |         test_word "connections", "connect"
212 |         test_word "connective", "connect"
213 |         test_word "connected", "connect"
214 |         test_word "connecting", "connect"
215 | 
216 |       it "stems generate family", ->
217 |         test_word "generate", "generat"
218 |         test_word "generates", "generat"
219 |         test_word "generated", "generat"
220 |         test_word "generating", "generat"
221 |         test_word "generator", "generat"
222 |         test_word "general", "general"
223 |         test_word "generalization", "general"
224 | 
225 |       it "stems happy family to happi", ->
226 |         test_word "happy", "happi"
227 |         test_word "happiness", "happi"
228 |         test_word "happily", "happili"
229 | 
230 |       it "stems run family", ->
231 |         test_word "run", "run"
232 |         test_word "running", "run"
233 |         test_word "runs", "run"
234 |         test_word "runner", "runner"
235 | 
236 |     describe "complex derivational chains", ->
237 |       it "handles multiply derived words", ->
238 |         test_word "vietnamization", "vietnam"
239 |         test_word "conformabli", "conform"
240 |         test_word "radicalli", "radic"
241 |         test_word "differentli", "differ"
242 | 
243 |     describe "special prefix handling", ->
244 |       it "handles commun- prefix", ->
245 |         test_word "communism", "communism"  -- ism not in R2
246 |         test_word "communication", "communic"
247 |         test_word "community", "communiti"
248 | 
249 |       it "handles gener- prefix", ->
250 |         test_word "generate", "generat"
251 |         test_word "generator", "generat"
252 |         test_word "generous", "generous"
253 | 
254 |       it "handles univers- prefix", ->
255 |         test_word "university", "universiti"
256 |         test_word "universal", "universal"
257 |         test_word "universe", "univers"
258 | 
259 |     describe "edge cases", ->
260 |       it "handles very long words", ->
261 |         result = stem.stem_word "antidisestablishmentarianism"
262 |         assert.is_string result
263 |         assert.true #result > 0
264 | 
265 |       it "handles words with no vowels", ->
266 |         test_word "shhh", "shhh"
267 |         test_word "hmm", "hmm"
268 | 
269 |       it "handles repeated consonants", ->
270 |         test_word "bless", "bless"
271 |         test_word "press", "press"
272 | 
273 |       it "handles words ending in y", ->
274 |         test_word "daily", "daili"
275 |         test_word "easily", "easili"
276 | 
277 |       it "preserves words that should not be stemmed", ->
278 |         test_word "test", "test"
279 |         test_word "best", "best"
280 | 


--------------------------------------------------------------------------------
/spec/bayes_spec.moon:
--------------------------------------------------------------------------------
  1 | 
  2 | import use_test_env from require "lapis.spec"
  3 | import truncate_tables from require "lapis.spec.db"
  4 | 
  5 | import Categories, WordClassifications from require "lapis.bayes.models"
  6 | 
  7 | describe "lapis.bayes", ->
  8 |   use_test_env!
  9 | 
 10 |   describe "WordClassifications", ->
 11 |     local c1, c2
 12 | 
 13 |     before_each ->
 14 |       truncate_tables Categories, WordClassifications
 15 | 
 16 |       c1 = Categories\find_or_create "hello"
 17 |       c1\increment_words {
 18 |         alpha: 17
 19 |         beta: 19
 20 |       }
 21 | 
 22 |       c2 = Categories\find_or_create "world"
 23 |       c2\increment_words {
 24 |         beta: 22
 25 |         triple: 27
 26 |       }
 27 | 
 28 |     it "has the correct counts", ->
 29 |       c1_words = {c.word, c.count for c in *c1\get_word_classifications!}
 30 |       c2_words = {c.word, c.count for c in *c2\get_word_classifications!}
 31 | 
 32 |       assert.same {
 33 |         alpha: 17
 34 |         beta: 19
 35 |       }, c1_words
 36 | 
 37 |       assert.same {
 38 |         beta: 22
 39 |         triple: 27
 40 |       }, c2_words
 41 | 
 42 | 
 43 |     it "deletes word from category", ->
 44 |       c1_count = c1.total_count
 45 |       c2_count = c2.total_count
 46 | 
 47 |       wc = assert WordClassifications\find category_id: c1.id, word: "beta"
 48 |       wc\delete!
 49 | 
 50 |       c1\refresh!
 51 |       c2\refresh!
 52 | 
 53 |       assert.same 19, c1_count - c1.total_count
 54 |       assert.same 0, c2_count - c2.total_count
 55 | 
 56 |     it "purges purges words from all categories", ->
 57 |       c1_count = c1.total_count
 58 |       c2_count = c2.total_count
 59 | 
 60 |       deleted, count = WordClassifications\purge_word "alpha", {"hello", "world"}
 61 |       assert.true deleted
 62 |       assert.same 1, count
 63 | 
 64 |       c1\refresh!
 65 |       c2\refresh!
 66 | 
 67 |       assert.same 17, c1_count - c1.total_count
 68 |       assert.same 0, c2_count - c2.total_count
 69 | 
 70 |     it "it increments an individual word", ->
 71 |       wc = assert WordClassifications\find category_id: c1.id, word: "beta"
 72 | 
 73 |       before_word_count = wc.count
 74 | 
 75 |       wc\_increment 1
 76 |       wc\refresh!
 77 |       assert.same before_word_count + 1, wc.count
 78 | 
 79 |     it "deletes word when being unincremented to 0", ->
 80 |       wc = assert WordClassifications\find category_id: c1.id, word: "beta"
 81 |       wc\_increment -wc.count
 82 | 
 83 |       assert.nil (WordClassifications\find {
 84 |         category_id: c1.id
 85 |         word: "beta"
 86 |       })
 87 | 
 88 |     it "clears out words when decremeitng them", ->
 89 |       words = c1\get_word_classifications!
 90 |       for word in *words
 91 |         c1\increment_word word.word, -word.count
 92 | 
 93 |       assert.same 0, c1.total_count
 94 |       c1\refresh!
 95 |       assert.same {}, c1\get_word_classifications!
 96 | 
 97 |   describe "Categories", ->
 98 |     before_each ->
 99 |       truncate_tables Categories, WordClassifications
100 | 
101 |     it "finds or creates category", ->
102 |       c = Categories\find_or_create "hello"
103 |       c2 = Categories\find_or_create "hello"
104 |       assert.same c.id, c2.id
105 | 
106 |     it "increments words", ->
107 |       c = Categories\find_or_create "hello"
108 | 
109 |       WordClassifications\create {
110 |         word: "color"
111 |         category_id: c.id
112 |         count: 2
113 |       }
114 | 
115 |       c\increment_words {
116 |         color: 55
117 |         height: 12
118 |         green: 8
119 |       }
120 | 
121 |       wc_by_name = {wc.word, wc for wc in *WordClassifications\select!}
122 | 
123 |       assert.same 57, wc_by_name.color.count
124 |       assert.same 12, wc_by_name.height.count
125 |       assert.same 8, wc_by_name.green.count
126 | 
127 |     it "deletes category", ->
128 |       c = Categories\find_or_create "hello"
129 |       c\increment_words {
130 |         color: 23
131 |         height: 2
132 |       }
133 |       c\delete!
134 | 
135 |   describe "tokenize text", ->
136 |     describe "default tokenizer", ->
137 |       tokenize_text = (text, ...) ->
138 |         if ...
139 |           error "Got expected additional arguments for tokenize text"
140 | 
141 |         BaseClassifier = require "lapis.bayes.classifiers.base"
142 |         BaseClassifier!\tokenize_text text
143 | 
144 |       it "gets tokens for empty string", ->
145 |         assert.same {}, tokenize_text ""
146 | 
147 |       it "gets tokens for basic string", ->
148 |         assert.same {"hello", "world"}, tokenize_text "hello world"
149 | 
150 |       it "gets tokens with stems and no stop words", ->
151 |         assert.same {"burger", "eat"}, tokenize_text "i am eating burgers"
152 | 
153 |       it "doesn't keep dupes", ->
154 |         assert.same {"burger"}, tokenize_text "burgers are burgers"
155 | 
156 |       it "skips tokens that are too long or short", ->
157 |         assert.same {"great"}, tokenize_text "a b c d e f g great eatingthebigriceball "
158 | 
159 |       it "strips numbers", ->
160 |         assert.same {"delisho", "hodoc"}, tokenize_text "12 delisho hodocs for $5.99"
161 | 
162 |       it "uses custom tokenizer as classifier option", ->
163 |         BaseClassifier = require "lapis.bayes.classifiers.base"
164 |         c = BaseClassifier {
165 |           tokenizer: require "lapis.bayes.tokenizers.url_domains"
166 |         }
167 | 
168 |         assert.same {"leafo.net"}, c\tokenize_text "hello www.leafo.net website"
169 | 
170 |       it "users custom tokenize function", ->
171 |         BaseClassifier = require "lapis.bayes.classifiers.base"
172 |         c = BaseClassifier {
173 |           tokenize_text: (text) ->
174 |             [t for t in text\gmatch "."]
175 |         }
176 | 
177 |         assert.same {
178 |           "h", "e", "l", "l", "o"
179 |         }, c\tokenize_text "hello"
180 | 
181 | 
182 |       it "passes tokens through if already table", ->
183 |         BaseClassifier = require "lapis.bayes.classifiers.base"
184 |         c = BaseClassifier { }
185 | 
186 |         assert.same { "one", "two" }, c\tokenize_text {"one", "two"}
187 | 
188 | 
189 |   describe "train_text", ->
190 |     import train_text from require "lapis.bayes"
191 | 
192 |     before_each ->
193 |       truncate_tables Categories, WordClassifications
194 | 
195 |     it "classifies a single string", ->
196 |       train_text "spam", "hello this is spam, I love spam"
197 |       assert.same 1, Categories\count!
198 |       c = unpack Categories\select!
199 |       assert.same "spam", c.name
200 |       assert.same 3, WordClassifications\count!
201 |       words = WordClassifications\select!
202 |       table.sort words, (a, b) ->
203 |         a.word < b.word
204 | 
205 |       assert.same {
206 |         { category_id: c.id, count: 1, word: "hello" }
207 |         { category_id: c.id, count: 1, word: "love" }
208 |         { category_id: c.id, count: 1, word: "spam" }
209 |       }, words
210 | 
211 | 
212 |     it "classifies multiple strings", ->
213 |       train_text "spam", "hello this is spam, I love spam"
214 |       train_text "ham", "there is ham here"
215 |       train_text "spam", "eating spamming the regular stuff"
216 |       train_text "ham","pigs create too much jam"
217 | 
218 |     it "uses custom tokenizer", ->
219 |       train_text "spam", "cat eat foot", {
220 |         tokenize_text: (str, opts) ->
221 |           [c for c in str\gmatch "[^%s]"]
222 |       }
223 | 
224 |       assert.same {
225 |         t: 3
226 |         f: 1
227 |         o: 2
228 |         a: 2
229 |         c: 1
230 |         e: 1
231 |       }, {c.word, c.count for c in *WordClassifications\select!}
232 | 
233 |   describe "text_probabilities", ->
234 |     import text_probabilities from require "lapis.bayes"
235 | 
236 |     before_each ->
237 |       truncate_tables Categories, WordClassifications
238 | 
239 |     it "works when there is no data", ->
240 |       Categories\create name: "spam"
241 |       Categories\create name: "ham"
242 | 
243 |       assert.same {
244 |         nil, "no words in text are classifyable"
245 |       }, {
246 |         text_probabilities {"spam", "ham"}, "hello world"
247 |       }
248 | 
249 |     it "works when there is some data", ->
250 |       spam = Categories\create name: "spam"
251 |       spam\increment_words {"hello", "world"}
252 | 
253 |       ham = Categories\create name: "ham"
254 |       ham\increment_words {"butt", "world"}
255 | 
256 |       probs, rate = text_probabilities {"spam", "ham"}, "butt zone"
257 |       assert.same 0.5, rate
258 |       -- normalize probs for easy specs
259 |       probs = for p in *probs
260 |         {p[1], math.floor p[2] * 100 + 0.5}
261 | 
262 |       assert.same {
263 |         {"ham", 95}
264 |         {"spam", 5}
265 |       }, probs
266 | 
267 |   describe "models", ->
268 |     before_each ->
269 |       truncate_tables Categories, WordClassifications
270 | 
271 |     it "increment_words", ->
272 |       spam = Categories\create name: "spam"
273 |       count = spam\increment_words {
274 |         "first token"
275 |         "hello.world"
276 |         "http://leafo.net"
277 |         "hello.world"
278 |         zone: 77
279 |       }
280 | 
281 |       assert.same 81, count
282 | 
283 |       words = WordClassifications\select "order by word asc", fields: "category_id, word, count"
284 | 
285 |       assert.same {
286 |         {
287 |           category_id: spam.id
288 |           count: 1
289 |           word: "first token"
290 |         }
291 |         {
292 |           category_id: spam.id
293 |           count: 2
294 |           word: "hello.world"
295 |         },
296 |         {
297 |           category_id: spam.id
298 |           count: 1
299 |           word: "http://leafo.net"
300 |         },
301 |         {
302 |           category_id: spam.id
303 |           count: 77
304 |           word: "zone"
305 |         }
306 |       }, words
307 | 
308 | 
309 |       count = spam\increment_words {
310 |         "hello.world"
311 |         "hello.world"
312 |         "zone"
313 |         "hello.world": 3
314 |       }
315 | 
316 |       assert.same 6, count
317 | 
318 |       words = WordClassifications\select "order by word asc", fields: "category_id, word, count"
319 | 
320 | 
321 |       assert.same {
322 |         {
323 |           category_id: spam.id
324 |           count: 1
325 |           word: "first token"
326 |         }
327 |         {
328 |           category_id: spam.id
329 |           count: 7
330 |           word: "hello.world"
331 |         },
332 |         {
333 |           category_id: spam.id
334 |           count: 1
335 |           word: "http://leafo.net"
336 |         },
337 |         {
338 |           category_id: spam.id
339 |           count: 78
340 |           word: "zone"
341 |         }
342 |       }, words
343 | 


--------------------------------------------------------------------------------
/spec/ngram_tokenizer_spec.moon:
--------------------------------------------------------------------------------
  1 | NgramTokenizer = require "lapis.bayes.tokenizers.ngram"
  2 | 
  3 | it_tokenizes = (label, input, expected_tokens, opts=nil) ->
  4 |   it "tokenizes #{label}", ->
  5 |     tokenizer = NgramTokenizer opts
  6 |     tokens = tokenizer\tokenize_text input
  7 |     assert.same expected_tokens, tokens, "Tokens for #{input\sub 1, 80}"
  8 | 
  9 | describe "lapis.bayes.tokenizers.ngram", ->
 10 |   describe "basic tokenization", ->
 11 |     it_tokenizes "simple text with default bigrams", "hello world", {
 12 |       "he"
 13 |       "el"
 14 |       "ll"
 15 |       "lo"
 16 |       "wo"
 17 |       "or"
 18 |       "rl"
 19 |       "ld"
 20 |     }
 21 | 
 22 |     it_tokenizes "single word", "test", {
 23 |       "te"
 24 |       "es"
 25 |       "st"
 26 |     }
 27 | 
 28 |     it_tokenizes "multiple words", "cat dog fox", {
 29 |       "ca"
 30 |       "at"
 31 |       "do"
 32 |       "og"
 33 |       "fo"
 34 |       "ox"
 35 |     }
 36 | 
 37 |   describe "different n values", ->
 38 |     it_tokenizes "with unigrams (n=1)", "hello", {
 39 |       "h"
 40 |       "e"
 41 |       "l"
 42 |       "l"
 43 |       "o"
 44 |     }, { n: 1 }
 45 | 
 46 |     it_tokenizes "with trigrams (n=3)", "hello", {
 47 |       "hel"
 48 |       "ell"
 49 |       "llo"
 50 |     }, { n: 3 }
 51 | 
 52 |     it_tokenizes "with 4-grams (n=4)", "hello", {
 53 |       "hell"
 54 |       "ello"
 55 |     }, { n: 4 }
 56 | 
 57 |     it_tokenizes "with n=5 exact word length", "hello", {
 58 |       "hello"
 59 |     }, { n: 5 }
 60 | 
 61 |     it_tokenizes "with n=0 defaults to 1", "hi", {
 62 |       "h"
 63 |       "i"
 64 |     }, { n: 0 }
 65 | 
 66 |     it_tokenizes "with negative n defaults to 1", "hi", {
 67 |       "h"
 68 |       "i"
 69 |     }, { n: -5 }
 70 | 
 71 |     it_tokenizes "with fractional n gets floored", "test", {
 72 |       "te"
 73 |       "es"
 74 |       "st"
 75 |     }, { n: 2.7 }
 76 | 
 77 |   describe "word normalization", ->
 78 |     it_tokenizes "converts to lowercase", "Hello WORLD", {
 79 |       "he"
 80 |       "el"
 81 |       "ll"
 82 |       "lo"
 83 |       "wo"
 84 |       "or"
 85 |       "rl"
 86 |       "ld"
 87 |     }
 88 | 
 89 |     it_tokenizes "removes punctuation", "hello, world!", {
 90 |       "he"
 91 |       "el"
 92 |       "ll"
 93 |       "lo"
 94 |       "wo"
 95 |       "or"
 96 |       "rl"
 97 |       "ld"
 98 |     }
 99 | 
100 |     it_tokenizes "handles mixed case and punctuation", "Hello, World!", {
101 |       "he"
102 |       "el"
103 |       "ll"
104 |       "lo"
105 |       "wo"
106 |       "or"
107 |       "rl"
108 |       "ld"
109 |     }
110 | 
111 |     it_tokenizes "removes multiple spaces", "hello    world", {
112 |       "he"
113 |       "el"
114 |       "ll"
115 |       "lo"
116 |       "wo"
117 |       "or"
118 |       "rl"
119 |       "ld"
120 |     }
121 | 
122 |     it_tokenizes "strips punctuation from words", "don't can't won't", {
123 |       "do"
124 |       "on"
125 |       "nt"
126 |       "ca"
127 |       "an"
128 |       "nt"
129 |       "wo"
130 |       "on"
131 |       "nt"
132 |     }
133 | 
134 |   describe "ngram_size method", ->
135 |     it "returns default n=2", ->
136 |       tokenizer = NgramTokenizer!
137 |       assert.equal 2, tokenizer\ngram_size!
138 | 
139 |     it "returns configured n", ->
140 |       tokenizer = NgramTokenizer n: 3
141 |       assert.equal 3, tokenizer\ngram_size!
142 | 
143 |     it "handles string n", ->
144 |       tokenizer = NgramTokenizer n: "4"
145 |       assert.equal 4, tokenizer\ngram_size!
146 | 
147 |     it "floors fractional n", ->
148 |       tokenizer = NgramTokenizer n: 3.9
149 |       assert.equal 3, tokenizer\ngram_size!
150 | 
151 |     it "returns 1 for invalid n", ->
152 |       tokenizer = NgramTokenizer n: 0
153 |       assert.equal 1, tokenizer\ngram_size!
154 | 
155 |   describe "normalize_word method", ->
156 |     local tokenizer
157 |     before_each ->
158 |       tokenizer = NgramTokenizer!
159 | 
160 |     it "normalizes to lowercase", ->
161 |       assert.equal "hello", tokenizer\normalize_word "HELLO"
162 |       assert.equal "hello", tokenizer\normalize_word "Hello"
163 | 
164 |     it "removes punctuation", ->
165 |       assert.equal "hello", tokenizer\normalize_word "hello!"
166 |       assert.equal "hello", tokenizer\normalize_word "hello,"
167 |       assert.equal "hello", tokenizer\normalize_word "hello..."
168 | 
169 |     it "removes whitespace", ->
170 |       assert.equal "hello", tokenizer\normalize_word "hello "
171 |       assert.equal "hello", tokenizer\normalize_word " hello"
172 |       assert.equal "hello", tokenizer\normalize_word " hello "
173 | 
174 |     it "removes all punctuation and whitespace", ->
175 |       assert.equal "hello", tokenizer\normalize_word "  hello!!! "
176 | 
177 |     it "returns nil for empty string", ->
178 |       assert.is_nil tokenizer\normalize_word ""
179 | 
180 |     it "returns nil for nil input", ->
181 |       assert.is_nil tokenizer\normalize_word nil
182 | 
183 |     it "returns nil for whitespace only", ->
184 |       assert.is_nil tokenizer\normalize_word "   "
185 | 
186 |     it "returns nil for punctuation only", ->
187 |       assert.is_nil tokenizer\normalize_word "!!!"
188 | 
189 |   describe "word_ngrams method", ->
190 |     local tokenizer
191 |     before_each ->
192 |       tokenizer = NgramTokenizer!
193 | 
194 |     it "generates bigrams from word", ->
195 |       ngrams = tokenizer\word_ngrams "hello", 2
196 |       assert.same {"he", "el", "ll", "lo"}, ngrams
197 | 
198 |     it "generates trigrams from word", ->
199 |       ngrams = tokenizer\word_ngrams "hello", 3
200 |       assert.same {"hel", "ell", "llo"}, ngrams
201 | 
202 |     it "returns full word when length < n", ->
203 |       ngrams = tokenizer\word_ngrams "hi", 3
204 |       assert.same {"hi"}, ngrams
205 | 
206 |     it "returns full word when length == n", ->
207 |       ngrams = tokenizer\word_ngrams "hi", 2
208 |       assert.same {"hi"}, ngrams
209 | 
210 |     it "returns full word for empty string", ->
211 |       ngrams = tokenizer\word_ngrams "", 2
212 |       assert.same {""}, ngrams
213 | 
214 |     it "generates unigrams", ->
215 |       ngrams = tokenizer\word_ngrams "cat", 1
216 |       assert.same {"c", "a", "t"}, ngrams
217 | 
218 |   describe "number handling", ->
219 |     it_tokenizes "ignores numbers by default", "hello 123 world 456", {
220 |       "he"
221 |       "el"
222 |       "ll"
223 |       "lo"
224 |       "wo"
225 |       "or"
226 |       "rl"
227 |       "ld"
228 |     }
229 | 
230 |     it_tokenizes "includes numbers when ignore_numbers is false", "hello 123 world", {
231 |       "he"
232 |       "el"
233 |       "ll"
234 |       "lo"
235 |       "12"
236 |       "23"
237 |       "wo"
238 |       "or"
239 |       "rl"
240 |       "ld"
241 |     }, { ignore_numbers: false }
242 | 
243 |     it_tokenizes "handles mixed alphanumeric", "abc123 def456", {
244 |       "ab"
245 |       "bc"
246 |       "c1"
247 |       "12"
248 |       "23"
249 |       "de"
250 |       "ef"
251 |       "f4"
252 |       "45"
253 |       "56"
254 |     }, { ignore_numbers: false }
255 | 
256 |   describe "edge cases", ->
257 |     it_tokenizes "empty string", "", {}
258 | 
259 |     it_tokenizes "only whitespace", "     ", {}
260 | 
261 |     it_tokenizes "only punctuation", "!!???..", {}
262 | 
263 |     it_tokenizes "single character", "a", {
264 |       "a"
265 |     }
266 | 
267 |     it_tokenizes "two characters with bigrams", "ab", {
268 |       "ab"
269 |     }
270 | 
271 |     it_tokenizes "word longer than n", "testing", {
272 |       "te"
273 |       "es"
274 |       "st"
275 |       "ti"
276 |       "in"
277 |       "ng"
278 |     }
279 | 
280 |   describe "unicode and international characters", ->
281 |     it_tokenizes "accented characters", "café résumé", {
282 |       "ca"
283 |       "af"
284 |       "fé"
285 |       "ré"
286 |       "és"
287 |       "su"
288 |       "um"
289 |       "mé"
290 |     }
291 | 
292 |     it_tokenizes "spanish text", "español niño", {
293 |       "es"
294 |       "sp"
295 |       "pa"
296 |       "añ"
297 |       "ño"
298 |       "ol"
299 |       "ni"
300 |       "iñ"
301 |       "ño"
302 |     }
303 | 
304 |     it_tokenizes "german umlauts", "über schön", {
305 |       "üb"
306 |       "be"
307 |       "er"
308 |       "sc"
309 |       "ch"
310 |       "hö"
311 |       "ön"
312 |     }
313 | 
314 |     it_tokenizes "french accents", "élève être", {
315 |       "él"
316 |       "lè"
317 |       "èv"
318 |       "ve"
319 |       "êt"
320 |       "tr"
321 |       "re"
322 |     }
323 | 
324 |     it_tokenizes "chinese characters", "你好世界", {
325 |       "你好"
326 |       "好世"
327 |       "世界"
328 |     }
329 | 
330 |     it_tokenizes "mixed english and chinese", "hello 世界 world", {
331 |       "he"
332 |       "el"
333 |       "ll"
334 |       "lo"
335 |       "世界"
336 |       "wo"
337 |       "or"
338 |       "rl"
339 |       "ld"
340 |     }
341 | 
342 |   describe "filter_text option", ->
343 |     it_tokenizes "with custom text filter", "hello KEEP world", {
344 |       "he"
345 |       "el"
346 |       "ll"
347 |       "lo"
348 |       "ke"
349 |       "ee"
350 |       "ep"
351 |       "wo"
352 |       "or"
353 |       "rl"
354 |       "ld"
355 |     }, {
356 |       filter_text: (text) -> text\gsub("KEEP", "keep")
357 |     }
358 | 
359 |     it_tokenizes "filter that removes text", "hello remove world", {
360 |       "he"
361 |       "el"
362 |       "ll"
363 |       "lo"
364 |       "wo"
365 |       "or"
366 |       "rl"
367 |       "ld"
368 |     }, {
369 |       filter_text: (text) -> text\gsub("remove", "")
370 |     }
371 | 
372 |     it "returns empty when filter returns empty", ->
373 |       tokenizer = NgramTokenizer {
374 |         filter_text: (text) -> ""
375 |       }
376 |       tokens = tokenizer\tokenize_text "hello world"
377 |       assert.same {}, tokens
378 | 
379 |     it "returns empty when filter returns nil", ->
380 |       tokenizer = NgramTokenizer {
381 |         filter_text: (text) -> nil
382 |       }
383 |       tokens = tokenizer\tokenize_text "hello world"
384 |       assert.same {}, tokens
385 | 
386 |   describe "filter_tokens option", ->
387 |     it "with custom token filter", ->
388 |       tokenizer = NgramTokenizer {
389 |         filter_tokens: (tokens, opts) ->
390 |           filtered = {}
391 |           for token in *tokens
392 |             if token != "el"
393 |               table.insert filtered, token
394 |           filtered
395 |       }
396 |       tokens = tokenizer\tokenize_text "hello"
397 |       assert.same {"he", "ll", "lo"}, tokens
398 | 
399 |     it "filter can modify tokens", ->
400 |       tokenizer = NgramTokenizer {
401 |         filter_tokens: (tokens, opts) ->
402 |           modified = {}
403 |           for token in *tokens
404 |             table.insert modified, "prefix:#{token}"
405 |           modified
406 |       }
407 |       tokens = tokenizer\tokenize_text "hi"
408 |       assert.same {"prefix:hi"}, tokens
409 | 
410 |     it "filter receives opts parameter", ->
411 |       received_opts = nil
412 |       tokenizer = NgramTokenizer {
413 |         n: 3
414 |         filter_tokens: (tokens, opts) ->
415 |           received_opts = opts
416 |           tokens
417 |       }
418 |       tokenizer\tokenize_text "test"
419 |       assert.is_not_nil received_opts
420 |       assert.equal 3, received_opts.n
421 | 
422 |   describe "comprehensive examples", ->
423 |     it_tokenizes "sentence with mixed content", "The quick brown fox jumps!", {
424 |       "th"
425 |       "he"
426 |       "qu"
427 |       "ui"
428 |       "ic"
429 |       "ck"
430 |       "br"
431 |       "ro"
432 |       "ow"
433 |       "wn"
434 |       "fo"
435 |       "ox"
436 |       "ju"
437 |       "um"
438 |       "mp"
439 |       "ps"
440 |     }
441 | 
442 |     it_tokenizes "with trigrams on real text", "testing ngrams", {
443 |       "tes"
444 |       "est"
445 |       "sti"
446 |       "tin"
447 |       "ing"
448 |       "ngr"
449 |       "gra"
450 |       "ram"
451 |       "ams"
452 |     }, { n: 3 }
453 | 
454 |     it_tokenizes "real world example", "Machine Learning is amazing!", {
455 |       "ma"
456 |       "ac"
457 |       "ch"
458 |       "hi"
459 |       "in"
460 |       "ne"
461 |       "le"
462 |       "ea"
463 |       "ar"
464 |       "rn"
465 |       "ni"
466 |       "in"
467 |       "ng"
468 |       "is"
469 |       "am"
470 |       "ma"
471 |       "az"
472 |       "zi"
473 |       "in"
474 |       "ng"
475 |     }
476 | 
477 |   describe "build_grammar", ->
478 |     it "grammar parses words", ->
479 |       tokenizer = NgramTokenizer!
480 |       grammar = tokenizer\build_grammar!
481 |       words = grammar\match "hello world test"
482 |       assert.same {"hello", "world", "test"}, words
483 | 
484 |     it "grammar handles punctuation", ->
485 |       tokenizer = NgramTokenizer!
486 |       grammar = tokenizer\build_grammar!
487 |       words = grammar\match "hello, world! test?"
488 |       assert.same {"hello,", "world!", "test?"}, words
489 | 
490 |     it "grammar handles multiple spaces", ->
491 |       tokenizer = NgramTokenizer!
492 |       grammar = tokenizer\build_grammar!
493 |       words = grammar\match "hello    world"
494 |       assert.same {"hello", "world"}, words
495 | 
496 |     it "grammar handles tabs and newlines", ->
497 |       tokenizer = NgramTokenizer!
498 |       grammar = tokenizer\build_grammar!
499 |       words = grammar\match "hello\tworld\ntest"
500 |       assert.same {"hello", "world", "test"}, words
501 | 


--------------------------------------------------------------------------------
/lapis/bayes/text/stem.lua:
--------------------------------------------------------------------------------
  1 | local is_vowel
  2 | is_vowel = function(char)
  3 |   if not (char) then
  4 |     return false
  5 |   end
  6 |   char = char:lower()
  7 |   return char == 'a' or char == 'e' or char == 'i' or char == 'o' or char == 'u' or char == 'y'
  8 | end
  9 | local is_consonant
 10 | is_consonant = function(char)
 11 |   if not (char) then
 12 |     return false
 13 |   end
 14 |   return not is_vowel(char)
 15 | end
 16 | local is_vowel_wxy
 17 | is_vowel_wxy = function(char)
 18 |   if not (char) then
 19 |     return false
 20 |   end
 21 |   char = char:lower()
 22 |   return char == 'a' or char == 'e' or char == 'i' or char == 'o' or char == 'u' or char == 'y' or char == 'w' or char == 'x'
 23 | end
 24 | local is_valid_li
 25 | is_valid_li = function(char)
 26 |   if not (char) then
 27 |     return false
 28 |   end
 29 |   char = char:lower()
 30 |   return char == 'c' or char == 'd' or char == 'e' or char == 'g' or char == 'h' or char == 'k' or char == 'm' or char == 'n' or char == 'r' or char == 't'
 31 | end
 32 | local ends_with
 33 | ends_with = function(word, suffix)
 34 |   if #word < #suffix then
 35 |     return false
 36 |   end
 37 |   return word:sub(-#suffix) == suffix
 38 | end
 39 | local contains_vowel
 40 | contains_vowel = function(word)
 41 |   for i = 1, #word do
 42 |     if is_vowel(word:sub(i, i)) then
 43 |       return true
 44 |     end
 45 |   end
 46 |   return false
 47 | end
 48 | local replace_suffix
 49 | replace_suffix = function(word, suffix, replacement)
 50 |   if ends_with(word, suffix) then
 51 |     return word:sub(1, #word - #suffix) .. replacement
 52 |   else
 53 |     return word
 54 |   end
 55 | end
 56 | local get_suffix
 57 | get_suffix = function(word, pos)
 58 |   if pos > #word then
 59 |     return ""
 60 |   end
 61 |   return word:sub(pos)
 62 | end
 63 | local find_r1
 64 | find_r1 = function(word)
 65 |   if word:sub(1, 5) == "gener" then
 66 |     return 6
 67 |   elseif word:sub(1, 6) == "commun" then
 68 |     return 7
 69 |   elseif word:sub(1, 5) == "arsen" then
 70 |     return 6
 71 |   elseif word:sub(1, 4) == "past" then
 72 |     return 5
 73 |   elseif word:sub(1, 7) == "univers" then
 74 |     return 8
 75 |   elseif word:sub(1, 5) == "later" then
 76 |     return 6
 77 |   elseif word:sub(1, 5) == "emerg" then
 78 |     return 6
 79 |   elseif word:sub(1, 5) == "organ" then
 80 |     return 6
 81 |   end
 82 |   for i = 1, #word - 1 do
 83 |     if is_vowel(word:sub(i, i)) and is_consonant(word:sub(i + 1, i + 1)) then
 84 |       return i + 2
 85 |     end
 86 |   end
 87 |   return #word + 1
 88 | end
 89 | local find_r2
 90 | find_r2 = function(word)
 91 |   local r1_pos = find_r1(word)
 92 |   if r1_pos > #word then
 93 |     return #word + 1
 94 |   end
 95 |   for i = r1_pos, #word - 1 do
 96 |     if is_vowel(word:sub(i, i)) and is_consonant(word:sub(i + 1, i + 1)) then
 97 |       return i + 2
 98 |     end
 99 |   end
100 |   return #word + 1
101 | end
102 | local in_r1
103 | in_r1 = function(word, pos)
104 |   local r1 = find_r1(word)
105 |   return pos >= r1
106 | end
107 | local in_r2
108 | in_r2 = function(word, pos)
109 |   local r2 = find_r2(word)
110 |   return pos >= r2
111 | end
112 | local is_short_syllable_at
113 | is_short_syllable_at = function(word, pos)
114 |   if pos < 1 or pos > #word then
115 |     return false
116 |   end
117 |   local char = word:sub(pos, pos)
118 |   if not (is_vowel(char)) then
119 |     return false
120 |   end
121 |   if pos == 1 then
122 |     if #word > 1 then
123 |       local next_char = word:sub(2, 2)
124 |       return is_consonant(next_char)
125 |     end
126 |     return false
127 |   end
128 |   if pos < #word then
129 |     local prev_char = word:sub(pos - 1, pos - 1)
130 |     local next_char = word:sub(pos + 1, pos + 1)
131 |     if is_consonant(prev_char) and is_consonant(next_char) then
132 |       local next_lower = next_char:lower()
133 |       return next_lower ~= 'w' and next_lower ~= 'x' and next_char ~= 'Y'
134 |     end
135 |   end
136 |   return false
137 | end
138 | local ends_with_short_syllable
139 | ends_with_short_syllable = function(word)
140 |   if #word < 2 then
141 |     return false
142 |   end
143 |   if #word == 2 then
144 |     return is_vowel(word:sub(1, 1)) and is_consonant(word:sub(2, 2))
145 |   end
146 |   if #word >= 3 then
147 |     local c1 = word:sub(-3, -3)
148 |     local c2 = word:sub(-2, -2)
149 |     local c3 = word:sub(-1, -1)
150 |     if is_consonant(c1) and is_vowel(c2) and is_consonant(c3) then
151 |       local c3_lower = c3:lower()
152 |       return c3_lower ~= 'w' and c3_lower ~= 'x' and c3 ~= 'Y'
153 |     end
154 |   end
155 |   return false
156 | end
157 | local is_short_word
158 | is_short_word = function(word)
159 |   local r1 = find_r1(word)
160 |   if r1 > #word then
161 |     return true
162 |   end
163 |   if r1 == #word + 1 and ends_with_short_syllable(word) then
164 |     return true
165 |   end
166 |   return false
167 | end
168 | local prelude
169 | prelude = function(word)
170 |   if #word == 0 then
171 |     return word
172 |   end
173 |   if word:sub(1, 1) == "'" then
174 |     word = word:sub(2)
175 |   end
176 |   local result = { }
177 |   local y_found = false
178 |   for i = 1, #word do
179 |     local char = word:sub(i, i)
180 |     if char == 'y' then
181 |       if i == 1 then
182 |         table.insert(result, 'Y')
183 |         y_found = true
184 |       elseif i > 1 and is_vowel(word:sub(i - 1, i - 1)) then
185 |         table.insert(result, 'Y')
186 |         y_found = true
187 |       else
188 |         table.insert(result, char)
189 |       end
190 |     else
191 |       table.insert(result, char)
192 |     end
193 |   end
194 |   return table.concat(result), y_found
195 | end
196 | local postlude
197 | postlude = function(word, y_found)
198 |   if not (y_found) then
199 |     return word
200 |   end
201 |   return word:gsub('Y', 'y')
202 | end
203 | local exception1
204 | exception1 = function(word)
205 |   local exceptions = {
206 |     skis = "ski",
207 |     skies = "sky",
208 |     idly = "idl",
209 |     gently = "gentl",
210 |     ugly = "ugli",
211 |     early = "earli",
212 |     only = "onli",
213 |     singly = "singl",
214 |     sky = "sky",
215 |     news = "news",
216 |     howe = "howe",
217 |     atlas = "atlas",
218 |     cosmos = "cosmos",
219 |     bias = "bias",
220 |     andes = "andes"
221 |   }
222 |   return exceptions[word]
223 | end
224 | local step_1a
225 | step_1a = function(word)
226 |   if ends_with(word, "'s'") then
227 |     return word:sub(1, -4)
228 |   elseif ends_with(word, "'s") then
229 |     return word:sub(1, -3)
230 |   elseif ends_with(word, "'") then
231 |     return word:sub(1, -2)
232 |   end
233 |   if ends_with(word, "sses") then
234 |     return replace_suffix(word, "sses", "ss")
235 |   end
236 |   if ends_with(word, "ied") then
237 |     if #word > 4 then
238 |       return replace_suffix(word, "ied", "i")
239 |     else
240 |       return replace_suffix(word, "ied", "ie")
241 |     end
242 |   end
243 |   if ends_with(word, "ies") then
244 |     if #word > 4 then
245 |       return replace_suffix(word, "ies", "i")
246 |     else
247 |       return replace_suffix(word, "ies", "ie")
248 |     end
249 |   end
250 |   if ends_with(word, "s") and not ends_with(word, "us") and not ends_with(word, "ss") then
251 |     local stem = word:sub(1, -2)
252 |     if contains_vowel(stem) then
253 |       return stem
254 |     end
255 |   end
256 |   return word
257 | end
258 | local step_1b
259 | step_1b = function(word)
260 |   if ends_with(word, "eedly") then
261 |     local stem = word:sub(1, -6)
262 |     if in_r1(word, #stem + 1) then
263 |       if ends_with(stem, "proc") or ends_with(stem, "exc") or ends_with(stem, "succ") then
264 |         return word
265 |       end
266 |       return stem .. "ee"
267 |     end
268 |     return word
269 |   end
270 |   if ends_with(word, "eed") then
271 |     local stem = word:sub(1, -4)
272 |     if in_r1(word, #stem + 1) then
273 |       if ends_with(stem, "proc") or ends_with(stem, "exc") or ends_with(stem, "succ") then
274 |         return word
275 |       end
276 |       return stem .. "ee"
277 |     end
278 |     return word
279 |   end
280 |   local suffix_removed = false
281 |   local stem = word
282 |   if ends_with(word, "ingly") then
283 |     stem = word:sub(1, -6)
284 |     suffix_removed = true
285 |   elseif ends_with(word, "edly") then
286 |     stem = word:sub(1, -5)
287 |     suffix_removed = true
288 |   elseif ends_with(word, "ing") then
289 |     stem = word:sub(1, -4)
290 |     suffix_removed = true
291 |   elseif ends_with(word, "ed") then
292 |     stem = word:sub(1, -3)
293 |     suffix_removed = true
294 |   end
295 |   if suffix_removed then
296 |     if not (contains_vowel(stem)) then
297 |       return word
298 |     end
299 |     if ends_with(word, "ing") then
300 |       if ends_with(stem, "y") and #stem > 1 then
301 |         local prev = stem:sub(-2, -2)
302 |         if is_consonant(prev) and #stem == 2 then
303 |           return stem:sub(1, -2) .. "ie"
304 |         end
305 |       end
306 |       if ends_with(stem, "inn") or ends_with(stem, "out") or ends_with(stem, "cann") or ends_with(stem, "herr") or ends_with(stem, "earr") or ends_with(stem, "even") then
307 |         return word
308 |       end
309 |     end
310 |     if ends_with(stem, "at") or ends_with(stem, "bl") or ends_with(stem, "iz") then
311 |       return stem .. "e"
312 |     end
313 |     if #stem >= 2 then
314 |       local last = stem:sub(-1, -1)
315 |       local prev = stem:sub(-2, -2)
316 |       if last == prev and is_consonant(last) then
317 |         local last_lower = last:lower()
318 |         if not (last_lower == 'a' or last_lower == 'e' or last_lower == 'o') then
319 |           if last_lower == 'b' or last_lower == 'd' or last_lower == 'f' or last_lower == 'g' or last_lower == 'm' or last_lower == 'n' or last_lower == 'p' or last_lower == 'r' or last_lower == 't' then
320 |             return stem:sub(1, -2)
321 |           end
322 |         end
323 |       end
324 |     end
325 |     if in_r1(word, #stem + 1) and ends_with_short_syllable(stem) then
326 |       return stem .. "e"
327 |     end
328 |     return stem
329 |   end
330 |   return word
331 | end
332 | local step_1c
333 | step_1c = function(word)
334 |   if #word > 2 then
335 |     local last = word:sub(-1, -1)
336 |     local prev = word:sub(-2, -2)
337 |     if (last == 'y' or last == 'Y') and is_consonant(prev) then
338 |       return word:sub(1, -2) .. "i"
339 |     end
340 |   end
341 |   return word
342 | end
343 | local step_2
344 | step_2 = function(word)
345 |   local mappings = {
346 |     {
347 |       "ational",
348 |       "ate"
349 |     },
350 |     {
351 |       "tional",
352 |       "tion"
353 |     },
354 |     {
355 |       "enci",
356 |       "ence"
357 |     },
358 |     {
359 |       "anci",
360 |       "ance"
361 |     },
362 |     {
363 |       "abli",
364 |       "able"
365 |     },
366 |     {
367 |       "entli",
368 |       "ent"
369 |     },
370 |     {
371 |       "ization",
372 |       "ize"
373 |     },
374 |     {
375 |       "izer",
376 |       "ize"
377 |     },
378 |     {
379 |       "ation",
380 |       "ate"
381 |     },
382 |     {
383 |       "ator",
384 |       "ate"
385 |     },
386 |     {
387 |       "alism",
388 |       "al"
389 |     },
390 |     {
391 |       "aliti",
392 |       "al"
393 |     },
394 |     {
395 |       "alli",
396 |       "al"
397 |     },
398 |     {
399 |       "fulness",
400 |       "ful"
401 |     },
402 |     {
403 |       "ousli",
404 |       "ous"
405 |     },
406 |     {
407 |       "ousness",
408 |       "ous"
409 |     },
410 |     {
411 |       "iveness",
412 |       "ive"
413 |     },
414 |     {
415 |       "iviti",
416 |       "ive"
417 |     },
418 |     {
419 |       "biliti",
420 |       "ble"
421 |     },
422 |     {
423 |       "bli",
424 |       "ble"
425 |     },
426 |     {
427 |       "fulli",
428 |       "ful"
429 |     },
430 |     {
431 |       "lessli",
432 |       "less"
433 |     }
434 |   }
435 |   for _index_0 = 1, #mappings do
436 |     local pair = mappings[_index_0]
437 |     local suffix, replacement = pair[1], pair[2]
438 |     if ends_with(word, suffix) then
439 |       local stem = word:sub(1, #word - #suffix)
440 |       if in_r1(word, #stem + 1) then
441 |         return stem .. replacement
442 |       end
443 |     end
444 |   end
445 |   if ends_with(word, "ogi") then
446 |     local stem = word:sub(1, -4)
447 |     if in_r1(word, #stem + 1) and ends_with(stem, "l") then
448 |       return stem .. "og"
449 |     end
450 |   end
451 |   if ends_with(word, "li") then
452 |     local stem = word:sub(1, -3)
453 |     if in_r1(word, #stem + 1) and #stem > 0 then
454 |       local last = stem:sub(-1, -1)
455 |       if is_valid_li(last) then
456 |         return stem
457 |       end
458 |     end
459 |   end
460 |   if ends_with(word, "ogist") then
461 |     local stem = word:sub(1, -5)
462 |     if in_r1(word, #stem + 1) then
463 |       return stem .. "og"
464 |     end
465 |   end
466 |   return word
467 | end
468 | local step_3
469 | step_3 = function(word)
470 |   local mappings = {
471 |     {
472 |       "ational",
473 |       "ate"
474 |     },
475 |     {
476 |       "tional",
477 |       "tion"
478 |     },
479 |     {
480 |       "alize",
481 |       "al"
482 |     },
483 |     {
484 |       "icate",
485 |       "ic"
486 |     },
487 |     {
488 |       "iciti",
489 |       "ic"
490 |     },
491 |     {
492 |       "ical",
493 |       "ic"
494 |     },
495 |     {
496 |       "ful",
497 |       ""
498 |     },
499 |     {
500 |       "ness",
501 |       ""
502 |     }
503 |   }
504 |   for _index_0 = 1, #mappings do
505 |     local pair = mappings[_index_0]
506 |     local suffix, replacement = pair[1], pair[2]
507 |     if ends_with(word, suffix) then
508 |       local stem = word:sub(1, #word - #suffix)
509 |       if in_r1(word, #stem + 1) then
510 |         return stem .. replacement
511 |       end
512 |     end
513 |   end
514 |   if ends_with(word, "ative") then
515 |     local stem = word:sub(1, -6)
516 |     if in_r2(word, #stem + 1) then
517 |       return stem
518 |     end
519 |   end
520 |   return word
521 | end
522 | local step_4
523 | step_4 = function(word)
524 |   local suffixes = {
525 |     "al",
526 |     "ance",
527 |     "ence",
528 |     "er",
529 |     "ic",
530 |     "able",
531 |     "ible",
532 |     "ant",
533 |     "ement",
534 |     "ment",
535 |     "ent",
536 |     "ism",
537 |     "ate",
538 |     "iti",
539 |     "ous",
540 |     "ive",
541 |     "ize"
542 |   }
543 |   for _index_0 = 1, #suffixes do
544 |     local suffix = suffixes[_index_0]
545 |     if ends_with(word, suffix) then
546 |       local stem = word:sub(1, #word - #suffix)
547 |       if in_r2(word, #stem + 1) then
548 |         return stem
549 |       end
550 |     end
551 |   end
552 |   if ends_with(word, "ion") then
553 |     local stem = word:sub(1, -4)
554 |     if in_r2(word, #stem + 1) and #stem > 0 then
555 |       local last = stem:sub(-1, -1)
556 |       if last == 's' or last == 't' then
557 |         return stem
558 |       end
559 |     end
560 |   end
561 |   return word
562 | end
563 | local step_5
564 | step_5 = function(word)
565 |   if ends_with(word, "e") then
566 |     local stem = word:sub(1, -2)
567 |     if in_r2(word, #stem + 1) then
568 |       return stem
569 |     end
570 |     if in_r1(word, #stem + 1) and not ends_with_short_syllable(stem) then
571 |       return stem
572 |     end
573 |   end
574 |   if ends_with(word, "ll") and in_r2(word, #word) then
575 |     return word:sub(1, -2)
576 |   end
577 |   return word
578 | end
579 | local stem_word
580 | stem_word = function(word)
581 |   if not (word and type(word) == "string") then
582 |     return word
583 |   end
584 |   if #word < 3 then
585 |     return word
586 |   end
587 |   word = word:lower()
588 |   local exception = exception1(word)
589 |   if exception then
590 |     return exception
591 |   end
592 |   if #word < 3 then
593 |     return word
594 |   end
595 |   local y_found
596 |   word, y_found = prelude(word)
597 |   word = step_1a(word)
598 |   word = step_1b(word)
599 |   word = step_1c(word)
600 |   word = step_2(word)
601 |   word = step_3(word)
602 |   word = step_4(word)
603 |   word = step_5(word)
604 |   word = postlude(word, y_found)
605 |   return word
606 | end
607 | return {
608 |   stem_word = stem_word
609 | }
610 | 


--------------------------------------------------------------------------------
/lapis/bayes/text/stem.moon:
--------------------------------------------------------------------------------
  1 | -- Porter Stemmer implementation in MoonScript
  2 | -- Based on the Snowball English stemmer algorithm
  3 | -- https://github.com/snowballstem/snowball/blob/master/algorithms/english.sbl
  4 | --
  5 | -- This implementation is derived from the Snowball stemming algorithms
  6 | -- Copyright (c) 2001, Dr Martin Porter
  7 | -- Copyright (c) 2004,2005, Richard Boulton
  8 | -- Copyright (c) 2013, Yoshiki Shibukawa
  9 | -- Copyright (c) 2006,2007,2009,2010,2011,2014-2019, Olly Betts
 10 | -- All rights reserved.
 11 | --
 12 | -- Redistribution and use in source and binary forms, with or without
 13 | -- modification, are permitted provided that the following conditions
 14 | -- are met:
 15 | --
 16 | --   1. Redistributions of source code must retain the above copyright notice,
 17 | --      this list of conditions and the following disclaimer.
 18 | --   2. Redistributions in binary form must reproduce the above copyright notice,
 19 | --      this list of conditions and the following disclaimer in the documentation
 20 | --      and/or other materials provided with the distribution.
 21 | --   3. Neither the name of the Snowball project nor the names of its contributors
 22 | --      may be used to endorse or promote products derived from this software
 23 | --      without specific prior written permission.
 24 | --
 25 | -- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 26 | -- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 27 | -- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 28 | -- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 29 | -- ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 30 | -- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 31 | -- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 32 | -- ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 33 | -- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 34 | -- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 35 | 
 36 | -- Character group definitions
 37 | is_vowel = (char) ->
 38 |   return false unless char
 39 |   char = char\lower!
 40 |   char == 'a' or char == 'e' or char == 'i' or char == 'o' or char == 'u' or char == 'y'
 41 | 
 42 | is_consonant = (char) ->
 43 |   return false unless char
 44 |   not is_vowel char
 45 | 
 46 | is_vowel_wxy = (char) ->
 47 |   return false unless char
 48 |   char = char\lower!
 49 |   char == 'a' or char == 'e' or char == 'i' or char == 'o' or char == 'u' or char == 'y' or char == 'w' or char == 'x'
 50 | 
 51 | is_valid_li = (char) ->
 52 |   return false unless char
 53 |   char = char\lower!
 54 |   char == 'c' or char == 'd' or char == 'e' or char == 'g' or char == 'h' or char == 'k' or char == 'm' or char == 'n' or char == 'r' or char == 't'
 55 | 
 56 | -- String utility functions
 57 | ends_with = (word, suffix) ->
 58 |   return false if #word < #suffix
 59 |   word\sub(-#suffix) == suffix
 60 | 
 61 | contains_vowel = (word) ->
 62 |   for i = 1, #word
 63 |     return true if is_vowel word\sub(i, i)
 64 |   false
 65 | 
 66 | -- Replace suffix with replacement
 67 | replace_suffix = (word, suffix, replacement) ->
 68 |   if ends_with word, suffix
 69 |     word\sub(1, #word - #suffix) .. replacement
 70 |   else
 71 |     word
 72 | 
 73 | -- Get suffix starting at position
 74 | get_suffix = (word, pos) ->
 75 |   return "" if pos > #word
 76 |   word\sub pos
 77 | 
 78 | -- Region detection
 79 | -- Find R1: the region after the first non-vowel following a vowel
 80 | find_r1 = (word) ->
 81 |   -- Special handling for common prefixes
 82 |   if word\sub(1, 5) == "gener"
 83 |     return 6
 84 |   elseif word\sub(1, 6) == "commun"
 85 |     return 7
 86 |   elseif word\sub(1, 5) == "arsen"
 87 |     return 6
 88 |   elseif word\sub(1, 4) == "past"
 89 |     return 5
 90 |   elseif word\sub(1, 7) == "univers"
 91 |     return 8
 92 |   elseif word\sub(1, 5) == "later"
 93 |     return 6
 94 |   elseif word\sub(1, 5) == "emerg"
 95 |     return 6
 96 |   elseif word\sub(1, 5) == "organ"
 97 |     return 6
 98 | 
 99 |   -- Standard R1 detection: find first V followed by NV
100 |   for i = 1, #word - 1
101 |     if is_vowel(word\sub(i, i)) and is_consonant(word\sub(i + 1, i + 1))
102 |       return i + 2
103 | 
104 |   #word + 1
105 | 
106 | -- Find R2: the region after the first non-vowel following a vowel in R1
107 | find_r2 = (word) ->
108 |   r1_pos = find_r1 word
109 |   return #word + 1 if r1_pos > #word
110 | 
111 |   -- Find V followed by NV in R1
112 |   for i = r1_pos, #word - 1
113 |     if is_vowel(word\sub(i, i)) and is_consonant(word\sub(i + 1, i + 1))
114 |       return i + 2
115 | 
116 |   #word + 1
117 | 
118 | -- Test if position is at R1
119 | in_r1 = (word, pos) ->
120 |   r1 = find_r1 word
121 |   pos >= r1
122 | 
123 | -- Test if position is at R2
124 | in_r2 = (word, pos) ->
125 |   r2 = find_r2 word
126 |   pos >= r2
127 | 
128 | -- Test for short syllable
129 | -- A short syllable is either (a) a vowel followed by a non-vowel other than w, x or Y
130 | -- and preceded by a non-vowel, or (b) a vowel at the beginning of the word followed
131 | -- by a non-vowel.
132 | is_short_syllable_at = (word, pos) ->
133 |   return false if pos < 1 or pos > #word
134 | 
135 |   char = word\sub(pos, pos)
136 |   return false unless is_vowel char
137 | 
138 |   if pos == 1
139 |     -- Case (b): vowel at beginning followed by non-vowel
140 |     if #word > 1
141 |       next_char = word\sub(2, 2)
142 |       return is_consonant next_char
143 |     return false
144 | 
145 |   -- Case (a): non-vowel, vowel, non-vowel (not w,x,Y)
146 |   if pos < #word
147 |     prev_char = word\sub(pos - 1, pos - 1)
148 |     next_char = word\sub(pos + 1, pos + 1)
149 | 
150 |     if is_consonant(prev_char) and is_consonant(next_char)
151 |       next_lower = next_char\lower!
152 |       return next_lower != 'w' and next_lower != 'x' and next_char != 'Y'
153 | 
154 |   false
155 | 
156 | -- Test if word ends with short syllable
157 | ends_with_short_syllable = (word) ->
158 |   return false if #word < 2
159 | 
160 |   -- Check last two characters for pattern
161 |   if #word == 2
162 |     return is_vowel(word\sub(1, 1)) and is_consonant(word\sub(2, 2))
163 | 
164 |   -- Check last three characters for non-vowel, vowel, non-vowel (not w,x,Y)
165 |   if #word >= 3
166 |     c1 = word\sub(-3, -3)
167 |     c2 = word\sub(-2, -2)
168 |     c3 = word\sub(-1, -1)
169 | 
170 |     if is_consonant(c1) and is_vowel(c2) and is_consonant(c3)
171 |       c3_lower = c3\lower!
172 |       return c3_lower != 'w' and c3_lower != 'x' and c3 != 'Y'
173 | 
174 |   false
175 | 
176 | -- Test for short word: word is short if it consists of a short syllable
177 | -- and nothing else, or if R1 is null
178 | is_short_word = (word) ->
179 |   r1 = find_r1 word
180 |   return true if r1 > #word
181 | 
182 |   -- Also check if ends with short syllable at beginning of R1
183 |   if r1 == #word + 1 and ends_with_short_syllable word
184 |     return true
185 | 
186 |   false
187 | 
188 | -- Prelude: handle initial Y and y after vowel
189 | prelude = (word) ->
190 |   return word if #word == 0
191 | 
192 |   -- Remove initial apostrophe
193 |   word = word\sub(2) if word\sub(1, 1) == "'"
194 | 
195 |   result = {}
196 |   y_found = false
197 | 
198 |   for i = 1, #word
199 |     char = word\sub(i, i)
200 | 
201 |     if char == 'y'
202 |       -- Convert to Y if at beginning or after vowel
203 |       if i == 1
204 |         table.insert result, 'Y'
205 |         y_found = true
206 |       elseif i > 1 and is_vowel(word\sub(i - 1, i - 1))
207 |         table.insert result, 'Y'
208 |         y_found = true
209 |       else
210 |         table.insert result, char
211 |     else
212 |       table.insert result, char
213 | 
214 |   table.concat(result), y_found
215 | 
216 | -- Postlude: convert Y back to y
217 | postlude = (word, y_found) ->
218 |   return word unless y_found
219 |   word\gsub('Y', 'y')
220 | 
221 | -- Exception list 1: special cases
222 | exception1 = (word) ->
223 |   exceptions = {
224 |     skis: "ski"
225 |     skies: "sky"
226 |     idly: "idl"
227 |     gently: "gentl"
228 |     ugly: "ugli"
229 |     early: "earli"
230 |     only: "onli"
231 |     singly: "singl"
232 |     sky: "sky"
233 |     news: "news"
234 |     howe: "howe"
235 |     atlas: "atlas"
236 |     cosmos: "cosmos"
237 |     bias: "bias"
238 |     andes: "andes"
239 |   }
240 | 
241 |   exceptions[word]
242 | 
243 | -- Step 1a: handle plural forms
244 | step_1a = (word) ->
245 |   -- Handle apostrophe forms
246 |   if ends_with word, "'s'"
247 |     return word\sub(1, -4)
248 |   elseif ends_with word, "'s"
249 |     return word\sub(1, -3)
250 |   elseif ends_with word, "'"
251 |     return word\sub(1, -2)
252 | 
253 |   -- Handle sses -> ss
254 |   if ends_with word, "sses"
255 |     return replace_suffix word, "sses", "ss"
256 | 
257 |   -- Handle ied, ies
258 |   if ends_with word, "ied"
259 |     if #word > 4
260 |       return replace_suffix word, "ied", "i"
261 |     else
262 |       return replace_suffix word, "ied", "ie"
263 | 
264 |   if ends_with word, "ies"
265 |     if #word > 4
266 |       return replace_suffix word, "ies", "i"
267 |     else
268 |       return replace_suffix word, "ies", "ie"
269 | 
270 |   -- Handle s (but not us or ss)
271 |   if ends_with(word, "s") and not ends_with(word, "us") and not ends_with(word, "ss")
272 |     -- Only remove s if preceded by vowel somewhere in word
273 |     stem = word\sub(1, -2)
274 |     if contains_vowel stem
275 |       return stem
276 | 
277 |   word
278 | 
279 | -- Step 1b: handle ed, ing, eed forms
280 | step_1b = (word) ->
281 |   -- Handle eed, eedly
282 |   if ends_with word, "eedly"
283 |     stem = word\sub(1, -6)
284 |     if in_r1 word, #stem + 1
285 |       -- Check for special cases
286 |       if ends_with(stem, "proc") or ends_with(stem, "exc") or ends_with(stem, "succ")
287 |         return word
288 |       return stem .. "ee"
289 |     return word
290 | 
291 |   if ends_with word, "eed"
292 |     stem = word\sub(1, -4)
293 |     if in_r1 word, #stem + 1
294 |       if ends_with(stem, "proc") or ends_with(stem, "exc") or ends_with(stem, "succ")
295 |         return word
296 |       return stem .. "ee"
297 |     return word
298 | 
299 |   -- Handle ed, edly, ing, ingly
300 |   suffix_removed = false
301 |   stem = word
302 | 
303 |   if ends_with word, "ingly"
304 |     stem = word\sub(1, -6)
305 |     suffix_removed = true
306 |   elseif ends_with word, "edly"
307 |     stem = word\sub(1, -5)
308 |     suffix_removed = true
309 |   elseif ends_with word, "ing"
310 |     stem = word\sub(1, -4)
311 |     suffix_removed = true
312 |   elseif ends_with word, "ed"
313 |     stem = word\sub(1, -3)
314 |     suffix_removed = true
315 | 
316 |   if suffix_removed
317 |     -- Only proceed if stem contains vowel
318 |     return word unless contains_vowel stem
319 | 
320 |     -- Special handling for ing forms
321 |     if ends_with word, "ing"
322 |       -- dying -> die, lying -> lie, tying -> tie
323 |       if ends_with(stem, "y") and #stem > 1
324 |         prev = stem\sub(-2, -2)
325 |         if is_consonant(prev) and #stem == 2
326 |           return stem\sub(1, -2) .. "ie"
327 | 
328 |       -- inning, outing, canning stay as is
329 |       if ends_with(stem, "inn") or ends_with(stem, "out") or ends_with(stem, "cann") or ends_with(stem, "herr") or ends_with(stem, "earr") or ends_with(stem, "even")
330 |         return word
331 | 
332 |     -- Post-processing based on stem ending
333 |     if ends_with(stem, "at") or ends_with(stem, "bl") or ends_with(stem, "iz")
334 |       return stem .. "e"
335 | 
336 |     -- Handle double consonants (not aeo)
337 |     if #stem >= 2
338 |       last = stem\sub(-1, -1)
339 |       prev = stem\sub(-2, -2)
340 |       if last == prev and is_consonant(last)
341 |         last_lower = last\lower!
342 |         unless last_lower == 'a' or last_lower == 'e' or last_lower == 'o'
343 |           -- Remove one of the double consonants (but check for special cases)
344 |           if last_lower == 'b' or last_lower == 'd' or last_lower == 'f' or last_lower == 'g' or last_lower == 'm' or last_lower == 'n' or last_lower == 'p' or last_lower == 'r' or last_lower == 't'
345 |             return stem\sub(1, -2)
346 | 
347 |     -- If R1 is null and ends with short syllable, add e
348 |     if in_r1(word, #stem + 1) and ends_with_short_syllable stem
349 |       return stem .. "e"
350 | 
351 |     return stem
352 | 
353 |   word
354 | 
355 | -- Step 1c: replace suffix y or Y by i if preceded by non-vowel which is not at the beginning
356 | step_1c = (word) ->
357 |   if #word > 2
358 |     last = word\sub(-1, -1)
359 |     prev = word\sub(-2, -2)
360 | 
361 |     if (last == 'y' or last == 'Y') and is_consonant(prev)
362 |       return word\sub(1, -2) .. "i"
363 | 
364 |   word
365 | 
366 | -- Step 2: suffix removal for derivational suffixes
367 | step_2 = (word) ->
368 |   mappings = {
369 |     {"ational", "ate"}
370 |     {"tional", "tion"}
371 |     {"enci", "ence"}
372 |     {"anci", "ance"}
373 |     {"abli", "able"}
374 |     {"entli", "ent"}
375 |     {"ization", "ize"}
376 |     {"izer", "ize"}
377 |     {"ation", "ate"}
378 |     {"ator", "ate"}
379 |     {"alism", "al"}
380 |     {"aliti", "al"}
381 |     {"alli", "al"}
382 |     {"fulness", "ful"}
383 |     {"ousli", "ous"}
384 |     {"ousness", "ous"}
385 |     {"iveness", "ive"}
386 |     {"iviti", "ive"}
387 |     {"biliti", "ble"}
388 |     {"bli", "ble"}
389 |     {"fulli", "ful"}
390 |     {"lessli", "less"}
391 |   }
392 | 
393 |   for pair in *mappings
394 |     suffix, replacement = pair[1], pair[2]
395 |     if ends_with word, suffix
396 |       stem = word\sub(1, #word - #suffix)
397 |       if in_r1 word, #stem + 1
398 |         return stem .. replacement
399 | 
400 |   -- Special case: ogi -> og (when preceded by l)
401 |   if ends_with word, "ogi"
402 |     stem = word\sub(1, -4)
403 |     if in_r1(word, #stem + 1) and ends_with(stem, "l")
404 |       return stem .. "og"
405 | 
406 |   -- Special case: li -> delete (when preceded by valid_li)
407 |   if ends_with word, "li"
408 |     stem = word\sub(1, -3)
409 |     if in_r1(word, #stem + 1) and #stem > 0
410 |       last = stem\sub(-1, -1)
411 |       if is_valid_li last
412 |         return stem
413 | 
414 |   -- Special case: ogist -> og
415 |   if ends_with word, "ogist"
416 |     stem = word\sub(1, -5)
417 |     if in_r1 word, #stem + 1
418 |       return stem .. "og"
419 | 
420 |   word
421 | 
422 | -- Step 3: suffix removal
423 | step_3 = (word) ->
424 |   mappings = {
425 |     {"ational", "ate"}
426 |     {"tional", "tion"}
427 |     {"alize", "al"}
428 |     {"icate", "ic"}
429 |     {"iciti", "ic"}
430 |     {"ical", "ic"}
431 |     {"ful", ""}
432 |     {"ness", ""}
433 |   }
434 | 
435 |   for pair in *mappings
436 |     suffix, replacement = pair[1], pair[2]
437 |     if ends_with word, suffix
438 |       stem = word\sub(1, #word - #suffix)
439 |       if in_r1 word, #stem + 1
440 |         return stem .. replacement
441 | 
442 |   -- Special case: ative -> delete (in R2)
443 |   if ends_with word, "ative"
444 |     stem = word\sub(1, -6)
445 |     if in_r2 word, #stem + 1
446 |       return stem
447 | 
448 |   word
449 | 
450 | -- Step 4: suffix removal
451 | step_4 = (word) ->
452 |   suffixes = {
453 |     "al", "ance", "ence", "er", "ic", "able", "ible",
454 |     "ant", "ement", "ment", "ent", "ism", "ate",
455 |     "iti", "ous", "ive", "ize"
456 |   }
457 | 
458 |   for suffix in *suffixes
459 |     if ends_with word, suffix
460 |       stem = word\sub(1, #word - #suffix)
461 |       if in_r2 word, #stem + 1
462 |         return stem
463 | 
464 |   -- Special case: ion -> delete (when preceded by s or t in R2)
465 |   if ends_with word, "ion"
466 |     stem = word\sub(1, -4)
467 |     if in_r2(word, #stem + 1) and #stem > 0
468 |       last = stem\sub(-1, -1)
469 |       if last == 's' or last == 't'
470 |         return stem
471 | 
472 |   word
473 | 
474 | -- Step 5: suffix removal
475 | step_5 = (word) ->
476 |   -- Step 5a: remove trailing e
477 |   if ends_with word, "e"
478 |     stem = word\sub(1, -2)
479 | 
480 |     -- Delete if in R2
481 |     if in_r2 word, #stem + 1
482 |       return stem
483 | 
484 |     -- Delete if in R1 and not preceded by short syllable
485 |     if in_r1(word, #stem + 1) and not ends_with_short_syllable(stem)
486 |       return stem
487 | 
488 |   -- Step 5b: remove trailing l
489 |   if ends_with(word, "ll") and in_r2(word, #word)
490 |     return word\sub(1, -2)
491 | 
492 |   word
493 | 
494 | -- Main stemming function
495 | stem_word = (word) ->
496 |   return word unless word and type(word) == "string"
497 |   return word if #word < 3
498 | 
499 |   word = word\lower!
500 | 
501 |   -- Check exceptions first
502 |   exception = exception1 word
503 |   return exception if exception
504 | 
505 |   -- If word is too short, return as-is
506 |   return word if #word < 3
507 | 
508 |   -- Run through stemming steps
509 |   word, y_found = prelude word
510 | 
511 |   word = step_1a word
512 |   word = step_1b word
513 |   word = step_1c word
514 |   word = step_2 word
515 |   word = step_3 word
516 |   word = step_4 word
517 |   word = step_5 word
518 | 
519 |   word = postlude word, y_found
520 | 
521 |   word
522 | 
523 | {
524 |   :stem_word
525 | }
526 | 


--------------------------------------------------------------------------------
/lapis/bayes/tokenizers/spam.moon:
--------------------------------------------------------------------------------
  1 | unpack_fn = table.unpack or unpack
  2 | 
  3 | punycode = require "lapis.bayes.text.punycode"
  4 | import Extractor from require "web_sanitize.html"
  5 | types = require "lapis.validate.types"
  6 | 
  7 | import cjk_character from require "lapis.bayes.text.utf8"
  8 | 
  9 | extract_text = Extractor {
 10 |   escape_html: false
 11 | }
 12 | 
 13 | normalize_number = (value) ->
 14 |   return unless value and value != ""
 15 | 
 16 |   normalized = value\gsub("[,%s]", "")
 17 |   digits_only = normalized\gsub("[^%d]", "")
 18 |   return if digits_only == ""
 19 | 
 20 |   normalized
 21 | 
 22 | -- NOTE: this only works with ASCII punctuation characters, be careful when
 23 | -- updating punct_pattern if it's going to include unicode punctuation
 24 | handle_punct = (chars) ->
 25 |   char = chars\sub 1, 1
 26 |   {tag: "punct", value: char .. tostring(#chars)}
 27 | 
 28 | handle_invalid_byte = (byte) ->
 29 |   {tag: "invalid_byte", value: tostring(string.byte(byte))}
 30 | 
 31 | -- return new array with order shuffled by dithering
 32 | -- e: dither factor
 33 | -- https://buildingrecommenders.wordpress.com/2015/11/11/dithering/
 34 | dithered = do
 35 |   -- random normal box muller
 36 |   gn = (sd=1, mean=0, r=math.random) ->
 37 |     local x1, x2, w, y1, y2
 38 |     while true
 39 |       x1 = 2 * r! - 1
 40 |       x2 = 2 * r! - 1
 41 |       w = x1^2 + x2^2
 42 |       break if w < 1
 43 | 
 44 |     w = math.sqrt -2 * math.log(w) / 2
 45 |     y1 = x1 * w
 46 |     y2 = x2 * w
 47 | 
 48 |     y1 * sd + mean
 49 | 
 50 |   dither_score = (rank, e) ->
 51 |     math.log(rank) + gn(math.log(e))
 52 | 
 53 |   (items, e=1.5) ->
 54 |     rows = for i, item in ipairs items
 55 |       {dither_score(i, e), item}
 56 | 
 57 |     table.sort rows, (a, b) ->
 58 |       a[1] < b[1]
 59 | 
 60 |     [row[2] for row in *rows]
 61 | 
 62 | 
 63 | -- spam tokenizer with support for domains, emails, currencies, and more
 64 | -- opts = {
 65 | --   filter_text: function -- function to pre-filter text, returns new text
 66 | --   min_word_length: number -- minimum length of word (default 2)
 67 | --   max_word_length: number -- maximum length of word (default 32)
 68 | --   ignore_words: table -- table of words to ignore
 69 | --   stem_words: bool -- enable word stemming
 70 | --   unaccent: bool -- enable unaccenting (default true)
 71 | --   dedupe: bool -- enable deduplication (default true)
 72 | --   ignore_tokens: table -- table of tokens to ignore eg. {"my_token" = false}
 73 | --   ignore_domains: {string} -- domains to ignore (`example.com` exact, `.example.com` includes subdomains)
 74 | --   sample_at_most: number -- limit number of sampled tokens
 75 | --   dither: bool -- enable dithering when sampling (default true)
 76 | --   bigram_tokens: bool -- enable bigram generation
 77 | --   filter_tokens: function -- function to filter tokens, called at end with (tokens, opts)
 78 | --   domain_tokens_first: bool -- move domain tokens before all other tokens (default false)
 79 | --   split_cjk: -- split chinese, korean, japanese characters to be individual words
 80 | -- }
 81 | class SpamTokenizer extends require "lapis.bayes.tokenizers.base"
 82 |   new: (@opts = {}) =>
 83 | 
 84 |   tagged_token_to_string: (token) =>
 85 |     "#{token.tag}:#{token.value}"
 86 | 
 87 |   normalize_domain_string: (domain) =>
 88 |     return unless domain and domain != ""
 89 |     domain = tostring domain
 90 |     domain = domain\gsub("^%s+", "")\gsub("%s+$", "")
 91 |     domain = domain\gsub("%.+$", "")
 92 |     return if domain == ""
 93 | 
 94 |     labels = {}
 95 |     for label in domain\gmatch "[^%.]+"
 96 |       return if label == ""
 97 |       encoded = punycode.punycode_encode label
 98 |       encoded or= label
 99 |       table.insert labels, encoded\lower!
100 | 
101 |     return unless next labels
102 |     table.concat labels, "."
103 | 
104 |   build_ignored_domains: =>
105 |     entries = @opts.ignore_domains
106 |     return false unless entries and #entries > 0
107 | 
108 |     exact = {}
109 |     suffix = {}
110 | 
111 |     for domain in *entries
112 |       continue unless type(domain) == "string"
113 |       domain = domain\gsub("^%s+", "")\gsub("%s+$", "")
114 |       continue if domain == ""
115 | 
116 |       is_suffix = domain\sub(1, 1) == "."
117 |       domain = domain\sub(2) if is_suffix
118 |       continue if domain == ""
119 | 
120 |       normalized = @normalize_domain_string domain
121 |       continue unless normalized
122 | 
123 |       if is_suffix
124 |         suffix[normalized] = true
125 |       else
126 |         exact[normalized] = true
127 | 
128 |     return false unless next(exact) or next(suffix)
129 | 
130 |     {
131 |       exact: exact
132 |       suffix: suffix
133 |     }
134 | 
135 |   should_ignore_domain: (domain) =>
136 |     return false unless @opts.ignore_domains
137 | 
138 |     if @ignored_domains == nil
139 |       @ignored_domains = @build_ignored_domains!
140 | 
141 | 
142 |     return false unless @ignored_domains
143 |     normalized = @normalize_domain_string domain
144 |     return false unless normalized
145 | 
146 |     if @ignored_domains.exact[normalized]
147 |       return true
148 | 
149 |     for suffix in pairs @ignored_domains.suffix
150 |       return true if normalized == suffix
151 |       if #normalized > #suffix
152 |         if normalized\sub(-(#suffix + 1)) == ".#{suffix}"
153 |           return true
154 | 
155 |     false
156 | 
157 |   build_grammar: =>
158 |     import P, S, R, C, Cg, Ct from require "lpeg"
159 |     utf8 = require "lapis.util.utf8"
160 | 
161 |     min_len = @opts.min_word_length or 2
162 |     max_len = @opts.max_word_length or 32
163 |     ignore_words = @opts.ignore_words
164 | 
165 |     truncate = types.truncated_text max_len
166 | 
167 |     stem = if @opts.stem_words
168 |       require("lapis.bayes.text.stem").stem_word
169 | 
170 |     case_insensitive = (text) ->
171 |       out = nil
172 |       for char in text\gmatch "."
173 |         lower = char\lower!
174 |         upper = char\upper!
175 |         pattern = if lower == upper
176 |           P char
177 |         else
178 |           S "#{lower}#{upper}"
179 | 
180 |         out = if out
181 |           out * pattern
182 |         else
183 |           pattern
184 | 
185 |       out or P(false)
186 | 
187 |     normalize_word = (word) ->
188 |       return unless word and word != ""
189 | 
190 |       word = word\lower!
191 |       word = word\gsub("'+", "")
192 | 
193 |       return if #word < min_len
194 |       if #word > max_len
195 |         word = truncate\transform word
196 |       return if ignore_words and ignore_words[word]
197 | 
198 |       word
199 | 
200 |     handle_domain_token = (domain) ->
201 |       -- convert subdomains to punycode
202 |       labels = for label in domain\gmatch "[^%.]+"
203 |         encoded = punycode.punycode_encode label
204 |         if #encoded > max_len
205 |           truncate\transform encoded
206 |         else
207 |           encoded
208 | 
209 |       tokens = {
210 |         {tag: "domain", value: truncate\transform table.concat(labels, ".")\lower!}
211 |       }
212 | 
213 |       -- Generate hierarchical domain tokens with leading dots for subdomains
214 |       if #labels >= 2
215 |         for i = 2, #labels
216 |           suffix = table.concat [labels[j] for j = i, #labels], "."
217 |           table.insert tokens, {tag: "domain", value: truncate\transform ".#{suffix\lower!}"}
218 | 
219 |       unpack_fn tokens
220 | 
221 |     extract_url_words = (...) ->
222 |       out = {}
223 |       for part in *{...}
224 |         continue unless part and #part > 0
225 | 
226 |         -- Strip leading URL punctuation like / ? #
227 |         part = part\gsub("^[:/?#]+", "")
228 |         continue if part == ""
229 | 
230 |         -- Treat underscores and other punctuation as separators
231 |         part = part\gsub("_", " ")
232 |         part = part\gsub("[^%w']+", " ")
233 | 
234 |         for raw in part\gmatch "%S+"
235 |           normalized = normalize_word raw
236 |           table.insert out, normalized if normalized
237 | 
238 |       out
239 | 
240 |     handle_url = (t) ->
241 |       return if @should_ignore_domain t.domain
242 | 
243 |       tokens = {}
244 | 
245 |       if t.userinfo and t.userinfo != ""
246 |         table.insert tokens, t.userinfo\lower!
247 | 
248 |       for word in *extract_url_words t.path, t.query, t.fragment
249 |         table.insert tokens, word
250 | 
251 |       for token in *{handle_domain_token t.domain}
252 |         table.insert tokens, token
253 | 
254 |       unpack_fn tokens
255 | 
256 |     handle_email = (email) ->
257 |       email = email\lower!
258 |       user, domain = email\match "^([^@]+)@(.+)$"
259 | 
260 |       tokens = {{tag: "email", value: truncate\transform email}}
261 | 
262 |       if user
263 |         user_token = normalize_word user
264 |         table.insert tokens, {tag: "email_user", value: user_token} if user_token
265 | 
266 |       if domain
267 |         for token in *{handle_domain_token domain}
268 |           table.insert tokens, token
269 | 
270 |       unpack_fn tokens
271 | 
272 |     handle_number = (value) ->
273 |       normalized = normalize_number value
274 |       return unless normalized
275 |       if #normalized > max_len
276 |         truncate\transform normalized
277 |       else
278 |         normalized
279 | 
280 |     handle_currency = (value) ->
281 |       symbol, rest = value\match "^([%$£€¥]+)%s*(.+)$"
282 |       symbol or= value\sub 1, 1
283 |       rest or= ""
284 | 
285 |       normalized_number = normalize_number rest
286 |       if normalized_number and #normalized_number > max_len
287 |         normalized_number = truncate\transform normalized_number
288 | 
289 |       if symbol and symbol != ""
290 |         if normalized_number
291 |           {tag: "currency", value: symbol}, normalized_number
292 |         else
293 |           {tag: "currency", value: symbol}
294 | 
295 |     handle_percent = (value) ->
296 |       number_part = value\sub 1, #value - 1
297 |       normalized = normalize_number number_part
298 |       return unless normalized
299 |       if #normalized > max_len - 1  -- reserve 1 char for %
300 |         normalized = truncate\transform normalized
301 |       "#{normalized}%"
302 | 
303 |     handle_caps_word = (word) ->
304 |       return unless word\match "%u"
305 | 
306 | 
307 |       normalized = normalize_word word
308 |       return unless normalized
309 |       stemmed = if stem
310 |         stem(normalized) or normalized
311 |       else
312 |         normalized
313 |       stemmed, {tag: "caps", value: stemmed}
314 | 
315 |     handle_word = (word) ->
316 |       normalized = normalize_word word
317 |       return unless normalized
318 |       if stem
319 |         stem(normalized) or normalized
320 |       else
321 |         normalized
322 | 
323 |     whitespace = utf8.whitespace
324 |     alpha = R "az", "AZ"
325 |     digit = R "09"
326 |     alphanum = alpha + digit
327 | 
328 |     punct_chars = S"!?$#%"
329 |     other_punct = S"()[]{},.;:\"<>/@#"
330 |     word_char = utf8.printable_character - whitespace - punct_chars - other_punct
331 |     word_pattern = (word_char + P"'")^1
332 | 
333 |     cjk_word = if @opts.split_cjk
334 |       word_char = word_char - cjk_character
335 |       C(cjk_character) / handle_word
336 | 
337 |     caps_char = R"AZ"
338 |     caps_pattern = caps_char^2 * (caps_char + digit)^0
339 | 
340 |     sign = S"+-"^-1
341 |     number_body = sign * digit^1 * (P"," * digit^3)^0 * (P"." * digit^1)^-1
342 | 
343 |     percent_pattern = number_body * P"%"
344 |     currency_pattern = S"$£€¥" * whitespace^0 * number_body
345 | 
346 |     punct_pattern = punct_chars^3 * punct_chars^0
347 | 
348 |     domain_char = utf8.printable_character - whitespace - S"./:@?#[](){}<>\"',;&"
349 |     domain_label = domain_char^1
350 |     domain_pattern = domain_label * (P"." * domain_label)^1
351 | 
352 |     not_path = S" \t\r\n\"'<>()[\\]{}?#"
353 |     port_part = (P":" * digit^1)^-1
354 |     path_part = (P"/" * (1 - not_path)^0)^0
355 |     query_part = (P"?" * (1 - not_path)^0)^-1
356 |     fragment_part = (P"#" * (1 - not_path)^0)^-1
357 | 
358 |     www_prefix = case_insensitive "www."
359 |     scheme = (alpha + digit)^1
360 | 
361 |     -- Userinfo handling for URLs (username:password@host format)
362 |     -- Often abused in phishing: https://legitimate.com@malicious.com
363 |     userinfo_char = utf8.printable_character - whitespace - S"@/?#[](){}<>\"',;&"
364 | 
365 |     url_rest = Cg(domain_pattern, "domain") * port_part * Cg(path_part, "path") * Cg(query_part, "query") * Cg(fragment_part, "fragment")
366 | 
367 |     -- NOTE: url uses Ct and named groups to extract all parts of urls
368 |     url_with_scheme = Ct(scheme * P"://" * (Cg(userinfo_char^1, "userinfo") * P"@")^-1 * www_prefix^-1 * url_rest)
369 |     url_without_scheme = Ct(www_prefix * url_rest)
370 | 
371 |     email_pattern = C((alphanum + S".%+_'-")^1 * P"@" * domain_pattern)
372 | 
373 |     number_capture = C(number_body) * -(alpha)
374 | 
375 |     token_patterns = {
376 |       url_with_scheme / handle_url
377 |       url_without_scheme / handle_url
378 |       email_pattern / handle_email
379 |       C(currency_pattern) / handle_currency
380 |       C(percent_pattern) / handle_percent
381 |       number_capture / handle_number
382 |       C(caps_pattern) / handle_caps_word
383 |       -- CJK here...
384 |       C(word_pattern) / handle_word
385 |       C(punct_pattern) / handle_punct
386 |     }
387 | 
388 |     if cjk_word
389 |       table.insert token_patterns, 8, cjk_word
390 | 
391 |     tokens = token_patterns[1]
392 |     for i = 2, #token_patterns
393 |       tokens = tokens + token_patterns[i]
394 | 
395 |     printable = utf8.printable_character
396 |     Ct (tokens + printable + (C(P(1)) / handle_invalid_byte))^0
397 | 
398 |   -- this is processed on the test before HTML is stripped to get any URLs that
399 |   -- might exist in attributes or in the markup
400 |   collect_url_tokens: (text) =>
401 |     return {} unless text and text != ""
402 | 
403 |     @grammar or= @build_grammar!
404 |     tokens = @grammar\match text
405 |     return {} unless tokens
406 | 
407 |     out = for token in *tokens
408 |       continue unless type(token) == "table"
409 |       switch token.tag
410 |         when "domain", "email", "email_user"
411 |           token
412 |         else
413 |           continue
414 |     out
415 | 
416 |   dedupe_tokens: (tokens) =>
417 |     return {} unless tokens
418 |     seen = {}
419 |     deduped = {}
420 |     for token in *tokens
421 |       -- For table tokens, use string representation as key
422 |       key = if type(token) == "table"
423 |         @tagged_token_to_string token
424 |       else
425 |         token
426 | 
427 |       unless seen[key]
428 |         seen[key] = true
429 |         table.insert deduped, token
430 |     deduped
431 | 
432 |   generate_bigrams: (tokens) =>
433 |     return {} unless tokens
434 |     count = #tokens
435 |     return {} if count < 2
436 |     ignore_tokens = @opts.ignore_tokens
437 | 
438 |     bigrams = {}
439 |     for i = 1, count - 1
440 |       first = tokens[i]
441 |       second = tokens[i + 1]
442 |       continue unless first and second
443 | 
444 |       bigram = first .. " " .. second
445 |       continue if ignore_tokens and ignore_tokens[bigram]
446 | 
447 |       table.insert bigrams, bigram
448 | 
449 |     bigrams
450 | 
451 |   sample_tokens: (tokens, limit=@opts.sample_at_most) =>
452 |     return {} unless tokens
453 |     return tokens unless limit
454 | 
455 |     limit = math.floor limit
456 |     return {} if limit <= 0
457 |     count = #tokens
458 |     return tokens if count <= limit
459 | 
460 |     tokens_to_sample = if @opts.dither == false
461 |       tokens
462 |     else
463 |       dithered tokens
464 | 
465 |     [tokens_to_sample[idx] for idx=1,limit]
466 | 
467 |   -- lift the tokens that match the pattern to the top, preserving order otherwise
468 |   lift_tokens: (tokens, pattern) =>
469 |     lifted = {}
470 |     rest = for t in *tokens
471 |       if t\match pattern
472 |         table.insert lifted, t
473 |         continue
474 | 
475 |       t
476 | 
477 |     for r in *rest
478 |       table.insert lifted, r
479 | 
480 |     lifted
481 | 
482 |   tokenize_text: (text) =>
483 |     return {} unless text
484 | 
485 |     text = tostring text
486 | 
487 |     if @opts.filter_text
488 |       text = @opts.filter_text text
489 | 
490 |     unless @opts.unaccent == false
491 |       text = require("lapis.bayes.text.unaccent").unaccent_string(text) or text
492 | 
493 |     -- extract URLs before cleaing up text to capture urls in HTML markup
494 |     raw_domain_tokens = @collect_url_tokens text
495 | 
496 |     text = extract_text text
497 | 
498 |     @grammar or= @build_grammar!
499 |     tokens = @grammar\match text or {}
500 | 
501 |     dedupe = true
502 |     if @opts.dedupe != nil
503 |       dedupe = @opts.dedupe
504 | 
505 |     ignore_tokens = @opts.ignore_tokens
506 |     sample_limit = @opts.sample_at_most
507 |     generate_bigrams = @opts.bigram_tokens
508 | 
509 |     -- new token merging strategy, try to keep things adjacent
510 |     merged_tokens = {}
511 |     seen_tokens = {} -- for deduping
512 | 
513 |     insert_token = (t) ->
514 |       if ignore_tokens and ignore_tokens[t]
515 |         return
516 | 
517 |       if dedupe and seen_tokens[t]
518 |         return
519 | 
520 |       seen_tokens[t] = true
521 | 
522 |       table.insert merged_tokens, t
523 | 
524 |     prev_token = nil -- for bigram generation
525 | 
526 |     for idx=1,#tokens
527 |       token = tokens[idx]
528 | 
529 |       switch type token
530 |         when "table" -- special token
531 |           switch token.tag
532 |             when "caps", "invalid_byte", "currency"
533 | 
534 |               nil
535 |             else
536 |               prev_token = nil -- break the bigram
537 | 
538 |           insert_token @tagged_token_to_string token
539 | 
540 |         when "string" -- plain word
541 |           insert_token token
542 | 
543 |           if prev_token and generate_bigrams
544 |             insert_token "#{prev_token} #{token}"
545 | 
546 |           prev_token = token
547 | 
548 |     -- these lose positioning due to being extracted differently, so we just
549 |     -- insert them in order at the top by moving some variables around
550 |     if raw_domain_tokens
551 |       original_tokens = merged_tokens
552 |       merged_tokens = {}
553 |       for token in *raw_domain_tokens
554 |         insert_token @tagged_token_to_string token
555 | 
556 |       for t in *original_tokens
557 |         table.insert merged_tokens, t
558 | 
559 |     if @opts.domain_tokens_first
560 |       merged_tokens = @lift_tokens merged_tokens, "^domain:"
561 | 
562 |     if sample_limit
563 |       merged_tokens = @sample_tokens merged_tokens
564 | 
565 |     -- Apply custom filter at the very end if provided
566 |     if @opts.filter_tokens
567 |       merged_tokens = @opts.filter_tokens merged_tokens, @opts
568 | 
569 |     merged_tokens
570 | 
571 | return SpamTokenizer
572 | 


--------------------------------------------------------------------------------