├── .gitignore
├── lapis
│   ├── bayes
│   │   ├── classifiers
│   │   │   ├── default.moon
│   │   │   ├── default.lua
│   │   │   ├── test.moon
│   │   │   ├── fisher.moon
│   │   │   ├── bayes_multi.moon
│   │   │   ├── fisher.lua
│   │   │   ├── test.lua
│   │   │   ├── bayes.moon
│   │   │   ├── base.moon
│   │   │   ├── bayes_multi.lua
│   │   │   ├── bayes.lua
│   │   │   └── base.lua
│   │   ├── models.moon
│   │   ├── models.lua
│   │   ├── schema.moon
│   │   ├── schema.lua
│   │   ├── tokenizers
│   │   │   ├── base.moon
│   │   │   ├── base.lua
│   │   │   ├── ngram.moon
│   │   │   ├── url_domains.moon
│   │   │   ├── postgres_text.moon
│   │   │   ├── postgres_text.lua
│   │   │   ├── ngram.lua
│   │   │   ├── url_domains.lua
│   │   │   └── spam.moon
│   │   ├── model.moon
│   │   ├── migrations.moon
│   │   ├── model.lua
│   │   ├── migrations.lua
│   │   ├── text
│   │   │   ├── utf8.moon
│   │   │   ├── utf8.lua
│   │   │   ├── punycode.moon
│   │   │   ├── punycode.lua
│   │   │   ├── stem.lua
│   │   │   └── stem.moon
│   │   └── models
│   │       ├── word_classifications.moon
│   │       ├── categories.moon
│   │       ├── word_classifications.lua
│   │       └── categories.lua
│   ├── bayes.lua
│   └── bayes.moon
├── migrations.moon
├── lint_config.moon
├── config.moon
├── Makefile
├── .github
│   └── workflows
│       └── test.yml
├── spec
│   ├── url_tokenizer_spec.moon
│   ├── utf8_spec.moon
│   ├── postgres_text_tokenizer_spec.moon
│   ├── punycode_spec.moon
│   ├── unaccent_spec.moon
│   ├── stem_spec.moon
│   ├── bayes_spec.moon
│   └── ngram_tokenizer_spec.moon
├── lapis-bayes-dev-1.rockspec
└── examples
    └── detect_language.lua
/.gitignore:
--------------------------------------------------------------------------------
1 | config.lua
2 | lint_config.lua
3 | migrations.lua
4 | *.rock
5 |
--------------------------------------------------------------------------------
/lapis/bayes/classifiers/default.moon:
--------------------------------------------------------------------------------
1 | require "lapis.bayes.classifiers.bayes"
2 |
--------------------------------------------------------------------------------
/lapis/bayes/classifiers/default.lua:
--------------------------------------------------------------------------------
1 | return require("lapis.bayes.classifiers.bayes")
2 |
--------------------------------------------------------------------------------
/lapis/bayes/models.moon:
--------------------------------------------------------------------------------
1 | import autoload from require "lapis.util"
2 | autoload "lapis.bayes.models"
3 |
--------------------------------------------------------------------------------
/migrations.moon:
--------------------------------------------------------------------------------
1 |
2 | import run_migrations from require "lapis.bayes.schema"
3 |
4 | {
5 | run_migrations
6 | }
7 |
8 |
--------------------------------------------------------------------------------
/lapis/bayes/models.lua:
--------------------------------------------------------------------------------
1 | local autoload
2 | autoload = require("lapis.util").autoload
3 | return autoload("lapis.bayes.models")
4 |
--------------------------------------------------------------------------------
/lapis/bayes/schema.moon:
--------------------------------------------------------------------------------
1 | run_migrations = ->
2 | m = require "lapis.db.migrations"
3 | m.run_migrations require("lapis.bayes.migrations"), "lapis_bayes"
4 |
5 | { :run_migrations }
6 |
--------------------------------------------------------------------------------
/lint_config.moon:
--------------------------------------------------------------------------------
1 | {
2 | whitelist_globals: {
3 | ["spec/"]: {
4 | "it", "describe", "before_each", "after_each", "setup", "teardown", "pending"
5 | }
6 | }
7 | }
8 |
9 |
--------------------------------------------------------------------------------
/lapis/bayes/schema.lua:
--------------------------------------------------------------------------------
1 | local run_migrations
2 | run_migrations = function()
3 | local m = require("lapis.db.migrations")
4 | return m.run_migrations(require("lapis.bayes.migrations"), "lapis_bayes")
5 | end
6 | return {
7 | run_migrations = run_migrations
8 | }
9 |
--------------------------------------------------------------------------------
/config.moon:
--------------------------------------------------------------------------------
1 | config = require "lapis.config"
2 |
3 | config {"development", "test"}, ->
4 | logging false -- hide query logs
5 |
6 | postgres {
7 | database: "lapis_bayes"
8 |
9 | host: os.getenv "PGHOST"
10 | user: os.getenv "PGUSER"
11 | password: os.getenv "PGPASSWORD"
12 | }
13 |
14 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 |
2 | migrate: build
3 | make test_db > /dev/null
4 | lapis migrate
5 |
6 | local: build
7 | luarocks --lua-version=5.1 make --local *-dev-1.rockspec
8 |
9 | build:
10 | -rm $$(find lapis -type f | grep '\.lua$$')
11 | moonc lapis
12 | moonc *.moon
13 |
14 | test_db:
15 | -dropdb -U postgres lapis_bayes
16 | createdb -U postgres lapis_bayes
17 |
18 | lint::
19 | moonc lint_config.moon
20 | git ls-files | grep '\.moon$$' | grep -v config.moon | xargs -n 100 moonc -l
21 |
22 | tags::
23 | moon-tags --lapis $$(git ls-files lapis/) > $@
24 |
--------------------------------------------------------------------------------
/lapis/bayes/tokenizers/base.moon:
--------------------------------------------------------------------------------
1 | -- Provides a common interface contract for tokenizers. Subclasses should
2 | -- extend this class and override the `tokenize_text` method with their
3 | -- implementation.
4 | --
5 | -- Required override:
6 | -- * `tokenize_text(text)` - accept raw text input and return an array-like table
7 | -- of token strings suitable for classification.
8 |
9 | class BaseTokenizer
10 | tokenize_text: (...) =>
11 | class_name = @__class and @__class.__name or "TokenizerBase"
12 | error "#{class_name} must implement tokenize_text(...)", 2
13 |
--------------------------------------------------------------------------------
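A minimal sketch of a custom tokenizer built on this contract (the class below is hypothetical, not part of the repository):

    class WhitespaceTokenizer extends require "lapis.bayes.tokenizers.base"
      -- split on runs of whitespace and return the raw words as tokens
      tokenize_text: (text) =>
        [word for word in text\gmatch "%S+"]

    -- WhitespaceTokenizer!\tokenize_text "hello world" --> {"hello", "world"}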
/lapis/bayes/tokenizers/base.lua:
--------------------------------------------------------------------------------
1 | local BaseTokenizer
2 | do
3 | local _class_0
4 | local _base_0 = {
5 | tokenize_text = function(self, ...)
6 | local class_name = self.__class and self.__class.__name or "TokenizerBase"
7 | return error(tostring(class_name) .. " must implement tokenize_text(...)", 2)
8 | end
9 | }
10 | _base_0.__index = _base_0
11 | _class_0 = setmetatable({
12 | __init = function() end,
13 | __base = _base_0,
14 | __name = "BaseTokenizer"
15 | }, {
16 | __index = _base_0,
17 | __call = function(cls, ...)
18 | local _self_0 = setmetatable({}, _base_0)
19 | cls.__init(_self_0, ...)
20 | return _self_0
21 | end
22 | })
23 | _base_0.__class = _class_0
24 | BaseTokenizer = _class_0
25 | return _class_0
26 | end
27 |
--------------------------------------------------------------------------------
/lapis/bayes/model.moon:
--------------------------------------------------------------------------------
1 |
2 | prefix = "lapis_bayes_"
3 |
4 | import Model from require "lapis.db.model"
5 |
6 | db = require "lapis.db"
7 |
8 | -- all tuples should be same size
9 | encode_tuples = (tuples) ->
10 | buffer = { "VALUES" }
11 |
12 | {insert: i} = table
13 | n_tuples = #tuples
14 | for t_idx=1,n_tuples
15 | tuple = tuples[t_idx]
16 | i buffer, " ("
17 | k = #tuple
18 | for idx=1,k
19 | i buffer, db.escape_literal tuple[idx]
20 | unless idx == k
21 | i buffer, ", "
22 |
23 | if t_idx == n_tuples
24 | i buffer, ")"
25 | else
26 | i buffer, "), "
27 |
28 | table.concat buffer
29 |
30 | {
31 | Model: Model\scoped_model prefix, "lapis.bayes.models"
32 | prefix_table: (name) -> "#{prefix}#{name}"
33 | :encode_tuples
34 | }
35 |
--------------------------------------------------------------------------------
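For reference, a rough sketch of what `encode_tuples` produces (assuming a configured Postgres backend so `db.escape_literal` can quote values; the exact quoting is handled by lapis):

    import encode_tuples from require "lapis.bayes.model"

    encode_tuples {
      {1, "hello", 2}
      {1, "world", 3}
    }
    -- roughly: VALUES (1, 'hello', 2), (1, 'world', 3)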
/lapis/bayes/migrations.moon:
--------------------------------------------------------------------------------
1 | schema = require "lapis.db.schema"
2 |
3 | import add_column, create_index, drop_index, drop_column, create_table from schema
4 |
5 | {
6 | :serial, :boolean, :varchar, :integer, :text, :foreign_key, :double, :time,
7 | :numeric, :enum
8 | } = schema.types
9 |
10 | import prefix_table from require "lapis.bayes.model"
11 |
12 | {
13 | [1439610038]: =>
14 | create_table prefix_table("categories"), {
15 | {"id", serial}
16 | {"name", text}
17 |
18 | {"total_count", integer}
19 |
20 | {"created_at", time}
21 | {"updated_at", time}
22 |
23 | "PRIMARY KEY (id)"
24 | }
25 |
26 | create_table prefix_table("word_classifications"), {
27 | {"category_id", foreign_key}
28 | {"word", text}
29 | {"count", integer}
30 |
31 | "PRIMARY KEY (category_id, word)"
32 | }
33 |
34 | [1474434614]: =>
35 | create_index prefix_table("categories"), "name"
36 | }
37 |
38 |
--------------------------------------------------------------------------------
/lapis/bayes/model.lua:
--------------------------------------------------------------------------------
1 | local prefix = "lapis_bayes_"
2 | local Model
3 | Model = require("lapis.db.model").Model
4 | local db = require("lapis.db")
5 | local encode_tuples
6 | encode_tuples = function(tuples)
7 | local buffer = {
8 | "VALUES"
9 | }
10 | local i
11 | i = table.insert
12 | local n_tuples = #tuples
13 | for t_idx = 1, n_tuples do
14 | local tuple = tuples[t_idx]
15 | i(buffer, " (")
16 | local k = #tuple
17 | for idx = 1, k do
18 | i(buffer, db.escape_literal(tuple[idx]))
19 | if not (idx == k) then
20 | i(buffer, ", ")
21 | end
22 | end
23 | if t_idx == n_tuples then
24 | i(buffer, ")")
25 | else
26 | i(buffer, "), ")
27 | end
28 | end
29 | return table.concat(buffer)
30 | end
31 | return {
32 | Model = Model:scoped_model(prefix, "lapis.bayes.models"),
33 | prefix_table = function(name)
34 | return tostring(prefix) .. tostring(name)
35 | end,
36 | encode_tuples = encode_tuples
37 | }
38 |
--------------------------------------------------------------------------------
/lapis/bayes.lua:
--------------------------------------------------------------------------------
1 | local VERSION = "1.4.0"
2 | local text_probabilities
3 | text_probabilities = function(categories, text, opts)
4 | if opts == nil then
5 | opts = { }
6 | end
7 | local DefaultClassifier = require("lapis.bayes.classifiers.default")
8 | return DefaultClassifier(opts):text_probabilities(categories, text, opts)
9 | end
10 | local classify_text
11 | classify_text = function(categories, text, opts)
12 | if opts == nil then
13 | opts = { }
14 | end
15 | local DefaultClassifier = require("lapis.bayes.classifiers.default")
16 | return DefaultClassifier(opts):classify_text(categories, text, opts)
17 | end
18 | local train_text
19 | train_text = function(category, text, opts, ...)
20 | if opts == nil then
21 | opts = { }
22 | end
23 | local DefaultClassifier = require("lapis.bayes.classifiers.default")
24 | return DefaultClassifier(opts):train_text(category, text, ...)
25 | end
26 | return {
27 | classify_text = classify_text,
28 | train_text = train_text,
29 | text_probabilities = text_probabilities,
30 | VERSION = VERSION
31 | }
32 |
--------------------------------------------------------------------------------
/lapis/bayes.moon:
--------------------------------------------------------------------------------
1 | VERSION = "1.4.0"
2 |
3 | -- calculate the probabilities of text using default classifier
4 | -- categories: array of category names
5 | -- text: the text to calculate probabilities for
6 | text_probabilities = (categories, text, opts={}) ->
7 | DefaultClassifier = require "lapis.bayes.classifiers.default"
8 | DefaultClassifier(opts)\text_probabilities categories, text, opts
9 |
10 | -- return the best matching category for the given text using the default
11 | -- classifier
12 | classify_text = (categories, text, opts={}) ->
13 | DefaultClassifier = require "lapis.bayes.classifiers.default"
14 | DefaultClassifier(opts)\classify_text categories, text, opts
15 |
16 | -- train text using default classifier's tokenizer
17 | -- category: string name of category
18 | -- text: the text (or array of words) to train
19 | -- opts: options to pass to the classifier
20 | train_text = (category, text, opts={}, ...) ->
21 | DefaultClassifier = require "lapis.bayes.classifiers.default"
22 | DefaultClassifier(opts)\train_text category, text, ...
23 |
24 | { :classify_text, :train_text, :text_probabilities, :VERSION }
25 |
--------------------------------------------------------------------------------
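A short usage sketch for the top-level module (assumes a configured database and that the lapis_bayes migrations have been run; the sample strings are illustrative):

    bayes = require "lapis.bayes"

    bayes.train_text "spam", "Cheap prescription drugs no membership required"
    bayes.train_text "ham", "Let's meet tomorrow to go over the project notes"

    -- returns the best matching category name for the text, with its score
    bayes.classify_text {"spam", "ham"}, "cheap drugs for sale"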
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
1 | name: "test"
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 | test:
7 | runs-on: ubuntu-latest
8 |
9 | env:
10 | PGUSER: postgres
11 | PGPASSWORD: postgres
12 | PGHOST: 127.0.0.1
13 |
14 | services:
15 | postgres:
16 | image: postgres:12
17 | env:
18 | POSTGRES_PASSWORD: postgres
19 | options: --health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 5
20 | ports:
21 | - 5432:5432
22 |
23 | steps:
24 | - uses: actions/checkout@master
25 | - uses: leafo/gh-actions-lua@master
26 | with:
27 | luaVersion: "luajit-openresty"
28 |
29 | - uses: leafo/gh-actions-luarocks@master
30 |
31 | - name: build
32 | run: |
33 | luarocks install busted
34 | luarocks install moonscript
35 | luarocks make
36 | luarocks install web_sanitize
37 | luarocks install tableshape
38 |
39 | - name: setup db
40 | run: |
41 | psql -c 'create database lapis_bayes'
42 | moonc *.moon
43 | lapis migrate
44 |
45 | - name: test
46 | run: |
47 | busted -o utfTerminal
48 |
--------------------------------------------------------------------------------
/lapis/bayes/classifiers/test.moon:
--------------------------------------------------------------------------------
1 | average = (nums) ->
2 | sum = 0
3 | for n in *nums
4 | sum += n
5 |
6 | return sum / #nums
7 |
8 | weighted_avg = (tuples) ->
9 | num_tuples = #tuples
10 | sum = 0
11 | sum_weight = 0
12 |
13 | for {num, weight} in *tuples
14 | sum += num
15 | sum_weight += weight
16 |
17 | avg_weight = sum_weight/num_tuples
18 |
19 | avg = 0
20 | for {num, weight} in *tuples
21 | avg += (num/num_tuples) * (weight/avg_weight)
22 |
23 | avg
24 |
25 | class TestClassifier extends require "lapis.bayes.classifiers.base"
26 | word_probabilities: (categories, available_words) =>
27 | total_counts = {}
28 | for c in *categories
29 | continue unless c.word_counts
30 | for word, count in pairs c.word_counts
31 | total_counts[word] or= 0
32 | total_counts[word] += count
33 |
34 | probs = for c in *categories
35 | tuples = for word in *available_words
36 | total_count = total_counts[word]
37 | cat_count = c.word_counts and c.word_counts[word] or 0
38 | {cat_count/total_count, total_count}
39 |
40 | {c.name, weighted_avg tuples}
41 |
42 | table.sort probs, (a,b) ->
43 | a[2] > b[2]
44 |
45 | probs
46 |
47 |
48 |
--------------------------------------------------------------------------------
/lapis/bayes/classifiers/fisher.moon:
--------------------------------------------------------------------------------
1 | -- http://www.linuxjournal.com/article/6467
2 |
3 | inv_chi2 = (chi, df) ->
4 | assert df % 2 == 0, "df must be even"
5 | m = chi / 2.0
6 | sum = math.exp -m
7 | term = sum
8 | for i=1, math.floor df/2
9 | term *= m / i
10 | sum += term
11 |
12 | math.min sum, 1
13 |
14 | class FisherClassifier extends require "lapis.bayes.classifiers.base"
15 | @default_options: {
16 | robs: 1
17 | robx: 0.5
18 | min_dev: 0.3
19 | }
20 |
21 | word_probabilities: (categories, available_words) =>
22 | return nil, "only two categories supported at once" unless #categories == 2
23 |
24 | {a, b} = categories
25 |
26 | s = @opts.robs
27 | x = @opts.robx
28 | min_dev = @opts.min_dev
29 |
30 | mul_a = 0
31 | mul_b = 0
32 |
33 | kept_tokens = 0
34 |
35 | for word in *available_words
36 | a_count = a.word_counts and a.word_counts[word] or 0
37 | b_count = b.word_counts and b.word_counts[word] or 0
38 |
39 | p = a_count / (a_count + b_count)
40 | n = a_count + b_count
41 | val = ((s * x) + (n * p)) / (s + n)
42 |
43 | if not min_dev or math.abs(val - 0.5) > min_dev
44 | mul_a += math.log val
45 | mul_b += math.log 1 - val
46 | kept_tokens += 1
47 |
48 | if kept_tokens == 0
49 | return nil, "not enough strong signals to decide"
50 |
51 | pa = inv_chi2 -2 * mul_a, 2 * kept_tokens
52 | pb = inv_chi2 -2 * mul_b, 2 * kept_tokens
53 |
54 | p = (1 + pa - pb) / 2
55 |
56 | tuples = {
57 | {a.name, p}
58 | {b.name, 1 - p}
59 | }
60 |
61 | table.sort tuples, (a,b) -> a[2] > b[2]
62 |
63 | tuples
64 |
65 |
66 |
--------------------------------------------------------------------------------
/lapis/bayes/migrations.lua:
--------------------------------------------------------------------------------
1 | local schema = require("lapis.db.schema")
2 | local add_column, create_index, drop_index, drop_column, create_table
3 | add_column, create_index, drop_index, drop_column, create_table = schema.add_column, schema.create_index, schema.drop_index, schema.drop_column, schema.create_table
4 | local serial, boolean, varchar, integer, text, foreign_key, double, time, numeric, enum
5 | do
6 | local _obj_0 = schema.types
7 | serial, boolean, varchar, integer, text, foreign_key, double, time, numeric, enum = _obj_0.serial, _obj_0.boolean, _obj_0.varchar, _obj_0.integer, _obj_0.text, _obj_0.foreign_key, _obj_0.double, _obj_0.time, _obj_0.numeric, _obj_0.enum
8 | end
9 | local prefix_table
10 | prefix_table = require("lapis.bayes.model").prefix_table
11 | return {
12 | [1439610038] = function(self)
13 | create_table(prefix_table("categories"), {
14 | {
15 | "id",
16 | serial
17 | },
18 | {
19 | "name",
20 | text
21 | },
22 | {
23 | "total_count",
24 | integer
25 | },
26 | {
27 | "created_at",
28 | time
29 | },
30 | {
31 | "updated_at",
32 | time
33 | },
34 | "PRIMARY KEY (id)"
35 | })
36 | return create_table(prefix_table("word_classifications"), {
37 | {
38 | "category_id",
39 | foreign_key
40 | },
41 | {
42 | "word",
43 | text
44 | },
45 | {
46 | "count",
47 | integer
48 | },
49 | "PRIMARY KEY (category_id, word)"
50 | })
51 | end,
52 | [1474434614] = function(self)
53 | return create_index(prefix_table("categories"), "name")
54 | end
55 | }
56 |
--------------------------------------------------------------------------------
/spec/url_tokenizer_spec.moon:
--------------------------------------------------------------------------------
1 |
2 | UrlDomainsTokenizer = require "lapis.bayes.tokenizers.url_domains"
3 |
4 | describe "lapis.bayes.tokenizer.url_tokenizer", ->
5 | it "builds grammar", ->
6 | tokenizer = UrlDomainsTokenizer!
7 | p = tokenizer\build_grammar!
8 | p\match "https"
9 |
10 | describe "with grammar", ->
11 | local grammar
12 |
13 | before_each ->
14 | grammar = UrlDomainsTokenizer!\build_grammar!
15 |
16 | it "detects some urls", ->
17 | assert.same {
18 | "http://leafo.net& "
19 | "http://google.com/p8sslord"
20 | "www.leafodad.com"
21 | }, grammar\match "href='http://leafo.net& ' http://google.com/p8sslord please help the good one www.leafodad.com yeah what the freak"
22 |
23 | describe "with tonenizer", ->
24 | local tokenize_text
25 | before_each ->
26 | tokenize_text = UrlDomainsTokenizer!\tokenize_text
27 |
28 | it "extracts tokens from string", ->
29 | assert.same {
30 | "leafo.net&"
31 | "google.com"
32 | "leafodad.com"
33 | }, tokenize_text "href='http://leafo.net& ' http://google.com/p8sslord/da?what please help the good one www.leafodad.com yeah what the freak"
34 |
35 | it "gets domain from iframe", ->
36 | assert.same {
37 | 'youtube.com'
38 |       }, tokenize_text [[<iframe width="560" height="315" src="https://www.youtube.com/embed/VIDEO_ID" frameborder="0" allowfullscreen></iframe>]]
39 |
40 | it "ignore domains", ->
41 | tokens = UrlDomainsTokenizer({
42 | ignore_domains: {
43 | "leafo.net": true
44 | "*.google.com": true
45 | }
46 | })\tokenize_text [[
47 | http://leafo.net
48 | http://good.leafo.net
49 | http://google.com
50 | http://butt.google.com
51 | http://plus.good.google.com
52 | ]]
53 |
54 | assert.same {"good.leafo.net", "google.com"}, tokens
55 |
--------------------------------------------------------------------------------
/lapis-bayes-dev-1.rockspec:
--------------------------------------------------------------------------------
1 | package = "lapis-bayes"
2 | version = "dev-1"
3 |
4 | source = {
5 | url = "git+https://github.com/leafo/lapis-bayes.git"
6 | }
7 |
8 | description = {
9 | summary = "Naive Bayes classifier for use in Lua",
10 | license = "MIT",
11 | maintainer = "Leaf Corcoran ",
12 | }
13 |
14 | dependencies = {
15 | "lua == 5.1",
16 | "lapis >= 1.16.0"
17 | }
18 |
19 | build = {
20 | type = "builtin",
21 | modules = {
22 | ["lapis.bayes"] = "lapis/bayes.lua",
23 | ["lapis.bayes.classifiers.base"] = "lapis/bayes/classifiers/base.lua",
24 | ["lapis.bayes.classifiers.bayes"] = "lapis/bayes/classifiers/bayes.lua",
25 | ["lapis.bayes.classifiers.bayes_multi"] = "lapis/bayes/classifiers/bayes_multi.lua",
26 | ["lapis.bayes.classifiers.default"] = "lapis/bayes/classifiers/default.lua",
27 | ["lapis.bayes.classifiers.fisher"] = "lapis/bayes/classifiers/fisher.lua",
28 | ["lapis.bayes.classifiers.test"] = "lapis/bayes/classifiers/test.lua",
29 | ["lapis.bayes.migrations"] = "lapis/bayes/migrations.lua",
30 | ["lapis.bayes.model"] = "lapis/bayes/model.lua",
31 | ["lapis.bayes.models"] = "lapis/bayes/models.lua",
32 | ["lapis.bayes.models.categories"] = "lapis/bayes/models/categories.lua",
33 | ["lapis.bayes.models.word_classifications"] = "lapis/bayes/models/word_classifications.lua",
34 | ["lapis.bayes.schema"] = "lapis/bayes/schema.lua",
35 | ["lapis.bayes.text.punycode"] = "lapis/bayes/text/punycode.lua",
36 | ["lapis.bayes.text.stem"] = "lapis/bayes/text/stem.lua",
37 | ["lapis.bayes.text.unaccent"] = "lapis/bayes/text/unaccent.lua",
38 | ["lapis.bayes.text.utf8"] = "lapis/bayes/text/utf8.lua",
39 | ["lapis.bayes.tokenizers.base"] = "lapis/bayes/tokenizers/base.lua",
40 | ["lapis.bayes.tokenizers.ngram"] = "lapis/bayes/tokenizers/ngram.lua",
41 | ["lapis.bayes.tokenizers.postgres_text"] = "lapis/bayes/tokenizers/postgres_text.lua",
42 | ["lapis.bayes.tokenizers.spam"] = "lapis/bayes/tokenizers/spam.lua",
43 | ["lapis.bayes.tokenizers.url_domains"] = "lapis/bayes/tokenizers/url_domains.lua",
44 | }
45 | }
46 |
--------------------------------------------------------------------------------
/spec/utf8_spec.moon:
--------------------------------------------------------------------------------
1 | scripts = require "lapis.bayes.text.utf8"
2 | import C, P from require "lpeg"
3 |
4 | capture = (pattern, text) ->
5 | (C(pattern) * -P(1))\match text
6 |
7 | matches = (pattern, text) ->
8 | not not ((pattern * -P(1))\match text)
9 |
10 | describe "lapis.bayes.text.utf8", ->
11 | describe "han_character", ->
12 | it "matches a basic Han ideograph", ->
13 | assert.same "漢", capture scripts.han_character, "漢"
14 |
15 | it "matches a supplementary plane character", ->
16 | assert.same "𠀋", capture scripts.han_character, "𠀋"
17 |
18 | it "does not match kana characters", ->
19 | assert.falsy matches scripts.han_character, "あ"
20 | assert.falsy matches scripts.han_character, "ア"
21 |
22 | describe "kana_character", ->
23 | it "matches hiragana and katakana", ->
24 | assert.same "あ", capture scripts.kana_character, "あ"
25 | assert.same "ア", capture scripts.kana_character, "ア"
26 |
27 | it "matches halfwidth katakana", ->
28 | assert.same "ア", capture scripts.kana_character, "ア"
29 |
30 | it "does not match Han or Latin letters", ->
31 | assert.falsy matches scripts.kana_character, "漢"
32 | assert.falsy matches scripts.kana_character, "A"
33 |
34 | describe "hangul_character", ->
35 | it "matches modern syllables and jamo", ->
36 | assert.same "한", capture scripts.hangul_character, "한"
37 | assert.same "ᄀ", capture scripts.hangul_character, "ᄀ"
38 |
39 | it "matches halfwidth Hangul letters", ->
40 | assert.same "ᄀ", capture scripts.hangul_character, "ᄀ"
41 |
42 | it "does not match kana", ->
43 | assert.falsy matches scripts.hangul_character, "ア"
44 |
45 | describe "cjk_character", ->
46 | it "matches characters across Han, Kana, and Hangul", ->
47 | assert.same "漢", capture scripts.cjk_character, "漢"
48 | assert.same "あ", capture scripts.cjk_character, "あ"
49 | assert.same "한", capture scripts.cjk_character, "한"
50 |
51 | it "rejects non-CJK characters", ->
52 | assert.falsy matches scripts.cjk_character, "A"
53 | assert.falsy matches scripts.cjk_character, "1"
54 |
--------------------------------------------------------------------------------
/lapis/bayes/text/utf8.moon:
--------------------------------------------------------------------------------
1 | import P, R from require "lpeg"
2 |
3 | cont = R "\128\191"
4 |
5 | -- Han ideographs (basic, extensions, compatibility, supplementary planes)
6 | han_ext_a = P"\227" * R("\144\191") * cont + P"\228" * R("\128\182") * cont
7 | han_unified = P"\228" * R("\184\191") * cont + R("\229\232") * cont * cont + P"\233" * R("\128\191") * cont
8 | han_compat = P"\239" * R("\164\171") * cont
9 | han_supplement = P"\240" * R("\160\178") * cont * cont
10 | han_character = han_ext_a + han_unified + han_compat + han_supplement
11 |
12 | -- Japanese Hiragana
13 | hiragana_block = P"\227\129" * cont + P"\227\130" * R("\128\159")
14 |
15 | -- Kana supplement & historic kana (hentaigana, archaic forms)
16 | kana_supplement = P"\240\155" * R("\128\133") * cont
17 |
18 | hiragana_character = hiragana_block + kana_supplement
19 |
20 | -- Japanese Katakana (standard, extensions, halfwidth)
21 | katakana_main = P"\227\130" * R("\160\191") + P"\227\131" * cont
22 | katakana_phonetic_ext = P"\227\135" * R("\176\191")
23 | katakana_halfwidth = P"\239\189" * R("\166\191") + P"\239\190" * R("\128\159")
24 | katakana_character = katakana_main + katakana_phonetic_ext + katakana_halfwidth + kana_supplement
25 |
26 | kana_character = hiragana_block + katakana_main + katakana_phonetic_ext + katakana_halfwidth + kana_supplement
27 |
28 | -- Korean Hangul (jamo, syllables, compatibility/halfwidth)
29 | hangul_jamo = P"\225" * R("\132\135") * cont
30 | hangul_jamo_ext_a = P"\234\165" * R("\160\191")
31 | hangul_compat_jamo = P"\227\132" * R("\176\191") + P"\227\133" * cont + P"\227\134" * cont + P"\227\135" * R("\128\143")
32 | hangul_syllables = P"\234" * R("\176\191") * cont + R("\235\236") * cont * cont + P"\237" * (R("\128\157") * cont + P"\158" * R("\128\163"))
33 | hangul_jamo_ext_b = P"\237\158" * R("\176\191") + P"\237\159" * cont
34 | hangul_halfwidth = P"\239\190" * R("\160\191") + P"\239\191" * R("\128\156")
35 | hangul_character = hangul_jamo + hangul_jamo_ext_a + hangul_compat_jamo + hangul_syllables + hangul_jamo_ext_b + hangul_halfwidth
36 |
37 | cjk_character = han_character + kana_character + hangul_character
38 |
39 | {
40 | :cont
41 | :han_character
42 | :hiragana_character
43 | :katakana_character
44 | :kana_character
45 | :hangul_character
46 | :cjk_character
47 | }
48 |
--------------------------------------------------------------------------------
/lapis/bayes/models/word_classifications.moon:
--------------------------------------------------------------------------------
1 |
2 | db = require "lapis.db"
3 | import Model from require "lapis.bayes.model"
4 |
5 | -- Generated schema dump: (do not edit)
6 | --
7 | -- CREATE TABLE lapis_bayes_word_classifications (
8 | -- category_id integer NOT NULL,
9 | -- word text NOT NULL,
10 | -- count integer DEFAULT 0 NOT NULL
11 | -- );
12 | -- ALTER TABLE ONLY lapis_bayes_word_classifications
13 | -- ADD CONSTRAINT lapis_bayes_word_classifications_pkey PRIMARY KEY (category_id, word);
14 | --
15 | class WordClassifications extends Model
16 | @primary_key: {"category_id", "word"}
17 |
18 | @relations: {
19 | {"category", belongs_to: "Categories"}
20 | }
21 |
22 | @find_or_create: (opts={}) =>
23 | @find(opts) or @create(opts)
24 |
25 | @purge_word: (word, categories) =>
26 | import Categories from require "lapis.bayes.models"
27 |
28 | categories = { categories } unless type(categories) == "table"
29 | original_count = #categories
30 | assert original_count > 0, "missing categories"
31 | categories = Categories\find_all categories, key: "name"
32 | assert #categories == original_count, "failed to find all categories specified"
33 |
34 | wcs = @select "where word = ? and category_id in ?",
35 | word, db.list [c.id for c in *categories]
36 |
37 | count = 0
38 | for wc in *wcs
39 | if wc\delete!
40 | count += 1
41 |
42 | count > 0, count
43 |
44 | delete: =>
45 | deleted, res = super db.raw "*"
46 |
47 | if deleted
48 | removed_row = @@load (unpack res)
49 |
50 | import Categories from require "lapis.bayes.models"
51 | db.update Categories\table_name!, {
52 | total_count: db.raw db.interpolate_query " total_count - ?", removed_row.count
53 | }, {
54 | id: @category_id
55 | }
56 |
57 | true
58 |
59 |
60 | -- note: this should not be called directly, use the associated method on the category model
61 | _increment: (amount) =>
62 | amount = assert tonumber(amount), "expecting number"
63 | @update {
64 | count: db.raw "count + #{amount}"
65 | }
66 |
67 | if @count == 0
68 | db.delete @@table_name!, {
69 | category_id: @category_id
70 | word: @word
71 | count: 0
72 | }
73 |
74 |
75 |
--------------------------------------------------------------------------------
/lapis/bayes/text/utf8.lua:
--------------------------------------------------------------------------------
1 | local P, R
2 | do
3 | local _obj_0 = require("lpeg")
4 | P, R = _obj_0.P, _obj_0.R
5 | end
6 | local cont = R("\128\191")
7 | local han_ext_a = P("\227") * R("\144\191") * cont + P("\228") * R("\128\182") * cont
8 | local han_unified = P("\228") * R("\184\191") * cont + R("\229\232") * cont * cont + P("\233") * R("\128\191") * cont
9 | local han_compat = P("\239") * R("\164\171") * cont
10 | local han_supplement = P("\240") * R("\160\178") * cont * cont
11 | local han_character = han_ext_a + han_unified + han_compat + han_supplement
12 | local hiragana_block = P("\227\129") * cont + P("\227\130") * R("\128\159")
13 | local kana_supplement = P("\240\155") * R("\128\133") * cont
14 | local hiragana_character = hiragana_block + kana_supplement
15 | local katakana_main = P("\227\130") * R("\160\191") + P("\227\131") * cont
16 | local katakana_phonetic_ext = P("\227\135") * R("\176\191")
17 | local katakana_halfwidth = P("\239\189") * R("\166\191") + P("\239\190") * R("\128\159")
18 | local katakana_character = katakana_main + katakana_phonetic_ext + katakana_halfwidth + kana_supplement
19 | local kana_character = hiragana_block + katakana_main + katakana_phonetic_ext + katakana_halfwidth + kana_supplement
20 | local hangul_jamo = P("\225") * R("\132\135") * cont
21 | local hangul_jamo_ext_a = P("\234\165") * R("\160\191")
22 | local hangul_compat_jamo = P("\227\132") * R("\176\191") + P("\227\133") * cont + P("\227\134") * cont + P("\227\135") * R("\128\143")
23 | local hangul_syllables = P("\234") * R("\176\191") * cont + R("\235\236") * cont * cont + P("\237") * (R("\128\157") * cont + P("\158") * R("\128\163"))
24 | local hangul_jamo_ext_b = P("\237\158") * R("\176\191") + P("\237\159") * cont
25 | local hangul_halfwidth = P("\239\190") * R("\160\191") + P("\239\191") * R("\128\156")
26 | local hangul_character = hangul_jamo + hangul_jamo_ext_a + hangul_compat_jamo + hangul_syllables + hangul_jamo_ext_b + hangul_halfwidth
27 | local cjk_character = han_character + kana_character + hangul_character
28 | return {
29 | cont = cont,
30 | han_character = han_character,
31 | hiragana_character = hiragana_character,
32 | katakana_character = katakana_character,
33 | kana_character = kana_character,
34 | hangul_character = hangul_character,
35 | cjk_character = cjk_character
36 | }
37 |
--------------------------------------------------------------------------------
/lapis/bayes/tokenizers/ngram.moon:
--------------------------------------------------------------------------------
1 | class NgramTokenizer extends require "lapis.bayes.tokenizers.base"
2 | new: (@opts = {}) =>
3 |
4 | build_grammar: =>
5 | import C, Ct from require "lpeg"
6 | utf8 = require "lapis.util.utf8"
7 |
8 | whitespace = utf8.whitespace
9 | printable = utf8.printable_character
10 | word_chars = printable - whitespace
11 | word = C word_chars^1
12 |
13 | Ct (word + whitespace^1)^0
14 |
15 | normalize_word: (word) =>
16 | return unless word and word != ""
17 |
18 | normalized = tostring(word)\lower!
19 | normalized = normalized\gsub("[%p]", "")
20 | normalized = normalized\gsub("%s+", "")
21 |
22 | return unless normalized != ""
23 | normalized
24 |
25 | ngram_size: =>
26 | n = tonumber(@opts.n) or 2
27 | n = math.floor n
28 | n = 1 if n < 1
29 | n
30 |
31 | word_ngrams: (word, n) =>
32 | -- Split word into UTF-8 characters using LPEG
33 | import C, Ct from require "lpeg"
34 | utf8 = require "lapis.util.utf8"
35 | printable = utf8.printable_character
36 |
37 | char_pattern = Ct (C printable)^0
38 | chars = char_pattern\match word
39 |
40 | return { word } unless chars
41 |
42 | len = #chars
43 | return { word } if len == 0
44 | return { word } if len < n
45 |
46 | out = {}
47 | for i = 1, len - n + 1
48 | ngram = table.concat chars, "", i, i + n - 1
49 | table.insert out, ngram
50 |
51 | out
52 |
53 | tokenize_text: (text) =>
54 | return {} unless text and text != ""
55 |
56 | if pre_filter = @opts.filter_text
57 | text = pre_filter text
58 | return {} unless text and text != ""
59 |
60 | @grammar or= @build_grammar!
61 | words = @grammar\match text
62 | return {} unless words
63 |
64 | n = @ngram_size!
65 | ignore_numbers = @opts.ignore_numbers
66 | ignore_numbers = true if ignore_numbers == nil
67 |
68 | tokens = {}
69 | for raw_word in *words
70 | cleaned = @normalize_word raw_word
71 | continue unless cleaned
72 |
73 | if ignore_numbers and cleaned\match "^%d+$"
74 | continue
75 |
76 | for token in *@word_ngrams cleaned, n
77 | table.insert tokens, token
78 |
79 | if @opts.filter_tokens
80 | tokens = @opts.filter_tokens tokens, @opts
81 |
82 | tokens
83 |
--------------------------------------------------------------------------------
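An illustrative sketch of the ngram tokenizer's behavior (hypothetical inputs, with the expected output shown as comments):

    NgramTokenizer = require "lapis.bayes.tokenizers.ngram"
    tokenizer = NgramTokenizer n: 2

    tokenizer\word_ngrams "hello", 2   -- {"he", "el", "ll", "lo"}
    tokenizer\tokenize_text "Hi there" -- {"hi", "th", "he", "er", "re"}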
/examples/detect_language.lua:
--------------------------------------------------------------------------------
1 | local NgramTokenizer = require("lapis.bayes.tokenizers.ngram")
2 | local BayesMultiClassifier = require("lapis.bayes.classifiers.bayes_multi")
3 |
4 | -- generates character ngrams of length 2
5 | local tokenizer = NgramTokenizer({n = 2})
6 |
7 | -- A BayesMultiClassifier supports classifying to any number of categories
8 | local classifier = BayesMultiClassifier({tokenizer = tokenizer})
9 |
10 | local training_data = {
11 | {"english", "The quick brown fox jumps over the lazy dog"},
12 | {"english", "Hello world this is a test of the system"},
13 | {"english", "Programming and software development with modern technology"},
14 |
15 | {"spanish", "El rápido zorro marrón salta sobre el perro perezoso"},
16 | {"spanish", "Hola mundo esta es una prueba del sistema"},
17 | {"spanish", "Los lenguajes de programación son herramientas importantes"},
18 |
19 | {"french", "Le rapide renard brun saute pardessus le chien paresseux"},
20 | {"french", "Bonjour le monde ceci est un test du système"},
21 | {"french", "Les langages de programmation sont des outils importants"},
22 |
23 | {"german", "Der schnelle braune Fuchs springt über den faulen Hund"},
24 | {"german", "Hallo Welt dies ist ein Test des Systems"},
25 | {"german", "Programmiersprachen sind wichtige Werkzeuge für die Entwicklung"},
26 |
27 | {"chinese", "敏捷的棕色狐狸跳过懒狗"},
28 | {"chinese", "你好世界这是一个系统的测试"},
29 | {"chinese", "编程语言是表达算法的重要工具"},
30 | }
31 |
32 | -- Train the classifier
33 | print("Training classifier...")
34 | for _, entry in ipairs(training_data) do
35 | local language, text = entry[1], entry[2]
36 | classifier:train_text(language, text)
37 | end
38 | print("Training complete.\n")
39 |
40 | -- Classify new text
41 | local test_cases = {
42 | "Welcome to our website",
43 | "Bienvenido a nuestro sitio",
44 | "Bienvenue sur notre site",
45 | "Willkommen auf unserer Website",
46 | "欢迎来到我们的网站",
47 | }
48 |
49 | print("Classifying test sentences:\n")
50 | for _, test in ipairs(test_cases) do
51 |   local text = test
52 |
53 | -- Get probability distribution across all languages
54 | local probs = classifier:text_probabilities({
55 | "english",
56 | "spanish",
57 | "french",
58 | "german",
59 | "chinese"
60 | }, text)
61 |
62 | -- The result is sorted by probability, first entry is the detected language
63 | local detected_language = probs[1][1]
64 | local confidence = probs[1][2]
65 |
66 | print(string.format('Text: "%s"', text))
67 | print(string.format("Detected: %s (%.1f%% confidence)\n", detected_language, confidence * 100))
68 | end
69 |
--------------------------------------------------------------------------------
/lapis/bayes/tokenizers/url_domains.moon:
--------------------------------------------------------------------------------
1 | import trim from require "lapis.util"
2 |
3 | class UrlDomainsTokenizer extends require "lapis.bayes.tokenizers.base"
4 | new: (@opts = {}) =>
5 |
6 | ignore_domain: (domain) =>
7 | return unless @opts and @opts.ignore_domains
8 | return true if @opts.ignore_domains[domain]
9 |
10 | while true
11 | sub = domain\gsub("^%**%.?[^%.]+", "*")
12 | return false if sub == domain
13 | return true if @opts.ignore_domains[sub]
14 | domain = sub
15 |
16 | -- strip urls to just domains
17 | filter_tokens: (urls) =>
18 | return for url in *urls
19 | url = url\lower!
20 | url = trim url
21 | url = url\gsub "^%w+://", ""
22 | url = url\gsub "^www%.", ""
23 | url = url\gsub "/.*$", ""
24 | url = trim url
25 |
26 |       url = url\gsub "<$", ""
27 |       url = url\gsub "^>", ""
28 |
29 | continue if url == ""
30 | continue if url\match "^%w+:" -- mailto and co
31 | continue if url\match [=[[<>="' ]]=]
32 | continue unless url\match "%."
33 |
34 | continue if @ignore_domain url
35 |
36 | url
37 |
38 | build_grammar: =>
39 | import P, S, R, C, Ct, Cs from require "lpeg"
40 |
41 | case_insensitive = (text) ->
42 | out = nil
43 | for char in text\gmatch "."
44 | p = S"#{char\lower!}#{char\upper!}"
45 | if out
46 | out *= p
47 | else
48 | out = p
49 |
50 | out
51 |
52 | -- this is far from comprehensive
53 |     unescape_char = P"&gt;" / ">" +
54 |       P"&lt;" / "<" +
55 |       P"&amp;" / "&" +
56 |       P"&nbsp;" / " " +
57 |       P"&#39;" / "'" +
58 |       P"&#47;" / "/" +
59 |       P"&quot;" / '"'
60 |
61 | unescape_text = Cs (unescape_char + 1)^1
62 |
63 | some_space = S" \t\n"
64 | space = some_space^0
65 | alphanum = R "az", "AZ", "09"
66 |
67 | scheme = case_insensitive"http" * case_insensitive"s"^-1 * P"://"
68 | raw_url = C scheme * (P(1) - S" \t\n")^1
69 |
70 | word = (alphanum + S"._-")^1
71 | attr_value = C(word) + P'"' * C((1 - P'"')^0) * P'"' + P"'" * C((1 - P"'")^0) * P"'"
72 |
73 | href = (case_insensitive"href" + case_insensitive"src") * space * P"=" * space * attr_value / (v) -> unescape_text\match(v) or ""
74 |
75 | simple = C case_insensitive"www" * (P"." * (1 - (S"./" + some_space))^1)^1
76 |
77 | Ct (raw_url + href + simple + 1)^0
78 |
79 | tokenize_text: (text) =>
80 | @grammar or= @build_grammar!
81 | matches = @grammar\match text
82 | return nil, "failed to parse text" unless matches
83 | @filter_tokens matches
84 |
--------------------------------------------------------------------------------
/lapis/bayes/tokenizers/postgres_text.moon:
--------------------------------------------------------------------------------
1 | db = require "lapis.db"
2 |
3 | -- postgres based tokenizer
4 | -- opts = {
5 | -- filter_text: function -- function to pre-filter text, returns new text
6 | -- strip_tags: bool -- remove html tags from input in default
7 | -- symbols_split_tokens: bool -- symbols split apart tokens
8 | -- min_token_length: number -- min length of token (default 2)
9 | -- max_token_length: number -- max length of token (default 12)
10 | -- strip_numbers: bool -- remove tokens that are a number (including decimal, default true)
11 | -- ignore_words: table -- table of words to ignore (keys are words, values should be truthy)
12 | -- filter_tokens: function -- custom function to filter tokens, receives tokens and opts
13 | -- legacy_tokenizer: bool -- use slower ts_debug tokenizer that keeps duplicates
14 | -- regconfig: string -- PostgreSQL text search configuration (default "english")
15 | -- }
16 | class PostgresTextTokenizer extends require "lapis.bayes.tokenizers.base"
17 | new: (@opts = {}) =>
18 |
19 | filter_tokens: (tokens) =>
20 | opts = @opts
21 | min_len = opts and opts.min_token_length or 2
22 | max_len = opts and opts.max_token_length or 12
23 |
24 | strip_numbers = opts and opts.strip_numbers
25 | strip_numbers = true if strip_numbers == nil
26 |
27 | return for t in *tokens
28 | t_len = #t
29 | continue if t_len > max_len
30 | continue if t_len < min_len
31 |
32 | if strip_numbers and t\match "^[%d%.%/%-]+$"
33 | continue
34 |
35 | continue if @opts and @opts.ignore_words and @opts.ignore_words[t]
36 | t
37 |
38 | slow_pg_tokenize: (text) =>
39 | regconfig = @opts.regconfig or "english"
40 | -- this slower form will keep duplicate words
41 | db.query [[SELECT unnest(lexemes) AS word FROM ts_debug(?, ?)]], regconfig, text
42 |
43 | -- much faster (50x), but loses duplicates. Needs newer version of postgres
44 | pg_tokenize: (text) =>
45 | regconfig = @opts.regconfig or "english"
46 | db.query [[SELECT unnest(tsvector_to_array(to_tsvector(?, ?))) AS word]], regconfig, text
47 |
48 | tokenize_text: (text) =>
49 | if pre_filter = @opts.filter_text
50 | text = pre_filter text
51 |
52 | if @opts.strip_tags
53 | import extract_text from require "web_sanitize"
54 | text = extract_text text
55 |
56 | if @opts.symbols_split_tokens
57 | text = text\gsub "[%!%@%#%$%%%^%&%*%(%)%[%]%{%}%|%\\%/%`%~%-%_%<%>%,%.]", " "
58 |
59 | res = if @opts.legacy_tokenizer
60 | @slow_pg_tokenize text
61 | else
62 | @pg_tokenize text
63 |
64 | tokens = @filter_tokens [r.word for r in *res]
65 |
66 | if @opts.filter_tokens
67 | tokens = @opts.filter_tokens tokens, @opts
68 |
69 | tokens
70 |
--------------------------------------------------------------------------------
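A brief construction sketch using a couple of the options documented above (requires an active Postgres connection through lapis.db; web_sanitize is needed for strip_tags):

    PostgresTextTokenizer = require "lapis.bayes.tokenizers.postgres_text"

    tokenizer = PostgresTextTokenizer {
      strip_tags: true
      min_token_length: 3
    }

    tokens = tokenizer\tokenize_text "<p>The quick brown fox jumps over the lazy dog</p>"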
/lapis/bayes/models/categories.moon:
--------------------------------------------------------------------------------
1 |
2 | db = require "lapis.db"
3 | import Model, encode_tuples from require "lapis.bayes.model"
4 |
5 | -- Generated schema dump: (do not edit)
6 | --
7 | -- CREATE TABLE lapis_bayes_categories (
8 | -- id integer NOT NULL,
9 | -- name text NOT NULL,
10 | -- total_count integer DEFAULT 0 NOT NULL,
11 | -- created_at timestamp without time zone NOT NULL,
12 | -- updated_at timestamp without time zone NOT NULL
13 | -- );
14 | -- ALTER TABLE ONLY lapis_bayes_categories
15 | -- ADD CONSTRAINT lapis_bayes_categories_pkey PRIMARY KEY (id);
16 | --
17 | class Categories extends Model
18 | @timestamp: true
19 |
20 | @relations: {
21 | {"word_classifications", has_many: "WordClassifications"}
22 | }
23 |
24 | @find_or_create: (name) =>
25 | @find(:name) or @create(:name)
26 |
27 | delete: =>
28 | if super!
29 | import WordClassifications from require "lapis.bayes.models"
30 | db.delete WordClassifications\table_name!, {
31 | category_id: @id
32 | }
33 |
34 | increment: (amount) =>
35 | amount = assert tonumber(amount), "expecting number"
36 | @update {
37 | total_count: db.raw "total_count + #{amount}"
38 | }
39 |
40 | -- NOTE: this was removed since it was tied to a specific tokenizer
41 | increment_text: (text, opts={}) =>
42 | error "This method has been removed, use increment_words instead"
43 |
44 | -- increment a single word by count
45 | increment_word: (word, count) =>
46 | import WordClassifications from require "lapis.bayes.models"
47 | w = WordClassifications\find_or_create {
48 | category_id: @id
49 | :word
50 | }
51 | w\_increment count
52 | @increment count
53 |
54 | -- issue a single query to increment all WordClassifications for this
55 | -- category with the list of words
56 |   -- counts: table mixing hash and array parts: {word = count} entries use the given count, plain array entries count as 1 each
57 | increment_words: (counts) =>
58 | return nil, "missing counts" unless counts
59 |
60 | -- combine hash and array words into summed count
61 | merged_counts = {}
62 | for k,v in pairs counts
63 | word, count = if type(k) == "string"
64 | k, v
65 | else
66 | v, 1
67 |
68 | merged_counts[word] or= 0
69 | merged_counts[word] += count
70 |
71 | total_count = 0
72 | tuples = for word, count in pairs merged_counts
73 | total_count += count
74 | {@id, word, count}
75 |
76 | unless next tuples
77 | return total_count
78 |
79 | import WordClassifications from require "lapis.bayes.models"
80 | tbl = db.escape_identifier WordClassifications\table_name!
81 |
82 | db.query "
83 | INSERT INTO #{tbl} (category_id, word, count) #{encode_tuples tuples}
84 | ON CONFLICT (category_id, word) DO UPDATE SET count = #{tbl}.count + EXCLUDED.count
85 | "
86 |
87 | @increment total_count
88 | total_count
89 |
90 |
--------------------------------------------------------------------------------
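A small sketch of the increment_words counts format (hash entries carry explicit counts, array entries are counted once each; assumes a configured database with the migrations applied):

    Categories = require "lapis.bayes.models.categories"
    category = Categories\find_or_create "spam"

    -- increments "viagra" by 2 and "pills" by 1, and the category total by 3
    category\increment_words { viagra: 2, "pills" }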
/lapis/bayes/classifiers/bayes_multi.moon:
--------------------------------------------------------------------------------
1 | -- Multiclass naive Bayes classifier with Laplace-style smoothing
2 | class BayesMultiClassifier extends require "lapis.bayes.classifiers.base"
3 | @default_options: {
4 | max_words: 40
5 | default_prob: 0.1
6 | }
7 |
8 | candidate_words: (categories, available_words, count) =>
9 | return available_words unless count and count < #available_words
10 |
11 | tuples = for word in *available_words
12 | totals = 0
13 | counts = {}
14 | for category in *categories
15 | word_counts = category.word_counts
16 | c = word_counts and word_counts[word] or 0
17 | table.insert counts, c
18 | totals += c
19 |
20 | score = if totals == 0
21 | 0
22 | else
23 | mean = totals / #counts
24 | variance = 0
25 | for c in *counts
26 | variance += (c - mean) ^ 2
27 | variance / #counts
28 |
29 | score += math.random! / 1000
30 |
31 | { word, score }
32 |
33 | table.sort tuples, (a, b) -> a[2] > b[2]
34 | [t[1] for t in *tuples[,count]]
35 |
36 | word_probabilities: (categories, available_words) =>
37 | return nil, "at least two categories required" unless #categories >= 2
38 |
39 | available_words = @candidate_words categories, available_words, @opts.max_words
40 | vocab_size = #available_words
41 |
42 | return nil, "no words to score" unless vocab_size > 0
43 |
44 | smoothing = if @opts.default_prob and @opts.default_prob > 0
45 | @opts.default_prob
46 | else
47 | 1e-6
48 |
49 | sum_counts = 0
50 | for category in *categories
51 | sum_counts += category.total_count or 0
52 |
53 | prior_smoothing = smoothing * #categories
54 |
55 | local max_log
56 | log_scores = for category in *categories
57 | cat_total = math.max (category.total_count or 0), 0
58 | prior = (cat_total + smoothing) / (sum_counts + prior_smoothing)
59 | log_score = math.log prior
60 |
61 | denominator = cat_total + (smoothing * vocab_size)
62 | denominator = smoothing * vocab_size if denominator <= 0
63 |
64 | for word in *available_words
65 | word_count = category.word_counts and category.word_counts[word] or 0
66 | log_score += math.log ((word_count + smoothing) / denominator)
67 |
68 | max_log = if max_log
69 | math.max max_log, log_score
70 | else
71 | log_score
72 |
73 | { category, log_score }
74 |
75 | weights = {}
76 | total_weight = 0
77 | for {category, log_score} in *log_scores
78 | weight = math.exp (log_score - max_log)
79 | total_weight += weight
80 | table.insert weights, { category.name, weight }
81 |
82 | return nil, "unable to normalise probabilities" unless total_weight > 0
83 |
84 | for tuple in *weights
85 | tuple[2] /= total_weight
86 |
87 | table.sort weights, (a, b) -> a[2] > b[2]
88 | weights
89 |
--------------------------------------------------------------------------------
/lapis/bayes/classifiers/fisher.lua:
--------------------------------------------------------------------------------
1 | local inv_chi2
2 | inv_chi2 = function(chi, df)
3 | assert(df % 2 == 0, "df must be even")
4 | local m = chi / 2.0
5 | local sum = math.exp(-m)
6 | local term = sum
7 | for i = 1, math.floor(df / 2) do
8 | term = term * (m / i)
9 | sum = sum + term
10 | end
11 | return math.min(sum, 1)
12 | end
13 | local FisherClassifier
14 | do
15 | local _class_0
16 | local _parent_0 = require("lapis.bayes.classifiers.base")
17 | local _base_0 = {
18 | word_probabilities = function(self, categories, available_words)
19 | if not (#categories == 2) then
20 | return nil, "only two categories supported at once"
21 | end
22 | local a, b
23 | a, b = categories[1], categories[2]
24 | local s = self.opts.robs
25 | local x = self.opts.robx
26 | local min_dev = self.opts.min_dev
27 | local mul_a = 0
28 | local mul_b = 0
29 | local kept_tokens = 0
30 | for _index_0 = 1, #available_words do
31 | local word = available_words[_index_0]
32 | local a_count = a.word_counts and a.word_counts[word] or 0
33 | local b_count = b.word_counts and b.word_counts[word] or 0
34 | local p = a_count / (a_count + b_count)
35 | local n = a_count + b_count
36 | local val = ((s * x) + (n * p)) / (s + n)
37 | if not min_dev or math.abs(val - 0.5) > min_dev then
38 | mul_a = mul_a + math.log(val)
39 | mul_b = mul_b + math.log(1 - val)
40 | kept_tokens = kept_tokens + 1
41 | end
42 | end
43 | if kept_tokens == 0 then
44 | return nil, "not enough strong signals to decide"
45 | end
46 | local pa = inv_chi2(-2 * mul_a, 2 * kept_tokens)
47 | local pb = inv_chi2(-2 * mul_b, 2 * kept_tokens)
48 | local p = (1 + pa - pb) / 2
49 | local tuples = {
50 | {
51 | a.name,
52 | p
53 | },
54 | {
55 | b.name,
56 | 1 - p
57 | }
58 | }
59 | table.sort(tuples, function(a, b)
60 | return a[2] > b[2]
61 | end)
62 | return tuples
63 | end
64 | }
65 | _base_0.__index = _base_0
66 | setmetatable(_base_0, _parent_0.__base)
67 | _class_0 = setmetatable({
68 | __init = function(self, ...)
69 | return _class_0.__parent.__init(self, ...)
70 | end,
71 | __base = _base_0,
72 | __name = "FisherClassifier",
73 | __parent = _parent_0
74 | }, {
75 | __index = function(cls, name)
76 | local val = rawget(_base_0, name)
77 | if val == nil then
78 | local parent = rawget(cls, "__parent")
79 | if parent then
80 | return parent[name]
81 | end
82 | else
83 | return val
84 | end
85 | end,
86 | __call = function(cls, ...)
87 | local _self_0 = setmetatable({}, _base_0)
88 | cls.__init(_self_0, ...)
89 | return _self_0
90 | end
91 | })
92 | _base_0.__class = _class_0
93 | local self = _class_0
94 | self.default_options = {
95 | robs = 1,
96 | robx = 0.5,
97 | min_dev = 0.3
98 | }
99 | if _parent_0.__inherited then
100 | _parent_0.__inherited(_parent_0, _class_0)
101 | end
102 | FisherClassifier = _class_0
103 | return _class_0
104 | end
105 |
--------------------------------------------------------------------------------
/lapis/bayes/models/word_classifications.lua:
--------------------------------------------------------------------------------
1 | local db = require("lapis.db")
2 | local Model
3 | Model = require("lapis.bayes.model").Model
4 | local WordClassifications
5 | do
6 | local _class_0
7 | local _parent_0 = Model
8 | local _base_0 = {
9 | delete = function(self)
10 | local deleted, res = _class_0.__parent.__base.delete(self, db.raw("*"))
11 | if deleted then
12 | local removed_row = self.__class:load((unpack(res)))
13 | local Categories
14 | Categories = require("lapis.bayes.models").Categories
15 | db.update(Categories:table_name(), {
16 | total_count = db.raw(db.interpolate_query(" total_count - ?", removed_row.count))
17 | }, {
18 | id = self.category_id
19 | })
20 | return true
21 | end
22 | end,
23 | _increment = function(self, amount)
24 | amount = assert(tonumber(amount), "expecting number")
25 | self:update({
26 | count = db.raw("count + " .. tostring(amount))
27 | })
28 | if self.count == 0 then
29 | return db.delete(self.__class:table_name(), {
30 | category_id = self.category_id,
31 | word = self.word,
32 | count = 0
33 | })
34 | end
35 | end
36 | }
37 | _base_0.__index = _base_0
38 | setmetatable(_base_0, _parent_0.__base)
39 | _class_0 = setmetatable({
40 | __init = function(self, ...)
41 | return _class_0.__parent.__init(self, ...)
42 | end,
43 | __base = _base_0,
44 | __name = "WordClassifications",
45 | __parent = _parent_0
46 | }, {
47 | __index = function(cls, name)
48 | local val = rawget(_base_0, name)
49 | if val == nil then
50 | local parent = rawget(cls, "__parent")
51 | if parent then
52 | return parent[name]
53 | end
54 | else
55 | return val
56 | end
57 | end,
58 | __call = function(cls, ...)
59 | local _self_0 = setmetatable({}, _base_0)
60 | cls.__init(_self_0, ...)
61 | return _self_0
62 | end
63 | })
64 | _base_0.__class = _class_0
65 | local self = _class_0
66 | self.primary_key = {
67 | "category_id",
68 | "word"
69 | }
70 | self.relations = {
71 | {
72 | "category",
73 | belongs_to = "Categories"
74 | }
75 | }
76 | self.find_or_create = function(self, opts)
77 | if opts == nil then
78 | opts = { }
79 | end
80 | return self:find(opts) or self:create(opts)
81 | end
82 | self.purge_word = function(self, word, categories)
83 | local Categories
84 | Categories = require("lapis.bayes.models").Categories
85 | if not (type(categories) == "table") then
86 | categories = {
87 | categories
88 | }
89 | end
90 | local original_count = #categories
91 | assert(original_count > 0, "missing categories")
92 | categories = Categories:find_all(categories, {
93 | key = "name"
94 | })
95 | assert(#categories == original_count, "failed to find all categories specified")
96 | local wcs = self:select("where word = ? and category_id in ?", word, db.list((function()
97 | local _accum_0 = { }
98 | local _len_0 = 1
99 | for _index_0 = 1, #categories do
100 | local c = categories[_index_0]
101 | _accum_0[_len_0] = c.id
102 | _len_0 = _len_0 + 1
103 | end
104 | return _accum_0
105 | end)()))
106 | local count = 0
107 | for _index_0 = 1, #wcs do
108 | local wc = wcs[_index_0]
109 | if wc:delete() then
110 | count = count + 1
111 | end
112 | end
113 | return count > 0, count
114 | end
115 | if _parent_0.__inherited then
116 | _parent_0.__inherited(_parent_0, _class_0)
117 | end
118 | WordClassifications = _class_0
119 | return _class_0
120 | end
121 |
--------------------------------------------------------------------------------
/lapis/bayes/classifiers/test.lua:
--------------------------------------------------------------------------------
1 | local average
2 | average = function(nums)
3 | local sum = 0
4 | for _index_0 = 1, #nums do
5 | local n = nums[_index_0]
6 | sum = sum + n
7 | end
8 | return sum / #nums
9 | end
10 | local weighted_avg
11 | weighted_avg = function(tuples)
12 | local num_tuples = #tuples
13 | local sum = 0
14 | local sum_weight = 0
15 | for _index_0 = 1, #tuples do
16 | local _des_0 = tuples[_index_0]
17 | local num, weight
18 | num, weight = _des_0[1], _des_0[2]
19 | sum = sum + num
20 | sum_weight = sum_weight + weight
21 | end
22 | local avg_weight = sum_weight / num_tuples
23 | local avg = 0
24 | for _index_0 = 1, #tuples do
25 | local _des_0 = tuples[_index_0]
26 | local num, weight
27 | num, weight = _des_0[1], _des_0[2]
28 | avg = avg + ((num / num_tuples) * (weight / avg_weight))
29 | end
30 | return avg
31 | end
32 | local TestClassifier
33 | do
34 | local _class_0
35 | local _parent_0 = require("lapis.bayes.classifiers.base")
36 | local _base_0 = {
37 | word_probabilities = function(self, categories, available_words)
38 | local total_counts = { }
39 | for _index_0 = 1, #categories do
40 | local _continue_0 = false
41 | repeat
42 | local c = categories[_index_0]
43 | if not (c.word_counts) then
44 | _continue_0 = true
45 | break
46 | end
47 | for word, count in pairs(c.word_counts) do
48 | local _update_0 = word
49 | total_counts[_update_0] = total_counts[_update_0] or 0
50 | local _update_1 = word
51 | total_counts[_update_1] = total_counts[_update_1] + count
52 | end
53 | _continue_0 = true
54 | until true
55 | if not _continue_0 then
56 | break
57 | end
58 | end
59 | local probs
60 | do
61 | local _accum_0 = { }
62 | local _len_0 = 1
63 | for _index_0 = 1, #categories do
64 | local c = categories[_index_0]
65 | local tuples
66 | do
67 | local _accum_1 = { }
68 | local _len_1 = 1
69 | for _index_1 = 1, #available_words do
70 | local word = available_words[_index_1]
71 | local total_count = total_counts[word]
72 | local cat_count = c.word_counts and c.word_counts[word] or 0
73 | local _value_0 = {
74 | cat_count / total_count,
75 | total_count
76 | }
77 | _accum_1[_len_1] = _value_0
78 | _len_1 = _len_1 + 1
79 | end
80 | tuples = _accum_1
81 | end
82 | local _value_0 = {
83 | c.name,
84 | weighted_avg(tuples)
85 | }
86 | _accum_0[_len_0] = _value_0
87 | _len_0 = _len_0 + 1
88 | end
89 | probs = _accum_0
90 | end
91 | table.sort(probs, function(a, b)
92 | return a[2] > b[2]
93 | end)
94 | return probs
95 | end
96 | }
97 | _base_0.__index = _base_0
98 | setmetatable(_base_0, _parent_0.__base)
99 | _class_0 = setmetatable({
100 | __init = function(self, ...)
101 | return _class_0.__parent.__init(self, ...)
102 | end,
103 | __base = _base_0,
104 | __name = "TestClassifier",
105 | __parent = _parent_0
106 | }, {
107 | __index = function(cls, name)
108 | local val = rawget(_base_0, name)
109 | if val == nil then
110 | local parent = rawget(cls, "__parent")
111 | if parent then
112 | return parent[name]
113 | end
114 | else
115 | return val
116 | end
117 | end,
118 | __call = function(cls, ...)
119 | local _self_0 = setmetatable({}, _base_0)
120 | cls.__init(_self_0, ...)
121 | return _self_0
122 | end
123 | })
124 | _base_0.__class = _class_0
125 | if _parent_0.__inherited then
126 | _parent_0.__inherited(_parent_0, _class_0)
127 | end
128 | TestClassifier = _class_0
129 | return _class_0
130 | end
131 |
--------------------------------------------------------------------------------
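
A quick standalone sketch (plain Lua, no database needed) of how the weighted_avg helper above behaves, using made-up numbers: tuples whose weight is above the average weight pull the result toward their value.

-- Hypothetical tuples of {probability, total word count}; the counts act as weights.
local tuples = {
  { 0.5, 10 },
  { 1.0, 30 },
}

-- Mirrors weighted_avg from the file above.
local sum_weight = 0
for _, t in ipairs(tuples) do
  sum_weight = sum_weight + t[2]
end
local avg_weight = sum_weight / #tuples -- 20

local avg = 0
for _, t in ipairs(tuples) do
  avg = avg + (t[1] / #tuples) * (t[2] / avg_weight)
end

print(avg) -- 0.875, versus a plain unweighted average of 0.75
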
/spec/postgres_text_tokenizer_spec.moon:
--------------------------------------------------------------------------------
1 | import use_test_env from require "lapis.spec"
2 |
3 | describe "lapis.bayes.tokenizers.postgres_text", ->
4 | use_test_env!
5 |
6 | it "skips words in ignore list", ->
7 | PostgresTextTokenizer = require "lapis.bayes.tokenizers.postgres_text"
8 |
9 | t = PostgresTextTokenizer {
10 | ignore_words: {
11 | hodoc: true
12 | }
13 | }
14 |
15 | assert.same {"delisho"}, t\tokenize_text "12 delisho hodocs for $5.99"
16 |
17 |
18 | it "splits on symbols with option", ->
19 | PostgresTextTokenizer = require "lapis.bayes.tokenizers.postgres_text"
20 |
21 | t = PostgresTextTokenizer {
22 | symbols_split_tokens: true
23 | }
24 |
25 | assert.same {
26 | "buttz"
27 | "com"
28 | "disgust"
29 | "power"
30 | "super"
31 | "wow"
32 | },
33 | t\tokenize_text "wow that was super-disgusting buttz.com power/up"
34 |
35 | it "adds a custom prefilter", ->
36 | PostgresTextTokenizer = require "lapis.bayes.tokenizers.postgres_text"
37 |
38 | t = PostgresTextTokenizer {
39 | filter_text: (text) ->
40 | text\gsub "[%w]+", "%1zoo"
41 | }
42 |
43 | assert.same {"goodzoo", "greatzoo", "stuffzoo", "wowzoo"},
44 | t\tokenize_text "good great stuff wow"
45 |
46 | it "adds a custom token filter", ->
47 | PostgresTextTokenizer = require "lapis.bayes.tokenizers.postgres_text"
48 |
49 | t = PostgresTextTokenizer {
50 | filter_tokens: (tokens) ->
51 | [t\reverse! for t in *tokens]
52 | }
53 |
54 | assert.same {"doog", "taerg", "ffuts", "wow"},
55 | t\tokenize_text "good great stuff wow"
56 |
57 | it "respects min_token_length", ->
58 | PostgresTextTokenizer = require "lapis.bayes.tokenizers.postgres_text"
59 |
60 | t = PostgresTextTokenizer {
61 | min_token_length: 5
62 | }
63 |
64 | assert.same {"great", "stuff"},
65 | t\tokenize_text "hi wow great stuff"
66 |
67 | it "respects max_token_length", ->
68 | PostgresTextTokenizer = require "lapis.bayes.tokenizers.postgres_text"
69 |
70 | t = PostgresTextTokenizer {
71 | max_token_length: 4
72 | }
73 |
74 | assert.same {"good", "wow"},
75 | t\tokenize_text "good great stuff wow"
76 |
77 | it "strips numbers by default", ->
78 | PostgresTextTokenizer = require "lapis.bayes.tokenizers.postgres_text"
79 |
80 | t = PostgresTextTokenizer!
81 |
82 | tokens = t\tokenize_text "cost 99 dollars 5.99"
83 | table.sort tokens
84 | assert.same {"cost", "dollar"},
85 | tokens
86 |
87 | it "keeps numbers when strip_numbers is false", ->
88 | PostgresTextTokenizer = require "lapis.bayes.tokenizers.postgres_text"
89 |
90 | t = PostgresTextTokenizer {
91 | strip_numbers: false
92 | }
93 |
94 | tokens = t\tokenize_text "cost 99 dollars 5.99"
95 | table.sort tokens
96 | assert.same {"5.99", "99", "cost", "dollar"},
97 | tokens
98 |
99 | it "strips HTML tags with strip_tags option", ->
100 | PostgresTextTokenizer = require "lapis.bayes.tokenizers.postgres_text"
101 |
102 | t = PostgresTextTokenizer {
103 | strip_tags: true
104 | }
105 |
106 | assert.same {"hello", "link", "world"},
107 | t\tokenize_text [[hello world
link]]
108 |
109 | it "uses legacy tokenizer that keeps duplicates", ->
110 | PostgresTextTokenizer = require "lapis.bayes.tokenizers.postgres_text"
111 |
112 | t = PostgresTextTokenizer {
113 | legacy_tokenizer: true
114 | }
115 |
116 | tokens = t\tokenize_text "burgers are burgers"
117 | table.sort tokens
118 | assert.same {"burger", "burger"},
119 | tokens
120 |
121 | it "uses custom regconfig", ->
122 | PostgresTextTokenizer = require "lapis.bayes.tokenizers.postgres_text"
123 |
124 | -- Test with french config
125 | t = PostgresTextTokenizer {
126 | regconfig: "french"
127 | }
128 |
129 | -- This should tokenize using French rules
130 | tokens = t\tokenize_text "les maisons"
131 | assert.truthy tokens
132 | assert.truthy #tokens > 0
133 |
--------------------------------------------------------------------------------
/lapis/bayes/tokenizers/postgres_text.lua:
--------------------------------------------------------------------------------
1 | local db = require("lapis.db")
2 | local PostgresTextTokenizer
3 | do
4 | local _class_0
5 | local _parent_0 = require("lapis.bayes.tokenizers.base")
6 | local _base_0 = {
7 | filter_tokens = function(self, tokens)
8 | local opts = self.opts
9 | local min_len = opts and opts.min_token_length or 2
10 | local max_len = opts and opts.max_token_length or 12
11 | local strip_numbers = opts and opts.strip_numbers
12 | if strip_numbers == nil then
13 | strip_numbers = true
14 | end
15 | return (function()
16 | local _accum_0 = { }
17 | local _len_0 = 1
18 | for _index_0 = 1, #tokens do
19 | local _continue_0 = false
20 | repeat
21 | local t = tokens[_index_0]
22 | local t_len = #t
23 | if t_len > max_len then
24 | _continue_0 = true
25 | break
26 | end
27 | if t_len < min_len then
28 | _continue_0 = true
29 | break
30 | end
31 | if strip_numbers and t:match("^[%d%.%/%-]+$") then
32 | _continue_0 = true
33 | break
34 | end
35 | if self.opts and self.opts.ignore_words and self.opts.ignore_words[t] then
36 | _continue_0 = true
37 | break
38 | end
39 | local _value_0 = t
40 | _accum_0[_len_0] = _value_0
41 | _len_0 = _len_0 + 1
42 | _continue_0 = true
43 | until true
44 | if not _continue_0 then
45 | break
46 | end
47 | end
48 | return _accum_0
49 | end)()
50 | end,
51 | slow_pg_tokenize = function(self, text)
52 | local regconfig = self.opts.regconfig or "english"
53 | return db.query([[SELECT unnest(lexemes) AS word FROM ts_debug(?, ?)]], regconfig, text)
54 | end,
55 | pg_tokenize = function(self, text)
56 | local regconfig = self.opts.regconfig or "english"
57 | return db.query([[SELECT unnest(tsvector_to_array(to_tsvector(?, ?))) AS word]], regconfig, text)
58 | end,
59 | tokenize_text = function(self, text)
60 | do
61 | local pre_filter = self.opts.filter_text
62 | if pre_filter then
63 | text = pre_filter(text)
64 | end
65 | end
66 | if self.opts.strip_tags then
67 | local extract_text
68 | extract_text = require("web_sanitize").extract_text
69 | text = extract_text(text)
70 | end
71 | if self.opts.symbols_split_tokens then
72 | text = text:gsub("[%!%@%#%$%%%^%&%*%(%)%[%]%{%}%|%\\%/%`%~%-%_%<%>%,%.]", " ")
73 | end
74 | local res
75 | if self.opts.legacy_tokenizer then
76 | res = self:slow_pg_tokenize(text)
77 | else
78 | res = self:pg_tokenize(text)
79 | end
80 | local tokens = self:filter_tokens((function()
81 | local _accum_0 = { }
82 | local _len_0 = 1
83 | for _index_0 = 1, #res do
84 | local r = res[_index_0]
85 | _accum_0[_len_0] = r.word
86 | _len_0 = _len_0 + 1
87 | end
88 | return _accum_0
89 | end)())
90 | if self.opts.filter_tokens then
91 | tokens = self.opts.filter_tokens(tokens, self.opts)
92 | end
93 | return tokens
94 | end
95 | }
96 | _base_0.__index = _base_0
97 | setmetatable(_base_0, _parent_0.__base)
98 | _class_0 = setmetatable({
99 | __init = function(self, opts)
100 | if opts == nil then
101 | opts = { }
102 | end
103 | self.opts = opts
104 | end,
105 | __base = _base_0,
106 | __name = "PostgresTextTokenizer",
107 | __parent = _parent_0
108 | }, {
109 | __index = function(cls, name)
110 | local val = rawget(_base_0, name)
111 | if val == nil then
112 | local parent = rawget(cls, "__parent")
113 | if parent then
114 | return parent[name]
115 | end
116 | else
117 | return val
118 | end
119 | end,
120 | __call = function(cls, ...)
121 | local _self_0 = setmetatable({}, _base_0)
122 | cls.__init(_self_0, ...)
123 | return _self_0
124 | end
125 | })
126 | _base_0.__class = _class_0
127 | if _parent_0.__inherited then
128 | _parent_0.__inherited(_parent_0, _class_0)
129 | end
130 | PostgresTextTokenizer = _class_0
131 | return _class_0
132 | end
133 |
--------------------------------------------------------------------------------
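
A minimal usage sketch for the tokenizer above, assuming a working Lapis Postgres connection (it issues to_tsvector queries through lapis.db). The option names and defaults come from the spec and the filter_tokens defaults shown above; the input text is made up.

local PostgresTextTokenizer = require("lapis.bayes.tokenizers.postgres_text")

local tokenizer = PostgresTextTokenizer({
  min_token_length = 2,        -- default shown in filter_tokens above
  max_token_length = 12,
  symbols_split_tokens = true, -- split "power/up" style tokens on symbols
  ignore_words = { spam = true },
})

-- Returns an array of stemmed tokens; the default ts_vector path de-duplicates them
local tokens = tokenizer:tokenize_text("12 delicious hot dogs for $5.99")
for _, token in ipairs(tokens) do
  print(token)
end
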
/lapis/bayes/text/punycode.moon:
--------------------------------------------------------------------------------
1 | -- Punycode implementation for internationalized domain names
2 | -- Based on RFC 3492: https://tools.ietf.org/html/rfc3492
3 |
4 | -- Punycode parameters
5 | base = 36
6 | tmin = 1
7 | tmax = 26
8 | skew = 38
9 | damp = 700
10 | initial_bias = 72
11 | initial_n = 128
12 | delimiter = 0x2D -- hyphen-minus
13 |
14 | -- Adapt bias after each delta
15 | adapt = (delta, numpoints, firsttime) ->
16 | delta = if firsttime
17 | math.floor delta / damp
18 | else
19 | math.floor delta / 2
20 |
21 | delta = delta + math.floor delta / numpoints
22 | k = 0
23 |
24 | while delta > math.floor((base - tmin) * tmax / 2)
25 | delta = math.floor delta / (base - tmin)
26 | k = k + base
27 |
28 | k + math.floor ((base - tmin + 1) * delta) / (delta + skew)
29 |
30 | -- Encode a single digit (0-35) to character
31 | encode_digit = (d) ->
32 | if d < 26
33 | string.char d + 0x61 -- a-z
34 | else
35 | string.char d - 26 + 0x30 -- 0-9
36 |
37 | -- Calculate threshold for digit
38 | threshold = (k, bias) ->
39 | if k <= bias + tmin
40 | tmin
41 | elseif k >= bias + tmax
42 | tmax
43 | else
44 | k - bias
45 |
46 | -- Check if character is basic (ASCII)
47 | is_basic = (cp) ->
48 | cp < 0x80
49 |
50 | -- Get UTF8 codepoints from string
51 | utf8_codepoints = (str) ->
52 | codepoints = {}
53 | i = 1
54 | while i <= #str
55 | b = string.byte str, i
56 | cp = nil
57 | len = 1
58 |
59 | if b < 0x80
60 | cp = b
61 | len = 1
62 | elseif b >= 0xC0 and b < 0xE0
63 | b2 = string.byte(str, i + 1) or 0
64 | cp = ((b - 0xC0) * 0x40) + (b2 - 0x80)
65 | len = 2
66 | elseif b >= 0xE0 and b < 0xF0
67 | b2 = string.byte(str, i + 1) or 0
68 | b3 = string.byte(str, i + 2) or 0
69 | cp = ((b - 0xE0) * 0x1000) + ((b2 - 0x80) * 0x40) + (b3 - 0x80)
70 | len = 3
71 | elseif b >= 0xF0 and b < 0xF8
72 | b2 = string.byte(str, i + 1) or 0
73 | b3 = string.byte(str, i + 2) or 0
74 | b4 = string.byte(str, i + 3) or 0
75 | cp = ((b - 0xF0) * 0x40000) + ((b2 - 0x80) * 0x1000) + ((b3 - 0x80) * 0x40) + (b4 - 0x80)
76 | len = 4
77 | else
78 | -- Invalid UTF8, skip
79 | cp = b
80 | len = 1
81 |
82 | table.insert codepoints, cp
83 | i = i + len
84 |
85 | codepoints
86 |
87 | -- Encode a domain label using Punycode
88 | punycode_encode = (label) ->
89 | return label unless label and label != ""
90 |
91 | -- short circuit
92 | if label\match "^[%w%-]+$"
93 | return label
94 |
95 | -- Get codepoints
96 | codepoints = utf8_codepoints label
97 | input_length = #codepoints
98 |
99 | -- Check if all characters are basic (ASCII)
100 | has_nonbasic = false
101 | for cp in *codepoints
102 | if not is_basic cp
103 | has_nonbasic = true
104 | break
105 |
106 | return label unless has_nonbasic
107 |
108 | -- Extract basic characters
109 | output = {}
110 | basic_length = 0
111 |
112 | for cp in *codepoints
113 | if is_basic cp
114 | table.insert output, string.char(cp)
115 | basic_length = basic_length + 1
116 |
117 | -- Add delimiter if we had basic characters
118 | handled = basic_length
119 | if basic_length > 0
120 | table.insert output, string.char(delimiter)
121 |
122 | -- Encode non-basic characters
123 | n = initial_n
124 | bias = initial_bias
125 | delta = 0
126 |
127 | while handled < input_length
128 | -- Find next unhandled codepoint
129 | m = 0x10FFFF + 1
130 | for cp in *codepoints
131 | if cp >= n and cp < m
132 | m = cp
133 |
134 | -- Increase delta
135 | delta = delta + (m - n) * (handled + 1)
136 | n = m
137 |
138 | -- Encode all codepoints up to m
139 | for cp in *codepoints
140 | if cp < n
141 | delta = delta + 1
142 | elseif cp == n
143 | -- Encode delta
144 | q = delta
145 | k = base
146 |
147 | while true
148 | t = threshold k, bias
149 | if q < t
150 | break
151 |
152 | table.insert output, encode_digit(t + ((q - t) % (base - t)))
153 | q = math.floor (q - t) / (base - t)
154 | k = k + base
155 |
156 | table.insert output, encode_digit(q)
157 | bias = adapt delta, handled + 1, handled == basic_length
158 | delta = 0
159 | handled = handled + 1
160 |
161 | delta = delta + 1
162 | n = n + 1
163 |
164 | "xn--" .. table.concat output
165 |
166 | {
167 | :punycode_encode
168 | }
169 |
--------------------------------------------------------------------------------
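
For reference, a small Lua usage sketch of the encoder above; the expected outputs are taken from the fixtures in spec/punycode_spec.moon.

local punycode = require("lapis.bayes.text.punycode")

print(punycode.punycode_encode("münchen")) --> xn--mnchen-3ya
print(punycode.punycode_encode("日本"))     --> xn--wgv71a
print(punycode.punycode_encode("example"))  --> example (ASCII-only labels pass through unchanged)
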
/lapis/bayes/models/categories.lua:
--------------------------------------------------------------------------------
1 | local db = require("lapis.db")
2 | local Model, encode_tuples
3 | do
4 | local _obj_0 = require("lapis.bayes.model")
5 | Model, encode_tuples = _obj_0.Model, _obj_0.encode_tuples
6 | end
7 | local Categories
8 | do
9 | local _class_0
10 | local _parent_0 = Model
11 | local _base_0 = {
12 | delete = function(self)
13 | if _class_0.__parent.__base.delete(self) then
14 | local WordClassifications
15 | WordClassifications = require("lapis.bayes.models").WordClassifications
16 | return db.delete(WordClassifications:table_name(), {
17 | category_id = self.id
18 | })
19 | end
20 | end,
21 | increment = function(self, amount)
22 | amount = assert(tonumber(amount), "expecting number")
23 | return self:update({
24 | total_count = db.raw("total_count + " .. tostring(amount))
25 | })
26 | end,
27 | increment_text = function(self, text, opts)
28 | if opts == nil then
29 | opts = { }
30 | end
31 | return error("This method has been removed, use increment_words instead")
32 | end,
33 | increment_word = function(self, word, count)
34 | local WordClassifications
35 | WordClassifications = require("lapis.bayes.models").WordClassifications
36 | local w = WordClassifications:find_or_create({
37 | category_id = self.id,
38 | word = word
39 | })
40 | w:_increment(count)
41 | return self:increment(count)
42 | end,
43 | increment_words = function(self, counts)
44 | if not (counts) then
45 | return nil, "missing counts"
46 | end
47 | local merged_counts = { }
48 | for k, v in pairs(counts) do
49 | local word, count
50 | if type(k) == "string" then
51 | word, count = k, v
52 | else
53 | word, count = v, 1
54 | end
55 | local _update_0 = word
56 | merged_counts[_update_0] = merged_counts[_update_0] or 0
57 | local _update_1 = word
58 | merged_counts[_update_1] = merged_counts[_update_1] + count
59 | end
60 | local total_count = 0
61 | local tuples
62 | do
63 | local _accum_0 = { }
64 | local _len_0 = 1
65 | for word, count in pairs(merged_counts) do
66 | total_count = total_count + count
67 | local _value_0 = {
68 | self.id,
69 | word,
70 | count
71 | }
72 | _accum_0[_len_0] = _value_0
73 | _len_0 = _len_0 + 1
74 | end
75 | tuples = _accum_0
76 | end
77 | if not (next(tuples)) then
78 | return total_count
79 | end
80 | local WordClassifications
81 | WordClassifications = require("lapis.bayes.models").WordClassifications
82 | local tbl = db.escape_identifier(WordClassifications:table_name())
83 | db.query("\n INSERT INTO " .. tostring(tbl) .. " (category_id, word, count) " .. tostring(encode_tuples(tuples)) .. "\n ON CONFLICT (category_id, word) DO UPDATE SET count = " .. tostring(tbl) .. ".count + EXCLUDED.count\n ")
84 | self:increment(total_count)
85 | return total_count
86 | end
87 | }
88 | _base_0.__index = _base_0
89 | setmetatable(_base_0, _parent_0.__base)
90 | _class_0 = setmetatable({
91 | __init = function(self, ...)
92 | return _class_0.__parent.__init(self, ...)
93 | end,
94 | __base = _base_0,
95 | __name = "Categories",
96 | __parent = _parent_0
97 | }, {
98 | __index = function(cls, name)
99 | local val = rawget(_base_0, name)
100 | if val == nil then
101 | local parent = rawget(cls, "__parent")
102 | if parent then
103 | return parent[name]
104 | end
105 | else
106 | return val
107 | end
108 | end,
109 | __call = function(cls, ...)
110 | local _self_0 = setmetatable({}, _base_0)
111 | cls.__init(_self_0, ...)
112 | return _self_0
113 | end
114 | })
115 | _base_0.__class = _class_0
116 | local self = _class_0
117 | self.timestamp = true
118 | self.relations = {
119 | {
120 | "word_classifications",
121 | has_many = "WordClassifications"
122 | }
123 | }
124 | self.find_or_create = function(self, name)
125 | return self:find({
126 | name = name
127 | }) or self:create({
128 | name = name
129 | })
130 | end
131 | if _parent_0.__inherited then
132 | _parent_0.__inherited(_parent_0, _class_0)
133 | end
134 | Categories = _class_0
135 | return _class_0
136 | end
137 |
--------------------------------------------------------------------------------
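
A brief usage sketch for the model above. It assumes the lapis_bayes migrations have been run against a configured Postgres database; increment_words accepts either an array of tokens or a map of token to count, as handled by the merge loop above, and the tokens here are hypothetical.

local Categories = require("lapis.bayes.models").Categories

local spam = Categories:find_or_create("spam")

-- Array form: each element adds a count of 1 (duplicates accumulate)
spam:increment_words({ "cheap", "pills", "cheap" })

-- Map form: explicit counts per token
local added = spam:increment_words({ cheap = 3, winner = 1 })
print(added) -- 4: number of word occurrences recorded by this call
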
/lapis/bayes/classifiers/bayes.moon:
--------------------------------------------------------------------------------
1 | -- implements naive bayes with assumed probability
2 | class BayesClassifier extends require "lapis.bayes.classifiers.base"
3 | @default_options: {
4 | max_words: 40
5 | default_prob: 0.1
6 | log: false
7 | token_weight_patterns: nil
8 | uncertainty_weight: 1.0
9 | }
10 |
11 | get_token_weight: (word) =>
12 | return 1.0 unless @opts.token_weight_patterns
13 |
14 | for pattern, weight in pairs @opts.token_weight_patterns
15 | if word\match pattern
16 | return weight
17 |
18 | 1.0
19 |
20 | word_probabilities: (categories, available_words, opts={}) =>
21 | opts or= {}
22 | return nil, "only two categories supported at once" unless #categories == 2
23 |
24 | a, b = unpack categories
25 |
26 | sum_counts = 0
27 | for c in *categories
28 | sum_counts += c.total_count
29 |
30 | available_words = @candidate_words categories, available_words, @opts.max_words
31 | available_words_count = #available_words
32 |
33 | unclassified_counts = opts.unclassified_counts or @opts.unclassified_counts
34 | uncertainty_weight = if opts.uncertainty_weight != nil
35 | opts.uncertainty_weight
36 | else
37 | @opts.uncertainty_weight or 1.0
38 | uncertainty_weight = math.max uncertainty_weight, 0
39 |
40 | token_weights = {}
41 | for word in *available_words
42 | weight = @get_token_weight word
43 |
44 | if unclassified_counts
45 | unc = unclassified_counts[word]
46 | if unc and unc > 0
47 | classified_total = 0
48 | classified_total += (a.word_counts and a.word_counts[word]) or 0
49 | classified_total += (b.word_counts and b.word_counts[word]) or 0
50 |
51 | total = classified_total + unc
52 | if total > 0 and uncertainty_weight != 0
53 | confidence = classified_total / total
54 | weight *= confidence ^ uncertainty_weight
55 |
56 | token_weights[word] = weight
57 |
58 | default_prob = @opts.default_prob / sum_counts
59 |
60 | default_a = default_prob * a.total_count
61 | default_b = default_prob * b.total_count
62 |
63 | -- NOTE: you should use log mode if you have a large number of tokens
64 | -- because the numbers get really small
65 | prob = if @opts.log
66 | ai_log_sum = 0
67 | bi_log_sum = 0
68 |
69 | for word in *available_words
70 | ai_count = (a.word_counts and a.word_counts[word] or 0) + default_a
71 | bi_count = (b.word_counts and b.word_counts[word] or 0) + default_b
72 |
73 | weight = token_weights[word] or @get_token_weight word
74 |
75 | ai_log_sum += weight * math.log ai_count
76 | bi_log_sum += weight * math.log bi_count
77 |
78 | ai_log_sum += math.log a.total_count
79 | bi_log_sum += math.log b.total_count
80 |
81 | ai_log_sum -= math.log (default_a + a.total_count)
82 | bi_log_sum -= math.log (default_b + b.total_count)
83 |
84 | ai_log_sum -= math.log available_words_count
85 | bi_log_sum -= math.log available_words_count
86 |
87 | max_log_sum = math.max ai_log_sum, bi_log_sum
88 |
89 | ai_prob = math.exp(ai_log_sum - max_log_sum)
90 | bi_prob = math.exp(bi_log_sum - max_log_sum)
91 |
92 | ai_prob / (ai_prob + bi_prob)
93 | else
94 | local ai_mul, bi_mul
95 |
96 | for word in *available_words
97 | ai_count = (a.word_counts and a.word_counts[word] or 0) + default_a
98 | bi_count = (b.word_counts and b.word_counts[word] or 0) + default_b
99 |
100 | weight = token_weights[word] or @get_token_weight word
101 |
102 | if ai_mul
103 | ai_mul *= ai_count ^ weight
104 | else
105 | ai_mul = ai_count ^ weight
106 |
107 | if bi_mul
108 | bi_mul *= bi_count ^ weight
109 | else
110 | bi_mul = bi_count ^ weight
111 |
112 | ai_prob = a.total_count * ai_mul / ((a.total_count + default_a) * available_words_count)
113 | bi_prob = b.total_count * bi_mul / ((b.total_count + default_b) * available_words_count)
114 |
115 | ai_prob = 0 if ai_prob != ai_prob
116 | bi_prob = 0 if bi_prob != bi_prob
117 |
118 | ai_prob / (ai_prob + bi_prob)
119 |
120 | if prob != prob
121 | return nil, "Got nan when calculating prob"
122 |
123 | if prob == math.huge or prob == -math.huge
124 | return nil, "Got inf when calculating prob"
125 |
126 | tuples = {
127 | { a.name, prob }
128 | { b.name, 1 - prob }
129 | }
130 |
131 | table.sort tuples, (a, b) -> a[2] > b[2]
132 | tuples
133 |
--------------------------------------------------------------------------------
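
To make the scoring above concrete, here is a standalone Lua sketch of the non-log branch with hypothetical counts for two categories and a single shared word; it only mirrors the arithmetic and does not touch the database.

-- Hypothetical trained counts (normally loaded by count_words in the base classifier)
local a = { name = "spam", total_count = 100, word_counts = { cheap = 30 } }
local b = { name = "ham",  total_count = 200, word_counts = { cheap = 5 } }
local words = { "cheap" }

local default_prob = 0.1 / (a.total_count + b.total_count) -- default_prob option / sum_counts
local default_a = default_prob * a.total_count
local default_b = default_prob * b.total_count

-- token weight is 1.0 since no token_weight_patterns are set
local ai_mul = (a.word_counts.cheap + default_a) ^ 1.0
local bi_mul = (b.word_counts.cheap + default_b) ^ 1.0

local ai_prob = a.total_count * ai_mul / ((a.total_count + default_a) * #words)
local bi_prob = b.total_count * bi_mul / ((b.total_count + default_b) * #words)

print(ai_prob / (ai_prob + bi_prob)) -- probability the text belongs to "spam"
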
/lapis/bayes/text/punycode.lua:
--------------------------------------------------------------------------------
1 | local base = 36
2 | local tmin = 1
3 | local tmax = 26
4 | local skew = 38
5 | local damp = 700
6 | local initial_bias = 72
7 | local initial_n = 128
8 | local delimiter = 0x2D
9 | local adapt
10 | adapt = function(delta, numpoints, firsttime)
11 | if firsttime then
12 | delta = math.floor(delta / damp)
13 | else
14 | delta = math.floor(delta / 2)
15 | end
16 | delta = delta + math.floor(delta / numpoints)
17 | local k = 0
18 | while delta > math.floor((base - tmin) * tmax / 2) do
19 | delta = math.floor(delta / (base - tmin))
20 | k = k + base
21 | end
22 | return k + math.floor(((base - tmin + 1) * delta) / (delta + skew))
23 | end
24 | local encode_digit
25 | encode_digit = function(d)
26 | if d < 26 then
27 | return string.char(d + 0x61)
28 | else
29 | return string.char(d - 26 + 0x30)
30 | end
31 | end
32 | local threshold
33 | threshold = function(k, bias)
34 | if k <= bias + tmin then
35 | return tmin
36 | elseif k >= bias + tmax then
37 | return tmax
38 | else
39 | return k - bias
40 | end
41 | end
42 | local is_basic
43 | is_basic = function(cp)
44 | return cp < 0x80
45 | end
46 | local utf8_codepoints
47 | utf8_codepoints = function(str)
48 | local codepoints = { }
49 | local i = 1
50 | while i <= #str do
51 | local b = string.byte(str, i)
52 | local cp = nil
53 | local len = 1
54 | if b < 0x80 then
55 | cp = b
56 | len = 1
57 | elseif b >= 0xC0 and b < 0xE0 then
58 | local b2 = string.byte(str, i + 1) or 0
59 | cp = ((b - 0xC0) * 0x40) + (b2 - 0x80)
60 | len = 2
61 | elseif b >= 0xE0 and b < 0xF0 then
62 | local b2 = string.byte(str, i + 1) or 0
63 | local b3 = string.byte(str, i + 2) or 0
64 | cp = ((b - 0xE0) * 0x1000) + ((b2 - 0x80) * 0x40) + (b3 - 0x80)
65 | len = 3
66 | elseif b >= 0xF0 and b < 0xF8 then
67 | local b2 = string.byte(str, i + 1) or 0
68 | local b3 = string.byte(str, i + 2) or 0
69 | local b4 = string.byte(str, i + 3) or 0
70 | cp = ((b - 0xF0) * 0x40000) + ((b2 - 0x80) * 0x1000) + ((b3 - 0x80) * 0x40) + (b4 - 0x80)
71 | len = 4
72 | else
73 | cp = b
74 | len = 1
75 | end
76 | table.insert(codepoints, cp)
77 | i = i + len
78 | end
79 | return codepoints
80 | end
81 | local punycode_encode
82 | punycode_encode = function(label)
83 | if not (label and label ~= "") then
84 | return label
85 | end
86 | if label:match("^[%w%-]+$") then
87 | return label
88 | end
89 | local codepoints = utf8_codepoints(label)
90 | local input_length = #codepoints
91 | local has_nonbasic = false
92 | for _index_0 = 1, #codepoints do
93 | local cp = codepoints[_index_0]
94 | if not is_basic(cp) then
95 | has_nonbasic = true
96 | break
97 | end
98 | end
99 | if not (has_nonbasic) then
100 | return label
101 | end
102 | local output = { }
103 | local basic_length = 0
104 | for _index_0 = 1, #codepoints do
105 | local cp = codepoints[_index_0]
106 | if is_basic(cp) then
107 | table.insert(output, string.char(cp))
108 | basic_length = basic_length + 1
109 | end
110 | end
111 | local handled = basic_length
112 | if basic_length > 0 then
113 | table.insert(output, string.char(delimiter))
114 | end
115 | local n = initial_n
116 | local bias = initial_bias
117 | local delta = 0
118 | while handled < input_length do
119 | local m = 0x10FFFF + 1
120 | for _index_0 = 1, #codepoints do
121 | local cp = codepoints[_index_0]
122 | if cp >= n and cp < m then
123 | m = cp
124 | end
125 | end
126 | delta = delta + (m - n) * (handled + 1)
127 | n = m
128 | for _index_0 = 1, #codepoints do
129 | local cp = codepoints[_index_0]
130 | if cp < n then
131 | delta = delta + 1
132 | elseif cp == n then
133 | local q = delta
134 | local k = base
135 | while true do
136 | local t = threshold(k, bias)
137 | if q < t then
138 | break
139 | end
140 | table.insert(output, encode_digit(t + ((q - t) % (base - t))))
141 | q = math.floor((q - t) / (base - t))
142 | k = k + base
143 | end
144 | table.insert(output, encode_digit(q))
145 | bias = adapt(delta, handled + 1, handled == basic_length)
146 | delta = 0
147 | handled = handled + 1
148 | end
149 | end
150 | delta = delta + 1
151 | n = n + 1
152 | end
153 | return "xn--" .. table.concat(output)
154 | end
155 | return {
156 | punycode_encode = punycode_encode
157 | }
158 |
--------------------------------------------------------------------------------
/lapis/bayes/tokenizers/ngram.lua:
--------------------------------------------------------------------------------
1 | local NgramTokenizer
2 | do
3 | local _class_0
4 | local _parent_0 = require("lapis.bayes.tokenizers.base")
5 | local _base_0 = {
6 | build_grammar = function(self)
7 | local C, Ct
8 | do
9 | local _obj_0 = require("lpeg")
10 | C, Ct = _obj_0.C, _obj_0.Ct
11 | end
12 | local utf8 = require("lapis.util.utf8")
13 | local whitespace = utf8.whitespace
14 | local printable = utf8.printable_character
15 | local word_chars = printable - whitespace
16 | local word = C(word_chars ^ 1)
17 | return Ct((word + whitespace ^ 1) ^ 0)
18 | end,
19 | normalize_word = function(self, word)
20 | if not (word and word ~= "") then
21 | return
22 | end
23 | local normalized = tostring(word):lower()
24 | normalized = normalized:gsub("[%p]", "")
25 | normalized = normalized:gsub("%s+", "")
26 | if not (normalized ~= "") then
27 | return
28 | end
29 | return normalized
30 | end,
31 | ngram_size = function(self)
32 | local n = tonumber(self.opts.n) or 2
33 | n = math.floor(n)
34 | if n < 1 then
35 | n = 1
36 | end
37 | return n
38 | end,
39 | word_ngrams = function(self, word, n)
40 | local C, Ct
41 | do
42 | local _obj_0 = require("lpeg")
43 | C, Ct = _obj_0.C, _obj_0.Ct
44 | end
45 | local utf8 = require("lapis.util.utf8")
46 | local printable = utf8.printable_character
47 | local char_pattern = Ct((C(printable)) ^ 0)
48 | local chars = char_pattern:match(word)
49 | if not (chars) then
50 | return {
51 | word
52 | }
53 | end
54 | local len = #chars
55 | if len == 0 then
56 | return {
57 | word
58 | }
59 | end
60 | if len < n then
61 | return {
62 | word
63 | }
64 | end
65 | local out = { }
66 | for i = 1, len - n + 1 do
67 | local ngram = table.concat(chars, "", i, i + n - 1)
68 | table.insert(out, ngram)
69 | end
70 | return out
71 | end,
72 | tokenize_text = function(self, text)
73 | if not (text and text ~= "") then
74 | return { }
75 | end
76 | do
77 | local pre_filter = self.opts.filter_text
78 | if pre_filter then
79 | text = pre_filter(text)
80 | if not (text and text ~= "") then
81 | return { }
82 | end
83 | end
84 | end
85 | self.grammar = self.grammar or self:build_grammar()
86 | local words = self.grammar:match(text)
87 | if not (words) then
88 | return { }
89 | end
90 | local n = self:ngram_size()
91 | local ignore_numbers = self.opts.ignore_numbers
92 | if ignore_numbers == nil then
93 | ignore_numbers = true
94 | end
95 | local tokens = { }
96 | for _index_0 = 1, #words do
97 | local _continue_0 = false
98 | repeat
99 | local raw_word = words[_index_0]
100 | local cleaned = self:normalize_word(raw_word)
101 | if not (cleaned) then
102 | _continue_0 = true
103 | break
104 | end
105 | if ignore_numbers and cleaned:match("^%d+$") then
106 | _continue_0 = true
107 | break
108 | end
109 | local _list_0 = self:word_ngrams(cleaned, n)
110 | for _index_1 = 1, #_list_0 do
111 | local token = _list_0[_index_1]
112 | table.insert(tokens, token)
113 | end
114 | _continue_0 = true
115 | until true
116 | if not _continue_0 then
117 | break
118 | end
119 | end
120 | if self.opts.filter_tokens then
121 | tokens = self.opts.filter_tokens(tokens, self.opts)
122 | end
123 | return tokens
124 | end
125 | }
126 | _base_0.__index = _base_0
127 | setmetatable(_base_0, _parent_0.__base)
128 | _class_0 = setmetatable({
129 | __init = function(self, opts)
130 | if opts == nil then
131 | opts = { }
132 | end
133 | self.opts = opts
134 | end,
135 | __base = _base_0,
136 | __name = "NgramTokenizer",
137 | __parent = _parent_0
138 | }, {
139 | __index = function(cls, name)
140 | local val = rawget(_base_0, name)
141 | if val == nil then
142 | local parent = rawget(cls, "__parent")
143 | if parent then
144 | return parent[name]
145 | end
146 | else
147 | return val
148 | end
149 | end,
150 | __call = function(cls, ...)
151 | local _self_0 = setmetatable({}, _base_0)
152 | cls.__init(_self_0, ...)
153 | return _self_0
154 | end
155 | })
156 | _base_0.__class = _class_0
157 | if _parent_0.__inherited then
158 | _parent_0.__inherited(_parent_0, _class_0)
159 | end
160 | NgramTokenizer = _class_0
161 | return _class_0
162 | end
163 |
--------------------------------------------------------------------------------
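
A small usage sketch for the tokenizer above (it requires lpeg and the lapis utf8 helpers it loads): words are lowercased, stripped of punctuation, and split into character n-grams, with words shorter than n kept whole.

local NgramTokenizer = require("lapis.bayes.tokenizers.ngram")

local tokenizer = NgramTokenizer({ n = 3 })

local tokens = tokenizer:tokenize_text("Hello, world!")
-- { "hel", "ell", "llo", "wor", "orl", "rld" }
for _, token in ipairs(tokens) do
  print(token)
end
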
/lapis/bayes/tokenizers/url_domains.lua:
--------------------------------------------------------------------------------
1 | local trim
2 | trim = require("lapis.util").trim
3 | local UrlDomainsTokenizer
4 | do
5 | local _class_0
6 | local _parent_0 = require("lapis.bayes.tokenizers.base")
7 | local _base_0 = {
8 | ignore_domain = function(self, domain)
9 | if not (self.opts and self.opts.ignore_domains) then
10 | return
11 | end
12 | if self.opts.ignore_domains[domain] then
13 | return true
14 | end
15 | while true do
16 | local sub = domain:gsub("^%**%.?[^%.]+", "*")
17 | if sub == domain then
18 | return false
19 | end
20 | if self.opts.ignore_domains[sub] then
21 | return true
22 | end
23 | domain = sub
24 | end
25 | end,
26 | filter_tokens = function(self, urls)
27 | return (function()
28 | local _accum_0 = { }
29 | local _len_0 = 1
30 | for _index_0 = 1, #urls do
31 | local _continue_0 = false
32 | repeat
33 | local url = urls[_index_0]
34 | url = url:lower()
35 | url = trim(url)
36 | url = url:gsub("^%w+://", "")
37 | url = url:gsub("^www%.", "")
38 | url = url:gsub("/.*$", "")
39 | url = trim(url)
40 | url:gsub("<$", "")
41 | url:gsub("^>", "")
42 | if url == "" then
43 | _continue_0 = true
44 | break
45 | end
46 | if url:match("^%w+:") then
47 | _continue_0 = true
48 | break
49 | end
50 | if url:match([=[[<>="' ]]=]) then
51 | _continue_0 = true
52 | break
53 | end
54 | if not (url:match("%.")) then
55 | _continue_0 = true
56 | break
57 | end
58 | if self:ignore_domain(url) then
59 | _continue_0 = true
60 | break
61 | end
62 | local _value_0 = url
63 | _accum_0[_len_0] = _value_0
64 | _len_0 = _len_0 + 1
65 | _continue_0 = true
66 | until true
67 | if not _continue_0 then
68 | break
69 | end
70 | end
71 | return _accum_0
72 | end)()
73 | end,
74 | build_grammar = function(self)
75 | local P, S, R, C, Ct, Cs
76 | do
77 | local _obj_0 = require("lpeg")
78 | P, S, R, C, Ct, Cs = _obj_0.P, _obj_0.S, _obj_0.R, _obj_0.C, _obj_0.Ct, _obj_0.Cs
79 | end
80 | local case_insensitive
81 | case_insensitive = function(text)
82 | local out = nil
83 | for char in text:gmatch(".") do
84 | local p = S(tostring(char:lower()) .. tostring(char:upper()))
85 | if out then
86 | out = out * p
87 | else
88 | out = p
89 | end
90 | end
91 | return out
92 | end
93 |       local unescape_char = P("&gt;") / ">" + P("&lt;") / "<" + P("&amp;") / "&" + P("&nbsp;") / " " + P("&#39;") / "'" + P("&#47;") / "/" + P("&quot;") / '"'
94 | local unescape_text = Cs((unescape_char + 1) ^ 1)
95 | local some_space = S(" \t\n")
96 | local space = some_space ^ 0
97 | local alphanum = R("az", "AZ", "09")
98 | local scheme = case_insensitive("http") * case_insensitive("s") ^ -1 * P("://")
99 | local raw_url = C(scheme * (P(1) - S(" \t\n")) ^ 1)
100 | local word = (alphanum + S("._-")) ^ 1
101 | local attr_value = C(word) + P('"') * C((1 - P('"')) ^ 0) * P('"') + P("'") * C((1 - P("'")) ^ 0) * P("'")
102 | local href = (case_insensitive("href") + case_insensitive("src")) * space * P("=") * space * attr_value / function(v)
103 | return unescape_text:match(v) or ""
104 | end
105 | local simple = C(case_insensitive("www") * (P(".") * (1 - (S("./") + some_space)) ^ 1) ^ 1)
106 | return Ct((raw_url + href + simple + 1) ^ 0)
107 | end,
108 | tokenize_text = function(self, text)
109 | self.grammar = self.grammar or self:build_grammar()
110 | local matches = self.grammar:match(text)
111 | if not (matches) then
112 | return nil, "failed to parse text"
113 | end
114 | return self:filter_tokens(matches)
115 | end
116 | }
117 | _base_0.__index = _base_0
118 | setmetatable(_base_0, _parent_0.__base)
119 | _class_0 = setmetatable({
120 | __init = function(self, opts)
121 | if opts == nil then
122 | opts = { }
123 | end
124 | self.opts = opts
125 | end,
126 | __base = _base_0,
127 | __name = "UrlDomainsTokenizer",
128 | __parent = _parent_0
129 | }, {
130 | __index = function(cls, name)
131 | local val = rawget(_base_0, name)
132 | if val == nil then
133 | local parent = rawget(cls, "__parent")
134 | if parent then
135 | return parent[name]
136 | end
137 | else
138 | return val
139 | end
140 | end,
141 | __call = function(cls, ...)
142 | local _self_0 = setmetatable({}, _base_0)
143 | cls.__init(_self_0, ...)
144 | return _self_0
145 | end
146 | })
147 | _base_0.__class = _class_0
148 | if _parent_0.__inherited then
149 | _parent_0.__inherited(_parent_0, _class_0)
150 | end
151 | UrlDomainsTokenizer = _class_0
152 | return _class_0
153 | end
154 |
--------------------------------------------------------------------------------
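
A usage sketch for the domain tokenizer above; the example text is made up, and the wildcard keys in ignore_domains follow the shape produced by ignore_domain's gsub loop (leading labels collapse into a single "*").

local UrlDomainsTokenizer = require("lapis.bayes.tokenizers.url_domains")

local tokenizer = UrlDomainsTokenizer({
  ignore_domains = {
    ["*.example.com"] = true, -- matches shop.example.com, a.b.example.com, ...
  },
})

local tokens = tokenizer:tokenize_text([[
  Check out https://Shop.Example.com/sale and www.leafo.net for more.
]])
-- { "leafo.net" }: the example.com URL is ignored; scheme, www, and path are stripped
for _, domain in ipairs(tokens) do
  print(domain)
end
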
/lapis/bayes/classifiers/base.moon:
--------------------------------------------------------------------------------
1 | import uniquify from require "lapis.util"
2 |
3 | class BaseClassifier
4 | default_tokenizer: "lapis.bayes.tokenizers.postgres_text"
5 |
6 | new: (@opts={}) =>
7 | if @@default_options
8 | @opts = setmetatable {k,v for k,v in pairs @opts}, __index: @@default_options
9 |
10 | word_probabilities: (categories, words) =>
11 | error "word_probabilities: subclass must implement"
12 |
13 | classify_text: (...) =>
14 | counts, word_rate_or_err = @text_probabilities ...
15 | unless counts
16 | return nil, word_rate_or_err
17 |
18 | counts[1][1], counts[1][2], word_rate_or_err
19 |
20 | tokenize_text: (text) =>
21 | assert text, "missing text to tokenize"
22 |
23 | -- text is some object that is already tokenized
24 | unless type(text) == "string"
25 | return text
26 |
27 | -- custom tokenizer function passed
28 | if @opts.tokenize_text
29 | return @opts.tokenize_text text, @opts
30 |
31 | -- tokenizer instance passed
32 | tokenizer = if @opts.tokenizer
33 | @opts.tokenizer
34 | else
35 | Tokenizer = require @default_tokenizer
36 | Tokenizer(@opts)
37 |
38 | tokenizer\tokenize_text text
39 |
40 | train_text: (category, text, opts) =>
41 | tokens = @tokenize_text text
42 |
43 | if opts and opts.filter_tokens
44 |       tokens = opts.filter_tokens tokens, opts
45 |
46 | import Categories from require "lapis.bayes.models"
47 | category = Categories\find_or_create category
48 | category\increment_words tokens
49 |
50 |   -- category_names: a lua array of category names
51 | -- text: string of text to classify, or an array of tokens to classify
52 | text_probabilities: (category_names, text, opts) =>
53 | opts or= {}
54 |
55 | categories, err = @find_categories category_names
56 |
57 | unless categories
58 | return nil, err
59 |
60 | words = @tokenize_text text
61 |
62 | unless words and next words
63 | return nil, "failed to generate tokens for text"
64 |
65 | available_words, err = @count_words categories, words
66 |
67 | unless available_words
68 | return nil, err
69 |
70 | available_words_set = {word, true for word in *available_words}
71 | count = 0
72 | for word in *words
73 | count +=1 if available_words_set[word]
74 |
75 | token_ratio = count / #words
76 |
77 | probs, err = @word_probabilities categories, available_words, opts
78 | unless probs
79 | return nil, err
80 |
81 | -- put probs in hash table part of result
82 | for {c, p} in *probs
83 | probs[c] = p
84 |
85 | probs, token_ratio
86 |
87 | -- query the category objects by category name
88 | -- returns an array of category records in the same order as the input
89 | find_categories: (category_names) =>
90 | import Categories from require "lapis.bayes.models"
91 | db = Categories.db
92 |
93 | categories = Categories\select "where name in ?", db.list category_names
94 | by_name = {c.name, c for c in *categories}
95 |
96 | local missing
97 |
98 | result = for name in *category_names
99 | c = by_name[name]
100 |
101 | unless c
102 | missing or= {}
103 | table.insert missing, name
104 | continue
105 |
106 | c
107 |
108 | if missing and next missing
109 | return nil, "find_categories: missing categories (#{table.concat missing, ", "})"
110 |
111 | result
112 |
113 | -- query for WordClassifications for the requested category ids
114 | -- both arguments are arrays
115 | -- returns WordClassifications in no particular order
116 | find_word_classifications: (words, category_ids) =>
117 | return {} unless next(words) and next category_ids
118 |
119 | import WordClassifications from require "lapis.bayes.models"
120 | db = WordClassifications.db
121 | WordClassifications\select "where word in ? and category_id in ?", db.list(words), db.list(category_ids)
122 |
123 | -- reduce the set of available words by looking for polarizing words
124 | -- categories: array of category objects
125 | -- available_words: array of available words
126 | -- count: the max length of returned words array
127 | candidate_words: (categories, available_words, count) =>
128 | return available_words if #available_words <= count
129 |
130 | assert #categories == 2, "can only do two categories"
131 |
132 | a,b = unpack categories
133 | -- calculate conflict words
134 | tuples = for word in *available_words
135 | a_count = a.word_counts and a.word_counts[word] or 0
136 | b_count = b.word_counts and b.word_counts[word] or 0
137 |
138 | {
139 | word
140 | math.random! / 100 + math.abs (a_count - b_count) / math.sqrt a_count + b_count
141 | a_count
142 | b_count
143 | }
144 |
145 | table.sort tuples, (a,b) ->
146 | a[2] > b[2]
147 |
148 | [t[1] for t in *tuples[,count]]
149 |
150 |   -- load each category's word counts for the given words, and return the list
151 |   -- of words that appear in at least one category
152 | --
153 | -- categories: array of categories
154 | -- words: array of tokens
155 | count_words: (categories, words) =>
156 | categories_by_id = {c.id, c for c in *categories}
157 | words = uniquify words
158 |
159 | wcs = @find_word_classifications words, [c.id for c in *categories]
160 |
161 | available_words = [word for word in pairs {wc.word, true for wc in *wcs}]
162 |
163 | if #available_words == 0
164 | return nil, "no words in text are classifyable"
165 |
166 | for wc in *wcs
167 | category = categories_by_id[wc.category_id]
168 | category.word_counts or= {}
169 | category.word_counts[wc.word] = wc.count
170 |
171 | available_words
172 |
--------------------------------------------------------------------------------
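
An end-to-end usage sketch of the base classifier API above, using the two-category bayes classifier defined elsewhere in this repo. It assumes a configured Lapis Postgres connection with the lapis_bayes migrations applied; the category names and training strings are made up.

local BayesClassifier = require("lapis.bayes.classifiers.bayes")
local classifier = BayesClassifier()

-- train_text tokenizes the text and increments word counts for the category
classifier:train_text("spam", "cheap pills, buy now, limited offer")
classifier:train_text("ham", "meeting notes and agenda for tomorrow")

-- classify_text returns the winning category, its probability, and the
-- fraction of input tokens that were found in the training data
local category, prob, token_ratio = classifier:classify_text({ "spam", "ham" }, "buy cheap pills")
print(category, prob, token_ratio)
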
/lapis/bayes/classifiers/bayes_multi.lua:
--------------------------------------------------------------------------------
1 | local BayesMultiClassifier
2 | do
3 | local _class_0
4 | local _parent_0 = require("lapis.bayes.classifiers.base")
5 | local _base_0 = {
6 | candidate_words = function(self, categories, available_words, count)
7 | if not (count and count < #available_words) then
8 | return available_words
9 | end
10 | local tuples
11 | do
12 | local _accum_0 = { }
13 | local _len_0 = 1
14 | for _index_0 = 1, #available_words do
15 | local word = available_words[_index_0]
16 | local totals = 0
17 | local counts = { }
18 | for _index_1 = 1, #categories do
19 | local category = categories[_index_1]
20 | local word_counts = category.word_counts
21 | local c = word_counts and word_counts[word] or 0
22 | table.insert(counts, c)
23 | totals = totals + c
24 | end
25 | local score
26 | if totals == 0 then
27 | score = 0
28 | else
29 | local mean = totals / #counts
30 | local variance = 0
31 | for _index_1 = 1, #counts do
32 | local c = counts[_index_1]
33 | variance = variance + ((c - mean) ^ 2)
34 | end
35 | score = variance / #counts
36 | end
37 | score = score + (math.random() / 1000)
38 | local _value_0 = {
39 | word,
40 | score
41 | }
42 | _accum_0[_len_0] = _value_0
43 | _len_0 = _len_0 + 1
44 | end
45 | tuples = _accum_0
46 | end
47 | table.sort(tuples, function(a, b)
48 | return a[2] > b[2]
49 | end)
50 | local _accum_0 = { }
51 | local _len_0 = 1
52 | local _max_0 = count
53 | for _index_0 = 1, _max_0 < 0 and #tuples + _max_0 or _max_0 do
54 | local t = tuples[_index_0]
55 | _accum_0[_len_0] = t[1]
56 | _len_0 = _len_0 + 1
57 | end
58 | return _accum_0
59 | end,
60 | word_probabilities = function(self, categories, available_words)
61 | if not (#categories >= 2) then
62 | return nil, "at least two categories required"
63 | end
64 | available_words = self:candidate_words(categories, available_words, self.opts.max_words)
65 | local vocab_size = #available_words
66 | if not (vocab_size > 0) then
67 | return nil, "no words to score"
68 | end
69 | local smoothing
70 | if self.opts.default_prob and self.opts.default_prob > 0 then
71 | smoothing = self.opts.default_prob
72 | else
73 | smoothing = 1e-6
74 | end
75 | local sum_counts = 0
76 | for _index_0 = 1, #categories do
77 | local category = categories[_index_0]
78 | sum_counts = sum_counts + (category.total_count or 0)
79 | end
80 | local prior_smoothing = smoothing * #categories
81 | local max_log
82 | local log_scores
83 | do
84 | local _accum_0 = { }
85 | local _len_0 = 1
86 | for _index_0 = 1, #categories do
87 | local category = categories[_index_0]
88 | local cat_total = math.max((category.total_count or 0), 0)
89 | local prior = (cat_total + smoothing) / (sum_counts + prior_smoothing)
90 | local log_score = math.log(prior)
91 | local denominator = cat_total + (smoothing * vocab_size)
92 | if denominator <= 0 then
93 | denominator = smoothing * vocab_size
94 | end
95 | for _index_1 = 1, #available_words do
96 | local word = available_words[_index_1]
97 | local word_count = category.word_counts and category.word_counts[word] or 0
98 | log_score = log_score + math.log(((word_count + smoothing) / denominator))
99 | end
100 | if max_log then
101 | max_log = math.max(max_log, log_score)
102 | else
103 | max_log = log_score
104 | end
105 | local _value_0 = {
106 | category,
107 | log_score
108 | }
109 | _accum_0[_len_0] = _value_0
110 | _len_0 = _len_0 + 1
111 | end
112 | log_scores = _accum_0
113 | end
114 | local weights = { }
115 | local total_weight = 0
116 | for _index_0 = 1, #log_scores do
117 | local _des_0 = log_scores[_index_0]
118 | local category, log_score
119 | category, log_score = _des_0[1], _des_0[2]
120 | local weight = math.exp((log_score - max_log))
121 | total_weight = total_weight + weight
122 | table.insert(weights, {
123 | category.name,
124 | weight
125 | })
126 | end
127 | if not (total_weight > 0) then
128 | return nil, "unable to normalise probabilities"
129 | end
130 | for _index_0 = 1, #weights do
131 | local tuple = weights[_index_0]
132 | local _update_0 = 2
133 | tuple[_update_0] = tuple[_update_0] / total_weight
134 | end
135 | table.sort(weights, function(a, b)
136 | return a[2] > b[2]
137 | end)
138 | return weights
139 | end
140 | }
141 | _base_0.__index = _base_0
142 | setmetatable(_base_0, _parent_0.__base)
143 | _class_0 = setmetatable({
144 | __init = function(self, ...)
145 | return _class_0.__parent.__init(self, ...)
146 | end,
147 | __base = _base_0,
148 | __name = "BayesMultiClassifier",
149 | __parent = _parent_0
150 | }, {
151 | __index = function(cls, name)
152 | local val = rawget(_base_0, name)
153 | if val == nil then
154 | local parent = rawget(cls, "__parent")
155 | if parent then
156 | return parent[name]
157 | end
158 | else
159 | return val
160 | end
161 | end,
162 | __call = function(cls, ...)
163 | local _self_0 = setmetatable({}, _base_0)
164 | cls.__init(_self_0, ...)
165 | return _self_0
166 | end
167 | })
168 | _base_0.__class = _class_0
169 | local self = _class_0
170 | self.default_options = {
171 | max_words = 40,
172 | default_prob = 0.1
173 | }
174 | if _parent_0.__inherited then
175 | _parent_0.__inherited(_parent_0, _class_0)
176 | end
177 | BayesMultiClassifier = _class_0
178 | return _class_0
179 | end
180 |
--------------------------------------------------------------------------------
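
A short usage sketch for the multi-category classifier above; as with the other classifiers it assumes trained categories already exist in the database, and the category names here are hypothetical.

local BayesMultiClassifier = require("lapis.bayes.classifiers.bayes_multi")
local classifier = BayesMultiClassifier({ max_words = 40 })

-- text_probabilities returns tuples of {category_name, probability}, sorted
-- descending, plus the ratio of input tokens found in the training data
local probs, token_ratio = classifier:text_probabilities(
  { "english", "french", "spanish" },
  "bonjour tout le monde"
)

if probs then
  for _, tuple in ipairs(probs) do
    print(tuple[1], tuple[2])
  end
else
  print("classification failed:", token_ratio) -- second return value holds the error here
end
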
/spec/punycode_spec.moon:
--------------------------------------------------------------------------------
1 | punycode = require "lapis.bayes.text.punycode"
2 |
3 | describe "lapis.bayes.text.punycode", ->
4 | describe "punycode_encode", ->
5 | fixtures = {
6 | { description: "German umlaut: münchen", label: "münchen", expected: "xn--mnchen-3ya" }
7 | { description: "German umlaut: müller", label: "müller", expected: "xn--mller-kva" }
8 | { description: "German umlaut: bücher", label: "bücher", expected: "xn--bcher-kva" }
9 | { description: "French accent: français", label: "français", expected: "xn--franais-xxa" }
10 | { description: "French accent: café", label: "café", expected: "xn--caf-dma" }
11 | { description: "Spanish tilde: español", label: "español", expected: "xn--espaol-zwa" }
12 | { description: "Spanish tilde: mañana", label: "mañana", expected: "xn--maana-pta" }
13 | { description: "Japanese kanji: 日本", label: "日本", expected: "xn--wgv71a" }
14 | { description: "Japanese hiragana: こんにちは", label: "こんにちは", expected: "xn--28j2a3ar1p" }
15 | { description: "Japanese katakana: テスト", label: "テスト", expected: "xn--zckzah" }
16 | { description: "Chinese simplified: 中国", label: "中国", expected: "xn--fiqs8s" }
17 | { description: "Chinese traditional: 中國", label: "中國", expected: "xn--fiqz9s" }
18 | { description: "Korean hangul: 한국", label: "한국", expected: "xn--3e0b707e" }
19 | { description: "Arabic: العربية", label: "العربية", expected: "xn--mgbcd4a2b0d2b" }
20 | { description: "Russian cyrillic: россия", label: "россия", expected: "xn--h1alffa9f" }
21 | { description: "Greek: ελληνικά", label: "ελληνικά", expected: "xn--hxargifdar" }
22 | { description: "Hebrew: עברית", label: "עברית", expected: "xn--5dbqzzl" }
23 | { description: "Thai: ไทย", label: "ไทย", expected: "xn--o3cw4h" }
24 | { description: "Mixed ASCII & Unicode: bücher-buch", label: "bücher-buch", expected: "xn--bcher-buch-9db" }
25 | { description: "Mixed ASCII & Unicode: hello世界", label: "hello世界", expected: "xn--hello-ck1hg65u" }
26 | { description: "Single Unicode codepoint: ü", label: "ü", expected: "xn--tda" }
27 | { description: "Single Unicode codepoint: ñ", label: "ñ", expected: "xn--ida" }
28 | { description: "Numeric suffix: 123ü", label: "123ü", expected: "xn--123-joa" }
29 | { description: "Leading hyphen: -ü", label: "-ü", expected: "xn----eha" }
30 | { description: "Swiss city: zürich", label: "zürich", expected: "xn--zrich-kva" }
31 | { description: "Russian city: москва", label: "москва", expected: "xn--80adxhks" }
32 | { description: "Arabic city: القاهرة", label: "القاهرة", expected: "xn--mgbag5a2flx" }
33 | { description: "Hyphen only label", label: "---", expected: "---" }
34 | { description: "German compound: bücher-bücherei", label: "bücher-bücherei", expected: "xn--bcher-bcherei-wobg" }
35 | { description: "Czech example", label: "Pročprostěnemluvíčesky", expected: "xn--Proprostnemluvesky-uyb24dma41a" }
36 | { description: "Chinese (simplified) example", label: "他们为什么不说中文", expected: "xn--ihqwcrb4cv8a8dqg056pqjye" }
37 | { description: "Chinese (traditional) example", label: "他們爲什麽不說中文", expected: "xn--ihqwctvzc91f659drss3x8bo0yb" }
38 | { description: "Arabic example", label: "ليهمابتكلموشعربي؟", expected: "xn--egbpdaj6bu4bxfgehfvwxn" }
39 | { description: "Hebrew example", label: "למההםפשוטלאמדבריםעברית", expected: "xn--4dbcagdahymbxekheh6e0a7fei0b" }
40 | { description: "Hindi example", label: "यहलोगहिन्दीक्योंनहींबोलसकतेहैं", expected: "xn--i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd" }
41 | { description: "Japanese sentence", label: "なぜみんな日本語を話してくれないのか", expected: "xn--n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa" }
42 | { description: "Korean example", label: "세계의모든사람들이한국어를이해한다면얼마나좋을까", expected: "xn--989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5jpsd879ccm6fea98c" }
43 | { description: "Russian example", label: "почемужеонинеговорятпорусски", expected: "xn--b1abfaaepdrnnbgefbadotcwatmq2g4l" }
44 | { description: "Spanish sentence", label: "PorquénopuedensimplementehablarenEspañol", expected: "xn--PorqunopuedensimplementehablarenEspaol-fmd56a" }
45 | { description: "Vietnamese example", label: "TạisaohọkhôngthểchỉnóitiếngViệt", expected: "xn--TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g" }
46 | { description: "Mixed example: 3年B組金八先生", label: "3年B組金八先生", expected: "xn--3B-ww4c5e180e575a65lsy2b" }
47 | { description: "Mixed example: 安室奈美恵-with-SUPER-MONKEYS", label: "安室奈美恵-with-SUPER-MONKEYS", expected: "xn---with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n" }
48 | { description: "Mixed example: Hello-Another-Way-それぞれの場所", label: "Hello-Another-Way-それぞれの場所", expected: "xn--Hello-Another-Way--fc4qua05auwb3674vfr0b" }
49 | { description: "Mixed example: ひとつ屋根の下2", label: "ひとつ屋根の下2", expected: "xn--2-u9tlzr9756bt3uc0v" }
50 | { description: "Mixed example: MajiでKoiする5秒前", label: "MajiでKoiする5秒前", expected: "xn--MajiKoi5-783gue6qz075azm5e" }
51 | { description: "Mixed example: パフィーdeルンバ", label: "パフィーdeルンバ", expected: "xn--de-jg4avhby1noc0d" }
52 | { description: "Mixed example: そのスピードで", label: "そのスピードで", expected: "xn--d9juau41awczczp" }
53 | }
54 |
55 | it "passes through ASCII-only strings unchanged", ->
56 | assert.same "example", punycode.punycode_encode "example"
57 | assert.same "test", punycode.punycode_encode "test"
58 | assert.same "hello-world", punycode.punycode_encode "hello-world"
59 | assert.same "abc123", punycode.punycode_encode "abc123"
60 |
61 | it "handles empty string", ->
62 | assert.same "", punycode.punycode_encode ""
63 |
64 | describe "fixture encodings", ->
65 | for case in *fixtures
66 | it "encodes #{case.description}", ->
67 | assert.same case.expected, punycode.punycode_encode case.label
68 |
69 | describe "ASCII boundary behaviour", ->
70 | it "preserves leading ASCII characters", ->
71 | result = punycode.punycode_encode "test日本"
72 | assert.true (result\match "^xn%-%-test") != nil
73 |
74 | it "handles trailing hyphen with Unicode", ->
75 | result = punycode.punycode_encode "test-ü"
76 | assert.true (result\match "^xn%-%-") != nil
77 |
78 | it "preserves case for ASCII characters", ->
79 | result = punycode.punycode_encode "Test日本"
80 | assert.true (result\match "Test") != nil
81 |
82 | it "handles emoji", ->
83 | result = punycode.punycode_encode "💩"
84 | assert.is_string result
85 |
--------------------------------------------------------------------------------
/lapis/bayes/classifiers/bayes.lua:
--------------------------------------------------------------------------------
1 | local BayesClassifier
2 | do
3 | local _class_0
4 | local _parent_0 = require("lapis.bayes.classifiers.base")
5 | local _base_0 = {
6 | get_token_weight = function(self, word)
7 | if not (self.opts.token_weight_patterns) then
8 | return 1.0
9 | end
10 | for pattern, weight in pairs(self.opts.token_weight_patterns) do
11 | if word:match(pattern) then
12 | return weight
13 | end
14 | end
15 | return 1.0
16 | end,
17 | word_probabilities = function(self, categories, available_words, opts)
18 | if opts == nil then
19 | opts = { }
20 | end
21 | opts = opts or { }
22 | if not (#categories == 2) then
23 | return nil, "only two categories supported at once"
24 | end
25 | local a, b = unpack(categories)
26 | local sum_counts = 0
27 | for _index_0 = 1, #categories do
28 | local c = categories[_index_0]
29 | sum_counts = sum_counts + c.total_count
30 | end
31 | available_words = self:candidate_words(categories, available_words, self.opts.max_words)
32 | local available_words_count = #available_words
33 | local unclassified_counts = opts.unclassified_counts or self.opts.unclassified_counts
34 | local uncertainty_weight
35 | if opts.uncertainty_weight ~= nil then
36 | uncertainty_weight = opts.uncertainty_weight
37 | else
38 | uncertainty_weight = self.opts.uncertainty_weight or 1.0
39 | end
40 | uncertainty_weight = math.max(uncertainty_weight, 0)
41 | local token_weights = { }
42 | for _index_0 = 1, #available_words do
43 | local word = available_words[_index_0]
44 | local weight = self:get_token_weight(word)
45 | if unclassified_counts then
46 | local unc = unclassified_counts[word]
47 | if unc and unc > 0 then
48 | local classified_total = 0
49 | classified_total = classified_total + ((a.word_counts and a.word_counts[word]) or 0)
50 | classified_total = classified_total + ((b.word_counts and b.word_counts[word]) or 0)
51 | local total = classified_total + unc
52 | if total > 0 and uncertainty_weight ~= 0 then
53 | local confidence = classified_total / total
54 | weight = weight * (confidence ^ uncertainty_weight)
55 | end
56 | end
57 | end
58 | token_weights[word] = weight
59 | end
60 | local default_prob = self.opts.default_prob / sum_counts
61 | local default_a = default_prob * a.total_count
62 | local default_b = default_prob * b.total_count
63 | local prob
64 | if self.opts.log then
65 | local ai_log_sum = 0
66 | local bi_log_sum = 0
67 | for _index_0 = 1, #available_words do
68 | local word = available_words[_index_0]
69 | local ai_count = (a.word_counts and a.word_counts[word] or 0) + default_a
70 | local bi_count = (b.word_counts and b.word_counts[word] or 0) + default_b
71 | local weight = token_weights[word] or self:get_token_weight(word)
72 | ai_log_sum = ai_log_sum + (weight * math.log(ai_count))
73 | bi_log_sum = bi_log_sum + (weight * math.log(bi_count))
74 | end
75 | ai_log_sum = ai_log_sum + math.log(a.total_count)
76 | bi_log_sum = bi_log_sum + math.log(b.total_count)
77 | ai_log_sum = ai_log_sum - math.log((default_a + a.total_count))
78 | bi_log_sum = bi_log_sum - math.log((default_b + b.total_count))
79 | ai_log_sum = ai_log_sum - math.log(available_words_count)
80 | bi_log_sum = bi_log_sum - math.log(available_words_count)
81 | local max_log_sum = math.max(ai_log_sum, bi_log_sum)
82 | local ai_prob = math.exp(ai_log_sum - max_log_sum)
83 | local bi_prob = math.exp(bi_log_sum - max_log_sum)
84 | prob = ai_prob / (ai_prob + bi_prob)
85 | else
86 | local ai_mul, bi_mul
87 | for _index_0 = 1, #available_words do
88 | local word = available_words[_index_0]
89 | local ai_count = (a.word_counts and a.word_counts[word] or 0) + default_a
90 | local bi_count = (b.word_counts and b.word_counts[word] or 0) + default_b
91 | local weight = token_weights[word] or self:get_token_weight(word)
92 | if ai_mul then
93 | ai_mul = ai_mul * (ai_count ^ weight)
94 | else
95 | ai_mul = ai_count ^ weight
96 | end
97 | if bi_mul then
98 | bi_mul = bi_mul * (bi_count ^ weight)
99 | else
100 | bi_mul = bi_count ^ weight
101 | end
102 | end
103 | local ai_prob = a.total_count * ai_mul / ((a.total_count + default_a) * available_words_count)
104 | local bi_prob = b.total_count * bi_mul / ((b.total_count + default_b) * available_words_count)
105 | if ai_prob ~= ai_prob then
106 | ai_prob = 0
107 | end
108 | if bi_prob ~= bi_prob then
109 | bi_prob = 0
110 | end
111 | prob = ai_prob / (ai_prob + bi_prob)
112 | end
113 | if prob ~= prob then
114 | return nil, "Got nan when calculating prob"
115 | end
116 | if prob == math.huge or prob == -math.huge then
117 | return nil, "Got inf when calculating prob"
118 | end
119 | local tuples = {
120 | {
121 | a.name,
122 | prob
123 | },
124 | {
125 | b.name,
126 | 1 - prob
127 | }
128 | }
129 | table.sort(tuples, function(a, b)
130 | return a[2] > b[2]
131 | end)
132 | return tuples
133 | end
134 | }
135 | _base_0.__index = _base_0
136 | setmetatable(_base_0, _parent_0.__base)
137 | _class_0 = setmetatable({
138 | __init = function(self, ...)
139 | return _class_0.__parent.__init(self, ...)
140 | end,
141 | __base = _base_0,
142 | __name = "BayesClassifier",
143 | __parent = _parent_0
144 | }, {
145 | __index = function(cls, name)
146 | local val = rawget(_base_0, name)
147 | if val == nil then
148 | local parent = rawget(cls, "__parent")
149 | if parent then
150 | return parent[name]
151 | end
152 | else
153 | return val
154 | end
155 | end,
156 | __call = function(cls, ...)
157 | local _self_0 = setmetatable({}, _base_0)
158 | cls.__init(_self_0, ...)
159 | return _self_0
160 | end
161 | })
162 | _base_0.__class = _class_0
163 | local self = _class_0
164 | self.default_options = {
165 | max_words = 40,
166 | default_prob = 0.1,
167 | log = false,
168 | token_weight_patterns = nil,
169 | uncertainty_weight = 1.0
170 | }
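-- These defaults are merged under any options passed to the constructor: BaseClassifier's
-- __init copies the caller's opts table and falls back to default_options via __index.
-- As a usage sketch (option names are the ones defined above), something like
-- BayesClassifier({ log = true, max_words = 20 }) would override just those two values.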
171 | if _parent_0.__inherited then
172 | _parent_0.__inherited(_parent_0, _class_0)
173 | end
174 | BayesClassifier = _class_0
175 | return _class_0
176 | end
177 |
--------------------------------------------------------------------------------
/spec/unaccent_spec.moon:
--------------------------------------------------------------------------------
1 |
2 | unaccent = require "lapis.bayes.text.unaccent"
3 |
4 | describe "lapis.bayes.text.unaccent", ->
5 | describe "unaccent_string", ->
6 | it "passes through basic ASCII unchanged", ->
7 | assert.same "hello world", unaccent.unaccent_string "hello world"
8 | assert.same "abc123", unaccent.unaccent_string "abc123"
9 | assert.same "test", unaccent.unaccent_string "test"
10 |
11 | it "handles empty string", ->
12 | assert.same "", unaccent.unaccent_string ""
13 |
14 | it "converts fullwidth characters to ASCII", ->
15 | assert.same "abc", unaccent.unaccent_string "abc"
16 | assert.same "ABC", unaccent.unaccent_string "ABC"
17 | assert.same "123", unaccent.unaccent_string "123"
18 |
19 | it "converts mathematical alphanumerics", ->
20 | assert.same "abc", unaccent.unaccent_string "𝕒𝕓𝕔"
21 | assert.same "xyz", unaccent.unaccent_string "𝚡𝚢𝚣"
22 | assert.same "ABC", unaccent.unaccent_string "𝓐𝓑𝓒"
23 |
24 | it "converts mathematical bold letters", ->
25 | assert.same "SaleIsLiveCheckNow", unaccent.unaccent_string "𝐒𝐚𝐥𝐞𝐈𝐬𝐋𝐢𝐯𝐞𝐂𝐡𝐞𝐜𝐤𝐍𝐨𝐰"
26 | assert.same "ABC", unaccent.unaccent_string "𝐀𝐁𝐂"
27 | assert.same "xyz", unaccent.unaccent_string "𝐱𝐲𝐳"
28 |
29 | it "removes accents from Latin characters", ->
30 | assert.same "aeiou", unaccent.unaccent_string "àéíóú"
31 | assert.same "AEIOU", unaccent.unaccent_string "ÀÉÍÓÚ"
32 | assert.same "nca", unaccent.unaccent_string "ñçä"
33 |
34 | it "converts Greek letters to Latin", ->
35 | assert.same "a", unaccent.unaccent_string "α"
36 | assert.same "y", unaccent.unaccent_string "γ"
37 | assert.same "n", unaccent.unaccent_string "π"
38 | assert.same "o", unaccent.unaccent_string "ο"
39 |
40 | it "converts Cyrillic letters to Latin", ->
41 | assert.same "a", unaccent.unaccent_string "а"
42 | assert.same "e", unaccent.unaccent_string "е"
43 | assert.same "o", unaccent.unaccent_string "о"
44 |
45 | it "normalizes special punctuation", ->
46 | assert.same ".", unaccent.unaccent_string "。"
47 | assert.same ",", unaccent.unaccent_string ","
48 | assert.same ":", unaccent.unaccent_string ":"
49 | assert.same "!", unaccent.unaccent_string "!"
50 |
51 | it "normalizes mathematical operators", ->
52 | assert.same "==", unaccent.unaccent_string "⩵"
53 | assert.same "===", unaccent.unaccent_string "⩶"
54 | assert.same "::=", unaccent.unaccent_string "⩴"
55 |
56 | it "normalizes brackets", ->
57 | assert.same "[", unaccent.unaccent_string "["
58 | assert.same "]", unaccent.unaccent_string "]"
59 | assert.same "{", unaccent.unaccent_string "{"
60 | assert.same "}", unaccent.unaccent_string "}"
61 |
62 | it "converts special number forms", ->
63 | assert.same "0", unaccent.unaccent_string "0"
64 | assert.same " 1/2", unaccent.unaccent_string "½"
65 | assert.same " 1/4", unaccent.unaccent_string "¼"
66 | assert.same " 3/4", unaccent.unaccent_string "¾"
67 |
68 | it "converts Roman numerals", ->
69 | assert.same "1", unaccent.unaccent_string "Ⅰ"
70 | assert.same "IV", unaccent.unaccent_string "Ⅳ"
71 | assert.same "XII", unaccent.unaccent_string "Ⅻ"
72 |
73 | it "converts circled numbers", ->
74 | assert.same "1", unaccent.unaccent_string "①"
75 | assert.same "10", unaccent.unaccent_string "⑩"
76 | assert.same "20", unaccent.unaccent_string "⑳"
77 |
78 | it "converts enclosed alphanumerics", ->
79 | assert.same "(1)", unaccent.unaccent_string "⑴"
80 | assert.same "(a)", unaccent.unaccent_string "⒜"
81 | assert.same "1.", unaccent.unaccent_string "⒈"
82 |
83 | it "handles mixed character types", ->
84 | assert.same "hello123", unaccent.unaccent_string "hello123"
85 | assert.same "test.com", unaccent.unaccent_string "test。com"
86 |
87 | it "handles characters that should pass through", ->
88 | result = unaccent.unaccent_string "hello-world_test"
89 | assert.same "hello-world_test", result
90 |
91 | it "handles ligatures", ->
92 | assert.same "fi", unaccent.unaccent_string "fi"
93 | assert.same "fl", unaccent.unaccent_string "fl"
94 | assert.same "ffi", unaccent.unaccent_string "ffi"
95 | assert.same "ffl", unaccent.unaccent_string "ffl"
96 | assert.same "st", unaccent.unaccent_string "st"
97 |
98 | it "handles special letter forms", ->
99 | assert.same "ss", unaccent.unaccent_string "ß"
100 | assert.same "SS", unaccent.unaccent_string "ẞ"
101 | assert.same "ae", unaccent.unaccent_string "æ"
102 | assert.same "AE", unaccent.unaccent_string "Æ"
103 | assert.same "oe", unaccent.unaccent_string "œ"
104 | assert.same "OE", unaccent.unaccent_string "Œ"
105 |
106 | describe "comprehensive normalization tests from test.moon", ->
107 | -- Note: unaccent_string only does character transliteration, not case normalization
108 | -- Expected values show what unaccent_string outputs (with spaces removed)
109 | normalizes = {
110 | {"hello world", "helloworld"}
111 | {"bamWaR7°CoМ", "bamWaR7.CoM"}
112 | {"BaМwAr7.СοM", "BaMwAr7.CoM"}
113 | {"b A m w A r 7 ° c O М", "bAmwAr7.coM"}
114 | {"B A Μ W а R 7 ㆍc o m", "BAMWaR7.com"}
115 | {"b AΜ w А R 7.cOм", "bAMwAR7.com"}
116 | {"bamwar7.com", "bamwar7.com"}
117 | {"BAM〉WAR7.com", "BAM>WAR7.com"}
118 | {"B A M W A R 7ㆍCOM", "BAMWAR7.COM"}
119 | {"BAMWAR7.COM", "BAMWAR7.CoM"}
120 | {"〚bam〛war7.〚com〛", "[bam]war7.[com]"}
121 | {"⒲⒲⒲.⒝⒜⒨⒲⒜⒭⑺.⒞⒪⒨", "(w)(w)(w).(b)(a)(m)(w)(a)(r)(7).(c)(o)(m)"}
122 | {" ⓦⓦⓦ.ⓑⓐⓜⓦⓐⓡ⑦.ⓒⓞⓜ", "www.bamwar7.com"}
123 | {"🇱🅔🅰🄵", "leaf"}
124 | {"ero588,C0M", "ero588,C0M"}
125 | {"RK772。CoM", "RK772.CoM"}
126 | {"MIO652。CoM", "MIO652.CoM"}
127 | {"KBS454。COM", "KBS454.CoM"}
128 | {"MI738。CoM", "MI738.CoM"}
129 | {"mkmk35。COM", "mkmk35.COM"}
130 | {"79ESA。CoM", "79ESA.CoM"}
131 | {"APA82。CoM", "APA82.CoM"}
132 | {"𝚟𝚘𝚙.𝚜𝚞", "vop.su"}
133 | {"MMO77。COM", "MMo77.CoM"}
134 | {"MIO652。COM", "Mio652.CoM"}
135 | {"kakao: dnj2016", "kakao:dnj2016"}
136 | }
137 |
138 | for {before, after} in *normalizes
139 | it "normalizes '#{before}'", ->
140 | result = unaccent.unaccent_string before
141 | -- Remove spaces for comparison since the test.moon examples show this
142 | result_normalized = result\gsub "%s", ""
143 | assert.same after, result_normalized
144 |
145 | describe "unaccent_table", ->
146 | it "exists and is a table", ->
147 | assert.is_table unaccent.unaccent_table
148 |
149 | it "has expected number of entries", ->
150 | count = 0
151 | for k, v in pairs unaccent.unaccent_table
152 | count += 1
153 | assert.true count > 2000, "Expected over 2000 mappings"
154 |
155 | it "contains specific mappings", ->
156 | assert.same "a", unaccent.unaccent_table["à"]
157 | assert.same "e", unaccent.unaccent_table["é"]
158 | assert.same "A", unaccent.unaccent_table["A"]
159 | assert.same "0", unaccent.unaccent_table["0"]
160 | assert.same ".", unaccent.unaccent_table["。"]
161 |
162 | it "maps fullwidth characters", ->
163 | assert.same "a", unaccent.unaccent_table["a"]
164 | assert.same "z", unaccent.unaccent_table["z"]
165 | assert.same "0", unaccent.unaccent_table["0"]
166 | assert.same "9", unaccent.unaccent_table["9"]
167 |
168 | it "maps Greek letters", ->
169 | assert.same "a", unaccent.unaccent_table["α"]
170 | assert.same "y", unaccent.unaccent_table["γ"]
171 | assert.same "n", unaccent.unaccent_table["π"]
172 |
173 | it "maps mathematical alphanumerics", ->
174 | assert.true unaccent.unaccent_table["𝕒"] != nil
175 | assert.true unaccent.unaccent_table["𝓐"] != nil
176 | assert.true unaccent.unaccent_table["𝚊"] != nil
177 |
--------------------------------------------------------------------------------
/lapis/bayes/classifiers/base.lua:
--------------------------------------------------------------------------------
1 | local uniquify
2 | uniquify = require("lapis.util").uniquify
3 | local BaseClassifier
4 | do
5 | local _class_0
6 | local _base_0 = {
7 | default_tokenizer = "lapis.bayes.tokenizers.postgres_text",
8 | word_probabilities = function(self, categories, words)
9 | return error("word_probabilities: subclass must implement")
10 | end,
11 | classify_text = function(self, ...)
12 | local counts, word_rate_or_err = self:text_probabilities(...)
13 | if not (counts) then
14 | return nil, word_rate_or_err
15 | end
16 | return counts[1][1], counts[1][2], word_rate_or_err
17 | end,
18 | tokenize_text = function(self, text)
19 | assert(text, "missing text to tokenize")
20 | if not (type(text) == "string") then
21 | return text
22 | end
23 | if self.opts.tokenize_text then
24 | return self.opts.tokenize_text(text, self.opts)
25 | end
26 | local tokenizer
27 | if self.opts.tokenizer then
28 | tokenizer = self.opts.tokenizer
29 | else
30 | local Tokenizer = require(self.default_tokenizer)
31 | tokenizer = Tokenizer(self.opts)
32 | end
33 | return tokenizer:tokenize_text(text)
34 | end,
35 | train_text = function(self, category, text, opts)
36 | local tokens = self:tokenize_text(text)
37 | if opts and opts.filter_tokens then
38 |       tokens = opts.filter_tokens(tokens, opts)
39 | end
40 | local Categories
41 | Categories = require("lapis.bayes.models").Categories
42 | category = Categories:find_or_create(category)
43 | return category:increment_words(tokens)
44 | end,
45 | text_probabilities = function(self, category_names, text, opts)
46 | opts = opts or { }
47 | local categories, err = self:find_categories(category_names)
48 | if not (categories) then
49 | return nil, err
50 | end
51 | local words = self:tokenize_text(text)
52 | if not (words and next(words)) then
53 | return nil, "failed to generate tokens for text"
54 | end
55 | local available_words
56 | available_words, err = self:count_words(categories, words)
57 | if not (available_words) then
58 | return nil, err
59 | end
60 | local available_words_set
61 | do
62 | local _tbl_0 = { }
63 | for _index_0 = 1, #available_words do
64 | local word = available_words[_index_0]
65 | _tbl_0[word] = true
66 | end
67 | available_words_set = _tbl_0
68 | end
69 | local count = 0
70 | for _index_0 = 1, #words do
71 | local word = words[_index_0]
72 | if available_words_set[word] then
73 | count = count + 1
74 | end
75 | end
76 | local token_ratio = count / #words
77 | local probs
78 | probs, err = self:word_probabilities(categories, available_words, opts)
79 | if not (probs) then
80 | return nil, err
81 | end
82 | for _index_0 = 1, #probs do
83 | local _des_0 = probs[_index_0]
84 | local c, p
85 | c, p = _des_0[1], _des_0[2]
86 | probs[c] = p
87 | end
88 | return probs, token_ratio
89 | end,
90 | find_categories = function(self, category_names)
91 | local Categories
92 | Categories = require("lapis.bayes.models").Categories
93 | local db = Categories.db
94 | local categories = Categories:select("where name in ?", db.list(category_names))
95 | local by_name
96 | do
97 | local _tbl_0 = { }
98 | for _index_0 = 1, #categories do
99 | local c = categories[_index_0]
100 | _tbl_0[c.name] = c
101 | end
102 | by_name = _tbl_0
103 | end
104 | local missing
105 | local result
106 | do
107 | local _accum_0 = { }
108 | local _len_0 = 1
109 | for _index_0 = 1, #category_names do
110 | local _continue_0 = false
111 | repeat
112 | local name = category_names[_index_0]
113 | local c = by_name[name]
114 | if not (c) then
115 | missing = missing or { }
116 | table.insert(missing, name)
117 | _continue_0 = true
118 | break
119 | end
120 | local _value_0 = c
121 | _accum_0[_len_0] = _value_0
122 | _len_0 = _len_0 + 1
123 | _continue_0 = true
124 | until true
125 | if not _continue_0 then
126 | break
127 | end
128 | end
129 | result = _accum_0
130 | end
131 | if missing and next(missing) then
132 | return nil, "find_categories: missing categories (" .. tostring(table.concat(missing, ", ")) .. ")"
133 | end
134 | return result
135 | end,
136 | find_word_classifications = function(self, words, category_ids)
137 | if not (next(words) and next(category_ids)) then
138 | return { }
139 | end
140 | local WordClassifications
141 | WordClassifications = require("lapis.bayes.models").WordClassifications
142 | local db = WordClassifications.db
143 | return WordClassifications:select("where word in ? and category_id in ?", db.list(words), db.list(category_ids))
144 | end,
145 | candidate_words = function(self, categories, available_words, count)
146 | if #available_words <= count then
147 | return available_words
148 | end
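      -- When there are more candidate words than the limit allows, keep the most
      -- discriminative ones: each word is scored by |a_count - b_count| / sqrt(a_count + b_count),
      -- a rough measure of how strongly it separates the two categories, plus a tiny
      -- random jitter to break ties; the top `count` words by score are returned.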
149 | assert(#categories == 2, "can only do two categories")
150 | local a, b = unpack(categories)
151 | local tuples
152 | do
153 | local _accum_0 = { }
154 | local _len_0 = 1
155 | for _index_0 = 1, #available_words do
156 | local word = available_words[_index_0]
157 | local a_count = a.word_counts and a.word_counts[word] or 0
158 | local b_count = b.word_counts and b.word_counts[word] or 0
159 | local _value_0 = {
160 | word,
161 | math.random() / 100 + math.abs((a_count - b_count) / math.sqrt(a_count + b_count)),
162 | a_count,
163 | b_count
164 | }
165 | _accum_0[_len_0] = _value_0
166 | _len_0 = _len_0 + 1
167 | end
168 | tuples = _accum_0
169 | end
170 | table.sort(tuples, function(a, b)
171 | return a[2] > b[2]
172 | end)
173 | local _accum_0 = { }
174 | local _len_0 = 1
175 | local _max_0 = count
176 | for _index_0 = 1, _max_0 < 0 and #tuples + _max_0 or _max_0 do
177 | local t = tuples[_index_0]
178 | _accum_0[_len_0] = t[1]
179 | _len_0 = _len_0 + 1
180 | end
181 | return _accum_0
182 | end,
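    -- count_words looks up the WordClassifications rows for the unique tokens across the
    -- given categories, returns the list of words known to at least one category (or nil
    -- plus an error message when none are), and attaches the per-category counts onto each
    -- category object as `word_counts` for the probability calculation.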
183 | count_words = function(self, categories, words)
184 | local categories_by_id
185 | do
186 | local _tbl_0 = { }
187 | for _index_0 = 1, #categories do
188 | local c = categories[_index_0]
189 | _tbl_0[c.id] = c
190 | end
191 | categories_by_id = _tbl_0
192 | end
193 | words = uniquify(words)
194 | local wcs = self:find_word_classifications(words, (function()
195 | local _accum_0 = { }
196 | local _len_0 = 1
197 | for _index_0 = 1, #categories do
198 | local c = categories[_index_0]
199 | _accum_0[_len_0] = c.id
200 | _len_0 = _len_0 + 1
201 | end
202 | return _accum_0
203 | end)())
204 | local available_words
205 | do
206 | local _accum_0 = { }
207 | local _len_0 = 1
208 | for word in pairs((function()
209 | local _tbl_0 = { }
210 | for _index_0 = 1, #wcs do
211 | local wc = wcs[_index_0]
212 | _tbl_0[wc.word] = true
213 | end
214 | return _tbl_0
215 | end)()) do
216 | _accum_0[_len_0] = word
217 | _len_0 = _len_0 + 1
218 | end
219 | available_words = _accum_0
220 | end
221 | if #available_words == 0 then
222 | return nil, "no words in text are classifyable"
223 | end
224 | for _index_0 = 1, #wcs do
225 | local wc = wcs[_index_0]
226 | local category = categories_by_id[wc.category_id]
227 | category.word_counts = category.word_counts or { }
228 | category.word_counts[wc.word] = wc.count
229 | end
230 | return available_words
231 | end
232 | }
233 | _base_0.__index = _base_0
234 | _class_0 = setmetatable({
235 | __init = function(self, opts)
236 | if opts == nil then
237 | opts = { }
238 | end
239 | self.opts = opts
240 | if self.__class.default_options then
241 | self.opts = setmetatable((function()
242 | local _tbl_0 = { }
243 | for k, v in pairs(self.opts) do
244 | _tbl_0[k] = v
245 | end
246 | return _tbl_0
247 | end)(), {
248 | __index = self.__class.default_options
249 | })
250 | end
251 | end,
252 | __base = _base_0,
253 | __name = "BaseClassifier"
254 | }, {
255 | __index = _base_0,
256 | __call = function(cls, ...)
257 | local _self_0 = setmetatable({}, _base_0)
258 | cls.__init(_self_0, ...)
259 | return _self_0
260 | end
261 | })
262 | _base_0.__class = _class_0
263 | BaseClassifier = _class_0
264 | return _class_0
265 | end
266 |
--------------------------------------------------------------------------------
/spec/stem_spec.moon:
--------------------------------------------------------------------------------
1 | stem = require "lapis.bayes.text.stem"
2 |
3 | test_word = (input, expected) ->
4 | assert.same expected, stem.stem_word input
5 |
6 | describe "lapis.bayes.text.stem", ->
7 | describe "stem_word", ->
8 | it "handles nil and empty strings", ->
9 | assert.same nil, stem.stem_word nil
10 | assert.same "", stem.stem_word ""
11 |
12 | it "handles short words (< 3 chars)", ->
13 | test_word "a", "a"
14 | test_word "ab", "ab"
15 | test_word "at", "at"
16 |
17 | it "handles words that don't need stemming", ->
18 | test_word "cat", "cat"
19 | test_word "dog", "dog"
20 | test_word "tree", "tree"
21 |
22 | it "converts to lowercase", ->
23 | test_word "HELLO", "hello"
24 | test_word "WoRlD", "world"
25 | test_word "TEST", "test"
26 |
27 | describe "exception words", ->
28 | it "handles skis/skies", ->
29 | test_word "skis", "ski"
30 | test_word "skies", "sky"
31 | test_word "sky", "sky"
32 |
33 | it "handles special -ly cases", ->
34 | test_word "idly", "idl"
35 | test_word "gently", "gentl"
36 | test_word "ugly", "ugli"
37 | test_word "early", "earli"
38 | test_word "only", "onli"
39 | test_word "singly", "singl"
40 |
41 | it "handles invariant forms", ->
42 | test_word "news", "news"
43 | test_word "howe", "howe"
44 | test_word "atlas", "atlas"
45 | test_word "cosmos", "cosmos"
46 | test_word "bias", "bias"
47 | test_word "andes", "andes"
48 |
49 | describe "Step 1a - plurals and possessives", ->
50 | it "removes apostrophes", ->
51 | test_word "dog's", "dog"
52 | test_word "cat's'", "cat"
53 |
54 | it "handles sses -> ss", ->
55 | test_word "blesses", "bless"
56 | test_word "stresses", "stress"
57 |
58 | it "handles ied/ies", ->
59 | test_word "tied", "tie"
60 | test_word "pies", "pie"
61 | test_word "cries", "cri"
62 | test_word "studies", "studi"
63 |
64 | it "removes trailing s when appropriate", ->
65 | test_word "cats", "cat"
66 | test_word "dogs", "dog"
67 | test_word "gas", "ga" -- has vowel so s is removed
68 | test_word "this", "thi" -- has vowel so s is removed
69 | test_word "class", "class" -- ss ending
70 |
71 | describe "Step 1b - ed, ing suffixes", ->
72 | it "handles eed/eedly in R1", ->
73 | test_word "agreed", "agre"
74 | test_word "feed", "feed" -- R1 is null, so eed not replaced
75 |
76 | it "handles ed/edly", ->
77 | test_word "plastered", "plaster"
78 | test_word "bled", "bled"
79 | test_word "motivated", "motiv"
80 |
81 | it "handles ing/ingly", ->
82 | test_word "sing", "sing"
83 | test_word "motivating", "motiv"
84 | test_word "running", "run"
85 | test_word "hopping", "hop"
86 |
87 | it "adds e after at/bl/iz", ->
88 | test_word "luxuriated", "luxuri" -- removes 'ated', no e added
89 | test_word "troubled", "troubl"
90 |
91 | it "removes double consonants", ->
92 | test_word "hopped", "hop"
93 | test_word "fitted", "fit"
94 | test_word "planned", "plan"
95 |
96 | it "handles special ing cases", ->
97 | test_word "inning", "inning"
98 | test_word "outing", "outing"
99 | test_word "canning", "canning"
100 |
101 | describe "Step 1c - y suffix", ->
102 | it "replaces suffix y with i", ->
103 | test_word "happy", "happi"
104 | test_word "sky", "sky" -- exception word, not changed
105 |
106 | it "does not replace y at start or after vowel", ->
107 | test_word "say", "say"
108 | test_word "boy", "boy"
109 |
110 | describe "Step 2 - derivational suffixes", ->
111 | it "handles tional -> tion", ->
112 | test_word "relational", "relat"
113 | test_word "conditional", "condit"
114 | test_word "rational", "ration"
115 |
116 | it "handles enci -> ence", ->
117 | test_word "valenci", "valenc"
118 |
119 | it "handles anci -> ance", ->
120 | test_word "hesitanci", "hesit"
121 |
122 | it "handles izer -> ize", ->
123 | test_word "digitizer", "digit"
124 |
125 | it "handles ational -> ate", ->
126 | test_word "operational", "oper"
127 |
128 | it "handles ation/ator -> ate", ->
129 | test_word "predication", "predic"
130 | test_word "operator", "oper"
131 |
132 | it "handles alism -> al", ->
133 | test_word "feudalism", "feudal"
134 |
135 | it "handles fulness -> ful", ->
136 | test_word "hopefulness", "hope"
137 |
138 | it "handles ousness -> ous", ->
139 | test_word "callousness", "callous"
140 |
141 | it "handles iveness -> ive", ->
142 | test_word "decisiveness", "decis"
143 |
144 | it "handles biliti -> ble", ->
145 | test_word "sensibiliti", "sensibl"
146 |
147 | it "handles li deletion", ->
148 | test_word "formalli", "formal"
149 |
150 | describe "Step 3 - more derivational suffixes", ->
151 | it "handles icate -> ic", ->
152 | test_word "duplicate", "duplic"
153 |
154 | it "handles ative deletion in R2", ->
155 | test_word "demonstrative", "demonstr"
156 |
157 | it "handles alize -> al", ->
158 | test_word "normalize", "normal"
159 |
160 | it "handles ful/ness deletion", ->
161 | test_word "hopeful", "hope"
162 | test_word "goodness", "good"
163 |
164 | describe "Step 4 - suffix deletion", ->
165 | it "handles al", ->
166 | test_word "radical", "radic"
167 |
168 | it "handles ance/ence", ->
169 | test_word "dependence", "depend"
170 |
171 | it "handles er", ->
172 | test_word "computer", "comput"
173 |
174 | it "handles able/ible", ->
175 | test_word "adjustable", "adjust"
176 | test_word "divisible", "divis"
177 |
178 | it "handles ant/ent/ment", ->
179 | test_word "irritant", "irrit"
180 | test_word "different", "differ"
181 | test_word "adjustment", "adjust"
182 |
183 | it "handles ion after s or t", ->
184 | test_word "adoption", "adopt"
185 | test_word "decision", "decis"
186 |
187 | it "handles ism/iti/ous/ive/ize", ->
188 | test_word "communism", "communism" -- ism in R2 only
189 | test_word "sensitivity", "sensit"
190 | test_word "continuous", "continu"
191 | test_word "effective", "effect"
192 | test_word "realize", "realiz"
193 |
194 | describe "Step 5 - final cleanup", ->
195 | it "removes trailing e in R2", ->
196 | test_word "debate", "debat"
197 | test_word "create", "creat"
198 |
199 |     it "keeps trailing e in R1 when the stem ends in a short syllable", ->
200 |       test_word "hope", "hope"
201 |
202 |     it "removes trailing e in R1 when the stem is not a short syllable", ->
203 |       test_word "centre", "centr"
204 |
205 | it "removes double l in R2", ->
206 | test_word "controll", "control"
207 |
208 | describe "word families", ->
209 | it "stems connection family to connect", ->
210 | test_word "connection", "connect"
211 | test_word "connections", "connect"
212 | test_word "connective", "connect"
213 | test_word "connected", "connect"
214 | test_word "connecting", "connect"
215 |
216 | it "stems generate family", ->
217 | test_word "generate", "generat"
218 | test_word "generates", "generat"
219 | test_word "generated", "generat"
220 | test_word "generating", "generat"
221 | test_word "generator", "generat"
222 | test_word "general", "general"
223 | test_word "generalization", "general"
224 |
225 | it "stems happy family to happi", ->
226 | test_word "happy", "happi"
227 | test_word "happiness", "happi"
228 | test_word "happily", "happili"
229 |
230 | it "stems run family", ->
231 | test_word "run", "run"
232 | test_word "running", "run"
233 | test_word "runs", "run"
234 | test_word "runner", "runner"
235 |
236 | describe "complex derivational chains", ->
237 | it "handles multiply derived words", ->
238 | test_word "vietnamization", "vietnam"
239 | test_word "conformabli", "conform"
240 | test_word "radicalli", "radic"
241 | test_word "differentli", "differ"
242 |
243 | describe "special prefix handling", ->
244 | it "handles commun- prefix", ->
245 | test_word "communism", "communism" -- ism not in R2
246 | test_word "communication", "communic"
247 | test_word "community", "communiti"
248 |
249 | it "handles gener- prefix", ->
250 | test_word "generate", "generat"
251 | test_word "generator", "generat"
252 | test_word "generous", "generous"
253 |
254 | it "handles univers- prefix", ->
255 | test_word "university", "universiti"
256 | test_word "universal", "universal"
257 | test_word "universe", "univers"
258 |
259 | describe "edge cases", ->
260 | it "handles very long words", ->
261 | result = stem.stem_word "antidisestablishmentarianism"
262 | assert.is_string result
263 | assert.true #result > 0
264 |
265 | it "handles words with no vowels", ->
266 | test_word "shhh", "shhh"
267 | test_word "hmm", "hmm"
268 |
269 | it "handles repeated consonants", ->
270 | test_word "bless", "bless"
271 | test_word "press", "press"
272 |
273 | it "handles words ending in y", ->
274 | test_word "daily", "daili"
275 | test_word "easily", "easili"
276 |
277 | it "preserves words that should not be stemmed", ->
278 | test_word "test", "test"
279 | test_word "best", "best"
280 |
--------------------------------------------------------------------------------
/spec/bayes_spec.moon:
--------------------------------------------------------------------------------
1 |
2 | import use_test_env from require "lapis.spec"
3 | import truncate_tables from require "lapis.spec.db"
4 |
5 | import Categories, WordClassifications from require "lapis.bayes.models"
6 |
7 | describe "lapis.bayes", ->
8 | use_test_env!
9 |
10 | describe "WordClassifications", ->
11 | local c1, c2
12 |
13 | before_each ->
14 | truncate_tables Categories, WordClassifications
15 |
16 | c1 = Categories\find_or_create "hello"
17 | c1\increment_words {
18 | alpha: 17
19 | beta: 19
20 | }
21 |
22 | c2 = Categories\find_or_create "world"
23 | c2\increment_words {
24 | beta: 22
25 | triple: 27
26 | }
27 |
28 | it "has the correct counts", ->
29 | c1_words = {c.word, c.count for c in *c1\get_word_classifications!}
30 | c2_words = {c.word, c.count for c in *c2\get_word_classifications!}
31 |
32 | assert.same {
33 | alpha: 17
34 | beta: 19
35 | }, c1_words
36 |
37 | assert.same {
38 | beta: 22
39 | triple: 27
40 | }, c2_words
41 |
42 |
43 | it "deletes word from category", ->
44 | c1_count = c1.total_count
45 | c2_count = c2.total_count
46 |
47 | wc = assert WordClassifications\find category_id: c1.id, word: "beta"
48 | wc\delete!
49 |
50 | c1\refresh!
51 | c2\refresh!
52 |
53 | assert.same 19, c1_count - c1.total_count
54 | assert.same 0, c2_count - c2.total_count
55 |
56 |     it "purges words from all categories", ->
57 | c1_count = c1.total_count
58 | c2_count = c2.total_count
59 |
60 | deleted, count = WordClassifications\purge_word "alpha", {"hello", "world"}
61 | assert.true deleted
62 | assert.same 1, count
63 |
64 | c1\refresh!
65 | c2\refresh!
66 |
67 | assert.same 17, c1_count - c1.total_count
68 | assert.same 0, c2_count - c2.total_count
69 |
70 |     it "increments an individual word", ->
71 | wc = assert WordClassifications\find category_id: c1.id, word: "beta"
72 |
73 | before_word_count = wc.count
74 |
75 | wc\_increment 1
76 | wc\refresh!
77 | assert.same before_word_count + 1, wc.count
78 |
79 | it "deletes word when being unincremented to 0", ->
80 | wc = assert WordClassifications\find category_id: c1.id, word: "beta"
81 | wc\_increment -wc.count
82 |
83 | assert.nil (WordClassifications\find {
84 | category_id: c1.id
85 | word: "beta"
86 | })
87 |
88 |     it "clears out words when decrementing them", ->
89 | words = c1\get_word_classifications!
90 | for word in *words
91 | c1\increment_word word.word, -word.count
92 |
93 | assert.same 0, c1.total_count
94 | c1\refresh!
95 | assert.same {}, c1\get_word_classifications!
96 |
97 | describe "Categories", ->
98 | before_each ->
99 | truncate_tables Categories, WordClassifications
100 |
101 | it "finds or creates category", ->
102 | c = Categories\find_or_create "hello"
103 | c2 = Categories\find_or_create "hello"
104 | assert.same c.id, c2.id
105 |
106 | it "increments words", ->
107 | c = Categories\find_or_create "hello"
108 |
109 | WordClassifications\create {
110 | word: "color"
111 | category_id: c.id
112 | count: 2
113 | }
114 |
115 | c\increment_words {
116 | color: 55
117 | height: 12
118 | green: 8
119 | }
120 |
121 | wc_by_name = {wc.word, wc for wc in *WordClassifications\select!}
122 |
123 | assert.same 57, wc_by_name.color.count
124 | assert.same 12, wc_by_name.height.count
125 | assert.same 8, wc_by_name.green.count
126 |
127 | it "deletes category", ->
128 | c = Categories\find_or_create "hello"
129 | c\increment_words {
130 | color: 23
131 | height: 2
132 | }
133 | c\delete!
134 |
135 | describe "tokenize text", ->
136 | describe "default tokenizer", ->
137 | tokenize_text = (text, ...) ->
138 | if ...
139 |         error "Got unexpected additional arguments for tokenize_text"
140 |
141 | BaseClassifier = require "lapis.bayes.classifiers.base"
142 | BaseClassifier!\tokenize_text text
143 |
144 | it "gets tokens for empty string", ->
145 | assert.same {}, tokenize_text ""
146 |
147 | it "gets tokens for basic string", ->
148 | assert.same {"hello", "world"}, tokenize_text "hello world"
149 |
150 | it "gets tokens with stems and no stop words", ->
151 | assert.same {"burger", "eat"}, tokenize_text "i am eating burgers"
152 |
153 | it "doesn't keep dupes", ->
154 | assert.same {"burger"}, tokenize_text "burgers are burgers"
155 |
156 | it "skips tokens that are too long or short", ->
157 | assert.same {"great"}, tokenize_text "a b c d e f g great eatingthebigriceball "
158 |
159 | it "strips numbers", ->
160 | assert.same {"delisho", "hodoc"}, tokenize_text "12 delisho hodocs for $5.99"
161 |
162 | it "uses custom tokenizer as classifier option", ->
163 | BaseClassifier = require "lapis.bayes.classifiers.base"
164 | c = BaseClassifier {
165 | tokenizer: require "lapis.bayes.tokenizers.url_domains"
166 | }
167 |
168 | assert.same {"leafo.net"}, c\tokenize_text "hello www.leafo.net website"
169 |
170 |       it "uses custom tokenize function", ->
171 | BaseClassifier = require "lapis.bayes.classifiers.base"
172 | c = BaseClassifier {
173 | tokenize_text: (text) ->
174 | [t for t in text\gmatch "."]
175 | }
176 |
177 | assert.same {
178 | "h", "e", "l", "l", "o"
179 | }, c\tokenize_text "hello"
180 |
181 |
182 | it "passes tokens through if already table", ->
183 | BaseClassifier = require "lapis.bayes.classifiers.base"
184 | c = BaseClassifier { }
185 |
186 | assert.same { "one", "two" }, c\tokenize_text {"one", "two"}
187 |
188 |
189 | describe "train_text", ->
190 | import train_text from require "lapis.bayes"
191 |
192 | before_each ->
193 | truncate_tables Categories, WordClassifications
194 |
195 | it "classifies a single string", ->
196 | train_text "spam", "hello this is spam, I love spam"
197 | assert.same 1, Categories\count!
198 | c = unpack Categories\select!
199 | assert.same "spam", c.name
200 | assert.same 3, WordClassifications\count!
201 | words = WordClassifications\select!
202 | table.sort words, (a, b) ->
203 | a.word < b.word
204 |
205 | assert.same {
206 | { category_id: c.id, count: 1, word: "hello" }
207 | { category_id: c.id, count: 1, word: "love" }
208 | { category_id: c.id, count: 1, word: "spam" }
209 | }, words
210 |
211 |
212 | it "classifies multiple strings", ->
213 | train_text "spam", "hello this is spam, I love spam"
214 | train_text "ham", "there is ham here"
215 | train_text "spam", "eating spamming the regular stuff"
216 |       train_text "ham", "pigs create too much jam"
217 |
218 | it "uses custom tokenizer", ->
219 | train_text "spam", "cat eat foot", {
220 | tokenize_text: (str, opts) ->
221 | [c for c in str\gmatch "[^%s]"]
222 | }
223 |
224 | assert.same {
225 | t: 3
226 | f: 1
227 | o: 2
228 | a: 2
229 | c: 1
230 | e: 1
231 | }, {c.word, c.count for c in *WordClassifications\select!}
232 |
233 | describe "text_probabilities", ->
234 | import text_probabilities from require "lapis.bayes"
235 |
236 | before_each ->
237 | truncate_tables Categories, WordClassifications
238 |
239 | it "works when there is no data", ->
240 | Categories\create name: "spam"
241 | Categories\create name: "ham"
242 |
243 | assert.same {
244 | nil, "no words in text are classifyable"
245 | }, {
246 | text_probabilities {"spam", "ham"}, "hello world"
247 | }
248 |
249 | it "works when there is some data", ->
250 | spam = Categories\create name: "spam"
251 | spam\increment_words {"hello", "world"}
252 |
253 | ham = Categories\create name: "ham"
254 | ham\increment_words {"butt", "world"}
255 |
256 | probs, rate = text_probabilities {"spam", "ham"}, "butt zone"
257 | assert.same 0.5, rate
258 | -- normalize probs for easy specs
259 | probs = for p in *probs
260 | {p[1], math.floor p[2] * 100 + 0.5}
261 |
262 | assert.same {
263 | {"ham", 95}
264 | {"spam", 5}
265 | }, probs
266 |
267 | describe "models", ->
268 | before_each ->
269 | truncate_tables Categories, WordClassifications
270 |
271 | it "increment_words", ->
272 | spam = Categories\create name: "spam"
273 | count = spam\increment_words {
274 | "first token"
275 | "hello.world"
276 | "http://leafo.net"
277 | "hello.world"
278 | zone: 77
279 | }
280 |
281 | assert.same 81, count
282 |
283 | words = WordClassifications\select "order by word asc", fields: "category_id, word, count"
284 |
285 | assert.same {
286 | {
287 | category_id: spam.id
288 | count: 1
289 | word: "first token"
290 | }
291 | {
292 | category_id: spam.id
293 | count: 2
294 | word: "hello.world"
295 | },
296 | {
297 | category_id: spam.id
298 | count: 1
299 | word: "http://leafo.net"
300 | },
301 | {
302 | category_id: spam.id
303 | count: 77
304 | word: "zone"
305 | }
306 | }, words
307 |
308 |
309 | count = spam\increment_words {
310 | "hello.world"
311 | "hello.world"
312 | "zone"
313 | "hello.world": 3
314 | }
315 |
316 | assert.same 6, count
317 |
318 | words = WordClassifications\select "order by word asc", fields: "category_id, word, count"
319 |
320 |
321 | assert.same {
322 | {
323 | category_id: spam.id
324 | count: 1
325 | word: "first token"
326 | }
327 | {
328 | category_id: spam.id
329 | count: 7
330 | word: "hello.world"
331 | },
332 | {
333 | category_id: spam.id
334 | count: 1
335 | word: "http://leafo.net"
336 | },
337 | {
338 | category_id: spam.id
339 | count: 78
340 | word: "zone"
341 | }
342 | }, words
343 |
--------------------------------------------------------------------------------
/spec/ngram_tokenizer_spec.moon:
--------------------------------------------------------------------------------
1 | NgramTokenizer = require "lapis.bayes.tokenizers.ngram"
2 |
3 | it_tokenizes = (label, input, expected_tokens, opts=nil) ->
4 | it "tokenizes #{label}", ->
5 | tokenizer = NgramTokenizer opts
6 | tokens = tokenizer\tokenize_text input
7 | assert.same expected_tokens, tokens, "Tokens for #{input\sub 1, 80}"
8 |
9 | describe "lapis.bayes.tokenizers.ngram", ->
10 | describe "basic tokenization", ->
11 | it_tokenizes "simple text with default bigrams", "hello world", {
12 | "he"
13 | "el"
14 | "ll"
15 | "lo"
16 | "wo"
17 | "or"
18 | "rl"
19 | "ld"
20 | }
21 |
22 | it_tokenizes "single word", "test", {
23 | "te"
24 | "es"
25 | "st"
26 | }
27 |
28 | it_tokenizes "multiple words", "cat dog fox", {
29 | "ca"
30 | "at"
31 | "do"
32 | "og"
33 | "fo"
34 | "ox"
35 | }
36 |
37 | describe "different n values", ->
38 | it_tokenizes "with unigrams (n=1)", "hello", {
39 | "h"
40 | "e"
41 | "l"
42 | "l"
43 | "o"
44 | }, { n: 1 }
45 |
46 | it_tokenizes "with trigrams (n=3)", "hello", {
47 | "hel"
48 | "ell"
49 | "llo"
50 | }, { n: 3 }
51 |
52 | it_tokenizes "with 4-grams (n=4)", "hello", {
53 | "hell"
54 | "ello"
55 | }, { n: 4 }
56 |
57 | it_tokenizes "with n=5 exact word length", "hello", {
58 | "hello"
59 | }, { n: 5 }
60 |
61 | it_tokenizes "with n=0 defaults to 1", "hi", {
62 | "h"
63 | "i"
64 | }, { n: 0 }
65 |
66 | it_tokenizes "with negative n defaults to 1", "hi", {
67 | "h"
68 | "i"
69 | }, { n: -5 }
70 |
71 | it_tokenizes "with fractional n gets floored", "test", {
72 | "te"
73 | "es"
74 | "st"
75 | }, { n: 2.7 }
76 |
77 | describe "word normalization", ->
78 | it_tokenizes "converts to lowercase", "Hello WORLD", {
79 | "he"
80 | "el"
81 | "ll"
82 | "lo"
83 | "wo"
84 | "or"
85 | "rl"
86 | "ld"
87 | }
88 |
89 | it_tokenizes "removes punctuation", "hello, world!", {
90 | "he"
91 | "el"
92 | "ll"
93 | "lo"
94 | "wo"
95 | "or"
96 | "rl"
97 | "ld"
98 | }
99 |
100 | it_tokenizes "handles mixed case and punctuation", "Hello, World!", {
101 | "he"
102 | "el"
103 | "ll"
104 | "lo"
105 | "wo"
106 | "or"
107 | "rl"
108 | "ld"
109 | }
110 |
111 | it_tokenizes "removes multiple spaces", "hello world", {
112 | "he"
113 | "el"
114 | "ll"
115 | "lo"
116 | "wo"
117 | "or"
118 | "rl"
119 | "ld"
120 | }
121 |
122 | it_tokenizes "strips punctuation from words", "don't can't won't", {
123 | "do"
124 | "on"
125 | "nt"
126 | "ca"
127 | "an"
128 | "nt"
129 | "wo"
130 | "on"
131 | "nt"
132 | }
133 |
134 | describe "ngram_size method", ->
135 | it "returns default n=2", ->
136 | tokenizer = NgramTokenizer!
137 | assert.equal 2, tokenizer\ngram_size!
138 |
139 | it "returns configured n", ->
140 | tokenizer = NgramTokenizer n: 3
141 | assert.equal 3, tokenizer\ngram_size!
142 |
143 | it "handles string n", ->
144 | tokenizer = NgramTokenizer n: "4"
145 | assert.equal 4, tokenizer\ngram_size!
146 |
147 | it "floors fractional n", ->
148 | tokenizer = NgramTokenizer n: 3.9
149 | assert.equal 3, tokenizer\ngram_size!
150 |
151 | it "returns 1 for invalid n", ->
152 | tokenizer = NgramTokenizer n: 0
153 | assert.equal 1, tokenizer\ngram_size!
154 |
155 | describe "normalize_word method", ->
156 | local tokenizer
157 | before_each ->
158 | tokenizer = NgramTokenizer!
159 |
160 | it "normalizes to lowercase", ->
161 | assert.equal "hello", tokenizer\normalize_word "HELLO"
162 | assert.equal "hello", tokenizer\normalize_word "Hello"
163 |
164 | it "removes punctuation", ->
165 | assert.equal "hello", tokenizer\normalize_word "hello!"
166 | assert.equal "hello", tokenizer\normalize_word "hello,"
167 | assert.equal "hello", tokenizer\normalize_word "hello..."
168 |
169 | it "removes whitespace", ->
170 | assert.equal "hello", tokenizer\normalize_word "hello "
171 | assert.equal "hello", tokenizer\normalize_word " hello"
172 | assert.equal "hello", tokenizer\normalize_word " hello "
173 |
174 | it "removes all punctuation and whitespace", ->
175 | assert.equal "hello", tokenizer\normalize_word " hello!!! "
176 |
177 | it "returns nil for empty string", ->
178 | assert.is_nil tokenizer\normalize_word ""
179 |
180 | it "returns nil for nil input", ->
181 | assert.is_nil tokenizer\normalize_word nil
182 |
183 | it "returns nil for whitespace only", ->
184 | assert.is_nil tokenizer\normalize_word " "
185 |
186 | it "returns nil for punctuation only", ->
187 | assert.is_nil tokenizer\normalize_word "!!!"
188 |
189 | describe "word_ngrams method", ->
190 | local tokenizer
191 | before_each ->
192 | tokenizer = NgramTokenizer!
193 |
194 | it "generates bigrams from word", ->
195 | ngrams = tokenizer\word_ngrams "hello", 2
196 | assert.same {"he", "el", "ll", "lo"}, ngrams
197 |
198 | it "generates trigrams from word", ->
199 | ngrams = tokenizer\word_ngrams "hello", 3
200 | assert.same {"hel", "ell", "llo"}, ngrams
201 |
202 | it "returns full word when length < n", ->
203 | ngrams = tokenizer\word_ngrams "hi", 3
204 | assert.same {"hi"}, ngrams
205 |
206 | it "returns full word when length == n", ->
207 | ngrams = tokenizer\word_ngrams "hi", 2
208 | assert.same {"hi"}, ngrams
209 |
210 | it "returns full word for empty string", ->
211 | ngrams = tokenizer\word_ngrams "", 2
212 | assert.same {""}, ngrams
213 |
214 | it "generates unigrams", ->
215 | ngrams = tokenizer\word_ngrams "cat", 1
216 | assert.same {"c", "a", "t"}, ngrams
217 |
218 | describe "number handling", ->
219 | it_tokenizes "ignores numbers by default", "hello 123 world 456", {
220 | "he"
221 | "el"
222 | "ll"
223 | "lo"
224 | "wo"
225 | "or"
226 | "rl"
227 | "ld"
228 | }
229 |
230 | it_tokenizes "includes numbers when ignore_numbers is false", "hello 123 world", {
231 | "he"
232 | "el"
233 | "ll"
234 | "lo"
235 | "12"
236 | "23"
237 | "wo"
238 | "or"
239 | "rl"
240 | "ld"
241 | }, { ignore_numbers: false }
242 |
243 | it_tokenizes "handles mixed alphanumeric", "abc123 def456", {
244 | "ab"
245 | "bc"
246 | "c1"
247 | "12"
248 | "23"
249 | "de"
250 | "ef"
251 | "f4"
252 | "45"
253 | "56"
254 | }, { ignore_numbers: false }
255 |
256 | describe "edge cases", ->
257 | it_tokenizes "empty string", "", {}
258 |
259 | it_tokenizes "only whitespace", " ", {}
260 |
261 | it_tokenizes "only punctuation", "!!???..", {}
262 |
263 | it_tokenizes "single character", "a", {
264 | "a"
265 | }
266 |
267 | it_tokenizes "two characters with bigrams", "ab", {
268 | "ab"
269 | }
270 |
271 | it_tokenizes "word longer than n", "testing", {
272 | "te"
273 | "es"
274 | "st"
275 | "ti"
276 | "in"
277 | "ng"
278 | }
279 |
280 | describe "unicode and international characters", ->
281 | it_tokenizes "accented characters", "café résumé", {
282 | "ca"
283 | "af"
284 | "fé"
285 | "ré"
286 | "és"
287 | "su"
288 | "um"
289 | "mé"
290 | }
291 |
292 | it_tokenizes "spanish text", "español niño", {
293 | "es"
294 | "sp"
295 | "pa"
296 | "añ"
297 | "ño"
298 | "ol"
299 | "ni"
300 | "iñ"
301 | "ño"
302 | }
303 |
304 | it_tokenizes "german umlauts", "über schön", {
305 | "üb"
306 | "be"
307 | "er"
308 | "sc"
309 | "ch"
310 | "hö"
311 | "ön"
312 | }
313 |
314 | it_tokenizes "french accents", "élève être", {
315 | "él"
316 | "lè"
317 | "èv"
318 | "ve"
319 | "êt"
320 | "tr"
321 | "re"
322 | }
323 |
324 | it_tokenizes "chinese characters", "你好世界", {
325 | "你好"
326 | "好世"
327 | "世界"
328 | }
329 |
330 | it_tokenizes "mixed english and chinese", "hello 世界 world", {
331 | "he"
332 | "el"
333 | "ll"
334 | "lo"
335 | "世界"
336 | "wo"
337 | "or"
338 | "rl"
339 | "ld"
340 | }
341 |
342 | describe "filter_text option", ->
343 | it_tokenizes "with custom text filter", "hello KEEP world", {
344 | "he"
345 | "el"
346 | "ll"
347 | "lo"
348 | "ke"
349 | "ee"
350 | "ep"
351 | "wo"
352 | "or"
353 | "rl"
354 | "ld"
355 | }, {
356 | filter_text: (text) -> text\gsub("KEEP", "keep")
357 | }
358 |
359 | it_tokenizes "filter that removes text", "hello remove world", {
360 | "he"
361 | "el"
362 | "ll"
363 | "lo"
364 | "wo"
365 | "or"
366 | "rl"
367 | "ld"
368 | }, {
369 | filter_text: (text) -> text\gsub("remove", "")
370 | }
371 |
372 | it "returns empty when filter returns empty", ->
373 | tokenizer = NgramTokenizer {
374 | filter_text: (text) -> ""
375 | }
376 | tokens = tokenizer\tokenize_text "hello world"
377 | assert.same {}, tokens
378 |
379 | it "returns empty when filter returns nil", ->
380 | tokenizer = NgramTokenizer {
381 | filter_text: (text) -> nil
382 | }
383 | tokens = tokenizer\tokenize_text "hello world"
384 | assert.same {}, tokens
385 |
386 | describe "filter_tokens option", ->
387 | it "with custom token filter", ->
388 | tokenizer = NgramTokenizer {
389 | filter_tokens: (tokens, opts) ->
390 | filtered = {}
391 | for token in *tokens
392 | if token != "el"
393 | table.insert filtered, token
394 | filtered
395 | }
396 | tokens = tokenizer\tokenize_text "hello"
397 | assert.same {"he", "ll", "lo"}, tokens
398 |
399 | it "filter can modify tokens", ->
400 | tokenizer = NgramTokenizer {
401 | filter_tokens: (tokens, opts) ->
402 | modified = {}
403 | for token in *tokens
404 | table.insert modified, "prefix:#{token}"
405 | modified
406 | }
407 | tokens = tokenizer\tokenize_text "hi"
408 | assert.same {"prefix:hi"}, tokens
409 |
410 | it "filter receives opts parameter", ->
411 | received_opts = nil
412 | tokenizer = NgramTokenizer {
413 | n: 3
414 | filter_tokens: (tokens, opts) ->
415 | received_opts = opts
416 | tokens
417 | }
418 | tokenizer\tokenize_text "test"
419 | assert.is_not_nil received_opts
420 | assert.equal 3, received_opts.n
421 |
422 | describe "comprehensive examples", ->
423 | it_tokenizes "sentence with mixed content", "The quick brown fox jumps!", {
424 | "th"
425 | "he"
426 | "qu"
427 | "ui"
428 | "ic"
429 | "ck"
430 | "br"
431 | "ro"
432 | "ow"
433 | "wn"
434 | "fo"
435 | "ox"
436 | "ju"
437 | "um"
438 | "mp"
439 | "ps"
440 | }
441 |
442 | it_tokenizes "with trigrams on real text", "testing ngrams", {
443 | "tes"
444 | "est"
445 | "sti"
446 | "tin"
447 | "ing"
448 | "ngr"
449 | "gra"
450 | "ram"
451 | "ams"
452 | }, { n: 3 }
453 |
454 | it_tokenizes "real world example", "Machine Learning is amazing!", {
455 | "ma"
456 | "ac"
457 | "ch"
458 | "hi"
459 | "in"
460 | "ne"
461 | "le"
462 | "ea"
463 | "ar"
464 | "rn"
465 | "ni"
466 | "in"
467 | "ng"
468 | "is"
469 | "am"
470 | "ma"
471 | "az"
472 | "zi"
473 | "in"
474 | "ng"
475 | }
476 |
477 | describe "build_grammar", ->
478 | it "grammar parses words", ->
479 | tokenizer = NgramTokenizer!
480 | grammar = tokenizer\build_grammar!
481 | words = grammar\match "hello world test"
482 | assert.same {"hello", "world", "test"}, words
483 |
484 | it "grammar handles punctuation", ->
485 | tokenizer = NgramTokenizer!
486 | grammar = tokenizer\build_grammar!
487 | words = grammar\match "hello, world! test?"
488 | assert.same {"hello,", "world!", "test?"}, words
489 |
490 | it "grammar handles multiple spaces", ->
491 | tokenizer = NgramTokenizer!
492 | grammar = tokenizer\build_grammar!
493 | words = grammar\match "hello world"
494 | assert.same {"hello", "world"}, words
495 |
496 | it "grammar handles tabs and newlines", ->
497 | tokenizer = NgramTokenizer!
498 | grammar = tokenizer\build_grammar!
499 | words = grammar\match "hello\tworld\ntest"
500 | assert.same {"hello", "world", "test"}, words
501 |
--------------------------------------------------------------------------------
/lapis/bayes/text/stem.lua:
--------------------------------------------------------------------------------
1 | local is_vowel
2 | is_vowel = function(char)
3 | if not (char) then
4 | return false
5 | end
6 | char = char:lower()
7 | return char == 'a' or char == 'e' or char == 'i' or char == 'o' or char == 'u' or char == 'y'
8 | end
9 | local is_consonant
10 | is_consonant = function(char)
11 | if not (char) then
12 | return false
13 | end
14 | return not is_vowel(char)
15 | end
16 | local is_vowel_wxy
17 | is_vowel_wxy = function(char)
18 | if not (char) then
19 | return false
20 | end
21 | char = char:lower()
22 | return char == 'a' or char == 'e' or char == 'i' or char == 'o' or char == 'u' or char == 'y' or char == 'w' or char == 'x'
23 | end
24 | local is_valid_li
25 | is_valid_li = function(char)
26 | if not (char) then
27 | return false
28 | end
29 | char = char:lower()
30 | return char == 'c' or char == 'd' or char == 'e' or char == 'g' or char == 'h' or char == 'k' or char == 'm' or char == 'n' or char == 'r' or char == 't'
31 | end
32 | local ends_with
33 | ends_with = function(word, suffix)
34 | if #word < #suffix then
35 | return false
36 | end
37 | return word:sub(-#suffix) == suffix
38 | end
39 | local contains_vowel
40 | contains_vowel = function(word)
41 | for i = 1, #word do
42 | if is_vowel(word:sub(i, i)) then
43 | return true
44 | end
45 | end
46 | return false
47 | end
48 | local replace_suffix
49 | replace_suffix = function(word, suffix, replacement)
50 | if ends_with(word, suffix) then
51 | return word:sub(1, #word - #suffix) .. replacement
52 | else
53 | return word
54 | end
55 | end
56 | local get_suffix
57 | get_suffix = function(word, pos)
58 | if pos > #word then
59 | return ""
60 | end
61 | return word:sub(pos)
62 | end
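-- find_r1/find_r2 compute the R1 and R2 regions used by Porter2-style (English Snowball)
-- stemming: R1 starts after the first vowel followed by a consonant, and R2 after the
-- first such pair inside R1. The handful of prefixes special-cased below (gener-,
-- commun-, arsen-, and so on) get fixed R1 positions instead of the generic scan.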
63 | local find_r1
64 | find_r1 = function(word)
65 | if word:sub(1, 5) == "gener" then
66 | return 6
67 | elseif word:sub(1, 6) == "commun" then
68 | return 7
69 | elseif word:sub(1, 5) == "arsen" then
70 | return 6
71 | elseif word:sub(1, 4) == "past" then
72 | return 5
73 | elseif word:sub(1, 7) == "univers" then
74 | return 8
75 | elseif word:sub(1, 5) == "later" then
76 | return 6
77 | elseif word:sub(1, 5) == "emerg" then
78 | return 6
79 | elseif word:sub(1, 5) == "organ" then
80 | return 6
81 | end
82 | for i = 1, #word - 1 do
83 | if is_vowel(word:sub(i, i)) and is_consonant(word:sub(i + 1, i + 1)) then
84 | return i + 2
85 | end
86 | end
87 | return #word + 1
88 | end
89 | local find_r2
90 | find_r2 = function(word)
91 | local r1_pos = find_r1(word)
92 | if r1_pos > #word then
93 | return #word + 1
94 | end
95 | for i = r1_pos, #word - 1 do
96 | if is_vowel(word:sub(i, i)) and is_consonant(word:sub(i + 1, i + 1)) then
97 | return i + 2
98 | end
99 | end
100 | return #word + 1
101 | end
102 | local in_r1
103 | in_r1 = function(word, pos)
104 | local r1 = find_r1(word)
105 | return pos >= r1
106 | end
107 | local in_r2
108 | in_r2 = function(word, pos)
109 | local r2 = find_r2(word)
110 | return pos >= r2
111 | end
112 | local is_short_syllable_at
113 | is_short_syllable_at = function(word, pos)
114 | if pos < 1 or pos > #word then
115 | return false
116 | end
117 | local char = word:sub(pos, pos)
118 | if not (is_vowel(char)) then
119 | return false
120 | end
121 | if pos == 1 then
122 | if #word > 1 then
123 | local next_char = word:sub(2, 2)
124 | return is_consonant(next_char)
125 | end
126 | return false
127 | end
128 | if pos < #word then
129 | local prev_char = word:sub(pos - 1, pos - 1)
130 | local next_char = word:sub(pos + 1, pos + 1)
131 | if is_consonant(prev_char) and is_consonant(next_char) then
132 | local next_lower = next_char:lower()
133 | return next_lower ~= 'w' and next_lower ~= 'x' and next_char ~= 'Y'
134 | end
135 | end
136 | return false
137 | end
138 | local ends_with_short_syllable
139 | ends_with_short_syllable = function(word)
140 | if #word < 2 then
141 | return false
142 | end
143 | if #word == 2 then
144 | return is_vowel(word:sub(1, 1)) and is_consonant(word:sub(2, 2))
145 | end
146 | if #word >= 3 then
147 | local c1 = word:sub(-3, -3)
148 | local c2 = word:sub(-2, -2)
149 | local c3 = word:sub(-1, -1)
150 | if is_consonant(c1) and is_vowel(c2) and is_consonant(c3) then
151 | local c3_lower = c3:lower()
152 | return c3_lower ~= 'w' and c3_lower ~= 'x' and c3 ~= 'Y'
153 | end
154 | end
155 | return false
156 | end
157 | local is_short_word
158 | is_short_word = function(word)
159 | local r1 = find_r1(word)
160 | if r1 > #word then
161 | return true
162 | end
163 | if r1 == #word + 1 and ends_with_short_syllable(word) then
164 | return true
165 | end
166 | return false
167 | end
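-- prelude/postlude handle the y-as-consonant convention: a leading apostrophe is dropped,
-- and an initial 'y' or a 'y' following a vowel is temporarily uppercased to 'Y' so the
-- steps below treat it as a consonant; postlude lowercases any marked 'Y' back at the end.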
168 | local prelude
169 | prelude = function(word)
170 | if #word == 0 then
171 | return word
172 | end
173 | if word:sub(1, 1) == "'" then
174 | word = word:sub(2)
175 | end
176 | local result = { }
177 | local y_found = false
178 | for i = 1, #word do
179 | local char = word:sub(i, i)
180 | if char == 'y' then
181 | if i == 1 then
182 | table.insert(result, 'Y')
183 | y_found = true
184 | elseif i > 1 and is_vowel(word:sub(i - 1, i - 1)) then
185 | table.insert(result, 'Y')
186 | y_found = true
187 | else
188 | table.insert(result, char)
189 | end
190 | else
191 | table.insert(result, char)
192 | end
193 | end
194 | return table.concat(result), y_found
195 | end
196 | local postlude
197 | postlude = function(word, y_found)
198 | if not (y_found) then
199 | return word
200 | end
201 | return word:gsub('Y', 'y')
202 | end
203 | local exception1
204 | exception1 = function(word)
205 | local exceptions = {
206 | skis = "ski",
207 | skies = "sky",
208 | idly = "idl",
209 | gently = "gentl",
210 | ugly = "ugli",
211 | early = "earli",
212 | only = "onli",
213 | singly = "singl",
214 | sky = "sky",
215 | news = "news",
216 | howe = "howe",
217 | atlas = "atlas",
218 | cosmos = "cosmos",
219 | bias = "bias",
220 | andes = "andes"
221 | }
222 | return exceptions[word]
223 | end
224 | local step_1a
225 | step_1a = function(word)
226 | if ends_with(word, "'s'") then
227 | return word:sub(1, -4)
228 | elseif ends_with(word, "'s") then
229 | return word:sub(1, -3)
230 | elseif ends_with(word, "'") then
231 | return word:sub(1, -2)
232 | end
233 | if ends_with(word, "sses") then
234 | return replace_suffix(word, "sses", "ss")
235 | end
236 | if ends_with(word, "ied") then
237 | if #word > 4 then
238 | return replace_suffix(word, "ied", "i")
239 | else
240 | return replace_suffix(word, "ied", "ie")
241 | end
242 | end
243 | if ends_with(word, "ies") then
244 | if #word > 4 then
245 | return replace_suffix(word, "ies", "i")
246 | else
247 | return replace_suffix(word, "ies", "ie")
248 | end
249 | end
250 | if ends_with(word, "s") and not ends_with(word, "us") and not ends_with(word, "ss") then
251 | local stem = word:sub(1, -2)
252 | if contains_vowel(stem) then
253 | return stem
254 | end
255 | end
256 | return word
257 | end
258 | local step_1b
259 | step_1b = function(word)
260 | if ends_with(word, "eedly") then
261 | local stem = word:sub(1, -6)
262 | if in_r1(word, #stem + 1) then
263 | if ends_with(stem, "proc") or ends_with(stem, "exc") or ends_with(stem, "succ") then
264 | return word
265 | end
266 | return stem .. "ee"
267 | end
268 | return word
269 | end
270 | if ends_with(word, "eed") then
271 | local stem = word:sub(1, -4)
272 | if in_r1(word, #stem + 1) then
273 | if ends_with(stem, "proc") or ends_with(stem, "exc") or ends_with(stem, "succ") then
274 | return word
275 | end
276 | return stem .. "ee"
277 | end
278 | return word
279 | end
280 | local suffix_removed = false
281 | local stem = word
282 | if ends_with(word, "ingly") then
283 | stem = word:sub(1, -6)
284 | suffix_removed = true
285 | elseif ends_with(word, "edly") then
286 | stem = word:sub(1, -5)
287 | suffix_removed = true
288 | elseif ends_with(word, "ing") then
289 | stem = word:sub(1, -4)
290 | suffix_removed = true
291 | elseif ends_with(word, "ed") then
292 | stem = word:sub(1, -3)
293 | suffix_removed = true
294 | end
295 | if suffix_removed then
296 | if not (contains_vowel(stem)) then
297 | return word
298 | end
299 | if ends_with(word, "ing") then
300 | if ends_with(stem, "y") and #stem > 1 then
301 | local prev = stem:sub(-2, -2)
302 | if is_consonant(prev) and #stem == 2 then
303 | return stem:sub(1, -2) .. "ie"
304 | end
305 | end
306 | if ends_with(stem, "inn") or ends_with(stem, "out") or ends_with(stem, "cann") or ends_with(stem, "herr") or ends_with(stem, "earr") or ends_with(stem, "even") then
307 | return word
308 | end
309 | end
310 | if ends_with(stem, "at") or ends_with(stem, "bl") or ends_with(stem, "iz") then
311 | return stem .. "e"
312 | end
313 | if #stem >= 2 then
314 | local last = stem:sub(-1, -1)
315 | local prev = stem:sub(-2, -2)
316 | if last == prev and is_consonant(last) then
317 | local last_lower = last:lower()
318 | if not (last_lower == 'a' or last_lower == 'e' or last_lower == 'o') then
319 | if last_lower == 'b' or last_lower == 'd' or last_lower == 'f' or last_lower == 'g' or last_lower == 'm' or last_lower == 'n' or last_lower == 'p' or last_lower == 'r' or last_lower == 't' then
320 | return stem:sub(1, -2)
321 | end
322 | end
323 | end
324 | end
325 | if in_r1(word, #stem + 1) and ends_with_short_syllable(stem) then
326 | return stem .. "e"
327 | end
328 | return stem
329 | end
330 | return word
331 | end
332 | local step_1c
333 | step_1c = function(word)
334 | if #word > 2 then
335 | local last = word:sub(-1, -1)
336 | local prev = word:sub(-2, -2)
337 | if (last == 'y' or last == 'Y') and is_consonant(prev) then
338 | return word:sub(1, -2) .. "i"
339 | end
340 | end
341 | return word
342 | end
343 | local step_2
344 | step_2 = function(word)
345 | local mappings = {
346 | {
347 | "ational",
348 | "ate"
349 | },
350 | {
351 | "tional",
352 | "tion"
353 | },
354 | {
355 | "enci",
356 | "ence"
357 | },
358 | {
359 | "anci",
360 | "ance"
361 | },
362 | {
363 | "abli",
364 | "able"
365 | },
366 | {
367 | "entli",
368 | "ent"
369 | },
370 | {
371 | "ization",
372 | "ize"
373 | },
374 | {
375 | "izer",
376 | "ize"
377 | },
378 | {
379 | "ation",
380 | "ate"
381 | },
382 | {
383 | "ator",
384 | "ate"
385 | },
386 | {
387 | "alism",
388 | "al"
389 | },
390 | {
391 | "aliti",
392 | "al"
393 | },
394 | {
395 | "alli",
396 | "al"
397 | },
398 | {
399 | "fulness",
400 | "ful"
401 | },
402 | {
403 | "ousli",
404 | "ous"
405 | },
406 | {
407 | "ousness",
408 | "ous"
409 | },
410 | {
411 | "iveness",
412 | "ive"
413 | },
414 | {
415 | "iviti",
416 | "ive"
417 | },
418 | {
419 | "biliti",
420 | "ble"
421 | },
422 | {
423 | "bli",
424 | "ble"
425 | },
426 | {
427 | "fulli",
428 | "ful"
429 | },
430 | {
431 | "lessli",
432 | "less"
433 | }
434 | }
435 | for _index_0 = 1, #mappings do
436 | local pair = mappings[_index_0]
437 | local suffix, replacement = pair[1], pair[2]
438 | if ends_with(word, suffix) then
439 | local stem = word:sub(1, #word - #suffix)
440 | if in_r1(word, #stem + 1) then
441 | return stem .. replacement
442 | end
443 | end
444 | end
445 | if ends_with(word, "ogi") then
446 | local stem = word:sub(1, -4)
447 | if in_r1(word, #stem + 1) and ends_with(stem, "l") then
448 | return stem .. "og"
449 | end
450 | end
451 | if ends_with(word, "li") then
452 | local stem = word:sub(1, -3)
453 | if in_r1(word, #stem + 1) and #stem > 0 then
454 | local last = stem:sub(-1, -1)
455 | if is_valid_li(last) then
456 | return stem
457 | end
458 | end
459 | end
460 | if ends_with(word, "ogist") then
461 | local stem = word:sub(1, -5)
462 | if in_r1(word, #stem + 1) then
463 | return stem .. "og"
464 | end
465 | end
466 | return word
467 | end
468 | local step_3
469 | step_3 = function(word)
470 | local mappings = {
471 | {
472 | "ational",
473 | "ate"
474 | },
475 | {
476 | "tional",
477 | "tion"
478 | },
479 | {
480 | "alize",
481 | "al"
482 | },
483 | {
484 | "icate",
485 | "ic"
486 | },
487 | {
488 | "iciti",
489 | "ic"
490 | },
491 | {
492 | "ical",
493 | "ic"
494 | },
495 | {
496 | "ful",
497 | ""
498 | },
499 | {
500 | "ness",
501 | ""
502 | }
503 | }
504 | for _index_0 = 1, #mappings do
505 | local pair = mappings[_index_0]
506 | local suffix, replacement = pair[1], pair[2]
507 | if ends_with(word, suffix) then
508 | local stem = word:sub(1, #word - #suffix)
509 | if in_r1(word, #stem + 1) then
510 | return stem .. replacement
511 | end
512 | end
513 | end
514 | if ends_with(word, "ative") then
515 | local stem = word:sub(1, -6)
516 | if in_r2(word, #stem + 1) then
517 | return stem
518 | end
519 | end
520 | return word
521 | end
522 | local step_4
523 | step_4 = function(word)
524 | local suffixes = {
525 | "al",
526 | "ance",
527 | "ence",
528 | "er",
529 | "ic",
530 | "able",
531 | "ible",
532 | "ant",
533 | "ement",
534 | "ment",
535 | "ent",
536 | "ism",
537 | "ate",
538 | "iti",
539 | "ous",
540 | "ive",
541 | "ize"
542 | }
543 | for _index_0 = 1, #suffixes do
544 | local suffix = suffixes[_index_0]
545 | if ends_with(word, suffix) then
546 | local stem = word:sub(1, #word - #suffix)
547 | if in_r2(word, #stem + 1) then
548 | return stem
549 | end
550 | end
551 | end
552 | if ends_with(word, "ion") then
553 | local stem = word:sub(1, -4)
554 | if in_r2(word, #stem + 1) and #stem > 0 then
555 | local last = stem:sub(-1, -1)
556 | if last == 's' or last == 't' then
557 | return stem
558 | end
559 | end
560 | end
561 | return word
562 | end
563 | local step_5
564 | step_5 = function(word)
565 | if ends_with(word, "e") then
566 | local stem = word:sub(1, -2)
567 | if in_r2(word, #stem + 1) then
568 | return stem
569 | end
570 | if in_r1(word, #stem + 1) and not ends_with_short_syllable(stem) then
571 | return stem
572 | end
573 | end
574 | if ends_with(word, "ll") and in_r2(word, #word) then
575 | return word:sub(1, -2)
576 | end
577 | return word
578 | end
579 | local stem_word
580 | stem_word = function(word)
581 | if not (word and type(word) == "string") then
582 | return word
583 | end
584 | if #word < 3 then
585 | return word
586 | end
587 | word = word:lower()
588 | local exception = exception1(word)
589 | if exception then
590 | return exception
591 | end
592 | if #word < 3 then
593 | return word
594 | end
595 | local y_found
596 | word, y_found = prelude(word)
597 | word = step_1a(word)
598 | word = step_1b(word)
599 | word = step_1c(word)
600 | word = step_2(word)
601 | word = step_3(word)
602 | word = step_4(word)
603 | word = step_5(word)
604 | word = postlude(word, y_found)
605 | return word
606 | end
607 | return {
608 | stem_word = stem_word
609 | }
610 |
--------------------------------------------------------------------------------
/lapis/bayes/text/stem.moon:
--------------------------------------------------------------------------------
1 | -- Porter Stemmer implementation in MoonScript
2 | -- Based on the Snowball English stemmer algorithm
3 | -- https://github.com/snowballstem/snowball/blob/master/algorithms/english.sbl
4 | --
5 | -- This implementation is derived from the Snowball stemming algorithms
6 | -- Copyright (c) 2001, Dr Martin Porter
7 | -- Copyright (c) 2004,2005, Richard Boulton
8 | -- Copyright (c) 2013, Yoshiki Shibukawa
9 | -- Copyright (c) 2006,2007,2009,2010,2011,2014-2019, Olly Betts
10 | -- All rights reserved.
11 | --
12 | -- Redistribution and use in source and binary forms, with or without
13 | -- modification, are permitted provided that the following conditions
14 | -- are met:
15 | --
16 | -- 1. Redistributions of source code must retain the above copyright notice,
17 | -- this list of conditions and the following disclaimer.
18 | -- 2. Redistributions in binary form must reproduce the above copyright notice,
19 | -- this list of conditions and the following disclaimer in the documentation
20 | -- and/or other materials provided with the distribution.
21 | -- 3. Neither the name of the Snowball project nor the names of its contributors
22 | -- may be used to endorse or promote products derived from this software
23 | -- without specific prior written permission.
24 | --
25 | -- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
26 | -- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
27 | -- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
28 | -- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
29 | -- ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
30 | -- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
31 | -- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
32 | -- ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
33 | -- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
34 | -- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 |
36 | -- Character group definitions
37 | is_vowel = (char) ->
38 | return false unless char
39 | char = char\lower!
40 | char == 'a' or char == 'e' or char == 'i' or char == 'o' or char == 'u' or char == 'y'
41 |
42 | is_consonant = (char) ->
43 | return false unless char
44 | not is_vowel char
45 |
46 | is_vowel_wxy = (char) ->
47 | return false unless char
48 | char = char\lower!
49 | char == 'a' or char == 'e' or char == 'i' or char == 'o' or char == 'u' or char == 'y' or char == 'w' or char == 'x'
50 |
51 | is_valid_li = (char) ->
52 | return false unless char
53 | char = char\lower!
54 | char == 'c' or char == 'd' or char == 'e' or char == 'g' or char == 'h' or char == 'k' or char == 'm' or char == 'n' or char == 'r' or char == 't'
55 |
56 | -- String utility functions
57 | ends_with = (word, suffix) ->
58 | return false if #word < #suffix
59 | word\sub(-#suffix) == suffix
60 |
61 | contains_vowel = (word) ->
62 | for i = 1, #word
63 | return true if is_vowel word\sub(i, i)
64 | false
65 |
66 | -- Replace suffix with replacement
67 | replace_suffix = (word, suffix, replacement) ->
68 | if ends_with word, suffix
69 | word\sub(1, #word - #suffix) .. replacement
70 | else
71 | word
72 |
73 | -- Get suffix starting at position
74 | get_suffix = (word, pos) ->
75 | return "" if pos > #word
76 | word\sub pos
77 |
78 | -- Region detection
79 | -- Find R1: the region after the first non-vowel following a vowel
80 | find_r1 = (word) ->
81 | -- Special handling for common prefixes
82 | if word\sub(1, 5) == "gener"
83 | return 6
84 | elseif word\sub(1, 6) == "commun"
85 | return 7
86 | elseif word\sub(1, 5) == "arsen"
87 | return 6
88 | elseif word\sub(1, 4) == "past"
89 | return 5
90 | elseif word\sub(1, 7) == "univers"
91 | return 8
92 | elseif word\sub(1, 5) == "later"
93 | return 6
94 | elseif word\sub(1, 5) == "emerg"
95 | return 6
96 | elseif word\sub(1, 5) == "organ"
97 | return 6
98 |
99 | -- Standard R1 detection: find first V followed by NV
100 | for i = 1, #word - 1
101 | if is_vowel(word\sub(i, i)) and is_consonant(word\sub(i + 1, i + 1))
102 | return i + 2
103 |
104 | #word + 1
105 |
106 | -- Find R2: the region after the first non-vowel following a vowel in R1
107 | find_r2 = (word) ->
108 | r1_pos = find_r1 word
109 | return #word + 1 if r1_pos > #word
110 |
111 | -- Find V followed by NV in R1
112 | for i = r1_pos, #word - 1
113 | if is_vowel(word\sub(i, i)) and is_consonant(word\sub(i + 1, i + 1))
114 | return i + 2
115 |
116 | #word + 1
117 |
118 | -- Test if position is at R1
119 | in_r1 = (word, pos) ->
120 | r1 = find_r1 word
121 | pos >= r1
122 |
123 | -- Test if position is at R2
124 | in_r2 = (word, pos) ->
125 | r2 = find_r2 word
126 | pos >= r2
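-- Editorial example (not in the original source), tracing the region helpers above:
--   find_r1("beautiful") --> 6   R1 is "iful" (first vowel/non-vowel pair is "ut")
--   find_r2("beautiful") --> 8   R2 is "ul"
--   find_r1("generate")  --> 6   R1 is "ate", via the "gener" prefix special case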
127 |
128 | -- Test for short syllable
129 | -- A short syllable is either (a) a vowel followed by a non-vowel other than w, x or Y
130 | -- and preceded by a non-vowel, or (b) a vowel at the beginning of the word followed
131 | -- by a non-vowel.
132 | is_short_syllable_at = (word, pos) ->
133 | return false if pos < 1 or pos > #word
134 |
135 | char = word\sub(pos, pos)
136 | return false unless is_vowel char
137 |
138 | if pos == 1
139 | -- Case (b): vowel at beginning followed by non-vowel
140 | if #word > 1
141 | next_char = word\sub(2, 2)
142 | return is_consonant next_char
143 | return false
144 |
145 | -- Case (a): non-vowel, vowel, non-vowel (not w,x,Y)
146 | if pos < #word
147 | prev_char = word\sub(pos - 1, pos - 1)
148 | next_char = word\sub(pos + 1, pos + 1)
149 |
150 | if is_consonant(prev_char) and is_consonant(next_char)
151 | next_lower = next_char\lower!
152 | return next_lower != 'w' and next_lower != 'x' and next_char != 'Y'
153 |
154 | false
155 |
156 | -- Test if word ends with short syllable
157 | ends_with_short_syllable = (word) ->
158 | return false if #word < 2
159 |
160 | -- Check last two characters for pattern
161 | if #word == 2
162 | return is_vowel(word\sub(1, 1)) and is_consonant(word\sub(2, 2))
163 |
164 | -- Check last three characters for non-vowel, vowel, non-vowel (not w,x,Y)
165 | if #word >= 3
166 | c1 = word\sub(-3, -3)
167 | c2 = word\sub(-2, -2)
168 | c3 = word\sub(-1, -1)
169 |
170 | if is_consonant(c1) and is_vowel(c2) and is_consonant(c3)
171 | c3_lower = c3\lower!
172 | return c3_lower != 'w' and c3_lower != 'x' and c3 != 'Y'
173 |
174 | false
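-- Editorial examples (not in the original source) for the helper above:
--   ends_with_short_syllable("trap")    --> true   (consonant, vowel, consonant "rap")
--   ends_with_short_syllable("ow")      --> true   (two-letter vowel/consonant case)
--   ends_with_short_syllable("bestow")  --> false  (final "w" is excluded)
--   ends_with_short_syllable("disturb") --> false  ("urb" begins with a vowel)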
175 |
176 | -- Test for short word: word is short if it consists of a short syllable
177 | -- and nothing else, or if R1 is null
178 | is_short_word = (word) ->
179 | r1 = find_r1 word
180 | return true if r1 > #word
181 |
182 | -- Also check if ends with short syllable at beginning of R1
183 | if r1 == #word + 1 and ends_with_short_syllable word
184 | return true
185 |
186 | false
187 |
188 | -- Prelude: handle initial Y and y after vowel
189 | prelude = (word) ->
190 | return word if #word == 0
191 |
192 | -- Remove initial apostrophe
193 | word = word\sub(2) if word\sub(1, 1) == "'"
194 |
195 | result = {}
196 | y_found = false
197 |
198 | for i = 1, #word
199 | char = word\sub(i, i)
200 |
201 | if char == 'y'
202 | -- Convert to Y if at beginning or after vowel
203 | if i == 1
204 | table.insert result, 'Y'
205 | y_found = true
206 | elseif i > 1 and is_vowel(word\sub(i - 1, i - 1))
207 | table.insert result, 'Y'
208 | y_found = true
209 | else
210 | table.insert result, char
211 | else
212 | table.insert result, char
213 |
214 | table.concat(result), y_found
215 |
216 | -- Postlude: convert Y back to y
217 | postlude = (word, y_found) ->
218 | return word unless y_found
219 | word\gsub('Y', 'y')
220 |
221 | -- Exception list 1: special cases
222 | exception1 = (word) ->
223 | exceptions = {
224 | skis: "ski"
225 | skies: "sky"
226 | idly: "idl"
227 | gently: "gentl"
228 | ugly: "ugli"
229 | early: "earli"
230 | only: "onli"
231 | singly: "singl"
232 | sky: "sky"
233 | news: "news"
234 | howe: "howe"
235 | atlas: "atlas"
236 | cosmos: "cosmos"
237 | bias: "bias"
238 | andes: "andes"
239 | }
240 |
241 | exceptions[word]
242 |
243 | -- Step 1a: handle plural forms
244 | step_1a = (word) ->
245 | -- Handle apostrophe forms
246 | if ends_with word, "'s'"
247 | return word\sub(1, -4)
248 | elseif ends_with word, "'s"
249 | return word\sub(1, -3)
250 | elseif ends_with word, "'"
251 | return word\sub(1, -2)
252 |
253 | -- Handle sses -> ss
254 | if ends_with word, "sses"
255 | return replace_suffix word, "sses", "ss"
256 |
257 | -- Handle ied, ies
258 | if ends_with word, "ied"
259 | if #word > 4
260 | return replace_suffix word, "ied", "i"
261 | else
262 | return replace_suffix word, "ied", "ie"
263 |
264 | if ends_with word, "ies"
265 | if #word > 4
266 | return replace_suffix word, "ies", "i"
267 | else
268 | return replace_suffix word, "ies", "ie"
269 |
270 | -- Handle s (but not us or ss)
271 | if ends_with(word, "s") and not ends_with(word, "us") and not ends_with(word, "ss")
272 | -- Only remove s if preceded by vowel somewhere in word
273 | stem = word\sub(1, -2)
274 | if contains_vowel stem
275 | return stem
276 |
277 | word
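-- Editorial examples (not in the original source) of step_1a in isolation:
--   "misses" --> "miss"  ("sses" -> "ss")
--   "ponies" --> "poni"  ("ies" -> "i" on words longer than 4 letters)
--   "ties"   --> "tie"   ("ies" -> "ie" on 4-letter words)
--   "dogs"   --> "dog"   (trailing "s" dropped when the stem contains a vowel)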
278 |
279 | -- Step 1b: handle ed, ing, eed forms
280 | step_1b = (word) ->
281 | -- Handle eed, eedly
282 | if ends_with word, "eedly"
283 | stem = word\sub(1, -6)
284 | if in_r1 word, #stem + 1
285 | -- Check for special cases
286 | if ends_with(stem, "proc") or ends_with(stem, "exc") or ends_with(stem, "succ")
287 | return word
288 | return stem .. "ee"
289 | return word
290 |
291 | if ends_with word, "eed"
292 | stem = word\sub(1, -4)
293 | if in_r1 word, #stem + 1
294 | if ends_with(stem, "proc") or ends_with(stem, "exc") or ends_with(stem, "succ")
295 | return word
296 | return stem .. "ee"
297 | return word
298 |
299 | -- Handle ed, edly, ing, ingly
300 | suffix_removed = false
301 | stem = word
302 |
303 | if ends_with word, "ingly"
304 | stem = word\sub(1, -6)
305 | suffix_removed = true
306 | elseif ends_with word, "edly"
307 | stem = word\sub(1, -5)
308 | suffix_removed = true
309 | elseif ends_with word, "ing"
310 | stem = word\sub(1, -4)
311 | suffix_removed = true
312 | elseif ends_with word, "ed"
313 | stem = word\sub(1, -3)
314 | suffix_removed = true
315 |
316 | if suffix_removed
317 | -- Only proceed if stem contains vowel
318 | return word unless contains_vowel stem
319 |
320 | -- Special handling for ing forms
321 | if ends_with word, "ing"
322 | -- dying -> die, lying -> lie, tying -> tie
323 | if ends_with(stem, "y") and #stem > 1
324 | prev = stem\sub(-2, -2)
325 | if is_consonant(prev) and #stem == 2
326 | return stem\sub(1, -2) .. "ie"
327 |
328 | -- inning, outing, canning stay as is
329 | if ends_with(stem, "inn") or ends_with(stem, "out") or ends_with(stem, "cann") or ends_with(stem, "herr") or ends_with(stem, "earr") or ends_with(stem, "even")
330 | return word
331 |
332 | -- Post-processing based on stem ending
333 | if ends_with(stem, "at") or ends_with(stem, "bl") or ends_with(stem, "iz")
334 | return stem .. "e"
335 |
336 | -- Handle double consonants (not aeo)
337 | if #stem >= 2
338 | last = stem\sub(-1, -1)
339 | prev = stem\sub(-2, -2)
340 | if last == prev and is_consonant(last)
341 | last_lower = last\lower!
342 | unless last_lower == 'a' or last_lower == 'e' or last_lower == 'o'
343 | -- Remove one of the double consonants (but check for special cases)
344 | if last_lower == 'b' or last_lower == 'd' or last_lower == 'f' or last_lower == 'g' or last_lower == 'm' or last_lower == 'n' or last_lower == 'p' or last_lower == 'r' or last_lower == 't'
345 | return stem\sub(1, -2)
346 |
347 | -- If R1 is null and ends with short syllable, add e
348 | if in_r1(word, #stem + 1) and ends_with_short_syllable stem
349 | return stem .. "e"
350 |
351 | return stem
352 |
353 | word
354 |
355 | -- Step 1c: replace suffix y or Y by i if preceded by non-vowel which is not at the beginning
356 | step_1c = (word) ->
357 | if #word > 2
358 | last = word\sub(-1, -1)
359 | prev = word\sub(-2, -2)
360 |
361 | if (last == 'y' or last == 'Y') and is_consonant(prev)
362 | return word\sub(1, -2) .. "i"
363 |
364 | word
365 |
366 | -- Step 2: suffix removal for derivational suffixes
367 | step_2 = (word) ->
368 | mappings = {
369 | {"ational", "ate"}
370 | {"tional", "tion"}
371 | {"enci", "ence"}
372 | {"anci", "ance"}
373 | {"abli", "able"}
374 | {"entli", "ent"}
375 | {"ization", "ize"}
376 | {"izer", "ize"}
377 | {"ation", "ate"}
378 | {"ator", "ate"}
379 | {"alism", "al"}
380 | {"aliti", "al"}
381 | {"alli", "al"}
382 | {"fulness", "ful"}
383 | {"ousli", "ous"}
384 | {"ousness", "ous"}
385 | {"iveness", "ive"}
386 | {"iviti", "ive"}
387 | {"biliti", "ble"}
388 | {"bli", "ble"}
389 | {"fulli", "ful"}
390 | {"lessli", "less"}
391 | }
392 |
393 | for pair in *mappings
394 | suffix, replacement = pair[1], pair[2]
395 | if ends_with word, suffix
396 | stem = word\sub(1, #word - #suffix)
397 | if in_r1 word, #stem + 1
398 | return stem .. replacement
399 |
400 | -- Special case: ogi -> og (when preceded by l)
401 | if ends_with word, "ogi"
402 | stem = word\sub(1, -4)
403 | if in_r1(word, #stem + 1) and ends_with(stem, "l")
404 | return stem .. "og"
405 |
406 | -- Special case: li -> delete (when preceded by valid_li)
407 | if ends_with word, "li"
408 | stem = word\sub(1, -3)
409 | if in_r1(word, #stem + 1) and #stem > 0
410 | last = stem\sub(-1, -1)
411 | if is_valid_li last
412 | return stem
413 |
414 | -- Special case: ogist -> og
415 | if ends_with word, "ogist"
416 | stem = word\sub(1, -5)
417 | if in_r1 word, #stem + 1
418 | return stem .. "og"
419 |
420 | word
421 |
422 | -- Step 3: suffix removal
423 | step_3 = (word) ->
424 | mappings = {
425 | {"ational", "ate"}
426 | {"tional", "tion"}
427 | {"alize", "al"}
428 | {"icate", "ic"}
429 | {"iciti", "ic"}
430 | {"ical", "ic"}
431 | {"ful", ""}
432 | {"ness", ""}
433 | }
434 |
435 | for pair in *mappings
436 | suffix, replacement = pair[1], pair[2]
437 | if ends_with word, suffix
438 | stem = word\sub(1, #word - #suffix)
439 | if in_r1 word, #stem + 1
440 | return stem .. replacement
441 |
442 | -- Special case: ative -> delete (in R2)
443 | if ends_with word, "ative"
444 | stem = word\sub(1, -6)
445 | if in_r2 word, #stem + 1
446 | return stem
447 |
448 | word
449 |
450 | -- Step 4: suffix removal
451 | step_4 = (word) ->
452 | suffixes = {
453 | "al", "ance", "ence", "er", "ic", "able", "ible",
454 | "ant", "ement", "ment", "ent", "ism", "ate",
455 | "iti", "ous", "ive", "ize"
456 | }
457 |
458 | for suffix in *suffixes
459 | if ends_with word, suffix
460 | stem = word\sub(1, #word - #suffix)
461 | if in_r2 word, #stem + 1
462 | return stem
463 |
464 | -- Special case: ion -> delete (when preceded by s or t in R2)
465 | if ends_with word, "ion"
466 | stem = word\sub(1, -4)
467 | if in_r2(word, #stem + 1) and #stem > 0
468 | last = stem\sub(-1, -1)
469 | if last == 's' or last == 't'
470 | return stem
471 |
472 | word
473 |
474 | -- Step 5: suffix removal
475 | step_5 = (word) ->
476 | -- Step 5a: remove trailing e
477 | if ends_with word, "e"
478 | stem = word\sub(1, -2)
479 |
480 | -- Delete if in R2
481 | if in_r2 word, #stem + 1
482 | return stem
483 |
484 | -- Delete if in R1 and not preceded by short syllable
485 | if in_r1(word, #stem + 1) and not ends_with_short_syllable(stem)
486 | return stem
487 |
488 | -- Step 5b: remove trailing l
489 | if ends_with(word, "ll") and in_r2(word, #word)
490 | return word\sub(1, -2)
491 |
492 | word
493 |
494 | -- Main stemming function
495 | stem_word = (word) ->
496 | return word unless word and type(word) == "string"
497 | return word if #word < 3
498 |
499 | word = word\lower!
500 |
501 | -- Check exceptions first
502 | exception = exception1 word
503 | return exception if exception
504 |
505 | -- If word is too short, return as-is
506 | return word if #word < 3
507 |
508 | -- Run through stemming steps
509 | word, y_found = prelude word
510 |
511 | word = step_1a word
512 | word = step_1b word
513 | word = step_1c word
514 | word = step_2 word
515 | word = step_3 word
516 | word = step_4 word
517 | word = step_5 word
518 |
519 | word = postlude word, y_found
520 |
521 | word
522 |
523 | {
524 | :stem_word
525 | }
526 |
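-- Editorial usage sketch (not in the original source). Given the steps above, the
-- exported stem_word is expected to behave roughly as follows:
--
--   stem = require "lapis.bayes.text.stem"
--   stem.stem_word "running" --> "run"   (step 1b: "ing" removed, double "n" undoubled)
--   stem.stem_word "jumped"  --> "jump"  (step 1b: "ed" removed)
--   stem.stem_word "cats"    --> "cat"   (step 1a: trailing "s" removed)
--   stem.stem_word "skies"   --> "sky"   (exception list)
--   stem.stem_word "go"      --> "go"    (words shorter than 3 characters are returned as-is)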
--------------------------------------------------------------------------------
/lapis/bayes/tokenizers/spam.moon:
--------------------------------------------------------------------------------
1 | unpack_fn = table.unpack or unpack
2 |
3 | punycode = require "lapis.bayes.text.punycode"
4 | import Extractor from require "web_sanitize.html"
5 | types = require "lapis.validate.types"
6 |
7 | import cjk_character from require "lapis.bayes.text.utf8"
8 |
9 | extract_text = Extractor {
10 | escape_html: false
11 | }
12 |
13 | normalize_number = (value) ->
14 | return unless value and value != ""
15 |
16 | normalized = value\gsub("[,%s]", "")
17 | digits_only = normalized\gsub("[^%d]", "")
18 | return if digits_only == ""
19 |
20 | normalized
21 |
22 | -- NOTE: this only works with ASCII punctuation characters, be careful when
23 | -- updating punct_pattern if it's going to include unicode punctuation
24 | handle_punct = (chars) ->
25 | char = chars\sub 1, 1
26 | {tag: "punct", value: char .. tostring(#chars)}
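-- Editorial example (not in the original source): handle_punct("!!!") produces
-- {tag: "punct", value: "!3"} -- the first character plus the run length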
27 |
28 | handle_invalid_byte = (byte) ->
29 | {tag: "invalid_byte", value: tostring(string.byte(byte))}
30 |
31 | -- return new array with order shuffled by dithering
32 | -- e: dither factor
33 | -- https://buildingrecommenders.wordpress.com/2015/11/11/dithering/
34 | dithered = do
35 |   -- random normal (Box-Muller)
36 | gn = (sd=1, mean=0, r=math.random) ->
37 | local x1, x2, w, y1, y2
38 | while true
39 | x1 = 2 * r! - 1
40 | x2 = 2 * r! - 1
41 | w = x1^2 + x2^2
42 | break if w < 1
43 |
44 | w = math.sqrt -2 * math.log(w) / 2
45 | y1 = x1 * w
46 | y2 = x2 * w
47 |
48 | y1 * sd + mean
49 |
50 | dither_score = (rank, e) ->
51 | math.log(rank) + gn(math.log(e))
52 |
53 | (items, e=1.5) ->
54 | rows = for i, item in ipairs items
55 | {dither_score(i, e), item}
56 |
57 | table.sort rows, (a, b) ->
58 | a[1] < b[1]
59 |
60 | [row[2] for row in *rows]
61 |
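-- Editorial note (not in the original source): dithered {"a", "b", "c", "d"} returns a
-- new permutation in which items near the front tend to stay near the front; the output
-- is random, so {"a", "c", "b", "d"} is one possible result. A larger `e` increases the
-- amount of reshuffling.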
62 |
63 | -- spam tokenizer with support for domains, emails, currencies, and more
64 | -- opts = {
65 | -- filter_text: function -- function to pre-filter text, returns new text
66 | -- min_word_length: number -- minimum length of word (default 2)
67 | -- max_word_length: number -- maximum length of word (default 32)
68 | -- ignore_words: table -- table of words to ignore
69 | -- stem_words: bool -- enable word stemming
70 | -- unaccent: bool -- enable unaccenting (default true)
71 | -- dedupe: bool -- enable deduplication (default true)
72 | --   ignore_tokens: table -- table of tokens to ignore, eg. {my_token: true}
73 | -- ignore_domains: {string} -- domains to ignore (`example.com` exact, `.example.com` includes subdomains)
74 | -- sample_at_most: number -- limit number of sampled tokens
75 | -- dither: bool -- enable dithering when sampling (default true)
76 | -- bigram_tokens: bool -- enable bigram generation
77 | -- filter_tokens: function -- function to filter tokens, called at end with (tokens, opts)
78 | -- domain_tokens_first: bool -- move domain tokens before all other tokens (default false)
79 | --   split_cjk: bool -- split Chinese, Korean, Japanese characters into individual words
80 | -- }
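--
-- A hypothetical construction for illustration only (editorial note; the option values
-- below are made up, see the list above):
--
--   tokenizer = SpamTokenizer {
--     stem_words: true
--     bigram_tokens: true
--     ignore_domains: {"example.com", ".tracker.example"}
--   }
--   tokens = tokenizer\tokenize_text "Huge SALE!!! Visit https://shop.example.net/deals"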
81 | class SpamTokenizer extends require "lapis.bayes.tokenizers.base"
82 | new: (@opts = {}) =>
83 |
84 | tagged_token_to_string: (token) =>
85 | "#{token.tag}:#{token.value}"
86 |
87 | normalize_domain_string: (domain) =>
88 | return unless domain and domain != ""
89 | domain = tostring domain
90 | domain = domain\gsub("^%s+", "")\gsub("%s+$", "")
91 | domain = domain\gsub("%.+$", "")
92 | return if domain == ""
93 |
94 | labels = {}
95 | for label in domain\gmatch "[^%.]+"
96 | return if label == ""
97 | encoded = punycode.punycode_encode label
98 | encoded or= label
99 | table.insert labels, encoded\lower!
100 |
101 | return unless next labels
102 | table.concat labels, "."
103 |
104 | build_ignored_domains: =>
105 | entries = @opts.ignore_domains
106 | return false unless entries and #entries > 0
107 |
108 | exact = {}
109 | suffix = {}
110 |
111 | for domain in *entries
112 | continue unless type(domain) == "string"
113 | domain = domain\gsub("^%s+", "")\gsub("%s+$", "")
114 | continue if domain == ""
115 |
116 | is_suffix = domain\sub(1, 1) == "."
117 | domain = domain\sub(2) if is_suffix
118 | continue if domain == ""
119 |
120 | normalized = @normalize_domain_string domain
121 | continue unless normalized
122 |
123 | if is_suffix
124 | suffix[normalized] = true
125 | else
126 | exact[normalized] = true
127 |
128 | return false unless next(exact) or next(suffix)
129 |
130 | {
131 | exact: exact
132 | suffix: suffix
133 | }
134 |
135 | should_ignore_domain: (domain) =>
136 | return false unless @opts.ignore_domains
137 |
138 | if @ignored_domains == nil
139 | @ignored_domains = @build_ignored_domains!
140 |
141 |
142 | return false unless @ignored_domains
143 | normalized = @normalize_domain_string domain
144 | return false unless normalized
145 |
146 | if @ignored_domains.exact[normalized]
147 | return true
148 |
149 | for suffix in pairs @ignored_domains.suffix
150 | return true if normalized == suffix
151 | if #normalized > #suffix
152 | if normalized\sub(-(#suffix + 1)) == ".#{suffix}"
153 | return true
154 |
155 | false
156 |
157 | build_grammar: =>
158 | import P, S, R, C, Ct from require "lpeg"
159 | utf8 = require "lapis.util.utf8"
160 |
161 | min_len = @opts.min_word_length or 2
162 | max_len = @opts.max_word_length or 32
163 | ignore_words = @opts.ignore_words
164 |
165 | truncate = types.truncated_text max_len
166 |
167 | stem = if @opts.stem_words
168 | require("lapis.bayes.text.stem").stem_word
169 |
170 | case_insensitive = (text) ->
171 | out = nil
172 | for char in text\gmatch "."
173 | lower = char\lower!
174 | upper = char\upper!
175 | pattern = if lower == upper
176 | P char
177 | else
178 | S "#{lower}#{upper}"
179 |
180 | out = if out
181 | out * pattern
182 | else
183 | pattern
184 |
185 | out or P(false)
186 |
187 | normalize_word = (word) ->
188 | return unless word and word != ""
189 |
190 | word = word\lower!
191 | word = word\gsub("'+", "")
192 |
193 | return if #word < min_len
194 | if #word > max_len
195 | word = truncate\transform word
196 | return if ignore_words and ignore_words[word]
197 |
198 | word
199 |
200 | handle_domain_token = (domain) ->
201 | -- convert subdomains to punycode
202 | labels = for label in domain\gmatch "[^%.]+"
203 | encoded = punycode.punycode_encode label
204 | if #encoded > max_len
205 | truncate\transform encoded
206 | else
207 | encoded
208 |
209 | tokens = {
210 | {tag: "domain", value: truncate\transform table.concat(labels, ".")\lower!}
211 | }
212 |
213 | -- Generate hierarchical domain tokens with leading dots for subdomains
214 | if #labels >= 2
215 | for i = 2, #labels
216 | suffix = table.concat [labels[j] for j = i, #labels], "."
217 | table.insert tokens, {tag: "domain", value: truncate\transform ".#{suffix\lower!}"}
218 |
219 | unpack_fn tokens
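      -- Editorial example (not in the original source): for an ASCII domain such as
      -- "shop.example.com" (assuming punycode_encode returns ASCII labels unchanged),
      -- this yields roughly {tag: "domain", value: "shop.example.com"},
      -- {tag: "domain", value: ".example.com"} and {tag: "domain", value: ".com"}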
220 |
221 | extract_url_words = (...) ->
222 | out = {}
223 | for part in *{...}
224 | continue unless part and #part > 0
225 |
226 | -- Strip leading URL punctuation like / ? #
227 | part = part\gsub("^[:/?#]+", "")
228 | continue if part == ""
229 |
230 | -- Treat underscores and other punctuation as separators
231 | part = part\gsub("_", " ")
232 | part = part\gsub("[^%w']+", " ")
233 |
234 | for raw in part\gmatch "%S+"
235 | normalized = normalize_word raw
236 | table.insert out, normalized if normalized
237 |
238 | out
239 |
240 | handle_url = (domain, path="", query="", fragment="") ->
241 | return if @should_ignore_domain domain
242 |
243 | tokens = {}
244 |
245 | for word in *extract_url_words path, query, fragment
246 | table.insert tokens, word
247 |
248 | for token in *{handle_domain_token domain}
249 | table.insert tokens, token
250 |
251 | unpack_fn tokens
252 |
253 | handle_email = (email) ->
254 | email = email\lower!
255 | user, domain = email\match "^([^@]+)@(.+)$"
256 |
257 | tokens = {{tag: "email", value: truncate\transform email}}
258 |
259 | if user
260 | user_token = normalize_word user
261 | table.insert tokens, {tag: "email_user", value: user_token} if user_token
262 |
263 | if domain
264 | for token in *{handle_domain_token domain}
265 | table.insert tokens, token
266 |
267 | unpack_fn tokens
268 |
269 | handle_number = (value) ->
270 | normalized = normalize_number value
271 | return unless normalized
272 | if #normalized > max_len
273 | truncate\transform normalized
274 | else
275 | normalized
276 |
277 | handle_currency = (value) ->
278 | symbol, rest = value\match "^([%$£€¥]+)%s*(.+)$"
279 | symbol or= value\sub 1, 1
280 | rest or= ""
281 |
282 | normalized_number = normalize_number rest
283 | if normalized_number and #normalized_number > max_len
284 | normalized_number = truncate\transform normalized_number
285 |
286 | if symbol and symbol != ""
287 | if normalized_number
288 | {tag: "currency", value: symbol}, normalized_number
289 | else
290 | {tag: "currency", value: symbol}
291 |
292 | handle_percent = (value) ->
293 | number_part = value\sub 1, #value - 1
294 | normalized = normalize_number number_part
295 | return unless normalized
296 | if #normalized > max_len - 1 -- reserve 1 char for %
297 | normalized = truncate\transform normalized
298 | "#{normalized}%"
299 |
300 | handle_caps_word = (word) ->
301 | return unless word\match "%u"
302 |
303 |
304 | normalized = normalize_word word
305 | return unless normalized
306 | stemmed = if stem
307 | stem(normalized) or normalized
308 | else
309 | normalized
310 | stemmed, {tag: "caps", value: stemmed}
311 |
312 | handle_word = (word) ->
313 | normalized = normalize_word word
314 | return unless normalized
315 | if stem
316 | stem(normalized) or normalized
317 | else
318 | normalized
319 |
320 | whitespace = utf8.whitespace
321 | alpha = R "az", "AZ"
322 | digit = R "09"
323 | alphanum = alpha + digit
324 |
325 | punct_chars = S"!?$#%"
326 | other_punct = S"()[]{},.;:\"<>/@#"
327 | word_char = utf8.printable_character - whitespace - punct_chars - other_punct
328 | word_pattern = (word_char + P"'")^1
329 |
330 | cjk_word = if @opts.split_cjk
331 | word_char = word_char - cjk_character
332 | C(cjk_character) / handle_word
333 |
334 | caps_char = R"AZ"
335 | caps_pattern = caps_char^2 * (caps_char + digit)^0
336 |
337 | sign = S"+-"^-1
338 | number_body = sign * digit^1 * (P"," * digit^3)^0 * (P"." * digit^1)^-1
339 |
340 | percent_pattern = number_body * P"%"
341 | currency_pattern = S"$£€¥" * whitespace^0 * number_body
342 |
343 | punct_pattern = punct_chars^3 * punct_chars^0
344 |
345 | domain_char = utf8.printable_character - whitespace - S"./:@?#[](){}<>\"',;&"
346 | domain_label = domain_char^1
347 | domain_pattern = domain_label * (P"." * domain_label)^1
348 |
349 | not_path = S" \t\r\n\"'<>()[\\]{}?#"
350 | port_part = (P":" * digit^1)^-1
351 | path_part = (P"/" * (1 - not_path)^0)^0
352 | query_part = (P"?" * (1 - not_path)^0)^-1
353 | fragment_part = (P"#" * (1 - not_path)^0)^-1
354 |
355 | www_prefix = case_insensitive "www."
356 | scheme = (alpha + digit)^1
357 |
358 | url_with_scheme = scheme * P"://" * www_prefix^-1 * C(domain_pattern) * port_part * C(path_part) * C(query_part) * C(fragment_part)
359 | url_without_scheme = www_prefix * C(domain_pattern) * port_part * C(path_part) * C(query_part) * C(fragment_part)
360 |
361 | email_pattern = C((alphanum + S".%+_'-")^1 * P"@" * domain_pattern)
362 |
363 | number_capture = C(number_body) * -(alpha)
364 |
365 | token_patterns = {
366 | url_with_scheme / handle_url
367 | url_without_scheme / handle_url
368 | email_pattern / handle_email
369 | C(currency_pattern) / handle_currency
370 | C(percent_pattern) / handle_percent
371 | number_capture / handle_number
372 | C(caps_pattern) / handle_caps_word
373 |       -- cjk_word is inserted at this position (index 8) when split_cjk is enabled
374 | C(word_pattern) / handle_word
375 | C(punct_pattern) / handle_punct
376 | }
377 |
378 | if cjk_word
379 | table.insert token_patterns, 8, cjk_word
380 |
381 | tokens = token_patterns[1]
382 | for i = 2, #token_patterns
383 | tokens = tokens + token_patterns[i]
384 |
385 | printable = utf8.printable_character
386 | Ct (tokens + printable + (C(P(1)) / handle_invalid_byte))^0
387 |
388 |   -- this is processed on the text before HTML is stripped to get any URLs that
389 | -- might exist in attributes or in the markup
390 | collect_url_tokens: (text) =>
391 | return {} unless text and text != ""
392 |
393 | @grammar or= @build_grammar!
394 | tokens = @grammar\match text
395 | return {} unless tokens
396 |
397 | out = for token in *tokens
398 | continue unless type(token) == "table"
399 | switch token.tag
400 | when "domain", "email", "email_user"
401 | token
402 | else
403 | continue
404 | out
405 |
406 | dedupe_tokens: (tokens) =>
407 | return {} unless tokens
408 | seen = {}
409 | deduped = {}
410 | for token in *tokens
411 | -- For table tokens, use string representation as key
412 | key = if type(token) == "table"
413 | @tagged_token_to_string token
414 | else
415 | token
416 |
417 | unless seen[key]
418 | seen[key] = true
419 | table.insert deduped, token
420 | deduped
421 |
422 | generate_bigrams: (tokens) =>
423 | return {} unless tokens
424 | count = #tokens
425 | return {} if count < 2
426 | ignore_tokens = @opts.ignore_tokens
427 |
428 | bigrams = {}
429 | for i = 1, count - 1
430 | first = tokens[i]
431 | second = tokens[i + 1]
432 | continue unless first and second
433 |
434 | bigram = first .. " " .. second
435 | continue if ignore_tokens and ignore_tokens[bigram]
436 |
437 | table.insert bigrams, bigram
438 |
439 | bigrams
440 |
441 | sample_tokens: (tokens, limit=@opts.sample_at_most) =>
442 | return {} unless tokens
443 | return tokens unless limit
444 |
445 | limit = math.floor limit
446 | return {} if limit <= 0
447 | count = #tokens
448 | return tokens if count <= limit
449 |
450 | tokens_to_sample = if @opts.dither == false
451 | tokens
452 | else
453 | dithered tokens
454 |
455 | [tokens_to_sample[idx] for idx=1,limit]
456 |
457 | -- lift the tokens that match the pattern to the top, preserving order otherwise
458 | lift_tokens: (tokens, pattern) =>
459 | lifted = {}
460 | rest = for t in *tokens
461 | if t\match pattern
462 | table.insert lifted, t
463 | continue
464 |
465 | t
466 |
467 | for r in *rest
468 | table.insert lifted, r
469 |
470 | lifted
471 |
472 | tokenize_text: (text) =>
473 | return {} unless text
474 |
475 | text = tostring text
476 |
477 | if @opts.filter_text
478 | text = @opts.filter_text text
479 |
480 | unless @opts.unaccent == false
481 | text = require("lapis.bayes.text.unaccent").unaccent_string(text) or text
482 |
483 |     -- extract URLs before cleaning up text to capture URLs in HTML markup
484 | raw_domain_tokens = @collect_url_tokens text
485 |
486 | text = extract_text text
487 |
488 | @grammar or= @build_grammar!
489 | tokens = @grammar\match text or {}
490 |
491 | dedupe = true
492 | if @opts.dedupe != nil
493 | dedupe = @opts.dedupe
494 |
495 | ignore_tokens = @opts.ignore_tokens
496 | sample_limit = @opts.sample_at_most
497 | generate_bigrams = @opts.bigram_tokens
498 |
499 |     -- merge tokens in source order, keeping adjacent words together so bigrams can be formed
500 | merged_tokens = {}
501 | seen_tokens = {} -- for deduping
502 |
503 | insert_token = (t) ->
504 | if ignore_tokens and ignore_tokens[t]
505 | return
506 |
507 | if dedupe and seen_tokens[t]
508 | return
509 |
510 | seen_tokens[t] = true
511 |
512 | table.insert merged_tokens, t
513 |
514 | prev_token = nil -- for bigram generation
515 |
516 | for idx=1,#tokens
517 | token = tokens[idx]
518 |
519 | switch type token
520 | when "table" -- special token
521 | switch token.tag
522 | when "caps", "invalid_byte", "currency"
523 |
524 | nil
525 | else
526 | prev_token = nil -- break the bigram
527 |
528 | insert_token @tagged_token_to_string token
529 |
530 | when "string" -- plain word
531 | insert_token token
532 |
533 | if prev_token and generate_bigrams
534 | insert_token "#{prev_token} #{token}"
535 |
536 | prev_token = token
537 |
538 |     -- raw domain tokens lose their positions because they are extracted separately,
539 |     -- so insert them first, ahead of the tokens produced by the grammar
540 | if raw_domain_tokens
541 | original_tokens = merged_tokens
542 | merged_tokens = {}
543 | for token in *raw_domain_tokens
544 | insert_token @tagged_token_to_string token
545 |
546 | for t in *original_tokens
547 | table.insert merged_tokens, t
548 |
549 | if @opts.domain_tokens_first
550 | merged_tokens = @lift_tokens merged_tokens, "^domain:"
551 |
552 | if sample_limit
553 | merged_tokens = @sample_tokens merged_tokens
554 |
555 | -- Apply custom filter at the very end if provided
556 | if @opts.filter_tokens
557 | merged_tokens = @opts.filter_tokens merged_tokens, @opts
558 |
559 | merged_tokens
560 |
561 | return SpamTokenizer
562 |
--------------------------------------------------------------------------------