├── .gitignore ├── lapis ├── bayes │ ├── classifiers │ │ ├── default.moon │ │ ├── default.lua │ │ ├── test.moon │ │ ├── fisher.moon │ │ ├── bayes_multi.moon │ │ ├── fisher.lua │ │ ├── test.lua │ │ ├── bayes.moon │ │ ├── base.moon │ │ ├── bayes_multi.lua │ │ ├── bayes.lua │ │ └── base.lua │ ├── models.moon │ ├── models.lua │ ├── schema.moon │ ├── schema.lua │ ├── tokenizers │ │ ├── base.moon │ │ ├── base.lua │ │ ├── ngram.moon │ │ ├── url_domains.moon │ │ ├── postgres_text.moon │ │ ├── postgres_text.lua │ │ ├── ngram.lua │ │ ├── url_domains.lua │ │ └── spam.moon │ ├── model.moon │ ├── migrations.moon │ ├── model.lua │ ├── migrations.lua │ ├── text │ │ ├── utf8.moon │ │ ├── utf8.lua │ │ ├── punycode.moon │ │ ├── punycode.lua │ │ ├── stem.lua │ │ └── stem.moon │ └── models │ │ ├── word_classifications.moon │ │ ├── categories.moon │ │ ├── word_classifications.lua │ │ └── categories.lua ├── bayes.lua └── bayes.moon ├── migrations.moon ├── lint_config.moon ├── config.moon ├── Makefile ├── .github └── workflows │ └── test.yml ├── spec ├── url_tokenizer_spec.moon ├── utf8_spec.moon ├── postgres_text_tokenizer_spec.moon ├── punycode_spec.moon ├── unaccent_spec.moon ├── stem_spec.moon ├── bayes_spec.moon └── ngram_tokenizer_spec.moon ├── lapis-bayes-dev-1.rockspec └── examples └── detect_language.lua /.gitignore: -------------------------------------------------------------------------------- 1 | config.lua 2 | lint_config.lua 3 | migrations.lua 4 | *.rock 5 | -------------------------------------------------------------------------------- /lapis/bayes/classifiers/default.moon: -------------------------------------------------------------------------------- 1 | require "lapis.bayes.classifiers.bayes" 2 | -------------------------------------------------------------------------------- /lapis/bayes/classifiers/default.lua: -------------------------------------------------------------------------------- 1 | return require("lapis.bayes.classifiers.bayes") 2 | -------------------------------------------------------------------------------- /lapis/bayes/models.moon: -------------------------------------------------------------------------------- 1 | import autoload from require "lapis.util" 2 | autoload "lapis.bayes.models" 3 | -------------------------------------------------------------------------------- /migrations.moon: -------------------------------------------------------------------------------- 1 | 2 | import run_migrations from require "lapis.bayes.schema" 3 | 4 | { 5 | run_migrations 6 | } 7 | 8 | -------------------------------------------------------------------------------- /lapis/bayes/models.lua: -------------------------------------------------------------------------------- 1 | local autoload 2 | autoload = require("lapis.util").autoload 3 | return autoload("lapis.bayes.models") 4 | -------------------------------------------------------------------------------- /lapis/bayes/schema.moon: -------------------------------------------------------------------------------- 1 | run_migrations = -> 2 | m = require "lapis.db.migrations" 3 | m.run_migrations require("lapis.bayes.migrations"), "lapis_bayes" 4 | 5 | { :run_migrations } 6 | -------------------------------------------------------------------------------- /lint_config.moon: -------------------------------------------------------------------------------- 1 | { 2 | whitelist_globals: { 3 | ["spec/"]: { 4 | "it", "describe", "before_each", "after_each", "setup", "teardown", "pending" 5 | } 6 | } 7 | } 8 | 9 | 
-------------------------------------------------------------------------------- /lapis/bayes/schema.lua: -------------------------------------------------------------------------------- 1 | local run_migrations 2 | run_migrations = function() 3 | local m = require("lapis.db.migrations") 4 | return m.run_migrations(require("lapis.bayes.migrations"), "lapis_bayes") 5 | end 6 | return { 7 | run_migrations = run_migrations 8 | } 9 | -------------------------------------------------------------------------------- /config.moon: -------------------------------------------------------------------------------- 1 | config = require "lapis.config" 2 | 3 | config {"development", "test"}, -> 4 | logging false -- hide query logs 5 | 6 | postgres { 7 | database: "lapis_bayes" 8 | 9 | host: os.getenv "PGHOST" 10 | user: os.getenv "PGUSER" 11 | password: os.getenv "PGPASSWORD" 12 | } 13 | 14 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | migrate: build 3 | make test_db > /dev/null 4 | lapis migrate 5 | 6 | local: build 7 | luarocks --lua-version=5.1 make --local *-dev-1.rockspec 8 | 9 | build: 10 | -rm $$(find lapis -type f | grep '\.lua$$') 11 | moonc lapis 12 | moonc *.moon 13 | 14 | test_db: 15 | -dropdb -U postgres lapis_bayes 16 | createdb -U postgres lapis_bayes 17 | 18 | lint:: 19 | moonc lint_config.moon 20 | git ls-files | grep '\.moon$$' | grep -v config.moon | xargs -n 100 moonc -l 21 | 22 | tags:: 23 | moon-tags --lapis $$(git ls-files lapis/) > $@ 24 | -------------------------------------------------------------------------------- /lapis/bayes/tokenizers/base.moon: -------------------------------------------------------------------------------- 1 | -- Provides a common interface contract for tokenizers. Subclasses should 2 | -- extend this class and override the `tokenize_text` method with their 3 | -- implementation. 4 | -- 5 | -- Required override: 6 | -- * `tokenize_text(text)` - accept raw text input and return an array-like table 7 | -- of token strings suitable for classification. 8 | 9 | class BaseTokenizer 10 | tokenize_text: (...) => 11 | class_name = @__class and @__class.__name or "TokenizerBase" 12 | error "#{class_name} must implement tokenize_text(...)", 2 13 | -------------------------------------------------------------------------------- /lapis/bayes/tokenizers/base.lua: -------------------------------------------------------------------------------- 1 | local BaseTokenizer 2 | do 3 | local _class_0 4 | local _base_0 = { 5 | tokenize_text = function(self, ...) 6 | local class_name = self.__class and self.__class.__name or "TokenizerBase" 7 | return error(tostring(class_name) .. " must implement tokenize_text(...)", 2) 8 | end 9 | } 10 | _base_0.__index = _base_0 11 | _class_0 = setmetatable({ 12 | __init = function() end, 13 | __base = _base_0, 14 | __name = "BaseTokenizer" 15 | }, { 16 | __index = _base_0, 17 | __call = function(cls, ...) 18 | local _self_0 = setmetatable({}, _base_0) 19 | cls.__init(_self_0, ...) 
20 | return _self_0 21 | end 22 | }) 23 | _base_0.__class = _class_0 24 | BaseTokenizer = _class_0 25 | return _class_0 26 | end 27 | -------------------------------------------------------------------------------- /lapis/bayes/model.moon: -------------------------------------------------------------------------------- 1 | 2 | prefix = "lapis_bayes_" 3 | 4 | import Model from require "lapis.db.model" 5 | 6 | db = require "lapis.db" 7 | 8 | -- all tuples should be same size 9 | encode_tuples = (tuples) -> 10 | buffer = { "VALUES" } 11 | 12 | {insert: i} = table 13 | n_tuples = #tuples 14 | for t_idx=1,n_tuples 15 | tuple = tuples[t_idx] 16 | i buffer, " (" 17 | k = #tuple 18 | for idx=1,k 19 | i buffer, db.escape_literal tuple[idx] 20 | unless idx == k 21 | i buffer, ", " 22 | 23 | if t_idx == n_tuples 24 | i buffer, ")" 25 | else 26 | i buffer, "), " 27 | 28 | table.concat buffer 29 | 30 | { 31 | Model: Model\scoped_model prefix, "lapis.bayes.models" 32 | prefix_table: (name) -> "#{prefix}#{name}" 33 | :encode_tuples 34 | } 35 | -------------------------------------------------------------------------------- /lapis/bayes/migrations.moon: -------------------------------------------------------------------------------- 1 | schema = require "lapis.db.schema" 2 | 3 | import add_column, create_index, drop_index, drop_column, create_table from schema 4 | 5 | { 6 | :serial, :boolean, :varchar, :integer, :text, :foreign_key, :double, :time, 7 | :numeric, :enum 8 | } = schema.types 9 | 10 | import prefix_table from require "lapis.bayes.model" 11 | 12 | { 13 | [1439610038]: => 14 | create_table prefix_table("categories"), { 15 | {"id", serial} 16 | {"name", text} 17 | 18 | {"total_count", integer} 19 | 20 | {"created_at", time} 21 | {"updated_at", time} 22 | 23 | "PRIMARY KEY (id)" 24 | } 25 | 26 | create_table prefix_table("word_classifications"), { 27 | {"category_id", foreign_key} 28 | {"word", text} 29 | {"count", integer} 30 | 31 | "PRIMARY KEY (category_id, word)" 32 | } 33 | 34 | [1474434614]: => 35 | create_index prefix_table("categories"), "name" 36 | } 37 | 38 | -------------------------------------------------------------------------------- /lapis/bayes/model.lua: -------------------------------------------------------------------------------- 1 | local prefix = "lapis_bayes_" 2 | local Model 3 | Model = require("lapis.db.model").Model 4 | local db = require("lapis.db") 5 | local encode_tuples 6 | encode_tuples = function(tuples) 7 | local buffer = { 8 | "VALUES" 9 | } 10 | local i 11 | i = table.insert 12 | local n_tuples = #tuples 13 | for t_idx = 1, n_tuples do 14 | local tuple = tuples[t_idx] 15 | i(buffer, " (") 16 | local k = #tuple 17 | for idx = 1, k do 18 | i(buffer, db.escape_literal(tuple[idx])) 19 | if not (idx == k) then 20 | i(buffer, ", ") 21 | end 22 | end 23 | if t_idx == n_tuples then 24 | i(buffer, ")") 25 | else 26 | i(buffer, "), ") 27 | end 28 | end 29 | return table.concat(buffer) 30 | end 31 | return { 32 | Model = Model:scoped_model(prefix, "lapis.bayes.models"), 33 | prefix_table = function(name) 34 | return tostring(prefix) .. 
tostring(name) 35 | end, 36 | encode_tuples = encode_tuples 37 | } 38 | -------------------------------------------------------------------------------- /lapis/bayes.lua: -------------------------------------------------------------------------------- 1 | local VERSION = "1.4.0" 2 | local text_probabilities 3 | text_probabilities = function(categories, text, opts) 4 | if opts == nil then 5 | opts = { } 6 | end 7 | local DefaultClassifier = require("lapis.bayes.classifiers.default") 8 | return DefaultClassifier(opts):text_probabilities(categories, text, opts) 9 | end 10 | local classify_text 11 | classify_text = function(categories, text, opts) 12 | if opts == nil then 13 | opts = { } 14 | end 15 | local DefaultClassifier = require("lapis.bayes.classifiers.default") 16 | return DefaultClassifier(opts):classify_text(categories, text, opts) 17 | end 18 | local train_text 19 | train_text = function(category, text, opts, ...) 20 | if opts == nil then 21 | opts = { } 22 | end 23 | local DefaultClassifier = require("lapis.bayes.classifiers.default") 24 | return DefaultClassifier(opts):train_text(category, text, ...) 25 | end 26 | return { 27 | classify_text = classify_text, 28 | train_text = train_text, 29 | text_probabilities = text_probabilities, 30 | VERSION = VERSION 31 | } 32 | -------------------------------------------------------------------------------- /lapis/bayes.moon: -------------------------------------------------------------------------------- 1 | VERSION = "1.4.0" 2 | 3 | -- calculate the probabilities of text using default classifier 4 | -- categories: array of category names 5 | -- text: the text to calculate probabilities for 6 | text_probabilities = (categories, text, opts={}) -> 7 | DefaultClassifier = require "lapis.bayes.classifiers.default" 8 | DefaultClassifier(opts)\text_probabilities categories, text, opts 9 | 10 | -- return the best matching category for the given text using the default 11 | -- classifier 12 | classify_text = (categories, text, opts={}) -> 13 | DefaultClassifier = require "lapis.bayes.classifiers.default" 14 | DefaultClassifier(opts)\classify_text categories, text, opts 15 | 16 | -- train text using default classifier's tokenizer 17 | -- category: string name of category 18 | -- text: the text (or array of words) to train 19 | -- opts: options to pass to the classifier 20 | train_text = (category, text, opts={}, ...) -> 21 | DefaultClassifier = require "lapis.bayes.classifiers.default" 22 | DefaultClassifier(opts)\train_text category, text, ... 
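-- A minimal usage sketch of the module exported below (illustrative only; it
-- assumes a configured PostgreSQL connection and that the lapis_bayes
-- migrations have already been run — the category names are made up):
--
--   bayes = require "lapis.bayes"
--   bayes.train_text "spam", "cheap pills casino bonus free money"
--   bayes.train_text "ham", "meeting notes for tomorrow's project review"
--   bayes.classify_text {"spam", "ham"}, "free casino money" --> best matching category name, score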
23 | 24 | { :classify_text, :train_text, :text_probabilities, :VERSION } 25 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: "test" 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | test: 7 | runs-on: ubuntu-latest 8 | 9 | env: 10 | PGUSER: postgres 11 | PGPASSWORD: postgres 12 | PGHOST: 127.0.0.1 13 | 14 | services: 15 | postgres: 16 | image: postgres:12 17 | env: 18 | POSTGRES_PASSWORD: postgres 19 | options: --health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 5 20 | ports: 21 | - 5432:5432 22 | 23 | steps: 24 | - uses: actions/checkout@master 25 | - uses: leafo/gh-actions-lua@master 26 | with: 27 | luaVersion: "luajit-openresty" 28 | 29 | - uses: leafo/gh-actions-luarocks@master 30 | 31 | - name: build 32 | run: | 33 | luarocks install busted 34 | luarocks install moonscript 35 | luarocks make 36 | luarocks install web_sanitize 37 | luarocks install tableshape 38 | 39 | - name: setup db 40 | run: | 41 | psql -c 'create database lapis_bayes' 42 | moonc *.moon 43 | lapis migrate 44 | 45 | - name: test 46 | run: | 47 | busted -o utfTerminal 48 | -------------------------------------------------------------------------------- /lapis/bayes/classifiers/test.moon: -------------------------------------------------------------------------------- 1 | average = (nums) -> 2 | sum = 0 3 | for n in *nums 4 | sum += n 5 | 6 | return sum / #nums 7 | 8 | weighted_avg = (tuples) -> 9 | num_tuples = #tuples 10 | sum = 0 11 | sum_weight = 0 12 | 13 | for {num, weight} in *tuples 14 | sum += num 15 | sum_weight += weight 16 | 17 | avg_weight = sum_weight/num_tuples 18 | 19 | avg = 0 20 | for {num, weight} in *tuples 21 | avg += (num/num_tuples) * (weight/avg_weight) 22 | 23 | avg 24 | 25 | class TestClassifier extends require "lapis.bayes.classifiers.base" 26 | word_probabilities: (categories, available_words) => 27 | total_counts = {} 28 | for c in *categories 29 | continue unless c.word_counts 30 | for word, count in pairs c.word_counts 31 | total_counts[word] or= 0 32 | total_counts[word] += count 33 | 34 | probs = for c in *categories 35 | tuples = for word in *available_words 36 | total_count = total_counts[word] 37 | cat_count = c.word_counts and c.word_counts[word] or 0 38 | {cat_count/total_count, total_count} 39 | 40 | {c.name, weighted_avg tuples} 41 | 42 | table.sort probs, (a,b) -> 43 | a[2] > b[2] 44 | 45 | probs 46 | 47 | 48 | -------------------------------------------------------------------------------- /lapis/bayes/classifiers/fisher.moon: -------------------------------------------------------------------------------- 1 | -- http://www.linuxjournal.com/article/6467 2 | 3 | inv_chi2 = (chi, df) -> 4 | assert df % 2 == 0, "df must be even" 5 | m = chi / 2.0 6 | sum = math.exp -m 7 | term = sum 8 | for i=1, math.floor df/2 9 | term *= m / i 10 | sum += term 11 | 12 | math.min sum, 1 13 | 14 | class FisherClassifier extends require "lapis.bayes.classifiers.base" 15 | @default_options: { 16 | robs: 1 17 | robx: 0.5 18 | min_dev: 0.3 19 | } 20 | 21 | word_probabilities: (categories, available_words) => 22 | return nil, "only two categories supported at once" unless #categories == 2 23 | 24 | {a, b} = categories 25 | 26 | s = @opts.robs 27 | x = @opts.robx 28 | min_dev = @opts.min_dev 29 | 30 | mul_a = 0 31 | mul_b = 0 32 | 33 | kept_tokens = 0 34 | 35 | for word in *available_words 36 | a_count = a.word_counts 
and a.word_counts[word] or 0 37 | b_count = b.word_counts and b.word_counts[word] or 0 38 | 39 | p = a_count / (a_count + b_count) 40 | n = a_count + b_count 41 | val = ((s * x) + (n * p)) / (s + n) 42 | 43 | if not min_dev or math.abs(val - 0.5) > min_dev 44 | mul_a += math.log val 45 | mul_b += math.log 1 - val 46 | kept_tokens += 1 47 | 48 | if kept_tokens == 0 49 | return nil, "not enough strong signals to decide" 50 | 51 | pa = inv_chi2 -2 * mul_a, 2 * kept_tokens 52 | pb = inv_chi2 -2 * mul_b, 2 * kept_tokens 53 | 54 | p = (1 + pa - pb) / 2 55 | 56 | tuples = { 57 | {a.name, p} 58 | {b.name, 1 - p} 59 | } 60 | 61 | table.sort tuples, (a,b) -> a[2] > b[2] 62 | 63 | tuples 64 | 65 | 66 | -------------------------------------------------------------------------------- /lapis/bayes/migrations.lua: -------------------------------------------------------------------------------- 1 | local schema = require("lapis.db.schema") 2 | local add_column, create_index, drop_index, drop_column, create_table 3 | add_column, create_index, drop_index, drop_column, create_table = schema.add_column, schema.create_index, schema.drop_index, schema.drop_column, schema.create_table 4 | local serial, boolean, varchar, integer, text, foreign_key, double, time, numeric, enum 5 | do 6 | local _obj_0 = schema.types 7 | serial, boolean, varchar, integer, text, foreign_key, double, time, numeric, enum = _obj_0.serial, _obj_0.boolean, _obj_0.varchar, _obj_0.integer, _obj_0.text, _obj_0.foreign_key, _obj_0.double, _obj_0.time, _obj_0.numeric, _obj_0.enum 8 | end 9 | local prefix_table 10 | prefix_table = require("lapis.bayes.model").prefix_table 11 | return { 12 | [1439610038] = function(self) 13 | create_table(prefix_table("categories"), { 14 | { 15 | "id", 16 | serial 17 | }, 18 | { 19 | "name", 20 | text 21 | }, 22 | { 23 | "total_count", 24 | integer 25 | }, 26 | { 27 | "created_at", 28 | time 29 | }, 30 | { 31 | "updated_at", 32 | time 33 | }, 34 | "PRIMARY KEY (id)" 35 | }) 36 | return create_table(prefix_table("word_classifications"), { 37 | { 38 | "category_id", 39 | foreign_key 40 | }, 41 | { 42 | "word", 43 | text 44 | }, 45 | { 46 | "count", 47 | integer 48 | }, 49 | "PRIMARY KEY (category_id, word)" 50 | }) 51 | end, 52 | [1474434614] = function(self) 53 | return create_index(prefix_table("categories"), "name") 54 | end 55 | } 56 | -------------------------------------------------------------------------------- /spec/url_tokenizer_spec.moon: -------------------------------------------------------------------------------- 1 | 2 | UrlDomainsTokenizer = require "lapis.bayes.tokenizers.url_domains" 3 | 4 | describe "lapis.bayes.tokenizer.url_tokenizer", -> 5 | it "builds grammar", -> 6 | tokenizer = UrlDomainsTokenizer! 7 | p = tokenizer\build_grammar! 8 | p\match "https" 9 | 10 | describe "with grammar", -> 11 | local grammar 12 | 13 | before_each -> 14 | grammar = UrlDomainsTokenizer!\build_grammar! 
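-- (the grammar built above — see lapis/bayes/tokenizers/url_domains.moon — captures
-- raw http(s):// URLs, href/src attribute values, and bare www.-prefixed domains,
-- which is what the assertions below exercise)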
15 | 16 | it "detects some urls", -> 17 | assert.same { 18 | "http://leafo.net& " 19 | "http://google.com/p8sslord" 20 | "www.leafodad.com" 21 | }, grammar\match "href='http://leafo.net& ' http://google.com/p8sslord please help the good one www.leafodad.com yeah what the freak" 22 | 23 | describe "with tonenizer", -> 24 | local tokenize_text 25 | before_each -> 26 | tokenize_text = UrlDomainsTokenizer!\tokenize_text 27 | 28 | it "extracts tokens from string", -> 29 | assert.same { 30 | "leafo.net&" 31 | "google.com" 32 | "leafodad.com" 33 | }, tokenize_text "href='http://leafo.net& ' http://google.com/p8sslord/da?what please help the good one www.leafodad.com yeah what the freak" 34 | 35 | it "gets domain from iframe", -> 36 | assert.same { 37 | 'youtube.com' 38 | }, tokenize_text [[]] 39 | 40 | it "ignore domains", -> 41 | tokens = UrlDomainsTokenizer({ 42 | ignore_domains: { 43 | "leafo.net": true 44 | "*.google.com": true 45 | } 46 | })\tokenize_text [[ 47 | http://leafo.net 48 | http://good.leafo.net 49 | http://google.com 50 | http://butt.google.com 51 | http://plus.good.google.com 52 | ]] 53 | 54 | assert.same {"good.leafo.net", "google.com"}, tokens 55 | -------------------------------------------------------------------------------- /lapis-bayes-dev-1.rockspec: -------------------------------------------------------------------------------- 1 | package = "lapis-bayes" 2 | version = "dev-1" 3 | 4 | source = { 5 | url = "git+https://github.com/leafo/lapis-bayes.git" 6 | } 7 | 8 | description = { 9 | summary = "Naive Bayes classifier for use in Lua", 10 | license = "MIT", 11 | maintainer = "Leaf Corcoran ", 12 | } 13 | 14 | dependencies = { 15 | "lua == 5.1", 16 | "lapis >= 1.16.0" 17 | } 18 | 19 | build = { 20 | type = "builtin", 21 | modules = { 22 | ["lapis.bayes"] = "lapis/bayes.lua", 23 | ["lapis.bayes.classifiers.base"] = "lapis/bayes/classifiers/base.lua", 24 | ["lapis.bayes.classifiers.bayes"] = "lapis/bayes/classifiers/bayes.lua", 25 | ["lapis.bayes.classifiers.bayes_multi"] = "lapis/bayes/classifiers/bayes_multi.lua", 26 | ["lapis.bayes.classifiers.default"] = "lapis/bayes/classifiers/default.lua", 27 | ["lapis.bayes.classifiers.fisher"] = "lapis/bayes/classifiers/fisher.lua", 28 | ["lapis.bayes.classifiers.test"] = "lapis/bayes/classifiers/test.lua", 29 | ["lapis.bayes.migrations"] = "lapis/bayes/migrations.lua", 30 | ["lapis.bayes.model"] = "lapis/bayes/model.lua", 31 | ["lapis.bayes.models"] = "lapis/bayes/models.lua", 32 | ["lapis.bayes.models.categories"] = "lapis/bayes/models/categories.lua", 33 | ["lapis.bayes.models.word_classifications"] = "lapis/bayes/models/word_classifications.lua", 34 | ["lapis.bayes.schema"] = "lapis/bayes/schema.lua", 35 | ["lapis.bayes.text.punycode"] = "lapis/bayes/text/punycode.lua", 36 | ["lapis.bayes.text.stem"] = "lapis/bayes/text/stem.lua", 37 | ["lapis.bayes.text.unaccent"] = "lapis/bayes/text/unaccent.lua", 38 | ["lapis.bayes.text.utf8"] = "lapis/bayes/text/utf8.lua", 39 | ["lapis.bayes.tokenizers.base"] = "lapis/bayes/tokenizers/base.lua", 40 | ["lapis.bayes.tokenizers.ngram"] = "lapis/bayes/tokenizers/ngram.lua", 41 | ["lapis.bayes.tokenizers.postgres_text"] = "lapis/bayes/tokenizers/postgres_text.lua", 42 | ["lapis.bayes.tokenizers.spam"] = "lapis/bayes/tokenizers/spam.lua", 43 | ["lapis.bayes.tokenizers.url_domains"] = "lapis/bayes/tokenizers/url_domains.lua", 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /spec/utf8_spec.moon: 
-------------------------------------------------------------------------------- 1 | scripts = require "lapis.bayes.text.utf8" 2 | import C, P from require "lpeg" 3 | 4 | capture = (pattern, text) -> 5 | (C(pattern) * -P(1))\match text 6 | 7 | matches = (pattern, text) -> 8 | not not ((pattern * -P(1))\match text) 9 | 10 | describe "lapis.bayes.text.utf8", -> 11 | describe "han_character", -> 12 | it "matches a basic Han ideograph", -> 13 | assert.same "漢", capture scripts.han_character, "漢" 14 | 15 | it "matches a supplementary plane character", -> 16 | assert.same "𠀋", capture scripts.han_character, "𠀋" 17 | 18 | it "does not match kana characters", -> 19 | assert.falsy matches scripts.han_character, "あ" 20 | assert.falsy matches scripts.han_character, "ア" 21 | 22 | describe "kana_character", -> 23 | it "matches hiragana and katakana", -> 24 | assert.same "あ", capture scripts.kana_character, "あ" 25 | assert.same "ア", capture scripts.kana_character, "ア" 26 | 27 | it "matches halfwidth katakana", -> 28 | assert.same "ア", capture scripts.kana_character, "ア" 29 | 30 | it "does not match Han or Latin letters", -> 31 | assert.falsy matches scripts.kana_character, "漢" 32 | assert.falsy matches scripts.kana_character, "A" 33 | 34 | describe "hangul_character", -> 35 | it "matches modern syllables and jamo", -> 36 | assert.same "한", capture scripts.hangul_character, "한" 37 | assert.same "ᄀ", capture scripts.hangul_character, "ᄀ" 38 | 39 | it "matches halfwidth Hangul letters", -> 40 | assert.same "ᄀ", capture scripts.hangul_character, "ᄀ" 41 | 42 | it "does not match kana", -> 43 | assert.falsy matches scripts.hangul_character, "ア" 44 | 45 | describe "cjk_character", -> 46 | it "matches characters across Han, Kana, and Hangul", -> 47 | assert.same "漢", capture scripts.cjk_character, "漢" 48 | assert.same "あ", capture scripts.cjk_character, "あ" 49 | assert.same "한", capture scripts.cjk_character, "한" 50 | 51 | it "rejects non-CJK characters", -> 52 | assert.falsy matches scripts.cjk_character, "A" 53 | assert.falsy matches scripts.cjk_character, "1" 54 | -------------------------------------------------------------------------------- /lapis/bayes/text/utf8.moon: -------------------------------------------------------------------------------- 1 | import P, R from require "lpeg" 2 | 3 | cont = R "\128\191" 4 | 5 | -- Han ideographs (basic, extensions, compatibility, supplementary planes) 6 | han_ext_a = P"\227" * R("\144\191") * cont + P"\228" * R("\128\182") * cont 7 | han_unified = P"\228" * R("\184\191") * cont + R("\229\232") * cont * cont + P"\233" * R("\128\191") * cont 8 | han_compat = P"\239" * R("\164\171") * cont 9 | han_supplement = P"\240" * R("\160\178") * cont * cont 10 | han_character = han_ext_a + han_unified + han_compat + han_supplement 11 | 12 | -- Japanese Hiragana 13 | hiragana_block = P"\227\129" * cont + P"\227\130" * R("\128\159") 14 | 15 | -- Kana supplement & historic kana (hentaigana, archaic forms) 16 | kana_supplement = P"\240\155" * R("\128\133") * cont 17 | 18 | hiragana_character = hiragana_block + kana_supplement 19 | 20 | -- Japanese Katakana (standard, extensions, halfwidth) 21 | katakana_main = P"\227\130" * R("\160\191") + P"\227\131" * cont 22 | katakana_phonetic_ext = P"\227\135" * R("\176\191") 23 | katakana_halfwidth = P"\239\189" * R("\166\191") + P"\239\190" * R("\128\159") 24 | katakana_character = katakana_main + katakana_phonetic_ext + katakana_halfwidth + kana_supplement 25 | 26 | kana_character = hiragana_block + katakana_main + katakana_phonetic_ext + 
katakana_halfwidth + kana_supplement 27 | 28 | -- Korean Hangul (jamo, syllables, compatibility/halfwidth) 29 | hangul_jamo = P"\225" * R("\132\135") * cont 30 | hangul_jamo_ext_a = P"\234\165" * R("\160\191") 31 | hangul_compat_jamo = P"\227\132" * R("\176\191") + P"\227\133" * cont + P"\227\134" * cont + P"\227\135" * R("\128\143") 32 | hangul_syllables = P"\234" * R("\176\191") * cont + R("\235\236") * cont * cont + P"\237" * (R("\128\157") * cont + P"\158" * R("\128\163")) 33 | hangul_jamo_ext_b = P"\237\158" * R("\176\191") + P"\237\159" * cont 34 | hangul_halfwidth = P"\239\190" * R("\160\191") + P"\239\191" * R("\128\156") 35 | hangul_character = hangul_jamo + hangul_jamo_ext_a + hangul_compat_jamo + hangul_syllables + hangul_jamo_ext_b + hangul_halfwidth 36 | 37 | cjk_character = han_character + kana_character + hangul_character 38 | 39 | { 40 | :cont 41 | :han_character 42 | :hiragana_character 43 | :katakana_character 44 | :kana_character 45 | :hangul_character 46 | :cjk_character 47 | } 48 | -------------------------------------------------------------------------------- /lapis/bayes/models/word_classifications.moon: -------------------------------------------------------------------------------- 1 | 2 | db = require "lapis.db" 3 | import Model from require "lapis.bayes.model" 4 | 5 | -- Generated schema dump: (do not edit) 6 | -- 7 | -- CREATE TABLE lapis_bayes_word_classifications ( 8 | -- category_id integer NOT NULL, 9 | -- word text NOT NULL, 10 | -- count integer DEFAULT 0 NOT NULL 11 | -- ); 12 | -- ALTER TABLE ONLY lapis_bayes_word_classifications 13 | -- ADD CONSTRAINT lapis_bayes_word_classifications_pkey PRIMARY KEY (category_id, word); 14 | -- 15 | class WordClassifications extends Model 16 | @primary_key: {"category_id", "word"} 17 | 18 | @relations: { 19 | {"category", belongs_to: "Categories"} 20 | } 21 | 22 | @find_or_create: (opts={}) => 23 | @find(opts) or @create(opts) 24 | 25 | @purge_word: (word, categories) => 26 | import Categories from require "lapis.bayes.models" 27 | 28 | categories = { categories } unless type(categories) == "table" 29 | original_count = #categories 30 | assert original_count > 0, "missing categories" 31 | categories = Categories\find_all categories, key: "name" 32 | assert #categories == original_count, "failed to find all categories specified" 33 | 34 | wcs = @select "where word = ? and category_id in ?", 35 | word, db.list [c.id for c in *categories] 36 | 37 | count = 0 38 | for wc in *wcs 39 | if wc\delete! 
40 | count += 1 41 | 42 | count > 0, count 43 | 44 | delete: => 45 | deleted, res = super db.raw "*" 46 | 47 | if deleted 48 | removed_row = @@load (unpack res) 49 | 50 | import Categories from require "lapis.bayes.models" 51 | db.update Categories\table_name!, { 52 | total_count: db.raw db.interpolate_query " total_count - ?", removed_row.count 53 | }, { 54 | id: @category_id 55 | } 56 | 57 | true 58 | 59 | 60 | -- note: this should not be called directly, use the associated method on the category model 61 | _increment: (amount) => 62 | amount = assert tonumber(amount), "expecting number" 63 | @update { 64 | count: db.raw "count + #{amount}" 65 | } 66 | 67 | if @count == 0 68 | db.delete @@table_name!, { 69 | category_id: @category_id 70 | word: @word 71 | count: 0 72 | } 73 | 74 | 75 | -------------------------------------------------------------------------------- /lapis/bayes/text/utf8.lua: -------------------------------------------------------------------------------- 1 | local P, R 2 | do 3 | local _obj_0 = require("lpeg") 4 | P, R = _obj_0.P, _obj_0.R 5 | end 6 | local cont = R("\128\191") 7 | local han_ext_a = P("\227") * R("\144\191") * cont + P("\228") * R("\128\182") * cont 8 | local han_unified = P("\228") * R("\184\191") * cont + R("\229\232") * cont * cont + P("\233") * R("\128\191") * cont 9 | local han_compat = P("\239") * R("\164\171") * cont 10 | local han_supplement = P("\240") * R("\160\178") * cont * cont 11 | local han_character = han_ext_a + han_unified + han_compat + han_supplement 12 | local hiragana_block = P("\227\129") * cont + P("\227\130") * R("\128\159") 13 | local kana_supplement = P("\240\155") * R("\128\133") * cont 14 | local hiragana_character = hiragana_block + kana_supplement 15 | local katakana_main = P("\227\130") * R("\160\191") + P("\227\131") * cont 16 | local katakana_phonetic_ext = P("\227\135") * R("\176\191") 17 | local katakana_halfwidth = P("\239\189") * R("\166\191") + P("\239\190") * R("\128\159") 18 | local katakana_character = katakana_main + katakana_phonetic_ext + katakana_halfwidth + kana_supplement 19 | local kana_character = hiragana_block + katakana_main + katakana_phonetic_ext + katakana_halfwidth + kana_supplement 20 | local hangul_jamo = P("\225") * R("\132\135") * cont 21 | local hangul_jamo_ext_a = P("\234\165") * R("\160\191") 22 | local hangul_compat_jamo = P("\227\132") * R("\176\191") + P("\227\133") * cont + P("\227\134") * cont + P("\227\135") * R("\128\143") 23 | local hangul_syllables = P("\234") * R("\176\191") * cont + R("\235\236") * cont * cont + P("\237") * (R("\128\157") * cont + P("\158") * R("\128\163")) 24 | local hangul_jamo_ext_b = P("\237\158") * R("\176\191") + P("\237\159") * cont 25 | local hangul_halfwidth = P("\239\190") * R("\160\191") + P("\239\191") * R("\128\156") 26 | local hangul_character = hangul_jamo + hangul_jamo_ext_a + hangul_compat_jamo + hangul_syllables + hangul_jamo_ext_b + hangul_halfwidth 27 | local cjk_character = han_character + kana_character + hangul_character 28 | return { 29 | cont = cont, 30 | han_character = han_character, 31 | hiragana_character = hiragana_character, 32 | katakana_character = katakana_character, 33 | kana_character = kana_character, 34 | hangul_character = hangul_character, 35 | cjk_character = cjk_character 36 | } 37 | -------------------------------------------------------------------------------- /lapis/bayes/tokenizers/ngram.moon: -------------------------------------------------------------------------------- 1 | class NgramTokenizer extends require 
"lapis.bayes.tokenizers.base" 2 | new: (@opts = {}) => 3 | 4 | build_grammar: => 5 | import C, Ct from require "lpeg" 6 | utf8 = require "lapis.util.utf8" 7 | 8 | whitespace = utf8.whitespace 9 | printable = utf8.printable_character 10 | word_chars = printable - whitespace 11 | word = C word_chars^1 12 | 13 | Ct (word + whitespace^1)^0 14 | 15 | normalize_word: (word) => 16 | return unless word and word != "" 17 | 18 | normalized = tostring(word)\lower! 19 | normalized = normalized\gsub("[%p]", "") 20 | normalized = normalized\gsub("%s+", "") 21 | 22 | return unless normalized != "" 23 | normalized 24 | 25 | ngram_size: => 26 | n = tonumber(@opts.n) or 2 27 | n = math.floor n 28 | n = 1 if n < 1 29 | n 30 | 31 | word_ngrams: (word, n) => 32 | -- Split word into UTF-8 characters using LPEG 33 | import C, Ct from require "lpeg" 34 | utf8 = require "lapis.util.utf8" 35 | printable = utf8.printable_character 36 | 37 | char_pattern = Ct (C printable)^0 38 | chars = char_pattern\match word 39 | 40 | return { word } unless chars 41 | 42 | len = #chars 43 | return { word } if len == 0 44 | return { word } if len < n 45 | 46 | out = {} 47 | for i = 1, len - n + 1 48 | ngram = table.concat chars, "", i, i + n - 1 49 | table.insert out, ngram 50 | 51 | out 52 | 53 | tokenize_text: (text) => 54 | return {} unless text and text != "" 55 | 56 | if pre_filter = @opts.filter_text 57 | text = pre_filter text 58 | return {} unless text and text != "" 59 | 60 | @grammar or= @build_grammar! 61 | words = @grammar\match text 62 | return {} unless words 63 | 64 | n = @ngram_size! 65 | ignore_numbers = @opts.ignore_numbers 66 | ignore_numbers = true if ignore_numbers == nil 67 | 68 | tokens = {} 69 | for raw_word in *words 70 | cleaned = @normalize_word raw_word 71 | continue unless cleaned 72 | 73 | if ignore_numbers and cleaned\match "^%d+$" 74 | continue 75 | 76 | for token in *@word_ngrams cleaned, n 77 | table.insert tokens, token 78 | 79 | if @opts.filter_tokens 80 | tokens = @opts.filter_tokens tokens, @opts 81 | 82 | tokens 83 | -------------------------------------------------------------------------------- /examples/detect_language.lua: -------------------------------------------------------------------------------- 1 | local NgramTokenizer = require("lapis.bayes.tokenizers.ngram") 2 | local BayesMultiClassifier = require("lapis.bayes.classifiers.bayes_multi") 3 | 4 | -- generates character ngrames of length 2 5 | local tokenizer = NgramTokenizer({n = 2}) 6 | 7 | -- A BayesMultiClassifier supports classifying to any number of categories 8 | local classifier = BayesMultiClassifier({tokenizer = tokenizer}) 9 | 10 | local training_data = { 11 | {"english", "The quick brown fox jumps over the lazy dog"}, 12 | {"english", "Hello world this is a test of the system"}, 13 | {"english", "Programming and software development with modern technology"}, 14 | 15 | {"spanish", "El rápido zorro marrón salta sobre el perro perezoso"}, 16 | {"spanish", "Hola mundo esta es una prueba del sistema"}, 17 | {"spanish", "Los lenguajes de programación son herramientas importantes"}, 18 | 19 | {"french", "Le rapide renard brun saute pardessus le chien paresseux"}, 20 | {"french", "Bonjour le monde ceci est un test du système"}, 21 | {"french", "Les langages de programmation sont des outils importants"}, 22 | 23 | {"german", "Der schnelle braune Fuchs springt über den faulen Hund"}, 24 | {"german", "Hallo Welt dies ist ein Test des Systems"}, 25 | {"german", "Programmiersprachen sind wichtige Werkzeuge für die Entwicklung"}, 26 | 
27 | {"chinese", "敏捷的棕色狐狸跳过懒狗"}, 28 | {"chinese", "你好世界这是一个系统的测试"}, 29 | {"chinese", "编程语言是表达算法的重要工具"}, 30 | } 31 | 32 | -- Train the classifier 33 | print("Training classifier...") 34 | for _, entry in ipairs(training_data) do 35 | local language, text = entry[1], entry[2] 36 | classifier:train_text(language, text) 37 | end 38 | print("Training complete.\n") 39 | 40 | -- Classify new text 41 | local test_cases = { 42 | "Welcome to our website", 43 | "Bienvenido a nuestro sitio", 44 | "Bienvenue sur notre site", 45 | "Willkommen auf unserer Website", 46 | "欢迎来到我们的网站", 47 | } 48 | 49 | print("Classifying test sentences:\n") 50 | for _, test in ipairs(test_cases) do 51 | local text = test 52 | 53 | -- Get probability distribution across all languages 54 | local probs = classifier:text_probabilities({ 55 | "english", 56 | "spanish", 57 | "french", 58 | "german", 59 | "chinese" 60 | }, text) 61 | 62 | -- The result is sorted by probability, first entry is the detected language 63 | local detected_language = probs[1][1] 64 | local confidence = probs[1][2] 65 | 66 | print(string.format('Text: "%s"', text)) 67 | print(string.format("Detected: %s (%.1f%% confidence)\n", detected_language, confidence * 100)) 68 | end 69 | -------------------------------------------------------------------------------- /lapis/bayes/tokenizers/url_domains.moon: -------------------------------------------------------------------------------- 1 | import trim from require "lapis.util" 2 | 3 | class UrlDomainsTokenizer extends require "lapis.bayes.tokenizers.base" 4 | new: (@opts = {}) => 5 | 6 | ignore_domain: (domain) => 7 | return unless @opts and @opts.ignore_domains 8 | return true if @opts.ignore_domains[domain] 9 | 10 | while true 11 | sub = domain\gsub("^%**%.?[^%.]+", "*") 12 | return false if sub == domain 13 | return true if @opts.ignore_domains[sub] 14 | domain = sub 15 | 16 | -- strip urls to just domains 17 | filter_tokens: (urls) => 18 | return for url in *urls 19 | url = url\lower! 20 | url = trim url 21 | url = url\gsub "^%w+://", "" 22 | url = url\gsub "^www%.", "" 23 | url = url\gsub "/.*$", "" 24 | url = trim url 25 | 26 | url\gsub "<$", "" 27 | url\gsub "^>", "" 28 | 29 | continue if url == "" 30 | continue if url\match "^%w+:" -- mailto and co 31 | continue if url\match [=[[<>="' ]]=] 32 | continue unless url\match "%." 33 | 34 | continue if @ignore_domain url 35 | 36 | url 37 | 38 | build_grammar: => 39 | import P, S, R, C, Ct, Cs from require "lpeg" 40 | 41 | case_insensitive = (text) -> 42 | out = nil 43 | for char in text\gmatch "." 44 | p = S"#{char\lower!}#{char\upper!}" 45 | if out 46 | out *= p 47 | else 48 | out = p 49 | 50 | out 51 | 52 | -- this is far from comprehensive 53 | unescape_char = P"&gt;" / ">" + 54 | P"&lt;" / "<" + 55 | P"&amp;" / "&" + 56 | P"&nbsp;" / " " + 57 | P"&#39;" / "'" + 58 | P"&#47;" / "/" + 59 | P"&quot;" / '"' 60 | 61 | unescape_text = Cs (unescape_char + 1)^1 62 | 63 | some_space = S" \t\n" 64 | space = some_space^0 65 | alphanum = R "az", "AZ", "09" 66 | 67 | scheme = case_insensitive"http" * case_insensitive"s"^-1 * P"://" 68 | raw_url = C scheme * (P(1) - S" \t\n")^1 69 | 70 | word = (alphanum + S"._-")^1 71 | attr_value = C(word) + P'"' * C((1 - P'"')^0) * P'"' + P"'" * C((1 - P"'")^0) * P"'" 72 | 73 | href = (case_insensitive"href" + case_insensitive"src") * space * P"=" * space * attr_value / (v) -> unescape_text\match(v) or "" 74 | 75 | simple = C case_insensitive"www" * (P"."
* (1 - (S"./" + some_space))^1)^1 76 | 77 | Ct (raw_url + href + simple + 1)^0 78 | 79 | tokenize_text: (text) => 80 | @grammar or= @build_grammar! 81 | matches = @grammar\match text 82 | return nil, "failed to parse text" unless matches 83 | @filter_tokens matches 84 | -------------------------------------------------------------------------------- /lapis/bayes/tokenizers/postgres_text.moon: -------------------------------------------------------------------------------- 1 | db = require "lapis.db" 2 | 3 | -- postgres based tokenizer 4 | -- opts = { 5 | -- filter_text: function -- function to pre-filter text, returns new text 6 | -- strip_tags: bool -- remove html tags from input in default 7 | -- symbols_split_tokens: bool -- symbols split apart tokens 8 | -- min_token_length: number -- min length of token (default 2) 9 | -- max_token_length: number -- max length of token (default 12) 10 | -- strip_numbers: bool -- remove tokens that are a number (including decimal, default true) 11 | -- ignore_words: table -- table of words to ignore (keys are words, values should be truthy) 12 | -- filter_tokens: function -- custom function to filter tokens, receives tokens and opts 13 | -- legacy_tokenizer: bool -- use slower ts_debug tokenizer that keeps duplicates 14 | -- regconfig: string -- PostgreSQL text search configuration (default "english") 15 | -- } 16 | class PostgresTextTokenizer extends require "lapis.bayes.tokenizers.base" 17 | new: (@opts = {}) => 18 | 19 | filter_tokens: (tokens) => 20 | opts = @opts 21 | min_len = opts and opts.min_token_length or 2 22 | max_len = opts and opts.max_token_length or 12 23 | 24 | strip_numbers = opts and opts.strip_numbers 25 | strip_numbers = true if strip_numbers == nil 26 | 27 | return for t in *tokens 28 | t_len = #t 29 | continue if t_len > max_len 30 | continue if t_len < min_len 31 | 32 | if strip_numbers and t\match "^[%d%.%/%-]+$" 33 | continue 34 | 35 | continue if @opts and @opts.ignore_words and @opts.ignore_words[t] 36 | t 37 | 38 | slow_pg_tokenize: (text) => 39 | regconfig = @opts.regconfig or "english" 40 | -- this slower form will keep duplicate words 41 | db.query [[SELECT unnest(lexemes) AS word FROM ts_debug(?, ?)]], regconfig, text 42 | 43 | -- much faster (50x), but loses duplicates. 
Needs newer version of postgres 44 | pg_tokenize: (text) => 45 | regconfig = @opts.regconfig or "english" 46 | db.query [[SELECT unnest(tsvector_to_array(to_tsvector(?, ?))) AS word]], regconfig, text 47 | 48 | tokenize_text: (text) => 49 | if pre_filter = @opts.filter_text 50 | text = pre_filter text 51 | 52 | if @opts.strip_tags 53 | import extract_text from require "web_sanitize" 54 | text = extract_text text 55 | 56 | if @opts.symbols_split_tokens 57 | text = text\gsub "[%!%@%#%$%%%^%&%*%(%)%[%]%{%}%|%\\%/%`%~%-%_%<%>%,%.]", " " 58 | 59 | res = if @opts.legacy_tokenizer 60 | @slow_pg_tokenize text 61 | else 62 | @pg_tokenize text 63 | 64 | tokens = @filter_tokens [r.word for r in *res] 65 | 66 | if @opts.filter_tokens 67 | tokens = @opts.filter_tokens tokens, @opts 68 | 69 | tokens 70 | -------------------------------------------------------------------------------- /lapis/bayes/models/categories.moon: -------------------------------------------------------------------------------- 1 | 2 | db = require "lapis.db" 3 | import Model, encode_tuples from require "lapis.bayes.model" 4 | 5 | -- Generated schema dump: (do not edit) 6 | -- 7 | -- CREATE TABLE lapis_bayes_categories ( 8 | -- id integer NOT NULL, 9 | -- name text NOT NULL, 10 | -- total_count integer DEFAULT 0 NOT NULL, 11 | -- created_at timestamp without time zone NOT NULL, 12 | -- updated_at timestamp without time zone NOT NULL 13 | -- ); 14 | -- ALTER TABLE ONLY lapis_bayes_categories 15 | -- ADD CONSTRAINT lapis_bayes_categories_pkey PRIMARY KEY (id); 16 | -- 17 | class Categories extends Model 18 | @timestamp: true 19 | 20 | @relations: { 21 | {"word_classifications", has_many: "WordClassifications"} 22 | } 23 | 24 | @find_or_create: (name) => 25 | @find(:name) or @create(:name) 26 | 27 | delete: => 28 | if super! 29 | import WordClassifications from require "lapis.bayes.models" 30 | db.delete WordClassifications\table_name!, { 31 | category_id: @id 32 | } 33 | 34 | increment: (amount) => 35 | amount = assert tonumber(amount), "expecting number" 36 | @update { 37 | total_count: db.raw "total_count + #{amount}" 38 | } 39 | 40 | -- NOTE: this was removed since it was tied to a specific tokenizer 41 | increment_text: (text, opts={}) => 42 | error "This method has been removed, use increment_words instead" 43 | 44 | -- increment a single word by count 45 | increment_word: (word, count) => 46 | import WordClassifications from require "lapis.bayes.models" 47 | w = WordClassifications\find_or_create { 48 | category_id: @id 49 | :word 50 | } 51 | w\_increment count 52 | @increment count 53 | 54 | -- issue a single query to increment all WordClassifications for this 55 | -- category with the list of words 56 | -- counts: table in the format {word = count, ... word1, word2, ...} 57 | increment_words: (counts) => 58 | return nil, "missing counts" unless counts 59 | 60 | -- combine hash and array words into summed count 61 | merged_counts = {} 62 | for k,v in pairs counts 63 | word, count = if type(k) == "string" 64 | k, v 65 | else 66 | v, 1 67 | 68 | merged_counts[word] or= 0 69 | merged_counts[word] += count 70 | 71 | total_count = 0 72 | tuples = for word, count in pairs merged_counts 73 | total_count += count 74 | {@id, word, count} 75 | 76 | unless next tuples 77 | return total_count 78 | 79 | import WordClassifications from require "lapis.bayes.models" 80 | tbl = db.escape_identifier WordClassifications\table_name! 
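-- illustrative call: increment_words {hello: 2, "world"} bumps "hello" by 2 and
-- "world" by 1; encode_tuples (from lapis.bayes.model) renders the merged counts
-- as a single VALUES list, so the INSERT .. ON CONFLICT below upserts every row
-- in one query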
81 | 82 | db.query " 83 | INSERT INTO #{tbl} (category_id, word, count) #{encode_tuples tuples} 84 | ON CONFLICT (category_id, word) DO UPDATE SET count = #{tbl}.count + EXCLUDED.count 85 | " 86 | 87 | @increment total_count 88 | total_count 89 | 90 | -------------------------------------------------------------------------------- /lapis/bayes/classifiers/bayes_multi.moon: -------------------------------------------------------------------------------- 1 | -- Multiclass naive Bayes classifier with Laplace-style smoothing 2 | class BayesMultiClassifier extends require "lapis.bayes.classifiers.base" 3 | @default_options: { 4 | max_words: 40 5 | default_prob: 0.1 6 | } 7 | 8 | candidate_words: (categories, available_words, count) => 9 | return available_words unless count and count < #available_words 10 | 11 | tuples = for word in *available_words 12 | totals = 0 13 | counts = {} 14 | for category in *categories 15 | word_counts = category.word_counts 16 | c = word_counts and word_counts[word] or 0 17 | table.insert counts, c 18 | totals += c 19 | 20 | score = if totals == 0 21 | 0 22 | else 23 | mean = totals / #counts 24 | variance = 0 25 | for c in *counts 26 | variance += (c - mean) ^ 2 27 | variance / #counts 28 | 29 | score += math.random! / 1000 30 | 31 | { word, score } 32 | 33 | table.sort tuples, (a, b) -> a[2] > b[2] 34 | [t[1] for t in *tuples[,count]] 35 | 36 | word_probabilities: (categories, available_words) => 37 | return nil, "at least two categories required" unless #categories >= 2 38 | 39 | available_words = @candidate_words categories, available_words, @opts.max_words 40 | vocab_size = #available_words 41 | 42 | return nil, "no words to score" unless vocab_size > 0 43 | 44 | smoothing = if @opts.default_prob and @opts.default_prob > 0 45 | @opts.default_prob 46 | else 47 | 1e-6 48 | 49 | sum_counts = 0 50 | for category in *categories 51 | sum_counts += category.total_count or 0 52 | 53 | prior_smoothing = smoothing * #categories 54 | 55 | local max_log 56 | log_scores = for category in *categories 57 | cat_total = math.max (category.total_count or 0), 0 58 | prior = (cat_total + smoothing) / (sum_counts + prior_smoothing) 59 | log_score = math.log prior 60 | 61 | denominator = cat_total + (smoothing * vocab_size) 62 | denominator = smoothing * vocab_size if denominator <= 0 63 | 64 | for word in *available_words 65 | word_count = category.word_counts and category.word_counts[word] or 0 66 | log_score += math.log ((word_count + smoothing) / denominator) 67 | 68 | max_log = if max_log 69 | math.max max_log, log_score 70 | else 71 | log_score 72 | 73 | { category, log_score } 74 | 75 | weights = {} 76 | total_weight = 0 77 | for {category, log_score} in *log_scores 78 | weight = math.exp (log_score - max_log) 79 | total_weight += weight 80 | table.insert weights, { category.name, weight } 81 | 82 | return nil, "unable to normalise probabilities" unless total_weight > 0 83 | 84 | for tuple in *weights 85 | tuple[2] /= total_weight 86 | 87 | table.sort weights, (a, b) -> a[2] > b[2] 88 | weights 89 | -------------------------------------------------------------------------------- /lapis/bayes/classifiers/fisher.lua: -------------------------------------------------------------------------------- 1 | local inv_chi2 2 | inv_chi2 = function(chi, df) 3 | assert(df % 2 == 0, "df must be even") 4 | local m = chi / 2.0 5 | local sum = math.exp(-m) 6 | local term = sum 7 | for i = 1, math.floor(df / 2) do 8 | term = term * (m / i) 9 | sum = sum + term 10 | end 11 | return 
math.min(sum, 1) 12 | end 13 | local FisherClassifier 14 | do 15 | local _class_0 16 | local _parent_0 = require("lapis.bayes.classifiers.base") 17 | local _base_0 = { 18 | word_probabilities = function(self, categories, available_words) 19 | if not (#categories == 2) then 20 | return nil, "only two categories supported at once" 21 | end 22 | local a, b 23 | a, b = categories[1], categories[2] 24 | local s = self.opts.robs 25 | local x = self.opts.robx 26 | local min_dev = self.opts.min_dev 27 | local mul_a = 0 28 | local mul_b = 0 29 | local kept_tokens = 0 30 | for _index_0 = 1, #available_words do 31 | local word = available_words[_index_0] 32 | local a_count = a.word_counts and a.word_counts[word] or 0 33 | local b_count = b.word_counts and b.word_counts[word] or 0 34 | local p = a_count / (a_count + b_count) 35 | local n = a_count + b_count 36 | local val = ((s * x) + (n * p)) / (s + n) 37 | if not min_dev or math.abs(val - 0.5) > min_dev then 38 | mul_a = mul_a + math.log(val) 39 | mul_b = mul_b + math.log(1 - val) 40 | kept_tokens = kept_tokens + 1 41 | end 42 | end 43 | if kept_tokens == 0 then 44 | return nil, "not enough strong signals to decide" 45 | end 46 | local pa = inv_chi2(-2 * mul_a, 2 * kept_tokens) 47 | local pb = inv_chi2(-2 * mul_b, 2 * kept_tokens) 48 | local p = (1 + pa - pb) / 2 49 | local tuples = { 50 | { 51 | a.name, 52 | p 53 | }, 54 | { 55 | b.name, 56 | 1 - p 57 | } 58 | } 59 | table.sort(tuples, function(a, b) 60 | return a[2] > b[2] 61 | end) 62 | return tuples 63 | end 64 | } 65 | _base_0.__index = _base_0 66 | setmetatable(_base_0, _parent_0.__base) 67 | _class_0 = setmetatable({ 68 | __init = function(self, ...) 69 | return _class_0.__parent.__init(self, ...) 70 | end, 71 | __base = _base_0, 72 | __name = "FisherClassifier", 73 | __parent = _parent_0 74 | }, { 75 | __index = function(cls, name) 76 | local val = rawget(_base_0, name) 77 | if val == nil then 78 | local parent = rawget(cls, "__parent") 79 | if parent then 80 | return parent[name] 81 | end 82 | else 83 | return val 84 | end 85 | end, 86 | __call = function(cls, ...) 87 | local _self_0 = setmetatable({}, _base_0) 88 | cls.__init(_self_0, ...) 
89 | return _self_0 90 | end 91 | }) 92 | _base_0.__class = _class_0 93 | local self = _class_0 94 | self.default_options = { 95 | robs = 1, 96 | robx = 0.5, 97 | min_dev = 0.3 98 | } 99 | if _parent_0.__inherited then 100 | _parent_0.__inherited(_parent_0, _class_0) 101 | end 102 | FisherClassifier = _class_0 103 | return _class_0 104 | end 105 | -------------------------------------------------------------------------------- /lapis/bayes/models/word_classifications.lua: -------------------------------------------------------------------------------- 1 | local db = require("lapis.db") 2 | local Model 3 | Model = require("lapis.bayes.model").Model 4 | local WordClassifications 5 | do 6 | local _class_0 7 | local _parent_0 = Model 8 | local _base_0 = { 9 | delete = function(self) 10 | local deleted, res = _class_0.__parent.__base.delete(self, db.raw("*")) 11 | if deleted then 12 | local removed_row = self.__class:load((unpack(res))) 13 | local Categories 14 | Categories = require("lapis.bayes.models").Categories 15 | db.update(Categories:table_name(), { 16 | total_count = db.raw(db.interpolate_query(" total_count - ?", removed_row.count)) 17 | }, { 18 | id = self.category_id 19 | }) 20 | return true 21 | end 22 | end, 23 | _increment = function(self, amount) 24 | amount = assert(tonumber(amount), "expecting number") 25 | self:update({ 26 | count = db.raw("count + " .. tostring(amount)) 27 | }) 28 | if self.count == 0 then 29 | return db.delete(self.__class:table_name(), { 30 | category_id = self.category_id, 31 | word = self.word, 32 | count = 0 33 | }) 34 | end 35 | end 36 | } 37 | _base_0.__index = _base_0 38 | setmetatable(_base_0, _parent_0.__base) 39 | _class_0 = setmetatable({ 40 | __init = function(self, ...) 41 | return _class_0.__parent.__init(self, ...) 42 | end, 43 | __base = _base_0, 44 | __name = "WordClassifications", 45 | __parent = _parent_0 46 | }, { 47 | __index = function(cls, name) 48 | local val = rawget(_base_0, name) 49 | if val == nil then 50 | local parent = rawget(cls, "__parent") 51 | if parent then 52 | return parent[name] 53 | end 54 | else 55 | return val 56 | end 57 | end, 58 | __call = function(cls, ...) 59 | local _self_0 = setmetatable({}, _base_0) 60 | cls.__init(_self_0, ...) 61 | return _self_0 62 | end 63 | }) 64 | _base_0.__class = _class_0 65 | local self = _class_0 66 | self.primary_key = { 67 | "category_id", 68 | "word" 69 | } 70 | self.relations = { 71 | { 72 | "category", 73 | belongs_to = "Categories" 74 | } 75 | } 76 | self.find_or_create = function(self, opts) 77 | if opts == nil then 78 | opts = { } 79 | end 80 | return self:find(opts) or self:create(opts) 81 | end 82 | self.purge_word = function(self, word, categories) 83 | local Categories 84 | Categories = require("lapis.bayes.models").Categories 85 | if not (type(categories) == "table") then 86 | categories = { 87 | categories 88 | } 89 | end 90 | local original_count = #categories 91 | assert(original_count > 0, "missing categories") 92 | categories = Categories:find_all(categories, { 93 | key = "name" 94 | }) 95 | assert(#categories == original_count, "failed to find all categories specified") 96 | local wcs = self:select("where word = ? 
and category_id in ?", word, db.list((function() 97 | local _accum_0 = { } 98 | local _len_0 = 1 99 | for _index_0 = 1, #categories do 100 | local c = categories[_index_0] 101 | _accum_0[_len_0] = c.id 102 | _len_0 = _len_0 + 1 103 | end 104 | return _accum_0 105 | end)())) 106 | local count = 0 107 | for _index_0 = 1, #wcs do 108 | local wc = wcs[_index_0] 109 | if wc:delete() then 110 | count = count + 1 111 | end 112 | end 113 | return count > 0, count 114 | end 115 | if _parent_0.__inherited then 116 | _parent_0.__inherited(_parent_0, _class_0) 117 | end 118 | WordClassifications = _class_0 119 | return _class_0 120 | end 121 | -------------------------------------------------------------------------------- /lapis/bayes/classifiers/test.lua: -------------------------------------------------------------------------------- 1 | local average 2 | average = function(nums) 3 | local sum = 0 4 | for _index_0 = 1, #nums do 5 | local n = nums[_index_0] 6 | sum = sum + n 7 | end 8 | return sum / #nums 9 | end 10 | local weighted_avg 11 | weighted_avg = function(tuples) 12 | local num_tuples = #tuples 13 | local sum = 0 14 | local sum_weight = 0 15 | for _index_0 = 1, #tuples do 16 | local _des_0 = tuples[_index_0] 17 | local num, weight 18 | num, weight = _des_0[1], _des_0[2] 19 | sum = sum + num 20 | sum_weight = sum_weight + weight 21 | end 22 | local avg_weight = sum_weight / num_tuples 23 | local avg = 0 24 | for _index_0 = 1, #tuples do 25 | local _des_0 = tuples[_index_0] 26 | local num, weight 27 | num, weight = _des_0[1], _des_0[2] 28 | avg = avg + ((num / num_tuples) * (weight / avg_weight)) 29 | end 30 | return avg 31 | end 32 | local TestClassifier 33 | do 34 | local _class_0 35 | local _parent_0 = require("lapis.bayes.classifiers.base") 36 | local _base_0 = { 37 | word_probabilities = function(self, categories, available_words) 38 | local total_counts = { } 39 | for _index_0 = 1, #categories do 40 | local _continue_0 = false 41 | repeat 42 | local c = categories[_index_0] 43 | if not (c.word_counts) then 44 | _continue_0 = true 45 | break 46 | end 47 | for word, count in pairs(c.word_counts) do 48 | local _update_0 = word 49 | total_counts[_update_0] = total_counts[_update_0] or 0 50 | local _update_1 = word 51 | total_counts[_update_1] = total_counts[_update_1] + count 52 | end 53 | _continue_0 = true 54 | until true 55 | if not _continue_0 then 56 | break 57 | end 58 | end 59 | local probs 60 | do 61 | local _accum_0 = { } 62 | local _len_0 = 1 63 | for _index_0 = 1, #categories do 64 | local c = categories[_index_0] 65 | local tuples 66 | do 67 | local _accum_1 = { } 68 | local _len_1 = 1 69 | for _index_1 = 1, #available_words do 70 | local word = available_words[_index_1] 71 | local total_count = total_counts[word] 72 | local cat_count = c.word_counts and c.word_counts[word] or 0 73 | local _value_0 = { 74 | cat_count / total_count, 75 | total_count 76 | } 77 | _accum_1[_len_1] = _value_0 78 | _len_1 = _len_1 + 1 79 | end 80 | tuples = _accum_1 81 | end 82 | local _value_0 = { 83 | c.name, 84 | weighted_avg(tuples) 85 | } 86 | _accum_0[_len_0] = _value_0 87 | _len_0 = _len_0 + 1 88 | end 89 | probs = _accum_0 90 | end 91 | table.sort(probs, function(a, b) 92 | return a[2] > b[2] 93 | end) 94 | return probs 95 | end 96 | } 97 | _base_0.__index = _base_0 98 | setmetatable(_base_0, _parent_0.__base) 99 | _class_0 = setmetatable({ 100 | __init = function(self, ...) 101 | return _class_0.__parent.__init(self, ...) 
102 | end, 103 | __base = _base_0, 104 | __name = "TestClassifier", 105 | __parent = _parent_0 106 | }, { 107 | __index = function(cls, name) 108 | local val = rawget(_base_0, name) 109 | if val == nil then 110 | local parent = rawget(cls, "__parent") 111 | if parent then 112 | return parent[name] 113 | end 114 | else 115 | return val 116 | end 117 | end, 118 | __call = function(cls, ...) 119 | local _self_0 = setmetatable({}, _base_0) 120 | cls.__init(_self_0, ...) 121 | return _self_0 122 | end 123 | }) 124 | _base_0.__class = _class_0 125 | if _parent_0.__inherited then 126 | _parent_0.__inherited(_parent_0, _class_0) 127 | end 128 | TestClassifier = _class_0 129 | return _class_0 130 | end 131 | -------------------------------------------------------------------------------- /spec/postgres_text_tokenizer_spec.moon: -------------------------------------------------------------------------------- 1 | import use_test_env from require "lapis.spec" 2 | 3 | describe "lapis.bayes.tokenizers.postgres_text", -> 4 | use_test_env! 5 | 6 | it "skips words in ignore list", -> 7 | PostgresTextTokenizer = require "lapis.bayes.tokenizers.postgres_text" 8 | 9 | t = PostgresTextTokenizer { 10 | ignore_words: { 11 | hodoc: true 12 | } 13 | } 14 | 15 | assert.same {"delisho"}, t\tokenize_text "12 delisho hodocs for $5.99" 16 | 17 | 18 | it "splits on symbols with option", -> 19 | PostgresTextTokenizer = require "lapis.bayes.tokenizers.postgres_text" 20 | 21 | t = PostgresTextTokenizer { 22 | symbols_split_tokens: true 23 | } 24 | 25 | assert.same { 26 | "buttz" 27 | "com" 28 | "disgust" 29 | "power" 30 | "super" 31 | "wow" 32 | }, 33 | t\tokenize_text "wow that was super-disgusting buttz.com power/up" 34 | 35 | it "adds a custom prefilter", -> 36 | PostgresTextTokenizer = require "lapis.bayes.tokenizers.postgres_text" 37 | 38 | t = PostgresTextTokenizer { 39 | filter_text: (text) -> 40 | text\gsub "[%w]+", "%1zoo" 41 | } 42 | 43 | assert.same {"goodzoo", "greatzoo", "stuffzoo", "wowzoo"}, 44 | t\tokenize_text "good great stuff wow" 45 | 46 | it "adds a custom token filter", -> 47 | PostgresTextTokenizer = require "lapis.bayes.tokenizers.postgres_text" 48 | 49 | t = PostgresTextTokenizer { 50 | filter_tokens: (tokens) -> 51 | [t\reverse! for t in *tokens] 52 | } 53 | 54 | assert.same {"doog", "taerg", "ffuts", "wow"}, 55 | t\tokenize_text "good great stuff wow" 56 | 57 | it "respects min_token_length", -> 58 | PostgresTextTokenizer = require "lapis.bayes.tokenizers.postgres_text" 59 | 60 | t = PostgresTextTokenizer { 61 | min_token_length: 5 62 | } 63 | 64 | assert.same {"great", "stuff"}, 65 | t\tokenize_text "hi wow great stuff" 66 | 67 | it "respects max_token_length", -> 68 | PostgresTextTokenizer = require "lapis.bayes.tokenizers.postgres_text" 69 | 70 | t = PostgresTextTokenizer { 71 | max_token_length: 4 72 | } 73 | 74 | assert.same {"good", "wow"}, 75 | t\tokenize_text "good great stuff wow" 76 | 77 | it "strips numbers by default", -> 78 | PostgresTextTokenizer = require "lapis.bayes.tokenizers.postgres_text" 79 | 80 | t = PostgresTextTokenizer! 
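    -- a note on the defaults (see lapis/bayes/tokenizers/postgres_text.lua): with no
    -- options the tokenizer keeps tokens of 2-12 characters and strip_numbers is
    -- enabled, so "99" and "5.99" below are dropped while "dollars" is stemmed to
    -- "dollar" by the english regconfig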
81 | 82 | tokens = t\tokenize_text "cost 99 dollars 5.99" 83 | table.sort tokens 84 | assert.same {"cost", "dollar"}, 85 | tokens 86 | 87 | it "keeps numbers when strip_numbers is false", -> 88 | PostgresTextTokenizer = require "lapis.bayes.tokenizers.postgres_text" 89 | 90 | t = PostgresTextTokenizer { 91 | strip_numbers: false 92 | } 93 | 94 | tokens = t\tokenize_text "cost 99 dollars 5.99" 95 | table.sort tokens 96 | assert.same {"5.99", "99", "cost", "dollar"}, 97 | tokens 98 | 99 | it "strips HTML tags with strip_tags option", -> 100 | PostgresTextTokenizer = require "lapis.bayes.tokenizers.postgres_text" 101 | 102 | t = PostgresTextTokenizer { 103 | strip_tags: true 104 | } 105 | 106 | assert.same {"hello", "link", "world"}, 107 | t\tokenize_text [[
<p>hello <b>world</b></p>
link]] 108 | 109 | it "uses legacy tokenizer that keeps duplicates", -> 110 | PostgresTextTokenizer = require "lapis.bayes.tokenizers.postgres_text" 111 | 112 | t = PostgresTextTokenizer { 113 | legacy_tokenizer: true 114 | } 115 | 116 | tokens = t\tokenize_text "burgers are burgers" 117 | table.sort tokens 118 | assert.same {"burger", "burger"}, 119 | tokens 120 | 121 | it "uses custom regconfig", -> 122 | PostgresTextTokenizer = require "lapis.bayes.tokenizers.postgres_text" 123 | 124 | -- Test with french config 125 | t = PostgresTextTokenizer { 126 | regconfig: "french" 127 | } 128 | 129 | -- This should tokenize using French rules 130 | tokens = t\tokenize_text "les maisons" 131 | assert.truthy tokens 132 | assert.truthy #tokens > 0 133 | -------------------------------------------------------------------------------- /lapis/bayes/tokenizers/postgres_text.lua: -------------------------------------------------------------------------------- 1 | local db = require("lapis.db") 2 | local PostgresTextTokenizer 3 | do 4 | local _class_0 5 | local _parent_0 = require("lapis.bayes.tokenizers.base") 6 | local _base_0 = { 7 | filter_tokens = function(self, tokens) 8 | local opts = self.opts 9 | local min_len = opts and opts.min_token_length or 2 10 | local max_len = opts and opts.max_token_length or 12 11 | local strip_numbers = opts and opts.strip_numbers 12 | if strip_numbers == nil then 13 | strip_numbers = true 14 | end 15 | return (function() 16 | local _accum_0 = { } 17 | local _len_0 = 1 18 | for _index_0 = 1, #tokens do 19 | local _continue_0 = false 20 | repeat 21 | local t = tokens[_index_0] 22 | local t_len = #t 23 | if t_len > max_len then 24 | _continue_0 = true 25 | break 26 | end 27 | if t_len < min_len then 28 | _continue_0 = true 29 | break 30 | end 31 | if strip_numbers and t:match("^[%d%.%/%-]+$") then 32 | _continue_0 = true 33 | break 34 | end 35 | if self.opts and self.opts.ignore_words and self.opts.ignore_words[t] then 36 | _continue_0 = true 37 | break 38 | end 39 | local _value_0 = t 40 | _accum_0[_len_0] = _value_0 41 | _len_0 = _len_0 + 1 42 | _continue_0 = true 43 | until true 44 | if not _continue_0 then 45 | break 46 | end 47 | end 48 | return _accum_0 49 | end)() 50 | end, 51 | slow_pg_tokenize = function(self, text) 52 | local regconfig = self.opts.regconfig or "english" 53 | return db.query([[SELECT unnest(lexemes) AS word FROM ts_debug(?, ?)]], regconfig, text) 54 | end, 55 | pg_tokenize = function(self, text) 56 | local regconfig = self.opts.regconfig or "english" 57 | return db.query([[SELECT unnest(tsvector_to_array(to_tsvector(?, ?))) AS word]], regconfig, text) 58 | end, 59 | tokenize_text = function(self, text) 60 | do 61 | local pre_filter = self.opts.filter_text 62 | if pre_filter then 63 | text = pre_filter(text) 64 | end 65 | end 66 | if self.opts.strip_tags then 67 | local extract_text 68 | extract_text = require("web_sanitize").extract_text 69 | text = extract_text(text) 70 | end 71 | if self.opts.symbols_split_tokens then 72 | text = text:gsub("[%!%@%#%$%%%^%&%*%(%)%[%]%{%}%|%\\%/%`%~%-%_%<%>%,%.]", " ") 73 | end 74 | local res 75 | if self.opts.legacy_tokenizer then 76 | res = self:slow_pg_tokenize(text) 77 | else 78 | res = self:pg_tokenize(text) 79 | end 80 | local tokens = self:filter_tokens((function() 81 | local _accum_0 = { } 82 | local _len_0 = 1 83 | for _index_0 = 1, #res do 84 | local r = res[_index_0] 85 | _accum_0[_len_0] = r.word 86 | _len_0 = _len_0 + 1 87 | end 88 | return _accum_0 89 | end)()) 90 | if self.opts.filter_tokens then 
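      -- the user-supplied filter_tokens callback receives the filtered token list
      -- along with the tokenizer options and should return the final list of
      -- tokens handed to the classifier (the spec above shows an example that
      -- reverses every token)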
91 | tokens = self.opts.filter_tokens(tokens, self.opts) 92 | end 93 | return tokens 94 | end 95 | } 96 | _base_0.__index = _base_0 97 | setmetatable(_base_0, _parent_0.__base) 98 | _class_0 = setmetatable({ 99 | __init = function(self, opts) 100 | if opts == nil then 101 | opts = { } 102 | end 103 | self.opts = opts 104 | end, 105 | __base = _base_0, 106 | __name = "PostgresTextTokenizer", 107 | __parent = _parent_0 108 | }, { 109 | __index = function(cls, name) 110 | local val = rawget(_base_0, name) 111 | if val == nil then 112 | local parent = rawget(cls, "__parent") 113 | if parent then 114 | return parent[name] 115 | end 116 | else 117 | return val 118 | end 119 | end, 120 | __call = function(cls, ...) 121 | local _self_0 = setmetatable({}, _base_0) 122 | cls.__init(_self_0, ...) 123 | return _self_0 124 | end 125 | }) 126 | _base_0.__class = _class_0 127 | if _parent_0.__inherited then 128 | _parent_0.__inherited(_parent_0, _class_0) 129 | end 130 | PostgresTextTokenizer = _class_0 131 | return _class_0 132 | end 133 | -------------------------------------------------------------------------------- /lapis/bayes/text/punycode.moon: -------------------------------------------------------------------------------- 1 | -- Punycode implementation for internationalized domain names 2 | -- Based on RFC 3492: https://tools.ietf.org/html/rfc3492 3 | 4 | -- Punycode parameters 5 | base = 36 6 | tmin = 1 7 | tmax = 26 8 | skew = 38 9 | damp = 700 10 | initial_bias = 72 11 | initial_n = 128 12 | delimiter = 0x2D -- hyphen-minus 13 | 14 | -- Adapt bias after each delta 15 | adapt = (delta, numpoints, firsttime) -> 16 | delta = if firsttime 17 | math.floor delta / damp 18 | else 19 | math.floor delta / 2 20 | 21 | delta = delta + math.floor delta / numpoints 22 | k = 0 23 | 24 | while delta > math.floor((base - tmin) * tmax / 2) 25 | delta = math.floor delta / (base - tmin) 26 | k = k + base 27 | 28 | k + math.floor ((base - tmin + 1) * delta) / (delta + skew) 29 | 30 | -- Encode a single digit (0-35) to character 31 | encode_digit = (d) -> 32 | if d < 26 33 | string.char d + 0x61 -- a-z 34 | else 35 | string.char d - 26 + 0x30 -- 0-9 36 | 37 | -- Calculate threshold for digit 38 | threshold = (k, bias) -> 39 | if k <= bias + tmin 40 | tmin 41 | elseif k >= bias + tmax 42 | tmax 43 | else 44 | k - bias 45 | 46 | -- Check if character is basic (ASCII) 47 | is_basic = (cp) -> 48 | cp < 0x80 49 | 50 | -- Get UTF8 codepoints from string 51 | utf8_codepoints = (str) -> 52 | codepoints = {} 53 | i = 1 54 | while i <= #str 55 | b = string.byte str, i 56 | cp = nil 57 | len = 1 58 | 59 | if b < 0x80 60 | cp = b 61 | len = 1 62 | elseif b >= 0xC0 and b < 0xE0 63 | b2 = string.byte(str, i + 1) or 0 64 | cp = ((b - 0xC0) * 0x40) + (b2 - 0x80) 65 | len = 2 66 | elseif b >= 0xE0 and b < 0xF0 67 | b2 = string.byte(str, i + 1) or 0 68 | b3 = string.byte(str, i + 2) or 0 69 | cp = ((b - 0xE0) * 0x1000) + ((b2 - 0x80) * 0x40) + (b3 - 0x80) 70 | len = 3 71 | elseif b >= 0xF0 and b < 0xF8 72 | b2 = string.byte(str, i + 1) or 0 73 | b3 = string.byte(str, i + 2) or 0 74 | b4 = string.byte(str, i + 3) or 0 75 | cp = ((b - 0xF0) * 0x40000) + ((b2 - 0x80) * 0x1000) + ((b3 - 0x80) * 0x40) + (b4 - 0x80) 76 | len = 4 77 | else 78 | -- Invalid UTF8, skip 79 | cp = b 80 | len = 1 81 | 82 | table.insert codepoints, cp 83 | i = i + len 84 | 85 | codepoints 86 | 87 | -- Encode a domain label using Punycode 88 | punycode_encode = (label) -> 89 | return label unless label and label != "" 90 | 91 | -- short circuit 92 | if 
label\match "^[%w%-]+$" 93 | return label 94 | 95 | -- Get codepoints 96 | codepoints = utf8_codepoints label 97 | input_length = #codepoints 98 | 99 | -- Check if all characters are basic (ASCII) 100 | has_nonbasic = false 101 | for cp in *codepoints 102 | if not is_basic cp 103 | has_nonbasic = true 104 | break 105 | 106 | return label unless has_nonbasic 107 | 108 | -- Extract basic characters 109 | output = {} 110 | basic_length = 0 111 | 112 | for cp in *codepoints 113 | if is_basic cp 114 | table.insert output, string.char(cp) 115 | basic_length = basic_length + 1 116 | 117 | -- Add delimiter if we had basic characters 118 | handled = basic_length 119 | if basic_length > 0 120 | table.insert output, string.char(delimiter) 121 | 122 | -- Encode non-basic characters 123 | n = initial_n 124 | bias = initial_bias 125 | delta = 0 126 | 127 | while handled < input_length 128 | -- Find next unhandled codepoint 129 | m = 0x10FFFF + 1 130 | for cp in *codepoints 131 | if cp >= n and cp < m 132 | m = cp 133 | 134 | -- Increase delta 135 | delta = delta + (m - n) * (handled + 1) 136 | n = m 137 | 138 | -- Encode all codepoints up to m 139 | for cp in *codepoints 140 | if cp < n 141 | delta = delta + 1 142 | elseif cp == n 143 | -- Encode delta 144 | q = delta 145 | k = base 146 | 147 | while true 148 | t = threshold k, bias 149 | if q < t 150 | break 151 | 152 | table.insert output, encode_digit(t + ((q - t) % (base - t))) 153 | q = math.floor (q - t) / (base - t) 154 | k = k + base 155 | 156 | table.insert output, encode_digit(q) 157 | bias = adapt delta, handled + 1, handled == basic_length 158 | delta = 0 159 | handled = handled + 1 160 | 161 | delta = delta + 1 162 | n = n + 1 163 | 164 | "xn--" .. table.concat output 165 | 166 | { 167 | :punycode_encode 168 | } 169 | -------------------------------------------------------------------------------- /lapis/bayes/models/categories.lua: -------------------------------------------------------------------------------- 1 | local db = require("lapis.db") 2 | local Model, encode_tuples 3 | do 4 | local _obj_0 = require("lapis.bayes.model") 5 | Model, encode_tuples = _obj_0.Model, _obj_0.encode_tuples 6 | end 7 | local Categories 8 | do 9 | local _class_0 10 | local _parent_0 = Model 11 | local _base_0 = { 12 | delete = function(self) 13 | if _class_0.__parent.__base.delete(self) then 14 | local WordClassifications 15 | WordClassifications = require("lapis.bayes.models").WordClassifications 16 | return db.delete(WordClassifications:table_name(), { 17 | category_id = self.id 18 | }) 19 | end 20 | end, 21 | increment = function(self, amount) 22 | amount = assert(tonumber(amount), "expecting number") 23 | return self:update({ 24 | total_count = db.raw("total_count + " .. 
tostring(amount)) 25 | }) 26 | end, 27 | increment_text = function(self, text, opts) 28 | if opts == nil then 29 | opts = { } 30 | end 31 | return error("This method has been removed, use increment_words instead") 32 | end, 33 | increment_word = function(self, word, count) 34 | local WordClassifications 35 | WordClassifications = require("lapis.bayes.models").WordClassifications 36 | local w = WordClassifications:find_or_create({ 37 | category_id = self.id, 38 | word = word 39 | }) 40 | w:_increment(count) 41 | return self:increment(count) 42 | end, 43 | increment_words = function(self, counts) 44 | if not (counts) then 45 | return nil, "missing counts" 46 | end 47 | local merged_counts = { } 48 | for k, v in pairs(counts) do 49 | local word, count 50 | if type(k) == "string" then 51 | word, count = k, v 52 | else 53 | word, count = v, 1 54 | end 55 | local _update_0 = word 56 | merged_counts[_update_0] = merged_counts[_update_0] or 0 57 | local _update_1 = word 58 | merged_counts[_update_1] = merged_counts[_update_1] + count 59 | end 60 | local total_count = 0 61 | local tuples 62 | do 63 | local _accum_0 = { } 64 | local _len_0 = 1 65 | for word, count in pairs(merged_counts) do 66 | total_count = total_count + count 67 | local _value_0 = { 68 | self.id, 69 | word, 70 | count 71 | } 72 | _accum_0[_len_0] = _value_0 73 | _len_0 = _len_0 + 1 74 | end 75 | tuples = _accum_0 76 | end 77 | if not (next(tuples)) then 78 | return total_count 79 | end 80 | local WordClassifications 81 | WordClassifications = require("lapis.bayes.models").WordClassifications 82 | local tbl = db.escape_identifier(WordClassifications:table_name()) 83 | db.query("\n INSERT INTO " .. tostring(tbl) .. " (category_id, word, count) " .. tostring(encode_tuples(tuples)) .. "\n ON CONFLICT (category_id, word) DO UPDATE SET count = " .. tostring(tbl) .. ".count + EXCLUDED.count\n ") 84 | self:increment(total_count) 85 | return total_count 86 | end 87 | } 88 | _base_0.__index = _base_0 89 | setmetatable(_base_0, _parent_0.__base) 90 | _class_0 = setmetatable({ 91 | __init = function(self, ...) 92 | return _class_0.__parent.__init(self, ...) 93 | end, 94 | __base = _base_0, 95 | __name = "Categories", 96 | __parent = _parent_0 97 | }, { 98 | __index = function(cls, name) 99 | local val = rawget(_base_0, name) 100 | if val == nil then 101 | local parent = rawget(cls, "__parent") 102 | if parent then 103 | return parent[name] 104 | end 105 | else 106 | return val 107 | end 108 | end, 109 | __call = function(cls, ...) 110 | local _self_0 = setmetatable({}, _base_0) 111 | cls.__init(_self_0, ...) 
112 | return _self_0 113 | end 114 | }) 115 | _base_0.__class = _class_0 116 | local self = _class_0 117 | self.timestamp = true 118 | self.relations = { 119 | { 120 | "word_classifications", 121 | has_many = "WordClassifications" 122 | } 123 | } 124 | self.find_or_create = function(self, name) 125 | return self:find({ 126 | name = name 127 | }) or self:create({ 128 | name = name 129 | }) 130 | end 131 | if _parent_0.__inherited then 132 | _parent_0.__inherited(_parent_0, _class_0) 133 | end 134 | Categories = _class_0 135 | return _class_0 136 | end 137 | -------------------------------------------------------------------------------- /lapis/bayes/classifiers/bayes.moon: -------------------------------------------------------------------------------- 1 | -- implements naive bayes with assumed probability 2 | class BayesClassifier extends require "lapis.bayes.classifiers.base" 3 | @default_options: { 4 | max_words: 40 5 | default_prob: 0.1 6 | log: false 7 | token_weight_patterns: nil 8 | uncertainty_weight: 1.0 9 | } 10 | 11 | get_token_weight: (word) => 12 | return 1.0 unless @opts.token_weight_patterns 13 | 14 | for pattern, weight in pairs @opts.token_weight_patterns 15 | if word\match pattern 16 | return weight 17 | 18 | 1.0 19 | 20 | word_probabilities: (categories, available_words, opts={}) => 21 | opts or= {} 22 | return nil, "only two categories supported at once" unless #categories == 2 23 | 24 | a, b = unpack categories 25 | 26 | sum_counts = 0 27 | for c in *categories 28 | sum_counts += c.total_count 29 | 30 | available_words = @candidate_words categories, available_words, @opts.max_words 31 | available_words_count = #available_words 32 | 33 | unclassified_counts = opts.unclassified_counts or @opts.unclassified_counts 34 | uncertainty_weight = if opts.uncertainty_weight != nil 35 | opts.uncertainty_weight 36 | else 37 | @opts.uncertainty_weight or 1.0 38 | uncertainty_weight = math.max uncertainty_weight, 0 39 | 40 | token_weights = {} 41 | for word in *available_words 42 | weight = @get_token_weight word 43 | 44 | if unclassified_counts 45 | unc = unclassified_counts[word] 46 | if unc and unc > 0 47 | classified_total = 0 48 | classified_total += (a.word_counts and a.word_counts[word]) or 0 49 | classified_total += (b.word_counts and b.word_counts[word]) or 0 50 | 51 | total = classified_total + unc 52 | if total > 0 and uncertainty_weight != 0 53 | confidence = classified_total / total 54 | weight *= confidence ^ uncertainty_weight 55 | 56 | token_weights[word] = weight 57 | 58 | default_prob = @opts.default_prob / sum_counts 59 | 60 | default_a = default_prob * a.total_count 61 | default_b = default_prob * b.total_count 62 | 63 | -- NOTE: you should use log mode if you have a large number of tokens 64 | -- because the numbers get really small 65 | prob = if @opts.log 66 | ai_log_sum = 0 67 | bi_log_sum = 0 68 | 69 | for word in *available_words 70 | ai_count = (a.word_counts and a.word_counts[word] or 0) + default_a 71 | bi_count = (b.word_counts and b.word_counts[word] or 0) + default_b 72 | 73 | weight = token_weights[word] or @get_token_weight word 74 | 75 | ai_log_sum += weight * math.log ai_count 76 | bi_log_sum += weight * math.log bi_count 77 | 78 | ai_log_sum += math.log a.total_count 79 | bi_log_sum += math.log b.total_count 80 | 81 | ai_log_sum -= math.log (default_a + a.total_count) 82 | bi_log_sum -= math.log (default_b + b.total_count) 83 | 84 | ai_log_sum -= math.log available_words_count 85 | bi_log_sum -= math.log available_words_count 86 | 87 | max_log_sum 
= math.max ai_log_sum, bi_log_sum 88 | 89 | ai_prob = math.exp(ai_log_sum - max_log_sum) 90 | bi_prob = math.exp(bi_log_sum - max_log_sum) 91 | 92 | ai_prob / (ai_prob + bi_prob) 93 | else 94 | local ai_mul, bi_mul 95 | 96 | for word in *available_words 97 | ai_count = (a.word_counts and a.word_counts[word] or 0) + default_a 98 | bi_count = (b.word_counts and b.word_counts[word] or 0) + default_b 99 | 100 | weight = token_weights[word] or @get_token_weight word 101 | 102 | if ai_mul 103 | ai_mul *= ai_count ^ weight 104 | else 105 | ai_mul = ai_count ^ weight 106 | 107 | if bi_mul 108 | bi_mul *= bi_count ^ weight 109 | else 110 | bi_mul = bi_count ^ weight 111 | 112 | ai_prob = a.total_count * ai_mul / ((a.total_count + default_a) * available_words_count) 113 | bi_prob = b.total_count * bi_mul / ((b.total_count + default_b) * available_words_count) 114 | 115 | ai_prob = 0 if ai_prob != ai_prob 116 | bi_prob = 0 if bi_prob != bi_prob 117 | 118 | ai_prob / (ai_prob + bi_prob) 119 | 120 | if prob != prob 121 | return nil, "Got nan when calculating prob" 122 | 123 | if prob == math.huge or prob == -math.huge 124 | return nil, "Got inf when calculating prob" 125 | 126 | tuples = { 127 | { a.name, prob } 128 | { b.name, 1 - prob } 129 | } 130 | 131 | table.sort tuples, (a, b) -> a[2] > b[2] 132 | tuples 133 | -------------------------------------------------------------------------------- /lapis/bayes/text/punycode.lua: -------------------------------------------------------------------------------- 1 | local base = 36 2 | local tmin = 1 3 | local tmax = 26 4 | local skew = 38 5 | local damp = 700 6 | local initial_bias = 72 7 | local initial_n = 128 8 | local delimiter = 0x2D 9 | local adapt 10 | adapt = function(delta, numpoints, firsttime) 11 | if firsttime then 12 | delta = math.floor(delta / damp) 13 | else 14 | delta = math.floor(delta / 2) 15 | end 16 | delta = delta + math.floor(delta / numpoints) 17 | local k = 0 18 | while delta > math.floor((base - tmin) * tmax / 2) do 19 | delta = math.floor(delta / (base - tmin)) 20 | k = k + base 21 | end 22 | return k + math.floor(((base - tmin + 1) * delta) / (delta + skew)) 23 | end 24 | local encode_digit 25 | encode_digit = function(d) 26 | if d < 26 then 27 | return string.char(d + 0x61) 28 | else 29 | return string.char(d - 26 + 0x30) 30 | end 31 | end 32 | local threshold 33 | threshold = function(k, bias) 34 | if k <= bias + tmin then 35 | return tmin 36 | elseif k >= bias + tmax then 37 | return tmax 38 | else 39 | return k - bias 40 | end 41 | end 42 | local is_basic 43 | is_basic = function(cp) 44 | return cp < 0x80 45 | end 46 | local utf8_codepoints 47 | utf8_codepoints = function(str) 48 | local codepoints = { } 49 | local i = 1 50 | while i <= #str do 51 | local b = string.byte(str, i) 52 | local cp = nil 53 | local len = 1 54 | if b < 0x80 then 55 | cp = b 56 | len = 1 57 | elseif b >= 0xC0 and b < 0xE0 then 58 | local b2 = string.byte(str, i + 1) or 0 59 | cp = ((b - 0xC0) * 0x40) + (b2 - 0x80) 60 | len = 2 61 | elseif b >= 0xE0 and b < 0xF0 then 62 | local b2 = string.byte(str, i + 1) or 0 63 | local b3 = string.byte(str, i + 2) or 0 64 | cp = ((b - 0xE0) * 0x1000) + ((b2 - 0x80) * 0x40) + (b3 - 0x80) 65 | len = 3 66 | elseif b >= 0xF0 and b < 0xF8 then 67 | local b2 = string.byte(str, i + 1) or 0 68 | local b3 = string.byte(str, i + 2) or 0 69 | local b4 = string.byte(str, i + 3) or 0 70 | cp = ((b - 0xF0) * 0x40000) + ((b2 - 0x80) * 0x1000) + ((b3 - 0x80) * 0x40) + (b4 - 0x80) 71 | len = 4 72 | else 73 | cp = b 74 | len = 1 75 
| end 76 | table.insert(codepoints, cp) 77 | i = i + len 78 | end 79 | return codepoints 80 | end 81 | local punycode_encode 82 | punycode_encode = function(label) 83 | if not (label and label ~= "") then 84 | return label 85 | end 86 | if label:match("^[%w%-]+$") then 87 | return label 88 | end 89 | local codepoints = utf8_codepoints(label) 90 | local input_length = #codepoints 91 | local has_nonbasic = false 92 | for _index_0 = 1, #codepoints do 93 | local cp = codepoints[_index_0] 94 | if not is_basic(cp) then 95 | has_nonbasic = true 96 | break 97 | end 98 | end 99 | if not (has_nonbasic) then 100 | return label 101 | end 102 | local output = { } 103 | local basic_length = 0 104 | for _index_0 = 1, #codepoints do 105 | local cp = codepoints[_index_0] 106 | if is_basic(cp) then 107 | table.insert(output, string.char(cp)) 108 | basic_length = basic_length + 1 109 | end 110 | end 111 | local handled = basic_length 112 | if basic_length > 0 then 113 | table.insert(output, string.char(delimiter)) 114 | end 115 | local n = initial_n 116 | local bias = initial_bias 117 | local delta = 0 118 | while handled < input_length do 119 | local m = 0x10FFFF + 1 120 | for _index_0 = 1, #codepoints do 121 | local cp = codepoints[_index_0] 122 | if cp >= n and cp < m then 123 | m = cp 124 | end 125 | end 126 | delta = delta + (m - n) * (handled + 1) 127 | n = m 128 | for _index_0 = 1, #codepoints do 129 | local cp = codepoints[_index_0] 130 | if cp < n then 131 | delta = delta + 1 132 | elseif cp == n then 133 | local q = delta 134 | local k = base 135 | while true do 136 | local t = threshold(k, bias) 137 | if q < t then 138 | break 139 | end 140 | table.insert(output, encode_digit(t + ((q - t) % (base - t)))) 141 | q = math.floor((q - t) / (base - t)) 142 | k = k + base 143 | end 144 | table.insert(output, encode_digit(q)) 145 | bias = adapt(delta, handled + 1, handled == basic_length) 146 | delta = 0 147 | handled = handled + 1 148 | end 149 | end 150 | delta = delta + 1 151 | n = n + 1 152 | end 153 | return "xn--" .. 
table.concat(output) 154 | end 155 | return { 156 | punycode_encode = punycode_encode 157 | } 158 | -------------------------------------------------------------------------------- /lapis/bayes/tokenizers/ngram.lua: -------------------------------------------------------------------------------- 1 | local NgramTokenizer 2 | do 3 | local _class_0 4 | local _parent_0 = require("lapis.bayes.tokenizers.base") 5 | local _base_0 = { 6 | build_grammar = function(self) 7 | local C, Ct 8 | do 9 | local _obj_0 = require("lpeg") 10 | C, Ct = _obj_0.C, _obj_0.Ct 11 | end 12 | local utf8 = require("lapis.util.utf8") 13 | local whitespace = utf8.whitespace 14 | local printable = utf8.printable_character 15 | local word_chars = printable - whitespace 16 | local word = C(word_chars ^ 1) 17 | return Ct((word + whitespace ^ 1) ^ 0) 18 | end, 19 | normalize_word = function(self, word) 20 | if not (word and word ~= "") then 21 | return 22 | end 23 | local normalized = tostring(word):lower() 24 | normalized = normalized:gsub("[%p]", "") 25 | normalized = normalized:gsub("%s+", "") 26 | if not (normalized ~= "") then 27 | return 28 | end 29 | return normalized 30 | end, 31 | ngram_size = function(self) 32 | local n = tonumber(self.opts.n) or 2 33 | n = math.floor(n) 34 | if n < 1 then 35 | n = 1 36 | end 37 | return n 38 | end, 39 | word_ngrams = function(self, word, n) 40 | local C, Ct 41 | do 42 | local _obj_0 = require("lpeg") 43 | C, Ct = _obj_0.C, _obj_0.Ct 44 | end 45 | local utf8 = require("lapis.util.utf8") 46 | local printable = utf8.printable_character 47 | local char_pattern = Ct((C(printable)) ^ 0) 48 | local chars = char_pattern:match(word) 49 | if not (chars) then 50 | return { 51 | word 52 | } 53 | end 54 | local len = #chars 55 | if len == 0 then 56 | return { 57 | word 58 | } 59 | end 60 | if len < n then 61 | return { 62 | word 63 | } 64 | end 65 | local out = { } 66 | for i = 1, len - n + 1 do 67 | local ngram = table.concat(chars, "", i, i + n - 1) 68 | table.insert(out, ngram) 69 | end 70 | return out 71 | end, 72 | tokenize_text = function(self, text) 73 | if not (text and text ~= "") then 74 | return { } 75 | end 76 | do 77 | local pre_filter = self.opts.filter_text 78 | if pre_filter then 79 | text = pre_filter(text) 80 | if not (text and text ~= "") then 81 | return { } 82 | end 83 | end 84 | end 85 | self.grammar = self.grammar or self:build_grammar() 86 | local words = self.grammar:match(text) 87 | if not (words) then 88 | return { } 89 | end 90 | local n = self:ngram_size() 91 | local ignore_numbers = self.opts.ignore_numbers 92 | if ignore_numbers == nil then 93 | ignore_numbers = true 94 | end 95 | local tokens = { } 96 | for _index_0 = 1, #words do 97 | local _continue_0 = false 98 | repeat 99 | local raw_word = words[_index_0] 100 | local cleaned = self:normalize_word(raw_word) 101 | if not (cleaned) then 102 | _continue_0 = true 103 | break 104 | end 105 | if ignore_numbers and cleaned:match("^%d+$") then 106 | _continue_0 = true 107 | break 108 | end 109 | local _list_0 = self:word_ngrams(cleaned, n) 110 | for _index_1 = 1, #_list_0 do 111 | local token = _list_0[_index_1] 112 | table.insert(tokens, token) 113 | end 114 | _continue_0 = true 115 | until true 116 | if not _continue_0 then 117 | break 118 | end 119 | end 120 | if self.opts.filter_tokens then 121 | tokens = self.opts.filter_tokens(tokens, self.opts) 122 | end 123 | return tokens 124 | end 125 | } 126 | _base_0.__index = _base_0 127 | setmetatable(_base_0, _parent_0.__base) 128 | _class_0 = setmetatable({ 129 | 
__init = function(self, opts) 130 | if opts == nil then 131 | opts = { } 132 | end 133 | self.opts = opts 134 | end, 135 | __base = _base_0, 136 | __name = "NgramTokenizer", 137 | __parent = _parent_0 138 | }, { 139 | __index = function(cls, name) 140 | local val = rawget(_base_0, name) 141 | if val == nil then 142 | local parent = rawget(cls, "__parent") 143 | if parent then 144 | return parent[name] 145 | end 146 | else 147 | return val 148 | end 149 | end, 150 | __call = function(cls, ...) 151 | local _self_0 = setmetatable({}, _base_0) 152 | cls.__init(_self_0, ...) 153 | return _self_0 154 | end 155 | }) 156 | _base_0.__class = _class_0 157 | if _parent_0.__inherited then 158 | _parent_0.__inherited(_parent_0, _class_0) 159 | end 160 | NgramTokenizer = _class_0 161 | return _class_0 162 | end 163 | -------------------------------------------------------------------------------- /lapis/bayes/tokenizers/url_domains.lua: -------------------------------------------------------------------------------- 1 | local trim 2 | trim = require("lapis.util").trim 3 | local UrlDomainsTokenizer 4 | do 5 | local _class_0 6 | local _parent_0 = require("lapis.bayes.tokenizers.base") 7 | local _base_0 = { 8 | ignore_domain = function(self, domain) 9 | if not (self.opts and self.opts.ignore_domains) then 10 | return 11 | end 12 | if self.opts.ignore_domains[domain] then 13 | return true 14 | end 15 | while true do 16 | local sub = domain:gsub("^%**%.?[^%.]+", "*") 17 | if sub == domain then 18 | return false 19 | end 20 | if self.opts.ignore_domains[sub] then 21 | return true 22 | end 23 | domain = sub 24 | end 25 | end, 26 | filter_tokens = function(self, urls) 27 | return (function() 28 | local _accum_0 = { } 29 | local _len_0 = 1 30 | for _index_0 = 1, #urls do 31 | local _continue_0 = false 32 | repeat 33 | local url = urls[_index_0] 34 | url = url:lower() 35 | url = trim(url) 36 | url = url:gsub("^%w+://", "") 37 | url = url:gsub("^www%.", "") 38 | url = url:gsub("/.*$", "") 39 | url = trim(url) 40 | url:gsub("<$", "") 41 | url:gsub("^>", "") 42 | if url == "" then 43 | _continue_0 = true 44 | break 45 | end 46 | if url:match("^%w+:") then 47 | _continue_0 = true 48 | break 49 | end 50 | if url:match([=[[<>="' ]]=]) then 51 | _continue_0 = true 52 | break 53 | end 54 | if not (url:match("%.")) then 55 | _continue_0 = true 56 | break 57 | end 58 | if self:ignore_domain(url) then 59 | _continue_0 = true 60 | break 61 | end 62 | local _value_0 = url 63 | _accum_0[_len_0] = _value_0 64 | _len_0 = _len_0 + 1 65 | _continue_0 = true 66 | until true 67 | if not _continue_0 then 68 | break 69 | end 70 | end 71 | return _accum_0 72 | end)() 73 | end, 74 | build_grammar = function(self) 75 | local P, S, R, C, Ct, Cs 76 | do 77 | local _obj_0 = require("lpeg") 78 | P, S, R, C, Ct, Cs = _obj_0.P, _obj_0.S, _obj_0.R, _obj_0.C, _obj_0.Ct, _obj_0.Cs 79 | end 80 | local case_insensitive 81 | case_insensitive = function(text) 82 | local out = nil 83 | for char in text:gmatch(".") do 84 | local p = S(tostring(char:lower()) .. 
tostring(char:upper())) 85 | if out then 86 | out = out * p 87 | else 88 | out = p 89 | end 90 | end 91 | return out 92 | end 93 | local unescape_char = P("&gt;") / ">" + P("&lt;") / "<" + P("&amp;") / "&" + P("&nbsp;") / " " + P("&#39;") / "'" + P("&#47;") / "/" + P("&quot;") / '"' 94 | local unescape_text = Cs((unescape_char + 1) ^ 1) 95 | local some_space = S(" \t\n") 96 | local space = some_space ^ 0 97 | local alphanum = R("az", "AZ", "09") 98 | local scheme = case_insensitive("http") * case_insensitive("s") ^ -1 * P("://") 99 | local raw_url = C(scheme * (P(1) - S(" \t\n")) ^ 1) 100 | local word = (alphanum + S("._-")) ^ 1 101 | local attr_value = C(word) + P('"') * C((1 - P('"')) ^ 0) * P('"') + P("'") * C((1 - P("'")) ^ 0) * P("'") 102 | local href = (case_insensitive("href") + case_insensitive("src")) * space * P("=") * space * attr_value / function(v) 103 | return unescape_text:match(v) or "" 104 | end 105 | local simple = C(case_insensitive("www") * (P(".") * (1 - (S("./") + some_space)) ^ 1) ^ 1) 106 | return Ct((raw_url + href + simple + 1) ^ 0) 107 | end, 108 | tokenize_text = function(self, text) 109 | self.grammar = self.grammar or self:build_grammar() 110 | local matches = self.grammar:match(text) 111 | if not (matches) then 112 | return nil, "failed to parse text" 113 | end 114 | return self:filter_tokens(matches) 115 | end 116 | } 117 | _base_0.__index = _base_0 118 | setmetatable(_base_0, _parent_0.__base) 119 | _class_0 = setmetatable({ 120 | __init = function(self, opts) 121 | if opts == nil then 122 | opts = { } 123 | end 124 | self.opts = opts 125 | end, 126 | __base = _base_0, 127 | __name = "UrlDomainsTokenizer", 128 | __parent = _parent_0 129 | }, { 130 | __index = function(cls, name) 131 | local val = rawget(_base_0, name) 132 | if val == nil then 133 | local parent = rawget(cls, "__parent") 134 | if parent then 135 | return parent[name] 136 | end 137 | else 138 | return val 139 | end 140 | end, 141 | __call = function(cls, ...) 142 | local _self_0 = setmetatable({}, _base_0) 143 | cls.__init(_self_0, ...) 144 | return _self_0 145 | end 146 | }) 147 | _base_0.__class = _class_0 148 | if _parent_0.__inherited then 149 | _parent_0.__inherited(_parent_0, _class_0) 150 | end 151 | UrlDomainsTokenizer = _class_0 152 | return _class_0 153 | end 154 | -------------------------------------------------------------------------------- /lapis/bayes/classifiers/base.moon: -------------------------------------------------------------------------------- 1 | import uniquify from require "lapis.util" 2 | 3 | class BaseClassifier 4 | default_tokenizer: "lapis.bayes.tokenizers.postgres_text" 5 | 6 | new: (@opts={}) => 7 | if @@default_options 8 | @opts = setmetatable {k,v for k,v in pairs @opts}, __index: @@default_options 9 | 10 | word_probabilities: (categories, words) => 11 | error "word_probabilities: subclass must implement" 12 | 13 | classify_text: (...) => 14 | counts, word_rate_or_err = @text_probabilities ...
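    -- text_probabilities returns the sorted {category, probability} tuples plus
    -- the fraction of input tokens that were recognised (or nil and an error
    -- message); classify_text unpacks the winning tuple below so callers get
    -- category_name, probability, token_ratio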
15 | unless counts 16 | return nil, word_rate_or_err 17 | 18 | counts[1][1], counts[1][2], word_rate_or_err 19 | 20 | tokenize_text: (text) => 21 | assert text, "missing text to tokenize" 22 | 23 | -- text is some object that is already tokenized 24 | unless type(text) == "string" 25 | return text 26 | 27 | -- custom tokenizer function passed 28 | if @opts.tokenize_text 29 | return @opts.tokenize_text text, @opts 30 | 31 | -- tokenizer instance passed 32 | tokenizer = if @opts.tokenizer 33 | @opts.tokenizer 34 | else 35 | Tokenizer = require @default_tokenizer 36 | Tokenizer(@opts) 37 | 38 | tokenizer\tokenize_text text 39 | 40 | train_text: (category, text, opts) => 41 | tokens = @tokenize_text text 42 | 43 | if opts and opts.filter_tokens 44 | tokens = opts.filter_tokens opts, text 45 | 46 | import Categories from require "lapis.bayes.models" 47 | category = Categories\find_or_create category 48 | category\increment_words tokens 49 | 50 | -- categories: a lua array of categories names 51 | -- text: string of text to classify, or an array of tokens to classify 52 | text_probabilities: (category_names, text, opts) => 53 | opts or= {} 54 | 55 | categories, err = @find_categories category_names 56 | 57 | unless categories 58 | return nil, err 59 | 60 | words = @tokenize_text text 61 | 62 | unless words and next words 63 | return nil, "failed to generate tokens for text" 64 | 65 | available_words, err = @count_words categories, words 66 | 67 | unless available_words 68 | return nil, err 69 | 70 | available_words_set = {word, true for word in *available_words} 71 | count = 0 72 | for word in *words 73 | count +=1 if available_words_set[word] 74 | 75 | token_ratio = count / #words 76 | 77 | probs, err = @word_probabilities categories, available_words, opts 78 | unless probs 79 | return nil, err 80 | 81 | -- put probs in hash table part of result 82 | for {c, p} in *probs 83 | probs[c] = p 84 | 85 | probs, token_ratio 86 | 87 | -- query the category objects by category name 88 | -- returns an array of category records in the same order as the input 89 | find_categories: (category_names) => 90 | import Categories from require "lapis.bayes.models" 91 | db = Categories.db 92 | 93 | categories = Categories\select "where name in ?", db.list category_names 94 | by_name = {c.name, c for c in *categories} 95 | 96 | local missing 97 | 98 | result = for name in *category_names 99 | c = by_name[name] 100 | 101 | unless c 102 | missing or= {} 103 | table.insert missing, name 104 | continue 105 | 106 | c 107 | 108 | if missing and next missing 109 | return nil, "find_categories: missing categories (#{table.concat missing, ", "})" 110 | 111 | result 112 | 113 | -- query for WordClassifications for the requested category ids 114 | -- both arguments are arrays 115 | -- returns WordClassifications in no particular order 116 | find_word_classifications: (words, category_ids) => 117 | return {} unless next(words) and next category_ids 118 | 119 | import WordClassifications from require "lapis.bayes.models" 120 | db = WordClassifications.db 121 | WordClassifications\select "where word in ? 
and category_id in ?", db.list(words), db.list(category_ids) 122 | 123 | -- reduce the set of available words by looking for polarizing words 124 | -- categories: array of category objects 125 | -- available_words: array of available words 126 | -- count: the max length of returned words array 127 | candidate_words: (categories, available_words, count) => 128 | return available_words if #available_words <= count 129 | 130 | assert #categories == 2, "can only do two categories" 131 | 132 | a,b = unpack categories 133 | -- calculate conflict words 134 | tuples = for word in *available_words 135 | a_count = a.word_counts and a.word_counts[word] or 0 136 | b_count = b.word_counts and b.word_counts[word] or 0 137 | 138 | { 139 | word 140 | math.random! / 100 + math.abs (a_count - b_count) / math.sqrt a_count + b_count 141 | a_count 142 | b_count 143 | } 144 | 145 | table.sort tuples, (a,b) -> 146 | a[2] > b[2] 147 | 148 | [t[1] for t in *tuples[,count]] 149 | 150 | -- load the categories with the counts from the words text, return the list 151 | -- of words that appear in at least one category 152 | -- 153 | -- categories: array of categories 154 | -- words: array of tokens 155 | count_words: (categories, words) => 156 | categories_by_id = {c.id, c for c in *categories} 157 | words = uniquify words 158 | 159 | wcs = @find_word_classifications words, [c.id for c in *categories] 160 | 161 | available_words = [word for word in pairs {wc.word, true for wc in *wcs}] 162 | 163 | if #available_words == 0 164 | return nil, "no words in text are classifyable" 165 | 166 | for wc in *wcs 167 | category = categories_by_id[wc.category_id] 168 | category.word_counts or= {} 169 | category.word_counts[wc.word] = wc.count 170 | 171 | available_words 172 | -------------------------------------------------------------------------------- /lapis/bayes/classifiers/bayes_multi.lua: -------------------------------------------------------------------------------- 1 | local BayesMultiClassifier 2 | do 3 | local _class_0 4 | local _parent_0 = require("lapis.bayes.classifiers.base") 5 | local _base_0 = { 6 | candidate_words = function(self, categories, available_words, count) 7 | if not (count and count < #available_words) then 8 | return available_words 9 | end 10 | local tuples 11 | do 12 | local _accum_0 = { } 13 | local _len_0 = 1 14 | for _index_0 = 1, #available_words do 15 | local word = available_words[_index_0] 16 | local totals = 0 17 | local counts = { } 18 | for _index_1 = 1, #categories do 19 | local category = categories[_index_1] 20 | local word_counts = category.word_counts 21 | local c = word_counts and word_counts[word] or 0 22 | table.insert(counts, c) 23 | totals = totals + c 24 | end 25 | local score 26 | if totals == 0 then 27 | score = 0 28 | else 29 | local mean = totals / #counts 30 | local variance = 0 31 | for _index_1 = 1, #counts do 32 | local c = counts[_index_1] 33 | variance = variance + ((c - mean) ^ 2) 34 | end 35 | score = variance / #counts 36 | end 37 | score = score + (math.random() / 1000) 38 | local _value_0 = { 39 | word, 40 | score 41 | } 42 | _accum_0[_len_0] = _value_0 43 | _len_0 = _len_0 + 1 44 | end 45 | tuples = _accum_0 46 | end 47 | table.sort(tuples, function(a, b) 48 | return a[2] > b[2] 49 | end) 50 | local _accum_0 = { } 51 | local _len_0 = 1 52 | local _max_0 = count 53 | for _index_0 = 1, _max_0 < 0 and #tuples + _max_0 or _max_0 do 54 | local t = tuples[_index_0] 55 | _accum_0[_len_0] = t[1] 56 | _len_0 = _len_0 + 1 57 | end 58 | return _accum_0 59 | end, 60 | 
word_probabilities = function(self, categories, available_words) 61 | if not (#categories >= 2) then 62 | return nil, "at least two categories required" 63 | end 64 | available_words = self:candidate_words(categories, available_words, self.opts.max_words) 65 | local vocab_size = #available_words 66 | if not (vocab_size > 0) then 67 | return nil, "no words to score" 68 | end 69 | local smoothing 70 | if self.opts.default_prob and self.opts.default_prob > 0 then 71 | smoothing = self.opts.default_prob 72 | else 73 | smoothing = 1e-6 74 | end 75 | local sum_counts = 0 76 | for _index_0 = 1, #categories do 77 | local category = categories[_index_0] 78 | sum_counts = sum_counts + (category.total_count or 0) 79 | end 80 | local prior_smoothing = smoothing * #categories 81 | local max_log 82 | local log_scores 83 | do 84 | local _accum_0 = { } 85 | local _len_0 = 1 86 | for _index_0 = 1, #categories do 87 | local category = categories[_index_0] 88 | local cat_total = math.max((category.total_count or 0), 0) 89 | local prior = (cat_total + smoothing) / (sum_counts + prior_smoothing) 90 | local log_score = math.log(prior) 91 | local denominator = cat_total + (smoothing * vocab_size) 92 | if denominator <= 0 then 93 | denominator = smoothing * vocab_size 94 | end 95 | for _index_1 = 1, #available_words do 96 | local word = available_words[_index_1] 97 | local word_count = category.word_counts and category.word_counts[word] or 0 98 | log_score = log_score + math.log(((word_count + smoothing) / denominator)) 99 | end 100 | if max_log then 101 | max_log = math.max(max_log, log_score) 102 | else 103 | max_log = log_score 104 | end 105 | local _value_0 = { 106 | category, 107 | log_score 108 | } 109 | _accum_0[_len_0] = _value_0 110 | _len_0 = _len_0 + 1 111 | end 112 | log_scores = _accum_0 113 | end 114 | local weights = { } 115 | local total_weight = 0 116 | for _index_0 = 1, #log_scores do 117 | local _des_0 = log_scores[_index_0] 118 | local category, log_score 119 | category, log_score = _des_0[1], _des_0[2] 120 | local weight = math.exp((log_score - max_log)) 121 | total_weight = total_weight + weight 122 | table.insert(weights, { 123 | category.name, 124 | weight 125 | }) 126 | end 127 | if not (total_weight > 0) then 128 | return nil, "unable to normalise probabilities" 129 | end 130 | for _index_0 = 1, #weights do 131 | local tuple = weights[_index_0] 132 | local _update_0 = 2 133 | tuple[_update_0] = tuple[_update_0] / total_weight 134 | end 135 | table.sort(weights, function(a, b) 136 | return a[2] > b[2] 137 | end) 138 | return weights 139 | end 140 | } 141 | _base_0.__index = _base_0 142 | setmetatable(_base_0, _parent_0.__base) 143 | _class_0 = setmetatable({ 144 | __init = function(self, ...) 145 | return _class_0.__parent.__init(self, ...) 146 | end, 147 | __base = _base_0, 148 | __name = "BayesMultiClassifier", 149 | __parent = _parent_0 150 | }, { 151 | __index = function(cls, name) 152 | local val = rawget(_base_0, name) 153 | if val == nil then 154 | local parent = rawget(cls, "__parent") 155 | if parent then 156 | return parent[name] 157 | end 158 | else 159 | return val 160 | end 161 | end, 162 | __call = function(cls, ...) 163 | local _self_0 = setmetatable({}, _base_0) 164 | cls.__init(_self_0, ...) 
165 | return _self_0 166 | end 167 | }) 168 | _base_0.__class = _class_0 169 | local self = _class_0 170 | self.default_options = { 171 | max_words = 40, 172 | default_prob = 0.1 173 | } 174 | if _parent_0.__inherited then 175 | _parent_0.__inherited(_parent_0, _class_0) 176 | end 177 | BayesMultiClassifier = _class_0 178 | return _class_0 179 | end 180 | -------------------------------------------------------------------------------- /spec/punycode_spec.moon: -------------------------------------------------------------------------------- 1 | punycode = require "lapis.bayes.text.punycode" 2 | 3 | describe "lapis.bayes.text.punycode", -> 4 | describe "punycode_encode", -> 5 | fixtures = { 6 | { description: "German umlaut: münchen", label: "münchen", expected: "xn--mnchen-3ya" } 7 | { description: "German umlaut: müller", label: "müller", expected: "xn--mller-kva" } 8 | { description: "German umlaut: bücher", label: "bücher", expected: "xn--bcher-kva" } 9 | { description: "French accent: français", label: "français", expected: "xn--franais-xxa" } 10 | { description: "French accent: café", label: "café", expected: "xn--caf-dma" } 11 | { description: "Spanish tilde: español", label: "español", expected: "xn--espaol-zwa" } 12 | { description: "Spanish tilde: mañana", label: "mañana", expected: "xn--maana-pta" } 13 | { description: "Japanese kanji: 日本", label: "日本", expected: "xn--wgv71a" } 14 | { description: "Japanese hiragana: こんにちは", label: "こんにちは", expected: "xn--28j2a3ar1p" } 15 | { description: "Japanese katakana: テスト", label: "テスト", expected: "xn--zckzah" } 16 | { description: "Chinese simplified: 中国", label: "中国", expected: "xn--fiqs8s" } 17 | { description: "Chinese traditional: 中國", label: "中國", expected: "xn--fiqz9s" } 18 | { description: "Korean hangul: 한국", label: "한국", expected: "xn--3e0b707e" } 19 | { description: "Arabic: العربية", label: "العربية", expected: "xn--mgbcd4a2b0d2b" } 20 | { description: "Russian cyrillic: россия", label: "россия", expected: "xn--h1alffa9f" } 21 | { description: "Greek: ελληνικά", label: "ελληνικά", expected: "xn--hxargifdar" } 22 | { description: "Hebrew: עברית", label: "עברית", expected: "xn--5dbqzzl" } 23 | { description: "Thai: ไทย", label: "ไทย", expected: "xn--o3cw4h" } 24 | { description: "Mixed ASCII & Unicode: bücher-buch", label: "bücher-buch", expected: "xn--bcher-buch-9db" } 25 | { description: "Mixed ASCII & Unicode: hello世界", label: "hello世界", expected: "xn--hello-ck1hg65u" } 26 | { description: "Single Unicode codepoint: ü", label: "ü", expected: "xn--tda" } 27 | { description: "Single Unicode codepoint: ñ", label: "ñ", expected: "xn--ida" } 28 | { description: "Numeric suffix: 123ü", label: "123ü", expected: "xn--123-joa" } 29 | { description: "Leading hyphen: -ü", label: "-ü", expected: "xn----eha" } 30 | { description: "Swiss city: zürich", label: "zürich", expected: "xn--zrich-kva" } 31 | { description: "Russian city: москва", label: "москва", expected: "xn--80adxhks" } 32 | { description: "Arabic city: القاهرة", label: "القاهرة", expected: "xn--mgbag5a2flx" } 33 | { description: "Hyphen only label", label: "---", expected: "---" } 34 | { description: "German compound: bücher-bücherei", label: "bücher-bücherei", expected: "xn--bcher-bcherei-wobg" } 35 | { description: "Czech example", label: "Pročprostěnemluvíčesky", expected: "xn--Proprostnemluvesky-uyb24dma41a" } 36 | { description: "Chinese (simplified) example", label: "他们为什么不说中文", expected: "xn--ihqwcrb4cv8a8dqg056pqjye" } 37 | { description: "Chinese (traditional) 
example", label: "他們爲什麽不說中文", expected: "xn--ihqwctvzc91f659drss3x8bo0yb" } 38 | { description: "Arabic example", label: "ليهمابتكلموشعربي؟", expected: "xn--egbpdaj6bu4bxfgehfvwxn" } 39 | { description: "Hebrew example", label: "למההםפשוטלאמדבריםעברית", expected: "xn--4dbcagdahymbxekheh6e0a7fei0b" } 40 | { description: "Hindi example", label: "यहलोगहिन्दीक्योंनहींबोलसकतेहैं", expected: "xn--i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd" } 41 | { description: "Japanese sentence", label: "なぜみんな日本語を話してくれないのか", expected: "xn--n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa" } 42 | { description: "Korean example", label: "세계의모든사람들이한국어를이해한다면얼마나좋을까", expected: "xn--989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5jpsd879ccm6fea98c" } 43 | { description: "Russian example", label: "почемужеонинеговорятпорусски", expected: "xn--b1abfaaepdrnnbgefbadotcwatmq2g4l" } 44 | { description: "Spanish sentence", label: "PorquénopuedensimplementehablarenEspañol", expected: "xn--PorqunopuedensimplementehablarenEspaol-fmd56a" } 45 | { description: "Vietnamese example", label: "TạisaohọkhôngthểchỉnóitiếngViệt", expected: "xn--TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g" } 46 | { description: "Mixed example: 3年B組金八先生", label: "3年B組金八先生", expected: "xn--3B-ww4c5e180e575a65lsy2b" } 47 | { description: "Mixed example: 安室奈美恵-with-SUPER-MONKEYS", label: "安室奈美恵-with-SUPER-MONKEYS", expected: "xn---with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n" } 48 | { description: "Mixed example: Hello-Another-Way-それぞれの場所", label: "Hello-Another-Way-それぞれの場所", expected: "xn--Hello-Another-Way--fc4qua05auwb3674vfr0b" } 49 | { description: "Mixed example: ひとつ屋根の下2", label: "ひとつ屋根の下2", expected: "xn--2-u9tlzr9756bt3uc0v" } 50 | { description: "Mixed example: MajiでKoiする5秒前", label: "MajiでKoiする5秒前", expected: "xn--MajiKoi5-783gue6qz075azm5e" } 51 | { description: "Mixed example: パフィーdeルンバ", label: "パフィーdeルンバ", expected: "xn--de-jg4avhby1noc0d" } 52 | { description: "Mixed example: そのスピードで", label: "そのスピードで", expected: "xn--d9juau41awczczp" } 53 | } 54 | 55 | it "passes through ASCII-only strings unchanged", -> 56 | assert.same "example", punycode.punycode_encode "example" 57 | assert.same "test", punycode.punycode_encode "test" 58 | assert.same "hello-world", punycode.punycode_encode "hello-world" 59 | assert.same "abc123", punycode.punycode_encode "abc123" 60 | 61 | it "handles empty string", -> 62 | assert.same "", punycode.punycode_encode "" 63 | 64 | describe "fixture encodings", -> 65 | for case in *fixtures 66 | it "encodes #{case.description}", -> 67 | assert.same case.expected, punycode.punycode_encode case.label 68 | 69 | describe "ASCII boundary behaviour", -> 70 | it "preserves leading ASCII characters", -> 71 | result = punycode.punycode_encode "test日本" 72 | assert.true (result\match "^xn%-%-test") != nil 73 | 74 | it "handles trailing hyphen with Unicode", -> 75 | result = punycode.punycode_encode "test-ü" 76 | assert.true (result\match "^xn%-%-") != nil 77 | 78 | it "preserves case for ASCII characters", -> 79 | result = punycode.punycode_encode "Test日本" 80 | assert.true (result\match "Test") != nil 81 | 82 | it "handles emoji", -> 83 | result = punycode.punycode_encode "💩" 84 | assert.is_string result 85 | -------------------------------------------------------------------------------- /lapis/bayes/classifiers/bayes.lua: -------------------------------------------------------------------------------- 1 | local BayesClassifier 2 | do 3 | local _class_0 4 | local _parent_0 = require("lapis.bayes.classifiers.base") 5 | local _base_0 = { 6 | 
get_token_weight = function(self, word) 7 | if not (self.opts.token_weight_patterns) then 8 | return 1.0 9 | end 10 | for pattern, weight in pairs(self.opts.token_weight_patterns) do 11 | if word:match(pattern) then 12 | return weight 13 | end 14 | end 15 | return 1.0 16 | end, 17 | word_probabilities = function(self, categories, available_words, opts) 18 | if opts == nil then 19 | opts = { } 20 | end 21 | opts = opts or { } 22 | if not (#categories == 2) then 23 | return nil, "only two categories supported at once" 24 | end 25 | local a, b = unpack(categories) 26 | local sum_counts = 0 27 | for _index_0 = 1, #categories do 28 | local c = categories[_index_0] 29 | sum_counts = sum_counts + c.total_count 30 | end 31 | available_words = self:candidate_words(categories, available_words, self.opts.max_words) 32 | local available_words_count = #available_words 33 | local unclassified_counts = opts.unclassified_counts or self.opts.unclassified_counts 34 | local uncertainty_weight 35 | if opts.uncertainty_weight ~= nil then 36 | uncertainty_weight = opts.uncertainty_weight 37 | else 38 | uncertainty_weight = self.opts.uncertainty_weight or 1.0 39 | end 40 | uncertainty_weight = math.max(uncertainty_weight, 0) 41 | local token_weights = { } 42 | for _index_0 = 1, #available_words do 43 | local word = available_words[_index_0] 44 | local weight = self:get_token_weight(word) 45 | if unclassified_counts then 46 | local unc = unclassified_counts[word] 47 | if unc and unc > 0 then 48 | local classified_total = 0 49 | classified_total = classified_total + ((a.word_counts and a.word_counts[word]) or 0) 50 | classified_total = classified_total + ((b.word_counts and b.word_counts[word]) or 0) 51 | local total = classified_total + unc 52 | if total > 0 and uncertainty_weight ~= 0 then 53 | local confidence = classified_total / total 54 | weight = weight * (confidence ^ uncertainty_weight) 55 | end 56 | end 57 | end 58 | token_weights[word] = weight 59 | end 60 | local default_prob = self.opts.default_prob / sum_counts 61 | local default_a = default_prob * a.total_count 62 | local default_b = default_prob * b.total_count 63 | local prob 64 | if self.opts.log then 65 | local ai_log_sum = 0 66 | local bi_log_sum = 0 67 | for _index_0 = 1, #available_words do 68 | local word = available_words[_index_0] 69 | local ai_count = (a.word_counts and a.word_counts[word] or 0) + default_a 70 | local bi_count = (b.word_counts and b.word_counts[word] or 0) + default_b 71 | local weight = token_weights[word] or self:get_token_weight(word) 72 | ai_log_sum = ai_log_sum + (weight * math.log(ai_count)) 73 | bi_log_sum = bi_log_sum + (weight * math.log(bi_count)) 74 | end 75 | ai_log_sum = ai_log_sum + math.log(a.total_count) 76 | bi_log_sum = bi_log_sum + math.log(b.total_count) 77 | ai_log_sum = ai_log_sum - math.log((default_a + a.total_count)) 78 | bi_log_sum = bi_log_sum - math.log((default_b + b.total_count)) 79 | ai_log_sum = ai_log_sum - math.log(available_words_count) 80 | bi_log_sum = bi_log_sum - math.log(available_words_count) 81 | local max_log_sum = math.max(ai_log_sum, bi_log_sum) 82 | local ai_prob = math.exp(ai_log_sum - max_log_sum) 83 | local bi_prob = math.exp(bi_log_sum - max_log_sum) 84 | prob = ai_prob / (ai_prob + bi_prob) 85 | else 86 | local ai_mul, bi_mul 87 | for _index_0 = 1, #available_words do 88 | local word = available_words[_index_0] 89 | local ai_count = (a.word_counts and a.word_counts[word] or 0) + default_a 90 | local bi_count = (b.word_counts and b.word_counts[word] or 0) + default_b 91 | 
local weight = token_weights[word] or self:get_token_weight(word) 92 | if ai_mul then 93 | ai_mul = ai_mul * (ai_count ^ weight) 94 | else 95 | ai_mul = ai_count ^ weight 96 | end 97 | if bi_mul then 98 | bi_mul = bi_mul * (bi_count ^ weight) 99 | else 100 | bi_mul = bi_count ^ weight 101 | end 102 | end 103 | local ai_prob = a.total_count * ai_mul / ((a.total_count + default_a) * available_words_count) 104 | local bi_prob = b.total_count * bi_mul / ((b.total_count + default_b) * available_words_count) 105 | if ai_prob ~= ai_prob then 106 | ai_prob = 0 107 | end 108 | if bi_prob ~= bi_prob then 109 | bi_prob = 0 110 | end 111 | prob = ai_prob / (ai_prob + bi_prob) 112 | end 113 | if prob ~= prob then 114 | return nil, "Got nan when calculating prob" 115 | end 116 | if prob == math.huge or prob == -math.huge then 117 | return nil, "Got inf when calculating prob" 118 | end 119 | local tuples = { 120 | { 121 | a.name, 122 | prob 123 | }, 124 | { 125 | b.name, 126 | 1 - prob 127 | } 128 | } 129 | table.sort(tuples, function(a, b) 130 | return a[2] > b[2] 131 | end) 132 | return tuples 133 | end 134 | } 135 | _base_0.__index = _base_0 136 | setmetatable(_base_0, _parent_0.__base) 137 | _class_0 = setmetatable({ 138 | __init = function(self, ...) 139 | return _class_0.__parent.__init(self, ...) 140 | end, 141 | __base = _base_0, 142 | __name = "BayesClassifier", 143 | __parent = _parent_0 144 | }, { 145 | __index = function(cls, name) 146 | local val = rawget(_base_0, name) 147 | if val == nil then 148 | local parent = rawget(cls, "__parent") 149 | if parent then 150 | return parent[name] 151 | end 152 | else 153 | return val 154 | end 155 | end, 156 | __call = function(cls, ...) 157 | local _self_0 = setmetatable({}, _base_0) 158 | cls.__init(_self_0, ...) 
159 | return _self_0 160 | end 161 | }) 162 | _base_0.__class = _class_0 163 | local self = _class_0 164 | self.default_options = { 165 | max_words = 40, 166 | default_prob = 0.1, 167 | log = false, 168 | token_weight_patterns = nil, 169 | uncertainty_weight = 1.0 170 | } 171 | if _parent_0.__inherited then 172 | _parent_0.__inherited(_parent_0, _class_0) 173 | end 174 | BayesClassifier = _class_0 175 | return _class_0 176 | end 177 | -------------------------------------------------------------------------------- /spec/unaccent_spec.moon: -------------------------------------------------------------------------------- 1 | 2 | unaccent = require "lapis.bayes.text.unaccent" 3 | 4 | describe "lapis.bayes.text.unaccent", -> 5 | describe "unaccent_string", -> 6 | it "passes through basic ASCII unchanged", -> 7 | assert.same "hello world", unaccent.unaccent_string "hello world" 8 | assert.same "abc123", unaccent.unaccent_string "abc123" 9 | assert.same "test", unaccent.unaccent_string "test" 10 | 11 | it "handles empty string", -> 12 | assert.same "", unaccent.unaccent_string "" 13 | 14 | it "converts fullwidth characters to ASCII", -> 15 | assert.same "abc", unaccent.unaccent_string "abc" 16 | assert.same "ABC", unaccent.unaccent_string "ABC" 17 | assert.same "123", unaccent.unaccent_string "123" 18 | 19 | it "converts mathematical alphanumerics", -> 20 | assert.same "abc", unaccent.unaccent_string "𝕒𝕓𝕔" 21 | assert.same "xyz", unaccent.unaccent_string "𝚡𝚢𝚣" 22 | assert.same "ABC", unaccent.unaccent_string "𝓐𝓑𝓒" 23 | 24 | it "converts mathematical bold letters", -> 25 | assert.same "SaleIsLiveCheckNow", unaccent.unaccent_string "𝐒𝐚𝐥𝐞𝐈𝐬𝐋𝐢𝐯𝐞𝐂𝐡𝐞𝐜𝐤𝐍𝐨𝐰" 26 | assert.same "ABC", unaccent.unaccent_string "𝐀𝐁𝐂" 27 | assert.same "xyz", unaccent.unaccent_string "𝐱𝐲𝐳" 28 | 29 | it "removes accents from Latin characters", -> 30 | assert.same "aeiou", unaccent.unaccent_string "àéíóú" 31 | assert.same "AEIOU", unaccent.unaccent_string "ÀÉÍÓÚ" 32 | assert.same "nca", unaccent.unaccent_string "ñçä" 33 | 34 | it "converts Greek letters to Latin", -> 35 | assert.same "a", unaccent.unaccent_string "α" 36 | assert.same "y", unaccent.unaccent_string "γ" 37 | assert.same "n", unaccent.unaccent_string "π" 38 | assert.same "o", unaccent.unaccent_string "ο" 39 | 40 | it "converts Cyrillic letters to Latin", -> 41 | assert.same "a", unaccent.unaccent_string "а" 42 | assert.same "e", unaccent.unaccent_string "е" 43 | assert.same "o", unaccent.unaccent_string "о" 44 | 45 | it "normalizes special punctuation", -> 46 | assert.same ".", unaccent.unaccent_string "。" 47 | assert.same ",", unaccent.unaccent_string "," 48 | assert.same ":", unaccent.unaccent_string ":" 49 | assert.same "!", unaccent.unaccent_string "!" 
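-- A minimal usage sketch, assuming one wants to apply this transliteration before tokenizing (this wiring is an assumption, not something the library does by default): the ngram tokenizer's filter_text option, exercised in spec/ngram_tokenizer_spec.moon, can call unaccent_string so look-alike glyphs are normalized before n-grams are generated:
--
--   unaccent = require "lapis.bayes.text.unaccent"
--   NgramTokenizer = require "lapis.bayes.tokenizers.ngram"
--   tokenizer = NgramTokenizer filter_text: (text) -> unaccent.unaccent_string text
--   tokenizer\tokenize_text "𝐒𝐀𝐋𝐄 𝐧𝐨𝐰" -- produces bigrams of "sale now" rather than of the styled glyphs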
50 | 51 | it "normalizes mathematical operators", -> 52 | assert.same "==", unaccent.unaccent_string "⩵" 53 | assert.same "===", unaccent.unaccent_string "⩶" 54 | assert.same "::=", unaccent.unaccent_string "⩴" 55 | 56 | it "normalizes brackets", -> 57 | assert.same "[", unaccent.unaccent_string "[" 58 | assert.same "]", unaccent.unaccent_string "]" 59 | assert.same "{", unaccent.unaccent_string "{" 60 | assert.same "}", unaccent.unaccent_string "}" 61 | 62 | it "converts special number forms", -> 63 | assert.same "0", unaccent.unaccent_string "0" 64 | assert.same " 1/2", unaccent.unaccent_string "½" 65 | assert.same " 1/4", unaccent.unaccent_string "¼" 66 | assert.same " 3/4", unaccent.unaccent_string "¾" 67 | 68 | it "converts Roman numerals", -> 69 | assert.same "1", unaccent.unaccent_string "Ⅰ" 70 | assert.same "IV", unaccent.unaccent_string "Ⅳ" 71 | assert.same "XII", unaccent.unaccent_string "Ⅻ" 72 | 73 | it "converts circled numbers", -> 74 | assert.same "1", unaccent.unaccent_string "①" 75 | assert.same "10", unaccent.unaccent_string "⑩" 76 | assert.same "20", unaccent.unaccent_string "⑳" 77 | 78 | it "converts enclosed alphanumerics", -> 79 | assert.same "(1)", unaccent.unaccent_string "⑴" 80 | assert.same "(a)", unaccent.unaccent_string "⒜" 81 | assert.same "1.", unaccent.unaccent_string "⒈" 82 | 83 | it "handles mixed character types", -> 84 | assert.same "hello123", unaccent.unaccent_string "hello123" 85 | assert.same "test.com", unaccent.unaccent_string "test。com" 86 | 87 | it "handles characters that should pass through", -> 88 | result = unaccent.unaccent_string "hello-world_test" 89 | assert.same "hello-world_test", result 90 | 91 | it "handles ligatures", -> 92 | assert.same "fi", unaccent.unaccent_string "fi" 93 | assert.same "fl", unaccent.unaccent_string "fl" 94 | assert.same "ffi", unaccent.unaccent_string "ffi" 95 | assert.same "ffl", unaccent.unaccent_string "ffl" 96 | assert.same "st", unaccent.unaccent_string "st" 97 | 98 | it "handles special letter forms", -> 99 | assert.same "ss", unaccent.unaccent_string "ß" 100 | assert.same "SS", unaccent.unaccent_string "ẞ" 101 | assert.same "ae", unaccent.unaccent_string "æ" 102 | assert.same "AE", unaccent.unaccent_string "Æ" 103 | assert.same "oe", unaccent.unaccent_string "œ" 104 | assert.same "OE", unaccent.unaccent_string "Œ" 105 | 106 | describe "comprehensive normalization tests from test.moon", -> 107 | -- Note: unaccent_string only does character transliteration, not case normalization 108 | -- Expected values show what unaccent_string outputs (with spaces removed) 109 | normalizes = { 110 | {"hello world", "helloworld"} 111 | {"bamWaR7°CoМ", "bamWaR7.CoM"} 112 | {"BaМwAr7.СοM", "BaMwAr7.CoM"} 113 | {"b A m w A r 7 ° c O М", "bAmwAr7.coM"} 114 | {"B A Μ W а R 7 ㆍc o m", "BAMWaR7.com"} 115 | {"b AΜ w А R 7.cOм", "bAMwAR7.com"} 116 | {"bamwar7.com", "bamwar7.com"} 117 | {"BAM〉WAR7.com", "BAM>WAR7.com"} 118 | {"B A M W A R 7ㆍCOM", "BAMWAR7.COM"} 119 | {"BAMWAR7.COM", "BAMWAR7.CoM"} 120 | {"〚bam〛war7.〚com〛", "[bam]war7.[com]"} 121 | {"⒲⒲⒲.⒝⒜⒨⒲⒜⒭⑺.⒞⒪⒨", "(w)(w)(w).(b)(a)(m)(w)(a)(r)(7).(c)(o)(m)"} 122 | {" ⓦⓦⓦ.ⓑⓐⓜⓦⓐⓡ⑦.ⓒⓞⓜ", "www.bamwar7.com"} 123 | {"🇱🅔🅰🄵", "leaf"} 124 | {"ero588,C0M", "ero588,C0M"} 125 | {"RK772。CoM", "RK772.CoM"} 126 | {"MIO652。CoM", "MIO652.CoM"} 127 | {"KBS454。COM", "KBS454.CoM"} 128 | {"MI738。CoM", "MI738.CoM"} 129 | {"mkmk35。COM", "mkmk35.COM"} 130 | {"79ESA。CoM", "79ESA.CoM"} 131 | {"APA82。CoM", "APA82.CoM"} 132 | {"𝚟𝚘𝚙.𝚜𝚞", "vop.su"} 133 | {"MMO77。COM", "MMo77.CoM"} 134 | {"MIO652。COM", 
"Mio652.CoM"} 135 | {"kakao: dnj2016", "kakao:dnj2016"} 136 | } 137 | 138 | for {before, after} in *normalizes 139 | it "normalizes '#{before}'", -> 140 | result = unaccent.unaccent_string before 141 | -- Remove spaces for comparison since the test.moon examples show this 142 | result_normalized = result\gsub "%s", "" 143 | assert.same after, result_normalized 144 | 145 | describe "unaccent_table", -> 146 | it "exists and is a table", -> 147 | assert.is_table unaccent.unaccent_table 148 | 149 | it "has expected number of entries", -> 150 | count = 0 151 | for k, v in pairs unaccent.unaccent_table 152 | count += 1 153 | assert.true count > 2000, "Expected over 2000 mappings" 154 | 155 | it "contains specific mappings", -> 156 | assert.same "a", unaccent.unaccent_table["à"] 157 | assert.same "e", unaccent.unaccent_table["é"] 158 | assert.same "A", unaccent.unaccent_table["A"] 159 | assert.same "0", unaccent.unaccent_table["0"] 160 | assert.same ".", unaccent.unaccent_table["。"] 161 | 162 | it "maps fullwidth characters", -> 163 | assert.same "a", unaccent.unaccent_table["a"] 164 | assert.same "z", unaccent.unaccent_table["z"] 165 | assert.same "0", unaccent.unaccent_table["0"] 166 | assert.same "9", unaccent.unaccent_table["9"] 167 | 168 | it "maps Greek letters", -> 169 | assert.same "a", unaccent.unaccent_table["α"] 170 | assert.same "y", unaccent.unaccent_table["γ"] 171 | assert.same "n", unaccent.unaccent_table["π"] 172 | 173 | it "maps mathematical alphanumerics", -> 174 | assert.true unaccent.unaccent_table["𝕒"] != nil 175 | assert.true unaccent.unaccent_table["𝓐"] != nil 176 | assert.true unaccent.unaccent_table["𝚊"] != nil 177 | -------------------------------------------------------------------------------- /lapis/bayes/classifiers/base.lua: -------------------------------------------------------------------------------- 1 | local uniquify 2 | uniquify = require("lapis.util").uniquify 3 | local BaseClassifier 4 | do 5 | local _class_0 6 | local _base_0 = { 7 | default_tokenizer = "lapis.bayes.tokenizers.postgres_text", 8 | word_probabilities = function(self, categories, words) 9 | return error("word_probabilities: subclass must implement") 10 | end, 11 | classify_text = function(self, ...) 12 | local counts, word_rate_or_err = self:text_probabilities(...) 
13 | if not (counts) then 14 | return nil, word_rate_or_err 15 | end 16 | return counts[1][1], counts[1][2], word_rate_or_err 17 | end, 18 | tokenize_text = function(self, text) 19 | assert(text, "missing text to tokenize") 20 | if not (type(text) == "string") then 21 | return text 22 | end 23 | if self.opts.tokenize_text then 24 | return self.opts.tokenize_text(text, self.opts) 25 | end 26 | local tokenizer 27 | if self.opts.tokenizer then 28 | tokenizer = self.opts.tokenizer 29 | else 30 | local Tokenizer = require(self.default_tokenizer) 31 | tokenizer = Tokenizer(self.opts) 32 | end 33 | return tokenizer:tokenize_text(text) 34 | end, 35 | train_text = function(self, category, text, opts) 36 | local tokens = self:tokenize_text(text) 37 | if opts and opts.filter_tokens then 38 | tokens = opts.filter_tokens(opts, text) 39 | end 40 | local Categories 41 | Categories = require("lapis.bayes.models").Categories 42 | category = Categories:find_or_create(category) 43 | return category:increment_words(tokens) 44 | end, 45 | text_probabilities = function(self, category_names, text, opts) 46 | opts = opts or { } 47 | local categories, err = self:find_categories(category_names) 48 | if not (categories) then 49 | return nil, err 50 | end 51 | local words = self:tokenize_text(text) 52 | if not (words and next(words)) then 53 | return nil, "failed to generate tokens for text" 54 | end 55 | local available_words 56 | available_words, err = self:count_words(categories, words) 57 | if not (available_words) then 58 | return nil, err 59 | end 60 | local available_words_set 61 | do 62 | local _tbl_0 = { } 63 | for _index_0 = 1, #available_words do 64 | local word = available_words[_index_0] 65 | _tbl_0[word] = true 66 | end 67 | available_words_set = _tbl_0 68 | end 69 | local count = 0 70 | for _index_0 = 1, #words do 71 | local word = words[_index_0] 72 | if available_words_set[word] then 73 | count = count + 1 74 | end 75 | end 76 | local token_ratio = count / #words 77 | local probs 78 | probs, err = self:word_probabilities(categories, available_words, opts) 79 | if not (probs) then 80 | return nil, err 81 | end 82 | for _index_0 = 1, #probs do 83 | local _des_0 = probs[_index_0] 84 | local c, p 85 | c, p = _des_0[1], _des_0[2] 86 | probs[c] = p 87 | end 88 | return probs, token_ratio 89 | end, 90 | find_categories = function(self, category_names) 91 | local Categories 92 | Categories = require("lapis.bayes.models").Categories 93 | local db = Categories.db 94 | local categories = Categories:select("where name in ?", db.list(category_names)) 95 | local by_name 96 | do 97 | local _tbl_0 = { } 98 | for _index_0 = 1, #categories do 99 | local c = categories[_index_0] 100 | _tbl_0[c.name] = c 101 | end 102 | by_name = _tbl_0 103 | end 104 | local missing 105 | local result 106 | do 107 | local _accum_0 = { } 108 | local _len_0 = 1 109 | for _index_0 = 1, #category_names do 110 | local _continue_0 = false 111 | repeat 112 | local name = category_names[_index_0] 113 | local c = by_name[name] 114 | if not (c) then 115 | missing = missing or { } 116 | table.insert(missing, name) 117 | _continue_0 = true 118 | break 119 | end 120 | local _value_0 = c 121 | _accum_0[_len_0] = _value_0 122 | _len_0 = _len_0 + 1 123 | _continue_0 = true 124 | until true 125 | if not _continue_0 then 126 | break 127 | end 128 | end 129 | result = _accum_0 130 | end 131 | if missing and next(missing) then 132 | return nil, "find_categories: missing categories (" .. tostring(table.concat(missing, ", ")) .. 
")" 133 | end 134 | return result 135 | end, 136 | find_word_classifications = function(self, words, category_ids) 137 | if not (next(words) and next(category_ids)) then 138 | return { } 139 | end 140 | local WordClassifications 141 | WordClassifications = require("lapis.bayes.models").WordClassifications 142 | local db = WordClassifications.db 143 | return WordClassifications:select("where word in ? and category_id in ?", db.list(words), db.list(category_ids)) 144 | end, 145 | candidate_words = function(self, categories, available_words, count) 146 | if #available_words <= count then 147 | return available_words 148 | end 149 | assert(#categories == 2, "can only do two categories") 150 | local a, b = unpack(categories) 151 | local tuples 152 | do 153 | local _accum_0 = { } 154 | local _len_0 = 1 155 | for _index_0 = 1, #available_words do 156 | local word = available_words[_index_0] 157 | local a_count = a.word_counts and a.word_counts[word] or 0 158 | local b_count = b.word_counts and b.word_counts[word] or 0 159 | local _value_0 = { 160 | word, 161 | math.random() / 100 + math.abs((a_count - b_count) / math.sqrt(a_count + b_count)), 162 | a_count, 163 | b_count 164 | } 165 | _accum_0[_len_0] = _value_0 166 | _len_0 = _len_0 + 1 167 | end 168 | tuples = _accum_0 169 | end 170 | table.sort(tuples, function(a, b) 171 | return a[2] > b[2] 172 | end) 173 | local _accum_0 = { } 174 | local _len_0 = 1 175 | local _max_0 = count 176 | for _index_0 = 1, _max_0 < 0 and #tuples + _max_0 or _max_0 do 177 | local t = tuples[_index_0] 178 | _accum_0[_len_0] = t[1] 179 | _len_0 = _len_0 + 1 180 | end 181 | return _accum_0 182 | end, 183 | count_words = function(self, categories, words) 184 | local categories_by_id 185 | do 186 | local _tbl_0 = { } 187 | for _index_0 = 1, #categories do 188 | local c = categories[_index_0] 189 | _tbl_0[c.id] = c 190 | end 191 | categories_by_id = _tbl_0 192 | end 193 | words = uniquify(words) 194 | local wcs = self:find_word_classifications(words, (function() 195 | local _accum_0 = { } 196 | local _len_0 = 1 197 | for _index_0 = 1, #categories do 198 | local c = categories[_index_0] 199 | _accum_0[_len_0] = c.id 200 | _len_0 = _len_0 + 1 201 | end 202 | return _accum_0 203 | end)()) 204 | local available_words 205 | do 206 | local _accum_0 = { } 207 | local _len_0 = 1 208 | for word in pairs((function() 209 | local _tbl_0 = { } 210 | for _index_0 = 1, #wcs do 211 | local wc = wcs[_index_0] 212 | _tbl_0[wc.word] = true 213 | end 214 | return _tbl_0 215 | end)()) do 216 | _accum_0[_len_0] = word 217 | _len_0 = _len_0 + 1 218 | end 219 | available_words = _accum_0 220 | end 221 | if #available_words == 0 then 222 | return nil, "no words in text are classifyable" 223 | end 224 | for _index_0 = 1, #wcs do 225 | local wc = wcs[_index_0] 226 | local category = categories_by_id[wc.category_id] 227 | category.word_counts = category.word_counts or { } 228 | category.word_counts[wc.word] = wc.count 229 | end 230 | return available_words 231 | end 232 | } 233 | _base_0.__index = _base_0 234 | _class_0 = setmetatable({ 235 | __init = function(self, opts) 236 | if opts == nil then 237 | opts = { } 238 | end 239 | self.opts = opts 240 | if self.__class.default_options then 241 | self.opts = setmetatable((function() 242 | local _tbl_0 = { } 243 | for k, v in pairs(self.opts) do 244 | _tbl_0[k] = v 245 | end 246 | return _tbl_0 247 | end)(), { 248 | __index = self.__class.default_options 249 | }) 250 | end 251 | end, 252 | __base = _base_0, 253 | __name = "BaseClassifier" 254 | }, { 
255 | __index = _base_0, 256 | __call = function(cls, ...) 257 | local _self_0 = setmetatable({}, _base_0) 258 | cls.__init(_self_0, ...) 259 | return _self_0 260 | end 261 | }) 262 | _base_0.__class = _class_0 263 | BaseClassifier = _class_0 264 | return _class_0 265 | end 266 | -------------------------------------------------------------------------------- /spec/stem_spec.moon: -------------------------------------------------------------------------------- 1 | stem = require "lapis.bayes.text.stem" 2 | 3 | test_word = (input, expected) -> 4 | assert.same expected, stem.stem_word input 5 | 6 | describe "lapis.bayes.text.stem", -> 7 | describe "stem_word", -> 8 | it "handles nil and empty strings", -> 9 | assert.same nil, stem.stem_word nil 10 | assert.same "", stem.stem_word "" 11 | 12 | it "handles short words (< 3 chars)", -> 13 | test_word "a", "a" 14 | test_word "ab", "ab" 15 | test_word "at", "at" 16 | 17 | it "handles words that don't need stemming", -> 18 | test_word "cat", "cat" 19 | test_word "dog", "dog" 20 | test_word "tree", "tree" 21 | 22 | it "converts to lowercase", -> 23 | test_word "HELLO", "hello" 24 | test_word "WoRlD", "world" 25 | test_word "TEST", "test" 26 | 27 | describe "exception words", -> 28 | it "handles skis/skies", -> 29 | test_word "skis", "ski" 30 | test_word "skies", "sky" 31 | test_word "sky", "sky" 32 | 33 | it "handles special -ly cases", -> 34 | test_word "idly", "idl" 35 | test_word "gently", "gentl" 36 | test_word "ugly", "ugli" 37 | test_word "early", "earli" 38 | test_word "only", "onli" 39 | test_word "singly", "singl" 40 | 41 | it "handles invariant forms", -> 42 | test_word "news", "news" 43 | test_word "howe", "howe" 44 | test_word "atlas", "atlas" 45 | test_word "cosmos", "cosmos" 46 | test_word "bias", "bias" 47 | test_word "andes", "andes" 48 | 49 | describe "Step 1a - plurals and possessives", -> 50 | it "removes apostrophes", -> 51 | test_word "dog's", "dog" 52 | test_word "cat's'", "cat" 53 | 54 | it "handles sses -> ss", -> 55 | test_word "blesses", "bless" 56 | test_word "stresses", "stress" 57 | 58 | it "handles ied/ies", -> 59 | test_word "tied", "tie" 60 | test_word "pies", "pie" 61 | test_word "cries", "cri" 62 | test_word "studies", "studi" 63 | 64 | it "removes trailing s when appropriate", -> 65 | test_word "cats", "cat" 66 | test_word "dogs", "dog" 67 | test_word "gas", "ga" -- has vowel so s is removed 68 | test_word "this", "thi" -- has vowel so s is removed 69 | test_word "class", "class" -- ss ending 70 | 71 | describe "Step 1b - ed, ing suffixes", -> 72 | it "handles eed/eedly in R1", -> 73 | test_word "agreed", "agre" 74 | test_word "feed", "feed" -- R1 is null, so eed not replaced 75 | 76 | it "handles ed/edly", -> 77 | test_word "plastered", "plaster" 78 | test_word "bled", "bled" 79 | test_word "motivated", "motiv" 80 | 81 | it "handles ing/ingly", -> 82 | test_word "sing", "sing" 83 | test_word "motivating", "motiv" 84 | test_word "running", "run" 85 | test_word "hopping", "hop" 86 | 87 | it "adds e after at/bl/iz", -> 88 | test_word "luxuriated", "luxuri" -- removes 'ated', no e added 89 | test_word "troubled", "troubl" 90 | 91 | it "removes double consonants", -> 92 | test_word "hopped", "hop" 93 | test_word "fitted", "fit" 94 | test_word "planned", "plan" 95 | 96 | it "handles special ing cases", -> 97 | test_word "inning", "inning" 98 | test_word "outing", "outing" 99 | test_word "canning", "canning" 100 | 101 | describe "Step 1c - y suffix", -> 102 | it "replaces suffix y with i", -> 103 | test_word "happy", 
"happi" 104 | test_word "sky", "sky" -- exception word, not changed 105 | 106 | it "does not replace y at start or after vowel", -> 107 | test_word "say", "say" 108 | test_word "boy", "boy" 109 | 110 | describe "Step 2 - derivational suffixes", -> 111 | it "handles tional -> tion", -> 112 | test_word "relational", "relat" 113 | test_word "conditional", "condit" 114 | test_word "rational", "ration" 115 | 116 | it "handles enci -> ence", -> 117 | test_word "valenci", "valenc" 118 | 119 | it "handles anci -> ance", -> 120 | test_word "hesitanci", "hesit" 121 | 122 | it "handles izer -> ize", -> 123 | test_word "digitizer", "digit" 124 | 125 | it "handles ational -> ate", -> 126 | test_word "operational", "oper" 127 | 128 | it "handles ation/ator -> ate", -> 129 | test_word "predication", "predic" 130 | test_word "operator", "oper" 131 | 132 | it "handles alism -> al", -> 133 | test_word "feudalism", "feudal" 134 | 135 | it "handles fulness -> ful", -> 136 | test_word "hopefulness", "hope" 137 | 138 | it "handles ousness -> ous", -> 139 | test_word "callousness", "callous" 140 | 141 | it "handles iveness -> ive", -> 142 | test_word "decisiveness", "decis" 143 | 144 | it "handles biliti -> ble", -> 145 | test_word "sensibiliti", "sensibl" 146 | 147 | it "handles li deletion", -> 148 | test_word "formalli", "formal" 149 | 150 | describe "Step 3 - more derivational suffixes", -> 151 | it "handles icate -> ic", -> 152 | test_word "duplicate", "duplic" 153 | 154 | it "handles ative deletion in R2", -> 155 | test_word "demonstrative", "demonstr" 156 | 157 | it "handles alize -> al", -> 158 | test_word "normalize", "normal" 159 | 160 | it "handles ful/ness deletion", -> 161 | test_word "hopeful", "hope" 162 | test_word "goodness", "good" 163 | 164 | describe "Step 4 - suffix deletion", -> 165 | it "handles al", -> 166 | test_word "radical", "radic" 167 | 168 | it "handles ance/ence", -> 169 | test_word "dependence", "depend" 170 | 171 | it "handles er", -> 172 | test_word "computer", "comput" 173 | 174 | it "handles able/ible", -> 175 | test_word "adjustable", "adjust" 176 | test_word "divisible", "divis" 177 | 178 | it "handles ant/ent/ment", -> 179 | test_word "irritant", "irrit" 180 | test_word "different", "differ" 181 | test_word "adjustment", "adjust" 182 | 183 | it "handles ion after s or t", -> 184 | test_word "adoption", "adopt" 185 | test_word "decision", "decis" 186 | 187 | it "handles ism/iti/ous/ive/ize", -> 188 | test_word "communism", "communism" -- ism in R2 only 189 | test_word "sensitivity", "sensit" 190 | test_word "continuous", "continu" 191 | test_word "effective", "effect" 192 | test_word "realize", "realiz" 193 | 194 | describe "Step 5 - final cleanup", -> 195 | it "removes trailing e in R2", -> 196 | test_word "debate", "debat" 197 | test_word "create", "creat" 198 | 199 | it "removes trailing e in R1 if not short syllable", -> 200 | test_word "hope", "hope" 201 | 202 | it "keeps trailing e after short syllable in R1", -> 203 | test_word "centre", "centr" 204 | 205 | it "removes double l in R2", -> 206 | test_word "controll", "control" 207 | 208 | describe "word families", -> 209 | it "stems connection family to connect", -> 210 | test_word "connection", "connect" 211 | test_word "connections", "connect" 212 | test_word "connective", "connect" 213 | test_word "connected", "connect" 214 | test_word "connecting", "connect" 215 | 216 | it "stems generate family", -> 217 | test_word "generate", "generat" 218 | test_word "generates", "generat" 219 | test_word "generated", "generat" 
220 | test_word "generating", "generat" 221 | test_word "generator", "generat" 222 | test_word "general", "general" 223 | test_word "generalization", "general" 224 | 225 | it "stems happy family to happi", -> 226 | test_word "happy", "happi" 227 | test_word "happiness", "happi" 228 | test_word "happily", "happili" 229 | 230 | it "stems run family", -> 231 | test_word "run", "run" 232 | test_word "running", "run" 233 | test_word "runs", "run" 234 | test_word "runner", "runner" 235 | 236 | describe "complex derivational chains", -> 237 | it "handles multiply derived words", -> 238 | test_word "vietnamization", "vietnam" 239 | test_word "conformabli", "conform" 240 | test_word "radicalli", "radic" 241 | test_word "differentli", "differ" 242 | 243 | describe "special prefix handling", -> 244 | it "handles commun- prefix", -> 245 | test_word "communism", "communism" -- ism not in R2 246 | test_word "communication", "communic" 247 | test_word "community", "communiti" 248 | 249 | it "handles gener- prefix", -> 250 | test_word "generate", "generat" 251 | test_word "generator", "generat" 252 | test_word "generous", "generous" 253 | 254 | it "handles univers- prefix", -> 255 | test_word "university", "universiti" 256 | test_word "universal", "universal" 257 | test_word "universe", "univers" 258 | 259 | describe "edge cases", -> 260 | it "handles very long words", -> 261 | result = stem.stem_word "antidisestablishmentarianism" 262 | assert.is_string result 263 | assert.true #result > 0 264 | 265 | it "handles words with no vowels", -> 266 | test_word "shhh", "shhh" 267 | test_word "hmm", "hmm" 268 | 269 | it "handles repeated consonants", -> 270 | test_word "bless", "bless" 271 | test_word "press", "press" 272 | 273 | it "handles words ending in y", -> 274 | test_word "daily", "daili" 275 | test_word "easily", "easili" 276 | 277 | it "preserves words that should not be stemmed", -> 278 | test_word "test", "test" 279 | test_word "best", "best" 280 | -------------------------------------------------------------------------------- /spec/bayes_spec.moon: -------------------------------------------------------------------------------- 1 | 2 | import use_test_env from require "lapis.spec" 3 | import truncate_tables from require "lapis.spec.db" 4 | 5 | import Categories, WordClassifications from require "lapis.bayes.models" 6 | 7 | describe "lapis.bayes", -> 8 | use_test_env! 9 | 10 | describe "WordClassifications", -> 11 | local c1, c2 12 | 13 | before_each -> 14 | truncate_tables Categories, WordClassifications 15 | 16 | c1 = Categories\find_or_create "hello" 17 | c1\increment_words { 18 | alpha: 17 19 | beta: 19 20 | } 21 | 22 | c2 = Categories\find_or_create "world" 23 | c2\increment_words { 24 | beta: 22 25 | triple: 27 26 | } 27 | 28 | it "has the correct counts", -> 29 | c1_words = {c.word, c.count for c in *c1\get_word_classifications!} 30 | c2_words = {c.word, c.count for c in *c2\get_word_classifications!} 31 | 32 | assert.same { 33 | alpha: 17 34 | beta: 19 35 | }, c1_words 36 | 37 | assert.same { 38 | beta: 22 39 | triple: 27 40 | }, c2_words 41 | 42 | 43 | it "deletes word from category", -> 44 | c1_count = c1.total_count 45 | c2_count = c2.total_count 46 | 47 | wc = assert WordClassifications\find category_id: c1.id, word: "beta" 48 | wc\delete! 49 | 50 | c1\refresh! 51 | c2\refresh! 
52 | 53 | assert.same 19, c1_count - c1.total_count 54 | assert.same 0, c2_count - c2.total_count 55 | 56 | it "purges words from all categories", -> 57 | c1_count = c1.total_count 58 | c2_count = c2.total_count 59 | 60 | deleted, count = WordClassifications\purge_word "alpha", {"hello", "world"} 61 | assert.true deleted 62 | assert.same 1, count 63 | 64 | c1\refresh! 65 | c2\refresh! 66 | 67 | assert.same 17, c1_count - c1.total_count 68 | assert.same 0, c2_count - c2.total_count 69 | 70 | it "increments an individual word", -> 71 | wc = assert WordClassifications\find category_id: c1.id, word: "beta" 72 | 73 | before_word_count = wc.count 74 | 75 | wc\_increment 1 76 | wc\refresh! 77 | assert.same before_word_count + 1, wc.count 78 | 79 | it "deletes word when being unincremented to 0", -> 80 | wc = assert WordClassifications\find category_id: c1.id, word: "beta" 81 | wc\_increment -wc.count 82 | 83 | assert.nil (WordClassifications\find { 84 | category_id: c1.id 85 | word: "beta" 86 | }) 87 | 88 | it "clears out words when decrementing them", -> 89 | words = c1\get_word_classifications! 90 | for word in *words 91 | c1\increment_word word.word, -word.count 92 | 93 | assert.same 0, c1.total_count 94 | c1\refresh! 95 | assert.same {}, c1\get_word_classifications! 96 | 97 | describe "Categories", -> 98 | before_each -> 99 | truncate_tables Categories, WordClassifications 100 | 101 | it "finds or creates category", -> 102 | c = Categories\find_or_create "hello" 103 | c2 = Categories\find_or_create "hello" 104 | assert.same c.id, c2.id 105 | 106 | it "increments words", -> 107 | c = Categories\find_or_create "hello" 108 | 109 | WordClassifications\create { 110 | word: "color" 111 | category_id: c.id 112 | count: 2 113 | } 114 | 115 | c\increment_words { 116 | color: 55 117 | height: 12 118 | green: 8 119 | } 120 | 121 | wc_by_name = {wc.word, wc for wc in *WordClassifications\select!} 122 | 123 | assert.same 57, wc_by_name.color.count 124 | assert.same 12, wc_by_name.height.count 125 | assert.same 8, wc_by_name.green.count 126 | 127 | it "deletes category", -> 128 | c = Categories\find_or_create "hello" 129 | c\increment_words { 130 | color: 23 131 | height: 2 132 | } 133 | c\delete! 134 | 135 | describe "tokenize text", -> 136 | describe "default tokenizer", -> 137 | tokenize_text = (text, ...) -> 138 | if ... 
139 | error "Got expected additional arguments for tokenize text" 140 | 141 | BaseClassifier = require "lapis.bayes.classifiers.base" 142 | BaseClassifier!\tokenize_text text 143 | 144 | it "gets tokens for empty string", -> 145 | assert.same {}, tokenize_text "" 146 | 147 | it "gets tokens for basic string", -> 148 | assert.same {"hello", "world"}, tokenize_text "hello world" 149 | 150 | it "gets tokens with stems and no stop words", -> 151 | assert.same {"burger", "eat"}, tokenize_text "i am eating burgers" 152 | 153 | it "doesn't keep dupes", -> 154 | assert.same {"burger"}, tokenize_text "burgers are burgers" 155 | 156 | it "skips tokens that are too long or short", -> 157 | assert.same {"great"}, tokenize_text "a b c d e f g great eatingthebigriceball " 158 | 159 | it "strips numbers", -> 160 | assert.same {"delisho", "hodoc"}, tokenize_text "12 delisho hodocs for $5.99" 161 | 162 | it "uses custom tokenizer as classifier option", -> 163 | BaseClassifier = require "lapis.bayes.classifiers.base" 164 | c = BaseClassifier { 165 | tokenizer: require "lapis.bayes.tokenizers.url_domains" 166 | } 167 | 168 | assert.same {"leafo.net"}, c\tokenize_text "hello www.leafo.net website" 169 | 170 | it "users custom tokenize function", -> 171 | BaseClassifier = require "lapis.bayes.classifiers.base" 172 | c = BaseClassifier { 173 | tokenize_text: (text) -> 174 | [t for t in text\gmatch "."] 175 | } 176 | 177 | assert.same { 178 | "h", "e", "l", "l", "o" 179 | }, c\tokenize_text "hello" 180 | 181 | 182 | it "passes tokens through if already table", -> 183 | BaseClassifier = require "lapis.bayes.classifiers.base" 184 | c = BaseClassifier { } 185 | 186 | assert.same { "one", "two" }, c\tokenize_text {"one", "two"} 187 | 188 | 189 | describe "train_text", -> 190 | import train_text from require "lapis.bayes" 191 | 192 | before_each -> 193 | truncate_tables Categories, WordClassifications 194 | 195 | it "classifies a single string", -> 196 | train_text "spam", "hello this is spam, I love spam" 197 | assert.same 1, Categories\count! 198 | c = unpack Categories\select! 199 | assert.same "spam", c.name 200 | assert.same 3, WordClassifications\count! 201 | words = WordClassifications\select! 
202 | table.sort words, (a, b) -> 203 | a.word < b.word 204 | 205 | assert.same { 206 | { category_id: c.id, count: 1, word: "hello" } 207 | { category_id: c.id, count: 1, word: "love" } 208 | { category_id: c.id, count: 1, word: "spam" } 209 | }, words 210 | 211 | 212 | it "classifies multiple strings", -> 213 | train_text "spam", "hello this is spam, I love spam" 214 | train_text "ham", "there is ham here" 215 | train_text "spam", "eating spamming the regular stuff" 216 | train_text "ham","pigs create too much jam" 217 | 218 | it "uses custom tokenizer", -> 219 | train_text "spam", "cat eat foot", { 220 | tokenize_text: (str, opts) -> 221 | [c for c in str\gmatch "[^%s]"] 222 | } 223 | 224 | assert.same { 225 | t: 3 226 | f: 1 227 | o: 2 228 | a: 2 229 | c: 1 230 | e: 1 231 | }, {c.word, c.count for c in *WordClassifications\select!} 232 | 233 | describe "text_probabilities", -> 234 | import text_probabilities from require "lapis.bayes" 235 | 236 | before_each -> 237 | truncate_tables Categories, WordClassifications 238 | 239 | it "works when there is no data", -> 240 | Categories\create name: "spam" 241 | Categories\create name: "ham" 242 | 243 | assert.same { 244 | nil, "no words in text are classifyable" 245 | }, { 246 | text_probabilities {"spam", "ham"}, "hello world" 247 | } 248 | 249 | it "works when there is some data", -> 250 | spam = Categories\create name: "spam" 251 | spam\increment_words {"hello", "world"} 252 | 253 | ham = Categories\create name: "ham" 254 | ham\increment_words {"butt", "world"} 255 | 256 | probs, rate = text_probabilities {"spam", "ham"}, "butt zone" 257 | assert.same 0.5, rate 258 | -- normalize probs for easy specs 259 | probs = for p in *probs 260 | {p[1], math.floor p[2] * 100 + 0.5} 261 | 262 | assert.same { 263 | {"ham", 95} 264 | {"spam", 5} 265 | }, probs 266 | 267 | describe "models", -> 268 | before_each -> 269 | truncate_tables Categories, WordClassifications 270 | 271 | it "increment_words", -> 272 | spam = Categories\create name: "spam" 273 | count = spam\increment_words { 274 | "first token" 275 | "hello.world" 276 | "http://leafo.net" 277 | "hello.world" 278 | zone: 77 279 | } 280 | 281 | assert.same 81, count 282 | 283 | words = WordClassifications\select "order by word asc", fields: "category_id, word, count" 284 | 285 | assert.same { 286 | { 287 | category_id: spam.id 288 | count: 1 289 | word: "first token" 290 | } 291 | { 292 | category_id: spam.id 293 | count: 2 294 | word: "hello.world" 295 | }, 296 | { 297 | category_id: spam.id 298 | count: 1 299 | word: "http://leafo.net" 300 | }, 301 | { 302 | category_id: spam.id 303 | count: 77 304 | word: "zone" 305 | } 306 | }, words 307 | 308 | 309 | count = spam\increment_words { 310 | "hello.world" 311 | "hello.world" 312 | "zone" 313 | "hello.world": 3 314 | } 315 | 316 | assert.same 6, count 317 | 318 | words = WordClassifications\select "order by word asc", fields: "category_id, word, count" 319 | 320 | 321 | assert.same { 322 | { 323 | category_id: spam.id 324 | count: 1 325 | word: "first token" 326 | } 327 | { 328 | category_id: spam.id 329 | count: 7 330 | word: "hello.world" 331 | }, 332 | { 333 | category_id: spam.id 334 | count: 1 335 | word: "http://leafo.net" 336 | }, 337 | { 338 | category_id: spam.id 339 | count: 78 340 | word: "zone" 341 | } 342 | }, words 343 | -------------------------------------------------------------------------------- /spec/ngram_tokenizer_spec.moon: -------------------------------------------------------------------------------- 1 | NgramTokenizer = 
require "lapis.bayes.tokenizers.ngram" 2 | 3 | it_tokenizes = (label, input, expected_tokens, opts=nil) -> 4 | it "tokenizes #{label}", -> 5 | tokenizer = NgramTokenizer opts 6 | tokens = tokenizer\tokenize_text input 7 | assert.same expected_tokens, tokens, "Tokens for #{input\sub 1, 80}" 8 | 9 | describe "lapis.bayes.tokenizers.ngram", -> 10 | describe "basic tokenization", -> 11 | it_tokenizes "simple text with default bigrams", "hello world", { 12 | "he" 13 | "el" 14 | "ll" 15 | "lo" 16 | "wo" 17 | "or" 18 | "rl" 19 | "ld" 20 | } 21 | 22 | it_tokenizes "single word", "test", { 23 | "te" 24 | "es" 25 | "st" 26 | } 27 | 28 | it_tokenizes "multiple words", "cat dog fox", { 29 | "ca" 30 | "at" 31 | "do" 32 | "og" 33 | "fo" 34 | "ox" 35 | } 36 | 37 | describe "different n values", -> 38 | it_tokenizes "with unigrams (n=1)", "hello", { 39 | "h" 40 | "e" 41 | "l" 42 | "l" 43 | "o" 44 | }, { n: 1 } 45 | 46 | it_tokenizes "with trigrams (n=3)", "hello", { 47 | "hel" 48 | "ell" 49 | "llo" 50 | }, { n: 3 } 51 | 52 | it_tokenizes "with 4-grams (n=4)", "hello", { 53 | "hell" 54 | "ello" 55 | }, { n: 4 } 56 | 57 | it_tokenizes "with n=5 exact word length", "hello", { 58 | "hello" 59 | }, { n: 5 } 60 | 61 | it_tokenizes "with n=0 defaults to 1", "hi", { 62 | "h" 63 | "i" 64 | }, { n: 0 } 65 | 66 | it_tokenizes "with negative n defaults to 1", "hi", { 67 | "h" 68 | "i" 69 | }, { n: -5 } 70 | 71 | it_tokenizes "with fractional n gets floored", "test", { 72 | "te" 73 | "es" 74 | "st" 75 | }, { n: 2.7 } 76 | 77 | describe "word normalization", -> 78 | it_tokenizes "converts to lowercase", "Hello WORLD", { 79 | "he" 80 | "el" 81 | "ll" 82 | "lo" 83 | "wo" 84 | "or" 85 | "rl" 86 | "ld" 87 | } 88 | 89 | it_tokenizes "removes punctuation", "hello, world!", { 90 | "he" 91 | "el" 92 | "ll" 93 | "lo" 94 | "wo" 95 | "or" 96 | "rl" 97 | "ld" 98 | } 99 | 100 | it_tokenizes "handles mixed case and punctuation", "Hello, World!", { 101 | "he" 102 | "el" 103 | "ll" 104 | "lo" 105 | "wo" 106 | "or" 107 | "rl" 108 | "ld" 109 | } 110 | 111 | it_tokenizes "removes multiple spaces", "hello world", { 112 | "he" 113 | "el" 114 | "ll" 115 | "lo" 116 | "wo" 117 | "or" 118 | "rl" 119 | "ld" 120 | } 121 | 122 | it_tokenizes "strips punctuation from words", "don't can't won't", { 123 | "do" 124 | "on" 125 | "nt" 126 | "ca" 127 | "an" 128 | "nt" 129 | "wo" 130 | "on" 131 | "nt" 132 | } 133 | 134 | describe "ngram_size method", -> 135 | it "returns default n=2", -> 136 | tokenizer = NgramTokenizer! 137 | assert.equal 2, tokenizer\ngram_size! 138 | 139 | it "returns configured n", -> 140 | tokenizer = NgramTokenizer n: 3 141 | assert.equal 3, tokenizer\ngram_size! 142 | 143 | it "handles string n", -> 144 | tokenizer = NgramTokenizer n: "4" 145 | assert.equal 4, tokenizer\ngram_size! 146 | 147 | it "floors fractional n", -> 148 | tokenizer = NgramTokenizer n: 3.9 149 | assert.equal 3, tokenizer\ngram_size! 150 | 151 | it "returns 1 for invalid n", -> 152 | tokenizer = NgramTokenizer n: 0 153 | assert.equal 1, tokenizer\ngram_size! 154 | 155 | describe "normalize_word method", -> 156 | local tokenizer 157 | before_each -> 158 | tokenizer = NgramTokenizer! 159 | 160 | it "normalizes to lowercase", -> 161 | assert.equal "hello", tokenizer\normalize_word "HELLO" 162 | assert.equal "hello", tokenizer\normalize_word "Hello" 163 | 164 | it "removes punctuation", -> 165 | assert.equal "hello", tokenizer\normalize_word "hello!" 
166 | assert.equal "hello", tokenizer\normalize_word "hello," 167 | assert.equal "hello", tokenizer\normalize_word "hello..." 168 | 169 | it "removes whitespace", -> 170 | assert.equal "hello", tokenizer\normalize_word "hello " 171 | assert.equal "hello", tokenizer\normalize_word " hello" 172 | assert.equal "hello", tokenizer\normalize_word " hello " 173 | 174 | it "removes all punctuation and whitespace", -> 175 | assert.equal "hello", tokenizer\normalize_word " hello!!! " 176 | 177 | it "returns nil for empty string", -> 178 | assert.is_nil tokenizer\normalize_word "" 179 | 180 | it "returns nil for nil input", -> 181 | assert.is_nil tokenizer\normalize_word nil 182 | 183 | it "returns nil for whitespace only", -> 184 | assert.is_nil tokenizer\normalize_word " " 185 | 186 | it "returns nil for punctuation only", -> 187 | assert.is_nil tokenizer\normalize_word "!!!" 188 | 189 | describe "word_ngrams method", -> 190 | local tokenizer 191 | before_each -> 192 | tokenizer = NgramTokenizer! 193 | 194 | it "generates bigrams from word", -> 195 | ngrams = tokenizer\word_ngrams "hello", 2 196 | assert.same {"he", "el", "ll", "lo"}, ngrams 197 | 198 | it "generates trigrams from word", -> 199 | ngrams = tokenizer\word_ngrams "hello", 3 200 | assert.same {"hel", "ell", "llo"}, ngrams 201 | 202 | it "returns full word when length < n", -> 203 | ngrams = tokenizer\word_ngrams "hi", 3 204 | assert.same {"hi"}, ngrams 205 | 206 | it "returns full word when length == n", -> 207 | ngrams = tokenizer\word_ngrams "hi", 2 208 | assert.same {"hi"}, ngrams 209 | 210 | it "returns full word for empty string", -> 211 | ngrams = tokenizer\word_ngrams "", 2 212 | assert.same {""}, ngrams 213 | 214 | it "generates unigrams", -> 215 | ngrams = tokenizer\word_ngrams "cat", 1 216 | assert.same {"c", "a", "t"}, ngrams 217 | 218 | describe "number handling", -> 219 | it_tokenizes "ignores numbers by default", "hello 123 world 456", { 220 | "he" 221 | "el" 222 | "ll" 223 | "lo" 224 | "wo" 225 | "or" 226 | "rl" 227 | "ld" 228 | } 229 | 230 | it_tokenizes "includes numbers when ignore_numbers is false", "hello 123 world", { 231 | "he" 232 | "el" 233 | "ll" 234 | "lo" 235 | "12" 236 | "23" 237 | "wo" 238 | "or" 239 | "rl" 240 | "ld" 241 | }, { ignore_numbers: false } 242 | 243 | it_tokenizes "handles mixed alphanumeric", "abc123 def456", { 244 | "ab" 245 | "bc" 246 | "c1" 247 | "12" 248 | "23" 249 | "de" 250 | "ef" 251 | "f4" 252 | "45" 253 | "56" 254 | }, { ignore_numbers: false } 255 | 256 | describe "edge cases", -> 257 | it_tokenizes "empty string", "", {} 258 | 259 | it_tokenizes "only whitespace", " ", {} 260 | 261 | it_tokenizes "only punctuation", "!!???..", {} 262 | 263 | it_tokenizes "single character", "a", { 264 | "a" 265 | } 266 | 267 | it_tokenizes "two characters with bigrams", "ab", { 268 | "ab" 269 | } 270 | 271 | it_tokenizes "word longer than n", "testing", { 272 | "te" 273 | "es" 274 | "st" 275 | "ti" 276 | "in" 277 | "ng" 278 | } 279 | 280 | describe "unicode and international characters", -> 281 | it_tokenizes "accented characters", "café résumé", { 282 | "ca" 283 | "af" 284 | "fé" 285 | "ré" 286 | "és" 287 | "su" 288 | "um" 289 | "mé" 290 | } 291 | 292 | it_tokenizes "spanish text", "español niño", { 293 | "es" 294 | "sp" 295 | "pa" 296 | "añ" 297 | "ño" 298 | "ol" 299 | "ni" 300 | "iñ" 301 | "ño" 302 | } 303 | 304 | it_tokenizes "german umlauts", "über schön", { 305 | "üb" 306 | "be" 307 | "er" 308 | "sc" 309 | "ch" 310 | "hö" 311 | "ön" 312 | } 313 | 314 | it_tokenizes "french accents", "élève 
être", { 315 | "él" 316 | "lè" 317 | "èv" 318 | "ve" 319 | "êt" 320 | "tr" 321 | "re" 322 | } 323 | 324 | it_tokenizes "chinese characters", "你好世界", { 325 | "你好" 326 | "好世" 327 | "世界" 328 | } 329 | 330 | it_tokenizes "mixed english and chinese", "hello 世界 world", { 331 | "he" 332 | "el" 333 | "ll" 334 | "lo" 335 | "世界" 336 | "wo" 337 | "or" 338 | "rl" 339 | "ld" 340 | } 341 | 342 | describe "filter_text option", -> 343 | it_tokenizes "with custom text filter", "hello KEEP world", { 344 | "he" 345 | "el" 346 | "ll" 347 | "lo" 348 | "ke" 349 | "ee" 350 | "ep" 351 | "wo" 352 | "or" 353 | "rl" 354 | "ld" 355 | }, { 356 | filter_text: (text) -> text\gsub("KEEP", "keep") 357 | } 358 | 359 | it_tokenizes "filter that removes text", "hello remove world", { 360 | "he" 361 | "el" 362 | "ll" 363 | "lo" 364 | "wo" 365 | "or" 366 | "rl" 367 | "ld" 368 | }, { 369 | filter_text: (text) -> text\gsub("remove", "") 370 | } 371 | 372 | it "returns empty when filter returns empty", -> 373 | tokenizer = NgramTokenizer { 374 | filter_text: (text) -> "" 375 | } 376 | tokens = tokenizer\tokenize_text "hello world" 377 | assert.same {}, tokens 378 | 379 | it "returns empty when filter returns nil", -> 380 | tokenizer = NgramTokenizer { 381 | filter_text: (text) -> nil 382 | } 383 | tokens = tokenizer\tokenize_text "hello world" 384 | assert.same {}, tokens 385 | 386 | describe "filter_tokens option", -> 387 | it "with custom token filter", -> 388 | tokenizer = NgramTokenizer { 389 | filter_tokens: (tokens, opts) -> 390 | filtered = {} 391 | for token in *tokens 392 | if token != "el" 393 | table.insert filtered, token 394 | filtered 395 | } 396 | tokens = tokenizer\tokenize_text "hello" 397 | assert.same {"he", "ll", "lo"}, tokens 398 | 399 | it "filter can modify tokens", -> 400 | tokenizer = NgramTokenizer { 401 | filter_tokens: (tokens, opts) -> 402 | modified = {} 403 | for token in *tokens 404 | table.insert modified, "prefix:#{token}" 405 | modified 406 | } 407 | tokens = tokenizer\tokenize_text "hi" 408 | assert.same {"prefix:hi"}, tokens 409 | 410 | it "filter receives opts parameter", -> 411 | received_opts = nil 412 | tokenizer = NgramTokenizer { 413 | n: 3 414 | filter_tokens: (tokens, opts) -> 415 | received_opts = opts 416 | tokens 417 | } 418 | tokenizer\tokenize_text "test" 419 | assert.is_not_nil received_opts 420 | assert.equal 3, received_opts.n 421 | 422 | describe "comprehensive examples", -> 423 | it_tokenizes "sentence with mixed content", "The quick brown fox jumps!", { 424 | "th" 425 | "he" 426 | "qu" 427 | "ui" 428 | "ic" 429 | "ck" 430 | "br" 431 | "ro" 432 | "ow" 433 | "wn" 434 | "fo" 435 | "ox" 436 | "ju" 437 | "um" 438 | "mp" 439 | "ps" 440 | } 441 | 442 | it_tokenizes "with trigrams on real text", "testing ngrams", { 443 | "tes" 444 | "est" 445 | "sti" 446 | "tin" 447 | "ing" 448 | "ngr" 449 | "gra" 450 | "ram" 451 | "ams" 452 | }, { n: 3 } 453 | 454 | it_tokenizes "real world example", "Machine Learning is amazing!", { 455 | "ma" 456 | "ac" 457 | "ch" 458 | "hi" 459 | "in" 460 | "ne" 461 | "le" 462 | "ea" 463 | "ar" 464 | "rn" 465 | "ni" 466 | "in" 467 | "ng" 468 | "is" 469 | "am" 470 | "ma" 471 | "az" 472 | "zi" 473 | "in" 474 | "ng" 475 | } 476 | 477 | describe "build_grammar", -> 478 | it "grammar parses words", -> 479 | tokenizer = NgramTokenizer! 480 | grammar = tokenizer\build_grammar! 481 | words = grammar\match "hello world test" 482 | assert.same {"hello", "world", "test"}, words 483 | 484 | it "grammar handles punctuation", -> 485 | tokenizer = NgramTokenizer! 
486 | grammar = tokenizer\build_grammar! 487 | words = grammar\match "hello, world! test?" 488 | assert.same {"hello,", "world!", "test?"}, words 489 | 490 | it "grammar handles multiple spaces", -> 491 | tokenizer = NgramTokenizer! 492 | grammar = tokenizer\build_grammar! 493 | words = grammar\match "hello world" 494 | assert.same {"hello", "world"}, words 495 | 496 | it "grammar handles tabs and newlines", -> 497 | tokenizer = NgramTokenizer! 498 | grammar = tokenizer\build_grammar! 499 | words = grammar\match "hello\tworld\ntest" 500 | assert.same {"hello", "world", "test"}, words 501 | -------------------------------------------------------------------------------- /lapis/bayes/text/stem.lua: -------------------------------------------------------------------------------- 1 | local is_vowel 2 | is_vowel = function(char) 3 | if not (char) then 4 | return false 5 | end 6 | char = char:lower() 7 | return char == 'a' or char == 'e' or char == 'i' or char == 'o' or char == 'u' or char == 'y' 8 | end 9 | local is_consonant 10 | is_consonant = function(char) 11 | if not (char) then 12 | return false 13 | end 14 | return not is_vowel(char) 15 | end 16 | local is_vowel_wxy 17 | is_vowel_wxy = function(char) 18 | if not (char) then 19 | return false 20 | end 21 | char = char:lower() 22 | return char == 'a' or char == 'e' or char == 'i' or char == 'o' or char == 'u' or char == 'y' or char == 'w' or char == 'x' 23 | end 24 | local is_valid_li 25 | is_valid_li = function(char) 26 | if not (char) then 27 | return false 28 | end 29 | char = char:lower() 30 | return char == 'c' or char == 'd' or char == 'e' or char == 'g' or char == 'h' or char == 'k' or char == 'm' or char == 'n' or char == 'r' or char == 't' 31 | end 32 | local ends_with 33 | ends_with = function(word, suffix) 34 | if #word < #suffix then 35 | return false 36 | end 37 | return word:sub(-#suffix) == suffix 38 | end 39 | local contains_vowel 40 | contains_vowel = function(word) 41 | for i = 1, #word do 42 | if is_vowel(word:sub(i, i)) then 43 | return true 44 | end 45 | end 46 | return false 47 | end 48 | local replace_suffix 49 | replace_suffix = function(word, suffix, replacement) 50 | if ends_with(word, suffix) then 51 | return word:sub(1, #word - #suffix) .. 
replacement 52 | else 53 | return word 54 | end 55 | end 56 | local get_suffix 57 | get_suffix = function(word, pos) 58 | if pos > #word then 59 | return "" 60 | end 61 | return word:sub(pos) 62 | end 63 | local find_r1 64 | find_r1 = function(word) 65 | if word:sub(1, 5) == "gener" then 66 | return 6 67 | elseif word:sub(1, 6) == "commun" then 68 | return 7 69 | elseif word:sub(1, 5) == "arsen" then 70 | return 6 71 | elseif word:sub(1, 4) == "past" then 72 | return 5 73 | elseif word:sub(1, 7) == "univers" then 74 | return 8 75 | elseif word:sub(1, 5) == "later" then 76 | return 6 77 | elseif word:sub(1, 5) == "emerg" then 78 | return 6 79 | elseif word:sub(1, 5) == "organ" then 80 | return 6 81 | end 82 | for i = 1, #word - 1 do 83 | if is_vowel(word:sub(i, i)) and is_consonant(word:sub(i + 1, i + 1)) then 84 | return i + 2 85 | end 86 | end 87 | return #word + 1 88 | end 89 | local find_r2 90 | find_r2 = function(word) 91 | local r1_pos = find_r1(word) 92 | if r1_pos > #word then 93 | return #word + 1 94 | end 95 | for i = r1_pos, #word - 1 do 96 | if is_vowel(word:sub(i, i)) and is_consonant(word:sub(i + 1, i + 1)) then 97 | return i + 2 98 | end 99 | end 100 | return #word + 1 101 | end 102 | local in_r1 103 | in_r1 = function(word, pos) 104 | local r1 = find_r1(word) 105 | return pos >= r1 106 | end 107 | local in_r2 108 | in_r2 = function(word, pos) 109 | local r2 = find_r2(word) 110 | return pos >= r2 111 | end 112 | local is_short_syllable_at 113 | is_short_syllable_at = function(word, pos) 114 | if pos < 1 or pos > #word then 115 | return false 116 | end 117 | local char = word:sub(pos, pos) 118 | if not (is_vowel(char)) then 119 | return false 120 | end 121 | if pos == 1 then 122 | if #word > 1 then 123 | local next_char = word:sub(2, 2) 124 | return is_consonant(next_char) 125 | end 126 | return false 127 | end 128 | if pos < #word then 129 | local prev_char = word:sub(pos - 1, pos - 1) 130 | local next_char = word:sub(pos + 1, pos + 1) 131 | if is_consonant(prev_char) and is_consonant(next_char) then 132 | local next_lower = next_char:lower() 133 | return next_lower ~= 'w' and next_lower ~= 'x' and next_char ~= 'Y' 134 | end 135 | end 136 | return false 137 | end 138 | local ends_with_short_syllable 139 | ends_with_short_syllable = function(word) 140 | if #word < 2 then 141 | return false 142 | end 143 | if #word == 2 then 144 | return is_vowel(word:sub(1, 1)) and is_consonant(word:sub(2, 2)) 145 | end 146 | if #word >= 3 then 147 | local c1 = word:sub(-3, -3) 148 | local c2 = word:sub(-2, -2) 149 | local c3 = word:sub(-1, -1) 150 | if is_consonant(c1) and is_vowel(c2) and is_consonant(c3) then 151 | local c3_lower = c3:lower() 152 | return c3_lower ~= 'w' and c3_lower ~= 'x' and c3 ~= 'Y' 153 | end 154 | end 155 | return false 156 | end 157 | local is_short_word 158 | is_short_word = function(word) 159 | local r1 = find_r1(word) 160 | if r1 > #word then 161 | return true 162 | end 163 | if r1 == #word + 1 and ends_with_short_syllable(word) then 164 | return true 165 | end 166 | return false 167 | end 168 | local prelude 169 | prelude = function(word) 170 | if #word == 0 then 171 | return word 172 | end 173 | if word:sub(1, 1) == "'" then 174 | word = word:sub(2) 175 | end 176 | local result = { } 177 | local y_found = false 178 | for i = 1, #word do 179 | local char = word:sub(i, i) 180 | if char == 'y' then 181 | if i == 1 then 182 | table.insert(result, 'Y') 183 | y_found = true 184 | elseif i > 1 and is_vowel(word:sub(i - 1, i - 1)) then 185 | table.insert(result, 'Y') 
186 | y_found = true 187 | else 188 | table.insert(result, char) 189 | end 190 | else 191 | table.insert(result, char) 192 | end 193 | end 194 | return table.concat(result), y_found 195 | end 196 | local postlude 197 | postlude = function(word, y_found) 198 | if not (y_found) then 199 | return word 200 | end 201 | return word:gsub('Y', 'y') 202 | end 203 | local exception1 204 | exception1 = function(word) 205 | local exceptions = { 206 | skis = "ski", 207 | skies = "sky", 208 | idly = "idl", 209 | gently = "gentl", 210 | ugly = "ugli", 211 | early = "earli", 212 | only = "onli", 213 | singly = "singl", 214 | sky = "sky", 215 | news = "news", 216 | howe = "howe", 217 | atlas = "atlas", 218 | cosmos = "cosmos", 219 | bias = "bias", 220 | andes = "andes" 221 | } 222 | return exceptions[word] 223 | end 224 | local step_1a 225 | step_1a = function(word) 226 | if ends_with(word, "'s'") then 227 | return word:sub(1, -4) 228 | elseif ends_with(word, "'s") then 229 | return word:sub(1, -3) 230 | elseif ends_with(word, "'") then 231 | return word:sub(1, -2) 232 | end 233 | if ends_with(word, "sses") then 234 | return replace_suffix(word, "sses", "ss") 235 | end 236 | if ends_with(word, "ied") then 237 | if #word > 4 then 238 | return replace_suffix(word, "ied", "i") 239 | else 240 | return replace_suffix(word, "ied", "ie") 241 | end 242 | end 243 | if ends_with(word, "ies") then 244 | if #word > 4 then 245 | return replace_suffix(word, "ies", "i") 246 | else 247 | return replace_suffix(word, "ies", "ie") 248 | end 249 | end 250 | if ends_with(word, "s") and not ends_with(word, "us") and not ends_with(word, "ss") then 251 | local stem = word:sub(1, -2) 252 | if contains_vowel(stem) then 253 | return stem 254 | end 255 | end 256 | return word 257 | end 258 | local step_1b 259 | step_1b = function(word) 260 | if ends_with(word, "eedly") then 261 | local stem = word:sub(1, -6) 262 | if in_r1(word, #stem + 1) then 263 | if ends_with(stem, "proc") or ends_with(stem, "exc") or ends_with(stem, "succ") then 264 | return word 265 | end 266 | return stem .. "ee" 267 | end 268 | return word 269 | end 270 | if ends_with(word, "eed") then 271 | local stem = word:sub(1, -4) 272 | if in_r1(word, #stem + 1) then 273 | if ends_with(stem, "proc") or ends_with(stem, "exc") or ends_with(stem, "succ") then 274 | return word 275 | end 276 | return stem .. "ee" 277 | end 278 | return word 279 | end 280 | local suffix_removed = false 281 | local stem = word 282 | if ends_with(word, "ingly") then 283 | stem = word:sub(1, -6) 284 | suffix_removed = true 285 | elseif ends_with(word, "edly") then 286 | stem = word:sub(1, -5) 287 | suffix_removed = true 288 | elseif ends_with(word, "ing") then 289 | stem = word:sub(1, -4) 290 | suffix_removed = true 291 | elseif ends_with(word, "ed") then 292 | stem = word:sub(1, -3) 293 | suffix_removed = true 294 | end 295 | if suffix_removed then 296 | if not (contains_vowel(stem)) then 297 | return word 298 | end 299 | if ends_with(word, "ing") then 300 | if ends_with(stem, "y") and #stem > 1 then 301 | local prev = stem:sub(-2, -2) 302 | if is_consonant(prev) and #stem == 2 then 303 | return stem:sub(1, -2) .. "ie" 304 | end 305 | end 306 | if ends_with(stem, "inn") or ends_with(stem, "out") or ends_with(stem, "cann") or ends_with(stem, "herr") or ends_with(stem, "earr") or ends_with(stem, "even") then 307 | return word 308 | end 309 | end 310 | if ends_with(stem, "at") or ends_with(stem, "bl") or ends_with(stem, "iz") then 311 | return stem .. 
"e" 312 | end 313 | if #stem >= 2 then 314 | local last = stem:sub(-1, -1) 315 | local prev = stem:sub(-2, -2) 316 | if last == prev and is_consonant(last) then 317 | local last_lower = last:lower() 318 | if not (last_lower == 'a' or last_lower == 'e' or last_lower == 'o') then 319 | if last_lower == 'b' or last_lower == 'd' or last_lower == 'f' or last_lower == 'g' or last_lower == 'm' or last_lower == 'n' or last_lower == 'p' or last_lower == 'r' or last_lower == 't' then 320 | return stem:sub(1, -2) 321 | end 322 | end 323 | end 324 | end 325 | if in_r1(word, #stem + 1) and ends_with_short_syllable(stem) then 326 | return stem .. "e" 327 | end 328 | return stem 329 | end 330 | return word 331 | end 332 | local step_1c 333 | step_1c = function(word) 334 | if #word > 2 then 335 | local last = word:sub(-1, -1) 336 | local prev = word:sub(-2, -2) 337 | if (last == 'y' or last == 'Y') and is_consonant(prev) then 338 | return word:sub(1, -2) .. "i" 339 | end 340 | end 341 | return word 342 | end 343 | local step_2 344 | step_2 = function(word) 345 | local mappings = { 346 | { 347 | "ational", 348 | "ate" 349 | }, 350 | { 351 | "tional", 352 | "tion" 353 | }, 354 | { 355 | "enci", 356 | "ence" 357 | }, 358 | { 359 | "anci", 360 | "ance" 361 | }, 362 | { 363 | "abli", 364 | "able" 365 | }, 366 | { 367 | "entli", 368 | "ent" 369 | }, 370 | { 371 | "ization", 372 | "ize" 373 | }, 374 | { 375 | "izer", 376 | "ize" 377 | }, 378 | { 379 | "ation", 380 | "ate" 381 | }, 382 | { 383 | "ator", 384 | "ate" 385 | }, 386 | { 387 | "alism", 388 | "al" 389 | }, 390 | { 391 | "aliti", 392 | "al" 393 | }, 394 | { 395 | "alli", 396 | "al" 397 | }, 398 | { 399 | "fulness", 400 | "ful" 401 | }, 402 | { 403 | "ousli", 404 | "ous" 405 | }, 406 | { 407 | "ousness", 408 | "ous" 409 | }, 410 | { 411 | "iveness", 412 | "ive" 413 | }, 414 | { 415 | "iviti", 416 | "ive" 417 | }, 418 | { 419 | "biliti", 420 | "ble" 421 | }, 422 | { 423 | "bli", 424 | "ble" 425 | }, 426 | { 427 | "fulli", 428 | "ful" 429 | }, 430 | { 431 | "lessli", 432 | "less" 433 | } 434 | } 435 | for _index_0 = 1, #mappings do 436 | local pair = mappings[_index_0] 437 | local suffix, replacement = pair[1], pair[2] 438 | if ends_with(word, suffix) then 439 | local stem = word:sub(1, #word - #suffix) 440 | if in_r1(word, #stem + 1) then 441 | return stem .. replacement 442 | end 443 | end 444 | end 445 | if ends_with(word, "ogi") then 446 | local stem = word:sub(1, -4) 447 | if in_r1(word, #stem + 1) and ends_with(stem, "l") then 448 | return stem .. "og" 449 | end 450 | end 451 | if ends_with(word, "li") then 452 | local stem = word:sub(1, -3) 453 | if in_r1(word, #stem + 1) and #stem > 0 then 454 | local last = stem:sub(-1, -1) 455 | if is_valid_li(last) then 456 | return stem 457 | end 458 | end 459 | end 460 | if ends_with(word, "ogist") then 461 | local stem = word:sub(1, -5) 462 | if in_r1(word, #stem + 1) then 463 | return stem .. 
"og" 464 | end 465 | end 466 | return word 467 | end 468 | local step_3 469 | step_3 = function(word) 470 | local mappings = { 471 | { 472 | "ational", 473 | "ate" 474 | }, 475 | { 476 | "tional", 477 | "tion" 478 | }, 479 | { 480 | "alize", 481 | "al" 482 | }, 483 | { 484 | "icate", 485 | "ic" 486 | }, 487 | { 488 | "iciti", 489 | "ic" 490 | }, 491 | { 492 | "ical", 493 | "ic" 494 | }, 495 | { 496 | "ful", 497 | "" 498 | }, 499 | { 500 | "ness", 501 | "" 502 | } 503 | } 504 | for _index_0 = 1, #mappings do 505 | local pair = mappings[_index_0] 506 | local suffix, replacement = pair[1], pair[2] 507 | if ends_with(word, suffix) then 508 | local stem = word:sub(1, #word - #suffix) 509 | if in_r1(word, #stem + 1) then 510 | return stem .. replacement 511 | end 512 | end 513 | end 514 | if ends_with(word, "ative") then 515 | local stem = word:sub(1, -6) 516 | if in_r2(word, #stem + 1) then 517 | return stem 518 | end 519 | end 520 | return word 521 | end 522 | local step_4 523 | step_4 = function(word) 524 | local suffixes = { 525 | "al", 526 | "ance", 527 | "ence", 528 | "er", 529 | "ic", 530 | "able", 531 | "ible", 532 | "ant", 533 | "ement", 534 | "ment", 535 | "ent", 536 | "ism", 537 | "ate", 538 | "iti", 539 | "ous", 540 | "ive", 541 | "ize" 542 | } 543 | for _index_0 = 1, #suffixes do 544 | local suffix = suffixes[_index_0] 545 | if ends_with(word, suffix) then 546 | local stem = word:sub(1, #word - #suffix) 547 | if in_r2(word, #stem + 1) then 548 | return stem 549 | end 550 | end 551 | end 552 | if ends_with(word, "ion") then 553 | local stem = word:sub(1, -4) 554 | if in_r2(word, #stem + 1) and #stem > 0 then 555 | local last = stem:sub(-1, -1) 556 | if last == 's' or last == 't' then 557 | return stem 558 | end 559 | end 560 | end 561 | return word 562 | end 563 | local step_5 564 | step_5 = function(word) 565 | if ends_with(word, "e") then 566 | local stem = word:sub(1, -2) 567 | if in_r2(word, #stem + 1) then 568 | return stem 569 | end 570 | if in_r1(word, #stem + 1) and not ends_with_short_syllable(stem) then 571 | return stem 572 | end 573 | end 574 | if ends_with(word, "ll") and in_r2(word, #word) then 575 | return word:sub(1, -2) 576 | end 577 | return word 578 | end 579 | local stem_word 580 | stem_word = function(word) 581 | if not (word and type(word) == "string") then 582 | return word 583 | end 584 | if #word < 3 then 585 | return word 586 | end 587 | word = word:lower() 588 | local exception = exception1(word) 589 | if exception then 590 | return exception 591 | end 592 | if #word < 3 then 593 | return word 594 | end 595 | local y_found 596 | word, y_found = prelude(word) 597 | word = step_1a(word) 598 | word = step_1b(word) 599 | word = step_1c(word) 600 | word = step_2(word) 601 | word = step_3(word) 602 | word = step_4(word) 603 | word = step_5(word) 604 | word = postlude(word, y_found) 605 | return word 606 | end 607 | return { 608 | stem_word = stem_word 609 | } 610 | -------------------------------------------------------------------------------- /lapis/bayes/text/stem.moon: -------------------------------------------------------------------------------- 1 | -- Porter Stemmer implementation in MoonScript 2 | -- Based on the Snowball English stemmer algorithm 3 | -- https://github.com/snowballstem/snowball/blob/master/algorithms/english.sbl 4 | -- 5 | -- This implementation is derived from the Snowball stemming algorithms 6 | -- Copyright (c) 2001, Dr Martin Porter 7 | -- Copyright (c) 2004,2005, Richard Boulton 8 | -- Copyright (c) 2013, Yoshiki Shibukawa 9 | -- 
Copyright (c) 2006,2007,2009,2010,2011,2014-2019, Olly Betts 10 | -- All rights reserved. 11 | -- 12 | -- Redistribution and use in source and binary forms, with or without 13 | -- modification, are permitted provided that the following conditions 14 | -- are met: 15 | -- 16 | -- 1. Redistributions of source code must retain the above copyright notice, 17 | -- this list of conditions and the following disclaimer. 18 | -- 2. Redistributions in binary form must reproduce the above copyright notice, 19 | -- this list of conditions and the following disclaimer in the documentation 20 | -- and/or other materials provided with the distribution. 21 | -- 3. Neither the name of the Snowball project nor the names of its contributors 22 | -- may be used to endorse or promote products derived from this software 23 | -- without specific prior written permission. 24 | -- 25 | -- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 26 | -- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 27 | -- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 28 | -- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 29 | -- ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 30 | -- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 31 | -- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 32 | -- ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 33 | -- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 34 | -- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 35 | 36 | -- Character group definitions 37 | is_vowel = (char) -> 38 | return false unless char 39 | char = char\lower! 40 | char == 'a' or char == 'e' or char == 'i' or char == 'o' or char == 'u' or char == 'y' 41 | 42 | is_consonant = (char) -> 43 | return false unless char 44 | not is_vowel char 45 | 46 | is_vowel_wxy = (char) -> 47 | return false unless char 48 | char = char\lower! 49 | char == 'a' or char == 'e' or char == 'i' or char == 'o' or char == 'u' or char == 'y' or char == 'w' or char == 'x' 50 | 51 | is_valid_li = (char) -> 52 | return false unless char 53 | char = char\lower! 54 | char == 'c' or char == 'd' or char == 'e' or char == 'g' or char == 'h' or char == 'k' or char == 'm' or char == 'n' or char == 'r' or char == 't' 55 | 56 | -- String utility functions 57 | ends_with = (word, suffix) -> 58 | return false if #word < #suffix 59 | word\sub(-#suffix) == suffix 60 | 61 | contains_vowel = (word) -> 62 | for i = 1, #word 63 | return true if is_vowel word\sub(i, i) 64 | false 65 | 66 | -- Replace suffix with replacement 67 | replace_suffix = (word, suffix, replacement) -> 68 | if ends_with word, suffix 69 | word\sub(1, #word - #suffix) .. 
replacement 70 | else 71 | word 72 | 73 | -- Get suffix starting at position 74 | get_suffix = (word, pos) -> 75 | return "" if pos > #word 76 | word\sub pos 77 | 78 | -- Region detection 79 | -- Find R1: the region after the first non-vowel following a vowel 80 | find_r1 = (word) -> 81 | -- Special handling for common prefixes 82 | if word\sub(1, 5) == "gener" 83 | return 6 84 | elseif word\sub(1, 6) == "commun" 85 | return 7 86 | elseif word\sub(1, 5) == "arsen" 87 | return 6 88 | elseif word\sub(1, 4) == "past" 89 | return 5 90 | elseif word\sub(1, 7) == "univers" 91 | return 8 92 | elseif word\sub(1, 5) == "later" 93 | return 6 94 | elseif word\sub(1, 5) == "emerg" 95 | return 6 96 | elseif word\sub(1, 5) == "organ" 97 | return 6 98 | 99 | -- Standard R1 detection: find first V followed by NV 100 | for i = 1, #word - 1 101 | if is_vowel(word\sub(i, i)) and is_consonant(word\sub(i + 1, i + 1)) 102 | return i + 2 103 | 104 | #word + 1 105 | 106 | -- Find R2: the region after the first non-vowel following a vowel in R1 107 | find_r2 = (word) -> 108 | r1_pos = find_r1 word 109 | return #word + 1 if r1_pos > #word 110 | 111 | -- Find V followed by NV in R1 112 | for i = r1_pos, #word - 1 113 | if is_vowel(word\sub(i, i)) and is_consonant(word\sub(i + 1, i + 1)) 114 | return i + 2 115 | 116 | #word + 1 117 | 118 | -- Test if position is at R1 119 | in_r1 = (word, pos) -> 120 | r1 = find_r1 word 121 | pos >= r1 122 | 123 | -- Test if position is at R2 124 | in_r2 = (word, pos) -> 125 | r2 = find_r2 word 126 | pos >= r2 127 | 128 | -- Test for short syllable 129 | -- A short syllable is either (a) a vowel followed by a non-vowel other than w, x or Y 130 | -- and preceded by a non-vowel, or (b) a vowel at the beginning of the word followed 131 | -- by a non-vowel. 132 | is_short_syllable_at = (word, pos) -> 133 | return false if pos < 1 or pos > #word 134 | 135 | char = word\sub(pos, pos) 136 | return false unless is_vowel char 137 | 138 | if pos == 1 139 | -- Case (b): vowel at beginning followed by non-vowel 140 | if #word > 1 141 | next_char = word\sub(2, 2) 142 | return is_consonant next_char 143 | return false 144 | 145 | -- Case (a): non-vowel, vowel, non-vowel (not w,x,Y) 146 | if pos < #word 147 | prev_char = word\sub(pos - 1, pos - 1) 148 | next_char = word\sub(pos + 1, pos + 1) 149 | 150 | if is_consonant(prev_char) and is_consonant(next_char) 151 | next_lower = next_char\lower! 152 | return next_lower != 'w' and next_lower != 'x' and next_char != 'Y' 153 | 154 | false 155 | 156 | -- Test if word ends with short syllable 157 | ends_with_short_syllable = (word) -> 158 | return false if #word < 2 159 | 160 | -- Check last two characters for pattern 161 | if #word == 2 162 | return is_vowel(word\sub(1, 1)) and is_consonant(word\sub(2, 2)) 163 | 164 | -- Check last three characters for non-vowel, vowel, non-vowel (not w,x,Y) 165 | if #word >= 3 166 | c1 = word\sub(-3, -3) 167 | c2 = word\sub(-2, -2) 168 | c3 = word\sub(-1, -1) 169 | 170 | if is_consonant(c1) and is_vowel(c2) and is_consonant(c3) 171 | c3_lower = c3\lower! 
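      -- e.g. "hop" and "trap" reach this branch and return true; "bestow" fails
      -- the w/x/Y check below; "uproot" never enters it because the third-to-last
      -- character is a vowel (illustrative cases, matching the Snowball definition above)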
172 | return c3_lower != 'w' and c3_lower != 'x' and c3 != 'Y' 173 | 174 | false 175 | 176 | -- Test for short word: word is short if it consists of a short syllable 177 | -- and nothing else, or if R1 is null 178 | is_short_word = (word) -> 179 | r1 = find_r1 word 180 | return true if r1 > #word 181 | 182 | -- Also check if ends with short syllable at beginning of R1 183 | if r1 == #word + 1 and ends_with_short_syllable word 184 | return true 185 | 186 | false 187 | 188 | -- Prelude: handle initial Y and y after vowel 189 | prelude = (word) -> 190 | return word if #word == 0 191 | 192 | -- Remove initial apostrophe 193 | word = word\sub(2) if word\sub(1, 1) == "'" 194 | 195 | result = {} 196 | y_found = false 197 | 198 | for i = 1, #word 199 | char = word\sub(i, i) 200 | 201 | if char == 'y' 202 | -- Convert to Y if at beginning or after vowel 203 | if i == 1 204 | table.insert result, 'Y' 205 | y_found = true 206 | elseif i > 1 and is_vowel(word\sub(i - 1, i - 1)) 207 | table.insert result, 'Y' 208 | y_found = true 209 | else 210 | table.insert result, char 211 | else 212 | table.insert result, char 213 | 214 | table.concat(result), y_found 215 | 216 | -- Postlude: convert Y back to y 217 | postlude = (word, y_found) -> 218 | return word unless y_found 219 | word\gsub('Y', 'y') 220 | 221 | -- Exception list 1: special cases 222 | exception1 = (word) -> 223 | exceptions = { 224 | skis: "ski" 225 | skies: "sky" 226 | idly: "idl" 227 | gently: "gentl" 228 | ugly: "ugli" 229 | early: "earli" 230 | only: "onli" 231 | singly: "singl" 232 | sky: "sky" 233 | news: "news" 234 | howe: "howe" 235 | atlas: "atlas" 236 | cosmos: "cosmos" 237 | bias: "bias" 238 | andes: "andes" 239 | } 240 | 241 | exceptions[word] 242 | 243 | -- Step 1a: handle plural forms 244 | step_1a = (word) -> 245 | -- Handle apostrophe forms 246 | if ends_with word, "'s'" 247 | return word\sub(1, -4) 248 | elseif ends_with word, "'s" 249 | return word\sub(1, -3) 250 | elseif ends_with word, "'" 251 | return word\sub(1, -2) 252 | 253 | -- Handle sses -> ss 254 | if ends_with word, "sses" 255 | return replace_suffix word, "sses", "ss" 256 | 257 | -- Handle ied, ies 258 | if ends_with word, "ied" 259 | if #word > 4 260 | return replace_suffix word, "ied", "i" 261 | else 262 | return replace_suffix word, "ied", "ie" 263 | 264 | if ends_with word, "ies" 265 | if #word > 4 266 | return replace_suffix word, "ies", "i" 267 | else 268 | return replace_suffix word, "ies", "ie" 269 | 270 | -- Handle s (but not us or ss) 271 | if ends_with(word, "s") and not ends_with(word, "us") and not ends_with(word, "ss") 272 | -- Only remove s if preceded by vowel somewhere in word 273 | stem = word\sub(1, -2) 274 | if contains_vowel stem 275 | return stem 276 | 277 | word 278 | 279 | -- Step 1b: handle ed, ing, eed forms 280 | step_1b = (word) -> 281 | -- Handle eed, eedly 282 | if ends_with word, "eedly" 283 | stem = word\sub(1, -6) 284 | if in_r1 word, #stem + 1 285 | -- Check for special cases 286 | if ends_with(stem, "proc") or ends_with(stem, "exc") or ends_with(stem, "succ") 287 | return word 288 | return stem .. "ee" 289 | return word 290 | 291 | if ends_with word, "eed" 292 | stem = word\sub(1, -4) 293 | if in_r1 word, #stem + 1 294 | if ends_with(stem, "proc") or ends_with(stem, "exc") or ends_with(stem, "succ") 295 | return word 296 | return stem .. 
"ee" 297 | return word 298 | 299 | -- Handle ed, edly, ing, ingly 300 | suffix_removed = false 301 | stem = word 302 | 303 | if ends_with word, "ingly" 304 | stem = word\sub(1, -6) 305 | suffix_removed = true 306 | elseif ends_with word, "edly" 307 | stem = word\sub(1, -5) 308 | suffix_removed = true 309 | elseif ends_with word, "ing" 310 | stem = word\sub(1, -4) 311 | suffix_removed = true 312 | elseif ends_with word, "ed" 313 | stem = word\sub(1, -3) 314 | suffix_removed = true 315 | 316 | if suffix_removed 317 | -- Only proceed if stem contains vowel 318 | return word unless contains_vowel stem 319 | 320 | -- Special handling for ing forms 321 | if ends_with word, "ing" 322 | -- dying -> die, lying -> lie, tying -> tie 323 | if ends_with(stem, "y") and #stem > 1 324 | prev = stem\sub(-2, -2) 325 | if is_consonant(prev) and #stem == 2 326 | return stem\sub(1, -2) .. "ie" 327 | 328 | -- inning, outing, canning stay as is 329 | if ends_with(stem, "inn") or ends_with(stem, "out") or ends_with(stem, "cann") or ends_with(stem, "herr") or ends_with(stem, "earr") or ends_with(stem, "even") 330 | return word 331 | 332 | -- Post-processing based on stem ending 333 | if ends_with(stem, "at") or ends_with(stem, "bl") or ends_with(stem, "iz") 334 | return stem .. "e" 335 | 336 | -- Handle double consonants (not aeo) 337 | if #stem >= 2 338 | last = stem\sub(-1, -1) 339 | prev = stem\sub(-2, -2) 340 | if last == prev and is_consonant(last) 341 | last_lower = last\lower! 342 | unless last_lower == 'a' or last_lower == 'e' or last_lower == 'o' 343 | -- Remove one of the double consonants (but check for special cases) 344 | if last_lower == 'b' or last_lower == 'd' or last_lower == 'f' or last_lower == 'g' or last_lower == 'm' or last_lower == 'n' or last_lower == 'p' or last_lower == 'r' or last_lower == 't' 345 | return stem\sub(1, -2) 346 | 347 | -- If R1 is null and ends with short syllable, add e 348 | if in_r1(word, #stem + 1) and ends_with_short_syllable stem 349 | return stem .. "e" 350 | 351 | return stem 352 | 353 | word 354 | 355 | -- Step 1c: replace suffix y or Y by i if preceded by non-vowel which is not at the beginning 356 | step_1c = (word) -> 357 | if #word > 2 358 | last = word\sub(-1, -1) 359 | prev = word\sub(-2, -2) 360 | 361 | if (last == 'y' or last == 'Y') and is_consonant(prev) 362 | return word\sub(1, -2) .. "i" 363 | 364 | word 365 | 366 | -- Step 2: suffix removal for derivational suffixes 367 | step_2 = (word) -> 368 | mappings = { 369 | {"ational", "ate"} 370 | {"tional", "tion"} 371 | {"enci", "ence"} 372 | {"anci", "ance"} 373 | {"abli", "able"} 374 | {"entli", "ent"} 375 | {"ization", "ize"} 376 | {"izer", "ize"} 377 | {"ation", "ate"} 378 | {"ator", "ate"} 379 | {"alism", "al"} 380 | {"aliti", "al"} 381 | {"alli", "al"} 382 | {"fulness", "ful"} 383 | {"ousli", "ous"} 384 | {"ousness", "ous"} 385 | {"iveness", "ive"} 386 | {"iviti", "ive"} 387 | {"biliti", "ble"} 388 | {"bli", "ble"} 389 | {"fulli", "ful"} 390 | {"lessli", "less"} 391 | } 392 | 393 | for pair in *mappings 394 | suffix, replacement = pair[1], pair[2] 395 | if ends_with word, suffix 396 | stem = word\sub(1, #word - #suffix) 397 | if in_r1 word, #stem + 1 398 | return stem .. replacement 399 | 400 | -- Special case: ogi -> og (when preceded by l) 401 | if ends_with word, "ogi" 402 | stem = word\sub(1, -4) 403 | if in_r1(word, #stem + 1) and ends_with(stem, "l") 404 | return stem .. 
"og" 405 | 406 | -- Special case: li -> delete (when preceded by valid_li) 407 | if ends_with word, "li" 408 | stem = word\sub(1, -3) 409 | if in_r1(word, #stem + 1) and #stem > 0 410 | last = stem\sub(-1, -1) 411 | if is_valid_li last 412 | return stem 413 | 414 | -- Special case: ogist -> og 415 | if ends_with word, "ogist" 416 | stem = word\sub(1, -5) 417 | if in_r1 word, #stem + 1 418 | return stem .. "og" 419 | 420 | word 421 | 422 | -- Step 3: suffix removal 423 | step_3 = (word) -> 424 | mappings = { 425 | {"ational", "ate"} 426 | {"tional", "tion"} 427 | {"alize", "al"} 428 | {"icate", "ic"} 429 | {"iciti", "ic"} 430 | {"ical", "ic"} 431 | {"ful", ""} 432 | {"ness", ""} 433 | } 434 | 435 | for pair in *mappings 436 | suffix, replacement = pair[1], pair[2] 437 | if ends_with word, suffix 438 | stem = word\sub(1, #word - #suffix) 439 | if in_r1 word, #stem + 1 440 | return stem .. replacement 441 | 442 | -- Special case: ative -> delete (in R2) 443 | if ends_with word, "ative" 444 | stem = word\sub(1, -6) 445 | if in_r2 word, #stem + 1 446 | return stem 447 | 448 | word 449 | 450 | -- Step 4: suffix removal 451 | step_4 = (word) -> 452 | suffixes = { 453 | "al", "ance", "ence", "er", "ic", "able", "ible", 454 | "ant", "ement", "ment", "ent", "ism", "ate", 455 | "iti", "ous", "ive", "ize" 456 | } 457 | 458 | for suffix in *suffixes 459 | if ends_with word, suffix 460 | stem = word\sub(1, #word - #suffix) 461 | if in_r2 word, #stem + 1 462 | return stem 463 | 464 | -- Special case: ion -> delete (when preceded by s or t in R2) 465 | if ends_with word, "ion" 466 | stem = word\sub(1, -4) 467 | if in_r2(word, #stem + 1) and #stem > 0 468 | last = stem\sub(-1, -1) 469 | if last == 's' or last == 't' 470 | return stem 471 | 472 | word 473 | 474 | -- Step 5: suffix removal 475 | step_5 = (word) -> 476 | -- Step 5a: remove trailing e 477 | if ends_with word, "e" 478 | stem = word\sub(1, -2) 479 | 480 | -- Delete if in R2 481 | if in_r2 word, #stem + 1 482 | return stem 483 | 484 | -- Delete if in R1 and not preceded by short syllable 485 | if in_r1(word, #stem + 1) and not ends_with_short_syllable(stem) 486 | return stem 487 | 488 | -- Step 5b: remove trailing l 489 | if ends_with(word, "ll") and in_r2(word, #word) 490 | return word\sub(1, -2) 491 | 492 | word 493 | 494 | -- Main stemming function 495 | stem_word = (word) -> 496 | return word unless word and type(word) == "string" 497 | return word if #word < 3 498 | 499 | word = word\lower! 
500 | 501 | -- Check exceptions first 502 | exception = exception1 word 503 | return exception if exception 504 | 505 | -- If word is too short, return as-is 506 | return word if #word < 3 507 | 508 | -- Run through stemming steps 509 | word, y_found = prelude word 510 | 511 | word = step_1a word 512 | word = step_1b word 513 | word = step_1c word 514 | word = step_2 word 515 | word = step_3 word 516 | word = step_4 word 517 | word = step_5 word 518 | 519 | word = postlude word, y_found 520 | 521 | word 522 | 523 | { 524 | :stem_word 525 | } 526 | -------------------------------------------------------------------------------- /lapis/bayes/tokenizers/spam.moon: -------------------------------------------------------------------------------- 1 | unpack_fn = table.unpack or unpack 2 | 3 | punycode = require "lapis.bayes.text.punycode" 4 | import Extractor from require "web_sanitize.html" 5 | types = require "lapis.validate.types" 6 | 7 | import cjk_character from require "lapis.bayes.text.utf8" 8 | 9 | extract_text = Extractor { 10 | escape_html: false 11 | } 12 | 13 | normalize_number = (value) -> 14 | return unless value and value != "" 15 | 16 | normalized = value\gsub("[,%s]", "") 17 | digits_only = normalized\gsub("[^%d]", "") 18 | return if digits_only == "" 19 | 20 | normalized 21 | 22 | -- NOTE: this only works with ASCII punctuation characters, be careful when 23 | -- updating punct_pattern if it's going to include unicode punctuation 24 | handle_punct = (chars) -> 25 | char = chars\sub 1, 1 26 | {tag: "punct", value: char .. tostring(#chars)} 27 | 28 | handle_invalid_byte = (byte) -> 29 | {tag: "invalid_byte", value: tostring(string.byte(byte))} 30 | 31 | -- return new array with order shuffled by dithering 32 | -- e: dither factor 33 | -- https://buildingrecommenders.wordpress.com/2015/11/11/dithering/ 34 | dithered = do 35 | -- random normal box muller 36 | gn = (sd=1, mean=0, r=math.random) -> 37 | local x1, x2, w, y1, y2 38 | while true 39 | x1 = 2 * r! - 1 40 | x2 = 2 * r! - 1 41 | w = x1^2 + x2^2 42 | break if w < 1 43 | 44 | w = math.sqrt -2 * math.log(w) / 2 45 | y1 = x1 * w 46 | y2 = x2 * w 47 | 48 | y1 * sd + mean 49 | 50 | dither_score = (rank, e) -> 51 | math.log(rank) + gn(math.log(e)) 52 | 53 | (items, e=1.5) -> 54 | rows = for i, item in ipairs items 55 | {dither_score(i, e), item} 56 | 57 | table.sort rows, (a, b) -> 58 | a[1] < b[1] 59 | 60 | [row[2] for row in *rows] 61 | 62 | 63 | -- spam tokenizer with support for domains, emails, currencies, and more 64 | -- opts = { 65 | -- filter_text: function -- function to pre-filter text, returns new text 66 | -- min_word_length: number -- minimum length of word (default 2) 67 | -- max_word_length: number -- maximum length of word (default 32) 68 | -- ignore_words: table -- table of words to ignore 69 | -- stem_words: bool -- enable word stemming 70 | -- unaccent: bool -- enable unaccenting (default true) 71 | -- dedupe: bool -- enable deduplication (default true) 72 | -- ignore_tokens: table -- table of tokens to ignore eg. 
{"my_token" = false} 73 | -- ignore_domains: {string} -- domains to ignore (`example.com` exact, `.example.com` includes subdomains) 74 | -- sample_at_most: number -- limit number of sampled tokens 75 | -- dither: bool -- enable dithering when sampling (default true) 76 | -- bigram_tokens: bool -- enable bigram generation 77 | -- filter_tokens: function -- function to filter tokens, called at end with (tokens, opts) 78 | -- domain_tokens_first: bool -- move domain tokens before all other tokens (default false) 79 | -- split_cjk: -- split chinese, korean, japanese characters to be individual words 80 | -- } 81 | class SpamTokenizer extends require "lapis.bayes.tokenizers.base" 82 | new: (@opts = {}) => 83 | 84 | tagged_token_to_string: (token) => 85 | "#{token.tag}:#{token.value}" 86 | 87 | normalize_domain_string: (domain) => 88 | return unless domain and domain != "" 89 | domain = tostring domain 90 | domain = domain\gsub("^%s+", "")\gsub("%s+$", "") 91 | domain = domain\gsub("%.+$", "") 92 | return if domain == "" 93 | 94 | labels = {} 95 | for label in domain\gmatch "[^%.]+" 96 | return if label == "" 97 | encoded = punycode.punycode_encode label 98 | encoded or= label 99 | table.insert labels, encoded\lower! 100 | 101 | return unless next labels 102 | table.concat labels, "." 103 | 104 | build_ignored_domains: => 105 | entries = @opts.ignore_domains 106 | return false unless entries and #entries > 0 107 | 108 | exact = {} 109 | suffix = {} 110 | 111 | for domain in *entries 112 | continue unless type(domain) == "string" 113 | domain = domain\gsub("^%s+", "")\gsub("%s+$", "") 114 | continue if domain == "" 115 | 116 | is_suffix = domain\sub(1, 1) == "." 117 | domain = domain\sub(2) if is_suffix 118 | continue if domain == "" 119 | 120 | normalized = @normalize_domain_string domain 121 | continue unless normalized 122 | 123 | if is_suffix 124 | suffix[normalized] = true 125 | else 126 | exact[normalized] = true 127 | 128 | return false unless next(exact) or next(suffix) 129 | 130 | { 131 | exact: exact 132 | suffix: suffix 133 | } 134 | 135 | should_ignore_domain: (domain) => 136 | return false unless @opts.ignore_domains 137 | 138 | if @ignored_domains == nil 139 | @ignored_domains = @build_ignored_domains! 140 | 141 | 142 | return false unless @ignored_domains 143 | normalized = @normalize_domain_string domain 144 | return false unless normalized 145 | 146 | if @ignored_domains.exact[normalized] 147 | return true 148 | 149 | for suffix in pairs @ignored_domains.suffix 150 | return true if normalized == suffix 151 | if #normalized > #suffix 152 | if normalized\sub(-(#suffix + 1)) == ".#{suffix}" 153 | return true 154 | 155 | false 156 | 157 | build_grammar: => 158 | import P, S, R, C, Ct from require "lpeg" 159 | utf8 = require "lapis.util.utf8" 160 | 161 | min_len = @opts.min_word_length or 2 162 | max_len = @opts.max_word_length or 32 163 | ignore_words = @opts.ignore_words 164 | 165 | truncate = types.truncated_text max_len 166 | 167 | stem = if @opts.stem_words 168 | require("lapis.bayes.text.stem").stem_word 169 | 170 | case_insensitive = (text) -> 171 | out = nil 172 | for char in text\gmatch "." 173 | lower = char\lower! 174 | upper = char\upper! 175 | pattern = if lower == upper 176 | P char 177 | else 178 | S "#{lower}#{upper}" 179 | 180 | out = if out 181 | out * pattern 182 | else 183 | pattern 184 | 185 | out or P(false) 186 | 187 | normalize_word = (word) -> 188 | return unless word and word != "" 189 | 190 | word = word\lower! 
191 | word = word\gsub("'+", "") 192 | 193 | return if #word < min_len 194 | if #word > max_len 195 | word = truncate\transform word 196 | return if ignore_words and ignore_words[word] 197 | 198 | word 199 | 200 | handle_domain_token = (domain) -> 201 | -- convert subdomains to punycode 202 | labels = for label in domain\gmatch "[^%.]+" 203 | encoded = punycode.punycode_encode label 204 | if #encoded > max_len 205 | truncate\transform encoded 206 | else 207 | encoded 208 | 209 | tokens = { 210 | {tag: "domain", value: truncate\transform table.concat(labels, ".")\lower!} 211 | } 212 | 213 | -- Generate hierarchical domain tokens with leading dots for subdomains 214 | if #labels >= 2 215 | for i = 2, #labels 216 | suffix = table.concat [labels[j] for j = i, #labels], "." 217 | table.insert tokens, {tag: "domain", value: truncate\transform ".#{suffix\lower!}"} 218 | 219 | unpack_fn tokens 220 | 221 | extract_url_words = (...) -> 222 | out = {} 223 | for part in *{...} 224 | continue unless part and #part > 0 225 | 226 | -- Strip leading URL punctuation like / ? # 227 | part = part\gsub("^[:/?#]+", "") 228 | continue if part == "" 229 | 230 | -- Treat underscores and other punctuation as separators 231 | part = part\gsub("_", " ") 232 | part = part\gsub("[^%w']+", " ") 233 | 234 | for raw in part\gmatch "%S+" 235 | normalized = normalize_word raw 236 | table.insert out, normalized if normalized 237 | 238 | out 239 | 240 | handle_url = (domain, path="", query="", fragment="") -> 241 | return if @should_ignore_domain domain 242 | 243 | tokens = {} 244 | 245 | for word in *extract_url_words path, query, fragment 246 | table.insert tokens, word 247 | 248 | for token in *{handle_domain_token domain} 249 | table.insert tokens, token 250 | 251 | unpack_fn tokens 252 | 253 | handle_email = (email) -> 254 | email = email\lower! 
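        -- e.g. "Bob@Mail.Example.com" is expected to end up as "email:bob@mail.example.com",
        -- "email_user:bob", and the hierarchical domain tokens "domain:mail.example.com",
        -- "domain:.example.com", "domain:.com" once tokenize_text stringifies the tagged
        -- tokens (assuming punycode_encode passes plain ASCII labels through unchanged)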
255 | user, domain = email\match "^([^@]+)@(.+)$" 256 | 257 | tokens = {{tag: "email", value: truncate\transform email}} 258 | 259 | if user 260 | user_token = normalize_word user 261 | table.insert tokens, {tag: "email_user", value: user_token} if user_token 262 | 263 | if domain 264 | for token in *{handle_domain_token domain} 265 | table.insert tokens, token 266 | 267 | unpack_fn tokens 268 | 269 | handle_number = (value) -> 270 | normalized = normalize_number value 271 | return unless normalized 272 | if #normalized > max_len 273 | truncate\transform normalized 274 | else 275 | normalized 276 | 277 | handle_currency = (value) -> 278 | symbol, rest = value\match "^([%$£€¥]+)%s*(.+)$" 279 | symbol or= value\sub 1, 1 280 | rest or= "" 281 | 282 | normalized_number = normalize_number rest 283 | if normalized_number and #normalized_number > max_len 284 | normalized_number = truncate\transform normalized_number 285 | 286 | if symbol and symbol != "" 287 | if normalized_number 288 | {tag: "currency", value: symbol}, normalized_number 289 | else 290 | {tag: "currency", value: symbol} 291 | 292 | handle_percent = (value) -> 293 | number_part = value\sub 1, #value - 1 294 | normalized = normalize_number number_part 295 | return unless normalized 296 | if #normalized > max_len - 1 -- reserve 1 char for % 297 | normalized = truncate\transform normalized 298 | "#{normalized}%" 299 | 300 | handle_caps_word = (word) -> 301 | return unless word\match "%u" 302 | 303 | 304 | normalized = normalize_word word 305 | return unless normalized 306 | stemmed = if stem 307 | stem(normalized) or normalized 308 | else 309 | normalized 310 | stemmed, {tag: "caps", value: stemmed} 311 | 312 | handle_word = (word) -> 313 | normalized = normalize_word word 314 | return unless normalized 315 | if stem 316 | stem(normalized) or normalized 317 | else 318 | normalized 319 | 320 | whitespace = utf8.whitespace 321 | alpha = R "az", "AZ" 322 | digit = R "09" 323 | alphanum = alpha + digit 324 | 325 | punct_chars = S"!?$#%" 326 | other_punct = S"()[]{},.;:\"<>/@#" 327 | word_char = utf8.printable_character - whitespace - punct_chars - other_punct 328 | word_pattern = (word_char + P"'")^1 329 | 330 | cjk_word = if @opts.split_cjk 331 | word_char = word_char - cjk_character 332 | C(cjk_character) / handle_word 333 | 334 | caps_char = R"AZ" 335 | caps_pattern = caps_char^2 * (caps_char + digit)^0 336 | 337 | sign = S"+-"^-1 338 | number_body = sign * digit^1 * (P"," * digit^3)^0 * (P"." * digit^1)^-1 339 | 340 | percent_pattern = number_body * P"%" 341 | currency_pattern = S"$£€¥" * whitespace^0 * number_body 342 | 343 | punct_pattern = punct_chars^3 * punct_chars^0 344 | 345 | domain_char = utf8.printable_character - whitespace - S"./:@?#[](){}<>\"',;&" 346 | domain_label = domain_char^1 347 | domain_pattern = domain_label * (P"." * domain_label)^1 348 | 349 | not_path = S" \t\r\n\"'<>()[\\]{}?#" 350 | port_part = (P":" * digit^1)^-1 351 | path_part = (P"/" * (1 - not_path)^0)^0 352 | query_part = (P"?" * (1 - not_path)^0)^-1 353 | fragment_part = (P"#" * (1 - not_path)^0)^-1 354 | 355 | www_prefix = case_insensitive "www." 
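    -- e.g. "https://WWW.Example.com/Cheap-Pills" should match url_with_scheme below
    -- and reach handle_url above, yielding the path words "cheap" and "pills" followed
    -- by hierarchical domain tokens for example.com; the leading "WWW." is consumed by
    -- www_prefix and kept out of the captured domain (illustrative, not a tested case)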
356 | scheme = (alpha + digit)^1 357 | 358 | url_with_scheme = scheme * P"://" * www_prefix^-1 * C(domain_pattern) * port_part * C(path_part) * C(query_part) * C(fragment_part) 359 | url_without_scheme = www_prefix * C(domain_pattern) * port_part * C(path_part) * C(query_part) * C(fragment_part) 360 | 361 | email_pattern = C((alphanum + S".%+_'-")^1 * P"@" * domain_pattern) 362 | 363 | number_capture = C(number_body) * -(alpha) 364 | 365 | token_patterns = { 366 | url_with_scheme / handle_url 367 | url_without_scheme / handle_url 368 | email_pattern / handle_email 369 | C(currency_pattern) / handle_currency 370 | C(percent_pattern) / handle_percent 371 | number_capture / handle_number 372 | C(caps_pattern) / handle_caps_word 373 | -- CJK here... 374 | C(word_pattern) / handle_word 375 | C(punct_pattern) / handle_punct 376 | } 377 | 378 | if cjk_word 379 | table.insert token_patterns, 8, cjk_word 380 | 381 | tokens = token_patterns[1] 382 | for i = 2, #token_patterns 383 | tokens = tokens + token_patterns[i] 384 | 385 | printable = utf8.printable_character 386 | Ct (tokens + printable + (C(P(1)) / handle_invalid_byte))^0 387 | 388 | -- this is processed on the test before HTML is stripped to get any URLs that 389 | -- might exist in attributes or in the markup 390 | collect_url_tokens: (text) => 391 | return {} unless text and text != "" 392 | 393 | @grammar or= @build_grammar! 394 | tokens = @grammar\match text 395 | return {} unless tokens 396 | 397 | out = for token in *tokens 398 | continue unless type(token) == "table" 399 | switch token.tag 400 | when "domain", "email", "email_user" 401 | token 402 | else 403 | continue 404 | out 405 | 406 | dedupe_tokens: (tokens) => 407 | return {} unless tokens 408 | seen = {} 409 | deduped = {} 410 | for token in *tokens 411 | -- For table tokens, use string representation as key 412 | key = if type(token) == "table" 413 | @tagged_token_to_string token 414 | else 415 | token 416 | 417 | unless seen[key] 418 | seen[key] = true 419 | table.insert deduped, token 420 | deduped 421 | 422 | generate_bigrams: (tokens) => 423 | return {} unless tokens 424 | count = #tokens 425 | return {} if count < 2 426 | ignore_tokens = @opts.ignore_tokens 427 | 428 | bigrams = {} 429 | for i = 1, count - 1 430 | first = tokens[i] 431 | second = tokens[i + 1] 432 | continue unless first and second 433 | 434 | bigram = first .. " " .. 
second 435 | continue if ignore_tokens and ignore_tokens[bigram] 436 | 437 | table.insert bigrams, bigram 438 | 439 | bigrams 440 | 441 | sample_tokens: (tokens, limit=@opts.sample_at_most) => 442 | return {} unless tokens 443 | return tokens unless limit 444 | 445 | limit = math.floor limit 446 | return {} if limit <= 0 447 | count = #tokens 448 | return tokens if count <= limit 449 | 450 | tokens_to_sample = if @opts.dither == false 451 | tokens 452 | else 453 | dithered tokens 454 | 455 | [tokens_to_sample[idx] for idx=1,limit] 456 | 457 | -- lift the tokens that match the pattern to the top, preserving order otherwise 458 | lift_tokens: (tokens, pattern) => 459 | lifted = {} 460 | rest = for t in *tokens 461 | if t\match pattern 462 | table.insert lifted, t 463 | continue 464 | 465 | t 466 | 467 | for r in *rest 468 | table.insert lifted, r 469 | 470 | lifted 471 | 472 | tokenize_text: (text) => 473 | return {} unless text 474 | 475 | text = tostring text 476 | 477 | if @opts.filter_text 478 | text = @opts.filter_text text 479 | 480 | unless @opts.unaccent == false 481 | text = require("lapis.bayes.text.unaccent").unaccent_string(text) or text 482 | 483 | -- extract URLs before cleaing up text to capture urls in HTML markup 484 | raw_domain_tokens = @collect_url_tokens text 485 | 486 | text = extract_text text 487 | 488 | @grammar or= @build_grammar! 489 | tokens = @grammar\match text or {} 490 | 491 | dedupe = true 492 | if @opts.dedupe != nil 493 | dedupe = @opts.dedupe 494 | 495 | ignore_tokens = @opts.ignore_tokens 496 | sample_limit = @opts.sample_at_most 497 | generate_bigrams = @opts.bigram_tokens 498 | 499 | -- new token merging strategy, try to keep things adjacent 500 | merged_tokens = {} 501 | seen_tokens = {} -- for deduping 502 | 503 | insert_token = (t) -> 504 | if ignore_tokens and ignore_tokens[t] 505 | return 506 | 507 | if dedupe and seen_tokens[t] 508 | return 509 | 510 | seen_tokens[t] = true 511 | 512 | table.insert merged_tokens, t 513 | 514 | prev_token = nil -- for bigram generation 515 | 516 | for idx=1,#tokens 517 | token = tokens[idx] 518 | 519 | switch type token 520 | when "table" -- special token 521 | switch token.tag 522 | when "caps", "invalid_byte", "currency" 523 | 524 | nil 525 | else 526 | prev_token = nil -- break the bigram 527 | 528 | insert_token @tagged_token_to_string token 529 | 530 | when "string" -- plain word 531 | insert_token token 532 | 533 | if prev_token and generate_bigrams 534 | insert_token "#{prev_token} #{token}" 535 | 536 | prev_token = token 537 | 538 | -- these lose positioning due to being extracted differently, so we just 539 | -- insert them in order at the top by moving some variables around 540 | if raw_domain_tokens 541 | original_tokens = merged_tokens 542 | merged_tokens = {} 543 | for token in *raw_domain_tokens 544 | insert_token @tagged_token_to_string token 545 | 546 | for t in *original_tokens 547 | table.insert merged_tokens, t 548 | 549 | if @opts.domain_tokens_first 550 | merged_tokens = @lift_tokens merged_tokens, "^domain:" 551 | 552 | if sample_limit 553 | merged_tokens = @sample_tokens merged_tokens 554 | 555 | -- Apply custom filter at the very end if provided 556 | if @opts.filter_tokens 557 | merged_tokens = @opts.filter_tokens merged_tokens, @opts 558 | 559 | merged_tokens 560 | 561 | return SpamTokenizer 562 | --------------------------------------------------------------------------------
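A minimal usage sketch for SpamTokenizer (illustrative only: the option names come
from the opts comment in spam.moon above, and the expected output assumes the default
settings plus the options shown):

  SpamTokenizer = require "lapis.bayes.tokenizers.spam"

  tokenizer = SpamTokenizer {
    stem_words: true
    bigram_tokens: true
    ignore_domains: {".example.com"} -- a leading dot also matches subdomains
    ignore_tokens: {free: true} -- truthy values mark tokens to skip
  }

  tokens = tokenizer\tokenize_text "Visit https://promo.example.net NOW!!! Only $99"
  -- roughly (order and deduping aside): "visit", "domain:promo.example.net",
  -- "domain:.example.net", "domain:.net", "now", "caps:now", "punct:!3",
  -- "onli" (stemmed via the exception list in stem.moon), "currency:$", "99",
  -- plus bigrams of adjacent plain-word tokens
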