> ita-corpus.txt
10 | const lines = require('fs').readFileSync('transcript_utf8.csv', 'utf8')
11 | .trim()
12 | .split('\n')
13 | .map(line => line.split(',')[0].split(':')[1])
14 | .join('\n')
15 | console.log(lines)
16 | CODE
17 |
--------------------------------------------------------------------------------
/benchmarks/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "benchmarks",
3 | "private": true,
4 | "version": "0.0.0",
5 | "type": "module",
6 | "scripts": {
7 | "preinstall": "npm run build:core && npm run build:features",
8 | "build:core": "wasm-pack build --release --target nodejs ../wasm-core",
9 | "build:features": "wasm-pack build --release --target nodejs ../wasm-features",
10 | "test": "echo \"Error: no test specified\" && exit 1"
11 | },
12 | "author": "",
13 | "license": "ISC",
14 | "dependencies": {
15 | "kuromoji": "^0.1.2",
16 | "wasm-core": "../wasm-core/pkg",
17 | "wasm-features": "../wasm-features/pkg"
18 | },
19 | "devDependencies": {
20 | "benchmark": "^2.1.4"
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/goya-cli/src/path_util.rs:
--------------------------------------------------------------------------------
1 | use std::fs::create_dir_all;
2 | use std::io;
3 | use std::path::{Path, PathBuf};
4 |
/// Resolves the on-disk locations of the compiled dictionary artifacts
/// stored under a single base directory.
pub struct PathUtil {
    // Base directory that holds all generated dictionary files.
    base: String,
}
impl PathUtil {
    /// Wrap `base` (the dictionary directory) in a `PathUtil`.
    pub fn from(base: String) -> PathUtil {
        PathUtil { base }
    }

    /// Create the base directory, including any missing parents.
    pub fn mkdirp(&self) -> io::Result<()> {
        create_dir_all(&self.base)
    }

    /// Path to `da.bin` inside the base directory.
    pub fn da_path(&self) -> PathBuf {
        self.resolve("da.bin")
    }

    /// Path to `dict.bin` inside the base directory.
    pub fn dict_path(&self) -> PathBuf {
        self.resolve("dict.bin")
    }

    /// Path to `features.bin` inside the base directory.
    pub fn features_path(&self) -> PathBuf {
        self.resolve("features.bin")
    }

    // Join a file name onto the base directory.
    fn resolve(&self, file_name: &str) -> PathBuf {
        Path::new(&self.base).join(file_name)
    }
}
29 |
--------------------------------------------------------------------------------
/wasm-features/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | authors = ["Leko "]
3 | categories = ["wasm", "data-structures", "text-processing"]
4 | description = "WebAssembly binding of Goya"
5 | edition = "2018"
6 | license = "Apache-2.0 OR MIT"
7 | name = "goya-features"
8 | publish = false
9 | repository = "https://github.com/Leko/goya"
10 | version = "0.1.9"
11 |
12 | [lib]
13 | crate-type = ["cdylib"]
14 |
15 | [dependencies]
16 | goya = {version = "^0.1.9", path = "../goya"}
17 | lazy_static = "1.4"
18 | rmp-serde = "1.0.0-beta.2"
19 | serde-wasm-bindgen = "0.3.1"
20 | wasm-bindgen = {version = "0.2.78", features = ["serde-serialize"]}
21 |
22 | [package.metadata.wasm-pack.profile.release]
23 | wasm-opt = ['--dce', '-O4']
24 |
--------------------------------------------------------------------------------
/benchmarks/kuromoji.js:
--------------------------------------------------------------------------------
1 | import { EOL } from "os";
2 | import fs from "fs";
3 | import path from "path";
4 | import kuromoji from "kuromoji";
5 |
// Resolve kuromoji's bundled IPA dictionary relative to this module.
const dicPath = path.join(
  path.dirname(new URL(import.meta.url).pathname),
  "node_modules",
  "kuromoji",
  "dict"
);

// Tokenize every line from STDIN, then report memory usage.
// This is the kuromoji counterpart of the goya memory benchmark.
new Promise((resolve, reject) => {
  kuromoji.builder({ dicPath }).build((err, tokenizer) => {
    if (err) {
      reject(err);
    } else {
      resolve(tokenizer);
    }
  });
})
  .then((tokenizer) => {
    const lines = fs.readFileSync("/dev/stdin", "utf8").trim().split(EOL);
    for (const line of lines) {
      tokenizer.tokenize(line);
    }
    console.log(process.memoryUsage());
  })
  // Fix: a dictionary-load failure previously surfaced only as an
  // unhandled promise rejection; fail loudly with a non-zero exit code.
  .catch((err) => {
    console.error(err);
    process.exit(1);
  });
28 |
--------------------------------------------------------------------------------
/playground/src/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | Playground | Goya: Yet another morphological analyzer for Rust and
7 | WebAssembly
8 |
9 |
13 |
21 |
22 |
23 |
24 |
28 |
29 |
30 |
--------------------------------------------------------------------------------
/wasm-core/README.md:
--------------------------------------------------------------------------------
1 | ## Getting started
2 |
3 | ### 分かち書き
4 |
5 | goya-core を import して `parse` 関数を使用します。parse メソッドの戻り値から各種メソッドを呼べるようにしています。
6 | 分かち書きをするなら`wakachi`メソッドを使用します。
7 |
8 | ```ts
9 | import core from "goya-core";
10 |
11 | const lattice = core.parse("すもももももももものうち");
12 | lattice.wakachi(); // => ["すもも", "も", "もも", "も", "もも", "の", "うち"]
13 | ```
14 |
15 | ### 形態素解析
16 |
17 | 形態素解析の結果を得るには`find_best`メソッドを使用します。find_best は形態素の配列を返します。各形態素はこれらのフィールドを持っています。サイズ削減のためこのオブジェクトは品詞や読み仮名などの素性を持っていません。
18 |
19 | - wid: 語彙 ID。goya-features で使用 (後述)
20 | - is_known: 既知語なら true、未知語なら false
21 | - surface_form: 表層形
22 |
23 | ```ts
24 | lattice.find_best()[0].surface_form; // => "すもも"
25 | lattice.find_best()[0].is_known; // => true
26 | lattice.find_best()[0].wid; // => 次項で説明
27 | ```
28 |
--------------------------------------------------------------------------------
/wasm-features/README.md:
--------------------------------------------------------------------------------
1 | ## Getting started
2 |
3 | ```ts
4 | import core from "goya-core";
5 | import { get_features } from "wasm-features";
6 |
7 | // Mecab IPA辞書のデフォルトでは品詞(Part of Speech)は添字0
8 | const INDEX_POS = 0;
9 |
10 | const lattice = core.parse("すもももももももものうち");
11 | const morphemes = lattice.find_best();
12 | // widの配列から素性の配列を得る
13 | const features = get_features(morphemes.map((morph) => morph.wid));
14 | // 1要素ずつ取得してもいいが、まとめて取得する方がオーバーヘッドが少なく高速
15 | get_features([morphemes[0].wid]);
16 |
17 | morphemes.forEach(({ surface_form }, i) => {
18 | const feature = features[i]; // 渡したwid通りの順序で素性が得られる
19 | const line = surface_form + "\t" + feature.join(",");
20 | console.log(line); // => "すもも\t名詞,一般,*,*,*,*,すもも,スモモ,スモモ"
21 | console.log(feature[INDEX_POS]); // => "名詞"
22 | });
23 | ```
24 |
--------------------------------------------------------------------------------
/goya-cli/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | authors = ["Leko "]
3 | categories = ["command-line-interface"]
4 | description = "CLI for Goya"
5 | edition = "2018"
6 | license = "Apache-2.0 OR MIT"
7 | name = "goya-cli"
8 | repository = "https://github.com/Leko/goya"
9 | version = "0.1.9"
10 |
11 | [[bin]]
12 | name = "goya"
13 | path = "src/main.rs"
14 |
15 | [dependencies]
16 | bytesize = {version = "1.1.0", features = ["serde"]}
17 | clap = {version = "3.0.0-rc.9", features = ["derive"]}
18 | console = "0.14"
19 | dirs = "4.0"
20 | futures = "0.3.17"
21 | goya = {version = "^0.1.9", path = "../goya"}
22 | goya-ipadic = {version = "^0.1.9", path = "../ipadic"}
23 | indexmap = {version = "1.7", features = ["serde"]}
24 | rkyv = {version = "0.7.19", features = ["indexmap"]}
25 | rmp-serde = "1.0.0-beta.2"
26 |
--------------------------------------------------------------------------------
/scripts/build-wasm:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# Build a wasm crate (directory passed as $1) for both web and nodejs
# targets, then merge the two outputs into a single pkg/ directory.
# NOTE(review): no `set -e` and $1 is unquoted — a failed step cascades
# silently; confirm CI expectations before hardening.
cd $1
wasm-pack build --release --out-dir pkg/web --target web
wasm-pack build --release --out-dir pkg/nodejs --target nodejs

# Keep a single README and package.json at the pkg/ root.
mv pkg/web/README.md pkg/
mv pkg/nodejs/package.json pkg/
rm -rf pkg/{web,nodejs}/package.json

# NOTE(review): the heredoc below was garbled during extraction. The inline
# node script appears to merge the `files` lists of both build targets into
# pkg/package.json — recover the full script from upstream before editing.
node < path.join('nodejs', f)).concat(pkg.files.map(f => path.join('web', f)))

fs.writeFileSync('./pkg/package.json', JSON.stringify(pkg, null, 2))
CODE
--------------------------------------------------------------------------------
/playground/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "compilerOptions": {
3 | "incremental": true /* Enable incremental compilation */,
4 | "jsx": "react",
5 | "target": "es2020" /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */,
6 | "module": "es2020" /* Specify what module code is generated. */,
7 | "lib": ["DOM"],
8 | "moduleResolution": "Node",
9 | "resolveJsonModule": true,
10 | "esModuleInterop": true /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables `allowSyntheticDefaultImports` for type compatibility. */,
11 | "forceConsistentCasingInFileNames": true /* Ensure that casing is correct in imports. */,
12 | "strict": true /* Enable all strict type-checking options. */,
13 | "skipLibCheck": true /* Skip type checking all .d.ts files. */
14 | }
15 | }
16 |
--------------------------------------------------------------------------------
/wasm-core/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | authors = ["Leko "]
3 | categories = ["wasm", "data-structures", "text-processing"]
4 | description = "WebAssembly binding of Goya"
5 | edition = "2018"
6 | license = "Apache-2.0 OR MIT"
7 | name = "goya-core"
8 | publish = false
9 | repository = "https://github.com/Leko/goya"
10 | version = "0.1.9"
11 |
12 | [lib]
13 | crate-type = ["cdylib"]
14 |
15 | [dependencies]
16 | futures = "0.3.17"
17 | goya = {version = "^0.1.9", path = "../goya"}
18 | goya-ipadic = {version = "^0.1.9", path = "../ipadic"}
19 | lazy_static = "1.4"
20 | rkyv = {version = "0.7.19", features = ["indexmap"]}
21 | rmp-serde = "1.0.0-beta.2"
22 | serde = {version = "1.0", features = ["derive"]}
23 | serde-wasm-bindgen = "0.3.1"
24 | wasm-bindgen = {version = "0.2.78", features = ["serde-serialize"]}
25 | wasm-bindgen-futures = "0.4.28"
26 |
27 | [package.metadata.wasm-pack.profile.release]
28 | wasm-opt = ['--dce', '-O4']
29 |
--------------------------------------------------------------------------------
/goya/src/morpheme.rs:
--------------------------------------------------------------------------------
1 | use serde::{Deserialize, Serialize};
2 |
/// A single morpheme entry from a MeCab-format dictionary: the two context
/// ids used for connection-cost lookup plus the word's occurrence cost.
#[derive(
    Debug,
    Hash,
    PartialEq,
    Eq,
    PartialOrd,
    Clone,
    Serialize,
    Deserialize,
    rkyv::Archive,
    rkyv::Serialize,
    rkyv::Deserialize,
)]
pub struct Morpheme {
    /// Left context ID (the context ID when the word is seen from the left).
    /// https://taku910.github.io/mecab/dic-detail.html
    pub left_context_id: usize,
    /// Right context ID (the context ID when the word is seen from the right).
    /// https://taku910.github.io/mecab/dic-detail.html
    pub right_context_id: usize,
    /// Word occurrence cost (the smaller, the more likely the word appears).
    /// Per the MeCab dictionary docs, the value must fit in a 16-bit integer.
    pub cost: i16,
}
impl Morpheme {
    /// Build a `Morpheme` from its dictionary fields.
    pub fn new(left_context_id: usize, right_context_id: usize, cost: i16) -> Morpheme {
        Morpheme {
            left_context_id,
            right_context_id,
            cost,
        }
    }
}
36 |
--------------------------------------------------------------------------------
/wasm-features/src/lib.rs:
--------------------------------------------------------------------------------
1 | use goya::id::WordIdentifier;
2 | use goya::word_features::WordFeaturesMap;
3 | use wasm_bindgen::prelude::*;
4 |
5 | #[macro_use]
6 | extern crate lazy_static;
7 |
lazy_static! {
    // Word-id -> feature-strings table, deserialized once (MessagePack via
    // rmp_serde) from the artifact embedded at compile time. The file is
    // produced by the dictionary build step (see scripts/build-dict).
    static ref WORD_FEATURES: WordFeaturesMap =
        rmp_serde::from_slice(include_bytes!("../__generated__/features.bin")).unwrap();
}
12 |
13 | #[wasm_bindgen]
14 | pub fn get_features(wids: &JsValue) -> JsValue {
15 | let wids: Vec = wids.into_serde().unwrap();
16 | let features: Vec> = wids
17 | .iter()
18 | .map(|wid| {
19 | WORD_FEATURES
20 | .get(wid)
21 | .unwrap()
22 | .iter()
23 | .map(|s| s.to_string())
24 | .collect()
25 | })
26 | .collect::>();
27 | serde_wasm_bindgen::to_value(&features).unwrap()
28 | }
29 |
/// Eagerly deserialize the embedded feature dictionary so the first call to
/// `get_features` does not pay the one-time initialization cost.
#[wasm_bindgen]
pub fn ready() {
    lazy_static::initialize(&WORD_FEATURES);
}
34 |
--------------------------------------------------------------------------------
/goya/src/dictionary.rs:
--------------------------------------------------------------------------------
1 | use super::char_class::CharDefinition;
2 | use super::id::WordIdentifier;
3 | use super::morpheme::Morpheme;
4 |
/// Read-only lookup interface over a compiled morphological dictionary.
pub trait Dictionary {
    /// Resolve a word identifier to its morpheme, dispatching on whether the
    /// id refers to a known (in-dictionary) or unknown word.
    fn get(&self, wid: &WordIdentifier) -> Option<&Morpheme> {
        match wid {
            WordIdentifier::Known(wid, _) => self.get_known_morpheme(wid),
            WordIdentifier::Unknown(wid, _) => self.get_unknown_morpheme(wid),
        }
    }
    /// Morpheme entry for a known-word id.
    fn get_known_morpheme(&self, wid: &usize) -> Option<&Morpheme>;
    /// Morpheme entry for an unknown-word id.
    fn get_unknown_morpheme(&self, wid: &usize) -> Option<&Morpheme>;
    /// Ids sharing a surface with `wid` — presumably `Option<&Vec<usize>>`;
    /// NOTE(review): the element type was stripped during extraction, restore
    /// from upstream before relying on this signature.
    fn resolve_homonyms(&self, wid: &usize) -> Option<&Vec>;
    /// Longest run of characters starting at byte offset `start` that share
    /// the given character class — used for unknown-word grouping.
    fn take_unknown_chars_seq(&self, def: &CharDefinition, text: &str, start: &usize) -> String;
    /// Character class (per char.def) for a single character.
    fn classify_char(&self, c: &char) -> &CharDefinition;
    /// All unknown-word morphemes registered under a character class name.
    fn get_unknown_morphemes_by_class(&self, class: &str) -> Vec<(usize, &Morpheme)>;
    /// Connection cost between a left and a right context id, if defined.
    fn transition_cost(&self, left: &usize, right: &usize) -> Option<&i16>;
    /// Occurrence cost of the known word `wid`, if present.
    fn occurrence_cost(&self, wid: &usize) -> Option;
}
21 |
--------------------------------------------------------------------------------
/goya/src/id.rs:
--------------------------------------------------------------------------------
1 | use serde::{Deserialize, Serialize};
2 |
/// Identifier of a word occurrence: a dictionary id paired with the surface
/// form it matched in the input text.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "tag", content = "id")]
pub enum WordIdentifier {
    Known(usize, String),   // ID, surface_form — word found in the dictionary
    Unknown(usize, String), // ID, surface_form — handled by unknown-word rules
}
impl WordIdentifier {
    /// Borrow the surface form, regardless of variant.
    pub fn get_surface(&self) -> &str {
        match self {
            Self::Known(_, surface) => surface,
            Self::Unknown(_, surface) => surface,
        }
    }
}
17 |
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn get_surface_known() {
        // The Known variant exposes its surface form unchanged.
        let id = WordIdentifier::Known(0, String::from("test"));
        assert_eq!(id.get_surface(), "test");
    }

    #[test]
    fn get_surface_unknown() {
        // The Unknown variant behaves identically.
        let id = WordIdentifier::Unknown(0, String::from("test"));
        assert_eq!(id.get_surface(), "test");
    }
}
36 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 Shingo Inoue
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/playground/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "@goya/playground",
3 | "private": true,
4 | "version": "1.0.0",
5 | "scripts": {
6 | "start": "webpack-dev-server --mode development",
7 | "build": "webpack --mode production"
8 | },
9 | "author": "Leko ",
10 | "license": "MIT",
11 | "devDependencies": {
12 | "@mui/styles": "^5.0.1",
13 | "@swc/core": "^1.2.92",
14 | "@vue/preload-webpack-plugin": "^2.0.0",
15 | "@wasm-tool/wasm-pack-plugin": "^1.5.0",
16 | "file-loader": "^6.2.0",
17 | "html-webpack-plugin": "^5.3.2",
18 | "swc-loader": "^0.1.15",
19 | "typescript": "^4.4.3",
20 | "webpack": "^5.56.0",
21 | "webpack-cli": "^4.8.0",
22 | "webpack-dev-server": "^4.3.0",
23 | "workbox-webpack-plugin": "^6.3.0"
24 | },
25 | "dependencies": {
26 | "@emotion/react": "^11.4.1",
27 | "@emotion/styled": "^11.3.0",
28 | "@mui/icons-material": "^5.0.1",
29 | "@mui/lab": "^5.0.0-alpha.49",
30 | "@mui/material": "^5.0.2",
31 | "@mui/x-data-grid": "^5.0.0-beta.0",
32 | "comlink": "^4.3.1",
33 | "react": "^17.0.2",
34 | "react-dom": "^17.0.2",
35 | "react-use": "^17.3.1",
36 | "viz.js": "^2.1.2"
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/playground/src/Dot.tsx:
--------------------------------------------------------------------------------
1 | import React, { useCallback, useEffect, useRef, useState } from "react";
2 | import Box from "@mui/material/Box";
3 | import Button from "@mui/material/Button";
4 | import Viz from "viz.js";
5 | import workerURL from "viz.js/full.render.js";
6 |
7 | type Props = {
8 | dot: string;
9 | };
10 |
11 | const viz = new Viz({ workerURL });
12 |
// Renders a graphviz DOT string (the parse lattice) to SVG via Viz.js and
// offers the result as a downloadable file.
export default function Dot(props: Props) {
  const { dot } = props;
  const [svg, setSVG] = useState("");

  // Download the rendered lattice as an SVG file.
  const handleDownload = useCallback(() => {
    const a = document.createElement("a");
    a.download = "lattice.svg";
    // Fix: the scheme was `data://image/svg+xml,…` — data URLs use a single
    // colon with no slashes (`data:image/svg+xml,…`), so the previous href
    // was invalid and the download did not contain the SVG.
    a.href = `data:image/svg+xml,${encodeURIComponent(svg)}`;
    a.click();
  }, [svg]);

  // Re-render whenever the DOT source changes; skip empty input.
  useEffect(() => {
    if (!dot || dot.trim().length === 0) {
      return;
    }
    viz.renderSVGElement(dot).then((svg: SVGSVGElement) => {
      svg.style.width = "100%";
      svg.style.height = "100%";
      setSVG(svg.outerHTML);
    });
  }, [dot, setSVG]);

  if (!svg) {
    return null;
  }
  // NOTE(review): the JSX below was garbled during extraction (element tags
  // stripped); left as found — recover the markup from upstream.
  return (
    
    
    
    
  );
}
47 |
--------------------------------------------------------------------------------
/scripts/build-dict:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env node
2 | const os = require("os");
3 | const fs = require("fs/promises");
4 | const path = require("path");
5 | const { spawnSync } = require("child_process");
6 |
// Compile the raw IPA dictionary (path given as argv[2]) into binary
// artifacts in a temp dir, then install them under wasm-core/__generated__
// and wasm-features/__generated__.
async function main() {
  const tmp = await fs.mkdtemp(path.join(os.tmpdir(), "goya-dict-"));
  const { status, error } = spawnSync(
    "cargo",
    [
      "+nightly",
      "run",
      "-p",
      "goya-cli",
      "--release",
      "--",
      "--dicdir",
      tmp,
      "compile",
      process.argv[2],
    ],
    { stdio: "inherit" }
  );
  // Fix: the spawnSync result was previously ignored, so a failed compile
  // still moved empty/stale artifacts into place. Abort instead.
  if (error) {
    throw error;
  }
  if (status !== 0) {
    throw new Error(`goya-cli compile exited with status ${status}`);
  }

  // Install the generated dictionary into wasm-core/__generated__.
  const base = path.join(__dirname, "..");
  const generatedDir = path.join(base, "wasm-core", "__generated__");
  await fs.rm(generatedDir, { recursive: true, force: true });
  await fs.mkdir(path.dirname(generatedDir), { recursive: true });
  await fs.rename(tmp, generatedDir);

  // features.bin is only consumed by wasm-features; relocate it there.
  const generatedDir2 = path.join(base, "wasm-features", "__generated__");
  await fs.rm(generatedDir2, { recursive: true, force: true });
  await fs.mkdir(generatedDir2, { recursive: true });
  await fs.rename(
    path.join(generatedDir, "features.bin"),
    path.join(generatedDir2, "features.bin")
  );
}

main().catch((e) => {
  console.error(e.stack);
  process.exit(1);
});
45 |
--------------------------------------------------------------------------------
/benchmarks/bench.js:
--------------------------------------------------------------------------------
1 | import { EOL } from "os";
2 | import path from "path";
3 | import fs from "fs";
4 | import Benchmark from "benchmark";
5 | import kuromoji from "kuromoji";
6 | import core from "wasm-core";
7 | import features from "wasm-features";
8 |
// Build a kuromoji tokenizer backed by its bundled IPA dictionary.
function buildKuromojiTokenizer() {
  const dicPath = path.join(
    path.dirname(new URL(import.meta.url).pathname),
    "node_modules",
    "kuromoji",
    "dict"
  );
  return new Promise((resolve, reject) => {
    kuromoji.builder({ dicPath }).build((err, tokenizer) => {
      if (err) {
        return reject(err);
      }
      resolve(tokenizer);
    });
  });
}

// Initialize both goya wasm modules and kuromoji in parallel.
const [, , tokenizer] = await Promise.all([
  core.ready(),
  features.ready(),
  buildKuromojiTokenizer(),
]);

// Benchmark corpus: one sentence per line on STDIN.
const lines = fs.readFileSync("/dev/stdin", "utf8").trim().split(EOL);
const suite = new Benchmark.Suite();
suite
  .add("goya", () => {
    for (const line of lines) {
      const lattice = core.parse(line);
      features.get_features(lattice.find_best().map(({ wid }) => wid));
    }
  })
  .add("kuromoji", () => {
    for (const line of lines) {
      tokenizer.tokenize(line);
    }
  })
  .on("cycle", (event) => {
    console.log(String(event.target));
  })
  // Function expression on purpose: Benchmark binds `this` to the suite.
  .on("complete", function () {
    console.log("Fastest is " + this.filter("fastest").map("name"));
  })
  .run({ async: true });
53 |
--------------------------------------------------------------------------------
/playground/src/goya.worker.ts:
--------------------------------------------------------------------------------
1 | import * as Comlink from "comlink";
2 |
3 | export type Stats = {
4 | loadWasm: number;
5 | loadDict: number;
6 | parse: number;
7 | };
8 |
9 | const kLoad = "loadWasm";
10 | const kDict = "loadDict";
11 | const kParse = "parse";
12 |
13 | const encoder = new TextEncoder();
14 | const decoder = new TextDecoder();
15 |
16 | async function parse(input: ArrayBufferLike): Promise {
17 | performance.mark(kLoad);
18 | const mod = await import(
19 | /* webpackChunkName: "core" */ "../../wasm-core/pkg"
20 | );
21 | performance.mark(kDict);
22 | await mod.ready();
23 | performance.mark(kParse);
24 | const lattice = mod.parse(decoder.decode(input));
25 |
26 | const res = encoder.encode(
27 | JSON.stringify({
28 | stats: {
29 | loadWasm: performance.measure("loadWasm", kLoad, kDict).duration,
30 | loadDict: performance.measure("loadDict", kDict, kParse).duration,
31 | parse: performance.measure("parse", kParse).duration,
32 | },
33 | dot: lattice.as_dot(),
34 | wakachi: lattice.wakachi(),
35 | best: lattice.find_best(),
36 | })
37 | );
38 | return Comlink.transfer(res, [res.buffer]);
39 | }
40 |
41 | async function getFeatures(payload: ArrayBufferLike): Promise {
42 | const mod = await import(
43 | /* webpackChunkName: "features" */ "../../wasm-features/pkg"
44 | );
45 | const features = mod.get_features(JSON.parse(decoder.decode(payload)));
46 | const res = encoder.encode(JSON.stringify(features));
47 | return Comlink.transfer(res, [res.buffer]);
48 | }
49 |
50 | Comlink.expose({ parse, getFeatures });
51 |
--------------------------------------------------------------------------------
/.github/workflows/CD.yml:
--------------------------------------------------------------------------------
1 | name: CD
2 |
3 | on:
4 | push:
5 | tags:
6 | - "v*"
7 |
8 | env:
9 | CARGO_TERM_COLOR: always
10 |
11 | jobs:
12 | crates-io:
13 | runs-on: ubuntu-latest
14 | steps:
15 | - uses: actions/checkout@v2
16 | - uses: actions-rs/toolchain@v1
17 | with:
18 | toolchain: stable
19 | - uses: actions-rs/cargo@v1
20 | with:
21 | command: login
22 | args: ${{ secrets.CRATES_IO_TOKEN }}
23 | - run: cd goya && cargo publish && sleep 30
24 | - run: cd ipadic && cargo publish && sleep 30
25 | - run: cd goya-cli && cargo publish && sleep 30
26 | npm:
27 | runs-on: ubuntu-latest
28 | steps:
29 | - uses: actions/checkout@v2
30 | - uses: actions/setup-node@v2
31 | with:
32 | node-version: "16"
33 | registry-url: "https://registry.npmjs.org"
34 | - uses: actions-rs/toolchain@v1
35 | with:
36 | toolchain: nightly
37 | - run: cargo install wasm-pack
38 | - run: |
39 | NAME='mecab-ipadic.tar.gz'
40 | curl -v -L 'https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7MWVlSDBCSXZMTXM' -o $NAME
41 | tar -zxvf $NAME
42 | rm -rf $NAME
43 | ./scripts/build-dict mecab-ipadic-2.7.0-20070801
44 | - run: ./scripts/build-wasm wasm-core
45 | - run: cd wasm-core/pkg && npm publish
46 | env:
47 | NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}
48 | - run: ./scripts/build-wasm wasm-features
49 | - run: cd wasm-features/pkg && npm publish
50 | env:
51 | NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}
52 |
--------------------------------------------------------------------------------
/.github/workflows/CI.yml:
--------------------------------------------------------------------------------
1 | name: CI
2 |
3 | on:
4 | push:
5 | branches: [main]
6 | pull_request:
7 | branches: [main]
8 |
9 | env:
10 | CARGO_TERM_COLOR: always
11 |
12 | jobs:
13 | cargo:
14 | runs-on: ubuntu-latest
15 | steps:
16 | - uses: actions/checkout@v2
17 | - uses: actions-rs/toolchain@v1
18 | with:
19 | toolchain: stable
20 | - run: |
21 | NAME='mecab-ipadic.tar.gz'
22 | curl -v -L 'https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7MWVlSDBCSXZMTXM' -o $NAME
23 | tar -zxvf $NAME
24 | rm -rf $NAME
25 | cargo run -p goya-cli --release -- compile mecab-ipadic-2.7.0-20070801
26 | - run: cargo clippy --workspace --exclude goya-core --exclude goya-features
27 | - run: cargo build --workspace --exclude goya-core --exclude goya-features
28 | - run: cargo test --workspace --exclude goya-core --exclude goya-features
29 | wasm:
30 | runs-on: ubuntu-latest
31 | steps:
32 | - uses: actions/checkout@v2
33 | - uses: actions/setup-node@v2
34 | with:
35 | node-version: "16"
36 | cache: "npm"
37 | cache-dependency-path: benchmarks/package-lock.json
38 | - uses: actions-rs/toolchain@v1
39 | with:
40 | toolchain: nightly
41 | - run: cargo install wasm-pack
42 | - run: |
43 | NAME='mecab-ipadic.tar.gz'
44 | curl -v -L 'https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7MWVlSDBCSXZMTXM' -o $NAME
45 | tar -zxvf $NAME
46 | rm -rf $NAME
47 | ./scripts/build-dict mecab-ipadic-2.7.0-20070801
48 | - run: |
49 | cd benchmarks
50 | ./scripts/setup
51 | npm i
52 | - run: cd benchmarks && node goya.js < ita-corpus.txt
53 | - run: cd benchmarks && node kuromoji.js < ita-corpus.txt
54 | - run: cd benchmarks && node bench.js < ita-corpus.txt
55 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Goya
2 |
3 | [](https://crates.io/crates/goya)
4 | [](https://docs.rs/goya)
5 |
6 | Goya is a Japanese Morphological Analyzer written in Rust.
7 | The main goal is to compile to WebAssembly for morphological analysis in browsers and other JavaScript runtimes. In addition, it can be used with the CLI and Rust.
8 |
9 | [Try Goya playground](https://goya.pages.dev/). It uses Goya-wasm from a WebWorker.
10 |
11 | ## Getting started
12 |
13 | ### Fetch the latest IPA dictionary
14 |
15 | Download the latest IPA dictionary from [the official Mecab website](https://taku910.github.io/mecab/) and unzip it.
16 |
17 | ### Install Goya CLI
18 |
19 | ```
20 | cargo install goya-cli
21 | ```
22 |
23 | ### Compile the IPA dictionary
24 |
25 | Compile the IPA dictionary to generate a binary dictionary for morphological analysis. It may take a few minutes.
26 |
27 | ```
28 | goya compile /path/to/ipadic
29 | ```
30 |
31 | The binary dictionary will be generated in the `~/.goya` directory by default. You can change the destination with the `--dicdir` option.
32 |
33 | ```
34 | goya --dicdir=/path/to/generated compile /path/to/ipadic
35 | ```
36 |
37 | ### Run Morphological Analysis
38 |
39 | Goya takes input from STDIN. The easiest way is to use the echo command and pipe the string to Goya.
40 |
41 | ```
42 | $ echo すもももももももものうち | goya
43 | すもも 名詞,一般,*,*,*,*,すもも,スモモ,スモモ
44 | も 助詞,係助詞,*,*,*,*,も,モ,モ
45 | もも 名詞,一般,*,*,*,*,もも,モモ,モモ
46 | も 助詞,係助詞,*,*,*,*,も,モ,モ
47 | もも 名詞,一般,*,*,*,*,もも,モモ,モモ
48 | の 助詞,連体化,*,*,*,*,の,ノ,ノ
49 | うち 名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ
50 | EOS
51 | ```
52 |
53 | If you specified the `--dicdir` option when compiling the dictionary, you should also specify it when running the goya command.
54 |
55 | ```
56 | echo すもももももももものうち | goya --dicdir=/path/to/generated
57 | ```
58 |
59 | ## Release
60 |
61 | ```
62 | cargo release --workspace --no-tag --skip-publish --dependent-version Upgrade
63 | git tag v{{VERSION}}
64 | git push origin v{{VERSION}}
65 | ```
66 |
--------------------------------------------------------------------------------
/playground/webpack.config.js:
--------------------------------------------------------------------------------
1 | const path = require("path");
2 | const zlib = require("zlib");
3 | const HtmlWebpackPlugin = require("html-webpack-plugin");
4 | const WasmPackPlugin = require("@wasm-tool/wasm-pack-plugin");
5 | const { GenerateSW: WorkboxPlugin } = require("workbox-webpack-plugin");
6 | const PreloadWebpackPlugin = require("@vue/preload-webpack-plugin");
7 |
8 | const { BROTLI_PARAM_QUALITY, BROTLI_MAX_QUALITY } = zlib.constants;
9 |
10 | const swcOption = {
11 | jsc: {
12 | parser: {
13 | syntax: "typescript",
14 | tsx: true,
15 | dynamicImport: true,
16 | },
17 | target: "es2020",
18 | },
19 | };
20 |
21 | module.exports = {
22 | entry: "./src/index.tsx",
23 | output: {
24 | path: path.resolve(__dirname, "dist"),
25 | filename: "[name].[contenthash].js",
26 | chunkFilename: "[name].[chunkhash].js",
27 | },
28 | resolve: {
29 | extensions: [".tsx", ".ts", ".js"],
30 | },
31 | module: {
32 | rules: [
33 | {
34 | test: /\.tsx?$/,
35 | use: {
36 | loader: "swc-loader",
37 | options: swcOption,
38 | },
39 | },
40 | // It's for Viz.js
41 | {
42 | test: /\.render\.js$/,
43 | use: ["file-loader"],
44 | },
45 | ],
46 | },
47 | plugins: [
48 | new HtmlWebpackPlugin({
49 | template: path.resolve(__dirname, "src", "index.html"),
50 | }),
51 | new PreloadWebpackPlugin({
52 | rel: "preconnect",
53 | fileWhitelist: [/.wasm$/],
54 | }),
55 | new WasmPackPlugin({
56 | crateDirectory: path.resolve(__dirname, "..", "wasm-core"),
57 | forceMode: "production",
58 | }),
59 | new WasmPackPlugin({
60 | crateDirectory: path.resolve(__dirname, "..", "wasm-features"),
61 | forceMode: "production",
62 | }),
63 | ...(process.env.NODE_ENV === "production"
64 | ? [
65 | new WorkboxPlugin({
66 | clientsClaim: true,
67 | skipWaiting: true,
68 | }),
69 | ]
70 | : []),
71 | ],
72 | experiments: {
73 | asyncWebAssembly: true,
74 | },
75 | };
76 |
--------------------------------------------------------------------------------
/goya/src/common_prefix_tree.rs:
--------------------------------------------------------------------------------
1 | use std::collections::BTreeMap;
2 |
/// Trie over dictionary words, used to build the double-array structure.
/// NOTE(review): the field generics were stripped during extraction
/// (`Option`, `BTreeMap` with no parameters) — reconstructed from usage
/// (`children.keys().min() -> Option<&char>`, ids are `usize`).
#[derive(Debug, Default, PartialEq, Eq)]
pub struct CommonPrefixTree {
    /// Word id stored on the node that terminates a registered word;
    /// `None` on intermediate nodes.
    pub id: Option<usize>,
    /// Children keyed by the next character. `BTreeMap` keeps keys sorted,
    /// making DFS traversal order deterministic.
    pub children: BTreeMap<char, CommonPrefixTree>,
}
impl CommonPrefixTree {
    /// True when this node terminates a registered word.
    pub fn can_stop(&self) -> bool {
        self.id.is_some()
    }

    /// Total number of nodes in this subtree, including this node.
    pub fn size(&self) -> usize {
        self.entires_dfs().len()
    }

    /// Smallest child character, if any.
    pub fn min_char(&self) -> Option<&char> {
        self.children.keys().min()
    }

    /// Register `word` under `id`. A trailing NUL sentinel marks word end.
    ///
    /// Fix: the previous recursive insertion called `text.chars().nth(cursor)`
    /// and `text.chars().count()` at every level — O(n²) per word. This
    /// iterative descent visits each character exactly once.
    pub fn append(&mut self, id: usize, word: &str) {
        let mut node = self;
        for c in word.chars().chain(std::iter::once('\0')) {
            node = node.children.entry(c).or_default();
        }
        node.id = Some(id);
    }

    /// All (prefix, node) pairs in depth-first order, starting at this node.
    /// NOTE: "entires" is a historical typo kept for API compatibility.
    pub fn entires_dfs(&self) -> Vec<(String, &CommonPrefixTree)> {
        self.dfs_collect(&String::new())
    }

    // Depth-first collection; children are visited in sorted key order.
    fn dfs_collect(&self, prefix: &str) -> Vec<(String, &CommonPrefixTree)> {
        let mut open = vec![(prefix.to_string(), self)];
        for (c, child) in self.children.iter() {
            let mut substr = String::from(prefix);
            substr.push(*c);
            open.append(&mut child.dfs_collect(&substr));
        }
        open
    }
}
54 |
#[cfg(test)]
mod tests {
    use super::CommonPrefixTree;

    #[test]
    fn builds_a_word_that_has_1_char() {
        // Two overlapping two-char words yield every prefix plus the
        // NUL-terminated full words, in sorted depth-first order.
        let mut trie = CommonPrefixTree::default();
        trie.append(1, "あい");
        trie.append(2, "いう");
        let prefixes: Vec<_> = trie
            .entires_dfs()
            .iter()
            .map(|(prefix, _)| prefix)
            .collect();
        assert_eq!(
            prefixes,
            vec!["", "あ", "あい", "あい\0", "い", "いう", "いう\0"]
        );
    }
}
73 |
--------------------------------------------------------------------------------
/playground/src/Table.tsx:
--------------------------------------------------------------------------------
1 | import { DataGrid } from "@mui/x-data-grid";
2 | import React, { useEffect, useState } from "react";
3 | import { wrap, transfer } from "comlink";
4 |
// NOTE(review): generic type arguments and JSX markup in this component appear
// to have been stripped by text extraction (e.g. `Promise;`, `Record[]`,
// `useState([])`, and the missing element inside `return (…)` — presumably a
// DataGrid). Restore the full source from version control.

// Worker-side API (comlink-wrapped): takes an encoded payload of word ids and
// resolves a buffer of their feature arrays.
interface GoyaFeaturesAPI {
  getFeatures: (input: ArrayBufferLike) => Promise;
}
type Props = {
  rows: Record[];
};

const encoder = new TextEncoder();
const decoder = new TextDecoder();
// One shared worker per module; survives component re-renders.
const worker = wrap(
  new Worker(new URL("./goya.worker.ts", import.meta.url))
);
// Options shared by every column below.
const base = { flex: 1, sortable: false };

export default function Table(props: Props) {
  // features[i] = feature strings for props.rows[i]; filled in asynchronously
  // by the worker, so cells may briefly render undefined.
  const [features, setFeatures] = useState([]);

  const columns = [
    { field: "surface_form", headerName: "表層形", ...base },
    { field: "is_known", headerName: "既知語", ...base },
    { field: "feature_0", headerName: "品詞", ...base },
    { field: "feature_1", headerName: "品詞細分類1", ...base },
    { field: "feature_2", headerName: "品詞細分類2", ...base },
    { field: "feature_3", headerName: "品詞細分類3", ...base },
    { field: "feature_4", headerName: "活用型", ...base },
    { field: "feature_5", headerName: "活用形", ...base },
    { field: "feature_6", headerName: "原形", ...base },
    { field: "feature_7", headerName: "読み", ...base },
    { field: "feature_8", headerName: "発音", ...base },
  ];
  // Merge each incoming row with whatever features have arrived so far.
  const rows = props.rows.map((row, i) => ({
    id: i,
    ...row,
    feature_0: features[i]?.[0],
    feature_1: features[i]?.[1],
    feature_2: features[i]?.[2],
    feature_3: features[i]?.[3],
    feature_4: features[i]?.[4],
    feature_5: features[i]?.[5],
    feature_6: features[i]?.[6],
    feature_7: features[i]?.[7],
    feature_8: features[i]?.[8],
  }));

  useEffect(() => {
    // Clear stale features, then ask the worker for the new rows' features.
    setFeatures([]);
    if (!props.rows) {
      return;
    }
    const wids = props.rows.map((m) => m.wid);
    // Transfer ownership of the encoded buffer to avoid a structured-clone copy.
    const payload = encoder.encode(JSON.stringify(wids));
    worker
      .getFeatures(transfer(payload, [payload.buffer]))
      .then((res) => JSON.parse(decoder.decode(res)))
      .then(setFeatures);
  }, [props.rows]);

  return (
    // NOTE(review): JSX element lost in extraction — see note at top of file.
  );
}
72 |
--------------------------------------------------------------------------------
/playground/src/Result.tsx:
--------------------------------------------------------------------------------
1 | import React, { Suspense, lazy, useState } from "react";
2 | import Box from "@mui/material/Box";
3 | import Stack from "@mui/material/Stack";
4 | import Chip from "@mui/material/Chip";
5 | import Tab from "@mui/material/Tab";
6 | import TabContext from "@mui/lab/TabContext";
7 | import TabList from "@mui/lab/TabList";
8 | import TabPanel from "@mui/lab/TabPanel";
9 | import type { Stats } from "./goya.worker";
10 | import { Typography } from "@mui/material";
11 |
// Which result view is currently shown.
enum ResultTab {
  Wakachi = "Wakachi",
  Table = "Table",
  Dot = "Dot",
}

type Props = {
  dot?: string; // Graphviz source of the lattice
  wakachi?: string[]; // surface forms of the best path
  best?: unknown[] | null; // full morpheme records for the table view
  stats?: Stats;
};

// Code-split the heavy views; each loads only when its tab is first shown.
const Table = lazy(() => import(/* webpackChunkName: "table" */ "./Table"));
const Dot = lazy(() => import(/* webpackChunkName: "dot" */ "./Dot"));

export function Result(props: Props) {
  const { dot, wakachi, best, stats } = props;
  const [tab, setTab] = useState(ResultTab.Wakachi);

  // Tab switch handler; the first argument (the synthetic event) is unused.
  const handleChangeTab = (_: unknown, newValue: ResultTab) => {
    setTab(newValue);
  };

  // NOTE(review): nearly all JSX inside this return (TabContext/TabList/Tab/
  // TabPanel markup, Suspense wrappers, the Table and Dot elements) was lost
  // during text extraction — only a few expression fragments remain below.
  // Restore the full markup from version control.
  return (
    <>

      {wakachi?.join("/")}

      {dot ? : null}

    >
  );
}
75 |
--------------------------------------------------------------------------------
/wasm-core/.gitignore:
--------------------------------------------------------------------------------
1 | pkg
2 |
3 | # Created by https://www.toptal.com/developers/gitignore/api/node
4 | # Edit at https://www.toptal.com/developers/gitignore?templates=node
5 |
6 | ### Node ###
7 | # Logs
8 | logs
9 | *.log
10 | npm-debug.log*
11 | yarn-debug.log*
12 | yarn-error.log*
13 | lerna-debug.log*
14 | .pnpm-debug.log*
15 |
16 | # Diagnostic reports (https://nodejs.org/api/report.html)
17 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
18 |
19 | # Runtime data
20 | pids
21 | *.pid
22 | *.seed
23 | *.pid.lock
24 |
25 | # Directory for instrumented libs generated by jscoverage/JSCover
26 | lib-cov
27 |
28 | # Coverage directory used by tools like istanbul
29 | coverage
30 | *.lcov
31 |
32 | # nyc test coverage
33 | .nyc_output
34 |
35 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
36 | .grunt
37 |
38 | # Bower dependency directory (https://bower.io/)
39 | bower_components
40 |
41 | # node-waf configuration
42 | .lock-wscript
43 |
44 | # Compiled binary addons (https://nodejs.org/api/addons.html)
45 | build/Release
46 |
47 | # Dependency directories
48 | node_modules/
49 | jspm_packages/
50 |
51 | # Snowpack dependency directory (https://snowpack.dev/)
52 | web_modules/
53 |
54 | # TypeScript cache
55 | *.tsbuildinfo
56 |
57 | # Optional npm cache directory
58 | .npm
59 |
60 | # Optional eslint cache
61 | .eslintcache
62 |
63 | # Microbundle cache
64 | .rpt2_cache/
65 | .rts2_cache_cjs/
66 | .rts2_cache_es/
67 | .rts2_cache_umd/
68 |
69 | # Optional REPL history
70 | .node_repl_history
71 |
72 | # Output of 'npm pack'
73 | *.tgz
74 |
75 | # Yarn Integrity file
76 | .yarn-integrity
77 |
78 | # dotenv environment variables file
79 | .env
80 | .env.test
81 | .env.production
82 |
83 | # parcel-bundler cache (https://parceljs.org/)
84 | .cache
85 | .parcel-cache
86 |
87 | # Next.js build output
88 | .next
89 | out
90 |
91 | # Nuxt.js build / generate output
92 | .nuxt
93 | dist
94 |
95 | # Gatsby files
96 | .cache/
97 | # Comment in the public line in if your project uses Gatsby and not Next.js
98 | # https://nextjs.org/blog/next-9-1#public-directory-support
99 | # public
100 |
101 | # vuepress build output
102 | .vuepress/dist
103 |
104 | # Serverless directories
105 | .serverless/
106 |
107 | # FuseBox cache
108 | .fusebox/
109 |
110 | # DynamoDB Local files
111 | .dynamodb/
112 |
113 | # TernJS port file
114 | .tern-port
115 |
116 | # Stores VSCode versions used for testing VSCode extensions
117 | .vscode-test
118 |
119 | # yarn v2
120 | .yarn/cache
121 | .yarn/unplugged
122 | .yarn/build-state.yml
123 | .yarn/install-state.gz
124 | .pnp.*
125 |
126 | ### Node Patch ###
127 | # Serverless Webpack directories
128 | .webpack/
129 |
130 | # End of https://www.toptal.com/developers/gitignore/api/node
131 | mecab-ipadic-2.7.0-20070801
132 |
--------------------------------------------------------------------------------
/playground/.gitignore:
--------------------------------------------------------------------------------
1 | __generated__
2 | pkg
3 |
4 | # Created by https://www.toptal.com/developers/gitignore/api/node
5 | # Edit at https://www.toptal.com/developers/gitignore?templates=node
6 |
7 | ### Node ###
8 | # Logs
9 | logs
10 | *.log
11 | npm-debug.log*
12 | yarn-debug.log*
13 | yarn-error.log*
14 | lerna-debug.log*
15 | .pnpm-debug.log*
16 |
17 | # Diagnostic reports (https://nodejs.org/api/report.html)
18 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
19 |
20 | # Runtime data
21 | pids
22 | *.pid
23 | *.seed
24 | *.pid.lock
25 |
26 | # Directory for instrumented libs generated by jscoverage/JSCover
27 | lib-cov
28 |
29 | # Coverage directory used by tools like istanbul
30 | coverage
31 | *.lcov
32 |
33 | # nyc test coverage
34 | .nyc_output
35 |
36 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
37 | .grunt
38 |
39 | # Bower dependency directory (https://bower.io/)
40 | bower_components
41 |
42 | # node-waf configuration
43 | .lock-wscript
44 |
45 | # Compiled binary addons (https://nodejs.org/api/addons.html)
46 | build/Release
47 |
48 | # Dependency directories
49 | node_modules/
50 | jspm_packages/
51 |
52 | # Snowpack dependency directory (https://snowpack.dev/)
53 | web_modules/
54 |
55 | # TypeScript cache
56 | *.tsbuildinfo
57 |
58 | # Optional npm cache directory
59 | .npm
60 |
61 | # Optional eslint cache
62 | .eslintcache
63 |
64 | # Microbundle cache
65 | .rpt2_cache/
66 | .rts2_cache_cjs/
67 | .rts2_cache_es/
68 | .rts2_cache_umd/
69 |
70 | # Optional REPL history
71 | .node_repl_history
72 |
73 | # Output of 'npm pack'
74 | *.tgz
75 |
76 | # Yarn Integrity file
77 | .yarn-integrity
78 |
79 | # dotenv environment variables file
80 | .env
81 | .env.test
82 | .env.production
83 |
84 | # parcel-bundler cache (https://parceljs.org/)
85 | .cache
86 | .parcel-cache
87 |
88 | # Next.js build output
89 | .next
90 | out
91 |
92 | # Nuxt.js build / generate output
93 | .nuxt
94 | dist
95 |
96 | # Gatsby files
97 | .cache/
98 | # Comment in the public line in if your project uses Gatsby and not Next.js
99 | # https://nextjs.org/blog/next-9-1#public-directory-support
100 | # public
101 |
102 | # vuepress build output
103 | .vuepress/dist
104 |
105 | # Serverless directories
106 | .serverless/
107 |
108 | # FuseBox cache
109 | .fusebox/
110 |
111 | # DynamoDB Local files
112 | .dynamodb/
113 |
114 | # TernJS port file
115 | .tern-port
116 |
117 | # Stores VSCode versions used for testing VSCode extensions
118 | .vscode-test
119 |
120 | # yarn v2
121 | .yarn/cache
122 | .yarn/unplugged
123 | .yarn/build-state.yml
124 | .yarn/install-state.gz
125 | .pnp.*
126 |
127 | ### Node Patch ###
128 | # Serverless Webpack directories
129 | .webpack/
130 |
131 | # End of https://www.toptal.com/developers/gitignore/api/node
132 | mecab-ipadic-2.7.0-20070801
133 |
--------------------------------------------------------------------------------
/goya-cli/src/repl.rs:
--------------------------------------------------------------------------------
1 | use goya::dot;
2 | use goya::double_array::DoubleArray;
3 | use goya::id::WordIdentifier;
4 | use goya::lattice::Lattice;
5 | use goya::word_features::WordFeaturesMap;
6 | use goya_ipadic::ipadic::IPADic;
7 | use std::io::{stdin, stdout, BufRead, BufWriter, Write};
8 | use std::str::FromStr;
9 |
/// Output format selected by the `--format` CLI flag.
pub enum Format {
    /// Graphviz dot rendering of the whole lattice.
    Dot,
    /// One morpheme per line: surface form, a tab, then comma-joined features.
    Plain,
}
impl FromStr for Format {
    type Err = &'static str;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        Ok(match s {
            "dot" => Self::Dot,
            "plain" => Self::Plain,
            _ => return Err("no match"),
        })
    }
}
25 |
/// Everything the REPL loop needs to tokenize and print input lines.
pub struct ReplContext<'a> {
    /// Double-array trie used for surface-form lookup.
    pub da: &'a DoubleArray,
    /// Compiled IPAdic dictionary (morphemes, costs, context ids).
    pub dict: &'a IPADic,
    /// Feature strings for known/unknown words, queried per morpheme id.
    pub word_set: &'a WordFeaturesMap,
    /// Output format selected on the CLI.
    pub format: Format,
}
32 |
33 | pub fn start(opt: ReplContext) -> Result<(), std::io::Error> {
34 | let out = stdout();
35 | let mut out = BufWriter::new(out.lock());
36 |
37 | for line in stdin().lock().lines() {
38 | match line {
39 | Ok(line) if line.is_empty() => continue,
40 | Ok(line) => {
41 | let lattice = Lattice::parse(&line, opt.da, opt.dict);
42 | match opt.format {
43 | Format::Dot => {
44 | writeln!(out, "{}", dot::render(&lattice, opt.dict).unwrap())?;
45 | }
46 | Format::Plain => {
47 | if let Some(path) = lattice.find_best() {
48 | for wid in path.into_iter() {
49 | let (surface_form, features) = match wid {
50 | WordIdentifier::Unknown(id, surface_form) => {
51 | (surface_form, opt.word_set.get_unknown(&id).unwrap())
52 | }
53 | WordIdentifier::Known(id, surface_form) => {
54 | (surface_form, opt.word_set.get_known(&id).unwrap())
55 | }
56 | };
57 | writeln!(
58 | out,
59 | "{}\t{}",
60 | surface_form,
61 | features
62 | .into_iter()
63 | .map(|f| f.to_string())
64 | .collect::>()
65 | .join(",")
66 | )?;
67 | }
68 | writeln!(out, "EOS")?;
69 | out.flush()?;
70 | }
71 | }
72 | }
73 | }
74 | Err(err) => return Err(err),
75 | }
76 | }
77 | Ok(())
78 | }
79 |
--------------------------------------------------------------------------------
/goya-cli/src/main.rs:
--------------------------------------------------------------------------------
1 | mod build;
2 | mod path_util;
3 | mod repl;
4 |
5 | use clap::Parser;
6 | use futures::executor::block_on;
7 | use futures::future;
8 | use goya::double_array::DoubleArray;
9 | use goya_ipadic::ipadic::IPADic;
10 | use path_util::PathUtil;
11 | use repl::Format;
12 | use rkyv::{archived_root, Deserialize, Infallible};
13 | use std::fs;
14 |
// CLI surface of goya-cli. Only `//` comments are added here: on clap-derived
// types a `///` doc comment becomes user-visible help text, which would change
// program output.
// NOTE(review): the `Option` fields below lost their type parameters
// (presumably `Option<String>` and `Option<SubCommand>`) during text
// extraction; restore from version control.
#[derive(Parser)]
struct Opts {
    /// `~/.goya/dict` by default
    #[clap(short, long)]
    dicdir: Option,
    // Parsed via `Format::from_str`: "plain" or "dot".
    #[clap(short, long, default_value = "plain")]
    format: Format,
    #[clap(subcommand)]
    subcmd: Option,
}

// No subcommand means "start the REPL" (see `main`).
#[derive(Parser)]
enum SubCommand {
    Compile(Compile),
    Clean,
}

/// A subcommand for controlling testing
#[derive(Parser)]
struct Compile {
    /// Path to the IPAdic directory
    dicpath: String,
}
38 |
fn main() {
    let opts: Opts = Opts::parse();
    // Default dictionary location: ~/.goya/dict (overridable with --dicdir).
    let base_dir = dirs::home_dir().unwrap().join(".goya");
    let dicdir = opts
        .dicdir
        .unwrap_or_else(|| base_dir.join("dict").to_str().unwrap().to_string());
    match opts.subcmd {
        // `compile <dicpath>`: build the binary dictionary artifacts.
        Some(SubCommand::Compile(c)) => match build::build(&c.dicpath, &dicdir) {
            Ok(_) => {}
            Err(err) => {
                println!("{:?}", err);
            }
        },
        // `clean`: delete the compiled artifacts.
        // NOTE(review): features.bin is not removed here — confirm whether
        // that is intentional.
        Some(SubCommand::Clean) => {
            let util = PathUtil::from(dicdir);
            fs::remove_file(util.da_path()).expect("Failed to delete file");
            fs::remove_file(util.dict_path()).expect("Failed to delete file");
        }
        // No subcommand: load the compiled dictionary and start the REPL.
        _ => {
            let util = PathUtil::from(dicdir);

            // NOTE(review): the turbofish type arguments of
            // `archived_root::(…)` below appear to have been stripped by text
            // extraction; restore the concrete types from version control.
            let da_fut = async {
                let encoded = fs::read(util.da_path()).expect("Failed to load dictionary");
                let archived = unsafe { archived_root::(&encoded[..]) };
                archived.deserialize(&mut Infallible).unwrap()
            };
            let ipadic_fut = async {
                let encoded = fs::read(util.dict_path()).expect("Failed to load vocabulary");
                let archived = unsafe { archived_root::(&encoded[..]) };
                archived.deserialize(&mut Infallible).unwrap()
            };
            let features_fut = async {
                let encoded = fs::read(util.features_path()).expect("Failed to load surfaces");
                rmp_serde::from_slice(&encoded[..]).unwrap()
            };

            let (ipadic, word_set) = block_on(future::join(ipadic_fut, features_fut));
            let da = block_on(da_fut);
            repl::start(repl::ReplContext {
                da: &da,
                dict: &ipadic,
                word_set: &word_set,
                format: opts.format,
            })
            .unwrap();
            // Drop the large structures on background threads, presumably so
            // process exit is not delayed by deallocation on the main thread.
            // NOTE(review): intent inferred from the pattern — confirm.
            std::thread::spawn(move || drop(ipadic));
            std::thread::spawn(move || drop(da));
            std::thread::spawn(move || drop(word_set));
        }
    }
}
90 |
--------------------------------------------------------------------------------
/wasm-core/src/lib.rs:
--------------------------------------------------------------------------------
1 | use goya::dictionary::Dictionary;
2 | use goya::dot;
3 | use goya::double_array::DoubleArray;
4 | use goya::id::WordIdentifier;
5 | use goya::lattice::Lattice;
6 | use goya_ipadic::ipadic::IPADic;
7 | use rkyv::{archived_root, Deserialize, Infallible};
8 | use serde::Serialize;
9 | use wasm_bindgen::prelude::*;
10 |
11 | #[macro_use]
12 | extern crate lazy_static;
13 |
lazy_static! {
    // Deserialized lazily (once) from dictionary binaries embedded at compile
    // time via include_bytes!.
    // NOTE(review): the `archived_root::(…)` turbofish type arguments appear
    // to have been stripped by text extraction; restore from version control.
    static ref DOUBLE_ARRAY: DoubleArray = {
        let archived =
            unsafe { archived_root::(include_bytes!("../__generated__/da.bin")) };
        archived.deserialize(&mut Infallible).unwrap()
    };
    static ref IPADIC: IPADic = {
        let archived =
            unsafe { archived_root::(include_bytes!("../__generated__/dict.bin")) };
        archived.deserialize(&mut Infallible).unwrap()
    };
}
26 |
27 | #[derive(Serialize)]
28 | pub struct WasmMorpheme {
29 | wid: WordIdentifier,
30 | is_known: bool,
31 | surface_form: String,
32 | left_context_id: usize,
33 | right_context_id: usize,
34 | cost: i16,
35 | }
36 | impl WasmMorpheme {}
37 |
/// JS-facing wrapper around a parsed morpheme lattice.
#[wasm_bindgen]
pub struct WasmLattice {
    lattice: Lattice,
}
// NOTE(review): several return-type parameters below (`Vec`, the
// `impl Iterator- + '_` item type) appear to have been stripped by text
// extraction; restore from version control.
#[wasm_bindgen]
impl WasmLattice {
    /// Renders the whole lattice as a Graphviz dot string.
    pub fn as_dot(&self) -> String {
        dot::render(&self.lattice, &*IPADIC).unwrap()
    }

    /// Surface forms along the best path, one JS value per morpheme.
    pub fn wakachi(&self) -> Vec {
        self.best_morphemes()
            .map(|morpheme| serde_wasm_bindgen::to_value(&morpheme.surface_form).unwrap())
            .collect()
    }

    /// Full morpheme records along the best path, serialized for JS.
    pub fn find_best(&self) -> Vec {
        self.best_morphemes()
            .map(|morpheme| serde_wasm_bindgen::to_value(&morpheme).unwrap())
            .collect()
    }

    // Iterates morphemes on the lowest-cost path. Panics (unwrap) if the
    // lattice has no best path at all.
    fn best_morphemes(&self) -> impl Iterator- + '_ {
        self.lattice
            .find_best()
            .map(|path| {
                path.into_iter().map(|wid| {
                    let morpheme = IPADIC.get(&wid).unwrap();
                    let (surface_form, is_known) = match &wid {
                        WordIdentifier::Known(_, s) => (s.to_string(), true),
                        WordIdentifier::Unknown(_, s) => (s.to_string(), false),
                    };
                    WasmMorpheme {
                        wid,
                        is_known,
                        surface_form,
                        left_context_id: morpheme.left_context_id,
                        right_context_id: morpheme.right_context_id,
                        cost: morpheme.cost,
                    }
                })
            })
            .unwrap()
    }
}
83 |
/// Eagerly initializes both lazily-deserialized dictionaries so a later call
/// to `parse` doesn't pay the one-time cost. Each `async` block is itself
/// synchronous; the main purpose is exposing an awaitable init hook to JS.
#[wasm_bindgen]
pub async fn ready() {
    futures::join!(async { lazy_static::initialize(&IPADIC) }, async {
        lazy_static::initialize(&DOUBLE_ARRAY)
    });
}
90 |
/// Tokenizes `text` into a morpheme lattice using the embedded dictionaries.
#[wasm_bindgen]
pub fn parse(text: &str) -> WasmLattice {
    WasmLattice {
        lattice: Lattice::parse(text, &DOUBLE_ARRAY, &*IPADIC),
    }
}
97 |
--------------------------------------------------------------------------------
/goya-cli/src/build.rs:
--------------------------------------------------------------------------------
1 | use super::path_util::PathUtil;
2 | use bytesize::ByteSize;
3 | use console::{style, Emoji};
4 | use goya::common_prefix_tree::CommonPrefixTree;
5 | use goya::double_array::DoubleArray;
6 | use goya_ipadic::ipadic::IPADic;
7 | use goya_ipadic::ipadic_loader::IPADicLoader;
8 | use rkyv::ser::{serializers::AllocSerializer, Serializer};
9 | use std::error::Error;
10 | use std::fs;
11 | use std::time::Instant;
12 |
13 | const LOOKING_GLASS: Emoji = Emoji("🔍", "");
14 | const PAPER: Emoji = Emoji("📃", "");
15 | const CLIP: Emoji = Emoji("🔗", "");
16 | const SPARKLE: Emoji = Emoji("✨", "");
17 | const TRUCK: Emoji = Emoji("🚚", "");
18 |
19 | pub fn build(src_dir: &str, dist_dir: &str) -> Result<(), Box
> {
20 | PathUtil::from(dist_dir.to_string());
21 | let timer = Instant::now();
22 | eprintln!(
23 | "{} {} Loading dictionary...",
24 | style("[1/4]").bold().dim(),
25 | LOOKING_GLASS
26 | );
27 | let loader = IPADicLoader {};
28 | let mut loaded = loader.load(src_dir)?;
29 |
30 | eprintln!(
31 | "{} {} Analyzing vocabulary...",
32 | style("[2/4]").bold().dim(),
33 | PAPER
34 | );
35 | let mut cpt = CommonPrefixTree::default();
36 | for (id, surface) in loaded.surfaces.iter() {
37 | cpt.append(*id, surface);
38 | }
39 |
40 | eprintln!(
41 | "{} {} Recompiling dictionary...",
42 | style("[3/4]").bold().dim(),
43 | CLIP
44 | );
45 | let da = DoubleArray::from_cpt(&cpt);
46 |
47 | // DoubleArray only has one ID per surface form.
48 | let used_wids = da.wids().collect();
49 | loaded.ipadic.shrink_to_wids(&used_wids);
50 |
51 | eprintln!(
52 | "{} {} Exporting dictionary...",
53 | style("[4/4]").bold().dim(),
54 | TRUCK
55 | );
56 | let util = PathUtil::from(dist_dir.to_string());
57 | util.mkdirp().expect("Failed to create directory");
58 |
59 | let mut serializer = AllocSerializer::<256>::default();
60 | serializer.serialize_value(&da).unwrap();
61 | let bytes = serializer.into_serializer().into_inner();
62 | fs::write(util.da_path(), &bytes).expect("Failed to write dictionary");
63 | eprintln!("DoubleArray stats:");
64 | eprintln!(" elements: {}", da.base.len());
65 | eprintln!(" bytes: {}", ByteSize(bytes.len() as u64));
66 |
67 | let mut serializer = AllocSerializer::<256>::default();
68 | serializer
69 | .serialize_value::(&loaded.ipadic)
70 | .unwrap();
71 | let bytes = serializer.into_serializer().into_inner();
72 | fs::write(util.dict_path(), &bytes).expect("Failed to write dictionary");
73 | eprintln!("Dictionary stats:");
74 | eprintln!(" bytes: {}", ByteSize(bytes.len() as u64));
75 |
76 | let bytes = rmp_serde::to_vec(&loaded.word_set).unwrap();
77 | fs::write(util.features_path(), &bytes).expect("Failed to write word features");
78 | eprintln!("Word features stats:");
79 | eprintln!(" bytes: {}", ByteSize(bytes.len() as u64));
80 |
81 | let end = timer.elapsed();
82 | eprintln!(
83 | "{} Done in {}.{:03}s",
84 | SPARKLE,
85 | end.as_secs(),
86 | end.subsec_millis()
87 | );
88 | Ok(())
89 | }
90 |
--------------------------------------------------------------------------------
/goya/src/word_features.rs:
--------------------------------------------------------------------------------
1 | use super::id::WordIdentifier;
2 | use indexmap::IndexSet;
3 | use serde::{Deserialize, Serialize};
4 | use std::str::from_utf8_unchecked;
5 |
/// Compact store of word feature strings: every distinct string is interned
/// once into `index`, and each word keeps only indices into it.
// NOTE(review): the field type parameters (`Vec,`) appear to have been
// stripped by text extraction (likely Vec<u8>, Vec<usize>, Vec<WordFeatures>);
// restore from version control.
#[derive(Debug, Serialize, Deserialize, rkyv::Archive, rkyv::Serialize, rkyv::Deserialize)]
pub struct WordFeaturesMap {
    // Concatenated bytes of all interned feature strings.
    #[serde(with = "serde_bytes")]
    index: Vec,
    // Exclusive byte end of each interned string within `index`.
    offsets: Vec,
    known: Vec,   // index = morpheme ID
    unknown: Vec, // index = morpheme ID
}
14 | impl WordFeaturesMap {
15 | pub fn new(known: Vec>, unknown: Vec>) -> WordFeaturesMap {
16 | let mut tmp_index: IndexSet = IndexSet::new();
17 | for features in known.iter().chain(unknown.iter()) {
18 | for f in features.iter() {
19 | tmp_index.insert(f.to_string());
20 | }
21 | }
22 | let mut index = vec![];
23 | let mut offsets: Vec = vec![0; tmp_index.len()];
24 | offsets[0] = tmp_index.get_index(0).unwrap().as_bytes().len();
25 | for (idx, str) in tmp_index.iter().enumerate() {
26 | index.append(&mut str.to_string().into_bytes());
27 | if idx > 0 {
28 | offsets[idx] = offsets[idx - 1] + str.as_bytes().len();
29 | }
30 | }
31 |
32 | WordFeaturesMap {
33 | known: known
34 | .into_iter()
35 | .map(|f| {
36 | WordFeatures::new(f.iter().map(|s| tmp_index.get_full(s).unwrap().0).collect())
37 | })
38 | .collect(),
39 | unknown: unknown
40 | .into_iter()
41 | .map(|f| {
42 | WordFeatures::new(f.iter().map(|s| tmp_index.get_full(s).unwrap().0).collect())
43 | })
44 | .collect(),
45 | index,
46 | offsets,
47 | }
48 | }
49 |
50 | pub fn get(&self, wid: &WordIdentifier) -> Option> {
51 | match wid {
52 | WordIdentifier::Known(wid, _) => self.get_known(wid),
53 | WordIdentifier::Unknown(wid, _) => self.get_unknown(wid),
54 | }
55 | }
56 |
57 | pub fn get_known(&self, wid: &usize) -> Option> {
58 | self.known.get(*wid).map(|f| self.get_string(f))
59 | }
60 |
61 | pub fn get_unknown(&self, wid: &usize) -> Option> {
62 | self.unknown.get(*wid).map(|f| self.get_string(f))
63 | }
64 |
65 | fn get_string(&self, f: &WordFeatures) -> Vec<&str> {
66 | f.0.iter()
67 | .map(|idx| {
68 | let idx = *idx;
69 | let end = self.offsets[idx];
70 | if idx == 0 {
71 | unsafe { from_utf8_unchecked(&self.index[0..end]) }
72 | } else {
73 | unsafe { from_utf8_unchecked(&self.index[(self.offsets[idx - 1])..end]) }
74 | }
75 | })
76 | .collect()
77 | }
78 | }
79 |
80 | /// > 5カラム目以降は, ユーザ定義の CSV フィールドです. 基本的に どんな内容でも CSV の許す限り追加することができます.
81 | /// > https://taku910.github.io/mecab/dic-detail.html
82 | #[derive(Debug, Serialize, Deserialize, rkyv::Archive, rkyv::Serialize, rkyv::Deserialize)]
83 | pub struct WordFeatures(Vec);
84 | impl WordFeatures {
85 | pub fn new(features: Vec) -> WordFeatures {
86 | WordFeatures(features)
87 | }
88 | }
89 |
--------------------------------------------------------------------------------
/playground/src/App.tsx:
--------------------------------------------------------------------------------
1 | import React, { useCallback, useState } from "react";
2 | import AppBar from "@mui/material/AppBar";
3 | import Toolbar from "@mui/material/Toolbar";
4 | import IconButton from "@mui/material/IconButton";
5 | import Container from "@mui/material/Container";
6 | import Box from "@mui/material/Box";
7 | import Typography from "@mui/material/Typography";
8 | import TextField from "@mui/material/TextField";
9 | import GitHubIcon from "@mui/icons-material/GitHub";
10 | import { useDebounce } from "react-use";
11 | import { wrap, transfer } from "comlink";
12 | import type { Stats } from "./goya.worker";
13 | import { Result } from "./Result";
14 |
// NOTE(review): generic type arguments and most JSX markup in this component
// appear stripped by text extraction (AppBar/Toolbar/TextField/Result markup
// is missing from the return; only text nodes remain). Restore the full
// source from version control.

// Worker-side API (comlink-wrapped): parses encoded UTF-8 text and returns a
// serialized result buffer.
interface GoyaCoreAPI {
  parse: (input: ArrayBufferLike) => Promise;
}
const worker = wrap(
  new Worker(new URL("./goya.worker.ts", import.meta.url))
);
const encoder = new TextEncoder();
const decoder = new TextDecoder();
// Allow deep-linking the initial input via ?text=…
const initText = new URL(location.href).searchParams.get("text");

export function App() {
  const [text, setText] = useState(initText ?? "すもももももももものうち");
  const [result, setResult] = useState<{
    dot: string;
    wakachi: string[];
    best: unknown[];
    stats: Stats;
  } | null>(null);

  const handleChangeText = useCallback(
    (event) => {
      setText(event.target.value.trim());
    },
    [setText]
  );
  // Debounce parsing by 200 ms so the worker isn't hit on every keystroke.
  useDebounce(
    () => {
      if (text.length === 0) {
        setResult(null);
      } else {
        // Transfer ownership of the encoded buffer to avoid a copy.
        const input = encoder.encode(text);
        worker
          .parse(transfer(input, [input.buffer]))
          .then((res) => decoder.decode(res))
          .then((res) => JSON.parse(res))
          .then(setResult);
      }
    },
    200,
    [text]
  );

  // NOTE(review): surrounding JSX lost in extraction; only text content remains.
  return (
    <>

          Goya playground

        Goya: Yet another Japanese morphological analyzer for Rust and
        WebAssembly

        Goya: WebAssemblyで利用可能な日本語の形態素解析ライブラリ

    >
  );
}
105 |
--------------------------------------------------------------------------------
/ipadic/src/ipadic.rs:
--------------------------------------------------------------------------------
1 | use goya::char_class::CharClassifier;
2 | use goya::char_class::CharDefinition;
3 | use goya::dictionary::Dictionary;
4 | use goya::morpheme::Morpheme;
5 | use indexmap::IndexSet;
6 | use serde::{Deserialize, Serialize};
7 | use std::collections::HashMap;
8 | use std::collections::HashSet;
9 | use std::iter::FromIterator;
10 | use std::vec::Vec;
11 |
// TODO: Make it newtype idiom
type MorphemeIndex = usize;

/// Compiled IPAdic dictionary: deduplicated vocabulary, connection-cost
/// matrix, and character-class tables for unknown-word handling.
// NOTE(review): the field type parameters below (`Vec,`, `HashMap>`) appear
// to have been stripped by text extraction; restore the concrete types from
// version control.
#[derive(Debug, Serialize, Deserialize, rkyv::Archive, rkyv::Serialize, rkyv::Deserialize)]
pub struct IPADic {
    vocabulary: Vec, // index = morpheme ID
    homonyms: Vec>,  // index = morpheme ID
    classes: CharClassifier,
    // matrix[left][right] = connection cost between context ids.
    matrix: Vec>,
    /// 1つのカテゴリに複数の素性を定義してもかまいません. 学習後, 適切なコスト値が 自動的に与えられます.
    /// https://taku910.github.io/mecab/learn.html#config
    unknown_classes: HashMap>,
    unknown_vocabulary: Vec, // index = morpheme ID
    // Deduplicated morphemes; `vocabulary`/`unknown_vocabulary` store indices
    // into this set.
    vocabulary_index: IndexSet,
}
// Read-side `Dictionary` implementation backed by the compiled tables.
// NOTE(review): some generic parameters below (`Option<&Vec>`,
// `collect::>()`, `Option`) appear to have been stripped by text extraction;
// restore from version control.
impl Dictionary for IPADic {
    // Known morphemes are stored as indices into the deduplicated
    // `vocabulary_index` set.
    fn get_known_morpheme(&self, wid: &usize) -> Option<&Morpheme> {
        self.vocabulary
            .get(*wid)
            .map(|idx| self.vocabulary_index.get_index(*idx).unwrap())
    }

    fn get_unknown_morpheme(&self, wid: &usize) -> Option<&Morpheme> {
        self.unknown_vocabulary
            .get(*wid)
            .map(|idx| self.vocabulary_index.get_index(*idx).unwrap())
    }

    // Presumably the ids of all entries sharing `wid`'s surface form —
    // confirm against the Dictionary trait docs.
    fn resolve_homonyms(&self, wid: &usize) -> Option<&Vec> {
        self.homonyms.get(*wid)
    }

    fn take_unknown_chars_seq(&self, def: &CharDefinition, text: &str, start: &usize) -> String {
        self.classes.take_unknown_chars(def, text, start)
    }

    fn classify_char(&self, c: &char) -> &CharDefinition {
        self.classes.classify(c)
    }

    // Panics (unwrap) when `class` is not a key of `unknown_classes`; callers
    // are expected to pass names produced by `classify_char`.
    fn get_unknown_morphemes_by_class(&self, class: &str) -> Vec<(usize, &Morpheme)> {
        self.unknown_classes
            .get(class)
            .unwrap()
            .iter()
            .map(|wid| (*wid, self.unknown_vocabulary.get(*wid).unwrap()))
            .map(|(wid, idx)| (wid, self.vocabulary_index.get_index(*idx).unwrap()))
            .collect::>()
    }

    // Connection cost between context ids; None when either id falls outside
    // the matrix.
    fn transition_cost(&self, left: &usize, right: &usize) -> Option<&i16> {
        if let Some(rights) = self.matrix.get(*left) {
            if let Some(cost) = rights.get(*right) {
                return Some(cost);
            }
        }
        None
    }

    // Per-word occurrence cost, taken from the known-morpheme record.
    fn occurrence_cost(&self, wid: &usize) -> Option {
        self.get_known_morpheme(wid).map(|w| w.cost)
    }
}
impl IPADic {
    /// Assembles an `IPADic` from its pre-built parts; see the field comments
    /// on the struct for what each table holds.
    // NOTE(review): parameter type arguments appear stripped by text
    // extraction; restore from version control.
    pub fn from(
        vocabulary: Vec,
        homonyms: Vec>,
        classes: CharClassifier,
        matrix: Vec>,
        unknown_classes: HashMap>,
        unknown_vocabulary: Vec,
        vocabulary_index: IndexSet,
    ) -> IPADic {
        IPADic {
            vocabulary,
            homonyms,
            classes,
            matrix,
            unknown_classes,
            unknown_vocabulary,
            vocabulary_index,
        }
    }

    /// Empties the homonym list of every morpheme ID not contained in `wids`,
    /// shrinking the dictionary to the ids the double array actually kept.
    pub fn shrink_to_wids(&mut self, wids: &Vec) {
        let set: HashSet = HashSet::from_iter(wids.iter().cloned());
        for idx in 0..self.homonyms.len() {
            if set.contains(&idx) {
                continue;
            }
            self.homonyms[idx] = vec![];
        }
    }
}
106 |
--------------------------------------------------------------------------------
/goya/src/dot.rs:
--------------------------------------------------------------------------------
1 | use super::{
2 | dictionary::Dictionary,
3 | lattice::{Lattice, BOS_CONTEXT_ID, EOS_CONTEXT_ID},
4 | };
5 | use std::{error::Error, fmt::Write};
6 |
7 | const BOLD: &str = " penwidth=3";
8 |
// Renders `lattice` as a Graphviz digraph: one node per (position, candidate)
// morpheme plus BOS/EOS, edges labeled with transition costs, and the best
// path drawn bold (see `BOLD`).
// NOTE(review): this signature's generic parameter and return type
// (`render(… dict: &D) -> Result>`) appear to have been stripped by text
// extraction — presumably `<D: Dictionary>` and `Result<String, Box<dyn
// Error>>`; restore from version control.
pub fn render(lattice: &Lattice, dict: &D) -> Result> {
    // The last DP cell holds the accumulated cost of the best full path,
    // shown on the EOS label.
    let cursor = (lattice.dp.len() - 1, 0);
    let len = lattice.indices.len();
    let best_path = lattice.find_best_path();
    let mut dot = String::from("");
    writeln!(
        dot,
        r#"digraph lattice {{
rankdir=LR;
splines=polyline;
nodesep=.05;

BOS [label="BOS\n0 (0)" shape="doublecircle"{}];
EOS [label="EOS\n{} (0)" shape="doublecircle"{}];
"#,
        BOLD,
        lattice.dp[cursor.0].get(cursor.1).unwrap().0,
        BOLD
    )?;
    // One node per candidate morpheme at each starting position.
    for (i, index) in lattice.indices.iter().enumerate() {
        for (j, (left_wid, wlen)) in index.iter().enumerate() {
            let left = dict.get(left_wid).unwrap();
            // Bold the node if (i + 1, j) lies on the best path.
            let node_style = match &best_path {
                Some(best_path) if best_path.contains(&(i + 1, j)) => BOLD,
                _ => "",
            };
            writeln!(
                dot,
                r#" "{}_{}" [label="{}\n({}, {})"{}];"#,
                i,
                j,
                left_wid.get_surface(),
                lattice.dp[i + 1][j].0,
                left.cost,
                node_style,
            )?;
            // Morphemes starting at position 0 get an edge from BOS.
            if i == 0 {
                let right = left;
                let cost = dict
                    .transition_cost(&BOS_CONTEXT_ID, &right.right_context_id)
                    .unwrap();
                let bos_edge_style = match &best_path {
                    Some(best_path) if best_path.contains(&(i + 1, j)) => BOLD,
                    _ => "",
                };
                writeln!(
                    dot,
                    r#" BOS -> "{}_{}" [label="({})"{}];"#,
                    i, j, cost, bos_edge_style
                )?;
            }
            // Morphemes reaching the end of the input connect to EOS instead
            // of to following candidates.
            if i + wlen >= len {
                let cost = dict
                    .transition_cost(&left.left_context_id, &EOS_CONTEXT_ID)
                    .unwrap();
                let eos_edge_style = match &best_path {
                    Some(best_path) if best_path.contains(&(i + 1, j)) => BOLD,
                    _ => "",
                };
                writeln!(
                    dot,
                    r#" "{}_{}" -> EOS [label="({})"{}];"#,
                    i, j, cost, eos_edge_style
                )?;
                continue;
            }
            // Otherwise, edge to every candidate starting where this one ends;
            // an edge is bold only when both endpoints are on the best path.
            for (k, (right_wid, _)) in lattice.indices[i + wlen].iter().enumerate() {
                let right = dict.get(right_wid).unwrap();
                let cost = dict
                    .transition_cost(&left.left_context_id, &right.right_context_id)
                    .unwrap();
                let edge_style = match &best_path {
                    Some(best_path)
                        if best_path.contains(&(i + 1, j))
                            && best_path.contains(&(i + wlen + 1, k)) =>
                    {
                        BOLD
                    }
                    _ => "",
                };
                writeln!(
                    dot,
                    r#" "{}_{}" -> "{}_{}" [label="({})"{}];"#,
                    i,
                    j,
                    i + wlen,
                    k,
                    cost,
                    edge_style
                )?;
            }
        }
    }
    writeln!(dot, "}}")?;
    Ok(dot)
}
105 |
--------------------------------------------------------------------------------
/goya/src/char_class.rs:
--------------------------------------------------------------------------------
1 | use serde::{Deserialize, Serialize};
2 | use std::collections::{HashMap, HashSet};
3 |
/// Fallback class name used when a character matches no configured range.
const CLASS_DEFAULT: &str = "DEFAULT";
5 |
/// Controls when unknown-word candidates are generated for a character class.
#[derive(
    Debug, PartialEq, Eq, Serialize, Deserialize, rkyv::Archive, rkyv::Serialize, rkyv::Deserialize,
)]
pub enum InvokeTiming {
    /// Generate unknown words only when no known word starts at the position.
    Fallback,
    /// Always generate unknown words, even alongside dictionary matches.
    Always,
}
13 | #[derive(
14 | Debug, PartialEq, Eq, Serialize, Deserialize, rkyv::Archive, rkyv::Serialize, rkyv::Deserialize,
15 | )]
16 | pub struct CharDefinition {
17 | pub class: String,
18 | pub timing: InvokeTiming,
19 | pub group_by_same_kind: bool,
20 | pub len: usize,
21 | pub compatibilities: HashSet, // elements = class name
22 | }
23 | impl CharDefinition {
24 | pub fn compatible_with(&self, class_name: &str) -> bool {
25 | self.class.eq(class_name) || self.compatibilities.contains(class_name)
26 | }
27 | }
28 |
29 | #[derive(Debug, Serialize, Deserialize, rkyv::Archive, rkyv::Serialize, rkyv::Deserialize)]
30 | pub struct CharClass {
31 | range: (u32, u32),
32 | class: String,
33 | }
34 | impl CharClass {
35 | pub fn from(range: (u32, u32), class: String) -> CharClass {
36 | CharClass { range, class }
37 | }
38 |
39 | pub fn in_range(&self, c: &char) -> bool {
40 | let code = *c as u32;
41 | self.range.0 <= code && code <= self.range.1
42 | }
43 | }
44 |
45 | #[derive(Debug, Serialize, Deserialize, rkyv::Archive, rkyv::Serialize, rkyv::Deserialize)]
46 | pub struct CharClassifier {
47 | chars: HashMap,
48 | ranges: Vec,
49 | }
50 | impl CharClassifier {
51 | pub fn from(chars: HashMap, ranges: Vec) -> CharClassifier {
52 | CharClassifier { chars, ranges }
53 | }
54 |
55 | pub fn classify(&self, c: &char) -> &CharDefinition {
56 | let class = self.get_class_name(c);
57 | self.chars.get(class).unwrap()
58 | }
59 |
60 | pub fn take_unknown_chars(&self, def: &CharDefinition, text: &str, start: &usize) -> String {
61 | if !def.group_by_same_kind {
62 | return text.chars().skip(*start).take(def.len).collect();
63 | }
64 |
65 | let mut len = 0;
66 | text.chars()
67 | .enumerate()
68 | .skip(*start)
69 | .take_while(|(_, c)| {
70 | if def.len != 0 && len >= def.len || !def.compatible_with(self.get_class_name(c)) {
71 | return false;
72 | }
73 | len += 1;
74 | true
75 | })
76 | .map(|(_, c)| c)
77 | .collect()
78 | }
79 |
80 | fn get_class_name(&self, c: &char) -> &str {
81 | self.ranges
82 | .iter()
83 | .find(|class| class.in_range(c))
84 | .map(|class| class.class.as_str())
85 | .unwrap_or_else(|| CLASS_DEFAULT)
86 | }
87 | }
88 |
#[cfg(test)]
mod tests {
    use super::*;

    // Without extra compatibilities a definition only matches its own class.
    #[test]
    fn compatible_with_without_compatibilities() {
        let def_a = CharDefinition {
            class: String::from("A"),
            timing: InvokeTiming::Always,
            group_by_same_kind: false,
            len: 2,
            compatibilities: HashSet::new(),
        };
        assert_eq!(def_a.compatible_with("A"), true);
        assert_eq!(def_a.compatible_with("B"), false);
    }

    // Registered compatibilities extend the match beyond the class itself.
    #[test]
    fn compatible_with_with_compatibilities() {
        let mut compatibilities = HashSet::new();
        compatibilities.insert(String::from("B"));
        let def_a = CharDefinition {
            class: String::from("A"),
            timing: InvokeTiming::Always,
            group_by_same_kind: false,
            len: 2,
            compatibilities,
        };
        assert_eq!(def_a.compatible_with("A"), true);
        assert_eq!(def_a.compatible_with("B"), true);
        assert_eq!(def_a.compatible_with("C"), false);
    }

    // Range bounds are inclusive on both ends.
    #[test]
    fn in_range() {
        let class = CharClass::from((1, 2), String::new());
        assert_eq!(class.in_range(&(0 as char)), false);
        assert_eq!(class.in_range(&(1 as char)), true);
        assert_eq!(class.in_range(&(2 as char)), true);
        assert_eq!(class.in_range(&(3 as char)), false);
    }
}
131 |
--------------------------------------------------------------------------------
/benchmarks/package-lock.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "benchmarks",
3 | "version": "0.0.0",
4 | "lockfileVersion": 2,
5 | "requires": true,
6 | "packages": {
7 | "": {
8 | "version": "0.0.0",
9 | "hasInstallScript": true,
10 | "license": "ISC",
11 | "dependencies": {
12 | "kuromoji": "^0.1.2",
13 | "wasm-core": "../wasm-core/pkg",
14 | "wasm-features": "../wasm-features/pkg"
15 | },
16 | "devDependencies": {
17 | "benchmark": "^2.1.4"
18 | }
19 | },
20 | "../wasm-core/pkg": {
21 | "name": "goya-core",
22 | "version": "0.1.1",
23 | "license": "Apache-2.0 OR MIT"
24 | },
25 | "../wasm-features/pkg": {
26 | "name": "goya-features",
27 | "version": "0.1.1",
28 | "license": "Apache-2.0 OR MIT"
29 | },
30 | "node_modules/async": {
31 | "version": "2.6.3",
32 | "resolved": "https://registry.npmjs.org/async/-/async-2.6.3.tgz",
33 | "integrity": "sha512-zflvls11DCy+dQWzTW2dzuilv8Z5X/pjfmZOWba6TNIVDm+2UDaJmXSOXlasHKfNBs8oo3M0aT50fDEWfKZjXg==",
34 | "dependencies": {
35 | "lodash": "^4.17.14"
36 | }
37 | },
38 | "node_modules/benchmark": {
39 | "version": "2.1.4",
40 | "resolved": "https://registry.npmjs.org/benchmark/-/benchmark-2.1.4.tgz",
41 | "integrity": "sha1-CfPeMckWQl1JjMLuVloOvzwqVik=",
42 | "dev": true,
43 | "dependencies": {
44 | "lodash": "^4.17.4",
45 | "platform": "^1.3.3"
46 | }
47 | },
48 | "node_modules/doublearray": {
49 | "version": "0.0.2",
50 | "resolved": "https://registry.npmjs.org/doublearray/-/doublearray-0.0.2.tgz",
51 | "integrity": "sha1-Yxhv6NNEEydtNiH2qg7F954ifvk="
52 | },
53 | "node_modules/kuromoji": {
54 | "version": "0.1.2",
55 | "resolved": "https://registry.npmjs.org/kuromoji/-/kuromoji-0.1.2.tgz",
56 | "integrity": "sha512-V0dUf+C2LpcPEXhoHLMAop/bOht16Dyr+mDiIE39yX3vqau7p80De/koFqpiTcL1zzdZlc3xuHZ8u5gjYRfFaQ==",
57 | "dependencies": {
58 | "async": "^2.0.1",
59 | "doublearray": "0.0.2",
60 | "zlibjs": "^0.3.1"
61 | }
62 | },
63 | "node_modules/lodash": {
64 | "version": "4.17.21",
65 | "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz",
66 | "integrity": "sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg=="
67 | },
68 | "node_modules/platform": {
69 | "version": "1.3.6",
70 | "resolved": "https://registry.npmjs.org/platform/-/platform-1.3.6.tgz",
71 | "integrity": "sha512-fnWVljUchTro6RiCFvCXBbNhJc2NijN7oIQxbwsyL0buWJPG85v81ehlHI9fXrJsMNgTofEoWIQeClKpgxFLrg==",
72 | "dev": true
73 | },
74 | "node_modules/wasm-core": {
75 | "resolved": "../wasm-core/pkg",
76 | "link": true
77 | },
78 | "node_modules/wasm-features": {
79 | "resolved": "../wasm-features/pkg",
80 | "link": true
81 | },
82 | "node_modules/zlibjs": {
83 | "version": "0.3.1",
84 | "resolved": "https://registry.npmjs.org/zlibjs/-/zlibjs-0.3.1.tgz",
85 | "integrity": "sha1-UBl+2yihxCymWcyLTmqd3W1ERVQ=",
86 | "engines": {
87 | "node": "*"
88 | }
89 | }
90 | },
91 | "dependencies": {
92 | "async": {
93 | "version": "2.6.3",
94 | "resolved": "https://registry.npmjs.org/async/-/async-2.6.3.tgz",
95 | "integrity": "sha512-zflvls11DCy+dQWzTW2dzuilv8Z5X/pjfmZOWba6TNIVDm+2UDaJmXSOXlasHKfNBs8oo3M0aT50fDEWfKZjXg==",
96 | "requires": {
97 | "lodash": "^4.17.14"
98 | }
99 | },
100 | "benchmark": {
101 | "version": "2.1.4",
102 | "resolved": "https://registry.npmjs.org/benchmark/-/benchmark-2.1.4.tgz",
103 | "integrity": "sha1-CfPeMckWQl1JjMLuVloOvzwqVik=",
104 | "dev": true,
105 | "requires": {
106 | "lodash": "^4.17.4",
107 | "platform": "^1.3.3"
108 | }
109 | },
110 | "doublearray": {
111 | "version": "0.0.2",
112 | "resolved": "https://registry.npmjs.org/doublearray/-/doublearray-0.0.2.tgz",
113 | "integrity": "sha1-Yxhv6NNEEydtNiH2qg7F954ifvk="
114 | },
115 | "kuromoji": {
116 | "version": "0.1.2",
117 | "resolved": "https://registry.npmjs.org/kuromoji/-/kuromoji-0.1.2.tgz",
118 | "integrity": "sha512-V0dUf+C2LpcPEXhoHLMAop/bOht16Dyr+mDiIE39yX3vqau7p80De/koFqpiTcL1zzdZlc3xuHZ8u5gjYRfFaQ==",
119 | "requires": {
120 | "async": "^2.0.1",
121 | "doublearray": "0.0.2",
122 | "zlibjs": "^0.3.1"
123 | }
124 | },
125 | "lodash": {
126 | "version": "4.17.21",
127 | "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz",
128 | "integrity": "sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg=="
129 | },
130 | "platform": {
131 | "version": "1.3.6",
132 | "resolved": "https://registry.npmjs.org/platform/-/platform-1.3.6.tgz",
133 | "integrity": "sha512-fnWVljUchTro6RiCFvCXBbNhJc2NijN7oIQxbwsyL0buWJPG85v81ehlHI9fXrJsMNgTofEoWIQeClKpgxFLrg==",
134 | "dev": true
135 | },
136 | "wasm-core": {
137 | "version": "file:../wasm-core/pkg"
138 | },
139 | "wasm-features": {
140 | "version": "file:../wasm-features/pkg"
141 | },
142 | "zlibjs": {
143 | "version": "0.3.1",
144 | "resolved": "https://registry.npmjs.org/zlibjs/-/zlibjs-0.3.1.tgz",
145 | "integrity": "sha1-UBl+2yihxCymWcyLTmqd3W1ERVQ="
146 | }
147 | }
148 | }
149 |
--------------------------------------------------------------------------------
/goya/src/double_array.rs:
--------------------------------------------------------------------------------
1 | use super::common_prefix_tree::CommonPrefixTree;
2 | use indexmap::IndexSet;
3 | use itertools::Itertools;
4 | use serde::{Deserialize, Serialize};
5 | use std::cmp;
6 | use std::collections::HashMap;
7 |
/// Index of the trie's root state in `base`/`check` (slot 0 is never used as a state).
const INDEX_ROOT: usize = 1;
/// Sentinel character whose transition marks the end of a word.
const TERM_CHAR: char = '\0';
10 |
/// Reasons a double-array transition can fail.
#[derive(Debug)]
pub enum TransitionError {
    /// The computed target index was negative (the source base held a negated word id).
    AlreadyTerminated,
    /// `base` has no entry at the target index.
    BaseFailed,
    /// `check` at the target does not point back at the source state.
    CheckFailed,
    /// The input char is not registered in `codes`.
    UnknownChar,
    /// The source index is outside `base`.
    BaseOutOfBounds,
    /// The target index is outside `check`.
    CheckOutOfBounds,
}
20 |
21 | #[derive(Debug, Serialize, Deserialize, rkyv::Archive, rkyv::Serialize, rkyv::Deserialize)]
22 | pub struct DoubleArray {
23 | pub codes: IndexSet,
24 | pub base: Vec,
25 | pub check: Vec,
26 | }
27 | impl Default for DoubleArray {
28 | fn default() -> Self {
29 | let base: Vec = vec![0, 1];
30 | let check: Vec = vec![0, 0];
31 | let mut codes: IndexSet = IndexSet::new();
32 |
33 | codes.insert(TERM_CHAR);
34 |
35 | DoubleArray { base, check, codes }
36 | }
37 | }
38 | impl DoubleArray {
39 | pub fn from(base: Vec, check: Vec, codes: IndexSet) -> Self {
40 | DoubleArray { base, check, codes }
41 | }
42 |
43 | pub fn wids(&self) -> impl Iterator- + '_ {
44 | self.base
45 | .iter()
46 | .filter(|s| **s < 0)
47 | .map(|s| as_usize(&(s * -1)))
48 | }
49 |
50 | pub fn from_cpt(trie: &CommonPrefixTree) -> Self {
51 | let mut state_cache = HashMap::new();
52 | let mut da = DoubleArray::default();
53 | let mut chars = trie
54 | .entires_dfs()
55 | .iter()
56 | .map(|(prefix, _)| prefix)
57 | .join("")
58 | .chars()
59 | .collect::
>();
60 | chars.sort_unstable();
61 | chars.dedup();
62 | for c in chars {
63 | da.insert_to_codes(c);
64 | }
65 |
66 | for (prefix, node) in trie.entires_dfs() {
67 | if node.can_stop() {
68 | continue;
69 | }
70 |
71 | // root node
72 | if prefix.is_empty() {
73 | for next_c in node.children.keys() {
74 | let next_char_code = da.get_code(next_c).unwrap();
75 | let t = da.base[INDEX_ROOT] + next_char_code as i32;
76 | let t = as_usize(&t);
77 | da.insert_to_check(t, INDEX_ROOT);
78 | state_cache.insert(concat_char_to_str(&prefix, *next_c), t);
79 | }
80 | continue;
81 | }
82 |
83 | let s = *state_cache.get(&prefix).unwrap();
84 | da.insert_to_base(s, da.find_next_s(node));
85 | for (next_c, child) in node.children.iter() {
86 | let t = da.base.get(s).unwrap() + da.get_code(next_c).unwrap() as i32;
87 | let t = as_usize(&t);
88 | da.insert_to_check(t, s);
89 | if child.can_stop() {
90 | da.insert_to_base(t, -(child.id.unwrap() as i32));
91 | } else {
92 | let key = concat_char_to_str(&prefix, *next_c);
93 | state_cache.insert(key, t);
94 | }
95 | }
96 | }
97 | da.base.shrink_to_fit();
98 | da.check.shrink_to_fit();
99 | da.codes.shrink_to_fit();
100 | da
101 | }
102 |
103 | pub fn transition(
104 | &self,
105 | from: usize,
106 | to: char,
107 | ) -> Result<(i32, Option), TransitionError> {
108 | let code = self.get_code(&to).ok_or(TransitionError::UnknownChar)?;
109 | let s = self
110 | .base
111 | .get(from)
112 | .ok_or(TransitionError::BaseOutOfBounds)?;
113 | let t = s + code as i32;
114 | if t < 0 {
115 | return Err(TransitionError::AlreadyTerminated);
116 | }
117 | let next = self
118 | .check
119 | .get(as_usize(&t))
120 | .ok_or(TransitionError::CheckOutOfBounds)?;
121 | let base = self
122 | .base
123 | .get(t as usize)
124 | .ok_or(TransitionError::BaseFailed)?;
125 | let wid = if *base < 0 {
126 | Some((base * -1) as usize)
127 | } else {
128 | None
129 | };
130 | if *next == from {
131 | Ok((t, wid))
132 | } else {
133 | Err(TransitionError::CheckFailed)
134 | }
135 | }
136 |
137 | pub fn init(&self, to: char) -> Result<(i32, Option), TransitionError> {
138 | self.transition(INDEX_ROOT, to)
139 | }
140 |
141 | pub fn stop(&self, from: usize) -> Result {
142 | match self.transition(from, TERM_CHAR) {
143 | Ok((_, Some(wid))) => Ok(wid),
144 | Ok(_) => unreachable!("Successful transition, but no wid"),
145 | Err(reason) => Err(reason),
146 | }
147 | }
148 |
149 | pub fn get_code(&self, c: &char) -> Option {
150 | self.codes.get_full(c).map(|(code, _)| code)
151 | }
152 |
153 | fn insert_to_codes(&mut self, c: char) -> usize {
154 | let (char_code, _) = self.codes.insert_full(c);
155 | char_code
156 | }
157 |
158 | fn insert_to_base(&mut self, index: usize, value: i32) {
159 | let resized = cmp::max(self.base.len(), index + 1);
160 | self.base.resize(resized, 0);
161 | assert_eq!(
162 | self.base[index], 0,
163 | "index={} already used: {:?}",
164 | index, self.base
165 | );
166 | self.base[index] = value;
167 | }
168 |
169 | fn insert_to_check(&mut self, index: usize, value: usize) {
170 | let resized = cmp::max(self.check.len(), index + 1);
171 | self.check.resize(resized, 0);
172 | self.check[index] = value;
173 | }
174 |
175 | fn get_available_check_index(&self, left: usize) -> usize {
176 | self.check
177 | .iter()
178 | .enumerate()
179 | .skip(left)
180 | // clippy says that `find is prefered to skip_while+next` but it's slower than the current
181 | .skip_while(|(_, value)| value != &&0)
182 | .next()
183 | .map(|(i, _)| i)
184 | .unwrap_or_else(|| unreachable!("index must be found"))
185 | }
186 |
187 | fn find_next_s(&self, child: &CommonPrefixTree) -> i32 {
188 | let mut position = self.get_available_check_index(INDEX_ROOT + 1);
189 | let min_code = self.get_code(child.min_char().unwrap()).unwrap();
190 | let offsets: Vec<_> = child
191 | .children
192 | .keys()
193 | .map(|c| self.get_code(c).unwrap() - min_code)
194 | .collect();
195 | while offsets
196 | .iter()
197 | .any(|code| match self.check.get(position + code) {
198 | Some(0) => false,
199 | Some(_) => true,
200 | _ => false,
201 | })
202 | {
203 | position += 1;
204 | }
205 | (position - min_code) as i32
206 | }
207 | }
208 |
/// Converts a non-negative `i32` into `usize`, panicking on negative input.
fn as_usize(n: &i32) -> usize {
    let value = *n;
    assert!(value >= 0, "n({}) should be greater than or equal to 0", value);
    value as usize
}
213 |
/// Returns a new `String` consisting of `text` followed by `c`.
fn concat_char_to_str(text: &str, c: char) -> String {
    let mut out = String::with_capacity(text.len() + c.len_utf8());
    out.push_str(text);
    out.push(c);
    out
}
219 |
--------------------------------------------------------------------------------
/goya/src/lattice.rs:
--------------------------------------------------------------------------------
1 | use super::char_class::{CharDefinition, InvokeTiming};
2 | use super::dictionary::Dictionary;
3 | use super::double_array::DoubleArray;
4 | use super::id::WordIdentifier;
5 | use std::collections::{HashSet, VecDeque};
6 |
/// Context id of the virtual BOS (beginning-of-sentence) morpheme.
pub const BOS_CONTEXT_ID: usize = 0;
/// Context id of the virtual EOS (end-of-sentence) morpheme.
pub const EOS_CONTEXT_ID: usize = 0;
/// DP-table row index representing BOS (used as the backtracking sentinel).
const NODE_BOS: usize = 0;
10 |
11 | #[derive(Debug)]
12 | pub struct Lattice {
13 | // (wid, length of the word)
14 | pub indices: Vec>,
15 | // (min cost, index, length)
16 | pub dp: Vec>,
17 | }
18 | impl Lattice {
19 | pub fn parse(text: &str, da: &DoubleArray, dict: &D) -> Lattice {
20 | let len = text.chars().count();
21 | let mut indices: Vec> = vec![vec![]; len];
22 | let mut open_indices = VecDeque::from(vec![0]);
23 | let mut visited = HashSet::with_capacity(len);
24 | let char_defs = text
25 | .chars()
26 | .map(|c| dict.classify_char(&c))
27 | .collect::>();
28 |
29 | while let Some(index) = open_indices.pop_front() {
30 | if visited.contains(&index) || index >= len {
31 | continue;
32 | }
33 | visited.insert(index);
34 |
35 | let c = text.chars().nth(index).unwrap();
36 | let def = char_defs[index];
37 | if let InvokeTiming::Always = def.timing {
38 | let surface_form = dict.take_unknown_chars_seq(def, text, &index);
39 | open_indices.push_back(index + surface_form.chars().count());
40 | for (wid, _) in dict.get_unknown_morphemes_by_class(&def.class) {
41 | indices[index].push((
42 | WordIdentifier::Unknown(wid, surface_form.to_string()),
43 | surface_form.chars().count(),
44 | ));
45 | }
46 | }
47 |
48 | if let Ok((mut cursor, _)) = da.init(c) {
49 | if let Ok(wid) = da.stop(cursor as usize) {
50 | open_indices.push_back(index + 1);
51 | for wid in dict.resolve_homonyms(&wid).unwrap().iter() {
52 | indices[index].push((
53 | WordIdentifier::Known(*wid, text.chars().skip(index).take(1).collect()),
54 | 1,
55 | ));
56 | }
57 | }
58 | let mut j = index + 1;
59 | while j < len {
60 | let c = text.chars().nth(j).unwrap();
61 | match da.transition(cursor as usize, c) {
62 | Ok((next, _)) => {
63 | if let Ok(wid) = da.stop(next as usize) {
64 | open_indices.push_back(j + 1);
65 | for wid in dict.resolve_homonyms(&wid).unwrap().iter() {
66 | indices[index].push((
67 | WordIdentifier::Known(
68 | *wid,
69 | text.chars().skip(index).take(j + 1 - index).collect(),
70 | ),
71 | j + 1 - index,
72 | ));
73 | }
74 | }
75 | cursor = next;
76 | }
77 | Err(_) => {
78 | break;
79 | }
80 | }
81 | j += 1;
82 | }
83 | }
84 | if indices[index].is_empty() && matches!(def.timing, InvokeTiming::Fallback) {
85 | let surface_form = dict.take_unknown_chars_seq(def, text, &index);
86 | open_indices.push_back(index + surface_form.chars().count());
87 | for (wid, _) in dict.get_unknown_morphemes_by_class(&def.class) {
88 | indices[index].push((
89 | WordIdentifier::Unknown(wid, surface_form.to_string()),
90 | surface_form.chars().count(),
91 | ));
92 | }
93 | }
94 | }
95 | Lattice {
96 | dp: get_dp_table(&indices, dict),
97 | indices,
98 | }
99 | }
100 |
101 | pub fn word_identifiers(&self) -> Vec {
102 | let mut wids = vec![];
103 | for idx in self.indices.iter() {
104 | for (wid, _) in idx.iter() {
105 | wids.push(wid.clone())
106 | }
107 | }
108 | wids
109 | }
110 |
111 | pub fn find_best_path(&self) -> Option> {
112 | let mut path = vec![];
113 | let mut cursor = (self.dp.len() - 1, 0);
114 | loop {
115 | match self.dp[cursor.0].get(cursor.1) {
116 | Some((_, i, j)) => {
117 | if *i == NODE_BOS {
118 | break;
119 | }
120 | path.insert(0, (*i, *j));
121 | cursor = (*i, *j);
122 | }
123 | _ => return None,
124 | }
125 | }
126 | Some(path)
127 | }
128 |
129 | pub fn find_best(&self) -> Option> {
130 | match self.find_best_path() {
131 | Some(best_path) => {
132 | let mut ids = vec![];
133 | for (i, j) in best_path.iter() {
134 | ids.push(self.indices[*i - 1][*j].0.clone());
135 | }
136 | Some(ids)
137 | }
138 | None => None,
139 | }
140 | }
141 | }
142 |
143 | fn get_dp_table(
144 | indices: &[Vec<(WordIdentifier, usize)>],
145 | dict: &D,
146 | ) -> Vec> {
147 | let len = indices.len();
148 | let max_num_childs = indices.iter().map(|idx| idx.len()).max().unwrap();
149 | // (min cost, idx of indices, idx2 of indices[idx])
150 | // * dp[0][0] means BOS
151 | // * dp[dp.len() - 1][0] means EOS
152 | // Individual cost should be less in i16, the sum of costs can exceed its range.
153 | // Currently each element has unused indices to reduce num alloc
154 | let mut dp: Vec> =
155 | vec![vec![(i32::MAX, 0, 0); max_num_childs]; len + 2];
156 | if max_num_childs == 0 {
157 | return dp;
158 | }
159 | dp[0][0] = (0, 0, 0);
160 |
161 | for (i, (right_wid, _)) in indices[0].iter().enumerate() {
162 | let right = dict.get(right_wid).unwrap();
163 | let cost = dict
164 | .transition_cost(&BOS_CONTEXT_ID, &right.right_context_id)
165 | .unwrap()
166 | + right.cost;
167 | dp[1][i] = (cost as i32, NODE_BOS, 0);
168 | }
169 |
170 | for (i, index) in indices.iter().enumerate() {
171 | for (j, (left_wid, wlen)) in index.iter().enumerate() {
172 | let before_cost = dp[i + 1][j].0;
173 | let left = dict.get(left_wid).unwrap();
174 | if i + wlen >= len {
175 | let cost = (*dict
176 | .transition_cost(&left.left_context_id, &EOS_CONTEXT_ID)
177 | .unwrap() as i32)
178 | + (left.cost as i32)
179 | + before_cost;
180 | if cost < dp[i + wlen + 1][0].0 {
181 | dp[i + wlen + 1][0] = (cost, i + 1, j);
182 | }
183 | continue;
184 | }
185 |
186 | for (k, (right_wid, _)) in indices[i + wlen].iter().enumerate() {
187 | let right = dict.get(right_wid).unwrap();
188 | let cost = (*dict
189 | .transition_cost(&left.left_context_id, &right.right_context_id)
190 | .unwrap() as i32)
191 | + left.cost as i32
192 | + right.cost as i32
193 | + before_cost;
194 | if cost < dp[i + 1 + wlen][k].0 {
195 | dp[i + 1 + wlen][k] = (cost, i + 1, j);
196 | }
197 | }
198 | }
199 | }
200 | dp
201 | }
202 |
--------------------------------------------------------------------------------
/ipadic/src/ipadic_loader.rs:
--------------------------------------------------------------------------------
1 | use super::ipadic::IPADic;
2 | use csv::ReaderBuilder;
3 | use encoding_rs::EUC_JP;
4 | use glob::glob;
5 | use goya::char_class::{CharClass, CharClassifier, CharDefinition, InvokeTiming};
6 | use goya::morpheme::Morpheme;
7 | use goya::word_features::WordFeaturesMap;
8 | use indexmap::IndexSet;
9 | use regex::Regex;
10 | use serde::Deserialize;
11 | use std::collections::{HashMap, HashSet};
12 | use std::error::Error;
13 | use std::fs;
14 | use std::path::Path;
15 | use std::vec::Vec;
16 |
// Column layout of MeCab dictionary CSV rows.
const COL_SURFACE_FORM: usize = 0; // surface form (表層形)
const COL_LEFT_CONTEXT_ID: usize = 1; // left context id (左文脈ID)
const COL_RIGHT_CONTEXT_ID: usize = 2; // right context id (右文脈ID)
const COL_COST: usize = 3; // word cost (コスト)
21 |
22 | pub struct LoadResult {
23 | pub ipadic: IPADic,
24 | pub word_set: WordFeaturesMap,
25 | pub surfaces: HashMap,
26 | }
27 |
28 | pub struct IPADicLoader {}
29 | impl IPADicLoader {
30 | pub fn load(&self, dir: &str) -> Result> {
31 | let classes = load_chars(Path::new(dir).join("char.def"))?;
32 | let matrix = load_matrix(Path::new(dir).join("matrix.def"))?;
33 | let unknown = load_unknown(Path::new(dir).join("unk.def"))?;
34 | let csv_pattern = Path::new(dir).join("*.csv");
35 | let csv_pattern = csv_pattern.to_str().ok_or("Failed to build glob pattern")?;
36 |
37 | let mut vocabulary_index: IndexSet = IndexSet::new();
38 | let mut surfaces = HashMap::new();
39 | let mut known_features = HashMap::new();
40 | let mut vocabulary = HashMap::new();
41 | let mut tmp_homonyms = HashMap::new();
42 | let mut id: usize = 1;
43 | for path in glob(csv_pattern)? {
44 | for row in load_words_csv(path?)? {
45 | surfaces.insert(id, row.surface_form.to_string());
46 | known_features.insert(id, row.features.clone());
47 | tmp_homonyms
48 | .entry(row.surface_form.to_string())
49 | .or_insert_with(Vec::new)
50 | .push(id);
51 |
52 | let (idx, _) = vocabulary_index.insert_full(row.into());
53 | vocabulary.insert(id, idx);
54 | id += 1;
55 | }
56 | }
57 | let mut homonyms: HashMap> = HashMap::new();
58 | for wids in tmp_homonyms.values() {
59 | for wid in wids.iter() {
60 | homonyms.insert(*wid, wids.iter().copied().collect());
61 | }
62 | }
63 |
64 | let mut unknown_vocabulary = HashMap::new();
65 | let mut unknown_features = HashMap::new();
66 | let mut unknown_classes = HashMap::new();
67 | let mut id = 1;
68 | for (class, words) in unknown.into_iter() {
69 | for row in words {
70 | unknown_features.insert(id, row.features.clone());
71 | let (idx, _) = vocabulary_index.insert_full(row.into());
72 | unknown_vocabulary.insert(id, idx);
73 | unknown_classes
74 | .entry(class.to_string())
75 | .or_insert_with(Vec::new)
76 | .push(id);
77 | id += 1;
78 | }
79 | }
80 |
81 | let word_set = WordFeaturesMap::new(
82 | map_to_vec(known_features, Vec::new),
83 | map_to_vec(unknown_features, Vec::new),
84 | );
85 | let ipadic = IPADic::from(
86 | map_to_vec(vocabulary, || 0),
87 | map_to_vec(homonyms, Vec::new),
88 | classes,
89 | matrix,
90 | unknown_classes,
91 | map_to_vec(unknown_vocabulary, || 0),
92 | vocabulary_index,
93 | );
94 | let ret = LoadResult {
95 | word_set,
96 | ipadic,
97 | surfaces,
98 | };
99 | Ok(ret)
100 | }
101 | }
102 |
103 | #[derive(Debug, Clone, Deserialize)]
104 | struct CSVRow {
105 | /// 表層形
106 | /// https://taku910.github.io/mecab/dic-detail.html
107 | surface_form: String,
108 | /// 左文脈ID (単語を左から見たときの文脈 ID)
109 | /// https://taku910.github.io/mecab/dic-detail.html
110 | left_context_id: usize,
111 | /// 右文脈ID (単語を右から見たときの文脈 ID)
112 | /// https://taku910.github.io/mecab/dic-detail.html
113 | right_context_id: usize,
114 | /// 単語コスト (小さいほど出現しやすい)
115 | /// コスト値は short int (16bit 整数) の範囲におさめる必要があります.
116 | cost: i16,
117 | /// 5カラム目以降は, ユーザ定義の CSV フィールドです. 基本的に どんな内容でも CSV の許す限り追加することができます.
118 | /// https://taku910.github.io/mecab/dic-detail.html
119 | features: Vec,
120 | }
121 | impl From for Morpheme {
122 | fn from(row: CSVRow) -> Self {
123 | Morpheme {
124 | left_context_id: row.left_context_id,
125 | right_context_id: row.right_context_id,
126 | cost: row.cost,
127 | }
128 | }
129 | }
130 |
131 | fn load_words_csv(path: P) -> Result, Box>
132 | where
133 | P: AsRef,
134 | {
135 | let eucjp = fs::read(path)?;
136 | let (utf8, _, _) = EUC_JP.decode(&eucjp);
137 | let mut rdr = ReaderBuilder::new()
138 | .has_headers(false)
139 | .from_reader(utf8.as_bytes());
140 | let mut words = vec![];
141 | for row in rdr.records() {
142 | let row = row?;
143 | words.push(CSVRow {
144 | surface_form: row[COL_SURFACE_FORM].to_string(),
145 | left_context_id: row[COL_LEFT_CONTEXT_ID].parse::().unwrap(),
146 | right_context_id: row[COL_RIGHT_CONTEXT_ID].parse::().unwrap(),
147 | cost: row[COL_COST].parse::().unwrap(),
148 | features: row
149 | .iter()
150 | .skip(COL_COST + 1)
151 | .map(|v| v.to_string())
152 | .collect::>(),
153 | })
154 | }
155 | Ok(words)
156 | }
157 |
158 | fn load_chars(path: P) -> Result>
159 | where
160 | P: AsRef,
161 | {
162 | let eucjp = fs::read(path)?;
163 | let (utf8, _, _) = EUC_JP.decode(&eucjp);
164 | let lines = utf8
165 | .lines()
166 | .filter(|line| !line.is_empty() && !line.starts_with('#'))
167 | .map(|line| Regex::new(r"#.*$").unwrap().replace(line, ""))
168 | .collect::>();
169 |
170 | let head = lines.iter().take_while(|line| {
171 | let parts = line.trim().split_ascii_whitespace().collect::>();
172 | !parts[0].starts_with("0x")
173 | });
174 | let mut chars = HashMap::new();
175 | for line in head {
176 | let parts = line.trim().split_ascii_whitespace().collect::>();
177 | let kind = parts[0].to_owned();
178 | let class = kind.to_string();
179 | let timing = if parts[1] == "0" {
180 | InvokeTiming::Fallback
181 | } else {
182 | InvokeTiming::Always
183 | };
184 | let group_by_same_kind = parts[2] == "1";
185 | let len = parts[3].parse::()?;
186 | chars.insert(
187 | kind,
188 | CharDefinition {
189 | class,
190 | timing,
191 | group_by_same_kind,
192 | len,
193 | compatibilities: HashSet::new(),
194 | },
195 | );
196 | }
197 |
198 | let tail = lines.iter().skip_while(|line| {
199 | let parts = line.trim().split_ascii_whitespace().collect::>();
200 | !parts[0].starts_with("0x")
201 | });
202 | let mut ranges = vec![];
203 | for line in tail {
204 | let parts = line.trim().split_ascii_whitespace().collect::>();
205 | let range = parts[0]
206 | .split("..")
207 | .map(|c| u32::from_str_radix(&c[2..], 16).unwrap())
208 | .map(|c| char::from_u32(c).unwrap())
209 | .collect::>();
210 | let range = if range.len() > 1 {
211 | (range[0] as u32, range[1] as u32)
212 | } else {
213 | (range[0] as u32, range[0] as u32)
214 | };
215 | let class = parts[1];
216 | let compatibilities = parts
217 | .iter()
218 | .skip(2)
219 | .map(|s| s.to_string())
220 | .collect::>();
221 | chars.get_mut(class).unwrap().compatibilities = compatibilities;
222 | ranges.push(CharClass::from(range, class.to_string()));
223 | }
224 |
225 | Ok(CharClassifier::from(chars, ranges))
226 | }
227 |
228 | fn load_matrix(path: P) -> Result>, Box>
229 | where
230 | P: AsRef,
231 | {
232 | let eucjp = fs::read(path)?;
233 | let (utf8, _, _) = EUC_JP.decode(&eucjp);
234 | let mut lines = utf8.lines();
235 | let size = lines
236 | .next()
237 | .expect("failed to read the first line")
238 | .split_ascii_whitespace()
239 | .map(|p| p.parse::().unwrap())
240 | .collect::>();
241 | let mut matrix = vec![vec![-1; size[1]]; size[0]];
242 | for line in lines {
243 | let parts = line.split_ascii_whitespace().collect::>();
244 | let left = parts[0].parse::()?;
245 | let right = parts[1].parse::()?;
246 | let cost = parts[2].parse::()?;
247 | matrix[left][right] = cost;
248 | }
249 | Ok(matrix)
250 | }
251 |
252 | fn load_unknown(path: P) -> Result>, Box>
253 | where
254 | P: AsRef,
255 | {
256 | let words = load_words_csv(path)?;
257 | let mut map = HashMap::>::new();
258 | for w in words.into_iter() {
259 | map.entry(w.surface_form.to_string())
260 | .or_insert_with(Vec::new)
261 | .push(w);
262 | }
263 | Ok(map)
264 | }
265 |
/// Converts an id-keyed map into a dense `Vec` indexed by id.
///
/// Index 0 and any unused index are filled with `default()`; callers
/// assign ids contiguously starting at 1, so the result has `len + 1`
/// slots. Panics if a key exceeds `map.len()`.
fn map_to_vec<T: Clone>(map: HashMap<usize, T>, default: impl Fn() -> T) -> Vec<T> {
    let mut ret = vec![default(); map.len() + 1];
    for (idx, value) in map.into_iter() {
        ret[idx] = value;
    }
    ret
}
273 |
--------------------------------------------------------------------------------