├── .gitmodules
├── wasm-features
│   ├── .gitignore
│   ├── Cargo.toml
│   ├── README.md
│   └── src
│       └── lib.rs
├── benchmarks
│   ├── .gitignore
│   ├── README.md
│   ├── goya.js
│   ├── scripts
│   │   └── setup
│   ├── package.json
│   ├── kuromoji.js
│   ├── bench.js
│   └── package-lock.json
├── ipadic
│   ├── src
│   │   ├── lib.rs
│   │   ├── ipadic.rs
│   │   └── ipadic_loader.rs
│   └── Cargo.toml
├── goya-cli
│   ├── src
│   │   ├── exporter.rs
│   │   ├── path_util.rs
│   │   ├── repl.rs
│   │   ├── main.rs
│   │   └── build.rs
│   └── Cargo.toml
├── wasm-core
│   ├── .DS_Store
│   ├── README.md
│   ├── Cargo.toml
│   ├── .gitignore
│   └── src
│       └── lib.rs
├── scripts
│   ├── lookup
│   ├── vercel-install
│   ├── vercel-build
│   ├── build-wasm
│   └── build-dict
├── Cargo.toml
├── goya
│   ├── src
│   │   ├── lib.rs
│   │   ├── morpheme.rs
│   │   ├── dictionary.rs
│   │   ├── id.rs
│   │   ├── common_prefix_tree.rs
│   │   ├── word_features.rs
│   │   ├── dot.rs
│   │   ├── char_class.rs
│   │   ├── double_array.rs
│   │   └── lattice.rs
│   └── Cargo.toml
├── playground
│   ├── src
│   │   ├── index.tsx
│   │   ├── index.html
│   │   ├── Dot.tsx
│   │   ├── goya.worker.ts
│   │   ├── Table.tsx
│   │   ├── Result.tsx
│   │   └── App.tsx
│   ├── tsconfig.json
│   ├── package.json
│   ├── webpack.config.js
│   └── .gitignore
├── .gitignore
├── LICENSE
├── .github
│   └── workflows
│       ├── CD.yml
│       └── CI.yml
└── README.md
/.gitmodules: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /wasm-features/.gitignore: -------------------------------------------------------------------------------- 1 | pkg 2 | -------------------------------------------------------------------------------- /benchmarks/.gitignore: -------------------------------------------------------------------------------- 1 | ita-corpus.txt 2 | -------------------------------------------------------------------------------- /ipadic/src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod ipadic; 2 | pub mod ipadic_loader; 3 | -------------------------------------------------------------------------------- /goya-cli/src/exporter.rs: -------------------------------------------------------------------------------- 1 | enum Format { 2 | Default, 3 | Dot, 4 | } 5 | -------------------------------------------------------------------------------- /wasm-core/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Leko/goya/HEAD/wasm-core/.DS_Store -------------------------------------------------------------------------------- /scripts/lookup: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | cat mecab/mecab-ipadic/*.csv | iconv -f eucjp | grep "^$1" 3 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | 3 | members = [ 4 | "goya-cli", 5 | "goya", 6 | "ipadic", 7 | "wasm-core", 8 | "wasm-features", 9 | ] 10 | 11 | [profile.release] 12 | lto = true 13 | -------------------------------------------------------------------------------- /goya/src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod char_class; 2 | pub mod common_prefix_tree; 3 | pub mod dictionary; 4 | pub mod dot; 5 | pub mod double_array; 6 | pub mod id; 7 | pub mod lattice; 8 | pub mod morpheme; 9 | pub mod word_features; 10 | -------------------------------------------------------------------------------- /scripts/vercel-install: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -eu 3 | 4 | curl https://sh.rustup.rs -sSf | sh -s -- -y 5 | source $HOME/.cargo/env 6 | cargo install wasm-pack 7 | rustup install nightly 8 | 9 | cd playground 10 | npm ci 11 | -------------------------------------------------------------------------------- /benchmarks/README.md: -------------------------------------------------------------------------------- 1 | # Goya benchmarks 2 | 3 | ## Getting started 4 | 5 | ``` 6 | npm i 7 | ./scripts/setup # Generate ita-corpus.txt 8 | 9 | # Run whole process benchmark 10 | node goya.js < ita-corpus.txt 11 | node kuromoji.js < ita-corpus.txt 12 | 13 | # Run morphological analysis benchmark 14 | node bench.js < ita-corpus.txt 15 | ``` 16 | -------------------------------------------------------------------------------- /playground/src/index.tsx: -------------------------------------------------------------------------------- 1 | import React from "react"; 2 | import ReactDOM from "react-dom"; 3 | import { App } from "./App"; 4 | 5 | if ("serviceWorker" in navigator) { 6 | window.addEventListener("load", () => { 7 | navigator.serviceWorker.register("/service-worker.js"); 8 | }); 9 | } 10 | 11 | ReactDOM.render(<App />, document.querySelector("#app")); 12 | -------------------------------------------------------------------------------- /scripts/vercel-build: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -eu 3 | 4 | source $HOME/.cargo/env 5 | 6 | NAME='mecab-ipadic.tar.gz' 7 | curl -v -L 'https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7MWVlSDBCSXZMTXM' -o $NAME 8 | tar -zxvf $NAME 9 | rm -rf $NAME 10 | 11 | DICT='mecab-ipadic-2.7.0-20070801' 12 | ./scripts/build-dict $DICT 13 | cd playground 14 | NODE_ENV=production npm run build 15 | -------------------------------------------------------------------------------- /benchmarks/goya.js: -------------------------------------------------------------------------------- 1 | import { EOL } from "os"; 2 | import fs from "fs"; 3 | import core from "wasm-core"; 4 | import features from "wasm-features"; 5 | 6 | const lines = fs.readFileSync("/dev/stdin", "utf8").trim().split(EOL); 7 | for (const line of lines) { 8 | const lattice = core.parse(line); 9 | const best = lattice.find_best().map(({ wid }) => wid); 10 | features.get_features(best); 11 | } 12 | 13 | console.log(process.memoryUsage()); 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Generated by Cargo 2 | # will have compiled files and executables 3 | /target/ 4 | 5 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 6 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 7 | Cargo.lock 8 | 9 | # These are backup files generated by rustfmt 10 | **/*.rs.bk 11 | .vscode/ 12 | mecab-ipadic-2.7.0-20070801 13 | __generated__ 14 | node_modules 15 | -------------------------------------------------------------------------------- /ipadic/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | authors = ["Leko "] 3 | categories = ["text-processing"] 4 | description = "IPA dictionary for Goya" 5 | edition = "2018" 6 | license = "Apache-2.0 OR MIT" 7 | name = "goya-ipadic" 8 | version = "0.1.9" 9 | 10 | 
[dependencies] 11 | csv = "1.1" 12 | encoding_rs = "0.8" 13 | glob = "0.3" 14 | goya = {version = "^0.1.9", path = "../goya"} 15 | indexmap = {version = "1.7", features = ["serde"]} 16 | regex = "1.5" 17 | rkyv = {version = "0.7.19", features = ["indexmap"]} 18 | serde = {version = "1.0", features = ["derive"]} 19 | -------------------------------------------------------------------------------- /goya/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | authors = ["Leko "] 3 | categories = ["data-structures", "text-processing"] 4 | description = "Yet another morphological analyzer for Rust and WebAssembly" 5 | edition = "2018" 6 | license = "Apache-2.0 OR MIT" 7 | name = "goya" 8 | repository = "https://github.com/Leko/goya" 9 | version = "0.1.9" 10 | 11 | [dependencies] 12 | indexmap = {version = "1.7", features = ["serde"]} 13 | itertools = "0.10" 14 | rkyv = {version = "0.7.19", features = ["indexmap"]} 15 | serde = {version = "1.0", features = ["derive"]} 16 | serde_bytes = "0.11" 17 | -------------------------------------------------------------------------------- /benchmarks/scripts/setup: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | wget https://github.com/mmorise/ita-corpus/raw/main/emotion_transcript_utf8.txt 4 | wget https://github.com/mmorise/ita-corpus/raw/main/recitation_transcript_utf8.txt 5 | 6 | cat emotion_transcript_utf8.txt recitation_transcript_utf8.txt > transcript_utf8.csv 7 | rm -f emotion_transcript_utf8.txt recitation_transcript_utf8.txt 8 | 9 | node <<CODE >> ita-corpus.txt 10 | const lines = require('fs').readFileSync('transcript_utf8.csv', 'utf8') 11 | .trim() 12 | .split('\n') 13 | .map(line => line.split(',')[0].split(':')[1]) 14 | .join('\n') 15 | console.log(lines) 16 | CODE 17 | -------------------------------------------------------------------------------- /benchmarks/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "benchmarks", 3 | "private": true, 4 | "version": "0.0.0", 5 | "type": "module", 6 | "scripts": { 7 | "preinstall": "npm run build:core && npm run build:features", 8 | "build:core": "wasm-pack build --release --target nodejs ../wasm-core", 9 | "build:features": "wasm-pack build --release --target nodejs ../wasm-features", 10 | "test": "echo \"Error: no test specified\" && exit 1" 11 | }, 12 | "author": "", 13 | "license": "ISC", 14 | "dependencies": { 15 | "kuromoji": "^0.1.2", 16 | "wasm-core": "../wasm-core/pkg", 17 | "wasm-features": "../wasm-features/pkg" 18 | }, 19 | "devDependencies": { 20 | "benchmark": "^2.1.4" 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /goya-cli/src/path_util.rs: -------------------------------------------------------------------------------- 1 | use std::fs::create_dir_all; 2 | use std::io; 3 | use std::path::{Path, PathBuf}; 4 | 5 | pub struct PathUtil { 6 | base: String, 7 | } 8 | impl PathUtil { 9 | pub fn from(base: String) -> PathUtil { 10 | PathUtil { base } 11 | } 12 | 13 | pub fn mkdirp(&self) -> io::Result<()> { 14 | create_dir_all(&self.base) 15 | } 16 | 17 | pub fn da_path(&self) -> PathBuf { 18 | Path::new(&self.base).join("da.bin") 19 | } 20 | 21 | pub fn dict_path(&self) -> PathBuf { 22 | Path::new(&self.base).join("dict.bin") 23 | } 24 | 25 | pub fn features_path(&self) -> PathBuf { 26 | Path::new(&self.base).join("features.bin") 27 | } 28 | } 29 | 
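// Layout sketch (illustrative comment; the concrete base path is an assumption
// taken from goya-cli's default): with base = "~/.goya/dict", these helpers
// resolve to ~/.goya/dict/da.bin (double array), ~/.goya/dict/dict.bin
// (morphemes), and ~/.goya/dict/features.bin (word features).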
-------------------------------------------------------------------------------- /wasm-features/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | authors = ["Leko "] 3 | categories = ["wasm", "data-structures", "text-processing"] 4 | description = "WebAssembly binding of Goya" 5 | edition = "2018" 6 | license = "Apache-2.0 OR MIT" 7 | name = "goya-features" 8 | publish = false 9 | repository = "https://github.com/Leko/goya" 10 | version = "0.1.9" 11 | 12 | [lib] 13 | crate-type = ["cdylib"] 14 | 15 | [dependencies] 16 | goya = {version = "^0.1.9", path = "../goya"} 17 | lazy_static = "1.4" 18 | rmp-serde = "1.0.0-beta.2" 19 | serde-wasm-bindgen = "0.3.1" 20 | wasm-bindgen = {version = "0.2.78", features = ["serde-serialize"]} 21 | 22 | [package.metadata.wasm-pack.profile.release] 23 | wasm-opt = ['--dce', '-O4'] 24 | -------------------------------------------------------------------------------- /benchmarks/kuromoji.js: -------------------------------------------------------------------------------- 1 | import { EOL } from "os"; 2 | import fs from "fs"; 3 | import path from "path"; 4 | import kuromoji from "kuromoji"; 5 | 6 | const dicPath = path.join( 7 | path.dirname(new URL(import.meta.url).pathname), 8 | "node_modules", 9 | "kuromoji", 10 | "dict" 11 | ); 12 | 13 | new Promise((resolve, reject) => { 14 | kuromoji.builder({ dicPath }).build((err, tokenizer) => { 15 | if (err) { 16 | reject(err); 17 | } else { 18 | resolve(tokenizer); 19 | } 20 | }); 21 | }).then((tokenizer) => { 22 | const lines = fs.readFileSync("/dev/stdin", "utf8").trim().split(EOL); 23 | for (const line of lines) { 24 | tokenizer.tokenize(line); 25 | } 26 | console.log(process.memoryUsage()); 27 | }); 28 | -------------------------------------------------------------------------------- /playground/src/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Playground | Goya: Yet another morphological analyzer for Rust and 7 | WebAssembly 8 | 9 | 13 | 21 | 22 | 23 |
24 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /wasm-core/README.md: -------------------------------------------------------------------------------- 1 | ## Getting started 2 | 3 | ### Word segmentation (wakachi-gaki) 4 | 5 | Import goya-core and use the `parse` function. The value returned by parse exposes the methods described here. 6 | For word segmentation, use the `wakachi` method. 7 | 8 | ```ts 9 | import core from "goya-core"; 10 | 11 | const lattice = core.parse("すもももももももものうち"); 12 | lattice.wakachi(); // => ["すもも", "も", "もも", "も", "もも", "の", "うち"] 13 | ``` 14 | 15 | ### Morphological analysis 16 | 17 | To get the result of morphological analysis, use the `find_best` method. find_best returns an array of morphemes. Each morpheme has the following fields. To keep the size small, this object does not carry features such as part of speech or readings. 18 | 19 | - wid: vocabulary ID, used by goya-features (described below) 20 | - is_known: true for a known word, false for an unknown word 21 | - surface_form: the surface form 22 | 23 | ```ts 24 | lattice.find_best()[0].surface_form; // => "すもも" 25 | lattice.find_best()[0].is_known; // => true 26 | lattice.find_best()[0].wid; // => described in the next section 27 | ``` 28 | -------------------------------------------------------------------------------- /wasm-features/README.md: -------------------------------------------------------------------------------- 1 | ## Getting started 2 | 3 | ```ts 4 | import core from "goya-core"; 5 | import { get_features } from "wasm-features"; 6 | 7 | // In the default Mecab IPA dictionary, part of speech (POS) is at index 0 8 | const INDEX_POS = 0; 9 | 10 | const lattice = core.parse("すもももももももものうち"); 11 | const morphemes = lattice.find_best(); 12 | // Get an array of features from an array of wids 13 | const features = get_features(morphemes.map((morph) => morph.wid)); 14 | // You can fetch features one at a time, but fetching them in bulk has less overhead and is faster 15 | get_features([morphemes[0].wid]); 16 | 17 | morphemes.forEach(({ surface_form }, i) => { 18 | const feature = features[i]; // Features come back in the same order as the wids passed in 19 | const line = surface_form + "\t" + feature.join(","); 20 | console.log(line); // => "すもも\t名詞,一般,*,*,*,*,すもも,スモモ,スモモ" 21 | console.log(feature[INDEX_POS]); // => "名詞" 22 | }); 23 | ``` 24 | -------------------------------------------------------------------------------- /goya-cli/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | authors = ["Leko "] 3 | categories = ["command-line-interface"] 4 | description = "CLI for Goya" 5 | edition = "2018" 6 | license = "Apache-2.0 OR MIT" 7 | name = "goya-cli" 8 | repository = "https://github.com/Leko/goya" 9 | version = "0.1.9" 10 | 11 | [[bin]] 12 | name = "goya" 13 | path = "src/main.rs" 14 | 15 | [dependencies] 16 | bytesize = {version = "1.1.0", features = ["serde"]} 17 | clap = {version = "3.0.0-rc.9", features = ["derive"]} 18 | console = "0.14" 19 | dirs = "4.0" 20 | futures = "0.3.17" 21 | goya = {version = "^0.1.9", path = "../goya"} 22 | goya-ipadic = {version = "^0.1.9", path = "../ipadic"} 23 | indexmap = {version = "1.7", features = ["serde"]} 24 | rkyv = {version = "0.7.19", features = ["indexmap"]} 25 | rmp-serde = "1.0.0-beta.2" 26 | -------------------------------------------------------------------------------- /scripts/build-wasm: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | cd $1 3 | wasm-pack build --release --out-dir pkg/web --target web 4 | wasm-pack build --release --out-dir pkg/nodejs --target nodejs 5 | 6 | mv pkg/web/README.md pkg/ 7 | mv pkg/nodejs/package.json pkg/ 8 | rm -rf pkg/{web,nodejs}/package.json 9 | 10 | node < path.join('nodejs', f)).concat(pkg.files.map(f => path.join('web', f))) 22 | 23 | fs.writeFileSync('./pkg/package.json', JSON.stringify(pkg, null, 
2)) 24 | CODE 25 | -------------------------------------------------------------------------------- /playground/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "incremental": true /* Enable incremental compilation */, 4 | "jsx": "react", 5 | "target": "es2020" /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */, 6 | "module": "es2020" /* Specify what module code is generated. */, 7 | "lib": ["DOM"], 8 | "moduleResolution": "Node", 9 | "resolveJsonModule": true, 10 | "esModuleInterop": true /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables `allowSyntheticDefaultImports` for type compatibility. */, 11 | "forceConsistentCasingInFileNames": true /* Ensure that casing is correct in imports. */, 12 | "strict": true /* Enable all strict type-checking options. */, 13 | "skipLibCheck": true /* Skip type checking all .d.ts files. */ 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /wasm-core/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | authors = ["Leko "] 3 | categories = ["wasm", "data-structures", "text-processing"] 4 | description = "WebAssembly binding of Goya" 5 | edition = "2018" 6 | license = "Apache-2.0 OR MIT" 7 | name = "goya-core" 8 | publish = false 9 | repository = "https://github.com/Leko/goya" 10 | version = "0.1.9" 11 | 12 | [lib] 13 | crate-type = ["cdylib"] 14 | 15 | [dependencies] 16 | futures = "0.3.17" 17 | goya = {version = "^0.1.9", path = "../goya"} 18 | goya-ipadic = {version = "^0.1.9", path = "../ipadic"} 19 | lazy_static = "1.4" 20 | rkyv = {version = "0.7.19", features = ["indexmap"]} 21 | rmp-serde = "1.0.0-beta.2" 22 | serde = {version = "1.0", features = ["derive"]} 23 | serde-wasm-bindgen = "0.3.1" 24 | wasm-bindgen = {version = "0.2.78", features = ["serde-serialize"]} 25 | wasm-bindgen-futures = "0.4.28" 26 | 27 | [package.metadata.wasm-pack.profile.release] 28 | wasm-opt = ['--dce', '-O4'] 29 | -------------------------------------------------------------------------------- /goya/src/morpheme.rs: -------------------------------------------------------------------------------- 1 | use serde::{Deserialize, Serialize}; 2 | 3 | #[derive( 4 | Debug, 5 | Hash, 6 | PartialEq, 7 | Eq, 8 | PartialOrd, 9 | Clone, 10 | Serialize, 11 | Deserialize, 12 | rkyv::Archive, 13 | rkyv::Serialize, 14 | rkyv::Deserialize, 15 | )] 16 | pub struct Morpheme { 17 | /// Left context ID (the context ID when the word is seen from the left) 18 | /// https://taku910.github.io/mecab/dic-detail.html 19 | pub left_context_id: usize, 20 | /// Right context ID (the context ID when the word is seen from the right) 21 | /// https://taku910.github.io/mecab/dic-detail.html 22 | pub right_context_id: usize, 23 | /// > Word cost (the smaller the cost, the more likely the word appears) 24 | /// > The cost value must fit within the range of a short int (16-bit integer). 25 | pub cost: i16, 26 | } 27 | impl Morpheme { 28 | pub fn new(left_context_id: usize, right_context_id: usize, cost: i16) -> Morpheme { 29 | Morpheme { 30 | left_context_id, 31 | right_context_id, 32 | cost, 33 | } 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /wasm-features/src/lib.rs: -------------------------------------------------------------------------------- 1 | use goya::id::WordIdentifier; 2 | use goya::word_features::WordFeaturesMap; 3 | use wasm_bindgen::prelude::*; 4 | 5 | #[macro_use] 6 | extern crate lazy_static; 7 | 8 | lazy_static! 
{ 9 | static ref WORD_FEATURES: WordFeaturesMap = 10 | rmp_serde::from_slice(include_bytes!("../__generated__/features.bin")).unwrap(); 11 | } 12 | 13 | #[wasm_bindgen] 14 | pub fn get_features(wids: &JsValue) -> JsValue { 15 | let wids: Vec<WordIdentifier> = wids.into_serde().unwrap(); 16 | let features: Vec<Vec<String>> = wids 17 | .iter() 18 | .map(|wid| { 19 | WORD_FEATURES 20 | .get(wid) 21 | .unwrap() 22 | .iter() 23 | .map(|s| s.to_string()) 24 | .collect() 25 | }) 26 | .collect::<Vec<_>>(); 27 | serde_wasm_bindgen::to_value(&features).unwrap() 28 | } 29 | 30 | #[wasm_bindgen] 31 | pub fn ready() { 32 | lazy_static::initialize(&WORD_FEATURES); 33 | } 34 | -------------------------------------------------------------------------------- /goya/src/dictionary.rs: -------------------------------------------------------------------------------- 1 | use super::char_class::CharDefinition; 2 | use super::id::WordIdentifier; 3 | use super::morpheme::Morpheme; 4 | 5 | pub trait Dictionary { 6 | fn get(&self, wid: &WordIdentifier) -> Option<&Morpheme> { 7 | match wid { 8 | WordIdentifier::Known(wid, _) => self.get_known_morpheme(wid), 9 | WordIdentifier::Unknown(wid, _) => self.get_unknown_morpheme(wid), 10 | } 11 | } 12 | fn get_known_morpheme(&self, wid: &usize) -> Option<&Morpheme>; 13 | fn get_unknown_morpheme(&self, wid: &usize) -> Option<&Morpheme>; 14 | fn resolve_homonyms(&self, wid: &usize) -> Option<&Vec<usize>>; 15 | fn take_unknown_chars_seq(&self, def: &CharDefinition, text: &str, start: &usize) -> String; 16 | fn classify_char(&self, c: &char) -> &CharDefinition; 17 | fn get_unknown_morphemes_by_class(&self, class: &str) -> Vec<(usize, &Morpheme)>; 18 | fn transition_cost(&self, left: &usize, right: &usize) -> Option<&i16>; 19 | fn occurrence_cost(&self, wid: &usize) -> Option<i16>; 20 | } 21 | -------------------------------------------------------------------------------- /goya/src/id.rs: -------------------------------------------------------------------------------- 1 | use serde::{Deserialize, Serialize}; 2 | 3 | #[derive(Debug, Clone, Serialize, Deserialize)] 4 | #[serde(tag = "tag", content = "id")] 5 | pub enum WordIdentifier { 6 | Known(usize, String), // ID, surface_form 7 | Unknown(usize, String), // ID, surface_form 8 | } 9 | impl WordIdentifier { 10 | pub fn get_surface(&self) -> &str { 11 | match self { 12 | Self::Known(_, surface) => surface, 13 | Self::Unknown(_, surface) => surface, 14 | } 15 | } 16 | } 17 | 18 | #[cfg(test)] 19 | mod tests { 20 | use super::*; 21 | 22 | #[test] 23 | fn get_surface_known() { 24 | let surface = String::from("test"); 25 | let id = WordIdentifier::Known(0, surface.to_string()); 26 | assert_eq!(id.get_surface(), surface); 27 | } 28 | 29 | #[test] 30 | fn get_surface_unknown() { 31 | let surface = String::from("test"); 32 | let id = WordIdentifier::Unknown(0, surface.to_string()); 33 | assert_eq!(id.get_surface(), surface); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Shingo Inoue 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do 
so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /playground/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@goya/playground", 3 | "private": true, 4 | "version": "1.0.0", 5 | "scripts": { 6 | "start": "webpack-dev-server --mode development", 7 | "build": "webpack --mode production" 8 | }, 9 | "author": "Leko ", 10 | "license": "MIT", 11 | "devDependencies": { 12 | "@mui/styles": "^5.0.1", 13 | "@swc/core": "^1.2.92", 14 | "@vue/preload-webpack-plugin": "^2.0.0", 15 | "@wasm-tool/wasm-pack-plugin": "^1.5.0", 16 | "file-loader": "^6.2.0", 17 | "html-webpack-plugin": "^5.3.2", 18 | "swc-loader": "^0.1.15", 19 | "typescript": "^4.4.3", 20 | "webpack": "^5.56.0", 21 | "webpack-cli": "^4.8.0", 22 | "webpack-dev-server": "^4.3.0", 23 | "workbox-webpack-plugin": "^6.3.0" 24 | }, 25 | "dependencies": { 26 | "@emotion/react": "^11.4.1", 27 | "@emotion/styled": "^11.3.0", 28 | "@mui/icons-material": "^5.0.1", 29 | "@mui/lab": "^5.0.0-alpha.49", 30 | "@mui/material": "^5.0.2", 31 | "@mui/x-data-grid": "^5.0.0-beta.0", 32 | "comlink": "^4.3.1", 33 | "react": "^17.0.2", 34 | "react-dom": "^17.0.2", 35 | "react-use": "^17.3.1", 36 | "viz.js": "^2.1.2" 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /playground/src/Dot.tsx: -------------------------------------------------------------------------------- 1 | import React, { useCallback, useEffect, useRef, useState } from "react"; 2 | import Box from "@mui/material/Box"; 3 | import Button from "@mui/material/Button"; 4 | import Viz from "viz.js"; 5 | import workerURL from "viz.js/full.render.js"; 6 | 7 | type Props = { 8 | dot: string; 9 | }; 10 | 11 | const viz = new Viz({ workerURL }); 12 | 13 | export default function Dot(props: Props) { 14 | const { dot } = props; 15 | const [svg, setSVG] = useState(""); 16 | 17 | const handleDownload = useCallback(() => { 18 | const a = document.createElement("a"); 19 | a.download = "lattice.svg"; 20 | a.href = `data:image/svg+xml,${encodeURIComponent(svg)}`; 21 | a.click(); 22 | }, [svg]); 23 | 24 | useEffect(() => { 25 | if (!dot || dot.trim().length === 0) { 26 | return; 27 | } 28 | viz.renderSVGElement(dot).then((svg: SVGSVGElement) => { 29 | svg.style.width = "100%"; 30 | svg.style.height = "100%"; 31 | setSVG(svg.outerHTML); 32 | }); 33 | }, [dot, setSVG]); 34 | 35 | if (!svg) { 36 | return null; 37 | } 38 | return ( 39 | 40 | 43 |
44 | 45 | ); 46 | } 47 | -------------------------------------------------------------------------------- /scripts/build-dict: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | const os = require("os"); 3 | const fs = require("fs/promises"); 4 | const path = require("path"); 5 | const { spawnSync } = require("child_process"); 6 | 7 | async function main() { 8 | const tmp = await fs.mkdtemp(path.join(os.tmpdir(), "goya-dict-")); 9 | spawnSync( 10 | "cargo", 11 | [ 12 | "+nightly", 13 | "run", 14 | "-p", 15 | "goya-cli", 16 | "--release", 17 | "--", 18 | "--dicdir", 19 | tmp, 20 | "compile", 21 | process.argv[2], 22 | ], 23 | { stdio: "inherit" } 24 | ); 25 | 26 | const base = path.join(__dirname, ".."); 27 | const generatedDir = path.join(base, "wasm-core", "__generated__"); 28 | await fs.rm(generatedDir, { recursive: true, force: true }); 29 | await fs.mkdir(path.dirname(generatedDir), { recursive: true }); 30 | await fs.rename(tmp, generatedDir); 31 | 32 | const generatedDir2 = path.join(base, "wasm-features", "__generated__"); 33 | await fs.rm(generatedDir2, { recursive: true, force: true }); 34 | await fs.mkdir(generatedDir2, { recursive: true }); 35 | await fs.rename( 36 | path.join(generatedDir, "features.bin"), 37 | path.join(generatedDir2, "features.bin") 38 | ); 39 | } 40 | 41 | main().catch((e) => { 42 | console.error(e.stack); 43 | process.exit(1); 44 | }); 45 | -------------------------------------------------------------------------------- /benchmarks/bench.js: -------------------------------------------------------------------------------- 1 | import { EOL } from "os"; 2 | import path from "path"; 3 | import fs from "fs"; 4 | import Benchmark from "benchmark"; 5 | import kuromoji from "kuromoji"; 6 | import core from "wasm-core"; 7 | import features from "wasm-features"; 8 | 9 | const suite = new Benchmark.Suite(); 10 | 11 | const [, , tokenizer] = await Promise.all([ 12 | core.ready(), 13 | features.ready(), 14 | new Promise((resolve, reject) => { 15 | kuromoji 16 | .builder({ 17 | dicPath: path.join( 18 | path.dirname(new URL(import.meta.url).pathname), 19 | "node_modules", 20 | "kuromoji", 21 | "dict" 22 | ), 23 | }) 24 | .build((err, tokenizer) => { 25 | if (err) { 26 | return reject(err); 27 | } 28 | resolve(tokenizer); 29 | }); 30 | }), 31 | ]); 32 | 33 | const lines = fs.readFileSync("/dev/stdin", "utf8").trim().split(EOL); 34 | suite 35 | .add("goya", () => { 36 | for (const line of lines) { 37 | const lattice = core.parse(line); 38 | features.get_features(lattice.find_best().map(({ wid }) => wid)); 39 | } 40 | }) 41 | .add("kuromoji", () => { 42 | for (const line of lines) { 43 | tokenizer.tokenize(line); 44 | } 45 | }) 46 | .on("cycle", (event) => { 47 | console.log(String(event.target)); 48 | }) 49 | .on("complete", function () { 50 | console.log("Fastest is " + this.filter("fastest").map("name")); 51 | }) 52 | .run({ async: true }); 53 | -------------------------------------------------------------------------------- /playground/src/goya.worker.ts: -------------------------------------------------------------------------------- 1 | import * as Comlink from "comlink"; 2 | 3 | export type Stats = { 4 | loadWasm: number; 5 | loadDict: number; 6 | parse: number; 7 | }; 8 | 9 | const kLoad = "loadWasm"; 10 | const kDict = "loadDict"; 11 | const kParse = "parse"; 12 | 13 | const encoder = new TextEncoder(); 14 | const decoder = new TextDecoder(); 15 | 16 | async function parse(input: ArrayBufferLike): Promise { 
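// Three performance marks bracket the phases reported back to the UI as `Stats`:
// kLoad → kDict covers the dynamic import of the wasm module, kDict → kParse
// covers dictionary initialization (mod.ready()), and everything after kParse
// is the lattice parse itself.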
17 | performance.mark(kLoad); 18 | const mod = await import( 19 | /* webpackChunkName: "core" */ "../../wasm-core/pkg" 20 | ); 21 | performance.mark(kDict); 22 | await mod.ready(); 23 | performance.mark(kParse); 24 | const lattice = mod.parse(decoder.decode(input)); 25 | 26 | const res = encoder.encode( 27 | JSON.stringify({ 28 | stats: { 29 | loadWasm: performance.measure("loadWasm", kLoad, kDict).duration, 30 | loadDict: performance.measure("loadDict", kDict, kParse).duration, 31 | parse: performance.measure("parse", kParse).duration, 32 | }, 33 | dot: lattice.as_dot(), 34 | wakachi: lattice.wakachi(), 35 | best: lattice.find_best(), 36 | }) 37 | ); 38 | return Comlink.transfer(res, [res.buffer]); 39 | } 40 | 41 | async function getFeatures(payload: ArrayBufferLike): Promise { 42 | const mod = await import( 43 | /* webpackChunkName: "features" */ "../../wasm-features/pkg" 44 | ); 45 | const features = mod.get_features(JSON.parse(decoder.decode(payload))); 46 | const res = encoder.encode(JSON.stringify(features)); 47 | return Comlink.transfer(res, [res.buffer]); 48 | } 49 | 50 | Comlink.expose({ parse, getFeatures }); 51 | -------------------------------------------------------------------------------- /.github/workflows/CD.yml: -------------------------------------------------------------------------------- 1 | name: CD 2 | 3 | on: 4 | push: 5 | tags: 6 | - "v*" 7 | 8 | env: 9 | CARGO_TERM_COLOR: always 10 | 11 | jobs: 12 | crates-io: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v2 16 | - uses: actions-rs/toolchain@v1 17 | with: 18 | toolchain: stable 19 | - uses: actions-rs/cargo@v1 20 | with: 21 | command: login 22 | args: ${{ secrets.CRATES_IO_TOKEN }} 23 | - run: cd goya && cargo publish && sleep 30 24 | - run: cd ipadic && cargo publish && sleep 30 25 | - run: cd goya-cli && cargo publish && sleep 30 26 | npm: 27 | runs-on: ubuntu-latest 28 | steps: 29 | - uses: actions/checkout@v2 30 | - uses: actions/setup-node@v2 31 | with: 32 | node-version: "16" 33 | registry-url: "https://registry.npmjs.org" 34 | - uses: actions-rs/toolchain@v1 35 | with: 36 | toolchain: nightly 37 | - run: cargo install wasm-pack 38 | - run: | 39 | NAME='mecab-ipadic.tar.gz' 40 | curl -v -L 'https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7MWVlSDBCSXZMTXM' -o $NAME 41 | tar -zxvf $NAME 42 | rm -rf $NAME 43 | ./scripts/build-dict mecab-ipadic-2.7.0-20070801 44 | - run: ./scripts/build-wasm wasm-core 45 | - run: cd wasm-core/pkg && npm publish 46 | env: 47 | NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }} 48 | - run: ./scripts/build-wasm wasm-features 49 | - run: cd wasm-features/pkg && npm publish 50 | env: 51 | NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }} 52 | -------------------------------------------------------------------------------- /.github/workflows/CI.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | pull_request: 7 | branches: [main] 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | 12 | jobs: 13 | cargo: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: actions/checkout@v2 17 | - uses: actions-rs/toolchain@v1 18 | with: 19 | toolchain: stable 20 | - run: | 21 | NAME='mecab-ipadic.tar.gz' 22 | curl -v -L 'https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7MWVlSDBCSXZMTXM' -o $NAME 23 | tar -zxvf $NAME 24 | rm -rf $NAME 25 | cargo run -p goya-cli --release -- compile mecab-ipadic-2.7.0-20070801 26 | - run: cargo clippy --workspace --exclude goya-core --exclude 
goya-features 27 | - run: cargo build --workspace --exclude goya-core --exclude goya-features 28 | - run: cargo test --workspace --exclude goya-core --exclude goya-features 29 | wasm: 30 | runs-on: ubuntu-latest 31 | steps: 32 | - uses: actions/checkout@v2 33 | - uses: actions/setup-node@v2 34 | with: 35 | node-version: "16" 36 | cache: "npm" 37 | cache-dependency-path: benchmarks/package-lock.json 38 | - uses: actions-rs/toolchain@v1 39 | with: 40 | toolchain: nightly 41 | - run: cargo install wasm-pack 42 | - run: | 43 | NAME='mecab-ipadic.tar.gz' 44 | curl -v -L 'https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7MWVlSDBCSXZMTXM' -o $NAME 45 | tar -zxvf $NAME 46 | rm -rf $NAME 47 | ./scripts/build-dict mecab-ipadic-2.7.0-20070801 48 | - run: | 49 | cd benchmarks 50 | ./scripts/setup 51 | npm i 52 | - run: cd benchmarks && node goya.js < ita-corpus.txt 53 | - run: cd benchmarks && node kuromoji.js < ita-corpus.txt 54 | - run: cd benchmarks && node bench.js < ita-corpus.txt 55 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Goya 2 | 3 | [![goya at crates.io](https://img.shields.io/crates/v/goya.svg)](https://crates.io/crates/goya) 4 | [![goya at docs.rs](https://docs.rs/goya/badge.svg)](https://docs.rs/goya) 5 | 6 | Goya is a Japanese morphological analyzer written in Rust. 7 | Its main goal is to compile to WebAssembly so that morphological analysis can run in browsers and other JavaScript runtimes. It can also be used from the CLI or as a Rust crate. 8 | 9 | [Try the Goya playground](https://goya.pages.dev/). It runs the Goya WebAssembly build inside a Web Worker. 10 | 11 | ## Getting started 12 | 13 | ### Fetch the latest IPA dictionary 14 | 15 | Download the latest IPA dictionary from [the official Mecab website](https://taku910.github.io/mecab/) and unzip it. 16 | 17 | ### Install Goya CLI 18 | 19 | ``` 20 | cargo install goya-cli 21 | ``` 22 | 23 | ### Compile the IPA dictionary 24 | 25 | Compile the IPA dictionary into the binary dictionary used for morphological analysis. This may take a few minutes. 26 | 27 | ``` 28 | goya compile /path/to/ipadic 29 | ``` 30 | 31 | The binary dictionary is generated in the `~/.goya` directory by default. You can change the destination with the `--dicdir` option. 32 | 33 | ``` 34 | goya --dicdir=/path/to/generated compile /path/to/ipadic 35 | ``` 36 | 37 | ### Run Morphological Analysis 38 | 39 | Goya reads input from STDIN. The easiest way is to echo a sentence and pipe it to the goya command. 40 | 41 | ``` 42 | $ echo すもももももももものうち | goya 43 | すもも 名詞,一般,*,*,*,*,すもも,スモモ,スモモ 44 | も 助詞,係助詞,*,*,*,*,も,モ,モ 45 | もも 名詞,一般,*,*,*,*,もも,モモ,モモ 46 | も 助詞,係助詞,*,*,*,*,も,モ,モ 47 | もも 名詞,一般,*,*,*,*,もも,モモ,モモ 48 | の 助詞,連体化,*,*,*,*,の,ノ,ノ 49 | うち 名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ 50 | EOS 51 | ``` 52 | 53 | If you specified the `--dicdir` option when compiling the dictionary, you should also specify it when running the goya command. 
54 | 55 | ``` 56 | echo すもももももももものうち | goya --dicdir=/path/to/generated 57 | ``` 58 | 59 | ## Release 60 | 61 | ``` 62 | cargo release --workspace --no-tag --skip-publish --dependent-version Upgrade 63 | git tag v{{VERSION}} 64 | git push origin v{{VERSION}} 65 | ``` 66 | -------------------------------------------------------------------------------- /playground/webpack.config.js: -------------------------------------------------------------------------------- 1 | const path = require("path"); 2 | const zlib = require("zlib"); 3 | const HtmlWebpackPlugin = require("html-webpack-plugin"); 4 | const WasmPackPlugin = require("@wasm-tool/wasm-pack-plugin"); 5 | const { GenerateSW: WorkboxPlugin } = require("workbox-webpack-plugin"); 6 | const PreloadWebpackPlugin = require("@vue/preload-webpack-plugin"); 7 | 8 | const { BROTLI_PARAM_QUALITY, BROTLI_MAX_QUALITY } = zlib.constants; 9 | 10 | const swcOption = { 11 | jsc: { 12 | parser: { 13 | syntax: "typescript", 14 | tsx: true, 15 | dynamicImport: true, 16 | }, 17 | target: "es2020", 18 | }, 19 | }; 20 | 21 | module.exports = { 22 | entry: "./src/index.tsx", 23 | output: { 24 | path: path.resolve(__dirname, "dist"), 25 | filename: "[name].[contenthash].js", 26 | chunkFilename: "[name].[chunkhash].js", 27 | }, 28 | resolve: { 29 | extensions: [".tsx", ".ts", ".js"], 30 | }, 31 | module: { 32 | rules: [ 33 | { 34 | test: /\.tsx?$/, 35 | use: { 36 | loader: "swc-loader", 37 | options: swcOption, 38 | }, 39 | }, 40 | // It's for Viz.js 41 | { 42 | test: /\.render\.js$/, 43 | use: ["file-loader"], 44 | }, 45 | ], 46 | }, 47 | plugins: [ 48 | new HtmlWebpackPlugin({ 49 | template: path.resolve(__dirname, "src", "index.html"), 50 | }), 51 | new PreloadWebpackPlugin({ 52 | rel: "preconnect", 53 | fileWhitelist: [/.wasm$/], 54 | }), 55 | new WasmPackPlugin({ 56 | crateDirectory: path.resolve(__dirname, "..", "wasm-core"), 57 | forceMode: "production", 58 | }), 59 | new WasmPackPlugin({ 60 | crateDirectory: path.resolve(__dirname, "..", "wasm-features"), 61 | forceMode: "production", 62 | }), 63 | ...(process.env.NODE_ENV === "production" 64 | ? 
[ 65 | new WorkboxPlugin({ 66 | clientsClaim: true, 67 | skipWaiting: true, 68 | }), 69 | ] 70 | : []), 71 | ], 72 | experiments: { 73 | asyncWebAssembly: true, 74 | }, 75 | }; 76 | -------------------------------------------------------------------------------- /goya/src/common_prefix_tree.rs: -------------------------------------------------------------------------------- 1 | use std::collections::BTreeMap; 2 | 3 | #[derive(Debug, Default, PartialEq, Eq)] 4 | pub struct CommonPrefixTree { 5 | pub id: Option, 6 | pub children: BTreeMap<char, CommonPrefixTree>, 7 | } 8 | impl CommonPrefixTree { 9 | pub fn can_stop(&self) -> bool { 10 | self.id.is_some() 11 | } 12 | 13 | pub fn size(&self) -> usize { 14 | self.entires_dfs().len() 15 | } 16 | 17 | pub fn min_char(&self) -> Option<&char> { 18 | self.children.keys().min() 19 | } 20 | 21 | pub fn append(&mut self, id: usize, word: &str) { 22 | let mut token = String::from(word); 23 | token.push('\0'); 24 | self.append_chars(id, &token, 0); 25 | } 26 | 27 | pub fn entires_dfs(&self) -> Vec<(String, &CommonPrefixTree)> { 28 | self.dfs_collect(&String::new()) 29 | } 30 | 31 | fn dfs_collect(&self, prefix: &str) -> Vec<(String, &CommonPrefixTree)> { 32 | let mut open = vec![(prefix.to_string(), self)]; 33 | for (c, child) in self.children.iter() { 34 | let mut substr = String::from(prefix); 35 | substr.push(*c); 36 | open.append(&mut child.dfs_collect(&substr)); 37 | } 38 | open 39 | } 40 | 41 | fn append_chars(&mut self, id: usize, text: &str, cursor: usize) { 42 | let c = text.chars().nth(cursor).unwrap(); 43 | let child = self 44 | .children 45 | .entry(c) 46 | .or_insert_with(CommonPrefixTree::default); 47 | if cursor + 1 == text.chars().count() { 48 | child.id = Some(id); 49 | return; 50 | } 51 | child.append_chars(id, text, cursor + 1); 52 | } 53 | } 54 | 55 | #[cfg(test)] 56 | mod tests { 57 | use super::CommonPrefixTree; 58 | 59 | #[test] 60 | fn builds_a_word_that_has_1_char() { 61 | let mut trie = CommonPrefixTree::default(); 62 | trie.append(1, "あい"); 63 | trie.append(2, "いう"); 64 | assert_eq!( 65 | trie.entires_dfs() 66 | .iter() 67 | .map(|(p, _)| p) 68 | .collect::<Vec<_>>(), 69 | vec!["", "あ", "あい", "あい\0", "い", "いう", "いう\0"] 70 | ); 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /playground/src/Table.tsx: -------------------------------------------------------------------------------- 1 | import { DataGrid } from "@mui/x-data-grid"; 2 | import React, { useEffect, useState } from "react"; 3 | import { wrap, transfer } from "comlink"; 4 | 5 | interface GoyaFeaturesAPI { 6 | getFeatures: (input: ArrayBufferLike) => Promise; 7 | } 8 | type Props = { 9 | rows: Record[]; 10 | }; 11 | 12 | const encoder = new TextEncoder(); 13 | const decoder = new TextDecoder(); 14 | const worker = wrap<GoyaFeaturesAPI>( 15 | new Worker(new URL("./goya.worker.ts", import.meta.url)) 16 | ); 17 | const base = { flex: 1, sortable: false }; 18 | 19 | export default function Table(props: Props) { 20 | const [features, setFeatures] = useState<string[][]>([]); 21 | 22 | const columns = [ 23 | { field: "surface_form", headerName: "表層形", ...base }, 24 | { field: "is_known", headerName: "既知語", ...base }, 25 | { field: "feature_0", headerName: "品詞", ...base }, 26 | { field: "feature_1", headerName: "品詞細分類1", ...base }, 27 | { field: "feature_2", headerName: "品詞細分類2", ...base }, 28 | { field: "feature_3", headerName: "品詞細分類3", ...base }, 29 | { field: "feature_4", headerName: "活用型", ...base }, 30 | { field: "feature_5", headerName: "活用形", ...base }, 31 | { field: 
"feature_6", headerName: "原形", ...base }, 32 | { field: "feature_7", headerName: "読み", ...base }, 33 | { field: "feature_8", headerName: "発音", ...base }, 34 | ]; 35 | const rows = props.rows.map((row, i) => ({ 36 | id: i, 37 | ...row, 38 | feature_0: features[i]?.[0], 39 | feature_1: features[i]?.[1], 40 | feature_2: features[i]?.[2], 41 | feature_3: features[i]?.[3], 42 | feature_4: features[i]?.[4], 43 | feature_5: features[i]?.[5], 44 | feature_6: features[i]?.[6], 45 | feature_7: features[i]?.[7], 46 | feature_8: features[i]?.[8], 47 | })); 48 | 49 | useEffect(() => { 50 | setFeatures([]); 51 | if (!props.rows) { 52 | return; 53 | } 54 | const wids = props.rows.map((m) => m.wid); 55 | const payload = encoder.encode(JSON.stringify(wids)); 56 | worker 57 | .getFeatures(transfer(payload, [payload.buffer])) 58 | .then((res) => JSON.parse(decoder.decode(res))) 59 | .then(setFeatures); 60 | }, [props.rows]); 61 | 62 | return ( 63 | 70 | ); 71 | } 72 | -------------------------------------------------------------------------------- /playground/src/Result.tsx: -------------------------------------------------------------------------------- 1 | import React, { Suspense, lazy, useState } from "react"; 2 | import Box from "@mui/material/Box"; 3 | import Stack from "@mui/material/Stack"; 4 | import Chip from "@mui/material/Chip"; 5 | import Tab from "@mui/material/Tab"; 6 | import TabContext from "@mui/lab/TabContext"; 7 | import TabList from "@mui/lab/TabList"; 8 | import TabPanel from "@mui/lab/TabPanel"; 9 | import type { Stats } from "./goya.worker"; 10 | import { Typography } from "@mui/material"; 11 | 12 | enum ResultTab { 13 | Wakachi = "Wakachi", 14 | Table = "Table", 15 | Dot = "Dot", 16 | } 17 | 18 | type Props = { 19 | dot?: string; 20 | wakachi?: string[]; 21 | best?: unknown[] | null; 22 | stats?: Stats; 23 | }; 24 | 25 | const Table = lazy(() => import(/* webpackChunkName: "table" */ "./Table")); 26 | const Dot = lazy(() => import(/* webpackChunkName: "dot" */ "./Dot")); 27 | 28 | export function Result(props: Props) { 29 | const { dot, wakachi, best, stats } = props; 30 | const [tab, setTab] = useState(ResultTab.Wakachi); 31 | 32 | const handleChangeTab = (_: unknown, newValue: ResultTab) => { 33 | setTab(newValue); 34 | }; 35 | 36 | return ( 37 | <> 38 | 39 | 40 | 44 | 48 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | {wakachi?.join("/")} 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | {dot ? 
: null} 70 | 71 | 72 | 73 | ); 74 | } 75 | -------------------------------------------------------------------------------- /wasm-core/.gitignore: -------------------------------------------------------------------------------- 1 | pkg 2 | 3 | # Created by https://www.toptal.com/developers/gitignore/api/node 4 | # Edit at https://www.toptal.com/developers/gitignore?templates=node 5 | 6 | ### Node ### 7 | # Logs 8 | logs 9 | *.log 10 | npm-debug.log* 11 | yarn-debug.log* 12 | yarn-error.log* 13 | lerna-debug.log* 14 | .pnpm-debug.log* 15 | 16 | # Diagnostic reports (https://nodejs.org/api/report.html) 17 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 18 | 19 | # Runtime data 20 | pids 21 | *.pid 22 | *.seed 23 | *.pid.lock 24 | 25 | # Directory for instrumented libs generated by jscoverage/JSCover 26 | lib-cov 27 | 28 | # Coverage directory used by tools like istanbul 29 | coverage 30 | *.lcov 31 | 32 | # nyc test coverage 33 | .nyc_output 34 | 35 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 36 | .grunt 37 | 38 | # Bower dependency directory (https://bower.io/) 39 | bower_components 40 | 41 | # node-waf configuration 42 | .lock-wscript 43 | 44 | # Compiled binary addons (https://nodejs.org/api/addons.html) 45 | build/Release 46 | 47 | # Dependency directories 48 | node_modules/ 49 | jspm_packages/ 50 | 51 | # Snowpack dependency directory (https://snowpack.dev/) 52 | web_modules/ 53 | 54 | # TypeScript cache 55 | *.tsbuildinfo 56 | 57 | # Optional npm cache directory 58 | .npm 59 | 60 | # Optional eslint cache 61 | .eslintcache 62 | 63 | # Microbundle cache 64 | .rpt2_cache/ 65 | .rts2_cache_cjs/ 66 | .rts2_cache_es/ 67 | .rts2_cache_umd/ 68 | 69 | # Optional REPL history 70 | .node_repl_history 71 | 72 | # Output of 'npm pack' 73 | *.tgz 74 | 75 | # Yarn Integrity file 76 | .yarn-integrity 77 | 78 | # dotenv environment variables file 79 | .env 80 | .env.test 81 | .env.production 82 | 83 | # parcel-bundler cache (https://parceljs.org/) 84 | .cache 85 | .parcel-cache 86 | 87 | # Next.js build output 88 | .next 89 | out 90 | 91 | # Nuxt.js build / generate output 92 | .nuxt 93 | dist 94 | 95 | # Gatsby files 96 | .cache/ 97 | # Comment in the public line in if your project uses Gatsby and not Next.js 98 | # https://nextjs.org/blog/next-9-1#public-directory-support 99 | # public 100 | 101 | # vuepress build output 102 | .vuepress/dist 103 | 104 | # Serverless directories 105 | .serverless/ 106 | 107 | # FuseBox cache 108 | .fusebox/ 109 | 110 | # DynamoDB Local files 111 | .dynamodb/ 112 | 113 | # TernJS port file 114 | .tern-port 115 | 116 | # Stores VSCode versions used for testing VSCode extensions 117 | .vscode-test 118 | 119 | # yarn v2 120 | .yarn/cache 121 | .yarn/unplugged 122 | .yarn/build-state.yml 123 | .yarn/install-state.gz 124 | .pnp.* 125 | 126 | ### Node Patch ### 127 | # Serverless Webpack directories 128 | .webpack/ 129 | 130 | # End of https://www.toptal.com/developers/gitignore/api/node 131 | mecab-ipadic-2.7.0-20070801 132 | -------------------------------------------------------------------------------- /playground/.gitignore: -------------------------------------------------------------------------------- 1 | __generated__ 2 | pkg 3 | 4 | # Created by https://www.toptal.com/developers/gitignore/api/node 5 | # Edit at https://www.toptal.com/developers/gitignore?templates=node 6 | 7 | ### Node ### 8 | # Logs 9 | logs 10 | *.log 11 | npm-debug.log* 12 | yarn-debug.log* 13 | yarn-error.log* 14 | lerna-debug.log* 15 | .pnpm-debug.log* 
16 | 17 | # Diagnostic reports (https://nodejs.org/api/report.html) 18 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 19 | 20 | # Runtime data 21 | pids 22 | *.pid 23 | *.seed 24 | *.pid.lock 25 | 26 | # Directory for instrumented libs generated by jscoverage/JSCover 27 | lib-cov 28 | 29 | # Coverage directory used by tools like istanbul 30 | coverage 31 | *.lcov 32 | 33 | # nyc test coverage 34 | .nyc_output 35 | 36 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 37 | .grunt 38 | 39 | # Bower dependency directory (https://bower.io/) 40 | bower_components 41 | 42 | # node-waf configuration 43 | .lock-wscript 44 | 45 | # Compiled binary addons (https://nodejs.org/api/addons.html) 46 | build/Release 47 | 48 | # Dependency directories 49 | node_modules/ 50 | jspm_packages/ 51 | 52 | # Snowpack dependency directory (https://snowpack.dev/) 53 | web_modules/ 54 | 55 | # TypeScript cache 56 | *.tsbuildinfo 57 | 58 | # Optional npm cache directory 59 | .npm 60 | 61 | # Optional eslint cache 62 | .eslintcache 63 | 64 | # Microbundle cache 65 | .rpt2_cache/ 66 | .rts2_cache_cjs/ 67 | .rts2_cache_es/ 68 | .rts2_cache_umd/ 69 | 70 | # Optional REPL history 71 | .node_repl_history 72 | 73 | # Output of 'npm pack' 74 | *.tgz 75 | 76 | # Yarn Integrity file 77 | .yarn-integrity 78 | 79 | # dotenv environment variables file 80 | .env 81 | .env.test 82 | .env.production 83 | 84 | # parcel-bundler cache (https://parceljs.org/) 85 | .cache 86 | .parcel-cache 87 | 88 | # Next.js build output 89 | .next 90 | out 91 | 92 | # Nuxt.js build / generate output 93 | .nuxt 94 | dist 95 | 96 | # Gatsby files 97 | .cache/ 98 | # Comment in the public line in if your project uses Gatsby and not Next.js 99 | # https://nextjs.org/blog/next-9-1#public-directory-support 100 | # public 101 | 102 | # vuepress build output 103 | .vuepress/dist 104 | 105 | # Serverless directories 106 | .serverless/ 107 | 108 | # FuseBox cache 109 | .fusebox/ 110 | 111 | # DynamoDB Local files 112 | .dynamodb/ 113 | 114 | # TernJS port file 115 | .tern-port 116 | 117 | # Stores VSCode versions used for testing VSCode extensions 118 | .vscode-test 119 | 120 | # yarn v2 121 | .yarn/cache 122 | .yarn/unplugged 123 | .yarn/build-state.yml 124 | .yarn/install-state.gz 125 | .pnp.* 126 | 127 | ### Node Patch ### 128 | # Serverless Webpack directories 129 | .webpack/ 130 | 131 | # End of https://www.toptal.com/developers/gitignore/api/node 132 | mecab-ipadic-2.7.0-20070801 133 | -------------------------------------------------------------------------------- /goya-cli/src/repl.rs: -------------------------------------------------------------------------------- 1 | use goya::dot; 2 | use goya::double_array::DoubleArray; 3 | use goya::id::WordIdentifier; 4 | use goya::lattice::Lattice; 5 | use goya::word_features::WordFeaturesMap; 6 | use goya_ipadic::ipadic::IPADic; 7 | use std::io::{stdin, stdout, BufRead, BufWriter, Write}; 8 | use std::str::FromStr; 9 | 10 | pub enum Format { 11 | Dot, 12 | Plain, 13 | } 14 | impl FromStr for Format { 15 | type Err = &'static str; 16 | 17 | fn from_str(s: &str) -> Result { 18 | match s { 19 | "dot" => Ok(Format::Dot), 20 | "plain" => Ok(Format::Plain), 21 | _ => Err("no match"), 22 | } 23 | } 24 | } 25 | 26 | pub struct ReplContext<'a> { 27 | pub da: &'a DoubleArray, 28 | pub dict: &'a IPADic, 29 | pub word_set: &'a WordFeaturesMap, 30 | pub format: Format, 31 | } 32 | 33 | pub fn start(opt: ReplContext) -> Result<(), std::io::Error> { 34 | let out = stdout(); 35 | let mut out = 
BufWriter::new(out.lock()); 36 | 37 | for line in stdin().lock().lines() { 38 | match line { 39 | Ok(line) if line.is_empty() => continue, 40 | Ok(line) => { 41 | let lattice = Lattice::parse(&line, opt.da, opt.dict); 42 | match opt.format { 43 | Format::Dot => { 44 | writeln!(out, "{}", dot::render(&lattice, opt.dict).unwrap())?; 45 | } 46 | Format::Plain => { 47 | if let Some(path) = lattice.find_best() { 48 | for wid in path.into_iter() { 49 | let (surface_form, features) = match wid { 50 | WordIdentifier::Unknown(id, surface_form) => { 51 | (surface_form, opt.word_set.get_unknown(&id).unwrap()) 52 | } 53 | WordIdentifier::Known(id, surface_form) => { 54 | (surface_form, opt.word_set.get_known(&id).unwrap()) 55 | } 56 | }; 57 | writeln!( 58 | out, 59 | "{}\t{}", 60 | surface_form, 61 | features 62 | .into_iter() 63 | .map(|f| f.to_string()) 64 | .collect::<Vec<_>>() 65 | .join(",") 66 | )?; 67 | } 68 | writeln!(out, "EOS")?; 69 | out.flush()?; 70 | } 71 | } 72 | } 73 | } 74 | Err(err) => return Err(err), 75 | } 76 | } 77 | Ok(()) 78 | } 79 | -------------------------------------------------------------------------------- /goya-cli/src/main.rs: -------------------------------------------------------------------------------- 1 | mod build; 2 | mod path_util; 3 | mod repl; 4 | 5 | use clap::Parser; 6 | use futures::executor::block_on; 7 | use futures::future; 8 | use goya::double_array::DoubleArray; 9 | use goya_ipadic::ipadic::IPADic; 10 | use path_util::PathUtil; 11 | use repl::Format; 12 | use rkyv::{archived_root, Deserialize, Infallible}; 13 | use std::fs; 14 | 15 | #[derive(Parser)] 16 | struct Opts { 17 | /// `~/.goya/dict` by default 18 | #[clap(short, long)] 19 | dicdir: Option<String>, 20 | #[clap(short, long, default_value = "plain")] 21 | format: Format, 22 | #[clap(subcommand)] 23 | subcmd: Option<SubCommand>, 24 | } 25 | 26 | #[derive(Parser)] 27 | enum SubCommand { 28 | Compile(Compile), 29 | Clean, 30 | } 31 | 32 | /// A subcommand for controlling testing 33 | #[derive(Parser)] 34 | struct Compile { 35 | /// Path to the IPAdic directory 36 | dicpath: String, 37 | } 38 | 39 | fn main() { 40 | let opts: Opts = Opts::parse(); 41 | let base_dir = dirs::home_dir().unwrap().join(".goya"); 42 | let dicdir = opts 43 | .dicdir 44 | .unwrap_or_else(|| base_dir.join("dict").to_str().unwrap().to_string()); 45 | match opts.subcmd { 46 | Some(SubCommand::Compile(c)) => match build::build(&c.dicpath, &dicdir) { 47 | Ok(_) => {} 48 | Err(err) => { 49 | println!("{:?}", err); 50 | } 51 | }, 52 | Some(SubCommand::Clean) => { 53 | let util = PathUtil::from(dicdir); 54 | fs::remove_file(util.da_path()).expect("Failed to delete file"); 55 | fs::remove_file(util.dict_path()).expect("Failed to delete file"); 56 | } 57 | _ => { 58 | let util = PathUtil::from(dicdir); 59 | 60 | let da_fut = async { 61 | let encoded = fs::read(util.da_path()).expect("Failed to load dictionary"); 62 | let archived = unsafe { archived_root::<DoubleArray>(&encoded[..]) }; 63 | archived.deserialize(&mut Infallible).unwrap() 64 | }; 65 | let ipadic_fut = async { 66 | let encoded = fs::read(util.dict_path()).expect("Failed to load vocabulary"); 67 | let archived = unsafe { archived_root::<IPADic>(&encoded[..]) }; 68 | archived.deserialize(&mut Infallible).unwrap() 69 | }; 70 | let features_fut = async { 71 | let encoded = fs::read(util.features_path()).expect("Failed to load surfaces"); 72 | rmp_serde::from_slice(&encoded[..]).unwrap() 73 | }; 74 | 75 | let (ipadic, word_set) = block_on(future::join(ipadic_fut, features_fut)); 76 | let da = block_on(da_fut); 77 | 
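// The REPL below only borrows the deserialized structures; ownership stays
// here so that, once the REPL exits, each large structure can be moved onto
// its own background thread and dropped there instead of delaying shutdown.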
repl::start(repl::ReplContext { 78 | da: &da, 79 | dict: &ipadic, 80 | word_set: &word_set, 81 | format: opts.format, 82 | }) 83 | .unwrap(); 84 | std::thread::spawn(move || drop(ipadic)); 85 | std::thread::spawn(move || drop(da)); 86 | std::thread::spawn(move || drop(word_set)); 87 | } 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /wasm-core/src/lib.rs: -------------------------------------------------------------------------------- 1 | use goya::dictionary::Dictionary; 2 | use goya::dot; 3 | use goya::double_array::DoubleArray; 4 | use goya::id::WordIdentifier; 5 | use goya::lattice::Lattice; 6 | use goya_ipadic::ipadic::IPADic; 7 | use rkyv::{archived_root, Deserialize, Infallible}; 8 | use serde::Serialize; 9 | use wasm_bindgen::prelude::*; 10 | 11 | #[macro_use] 12 | extern crate lazy_static; 13 | 14 | lazy_static! { 15 | static ref DOUBLE_ARRAY: DoubleArray = { 16 | let archived = 17 | unsafe { archived_root::<DoubleArray>(include_bytes!("../__generated__/da.bin")) }; 18 | archived.deserialize(&mut Infallible).unwrap() 19 | }; 20 | static ref IPADIC: IPADic = { 21 | let archived = 22 | unsafe { archived_root::<IPADic>(include_bytes!("../__generated__/dict.bin")) }; 23 | archived.deserialize(&mut Infallible).unwrap() 24 | }; 25 | } 26 | 27 | #[derive(Serialize)] 28 | pub struct WasmMorpheme { 29 | wid: WordIdentifier, 30 | is_known: bool, 31 | surface_form: String, 32 | left_context_id: usize, 33 | right_context_id: usize, 34 | cost: i16, 35 | } 36 | impl WasmMorpheme {} 37 | 38 | #[wasm_bindgen] 39 | pub struct WasmLattice { 40 | lattice: Lattice, 41 | } 42 | #[wasm_bindgen] 43 | impl WasmLattice { 44 | pub fn as_dot(&self) -> String { 45 | dot::render(&self.lattice, &*IPADIC).unwrap() 46 | } 47 | 48 | pub fn wakachi(&self) -> Vec<JsValue> { 49 | self.best_morphemes() 50 | .map(|morpheme| serde_wasm_bindgen::to_value(&morpheme.surface_form).unwrap()) 51 | .collect() 52 | } 53 | 54 | pub fn find_best(&self) -> Vec<JsValue> { 55 | self.best_morphemes() 56 | .map(|morpheme| serde_wasm_bindgen::to_value(&morpheme).unwrap()) 57 | .collect() 58 | } 59 | 60 | fn best_morphemes(&self) -> impl Iterator<Item = WasmMorpheme> + '_ { 61 | self.lattice 62 | .find_best() 63 | .map(|path| { 64 | path.into_iter().map(|wid| { 65 | let morpheme = IPADIC.get(&wid).unwrap(); 66 | let (surface_form, is_known) = match &wid { 67 | WordIdentifier::Known(_, s) => (s.to_string(), true), 68 | WordIdentifier::Unknown(_, s) => (s.to_string(), false), 69 | }; 70 | WasmMorpheme { 71 | wid, 72 | is_known, 73 | surface_form, 74 | left_context_id: morpheme.left_context_id, 75 | right_context_id: morpheme.right_context_id, 76 | cost: morpheme.cost, 77 | } 78 | }) 79 | }) 80 | .unwrap() 81 | } 82 | } 83 | 84 | #[wasm_bindgen] 85 | pub async fn ready() { 86 | futures::join!(async { lazy_static::initialize(&IPADIC) }, async { 87 | lazy_static::initialize(&DOUBLE_ARRAY) 88 | }); 89 | } 90 | 91 | #[wasm_bindgen] 92 | pub fn parse(text: &str) -> WasmLattice { 93 | WasmLattice { 94 | lattice: Lattice::parse(text, &DOUBLE_ARRAY, &*IPADIC), 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /goya-cli/src/build.rs: -------------------------------------------------------------------------------- 1 | use super::path_util::PathUtil; 2 | use bytesize::ByteSize; 3 | use console::{style, Emoji}; 4 | use goya::common_prefix_tree::CommonPrefixTree; 5 | use goya::double_array::DoubleArray; 6 | use goya_ipadic::ipadic::IPADic; 7 | use goya_ipadic::ipadic_loader::IPADicLoader; 8 | use 
rkyv::ser::{serializers::AllocSerializer, Serializer}; 9 | use std::error::Error; 10 | use std::fs; 11 | use std::time::Instant; 12 | 13 | const LOOKING_GLASS: Emoji = Emoji("🔍", ""); 14 | const PAPER: Emoji = Emoji("📃", ""); 15 | const CLIP: Emoji = Emoji("🔗", ""); 16 | const SPARKLE: Emoji = Emoji("✨", ""); 17 | const TRUCK: Emoji = Emoji("🚚", ""); 18 | 19 | pub fn build(src_dir: &str, dist_dir: &str) -> Result<(), Box> { 20 | PathUtil::from(dist_dir.to_string()); 21 | let timer = Instant::now(); 22 | eprintln!( 23 | "{} {} Loading dictionary...", 24 | style("[1/4]").bold().dim(), 25 | LOOKING_GLASS 26 | ); 27 | let loader = IPADicLoader {}; 28 | let mut loaded = loader.load(src_dir)?; 29 | 30 | eprintln!( 31 | "{} {} Analyzing vocabulary...", 32 | style("[2/4]").bold().dim(), 33 | PAPER 34 | ); 35 | let mut cpt = CommonPrefixTree::default(); 36 | for (id, surface) in loaded.surfaces.iter() { 37 | cpt.append(*id, surface); 38 | } 39 | 40 | eprintln!( 41 | "{} {} Recompiling dictionary...", 42 | style("[3/4]").bold().dim(), 43 | CLIP 44 | ); 45 | let da = DoubleArray::from_cpt(&cpt); 46 | 47 | // DoubleArray only has one ID per surface form. 48 | let used_wids = da.wids().collect(); 49 | loaded.ipadic.shrink_to_wids(&used_wids); 50 | 51 | eprintln!( 52 | "{} {} Exporting dictionary...", 53 | style("[4/4]").bold().dim(), 54 | TRUCK 55 | ); 56 | let util = PathUtil::from(dist_dir.to_string()); 57 | util.mkdirp().expect("Failed to create directory"); 58 | 59 | let mut serializer = AllocSerializer::<256>::default(); 60 | serializer.serialize_value(&da).unwrap(); 61 | let bytes = serializer.into_serializer().into_inner(); 62 | fs::write(util.da_path(), &bytes).expect("Failed to write dictionary"); 63 | eprintln!("DoubleArray stats:"); 64 | eprintln!(" elements: {}", da.base.len()); 65 | eprintln!(" bytes: {}", ByteSize(bytes.len() as u64)); 66 | 67 | let mut serializer = AllocSerializer::<256>::default(); 68 | serializer 69 | .serialize_value::(&loaded.ipadic) 70 | .unwrap(); 71 | let bytes = serializer.into_serializer().into_inner(); 72 | fs::write(util.dict_path(), &bytes).expect("Failed to write dictionary"); 73 | eprintln!("Dictionary stats:"); 74 | eprintln!(" bytes: {}", ByteSize(bytes.len() as u64)); 75 | 76 | let bytes = rmp_serde::to_vec(&loaded.word_set).unwrap(); 77 | fs::write(util.features_path(), &bytes).expect("Failed to write word features"); 78 | eprintln!("Word features stats:"); 79 | eprintln!(" bytes: {}", ByteSize(bytes.len() as u64)); 80 | 81 | let end = timer.elapsed(); 82 | eprintln!( 83 | "{} Done in {}.{:03}s", 84 | SPARKLE, 85 | end.as_secs(), 86 | end.subsec_millis() 87 | ); 88 | Ok(()) 89 | } 90 | -------------------------------------------------------------------------------- /goya/src/word_features.rs: -------------------------------------------------------------------------------- 1 | use super::id::WordIdentifier; 2 | use indexmap::IndexSet; 3 | use serde::{Deserialize, Serialize}; 4 | use std::str::from_utf8_unchecked; 5 | 6 | #[derive(Debug, Serialize, Deserialize, rkyv::Archive, rkyv::Serialize, rkyv::Deserialize)] 7 | pub struct WordFeaturesMap { 8 | #[serde(with = "serde_bytes")] 9 | index: Vec, 10 | offsets: Vec, 11 | known: Vec, // index = morpheme ID 12 | unknown: Vec, // index = morpheme ID 13 | } 14 | impl WordFeaturesMap { 15 | pub fn new(known: Vec>, unknown: Vec>) -> WordFeaturesMap { 16 | let mut tmp_index: IndexSet = IndexSet::new(); 17 | for features in known.iter().chain(unknown.iter()) { 18 | for f in features.iter() { 19 | 
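// IndexSet interns each distinct feature string once; identical
// features across morphemes share a single index entry.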
tmp_index.insert(f.to_string()); 20 | } 21 | } 22 | let mut index = vec![]; 23 | let mut offsets: Vec = vec![0; tmp_index.len()]; 24 | offsets[0] = tmp_index.get_index(0).unwrap().as_bytes().len(); 25 | for (idx, str) in tmp_index.iter().enumerate() { 26 | index.append(&mut str.to_string().into_bytes()); 27 | if idx > 0 { 28 | offsets[idx] = offsets[idx - 1] + str.as_bytes().len(); 29 | } 30 | } 31 | 32 | WordFeaturesMap { 33 | known: known 34 | .into_iter() 35 | .map(|f| { 36 | WordFeatures::new(f.iter().map(|s| tmp_index.get_full(s).unwrap().0).collect()) 37 | }) 38 | .collect(), 39 | unknown: unknown 40 | .into_iter() 41 | .map(|f| { 42 | WordFeatures::new(f.iter().map(|s| tmp_index.get_full(s).unwrap().0).collect()) 43 | }) 44 | .collect(), 45 | index, 46 | offsets, 47 | } 48 | } 49 | 50 | pub fn get(&self, wid: &WordIdentifier) -> Option> { 51 | match wid { 52 | WordIdentifier::Known(wid, _) => self.get_known(wid), 53 | WordIdentifier::Unknown(wid, _) => self.get_unknown(wid), 54 | } 55 | } 56 | 57 | pub fn get_known(&self, wid: &usize) -> Option> { 58 | self.known.get(*wid).map(|f| self.get_string(f)) 59 | } 60 | 61 | pub fn get_unknown(&self, wid: &usize) -> Option> { 62 | self.unknown.get(*wid).map(|f| self.get_string(f)) 63 | } 64 | 65 | fn get_string(&self, f: &WordFeatures) -> Vec<&str> { 66 | f.0.iter() 67 | .map(|idx| { 68 | let idx = *idx; 69 | let end = self.offsets[idx]; 70 | if idx == 0 { 71 | unsafe { from_utf8_unchecked(&self.index[0..end]) } 72 | } else { 73 | unsafe { from_utf8_unchecked(&self.index[(self.offsets[idx - 1])..end]) } 74 | } 75 | }) 76 | .collect() 77 | } 78 | } 79 | 80 | /// > 5カラム目以降は, ユーザ定義の CSV フィールドです. 基本的に どんな内容でも CSV の許す限り追加することができます. 81 | /// > https://taku910.github.io/mecab/dic-detail.html 82 | #[derive(Debug, Serialize, Deserialize, rkyv::Archive, rkyv::Serialize, rkyv::Deserialize)] 83 | pub struct WordFeatures(Vec); 84 | impl WordFeatures { 85 | pub fn new(features: Vec) -> WordFeatures { 86 | WordFeatures(features) 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /playground/src/App.tsx: -------------------------------------------------------------------------------- 1 | import React, { useCallback, useState } from "react"; 2 | import AppBar from "@mui/material/AppBar"; 3 | import Toolbar from "@mui/material/Toolbar"; 4 | import IconButton from "@mui/material/IconButton"; 5 | import Container from "@mui/material/Container"; 6 | import Box from "@mui/material/Box"; 7 | import Typography from "@mui/material/Typography"; 8 | import TextField from "@mui/material/TextField"; 9 | import GitHubIcon from "@mui/icons-material/GitHub"; 10 | import { useDebounce } from "react-use"; 11 | import { wrap, transfer } from "comlink"; 12 | import type { Stats } from "./goya.worker"; 13 | import { Result } from "./Result"; 14 | 15 | interface GoyaCoreAPI { 16 | parse: (input: ArrayBufferLike) => Promise; 17 | } 18 | const worker = wrap( 19 | new Worker(new URL("./goya.worker.ts", import.meta.url)) 20 | ); 21 | const encoder = new TextEncoder(); 22 | const decoder = new TextDecoder(); 23 | const initText = new URL(location.href).searchParams.get("text"); 24 | 25 | export function App() { 26 | const [text, setText] = useState(initText ?? 
"すもももももももものうち"); 27 | const [result, setResult] = useState<{ 28 | dot: string; 29 | wakachi: string[]; 30 | best: unknown[]; 31 | stats: Stats; 32 | } | null>(null); 33 | 34 | const handleChangeText = useCallback( 35 | (event) => { 36 | setText(event.target.value.trim()); 37 | }, 38 | [setText] 39 | ); 40 | useDebounce( 41 | () => { 42 | if (text.length === 0) { 43 | setResult(null); 44 | } else { 45 | const input = encoder.encode(text); 46 | worker 47 | .parse(transfer(input, [input.buffer])) 48 | .then((res) => decoder.decode(res)) 49 | .then((res) => JSON.parse(res)) 50 | .then(setResult); 51 | } 52 | }, 53 | 200, 54 | [text] 55 | ); 56 | 57 | return ( 58 | <> 59 | 60 | 61 | 62 | Goya playground 63 | 64 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | Goya: Yet another Japanese morphological analyzer for Rust and 81 | WebAssembly 82 | 83 | 84 | Goya: WebAssemblyで利用可能な日本語の形態素解析ライブラリ 85 | 86 | 87 | 88 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | ); 104 | } 105 | -------------------------------------------------------------------------------- /ipadic/src/ipadic.rs: -------------------------------------------------------------------------------- 1 | use goya::char_class::CharClassifier; 2 | use goya::char_class::CharDefinition; 3 | use goya::dictionary::Dictionary; 4 | use goya::morpheme::Morpheme; 5 | use indexmap::IndexSet; 6 | use serde::{Deserialize, Serialize}; 7 | use std::collections::HashMap; 8 | use std::collections::HashSet; 9 | use std::iter::FromIterator; 10 | use std::vec::Vec; 11 | 12 | // TODO: Make it newtype idiom 13 | type MorphemeIndex = usize; 14 | 15 | #[derive(Debug, Serialize, Deserialize, rkyv::Archive, rkyv::Serialize, rkyv::Deserialize)] 16 | pub struct IPADic { 17 | vocabulary: Vec, // index = morpheme ID 18 | homonyms: Vec>, // index = morpheme ID 19 | classes: CharClassifier, 20 | matrix: Vec>, 21 | /// 1つのカテゴリに複数の素性を定義してもかまいません. 学習後, 適切なコスト値が 自動的に与えられます. 
22 | /// https://taku910.github.io/mecab/learn.html#config 23 | unknown_classes: HashMap>, 24 | unknown_vocabulary: Vec, // index = morpheme ID 25 | vocabulary_index: IndexSet, 26 | } 27 | impl Dictionary for IPADic { 28 | fn get_known_morpheme(&self, wid: &usize) -> Option<&Morpheme> { 29 | self.vocabulary 30 | .get(*wid) 31 | .map(|idx| self.vocabulary_index.get_index(*idx).unwrap()) 32 | } 33 | 34 | fn get_unknown_morpheme(&self, wid: &usize) -> Option<&Morpheme> { 35 | self.unknown_vocabulary 36 | .get(*wid) 37 | .map(|idx| self.vocabulary_index.get_index(*idx).unwrap()) 38 | } 39 | 40 | fn resolve_homonyms(&self, wid: &usize) -> Option<&Vec> { 41 | self.homonyms.get(*wid) 42 | } 43 | 44 | fn take_unknown_chars_seq(&self, def: &CharDefinition, text: &str, start: &usize) -> String { 45 | self.classes.take_unknown_chars(def, text, start) 46 | } 47 | 48 | fn classify_char(&self, c: &char) -> &CharDefinition { 49 | self.classes.classify(c) 50 | } 51 | 52 | fn get_unknown_morphemes_by_class(&self, class: &str) -> Vec<(usize, &Morpheme)> { 53 | self.unknown_classes 54 | .get(class) 55 | .unwrap() 56 | .iter() 57 | .map(|wid| (*wid, self.unknown_vocabulary.get(*wid).unwrap())) 58 | .map(|(wid, idx)| (wid, self.vocabulary_index.get_index(*idx).unwrap())) 59 | .collect::>() 60 | } 61 | 62 | fn transition_cost(&self, left: &usize, right: &usize) -> Option<&i16> { 63 | if let Some(rights) = self.matrix.get(*left) { 64 | if let Some(cost) = rights.get(*right) { 65 | return Some(cost); 66 | } 67 | } 68 | None 69 | } 70 | 71 | fn occurrence_cost(&self, wid: &usize) -> Option { 72 | self.get_known_morpheme(wid).map(|w| w.cost) 73 | } 74 | } 75 | impl IPADic { 76 | pub fn from( 77 | vocabulary: Vec, 78 | homonyms: Vec>, 79 | classes: CharClassifier, 80 | matrix: Vec>, 81 | unknown_classes: HashMap>, 82 | unknown_vocabulary: Vec, 83 | vocabulary_index: IndexSet, 84 | ) -> IPADic { 85 | IPADic { 86 | vocabulary, 87 | homonyms, 88 | classes, 89 | matrix, 90 | unknown_classes, 91 | unknown_vocabulary, 92 | vocabulary_index, 93 | } 94 | } 95 | 96 | pub fn shrink_to_wids(&mut self, wids: &Vec) { 97 | let set: HashSet = HashSet::from_iter(wids.iter().cloned()); 98 | for idx in 0..self.homonyms.len() { 99 | if set.contains(&idx) { 100 | continue; 101 | } 102 | self.homonyms[idx] = vec![]; 103 | } 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /goya/src/dot.rs: -------------------------------------------------------------------------------- 1 | use super::{ 2 | dictionary::Dictionary, 3 | lattice::{Lattice, BOS_CONTEXT_ID, EOS_CONTEXT_ID}, 4 | }; 5 | use std::{error::Error, fmt::Write}; 6 | 7 | const BOLD: &str = " penwidth=3"; 8 | 9 | pub fn render(lattice: &Lattice, dict: &D) -> Result> { 10 | let cursor = (lattice.dp.len() - 1, 0); 11 | let len = lattice.indices.len(); 12 | let best_path = lattice.find_best_path(); 13 | let mut dot = String::from(""); 14 | writeln!( 15 | dot, 16 | r#"digraph lattice {{ 17 | rankdir=LR; 18 | splines=polyline; 19 | nodesep=.05; 20 | 21 | BOS [label="BOS\n0 (0)" shape="doublecircle"{}]; 22 | EOS [label="EOS\n{} (0)" shape="doublecircle"{}]; 23 | "#, 24 | BOLD, 25 | lattice.dp[cursor.0].get(cursor.1).unwrap().0, 26 | BOLD 27 | )?; 28 | for (i, index) in lattice.indices.iter().enumerate() { 29 | for (j, (left_wid, wlen)) in index.iter().enumerate() { 30 | let left = dict.get(left_wid).unwrap(); 31 | let node_style = match &best_path { 32 | Some(best_path) if best_path.contains(&(i + 1, j)) => BOLD, 33 | _ => "", 34 | }; 35 
| writeln!( 36 | dot, 37 | r#" "{}_{}" [label="{}\n({}, {})"{}];"#, 38 | i, 39 | j, 40 | left_wid.get_surface(), 41 | lattice.dp[i + 1][j].0, 42 | left.cost, 43 | node_style, 44 | )?; 45 | if i == 0 { 46 | let right = left; 47 | let cost = dict 48 | .transition_cost(&BOS_CONTEXT_ID, &right.right_context_id) 49 | .unwrap(); 50 | let bos_edge_style = match &best_path { 51 | Some(best_path) if best_path.contains(&(i + 1, j)) => BOLD, 52 | _ => "", 53 | }; 54 | writeln!( 55 | dot, 56 | r#" BOS -> "{}_{}" [label="({})"{}];"#, 57 | i, j, cost, bos_edge_style 58 | )?; 59 | } 60 | if i + wlen >= len { 61 | let cost = dict 62 | .transition_cost(&left.left_context_id, &EOS_CONTEXT_ID) 63 | .unwrap(); 64 | let eos_edge_style = match &best_path { 65 | Some(best_path) if best_path.contains(&(i + 1, j)) => BOLD, 66 | _ => "", 67 | }; 68 | writeln!( 69 | dot, 70 | r#" "{}_{}" -> EOS [label="({})"{}];"#, 71 | i, j, cost, eos_edge_style 72 | )?; 73 | continue; 74 | } 75 | for (k, (right_wid, _)) in lattice.indices[i + wlen].iter().enumerate() { 76 | let right = dict.get(right_wid).unwrap(); 77 | let cost = dict 78 | .transition_cost(&left.left_context_id, &right.right_context_id) 79 | .unwrap(); 80 | let edge_style = match &best_path { 81 | Some(best_path) 82 | if best_path.contains(&(i + 1, j)) 83 | && best_path.contains(&(i + wlen + 1, k)) => 84 | { 85 | BOLD 86 | } 87 | _ => "", 88 | }; 89 | writeln!( 90 | dot, 91 | r#" "{}_{}" -> "{}_{}" [label="({})"{}];"#, 92 | i, 93 | j, 94 | i + wlen, 95 | k, 96 | cost, 97 | edge_style 98 | )?; 99 | } 100 | } 101 | } 102 | writeln!(dot, "}}")?; 103 | Ok(dot) 104 | } 105 | -------------------------------------------------------------------------------- /goya/src/char_class.rs: -------------------------------------------------------------------------------- 1 | use serde::{Deserialize, Serialize}; 2 | use std::collections::{HashMap, HashSet}; 3 | 4 | const CLASS_DEFAULT: &str = "DEFAULT"; 5 | 6 | #[derive( 7 | Debug, PartialEq, Eq, Serialize, Deserialize, rkyv::Archive, rkyv::Serialize, rkyv::Deserialize, 8 | )] 9 | pub enum InvokeTiming { 10 | Fallback, 11 | Always, 12 | } 13 | #[derive( 14 | Debug, PartialEq, Eq, Serialize, Deserialize, rkyv::Archive, rkyv::Serialize, rkyv::Deserialize, 15 | )] 16 | pub struct CharDefinition { 17 | pub class: String, 18 | pub timing: InvokeTiming, 19 | pub group_by_same_kind: bool, 20 | pub len: usize, 21 | pub compatibilities: HashSet, // elements = class name 22 | } 23 | impl CharDefinition { 24 | pub fn compatible_with(&self, class_name: &str) -> bool { 25 | self.class.eq(class_name) || self.compatibilities.contains(class_name) 26 | } 27 | } 28 | 29 | #[derive(Debug, Serialize, Deserialize, rkyv::Archive, rkyv::Serialize, rkyv::Deserialize)] 30 | pub struct CharClass { 31 | range: (u32, u32), 32 | class: String, 33 | } 34 | impl CharClass { 35 | pub fn from(range: (u32, u32), class: String) -> CharClass { 36 | CharClass { range, class } 37 | } 38 | 39 | pub fn in_range(&self, c: &char) -> bool { 40 | let code = *c as u32; 41 | self.range.0 <= code && code <= self.range.1 42 | } 43 | } 44 | 45 | #[derive(Debug, Serialize, Deserialize, rkyv::Archive, rkyv::Serialize, rkyv::Deserialize)] 46 | pub struct CharClassifier { 47 | chars: HashMap, 48 | ranges: Vec, 49 | } 50 | impl CharClassifier { 51 | pub fn from(chars: HashMap, ranges: Vec) -> CharClassifier { 52 | CharClassifier { chars, ranges } 53 | } 54 | 55 | pub fn classify(&self, c: &char) -> &CharDefinition { 56 | let class = self.get_class_name(c); 57 | 
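// get_class_name falls back to the DEFAULT class for characters outside
// every configured range, so this lookup is expected to succeed for a
// well-formed char.def.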
self.chars.get(class).unwrap() 58 | } 59 | 60 | pub fn take_unknown_chars(&self, def: &CharDefinition, text: &str, start: &usize) -> String { 61 | if !def.group_by_same_kind { 62 | return text.chars().skip(*start).take(def.len).collect(); 63 | } 64 | 65 | let mut len = 0; 66 | text.chars() 67 | .enumerate() 68 | .skip(*start) 69 | .take_while(|(_, c)| { 70 | if def.len != 0 && len >= def.len || !def.compatible_with(self.get_class_name(c)) { 71 | return false; 72 | } 73 | len += 1; 74 | true 75 | }) 76 | .map(|(_, c)| c) 77 | .collect() 78 | } 79 | 80 | fn get_class_name(&self, c: &char) -> &str { 81 | self.ranges 82 | .iter() 83 | .find(|class| class.in_range(c)) 84 | .map(|class| class.class.as_str()) 85 | .unwrap_or_else(|| CLASS_DEFAULT) 86 | } 87 | } 88 | 89 | #[cfg(test)] 90 | mod tests { 91 | use super::*; 92 | 93 | #[test] 94 | fn compatible_with_without_compatibilities() { 95 | let def_a = CharDefinition { 96 | class: String::from("A"), 97 | timing: InvokeTiming::Always, 98 | group_by_same_kind: false, 99 | len: 2, 100 | compatibilities: HashSet::new(), 101 | }; 102 | assert_eq!(def_a.compatible_with("A"), true); 103 | assert_eq!(def_a.compatible_with("B"), false); 104 | } 105 | 106 | #[test] 107 | fn compatible_with_with_compatibilities() { 108 | let mut compatibilities = HashSet::new(); 109 | compatibilities.insert(String::from("B")); 110 | let def_a = CharDefinition { 111 | class: String::from("A"), 112 | timing: InvokeTiming::Always, 113 | group_by_same_kind: false, 114 | len: 2, 115 | compatibilities, 116 | }; 117 | assert_eq!(def_a.compatible_with("A"), true); 118 | assert_eq!(def_a.compatible_with("B"), true); 119 | assert_eq!(def_a.compatible_with("C"), false); 120 | } 121 | 122 | #[test] 123 | fn in_range() { 124 | let class = CharClass::from((1, 2), String::new()); 125 | assert_eq!(class.in_range(&(0 as char)), false); 126 | assert_eq!(class.in_range(&(1 as char)), true); 127 | assert_eq!(class.in_range(&(2 as char)), true); 128 | assert_eq!(class.in_range(&(3 as char)), false); 129 | } 130 | } 131 | -------------------------------------------------------------------------------- /benchmarks/package-lock.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "benchmarks", 3 | "version": "0.0.0", 4 | "lockfileVersion": 2, 5 | "requires": true, 6 | "packages": { 7 | "": { 8 | "version": "0.0.0", 9 | "hasInstallScript": true, 10 | "license": "ISC", 11 | "dependencies": { 12 | "kuromoji": "^0.1.2", 13 | "wasm-core": "../wasm-core/pkg", 14 | "wasm-features": "../wasm-features/pkg" 15 | }, 16 | "devDependencies": { 17 | "benchmark": "^2.1.4" 18 | } 19 | }, 20 | "../wasm-core/pkg": { 21 | "name": "goya-core", 22 | "version": "0.1.1", 23 | "license": "Apache-2.0 OR MIT" 24 | }, 25 | "../wasm-features/pkg": { 26 | "name": "goya-features", 27 | "version": "0.1.1", 28 | "license": "Apache-2.0 OR MIT" 29 | }, 30 | "node_modules/async": { 31 | "version": "2.6.3", 32 | "resolved": "https://registry.npmjs.org/async/-/async-2.6.3.tgz", 33 | "integrity": "sha512-zflvls11DCy+dQWzTW2dzuilv8Z5X/pjfmZOWba6TNIVDm+2UDaJmXSOXlasHKfNBs8oo3M0aT50fDEWfKZjXg==", 34 | "dependencies": { 35 | "lodash": "^4.17.14" 36 | } 37 | }, 38 | "node_modules/benchmark": { 39 | "version": "2.1.4", 40 | "resolved": "https://registry.npmjs.org/benchmark/-/benchmark-2.1.4.tgz", 41 | "integrity": "sha1-CfPeMckWQl1JjMLuVloOvzwqVik=", 42 | "dev": true, 43 | "dependencies": { 44 | "lodash": "^4.17.4", 45 | "platform": "^1.3.3" 46 | } 47 | }, 48 | 
"node_modules/doublearray": { 49 | "version": "0.0.2", 50 | "resolved": "https://registry.npmjs.org/doublearray/-/doublearray-0.0.2.tgz", 51 | "integrity": "sha1-Yxhv6NNEEydtNiH2qg7F954ifvk=" 52 | }, 53 | "node_modules/kuromoji": { 54 | "version": "0.1.2", 55 | "resolved": "https://registry.npmjs.org/kuromoji/-/kuromoji-0.1.2.tgz", 56 | "integrity": "sha512-V0dUf+C2LpcPEXhoHLMAop/bOht16Dyr+mDiIE39yX3vqau7p80De/koFqpiTcL1zzdZlc3xuHZ8u5gjYRfFaQ==", 57 | "dependencies": { 58 | "async": "^2.0.1", 59 | "doublearray": "0.0.2", 60 | "zlibjs": "^0.3.1" 61 | } 62 | }, 63 | "node_modules/lodash": { 64 | "version": "4.17.21", 65 | "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz", 66 | "integrity": "sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg==" 67 | }, 68 | "node_modules/platform": { 69 | "version": "1.3.6", 70 | "resolved": "https://registry.npmjs.org/platform/-/platform-1.3.6.tgz", 71 | "integrity": "sha512-fnWVljUchTro6RiCFvCXBbNhJc2NijN7oIQxbwsyL0buWJPG85v81ehlHI9fXrJsMNgTofEoWIQeClKpgxFLrg==", 72 | "dev": true 73 | }, 74 | "node_modules/wasm-core": { 75 | "resolved": "../wasm-core/pkg", 76 | "link": true 77 | }, 78 | "node_modules/wasm-features": { 79 | "resolved": "../wasm-features/pkg", 80 | "link": true 81 | }, 82 | "node_modules/zlibjs": { 83 | "version": "0.3.1", 84 | "resolved": "https://registry.npmjs.org/zlibjs/-/zlibjs-0.3.1.tgz", 85 | "integrity": "sha1-UBl+2yihxCymWcyLTmqd3W1ERVQ=", 86 | "engines": { 87 | "node": "*" 88 | } 89 | } 90 | }, 91 | "dependencies": { 92 | "async": { 93 | "version": "2.6.3", 94 | "resolved": "https://registry.npmjs.org/async/-/async-2.6.3.tgz", 95 | "integrity": "sha512-zflvls11DCy+dQWzTW2dzuilv8Z5X/pjfmZOWba6TNIVDm+2UDaJmXSOXlasHKfNBs8oo3M0aT50fDEWfKZjXg==", 96 | "requires": { 97 | "lodash": "^4.17.14" 98 | } 99 | }, 100 | "benchmark": { 101 | "version": "2.1.4", 102 | "resolved": "https://registry.npmjs.org/benchmark/-/benchmark-2.1.4.tgz", 103 | "integrity": "sha1-CfPeMckWQl1JjMLuVloOvzwqVik=", 104 | "dev": true, 105 | "requires": { 106 | "lodash": "^4.17.4", 107 | "platform": "^1.3.3" 108 | } 109 | }, 110 | "doublearray": { 111 | "version": "0.0.2", 112 | "resolved": "https://registry.npmjs.org/doublearray/-/doublearray-0.0.2.tgz", 113 | "integrity": "sha1-Yxhv6NNEEydtNiH2qg7F954ifvk=" 114 | }, 115 | "kuromoji": { 116 | "version": "0.1.2", 117 | "resolved": "https://registry.npmjs.org/kuromoji/-/kuromoji-0.1.2.tgz", 118 | "integrity": "sha512-V0dUf+C2LpcPEXhoHLMAop/bOht16Dyr+mDiIE39yX3vqau7p80De/koFqpiTcL1zzdZlc3xuHZ8u5gjYRfFaQ==", 119 | "requires": { 120 | "async": "^2.0.1", 121 | "doublearray": "0.0.2", 122 | "zlibjs": "^0.3.1" 123 | } 124 | }, 125 | "lodash": { 126 | "version": "4.17.21", 127 | "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz", 128 | "integrity": "sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg==" 129 | }, 130 | "platform": { 131 | "version": "1.3.6", 132 | "resolved": "https://registry.npmjs.org/platform/-/platform-1.3.6.tgz", 133 | "integrity": "sha512-fnWVljUchTro6RiCFvCXBbNhJc2NijN7oIQxbwsyL0buWJPG85v81ehlHI9fXrJsMNgTofEoWIQeClKpgxFLrg==", 134 | "dev": true 135 | }, 136 | "wasm-core": { 137 | "version": "file:../wasm-core/pkg" 138 | }, 139 | "wasm-features": { 140 | "version": "file:../wasm-features/pkg" 141 | }, 142 | "zlibjs": { 143 | "version": "0.3.1", 144 | "resolved": "https://registry.npmjs.org/zlibjs/-/zlibjs-0.3.1.tgz", 145 | "integrity": "sha1-UBl+2yihxCymWcyLTmqd3W1ERVQ=" 146 | } 147 | } 
148 | } 149 | -------------------------------------------------------------------------------- /goya/src/double_array.rs: -------------------------------------------------------------------------------- 1 | use super::common_prefix_tree::CommonPrefixTree; 2 | use indexmap::IndexSet; 3 | use itertools::Itertools; 4 | use serde::{Deserialize, Serialize}; 5 | use std::cmp; 6 | use std::collections::HashMap; 7 | 8 | const INDEX_ROOT: usize = 1; 9 | const TERM_CHAR: char = '\0'; 10 | 11 | #[derive(Debug)] 12 | pub enum TransitionError { 13 | AlreadyTerminated, 14 | BaseFailed, 15 | CheckFailed, 16 | UnknownChar, 17 | BaseOutOfBounds, 18 | CheckOutOfBounds, 19 | } 20 | 21 | #[derive(Debug, Serialize, Deserialize, rkyv::Archive, rkyv::Serialize, rkyv::Deserialize)] 22 | pub struct DoubleArray { 23 | pub codes: IndexSet, 24 | pub base: Vec, 25 | pub check: Vec, 26 | } 27 | impl Default for DoubleArray { 28 | fn default() -> Self { 29 | let base: Vec = vec![0, 1]; 30 | let check: Vec = vec![0, 0]; 31 | let mut codes: IndexSet = IndexSet::new(); 32 | 33 | codes.insert(TERM_CHAR); 34 | 35 | DoubleArray { base, check, codes } 36 | } 37 | } 38 | impl DoubleArray { 39 | pub fn from(base: Vec, check: Vec, codes: IndexSet) -> Self { 40 | DoubleArray { base, check, codes } 41 | } 42 | 43 | pub fn wids(&self) -> impl Iterator + '_ { 44 | self.base 45 | .iter() 46 | .filter(|s| **s < 0) 47 | .map(|s| as_usize(&(s * -1))) 48 | } 49 | 50 | pub fn from_cpt(trie: &CommonPrefixTree) -> Self { 51 | let mut state_cache = HashMap::new(); 52 | let mut da = DoubleArray::default(); 53 | let mut chars = trie 54 | .entires_dfs() 55 | .iter() 56 | .map(|(prefix, _)| prefix) 57 | .join("") 58 | .chars() 59 | .collect::>(); 60 | chars.sort_unstable(); 61 | chars.dedup(); 62 | for c in chars { 63 | da.insert_to_codes(c); 64 | } 65 | 66 | for (prefix, node) in trie.entires_dfs() { 67 | if node.can_stop() { 68 | continue; 69 | } 70 | 71 | // root node 72 | if prefix.is_empty() { 73 | for next_c in node.children.keys() { 74 | let next_char_code = da.get_code(next_c).unwrap(); 75 | let t = da.base[INDEX_ROOT] + next_char_code as i32; 76 | let t = as_usize(&t); 77 | da.insert_to_check(t, INDEX_ROOT); 78 | state_cache.insert(concat_char_to_str(&prefix, *next_c), t); 79 | } 80 | continue; 81 | } 82 | 83 | let s = *state_cache.get(&prefix).unwrap(); 84 | da.insert_to_base(s, da.find_next_s(node)); 85 | for (next_c, child) in node.children.iter() { 86 | let t = da.base.get(s).unwrap() + da.get_code(next_c).unwrap() as i32; 87 | let t = as_usize(&t); 88 | da.insert_to_check(t, s); 89 | if child.can_stop() { 90 | da.insert_to_base(t, -(child.id.unwrap() as i32)); 91 | } else { 92 | let key = concat_char_to_str(&prefix, *next_c); 93 | state_cache.insert(key, t); 94 | } 95 | } 96 | } 97 | da.base.shrink_to_fit(); 98 | da.check.shrink_to_fit(); 99 | da.codes.shrink_to_fit(); 100 | da 101 | } 102 | 103 | pub fn transition( 104 | &self, 105 | from: usize, 106 | to: char, 107 | ) -> Result<(i32, Option), TransitionError> { 108 | let code = self.get_code(&to).ok_or(TransitionError::UnknownChar)?; 109 | let s = self 110 | .base 111 | .get(from) 112 | .ok_or(TransitionError::BaseOutOfBounds)?; 113 | let t = s + code as i32; 114 | if t < 0 { 115 | return Err(TransitionError::AlreadyTerminated); 116 | } 117 | let next = self 118 | .check 119 | .get(as_usize(&t)) 120 | .ok_or(TransitionError::CheckOutOfBounds)?; 121 | let base = self 122 | .base 123 | .get(t as usize) 124 | .ok_or(TransitionError::BaseFailed)?; 125 | let wid = if *base < 0 { 126 | 
Some((base * -1) as usize) 127 | } else { 128 | None 129 | }; 130 | if *next == from { 131 | Ok((t, wid)) 132 | } else { 133 | Err(TransitionError::CheckFailed) 134 | } 135 | } 136 | 137 | pub fn init(&self, to: char) -> Result<(i32, Option), TransitionError> { 138 | self.transition(INDEX_ROOT, to) 139 | } 140 | 141 | pub fn stop(&self, from: usize) -> Result { 142 | match self.transition(from, TERM_CHAR) { 143 | Ok((_, Some(wid))) => Ok(wid), 144 | Ok(_) => unreachable!("Successful transition, but no wid"), 145 | Err(reason) => Err(reason), 146 | } 147 | } 148 | 149 | pub fn get_code(&self, c: &char) -> Option { 150 | self.codes.get_full(c).map(|(code, _)| code) 151 | } 152 | 153 | fn insert_to_codes(&mut self, c: char) -> usize { 154 | let (char_code, _) = self.codes.insert_full(c); 155 | char_code 156 | } 157 | 158 | fn insert_to_base(&mut self, index: usize, value: i32) { 159 | let resized = cmp::max(self.base.len(), index + 1); 160 | self.base.resize(resized, 0); 161 | assert_eq!( 162 | self.base[index], 0, 163 | "index={} already used: {:?}", 164 | index, self.base 165 | ); 166 | self.base[index] = value; 167 | } 168 | 169 | fn insert_to_check(&mut self, index: usize, value: usize) { 170 | let resized = cmp::max(self.check.len(), index + 1); 171 | self.check.resize(resized, 0); 172 | self.check[index] = value; 173 | } 174 | 175 | fn get_available_check_index(&self, left: usize) -> usize { 176 | self.check 177 | .iter() 178 | .enumerate() 179 | .skip(left) 180 | // clippy says that `find is prefered to skip_while+next` but it's slower than the current 181 | .skip_while(|(_, value)| value != &&0) 182 | .next() 183 | .map(|(i, _)| i) 184 | .unwrap_or_else(|| unreachable!("index must be found")) 185 | } 186 | 187 | fn find_next_s(&self, child: &CommonPrefixTree) -> i32 { 188 | let mut position = self.get_available_check_index(INDEX_ROOT + 1); 189 | let min_code = self.get_code(child.min_char().unwrap()).unwrap(); 190 | let offsets: Vec<_> = child 191 | .children 192 | .keys() 193 | .map(|c| self.get_code(c).unwrap() - min_code) 194 | .collect(); 195 | while offsets 196 | .iter() 197 | .any(|code| match self.check.get(position + code) { 198 | Some(0) => false, 199 | Some(_) => true, 200 | _ => false, 201 | }) 202 | { 203 | position += 1; 204 | } 205 | (position - min_code) as i32 206 | } 207 | } 208 | 209 | fn as_usize(n: &i32) -> usize { 210 | assert!(*n >= 0, "n({}) should be greater than or equal to 0", n); 211 | *n as usize 212 | } 213 | 214 | fn concat_char_to_str(text: &str, c: char) -> String { 215 | let mut tmp = String::from(text); 216 | tmp.push(c); 217 | tmp 218 | } 219 | -------------------------------------------------------------------------------- /goya/src/lattice.rs: -------------------------------------------------------------------------------- 1 | use super::char_class::{CharDefinition, InvokeTiming}; 2 | use super::dictionary::Dictionary; 3 | use super::double_array::DoubleArray; 4 | use super::id::WordIdentifier; 5 | use std::collections::{HashSet, VecDeque}; 6 | 7 | pub const BOS_CONTEXT_ID: usize = 0; 8 | pub const EOS_CONTEXT_ID: usize = 0; 9 | const NODE_BOS: usize = 0; 10 | 11 | #[derive(Debug)] 12 | pub struct Lattice { 13 | // (wid, length of the word) 14 | pub indices: Vec>, 15 | // (min cost, index, length) 16 | pub dp: Vec>, 17 | } 18 | impl Lattice { 19 | pub fn parse(text: &str, da: &DoubleArray, dict: &D) -> Lattice { 20 | let len = text.chars().count(); 21 | let mut indices: Vec> = vec![vec![]; len]; 22 | let mut open_indices = VecDeque::from(vec![0]); 
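// Breadth-first scan over character positions: each reachable index is
// expanded once; known words are enumerated by walking the double array,
// and unknown words are generated from the char.def character classes
// (InvokeTiming::Always up front, InvokeTiming::Fallback when nothing
// known starts at the position).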
23 | let mut visited = HashSet::with_capacity(len); 24 | let char_defs = text 25 | .chars() 26 | .map(|c| dict.classify_char(&c)) 27 | .collect::>(); 28 | 29 | while let Some(index) = open_indices.pop_front() { 30 | if visited.contains(&index) || index >= len { 31 | continue; 32 | } 33 | visited.insert(index); 34 | 35 | let c = text.chars().nth(index).unwrap(); 36 | let def = char_defs[index]; 37 | if let InvokeTiming::Always = def.timing { 38 | let surface_form = dict.take_unknown_chars_seq(def, text, &index); 39 | open_indices.push_back(index + surface_form.chars().count()); 40 | for (wid, _) in dict.get_unknown_morphemes_by_class(&def.class) { 41 | indices[index].push(( 42 | WordIdentifier::Unknown(wid, surface_form.to_string()), 43 | surface_form.chars().count(), 44 | )); 45 | } 46 | } 47 | 48 | if let Ok((mut cursor, _)) = da.init(c) { 49 | if let Ok(wid) = da.stop(cursor as usize) { 50 | open_indices.push_back(index + 1); 51 | for wid in dict.resolve_homonyms(&wid).unwrap().iter() { 52 | indices[index].push(( 53 | WordIdentifier::Known(*wid, text.chars().skip(index).take(1).collect()), 54 | 1, 55 | )); 56 | } 57 | } 58 | let mut j = index + 1; 59 | while j < len { 60 | let c = text.chars().nth(j).unwrap(); 61 | match da.transition(cursor as usize, c) { 62 | Ok((next, _)) => { 63 | if let Ok(wid) = da.stop(next as usize) { 64 | open_indices.push_back(j + 1); 65 | for wid in dict.resolve_homonyms(&wid).unwrap().iter() { 66 | indices[index].push(( 67 | WordIdentifier::Known( 68 | *wid, 69 | text.chars().skip(index).take(j + 1 - index).collect(), 70 | ), 71 | j + 1 - index, 72 | )); 73 | } 74 | } 75 | cursor = next; 76 | } 77 | Err(_) => { 78 | break; 79 | } 80 | } 81 | j += 1; 82 | } 83 | } 84 | if indices[index].is_empty() && matches!(def.timing, InvokeTiming::Fallback) { 85 | let surface_form = dict.take_unknown_chars_seq(def, text, &index); 86 | open_indices.push_back(index + surface_form.chars().count()); 87 | for (wid, _) in dict.get_unknown_morphemes_by_class(&def.class) { 88 | indices[index].push(( 89 | WordIdentifier::Unknown(wid, surface_form.to_string()), 90 | surface_form.chars().count(), 91 | )); 92 | } 93 | } 94 | } 95 | Lattice { 96 | dp: get_dp_table(&indices, dict), 97 | indices, 98 | } 99 | } 100 | 101 | pub fn word_identifiers(&self) -> Vec { 102 | let mut wids = vec![]; 103 | for idx in self.indices.iter() { 104 | for (wid, _) in idx.iter() { 105 | wids.push(wid.clone()) 106 | } 107 | } 108 | wids 109 | } 110 | 111 | pub fn find_best_path(&self) -> Option> { 112 | let mut path = vec![]; 113 | let mut cursor = (self.dp.len() - 1, 0); 114 | loop { 115 | match self.dp[cursor.0].get(cursor.1) { 116 | Some((_, i, j)) => { 117 | if *i == NODE_BOS { 118 | break; 119 | } 120 | path.insert(0, (*i, *j)); 121 | cursor = (*i, *j); 122 | } 123 | _ => return None, 124 | } 125 | } 126 | Some(path) 127 | } 128 | 129 | pub fn find_best(&self) -> Option> { 130 | match self.find_best_path() { 131 | Some(best_path) => { 132 | let mut ids = vec![]; 133 | for (i, j) in best_path.iter() { 134 | ids.push(self.indices[*i - 1][*j].0.clone()); 135 | } 136 | Some(ids) 137 | } 138 | None => None, 139 | } 140 | } 141 | } 142 | 143 | fn get_dp_table( 144 | indices: &[Vec<(WordIdentifier, usize)>], 145 | dict: &D, 146 | ) -> Vec> { 147 | let len = indices.len(); 148 | let max_num_childs = indices.iter().map(|idx| idx.len()).max().unwrap(); 149 | // (min cost, idx of indices, idx2 of indices[idx]) 150 | // * dp[0][0] means BOS 151 | // * dp[dp.len() - 1][0] means EOS 152 | // Individual cost should 
fit within i16, but the sum of costs can exceed its range.
153 |     // Currently each element keeps unused indices to reduce the number of allocations
154 |     let mut dp: Vec<Vec<(i32, usize, usize)>> =
155 |         vec![vec![(i32::MAX, 0, 0); max_num_childs]; len + 2];
156 |     if max_num_childs == 0 {
157 |         return dp;
158 |     }
159 |     dp[0][0] = (0, 0, 0);
160 | 
161 |     for (i, (right_wid, _)) in indices[0].iter().enumerate() {
162 |         let right = dict.get(right_wid).unwrap();
163 |         let cost = dict
164 |             .transition_cost(&BOS_CONTEXT_ID, &right.right_context_id)
165 |             .unwrap()
166 |             + right.cost;
167 |         dp[1][i] = (cost as i32, NODE_BOS, 0);
168 |     }
169 | 
170 |     for (i, index) in indices.iter().enumerate() {
171 |         for (j, (left_wid, wlen)) in index.iter().enumerate() {
172 |             let before_cost = dp[i + 1][j].0;
173 |             let left = dict.get(left_wid).unwrap();
174 |             if i + wlen >= len {
175 |                 let cost = (*dict
176 |                     .transition_cost(&left.left_context_id, &EOS_CONTEXT_ID)
177 |                     .unwrap() as i32)
178 |                     + (left.cost as i32)
179 |                     + before_cost;
180 |                 if cost < dp[i + wlen + 1][0].0 {
181 |                     dp[i + wlen + 1][0] = (cost, i + 1, j);
182 |                 }
183 |                 continue;
184 |             }
185 | 
186 |             for (k, (right_wid, _)) in indices[i + wlen].iter().enumerate() {
187 |                 let right = dict.get(right_wid).unwrap();
188 |                 let cost = (*dict
189 |                     .transition_cost(&left.left_context_id, &right.right_context_id)
190 |                     .unwrap() as i32)
191 |                     + left.cost as i32
192 |                     + right.cost as i32
193 |                     + before_cost;
194 |                 if cost < dp[i + 1 + wlen][k].0 {
195 |                     dp[i + 1 + wlen][k] = (cost, i + 1, j);
196 |                 }
197 |             }
198 |         }
199 |     }
200 |     dp
201 | }
202 | 
--------------------------------------------------------------------------------
/ipadic/src/ipadic_loader.rs:
--------------------------------------------------------------------------------
1 | use super::ipadic::IPADic;
2 | use csv::ReaderBuilder;
3 | use encoding_rs::EUC_JP;
4 | use glob::glob;
5 | use goya::char_class::{CharClass, CharClassifier, CharDefinition, InvokeTiming};
6 | use goya::morpheme::Morpheme;
7 | use goya::word_features::WordFeaturesMap;
8 | use indexmap::IndexSet;
9 | use regex::Regex;
10 | use serde::Deserialize;
11 | use std::collections::{HashMap, HashSet};
12 | use std::error::Error;
13 | use std::fs;
14 | use std::path::Path;
15 | use std::vec::Vec;
16 | 
17 | const COL_SURFACE_FORM: usize = 0; // surface form (表層形)
18 | const COL_LEFT_CONTEXT_ID: usize = 1; // left context ID (左文脈ID)
19 | const COL_RIGHT_CONTEXT_ID: usize = 2; // right context ID (右文脈ID)
20 | const COL_COST: usize = 3; // cost (コスト)
21 | 
22 | pub struct LoadResult {
23 |     pub ipadic: IPADic,
24 |     pub word_set: WordFeaturesMap,
25 |     pub surfaces: HashMap<usize, String>,
26 | }
27 | 
28 | pub struct IPADicLoader {}
29 | impl IPADicLoader {
30 |     pub fn load(&self, dir: &str) -> Result<LoadResult, Box<dyn Error>> {
31 |         let classes = load_chars(Path::new(dir).join("char.def"))?;
32 |         let matrix = load_matrix(Path::new(dir).join("matrix.def"))?;
33 |         let unknown = load_unknown(Path::new(dir).join("unk.def"))?;
34 |         let csv_pattern = Path::new(dir).join("*.csv");
35 |         let csv_pattern = csv_pattern.to_str().ok_or("Failed to build glob pattern")?;
36 | 
37 |         let mut vocabulary_index: IndexSet<Morpheme> = IndexSet::new();
38 |         let mut surfaces = HashMap::new();
39 |         let mut known_features = HashMap::new();
40 |         let mut vocabulary = HashMap::new();
41 |         let mut tmp_homonyms = HashMap::new();
42 |         let mut id: usize = 1;
43 |         for path in glob(csv_pattern)? {
44 |             for row in load_words_csv(path?)?
{
45 |                 surfaces.insert(id, row.surface_form.to_string());
46 |                 known_features.insert(id, row.features.clone());
47 |                 tmp_homonyms
48 |                     .entry(row.surface_form.to_string())
49 |                     .or_insert_with(Vec::new)
50 |                     .push(id);
51 | 
52 |                 let (idx, _) = vocabulary_index.insert_full(row.into());
53 |                 vocabulary.insert(id, idx);
54 |                 id += 1;
55 |             }
56 |         }
57 |         let mut homonyms: HashMap<usize, Vec<usize>> = HashMap::new();
58 |         for wids in tmp_homonyms.values() {
59 |             for wid in wids.iter() {
60 |                 homonyms.insert(*wid, wids.iter().copied().collect());
61 |             }
62 |         }
63 | 
64 |         let mut unknown_vocabulary = HashMap::new();
65 |         let mut unknown_features = HashMap::new();
66 |         let mut unknown_classes = HashMap::new();
67 |         let mut id = 1;
68 |         for (class, words) in unknown.into_iter() {
69 |             for row in words {
70 |                 unknown_features.insert(id, row.features.clone());
71 |                 let (idx, _) = vocabulary_index.insert_full(row.into());
72 |                 unknown_vocabulary.insert(id, idx);
73 |                 unknown_classes
74 |                     .entry(class.to_string())
75 |                     .or_insert_with(Vec::new)
76 |                     .push(id);
77 |                 id += 1;
78 |             }
79 |         }
80 | 
81 |         let word_set = WordFeaturesMap::new(
82 |             map_to_vec(known_features, Vec::new),
83 |             map_to_vec(unknown_features, Vec::new),
84 |         );
85 |         let ipadic = IPADic::from(
86 |             map_to_vec(vocabulary, || 0),
87 |             map_to_vec(homonyms, Vec::new),
88 |             classes,
89 |             matrix,
90 |             unknown_classes,
91 |             map_to_vec(unknown_vocabulary, || 0),
92 |             vocabulary_index,
93 |         );
94 |         let ret = LoadResult {
95 |             word_set,
96 |             ipadic,
97 |             surfaces,
98 |         };
99 |         Ok(ret)
100 |     }
101 | }
102 | 
103 | #[derive(Debug, Clone, Deserialize)]
104 | struct CSVRow {
105 |     /// 表層形 (EN: surface form)
106 |     /// https://taku910.github.io/mecab/dic-detail.html
107 |     surface_form: String,
108 |     /// 左文脈ID (単語を左から見たときの文脈 ID) (EN: left context ID, the context ID when the word is seen from the left)
109 |     /// https://taku910.github.io/mecab/dic-detail.html
110 |     left_context_id: usize,
111 |     /// 右文脈ID (単語を右から見たときの文脈 ID) (EN: right context ID, the context ID when the word is seen from the right)
112 |     /// https://taku910.github.io/mecab/dic-detail.html
113 |     right_context_id: usize,
114 |     /// 単語コスト (小さいほど出現しやすい) (EN: word cost; the smaller, the more likely the word appears)
115 |     /// コスト値は short int (16bit 整数) の範囲におさめる必要があります. (EN: cost values must fit within a short int, i.e. a 16-bit integer)
116 |     cost: i16,
117 |     /// 5カラム目以降は, ユーザ定義の CSV フィールドです. 基本的に どんな内容でも CSV の許す限り追加することができます. (EN: columns 5 and onward are user-defined CSV fields; anything the CSV format allows may be added)
118 |     /// https://taku910.github.io/mecab/dic-detail.html
119 |     features: Vec<String>,
120 | }
121 | impl From<CSVRow> for Morpheme {
122 |     fn from(row: CSVRow) -> Self {
123 |         Morpheme {
124 |             left_context_id: row.left_context_id,
125 |             right_context_id: row.right_context_id,
126 |             cost: row.cost,
127 |         }
128 |     }
129 | }
130 | 
131 | fn load_words_csv
<P>(path: P) -> Result<Vec<CSVRow>, Box<dyn Error>>
132 | where
133 |     P: AsRef<Path>,
134 | {
135 |     let eucjp = fs::read(path)?;
136 |     let (utf8, _, _) = EUC_JP.decode(&eucjp); // IPAdic source files are distributed in EUC-JP
137 |     let mut rdr = ReaderBuilder::new()
138 |         .has_headers(false)
139 |         .from_reader(utf8.as_bytes());
140 |     let mut words = vec![];
141 |     for row in rdr.records() {
142 |         let row = row?;
143 |         words.push(CSVRow {
144 |             surface_form: row[COL_SURFACE_FORM].to_string(),
145 |             left_context_id: row[COL_LEFT_CONTEXT_ID].parse::<usize>().unwrap(),
146 |             right_context_id: row[COL_RIGHT_CONTEXT_ID].parse::<usize>().unwrap(),
147 |             cost: row[COL_COST].parse::<i16>().unwrap(),
148 |             features: row
149 |                 .iter()
150 |                 .skip(COL_COST + 1)
151 |                 .map(|v| v.to_string())
152 |                 .collect::<Vec<_>>(),
153 |         })
154 |     }
155 |     Ok(words)
156 | }
157 | 
158 | fn load_chars
<P>(path: P) -> Result<CharClassifier, Box<dyn Error>>
159 | where
160 |     P: AsRef<Path>,
161 | {
162 |     let eucjp = fs::read(path)?;
163 |     let (utf8, _, _) = EUC_JP.decode(&eucjp);
164 |     let lines = utf8
165 |         .lines()
166 |         .filter(|line| !line.is_empty() && !line.starts_with('#'))
167 |         .map(|line| Regex::new(r"#.*$").unwrap().replace(line, ""))
168 |         .collect::<Vec<_>>();
169 | 
170 |     let head = lines.iter().take_while(|line| {
171 |         let parts = line.trim().split_ascii_whitespace().collect::<Vec<_>>();
172 |         !parts[0].starts_with("0x")
173 |     });
174 |     let mut chars = HashMap::new();
175 |     for line in head {
176 |         let parts = line.trim().split_ascii_whitespace().collect::<Vec<_>>();
177 |         let kind = parts[0].to_owned();
178 |         let class = kind.to_string();
179 |         let timing = if parts[1] == "0" {
180 |             InvokeTiming::Fallback
181 |         } else {
182 |             InvokeTiming::Always
183 |         };
184 |         let group_by_same_kind = parts[2] == "1";
185 |         let len = parts[3].parse::<usize>()?;
186 |         chars.insert(
187 |             kind,
188 |             CharDefinition {
189 |                 class,
190 |                 timing,
191 |                 group_by_same_kind,
192 |                 len,
193 |                 compatibilities: HashSet::new(),
194 |             },
195 |         );
196 |     }
197 | 
198 |     let tail = lines.iter().skip_while(|line| {
199 |         let parts = line.trim().split_ascii_whitespace().collect::<Vec<_>>();
200 |         !parts[0].starts_with("0x")
201 |     });
202 |     let mut ranges = vec![];
203 |     for line in tail {
204 |         let parts = line.trim().split_ascii_whitespace().collect::<Vec<_>>();
205 |         let range = parts[0]
206 |             .split("..")
207 |             .map(|c| u32::from_str_radix(&c[2..], 16).unwrap())
208 |             .map(|c| char::from_u32(c).unwrap())
209 |             .collect::<Vec<_>>();
210 |         let range = if range.len() > 1 {
211 |             (range[0] as u32, range[1] as u32)
212 |         } else {
213 |             (range[0] as u32, range[0] as u32)
214 |         };
215 |         let class = parts[1];
216 |         let compatibilities = parts
217 |             .iter()
218 |             .skip(2)
219 |             .map(|s| s.to_string())
220 |             .collect::<HashSet<_>>();
221 |         chars.get_mut(class).unwrap().compatibilities = compatibilities;
222 |         ranges.push(CharClass::from(range, class.to_string()));
223 |     }
224 | 
225 |     Ok(CharClassifier::from(chars, ranges))
226 | }
227 | 
228 | fn load_matrix
<P>(path: P) -> Result<Vec<Vec<i16>>, Box<dyn Error>>
229 | where
230 |     P: AsRef<Path>,
231 | {
232 |     let eucjp = fs::read(path)?;
233 |     let (utf8, _, _) = EUC_JP.decode(&eucjp);
234 |     let mut lines = utf8.lines();
235 |     let size = lines
236 |         .next()
237 |         .expect("failed to read the first line")
238 |         .split_ascii_whitespace()
239 |         .map(|p| p.parse::<usize>().unwrap())
240 |         .collect::<Vec<_>>();
241 |     let mut matrix = vec![vec![-1; size[1]]; size[0]];
242 |     for line in lines {
243 |         let parts = line.split_ascii_whitespace().collect::<Vec<_>>();
244 |         let left = parts[0].parse::<usize>()?;
245 |         let right = parts[1].parse::<usize>()?;
246 |         let cost = parts[2].parse::<i16>()?;
247 |         matrix[left][right] = cost;
248 |     }
249 |     Ok(matrix)
250 | }
251 | 
252 | fn load_unknown
<P>(path: P) -> Result<HashMap<String, Vec<CSVRow>>, Box<dyn Error>>
253 | where
254 |     P: AsRef<Path>,
255 | {
256 |     let words = load_words_csv(path)?;
257 |     let mut map = HashMap::<String, Vec<CSVRow>>::new();
258 |     for w in words.into_iter() {
259 |         map.entry(w.surface_form.to_string()) // for unk.def, the "surface form" column holds a character class name
260 |             .or_insert_with(Vec::new)
261 |             .push(w);
262 |     }
263 |     Ok(map)
264 | }
265 | 
266 | fn map_to_vec<T: Clone>(map: HashMap<usize, T>, default: impl Fn() -> T) -> Vec<T> {
267 |     let mut ret = vec![default(); map.len() + 1];
268 |     for (idx, value) in map.into_iter() {
269 |         ret[idx] = value;
270 |     }
271 |     ret
272 | }
273 | 
--------------------------------------------------------------------------------
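For orientation, here is a minimal end-to-end sketch of how the crates above compose. It is not a file from this repository: it assumes a `DoubleArray`, `IPADic`, and `WordFeaturesMap` have already been deserialized from the artifacts produced by `goya compile` (as done in goya-cli/src/main.rs), and the `analyze` helper itself is hypothetical, restating the flow of goya-cli/src/repl.rs using only APIs shown above.

```
use goya::double_array::DoubleArray;
use goya::lattice::Lattice;
use goya::word_features::WordFeaturesMap;
use goya_ipadic::ipadic::IPADic;

/// Hypothetical helper (not part of the repo): analyze one line of text,
/// printing each morpheme of the minimum-cost path with its features,
/// in the same TSV style as goya-cli/src/repl.rs.
fn analyze(text: &str, da: &DoubleArray, dict: &IPADic, word_set: &WordFeaturesMap) {
    // Build the word lattice, then pick the minimum-cost path.
    let lattice = Lattice::parse(text, da, dict);
    if let Some(path) = lattice.find_best() {
        for wid in path {
            // WordIdentifier carries its surface form; features are looked
            // up in the interned feature table (known vs. unknown words).
            let features = word_set.get(&wid).unwrap_or_default();
            println!("{}\t{}", wid.get_surface(), features.join(","));
        }
        println!("EOS");
    }
}
```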