> ita-corpus.txt
10 | const lines = require('fs').readFileSync('transcript_utf8.csv', 'utf8')
11 | .trim()
12 | .split('\n')
13 | .map(line => line.split(',')[0].split(':')[1])
14 | .join('\n')
15 | console.log(lines)
16 | CODE
17 |
--------------------------------------------------------------------------------
/benchmarks/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "benchmarks",
3 | "private": true,
4 | "version": "0.0.0",
5 | "type": "module",
6 | "scripts": {
7 | "preinstall": "npm run build:core && npm run build:features",
8 | "build:core": "wasm-pack build --release --target nodejs ../wasm-core",
9 | "build:features": "wasm-pack build --release --target nodejs ../wasm-features",
10 | "test": "echo \"Error: no test specified\" && exit 1"
11 | },
12 | "author": "",
13 | "license": "ISC",
14 | "dependencies": {
15 | "kuromoji": "^0.1.2",
16 | "wasm-core": "../wasm-core/pkg",
17 | "wasm-features": "../wasm-features/pkg"
18 | },
19 | "devDependencies": {
20 | "benchmark": "^2.1.4"
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/goya-cli/src/path_util.rs:
--------------------------------------------------------------------------------
1 | use std::fs::create_dir_all;
2 | use std::io;
3 | use std::path::{Path, PathBuf};
4 |
/// Resolves the on-disk locations of the compiled dictionary artifacts
/// stored under a single base directory.
pub struct PathUtil {
    // Base directory that holds all generated dictionary files.
    base: String,
}
impl PathUtil {
    /// Wrap `base` (the dictionary directory) in a `PathUtil`.
    pub fn from(base: String) -> PathUtil {
        PathUtil { base }
    }

    /// Create the base directory, including any missing parents.
    pub fn mkdirp(&self) -> io::Result<()> {
        create_dir_all(&self.base)
    }

    /// Path to `da.bin` inside the base directory.
    pub fn da_path(&self) -> PathBuf {
        self.resolve("da.bin")
    }

    /// Path to `dict.bin` inside the base directory.
    pub fn dict_path(&self) -> PathBuf {
        self.resolve("dict.bin")
    }

    /// Path to `features.bin` inside the base directory.
    pub fn features_path(&self) -> PathBuf {
        self.resolve("features.bin")
    }

    // Join a file name onto the base directory.
    fn resolve(&self, file_name: &str) -> PathBuf {
        Path::new(&self.base).join(file_name)
    }
}
29 |
--------------------------------------------------------------------------------
/wasm-features/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | authors = ["Leko "]
3 | categories = ["wasm", "data-structures", "text-processing"]
4 | description = "WebAssembly binding of Goya"
5 | edition = "2018"
6 | license = "Apache-2.0 OR MIT"
7 | name = "goya-features"
8 | publish = false
9 | repository = "https://github.com/Leko/goya"
10 | version = "0.1.9"
11 |
12 | [lib]
13 | crate-type = ["cdylib"]
14 |
15 | [dependencies]
16 | goya = {version = "^0.1.9", path = "../goya"}
17 | lazy_static = "1.4"
18 | rmp-serde = "1.0.0-beta.2"
19 | serde-wasm-bindgen = "0.3.1"
20 | wasm-bindgen = {version = "0.2.78", features = ["serde-serialize"]}
21 |
22 | [package.metadata.wasm-pack.profile.release]
23 | wasm-opt = ['--dce', '-O4']
24 |
--------------------------------------------------------------------------------
/benchmarks/kuromoji.js:
--------------------------------------------------------------------------------
1 | import { EOL } from "os";
2 | import fs from "fs";
3 | import path from "path";
4 | import kuromoji from "kuromoji";
5 |
// Resolve kuromoji's bundled IPA dictionary relative to this module.
const dicPath = path.join(
  path.dirname(new URL(import.meta.url).pathname),
  "node_modules",
  "kuromoji",
  "dict"
);

// Tokenize every line from STDIN, then report memory usage.
// This is the kuromoji counterpart of the goya memory benchmark.
new Promise((resolve, reject) => {
  kuromoji.builder({ dicPath }).build((err, tokenizer) => {
    if (err) {
      reject(err);
    } else {
      resolve(tokenizer);
    }
  });
})
  .then((tokenizer) => {
    const lines = fs.readFileSync("/dev/stdin", "utf8").trim().split(EOL);
    for (const line of lines) {
      tokenizer.tokenize(line);
    }
    console.log(process.memoryUsage());
  })
  // Fix: a dictionary-load failure previously surfaced only as an
  // unhandled promise rejection; fail loudly with a non-zero exit code.
  .catch((err) => {
    console.error(err);
    process.exit(1);
  });
28 |
--------------------------------------------------------------------------------
/playground/src/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | Playground | Goya: Yet another morphological analyzer for Rust and
7 | WebAssembly
8 |
9 |
13 |
21 |
22 |
23 |
24 |
28 |
29 |
30 |
--------------------------------------------------------------------------------
/wasm-core/README.md:
--------------------------------------------------------------------------------
1 | ## Getting started
2 |
3 | ### 分かち書き
4 |
5 | goya-core を import して `parse` 関数を使用します。parse メソッドの戻り値から各種メソッドを呼べるようにしています。
6 | 分かち書きをするなら`wakachi`メソッドを使用します。
7 |
8 | ```ts
9 | import core from "goya-core";
10 |
11 | const lattice = core.parse("すもももももももものうち");
12 | lattice.wakachi(); // => ["すもも", "も", "もも", "も", "もも", "の", "うち"]
13 | ```
14 |
15 | ### 形態素解析
16 |
17 | 形態素解析の結果を得るには`find_best`メソッドを使用します。find_best は形態素の配列を返します。各形態素はこれらのフィールドを持っています。サイズ削減のためこのオブジェクトは品詞や読み仮名などの素性を持っていません。
18 |
19 | - wid: 語彙 ID。goya-features で使用 (後述)
20 | - is_known: 既知語なら true、未知語なら false
21 | - surface_form: 表層形
22 |
23 | ```ts
24 | lattice.find_best()[0].surface_form; // => "すもも"
25 | lattice.find_best()[0].is_known; // => true
26 | lattice.find_best()[0].wid; // => 次項で説明
27 | ```
28 |
--------------------------------------------------------------------------------
/wasm-features/README.md:
--------------------------------------------------------------------------------
1 | ## Getting started
2 |
3 | ```ts
4 | import core from "goya-core";
5 | import { get_features } from "wasm-features";
6 |
7 | // Mecab IPA辞書のデフォルトでは品詞(Part of Speech)は添字0
8 | const INDEX_POS = 0;
9 |
10 | const lattice = core.parse("すもももももももものうち");
11 | const morphemes = lattice.find_best();
12 | // widの配列から素性の配列を得る
13 | const features = get_features(morphemes.map((morph) => morph.wid));
14 | // 1要素ずつ取得してもいいが、まとめて取得する方がオーバーヘッドが少なく高速
15 | get_features([morphemes[0].wid]);
16 |
17 | morphemes.forEach(({ surface_form }, i) => {
18 | const feature = features[i]; // 渡したwid通りの順序で素性が得られる
19 | const line = surface_form + "\t" + feature.join(",");
20 | console.log(line); // => "すもも\t名詞,一般,*,*,*,*,すもも,スモモ,スモモ"
21 | console.log(feature[INDEX_POS]); // => "名詞"
22 | });
23 | ```
24 |
--------------------------------------------------------------------------------
/goya-cli/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | authors = ["Leko "]
3 | categories = ["command-line-interface"]
4 | description = "CLI for Goya"
5 | edition = "2018"
6 | license = "Apache-2.0 OR MIT"
7 | name = "goya-cli"
8 | repository = "https://github.com/Leko/goya"
9 | version = "0.1.9"
10 |
11 | [[bin]]
12 | name = "goya"
13 | path = "src/main.rs"
14 |
15 | [dependencies]
16 | bytesize = {version = "1.1.0", features = ["serde"]}
17 | clap = {version = "3.0.0-rc.9", features = ["derive"]}
18 | console = "0.14"
19 | dirs = "4.0"
20 | futures = "0.3.17"
21 | goya = {version = "^0.1.9", path = "../goya"}
22 | goya-ipadic = {version = "^0.1.9", path = "../ipadic"}
23 | indexmap = {version = "1.7", features = ["serde"]}
24 | rkyv = {version = "0.7.19", features = ["indexmap"]}
25 | rmp-serde = "1.0.0-beta.2"
26 |
--------------------------------------------------------------------------------
/scripts/build-wasm:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# Build a wasm crate (directory passed as $1) for both web and nodejs
# targets, then merge the two outputs into a single pkg/ directory.
# NOTE(review): no `set -e` and $1 is unquoted — a failed step cascades
# silently; confirm CI expectations before hardening.
cd $1
wasm-pack build --release --out-dir pkg/web --target web
wasm-pack build --release --out-dir pkg/nodejs --target nodejs

# Keep a single README and package.json at the pkg/ root.
mv pkg/web/README.md pkg/
mv pkg/nodejs/package.json pkg/
rm -rf pkg/{web,nodejs}/package.json

# NOTE(review): the heredoc below was garbled during extraction. The inline
# node script appears to merge the `files` lists of both build targets into
# pkg/package.json — recover the full script from upstream before editing.
node < path.join('nodejs', f)).concat(pkg.files.map(f => path.join('web', f)))

fs.writeFileSync('./pkg/package.json', JSON.stringify(pkg, null, 2))
CODE
--------------------------------------------------------------------------------
/playground/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "compilerOptions": {
3 | "incremental": true /* Enable incremental compilation */,
4 | "jsx": "react",
5 | "target": "es2020" /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */,
6 | "module": "es2020" /* Specify what module code is generated. */,
7 | "lib": ["DOM"],
8 | "moduleResolution": "Node",
9 | "resolveJsonModule": true,
10 | "esModuleInterop": true /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables `allowSyntheticDefaultImports` for type compatibility. */,
11 | "forceConsistentCasingInFileNames": true /* Ensure that casing is correct in imports. */,
12 | "strict": true /* Enable all strict type-checking options. */,
13 | "skipLibCheck": true /* Skip type checking all .d.ts files. */
14 | }
15 | }
16 |
--------------------------------------------------------------------------------
/wasm-core/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | authors = ["Leko "]
3 | categories = ["wasm", "data-structures", "text-processing"]
4 | description = "WebAssembly binding of Goya"
5 | edition = "2018"
6 | license = "Apache-2.0 OR MIT"
7 | name = "goya-core"
8 | publish = false
9 | repository = "https://github.com/Leko/goya"
10 | version = "0.1.9"
11 |
12 | [lib]
13 | crate-type = ["cdylib"]
14 |
15 | [dependencies]
16 | futures = "0.3.17"
17 | goya = {version = "^0.1.9", path = "../goya"}
18 | goya-ipadic = {version = "^0.1.9", path = "../ipadic"}
19 | lazy_static = "1.4"
20 | rkyv = {version = "0.7.19", features = ["indexmap"]}
21 | rmp-serde = "1.0.0-beta.2"
22 | serde = {version = "1.0", features = ["derive"]}
23 | serde-wasm-bindgen = "0.3.1"
24 | wasm-bindgen = {version = "0.2.78", features = ["serde-serialize"]}
25 | wasm-bindgen-futures = "0.4.28"
26 |
27 | [package.metadata.wasm-pack.profile.release]
28 | wasm-opt = ['--dce', '-O4']
29 |
--------------------------------------------------------------------------------
/goya/src/morpheme.rs:
--------------------------------------------------------------------------------
1 | use serde::{Deserialize, Serialize};
2 |
/// A single morpheme entry from a MeCab-format dictionary: the two context
/// ids used for connection-cost lookup plus the word's occurrence cost.
#[derive(
    Debug,
    Hash,
    PartialEq,
    Eq,
    PartialOrd,
    Clone,
    Serialize,
    Deserialize,
    rkyv::Archive,
    rkyv::Serialize,
    rkyv::Deserialize,
)]
pub struct Morpheme {
    /// Left context ID (the context ID when the word is seen from the left).
    /// https://taku910.github.io/mecab/dic-detail.html
    pub left_context_id: usize,
    /// Right context ID (the context ID when the word is seen from the right).
    /// https://taku910.github.io/mecab/dic-detail.html
    pub right_context_id: usize,
    /// Word occurrence cost (the smaller, the more likely the word appears).
    /// Per the MeCab dictionary docs, the value must fit in a 16-bit integer.
    pub cost: i16,
}
impl Morpheme {
    /// Build a `Morpheme` from its dictionary fields.
    pub fn new(left_context_id: usize, right_context_id: usize, cost: i16) -> Morpheme {
        Morpheme {
            left_context_id,
            right_context_id,
            cost,
        }
    }
}
36 |
--------------------------------------------------------------------------------
/wasm-features/src/lib.rs:
--------------------------------------------------------------------------------
1 | use goya::id::WordIdentifier;
2 | use goya::word_features::WordFeaturesMap;
3 | use wasm_bindgen::prelude::*;
4 |
5 | #[macro_use]
6 | extern crate lazy_static;
7 |
lazy_static! {
    // Word-id -> feature-strings table, deserialized once (MessagePack via
    // rmp_serde) from the artifact embedded at compile time. The file is
    // produced by the dictionary build step (see scripts/build-dict).
    static ref WORD_FEATURES: WordFeaturesMap =
        rmp_serde::from_slice(include_bytes!("../__generated__/features.bin")).unwrap();
}
12 |
13 | #[wasm_bindgen]
14 | pub fn get_features(wids: &JsValue) -> JsValue {
15 | let wids: Vec = wids.into_serde().unwrap();
16 | let features: Vec> = wids
17 | .iter()
18 | .map(|wid| {
19 | WORD_FEATURES
20 | .get(wid)
21 | .unwrap()
22 | .iter()
23 | .map(|s| s.to_string())
24 | .collect()
25 | })
26 | .collect::>();
27 | serde_wasm_bindgen::to_value(&features).unwrap()
28 | }
29 |
/// Eagerly deserialize the embedded feature dictionary so the first call to
/// `get_features` does not pay the one-time initialization cost.
#[wasm_bindgen]
pub fn ready() {
    lazy_static::initialize(&WORD_FEATURES);
}
34 |
--------------------------------------------------------------------------------
/goya/src/dictionary.rs:
--------------------------------------------------------------------------------
1 | use super::char_class::CharDefinition;
2 | use super::id::WordIdentifier;
3 | use super::morpheme::Morpheme;
4 |
/// Read-only lookup interface over a compiled morphological dictionary.
pub trait Dictionary {
    /// Resolve a word identifier to its morpheme, dispatching on whether the
    /// id refers to a known (in-dictionary) or unknown word.
    fn get(&self, wid: &WordIdentifier) -> Option<&Morpheme> {
        match wid {
            WordIdentifier::Known(wid, _) => self.get_known_morpheme(wid),
            WordIdentifier::Unknown(wid, _) => self.get_unknown_morpheme(wid),
        }
    }
    /// Morpheme entry for a known-word id.
    fn get_known_morpheme(&self, wid: &usize) -> Option<&Morpheme>;
    /// Morpheme entry for an unknown-word id.
    fn get_unknown_morpheme(&self, wid: &usize) -> Option<&Morpheme>;
    /// Ids sharing a surface with `wid` — presumably `Option<&Vec<usize>>`;
    /// NOTE(review): the element type was stripped during extraction, restore
    /// from upstream before relying on this signature.
    fn resolve_homonyms(&self, wid: &usize) -> Option<&Vec>;
    /// Longest run of characters starting at byte offset `start` that share
    /// the given character class — used for unknown-word grouping.
    fn take_unknown_chars_seq(&self, def: &CharDefinition, text: &str, start: &usize) -> String;
    /// Character class (per char.def) for a single character.
    fn classify_char(&self, c: &char) -> &CharDefinition;
    /// All unknown-word morphemes registered under a character class name.
    fn get_unknown_morphemes_by_class(&self, class: &str) -> Vec<(usize, &Morpheme)>;
    /// Connection cost between a left and a right context id, if defined.
    fn transition_cost(&self, left: &usize, right: &usize) -> Option<&i16>;
    /// Occurrence cost of the known word `wid`, if present.
    fn occurrence_cost(&self, wid: &usize) -> Option;
}
21 |
--------------------------------------------------------------------------------
/goya/src/id.rs:
--------------------------------------------------------------------------------
1 | use serde::{Deserialize, Serialize};
2 |
/// Identifier of a word occurrence: a dictionary id paired with the surface
/// form it matched in the input text.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "tag", content = "id")]
pub enum WordIdentifier {
    Known(usize, String),   // ID, surface_form — word found in the dictionary
    Unknown(usize, String), // ID, surface_form — handled by unknown-word rules
}
impl WordIdentifier {
    /// Borrow the surface form, regardless of variant.
    pub fn get_surface(&self) -> &str {
        match self {
            Self::Known(_, surface) => surface,
            Self::Unknown(_, surface) => surface,
        }
    }
}
17 |
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn get_surface_known() {
        // The Known variant exposes its surface form unchanged.
        let id = WordIdentifier::Known(0, String::from("test"));
        assert_eq!(id.get_surface(), "test");
    }

    #[test]
    fn get_surface_unknown() {
        // The Unknown variant behaves identically.
        let id = WordIdentifier::Unknown(0, String::from("test"));
        assert_eq!(id.get_surface(), "test");
    }
}
36 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 Shingo Inoue
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/playground/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "@goya/playground",
3 | "private": true,
4 | "version": "1.0.0",
5 | "scripts": {
6 | "start": "webpack-dev-server --mode development",
7 | "build": "webpack --mode production"
8 | },
9 | "author": "Leko ",
10 | "license": "MIT",
11 | "devDependencies": {
12 | "@mui/styles": "^5.0.1",
13 | "@swc/core": "^1.2.92",
14 | "@vue/preload-webpack-plugin": "^2.0.0",
15 | "@wasm-tool/wasm-pack-plugin": "^1.5.0",
16 | "file-loader": "^6.2.0",
17 | "html-webpack-plugin": "^5.3.2",
18 | "swc-loader": "^0.1.15",
19 | "typescript": "^4.4.3",
20 | "webpack": "^5.56.0",
21 | "webpack-cli": "^4.8.0",
22 | "webpack-dev-server": "^4.3.0",
23 | "workbox-webpack-plugin": "^6.3.0"
24 | },
25 | "dependencies": {
26 | "@emotion/react": "^11.4.1",
27 | "@emotion/styled": "^11.3.0",
28 | "@mui/icons-material": "^5.0.1",
29 | "@mui/lab": "^5.0.0-alpha.49",
30 | "@mui/material": "^5.0.2",
31 | "@mui/x-data-grid": "^5.0.0-beta.0",
32 | "comlink": "^4.3.1",
33 | "react": "^17.0.2",
34 | "react-dom": "^17.0.2",
35 | "react-use": "^17.3.1",
36 | "viz.js": "^2.1.2"
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/playground/src/Dot.tsx:
--------------------------------------------------------------------------------
1 | import React, { useCallback, useEffect, useRef, useState } from "react";
2 | import Box from "@mui/material/Box";
3 | import Button from "@mui/material/Button";
4 | import Viz from "viz.js";
5 | import workerURL from "viz.js/full.render.js";
6 |
7 | type Props = {
8 | dot: string;
9 | };
10 |
11 | const viz = new Viz({ workerURL });
12 |
// Renders a graphviz DOT string (the parse lattice) to SVG via Viz.js and
// offers the result as a downloadable file.
export default function Dot(props: Props) {
  const { dot } = props;
  const [svg, setSVG] = useState("");

  // Download the rendered lattice as an SVG file.
  const handleDownload = useCallback(() => {
    const a = document.createElement("a");
    a.download = "lattice.svg";
    // Fix: the scheme was `data://image/svg+xml,…` — data URLs use a single
    // colon with no slashes (`data:image/svg+xml,…`), so the previous href
    // was invalid and the download did not contain the SVG.
    a.href = `data:image/svg+xml,${encodeURIComponent(svg)}`;
    a.click();
  }, [svg]);

  // Re-render whenever the DOT source changes; skip empty input.
  useEffect(() => {
    if (!dot || dot.trim().length === 0) {
      return;
    }
    viz.renderSVGElement(dot).then((svg: SVGSVGElement) => {
      svg.style.width = "100%";
      svg.style.height = "100%";
      setSVG(svg.outerHTML);
    });
  }, [dot, setSVG]);

  if (!svg) {
    return null;
  }
  // NOTE(review): the JSX below was garbled during extraction (element tags
  // stripped); left as found — recover the markup from upstream.
  return (
    
    
    
    
  );
}
47 |
--------------------------------------------------------------------------------
/scripts/build-dict:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env node
2 | const os = require("os");
3 | const fs = require("fs/promises");
4 | const path = require("path");
5 | const { spawnSync } = require("child_process");
6 |
// Compile the raw IPA dictionary (path given as argv[2]) into binary
// artifacts in a temp dir, then install them under wasm-core/__generated__
// and wasm-features/__generated__.
async function main() {
  const tmp = await fs.mkdtemp(path.join(os.tmpdir(), "goya-dict-"));
  const { status, error } = spawnSync(
    "cargo",
    [
      "+nightly",
      "run",
      "-p",
      "goya-cli",
      "--release",
      "--",
      "--dicdir",
      tmp,
      "compile",
      process.argv[2],
    ],
    { stdio: "inherit" }
  );
  // Fix: the spawnSync result was previously ignored, so a failed compile
  // still moved empty/stale artifacts into place. Abort instead.
  if (error) {
    throw error;
  }
  if (status !== 0) {
    throw new Error(`goya-cli compile exited with status ${status}`);
  }

  // Install the generated dictionary into wasm-core/__generated__.
  const base = path.join(__dirname, "..");
  const generatedDir = path.join(base, "wasm-core", "__generated__");
  await fs.rm(generatedDir, { recursive: true, force: true });
  await fs.mkdir(path.dirname(generatedDir), { recursive: true });
  await fs.rename(tmp, generatedDir);

  // features.bin is only consumed by wasm-features; relocate it there.
  const generatedDir2 = path.join(base, "wasm-features", "__generated__");
  await fs.rm(generatedDir2, { recursive: true, force: true });
  await fs.mkdir(generatedDir2, { recursive: true });
  await fs.rename(
    path.join(generatedDir, "features.bin"),
    path.join(generatedDir2, "features.bin")
  );
}

main().catch((e) => {
  console.error(e.stack);
  process.exit(1);
});
45 |
--------------------------------------------------------------------------------
/benchmarks/bench.js:
--------------------------------------------------------------------------------
1 | import { EOL } from "os";
2 | import path from "path";
3 | import fs from "fs";
4 | import Benchmark from "benchmark";
5 | import kuromoji from "kuromoji";
6 | import core from "wasm-core";
7 | import features from "wasm-features";
8 |
// Build a kuromoji tokenizer backed by its bundled IPA dictionary.
function buildKuromojiTokenizer() {
  const dicPath = path.join(
    path.dirname(new URL(import.meta.url).pathname),
    "node_modules",
    "kuromoji",
    "dict"
  );
  return new Promise((resolve, reject) => {
    kuromoji.builder({ dicPath }).build((err, tokenizer) => {
      if (err) {
        return reject(err);
      }
      resolve(tokenizer);
    });
  });
}

// Initialize both goya wasm modules and kuromoji in parallel.
const [, , tokenizer] = await Promise.all([
  core.ready(),
  features.ready(),
  buildKuromojiTokenizer(),
]);

// Benchmark corpus: one sentence per line on STDIN.
const lines = fs.readFileSync("/dev/stdin", "utf8").trim().split(EOL);
const suite = new Benchmark.Suite();
suite
  .add("goya", () => {
    for (const line of lines) {
      const lattice = core.parse(line);
      features.get_features(lattice.find_best().map(({ wid }) => wid));
    }
  })
  .add("kuromoji", () => {
    for (const line of lines) {
      tokenizer.tokenize(line);
    }
  })
  .on("cycle", (event) => {
    console.log(String(event.target));
  })
  // Function expression on purpose: Benchmark binds `this` to the suite.
  .on("complete", function () {
    console.log("Fastest is " + this.filter("fastest").map("name"));
  })
  .run({ async: true });
53 |
--------------------------------------------------------------------------------
/playground/src/goya.worker.ts:
--------------------------------------------------------------------------------
1 | import * as Comlink from "comlink";
2 |
3 | export type Stats = {
4 | loadWasm: number;
5 | loadDict: number;
6 | parse: number;
7 | };
8 |
9 | const kLoad = "loadWasm";
10 | const kDict = "loadDict";
11 | const kParse = "parse";
12 |
13 | const encoder = new TextEncoder();
14 | const decoder = new TextDecoder();
15 |
16 | async function parse(input: ArrayBufferLike): Promise {
17 | performance.mark(kLoad);
18 | const mod = await import(
19 | /* webpackChunkName: "core" */ "../../wasm-core/pkg"
20 | );
21 | performance.mark(kDict);
22 | await mod.ready();
23 | performance.mark(kParse);
24 | const lattice = mod.parse(decoder.decode(input));
25 |
26 | const res = encoder.encode(
27 | JSON.stringify({
28 | stats: {
29 | loadWasm: performance.measure("loadWasm", kLoad, kDict).duration,
30 | loadDict: performance.measure("loadDict", kDict, kParse).duration,
31 | parse: performance.measure("parse", kParse).duration,
32 | },
33 | dot: lattice.as_dot(),
34 | wakachi: lattice.wakachi(),
35 | best: lattice.find_best(),
36 | })
37 | );
38 | return Comlink.transfer(res, [res.buffer]);
39 | }
40 |
41 | async function getFeatures(payload: ArrayBufferLike): Promise {
42 | const mod = await import(
43 | /* webpackChunkName: "features" */ "../../wasm-features/pkg"
44 | );
45 | const features = mod.get_features(JSON.parse(decoder.decode(payload)));
46 | const res = encoder.encode(JSON.stringify(features));
47 | return Comlink.transfer(res, [res.buffer]);
48 | }
49 |
50 | Comlink.expose({ parse, getFeatures });
51 |
--------------------------------------------------------------------------------
/.github/workflows/CD.yml:
--------------------------------------------------------------------------------
1 | name: CD
2 |
3 | on:
4 | push:
5 | tags:
6 | - "v*"
7 |
8 | env:
9 | CARGO_TERM_COLOR: always
10 |
11 | jobs:
12 | crates-io:
13 | runs-on: ubuntu-latest
14 | steps:
15 | - uses: actions/checkout@v2
16 | - uses: actions-rs/toolchain@v1
17 | with:
18 | toolchain: stable
19 | - uses: actions-rs/cargo@v1
20 | with:
21 | command: login
22 | args: ${{ secrets.CRATES_IO_TOKEN }}
23 | - run: cd goya && cargo publish && sleep 30
24 | - run: cd ipadic && cargo publish && sleep 30
25 | - run: cd goya-cli && cargo publish && sleep 30
26 | npm:
27 | runs-on: ubuntu-latest
28 | steps:
29 | - uses: actions/checkout@v2
30 | - uses: actions/setup-node@v2
31 | with:
32 | node-version: "16"
33 | registry-url: "https://registry.npmjs.org"
34 | - uses: actions-rs/toolchain@v1
35 | with:
36 | toolchain: nightly
37 | - run: cargo install wasm-pack
38 | - run: |
39 | NAME='mecab-ipadic.tar.gz'
40 | curl -v -L 'https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7MWVlSDBCSXZMTXM' -o $NAME
41 | tar -zxvf $NAME
42 | rm -rf $NAME
43 | ./scripts/build-dict mecab-ipadic-2.7.0-20070801
44 | - run: ./scripts/build-wasm wasm-core
45 | - run: cd wasm-core/pkg && npm publish
46 | env:
47 | NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}
48 | - run: ./scripts/build-wasm wasm-features
49 | - run: cd wasm-features/pkg && npm publish
50 | env:
51 | NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}
52 |
--------------------------------------------------------------------------------
/.github/workflows/CI.yml:
--------------------------------------------------------------------------------
1 | name: CI
2 |
3 | on:
4 | push:
5 | branches: [main]
6 | pull_request:
7 | branches: [main]
8 |
9 | env:
10 | CARGO_TERM_COLOR: always
11 |
12 | jobs:
13 | cargo:
14 | runs-on: ubuntu-latest
15 | steps:
16 | - uses: actions/checkout@v2
17 | - uses: actions-rs/toolchain@v1
18 | with:
19 | toolchain: stable
20 | - run: |
21 | NAME='mecab-ipadic.tar.gz'
22 | curl -v -L 'https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7MWVlSDBCSXZMTXM' -o $NAME
23 | tar -zxvf $NAME
24 | rm -rf $NAME
25 | cargo run -p goya-cli --release -- compile mecab-ipadic-2.7.0-20070801
26 | - run: cargo clippy --workspace --exclude goya-core --exclude goya-features
27 | - run: cargo build --workspace --exclude goya-core --exclude goya-features
28 | - run: cargo test --workspace --exclude goya-core --exclude goya-features
29 | wasm:
30 | runs-on: ubuntu-latest
31 | steps:
32 | - uses: actions/checkout@v2
33 | - uses: actions/setup-node@v2
34 | with:
35 | node-version: "16"
36 | cache: "npm"
37 | cache-dependency-path: benchmarks/package-lock.json
38 | - uses: actions-rs/toolchain@v1
39 | with:
40 | toolchain: nightly
41 | - run: cargo install wasm-pack
42 | - run: |
43 | NAME='mecab-ipadic.tar.gz'
44 | curl -v -L 'https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7MWVlSDBCSXZMTXM' -o $NAME
45 | tar -zxvf $NAME
46 | rm -rf $NAME
47 | ./scripts/build-dict mecab-ipadic-2.7.0-20070801
48 | - run: |
49 | cd benchmarks
50 | ./scripts/setup
51 | npm i
52 | - run: cd benchmarks && node goya.js < ita-corpus.txt
53 | - run: cd benchmarks && node kuromoji.js < ita-corpus.txt
54 | - run: cd benchmarks && node bench.js < ita-corpus.txt
55 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Goya
2 |
3 | [](https://crates.io/crates/goya)
4 | [](https://docs.rs/goya)
5 |
6 | Goya is a Japanese Morphological Analyzer written in Rust.
7 | The main goal is to compile to WebAssembly for morphological analysis in browsers and other JavaScript runtimes. In addition, it can be used with the CLI and Rust.
8 |
9 | [Try Goya playground](https://goya.pages.dev/). It uses Goya-wasm from a WebWorker.
10 |
11 | ## Getting started
12 |
13 | ### Fetch the latest IPA dictionary
14 |
15 | Download the latest IPA dictionary from [the official Mecab website](https://taku910.github.io/mecab/) and unzip it.
16 |
17 | ### Install Goya CLI
18 |
19 | ```
20 | cargo install goya-cli
21 | ```
22 |
23 | ### Compile the IPA dictionary
24 |
25 | Compile the IPA dictionary to generate a binary dictionary for morphological analysis. It may take a few minutes.
26 |
27 | ```
28 | goya compile /path/to/ipadic
29 | ```
30 |
31 | The binary dictionary will be generated in the `~/.goya` directory by default. You can change the destination with the `--dicdir` option.
32 |
33 | ```
34 | goya --dicdir=/path/to/generated compile /path/to/ipadic
35 | ```
36 |
37 | ### Run Morphological Analysis
38 |
39 | Goya takes input from STDIN. The easiest way is to use the echo command and pipe the string to Goya.
40 |
41 | ```
42 | $ echo すもももももももものうち | goya
43 | すもも 名詞,一般,*,*,*,*,すもも,スモモ,スモモ
44 | も 助詞,係助詞,*,*,*,*,も,モ,モ
45 | もも 名詞,一般,*,*,*,*,もも,モモ,モモ
46 | も 助詞,係助詞,*,*,*,*,も,モ,モ
47 | もも 名詞,一般,*,*,*,*,もも,モモ,モモ
48 | の 助詞,連体化,*,*,*,*,の,ノ,ノ
49 | うち 名詞,非自立,副詞可能,*,*,*,うち,ウチ,ウチ
50 | EOS
51 | ```
52 |
53 | If you specified the `--dicdir` option when compiling the dictionary, you should also specify it when running the goya command.
54 |
55 | ```
56 | echo すもももももももものうち | goya --dicdir=/path/to/generated
57 | ```
58 |
59 | ## Release
60 |
61 | ```
62 | cargo release --workspace --no-tag --skip-publish --dependent-version Upgrade
63 | git tag v{{VERSION}}
64 | git push origin v{{VERSION}}
65 | ```
66 |
--------------------------------------------------------------------------------
/playground/webpack.config.js:
--------------------------------------------------------------------------------
1 | const path = require("path");
2 | const zlib = require("zlib");
3 | const HtmlWebpackPlugin = require("html-webpack-plugin");
4 | const WasmPackPlugin = require("@wasm-tool/wasm-pack-plugin");
5 | const { GenerateSW: WorkboxPlugin } = require("workbox-webpack-plugin");
6 | const PreloadWebpackPlugin = require("@vue/preload-webpack-plugin");
7 |
8 | const { BROTLI_PARAM_QUALITY, BROTLI_MAX_QUALITY } = zlib.constants;
9 |
10 | const swcOption = {
11 | jsc: {
12 | parser: {
13 | syntax: "typescript",
14 | tsx: true,
15 | dynamicImport: true,
16 | },
17 | target: "es2020",
18 | },
19 | };
20 |
21 | module.exports = {
22 | entry: "./src/index.tsx",
23 | output: {
24 | path: path.resolve(__dirname, "dist"),
25 | filename: "[name].[contenthash].js",
26 | chunkFilename: "[name].[chunkhash].js",
27 | },
28 | resolve: {
29 | extensions: [".tsx", ".ts", ".js"],
30 | },
31 | module: {
32 | rules: [
33 | {
34 | test: /\.tsx?$/,
35 | use: {
36 | loader: "swc-loader",
37 | options: swcOption,
38 | },
39 | },
40 | // It's for Viz.js
41 | {
42 | test: /\.render\.js$/,
43 | use: ["file-loader"],
44 | },
45 | ],
46 | },
47 | plugins: [
48 | new HtmlWebpackPlugin({
49 | template: path.resolve(__dirname, "src", "index.html"),
50 | }),
51 | new PreloadWebpackPlugin({
52 | rel: "preconnect",
53 | fileWhitelist: [/.wasm$/],
54 | }),
55 | new WasmPackPlugin({
56 | crateDirectory: path.resolve(__dirname, "..", "wasm-core"),
57 | forceMode: "production",
58 | }),
59 | new WasmPackPlugin({
60 | crateDirectory: path.resolve(__dirname, "..", "wasm-features"),
61 | forceMode: "production",
62 | }),
63 | ...(process.env.NODE_ENV === "production"
64 | ? [
65 | new WorkboxPlugin({
66 | clientsClaim: true,
67 | skipWaiting: true,
68 | }),
69 | ]
70 | : []),
71 | ],
72 | experiments: {
73 | asyncWebAssembly: true,
74 | },
75 | };
76 |
--------------------------------------------------------------------------------
/goya/src/common_prefix_tree.rs:
--------------------------------------------------------------------------------
1 | use std::collections::BTreeMap;
2 |
/// Trie over dictionary words, used to build the double-array structure.
/// NOTE(review): the field generics were stripped during extraction
/// (`Option`, `BTreeMap` with no parameters) — reconstructed from usage
/// (`children.keys().min() -> Option<&char>`, ids are `usize`).
#[derive(Debug, Default, PartialEq, Eq)]
pub struct CommonPrefixTree {
    /// Word id stored on the node that terminates a registered word;
    /// `None` on intermediate nodes.
    pub id: Option<usize>,
    /// Children keyed by the next character. `BTreeMap` keeps keys sorted,
    /// making DFS traversal order deterministic.
    pub children: BTreeMap<char, CommonPrefixTree>,
}
impl CommonPrefixTree {
    /// True when this node terminates a registered word.
    pub fn can_stop(&self) -> bool {
        self.id.is_some()
    }

    /// Total number of nodes in this subtree, including this node.
    pub fn size(&self) -> usize {
        self.entires_dfs().len()
    }

    /// Smallest child character, if any.
    pub fn min_char(&self) -> Option<&char> {
        self.children.keys().min()
    }

    /// Register `word` under `id`. A trailing NUL sentinel marks word end.
    ///
    /// Fix: the previous recursive insertion called `text.chars().nth(cursor)`
    /// and `text.chars().count()` at every level — O(n²) per word. This
    /// iterative descent visits each character exactly once.
    pub fn append(&mut self, id: usize, word: &str) {
        let mut node = self;
        for c in word.chars().chain(std::iter::once('\0')) {
            node = node.children.entry(c).or_default();
        }
        node.id = Some(id);
    }

    /// All (prefix, node) pairs in depth-first order, starting at this node.
    /// NOTE: "entires" is a historical typo kept for API compatibility.
    pub fn entires_dfs(&self) -> Vec<(String, &CommonPrefixTree)> {
        self.dfs_collect(&String::new())
    }

    // Depth-first collection; children are visited in sorted key order.
    fn dfs_collect(&self, prefix: &str) -> Vec<(String, &CommonPrefixTree)> {
        let mut open = vec![(prefix.to_string(), self)];
        for (c, child) in self.children.iter() {
            let mut substr = String::from(prefix);
            substr.push(*c);
            open.append(&mut child.dfs_collect(&substr));
        }
        open
    }
}
54 |
#[cfg(test)]
mod tests {
    use super::CommonPrefixTree;

    #[test]
    fn builds_a_word_that_has_1_char() {
        // Two overlapping two-char words yield every prefix plus the
        // NUL-terminated full words, in sorted depth-first order.
        let mut trie = CommonPrefixTree::default();
        trie.append(1, "あい");
        trie.append(2, "いう");
        let prefixes: Vec<_> = trie
            .entires_dfs()
            .iter()
            .map(|(prefix, _)| prefix)
            .collect();
        assert_eq!(
            prefixes,
            vec!["", "あ", "あい", "あい\0", "い", "いう", "いう\0"]
        );
    }
}
73 |
--------------------------------------------------------------------------------
/playground/src/Table.tsx:
--------------------------------------------------------------------------------
1 | import { DataGrid } from "@mui/x-data-grid";
2 | import React, { useEffect, useState } from "react";
3 | import { wrap, transfer } from "comlink";
4 |
// NOTE(review): generic type arguments and JSX markup in this component appear
// to have been stripped by text extraction (e.g. `Promise;`, `Record[]`,
// `useState([])`, and the missing element inside `return (…)` — presumably a
// DataGrid). Restore the full source from version control.

// Worker-side API (comlink-wrapped): takes an encoded payload of word ids and
// resolves a buffer of their feature arrays.
interface GoyaFeaturesAPI {
  getFeatures: (input: ArrayBufferLike) => Promise;
}
type Props = {
  rows: Record[];
};

const encoder = new TextEncoder();
const decoder = new TextDecoder();
// One shared worker per module; survives component re-renders.
const worker = wrap(
  new Worker(new URL("./goya.worker.ts", import.meta.url))
);
// Options shared by every column below.
const base = { flex: 1, sortable: false };

export default function Table(props: Props) {
  // features[i] = feature strings for props.rows[i]; filled in asynchronously
  // by the worker, so cells may briefly render undefined.
  const [features, setFeatures] = useState([]);

  const columns = [
    { field: "surface_form", headerName: "表層形", ...base },
    { field: "is_known", headerName: "既知語", ...base },
    { field: "feature_0", headerName: "品詞", ...base },
    { field: "feature_1", headerName: "品詞細分類1", ...base },
    { field: "feature_2", headerName: "品詞細分類2", ...base },
    { field: "feature_3", headerName: "品詞細分類3", ...base },
    { field: "feature_4", headerName: "活用型", ...base },
    { field: "feature_5", headerName: "活用形", ...base },
    { field: "feature_6", headerName: "原形", ...base },
    { field: "feature_7", headerName: "読み", ...base },
    { field: "feature_8", headerName: "発音", ...base },
  ];
  // Merge each incoming row with whatever features have arrived so far.
  const rows = props.rows.map((row, i) => ({
    id: i,
    ...row,
    feature_0: features[i]?.[0],
    feature_1: features[i]?.[1],
    feature_2: features[i]?.[2],
    feature_3: features[i]?.[3],
    feature_4: features[i]?.[4],
    feature_5: features[i]?.[5],
    feature_6: features[i]?.[6],
    feature_7: features[i]?.[7],
    feature_8: features[i]?.[8],
  }));

  useEffect(() => {
    // Clear stale features, then ask the worker for the new rows' features.
    setFeatures([]);
    if (!props.rows) {
      return;
    }
    const wids = props.rows.map((m) => m.wid);
    // Transfer ownership of the encoded buffer to avoid a structured-clone copy.
    const payload = encoder.encode(JSON.stringify(wids));
    worker
      .getFeatures(transfer(payload, [payload.buffer]))
      .then((res) => JSON.parse(decoder.decode(res)))
      .then(setFeatures);
  }, [props.rows]);

  return (
    // NOTE(review): JSX element lost in extraction — see note at top of file.
  );
}
72 |
--------------------------------------------------------------------------------
/playground/src/Result.tsx:
--------------------------------------------------------------------------------
1 | import React, { Suspense, lazy, useState } from "react";
2 | import Box from "@mui/material/Box";
3 | import Stack from "@mui/material/Stack";
4 | import Chip from "@mui/material/Chip";
5 | import Tab from "@mui/material/Tab";
6 | import TabContext from "@mui/lab/TabContext";
7 | import TabList from "@mui/lab/TabList";
8 | import TabPanel from "@mui/lab/TabPanel";
9 | import type { Stats } from "./goya.worker";
10 | import { Typography } from "@mui/material";
11 |
// Which result view is currently shown.
enum ResultTab {
  Wakachi = "Wakachi",
  Table = "Table",
  Dot = "Dot",
}

type Props = {
  dot?: string; // Graphviz source of the lattice
  wakachi?: string[]; // surface forms of the best path
  best?: unknown[] | null; // full morpheme records for the table view
  stats?: Stats;
};

// Code-split the heavy views; each loads only when its tab is first shown.
const Table = lazy(() => import(/* webpackChunkName: "table" */ "./Table"));
const Dot = lazy(() => import(/* webpackChunkName: "dot" */ "./Dot"));

export function Result(props: Props) {
  const { dot, wakachi, best, stats } = props;
  const [tab, setTab] = useState(ResultTab.Wakachi);

  // Tab switch handler; the first argument (the synthetic event) is unused.
  const handleChangeTab = (_: unknown, newValue: ResultTab) => {
    setTab(newValue);
  };

  // NOTE(review): nearly all JSX inside this return (TabContext/TabList/Tab/
  // TabPanel markup, Suspense wrappers, the Table and Dot elements) was lost
  // during text extraction — only a few expression fragments remain below.
  // Restore the full markup from version control.
  return (
    <>

      {wakachi?.join("/")}

      {dot ? : null}

    >
  );
}
75 |
--------------------------------------------------------------------------------
/wasm-core/.gitignore:
--------------------------------------------------------------------------------
1 | pkg
2 |
3 | # Created by https://www.toptal.com/developers/gitignore/api/node
4 | # Edit at https://www.toptal.com/developers/gitignore?templates=node
5 |
6 | ### Node ###
7 | # Logs
8 | logs
9 | *.log
10 | npm-debug.log*
11 | yarn-debug.log*
12 | yarn-error.log*
13 | lerna-debug.log*
14 | .pnpm-debug.log*
15 |
16 | # Diagnostic reports (https://nodejs.org/api/report.html)
17 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
18 |
19 | # Runtime data
20 | pids
21 | *.pid
22 | *.seed
23 | *.pid.lock
24 |
25 | # Directory for instrumented libs generated by jscoverage/JSCover
26 | lib-cov
27 |
28 | # Coverage directory used by tools like istanbul
29 | coverage
30 | *.lcov
31 |
32 | # nyc test coverage
33 | .nyc_output
34 |
35 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
36 | .grunt
37 |
38 | # Bower dependency directory (https://bower.io/)
39 | bower_components
40 |
41 | # node-waf configuration
42 | .lock-wscript
43 |
44 | # Compiled binary addons (https://nodejs.org/api/addons.html)
45 | build/Release
46 |
47 | # Dependency directories
48 | node_modules/
49 | jspm_packages/
50 |
51 | # Snowpack dependency directory (https://snowpack.dev/)
52 | web_modules/
53 |
54 | # TypeScript cache
55 | *.tsbuildinfo
56 |
57 | # Optional npm cache directory
58 | .npm
59 |
60 | # Optional eslint cache
61 | .eslintcache
62 |
63 | # Microbundle cache
64 | .rpt2_cache/
65 | .rts2_cache_cjs/
66 | .rts2_cache_es/
67 | .rts2_cache_umd/
68 |
69 | # Optional REPL history
70 | .node_repl_history
71 |
72 | # Output of 'npm pack'
73 | *.tgz
74 |
75 | # Yarn Integrity file
76 | .yarn-integrity
77 |
78 | # dotenv environment variables file
79 | .env
80 | .env.test
81 | .env.production
82 |
83 | # parcel-bundler cache (https://parceljs.org/)
84 | .cache
85 | .parcel-cache
86 |
87 | # Next.js build output
88 | .next
89 | out
90 |
91 | # Nuxt.js build / generate output
92 | .nuxt
93 | dist
94 |
95 | # Gatsby files
96 | .cache/
97 | # Comment in the public line in if your project uses Gatsby and not Next.js
98 | # https://nextjs.org/blog/next-9-1#public-directory-support
99 | # public
100 |
101 | # vuepress build output
102 | .vuepress/dist
103 |
104 | # Serverless directories
105 | .serverless/
106 |
107 | # FuseBox cache
108 | .fusebox/
109 |
110 | # DynamoDB Local files
111 | .dynamodb/
112 |
113 | # TernJS port file
114 | .tern-port
115 |
116 | # Stores VSCode versions used for testing VSCode extensions
117 | .vscode-test
118 |
119 | # yarn v2
120 | .yarn/cache
121 | .yarn/unplugged
122 | .yarn/build-state.yml
123 | .yarn/install-state.gz
124 | .pnp.*
125 |
126 | ### Node Patch ###
127 | # Serverless Webpack directories
128 | .webpack/
129 |
130 | # End of https://www.toptal.com/developers/gitignore/api/node
131 | mecab-ipadic-2.7.0-20070801
132 |
--------------------------------------------------------------------------------
/playground/.gitignore:
--------------------------------------------------------------------------------
1 | __generated__
2 | pkg
3 |
4 | # Created by https://www.toptal.com/developers/gitignore/api/node
5 | # Edit at https://www.toptal.com/developers/gitignore?templates=node
6 |
7 | ### Node ###
8 | # Logs
9 | logs
10 | *.log
11 | npm-debug.log*
12 | yarn-debug.log*
13 | yarn-error.log*
14 | lerna-debug.log*
15 | .pnpm-debug.log*
16 |
17 | # Diagnostic reports (https://nodejs.org/api/report.html)
18 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
19 |
20 | # Runtime data
21 | pids
22 | *.pid
23 | *.seed
24 | *.pid.lock
25 |
26 | # Directory for instrumented libs generated by jscoverage/JSCover
27 | lib-cov
28 |
29 | # Coverage directory used by tools like istanbul
30 | coverage
31 | *.lcov
32 |
33 | # nyc test coverage
34 | .nyc_output
35 |
36 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
37 | .grunt
38 |
39 | # Bower dependency directory (https://bower.io/)
40 | bower_components
41 |
42 | # node-waf configuration
43 | .lock-wscript
44 |
45 | # Compiled binary addons (https://nodejs.org/api/addons.html)
46 | build/Release
47 |
48 | # Dependency directories
49 | node_modules/
50 | jspm_packages/
51 |
52 | # Snowpack dependency directory (https://snowpack.dev/)
53 | web_modules/
54 |
55 | # TypeScript cache
56 | *.tsbuildinfo
57 |
58 | # Optional npm cache directory
59 | .npm
60 |
61 | # Optional eslint cache
62 | .eslintcache
63 |
64 | # Microbundle cache
65 | .rpt2_cache/
66 | .rts2_cache_cjs/
67 | .rts2_cache_es/
68 | .rts2_cache_umd/
69 |
70 | # Optional REPL history
71 | .node_repl_history
72 |
73 | # Output of 'npm pack'
74 | *.tgz
75 |
76 | # Yarn Integrity file
77 | .yarn-integrity
78 |
79 | # dotenv environment variables file
80 | .env
81 | .env.test
82 | .env.production
83 |
84 | # parcel-bundler cache (https://parceljs.org/)
85 | .cache
86 | .parcel-cache
87 |
88 | # Next.js build output
89 | .next
90 | out
91 |
92 | # Nuxt.js build / generate output
93 | .nuxt
94 | dist
95 |
96 | # Gatsby files
97 | .cache/
98 | # Comment in the public line in if your project uses Gatsby and not Next.js
99 | # https://nextjs.org/blog/next-9-1#public-directory-support
100 | # public
101 |
102 | # vuepress build output
103 | .vuepress/dist
104 |
105 | # Serverless directories
106 | .serverless/
107 |
108 | # FuseBox cache
109 | .fusebox/
110 |
111 | # DynamoDB Local files
112 | .dynamodb/
113 |
114 | # TernJS port file
115 | .tern-port
116 |
117 | # Stores VSCode versions used for testing VSCode extensions
118 | .vscode-test
119 |
120 | # yarn v2
121 | .yarn/cache
122 | .yarn/unplugged
123 | .yarn/build-state.yml
124 | .yarn/install-state.gz
125 | .pnp.*
126 |
127 | ### Node Patch ###
128 | # Serverless Webpack directories
129 | .webpack/
130 |
131 | # End of https://www.toptal.com/developers/gitignore/api/node
132 | mecab-ipadic-2.7.0-20070801
133 |
--------------------------------------------------------------------------------
/goya-cli/src/repl.rs:
--------------------------------------------------------------------------------
1 | use goya::dot;
2 | use goya::double_array::DoubleArray;
3 | use goya::id::WordIdentifier;
4 | use goya::lattice::Lattice;
5 | use goya::word_features::WordFeaturesMap;
6 | use goya_ipadic::ipadic::IPADic;
7 | use std::io::{stdin, stdout, BufRead, BufWriter, Write};
8 | use std::str::FromStr;
9 |
/// Output format selected by the `--format` CLI flag.
pub enum Format {
    /// Graphviz dot rendering of the whole lattice.
    Dot,
    /// One morpheme per line: surface form, a tab, then comma-joined features.
    Plain,
}
impl FromStr for Format {
    type Err = &'static str;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        Ok(match s {
            "dot" => Self::Dot,
            "plain" => Self::Plain,
            _ => return Err("no match"),
        })
    }
}
25 |
/// Everything the REPL loop needs to tokenize and print input lines.
pub struct ReplContext<'a> {
    /// Double-array trie used for surface-form lookup.
    pub da: &'a DoubleArray,
    /// Compiled IPAdic dictionary (morphemes, costs, context ids).
    pub dict: &'a IPADic,
    /// Feature strings for known/unknown words, queried per morpheme id.
    pub word_set: &'a WordFeaturesMap,
    /// Output format selected on the CLI.
    pub format: Format,
}
32 |
33 | pub fn start(opt: ReplContext) -> Result<(), std::io::Error> {
34 | let out = stdout();
35 | let mut out = BufWriter::new(out.lock());
36 |
37 | for line in stdin().lock().lines() {
38 | match line {
39 | Ok(line) if line.is_empty() => continue,
40 | Ok(line) => {
41 | let lattice = Lattice::parse(&line, opt.da, opt.dict);
42 | match opt.format {
43 | Format::Dot => {
44 | writeln!(out, "{}", dot::render(&lattice, opt.dict).unwrap())?;
45 | }
46 | Format::Plain => {
47 | if let Some(path) = lattice.find_best() {
48 | for wid in path.into_iter() {
49 | let (surface_form, features) = match wid {
50 | WordIdentifier::Unknown(id, surface_form) => {
51 | (surface_form, opt.word_set.get_unknown(&id).unwrap())
52 | }
53 | WordIdentifier::Known(id, surface_form) => {
54 | (surface_form, opt.word_set.get_known(&id).unwrap())
55 | }
56 | };
57 | writeln!(
58 | out,
59 | "{}\t{}",
60 | surface_form,
61 | features
62 | .into_iter()
63 | .map(|f| f.to_string())
64 | .collect::>()
65 | .join(",")
66 | )?;
67 | }
68 | writeln!(out, "EOS")?;
69 | out.flush()?;
70 | }
71 | }
72 | }
73 | }
74 | Err(err) => return Err(err),
75 | }
76 | }
77 | Ok(())
78 | }
79 |
--------------------------------------------------------------------------------
/goya-cli/src/main.rs:
--------------------------------------------------------------------------------
1 | mod build;
2 | mod path_util;
3 | mod repl;
4 |
5 | use clap::Parser;
6 | use futures::executor::block_on;
7 | use futures::future;
8 | use goya::double_array::DoubleArray;
9 | use goya_ipadic::ipadic::IPADic;
10 | use path_util::PathUtil;
11 | use repl::Format;
12 | use rkyv::{archived_root, Deserialize, Infallible};
13 | use std::fs;
14 |
// CLI surface of goya-cli. Only `//` comments are added here: on clap-derived
// types a `///` doc comment becomes user-visible help text, which would change
// program output.
// NOTE(review): the `Option` fields below lost their type parameters
// (presumably `Option<String>` and `Option<SubCommand>`) during text
// extraction; restore from version control.
#[derive(Parser)]
struct Opts {
    /// `~/.goya/dict` by default
    #[clap(short, long)]
    dicdir: Option,
    // Parsed via `Format::from_str`: "plain" or "dot".
    #[clap(short, long, default_value = "plain")]
    format: Format,
    #[clap(subcommand)]
    subcmd: Option,
}

// No subcommand means "start the REPL" (see `main`).
#[derive(Parser)]
enum SubCommand {
    Compile(Compile),
    Clean,
}

/// A subcommand for controlling testing
#[derive(Parser)]
struct Compile {
    /// Path to the IPAdic directory
    dicpath: String,
}
38 |
fn main() {
    let opts: Opts = Opts::parse();
    // Default dictionary location: ~/.goya/dict (overridable with --dicdir).
    let base_dir = dirs::home_dir().unwrap().join(".goya");
    let dicdir = opts
        .dicdir
        .unwrap_or_else(|| base_dir.join("dict").to_str().unwrap().to_string());
    match opts.subcmd {
        // `compile <dicpath>`: build the binary dictionary artifacts.
        Some(SubCommand::Compile(c)) => match build::build(&c.dicpath, &dicdir) {
            Ok(_) => {}
            Err(err) => {
                println!("{:?}", err);
            }
        },
        // `clean`: delete the compiled artifacts.
        // NOTE(review): features.bin is not removed here — confirm whether
        // that is intentional.
        Some(SubCommand::Clean) => {
            let util = PathUtil::from(dicdir);
            fs::remove_file(util.da_path()).expect("Failed to delete file");
            fs::remove_file(util.dict_path()).expect("Failed to delete file");
        }
        // No subcommand: load the compiled dictionary and start the REPL.
        _ => {
            let util = PathUtil::from(dicdir);

            // NOTE(review): the turbofish type arguments of
            // `archived_root::(…)` below appear to have been stripped by text
            // extraction; restore the concrete types from version control.
            let da_fut = async {
                let encoded = fs::read(util.da_path()).expect("Failed to load dictionary");
                let archived = unsafe { archived_root::(&encoded[..]) };
                archived.deserialize(&mut Infallible).unwrap()
            };
            let ipadic_fut = async {
                let encoded = fs::read(util.dict_path()).expect("Failed to load vocabulary");
                let archived = unsafe { archived_root::(&encoded[..]) };
                archived.deserialize(&mut Infallible).unwrap()
            };
            let features_fut = async {
                let encoded = fs::read(util.features_path()).expect("Failed to load surfaces");
                rmp_serde::from_slice(&encoded[..]).unwrap()
            };

            let (ipadic, word_set) = block_on(future::join(ipadic_fut, features_fut));
            let da = block_on(da_fut);
            repl::start(repl::ReplContext {
                da: &da,
                dict: &ipadic,
                word_set: &word_set,
                format: opts.format,
            })
            .unwrap();
            // Drop the large structures on background threads, presumably so
            // process exit is not delayed by deallocation on the main thread.
            // NOTE(review): intent inferred from the pattern — confirm.
            std::thread::spawn(move || drop(ipadic));
            std::thread::spawn(move || drop(da));
            std::thread::spawn(move || drop(word_set));
        }
    }
}
90 |
--------------------------------------------------------------------------------
/wasm-core/src/lib.rs:
--------------------------------------------------------------------------------
1 | use goya::dictionary::Dictionary;
2 | use goya::dot;
3 | use goya::double_array::DoubleArray;
4 | use goya::id::WordIdentifier;
5 | use goya::lattice::Lattice;
6 | use goya_ipadic::ipadic::IPADic;
7 | use rkyv::{archived_root, Deserialize, Infallible};
8 | use serde::Serialize;
9 | use wasm_bindgen::prelude::*;
10 |
11 | #[macro_use]
12 | extern crate lazy_static;
13 |
lazy_static! {
    // Deserialized lazily (once) from dictionary binaries embedded at compile
    // time via include_bytes!.
    // NOTE(review): the `archived_root::(…)` turbofish type arguments appear
    // to have been stripped by text extraction; restore from version control.
    static ref DOUBLE_ARRAY: DoubleArray = {
        let archived =
            unsafe { archived_root::(include_bytes!("../__generated__/da.bin")) };
        archived.deserialize(&mut Infallible).unwrap()
    };
    static ref IPADIC: IPADic = {
        let archived =
            unsafe { archived_root::(include_bytes!("../__generated__/dict.bin")) };
        archived.deserialize(&mut Infallible).unwrap()
    };
}
26 |
27 | #[derive(Serialize)]
28 | pub struct WasmMorpheme {
29 | wid: WordIdentifier,
30 | is_known: bool,
31 | surface_form: String,
32 | left_context_id: usize,
33 | right_context_id: usize,
34 | cost: i16,
35 | }
36 | impl WasmMorpheme {}
37 |
/// JS-facing wrapper around a parsed morpheme lattice.
#[wasm_bindgen]
pub struct WasmLattice {
    lattice: Lattice,
}
// NOTE(review): several return-type parameters below (`Vec`, the
// `impl Iterator- + '_` item type) appear to have been stripped by text
// extraction; restore from version control.
#[wasm_bindgen]
impl WasmLattice {
    /// Renders the whole lattice as a Graphviz dot string.
    pub fn as_dot(&self) -> String {
        dot::render(&self.lattice, &*IPADIC).unwrap()
    }

    /// Surface forms along the best path, one JS value per morpheme.
    pub fn wakachi(&self) -> Vec {
        self.best_morphemes()
            .map(|morpheme| serde_wasm_bindgen::to_value(&morpheme.surface_form).unwrap())
            .collect()
    }

    /// Full morpheme records along the best path, serialized for JS.
    pub fn find_best(&self) -> Vec {
        self.best_morphemes()
            .map(|morpheme| serde_wasm_bindgen::to_value(&morpheme).unwrap())
            .collect()
    }

    // Iterates morphemes on the lowest-cost path. Panics (unwrap) if the
    // lattice has no best path at all.
    fn best_morphemes(&self) -> impl Iterator- + '_ {
        self.lattice
            .find_best()
            .map(|path| {
                path.into_iter().map(|wid| {
                    let morpheme = IPADIC.get(&wid).unwrap();
                    let (surface_form, is_known) = match &wid {
                        WordIdentifier::Known(_, s) => (s.to_string(), true),
                        WordIdentifier::Unknown(_, s) => (s.to_string(), false),
                    };
                    WasmMorpheme {
                        wid,
                        is_known,
                        surface_form,
                        left_context_id: morpheme.left_context_id,
                        right_context_id: morpheme.right_context_id,
                        cost: morpheme.cost,
                    }
                })
            })
            .unwrap()
    }
}
83 |
/// Eagerly initializes both lazily-deserialized dictionaries so a later call
/// to `parse` doesn't pay the one-time cost. Each `async` block is itself
/// synchronous; the main purpose is exposing an awaitable init hook to JS.
#[wasm_bindgen]
pub async fn ready() {
    futures::join!(async { lazy_static::initialize(&IPADIC) }, async {
        lazy_static::initialize(&DOUBLE_ARRAY)
    });
}
90 |
/// Tokenizes `text` into a morpheme lattice using the embedded dictionaries.
#[wasm_bindgen]
pub fn parse(text: &str) -> WasmLattice {
    WasmLattice {
        lattice: Lattice::parse(text, &DOUBLE_ARRAY, &*IPADIC),
    }
}
97 |
--------------------------------------------------------------------------------
/goya-cli/src/build.rs:
--------------------------------------------------------------------------------
1 | use super::path_util::PathUtil;
2 | use bytesize::ByteSize;
3 | use console::{style, Emoji};
4 | use goya::common_prefix_tree::CommonPrefixTree;
5 | use goya::double_array::DoubleArray;
6 | use goya_ipadic::ipadic::IPADic;
7 | use goya_ipadic::ipadic_loader::IPADicLoader;
8 | use rkyv::ser::{serializers::AllocSerializer, Serializer};
9 | use std::error::Error;
10 | use std::fs;
11 | use std::time::Instant;
12 |
13 | const LOOKING_GLASS: Emoji = Emoji("🔍", "");
14 | const PAPER: Emoji = Emoji("📃", "");
15 | const CLIP: Emoji = Emoji("🔗", "");
16 | const SPARKLE: Emoji = Emoji("✨", "");
17 | const TRUCK: Emoji = Emoji("🚚", "");
18 |
19 | pub fn build(src_dir: &str, dist_dir: &str) -> Result<(), Box
> {
20 | PathUtil::from(dist_dir.to_string());
21 | let timer = Instant::now();
22 | eprintln!(
23 | "{} {} Loading dictionary...",
24 | style("[1/4]").bold().dim(),
25 | LOOKING_GLASS
26 | );
27 | let loader = IPADicLoader {};
28 | let mut loaded = loader.load(src_dir)?;
29 |
30 | eprintln!(
31 | "{} {} Analyzing vocabulary...",
32 | style("[2/4]").bold().dim(),
33 | PAPER
34 | );
35 | let mut cpt = CommonPrefixTree::default();
36 | for (id, surface) in loaded.surfaces.iter() {
37 | cpt.append(*id, surface);
38 | }
39 |
40 | eprintln!(
41 | "{} {} Recompiling dictionary...",
42 | style("[3/4]").bold().dim(),
43 | CLIP
44 | );
45 | let da = DoubleArray::from_cpt(&cpt);
46 |
47 | // DoubleArray only has one ID per surface form.
48 | let used_wids = da.wids().collect();
49 | loaded.ipadic.shrink_to_wids(&used_wids);
50 |
51 | eprintln!(
52 | "{} {} Exporting dictionary...",
53 | style("[4/4]").bold().dim(),
54 | TRUCK
55 | );
56 | let util = PathUtil::from(dist_dir.to_string());
57 | util.mkdirp().expect("Failed to create directory");
58 |
59 | let mut serializer = AllocSerializer::<256>::default();
60 | serializer.serialize_value(&da).unwrap();
61 | let bytes = serializer.into_serializer().into_inner();
62 | fs::write(util.da_path(), &bytes).expect("Failed to write dictionary");
63 | eprintln!("DoubleArray stats:");
64 | eprintln!(" elements: {}", da.base.len());
65 | eprintln!(" bytes: {}", ByteSize(bytes.len() as u64));
66 |
67 | let mut serializer = AllocSerializer::<256>::default();
68 | serializer
69 | .serialize_value::(&loaded.ipadic)
70 | .unwrap();
71 | let bytes = serializer.into_serializer().into_inner();
72 | fs::write(util.dict_path(), &bytes).expect("Failed to write dictionary");
73 | eprintln!("Dictionary stats:");
74 | eprintln!(" bytes: {}", ByteSize(bytes.len() as u64));
75 |
76 | let bytes = rmp_serde::to_vec(&loaded.word_set).unwrap();
77 | fs::write(util.features_path(), &bytes).expect("Failed to write word features");
78 | eprintln!("Word features stats:");
79 | eprintln!(" bytes: {}", ByteSize(bytes.len() as u64));
80 |
81 | let end = timer.elapsed();
82 | eprintln!(
83 | "{} Done in {}.{:03}s",
84 | SPARKLE,
85 | end.as_secs(),
86 | end.subsec_millis()
87 | );
88 | Ok(())
89 | }
90 |
--------------------------------------------------------------------------------
/goya/src/word_features.rs:
--------------------------------------------------------------------------------
1 | use super::id::WordIdentifier;
2 | use indexmap::IndexSet;
3 | use serde::{Deserialize, Serialize};
4 | use std::str::from_utf8_unchecked;
5 |
/// Compact store of word feature strings: every distinct string is interned
/// once into `index`, and each word keeps only indices into it.
// NOTE(review): the field type parameters (`Vec,`) appear to have been
// stripped by text extraction (likely Vec<u8>, Vec<usize>, Vec<WordFeatures>);
// restore from version control.
#[derive(Debug, Serialize, Deserialize, rkyv::Archive, rkyv::Serialize, rkyv::Deserialize)]
pub struct WordFeaturesMap {
    // Concatenated bytes of all interned feature strings.
    #[serde(with = "serde_bytes")]
    index: Vec,
    // Exclusive byte end of each interned string within `index`.
    offsets: Vec,
    known: Vec,   // index = morpheme ID
    unknown: Vec, // index = morpheme ID
}
14 | impl WordFeaturesMap {
15 | pub fn new(known: Vec>, unknown: Vec>) -> WordFeaturesMap {
16 | let mut tmp_index: IndexSet = IndexSet::new();
17 | for features in known.iter().chain(unknown.iter()) {
18 | for f in features.iter() {
19 | tmp_index.insert(f.to_string());
20 | }
21 | }
22 | let mut index = vec![];
23 | let mut offsets: Vec = vec![0; tmp_index.len()];
24 | offsets[0] = tmp_index.get_index(0).unwrap().as_bytes().len();
25 | for (idx, str) in tmp_index.iter().enumerate() {
26 | index.append(&mut str.to_string().into_bytes());
27 | if idx > 0 {
28 | offsets[idx] = offsets[idx - 1] + str.as_bytes().len();
29 | }
30 | }
31 |
32 | WordFeaturesMap {
33 | known: known
34 | .into_iter()
35 | .map(|f| {
36 | WordFeatures::new(f.iter().map(|s| tmp_index.get_full(s).unwrap().0).collect())
37 | })
38 | .collect(),
39 | unknown: unknown
40 | .into_iter()
41 | .map(|f| {
42 | WordFeatures::new(f.iter().map(|s| tmp_index.get_full(s).unwrap().0).collect())
43 | })
44 | .collect(),
45 | index,
46 | offsets,
47 | }
48 | }
49 |
50 | pub fn get(&self, wid: &WordIdentifier) -> Option> {
51 | match wid {
52 | WordIdentifier::Known(wid, _) => self.get_known(wid),
53 | WordIdentifier::Unknown(wid, _) => self.get_unknown(wid),
54 | }
55 | }
56 |
57 | pub fn get_known(&self, wid: &usize) -> Option> {
58 | self.known.get(*wid).map(|f| self.get_string(f))
59 | }
60 |
61 | pub fn get_unknown(&self, wid: &usize) -> Option> {
62 | self.unknown.get(*wid).map(|f| self.get_string(f))
63 | }
64 |
65 | fn get_string(&self, f: &WordFeatures) -> Vec<&str> {
66 | f.0.iter()
67 | .map(|idx| {
68 | let idx = *idx;
69 | let end = self.offsets[idx];
70 | if idx == 0 {
71 | unsafe { from_utf8_unchecked(&self.index[0..end]) }
72 | } else {
73 | unsafe { from_utf8_unchecked(&self.index[(self.offsets[idx - 1])..end]) }
74 | }
75 | })
76 | .collect()
77 | }
78 | }
79 |
80 | /// > 5カラム目以降は, ユーザ定義の CSV フィールドです. 基本的に どんな内容でも CSV の許す限り追加することができます.
81 | /// > https://taku910.github.io/mecab/dic-detail.html
82 | #[derive(Debug, Serialize, Deserialize, rkyv::Archive, rkyv::Serialize, rkyv::Deserialize)]
83 | pub struct WordFeatures(Vec);
84 | impl WordFeatures {
85 | pub fn new(features: Vec) -> WordFeatures {
86 | WordFeatures(features)
87 | }
88 | }
89 |
--------------------------------------------------------------------------------
/playground/src/App.tsx:
--------------------------------------------------------------------------------
1 | import React, { useCallback, useState } from "react";
2 | import AppBar from "@mui/material/AppBar";
3 | import Toolbar from "@mui/material/Toolbar";
4 | import IconButton from "@mui/material/IconButton";
5 | import Container from "@mui/material/Container";
6 | import Box from "@mui/material/Box";
7 | import Typography from "@mui/material/Typography";
8 | import TextField from "@mui/material/TextField";
9 | import GitHubIcon from "@mui/icons-material/GitHub";
10 | import { useDebounce } from "react-use";
11 | import { wrap, transfer } from "comlink";
12 | import type { Stats } from "./goya.worker";
13 | import { Result } from "./Result";
14 |
// NOTE(review): generic type arguments and most JSX markup in this component
// appear stripped by text extraction (AppBar/Toolbar/TextField/Result markup
// is missing from the return; only text nodes remain). Restore the full
// source from version control.

// Worker-side API (comlink-wrapped): parses encoded UTF-8 text and returns a
// serialized result buffer.
interface GoyaCoreAPI {
  parse: (input: ArrayBufferLike) => Promise;
}
const worker = wrap(
  new Worker(new URL("./goya.worker.ts", import.meta.url))
);
const encoder = new TextEncoder();
const decoder = new TextDecoder();
// Allow deep-linking the initial input via ?text=…
const initText = new URL(location.href).searchParams.get("text");

export function App() {
  const [text, setText] = useState(initText ?? "すもももももももものうち");
  const [result, setResult] = useState<{
    dot: string;
    wakachi: string[];
    best: unknown[];
    stats: Stats;
  } | null>(null);

  const handleChangeText = useCallback(
    (event) => {
      setText(event.target.value.trim());
    },
    [setText]
  );
  // Debounce parsing by 200 ms so the worker isn't hit on every keystroke.
  useDebounce(
    () => {
      if (text.length === 0) {
        setResult(null);
      } else {
        // Transfer ownership of the encoded buffer to avoid a copy.
        const input = encoder.encode(text);
        worker
          .parse(transfer(input, [input.buffer]))
          .then((res) => decoder.decode(res))
          .then((res) => JSON.parse(res))
          .then(setResult);
      }
    },
    200,
    [text]
  );

  // NOTE(review): surrounding JSX lost in extraction; only text content remains.
  return (
    <>

          Goya playground

        Goya: Yet another Japanese morphological analyzer for Rust and
        WebAssembly

        Goya: WebAssemblyで利用可能な日本語の形態素解析ライブラリ

    >
  );
}
105 |
--------------------------------------------------------------------------------
/ipadic/src/ipadic.rs:
--------------------------------------------------------------------------------
1 | use goya::char_class::CharClassifier;
2 | use goya::char_class::CharDefinition;
3 | use goya::dictionary::Dictionary;
4 | use goya::morpheme::Morpheme;
5 | use indexmap::IndexSet;
6 | use serde::{Deserialize, Serialize};
7 | use std::collections::HashMap;
8 | use std::collections::HashSet;
9 | use std::iter::FromIterator;
10 | use std::vec::Vec;
11 |
// TODO: Make it newtype idiom
type MorphemeIndex = usize;

/// Compiled IPAdic dictionary: deduplicated vocabulary, connection-cost
/// matrix, and character-class tables for unknown-word handling.
// NOTE(review): the field type parameters below (`Vec,`, `HashMap>`) appear
// to have been stripped by text extraction; restore the concrete types from
// version control.
#[derive(Debug, Serialize, Deserialize, rkyv::Archive, rkyv::Serialize, rkyv::Deserialize)]
pub struct IPADic {
    vocabulary: Vec, // index = morpheme ID
    homonyms: Vec>,  // index = morpheme ID
    classes: CharClassifier,
    // matrix[left][right] = connection cost between context ids.
    matrix: Vec>,
    /// 1つのカテゴリに複数の素性を定義してもかまいません. 学習後, 適切なコスト値が 自動的に与えられます.
    /// https://taku910.github.io/mecab/learn.html#config
    unknown_classes: HashMap>,
    unknown_vocabulary: Vec, // index = morpheme ID
    // Deduplicated morphemes; `vocabulary`/`unknown_vocabulary` store indices
    // into this set.
    vocabulary_index: IndexSet,
}
// Read-side `Dictionary` implementation backed by the compiled tables.
// NOTE(review): some generic parameters below (`Option<&Vec>`,
// `collect::>()`, `Option`) appear to have been stripped by text extraction;
// restore from version control.
impl Dictionary for IPADic {
    // Known morphemes are stored as indices into the deduplicated
    // `vocabulary_index` set.
    fn get_known_morpheme(&self, wid: &usize) -> Option<&Morpheme> {
        self.vocabulary
            .get(*wid)
            .map(|idx| self.vocabulary_index.get_index(*idx).unwrap())
    }

    fn get_unknown_morpheme(&self, wid: &usize) -> Option<&Morpheme> {
        self.unknown_vocabulary
            .get(*wid)
            .map(|idx| self.vocabulary_index.get_index(*idx).unwrap())
    }

    // Presumably the ids of all entries sharing `wid`'s surface form —
    // confirm against the Dictionary trait docs.
    fn resolve_homonyms(&self, wid: &usize) -> Option<&Vec> {
        self.homonyms.get(*wid)
    }

    fn take_unknown_chars_seq(&self, def: &CharDefinition, text: &str, start: &usize) -> String {
        self.classes.take_unknown_chars(def, text, start)
    }

    fn classify_char(&self, c: &char) -> &CharDefinition {
        self.classes.classify(c)
    }

    // Panics (unwrap) when `class` is not a key of `unknown_classes`; callers
    // are expected to pass names produced by `classify_char`.
    fn get_unknown_morphemes_by_class(&self, class: &str) -> Vec<(usize, &Morpheme)> {
        self.unknown_classes
            .get(class)
            .unwrap()
            .iter()
            .map(|wid| (*wid, self.unknown_vocabulary.get(*wid).unwrap()))
            .map(|(wid, idx)| (wid, self.vocabulary_index.get_index(*idx).unwrap()))
            .collect::>()
    }

    // Connection cost between context ids; None when either id falls outside
    // the matrix.
    fn transition_cost(&self, left: &usize, right: &usize) -> Option<&i16> {
        if let Some(rights) = self.matrix.get(*left) {
            if let Some(cost) = rights.get(*right) {
                return Some(cost);
            }
        }
        None
    }

    // Per-word occurrence cost, taken from the known-morpheme record.
    fn occurrence_cost(&self, wid: &usize) -> Option {
        self.get_known_morpheme(wid).map(|w| w.cost)
    }
}
impl IPADic {
    /// Assembles an `IPADic` from its pre-built parts; see the field comments
    /// on the struct for what each table holds.
    // NOTE(review): parameter type arguments appear stripped by text
    // extraction; restore from version control.
    pub fn from(
        vocabulary: Vec,
        homonyms: Vec>,
        classes: CharClassifier,
        matrix: Vec>,
        unknown_classes: HashMap>,
        unknown_vocabulary: Vec,
        vocabulary_index: IndexSet,
    ) -> IPADic {
        IPADic {
            vocabulary,
            homonyms,
            classes,
            matrix,
            unknown_classes,
            unknown_vocabulary,
            vocabulary_index,
        }
    }

    /// Empties the homonym list of every morpheme ID not contained in `wids`,
    /// shrinking the dictionary to the ids the double array actually kept.
    pub fn shrink_to_wids(&mut self, wids: &Vec) {
        let set: HashSet = HashSet::from_iter(wids.iter().cloned());
        for idx in 0..self.homonyms.len() {
            if set.contains(&idx) {
                continue;
            }
            self.homonyms[idx] = vec![];
        }
    }
}
106 |
--------------------------------------------------------------------------------
/goya/src/dot.rs:
--------------------------------------------------------------------------------
1 | use super::{
2 | dictionary::Dictionary,
3 | lattice::{Lattice, BOS_CONTEXT_ID, EOS_CONTEXT_ID},
4 | };
5 | use std::{error::Error, fmt::Write};
6 |
7 | const BOLD: &str = " penwidth=3";
8 |
// Renders `lattice` as a Graphviz digraph: one node per (position, candidate)
// morpheme plus BOS/EOS, edges labeled with transition costs, and the best
// path drawn bold (see `BOLD`).
// NOTE(review): this signature's generic parameter and return type
// (`render(… dict: &D) -> Result>`) appear to have been stripped by text
// extraction — presumably `<D: Dictionary>` and `Result<String, Box<dyn
// Error>>`; restore from version control.
pub fn render(lattice: &Lattice, dict: &D) -> Result> {
    // The last DP cell holds the accumulated cost of the best full path,
    // shown on the EOS label.
    let cursor = (lattice.dp.len() - 1, 0);
    let len = lattice.indices.len();
    let best_path = lattice.find_best_path();
    let mut dot = String::from("");
    writeln!(
        dot,
        r#"digraph lattice {{
rankdir=LR;
splines=polyline;
nodesep=.05;

BOS [label="BOS\n0 (0)" shape="doublecircle"{}];
EOS [label="EOS\n{} (0)" shape="doublecircle"{}];
"#,
        BOLD,
        lattice.dp[cursor.0].get(cursor.1).unwrap().0,
        BOLD
    )?;
    // One node per candidate morpheme at each starting position.
    for (i, index) in lattice.indices.iter().enumerate() {
        for (j, (left_wid, wlen)) in index.iter().enumerate() {
            let left = dict.get(left_wid).unwrap();
            // Bold the node if (i + 1, j) lies on the best path.
            let node_style = match &best_path {
                Some(best_path) if best_path.contains(&(i + 1, j)) => BOLD,
                _ => "",
            };
            writeln!(
                dot,
                r#" "{}_{}" [label="{}\n({}, {})"{}];"#,
                i,
                j,
                left_wid.get_surface(),
                lattice.dp[i + 1][j].0,
                left.cost,
                node_style,
            )?;
            // Morphemes starting at position 0 get an edge from BOS.
            if i == 0 {
                let right = left;
                let cost = dict
                    .transition_cost(&BOS_CONTEXT_ID, &right.right_context_id)
                    .unwrap();
                let bos_edge_style = match &best_path {
                    Some(best_path) if best_path.contains(&(i + 1, j)) => BOLD,
                    _ => "",
                };
                writeln!(
                    dot,
                    r#" BOS -> "{}_{}" [label="({})"{}];"#,
                    i, j, cost, bos_edge_style
                )?;
            }
            // Morphemes reaching the end of the input connect to EOS instead
            // of to following candidates.
            if i + wlen >= len {
                let cost = dict
                    .transition_cost(&left.left_context_id, &EOS_CONTEXT_ID)
                    .unwrap();
                let eos_edge_style = match &best_path {
                    Some(best_path) if best_path.contains(&(i + 1, j)) => BOLD,
                    _ => "",
                };
                writeln!(
                    dot,
                    r#" "{}_{}" -> EOS [label="({})"{}];"#,
                    i, j, cost, eos_edge_style
                )?;
                continue;
            }
            // Otherwise, edge to every candidate starting where this one ends;
            // an edge is bold only when both endpoints are on the best path.
            for (k, (right_wid, _)) in lattice.indices[i + wlen].iter().enumerate() {
                let right = dict.get(right_wid).unwrap();
                let cost = dict
                    .transition_cost(&left.left_context_id, &right.right_context_id)
                    .unwrap();
                let edge_style = match &best_path {
                    Some(best_path)
                        if best_path.contains(&(i + 1, j))
                            && best_path.contains(&(i + wlen + 1, k)) =>
                    {
                        BOLD
                    }
                    _ => "",
                };
                writeln!(
                    dot,
                    r#" "{}_{}" -> "{}_{}" [label="({})"{}];"#,
                    i,
                    j,
                    i + wlen,
                    k,
                    cost,
                    edge_style
                )?;
            }
        }
    }
    writeln!(dot, "}}")?;
    Ok(dot)
}
105 |
--------------------------------------------------------------------------------
/goya/src/char_class.rs:
--------------------------------------------------------------------------------
1 | use serde::{Deserialize, Serialize};
2 | use std::collections::{HashMap, HashSet};
3 |
/// Fallback class name used when a character matches no configured range.
const CLASS_DEFAULT: &str = "DEFAULT";
5 |
/// Controls when unknown-word candidates are generated for a character class.
#[derive(
    Debug, PartialEq, Eq, Serialize, Deserialize, rkyv::Archive, rkyv::Serialize, rkyv::Deserialize,
)]
pub enum InvokeTiming {
    /// Generate unknown words only when no known word starts at the position.
    Fallback,
    /// Always generate unknown words, even alongside dictionary matches.
    Always,
}
13 | #[derive(
14 | Debug, PartialEq, Eq, Serialize, Deserialize, rkyv::Archive, rkyv::Serialize, rkyv::Deserialize,
15 | )]
16 | pub struct CharDefinition {
17 | pub class: String,
18 | pub timing: InvokeTiming,
19 | pub group_by_same_kind: bool,
20 | pub len: usize,
21 | pub compatibilities: HashSet, // elements = class name
22 | }
23 | impl CharDefinition {
24 | pub fn compatible_with(&self, class_name: &str) -> bool {
25 | self.class.eq(class_name) || self.compatibilities.contains(class_name)
26 | }
27 | }
28 |
29 | #[derive(Debug, Serialize, Deserialize, rkyv::Archive, rkyv::Serialize, rkyv::Deserialize)]
30 | pub struct CharClass {
31 | range: (u32, u32),
32 | class: String,
33 | }
34 | impl CharClass {
35 | pub fn from(range: (u32, u32), class: String) -> CharClass {
36 | CharClass { range, class }
37 | }
38 |
39 | pub fn in_range(&self, c: &char) -> bool {
40 | let code = *c as u32;
41 | self.range.0 <= code && code <= self.range.1
42 | }
43 | }
44 |
45 | #[derive(Debug, Serialize, Deserialize, rkyv::Archive, rkyv::Serialize, rkyv::Deserialize)]
46 | pub struct CharClassifier {
47 | chars: HashMap,
48 | ranges: Vec,
49 | }
50 | impl CharClassifier {
51 | pub fn from(chars: HashMap, ranges: Vec) -> CharClassifier {
52 | CharClassifier { chars, ranges }
53 | }
54 |
55 | pub fn classify(&self, c: &char) -> &CharDefinition {
56 | let class = self.get_class_name(c);
57 | self.chars.get(class).unwrap()
58 | }
59 |
60 | pub fn take_unknown_chars(&self, def: &CharDefinition, text: &str, start: &usize) -> String {
61 | if !def.group_by_same_kind {
62 | return text.chars().skip(*start).take(def.len).collect();
63 | }
64 |
65 | let mut len = 0;
66 | text.chars()
67 | .enumerate()
68 | .skip(*start)
69 | .take_while(|(_, c)| {
70 | if def.len != 0 && len >= def.len || !def.compatible_with(self.get_class_name(c)) {
71 | return false;
72 | }
73 | len += 1;
74 | true
75 | })
76 | .map(|(_, c)| c)
77 | .collect()
78 | }
79 |
80 | fn get_class_name(&self, c: &char) -> &str {
81 | self.ranges
82 | .iter()
83 | .find(|class| class.in_range(c))
84 | .map(|class| class.class.as_str())
85 | .unwrap_or_else(|| CLASS_DEFAULT)
86 | }
87 | }
88 |
#[cfg(test)]
mod tests {
    use super::*;

    // Without extra compatibilities a definition only matches its own class.
    #[test]
    fn compatible_with_without_compatibilities() {
        let def_a = CharDefinition {
            class: String::from("A"),
            timing: InvokeTiming::Always,
            group_by_same_kind: false,
            len: 2,
            compatibilities: HashSet::new(),
        };
        assert_eq!(def_a.compatible_with("A"), true);
        assert_eq!(def_a.compatible_with("B"), false);
    }

    // Registered compatibilities extend the match beyond the class itself.
    #[test]
    fn compatible_with_with_compatibilities() {
        let mut compatibilities = HashSet::new();
        compatibilities.insert(String::from("B"));
        let def_a = CharDefinition {
            class: String::from("A"),
            timing: InvokeTiming::Always,
            group_by_same_kind: false,
            len: 2,
            compatibilities,
        };
        assert_eq!(def_a.compatible_with("A"), true);
        assert_eq!(def_a.compatible_with("B"), true);
        assert_eq!(def_a.compatible_with("C"), false);
    }

    // Range bounds are inclusive on both ends.
    #[test]
    fn in_range() {
        let class = CharClass::from((1, 2), String::new());
        assert_eq!(class.in_range(&(0 as char)), false);
        assert_eq!(class.in_range(&(1 as char)), true);
        assert_eq!(class.in_range(&(2 as char)), true);
        assert_eq!(class.in_range(&(3 as char)), false);
    }
}
131 |
--------------------------------------------------------------------------------
/benchmarks/package-lock.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "benchmarks",
3 | "version": "0.0.0",
4 | "lockfileVersion": 2,
5 | "requires": true,
6 | "packages": {
7 | "": {
8 | "version": "0.0.0",
9 | "hasInstallScript": true,
10 | "license": "ISC",
11 | "dependencies": {
12 | "kuromoji": "^0.1.2",
13 | "wasm-core": "../wasm-core/pkg",
14 | "wasm-features": "../wasm-features/pkg"
15 | },
16 | "devDependencies": {
17 | "benchmark": "^2.1.4"
18 | }
19 | },
20 | "../wasm-core/pkg": {
21 | "name": "goya-core",
22 | "version": "0.1.1",
23 | "license": "Apache-2.0 OR MIT"
24 | },
25 | "../wasm-features/pkg": {
26 | "name": "goya-features",
27 | "version": "0.1.1",
28 | "license": "Apache-2.0 OR MIT"
29 | },
30 | "node_modules/async": {
31 | "version": "2.6.3",
32 | "resolved": "https://registry.npmjs.org/async/-/async-2.6.3.tgz",
33 | "integrity": "sha512-zflvls11DCy+dQWzTW2dzuilv8Z5X/pjfmZOWba6TNIVDm+2UDaJmXSOXlasHKfNBs8oo3M0aT50fDEWfKZjXg==",
34 | "dependencies": {
35 | "lodash": "^4.17.14"
36 | }
37 | },
38 | "node_modules/benchmark": {
39 | "version": "2.1.4",
40 | "resolved": "https://registry.npmjs.org/benchmark/-/benchmark-2.1.4.tgz",
41 | "integrity": "sha1-CfPeMckWQl1JjMLuVloOvzwqVik=",
42 | "dev": true,
43 | "dependencies": {
44 | "lodash": "^4.17.4",
45 | "platform": "^1.3.3"
46 | }
47 | },
48 | "node_modules/doublearray": {
49 | "version": "0.0.2",
50 | "resolved": "https://registry.npmjs.org/doublearray/-/doublearray-0.0.2.tgz",
51 | "integrity": "sha1-Yxhv6NNEEydtNiH2qg7F954ifvk="
52 | },
53 | "node_modules/kuromoji": {
54 | "version": "0.1.2",
55 | "resolved": "https://registry.npmjs.org/kuromoji/-/kuromoji-0.1.2.tgz",
56 | "integrity": "sha512-V0dUf+C2LpcPEXhoHLMAop/bOht16Dyr+mDiIE39yX3vqau7p80De/koFqpiTcL1zzdZlc3xuHZ8u5gjYRfFaQ==",
57 | "dependencies": {
58 | "async": "^2.0.1",
59 | "doublearray": "0.0.2",
60 | "zlibjs": "^0.3.1"
61 | }
62 | },
63 | "node_modules/lodash": {
64 | "version": "4.17.21",
65 | "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz",
66 | "integrity": "sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg=="
67 | },
68 | "node_modules/platform": {
69 | "version": "1.3.6",
70 | "resolved": "https://registry.npmjs.org/platform/-/platform-1.3.6.tgz",
71 | "integrity": "sha512-fnWVljUchTro6RiCFvCXBbNhJc2NijN7oIQxbwsyL0buWJPG85v81ehlHI9fXrJsMNgTofEoWIQeClKpgxFLrg==",
72 | "dev": true
73 | },
74 | "node_modules/wasm-core": {
75 | "resolved": "../wasm-core/pkg",
76 | "link": true
77 | },
78 | "node_modules/wasm-features": {
79 | "resolved": "../wasm-features/pkg",
80 | "link": true
81 | },
82 | "node_modules/zlibjs": {
83 | "version": "0.3.1",
84 | "resolved": "https://registry.npmjs.org/zlibjs/-/zlibjs-0.3.1.tgz",
85 | "integrity": "sha1-UBl+2yihxCymWcyLTmqd3W1ERVQ=",
86 | "engines": {
87 | "node": "*"
88 | }
89 | }
90 | },
91 | "dependencies": {
92 | "async": {
93 | "version": "2.6.3",
94 | "resolved": "https://registry.npmjs.org/async/-/async-2.6.3.tgz",
95 | "integrity": "sha512-zflvls11DCy+dQWzTW2dzuilv8Z5X/pjfmZOWba6TNIVDm+2UDaJmXSOXlasHKfNBs8oo3M0aT50fDEWfKZjXg==",
96 | "requires": {
97 | "lodash": "^4.17.14"
98 | }
99 | },
100 | "benchmark": {
101 | "version": "2.1.4",
102 | "resolved": "https://registry.npmjs.org/benchmark/-/benchmark-2.1.4.tgz",
103 | "integrity": "sha1-CfPeMckWQl1JjMLuVloOvzwqVik=",
104 | "dev": true,
105 | "requires": {
106 | "lodash": "^4.17.4",
107 | "platform": "^1.3.3"
108 | }
109 | },
110 | "doublearray": {
111 | "version": "0.0.2",
112 | "resolved": "https://registry.npmjs.org/doublearray/-/doublearray-0.0.2.tgz",
113 | "integrity": "sha1-Yxhv6NNEEydtNiH2qg7F954ifvk="
114 | },
115 | "kuromoji": {
116 | "version": "0.1.2",
117 | "resolved": "https://registry.npmjs.org/kuromoji/-/kuromoji-0.1.2.tgz",
118 | "integrity": "sha512-V0dUf+C2LpcPEXhoHLMAop/bOht16Dyr+mDiIE39yX3vqau7p80De/koFqpiTcL1zzdZlc3xuHZ8u5gjYRfFaQ==",
119 | "requires": {
120 | "async": "^2.0.1",
121 | "doublearray": "0.0.2",
122 | "zlibjs": "^0.3.1"
123 | }
124 | },
125 | "lodash": {
126 | "version": "4.17.21",
127 | "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz",
128 | "integrity": "sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg=="
129 | },
130 | "platform": {
131 | "version": "1.3.6",
132 | "resolved": "https://registry.npmjs.org/platform/-/platform-1.3.6.tgz",
133 | "integrity": "sha512-fnWVljUchTro6RiCFvCXBbNhJc2NijN7oIQxbwsyL0buWJPG85v81ehlHI9fXrJsMNgTofEoWIQeClKpgxFLrg==",
134 | "dev": true
135 | },
136 | "wasm-core": {
137 | "version": "file:../wasm-core/pkg"
138 | },
139 | "wasm-features": {
140 | "version": "file:../wasm-features/pkg"
141 | },
142 | "zlibjs": {
143 | "version": "0.3.1",
144 | "resolved": "https://registry.npmjs.org/zlibjs/-/zlibjs-0.3.1.tgz",
145 | "integrity": "sha1-UBl+2yihxCymWcyLTmqd3W1ERVQ="
146 | }
147 | }
148 | }
149 |
--------------------------------------------------------------------------------
/goya/src/double_array.rs:
--------------------------------------------------------------------------------
1 | use super::common_prefix_tree::CommonPrefixTree;
2 | use indexmap::IndexSet;
3 | use itertools::Itertools;
4 | use serde::{Deserialize, Serialize};
5 | use std::cmp;
6 | use std::collections::HashMap;
7 |
/// Index of the trie's root state in `base`/`check` (slot 0 is never used as a state).
const INDEX_ROOT: usize = 1;
/// Sentinel character whose transition marks the end of a word.
const TERM_CHAR: char = '\0';
10 |
/// Reasons a double-array transition can fail.
#[derive(Debug)]
pub enum TransitionError {
    /// The computed target index was negative (the source base held a negated word id).
    AlreadyTerminated,
    /// `base` has no entry at the target index.
    BaseFailed,
    /// `check` at the target does not point back at the source state.
    CheckFailed,
    /// The input char is not registered in `codes`.
    UnknownChar,
    /// The source index is outside `base`.
    BaseOutOfBounds,
    /// The target index is outside `check`.
    CheckOutOfBounds,
}
20 |
21 | #[derive(Debug, Serialize, Deserialize, rkyv::Archive, rkyv::Serialize, rkyv::Deserialize)]
22 | pub struct DoubleArray {
23 | pub codes: IndexSet,
24 | pub base: Vec,
25 | pub check: Vec,
26 | }
27 | impl Default for DoubleArray {
28 | fn default() -> Self {
29 | let base: Vec = vec![0, 1];
30 | let check: Vec = vec![0, 0];
31 | let mut codes: IndexSet = IndexSet::new();
32 |
33 | codes.insert(TERM_CHAR);
34 |
35 | DoubleArray { base, check, codes }
36 | }
37 | }
38 | impl DoubleArray {
39 | pub fn from(base: Vec, check: Vec, codes: IndexSet) -> Self {
40 | DoubleArray { base, check, codes }
41 | }
42 |
43 | pub fn wids(&self) -> impl Iterator- + '_ {
44 | self.base
45 | .iter()
46 | .filter(|s| **s < 0)
47 | .map(|s| as_usize(&(s * -1)))
48 | }
49 |
50 | pub fn from_cpt(trie: &CommonPrefixTree) -> Self {
51 | let mut state_cache = HashMap::new();
52 | let mut da = DoubleArray::default();
53 | let mut chars = trie
54 | .entires_dfs()
55 | .iter()
56 | .map(|(prefix, _)| prefix)
57 | .join("")
58 | .chars()
59 | .collect::
>();
60 | chars.sort_unstable();
61 | chars.dedup();
62 | for c in chars {
63 | da.insert_to_codes(c);
64 | }
65 |
66 | for (prefix, node) in trie.entires_dfs() {
67 | if node.can_stop() {
68 | continue;
69 | }
70 |
71 | // root node
72 | if prefix.is_empty() {
73 | for next_c in node.children.keys() {
74 | let next_char_code = da.get_code(next_c).unwrap();
75 | let t = da.base[INDEX_ROOT] + next_char_code as i32;
76 | let t = as_usize(&t);
77 | da.insert_to_check(t, INDEX_ROOT);
78 | state_cache.insert(concat_char_to_str(&prefix, *next_c), t);
79 | }
80 | continue;
81 | }
82 |
83 | let s = *state_cache.get(&prefix).unwrap();
84 | da.insert_to_base(s, da.find_next_s(node));
85 | for (next_c, child) in node.children.iter() {
86 | let t = da.base.get(s).unwrap() + da.get_code(next_c).unwrap() as i32;
87 | let t = as_usize(&t);
88 | da.insert_to_check(t, s);
89 | if child.can_stop() {
90 | da.insert_to_base(t, -(child.id.unwrap() as i32));
91 | } else {
92 | let key = concat_char_to_str(&prefix, *next_c);
93 | state_cache.insert(key, t);
94 | }
95 | }
96 | }
97 | da.base.shrink_to_fit();
98 | da.check.shrink_to_fit();
99 | da.codes.shrink_to_fit();
100 | da
101 | }
102 |
103 | pub fn transition(
104 | &self,
105 | from: usize,
106 | to: char,
107 | ) -> Result<(i32, Option), TransitionError> {
108 | let code = self.get_code(&to).ok_or(TransitionError::UnknownChar)?;
109 | let s = self
110 | .base
111 | .get(from)
112 | .ok_or(TransitionError::BaseOutOfBounds)?;
113 | let t = s + code as i32;
114 | if t < 0 {
115 | return Err(TransitionError::AlreadyTerminated);
116 | }
117 | let next = self
118 | .check
119 | .get(as_usize(&t))
120 | .ok_or(TransitionError::CheckOutOfBounds)?;
121 | let base = self
122 | .base
123 | .get(t as usize)
124 | .ok_or(TransitionError::BaseFailed)?;
125 | let wid = if *base < 0 {
126 | Some((base * -1) as usize)
127 | } else {
128 | None
129 | };
130 | if *next == from {
131 | Ok((t, wid))
132 | } else {
133 | Err(TransitionError::CheckFailed)
134 | }
135 | }
136 |
137 | pub fn init(&self, to: char) -> Result<(i32, Option), TransitionError> {
138 | self.transition(INDEX_ROOT, to)
139 | }
140 |
141 | pub fn stop(&self, from: usize) -> Result {
142 | match self.transition(from, TERM_CHAR) {
143 | Ok((_, Some(wid))) => Ok(wid),
144 | Ok(_) => unreachable!("Successful transition, but no wid"),
145 | Err(reason) => Err(reason),
146 | }
147 | }
148 |
149 | pub fn get_code(&self, c: &char) -> Option {
150 | self.codes.get_full(c).map(|(code, _)| code)
151 | }
152 |
153 | fn insert_to_codes(&mut self, c: char) -> usize {
154 | let (char_code, _) = self.codes.insert_full(c);
155 | char_code
156 | }
157 |
158 | fn insert_to_base(&mut self, index: usize, value: i32) {
159 | let resized = cmp::max(self.base.len(), index + 1);
160 | self.base.resize(resized, 0);
161 | assert_eq!(
162 | self.base[index], 0,
163 | "index={} already used: {:?}",
164 | index, self.base
165 | );
166 | self.base[index] = value;
167 | }
168 |
169 | fn insert_to_check(&mut self, index: usize, value: usize) {
170 | let resized = cmp::max(self.check.len(), index + 1);
171 | self.check.resize(resized, 0);
172 | self.check[index] = value;
173 | }
174 |
175 | fn get_available_check_index(&self, left: usize) -> usize {
176 | self.check
177 | .iter()
178 | .enumerate()
179 | .skip(left)
180 | // clippy says that `find is prefered to skip_while+next` but it's slower than the current
181 | .skip_while(|(_, value)| value != &&0)
182 | .next()
183 | .map(|(i, _)| i)
184 | .unwrap_or_else(|| unreachable!("index must be found"))
185 | }
186 |
187 | fn find_next_s(&self, child: &CommonPrefixTree) -> i32 {
188 | let mut position = self.get_available_check_index(INDEX_ROOT + 1);
189 | let min_code = self.get_code(child.min_char().unwrap()).unwrap();
190 | let offsets: Vec<_> = child
191 | .children
192 | .keys()
193 | .map(|c| self.get_code(c).unwrap() - min_code)
194 | .collect();
195 | while offsets
196 | .iter()
197 | .any(|code| match self.check.get(position + code) {
198 | Some(0) => false,
199 | Some(_) => true,
200 | _ => false,
201 | })
202 | {
203 | position += 1;
204 | }
205 | (position - min_code) as i32
206 | }
207 | }
208 |
/// Converts a non-negative `i32` into `usize`, panicking on negative input.
fn as_usize(n: &i32) -> usize {
    let value = *n;
    assert!(value >= 0, "n({}) should be greater than or equal to 0", value);
    value as usize
}
213 |
/// Returns a new `String` consisting of `text` followed by `c`.
fn concat_char_to_str(text: &str, c: char) -> String {
    let mut out = String::with_capacity(text.len() + c.len_utf8());
    out.push_str(text);
    out.push(c);
    out
}
219 |
--------------------------------------------------------------------------------
/goya/src/lattice.rs:
--------------------------------------------------------------------------------
1 | use super::char_class::{CharDefinition, InvokeTiming};
2 | use super::dictionary::Dictionary;
3 | use super::double_array::DoubleArray;
4 | use super::id::WordIdentifier;
5 | use std::collections::{HashSet, VecDeque};
6 |
/// Context id of the virtual BOS (beginning-of-sentence) morpheme.
pub const BOS_CONTEXT_ID: usize = 0;
/// Context id of the virtual EOS (end-of-sentence) morpheme.
pub const EOS_CONTEXT_ID: usize = 0;
/// DP-table row index representing BOS (used as the backtracking sentinel).
const NODE_BOS: usize = 0;
10 |
11 | #[derive(Debug)]
12 | pub struct Lattice {
13 | // (wid, length of the word)
14 | pub indices: Vec>,
15 | // (min cost, index, length)
16 | pub dp: Vec>,
17 | }
18 | impl Lattice {
19 | pub fn parse(text: &str, da: &DoubleArray, dict: &D) -> Lattice {
20 | let len = text.chars().count();
21 | let mut indices: Vec> = vec![vec![]; len];
22 | let mut open_indices = VecDeque::from(vec![0]);
23 | let mut visited = HashSet::with_capacity(len);
24 | let char_defs = text
25 | .chars()
26 | .map(|c| dict.classify_char(&c))
27 | .collect::>();
28 |
29 | while let Some(index) = open_indices.pop_front() {
30 | if visited.contains(&index) || index >= len {
31 | continue;
32 | }
33 | visited.insert(index);
34 |
35 | let c = text.chars().nth(index).unwrap();
36 | let def = char_defs[index];
37 | if let InvokeTiming::Always = def.timing {
38 | let surface_form = dict.take_unknown_chars_seq(def, text, &index);
39 | open_indices.push_back(index + surface_form.chars().count());
40 | for (wid, _) in dict.get_unknown_morphemes_by_class(&def.class) {
41 | indices[index].push((
42 | WordIdentifier::Unknown(wid, surface_form.to_string()),
43 | surface_form.chars().count(),
44 | ));
45 | }
46 | }
47 |
48 | if let Ok((mut cursor, _)) = da.init(c) {
49 | if let Ok(wid) = da.stop(cursor as usize) {
50 | open_indices.push_back(index + 1);
51 | for wid in dict.resolve_homonyms(&wid).unwrap().iter() {
52 | indices[index].push((
53 | WordIdentifier::Known(*wid, text.chars().skip(index).take(1).collect()),
54 | 1,
55 | ));
56 | }
57 | }
58 | let mut j = index + 1;
59 | while j < len {
60 | let c = text.chars().nth(j).unwrap();
61 | match da.transition(cursor as usize, c) {
62 | Ok((next, _)) => {
63 | if let Ok(wid) = da.stop(next as usize) {
64 | open_indices.push_back(j + 1);
65 | for wid in dict.resolve_homonyms(&wid).unwrap().iter() {
66 | indices[index].push((
67 | WordIdentifier::Known(
68 | *wid,
69 | text.chars().skip(index).take(j + 1 - index).collect(),
70 | ),
71 | j + 1 - index,
72 | ));
73 | }
74 | }
75 | cursor = next;
76 | }
77 | Err(_) => {
78 | break;
79 | }
80 | }
81 | j += 1;
82 | }
83 | }
84 | if indices[index].is_empty() && matches!(def.timing, InvokeTiming::Fallback) {
85 | let surface_form = dict.take_unknown_chars_seq(def, text, &index);
86 | open_indices.push_back(index + surface_form.chars().count());
87 | for (wid, _) in dict.get_unknown_morphemes_by_class(&def.class) {
88 | indices[index].push((
89 | WordIdentifier::Unknown(wid, surface_form.to_string()),
90 | surface_form.chars().count(),
91 | ));
92 | }
93 | }
94 | }
95 | Lattice {
96 | dp: get_dp_table(&indices, dict),
97 | indices,
98 | }
99 | }
100 |
101 | pub fn word_identifiers(&self) -> Vec {
102 | let mut wids = vec![];
103 | for idx in self.indices.iter() {
104 | for (wid, _) in idx.iter() {
105 | wids.push(wid.clone())
106 | }
107 | }
108 | wids
109 | }
110 |
111 | pub fn find_best_path(&self) -> Option> {
112 | let mut path = vec![];
113 | let mut cursor = (self.dp.len() - 1, 0);
114 | loop {
115 | match self.dp[cursor.0].get(cursor.1) {
116 | Some((_, i, j)) => {
117 | if *i == NODE_BOS {
118 | break;
119 | }
120 | path.insert(0, (*i, *j));
121 | cursor = (*i, *j);
122 | }
123 | _ => return None,
124 | }
125 | }
126 | Some(path)
127 | }
128 |
129 | pub fn find_best(&self) -> Option> {
130 | match self.find_best_path() {
131 | Some(best_path) => {
132 | let mut ids = vec![];
133 | for (i, j) in best_path.iter() {
134 | ids.push(self.indices[*i - 1][*j].0.clone());
135 | }
136 | Some(ids)
137 | }
138 | None => None,
139 | }
140 | }
141 | }
142 |
143 | fn get_dp_table(
144 | indices: &[Vec<(WordIdentifier, usize)>],
145 | dict: &D,
146 | ) -> Vec> {
147 | let len = indices.len();
148 | let max_num_childs = indices.iter().map(|idx| idx.len()).max().unwrap();
149 | // (min cost, idx of indices, idx2 of indices[idx])
150 | // * dp[0][0] means BOS
151 | // * dp[dp.len() - 1][0] means EOS
152 | // Individual cost should be less in i16, the sum of costs can exceed its range.
153 | // Currently each element has unused indices to reduce num alloc
154 | let mut dp: Vec> =
155 | vec![vec![(i32::MAX, 0, 0); max_num_childs]; len + 2];
156 | if max_num_childs == 0 {
157 | return dp;
158 | }
159 | dp[0][0] = (0, 0, 0);
160 |
161 | for (i, (right_wid, _)) in indices[0].iter().enumerate() {
162 | let right = dict.get(right_wid).unwrap();
163 | let cost = dict
164 | .transition_cost(&BOS_CONTEXT_ID, &right.right_context_id)
165 | .unwrap()
166 | + right.cost;
167 | dp[1][i] = (cost as i32, NODE_BOS, 0);
168 | }
169 |
170 | for (i, index) in indices.iter().enumerate() {
171 | for (j, (left_wid, wlen)) in index.iter().enumerate() {
172 | let before_cost = dp[i + 1][j].0;
173 | let left = dict.get(left_wid).unwrap();
174 | if i + wlen >= len {
175 | let cost = (*dict
176 | .transition_cost(&left.left_context_id, &EOS_CONTEXT_ID)
177 | .unwrap() as i32)
178 | + (left.cost as i32)
179 | + before_cost;
180 | if cost < dp[i + wlen + 1][0].0 {
181 | dp[i + wlen + 1][0] = (cost, i + 1, j);
182 | }
183 | continue;
184 | }
185 |
186 | for (k, (right_wid, _)) in indices[i + wlen].iter().enumerate() {
187 | let right = dict.get(right_wid).unwrap();
188 | let cost = (*dict
189 | .transition_cost(&left.left_context_id, &right.right_context_id)
190 | .unwrap() as i32)
191 | + left.cost as i32
192 | + right.cost as i32
193 | + before_cost;
194 | if cost < dp[i + 1 + wlen][k].0 {
195 | dp[i + 1 + wlen][k] = (cost, i + 1, j);
196 | }
197 | }
198 | }
199 | }
200 | dp
201 | }
202 |
--------------------------------------------------------------------------------
/ipadic/src/ipadic_loader.rs:
--------------------------------------------------------------------------------
1 | use super::ipadic::IPADic;
2 | use csv::ReaderBuilder;
3 | use encoding_rs::EUC_JP;
4 | use glob::glob;
5 | use goya::char_class::{CharClass, CharClassifier, CharDefinition, InvokeTiming};
6 | use goya::morpheme::Morpheme;
7 | use goya::word_features::WordFeaturesMap;
8 | use indexmap::IndexSet;
9 | use regex::Regex;
10 | use serde::Deserialize;
11 | use std::collections::{HashMap, HashSet};
12 | use std::error::Error;
13 | use std::fs;
14 | use std::path::Path;
15 | use std::vec::Vec;
16 |
// Column layout of MeCab dictionary CSV rows.
const COL_SURFACE_FORM: usize = 0; // surface form (表層形)
const COL_LEFT_CONTEXT_ID: usize = 1; // left context id (左文脈ID)
const COL_RIGHT_CONTEXT_ID: usize = 2; // right context id (右文脈ID)
const COL_COST: usize = 3; // word cost (コスト)
21 |
22 | pub struct LoadResult {
23 | pub ipadic: IPADic,
24 | pub word_set: WordFeaturesMap,
25 | pub surfaces: HashMap,
26 | }
27 |
28 | pub struct IPADicLoader {}
29 | impl IPADicLoader {
30 | pub fn load(&self, dir: &str) -> Result> {
31 | let classes = load_chars(Path::new(dir).join("char.def"))?;
32 | let matrix = load_matrix(Path::new(dir).join("matrix.def"))?;
33 | let unknown = load_unknown(Path::new(dir).join("unk.def"))?;
34 | let csv_pattern = Path::new(dir).join("*.csv");
35 | let csv_pattern = csv_pattern.to_str().ok_or("Failed to build glob pattern")?;
36 |
37 | let mut vocabulary_index: IndexSet = IndexSet::new();
38 | let mut surfaces = HashMap::new();
39 | let mut known_features = HashMap::new();
40 | let mut vocabulary = HashMap::new();
41 | let mut tmp_homonyms = HashMap::new();
42 | let mut id: usize = 1;
43 | for path in glob(csv_pattern)? {
44 | for row in load_words_csv(path?)? {
45 | surfaces.insert(id, row.surface_form.to_string());
46 | known_features.insert(id, row.features.clone());
47 | tmp_homonyms
48 | .entry(row.surface_form.to_string())
49 | .or_insert_with(Vec::new)
50 | .push(id);
51 |
52 | let (idx, _) = vocabulary_index.insert_full(row.into());
53 | vocabulary.insert(id, idx);
54 | id += 1;
55 | }
56 | }
57 | let mut homonyms: HashMap> = HashMap::new();
58 | for wids in tmp_homonyms.values() {
59 | for wid in wids.iter() {
60 | homonyms.insert(*wid, wids.iter().copied().collect());
61 | }
62 | }
63 |
64 | let mut unknown_vocabulary = HashMap::new();
65 | let mut unknown_features = HashMap::new();
66 | let mut unknown_classes = HashMap::new();
67 | let mut id = 1;
68 | for (class, words) in unknown.into_iter() {
69 | for row in words {
70 | unknown_features.insert(id, row.features.clone());
71 | let (idx, _) = vocabulary_index.insert_full(row.into());
72 | unknown_vocabulary.insert(id, idx);
73 | unknown_classes
74 | .entry(class.to_string())
75 | .or_insert_with(Vec::new)
76 | .push(id);
77 | id += 1;
78 | }
79 | }
80 |
81 | let word_set = WordFeaturesMap::new(
82 | map_to_vec(known_features, Vec::new),
83 | map_to_vec(unknown_features, Vec::new),
84 | );
85 | let ipadic = IPADic::from(
86 | map_to_vec(vocabulary, || 0),
87 | map_to_vec(homonyms, Vec::new),
88 | classes,
89 | matrix,
90 | unknown_classes,
91 | map_to_vec(unknown_vocabulary, || 0),
92 | vocabulary_index,
93 | );
94 | let ret = LoadResult {
95 | word_set,
96 | ipadic,
97 | surfaces,
98 | };
99 | Ok(ret)
100 | }
101 | }
102 |
103 | #[derive(Debug, Clone, Deserialize)]
104 | struct CSVRow {
105 | /// 表層形
106 | /// https://taku910.github.io/mecab/dic-detail.html
107 | surface_form: String,
108 | /// 左文脈ID (単語を左から見たときの文脈 ID)
109 | /// https://taku910.github.io/mecab/dic-detail.html
110 | left_context_id: usize,
111 | /// 右文脈ID (単語を右から見たときの文脈 ID)
112 | /// https://taku910.github.io/mecab/dic-detail.html
113 | right_context_id: usize,
114 | /// 単語コスト (小さいほど出現しやすい)
115 | /// コスト値は short int (16bit 整数) の範囲におさめる必要があります.
116 | cost: i16,
117 | /// 5カラム目以降は, ユーザ定義の CSV フィールドです. 基本的に どんな内容でも CSV の許す限り追加することができます.
118 | /// https://taku910.github.io/mecab/dic-detail.html
119 | features: Vec,
120 | }
121 | impl From for Morpheme {
122 | fn from(row: CSVRow) -> Self {
123 | Morpheme {
124 | left_context_id: row.left_context_id,
125 | right_context_id: row.right_context_id,
126 | cost: row.cost,
127 | }
128 | }
129 | }
130 |
131 | fn load_words_csv(path: P) -> Result, Box>
132 | where
133 | P: AsRef,
134 | {
135 | let eucjp = fs::read(path)?;
136 | let (utf8, _, _) = EUC_JP.decode(&eucjp);
137 | let mut rdr = ReaderBuilder::new()
138 | .has_headers(false)
139 | .from_reader(utf8.as_bytes());
140 | let mut words = vec![];
141 | for row in rdr.records() {
142 | let row = row?;
143 | words.push(CSVRow {
144 | surface_form: row[COL_SURFACE_FORM].to_string(),
145 | left_context_id: row[COL_LEFT_CONTEXT_ID].parse::().unwrap(),
146 | right_context_id: row[COL_RIGHT_CONTEXT_ID].parse::().unwrap(),
147 | cost: row[COL_COST].parse::().unwrap(),
148 | features: row
149 | .iter()
150 | .skip(COL_COST + 1)
151 | .map(|v| v.to_string())
152 | .collect::>(),
153 | })
154 | }
155 | Ok(words)
156 | }
157 |
158 | fn load_chars(path: P) -> Result>
159 | where
160 | P: AsRef,
161 | {
162 | let eucjp = fs::read(path)?;
163 | let (utf8, _, _) = EUC_JP.decode(&eucjp);
164 | let lines = utf8
165 | .lines()
166 | .filter(|line| !line.is_empty() && !line.starts_with('#'))
167 | .map(|line| Regex::new(r"#.*$").unwrap().replace(line, ""))
168 | .collect::>();
169 |
170 | let head = lines.iter().take_while(|line| {
171 | let parts = line.trim().split_ascii_whitespace().collect::>();
172 | !parts[0].starts_with("0x")
173 | });
174 | let mut chars = HashMap::new();
175 | for line in head {
176 | let parts = line.trim().split_ascii_whitespace().collect::>();
177 | let kind = parts[0].to_owned();
178 | let class = kind.to_string();
179 | let timing = if parts[1] == "0" {
180 | InvokeTiming::Fallback
181 | } else {
182 | InvokeTiming::Always
183 | };
184 | let group_by_same_kind = parts[2] == "1";
185 | let len = parts[3].parse::()?;
186 | chars.insert(
187 | kind,
188 | CharDefinition {
189 | class,
190 | timing,
191 | group_by_same_kind,
192 | len,
193 | compatibilities: HashSet::new(),
194 | },
195 | );
196 | }
197 |
198 | let tail = lines.iter().skip_while(|line| {
199 | let parts = line.trim().split_ascii_whitespace().collect::>();
200 | !parts[0].starts_with("0x")
201 | });
202 | let mut ranges = vec![];
203 | for line in tail {
204 | let parts = line.trim().split_ascii_whitespace().collect::>();
205 | let range = parts[0]
206 | .split("..")
207 | .map(|c| u32::from_str_radix(&c[2..], 16).unwrap())
208 | .map(|c| char::from_u32(c).unwrap())
209 | .collect::>();
210 | let range = if range.len() > 1 {
211 | (range[0] as u32, range[1] as u32)
212 | } else {
213 | (range[0] as u32, range[0] as u32)
214 | };
215 | let class = parts[1];
216 | let compatibilities = parts
217 | .iter()
218 | .skip(2)
219 | .map(|s| s.to_string())
220 | .collect::>();
221 | chars.get_mut(class).unwrap().compatibilities = compatibilities;
222 | ranges.push(CharClass::from(range, class.to_string()));
223 | }
224 |
225 | Ok(CharClassifier::from(chars, ranges))
226 | }
227 |
228 | fn load_matrix(path: P) -> Result>, Box>
229 | where
230 | P: AsRef,
231 | {
232 | let eucjp = fs::read(path)?;
233 | let (utf8, _, _) = EUC_JP.decode(&eucjp);
234 | let mut lines = utf8.lines();
235 | let size = lines
236 | .next()
237 | .expect("failed to read the first line")
238 | .split_ascii_whitespace()
239 | .map(|p| p.parse::().unwrap())
240 | .collect::>();
241 | let mut matrix = vec![vec![-1; size[1]]; size[0]];
242 | for line in lines {
243 | let parts = line.split_ascii_whitespace().collect::>();
244 | let left = parts[0].parse::()?;
245 | let right = parts[1].parse::()?;
246 | let cost = parts[2].parse::()?;
247 | matrix[left][right] = cost;
248 | }
249 | Ok(matrix)
250 | }
251 |
252 | fn load_unknown(path: P) -> Result>, Box>
253 | where
254 | P: AsRef,
255 | {
256 | let words = load_words_csv(path)?;
257 | let mut map = HashMap::>::new();
258 | for w in words.into_iter() {
259 | map.entry(w.surface_form.to_string())
260 | .or_insert_with(Vec::new)
261 | .push(w);
262 | }
263 | Ok(map)
264 | }
265 |
/// Converts an id-keyed map into a dense `Vec` indexed by id.
///
/// Index 0 and any unused index are filled with `default()`; callers
/// assign ids contiguously starting at 1, so the result has `len + 1`
/// slots. Panics if a key exceeds `map.len()`.
fn map_to_vec<T: Clone>(map: HashMap<usize, T>, default: impl Fn() -> T) -> Vec<T> {
    let mut ret = vec![default(); map.len() + 1];
    for (idx, value) in map.into_iter() {
        ret[idx] = value;
    }
    ret
}
273 |
--------------------------------------------------------------------------------