├── .gitignore ├── evaluation ├── data │ └── .gitkeep ├── .gitignore ├── requirements.txt ├── resources │ ├── sim │ │ ├── jwsan.yaml │ │ └── tmu-sudachic.yaml │ ├── clf │ │ └── livedoor-sudachic.yaml │ └── vec │ │ └── chive-1.3-mc90-sudachic.yaml ├── models │ ├── classifier.py │ └── w2v.py ├── dataset │ ├── base.py │ ├── labeled.py │ └── pairwise.py ├── eval │ └── evaluator.py ├── utils.py ├── run_docclf.py ├── run_wordsim.py └── README.md ├── .github └── FUNDING.yml ├── training ├── requirements.txt ├── convert_model_format.py ├── README.md ├── prepare_corpus.py └── train_chive.py ├── docs ├── tutorial.md └── continue-training.md ├── LICENSE ├── README.md └── README_en.md /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | -------------------------------------------------------------------------------- /evaluation/data/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /evaluation/.gitignore: -------------------------------------------------------------------------------- 1 | data/* 2 | !/data/.gitkeep 3 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: WorksApplications 4 | -------------------------------------------------------------------------------- /training/requirements.txt: -------------------------------------------------------------------------------- 1 | gensim 2 | tqdm 3 | 4 | # sudachi for v1.3 5 | sudachipy==0.6.8 6 | sudachidict_core==20240109 7 | -------------------------------------------------------------------------------- /evaluation/requirements.txt: -------------------------------------------------------------------------------- 1 | gensim 2 | pyyaml 3 | pandas 4 | scikit-learn 5 | 6 | # sudachi for v1.3 7 | sudachipy==0.6.8 8 | sudachidict_core==20240109 9 | -------------------------------------------------------------------------------- /docs/tutorial.md: -------------------------------------------------------------------------------- 1 | # chiVe Tutorial 2 | 3 | [公式README](https://github.com/WorksApplications/chiVe/blob/master/README.md)を参照してください。 4 | 5 | Please refer to the [official README](https://github.com/WorksApplications/chiVe/blob/master/README.md) for the information. 
6 | -------------------------------------------------------------------------------- /evaluation/resources/sim/jwsan.yaml: -------------------------------------------------------------------------------- 1 | name: "jwsan" 2 | data-path: "./data/JWSAN" 3 | process: 4 | use-tokenizer: True 5 | tokenizer: 6 | name: "sudachi" 7 | others: 8 | mode: "C" 9 | dic-name: 10 | form: "normalized" # "normalized" or "surface" 11 | -------------------------------------------------------------------------------- /evaluation/resources/sim/tmu-sudachic.yaml: -------------------------------------------------------------------------------- 1 | name: "tmu" 2 | data-path: "./data/JapaneseWordSimilarityDataset" 3 | process: 4 | use-tokenizer: True 5 | tokenizer: 6 | name: "sudachi" 7 | others: 8 | mode: "C" 9 | dic-name: 10 | form: "normalized" # "normalized" or "surface" 11 | -------------------------------------------------------------------------------- /evaluation/resources/clf/livedoor-sudachic.yaml: -------------------------------------------------------------------------------- 1 | name: "livedoor" 2 | data-path: "./data/ldcc-20140209/text" 3 | process: 4 | pre-tokenized: False 5 | tokenizer: 6 | name: "sudachi" 7 | others: 8 | mode: "C" 9 | dic-name: 10 | form: "normalized" # "surface" or "normalized" 11 | used-pos: ["名詞"] 12 | 13 | classifier: 14 | method: "logreg" 15 | others: 16 | -------------------------------------------------------------------------------- /evaluation/resources/vec/chive-1.3-mc90-sudachic.yaml: -------------------------------------------------------------------------------- 1 | vec-path: "path/to/chive-1.3-mc90_gensim/chive-1.3-mc90.kv" 2 | loading: 3 | w2v-fmt: True # set false to load gensim word2vec format (full model) 4 | fmt: "kv" # set "txt" to load text format, or "kv" to keyedvector format 5 | gensim-others: 6 | description: 7 | tokenizer: 8 | name: "sudachi" 9 | others: 10 | mode: "C" 11 | dic_name: 12 | api-setting: 13 | -------------------------------------------------------------------------------- /evaluation/models/classifier.py: -------------------------------------------------------------------------------- 1 | 2 | import logging 3 | from sklearn.linear_model import LogisticRegression 4 | from sklearn.neighbors import KNeighborsClassifier 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | def build_classifier(method, other_config): 10 | if method == "logreg": 11 | clf = LogisticRegression() 12 | elif method == "knn": 13 | clf = KNeighborsClassifier(n_neighbors=other_config["k"]) 14 | else: 15 | raise ValueError("Invalid method: {}".format(method)) 16 | return clf 17 | -------------------------------------------------------------------------------- /evaluation/dataset/base.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | 4 | 5 | class BaseDataset(): 6 | def __init__(self, samples, name=None): 7 | _type = type(samples) 8 | assert _type == list or _type == np.ndarray, "Should be list or ndarray" 9 | self._samples = samples if type(samples) == np.ndarray else np.array(samples) 10 | self.name = name 11 | 12 | def __repr__(self): 13 | return "<{} name={}>".format(self.__class__.__name__, self.name) 14 | 15 | def __getitem__(self, item): 16 | return self._samples[item] 17 | 18 | def __len__(self): 19 | return len(self._samples) 20 | 21 | def split(self): 22 | raise NotImplementedError() 23 | 24 | def batch_iter(self, batchsize, rand_flg): 25 | assert batchsize > 0 26 | indices = 
np.random.permutation(len(self)) if rand_flg else np.arange(len(self)) 27 | for start in range(0, len(self), batchsize): 28 | yield self[indices[start: start+batchsize]] 29 | 30 | def batch_iter_as_ndarray(self, batchsize, rand_flg): 31 | raise NotImplementedError("Not available") 32 | 33 | def get_basic_stats(self): 34 | raise NotImplementedError() 35 | 36 | def cal_stats(self): 37 | return {"n_sample": len(self._samples)} 38 | 39 | 40 | class BaseInstance(): 41 | def __init__(self): 42 | self.model_pred = None 43 | 44 | def get_content(self): 45 | raise NotImplementedError() 46 | 47 | def set_model_prediction(self, pred): 48 | self.model_pred = pred 49 | -------------------------------------------------------------------------------- /training/convert_model_format.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | from pathlib import Path 4 | 5 | from gensim.models import Word2Vec 6 | 7 | 8 | logging.basicConfig( 9 | style="{", 10 | format='{levelname} {asctime} [{module}:{funcName}:{lineno}] {message}', 11 | datefmt="%m/%d/%Y %H:%M:%S", 12 | level=logging.INFO, 13 | ) 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | def parse_args(): 18 | p = argparse.ArgumentParser( 19 | "Convert trained gensim.Word2Vec full model into release formats.") 20 | p.add_argument("--input", type=Path, 21 | help="target model data (.bin of gensim.Word2Vec)") 22 | p.add_argument("--output", type=Path, default=Path("./output"), 23 | help="directory to output") 24 | args = p.parse_args() 25 | return args 26 | 27 | 28 | def main(): 29 | args = parse_args() 30 | args.output.mkdir(parents=True, exist_ok=True) 31 | 32 | stem = args.input.stem 33 | fullmodel = Word2Vec.load(str(args.input)) 34 | 35 | # ref: 36 | # https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec 37 | # https://radimrehurek.com/gensim/models/keyedvectors.html#how-to-obtain-word-vectors 38 | logger.info(f"gensim.KeyedVectors") 39 | outfile = args.output / f"{stem}_gensim" / f"{stem}.kv" 40 | outfile.parent.mkdir() 41 | fullmodel.wv.save(str(outfile)) 42 | 43 | # ref: https://radimrehurek.com/gensim/models/keyedvectors.html#gensim.models.keyedvectors.KeyedVectors.save_word2vec_format 44 | logger.info(f"text format") 45 | outfile = args.output / f"{stem}_text" / f"{stem}.txt" 46 | outfile.parent.mkdir() 47 | fullmodel.wv.save_word2vec_format(str(outfile)) 48 | 49 | # ref: https://github.com/plasticityai/magnitude/tree/master?tab=readme-ov-file#file-format-and-converter 50 | # logger.info(f"magnitude") 51 | # from pymagnitude.third_party_mock.converter import convert as convert_magnitude 52 | # outfile = args.output / f"{stem}.magnitude" 53 | # convert_magnitude( 54 | # input_file_path=str(args.input), 55 | # output_file_path=str(outfile), 56 | # ) 57 | 58 | return 59 | 60 | 61 | if __name__ == '__main__': 62 | main() 63 | -------------------------------------------------------------------------------- /evaluation/eval/evaluator.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | from scipy.stats import spearmanr 4 | from sklearn.model_selection import cross_val_score, StratifiedKFold 5 | 6 | 7 | class Evaluator(): 8 | def __init__(self, dataset): 9 | self.dataset = dataset 10 | 11 | def predict_all(self, batchsize=1): 12 | for batch in self.dataset.batch_iter(batchsize=batchsize, rand_flg=False): 13 | for inst in batch: 14 | self._predict_one(inst) 15 | 16 | def run(self): 
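        """Predict over all instances, then compute and return the evaluation metric."""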
17 | self.predict_all() 18 | return self.get_eval_metric() 19 | 20 | def get_eval_metric(self): 21 | raise NotImplementedError() 22 | 23 | def _predict_one(self, inst): 24 | raise NotImplementedError() 25 | 26 | 27 | class W2VSimilarityEvaluator(Evaluator): 28 | def __init__(self, dataset, w2v_api): 29 | super(W2VSimilarityEvaluator, self).__init__(dataset=dataset) 30 | self.w2v_api = w2v_api 31 | 32 | def get_eval_metric(self): 33 | # spearman corr 34 | human_scores = [ 35 | b[0].gold for b in self.dataset.batch_iter(1, rand_flg=False)] 36 | auto_scores = [ 37 | b[0].model_pred for b in self.dataset.batch_iter(1, rand_flg=False)] 38 | # assert all(human_scores), "Contain invalid gold data" 39 | # assert all(auto_scores), "Contain invalid prediction" 40 | corr = spearmanr(human_scores, auto_scores)[0] 41 | return corr 42 | 43 | def _predict_one(self, inst): 44 | e1, e2, gold_score = inst.get_content() 45 | pred = self.w2v_api.cal_phrase_similarity(e1, e2) 46 | inst.set_model_prediction(pred) 47 | 48 | 49 | class ClassificationEvaluator(Evaluator): 50 | def __init__(self, dataset, w2v_api, clf): 51 | super(ClassificationEvaluator, self).__init__(dataset=dataset) 52 | self.w2v_api = w2v_api 53 | self.clf = clf 54 | 55 | def run_kfold(self, k=10): 56 | cv = get_sklearn_kfoldcv(k) 57 | txt_xs, ys = self.dataset.get_xys() 58 | xs = np.array([self.w2v_api.get_mean_vector(doc) for doc in txt_xs]) 59 | scores = cross_val_score(self.clf, xs, ys, cv=cv) 60 | return {"scores": scores, "mean": np.mean(scores), "variance": np.var(scores)} 61 | 62 | 63 | def get_sklearn_kfoldcv(n_splits, seed=46): 64 | return StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed) 65 | -------------------------------------------------------------------------------- /evaluation/utils.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | from scipy.spatial import distance 4 | 5 | import sudachipy 6 | 7 | 8 | def cos_sim(v1, v2): 9 | return 1.0 - distance.cosine(v1, v2) 10 | 11 | 12 | def get_zero_vector(dim, eps=1e-8): 13 | return eps * np.ones(dim) 14 | 15 | 16 | def build_tokenizer(tok_name, config): 17 | if tok_name == 'sudachi': 18 | tok = SudachiTokenizer(config.get("mode", "C"), 19 | config.get("dic-name", None), 20 | config.get("form", "surface")) 21 | return tok 22 | else: 23 | raise ValueError("Invalid Tokenizer Name: {}".format(tok_name)) 24 | 25 | 26 | class SudachiTokenizer(): 27 | def __init__(self, mode=sudachipy.SplitMode.C, dic_name=None, form="surface"): 28 | mode = mode if type(mode) == sudachipy.SplitMode \ 29 | else self._str2mode(mode) 30 | self._name = f"sudachipy_{dic_name}_{self._mode2str(mode)}" 31 | 32 | dic_name = "core" if dic_name is None else dic_name 33 | self._tok = sudachipy.Dictionary(dict_type=dic_name).create(mode=mode) 34 | 35 | assert form in ["surface", 36 | "normalized"], f"Invalid form for sudachi: {form}." 
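        # output form used by wakati(): "surface" or "normalized"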
37 | self.form = form 38 | return 39 | 40 | def get_name(self) -> str: 41 | return self._name 42 | 43 | def tokenize(self, sent): 44 | return self._tok.tokenize(sent) 45 | 46 | def wakati(self, sent): 47 | ms = self.tokenize(sent) 48 | match self.form: 49 | case "surface": return [m.surface() for m in ms] 50 | case "normalized": return [m.normalized_form() for m in ms] 51 | case _: raise RuntimeError(f"unknown sudachi form: {self.form}") 52 | 53 | @staticmethod 54 | def _str2mode(modestr: str) -> sudachipy.SplitMode: 55 | """parse sudachipy.SplitMode from str""" 56 | match modestr.strip(): 57 | case "A" | "a": return sudachipy.SplitMode.A 58 | case "B" | "b": return sudachipy.SplitMode.B 59 | case "C" | "c": return sudachipy.SplitMode.C 60 | case _: raise ValueError(f"cannot parse {modestr} as SplitMode.") 61 | 62 | @staticmethod 63 | def _mode2str(mode: sudachipy.SplitMode) -> str: 64 | """convert sudachipy.SplitMode into str""" 65 | match mode: 66 | case sudachipy.SplitMode.A: return "A" 67 | case sudachipy.SplitMode.B: return "B" 68 | case sudachipy.SplitMode.C: return "C" 69 | -------------------------------------------------------------------------------- /docs/continue-training.md: -------------------------------------------------------------------------------- 1 | # chiVeの追加学習 2 | 3 | chiVeは、各ドメイン(分野)に合わせたデータで追加学習させられます。 4 | chiVeは、追加学習なしでも利用できますが、追加学習することでそのドメイン(分野)でのタスクの性能改善が期待できます。 5 | 6 | 7 | ## Step 1. フルモデルをダウンロード 8 | 9 | [学習させたいモデル](../README.md#追加学習用のフルモデル)を選択してダウンロードし、解凍します。 10 | 11 | ```sh 12 | $ wget https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc90_gensim-full.tar.gz 13 | $ tar xzvf chive-1.2-mc90_gensim-full.tar.gz 14 | ``` 15 | 16 | ## Step 2. 学習コーパスの用意 17 | 18 | 分かち書きした学習コーパスが必要です。 19 | 平文データ `corpus.txt` から、分かち書きしたファイル `corpus.tok.txt` を作ります。 20 | 21 | ```bash 22 | $ pip install sudachipy sudachidict_core 23 | ``` 24 | 25 | ```py 26 | import sudachipy 27 | 28 | tokenizer = sudachipy.Dictionary().create() 29 | 30 | def tokenize(sentence: str, mode: str) -> str: 31 | mode = { 32 | 'A': sudachipy.Tokenizer.SplitMode.A, 33 | 'B': sudachipy.Tokenizer.SplitMode.B, 34 | 'C': sudachipy.Tokenizer.SplitMode.C}[mode] 35 | tokens = [m.normalized_form() for m in tokenizer.tokenize(sentence, mode)] 36 | return ' '.join(tokens) 37 | 38 | def create_training_corpus(inputpath, outputpath): 39 | with open(inputpath) as inputfile, open(outputpath, 'w') as outputfile: 40 | for mode in ('A', 'B', 'C'): 41 | for line in inputfile: 42 | line = line.strip() 43 | if line == '': 44 | continue 45 | outputfile.write(tokenize(line, mode) + '\n') 46 | inputfile.seek(0) 47 | 48 | create_training_corpus('corpus.txt', 'corpus.tok.txt') 49 | ``` 50 | 51 | 52 | ## Step 3. 
学習 53 | 54 | 学習パラメータの詳細は、[gensim.models.word2vec.Word2Vec.train](https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec.train)を参照してください。 55 | 56 | ```bash 57 | $ pip install gensim 58 | ``` 59 | 60 | ```py 61 | from gensim.models.word2vec import LineSentence 62 | from gensim.models import Word2Vec 63 | 64 | sentences = LineSentence('corpus.tok.txt') 65 | model = Word2Vec.load('chive-1.2-mc90_gensim-full/chive-1.2-mc90.bin') 66 | model.vocabulary.min_count = 3 67 | model.build_vocab(sentences, update=True) 68 | model.train(sentences, total_examples=model.corpus_count, epochs=15) 69 | ``` 70 | 71 | 学習したモデルを保存します。 72 | 73 | * KeyedVectors: 学習に使用するパラメータを削除した埋め込みのみのデータ形式 74 | * Full model: 学習に使用するパラメータも保持したデータ形式 75 | 76 | ```py 77 | model.wv.save('chive-1.2-mc90.finetuned-mc3.kv') # Save as KeyedVectors 78 | model.save('chive-1.2-mc90.finetuned-mc3.bin') # Save as Full model 79 | ``` 80 | 81 | 82 | ## Step 4. 利用 83 | 84 | 保存した分散表現を読み込んで利用します。 85 | 86 | ```py 87 | from gensim.models import KeyedVectors 88 | KeyedVectors.load('chive-1.2-mc90.finetuned-mc3.kv') # Load as KeyedVectors 89 | Word2Vec.load('chive-1.2-mc90.finetuned-mc3.bin') # Load as Full model 90 | ``` 91 | -------------------------------------------------------------------------------- /evaluation/run_docclf.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | from datetime import datetime 4 | import logging 5 | from pathlib import Path 6 | import yaml 7 | 8 | from dataset.labeled import build_doc_classification_dataset 9 | from eval.evaluator import ClassificationEvaluator 10 | from models.classifier import build_classifier 11 | from models.w2v import build_gensim_w2v, build_w2v_api 12 | 13 | 14 | logging.basicConfig( 15 | style="{", 16 | format='{levelname} {asctime} [{module}:{funcName}:{lineno}] {message}', 17 | datefmt="%m-%d-%Y %H:%M:%S", 18 | level=logging.INFO, 19 | ) 20 | logger = logging.getLogger(__name__) 21 | 22 | 23 | def add_file_handler(logfile: Path) -> None: 24 | file_handler = logging.FileHandler(logfile) 25 | logger.addHandler(file_handler) 26 | return 27 | 28 | 29 | def run_docclf(vconfig_path, tconfig_path): 30 | with open(vconfig_path) as fv, open(tconfig_path) as ft: 31 | vec_config = yaml.load(fv, Loader=yaml.CLoader) 32 | task_config = yaml.load(ft, Loader=yaml.CLoader) 33 | logger.info(f"vector configulatation: {vec_config}") 34 | logger.info(f"task configulatation: {task_config}") 35 | 36 | logger.info("Setup...") 37 | w2v = build_gensim_w2v(w2v_path=vec_config["vec-path"], 38 | load_config=vec_config["loading"], 39 | other_config=vec_config["gensim-others"]) 40 | w2v_api = build_w2v_api(w2v=w2v, 41 | config=vec_config["api-setting"], 42 | w2v_desc=vec_config["description"]) 43 | dat = build_doc_classification_dataset(name=task_config["name"], 44 | dir_path=task_config["data-path"], 45 | process_config=task_config["process"]) 46 | clf = build_classifier(method=task_config["classifier"]["method"], 47 | other_config=task_config["classifier"]["others"]) 48 | 49 | logger.info("Do evaluation") 50 | eval = ClassificationEvaluator(dat, w2v_api, clf) 51 | res = eval.run_kfold() 52 | 53 | logger.info("Results...") 54 | logger.info(res) 55 | logger.info("Done") 56 | return 57 | 58 | 59 | if __name__ == '__main__': 60 | p = argparse.ArgumentParser("Word similarity Evaluation") 61 | p.add_argument("--vconfig", type=Path, 62 | help="Vector Configulation file path") 63 | p.add_argument("--tconfig", type=Path, help="Task 
Configulation file path") 64 | p.add_argument("--log", type=Path, default=None, help="Log path") 65 | args = p.parse_args() 66 | 67 | # setup logger 68 | logfile = args.log if args.log is not None else Path( 69 | f"{datetime.now().strftime('%Y%m%d_%H:%M')}.log") 70 | add_file_handler(logfile) 71 | 72 | logger.info("Arguments...") 73 | for arg, val in sorted(vars(args).items()): 74 | logger.info("{}: {}".format(arg, val)) 75 | 76 | # main 77 | run_docclf(args.vconfig, args.tconfig) 78 | -------------------------------------------------------------------------------- /evaluation/run_wordsim.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | from datetime import datetime 4 | import logging 5 | from pathlib import Path 6 | import yaml 7 | 8 | from dataset.pairwise import build_pairwise_similarity_datasets 9 | from eval.evaluator import W2VSimilarityEvaluator 10 | from models.w2v import build_gensim_w2v, build_w2v_api 11 | 12 | 13 | logging.basicConfig( 14 | style="{", 15 | format='{levelname} {asctime} [{module}:{funcName}:{lineno}] {message}', 16 | datefmt="%m-%d-%Y %H:%M:%S", 17 | level=logging.INFO, 18 | ) 19 | logger = logging.getLogger(__name__) 20 | 21 | 22 | def add_file_handler(logfile: Path) -> None: 23 | file_handler = logging.FileHandler(logfile) 24 | logger.addHandler(file_handler) 25 | return 26 | 27 | 28 | def run_wordsim(vconfig_path, tconfig_path): 29 | with open(vconfig_path) as fv, open(tconfig_path) as ft: 30 | vec_config = yaml.load(fv, Loader=yaml.CLoader) 31 | task_config = yaml.load(ft, Loader=yaml.CLoader) 32 | logger.info("vector configulatation: {}".format(vec_config)) 33 | logger.info("task configulatation: {}".format(task_config)) 34 | 35 | logger.info("Setup...") 36 | w2v = build_gensim_w2v(w2v_path=vec_config["vec-path"], 37 | load_config=vec_config["loading"], 38 | other_config=vec_config["gensim-others"]) 39 | w2v_api = build_w2v_api(w2v=w2v, 40 | config=vec_config["api-setting"], 41 | w2v_desc=vec_config["description"]) 42 | dats = build_pairwise_similarity_datasets(name=task_config["name"], 43 | dir_path=task_config["data-path"], 44 | process_config=task_config["process"]) 45 | 46 | logger.info("Do evaluation") 47 | name2score = {} 48 | for d in dats: 49 | logger.info("In {}".format(d.name)) 50 | eval = W2VSimilarityEvaluator(d, w2v_api) 51 | score = eval.run() 52 | name2score[d.name] = score 53 | logger.info("Out {}".format(d.name)) 54 | 55 | logger.info("Results...") 56 | for (k, v) in name2score.items(): 57 | logger.info("{}: {}".format(k, v)) 58 | logger.info("Done") 59 | return 60 | 61 | 62 | if __name__ == '__main__': 63 | p = argparse.ArgumentParser("Document classification Evaluation") 64 | p.add_argument("--vconfig", type=Path, 65 | help="Vector Configulation file path") 66 | p.add_argument("--tconfig", type=Path, help="Task Configulation file path") 67 | p.add_argument("--log", type=Path, default=None, help="Log path") 68 | args = p.parse_args() 69 | 70 | # setup logger 71 | logfile = args.log if args.log is not None else Path( 72 | f"{datetime.now().strftime('%Y%m%d_%H:%M')}.log") 73 | add_file_handler(logfile) 74 | 75 | logger.info("Arguments...") 76 | for arg, val in sorted(vars(args).items()): 77 | logger.info("{}: {}".format(arg, val)) 78 | 79 | # main 80 | run_wordsim(args.vconfig, args.tconfig) 81 | -------------------------------------------------------------------------------- /training/README.md: -------------------------------------------------------------------------------- 1 | 
# Training chiVe 2 | 3 | chiVe is trained using the [gensim](https://radimrehurek.com/gensim/index.html) library. 4 | 5 | ## Training procedure 6 | 7 | ### 0. Setup 8 | 9 | Install libraries with `pip install -r requirements.txt`. 10 | Note that, for reproducibility, the versions of Sudachi in `requirements.txt` are pinned. 11 | If you want to use the latest ones, modify them or update Sudachi. 12 | 13 | ### 1. Prepare corpus 14 | 15 | Training a word2vec model requires a corpus. 16 | 17 | We load the corpus using the [`gensim.models.word2vec.LineSentence`](https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.LineSentence) or [`PathLineSentences`](https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.LineSentence) class. 18 | Since they assume that words are already preprocessed and whitespace-separated, we preprocess our corpus. 19 | 20 | Put corpus text file(s) in a directory (subdirectories are NOT searched), and run 21 | `python prepare_corpus.py --input path/to/input_dir --output path/to/output_dir`. 22 | This script does the following: 23 | 24 | - Analyzes the input text files using Sudachi and outputs each morpheme separated by a whitespace (分かち書き/wakachi-gaki). 25 | - Each morpheme is converted into its `normalized_form` (正規化形). 26 | - Analysis is performed 3 times, once for each Sudachi split mode (A/B/C). 27 | - Outputs 3 files per input file. 28 | - Analysis is performed per line. 29 | - Words whose normalized_form is a whitespace (" ") are skipped. 30 | 31 | Set `--skip-existing` to skip the analysis for a pair of (input file, Sudachi split mode) that has already been processed. 32 | Whether a pair can be skipped is judged based on the existence of the output file and its line count (in case the analysis was interrupted). 33 | 34 | Use `--mode` to specify which Sudachi split modes to use, e.g. `--mode AC` to use only modes A and C. 35 | 36 | example: 37 | 38 | ```bash 39 | ls -A data/raw_corpus/*.txt | xargs -L 1 -I{} -P 20 \ 40 | python prepare_corpus.py --skip-existing \ 41 | --input {} --output data/corpus/ 42 | ``` 43 | 44 | ### 2. Training 45 | 46 | Use `train_chive.py` to run training using the gensim word2vec class. 47 | `--input` should be set to the output directory of `prepare_corpus.py`. 48 | 49 | example: 50 | 51 | ```bash 52 | python train_chive.py \ 53 | --input data/corpus/ --output model/full/ \ 54 | --epochs 15 --min-count 90 \ 55 | --save-epochs 3 --keep-ckpt 5 \ 56 | --worker 16 57 | ``` 58 | 59 | chiVe (up to v1.3) is trained for 15 epochs with the following parameters. 60 | 61 | ```json 62 | { 63 | "vector_size": 300, 64 | "window": 8, 65 | "sg": 1, 66 | "hs": 0, 67 | "n_negative": 5, 68 | "threshold_downsample": 1e-5, 69 | "alpha": 0.025, 70 | "min_alpha": 0.0001 71 | } 72 | ``` 73 | 74 | You can resume training from a saved checkpoint (auto-detected from the output directory). 75 | You should resume with the same parameters and corpus; otherwise the result may not be what you expect. 76 | Note that resuming training is not a built-in gensim feature and may introduce some loss of precision. 77 | 78 | ### 3. Convert to distribution formats 79 | 80 | The trained (full) model contains values used to update model parameters, which are not necessary for querying. 81 | Use `convert_model_format.py` to generate the text and `gensim.KeyedVectors` formats for distribution.
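The converter is invoked as in the example below. For reference, the converted `gensim.KeyedVectors` file can then be loaded for querying without the full model (a minimal sketch; the file paths are illustrative and depend on the input file name):

```py
from gensim.models import KeyedVectors

# KeyedVectors format: embeddings only, no training state
kv = KeyedVectors.load("model/release/chive-1.3-mc90_gensim/chive-1.3-mc90.kv")

# text (word2vec) format
kv_txt = KeyedVectors.load_word2vec_format(
    "model/release/chive-1.3-mc90_text/chive-1.3-mc90.txt", binary=False)

print(kv.most_similar("猫", topn=5))
```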
82 | 83 | example: 84 | 85 | ```bash 86 | python convert_model_format.py \ 87 | --input model/full/ --output model/release/ 88 | ``` 89 | 90 | [magnitude](https://github.com/plasticityai/magnitude) does not seem to be maintained, so we stopped distributing that format starting from chiVe v1.3. 91 | -------------------------------------------------------------------------------- /evaluation/README.md: -------------------------------------------------------------------------------- 1 | # Evaluation of word vectors 2 | 3 | See the following paper for details. 4 | 5 | - 真鍋陽俊, 岡照晃, 海川祥毅, 髙岡一馬, 内田佳孝, 浅原正幸. [複数粒度の分割結果に基づく日本語単語分散表現](https://www.anlp.jp/proceedings/annual_meeting/2019/pdf_dir/P8-5.pdf). 言語処理学会第 25 回年次大会, 2019. 6 | 7 | ## Setup 8 | 9 | Download the datasets. 10 | The default task setting files (`resources/**/*.yaml`) assume the datasets are located under `data/`; otherwise you need to modify them. 11 | 12 | Download the word vectors, and set the path in the vec setting file (`resources/vec/*.yaml`). 13 | The default vec setting assumes the `gensim.KeyedVectors` format. 14 | 15 | Install modules with `pip install -r requirements.txt`. 16 | 17 | ## Tasks 18 | 19 | ### Word similarity 20 | 21 | Calculate the Spearman rank-order correlation coefficient between human-annotated similarity scores and word-vector similarity. 22 | 23 | - Similarity of word vectors is measured by cosine similarity. 24 | - If a target word consists of multiple (Sudachi) words, we use the average vector of those (Sudachi) words. 25 | 26 | We use the following datasets: 27 | 28 | - [JWSAN](http://www.utm.inf.uec.ac.jp/JWSAN/index.html) 29 | - Download jwsan.zip, unzip it, and place it under `data/` 30 | - [Japanese Word Similarity Dataset](https://github.com/tmu-nlp/JapaneseWordSimilarityDataset) (JWSD) 31 | - Clone the repository under `data/` 32 | 33 | commands: 34 | 35 | ```bash 36 | # JWSAN 37 | python run_wordsim.py \ 38 | --tconfig resources/sim/jwsan.yaml \ 39 | --vconfig resources/vec/chive-1.3-mc90-sudachic.yaml 40 | 41 | # JWSD 42 | python run_wordsim.py \ 43 | --tconfig resources/sim/tmu-sudachic.yaml \ 44 | --vconfig resources/vec/chive-1.3-mc90-sudachic.yaml 45 | ``` 46 | 47 | ### Document classification 48 | 49 | Train a classifier using word vectors as feature vectors. 50 | 51 | - The document vector is calculated by averaging the word vectors of nouns in the document. 52 | - LogisticRegression is used as the classifier. 53 | - We conduct 10-fold cross-validation.
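In outline, this is what `run_docclf.py` computes via `ClassificationEvaluator.run_kfold` (a minimal sketch; `w2v_api`, `docs`, and `labels` stand in for the objects built from the config files):

```py
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score

# docs: noun tokens per document, labels: category ids (placeholders here)
doc_vecs = np.array([w2v_api.get_mean_vector(doc) for doc in docs])
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=46)
scores = cross_val_score(LogisticRegression(), doc_vecs, labels, cv=cv)
print(scores.mean(), scores.var())
```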
54 | 55 | We used following datasets: 56 | 57 | - [livedoor news corpus](https://www.rondhuit.com/download.html#ldcc) 58 | - Download ldcc-20140209.tar.gz, untar and place under `data/` 59 | 60 | commands: 61 | 62 | ```bash 63 | # livedoor news corpus 64 | python run_docclf.py \ 65 | --tconfig resources/clf/livedoor-sudachic.yaml \ 66 | --vconfig resources/vec/chive-1.3-mc30-sudachic.yaml 67 | ``` 68 | 69 | ## Results 70 | 71 | | version | jwsan-1400 類似度 | jwsan-1400 関連度 | jwsd-verb | jwsd-adj | jwsd-noun | jwsd-adv | livedoor-acc | 72 | | --------- | ----------------- | ----------------- | --------- | -------- | --------- | -------- | -------------- | 73 | | v1.3 mc5 | 0.493 | 0.626 | 0.309 | 0.459 | 0.351 | 0.231 | 0.862+1.46e-4 | 74 | | v1.3 mc15 | 0.492 | 0.627 | 0.318 | 0.465 | 0.354 | 0.239 | 0.860+1.48e-4 | 75 | | v1.3 mc30 | 0.496 | 0.626 | 0.318 | 0.459 | 0.354 | 0.250 | 0.859+1.23e-4 | 76 | | v1.3 mc90 | 0.493 | 0.622 | 0.324 | 0.460 | 0.344 | 0.261 | 0.857+1.55e-4 | 77 | | | | | | | | | | 78 | | v1.2 mc5 | 0.520 | 0.633 | 0.316 | 0.466 | 0.355 | 0.297 | 0.865+0.436e-4 | 79 | | v1.2 mc15 | 0.513 | 0.629 | 0.315 | 0.461 | 0.353 | 0.294 | 0.862+0.710e-4 | 80 | | v1.2 mc30 | 0.515 | 0.631 | 0.311 | 0.458 | 0.354 | 0.289 | 0.860+0.546e-4 | 81 | | v1.2 mc90 | 0.512 | 0.627 | 0.307 | 0.463 | 0.345 | 0.281 | 0.861+0.778e-4 | 82 | 83 | ## NOTE 84 | 85 | - Current implementation uses zero-vector for OOV words and cosine-similarity with zero-vector is 1.0. 86 | This may affect the evaluation result. 87 | -------------------------------------------------------------------------------- /evaluation/dataset/labeled.py: -------------------------------------------------------------------------------- 1 | 2 | import itertools 3 | from pathlib import Path 4 | 5 | from dataset.base import BaseDataset, BaseInstance 6 | from utils import build_tokenizer 7 | 8 | 9 | class TextClassificationDataset(BaseDataset): 10 | def __init__(self, samples, label_name=None, name=None): 11 | super(TextClassificationDataset, self).__init__(samples, name) 12 | self.label_name = label_name 13 | 14 | def get_xys(self): 15 | xs = [b[0].word_list for b in self.batch_iter(1, rand_flg=False)] 16 | ys = [b[0].gold for b in self.batch_iter(1, rand_flg=False)] 17 | return xs, ys 18 | 19 | @staticmethod 20 | def build(docs, label_ids, label_name=False): 21 | """ 22 | args: 23 | - docs (list>): documents 24 | - label_ids (list): labels 25 | - label_name (list): label names 26 | """ 27 | assert len(docs) == len(label_ids), "Inconsistent length: {} != {}" \ 28 | .format(len(docs), len(label_ids)) 29 | insts = [TextClassificationInstance(d, l) 30 | for (d, l) in zip(docs, label_ids)] 31 | return TextClassificationDataset(samples=insts, label_name=label_name) 32 | 33 | 34 | class TextClassificationInstance(BaseInstance): 35 | def __init__(self, word_list, label_id): 36 | """ 37 | args: 38 | - word_list (list): word sequence 39 | - label_id (int): label id 40 | """ 41 | super(TextClassificationInstance, self).__init__() 42 | assert type(word_list) == list 43 | assert type(label_id) == int 44 | self.word_list = word_list 45 | self.gold = label_id 46 | 47 | def get_content(self): 48 | return (self.word_list, self.gold) 49 | 50 | 51 | def build_doc_classification_dataset(name, dir_path, process_config): 52 | if name == "livedoor": 53 | txt_xs, ys = build_livedoor(dir_path, process_config) 54 | else: 55 | raise ValueError() 56 | dat = TextClassificationDataset.build(txt_xs, ys) 57 | return dat 58 | 59 | 60 | def build_livedoor(dir_path, 
process_config): 61 | LIVEDOOR_LABEL_LIST = ["dokujo-tsushin", "kaden-channel", "movie-enter", 62 | "smax", "topic-news", "it-life-hack", 63 | "livedoor-homme", "peachy", "sports-watch"] 64 | pre_tokenized = process_config["pre-tokenized"] 65 | tok = None if pre_tokenized else build_tokenizer( 66 | process_config["tokenizer"]["name"], process_config["tokenizer"]["others"]) 67 | 68 | base = Path(dir_path) 69 | label_names = [] 70 | txt_xs = [] 71 | for label_dir in sorted(base.glob("*")): 72 | if not label_dir.is_dir(): 73 | continue 74 | label_name = label_dir.name 75 | assert label_name in LIVEDOOR_LABEL_LIST, "Invalid label directory" 76 | for doc_path in sorted(label_dir.glob("*.txt")): # get document files 77 | if pre_tokenized == True: 78 | raise NotImplementedError() 79 | else: 80 | doc = tokenize_doc(doc_path, tok, process_config) 81 | txt_xs.append(doc) 82 | label_names.append(label_name) 83 | assert len(txt_xs) == len(label_names), "Inconsistent length" 84 | ys = [LIVEDOOR_LABEL_LIST.index(n) for n in label_names] 85 | return txt_xs, ys 86 | 87 | 88 | def tokenize_doc(doc_path, tok, process_config): 89 | ms = [] 90 | with Path(doc_path).open() as fi: 91 | for l in fi: 92 | ms.append(tok.tokenize(l)) 93 | ms = itertools.chain.from_iterable(ms) 94 | 95 | if (used_pos := process_config["used-pos"]) != "all": 96 | ms = (m for m in ms if m.part_of_speech()[0] in used_pos) 97 | 98 | match (form := process_config["tokenizer"]["others"]["form"]): 99 | case "surface": return [m.surface() for m in ms] 100 | case "normalized": return [m.normalized_form() for m in ms] 101 | case _: raise ValueError(f"invalid tokenizer form: {form}") 102 | -------------------------------------------------------------------------------- /evaluation/dataset/pairwise.py: -------------------------------------------------------------------------------- 1 | 2 | import pandas as pd 3 | from pathlib import Path 4 | 5 | from dataset.base import BaseDataset, BaseInstance 6 | from utils import build_tokenizer 7 | 8 | 9 | class PairwiseSimilarityDataset(BaseDataset): 10 | def __init__(self, samples, name=None): 11 | super(PairwiseSimilarityDataset, self).__init__(samples, name) 12 | 13 | @staticmethod 14 | def build_from_triplet(col1s, col2s, sims): 15 | """ 16 | args: 17 | - col1s (list): first elements in the given pairs 18 | - col2s (list): second elements in the given pairs 19 | - sims (list): scores 20 | - tok (Tokenizer): Tokenizer for entryies. If None, no tokenization 21 | """ 22 | assert len(col1s) == len(col2s) and len(col1s) == len(sims), \ 23 | "Inconsistent length: {} / {} / {}" \ 24 | .format(len(col1s), len(col2s), len(sims)) 25 | insts = [PairwiseSimilarityInstance(c1, c2, sim) 26 | for (c1, c2, sim) in zip(col1s, col2s, sims)] 27 | return PairwiseSimilarityDataset(samples=insts) 28 | 29 | 30 | class PairwiseSimilarityInstance(BaseInstance): 31 | def __init__(self, e1, e2, score): 32 | """ 33 | args: 34 | - e1, e2 (list): Pair of (tokenized) Strings to be evaluated. 
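            - score (int or float): human-annotated similarity score, stored as the gold label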
35 | """ 36 | super(PairwiseSimilarityInstance, self).__init__() 37 | assert type(e1) == list 38 | assert type(e2) == list 39 | assert type(score) in [ 40 | int, float], "Invalid type: {}".format(type(score)) 41 | self._triple = (e1, e2, score) 42 | self.gold = score 43 | 44 | def get_content(self): 45 | return self._triple 46 | 47 | 48 | def build_pairwise_similarity_datasets(name, dir_path, process_config): 49 | # prepare to generate dataset 50 | if process_config["use-tokenizer"] == True: 51 | tok = build_tokenizer(process_config["tokenizer"]["name"], 52 | process_config["tokenizer"]["others"]) 53 | else: 54 | tok = None 55 | 56 | if name == "tmu": 57 | datasets = build_tmu(dir_path, tok) 58 | elif name == "jwsan": 59 | datasets = build_jwsan(dir_path, tok) 60 | else: 61 | raise ValueError("Invalid dataset name: {}".format(name)) 62 | return datasets 63 | 64 | 65 | def build_tmu(dir_path, tok=None): 66 | _filenames = ['score_adj.csv', 'score_adv.csv', 67 | 'score_noun.csv', 'score_verb.csv'] 68 | base = Path(dir_path) 69 | dats = [] 70 | for fn in _filenames: 71 | df = pd.read_csv(str((base/fn).absolute()), encoding="utf-8") 72 | if tok == None: 73 | col1s = [[c] for c in list(df["word1"])] 74 | col2s = [[c] for c in list(df["word2"])] 75 | else: 76 | col1s = [tok.wakati(c) for c in list(df["word1"])] 77 | col2s = [tok.wakati(c) for c in list(df["word2"])] 78 | scores = [float(score) for score in list(df["mean"])] 79 | dat = PairwiseSimilarityDataset.build_from_triplet( 80 | col1s, col2s, scores) 81 | dat.name = "tmu-{}".format(fn) 82 | dats.append(dat) 83 | return dats 84 | 85 | 86 | def build_jwsan(dir_path, tok=None): 87 | _filenames = ['jwsan-1400.csv', 'jwsan-2145.csv'] 88 | _elms = ['similarity', 'association'] 89 | base = Path(dir_path) 90 | dats = [] 91 | for fn in _filenames: 92 | df = pd.read_csv(str((base/fn).absolute()), encoding="utf-8") 93 | if tok == None: 94 | col1s = [[c] for c in list(df["word1"])] 95 | col2s = [[c] for c in list(df["word2"])] 96 | else: 97 | col1s = [tok.wakati(c) for c in list(df["word1"])] 98 | col2s = [tok.wakati(c) for c in list(df["word2"])] 99 | for e in _elms: 100 | scores = [float(score) for score in list(df[e])] 101 | dat = PairwiseSimilarityDataset.build_from_triplet( 102 | col1s, col2s, scores) 103 | dat.name = "jwsan-{}-{}".format(fn, e) 104 | dats.append(dat) 105 | return dats 106 | -------------------------------------------------------------------------------- /training/prepare_corpus.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from collections.abc import Iterable 3 | import logging 4 | from pathlib import Path 5 | from tqdm import tqdm 6 | 7 | import sudachipy 8 | 9 | logging.basicConfig( 10 | style="{", 11 | format='{levelname} {asctime} [{module}:{funcName}:{lineno}] {message}', 12 | datefmt="%m/%d/%Y %H:%M:%S", 13 | level=logging.INFO, 14 | ) 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | def parse_args(): 19 | p = argparse.ArgumentParser( 20 | "Wakati-split given texts per line with A/B/C mode and normalized_form.") 21 | p.add_argument("--input", type=Path, 22 | help="text file (line-by-line) or a directory contains them") 23 | p.add_argument("--output", type=Path, 24 | help="directory to output") 25 | 26 | p.add_argument("--mode", type=str, default="ABC", 27 | help="split mode to use (default: ABC)") 28 | p.add_argument("--skip-existing", action="store_true", 29 | help="if set, skip processing if the output file already exists") 30 | 31 | args = 
p.parse_args() 32 | return args 33 | 34 | 35 | def list_textfiles(input: Path) -> Iterable[Path]: 36 | """iterate over text files in the input directory (or input text file)""" 37 | if input.is_file(): 38 | return [input] 39 | if input.is_dir(): 40 | return input.glob("*.txt") 41 | return [] 42 | 43 | 44 | def mode2str(mode: sudachipy.SplitMode) -> str: 45 | """convert sudachipy.SplitMode into str""" 46 | match mode: 47 | case sudachipy.SplitMode.A: return "A" 48 | case sudachipy.SplitMode.B: return "B" 49 | case sudachipy.SplitMode.C: return "C" 50 | 51 | 52 | def str2mode(modestr: str) -> sudachipy.SplitMode: 53 | """parse sudachipy.SplitMode from str""" 54 | match modestr.strip(): 55 | case "A" | "a": return sudachipy.SplitMode.A 56 | case "B" | "b": return sudachipy.SplitMode.B 57 | case "C" | "c": return sudachipy.SplitMode.C 58 | case _: raise ValueError(f"cannot parse {modestr} as SplitMode.") 59 | 60 | 61 | def output_filepath(input_file: Path, output_dir: Path, mode: sudachipy.SplitMode) -> Path: 62 | """generate output file path for the current input""" 63 | assert input_file.is_file() 64 | assert output_dir.is_dir() 65 | 66 | filename = f"{input_file.stem}_wakati_{mode2str(mode)}.txt" 67 | return output_dir / filename 68 | 69 | 70 | def count_line(file: Path) -> int: 71 | """count line of a file""" 72 | assert file.is_file() 73 | with file.open() as fi: 74 | count = sum(1 for _ in fi) 75 | return count 76 | 77 | 78 | def able_to_skip(infile: Path, outfile: Path) -> bool: 79 | """check if we can skip processing based on line count.""" 80 | if not outfile.exists(): 81 | return False 82 | 83 | lines_in = count_line(infile) 84 | lines_out = count_line(outfile) 85 | if lines_in == lines_out: 86 | return True 87 | return False 88 | 89 | 90 | def wakati(tok: sudachipy.Tokenizer, sentence: str) -> str: 91 | """tokenize given sentence by the toknizer and return their normalized form joining with spaces.""" 92 | morphemes = tok.tokenize(sentence) 93 | norm_forms = (m.normalized_form() for m in morphemes) 94 | return ' '.join(m for m in norm_forms if m != " ") 95 | 96 | 97 | def wakati_file(infile: Path, outfile: Path, tok: sudachipy.Tokenizer) -> (): 98 | """apply wakati to each line of infile and write to outfile.""" 99 | with infile.open() as fi, outfile.open("w") as fo: 100 | for line in tqdm(fi): 101 | line = line.strip() 102 | if line == "": 103 | fo.write("\n") 104 | continue 105 | fo.write(wakati(tok, line) + "\n") 106 | return 107 | 108 | 109 | def main(): 110 | args = parse_args() 111 | args.output.mkdir(parents=True, exist_ok=True) 112 | 113 | dic = sudachipy.Dictionary() 114 | for mode in map(str2mode, args.mode): 115 | tok = dic.create(mode=mode) 116 | for file in list_textfiles(args.input): 117 | outfile = output_filepath(file, args.output, mode) 118 | 119 | # check if skip 120 | if args.skip_existing and outfile.exists(): 121 | if able_to_skip(file, outfile): 122 | logger.info( 123 | f"skip {file=} with split mode {mode2str(mode)}.") 124 | continue 125 | logger.info( 126 | f"{outfile=} exists but processing seems not finished.") 127 | 128 | # process 129 | logger.info(f"process {file=} with split mode {mode2str(mode)}.") 130 | logger.info(f"output file: \"{outfile}\"") 131 | wakati_file(file, outfile, tok) 132 | return 133 | 134 | 135 | if __name__ == '__main__': 136 | main() 137 | -------------------------------------------------------------------------------- /evaluation/models/w2v.py: -------------------------------------------------------------------------------- 1 | """ 
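Word2Vec wrappers used in the evaluation: model loading, OOV handling, and phrase similarity.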
2 | Basically this is for Japanese 3 | """ 4 | 5 | from gensim.models import Word2Vec, KeyedVectors 6 | from gensim.models.keyedvectors import Word2VecKeyedVectors 7 | import logging 8 | import numpy as np 9 | from sklearn.decomposition import PCA 10 | 11 | from utils import cos_sim, get_zero_vector 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | class WordVectorizer(): 17 | def __init__(self): 18 | pass 19 | 20 | def query(self, surface, return_oov_flag=False): 21 | """ 22 | generate the vector for the given word. 23 | --- Don't OVERRIDE --- 24 | args: 25 | - surface (str): target token 26 | return: 27 | - vec (np.ndarray): generated vector 28 | - is_oov (bool): OOV flag 29 | """ 30 | v, oov_flag = self._get_vec_with_oov(surface) 31 | if return_oov_flag: 32 | return v, oov_flag 33 | else: 34 | return v 35 | 36 | def _get_vec_with_oov(self, surface): 37 | raise NotImplementedError() 38 | 39 | def query_as_batch(self, surface_list, return_oov_flag=False): 40 | if return_oov_flag == False: 41 | return np.array([self.query(s, return_oov_flag) for s in surface_list]) 42 | else: 43 | # TODO: maybe slow 44 | vecs = [] 45 | flags = [] 46 | for s in surface_list: 47 | v, oov_flag = self.query(s, return_oov_flag) 48 | vecs.append(v) 49 | flags.append(oov_flag) 50 | return np.array(vecs), flags 51 | 52 | 53 | class Word2VecAPI(WordVectorizer): 54 | """ 55 | Word2Vec API for OOV handling or others 56 | """ 57 | 58 | def __init__(self, w2v, config=None, train_desc=None): 59 | """ 60 | args: 61 | - w2v (Word2Vec or Word2VecKeyedVectors): trained word2vec 62 | - config (dict): configs for adhoc process such as centering etc... 63 | - train_desc (dict): Description. This contains info for `Training Word2Vec` 64 | """ 65 | assert type(w2v) in [ 66 | Word2Vec, KeyedVectors], "Invalid word2vec type: {}".format(type(w2v)) 67 | self._w2v = w2v 68 | self._config = config if config is not None else {} 69 | self.train_desc = train_desc if train_desc is not None else {} 70 | self.postprocess() 71 | 72 | def postprocess(self): 73 | if 'post-process' in self._config: 74 | post_config = self._config["post-process"] 75 | if 'pca' in post_config: 76 | d = post_config["pca"] 77 | assert self._w2v.vector_size > d, "dim should be lower than original" 78 | self.do_pca(d) 79 | if 'abtt' in post_config: 80 | self.do_all_but_the_top(post_config["abtt"]) 81 | else: 82 | pass # do nothing 83 | 84 | def do_pca(self, d): 85 | self._w2v.vectors = pca(self._w2v.vectors, d) 86 | self._w2v.vector_size = d 87 | 88 | def do_all_but_the_top(self, d=None): 89 | if d == None: 90 | d = self._w2v.vector_size // 100 91 | self._w2v.vectors = all_but_the_top(self._w2v.vectors, d) 92 | 93 | def _get_vec_with_oov(self, surface): 94 | if surface in self._w2v: 95 | return self._pick_vec(surface), False 96 | else: 97 | logger.warn("Out-of-vocab: surface={}".format(surface)) 98 | return self._oov_vec(surface), True 99 | 100 | def _oov_vec(self, surface): 101 | return get_zero_vector(self._w2v.vector_size) 102 | 103 | def _pick_vec(self, surface): 104 | return self._w2v[surface] 105 | 106 | def get_mean_vector(self, surfaces, ignore_oov=True): 107 | vecs, oovs = self.query_as_batch(surfaces, return_oov_flag=True) 108 | if ignore_oov: 109 | kvs = [not flag for flag in oovs] 110 | vecs = vecs[kvs] 111 | if all(oovs): 112 | logging.warn("given surfaces are all OOV: {}".format(surfaces)) 113 | vecs = np.expand_dims(get_zero_vector(self._w2v.vector_size), 114 | axis=0) 115 | return vecs.mean(axis=0) 116 | 117 | def 
cal_phrase_similarity(self, surfaces1, surfaces2): 118 | """ 119 | calculate phrase similarity. 120 | currently, averaged phrase embedding is only available. 121 | """ 122 | v1 = self.get_mean_vector(surfaces1, ignore_oov=True) 123 | v2 = self.get_mean_vector(surfaces2, ignore_oov=True) 124 | return cos_sim(v1, v2) 125 | 126 | 127 | class Morph2vecAPI(WordVectorizer): 128 | def __init__(self): 129 | raise NotImplementedError("WIP") 130 | 131 | 132 | def all_but_the_top(lookup_mat, d): 133 | """ 134 | args: 135 | - lookup_mat (np.ndarray): postprocessed vectors, shape = (n_word, dim) 136 | - d (int): number of principal components 137 | """ 138 | # centering 139 | center = lookup_mat.mean(axis=0) 140 | new_lookup_mat = lookup_mat - np.broadcast_to(center, lookup_mat.shape) 141 | 142 | # remove principal component 143 | pca = PCA(n_components=d, random_state=46) 144 | pca.fit(new_lookup_mat) 145 | sim = new_lookup_mat.dot(pca.components_.T) # shape = (n_word, d) 146 | new_lookup_mat -= new_lookup_mat.dot( 147 | pca.components_.T).dot(pca.components_) 148 | return new_lookup_mat 149 | 150 | 151 | def pca(lookup_mat, d): 152 | pca = PCA(n_components=d, random_state=46) 153 | return pca.fit_transform(lookup_mat) 154 | 155 | 156 | def build_w2v_api(w2v, config, w2v_desc): 157 | return Word2VecAPI(w2v=w2v, config=config, train_desc=w2v_desc) 158 | 159 | 160 | def build_gensim_w2v(w2v_path, load_config, other_config): 161 | if load_config["w2v-fmt"] == False: 162 | w2v = Word2Vec.load(w2v_path).wv 163 | elif load_config["w2v-fmt"] == True: 164 | if load_config["fmt"] == "bin": 165 | w2v = KeyedVectors.load_word2vec_format(w2v_path, binary=True) 166 | elif load_config["fmt"] == "txt": 167 | w2v = KeyedVectors.load_word2vec_format(w2v_path, binary=False) 168 | elif load_config["fmt"] == "kv": 169 | w2v = KeyedVectors.load(w2v_path) 170 | else: 171 | raise ValueError("Invalid format: {}".format(load_config["fmt"])) 172 | else: 173 | raise ValueError( 174 | "w2v-fmt should be bool: {}".format(load_config["w2v-fmt"])) 175 | return w2v 176 | -------------------------------------------------------------------------------- /training/train_chive.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from dataclasses import dataclass 3 | import json 4 | import logging 5 | from pathlib import Path 6 | import time 7 | 8 | from gensim.models import Word2Vec 9 | from gensim.models.word2vec import LineSentence, PathLineSentences 10 | from gensim.models.callbacks import CallbackAny2Vec 11 | 12 | 13 | logging.basicConfig( 14 | style="{", 15 | format='{levelname} {asctime} [{module}:{funcName}:{lineno}] {message}', 16 | datefmt="%m/%d/%Y %H:%M:%S", 17 | level=logging.INFO, 18 | ) 19 | logger = logging.getLogger(__name__) 20 | 21 | CHIVE_VERSION = "1.3" 22 | 23 | 24 | @dataclass 25 | class Config(): 26 | """Configuration for training word2vec model. 27 | 28 | ref: https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec 29 | """ 30 | 31 | vector_size: int = 300 32 | window: int = 8 33 | sg: 0 | 1 = 1 # Training algorithm: 1 for skip-gram; otherwise CBOW. 
34 | hs: 0 | 1 = 0 # 1 for hierachical softmax, 0 for negative sampling 35 | n_negative: int = 5 36 | threshold_downsample: float = 1e-5 37 | 38 | # value to resume training 39 | alpha: float = 0.025 # default value of gensim 40 | min_alpha: float = 0.0001 # default value of gensim 41 | 42 | @staticmethod 43 | def from_file(config_file: Path | None): 44 | """Load values from json file.""" 45 | conf = {} 46 | if config_file is not None: 47 | with config_file.open() as fi: 48 | conf = json.load(fi) 49 | return Config(**conf) 50 | 51 | 52 | class LogLossCallback(CallbackAny2Vec): 53 | """callback to log loss and time. 54 | 55 | Note that training loss is reset when you resume training.""" 56 | 57 | def __init__(self, logfile: Path, start_time: float, start_epoch: int = 0): 58 | self.epochs = start_epoch 59 | self.start_time = start_time 60 | 61 | self.loss_previous_step = 0 62 | self.time_previous_step = start_time 63 | 64 | self.logfile = logfile 65 | return 66 | 67 | def _get_and_record_loss(self, model): 68 | total_loss = model.get_latest_training_loss() 69 | current_time = total_loss - self.loss_previous_step 70 | 71 | self.loss_previous_step = total_loss 72 | return total_loss, current_time 73 | 74 | def _get_and_record_time(self): 75 | now = time.time() 76 | total_time = now - self.start_time 77 | current_time = now - self.time_previous_step 78 | 79 | self.time_previous_step = now 80 | return total_time, current_time 81 | 82 | def on_epoch_end(self, model): 83 | total_loss, current_loss = self._get_and_record_loss(model) 84 | total_time, current_time = self._get_and_record_time() 85 | 86 | with self.logfile.open("a") as f: 87 | f.write(f"{self.epochs}," 88 | f"{total_loss},{current_loss}," 89 | f"{total_time},{current_time}\n") 90 | 91 | self.epochs += 1 92 | return 93 | 94 | 95 | class CheckpointHandler(): 96 | def __init__(self, output_dir: Path, min_count: int, version: str = CHIVE_VERSION, keep_ckpt: int = 3): 97 | # values used to generate filename 98 | self.output_dir = output_dir 99 | self.version = version 100 | self.min_count = min_count 101 | 102 | self.keep = keep_ckpt 103 | self.checkpoints = self.list_checkpoints() 104 | return 105 | 106 | @staticmethod 107 | def epoch_from_file(filename: Path) -> int: 108 | """parse epoch count from a ckpt file name.""" 109 | # NOTE: this depends on the filename pattern 110 | stem = filename.stem 111 | len_prefix = stem.find("_epoch") + len("_epoch") 112 | return int(stem[len_prefix:]) 113 | 114 | def list_checkpoints(self) -> list[Path]: 115 | """list ckpt files under output directory.""" 116 | files = self.output_dir.glob(self.ckpt_filepath(epoch="*").name) 117 | files = list(sorted(files, key=self.epoch_from_file)) 118 | return files 119 | 120 | def latest_ckpt(self) -> Path | None: 121 | """return ckpt with largest epoch, or None if no ckpt found.""" 122 | if len(self.checkpoints) == 0: 123 | return None 124 | return self.checkpoints[-1] 125 | 126 | def ckpt_filepath(self, epoch: int) -> Path: 127 | """generate a path to the ckpt file with given epoch""" 128 | filename = f"chive-{self.version}-mc{self.min_count}_epoch{epoch}.bin" 129 | return self.output_dir / filename 130 | 131 | def save_ckpt(self, epoch: int, save_func): 132 | """save ckpt using given func and remove old ckpts. 133 | 134 | :param save_func: saves data to the given path. 
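        :param epoch: epoch count used to name the checkpoint file.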
135 | """ 136 | new_ckpt = self.ckpt_filepath(epoch) 137 | save_func(new_ckpt) 138 | self.checkpoints.append(new_ckpt) 139 | self.remove_old_ckpt() 140 | return 141 | 142 | def remove_old_ckpt(self): 143 | """remove old ckpts, keeping self.keep_ckpt ckpts.""" 144 | for i in range(len(self.checkpoints) - self.keep): 145 | logger.info(f"remove ckpt {self.checkpoints[i]}") 146 | self.checkpoints[i].unlink() 147 | self.checkpoints = self.checkpoints[-self.keep:] 148 | return 149 | 150 | 151 | class SaveCheckpointCallback(CallbackAny2Vec): 152 | """callback to save ckpts per specified epochs.""" 153 | 154 | def __init__(self, ckpt_handler: CheckpointHandler, save_epochs: int = 5, start_epoch: int = 0): 155 | self.ckpt_handler = ckpt_handler 156 | self.epochs = start_epoch 157 | self.save_epochs = save_epochs 158 | return 159 | 160 | def on_epoch_end(self, model): 161 | self.epochs += 1 162 | if self.epochs % self.save_epochs == 0: 163 | self.ckpt_handler.save_ckpt( 164 | epoch=self.epochs, 165 | save_func=lambda p: model.save(str(p))) 166 | return 167 | 168 | 169 | def parse_args(): 170 | p = argparse.ArgumentParser("Training word embedding by gensim") 171 | p.add_argument("--input", type=Path, 172 | help="tokenized text data (line-by-line) or directory contains them") 173 | p.add_argument("--output", type=Path, 174 | help="directory to output") 175 | 176 | p.add_argument("--epochs", type=int, default=15, 177 | help="how many epochs to run training (default 15)") 178 | p.add_argument("--min-count", type=int, default=90, 179 | help="words that appears less than this would be ignored (default 90)") 180 | p.add_argument("--save-epochs", type=int, default=5, 181 | help="save model every this epochs as checkpoint (default 5)") 182 | p.add_argument("--keep-ckpt", type=int, default=3, 183 | help="how many checkpoints to keep (default 3)") 184 | p.add_argument("--worker", type=int, default=12, 185 | help="how many threads to use during training (default 12)") 186 | 187 | p.add_argument("--config", type=Path, default=None, 188 | help="json file to load config parameters from (optional)") 189 | 190 | args = p.parse_args() 191 | return args 192 | 193 | 194 | def restart_alpha(conf: Config, total_epochs: int, ckpt_epochs: int) -> float: 195 | """calculate alpha for restarting from the given ckpt epochs. 196 | 197 | - learning rate decaies linearly from alpha to min_alpha 198 | - it depends on alpha/min_alpha/total_epoch/crr_epoch 199 | - https://github.com/piskvorky/gensim/blob/e7b441b87a967e22668a2365bcb60a13e9496215/gensim/models/word2vec.py#L1441 200 | - resuming training requires adjusted epochs (i.e. remaining epochs). 
201 | - it affects to lr calculation, and we also need to adjust start_alpha 202 | """ 203 | alpha = conf.alpha 204 | min_alpha = conf.min_alpha 205 | return alpha - (alpha - min_alpha) / total_epochs * ckpt_epochs 206 | 207 | 208 | def main(): 209 | args = parse_args() 210 | args.output.mkdir(parents=True, exist_ok=True) 211 | train_ops = Config.from_file(args.config) 212 | 213 | sents = LineSentence(args.input) if args.input.is_file() \ 214 | else PathLineSentences(args.input) 215 | 216 | ckpt_handler = CheckpointHandler( 217 | args.output, args.min_count, keep_ckpt=args.keep_ckpt) 218 | logfile = args.output / f"TrainLog-mc{args.min_count}.csv" 219 | 220 | latest_ckpt = ckpt_handler.latest_ckpt() 221 | if latest_ckpt is None: 222 | logger.info(f"training from scratch") 223 | model = Word2Vec( 224 | sents, 225 | workers=args.worker, 226 | vector_size=train_ops.vector_size, 227 | window=train_ops.window, 228 | sample=train_ops.threshold_downsample, 229 | sg=train_ops.sg, 230 | hs=train_ops.hs, 231 | negative=train_ops.n_negative, 232 | epochs=args.epochs, 233 | min_count=args.min_count, 234 | alpha=train_ops.alpha, 235 | min_alpha=train_ops.min_alpha, 236 | compute_loss=True, 237 | callbacks=[ 238 | SaveCheckpointCallback(ckpt_handler, args.save_epochs), 239 | LogLossCallback(logfile, time.time()), 240 | ], 241 | ) 242 | else: 243 | # Resume training from the checkpoint. 244 | # Assume to use same corpus and parameters. 245 | logger.info(f"checkpoint found: {latest_ckpt}") 246 | ckpt_epochs = ckpt_handler.epoch_from_file(latest_ckpt) 247 | if ckpt_epochs >= args.epochs: 248 | logger.info( 249 | f"training seems already finished ({latest_ckpt} exists).") 250 | return 251 | 252 | model = Word2Vec.load(str(latest_ckpt)) 253 | model.train( 254 | sents, 255 | total_examples=model.corpus_count, 256 | epochs=args.epochs - ckpt_epochs, 257 | start_alpha=restart_alpha(train_ops, args.epochs, ckpt_epochs), 258 | end_alpha=train_ops.min_alpha, 259 | compute_loss=True, 260 | callbacks=[ 261 | SaveCheckpointCallback( 262 | ckpt_handler, args.save_epochs, start_epoch=ckpt_epochs), 263 | LogLossCallback(logfile, time.time(), start_epoch=ckpt_epochs), 264 | ], 265 | ) 266 | # fix value changed by calling model.train 267 | model.alpha = train_ops.alpha 268 | model.epochs = args.epochs 269 | 270 | logger.info(f"finish training and save model.") 271 | model.save(str(ckpt_handler.ckpt_filepath(args.epochs))) 272 | return 273 | 274 | 275 | if __name__ == '__main__': 276 | main() 277 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # chiVe: Sudachi による日本語単語ベクトル 2 | 3 | [English README](README_en.md) 4 | 5 | ## 概要 6 | 7 | "chiVe" (チャイブ, Suda**chi Vec**tor) は、大規模コーパスと複数粒度分割に基づく日本語単語ベクトルです。 8 | 9 | [Skip-gram アルゴリズム](https://arxiv.org/abs/1301.3781)を元に、word2vec ([gensim](https://radimrehurek.com/gensim/)) を使用して単語分散表現を構築しています。 10 | 11 | 学習コーパスには、v1.0-v1.2 では約 1 億のウェブページ文章を含む国立国語研究所の[日本語ウェブコーパス(NWJC)](https://masayu-a.github.io/NWJC/)、 12 | v1.3 では [CommonCrawl](https://commoncrawl.org/) から取得したウェブページ文章を採用しています。 13 | 14 | 分かち書きにはワークスアプリケーションズの形態素解析器 [Sudachi](https://github.com/WorksApplications/Sudachi) を使用しています。 15 | Sudachi で定義されている A/B/C の 3 つの分割単位でコーパスを解析した結果を元に分散表現の学習を行なっています。 16 | 17 | ## データ 18 | 19 | SudachiDict と chiVe のデータは、AWS の [Open Data Sponsorship Program](https://registry.opendata.aws/sudachi/) によりホストしていただいています。 20 | 21 | | 版 | 最低頻度 | 正規化 | 語彙数 | テキスト | [gensim](https://radimrehurek.com/gensim/) | [Magnitude](https://github.com/plasticityai/magnitude) | 22 | | --------- | -------- | ------ | --------- | --------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------- | 23 | | v1.3 mc5 | 5 | o | 2,530,791 | 3.6GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc5.tar.gz)) | 2.9GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc5_gensim.tar.gz)) | - | 24 | | v1.3 mc15 | 15 | o | 1,186,019 | 1.7GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc15.tar.gz)) | 1.3GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc15_gensim.tar.gz)) | - | 25 | | v1.3 mc30 | 30 | o | 759,011 | 1.1GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc30.tar.gz)) | 0.8GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc30_gensim.tar.gz)) | - | 26 | | v1.3 mc90 | 90 | o | 410,533 | 0.6GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc90.tar.gz)) | 0.5GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc90_gensim.tar.gz)) | - | 27 | | | | | | | | | 28 | | v1.2 mc5 | 5 | o | 3,197,456 | 9.2GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc5.tar.gz)) | 3.8GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc5_gensim.tar.gz)) | 5.5GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc5.magnitude)) | 29 | | v1.2 mc15 | 15 | o | 1,454,280 | 5.0GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc15.tar.gz)) | 1.7GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc15_gensim.tar.gz)) | 2.4GB 
([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc15.magnitude)) | 30 | | v1.2 mc30 | 30 | o | 912,550 | 3.1GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc30.tar.gz)) | 1.1GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc30_gensim.tar.gz)) | 1.5GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc30.magnitude)) | 31 | | v1.2 mc90 | 90 | o | 482,223 | 1.7GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc90.tar.gz)) | 0.6GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc90_gensim.tar.gz)) | 0.8GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc90.magnitude)) | 32 | | | | | | | | | 33 | | v1.1 mc5 | 5 | o | 3,196,481 | 11GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc5.tar.gz)) | 3.6GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc5_gensim.tar.gz)) | 5.5GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc5.magnitude)) | 34 | | v1.1 mc15 | 15 | o | 1,452,205 | 4.7GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc15.tar.gz)) | 1.7GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc15_gensim.tar.gz)) | 2.4GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc15.magnitude)) | 35 | | v1.1 mc30 | 30 | o | 910,424 | 3.0GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc30.tar.gz)) | 1.1GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc30_gensim.tar.gz)) | 1.5GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc30.magnitude)) | 36 | | v1.1 mc90 | 90 | o | 480,443 | 1.6GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc90.tar.gz)) | 0.6GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc90_gensim.tar.gz)) | 0.8GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc90.magnitude)) | 37 | | v1.0 mc5 | 5 | x | 3,644,628 | 12GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.0-mc5.tar.gz)) | 4.1GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.0-mc5_gensim.tar.gz)) | 6.3GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.0-mc5.magnitude)) | 38 | 39 | 全て 300 次元のベクトルです。 40 | 41 | 「最低頻度」は、コーパス内での単語出現回数での足切り基準([gensim](https://radimrehurek.com/gensim/models/word2vec.html) での `min_count` )です。 42 | 43 | 「正規化」は、形態素解析器 Sudachi による表記統制です。例えば `空き缶`, `空缶`, `空き罐`, `空罐`, `空きカン`, `空きかん` はすべて正規化表記 `空き缶` に統制されます。 44 | 45 | | 版 | Sudachi | Sudachi 辞書 | 学習コーパス | 46 | | ---- | ------- | --------------------- | ---------------------------------------------------- | 47 | | v1.3 | v0.6.8 | 20240109-core | CommonCrawl (CC-MAIN-2022-40, warc, first 20k files) | 48 | | v1.2 | v0.4.3 | 20200722-core | NWJC | 49 | | v1.1 | v0.3.0 | 20191030-core | NWJC | 50 | | v1.0 | v0.1.1 | 0.1.1-dictionary-full | NWJC | 51 | 52 | 全て同じ学習アルゴリズムを使用しています。詳細は [training](training) を参照してください。 53 | 54 | ### 「A 単位語のみ」の資源 55 | 56 | [Sudachi 辞書](https://github.com/WorksApplications/SudachiDict)にある A 単位語のみを含む資源です(A 単位語のみでの再学習ではなく、上にある元資源から、B 単位語、C 単位語、OOV 語(Out-of-vocabulary, 辞書にない語)を除いたものです)。 57 | 58 | `v1.1 mc90 aunit` が、自然言語処理ツール [spaCy](https://github.com/explosion/spaCy/) の日本語モデルに使われています。 59 | 60 | | 版 | 語彙数 | テキスト | 
[gensim](https://radimrehurek.com/gensim/) | [Magnitude](https://github.com/plasticityai/magnitude) | 61 | | --------------- | --------------- | --------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------- | 62 | | v1.1 mc5 aunit | 322,094 (10.1%) | 1.1GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc5-aunit.tar.gz)) | 0.4GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc5-aunit_gensim.tar.gz)) | 0.5GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc5-aunit.magnitude)) | 63 | | v1.1 mc15 aunit | 276,866 (19.1%) | 1.0GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc15-aunit.tar.gz)) | 0.3GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc15-aunit_gensim.tar.gz)) | 0.4GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc15-aunit.magnitude)) | 64 | | v1.1 mc30 aunit | 242,658 (26.7%) | 0.8GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc30-aunit.tar.gz)) | 0.3GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc30-aunit_gensim.tar.gz)) | 0.4GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc30-aunit.magnitude)) | 65 | | v1.1 mc90 aunit | 189,775 (39.5%) | 0.7GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc90-aunit.tar.gz)) | 0.2GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc90-aunit_gensim.tar.gz)) | 0.3GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc90-aunit.magnitude)) | 66 | 67 | ### 追加学習用のフルモデル 68 | 69 | chiVe は、各ドメイン(分野)に合わせたデータで追加学習させられます。 70 | chiVe は、追加学習なしでも利用できますが、追加学習することでそのドメイン(分野)でのタスクの性能改善が期待できます。 71 | 72 | chiVe を追加学習するためには、フルモデルを使用してください。詳しい使用方法は、[チュートリアル](docs/continue-training.md)をご覧ください。 73 | 74 | | 版 | [gensim](https://radimrehurek.com/gensim/) (full) | 75 | | --------- | --------------------------------------------------------------------------------------------------------- | 76 | | v1.3 mc5 | 5.5GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc5_gensim-full.tar.gz)) | 77 | | v1.3 mc15 | 2.6GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc15_gensim-full.tar.gz)) | 78 | | v1.3 mc30 | 1.7GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc30_gensim-full.tar.gz)) | 79 | | v1.3 mc90 | 0.9GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc90_gensim-full.tar.gz)) | 80 | | | | 81 | | v1.2 mc5 | 6.7GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc5_gensim-full.tar.gz)) | 82 | | v1.2 mc15 | 3.0GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc15_gensim-full.tar.gz)) | 83 | | v1.2 mc30 | 1.9GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc30_gensim-full.tar.gz)) | 84 | | v1.2 mc90 | 1.0GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc90_gensim-full.tar.gz)) | 85 | 86 | ## 利用方法 87 | 88 | 「テキスト」「gensim」「Magnitude」という 3 つのフォーマットでデータを公開しています。 89 | 90 | ### テキスト 91 | 92 | プレーンテキスト形式のデータ(オリジナルの word2vec C フォーマット)です。 93 | 94 | ``` 95 | 480443 300 
96 | の -0.08274004 -0.091033645 -0.08744463 -0.14393683 -0.053159036 ... 97 | 、 -0.014216528 -0.1027064 -0.07763326 -0.16008057 -0.16116066 ... 98 | て -0.06049706 -0.15483096 0.052628547 -0.12448246 -0.14404581 ... 99 | ... 100 | ``` 101 | 102 | ### gensim 103 | 104 | ライブラリ [gensim](https://radimrehurek.com/gensim/) のための、[KeyedVectors](https://radimrehurek.com/gensim/models/keyedvectors.html) 形式のデータです。 105 | 106 | ```py 107 | import gensim 108 | 109 | vectors = gensim.models.KeyedVectors.load("./chive-1.1-mc90_gensim/chive-1.1-mc90.kv") 110 | 111 | "すだち" in vectors # False, v1.1 では正規化されているため 112 | "酢橘" in vectors # True 113 | 114 | vectors["酢橘"] 115 | # array([-5.68204783e-02, -1.26615226e-01, 3.53190415e-02, -3.67305875e-01, ...]) 116 | 117 | vectors.similarity("酢橘", "徳島") 118 | # 0.3993048 119 | 120 | vectors.most_similar("徳島", topn=5) 121 | # [('愛媛', 0.8229734897613525), 122 | # ('徳島県', 0.786933422088623), 123 | # ('高知', 0.7795713543891907), 124 | # ('岡山', 0.7623447179794312), 125 | # ('徳島市', 0.7415297031402588)] 126 | 127 | vectors.most_similar(positive=["阿波", "高知"], negative=["徳島"], topn=5) 128 | # [('土佐', 0.620033860206604), 129 | # ('阿波踊り', 0.5988592505455017), 130 | # ('よさこい祭り', 0.5783430337905884), 131 | # ('安芸', 0.564490556716919), 132 | # ('高知県', 0.5591559410095215)] 133 | ``` 134 | 135 | ### Magnitude (~v1.2) 136 | 137 | ライブラリ [Magnitude](https://github.com/plasticityai/magnitude) 形式のデータです。デフォルトのパラメーターで変換されています(高度な未知語サポート有り、近似最近傍インデックス無し。Magnitude が公開しているモデルの `Medium` 相当)。 138 | 139 | ```py 140 | from pymagnitude import Magnitude 141 | 142 | vectors = Magnitude("chive1.1-mc90.magnitude") 143 | 144 | "すだち" in vectors # False, v1.1 では正規化されているため 145 | "酢橘" in vectors # True 146 | 147 | vectors.query("すだち") # Magnitudeによるサブワードを使った未知語サポートによる 148 | # array([ 0.03974148, 0.11290773, 0.01493122, -0.05296252, 0.12616251, ...]) 149 | 150 | vectors.most_similar("すだち", topn=5) 151 | # [('あだち', 0.5930323079944302), 152 | # ('すだ椎', 0.5872662462335323), 153 | # ('だち', 0.5797546444016177), 154 | # ('ムクノキ', 0.46228053338159725), 155 | # ('椨', 0.4482612387097178)] 156 | 157 | vectors.similarity("酢橘", "徳島") 158 | # 0.3993048 159 | 160 | vectors.most_similar("徳島", topn=5) 161 | # [('愛媛', 0.8229735), 162 | # ('徳島県', 0.78693324), 163 | # ('高知', 0.7795714), 164 | # ('岡山', 0.7623447), 165 | # ('徳島市', 0.7415296)] 166 | 167 | vectors.closer_than("徳島", "徳島市") 168 | # ['愛媛', '徳島県', '高知', '岡山'] 169 | 170 | vectors.most_similar(positive=["阿波", "高知"], negative=["徳島"], topn=5) 171 | # [('土佐', 0.62003386), 172 | # ('阿波踊り', 0.5988593), 173 | # ('よさこい祭り', 0.578343), 174 | # ('安芸', 0.56449056), 175 | # ('高知県', 0.55915594)] 176 | 177 | vectors.most_similar_cosmul(positive=["阿波", "高知"], negative=["徳島"], topn=5) 178 | # [('土佐', 0.83830714), 179 | # ('よさこい祭り', 0.82048166), 180 | # ('阿波踊り', 0.8168015), 181 | # ('安芸', 0.80880433), 182 | # ('伊予', 0.80250806)] 183 | ``` 184 | 185 | ライブラリを使っての、ダウンロード、リモートでのロード、HTTP 上のリモートでのストリームも可能です。 186 | 187 | ```py 188 | from pymagnitude import Magnitude, MagnitudeUtils 189 | 190 | # ダウンロード 191 | vectors = Magnitude(MagnitudeUtils.download_model("chive-1.1-mc90-aunit", remote_path="https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/")) 192 | # デフォルトのダウンロード先: `~/.magnitude/` 193 | # ファイルが既にダウンロードされている場合は、再度ダウンロードしない 194 | # 引数 `download_dir` でローカルのダウンロード先を変更できる 195 | 196 | # リモートでのロード 197 | vectors = Magnitude("https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc90-aunit.magnitude") 198 | 199 | # HTTP上のリモートでのストリーム 200 | vectors = 
Magnitude("https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc90-aunit.magnitude", stream=True) 201 | vectors.query("徳島") # ローカルにファイルをダウンロードせず、ベクトルをすばやく取得 202 | ``` 203 | 204 | ## ライセンス 205 | 206 | ### v1.0, v1.1, v1.2 207 | 208 | Copyright (c) 2020 National Institute for Japanese Language and Linguistics and Works Applications Co., Ltd. All rights reserved. 209 | 210 | [Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0) の下で[国立国語研究所](https://www.ninjal.ac.jp/)と[株式会社ワークスアプリケーションズ](https://www.worksap.co.jp/)によって提供されています。 211 | 212 | ### v1.3 213 | 214 | Copyright (c) 2024 Works Applications Co., Ltd. All rights reserved. 215 | 216 | [Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0) の下で[株式会社ワークスアプリケーションズ](https://www.worksap.co.jp/)によって提供されています。 217 | 218 | ## Slack 219 | 220 | 開発者やユーザーの方々が質問したり議論するための Slack ワークスペースを用意しています。 221 | 222 | - https://sudachi-dev.slack.com/ 223 | - ([こちら](https://join.slack.com/t/sudachi-dev/shared_invite/enQtMzg2NTI2NjYxNTUyLTMyYmNkZWQ0Y2E5NmQxMTI3ZGM3NDU0NzU4NGE1Y2UwYTVmNTViYjJmNDI0MWZiYTg4ODNmMzgxYTQ3ZmI2OWU)から招待を受けてください) 224 | 225 | ## chiVe の引用 226 | 227 | chiVe について、論文を発表しています; 228 | 229 | - 真鍋陽俊, 岡照晃, 海川祥毅, 髙岡一馬, 内田佳孝, 浅原正幸. [複数粒度の分割結果に基づく日本語単語分散表現](https://www.anlp.jp/proceedings/annual_meeting/2019/pdf_dir/P8-5.pdf). 言語処理学会第 25 回年次大会, 2019. 230 | - 河村宗一郎, 久本空海, 真鍋陽俊, 高岡一馬, 内田佳孝, 岡照晃, 浅原正幸. [chiVe 2.0: Sudachi と NWJC を用いた実用的な日本語単語ベクトルの実現へ向けて](https://www.anlp.jp/proceedings/annual_meeting/2020/pdf_dir/P6-16.pdf). 言語処理学会第 26 回年次大会, 2020. 231 | - 久本空海, 山村崇, 勝田哲弘, 竹林佑斗, 髙岡一馬, 内田佳孝, 岡照晃, 浅原正幸. [chiVe: 製品利用可能な日本語単語ベクトル資源の実現へ向けて](https://www.ieice.org/ken/paper/20200910U1zQ/). 第 16 回テキストアナリティクス・シンポジウム, 2020. ([スライド](https://speakerdeck.com/sorami/chive-zhi-pin-li-yong-ke-neng-nari-ben-yu-dan-yu-bekutoruzi-yuan-falseshi-xian-hexiang-kete)) 232 | 233 | chiVe を論文や書籍、サービスなどで引用される際には、以下の BibTex をご利用ください(基本的には、1 本目の(真鍋+ 2019)を引用してください)。 234 | 235 | ``` 236 | @INPROCEEDINGS{manabe2019chive, 237 | author = {真鍋陽俊, 岡照晃, 海川祥毅, 髙岡一馬, 内田佳孝, 浅原正幸}, 238 | title = {複数粒度の分割結果に基づく日本語単語分散表現}, 239 | booktitle = "言語処理学会第25回年次大会(NLP2019)", 240 | year = "2019", 241 | pages = "NLP2019-P8-5", 242 | publisher = "言語処理学会", 243 | } 244 | ``` 245 | 246 | ``` 247 | @INPROCEEDINGS{kawamura2020chive, 248 | author = {河村宗一郎, 久本空海, 真鍋陽俊, 高岡一馬, 内田佳孝, 岡照晃, 浅原正幸}, 249 | title = {chiVe 2.0: SudachiとNWJCを用いた実用的な日本語単語ベクトルの実現へ向けて}, 250 | booktitle = "言語処理学会第26回年次大会(NLP2020)", 251 | year = "2020", 252 | pages = "NLP2020-P6-16", 253 | publisher = "言語処理学会", 254 | } 255 | ``` 256 | 257 | ``` 258 | @INPROCEEDINGS{hisamoto2020chive, 259 | author = {久本空海, 山村崇, 勝田哲弘, 竹林佑斗, 髙岡一馬, 内田佳孝, 岡照晃, 浅原正幸}, 260 | title = {chiVe: 製品利用可能な日本語単語ベクトル資源の実現へ向けて}, 261 | booktitle = "第16回テキストアナリティクス・シンポジウム", 262 | year = "2020", 263 | pages = "IEICE-NLC2020-9", 264 | publisher = "電子情報通信学会", 265 | } 266 | ``` 267 | -------------------------------------------------------------------------------- /README_en.md: -------------------------------------------------------------------------------- 1 | # chiVe: Japanese Word Embedding with Sudachi 2 | 3 | [日本語 README](README.md) 4 | 5 | ## Abstract 6 | 7 | "chiVe" (Suda**chi Ve**ctor) is a Japanese pre-trained word embedding resource using large-scale corpus and multi-granular tokenization. 8 | 9 | Based on the [skip-gram algorithm](https://arxiv.org/abs/1301.3781), we used word2vec ([gensim](https://radimrehurek.com/gensim/)) to train the vectors. 
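For illustration only (the real training code is in the [training](training) directory, and the corpus path and several hyperparameters below are placeholders rather than chiVe's exact settings), a skip-gram training run with gensim looks roughly like this:

```py
# Rough sketch of skip-gram training with gensim (illustrative only).
# "path/to/tokenized_corpus.txt" is a placeholder for a pre-tokenized corpus,
# one Sudachi-tokenized sentence per line.
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

corpus = LineSentence("path/to/tokenized_corpus.txt")
model = Word2Vec(
    corpus,
    vector_size=300,  # chiVe vectors are 300-dimensional
    sg=1,             # skip-gram
    min_count=5,      # cf. the "Min Count" column in the Data table below
    epochs=15,
    workers=4,
)
model.wv.save("example.kv")  # save only the KeyedVectors
```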
10 | 11 | For v1.0-v1.2, we used the [NINJAL Web Japanese Corpus (NWJC)](https://masayu-a.github.io/NWJC/), a corpus from the National Institute for Japanese Language and Linguistics containing text from around 100 million web pages, as the training corpus. 12 | For v1.3, we used texts taken from [CommonCrawl](https://commoncrawl.org/). 13 | 14 | We used [Sudachi](https://github.com/WorksApplications/Sudachi) by Works Applications for tokenization. 15 | We used Sudachi's multi-granular tokenization results of the corpus to train the word vectors. 16 | 17 | ## Data 18 | 19 | Data are generously hosted by AWS with their [Open Data Sponsorship Program](https://registry.opendata.aws/sudachi/). 20 | 21 | | Version | Min Count | Normalized | Vocab | Text | [gensim](https://radimrehurek.com/gensim/) | [Magnitude](https://github.com/plasticityai/magnitude) | 22 | | --------- | --------- | ---------- | --------- | --------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------- | 23 | | v1.3 mc5 | 5 | o | 2,530,791 | 3.6GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc5.tar.gz)) | 2.9GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc5_gensim.tar.gz)) | - | 24 | | v1.3 mc15 | 15 | o | 1,186,019 | 1.7GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc15.tar.gz)) | 1.3GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc15_gensim.tar.gz)) | - | 25 | | v1.3 mc30 | 30 | o | 759,011 | 1.1GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc30.tar.gz)) | 0.8GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc30_gensim.tar.gz)) | - | 26 | | v1.3 mc90 | 90 | o | 410,533 | 0.6GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc90.tar.gz)) | 0.5GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc90_gensim.tar.gz)) | - | 27 | | | | | | | | | 28 | | v1.2 mc5 | 5 | o | 3,197,456 | 9.2GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc5.tar.gz)) | 3.8GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc5_gensim.tar.gz)) | 5.5GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc5.magnitude)) | 29 | | v1.2 mc15 | 15 | o | 1,454,280 | 5.0GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc15.tar.gz)) | 1.7GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc15_gensim.tar.gz)) | 2.4GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc15.magnitude)) | 30 | | v1.2 mc30 | 30 | o | 912,550 | 3.1GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc30.tar.gz)) | 1.1GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc30_gensim.tar.gz)) | 1.5GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc30.magnitude)) | 31 | | v1.2 mc90 | 90 | o | 482,223 | 1.7GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc90.tar.gz)) | 0.6GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc90_gensim.tar.gz)) | 0.8GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc90.magnitude)) | 32 | | | | | | | 
| | 33 | | v1.1 mc5 | 5 | o | 3,196,481 | 11GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc5.tar.gz)) | 3.6GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc5_gensim.tar.gz)) | 5.5GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc5.magnitude)) | 34 | | v1.1 mc15 | 15 | o | 1,452,205 | 4.7GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc15.tar.gz)) | 1.7GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc15_gensim.tar.gz)) | 2.4GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc15.magnitude)) | 35 | | v1.1 mc30 | 30 | o | 910,424 | 3.0GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc30.tar.gz)) | 1.1GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc30_gensim.tar.gz)) | 1.5GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc30.magnitude)) | 36 | | v1.1 mc90 | 90 | o | 480,443 | 1.6GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc90.tar.gz)) | 0.6GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc90_gensim.tar.gz)) | 0.8GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc90.magnitude)) | 37 | | v1.0 mc5 | 5 | x | 3,644,628 | 12GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.0-mc5.tar.gz)) | 4.1GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.0-mc5_gensim.tar.gz)) | 6.3GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.0-mc5.magnitude)) | 38 | 39 | All vectors have 300 dimensions. 40 | 41 | "Min Count" indicates the number of minimum appearance count in the training corpus (`min_count` in [gensim](https://radimrehurek.com/gensim/models/word2vec.html)). 42 | 43 | "Normalized" indicates if the text is normalized using the tokenizer Sudachi. For example, words `空き缶`, `空缶`, `空き罐`, `空罐`, `空きカン`, `空きかん` will all be normalized to `空き缶`. 44 | 45 | | version | Sudachi | SudachiDict | Training Corpus | 46 | | ------- | ------- | --------------------- | ---------------------------------------------------- | 47 | | v1.3 | v0.6.8 | 20240109-core | CommonCrawl (CC-MAIN-2022-40, warc, first 20k files) | 48 | | v1.2 | v0.4.3 | 20200722-core | NWJC | 49 | | v1.1 | v0.3.0 | 20191030-core | NWJC | 50 | | v1.0 | v0.1.1 | 0.1.1-dictionary-full | NWJC | 51 | 52 | The training algorithm is the same. See [training](training) for the details. 53 | 54 | ### "A Unit Only" Resources 55 | 56 | These files contain only the [SudachiDict](https://github.com/WorksApplications/SudachiDict) A unit words (Not re-training; Simply excluding B unit words, C unit words, and OOV (Out-of-vocabulary) words from the above original resources). 57 | 58 | `v1.1 mc90 aunit` is used for the natural language processing tool [spaCy](https://github.com/explosion/spaCy/)'s Japanese models. 
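As a rough, unofficial illustration of that integration, the sketch below shows how such vectors are typically accessed through spaCy; the model name `ja_core_news_md` and its exact contents are assumptions here, so check the spaCy documentation for the model you actually install:

```py
# Hypothetical sketch: looking up word vectors through a spaCy Japanese pipeline.
# Assumes spaCy and a vector-bearing Japanese model (e.g. ja_core_news_md) are installed.
import spacy

nlp = spacy.load("ja_core_news_md")
sudachi = nlp.vocab["酢橘"]
tokushima = nlp.vocab["徳島"]
print(sudachi.has_vector)             # whether the lexeme has an entry in the vector table
print(sudachi.similarity(tokushima))  # cosine similarity between the two lexemes
```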
59 | 60 | | Version | Vocab | Text | [gensim](https://radimrehurek.com/gensim/) | [Magnitude](https://github.com/plasticityai/magnitude) | 61 | | --------------- | --------------- | --------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------- | 62 | | v1.1 mc5 aunit | 322,094 (10.1%) | 1.1GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc5-aunit.tar.gz)) | 0.4GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc5-aunit_gensim.tar.gz)) | 0.5GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc5-aunit.magnitude)) | 63 | | v1.1 mc15 aunit | 276,866 (19.1%) | 1.0GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc15-aunit.tar.gz)) | 0.3GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc15-aunit_gensim.tar.gz)) | 0.4GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc15-aunit.magnitude)) | 64 | | v1.1 mc30 aunit | 242,658 (26.7%) | 0.8GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc30-aunit.tar.gz)) | 0.3GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc30-aunit_gensim.tar.gz)) | 0.4GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc30-aunit.magnitude)) | 65 | | v1.1 mc90 aunit | 189,775 (39.5%) | 0.7GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc90-aunit.tar.gz)) | 0.2GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc90-aunit_gensim.tar.gz)) | 0.3GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc90-aunit.magnitude)) | 66 | 67 | ### Continue Training chiVe 68 | 69 | Although chiVe can be used as it is, you can continue to train chiVe with your own data to improve the performance of your tasks. 70 | 71 | A full model is required for further training. 72 | See the [tutorial](docs/continue-training.md) for details on how to use it. 
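As a minimal sketch of what that can look like with gensim (the tutorial above remains the authoritative reference; the extracted model path, the corpus file name, and the epoch count below are placeholders):

```py
# Minimal sketch: continue training a chiVe full model on in-domain text with gensim.
# All paths and the epoch count are illustrative placeholders.
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

model = Word2Vec.load("path/to/extracted/full_model")  # the gensim full model, not the .kv file
corpus = LineSentence("my_domain_corpus.txt")          # pre-tokenized text, one sentence per line

model.build_vocab(corpus, update=True)                 # add new in-domain words to the vocabulary
model.train(corpus, total_examples=model.corpus_count, epochs=5)

model.wv.save("chive-mydomain.kv")                     # keep only the KeyedVectors for downstream use
```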
73 | 74 | | Version | [gensim](https://radimrehurek.com/gensim/) (full) | 75 | | --------- | --------------------------------------------------------------------------------------------------------- | 76 | | v1.3 mc5 | 5.5GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc5_gensim-full.tar.gz)) | 77 | | v1.3 mc15 | 2.6GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc15_gensim-full.tar.gz)) | 78 | | v1.3 mc30 | 1.7GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc30_gensim-full.tar.gz)) | 79 | | v1.3 mc90 | 0.9GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc90_gensim-full.tar.gz)) | 80 | | | | 81 | | v1.2 mc5 | 6.7GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc5_gensim-full.tar.gz)) | 82 | | v1.2 mc15 | 3.0GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc15_gensim-full.tar.gz)) | 83 | | v1.2 mc30 | 1.9GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc30_gensim-full.tar.gz)) | 84 | | v1.2 mc90 | 1.0GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc90_gensim-full.tar.gz)) | 85 | 86 | ## Usage | 87 | 88 | We provide the data in 3 formats: Text, gensim, and Magnitude. 89 | 90 | ### Text 91 | 92 | Data in plain text (the original word2vec C format). 93 | 94 | ```txt:chive-1.1-mc90/chive-1.1-mc90.txt 95 | 480443 300 96 | の -0.08274004 -0.091033645 -0.08744463 -0.14393683 -0.053159036 ... 97 | 、 -0.014216528 -0.1027064 -0.07763326 -0.16008057 -0.16116066 ... 98 | て -0.06049706 -0.15483096 0.052628547 -0.12448246 -0.14404581 ... 99 | ... 100 | ``` 101 | 102 | ### gensim 103 | 104 | Data for the library [gensim](https://radimrehurek.com/gensim/), in [KeyedVectors](https://radimrehurek.com/gensim/models/keyedvectors.html) format. 105 | 106 | ```py 107 | import gensim 108 | 109 | vectors = gensim.models.KeyedVectors.load("./chive-1.1-mc90_gensim/chive-1.1-mc90.kv") 110 | 111 | "すだち" in vectors # False, because in v1.1 all vocabs are normalized 112 | "酢橘" in vectors # True 113 | 114 | vectors["酢橘"] 115 | # array([-5.68204783e-02, -1.26615226e-01, 3.53190415e-02, -3.67305875e-01, ...]) 116 | 117 | vectors.similarity("酢橘", "徳島") 118 | # 0.3993048 119 | 120 | vectors.most_similar("徳島", topn=5) 121 | # [('愛媛', 0.8229734897613525), 122 | # ('徳島県', 0.786933422088623), 123 | # ('高知', 0.7795713543891907), 124 | # ('岡山', 0.7623447179794312), 125 | # ('徳島市', 0.7415297031402588)] 126 | 127 | vectors.most_similar(positive=["阿波", "高知"], negative=["徳島"], topn=5) 128 | # [('土佐', 0.620033860206604), 129 | # ('阿波踊り', 0.5988592505455017), 130 | # ('よさこい祭り', 0.5783430337905884), 131 | # ('安芸', 0.564490556716919), 132 | # ('高知県', 0.5591559410095215)] 133 | ``` 134 | 135 | ### Magnitude (~v1.2) 136 | 137 | Data converted for the library [Magnitude](https://github.com/plasticityai/magnitude), using the default parameters, i.e., it includes advanced out-of-vocabulary key support using subword information, but does not include an approximate nearest neighbours index (equivalent to their `Medium`). 
138 | 139 | ```py 140 | from pymagnitude import Magnitude 141 | 142 | vectors = Magnitude("chive-1.1-mc90.magnitude") 143 | 144 | "すだち" in vectors # False, because in v1.1 all vocabs are normalized 145 | "酢橘" in vectors # True 146 | 147 | vectors.query("すだち") # via Magnitude's OOV feature using subword information 148 | # array([ 0.03974148, 0.11290773, 0.01493122, -0.05296252, 0.12616251, ...]) 149 | 150 | vectors.most_similar("すだち", topn=5) 151 | # [('あだち', 0.5930323079944302), 152 | # ('すだ椎', 0.5872662462335323), 153 | # ('だち', 0.5797546444016177), 154 | # ('ムクノキ', 0.46228053338159725), 155 | # ('椨', 0.4482612387097178)] 156 | 157 | vectors.similarity("酢橘", "徳島") 158 | # 0.3993048 159 | 160 | vectors.most_similar("徳島", topn=5) 161 | # [('愛媛', 0.8229735), 162 | # ('徳島県', 0.78693324), 163 | # ('高知', 0.7795714), 164 | # ('岡山', 0.7623447), 165 | # ('徳島市', 0.7415296)] 166 | 167 | vectors.closer_than("徳島", "徳島市") 168 | # ['愛媛', '徳島県', '高知', '岡山'] 169 | 170 | vectors.most_similar(positive=["阿波", "高知"], negative=["徳島"], topn=5) 171 | # [('土佐', 0.62003386), 172 | # ('阿波踊り', 0.5988593), 173 | # ('よさこい祭り', 0.578343), 174 | # ('安芸', 0.56449056), 175 | # ('高知県', 0.55915594)] 176 | 177 | vectors.most_similar_cosmul(positive=["阿波", "高知"], negative=["徳島"], topn=5) 178 | # [('土佐', 0.83830714), 179 | # ('よさこい祭り', 0.82048166), 180 | # ('阿波踊り', 0.8168015), 181 | # ('安芸', 0.80880433), 182 | # ('伊予', 0.80250806)] 183 | ``` 184 | 185 | You can also download, remote-load, or remote-stream the vectors over HTTP using the library. 186 | 187 | ```py 188 | from pymagnitude import Magnitude, MagnitudeUtils 189 | 190 | # Download 191 | vectors = Magnitude(MagnitudeUtils.download_model("chive-1.1-mc90-aunit", remote_path="https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/")) 192 | # default download dir: `~/.magnitude/` 193 | # If the file has already been downloaded, it won't be downloaded again 194 | # You can change the download dir using the `download_dir` argument 195 | 196 | # Remote Loading 197 | vectors = Magnitude("https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc90-aunit.magnitude") 198 | 199 | # Remote Streaming over HTTP 200 | vectors = Magnitude("https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc90-aunit.magnitude", stream=True) 201 | vectors.query("徳島") # Returns the vector quickly, even with no local file downloaded 202 | ``` 203 | 204 | ## Licence 205 | 206 | ### v1.0, v1.1, v1.2 207 | 208 | Copyright (c) 2020 National Institute for Japanese Language and Linguistics and Works Applications Co., Ltd. All rights reserved. 209 | 210 | "chiVe" v1.0, v1.1 and v1.2 are distributed by the [National Institute for Japanese Language and Linguistics](https://www.ninjal.ac.jp/) and [Works Applications Co., Ltd.](https://www.worksap.co.jp/) under the [Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0). 211 | 212 | ### v1.3 213 | 214 | Copyright (c) 2024 Works Applications Co., Ltd. All rights reserved. 215 | 216 | "chiVe" v1.3 is distributed by [Works Applications Co., Ltd.](https://www.worksap.co.jp/) under the [Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0). 217 | 218 | ## Slack 219 | 220 | We have a Slack workspace for developers and users to ask questions and discuss a variety of topics. 
221 | 222 | - https://sudachi-dev.slack.com/ 223 | - (Please get an invite from [here](https://join.slack.com/t/sudachi-dev/shared_invite/enQtMzg2NTI2NjYxNTUyLTMyYmNkZWQ0Y2E5NmQxMTI3ZGM3NDU0NzU4NGE1Y2UwYTVmNTViYjJmNDI0MWZiYTg4ODNmMzgxYTQ3ZmI2OWU)) 224 | 225 | ## Citing chiVe 226 | 227 | We have published the following papers about chiVe: 228 | 229 | - 真鍋陽俊, 岡照晃, 海川祥毅, 髙岡一馬, 内田佳孝, 浅原正幸. [複数粒度の分割結果に基づく日本語単語分散表現](https://www.anlp.jp/proceedings/annual_meeting/2019/pdf_dir/P8-5.pdf) _(Japanese Word Embedding based on Multi-granular Tokenization Results, in Japanese)_. 言語処理学会第 25 回年次大会, 2019. 230 | - 河村宗一郎, 久本空海, 真鍋陽俊, 高岡一馬, 内田佳孝, 岡照晃, 浅原正幸. [chiVe 2.0: Sudachi と NWJC を用いた実用的な日本語単語ベクトルの実現へ向けて](https://www.anlp.jp/proceedings/annual_meeting/2020/pdf_dir/P6-16.pdf) _(chiVe 2.0: Towards Practical Japanese Embedding with Sudachi and NWJC, in Japanese)_. 言語処理学会第 26 回年次大会, 2020. 231 | - 久本空海, 山村崇, 勝田哲弘, 竹林佑斗, 髙岡一馬, 内田佳孝, 岡照晃, 浅原正幸. [chiVe: 製品利用可能な日本語単語ベクトル資源の実現へ向けて](https://www.ieice.org/ken/paper/20200910U1zQ/) _(chiVe: Towards Industrial-strength Japanese Word Vector Resources, in Japanese)_. 第 16 回テキストアナリティクス・シンポジウム, 2020. ([slides](https://speakerdeck.com/sorami/chive-zhi-pin-li-yong-ke-neng-nari-ben-yu-dan-yu-bekutoruzi-yuan-falseshi-xian-hexiang-kete)) 232 | 233 | When citing chiVe in papers, books, or services, please use the following BibTeX entries (generally, please cite the first paper, (Manabe+ 2019)): 234 | 235 | ``` 236 | @INPROCEEDINGS{manabe2019chive, 237 | author = {真鍋陽俊, 岡照晃, 海川祥毅, 髙岡一馬, 内田佳孝, 浅原正幸}, 238 | title = {複数粒度の分割結果に基づく日本語単語分散表現}, 239 | booktitle = "言語処理学会第25回年次大会(NLP2019)", 240 | year = "2019", 241 | pages = "NLP2019-P8-5", 242 | publisher = "言語処理学会", 243 | } 244 | ``` 245 | 246 | ``` 247 | @INPROCEEDINGS{kawamura2020chive, 248 | author = {河村宗一郎, 久本空海, 真鍋陽俊, 高岡一馬, 内田佳孝, 岡照晃, 浅原正幸}, 249 | title = {chiVe 2.0: SudachiとNWJCを用いた実用的な日本語単語ベクトルの実現へ向けて}, 250 | booktitle = "言語処理学会第26回年次大会(NLP2020)", 251 | year = "2020", 252 | pages = "NLP2020-P6-16", 253 | publisher = "言語処理学会", 254 | } 255 | ``` 256 | 257 | ``` 258 | @INPROCEEDINGS{hisamoto2020chive, 259 | author = {久本空海, 山村崇, 勝田哲弘, 竹林佑斗, 髙岡一馬, 内田佳孝, 岡照晃, 浅原正幸}, 260 | title = {chiVe: 製品利用可能な日本語単語ベクトル資源の実現へ向けて}, 261 | booktitle = "第16回テキストアナリティクス・シンポジウム", 262 | year = "2020", 263 | pages = "IEICE-NLC2020-9", 264 | publisher = "電子情報通信学会", 265 | } 266 | ``` 267 | --------------------------------------------------------------------------------