├── .gitignore ├── evaluation ├── data │ └── .gitkeep ├── .gitignore ├── requirements.txt ├── resources │ ├── sim │ │ ├── jwsan.yaml │ │ └── tmu-sudachic.yaml │ ├── clf │ │ └── livedoor-sudachic.yaml │ └── vec │ │ └── chive-1.3-mc90-sudachic.yaml ├── models │ ├── classifier.py │ └── w2v.py ├── dataset │ ├── base.py │ ├── labeled.py │ └── pairwise.py ├── eval │ └── evaluator.py ├── utils.py ├── run_docclf.py ├── run_wordsim.py └── README.md ├── .github └── FUNDING.yml ├── training ├── requirements.txt ├── convert_model_format.py ├── README.md ├── prepare_corpus.py └── train_chive.py ├── docs ├── tutorial.md └── continue-training.md ├── LICENSE ├── README.md └── README_en.md /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | -------------------------------------------------------------------------------- /evaluation/data/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /evaluation/.gitignore: -------------------------------------------------------------------------------- 1 | data/* 2 | !/data/.gitkeep 3 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: WorksApplications 4 | -------------------------------------------------------------------------------- /training/requirements.txt: -------------------------------------------------------------------------------- 1 | gensim 2 | tqdm 3 | 4 | # sudachi for v1.3 5 | sudachipy==0.6.8 6 | sudachidict_core==20240109 7 | -------------------------------------------------------------------------------- /evaluation/requirements.txt: -------------------------------------------------------------------------------- 1 | gensim 2 | pyyaml 3 | pandas 4 | scikit-learn 5 | 6 | # sudachi for v1.3 7 | sudachipy==0.6.8 8 | sudachidict_core==20240109 9 | -------------------------------------------------------------------------------- /docs/tutorial.md: -------------------------------------------------------------------------------- 1 | # chiVe Tutorial 2 | 3 | [公式README](https://github.com/WorksApplications/chiVe/blob/master/README.md)を参照してください。 4 | 5 | Please refer to the [official README](https://github.com/WorksApplications/chiVe/blob/master/README.md) for the information. 
6 | -------------------------------------------------------------------------------- /evaluation/resources/sim/jwsan.yaml: -------------------------------------------------------------------------------- 1 | name: "jwsan" 2 | data-path: "./data/JWSAN" 3 | process: 4 | use-tokenizer: True 5 | tokenizer: 6 | name: "sudachi" 7 | others: 8 | mode: "C" 9 | dic-name: 10 | form: "normalized" # "normalized" or "surface" 11 | -------------------------------------------------------------------------------- /evaluation/resources/sim/tmu-sudachic.yaml: -------------------------------------------------------------------------------- 1 | name: "tmu" 2 | data-path: "./data/JapaneseWordSimilarityDataset" 3 | process: 4 | use-tokenizer: True 5 | tokenizer: 6 | name: "sudachi" 7 | others: 8 | mode: "C" 9 | dic-name: 10 | form: "normalized" # "normalized" or "surface" 11 | -------------------------------------------------------------------------------- /evaluation/resources/clf/livedoor-sudachic.yaml: -------------------------------------------------------------------------------- 1 | name: "livedoor" 2 | data-path: "./data/ldcc-20140209/text" 3 | process: 4 | pre-tokenized: False 5 | tokenizer: 6 | name: "sudachi" 7 | others: 8 | mode: "C" 9 | dic-name: 10 | form: "normalized" # "surface" or "normalized" 11 | used-pos: ["名詞"] 12 | 13 | classifier: 14 | method: "logreg" 15 | others: 16 | -------------------------------------------------------------------------------- /evaluation/resources/vec/chive-1.3-mc90-sudachic.yaml: -------------------------------------------------------------------------------- 1 | vec-path: "path/to/chive-1.3-mc90_gensim/chive-1.3-mc90.kv" 2 | loading: 3 | w2v-fmt: True # set false to load gensim word2vec format (full model) 4 | fmt: "kv" # set "txt" to load text format, or "kv" to keyedvector format 5 | gensim-others: 6 | description: 7 | tokenizer: 8 | name: "sudachi" 9 | others: 10 | mode: "C" 11 | dic_name: 12 | api-setting: 13 | -------------------------------------------------------------------------------- /evaluation/models/classifier.py: -------------------------------------------------------------------------------- 1 | 2 | import logging 3 | from sklearn.linear_model import LogisticRegression 4 | from sklearn.neighbors import KNeighborsClassifier 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | def build_classifier(method, other_config): 10 | if method == "logreg": 11 | clf = LogisticRegression() 12 | elif method == "knn": 13 | clf = KNeighborsClassifier(n_neighbors=other_config["k"]) 14 | else: 15 | raise ValueError("Invalid method: {}".format(method)) 16 | return clf 17 | -------------------------------------------------------------------------------- /evaluation/dataset/base.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | 4 | 5 | class BaseDataset(): 6 | def __init__(self, samples, name=None): 7 | _type = type(samples) 8 | assert _type == list or _type == np.ndarray, "Should be list or ndarray" 9 | self._samples = samples if type(samples) == np.ndarray else np.array(samples) 10 | self.name = name 11 | 12 | def __repr__(self): 13 | return "<{} name={}>".format(self.__class__.__name__, self.name) 14 | 15 | def __getitem__(self, item): 16 | return self._samples[item] 17 | 18 | def __len__(self): 19 | return len(self._samples) 20 | 21 | def split(self): 22 | raise NotImplementedError() 23 | 24 | def batch_iter(self, batchsize, rand_flg): 25 | assert batchsize > 0 26 | indices = 
np.random.permutation(len(self)) if rand_flg else np.arange(len(self)) 27 | for start in range(0, len(self), batchsize): 28 | yield self[indices[start: start+batchsize]] 29 | 30 | def batch_iter_as_ndarray(self, batchsize, rand_flg): 31 | raise NotImplementedError("Not available") 32 | 33 | def get_basic_stats(self): 34 | raise NotImplementedError() 35 | 36 | def cal_stats(self): 37 | return {"n_sample": len(self._samples)} 38 | 39 | 40 | class BaseInstance(): 41 | def __init__(self): 42 | self.model_pred = None 43 | 44 | def get_content(self): 45 | raise NotImplementedError() 46 | 47 | def set_model_prediction(self, pred): 48 | self.model_pred = pred 49 | -------------------------------------------------------------------------------- /training/convert_model_format.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | from pathlib import Path 4 | 5 | from gensim.models import Word2Vec 6 | 7 | 8 | logging.basicConfig( 9 | style="{", 10 | format='{levelname} {asctime} [{module}:{funcName}:{lineno}] {message}', 11 | datefmt="%m/%d/%Y %H:%M:%S", 12 | level=logging.INFO, 13 | ) 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | def parse_args(): 18 | p = argparse.ArgumentParser( 19 | "Convert trained gensim.Word2Vec full model into release formats.") 20 | p.add_argument("--input", type=Path, 21 | help="target model data (.bin of gensim.Word2Vec)") 22 | p.add_argument("--output", type=Path, default=Path("./output"), 23 | help="directory to output") 24 | args = p.parse_args() 25 | return args 26 | 27 | 28 | def main(): 29 | args = parse_args() 30 | args.output.mkdir(parents=True, exist_ok=True) 31 | 32 | stem = args.input.stem 33 | fullmodel = Word2Vec.load(str(args.input)) 34 | 35 | # ref: 36 | # https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec 37 | # https://radimrehurek.com/gensim/models/keyedvectors.html#how-to-obtain-word-vectors 38 | logger.info(f"gensim.KeyedVectors") 39 | outfile = args.output / f"{stem}_gensim" / f"{stem}.kv" 40 | outfile.parent.mkdir() 41 | fullmodel.wv.save(str(outfile)) 42 | 43 | # ref: https://radimrehurek.com/gensim/models/keyedvectors.html#gensim.models.keyedvectors.KeyedVectors.save_word2vec_format 44 | logger.info(f"text format") 45 | outfile = args.output / f"{stem}_text" / f"{stem}.txt" 46 | outfile.parent.mkdir() 47 | fullmodel.wv.save_word2vec_format(str(outfile)) 48 | 49 | # ref: https://github.com/plasticityai/magnitude/tree/master?tab=readme-ov-file#file-format-and-converter 50 | # logger.info(f"magnitude") 51 | # from pymagnitude.third_party_mock.converter import convert as convert_magnitude 52 | # outfile = args.output / f"{stem}.magnitude" 53 | # convert_magnitude( 54 | # input_file_path=str(args.input), 55 | # output_file_path=str(outfile), 56 | # ) 57 | 58 | return 59 | 60 | 61 | if __name__ == '__main__': 62 | main() 63 | -------------------------------------------------------------------------------- /evaluation/eval/evaluator.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | from scipy.stats import spearmanr 4 | from sklearn.model_selection import cross_val_score, StratifiedKFold 5 | 6 | 7 | class Evaluator(): 8 | def __init__(self, dataset): 9 | self.dataset = dataset 10 | 11 | def predict_all(self, batchsize=1): 12 | for batch in self.dataset.batch_iter(batchsize=batchsize, rand_flg=False): 13 | for inst in batch: 14 | self._predict_one(inst) 15 | 16 | def run(self): 
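        """Predict over all instances, then compute and return the evaluation metric."""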
17 | self.predict_all() 18 | return self.get_eval_metric() 19 | 20 | def get_eval_metric(self): 21 | raise NotImplementedError() 22 | 23 | def _predict_one(self, inst): 24 | raise NotImplementedError() 25 | 26 | 27 | class W2VSimilarityEvaluator(Evaluator): 28 | def __init__(self, dataset, w2v_api): 29 | super(W2VSimilarityEvaluator, self).__init__(dataset=dataset) 30 | self.w2v_api = w2v_api 31 | 32 | def get_eval_metric(self): 33 | # spearman corr 34 | human_scores = [ 35 | b[0].gold for b in self.dataset.batch_iter(1, rand_flg=False)] 36 | auto_scores = [ 37 | b[0].model_pred for b in self.dataset.batch_iter(1, rand_flg=False)] 38 | # assert all(human_scores), "Contain invalid gold data" 39 | # assert all(auto_scores), "Contain invalid prediction" 40 | corr = spearmanr(human_scores, auto_scores)[0] 41 | return corr 42 | 43 | def _predict_one(self, inst): 44 | e1, e2, gold_score = inst.get_content() 45 | pred = self.w2v_api.cal_phrase_similarity(e1, e2) 46 | inst.set_model_prediction(pred) 47 | 48 | 49 | class ClassificationEvaluator(Evaluator): 50 | def __init__(self, dataset, w2v_api, clf): 51 | super(ClassificationEvaluator, self).__init__(dataset=dataset) 52 | self.w2v_api = w2v_api 53 | self.clf = clf 54 | 55 | def run_kfold(self, k=10): 56 | cv = get_sklearn_kfoldcv(k) 57 | txt_xs, ys = self.dataset.get_xys() 58 | xs = np.array([self.w2v_api.get_mean_vector(doc) for doc in txt_xs]) 59 | scores = cross_val_score(self.clf, xs, ys, cv=cv) 60 | return {"scores": scores, "mean": np.mean(scores), "variance": np.var(scores)} 61 | 62 | 63 | def get_sklearn_kfoldcv(n_splits, seed=46): 64 | return StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed) 65 | -------------------------------------------------------------------------------- /evaluation/utils.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | from scipy.spatial import distance 4 | 5 | import sudachipy 6 | 7 | 8 | def cos_sim(v1, v2): 9 | return 1.0 - distance.cosine(v1, v2) 10 | 11 | 12 | def get_zero_vector(dim, eps=1e-8): 13 | return eps * np.ones(dim) 14 | 15 | 16 | def build_tokenizer(tok_name, config): 17 | if tok_name == 'sudachi': 18 | tok = SudachiTokenizer(config.get("mode", "C"), 19 | config.get("dic-name", None), 20 | config.get("form", "surface")) 21 | return tok 22 | else: 23 | raise ValueError("Invalid Tokenizer Name: {}".format(tok_name)) 24 | 25 | 26 | class SudachiTokenizer(): 27 | def __init__(self, mode=sudachipy.SplitMode.C, dic_name=None, form="surface"): 28 | mode = mode if type(mode) == sudachipy.SplitMode \ 29 | else self._str2mode(mode) 30 | self._name = f"sudachipy_{dic_name}_{self._mode2str(mode)}" 31 | 32 | dic_name = "core" if dic_name is None else dic_name 33 | self._tok = sudachipy.Dictionary(dict_type=dic_name).create(mode=mode) 34 | 35 | assert form in ["surface", 36 | "normalized"], f"Invalid form for sudachi: {form}." 
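        # output form used by wakati(): "surface" or "normalized"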
37 | self.form = form 38 | return 39 | 40 | def get_name(self) -> str: 41 | return self._name 42 | 43 | def tokenize(self, sent): 44 | return self._tok.tokenize(sent) 45 | 46 | def wakati(self, sent): 47 | ms = self.tokenize(sent) 48 | match self.form: 49 | case "surface": return [m.surface() for m in ms] 50 | case "normalized": return [m.normalized_form() for m in ms] 51 | case _: raise RuntimeError(f"unknown sudachi form: {self.form}") 52 | 53 | @staticmethod 54 | def _str2mode(modestr: str) -> sudachipy.SplitMode: 55 | """parse sudachipy.SplitMode from str""" 56 | match modestr.strip(): 57 | case "A" | "a": return sudachipy.SplitMode.A 58 | case "B" | "b": return sudachipy.SplitMode.B 59 | case "C" | "c": return sudachipy.SplitMode.C 60 | case _: raise ValueError(f"cannot parse {modestr} as SplitMode.") 61 | 62 | @staticmethod 63 | def _mode2str(mode: sudachipy.SplitMode) -> str: 64 | """convert sudachipy.SplitMode into str""" 65 | match mode: 66 | case sudachipy.SplitMode.A: return "A" 67 | case sudachipy.SplitMode.B: return "B" 68 | case sudachipy.SplitMode.C: return "C" 69 | -------------------------------------------------------------------------------- /docs/continue-training.md: -------------------------------------------------------------------------------- 1 | # chiVeの追加学習 2 | 3 | chiVeは、各ドメイン(分野)に合わせたデータで追加学習させられます。 4 | chiVeは、追加学習なしでも利用できますが、追加学習することでそのドメイン(分野)でのタスクの性能改善が期待できます。 5 | 6 | 7 | ## Step 1. フルモデルをダウンロード 8 | 9 | [学習させたいモデル](../README.md#追加学習用のフルモデル)を選択してダウンロードし、解凍します。 10 | 11 | ```sh 12 | $ wget https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc90_gensim-full.tar.gz 13 | $ tar xzvf chive-1.2-mc90_gensim-full.tar.gz 14 | ``` 15 | 16 | ## Step 2. 学習コーパスの用意 17 | 18 | 分かち書きした学習コーパスが必要です。 19 | 平文データ `corpus.txt` から、分かち書きしたファイル `corpus.tok.txt` を作ります。 20 | 21 | ```bash 22 | $ pip install sudachipy sudachidict_core 23 | ``` 24 | 25 | ```py 26 | import sudachipy 27 | 28 | tokenizer = sudachipy.Dictionary().create() 29 | 30 | def tokenize(sentence: str, mode: str) -> str: 31 | mode = { 32 | 'A': sudachipy.Tokenizer.SplitMode.A, 33 | 'B': sudachipy.Tokenizer.SplitMode.B, 34 | 'C': sudachipy.Tokenizer.SplitMode.C}[mode] 35 | tokens = [m.normalized_form() for m in tokenizer.tokenize(sentence, mode)] 36 | return ' '.join(tokens) 37 | 38 | def create_training_corpus(inputpath, outputpath): 39 | with open(inputpath) as inputfile, open(outputpath, 'w') as outputfile: 40 | for mode in ('A', 'B', 'C'): 41 | for line in inputfile: 42 | line = line.strip() 43 | if line == '': 44 | continue 45 | outputfile.write(tokenize(line, mode) + '\n') 46 | inputfile.seek(0) 47 | 48 | create_training_corpus('corpus.txt', 'corpus.tok.txt') 49 | ``` 50 | 51 | 52 | ## Step 3. 
学習 53 | 54 | 学習パラメータの詳細は、[gensim.models.word2vec.Word2Vec.train](https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec.train)を参照してください。 55 | 56 | ```bash 57 | $ pip install gensim 58 | ``` 59 | 60 | ```py 61 | from gensim.models.word2vec import LineSentence 62 | from gensim.models import Word2Vec 63 | 64 | sentences = LineSentence('corpus.tok.txt') 65 | model = Word2Vec.load('chive-1.2-mc90_gensim-full/chive-1.2-mc90.bin') 66 | model.vocabulary.min_count = 3 67 | model.build_vocab(sentences, update=True) 68 | model.train(sentences, total_examples=model.corpus_count, epochs=15) 69 | ``` 70 | 71 | 学習したモデルを保存します。 72 | 73 | * KeyedVectors: 学習に使用するパラメータを削除した埋め込みのみのデータ形式 74 | * Full model: 学習に使用するパラメータも保持したデータ形式 75 | 76 | ```py 77 | model.wv.save('chive-1.2-mc90.finetuned-mc3.kv') # Save as KeyedVectors 78 | model.save('chive-1.2-mc90.finetuned-mc3.bin') # Save as Full model 79 | ``` 80 | 81 | 82 | ## Step 4. 利用 83 | 84 | 保存した分散表現を読み込んで利用します。 85 | 86 | ```py 87 | from gensim.models import KeyedVectors 88 | KeyedVectors.load('chive-1.2-mc90.finetuned-mc3.kv') # Load as KeyedVectors 89 | Word2Vec.load('chive-1.2-mc90.finetuned-mc3.bin') # Load as Full model 90 | ``` 91 | -------------------------------------------------------------------------------- /evaluation/run_docclf.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | from datetime import datetime 4 | import logging 5 | from pathlib import Path 6 | import yaml 7 | 8 | from dataset.labeled import build_doc_classification_dataset 9 | from eval.evaluator import ClassificationEvaluator 10 | from models.classifier import build_classifier 11 | from models.w2v import build_gensim_w2v, build_w2v_api 12 | 13 | 14 | logging.basicConfig( 15 | style="{", 16 | format='{levelname} {asctime} [{module}:{funcName}:{lineno}] {message}', 17 | datefmt="%m-%d-%Y %H:%M:%S", 18 | level=logging.INFO, 19 | ) 20 | logger = logging.getLogger(__name__) 21 | 22 | 23 | def add_file_handler(logfile: Path) -> None: 24 | file_handler = logging.FileHandler(logfile) 25 | logger.addHandler(file_handler) 26 | return 27 | 28 | 29 | def run_docclf(vconfig_path, tconfig_path): 30 | with open(vconfig_path) as fv, open(tconfig_path) as ft: 31 | vec_config = yaml.load(fv, Loader=yaml.CLoader) 32 | task_config = yaml.load(ft, Loader=yaml.CLoader) 33 | logger.info(f"vector configulatation: {vec_config}") 34 | logger.info(f"task configulatation: {task_config}") 35 | 36 | logger.info("Setup...") 37 | w2v = build_gensim_w2v(w2v_path=vec_config["vec-path"], 38 | load_config=vec_config["loading"], 39 | other_config=vec_config["gensim-others"]) 40 | w2v_api = build_w2v_api(w2v=w2v, 41 | config=vec_config["api-setting"], 42 | w2v_desc=vec_config["description"]) 43 | dat = build_doc_classification_dataset(name=task_config["name"], 44 | dir_path=task_config["data-path"], 45 | process_config=task_config["process"]) 46 | clf = build_classifier(method=task_config["classifier"]["method"], 47 | other_config=task_config["classifier"]["others"]) 48 | 49 | logger.info("Do evaluation") 50 | eval = ClassificationEvaluator(dat, w2v_api, clf) 51 | res = eval.run_kfold() 52 | 53 | logger.info("Results...") 54 | logger.info(res) 55 | logger.info("Done") 56 | return 57 | 58 | 59 | if __name__ == '__main__': 60 | p = argparse.ArgumentParser("Word similarity Evaluation") 61 | p.add_argument("--vconfig", type=Path, 62 | help="Vector Configulation file path") 63 | p.add_argument("--tconfig", type=Path, help="Task 
Configulation file path") 64 | p.add_argument("--log", type=Path, default=None, help="Log path") 65 | args = p.parse_args() 66 | 67 | # setup logger 68 | logfile = args.log if args.log is not None else Path( 69 | f"{datetime.now().strftime('%Y%m%d_%H:%M')}.log") 70 | add_file_handler(logfile) 71 | 72 | logger.info("Arguments...") 73 | for arg, val in sorted(vars(args).items()): 74 | logger.info("{}: {}".format(arg, val)) 75 | 76 | # main 77 | run_docclf(args.vconfig, args.tconfig) 78 | -------------------------------------------------------------------------------- /evaluation/run_wordsim.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | from datetime import datetime 4 | import logging 5 | from pathlib import Path 6 | import yaml 7 | 8 | from dataset.pairwise import build_pairwise_similarity_datasets 9 | from eval.evaluator import W2VSimilarityEvaluator 10 | from models.w2v import build_gensim_w2v, build_w2v_api 11 | 12 | 13 | logging.basicConfig( 14 | style="{", 15 | format='{levelname} {asctime} [{module}:{funcName}:{lineno}] {message}', 16 | datefmt="%m-%d-%Y %H:%M:%S", 17 | level=logging.INFO, 18 | ) 19 | logger = logging.getLogger(__name__) 20 | 21 | 22 | def add_file_handler(logfile: Path) -> None: 23 | file_handler = logging.FileHandler(logfile) 24 | logger.addHandler(file_handler) 25 | return 26 | 27 | 28 | def run_wordsim(vconfig_path, tconfig_path): 29 | with open(vconfig_path) as fv, open(tconfig_path) as ft: 30 | vec_config = yaml.load(fv, Loader=yaml.CLoader) 31 | task_config = yaml.load(ft, Loader=yaml.CLoader) 32 | logger.info("vector configulatation: {}".format(vec_config)) 33 | logger.info("task configulatation: {}".format(task_config)) 34 | 35 | logger.info("Setup...") 36 | w2v = build_gensim_w2v(w2v_path=vec_config["vec-path"], 37 | load_config=vec_config["loading"], 38 | other_config=vec_config["gensim-others"]) 39 | w2v_api = build_w2v_api(w2v=w2v, 40 | config=vec_config["api-setting"], 41 | w2v_desc=vec_config["description"]) 42 | dats = build_pairwise_similarity_datasets(name=task_config["name"], 43 | dir_path=task_config["data-path"], 44 | process_config=task_config["process"]) 45 | 46 | logger.info("Do evaluation") 47 | name2score = {} 48 | for d in dats: 49 | logger.info("In {}".format(d.name)) 50 | eval = W2VSimilarityEvaluator(d, w2v_api) 51 | score = eval.run() 52 | name2score[d.name] = score 53 | logger.info("Out {}".format(d.name)) 54 | 55 | logger.info("Results...") 56 | for (k, v) in name2score.items(): 57 | logger.info("{}: {}".format(k, v)) 58 | logger.info("Done") 59 | return 60 | 61 | 62 | if __name__ == '__main__': 63 | p = argparse.ArgumentParser("Document classification Evaluation") 64 | p.add_argument("--vconfig", type=Path, 65 | help="Vector Configulation file path") 66 | p.add_argument("--tconfig", type=Path, help="Task Configulation file path") 67 | p.add_argument("--log", type=Path, default=None, help="Log path") 68 | args = p.parse_args() 69 | 70 | # setup logger 71 | logfile = args.log if args.log is not None else Path( 72 | f"{datetime.now().strftime('%Y%m%d_%H:%M')}.log") 73 | add_file_handler(logfile) 74 | 75 | logger.info("Arguments...") 76 | for arg, val in sorted(vars(args).items()): 77 | logger.info("{}: {}".format(arg, val)) 78 | 79 | # main 80 | run_wordsim(args.vconfig, args.tconfig) 81 | -------------------------------------------------------------------------------- /training/README.md: -------------------------------------------------------------------------------- 1 | 
# Training chiVe 2 | 3 | chiVe is trained using the [gensim](https://radimrehurek.com/gensim/index.html) library. 4 | 5 | ## Training procedure 6 | 7 | ### 0. Setup 8 | 9 | Install libraries with `pip install -r requirements.txt`. 10 | Note that, for reproducibility, the versions of Sudachi in `requirements.txt` are pinned. 11 | If you want to use the latest ones, modify them or update Sudachi. 12 | 13 | ### 1. Prepare corpus 14 | 15 | Training a word2vec model requires a corpus. 16 | 17 | We load the corpus using the [`gensim.models.word2vec.LineSentence`](https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.LineSentence) or [`PathLineSentences`](https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.LineSentence) class. 18 | Since they assume that words are already preprocessed and whitespace-separated, we preprocess our corpus. 19 | 20 | Put corpus text file(s) in a directory (subdirectories are NOT searched), and run 21 | `python prepare_corpus.py --input path/to/input_dir --output path/to/output_dir`. 22 | This script does the following: 23 | 24 | - Analyzes the input text files using Sudachi and outputs each morpheme separated by a whitespace (分かち書き/wakachi-gaki). 25 | - Each morpheme is converted into its `normalized_form` (正規化形). 26 | - Analysis is performed 3 times, once for each Sudachi split mode (A/B/C). 27 | - Outputs 3 files per input file. 28 | - Analysis is performed per line. 29 | - Words whose normalized_form is a whitespace (" ") are skipped. 30 | 31 | Set `--skip-existing` to skip the analysis for a pair of (input file, Sudachi split mode) that has already been processed. 32 | Whether a pair can be skipped is judged based on the existence of the output file and its line count (in case the analysis was interrupted). 33 | 34 | Use `--mode` to specify which Sudachi split modes to use, e.g. `--mode AC` to use only modes A and C. 35 | 36 | example: 37 | 38 | ```bash 39 | ls -A data/raw_corpus/*.txt | xargs -L 1 -I{} -P 20 \ 40 | python prepare_corpus.py --skip-existing \ 41 | --input {} --output data/corpus/ 42 | ``` 43 | 44 | ### 2. Training 45 | 46 | Use `train_chive.py` to run training using the gensim word2vec class. 47 | `--input` should be set to the output directory of `prepare_corpus.py`. 48 | 49 | example: 50 | 51 | ```bash 52 | python train_chive.py \ 53 | --input data/corpus/ --output model/full/ \ 54 | --epochs 15 --min-count 90 \ 55 | --save-epochs 3 --keep-ckpt 5 \ 56 | --worker 16 57 | ``` 58 | 59 | chiVe (up to v1.3) is trained for 15 epochs with the following parameters. 60 | 61 | ```json 62 | { 63 | "vector_size": 300, 64 | "window": 8, 65 | "sg": 1, 66 | "hs": 0, 67 | "n_negative": 5, 68 | "threshold_downsample": 1e-5, 69 | "alpha": 0.025, 70 | "min_alpha": 0.0001 71 | } 72 | ``` 73 | 74 | You can resume training from a saved checkpoint (auto-detected from the output directory). 75 | You should resume with the same parameters and corpus; otherwise the result may not be what you expect. 76 | Note that resuming training is not a built-in gensim feature and may introduce some loss of precision. 77 | 78 | ### 3. Convert to distribution formats 79 | 80 | The trained (full) model contains values used to update model parameters, which are not necessary for querying. 81 | Use `convert_model_format.py` to generate the text and `gensim.KeyedVectors` formats for distribution.
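The converter is invoked as in the example below. For reference, the converted `gensim.KeyedVectors` file can then be loaded for querying without the full model (a minimal sketch; the file paths are illustrative and depend on the input file name):

```py
from gensim.models import KeyedVectors

# KeyedVectors format: embeddings only, no training state
kv = KeyedVectors.load("model/release/chive-1.3-mc90_gensim/chive-1.3-mc90.kv")

# text (word2vec) format
kv_txt = KeyedVectors.load_word2vec_format(
    "model/release/chive-1.3-mc90_text/chive-1.3-mc90.txt", binary=False)

print(kv.most_similar("猫", topn=5))
```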
82 | 83 | example: 84 | 85 | ```bash 86 | python convert_model_format.py \ 87 | --input model/full/ --output model/release/ 88 | ``` 89 | 90 | [magnitude](https://github.com/plasticityai/magnitude) does not seem to be maintained, so we stopped distributing that format starting from chiVe v1.3. 91 | -------------------------------------------------------------------------------- /evaluation/README.md: -------------------------------------------------------------------------------- 1 | # Evaluation of word vectors 2 | 3 | See the following paper for details. 4 | 5 | - 真鍋陽俊, 岡照晃, 海川祥毅, 髙岡一馬, 内田佳孝, 浅原正幸. [複数粒度の分割結果に基づく日本語単語分散表現](https://www.anlp.jp/proceedings/annual_meeting/2019/pdf_dir/P8-5.pdf). 言語処理学会第 25 回年次大会, 2019. 6 | 7 | ## Setup 8 | 9 | Download the datasets. 10 | The default task setting files (`resources/**/*.yaml`) assume the datasets are located under `data/`; otherwise you need to modify them. 11 | 12 | Download the word vectors, and set the path in the vec setting file (`resources/vec/*.yaml`). 13 | The default vec setting assumes the `gensim.KeyedVectors` format. 14 | 15 | Install modules with `pip install -r requirements.txt`. 16 | 17 | ## Tasks 18 | 19 | ### Word similarity 20 | 21 | Calculate the Spearman rank-order correlation coefficient between human-annotated similarity scores and word-vector similarity. 22 | 23 | - Similarity of word vectors is measured by cosine similarity. 24 | - If a target word consists of multiple (Sudachi) words, we use the average vector of those (Sudachi) words. 25 | 26 | We use the following datasets: 27 | 28 | - [JWSAN](http://www.utm.inf.uec.ac.jp/JWSAN/index.html) 29 | - Download jwsan.zip, unzip it, and place it under `data/` 30 | - [Japanese Word Similarity Dataset](https://github.com/tmu-nlp/JapaneseWordSimilarityDataset) (JWSD) 31 | - Clone the repository under `data/` 32 | 33 | commands: 34 | 35 | ```bash 36 | # JWSAN 37 | python run_wordsim.py \ 38 | --tconfig resources/sim/jwsan.yaml \ 39 | --vconfig resources/vec/chive-1.3-mc90-sudachic.yaml 40 | 41 | # JWSD 42 | python run_wordsim.py \ 43 | --tconfig resources/sim/tmu-sudachic.yaml \ 44 | --vconfig resources/vec/chive-1.3-mc90-sudachic.yaml 45 | ``` 46 | 47 | ### Document classification 48 | 49 | Train a classifier using word vectors as feature vectors. 50 | 51 | - The document vector is calculated by averaging the word vectors of nouns in the document. 52 | - LogisticRegression is used as the classifier. 53 | - We conduct 10-fold cross-validation.
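In outline, this is what `run_docclf.py` computes via `ClassificationEvaluator.run_kfold` (a minimal sketch; `w2v_api`, `docs`, and `labels` stand in for the objects built from the config files):

```py
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score

# docs: noun tokens per document, labels: category ids (placeholders here)
doc_vecs = np.array([w2v_api.get_mean_vector(doc) for doc in docs])
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=46)
scores = cross_val_score(LogisticRegression(), doc_vecs, labels, cv=cv)
print(scores.mean(), scores.var())
```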
54 | 55 | We used following datasets: 56 | 57 | - [livedoor news corpus](https://www.rondhuit.com/download.html#ldcc) 58 | - Download ldcc-20140209.tar.gz, untar and place under `data/` 59 | 60 | commands: 61 | 62 | ```bash 63 | # livedoor news corpus 64 | python run_docclf.py \ 65 | --tconfig resources/clf/livedoor-sudachic.yaml \ 66 | --vconfig resources/vec/chive-1.3-mc30-sudachic.yaml 67 | ``` 68 | 69 | ## Results 70 | 71 | | version | jwsan-1400 類似度 | jwsan-1400 関連度 | jwsd-verb | jwsd-adj | jwsd-noun | jwsd-adv | livedoor-acc | 72 | | --------- | ----------------- | ----------------- | --------- | -------- | --------- | -------- | -------------- | 73 | | v1.3 mc5 | 0.493 | 0.626 | 0.309 | 0.459 | 0.351 | 0.231 | 0.862+1.46e-4 | 74 | | v1.3 mc15 | 0.492 | 0.627 | 0.318 | 0.465 | 0.354 | 0.239 | 0.860+1.48e-4 | 75 | | v1.3 mc30 | 0.496 | 0.626 | 0.318 | 0.459 | 0.354 | 0.250 | 0.859+1.23e-4 | 76 | | v1.3 mc90 | 0.493 | 0.622 | 0.324 | 0.460 | 0.344 | 0.261 | 0.857+1.55e-4 | 77 | | | | | | | | | | 78 | | v1.2 mc5 | 0.520 | 0.633 | 0.316 | 0.466 | 0.355 | 0.297 | 0.865+0.436e-4 | 79 | | v1.2 mc15 | 0.513 | 0.629 | 0.315 | 0.461 | 0.353 | 0.294 | 0.862+0.710e-4 | 80 | | v1.2 mc30 | 0.515 | 0.631 | 0.311 | 0.458 | 0.354 | 0.289 | 0.860+0.546e-4 | 81 | | v1.2 mc90 | 0.512 | 0.627 | 0.307 | 0.463 | 0.345 | 0.281 | 0.861+0.778e-4 | 82 | 83 | ## NOTE 84 | 85 | - Current implementation uses zero-vector for OOV words and cosine-similarity with zero-vector is 1.0. 86 | This may affect the evaluation result. 87 | -------------------------------------------------------------------------------- /evaluation/dataset/labeled.py: -------------------------------------------------------------------------------- 1 | 2 | import itertools 3 | from pathlib import Path 4 | 5 | from dataset.base import BaseDataset, BaseInstance 6 | from utils import build_tokenizer 7 | 8 | 9 | class TextClassificationDataset(BaseDataset): 10 | def __init__(self, samples, label_name=None, name=None): 11 | super(TextClassificationDataset, self).__init__(samples, name) 12 | self.label_name = label_name 13 | 14 | def get_xys(self): 15 | xs = [b[0].word_list for b in self.batch_iter(1, rand_flg=False)] 16 | ys = [b[0].gold for b in self.batch_iter(1, rand_flg=False)] 17 | return xs, ys 18 | 19 | @staticmethod 20 | def build(docs, label_ids, label_name=False): 21 | """ 22 | args: 23 | - docs (list>): documents 24 | - label_ids (list): labels 25 | - label_name (list): label names 26 | """ 27 | assert len(docs) == len(label_ids), "Inconsistent length: {} != {}" \ 28 | .format(len(docs), len(label_ids)) 29 | insts = [TextClassificationInstance(d, l) 30 | for (d, l) in zip(docs, label_ids)] 31 | return TextClassificationDataset(samples=insts, label_name=label_name) 32 | 33 | 34 | class TextClassificationInstance(BaseInstance): 35 | def __init__(self, word_list, label_id): 36 | """ 37 | args: 38 | - word_list (list): word sequence 39 | - label_id (int): label id 40 | """ 41 | super(TextClassificationInstance, self).__init__() 42 | assert type(word_list) == list 43 | assert type(label_id) == int 44 | self.word_list = word_list 45 | self.gold = label_id 46 | 47 | def get_content(self): 48 | return (self.word_list, self.gold) 49 | 50 | 51 | def build_doc_classification_dataset(name, dir_path, process_config): 52 | if name == "livedoor": 53 | txt_xs, ys = build_livedoor(dir_path, process_config) 54 | else: 55 | raise ValueError() 56 | dat = TextClassificationDataset.build(txt_xs, ys) 57 | return dat 58 | 59 | 60 | def build_livedoor(dir_path, 
process_config): 61 | LIVEDOOR_LABEL_LIST = ["dokujo-tsushin", "kaden-channel", "movie-enter", 62 | "smax", "topic-news", "it-life-hack", 63 | "livedoor-homme", "peachy", "sports-watch"] 64 | pre_tokenized = process_config["pre-tokenized"] 65 | tok = None if pre_tokenized else build_tokenizer( 66 | process_config["tokenizer"]["name"], process_config["tokenizer"]["others"]) 67 | 68 | base = Path(dir_path) 69 | label_names = [] 70 | txt_xs = [] 71 | for label_dir in sorted(base.glob("*")): 72 | if not label_dir.is_dir(): 73 | continue 74 | label_name = label_dir.name 75 | assert label_name in LIVEDOOR_LABEL_LIST, "Invalid label directory" 76 | for doc_path in sorted(label_dir.glob("*.txt")): # get document files 77 | if pre_tokenized == True: 78 | raise NotImplementedError() 79 | else: 80 | doc = tokenize_doc(doc_path, tok, process_config) 81 | txt_xs.append(doc) 82 | label_names.append(label_name) 83 | assert len(txt_xs) == len(label_names), "Inconsistent length" 84 | ys = [LIVEDOOR_LABEL_LIST.index(n) for n in label_names] 85 | return txt_xs, ys 86 | 87 | 88 | def tokenize_doc(doc_path, tok, process_config): 89 | ms = [] 90 | with Path(doc_path).open() as fi: 91 | for l in fi: 92 | ms.append(tok.tokenize(l)) 93 | ms = itertools.chain.from_iterable(ms) 94 | 95 | if (used_pos := process_config["used-pos"]) != "all": 96 | ms = (m for m in ms if m.part_of_speech()[0] in used_pos) 97 | 98 | match (form := process_config["tokenizer"]["others"]["form"]): 99 | case "surface": return [m.surface() for m in ms] 100 | case "normalized": return [m.normalized_form() for m in ms] 101 | case _: raise ValueError(f"invalid tokenizer form: {form}") 102 | -------------------------------------------------------------------------------- /evaluation/dataset/pairwise.py: -------------------------------------------------------------------------------- 1 | 2 | import pandas as pd 3 | from pathlib import Path 4 | 5 | from dataset.base import BaseDataset, BaseInstance 6 | from utils import build_tokenizer 7 | 8 | 9 | class PairwiseSimilarityDataset(BaseDataset): 10 | def __init__(self, samples, name=None): 11 | super(PairwiseSimilarityDataset, self).__init__(samples, name) 12 | 13 | @staticmethod 14 | def build_from_triplet(col1s, col2s, sims): 15 | """ 16 | args: 17 | - col1s (list): first elements in the given pairs 18 | - col2s (list): second elements in the given pairs 19 | - sims (list): scores 20 | - tok (Tokenizer): Tokenizer for entryies. If None, no tokenization 21 | """ 22 | assert len(col1s) == len(col2s) and len(col1s) == len(sims), \ 23 | "Inconsistent length: {} / {} / {}" \ 24 | .format(len(col1s), len(col2s), len(sims)) 25 | insts = [PairwiseSimilarityInstance(c1, c2, sim) 26 | for (c1, c2, sim) in zip(col1s, col2s, sims)] 27 | return PairwiseSimilarityDataset(samples=insts) 28 | 29 | 30 | class PairwiseSimilarityInstance(BaseInstance): 31 | def __init__(self, e1, e2, score): 32 | """ 33 | args: 34 | - e1, e2 (list): Pair of (tokenized) Strings to be evaluated. 
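            - score (int or float): human-annotated similarity score, stored as the gold label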
35 | """ 36 | super(PairwiseSimilarityInstance, self).__init__() 37 | assert type(e1) == list 38 | assert type(e2) == list 39 | assert type(score) in [ 40 | int, float], "Invalid type: {}".format(type(score)) 41 | self._triple = (e1, e2, score) 42 | self.gold = score 43 | 44 | def get_content(self): 45 | return self._triple 46 | 47 | 48 | def build_pairwise_similarity_datasets(name, dir_path, process_config): 49 | # prepare to generate dataset 50 | if process_config["use-tokenizer"] == True: 51 | tok = build_tokenizer(process_config["tokenizer"]["name"], 52 | process_config["tokenizer"]["others"]) 53 | else: 54 | tok = None 55 | 56 | if name == "tmu": 57 | datasets = build_tmu(dir_path, tok) 58 | elif name == "jwsan": 59 | datasets = build_jwsan(dir_path, tok) 60 | else: 61 | raise ValueError("Invalid dataset name: {}".format(name)) 62 | return datasets 63 | 64 | 65 | def build_tmu(dir_path, tok=None): 66 | _filenames = ['score_adj.csv', 'score_adv.csv', 67 | 'score_noun.csv', 'score_verb.csv'] 68 | base = Path(dir_path) 69 | dats = [] 70 | for fn in _filenames: 71 | df = pd.read_csv(str((base/fn).absolute()), encoding="utf-8") 72 | if tok == None: 73 | col1s = [[c] for c in list(df["word1"])] 74 | col2s = [[c] for c in list(df["word2"])] 75 | else: 76 | col1s = [tok.wakati(c) for c in list(df["word1"])] 77 | col2s = [tok.wakati(c) for c in list(df["word2"])] 78 | scores = [float(score) for score in list(df["mean"])] 79 | dat = PairwiseSimilarityDataset.build_from_triplet( 80 | col1s, col2s, scores) 81 | dat.name = "tmu-{}".format(fn) 82 | dats.append(dat) 83 | return dats 84 | 85 | 86 | def build_jwsan(dir_path, tok=None): 87 | _filenames = ['jwsan-1400.csv', 'jwsan-2145.csv'] 88 | _elms = ['similarity', 'association'] 89 | base = Path(dir_path) 90 | dats = [] 91 | for fn in _filenames: 92 | df = pd.read_csv(str((base/fn).absolute()), encoding="utf-8") 93 | if tok == None: 94 | col1s = [[c] for c in list(df["word1"])] 95 | col2s = [[c] for c in list(df["word2"])] 96 | else: 97 | col1s = [tok.wakati(c) for c in list(df["word1"])] 98 | col2s = [tok.wakati(c) for c in list(df["word2"])] 99 | for e in _elms: 100 | scores = [float(score) for score in list(df[e])] 101 | dat = PairwiseSimilarityDataset.build_from_triplet( 102 | col1s, col2s, scores) 103 | dat.name = "jwsan-{}-{}".format(fn, e) 104 | dats.append(dat) 105 | return dats 106 | -------------------------------------------------------------------------------- /training/prepare_corpus.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from collections.abc import Iterable 3 | import logging 4 | from pathlib import Path 5 | from tqdm import tqdm 6 | 7 | import sudachipy 8 | 9 | logging.basicConfig( 10 | style="{", 11 | format='{levelname} {asctime} [{module}:{funcName}:{lineno}] {message}', 12 | datefmt="%m/%d/%Y %H:%M:%S", 13 | level=logging.INFO, 14 | ) 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | def parse_args(): 19 | p = argparse.ArgumentParser( 20 | "Wakati-split given texts per line with A/B/C mode and normalized_form.") 21 | p.add_argument("--input", type=Path, 22 | help="text file (line-by-line) or a directory contains them") 23 | p.add_argument("--output", type=Path, 24 | help="directory to output") 25 | 26 | p.add_argument("--mode", type=str, default="ABC", 27 | help="split mode to use (default: ABC)") 28 | p.add_argument("--skip-existing", action="store_true", 29 | help="if set, skip processing if the output file already exists") 30 | 31 | args = 
p.parse_args() 32 | return args 33 | 34 | 35 | def list_textfiles(input: Path) -> Iterable[Path]: 36 | """iterate over text files in the input directory (or input text file)""" 37 | if input.is_file(): 38 | return [input] 39 | if input.is_dir(): 40 | return input.glob("*.txt") 41 | return [] 42 | 43 | 44 | def mode2str(mode: sudachipy.SplitMode) -> str: 45 | """convert sudachipy.SplitMode into str""" 46 | match mode: 47 | case sudachipy.SplitMode.A: return "A" 48 | case sudachipy.SplitMode.B: return "B" 49 | case sudachipy.SplitMode.C: return "C" 50 | 51 | 52 | def str2mode(modestr: str) -> sudachipy.SplitMode: 53 | """parse sudachipy.SplitMode from str""" 54 | match modestr.strip(): 55 | case "A" | "a": return sudachipy.SplitMode.A 56 | case "B" | "b": return sudachipy.SplitMode.B 57 | case "C" | "c": return sudachipy.SplitMode.C 58 | case _: raise ValueError(f"cannot parse {modestr} as SplitMode.") 59 | 60 | 61 | def output_filepath(input_file: Path, output_dir: Path, mode: sudachipy.SplitMode) -> Path: 62 | """generate output file path for the current input""" 63 | assert input_file.is_file() 64 | assert output_dir.is_dir() 65 | 66 | filename = f"{input_file.stem}_wakati_{mode2str(mode)}.txt" 67 | return output_dir / filename 68 | 69 | 70 | def count_line(file: Path) -> int: 71 | """count line of a file""" 72 | assert file.is_file() 73 | with file.open() as fi: 74 | count = sum(1 for _ in fi) 75 | return count 76 | 77 | 78 | def able_to_skip(infile: Path, outfile: Path) -> bool: 79 | """check if we can skip processing based on line count.""" 80 | if not outfile.exists(): 81 | return False 82 | 83 | lines_in = count_line(infile) 84 | lines_out = count_line(outfile) 85 | if lines_in == lines_out: 86 | return True 87 | return False 88 | 89 | 90 | def wakati(tok: sudachipy.Tokenizer, sentence: str) -> str: 91 | """tokenize given sentence by the toknizer and return their normalized form joining with spaces.""" 92 | morphemes = tok.tokenize(sentence) 93 | norm_forms = (m.normalized_form() for m in morphemes) 94 | return ' '.join(m for m in norm_forms if m != " ") 95 | 96 | 97 | def wakati_file(infile: Path, outfile: Path, tok: sudachipy.Tokenizer) -> (): 98 | """apply wakati to each line of infile and write to outfile.""" 99 | with infile.open() as fi, outfile.open("w") as fo: 100 | for line in tqdm(fi): 101 | line = line.strip() 102 | if line == "": 103 | fo.write("\n") 104 | continue 105 | fo.write(wakati(tok, line) + "\n") 106 | return 107 | 108 | 109 | def main(): 110 | args = parse_args() 111 | args.output.mkdir(parents=True, exist_ok=True) 112 | 113 | dic = sudachipy.Dictionary() 114 | for mode in map(str2mode, args.mode): 115 | tok = dic.create(mode=mode) 116 | for file in list_textfiles(args.input): 117 | outfile = output_filepath(file, args.output, mode) 118 | 119 | # check if skip 120 | if args.skip_existing and outfile.exists(): 121 | if able_to_skip(file, outfile): 122 | logger.info( 123 | f"skip {file=} with split mode {mode2str(mode)}.") 124 | continue 125 | logger.info( 126 | f"{outfile=} exists but processing seems not finished.") 127 | 128 | # process 129 | logger.info(f"process {file=} with split mode {mode2str(mode)}.") 130 | logger.info(f"output file: \"{outfile}\"") 131 | wakati_file(file, outfile, tok) 132 | return 133 | 134 | 135 | if __name__ == '__main__': 136 | main() 137 | -------------------------------------------------------------------------------- /evaluation/models/w2v.py: -------------------------------------------------------------------------------- 1 | """ 
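Word2Vec wrappers used in the evaluation: model loading, OOV handling, and phrase similarity.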
2 | Basically this is for Japanese 3 | """ 4 | 5 | from gensim.models import Word2Vec, KeyedVectors 6 | from gensim.models.keyedvectors import Word2VecKeyedVectors 7 | import logging 8 | import numpy as np 9 | from sklearn.decomposition import PCA 10 | 11 | from utils import cos_sim, get_zero_vector 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | class WordVectorizer(): 17 | def __init__(self): 18 | pass 19 | 20 | def query(self, surface, return_oov_flag=False): 21 | """ 22 | generate the vector for the given word. 23 | --- Don't OVERRIDE --- 24 | args: 25 | - surface (str): target token 26 | return: 27 | - vec (np.ndarray): generated vector 28 | - is_oov (bool): OOV flag 29 | """ 30 | v, oov_flag = self._get_vec_with_oov(surface) 31 | if return_oov_flag: 32 | return v, oov_flag 33 | else: 34 | return v 35 | 36 | def _get_vec_with_oov(self, surface): 37 | raise NotImplementedError() 38 | 39 | def query_as_batch(self, surface_list, return_oov_flag=False): 40 | if return_oov_flag == False: 41 | return np.array([self.query(s, return_oov_flag) for s in surface_list]) 42 | else: 43 | # TODO: maybe slow 44 | vecs = [] 45 | flags = [] 46 | for s in surface_list: 47 | v, oov_flag = self.query(s, return_oov_flag) 48 | vecs.append(v) 49 | flags.append(oov_flag) 50 | return np.array(vecs), flags 51 | 52 | 53 | class Word2VecAPI(WordVectorizer): 54 | """ 55 | Word2Vec API for OOV handling or others 56 | """ 57 | 58 | def __init__(self, w2v, config=None, train_desc=None): 59 | """ 60 | args: 61 | - w2v (Word2Vec or Word2VecKeyedVectors): trained word2vec 62 | - config (dict): configs for adhoc process such as centering etc... 63 | - train_desc (dict): Description. This contains info for `Training Word2Vec` 64 | """ 65 | assert type(w2v) in [ 66 | Word2Vec, KeyedVectors], "Invalid word2vec type: {}".format(type(w2v)) 67 | self._w2v = w2v 68 | self._config = config if config is not None else {} 69 | self.train_desc = train_desc if train_desc is not None else {} 70 | self.postprocess() 71 | 72 | def postprocess(self): 73 | if 'post-process' in self._config: 74 | post_config = self._config["post-process"] 75 | if 'pca' in post_config: 76 | d = post_config["pca"] 77 | assert self._w2v.vector_size > d, "dim should be lower than original" 78 | self.do_pca(d) 79 | if 'abtt' in post_config: 80 | self.do_all_but_the_top(post_config["abtt"]) 81 | else: 82 | pass # do nothing 83 | 84 | def do_pca(self, d): 85 | self._w2v.vectors = pca(self._w2v.vectors, d) 86 | self._w2v.vector_size = d 87 | 88 | def do_all_but_the_top(self, d=None): 89 | if d == None: 90 | d = self._w2v.vector_size // 100 91 | self._w2v.vectors = all_but_the_top(self._w2v.vectors, d) 92 | 93 | def _get_vec_with_oov(self, surface): 94 | if surface in self._w2v: 95 | return self._pick_vec(surface), False 96 | else: 97 | logger.warn("Out-of-vocab: surface={}".format(surface)) 98 | return self._oov_vec(surface), True 99 | 100 | def _oov_vec(self, surface): 101 | return get_zero_vector(self._w2v.vector_size) 102 | 103 | def _pick_vec(self, surface): 104 | return self._w2v[surface] 105 | 106 | def get_mean_vector(self, surfaces, ignore_oov=True): 107 | vecs, oovs = self.query_as_batch(surfaces, return_oov_flag=True) 108 | if ignore_oov: 109 | kvs = [not flag for flag in oovs] 110 | vecs = vecs[kvs] 111 | if all(oovs): 112 | logging.warn("given surfaces are all OOV: {}".format(surfaces)) 113 | vecs = np.expand_dims(get_zero_vector(self._w2v.vector_size), 114 | axis=0) 115 | return vecs.mean(axis=0) 116 | 117 | def 
cal_phrase_similarity(self, surfaces1, surfaces2): 118 | """ 119 | calculate phrase similarity. 120 | currently, averaged phrase embedding is only available. 121 | """ 122 | v1 = self.get_mean_vector(surfaces1, ignore_oov=True) 123 | v2 = self.get_mean_vector(surfaces2, ignore_oov=True) 124 | return cos_sim(v1, v2) 125 | 126 | 127 | class Morph2vecAPI(WordVectorizer): 128 | def __init__(self): 129 | raise NotImplementedError("WIP") 130 | 131 | 132 | def all_but_the_top(lookup_mat, d): 133 | """ 134 | args: 135 | - lookup_mat (np.ndarray): postprocessed vectors, shape = (n_word, dim) 136 | - d (int): number of principal components 137 | """ 138 | # centering 139 | center = lookup_mat.mean(axis=0) 140 | new_lookup_mat = lookup_mat - np.broadcast_to(center, lookup_mat.shape) 141 | 142 | # remove principal component 143 | pca = PCA(n_components=d, random_state=46) 144 | pca.fit(new_lookup_mat) 145 | sim = new_lookup_mat.dot(pca.components_.T) # shape = (n_word, d) 146 | new_lookup_mat -= new_lookup_mat.dot( 147 | pca.components_.T).dot(pca.components_) 148 | return new_lookup_mat 149 | 150 | 151 | def pca(lookup_mat, d): 152 | pca = PCA(n_components=d, random_state=46) 153 | return pca.fit_transform(lookup_mat) 154 | 155 | 156 | def build_w2v_api(w2v, config, w2v_desc): 157 | return Word2VecAPI(w2v=w2v, config=config, train_desc=w2v_desc) 158 | 159 | 160 | def build_gensim_w2v(w2v_path, load_config, other_config): 161 | if load_config["w2v-fmt"] == False: 162 | w2v = Word2Vec.load(w2v_path).wv 163 | elif load_config["w2v-fmt"] == True: 164 | if load_config["fmt"] == "bin": 165 | w2v = KeyedVectors.load_word2vec_format(w2v_path, binary=True) 166 | elif load_config["fmt"] == "txt": 167 | w2v = KeyedVectors.load_word2vec_format(w2v_path, binary=False) 168 | elif load_config["fmt"] == "kv": 169 | w2v = KeyedVectors.load(w2v_path) 170 | else: 171 | raise ValueError("Invalid format: {}".format(load_config["fmt"])) 172 | else: 173 | raise ValueError( 174 | "w2v-fmt should be bool: {}".format(load_config["w2v-fmt"])) 175 | return w2v 176 | -------------------------------------------------------------------------------- /training/train_chive.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from dataclasses import dataclass 3 | import json 4 | import logging 5 | from pathlib import Path 6 | import time 7 | 8 | from gensim.models import Word2Vec 9 | from gensim.models.word2vec import LineSentence, PathLineSentences 10 | from gensim.models.callbacks import CallbackAny2Vec 11 | 12 | 13 | logging.basicConfig( 14 | style="{", 15 | format='{levelname} {asctime} [{module}:{funcName}:{lineno}] {message}', 16 | datefmt="%m/%d/%Y %H:%M:%S", 17 | level=logging.INFO, 18 | ) 19 | logger = logging.getLogger(__name__) 20 | 21 | CHIVE_VERSION = "1.3" 22 | 23 | 24 | @dataclass 25 | class Config(): 26 | """Configuration for training word2vec model. 27 | 28 | ref: https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec 29 | """ 30 | 31 | vector_size: int = 300 32 | window: int = 8 33 | sg: 0 | 1 = 1 # Training algorithm: 1 for skip-gram; otherwise CBOW. 
34 | hs: 0 | 1 = 0 # 1 for hierachical softmax, 0 for negative sampling 35 | n_negative: int = 5 36 | threshold_downsample: float = 1e-5 37 | 38 | # value to resume training 39 | alpha: float = 0.025 # default value of gensim 40 | min_alpha: float = 0.0001 # default value of gensim 41 | 42 | @staticmethod 43 | def from_file(config_file: Path | None): 44 | """Load values from json file.""" 45 | conf = {} 46 | if config_file is not None: 47 | with config_file.open() as fi: 48 | conf = json.load(fi) 49 | return Config(**conf) 50 | 51 | 52 | class LogLossCallback(CallbackAny2Vec): 53 | """callback to log loss and time. 54 | 55 | Note that training loss is reset when you resume training.""" 56 | 57 | def __init__(self, logfile: Path, start_time: float, start_epoch: int = 0): 58 | self.epochs = start_epoch 59 | self.start_time = start_time 60 | 61 | self.loss_previous_step = 0 62 | self.time_previous_step = start_time 63 | 64 | self.logfile = logfile 65 | return 66 | 67 | def _get_and_record_loss(self, model): 68 | total_loss = model.get_latest_training_loss() 69 | current_time = total_loss - self.loss_previous_step 70 | 71 | self.loss_previous_step = total_loss 72 | return total_loss, current_time 73 | 74 | def _get_and_record_time(self): 75 | now = time.time() 76 | total_time = now - self.start_time 77 | current_time = now - self.time_previous_step 78 | 79 | self.time_previous_step = now 80 | return total_time, current_time 81 | 82 | def on_epoch_end(self, model): 83 | total_loss, current_loss = self._get_and_record_loss(model) 84 | total_time, current_time = self._get_and_record_time() 85 | 86 | with self.logfile.open("a") as f: 87 | f.write(f"{self.epochs}," 88 | f"{total_loss},{current_loss}," 89 | f"{total_time},{current_time}\n") 90 | 91 | self.epochs += 1 92 | return 93 | 94 | 95 | class CheckpointHandler(): 96 | def __init__(self, output_dir: Path, min_count: int, version: str = CHIVE_VERSION, keep_ckpt: int = 3): 97 | # values used to generate filename 98 | self.output_dir = output_dir 99 | self.version = version 100 | self.min_count = min_count 101 | 102 | self.keep = keep_ckpt 103 | self.checkpoints = self.list_checkpoints() 104 | return 105 | 106 | @staticmethod 107 | def epoch_from_file(filename: Path) -> int: 108 | """parse epoch count from a ckpt file name.""" 109 | # NOTE: this depends on the filename pattern 110 | stem = filename.stem 111 | len_prefix = stem.find("_epoch") + len("_epoch") 112 | return int(stem[len_prefix:]) 113 | 114 | def list_checkpoints(self) -> list[Path]: 115 | """list ckpt files under output directory.""" 116 | files = self.output_dir.glob(self.ckpt_filepath(epoch="*").name) 117 | files = list(sorted(files, key=self.epoch_from_file)) 118 | return files 119 | 120 | def latest_ckpt(self) -> Path | None: 121 | """return ckpt with largest epoch, or None if no ckpt found.""" 122 | if len(self.checkpoints) == 0: 123 | return None 124 | return self.checkpoints[-1] 125 | 126 | def ckpt_filepath(self, epoch: int) -> Path: 127 | """generate a path to the ckpt file with given epoch""" 128 | filename = f"chive-{self.version}-mc{self.min_count}_epoch{epoch}.bin" 129 | return self.output_dir / filename 130 | 131 | def save_ckpt(self, epoch: int, save_func): 132 | """save ckpt using given func and remove old ckpts. 133 | 134 | :param save_func: saves data to the given path. 
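        :param epoch: epoch count used to name the checkpoint file.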
135 | """ 136 | new_ckpt = self.ckpt_filepath(epoch) 137 | save_func(new_ckpt) 138 | self.checkpoints.append(new_ckpt) 139 | self.remove_old_ckpt() 140 | return 141 | 142 | def remove_old_ckpt(self): 143 | """remove old ckpts, keeping self.keep_ckpt ckpts.""" 144 | for i in range(len(self.checkpoints) - self.keep): 145 | logger.info(f"remove ckpt {self.checkpoints[i]}") 146 | self.checkpoints[i].unlink() 147 | self.checkpoints = self.checkpoints[-self.keep:] 148 | return 149 | 150 | 151 | class SaveCheckpointCallback(CallbackAny2Vec): 152 | """callback to save ckpts per specified epochs.""" 153 | 154 | def __init__(self, ckpt_handler: CheckpointHandler, save_epochs: int = 5, start_epoch: int = 0): 155 | self.ckpt_handler = ckpt_handler 156 | self.epochs = start_epoch 157 | self.save_epochs = save_epochs 158 | return 159 | 160 | def on_epoch_end(self, model): 161 | self.epochs += 1 162 | if self.epochs % self.save_epochs == 0: 163 | self.ckpt_handler.save_ckpt( 164 | epoch=self.epochs, 165 | save_func=lambda p: model.save(str(p))) 166 | return 167 | 168 | 169 | def parse_args(): 170 | p = argparse.ArgumentParser("Training word embedding by gensim") 171 | p.add_argument("--input", type=Path, 172 | help="tokenized text data (line-by-line) or directory contains them") 173 | p.add_argument("--output", type=Path, 174 | help="directory to output") 175 | 176 | p.add_argument("--epochs", type=int, default=15, 177 | help="how many epochs to run training (default 15)") 178 | p.add_argument("--min-count", type=int, default=90, 179 | help="words that appears less than this would be ignored (default 90)") 180 | p.add_argument("--save-epochs", type=int, default=5, 181 | help="save model every this epochs as checkpoint (default 5)") 182 | p.add_argument("--keep-ckpt", type=int, default=3, 183 | help="how many checkpoints to keep (default 3)") 184 | p.add_argument("--worker", type=int, default=12, 185 | help="how many threads to use during training (default 12)") 186 | 187 | p.add_argument("--config", type=Path, default=None, 188 | help="json file to load config parameters from (optional)") 189 | 190 | args = p.parse_args() 191 | return args 192 | 193 | 194 | def restart_alpha(conf: Config, total_epochs: int, ckpt_epochs: int) -> float: 195 | """calculate alpha for restarting from the given ckpt epochs. 196 | 197 | - learning rate decaies linearly from alpha to min_alpha 198 | - it depends on alpha/min_alpha/total_epoch/crr_epoch 199 | - https://github.com/piskvorky/gensim/blob/e7b441b87a967e22668a2365bcb60a13e9496215/gensim/models/word2vec.py#L1441 200 | - resuming training requires adjusted epochs (i.e. remaining epochs). 
201 | - it affects to lr calculation, and we also need to adjust start_alpha 202 | """ 203 | alpha = conf.alpha 204 | min_alpha = conf.min_alpha 205 | return alpha - (alpha - min_alpha) / total_epochs * ckpt_epochs 206 | 207 | 208 | def main(): 209 | args = parse_args() 210 | args.output.mkdir(parents=True, exist_ok=True) 211 | train_ops = Config.from_file(args.config) 212 | 213 | sents = LineSentence(args.input) if args.input.is_file() \ 214 | else PathLineSentences(args.input) 215 | 216 | ckpt_handler = CheckpointHandler( 217 | args.output, args.min_count, keep_ckpt=args.keep_ckpt) 218 | logfile = args.output / f"TrainLog-mc{args.min_count}.csv" 219 | 220 | latest_ckpt = ckpt_handler.latest_ckpt() 221 | if latest_ckpt is None: 222 | logger.info(f"training from scratch") 223 | model = Word2Vec( 224 | sents, 225 | workers=args.worker, 226 | vector_size=train_ops.vector_size, 227 | window=train_ops.window, 228 | sample=train_ops.threshold_downsample, 229 | sg=train_ops.sg, 230 | hs=train_ops.hs, 231 | negative=train_ops.n_negative, 232 | epochs=args.epochs, 233 | min_count=args.min_count, 234 | alpha=train_ops.alpha, 235 | min_alpha=train_ops.min_alpha, 236 | compute_loss=True, 237 | callbacks=[ 238 | SaveCheckpointCallback(ckpt_handler, args.save_epochs), 239 | LogLossCallback(logfile, time.time()), 240 | ], 241 | ) 242 | else: 243 | # Resume training from the checkpoint. 244 | # Assume to use same corpus and parameters. 245 | logger.info(f"checkpoint found: {latest_ckpt}") 246 | ckpt_epochs = ckpt_handler.epoch_from_file(latest_ckpt) 247 | if ckpt_epochs >= args.epochs: 248 | logger.info( 249 | f"training seems already finished ({latest_ckpt} exists).") 250 | return 251 | 252 | model = Word2Vec.load(str(latest_ckpt)) 253 | model.train( 254 | sents, 255 | total_examples=model.corpus_count, 256 | epochs=args.epochs - ckpt_epochs, 257 | start_alpha=restart_alpha(train_ops, args.epochs, ckpt_epochs), 258 | end_alpha=train_ops.min_alpha, 259 | compute_loss=True, 260 | callbacks=[ 261 | SaveCheckpointCallback( 262 | ckpt_handler, args.save_epochs, start_epoch=ckpt_epochs), 263 | LogLossCallback(logfile, time.time(), start_epoch=ckpt_epochs), 264 | ], 265 | ) 266 | # fix value changed by calling model.train 267 | model.alpha = train_ops.alpha 268 | model.epochs = args.epochs 269 | 270 | logger.info(f"finish training and save model.") 271 | model.save(str(ckpt_handler.ckpt_filepath(args.epochs))) 272 | return 273 | 274 | 275 | if __name__ == '__main__': 276 | main() 277 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # chiVe: Sudachi による日本語単語ベクトル 2 | 3 | [English README](README_en.md) 4 | 5 | ## 概要 6 | 7 | "chiVe" (チャイブ, Suda**chi Vec**tor) は、大規模コーパスと複数粒度分割に基づく日本語単語ベクトルです。 8 | 9 | [Skip-gram アルゴリズム](https://arxiv.org/abs/1301.3781)を元に、word2vec ([gensim](https://radimrehurek.com/gensim/)) を使用して単語分散表現を構築しています。 10 | 11 | 学習コーパスには、v1.0-v1.2 では約 1 億のウェブページ文章を含む国立国語研究所の[日本語ウェブコーパス(NWJC)](https://masayu-a.github.io/NWJC/)、 12 | v1.3 では [CommonCrawl](https://commoncrawl.org/) から取得したウェブページ文章を採用しています。 13 | 14 | 分かち書きにはワークスアプリケーションズの形態素解析器 [Sudachi](https://github.com/WorksApplications/Sudachi) を使用しています。 15 | Sudachi で定義されている A/B/C の 3 つの分割単位でコーパスを解析した結果を元に分散表現の学習を行なっています。 16 | 17 | ## データ 18 | 19 | SudachiDict と chiVe のデータは、AWS の [Open Data Sponsorship Program](https://registry.opendata.aws/sudachi/) によりホストしていただいています。 20 | 21 | | 版 | 最低頻度 | 正規化 | 語彙数 | テキスト | [gensim](https://radimrehurek.com/gensim/) | [Magnitude](https://github.com/plasticityai/magnitude) | 22 | | --------- | -------- | ------ | --------- | --------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------- | 23 | | v1.3 mc5 | 5 | o | 2,530,791 | 3.6GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc5.tar.gz)) | 2.9GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc5_gensim.tar.gz)) | - | 24 | | v1.3 mc15 | 15 | o | 1,186,019 | 1.7GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc15.tar.gz)) | 1.3GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc15_gensim.tar.gz)) | - | 25 | | v1.3 mc30 | 30 | o | 759,011 | 1.1GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc30.tar.gz)) | 0.8GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc30_gensim.tar.gz)) | - | 26 | | v1.3 mc90 | 90 | o | 410,533 | 0.6GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc90.tar.gz)) | 0.5GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc90_gensim.tar.gz)) | - | 27 | | | | | | | | | 28 | | v1.2 mc5 | 5 | o | 3,197,456 | 9.2GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc5.tar.gz)) | 3.8GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc5_gensim.tar.gz)) | 5.5GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc5.magnitude)) | 29 | | v1.2 mc15 | 15 | o | 1,454,280 | 5.0GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc15.tar.gz)) | 1.7GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc15_gensim.tar.gz)) | 2.4GB 
([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc15.magnitude)) | 30 | | v1.2 mc30 | 30 | o | 912,550 | 3.1GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc30.tar.gz)) | 1.1GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc30_gensim.tar.gz)) | 1.5GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc30.magnitude)) | 31 | | v1.2 mc90 | 90 | o | 482,223 | 1.7GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc90.tar.gz)) | 0.6GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc90_gensim.tar.gz)) | 0.8GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc90.magnitude)) | 32 | | | | | | | | | 33 | | v1.1 mc5 | 5 | o | 3,196,481 | 11GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc5.tar.gz)) | 3.6GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc5_gensim.tar.gz)) | 5.5GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc5.magnitude)) | 34 | | v1.1 mc15 | 15 | o | 1,452,205 | 4.7GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc15.tar.gz)) | 1.7GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc15_gensim.tar.gz)) | 2.4GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc15.magnitude)) | 35 | | v1.1 mc30 | 30 | o | 910,424 | 3.0GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc30.tar.gz)) | 1.1GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc30_gensim.tar.gz)) | 1.5GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc30.magnitude)) | 36 | | v1.1 mc90 | 90 | o | 480,443 | 1.6GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc90.tar.gz)) | 0.6GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc90_gensim.tar.gz)) | 0.8GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc90.magnitude)) | 37 | | v1.0 mc5 | 5 | x | 3,644,628 | 12GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.0-mc5.tar.gz)) | 4.1GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.0-mc5_gensim.tar.gz)) | 6.3GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.0-mc5.magnitude)) | 38 | 39 | 全て 300 次元のベクトルです。 40 | 41 | 「最低頻度」は、コーパス内での単語出現回数での足切り基準([gensim](https://radimrehurek.com/gensim/models/word2vec.html) での `min_count` )です。 42 | 43 | 「正規化」は、形態素解析器 Sudachi による表記統制です。例えば `空き缶`, `空缶`, `空き罐`, `空罐`, `空きカン`, `空きかん` はすべて正規化表記 `空き缶` に統制されます。 44 | 45 | | 版 | Sudachi | Sudachi 辞書 | 学習コーパス | 46 | | ---- | ------- | --------------------- | ---------------------------------------------------- | 47 | | v1.3 | v0.6.8 | 20240109-core | CommonCrawl (CC-MAIN-2022-40, warc, first 20k files) | 48 | | v1.2 | v0.4.3 | 20200722-core | NWJC | 49 | | v1.1 | v0.3.0 | 20191030-core | NWJC | 50 | | v1.0 | v0.1.1 | 0.1.1-dictionary-full | NWJC | 51 | 52 | 全て同じ学習アルゴリズムを使用しています。詳細は [training](training) を参照してください。 53 | 54 | ### 「A 単位語のみ」の資源 55 | 56 | [Sudachi 辞書](https://github.com/WorksApplications/SudachiDict)にある A 単位語のみを含む資源です(A 単位語のみでの再学習ではなく、上にある元資源から、B 単位語、C 単位語、OOV 語(Out-of-vocabulary, 辞書にない語)を除いたものです)。 57 | 58 | `v1.1 mc90 aunit` が、自然言語処理ツール [spaCy](https://github.com/explosion/spaCy/) の日本語モデルに使われています。 59 | 60 | | 版 | 語彙数 | テキスト | 
[gensim](https://radimrehurek.com/gensim/) | [Magnitude](https://github.com/plasticityai/magnitude) | 61 | | --------------- | --------------- | --------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------- | 62 | | v1.1 mc5 aunit | 322,094 (10.1%) | 1.1GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc5-aunit.tar.gz)) | 0.4GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc5-aunit_gensim.tar.gz)) | 0.5GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc5-aunit.magnitude)) | 63 | | v1.1 mc15 aunit | 276,866 (19.1%) | 1.0GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc15-aunit.tar.gz)) | 0.3GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc15-aunit_gensim.tar.gz)) | 0.4GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc15-aunit.magnitude)) | 64 | | v1.1 mc30 aunit | 242,658 (26.7%) | 0.8GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc30-aunit.tar.gz)) | 0.3GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc30-aunit_gensim.tar.gz)) | 0.4GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc30-aunit.magnitude)) | 65 | | v1.1 mc90 aunit | 189,775 (39.5%) | 0.7GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc90-aunit.tar.gz)) | 0.2GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc90-aunit_gensim.tar.gz)) | 0.3GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc90-aunit.magnitude)) | 66 | 67 | ### 追加学習用のフルモデル 68 | 69 | chiVe は、各ドメイン(分野)に合わせたデータで追加学習させられます。 70 | chiVe は、追加学習なしでも利用できますが、追加学習することでそのドメイン(分野)でのタスクの性能改善が期待できます。 71 | 72 | chiVe を追加学習するためには、フルモデルを使用してください。詳しい使用方法は、[チュートリアル](docs/continue-training.md)をご覧ください。 73 | 74 | | 版 | [gensim](https://radimrehurek.com/gensim/) (full) | 75 | | --------- | --------------------------------------------------------------------------------------------------------- | 76 | | v1.3 mc5 | 5.5GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc5_gensim-full.tar.gz)) | 77 | | v1.3 mc15 | 2.6GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc15_gensim-full.tar.gz)) | 78 | | v1.3 mc30 | 1.7GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc30_gensim-full.tar.gz)) | 79 | | v1.3 mc90 | 0.9GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc90_gensim-full.tar.gz)) | 80 | | | | 81 | | v1.2 mc5 | 6.7GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc5_gensim-full.tar.gz)) | 82 | | v1.2 mc15 | 3.0GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc15_gensim-full.tar.gz)) | 83 | | v1.2 mc30 | 1.9GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc30_gensim-full.tar.gz)) | 84 | | v1.2 mc90 | 1.0GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc90_gensim-full.tar.gz)) | 85 | 86 | ## 利用方法 87 | 88 | 「テキスト」「gensim」「Magnitude」という 3 つのフォーマットでデータを公開しています。 89 | 90 | ### テキスト 91 | 92 | プレーンテキスト形式のデータ(オリジナルの word2vec C フォーマット)です。 93 | 94 | ``` 95 | 480443 300 
96 | の -0.08274004 -0.091033645 -0.08744463 -0.14393683 -0.053159036 ... 97 | 、 -0.014216528 -0.1027064 -0.07763326 -0.16008057 -0.16116066 ... 98 | て -0.06049706 -0.15483096 0.052628547 -0.12448246 -0.14404581 ... 99 | ... 100 | ``` 101 | 102 | ### gensim 103 | 104 | ライブラリ [gensim](https://radimrehurek.com/gensim/) のための、[KeyedVectors](https://radimrehurek.com/gensim/models/keyedvectors.html) 形式のデータです。 105 | 106 | ```py 107 | import gensim 108 | 109 | vectors = gensim.models.KeyedVectors.load("./chive-1.1-mc90_gensim/chive-1.1-mc90.kv") 110 | 111 | "すだち" in vectors # False, v1.1 では正規化されているため 112 | "酢橘" in vectors # True 113 | 114 | vectors["酢橘"] 115 | # array([-5.68204783e-02, -1.26615226e-01, 3.53190415e-02, -3.67305875e-01, ...]) 116 | 117 | vectors.similarity("酢橘", "徳島") 118 | # 0.3993048 119 | 120 | vectors.most_similar("徳島", topn=5) 121 | # [('愛媛', 0.8229734897613525), 122 | # ('徳島県', 0.786933422088623), 123 | # ('高知', 0.7795713543891907), 124 | # ('岡山', 0.7623447179794312), 125 | # ('徳島市', 0.7415297031402588)] 126 | 127 | vectors.most_similar(positive=["阿波", "高知"], negative=["徳島"], topn=5) 128 | # [('土佐', 0.620033860206604), 129 | # ('阿波踊り', 0.5988592505455017), 130 | # ('よさこい祭り', 0.5783430337905884), 131 | # ('安芸', 0.564490556716919), 132 | # ('高知県', 0.5591559410095215)] 133 | ``` 134 | 135 | ### Magnitude (~v1.2) 136 | 137 | ライブラリ [Magnitude](https://github.com/plasticityai/magnitude) 形式のデータです。デフォルトのパラメーターで変換されています(高度な未知語サポート有り、近似最近傍インデックス無し。Magnitude が公開しているモデルの `Medium` 相当)。 138 | 139 | ```py 140 | from pymagnitude import Magnitude 141 | 142 | vectors = Magnitude("chive1.1-mc90.magnitude") 143 | 144 | "すだち" in vectors # False, v1.1 では正規化されているため 145 | "酢橘" in vectors # True 146 | 147 | vectors.query("すだち") # Magnitudeによるサブワードを使った未知語サポートによる 148 | # array([ 0.03974148, 0.11290773, 0.01493122, -0.05296252, 0.12616251, ...]) 149 | 150 | vectors.most_similar("すだち", topn=5) 151 | # [('あだち', 0.5930323079944302), 152 | # ('すだ椎', 0.5872662462335323), 153 | # ('だち', 0.5797546444016177), 154 | # ('ムクノキ', 0.46228053338159725), 155 | # ('椨', 0.4482612387097178)] 156 | 157 | vectors.similarity("酢橘", "徳島") 158 | # 0.3993048 159 | 160 | vectors.most_similar("徳島", topn=5) 161 | # [('愛媛', 0.8229735), 162 | # ('徳島県', 0.78693324), 163 | # ('高知', 0.7795714), 164 | # ('岡山', 0.7623447), 165 | # ('徳島市', 0.7415296)] 166 | 167 | vectors.closer_than("徳島", "徳島市") 168 | # ['愛媛', '徳島県', '高知', '岡山'] 169 | 170 | vectors.most_similar(positive=["阿波", "高知"], negative=["徳島"], topn=5) 171 | # [('土佐', 0.62003386), 172 | # ('阿波踊り', 0.5988593), 173 | # ('よさこい祭り', 0.578343), 174 | # ('安芸', 0.56449056), 175 | # ('高知県', 0.55915594)] 176 | 177 | vectors.most_similar_cosmul(positive=["阿波", "高知"], negative=["徳島"], topn=5) 178 | # [('土佐', 0.83830714), 179 | # ('よさこい祭り', 0.82048166), 180 | # ('阿波踊り', 0.8168015), 181 | # ('安芸', 0.80880433), 182 | # ('伊予', 0.80250806)] 183 | ``` 184 | 185 | ライブラリを使っての、ダウンロード、リモートでのロード、HTTP 上のリモートでのストリームも可能です。 186 | 187 | ```py 188 | from pymagnitude import Magnitude, MagnitudeUtils 189 | 190 | # ダウンロード 191 | vectors = Magnitude(MagnitudeUtils.download_model("chive-1.1-mc90-aunit", remote_path="https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/")) 192 | # デフォルトのダウンロード先: `~/.magnitude/` 193 | # ファイルが既にダウンロードされている場合は、再度ダウンロードしない 194 | # 引数 `download_dir` でローカルのダウンロード先を変更できる 195 | 196 | # リモートでのロード 197 | vectors = Magnitude("https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc90-aunit.magnitude") 198 | 199 | # HTTP上のリモートでのストリーム 200 | vectors = 
Magnitude("https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc90-aunit.magnitude", stream=True) 201 | vectors.query("徳島") # ローカルにファイルをダウンロードせず、ベクトルをすばやく取得 202 | ``` 203 | 204 | ## ライセンス 205 | 206 | ### v1.0, v1.1, v1.2 207 | 208 | Copyright (c) 2020 National Institute for Japanese Language and Linguistics and Works Applications Co., Ltd. All rights reserved. 209 | 210 | [Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0) の下で[国立国語研究所](https://www.ninjal.ac.jp/)と[株式会社ワークスアプリケーションズ](https://www.worksap.co.jp/)によって提供されています。 211 | 212 | ### v1.3 213 | 214 | Copyright (c) 2024 Works Applications Co., Ltd. All rights reserved. 215 | 216 | [Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0) の下で[株式会社ワークスアプリケーションズ](https://www.worksap.co.jp/)によって提供されています。 217 | 218 | ## Slack 219 | 220 | 開発者やユーザーの方々が質問したり議論するための Slack ワークスペースを用意しています。 221 | 222 | - https://sudachi-dev.slack.com/ 223 | - ([こちら](https://join.slack.com/t/sudachi-dev/shared_invite/enQtMzg2NTI2NjYxNTUyLTMyYmNkZWQ0Y2E5NmQxMTI3ZGM3NDU0NzU4NGE1Y2UwYTVmNTViYjJmNDI0MWZiYTg4ODNmMzgxYTQ3ZmI2OWU)から招待を受けてください) 224 | 225 | ## chiVe の引用 226 | 227 | chiVe について、論文を発表しています; 228 | 229 | - 真鍋陽俊, 岡照晃, 海川祥毅, 髙岡一馬, 内田佳孝, 浅原正幸. [複数粒度の分割結果に基づく日本語単語分散表現](https://www.anlp.jp/proceedings/annual_meeting/2019/pdf_dir/P8-5.pdf). 言語処理学会第 25 回年次大会, 2019. 230 | - 河村宗一郎, 久本空海, 真鍋陽俊, 高岡一馬, 内田佳孝, 岡照晃, 浅原正幸. [chiVe 2.0: Sudachi と NWJC を用いた実用的な日本語単語ベクトルの実現へ向けて](https://www.anlp.jp/proceedings/annual_meeting/2020/pdf_dir/P6-16.pdf). 言語処理学会第 26 回年次大会, 2020. 231 | - 久本空海, 山村崇, 勝田哲弘, 竹林佑斗, 髙岡一馬, 内田佳孝, 岡照晃, 浅原正幸. [chiVe: 製品利用可能な日本語単語ベクトル資源の実現へ向けて](https://www.ieice.org/ken/paper/20200910U1zQ/). 第 16 回テキストアナリティクス・シンポジウム, 2020. ([スライド](https://speakerdeck.com/sorami/chive-zhi-pin-li-yong-ke-neng-nari-ben-yu-dan-yu-bekutoruzi-yuan-falseshi-xian-hexiang-kete)) 232 | 233 | chiVe を論文や書籍、サービスなどで引用される際には、以下の BibTex をご利用ください(基本的には、1 本目の(真鍋+ 2019)を引用してください)。 234 | 235 | ``` 236 | @INPROCEEDINGS{manabe2019chive, 237 | author = {真鍋陽俊, 岡照晃, 海川祥毅, 髙岡一馬, 内田佳孝, 浅原正幸}, 238 | title = {複数粒度の分割結果に基づく日本語単語分散表現}, 239 | booktitle = "言語処理学会第25回年次大会(NLP2019)", 240 | year = "2019", 241 | pages = "NLP2019-P8-5", 242 | publisher = "言語処理学会", 243 | } 244 | ``` 245 | 246 | ``` 247 | @INPROCEEDINGS{kawamura2020chive, 248 | author = {河村宗一郎, 久本空海, 真鍋陽俊, 高岡一馬, 内田佳孝, 岡照晃, 浅原正幸}, 249 | title = {chiVe 2.0: SudachiとNWJCを用いた実用的な日本語単語ベクトルの実現へ向けて}, 250 | booktitle = "言語処理学会第26回年次大会(NLP2020)", 251 | year = "2020", 252 | pages = "NLP2020-P6-16", 253 | publisher = "言語処理学会", 254 | } 255 | ``` 256 | 257 | ``` 258 | @INPROCEEDINGS{hisamoto2020chive, 259 | author = {久本空海, 山村崇, 勝田哲弘, 竹林佑斗, 髙岡一馬, 内田佳孝, 岡照晃, 浅原正幸}, 260 | title = {chiVe: 製品利用可能な日本語単語ベクトル資源の実現へ向けて}, 261 | booktitle = "第16回テキストアナリティクス・シンポジウム", 262 | year = "2020", 263 | pages = "IEICE-NLC2020-9", 264 | publisher = "電子情報通信学会", 265 | } 266 | ``` 267 | -------------------------------------------------------------------------------- /README_en.md: -------------------------------------------------------------------------------- 1 | # chiVe: Japanese Word Embedding with Sudachi 2 | 3 | [日本語 README](README.md) 4 | 5 | ## Abstract 6 | 7 | "chiVe" (Suda**chi Ve**ctor) is a Japanese pre-trained word embedding resource using large-scale corpus and multi-granular tokenization. 8 | 9 | Based on the [skip-gram algorithm](https://arxiv.org/abs/1301.3781), we used word2vec ([gensim](https://radimrehurek.com/gensim/)) to train the vectors. 
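For illustration only (the real training code is in the [training](training) directory, and the corpus path and several hyperparameters below are placeholders rather than chiVe's exact settings), a skip-gram training run with gensim looks roughly like this:

```py
# Rough sketch of skip-gram training with gensim (illustrative only).
# "path/to/tokenized_corpus.txt" is a placeholder for a pre-tokenized corpus,
# one Sudachi-tokenized sentence per line.
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

corpus = LineSentence("path/to/tokenized_corpus.txt")
model = Word2Vec(
    corpus,
    vector_size=300,  # chiVe vectors are 300-dimensional
    sg=1,             # skip-gram
    min_count=5,      # cf. the "Min Count" column in the Data table below
    epochs=15,
    workers=4,
)
model.wv.save("example.kv")  # save only the KeyedVectors
```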
10 | 11 | For v1.0-v1.2, we used the [NINJAL Web Japanese Corpus (NWJC)](https://masayu-a.github.io/NWJC/), a corpus from the National Institute for Japanese Language and Linguistics containing text from around 100 million web pages, as the training corpus. 12 | For v1.3, we used texts taken from [CommonCrawl](https://commoncrawl.org/). 13 | 14 | We used [Sudachi](https://github.com/WorksApplications/Sudachi) by Works Applications for tokenization. 15 | We used Sudachi's multi-granular tokenization results of the corpus to train the word vectors. 16 | 17 | ## Data 18 | 19 | Data are generously hosted by AWS with their [Open Data Sponsorship Program](https://registry.opendata.aws/sudachi/). 20 | 21 | | Version | Min Count | Normalized | Vocab | Text | [gensim](https://radimrehurek.com/gensim/) | [Magnitude](https://github.com/plasticityai/magnitude) | 22 | | --------- | --------- | ---------- | --------- | --------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------- | 23 | | v1.3 mc5 | 5 | o | 2,530,791 | 3.6GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc5.tar.gz)) | 2.9GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc5_gensim.tar.gz)) | - | 24 | | v1.3 mc15 | 15 | o | 1,186,019 | 1.7GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc15.tar.gz)) | 1.3GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc15_gensim.tar.gz)) | - | 25 | | v1.3 mc30 | 30 | o | 759,011 | 1.1GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc30.tar.gz)) | 0.8GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc30_gensim.tar.gz)) | - | 26 | | v1.3 mc90 | 90 | o | 410,533 | 0.6GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc90.tar.gz)) | 0.5GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc90_gensim.tar.gz)) | - | 27 | | | | | | | | | 28 | | v1.2 mc5 | 5 | o | 3,197,456 | 9.2GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc5.tar.gz)) | 3.8GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc5_gensim.tar.gz)) | 5.5GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc5.magnitude)) | 29 | | v1.2 mc15 | 15 | o | 1,454,280 | 5.0GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc15.tar.gz)) | 1.7GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc15_gensim.tar.gz)) | 2.4GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc15.magnitude)) | 30 | | v1.2 mc30 | 30 | o | 912,550 | 3.1GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc30.tar.gz)) | 1.1GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc30_gensim.tar.gz)) | 1.5GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc30.magnitude)) | 31 | | v1.2 mc90 | 90 | o | 482,223 | 1.7GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc90.tar.gz)) | 0.6GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc90_gensim.tar.gz)) | 0.8GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc90.magnitude)) | 32 | | | | | | | 
| | 33 | | v1.1 mc5 | 5 | o | 3,196,481 | 11GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc5.tar.gz)) | 3.6GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc5_gensim.tar.gz)) | 5.5GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc5.magnitude)) | 34 | | v1.1 mc15 | 15 | o | 1,452,205 | 4.7GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc15.tar.gz)) | 1.7GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc15_gensim.tar.gz)) | 2.4GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc15.magnitude)) | 35 | | v1.1 mc30 | 30 | o | 910,424 | 3.0GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc30.tar.gz)) | 1.1GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc30_gensim.tar.gz)) | 1.5GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc30.magnitude)) | 36 | | v1.1 mc90 | 90 | o | 480,443 | 1.6GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc90.tar.gz)) | 0.6GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc90_gensim.tar.gz)) | 0.8GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc90.magnitude)) | 37 | | v1.0 mc5 | 5 | x | 3,644,628 | 12GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.0-mc5.tar.gz)) | 4.1GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.0-mc5_gensim.tar.gz)) | 6.3GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.0-mc5.magnitude)) | 38 | 39 | All vectors have 300 dimensions. 40 | 41 | "Min Count" indicates the number of minimum appearance count in the training corpus (`min_count` in [gensim](https://radimrehurek.com/gensim/models/word2vec.html)). 42 | 43 | "Normalized" indicates if the text is normalized using the tokenizer Sudachi. For example, words `空き缶`, `空缶`, `空き罐`, `空罐`, `空きカン`, `空きかん` will all be normalized to `空き缶`. 44 | 45 | | version | Sudachi | SudachiDict | Training Corpus | 46 | | ------- | ------- | --------------------- | ---------------------------------------------------- | 47 | | v1.3 | v0.6.8 | 20240109-core | CommonCrawl (CC-MAIN-2022-40, warc, first 20k files) | 48 | | v1.2 | v0.4.3 | 20200722-core | NWJC | 49 | | v1.1 | v0.3.0 | 20191030-core | NWJC | 50 | | v1.0 | v0.1.1 | 0.1.1-dictionary-full | NWJC | 51 | 52 | The training algorithm is the same. See [training](training) for the details. 53 | 54 | ### "A Unit Only" Resources 55 | 56 | These files contain only the [SudachiDict](https://github.com/WorksApplications/SudachiDict) A unit words (Not re-training; Simply excluding B unit words, C unit words, and OOV (Out-of-vocabulary) words from the above original resources). 57 | 58 | `v1.1 mc90 aunit` is used for the natural language processing tool [spaCy](https://github.com/explosion/spaCy/)'s Japanese models. 
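As a rough, unofficial illustration of that integration, the sketch below shows how such vectors are typically accessed through spaCy; the model name `ja_core_news_md` and its exact contents are assumptions here, so check the spaCy documentation for the model you actually install:

```py
# Hypothetical sketch: looking up word vectors through a spaCy Japanese pipeline.
# Assumes spaCy and a vector-bearing Japanese model (e.g. ja_core_news_md) are installed.
import spacy

nlp = spacy.load("ja_core_news_md")
sudachi = nlp.vocab["酢橘"]
tokushima = nlp.vocab["徳島"]
print(sudachi.has_vector)             # whether the lexeme has an entry in the vector table
print(sudachi.similarity(tokushima))  # cosine similarity between the two lexemes
```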
59 | 60 | | Version | Vocab | Text | [gensim](https://radimrehurek.com/gensim/) | [Magnitude](https://github.com/plasticityai/magnitude) | 61 | | --------------- | --------------- | --------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------- | 62 | | v1.1 mc5 aunit | 322,094 (10.1%) | 1.1GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc5-aunit.tar.gz)) | 0.4GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc5-aunit_gensim.tar.gz)) | 0.5GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc5-aunit.magnitude)) | 63 | | v1.1 mc15 aunit | 276,866 (19.1%) | 1.0GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc15-aunit.tar.gz)) | 0.3GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc15-aunit_gensim.tar.gz)) | 0.4GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc15-aunit.magnitude)) | 64 | | v1.1 mc30 aunit | 242,658 (26.7%) | 0.8GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc30-aunit.tar.gz)) | 0.3GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc30-aunit_gensim.tar.gz)) | 0.4GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc30-aunit.magnitude)) | 65 | | v1.1 mc90 aunit | 189,775 (39.5%) | 0.7GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc90-aunit.tar.gz)) | 0.2GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc90-aunit_gensim.tar.gz)) | 0.3GB ([.magnitude](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc90-aunit.magnitude)) | 66 | 67 | ### Continue Training chiVe 68 | 69 | Although chiVe can be used as it is, you can continue to train chiVe with your own data to improve the performance of your tasks. 70 | 71 | A full model is required for further training. 72 | See the [tutorial](docs/continue-training.md) for details on how to use it. 
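As a minimal sketch of what that can look like with gensim (the tutorial above remains the authoritative reference; the extracted model path, the corpus file name, and the epoch count below are placeholders):

```py
# Minimal sketch: continue training a chiVe full model on in-domain text with gensim.
# All paths and the epoch count are illustrative placeholders.
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

model = Word2Vec.load("path/to/extracted/full_model")  # the gensim full model, not the .kv file
corpus = LineSentence("my_domain_corpus.txt")          # pre-tokenized text, one sentence per line

model.build_vocab(corpus, update=True)                 # add new in-domain words to the vocabulary
model.train(corpus, total_examples=model.corpus_count, epochs=5)

model.wv.save("chive-mydomain.kv")                     # keep only the KeyedVectors for downstream use
```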
73 | 74 | | Version | [gensim](https://radimrehurek.com/gensim/) (full) | 75 | | --------- | --------------------------------------------------------------------------------------------------------- | 76 | | v1.3 mc5 | 5.5GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc5_gensim-full.tar.gz)) | 77 | | v1.3 mc15 | 2.6GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc15_gensim-full.tar.gz)) | 78 | | v1.3 mc30 | 1.7GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc30_gensim-full.tar.gz)) | 79 | | v1.3 mc90 | 0.9GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.3-mc90_gensim-full.tar.gz)) | 80 | | | | 81 | | v1.2 mc5 | 6.7GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc5_gensim-full.tar.gz)) | 82 | | v1.2 mc15 | 3.0GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc15_gensim-full.tar.gz)) | 83 | | v1.2 mc30 | 1.9GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc30_gensim-full.tar.gz)) | 84 | | v1.2 mc90 | 1.0GB ([tar.gz](https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.2-mc90_gensim-full.tar.gz)) | 85 | 86 | ## Usage | 87 | 88 | We provide the data in 3 formats: Text, gensim, and Magnitude. 89 | 90 | ### Text 91 | 92 | Data in plain text (the original word2vec C format). 93 | 94 | ```txt:chive-1.1-mc90/chive-1.1-mc90.txt 95 | 480443 300 96 | の -0.08274004 -0.091033645 -0.08744463 -0.14393683 -0.053159036 ... 97 | 、 -0.014216528 -0.1027064 -0.07763326 -0.16008057 -0.16116066 ... 98 | て -0.06049706 -0.15483096 0.052628547 -0.12448246 -0.14404581 ... 99 | ... 100 | ``` 101 | 102 | ### gensim 103 | 104 | Data for the library [gensim](https://radimrehurek.com/gensim/), in [KeyedVectors](https://radimrehurek.com/gensim/models/keyedvectors.html) format. 105 | 106 | ```py 107 | import gensim 108 | 109 | vectors = gensim.models.KeyedVectors.load("./chive-1.1-mc90_gensim/chive-1.1-mc90.kv") 110 | 111 | "すだち" in vectors # False, because in v1.1 all vocabs are normalized 112 | "酢橘" in vectors # True 113 | 114 | vectors["酢橘"] 115 | # array([-5.68204783e-02, -1.26615226e-01, 3.53190415e-02, -3.67305875e-01, ...]) 116 | 117 | vectors.similarity("酢橘", "徳島") 118 | # 0.3993048 119 | 120 | vectors.most_similar("徳島", topn=5) 121 | # [('愛媛', 0.8229734897613525), 122 | # ('徳島県', 0.786933422088623), 123 | # ('高知', 0.7795713543891907), 124 | # ('岡山', 0.7623447179794312), 125 | # ('徳島市', 0.7415297031402588)] 126 | 127 | vectors.most_similar(positive=["阿波", "高知"], negative=["徳島"], topn=5) 128 | # [('土佐', 0.620033860206604), 129 | # ('阿波踊り', 0.5988592505455017), 130 | # ('よさこい祭り', 0.5783430337905884), 131 | # ('安芸', 0.564490556716919), 132 | # ('高知県', 0.5591559410095215)] 133 | ``` 134 | 135 | ### Magnitude (~v1.2) 136 | 137 | Data converted for the library [Magnitude](https://github.com/plasticityai/magnitude), using the default parameters, i.e., it includes advanced out-of-vocabulary key support using subword information, but does not include an approximate nearest neighbours index (equivalent to their `Medium`). 
138 | 139 | ```py 140 | from pymagnitude import Magnitude 141 | 142 | vectors = Magnitude("chive-1.1-mc90.magnitude") 143 | 144 | "すだち" in vectors # False, because in v1.1 all vocabs are normalized 145 | "酢橘" in vectors # True 146 | 147 | vectors.query("すだち") # via Magnitude's OOV feature using subword information 148 | # array([ 0.03974148, 0.11290773, 0.01493122, -0.05296252, 0.12616251, ...]) 149 | 150 | vectors.most_similar("すだち", topn=5) 151 | # [('あだち', 0.5930323079944302), 152 | # ('すだ椎', 0.5872662462335323), 153 | # ('だち', 0.5797546444016177), 154 | # ('ムクノキ', 0.46228053338159725), 155 | # ('椨', 0.4482612387097178)] 156 | 157 | vectors.similarity("酢橘", "徳島") 158 | # 0.3993048 159 | 160 | vectors.most_similar("徳島", topn=5) 161 | # [('愛媛', 0.8229735), 162 | # ('徳島県', 0.78693324), 163 | # ('高知', 0.7795714), 164 | # ('岡山', 0.7623447), 165 | # ('徳島市', 0.7415296)] 166 | 167 | vectors.closer_than("徳島", "徳島市") 168 | # ['愛媛', '徳島県', '高知', '岡山'] 169 | 170 | vectors.most_similar(positive=["阿波", "高知"], negative=["徳島"], topn=5) 171 | # [('土佐', 0.62003386), 172 | # ('阿波踊り', 0.5988593), 173 | # ('よさこい祭り', 0.578343), 174 | # ('安芸', 0.56449056), 175 | # ('高知県', 0.55915594)] 176 | 177 | vectors.most_similar_cosmul(positive=["阿波", "高知"], negative=["徳島"], topn=5) 178 | # [('土佐', 0.83830714), 179 | # ('よさこい祭り', 0.82048166), 180 | # ('阿波踊り', 0.8168015), 181 | # ('安芸', 0.80880433), 182 | # ('伊予', 0.80250806)] 183 | ``` 184 | 185 | You can also download, remote-load, or remote-stream the vectors over HTTP using the library. 186 | 187 | ```py 188 | from pymagnitude import Magnitude, MagnitudeUtils 189 | 190 | # Download 191 | vectors = Magnitude(MagnitudeUtils.download_model("chive-1.1-mc90-aunit", remote_path="https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/")) 192 | # default download dir: `~/.magnitude/` 193 | # If the file has already been downloaded, it won't be downloaded again 194 | # You can change the download dir using the `download_dir` argument 195 | 196 | # Remote Loading 197 | vectors = Magnitude("https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc90-aunit.magnitude") 198 | 199 | # Remote Streaming over HTTP 200 | vectors = Magnitude("https://sudachi.s3-ap-northeast-1.amazonaws.com/chive/chive-1.1-mc90-aunit.magnitude", stream=True) 201 | vectors.query("徳島") # Returns the vector quickly, even with no local file downloaded 202 | ``` 203 | 204 | ## Licence 205 | 206 | ### v1.0, v1.1, v1.2 207 | 208 | Copyright (c) 2020 National Institute for Japanese Language and Linguistics and Works Applications Co., Ltd. All rights reserved. 209 | 210 | "chiVe" v1.0, v1.1 and v1.2 are distributed by the [National Institute for Japanese Language and Linguistics](https://www.ninjal.ac.jp/) and [Works Applications Co., Ltd.](https://www.worksap.co.jp/) under the [Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0). 211 | 212 | ### v1.3 213 | 214 | Copyright (c) 2024 Works Applications Co., Ltd. All rights reserved. 215 | 216 | "chiVe" v1.3 is distributed by [Works Applications Co., Ltd.](https://www.worksap.co.jp/) under the [Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0). 217 | 218 | ## Slack 219 | 220 | We have a Slack workspace for developers and users to ask questions and discuss a variety of topics. 
221 | 222 | - https://sudachi-dev.slack.com/ 223 | - (Please get an invite from [here](https://join.slack.com/t/sudachi-dev/shared_invite/enQtMzg2NTI2NjYxNTUyLTMyYmNkZWQ0Y2E5NmQxMTI3ZGM3NDU0NzU4NGE1Y2UwYTVmNTViYjJmNDI0MWZiYTg4ODNmMzgxYTQ3ZmI2OWU)) 224 | 225 | ## Citing chiVe 226 | 227 | We have published the following papers about chiVe: 228 | 229 | - 真鍋陽俊, 岡照晃, 海川祥毅, 髙岡一馬, 内田佳孝, 浅原正幸. [複数粒度の分割結果に基づく日本語単語分散表現](https://www.anlp.jp/proceedings/annual_meeting/2019/pdf_dir/P8-5.pdf) _(Japanese Word Embedding based on Multi-granular Tokenization Results, in Japanese)_. 言語処理学会第 25 回年次大会, 2019. 230 | - 河村宗一郎, 久本空海, 真鍋陽俊, 高岡一馬, 内田佳孝, 岡照晃, 浅原正幸. [chiVe 2.0: Sudachi と NWJC を用いた実用的な日本語単語ベクトルの実現へ向けて](https://www.anlp.jp/proceedings/annual_meeting/2020/pdf_dir/P6-16.pdf) _(chiVe 2.0: Towards Practical Japanese Embedding with Sudachi and NWJC, in Japanese)_. 言語処理学会第 26 回年次大会, 2020. 231 | - 久本空海, 山村崇, 勝田哲弘, 竹林佑斗, 髙岡一馬, 内田佳孝, 岡照晃, 浅原正幸. [chiVe: 製品利用可能な日本語単語ベクトル資源の実現へ向けて](https://www.ieice.org/ken/paper/20200910U1zQ/) _(chiVe: Towards Industrial-strength Japanese Word Vector Resources, in Japanese)_. 第 16 回テキストアナリティクス・シンポジウム, 2020. ([slides](https://speakerdeck.com/sorami/chive-zhi-pin-li-yong-ke-neng-nari-ben-yu-dan-yu-bekutoruzi-yuan-falseshi-xian-hexiang-kete)) 232 | 233 | When citing chiVe in papers, books, or services, please use the following BibTeX entries (generally, please cite the first paper, (Manabe+ 2019)): 234 | 235 | ``` 236 | @INPROCEEDINGS{manabe2019chive, 237 | author = {真鍋陽俊, 岡照晃, 海川祥毅, 髙岡一馬, 内田佳孝, 浅原正幸}, 238 | title = {複数粒度の分割結果に基づく日本語単語分散表現}, 239 | booktitle = "言語処理学会第25回年次大会(NLP2019)", 240 | year = "2019", 241 | pages = "NLP2019-P8-5", 242 | publisher = "言語処理学会", 243 | } 244 | ``` 245 | 246 | ``` 247 | @INPROCEEDINGS{kawamura2020chive, 248 | author = {河村宗一郎, 久本空海, 真鍋陽俊, 高岡一馬, 内田佳孝, 岡照晃, 浅原正幸}, 249 | title = {chiVe 2.0: SudachiとNWJCを用いた実用的な日本語単語ベクトルの実現へ向けて}, 250 | booktitle = "言語処理学会第26回年次大会(NLP2020)", 251 | year = "2020", 252 | pages = "NLP2020-P6-16", 253 | publisher = "言語処理学会", 254 | } 255 | ``` 256 | 257 | ``` 258 | @INPROCEEDINGS{hisamoto2020chive, 259 | author = {久本空海, 山村崇, 勝田哲弘, 竹林佑斗, 髙岡一馬, 内田佳孝, 岡照晃, 浅原正幸}, 260 | title = {chiVe: 製品利用可能な日本語単語ベクトル資源の実現へ向けて}, 261 | booktitle = "第16回テキストアナリティクス・シンポジウム", 262 | year = "2020", 263 | pages = "IEICE-NLC2020-9", 264 | publisher = "電子情報通信学会", 265 | } 266 | ``` 267 | --------------------------------------------------------------------------------