├── .dockerignore ├── .flake8 ├── .gitignore ├── Dockerfile ├── README.md ├── data └── .gitkeep ├── docker-compose.yml ├── examples ├── README.md ├── basic │ ├── README.md │ ├── cleaning.py │ ├── normalization.py │ ├── ruby.py │ ├── sentence_segmantation.py │ └── stopwords.py ├── embeddings │ ├── README.md │ ├── download_embeddings.py │ ├── get_fasttext.py │ ├── get_use.py │ └── get_word2vec.py ├── feature_engineering │ ├── README.md │ ├── get_bm25.py │ ├── get_bow.py │ ├── get_scdv.py │ ├── get_swem.py │ └── get_tfidf.py ├── morphological_analysis │ ├── README.md │ ├── konoha_sample.py │ └── nagisa_sample.py ├── sentence_similarity │ ├── README.md │ └── tfidf_cosine_similarity.py ├── sentiment_analysis │ ├── README.md │ └── oseti_sentiment_analysis.py ├── text_classification │ ├── README.md │ ├── run_bert.py │ ├── run_t5.py │ ├── tfidf_lgbm.py │ └── tfidf_logistic_regression.py └── visualization │ ├── README.md │ ├── japanize.png │ ├── japanize_labels.py │ └── visualization.ipynb ├── requirements.txt ├── tests └── README.md └── utils_nlp ├── README.md ├── common └── data.py ├── dataset ├── README.md └── livedoor.py ├── eval └── classification.py ├── features ├── README.md ├── scdv.py └── swem.py └── models ├── README.md ├── nn ├── README.md ├── datasets.py ├── models.py └── runner.py └── pretrained_embeddings ├── README.md └── word2vec.py /.dockerignore: -------------------------------------------------------------------------------- 1 | data 2 | examples 3 | tests 4 | utils_nlp 5 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 160 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | data/livedoor 2 | data/word2vec 3 | data/fasttext 4 | *.gz 5 | *.csv 6 | *.pkl 7 | *.log 8 | *.model 9 | *.json 10 | *.bin 11 | *.npy 12 | wandb 13 | lightning_logs 14 | 15 | # Created by https://www.gitignore.io/api/macos,python 16 | # Edit at https://www.gitignore.io/?templates=macos,python 17 | 18 | ### macOS ### 19 | # General 20 | .DS_Store 21 | .AppleDouble 22 | .LSOverride 23 | 24 | # Icon must end with two \r 25 | Icon 26 | 27 | # Thumbnails 28 | ._* 29 | 30 | # Files that might appear in the root of a volume 31 | .DocumentRevisions-V100 32 | .fseventsd 33 | .Spotlight-V100 34 | .TemporaryItems 35 | .Trashes 36 | .VolumeIcon.icns 37 | .com.apple.timemachine.donotpresent 38 | 39 | # Directories potentially created on remote AFP share 40 | .AppleDB 41 | .AppleDesktop 42 | Network Trash Folder 43 | Temporary Items 44 | .apdisk 45 | 46 | ### Python ### 47 | # Byte-compiled / optimized / DLL files 48 | __pycache__/ 49 | *.py[cod] 50 | *$py.class 51 | 52 | # C extensions 53 | *.so 54 | 55 | # Distribution / packaging 56 | .Python 57 | build/ 58 | develop-eggs/ 59 | dist/ 60 | downloads/ 61 | eggs/ 62 | .eggs/ 63 | lib/ 64 | lib64/ 65 | parts/ 66 | sdist/ 67 | var/ 68 | wheels/ 69 | pip-wheel-metadata/ 70 | share/python-wheels/ 71 | *.egg-info/ 72 | .installed.cfg 73 | *.egg 74 | MANIFEST 75 | 76 | # PyInstaller 77 | # Usually these files are written by a python script from a template 78 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
79 | *.manifest 80 | *.spec 81 | 82 | # Installer logs 83 | pip-log.txt 84 | pip-delete-this-directory.txt 85 | 86 | # Unit test / coverage reports 87 | htmlcov/ 88 | .tox/ 89 | .nox/ 90 | .coverage 91 | .coverage.* 92 | .cache 93 | nosetests.xml 94 | coverage.xml 95 | *.cover 96 | .hypothesis/ 97 | .pytest_cache/ 98 | 99 | # Translations 100 | *.mo 101 | *.pot 102 | 103 | # Scrapy stuff: 104 | .scrapy 105 | 106 | # Sphinx documentation 107 | docs/_build/ 108 | 109 | # PyBuilder 110 | target/ 111 | 112 | # pyenv 113 | .python-version 114 | 115 | # pipenv 116 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 117 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 118 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 119 | # install all needed dependencies. 120 | #Pipfile.lock 121 | 122 | # celery beat schedule file 123 | celerybeat-schedule 124 | 125 | # SageMath parsed files 126 | *.sage.py 127 | 128 | # Spyder project settings 129 | .spyderproject 130 | .spyproject 131 | 132 | # Rope project settings 133 | .ropeproject 134 | 135 | # Mr Developer 136 | .mr.developer.cfg 137 | .project 138 | .pydevproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # End of https://www.gitignore.io/api/macos,python 152 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gcr.io/kaggle-images/python:v76 2 | 3 | ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 4 | COPY requirements.txt . 5 | 6 | # mecab 7 | RUN apt-get update -y && \ 8 | apt-get install -y mecab libmecab-dev mecab-ipadic-utf8 9 | 10 | RUN pip install -U pip && \ 11 | pip install -r requirements.txt 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NLP Recipes for Japanese 2 | 3 | This repository contains samples codes for natural language processing in Japanese. 4 | It's highly inspired by [microsoft/nlp-recipes](https://github.com/microsoft/nlp-recipes). 5 | 6 | ## Content 7 | 8 | The following is a summary of the commonly used NLP scenarios covered in the repository. Each scenario is demonstrated in one or more scripts or Jupyter notebook examples that make use of the core code base of models and repository utilities. 
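Most examples import the shared helpers under `utils_nlp` and load the livedoor news corpus through `utils_nlp.dataset.livedoor`, so scripts are run from the repository root (the `docker-compose.yml` in this repository sets `PYTHONPATH=/working`, and some scripts additionally call `sys.path.append('.')`). A minimal sketch of that shared pattern, assuming the corpus has already been prepared under `data/`:

```python
import sys

sys.path.append('.')  # run from the repository root

from utils_nlp.dataset.livedoor import load_pandas_df

# Load a small slice of the livedoor news corpus as a pandas DataFrame.
df = load_pandas_df(nrows=10)
print(df.head())
```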
9 | 10 | |Category|Methods| 11 | |---| --- | 12 | |[Basic](./examples/basic)|Cleaning, Normalization, Stopwords, Sentence Segmantation, Ruby| 13 | |[Embeddings](./examples/embeddings)|Word2Vec, fastText, Universal Sentence Encoder| 14 | |[Feature Engineering](./examples/feature_engineering)|Bag-of-Words, TF-IDF, BM25, SWEM, SCDV| 15 | |[Morphological Analysis](./examples/morphological_analysis)|Konoha, nagisa| 16 | |[Sentence Similarity](./examples/sentence_similarity)|Cosine Similarity| 17 | |[Sentiment Analysis](sentiment_analysis)|oseti| 18 | |[Text Classification](./examples/text_classification)|TF-IDF & Logistic Regression, TF-IDF & LightGBM, BERT, T5| 19 | |[Visualization](./examples/visualization)|Visualization with Japanese texts| 20 | 21 | ## Environment 22 | 23 | ```bash 24 | docker-compose up -d --build 25 | docker exec -it nlp-recipes-ja bash 26 | ``` 27 | -------------------------------------------------------------------------------- /data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/upura/nlp-recipes-ja/8ac5e898864137841de8b03c11da34815009af24/data/.gitkeep -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | test: 4 | build: . 5 | volumes: 6 | - $PWD:/working 7 | container_name: nlp-recipes-ja 8 | working_dir: /working 9 | ports: 10 | - 8888:8888 11 | environment: 12 | - PYTHONPATH=/working 13 | tty: true 14 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | This folder contains examples, for building Natural Language Processing systems for the following scenarios. 4 | 5 | |Category|Methods| 6 | |---| --- | 7 | |[Basic](basic)|Cleaning, Normalization, Stopwords, Sentence Segmantation, Ruby| 8 | |[Embeddings](embeddings)|Word2Vec, fastText, Universal Sentence Encoder| 9 | |[Feature Engineering](feature_engineering)|Bag-of-Words, TF-IDF, BM25, SWEM, SCDV| 10 | |[Morphological Analysis](morphological_analysis)|Konoha, nagisa| 11 | |[Sentence Similarity](sentence_similarity)|Cosine Similarity| 12 | |[Sentiment Analysis](sentiment_analysis)|oseti| 13 | |[Text Classification](text_classification)|Logistic Regression, LightGBM, BERT| 14 | |[Visualization](visualization)|Visualization with Japanese texts| 15 | -------------------------------------------------------------------------------- /examples/basic/README.md: -------------------------------------------------------------------------------- 1 | # Basic 2 | 3 | This folder contains examples for basic tasks of natural language processing. 
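The scripts below each demonstrate a single step in isolation; in practice the steps are usually chained. A minimal, self-contained sketch of such a chain on one string (the URL-stripping rule and the tiny stopword set are illustrative stand-ins for the fuller logic in `cleaning.py` and `stopwords.py`):

```python
import re

import neologdn
from konoha import WordTokenizer

text = '詳細はこちら https://example.com をチェック!!'

# Cleaning: drop URLs (cleaning.py has a fuller set of rules).
text = re.sub(r'https?://\S+', '', text)
# Normalization: unify character widths, collapse repetitions, etc.
text = neologdn.normalize(text)
# Tokenization with MeCab via konoha, then toy stopword removal.
tokens = [str(t) for t in WordTokenizer('MeCab').tokenize(text)]
stopwords = {'は', 'を', '、', '。', '!'}
tokens = [t for t in tokens if t not in stopwords]
print(tokens)
```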
4 | 5 | ## Summary 6 | 7 | |Notebook|Environment|Description| 8 | |---|---|---| 9 | |[Cleaning](cleaning.py)|Local| Text cleaning | 10 | |[Normalization](normalization.py)|Local| Text normalization by [neologdn](https://github.com/ikegami-yukino/neologdn) | 11 | |[Stopwords](stopwords.py)|Local| Stopwords by frequency and [dictonary](http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt) | 12 | |[Sentence Segmantation](normalization.py)|Local| Sentence segmantation by [ja_sentence_segmenter](https://github.com/wwwcojp/ja_sentence_segmenter) | 13 | |[Convert Japanese into Roman](ruby.py)|Local| Convert Japanese into Roman by [pykakasi](https://github.com/miurahr/pykakasi) | 14 | -------------------------------------------------------------------------------- /examples/basic/cleaning.py: -------------------------------------------------------------------------------- 1 | # Implemantation from https://github.com/Hironsan/natural-language-preprocessings/blob/master/preprocessings/ja/cleaning.py 2 | import re 3 | 4 | from bs4 import BeautifulSoup 5 | 6 | from utils_nlp.dataset.livedoor import load_pandas_df 7 | 8 | 9 | def clean_text(text): 10 | # replaced_text = '\n'.join(s.strip() for s in text.splitlines()[2:] if s != '') # skip header by [2:] 11 | replaced_text = text.lower() 12 | replaced_text = re.sub(r'[【】]', ' ', replaced_text) # 【】の除去 13 | replaced_text = re.sub(r'[()()]', ' ', replaced_text) # ()の除去 14 | replaced_text = re.sub(r'[[]\[\]]', ' ', replaced_text) # []の除去 15 | replaced_text = re.sub(r'[@@]\w+', '', replaced_text) # メンションの除去 16 | replaced_text = re.sub(r'https?:\/\/.*?[\r\n ]', '', replaced_text) # URLの除去 17 | replaced_text = re.sub(r' ', ' ', replaced_text) # 全角空白の除去 18 | return replaced_text 19 | 20 | 21 | def clean_html_tags(html_text): 22 | soup = BeautifulSoup(html_text, 'html.parser') 23 | cleaned_text = soup.get_text() 24 | cleaned_text = ''.join(cleaned_text.splitlines()) 25 | return cleaned_text 26 | 27 | 28 | def clean_html_and_js_tags(html_text): 29 | soup = BeautifulSoup(html_text, 'html.parser') 30 | [x.extract() for x in soup.findAll(['script', 'style'])] 31 | cleaned_text = soup.get_text() 32 | cleaned_text = ''.join(cleaned_text.splitlines()) 33 | return cleaned_text 34 | 35 | 36 | def clean_url(html_text): 37 | """ 38 | S+ matches all non-whitespace characters (the end of the url) 39 | :param html_text: 40 | :return: 41 | """ 42 | clean_text = re.sub(r'http\S+', '', html_text) 43 | return clean_text 44 | 45 | 46 | def clean_code(html_text): 47 | """Qiitaのコードを取り除きます 48 | :param html_text: 49 | :return: 50 | """ 51 | soup = BeautifulSoup(html_text, 'html.parser') 52 | [x.extract() for x in soup.findAll(class_="code-frame")] 53 | cleaned_text = soup.get_text() 54 | cleaned_text = ''.join(cleaned_text.splitlines()) 55 | return cleaned_text 56 | 57 | 58 | if __name__ == '__main__': 59 | df = load_pandas_df(nrows=10) 60 | df['text'] = df['text'].map(clean_text) 61 | print(df.head()) 62 | -------------------------------------------------------------------------------- /examples/basic/normalization.py: -------------------------------------------------------------------------------- 1 | import neologdn 2 | 3 | from utils_nlp.dataset.livedoor import load_pandas_df 4 | 5 | 6 | if __name__ == '__main__': 7 | df = load_pandas_df(nrows=10) 8 | df['text'] = df['text'].apply(neologdn.normalize) 9 | print(df.head()) 10 | -------------------------------------------------------------------------------- 
/examples/basic/ruby.py: -------------------------------------------------------------------------------- 1 | import pykakasi 2 | 3 | from utils_nlp.dataset.livedoor import load_pandas_df 4 | 5 | 6 | if __name__ == '__main__': 7 | df = load_pandas_df(nrows=10) 8 | text = df['text'][0][:30] 9 | print(text) 10 | 11 | kakasi = pykakasi.kakasi() 12 | kakasi.setMode("H", "a") # Hiragana to ascii, default: no conversion 13 | kakasi.setMode("K", "a") # Katakana to ascii, default: no conversion 14 | kakasi.setMode("J", "a") # Japanese to ascii, default: no conversion 15 | kakasi.setMode("r", "Hepburn") # default: use Hepburn Roman table 16 | kakasi.setMode("s", True) # add space, default: no separator 17 | kakasi.setMode("C", True) # capitalize, default: no capitalize 18 | conv = kakasi.getConverter() 19 | result = conv.do(text) 20 | print(result) 21 | -------------------------------------------------------------------------------- /examples/basic/sentence_segmantation.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | from ja_sentence_segmenter.common.pipeline import make_pipeline 4 | from ja_sentence_segmenter.concatenate.simple_concatenator import concatenate_matching 5 | from ja_sentence_segmenter.normalize.neologd_normalizer import normalize 6 | from ja_sentence_segmenter.split.simple_splitter import split_newline, split_punctuation 7 | 8 | from utils_nlp.dataset.livedoor import load_pandas_df 9 | 10 | 11 | if __name__ == '__main__': 12 | df = load_pandas_df(nrows=10) 13 | 14 | split_punc2 = functools.partial(split_punctuation, punctuations=r"。!?") 15 | concat_tail_no = functools.partial(concatenate_matching, former_matching_rule=r"^(?P.+)(の)$", remove_former_matched=False) 16 | segmenter = make_pipeline(normalize, split_newline, concat_tail_no, split_punc2) 17 | 18 | df['sentences'] = df['text'].apply(lambda x: list(segmenter(x))) 19 | print(df['sentences'][0]) 20 | -------------------------------------------------------------------------------- /examples/basic/stopwords.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | import itertools 3 | from konoha import WordTokenizer 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from utils_nlp.dataset.livedoor import load_pandas_df 8 | 9 | 10 | def remove_stopwords(words, stopwords): 11 | words = [word for word in words if word not in stopwords] 12 | return words 13 | 14 | 15 | def get_stop_words_by_freq(docs, n=100): 16 | docs = list(itertools.chain(*list(docs))) 17 | fdist = Counter(docs) 18 | stopwords = [word for word, freq in fdist.most_common(n)] 19 | return stopwords 20 | 21 | 22 | def get_stop_words_by_dict(): 23 | stopwords = pd.read_table('http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt', header=None) 24 | stopwords = list(stopwords[0].values) 25 | return stopwords 26 | 27 | 28 | if __name__ == '__main__': 29 | df = load_pandas_df(nrows=100) 30 | tokenizer = WordTokenizer('MeCab') 31 | docs = np.array([ 32 | map(str, tokenizer.tokenize(text)) for text in df['text'] 33 | ]) 34 | stopwords_f = get_stop_words_by_freq(docs, n=100) 35 | stopwords_d = get_stop_words_by_dict() 36 | stopwords = set(stopwords_f) | set(stopwords_d) 37 | print(stopwords) 38 | docs = remove_stopwords(docs, stopwords) 39 | -------------------------------------------------------------------------------- /examples/embeddings/README.md: 
-------------------------------------------------------------------------------- 1 | # Embeddings 2 | 3 | This folder contains examples for getting pretrained embedding vectors. 4 | 5 | ## What is Word Embedding? 6 | 7 | >Word embedding is a technique to map words or phrases from a vocabulary to vectors or real numbers. 8 | >The learned vector representations of words capture syntactic and semantic word relationships and therefore can be very useful for tasks like sentence similary, text classifcation, etc. 9 | 10 | https://github.com/microsoft/nlp-recipes/blob/master/examples/embeddings/README.md 11 | 12 | ## Japanese pretrained models 13 | 14 | There is a survey article titled "[学習済み日本語word2vecとその評価について](https://blog.hoxo-m.com/entry/2020/02/20/090000)". This article introduces many Japanese pretrained embedding models avaliable and evaluate them. 15 | 16 | ## Summary 17 | 18 | |Notebook|Environment|Description| 19 | |---|---|---| 20 | |[Word2vec](get_word2vec.py)|Local| Get [word2vec vectors pretrained by Japanese Wikipedia](https://qiita.com/Hironsan/items/513b9f93752ecee9e670) | 21 | |[fastText](get_fasttext.py)|Local| Get [fastText vectors pretrained by Japanese Common Crawl](https://fasttext.cc/docs/en/crawl-vectors.html) | 22 | |[Download Pre-trained Embeddings](download_embeddings.py)|Local| Download pre-trained embeddings by [chakin](https://github.com/chakki-works/chakin) | 23 | |[Universal Sentence Encoder](get_use.py)|Local| Get [Universal Sentence Encoder](https://tfhub.dev/google/universal-sentence-encoder-multilingual/3) | 24 | -------------------------------------------------------------------------------- /examples/embeddings/download_embeddings.py: -------------------------------------------------------------------------------- 1 | import chakin 2 | 3 | 4 | if __name__ == '__main__': 5 | chakin.search(lang='Japanese') 6 | """ 7 | Name Dimension Corpus VocabularySize Method Language Author 8 | 6 fastText(ja) 300 Wikipedia 580K fastText Japanese Facebook 9 | 22 word2vec.Wiki-NEologd.50d 50 Wikipedia 335K word2vec + NEologd Japanese Shiroyagi Corporation 10 | """ 11 | chakin.download(number=22, save_dir='./') 12 | -------------------------------------------------------------------------------- /examples/embeddings/get_fasttext.py: -------------------------------------------------------------------------------- 1 | import gensim 2 | import nagisa 3 | 4 | from utils_nlp.dataset.livedoor import load_pandas_df 5 | 6 | 7 | if __name__ == '__main__': 8 | df = load_pandas_df(nrows=10) 9 | text = df['text'][0][:30] 10 | print(text) 11 | # 友人代表のスピーチ、独女はどうこなしている?もうすぐジューン 12 | 13 | tagger = nagisa.Tagger() 14 | nouns = tagger.extract(text, extract_postags=['名詞']).words 15 | print(nouns) 16 | # ['友人', '代表', 'スピーチ', '独女', 'ジューン'] 17 | 18 | model_w = gensim.models.KeyedVectors.load_word2vec_format('./data/fasttext/cc.ja.300.vec.gz', binary=False) 19 | for noun in nouns: 20 | try: 21 | print(noun, model_w[noun].shape) 22 | except KeyError: 23 | print(noun, 'Out of vocabulary') 24 | """ 25 | 友人 (300,) 26 | 代表 (300,) 27 | スピーチ (300,) 28 | 独女 Out of vocabulary 29 | ジューン (300,) 30 | """ 31 | 32 | model_f = gensim.models.fasttext.load_facebook_model('./data/fasttext/cc.ja.300.bin') 33 | for noun in nouns: 34 | print(noun, noun in model_f.wv.vocab) 35 | print(noun, model_f[noun].shape) 36 | -------------------------------------------------------------------------------- /examples/embeddings/get_use.py: -------------------------------------------------------------------------------- 1 | import 
tensorflow_hub as hub 2 | import tensorflow_text # noqa 3 | 4 | from utils_nlp.dataset.livedoor import load_pandas_df 5 | 6 | 7 | if __name__ == '__main__': 8 | df = load_pandas_df(nrows=10) 9 | text = df['text'][0][:30] 10 | print(text) 11 | # 友人代表のスピーチ、独女はどうこなしている?もうすぐジューン 12 | 13 | embed = hub.load('https://tfhub.dev/google/universal-sentence-encoder-multilingual/3') 14 | vectors = embed([text]) 15 | print(vectors[0].shape) 16 | print(vectors[0]) 17 | """ 18 | (512,) 19 | tf.Tensor( 20 | [-4.53309491e-02 -5.73447756e-02 3.41094285e-02 1.09533397e-02 21 | -2.55712979e-02 -8.29478130e-02 3.02479346e-03 8.89975950e-02], shape=(512,), dtype=float32 22 | """ 23 | -------------------------------------------------------------------------------- /examples/embeddings/get_word2vec.py: -------------------------------------------------------------------------------- 1 | import nagisa 2 | 3 | from utils_nlp.dataset.livedoor import load_pandas_df 4 | from utils_nlp.models.pretrained_embeddings.word2vec import load_pretrained_vectors, convert_to_wv 5 | 6 | 7 | if __name__ == '__main__': 8 | df = load_pandas_df(nrows=10) 9 | text = df['text'][0][:30] 10 | print(text) 11 | # 友人代表のスピーチ、独女はどうこなしている?もうすぐジューン 12 | 13 | tagger = nagisa.Tagger() 14 | nouns = tagger.extract(text, extract_postags=['名詞']).words 15 | print(nouns) 16 | # ['友人', '代表', 'スピーチ', '独女', 'ジューン'] 17 | 18 | word_vec = load_pretrained_vectors('data') 19 | vectors = convert_to_wv(nouns[0], word_vec) 20 | print(vectors.shape) 21 | # (300,) 22 | print(vectors[:5]) 23 | # [ 1.0028e-01 1.0647e-02 -1.7439e-01 -2.7110e-03 2.1647e-01] 24 | -------------------------------------------------------------------------------- /examples/feature_engineering/README.md: -------------------------------------------------------------------------------- 1 | # Feature Engineering 2 | 3 | This folder contains examples for feature engineering of texts. 4 | 5 | ## Summary 6 | 7 | |Notebook|Environment|Description| 8 | |---|---|---| 9 | |[Bag-of-Words](get_bow.py)|Local| [Bag-of-Words](https://en.wikipedia.org/wiki/Bag-of-words_model) | 10 | |[TF-IDF](get_tfidf.py)|Local| [TF-IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf) | 11 | |[BM25](get_bm25.py)|Local| [BM25](https://en.wikipedia.org/wiki/Okapi_BM25) | 12 | |[SWEM](get_swem.py)|Local| [SWEM](https://arxiv.org/abs/1805.09843) | 13 | |[SCDV](get_scdv.py)|Local| [SCDV](https://arxiv.org/abs/1612.06778) | 14 | -------------------------------------------------------------------------------- /examples/feature_engineering/get_bm25.py: -------------------------------------------------------------------------------- 1 | from konoha import WordTokenizer 2 | import neologdn 3 | import numpy as np 4 | import scipy as sp 5 | from sklearn.base import BaseEstimator, TransformerMixin 6 | from sklearn.feature_extraction.text import CountVectorizer, _document_frequency 7 | from sklearn.utils.validation import check_is_fitted 8 | 9 | from utils_nlp.dataset.livedoor import load_pandas_df 10 | 11 | 12 | class BM25Transformer(BaseEstimator, TransformerMixin): 13 | 14 | def __init__(self, use_idf=True, k1=2.0, b=0.75): 15 | """Okapi BM25: a non-binary model - Introduction to Information Retrieval 16 | http://nlp.stanford.edu/IR-book/html/htmledition/okapi-bm25-a-non-binary-model-1.html. 17 | Implementation from https://github.com/arosh/BM25Transformer. 18 | 19 | Args: 20 | use_idf (bool, optional): [description]. Defaults to True. 21 | k1 (float, optional): [description]. Defaults to 2.0. 22 | b (float, optional): [description]. 
Defaults to 0.75. 23 | """ 24 | self.use_idf = use_idf 25 | self.k1 = k1 26 | self.b = b 27 | 28 | def fit(self, X): 29 | """ 30 | Parameters 31 | ---------- 32 | X : sparse matrix, [n_samples, n_features] document-term matrix 33 | """ 34 | if not sp.sparse.issparse(X): 35 | X = sp.sparse.csc_matrix(X) 36 | if self.use_idf: 37 | n_samples, n_features = X.shape 38 | df = _document_frequency(X) 39 | idf = np.log((n_samples - df + 0.5) / (df + 0.5)) 40 | self._idf_diag = sp.sparse.spdiags(idf, diags=0, m=n_features, n=n_features) 41 | 42 | doc_len = X.sum(axis=1) 43 | self._average_document_len = np.average(doc_len) 44 | 45 | return self 46 | 47 | def transform(self, X, copy=True): 48 | """ 49 | Parameters 50 | ---------- 51 | X : sparse matrix, [n_samples, n_features] document-term matrix 52 | copy : boolean, optional (default=True) 53 | """ 54 | if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.floating): 55 | # preserve float family dtype 56 | X = sp.sparse.csr_matrix(X, copy=copy) 57 | else: 58 | # convert counts or binary occurrences to floats 59 | X = sp.sparse.csr_matrix(X, dtype=np.float, copy=copy) 60 | 61 | n_samples, n_features = X.shape 62 | 63 | # Document length (number of terms) in each row 64 | # Shape is (n_samples, 1) 65 | doc_len = X.sum(axis=1) 66 | # Number of non-zero elements in each row 67 | # Shape is (n_samples, ) 68 | sz = X.indptr[1:] - X.indptr[0:-1] 69 | 70 | # In each row, repeat `doc_len` for `sz` times 71 | # Shape is (sum(sz), ) 72 | # Example 73 | # ------- 74 | # dl = [4, 5, 6] 75 | # sz = [1, 2, 3] 76 | # rep = [4, 5, 5, 6, 6, 6] 77 | rep = np.repeat(np.asarray(doc_len), sz) 78 | 79 | # Compute BM25 score only for non-zero elements 80 | nom = self.k1 + 1 81 | denom = X.data + self.k1 * (1 - self.b + self.b * rep / self._average_document_len) 82 | data = X.data * nom / denom 83 | 84 | X = sp.sparse.csr_matrix((data, X.indices, X.indptr), shape=X.shape) 85 | 86 | if self.use_idf: 87 | check_is_fitted(self, '_idf_diag', 'idf vector is not fitted') 88 | 89 | expected_n_features = self._idf_diag.shape[0] 90 | if n_features != expected_n_features: 91 | raise ValueError("Input has n_features=%d while the model" 92 | " has been trained with n_features=%d" % ( 93 | n_features, expected_n_features)) 94 | X = X * self._idf_diag 95 | 96 | return X 97 | 98 | 99 | if __name__ == '__main__': 100 | df = load_pandas_df(nrows=10) 101 | 102 | # Normalization 103 | df['text'] = df['text'].apply(neologdn.normalize) 104 | 105 | tokenizer = WordTokenizer('MeCab') 106 | docs = np.array([ 107 | ' '.join(map(str, tokenizer.tokenize(text))) for text in df['text'] 108 | ]) 109 | print(docs.shape) 110 | # (10,) 111 | 112 | count_vec = CountVectorizer(min_df=2, 113 | max_features=20000, 114 | ngram_range=(1, 3)) 115 | bags = count_vec.fit_transform(docs) 116 | 117 | print(bags.toarray().shape) 118 | print(bags.toarray()) 119 | """ 120 | (10, 445) 121 | [[1 0 1 ... 0 0 0] 122 | [1 0 0 ... 0 0 0] 123 | [1 0 0 ... 1 0 0] 124 | ... 125 | [0 0 1 ... 0 0 0] 126 | [0 0 0 ... 0 0 0] 127 | [0 0 0 ... 0 0 0]] 128 | """ 129 | 130 | bm25 = BM25Transformer(use_idf=True, k1=2.0, b=0.75) 131 | bm25 = bm25.fit_transform(bags) 132 | print(bm25.toarray().shape) 133 | print(bm25.toarray()) 134 | """ 135 | (10, 445) 136 | [[0.75499451 1.21230177 0. ... 0. 0. 0. ] 137 | [0.77036179 0. 0. ... 0. 0. 0. ] 138 | [0.83310313 0. 0.40196374 ... 0. 0. 1.3377215 ] 139 | ... 140 | [0. 1.02087499 0. ... 1.02087499 1.02087499 0. ] 141 | [0. 0. 0.43242275 ... 0. 0. 0. ] 142 | [0. 0. 0. ... 0. 0. 0. 
]] 143 | """ 144 | -------------------------------------------------------------------------------- /examples/feature_engineering/get_bow.py: -------------------------------------------------------------------------------- 1 | from konoha import WordTokenizer 2 | import neologdn 3 | import numpy as np 4 | from sklearn.feature_extraction.text import CountVectorizer 5 | 6 | from utils_nlp.dataset.livedoor import load_pandas_df 7 | 8 | 9 | if __name__ == '__main__': 10 | df = load_pandas_df(nrows=10) 11 | 12 | # Normalization 13 | df['text'] = df['text'].apply(neologdn.normalize) 14 | 15 | tokenizer = WordTokenizer('MeCab') 16 | docs = np.array([ 17 | ' '.join(map(str, tokenizer.tokenize(text))) for text in df['text'] 18 | ]) 19 | print(docs.shape) 20 | # (10,) 21 | 22 | count_vec = CountVectorizer(min_df=2, 23 | max_features=20000, 24 | ngram_range=(1, 3)) 25 | bags = count_vec.fit_transform(docs) 26 | 27 | print(bags.toarray().shape) 28 | # (10, 445) 29 | print(bags.toarray()) 30 | """ 31 | [[1 0 1 ... 0 0 0] 32 | [1 0 0 ... 0 0 0] 33 | [1 0 0 ... 1 0 0] 34 | ... 35 | [0 0 1 ... 0 0 0] 36 | [0 0 0 ... 0 0 0] 37 | [0 0 0 ... 0 0 0]] 38 | """ 39 | -------------------------------------------------------------------------------- /examples/feature_engineering/get_scdv.py: -------------------------------------------------------------------------------- 1 | from konoha import WordTokenizer 2 | import neologdn 3 | import numpy as np 4 | 5 | from utils_nlp.dataset.livedoor import load_pandas_df 6 | from utils_nlp.features import scdv 7 | from utils_nlp.models.pretrained_embeddings.word2vec import load_pretrained_vectors 8 | 9 | 10 | if __name__ == '__main__': 11 | df = load_pandas_df(nrows=10) 12 | 13 | # Normalization 14 | df['text'] = df['text'].apply(neologdn.normalize) 15 | 16 | tokenizer = WordTokenizer('MeCab') 17 | docs = np.array([ 18 | map(str, tokenizer.tokenize(text)) for text in df['text'] 19 | ]) 20 | print(docs.shape) 21 | # (10,) 22 | 23 | word_vec = load_pretrained_vectors('data') 24 | scdv = scdv.create(docs, word_vec, n_components=10) 25 | print(scdv.shape) 26 | # (10, 3000) 27 | -------------------------------------------------------------------------------- /examples/feature_engineering/get_swem.py: -------------------------------------------------------------------------------- 1 | from konoha import WordTokenizer 2 | import neologdn 3 | import numpy as np 4 | 5 | from utils_nlp.dataset.livedoor import load_pandas_df 6 | from utils_nlp.features import swem 7 | from utils_nlp.models.pretrained_embeddings.word2vec import load_pretrained_vectors 8 | 9 | 10 | if __name__ == '__main__': 11 | df = load_pandas_df(nrows=10) 12 | 13 | # Normalization 14 | df['text'] = df['text'].apply(neologdn.normalize) 15 | 16 | tokenizer = WordTokenizer('MeCab') 17 | docs = np.array([ 18 | map(str, tokenizer.tokenize(text)) for text in df['text'] 19 | ]) 20 | print(docs.shape) 21 | # (10,) 22 | 23 | word_vec = load_pretrained_vectors('data') 24 | swem_max = swem.create(docs, word_vec, aggregation='max') 25 | swem_mean = swem.create(docs, word_vec, aggregation='mean') 26 | print(swem_max.shape) 27 | # (10, 300) 28 | -------------------------------------------------------------------------------- /examples/feature_engineering/get_tfidf.py: -------------------------------------------------------------------------------- 1 | from konoha import WordTokenizer 2 | import neologdn 3 | import numpy as np 4 | from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer 5 | 6 | from 
utils_nlp.dataset.livedoor import load_pandas_df 7 | 8 | 9 | if __name__ == '__main__': 10 | df = load_pandas_df(nrows=10) 11 | 12 | # Normalization 13 | df['text'] = df['text'].apply(neologdn.normalize) 14 | 15 | tokenizer = WordTokenizer('MeCab') 16 | docs = np.array([ 17 | ' '.join(map(str, tokenizer.tokenize(text))) for text in df['text'] 18 | ]) 19 | print(docs.shape) 20 | # (10,) 21 | 22 | count_vec = CountVectorizer(min_df=2, 23 | max_features=20000, 24 | ngram_range=(1, 3)) 25 | bags = count_vec.fit_transform(docs) 26 | 27 | print(bags.toarray().shape) 28 | print(bags.toarray()) 29 | """ 30 | (10, 445) 31 | [[1 0 1 ... 0 0 0] 32 | [1 0 0 ... 0 0 0] 33 | [1 0 0 ... 1 0 0] 34 | ... 35 | [0 0 1 ... 0 0 0] 36 | [0 0 0 ... 0 0 0] 37 | [0 0 0 ... 0 0 0]] 38 | """ 39 | 40 | tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True) 41 | tf_idf = tfidf.fit_transform(bags) 42 | print(tf_idf.toarray().shape) 43 | print(tf_idf.toarray()) 44 | """ 45 | (10, 445) 46 | [[0.04752833 0.05432543 0. ... 0. 0. 0. ] 47 | [0.0484923 0. 0. ... 0. 0. 0. ] 48 | [0.04909543 0. 0.04364936 ... 0. 0. 0.05611665] 49 | ... 50 | [0. 0.03772958 0. ... 0.03772958 0.03772958 0. ] 51 | [0. 0. 0.03994261 ... 0. 0. 0. ] 52 | [0. 0. 0. ... 0. 0. 0. ]] 53 | """ 54 | -------------------------------------------------------------------------------- /examples/morphological_analysis/README.md: -------------------------------------------------------------------------------- 1 | # Morphological Analysis 2 | 3 | This folder contains examples for morphological analysis. 4 | 5 | Konoha is a Python library for providing integrated interface of various Japanese tokenziers, which enables us to switch tokenizers. We can use MeCab, KyTea, Janome, Sudachi, Sentencepiece, and nagisa. Konoha doesn't support the function of filtering some nouns. We can use pure nagisa for this purpose. 6 | 7 | ## Summary 8 | 9 | |Notebook|Environment|Description| 10 | |---|---|---| 11 | |[Konoha sample](konoha_sample.py)|Local| Morphological analysis by [Konoha](https://github.com/himkt/konoha) | 12 | |[nagisa sample](nagisa_sample.py)|Local| Morphological analysis by [nagisa](https://github.com/taishi-i/nagisa) | 13 | -------------------------------------------------------------------------------- /examples/morphological_analysis/konoha_sample.py: -------------------------------------------------------------------------------- 1 | from konoha import WordTokenizer 2 | 3 | from utils_nlp.dataset.livedoor import load_pandas_df 4 | 5 | 6 | if __name__ == '__main__': 7 | df = load_pandas_df(nrows=10) 8 | text = df['text'][0][:30] 9 | print(text) 10 | # 友人代表のスピーチ、独女はどうこなしている?もうすぐジューン 11 | 12 | tokenizer_m = WordTokenizer('MeCab') 13 | print(tokenizer_m.tokenize(text)) 14 | # [友人, 代表, の, スピーチ, 、, 独, 女, は, どう, こなし, て, いる, ?, もうすぐ, ジューン] 15 | 16 | tokenizer_s = WordTokenizer('Sudachi', mode='A', with_postag=True) 17 | print(tokenizer_s.tokenize(text)) 18 | # [友人 (名詞), 代表 (名詞), の (助詞), スピーチ (名詞), 、 (補助記号), 独女 (名詞), は (助詞), どう (副詞), こなし (動詞), て (助詞), いる (動詞), ? 
(補助記号), もう (副詞), すぐ (副詞), ジューン (名詞)] 19 | 20 | df['sep_text'] = [tokenizer_m.tokenize(text) for text in df['text']] 21 | print(df.head()) 22 | -------------------------------------------------------------------------------- /examples/morphological_analysis/nagisa_sample.py: -------------------------------------------------------------------------------- 1 | import nagisa 2 | 3 | from utils_nlp.dataset.livedoor import load_pandas_df 4 | 5 | 6 | if __name__ == '__main__': 7 | df = load_pandas_df(nrows=10) 8 | text = df['text'][0][:30] 9 | print(text) 10 | # 友人代表のスピーチ、独女はどうこなしている?もうすぐジューン 11 | 12 | tagger = nagisa.Tagger() 13 | print(tagger.extract(text, extract_postags=['名詞'])) 14 | # 友人/名詞 代表/名詞 スピーチ/名詞 独女/名詞 ジューン/名詞 15 | 16 | df['sep_text'] = [tagger.extract(text, extract_postags=['名詞']).words for text in df['text']] 17 | print(df.head()) 18 | -------------------------------------------------------------------------------- /examples/sentence_similarity/README.md: -------------------------------------------------------------------------------- 1 | # Sentence Similarity 2 | 3 | This folder contains examples for calculating sentence similarities. 4 | 5 | ## Summary 6 | 7 | |Notebook|Environment|Description| 8 | |---|---|---| 9 | |[TF-IDF & Cosine Similarity](tfidf_cosine_similarity.py)|Local| Calculate sentence similarities from TF-IDF vectors | 10 | -------------------------------------------------------------------------------- /examples/sentence_similarity/tfidf_cosine_similarity.py: -------------------------------------------------------------------------------- 1 | from konoha import WordTokenizer 2 | import neologdn 3 | import numpy as np 4 | from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer 5 | from sklearn.metrics.pairwise import cosine_similarity 6 | 7 | from utils_nlp.dataset.livedoor import load_pandas_df 8 | 9 | 10 | if __name__ == '__main__': 11 | df = load_pandas_df(nrows=10) 12 | 13 | # Normalization 14 | df['text'] = df['text'].apply(neologdn.normalize) 15 | 16 | tokenizer = WordTokenizer('MeCab') 17 | docs = np.array([ 18 | ' '.join(map(str, tokenizer.tokenize(text))) for text in df['text'] 19 | ]) 20 | print(docs.shape) 21 | # (10,) 22 | 23 | count_vec = CountVectorizer(min_df=2, 24 | max_features=20000, 25 | ngram_range=(1, 3)) 26 | bags = count_vec.fit_transform(docs) 27 | 28 | print(bags.toarray().shape) 29 | print(bags.toarray()) 30 | """ 31 | (10, 445) 32 | [[1 0 1 ... 0 0 0] 33 | [1 0 0 ... 0 0 0] 34 | [1 0 0 ... 1 0 0] 35 | ... 36 | [0 0 1 ... 0 0 0] 37 | [0 0 0 ... 0 0 0] 38 | [0 0 0 ... 0 0 0]] 39 | """ 40 | 41 | tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True) 42 | tf_idf = tfidf.fit_transform(bags) 43 | print(tf_idf.toarray().shape) 44 | print(tf_idf.toarray()) 45 | """ 46 | (10, 445) 47 | [[0.04752833 0.05432543 0. ... 0. 0. 0. ] 48 | [0.0484923 0. 0. ... 0. 0. 0. ] 49 | [0.04909543 0. 0.04364936 ... 0. 0. 0.05611665] 50 | ... 51 | [0. 0.03772958 0. ... 0.03772958 0.03772958 0. ] 52 | [0. 0. 0.03994261 ... 0. 0. 0. ] 53 | [0. 0. 0. ... 0. 0. 0. ]] 54 | """ 55 | 56 | print(cosine_similarity(tf_idf.toarray())) 57 | """ 58 | [[1. 0.31294546 0.22234506 0.27272853 0.22658861 0.37452715 59 | 0.35456225 0.29524085 0.17193537 0.36229732] 60 | [0.31294546 1. 0.25102573 0.25264431 0.24334397 0.33785512 61 | 0.31670052 0.28218417 0.12684395 0.32628839] 62 | [0.22234506 0.25102573 1. 0.24099022 0.17307931 0.31050187 63 | 0.32489792 0.28119098 0.15070305 0.38326419] 64 | [0.27272853 0.25264431 0.24099022 1. 
0.23456837 0.32640547 65 | 0.27615115 0.3153026 0.26716363 0.31163735] 66 | [0.22658861 0.24334397 0.17307931 0.23456837 1. 0.41007705 67 | 0.24911698 0.36058785 0.11835559 0.2387821 ] 68 | [0.37452715 0.33785512 0.31050187 0.32640547 0.41007705 1. 69 | 0.45739635 0.32316926 0.2059866 0.31257367] 70 | [0.35456225 0.31670052 0.32489792 0.27615115 0.24911698 0.45739635 71 | 1. 0.39132051 0.24839521 0.3321967 ] 72 | [0.29524085 0.28218417 0.28119098 0.3153026 0.36058785 0.32316926 73 | 0.39132051 1. 0.15238316 0.30832032] 74 | [0.17193537 0.12684395 0.15070305 0.26716363 0.11835559 0.2059866 75 | 0.24839521 0.15238316 1. 0.24724469] 76 | [0.36229732 0.32628839 0.38326419 0.31163735 0.2387821 0.31257367 77 | 0.3321967 0.30832032 0.24724469 1. ]] 78 | """ 79 | -------------------------------------------------------------------------------- /examples/sentiment_analysis/README.md: -------------------------------------------------------------------------------- 1 | # Sentiment Analysis 2 | 3 | This folder contains examples for sentiment analysis. 4 | 5 | ## Summary 6 | 7 | |Notebook|Environment|Description| 8 | |---|---|---| 9 | |[Sentiment Analysis by dictonary](oseti_sentiment_analysis.py)|Local| Sentiment analysis by dictonary by [oseti](https://github.com/ikegami-yukino/oseti) | 10 | -------------------------------------------------------------------------------- /examples/sentiment_analysis/oseti_sentiment_analysis.py: -------------------------------------------------------------------------------- 1 | import oseti 2 | 3 | from utils_nlp.dataset.livedoor import load_pandas_df 4 | 5 | 6 | if __name__ == '__main__': 7 | df = load_pandas_df(nrows=10) 8 | text = df['text'][0][:30] 9 | print(text) 10 | # 友人代表のスピーチ、独女はどうこなしている?もうすぐジューン 11 | 12 | analyzer = oseti.Analyzer() 13 | print(analyzer.analyze(text)) 14 | print(analyzer.count_polarity(text)) 15 | # [1.0, 0] 16 | # [{'positive': 2, 'negative': 0}, {'positive': 0, 'negative': 0}] 17 | -------------------------------------------------------------------------------- /examples/text_classification/README.md: -------------------------------------------------------------------------------- 1 | # Text Classification 2 | 3 | This folder contains examples of text classification models. 4 | 5 | ## What is Text Classification? 6 | 7 | >Text classification is a supervised learning method of learning and predicting the category or the class of a document given its text content. 8 | >The state-of-the-art methods are based on neural networks of different architectures as well as pre-trained language models or word embeddings. 
9 | 10 | https://github.com/microsoft/nlp-recipes/blob/master/examples/text_classification/README.md 11 | 12 | ## Summary 13 | 14 | |Notebook|Environment|Description|ACC| 15 | |---|---|---|---| 16 | |[TF-IDF & Logistic Regression](tfidf_logistic_regression.py)|Local| [Logistic Regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) with TF-IDF vectors | 0.9308 | 17 | |[TF-IDF & LightGBM](tfidf_lgbm.py)|Local| [LightGBM](https://github.com/microsoft/LightGBM) with TF-IDF vectors | 0.9512 | 18 | |[BERT](run_bert.py) 'cl-tohoku/bert-base-japanese-v2' |Local| [Transformers BERT](https://github.com/huggingface/transformers) | 0.9362 | 19 | |[BERT](run_bert.py) 'cl-tohoku/bert-base-japanese-char-v2' |Local| [Transformers BERT](https://github.com/huggingface/transformers) | 0.9274 | 20 | |[BERT](run_bert.py) 'cl-tohoku/bert-base-large' |Local| [Transformers BERT](https://github.com/huggingface/transformers) | - | 21 | |[T5](run_t5.py) |Local| [T5 for japanese](https://qiita.com/sonoisa/items/a9af64ff641f0bbfed44) | 0.9566 | 22 | 23 | Accuracy scores (ACC) are calculated by running code only in fold 0 in the condition that datasets are devided into train/val/test at the rate of 0.6/0.2/0.2. 24 | Be careful that the scores are highly affected by the way of splitting dataset and hyperparameters like the number of epochs. 25 | -------------------------------------------------------------------------------- /examples/text_classification/run_bert.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | 4 | import neologdn 5 | import numpy as np 6 | import pytorch_lightning as pl 7 | from sklearn import preprocessing 8 | from sklearn.metrics import log_loss 9 | from sklearn.model_selection import train_test_split, StratifiedKFold 10 | import torch 11 | from torch import nn 12 | from torch.utils.data import DataLoader 13 | from tqdm import tqdm 14 | from transformers import BertTokenizer 15 | 16 | sys.path.append('.') 17 | from utils_nlp.dataset.livedoor import load_pandas_df 18 | from utils_nlp.eval.classification import eval_classification 19 | from utils_nlp.models.nn.datasets import LivedoorDataset 20 | from utils_nlp.models.nn.models import PLBertClassifier 21 | 22 | 23 | def preprocess_data(df): 24 | # split 25 | df['text'] = df['text'].apply(neologdn.normalize) 26 | le = preprocessing.LabelEncoder() 27 | df['label'] = le.fit_transform(df['label']) 28 | 29 | X_train, X_test, y_train, y_test = train_test_split( 30 | df, df['label'].values, test_size=0.2, random_state=42, stratify=df['label']) 31 | 32 | return X_train.reset_index(drop=True), X_test.reset_index(drop=True), y_train, y_test 33 | 34 | 35 | if __name__ == '__main__': 36 | 37 | parser = argparse.ArgumentParser() 38 | parser.add_argument('--model_name') 39 | args = parser.parse_args() 40 | 41 | MODEL_NAME = args.model_name 42 | MAX_LEN = 300 43 | pl.seed_everything(777) 44 | device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') 45 | 46 | df = load_pandas_df(shuffle=True) 47 | X_train, X_test, y_train, y_test = preprocess_data(df) 48 | 49 | tokenizer = BertTokenizer.from_pretrained(MODEL_NAME) 50 | 51 | test_dataset = LivedoorDataset(X_test, tokenizer, MAX_LEN) 52 | test_loader = DataLoader(test_dataset, shuffle=False, batch_size=32, num_workers=4) 53 | 54 | y_preds = [] 55 | NUM_CLASS = 9 56 | oof_train = np.zeros((len(X_train), NUM_CLASS)) 57 | cv = StratifiedKFold(n_splits=4, shuffle=True, 
random_state=0) 58 | 59 | for fold_id, (train_index, valid_index) in enumerate(tqdm(cv.split(X_train, X_train['label']))): 60 | if fold_id == 0: 61 | X_tr = X_train.loc[train_index, :].reset_index(drop=True) 62 | X_val = X_train.loc[valid_index, :].reset_index(drop=True) 63 | y_tr = y_train[train_index] 64 | y_val = y_train[valid_index] 65 | 66 | train_dataset = LivedoorDataset(X_tr, tokenizer, MAX_LEN) 67 | valid_dataset = LivedoorDataset(X_val, tokenizer, MAX_LEN) 68 | 69 | train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16, num_workers=4) 70 | valid_loader = DataLoader(valid_dataset, shuffle=False, batch_size=32, num_workers=4) 71 | 72 | model = PLBertClassifier(model_name=MODEL_NAME, 73 | num_classes=NUM_CLASS) 74 | device ='cuda:0' if torch.cuda.is_available() else 'cpu' 75 | model = model.to(device) 76 | trainer = pl.Trainer(gpus=1, max_epochs=7) 77 | trainer.fit(model, train_loader, valid_loader) 78 | trainer.test(test_dataloaders=test_loader) 79 | 80 | y_preds = np.load('data/bert/preds.npy') 81 | print(f'test, log_loss: {log_loss(y_test, y_preds)}') 82 | result_dict = eval_classification(y_test, y_preds.argmax(axis=1)) 83 | print(result_dict) 84 | """ 85 | {'accuracy': 0.9362, 86 | 'precision': [0.8939, 0.9101, 0.9588, 0.9293, 0.9451, 0.9241, 0.9822, 0.9882, 0.8935], 87 | 'recall': [0.9195, 0.931, 0.9422, 0.902, 0.9885, 0.8639, 0.954, 0.9333, 0.9805], 88 | 'f1': [0.9065, 0.9205, 0.9504, 0.9154, 0.9663, 0.893, 0.9679, 0.96, 0.935]} 89 | """ 90 | -------------------------------------------------------------------------------- /examples/text_classification/run_t5.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | import gc 4 | 5 | import neologdn 6 | import numpy as np 7 | import pytorch_lightning as pl 8 | from sklearn import preprocessing 9 | from sklearn.metrics import log_loss 10 | from sklearn.model_selection import train_test_split, StratifiedKFold 11 | import torch 12 | from torch import nn 13 | from torch.utils.data import DataLoader 14 | from tqdm import tqdm 15 | from transformers import T5Tokenizer 16 | 17 | sys.path.append('.') 18 | from utils_nlp.dataset.livedoor import load_pandas_df 19 | from utils_nlp.eval.classification import eval_classification 20 | from utils_nlp.models.nn.datasets import LivedoorDatasetT5 as LivedoorDataset 21 | from utils_nlp.models.nn.models import PLT5Classifier 22 | 23 | 24 | def preprocess_data(df): 25 | # split 26 | df['text'] = df['text'].apply(neologdn.normalize) 27 | le = preprocessing.LabelEncoder() 28 | df['label'] = le.fit_transform(df['label']) 29 | 30 | X_train, X_test, y_train, y_test = train_test_split( 31 | df, df['label'].values, test_size=0.2, random_state=42, stratify=df['label']) 32 | 33 | return X_train.reset_index(drop=True), X_test.reset_index(drop=True), y_train, y_test 34 | 35 | 36 | if __name__ == '__main__': 37 | 38 | parser = argparse.ArgumentParser() 39 | parser.add_argument('--model_name') 40 | args = parser.parse_args() 41 | 42 | MODEL_NAME = args.model_name 43 | MAX_LEN = 300 44 | pl.seed_everything(777) 45 | device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') 46 | 47 | df = load_pandas_df(shuffle=True) 48 | X_train, X_test, y_train, y_test = preprocess_data(df) 49 | 50 | tokenizer = T5Tokenizer.from_pretrained("sonoisa/t5-base-japanese", is_fast=True) 51 | 52 | test_dataset = LivedoorDataset(X_test, tokenizer, MAX_LEN) 53 | test_loader = DataLoader(test_dataset, shuffle=False, batch_size=1, 
num_workers=4) 54 | 55 | y_preds = [] 56 | NUM_CLASS = 9 57 | cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=0) 58 | 59 | for fold_id, (train_index, valid_index) in enumerate(tqdm(cv.split(X_train, X_train['label']))): 60 | if fold_id == 0: 61 | X_tr = X_train.loc[train_index, :].reset_index(drop=True) 62 | X_val = X_train.loc[valid_index, :].reset_index(drop=True) 63 | y_tr = y_train[train_index] 64 | y_val = y_train[valid_index] 65 | 66 | train_dataset = LivedoorDataset(X_tr, tokenizer, MAX_LEN) 67 | valid_dataset = LivedoorDataset(X_val, tokenizer, MAX_LEN) 68 | 69 | train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16, num_workers=4) 70 | valid_loader = DataLoader(valid_dataset, shuffle=False, batch_size=32, num_workers=4) 71 | 72 | model = PLT5Classifier(model_name=MODEL_NAME) 73 | device ='cuda:0' if torch.cuda.is_available() else 'cpu' 74 | model = model.to(device) 75 | trainer = pl.Trainer(gpus=1, max_epochs=5) 76 | trainer.fit(model, train_loader, valid_loader) 77 | model.tokenizer.save_pretrained('data/t5') 78 | model.backbone.save_pretrained('data/t5') 79 | del train_loader, train_dataset, valid_loader, valid_dataset, X_train, X_test, y_train, df, X_tr, X_val 80 | gc.collect() 81 | trainer.test(test_dataloaders=test_loader) 82 | 83 | y_preds = np.load('data/t5/preds.npy') 84 | y_preds = np.array([int(d) for d in y_preds]) 85 | result_dict = eval_classification(y_test, y_preds) 86 | print(result_dict) 87 | """ 88 | {'accuracy': 0.9566, 89 | 'precision': [0.9699, 0.9194, 0.9815, 0.9583, 0.95, 0.9128, 0.977, 0.9888, 0.956], 90 | 'recall': [0.9253, 0.9828, 0.9191, 0.902, 0.9828, 0.929, 0.977, 0.9833, 0.987], 91 | 'f1': [0.9471, 0.95, 0.9493, 0.9293, 0.9661, 0.9208, 0.977, 0.9861, 0.9712]} 92 | """ 93 | -------------------------------------------------------------------------------- /examples/text_classification/tfidf_lgbm.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from konoha import WordTokenizer 4 | import lightgbm as lgb 5 | from loguru import logger 6 | import neologdn 7 | import numpy as np 8 | import pandas as pd 9 | from sklearn import preprocessing 10 | from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer 11 | from sklearn.metrics import log_loss 12 | from sklearn.model_selection import train_test_split, StratifiedKFold 13 | from tqdm import tqdm 14 | 15 | sys.path.append('.') 16 | from utils_nlp.common.data import Data 17 | from utils_nlp.dataset.livedoor import load_pandas_df 18 | from utils_nlp.eval.classification import eval_classification 19 | 20 | 21 | def preprocess_data(df): 22 | # split 23 | df['text'] = df['text'].apply(neologdn.normalize) 24 | le = preprocessing.LabelEncoder() 25 | df['label'] = le.fit_transform(df['label']) 26 | 27 | df_train, df_test, y_train, y_test = train_test_split( 28 | df, df['label'].values, test_size=0.2, random_state=42, stratify=df['label']) 29 | 30 | # tokenize 31 | tokenizer = WordTokenizer('MeCab') 32 | docs_train = np.array([ 33 | ' '.join(map(str, tokenizer.tokenize(text))) for text in df_train['text'] 34 | ]) 35 | docs_test = np.array([ 36 | ' '.join(map(str, tokenizer.tokenize(text))) for text in df_test['text'] 37 | ]) 38 | 39 | # tfidf: Don't use df_test for fitting 40 | count_vec = CountVectorizer(min_df=2, 41 | max_features=20000, 42 | ngram_range=(1, 3)) 43 | bags_train = count_vec.fit_transform(docs_train) 44 | bags_test = count_vec.transform(docs_test) 45 | 46 | tfidf = TfidfTransformer(use_idf=True, 
norm='l2', smooth_idf=True) 47 | tf_idf_train = tfidf.fit_transform(bags_train) 48 | tf_idf_test = tfidf.transform(bags_test) 49 | 50 | X_train = pd.DataFrame(tf_idf_train.toarray()) 51 | X_test = pd.DataFrame(tf_idf_test.toarray()) 52 | 53 | return X_train.reset_index(drop=True), X_test.reset_index(drop=True), y_train, y_test 54 | 55 | 56 | if __name__ == '__main__': 57 | 58 | df = load_pandas_df(shuffle=True) 59 | X_train, X_test, y_train, y_test = preprocess_data(df) 60 | 61 | RUN_NAME = 'lgbm' 62 | logger.add(f'data/{RUN_NAME}/result.log', 63 | colorize=True, 64 | format='{time} {message}') 65 | logger.info(f'{X_train.shape}, {X_test.shape}') 66 | 67 | y_preds = [] 68 | NUM_CLASS = 9 69 | oof_train = np.zeros((len(X_train), NUM_CLASS)) 70 | cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=0) 71 | 72 | params = { 73 | 'objective': 'multiclass', 74 | 'num_class': NUM_CLASS, 75 | 'num_leaves': 12, 76 | 'max_depth': 4, 77 | 'feature_fraction': 0.8, 78 | 'subsample_freq': 1, 79 | 'bagging_fraction': 0.7, 80 | 'min_data_in_leaf': 10, 81 | 'learning_rate': 0.1, 82 | 'boosting': 'gbdt', 83 | 'lambda_l1': 0.4, 84 | 'lambda_l2': 0.4, 85 | 'verbosity': -1, 86 | 'random_state': 42 87 | } 88 | 89 | for fold_id, (train_index, valid_index) in enumerate(tqdm(cv.split(X_train, y_train))): 90 | if fold_id == 0: 91 | X_tr = X_train.loc[train_index, :] 92 | X_val = X_train.loc[valid_index, :] 93 | y_tr = y_train[train_index] 94 | y_val = y_train[valid_index] 95 | 96 | lgb_train = lgb.Dataset(X_tr, y_tr) 97 | lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train) 98 | 99 | model = lgb.train(params, 100 | lgb_train, 101 | valid_sets=[lgb_train, lgb_eval], 102 | verbose_eval=10, 103 | num_boost_round=1000, 104 | early_stopping_rounds=10) 105 | 106 | Data.dump(model, f'data/{RUN_NAME}/model_{fold_id}.pkl') 107 | 108 | oof_train[valid_index] = model.predict(X_val) 109 | score = log_loss(y_val, oof_train[valid_index]) 110 | logger.info(f'fold {fold_id}, log_loss: {score}') 111 | 112 | y_pred = model.predict(X_test) 113 | y_preds.append(y_pred) 114 | 115 | y_preds = np.mean(y_preds, axis=0) 116 | logger.info(f'test, log_loss: {log_loss(y_test, y_preds)}') 117 | result_dict = eval_classification(y_test, y_preds.argmax(axis=1)) 118 | logger.info(str(result_dict)) 119 | """ 120 | {'accuracy': 0.9512, 121 | 'precision': [0.9253, 0.9714, 0.9713, 0.9348, 0.9286, 0.8786, 1.0, 0.9831, 0.9608], 122 | 'recall': [0.9253, 0.977, 0.9769, 0.8431, 0.9713, 0.8994, 1.0, 0.9667, 0.9545], 123 | 'f1': [0.9253, 0.9742, 0.9741, 0.8866, 0.9494, 0.8889, 1.0, 0.9748, 0.9577]} 124 | """ 125 | 126 | Data.dump(oof_train, f'data/{RUN_NAME}/oof_train.pkl') 127 | Data.dump(y_preds, f'data/{RUN_NAME}/y_preds.pkl') 128 | -------------------------------------------------------------------------------- /examples/text_classification/tfidf_logistic_regression.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from konoha import WordTokenizer 4 | from loguru import logger 5 | import neologdn 6 | import numpy as np 7 | import pandas as pd 8 | from sklearn import preprocessing 9 | from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer 10 | from sklearn.linear_model import LogisticRegression 11 | from sklearn.metrics import log_loss 12 | from sklearn.model_selection import train_test_split, StratifiedKFold 13 | from tqdm import tqdm 14 | 15 | sys.path.append('.') 16 | from utils_nlp.common.data import Data 17 | from utils_nlp.dataset.livedoor import 
load_pandas_df 18 | from utils_nlp.eval.classification import eval_classification 19 | 20 | 21 | def preprocess_data(df): 22 | # split 23 | df['text'] = df['text'].apply(neologdn.normalize) 24 | le = preprocessing.LabelEncoder() 25 | df['label'] = le.fit_transform(df['label']) 26 | 27 | df_train, df_test, y_train, y_test = train_test_split( 28 | df, df['label'].values, test_size=0.2, random_state=42, stratify=df['label']) 29 | 30 | # tokenize 31 | tokenizer = WordTokenizer('MeCab') 32 | docs_train = np.array([ 33 | ' '.join(map(str, tokenizer.tokenize(text))) for text in df_train['text'] 34 | ]) 35 | docs_test = np.array([ 36 | ' '.join(map(str, tokenizer.tokenize(text))) for text in df_test['text'] 37 | ]) 38 | 39 | # tfidf: Don't use df_test for fitting 40 | count_vec = CountVectorizer(min_df=2, 41 | max_features=20000, 42 | ngram_range=(1, 3)) 43 | bags_train = count_vec.fit_transform(docs_train) 44 | bags_test = count_vec.transform(docs_test) 45 | 46 | tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True) 47 | tf_idf_train = tfidf.fit_transform(bags_train) 48 | tf_idf_test = tfidf.transform(bags_test) 49 | 50 | X_train = pd.DataFrame(tf_idf_train.toarray()) 51 | X_test = pd.DataFrame(tf_idf_test.toarray()) 52 | 53 | return X_train.reset_index(drop=True), X_test.reset_index(drop=True), y_train, y_test 54 | 55 | 56 | if __name__ == '__main__': 57 | 58 | df = load_pandas_df(shuffle=True) 59 | X_train, X_test, y_train, y_test = preprocess_data(df) 60 | 61 | RUN_NAME = 'logistic_regression' 62 | logger.add(f'data/{RUN_NAME}/result.log', 63 | colorize=True, 64 | format='{time} {message}') 65 | logger.info(f'{X_train.shape}, {X_test.shape}') 66 | 67 | y_preds = [] 68 | NUM_CLASS = 9 69 | oof_train = np.zeros((len(X_train), NUM_CLASS)) 70 | cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=0) 71 | 72 | for fold_id, (train_index, valid_index) in enumerate(tqdm(cv.split(X_train, y_train))): 73 | if fold_id == 0: 74 | X_tr = X_train.loc[train_index, :] 75 | X_val = X_train.loc[valid_index, :] 76 | y_tr = y_train[train_index] 77 | y_val = y_train[valid_index] 78 | 79 | model = LogisticRegression(penalty='l2', solver='sag', random_state=0) 80 | model.fit(X_tr, y_tr) 81 | Data.dump(model, f'data/{RUN_NAME}/model_{fold_id}.pkl') 82 | 83 | oof_train[valid_index] = model.predict_proba(X_val) 84 | score = log_loss(y_val, oof_train[valid_index]) 85 | logger.info(f'fold {fold_id}, log_loss: {score}') 86 | 87 | y_pred = model.predict_proba(X_test) 88 | y_preds.append(y_pred) 89 | 90 | y_preds = np.mean(y_preds, axis=0) 91 | logger.info(f'test, log_loss: {log_loss(y_test, y_preds)}') 92 | result_dict = eval_classification(y_test, y_preds.argmax(axis=1)) 93 | logger.info(str(result_dict)) 94 | """ 95 | {'accuracy': 0.9308, 96 | 'precision': [0.8771, 0.96, 0.9639, 0.9412, 0.9198, 0.8678, 0.9771, 0.9309, 0.9517], 97 | 'recall': [0.9023, 0.9655, 0.9249, 0.7843, 0.9885, 0.8935, 0.9828, 0.9722, 0.8961], 98 | 'f1': [0.8895, 0.9628, 0.944, 0.8556, 0.9529, 0.8805, 0.9799, 0.9511, 0.9231]} 99 | """ 100 | 101 | Data.dump(oof_train, f'data/{RUN_NAME}/oof_train.pkl') 102 | Data.dump(y_preds, f'data/{RUN_NAME}/y_preds.pkl') 103 | -------------------------------------------------------------------------------- /examples/visualization/README.md: -------------------------------------------------------------------------------- 1 | # Visualization 2 | 3 | This folder contains examples for visualization with Japanese texts. 
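Matplotlib's bundled fonts cannot render Japanese glyphs, so labels come out as empty boxes; that is why the examples below rely on [japanize-matplotlib](https://github.com/uehara1414/japanize-matplotlib). Pointing matplotlib at any installed Japanese font achieves the same effect without the extra dependency; in the hedged sketch below, the font name `IPAexGothic` and the output path are assumptions to adjust for your environment:

```python
import matplotlib

matplotlib.use('Agg')  # headless backend, e.g. inside the container
import matplotlib.pyplot as plt

# Assumes a Japanese font such as IPAexGothic is installed on the system.
plt.rcParams['font.family'] = 'IPAexGothic'
plt.bar(['スポーツ', '映画', 'グルメ'], [3, 1, 2])
plt.title('日本語ラベルの例')
plt.savefig('japanize_alternative.png')
```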
4 | 5 | ## Summary 6 | 7 | |Notebook|Environment|Description| 8 | |---|---|---| 9 | |[Visualization](visualization.ipynb)|[Kaggle](https://www.kaggle.com/sishihara/japanese-text-visualization-by-nlplot)| Visualization by [nlplot](https://github.com/takapy0210/nlplot) | 10 | |[japanize-matplotlib](japanize_matplotlib.py)|Local| Use Japanese labels by [japanize-matplotlib](https://github.com/uehara1414/japanize-matplotlib) | 11 | -------------------------------------------------------------------------------- /examples/visualization/japanize.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/upura/nlp-recipes-ja/8ac5e898864137841de8b03c11da34815009af24/examples/visualization/japanize.png -------------------------------------------------------------------------------- /examples/visualization/japanize_labels.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import japanize_matplotlib 3 | 4 | from utils_nlp.dataset.livedoor import load_pandas_df 5 | 6 | 7 | if __name__ == '__main__': 8 | df = load_pandas_df() 9 | df['first_char'] = df['text'].str[0] 10 | plot_df = df['first_char'].value_counts()[:10].reset_index() 11 | 12 | japanize_matplotlib.japanize() 13 | plt.figure(figsize=(15, 8)) 14 | plt.bar(plot_df['index'], plot_df['first_char']) 15 | plt.savefig('examples/visualization/japanize.png') 16 | -------------------------------------------------------------------------------- /examples/visualization/visualization.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"metadata":{"trusted":true},"cell_type":"code","source":"!pip install nagisa\n!pip install nlplot","execution_count":null,"outputs":[]},{"metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"cell_type":"code","source":"import glob\nimport os\n\nimport nagisa\nimport nlplot\nimport pandas as pd","execution_count":null,"outputs":[]},{"metadata":{"_uuid":"d629ff2d2480ee46fbb7e2d37f6b5fab8052498a","_cell_guid":"79c7e3d0-c299-4dcb-8224-4455121ee9b0","trusted":true},"cell_type":"code","source":"def extract_txt(filename: str) -> str:\n with open(filename) as text_file:\n # 0: URL, 1: timestamp\n text = text_file.readlines()[2:]\n text = [sentence.strip() for sentence in text]\n text = list(filter(lambda line: line != '', text))\n return ''.join(text)\n\n\nEXTRACTDIR = '/kaggle/input/livedoor-news/'\ncategories = [\n name for name\n in os.listdir(os.path.join(EXTRACTDIR, \"text\"))\n if os.path.isdir(os.path.join(EXTRACTDIR, \"text\", name))]\n\ncategories = sorted(categories)\ntable = str.maketrans({\n '\\n': '',\n '\\t': ' ',\n '\\r': '',\n})\n\nall_text = []\nall_label = []\n\nfor cat in categories:\n files = glob.glob(os.path.join(EXTRACTDIR, \"text\", cat, \"{}*.txt\".format(cat)))\n files = sorted(files)\n body = [extract_txt(elem).translate(table) for elem in files]\n label = [cat] * len(body)\n\n all_text.extend(body)\n all_label.extend(label)\n\ndf = pd.DataFrame({'text': all_text, 'label': all_label})","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"df.head()","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"df = df.loc[:10]\ntagger = nagisa.Tagger()\ndf['sep_text'] = [tagger.extract(text, extract_postags=['名詞']).words for text in 
df['text']]","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"npt = nlplot.NLPlot(df, target_col='sep_text')\nstopwords = npt.get_stopword(top_n=5, min_freq=0)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# uni-gram\nnpt.bar_ngram(\n title='uni-gram',\n xaxis_label='word_count',\n yaxis_label='word',\n ngram=1,\n top_n=50,\n width=800,\n height=1100,\n stopwords=stopwords,\n)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# bi-gram\nnpt.bar_ngram(\n title='bi-gram',\n xaxis_label='word_count',\n yaxis_label='word',\n ngram=2,\n top_n=50,\n width=800,\n height=1100,\n stopwords=stopwords,\n)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"npt.treemap(\n title='Tree Map',\n ngram=1,\n stopwords=stopwords,\n)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"npt.word_distribution(\n title='number of words distribution'\n)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"npt.wordcloud(\n stopwords=stopwords,\n colormap='tab20_r',\n)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"npt.build_graph(stopwords=stopwords, min_edge_frequency=5)\nnpt.co_network(\n title='Co-occurrence network',\n color_palette='hls',\n width=1000,\n height=1200,\n)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"npt.sunburst(\n title='sunburst chart',\n colorscale=True,\n color_continuous_scale='Oryel',\n width=1000,\n height=800,\n)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"npt.ldavis(num_topics=3, passes=5, save=False)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"","execution_count":null,"outputs":[]}],"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"pygments_lexer":"ipython3","nbconvert_exporter":"python","version":"3.6.4","file_extension":".py","codemirror_mode":{"name":"ipython","version":3},"name":"python","mimetype":"text/x-python"}},"nbformat":4,"nbformat_minor":4} -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | chakin==0.0.8 2 | ja-sentence-segmenter==0.0.2 3 | japanize-matplotlib==1.1.2 4 | konoha[all]==4.6.1 5 | loguru==0.5.1 6 | mecab-python3==1.0.3 7 | nagisa==0.2.7 8 | neologdn==0.4 9 | oseti==0.2 10 | pykakasi==2.0.1 11 | pytorch-lightning==1.2.7 12 | transformers==4.5.0 13 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/upura/nlp-recipes-ja/8ac5e898864137841de8b03c11da34815009af24/tests/README.md -------------------------------------------------------------------------------- /utils_nlp/README.md: -------------------------------------------------------------------------------- 1 | # NLP Utilities 2 | -------------------------------------------------------------------------------- /utils_nlp/common/data.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import joblib 4 | 5 | 6 | class Data: 7 | @classmethod 8 | 
def dump(cls, value, path): 9 | os.makedirs(os.path.dirname(path), exist_ok=True) 10 | joblib.dump(value, path, compress=True) 11 | 12 | @classmethod 13 | def load(cls, path): 14 | return joblib.load(path) 15 | -------------------------------------------------------------------------------- /utils_nlp/dataset/README.md: -------------------------------------------------------------------------------- 1 | # Dataset 2 | 3 | This submodule includes helper functions for downloading datasets and formatting them appropriately for the example scripts. 4 | 5 | ## Data Loading 6 | 7 | There is a dataloader for each supported dataset. For example, the livedoor module will allow you to load a dataframe in pandas from the livedoor dataset, with the option to set the number of rows to load in order to test algorithms and evaluate performance benchmarks. 8 | The number of rows to load and whether to shuffle them can be controlled via arguments, for example: 9 | 10 | ```python 11 | from utils_nlp.dataset.livedoor import load_pandas_df 12 | 13 | df = load_pandas_df(nrows=1000, shuffle=False) 14 | ``` 15 | 16 | ## Dataset List 17 | |Dataset|Dataloader script| 18 | |-------|-----------------| 19 | |[livedoor ニュースコーパス](https://www.rondhuit.com/download.html)|[livedoor.py](./livedoor.py)| 20 | -------------------------------------------------------------------------------- /utils_nlp/dataset/livedoor.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import tarfile 4 | from urllib.request import urlretrieve 5 | 6 | import pandas as pd 7 | 8 | 9 | def load_pandas_df(nrows: int = None, shuffle: bool = False) -> pd.DataFrame: 10 | """Loads the livedoor dataset as pd.DataFrame 11 | This code is from https://github.com/yoheikikuta/bert-japanese/blob/master/notebook/finetune-to-livedoor-corpus.ipynb 12 | 13 | Args: 14 | nrows (int, optional): Number of rows to load. Defaults to None (all rows are loaded).
15 | 16 | Returns: 17 | pd.DataFrame: livedoor dataset 18 | """ 19 | if os.path.exists('./data/livedoor.csv'): 20 | df = pd.read_csv('./data/livedoor.csv') 21 | else: 22 | df = download_livedoor() 23 | 24 | if shuffle: 25 | df = df.sample(frac=1, random_state=7).reset_index(drop=True) 26 | 27 | if nrows: 28 | df = df[:nrows] 29 | 30 | return df 31 | 32 | 33 | def download_livedoor() -> pd.DataFrame: 34 | """Download the dataset from "https://www.rondhuit.com/download.html", unzip, and load 35 | 36 | Returns: 37 | pd.DataFrame: livedoor dataset 38 | """ 39 | FILEURL = 'https://www.rondhuit.com/download/ldcc-20140209.tar.gz' 40 | FILEPATH = './data/ldcc-20140209.tar.gz' 41 | EXTRACTDIR = './data/livedoor/' 42 | urlretrieve(FILEURL, FILEPATH) 43 | 44 | mode = "r:gz" 45 | tar = tarfile.open(FILEPATH, mode) 46 | tar.extractall(EXTRACTDIR) 47 | tar.close() 48 | 49 | categories = [ 50 | name for name 51 | in os.listdir(os.path.join(EXTRACTDIR, "text")) 52 | if os.path.isdir(os.path.join(EXTRACTDIR, "text", name))] 53 | 54 | categories = sorted(categories) 55 | table = str.maketrans({ 56 | '\n': '', 57 | '\t': ' ', 58 | '\r': '', 59 | }) 60 | 61 | all_text = [] 62 | all_label = [] 63 | 64 | for cat in categories: 65 | files = glob.glob(os.path.join(EXTRACTDIR, "text", cat, "{}*.txt".format(cat))) 66 | files = sorted(files) 67 | body = [extract_txt(elem).translate(table) for elem in files] 68 | label = [cat] * len(body) 69 | 70 | all_text.extend(body) 71 | all_label.extend(label) 72 | 73 | df = pd.DataFrame({'text': all_text, 'label': all_label}) 74 | df.to_csv('./data/livedoor.csv', index=False) 75 | return df 76 | 77 | 78 | def extract_txt(filename: str) -> str: 79 | with open(filename) as text_file: 80 | # 0: URL, 1: timestamp 81 | text = text_file.readlines()[2:] 82 | text = [sentence.strip() for sentence in text] 83 | text = list(filter(lambda line: line != '', text)) 84 | return ''.join(text) 85 | -------------------------------------------------------------------------------- /utils_nlp/eval/classification.py: -------------------------------------------------------------------------------- 1 | # Implementation from https://github.com/microsoft/nlp-recipes/blob/master/utils_nlp/eval/classification.py 2 | from sklearn.metrics import ( 3 | accuracy_score, 4 | precision_score, 5 | recall_score, 6 | f1_score, 7 | confusion_matrix, 8 | ) 9 | from numpy import corrcoef 10 | 11 | from matplotlib import pyplot 12 | import seaborn as sn 13 | import numpy as np 14 | import pandas as pd 15 | 16 | 17 | def eval_classification(actual, predicted, round_decimals=4): 18 | """Returns common classification evaluation metrics. 19 | Args: 20 | actual (1d array-like): Array of actual values. 21 | predicted (1d array-like): Array of predicted values. 22 | round_decimals (int, optional): Number of decimal places. Defaults to 4. 23 | Returns: 24 | dict: A dictionary of evaluation metrics. 25 | """ 26 | return { 27 | "accuracy": accuracy_score(actual, predicted).round(round_decimals), 28 | "precision": list(precision_score(actual, predicted, average=None).round(round_decimals)), 29 | "recall": list(recall_score(actual, predicted, average=None).round(round_decimals)), 30 | "f1": list(f1_score(actual, predicted, average=None).round(round_decimals)), 31 | } 32 | 33 | 34 | def compute_correlation_coefficients(x, y=None): 35 | """ 36 | Compute Pearson product-moment correlation coefficients. 37 | Args: 38 | x: array_like 39 | A 1-D or 2-D array containing multiple variables and observations. 
40 | Each row of `x` represents a variable, and each column a single 41 | observation of all those variables. 42 | y: array_like, optional 43 | An additional set of variables and observations. `y` has the same 44 | shape as `x`. 45 | Returns: 46 | pd.DataFrame : A pandas dataframe from the correlation coefficient matrix of the variables. 47 | """ 48 | return pd.DataFrame(corrcoef(x, y)) 49 | 50 | 51 | def plot_confusion_matrix( 52 | y_true, 53 | y_pred, 54 | labels, 55 | normalize=False, 56 | title="Confusion matrix", 57 | plot_size=(8, 5), 58 | font_scale=1.1, 59 | ): 60 | """Plots a graphical representation of a confusion matrix using a Seaborn heatmap 61 | Args: 62 | y_true (1d array-like): True labels from dataset 63 | y_pred (1d array-like): Predicted labels from the models 64 | labels: A list of labels 65 | normalize (bool, optional): Whether to row-normalize the confusion matrix. Defaults to False. 66 | title (str, optional): Title of the plot. Defaults to "Confusion matrix". 67 | plot_size (tuple, optional): Plot dimensions. Defaults to (8, 5). 68 | font_scale (float, optional): Scale factor for fonts within the plot. Defaults to 1.1. 69 | """ 70 | conf_matrix = np.array(confusion_matrix(y_true, y_pred)) 71 | if normalize: 72 | conf_matrix = np.round( 73 | conf_matrix.astype("float") / conf_matrix.sum(axis=1)[:, np.newaxis], 3 74 | ) 75 | conf_dataframe = pd.DataFrame(conf_matrix, labels, labels) 76 | fig, ax = pyplot.subplots(figsize=plot_size) 77 | sn.set(font_scale=font_scale) 78 | ax.set_title(title) 79 | ax = sn.heatmap(conf_dataframe, cmap="Blues", annot=True, annot_kws={"size": 16}, fmt="g") 80 | ax.set(xlabel="Predicted Labels", ylabel="True Labels") 81 | -------------------------------------------------------------------------------- /utils_nlp/features/README.md: -------------------------------------------------------------------------------- 1 | # Features 2 | 3 | The features submodule contains implementations of various algorithms that can create features from sentences. 4 | 5 | ## Summary 6 | 7 | The following table summarizes each submodule.
8 | 9 | |Submodule|Description| 10 | |---|---| 11 | |[SWEM](swem.py) | Create swem-max and swem-mean vectors| 12 | |[SCDV](scdv.py) | Create scdv vectors| 13 | -------------------------------------------------------------------------------- /utils_nlp/features/scdv.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import numpy as np 3 | import pandas as pd 4 | from sklearn.mixture import GaussianMixture 5 | 6 | 7 | def create(docs, word_vec, n_components=10): 8 | """Create scdv vectors 9 | 10 | Args: 11 | docs: np.array() 12 | word_vec: Loaded word2vectors 13 | n_components (int, optional): Number of components 14 | 15 | Returns: 16 | swem: Created scdv vectors 17 | """ 18 | n_wv_embed = word_vec.vector_size 19 | 20 | # Create vocab set of w2v model and corpus 21 | vocab_model = set(k for k in word_vec.vocab.keys()) 22 | vocab_docs = set([w for doc in docs for w in doc]) 23 | out_of_vocabs = len(vocab_docs) - len(vocab_docs & vocab_model) 24 | print('out of vocabs: {out_of_vocabs}'.format(**locals())) 25 | use_words = list(vocab_docs & vocab_model) 26 | 27 | df_use = pd.DataFrame() 28 | df_use['word'] = use_words 29 | df_idf = create_idf_dataframe(docs) 30 | df_use = pd.merge(df_use, df_idf, on='word', how='left') 31 | idf = df_use['idf'].values 32 | 33 | use_word_vectors = np.array([word_vec[w] for w in use_words]) 34 | 35 | clf = GaussianMixture(n_components=n_components, covariance_type='tied', verbose=2) 36 | clf.fit(use_word_vectors) 37 | 38 | word_probs = clf.predict_proba(use_word_vectors) 39 | # (n_vocabs, n_components,) 40 | word_cluster_vector = use_word_vectors[:, None, :] * word_probs[:, :, None] 41 | # (n_vocabs, n_components, n_wv_embed) 42 | 43 | topic_vector = word_cluster_vector.reshape(-1, n_components * n_wv_embed) * idf[:, None] 44 | 45 | topic_vector[np.isnan(topic_vector)] = 0 46 | word_to_topic = dict(zip(use_words, topic_vector)) 47 | n_embedding = topic_vector.shape[1] 48 | 49 | cdv_vector = create_document_vector(docs, word_to_topic, n_embedding) 50 | compressed = compress_document_vector(cdv_vector) 51 | 52 | return compressed 53 | 54 | 55 | def create_idf_dataframe(documents): 56 | """Create idf pd.DataFrame 57 | 58 | Args: 59 | documents (list[str]): 60 | Returns: 61 | [pd.DataFrame]: Created pd.DataFrame 62 | """ 63 | 64 | d = defaultdict(int) 65 | 66 | for doc in documents: 67 | vocab_i = set(doc) 68 | for w in list(vocab_i): 69 | d[w] += 1 70 | 71 | df_idf = pd.DataFrame() 72 | df_idf['count'] = d.values() 73 | df_idf['word'] = d.keys() 74 | df_idf['idf'] = np.log(len(documents) / df_idf['count']) 75 | return df_idf 76 | 77 | 78 | def create_document_vector(documents, w2t, n_embedding): 79 | doc_vectors = [] 80 | 81 | for doc in documents: 82 | vector_i = np.zeros(shape=(n_embedding,)) 83 | for w in doc: 84 | try: 85 | v = w2t[w] 86 | vector_i += v 87 | except KeyError: 88 | continue 89 | doc_vectors.append(vector_i) 90 | return np.array(doc_vectors) 91 | 92 | 93 | def compress_document_vector(doc_vector, p=.04): 94 | v = np.copy(doc_vector) 95 | vec_norm = np.linalg.norm(v, axis=1) 96 | # To escape from zero division 97 | vec_norm = np.where(vec_norm > 0, vec_norm, 1.) 98 | v /= vec_norm[:, None] 99 | 100 | a_min = v.min(axis=1).mean() 101 | a_max = v.max(axis=1).mean() 102 | threshold = (abs(a_min) + abs(a_max)) / 2. 
* p 103 | v[abs(v) < threshold] = .0 104 | return v 105 | -------------------------------------------------------------------------------- /utils_nlp/features/swem.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from tqdm import tqdm 3 | 4 | from utils_nlp.models.pretrained_embeddings.word2vec import convert_to_wv 5 | 6 | 7 | def create(docs, word_vec, aggregation='max'): 8 | """Create swem vectors 9 | 10 | Args: 11 | docs: Iterable of tokenized documents (each a list of words) 12 | word_vec: Loaded word2vectors 13 | aggregation (str, optional): Pooling method, either 'max' or 'mean'. Defaults to 'max'. 14 | 15 | Raises: 16 | ValueError: Invalid aggregation arg 17 | 18 | Returns: 19 | swem: Created swem vectors 20 | """ 21 | if aggregation == 'max': 22 | agg = np.max 23 | elif aggregation == 'mean': 24 | agg = np.mean 25 | else: 26 | raise ValueError() 27 | 28 | swem = [] 29 | for sentence in tqdm(docs, total=len(docs)): 30 | embed_i = [convert_to_wv(s, word_vec) for s in sentence] 31 | embed_i = np.array(embed_i) 32 | embed_i = agg(embed_i, axis=0) 33 | swem.append(embed_i) 34 | swem = np.array(swem) 35 | return swem 36 | -------------------------------------------------------------------------------- /utils_nlp/models/README.md: -------------------------------------------------------------------------------- 1 | # Models 2 | 3 | The models submodule contains implementations of various algorithms that can be used in addition to external packages to evaluate and develop new natural language processing systems. 4 | 5 | ## Summary 6 | 7 | The following table summarizes each submodule. 8 | 9 | |Submodule|Description| 10 | |---|---| 11 | |[pretrained embeddings](./pretrained_embeddings) | This submodule provides utilities to download and extract pretrained word embeddings trained with Word2Vec, GloVe, fastText methods.| |[nn](./nn) | This submodule contains neural network utilities such as datasets, model definitions, and runners.| 12 | -------------------------------------------------------------------------------- /utils_nlp/models/nn/README.md: -------------------------------------------------------------------------------- 1 | # Neural Networks 2 | 3 | The neural networks submodule contains utility functions like datasets and runners.
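For orientation, the sketch below shows one way these pieces can be wired together with `transformers` and `pytorch-lightning`. The full training script is `examples/text_classification/run_bert.py`; the checkpoint name, sequence length, batch size, and label encoding below are illustrative assumptions, not values taken from that script.

```python
import pytorch_lightning as pl
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

from utils_nlp.dataset.livedoor import load_pandas_df
from utils_nlp.models.nn.datasets import LivedoorDataset
from utils_nlp.models.nn.models import PLBertClassifier

MODEL_NAME = 'cl-tohoku/bert-base-japanese-whole-word-masking'  # assumed checkpoint; its tokenizer needs the Japanese MeCab dependencies

df = load_pandas_df(shuffle=True)
df['label'] = df['label'].astype('category').cat.codes  # encode string labels as integers

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
dataset = LivedoorDataset(df, tokenizer, max_len=128)  # max_len=128 is an illustrative choice
loader = DataLoader(dataset, batch_size=16, shuffle=True)  # batch_size=16 is an illustrative choice

model = PLBertClassifier(model_name=MODEL_NAME, num_classes=9)
trainer = pl.Trainer(max_epochs=1)
trainer.fit(model, loader)
```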
4 | -------------------------------------------------------------------------------- /utils_nlp/models/nn/datasets.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class LivedoorDataset: 5 | 6 | def __init__(self, dataframe, tokenizer, max_len): 7 | self.tokenizer = tokenizer 8 | self.data = dataframe 9 | self.comment_text = dataframe['text'] 10 | self.targets = self.data['label'] 11 | self.max_len = max_len 12 | 13 | def __len__(self): 14 | return len(self.comment_text) 15 | 16 | def __getitem__(self, index): 17 | inputs = self.tokenizer.encode_plus( 18 | self.comment_text[index], 19 | None, 20 | add_special_tokens=True, 21 | max_length=self.max_len, 22 | truncation=True, 23 | padding='max_length', 24 | return_token_type_ids=True 25 | ) 26 | ids = inputs['input_ids'] 27 | attention_mask = inputs['attention_mask'] 28 | token_type_ids = inputs["token_type_ids"] 29 | 30 | return { 31 | 'ids': torch.tensor(ids, dtype=torch.long), 32 | 'attention_mask': torch.tensor(attention_mask, dtype=torch.long), 33 | 'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long), 34 | 'targets': torch.tensor(self.targets[index], dtype=torch.long) 35 | } 36 | 37 | 38 | class LivedoorDatasetT5: 39 | 40 | def __init__(self, dataframe, tokenizer, max_len): 41 | self.tokenizer = tokenizer 42 | self.data = dataframe 43 | self.comment_text = dataframe['text'] 44 | self.targets = self.data['label'] 45 | self.max_len = max_len 46 | 47 | def __len__(self): 48 | return len(self.comment_text) 49 | 50 | def __getitem__(self, index): 51 | tokenized_inputs = self.tokenizer.encode_plus( 52 | self.comment_text[index], max_length=self.max_len, truncation=True, 53 | padding="max_length", return_tensors="pt" 54 | ) 55 | 56 | tokenized_targets = self.tokenizer.encode_plus( 57 | str(self.targets[index]), max_length=4, truncation=True, 58 | padding="max_length", return_tensors="pt" 59 | ) 60 | 61 | source_ids = tokenized_inputs["input_ids"].squeeze() 62 | target_ids = tokenized_targets["input_ids"].squeeze() 63 | 64 | source_mask = tokenized_inputs["attention_mask"].squeeze() 65 | target_mask = tokenized_targets["attention_mask"].squeeze() 66 | 67 | return {"source_ids": source_ids, "source_mask": source_mask, 68 | "target_ids": target_ids, "target_mask": target_mask} 69 | -------------------------------------------------------------------------------- /utils_nlp/models/nn/models.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import numpy as np 4 | import pytorch_lightning as pl 5 | import torch 6 | from transformers import BertModel, T5ForConditionalGeneration, T5Tokenizer 7 | 8 | sys.path.append('.') 9 | from utils_nlp.eval.classification import eval_classification 10 | 11 | 12 | class BertClassifier(torch.nn.Module): 13 | def __init__(self, model_name: str, num_classes: int = 9): 14 | super().__init__() 15 | self.l1 = BertModel.from_pretrained(model_name) 16 | self.l2 = torch.nn.Dropout(0.3) 17 | if 'large' in model_name: 18 | self.l3 = torch.nn.Linear(1024, num_classes) 19 | else: 20 | self.l3 = torch.nn.Linear(768, num_classes) 21 | 22 | def forward(self, ids, attention_mask, token_type_ids): 23 | _, output_1 = self.l1(ids, attention_mask=attention_mask, token_type_ids=token_type_ids, return_dict=False) 24 | output_2 = self.l2(output_1) 25 | output = self.l3(output_2) 26 | return output 27 | 28 | 29 | class PLBertClassifier(pl.LightningModule): 30 | def __init__(self, model_name: str, num_classes: 
int = 9): 31 | super().__init__() 32 | self.backbone = BertClassifier(model_name, num_classes) 33 | self.criterion = torch.nn.CrossEntropyLoss() 34 | 35 | def forward(self, ids, attention_mask, token_type_ids): 36 | output = self.backbone(ids, attention_mask=attention_mask, token_type_ids=token_type_ids) 37 | return output 38 | 39 | def training_step(self, batch, batch_idx): 40 | ids = batch["ids"] 41 | attention_mask = batch["attention_mask"] 42 | token_type_ids = batch["token_type_ids"] 43 | targets = batch["targets"] 44 | output = self.backbone(ids, attention_mask=attention_mask, token_type_ids=token_type_ids) 45 | loss = self.criterion(output, targets) 46 | return loss 47 | 48 | def validation_step(self, batch, batch_idx): 49 | ids = batch["ids"] 50 | attention_mask = batch["attention_mask"] 51 | token_type_ids = batch["token_type_ids"] 52 | targets = batch["targets"] 53 | output = self.backbone(ids, attention_mask=attention_mask, token_type_ids=token_type_ids) 54 | loss = self.criterion(output, targets) 55 | return {'oof': output, 'targets': targets} 56 | 57 | def validation_epoch_end(self, outputs): 58 | oof = np.concatenate( 59 | [x['oof'].detach().cpu().numpy() for x in outputs], axis=0 60 | ) 61 | targets = np.concatenate( 62 | [x['targets'].detach().cpu().numpy() for x in outputs], axis=0 63 | ) 64 | print(eval_classification(targets, oof.argmax(axis=1))) 65 | 66 | def test_step(self, batch, batch_idx): 67 | ids = batch["ids"] 68 | attention_mask = batch["attention_mask"] 69 | token_type_ids = batch["token_type_ids"] 70 | targets = batch["targets"] 71 | output = self.backbone(ids, attention_mask=attention_mask, token_type_ids=token_type_ids) 72 | loss = self.criterion(output, targets) 73 | return {'preds': output} 74 | 75 | def test_epoch_end(self, outputs): 76 | preds = np.concatenate( 77 | [x['preds'].detach().cpu().numpy() for x in outputs], axis=0 78 | ) 79 | np.save('data/bert/preds', preds) 80 | 81 | def configure_optimizers(self): 82 | optimizer = torch.optim.Adam(self.parameters(), lr=1e-4) 83 | scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=self.trainer.max_epochs) 84 | return [optimizer], [scheduler] 85 | 86 | 87 | class PLT5Classifier(pl.LightningModule): 88 | def __init__(self, model_name: str): 89 | super().__init__() 90 | self.backbone = T5ForConditionalGeneration.from_pretrained(model_name) 91 | self.tokenizer = T5Tokenizer.from_pretrained(model_name, is_fast=True) 92 | 93 | def forward(self, input_ids, attention_mask=None, decoder_input_ids=None, 94 | decoder_attention_mask=None, labels=None): 95 | """順伝搬""" 96 | return self.backbone( 97 | input_ids=input_ids, 98 | attention_mask=attention_mask, 99 | decoder_input_ids=decoder_input_ids, 100 | decoder_attention_mask=decoder_attention_mask, 101 | labels=labels 102 | ) 103 | 104 | def _step(self, batch): 105 | """ロス計算""" 106 | labels = batch["target_ids"] 107 | 108 | # All labels set to -100 are ignored (masked), 109 | # the loss is only computed for labels in [0, ..., config.vocab_size] 110 | labels[labels[:, :] == self.tokenizer.pad_token_id] = -100 111 | outputs = self( 112 | input_ids=batch["source_ids"], 113 | attention_mask=batch["source_mask"], 114 | decoder_attention_mask=batch['target_mask'], 115 | labels=labels 116 | ) 117 | 118 | loss = outputs[0] 119 | return loss 120 | 121 | def training_step(self, batch, batch_idx): 122 | """訓練ステップ処理""" 123 | loss = self._step(batch) 124 | self.log("train_loss", loss) 125 | return {"loss": loss} 126 | 127 | def validation_step(self, batch, 
batch_idx): 128 | """バリデーションステップ処理""" 129 | loss = self._step(batch) 130 | self.log("val_loss", loss) 131 | return {"val_loss": loss} 132 | 133 | def validation_epoch_end(self, outputs): 134 | """バリデーション完了処理""" 135 | avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean() 136 | self.log("val_loss", avg_loss, prog_bar=True) 137 | 138 | def test_step(self, batch, batch_idx): 139 | outs = self.backbone.generate( 140 | input_ids=batch["source_ids"], 141 | attention_mask=batch["source_mask"], 142 | max_length=4, 143 | return_dict_in_generate=True, 144 | output_scores=True 145 | ) 146 | 147 | dec = [self.tokenizer.decode(ids, skip_special_tokens=True, 148 | clean_up_tokenization_spaces=False) 149 | for ids in outs.sequences] 150 | return {"preds": dec} 151 | 152 | def test_epoch_end(self, outputs): 153 | preds = np.concatenate( 154 | [x['preds'] for x in outputs], axis=0 155 | ) 156 | np.save('data/t5/preds', preds) 157 | 158 | def configure_optimizers(self): 159 | optimizer = torch.optim.Adam(self.parameters(), lr=1e-4) 160 | scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=self.trainer.max_epochs) 161 | return [optimizer], [scheduler] 162 | -------------------------------------------------------------------------------- /utils_nlp/models/nn/runner.py: -------------------------------------------------------------------------------- 1 | from catalyst.dl import Runner 2 | import torch 3 | 4 | 5 | class CustomRunner(Runner): 6 | def _handle_batch(self, batch): 7 | ids = batch['ids'] 8 | mask = batch['mask'] 9 | token_type_ids = batch['token_type_ids'] 10 | targets = batch['targets'] 11 | outputs = self.model(ids, mask, token_type_ids) 12 | loss = self.criterion(outputs, targets) 13 | self.batch_metrics = {'loss': loss} 14 | if self.is_train_loader: 15 | loss.backward() 16 | self.optimizer.step() 17 | self.optimizer.zero_grad() 18 | 19 | @torch.no_grad() 20 | def predict_batch(self, batch): 21 | batch = self._batch2device(batch, self.device) 22 | ids = batch['ids'] 23 | mask = batch['mask'] 24 | token_type_ids = batch['token_type_ids'] 25 | outputs = self.model(ids, mask, token_type_ids) 26 | return outputs 27 | -------------------------------------------------------------------------------- /utils_nlp/models/pretrained_embeddings/README.md: -------------------------------------------------------------------------------- 1 | # Pretrained Embeddings 2 | 3 | The pretrained embeddings submodule contains utility functions that help users quickly load and extract various types of pretrained embeddings such as fastText, GloVe, Word2Vec, etc. 4 | -------------------------------------------------------------------------------- /utils_nlp/models/pretrained_embeddings/word2vec.py: -------------------------------------------------------------------------------- 1 | import os 2 | import requests 3 | import zipfile 4 | 5 | import numpy as np 6 | from tqdm import tqdm 7 | import gensim 8 | 9 | 10 | def _maybe_download_and_extract(dest_path, file_name): 11 | """ Downloads and extracts Word2vec vectors if they don’t already exist 12 | Args: 13 | dest_path: Path to the directory where the vectors will be extracted. 14 | file_name: File name of the word2vec vector file. 15 | Returns: 16 | str: File path to the word2vec vector file. 
17 | """ 18 | 19 | dir_path = os.path.join(dest_path, "word2vec") 20 | file_path = os.path.join(dir_path, file_name) 21 | dl_path = os.path.join(file_path, '{}.zip'.format(file_name)) 22 | 23 | if not os.path.exists(file_path): 24 | os.makedirs(file_path, exist_ok=True) 25 | download_from_gdrive('0ByFQ96A4DgSPUm9wVWRLdm5qbmc', destination=dl_path) 26 | with zipfile.ZipFile(dl_path) as f: 27 | f.extractall(file_path) 28 | else: 29 | print("Vector file already exists. No changes made.") 30 | 31 | return file_path 32 | 33 | 34 | def load_pretrained_vectors( 35 | dir_path, file_name='vector_neologd', limit=None 36 | ): 37 | """ Method that loads word2vec vectors. Downloads if it doesn't exist. 38 | Args: 39 | file_name(str): Name of the word2vec file. 40 | dir_path(str): Path to the directory where word2vec vectors exist or will be 41 | downloaded. 42 | limit(int): Number of word vectors that is loaded from gensim. This option 43 | allows us to save RAM space and avoid memory errors. 44 | Returns: 45 | gensim.models.keyedvectors.Word2VecKeyedVectors: Loaded word2vectors 46 | """ 47 | file_path = _maybe_download_and_extract(dir_path, file_name) 48 | model_path = os.path.join(file_path, 'model.vec') 49 | word2vec_vectors = gensim.models.KeyedVectors.load_word2vec_format( 50 | model_path, binary=False, limit=limit 51 | ) 52 | 53 | return word2vec_vectors 54 | 55 | 56 | def download_from_gdrive(id, destination): 57 | """ 58 | Download file from Google Drive 59 | :param str id: g-drive id 60 | :param str destination: output path 61 | :return: 62 | """ 63 | url = "https://docs.google.com/uc?export=download" 64 | 65 | session = requests.Session() 66 | response = session.get(url, params={'id': id}, stream=True) 67 | token = get_confirm_token(response) 68 | if token: 69 | print("get download warning. set confirm token.") 70 | params = {'id': id, 'confirm': token} 71 | response = session.get(url, params=params, stream=True) 72 | save_response_content(response, destination) 73 | 74 | 75 | def get_confirm_token(response): 76 | """ 77 | verify whether warned or not. 78 | [note] In Google Drive Api, if requests content size is large, 79 | the user are send to verification page. 80 | :param requests.Response response: 81 | :return: 82 | """ 83 | for k, v in response.cookies.items(): 84 | if k.startswith("download_warning"): 85 | return v 86 | 87 | return None 88 | 89 | 90 | def save_response_content(response, destination): 91 | """ 92 | :param requests.Response response: 93 | :param str destination: 94 | :return: 95 | """ 96 | chunk_size = 1024 * 1024 97 | print("start downloading...") 98 | with open(destination, "wb") as f: 99 | for chunk in tqdm(response.iter_content(chunk_size), unit="MB"): 100 | f.write(chunk) 101 | print("Finish!!") 102 | print("Save to:{}".format(destination)) 103 | 104 | 105 | def convert_to_wv(w: str, word_vec): 106 | """Convert word to vectors 107 | 108 | Args: 109 | w (str): Word 110 | word_vec: Loaded word2vectors 111 | 112 | Returns: 113 | [type]: numpy vectors 114 | """ 115 | try: 116 | v = word_vec.word_vec(w) 117 | except KeyError: 118 | v = np.zeros(shape=(word_vec.vector_size,)) 119 | return v 120 | --------------------------------------------------------------------------------