├── .dockerignore ├── .flake8 ├── .gitignore ├── Dockerfile ├── README.md ├── data └── .gitkeep ├── docker-compose.yml ├── examples ├── README.md ├── basic │ ├── README.md │ ├── cleaning.py │ ├── normalization.py │ ├── ruby.py │ ├── sentence_segmantation.py │ └── stopwords.py ├── embeddings │ ├── README.md │ ├── download_embeddings.py │ ├── get_fasttext.py │ ├── get_use.py │ └── get_word2vec.py ├── feature_engineering │ ├── README.md │ ├── get_bm25.py │ ├── get_bow.py │ ├── get_scdv.py │ ├── get_swem.py │ └── get_tfidf.py ├── morphological_analysis │ ├── README.md │ ├── konoha_sample.py │ └── nagisa_sample.py ├── sentence_similarity │ ├── README.md │ └── tfidf_cosine_similarity.py ├── sentiment_analysis │ ├── README.md │ └── oseti_sentiment_analysis.py ├── text_classification │ ├── README.md │ ├── run_bert.py │ ├── run_t5.py │ ├── tfidf_lgbm.py │ └── tfidf_logistic_regression.py └── visualization │ ├── README.md │ ├── japanize.png │ ├── japanize_labels.py │ └── visualization.ipynb ├── requirements.txt ├── tests └── README.md └── utils_nlp ├── README.md ├── common └── data.py ├── dataset ├── README.md └── livedoor.py ├── eval └── classification.py ├── features ├── README.md ├── scdv.py └── swem.py └── models ├── README.md ├── nn ├── README.md ├── datasets.py ├── models.py └── runner.py └── pretrained_embeddings ├── README.md └── word2vec.py /.dockerignore: -------------------------------------------------------------------------------- 1 | data 2 | examples 3 | tests 4 | utils_nlp 5 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 160 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | data/livedoor 2 | data/word2vec 3 | data/fasttext 4 | *.gz 5 | *.csv 6 | *.pkl 7 | *.log 8 | *.model 9 | *.json 10 | *.bin 11 | *.npy 12 | wandb 13 | lightning_logs 14 | 15 | # Created by https://www.gitignore.io/api/macos,python 16 | # Edit at https://www.gitignore.io/?templates=macos,python 17 | 18 | ### macOS ### 19 | # General 20 | .DS_Store 21 | .AppleDouble 22 | .LSOverride 23 | 24 | # Icon must end with two \r 25 | Icon 26 | 27 | # Thumbnails 28 | ._* 29 | 30 | # Files that might appear in the root of a volume 31 | .DocumentRevisions-V100 32 | .fseventsd 33 | .Spotlight-V100 34 | .TemporaryItems 35 | .Trashes 36 | .VolumeIcon.icns 37 | .com.apple.timemachine.donotpresent 38 | 39 | # Directories potentially created on remote AFP share 40 | .AppleDB 41 | .AppleDesktop 42 | Network Trash Folder 43 | Temporary Items 44 | .apdisk 45 | 46 | ### Python ### 47 | # Byte-compiled / optimized / DLL files 48 | __pycache__/ 49 | *.py[cod] 50 | *$py.class 51 | 52 | # C extensions 53 | *.so 54 | 55 | # Distribution / packaging 56 | .Python 57 | build/ 58 | develop-eggs/ 59 | dist/ 60 | downloads/ 61 | eggs/ 62 | .eggs/ 63 | lib/ 64 | lib64/ 65 | parts/ 66 | sdist/ 67 | var/ 68 | wheels/ 69 | pip-wheel-metadata/ 70 | share/python-wheels/ 71 | *.egg-info/ 72 | .installed.cfg 73 | *.egg 74 | MANIFEST 75 | 76 | # PyInstaller 77 | # Usually these files are written by a python script from a template 78 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
79 | *.manifest 80 | *.spec 81 | 82 | # Installer logs 83 | pip-log.txt 84 | pip-delete-this-directory.txt 85 | 86 | # Unit test / coverage reports 87 | htmlcov/ 88 | .tox/ 89 | .nox/ 90 | .coverage 91 | .coverage.* 92 | .cache 93 | nosetests.xml 94 | coverage.xml 95 | *.cover 96 | .hypothesis/ 97 | .pytest_cache/ 98 | 99 | # Translations 100 | *.mo 101 | *.pot 102 | 103 | # Scrapy stuff: 104 | .scrapy 105 | 106 | # Sphinx documentation 107 | docs/_build/ 108 | 109 | # PyBuilder 110 | target/ 111 | 112 | # pyenv 113 | .python-version 114 | 115 | # pipenv 116 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 117 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 118 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 119 | # install all needed dependencies. 120 | #Pipfile.lock 121 | 122 | # celery beat schedule file 123 | celerybeat-schedule 124 | 125 | # SageMath parsed files 126 | *.sage.py 127 | 128 | # Spyder project settings 129 | .spyderproject 130 | .spyproject 131 | 132 | # Rope project settings 133 | .ropeproject 134 | 135 | # Mr Developer 136 | .mr.developer.cfg 137 | .project 138 | .pydevproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # End of https://www.gitignore.io/api/macos,python 152 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gcr.io/kaggle-images/python:v76 2 | 3 | ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 4 | COPY requirements.txt . 5 | 6 | # mecab 7 | RUN apt-get update -y && \ 8 | apt-get install -y mecab libmecab-dev mecab-ipadic-utf8 9 | 10 | RUN pip install -U pip && \ 11 | pip install -r requirements.txt 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NLP Recipes for Japanese 2 | 3 | This repository contains samples codes for natural language processing in Japanese. 4 | It's highly inspired by [microsoft/nlp-recipes](https://github.com/microsoft/nlp-recipes). 5 | 6 | ## Content 7 | 8 | The following is a summary of the commonly used NLP scenarios covered in the repository. Each scenario is demonstrated in one or more scripts or Jupyter notebook examples that make use of the core code base of models and repository utilities. 
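Most examples import the shared helpers under `utils_nlp` and load the livedoor news corpus through `utils_nlp.dataset.livedoor`, so scripts are run from the repository root (the `docker-compose.yml` in this repository sets `PYTHONPATH=/working`, and some scripts additionally call `sys.path.append('.')`). A minimal sketch of that shared pattern, assuming the corpus has already been prepared under `data/`:

```python
import sys

sys.path.append('.')  # run from the repository root

from utils_nlp.dataset.livedoor import load_pandas_df

# Load a small slice of the livedoor news corpus as a pandas DataFrame.
df = load_pandas_df(nrows=10)
print(df.head())
```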
9 | 10 | |Category|Methods| 11 | |---| --- | 12 | |[Basic](./examples/basic)|Cleaning, Normalization, Stopwords, Sentence Segmantation, Ruby| 13 | |[Embeddings](./examples/embeddings)|Word2Vec, fastText, Universal Sentence Encoder| 14 | |[Feature Engineering](./examples/feature_engineering)|Bag-of-Words, TF-IDF, BM25, SWEM, SCDV| 15 | |[Morphological Analysis](./examples/morphological_analysis)|Konoha, nagisa| 16 | |[Sentence Similarity](./examples/sentence_similarity)|Cosine Similarity| 17 | |[Sentiment Analysis](sentiment_analysis)|oseti| 18 | |[Text Classification](./examples/text_classification)|TF-IDF & Logistic Regression, TF-IDF & LightGBM, BERT, T5| 19 | |[Visualization](./examples/visualization)|Visualization with Japanese texts| 20 | 21 | ## Environment 22 | 23 | ```bash 24 | docker-compose up -d --build 25 | docker exec -it nlp-recipes-ja bash 26 | ``` 27 | -------------------------------------------------------------------------------- /data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/upura/nlp-recipes-ja/8ac5e898864137841de8b03c11da34815009af24/data/.gitkeep -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | test: 4 | build: . 5 | volumes: 6 | - $PWD:/working 7 | container_name: nlp-recipes-ja 8 | working_dir: /working 9 | ports: 10 | - 8888:8888 11 | environment: 12 | - PYTHONPATH=/working 13 | tty: true 14 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | This folder contains examples, for building Natural Language Processing systems for the following scenarios. 4 | 5 | |Category|Methods| 6 | |---| --- | 7 | |[Basic](basic)|Cleaning, Normalization, Stopwords, Sentence Segmantation, Ruby| 8 | |[Embeddings](embeddings)|Word2Vec, fastText, Universal Sentence Encoder| 9 | |[Feature Engineering](feature_engineering)|Bag-of-Words, TF-IDF, BM25, SWEM, SCDV| 10 | |[Morphological Analysis](morphological_analysis)|Konoha, nagisa| 11 | |[Sentence Similarity](sentence_similarity)|Cosine Similarity| 12 | |[Sentiment Analysis](sentiment_analysis)|oseti| 13 | |[Text Classification](text_classification)|Logistic Regression, LightGBM, BERT| 14 | |[Visualization](visualization)|Visualization with Japanese texts| 15 | -------------------------------------------------------------------------------- /examples/basic/README.md: -------------------------------------------------------------------------------- 1 | # Basic 2 | 3 | This folder contains examples for basic tasks of natural language processing. 
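The scripts below each demonstrate a single step in isolation; in practice the steps are usually chained. A minimal, self-contained sketch of such a chain on one string (the URL-stripping rule and the tiny stopword set are illustrative stand-ins for the fuller logic in `cleaning.py` and `stopwords.py`):

```python
import re

import neologdn
from konoha import WordTokenizer

text = '詳細はこちら https://example.com をチェック!!'

# Cleaning: drop URLs (cleaning.py has a fuller set of rules).
text = re.sub(r'https?://\S+', '', text)
# Normalization: unify character widths, collapse repetitions, etc.
text = neologdn.normalize(text)
# Tokenization with MeCab via konoha, then toy stopword removal.
tokens = [str(t) for t in WordTokenizer('MeCab').tokenize(text)]
stopwords = {'は', 'を', '、', '。', '!'}
tokens = [t for t in tokens if t not in stopwords]
print(tokens)
```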
4 | 5 | ## Summary 6 | 7 | |Notebook|Environment|Description| 8 | |---|---|---| 9 | |[Cleaning](cleaning.py)|Local| Text cleaning | 10 | |[Normalization](normalization.py)|Local| Text normalization by [neologdn](https://github.com/ikegami-yukino/neologdn) | 11 | |[Stopwords](stopwords.py)|Local| Stopwords by frequency and [dictonary](http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt) | 12 | |[Sentence Segmantation](normalization.py)|Local| Sentence segmantation by [ja_sentence_segmenter](https://github.com/wwwcojp/ja_sentence_segmenter) | 13 | |[Convert Japanese into Roman](ruby.py)|Local| Convert Japanese into Roman by [pykakasi](https://github.com/miurahr/pykakasi) | 14 | -------------------------------------------------------------------------------- /examples/basic/cleaning.py: -------------------------------------------------------------------------------- 1 | # Implemantation from https://github.com/Hironsan/natural-language-preprocessings/blob/master/preprocessings/ja/cleaning.py 2 | import re 3 | 4 | from bs4 import BeautifulSoup 5 | 6 | from utils_nlp.dataset.livedoor import load_pandas_df 7 | 8 | 9 | def clean_text(text): 10 | # replaced_text = '\n'.join(s.strip() for s in text.splitlines()[2:] if s != '') # skip header by [2:] 11 | replaced_text = text.lower() 12 | replaced_text = re.sub(r'[【】]', ' ', replaced_text) # 【】の除去 13 | replaced_text = re.sub(r'[()()]', ' ', replaced_text) # ()の除去 14 | replaced_text = re.sub(r'[[]\[\]]', ' ', replaced_text) # []の除去 15 | replaced_text = re.sub(r'[@@]\w+', '', replaced_text) # メンションの除去 16 | replaced_text = re.sub(r'https?:\/\/.*?[\r\n ]', '', replaced_text) # URLの除去 17 | replaced_text = re.sub(r' ', ' ', replaced_text) # 全角空白の除去 18 | return replaced_text 19 | 20 | 21 | def clean_html_tags(html_text): 22 | soup = BeautifulSoup(html_text, 'html.parser') 23 | cleaned_text = soup.get_text() 24 | cleaned_text = ''.join(cleaned_text.splitlines()) 25 | return cleaned_text 26 | 27 | 28 | def clean_html_and_js_tags(html_text): 29 | soup = BeautifulSoup(html_text, 'html.parser') 30 | [x.extract() for x in soup.findAll(['script', 'style'])] 31 | cleaned_text = soup.get_text() 32 | cleaned_text = ''.join(cleaned_text.splitlines()) 33 | return cleaned_text 34 | 35 | 36 | def clean_url(html_text): 37 | """ 38 | S+ matches all non-whitespace characters (the end of the url) 39 | :param html_text: 40 | :return: 41 | """ 42 | clean_text = re.sub(r'http\S+', '', html_text) 43 | return clean_text 44 | 45 | 46 | def clean_code(html_text): 47 | """Qiitaのコードを取り除きます 48 | :param html_text: 49 | :return: 50 | """ 51 | soup = BeautifulSoup(html_text, 'html.parser') 52 | [x.extract() for x in soup.findAll(class_="code-frame")] 53 | cleaned_text = soup.get_text() 54 | cleaned_text = ''.join(cleaned_text.splitlines()) 55 | return cleaned_text 56 | 57 | 58 | if __name__ == '__main__': 59 | df = load_pandas_df(nrows=10) 60 | df['text'] = df['text'].map(clean_text) 61 | print(df.head()) 62 | -------------------------------------------------------------------------------- /examples/basic/normalization.py: -------------------------------------------------------------------------------- 1 | import neologdn 2 | 3 | from utils_nlp.dataset.livedoor import load_pandas_df 4 | 5 | 6 | if __name__ == '__main__': 7 | df = load_pandas_df(nrows=10) 8 | df['text'] = df['text'].apply(neologdn.normalize) 9 | print(df.head()) 10 | -------------------------------------------------------------------------------- 
/examples/basic/ruby.py: -------------------------------------------------------------------------------- 1 | import pykakasi 2 | 3 | from utils_nlp.dataset.livedoor import load_pandas_df 4 | 5 | 6 | if __name__ == '__main__': 7 | df = load_pandas_df(nrows=10) 8 | text = df['text'][0][:30] 9 | print(text) 10 | 11 | kakasi = pykakasi.kakasi() 12 | kakasi.setMode("H", "a") # Hiragana to ascii, default: no conversion 13 | kakasi.setMode("K", "a") # Katakana to ascii, default: no conversion 14 | kakasi.setMode("J", "a") # Japanese to ascii, default: no conversion 15 | kakasi.setMode("r", "Hepburn") # default: use Hepburn Roman table 16 | kakasi.setMode("s", True) # add space, default: no separator 17 | kakasi.setMode("C", True) # capitalize, default: no capitalize 18 | conv = kakasi.getConverter() 19 | result = conv.do(text) 20 | print(result) 21 | -------------------------------------------------------------------------------- /examples/basic/sentence_segmantation.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | from ja_sentence_segmenter.common.pipeline import make_pipeline 4 | from ja_sentence_segmenter.concatenate.simple_concatenator import concatenate_matching 5 | from ja_sentence_segmenter.normalize.neologd_normalizer import normalize 6 | from ja_sentence_segmenter.split.simple_splitter import split_newline, split_punctuation 7 | 8 | from utils_nlp.dataset.livedoor import load_pandas_df 9 | 10 | 11 | if __name__ == '__main__': 12 | df = load_pandas_df(nrows=10) 13 | 14 | split_punc2 = functools.partial(split_punctuation, punctuations=r"。!?") 15 | concat_tail_no = functools.partial(concatenate_matching, former_matching_rule=r"^(?P.+)(の)$", remove_former_matched=False) 16 | segmenter = make_pipeline(normalize, split_newline, concat_tail_no, split_punc2) 17 | 18 | df['sentences'] = df['text'].apply(lambda x: list(segmenter(x))) 19 | print(df['sentences'][0]) 20 | -------------------------------------------------------------------------------- /examples/basic/stopwords.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | import itertools 3 | from konoha import WordTokenizer 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from utils_nlp.dataset.livedoor import load_pandas_df 8 | 9 | 10 | def remove_stopwords(words, stopwords): 11 | words = [word for word in words if word not in stopwords] 12 | return words 13 | 14 | 15 | def get_stop_words_by_freq(docs, n=100): 16 | docs = list(itertools.chain(*list(docs))) 17 | fdist = Counter(docs) 18 | stopwords = [word for word, freq in fdist.most_common(n)] 19 | return stopwords 20 | 21 | 22 | def get_stop_words_by_dict(): 23 | stopwords = pd.read_table('http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt', header=None) 24 | stopwords = list(stopwords[0].values) 25 | return stopwords 26 | 27 | 28 | if __name__ == '__main__': 29 | df = load_pandas_df(nrows=100) 30 | tokenizer = WordTokenizer('MeCab') 31 | docs = np.array([ 32 | map(str, tokenizer.tokenize(text)) for text in df['text'] 33 | ]) 34 | stopwords_f = get_stop_words_by_freq(docs, n=100) 35 | stopwords_d = get_stop_words_by_dict() 36 | stopwords = set(stopwords_f) | set(stopwords_d) 37 | print(stopwords) 38 | docs = remove_stopwords(docs, stopwords) 39 | -------------------------------------------------------------------------------- /examples/embeddings/README.md: 
-------------------------------------------------------------------------------- 1 | # Embeddings 2 | 3 | This folder contains examples for getting pretrained embedding vectors. 4 | 5 | ## What is Word Embedding? 6 | 7 | >Word embedding is a technique to map words or phrases from a vocabulary to vectors or real numbers. 8 | >The learned vector representations of words capture syntactic and semantic word relationships and therefore can be very useful for tasks like sentence similary, text classifcation, etc. 9 | 10 | https://github.com/microsoft/nlp-recipes/blob/master/examples/embeddings/README.md 11 | 12 | ## Japanese pretrained models 13 | 14 | There is a survey article titled "[学習済み日本語word2vecとその評価について](https://blog.hoxo-m.com/entry/2020/02/20/090000)". This article introduces many Japanese pretrained embedding models avaliable and evaluate them. 15 | 16 | ## Summary 17 | 18 | |Notebook|Environment|Description| 19 | |---|---|---| 20 | |[Word2vec](get_word2vec.py)|Local| Get [word2vec vectors pretrained by Japanese Wikipedia](https://qiita.com/Hironsan/items/513b9f93752ecee9e670) | 21 | |[fastText](get_fasttext.py)|Local| Get [fastText vectors pretrained by Japanese Common Crawl](https://fasttext.cc/docs/en/crawl-vectors.html) | 22 | |[Download Pre-trained Embeddings](download_embeddings.py)|Local| Download pre-trained embeddings by [chakin](https://github.com/chakki-works/chakin) | 23 | |[Universal Sentence Encoder](get_use.py)|Local| Get [Universal Sentence Encoder](https://tfhub.dev/google/universal-sentence-encoder-multilingual/3) | 24 | -------------------------------------------------------------------------------- /examples/embeddings/download_embeddings.py: -------------------------------------------------------------------------------- 1 | import chakin 2 | 3 | 4 | if __name__ == '__main__': 5 | chakin.search(lang='Japanese') 6 | """ 7 | Name Dimension Corpus VocabularySize Method Language Author 8 | 6 fastText(ja) 300 Wikipedia 580K fastText Japanese Facebook 9 | 22 word2vec.Wiki-NEologd.50d 50 Wikipedia 335K word2vec + NEologd Japanese Shiroyagi Corporation 10 | """ 11 | chakin.download(number=22, save_dir='./') 12 | -------------------------------------------------------------------------------- /examples/embeddings/get_fasttext.py: -------------------------------------------------------------------------------- 1 | import gensim 2 | import nagisa 3 | 4 | from utils_nlp.dataset.livedoor import load_pandas_df 5 | 6 | 7 | if __name__ == '__main__': 8 | df = load_pandas_df(nrows=10) 9 | text = df['text'][0][:30] 10 | print(text) 11 | # 友人代表のスピーチ、独女はどうこなしている?もうすぐジューン 12 | 13 | tagger = nagisa.Tagger() 14 | nouns = tagger.extract(text, extract_postags=['名詞']).words 15 | print(nouns) 16 | # ['友人', '代表', 'スピーチ', '独女', 'ジューン'] 17 | 18 | model_w = gensim.models.KeyedVectors.load_word2vec_format('./data/fasttext/cc.ja.300.vec.gz', binary=False) 19 | for noun in nouns: 20 | try: 21 | print(noun, model_w[noun].shape) 22 | except KeyError: 23 | print(noun, 'Out of vocabulary') 24 | """ 25 | 友人 (300,) 26 | 代表 (300,) 27 | スピーチ (300,) 28 | 独女 Out of vocabulary 29 | ジューン (300,) 30 | """ 31 | 32 | model_f = gensim.models.fasttext.load_facebook_model('./data/fasttext/cc.ja.300.bin') 33 | for noun in nouns: 34 | print(noun, noun in model_f.wv.vocab) 35 | print(noun, model_f[noun].shape) 36 | -------------------------------------------------------------------------------- /examples/embeddings/get_use.py: -------------------------------------------------------------------------------- 1 | import 
tensorflow_hub as hub 2 | import tensorflow_text # noqa 3 | 4 | from utils_nlp.dataset.livedoor import load_pandas_df 5 | 6 | 7 | if __name__ == '__main__': 8 | df = load_pandas_df(nrows=10) 9 | text = df['text'][0][:30] 10 | print(text) 11 | # 友人代表のスピーチ、独女はどうこなしている?もうすぐジューン 12 | 13 | embed = hub.load('https://tfhub.dev/google/universal-sentence-encoder-multilingual/3') 14 | vectors = embed([text]) 15 | print(vectors[0].shape) 16 | print(vectors[0]) 17 | """ 18 | (512,) 19 | tf.Tensor( 20 | [-4.53309491e-02 -5.73447756e-02 3.41094285e-02 1.09533397e-02 21 | -2.55712979e-02 -8.29478130e-02 3.02479346e-03 8.89975950e-02], shape=(512,), dtype=float32 22 | """ 23 | -------------------------------------------------------------------------------- /examples/embeddings/get_word2vec.py: -------------------------------------------------------------------------------- 1 | import nagisa 2 | 3 | from utils_nlp.dataset.livedoor import load_pandas_df 4 | from utils_nlp.models.pretrained_embeddings.word2vec import load_pretrained_vectors, convert_to_wv 5 | 6 | 7 | if __name__ == '__main__': 8 | df = load_pandas_df(nrows=10) 9 | text = df['text'][0][:30] 10 | print(text) 11 | # 友人代表のスピーチ、独女はどうこなしている?もうすぐジューン 12 | 13 | tagger = nagisa.Tagger() 14 | nouns = tagger.extract(text, extract_postags=['名詞']).words 15 | print(nouns) 16 | # ['友人', '代表', 'スピーチ', '独女', 'ジューン'] 17 | 18 | word_vec = load_pretrained_vectors('data') 19 | vectors = convert_to_wv(nouns[0], word_vec) 20 | print(vectors.shape) 21 | # (300,) 22 | print(vectors[:5]) 23 | # [ 1.0028e-01 1.0647e-02 -1.7439e-01 -2.7110e-03 2.1647e-01] 24 | -------------------------------------------------------------------------------- /examples/feature_engineering/README.md: -------------------------------------------------------------------------------- 1 | # Feature Engineering 2 | 3 | This folder contains examples for feature engineering of texts. 4 | 5 | ## Summary 6 | 7 | |Notebook|Environment|Description| 8 | |---|---|---| 9 | |[Bag-of-Words](get_bow.py)|Local| [Bag-of-Words](https://en.wikipedia.org/wiki/Bag-of-words_model) | 10 | |[TF-IDF](get_tfidf.py)|Local| [TF-IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf) | 11 | |[BM25](get_bm25.py)|Local| [BM25](https://en.wikipedia.org/wiki/Okapi_BM25) | 12 | |[SWEM](get_swem.py)|Local| [SWEM](https://arxiv.org/abs/1805.09843) | 13 | |[SCDV](get_scdv.py)|Local| [SCDV](https://arxiv.org/abs/1612.06778) | 14 | -------------------------------------------------------------------------------- /examples/feature_engineering/get_bm25.py: -------------------------------------------------------------------------------- 1 | from konoha import WordTokenizer 2 | import neologdn 3 | import numpy as np 4 | import scipy as sp 5 | from sklearn.base import BaseEstimator, TransformerMixin 6 | from sklearn.feature_extraction.text import CountVectorizer, _document_frequency 7 | from sklearn.utils.validation import check_is_fitted 8 | 9 | from utils_nlp.dataset.livedoor import load_pandas_df 10 | 11 | 12 | class BM25Transformer(BaseEstimator, TransformerMixin): 13 | 14 | def __init__(self, use_idf=True, k1=2.0, b=0.75): 15 | """Okapi BM25: a non-binary model - Introduction to Information Retrieval 16 | http://nlp.stanford.edu/IR-book/html/htmledition/okapi-bm25-a-non-binary-model-1.html. 17 | Implementation from https://github.com/arosh/BM25Transformer. 18 | 19 | Args: 20 | use_idf (bool, optional): [description]. Defaults to True. 21 | k1 (float, optional): [description]. Defaults to 2.0. 22 | b (float, optional): [description]. 
Defaults to 0.75. 23 | """ 24 | self.use_idf = use_idf 25 | self.k1 = k1 26 | self.b = b 27 | 28 | def fit(self, X): 29 | """ 30 | Parameters 31 | ---------- 32 | X : sparse matrix, [n_samples, n_features] document-term matrix 33 | """ 34 | if not sp.sparse.issparse(X): 35 | X = sp.sparse.csc_matrix(X) 36 | if self.use_idf: 37 | n_samples, n_features = X.shape 38 | df = _document_frequency(X) 39 | idf = np.log((n_samples - df + 0.5) / (df + 0.5)) 40 | self._idf_diag = sp.sparse.spdiags(idf, diags=0, m=n_features, n=n_features) 41 | 42 | doc_len = X.sum(axis=1) 43 | self._average_document_len = np.average(doc_len) 44 | 45 | return self 46 | 47 | def transform(self, X, copy=True): 48 | """ 49 | Parameters 50 | ---------- 51 | X : sparse matrix, [n_samples, n_features] document-term matrix 52 | copy : boolean, optional (default=True) 53 | """ 54 | if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.floating): 55 | # preserve float family dtype 56 | X = sp.sparse.csr_matrix(X, copy=copy) 57 | else: 58 | # convert counts or binary occurrences to floats 59 | X = sp.sparse.csr_matrix(X, dtype=np.float, copy=copy) 60 | 61 | n_samples, n_features = X.shape 62 | 63 | # Document length (number of terms) in each row 64 | # Shape is (n_samples, 1) 65 | doc_len = X.sum(axis=1) 66 | # Number of non-zero elements in each row 67 | # Shape is (n_samples, ) 68 | sz = X.indptr[1:] - X.indptr[0:-1] 69 | 70 | # In each row, repeat `doc_len` for `sz` times 71 | # Shape is (sum(sz), ) 72 | # Example 73 | # ------- 74 | # dl = [4, 5, 6] 75 | # sz = [1, 2, 3] 76 | # rep = [4, 5, 5, 6, 6, 6] 77 | rep = np.repeat(np.asarray(doc_len), sz) 78 | 79 | # Compute BM25 score only for non-zero elements 80 | nom = self.k1 + 1 81 | denom = X.data + self.k1 * (1 - self.b + self.b * rep / self._average_document_len) 82 | data = X.data * nom / denom 83 | 84 | X = sp.sparse.csr_matrix((data, X.indices, X.indptr), shape=X.shape) 85 | 86 | if self.use_idf: 87 | check_is_fitted(self, '_idf_diag', 'idf vector is not fitted') 88 | 89 | expected_n_features = self._idf_diag.shape[0] 90 | if n_features != expected_n_features: 91 | raise ValueError("Input has n_features=%d while the model" 92 | " has been trained with n_features=%d" % ( 93 | n_features, expected_n_features)) 94 | X = X * self._idf_diag 95 | 96 | return X 97 | 98 | 99 | if __name__ == '__main__': 100 | df = load_pandas_df(nrows=10) 101 | 102 | # Normalization 103 | df['text'] = df['text'].apply(neologdn.normalize) 104 | 105 | tokenizer = WordTokenizer('MeCab') 106 | docs = np.array([ 107 | ' '.join(map(str, tokenizer.tokenize(text))) for text in df['text'] 108 | ]) 109 | print(docs.shape) 110 | # (10,) 111 | 112 | count_vec = CountVectorizer(min_df=2, 113 | max_features=20000, 114 | ngram_range=(1, 3)) 115 | bags = count_vec.fit_transform(docs) 116 | 117 | print(bags.toarray().shape) 118 | print(bags.toarray()) 119 | """ 120 | (10, 445) 121 | [[1 0 1 ... 0 0 0] 122 | [1 0 0 ... 0 0 0] 123 | [1 0 0 ... 1 0 0] 124 | ... 125 | [0 0 1 ... 0 0 0] 126 | [0 0 0 ... 0 0 0] 127 | [0 0 0 ... 0 0 0]] 128 | """ 129 | 130 | bm25 = BM25Transformer(use_idf=True, k1=2.0, b=0.75) 131 | bm25 = bm25.fit_transform(bags) 132 | print(bm25.toarray().shape) 133 | print(bm25.toarray()) 134 | """ 135 | (10, 445) 136 | [[0.75499451 1.21230177 0. ... 0. 0. 0. ] 137 | [0.77036179 0. 0. ... 0. 0. 0. ] 138 | [0.83310313 0. 0.40196374 ... 0. 0. 1.3377215 ] 139 | ... 140 | [0. 1.02087499 0. ... 1.02087499 1.02087499 0. ] 141 | [0. 0. 0.43242275 ... 0. 0. 0. ] 142 | [0. 0. 0. ... 0. 0. 0. 
]] 143 | """ 144 | -------------------------------------------------------------------------------- /examples/feature_engineering/get_bow.py: -------------------------------------------------------------------------------- 1 | from konoha import WordTokenizer 2 | import neologdn 3 | import numpy as np 4 | from sklearn.feature_extraction.text import CountVectorizer 5 | 6 | from utils_nlp.dataset.livedoor import load_pandas_df 7 | 8 | 9 | if __name__ == '__main__': 10 | df = load_pandas_df(nrows=10) 11 | 12 | # Normalization 13 | df['text'] = df['text'].apply(neologdn.normalize) 14 | 15 | tokenizer = WordTokenizer('MeCab') 16 | docs = np.array([ 17 | ' '.join(map(str, tokenizer.tokenize(text))) for text in df['text'] 18 | ]) 19 | print(docs.shape) 20 | # (10,) 21 | 22 | count_vec = CountVectorizer(min_df=2, 23 | max_features=20000, 24 | ngram_range=(1, 3)) 25 | bags = count_vec.fit_transform(docs) 26 | 27 | print(bags.toarray().shape) 28 | # (10, 445) 29 | print(bags.toarray()) 30 | """ 31 | [[1 0 1 ... 0 0 0] 32 | [1 0 0 ... 0 0 0] 33 | [1 0 0 ... 1 0 0] 34 | ... 35 | [0 0 1 ... 0 0 0] 36 | [0 0 0 ... 0 0 0] 37 | [0 0 0 ... 0 0 0]] 38 | """ 39 | -------------------------------------------------------------------------------- /examples/feature_engineering/get_scdv.py: -------------------------------------------------------------------------------- 1 | from konoha import WordTokenizer 2 | import neologdn 3 | import numpy as np 4 | 5 | from utils_nlp.dataset.livedoor import load_pandas_df 6 | from utils_nlp.features import scdv 7 | from utils_nlp.models.pretrained_embeddings.word2vec import load_pretrained_vectors 8 | 9 | 10 | if __name__ == '__main__': 11 | df = load_pandas_df(nrows=10) 12 | 13 | # Normalization 14 | df['text'] = df['text'].apply(neologdn.normalize) 15 | 16 | tokenizer = WordTokenizer('MeCab') 17 | docs = np.array([ 18 | map(str, tokenizer.tokenize(text)) for text in df['text'] 19 | ]) 20 | print(docs.shape) 21 | # (10,) 22 | 23 | word_vec = load_pretrained_vectors('data') 24 | scdv = scdv.create(docs, word_vec, n_components=10) 25 | print(scdv.shape) 26 | # (10, 3000) 27 | -------------------------------------------------------------------------------- /examples/feature_engineering/get_swem.py: -------------------------------------------------------------------------------- 1 | from konoha import WordTokenizer 2 | import neologdn 3 | import numpy as np 4 | 5 | from utils_nlp.dataset.livedoor import load_pandas_df 6 | from utils_nlp.features import swem 7 | from utils_nlp.models.pretrained_embeddings.word2vec import load_pretrained_vectors 8 | 9 | 10 | if __name__ == '__main__': 11 | df = load_pandas_df(nrows=10) 12 | 13 | # Normalization 14 | df['text'] = df['text'].apply(neologdn.normalize) 15 | 16 | tokenizer = WordTokenizer('MeCab') 17 | docs = np.array([ 18 | map(str, tokenizer.tokenize(text)) for text in df['text'] 19 | ]) 20 | print(docs.shape) 21 | # (10,) 22 | 23 | word_vec = load_pretrained_vectors('data') 24 | swem_max = swem.create(docs, word_vec, aggregation='max') 25 | swem_mean = swem.create(docs, word_vec, aggregation='mean') 26 | print(swem_max.shape) 27 | # (10, 300) 28 | -------------------------------------------------------------------------------- /examples/feature_engineering/get_tfidf.py: -------------------------------------------------------------------------------- 1 | from konoha import WordTokenizer 2 | import neologdn 3 | import numpy as np 4 | from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer 5 | 6 | from 
utils_nlp.dataset.livedoor import load_pandas_df 7 | 8 | 9 | if __name__ == '__main__': 10 | df = load_pandas_df(nrows=10) 11 | 12 | # Normalization 13 | df['text'] = df['text'].apply(neologdn.normalize) 14 | 15 | tokenizer = WordTokenizer('MeCab') 16 | docs = np.array([ 17 | ' '.join(map(str, tokenizer.tokenize(text))) for text in df['text'] 18 | ]) 19 | print(docs.shape) 20 | # (10,) 21 | 22 | count_vec = CountVectorizer(min_df=2, 23 | max_features=20000, 24 | ngram_range=(1, 3)) 25 | bags = count_vec.fit_transform(docs) 26 | 27 | print(bags.toarray().shape) 28 | print(bags.toarray()) 29 | """ 30 | (10, 445) 31 | [[1 0 1 ... 0 0 0] 32 | [1 0 0 ... 0 0 0] 33 | [1 0 0 ... 1 0 0] 34 | ... 35 | [0 0 1 ... 0 0 0] 36 | [0 0 0 ... 0 0 0] 37 | [0 0 0 ... 0 0 0]] 38 | """ 39 | 40 | tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True) 41 | tf_idf = tfidf.fit_transform(bags) 42 | print(tf_idf.toarray().shape) 43 | print(tf_idf.toarray()) 44 | """ 45 | (10, 445) 46 | [[0.04752833 0.05432543 0. ... 0. 0. 0. ] 47 | [0.0484923 0. 0. ... 0. 0. 0. ] 48 | [0.04909543 0. 0.04364936 ... 0. 0. 0.05611665] 49 | ... 50 | [0. 0.03772958 0. ... 0.03772958 0.03772958 0. ] 51 | [0. 0. 0.03994261 ... 0. 0. 0. ] 52 | [0. 0. 0. ... 0. 0. 0. ]] 53 | """ 54 | -------------------------------------------------------------------------------- /examples/morphological_analysis/README.md: -------------------------------------------------------------------------------- 1 | # Morphological Analysis 2 | 3 | This folder contains examples for morphological analysis. 4 | 5 | Konoha is a Python library for providing integrated interface of various Japanese tokenziers, which enables us to switch tokenizers. We can use MeCab, KyTea, Janome, Sudachi, Sentencepiece, and nagisa. Konoha doesn't support the function of filtering some nouns. We can use pure nagisa for this purpose. 6 | 7 | ## Summary 8 | 9 | |Notebook|Environment|Description| 10 | |---|---|---| 11 | |[Konoha sample](konoha_sample.py)|Local| Morphological analysis by [Konoha](https://github.com/himkt/konoha) | 12 | |[nagisa sample](nagisa_sample.py)|Local| Morphological analysis by [nagisa](https://github.com/taishi-i/nagisa) | 13 | -------------------------------------------------------------------------------- /examples/morphological_analysis/konoha_sample.py: -------------------------------------------------------------------------------- 1 | from konoha import WordTokenizer 2 | 3 | from utils_nlp.dataset.livedoor import load_pandas_df 4 | 5 | 6 | if __name__ == '__main__': 7 | df = load_pandas_df(nrows=10) 8 | text = df['text'][0][:30] 9 | print(text) 10 | # 友人代表のスピーチ、独女はどうこなしている?もうすぐジューン 11 | 12 | tokenizer_m = WordTokenizer('MeCab') 13 | print(tokenizer_m.tokenize(text)) 14 | # [友人, 代表, の, スピーチ, 、, 独, 女, は, どう, こなし, て, いる, ?, もうすぐ, ジューン] 15 | 16 | tokenizer_s = WordTokenizer('Sudachi', mode='A', with_postag=True) 17 | print(tokenizer_s.tokenize(text)) 18 | # [友人 (名詞), 代表 (名詞), の (助詞), スピーチ (名詞), 、 (補助記号), 独女 (名詞), は (助詞), どう (副詞), こなし (動詞), て (助詞), いる (動詞), ? 
(補助記号), もう (副詞), すぐ (副詞), ジューン (名詞)] 19 | 20 | df['sep_text'] = [tokenizer_m.tokenize(text) for text in df['text']] 21 | print(df.head()) 22 | -------------------------------------------------------------------------------- /examples/morphological_analysis/nagisa_sample.py: -------------------------------------------------------------------------------- 1 | import nagisa 2 | 3 | from utils_nlp.dataset.livedoor import load_pandas_df 4 | 5 | 6 | if __name__ == '__main__': 7 | df = load_pandas_df(nrows=10) 8 | text = df['text'][0][:30] 9 | print(text) 10 | # 友人代表のスピーチ、独女はどうこなしている?もうすぐジューン 11 | 12 | tagger = nagisa.Tagger() 13 | print(tagger.extract(text, extract_postags=['名詞'])) 14 | # 友人/名詞 代表/名詞 スピーチ/名詞 独女/名詞 ジューン/名詞 15 | 16 | df['sep_text'] = [tagger.extract(text, extract_postags=['名詞']).words for text in df['text']] 17 | print(df.head()) 18 | -------------------------------------------------------------------------------- /examples/sentence_similarity/README.md: -------------------------------------------------------------------------------- 1 | # Sentence Similarity 2 | 3 | This folder contains examples for calculating sentence similarities. 4 | 5 | ## Summary 6 | 7 | |Notebook|Environment|Description| 8 | |---|---|---| 9 | |[TF-IDF & Cosine Similarity](tfidf_cosine_similarity.py)|Local| Calculate sentence similarities from TF-IDF vectors | 10 | -------------------------------------------------------------------------------- /examples/sentence_similarity/tfidf_cosine_similarity.py: -------------------------------------------------------------------------------- 1 | from konoha import WordTokenizer 2 | import neologdn 3 | import numpy as np 4 | from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer 5 | from sklearn.metrics.pairwise import cosine_similarity 6 | 7 | from utils_nlp.dataset.livedoor import load_pandas_df 8 | 9 | 10 | if __name__ == '__main__': 11 | df = load_pandas_df(nrows=10) 12 | 13 | # Normalization 14 | df['text'] = df['text'].apply(neologdn.normalize) 15 | 16 | tokenizer = WordTokenizer('MeCab') 17 | docs = np.array([ 18 | ' '.join(map(str, tokenizer.tokenize(text))) for text in df['text'] 19 | ]) 20 | print(docs.shape) 21 | # (10,) 22 | 23 | count_vec = CountVectorizer(min_df=2, 24 | max_features=20000, 25 | ngram_range=(1, 3)) 26 | bags = count_vec.fit_transform(docs) 27 | 28 | print(bags.toarray().shape) 29 | print(bags.toarray()) 30 | """ 31 | (10, 445) 32 | [[1 0 1 ... 0 0 0] 33 | [1 0 0 ... 0 0 0] 34 | [1 0 0 ... 1 0 0] 35 | ... 36 | [0 0 1 ... 0 0 0] 37 | [0 0 0 ... 0 0 0] 38 | [0 0 0 ... 0 0 0]] 39 | """ 40 | 41 | tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True) 42 | tf_idf = tfidf.fit_transform(bags) 43 | print(tf_idf.toarray().shape) 44 | print(tf_idf.toarray()) 45 | """ 46 | (10, 445) 47 | [[0.04752833 0.05432543 0. ... 0. 0. 0. ] 48 | [0.0484923 0. 0. ... 0. 0. 0. ] 49 | [0.04909543 0. 0.04364936 ... 0. 0. 0.05611665] 50 | ... 51 | [0. 0.03772958 0. ... 0.03772958 0.03772958 0. ] 52 | [0. 0. 0.03994261 ... 0. 0. 0. ] 53 | [0. 0. 0. ... 0. 0. 0. ]] 54 | """ 55 | 56 | print(cosine_similarity(tf_idf.toarray())) 57 | """ 58 | [[1. 0.31294546 0.22234506 0.27272853 0.22658861 0.37452715 59 | 0.35456225 0.29524085 0.17193537 0.36229732] 60 | [0.31294546 1. 0.25102573 0.25264431 0.24334397 0.33785512 61 | 0.31670052 0.28218417 0.12684395 0.32628839] 62 | [0.22234506 0.25102573 1. 0.24099022 0.17307931 0.31050187 63 | 0.32489792 0.28119098 0.15070305 0.38326419] 64 | [0.27272853 0.25264431 0.24099022 1. 
0.23456837 0.32640547 65 | 0.27615115 0.3153026 0.26716363 0.31163735] 66 | [0.22658861 0.24334397 0.17307931 0.23456837 1. 0.41007705 67 | 0.24911698 0.36058785 0.11835559 0.2387821 ] 68 | [0.37452715 0.33785512 0.31050187 0.32640547 0.41007705 1. 69 | 0.45739635 0.32316926 0.2059866 0.31257367] 70 | [0.35456225 0.31670052 0.32489792 0.27615115 0.24911698 0.45739635 71 | 1. 0.39132051 0.24839521 0.3321967 ] 72 | [0.29524085 0.28218417 0.28119098 0.3153026 0.36058785 0.32316926 73 | 0.39132051 1. 0.15238316 0.30832032] 74 | [0.17193537 0.12684395 0.15070305 0.26716363 0.11835559 0.2059866 75 | 0.24839521 0.15238316 1. 0.24724469] 76 | [0.36229732 0.32628839 0.38326419 0.31163735 0.2387821 0.31257367 77 | 0.3321967 0.30832032 0.24724469 1. ]] 78 | """ 79 | -------------------------------------------------------------------------------- /examples/sentiment_analysis/README.md: -------------------------------------------------------------------------------- 1 | # Sentiment Analysis 2 | 3 | This folder contains examples for sentiment analysis. 4 | 5 | ## Summary 6 | 7 | |Notebook|Environment|Description| 8 | |---|---|---| 9 | |[Sentiment Analysis by dictonary](oseti_sentiment_analysis.py)|Local| Sentiment analysis by dictonary by [oseti](https://github.com/ikegami-yukino/oseti) | 10 | -------------------------------------------------------------------------------- /examples/sentiment_analysis/oseti_sentiment_analysis.py: -------------------------------------------------------------------------------- 1 | import oseti 2 | 3 | from utils_nlp.dataset.livedoor import load_pandas_df 4 | 5 | 6 | if __name__ == '__main__': 7 | df = load_pandas_df(nrows=10) 8 | text = df['text'][0][:30] 9 | print(text) 10 | # 友人代表のスピーチ、独女はどうこなしている?もうすぐジューン 11 | 12 | analyzer = oseti.Analyzer() 13 | print(analyzer.analyze(text)) 14 | print(analyzer.count_polarity(text)) 15 | # [1.0, 0] 16 | # [{'positive': 2, 'negative': 0}, {'positive': 0, 'negative': 0}] 17 | -------------------------------------------------------------------------------- /examples/text_classification/README.md: -------------------------------------------------------------------------------- 1 | # Text Classification 2 | 3 | This folder contains examples of text classification models. 4 | 5 | ## What is Text Classification? 6 | 7 | >Text classification is a supervised learning method of learning and predicting the category or the class of a document given its text content. 8 | >The state-of-the-art methods are based on neural networks of different architectures as well as pre-trained language models or word embeddings. 
9 | 10 | https://github.com/microsoft/nlp-recipes/blob/master/examples/text_classification/README.md 11 | 12 | ## Summary 13 | 14 | |Notebook|Environment|Description|ACC| 15 | |---|---|---|---| 16 | |[TF-IDF & Logistic Regression](tfidf_logistic_regression.py)|Local| [Logistic Regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html) with TF-IDF vectors | 0.9308 | 17 | |[TF-IDF & LightGBM](tfidf_lgbm.py)|Local| [LightGBM](https://github.com/microsoft/LightGBM) with TF-IDF vectors | 0.9512 | 18 | |[BERT](run_bert.py) 'cl-tohoku/bert-base-japanese-v2' |Local| [Transformers BERT](https://github.com/huggingface/transformers) | 0.9362 | 19 | |[BERT](run_bert.py) 'cl-tohoku/bert-base-japanese-char-v2' |Local| [Transformers BERT](https://github.com/huggingface/transformers) | 0.9274 | 20 | |[BERT](run_bert.py) 'cl-tohoku/bert-base-large' |Local| [Transformers BERT](https://github.com/huggingface/transformers) | - | 21 | |[T5](run_t5.py) |Local| [T5 for japanese](https://qiita.com/sonoisa/items/a9af64ff641f0bbfed44) | 0.9566 | 22 | 23 | Accuracy scores (ACC) are calculated by running code only in fold 0 in the condition that datasets are devided into train/val/test at the rate of 0.6/0.2/0.2. 24 | Be careful that the scores are highly affected by the way of splitting dataset and hyperparameters like the number of epochs. 25 | -------------------------------------------------------------------------------- /examples/text_classification/run_bert.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | 4 | import neologdn 5 | import numpy as np 6 | import pytorch_lightning as pl 7 | from sklearn import preprocessing 8 | from sklearn.metrics import log_loss 9 | from sklearn.model_selection import train_test_split, StratifiedKFold 10 | import torch 11 | from torch import nn 12 | from torch.utils.data import DataLoader 13 | from tqdm import tqdm 14 | from transformers import BertTokenizer 15 | 16 | sys.path.append('.') 17 | from utils_nlp.dataset.livedoor import load_pandas_df 18 | from utils_nlp.eval.classification import eval_classification 19 | from utils_nlp.models.nn.datasets import LivedoorDataset 20 | from utils_nlp.models.nn.models import PLBertClassifier 21 | 22 | 23 | def preprocess_data(df): 24 | # split 25 | df['text'] = df['text'].apply(neologdn.normalize) 26 | le = preprocessing.LabelEncoder() 27 | df['label'] = le.fit_transform(df['label']) 28 | 29 | X_train, X_test, y_train, y_test = train_test_split( 30 | df, df['label'].values, test_size=0.2, random_state=42, stratify=df['label']) 31 | 32 | return X_train.reset_index(drop=True), X_test.reset_index(drop=True), y_train, y_test 33 | 34 | 35 | if __name__ == '__main__': 36 | 37 | parser = argparse.ArgumentParser() 38 | parser.add_argument('--model_name') 39 | args = parser.parse_args() 40 | 41 | MODEL_NAME = args.model_name 42 | MAX_LEN = 300 43 | pl.seed_everything(777) 44 | device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') 45 | 46 | df = load_pandas_df(shuffle=True) 47 | X_train, X_test, y_train, y_test = preprocess_data(df) 48 | 49 | tokenizer = BertTokenizer.from_pretrained(MODEL_NAME) 50 | 51 | test_dataset = LivedoorDataset(X_test, tokenizer, MAX_LEN) 52 | test_loader = DataLoader(test_dataset, shuffle=False, batch_size=32, num_workers=4) 53 | 54 | y_preds = [] 55 | NUM_CLASS = 9 56 | oof_train = np.zeros((len(X_train), NUM_CLASS)) 57 | cv = StratifiedKFold(n_splits=4, shuffle=True, 
random_state=0) 58 | 59 | for fold_id, (train_index, valid_index) in enumerate(tqdm(cv.split(X_train, X_train['label']))): 60 | if fold_id == 0: 61 | X_tr = X_train.loc[train_index, :].reset_index(drop=True) 62 | X_val = X_train.loc[valid_index, :].reset_index(drop=True) 63 | y_tr = y_train[train_index] 64 | y_val = y_train[valid_index] 65 | 66 | train_dataset = LivedoorDataset(X_tr, tokenizer, MAX_LEN) 67 | valid_dataset = LivedoorDataset(X_val, tokenizer, MAX_LEN) 68 | 69 | train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16, num_workers=4) 70 | valid_loader = DataLoader(valid_dataset, shuffle=False, batch_size=32, num_workers=4) 71 | 72 | model = PLBertClassifier(model_name=MODEL_NAME, 73 | num_classes=NUM_CLASS) 74 | device ='cuda:0' if torch.cuda.is_available() else 'cpu' 75 | model = model.to(device) 76 | trainer = pl.Trainer(gpus=1, max_epochs=7) 77 | trainer.fit(model, train_loader, valid_loader) 78 | trainer.test(test_dataloaders=test_loader) 79 | 80 | y_preds = np.load('data/bert/preds.npy') 81 | print(f'test, log_loss: {log_loss(y_test, y_preds)}') 82 | result_dict = eval_classification(y_test, y_preds.argmax(axis=1)) 83 | print(result_dict) 84 | """ 85 | {'accuracy': 0.9362, 86 | 'precision': [0.8939, 0.9101, 0.9588, 0.9293, 0.9451, 0.9241, 0.9822, 0.9882, 0.8935], 87 | 'recall': [0.9195, 0.931, 0.9422, 0.902, 0.9885, 0.8639, 0.954, 0.9333, 0.9805], 88 | 'f1': [0.9065, 0.9205, 0.9504, 0.9154, 0.9663, 0.893, 0.9679, 0.96, 0.935]} 89 | """ 90 | -------------------------------------------------------------------------------- /examples/text_classification/run_t5.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | import gc 4 | 5 | import neologdn 6 | import numpy as np 7 | import pytorch_lightning as pl 8 | from sklearn import preprocessing 9 | from sklearn.metrics import log_loss 10 | from sklearn.model_selection import train_test_split, StratifiedKFold 11 | import torch 12 | from torch import nn 13 | from torch.utils.data import DataLoader 14 | from tqdm import tqdm 15 | from transformers import T5Tokenizer 16 | 17 | sys.path.append('.') 18 | from utils_nlp.dataset.livedoor import load_pandas_df 19 | from utils_nlp.eval.classification import eval_classification 20 | from utils_nlp.models.nn.datasets import LivedoorDatasetT5 as LivedoorDataset 21 | from utils_nlp.models.nn.models import PLT5Classifier 22 | 23 | 24 | def preprocess_data(df): 25 | # split 26 | df['text'] = df['text'].apply(neologdn.normalize) 27 | le = preprocessing.LabelEncoder() 28 | df['label'] = le.fit_transform(df['label']) 29 | 30 | X_train, X_test, y_train, y_test = train_test_split( 31 | df, df['label'].values, test_size=0.2, random_state=42, stratify=df['label']) 32 | 33 | return X_train.reset_index(drop=True), X_test.reset_index(drop=True), y_train, y_test 34 | 35 | 36 | if __name__ == '__main__': 37 | 38 | parser = argparse.ArgumentParser() 39 | parser.add_argument('--model_name') 40 | args = parser.parse_args() 41 | 42 | MODEL_NAME = args.model_name 43 | MAX_LEN = 300 44 | pl.seed_everything(777) 45 | device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') 46 | 47 | df = load_pandas_df(shuffle=True) 48 | X_train, X_test, y_train, y_test = preprocess_data(df) 49 | 50 | tokenizer = T5Tokenizer.from_pretrained("sonoisa/t5-base-japanese", is_fast=True) 51 | 52 | test_dataset = LivedoorDataset(X_test, tokenizer, MAX_LEN) 53 | test_loader = DataLoader(test_dataset, shuffle=False, batch_size=1, 
num_workers=4) 54 | 55 | y_preds = [] 56 | NUM_CLASS = 9 57 | cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=0) 58 | 59 | for fold_id, (train_index, valid_index) in enumerate(tqdm(cv.split(X_train, X_train['label']))): 60 | if fold_id == 0: 61 | X_tr = X_train.loc[train_index, :].reset_index(drop=True) 62 | X_val = X_train.loc[valid_index, :].reset_index(drop=True) 63 | y_tr = y_train[train_index] 64 | y_val = y_train[valid_index] 65 | 66 | train_dataset = LivedoorDataset(X_tr, tokenizer, MAX_LEN) 67 | valid_dataset = LivedoorDataset(X_val, tokenizer, MAX_LEN) 68 | 69 | train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16, num_workers=4) 70 | valid_loader = DataLoader(valid_dataset, shuffle=False, batch_size=32, num_workers=4) 71 | 72 | model = PLT5Classifier(model_name=MODEL_NAME) 73 | device ='cuda:0' if torch.cuda.is_available() else 'cpu' 74 | model = model.to(device) 75 | trainer = pl.Trainer(gpus=1, max_epochs=5) 76 | trainer.fit(model, train_loader, valid_loader) 77 | model.tokenizer.save_pretrained('data/t5') 78 | model.backbone.save_pretrained('data/t5') 79 | del train_loader, train_dataset, valid_loader, valid_dataset, X_train, X_test, y_train, df, X_tr, X_val 80 | gc.collect() 81 | trainer.test(test_dataloaders=test_loader) 82 | 83 | y_preds = np.load('data/t5/preds.npy') 84 | y_preds = np.array([int(d) for d in y_preds]) 85 | result_dict = eval_classification(y_test, y_preds) 86 | print(result_dict) 87 | """ 88 | {'accuracy': 0.9566, 89 | 'precision': [0.9699, 0.9194, 0.9815, 0.9583, 0.95, 0.9128, 0.977, 0.9888, 0.956], 90 | 'recall': [0.9253, 0.9828, 0.9191, 0.902, 0.9828, 0.929, 0.977, 0.9833, 0.987], 91 | 'f1': [0.9471, 0.95, 0.9493, 0.9293, 0.9661, 0.9208, 0.977, 0.9861, 0.9712]} 92 | """ 93 | -------------------------------------------------------------------------------- /examples/text_classification/tfidf_lgbm.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from konoha import WordTokenizer 4 | import lightgbm as lgb 5 | from loguru import logger 6 | import neologdn 7 | import numpy as np 8 | import pandas as pd 9 | from sklearn import preprocessing 10 | from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer 11 | from sklearn.metrics import log_loss 12 | from sklearn.model_selection import train_test_split, StratifiedKFold 13 | from tqdm import tqdm 14 | 15 | sys.path.append('.') 16 | from utils_nlp.common.data import Data 17 | from utils_nlp.dataset.livedoor import load_pandas_df 18 | from utils_nlp.eval.classification import eval_classification 19 | 20 | 21 | def preprocess_data(df): 22 | # split 23 | df['text'] = df['text'].apply(neologdn.normalize) 24 | le = preprocessing.LabelEncoder() 25 | df['label'] = le.fit_transform(df['label']) 26 | 27 | df_train, df_test, y_train, y_test = train_test_split( 28 | df, df['label'].values, test_size=0.2, random_state=42, stratify=df['label']) 29 | 30 | # tokenize 31 | tokenizer = WordTokenizer('MeCab') 32 | docs_train = np.array([ 33 | ' '.join(map(str, tokenizer.tokenize(text))) for text in df_train['text'] 34 | ]) 35 | docs_test = np.array([ 36 | ' '.join(map(str, tokenizer.tokenize(text))) for text in df_test['text'] 37 | ]) 38 | 39 | # tfidf: Don't use df_test for fitting 40 | count_vec = CountVectorizer(min_df=2, 41 | max_features=20000, 42 | ngram_range=(1, 3)) 43 | bags_train = count_vec.fit_transform(docs_train) 44 | bags_test = count_vec.transform(docs_test) 45 | 46 | tfidf = TfidfTransformer(use_idf=True, 
norm='l2', smooth_idf=True) 47 | tf_idf_train = tfidf.fit_transform(bags_train) 48 | tf_idf_test = tfidf.transform(bags_test) 49 | 50 | X_train = pd.DataFrame(tf_idf_train.toarray()) 51 | X_test = pd.DataFrame(tf_idf_test.toarray()) 52 | 53 | return X_train.reset_index(drop=True), X_test.reset_index(drop=True), y_train, y_test 54 | 55 | 56 | if __name__ == '__main__': 57 | 58 | df = load_pandas_df(shuffle=True) 59 | X_train, X_test, y_train, y_test = preprocess_data(df) 60 | 61 | RUN_NAME = 'lgbm' 62 | logger.add(f'data/{RUN_NAME}/result.log', 63 | colorize=True, 64 | format='{time} {message}') 65 | logger.info(f'{X_train.shape}, {X_test.shape}') 66 | 67 | y_preds = [] 68 | NUM_CLASS = 9 69 | oof_train = np.zeros((len(X_train), NUM_CLASS)) 70 | cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=0) 71 | 72 | params = { 73 | 'objective': 'multiclass', 74 | 'num_class': NUM_CLASS, 75 | 'num_leaves': 12, 76 | 'max_depth': 4, 77 | 'feature_fraction': 0.8, 78 | 'subsample_freq': 1, 79 | 'bagging_fraction': 0.7, 80 | 'min_data_in_leaf': 10, 81 | 'learning_rate': 0.1, 82 | 'boosting': 'gbdt', 83 | 'lambda_l1': 0.4, 84 | 'lambda_l2': 0.4, 85 | 'verbosity': -1, 86 | 'random_state': 42 87 | } 88 | 89 | for fold_id, (train_index, valid_index) in enumerate(tqdm(cv.split(X_train, y_train))): 90 | if fold_id == 0: 91 | X_tr = X_train.loc[train_index, :] 92 | X_val = X_train.loc[valid_index, :] 93 | y_tr = y_train[train_index] 94 | y_val = y_train[valid_index] 95 | 96 | lgb_train = lgb.Dataset(X_tr, y_tr) 97 | lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train) 98 | 99 | model = lgb.train(params, 100 | lgb_train, 101 | valid_sets=[lgb_train, lgb_eval], 102 | verbose_eval=10, 103 | num_boost_round=1000, 104 | early_stopping_rounds=10) 105 | 106 | Data.dump(model, f'data/{RUN_NAME}/model_{fold_id}.pkl') 107 | 108 | oof_train[valid_index] = model.predict(X_val) 109 | score = log_loss(y_val, oof_train[valid_index]) 110 | logger.info(f'fold {fold_id}, log_loss: {score}') 111 | 112 | y_pred = model.predict(X_test) 113 | y_preds.append(y_pred) 114 | 115 | y_preds = np.mean(y_preds, axis=0) 116 | logger.info(f'test, log_loss: {log_loss(y_test, y_preds)}') 117 | result_dict = eval_classification(y_test, y_preds.argmax(axis=1)) 118 | logger.info(str(result_dict)) 119 | """ 120 | {'accuracy': 0.9512, 121 | 'precision': [0.9253, 0.9714, 0.9713, 0.9348, 0.9286, 0.8786, 1.0, 0.9831, 0.9608], 122 | 'recall': [0.9253, 0.977, 0.9769, 0.8431, 0.9713, 0.8994, 1.0, 0.9667, 0.9545], 123 | 'f1': [0.9253, 0.9742, 0.9741, 0.8866, 0.9494, 0.8889, 1.0, 0.9748, 0.9577]} 124 | """ 125 | 126 | Data.dump(oof_train, f'data/{RUN_NAME}/oof_train.pkl') 127 | Data.dump(y_preds, f'data/{RUN_NAME}/y_preds.pkl') 128 | -------------------------------------------------------------------------------- /examples/text_classification/tfidf_logistic_regression.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from konoha import WordTokenizer 4 | from loguru import logger 5 | import neologdn 6 | import numpy as np 7 | import pandas as pd 8 | from sklearn import preprocessing 9 | from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer 10 | from sklearn.linear_model import LogisticRegression 11 | from sklearn.metrics import log_loss 12 | from sklearn.model_selection import train_test_split, StratifiedKFold 13 | from tqdm import tqdm 14 | 15 | sys.path.append('.') 16 | from utils_nlp.common.data import Data 17 | from utils_nlp.dataset.livedoor import 
load_pandas_df 18 | from utils_nlp.eval.classification import eval_classification 19 | 20 | 21 | def preprocess_data(df): 22 | # split 23 | df['text'] = df['text'].apply(neologdn.normalize) 24 | le = preprocessing.LabelEncoder() 25 | df['label'] = le.fit_transform(df['label']) 26 | 27 | df_train, df_test, y_train, y_test = train_test_split( 28 | df, df['label'].values, test_size=0.2, random_state=42, stratify=df['label']) 29 | 30 | # tokenize 31 | tokenizer = WordTokenizer('MeCab') 32 | docs_train = np.array([ 33 | ' '.join(map(str, tokenizer.tokenize(text))) for text in df_train['text'] 34 | ]) 35 | docs_test = np.array([ 36 | ' '.join(map(str, tokenizer.tokenize(text))) for text in df_test['text'] 37 | ]) 38 | 39 | # tfidf: Don't use df_test for fitting 40 | count_vec = CountVectorizer(min_df=2, 41 | max_features=20000, 42 | ngram_range=(1, 3)) 43 | bags_train = count_vec.fit_transform(docs_train) 44 | bags_test = count_vec.transform(docs_test) 45 | 46 | tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True) 47 | tf_idf_train = tfidf.fit_transform(bags_train) 48 | tf_idf_test = tfidf.transform(bags_test) 49 | 50 | X_train = pd.DataFrame(tf_idf_train.toarray()) 51 | X_test = pd.DataFrame(tf_idf_test.toarray()) 52 | 53 | return X_train.reset_index(drop=True), X_test.reset_index(drop=True), y_train, y_test 54 | 55 | 56 | if __name__ == '__main__': 57 | 58 | df = load_pandas_df(shuffle=True) 59 | X_train, X_test, y_train, y_test = preprocess_data(df) 60 | 61 | RUN_NAME = 'logistic_regression' 62 | logger.add(f'data/{RUN_NAME}/result.log', 63 | colorize=True, 64 | format='{time} {message}') 65 | logger.info(f'{X_train.shape}, {X_test.shape}') 66 | 67 | y_preds = [] 68 | NUM_CLASS = 9 69 | oof_train = np.zeros((len(X_train), NUM_CLASS)) 70 | cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=0) 71 | 72 | for fold_id, (train_index, valid_index) in enumerate(tqdm(cv.split(X_train, y_train))): 73 | if fold_id == 0: 74 | X_tr = X_train.loc[train_index, :] 75 | X_val = X_train.loc[valid_index, :] 76 | y_tr = y_train[train_index] 77 | y_val = y_train[valid_index] 78 | 79 | model = LogisticRegression(penalty='l2', solver='sag', random_state=0) 80 | model.fit(X_tr, y_tr) 81 | Data.dump(model, f'data/{RUN_NAME}/model_{fold_id}.pkl') 82 | 83 | oof_train[valid_index] = model.predict_proba(X_val) 84 | score = log_loss(y_val, oof_train[valid_index]) 85 | logger.info(f'fold {fold_id}, log_loss: {score}') 86 | 87 | y_pred = model.predict_proba(X_test) 88 | y_preds.append(y_pred) 89 | 90 | y_preds = np.mean(y_preds, axis=0) 91 | logger.info(f'test, log_loss: {log_loss(y_test, y_preds)}') 92 | result_dict = eval_classification(y_test, y_preds.argmax(axis=1)) 93 | logger.info(str(result_dict)) 94 | """ 95 | {'accuracy': 0.9308, 96 | 'precision': [0.8771, 0.96, 0.9639, 0.9412, 0.9198, 0.8678, 0.9771, 0.9309, 0.9517], 97 | 'recall': [0.9023, 0.9655, 0.9249, 0.7843, 0.9885, 0.8935, 0.9828, 0.9722, 0.8961], 98 | 'f1': [0.8895, 0.9628, 0.944, 0.8556, 0.9529, 0.8805, 0.9799, 0.9511, 0.9231]} 99 | """ 100 | 101 | Data.dump(oof_train, f'data/{RUN_NAME}/oof_train.pkl') 102 | Data.dump(y_preds, f'data/{RUN_NAME}/y_preds.pkl') 103 | -------------------------------------------------------------------------------- /examples/visualization/README.md: -------------------------------------------------------------------------------- 1 | # Visualization 2 | 3 | This folder contains examples for visualization with Japanese texts. 
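Matplotlib's bundled fonts cannot render Japanese glyphs, so labels come out as empty boxes; that is why the examples below rely on [japanize-matplotlib](https://github.com/uehara1414/japanize-matplotlib). Pointing matplotlib at any installed Japanese font achieves the same effect without the extra dependency; in the hedged sketch below, the font name `IPAexGothic` and the output path are assumptions to adjust for your environment:

```python
import matplotlib

matplotlib.use('Agg')  # headless backend, e.g. inside the container
import matplotlib.pyplot as plt

# Assumes a Japanese font such as IPAexGothic is installed on the system.
plt.rcParams['font.family'] = 'IPAexGothic'
plt.bar(['スポーツ', '映画', 'グルメ'], [3, 1, 2])
plt.title('日本語ラベルの例')
plt.savefig('japanize_alternative.png')
```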
4 | 5 | ## Summary 6 | 7 | |Notebook|Environment|Description| 8 | |---|---|---| 9 | |[Visualization](visualization.ipynb)|[Kaggle](https://www.kaggle.com/sishihara/japanese-text-visualization-by-nlplot)| Visualization by [nlplot](https://github.com/takapy0210/nlplot) | 10 | |[japanize-matplotlib](japanize_matplotlib.py)|Local| Use Japanese labels by [japanize-matplotlib](https://github.com/uehara1414/japanize-matplotlib) | 11 | -------------------------------------------------------------------------------- /examples/visualization/japanize.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/upura/nlp-recipes-ja/8ac5e898864137841de8b03c11da34815009af24/examples/visualization/japanize.png -------------------------------------------------------------------------------- /examples/visualization/japanize_labels.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import japanize_matplotlib 3 | 4 | from utils_nlp.dataset.livedoor import load_pandas_df 5 | 6 | 7 | if __name__ == '__main__': 8 | df = load_pandas_df() 9 | df['first_char'] = df['text'].str[0] 10 | plot_df = df['first_char'].value_counts()[:10].reset_index() 11 | 12 | japanize_matplotlib.japanize() 13 | plt.figure(figsize=(15, 8)) 14 | plt.bar(plot_df['index'], plot_df['first_char']) 15 | plt.savefig('examples/visualization/japanize.png') 16 | -------------------------------------------------------------------------------- /examples/visualization/visualization.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"metadata":{"trusted":true},"cell_type":"code","source":"!pip install nagisa\n!pip install nlplot","execution_count":null,"outputs":[]},{"metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"cell_type":"code","source":"import glob\nimport os\n\nimport nagisa\nimport nlplot\nimport pandas as pd","execution_count":null,"outputs":[]},{"metadata":{"_uuid":"d629ff2d2480ee46fbb7e2d37f6b5fab8052498a","_cell_guid":"79c7e3d0-c299-4dcb-8224-4455121ee9b0","trusted":true},"cell_type":"code","source":"def extract_txt(filename: str) -> str:\n with open(filename) as text_file:\n # 0: URL, 1: timestamp\n text = text_file.readlines()[2:]\n text = [sentence.strip() for sentence in text]\n text = list(filter(lambda line: line != '', text))\n return ''.join(text)\n\n\nEXTRACTDIR = '/kaggle/input/livedoor-news/'\ncategories = [\n name for name\n in os.listdir(os.path.join(EXTRACTDIR, \"text\"))\n if os.path.isdir(os.path.join(EXTRACTDIR, \"text\", name))]\n\ncategories = sorted(categories)\ntable = str.maketrans({\n '\\n': '',\n '\\t': ' ',\n '\\r': '',\n})\n\nall_text = []\nall_label = []\n\nfor cat in categories:\n files = glob.glob(os.path.join(EXTRACTDIR, \"text\", cat, \"{}*.txt\".format(cat)))\n files = sorted(files)\n body = [extract_txt(elem).translate(table) for elem in files]\n label = [cat] * len(body)\n\n all_text.extend(body)\n all_label.extend(label)\n\ndf = pd.DataFrame({'text': all_text, 'label': all_label})","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"df.head()","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"df = df.loc[:10]\ntagger = nagisa.Tagger()\ndf['sep_text'] = [tagger.extract(text, extract_postags=['名詞']).words for text in 
df['text']]","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"npt = nlplot.NLPlot(df, target_col='sep_text')\nstopwords = npt.get_stopword(top_n=5, min_freq=0)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# uni-gram\nnpt.bar_ngram(\n title='uni-gram',\n xaxis_label='word_count',\n yaxis_label='word',\n ngram=1,\n top_n=50,\n width=800,\n height=1100,\n stopwords=stopwords,\n)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# bi-gram\nnpt.bar_ngram(\n title='bi-gram',\n xaxis_label='word_count',\n yaxis_label='word',\n ngram=2,\n top_n=50,\n width=800,\n height=1100,\n stopwords=stopwords,\n)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"npt.treemap(\n title='Tree Map',\n ngram=1,\n stopwords=stopwords,\n)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"npt.word_distribution(\n title='number of words distribution'\n)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"npt.wordcloud(\n stopwords=stopwords,\n colormap='tab20_r',\n)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"npt.build_graph(stopwords=stopwords, min_edge_frequency=5)\nnpt.co_network(\n title='Co-occurrence network',\n color_palette='hls',\n width=1000,\n height=1200,\n)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"npt.sunburst(\n title='sunburst chart',\n colorscale=True,\n color_continuous_scale='Oryel',\n width=1000,\n height=800,\n)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"npt.ldavis(num_topics=3, passes=5, save=False)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"","execution_count":null,"outputs":[]}],"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"pygments_lexer":"ipython3","nbconvert_exporter":"python","version":"3.6.4","file_extension":".py","codemirror_mode":{"name":"ipython","version":3},"name":"python","mimetype":"text/x-python"}},"nbformat":4,"nbformat_minor":4} -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | chakin==0.0.8 2 | ja-sentence-segmenter==0.0.2 3 | japanize-matplotlib==1.1.2 4 | konoha[all]==4.6.1 5 | loguru==0.5.1 6 | mecab-python3==1.0.3 7 | nagisa==0.2.7 8 | neologdn==0.4 9 | oseti==0.2 10 | pykakasi==2.0.1 11 | pytorch-lightning==1.2.7 12 | transformers==4.5.0 13 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/upura/nlp-recipes-ja/8ac5e898864137841de8b03c11da34815009af24/tests/README.md -------------------------------------------------------------------------------- /utils_nlp/README.md: -------------------------------------------------------------------------------- 1 | # NLP Utilities 2 | -------------------------------------------------------------------------------- /utils_nlp/common/data.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import joblib 4 | 5 | 6 | class Data: 7 | @classmethod 8 | 
def dump(cls, value, path): 9 | os.makedirs(os.path.dirname(path), exist_ok=True) 10 | joblib.dump(value, path, compress=True) 11 | 12 | @classmethod 13 | def load(cls, path): 14 | return joblib.load(path) 15 | -------------------------------------------------------------------------------- /utils_nlp/dataset/README.md: -------------------------------------------------------------------------------- 1 | # Dataset 2 | 3 | This submodule includes helper functions for downloading datasets and formatting them appropriately for the example scripts. 4 | 5 | ## Data Loading 6 | 7 | There is a dataloader for each supported dataset. For example, the livedoor module will allow you to load a dataframe in pandas from the livedoor dataset, with the option to set the number of rows to load in order to test algorithms and evaluate performance benchmarks. 8 | The number of rows to load and whether to shuffle them can be controlled via arguments, for example: 9 | 10 | ```python 11 | from utils_nlp.dataset.livedoor import load_pandas_df 12 | 13 | df = load_pandas_df(nrows=1000, shuffle=False) 14 | ``` 15 | 16 | ## Dataset List 17 | |Dataset|Dataloader script| 18 | |-------|-----------------| 19 | |[livedoor ニュースコーパス](https://www.rondhuit.com/download.html)|[livedoor.py](./livedoor.py)| 20 | -------------------------------------------------------------------------------- /utils_nlp/dataset/livedoor.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import tarfile 4 | from urllib.request import urlretrieve 5 | 6 | import pandas as pd 7 | 8 | 9 | def load_pandas_df(nrows: int = None, shuffle: bool = False) -> pd.DataFrame: 10 | """Loads the livedoor dataset as pd.DataFrame 11 | This code is from https://github.com/yoheikikuta/bert-japanese/blob/master/notebook/finetune-to-livedoor-corpus.ipynb 12 | 13 | Args: 14 | nrows (int, optional): Number of rows to load. Defaults to None (all rows are loaded).
15 | 16 | Returns: 17 | pd.DataFrame: livedoor dataset 18 | """ 19 | if os.path.exists('./data/livedoor.csv'): 20 | df = pd.read_csv('./data/livedoor.csv') 21 | else: 22 | df = download_livedoor() 23 | 24 | if shuffle: 25 | df = df.sample(frac=1, random_state=7).reset_index(drop=True) 26 | 27 | if nrows: 28 | df = df[:nrows] 29 | 30 | return df 31 | 32 | 33 | def download_livedoor() -> pd.DataFrame: 34 | """Download the dataset from "https://www.rondhuit.com/download.html", unzip, and load 35 | 36 | Returns: 37 | pd.DataFrame: livedoor dataset 38 | """ 39 | FILEURL = 'https://www.rondhuit.com/download/ldcc-20140209.tar.gz' 40 | FILEPATH = './data/ldcc-20140209.tar.gz' 41 | EXTRACTDIR = './data/livedoor/' 42 | urlretrieve(FILEURL, FILEPATH) 43 | 44 | mode = "r:gz" 45 | tar = tarfile.open(FILEPATH, mode) 46 | tar.extractall(EXTRACTDIR) 47 | tar.close() 48 | 49 | categories = [ 50 | name for name 51 | in os.listdir(os.path.join(EXTRACTDIR, "text")) 52 | if os.path.isdir(os.path.join(EXTRACTDIR, "text", name))] 53 | 54 | categories = sorted(categories) 55 | table = str.maketrans({ 56 | '\n': '', 57 | '\t': ' ', 58 | '\r': '', 59 | }) 60 | 61 | all_text = [] 62 | all_label = [] 63 | 64 | for cat in categories: 65 | files = glob.glob(os.path.join(EXTRACTDIR, "text", cat, "{}*.txt".format(cat))) 66 | files = sorted(files) 67 | body = [extract_txt(elem).translate(table) for elem in files] 68 | label = [cat] * len(body) 69 | 70 | all_text.extend(body) 71 | all_label.extend(label) 72 | 73 | df = pd.DataFrame({'text': all_text, 'label': all_label}) 74 | df.to_csv('./data/livedoor.csv', index=False) 75 | return df 76 | 77 | 78 | def extract_txt(filename: str) -> str: 79 | with open(filename) as text_file: 80 | # 0: URL, 1: timestamp 81 | text = text_file.readlines()[2:] 82 | text = [sentence.strip() for sentence in text] 83 | text = list(filter(lambda line: line != '', text)) 84 | return ''.join(text) 85 | -------------------------------------------------------------------------------- /utils_nlp/eval/classification.py: -------------------------------------------------------------------------------- 1 | # Implementation from https://github.com/microsoft/nlp-recipes/blob/master/utils_nlp/eval/classification.py 2 | from sklearn.metrics import ( 3 | accuracy_score, 4 | precision_score, 5 | recall_score, 6 | f1_score, 7 | confusion_matrix, 8 | ) 9 | from numpy import corrcoef 10 | 11 | from matplotlib import pyplot 12 | import seaborn as sn 13 | import numpy as np 14 | import pandas as pd 15 | 16 | 17 | def eval_classification(actual, predicted, round_decimals=4): 18 | """Returns common classification evaluation metrics. 19 | Args: 20 | actual (1d array-like): Array of actual values. 21 | predicted (1d array-like): Array of predicted values. 22 | round_decimals (int, optional): Number of decimal places. Defaults to 4. 23 | Returns: 24 | dict: A dictionary of evaluation metrics. 25 | """ 26 | return { 27 | "accuracy": accuracy_score(actual, predicted).round(round_decimals), 28 | "precision": list(precision_score(actual, predicted, average=None).round(round_decimals)), 29 | "recall": list(recall_score(actual, predicted, average=None).round(round_decimals)), 30 | "f1": list(f1_score(actual, predicted, average=None).round(round_decimals)), 31 | } 32 | 33 | 34 | def compute_correlation_coefficients(x, y=None): 35 | """ 36 | Compute Pearson product-moment correlation coefficients. 37 | Args: 38 | x: array_like 39 | A 1-D or 2-D array containing multiple variables and observations. 
40 | Each row of `x` represents a variable, and each column a single 41 | observation of all those variables. 42 | y: array_like, optional 43 | An additional set of variables and observations. `y` has the same 44 | shape as `x`. 45 | Returns: 46 | pd.DataFrame : A pandas dataframe from the correlation coefficient matrix of the variables. 47 | """ 48 | return pd.DataFrame(corrcoef(x, y)) 49 | 50 | 51 | def plot_confusion_matrix( 52 | y_true, 53 | y_pred, 54 | labels, 55 | normalize=False, 56 | title="Confusion matrix", 57 | plot_size=(8, 5), 58 | font_scale=1.1, 59 | ): 60 | """Plots a graphical representation of a confusion matrix using a Seaborn heatmap 61 | Args: 62 | y_true (1d array-like): True labels from dataset 63 | y_pred (1d array-like): Predicted labels from the models 64 | labels: A list of labels 65 | normalize (bool, optional): Whether to row-normalize the confusion matrix. Defaults to False. 66 | title (str, optional): Title of the plot. Defaults to "Confusion matrix". 67 | plot_size (tuple, optional): Plot dimensions. Defaults to (8, 5). 68 | font_scale (float, optional): Scale factor for fonts within the plot. Defaults to 1.1. 69 | """ 70 | conf_matrix = np.array(confusion_matrix(y_true, y_pred)) 71 | if normalize: 72 | conf_matrix = np.round( 73 | conf_matrix.astype("float") / conf_matrix.sum(axis=1)[:, np.newaxis], 3 74 | ) 75 | conf_dataframe = pd.DataFrame(conf_matrix, labels, labels) 76 | fig, ax = pyplot.subplots(figsize=plot_size) 77 | sn.set(font_scale=font_scale) 78 | ax.set_title(title) 79 | ax = sn.heatmap(conf_dataframe, cmap="Blues", annot=True, annot_kws={"size": 16}, fmt="g") 80 | ax.set(xlabel="Predicted Labels", ylabel="True Labels") 81 | -------------------------------------------------------------------------------- /utils_nlp/features/README.md: -------------------------------------------------------------------------------- 1 | # Features 2 | 3 | The features submodule contains implementations of various algorithms that can create features from sentences. 4 | 5 | ## Summary 6 | 7 | The following table summarizes each submodule.
8 | 9 | |Submodule|Description| 10 | |---|---| 11 | |[SWEM](swem.py) | Create swem-max and swem-mean vectors| 12 | |[SCDV](scdv.py) | Create scdv vectors| 13 | -------------------------------------------------------------------------------- /utils_nlp/features/scdv.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import numpy as np 3 | import pandas as pd 4 | from sklearn.mixture import GaussianMixture 5 | 6 | 7 | def create(docs, word_vec, n_components=10): 8 | """Create scdv vectors 9 | 10 | Args: 11 | docs: np.array() 12 | word_vec: Loaded word2vectors 13 | n_components (int, optional): Number of components 14 | 15 | Returns: 16 | swem: Created scdv vectors 17 | """ 18 | n_wv_embed = word_vec.vector_size 19 | 20 | # Create vocab set of w2v model and corpus 21 | vocab_model = set(k for k in word_vec.vocab.keys()) 22 | vocab_docs = set([w for doc in docs for w in doc]) 23 | out_of_vocabs = len(vocab_docs) - len(vocab_docs & vocab_model) 24 | print('out of vocabs: {out_of_vocabs}'.format(**locals())) 25 | use_words = list(vocab_docs & vocab_model) 26 | 27 | df_use = pd.DataFrame() 28 | df_use['word'] = use_words 29 | df_idf = create_idf_dataframe(docs) 30 | df_use = pd.merge(df_use, df_idf, on='word', how='left') 31 | idf = df_use['idf'].values 32 | 33 | use_word_vectors = np.array([word_vec[w] for w in use_words]) 34 | 35 | clf = GaussianMixture(n_components=n_components, covariance_type='tied', verbose=2) 36 | clf.fit(use_word_vectors) 37 | 38 | word_probs = clf.predict_proba(use_word_vectors) 39 | # (n_vocabs, n_components,) 40 | word_cluster_vector = use_word_vectors[:, None, :] * word_probs[:, :, None] 41 | # (n_vocabs, n_components, n_wv_embed) 42 | 43 | topic_vector = word_cluster_vector.reshape(-1, n_components * n_wv_embed) * idf[:, None] 44 | 45 | topic_vector[np.isnan(topic_vector)] = 0 46 | word_to_topic = dict(zip(use_words, topic_vector)) 47 | n_embedding = topic_vector.shape[1] 48 | 49 | cdv_vector = create_document_vector(docs, word_to_topic, n_embedding) 50 | compressed = compress_document_vector(cdv_vector) 51 | 52 | return compressed 53 | 54 | 55 | def create_idf_dataframe(documents): 56 | """Create idf pd.DataFrame 57 | 58 | Args: 59 | documents (list[str]): 60 | Returns: 61 | [pd.DataFrame]: Created pd.DataFrame 62 | """ 63 | 64 | d = defaultdict(int) 65 | 66 | for doc in documents: 67 | vocab_i = set(doc) 68 | for w in list(vocab_i): 69 | d[w] += 1 70 | 71 | df_idf = pd.DataFrame() 72 | df_idf['count'] = d.values() 73 | df_idf['word'] = d.keys() 74 | df_idf['idf'] = np.log(len(documents) / df_idf['count']) 75 | return df_idf 76 | 77 | 78 | def create_document_vector(documents, w2t, n_embedding): 79 | doc_vectors = [] 80 | 81 | for doc in documents: 82 | vector_i = np.zeros(shape=(n_embedding,)) 83 | for w in doc: 84 | try: 85 | v = w2t[w] 86 | vector_i += v 87 | except KeyError: 88 | continue 89 | doc_vectors.append(vector_i) 90 | return np.array(doc_vectors) 91 | 92 | 93 | def compress_document_vector(doc_vector, p=.04): 94 | v = np.copy(doc_vector) 95 | vec_norm = np.linalg.norm(v, axis=1) 96 | # To escape from zero division 97 | vec_norm = np.where(vec_norm > 0, vec_norm, 1.) 98 | v /= vec_norm[:, None] 99 | 100 | a_min = v.min(axis=1).mean() 101 | a_max = v.max(axis=1).mean() 102 | threshold = (abs(a_min) + abs(a_max)) / 2. 
* p 103 | v[abs(v) < threshold] = .0 104 | return v 105 | -------------------------------------------------------------------------------- /utils_nlp/features/swem.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from tqdm import tqdm 3 | 4 | from utils_nlp.models.pretrained_embeddings.word2vec import convert_to_wv 5 | 6 | 7 | def create(docs, word_vec, aggregation='max'): 8 | """Create swem vectors 9 | 10 | Args: 11 | docs: Iterable of tokenized documents (each a list of words) 12 | word_vec: Loaded word2vectors 13 | aggregation (str, optional): Pooling method, either 'max' or 'mean'. Defaults to 'max'. 14 | 15 | Raises: 16 | ValueError: Invalid aggregation arg 17 | 18 | Returns: 19 | swem: Created swem vectors 20 | """ 21 | if aggregation == 'max': 22 | agg = np.max 23 | elif aggregation == 'mean': 24 | agg = np.mean 25 | else: 26 | raise ValueError() 27 | 28 | swem = [] 29 | for sentence in tqdm(docs, total=len(docs)): 30 | embed_i = [convert_to_wv(s, word_vec) for s in sentence] 31 | embed_i = np.array(embed_i) 32 | embed_i = agg(embed_i, axis=0) 33 | swem.append(embed_i) 34 | swem = np.array(swem) 35 | return swem 36 | -------------------------------------------------------------------------------- /utils_nlp/models/README.md: -------------------------------------------------------------------------------- 1 | # Models 2 | 3 | The models submodule contains implementations of various algorithms that can be used in addition to external packages to evaluate and develop new natural language processing systems. 4 | 5 | ## Summary 6 | 7 | The following table summarizes each submodule. 8 | 9 | |Submodule|Description| 10 | |---|---| 11 | |[pretrained embeddings](./pretrained_embeddings) | This submodule provides utilities to download and extract pretrained word embeddings trained with Word2Vec, GloVe, fastText methods.| |[nn](./nn) | This submodule contains neural network utilities such as datasets, model definitions, and runners.| 12 | -------------------------------------------------------------------------------- /utils_nlp/models/nn/README.md: -------------------------------------------------------------------------------- 1 | # Neural Networks 2 | 3 | The neural networks submodule contains utility functions like datasets and runners.
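For orientation, the sketch below shows one way these pieces can be wired together with `transformers` and `pytorch-lightning`. The full training script is `examples/text_classification/run_bert.py`; the checkpoint name, sequence length, batch size, and label encoding below are illustrative assumptions, not values taken from that script.

```python
import pytorch_lightning as pl
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

from utils_nlp.dataset.livedoor import load_pandas_df
from utils_nlp.models.nn.datasets import LivedoorDataset
from utils_nlp.models.nn.models import PLBertClassifier

MODEL_NAME = 'cl-tohoku/bert-base-japanese-whole-word-masking'  # assumed checkpoint; its tokenizer needs the Japanese MeCab dependencies

df = load_pandas_df(shuffle=True)
df['label'] = df['label'].astype('category').cat.codes  # encode string labels as integers

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
dataset = LivedoorDataset(df, tokenizer, max_len=128)  # max_len=128 is an illustrative choice
loader = DataLoader(dataset, batch_size=16, shuffle=True)  # batch_size=16 is an illustrative choice

model = PLBertClassifier(model_name=MODEL_NAME, num_classes=9)
trainer = pl.Trainer(max_epochs=1)
trainer.fit(model, loader)
```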
4 | -------------------------------------------------------------------------------- /utils_nlp/models/nn/datasets.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class LivedoorDataset: 5 | 6 | def __init__(self, dataframe, tokenizer, max_len): 7 | self.tokenizer = tokenizer 8 | self.data = dataframe 9 | self.comment_text = dataframe['text'] 10 | self.targets = self.data['label'] 11 | self.max_len = max_len 12 | 13 | def __len__(self): 14 | return len(self.comment_text) 15 | 16 | def __getitem__(self, index): 17 | inputs = self.tokenizer.encode_plus( 18 | self.comment_text[index], 19 | None, 20 | add_special_tokens=True, 21 | max_length=self.max_len, 22 | truncation=True, 23 | padding='max_length', 24 | return_token_type_ids=True 25 | ) 26 | ids = inputs['input_ids'] 27 | attention_mask = inputs['attention_mask'] 28 | token_type_ids = inputs["token_type_ids"] 29 | 30 | return { 31 | 'ids': torch.tensor(ids, dtype=torch.long), 32 | 'attention_mask': torch.tensor(attention_mask, dtype=torch.long), 33 | 'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long), 34 | 'targets': torch.tensor(self.targets[index], dtype=torch.long) 35 | } 36 | 37 | 38 | class LivedoorDatasetT5: 39 | 40 | def __init__(self, dataframe, tokenizer, max_len): 41 | self.tokenizer = tokenizer 42 | self.data = dataframe 43 | self.comment_text = dataframe['text'] 44 | self.targets = self.data['label'] 45 | self.max_len = max_len 46 | 47 | def __len__(self): 48 | return len(self.comment_text) 49 | 50 | def __getitem__(self, index): 51 | tokenized_inputs = self.tokenizer.encode_plus( 52 | self.comment_text[index], max_length=self.max_len, truncation=True, 53 | padding="max_length", return_tensors="pt" 54 | ) 55 | 56 | tokenized_targets = self.tokenizer.encode_plus( 57 | str(self.targets[index]), max_length=4, truncation=True, 58 | padding="max_length", return_tensors="pt" 59 | ) 60 | 61 | source_ids = tokenized_inputs["input_ids"].squeeze() 62 | target_ids = tokenized_targets["input_ids"].squeeze() 63 | 64 | source_mask = tokenized_inputs["attention_mask"].squeeze() 65 | target_mask = tokenized_targets["attention_mask"].squeeze() 66 | 67 | return {"source_ids": source_ids, "source_mask": source_mask, 68 | "target_ids": target_ids, "target_mask": target_mask} 69 | -------------------------------------------------------------------------------- /utils_nlp/models/nn/models.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import numpy as np 4 | import pytorch_lightning as pl 5 | import torch 6 | from transformers import BertModel, T5ForConditionalGeneration, T5Tokenizer 7 | 8 | sys.path.append('.') 9 | from utils_nlp.eval.classification import eval_classification 10 | 11 | 12 | class BertClassifier(torch.nn.Module): 13 | def __init__(self, model_name: str, num_classes: int = 9): 14 | super().__init__() 15 | self.l1 = BertModel.from_pretrained(model_name) 16 | self.l2 = torch.nn.Dropout(0.3) 17 | if 'large' in model_name: 18 | self.l3 = torch.nn.Linear(1024, num_classes) 19 | else: 20 | self.l3 = torch.nn.Linear(768, num_classes) 21 | 22 | def forward(self, ids, attention_mask, token_type_ids): 23 | _, output_1 = self.l1(ids, attention_mask=attention_mask, token_type_ids=token_type_ids, return_dict=False) 24 | output_2 = self.l2(output_1) 25 | output = self.l3(output_2) 26 | return output 27 | 28 | 29 | class PLBertClassifier(pl.LightningModule): 30 | def __init__(self, model_name: str, num_classes: 
int = 9): 31 | super().__init__() 32 | self.backbone = BertClassifier(model_name, num_classes) 33 | self.criterion = torch.nn.CrossEntropyLoss() 34 | 35 | def forward(self, ids, attention_mask, token_type_ids): 36 | output = self.backbone(ids, attention_mask=attention_mask, token_type_ids=token_type_ids) 37 | return output 38 | 39 | def training_step(self, batch, batch_idx): 40 | ids = batch["ids"] 41 | attention_mask = batch["attention_mask"] 42 | token_type_ids = batch["token_type_ids"] 43 | targets = batch["targets"] 44 | output = self.backbone(ids, attention_mask=attention_mask, token_type_ids=token_type_ids) 45 | loss = self.criterion(output, targets) 46 | return loss 47 | 48 | def validation_step(self, batch, batch_idx): 49 | ids = batch["ids"] 50 | attention_mask = batch["attention_mask"] 51 | token_type_ids = batch["token_type_ids"] 52 | targets = batch["targets"] 53 | output = self.backbone(ids, attention_mask=attention_mask, token_type_ids=token_type_ids) 54 | loss = self.criterion(output, targets) 55 | return {'oof': output, 'targets': targets} 56 | 57 | def validation_epoch_end(self, outputs): 58 | oof = np.concatenate( 59 | [x['oof'].detach().cpu().numpy() for x in outputs], axis=0 60 | ) 61 | targets = np.concatenate( 62 | [x['targets'].detach().cpu().numpy() for x in outputs], axis=0 63 | ) 64 | print(eval_classification(targets, oof.argmax(axis=1))) 65 | 66 | def test_step(self, batch, batch_idx): 67 | ids = batch["ids"] 68 | attention_mask = batch["attention_mask"] 69 | token_type_ids = batch["token_type_ids"] 70 | targets = batch["targets"] 71 | output = self.backbone(ids, attention_mask=attention_mask, token_type_ids=token_type_ids) 72 | loss = self.criterion(output, targets) 73 | return {'preds': output} 74 | 75 | def test_epoch_end(self, outputs): 76 | preds = np.concatenate( 77 | [x['preds'].detach().cpu().numpy() for x in outputs], axis=0 78 | ) 79 | np.save('data/bert/preds', preds) 80 | 81 | def configure_optimizers(self): 82 | optimizer = torch.optim.Adam(self.parameters(), lr=1e-4) 83 | scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=self.trainer.max_epochs) 84 | return [optimizer], [scheduler] 85 | 86 | 87 | class PLT5Classifier(pl.LightningModule): 88 | def __init__(self, model_name: str): 89 | super().__init__() 90 | self.backbone = T5ForConditionalGeneration.from_pretrained(model_name) 91 | self.tokenizer = T5Tokenizer.from_pretrained(model_name, is_fast=True) 92 | 93 | def forward(self, input_ids, attention_mask=None, decoder_input_ids=None, 94 | decoder_attention_mask=None, labels=None): 95 | """順伝搬""" 96 | return self.backbone( 97 | input_ids=input_ids, 98 | attention_mask=attention_mask, 99 | decoder_input_ids=decoder_input_ids, 100 | decoder_attention_mask=decoder_attention_mask, 101 | labels=labels 102 | ) 103 | 104 | def _step(self, batch): 105 | """ロス計算""" 106 | labels = batch["target_ids"] 107 | 108 | # All labels set to -100 are ignored (masked), 109 | # the loss is only computed for labels in [0, ..., config.vocab_size] 110 | labels[labels[:, :] == self.tokenizer.pad_token_id] = -100 111 | outputs = self( 112 | input_ids=batch["source_ids"], 113 | attention_mask=batch["source_mask"], 114 | decoder_attention_mask=batch['target_mask'], 115 | labels=labels 116 | ) 117 | 118 | loss = outputs[0] 119 | return loss 120 | 121 | def training_step(self, batch, batch_idx): 122 | """訓練ステップ処理""" 123 | loss = self._step(batch) 124 | self.log("train_loss", loss) 125 | return {"loss": loss} 126 | 127 | def validation_step(self, batch, 
batch_idx): 128 | """バリデーションステップ処理""" 129 | loss = self._step(batch) 130 | self.log("val_loss", loss) 131 | return {"val_loss": loss} 132 | 133 | def validation_epoch_end(self, outputs): 134 | """バリデーション完了処理""" 135 | avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean() 136 | self.log("val_loss", avg_loss, prog_bar=True) 137 | 138 | def test_step(self, batch, batch_idx): 139 | outs = self.backbone.generate( 140 | input_ids=batch["source_ids"], 141 | attention_mask=batch["source_mask"], 142 | max_length=4, 143 | return_dict_in_generate=True, 144 | output_scores=True 145 | ) 146 | 147 | dec = [self.tokenizer.decode(ids, skip_special_tokens=True, 148 | clean_up_tokenization_spaces=False) 149 | for ids in outs.sequences] 150 | return {"preds": dec} 151 | 152 | def test_epoch_end(self, outputs): 153 | preds = np.concatenate( 154 | [x['preds'] for x in outputs], axis=0 155 | ) 156 | np.save('data/t5/preds', preds) 157 | 158 | def configure_optimizers(self): 159 | optimizer = torch.optim.Adam(self.parameters(), lr=1e-4) 160 | scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=self.trainer.max_epochs) 161 | return [optimizer], [scheduler] 162 | -------------------------------------------------------------------------------- /utils_nlp/models/nn/runner.py: -------------------------------------------------------------------------------- 1 | from catalyst.dl import Runner 2 | import torch 3 | 4 | 5 | class CustomRunner(Runner): 6 | def _handle_batch(self, batch): 7 | ids = batch['ids'] 8 | mask = batch['mask'] 9 | token_type_ids = batch['token_type_ids'] 10 | targets = batch['targets'] 11 | outputs = self.model(ids, mask, token_type_ids) 12 | loss = self.criterion(outputs, targets) 13 | self.batch_metrics = {'loss': loss} 14 | if self.is_train_loader: 15 | loss.backward() 16 | self.optimizer.step() 17 | self.optimizer.zero_grad() 18 | 19 | @torch.no_grad() 20 | def predict_batch(self, batch): 21 | batch = self._batch2device(batch, self.device) 22 | ids = batch['ids'] 23 | mask = batch['mask'] 24 | token_type_ids = batch['token_type_ids'] 25 | outputs = self.model(ids, mask, token_type_ids) 26 | return outputs 27 | -------------------------------------------------------------------------------- /utils_nlp/models/pretrained_embeddings/README.md: -------------------------------------------------------------------------------- 1 | # Pretrained Embeddings 2 | 3 | The pretrained embeddings submodule contains utility functions that help users quickly load and extract various types of pretrained embeddings such as fastText, GloVe, Word2Vec, etc. 4 | -------------------------------------------------------------------------------- /utils_nlp/models/pretrained_embeddings/word2vec.py: -------------------------------------------------------------------------------- 1 | import os 2 | import requests 3 | import zipfile 4 | 5 | import numpy as np 6 | from tqdm import tqdm 7 | import gensim 8 | 9 | 10 | def _maybe_download_and_extract(dest_path, file_name): 11 | """ Downloads and extracts Word2vec vectors if they don’t already exist 12 | Args: 13 | dest_path: Path to the directory where the vectors will be extracted. 14 | file_name: File name of the word2vec vector file. 15 | Returns: 16 | str: File path to the word2vec vector file. 
17 | """ 18 | 19 | dir_path = os.path.join(dest_path, "word2vec") 20 | file_path = os.path.join(dir_path, file_name) 21 | dl_path = os.path.join(file_path, '{}.zip'.format(file_name)) 22 | 23 | if not os.path.exists(file_path): 24 | os.makedirs(file_path, exist_ok=True) 25 | download_from_gdrive('0ByFQ96A4DgSPUm9wVWRLdm5qbmc', destination=dl_path) 26 | with zipfile.ZipFile(dl_path) as f: 27 | f.extractall(file_path) 28 | else: 29 | print("Vector file already exists. No changes made.") 30 | 31 | return file_path 32 | 33 | 34 | def load_pretrained_vectors( 35 | dir_path, file_name='vector_neologd', limit=None 36 | ): 37 | """ Method that loads word2vec vectors. Downloads if it doesn't exist. 38 | Args: 39 | file_name(str): Name of the word2vec file. 40 | dir_path(str): Path to the directory where word2vec vectors exist or will be 41 | downloaded. 42 | limit(int): Number of word vectors that is loaded from gensim. This option 43 | allows us to save RAM space and avoid memory errors. 44 | Returns: 45 | gensim.models.keyedvectors.Word2VecKeyedVectors: Loaded word2vectors 46 | """ 47 | file_path = _maybe_download_and_extract(dir_path, file_name) 48 | model_path = os.path.join(file_path, 'model.vec') 49 | word2vec_vectors = gensim.models.KeyedVectors.load_word2vec_format( 50 | model_path, binary=False, limit=limit 51 | ) 52 | 53 | return word2vec_vectors 54 | 55 | 56 | def download_from_gdrive(id, destination): 57 | """ 58 | Download file from Google Drive 59 | :param str id: g-drive id 60 | :param str destination: output path 61 | :return: 62 | """ 63 | url = "https://docs.google.com/uc?export=download" 64 | 65 | session = requests.Session() 66 | response = session.get(url, params={'id': id}, stream=True) 67 | token = get_confirm_token(response) 68 | if token: 69 | print("get download warning. set confirm token.") 70 | params = {'id': id, 'confirm': token} 71 | response = session.get(url, params=params, stream=True) 72 | save_response_content(response, destination) 73 | 74 | 75 | def get_confirm_token(response): 76 | """ 77 | verify whether warned or not. 78 | [note] In Google Drive Api, if requests content size is large, 79 | the user are send to verification page. 80 | :param requests.Response response: 81 | :return: 82 | """ 83 | for k, v in response.cookies.items(): 84 | if k.startswith("download_warning"): 85 | return v 86 | 87 | return None 88 | 89 | 90 | def save_response_content(response, destination): 91 | """ 92 | :param requests.Response response: 93 | :param str destination: 94 | :return: 95 | """ 96 | chunk_size = 1024 * 1024 97 | print("start downloading...") 98 | with open(destination, "wb") as f: 99 | for chunk in tqdm(response.iter_content(chunk_size), unit="MB"): 100 | f.write(chunk) 101 | print("Finish!!") 102 | print("Save to:{}".format(destination)) 103 | 104 | 105 | def convert_to_wv(w: str, word_vec): 106 | """Convert word to vectors 107 | 108 | Args: 109 | w (str): Word 110 | word_vec: Loaded word2vectors 111 | 112 | Returns: 113 | [type]: numpy vectors 114 | """ 115 | try: 116 | v = word_vec.word_vec(w) 117 | except KeyError: 118 | v = np.zeros(shape=(word_vec.vector_size,)) 119 | return v 120 | --------------------------------------------------------------------------------