├── .gitignore
├── LICENSE
├── README.md
├── abstractive-summarization
│   ├── 1.dilated-seq2seq.ipynb
│   ├── 2.pointer-generator-bahdanau.ipynb
│   ├── 3.pointer-generator-luong.ipynb
│   ├── 4.dilated-fairseq-pointer-generator.ipynb
│   ├── 5.bert-multilanguage-dilated-fairseq-pointer-generator.ipynb
│   ├── README.md
│   └── pointer_generator_helper.py
├── augmentation
│   └── 1.word2vec.ipynb
├── cintailah-bahasa-malaysia-menggunakan-tensorflow.jpg
├── dependency-parsing
│   ├── 1.lstm-birnn-crf-biaffine.ipynb
│   ├── 2.lstm-birnn-bahdanau-crf-biaffine.ipynb
│   ├── 3.lstm-birnn-luong-crf-biaffine.ipynb
│   ├── 4.bert-crf-biaffine.ipynb
│   ├── 5.biaffine-attention-cross-entropy.ipynb
│   ├── 6.bert-biaffine-attention-cross-entropy.ipynb
│   ├── 7.xlnet-biaffine-attention-cross-entropy.ipynb
│   └── README.md
├── english-malay-translation
│   ├── 1.attention-is-all-you-need.ipynb
│   ├── 2.birnn-seq2seq-beam-luong.ipynb
│   ├── 3.conv-encoder-conv-decoder.ipynb
│   ├── 4.dilated-conv-encoder-dilated-conv-decoder.ipynb
│   ├── 5.dilated-conv-encoder-dilated-conv-self-attention.ipynb
│   ├── README.md
│   └── translation-tokenization.ipynb
├── entity-tagging
│   ├── 1.concat.ipynb
│   ├── 2.luong.ipynb
│   ├── 3.bahdanau.ipynb
│   ├── 4.bert-multilanguage-ner.ipynb
│   ├── 5.bert-base-ner.ipynb
│   ├── 6.bert-small-ner.ipynb
│   ├── 7.xlnet-base.ipynb
│   └── README.md
├── extractive-summarization
│   ├── 1.skip-thought.ipynb
│   ├── 2.residual-network-bahdanau.ipynb
│   └── README.md
├── long-text-classification
│   ├── 1.dilated-cnn.ipynb
│   ├── 2.wavenet.ipynb
│   ├── 3.relevancy-multilanguage.ipynb
│   ├── 4.relevancy-base.ipynb
│   └── README.md
├── normal-text-classification
│   ├── 1.bahdanau.ipynb
│   ├── 2.luong.ipynb
│   ├── 3.bert.ipynb
│   └── README.md
├── optical-character-recognition
│   ├── 1.cnn-rnn-ctc.ipynb
│   ├── 2.im2latex.ipynb
│   └── README.md
├── pos-tagging
│   ├── 1.bidirectional-lstm-crf.ipynb
│   ├── 2.bidirectional-lstm-crf-bahdanau.ipynb
│   ├── 3.bidirectional-lstm-crf-luong.ipynb
│   ├── 4.bert-bahasa-base-pos.ipynb
│   ├── 5.xlnet-bahasa-pos.ipynb
│   └── README.md
├── question-answer
│   ├── 1.end-to-end-gru.ipynb
│   ├── 2.dynamic-memory-gru.ipynb
│   └── README.md
├── semantic-similarity
│   ├── augmenting.ipynb
│   ├── bahdanau-contrastive.ipynb
│   ├── bert-crossentropy.ipynb
│   ├── dilated-cnn-contrastive.ipynb
│   └── self-attention-contrastive.ipynb
├── sparse-classification
│   ├── 1.fast-text-ngrams.ipynb
│   └── README.md
├── speech-to-text
│   ├── 1.birnn-lstm-greedy.ipynb
│   ├── 2.wavenet.ipynb
│   ├── 3.deep-speech-2.ipynb
│   ├── 4.dilated-cnn.ipynb
│   ├── 5.im2latex.ipynb
│   ├── README.md
│   ├── augmentation.ipynb
│   ├── augmentation.py
│   └── caching.py
├── spelling-correction
│   └── 1.bert.ipynb
├── stemming
│   ├── 1.seq2seq-lstm.ipynb
│   ├── 2.seq2seq-bahdanau.ipynb
│   ├── 3.seq2seq-luong.ipynb
│   └── README.md
├── text-to-speech
│   ├── 1.tacotron
│   │   ├── tacotron.ipynb
│   │   ├── tacotron.py
│   │   ├── test.wav
│   │   └── utils.py
│   ├── 2.seq2seq-bahdanau.ipynb
│   ├── 3.deep-cnn-monothonic-attention.ipynb
│   ├── README.md
│   ├── caching.py
│   └── utils.py
├── topic-generator
│   ├── 1.tat.ipynb
│   ├── 2.tav.ipynb
│   ├── 3.mta.ipynb
│   └── README.md
├── topic-modeling
│   ├── README.md
│   ├── business.json
│   ├── economy.json
│   ├── education.json
│   ├── lda2vec.ipynb
│   └── politics.json
└── word-vector
    ├── 1.word2vec.py
    ├── 2.elmo.ipynb
    ├── 3.fasttext.ipynb
    ├── README.md
    ├── load-word2vec.ipynb
    ├── make-corpus.py
    └── word2vec.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *_pycache__
2 | *.ipynb_checkpoints
3 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 HUSEIN ZOLKEPLI
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 | ---
11 |
12 | **Bahasa-NLP-Tensorflow** gathers Tensorflow deep learning models for Bahasa Malaysia NLP problems, **100% of the code simplified inside Jupyter Notebooks, datasets included**.
13 |
14 | ## Table of contents
15 | * [Augmentation](https://github.com/huseinzol05/Bahasa-NLP-Tensorflow#augmentation)
16 | * [Sparse classification](https://github.com/huseinzol05/Bahasa-NLP-Tensorflow#sparse-classification)
17 | * [Long-text classification](https://github.com/huseinzol05/Bahasa-NLP-Tensorflow#long-text-classification)
18 | * [Dependency Parsing](https://github.com/huseinzol05/Bahasa-NLP-Tensorflow#dependency-parsing)
19 | * [English-Malay Translation](https://github.com/huseinzol05/Bahasa-NLP-Tensorflow#english-malay-translation)
20 | * [Entity Tagging](https://github.com/huseinzol05/Bahasa-NLP-Tensorflow#entity-tagging)
21 | * [Abstractive Summarization](https://github.com/huseinzol05/Bahasa-NLP-Tensorflow#abstractive-summarization)
22 | * [Extractive Summarization](https://github.com/huseinzol05/Bahasa-NLP-Tensorflow#extractive-summarization)
23 | * [POS Tagging](https://github.com/huseinzol05/Bahasa-NLP-Tensorflow#pos-tagging)
24 | * [Optical Character Recognition](https://github.com/huseinzol05/Bahasa-NLP-Tensorflow#optical-character-recognition)
25 | * [Question-Answer](https://github.com/huseinzol05/Bahasa-NLP-Tensorflow#question-answer)
26 | * [Semantic Similarity](https://github.com/huseinzol05/Bahasa-NLP-Tensorflow#semantic-similarity)
27 | * [Speech to Text](https://github.com/huseinzol05/Bahasa-NLP-Tensorflow#speech-to-text)
28 | * [Stemming](https://github.com/huseinzol05/Bahasa-NLP-Tensorflow#stemming)
29 | * [Topic Generator](https://github.com/huseinzol05/Bahasa-NLP-Tensorflow#topic-generator)
30 | * [Text to Speech](https://github.com/huseinzol05/Bahasa-NLP-Tensorflow#text-to-speech)
31 | * [Topic Modeling](https://github.com/huseinzol05/Bahasa-NLP-Tensorflow#topic-modeling)
32 | * [Word Vector](https://github.com/huseinzol05/Bahasa-NLP-Tensorflow#word-vector)
33 |
34 | ### [Augmentation](augmentation)
35 |
36 | 1. word2vec Malaya
37 |
38 | ### [Sparse classification](sparse-classification)
39 |
40 | Trained on [Tatoeba dataset](http://downloads.tatoeba.org/exports/sentences.tar.bz2).
41 |
42 | 1. Fast-text Ngrams, test accuracy 88%
43 |
44 | ### [Normal-text classification](normal-text-classification)
45 |
46 | Trained on [Bahasa subjectivity dataset](https://github.com/huseinzol05/Malaya-Dataset/tree/master/subjectivity).
47 |
48 | 1. RNN LSTM + Bahdanau Attention, test accuracy 84%
49 | 2. RNN LSTM + Luong Attention, test accuracy 82%
50 | 3. Transfer-learning Multilanguage BERT, test accuracy 94.88%
51 |
52 | 70+ more models can be found [here](https://github.com/huseinzol05/NLP-Models-Tensorflow/tree/master/text-classification).
53 |
54 | ### [Long-text classification](long-text-classification)
55 |
56 | Trained on [Bahasa fakenews dataset](https://github.com/huseinzol05/Malaya-Dataset/tree/master/fake-news).
57 |
58 | 1. Dilated CNN, test accuracy 74%
59 | 2. Wavenet, test accuracy 68%
60 | 3. BERT Multilanguage, test accuracy 85%
61 | 4. BERT-Bahasa Base, test accuracy 88%
62 |
63 | ### [Dependency Parsing](dependency-parsing)
64 |
65 | Trained on [Bahasa dependency parsing dataset](https://github.com/huseinzol05/Malaya-Dataset/tree/master/dependency). 80% to train, 20% to test.
66 |
67 | Accuracies reported are arc, types and root accuracy, measured after only 10 epochs of training.
68 |
69 | 1. Bidirectional RNN + CRF + Biaffine, arc accuracy 60.64%, types accuracy 58.68%, root accuracy 89.03%
70 | 2. Bidirectional RNN + Bahdanau + CRF + Biaffine, arc accuracy 60.51%, types accuracy 59.01%, root accuracy 88.99%
71 | 3. Bidirectional RNN + Luong + CRF + Biaffine, arc accuracy 60.60%, types accuracy 59.06%, root accuracy 89.76%
72 | 4. BERT Base + CRF + Biaffine, arc accuracy 58.55%, types accuracy 58.12%, root accuracy 88.87%
73 | 5. Bidirectional RNN + Biaffine Attention + Cross Entropy, arc accuracy 69.53%, types accuracy 65.38%, root accuracy 90.71%
74 | 6. BERT Base + Biaffine Attention + Cross Entropy, arc accuracy 77.03%, types accuracy 66.73%, root accuracy 88.38%
75 | 7. XLNET Base + Biaffine Attention + Cross Entropy, arc accuracy 93.50%, types accuracy 92.48%, root accuracy 94.46%
76 |
77 | ### [English-Malay Translation](english-malay-translation)
78 |
79 | Trained on [100k english-malay dataset](https://github.com/huseinzol05/Malaya-Dataset/tree/master/english-malay).
80 |
81 | 1. Attention Is All You Need, train accuracy 19.09%, test accuracy 20.38%
82 | 2. BiRNN Seq2Seq Luong Attention, Beam decoder, train accuracy 45.2%, test accuracy 37.26%
83 | 3. Convolution Encoder Decoder, train accuracy 35.89%, test accuracy 30.65%
84 | 4. Dilated Convolution Encoder Decoder, train accuracy 82.3%, test accuracy 56.72%
85 | 5. Dilated Convolution Encoder Decoder Self-Attention, train accuracy 60.76%, test accuracy 36.59%
86 |
87 | ### [Entity Tagging](entity-tagging)
88 |
89 | Trained on [Bahasa entity dataset](https://github.com/huseinzol05/Malaya-Dataset/tree/master/entities).
90 |
91 | 1. Bidirectional LSTM + CRF, test accuracy 95.10%
92 | 2. Bidirectional LSTM + CRF + Bahdanau, test accuracy 94.34%
93 | 3. Bidirectional LSTM + CRF + Luong, test accuracy 94.84%
94 | 4. BERT Multilanguage, test accuracy 96.43%
95 | 5. BERT-Bahasa Base, test accuracy 98.11%
96 | 6. BERT-Bahasa Small, test accuracy 98.47%
97 | 7. XLNET-Bahasa Base, test accuracy 98.008%
98 |
99 | ### [POS Tagging](pos-tagging)
100 |
101 | Trained on [Bahasa part-of-speech dataset](https://github.com/huseinzol05/Malaya-Dataset/tree/master/part-of-speech).
102 |
103 | 1. Bidirectional LSTM + CRF
104 | 2. Bidirectional LSTM + CRF + Bahdanau
105 | 3. Bidirectional LSTM + CRF + Luong
106 | 4. Bert-Bahasa-Base + CRF, test accuracy 95.17%
107 | 5. XLNET-Bahasa-Base + CRF, test accuracy 95.58%
108 |
109 | ### [Abstractive Summarization](abstractive-summarization)
110 |
111 | Trained on [Malaysia news dataset](https://github.com/huseinzol05/Malaya-Dataset#30k-news).
112 |
113 | Accuracy reported is ROUGE-2, measured after only 20 epochs of training.
114 |
115 | 1. Dilated Seq2Seq, test accuracy 23.926%
116 | 2. Pointer Generator + Bahdanau Attention, test accuracy 15.839%
117 | 3. Pointer Generator + Luong Attention, test accuracy 26.23%
118 | 4. Dilated Seq2Seq + Pointer Generator, test accuracy 20.382%
119 | 5. BERT Multilanguage + Dilated CNN Seq2seq + Pointer Generator, test accuracy 23.7134%
120 |
121 | ### [Extractive Summarization](extractive-summarization)
122 |
123 | Trained on [Malaysia news dataset](https://github.com/huseinzol05/Malaya-Dataset/tree/master/news).
124 |
125 | 1. Skip-thought
126 | 2. Residual Network + Bahdanau Attention
127 |
128 | ### [Optical Character Recognition](optical-character-recognition)
129 |
130 | Trained on [OCR Jawi to Malay dataset](https://github.com/huseinzol05/Malaya-Dataset/tree/master/ocr#malay-to-jawi).
131 |
132 | 1. CNN + LSTM RNN, test accuracy 63.86%
133 | 2. Im2Latex, test accuracy 89.38%
134 |
135 | ### [Question-Answer](question-answer)
136 |
137 | Trained on [Bahasa QA dataset](https://github.com/huseinzol05/Malaya-Dataset/tree/master/question-answer).
138 |
139 | 1. End-to-End + GRU, test accuracy 89.17%
140 | 2. Dynamic Memory + GRU, test accuracy 98.86%
141 |
142 | ### [Semantic Similarity](semantic-similarity)
143 |
144 | Trained on [Translated Duplicated Quora question dataset](https://github.com/huseinzol05/Malaya-Dataset/tree/master/text-similarity/quora).
145 |
146 | 1. LSTM Bahdanau + Contrastive loss, test accuracy 79%
147 | 2. Dilated CNN + Contrastive loss, test accuracy 77%
148 | 3. Self-Attention + Contrastive loss, test accuracy 77%
149 | 4. BERT + Cross entropy, test accuracy 83%
150 |
151 | ### [Speech to Text](speech-to-text)
152 |
153 | Trained on [Kamus speech dataset](https://github.com/huseinzol05/Malaya-Dataset/tree/master/speech).
154 |
155 | 1. BiRNN + LSTM + CTC Greedy, test accuracy 72.03%
156 | 2. Wavenet, test accuracy 10.21%
157 | 3. Deep speech 2, test accuracy 56.51%
158 | 4. Dilated-CNN, test accuracy 59.31%
159 | 5. Im2Latex, test accuracy 58.59%
160 |
161 | ### [Text to Speech](text-to-speech)
162 |
163 | 1. Tacotron
164 | 2. Seq2Seq + Bahdanau Attention
165 | 3. Deep CNN + Monotonic Attention + Dilated CNN vocoder
166 |
167 | ### [Stemming](stemming)
168 |
169 | Trained on [stemming dataset](https://github.com/huseinzol05/Malaya-Dataset/tree/master/stemmer).
170 |
171 | 1. Seq2seq + Beam decoder
172 | 2. Seq2seq + Bahdanau Attention + Beam decoder
173 | 3. Seq2seq + Luong Attention + Beam decoder
174 |
175 | ### [Topic Generator](topic-generator)
176 |
177 | Trained on [Malaysia news dataset](https://github.com/huseinzol05/Malaya-Dataset/tree/master/news).
178 |
179 | 1. TAT-LSTM, test accuracy 32.89%
180 | 2. TAV-LSTM, test accuracy 40.69%
181 | 3. MTA-LSTM, test accuracy 32.96%
182 |
183 | ### [Topic Modeling](topic-modeling)
184 |
185 | 1. Lda2Vec
186 |
187 | ### [Word Vector](word-vector)
188 |
189 | 1. word2vec (a minimal loading sketch for the pretrained checkpoint is shown below)
190 | 2. ELMo
191 | 3. Fast-text
192 |
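193 | The pretrained word2vec checkpoint used elsewhere in this repository (for example in [augmentation/1.word2vec.ipynb](augmentation/1.word2vec.ipynb)) is a pickled dictionary; a minimal loading sketch:
194 | 
195 | ```python
196 | import pickle
197 | 
198 | # keys: 'dictionary' (word -> id), 'rev_dictionary' (id -> word), 'nce_weights' (embedding matrix)
199 | with open('word2vec-wiki-nce-256.p', 'rb') as fopen:
200 |     word2vec = pickle.load(fopen)
201 | 
202 | vectors = word2vec['nce_weights']
203 | print(vectors.shape)  # e.g. (vocabulary size, 256)
204 | ```
205 | 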
--------------------------------------------------------------------------------
/abstractive-summarization/README.md:
--------------------------------------------------------------------------------
1 | ## How-to
2 |
3 | 1. Download dataset,
4 | ```bash
5 | wget https://raw.githubusercontent.com/huseinzol05/Malaya-Dataset/master/news-30k/news-30k.json.zip
6 | ```
7 |
8 | 2. Unzip the dataset,
9 | ```bash
10 | unzip news-30k.json.zip
11 | ```
12 |
13 | 3. Run any notebook using Jupyter Notebook.
14 |
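15 | Optionally, sanity-check the unzipped dataset before training (a minimal sketch; the filename inside the archive is assumed to be `news-30k.json` and the JSON is assumed to be a list of records):
16 | 
17 | ```python
18 | import json
19 | 
20 | # assumed output of unzipping news-30k.json.zip
21 | with open('news-30k.json') as fopen:
22 |     news = json.load(fopen)
23 | 
24 | print(len(news))  # number of articles available for summarization
25 | ```
26 | 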
--------------------------------------------------------------------------------
/augmentation/1.word2vec.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## 1. Download pretrained word2vec from malaya"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [
15 | {
16 | "name": "stdout",
17 | "output_type": "stream",
18 | "text": [
19 | "--2019-05-11 18:41:18-- https://s3-ap-southeast-1.amazonaws.com/huseinhouse-storage/v13/word2vec/word2vec-wiki-nce-256.p\n",
20 | "Resolving s3-ap-southeast-1.amazonaws.com (s3-ap-southeast-1.amazonaws.com)... 52.219.32.81\n",
21 | "Connecting to s3-ap-southeast-1.amazonaws.com (s3-ap-southeast-1.amazonaws.com)|52.219.32.81|:443... connected.\n",
22 | "HTTP request sent, awaiting response... 200 OK\n",
23 | "Length: 726360284 (693M) [application/x-www-form-urlencoded]\n",
24 | "Saving to: ‘word2vec-wiki-nce-256.p’\n",
25 | "\n",
26 | "word2vec-wiki-nce-2 100%[===================>] 692.71M 3.60MB/s in 3m 11s \n",
27 | "\n",
28 | "2019-05-11 18:44:29 (3.62 MB/s) - ‘word2vec-wiki-nce-256.p’ saved [726360284/726360284]\n",
29 | "\n"
30 | ]
31 | }
32 | ],
33 | "source": [
34 | "!wget https://s3-ap-southeast-1.amazonaws.com/huseinhouse-storage/v13/word2vec/word2vec-wiki-nce-256.p"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": 2,
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "import pickle\n",
44 | "import tensorflow as tf"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 3,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "with open('word2vec-wiki-nce-256.p', 'rb') as fopen:\n",
54 | " word2vec = pickle.load(fopen)"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": 4,
60 | "metadata": {},
61 | "outputs": [
62 | {
63 | "data": {
64 | "text/plain": [
65 | "dict_keys(['rev_dictionary', 'nce_weights', 'dictionary'])"
66 | ]
67 | },
68 | "execution_count": 4,
69 | "metadata": {},
70 | "output_type": "execute_result"
71 | }
72 | ],
73 | "source": [
74 | "word2vec.keys()"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": 5,
80 | "metadata": {},
81 | "outputs": [
82 | {
83 | "data": {
84 | "text/plain": [
85 | "1187"
86 | ]
87 | },
88 | "execution_count": 5,
89 | "metadata": {},
90 | "output_type": "execute_result"
91 | }
92 | ],
93 | "source": [
94 | "word2vec['dictionary']['ayam']"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": 13,
100 | "metadata": {},
101 | "outputs": [],
102 | "source": [
103 | "vectors = word2vec['nce_weights']\n",
104 | "dictionary = word2vec['dictionary']\n",
105 | "rev_dictionary = word2vec['rev_dictionary']"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": 14,
111 | "metadata": {},
112 | "outputs": [],
113 | "source": [
114 | "class Model:\n",
115 | " def __init__(self):\n",
116 | " self._embedding = tf.convert_to_tensor(vectors, dtype = tf.float32)\n",
117 | " self.X = tf.placeholder(\n",
118 | " tf.float32, [None, vectors.shape[1]]\n",
119 | " )\n",
120 | " normed_embedding = tf.nn.l2_normalize(self._embedding, axis = 1)\n",
121 | " normed_array = tf.nn.l2_normalize(self.X, axis = 1)\n",
122 | " self.cosine_similarity = tf.matmul(\n",
123 | " normed_array, tf.transpose(normed_embedding, [1, 0])\n",
124 | " )"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": 15,
130 | "metadata": {},
131 | "outputs": [
132 | {
133 | "name": "stderr",
134 | "output_type": "stream",
135 | "text": [
136 | "/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py:1702: UserWarning: An interactive session is already active. This can cause out-of-memory errors in some cases. You must explicitly call `InteractiveSession.close()` to release resources held by the other session(s).\n",
137 | " warnings.warn('An interactive session is already active. This can '\n"
138 | ]
139 | }
140 | ],
141 | "source": [
142 | "tf.reset_default_graph()\n",
143 | "model = Model()\n",
144 | "sess = tf.InteractiveSession()"
145 | ]
146 | },
147 | {
148 | "cell_type": "code",
149 | "execution_count": 25,
150 | "metadata": {},
151 | "outputs": [],
152 | "source": [
153 | "string_positive = 'kerajaan sebenarnya sangat prihatin dengan rakyatnya dapat diskaun dan segalanya'\n",
154 | "string_negative = 'minyak naik lagi harga klau gaji naik juga takpe juga'"
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": 12,
160 | "metadata": {},
161 | "outputs": [],
162 | "source": [
163 | "import random"
164 | ]
165 | },
166 | {
167 | "cell_type": "code",
168 | "execution_count": 16,
169 | "metadata": {},
170 | "outputs": [],
171 | "source": [
172 | "def augmentation(string, threshold = 0.5, count = 5, k = 8):\n",
173 | " string = string.split()\n",
174 | " selected = []\n",
175 | " while not len(selected):\n",
176 | " selected = [(no, w) for no, w in enumerate(string) if random.random() > threshold]\n",
177 | " indices, words = [i[0] for i in selected], [i[1] for i in selected]\n",
178 | " \n",
179 | " batches = vectors[[dictionary[w] for w in words]]\n",
180 | " top_k = tf.nn.top_k(model.cosine_similarity, k = k)\n",
181 | " results = sess.run(top_k, feed_dict = {model.X: batches})\n",
182 | " words = []\n",
183 | " for result in results.indices:\n",
184 | " words.append([rev_dictionary[i] for i in result])\n",
185 | " augmented = []\n",
186 | " for i in range(count):\n",
187 | " string_ = string[:]\n",
188 | " for no in range(len(words)):\n",
189 | " index = random.randint(0, len(words[no]) - 1)\n",
190 | " string_[indices[no]] = words[no][index]\n",
191 | " augmented.append(' '.join(string_))\n",
192 | " return augmented "
193 | ]
194 | },
195 | {
196 | "cell_type": "code",
197 | "execution_count": 21,
198 | "metadata": {},
199 | "outputs": [
200 | {
201 | "name": "stdout",
202 | "output_type": "stream",
203 | "text": [
204 | "CPU times: user 8.54 s, sys: 692 ms, total: 9.23 s\n",
205 | "Wall time: 9.28 s\n"
206 | ]
207 | },
208 | {
209 | "data": {
210 | "text/plain": [
211 | "['pemerintah sememangnya sangat prihatin kerana warganya dapat sewaan hingga nasibnya',\n",
212 | " 'pemerintah ternyata sangat prihatin untuk kaumnya dapat keselanjarannya serta segalanya',\n",
213 | " 'perlembagaan sebenarnya sangat prihatin selepas penyokongnya dapat sewaan dan laguku',\n",
214 | " 'kabinet sebenarnya sangat prihatin dalam penduduknya dapat taksiran dan nasibnya',\n",
215 | " 'kerajaan memang sangat prihatin dengan orangnya dapat milodon tetapi nasibnya']"
216 | ]
217 | },
218 | "execution_count": 21,
219 | "metadata": {},
220 | "output_type": "execute_result"
221 | }
222 | ],
223 | "source": [
224 | "%%time\n",
225 | "augmentation(string_positive)"
226 | ]
227 | },
228 | {
229 | "cell_type": "markdown",
230 | "metadata": {},
231 | "source": [
232 | "## Malaya implementation\n",
233 | "\n",
234 | "You can check stable text augmentation implementation at https://malaya.readthedocs.io/en/latest/Generator.html"
235 | ]
236 | },
237 | {
238 | "cell_type": "code",
239 | "execution_count": null,
240 | "metadata": {},
241 | "outputs": [],
242 | "source": []
243 | }
244 | ],
245 | "metadata": {
246 | "kernelspec": {
247 | "display_name": "Python 3",
248 | "language": "python",
249 | "name": "python3"
250 | },
251 | "language_info": {
252 | "codemirror_mode": {
253 | "name": "ipython",
254 | "version": 3
255 | },
256 | "file_extension": ".py",
257 | "mimetype": "text/x-python",
258 | "name": "python",
259 | "nbconvert_exporter": "python",
260 | "pygments_lexer": "ipython3",
261 | "version": "3.6.8"
262 | }
263 | },
264 | "nbformat": 4,
265 | "nbformat_minor": 2
266 | }
267 |
--------------------------------------------------------------------------------
/cintailah-bahasa-malaysia-menggunakan-tensorflow.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huseinzol05/Bahasa-NLP-Tensorflow/4e6427230e36c2d79ec951c7f2c3501bf75f9a8a/cintailah-bahasa-malaysia-menggunakan-tensorflow.jpg
--------------------------------------------------------------------------------
/dependency-parsing/README.md:
--------------------------------------------------------------------------------
1 | ## How-to
2 |
3 | 1. Run any notebook using Jupyter Notebook.
4 |
--------------------------------------------------------------------------------
/english-malay-translation/README.md:
--------------------------------------------------------------------------------
1 | ## How-to
2 |
3 | 1. Clone [Malaya-Dataset](https://github.com/huseinzol05/Malaya-Dataset)
4 |
5 | ```bash
6 | git clone https://github.com/huseinzol05/Malaya-Dataset.git
7 | ```
8 |
9 | 2. Run [translation-tokenization.ipynb](translation-tokenization.ipynb) for tokenization process.
10 |
11 | 3. Run any notebook using Jupyter Notebook.
12 |
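13 | Step 2 writes `english-malay.json` (100k tokenized sentence pairs) and `dictionary.json` (the English and Bahasa vocabularies) next to the notebook; a minimal sketch to verify them before running the training notebooks:
14 | 
15 | ```python
16 | import json
17 | 
18 | # both files are produced by translation-tokenization.ipynb
19 | with open('english-malay.json') as fopen:
20 |     english, bahasa = json.load(fopen)
21 | 
22 | with open('dictionary.json') as fopen:
23 |     dictionaries = json.load(fopen)
24 | 
25 | print(len(english), len(bahasa))  # 100000 100000
26 | print(len(dictionaries['english']['dictionary']), len(dictionaries['bahasa']['dictionary']))
27 | ```
28 | 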
--------------------------------------------------------------------------------
/english-malay-translation/translation-tokenization.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import json\n",
10 | "import glob\n",
11 | "import malaya\n",
12 | "from unidecode import unidecode\n",
13 | "import re"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": 2,
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "tokenizer = malaya.preprocessing._SocialTokenizer().tokenize\n",
23 | "rules_normalizer = malaya.texts._tatabahasa.rules_normalizer\n",
24 | "\n",
25 | "def is_number_regex(s):\n",
26 | " if re.match(\"^\\d+?\\.\\d+?$\", s) is None:\n",
27 | " return s.isdigit()\n",
28 | " return True\n",
29 | "\n",
30 | "def detect_money(word):\n",
31 | " if word[:2] == 'rm' and is_number_regex(word[2:]):\n",
32 | " return True\n",
33 | " else:\n",
34 | " return False\n",
35 | "\n",
36 | "def preprocessing(string):\n",
37 | " tokenized = tokenizer(unidecode(string))\n",
38 | " tokenized = [w.lower() for w in tokenized if len(w) > 1]\n",
39 | " tokenized = ['' if is_number_regex(w) else w for w in tokenized]\n",
40 | " tokenized = ['' if detect_money(w) else w for w in tokenized]\n",
41 | " return tokenized"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": 3,
47 | "metadata": {},
48 | "outputs": [],
49 | "source": [
50 | "english, bahasa = [], []\n",
51 | "\n",
52 | "files = glob.glob('Malaya-Dataset/english-malay/*.json')\n",
53 | "for file in files:\n",
54 | " with open(file) as fopen:\n",
55 | " x = json.load(fopen)\n",
56 | " for l, r in x:\n",
57 | " english.append(l)\n",
58 | " bahasa.append(r)"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 4,
64 | "metadata": {},
65 | "outputs": [
66 | {
67 | "data": {
68 | "text/plain": [
69 | "(332964, 332964)"
70 | ]
71 | },
72 | "execution_count": 4,
73 | "metadata": {},
74 | "output_type": "execute_result"
75 | }
76 | ],
77 | "source": [
78 | "len(english), len(bahasa)"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": 5,
84 | "metadata": {},
85 | "outputs": [
86 | {
87 | "name": "stderr",
88 | "output_type": "stream",
89 | "text": [
90 | "100%|██████████| 332964/332964 [01:37<00:00, 3412.79it/s]\n"
91 | ]
92 | }
93 | ],
94 | "source": [
95 | "from tqdm import tqdm\n",
96 | "\n",
97 | "x, y = [], []\n",
98 | "for i in tqdm(range(len(english))):\n",
99 | " p = preprocessing(english[i])\n",
100 | " u = preprocessing(bahasa[i])\n",
101 | " if len(p) <= 100 and len(p) > 3 and len(u) > 3:\n",
102 | " x.append(p)\n",
103 | " y.append(u)"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": 6,
109 | "metadata": {},
110 | "outputs": [
111 | {
112 | "data": {
113 | "text/plain": [
114 | "(321749, 321749)"
115 | ]
116 | },
117 | "execution_count": 6,
118 | "metadata": {},
119 | "output_type": "execute_result"
120 | }
121 | ],
122 | "source": [
123 | "len(x), len(y)"
124 | ]
125 | },
126 | {
127 | "cell_type": "markdown",
128 | "metadata": {},
129 | "source": [
130 | "## Limit to 100k only, too big"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": 7,
136 | "metadata": {},
137 | "outputs": [],
138 | "source": [
139 | "english = x[:100000]\n",
140 | "bahasa = y[:100000]"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": 8,
146 | "metadata": {},
147 | "outputs": [],
148 | "source": [
149 | "import collections\n",
150 | "\n",
151 | "def build_dataset(words, n_words, atleast=1):\n",
152 | " count = [['PAD', 0], ['GO', 1], ['EOS', 2], ['UNK', 3]]\n",
153 | " counter = collections.Counter(words).most_common(n_words)\n",
154 | " counter = [i for i in counter if i[1] >= atleast]\n",
155 | " count.extend(counter)\n",
156 | " dictionary = dict()\n",
157 | " for word, _ in count:\n",
158 | " dictionary[word] = len(dictionary)\n",
159 | " data = list()\n",
160 | " unk_count = 0\n",
161 | " for word in words:\n",
162 | " index = dictionary.get(word, 0)\n",
163 | " if index == 0:\n",
164 | " unk_count += 1\n",
165 | " data.append(index)\n",
166 | " count[0][1] = unk_count\n",
167 | " reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))\n",
168 | " return data, count, dictionary, reversed_dictionary"
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": 9,
174 | "metadata": {},
175 | "outputs": [
176 | {
177 | "name": "stdout",
178 | "output_type": "stream",
179 | "text": [
180 | "vocab from size: 35547\n",
181 | "Most common words [('the', 132997), ('and', 67192), ('of', 67190), ('to', 65903), ('in', 44747), ('that', 41188)]\n",
182 | "Sample data [12, 10, 8, 54, 592, 9, 30, 6, 66, 17] ['it', 'is', 'in', 'their', 'interest', 'that', 'all', 'of', 'us', 'are']\n"
183 | ]
184 | }
185 | ],
186 | "source": [
187 | "import itertools\n",
188 | "\n",
189 | "concat = list(itertools.chain(*english))\n",
190 | "vocabulary_size_english = len(list(set(concat)))\n",
191 | "data, count, dictionary_english, rev_dictionary_english = build_dataset(concat, vocabulary_size_english)\n",
192 | "print('vocab from size: %d'%(vocabulary_size_english))\n",
193 | "print('Most common words', count[4:10])\n",
194 | "print('Sample data', data[:10], [rev_dictionary_english[i] for i in data[:10]])"
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": 10,
200 | "metadata": {},
201 | "outputs": [
202 | {
203 | "name": "stdout",
204 | "output_type": "stream",
205 | "text": [
206 | "vocab from size: 27434\n",
207 | "Most common words [('yang', 78745), ('dan', 67280), ('untuk', 37803), ('saya', 37802), ('ini', 31480), ('di', 31300)]\n",
208 | "Sample data [20, 15, 298, 16, 17, 10, 40, 15, 10247, 10] ['ia', 'adalah', 'kepentingan', 'mereka', 'bahawa', 'kita', 'semua', 'adalah', 'idiot', 'kita']\n"
209 | ]
210 | }
211 | ],
212 | "source": [
213 | "concat = list(itertools.chain(*bahasa))\n",
214 | "vocabulary_size_bahasa = len(list(set(concat)))\n",
215 | "data, count, dictionary_bahasa, rev_dictionary_bahasa = build_dataset(concat, vocabulary_size_bahasa)\n",
216 | "print('vocab from size: %d'%(vocabulary_size_bahasa))\n",
217 | "print('Most common words', count[4:10])\n",
218 | "print('Sample data', data[:10], [rev_dictionary_bahasa[i] for i in data[:10]])"
219 | ]
220 | },
221 | {
222 | "cell_type": "code",
223 | "execution_count": 11,
224 | "metadata": {},
225 | "outputs": [],
226 | "source": [
227 | "with open('dictionary.json', 'w') as fopen:\n",
228 | " json.dump({'english':{'dictionary': dictionary_english,\n",
229 | " 'rev_dictionary': rev_dictionary_english},\n",
230 | " 'bahasa':{\n",
231 | " 'dictionary': dictionary_bahasa,\n",
232 | " 'rev_dictionary': rev_dictionary_bahasa\n",
233 | " }}, fopen)"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": 12,
239 | "metadata": {},
240 | "outputs": [],
241 | "source": [
242 | "with open('english-malay.json', 'w') as fopen:\n",
243 | " json.dump([english, bahasa], fopen)"
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": null,
249 | "metadata": {},
250 | "outputs": [],
251 | "source": []
252 | }
253 | ],
254 | "metadata": {
255 | "kernelspec": {
256 | "display_name": "Python 3",
257 | "language": "python",
258 | "name": "python3"
259 | },
260 | "language_info": {
261 | "codemirror_mode": {
262 | "name": "ipython",
263 | "version": 3
264 | },
265 | "file_extension": ".py",
266 | "mimetype": "text/x-python",
267 | "name": "python",
268 | "nbconvert_exporter": "python",
269 | "pygments_lexer": "ipython3",
270 | "version": "3.6.8"
271 | }
272 | },
273 | "nbformat": 4,
274 | "nbformat_minor": 2
275 | }
276 |
--------------------------------------------------------------------------------
/entity-tagging/README.md:
--------------------------------------------------------------------------------
1 | ## How-to
2 |
3 | 1. Download dataset from here, https://github.com/huseinzol05/Malaya-Dataset/tree/master/entities
4 |
5 | 2. Run any notebook using Jupyter Notebook.
6 |
7 | **For more models, you can check in https://github.com/huseinzol05/NLP-Models-Tensorflow/tree/master/entity-tagging, but the dataset is not Bahasa Malaysia**
8 |
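9 | One way to complete step 1 is to clone the whole [Malaya-Dataset](https://github.com/huseinzol05/Malaya-Dataset) repository, as in the translation section; the entity files then sit under `Malaya-Dataset/entities`:
10 | 
11 | ```bash
12 | git clone https://github.com/huseinzol05/Malaya-Dataset.git
13 | ls Malaya-Dataset/entities
14 | ```
15 | 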
--------------------------------------------------------------------------------
/extractive-summarization/1.skip-thought.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import os\n",
10 | "import re\n",
11 | "import tensorflow as tf\n",
12 | "import numpy as np\n",
13 | "import json\n",
14 | "import random"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {},
21 | "outputs": [
22 | {
23 | "data": {
24 | "text/plain": [
25 | "123"
26 | ]
27 | },
28 | "execution_count": 2,
29 | "metadata": {},
30 | "output_type": "execute_result"
31 | }
32 | ],
33 | "source": [
34 | "labels = os.listdir('news')\n",
35 | "news = ['news/' + i for i in labels if '.json' in i]\n",
36 | "labels = [i.replace('.json','') for i in labels]\n",
37 | "len(news)"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 3,
43 | "metadata": {},
44 | "outputs": [],
45 | "source": [
46 | "import malaya\n",
47 | "tokenizer = malaya.preprocessing._SocialTokenizer().tokenize\n",
48 | "split_sentence = malaya.texts._text_functions.split_into_sentences"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 4,
54 | "metadata": {},
55 | "outputs": [],
56 | "source": [
57 | "accept_tokens = ',-.()\"\\''\n",
58 | "\n",
59 | "def is_number_regex(s):\n",
60 | " if re.match(\"^\\d+?\\.\\d+?$\", s) is None:\n",
61 | " return s.isdigit()\n",
62 | " return True\n",
63 | "\n",
64 | "def detect_money(word):\n",
65 | " if word[:2] == 'rm' and is_number_regex(word[2:]):\n",
66 | " return True\n",
67 | " else:\n",
68 | " return False\n",
69 | "\n",
70 | "def preprocessing(string):\n",
71 | " splitted = split_sentence(string)\n",
72 | " for i, string in enumerate(splitted):\n",
73 | " tokenized = tokenizer(string)\n",
74 | " tokenized = [w.lower() for w in tokenized if len(w) > 1]\n",
75 | " tokenized = ['' if is_number_regex(w) else w for w in tokenized]\n",
76 | " tokenized = ['' if detect_money(w) else w for w in tokenized]\n",
77 | " splitted[i] = tokenized\n",
78 | " return splitted"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": 5,
84 | "metadata": {},
85 | "outputs": [
86 | {
87 | "data": {
88 | "text/plain": [
89 | "263638"
90 | ]
91 | },
92 | "execution_count": 5,
93 | "metadata": {},
94 | "output_type": "execute_result"
95 | }
96 | ],
97 | "source": [
98 | "min_len = 20\n",
99 | "x = []\n",
100 | "for no, n in enumerate(news):\n",
101 | " with open(n) as fopen: \n",
102 | " news_ = json.load(fopen)\n",
103 | " for row in news_:\n",
104 | " if len(row['text'].split()) > min_len:\n",
105 | " p = preprocessing(row['text'])\n",
106 | " x.extend(p)\n",
107 | " \n",
108 | "len(x)"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": 6,
114 | "metadata": {},
115 | "outputs": [],
116 | "source": [
117 | "x = random.sample(x, 10000)"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": 7,
123 | "metadata": {},
124 | "outputs": [],
125 | "source": [
126 | "import collections\n",
127 | "\n",
128 | "def batch_sequence(sentences, dictionary, maxlen = 50):\n",
129 | " np_array = np.zeros((len(sentences), maxlen), dtype = np.int32)\n",
130 | " for no_sentence, sentence in enumerate(sentences):\n",
131 | " current_no = 0\n",
132 | " for no, word in enumerate(sentence[: maxlen - 2]):\n",
133 | " np_array[no_sentence, no] = dictionary.get(word, 1)\n",
134 | " current_no = no\n",
135 | " np_array[no_sentence, current_no + 1] = 3\n",
136 | " return np_array\n",
137 | "\n",
138 | "def build_dataset(words, n_words, atleast=2):\n",
139 | " count = [['PAD', 0], ['GO', 1], ['EOS', 2], ['UNK', 3]]\n",
140 | " counter = collections.Counter(words).most_common(n_words)\n",
141 | " counter = [i for i in counter if i[1] >= atleast]\n",
142 | " count.extend(counter)\n",
143 | " dictionary = dict()\n",
144 | " for word, _ in count:\n",
145 | " dictionary[word] = len(dictionary)\n",
146 | " data = list()\n",
147 | " unk_count = 0\n",
148 | " for word in words:\n",
149 | " index = dictionary.get(word, 0)\n",
150 | " if index == 0:\n",
151 | " unk_count += 1\n",
152 | " data.append(index)\n",
153 | " count[0][1] = unk_count\n",
154 | " reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))\n",
155 | " return data, count, dictionary, reversed_dictionary"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": 8,
161 | "metadata": {},
162 | "outputs": [],
163 | "source": [
164 | "import itertools\n",
165 | "\n",
166 | "X = list(itertools.chain(*x))"
167 | ]
168 | },
169 | {
170 | "cell_type": "code",
171 | "execution_count": 9,
172 | "metadata": {},
173 | "outputs": [
174 | {
175 | "data": {
176 | "text/plain": [
177 | "24667"
178 | ]
179 | },
180 | "execution_count": 9,
181 | "metadata": {},
182 | "output_type": "execute_result"
183 | }
184 | ],
185 | "source": [
186 | "maxlen = 50\n",
187 | "vocabulary_size = len(set(X))\n",
188 | "embedding_size = 256\n",
189 | "learning_rate = 1e-3\n",
190 | "batch_size = 16\n",
191 | "vocabulary_size"
192 | ]
193 | },
194 | {
195 | "cell_type": "code",
196 | "execution_count": 10,
197 | "metadata": {},
198 | "outputs": [
199 | {
200 | "data": {
201 | "text/plain": [
202 | "(9998, 9998, 9998)"
203 | ]
204 | },
205 | "execution_count": 10,
206 | "metadata": {},
207 | "output_type": "execute_result"
208 | }
209 | ],
210 | "source": [
211 | "from sklearn.utils import shuffle\n",
212 | "\n",
213 | "stride = 1\n",
214 | "t_range = int((len(x) - 3) / stride + 1)\n",
215 | "left, middle, right = [], [], []\n",
216 | "for i in range(t_range):\n",
217 | " slices = x[i * stride : i * stride + 3]\n",
218 | " left.append(slices[0])\n",
219 | " middle.append(slices[1])\n",
220 | " right.append(slices[2])\n",
221 | "\n",
222 | "left, middle, right = shuffle(left, middle, right)\n",
223 | "len(left), len(middle), len(right)"
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": 11,
229 | "metadata": {},
230 | "outputs": [
231 | {
232 | "name": "stdout",
233 | "output_type": "stream",
234 | "text": [
235 | "vocab from size: 24667\n",
236 | "Most common words [('yang', 3919), ('the', 3820), ('dan', 3525), ('', 2603), ('di', 2357), ('ini', 1876)]\n",
237 | "Sample data [2292, 173, 1674, 2485, 12, 1859, 337, 0, 5462, 356] ['meanwhile', 'sabah', 'tourism', 'culture', 'and', 'environment', 'minister', 'PAD', 'liew', 'when']\n",
238 | "filtered vocab size: 12246\n",
239 | "% of vocab used: 49.65%\n"
240 | ]
241 | }
242 | ],
243 | "source": [
244 | "concat = X\n",
245 | "vocabulary_size = len(list(set(concat)))\n",
246 | "data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)\n",
247 | "print('vocab from size: %d'%(vocabulary_size))\n",
248 | "print('Most common words', count[4:10])\n",
249 | "print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])\n",
250 | "print('filtered vocab size:',len(dictionary))\n",
251 | "print(\"% of vocab used: {}%\".format(round(len(dictionary)/vocabulary_size,4)*100))"
252 | ]
253 | },
254 | {
255 | "cell_type": "code",
256 | "execution_count": 12,
257 | "metadata": {},
258 | "outputs": [],
259 | "source": [
260 | "class Model:\n",
261 | " def __init__(self,maxlen=50, \n",
262 | " vocabulary_size=20000,\n",
263 | " learning_rate=1e-3,\n",
264 | " embedding_size = 256):\n",
265 | " self.output_size = embedding_size\n",
266 | " self.maxlen = maxlen\n",
267 | " word_embeddings = tf.Variable(\n",
268 | " tf.random_uniform(\n",
269 | " [vocabulary_size, embedding_size], -np.sqrt(3), np.sqrt(3)\n",
270 | " )\n",
271 | " )\n",
272 | " self.global_step = tf.get_variable(\n",
273 | " \"global_step\", shape=[], trainable=False,\n",
274 | " initializer=tf.initializers.zeros())\n",
275 | " self.embeddings = word_embeddings\n",
276 | " self.output_layer = tf.layers.Dense(vocabulary_size, name=\"output_layer\")\n",
277 | " self.output_layer.build(self.output_size)\n",
278 | " \n",
279 | " self.BEFORE = tf.placeholder(tf.int32,[None,maxlen])\n",
280 | " self.INPUT = tf.placeholder(tf.int32,[None,maxlen])\n",
281 | " self.AFTER = tf.placeholder(tf.int32,[None,maxlen])\n",
282 | " self.batch_size = tf.shape(self.INPUT)[0]\n",
283 | " \n",
284 | " self.get_thought = self.thought(self.INPUT)\n",
285 | " self.attention = tf.matmul(\n",
286 | " self.get_thought, tf.transpose(self.embeddings), name = 'attention'\n",
287 | " )\n",
288 | " fw_logits = self.decoder(self.get_thought, self.AFTER)\n",
289 | " bw_logits = self.decoder(self.get_thought, self.BEFORE)\n",
290 | " self.loss = self.calculate_loss(fw_logits, self.AFTER) + self.calculate_loss(bw_logits, self.BEFORE)\n",
291 | " self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)\n",
292 | "\n",
293 | " def get_embedding(self, inputs):\n",
294 | " return tf.nn.embedding_lookup(self.embeddings, inputs)\n",
295 | " \n",
296 | " def thought(self, inputs):\n",
297 | " encoder_in = self.get_embedding(inputs)\n",
298 | " fw_cell = tf.nn.rnn_cell.GRUCell(self.output_size)\n",
299 | " bw_cell = tf.nn.rnn_cell.GRUCell(self.output_size)\n",
300 | " sequence_length = tf.reduce_sum(tf.sign(inputs), axis=1)\n",
301 | " rnn_output = tf.nn.bidirectional_dynamic_rnn(\n",
302 | " fw_cell, bw_cell, encoder_in, sequence_length=sequence_length,\n",
303 | " dtype=tf.float32)[1]\n",
304 | " return sum(rnn_output)\n",
305 | " \n",
306 | " def decoder(self, thought, labels):\n",
307 | " main = tf.strided_slice(labels, [0, 0], [self.batch_size, -1], [1, 1])\n",
308 | " shifted_labels = tf.concat([tf.fill([self.batch_size, 1], 2), main], 1)\n",
309 | " decoder_in = self.get_embedding(shifted_labels)\n",
310 | " cell = tf.nn.rnn_cell.GRUCell(self.output_size)\n",
311 | " max_seq_lengths = tf.fill([self.batch_size], self.maxlen)\n",
312 | " helper = tf.contrib.seq2seq.TrainingHelper(\n",
313 | " decoder_in, max_seq_lengths, time_major = False\n",
314 | " )\n",
315 | " decoder = tf.contrib.seq2seq.BasicDecoder(cell, helper, thought)\n",
316 | " decoder_out = tf.contrib.seq2seq.dynamic_decode(decoder)[0].rnn_output\n",
317 | " return decoder_out\n",
318 | " \n",
319 | " def calculate_loss(self, outputs, labels):\n",
320 | " mask = tf.cast(tf.sign(labels), tf.float32)\n",
321 | " logits = self.output_layer(outputs)\n",
322 | " return tf.contrib.seq2seq.sequence_loss(logits, labels, mask)"
323 | ]
324 | },
325 | {
326 | "cell_type": "code",
327 | "execution_count": 13,
328 | "metadata": {},
329 | "outputs": [],
330 | "source": [
331 | "tf.reset_default_graph()\n",
332 | "sess = tf.InteractiveSession()\n",
333 | "model = Model(vocabulary_size = len(dictionary), embedding_size = embedding_size)\n",
334 | "sess.run(tf.global_variables_initializer())"
335 | ]
336 | },
337 | {
338 | "cell_type": "code",
339 | "execution_count": 14,
340 | "metadata": {},
341 | "outputs": [
342 | {
343 | "name": "stderr",
344 | "output_type": "stream",
345 | "text": [
346 | "train minibatch loop: 100%|██████████| 625/625 [01:59<00:00, 5.37it/s, cost=12.7]\n",
347 | "train minibatch loop: 100%|██████████| 625/625 [01:58<00:00, 5.37it/s, cost=11.1]\n",
348 | "train minibatch loop: 100%|██████████| 625/625 [01:58<00:00, 5.32it/s, cost=9.72]\n",
349 | "train minibatch loop: 100%|██████████| 625/625 [01:59<00:00, 5.38it/s, cost=8.56]\n",
350 | "train minibatch loop: 100%|██████████| 625/625 [01:58<00:00, 5.39it/s, cost=7.55]\n",
351 | "train minibatch loop: 100%|██████████| 625/625 [01:59<00:00, 5.39it/s, cost=6.65]\n",
352 | "train minibatch loop: 100%|██████████| 625/625 [01:59<00:00, 5.44it/s, cost=5.95]\n",
353 | "train minibatch loop: 100%|██████████| 625/625 [01:59<00:00, 5.34it/s, cost=5.34]\n",
354 | "train minibatch loop: 100%|██████████| 625/625 [01:59<00:00, 5.34it/s, cost=4.85]\n",
355 | "train minibatch loop: 100%|██████████| 625/625 [01:58<00:00, 5.40it/s, cost=4.36]\n"
356 | ]
357 | }
358 | ],
359 | "source": [
360 | "from tqdm import tqdm\n",
361 | "\n",
362 | "for i in range(10):\n",
363 | " pbar = tqdm(range(0, len(middle), batch_size), desc='train minibatch loop')\n",
364 | " for p in pbar:\n",
365 | " index = min(p + batch_size, len(middle))\n",
366 | " batch_x = batch_sequence(\n",
367 | " middle[p : index],\n",
368 | " dictionary,\n",
369 | " maxlen = maxlen,\n",
370 | " )\n",
371 | " batch_y_before = batch_sequence(\n",
372 | " left[p : index],\n",
373 | " dictionary,\n",
374 | " maxlen = maxlen,\n",
375 | " )\n",
376 | " batch_y_after = batch_sequence(\n",
377 | " right[p : index],\n",
378 | " dictionary,\n",
379 | " maxlen = maxlen,\n",
380 | " )\n",
381 | " loss, _ = sess.run([model.loss, model.optimizer], \n",
382 | " feed_dict = {model.BEFORE: batch_y_before,\n",
383 | " model.INPUT: batch_x,\n",
384 | " model.AFTER: batch_y_after,})\n",
385 | " pbar.set_postfix(cost=loss)"
386 | ]
387 | },
388 | {
389 | "cell_type": "code",
390 | "execution_count": 17,
391 | "metadata": {},
392 | "outputs": [],
393 | "source": [
394 | "test = random.sample(x, 100)\n",
395 | "\n",
396 | "sequences = batch_sequence(test, dictionary, maxlen = maxlen)\n",
397 | "encoded, attention = sess.run([model.get_thought, model.attention],feed_dict={model.INPUT:sequences})"
398 | ]
399 | },
400 | {
401 | "cell_type": "code",
402 | "execution_count": 23,
403 | "metadata": {},
404 | "outputs": [],
405 | "source": [
406 | "from sklearn.cluster import KMeans\n",
407 | "from sklearn.metrics import pairwise_distances_argmin_min\n",
408 | "\n",
409 | "n_clusters = 10\n",
410 | "kmeans = KMeans(n_clusters=n_clusters, random_state=0)\n",
411 | "kmeans = kmeans.fit(encoded)\n",
412 | "avg = []\n",
413 | "closest = []\n",
414 | "for j in range(n_clusters):\n",
415 | " idx = np.where(kmeans.labels_ == j)[0]\n",
416 | " avg.append(np.mean(idx))\n",
417 | "closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_,encoded)\n",
418 | "ordering = sorted(range(n_clusters), key=lambda k: avg[k])\n",
419 | "sentences = [test[closest[idx]] for idx in ordering]"
420 | ]
421 | },
422 | {
423 | "cell_type": "code",
424 | "execution_count": 26,
425 | "metadata": {},
426 | "outputs": [
427 | {
428 | "data": {
429 | "text/plain": [
430 | "'kita sudah banyak pegawai tadbir diplomatik dalam sektor awam dan syarikat syarikat besar kepunyaan kerajaan juga perlu bersaing dengan syarikat gergasi antarabangsa. dalam tempoh sama mcmc juga melaksanakan tindakan sekatan terhadap laman sesawang portal dan blog yang menyebarkan kandungan atau berita palsu. gambas instagram bahkan ia menambahkan konservasi alam di bali pun erat kaitannya dengan budaya. begitu juga bn. sikap keterbukaan dan faham memahami amat diperlukan di antara umat islam dan bukan islam bagi menjamin keharmonian kaum. sesungguhnya yang demikian itu mengandungi tanda tanda membuktikan kekuasaan allah bagi kaum yang berfikir untuk memahaminya. khoo added that it was difficult for the police to take any action because these are civil cases between tnb and the property owners. usaha ini boleh menambah pendapatan penduduk jika ia dikendalikan dengan penuh minat sebelum ia mengeluarkan hasil nanti katanya. sebagai contoh insentif untuk meningkatkan perbelanjaan pengguna buat hatchback dan sedan permulaan bagi golongan berpendapatan rendah dan penjawat awam serta sedan dan crossover pertengahan bagi golongan berpendapatan sederhana jelasnya. sementara itu pengerusi celcom axiata berhad tan sri jamaluddin ibrahim berkata tajaan tersebut adalah salah satu usaha membantu skuad badminton negara ke sukan olimpik tokyo '"
431 | ]
432 | },
433 | "execution_count": 26,
434 | "metadata": {},
435 | "output_type": "execute_result"
436 | }
437 | ],
438 | "source": [
439 | "sentences = [' '.join(s) for s in sentences]\n",
440 | "'. '.join(sentences)"
441 | ]
442 | },
443 | {
444 | "cell_type": "code",
445 | "execution_count": 27,
446 | "metadata": {},
447 | "outputs": [
448 | {
449 | "data": {
450 | "text/plain": [
451 | "['garden',\n",
452 | " 'ditanggung',\n",
453 | " 'majesty',\n",
454 | " 'maritime',\n",
455 | " 'himpunan',\n",
456 | " 'statik',\n",
457 | " 'mbm',\n",
458 | " 'permukaan',\n",
459 | " 'trial',\n",
460 | " 'pass']"
461 | ]
462 | },
463 | "execution_count": 27,
464 | "metadata": {},
465 | "output_type": "execute_result"
466 | }
467 | ],
468 | "source": [
469 | "indices = np.argsort(attention.mean(axis=0))[::-1]\n",
470 | "rev_dictionary = {v:k for k, v in dictionary.items()}\n",
471 | "[rev_dictionary[i] for i in indices[:10]]"
472 | ]
473 | },
474 | {
475 | "cell_type": "code",
476 | "execution_count": null,
477 | "metadata": {},
478 | "outputs": [],
479 | "source": []
480 | }
481 | ],
482 | "metadata": {
483 | "kernelspec": {
484 | "display_name": "Python 3",
485 | "language": "python",
486 | "name": "python3"
487 | },
488 | "language_info": {
489 | "codemirror_mode": {
490 | "name": "ipython",
491 | "version": 3
492 | },
493 | "file_extension": ".py",
494 | "mimetype": "text/x-python",
495 | "name": "python",
496 | "nbconvert_exporter": "python",
497 | "pygments_lexer": "ipython3",
498 | "version": "3.6.8"
499 | }
500 | },
501 | "nbformat": 4,
502 | "nbformat_minor": 2
503 | }
504 |
--------------------------------------------------------------------------------
/extractive-summarization/2.residual-network-bahdanau.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import os\n",
10 | "import re\n",
11 | "import tensorflow as tf\n",
12 | "import numpy as np\n",
13 | "import json\n",
14 | "import random"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {},
21 | "outputs": [
22 | {
23 | "data": {
24 | "text/plain": [
25 | "123"
26 | ]
27 | },
28 | "execution_count": 2,
29 | "metadata": {},
30 | "output_type": "execute_result"
31 | }
32 | ],
33 | "source": [
34 | "labels = os.listdir('news')\n",
35 | "news = ['news/' + i for i in labels if '.json' in i]\n",
36 | "labels = [i.replace('.json','') for i in labels]\n",
37 | "len(news)"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 3,
43 | "metadata": {},
44 | "outputs": [],
45 | "source": [
46 | "import malaya\n",
47 | "tokenizer = malaya.preprocessing._SocialTokenizer().tokenize\n",
48 | "split_sentence = malaya.texts._text_functions.split_into_sentences"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 4,
54 | "metadata": {},
55 | "outputs": [],
56 | "source": [
57 | "accept_tokens = ',-.()\"\\''\n",
58 | "\n",
59 | "def is_number_regex(s):\n",
60 | " if re.match(\"^\\d+?\\.\\d+?$\", s) is None:\n",
61 | " return s.isdigit()\n",
62 | " return True\n",
63 | "\n",
64 | "def detect_money(word):\n",
65 | " if word[:2] == 'rm' and is_number_regex(word[2:]):\n",
66 | " return True\n",
67 | " else:\n",
68 | " return False\n",
69 | "\n",
70 | "def preprocessing(string):\n",
71 | " splitted = split_sentence(string)\n",
72 | " for i, string in enumerate(splitted):\n",
73 | " tokenized = tokenizer(string)\n",
74 | " tokenized = [w.lower() for w in tokenized if len(w) > 1]\n",
75 | " tokenized = ['' if is_number_regex(w) else w for w in tokenized]\n",
76 | " tokenized = ['' if detect_money(w) else w for w in tokenized]\n",
77 | " splitted[i] = tokenized\n",
78 | " return splitted"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": 5,
84 | "metadata": {},
85 | "outputs": [
86 | {
87 | "data": {
88 | "text/plain": [
89 | "263638"
90 | ]
91 | },
92 | "execution_count": 5,
93 | "metadata": {},
94 | "output_type": "execute_result"
95 | }
96 | ],
97 | "source": [
98 | "min_len = 20\n",
99 | "x = []\n",
100 | "for no, n in enumerate(news):\n",
101 | " with open(n) as fopen: \n",
102 | " news_ = json.load(fopen)\n",
103 | " for row in news_:\n",
104 | " if len(row['text'].split()) > min_len:\n",
105 | " p = preprocessing(row['text'])\n",
106 | " x.extend(p)\n",
107 | " \n",
108 | "len(x)"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": 6,
114 | "metadata": {},
115 | "outputs": [],
116 | "source": [
117 | "x = random.sample(x, 10000)"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": 7,
123 | "metadata": {},
124 | "outputs": [],
125 | "source": [
126 | "import collections\n",
127 | "\n",
128 | "def batch_sequence(sentences, dictionary, maxlen = 50):\n",
129 | " np_array = np.zeros((len(sentences), maxlen), dtype = np.int32)\n",
130 | " for no_sentence, sentence in enumerate(sentences):\n",
131 | " current_no = 0\n",
132 | " for no, word in enumerate(sentence[: maxlen - 2]):\n",
133 | " np_array[no_sentence, no] = dictionary.get(word, 1)\n",
134 | " current_no = no\n",
135 | " np_array[no_sentence, current_no + 1] = 3\n",
136 | " return np_array\n",
137 | "\n",
138 | "def build_dataset(words, n_words, atleast=2):\n",
139 | " count = [['PAD', 0], ['GO', 1], ['EOS', 2], ['UNK', 3]]\n",
140 | " counter = collections.Counter(words).most_common(n_words)\n",
141 | " counter = [i for i in counter if i[1] >= atleast]\n",
142 | " count.extend(counter)\n",
143 | " dictionary = dict()\n",
144 | " for word, _ in count:\n",
145 | " dictionary[word] = len(dictionary)\n",
146 | " data = list()\n",
147 | " unk_count = 0\n",
148 | " for word in words:\n",
149 | " index = dictionary.get(word, 0)\n",
150 | " if index == 0:\n",
151 | " unk_count += 1\n",
152 | " data.append(index)\n",
153 | " count[0][1] = unk_count\n",
154 | " reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))\n",
155 | " return data, count, dictionary, reversed_dictionary"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": 8,
161 | "metadata": {},
162 | "outputs": [],
163 | "source": [
164 | "import itertools\n",
165 | "\n",
166 | "X = list(itertools.chain(*x))"
167 | ]
168 | },
169 | {
170 | "cell_type": "code",
171 | "execution_count": 9,
172 | "metadata": {},
173 | "outputs": [
174 | {
175 | "data": {
176 | "text/plain": [
177 | "24384"
178 | ]
179 | },
180 | "execution_count": 9,
181 | "metadata": {},
182 | "output_type": "execute_result"
183 | }
184 | ],
185 | "source": [
186 | "maxlen = 50\n",
187 | "vocabulary_size = len(set(X))\n",
188 | "embedding_size = 256\n",
189 | "learning_rate = 1e-3\n",
190 | "batch_size = 16\n",
191 | "vocabulary_size"
192 | ]
193 | },
194 | {
195 | "cell_type": "code",
196 | "execution_count": 10,
197 | "metadata": {},
198 | "outputs": [
199 | {
200 | "data": {
201 | "text/plain": [
202 | "(9998, 9998, 9998)"
203 | ]
204 | },
205 | "execution_count": 10,
206 | "metadata": {},
207 | "output_type": "execute_result"
208 | }
209 | ],
210 | "source": [
211 | "from sklearn.utils import shuffle\n",
212 | "\n",
213 | "stride = 1\n",
214 | "t_range = int((len(x) - 3) / stride + 1)\n",
215 | "left, middle, right = [], [], []\n",
216 | "for i in range(t_range):\n",
217 | " slices = x[i * stride : i * stride + 3]\n",
218 | " left.append(slices[0])\n",
219 | " middle.append(slices[1])\n",
220 | " right.append(slices[2])\n",
221 | "\n",
222 | "left, middle, right = shuffle(left, middle, right)\n",
223 | "len(left), len(middle), len(right)"
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": 11,
229 | "metadata": {},
230 | "outputs": [
231 | {
232 | "name": "stdout",
233 | "output_type": "stream",
234 | "text": [
235 | "vocab from size: 24384\n",
236 | "Most common words [('the', 3997), ('yang', 3660), ('dan', 3369), ('', 2534), ('di', 2310), ('to', 1847)]\n",
237 | "Sample data [14, 272, 138, 7, 7, 32, 4038, 532, 3178, 21] ['dalam', 'laporan', 'ekonomi', '', '', 'kerajaan', 'menjangkakan', 'pertumbuhan', 'kdnk', 'malaysia']\n",
238 | "filtered vocab size: 12188\n",
239 | "% of vocab used: 49.980000000000004%\n"
240 | ]
241 | }
242 | ],
243 | "source": [
244 | "concat = X\n",
245 | "vocabulary_size = len(list(set(concat)))\n",
246 | "data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)\n",
247 | "print('vocab from size: %d'%(vocabulary_size))\n",
248 | "print('Most common words', count[4:10])\n",
249 | "print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])\n",
250 | "print('filtered vocab size:',len(dictionary))\n",
251 | "print(\"% of vocab used: {}%\".format(round(len(dictionary)/vocabulary_size,4)*100))"
252 | ]
253 | },
254 | {
255 | "cell_type": "code",
256 | "execution_count": 12,
257 | "metadata": {},
258 | "outputs": [],
259 | "source": [
260 | "class Attention:\n",
261 | " def __init__(self,hidden_size):\n",
262 | " self.hidden_size = hidden_size\n",
263 | " self.dense_layer = tf.layers.Dense(hidden_size)\n",
264 | " self.v = tf.random_normal([hidden_size],mean=0,stddev=1/np.sqrt(hidden_size))\n",
265 | " \n",
266 | " def score(self, hidden_tensor, encoder_outputs):\n",
267 | " energy = tf.nn.tanh(self.dense_layer(tf.concat([hidden_tensor,encoder_outputs],2)))\n",
268 | " energy = tf.transpose(energy,[0,2,1])\n",
269 | " batch_size = tf.shape(encoder_outputs)[0]\n",
270 | " v = tf.expand_dims(tf.tile(tf.expand_dims(self.v,0),[batch_size,1]),1)\n",
271 | " energy = tf.matmul(v,energy)\n",
272 | " return tf.squeeze(energy,1)\n",
273 | " \n",
274 | " def __call__(self, hidden, encoder_outputs):\n",
275 | " seq_len = tf.shape(encoder_outputs)[1]\n",
276 | " batch_size = tf.shape(encoder_outputs)[0]\n",
277 | " H = tf.tile(tf.expand_dims(hidden, 1),[1,seq_len,1])\n",
278 | " attn_energies = self.score(H,encoder_outputs)\n",
279 | " return tf.expand_dims(tf.nn.softmax(attn_energies),1)\n",
280 | " \n",
281 | "class Model:\n",
282 | " def __init__(\n",
283 | " self,\n",
284 | " dict_size,\n",
285 | " size_layers,\n",
286 | " learning_rate,\n",
287 | " maxlen,\n",
288 | " num_blocks = 3,\n",
289 | " ):\n",
290 | " block_size = size_layers\n",
291 | " self.BEFORE = tf.placeholder(tf.int32,[None,maxlen])\n",
292 | " self.INPUT = tf.placeholder(tf.int32,[None,maxlen])\n",
293 | " self.AFTER = tf.placeholder(tf.int32,[None,maxlen])\n",
294 | " self.batch_size = tf.shape(self.INPUT)[0]\n",
295 | " self.output_layer = tf.layers.Dense(dict_size, name=\"output_layer\")\n",
296 | " self.output_layer.build(size_layers)\n",
297 | " self.embeddings = tf.Variable(tf.random_uniform([dict_size, size_layers], -1, 1))\n",
298 | " embedded = tf.nn.embedding_lookup(self.embeddings, self.INPUT)\n",
299 | " self.attention = Attention(size_layers)\n",
300 | "\n",
301 | " def residual_block(x, size, rate, block, reuse = False):\n",
302 | " with tf.variable_scope(\n",
303 | " 'block_%d_%d' % (block, rate), reuse = reuse\n",
304 | " ):\n",
305 | " attn_weights = self.attention(tf.reduce_sum(x,axis=1), x)\n",
306 | " conv_filter = tf.layers.conv1d(\n",
307 | " attn_weights,\n",
308 | " x.shape[2] // 4,\n",
309 | " kernel_size = size,\n",
310 | " strides = 1,\n",
311 | " padding = 'same',\n",
312 | " dilation_rate = rate,\n",
313 | " activation = tf.nn.tanh,\n",
314 | " )\n",
315 | " conv_gate = tf.layers.conv1d(\n",
316 | " x,\n",
317 | " x.shape[2] // 4,\n",
318 | " kernel_size = size,\n",
319 | " strides = 1,\n",
320 | " padding = 'same',\n",
321 | " dilation_rate = rate,\n",
322 | " activation = tf.nn.sigmoid,\n",
323 | " )\n",
324 | " out = tf.multiply(conv_filter, conv_gate)\n",
325 | " out = tf.layers.conv1d(\n",
326 | " out,\n",
327 | " block_size,\n",
328 | " kernel_size = 1,\n",
329 | " strides = 1,\n",
330 | " padding = 'same',\n",
331 | " activation = tf.nn.tanh,\n",
332 | " )\n",
333 | " return tf.add(x, out), out\n",
334 | "\n",
335 | " forward = tf.layers.conv1d(\n",
336 | " embedded, block_size, kernel_size = 1, strides = 1, padding = 'SAME'\n",
337 | " )\n",
338 | " zeros = tf.zeros_like(forward)\n",
339 | " for i in range(num_blocks):\n",
340 | " for r in [1, 2, 4, 8, 16]:\n",
341 | " forward, s = residual_block(\n",
342 | " forward, size = 7, rate = r, block = i\n",
343 | " )\n",
344 | " zeros = tf.add(zeros, s)\n",
345 | " forward = tf.layers.conv1d(\n",
346 | " zeros,\n",
347 | " block_size,\n",
348 | " kernel_size = 1,\n",
349 | " strides = 1,\n",
350 | " padding = 'SAME',\n",
351 | " activation = tf.nn.tanh,\n",
352 | " )\n",
353 | " self.get_thought = tf.reduce_sum(forward,axis=1, name = 'logits')\n",
354 | " \n",
355 | " def decoder(labels, reuse):\n",
356 | " decoder_in = tf.nn.embedding_lookup(self.embeddings, labels)\n",
357 | " forward = tf.layers.conv1d(\n",
358 | " decoder_in, block_size, kernel_size = 1, strides = 1, padding = 'SAME'\n",
359 | " )\n",
360 | " zeros = tf.zeros_like(forward)\n",
361 | " for r in [8, 16, 24]:\n",
362 | " forward, s = residual_block(forward, size = 7, rate = r, block = 10, reuse = reuse)\n",
363 | " zeros = tf.add(zeros, s)\n",
364 | " return tf.layers.conv1d(\n",
365 | " zeros,\n",
366 | " block_size,\n",
367 | " kernel_size = 1,\n",
368 | " strides = 1,\n",
369 | " padding = 'SAME',\n",
370 | " activation = tf.nn.tanh,\n",
371 | " )\n",
372 | " \n",
373 | " fw_logits = decoder(self.AFTER, False)\n",
374 | " bw_logits = decoder(self.BEFORE, True)\n",
375 | " self.attention = tf.matmul(\n",
376 | " self.get_thought, tf.transpose(self.embeddings), name = 'attention'\n",
377 | " )\n",
378 | " self.loss = self.calculate_loss(fw_logits, self.AFTER) + self.calculate_loss(bw_logits, self.BEFORE)\n",
379 | " self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)\n",
380 | " \n",
381 | " def calculate_loss(self, outputs, labels):\n",
382 | " mask = tf.cast(tf.sign(labels), tf.float32)\n",
383 | " logits = self.output_layer(outputs)\n",
384 | " return tf.contrib.seq2seq.sequence_loss(logits, labels, mask)"
385 | ]
386 | },
387 | {
388 | "cell_type": "code",
389 | "execution_count": 13,
390 | "metadata": {},
391 | "outputs": [],
392 | "source": [
393 | "tf.reset_default_graph()\n",
394 | "sess = tf.InteractiveSession()\n",
395 | "model = Model(len(dictionary), embedding_size, learning_rate, maxlen)\n",
396 | "sess.run(tf.global_variables_initializer())"
397 | ]
398 | },
399 | {
400 | "cell_type": "code",
401 | "execution_count": 14,
402 | "metadata": {},
403 | "outputs": [
404 | {
405 | "name": "stderr",
406 | "output_type": "stream",
407 | "text": [
408 | "train minibatch loop: 100%|██████████| 625/625 [00:36<00:00, 13.49it/s, cost=9.81]\n",
409 | "train minibatch loop: 100%|██████████| 625/625 [00:35<00:00, 17.48it/s, cost=7.15]\n",
410 | "train minibatch loop: 100%|██████████| 625/625 [00:36<00:00, 17.30it/s, cost=5.54]\n",
411 | "train minibatch loop: 100%|██████████| 625/625 [00:35<00:00, 17.39it/s, cost=4.42]\n",
412 | "train minibatch loop: 100%|██████████| 625/625 [00:36<00:00, 17.25it/s, cost=3.6] \n",
413 | "train minibatch loop: 100%|██████████| 625/625 [00:35<00:00, 17.36it/s, cost=2.95]\n",
414 | "train minibatch loop: 100%|██████████| 625/625 [00:36<00:00, 17.22it/s, cost=2.38]\n",
415 | "train minibatch loop: 100%|██████████| 625/625 [00:35<00:00, 17.36it/s, cost=1.87]\n",
416 | "train minibatch loop: 100%|██████████| 625/625 [00:36<00:00, 17.25it/s, cost=1.59]\n",
417 | "train minibatch loop: 100%|██████████| 625/625 [00:36<00:00, 17.35it/s, cost=1.36]\n"
418 | ]
419 | }
420 | ],
421 | "source": [
422 | "from tqdm import tqdm\n",
423 | "\n",
424 | "for i in range(10):\n",
425 | " pbar = tqdm(range(0, len(middle), batch_size), desc='train minibatch loop')\n",
426 | " for p in pbar:\n",
427 | " index = min(p + batch_size, len(middle))\n",
428 | " batch_x = batch_sequence(\n",
429 | " middle[p : index],\n",
430 | " dictionary,\n",
431 | " maxlen = maxlen,\n",
432 | " )\n",
433 | " batch_y_before = batch_sequence(\n",
434 | " left[p : index],\n",
435 | " dictionary,\n",
436 | " maxlen = maxlen,\n",
437 | " )\n",
438 | " batch_y_after = batch_sequence(\n",
439 | " right[p : index],\n",
440 | " dictionary,\n",
441 | " maxlen = maxlen,\n",
442 | " )\n",
443 | " loss, _ = sess.run([model.loss, model.optimizer], \n",
444 | " feed_dict = {model.BEFORE: batch_y_before,\n",
445 | " model.INPUT: batch_x,\n",
446 | " model.AFTER: batch_y_after,})\n",
447 | " pbar.set_postfix(cost=loss)"
448 | ]
449 | },
450 | {
451 | "cell_type": "code",
452 | "execution_count": 15,
453 | "metadata": {},
454 | "outputs": [],
455 | "source": [
456 | "test = random.sample(x, 100)\n",
457 | "\n",
458 | "sequences = batch_sequence(test, dictionary, maxlen = maxlen)\n",
459 | "encoded, attention = sess.run([model.get_thought, model.attention],feed_dict={model.INPUT:sequences})"
460 | ]
461 | },
462 | {
463 | "cell_type": "code",
464 | "execution_count": 16,
465 | "metadata": {},
466 | "outputs": [],
467 | "source": [
468 | "from sklearn.cluster import KMeans\n",
469 | "from sklearn.metrics import pairwise_distances_argmin_min\n",
470 | "\n",
471 | "n_clusters = 10\n",
472 | "kmeans = KMeans(n_clusters=n_clusters, random_state=0)\n",
473 | "kmeans = kmeans.fit(encoded)\n",
474 | "avg = []\n",
475 | "closest = []\n",
476 | "for j in range(n_clusters):\n",
477 | " idx = np.where(kmeans.labels_ == j)[0]\n",
478 | " avg.append(np.mean(idx))\n",
479 | "closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_,encoded)\n",
480 | "ordering = sorted(range(n_clusters), key=lambda k: avg[k])\n",
481 | "sentences = [test[closest[idx]] for idx in ordering]"
482 | ]
483 | },
484 | {
485 | "cell_type": "code",
486 | "execution_count": 17,
487 | "metadata": {},
488 | "outputs": [
489 | {
490 | "data": {
491 | "text/plain": [
492 | "'jelasnya malaysia mempunyai kemudahan lengkap yang boleh ditawarkan sebagai venue kejohanan itu antaranya arena axiata bukit jalil dan stadium malawati shah alam malah sukan gimnastik estetik lebih mudah dikendalikan kerana kurang menggunakan peralatan. ini penyelesaian jangka pendek yang lebih praktikal dalam memastikan semua isu berkaitan kebajikan dan perlindungan kanak kanak dapat dipantau serta memastikan agar agensi atau badan badan berkaitan memikul tanggungjawab masing masing katanya sewaktu sesi soal jawab lisan di dewan rakyat hari ini. presiden persatuan bola sepak malaysia fam tunku ismail sultan ibrahim berkata semua jurulatih terlibat tan cheng hoe harimau malaya datuk ong kim swee bojan hodak dan lim teong kim ppbn perlu mengikut perancangan dan strategi taktikal yang dirangka pengarah teknikal fam peter de roo. four people have been charged with multiple counts of money laundering in connection to the case and are now out on bail pending trial. saya perlu bertemu semua pihak berkepentingan bukan hanya kaum cina tetapi juga golongan pejuang bahasa kebangssan dan mereka yang berurusan dalam menjaga keharmonian. kita tidak berpolitik semata mata hanya untuk merebut kuasa kerana kita mahu rakyat menikmati manfaat pembangunan hasil daripada pertumbuhan ekonomi yang terus berkembang hasil kerjasama erat kedua dua kepimpinan kerajaan pusat dan negeri katanya. presiden juga mengumumkan bahwa pemerintah menghapus lima nol dari mata uang venezuela menjatuhkan nilai bolivar lebih dari persen. malaysiakini requires javascript to run normally. so this heat could really pose problem to the industry. his wife fauziah ag piut and former deputy director lim lam beng have been charged with counts for money laundering amounting to million $697,000 and four separate counts totalling million under section of the same act'"
493 | ]
494 | },
495 | "execution_count": 17,
496 | "metadata": {},
497 | "output_type": "execute_result"
498 | }
499 | ],
500 | "source": [
501 | "sentences = [' '.join(s) for s in sentences]\n",
502 | "'. '.join(sentences)"
503 | ]
504 | },
505 | {
506 | "cell_type": "code",
507 | "execution_count": 18,
508 | "metadata": {},
509 | "outputs": [
510 | {
511 | "data": {
512 | "text/plain": [
513 | "['mendukung',\n",
514 | " 'mind',\n",
515 | " 'interest',\n",
516 | " 'introduced',\n",
517 | " 'paul',\n",
518 | " 'evolusi',\n",
519 | " 'entire',\n",
520 | " 'sejujurnya',\n",
521 | " 'ilmuwan',\n",
522 | " 'barangkali']"
523 | ]
524 | },
525 | "execution_count": 18,
526 | "metadata": {},
527 | "output_type": "execute_result"
528 | }
529 | ],
530 | "source": [
531 | "indices = np.argsort(attention.mean(axis=0))[::-1]\n",
532 | "rev_dictionary = {v:k for k, v in dictionary.items()}\n",
533 | "[rev_dictionary[i] for i in indices[:10]]"
534 | ]
535 | },
536 | {
537 | "cell_type": "code",
538 | "execution_count": null,
539 | "metadata": {},
540 | "outputs": [],
541 | "source": []
542 | }
543 | ],
544 | "metadata": {
545 | "kernelspec": {
546 | "display_name": "Python 3",
547 | "language": "python",
548 | "name": "python3"
549 | },
550 | "language_info": {
551 | "codemirror_mode": {
552 | "name": "ipython",
553 | "version": 3
554 | },
555 | "file_extension": ".py",
556 | "mimetype": "text/x-python",
557 | "name": "python",
558 | "nbconvert_exporter": "python",
559 | "pygments_lexer": "ipython3",
560 | "version": "3.6.8"
561 | }
562 | },
563 | "nbformat": 4,
564 | "nbformat_minor": 2
565 | }
566 |
--------------------------------------------------------------------------------
/extractive-summarization/README.md:
--------------------------------------------------------------------------------
1 | ## How-to
2 |
3 | 1. Download the dataset,
4 | ```bash
5 | wget https://github.com/huseinzol05/Malaya-Dataset/raw/master/news/news.zip
6 | ```
7 |
8 | 2. Unzip the dataset,
9 | ```bash
10 | unzip news.zip
11 | ```
12 |
13 | 3. Run any notebook using Jupyter Notebook.
14 |
--------------------------------------------------------------------------------
/long-text-classification/1.dilated-cnn.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import os\n",
10 | "import tensorflow as tf\n",
11 | "import malaya\n",
12 | "import json"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 2,
18 | "metadata": {},
19 | "outputs": [
20 | {
21 | "data": {
22 | "text/plain": [
23 | "['negative/2.json',\n",
24 | " 'negative/clean-politifact_real.json',\n",
25 | " 'negative/clean-gossipcop_real3.json',\n",
26 | " 'negative/clean-gossipcop_real2.json',\n",
27 | " 'negative/clean-gossipcop_real1.json',\n",
28 | " 'negative/1.json']"
29 | ]
30 | },
31 | "execution_count": 2,
32 | "metadata": {},
33 | "output_type": "execute_result"
34 | }
35 | ],
36 | "source": [
37 | "negatives = ['negative/' + i for i in os.listdir('negative') if '.json' in i]\n",
38 | "positives = ['positive/' + i for i in os.listdir('positive') if '.json' in i]\n",
39 | "negatives"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 3,
45 | "metadata": {},
46 | "outputs": [],
47 | "source": [
48 | "import re\n",
49 | "\n",
50 | "tokenizer = malaya.preprocessing._SocialTokenizer().tokenize\n",
51 | "accept_tokens = ',-.()\"\\''\n",
52 | "\n",
53 | "def is_number_regex(s):\n",
54 | " if re.match(\"^\\d+?\\.\\d+?$\", s) is None:\n",
55 | " return s.isdigit()\n",
56 | " return True\n",
57 | "\n",
58 | "def detect_money(word):\n",
59 | " if word[:2] == 'rm' and is_number_regex(word[2:]):\n",
60 | " return True\n",
61 | " else:\n",
62 | " return False\n",
63 | "\n",
64 | "def preprocessing(string):\n",
65 | " tokenized = tokenizer(string)\n",
66 | " tokenized = [w.lower() for w in tokenized if len(w) > 1 or w in accept_tokens]\n",
67 | " tokenized = ['' if is_number_regex(w) else w for w in tokenized]\n",
68 | " tokenized = ['' if detect_money(w) else w for w in tokenized]\n",
69 | " return tokenized\n",
70 | "\n",
71 | "def clean_label(label):\n",
72 | " string = re.sub('[^A-Za-z\\- ]+', ' ', label)\n",
73 | " return re.sub(r'[ ]+', ' ', string.lower()).strip()"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": 4,
79 | "metadata": {},
80 | "outputs": [
81 | {
82 | "data": {
83 | "text/plain": [
84 | "(26832, 26832)"
85 | ]
86 | },
87 | "execution_count": 4,
88 | "metadata": {},
89 | "output_type": "execute_result"
90 | }
91 | ],
92 | "source": [
93 | "X, Y = [], []\n",
94 | "\n",
95 | "for n in negatives:\n",
96 | " with open(n) as fopen:\n",
97 | " x = json.load(fopen)\n",
98 | " processed = [preprocessing(s) for s in x]\n",
99 | " X.extend(processed)\n",
100 | " Y.extend([0] * len(processed))\n",
101 | " \n",
102 | "len(X), len(Y)"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": 5,
108 | "metadata": {},
109 | "outputs": [
110 | {
111 | "data": {
112 | "text/plain": [
113 | "(42023, 42023)"
114 | ]
115 | },
116 | "execution_count": 5,
117 | "metadata": {},
118 | "output_type": "execute_result"
119 | }
120 | ],
121 | "source": [
122 | "for p in positives:\n",
123 | " with open(p) as fopen:\n",
124 | " x = json.load(fopen)\n",
125 | " processed = [preprocessing(s) for s in x]\n",
126 | " X.extend(processed)\n",
127 | " Y.extend([1] * len(processed))\n",
128 | " \n",
129 | "len(X), len(Y)"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": 6,
135 | "metadata": {},
136 | "outputs": [],
137 | "source": [
138 | "def build_dataset(words, n_words):\n",
139 | " count = [['GO', 0], ['PAD', 1], ['EOS', 2], ['UNK', 3]]\n",
140 | " count.extend(collections.Counter(words).most_common(n_words - 1))\n",
141 | " dictionary = dict()\n",
142 | " for word, _ in count:\n",
143 | " dictionary[word] = len(dictionary)\n",
144 | " data = list()\n",
145 | " unk_count = 0\n",
146 | " for word in words:\n",
147 | " index = dictionary.get(word, 0)\n",
148 | " if index == 0:\n",
149 | " unk_count += 1\n",
150 | " data.append(index)\n",
151 | " count[0][1] = unk_count\n",
152 | " reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))\n",
153 | " return data, count, dictionary, reversed_dictionary"
154 | ]
155 | },
156 | {
157 | "cell_type": "code",
158 | "execution_count": 7,
159 | "metadata": {},
160 | "outputs": [
161 | {
162 | "name": "stdout",
163 | "output_type": "stream",
164 | "text": [
165 | "vocab from size: 199359\n",
166 | "Most common words [(',', 1088529), ('.', 956019), ('yang', 636546), ('dan', 479145), ('\"', 468691), ('untuk', 319102)]\n",
167 | "Sample data [436, 2073, 5, 926, 7923, 67, 4, 8, 40, 264] ['demokrat', 'sen', '.', 'al', 'franken', 'berkata', ',', '\"', 'kita', 'harus']\n"
168 | ]
169 | }
170 | ],
171 | "source": [
172 | "import itertools\n",
173 | "import collections\n",
174 | "\n",
175 | "concat = list(itertools.chain(*X))\n",
176 | "vocabulary_size = len(list(set(concat)))\n",
177 | "data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)\n",
178 | "print('vocab from size: %d'%(vocabulary_size))\n",
179 | "print('Most common words', count[4:10])\n",
180 | "print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])"
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": 8,
186 | "metadata": {},
187 | "outputs": [],
188 | "source": [
189 | "GO = dictionary['GO']\n",
190 | "PAD = dictionary['PAD']\n",
191 | "EOS = dictionary['EOS']\n",
192 | "UNK = dictionary['UNK']"
193 | ]
194 | },
195 | {
196 | "cell_type": "code",
197 | "execution_count": 9,
198 | "metadata": {},
199 | "outputs": [],
200 | "source": [
201 | "import tensorflow as tf\n",
202 | "import numpy as np"
203 | ]
204 | },
205 | {
206 | "cell_type": "code",
207 | "execution_count": 10,
208 | "metadata": {},
209 | "outputs": [],
210 | "source": [
211 | "def embed_seq(x, vocab_sz, embed_dim, name, zero_pad=True): \n",
212 | " embedding = tf.get_variable(name, [vocab_sz, embed_dim]) \n",
213 | " if zero_pad:\n",
214 | " embedding = tf.concat([tf.zeros([1, embed_dim]), embedding[1:, :]], 0) \n",
215 | " x = tf.nn.embedding_lookup(embedding, x)\n",
216 | " return x\n",
217 | "\n",
218 | "def position_encoding(inputs):\n",
219 | " T = tf.shape(inputs)[1]\n",
220 | " repr_dim = inputs.get_shape()[-1].value\n",
221 | " pos = tf.reshape(tf.range(0.0, tf.to_float(T), dtype=tf.float32), [-1, 1])\n",
222 | " i = np.arange(0, repr_dim, 2, np.float32)\n",
223 | " denom = np.reshape(np.power(10000.0, i / repr_dim), [1, -1])\n",
224 | " enc = tf.expand_dims(tf.concat([tf.sin(pos / denom), tf.cos(pos / denom)], 1), 0)\n",
225 | " return tf.tile(enc, [tf.shape(inputs)[0], 1, 1])\n",
226 | "\n",
227 | "def layer_norm(inputs, epsilon=1e-8):\n",
228 | " mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)\n",
229 | " normalized = (inputs - mean) / (tf.sqrt(variance + epsilon))\n",
230 | " params_shape = inputs.get_shape()[-1:]\n",
231 | " gamma = tf.get_variable('gamma', params_shape, tf.float32, tf.ones_initializer())\n",
232 | " beta = tf.get_variable('beta', params_shape, tf.float32, tf.zeros_initializer())\n",
233 | " return gamma * normalized + beta\n",
234 | "\n",
235 | "\n",
236 | "def cnn_block(x, dilation_rate, pad_sz, hidden_dim, kernel_size, is_training):\n",
237 | " x = layer_norm(x)\n",
238 | " pad = tf.zeros([tf.shape(x)[0], pad_sz, hidden_dim])\n",
239 | " x = tf.layers.conv1d(inputs = tf.concat([pad, x, pad], 1),\n",
240 | " filters = hidden_dim,\n",
241 | " kernel_size = kernel_size,\n",
242 | " dilation_rate = dilation_rate)\n",
243 | " x = tf.layers.dropout(x, 0.2, training=is_training) \n",
244 | " x = x[:, :-pad_sz, :]\n",
245 | " x = tf.nn.relu(x)\n",
246 | " return x\n",
247 | "\n",
248 | "class Model:\n",
249 | " def __init__(self, dict_size, dimension_output, size_layer, num_layers, kernel_size = 5):\n",
250 | " self.X = tf.placeholder(tf.int32, [None, None])\n",
251 | " self.Y = tf.placeholder(tf.int32, [None])\n",
252 | " self.training = tf.placeholder(tf.bool, None)\n",
253 | " x = self.X\n",
254 | " \n",
255 | " with tf.variable_scope('embed'):\n",
256 | " x = embed_seq(x, dict_size, size_layer, 'word')\n",
257 | " x += position_encoding(x)\n",
258 | " \n",
259 | " for i in range(num_layers): \n",
260 | " dilation_rate = 2 ** i\n",
261 | " pad_sz = (kernel_size - 1) * dilation_rate \n",
262 | " with tf.variable_scope('block_%d'%i):\n",
263 | " a = cnn_block(x, dilation_rate, pad_sz, size_layer, kernel_size, self.training)\n",
264 | " \n",
265 | " x += a\n",
266 | " \n",
267 | " self.logits = tf.layers.dense(x[:,-1], dimension_output)\n",
268 | " self.cost = tf.reduce_mean(\n",
269 | " tf.nn.sparse_softmax_cross_entropy_with_logits(\n",
270 | " logits = self.logits, labels = self.Y\n",
271 | " )\n",
272 | " )\n",
273 | " self.optimizer = tf.train.AdamOptimizer(\n",
274 | " learning_rate = learning_rate\n",
275 | " ).minimize(self.cost)\n",
276 | " correct_pred = tf.equal(\n",
277 | " tf.argmax(self.logits, 1, output_type = tf.int32), self.Y\n",
278 | " )\n",
279 | " self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))"
280 | ]
281 | },
282 | {
283 | "cell_type": "code",
284 | "execution_count": 11,
285 | "metadata": {},
286 | "outputs": [],
287 | "source": [
288 | "maxlen = 1000\n",
289 | "size_layer = 128\n",
290 | "num_layers = 6\n",
291 | "learning_rate = 1e-4\n",
292 | "batch_size = 32"
293 | ]
294 | },
295 | {
296 | "cell_type": "code",
297 | "execution_count": 12,
298 | "metadata": {},
299 | "outputs": [],
300 | "source": [
301 | "tf.reset_default_graph()\n",
302 | "sess = tf.InteractiveSession()\n",
303 | "model = Model(len(dictionary), 2, size_layer,num_layers)\n",
304 | "sess.run(tf.global_variables_initializer())"
305 | ]
306 | },
307 | {
308 | "cell_type": "code",
309 | "execution_count": 13,
310 | "metadata": {},
311 | "outputs": [],
312 | "source": [
313 | "def str_idx(corpus, dic, maxlen, UNK = 3):\n",
314 | " X = np.zeros((len(corpus), maxlen))\n",
315 | " for i in range(len(corpus)):\n",
316 | " for no, k in enumerate(corpus[i][:maxlen][::-1]):\n",
317 | " X[i, -1 - no] = dic.get(k, UNK)\n",
318 | " return X"
319 | ]
320 | },
321 | {
322 | "cell_type": "code",
323 | "execution_count": 14,
324 | "metadata": {},
325 | "outputs": [],
326 | "source": [
327 | "from sklearn.cross_validation import train_test_split\n",
328 | "\n",
329 | "vectors = str_idx(X,dictionary,maxlen)\n",
330 | "train_X, test_X, train_Y, test_Y = train_test_split(vectors, Y, test_size = 0.2)"
331 | ]
332 | },
333 | {
334 | "cell_type": "code",
335 | "execution_count": 15,
336 | "metadata": {},
337 | "outputs": [
338 | {
339 | "name": "stderr",
340 | "output_type": "stream",
341 | "text": [
342 | "train minibatch loop: 100%|██████████| 1051/1051 [03:31<00:00, 2.79it/s, accuracy=0.667, cost=0.649]\n",
343 | "test minibatch loop: 100%|██████████| 263/263 [00:15<00:00, 10.55it/s, accuracy=0.571, cost=0.727]\n",
344 | "train minibatch loop: 0%| | 0/1051 [00:00, ?it/s]"
345 | ]
346 | },
347 | {
348 | "name": "stdout",
349 | "output_type": "stream",
350 | "text": [
351 | "epoch: 0, pass acc: 0.000000, current acc: 0.638701\n",
352 | "time taken: 226.76698470115662\n",
353 | "epoch: 0, training loss: 0.732253, training acc: 0.593977, valid loss: 0.650750, valid acc: 0.638701\n",
354 | "\n"
355 | ]
356 | },
357 | {
358 | "name": "stderr",
359 | "output_type": "stream",
360 | "text": [
361 | "train minibatch loop: 100%|██████████| 1051/1051 [03:25<00:00, 5.61it/s, accuracy=0.722, cost=0.576]\n",
362 | "test minibatch loop: 100%|██████████| 263/263 [00:14<00:00, 17.68it/s, accuracy=0.571, cost=0.682]\n",
363 | "train minibatch loop: 0%| | 0/1051 [00:00, ?it/s]"
364 | ]
365 | },
366 | {
367 | "name": "stdout",
368 | "output_type": "stream",
369 | "text": [
370 | "epoch: 1, pass acc: 0.638701, current acc: 0.662497\n",
371 | "time taken: 220.16101932525635\n",
372 | "epoch: 1, training loss: 0.626222, training acc: 0.655456, valid loss: 0.608959, valid acc: 0.662497\n",
373 | "\n"
374 | ]
375 | },
376 | {
377 | "name": "stderr",
378 | "output_type": "stream",
379 | "text": [
380 | "train minibatch loop: 100%|██████████| 1051/1051 [03:30<00:00, 5.62it/s, accuracy=0.722, cost=0.521]\n",
381 | "test minibatch loop: 100%|██████████| 263/263 [00:14<00:00, 17.74it/s, accuracy=0.667, cost=0.624]\n",
382 | "train minibatch loop: 0%| | 0/1051 [00:00, ?it/s]"
383 | ]
384 | },
385 | {
386 | "name": "stdout",
387 | "output_type": "stream",
388 | "text": [
389 | "epoch: 2, pass acc: 0.662497, current acc: 0.704025\n",
390 | "time taken: 224.93579745292664\n",
391 | "epoch: 2, training loss: 0.577414, training acc: 0.703495, valid loss: 0.566277, valid acc: 0.704025\n",
392 | "\n"
393 | ]
394 | },
395 | {
396 | "name": "stderr",
397 | "output_type": "stream",
398 | "text": [
399 | "train minibatch loop: 100%|██████████| 1051/1051 [03:29<00:00, 5.62it/s, accuracy=0.833, cost=0.4] \n",
400 | "test minibatch loop: 100%|██████████| 263/263 [00:14<00:00, 17.75it/s, accuracy=0.619, cost=0.547]\n",
401 | "train minibatch loop: 0%| | 0/1051 [00:00, ?it/s]"
402 | ]
403 | },
404 | {
405 | "name": "stdout",
406 | "output_type": "stream",
407 | "text": [
408 | "time taken: 224.78561234474182\n",
409 | "epoch: 3, training loss: 0.518492, training acc: 0.748309, valid loss: 0.559724, valid acc: 0.688496\n",
410 | "\n"
411 | ]
412 | },
413 | {
414 | "name": "stderr",
415 | "output_type": "stream",
416 | "text": [
417 | "train minibatch loop: 100%|██████████| 1051/1051 [03:29<00:00, 5.63it/s, accuracy=0.833, cost=0.398]\n",
418 | "test minibatch loop: 100%|██████████| 263/263 [00:14<00:00, 17.71it/s, accuracy=0.667, cost=0.516]\n",
419 | "train minibatch loop: 0%| | 0/1051 [00:00, ?it/s, accuracy=0.938, cost=0.313]"
420 | ]
421 | },
422 | {
423 | "name": "stdout",
424 | "output_type": "stream",
425 | "text": [
426 | "time taken: 224.8228726387024\n",
427 | "epoch: 4, training loss: 0.456134, training acc: 0.787247, valid loss: 0.552487, valid acc: 0.697482\n",
428 | "\n"
429 | ]
430 | },
431 | {
432 | "name": "stderr",
433 | "output_type": "stream",
434 | "text": [
435 | "train minibatch loop: 100%|██████████| 1051/1051 [03:24<00:00, 6.44it/s, accuracy=0.778, cost=0.481]\n",
436 | "test minibatch loop: 100%|██████████| 263/263 [00:12<00:00, 20.50it/s, accuracy=0.714, cost=0.719]"
437 | ]
438 | },
439 | {
440 | "name": "stdout",
441 | "output_type": "stream",
442 | "text": [
443 | "time taken: 216.91703510284424\n",
444 | "epoch: 5, training loss: 0.394863, training acc: 0.824674, valid loss: 0.805891, valid acc: 0.582255\n",
445 | "\n",
446 | "break epoch:6\n",
447 | "\n"
448 | ]
449 | },
450 | {
451 | "name": "stderr",
452 | "output_type": "stream",
453 | "text": [
454 | "\n"
455 | ]
456 | }
457 | ],
458 | "source": [
459 | "from tqdm import tqdm\n",
460 | "import time\n",
461 | "\n",
462 | "EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 3, 0, 0, 0\n",
463 | "\n",
464 | "while True:\n",
465 | " lasttime = time.time()\n",
466 | " if CURRENT_CHECKPOINT == EARLY_STOPPING:\n",
467 | " print('break epoch:%d\\n' % (EPOCH))\n",
468 | " break\n",
469 | "\n",
470 | " train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0\n",
471 | " pbar = tqdm(\n",
472 | " range(0, len(train_X), batch_size), desc = 'train minibatch loop'\n",
473 | " )\n",
474 | " for i in pbar:\n",
475 | " batch_x = train_X[i : min(i + batch_size, train_X.shape[0])]\n",
476 | " batch_y = train_Y[i : min(i + batch_size, train_X.shape[0])]\n",
477 | " acc, cost, _ = sess.run(\n",
478 | " [model.accuracy, model.cost, model.optimizer],\n",
479 | " feed_dict = {\n",
480 | " model.Y: batch_y,\n",
481 | " model.X: batch_x,\n",
482 | " model.training: True\n",
483 | " },\n",
484 | " )\n",
485 | " assert not np.isnan(cost)\n",
486 | " train_loss += cost\n",
487 | " train_acc += acc\n",
488 | " pbar.set_postfix(cost = cost, accuracy = acc)\n",
489 | "\n",
490 | " pbar = tqdm(range(0, len(test_X), batch_size), desc = 'test minibatch loop')\n",
491 | " for i in pbar:\n",
492 | " batch_x = test_X[i : min(i + batch_size, test_X.shape[0])]\n",
493 | " batch_y = test_Y[i : min(i + batch_size, test_X.shape[0])]\n",
494 | " acc, cost = sess.run(\n",
495 | " [model.accuracy, model.cost],\n",
496 | " feed_dict = {\n",
497 | " model.Y: batch_y,\n",
498 | " model.X: batch_x,\n",
499 | " model.training: False\n",
500 | " },\n",
501 | " )\n",
502 | " test_loss += cost\n",
503 | " test_acc += acc\n",
504 | " pbar.set_postfix(cost = cost, accuracy = acc)\n",
505 | "\n",
506 | " train_loss /= len(train_X) / batch_size\n",
507 | " train_acc /= len(train_X) / batch_size\n",
508 | " test_loss /= len(test_X) / batch_size\n",
509 | " test_acc /= len(test_X) / batch_size\n",
510 | "\n",
511 | " if test_acc > CURRENT_ACC:\n",
512 | " print(\n",
513 | " 'epoch: %d, pass acc: %f, current acc: %f'\n",
514 | " % (EPOCH, CURRENT_ACC, test_acc)\n",
515 | " )\n",
516 | " CURRENT_ACC = test_acc\n",
517 | " CURRENT_CHECKPOINT = 0\n",
518 | " else:\n",
519 | " CURRENT_CHECKPOINT += 1\n",
520 | " \n",
521 | " print('time taken:', time.time() - lasttime)\n",
522 | " print(\n",
523 | " 'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\\n'\n",
524 | " % (EPOCH, train_loss, train_acc, test_loss, test_acc)\n",
525 | " )\n",
526 | " EPOCH += 1"
527 | ]
528 | },
529 | {
530 | "cell_type": "code",
531 | "execution_count": 16,
532 | "metadata": {},
533 | "outputs": [
534 | {
535 | "name": "stderr",
536 | "output_type": "stream",
537 | "text": [
538 | "test minibatch loop: 100%|██████████| 263/263 [00:12<00:00, 20.85it/s]\n"
539 | ]
540 | }
541 | ],
542 | "source": [
543 | "pbar = tqdm(range(0, len(test_X), batch_size), desc = 'test minibatch loop')\n",
544 | "y_predict = []\n",
545 | "for i in pbar:\n",
546 | " batch_x = test_X[i : min(i + batch_size, test_X.shape[0])]\n",
547 | " batch_y = test_Y[i : min(i + batch_size, test_X.shape[0])]\n",
548 | " logits = sess.run(model.logits,\n",
549 | " feed_dict = {\n",
550 | " model.Y: batch_y,\n",
551 | " model.X: batch_x,\n",
552 | " model.training: False\n",
553 | " },\n",
554 | " )\n",
555 | " logits = np.argmax(logits,1).tolist()\n",
556 | " y_predict.extend(logits)"
557 | ]
558 | },
559 | {
560 | "cell_type": "code",
561 | "execution_count": 17,
562 | "metadata": {},
563 | "outputs": [
564 | {
565 | "name": "stdout",
566 | "output_type": "stream",
567 | "text": [
568 | " precision recall f1-score support\n",
569 | "\n",
570 | " - 0.90 0.39 0.54 5362\n",
571 | " + 0.46 0.92 0.61 3043\n",
572 | "\n",
573 | "avg / total 0.74 0.58 0.57 8405\n",
574 | "\n"
575 | ]
576 | }
577 | ],
578 | "source": [
579 | "from sklearn import metrics\n",
580 | "\n",
581 | "print(metrics.classification_report(test_Y, y_predict, target_names = ['-', '+']))"
582 | ]
583 | },
584 | {
585 | "cell_type": "code",
586 | "execution_count": null,
587 | "metadata": {},
588 | "outputs": [],
589 | "source": []
590 | }
591 | ],
592 | "metadata": {
593 | "kernelspec": {
594 | "display_name": "Python 3",
595 | "language": "python",
596 | "name": "python3"
597 | },
598 | "language_info": {
599 | "codemirror_mode": {
600 | "name": "ipython",
601 | "version": 3
602 | },
603 | "file_extension": ".py",
604 | "mimetype": "text/x-python",
605 | "name": "python",
606 | "nbconvert_exporter": "python",
607 | "pygments_lexer": "ipython3",
608 | "version": "3.6.8"
609 | }
610 | },
611 | "nbformat": 4,
612 | "nbformat_minor": 2
613 | }
614 |
--------------------------------------------------------------------------------
/long-text-classification/README.md:
--------------------------------------------------------------------------------
1 | ## How-to
2 |
3 | 1. Download the dataset,
4 | ```bash
5 | wget https://github.com/huseinzol05/Malaya-Dataset/raw/master/fake-news/compressed-fake-news.zip
6 | ```
7 |
8 | 2. Unzip the dataset,
9 | ```bash
10 | unzip compressed-fake-news.zip
11 | ```
12 |
13 | 3. Run any notebook using Jupyter Notebook.
14 |
--------------------------------------------------------------------------------
/normal-text-classification/3.bert.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# !pip3 install bert-tensorflow --user\n",
10 | "# !wget https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip\n",
11 | "# !unzip multi_cased_L-12_H-768_A-12.zip"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 2,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "import bert\n",
21 | "from bert import run_classifier\n",
22 | "from bert import optimization\n",
23 | "from bert import tokenization\n",
24 | "from bert import modeling\n",
25 | "import numpy as np\n",
26 | "import tensorflow as tf"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 3,
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 | "BERT_VOCAB = 'multi_cased_L-12_H-768_A-12/vocab.txt'\n",
36 | "BERT_INIT_CHKPNT = 'multi_cased_L-12_H-768_A-12/bert_model.ckpt'\n",
37 | "BERT_CONFIG = 'multi_cased_L-12_H-768_A-12/bert_config.json'"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 4,
43 | "metadata": {},
44 | "outputs": [],
45 | "source": [
46 | "from tqdm import tqdm\n",
47 | "import malaya"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": 5,
53 | "metadata": {},
54 | "outputs": [],
55 | "source": [
56 | "import re\n",
57 | "from unidecode import unidecode\n",
58 | "\n",
59 | "tokenizer = malaya.preprocessing._SocialTokenizer().tokenize\n",
60 | "rules_normalizer = malaya.texts._tatabahasa.rules_normalizer\n",
61 | "\n",
62 | "def is_number_regex(s):\n",
63 | " if re.match(\"^\\d+?\\.\\d+?$\", s) is None:\n",
64 | " return s.isdigit()\n",
65 | " return True\n",
66 | "\n",
67 | "def detect_money(word):\n",
68 | " if word[:2] == 'rm' and is_number_regex(word[2:]):\n",
69 | " return True\n",
70 | " else:\n",
71 | " return False\n",
72 | "\n",
73 | "def preprocessing(string):\n",
74 | " tokenized = tokenizer(unidecode(string))\n",
75 | " tokenized = [malaya.stem.naive(w) for w in tokenized]\n",
76 | " tokenized = [w.lower() for w in tokenized if len(w) > 1]\n",
77 | " tokenized = [rules_normalizer.get(w, w) for w in tokenized]\n",
78 | " tokenized = ['' if is_number_regex(w) else w for w in tokenized]\n",
79 | " tokenized = ['' if detect_money(w) else w for w in tokenized]\n",
80 | " return tokenized"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": 6,
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "with open('subjectivity-negative-bm.txt','r') as fopen:\n",
90 | " texts = fopen.read().split('\\n')\n",
91 | "labels = [0] * len(texts)\n",
92 | "\n",
93 | "with open('subjectivity-positive-bm.txt','r') as fopen:\n",
94 | " positive_texts = fopen.read().split('\\n')\n",
95 | "labels += [1] * len(positive_texts)\n",
96 | "texts += positive_texts\n",
97 | "\n",
98 | "assert len(labels) == len(texts)"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": 7,
104 | "metadata": {},
105 | "outputs": [],
106 | "source": [
107 | "# from tqdm import tqdm\n",
108 | "# pbar = tqdm(range(len(texts)))\n",
109 | "# for i in pbar:\n",
110 | "# texts[i] = preprocessing(texts[i])"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": 8,
116 | "metadata": {},
117 | "outputs": [],
118 | "source": [
119 | "MAX_SEQ_LENGTH = 100"
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": 10,
125 | "metadata": {},
126 | "outputs": [],
127 | "source": [
128 | "tokenization.validate_case_matches_checkpoint(False,BERT_INIT_CHKPNT)\n",
129 | "tokenizer = tokenization.FullTokenizer(\n",
130 | " vocab_file=BERT_VOCAB, do_lower_case=False)"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": 11,
136 | "metadata": {},
137 | "outputs": [
138 | {
139 | "data": {
140 | "text/plain": [
141 | "['yang',\n",
142 | " 'muncul',\n",
143 | " 'dari',\n",
144 | " 'jiwa',\n",
145 | " 'manusia',\n",
146 | " 'dan',\n",
147 | " 'menunjukkan',\n",
148 | " 'ciri',\n",
149 | " '-',\n",
150 | " 'ciri',\n",
151 | " 'ab',\n",
152 | " '##stra',\n",
153 | " '##k',\n",
154 | " 'expression',\n",
155 | " '##ism',\n",
156 | " 'ab',\n",
157 | " '##stra',\n",
158 | " '##k',\n",
159 | " 'dan',\n",
160 | " 'pen',\n",
161 | " '##ying',\n",
162 | " '##kir',\n",
163 | " '##an',\n",
164 | " 'graf',\n",
165 | " '##iti',\n",
166 | " 'konst',\n",
167 | " '##ruk',\n",
168 | " '##tivi',\n",
169 | " '##sme',\n",
170 | " 'rus',\n",
171 | " '##sian',\n",
172 | " 'telah',\n",
173 | " 'men',\n",
174 | " '##gua',\n",
175 | " '##tkan',\n",
176 | " 'tempat',\n",
177 | " '##nya',\n",
178 | " 'dalam',\n",
179 | " 'sejarah',\n",
180 | " 'seni',\n",
181 | " 'mode',\n",
182 | " '##n',\n",
183 | " 'ketika',\n",
184 | " 'di',\n",
185 | " '##cipta',\n",
186 | " 'oleh',\n",
187 | " 'artis',\n",
188 | " 'yang',\n",
189 | " 'tidak',\n",
190 | " 'seda',\n",
191 | " '##rkan',\n",
192 | " 'diri',\n",
193 | " 'dengan',\n",
194 | " 'pen',\n",
195 | " '##cap',\n",
196 | " '##aian',\n",
197 | " 'kes',\n",
198 | " '##enia',\n",
199 | " '##n',\n",
200 | " 'mereka']"
201 | ]
202 | },
203 | "execution_count": 11,
204 | "metadata": {},
205 | "output_type": "execute_result"
206 | }
207 | ],
208 | "source": [
209 | "tokenizer.tokenize(texts[1])"
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": 12,
215 | "metadata": {},
216 | "outputs": [
217 | {
218 | "name": "stderr",
219 | "output_type": "stream",
220 | "text": [
221 | "100%|██████████| 9962/9962 [00:04<00:00, 2483.16it/s]\n"
222 | ]
223 | }
224 | ],
225 | "source": [
226 | "input_ids, input_masks, segment_ids = [], [], []\n",
227 | "\n",
228 | "for text in tqdm(texts):\n",
229 | " tokens_a = tokenizer.tokenize(text)\n",
230 | " if len(tokens_a) > MAX_SEQ_LENGTH - 2:\n",
231 | " tokens_a = tokens_a[:(MAX_SEQ_LENGTH - 2)]\n",
232 | " tokens = [\"[CLS]\"] + tokens_a + [\"[SEP]\"]\n",
233 | " segment_id = [0] * len(tokens)\n",
234 | " input_id = tokenizer.convert_tokens_to_ids(tokens)\n",
235 | " input_mask = [1] * len(input_id)\n",
236 | " padding = [0] * (MAX_SEQ_LENGTH - len(input_id))\n",
237 | " input_id += padding\n",
238 | " input_mask += padding\n",
239 | " segment_id += padding\n",
240 | " \n",
241 | " input_ids.append(input_id)\n",
242 | " input_masks.append(input_mask)\n",
243 | " segment_ids.append(segment_id)"
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": 13,
249 | "metadata": {},
250 | "outputs": [],
251 | "source": [
252 | "bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)"
253 | ]
254 | },
255 | {
256 | "cell_type": "code",
257 | "execution_count": 14,
258 | "metadata": {},
259 | "outputs": [],
260 | "source": [
261 | "epoch = 10\n",
262 | "batch_size = 60\n",
263 | "warmup_proportion = 0.1\n",
264 | "num_train_steps = int(len(texts) / batch_size * epoch)\n",
265 | "num_warmup_steps = int(num_train_steps * warmup_proportion)"
266 | ]
267 | },
268 | {
269 | "cell_type": "code",
270 | "execution_count": 15,
271 | "metadata": {},
272 | "outputs": [],
273 | "source": [
274 | "class Model:\n",
275 | " def __init__(\n",
276 | " self,\n",
277 | " dimension_output,\n",
278 | " learning_rate = 2e-5,\n",
279 | " ):\n",
280 | " self.X = tf.placeholder(tf.int32, [None, None])\n",
281 | " self.segment_ids = tf.placeholder(tf.int32, [None, None])\n",
282 | " self.input_masks = tf.placeholder(tf.int32, [None, None])\n",
283 | " self.Y = tf.placeholder(tf.int32, [None])\n",
284 | " \n",
285 | " model = modeling.BertModel(\n",
286 | " config=bert_config,\n",
287 | " is_training=False,\n",
288 | " input_ids=self.X,\n",
289 | " input_mask=self.input_masks,\n",
290 | " token_type_ids=self.segment_ids,\n",
291 | " use_one_hot_embeddings=False)\n",
292 | " \n",
293 | " output_layer = model.get_pooled_output()\n",
294 | " self.logits = tf.layers.dense(output_layer, dimension_output)\n",
295 | " \n",
296 | " self.cost = tf.reduce_mean(\n",
297 | " tf.nn.sparse_softmax_cross_entropy_with_logits(\n",
298 | " logits = self.logits, labels = self.Y\n",
299 | " )\n",
300 | " )\n",
301 | " \n",
302 | " self.optimizer = optimization.create_optimizer(self.cost, learning_rate, \n",
303 | " num_train_steps, num_warmup_steps, False)\n",
304 | " correct_pred = tf.equal(\n",
305 | " tf.argmax(self.logits, 1, output_type = tf.int32), self.Y\n",
306 | " )\n",
307 | " self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))"
308 | ]
309 | },
310 | {
311 | "cell_type": "code",
312 | "execution_count": 16,
313 | "metadata": {},
314 | "outputs": [
315 | {
316 | "name": "stdout",
317 | "output_type": "stream",
318 | "text": [
319 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.\n",
320 | "Instructions for updating:\n",
321 | "Colocations handled automatically by placer.\n",
322 | "WARNING:tensorflow:From /home/jupyter/.local/lib/python3.6/site-packages/bert/modeling.py:671: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.\n",
323 | "Instructions for updating:\n",
324 | "Use keras.layers.dense instead.\n",
325 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow/python/training/learning_rate_decay_v2.py:321: div (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.\n",
326 | "Instructions for updating:\n",
327 | "Deprecated in favor of operator or tf.math.divide.\n",
328 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_ops.py:3066: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.\n",
329 | "Instructions for updating:\n",
330 | "Use tf.cast instead.\n",
331 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow/python/training/saver.py:1266: checkpoint_exists (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.\n",
332 | "Instructions for updating:\n",
333 | "Use standard file APIs to check for files with this prefix.\n",
334 | "INFO:tensorflow:Restoring parameters from multi_cased_L-12_H-768_A-12/bert_model.ckpt\n"
335 | ]
336 | }
337 | ],
338 | "source": [
339 | "dimension_output = 2\n",
340 | "learning_rate = 2e-5\n",
341 | "\n",
342 | "tf.reset_default_graph()\n",
343 | "sess = tf.InteractiveSession()\n",
344 | "model = Model(\n",
345 | " dimension_output,\n",
346 | " learning_rate\n",
347 | ")\n",
348 | "\n",
349 | "sess.run(tf.global_variables_initializer())\n",
350 | "var_lists = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = 'bert')\n",
351 | "saver = tf.train.Saver(var_list = var_lists)\n",
352 | "saver.restore(sess, BERT_INIT_CHKPNT)"
353 | ]
354 | },
355 | {
356 | "cell_type": "code",
357 | "execution_count": 17,
358 | "metadata": {},
359 | "outputs": [],
360 | "source": [
361 | "from sklearn.cross_validation import train_test_split\n",
362 | "\n",
363 | "train_input_ids, test_input_ids, train_input_masks, test_input_masks, train_segment_ids, test_segment_ids, train_Y, test_Y = train_test_split(\n",
364 | " input_ids, input_masks, segment_ids, labels, test_size = 0.2\n",
365 | ")"
366 | ]
367 | },
368 | {
369 | "cell_type": "code",
370 | "execution_count": 18,
371 | "metadata": {},
372 | "outputs": [
373 | {
374 | "name": "stderr",
375 | "output_type": "stream",
376 | "text": [
377 | "train minibatch loop: 100%|██████████| 133/133 [00:53<00:00, 2.64it/s, accuracy=0.959, cost=0.177]\n",
378 | "test minibatch loop: 100%|██████████| 34/34 [00:04<00:00, 7.86it/s, accuracy=0.923, cost=0.135]\n",
379 | "train minibatch loop: 0%| | 0/133 [00:00, ?it/s]"
380 | ]
381 | },
382 | {
383 | "name": "stdout",
384 | "output_type": "stream",
385 | "text": [
386 | "epoch: 0, pass acc: 0.000000, current acc: 0.908372\n",
387 | "time taken: 57.89858078956604\n",
388 | "epoch: 0, training loss: 0.381553, training acc: 0.827651, valid loss: 0.274786, valid acc: 0.908372\n",
389 | "\n"
390 | ]
391 | },
392 | {
393 | "name": "stderr",
394 | "output_type": "stream",
395 | "text": [
396 | "train minibatch loop: 100%|██████████| 133/133 [00:52<00:00, 2.64it/s, accuracy=0.939, cost=0.114] \n",
397 | "test minibatch loop: 100%|██████████| 34/34 [00:04<00:00, 8.23it/s, accuracy=0.923, cost=0.159] \n",
398 | "train minibatch loop: 0%| | 0/133 [00:00, ?it/s]"
399 | ]
400 | },
401 | {
402 | "name": "stdout",
403 | "output_type": "stream",
404 | "text": [
405 | "epoch: 1, pass acc: 0.908372, current acc: 0.932456\n",
406 | "time taken: 56.61021065711975\n",
407 | "epoch: 1, training loss: 0.195364, training acc: 0.929392, valid loss: 0.306439, valid acc: 0.932456\n",
408 | "\n"
409 | ]
410 | },
411 | {
412 | "name": "stderr",
413 | "output_type": "stream",
414 | "text": [
415 | "train minibatch loop: 100%|██████████| 133/133 [00:52<00:00, 2.62it/s, accuracy=1, cost=0.00224] \n",
416 | "test minibatch loop: 100%|██████████| 34/34 [00:04<00:00, 8.20it/s, accuracy=1, cost=0.0409] \n",
417 | "train minibatch loop: 0%| | 0/133 [00:00, ?it/s]"
418 | ]
419 | },
420 | {
421 | "name": "stdout",
422 | "output_type": "stream",
423 | "text": [
424 | "time taken: 56.71245884895325\n",
425 | "epoch: 2, training loss: 0.095202, training acc: 0.972393, valid loss: 0.386368, valid acc: 0.932263\n",
426 | "\n"
427 | ]
428 | },
429 | {
430 | "name": "stderr",
431 | "output_type": "stream",
432 | "text": [
433 | "train minibatch loop: 100%|██████████| 133/133 [00:52<00:00, 2.62it/s, accuracy=1, cost=0.017] \n",
434 | "test minibatch loop: 100%|██████████| 34/34 [00:04<00:00, 8.16it/s, accuracy=1, cost=0.0373] \n",
435 | "train minibatch loop: 0%| | 0/133 [00:00, ?it/s]"
436 | ]
437 | },
438 | {
439 | "name": "stdout",
440 | "output_type": "stream",
441 | "text": [
442 | "epoch: 3, pass acc: 0.932456, current acc: 0.948821\n",
443 | "time taken: 56.79030680656433\n",
444 | "epoch: 3, training loss: 0.054034, training acc: 0.985444, valid loss: 0.299653, valid acc: 0.948821\n",
445 | "\n"
446 | ]
447 | },
448 | {
449 | "name": "stderr",
450 | "output_type": "stream",
451 | "text": [
452 | "train minibatch loop: 100%|██████████| 133/133 [00:52<00:00, 2.63it/s, accuracy=1, cost=0.000184] \n",
453 | "test minibatch loop: 100%|██████████| 34/34 [00:04<00:00, 8.17it/s, accuracy=0.923, cost=0.059]\n",
454 | "train minibatch loop: 0%| | 0/133 [00:00, ?it/s]"
455 | ]
456 | },
457 | {
458 | "name": "stdout",
459 | "output_type": "stream",
460 | "text": [
461 | "time taken: 56.84759020805359\n",
462 | "epoch: 4, training loss: 0.017070, training acc: 0.996737, valid loss: 0.488607, valid acc: 0.931954\n",
463 | "\n"
464 | ]
465 | },
466 | {
467 | "name": "stderr",
468 | "output_type": "stream",
469 | "text": [
470 | "train minibatch loop: 100%|██████████| 133/133 [00:52<00:00, 2.63it/s, accuracy=1, cost=0.000105]\n",
471 | "test minibatch loop: 100%|██████████| 34/34 [00:04<00:00, 8.16it/s, accuracy=0.923, cost=0.758]\n",
472 | "train minibatch loop: 0%| | 0/133 [00:00, ?it/s]"
473 | ]
474 | },
475 | {
476 | "name": "stdout",
477 | "output_type": "stream",
478 | "text": [
479 | "time taken: 56.89115834236145\n",
480 | "epoch: 5, training loss: 0.003643, training acc: 1.000753, valid loss: 0.515469, valid acc: 0.942993\n",
481 | "\n"
482 | ]
483 | },
484 | {
485 | "name": "stderr",
486 | "output_type": "stream",
487 | "text": [
488 | "train minibatch loop: 100%|██████████| 133/133 [00:52<00:00, 2.62it/s, accuracy=1, cost=2.97e-5] \n",
489 | "test minibatch loop: 100%|██████████| 34/34 [00:04<00:00, 8.16it/s, accuracy=0.923, cost=0.668]"
490 | ]
491 | },
492 | {
493 | "name": "stdout",
494 | "output_type": "stream",
495 | "text": [
496 | "time taken: 56.8890597820282\n",
497 | "epoch: 6, training loss: 0.001304, training acc: 1.001129, valid loss: 0.547710, valid acc: 0.945502\n",
498 | "\n",
499 | "break epoch:7\n",
500 | "\n"
501 | ]
502 | },
503 | {
504 | "name": "stderr",
505 | "output_type": "stream",
506 | "text": [
507 | "\n"
508 | ]
509 | }
510 | ],
511 | "source": [
512 | "from tqdm import tqdm\n",
513 | "import time\n",
514 | "\n",
515 | "EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 3, 0, 0, 0\n",
516 | "\n",
517 | "while True:\n",
518 | " lasttime = time.time()\n",
519 | " if CURRENT_CHECKPOINT == EARLY_STOPPING:\n",
520 | " print('break epoch:%d\\n' % (EPOCH))\n",
521 | " break\n",
522 | "\n",
523 | " train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0\n",
524 | " pbar = tqdm(\n",
525 | " range(0, len(train_input_ids), batch_size), desc = 'train minibatch loop'\n",
526 | " )\n",
527 | " for i in pbar:\n",
528 | " index = min(i + batch_size, len(train_input_ids))\n",
529 | " batch_x = train_input_ids[i: index]\n",
530 | " batch_masks = train_input_masks[i: index]\n",
531 | " batch_segment = train_segment_ids[i: index]\n",
532 | " batch_y = train_Y[i: index]\n",
533 | " acc, cost, _ = sess.run(\n",
534 | " [model.accuracy, model.cost, model.optimizer],\n",
535 | " feed_dict = {\n",
536 | " model.Y: batch_y,\n",
537 | " model.X: batch_x,\n",
538 | " model.segment_ids: batch_segment,\n",
539 | " model.input_masks: batch_masks\n",
540 | " },\n",
541 | " )\n",
542 | " assert not np.isnan(cost)\n",
543 | " train_loss += cost\n",
544 | " train_acc += acc\n",
545 | " pbar.set_postfix(cost = cost, accuracy = acc)\n",
546 | "\n",
547 | " pbar = tqdm(range(0, len(test_input_ids), batch_size), desc = 'test minibatch loop')\n",
548 | " for i in pbar:\n",
549 | " index = min(i + batch_size, len(test_input_ids))\n",
550 | " batch_x = test_input_ids[i: index]\n",
551 | " batch_masks = test_input_masks[i: index]\n",
552 | " batch_segment = test_segment_ids[i: index]\n",
553 | " batch_y = test_Y[i: index]\n",
554 | " acc, cost = sess.run(\n",
555 | " [model.accuracy, model.cost],\n",
556 | " feed_dict = {\n",
557 | " model.Y: batch_y,\n",
558 | " model.X: batch_x,\n",
559 | " model.segment_ids: batch_segment,\n",
560 | " model.input_masks: batch_masks\n",
561 | " },\n",
562 | " )\n",
563 | " test_loss += cost\n",
564 | " test_acc += acc\n",
565 | " pbar.set_postfix(cost = cost, accuracy = acc)\n",
566 | "\n",
567 | " train_loss /= len(train_input_ids) / batch_size\n",
568 | " train_acc /= len(train_input_ids) / batch_size\n",
569 | " test_loss /= len(test_input_ids) / batch_size\n",
570 | " test_acc /= len(test_input_ids) / batch_size\n",
571 | "\n",
572 | " if test_acc > CURRENT_ACC:\n",
573 | " print(\n",
574 | " 'epoch: %d, pass acc: %f, current acc: %f'\n",
575 | " % (EPOCH, CURRENT_ACC, test_acc)\n",
576 | " )\n",
577 | " CURRENT_ACC = test_acc\n",
578 | " CURRENT_CHECKPOINT = 0\n",
579 | " else:\n",
580 | " CURRENT_CHECKPOINT += 1\n",
581 | " \n",
582 | " print('time taken:', time.time() - lasttime)\n",
583 | " print(\n",
584 | " 'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\\n'\n",
585 | " % (EPOCH, train_loss, train_acc, test_loss, test_acc)\n",
586 | " )\n",
587 | " EPOCH += 1"
588 | ]
589 | },
590 | {
591 | "cell_type": "code",
592 | "execution_count": null,
593 | "metadata": {},
594 | "outputs": [],
595 | "source": []
596 | }
597 | ],
598 | "metadata": {
599 | "kernelspec": {
600 | "display_name": "Python 3",
601 | "language": "python",
602 | "name": "python3"
603 | },
604 | "language_info": {
605 | "codemirror_mode": {
606 | "name": "ipython",
607 | "version": 3
608 | },
609 | "file_extension": ".py",
610 | "mimetype": "text/x-python",
611 | "name": "python",
612 | "nbconvert_exporter": "python",
613 | "pygments_lexer": "ipython3",
614 | "version": "3.6.8"
615 | }
616 | },
617 | "nbformat": 4,
618 | "nbformat_minor": 2
619 | }
620 |
--------------------------------------------------------------------------------
/normal-text-classification/README.md:
--------------------------------------------------------------------------------
1 | ## How-to
2 |
3 | 1. Download the dataset from here,
4 | ```bash
5 | wget https://github.com/huseinzol05/Malaya-Dataset/raw/master/subjectivity/subjectivity-negative-bm.txt
6 | wget https://github.com/huseinzol05/Malaya-Dataset/raw/master/subjectivity/subjectivity-positive-bm.txt
7 | ```
8 |
9 | 2. Run any notebook using Jupyter Notebook.
10 |
--------------------------------------------------------------------------------
/optical-character-recognition/README.md:
--------------------------------------------------------------------------------
1 | ## How-to
2 |
3 | 1. Download the Malay-to-Jawi dataset from here, https://malaya-dataset.s3-ap-southeast-1.amazonaws.com/jawi-rumi.tar.gz
4 |
5 | 2. Unzip it.
6 |
7 | 3. Run any notebook using Jupyter Notebook.
8 |
--------------------------------------------------------------------------------
/pos-tagging/README.md:
--------------------------------------------------------------------------------
1 | ## How-to
2 |
3 | 1. Download the dataset from here, https://github.com/huseinzol05/Malaya-Dataset/tree/master/part-of-speech
4 |
5 | 2. Run any notebook using Jupyter Notebook.
6 |
7 | **For more models, you can check https://github.com/huseinzol05/NLP-Models-Tensorflow/tree/master/pos-tagging, but the dataset is not in Bahasa Malaysia.**
8 |
--------------------------------------------------------------------------------
/question-answer/README.md:
--------------------------------------------------------------------------------
1 | ## How-to
2 |
3 | 1. Download both the train and test datasets from here, https://github.com/huseinzol05/Malaya-Dataset/tree/master/question-answer.
4 |
5 | 2. Run any notebook using Jupyter Notebook.
6 |
7 | **For more models, you can check https://github.com/huseinzol05/NLP-Models-Tensorflow/tree/master/question-answer, but the dataset is not in Bahasa Malaysia.**
8 |
--------------------------------------------------------------------------------
/semantic-similarity/augmenting.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import json\n",
10 | "import glob\n",
11 | "import re\n",
12 | "import malaya"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 2,
18 | "metadata": {},
19 | "outputs": [],
20 | "source": [
21 | "tokenizer = malaya.preprocessing._SocialTokenizer().tokenize\n",
22 | "\n",
23 | "def is_number_regex(s):\n",
24 | " if re.match(\"^\\d+?\\.\\d+?$\", s) is None:\n",
25 | " return s.isdigit()\n",
26 | " return True\n",
27 | "\n",
28 | "def detect_money(word):\n",
29 | " if word[:2] == 'rm' and is_number_regex(word[2:]):\n",
30 | " return True\n",
31 | " else:\n",
32 | " return False\n",
33 | "\n",
34 | "def preprocessing(string):\n",
35 | " tokenized = tokenizer(string)\n",
36 | " tokenized = [w.lower() for w in tokenized if len(w) > 2]\n",
37 | " tokenized = ['' if is_number_regex(w) else w for w in tokenized]\n",
38 | " tokenized = ['' if detect_money(w) else w for w in tokenized]\n",
39 | " return tokenized"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 3,
45 | "metadata": {},
46 | "outputs": [],
47 | "source": [
48 | "left, right, label = [], [], []\n",
49 | "for file in glob.glob('quora/*.json'):\n",
50 | " with open(file) as fopen:\n",
51 | " x = json.load(fopen)\n",
52 | " for i in x:\n",
53 | " splitted = i[0].split(' <> ')\n",
54 | " if len(splitted) != 2:\n",
55 | " continue\n",
56 | " left.append(splitted[0])\n",
57 | " right.append(splitted[1])\n",
58 | " label.append(i[1])"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 4,
64 | "metadata": {},
65 | "outputs": [
66 | {
67 | "data": {
68 | "text/plain": [
69 | "(403831, 403831, 403831)"
70 | ]
71 | },
72 | "execution_count": 4,
73 | "metadata": {},
74 | "output_type": "execute_result"
75 | }
76 | ],
77 | "source": [
78 | "len(left), len(right), len(label)"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": 5,
84 | "metadata": {},
85 | "outputs": [],
86 | "source": [
87 | "with open('synonym0.json') as fopen:\n",
88 | " s = json.load(fopen)\n",
89 | " \n",
90 | "with open('synonym1.json') as fopen:\n",
91 | " s1 = json.load(fopen)"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": 6,
97 | "metadata": {},
98 | "outputs": [],
99 | "source": [
100 | "synonyms = {}\n",
101 | "for l, r in (s + s1):\n",
102 | " if l not in synonyms:\n",
103 | " synonyms[l] = r + [l]\n",
104 | " else:\n",
105 | " synonyms[l].extend(r)\n",
106 | "synonyms = {k: list(set(v)) for k, v in synonyms.items()}"
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": 7,
112 | "metadata": {},
113 | "outputs": [],
114 | "source": [
115 | "import random\n",
116 | "\n",
117 | "def augmentation(s, maximum = 0.8):\n",
118 | " s = s.lower().split()\n",
119 | " for i in range(int(len(s) * maximum)):\n",
120 | " index = random.randint(0, len(s) - 1)\n",
121 | " word = s[index]\n",
122 | " sy = synonyms.get(word, [word])\n",
123 | " sy = random.choice(sy)\n",
124 | " s[index] = sy\n",
125 | " return s"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": 8,
131 | "metadata": {},
132 | "outputs": [],
133 | "source": [
134 | "train_left, test_left = left[:-50000], left[-50000:]\n",
135 | "train_right, test_right = right[:-50000], right[-50000:]\n",
136 | "train_label, test_label = label[:-50000], label[-50000:]"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": 9,
142 | "metadata": {},
143 | "outputs": [
144 | {
145 | "data": {
146 | "text/plain": [
147 | "(353831, 50000)"
148 | ]
149 | },
150 | "execution_count": 9,
151 | "metadata": {},
152 | "output_type": "execute_result"
153 | }
154 | ],
155 | "source": [
156 | "len(train_left), len(test_left)"
157 | ]
158 | },
159 | {
160 | "cell_type": "code",
161 | "execution_count": 10,
162 | "metadata": {},
163 | "outputs": [
164 | {
165 | "data": {
166 | "text/plain": [
167 | "['apakah maksud mengecap sejati kepada anda?',\n",
168 | " 'apakah maksud pilihan sejati kepada anda?',\n",
169 | " 'apakah maksud mencinta sejati kepada anda?',\n",
170 | " 'apakah maksud mengasihi sejati kepada anda?',\n",
171 | " 'apakah maksud cinta sejati kepada anda?',\n",
172 | " 'apakah maksud menyayangi sejati kepada anda?',\n",
173 | " 'apakah maksud percintaan sejati kepada anda?']"
174 | ]
175 | },
176 | "execution_count": 10,
177 | "metadata": {},
178 | "output_type": "execute_result"
179 | }
180 | ],
181 | "source": [
182 | "aug = [' '.join(augmentation(train_left[0])) for _ in range(10)] + [train_left[0].lower()]\n",
183 | "aug = list(set(aug))\n",
184 | "aug"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": 11,
190 | "metadata": {},
191 | "outputs": [
192 | {
193 | "data": {
194 | "text/plain": [
195 | "['apakah maksud \"cinta sejati\"?']"
196 | ]
197 | },
198 | "execution_count": 11,
199 | "metadata": {},
200 | "output_type": "execute_result"
201 | }
202 | ],
203 | "source": [
204 | "aug = [' '.join(augmentation(train_right[0])) for _ in range(10)] + [train_right[0].lower()]\n",
205 | "aug = list(set(aug))\n",
206 | "aug"
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": 12,
212 | "metadata": {},
213 | "outputs": [
214 | {
215 | "data": {
216 | "text/plain": [
217 | "0"
218 | ]
219 | },
220 | "execution_count": 12,
221 | "metadata": {},
222 | "output_type": "execute_result"
223 | }
224 | ],
225 | "source": [
226 | "train_label[0]"
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": 13,
232 | "metadata": {},
233 | "outputs": [
234 | {
235 | "name": "stderr",
236 | "output_type": "stream",
237 | "text": [
238 | "100%|██████████| 353831/353831 [00:46<00:00, 7536.26it/s]\n"
239 | ]
240 | }
241 | ],
242 | "source": [
243 | "from tqdm import tqdm\n",
244 | "\n",
245 | "LEFT, RIGHT, LABEL = [], [], []\n",
246 | "for i in tqdm(range(len(train_left))):\n",
247 | " aug_left = [' '.join(augmentation(train_left[i])) for _ in range(3)] + [train_left[i].lower()]\n",
248 | " aug_left = list(set(aug_left))\n",
249 | " \n",
250 | " aug_right = [' '.join(augmentation(train_right[i])) for _ in range(3)] + [train_right[i].lower()]\n",
251 | " aug_right = list(set(aug_right))\n",
252 | " \n",
253 | " for l in aug_left:\n",
254 | " for r in aug_right:\n",
255 | " LEFT.append(l)\n",
256 | " RIGHT.append(r)\n",
257 | " LABEL.append(train_label[i])"
258 | ]
259 | },
260 | {
261 | "cell_type": "code",
262 | "execution_count": 14,
263 | "metadata": {},
264 | "outputs": [
265 | {
266 | "data": {
267 | "text/plain": [
268 | "(4136391, 4136391, 4136391)"
269 | ]
270 | },
271 | "execution_count": 14,
272 | "metadata": {},
273 | "output_type": "execute_result"
274 | }
275 | ],
276 | "source": [
277 | "len(LEFT), len(RIGHT), len(LABEL)"
278 | ]
279 | },
280 | {
281 | "cell_type": "code",
282 | "execution_count": 15,
283 | "metadata": {},
284 | "outputs": [
285 | {
286 | "name": "stderr",
287 | "output_type": "stream",
288 | "text": [
289 | "100%|██████████| 4136391/4136391 [10:34<00:00, 6523.13it/s]\n"
290 | ]
291 | }
292 | ],
293 | "source": [
294 | "for i in tqdm(range(len(LEFT))):\n",
295 | " LEFT[i] = preprocessing(LEFT[i])\n",
296 | " RIGHT[i] = preprocessing(RIGHT[i])"
297 | ]
298 | },
299 | {
300 | "cell_type": "code",
301 | "execution_count": 16,
302 | "metadata": {},
303 | "outputs": [
304 | {
305 | "name": "stderr",
306 | "output_type": "stream",
307 | "text": [
308 | "100%|██████████| 50000/50000 [00:06<00:00, 7268.75it/s]\n"
309 | ]
310 | }
311 | ],
312 | "source": [
313 | "for i in tqdm(range(len(test_left))):\n",
314 | " test_left[i] = preprocessing(test_left[i])\n",
315 | " test_right[i] = preprocessing(test_right[i])"
316 | ]
317 | },
318 | {
319 | "cell_type": "code",
320 | "execution_count": 17,
321 | "metadata": {},
322 | "outputs": [],
323 | "source": [
324 | "with open('train-similarity.json', 'w') as fopen:\n",
325 | " json.dump({'left': LEFT, 'right': RIGHT, 'label': LABEL}, fopen)"
326 | ]
327 | },
328 | {
329 | "cell_type": "code",
330 | "execution_count": 18,
331 | "metadata": {},
332 | "outputs": [],
333 | "source": [
334 | "with open('test-similarity.json', 'w') as fopen:\n",
335 | " json.dump({'left': test_left, 'right': test_right, 'label': test_label}, fopen)"
336 | ]
337 | },
338 | {
339 | "cell_type": "code",
340 | "execution_count": null,
341 | "metadata": {},
342 | "outputs": [],
343 | "source": []
344 | }
345 | ],
346 | "metadata": {
347 | "kernelspec": {
348 | "display_name": "Python 3",
349 | "language": "python",
350 | "name": "python3"
351 | },
352 | "language_info": {
353 | "codemirror_mode": {
354 | "name": "ipython",
355 | "version": 3
356 | },
357 | "file_extension": ".py",
358 | "mimetype": "text/x-python",
359 | "name": "python",
360 | "nbconvert_exporter": "python",
361 | "pygments_lexer": "ipython3",
362 | "version": "3.6.8"
363 | }
364 | },
365 | "nbformat": 4,
366 | "nbformat_minor": 2
367 | }
368 |
--------------------------------------------------------------------------------
/sparse-classification/README.md:
--------------------------------------------------------------------------------
1 | ## How-to
2 |
3 | 1. Download and extract the dataset first,
4 | ```bash
5 | wget http://downloads.tatoeba.org/exports/sentences.tar.bz2
6 | bunzip2 sentences.tar.bz2
7 | tar xvf sentences.tar
8 | ```
9 |
10 | 2. Convert it to fastText label format,
11 | ```bash
12 | awk -F"\t" '{print"__label__"$2" "$3}' < sentences.csv | shuf > all.txt
13 | ```
14 |
15 | 3. Run any notebook using Jupyter Notebook.
16 |
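17 | After step 2, every line in `all.txt` is a `__label__` tag built from the second column of `sentences.csv` (the language code) followed by the sentence text. Below is a minimal sketch of parsing it back into labels and texts; splitting on the first space is an assumption about the format produced by the `awk` command above,
18 | ```python
19 | labels, texts = [], []
20 | with open('all.txt') as fopen:
21 |     for line in fopen:
22 |         line = line.strip()
23 |         if not line:
24 |             continue
25 |         # '__label__eng some sentence' -> ('eng', 'some sentence')
26 |         label, text = line.split(' ', 1)
27 |         labels.append(label.replace('__label__', ''))
28 |         texts.append(text)
29 | ```
30 |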
--------------------------------------------------------------------------------
/speech-to-text/README.md:
--------------------------------------------------------------------------------
1 | ## How-to
2 |
3 | 1. Install required libraries,
4 |
5 | ```bash
6 | pip3 install librosa numpy scipy
7 | ```
8 |
9 | 2. Run [generate-labels.ipynb](generate-labels.ipynb) to get [labels-text.txt](labels-text.txt)
10 |
11 | 3. Generate the sentencepiece vocab,
12 |
13 | ```bash
14 | spm_train \
15 | --input=labels-text.txt \
16 | --model_prefix=sp10m.cased.speech \
17 | --vocab_size=400 \
18 | --character_coverage=0.99995 \
19 | --model_type=unigram \
20 | --control_symbols=\,\,\,\,\ \
21 | --user_defined_symbols=\,.,\(,\),\",-,–,£,€ \
22 | --shuffle_input_sentence \
23 | --input_sentence_size=10000000
24 | ```
25 |
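26 | `spm_train` writes `sp10m.cased.speech.model` and `sp10m.cased.speech.vocab` in the directory where it was run. A minimal sketch of loading the trained model with the `sentencepiece` Python package and encoding an arbitrary transcript (the sentence below is just an example),
27 | ```python
28 | import sentencepiece as spm
29 |
30 | sp = spm.SentencePieceProcessor()
31 | sp.Load('sp10m.cased.speech.model')
32 |
33 | # subword pieces and the matching integer ids for an arbitrary sentence
34 | print(sp.EncodeAsPieces('tolong sebut perkataan ini'))
35 | print(sp.EncodeAsIds('tolong sebut perkataan ini'))
36 | ```
37 |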
--------------------------------------------------------------------------------
/speech-to-text/augmentation.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import librosa
3 | import os
4 | import scipy.io.wavfile
5 | import json
6 |
7 |
8 | def change_pitch_speech(samples):
9 | y_pitch_speed = samples.copy()
10 | length_change = np.random.uniform(low = 0.8, high = 1)
11 | speed_fac = 1.0 / length_change
12 | tmp = np.interp(
13 | np.arange(0, len(y_pitch_speed), speed_fac),
14 | np.arange(0, len(y_pitch_speed)),
15 | y_pitch_speed,
16 | )
17 | minlen = min(y_pitch_speed.shape[0], tmp.shape[0])
18 | y_pitch_speed *= 0
19 | y_pitch_speed[0:minlen] = tmp[0:minlen]
20 | return y_pitch_speed
21 |
22 |
23 | def change_amplitude(samples):
24 | y_aug = samples.copy()
25 | dyn_change = np.random.uniform(low = 1.5, high = 3)
26 | return y_aug * dyn_change
27 |
28 |
29 | def add_noise(samples):
30 | y_noise = samples.copy()
31 | noise_amp = 0.01 * np.random.uniform() * np.amax(y_noise)
32 | return y_noise.astype('float64') + noise_amp * np.random.normal(
33 | size = y_noise.shape[0]
34 | )
35 |
36 |
37 | def add_hpss(samples):
38 | y_hpss = librosa.effects.hpss(samples.astype('float64'))
39 | return y_hpss[1]
40 |
41 |
42 | def stretch(samples):
43 |     input_length = len(samples)
44 |     stretching = samples.copy()
45 |     random_stretch = np.random.uniform(low = 0.5, high = 1.3)
46 |     print('random_stretch = ', random_stretch)
47 |     stretching = librosa.effects.time_stretch(
48 |         stretching.astype('float'), random_stretch
49 |     )
50 |     return stretching
51 |
52 |
53 | def random_augmentation(samples):
54 | cp = samples.copy()
55 | if np.random.randint(0, 2):
56 | length_change = np.random.uniform(low = 0.8, high = 1)
57 | speed_fac = 1.0 / length_change
58 | print('resample length_change = ', length_change)
59 | tmp = np.interp(
60 | np.arange(0, len(cp), speed_fac), np.arange(0, len(cp)), cp
61 | )
62 | minlen = min(cp.shape[0], tmp.shape[0])
63 | cp *= 0
64 | cp[0:minlen] = tmp[0:minlen]
65 |
66 | if np.random.randint(0, 2):
67 | dyn_change = np.random.uniform(low = 1.5, high = 3)
68 | print('dyn_change = ', dyn_change)
69 | cp = cp * dyn_change
70 |
71 | if np.random.randint(0, 2):
72 | noise_amp = 0.005 * np.random.uniform() * np.amax(cp)
73 | cp = cp.astype('float64') + noise_amp * np.random.normal(
74 | size = cp.shape[0]
75 | )
76 |
77 | if np.random.randint(0, 2):
78 | timeshift_fac = 0.2 * 2 * (np.random.uniform() - 0.5)
79 | print('timeshift_fac = ', timeshift_fac)
80 | start = int(cp.shape[0] * timeshift_fac)
81 | if start > 0:
82 | cp = np.pad(cp, (start, 0), mode = 'constant')[0 : cp.shape[0]]
83 | else:
84 | cp = np.pad(cp, (0, -start), mode = 'constant')[0 : cp.shape[0]]
85 | return cp
86 |
87 |
88 | with open('train-test.json') as fopen:
89 | wavs = json.load(fopen)['train']
90 |
91 | if not os.path.exists('augment'):
92 | os.makedirs('augment')
93 |
94 | for no, wav in enumerate(wavs):
95 | try:
96 | root, ext = os.path.splitext(wav)
97 | if (no + 1) % 100 == 0:
98 | print(no + 1, root, ext)
99 | root = root.replace('/', '<>')
100 | root = '%s/%s' % ('augment', root)
101 | sample_rate, samples = scipy.io.wavfile.read(wav)
102 | aug = change_pitch_speech(samples)
103 | librosa.output.write_wav(
104 | '%s-1%s' % (root, ext),
105 | aug.astype('float32'),
106 | sample_rate,
107 | norm = True,
108 | )
109 |
110 | aug = change_amplitude(samples)
111 | librosa.output.write_wav(
112 | '%s-2%s' % (root, ext),
113 | aug.astype('float32'),
114 | sample_rate,
115 | norm = True,
116 | )
117 |
118 | aug = add_noise(samples)
119 | librosa.output.write_wav(
120 | '%s-3%s' % (root, ext),
121 | aug.astype('float32'),
122 | sample_rate,
123 | norm = True,
124 | )
125 |
126 | aug = add_hpss(samples)
127 | librosa.output.write_wav(
128 | '%s-4%s' % (root, ext),
129 | aug.astype('float32'),
130 | sample_rate,
131 | norm = True,
132 | )
133 |
134 |         aug = stretch(samples)
135 | librosa.output.write_wav(
136 | '%s-5%s' % (root, ext),
137 | aug.astype('float32'),
138 | sample_rate,
139 | norm = True,
140 | )
141 |
142 | aug = random_augmentation(samples)
143 | librosa.output.write_wav(
144 | '%s-6%s' % (root, ext),
145 | aug.astype('float32'),
146 | sample_rate,
147 | norm = True,
148 | )
149 | except Exception as e:
150 | print(e)
151 | pass
152 |
--------------------------------------------------------------------------------
/speech-to-text/caching.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import librosa
3 | import os
4 | import scipy
5 | import tqdm
6 | import glob
7 | import json
8 |
9 | sampling_rate = 22050
10 | n_fft = 2048
11 | frame_shift = 0.0125
12 | frame_length = 0.05
13 | hop_length = int(sampling_rate * frame_shift)
14 | win_length = int(sampling_rate * frame_length)
15 | n_mels = 80
16 | reduction_factor = 5
17 |
18 |
19 | def get_spectrogram(fpath):
20 | y, sr = librosa.load(fpath, sr = sampling_rate)
21 | D = librosa.stft(
22 | y = y, n_fft = n_fft, hop_length = hop_length, win_length = win_length
23 | )
24 | magnitude = np.abs(D)
25 | power = magnitude ** 2
26 | S = librosa.feature.melspectrogram(S = power, n_mels = n_mels)
27 | return np.transpose(S.astype(np.float32))
28 |
29 |
30 | def reduce_frames(x, r_factor):
31 | T, C = x.shape
32 |     num_paddings = r_factor - (T % r_factor) if T % r_factor != 0 else 0
33 | padded = np.pad(x, [[0, num_paddings], [0, 0]], 'constant')
34 | return np.reshape(padded, (-1, C * r_factor))
35 |
36 |
37 | if not os.path.exists('spectrogram-train'):
38 | os.mkdir('spectrogram-train')
39 |
40 | if not os.path.exists('spectrogram-test'):
41 | os.mkdir('spectrogram-test')
42 |
43 | with open('train-test.json') as fopen:
44 | wavs = json.load(fopen)['train']
45 |
46 | wavs = wavs + glob.glob('augment/*.wav')
47 |
48 | for path in tqdm.tqdm(wavs):
49 | try:
50 | root, ext = os.path.splitext(path)
51 | root = root.replace('/', '-')
52 | spectrogram = get_spectrogram(path)
53 | spectrogram = reduce_frames(spectrogram, reduction_factor)
54 | np.save('spectrogram-train/%s.npy' % (root), spectrogram)
55 | except Exception as e:
56 | print(e)
57 | pass
58 |
59 | with open('train-test.json') as fopen:
60 | wavs = json.load(fopen)['test']
61 |
62 | for path in tqdm.tqdm(wavs):
63 | try:
64 | root, ext = os.path.splitext(path)
65 | root = root.replace('/', '-')
66 | spectrogram = get_spectrogram(path)
67 | spectrogram = reduce_frames(spectrogram, reduction_factor)
68 | np.save('spectrogram-test/%s.npy' % (root), spectrogram)
69 | except Exception as e:
70 | print(e)
71 | pass
72 |
--------------------------------------------------------------------------------
/stemming/README.md:
--------------------------------------------------------------------------------
1 | ## How-to
2 |
3 | 1. Download the dataset from https://github.com/huseinzol05/Malaya-Dataset/tree/master/stemmer
4 |
5 | 2. Run any notebook using Jupyter Notebook.
6 |
7 | **For more models, see https://github.com/huseinzol05/NLP-Models-Tensorflow/tree/master/stemming, but the dataset there is not Bahasa Malaysia.**
8 |
--------------------------------------------------------------------------------
/text-to-speech/1.tacotron/tacotron.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from utils import *
3 |
4 |
5 | def learning_rate_decay(init_lr, global_step, warmup_steps = 4000.0):
6 | step = tf.cast(global_step + 1, dtype = tf.float32)
7 | return (
8 | init_lr
9 | * warmup_steps ** 0.5
10 | * tf.minimum(step * warmup_steps ** -1.5, step ** -0.5)
11 | )
12 |
13 |
14 | max_N = 50
15 | max_T = 120
16 |
17 |
18 | def guided_attention(g = 0.2):
19 | W = np.zeros((max_N, max_T), dtype = np.float32)
20 | for n_pos in range(W.shape[0]):
21 | for t_pos in range(W.shape[1]):
22 | W[n_pos, t_pos] = 1 - np.exp(
23 | -(t_pos / float(max_T) - n_pos / float(max_N)) ** 2
24 | / (2 * g * g)
25 | )
26 | return W
27 |
28 |
29 | def prenet(inputs, num_units = None, is_training = True, scope = 'prenet'):
30 | if num_units is None:
31 | num_units = [embed_size, embed_size // 2]
32 | with tf.variable_scope(scope):
33 | outputs = tf.layers.dense(
34 | inputs,
35 | units = num_units[0],
36 | activation = tf.nn.relu,
37 | name = 'dense1',
38 | )
39 | # outputs = tf.layers.dropout(
40 | # outputs,
41 | # rate = dropout_rate,
42 | # training = is_training,
43 | # name = 'dropout1',
44 | # )
45 | outputs = tf.layers.dense(
46 | outputs,
47 | units = num_units[1],
48 | activation = tf.nn.relu,
49 | name = 'dense2',
50 | )
51 | # outputs = tf.layers.dropout(
52 | # outputs,
53 | # rate = dropout_rate,
54 | # training = is_training,
55 | # name = 'dropout2',
56 | # )
57 | return outputs
58 |
59 |
60 | def bn(
61 | inputs, is_training = True, activation_fn = None, scope = 'bn', reuse = None
62 | ):
63 | inputs_shape = inputs.get_shape()
64 | inputs_rank = inputs_shape.ndims
65 | if inputs_rank in [2, 3, 4]:
66 | if inputs_rank == 2:
67 | inputs = tf.expand_dims(inputs, axis = 1)
68 | inputs = tf.expand_dims(inputs, axis = 2)
69 | elif inputs_rank == 3:
70 | inputs = tf.expand_dims(inputs, axis = 1)
71 |
72 | outputs = tf.contrib.layers.batch_norm(
73 | inputs = inputs,
74 | center = True,
75 | scale = True,
76 | updates_collections = None,
77 | is_training = is_training,
78 | scope = scope,
79 | fused = True,
80 | reuse = reuse,
81 | )
82 | if inputs_rank == 2:
83 | outputs = tf.squeeze(outputs, axis = [1, 2])
84 | elif inputs_rank == 3:
85 | outputs = tf.squeeze(outputs, axis = 1)
86 | else:
87 | outputs = tf.contrib.layers.batch_norm(
88 | inputs = inputs,
89 | center = True,
90 | scale = True,
91 | updates_collections = None,
92 | is_training = is_training,
93 | scope = scope,
94 | reuse = reuse,
95 | fused = False,
96 | )
97 | if activation_fn is not None:
98 | outputs = activation_fn(outputs)
99 |
100 | return outputs
101 |
102 |
103 | def highwaynet(inputs, num_units = None, scope = 'highwaynet'):
104 | if not num_units:
105 | num_units = inputs.get_shape()[-1]
106 | with tf.variable_scope(scope):
107 | H = tf.layers.dense(
108 | inputs, units = num_units, activation = tf.nn.relu, name = 'dense1'
109 | )
110 | T = tf.layers.dense(
111 | inputs,
112 | units = num_units,
113 | activation = tf.nn.sigmoid,
114 | bias_initializer = tf.constant_initializer(-1.0),
115 | name = 'dense2',
116 | )
117 | outputs = H * T + inputs * (1.0 - T)
118 | return outputs
119 |
120 |
121 | def conv1d_banks(inputs, K = 16, is_training = True, scope = 'conv1d_banks'):
122 | with tf.variable_scope(scope):
123 | outputs = tf.layers.conv1d(inputs, embed_size // 2, 1, padding = 'SAME')
124 | for k in range(2, K + 1):
125 | with tf.variable_scope('num_{}'.format(k)):
126 | output = tf.layers.conv1d(
127 | inputs, embed_size // 2, k, padding = 'SAME'
128 | )
129 | outputs = tf.concat((outputs, output), -1)
130 | # outputs = bn(outputs, is_training, tf.nn.relu)
131 | return outputs
132 |
133 |
134 | class Tacotron:
135 | def __init__(self, reuse = None):
136 | self.X = tf.placeholder(tf.int32, (None, None))
137 | lookup_table = tf.get_variable(
138 | 'lookup_table',
139 | dtype = tf.float32,
140 | shape = [len(vocab), embed_size],
141 | initializer = tf.truncated_normal_initializer(
142 | mean = 0.0, stddev = 0.01
143 | ),
144 | )
145 | embedded = tf.nn.embedding_lookup(lookup_table, self.X)
146 | self.Y = tf.placeholder(tf.float32, (None, None, n_mels * resampled))
147 | self.decoder_inputs = tf.concat(
148 | (tf.zeros_like(self.Y[:, :1, :]), self.Y[:, :-1, :]), 1
149 | )
150 | self.gts = tf.convert_to_tensor(guided_attention())
151 | self.decoder_inputs = self.decoder_inputs[:, :, -n_mels:]
152 | self.Z = tf.placeholder(
153 | tf.float32, (None, None, fourier_window_size // 2 + 1)
154 | )
155 | self.training = tf.placeholder(tf.bool, None)
156 | batch_size = tf.shape(self.X)[0]
157 | with tf.variable_scope('encoder', reuse = reuse):
158 | prenet_out_encoder = prenet(embedded, is_training = self.training)
159 | enc = conv1d_banks(
160 | prenet_out_encoder,
161 | K = decoder_num_banks,
162 | is_training = self.training,
163 | )
164 | enc = tf.layers.max_pooling1d(
165 | enc, pool_size = 2, strides = 1, padding = 'same'
166 | )
167 | enc = tf.layers.conv1d(
168 | enc,
169 | embed_size // 2,
170 | 3,
171 | name = 'encoder-conv1-1',
172 | padding = 'SAME',
173 | )
174 | # enc = bn(enc, self.training, tf.nn.relu, scope = 'encoder-conv1-1')
175 | enc = tf.layers.conv1d(
176 | enc,
177 | embed_size // 2,
178 | 3,
179 | name = 'encoder-conv1-2',
180 | padding = 'SAME',
181 | )
182 | # enc = bn(enc, self.training, scope = 'encoder-conv1-2')
183 | enc += prenet_out_encoder
184 | for i in range(num_highwaynet_blocks):
185 | enc = highwaynet(
186 | enc,
187 | num_units = embed_size // 2,
188 | scope = 'encoder-highwaynet-{}'.format(i),
189 | )
190 | with tf.variable_scope('encoder-gru', reuse = reuse):
191 | cell = tf.contrib.rnn.GRUCell(embed_size // 2)
192 | cell_bw = tf.contrib.rnn.GRUCell(embed_size // 2)
193 | outputs, states = tf.nn.bidirectional_dynamic_rnn(
194 | cell, cell_bw, enc, dtype = tf.float32
195 | )
196 | self.memory = tf.concat(outputs, 2)
197 | states = tf.concat(states, 1)
198 | with tf.variable_scope('decoder-1', reuse = reuse):
199 | prenet_out_decoder1 = prenet(
200 | self.decoder_inputs, is_training = self.training
201 | )
202 | with tf.variable_scope('attention-decoder-1', reuse = reuse):
203 | attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
204 | embed_size, self.memory
205 | )
206 | decoder_cell = tf.contrib.rnn.GRUCell(embed_size)
207 | cell_with_attention = tf.contrib.seq2seq.AttentionWrapper(
208 | decoder_cell,
209 | attention_mechanism,
210 | embed_size,
211 | alignment_history = True,
212 | )
213 | encoder_state = cell_with_attention.zero_state(
214 | batch_size, tf.float32
215 | ).clone(cell_state = states)
216 | outputs_attention, state_attention = tf.nn.dynamic_rnn(
217 | cell_with_attention,
218 | prenet_out_decoder1,
219 | initial_state = encoder_state,
220 | dtype = tf.float32,
221 | )
222 | self.alignments = tf.transpose(
223 | state_attention.alignment_history.stack(), [1, 2, 0]
224 | )
225 | with tf.variable_scope('decoder1-gru-1'):
226 | cell = tf.contrib.rnn.GRUCell(embed_size)
227 | outputs, _ = tf.nn.dynamic_rnn(
228 | cell, outputs_attention, dtype = tf.float32
229 | )
230 | outputs_attention += outputs
231 | with tf.variable_scope('decoder1-gru-2'):
232 | cell = tf.contrib.rnn.GRUCell(embed_size)
233 | outputs, _ = tf.nn.dynamic_rnn(
234 | cell, outputs_attention, dtype = tf.float32
235 | )
236 | outputs_attention += outputs
237 | self.Y_hat = tf.layers.dense(outputs_attention, n_mels * resampled)
238 | with tf.variable_scope('decoder-2', reuse = reuse):
239 | out_decoder2 = tf.reshape(self.Y, [tf.shape(self.Y)[0], -1, n_mels])
240 | dec = conv1d_banks(
241 | out_decoder2, K = decoder_num_banks, is_training = self.training
242 | )
243 | dec = tf.layers.max_pooling1d(
244 | dec, pool_size = 2, strides = 1, padding = 'same'
245 | )
246 | dec = tf.layers.conv1d(
247 | dec,
248 | embed_size // 2,
249 | 3,
250 | name = 'decoder-conv1-1',
251 | padding = 'SAME',
252 | )
253 | # dec = bn(dec, self.training, tf.nn.relu, scope = 'decoder-conv1-1')
254 | dec = tf.layers.conv1d(
255 | dec,
256 | embed_size // 2,
257 | 3,
258 | name = 'decoder-conv1-2',
259 | padding = 'SAME',
260 | )
261 | # dec = bn(dec, self.training, scope = 'decoder-conv1-2')
262 | dec = tf.layers.dense(dec, embed_size // 2)
263 | for i in range(4):
264 | dec = highwaynet(
265 | dec,
266 | num_units = embed_size // 2,
267 | scope = 'decoder-highwaynet-{}'.format(i),
268 | )
269 | with tf.variable_scope('decoder-gru', reuse = reuse):
270 | cell = tf.contrib.rnn.GRUCell(embed_size // 2)
271 | cell_bw = tf.contrib.rnn.GRUCell(embed_size // 2)
272 | outputs, _ = tf.nn.bidirectional_dynamic_rnn(
273 | cell, cell_bw, dec, dtype = tf.float32
274 | )
275 | outputs = tf.concat(outputs, 2)
276 | self.Z_hat = tf.layers.dense(outputs, 1 + fourier_window_size // 2)
277 | self.loss1 = tf.reduce_mean(tf.abs(self.Y_hat - self.Y))
278 | self.loss2 = tf.reduce_mean(tf.abs(self.Z_hat - self.Z))
279 | self.loss_bd1 = tf.reduce_mean(
280 | tf.nn.sigmoid_cross_entropy_with_logits(
281 | logits = self.Y_hat, labels = self.Y
282 | )
283 | )
284 | self.A = tf.pad(
285 | self.alignments,
286 | [(0, 0), (0, max_N), (0, max_T)],
287 | mode = 'CONSTANT',
288 | constant_values = -1.0,
289 | )[:, :max_N, :max_T]
290 | self.attention_masks = tf.to_float(tf.not_equal(self.A, -1))
291 | self.loss_att = tf.reduce_sum(
292 | tf.abs(self.A * self.gts) * self.attention_masks
293 | )
294 | self.mask_sum = tf.reduce_sum(self.attention_masks)
295 | self.loss_att /= self.mask_sum
296 | self.loss_bd2 = tf.reduce_mean(
297 | tf.nn.sigmoid_cross_entropy_with_logits(
298 | logits = self.Z_hat, labels = self.Z
299 | )
300 | )
301 |
302 | self.loss = (
303 | self.loss1
304 | + self.loss2
305 | + self.loss_bd1
306 | + self.loss_att
307 | + self.loss_bd2
308 | )
309 |
310 | self.global_step = tf.Variable(
311 | 0, name = 'global_step', trainable = False
312 | )
313 | self.lr = learning_rate_decay(1e-3, global_step = self.global_step)
314 | optimizer = tf.train.AdamOptimizer(learning_rate = self.lr)
315 | self.gvs = optimizer.compute_gradients(self.loss)
316 | self.clipped = []
317 | for grad, var in self.gvs:
318 | grad = tf.clip_by_norm(grad, 5.0)
319 | self.clipped.append((grad, var))
320 | self.optimizer = optimizer.apply_gradients(
321 | self.clipped, global_step = self.global_step
322 | )
323 |
--------------------------------------------------------------------------------
/text-to-speech/1.tacotron/test.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huseinzol05/Bahasa-NLP-Tensorflow/4e6427230e36c2d79ec951c7f2c3501bf75f9a8a/text-to-speech/1.tacotron/test.wav
--------------------------------------------------------------------------------
/text-to-speech/1.tacotron/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import librosa
3 | import copy
4 | from scipy import signal
5 | from scipy.io import wavfile
6 | import matplotlib.pyplot as plt
7 | import seaborn as sns
8 | import os
9 | import unicodedata
10 | import re
11 |
12 | # P: Padding
13 | # E: End of Sentence
14 | path = '../data/'
15 | vocab = "PE abcdefghijklmnopqrstuvwxyz'.?"
16 | max_duration = 10.0
17 | sample_rate = 22050
18 | fourier_window_size = 2048
19 | frame_shift = 0.0125
20 | frame_length = 0.05
21 | hop_length = int(sample_rate * frame_shift)
22 | win_length = int(sample_rate * frame_length)
23 | n_mels = 80
24 | power = 1.2
25 | iteration_griffin = 50
26 | preemphasis = 0.97
27 | max_db = 100
28 | ref_db = 20
29 | embed_size = 256
30 | encoder_num_banks = 16
31 | decoder_num_banks = 8
32 | num_highwaynet_blocks = 4
33 | resampled = 5
34 | dropout_rate = 0.05
35 | learning_rate = 0.001
36 | batch_size = 32
37 |
38 |
39 | def get_spectrogram(audio_file):
40 | y, sr = librosa.load(audio_file, sr = sample_rate)
41 | y, _ = librosa.effects.trim(y)
42 | y = np.append(y[0], y[1:] - preemphasis * y[:-1])
43 | linear = librosa.stft(
44 | y = y,
45 | n_fft = fourier_window_size,
46 | hop_length = hop_length,
47 | win_length = win_length,
48 | )
49 | mag = np.abs(linear)
50 | mel_basis = librosa.filters.mel(sample_rate, fourier_window_size, n_mels)
51 | mel = np.dot(mel_basis, mag)
52 | mel = 20 * np.log10(np.maximum(1e-5, mel))
53 | mag = 20 * np.log10(np.maximum(1e-5, mag))
54 | mel = np.clip((mel - ref_db + max_db) / max_db, 1e-8, 1)
55 | mag = np.clip((mag - ref_db + max_db) / max_db, 1e-8, 1)
56 | return mel.T.astype(np.float32), mag.T.astype(np.float32)
57 |
58 |
59 | def invert_spectrogram(spectrogram):
60 | return librosa.istft(
61 | spectrogram, hop_length, win_length = win_length, window = 'hann'
62 | )
63 |
64 |
65 | def spectrogram2wav(mag):
66 | mag = mag.T
67 | mag = (np.clip(mag, 0, 1) * max_db) - max_db + ref_db
68 | mag = np.power(10.0, mag * 0.05)
69 | wav = griffin_lim(mag)
70 | wav = signal.lfilter([1], [1, -preemphasis], wav)
71 | wav, _ = librosa.effects.trim(wav)
72 | return wav.astype(np.float32)
73 |
74 |
75 | def griffin_lim(spectrogram):
76 | X_best = copy.deepcopy(spectrogram)
77 | for i in range(iteration_griffin):
78 | X_T = invert_spectrogram(X_best)
79 | est = librosa.stft(
80 | X_T, fourier_window_size, hop_length, win_length = win_length
81 | )
82 | phase = est / np.maximum(1e-8, np.abs(est))
83 | X_best = spectrogram * phase
84 | X_T = invert_spectrogram(X_best)
85 | return np.real(X_T)
86 |
87 |
88 | def get_wav(spectrogram):
89 | mag = (np.clip(spectrogram.T, 0, 1) * max_db) - max_db + ref_db
90 | mag = np.power(10.0, mag * 0.05)
91 | wav = griffin_lim(mag)
92 | wav = signal.lfilter([1], [1, -preemphasis], wav)
93 |     return librosa.effects.trim(wav)[0].astype(np.float32)
94 |
95 |
96 | def load_file(path):
97 | fname = os.path.basename(path)
98 | mel, mag = get_spectrogram(path)
99 | t = mel.shape[0]
100 | num_paddings = resampled - (t % resampled) if t % resampled != 0 else 0
101 | mel = np.pad(mel, [[0, num_paddings], [0, 0]], mode = 'constant')
102 | mag = np.pad(mag, [[0, num_paddings], [0, 0]], mode = 'constant')
103 | return fname, mel.reshape((-1, n_mels * resampled)), mag
104 |
105 |
106 | def text_normalize(text):
107 | text = ''.join(
108 | char
109 | for char in unicodedata.normalize('NFD', text)
110 | if unicodedata.category(char) != 'Mn'
111 | )
112 | text = text.lower()
113 | text = re.sub('[^{}]'.format(vocab), ' ', text)
114 | text = re.sub('[ ]+', ' ', text)
115 | return text
116 |
117 |
118 | def get_cached(path):
119 | mel = '../mel/{}.npy'.format(path)
120 | mag = '../mag/{}.npy'.format(path)
121 | return np.load(mel), np.load(mag)
122 |
123 |
124 | def plot_alignment(alignment, e):
125 | fig, ax = plt.subplots()
126 | im = ax.imshow(alignment)
127 | fig.colorbar(im)
128 | plt.title('epoch %d' % (e))
129 | plt.show()
130 |
131 |
132 | char2idx = {char: idx for idx, char in enumerate(vocab)}
133 | idx2char = {idx: char for idx, char in enumerate(vocab)}
134 |
--------------------------------------------------------------------------------
/text-to-speech/README.md:
--------------------------------------------------------------------------------
1 | ## How-to
2 |
3 | 1. Install required libraries,
4 | ```bash
5 | pip3 install librosa numpy scipy
6 | ```
7 |
8 | 2. Download the dataset from https://s3-ap-southeast-1.amazonaws.com/malaya-dataset/speech-bahasa.zip
9 |
10 | 3. Unzip it, and you will get 3 folders,
11 | ```bash
12 | unzip speech-bahasa.zip
13 | ```
14 |
15 | ```text
16 | sebut-perkataan-man
17 | sebut-perkataan-woman
18 | tolong-sebut
19 | ```
20 |
21 | 4. Run [caching.py](caching.py) to cache mag and mel locally,
22 | ```bash
23 | python3 caching.py
24 | ```
25 |
26 | ```text
27 | 1%|▉ | 113/17399 [00:24<1:00:05, 4.79it/s]
28 | ```
29 |
30 | 5. Run any notebook using Jupyter Notebook.
31 |
32 | **For more models, see https://github.com/huseinzol05/NLP-Models-Tensorflow/tree/master/text-to-speech, but the dataset there is not Bahasa Malaysia.**
33 |
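34 | Each cached wav ends up as two numpy files: a mel spectrogram under `mel/` and a magnitude spectrogram under `mag/`, keyed by the original path with `/` replaced by `-` (see [caching.py](caching.py)). A minimal sketch of loading one pair back, mirroring `get_cached` in [utils.py](utils.py); the file name below is only an example,
35 | ```python
36 | import numpy as np
37 |
38 | # 'tolong-sebut/xxx.wav' is cached as 'tolong-sebut-xxx.npy' -- the name here is an example
39 | name = 'tolong-sebut-example'
40 | mel = np.load('mel/{}.npy'.format(name))
41 | mag = np.load('mag/{}.npy'.format(name))
42 | print(mel.shape, mag.shape)  # roughly (frames // 5, 80 * 5) and (frames, 1025)
43 | ```
44 |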
--------------------------------------------------------------------------------
/text-to-speech/caching.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import librosa
3 | import os
4 | import scipy
5 | import tqdm
6 |
7 | sampling_rate = 22050
8 | n_fft = 2048
9 | frame_shift = 0.0125
10 | frame_length = 0.05
11 | fourier_window_size = 2048
12 | max_db = 100
13 | ref_db = 20
14 | preemphasis = 0.97
15 | hop_length = int(sampling_rate * frame_shift)
16 | win_length = int(sampling_rate * frame_length)
17 | n_mels = 80
18 | resampled = 5
19 | reduction_factor = 5
20 |
21 |
22 | def get_spectrogram(audio_file):
23 | y, sr = librosa.load(audio_file, sr = sampling_rate)
24 | y, _ = librosa.effects.trim(y)
25 | y = np.append(y[0], y[1:] - preemphasis * y[:-1])
26 | linear = librosa.stft(
27 | y = y,
28 | n_fft = fourier_window_size,
29 | hop_length = hop_length,
30 | win_length = win_length,
31 | )
32 | mag = np.abs(linear)
33 | mel_basis = librosa.filters.mel(sampling_rate, fourier_window_size, n_mels)
34 | mel = np.dot(mel_basis, mag)
35 | mel = 20 * np.log10(np.maximum(1e-5, mel))
36 | mag = 20 * np.log10(np.maximum(1e-5, mag))
37 | mel = np.clip((mel - ref_db + max_db) / max_db, 1e-8, 1)
38 | mag = np.clip((mag - ref_db + max_db) / max_db, 1e-8, 1)
39 | return mel.T.astype(np.float32), mag.T.astype(np.float32)
40 |
41 |
42 | def load_file(path):
43 | mel, mag = get_spectrogram(path)
44 | t = mel.shape[0]
45 | num_paddings = resampled - (t % resampled) if t % resampled != 0 else 0
46 | mel = np.pad(mel, [[0, num_paddings], [0, 0]], mode = 'constant')
47 | mag = np.pad(mag, [[0, num_paddings], [0, 0]], mode = 'constant')
48 | return mel.reshape((-1, n_mels * resampled)), mag
49 |
50 |
51 | if not os.path.exists('mel'):
52 | os.mkdir('mel')
53 | if not os.path.exists('mag'):
54 | os.mkdir('mag')
55 |
56 | tolong_sebut = [
57 | 'tolong-sebut/' + i for i in os.listdir('tolong-sebut') if '.wav' in i
58 | ]
59 | sebut_perkataan_man = [
60 | 'sebut-perkataan-man/' + i
61 | for i in os.listdir('sebut-perkataan-man')
62 | if '.wav' in i
63 | ]
64 | sebut_perkataan_woman = [
65 | 'sebut-perkataan-woman/' + i
66 | for i in os.listdir('sebut-perkataan-woman')
67 | if '.wav' in i
68 | ]
69 |
70 | wavs = tolong_sebut + sebut_perkataan_man + sebut_perkataan_woman
71 |
72 | for path in tqdm.tqdm(wavs):
73 | try:
74 | mel, mag = load_file(path)
75 | root, ext = os.path.splitext(path)
76 | root = root.replace('/', '-')
77 | np.save('mel/%s.npy' % (root), mel)
78 | np.save('mag/%s.npy' % (root), mag)
79 | except Exception as e:
80 | print(e)
81 | pass
82 |
--------------------------------------------------------------------------------
/text-to-speech/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import librosa
3 | import copy
4 | from scipy import signal
5 | from scipy.io import wavfile
6 | import matplotlib.pyplot as plt
7 | import seaborn as sns
8 | import os
9 | import unicodedata
10 | import re
11 |
12 | # P: Padding
13 | # S: Start of Sentence
14 | # E: End of Sentence
15 | path = '../data/'
16 | vocab = "PSE abcdefghijklmnopqrstuvwxyz'.?"
17 | max_duration = 10.0
18 | sample_rate = 22050
19 | fourier_window_size = 2048
20 | frame_shift = 0.0125
21 | frame_length = 0.05
22 | hop_length = int(sample_rate * frame_shift)
23 | win_length = int(sample_rate * frame_length)
24 | n_mels = 80
25 | power = 1.2
26 | iteration_griffin = 50
27 | preemphasis = 0.97
28 | max_db = 100
29 | ref_db = 20
30 | embed_size = 256
31 | encoder_num_banks = 16
32 | decoder_num_banks = 8
33 | num_highwaynet_blocks = 4
34 | resampled = 5
35 | dropout_rate = 0.5
36 | learning_rate = 0.001
37 | batch_size = 32
38 |
39 |
40 | def get_spectrogram(audio_file):
41 | y, sr = librosa.load(audio_file, sr = sample_rate)
42 | y, _ = librosa.effects.trim(y)
43 | y = np.append(y[0], y[1:] - preemphasis * y[:-1])
44 | linear = librosa.stft(
45 | y = y,
46 | n_fft = fourier_window_size,
47 | hop_length = hop_length,
48 | win_length = win_length,
49 | )
50 | mag = np.abs(linear)
51 | mel_basis = librosa.filters.mel(sample_rate, fourier_window_size, n_mels)
52 | mel = np.dot(mel_basis, mag)
53 | mel = 20 * np.log10(np.maximum(1e-5, mel))
54 | mag = 20 * np.log10(np.maximum(1e-5, mag))
55 | mel = np.clip((mel - ref_db + max_db) / max_db, 1e-8, 1)
56 | mag = np.clip((mag - ref_db + max_db) / max_db, 1e-8, 1)
57 | return mel.T.astype(np.float32), mag.T.astype(np.float32)
58 |
59 |
60 | def invert_spectrogram(spectrogram):
61 | return librosa.istft(
62 | spectrogram, hop_length, win_length = win_length, window = 'hann'
63 | )
64 |
65 |
66 | def spectrogram2wav(mag):
67 | mag = mag.T
68 | mag = (np.clip(mag, 0, 1) * max_db) - max_db + ref_db
69 | mag = np.power(10.0, mag * 0.05)
70 | wav = griffin_lim(mag)
71 | wav = signal.lfilter([1], [1, -preemphasis], wav)
72 | wav, _ = librosa.effects.trim(wav)
73 | return wav.astype(np.float32)
74 |
75 |
76 | def griffin_lim(spectrogram):
77 | X_best = copy.deepcopy(spectrogram)
78 | for i in range(iteration_griffin):
79 | X_T = invert_spectrogram(X_best)
80 | est = librosa.stft(
81 | X_T, fourier_window_size, hop_length, win_length = win_length
82 | )
83 | phase = est / np.maximum(1e-8, np.abs(est))
84 | X_best = spectrogram * phase
85 | X_T = invert_spectrogram(X_best)
86 | return np.real(X_T)
87 |
88 |
89 | def get_wav(spectrogram):
90 | mag = (np.clip(spectrogram.T, 0, 1) * max_db) - max_db + ref_db
91 | mag = np.power(10.0, mag * 0.05)
92 | wav = griffin_lim(mag)
93 | wav = signal.lfilter([1], [1, -preemphasis], wav)
94 |     return librosa.effects.trim(wav)[0].astype(np.float32)
95 |
96 |
97 | def load_file(path):
98 | fname = os.path.basename(path)
99 | mel, mag = get_spectrogram(path)
100 | t = mel.shape[0]
101 | num_paddings = resampled - (t % resampled) if t % resampled != 0 else 0
102 | mel = np.pad(mel, [[0, num_paddings], [0, 0]], mode = 'constant')
103 | mag = np.pad(mag, [[0, num_paddings], [0, 0]], mode = 'constant')
104 | return fname, mel.reshape((-1, n_mels * resampled)), mag
105 |
106 |
107 | def text_normalize(text):
108 | text = ''.join(
109 | char
110 | for char in unicodedata.normalize('NFD', text)
111 | if unicodedata.category(char) != 'Mn'
112 | )
113 | text = text.lower()
114 | text = re.sub('[^{}]'.format(vocab), ' ', text)
115 | text = re.sub('[ ]+', ' ', text)
116 | return text
117 |
118 |
119 | def get_cached(path):
120 | mel = 'mel/{}.npy'.format(path)
121 | mag = 'mag/{}.npy'.format(path)
122 | return np.load(mel), np.load(mag)
123 |
124 |
125 | def plot_alignment(alignment, e):
126 | fig, ax = plt.subplots()
127 | im = ax.imshow(alignment)
128 | fig.colorbar(im)
129 | plt.title('epoch %d' % (e))
130 | plt.show()
131 |
132 |
133 | char2idx = {char: idx for idx, char in enumerate(vocab)}
134 | idx2char = {idx: char for idx, char in enumerate(vocab)}
135 |
--------------------------------------------------------------------------------
/topic-generator/README.md:
--------------------------------------------------------------------------------
1 | ## How-to
2 |
3 | 1. Download the dataset,
4 | ```bash
5 | wget https://github.com/huseinzol05/Malaya-Dataset/raw/master/news/news.zip
6 | ```
7 |
8 | 2. Unzip the dataset,
9 | ```bash
10 | unzip news.zip
11 | ```
12 |
13 | 3. Run any notebook using Jupyter Notebook.
14 |
--------------------------------------------------------------------------------
/topic-modeling/README.md:
--------------------------------------------------------------------------------
1 | ## How-to
2 |
3 | The dataset is included in this directory.
4 |
5 | 1. Run any notebook using Jupyter Notebook.
6 |
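7 | The JSON files here (`politics.json`, `education.json`, `economy.json`, `business.json`) are lists of articles with a `text` field, which is how [lda2vec.ipynb](lda2vec.ipynb) reads them. A minimal sketch of peeking at one file,
8 | ```python
9 | import json
10 |
11 | with open('politics.json') as fopen:
12 |     news = json.load(fopen)
13 |
14 | print(len(news))               # number of articles
15 | print(news[0]['text'][:200])   # first 200 characters of the first article
16 | ```
17 |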
--------------------------------------------------------------------------------
/topic-modeling/lda2vec.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "data": {
10 | "text/plain": [
11 | "'2.3.4'"
12 | ]
13 | },
14 | "execution_count": 1,
15 | "metadata": {},
16 | "output_type": "execute_result"
17 | }
18 | ],
19 | "source": [
20 | "import malaya\n",
21 | "malaya.bump_version"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 2,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "text_split = malaya.texts._text_functions.split_into_sentences\n",
31 | "text_cleaning = malaya.texts._text_functions.summary_textcleaning"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 3,
37 | "metadata": {},
38 | "outputs": [
39 | {
40 | "data": {
41 | "text/plain": [
42 | "11258"
43 | ]
44 | },
45 | "execution_count": 3,
46 | "metadata": {},
47 | "output_type": "execute_result"
48 | }
49 | ],
50 | "source": [
51 | "import json\n",
52 | "\n",
53 | "files = ['politics.json', 'education.json', 'economy.json', 'business.json']\n",
54 | "sentences = []\n",
55 | "for file in files:\n",
56 | " with open(file) as fopen:\n",
57 | " news = json.load(fopen)\n",
58 | " for n in news:\n",
59 | " if len(n['text']) > 50:\n",
60 | " splitted = text_split(n['text'])\n",
61 | " sentences.extend(splitted)\n",
62 | " \n",
63 | "len(sentences)"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": 4,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "sentences = [text_cleaning(s)[1] for s in sentences]"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": 5,
78 | "metadata": {},
79 | "outputs": [],
80 | "source": [
81 | "window_size = 4\n",
82 | "n_topics = 10\n",
83 | "embedding_size = 128\n",
84 | "epoch = 5\n",
85 | "switch_loss = 2"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": 6,
91 | "metadata": {},
92 | "outputs": [],
93 | "source": [
94 | "class LDA2VEC:\n",
95 | " def __init__(\n",
96 | " self,\n",
97 | " num_unique_documents,\n",
98 | " vocab_size,\n",
99 | " num_topics,\n",
100 | " freqs,\n",
101 | " embedding_size = 128,\n",
102 | " num_sampled = 40,\n",
103 | " learning_rate = 1e-3,\n",
104 | " lmbda = 150.0,\n",
105 | " alpha = None,\n",
106 | " power = 0.75,\n",
107 | " batch_size = 32,\n",
108 | " clip_gradients = 5.0,\n",
109 | " **kwargs\n",
110 | " ):\n",
111 | " moving_avgs = tf.train.ExponentialMovingAverage(0.9)\n",
112 | " self.batch_size = batch_size\n",
113 | " self.freqs = freqs\n",
114 | " self.sess = tf.InteractiveSession()\n",
115 | " self.X = tf.placeholder(tf.int32, shape = [None])\n",
116 | " self.Y = tf.placeholder(tf.int64, shape = [None])\n",
117 | " self.DOC = tf.placeholder(tf.int32, shape = [None])\n",
118 | " step = tf.Variable(0, trainable = False, name = 'global_step')\n",
119 | " self.switch_loss = tf.Variable(0, trainable = False)\n",
120 | " train_labels = tf.reshape(self.Y, [-1, 1])\n",
121 | " sampler = tf.nn.fixed_unigram_candidate_sampler(\n",
122 | " train_labels,\n",
123 | " num_true = 1,\n",
124 | " num_sampled = num_sampled,\n",
125 | " unique = True,\n",
126 | " range_max = vocab_size,\n",
127 | " distortion = power,\n",
128 | " unigrams = self.freqs,\n",
129 | " )\n",
130 | "\n",
131 | " self.word_embedding = tf.Variable(\n",
132 | " tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0)\n",
133 | " )\n",
134 | " self.nce_weights = tf.Variable(\n",
135 | " tf.truncated_normal(\n",
136 | " [vocab_size, embedding_size],\n",
137 | " stddev = tf.sqrt(1 / embedding_size),\n",
138 | " )\n",
139 | " )\n",
140 | " self.nce_biases = tf.Variable(tf.zeros([vocab_size]))\n",
141 | " scalar = 1 / np.sqrt(num_unique_documents + num_topics)\n",
142 | " self.doc_embedding = tf.Variable(\n",
143 | " tf.random_normal(\n",
144 | " [num_unique_documents, num_topics],\n",
145 | " mean = 0,\n",
146 | " stddev = 50 * scalar,\n",
147 | " )\n",
148 | " )\n",
149 | " self.topic_embedding = tf.get_variable(\n",
150 | " 'topic_embedding',\n",
151 | " shape = [num_topics, embedding_size],\n",
152 | " dtype = tf.float32,\n",
153 | " initializer = tf.orthogonal_initializer(gain = scalar),\n",
154 | " )\n",
155 | " pivot = tf.nn.embedding_lookup(self.word_embedding, self.X)\n",
156 | " proportions = tf.nn.embedding_lookup(self.doc_embedding, self.DOC)\n",
157 | " doc = tf.matmul(proportions, self.topic_embedding)\n",
158 | " doc_context = doc\n",
159 | " word_context = pivot\n",
160 | " context = tf.add(word_context, doc_context)\n",
161 | " loss_word2vec = tf.reduce_mean(\n",
162 | " tf.nn.nce_loss(\n",
163 | " weights = self.nce_weights,\n",
164 | " biases = self.nce_biases,\n",
165 | " labels = self.Y,\n",
166 | " inputs = context,\n",
167 | " num_sampled = num_sampled,\n",
168 | " num_classes = vocab_size,\n",
169 | " num_true = 1,\n",
170 | " sampled_values = sampler,\n",
171 | " )\n",
172 | " )\n",
173 | " self.fraction = tf.Variable(1, trainable = False, dtype = tf.float32)\n",
174 | "\n",
175 | " n_topics = self.doc_embedding.get_shape()[1].value\n",
176 | " log_proportions = tf.nn.log_softmax(self.doc_embedding)\n",
177 | " if alpha is None:\n",
178 | " alpha = 1.0 / n_topics\n",
179 | " loss = -(alpha - 1) * log_proportions\n",
180 | " prior = tf.reduce_sum(loss)\n",
181 | "\n",
182 | " loss_lda = lmbda * self.fraction * prior\n",
183 | " self.cost = tf.cond(\n",
184 | " step < self.switch_loss,\n",
185 | " lambda: loss_word2vec,\n",
186 | " lambda: loss_word2vec + loss_lda,\n",
187 | " )\n",
188 | " loss_avgs_op = moving_avgs.apply([loss_lda, loss_word2vec, self.cost])\n",
189 | " with tf.control_dependencies([loss_avgs_op]):\n",
190 | " self.optimizer = tf.contrib.layers.optimize_loss(\n",
191 | " self.cost,\n",
192 | " tf.train.get_global_step(),\n",
193 | " learning_rate,\n",
194 | " 'Adam',\n",
195 | " clip_gradients = clip_gradients,\n",
196 | " )\n",
197 | " self.sess.run(tf.global_variables_initializer())\n",
198 | "\n",
199 | " def train(\n",
200 | " self, pivot_words, target_words, doc_ids, num_epochs, switch_loss = 3\n",
201 | " ):\n",
202 | " from tqdm import tqdm\n",
203 | "\n",
204 | " temp_fraction = self.batch_size / len(pivot_words)\n",
205 | " self.sess.run(tf.assign(self.fraction, temp_fraction))\n",
206 | " self.sess.run(tf.assign(self.switch_loss, switch_loss))\n",
207 | " for e in range(num_epochs):\n",
208 | " pbar = tqdm(\n",
209 | " range(0, len(pivot_words), self.batch_size),\n",
210 | " desc = 'minibatch loop',\n",
211 | " )\n",
212 | " for i in pbar:\n",
213 | " batch_x = pivot_words[\n",
214 | " i : min(i + self.batch_size, len(pivot_words))\n",
215 | " ]\n",
216 | " batch_y = target_words[\n",
217 | " i : min(i + self.batch_size, len(pivot_words))\n",
218 | " ]\n",
219 | " batch_doc = doc_ids[\n",
220 | " i : min(i + self.batch_size, len(pivot_words))\n",
221 | " ]\n",
222 | " _, cost = self.sess.run(\n",
223 | " [self.optimizer, self.cost],\n",
224 | " feed_dict = {\n",
225 | " self.X: batch_x,\n",
226 | " self.Y: batch_y,\n",
227 | " self.DOC: batch_doc,\n",
228 | " },\n",
229 | " )\n",
230 | " pbar.set_postfix(cost = cost, epoch = e + 1)"
231 | ]
232 | },
233 | {
234 | "cell_type": "code",
235 | "execution_count": 7,
236 | "metadata": {},
237 | "outputs": [],
238 | "source": [
239 | "import random\n",
240 | "from sklearn.utils import shuffle\n",
241 | "\n",
242 | "def skipgrams(\n",
243 | " sequence,\n",
244 | " vocabulary_size,\n",
245 | " window_size = 4,\n",
246 | " negative_samples = 1.0,\n",
247 | " shuffle = True,\n",
248 | " categorical = False,\n",
249 | " sampling_table = None,\n",
250 | " seed = None,\n",
251 | "):\n",
252 | " couples = []\n",
253 | " labels = []\n",
254 | " for i, wi in enumerate(sequence):\n",
255 | " if not wi:\n",
256 | " continue\n",
257 | " if sampling_table is not None:\n",
258 | " if sampling_table[wi] < random.random():\n",
259 | " continue\n",
260 | "\n",
261 | " window_start = max(0, i - window_size)\n",
262 | " window_end = min(len(sequence), i + window_size + 1)\n",
263 | " for j in range(window_start, window_end):\n",
264 | " if j != i:\n",
265 | " wj = sequence[j]\n",
266 | " if not wj:\n",
267 | " continue\n",
268 | " couples.append([wi, wj])\n",
269 | " if categorical:\n",
270 | " labels.append([0, 1])\n",
271 | " else:\n",
272 | " labels.append(1)\n",
273 | "\n",
274 | " if negative_samples > 0:\n",
275 | " num_negative_samples = int(len(labels) * negative_samples)\n",
276 | " words = [c[0] for c in couples]\n",
277 | " random.shuffle(words)\n",
278 | "\n",
279 | " couples += [\n",
280 | " [words[i % len(words)], random.randint(1, vocabulary_size - 1)]\n",
281 | " for i in range(num_negative_samples)\n",
282 | " ]\n",
283 | " if categorical:\n",
284 | " labels += [[1, 0]] * num_negative_samples\n",
285 | " else:\n",
286 | " labels += [0] * num_negative_samples\n",
287 | "\n",
288 | " if shuffle:\n",
289 | " if seed is None:\n",
290 | " seed = random.randint(0, 10e6)\n",
291 | " random.seed(seed)\n",
292 | " random.shuffle(couples)\n",
293 | " random.seed(seed)\n",
294 | " random.shuffle(labels)\n",
295 | "\n",
296 | " return couples, labels"
297 | ]
298 | },
299 | {
300 | "cell_type": "code",
301 | "execution_count": 8,
302 | "metadata": {},
303 | "outputs": [],
304 | "source": [
305 | "import tensorflow as tf\n",
306 | "from collections import Counter\n",
307 | "from sklearn.feature_extraction.text import CountVectorizer\n",
308 | "import numpy as np\n",
309 | "\n",
310 | "bow = CountVectorizer().fit(sentences)\n",
311 | "transformed = bow.transform(sentences)\n",
312 | "idx_text_clean, len_idx_text_clean = [], []\n",
313 | "for text in transformed:\n",
314 | " splitted = text.nonzero()[1]\n",
315 | " idx_text_clean.append(splitted)\n",
316 | " \n",
317 | "dictionary = {\n",
318 | " i: no for no, i in enumerate(bow.get_feature_names())\n",
319 | " }\n",
320 | "reversed_dictionary = {\n",
321 | " no: i for no, i in enumerate(bow.get_feature_names())\n",
322 | " }\n",
323 | "freqs = transformed.toarray().sum(axis = 0).tolist()\n",
324 | "doc_ids = np.arange(len(idx_text_clean))\n",
325 | "num_unique_documents = doc_ids.max()\n",
326 | "pivot_words, target_words, doc_ids = [], [], []\n",
327 | "for i, t in enumerate(idx_text_clean):\n",
328 | " pairs, _ = skipgrams(\n",
329 | " t,\n",
330 | " vocabulary_size = len(dictionary),\n",
331 | " window_size = window_size,\n",
332 | " shuffle = True,\n",
333 | " negative_samples = 0,\n",
334 | " )\n",
335 | " for pair in pairs:\n",
336 | " temp_data = pair\n",
337 | " pivot_words.append(temp_data[0])\n",
338 | " target_words.append(temp_data[1])\n",
339 | " doc_ids.append(i)\n",
340 | "pivot_words, target_words, doc_ids = shuffle(\n",
341 | " pivot_words, target_words, doc_ids, random_state = 10\n",
342 | ")\n",
343 | "num_unique_documents = len(idx_text_clean)"
344 | ]
345 | },
346 | {
347 | "cell_type": "code",
348 | "execution_count": 9,
349 | "metadata": {},
350 | "outputs": [
351 | {
352 | "name": "stdout",
353 | "output_type": "stream",
354 | "text": [
355 | "WARNING:tensorflow:From /home/huseinzol05/.local/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.\n",
356 | "Instructions for updating:\n",
357 | "Colocations handled automatically by placer.\n",
358 | "WARNING:tensorflow:From /home/huseinzol05/.local/lib/python3.6/site-packages/tensorflow/python/ops/array_grad.py:425: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.\n",
359 | "Instructions for updating:\n",
360 | "Use tf.cast instead.\n"
361 | ]
362 | }
363 | ],
364 | "source": [
365 | "model = LDA2VEC(\n",
366 | " num_unique_documents,\n",
367 | " len(dictionary),\n",
368 | " n_topics,\n",
369 | " freqs,\n",
370 | " embedding_size = embedding_size)"
371 | ]
372 | },
373 | {
374 | "cell_type": "code",
375 | "execution_count": 10,
376 | "metadata": {},
377 | "outputs": [
378 | {
379 | "name": "stderr",
380 | "output_type": "stream",
381 | "text": [
382 | "minibatch loop: 100%|██████████| 45372/45372 [08:43<00:00, 86.60it/s, cost=-2.21e+4, epoch=1]\n",
383 | "minibatch loop: 100%|██████████| 45372/45372 [08:44<00:00, 86.50it/s, cost=-4.71e+4, epoch=2]\n",
384 | "minibatch loop: 100%|██████████| 45372/45372 [08:44<00:00, 86.50it/s, cost=-7.2e+4, epoch=3] \n",
385 | "minibatch loop: 100%|██████████| 45372/45372 [08:44<00:00, 86.44it/s, cost=-9.62e+4, epoch=4]\n",
386 | "minibatch loop: 100%|██████████| 45372/45372 [08:44<00:00, 86.45it/s, cost=-1.19e+5, epoch=5]\n"
387 | ]
388 | }
389 | ],
390 | "source": [
391 | "model.train(\n",
392 | " pivot_words, target_words, doc_ids, epoch, switch_loss = switch_loss\n",
393 | ")"
394 | ]
395 | },
396 | {
397 | "cell_type": "code",
398 | "execution_count": 11,
399 | "metadata": {},
400 | "outputs": [],
401 | "source": [
402 | "doc_embed = model.sess.run(model.doc_embedding)\n",
403 | "topic_embed = model.sess.run(model.topic_embedding)\n",
404 | "word_embed = model.sess.run(model.word_embedding)"
405 | ]
406 | },
407 | {
408 | "cell_type": "code",
409 | "execution_count": 12,
410 | "metadata": {},
411 | "outputs": [
412 | {
413 | "name": "stdout",
414 | "output_type": "stream",
415 | "text": [
416 | "topic 1 : g25 hardiknas doktor kashif kompetensi keep banjir harvest ditargetkan\n",
417 | "topic 2 : g25 izzah kashif ioi halilintar harvest 1984 keep candreva\n",
418 | "topic 3 : g25 hardiknas 1984 hamisah 2001 kashif doktor keep halilintar\n",
419 | "topic 4 : g25 hardiknas keep kashif doktor alfamart lombok diimport washing\n",
420 | "topic 5 : keep g25 harvest halilintar menghiraukan 1984 administrative kejahatan marketplace\n",
421 | "topic 6 : keep ioi kompetensi washing kashif g25 dominan halilintar asuhan\n",
422 | "topic 7 : g25 kashif harvest keep kritis diimport chow escas berbau\n",
423 | "topic 8 : keep citi mulai 1984 escas g25 doktor garis asuhan\n",
424 | "topic 9 : g25 asuhan 2001 gapoktan doktor halilintar umt kashif harmonis\n",
425 | "topic 10 : g25 keep harvest doktor kashif tribunkaltim administrative asuhan halilintar\n"
426 | ]
427 | }
428 | ],
429 | "source": [
430 | "components = topic_embed.dot(word_embed.T)\n",
431 | "for no, topic in enumerate(components):\n",
432 | " topic_string = ' '.join([reversed_dictionary[i]\n",
433 | " for i in topic.argsort()[: -10 : -1]])\n",
434 | " print('topic %d : %s'%(no + 1, topic_string))"
435 | ]
436 | },
437 | {
438 | "cell_type": "code",
439 | "execution_count": null,
440 | "metadata": {},
441 | "outputs": [],
442 | "source": []
443 | }
444 | ],
445 | "metadata": {
446 | "kernelspec": {
447 | "display_name": "Python 3",
448 | "language": "python",
449 | "name": "python3"
450 | },
451 | "language_info": {
452 | "codemirror_mode": {
453 | "name": "ipython",
454 | "version": 3
455 | },
456 | "file_extension": ".py",
457 | "mimetype": "text/x-python",
458 | "name": "python",
459 | "nbconvert_exporter": "python",
460 | "pygments_lexer": "ipython3",
461 | "version": "3.6.7"
462 | }
463 | },
464 | "nbformat": 4,
465 | "nbformat_minor": 2
466 | }
467 |
--------------------------------------------------------------------------------
/word-vector/1.word2vec.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | # In[1]:
4 |
5 |
6 | import word2vec
7 | import numpy as np
8 | import tensorflow as tf
9 | import json
10 | import os
11 | import re
12 | from unidecode import unidecode
13 |
14 | os.environ['CUDA_VISIBLE_DEVICES'] = ''
15 |
16 |
17 | with open('wiki-ms.txt') as fopen:
18 | sentences = fopen.read()
19 |
20 |
21 | def cleaning(string):
22 | string = re.sub(
23 | 'http\S+|www.\S+',
24 | '',
25 | ' '.join(
26 | [i for i in string.split() if i.find('#') < 0 and i.find('@') < 0]
27 | ),
28 | )
29 | string = unidecode(string).replace('.', '. ').replace(',', ', ')
30 | string = re.sub('[^A-Za-z ]+', ' ', string)
31 | string = re.sub(r'[ ]+', ' ', string).strip()
32 | string = ' '.join(
33 | [i for i in re.findall('[\\w\']+|[;:\-\(\)&.,!?"]', string) if len(i)]
34 | )
35 | return string.lower()
36 |
37 |
38 | sentences = cleaning(sentences).split()
39 |
40 | word_array, dictionary, rev_dictionary, num_lines, num_words = word2vec.build_word_array(
41 | sentences, vocab_size = 1000000
42 | )
43 |
44 |
45 | X, Y = word2vec.build_training_set(word_array)
46 | graph_params = {
47 | 'batch_size': 32,
48 | 'vocab_size': np.max(X) + 1,
49 | 'embed_size': 256,
50 | 'hid_size': 256,
51 | 'neg_samples': 128,
52 | 'learn_rate': 0.01,
53 | 'momentum': 0.9,
54 | 'embed_noise': 0.1,
55 | 'hid_noise': 0.3,
56 | 'epoch': 10,
57 | 'optimizer': 'Momentum',
58 | }
59 |
60 |
61 | split = round(X.shape[0] * 0.9)
62 | train_X, train_Y = X[:split, :], Y[:split, :]
63 | test_X, test_Y = X[split:, :], Y[split:, :]
64 |
65 |
66 | model = word2vec.Model(graph_params)
67 | print(
68 | 'model built, vocab size %d, document length %d'
69 | % (np.max(X) + 1, len(word_array))
70 | )
71 |
72 |
73 | embed_weights, nce_weights = model.train(
74 | train_X,
75 | train_Y,
76 | test_X,
77 | test_Y,
78 | graph_params['epoch'],
79 | graph_params['batch_size'],
80 | )
81 |
82 |
83 | import pickle
84 |
85 | with open('word2vec-wiki-256.p', 'wb') as fopen:
86 | pickle.dump(
87 | {
88 | 'dictionary': dictionary,
89 | 'rev_dictionary': rev_dictionary,
90 | 'embed_weights': embed_weights,
91 | 'nce_weights': nce_weights,
92 | },
93 | fopen,
94 | )
95 |
--------------------------------------------------------------------------------
/word-vector/3.fasttext.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import word2vec\n",
10 | "import numpy as np\n",
11 | "import tensorflow as tf\n",
12 | "import json\n",
13 | "import os\n",
14 | "os.environ['CUDA_VISIBLE_DEVICES'] = ''"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "import collections\n",
24 | "from unidecode import unidecode\n",
25 | "import re\n",
26 | "import numpy as np\n",
27 | "import tensorflow as tf\n",
28 | "from sklearn.utils import shuffle\n",
29 | "from scipy.spatial.distance import cdist\n",
30 | "from tqdm import tqdm\n",
31 | "import itertools\n",
32 | "\n",
33 | "def _pad_sequence(\n",
34 | " sequence,\n",
35 | " n,\n",
36 | " pad_left = False,\n",
37 | " pad_right = False,\n",
38 | " left_pad_symbol = None,\n",
39 | " right_pad_symbol = None,\n",
40 | "):\n",
41 | " sequence = iter(sequence)\n",
42 | " if pad_left:\n",
43 | " sequence = itertools.chain((left_pad_symbol,) * (n - 1), sequence)\n",
44 | " if pad_right:\n",
45 | " sequence = itertools.chain(sequence, (right_pad_symbol,) * (n - 1))\n",
46 | " return sequence\n",
47 | "\n",
48 | "\n",
49 | "def ngrams(\n",
50 | " sequence,\n",
51 | " n,\n",
52 | " pad_left = False,\n",
53 | " pad_right = False,\n",
54 | " left_pad_symbol = None,\n",
55 | " right_pad_symbol = None,\n",
56 | "):\n",
57 | " \"\"\"\n",
58 | " generate ngrams\n",
59 | "\n",
60 | " Parameters\n",
61 | " ----------\n",
62 | " sequence : list of str\n",
63 | " list of tokenize words\n",
64 | " n : int\n",
65 | " ngram size\n",
66 | "\n",
67 | " Returns\n",
68 | " -------\n",
69 | " ngram: list\n",
70 | " \"\"\"\n",
71 | " sequence = _pad_sequence(\n",
72 | " sequence, n, pad_left, pad_right, left_pad_symbol, right_pad_symbol\n",
73 | " )\n",
74 | "\n",
75 | " history = []\n",
76 | " while n > 1:\n",
77 | " try:\n",
78 | " next_item = next(sequence)\n",
79 | " except StopIteration:\n",
80 | " return\n",
81 | " history.append(next_item)\n",
82 | " n -= 1\n",
83 | " for item in sequence:\n",
84 | " history.append(item)\n",
85 | " yield tuple(history)\n",
86 | " del history[0]\n",
87 | "\n",
88 | "def generator(word, ngram = (2,3)):\n",
89 | " return [''.join(i) for n in ngram for i in ngrams(word,n)]\n",
90 | "\n",
91 | "def build_dict(word_counter, vocab_size = 50000):\n",
92 | " count = [['PAD', 0], ['UNK', 1], ['START', 2], ['END', 3]]\n",
93 | " count.extend(word_counter.most_common(vocab_size))\n",
94 | " dictionary = dict()\n",
95 | " for word, _ in count:\n",
96 | " dictionary[word] = len(dictionary)\n",
97 | " return dictionary, {word: idx for idx, word in dictionary.items()}\n",
98 | "\n",
99 | "\n",
100 | "def doc2num(word_list, dictionary):\n",
101 | " word_array = []\n",
102 | " for word in word_list:\n",
103 | " words = generator(word)\n",
104 | " word_array.append([dictionary.get(word, 1) for word in words])\n",
105 | " return word_array\n",
106 | "\n",
107 | "\n",
108 | "def build_word_array(sentences, vocab_size):\n",
109 | " word_counter, word_list, num_lines, num_words = counter_words(sentences)\n",
110 | " dictionary, rev_dictionary = build_dict(word_counter, vocab_size)\n",
111 | " word_array = doc2num(word_list, dictionary)\n",
112 | " return word_array, dictionary, rev_dictionary, num_lines, num_words\n",
113 | "\n",
114 | "\n",
115 | "def build_training_set(word_array, maxlen = 100):\n",
116 | " num_words = len(word_array)\n",
117 | " maxlen = max([len(i) for i in word_array]) if not maxlen else maxlen\n",
118 | " x = np.zeros((num_words - 4, maxlen, 4), dtype = np.int32)\n",
119 | " y = np.zeros((num_words - 4, maxlen), dtype = np.int32)\n",
120 | " shift = [-2, -1, 1, 2]\n",
121 | " for idx in range(2, num_words - 2):\n",
122 | " y[idx - 2, :len(word_array[idx])] = word_array[idx][:maxlen]\n",
123 | " for no, s in enumerate(shift):\n",
124 | " x[idx - 2, :len(word_array[idx + s]), no] = word_array[idx + s][:maxlen]\n",
125 | " return x, y\n",
126 | "\n",
127 | "def counter_words(sentences):\n",
128 | " word_counter = collections.Counter()\n",
129 | " word_list = []\n",
130 | " num_lines, num_words = (0, 0)\n",
131 | " for i in sentences:\n",
132 | " words = re.sub('[^\\'\"A-Za-z\\-<> ]+', ' ', unidecode(i))\n",
133 | " word_list.append(words)\n",
134 | " words = generator(words)\n",
135 | " word_counter.update(words)\n",
136 | " num_lines += 1\n",
137 | " num_words += len(words)\n",
138 | " return word_counter, word_list, num_lines, num_words"
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": 3,
144 | "metadata": {},
145 | "outputs": [],
146 | "source": [
147 | "with open('wiki-ms.txt') as fopen:\n",
148 | " sentences = ['<%s>'%(w) for w in fopen.read().split()]"
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": 4,
154 | "metadata": {},
155 | "outputs": [
156 | {
157 | "name": "stdout",
158 | "output_type": "stream",
159 | "text": [
160 | "CPU times: user 11min 31s, sys: 1.48 s, total: 11min 33s\n",
161 | "Wall time: 11min 33s\n"
162 | ]
163 | }
164 | ],
165 | "source": [
166 | "%%time\n",
167 | "word_array, dictionary, rev_dictionary, num_lines, num_words = build_word_array(sentences,\n",
168 | " vocab_size=1000000)"
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": null,
174 | "metadata": {},
175 | "outputs": [
176 | {
177 | "data": {
178 | "text/plain": [
179 | "27169"
180 | ]
181 | },
182 | "execution_count": 5,
183 | "metadata": {},
184 | "output_type": "execute_result"
185 | }
186 | ],
187 | "source": [
188 | "len(dictionary)"
189 | ]
190 | },
191 | {
192 | "cell_type": "code",
193 | "execution_count": null,
194 | "metadata": {},
195 | "outputs": [],
196 | "source": [
197 | "X, Y = build_training_set(word_array[:32])"
198 | ]
199 | },
200 | {
201 | "cell_type": "code",
202 | "execution_count": null,
203 | "metadata": {},
204 | "outputs": [
205 | {
206 | "data": {
207 | "text/plain": [
208 | "(2, 100)"
209 | ]
210 | },
211 | "execution_count": 7,
212 | "metadata": {},
213 | "output_type": "execute_result"
214 | }
215 | ],
216 | "source": [
217 | "Y[:2].shape"
218 | ]
219 | },
220 | {
221 | "cell_type": "code",
222 | "execution_count": null,
223 | "metadata": {},
224 | "outputs": [],
225 | "source": [
226 | "graph_params = {'batch_size': 128,\n",
227 | " 'vocab_size': len(dictionary),\n",
228 | " 'embed_size': 1024,\n",
229 | " 'hid_size': 1024,\n",
230 | " 'neg_samples': 128,\n",
231 | " 'learn_rate': 0.01,\n",
232 | " 'momentum': 0.9,\n",
233 | " 'embed_noise': 0.1,\n",
234 | " 'hid_noise': 0.3,\n",
235 | " 'epoch':10,\n",
236 | " 'optimizer': 'Momentum'}\n",
237 | "maxlen = 100"
238 | ]
239 | },
240 | {
241 | "cell_type": "code",
242 | "execution_count": null,
243 | "metadata": {},
244 | "outputs": [],
245 | "source": [
246 | "class Model:\n",
247 | " def __init__(self, graph_params):\n",
248 | " g_params = graph_params\n",
249 | " tf.reset_default_graph()\n",
250 | " self.sess = tf.InteractiveSession()\n",
251 | " self.X = tf.placeholder(tf.int64, shape = [None, None, 4])\n",
252 | " self.Y = tf.placeholder(tf.int64, shape = [None, None])\n",
253 | " length_X = tf.count_nonzero(self.X, 1)\n",
254 | " length_Y = tf.count_nonzero(self.Y, 1)\n",
255 | " \n",
256 | " w_m2, w_m1, w_p1, w_p2 = tf.unstack(self.X, axis = 2)\n",
257 | " self.embed_weights = tf.Variable(\n",
258 | " tf.random_uniform(\n",
259 | " [g_params['vocab_size'], g_params['embed_size']],\n",
260 | " -g_params['embed_noise'],\n",
261 | " g_params['embed_noise'],\n",
262 | " )\n",
263 | " )\n",
264 | " y = tf.argmax(tf.nn.embedding_lookup(self.embed_weights, self.Y),axis=-1)\n",
265 | " embed_m2 = tf.reduce_mean(tf.nn.embedding_lookup(self.embed_weights, w_m2),axis = 1)\n",
266 | " embed_m1 = tf.reduce_mean(tf.nn.embedding_lookup(self.embed_weights, w_m1),axis = 1)\n",
267 | " embed_p1 = tf.reduce_mean(tf.nn.embedding_lookup(self.embed_weights, w_p1),axis = 1)\n",
268 | " embed_p2 = tf.reduce_mean(tf.nn.embedding_lookup(self.embed_weights, w_p2),axis = 1)\n",
269 | " embed_stack = tf.concat([embed_m2, embed_m1, embed_p1, embed_p2], 1)\n",
270 | " hid_weights = tf.Variable(\n",
271 | " tf.random_normal(\n",
272 | " [g_params['embed_size'] * 4, g_params['hid_size']],\n",
273 | " stddev = g_params['hid_noise']\n",
274 | " / (g_params['embed_size'] * 4) ** 0.5,\n",
275 | " )\n",
276 | " )\n",
277 | " hid_bias = tf.Variable(tf.zeros([g_params['hid_size']]))\n",
278 | " print(embed_stack.shape)\n",
279 | " hid_out = tf.nn.tanh(tf.matmul(embed_stack, hid_weights) + hid_bias)\n",
280 | " self.nce_weights = tf.Variable(\n",
281 | " tf.random_normal(\n",
282 | " [g_params['vocab_size'], g_params['hid_size']],\n",
283 | " stddev = 1.0 / g_params['hid_size'] ** 0.5,\n",
284 | " )\n",
285 | " )\n",
286 | " nce_bias = tf.Variable(tf.zeros([g_params['vocab_size']]))\n",
287 | " self.cost = tf.reduce_mean(\n",
288 | " tf.nn.nce_loss(\n",
289 | " self.nce_weights,\n",
290 | " nce_bias,\n",
291 | " inputs = hid_out,\n",
292 | " labels = y,\n",
293 | " num_sampled = g_params['neg_samples'],\n",
294 | " num_classes = g_params['vocab_size'],\n",
295 | " num_true = maxlen,\n",
296 | " remove_accidental_hits = True,\n",
297 | " )\n",
298 | " )\n",
299 | " if g_params['optimizer'] == 'RMSProp':\n",
300 | " self.optimizer = tf.train.RMSPropOptimizer(\n",
301 | " g_params['learn_rate']\n",
302 | " ).minimize(self.cost)\n",
303 | " elif g_params['optimizer'] == 'Momentum':\n",
304 | " self.optimizer = tf.train.MomentumOptimizer(\n",
305 | " g_params['learn_rate'], g_params['momentum']\n",
306 | " ).minimize(self.cost)\n",
307 | " elif g_params['optimizer'] == 'Adam':\n",
308 | " self.optimizer = tf.train.AdamOptimizer(\n",
309 | " g_params['learn_rate']\n",
310 | " ).minimize(self.cost)\n",
311 | " else:\n",
312 | " print('Optimizer not supported,exit.')\n",
313 | " self.sess.run(tf.global_variables_initializer())\n",
314 | "\n",
315 | " def train(self, train, epoch, batch_size):\n",
316 | " for i in range(epoch):\n",
317 | " pbar = tqdm(\n",
318 | " range(0, len(train), batch_size), desc = 'train minibatch loop'\n",
319 | " )\n",
320 | " for batch in pbar:\n",
321 | " X, Y = build_training_set(train[batch : min(batch + batch_size, len(train))], maxlen = maxlen)\n",
322 | " X, Y = shuffle(X, Y)\n",
323 | " feed_dict = {\n",
324 | " self.X: X,\n",
325 | " self.Y: Y,\n",
326 | " }\n",
327 | " _, loss = self.sess.run(\n",
328 | " [self.optimizer, self.cost], feed_dict = feed_dict\n",
329 | " )\n",
330 | " pbar.set_postfix(cost = loss)\n",
331 | " \n",
332 | " return self.embed_weights.eval(), self.nce_weights.eval()"
333 | ]
334 | },
335 | {
336 | "cell_type": "code",
337 | "execution_count": null,
338 | "metadata": {},
339 | "outputs": [
340 | {
341 | "name": "stdout",
342 | "output_type": "stream",
343 | "text": [
344 | "(?, 2048)\n",
345 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/nn_impl.py:1124: sparse_to_dense (from tensorflow.python.ops.sparse_ops) is deprecated and will be removed in a future version.\n",
346 | "Instructions for updating:\n",
347 | "Create a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.\n"
348 | ]
349 | }
350 | ],
351 | "source": [
352 | "model = Model(graph_params)"
353 | ]
354 | },
355 | {
356 | "cell_type": "code",
357 | "execution_count": null,
358 | "metadata": {},
359 | "outputs": [
360 | {
361 | "name": "stderr",
362 | "output_type": "stream",
363 | "text": [
364 | "train minibatch loop: 49%|████▊ | 135041/278233 [5:03:57<5:09:57, 7.70it/s, cost=2.11e+3] "
365 | ]
366 | }
367 | ],
368 | "source": [
369 | "embed_weights, nce_weights = model.train(word_array,\n",
370 | " graph_params['epoch'],\n",
371 | " graph_params['batch_size'])"
372 | ]
373 | },
374 | {
375 | "cell_type": "code",
376 | "execution_count": null,
377 | "metadata": {},
378 | "outputs": [],
379 | "source": [
380 | "embed_weights, nce_weights = model.embed_weights.eval(), model.nce_weights.eval()"
381 | ]
382 | },
383 | {
384 | "cell_type": "code",
385 | "execution_count": null,
386 | "metadata": {},
387 | "outputs": [],
388 | "source": [
389 | "import pickle\n",
390 | "with open('fasttext-wiki-1024.p', 'wb') as fopen:\n",
391 | " pickle.dump({'dictionary':dictionary,'rev_dictionary':rev_dictionary,\n",
392 | " 'embed_weights':embed_weights,'nce_weights':nce_weights}, fopen)"
393 | ]
394 | }
395 | ],
396 | "metadata": {
397 | "kernelspec": {
398 | "display_name": "Python 3",
399 | "language": "python",
400 | "name": "python3"
401 | },
402 | "language_info": {
403 | "codemirror_mode": {
404 | "name": "ipython",
405 | "version": 3
406 | },
407 | "file_extension": ".py",
408 | "mimetype": "text/x-python",
409 | "name": "python",
410 | "nbconvert_exporter": "python",
411 | "pygments_lexer": "ipython3",
412 | "version": "3.6.8"
413 | }
414 | },
415 | "nbformat": 4,
416 | "nbformat_minor": 2
417 | }
418 |
--------------------------------------------------------------------------------
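For intuition, a simplified, standalone equivalent of the `generator` function in the notebook above; the notebook builds the same character bigrams and trigrams through its `ngrams` helper after wrapping every word in `<` and `>`, and `<makan>` below is just an illustrative token.

```python
# Sketch only: character bigrams and trigrams, as produced by generator() in 3.fasttext.ipynb.
def char_ngrams(word, sizes=(2, 3)):
    # slide a window of each requested size over the characters of the word
    return [word[i:i + n] for n in sizes for i in range(len(word) - n + 1)]

print(char_ngrams('<makan>'))
# ['<m', 'ma', 'ak', 'ka', 'an', 'n>', '<ma', 'mak', 'aka', 'kan', 'an>']
```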
/word-vector/README.md:
--------------------------------------------------------------------------------
1 | ## How-to
2 |
3 | 1. Download the Malay Wikipedia dump from https://dumps.wikimedia.org/mswiki/latest/mswiki-latest-pages-articles.xml.bz2
4 |
5 | 2. Run [make-corpus.py](make-corpus.py),
6 | ```bash
7 | python make-corpus.py mswiki-latest-pages-articles.xml.bz2 wiki-ms.txt
8 | ```
9 |
10 | 3. Run any notebook using Jupyter Notebook.
11 |
12 | ## Notes
13 |
14 | All these word vectors are already implemented in Malaya.
15 |
16 | For word2vec, https://malaya.readthedocs.io/en/latest/Word2vec.html
17 |
18 | For ELMO, https://malaya.readthedocs.io/en/latest/Elmo.html
19 |
20 | For Fast-text, https://malaya.readthedocs.io/en/latest/Fasttext.html
21 |
--------------------------------------------------------------------------------
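Before training, it can help to sanity-check the corpus produced in step 2 of the README above; a minimal sketch, assuming the output file name from that command:

```python
# Sketch only: count articles and words in the generated corpus.
num_lines, num_words = 0, 0
with open('wiki-ms.txt') as fopen:
    for line in fopen:
        num_lines += 1
        num_words += len(line.split())

print('articles: %d, words: %d' % (num_lines, num_words))
```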
/word-vector/load-word2vec.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import malaya"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 2,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "embedded_wiki = malaya.word2vec.load_wiki()"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 4,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "word_vector_wiki = malaya.word2vec.word2vec(embedded_wiki['nce_weights'], embedded_wiki['dictionary'])"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 5,
33 | "metadata": {},
34 | "outputs": [
35 | {
36 | "name": "stdout",
37 | "output_type": "stream",
38 | "text": [
39 | "Embedding layer: 8 closest words to: 'anwar'\n",
40 | "[['zaid', 0.7285637855529785], ['khairy', 0.6839416027069092], ['zabidi', 0.6709405183792114], ['nizar', 0.6695379018783569], ['harussani', 0.6595045328140259], ['shahidan', 0.6565827131271362], ['azalina', 0.6541041135787964], ['shahrizat', 0.6538639068603516]]\n"
41 | ]
42 | }
43 | ],
44 | "source": [
45 | "word = 'anwar'\n",
46 | "print(\"Embedding layer: 8 closest words to: '%s'\"%(word))\n",
47 | "print(word_vector_wiki.n_closest(word=word, num_closest=8, metric='cosine'))"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": null,
53 | "metadata": {},
54 | "outputs": [],
55 | "source": []
56 | }
57 | ],
58 | "metadata": {
59 | "kernelspec": {
60 | "display_name": "Python 3",
61 | "language": "python",
62 | "name": "python3"
63 | },
64 | "language_info": {
65 | "codemirror_mode": {
66 | "name": "ipython",
67 | "version": 3
68 | },
69 | "file_extension": ".py",
70 | "mimetype": "text/x-python",
71 | "name": "python",
72 | "nbconvert_exporter": "python",
73 | "pygments_lexer": "ipython3",
74 | "version": "3.6.8"
75 | }
76 | },
77 | "nbformat": 4,
78 | "nbformat_minor": 2
79 | }
80 |
--------------------------------------------------------------------------------
/word-vector/make-corpus.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from gensim.corpora import WikiCorpus
3 |
4 |
5 | def make_corpus(in_f, out_f):
6 |
7 | """Convert Wikipedia xml dump file to text corpus"""
8 |
9 | output = open(out_f, 'w')
10 | wiki = WikiCorpus(in_f)
11 |
12 | i = 0
13 | for text in wiki.get_texts():
14 | output.write(bytes(' '.join(text), 'utf-8').decode('utf-8') + '\n')
15 | i = i + 1
16 | if i % 10000 == 0:
17 | print('Processed ' + str(i) + ' articles')
18 | output.close()
19 | print('Processing complete!')
20 |
21 |
22 | if __name__ == '__main__':
23 |
24 | if len(sys.argv) != 3:
25 | print(
26 |             'Usage: python make-corpus.py <wikipedia_dump_file> <output_text_file>'
27 | )
28 | sys.exit(1)
29 | in_f = sys.argv[1]
30 | out_f = sys.argv[2]
31 | make_corpus(in_f, out_f)
32 |
--------------------------------------------------------------------------------
/word-vector/word2vec.py:
--------------------------------------------------------------------------------
1 | import collections
2 | import re
3 | import numpy as np
4 | import tensorflow as tf
5 | from sklearn.utils import shuffle
6 | from sklearn.manifold import TSNE
7 | from scipy.spatial.distance import cdist
8 | from tqdm import tqdm
9 |
10 |
11 | def counter_words(sentences):
12 | word_counter = collections.Counter()
13 | word_list = []
14 | num_lines, num_words = (0, 0)
15 | for i in sentences:
16 | words = re.findall('[\\w\']+|[;:\-\(\)&.,!?"]', i)
17 | word_counter.update(words)
18 | word_list.extend(words)
19 | num_lines += 1
20 | num_words += len(words)
21 | return word_counter, word_list, num_lines, num_words
22 |
23 |
24 | def build_dict(word_counter, vocab_size = 50000):
25 | count = [['PAD', 0], ['UNK', 1], ['START', 2], ['END', 3]]
26 | count.extend(word_counter.most_common(vocab_size))
27 | dictionary = dict()
28 | for word, _ in count:
29 | dictionary[word] = len(dictionary)
30 | return dictionary, {word: idx for idx, word in dictionary.items()}
31 |
32 |
33 | def doc2num(word_list, dictionary):
34 | word_array = []
35 | unknown_val = len(dictionary)
36 | for word in word_list:
37 | word_array.append(dictionary.get(word, unknown_val))
38 | return np.array(word_array, dtype = np.int32)
39 |
40 |
41 | def build_word_array(sentences, vocab_size):
42 | word_counter, word_list, num_lines, num_words = counter_words(sentences)
43 | dictionary, rev_dictionary = build_dict(word_counter, vocab_size)
44 | word_array = doc2num(word_list, dictionary)
45 | return word_array, dictionary, rev_dictionary, num_lines, num_words
46 |
47 |
48 | def build_training_set(word_array):
49 | num_words = len(word_array)
50 | x = np.zeros((num_words - 4, 4), dtype = np.int32)
51 | y = np.zeros((num_words - 4, 1), dtype = np.int32)
52 | shift = np.array([-2, -1, 1, 2], dtype = np.int32)
53 | for idx in range(2, num_words - 2):
54 | y[idx - 2, 0] = word_array[idx]
55 | x[idx - 2, :] = word_array[idx + shift]
56 | return x, y
57 |
58 |
59 | class Model:
60 | def __init__(self, graph_params):
61 | g_params = graph_params
62 | tf.reset_default_graph()
63 | self.sess = tf.InteractiveSession()
64 | self.X = tf.placeholder(tf.int64, shape = [None, 4])
65 | self.Y = tf.placeholder(tf.int64, shape = [None, 1])
66 | w_m2, w_m1, w_p1, w_p2 = tf.unstack(self.X, axis = 1)
67 | self.embed_weights = tf.Variable(
68 | tf.random_uniform(
69 | [g_params['vocab_size'], g_params['embed_size']],
70 | -g_params['embed_noise'],
71 | g_params['embed_noise'],
72 | )
73 | )
74 | embed_m2 = tf.nn.embedding_lookup(self.embed_weights, w_m2)
75 | embed_m1 = tf.nn.embedding_lookup(self.embed_weights, w_m1)
76 | embed_p1 = tf.nn.embedding_lookup(self.embed_weights, w_p1)
77 | embed_p2 = tf.nn.embedding_lookup(self.embed_weights, w_p2)
78 | embed_stack = tf.concat([embed_m2, embed_m1, embed_p1, embed_p2], 1)
79 | hid_weights = tf.Variable(
80 | tf.random_normal(
81 | [g_params['embed_size'] * 4, g_params['hid_size']],
82 | stddev = g_params['hid_noise']
83 | / (g_params['embed_size'] * 4) ** 0.5,
84 | )
85 | )
86 | hid_bias = tf.Variable(tf.zeros([g_params['hid_size']]))
87 | hid_out = tf.nn.tanh(tf.matmul(embed_stack, hid_weights) + hid_bias)
88 | self.nce_weights = tf.Variable(
89 | tf.random_normal(
90 | [g_params['vocab_size'], g_params['hid_size']],
91 | stddev = 1.0 / g_params['hid_size'] ** 0.5,
92 | )
93 | )
94 | nce_bias = tf.Variable(tf.zeros([g_params['vocab_size']]))
95 | self.cost = tf.reduce_mean(
96 | tf.nn.nce_loss(
97 | self.nce_weights,
98 | nce_bias,
99 | inputs = hid_out,
100 | labels = self.Y,
101 | num_sampled = g_params['neg_samples'],
102 | num_classes = g_params['vocab_size'],
103 | num_true = 1,
104 | remove_accidental_hits = True,
105 | )
106 | )
107 | self.logits = tf.argmax(
108 | tf.matmul(hid_out, self.nce_weights, transpose_b = True) + nce_bias,
109 | axis = 1,
110 | )
111 | if g_params['optimizer'] == 'RMSProp':
112 | self.optimizer = tf.train.RMSPropOptimizer(
113 | g_params['learn_rate']
114 | ).minimize(self.cost)
115 | elif g_params['optimizer'] == 'Momentum':
116 | self.optimizer = tf.train.MomentumOptimizer(
117 | g_params['learn_rate'], g_params['momentum']
118 | ).minimize(self.cost)
119 | elif g_params['optimizer'] == 'Adam':
120 | self.optimizer = tf.train.AdamOptimizer(
121 | g_params['learn_rate']
122 | ).minimize(self.cost)
123 | else:
124 |             print('Optimizer not supported, exit.')
125 | self.sess.run(tf.global_variables_initializer())
126 |
127 | def train(self, X, Y, X_val, Y_val, epoch, batch_size):
128 | for i in range(epoch):
129 | X, Y = shuffle(X, Y)
130 | pbar = tqdm(
131 | range(0, len(X), batch_size), desc = 'train minibatch loop'
132 | )
133 | for batch in pbar:
134 | feed_dict = {
135 | self.X: X[batch : min(batch + batch_size, len(X))],
136 | self.Y: Y[batch : min(batch + batch_size, len(X))],
137 | }
138 | _, loss = self.sess.run(
139 | [self.optimizer, self.cost], feed_dict = feed_dict
140 | )
141 | pbar.set_postfix(cost = loss)
142 |
143 | pbar = tqdm(
144 | range(0, len(X_val), batch_size), desc = 'test minibatch loop'
145 | )
146 | for batch in pbar:
147 | feed_dict = {
148 | self.X: X_val[batch : min(batch + batch_size, len(X_val))],
149 | self.Y: Y_val[batch : min(batch + batch_size, len(X_val))],
150 | }
151 | loss = self.sess.run(self.cost, feed_dict = feed_dict)
152 | pbar.set_postfix(cost = loss)
153 | return self.embed_weights.eval(), self.nce_weights.eval()
154 |
155 |
156 | class Word2Vec:
157 | def __init__(self, embed_matrix, dictionary):
158 | self._embed_matrix = embed_matrix
159 | self._dictionary = dictionary
160 | self._reverse_dictionary = {v: k for k, v in dictionary.items()}
161 |
162 | def get_vector_by_name(self, word):
163 | return np.ravel(self._embed_matrix[self._dictionary[word], :])
164 |
165 | def n_closest(self, word, num_closest = 5, metric = 'cosine'):
166 | wv = self.get_vector_by_name(word)
167 | closest_indices = self.closest_row_indices(wv, num_closest + 1, metric)
168 | word_list = []
169 | for i in closest_indices:
170 | word_list.append(self._reverse_dictionary[i])
171 | if word in word_list:
172 | word_list.remove(word)
173 | return word_list
174 |
175 | def closest_row_indices(self, wv, num, metric):
176 | dist_array = np.ravel(
177 | cdist(self._embed_matrix, wv.reshape((1, -1)), metric = metric)
178 | )
179 | sorted_indices = np.argsort(dist_array)
180 | return sorted_indices[:num]
181 |
182 | def analogy(self, a, b, c, num = 1, metric = 'cosine'):
183 | va = self.get_vector_by_name(a)
184 | vb = self.get_vector_by_name(b)
185 | vc = self.get_vector_by_name(c)
186 | vd = vb - va + vc
187 | closest_indices = self.closest_row_indices(vd, num, metric)
188 | d_word_list = []
189 | for i in closest_indices:
190 | d_word_list.append(self._reverse_dictionary[i])
191 | return d_word_list
192 |
193 | def project_2d(self, start, end):
194 | tsne = TSNE(n_components = 2)
195 | embed_2d = tsne.fit_transform(self._embed_matrix[start:end, :])
196 | word_list = []
197 | for i in range(start, end):
198 | word_list.append(self._reverse_dictionary[i])
199 | return embed_2d, word_list
200 |
--------------------------------------------------------------------------------
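To make the context construction in `build_training_set` above concrete, a small worked example, assuming `word2vec.py` is importable: each row of `x` holds the word ids at offsets -2, -1, +1 and +2 around a centre word, and `y` holds the centre word id, i.e. the CBOW-style pairs fed to the NCE loss.

```python
# Sketch only: how build_training_set from word2vec.py pairs context windows with centre words.
import numpy as np

from word2vec import build_training_set

word_array = np.array([10, 11, 12, 13, 14, 15], dtype=np.int32)
x, y = build_training_set(word_array)

print(x)  # [[10 11 13 14]
          #  [11 12 14 15]]
print(y)  # [[12]
          #  [13]]
```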