├── .gitignore ├── LICENSE ├── README.md ├── abstractive-summarization ├── 1.lstm-seq2seq-greedy.ipynb ├── 10.bert-dilated-fair.ipynb ├── 11.self-attention-pointer-generator.ipynb ├── 12.dilated-fairseq-pointer-generator.ipynb ├── 2.lstm-seq2seq-greedy-luong.ipynb ├── 3.lstm-seq2seq-beam.ipynb ├── 4.lstm-birnn-seq2seq-beam-luong.ipynb ├── 5.xueyouluo-pointer-generator-bahdanau.ipynb ├── 6.copynet.ipynb ├── 7.xueyouluo-pointer-generator-luong.ipynb ├── 8.dilated-seq2seq.ipynb ├── 9.dilated-seq2seq-self-attention.ipynb ├── README.md ├── dataset.tar.gz ├── dataset │ ├── ctexts.json │ └── headlines.json └── pointer_generator_helper.py ├── attention ├── 1.bahdanau.ipynb ├── 2.luong.ipynb ├── 3.hierarchical.ipynb ├── 4.additive.ipynb ├── 5.soft.ipynb ├── 6.attention-over-attention.ipynb ├── 7.bahdanau-api.ipynb └── 8.luong-api.ipynb ├── chatbot ├── 1.basic-seq2seq-manual.ipynb ├── 10.basic-birnn-seq2seq-greedy.ipynb ├── 11.lstm-birnn-seq2seq-greedy.ipynb ├── 12.gru-birnn-seq2seq-greedy.ipynb ├── 13.basic-seq2seq-luong.ipynb ├── 14.lstm-seq2seq-luong.ipynb ├── 15.gru-seq2seq-luong.ipynb ├── 16.basic-seq2seq-bahdanau.ipynb ├── 17.lstm-seq2seq-bahdanau.ipynb ├── 18.gru-seq2seq-bahdanau.ipynb ├── 19.lstm-birnn-seq2seq-luong.ipynb ├── 2.lstm-seq2seq-manual.ipynb ├── 20.gru-birnn-seq2seq-luong.ipynb ├── 21.lstm-birnn-seq2seq-bahdanau.ipynb ├── 22.gru-birnn-seq2seq-bahdanau.ipynb ├── 23.lstm-birnn-seq2seq-bahdanau-luong.ipynb ├── 24.gru-birnn-seq2seq-bahdanau-luong.ipynb ├── 25.lstm-seq2seq-greedy-luong.ipynb ├── 26.gru-seq2seq-greedy-luong.ipynb ├── 27.lstm-seq2seq-greedy-bahdanau.ipynb ├── 28.gru-seq2seq-greedy-bahdanau.ipynb ├── 29.lstm-seq2seq-beam.ipynb ├── 3.gru-seq2seq-manual.ipynb ├── 30.gru-seq2seq-beam.ipynb ├── 31.lstm-birnn-seq2seq-beam-luong.ipynb ├── 32.gru-birnn-seq2seq-beam-luong.ipynb ├── 33.lstm-birnn-seq2seq-luong-bahdanau-stack-beam.ipynb ├── 34.gru-birnn-seq2seq-luong-bahdanau-stack-beam.ipynb ├── 35.byte-net.ipynb ├── 35.byte-net.py ├── 36.estimator.ipynb 
├── 37.capsule-lstm-seq2seq-greedy.ipynb ├── 37.capsule-lstm-seq2seq-greedy.py ├── 38.capsule-lstm-seq2seq-luong-beam.ipynb ├── 38.capsule-lstm-seq2seq-luong-beam.py ├── 39.lstm-birnn-seq2seq-luong-bahdanau-stack-beam-dropout-l2.ipynb ├── 4.basic-seq2seq-api-greedy.ipynb ├── 40.dnc-seq2seq-bahdanau-greedy.ipynb ├── 41.lstm-birnn-seq2seq-beam-luongmonotic.ipynb ├── 42.lstm-birnn-seq2seq-beam-bahdanaumonotic.ipynb ├── 43.memory-network-basic.ipynb ├── 44.memory-network-lstm.ipynb ├── 45.attention-is-all-you-need.ipynb ├── 46.transformer-xl.ipynb ├── 47.attention-is-all-you-need-beam-search.ipynb ├── 48.transformer-xl-lstm.ipynb ├── 49.gpt-2-lstm.ipynb ├── 5.lstm-seq2seq-api-greedy.ipynb ├── 50.conv-encoder-conv-decoder.ipynb ├── 51.conv-encoder-lstm.ipynb ├── 52.tacotron-greedy.ipynb ├── 53.tacotron-beam.ipynb ├── 54.google-nmt.ipynb ├── 6.gru-seq2seq-greedy.ipynb ├── 7.basic-birnn-seq2seq-manual.ipynb ├── 8.lstm-birnn-seq2seq-manual.ipynb ├── 9.gru-birnn-seq2seq-manual.ipynb ├── README.md ├── access.py ├── addressing.py ├── dataset.tar.gz ├── dnc.py ├── gpt_2.py └── util.py ├── classification-comparison ├── Deep-learning │ ├── LNLSTM-vector.ipynb │ ├── bidirectional-rnn-vector.ipynb │ ├── cnn-rnn-vector.ipynb │ ├── cnn-vector.ipynb │ ├── feedforward-vector.ipynb │ ├── kmax-conv-vector.ipynb │ ├── multihead-attention.ipynb │ ├── ntm-vector.ipynb │ ├── only-attention-vector.ipynb │ ├── rnn-attention-vector.ipynb │ ├── rnn-timestamp.ipynb │ ├── rnn-vector-hinge.ipynb │ ├── rnn-vector-huber.ipynb │ ├── rnn-vector-stack.ipynb │ ├── rnn-vector.ipynb │ ├── self-optimized-feedforward-timestamp.ipynb │ ├── seq2seq-vector-stable.ipynb │ └── seq2seq-vector.ipynb ├── Ensemble │ ├── featuring-ensemble.ipynb │ └── oracle.ipynb ├── LGB │ ├── lgb-tfidf-svd50.ipynb │ ├── lgb-tfidf.ipynb │ ├── lgb-timestamp.ipynb │ └── nce-vector-lgb.ipynb ├── NB-SVM │ └── NB-SVM.ipynb ├── Naive-Bayes │ └── Bayes classifier.ipynb ├── README.md ├── SVM │ └── SVM.ipynb ├── XGB │ ├── xgb-bow.ipynb │ ├── 
xgb-tfidf-svd50.ipynb │ ├── xgb-tfidf.ipynb │ ├── xgb-timestamp-avg.ipynb │ └── xgb-timestamp50.ipynb └── preparation │ ├── dictionary_emotion.p │ ├── prepare-dataset.ipynb │ ├── prepare-vocab.ipynb │ └── word-vector.ipynb ├── dependency-parser ├── 1.lstm-birnn-crf-biaffine.ipynb ├── 2.lstm-birnn-bahdanau-crf-biaffine.ipynb ├── 3.lstm-birnn-luong-crf-biaffine.ipynb ├── 4.bert-crf-biaffine.ipynb ├── 5.biaffine-attention-cross-entropy.ipynb ├── 6.bert-biaffine-attention-cross-entropy.ipynb ├── 7.stackpointer.ipynb ├── 8.xlnet-biaffine-attention-cross-entropy.ipynb └── README.md ├── entity-tagging ├── 1.rnn-lstm-crf.ipynb ├── 2.rnn-lstm-crf-luong.ipynb ├── 3.rnn-lstm-crf-bahdanau.ipynb ├── 4.rnn-lstm-crf-bahdanau-ngrams.ipynb ├── 5.rnn-lstm-crf-luong-ngrams.ipynb ├── 6.cnn-residual-bahdanau-ngrams.ipynb ├── 7.attention-is-all-you-need.ipynb ├── 8.bert.ipynb ├── 9.xlnet-base.ipynb └── README.md ├── extractive-summarization ├── 1.rnn-lstm.ipynb ├── 2.dilated-cnn.ipynb ├── 3.multihead-attention.ipynb ├── 4.bert-base.ipynb ├── download-data.ipynb ├── modeling.py ├── preprocessing-data-bert.ipynb └── preprocessing-data.ipynb ├── generator ├── 1.char-generator-lstm.ipynb ├── 10.gru-seq2seq-beam-word.ipynb ├── 11.gru-seq2seq-bahdanau-greedy-char.ipynb ├── 12.gru-seq2seq-bahdanau-greedy-word.ipynb ├── 13.dilated-cnn-beam.ipynb ├── 14.transformer-beam.ipynb ├── 15.transformer-xl-beam.ipynb ├── 2.char-rnn-beam.ipynb ├── 3.char-generator-lstm-embedding.ipynb ├── 4.word-generator-lstm.ipynb ├── 5.word-generator-lstm-embedding.ipynb ├── 6.gru-seq2seq-greedy-char.ipynb ├── 7.gru-seq2seq-greedy-word.ipynb ├── 8.char-generator-lstm-bahdanau.ipynb ├── 9.char-generator-lstm-luong.ipynb ├── README.md └── shakespeare.txt ├── language-detection ├── 1.fast-text-ngrams.ipynb └── README.md ├── neural-machine-translation ├── 1.basic-seq2seq.ipynb ├── 10.basic-birnn-seq2seq-contrib-greedy.ipynb ├── 11.lstm-birnn-seq2seq-contrib-greedy.ipynb ├── 12.gru-birnn-seq2seq-contrib-greedy.ipynb ├── 
13.basic-seq2seq-luong.ipynb ├── 14.lstm-seq2seq-luong.ipynb ├── 15.gru-seq2seq-luong.ipynb ├── 16.basic-seq2seq-bahdanau.ipynb ├── 17.lstm-seq2seq-bahdanau.ipynb ├── 18.gru-seq2seq-bahdanau.ipynb ├── 19.basic-birnn-seq2seq-bahdanau.ipynb ├── 2.lstm-seq2seq.ipynb ├── 20.lstm-birnn-seq2seq-bahdanau.ipynb ├── 21.gru-birnn-seq2seq-bahdanau.ipynb ├── 22.basic-birnn-seq2seq-luong.ipynb ├── 23.lstm-birnn-seq2seq-luong.ipynb ├── 24.gru-birnn-seq2seq-luong.ipynb ├── 25.lstm-seq2seq-contrib-greedy-luong.ipynb ├── 26.gru-seq2seq-contrib-greedy-luong.ipynb ├── 27.lstm-seq2seq-contrib-greedy-bahdanau.ipynb ├── 28.gru-seq2seq-contrib-greedy-bahdanau.ipynb ├── 29.lstm-seq2seq-contrib-beam-luong.ipynb ├── 3.gru-seq2seq.ipynb ├── 30.gru-seq2seq-contrib-beam-luong.ipynb ├── 31.lstm-seq2seq-contrib-beam-bahdanau.ipynb ├── 32.gru-seq2seq-contrib-beam-bahdanau.ipynb ├── 33.lstm-birnn-seq2seq-contrib-beam-bahdanau.ipynb ├── 34.lstm-birnn-seq2seq-contrib-beam-luong.ipynb ├── 35.gru-birnn-seq2seq-contrib-beam-bahdanau.ipynb ├── 36.gru-birnn-seq2seq-contrib-beam-luong.ipynb ├── 37.lstm-birnn-seq2seq-contrib-beam-luongmonotonic.ipynb ├── 38.gru-birnn-seq2seq-contrib-beam-luongmonotic.ipynb ├── 39.lstm-birnn-seq2seq-contrib-beam-bahdanaumonotonic.ipynb ├── 4.basic-seq2seq-contrib-greedy.ipynb ├── 40.gru-birnn-seq2seq-contrib-beam-bahdanaumonotic.ipynb ├── 41.residual-lstm-seq2seq-greedy-luong.ipynb ├── 42.residual-gru-seq2seq-greedy-luong.ipynb ├── 43.residual-lstm-seq2seq-greedy-bahdanau.ipynb ├── 44.residual-gru-seq2seq-greedy-bahdanau.ipynb ├── 45.memory-network-lstm-decoder-greedy.ipynb ├── 46.google-nmt.ipynb ├── 47.transformer-encoder-transformer-decoder.ipynb ├── 48.transformer-encoder-lstm-decoder-greedy.ipynb ├── 49.bertmultilanguage-encoder-bertmultilanguage-decoder.ipynb ├── 5.lstm-seq2seq-contrib-greedy.ipynb ├── 50.bertmultilanguage-encoder-lstm-decoder.ipynb ├── 51.bertmultilanguage-encoder-transformer-decoder.ipynb ├── 52.bertenglish-encoder-transformer-decoder.ipynb ├── 
53.transformer-t2t-2gpu.ipynb ├── 6.gru-seq2seq-contrib-greedy.ipynb ├── 7.basic-birnn-seq2seq.ipynb ├── 8.lstm-birnn-seq2seq.ipynb ├── 9.gru-birnn-seq2seq.ipynb ├── README.md ├── bert_decoder.py ├── electra │ └── model │ │ └── optimization.py ├── prepare-bpe.ipynb ├── prepare-dataset.ipynb ├── prepare-t2t.ipynb ├── t │ ├── text_encoder.py │ └── tokenizer.py └── transformer │ ├── attention_layer.py │ ├── beam_search.py │ ├── embedding_layer.py │ ├── ffn_layer.py │ ├── model_utils.py │ ├── transformer.py │ └── utils.py ├── nlp-tf.png ├── not-deep-learning ├── decomposition-summarization │ ├── 1.lda.ipynb │ ├── 2.lsa.ipynb │ └── 3.nmf.ipynb └── markov-chatbot │ └── markov-chatbot.ipynb ├── ocr ├── 1.cnn-rnn-ctc.ipynb ├── 2.im2latex.ipynb └── README.md ├── pos-tagging ├── 1.rnn-lstm-crf.ipynb ├── 2.rnn-lstm-crf-luong.ipynb ├── 3.rnn-lstm-crf-bahdanau.ipynb ├── 4.rnn-lstm-crf-bahdanau-ngrams.ipynb ├── 5.rnn-lstm-crf-luong-ngrams.ipynb ├── 6.cnn-residual-bahdanau-ngrams.ipynb ├── 7.attention-is-all-you-need.ipynb ├── 8.bert.ipynb └── README.md ├── question-answer ├── 1.end-to-end-basic.ipynb ├── 2.end-to-end-gru.ipynb ├── 3.end-to-end-lstm.ipynb ├── 4.dynamic-memory-gru.ipynb ├── README.md ├── attention_gru.py ├── qa5_three-arg-relations_test.txt └── qa5_three-arg-relations_train.txt ├── requirements.txt ├── sentence-pair ├── Archive.zip ├── README.md └── bert.ipynb ├── speech-to-text ├── 1.tacotron.ipynb ├── 10.deep-speech2.ipynb ├── 11.wav2vec-transfer-learning-birnn-lstm-ctc.ipynb ├── 2.birnn-lstm-ctc-greedy.ipynb ├── 3.birnn-seq2seq-luong-cross-entropy.ipynb ├── 4.birnn-seq2seq-bahdanau-cross-entropy.ipynb ├── 5.birnn-seq2seq-bahdanau-ctc.ipynb ├── 6.birnn-seq2seq-luong-ctc.ipynb ├── 7.cnn-rnn-bahdanau.ipynb ├── 8.dilated-cnn-rnn.ipynb ├── 9.wavenet.ipynb ├── README.md ├── augmentation.py ├── caching.ipynb ├── download.ipynb ├── wav2vec-preprocessing.ipynb ├── wav2vec-pytorch.ipynb ├── wav2vec-tf.ipynb └── wav2vec.ipynb ├── spelling-correction ├── 1.bert-base.ipynb 
├── 2.xlnet-base.ipynb ├── 3.bert-base-fast.ipynb └── 4.bert-accurate.ipynb ├── squad-qa └── 1.bert.ipynb ├── stemming ├── 1.lstm-seq2seq-beam.ipynb ├── 2.gru-seq2seq-beam.ipynb ├── 3.lstm-birnn-seq2seq-greedy.ipynb ├── 4.gru-birnn-seq2seq-greedy.ipynb ├── 5.dnc-seq2seq-bahdanau-greedy.ipynb ├── 6.birnn-bahdanau-copynet.ipynb ├── README.md ├── access.py ├── addressing.py ├── dnc-seq2seq-bahdanau-greedy.ipynb ├── dnc.py ├── lemmatization-en.txt └── util.py ├── text-augmentation ├── 1.glove.ipynb ├── 2.gru-vae-beam-tfprob.ipynb ├── 3.lstm-vae-beam-tfprob.ipynb ├── 4.gru-vae-beam-bahdanau-tfprob.ipynb ├── 5.vae-deterministic-bahdanau.ipynb ├── 6.vae-varitional-bahdanau │ ├── attention_wrapper.py │ ├── basic_decoder.py │ ├── decoder.py │ └── vae-variational-bahdanau.ipynb ├── 7.bert-base.ipynb ├── 8.xlnet-augmentation.ipynb └── README.md ├── text-classification ├── 1.basic-rnn.ipynb ├── 10.lstm-rnn-bidirectional.ipynb ├── 11.lstm-rnn-bidirectional-huber.ipynb ├── 12.lstm-rnn-dropout-l2.ipynb ├── 13.gru-rnn.ipynb ├── 14.gru-rnn-hinge.ipynb ├── 15.gru-rnn-huber.ipynb ├── 16.gru-rnn-bidirectional.ipynb ├── 17.gru-rnn-bidirectional-hinge.ipynb ├── 18.gru-rnn-bidirectional-huber.ipynb ├── 19.lstm-cnn-rnn.ipynb ├── 2.basic-rnn-hinge.ipynb ├── 20.kmax-cnn.ipynb ├── 21.lstm-cnn-rnn-highway.ipynb ├── 22.lstm-rnn-attention.ipynb ├── 23.dilated-rnn-lstm.ipynb ├── 24.lnlstm-rnn.ipynb ├── 25.only-attention.ipynb ├── 26.multihead-attention.ipynb ├── 27.neural-turing-machine.ipynb ├── 28.lstm-seq2seq.ipynb ├── 29.lstm-seq2seq-luong.ipynb ├── 3.basic-rnn-huber.ipynb ├── 30.lstm-seq2seq-bahdanau.ipynb ├── 31.lstm-seq2seq-beam.ipynb ├── 32.lstm-seq2seq-birnn.ipynb ├── 33.pointer-net.ipynb ├── 34.lstm-rnn-bahdanau.ipynb ├── 35.lstm-rnn-luong.ipynb ├── 36.lstm-rnn-bahdanau-luong.ipynb ├── 37.lstm-birnn-bahdanau-luong.ipynb ├── 38.bytenet.ipynb ├── 39.fast-slow-lstm.ipynb ├── 4.basic-rnn-bidirectional.ipynb ├── 40.siamese-network.ipynb ├── 41.estimator.ipynb ├── 42.capsule-rnn-lstm.ipynb 
├── 43.capsule-seq2seq-lstm.ipynb ├── 44.capsule-birrn-seq2seq-lstm.ipynb ├── 45.nested-lstm.ipynb ├── 46.lstm-seq2seq-highway.ipynb ├── 47.triplet-loss-lstm.ipynb ├── 48.dnc.ipynb ├── 49.convlstm.ipynb ├── 5.basic-rnn-bidirectional-hinge.ipynb ├── 50.temporalconvd.ipynb ├── 51.batch-all-triplet-loss-lstm.ipynb ├── 52.fast-text.ipynb ├── 53.gated-convolution-network.ipynb ├── 54.simple-recurrent-units.ipynb ├── 55.lstm-han.ipynb ├── 56.bert.ipynb ├── 57.dynamic-memory-network.ipynb ├── 58.entity-network.ipynb ├── 59.memory-network.ipynb ├── 6.basic-rnn-bidirectional-huber.ipynb ├── 60.char-sparse.ipynb ├── 61.residual-network.ipynb ├── 62.residual-network-bahdanau.ipynb ├── 63.deep-pyramid-cnn.ipynb ├── 63.deep-pyramid-cnn.py ├── 64.transformer-xl.ipynb ├── 65.transfer-learning-gpt2.ipynb ├── 66.quasi-rnn.ipynb ├── 67.tacotron.ipynb ├── 68.slice-gru.ipynb ├── 69.slice-gru-bahdanau.ipynb ├── 7.lstm-rnn.ipynb ├── 70.wavenet.ipynb ├── 71.transfer-learning-bert-base.ipynb ├── 72.transfer-learning-xlnet-large.ipynb ├── 73.lstm-birnn-max-avg.ipynb ├── 74.transfer-learning-bert-base-6.ipynb ├── 75.transfer-learning-bert-large-12.ipynb ├── 76.transfer-learning-xlnet-base.ipynb ├── 77.transfer-learning-albert-base.ipynb ├── 78.electra-base.ipynb ├── 79.electra-large.ipynb ├── 8.lstm-rnn-hinge.ipynb ├── 9.lstm-rnn-huber.ipynb ├── README.md ├── bert_model.py ├── data.zip ├── data │ ├── negative │ │ └── negative │ └── positive │ │ └── positive ├── dynamic_memory_network.py ├── entity_network.py ├── gpt_2.py ├── modules.py ├── utils.py └── xl.py ├── text-similarity ├── 1.birnn-contrastive.ipynb ├── 10.xlnet-base-circle-loss.ipynb ├── 2.birnn-cross-entropy.ipynb ├── 3.birnn-circle-loss.ipynb ├── 4.birnn-proxy-anchor-loss.ipynb ├── 5.bert-base-cross-entropy.ipynb ├── 6.bert-base-circle-loss.ipynb ├── 7.electra-base-cross-entropy.ipynb ├── 8.electra-base-circle-loss.ipynb ├── 9.xlnet-base-cross-entropy.ipynb ├── README.md └── prepare-dataset.ipynb ├── text-to-speech ├── 1.tacotron 
│ ├── caching.py │ ├── tacotron.ipynb │ ├── tacotron.py │ ├── test-tacotron.wav │ └── utils.py ├── 2.fairseq-dilated-cnn.ipynb ├── 3.seq2seq-bahdanau.ipynb ├── 4.seq2seq-luong.ipynb ├── 5.dilated-cnn-monothonic-attention.ipynb ├── 6.dilated-cnn-self-attention.ipynb ├── 7.deep-cnn-monothonic-attention.ipynb ├── 8.deep-cnn-self-attention.ipynb ├── README.md ├── caching.py ├── download.ipynb ├── test-bahdanau.wav ├── test-dilated-cnn-monothonic-attention.wav ├── test-dilated-cnn-self-attention.wav ├── test-luong.wav └── utils.py ├── topic-generator ├── 1.tat.ipynb ├── 2.tav.ipynb ├── 3.mta.ipynb ├── 4.dilated-seq2seq.ipynb └── README.md ├── topic-model ├── 1.lda2vec.ipynb ├── 2.bert-topic.ipynb ├── 3.xlnet-topic.ipynb ├── modeling.py ├── prepro_utils.py ├── utils.py └── xlnet.py ├── unsupervised-extractive-summarization ├── 1.skip-thought.ipynb ├── 2.residual-network.ipynb ├── 3.residual-network-bahdanau.ipynb ├── README.md └── books │ ├── Blood_Born │ ├── Dark_Thirst │ └── Driftas_Quest ├── vectorizer ├── 1.cbow-softmax.ipynb ├── 10.fast-text.ipynb ├── 11.elmo.ipynb ├── 12.bert-batch-all-triplet-loss.ipynb ├── 2.cbow-nce.ipynb ├── 3.skipgram-softmax.ipynb ├── 4.skipgram-nce.ipynb ├── 5.lda2vec.ipynb ├── 6.supervised-embedded.ipynb ├── 7.triplet-loss.ipynb ├── 8.auto-encoder.ipynb ├── 9.batch-all-triplet-loss-lstm-embedded.ipynb ├── README.md ├── data │ ├── negative │ │ └── negative │ └── positive │ │ └── positive └── utils.py ├── visualization ├── 1.attention-visualization-bahdanau.ipynb ├── 2.attention-visualization-luong.ipynb ├── 3.bert-attention.ipynb ├── 4.xlnet-attention.ipynb └── 5.bert-topic.ipynb └── vocoder ├── 1.dilated-cnn.ipynb ├── README.md ├── caching-vocoder.ipynb └── download.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | *.DS_Store 2 | *__pycache__ 3 | *.ipynb_checkpoints 4 | movie_*.txt 5 | summarization/dataset 6 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 HUSEIN ZOLKEPLI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /abstractive-summarization/README.md: -------------------------------------------------------------------------------- 1 | ## How-to 2 | 3 | 1. Unzip [dataset.tar.gz](dataset.tar.gz) 4 | 5 | 2. Run any notebook using Jupyter Notebook. 
6 | -------------------------------------------------------------------------------- /abstractive-summarization/dataset.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mesolitica/NLP-Models-Tensorflow/0741216aa8235e1228b3de7903cc36d73f8f2b45/abstractive-summarization/dataset.tar.gz -------------------------------------------------------------------------------- /chatbot/README.md: -------------------------------------------------------------------------------- 1 | ## How-to 2 | 3 | 1. Unzip [dataset.tar.gz](dataset.tar.gz) 4 | 5 | 2. Run any notebook using Jupyter Notebook. 6 | 7 | ## Accuracy, not sorted 8 | 9 | Based on training accuracy for 20 epochs. 10 | 11 | | name | accuracy | 12 | |------------------------------------------------------------|----------| 13 | | 1.basic-seq2seq-manual | 0.816000 | 14 | | 2.lstm-seq2seq-manual | 0.735000 | 15 | | 3.gru-seq2seq-manual | 0.846833 | 16 | | 4.basic-seq2seq-api-greedy | 1.009119 | 17 | | 5.lstm-seq2seq-api-greedy | 0.984596 | 18 | | 6.gru-seq2seq-greedy | 1.008869 | 19 | | 7.basic-birnn-seq2seq-manual | 0.990333 | 20 | | 8.lstm-birnn-seq2seq-manual | 0.732833 | 21 | | 9.gru-birnn-seq2seq-manual | 0.936667 | 22 | | 10.basic-birnn-seq2seq-greedy | 1.009586 | 23 | | 11.lstm-birnn-seq2seq-greedy | 0.991938 | 24 | | 12.gru-birnn-seq2seq-greedy | 1.008791 | 25 | | 13.basic-seq2seq-luong | 0.821167 | 26 | | 14.lstm-seq2seq-luong | 0.723167 | 27 | | 15.gru-seq2seq-luong | 0.751667 | 28 | | 16.basic-seq2seq-bahdanau | 0.811833 | 29 | | 17.lstm-seq2seq-bahdanau | 0.721833 | 30 | | 18.gru-seq2seq-bahdanau | 0.728167 | 31 | | 19.lstm-birnn-seq2seq-luong | 0.728500 | 32 | | 20.gru-birnn-seq2seq-luong | 0.743833 | 33 | | 21.lstm-birnn-seq2seq-bahdanau | 0.718833 | 34 | | 22.gru-birnn-seq2seq-bahdanau | 0.746667 | 35 | | 23.lstm-birnn-seq2seq-bahdanau-luong | 0.721000 | 36 | | 24.gru-birnn-seq2seq-bahdanau-luong | 0.747667 | 37 | | 
25.lstm-seq2seq-greedy-luong | 0.974864 | 38 | | 26.gru-seq2seq-greedy-luong | 0.999175 | 39 | | 27.lstm-seq2seq-greedy-bahdanau | 0.987874 | 40 | | 28.gru-seq2seq-greedy-bahdanau | 1.000434 | 41 | | 29.lstm-seq2seq-beam | 0.874802 | 42 | | 30.gru-seq2seq-beam | 0.905397 | 43 | | 31.lstm-birnn-seq2seq-beam-luong | 0.913772 | 44 | | 32.gru-birnn-seq2seq-beam-luong | 0.856824 | 45 | | 33.lstm-birnn-seq2seq-luong-bahdanau-stack-beam | 0.732801 | 46 | | 34.gru-birnn-seq2seq-luong-bahdanau-stack-beam | 0.756537 | 47 | | 35.byte-net | 0.877510 | 48 | | 36.estimator | | 49 | | 37.capsule-lstm-seq2seq-greedy | 0.655007 | 50 | | 38.capsule-lstm-seq2seq-luong-beam | 0.275569 | 51 | | 39.lstm-birnn-seq2seq-luong-bahdanau-stack-beam-dropout-l2 | 0.312999 | 52 | | 40.dnc-seq2seq-bahdanau-greedy | 0.962712 | 53 | | 41.lstm-birnn-seq2seq-beam-luongmonotic | 0.917333 | 54 | | 42.lstm-birnn-seq2seq-beam-bahdanaumonotic | 0.929333 | 55 | | 43.memory-network-basic | 0.945333 | 56 | | 44.memory-network-lstm | 0.900000 | 57 | | 45.attention-is-all-you-need | 0.704549 | 58 | | 46.transformer-xl | 0.874486 | 59 | | 47.attention-is-all-you-need-beam-search | 0.836433 | 60 | | 48.transformer-xl-lstm | 0.826571 | 61 | | 49.gpt-2-lstm | 0.645157 | 62 | | 50.conv-encoder-conv-decoder | 0.518504 | 63 | | 51.conv-encoder-lstm | 0.924609 | 64 | | 52.tacotron-greedy | 0.876267 | 65 | | 53.tacotron-beam | 0.855140 | 66 | | 54.google-nmt | 1.006089 | 67 | -------------------------------------------------------------------------------- /chatbot/dataset.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mesolitica/NLP-Models-Tensorflow/0741216aa8235e1228b3de7903cc36d73f8f2b45/chatbot/dataset.tar.gz -------------------------------------------------------------------------------- /chatbot/dnc.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """DNC Cores. 16 | 17 | These modules create a DNC core. They take input, pass parameters to the memory 18 | access module, and integrate the output of memory to form an output. 19 | """ 20 | 21 | from __future__ import absolute_import 22 | from __future__ import division 23 | from __future__ import print_function 24 | 25 | import collections 26 | import numpy as np 27 | import sonnet as snt 28 | import tensorflow as tf 29 | 30 | import access 31 | 32 | DNCState = collections.namedtuple('DNCState', ('access_output', 'access_state', 33 | 'controller_state')) 34 | 35 | 36 | class DNC(snt.RNNCore): 37 | """DNC core module. 38 | 39 | Contains controller and memory access module. 40 | """ 41 | 42 | def __init__(self, 43 | access_config, 44 | controller_config, 45 | output_size, 46 | clip_value=None, 47 | name='dnc'): 48 | """Initializes the DNC core. 49 | 50 | Args: 51 | access_config: dictionary of access module configurations. 52 | controller_config: dictionary of controller (LSTM) module configurations. 53 | output_size: output dimension size of core. 54 | clip_value: clips controller and core output values to between 55 | `[-clip_value, clip_value]` if specified. 56 | name: module name (default 'dnc'). 
57 | 58 | Raises: 59 | TypeError: if direct_input_size is not None for any access module other 60 | than KeyValueMemory. 61 | """ 62 | super(DNC, self).__init__(name=name) 63 | 64 | with self._enter_variable_scope(): 65 | self._controller = snt.LSTM(**controller_config) 66 | self._access = access.MemoryAccess(**access_config) 67 | 68 | self._access_output_size = np.prod(self._access.output_size.as_list()) 69 | self._output_size = output_size 70 | self._clip_value = clip_value or 0 71 | 72 | self._output_size = tf.TensorShape([output_size]) 73 | self._state_size = DNCState( 74 | access_output=self._access_output_size, 75 | access_state=self._access.state_size, 76 | controller_state=self._controller.state_size) 77 | 78 | def _clip_if_enabled(self, x): 79 | if self._clip_value > 0: 80 | return tf.clip_by_value(x, -self._clip_value, self._clip_value) 81 | else: 82 | return x 83 | 84 | def _build(self, inputs, prev_state): 85 | """Connects the DNC core into the graph. 86 | 87 | Args: 88 | inputs: Tensor input. 89 | prev_state: A `DNCState` tuple containing the fields `access_output`, 90 | `access_state` and `controller_state`. `access_state` is a 3-D Tensor 91 | of shape `[batch_size, num_reads, word_size]` containing read words. 92 | `access_state` is a tuple of the access module's state, and 93 | `controller_state` is a tuple of controller module's state. 94 | 95 | Returns: 96 | A tuple `(output, next_state)` where `output` is a tensor and `next_state` 97 | is a `DNCState` tuple containing the fields `access_output`, 98 | `access_state`, and `controller_state`. 
99 | """ 100 | 101 | prev_access_output = prev_state.access_output 102 | prev_access_state = prev_state.access_state 103 | prev_controller_state = prev_state.controller_state 104 | 105 | batch_flatten = snt.BatchFlatten() 106 | controller_input = tf.concat( 107 | [batch_flatten(inputs), batch_flatten(prev_access_output)], 1) 108 | 109 | controller_output, controller_state = self._controller( 110 | controller_input, prev_controller_state) 111 | 112 | controller_output = self._clip_if_enabled(controller_output) 113 | controller_state = snt.nest.map(self._clip_if_enabled, controller_state) 114 | 115 | access_output, access_state = self._access(controller_output, 116 | prev_access_state) 117 | 118 | output = tf.concat([controller_output, batch_flatten(access_output)], 1) 119 | output = snt.Linear( 120 | output_size=self._output_size.as_list()[0], 121 | name='output_linear')(output) 122 | output = self._clip_if_enabled(output) 123 | 124 | return output, DNCState( 125 | access_output=access_output, 126 | access_state=access_state, 127 | controller_state=controller_state) 128 | 129 | def initial_state(self, batch_size, dtype=tf.float32): 130 | return DNCState( 131 | controller_state=self._controller.initial_state(batch_size, dtype), 132 | access_state=self._access.initial_state(batch_size, dtype), 133 | access_output=tf.zeros( 134 | [batch_size] + self._access.output_size.as_list(), dtype)) 135 | 136 | @property 137 | def state_size(self): 138 | return self._state_size 139 | 140 | @property 141 | def output_size(self): 142 | return self._output_size 143 | -------------------------------------------------------------------------------- /chatbot/gpt_2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | 5 | def shape_list(x): 6 | """Deal with dynamic shape in tensorflow cleanly.""" 7 | static = x.shape.as_list() 8 | dynamic = tf.shape(x) 9 | return [dynamic[i] if s is None else s for 
i, s in enumerate(static)] 10 | 11 | 12 | def softmax(x, axis = -1): 13 | x = x - tf.reduce_max(x, axis = axis, keepdims = True) 14 | ex = tf.exp(x) 15 | return ex / tf.reduce_sum(ex, axis = axis, keepdims = True) 16 | 17 | 18 | def gelu(x): 19 | return ( 20 | 0.5 21 | * x 22 | * (1 + tf.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))) 23 | ) 24 | 25 | 26 | def norm(x, scope, *, axis = -1, epsilon = 1e-5): 27 | """Normalize to mean = 0, std = 1, then do a diagonal affine transform.""" 28 | with tf.variable_scope(scope): 29 | n_state = x.shape[-1].value 30 | g = tf.get_variable( 31 | 'g', [n_state], initializer = tf.constant_initializer(1) 32 | ) 33 | b = tf.get_variable( 34 | 'b', [n_state], initializer = tf.constant_initializer(0) 35 | ) 36 | u = tf.reduce_mean(x, axis = axis, keepdims = True) 37 | s = tf.reduce_mean(tf.square(x - u), axis = axis, keepdims = True) 38 | x = (x - u) * tf.rsqrt(s + epsilon) 39 | x = x * g + b 40 | return x 41 | 42 | 43 | def split_states(x, n): 44 | """Reshape the last dimension of x into [n, x.shape[-1]/n].""" 45 | *start, m = shape_list(x) 46 | return tf.reshape(x, start + [n, m // n]) 47 | 48 | 49 | def merge_states(x): 50 | """Smash the last two dimensions of x into a single dimension.""" 51 | *start, a, b = shape_list(x) 52 | return tf.reshape(x, start + [a * b]) 53 | 54 | 55 | def conv1d(x, scope, nf, *, w_init_stdev = 0.02): 56 | with tf.variable_scope(scope): 57 | *start, nx = shape_list(x) 58 | w = tf.get_variable( 59 | 'w', 60 | [1, nx, nf], 61 | initializer = tf.random_normal_initializer(stddev = w_init_stdev), 62 | ) 63 | b = tf.get_variable('b', [nf], initializer = tf.constant_initializer(0)) 64 | c = tf.reshape( 65 | tf.matmul(tf.reshape(x, [-1, nx]), tf.reshape(w, [-1, nf])) + b, 66 | start + [nf], 67 | ) 68 | return c 69 | 70 | 71 | def attention_mask(nd, ns, *, dtype): 72 | """1's in the lower triangle, counting from the lower right corner. 
73 | 74 | Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs. 75 | """ 76 | i = tf.range(nd)[:, None] 77 | j = tf.range(ns) 78 | m = i >= j - ns + nd 79 | return tf.cast(m, dtype) 80 | 81 | 82 | def attn(x, scope, n_state, *, past, hparams): 83 | assert x.shape.ndims == 3 # Should be [batch, sequence, features] 84 | assert n_state % hparams.n_head == 0 85 | if past is not None: 86 | assert ( 87 | past.shape.ndims == 5 88 | ) # Should be [batch, 2, heads, sequence, features], where 2 is [k, v] 89 | 90 | def split_heads(x): 91 | # From [batch, sequence, features] to [batch, heads, sequence, features] 92 | return tf.transpose(split_states(x, hparams.n_head), [0, 2, 1, 3]) 93 | 94 | def merge_heads(x): 95 | # Reverse of split_heads 96 | return merge_states(tf.transpose(x, [0, 2, 1, 3])) 97 | 98 | def mask_attn_weights(w): 99 | # w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst. 100 | _, _, nd, ns = shape_list(w) 101 | b = attention_mask(nd, ns, dtype = w.dtype) 102 | b = tf.reshape(b, [1, 1, nd, ns]) 103 | w = w * b - tf.cast(1e10, w.dtype) * (1 - b) 104 | return w 105 | 106 | def multihead_attn(q, k, v): 107 | # q, k, v have shape [batch, heads, sequence, features] 108 | w = tf.matmul(q, k, transpose_b = True) 109 | w = w * tf.rsqrt(tf.cast(v.shape[-1].value, w.dtype)) 110 | 111 | w = mask_attn_weights(w) 112 | w = softmax(w) 113 | a = tf.matmul(w, v) 114 | return a 115 | 116 | with tf.variable_scope(scope): 117 | c = conv1d(x, 'c_attn', n_state * 3) 118 | q, k, v = map(split_heads, tf.split(c, 3, axis = 2)) 119 | present = tf.stack([k, v], axis = 1) 120 | if past is not None: 121 | pk, pv = tf.unstack(past, axis = 1) 122 | k = tf.concat([pk, k], axis = -2) 123 | v = tf.concat([pv, v], axis = -2) 124 | a = multihead_attn(q, k, v) 125 | a = merge_heads(a) 126 | a = conv1d(a, 'c_proj', n_state) 127 | return a, present 128 | 129 | 130 | def mlp(x, scope, n_state, *, hparams): 131 | 
with tf.variable_scope(scope): 132 | nx = x.shape[-1].value 133 | h = gelu(conv1d(x, 'c_fc', n_state)) 134 | h2 = conv1d(h, 'c_proj', nx) 135 | return h2 136 | 137 | 138 | def block(x, scope, *, past, hparams): 139 | with tf.variable_scope(scope): 140 | nx = x.shape[-1].value 141 | a, present = attn( 142 | norm(x, 'ln_1'), 'attn', nx, past = past, hparams = hparams 143 | ) 144 | x = x + a 145 | m = mlp(norm(x, 'ln_2'), 'mlp', nx * 4, hparams = hparams) 146 | x = x + m 147 | return x, present 148 | 149 | 150 | def past_shape(*, hparams, batch_size = None, sequence = None): 151 | return [ 152 | batch_size, 153 | hparams.n_layer, 154 | 2, 155 | hparams.n_head, 156 | sequence, 157 | hparams.n_embd // hparams.n_head, 158 | ] 159 | 160 | 161 | def expand_tile(value, size): 162 | """Add a new axis of given size.""" 163 | value = tf.convert_to_tensor(value, name = 'value') 164 | ndims = value.shape.ndims 165 | return tf.tile(tf.expand_dims(value, axis = 0), [size] + [1] * ndims) 166 | 167 | 168 | def positions_for(tokens, past_length): 169 | batch_size = tf.shape(tokens)[0] 170 | nsteps = tf.shape(tokens)[1] 171 | return expand_tile(past_length + tf.range(nsteps), batch_size) 172 | 173 | 174 | def model(hparams, X, past = None, scope = 'model', reuse = False): 175 | with tf.variable_scope(scope, reuse = reuse): 176 | results = {} 177 | batch, sequence = shape_list(X) 178 | 179 | wpe = tf.get_variable( 180 | 'wpe', 181 | [hparams.n_ctx, hparams.n_embd], 182 | initializer = tf.random_normal_initializer(stddev = 0.01), 183 | ) 184 | wte = tf.get_variable( 185 | 'wte', 186 | [hparams.n_vocab, hparams.n_embd], 187 | initializer = tf.random_normal_initializer(stddev = 0.02), 188 | ) 189 | past_length = 0 if past is None else tf.shape(past)[-2] 190 | h = tf.gather(wte, X) + tf.gather(wpe, positions_for(X, past_length)) 191 | 192 | # Transformer 193 | presents = [] 194 | pasts = ( 195 | tf.unstack(past, axis = 1) 196 | if past is not None 197 | else [None] * hparams.n_layer 198 | 
) 199 | assert len(pasts) == hparams.n_layer 200 | for layer, past in enumerate(pasts): 201 | h, present = block(h, 'h%d' % layer, past = past, hparams = hparams) 202 | presents.append(present) 203 | results['present'] = tf.stack(presents, axis = 1) 204 | h = norm(h, 'ln_f') 205 | 206 | # Language model loss. Do tokens CURRENT_ACC:\n", 178 | " print('epoch:', EPOCH, ', pass acc:', CURRENT_ACC, ', current acc:', test_acc)\n", 179 | " CURRENT_ACC = test_acc\n", 180 | " CURRENT_CHECKPOINT = 0\n", 181 | " saver.save(sess, os.getcwd() + \"/model-rnn-vector.ckpt\")\n", 182 | " else:\n", 183 | " CURRENT_CHECKPOINT += 1\n", 184 | " EPOCH += 1\n", 185 | " print('time taken:', time.time()-lasttime)\n", 186 | " print('epoch:', EPOCH, ', training loss:', train_loss, ', training acc:', train_acc, ', valid loss:', test_loss, ', valid acc:', test_acc)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [] 195 | } 196 | ], 197 | "metadata": { 198 | "kernelspec": { 199 | "display_name": "Python 3", 200 | "language": "python", 201 | "name": "python3" 202 | }, 203 | "language_info": { 204 | "codemirror_mode": { 205 | "name": "ipython", 206 | "version": 3 207 | }, 208 | "file_extension": ".py", 209 | "mimetype": "text/x-python", 210 | "name": "python", 211 | "nbconvert_exporter": "python", 212 | "pygments_lexer": "ipython3", 213 | "version": "3.5.2" 214 | } 215 | }, 216 | "nbformat": 4, 217 | "nbformat_minor": 2 218 | } 219 | -------------------------------------------------------------------------------- /classification-comparison/LGB/lgb-tfidf-svd50.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# GPU Light gradient boosting trained on TF-IDF reduced 50 dimensions\n", 8 | "\n", 9 | "1. 
Same emotion dataset from [NLP-dataset](https://github.com/huseinzol05/NLP-Dataset)\n", 10 | "2. Same splitting 80% training, 20% testing, may vary depends on randomness\n", 11 | "3. Same regex substitution '[^\\\"\\'A-Za-z0-9 ]+'\n", 12 | "\n", 13 | "## Example\n", 14 | "\n", 15 | "Based on Term-frequency Inverse document frequency\n", 16 | "\n", 17 | "After that we apply SVD to reduce the dimensions, n_components = 50" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 8, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "import lightgbm as lgb\n", 27 | "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n", 28 | "from sklearn.decomposition import TruncatedSVD\n", 29 | "import numpy as np\n", 30 | "import re\n", 31 | "import time\n", 32 | "from sklearn.cross_validation import train_test_split\n", 33 | "import sklearn.datasets\n", 34 | "from sklearn import pipeline\n", 35 | "from sklearn.model_selection import StratifiedKFold" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": { 42 | "collapsed": true 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "def clearstring(string):\n", 47 | " string = re.sub('[^\\\"\\'A-Za-z0-9 ]+', '', string)\n", 48 | " string = string.split(' ')\n", 49 | " string = filter(None, string)\n", 50 | " string = [y.strip() for y in string]\n", 51 | " string = ' '.join(string)\n", 52 | " return string\n", 53 | "\n", 54 | "# because of sklean.datasets read a document as a single element\n", 55 | "# so we want to split based on new line\n", 56 | "def separate_dataset(trainset):\n", 57 | " datastring = []\n", 58 | " datatarget = []\n", 59 | " for i in range(len(trainset.data)):\n", 60 | " data_ = trainset.data[i].split('\\n')\n", 61 | " # python3, if python2, just remove list()\n", 62 | " data_ = list(filter(None, data_))\n", 63 | " for n in range(len(data_)):\n", 64 | " data_[n] = clearstring(data_[n])\n", 65 | " datastring += data_\n", 66 | " 
for n in range(len(data_)):\n", 67 | " datatarget.append(trainset.target[i])\n", 68 | " return datastring, datatarget" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 3, 74 | "metadata": { 75 | "collapsed": true 76 | }, 77 | "outputs": [], 78 | "source": [ 79 | "trainset_data = sklearn.datasets.load_files(container_path = 'data', encoding = 'UTF-8')\n", 80 | "trainset_data.data, trainset_data.target = separate_dataset(trainset_data)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 4, 86 | "metadata": { 87 | "collapsed": true 88 | }, 89 | "outputs": [], 90 | "source": [ 91 | "train_X, test_X, train_Y, test_Y = train_test_split(trainset_data.data, trainset_data.target, test_size = 0.2)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 5, 97 | "metadata": { 98 | "collapsed": true 99 | }, 100 | "outputs": [], 101 | "source": [ 102 | "decompose = pipeline.Pipeline([('count', TfidfVectorizer()),\n", 103 | " ('svd', TruncatedSVD(n_components=50))]).fit(trainset_data.data)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 6, 109 | "metadata": { 110 | "collapsed": true 111 | }, 112 | "outputs": [], 113 | "source": [ 114 | "params_lgb = {\n", 115 | " 'max_depth': 27, \n", 116 | " 'learning_rate': 0.03,\n", 117 | " 'verbose': 50, \n", 118 | " 'early_stopping_round': 200,\n", 119 | " 'metric': 'multi_logloss',\n", 120 | " 'objective': 'multiclass',\n", 121 | " 'num_classes': len(trainset_data.target_names),\n", 122 | " 'device': 'gpu',\n", 123 | " 'gpu_platform_id': 0,\n", 124 | " 'gpu_device_id': 0\n", 125 | " }" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 10, 131 | "metadata": { 132 | "collapsed": true 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "train_X = decompose.transform(train_X)\n", 137 | "test_X = decompose.transform(test_X)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 11, 143 | "metadata": {}, 
144 | "outputs": [ 145 | { 146 | "name": "stdout", 147 | "output_type": "stream", 148 | "text": [ 149 | "Training until validation scores don't improve for 200 rounds.\n", 150 | "[100]\ttraining's multi_logloss: 1.51641\tvalid_1's multi_logloss: 1.52991\n", 151 | "[200]\ttraining's multi_logloss: 1.48039\tvalid_1's multi_logloss: 1.50404\n", 152 | "[300]\ttraining's multi_logloss: 1.46016\tvalid_1's multi_logloss: 1.49379\n", 153 | "[400]\ttraining's multi_logloss: 1.44402\tvalid_1's multi_logloss: 1.48755\n", 154 | "[500]\ttraining's multi_logloss: 1.43032\tvalid_1's multi_logloss: 1.4837\n", 155 | "[600]\ttraining's multi_logloss: 1.41806\tvalid_1's multi_logloss: 1.4811\n", 156 | "[700]\ttraining's multi_logloss: 1.40679\tvalid_1's multi_logloss: 1.4791\n", 157 | "[800]\ttraining's multi_logloss: 1.39626\tvalid_1's multi_logloss: 1.47765\n", 158 | "[900]\ttraining's multi_logloss: 1.38603\tvalid_1's multi_logloss: 1.4765\n", 159 | "[1000]\ttraining's multi_logloss: 1.37627\tvalid_1's multi_logloss: 1.47559\n", 160 | "[1100]\ttraining's multi_logloss: 1.36678\tvalid_1's multi_logloss: 1.47482\n", 161 | "[1200]\ttraining's multi_logloss: 1.35761\tvalid_1's multi_logloss: 1.4741\n", 162 | "[1300]\ttraining's multi_logloss: 1.34862\tvalid_1's multi_logloss: 1.47349\n", 163 | "[1400]\ttraining's multi_logloss: 1.33981\tvalid_1's multi_logloss: 1.47288\n", 164 | "[1500]\ttraining's multi_logloss: 1.33125\tvalid_1's multi_logloss: 1.47229\n", 165 | "[1600]\ttraining's multi_logloss: 1.32281\tvalid_1's multi_logloss: 1.47181\n", 166 | "[1700]\ttraining's multi_logloss: 1.31465\tvalid_1's multi_logloss: 1.47146\n", 167 | "[1800]\ttraining's multi_logloss: 1.30664\tvalid_1's multi_logloss: 1.47115\n", 168 | "[1900]\ttraining's multi_logloss: 1.29872\tvalid_1's multi_logloss: 1.47091\n", 169 | "[2000]\ttraining's multi_logloss: 1.29104\tvalid_1's multi_logloss: 1.47071\n", 170 | "[2100]\ttraining's multi_logloss: 1.28331\tvalid_1's multi_logloss: 1.47047\n", 171 | 
"[2200]\ttraining's multi_logloss: 1.2759\tvalid_1's multi_logloss: 1.47042\n", 172 | "[2300]\ttraining's multi_logloss: 1.26851\tvalid_1's multi_logloss: 1.47032\n", 173 | "[2400]\ttraining's multi_logloss: 1.26119\tvalid_1's multi_logloss: 1.47017\n", 174 | "[2500]\ttraining's multi_logloss: 1.25404\tvalid_1's multi_logloss: 1.47011\n", 175 | "[2600]\ttraining's multi_logloss: 1.247\tvalid_1's multi_logloss: 1.47003\n", 176 | "[2700]\ttraining's multi_logloss: 1.24004\tvalid_1's multi_logloss: 1.46998\n", 177 | "[2800]\ttraining's multi_logloss: 1.23314\tvalid_1's multi_logloss: 1.46996\n", 178 | "[2900]\ttraining's multi_logloss: 1.22632\tvalid_1's multi_logloss: 1.46997\n", 179 | "[3000]\ttraining's multi_logloss: 1.21957\tvalid_1's multi_logloss: 1.46994\n", 180 | "Early stopping, best iteration is:\n", 181 | "[2849]\ttraining's multi_logloss: 1.22982\tvalid_1's multi_logloss: 1.46992\n", 182 | "415.922 Seconds to train lgb\n" 183 | ] 184 | } 185 | ], 186 | "source": [ 187 | "d_train = lgb.Dataset(train_X, train_Y)\n", 188 | "d_valid = lgb.Dataset(test_X, test_Y)\n", 189 | "watchlist = [d_train, d_valid]\n", 190 | "t=time.time()\n", 191 | "clf = lgb.train(params_lgb, d_train, 100000, watchlist, early_stopping_rounds=200, verbose_eval=100)\n", 192 | "print(round(time.time()-t, 3), 'Seconds to train lgb')" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 12, 198 | "metadata": {}, 199 | "outputs": [ 200 | { 201 | "name": "stdout", 202 | "output_type": "stream", 203 | "text": [ 204 | " precision recall f1-score support\n", 205 | "\n", 206 | " anger 0.38 0.05 0.09 11460\n", 207 | " fear 0.32 0.06 0.10 9545\n", 208 | " joy 0.44 0.73 0.55 28052\n", 209 | " love 0.17 0.01 0.02 7015\n", 210 | " sadness 0.39 0.54 0.45 24291\n", 211 | " surprise 0.09 0.01 0.01 2999\n", 212 | "\n", 213 | "avg / total 0.37 0.42 0.34 83362\n", 214 | "\n" 215 | ] 216 | } 217 | ], 218 | "source": [ 219 | "from sklearn import metrics\n", 220 | 
"print(metrics.classification_report(test_Y, np.argmax(clf.predict(test_X), axis = 1), target_names = trainset_data.target_names))" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 13, 226 | "metadata": { 227 | "collapsed": true 228 | }, 229 | "outputs": [], 230 | "source": [ 231 | "clf.save_model('lgb-tfidf-svd50.model')" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": { 238 | "collapsed": true 239 | }, 240 | "outputs": [], 241 | "source": [] 242 | } 243 | ], 244 | "metadata": { 245 | "kernelspec": { 246 | "display_name": "Python 3", 247 | "language": "python", 248 | "name": "python3" 249 | }, 250 | "language_info": { 251 | "codemirror_mode": { 252 | "name": "ipython", 253 | "version": 3 254 | }, 255 | "file_extension": ".py", 256 | "mimetype": "text/x-python", 257 | "name": "python", 258 | "nbconvert_exporter": "python", 259 | "pygments_lexer": "ipython3", 260 | "version": "3.5.2" 261 | } 262 | }, 263 | "nbformat": 4, 264 | "nbformat_minor": 2 265 | } 266 | -------------------------------------------------------------------------------- /classification-comparison/preparation/dictionary_emotion.p: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mesolitica/NLP-Models-Tensorflow/0741216aa8235e1228b3de7903cc36d73f8f2b45/classification-comparison/preparation/dictionary_emotion.p -------------------------------------------------------------------------------- /classification-comparison/preparation/prepare-dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 6, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import os\n", 11 | "import re\n", 12 | "import pickle" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 7, 18 | "metadata": {}, 19 | 
"outputs": [], 20 | "source": [ 21 | "def clearstring(string):\n", 22 | " string = re.sub('[^\\'\\\"A-Za-z0-9 ]+', '', string)\n", 23 | " string = string.split(' ')\n", 24 | " string = filter(None, string)\n", 25 | " string = [y.strip() for y in string]\n", 26 | " string = [y for y in string if len(y) > 3 and y.find('nbsp') < 0]\n", 27 | " return ' '.join(string)\n", 28 | "\n", 29 | "def read_data(location):\n", 30 | " list_folder = os.listdir(location)\n", 31 | " label = list_folder\n", 32 | " label.sort()\n", 33 | " outer_string, outer_label = [], []\n", 34 | " for i in range(len(list_folder)):\n", 35 | " list_file = os.listdir('data/' + list_folder[i])\n", 36 | " strings = []\n", 37 | " for x in range(len(list_file)):\n", 38 | " with open('data/' + list_folder[i] + '/' + list_file[x], 'r') as fopen:\n", 39 | " strings += fopen.read().split('\\n')\n", 40 | " strings = list(filter(None, strings))\n", 41 | " for k in range(len(strings)):\n", 42 | " strings[k] = clearstring(strings[k])\n", 43 | " labels = [i] * len(strings)\n", 44 | " outer_string += strings\n", 45 | " outer_label += labels\n", 46 | " \n", 47 | " dataset = np.array([outer_string, outer_label])\n", 48 | " dataset = dataset.T\n", 49 | " np.random.shuffle(dataset)\n", 50 | " \n", 51 | " return dataset" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 8, 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "data": { 61 | "text/plain": [ 62 | "array([[ 'woked feelin pain mood been tarified caused cant stnd hater could this life being hated throught whole career dont dont care thing thoe finna break that feel about babi carroline caused heart soul feelin real',\n", 63 | " '4'],\n", 64 | " [ 'couldnt stop feeling threatened cards grandmother kept sending with cash inside them',\n", 65 | " '1'],\n", 66 | " ['feel chronically defeated satisfied', '4'],\n", 67 | " [ 'apologize anyone feels offended these remarks certainly welcome your opinions',\n", 68 | " '0'],\n", 69 | " [ 'feel like 
every romantic movie there dorky best friend that desperately love with beautiful leading girl',\n", 70 | " '3']],\n", 71 | " dtype=' 3 and y.find('nbsp') < 0]\n", 28 | " return ' '.join(string)\n", 29 | "\n", 30 | "def read_data(location):\n", 31 | " list_folder = os.listdir(location)\n", 32 | " label = list_folder\n", 33 | " label.sort()\n", 34 | " outer_string, outer_label = [], []\n", 35 | " for i in range(len(list_folder)):\n", 36 | " list_file = os.listdir('data/' + list_folder[i])\n", 37 | " strings = []\n", 38 | " for x in range(len(list_file)):\n", 39 | " with open('data/' + list_folder[i] + '/' + list_file[x], 'r') as fopen:\n", 40 | " strings += fopen.read().split('\\n')\n", 41 | " strings = list(filter(None, strings))\n", 42 | " for k in range(len(strings)):\n", 43 | " strings[k] = clearstring(strings[k])\n", 44 | " labels = [i] * len(strings)\n", 45 | " outer_string += strings\n", 46 | " outer_label += labels\n", 47 | " \n", 48 | " dataset = np.array([outer_string, outer_label])\n", 49 | " dataset = dataset.T\n", 50 | " np.random.shuffle(dataset)\n", 51 | " \n", 52 | " string = []\n", 53 | " for i in range(dataset.shape[0]):\n", 54 | " string += dataset[i][0].split()\n", 55 | " \n", 56 | " return string\n", 57 | "\n", 58 | "def build_vocab(words, n_words):\n", 59 | " count = [['UNK', -1]]\n", 60 | " count.extend(collections.Counter(words).most_common(n_words - 1))\n", 61 | " dictionary = dict()\n", 62 | " for word, _ in count:\n", 63 | " dictionary[word] = len(dictionary)\n", 64 | " data = list()\n", 65 | " unk_count = 0\n", 66 | " for word in words:\n", 67 | " index = dictionary.get(word, 0)\n", 68 | " if index == 0: # dictionary['UNK']\n", 69 | " unk_count += 1\n", 70 | " data.append(index)\n", 71 | " count[0][1] = unk_count\n", 72 | " reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))\n", 73 | " return data, count, dictionary, reversed_dictionary" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 9, 79 | 
"metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "strings = read_data('data')" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 10, 88 | "metadata": {}, 89 | "outputs": [ 90 | { 91 | "data": { 92 | "text/plain": [ 93 | "['left', 'feeling', 'very', 'jealous', 'feel']" 94 | ] 95 | }, 96 | "execution_count": 10, 97 | "metadata": {}, 98 | "output_type": "execute_result" 99 | } 100 | ], 101 | "source": [ 102 | "strings[:5]" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 12, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "n_words = len(set(strings))\n", 112 | "_,_,dictionary,reversed_dictionary = build_vocab(strings,n_words)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "with open('dataset-dictionary.p', 'wb') as fopen:\n", 122 | " pickle.dump(reversed_dictionary, fopen)\n", 123 | "with open('dataset-dictionary-reverse.p', 'wb') as fopen:\n", 124 | " pickle.dump(dictionary, fopen)" 125 | ] 126 | } 127 | ], 128 | "metadata": { 129 | "kernelspec": { 130 | "display_name": "Python 3", 131 | "language": "python", 132 | "name": "python3" 133 | }, 134 | "language_info": { 135 | "codemirror_mode": { 136 | "name": "ipython", 137 | "version": 3 138 | }, 139 | "file_extension": ".py", 140 | "mimetype": "text/x-python", 141 | "name": "python", 142 | "nbconvert_exporter": "python", 143 | "pygments_lexer": "ipython3", 144 | "version": "3.5.2" 145 | } 146 | }, 147 | "nbformat": 4, 148 | "nbformat_minor": 2 149 | } 150 | -------------------------------------------------------------------------------- /dependency-parser/README.md: -------------------------------------------------------------------------------- 1 | ## How-to 2 | 3 | 1. Run any notebook using Jupyter Notebook. 
4 | -------------------------------------------------------------------------------- /entity-tagging/README.md: -------------------------------------------------------------------------------- 1 | ## How-to 2 | 3 | 1. Run any notebook using Jupyter Notebook. 4 | -------------------------------------------------------------------------------- /extractive-summarization/download-data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# !pip3 install googledrivedownloader" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "from google_drive_downloader import GoogleDriveDownloader as gdd\n", 19 | "\n", 20 | "id = '0BwmD_VLjROrfTHk4NFg2SndKcjQ'\n", 21 | "gdd.download_file_from_google_drive(file_id=id, dest_path='./cnn.tgz')" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "!tar -zxf cnn.tgz" 31 | ] 32 | } 33 | ], 34 | "metadata": { 35 | "kernelspec": { 36 | "display_name": "Python 3", 37 | "language": "python", 38 | "name": "python3" 39 | }, 40 | "language_info": { 41 | "codemirror_mode": { 42 | "name": "ipython", 43 | "version": 3 44 | }, 45 | "file_extension": ".py", 46 | "mimetype": "text/x-python", 47 | "name": "python", 48 | "nbconvert_exporter": "python", 49 | "pygments_lexer": "ipython3", 50 | "version": "3.6.8" 51 | } 52 | }, 53 | "nbformat": 4, 54 | "nbformat_minor": 2 55 | } 56 | -------------------------------------------------------------------------------- /generator/README.md: -------------------------------------------------------------------------------- 1 | ## How-to 2 | 3 | 1. Run any notebook using Jupyter Notebook. 
4 | -------------------------------------------------------------------------------- /language-detection/README.md: -------------------------------------------------------------------------------- 1 | ## How-to 2 | 3 | 1. You need to download and process dataset first, 4 | ```bash 5 | wget http://downloads.tatoeba.org/exports/sentences.tar.bz2 6 | bunzip2 sentences.tar.bz2 7 | tar xvf sentences.tar 8 | ``` 9 | 10 | 2. Change to csv, 11 | ```bash 12 | awk -F"\t" '{print"__label__"$2" "$3}' < sentences.csv | shuf > all.txt 13 | ``` 14 | 15 | 3. Run any notebook using Jupyter Notebook. 16 | -------------------------------------------------------------------------------- /neural-machine-translation/README.md: -------------------------------------------------------------------------------- 1 | ## how-to 2 | 3 | 1. run [prepare-dataset.ipynb](prepare-dataset.ipynb). 4 | 2. run [prepare-bpe.ipynb](prepare-bpe.ipynb). 5 | 3. run [prepare-t2t.ipynb](prepare-t2t.ipynb). 6 | 7 | ## Notes 8 | 9 | 1. First 200k Trainset to train, validation and test set to test. 10 | 2. Based on 20 epochs. 11 | 3. Accuracy based on BLEU. 12 | 4. RNN and Transformer parameters are not consistent. 13 | 14 | For RNN, 15 | 16 | ```python 17 | size_layer = 512 18 | num_layers = 2 19 | ``` 20 | 21 | For Transformer, we use BASE parameter from Tensor2Tensor. 22 | 23 | Here we never tested what happened to RNN based models if we increase number of layers and size of layers same as Transformer BASE parameter. 24 | 25 | 5. Batch size not consistent, most of the models used 128 batch size. 
26 | 27 | ## Accuracy, not sorted 28 | 29 | | notebook | BLEU | 30 | |--------------------------------------------------------------|---------------| 31 | | 1.basic-seq2seq.ipynb | 6.319555e-05 | 32 | | 2.lstm-seq2seq.ipynb | 0.016924812 | 33 | | 3.gru-seq2seq.ipynb | 0.0094467895 | 34 | | 4.basic-seq2seq-contrib-greedy.ipynb | 0.005418866 | 35 | | 5.lstm-seq2seq-contrib-greedy.ipynb | | 36 | | 6.gru-seq2seq-contrib-greedy.ipynb | 0.051461186 | 37 | | 7.basic-birnn-seq2seq.ipynb | 6.319555e-05 | 38 | | 8.lstm-birnn-seq2seq.ipynb | 0.012854616 | 39 | | 9.gru-birnn-seq2seq.ipynb | 0.0095551545 | 40 | | 10.basic-birnn-seq2seq-contrib-greedy.ipynb | 0.019748569 | 41 | | 11.lstm-birnn-seq2seq-contrib-greedy.ipynb | 0.052993 | 42 | | 12.gru-birnn-seq2seq-contrib-greedy.ipynb | 0.047413725 | 43 | | 13.basic-seq2seq-luong.ipynb | 8.97118e-05 | 44 | | 14.lstm-seq2seq-luong.ipynb | 0.053475615 | 45 | | 15.gru-seq2seq-luong.ipynb | 0.01888038 | 46 | | 16.basic-seq2seq-bahdanau.ipynb | 0.00020161743 | 47 | | 17.lstm-seq2seq-bahdanau.ipynb | 0.048261568 | 48 | | 18.gru-seq2seq-bahdanau.ipynb | 0.025584696 | 49 | | 19.basic-birnn-seq2seq-bahdanau.ipynb | 0.00020161743 | 50 | | 20.lstm-birnn-seq2seq-bahdanau.ipynb | 0.054097746 | 51 | | 21.gru-birnn-seq2seq-bahdanau.ipynb | 0.00020161743 | 52 | | 22.basic-birnn-seq2seq-luong.ipynb | | 53 | | 23.lstm-birnn-seq2seq-luong.ipynb | 0.05320787 | 54 | | 24.gru-birnn-seq2seq-luong.ipynb | 0.027758315 | 55 | | 25.lstm-seq2seq-contrib-greedy-luong.ipynb | 0.15195806 | 56 | | 26.gru-seq2seq-contrib-greedy-luong.ipynb | 0.101576895 | 57 | | 27.lstm-seq2seq-contrib-greedy-bahdanau.ipynb | 0.15275387 | 58 | | 28.gru-seq2seq-contrib-greedy-bahdanau.ipynb | 0.13868862 | 59 | | 29.lstm-seq2seq-contrib-beam-luong.ipynb | 0.17535137 | 60 | | 30.gru-seq2seq-contrib-beam-luong.ipynb | 0.003980886 | 61 | | 31.lstm-seq2seq-contrib-beam-bahdanau.ipynb | 0.17929372 | 62 | | 32.gru-seq2seq-contrib-beam-bahdanau.ipynb | 0.1767827 | 63 | | 
33.lstm-birnn-seq2seq-contrib-beam-bahdanau.ipynb | 0.19480321 | 64 | | 34.lstm-birnn-seq2seq-contrib-beam-luong.ipynb | 0.20042004 | 65 | | 35.gru-birnn-seq2seq-contrib-beam-bahdanau.ipynb | 0.1784567 | 66 | | 36.gru-birnn-seq2seq-contrib-beam-luong.ipynb | 0.0557322 | 67 | | 37.lstm-birnn-seq2seq-contrib-beam-luongmonotonic.ipynb | 0.06368613 | 68 | | 38.gru-birnn-seq2seq-contrib-beam-luongmonotic.ipynb | 0.06407658 | 69 | | 39.lstm-birnn-seq2seq-contrib-beam-bahdanaumonotonic.ipynb | 0.17586066 | 70 | | 40.gru-birnn-seq2seq-contrib-beam-bahdanaumonotic.ipynb | 0.065290846 | 71 | | 41.residual-lstm-seq2seq-greedy-luong.ipynb | 0.1475228 | 72 | | 42.residual-gru-seq2seq-greedy-luong.ipynb | 5.0574585e-05 | 73 | | 43.residual-lstm-seq2seq-greedy-bahdanau.ipynb | 0.15493448 | 74 | | 44.residual-gru-seq2seq-greedy-bahdanau.ipynb | | 75 | | 45.memory-network-lstm-decoder-greedy.ipynb | | 76 | | 46.google-nmt.ipynb | 0.055380445 | 77 | | 47.transformer-encoder-transformer-decoder.ipynb | 0.17100729 | 78 | | 48.transformer-encoder-lstm-decoder-greedy.ipynb | 0.049064703 | 79 | | 49.bertmultilanguage-encoder-bertmultilanguage-decoder.ipynb | 0.37003958 | 80 | | 50.bertmultilanguage-encoder-lstm-decoder.ipynb | 0.11384286 | 81 | | 51.bertmultilanguage-encoder-transformer-decoder.ipynb | 0.3941662 | 82 | | 52.bertenglish-encoder-transformer-decoder.ipynb | 0.23225775 | 83 | | 53.transformer-t2t-2gpu.ipynb | 0.36773485 | -------------------------------------------------------------------------------- /neural-machine-translation/electra/model/optimization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Google Research Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Functions and classes related to optimization (weight updates). 17 | Modified from the original BERT code to allow for having separate learning 18 | rates for different layers of the network. 19 | """ 20 | 21 | from __future__ import absolute_import 22 | from __future__ import division 23 | from __future__ import print_function 24 | 25 | import collections 26 | import re 27 | import tensorflow.compat.v1 as tf 28 | 29 | 30 | def create_optimizer( 31 | loss, 32 | learning_rate, 33 | num_train_steps, 34 | weight_decay_rate = 0.0, 35 | use_tpu = False, 36 | warmup_steps = 0, 37 | warmup_proportion = 0, 38 | lr_decay_power = 1.0, 39 | layerwise_lr_decay_power = -1, 40 | n_transformer_layers = None, 41 | decoder_layers = None, 42 | ): 43 | """Creates an optimizer and training op.""" 44 | global_step = tf.train.get_or_create_global_step() 45 | learning_rate = tf.train.polynomial_decay( 46 | learning_rate, 47 | global_step, 48 | num_train_steps, 49 | end_learning_rate = 0.0, 50 | power = lr_decay_power, 51 | cycle = False, 52 | ) 53 | warmup_steps = max(num_train_steps * warmup_proportion, warmup_steps) 54 | learning_rate *= tf.minimum( 55 | 1.0, 56 | tf.cast(global_step, tf.float32) / tf.cast(warmup_steps, tf.float32), 57 | ) 58 | cp_learning_rate = learning_rate 59 | 60 | if layerwise_lr_decay_power > 0: 61 | learning_rate = _get_layer_lrs( 62 | learning_rate, 63 | layerwise_lr_decay_power, 64 | n_transformer_layers, 65 | decoder_layers, 66 | ) 67 | learning_rate['embedding_shared_weights/'] = cp_learning_rate 68 | 
learning_rate['decoder_stack/layer_normalization/'] = cp_learning_rate 69 | print(learning_rate) 70 | optimizer = AdamWeightDecayOptimizer( 71 | learning_rate = learning_rate, 72 | weight_decay_rate = weight_decay_rate, 73 | beta_1 = 0.9, 74 | beta_2 = 0.999, 75 | epsilon = 1e-6, 76 | exclude_from_weight_decay = ['LayerNorm', 'layer_norm', 'bias'], 77 | ) 78 | if use_tpu: 79 | optimizer = tf.tpu.CrossShardOptimizer(optimizer) 80 | 81 | tvars = tf.trainable_variables() 82 | grads = tf.gradients(loss, tvars) 83 | (grads, _) = tf.clip_by_global_norm(grads, clip_norm = 1.0) 84 | train_op = optimizer.apply_gradients( 85 | zip(grads, tvars), global_step = global_step 86 | ) 87 | new_global_step = global_step + 1 88 | train_op = tf.group(train_op, [global_step.assign(new_global_step)]) 89 | return train_op 90 | 91 | 92 | class AdamWeightDecayOptimizer(tf.train.Optimizer): 93 | """A basic Adam optimizer that includes "correct" L2 weight decay.""" 94 | 95 | def __init__( 96 | self, 97 | learning_rate, 98 | weight_decay_rate = 0.0, 99 | beta_1 = 0.9, 100 | beta_2 = 0.999, 101 | epsilon = 1e-6, 102 | exclude_from_weight_decay = None, 103 | name = 'AdamWeightDecayOptimizer', 104 | ): 105 | """Constructs a AdamWeightDecayOptimizer.""" 106 | super(AdamWeightDecayOptimizer, self).__init__(False, name) 107 | 108 | self.learning_rate = learning_rate 109 | self.weight_decay_rate = weight_decay_rate 110 | self.beta_1 = beta_1 111 | self.beta_2 = beta_2 112 | self.epsilon = epsilon 113 | self.exclude_from_weight_decay = exclude_from_weight_decay 114 | 115 | def _apply_gradients(self, grads_and_vars, learning_rate): 116 | """See base class.""" 117 | assignments = [] 118 | for (grad, param) in grads_and_vars: 119 | if grad is None or param is None: 120 | continue 121 | 122 | param_name = self._get_variable_name(param.name) 123 | 124 | m = tf.get_variable( 125 | name = param_name + '/adam_m', 126 | shape = param.shape.as_list(), 127 | dtype = tf.float32, 128 | trainable = False, 129 | 
initializer = tf.zeros_initializer(), 130 | ) 131 | v = tf.get_variable( 132 | name = param_name + '/adam_v', 133 | shape = param.shape.as_list(), 134 | dtype = tf.float32, 135 | trainable = False, 136 | initializer = tf.zeros_initializer(), 137 | ) 138 | 139 | # Standard Adam update. 140 | next_m = tf.multiply(self.beta_1, m) + tf.multiply( 141 | 1.0 - self.beta_1, grad 142 | ) 143 | next_v = tf.multiply(self.beta_2, v) + tf.multiply( 144 | 1.0 - self.beta_2, tf.square(grad) 145 | ) 146 | update = next_m / (tf.sqrt(next_v) + self.epsilon) 147 | 148 | # Just adding the square of the weights to the loss function is *not* 149 | # the correct way of using L2 regularization/weight decay with Adam, 150 | # since that will interact with the m and v parameters in strange ways. 151 | # 152 | # Instead we want ot decay the weights in a manner that doesn't interact 153 | # with the m/v parameters. This is equivalent to adding the square 154 | # of the weights to the loss with plain (non-momentum) SGD. 
155 | if self.weight_decay_rate > 0: 156 | if self._do_use_weight_decay(param_name): 157 | update += self.weight_decay_rate * param 158 | 159 | update_with_lr = learning_rate * update 160 | next_param = param - update_with_lr 161 | 162 | assignments.extend( 163 | [param.assign(next_param), m.assign(next_m), v.assign(next_v)] 164 | ) 165 | 166 | return assignments 167 | 168 | def apply_gradients(self, grads_and_vars, global_step = None, name = None): 169 | if isinstance(self.learning_rate, dict): 170 | key_to_grads_and_vars = {} 171 | for grad, var in grads_and_vars: 172 | update_for_var = False 173 | for key in self.learning_rate: 174 | if key in var.name: 175 | update_for_var = True 176 | if key not in key_to_grads_and_vars: 177 | key_to_grads_and_vars[key] = [] 178 | key_to_grads_and_vars[key].append((grad, var)) 179 | if not update_for_var: 180 | raise ValueError( 181 | 'No learning rate specified for variable', var 182 | ) 183 | assignments = [] 184 | for key, key_grads_and_vars in key_to_grads_and_vars.items(): 185 | assignments += self._apply_gradients( 186 | key_grads_and_vars, self.learning_rate[key] 187 | ) 188 | else: 189 | assignments = self._apply_gradients( 190 | grads_and_vars, self.learning_rate 191 | ) 192 | return tf.group(*assignments, name = name) 193 | 194 | def _do_use_weight_decay(self, param_name): 195 | """Whether to use L2 weight decay for `param_name`.""" 196 | if not self.weight_decay_rate: 197 | return False 198 | if self.exclude_from_weight_decay: 199 | for r in self.exclude_from_weight_decay: 200 | if re.search(r, param_name) is not None: 201 | return False 202 | return True 203 | 204 | def _get_variable_name(self, param_name): 205 | """Get the variable name from the tensor name.""" 206 | m = re.match('^(.*):\\d+$', param_name) 207 | if m is not None: 208 | param_name = m.group(1) 209 | return param_name 210 | 211 | 212 | def _get_layer_lrs(learning_rate, layer_decay, n_layers, decoder_layers): 213 | """Have lower learning rates for 
layers closer to the input.""" 214 | key_to_depths = collections.OrderedDict( 215 | { 216 | '/embeddings/': 0, 217 | '/embeddings_project/': 0, 218 | 'task_specific/': n_layers + 2, 219 | } 220 | ) 221 | for layer in range(n_layers): 222 | key_to_depths['encoder/layer_' + str(layer) + '/'] = layer + 1 223 | for layer in range(decoder_layers): 224 | key_to_depths['decoder_stack/layer_' + str(layer) + '/'] = layer + 1 225 | return { 226 | key: learning_rate * (layer_decay ** (n_layers + 2 - depth)) 227 | for key, depth in key_to_depths.items() 228 | } 229 | -------------------------------------------------------------------------------- /neural-machine-translation/prepare-bpe.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import json\n", 10 | "\n", 11 | "with open('dataset.json') as fopen:\n", 12 | " dataset = json.load(fopen)\n", 13 | " \n", 14 | "X = dataset['train_X'] + dataset['test_X']\n", 15 | "Y = dataset['train_Y'] + dataset['test_Y']" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 2, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import youtokentome as yttm" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 5, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "with open('text.txt', 'w') as fopen:\n", 34 | " fopen.write('\\n'.join(X + Y))\n", 35 | " \n", 36 | "bpe = yttm.BPE.train(data='text.txt', vocab_size=32000, model='bpe.model',\n", 37 | " pad_id=0, unk_id=2, bos_id=3, eos_id=1)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 8, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "train_X = bpe.encode(dataset['train_X'], output_type=yttm.OutputType.ID)\n", 47 | "train_Y = bpe.encode(dataset['train_Y'], output_type=yttm.OutputType.ID)\n", 48 | "test_X = 
bpe.encode(dataset['test_X'], output_type=yttm.OutputType.ID)\n", 49 | "test_Y = bpe.encode(dataset['test_Y'], output_type=yttm.OutputType.ID)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 10, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "with open('dataset-bpe.json', 'w') as fopen:\n", 59 | " json.dump({'train_X': train_X, 'train_Y': train_Y, 'test_X': test_X, 'test_Y': test_Y}, fopen)" 60 | ] 61 | } 62 | ], 63 | "metadata": { 64 | "kernelspec": { 65 | "display_name": "Python 3", 66 | "language": "python", 67 | "name": "python3" 68 | }, 69 | "language_info": { 70 | "codemirror_mode": { 71 | "name": "ipython", 72 | "version": 3 73 | }, 74 | "file_extension": ".py", 75 | "mimetype": "text/x-python", 76 | "name": "python", 77 | "nbconvert_exporter": "python", 78 | "pygments_lexer": "ipython3", 79 | "version": "3.6.8" 80 | } 81 | }, 82 | "nbformat": 4, 83 | "nbformat_minor": 2 84 | } 85 | -------------------------------------------------------------------------------- /neural-machine-translation/prepare-dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# !wget https://s3.amazonaws.com/opennmt-trainingdata/baseline-1M-enfr.tgz\n", 10 | "# !tar -zxf baseline-1M-enfr.tgz" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [ 18 | { 19 | "data": { 20 | "text/plain": [ 21 | "['baseline-1M-enfr/baseline-1M_test.en',\n", 22 | " 'baseline-1M-enfr/baseline-1M_valid.en',\n", 23 | " 'baseline-1M-enfr/baseline-1M_train.fr',\n", 24 | " 'baseline-1M-enfr/baseline-1M_valid.fr',\n", 25 | " 'baseline-1M-enfr/baseline-1M_train.en',\n", 26 | " 'baseline-1M-enfr/baseline-1M_test.fr']" 27 | ] 28 | }, 29 | "execution_count": 2, 30 | "metadata": {}, 31 | "output_type": "execute_result" 32 | } 33 | ], 34 | 
"source": [ 35 | "from glob import glob\n", 36 | "\n", 37 | "files = glob('baseline-1M-enfr/*')\n", 38 | "files" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 3, 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "data": { 48 | "text/plain": [ 49 | "(1009163, 1009163)" 50 | ] 51 | }, 52 | "execution_count": 3, 53 | "metadata": {}, 54 | "output_type": "execute_result" 55 | } 56 | ], 57 | "source": [ 58 | "with open('baseline-1M-enfr/baseline-1M_train.en') as fopen:\n", 59 | " train_en = fopen.read().split('\\n')[:-1]\n", 60 | " \n", 61 | "with open('baseline-1M-enfr/baseline-1M_train.fr') as fopen:\n", 62 | " train_fr = fopen.read().split('\\n')[:-1]\n", 63 | " \n", 64 | "len(train_en), len(train_fr)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 4, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "data": { 74 | "text/plain": [ 75 | "(1000, 1000)" 76 | ] 77 | }, 78 | "execution_count": 4, 79 | "metadata": {}, 80 | "output_type": "execute_result" 81 | } 82 | ], 83 | "source": [ 84 | "with open('baseline-1M-enfr/baseline-1M_test.en') as fopen:\n", 85 | " test_en = fopen.read().split('\\n')[:-1]\n", 86 | " \n", 87 | "with open('baseline-1M-enfr/baseline-1M_test.fr') as fopen:\n", 88 | " test_fr = fopen.read().split('\\n')[:-1]\n", 89 | " \n", 90 | "len(test_en), len(test_fr)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 5, 96 | "metadata": {}, 97 | "outputs": [ 98 | { 99 | "data": { 100 | "text/plain": [ 101 | "(2000, 2000)" 102 | ] 103 | }, 104 | "execution_count": 5, 105 | "metadata": {}, 106 | "output_type": "execute_result" 107 | } 108 | ], 109 | "source": [ 110 | "with open('baseline-1M-enfr/baseline-1M_valid.en') as fopen:\n", 111 | " test_en.extend(fopen.read().split('\\n')[:-1])\n", 112 | " \n", 113 | "with open('baseline-1M-enfr/baseline-1M_valid.fr') as fopen:\n", 114 | " test_fr.extend(fopen.read().split('\\n')[:-1])\n", 115 | " \n", 116 | "len(test_en), len(test_fr)" 117 | ] 
118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 8, 122 | "metadata": {}, 123 | "outputs": [ 124 | { 125 | "name": "stderr", 126 | "output_type": "stream", 127 | "text": [ 128 | "100%|██████████| 1009163/1009163 [00:03<00:00, 301686.48it/s]\n" 129 | ] 130 | }, 131 | { 132 | "data": { 133 | "text/plain": [ 134 | "(1009088, 1009088)" 135 | ] 136 | }, 137 | "execution_count": 8, 138 | "metadata": {}, 139 | "output_type": "execute_result" 140 | } 141 | ], 142 | "source": [ 143 | "from tqdm import tqdm\n", 144 | "\n", 145 | "train_X, train_Y = [], []\n", 146 | "\n", 147 | "for i in tqdm(range(len(train_en))):\n", 148 | " if len(train_en[i].split()) > 100 or len(train_fr[i].split()) > 100:\n", 149 | " continue\n", 150 | " train_X.append(train_en[i])\n", 151 | " train_Y.append(train_fr[i])\n", 152 | " \n", 153 | "len(train_X), len(train_Y)" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 9, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "test_X, test_Y = train_X[-5000:], train_Y[-5000:]\n", 163 | "train_X, train_Y = train_X[:200000], train_Y[:200000]" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 10, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "import json\n", 173 | "\n", 174 | "with open('dataset.json', 'w') as fopen:\n", 175 | " json.dump({'train_X': train_X, 'train_Y': train_Y, 'test_X': test_X, 'test_Y': test_Y}, fopen)" 176 | ] 177 | } 178 | ], 179 | "metadata": { 180 | "kernelspec": { 181 | "display_name": "Python 3", 182 | "language": "python", 183 | "name": "python3" 184 | }, 185 | "language_info": { 186 | "codemirror_mode": { 187 | "name": "ipython", 188 | "version": 3 189 | }, 190 | "file_extension": ".py", 191 | "mimetype": "text/x-python", 192 | "name": "python", 193 | "nbconvert_exporter": "python", 194 | "pygments_lexer": "ipython3", 195 | "version": "3.6.8" 196 | } 197 | }, 198 | "nbformat": 4, 199 | "nbformat_minor": 2 200 | } 201 | 
-------------------------------------------------------------------------------- /neural-machine-translation/prepare-t2t.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "!mkdir train\n", 10 | "!mkdir test" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "import json\n", 20 | "\n", 21 | "with open('dataset.json') as fopen:\n", 22 | " data = json.load(fopen)\n", 23 | " \n", 24 | "train_X = data['train_X']\n", 25 | "train_Y = data['train_Y']\n", 26 | "test_X = data['test_X']\n", 27 | "test_Y = data['test_Y']" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 3, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "with open('train/before.txt', 'w') as fopen:\n", 37 | " fopen.write('\\n'.join(train_X))\n", 38 | " \n", 39 | "with open('train/after.txt', 'w') as fopen:\n", 40 | " fopen.write('\\n'.join(train_Y))\n", 41 | " \n", 42 | "with open('test/before.txt', 'w') as fopen:\n", 43 | " fopen.write('\\n'.join(test_X))\n", 44 | " \n", 45 | "with open('test/after.txt', 'w') as fopen:\n", 46 | " fopen.write('\\n'.join(test_Y))" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 4, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "name": "stdout", 56 | "output_type": "stream", 57 | "text": [ 58 | "train/\n", 59 | "train/after.txt\n", 60 | "train/before.txt\n", 61 | "test/\n", 62 | "test/after.txt\n", 63 | "test/before.txt\n" 64 | ] 65 | } 66 | ], 67 | "source": [ 68 | "!tar -czvf train-translation.tar.gz train\n", 69 | "!tar -czvf test-translation.tar.gz test" 70 | ] 71 | } 72 | ], 73 | "metadata": { 74 | "kernelspec": { 75 | "display_name": "Python 3", 76 | "language": "python", 77 | "name": "python3" 78 | }, 79 | "language_info": { 80 | "codemirror_mode": 
{ 81 | "name": "ipython", 82 | "version": 3 83 | }, 84 | "file_extension": ".py", 85 | "mimetype": "text/x-python", 86 | "name": "python", 87 | "nbconvert_exporter": "python", 88 | "pygments_lexer": "ipython3", 89 | "version": "3.6.8" 90 | } 91 | }, 92 | "nbformat": 4, 93 | "nbformat_minor": 2 94 | } 95 | -------------------------------------------------------------------------------- /neural-machine-translation/t/tokenizer.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Tensor2Tensor Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """A simple invertible tokenizer. 17 | 18 | Converts from a unicode string to a list of tokens 19 | (represented as Unicode strings). 20 | 21 | This tokenizer has the following desirable properties: 22 | - It is invertible. 23 | - Alphanumeric characters are broken away from non-alphanumeric characters. 24 | - A single space between words does not produce an extra token. 25 | - The full Unicode punctuation and separator set is recognized. 26 | 27 | The tokenization algorithm is as follows: 28 | 29 | 1. Split the text into a list of tokens, splitting at every boundary of an 30 | alphanumeric character and a non-alphanumeric character. 
This produces 31 | a list which alternates between "alphanumeric tokens" 32 | (strings of alphanumeric characters) and "non-alphanumeric tokens" 33 | (strings of non-alphanumeric characters). 34 | 35 | 2. Remove every token consisting of a single space, unless it is 36 | the very first or very last token in the list. These tokens are now 37 | implied by the fact that there are two adjacent alphanumeric tokens. 38 | 39 | e.g. u"Dude - that's so cool." 40 | -> [u"Dude", u" - ", u"that", u"'", u"s", u"so", u"cool", u"."] 41 | """ 42 | 43 | from __future__ import absolute_import 44 | from __future__ import division 45 | from __future__ import print_function 46 | 47 | import collections 48 | import sys 49 | import unicodedata 50 | import six 51 | from six.moves import range # pylint: disable=redefined-builtin 52 | import tensorflow.compat.v1 as tf 53 | 54 | # Conversion between Unicode and UTF-8, if required (on Python2) 55 | _native_to_unicode = (lambda s: s.decode('utf-8')) if six.PY2 else (lambda s: s) 56 | 57 | 58 | # This set contains all letter and number characters. 59 | _ALPHANUMERIC_CHAR_SET = set( 60 | six.unichr(i) 61 | for i in range(sys.maxunicode) 62 | if ( 63 | unicodedata.category(six.unichr(i)).startswith('L') 64 | or unicodedata.category(six.unichr(i)).startswith('N') 65 | ) 66 | ) 67 | 68 | 69 | def encode(text): 70 | """Encode a unicode string as a list of tokens. 
71 | 72 | Args: 73 | text: a unicode string 74 | Returns: 75 | a list of tokens as Unicode strings 76 | """ 77 | if not text: 78 | return [] 79 | ret = [] 80 | token_start = 0 81 | # Classify each character in the input string 82 | is_alnum = [c in _ALPHANUMERIC_CHAR_SET for c in text] 83 | for pos in range(1, len(text)): 84 | if is_alnum[pos] != is_alnum[pos - 1]: 85 | token = text[token_start:pos] 86 | if token != u' ' or token_start == 0: 87 | ret.append(token) 88 | token_start = pos 89 | final_token = text[token_start:] 90 | ret.append(final_token) 91 | return ret 92 | 93 | 94 | def decode(tokens): 95 | """Decode a list of tokens to a unicode string. 96 | 97 | Args: 98 | tokens: a list of Unicode strings 99 | Returns: 100 | a unicode string 101 | """ 102 | token_is_alnum = [t[0] in _ALPHANUMERIC_CHAR_SET for t in tokens] 103 | ret = [] 104 | for i, token in enumerate(tokens): 105 | if i > 0 and token_is_alnum[i - 1] and token_is_alnum[i]: 106 | ret.append(u' ') 107 | ret.append(token) 108 | return ''.join(ret) 109 | 110 | 111 | def _read_filepattern(filepattern, max_lines = None, split_on_newlines = True): 112 | """Reads files matching a wildcard pattern, yielding the contents. 113 | 114 | Args: 115 | filepattern: A wildcard pattern matching one or more files. 116 | max_lines: If set, stop reading after reading this many lines. 117 | split_on_newlines: A boolean. If true, then split files by lines and strip 118 | leading and trailing whitespace from each line. Otherwise, treat each 119 | file as a single string. 120 | 121 | Yields: 122 | The contents of the files as lines, if split_on_newlines is True, or 123 | the entire contents of each file if False. 
124 | """ 125 | filenames = sorted(tf.gfile.Glob(filepattern)) 126 | lines_read = 0 127 | for filename in filenames: 128 | with tf.gfile.Open(filename) as f: 129 | if split_on_newlines: 130 | for line in f: 131 | yield line.strip() 132 | lines_read += 1 133 | if max_lines and lines_read >= max_lines: 134 | return 135 | 136 | else: 137 | if max_lines: 138 | doc = [] 139 | for line in f: 140 | doc.append(line) 141 | lines_read += 1 142 | if max_lines and lines_read >= max_lines: 143 | yield ''.join(doc) 144 | return 145 | yield ''.join(doc) 146 | 147 | else: 148 | yield f.read() 149 | 150 | 151 | def vocab_token_counts(text_filepattern, max_lines): 152 | """Read a vocab file and return a dictionary of token counts. 153 | 154 | Reads a two-column CSV file of tokens and their frequency in a dataset. The 155 | tokens are presumed to be generated by encode() or the equivalent. 156 | 157 | Args: 158 | text_filepattern: A pattern matching one or more files. 159 | max_lines: An integer; maximum total lines to read. 160 | 161 | Returns: 162 | a dictionary mapping token to count. 163 | """ 164 | ret = {} 165 | for i, line in enumerate( 166 | _read_filepattern(text_filepattern, max_lines = max_lines) 167 | ): 168 | if ',' not in line: 169 | tf.logging.warning("Malformed vocab line #%d '%s'", i, line) 170 | continue 171 | 172 | token, count = line.rsplit(',', 1) 173 | ret[_native_to_unicode(token)] = int(count) 174 | 175 | return ret 176 | -------------------------------------------------------------------------------- /neural-machine-translation/transformer/attention_layer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Implementation of multiheaded attention and self-attention layers.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import tensorflow as tf 22 | 23 | 24 | class Attention(tf.layers.Layer): 25 | """Multi-headed attention layer.""" 26 | 27 | def __init__(self, hidden_size, num_heads, attention_dropout, train): 28 | if hidden_size % num_heads != 0: 29 | raise ValueError( 30 | 'Hidden size must be evenly divisible by the number of ' 31 | 'heads.' 32 | ) 33 | 34 | super(Attention, self).__init__() 35 | self.hidden_size = hidden_size 36 | self.num_heads = num_heads 37 | self.attention_dropout = attention_dropout 38 | self.train = train 39 | 40 | # Layers for linearly projecting the queries, keys, and values. 41 | self.q_dense_layer = tf.layers.Dense( 42 | hidden_size, use_bias = False, name = 'q' 43 | ) 44 | self.k_dense_layer = tf.layers.Dense( 45 | hidden_size, use_bias = False, name = 'k' 46 | ) 47 | self.v_dense_layer = tf.layers.Dense( 48 | hidden_size, use_bias = False, name = 'v' 49 | ) 50 | 51 | self.output_dense_layer = tf.layers.Dense( 52 | hidden_size, use_bias = False, name = 'output_transform' 53 | ) 54 | 55 | def split_heads(self, x): 56 | """Split x into different heads, and transpose the resulting value. 57 | 58 | The tensor is transposed to insure the inner dimensions hold the correct 59 | values during the matrix multiplication. 
60 | 61 | Args: 62 | x: A tensor with shape [batch_size, length, hidden_size] 63 | 64 | Returns: 65 | A tensor with shape [batch_size, num_heads, length, hidden_size/num_heads] 66 | """ 67 | with tf.name_scope('split_heads'): 68 | batch_size = tf.shape(x)[0] 69 | length = tf.shape(x)[1] 70 | 71 | # Calculate depth of last dimension after it has been split. 72 | depth = self.hidden_size // self.num_heads 73 | 74 | # Split the last dimension 75 | x = tf.reshape(x, [batch_size, length, self.num_heads, depth]) 76 | 77 | # Transpose the result 78 | return tf.transpose(x, [0, 2, 1, 3]) 79 | 80 | def combine_heads(self, x): 81 | """Combine tensor that has been split. 82 | 83 | Args: 84 | x: A tensor [batch_size, num_heads, length, hidden_size/num_heads] 85 | 86 | Returns: 87 | A tensor with shape [batch_size, length, hidden_size] 88 | """ 89 | with tf.name_scope('combine_heads'): 90 | batch_size = tf.shape(x)[0] 91 | length = tf.shape(x)[2] 92 | x = tf.transpose( 93 | x, [0, 2, 1, 3] 94 | ) # --> [batch, length, num_heads, depth] 95 | return tf.reshape(x, [batch_size, length, self.hidden_size]) 96 | 97 | def call(self, x, y, bias, cache = None): 98 | """Apply attention mechanism to x and y. 99 | 100 | Args: 101 | x: a tensor with shape [batch_size, length_x, hidden_size] 102 | y: a tensor with shape [batch_size, length_y, hidden_size] 103 | bias: attention bias that will be added to the result of the dot product. 104 | cache: (Used during prediction) dictionary with tensors containing results 105 | of previous attentions. The dictionary must have the items: 106 | {"k": tensor with shape [batch_size, i, key_channels], 107 | "v": tensor with shape [batch_size, i, value_channels]} 108 | where i is the current decoded length. 109 | 110 | Returns: 111 | Attention layer output with shape [batch_size, length_x, hidden_size] 112 | """ 113 | # Linearly project the query (q), key (k) and value (v) using different 114 | # learned projections. 
This is in preparation of splitting them into 115 | # multiple heads. Multi-head attention uses multiple queries, keys, and 116 | # values rather than regular attention (which uses a single q, k, v). 117 | q = self.q_dense_layer(x) 118 | k = self.k_dense_layer(y) 119 | v = self.v_dense_layer(y) 120 | 121 | if cache is not None: 122 | # Combine cached keys and values with new keys and values. 123 | k = tf.concat([cache['k'], k], axis = 1) 124 | v = tf.concat([cache['v'], v], axis = 1) 125 | 126 | # Update cache 127 | cache['k'] = k 128 | cache['v'] = v 129 | 130 | # Split q, k, v into heads. 131 | q = self.split_heads(q) 132 | k = self.split_heads(k) 133 | v = self.split_heads(v) 134 | 135 | # Scale q to prevent the dot product between q and k from growing too large. 136 | depth = self.hidden_size // self.num_heads 137 | q *= depth ** -0.5 138 | 139 | # Calculate dot product attention 140 | logits = tf.matmul(q, k, transpose_b = True) 141 | logits += bias 142 | weights = tf.nn.softmax(logits, name = 'attention_weights') 143 | if self.train: 144 | weights = tf.nn.dropout(weights, 1.0 - self.attention_dropout) 145 | attention_output = tf.matmul(weights, v) 146 | 147 | # Recombine heads --> [batch_size, length, hidden_size] 148 | attention_output = self.combine_heads(attention_output) 149 | 150 | # Run the combined outputs through another linear projection layer. 151 | attention_output = self.output_dense_layer(attention_output) 152 | return attention_output 153 | 154 | 155 | class SelfAttention(Attention): 156 | """Multiheaded self-attention layer.""" 157 | 158 | def call(self, x, bias, cache = None): 159 | return super(SelfAttention, self).call(x, x, bias, cache) 160 | -------------------------------------------------------------------------------- /neural-machine-translation/transformer/embedding_layer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Implementation of embedding layer with shared weights.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import tensorflow as tf # pylint: disable=g-bad-import-order 22 | 23 | 24 | class EmbeddingSharedWeights(tf.layers.Layer): 25 | """Calculates input embeddings and pre-softmax linear with shared weights.""" 26 | 27 | def __init__(self, vocab_size, hidden_size, method = 'gather'): 28 | """Specify characteristic parameters of embedding layer. 29 | 30 | Args: 31 | vocab_size: Number of tokens in the embedding. (Typically ~32,000) 32 | hidden_size: Dimensionality of the embedding. (Typically 512 or 1024) 33 | method: Strategy for performing embedding lookup. "gather" uses tf.gather 34 | which performs well on CPUs and GPUs, but very poorly on TPUs. "matmul" 35 | one-hot encodes the indices and formulates the embedding as a sparse 36 | matrix multiplication. The matmul formulation is wasteful as it does 37 | extra work, however matrix multiplication is very fast on TPUs which 38 | makes "matmul" considerably faster than "gather" on TPUs.
39 | """ 40 | super(EmbeddingSharedWeights, self).__init__() 41 | self.vocab_size = vocab_size 42 | self.hidden_size = hidden_size 43 | if method not in ('gather', 'matmul'): 44 | raise ValueError( 45 | "method {} must be 'gather' or 'matmul'".format(method) 46 | ) 47 | self.method = method 48 | 49 | def build(self, _): 50 | with tf.variable_scope('embedding_and_softmax', reuse = tf.AUTO_REUSE): 51 | # Create and initialize weights. The random normal initializer was chosen 52 | # randomly, and works well. 53 | self.shared_weights = tf.get_variable( 54 | 'weights', 55 | [self.vocab_size, self.hidden_size], 56 | initializer = tf.random_normal_initializer( 57 | 0.0, self.hidden_size ** -0.5 58 | ), 59 | ) 60 | 61 | self.built = True 62 | 63 | def call(self, x): 64 | """Get token embeddings of x. 65 | 66 | Args: 67 | x: An int64 tensor with shape [batch_size, length] 68 | Returns: 69 | embeddings: float32 tensor with shape [batch_size, length, embedding_size] 70 | padding: float32 tensor with shape [batch_size, length] indicating the 71 | locations of the padding tokens in x. 72 | """ 73 | with tf.name_scope('embedding'): 74 | # Create binary mask of size [batch_size, length] 75 | mask = tf.to_float(tf.not_equal(x, 0)) 76 | 77 | if self.method == 'gather': 78 | embeddings = tf.gather(self.shared_weights, x) 79 | embeddings *= tf.expand_dims(mask, -1) 80 | else: # matmul 81 | embeddings = tpu_utils.embedding_matmul( 82 | embedding_table = self.shared_weights, 83 | values = tf.cast(x, dtype = tf.int32), 84 | mask = mask, 85 | ) 86 | # embedding_matmul already zeros out masked positions, so 87 | # `embeddings *= tf.expand_dims(mask, -1)` is unnecessary. 88 | 89 | # Scale embedding by the sqrt of the hidden size 90 | embeddings *= self.hidden_size ** 0.5 91 | 92 | return embeddings 93 | 94 | def linear(self, x): 95 | """Computes logits by running x through a linear layer. 
96 | 97 | Args: 98 | x: A float32 tensor with shape [batch_size, length, hidden_size] 99 | Returns: 100 | float32 tensor with shape [batch_size, length, vocab_size]. 101 | """ 102 | with tf.name_scope('presoftmax_linear'): 103 | batch_size = tf.shape(x)[0] 104 | length = tf.shape(x)[1] 105 | 106 | x = tf.reshape(x, [-1, self.hidden_size]) 107 | logits = tf.matmul(x, self.shared_weights, transpose_b = True) 108 | 109 | return tf.reshape(logits, [batch_size, length, self.vocab_size]) 110 | -------------------------------------------------------------------------------- /neural-machine-translation/transformer/ffn_layer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | """Implementation of fully connected network.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import tensorflow as tf 22 | 23 | 24 | class FeedFowardNetwork(tf.layers.Layer): 25 | """Fully connected feedforward network.""" 26 | 27 | def __init__( 28 | self, hidden_size, filter_size, relu_dropout, train, allow_pad 29 | ): 30 | super(FeedFowardNetwork, self).__init__() 31 | self.hidden_size = hidden_size 32 | self.filter_size = filter_size 33 | self.relu_dropout = relu_dropout 34 | self.train = train 35 | self.allow_pad = allow_pad 36 | 37 | self.filter_dense_layer = tf.layers.Dense( 38 | filter_size, 39 | use_bias = True, 40 | activation = tf.nn.relu, 41 | name = 'filter_layer', 42 | ) 43 | self.output_dense_layer = tf.layers.Dense( 44 | hidden_size, use_bias = True, name = 'output_layer' 45 | ) 46 | 47 | def call(self, x, padding = None): 48 | """Return outputs of the feedforward network. 49 | 50 | Args: 51 | x: tensor with shape [batch_size, length, hidden_size] 52 | padding: (optional) If set, the padding values are temporarily removed 53 | from x (provided self.allow_pad is set). The padding values are placed 54 | back in the output tensor in the same locations. 55 | shape [batch_size, length] 56 | 57 | Returns: 58 | Output of the feedforward network. 
59 | tensor with shape [batch_size, length, hidden_size] 60 | """ 61 | padding = None if not self.allow_pad else padding 62 | 63 | # Retrieve dynamically known shapes 64 | batch_size = tf.shape(x)[0] 65 | length = tf.shape(x)[1] 66 | 67 | if padding is not None: 68 | with tf.name_scope('remove_padding'): 69 | # Flatten padding to [batch_size*length] 70 | pad_mask = tf.reshape(padding, [-1]) 71 | 72 | nonpad_ids = tf.to_int32(tf.where(pad_mask < 1e-9)) 73 | 74 | # Reshape x to [batch_size*length, hidden_size] to remove padding 75 | x = tf.reshape(x, [-1, self.hidden_size]) 76 | x = tf.gather_nd(x, indices = nonpad_ids) 77 | 78 | # Reshape x from 2 dimensions to 3 dimensions. 79 | x.set_shape([None, self.hidden_size]) 80 | x = tf.expand_dims(x, axis = 0) 81 | 82 | output = self.filter_dense_layer(x) 83 | if self.train: 84 | output = tf.nn.dropout(output, 1.0 - self.relu_dropout) 85 | output = self.output_dense_layer(output) 86 | 87 | if padding is not None: 88 | with tf.name_scope('re_add_padding'): 89 | output = tf.squeeze(output, axis = 0) 90 | output = tf.scatter_nd( 91 | indices = nonpad_ids, 92 | updates = output, 93 | shape = [batch_size * length, self.hidden_size], 94 | ) 95 | output = tf.reshape( 96 | output, [batch_size, length, self.hidden_size] 97 | ) 98 | return output 99 | -------------------------------------------------------------------------------- /neural-machine-translation/transformer/model_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Transformer model helper methods."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math

import tensorflow as tf

# Large negative value added to masked attention logits before softmax.
_NEG_INF = -1e9


def get_position_encoding(
    length, hidden_size, min_timescale = 1.0, max_timescale = 1.0e4
):
    """Return positional encoding.

    Calculates the position encoding as a mix of sine and cosine functions
    with geometrically increasing wavelengths, as defined and formulized in
    "Attention is All You Need", section 3.5.

    Args:
      length: Sequence length.
      hidden_size: Size of the position encoding (model hidden dimension).
      min_timescale: Minimum scale that will be applied at each position.
      max_timescale: Maximum scale that will be applied at each position.

    Returns:
      float tensor with shape [length, hidden_size]
    """
    positions = tf.to_float(tf.range(length))
    # Half the channels carry sine waves, the other half cosine waves.
    half_dim = hidden_size // 2
    # Timescales form a geometric progression from min to max.
    log_increment = math.log(
        float(max_timescale) / float(min_timescale)
    ) / (tf.to_float(half_dim) - 1)
    inv_timescales = min_timescale * tf.exp(
        tf.to_float(tf.range(half_dim)) * -log_increment
    )
    # Outer product: [length, 1] * [1, half_dim] -> [length, half_dim].
    angles = tf.expand_dims(positions, 1) * tf.expand_dims(inv_timescales, 0)
    return tf.concat([tf.sin(angles), tf.cos(angles)], axis = 1)


def get_decoder_self_attention_bias(length):
    """Calculate bias for decoder that maintains model's autoregressive property.

    Creates a tensor that masks out locations that correspond to illegal
    connections, so prediction at position i cannot draw information from
    future positions.

    Args:
      length: int length of sequences in batch.

    Returns:
      float tensor of shape [1, 1, length, length]
    """
    with tf.name_scope('decoder_self_attention_bias'):
        # Lower-triangular matrix of ones marks the allowed connections.
        allowed = tf.matrix_band_part(tf.ones([length, length]), -1, 0)
        allowed = tf.reshape(allowed, [1, 1, length, length])
        return _NEG_INF * (1.0 - allowed)


def get_padding(x, padding_value = 0):
    """Return float tensor representing the padding values in x.

    Args:
      x: int tensor with any shape
      padding_value: int value that marks padding positions in `x`.

    Returns:
      float tensor with same shape as x containing values 0 or 1.
        0 -> non-padding, 1 -> padding
    """
    with tf.name_scope('padding'):
        is_padding = tf.equal(x, padding_value)
        return tf.to_float(is_padding)


def get_padding_bias(x):
    """Calculate bias tensor from padding values in tensor.

    Bias tensor that is added to the pre-softmax multi-headed attention
    logits, which has shape [batch_size, num_heads, length, length]. The
    tensor is zero at non-padding locations, and -1e9 (negative infinity)
    at padding locations.

    Args:
      x: int tensor with shape [batch_size, length]

    Returns:
      Attention bias tensor of shape [batch_size, 1, 1, length].
    """
    with tf.name_scope('attention_bias'):
        bias = get_padding(x) * _NEG_INF
        return tf.expand_dims(tf.expand_dims(bias, axis = 1), axis = 1)
4 | -------------------------------------------------------------------------------- /question-answer/README.md: -------------------------------------------------------------------------------- 1 | ## How-to 2 | 3 | 1. Run any notebook using Jupyter Notebook. 4 | -------------------------------------------------------------------------------- /question-answer/attention_gru.py: -------------------------------------------------------------------------------- 1 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Module implementing RNN Cells. 16 | 17 | This module provides a number of basic commonly used RNN cells, such as LSTM 18 | (Long Short Term Memory) or GRU (Gated Recurrent Unit), and a number of 19 | operators that allow adding dropouts, projections, or embeddings for inputs. 20 | Constructing multi-layer cells is supported by the class `MultiRNNCell`, or by 21 | calling the `rnn` ops several times. 
22 | """ 23 | from __future__ import absolute_import 24 | from __future__ import division 25 | from __future__ import print_function 26 | 27 | import collections 28 | import hashlib 29 | import numbers 30 | 31 | from tensorflow.python.eager import context 32 | from tensorflow.python.framework import constant_op 33 | from tensorflow.python.framework import dtypes 34 | from tensorflow.python.framework import ops 35 | from tensorflow.python.framework import tensor_shape 36 | from tensorflow.python.framework import tensor_util 37 | from tensorflow.python.layers import base as base_layer 38 | from tensorflow.python.ops import array_ops 39 | from tensorflow.python.ops import clip_ops 40 | from tensorflow.python.ops import init_ops 41 | from tensorflow.python.ops import math_ops 42 | from tensorflow.python.ops import nn_ops 43 | from tensorflow.python.ops import partitioned_variables 44 | from tensorflow.python.ops import random_ops 45 | from tensorflow.python.ops import tensor_array_ops 46 | from tensorflow.python.ops import variable_scope as vs 47 | from tensorflow.python.ops import variables as tf_variables 48 | from tensorflow.python.platform import tf_logging as logging 49 | from tensorflow.python.util import nest 50 | from tensorflow.python.ops.rnn_cell_impl import RNNCell 51 | 52 | 53 | _BIAS_VARIABLE_NAME = "bias" 54 | _WEIGHTS_VARIABLE_NAME = "kernel" 55 | 56 | 57 | class AttentionGRUCell(RNNCell): 58 | """Gated Recurrent Unit cell (cf. http://arxiv.org/abs/1406.1078). 59 | 60 | Args: 61 | num_units: int, The number of units in the GRU cell. 62 | activation: Nonlinearity to use. Default: `tanh`. 63 | reuse: (optional) Python boolean describing whether to reuse variables 64 | in an existing scope. If not `True`, and the existing scope already has 65 | the given variables, an error is raised. 66 | kernel_initializer: (optional) The initializer to use for the weight and 67 | projection matrices. 68 | bias_initializer: (optional) The initializer to use for the bias. 
69 | """ 70 | 71 | def __init__(self, 72 | num_units, 73 | activation=None, 74 | reuse=None, 75 | name=None, 76 | kernel_initializer=None, 77 | bias_initializer=None): 78 | super(AttentionGRUCell, self).__init__(_reuse=reuse, name=name) 79 | self._num_units = num_units 80 | self._activation = activation or math_ops.tanh 81 | self._kernel_initializer = kernel_initializer 82 | self._bias_initializer = bias_initializer 83 | self._gate_linear = None 84 | self._candidate_linear = None 85 | 86 | @property 87 | def state_size(self): 88 | return self._num_units 89 | 90 | @property 91 | def output_size(self): 92 | return self._num_units 93 | 94 | def call(self, inputs, state): 95 | # extract input vector and attention (gate) 96 | if inputs.get_shape()[-1] != self._num_units + 1: 97 | raise ValueError("Input should be passed as word input concatenated with 1D attention on end axis") 98 | inputs, g = array_ops.split(inputs, 99 | num_or_size_splits=[self._num_units,1], 100 | axis=1) 101 | 102 | """Gated recurrent unit (GRU) with nunits cells.""" 103 | if self._gate_linear is None: 104 | bias_ones = self._bias_initializer 105 | if self._bias_initializer is None: 106 | bias_ones = init_ops.constant_initializer(1.0, dtype=inputs.dtype) 107 | with vs.variable_scope("gates"): # Reset gate and update gate. 
108 | self._gate_linear = _Linear( 109 | [inputs, state], 110 | self._num_units, 111 | True, 112 | bias_initializer=bias_ones, 113 | kernel_initializer=self._kernel_initializer) 114 | 115 | r = math_ops.sigmoid(self._gate_linear([inputs, state])) 116 | 117 | r_state = r * state 118 | if self._candidate_linear is None: 119 | with vs.variable_scope("candidate"): 120 | self._candidate_linear = _Linear( 121 | [inputs, r_state], 122 | self._num_units, 123 | True, 124 | bias_initializer=self._bias_initializer, 125 | kernel_initializer=self._kernel_initializer) 126 | c = self._activation(self._candidate_linear([inputs, r_state])) 127 | new_h = (1 - g) * state + g * c 128 | return new_h, new_h 129 | 130 | 131 | class _Linear(object): 132 | """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable. 133 | 134 | Args: 135 | args: a 2D Tensor or a list of 2D, batch x n, Tensors. 136 | output_size: int, second dimension of weight variable. 137 | dtype: data type for variables. 138 | build_bias: boolean, whether to build a bias variable. 139 | bias_initializer: starting value to initialize the bias 140 | (default is all zeros). 141 | kernel_initializer: starting value to initialize the weight. 142 | 143 | Raises: 144 | ValueError: if inputs_shape is wrong. 145 | """ 146 | 147 | def __init__(self, 148 | args, 149 | output_size, 150 | build_bias, 151 | bias_initializer=None, 152 | kernel_initializer=None): 153 | self._build_bias = build_bias 154 | 155 | if args is None or (nest.is_sequence(args) and not args): 156 | raise ValueError("`args` must be specified") 157 | if not nest.is_sequence(args): 158 | args = [args] 159 | self._is_sequence = False 160 | else: 161 | self._is_sequence = True 162 | 163 | # Calculate the total size of arguments on dimension 1. 
164 | total_arg_size = 0 165 | shapes = [a.get_shape() for a in args] 166 | for shape in shapes: 167 | if shape.ndims != 2: 168 | raise ValueError("linear is expecting 2D arguments: %s" % shapes) 169 | if shape[1].value is None: 170 | raise ValueError("linear expects shape[1] to be provided for shape %s, " 171 | "but saw %s" % (shape, shape[1])) 172 | else: 173 | total_arg_size += shape[1].value 174 | 175 | dtype = [a.dtype for a in args][0] 176 | 177 | scope = vs.get_variable_scope() 178 | with vs.variable_scope(scope) as outer_scope: 179 | self._weights = vs.get_variable( 180 | _WEIGHTS_VARIABLE_NAME, [total_arg_size, output_size], 181 | dtype=dtype, 182 | initializer=kernel_initializer) 183 | if build_bias: 184 | with vs.variable_scope(outer_scope) as inner_scope: 185 | inner_scope.set_partitioner(None) 186 | if bias_initializer is None: 187 | bias_initializer = init_ops.constant_initializer(0.0, dtype=dtype) 188 | self._biases = vs.get_variable( 189 | _BIAS_VARIABLE_NAME, [output_size], 190 | dtype=dtype, 191 | initializer=bias_initializer) 192 | 193 | def __call__(self, args): 194 | if not self._is_sequence: 195 | args = [args] 196 | 197 | if len(args) == 1: 198 | res = math_ops.matmul(args[0], self._weights) 199 | else: 200 | res = math_ops.matmul(array_ops.concat(args, 1), self._weights) 201 | if self._build_bias: 202 | res = nn_ops.bias_add(res, self._biases) 203 | return res -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow 2 | numpy 3 | scipy 4 | sklearn 5 | scikit-learn 6 | matplotlib 7 | seaborn 8 | pandas 9 | -------------------------------------------------------------------------------- /sentence-pair/Archive.zip: -------------------------------------------------------------------------------- 
import numpy as np
import librosa
import os
# Importing the submodule explicitly is required: a bare `import scipy`
# does not guarantee `scipy.io.wavfile` is loaded before use below.
import scipy.io.wavfile
import json


def change_pitch_speech(samples):
    """Randomly speed up the signal via linear resampling.

    The speed factor is 1/u with u ~ Uniform[0.8, 1). The result keeps the
    original buffer length and dtype; the tail left empty by the shortened
    signal stays zero.
    """
    y_pitch_speed = samples.copy()
    length_change = np.random.uniform(low = 0.8, high = 1)
    speed_fac = 1.0 / length_change
    tmp = np.interp(
        np.arange(0, len(y_pitch_speed), speed_fac),
        np.arange(0, len(y_pitch_speed)),
        y_pitch_speed,
    )
    minlen = min(y_pitch_speed.shape[0], tmp.shape[0])
    y_pitch_speed *= 0  # zero in place, preserving dtype and length
    y_pitch_speed[0:minlen] = tmp[0:minlen]
    return y_pitch_speed


def change_amplitude(samples):
    """Scale the signal by a random gain drawn from Uniform[1.5, 3)."""
    y_aug = samples.copy()
    dyn_change = np.random.uniform(low = 1.5, high = 3)
    return y_aug * dyn_change


def add_noise(samples):
    """Add Gaussian noise scaled to ~1% of the signal's peak amplitude."""
    y_noise = samples.copy()
    noise_amp = 0.01 * np.random.uniform() * np.amax(y_noise)
    return y_noise.astype('float64') + noise_amp * np.random.normal(
        size = y_noise.shape[0]
    )


def add_hpss(samples):
    """Return the percussive component of a harmonic/percussive separation."""
    y_hpss = librosa.effects.hpss(samples.astype('float64'))
    return y_hpss[1]


def strech(samples):
    """Time-stretch the signal by a random factor drawn from Uniform[0.5, 1.3)."""
    streching = samples.copy()
    random_strech = np.random.uniform(low = 0.5, high = 1.3)
    print('random_strech = ', random_strech)
    streching = librosa.effects.time_stretch(
        streching.astype('float'), random_strech
    )
    return streching


def random_augmentation(samples):
    """Apply a random subset of {resample, gain, noise, time-shift} effects."""
    cp = samples.copy()
    if np.random.randint(0, 2):
        # Random speed-up, as in change_pitch_speech.
        length_change = np.random.uniform(low = 0.8, high = 1)
        speed_fac = 1.0 / length_change
        print('resample length_change = ', length_change)
        tmp = np.interp(
            np.arange(0, len(cp), speed_fac), np.arange(0, len(cp)), cp
        )
        minlen = min(cp.shape[0], tmp.shape[0])
        cp *= 0
        cp[0:minlen] = tmp[0:minlen]

    if np.random.randint(0, 2):
        # Random gain.
        dyn_change = np.random.uniform(low = 1.5, high = 3)
        print('dyn_change = ', dyn_change)
        cp = cp * dyn_change

    if np.random.randint(0, 2):
        # Gaussian noise at ~0.5% of peak amplitude.
        noise_amp = 0.005 * np.random.uniform() * np.amax(cp)
        cp = cp.astype('float64') + noise_amp * np.random.normal(
            size = cp.shape[0]
        )

    if np.random.randint(0, 2):
        # Random time shift of up to +/-20% of the length, zero padded.
        timeshift_fac = 0.2 * 2 * (np.random.uniform() - 0.5)
        print('timeshift_fac = ', timeshift_fac)
        start = int(cp.shape[0] * timeshift_fac)
        if start > 0:
            cp = np.pad(cp, (start, 0), mode = 'constant')[0 : cp.shape[0]]
        else:
            cp = np.pad(cp, (0, -start), mode = 'constant')[0 : cp.shape[0]]
    return cp


def _write_augmented(root, ext, suffix, aug, sample_rate):
    """Write one augmented variant as `<root>-<suffix><ext>`."""
    librosa.output.write_wav(
        '%s-%d%s' % (root, suffix, ext),
        aug.astype('float32'),
        sample_rate,
        norm = True,
    )


# In order: the function producing the variant saved with suffix 1..6.
_AUGMENTATIONS = [
    change_pitch_speech,
    change_amplitude,
    add_noise,
    add_hpss,
    strech,
    random_augmentation,
]

with open('train-test.json') as fopen:
    wavs = json.load(fopen)['train']

if not os.path.exists('augment'):
    os.makedirs('augment')

for no, wav in enumerate(wavs):
    try:
        root, ext = os.path.splitext(wav)
        if (no + 1) % 100 == 0:
            print(no + 1, root, ext)
        # Flatten the source path into a single file name under augment/.
        root = root.replace('/', '<>')
        root = '%s/%s' % ('augment', root)
        sample_rate, samples = scipy.io.wavfile.read(wav)
        for suffix, augment in enumerate(_AUGMENTATIONS, start = 1):
            _write_augmented(root, ext, suffix, augment(samples), sample_rate)
    except Exception as e:
        # Best effort: skip files that fail to read or augment, but report.
        print(e)
| "100%|███████████████████████████████| 200/200 [05:38<00:00, 1.69s/it]\n", 27 | "100%|███████████████████████████████| 200/200 [06:16<00:00, 1.82s/it]\n", 28 | "100%|███████████████████████████████| 200/200 [06:00<00:00, 1.76s/it]\n", 29 | "100%|███████████████████████████████| 200/200 [06:46<00:00, 2.47s/it]\n", 30 | "100%|███████████████████████████████| 200/200 [09:04<00:00, 2.60s/it]\n", 31 | "100%|███████████████████████████████| 200/200 [10:12<00:00, 2.87s/it]\n", 32 | "100%|███████████████████████████████| 200/200 [09:01<00:00, 2.63s/it]\n", 33 | "100%|███████████████████████████████| 200/200 [09:39<00:00, 3.47s/it]\n", 34 | "100%|███████████████████████████████| 200/200 [10:56<00:00, 3.04s/it]\n", 35 | "100%|███████████████████████████████| 200/200 [11:12<00:00, 3.06s/it]\n", 36 | "100%|███████████████████████████████| 200/200 [07:46<00:00, 2.32s/it]\n", 37 | "100%|███████████████████████████████| 200/200 [09:30<00:00, 2.83s/it]\n", 38 | "100%|███████████████████████████████| 200/200 [10:05<00:00, 3.83s/it]\n" 39 | ] 40 | } 41 | ], 42 | "source": [ 43 | "prefix = 'https://tspace.library.utoronto.ca'\n", 44 | "save_dir = './data/'\n", 45 | "if not os.path.exists(save_dir):\n", 46 | " os.makedirs(save_dir)\n", 47 | "\n", 48 | "base_url = 'https://tspace.library.utoronto.ca/handle/1807/24'\n", 49 | "urls = [base_url+str(i) for i in range(488, 502)]\n", 50 | "for url in urls:\n", 51 | " soup = BeautifulSoup(urlopen(url).read(), 'html5lib')\n", 52 | " targets = soup.findAll('a', href=re.compile(r'/bitstream/.*.wav'))\n", 53 | " \n", 54 | " for a in tqdm(targets, total=len(targets), ncols=70):\n", 55 | " link = a['href']\n", 56 | "\n", 57 | " audio_save_loc = save_dir + link.split('/')[-1]\n", 58 | " if os.path.isfile(audio_save_loc):\n", 59 | " print(\"File Already Exists\")\n", 60 | " urlretrieve(prefix+a['href'], audio_save_loc)\n", 61 | "\n", 62 | " with open(audio_save_loc.replace('.wav', '.txt'), 'w') as f:\n", 63 | " f.write('say the word ' + 
link.split('_')[-2])" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [] 72 | } 73 | ], 74 | "metadata": { 75 | "kernelspec": { 76 | "display_name": "Python 3", 77 | "language": "python", 78 | "name": "python3" 79 | }, 80 | "language_info": { 81 | "codemirror_mode": { 82 | "name": "ipython", 83 | "version": 3 84 | }, 85 | "file_extension": ".py", 86 | "mimetype": "text/x-python", 87 | "name": "python", 88 | "nbconvert_exporter": "python", 89 | "pygments_lexer": "ipython3", 90 | "version": "3.5.2" 91 | } 92 | }, 93 | "nbformat": 4, 94 | "nbformat_minor": 2 95 | } 96 | -------------------------------------------------------------------------------- /speech-to-text/wav2vec-preprocessing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import librosa\n", 10 | "import glob" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 12, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "train = glob.glob('spectrogram-train/*.npy')\n", 20 | "x = []\n", 21 | "for fpath in train:\n", 22 | " fpath = fpath.split('/')[1]\n", 23 | " splitted = fpath.split('-')\n", 24 | " if len(splitted) == 2:\n", 25 | " splitted[1] = splitted[1].split('.')[1]\n", 26 | " fpath = splitted[0] + '.' 
+ splitted[1]\n", 27 | " fpath = fpath.replace('.npy','.wav')\n", 28 | " x.append('data/' + fpath)" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 13, 34 | "metadata": {}, 35 | "outputs": [ 36 | { 37 | "data": { 38 | "text/plain": [ 39 | "16341" 40 | ] 41 | }, 42 | "execution_count": 13, 43 | "metadata": {}, 44 | "output_type": "execute_result" 45 | } 46 | ], 47 | "source": [ 48 | "augment = glob.glob('augment/*.wav')\n", 49 | "x.extend(augment)\n", 50 | "x = list(set(x))\n", 51 | "len(x)" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 15, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "test_ = glob.glob('spectrogram-test/*.npy')\n", 61 | "test = []\n", 62 | "for t in test_:\n", 63 | " f = t.split('/')[1].replace('.npy', '.wav')\n", 64 | " test.append('data/'+f)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 16, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "name": "stderr", 74 | "output_type": "stream", 75 | "text": [ 76 | "100%|██████████| 16341/16341 [15:07<00:00, 18.01it/s]\n", 77 | "100%|██████████| 560/560 [00:30<00:00, 18.51it/s]\n" 78 | ] 79 | } 80 | ], 81 | "source": [ 82 | "from tqdm import tqdm\n", 83 | "\n", 84 | "X = []\n", 85 | "for i in tqdm(range(len(x))):\n", 86 | " y, sr = librosa.load(x[i], sr = 16000)\n", 87 | " X.append(y)\n", 88 | " \n", 89 | "Y = []\n", 90 | "for i in tqdm(range(len(test))):\n", 91 | " y, sr = librosa.load(test[i], sr = 16000)\n", 92 | " Y.append(y)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 20, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "import pickle\n", 102 | "\n", 103 | "with open('train-wav.pkl', 'wb') as fopen:\n", 104 | " pickle.dump({'X': X, 'x': x}, fopen)\n", 105 | " \n", 106 | "with open('test-wav.pkl', 'wb') as fopen:\n", 107 | " pickle.dump({'Y': Y, 'y': test}, fopen)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | 
"metadata": {}, 114 | "outputs": [], 115 | "source": [] 116 | } 117 | ], 118 | "metadata": { 119 | "kernelspec": { 120 | "display_name": "Python 3", 121 | "language": "python", 122 | "name": "python3" 123 | }, 124 | "language_info": { 125 | "codemirror_mode": { 126 | "name": "ipython", 127 | "version": 3 128 | }, 129 | "file_extension": ".py", 130 | "mimetype": "text/x-python", 131 | "name": "python", 132 | "nbconvert_exporter": "python", 133 | "pygments_lexer": "ipython3", 134 | "version": "3.6.8" 135 | } 136 | }, 137 | "nbformat": 4, 138 | "nbformat_minor": 2 139 | } 140 | -------------------------------------------------------------------------------- /stemming/README.md: -------------------------------------------------------------------------------- 1 | ## How-to 2 | 3 | 1. Run any notebook using Jupyter Notebook. 4 | -------------------------------------------------------------------------------- /stemming/dnc.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """DNC Cores. 16 | 17 | These modules create a DNC core. They take input, pass parameters to the memory 18 | access module, and integrate the output of memory to form an output. 
19 | """ 20 | 21 | from __future__ import absolute_import 22 | from __future__ import division 23 | from __future__ import print_function 24 | 25 | import collections 26 | import numpy as np 27 | import sonnet as snt 28 | import tensorflow as tf 29 | 30 | import access 31 | 32 | DNCState = collections.namedtuple('DNCState', ('access_output', 'access_state', 33 | 'controller_state')) 34 | 35 | 36 | class DNC(snt.RNNCore): 37 | """DNC core module. 38 | 39 | Contains controller and memory access module. 40 | """ 41 | 42 | def __init__(self, 43 | access_config, 44 | controller_config, 45 | output_size, 46 | clip_value=None, 47 | name='dnc'): 48 | """Initializes the DNC core. 49 | 50 | Args: 51 | access_config: dictionary of access module configurations. 52 | controller_config: dictionary of controller (LSTM) module configurations. 53 | output_size: output dimension size of core. 54 | clip_value: clips controller and core output values to between 55 | `[-clip_value, clip_value]` if specified. 56 | name: module name (default 'dnc'). 57 | 58 | Raises: 59 | TypeError: if direct_input_size is not None for any access module other 60 | than KeyValueMemory. 
61 | """ 62 | super(DNC, self).__init__(name=name) 63 | 64 | with self._enter_variable_scope(): 65 | self._controller = snt.LSTM(**controller_config) 66 | self._access = access.MemoryAccess(**access_config) 67 | 68 | self._access_output_size = np.prod(self._access.output_size.as_list()) 69 | self._output_size = output_size 70 | self._clip_value = clip_value or 0 71 | 72 | self._output_size = tf.TensorShape([output_size]) 73 | self._state_size = DNCState( 74 | access_output=self._access_output_size, 75 | access_state=self._access.state_size, 76 | controller_state=self._controller.state_size) 77 | 78 | def _clip_if_enabled(self, x): 79 | if self._clip_value > 0: 80 | return tf.clip_by_value(x, -self._clip_value, self._clip_value) 81 | else: 82 | return x 83 | 84 | def _build(self, inputs, prev_state): 85 | """Connects the DNC core into the graph. 86 | 87 | Args: 88 | inputs: Tensor input. 89 | prev_state: A `DNCState` tuple containing the fields `access_output`, 90 | `access_state` and `controller_state`. `access_state` is a 3-D Tensor 91 | of shape `[batch_size, num_reads, word_size]` containing read words. 92 | `access_state` is a tuple of the access module's state, and 93 | `controller_state` is a tuple of controller module's state. 94 | 95 | Returns: 96 | A tuple `(output, next_state)` where `output` is a tensor and `next_state` 97 | is a `DNCState` tuple containing the fields `access_output`, 98 | `access_state`, and `controller_state`. 
99 | """ 100 | 101 | prev_access_output = prev_state.access_output 102 | prev_access_state = prev_state.access_state 103 | prev_controller_state = prev_state.controller_state 104 | 105 | batch_flatten = snt.BatchFlatten() 106 | controller_input = tf.concat( 107 | [batch_flatten(inputs), batch_flatten(prev_access_output)], 1) 108 | 109 | controller_output, controller_state = self._controller( 110 | controller_input, prev_controller_state) 111 | 112 | controller_output = self._clip_if_enabled(controller_output) 113 | controller_state = snt.nest.map(self._clip_if_enabled, controller_state) 114 | 115 | access_output, access_state = self._access(controller_output, 116 | prev_access_state) 117 | 118 | output = tf.concat([controller_output, batch_flatten(access_output)], 1) 119 | output = snt.Linear( 120 | output_size=self._output_size.as_list()[0], 121 | name='output_linear')(output) 122 | output = self._clip_if_enabled(output) 123 | 124 | return output, DNCState( 125 | access_output=access_output, 126 | access_state=access_state, 127 | controller_state=controller_state) 128 | 129 | def initial_state(self, batch_size, dtype=tf.float32): 130 | return DNCState( 131 | controller_state=self._controller.initial_state(batch_size, dtype), 132 | access_state=self._access.initial_state(batch_size, dtype), 133 | access_output=tf.zeros( 134 | [batch_size] + self._access.output_size.as_list(), dtype)) 135 | 136 | @property 137 | def state_size(self): 138 | return self._state_size 139 | 140 | @property 141 | def output_size(self): 142 | return self._output_size 143 | -------------------------------------------------------------------------------- /stemming/util.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
def batch_invert_permutation(permutations):
    """Returns batched `tf.invert_permutation` for every row in `permutations`."""
    with tf.name_scope('batch_invert_permutation', values=[permutations]):
        rows = tf.unstack(permutations)
        return tf.stack([tf.invert_permutation(row) for row in rows])


def batch_gather(values, indices):
    """Returns batched `tf.gather` for every row in the input."""
    with tf.name_scope('batch_gather', values=[values, indices]):
        gathered = [
            tf.gather(value, index)
            for value, index in zip(tf.unstack(values), tf.unstack(indices))
        ]
        return tf.stack(gathered)


def one_hot(length, index):
    """Return an nd array of given `length` filled with 0s and a 1 at `index`."""
    vec = np.zeros(length)
    vec[index] = 1
    return vec
"source": [ 9 | "import numpy as np" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "f = open('glove.6B.200d.txt','r')\n", 19 | "dictionary = {}\n", 20 | "vectors = []\n", 21 | "for no, line in enumerate(f):\n", 22 | " splitLine = line.split()\n", 23 | " word = splitLine[0]\n", 24 | " dictionary[word] = no\n", 25 | " embedding = np.array([float(val) for val in splitLine[1:]])\n", 26 | " vectors.append(embedding)" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 5, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "vectors = np.array(vectors)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 24, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "rev_dictionary = {v:k for k, v in dictionary.items()}" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 7, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "import tensorflow as tf" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 10, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "class Model:\n", 63 | " def __init__(self):\n", 64 | " self._embedding = tf.convert_to_tensor(vectors, dtype = tf.float32)\n", 65 | " self.X = tf.placeholder(\n", 66 | " tf.float32, [None, vectors.shape[1]]\n", 67 | " )\n", 68 | " normed_embedding = tf.nn.l2_normalize(self._embedding, axis = 1)\n", 69 | " normed_array = tf.nn.l2_normalize(self.X, axis = 1)\n", 70 | " self.cosine_similarity = tf.matmul(\n", 71 | " normed_array, tf.transpose(normed_embedding, [1, 0])\n", 72 | " )" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 11, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "model = Model()\n", 82 | "sess = tf.InteractiveSession()" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 14, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | 
"string_positive = 'i really love to eat chicken and meat'\n", 92 | "string_negative = 'i really hate you and i do not want to see you again'" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 28, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "import random" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 52, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "def augmentation(string, threshold = 0.5, count = 5, k = 8):\n", 111 | " string = string.split()\n", 112 | " selected = []\n", 113 | " while not len(selected):\n", 114 | " selected = [(no, w) for no, w in enumerate(string) if random.random() > threshold]\n", 115 | " indices, words = [i[0] for i in selected], [i[1] for i in selected]\n", 116 | " \n", 117 | " batches = vectors[[dictionary[w] for w in words]]\n", 118 | " top_k = tf.nn.top_k(model.cosine_similarity, k = k)\n", 119 | " results = sess.run(top_k, feed_dict = {model.X: batches})\n", 120 | " words = []\n", 121 | " for result in results.indices:\n", 122 | " words.append([rev_dictionary[i] for i in result])\n", 123 | " augmented = []\n", 124 | " for i in range(count):\n", 125 | " string_ = string[:]\n", 126 | " for no in range(len(words)):\n", 127 | " index = random.randint(0, len(words[no]) - 1)\n", 128 | " string_[indices[no]] = words[no][index]\n", 129 | " augmented.append(' '.join(string_))\n", 130 | " return augmented " 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 53, 136 | "metadata": {}, 137 | "outputs": [ 138 | { 139 | "name": "stdout", 140 | "output_type": "stream", 141 | "text": [ 142 | "CPU times: user 4.07 s, sys: 1.59 s, total: 5.66 s\n", 143 | "Wall time: 5.68 s\n" 144 | ] 145 | }, 146 | { 147 | "data": { 148 | "text/plain": [ 149 | "['i thing love to eat pork also chicken',\n", 150 | " 'i really love to eat fried well cooked',\n", 151 | " 'i things love to eat meat with beef',\n", 152 | " 'i thing love to eat roasted well 
chicken',\n", 153 | " 'i something love to eat cooked , beef']" 154 | ] 155 | }, 156 | "execution_count": 53, 157 | "metadata": {}, 158 | "output_type": "execute_result" 159 | } 160 | ], 161 | "source": [ 162 | "%%time\n", 163 | "augmentation(string_positive)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 54, 169 | "metadata": {}, 170 | "outputs": [ 171 | { 172 | "name": "stdout", 173 | "output_type": "stream", 174 | "text": [ 175 | "CPU times: user 3.96 s, sys: 1.78 s, total: 5.73 s\n", 176 | "Wall time: 5.75 s\n" 177 | ] 178 | }, 179 | { 180 | "data": { 181 | "text/plain": [ 182 | "['i really hate you also i know not want make see i again',\n", 183 | " \"i really hate you while i n't not want help see 'll again\",\n", 184 | " \"i really hate you . i not not want take see 'll again\",\n", 185 | " 'i really hate you well i know not want able see you again',\n", 186 | " \"i really hate you , i does not want could see 'll again\"]" 187 | ] 188 | }, 189 | "execution_count": 54, 190 | "metadata": {}, 191 | "output_type": "execute_result" 192 | } 193 | ], 194 | "source": [ 195 | "%%time\n", 196 | "augmentation(string_negative)" 197 | ] 198 | } 199 | ], 200 | "metadata": { 201 | "kernelspec": { 202 | "display_name": "Python 3", 203 | "language": "python", 204 | "name": "python3" 205 | }, 206 | "language_info": { 207 | "codemirror_mode": { 208 | "name": "ipython", 209 | "version": 3 210 | }, 211 | "file_extension": ".py", 212 | "mimetype": "text/x-python", 213 | "name": "python", 214 | "nbconvert_exporter": "python", 215 | "pygments_lexer": "ipython3", 216 | "version": "3.6.8" 217 | } 218 | }, 219 | "nbformat": 4, 220 | "nbformat_minor": 2 221 | } 222 | -------------------------------------------------------------------------------- /text-augmentation/6.vae-varitional-bahdanau/basic_decoder.py: -------------------------------------------------------------------------------- 1 | import collections 2 | 3 | import tensorflow as tf 4 | 5 | 
import decoder
from tensorflow.contrib.seq2seq.python.ops import helper as helper_py
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import tensor_shape
from tensorflow.python.layers import base as layers_base
from tensorflow.python.ops import rnn_cell_impl
from tensorflow.python.util import nest

__all__ = [
    "BasicDecoderOutput",
    "BasicDecoder",
]


class BasicDecoderOutput(collections.namedtuple("BasicDecoderOutput", ("rnn_output", "sample_id"))):
    """Per-step decoder output: the raw RNN output and the sampled token ids."""
    pass


class BasicDecoder(decoder.Decoder):
    """Basic sampling decoder that concatenates a latent vector to every input.

    Differences from the stock seq2seq ``BasicDecoder``:
      * a fixed latent vector is concatenated to the decoder input at every
        timestep (both at initialization and after each step), and
      * the wrapped cell is expected to return a third value — a per-batch
        context KL loss — which is accumulated across timesteps and returned
        from ``step``.
    """

    def __init__(self, cell, helper, initial_state, latent_vector, output_layer=None):
        """Initialize BasicDecoder.

        Args:
          cell: An `RNNCell` instance whose `__call__` returns
            `(outputs, state, context_kl_loss)`.
          helper: A `Helper` instance.
          initial_state: A (possibly nested tuple of...) tensors and TensorArrays.
            The initial state of the RNNCell.
          latent_vector: A `[batch_size, latent_dim]` tensor appended to the
            decoder input at every step.
          output_layer: (Optional) An instance of `tf.layers.Layer`, i.e.,
            `tf.layers.Dense`. Optional layer to apply to the RNN output prior
            to storing the result or sampling.

        Raises:
          TypeError: if `helper` or `output_layer` have an incorrect type.
            (Note: `cell` is deliberately not type-checked because the custom
            cell here is not a standard `RNNCell`.)
        """
        if not isinstance(helper, helper_py.Helper):
            raise TypeError("helper must be a Helper, received: %s" % type(helper))
        if (output_layer is not None and not isinstance(output_layer, layers_base.Layer)):
            raise TypeError("output_layer must be a Layer, received: %s" % type(output_layer))
        self._cell = cell
        self._helper = helper
        self._initial_state = initial_state
        self._output_layer = output_layer
        self._latent_vector = latent_vector
        # Running sum, shape (batch_size,), of the context KL losses emitted
        # by the cell at each decoding timestep.
        self._intermediate_context_kl_loss = tf.zeros(shape=(helper.batch_size,))

    @property
    def batch_size(self):
        return self._helper.batch_size

    def _rnn_output_size(self):
        """Output size of the cell after `output_layer` (if any) is applied."""
        size = self._cell.output_size
        if self._output_layer is None:
            return size
        # To use the layer's compute_output_shape, convert the RNNCell's
        # output_size entries into shapes with an unknown batch size, pass
        # them through compute_output_shape, and strip the batch dimension
        # again to get the per-timestep output size.
        output_shape_with_unknown_batch = nest.map_structure(
            lambda s: tensor_shape.TensorShape([None]).concatenate(s),
            size)
        layer_output_shape = self._output_layer.compute_output_shape(
            output_shape_with_unknown_batch)
        return nest.map_structure(lambda s: s[1:], layer_output_shape)

    @property
    def output_size(self):
        # Return the cell output and the id.
        return BasicDecoderOutput(
            rnn_output=self._rnn_output_size(),
            sample_id=tensor_shape.TensorShape([]))

    @property
    def output_dtype(self):
        # Assume the dtype of the cell is the output_size structure
        # containing the input_state's first component's dtype.
        # Return that structure and int32 (the id).
        dtype = nest.flatten(self._initial_state)[0].dtype
        return BasicDecoderOutput(
            nest.map_structure(lambda _: dtype, self._rnn_output_size()),
            dtypes.int32)

    def initialize(self, name=None):
        """Initialize the decoder.

        Args:
          name: Name scope for any created operations.
        Returns:
          `(finished, first_inputs, initial_state)`.
        """
        # BUG FIX: the original called `self._helper.initialize()` twice —
        # once for `finished` and once for the first inputs. Helpers create
        # graph ops (and may be stateful), so the helper must be invoked
        # exactly once per initialization.
        finished, first_inputs = self._helper.initialize()
        # Concatenate the latent vector to the first decoder input,
        # i.e. the embedding + latent vector.
        first_inputs = tf.concat([first_inputs, self._latent_vector], axis=-1)
        return (finished, first_inputs, self._initial_state)

    def step(self, time, inputs, state, name=None):
        """Perform a decoding step.

        Args:
          time: scalar `int32` tensor.
          inputs: A (structure of) input tensors.
          state: A (structure of) state tensors and TensorArrays.
          name: Name scope for any created operations.
        Returns:
          `(outputs, next_state, next_inputs, finished, c_kl_loss)` where
          `c_kl_loss` is the context KL loss accumulated up to this step.
        """
        with ops.name_scope(name, "BasicDecoderStep", (time, inputs, state)):
            cell_outputs, cell_state, c_kl_loss = self._cell(inputs, state)
            # Accumulate the context KL loss from the token at the current
            # decoder step; the running total is what gets returned.
            self._intermediate_context_kl_loss += c_kl_loss
            c_kl_loss = self._intermediate_context_kl_loss

            if self._output_layer is not None:
                cell_outputs = self._output_layer(cell_outputs)
            sample_ids = self._helper.sample(
                time=time, outputs=cell_outputs, state=cell_state)
            (finished, next_inputs, next_state) = self._helper.next_inputs(
                time=time,
                outputs=cell_outputs,
                state=cell_state,
                sample_ids=sample_ids)

            # Concatenate the latent vector to the predicted word's embedding.
            next_inputs = tf.concat([next_inputs, self._latent_vector], axis=-1)

            outputs = BasicDecoderOutput(cell_outputs, sample_ids)
            return (outputs, next_state, next_inputs, finished, c_kl_loss)
#!/usr/bin/env python
# coding: utf-8

# Deep Pyramid CNN (DPCNN) text classifier: a region embedding followed by
# stacked conv/batch-norm residual blocks with stride-2 pooling, trained
# with early stopping on a held-out 20% split.

from utils import *
import tensorflow as tf
from sklearn.cross_validation import train_test_split
import time
import random
import os


# Load the dataset from the local `data` folder.
trainset = sklearn.datasets.load_files(container_path='data', encoding='UTF-8')
trainset.data, trainset.target = separate_dataset(trainset, 1.0)
print(trainset.target_names)
print(len(trainset.data))
print(len(trainset.target))


# Build vocabulary mappings over the whole corpus.
concat = ' '.join(trainset.data).split()
vocabulary_size = len(list(set(concat)))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocab from size: %d' % (vocabulary_size))
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])


GO = dictionary['GO']
PAD = dictionary['PAD']
EOS = dictionary['EOS']
UNK = dictionary['UNK']


embedding_size = 128
dimension_output = len(trainset.target_names)
maxlen = 50
batch_size = 32
kernel_size = 3
num_filters = 150


class Model:
    """DPCNN graph: region embedding -> two conv/BN/relu layers with a
    residual connection -> repeated downsampling blocks until the sequence
    dimension collapses -> 1x1 conv + sum over time as logits."""

    def __init__(self,
                 maxlen,
                 dimension_output,
                 vocab_size,
                 embedding_size,
                 kernel_size,
                 num_filters,
                 learning_rate):
        self.X = tf.placeholder(tf.int32, [None, maxlen])
        self.Y = tf.placeholder(tf.int32, [None])
        embeddings = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1, 1))
        embedded = tf.nn.embedding_lookup(embeddings, self.X)

        # Region embedding.
        first_region = tf.layers.conv1d(
            embedded,
            num_filters,
            kernel_size=kernel_size,
            strides=1,
            padding='valid',
        )
        forward = tf.nn.relu(first_region)
        forward = tf.layers.conv1d(
            forward,
            num_filters,
            kernel_size=kernel_size,
            strides=1,
            padding='same',
        )
        forward = tf.layers.batch_normalization(forward)
        # BUG FIX: the original applied `tf.nn.relu(first_region)` here and
        # after the next batch-norm, which discarded both convolutions and
        # made the residual block compute relu(first_region) + first_region.
        # The pre-activation must flow through `forward`.
        forward = tf.nn.relu(forward)
        forward = tf.layers.conv1d(
            forward,
            num_filters,
            kernel_size=kernel_size,
            strides=1,
            padding='same',
        )
        forward = tf.layers.batch_normalization(forward)
        forward = tf.nn.relu(forward)
        # Residual connection with the region embedding.
        forward = forward + first_region

        def _block(x):
            # One pyramid level: pad by one step, stride-2 max-pool (halves
            # the sequence length), then two conv/BN layers with a residual
            # connection back to the pooled input.
            x = tf.pad(x, paddings=[[0, 0], [0, 1], [0, 0]])
            px = tf.layers.max_pooling1d(x, 3, 2)
            x = tf.nn.relu(px)
            x = tf.layers.conv1d(
                x,
                num_filters,
                kernel_size=kernel_size,
                strides=1,
                padding='same',
            )
            x = tf.layers.batch_normalization(x)
            x = tf.nn.relu(x)
            x = tf.layers.conv1d(
                x,
                num_filters,
                kernel_size=kernel_size,
                strides=1,
                padding='same',
            )
            x = tf.layers.batch_normalization(x)
            x = x + px
            return x

        # Keep halving until the time dimension cannot be pooled further.
        while forward.get_shape().as_list()[1] >= 2:
            forward = _block(forward)

        self.logits = tf.reduce_sum(tf.layers.conv1d(
            forward, dimension_output, kernel_size=1, strides=1, padding='SAME'
        ), 1)
        self.cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=self.logits,
            labels=self.Y))
        self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.cost)
        correct_pred = tf.equal(tf.argmax(self.logits, 1, output_type=tf.int32), self.Y)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))


tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(maxlen, dimension_output, len(dictionary), embedding_size,
              kernel_size, num_filters, 1e-3)
sess.run(tf.global_variables_initializer())


vectors = str_idx(trainset.data, dictionary, maxlen)
train_X, test_X, train_Y, test_Y = train_test_split(vectors, trainset.target, test_size=0.2)


from tqdm import tqdm
import time

EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 3, 0, 0, 0

while True:
    lasttime = time.time()
    # Stop once validation accuracy has not improved for EARLY_STOPPING epochs.
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n' % (EPOCH))
        break

    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc='train minibatch loop'
    )
    for i in pbar:
        batch_x = train_X[i: min(i + batch_size, train_X.shape[0])]
        batch_y = train_Y[i: min(i + batch_size, train_X.shape[0])]
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict={
                model.Y: batch_y,
                model.X: batch_x
            },
        )
        assert not np.isnan(cost)
        train_loss += cost
        train_acc += acc
        pbar.set_postfix(cost=cost, accuracy=acc)

    pbar = tqdm(range(0, len(test_X), batch_size), desc='test minibatch loop')
    for i in pbar:
        batch_x = test_X[i: min(i + batch_size, test_X.shape[0])]
        batch_y = test_Y[i: min(i + batch_size, test_X.shape[0])]
        acc, cost = sess.run(
            [model.accuracy, model.cost],
            feed_dict={
                model.Y: batch_y,
                model.X: batch_x
            },
        )
        test_loss += cost
        test_acc += acc
        pbar.set_postfix(cost=cost, accuracy=acc)

    # Average the per-batch metrics over the number of batches.
    train_loss /= len(train_X) / batch_size
    train_acc /= len(train_X) / batch_size
    test_loss /= len(test_X) / batch_size
    test_acc /= len(test_X) / batch_size

    if test_acc > CURRENT_ACC:
        print(
            'epoch: %d, pass acc: %f, current acc: %f'
            % (EPOCH, CURRENT_ACC, test_acc)
        )
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1

    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (EPOCH, train_loss, train_acc, test_loss, test_acc)
    )
    EPOCH += 1


real_Y, predict_Y = [], []

pbar = tqdm(
    range(0, len(test_X), batch_size), desc='validation minibatch loop'
)
for i in pbar:
    batch_x = test_X[i: min(i + batch_size, test_X.shape[0])]
    batch_y = test_Y[i: min(i + batch_size, test_X.shape[0])]
    predict_Y += np.argmax(
        sess.run(
            model.logits, feed_dict={model.X: batch_x, model.Y: batch_y}
        ),
        1,
    ).tolist()
    real_Y += batch_y


print(metrics.classification_report(real_Y, predict_Y, target_names=trainset.target_names))
basic-rnn-bidirectional-huber | 0.63 | 3.5095 | 19 | | 7. lstm-rnn | 0.73 | 2.69683 | 20 | | 8. lstm-rnn-hinge | 0.72 | 8.2088 | 21 | | 9. lstm-rnn-huber | 0.73 | 10.1754 | 22 | | 10. lstm-rnn-bidirectional | 0.71 | 11.0388 | 23 | | 11. lstm-rnn-bidirectional-huber | 0.71 | 5.5258 | 24 | | 12. lstm-rnn-dropout-l2 | 0.74 | 3.2420 | 25 | | 13. gru-rnn | 0.72 | 3.16123 | 26 | | 14. gru-rnn-hinge | 0.72 | 6.71951 | 27 | | 15. gru-rnn-huber | 0.70 | 7.93373 | 28 | | 16. gru-rnn-bidirectional | 0.73 | 2.91590 | 29 | | 17. gru-rnn-bidirectional-hinge | 0.72 | 5.66385 | 30 | | 18. gru-rnn-bidirectional-huber | 0.70 | 18.01133 | 31 | | 19. lstm-cnn-rnn | 0.65 | 4.42849 | 32 | | 20. kmax-cnn | 0.73 | 18.89667 | 33 | | 21. lstm-cnn-rnn-highway | 0.68 | 3.23122 | 34 | | 22. lstm-rnn-attention | 0.75 | 13.97496 | 35 | | 23. dilated-rnn-lstm | 0.25 | 24.54002 | 36 | | 24. lnlstm-rnn | 0.68 | 24.86363 | 37 | | 25. only-attention | 0.74 | 2.63291 | 38 | | 26. multihead-attention | 0.69 | 9.033228 | 39 | | 27. neural-turing-machine | | | 40 | | 28. lstm-seq2seq | 0.72 | 9.63291 | 41 | | 29. lstm-seq2seq-luong | | | 42 | | 30. lstm-seq2seq-bahdanau | | | 43 | | 31. lstm-seq2seq-beam | | | 44 | | 32. lstm-seq2seq-birnn | | | 45 | | 33. pointer-net | | | 46 | | 34. lstm-rnn-bahdanau | 0.71 | 9.81993 | 47 | | 35. lstm-rnn-luong | 0.66 | 27.73932 | 48 | | 36. lstm-rnn-bahdanau-luong | 0.69 | 36.97628 | 49 | | 37. lstm-birnn-bahdanau-luong | 0.70 | 38.86009 | 50 | | 38. bytenet | | | 51 | | 39. fast-slow-lstm | | | 52 | | 40. siamese-network | 0.52 | 7.13535 | 53 | | 41. estimator | | | 54 | | 42. capsule-rnn-lstm | | | 55 | | 43. capsule-seq2seq-lstm | | | 56 | | 44. capsule-birrn-seq2seq-lstm | | | 57 | | 45. nested-lstm | | | 58 | | 46. lstm-seq2seq-highway | | | 59 | | 47. triplet-loss-lstm | 0.50 | | 60 | | 48. dnc | 0.68 | 85.98529 | 61 | | 49. convlstm | 0.69 | 2.66726 | 62 | | 50. temporalconvd | 0.66 | 11.90590 | 63 | | 51. batch-all-triplet-loss-lstm | 0.70 | | 64 | | 52. 
fast-text | 0.76 | 0.49499 | 65 | | 53. gated-convolution-network | 0.67 | 3.37712 | 66 | | 54. simple-recurrent-units | 0.65 | 3.12624 | 67 | | 55. lstm-han | 0.50 | 3.47965 | 68 | | 56. bert | 0.73 | 6.31015 | 69 | | 57. dynamic-memory-network | 0.71 | 3.25820 | 70 | | 58. entity-network | 0.74 | 1.10458 | 71 | | 59. memory-network | 0.58 | 1.157306 | 72 | | 60. char-sparse | 0.76 | 2.350096 | 73 | | 61. residual-network | 0.72 | 9.557085 | 74 | | 62. residual-network-bahdanau | 0.71 | 11.53799 | 75 | | 63. deep-pyramid-cnn | 0.68 | 6.980528 | 76 | | 64. transformer-xl | 0.51 | 38.66338 | 77 | | 65. transfer-learning-gpt2 | 0.79 | 178.0716 | 78 | | 66. quasi-rnn | 0.66 | 166.1456 | 79 | | 67. tacotron | 0.74 | 360.5551 | 80 | | 68. slice-gru | 0.72 | 10.140633 | 81 | | 69. slice-gru-bahdanau | 0.70 | 20.247409 | 82 | | 70. wavenet | 0.59 | 101.293274 | 83 | | 71. transfer-learning-bert | 0.81 | 887.590460 | 84 | | 72. transfer-learning-xlnet-large | 0.846 | 340.7679 | 85 | | 73. lstm-birnn-max-avg | 0.7552 | 9.35624 | 86 | | 74. transfer-learning-bert-base-6 | 0.7655 | 494.169 | 87 | | 75. transfer-learning-bert-large-12 | 0.80 | 1365.30 | 88 | | 76. transfer-learning-xlnet-base | 0.820441 | 240.262 | 89 | | 77. transfer-learning-albert-base | 0.799053 | 61.8179 | 90 | | 78. transfer-learning-electra-base | 0.836336 | 66.0257 | 91 | | 79. 
transfer-learning-electra-large | 0.875248 | 195.37280 | 92 | -------------------------------------------------------------------------------- /text-classification/data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mesolitica/NLP-Models-Tensorflow/0741216aa8235e1228b3de7903cc36d73f8f2b45/text-classification/data.zip -------------------------------------------------------------------------------- /text-classification/gpt_2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | 5 | def shape_list(x): 6 | """Deal with dynamic shape in tensorflow cleanly.""" 7 | static = x.shape.as_list() 8 | dynamic = tf.shape(x) 9 | return [dynamic[i] if s is None else s for i, s in enumerate(static)] 10 | 11 | 12 | def softmax(x, axis = -1): 13 | x = x - tf.reduce_max(x, axis = axis, keepdims = True) 14 | ex = tf.exp(x) 15 | return ex / tf.reduce_sum(ex, axis = axis, keepdims = True) 16 | 17 | 18 | def gelu(x): 19 | return ( 20 | 0.5 21 | * x 22 | * (1 + tf.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))) 23 | ) 24 | 25 | 26 | def norm(x, scope, *, axis = -1, epsilon = 1e-5): 27 | """Normalize to mean = 0, std = 1, then do a diagonal affine transform.""" 28 | with tf.variable_scope(scope): 29 | n_state = x.shape[-1].value 30 | g = tf.get_variable( 31 | 'g', [n_state], initializer = tf.constant_initializer(1) 32 | ) 33 | b = tf.get_variable( 34 | 'b', [n_state], initializer = tf.constant_initializer(0) 35 | ) 36 | u = tf.reduce_mean(x, axis = axis, keepdims = True) 37 | s = tf.reduce_mean(tf.square(x - u), axis = axis, keepdims = True) 38 | x = (x - u) * tf.rsqrt(s + epsilon) 39 | x = x * g + b 40 | return x 41 | 42 | 43 | def split_states(x, n): 44 | """Reshape the last dimension of x into [n, x.shape[-1]/n].""" 45 | *start, m = shape_list(x) 46 | return tf.reshape(x, start + [n, m // n]) 47 | 48 | 49 | def 
def merge_states(x):
    """Collapse the last two dimensions of x into a single dimension."""
    *lead, a, b = shape_list(x)
    return tf.reshape(x, lead + [a * b])


def conv1d(x, scope, nf, *, w_init_stdev = 0.02):
    """Pointwise (kernel-size-1) convolution, implemented as one matmul
    over the flattened leading dimensions."""
    with tf.variable_scope(scope):
        *lead, nx = shape_list(x)
        w = tf.get_variable(
            'w',
            [1, nx, nf],
            initializer = tf.random_normal_initializer(stddev = w_init_stdev),
        )
        b = tf.get_variable('b', [nf], initializer = tf.constant_initializer(0))
        flat = tf.matmul(tf.reshape(x, [-1, nx]), tf.reshape(w, [-1, nf])) + b
        return tf.reshape(flat, lead + [nf])


def attention_mask(nd, ns, *, dtype):
    """1's in the lower triangle, counting from the lower right corner.

    Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs.
    """
    rows = tf.range(nd)[:, None]
    cols = tf.range(ns)
    keep = rows >= cols - ns + nd
    return tf.cast(keep, dtype)


def attn(x, scope, n_state, *, past, hparams):
    """Masked multi-head self-attention with optional cached (k, v) `past`."""
    assert x.shape.ndims == 3  # Should be [batch, sequence, features]
    assert n_state % hparams.n_head == 0
    if past is not None:
        # Should be [batch, 2, heads, sequence, features], where 2 is [k, v]
        assert past.shape.ndims == 5

    def split_heads(t):
        # [batch, sequence, features] -> [batch, heads, sequence, features]
        return tf.transpose(split_states(t, hparams.n_head), [0, 2, 1, 3])

    def merge_heads(t):
        # Reverse of split_heads.
        return merge_states(tf.transpose(t, [0, 2, 1, 3]))

    def mask_attn_weights(w):
        # w: [batch, heads, dst_sequence, src_sequence]; info flows src -> dst.
        _, _, nd, ns = shape_list(w)
        causal = tf.reshape(attention_mask(nd, ns, dtype = w.dtype), [1, 1, nd, ns])
        return w * causal - tf.cast(1e10, w.dtype) * (1 - causal)

    def multihead_attn(q, k, v):
        # q, k, v: [batch, heads, sequence, features]
        scores = tf.matmul(q, k, transpose_b = True)
        scores = scores * tf.rsqrt(tf.cast(v.shape[-1].value, scores.dtype))
        weights = softmax(mask_attn_weights(scores))
        return tf.matmul(weights, v)

    with tf.variable_scope(scope):
        qkv = conv1d(x, 'c_attn', n_state * 3)
        q, k, v = map(split_heads, tf.split(qkv, 3, axis = 2))
        present = tf.stack([k, v], axis = 1)
        if past is not None:
            pk, pv = tf.unstack(past, axis = 1)
            k = tf.concat([pk, k], axis = -2)
            v = tf.concat([pv, v], axis = -2)
        a = conv1d(merge_heads(multihead_attn(q, k, v)), 'c_proj', n_state)
        return a, present


def mlp(x, scope, n_state, *, hparams):
    """Position-wise feed-forward: conv1d -> gelu -> conv1d back to nx."""
    with tf.variable_scope(scope):
        nx = x.shape[-1].value
        return conv1d(gelu(conv1d(x, 'c_fc', n_state)), 'c_proj', nx)


def block(x, scope, *, past, hparams):
    """One transformer block: pre-norm attention and MLP, each residual."""
    with tf.variable_scope(scope):
        nx = x.shape[-1].value
        a, present = attn(
            norm(x, 'ln_1'), 'attn', nx, past = past, hparams = hparams
        )
        x = x + a
        x = x + mlp(norm(x, 'ln_2'), 'mlp', nx * 4, hparams = hparams)
        return x, present


def past_shape(*, hparams, batch_size = None, sequence = None):
    """Static shape of the cached attention state."""
    return [
        batch_size,
        hparams.n_layer,
        2,
        hparams.n_head,
        sequence,
        hparams.n_embd // hparams.n_head,
    ]


def expand_tile(value, size):
    """Add a new leading axis of given size."""
    value = tf.convert_to_tensor(value, name = 'value')
    ndims = value.shape.ndims
    return tf.tile(tf.expand_dims(value, axis = 0), [size] + [1] * ndims)
* ndims) 166 | 167 | 168 | def positions_for(tokens, past_length): 169 | batch_size = tf.shape(tokens)[0] 170 | nsteps = tf.shape(tokens)[1] 171 | return expand_tile(past_length + tf.range(nsteps), batch_size) 172 | 173 | 174 | def model(hparams, X, past = None, scope = 'model', reuse = False): 175 | with tf.variable_scope(scope, reuse = reuse): 176 | results = {} 177 | batch, sequence = shape_list(X) 178 | 179 | wpe = tf.get_variable( 180 | 'wpe', 181 | [hparams.n_ctx, hparams.n_embd], 182 | initializer = tf.random_normal_initializer(stddev = 0.01), 183 | ) 184 | wte = tf.get_variable( 185 | 'wte', 186 | [hparams.n_vocab, hparams.n_embd], 187 | initializer = tf.random_normal_initializer(stddev = 0.02), 188 | ) 189 | past_length = 0 if past is None else tf.shape(past)[-2] 190 | h = tf.gather(wte, X) + tf.gather(wpe, positions_for(X, past_length)) 191 | 192 | # Transformer 193 | presents = [] 194 | pasts = ( 195 | tf.unstack(past, axis = 1) 196 | if past is not None 197 | else [None] * hparams.n_layer 198 | ) 199 | assert len(pasts) == hparams.n_layer 200 | for layer, past in enumerate(pasts): 201 | h, present = block(h, 'h%d' % layer, past = past, hparams = hparams) 202 | presents.append(present) 203 | results['present'] = tf.stack(presents, axis = 1) 204 | h = norm(h, 'ln_f') 205 | 206 | # Language model loss. 
Do tokens {l['sentence2']}\"\n", 112 | " train_X.append(s)\n", 113 | " train_Y.append(l['gold_label'])\n", 114 | " except:\n", 115 | " pass" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 8, 121 | "metadata": {}, 122 | "outputs": [ 123 | { 124 | "name": "stderr", 125 | "output_type": "stream", 126 | "text": [ 127 | "100%|██████████| 20002/20002 [00:00<00:00, 93673.10it/s]\n" 128 | ] 129 | } 130 | ], 131 | "source": [ 132 | "test_X, test_Y = [], []\n", 133 | "\n", 134 | "for i in tqdm(range(len(dev))):\n", 135 | " try:\n", 136 | " l = json.loads(dev[i])\n", 137 | " if l['gold_label'] not in labels:\n", 138 | " continue\n", 139 | " if len(l['sentence1']) and len(l['sentence2']):\n", 140 | " s = f\"{l['sentence1']} <> {l['sentence2']}\"\n", 141 | " test_X.append(s)\n", 142 | " test_Y.append(l['gold_label'])\n", 143 | " except:\n", 144 | " pass" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 9, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "import youtokentome as yttm\n", 154 | "\n", 155 | "with open('out.txt', 'w') as fopen:\n", 156 | " fopen.write('\\n'.join(test_X + train_X))\n", 157 | " \n", 158 | "yttm.BPE.train(data='out.txt', vocab_size=30000, model='vocab.model')\n", 159 | "bpe = yttm.BPE(model='vocab.model')" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 10, 165 | "metadata": {}, 166 | "outputs": [ 167 | { 168 | "data": { 169 | "text/plain": [ 170 | "['', '', '', '']" 171 | ] 172 | }, 173 | "execution_count": 10, 174 | "metadata": {}, 175 | "output_type": "execute_result" 176 | } 177 | ], 178 | "source": [ 179 | "bpe.vocab()[:4]" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 11, 185 | "metadata": {}, 186 | "outputs": [ 187 | { 188 | "data": { 189 | "text/plain": [ 190 | "['halo halo']" 191 | ] 192 | }, 193 | "execution_count": 11, 194 | "metadata": {}, 195 | "output_type": "execute_result" 196 | } 197 | ], 198 | 
"source": [ 199 | "bpe.decode(bpe.encode('halo') + [2] + bpe.encode('halo'))" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 15, 205 | "metadata": {}, 206 | "outputs": [ 207 | { 208 | "name": "stderr", 209 | "output_type": "stream", 210 | "text": [ 211 | "100%|██████████| 261802/261802 [00:09<00:00, 26791.84it/s]\n" 212 | ] 213 | } 214 | ], 215 | "source": [ 216 | "left_train, right_train, label_train = [], [], []\n", 217 | "\n", 218 | "for i in tqdm(range(len(train_X))):\n", 219 | " l, r = train_X[i].split(' <> ')\n", 220 | " left_train.append(bpe.encode(l))\n", 221 | " right_train.append(bpe.encode(r))\n", 222 | " label_train.append(labels.index(train_Y[i]))" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 16, 228 | "metadata": {}, 229 | "outputs": [ 230 | { 231 | "name": "stderr", 232 | "output_type": "stream", 233 | "text": [ 234 | "100%|██████████| 13395/13395 [00:00<00:00, 29595.87it/s]\n" 235 | ] 236 | } 237 | ], 238 | "source": [ 239 | "left_test, right_test, label_test = [], [], []\n", 240 | "\n", 241 | "for i in tqdm(range(len(test_X))):\n", 242 | " l, r = test_X[i].split(' <> ')\n", 243 | " try:\n", 244 | " label_test.append(labels.index(test_Y[i]))\n", 245 | " left_test.append(bpe.encode(l))\n", 246 | " right_test.append(bpe.encode(r))\n", 247 | " except:\n", 248 | " pass" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 17, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "with open('contrastive.json', 'w') as fopen:\n", 258 | " json.dump({'left_train': left_train,\n", 259 | " 'right_train': right_train,\n", 260 | " 'label_train': label_train,\n", 261 | " 'left_test': left_test,\n", 262 | " 'right_test': right_test,\n", 263 | " 'label_test': label_test}, fopen)" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 18, 269 | "metadata": {}, 270 | "outputs": [ 271 | { 272 | "name": "stderr", 273 | "output_type": "stream", 274 | 
"text": [ 275 | "100%|██████████| 261802/261802 [00:09<00:00, 26215.21it/s]\n" 276 | ] 277 | } 278 | ], 279 | "source": [ 280 | "left_train, label_train = [], []\n", 281 | "\n", 282 | "for i in tqdm(range(len(train_X))):\n", 283 | " l, r = train_X[i].split(' <> ')\n", 284 | " left_train.append(bpe.encode(l) + [2] + bpe.encode(r))\n", 285 | " label_train.append(labels.index(train_Y[i]))" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 19, 291 | "metadata": {}, 292 | "outputs": [ 293 | { 294 | "name": "stderr", 295 | "output_type": "stream", 296 | "text": [ 297 | "100%|██████████| 13395/13395 [00:00<00:00, 13604.82it/s]\n" 298 | ] 299 | } 300 | ], 301 | "source": [ 302 | "left_test, label_test = [], []\n", 303 | "\n", 304 | "for i in tqdm(range(len(test_X))):\n", 305 | " try:\n", 306 | " l, r = test_X[i].split(' <> ')\n", 307 | " label_test.append(labels.index(test_Y[i]))\n", 308 | " left_test.append(bpe.encode(l) + [2] + bpe.encode(r))\n", 309 | " except:\n", 310 | " pass" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 20, 316 | "metadata": {}, 317 | "outputs": [], 318 | "source": [ 319 | "with open('pair.json', 'w') as fopen:\n", 320 | " json.dump({'left_train': left_train,\n", 321 | " 'label_train': label_train,\n", 322 | " 'left_test': left_test,\n", 323 | " 'label_test': label_test}, fopen)" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 21, 329 | "metadata": {}, 330 | "outputs": [], 331 | "source": [ 332 | "with open('text.json', 'w') as fopen:\n", 333 | " json.dump({'train_X': train_X,\n", 334 | " 'train_Y': train_Y,\n", 335 | " 'test_X': test_X,\n", 336 | " 'test_Y': test_Y}, fopen)" 337 | ] 338 | } 339 | ], 340 | "metadata": { 341 | "kernelspec": { 342 | "display_name": "Python 3", 343 | "language": "python", 344 | "name": "python3" 345 | }, 346 | "language_info": { 347 | "codemirror_mode": { 348 | "name": "ipython", 349 | "version": 3 350 | }, 351 | "file_extension": 
".py", 352 | "mimetype": "text/x-python", 353 | "name": "python", 354 | "nbconvert_exporter": "python", 355 | "pygments_lexer": "ipython3", 356 | "version": "3.6.8" 357 | } 358 | }, 359 | "nbformat": 4, 360 | "nbformat_minor": 2 361 | } 362 | -------------------------------------------------------------------------------- /text-to-speech/1.tacotron/caching.py: -------------------------------------------------------------------------------- 1 | import tqdm 2 | import os 3 | import numpy as np 4 | from utils import load_file, path 5 | 6 | if not os.path.exists('mel'): 7 | os.mkdir('mel') 8 | if not os.path.exists('mag'): 9 | os.mkdir('mag') 10 | wav_files = [f for f in os.listdir(path) if f.endswith('.wav')] 11 | for fpath in tqdm.tqdm(wav_files): 12 | fname, mel, mag = load_file(path + fpath) 13 | np.save('mel/{}'.format(fname.replace('wav', 'npy')), mel) 14 | np.save('mag/{}'.format(fname.replace('wav', 'npy')), mag) 15 | -------------------------------------------------------------------------------- /text-to-speech/1.tacotron/test-tacotron.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mesolitica/NLP-Models-Tensorflow/0741216aa8235e1228b3de7903cc36d73f8f2b45/text-to-speech/1.tacotron/test-tacotron.wav -------------------------------------------------------------------------------- /text-to-speech/1.tacotron/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import librosa 3 | import copy 4 | from scipy import signal 5 | from scipy.io import wavfile 6 | import matplotlib.pyplot as plt 7 | import seaborn as sns 8 | import os 9 | import unicodedata 10 | import re 11 | 12 | # P: Padding 13 | # E: End of Sentence 14 | path = '../data/' 15 | vocab = "PE abcdefghijklmnopqrstuvwxyz'.?" 
16 | max_duration = 10.0 17 | sample_rate = 22050  # Hz 18 | fourier_window_size = 2048  # STFT n_fft 19 | frame_shift = 0.0125  # seconds between successive frames 20 | frame_length = 0.05  # seconds per analysis window 21 | hop_length = int(sample_rate * frame_shift) 22 | win_length = int(sample_rate * frame_length) 23 | n_mels = 80 24 | power = 1.2 25 | iteration_griffin = 50  # Griffin-Lim iterations 26 | preemphasis = 0.97  # pre-emphasis filter coefficient 27 | max_db = 100 28 | ref_db = 20 29 | embed_size = 256 30 | encoder_num_banks = 16 31 | decoder_num_banks = 8 32 | num_highwaynet_blocks = 4 33 | resampled = 5  # reduction factor: mel frames grouped per decoder step (see load_file) 34 | dropout_rate = 0.5 35 | learning_rate = 0.001 36 | batch_size = 32 37 | 38 | 39 | def get_spectrogram(audio_file):  # load a wav and return (mel, mag), both time-major float32, normalised into (0, 1] 40 | y, sr = librosa.load(audio_file, sr = sample_rate) 41 | y, _ = librosa.effects.trim(y)  # strip leading/trailing silence 42 | y = np.append(y[0], y[1:] - preemphasis * y[:-1])  # pre-emphasis filter 43 | linear = librosa.stft( 44 | y = y, 45 | n_fft = fourier_window_size, 46 | hop_length = hop_length, 47 | win_length = win_length, 48 | ) 49 | mag = np.abs(linear) 50 | mel_basis = librosa.filters.mel(sample_rate, fourier_window_size, n_mels) 51 | mel = np.dot(mel_basis, mag) 52 | mel = 20 * np.log10(np.maximum(1e-5, mel))  # amplitude -> dB, floored to avoid log(0) 53 | mag = 20 * np.log10(np.maximum(1e-5, mag)) 54 | mel = np.clip((mel - ref_db + max_db) / max_db, 1e-8, 1)  # normalise dB range into (0, 1] 55 | mag = np.clip((mag - ref_db + max_db) / max_db, 1e-8, 1) 56 | return mel.T.astype(np.float32), mag.T.astype(np.float32) 57 | 58 | 59 | def invert_spectrogram(spectrogram):  # inverse STFT back to a waveform 60 | return librosa.istft( 61 | spectrogram, hop_length, win_length = win_length, window = 'hann' 62 | ) 63 | 64 | 65 | def spectrogram2wav(mag):  # reconstruct audio from a normalised magnitude spectrogram 66 | mag = mag.T 67 | mag = (np.clip(mag, 0, 1) * max_db) - max_db + ref_db  # undo normalisation back to dB 68 | mag = np.power(10.0, mag * 0.05)  # dB -> linear amplitude 69 | wav = griffin_lim(mag) 70 | wav = signal.lfilter([1], [1, -preemphasis], wav)  # de-emphasis (inverse of the pre-emphasis filter) 71 | wav, _ = librosa.effects.trim(wav) 72 | return wav.astype(np.float32) 73 | 74 | 75 | def griffin_lim(spectrogram):  # iterative phase reconstruction from magnitudes 76 | X_best = copy.deepcopy(spectrogram) 77 | for i in range(iteration_griffin): 78 | X_T = invert_spectrogram(X_best) 79 | est = librosa.stft( 80 | X_T, fourier_window_size,
hop_length, win_length = win_length 81 | ) 82 | phase = est / np.maximum(1e-8, np.abs(est)) 83 | X_best = spectrogram * phase 84 | X_T = invert_spectrogram(X_best) 85 | return np.real(X_T) 86 | 87 | 88 | def get_wav(spectrogram): 89 | mag = (np.clip(spectrogram.T, 0, 1) * max_db) - max_db + ref_db 90 | mag = np.power(10.0, mag * 0.05) 91 | wav = griffin_lim(mag) 92 | wav = signal.lfilter([1], [1, -preemphasis], wav) 93 | return librosa.effects.trim(wav).astype(np.float32) 94 | 95 | 96 | def load_file(path): 97 | fname = os.path.basename(path) 98 | mel, mag = get_spectrogram(path) 99 | t = mel.shape[0] 100 | num_paddings = resampled - (t % resampled) if t % resampled != 0 else 0 101 | mel = np.pad(mel, [[0, num_paddings], [0, 0]], mode = 'constant') 102 | mag = np.pad(mag, [[0, num_paddings], [0, 0]], mode = 'constant') 103 | return fname, mel.reshape((-1, n_mels * resampled)), mag 104 | 105 | 106 | def text_normalize(text): 107 | text = ''.join( 108 | char 109 | for char in unicodedata.normalize('NFD', text) 110 | if unicodedata.category(char) != 'Mn' 111 | ) 112 | text = text.lower() 113 | text = re.sub('[^{}]'.format(vocab), ' ', text) 114 | text = re.sub('[ ]+', ' ', text) 115 | return text 116 | 117 | 118 | def get_cached(path): 119 | mel = 'mel/{}.npy'.format(path) 120 | mag = 'mag/{}.npy'.format(path) 121 | return np.load(mel), np.load(mag) 122 | 123 | def plot_alignment(alignment): 124 | fig, ax = plt.subplots() 125 | im = ax.imshow(alignment) 126 | fig.colorbar(im) 127 | plt.show() 128 | 129 | char2idx = {char: idx for idx, char in enumerate(vocab)} 130 | idx2char = {idx: char for idx, char in enumerate(vocab)} 131 | -------------------------------------------------------------------------------- /text-to-speech/README.md: -------------------------------------------------------------------------------- 1 | ## How-to 2 | 3 | 1. First, you need to run [download.ipynb](download.ipynb) 4 | 5 | 2. 
Go to any training folder, and run [caching.py](caching.py) 6 | 7 | 3. Run any notebook using Jupyter Notebook. 8 | -------------------------------------------------------------------------------- /text-to-speech/caching.py: -------------------------------------------------------------------------------- 1 | import tqdm 2 | import os 3 | import numpy as np 4 | from utils import load_file, path 5 | 6 | if not os.path.exists('mel'): 7 | os.mkdir('mel') 8 | if not os.path.exists('mag'): 9 | os.mkdir('mag') 10 | wav_files = [f for f in os.listdir(path) if f.endswith('.wav')] 11 | for fpath in tqdm.tqdm(wav_files): 12 | fname, mel, mag = load_file(path + fpath) 13 | np.save('mel/{}'.format(fname.replace('wav', 'npy')), mel) 14 | np.save('mag/{}'.format(fname.replace('wav', 'npy')), mag) 15 | -------------------------------------------------------------------------------- /text-to-speech/download.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from bs4 import BeautifulSoup\n", 10 | "from urllib.request import urlopen, urlretrieve\n", 11 | "from tqdm import tqdm\n", 12 | "import re\n", 13 | "import os" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stderr", 23 | "output_type": "stream", 24 | "text": [ 25 | "100%|███████████████████████████████| 200/200 [05:56<00:00, 1.84s/it]\n", 26 | "100%|███████████████████████████████| 200/200 [05:38<00:00, 1.69s/it]\n", 27 | "100%|███████████████████████████████| 200/200 [06:16<00:00, 1.82s/it]\n", 28 | "100%|███████████████████████████████| 200/200 [06:00<00:00, 1.76s/it]\n", 29 | "100%|███████████████████████████████| 200/200 [06:46<00:00, 2.47s/it]\n", 30 | "100%|███████████████████████████████| 200/200 [09:04<00:00, 2.60s/it]\n", 31 | 
"100%|███████████████████████████████| 200/200 [10:12<00:00, 2.87s/it]\n", 32 | "100%|███████████████████████████████| 200/200 [09:01<00:00, 2.63s/it]\n", 33 | "100%|███████████████████████████████| 200/200 [09:39<00:00, 3.47s/it]\n", 34 | "100%|███████████████████████████████| 200/200 [10:56<00:00, 3.04s/it]\n", 35 | "100%|███████████████████████████████| 200/200 [11:12<00:00, 3.06s/it]\n", 36 | "100%|███████████████████████████████| 200/200 [07:46<00:00, 2.32s/it]\n", 37 | "100%|███████████████████████████████| 200/200 [09:30<00:00, 2.83s/it]\n", 38 | "100%|███████████████████████████████| 200/200 [10:05<00:00, 3.83s/it]\n" 39 | ] 40 | } 41 | ], 42 | "source": [ 43 | "prefix = 'https://tspace.library.utoronto.ca'\n", 44 | "save_dir = './data/'\n", 45 | "if not os.path.exists(save_dir):\n", 46 | " os.makedirs(save_dir)\n", 47 | "\n", 48 | "base_url = 'https://tspace.library.utoronto.ca/handle/1807/24'\n", 49 | "urls = [base_url+str(i) for i in range(488, 502)]\n", 50 | "for url in urls:\n", 51 | " soup = BeautifulSoup(urlopen(url).read(), 'html5lib')\n", 52 | " targets = soup.findAll('a', href=re.compile(r'/bitstream/.*.wav'))\n", 53 | " \n", 54 | " for a in tqdm(targets, total=len(targets), ncols=70):\n", 55 | " link = a['href']\n", 56 | "\n", 57 | " audio_save_loc = save_dir + link.split('/')[-1]\n", 58 | " if os.path.isfile(audio_save_loc):\n", 59 | " print(\"File Already Exists\")\n", 60 | " urlretrieve(prefix+a['href'], audio_save_loc)\n", 61 | "\n", 62 | " with open(audio_save_loc.replace('.wav', '.txt'), 'w') as f:\n", 63 | " f.write('say the word ' + link.split('_')[-2])" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [] 72 | } 73 | ], 74 | "metadata": { 75 | "kernelspec": { 76 | "display_name": "Python 3", 77 | "language": "python", 78 | "name": "python3" 79 | }, 80 | "language_info": { 81 | "codemirror_mode": { 82 | "name": "ipython", 83 | "version": 3 84 | }, 85 | 
"file_extension": ".py", 86 | "mimetype": "text/x-python", 87 | "name": "python", 88 | "nbconvert_exporter": "python", 89 | "pygments_lexer": "ipython3", 90 | "version": "3.5.2" 91 | } 92 | }, 93 | "nbformat": 4, 94 | "nbformat_minor": 2 95 | } 96 | -------------------------------------------------------------------------------- /text-to-speech/test-bahdanau.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mesolitica/NLP-Models-Tensorflow/0741216aa8235e1228b3de7903cc36d73f8f2b45/text-to-speech/test-bahdanau.wav -------------------------------------------------------------------------------- /text-to-speech/test-dilated-cnn-monothonic-attention.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mesolitica/NLP-Models-Tensorflow/0741216aa8235e1228b3de7903cc36d73f8f2b45/text-to-speech/test-dilated-cnn-monothonic-attention.wav -------------------------------------------------------------------------------- /text-to-speech/test-dilated-cnn-self-attention.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mesolitica/NLP-Models-Tensorflow/0741216aa8235e1228b3de7903cc36d73f8f2b45/text-to-speech/test-dilated-cnn-self-attention.wav -------------------------------------------------------------------------------- /text-to-speech/test-luong.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mesolitica/NLP-Models-Tensorflow/0741216aa8235e1228b3de7903cc36d73f8f2b45/text-to-speech/test-luong.wav -------------------------------------------------------------------------------- /text-to-speech/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import librosa 3 | import copy 4 | from scipy import signal 5 | from scipy.io import wavfile 6 | import 
matplotlib.pyplot as plt 7 | import seaborn as sns 8 | import os 9 | import unicodedata 10 | import re 11 | 12 | # P: Padding 13 | # S: Start of Sentence 14 | # E: End of Sentence 15 | path = '../data/' 16 | vocab = "PSE abcdefghijklmnopqrstuvwxyz'.?" 17 | max_duration = 10.0 18 | sample_rate = 22050  # Hz 19 | fourier_window_size = 2048  # STFT n_fft 20 | frame_shift = 0.0125  # seconds between successive frames 21 | frame_length = 0.05  # seconds per analysis window 22 | hop_length = int(sample_rate * frame_shift) 23 | win_length = int(sample_rate * frame_length) 24 | n_mels = 80 25 | power = 1.2 26 | iteration_griffin = 50  # Griffin-Lim iterations 27 | preemphasis = 0.97  # pre-emphasis filter coefficient 28 | max_db = 100 29 | ref_db = 20 30 | embed_size = 256 31 | encoder_num_banks = 16 32 | decoder_num_banks = 8 33 | num_highwaynet_blocks = 4 34 | resampled = 5  # reduction factor: mel frames grouped per decoder step (see load_file) 35 | dropout_rate = 0.5 36 | learning_rate = 0.001 37 | batch_size = 32 38 | 39 | 40 | def get_spectrogram(audio_file):  # load a wav and return (mel, mag), both time-major float32, normalised into (0, 1] 41 | y, sr = librosa.load(audio_file, sr = sample_rate) 42 | y, _ = librosa.effects.trim(y)  # strip leading/trailing silence 43 | y = np.append(y[0], y[1:] - preemphasis * y[:-1])  # pre-emphasis filter 44 | linear = librosa.stft( 45 | y = y, 46 | n_fft = fourier_window_size, 47 | hop_length = hop_length, 48 | win_length = win_length, 49 | ) 50 | mag = np.abs(linear) 51 | mel_basis = librosa.filters.mel(sample_rate, fourier_window_size, n_mels) 52 | mel = np.dot(mel_basis, mag) 53 | mel = 20 * np.log10(np.maximum(1e-5, mel))  # amplitude -> dB, floored to avoid log(0) 54 | mag = 20 * np.log10(np.maximum(1e-5, mag)) 55 | mel = np.clip((mel - ref_db + max_db) / max_db, 1e-8, 1)  # normalise dB range into (0, 1] 56 | mag = np.clip((mag - ref_db + max_db) / max_db, 1e-8, 1) 57 | return mel.T.astype(np.float32), mag.T.astype(np.float32) 58 | 59 | 60 | def invert_spectrogram(spectrogram):  # inverse STFT back to a waveform 61 | return librosa.istft( 62 | spectrogram, hop_length, win_length = win_length, window = 'hann' 63 | ) 64 | 65 | 66 | def spectrogram2wav(mag):  # reconstruct audio from a normalised magnitude spectrogram 67 | mag = mag.T 68 | mag = (np.clip(mag, 0, 1) * max_db) - max_db + ref_db  # undo normalisation back to dB 69 | mag = np.power(10.0, mag * 0.05)  # dB -> linear amplitude 70 | wav = griffin_lim(mag) 71 | wav = signal.lfilter([1], [1, -preemphasis], wav)  # de-emphasis (inverse of the pre-emphasis filter) 72 | wav, _ = librosa.effects.trim(wav) 73
| return wav.astype(np.float32) 74 | 75 | 76 | def griffin_lim(spectrogram): 77 | X_best = copy.deepcopy(spectrogram) 78 | for i in range(iteration_griffin): 79 | X_T = invert_spectrogram(X_best) 80 | est = librosa.stft( 81 | X_T, fourier_window_size, hop_length, win_length = win_length 82 | ) 83 | phase = est / np.maximum(1e-8, np.abs(est)) 84 | X_best = spectrogram * phase 85 | X_T = invert_spectrogram(X_best) 86 | return np.real(X_T) 87 | 88 | 89 | def get_wav(spectrogram): 90 | mag = (np.clip(spectrogram.T, 0, 1) * max_db) - max_db + ref_db 91 | mag = np.power(10.0, mag * 0.05) 92 | wav = griffin_lim(mag) 93 | wav = signal.lfilter([1], [1, -preemphasis], wav) 94 | return librosa.effects.trim(wav).astype(np.float32) 95 | 96 | 97 | def load_file(path): 98 | fname = os.path.basename(path) 99 | mel, mag = get_spectrogram(path) 100 | t = mel.shape[0] 101 | num_paddings = resampled - (t % resampled) if t % resampled != 0 else 0 102 | mel = np.pad(mel, [[0, num_paddings], [0, 0]], mode = 'constant') 103 | mag = np.pad(mag, [[0, num_paddings], [0, 0]], mode = 'constant') 104 | return fname, mel.reshape((-1, n_mels * resampled)), mag 105 | 106 | 107 | def text_normalize(text): 108 | text = ''.join( 109 | char 110 | for char in unicodedata.normalize('NFD', text) 111 | if unicodedata.category(char) != 'Mn' 112 | ) 113 | text = text.lower() 114 | text = re.sub('[^{}]'.format(vocab), ' ', text) 115 | text = re.sub('[ ]+', ' ', text) 116 | return text 117 | 118 | 119 | def get_cached(path): 120 | mel = 'mel/{}.npy'.format(path) 121 | mag = 'mag/{}.npy'.format(path) 122 | return np.load(mel), np.load(mag) 123 | 124 | def plot_alignment(alignment, e): 125 | fig, ax = plt.subplots() 126 | im = ax.imshow(alignment) 127 | fig.colorbar(im) 128 | plt.title('epoch %d' % (e)) 129 | plt.show() 130 | 131 | char2idx = {char: idx for idx, char in enumerate(vocab)} 132 | idx2char = {idx: char for idx, char in enumerate(vocab)} 133 | 
-------------------------------------------------------------------------------- /topic-generator/README.md: -------------------------------------------------------------------------------- 1 | ## How-to 2 | 3 | 1. Download dataset, 4 | ```bash 5 | wget https://github.com/huseinzol05/Malaya-Dataset/raw/master/news/news.zip 6 | ``` 7 | 8 | 2. Unzip the dataset, 9 | ```bash 10 | unzip news.zip 11 | ``` 12 | 13 | 3. Run any notebook using Jupyter Notebook. 14 | -------------------------------------------------------------------------------- /topic-model/prepro_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | 6 | import unicodedata 7 | import six 8 | from functools import partial 9 | 10 | 11 | SPIECE_UNDERLINE = '▁' 12 | 13 | 14 | def printable_text(text): 15 | """Returns text encoded in a way suitable for print or `tf.logging`.""" 16 | 17 | # These functions want `str` for both Python2 and Python3, but in one case 18 | # it's a Unicode string and in the other it's a byte string. 
19 | if six.PY3: 20 | if isinstance(text, str): 21 | return text 22 | elif isinstance(text, bytes): 23 | return text.decode('utf-8', 'ignore') 24 | else: 25 | raise ValueError('Unsupported string type: %s' % (type(text))) 26 | elif six.PY2: 27 | if isinstance(text, str): 28 | return text 29 | elif isinstance(text, unicode): 30 | return text.encode('utf-8') 31 | else: 32 | raise ValueError('Unsupported string type: %s' % (type(text))) 33 | else: 34 | raise ValueError('Not running on Python2 or Python 3?') 35 | 36 | 37 | def print_(*args): 38 | new_args = [] 39 | for arg in args: 40 | if isinstance(arg, list): 41 | s = [printable_text(i) for i in arg] 42 | s = ' '.join(s) 43 | new_args.append(s) 44 | else: 45 | new_args.append(printable_text(arg)) 46 | print(*new_args) 47 | 48 | 49 | def preprocess_text( 50 | inputs, lower = False, remove_space = True, keep_accents = False 51 | ): 52 | if remove_space: 53 | outputs = ' '.join(inputs.strip().split()) 54 | else: 55 | outputs = inputs 56 | outputs = outputs.replace('``', '"').replace("''", '"') 57 | 58 | if six.PY2 and isinstance(outputs, str): 59 | outputs = outputs.decode('utf-8') 60 | 61 | if not keep_accents: 62 | outputs = unicodedata.normalize('NFKD', outputs) 63 | outputs = ''.join([c for c in outputs if not unicodedata.combining(c)]) 64 | if lower: 65 | outputs = outputs.lower() 66 | 67 | return outputs 68 | 69 | 70 | def encode_pieces(sp_model, text, return_unicode = True, sample = False): 71 | # return_unicode is used only for py2 72 | 73 | # note(zhiliny): in some systems, sentencepiece only accepts str for py2 74 | if six.PY2 and isinstance(text, unicode): 75 | text = text.encode('utf-8') 76 | 77 | if not sample: 78 | pieces = sp_model.EncodeAsPieces(text) 79 | else: 80 | pieces = sp_model.SampleEncodeAsPieces(text, 64, 0.1) 81 | new_pieces = [] 82 | for piece in pieces: 83 | if len(piece) > 1 and piece[-1] == ',' and piece[-2].isdigit(): 84 | cur_pieces = sp_model.EncodeAsPieces( 85 | 
piece[:-1].replace(SPIECE_UNDERLINE, '') 86 | ) 87 | if ( 88 | piece[0] != SPIECE_UNDERLINE 89 | and cur_pieces[0][0] == SPIECE_UNDERLINE 90 | ): 91 | if len(cur_pieces[0]) == 1: 92 | cur_pieces = cur_pieces[1:] 93 | else: 94 | cur_pieces[0] = cur_pieces[0][1:] 95 | cur_pieces.append(piece[-1]) 96 | new_pieces.extend(cur_pieces) 97 | else: 98 | new_pieces.append(piece) 99 | 100 | # note(zhiliny): convert back to unicode for py2 101 | if six.PY2 and return_unicode: 102 | ret_pieces = [] 103 | for piece in new_pieces: 104 | if isinstance(piece, str): 105 | piece = piece.decode('utf-8') 106 | ret_pieces.append(piece) 107 | new_pieces = ret_pieces 108 | 109 | return new_pieces 110 | 111 | 112 | def encode_ids(sp_model, text, sample = False): 113 | pieces = encode_pieces( 114 | sp_model, text, return_unicode = False, sample = sample 115 | ) 116 | ids = [sp_model.PieceToId(piece) for piece in pieces] 117 | return ids 118 | 119 | 120 | if __name__ == '__main__': 121 | import sentencepiece as spm 122 | 123 | sp = spm.SentencePieceProcessor() 124 | sp.load('sp10m.uncased.v3.model') 125 | 126 | print_(u'I was born in 2000, and this is falsé.') 127 | print_( 128 | u'ORIGINAL', 129 | sp.EncodeAsPieces(u'I was born in 2000, and this is falsé.'), 130 | ) 131 | print_( 132 | u'OURS', encode_pieces(sp, u'I was born in 2000, and this is falsé.') 133 | ) 134 | print(encode_ids(sp, u'I was born in 2000, and this is falsé.')) 135 | print_('') 136 | prepro_func = partial(preprocess_text, lower = True) 137 | print_(prepro_func('I was born in 2000, and this is falsé.')) 138 | print_( 139 | 'ORIGINAL', 140 | sp.EncodeAsPieces( 141 | prepro_func('I was born in 2000, and this is falsé.') 142 | ), 143 | ) 144 | print_( 145 | 'OURS', 146 | encode_pieces( 147 | sp, prepro_func('I was born in 2000, and this is falsé.') 148 | ), 149 | ) 150 | print(encode_ids(sp, prepro_func('I was born in 2000, and this is falsé.'))) 151 | print_('') 152 | print_('I was born in 2000, and this is falsé.') 153 | 
print_( 154 | 'ORIGINAL', sp.EncodeAsPieces('I was born in 2000, and this is falsé.') 155 | ) 156 | print_('OURS', encode_pieces(sp, 'I was born in 2000, and this is falsé.')) 157 | print(encode_ids(sp, 'I was born in 2000, and this is falsé.')) 158 | print_('') 159 | print_('I was born in 92000, and this is falsé.') 160 | print_( 161 | 'ORIGINAL', sp.EncodeAsPieces('I was born in 92000, and this is falsé.') 162 | ) 163 | print_('OURS', encode_pieces(sp, 'I was born in 92000, and this is falsé.')) 164 | print(encode_ids(sp, 'I was born in 92000, and this is falsé.')) 165 | -------------------------------------------------------------------------------- /topic-model/utils.py: -------------------------------------------------------------------------------- 1 | import sklearn.datasets 2 | import numpy as np 3 | import re 4 | import collections 5 | import random 6 | from sklearn import metrics 7 | from nltk.corpus import stopwords 8 | 9 | english_stopwords = stopwords.words('english') 10 | 11 | 12 | def clearstring(string): 13 | string = re.sub('[^A-Za-z0-9 ]+', '', string) 14 | string = string.split(' ') 15 | string = filter(None, string) 16 | string = [y.strip() for y in string if y.strip() not in english_stopwords] 17 | string = ' '.join(string) 18 | return string.lower() 19 | 20 | 21 | def separate_dataset(trainset, ratio = 0.5): 22 | datastring = [] 23 | datatarget = [] 24 | for i in range(len(trainset.data)): 25 | data_ = trainset.data[i].split('\n') 26 | data_ = list(filter(None, data_)) 27 | data_ = random.sample(data_, int(len(data_) * ratio)) 28 | for n in range(len(data_)): 29 | data_[n] = clearstring(data_[n]) 30 | datastring += data_ 31 | for n in range(len(data_)): 32 | datatarget.append(trainset.target[i]) 33 | return datastring, datatarget 34 | 35 | 36 | def build_dataset(words, n_words): 37 | count = [['GO', 0], ['PAD', 1], ['EOS', 2], ['UNK', 3]] 38 | count.extend(collections.Counter(words).most_common(n_words - 1)) 39 | dictionary = dict() 40 | for 
word, _ in count: 41 | dictionary[word] = len(dictionary) 42 | data = list() 43 | unk_count = 0 44 | for word in words: 45 | index = dictionary.get(word, 0) 46 | if index == 0: 47 | unk_count += 1 48 | data.append(index) 49 | count[0][1] = unk_count 50 | reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys())) 51 | return data, count, dictionary, reversed_dictionary 52 | 53 | 54 | def str_idx(corpus, dic, maxlen, UNK = 3): 55 | X = np.zeros((len(corpus), maxlen)) 56 | for i in range(len(corpus)): 57 | for no, k in enumerate(corpus[i].split()[:maxlen][::-1]): 58 | X[i, -1 - no] = dic.get(k, UNK) 59 | return X 60 | -------------------------------------------------------------------------------- /unsupervised-extractive-summarization/README.md: -------------------------------------------------------------------------------- 1 | ## How-to 2 | 3 | 1. Run any notebook using Jupyter Notebook. 4 | -------------------------------------------------------------------------------- /vectorizer/README.md: -------------------------------------------------------------------------------- 1 | ## How-to 2 | 3 | 1. Make sure `data` folder in the same directory of the notebooks. 4 | 5 | 2. Run any notebook using Jupyter Notebook. 
6 | -------------------------------------------------------------------------------- /vectorizer/utils.py: -------------------------------------------------------------------------------- 1 | import sklearn.datasets 2 | import numpy as np 3 | import re 4 | import collections 5 | import random 6 | from sklearn import metrics 7 | from nltk.corpus import stopwords 8 | 9 | english_stopwords = stopwords.words('english') 10 | 11 | 12 | def clearstring(string): 13 | string = re.sub('[^A-Za-z0-9 ]+', '', string) 14 | string = string.split(' ') 15 | string = filter(None, string) 16 | string = [y.strip() for y in string if y.strip() not in english_stopwords] 17 | string = ' '.join(string) 18 | return string.lower() 19 | 20 | 21 | def separate_dataset(trainset, ratio = 0.5): 22 | datastring = [] 23 | datatarget = [] 24 | for i in range(len(trainset.data)): 25 | data_ = trainset.data[i].split('\n') 26 | data_ = list(filter(None, data_)) 27 | data_ = random.sample(data_, int(len(data_) * ratio)) 28 | for n in range(len(data_)): 29 | data_[n] = clearstring(data_[n]) 30 | datastring += data_ 31 | for n in range(len(data_)): 32 | datatarget.append(trainset.target[i]) 33 | return datastring, datatarget 34 | 35 | 36 | def build_dataset(words, n_words): 37 | count = [['GO', 0], ['PAD', 1], ['EOS', 2], ['UNK', 3]] 38 | count.extend(collections.Counter(words).most_common(n_words - 1)) 39 | dictionary = dict() 40 | for word, _ in count: 41 | dictionary[word] = len(dictionary) 42 | data = list() 43 | unk_count = 0 44 | for word in words: 45 | index = dictionary.get(word, 0) 46 | if index == 0: 47 | unk_count += 1 48 | data.append(index) 49 | count[0][1] = unk_count 50 | reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys())) 51 | return data, count, dictionary, reversed_dictionary 52 | 53 | 54 | def str_idx(corpus, dic, maxlen, UNK = 3): 55 | X = np.zeros((len(corpus), maxlen)) 56 | for i in range(len(corpus)): 57 | for no, k in 
enumerate(corpus[i].split()[:maxlen][::-1]): 58 | X[i, -1 - no] = dic.get(k, UNK) 59 | return X 60 | -------------------------------------------------------------------------------- /vocoder/README.md: -------------------------------------------------------------------------------- 1 | ## How-to 2 | 3 | 1. First, you need to run [download.ipynb](download.ipynb) 4 | 5 | 2. Go to any training folder, and run [caching-vocoder.ipynb](caching-vocoder.ipynb) 6 | 7 | 3. Run any notebook using Jupyter Notebook. 8 | -------------------------------------------------------------------------------- /vocoder/caching-vocoder.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import librosa\n", 11 | "import os\n", 12 | "from tqdm import tqdm" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "sample_rate = 22050\n", 22 | "fourier_window_size = 2048\n", 23 | "max_db = 100\n", 24 | "preemphasis = 0.97\n", 25 | "frame_shift = 0.0125\n", 26 | "frame_length = 0.05\n", 27 | "hop_length = int(sample_rate * frame_shift)\n", 28 | "win_length = int(sample_rate * frame_length)\n", 29 | "n_mels = 80\n", 30 | "ref_db = 20\n", 31 | "resampled = 5\n", 32 | "\n", 33 | "def get_spectrogram(audio_file):\n", 34 | " y, sr = librosa.load(audio_file, sr = sample_rate)\n", 35 | " y, _ = librosa.effects.trim(y)\n", 36 | " y = np.append(y[0], y[1:] - preemphasis * y[:-1])\n", 37 | " linear = librosa.stft(\n", 38 | " y = y,\n", 39 | " n_fft = fourier_window_size,\n", 40 | " hop_length = hop_length,\n", 41 | " win_length = win_length,\n", 42 | " )\n", 43 | " mag = np.abs(linear)\n", 44 | " mel_basis = librosa.filters.mel(sample_rate, fourier_window_size, n_mels)\n", 45 | " mel = np.dot(mel_basis, mag)\n", 46 | " mel = 
20 * np.log10(np.maximum(1e-5, mel))\n", 47 | " mag = 20 * np.log10(np.maximum(1e-5, mag))\n", 48 | " mel = np.clip((mel - ref_db + max_db) / max_db, 1e-8, 1)\n", 49 | " mag = np.clip((mag - ref_db + max_db) / max_db, 1e-8, 1)\n", 50 | " return mel.T.astype(np.float32), mag.T.astype(np.float32)\n", 51 | "\n", 52 | "def load_file(path):\n", 53 | " fname = os.path.basename(path)\n", 54 | " mel, mag = get_spectrogram(path)\n", 55 | " t = mel.shape[0]\n", 56 | " num_paddings = resampled - (t % resampled) if t % resampled != 0 else 0\n", 57 | " mel = np.pad(mel, [[0, num_paddings], [0, 0]], mode = 'constant')\n", 58 | " mag = np.pad(mag, [[0, num_paddings], [0, 0]], mode = 'constant')\n", 59 | " return fname, mel.reshape((-1, n_mels * resampled)), mag" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 3, 65 | "metadata": {}, 66 | "outputs": [ 67 | { 68 | "name": "stderr", 69 | "output_type": "stream", 70 | "text": [ 71 | "100%|██████████| 200/200 [00:25<00:00, 7.88it/s]\n" 72 | ] 73 | } 74 | ], 75 | "source": [ 76 | "if not os.path.exists('mel_old'):\n", 77 | " os.mkdir('mel_old')\n", 78 | "if not os.path.exists('mag_old'):\n", 79 | " os.mkdir('mag_old')\n", 80 | "\n", 81 | "wav_files = [f for f in os.listdir('old') if f.endswith('.wav')]\n", 82 | "\n", 83 | "for fpath in tqdm(wav_files):\n", 84 | " fname, mel, mag = load_file('old/' + fpath)\n", 85 | " np.save('mel_old/{}'.format(fname.replace('wav', 'npy')), mel)\n", 86 | " np.save('mag_old/{}'.format(fname.replace('wav', 'npy')), mag)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 4, 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "name": "stderr", 96 | "output_type": "stream", 97 | "text": [ 98 | "100%|██████████| 200/200 [00:25<00:00, 7.98it/s]\n" 99 | ] 100 | } 101 | ], 102 | "source": [ 103 | "if not os.path.exists('mel_young'):\n", 104 | " os.mkdir('mel_young')\n", 105 | "if not os.path.exists('mag_young'):\n", 106 | " os.mkdir('mag_young')\n", 107 | "\n", 108 | 
"wav_files = [f for f in os.listdir('young') if f.endswith('.wav')]\n", 109 | "\n", 110 | "for fpath in tqdm(wav_files):\n", 111 | " fname, mel, mag = load_file('young/' + fpath)\n", 112 | " np.save('mel_young/{}'.format(fname.replace('wav', 'npy')), mel)\n", 113 | " np.save('mag_young/{}'.format(fname.replace('wav', 'npy')), mag)" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [] 122 | } 123 | ], 124 | "metadata": { 125 | "kernelspec": { 126 | "display_name": "Python 3", 127 | "language": "python", 128 | "name": "python3" 129 | }, 130 | "language_info": { 131 | "codemirror_mode": { 132 | "name": "ipython", 133 | "version": 3 134 | }, 135 | "file_extension": ".py", 136 | "mimetype": "text/x-python", 137 | "name": "python", 138 | "nbconvert_exporter": "python", 139 | "pygments_lexer": "ipython3", 140 | "version": "3.6.8" 141 | } 142 | }, 143 | "nbformat": 4, 144 | "nbformat_minor": 2 145 | } 146 | -------------------------------------------------------------------------------- /vocoder/download.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from bs4 import BeautifulSoup\n", 10 | "from urllib.request import urlopen, urlretrieve\n", 11 | "from tqdm import tqdm\n", 12 | "import re\n", 13 | "import os" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "young\n" 26 | ] 27 | }, 28 | { 29 | "name": "stderr", 30 | "output_type": "stream", 31 | "text": [ 32 | "100%|███████████████████████████████| 200/200 [04:26<00:00, 1.29s/it]\n" 33 | ] 34 | }, 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | "old\n" 40 | ] 41 | }, 42 | { 43 | 
"name": "stderr", 44 | "output_type": "stream", 45 | "text": [ 46 | "100%|███████████████████████████████| 200/200 [04:30<00:00, 1.30s/it]\n" 47 | ] 48 | } 49 | ], 50 | "source": [ 51 | "prefix = 'https://tspace.library.utoronto.ca'\n", 52 | "base_url = 'https://tspace.library.utoronto.ca/handle/1807/24'\n", 53 | "\n", 54 | "directories = {'young':493, 'old': 501}\n", 55 | "\n", 56 | "for k, v in directories.items():\n", 57 | " print(k)\n", 58 | " save_dir = './%s/'%(k)\n", 59 | " if not os.path.exists(save_dir):\n", 60 | " os.makedirs(save_dir)\n", 61 | "\n", 62 | " url = base_url + str(v)\n", 63 | " soup = BeautifulSoup(urlopen(url).read(), 'html5lib')\n", 64 | " targets = soup.findAll('a', href=re.compile(r'/bitstream/.*.wav'))\n", 65 | " \n", 66 | " for a in tqdm(targets, total=len(targets), ncols=70):\n", 67 | " link = a['href']\n", 68 | "\n", 69 | " audio_save_loc = save_dir + link.split('/')[-1]\n", 70 | " if os.path.isfile(audio_save_loc):\n", 71 | " print(\"File Already Exists\")\n", 72 | " urlretrieve(prefix+a['href'], audio_save_loc)\n", 73 | "\n", 74 | " with open(audio_save_loc.replace('.wav', '.txt'), 'w') as f:\n", 75 | " f.write('say the word ' + link.split('_')[-2])" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [] 84 | } 85 | ], 86 | "metadata": { 87 | "kernelspec": { 88 | "display_name": "Python 3", 89 | "language": "python", 90 | "name": "python3" 91 | }, 92 | "language_info": { 93 | "codemirror_mode": { 94 | "name": "ipython", 95 | "version": 3 96 | }, 97 | "file_extension": ".py", 98 | "mimetype": "text/x-python", 99 | "name": "python", 100 | "nbconvert_exporter": "python", 101 | "pygments_lexer": "ipython3", 102 | "version": "3.6.8" 103 | } 104 | }, 105 | "nbformat": 4, 106 | "nbformat_minor": 2 107 | } 108 | --------------------------------------------------------------------------------