├── .gitignore ├── LICENSE ├── README.md ├── abstractive-summarization ├── 1.lstm-seq2seq-greedy.ipynb ├── 10.bert-dilated-fair.ipynb ├── 11.self-attention-pointer-generator.ipynb ├── 12.dilated-fairseq-pointer-generator.ipynb ├── 2.lstm-seq2seq-greedy-luong.ipynb ├── 3.lstm-seq2seq-beam.ipynb ├── 4.lstm-birnn-seq2seq-beam-luong.ipynb ├── 5.xueyouluo-pointer-generator-bahdanau.ipynb ├── 6.copynet.ipynb ├── 7.xueyouluo-pointer-generator-luong.ipynb ├── 8.dilated-seq2seq.ipynb ├── 9.dilated-seq2seq-self-attention.ipynb ├── README.md ├── dataset.tar.gz ├── dataset │ ├── ctexts.json │ └── headlines.json └── pointer_generator_helper.py ├── attention ├── 1.bahdanau.ipynb ├── 2.luong.ipynb ├── 3.hierarchical.ipynb ├── 4.additive.ipynb ├── 5.soft.ipynb ├── 6.attention-over-attention.ipynb ├── 7.bahdanau-api.ipynb └── 8.luong-api.ipynb ├── chatbot ├── 1.basic-seq2seq-manual.ipynb ├── 10.basic-birnn-seq2seq-greedy.ipynb ├── 11.lstm-birnn-seq2seq-greedy.ipynb ├── 12.gru-birnn-seq2seq-greedy.ipynb ├── 13.basic-seq2seq-luong.ipynb ├── 14.lstm-seq2seq-luong.ipynb ├── 15.gru-seq2seq-luong.ipynb ├── 16.basic-seq2seq-bahdanau.ipynb ├── 17.lstm-seq2seq-bahdanau.ipynb ├── 18.gru-seq2seq-bahdanau.ipynb ├── 19.lstm-birnn-seq2seq-luong.ipynb ├── 2.lstm-seq2seq-manual.ipynb ├── 20.gru-birnn-seq2seq-luong.ipynb ├── 21.lstm-birnn-seq2seq-bahdanau.ipynb ├── 22.gru-birnn-seq2seq-bahdanau.ipynb ├── 23.lstm-birnn-seq2seq-bahdanau-luong.ipynb ├── 24.gru-birnn-seq2seq-bahdanau-luong.ipynb ├── 25.lstm-seq2seq-greedy-luong.ipynb ├── 26.gru-seq2seq-greedy-luong.ipynb ├── 27.lstm-seq2seq-greedy-bahdanau.ipynb ├── 28.gru-seq2seq-greedy-bahdanau.ipynb ├── 29.lstm-seq2seq-beam.ipynb ├── 3.gru-seq2seq-manual.ipynb ├── 30.gru-seq2seq-beam.ipynb ├── 31.lstm-birnn-seq2seq-beam-luong.ipynb ├── 32.gru-birnn-seq2seq-beam-luong.ipynb ├── 33.lstm-birnn-seq2seq-luong-bahdanau-stack-beam.ipynb ├── 34.gru-birnn-seq2seq-luong-bahdanau-stack-beam.ipynb ├── 35.byte-net.ipynb ├── 35.byte-net.py ├── 36.estimator.ipynb 
├── 37.capsule-lstm-seq2seq-greedy.ipynb ├── 37.capsule-lstm-seq2seq-greedy.py ├── 38.capsule-lstm-seq2seq-luong-beam.ipynb ├── 38.capsule-lstm-seq2seq-luong-beam.py ├── 39.lstm-birnn-seq2seq-luong-bahdanau-stack-beam-dropout-l2.ipynb ├── 4.basic-seq2seq-api-greedy.ipynb ├── 40.dnc-seq2seq-bahdanau-greedy.ipynb ├── 41.lstm-birnn-seq2seq-beam-luongmonotic.ipynb ├── 42.lstm-birnn-seq2seq-beam-bahdanaumonotic.ipynb ├── 43.memory-network-basic.ipynb ├── 44.memory-network-lstm.ipynb ├── 45.attention-is-all-you-need.ipynb ├── 46.transformer-xl.ipynb ├── 47.attention-is-all-you-need-beam-search.ipynb ├── 48.transformer-xl-lstm.ipynb ├── 49.gpt-2-lstm.ipynb ├── 5.lstm-seq2seq-api-greedy.ipynb ├── 50.conv-encoder-conv-decoder.ipynb ├── 51.conv-encoder-lstm.ipynb ├── 52.tacotron-greedy.ipynb ├── 53.tacotron-beam.ipynb ├── 54.google-nmt.ipynb ├── 6.gru-seq2seq-greedy.ipynb ├── 7.basic-birnn-seq2seq-manual.ipynb ├── 8.lstm-birnn-seq2seq-manual.ipynb ├── 9.gru-birnn-seq2seq-manual.ipynb ├── README.md ├── access.py ├── addressing.py ├── dataset.tar.gz ├── dnc.py ├── gpt_2.py └── util.py ├── classification-comparison ├── Deep-learning │ ├── LNLSTM-vector.ipynb │ ├── bidirectional-rnn-vector.ipynb │ ├── cnn-rnn-vector.ipynb │ ├── cnn-vector.ipynb │ ├── feedforward-vector.ipynb │ ├── kmax-conv-vector.ipynb │ ├── multihead-attention.ipynb │ ├── ntm-vector.ipynb │ ├── only-attention-vector.ipynb │ ├── rnn-attention-vector.ipynb │ ├── rnn-timestamp.ipynb │ ├── rnn-vector-hinge.ipynb │ ├── rnn-vector-huber.ipynb │ ├── rnn-vector-stack.ipynb │ ├── rnn-vector.ipynb │ ├── self-optimized-feedforward-timestamp.ipynb │ ├── seq2seq-vector-stable.ipynb │ └── seq2seq-vector.ipynb ├── Ensemble │ ├── featuring-ensemble.ipynb │ └── oracle.ipynb ├── LGB │ ├── lgb-tfidf-svd50.ipynb │ ├── lgb-tfidf.ipynb │ ├── lgb-timestamp.ipynb │ └── nce-vector-lgb.ipynb ├── NB-SVM │ └── NB-SVM.ipynb ├── Naive-Bayes │ └── Bayes classifier.ipynb ├── README.md ├── SVM │ └── SVM.ipynb ├── XGB │ ├── xgb-bow.ipynb │ ├── 
xgb-tfidf-svd50.ipynb │ ├── xgb-tfidf.ipynb │ ├── xgb-timestamp-avg.ipynb │ └── xgb-timestamp50.ipynb └── preparation │ ├── dictionary_emotion.p │ ├── prepare-dataset.ipynb │ ├── prepare-vocab.ipynb │ └── word-vector.ipynb ├── dependency-parser ├── 1.lstm-birnn-crf-biaffine.ipynb ├── 2.lstm-birnn-bahdanau-crf-biaffine.ipynb ├── 3.lstm-birnn-luong-crf-biaffine.ipynb ├── 4.bert-crf-biaffine.ipynb ├── 5.biaffine-attention-cross-entropy.ipynb ├── 6.bert-biaffine-attention-cross-entropy.ipynb ├── 7.stackpointer.ipynb ├── 8.xlnet-biaffine-attention-cross-entropy.ipynb └── README.md ├── entity-tagging ├── 1.rnn-lstm-crf.ipynb ├── 2.rnn-lstm-crf-luong.ipynb ├── 3.rnn-lstm-crf-bahdanau.ipynb ├── 4.rnn-lstm-crf-bahdanau-ngrams.ipynb ├── 5.rnn-lstm-crf-luong-ngrams.ipynb ├── 6.cnn-residual-bahdanau-ngrams.ipynb ├── 7.attention-is-all-you-need.ipynb ├── 8.bert.ipynb ├── 9.xlnet-base.ipynb └── README.md ├── extractive-summarization ├── 1.rnn-lstm.ipynb ├── 2.dilated-cnn.ipynb ├── 3.multihead-attention.ipynb ├── 4.bert-base.ipynb ├── download-data.ipynb ├── modeling.py ├── preprocessing-data-bert.ipynb └── preprocessing-data.ipynb ├── generator ├── 1.char-generator-lstm.ipynb ├── 10.gru-seq2seq-beam-word.ipynb ├── 11.gru-seq2seq-bahdanau-greedy-char.ipynb ├── 12.gru-seq2seq-bahdanau-greedy-word.ipynb ├── 13.dilated-cnn-beam.ipynb ├── 14.transformer-beam.ipynb ├── 15.transformer-xl-beam.ipynb ├── 2.char-rnn-beam.ipynb ├── 3.char-generator-lstm-embedding.ipynb ├── 4.word-generator-lstm.ipynb ├── 5.word-generator-lstm-embedding.ipynb ├── 6.gru-seq2seq-greedy-char.ipynb ├── 7.gru-seq2seq-greedy-word.ipynb ├── 8.char-generator-lstm-bahdanau.ipynb ├── 9.char-generator-lstm-luong.ipynb ├── README.md └── shakespeare.txt ├── language-detection ├── 1.fast-text-ngrams.ipynb └── README.md ├── neural-machine-translation ├── 1.basic-seq2seq.ipynb ├── 10.basic-birnn-seq2seq-contrib-greedy.ipynb ├── 11.lstm-birnn-seq2seq-contrib-greedy.ipynb ├── 12.gru-birnn-seq2seq-contrib-greedy.ipynb ├── 
13.basic-seq2seq-luong.ipynb ├── 14.lstm-seq2seq-luong.ipynb ├── 15.gru-seq2seq-luong.ipynb ├── 16.basic-seq2seq-bahdanau.ipynb ├── 17.lstm-seq2seq-bahdanau.ipynb ├── 18.gru-seq2seq-bahdanau.ipynb ├── 19.basic-birnn-seq2seq-bahdanau.ipynb ├── 2.lstm-seq2seq.ipynb ├── 20.lstm-birnn-seq2seq-bahdanau.ipynb ├── 21.gru-birnn-seq2seq-bahdanau.ipynb ├── 22.basic-birnn-seq2seq-luong.ipynb ├── 23.lstm-birnn-seq2seq-luong.ipynb ├── 24.gru-birnn-seq2seq-luong.ipynb ├── 25.lstm-seq2seq-contrib-greedy-luong.ipynb ├── 26.gru-seq2seq-contrib-greedy-luong.ipynb ├── 27.lstm-seq2seq-contrib-greedy-bahdanau.ipynb ├── 28.gru-seq2seq-contrib-greedy-bahdanau.ipynb ├── 29.lstm-seq2seq-contrib-beam-luong.ipynb ├── 3.gru-seq2seq.ipynb ├── 30.gru-seq2seq-contrib-beam-luong.ipynb ├── 31.lstm-seq2seq-contrib-beam-bahdanau.ipynb ├── 32.gru-seq2seq-contrib-beam-bahdanau.ipynb ├── 33.lstm-birnn-seq2seq-contrib-beam-bahdanau.ipynb ├── 34.lstm-birnn-seq2seq-contrib-beam-luong.ipynb ├── 35.gru-birnn-seq2seq-contrib-beam-bahdanau.ipynb ├── 36.gru-birnn-seq2seq-contrib-beam-luong.ipynb ├── 37.lstm-birnn-seq2seq-contrib-beam-luongmonotonic.ipynb ├── 38.gru-birnn-seq2seq-contrib-beam-luongmonotic.ipynb ├── 39.lstm-birnn-seq2seq-contrib-beam-bahdanaumonotonic.ipynb ├── 4.basic-seq2seq-contrib-greedy.ipynb ├── 40.gru-birnn-seq2seq-contrib-beam-bahdanaumonotic.ipynb ├── 41.residual-lstm-seq2seq-greedy-luong.ipynb ├── 42.residual-gru-seq2seq-greedy-luong.ipynb ├── 43.residual-lstm-seq2seq-greedy-bahdanau.ipynb ├── 44.residual-gru-seq2seq-greedy-bahdanau.ipynb ├── 45.memory-network-lstm-decoder-greedy.ipynb ├── 46.google-nmt.ipynb ├── 47.transformer-encoder-transformer-decoder.ipynb ├── 48.transformer-encoder-lstm-decoder-greedy.ipynb ├── 49.bertmultilanguage-encoder-bertmultilanguage-decoder.ipynb ├── 5.lstm-seq2seq-contrib-greedy.ipynb ├── 50.bertmultilanguage-encoder-lstm-decoder.ipynb ├── 51.bertmultilanguage-encoder-transformer-decoder.ipynb ├── 52.bertenglish-encoder-transformer-decoder.ipynb ├── 
53.transformer-t2t-2gpu.ipynb ├── 6.gru-seq2seq-contrib-greedy.ipynb ├── 7.basic-birnn-seq2seq.ipynb ├── 8.lstm-birnn-seq2seq.ipynb ├── 9.gru-birnn-seq2seq.ipynb ├── README.md ├── bert_decoder.py ├── electra │ └── model │ │ └── optimization.py ├── prepare-bpe.ipynb ├── prepare-dataset.ipynb ├── prepare-t2t.ipynb ├── t │ ├── text_encoder.py │ └── tokenizer.py └── transformer │ ├── attention_layer.py │ ├── beam_search.py │ ├── embedding_layer.py │ ├── ffn_layer.py │ ├── model_utils.py │ ├── transformer.py │ └── utils.py ├── nlp-tf.png ├── not-deep-learning ├── decomposition-summarization │ ├── 1.lda.ipynb │ ├── 2.lsa.ipynb │ └── 3.nmf.ipynb └── markov-chatbot │ └── markov-chatbot.ipynb ├── ocr ├── 1.cnn-rnn-ctc.ipynb ├── 2.im2latex.ipynb └── README.md ├── pos-tagging ├── 1.rnn-lstm-crf.ipynb ├── 2.rnn-lstm-crf-luong.ipynb ├── 3.rnn-lstm-crf-bahdanau.ipynb ├── 4.rnn-lstm-crf-bahdanau-ngrams.ipynb ├── 5.rnn-lstm-crf-luong-ngrams.ipynb ├── 6.cnn-residual-bahdanau-ngrams.ipynb ├── 7.attention-is-all-you-need.ipynb ├── 8.bert.ipynb └── README.md ├── question-answer ├── 1.end-to-end-basic.ipynb ├── 2.end-to-end-gru.ipynb ├── 3.end-to-end-lstm.ipynb ├── 4.dynamic-memory-gru.ipynb ├── README.md ├── attention_gru.py ├── qa5_three-arg-relations_test.txt └── qa5_three-arg-relations_train.txt ├── requirements.txt ├── sentence-pair ├── Archive.zip ├── README.md └── bert.ipynb ├── speech-to-text ├── 1.tacotron.ipynb ├── 10.deep-speech2.ipynb ├── 11.wav2vec-transfer-learning-birnn-lstm-ctc.ipynb ├── 2.birnn-lstm-ctc-greedy.ipynb ├── 3.birnn-seq2seq-luong-cross-entropy.ipynb ├── 4.birnn-seq2seq-bahdanau-cross-entropy.ipynb ├── 5.birnn-seq2seq-bahdanau-ctc.ipynb ├── 6.birnn-seq2seq-luong-ctc.ipynb ├── 7.cnn-rnn-bahdanau.ipynb ├── 8.dilated-cnn-rnn.ipynb ├── 9.wavenet.ipynb ├── README.md ├── augmentation.py ├── caching.ipynb ├── download.ipynb ├── wav2vec-preprocessing.ipynb ├── wav2vec-pytorch.ipynb ├── wav2vec-tf.ipynb └── wav2vec.ipynb ├── spelling-correction ├── 1.bert-base.ipynb 
├── 2.xlnet-base.ipynb ├── 3.bert-base-fast.ipynb └── 4.bert-accurate.ipynb ├── squad-qa └── 1.bert.ipynb ├── stemming ├── 1.lstm-seq2seq-beam.ipynb ├── 2.gru-seq2seq-beam.ipynb ├── 3.lstm-birnn-seq2seq-greedy.ipynb ├── 4.gru-birnn-seq2seq-greedy.ipynb ├── 5.dnc-seq2seq-bahdanau-greedy.ipynb ├── 6.birnn-bahdanau-copynet.ipynb ├── README.md ├── access.py ├── addressing.py ├── dnc-seq2seq-bahdanau-greedy.ipynb ├── dnc.py ├── lemmatization-en.txt └── util.py ├── text-augmentation ├── 1.glove.ipynb ├── 2.gru-vae-beam-tfprob.ipynb ├── 3.lstm-vae-beam-tfprob.ipynb ├── 4.gru-vae-beam-bahdanau-tfprob.ipynb ├── 5.vae-deterministic-bahdanau.ipynb ├── 6.vae-varitional-bahdanau │ ├── attention_wrapper.py │ ├── basic_decoder.py │ ├── decoder.py │ └── vae-variational-bahdanau.ipynb ├── 7.bert-base.ipynb ├── 8.xlnet-augmentation.ipynb └── README.md ├── text-classification ├── 1.basic-rnn.ipynb ├── 10.lstm-rnn-bidirectional.ipynb ├── 11.lstm-rnn-bidirectional-huber.ipynb ├── 12.lstm-rnn-dropout-l2.ipynb ├── 13.gru-rnn.ipynb ├── 14.gru-rnn-hinge.ipynb ├── 15.gru-rnn-huber.ipynb ├── 16.gru-rnn-bidirectional.ipynb ├── 17.gru-rnn-bidirectional-hinge.ipynb ├── 18.gru-rnn-bidirectional-huber.ipynb ├── 19.lstm-cnn-rnn.ipynb ├── 2.basic-rnn-hinge.ipynb ├── 20.kmax-cnn.ipynb ├── 21.lstm-cnn-rnn-highway.ipynb ├── 22.lstm-rnn-attention.ipynb ├── 23.dilated-rnn-lstm.ipynb ├── 24.lnlstm-rnn.ipynb ├── 25.only-attention.ipynb ├── 26.multihead-attention.ipynb ├── 27.neural-turing-machine.ipynb ├── 28.lstm-seq2seq.ipynb ├── 29.lstm-seq2seq-luong.ipynb ├── 3.basic-rnn-huber.ipynb ├── 30.lstm-seq2seq-bahdanau.ipynb ├── 31.lstm-seq2seq-beam.ipynb ├── 32.lstm-seq2seq-birnn.ipynb ├── 33.pointer-net.ipynb ├── 34.lstm-rnn-bahdanau.ipynb ├── 35.lstm-rnn-luong.ipynb ├── 36.lstm-rnn-bahdanau-luong.ipynb ├── 37.lstm-birnn-bahdanau-luong.ipynb ├── 38.bytenet.ipynb ├── 39.fast-slow-lstm.ipynb ├── 4.basic-rnn-bidirectional.ipynb ├── 40.siamese-network.ipynb ├── 41.estimator.ipynb ├── 42.capsule-rnn-lstm.ipynb 
├── 43.capsule-seq2seq-lstm.ipynb ├── 44.capsule-birrn-seq2seq-lstm.ipynb ├── 45.nested-lstm.ipynb ├── 46.lstm-seq2seq-highway.ipynb ├── 47.triplet-loss-lstm.ipynb ├── 48.dnc.ipynb ├── 49.convlstm.ipynb ├── 5.basic-rnn-bidirectional-hinge.ipynb ├── 50.temporalconvd.ipynb ├── 51.batch-all-triplet-loss-lstm.ipynb ├── 52.fast-text.ipynb ├── 53.gated-convolution-network.ipynb ├── 54.simple-recurrent-units.ipynb ├── 55.lstm-han.ipynb ├── 56.bert.ipynb ├── 57.dynamic-memory-network.ipynb ├── 58.entity-network.ipynb ├── 59.memory-network.ipynb ├── 6.basic-rnn-bidirectional-huber.ipynb ├── 60.char-sparse.ipynb ├── 61.residual-network.ipynb ├── 62.residual-network-bahdanau.ipynb ├── 63.deep-pyramid-cnn.ipynb ├── 63.deep-pyramid-cnn.py ├── 64.transformer-xl.ipynb ├── 65.transfer-learning-gpt2.ipynb ├── 66.quasi-rnn.ipynb ├── 67.tacotron.ipynb ├── 68.slice-gru.ipynb ├── 69.slice-gru-bahdanau.ipynb ├── 7.lstm-rnn.ipynb ├── 70.wavenet.ipynb ├── 71.transfer-learning-bert-base.ipynb ├── 72.transfer-learning-xlnet-large.ipynb ├── 73.lstm-birnn-max-avg.ipynb ├── 74.transfer-learning-bert-base-6.ipynb ├── 75.transfer-learning-bert-large-12.ipynb ├── 76.transfer-learning-xlnet-base.ipynb ├── 77.transfer-learning-albert-base.ipynb ├── 78.electra-base.ipynb ├── 79.electra-large.ipynb ├── 8.lstm-rnn-hinge.ipynb ├── 9.lstm-rnn-huber.ipynb ├── README.md ├── bert_model.py ├── data.zip ├── data │ ├── negative │ │ └── negative │ └── positive │ │ └── positive ├── dynamic_memory_network.py ├── entity_network.py ├── gpt_2.py ├── modules.py ├── utils.py └── xl.py ├── text-similarity ├── 1.birnn-contrastive.ipynb ├── 10.xlnet-base-circle-loss.ipynb ├── 2.birnn-cross-entropy.ipynb ├── 3.birnn-circle-loss.ipynb ├── 4.birnn-proxy-anchor-loss.ipynb ├── 5.bert-base-cross-entropy.ipynb ├── 6.bert-base-circle-loss.ipynb ├── 7.electra-base-cross-entropy.ipynb ├── 8.electra-base-circle-loss.ipynb ├── 9.xlnet-base-cross-entropy.ipynb ├── README.md └── prepare-dataset.ipynb ├── text-to-speech ├── 1.tacotron 
│ ├── caching.py │ ├── tacotron.ipynb │ ├── tacotron.py │ ├── test-tacotron.wav │ └── utils.py ├── 2.fairseq-dilated-cnn.ipynb ├── 3.seq2seq-bahdanau.ipynb ├── 4.seq2seq-luong.ipynb ├── 5.dilated-cnn-monothonic-attention.ipynb ├── 6.dilated-cnn-self-attention.ipynb ├── 7.deep-cnn-monothonic-attention.ipynb ├── 8.deep-cnn-self-attention.ipynb ├── README.md ├── caching.py ├── download.ipynb ├── test-bahdanau.wav ├── test-dilated-cnn-monothonic-attention.wav ├── test-dilated-cnn-self-attention.wav ├── test-luong.wav └── utils.py ├── topic-generator ├── 1.tat.ipynb ├── 2.tav.ipynb ├── 3.mta.ipynb ├── 4.dilated-seq2seq.ipynb └── README.md ├── topic-model ├── 1.lda2vec.ipynb ├── 2.bert-topic.ipynb ├── 3.xlnet-topic.ipynb ├── modeling.py ├── prepro_utils.py ├── utils.py └── xlnet.py ├── unsupervised-extractive-summarization ├── 1.skip-thought.ipynb ├── 2.residual-network.ipynb ├── 3.residual-network-bahdanau.ipynb ├── README.md └── books │ ├── Blood_Born │ ├── Dark_Thirst │ └── Driftas_Quest ├── vectorizer ├── 1.cbow-softmax.ipynb ├── 10.fast-text.ipynb ├── 11.elmo.ipynb ├── 12.bert-batch-all-triplet-loss.ipynb ├── 2.cbow-nce.ipynb ├── 3.skipgram-softmax.ipynb ├── 4.skipgram-nce.ipynb ├── 5.lda2vec.ipynb ├── 6.supervised-embedded.ipynb ├── 7.triplet-loss.ipynb ├── 8.auto-encoder.ipynb ├── 9.batch-all-triplet-loss-lstm-embedded.ipynb ├── README.md ├── data │ ├── negative │ │ └── negative │ └── positive │ │ └── positive └── utils.py ├── visualization ├── 1.attention-visualization-bahdanau.ipynb ├── 2.attention-visualization-luong.ipynb ├── 3.bert-attention.ipynb ├── 4.xlnet-attention.ipynb └── 5.bert-topic.ipynb └── vocoder ├── 1.dilated-cnn.ipynb ├── README.md ├── caching-vocoder.ipynb └── download.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | *.DS_Store 2 | *__pycache__ 3 | *.ipynb_checkpoints 4 | movie_*.txt 5 | summarization/dataset 6 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 HUSEIN ZOLKEPLI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /abstractive-summarization/README.md: -------------------------------------------------------------------------------- 1 | ## How-to 2 | 3 | 1. Unzip [dataset.tar.gz](dataset.tar.gz) 4 | 5 | 2. Run any notebook using Jupyter Notebook. 
6 | -------------------------------------------------------------------------------- /abstractive-summarization/dataset.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mesolitica/NLP-Models-Tensorflow/0741216aa8235e1228b3de7903cc36d73f8f2b45/abstractive-summarization/dataset.tar.gz -------------------------------------------------------------------------------- /chatbot/README.md: -------------------------------------------------------------------------------- 1 | ## How-to 2 | 3 | 1. Unzip [dataset.tar.gz](dataset.tar.gz) 4 | 5 | 2. Run any notebook using Jupyter Notebook. 6 | 7 | ## Accuracy, not sorted 8 | 9 | Based on training accuracy for 20 epochs. 10 | 11 | | name | accuracy | 12 | |------------------------------------------------------------|----------| 13 | | 1.basic-seq2seq-manual | 0.816000 | 14 | | 2.lstm-seq2seq-manual | 0.735000 | 15 | | 3.gru-seq2seq-manual | 0.846833 | 16 | | 4.basic-seq2seq-api-greedy | 1.009119 | 17 | | 5.lstm-seq2seq-api-greedy | 0.984596 | 18 | | 6.gru-seq2seq-greedy | 1.008869 | 19 | | 7.basic-birnn-seq2seq-manual | 0.990333 | 20 | | 8.lstm-birnn-seq2seq-manual | 0.732833 | 21 | | 9.gru-birnn-seq2seq-manual | 0.936667 | 22 | | 10.basic-birnn-seq2seq-greedy | 1.009586 | 23 | | 11.lstm-birnn-seq2seq-greedy | 0.991938 | 24 | | 12.gru-birnn-seq2seq-greedy | 1.008791 | 25 | | 13.basic-seq2seq-luong | 0.821167 | 26 | | 14.lstm-seq2seq-luong | 0.723167 | 27 | | 15.gru-seq2seq-luong | 0.751667 | 28 | | 16.basic-seq2seq-bahdanau | 0.811833 | 29 | | 17.lstm-seq2seq-bahdanau | 0.721833 | 30 | | 18.gru-seq2seq-bahdanau | 0.728167 | 31 | | 19.lstm-birnn-seq2seq-luong | 0.728500 | 32 | | 20.gru-birnn-seq2seq-luong | 0.743833 | 33 | | 21.lstm-birnn-seq2seq-bahdanau | 0.718833 | 34 | | 22.gru-birnn-seq2seq-bahdanau | 0.746667 | 35 | | 23.lstm-birnn-seq2seq-bahdanau-luong | 0.721000 | 36 | | 24.gru-birnn-seq2seq-bahdanau-luong | 0.747667 | 37 | | 
25.lstm-seq2seq-greedy-luong | 0.974864 | 38 | | 26.gru-seq2seq-greedy-luong | 0.999175 | 39 | | 27.lstm-seq2seq-greedy-bahdanau | 0.987874 | 40 | | 28.gru-seq2seq-greedy-bahdanau | 1.000434 | 41 | | 29.lstm-seq2seq-beam | 0.874802 | 42 | | 30.gru-seq2seq-beam | 0.905397 | 43 | | 31.lstm-birnn-seq2seq-beam-luong | 0.913772 | 44 | | 32.gru-birnn-seq2seq-beam-luong | 0.856824 | 45 | | 33.lstm-birnn-seq2seq-luong-bahdanau-stack-beam | 0.732801 | 46 | | 34.gru-birnn-seq2seq-luong-bahdanau-stack-beam | 0.756537 | 47 | | 35.byte-net | 0.877510 | 48 | | 36.estimator | | 49 | | 37.capsule-lstm-seq2seq-greedy | 0.655007 | 50 | | 38.capsule-lstm-seq2seq-luong-beam | 0.275569 | 51 | | 39.lstm-birnn-seq2seq-luong-bahdanau-stack-beam-dropout-l2 | 0.312999 | 52 | | 40.dnc-seq2seq-bahdanau-greedy | 0.962712 | 53 | | 41.lstm-birnn-seq2seq-beam-luongmonotic | 0.917333 | 54 | | 42.lstm-birnn-seq2seq-beam-bahdanaumonotic | 0.929333 | 55 | | 43.memory-network-basic | 0.945333 | 56 | | 44.memory-network-lstm | 0.900000 | 57 | | 45.attention-is-all-you-need | 0.704549 | 58 | | 46.transformer-xl | 0.874486 | 59 | | 47.attention-is-all-you-need-beam-search | 0.836433 | 60 | | 48.transformer-xl-lstm | 0.826571 | 61 | | 49.gpt-2-lstm | 0.645157 | 62 | | 50.conv-encoder-conv-decoder | 0.518504 | 63 | | 51.conv-encoder-lstm | 0.924609 | 64 | | 52.tacotron-greedy | 0.876267 | 65 | | 53.tacotron-beam | 0.855140 | 66 | | 54.google-nmt | 1.006089 | 67 | -------------------------------------------------------------------------------- /chatbot/dataset.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mesolitica/NLP-Models-Tensorflow/0741216aa8235e1228b3de7903cc36d73f8f2b45/chatbot/dataset.tar.gz -------------------------------------------------------------------------------- /chatbot/dnc.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """DNC Cores. 16 | 17 | These modules create a DNC core. They take input, pass parameters to the memory 18 | access module, and integrate the output of memory to form an output. 19 | """ 20 | 21 | from __future__ import absolute_import 22 | from __future__ import division 23 | from __future__ import print_function 24 | 25 | import collections 26 | import numpy as np 27 | import sonnet as snt 28 | import tensorflow as tf 29 | 30 | import access 31 | 32 | DNCState = collections.namedtuple('DNCState', ('access_output', 'access_state', 33 | 'controller_state')) 34 | 35 | 36 | class DNC(snt.RNNCore): 37 | """DNC core module. 38 | 39 | Contains controller and memory access module. 40 | """ 41 | 42 | def __init__(self, 43 | access_config, 44 | controller_config, 45 | output_size, 46 | clip_value=None, 47 | name='dnc'): 48 | """Initializes the DNC core. 49 | 50 | Args: 51 | access_config: dictionary of access module configurations. 52 | controller_config: dictionary of controller (LSTM) module configurations. 53 | output_size: output dimension size of core. 54 | clip_value: clips controller and core output values to between 55 | `[-clip_value, clip_value]` if specified. 56 | name: module name (default 'dnc'). 
57 | 58 | Raises: 59 | TypeError: if direct_input_size is not None for any access module other 60 | than KeyValueMemory. 61 | """ 62 | super(DNC, self).__init__(name=name) 63 | 64 | with self._enter_variable_scope(): 65 | self._controller = snt.LSTM(**controller_config) 66 | self._access = access.MemoryAccess(**access_config) 67 | 68 | self._access_output_size = np.prod(self._access.output_size.as_list()) 69 | self._output_size = output_size 70 | self._clip_value = clip_value or 0 71 | 72 | self._output_size = tf.TensorShape([output_size]) 73 | self._state_size = DNCState( 74 | access_output=self._access_output_size, 75 | access_state=self._access.state_size, 76 | controller_state=self._controller.state_size) 77 | 78 | def _clip_if_enabled(self, x): 79 | if self._clip_value > 0: 80 | return tf.clip_by_value(x, -self._clip_value, self._clip_value) 81 | else: 82 | return x 83 | 84 | def _build(self, inputs, prev_state): 85 | """Connects the DNC core into the graph. 86 | 87 | Args: 88 | inputs: Tensor input. 89 | prev_state: A `DNCState` tuple containing the fields `access_output`, 90 | `access_state` and `controller_state`. `access_state` is a 3-D Tensor 91 | of shape `[batch_size, num_reads, word_size]` containing read words. 92 | `access_state` is a tuple of the access module's state, and 93 | `controller_state` is a tuple of controller module's state. 94 | 95 | Returns: 96 | A tuple `(output, next_state)` where `output` is a tensor and `next_state` 97 | is a `DNCState` tuple containing the fields `access_output`, 98 | `access_state`, and `controller_state`. 
99 | """ 100 | 101 | prev_access_output = prev_state.access_output 102 | prev_access_state = prev_state.access_state 103 | prev_controller_state = prev_state.controller_state 104 | 105 | batch_flatten = snt.BatchFlatten() 106 | controller_input = tf.concat( 107 | [batch_flatten(inputs), batch_flatten(prev_access_output)], 1) 108 | 109 | controller_output, controller_state = self._controller( 110 | controller_input, prev_controller_state) 111 | 112 | controller_output = self._clip_if_enabled(controller_output) 113 | controller_state = snt.nest.map(self._clip_if_enabled, controller_state) 114 | 115 | access_output, access_state = self._access(controller_output, 116 | prev_access_state) 117 | 118 | output = tf.concat([controller_output, batch_flatten(access_output)], 1) 119 | output = snt.Linear( 120 | output_size=self._output_size.as_list()[0], 121 | name='output_linear')(output) 122 | output = self._clip_if_enabled(output) 123 | 124 | return output, DNCState( 125 | access_output=access_output, 126 | access_state=access_state, 127 | controller_state=controller_state) 128 | 129 | def initial_state(self, batch_size, dtype=tf.float32): 130 | return DNCState( 131 | controller_state=self._controller.initial_state(batch_size, dtype), 132 | access_state=self._access.initial_state(batch_size, dtype), 133 | access_output=tf.zeros( 134 | [batch_size] + self._access.output_size.as_list(), dtype)) 135 | 136 | @property 137 | def state_size(self): 138 | return self._state_size 139 | 140 | @property 141 | def output_size(self): 142 | return self._output_size 143 | -------------------------------------------------------------------------------- /chatbot/gpt_2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | 5 | def shape_list(x): 6 | """Deal with dynamic shape in tensorflow cleanly.""" 7 | static = x.shape.as_list() 8 | dynamic = tf.shape(x) 9 | return [dynamic[i] if s is None else s for 
i, s in enumerate(static)] 10 | 11 | 12 | def softmax(x, axis = -1): 13 | x = x - tf.reduce_max(x, axis = axis, keepdims = True) 14 | ex = tf.exp(x) 15 | return ex / tf.reduce_sum(ex, axis = axis, keepdims = True) 16 | 17 | 18 | def gelu(x): 19 | return ( 20 | 0.5 21 | * x 22 | * (1 + tf.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))) 23 | ) 24 | 25 | 26 | def norm(x, scope, *, axis = -1, epsilon = 1e-5): 27 | """Normalize to mean = 0, std = 1, then do a diagonal affine transform.""" 28 | with tf.variable_scope(scope): 29 | n_state = x.shape[-1].value 30 | g = tf.get_variable( 31 | 'g', [n_state], initializer = tf.constant_initializer(1) 32 | ) 33 | b = tf.get_variable( 34 | 'b', [n_state], initializer = tf.constant_initializer(0) 35 | ) 36 | u = tf.reduce_mean(x, axis = axis, keepdims = True) 37 | s = tf.reduce_mean(tf.square(x - u), axis = axis, keepdims = True) 38 | x = (x - u) * tf.rsqrt(s + epsilon) 39 | x = x * g + b 40 | return x 41 | 42 | 43 | def split_states(x, n): 44 | """Reshape the last dimension of x into [n, x.shape[-1]/n].""" 45 | *start, m = shape_list(x) 46 | return tf.reshape(x, start + [n, m // n]) 47 | 48 | 49 | def merge_states(x): 50 | """Smash the last two dimensions of x into a single dimension.""" 51 | *start, a, b = shape_list(x) 52 | return tf.reshape(x, start + [a * b]) 53 | 54 | 55 | def conv1d(x, scope, nf, *, w_init_stdev = 0.02): 56 | with tf.variable_scope(scope): 57 | *start, nx = shape_list(x) 58 | w = tf.get_variable( 59 | 'w', 60 | [1, nx, nf], 61 | initializer = tf.random_normal_initializer(stddev = w_init_stdev), 62 | ) 63 | b = tf.get_variable('b', [nf], initializer = tf.constant_initializer(0)) 64 | c = tf.reshape( 65 | tf.matmul(tf.reshape(x, [-1, nx]), tf.reshape(w, [-1, nf])) + b, 66 | start + [nf], 67 | ) 68 | return c 69 | 70 | 71 | def attention_mask(nd, ns, *, dtype): 72 | """1's in the lower triangle, counting from the lower right corner. 
73 | 74 | Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs. 75 | """ 76 | i = tf.range(nd)[:, None] 77 | j = tf.range(ns) 78 | m = i >= j - ns + nd 79 | return tf.cast(m, dtype) 80 | 81 | 82 | def attn(x, scope, n_state, *, past, hparams): 83 | assert x.shape.ndims == 3 # Should be [batch, sequence, features] 84 | assert n_state % hparams.n_head == 0 85 | if past is not None: 86 | assert ( 87 | past.shape.ndims == 5 88 | ) # Should be [batch, 2, heads, sequence, features], where 2 is [k, v] 89 | 90 | def split_heads(x): 91 | # From [batch, sequence, features] to [batch, heads, sequence, features] 92 | return tf.transpose(split_states(x, hparams.n_head), [0, 2, 1, 3]) 93 | 94 | def merge_heads(x): 95 | # Reverse of split_heads 96 | return merge_states(tf.transpose(x, [0, 2, 1, 3])) 97 | 98 | def mask_attn_weights(w): 99 | # w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst. 100 | _, _, nd, ns = shape_list(w) 101 | b = attention_mask(nd, ns, dtype = w.dtype) 102 | b = tf.reshape(b, [1, 1, nd, ns]) 103 | w = w * b - tf.cast(1e10, w.dtype) * (1 - b) 104 | return w 105 | 106 | def multihead_attn(q, k, v): 107 | # q, k, v have shape [batch, heads, sequence, features] 108 | w = tf.matmul(q, k, transpose_b = True) 109 | w = w * tf.rsqrt(tf.cast(v.shape[-1].value, w.dtype)) 110 | 111 | w = mask_attn_weights(w) 112 | w = softmax(w) 113 | a = tf.matmul(w, v) 114 | return a 115 | 116 | with tf.variable_scope(scope): 117 | c = conv1d(x, 'c_attn', n_state * 3) 118 | q, k, v = map(split_heads, tf.split(c, 3, axis = 2)) 119 | present = tf.stack([k, v], axis = 1) 120 | if past is not None: 121 | pk, pv = tf.unstack(past, axis = 1) 122 | k = tf.concat([pk, k], axis = -2) 123 | v = tf.concat([pv, v], axis = -2) 124 | a = multihead_attn(q, k, v) 125 | a = merge_heads(a) 126 | a = conv1d(a, 'c_proj', n_state) 127 | return a, present 128 | 129 | 130 | def mlp(x, scope, n_state, *, hparams): 131 | 
with tf.variable_scope(scope): 132 | nx = x.shape[-1].value 133 | h = gelu(conv1d(x, 'c_fc', n_state)) 134 | h2 = conv1d(h, 'c_proj', nx) 135 | return h2 136 | 137 | 138 | def block(x, scope, *, past, hparams): 139 | with tf.variable_scope(scope): 140 | nx = x.shape[-1].value 141 | a, present = attn( 142 | norm(x, 'ln_1'), 'attn', nx, past = past, hparams = hparams 143 | ) 144 | x = x + a 145 | m = mlp(norm(x, 'ln_2'), 'mlp', nx * 4, hparams = hparams) 146 | x = x + m 147 | return x, present 148 | 149 | 150 | def past_shape(*, hparams, batch_size = None, sequence = None): 151 | return [ 152 | batch_size, 153 | hparams.n_layer, 154 | 2, 155 | hparams.n_head, 156 | sequence, 157 | hparams.n_embd // hparams.n_head, 158 | ] 159 | 160 | 161 | def expand_tile(value, size): 162 | """Add a new axis of given size.""" 163 | value = tf.convert_to_tensor(value, name = 'value') 164 | ndims = value.shape.ndims 165 | return tf.tile(tf.expand_dims(value, axis = 0), [size] + [1] * ndims) 166 | 167 | 168 | def positions_for(tokens, past_length): 169 | batch_size = tf.shape(tokens)[0] 170 | nsteps = tf.shape(tokens)[1] 171 | return expand_tile(past_length + tf.range(nsteps), batch_size) 172 | 173 | 174 | def model(hparams, X, past = None, scope = 'model', reuse = False): 175 | with tf.variable_scope(scope, reuse = reuse): 176 | results = {} 177 | batch, sequence = shape_list(X) 178 | 179 | wpe = tf.get_variable( 180 | 'wpe', 181 | [hparams.n_ctx, hparams.n_embd], 182 | initializer = tf.random_normal_initializer(stddev = 0.01), 183 | ) 184 | wte = tf.get_variable( 185 | 'wte', 186 | [hparams.n_vocab, hparams.n_embd], 187 | initializer = tf.random_normal_initializer(stddev = 0.02), 188 | ) 189 | past_length = 0 if past is None else tf.shape(past)[-2] 190 | h = tf.gather(wte, X) + tf.gather(wpe, positions_for(X, past_length)) 191 | 192 | # Transformer 193 | presents = [] 194 | pasts = ( 195 | tf.unstack(past, axis = 1) 196 | if past is not None 197 | else [None] * hparams.n_layer 198 | 
) 199 | assert len(pasts) == hparams.n_layer 200 | for layer, past in enumerate(pasts): 201 | h, present = block(h, 'h%d' % layer, past = past, hparams = hparams) 202 | presents.append(present) 203 | results['present'] = tf.stack(presents, axis = 1) 204 | h = norm(h, 'ln_f') 205 | 206 | # Language model loss. Do tokens CURRENT_ACC:\n", 178 | " print('epoch:', EPOCH, ', pass acc:', CURRENT_ACC, ', current acc:', test_acc)\n", 179 | " CURRENT_ACC = test_acc\n", 180 | " CURRENT_CHECKPOINT = 0\n", 181 | " saver.save(sess, os.getcwd() + \"/model-rnn-vector.ckpt\")\n", 182 | " else:\n", 183 | " CURRENT_CHECKPOINT += 1\n", 184 | " EPOCH += 1\n", 185 | " print('time taken:', time.time()-lasttime)\n", 186 | " print('epoch:', EPOCH, ', training loss:', train_loss, ', training acc:', train_acc, ', valid loss:', test_loss, ', valid acc:', test_acc)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [] 195 | } 196 | ], 197 | "metadata": { 198 | "kernelspec": { 199 | "display_name": "Python 3", 200 | "language": "python", 201 | "name": "python3" 202 | }, 203 | "language_info": { 204 | "codemirror_mode": { 205 | "name": "ipython", 206 | "version": 3 207 | }, 208 | "file_extension": ".py", 209 | "mimetype": "text/x-python", 210 | "name": "python", 211 | "nbconvert_exporter": "python", 212 | "pygments_lexer": "ipython3", 213 | "version": "3.5.2" 214 | } 215 | }, 216 | "nbformat": 4, 217 | "nbformat_minor": 2 218 | } 219 | -------------------------------------------------------------------------------- /classification-comparison/LGB/lgb-tfidf-svd50.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# GPU Light gradient boosting trained on TF-IDF reduced 50 dimensions\n", 8 | "\n", 9 | "1. 
Same emotion dataset from [NLP-dataset](https://github.com/huseinzol05/NLP-Dataset)\n", 10 | "2. Same splitting 80% training, 20% testing, may vary depends on randomness\n", 11 | "3. Same regex substitution '[^\\\"\\'A-Za-z0-9 ]+'\n", 12 | "\n", 13 | "## Example\n", 14 | "\n", 15 | "Based on Term-frequency Inverse document frequency\n", 16 | "\n", 17 | "After that we apply SVD to reduce the dimensions, n_components = 50" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 8, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "import lightgbm as lgb\n", 27 | "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n", 28 | "from sklearn.decomposition import TruncatedSVD\n", 29 | "import numpy as np\n", 30 | "import re\n", 31 | "import time\n", 32 | "from sklearn.cross_validation import train_test_split\n", 33 | "import sklearn.datasets\n", 34 | "from sklearn import pipeline\n", 35 | "from sklearn.model_selection import StratifiedKFold" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": { 42 | "collapsed": true 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "def clearstring(string):\n", 47 | " string = re.sub('[^\\\"\\'A-Za-z0-9 ]+', '', string)\n", 48 | " string = string.split(' ')\n", 49 | " string = filter(None, string)\n", 50 | " string = [y.strip() for y in string]\n", 51 | " string = ' '.join(string)\n", 52 | " return string\n", 53 | "\n", 54 | "# because of sklean.datasets read a document as a single element\n", 55 | "# so we want to split based on new line\n", 56 | "def separate_dataset(trainset):\n", 57 | " datastring = []\n", 58 | " datatarget = []\n", 59 | " for i in range(len(trainset.data)):\n", 60 | " data_ = trainset.data[i].split('\\n')\n", 61 | " # python3, if python2, just remove list()\n", 62 | " data_ = list(filter(None, data_))\n", 63 | " for n in range(len(data_)):\n", 64 | " data_[n] = clearstring(data_[n])\n", 65 | " datastring += data_\n", 66 | " 
for n in range(len(data_)):\n", 67 | " datatarget.append(trainset.target[i])\n", 68 | " return datastring, datatarget" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 3, 74 | "metadata": { 75 | "collapsed": true 76 | }, 77 | "outputs": [], 78 | "source": [ 79 | "trainset_data = sklearn.datasets.load_files(container_path = 'data', encoding = 'UTF-8')\n", 80 | "trainset_data.data, trainset_data.target = separate_dataset(trainset_data)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 4, 86 | "metadata": { 87 | "collapsed": true 88 | }, 89 | "outputs": [], 90 | "source": [ 91 | "train_X, test_X, train_Y, test_Y = train_test_split(trainset_data.data, trainset_data.target, test_size = 0.2)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 5, 97 | "metadata": { 98 | "collapsed": true 99 | }, 100 | "outputs": [], 101 | "source": [ 102 | "decompose = pipeline.Pipeline([('count', TfidfVectorizer()),\n", 103 | " ('svd', TruncatedSVD(n_components=50))]).fit(trainset_data.data)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 6, 109 | "metadata": { 110 | "collapsed": true 111 | }, 112 | "outputs": [], 113 | "source": [ 114 | "params_lgb = {\n", 115 | " 'max_depth': 27, \n", 116 | " 'learning_rate': 0.03,\n", 117 | " 'verbose': 50, \n", 118 | " 'early_stopping_round': 200,\n", 119 | " 'metric': 'multi_logloss',\n", 120 | " 'objective': 'multiclass',\n", 121 | " 'num_classes': len(trainset_data.target_names),\n", 122 | " 'device': 'gpu',\n", 123 | " 'gpu_platform_id': 0,\n", 124 | " 'gpu_device_id': 0\n", 125 | " }" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 10, 131 | "metadata": { 132 | "collapsed": true 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "train_X = decompose.transform(train_X)\n", 137 | "test_X = decompose.transform(test_X)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 11, 143 | "metadata": {}, 
144 | "outputs": [ 145 | { 146 | "name": "stdout", 147 | "output_type": "stream", 148 | "text": [ 149 | "Training until validation scores don't improve for 200 rounds.\n", 150 | "[100]\ttraining's multi_logloss: 1.51641\tvalid_1's multi_logloss: 1.52991\n", 151 | "[200]\ttraining's multi_logloss: 1.48039\tvalid_1's multi_logloss: 1.50404\n", 152 | "[300]\ttraining's multi_logloss: 1.46016\tvalid_1's multi_logloss: 1.49379\n", 153 | "[400]\ttraining's multi_logloss: 1.44402\tvalid_1's multi_logloss: 1.48755\n", 154 | "[500]\ttraining's multi_logloss: 1.43032\tvalid_1's multi_logloss: 1.4837\n", 155 | "[600]\ttraining's multi_logloss: 1.41806\tvalid_1's multi_logloss: 1.4811\n", 156 | "[700]\ttraining's multi_logloss: 1.40679\tvalid_1's multi_logloss: 1.4791\n", 157 | "[800]\ttraining's multi_logloss: 1.39626\tvalid_1's multi_logloss: 1.47765\n", 158 | "[900]\ttraining's multi_logloss: 1.38603\tvalid_1's multi_logloss: 1.4765\n", 159 | "[1000]\ttraining's multi_logloss: 1.37627\tvalid_1's multi_logloss: 1.47559\n", 160 | "[1100]\ttraining's multi_logloss: 1.36678\tvalid_1's multi_logloss: 1.47482\n", 161 | "[1200]\ttraining's multi_logloss: 1.35761\tvalid_1's multi_logloss: 1.4741\n", 162 | "[1300]\ttraining's multi_logloss: 1.34862\tvalid_1's multi_logloss: 1.47349\n", 163 | "[1400]\ttraining's multi_logloss: 1.33981\tvalid_1's multi_logloss: 1.47288\n", 164 | "[1500]\ttraining's multi_logloss: 1.33125\tvalid_1's multi_logloss: 1.47229\n", 165 | "[1600]\ttraining's multi_logloss: 1.32281\tvalid_1's multi_logloss: 1.47181\n", 166 | "[1700]\ttraining's multi_logloss: 1.31465\tvalid_1's multi_logloss: 1.47146\n", 167 | "[1800]\ttraining's multi_logloss: 1.30664\tvalid_1's multi_logloss: 1.47115\n", 168 | "[1900]\ttraining's multi_logloss: 1.29872\tvalid_1's multi_logloss: 1.47091\n", 169 | "[2000]\ttraining's multi_logloss: 1.29104\tvalid_1's multi_logloss: 1.47071\n", 170 | "[2100]\ttraining's multi_logloss: 1.28331\tvalid_1's multi_logloss: 1.47047\n", 171 | 
"[2200]\ttraining's multi_logloss: 1.2759\tvalid_1's multi_logloss: 1.47042\n", 172 | "[2300]\ttraining's multi_logloss: 1.26851\tvalid_1's multi_logloss: 1.47032\n", 173 | "[2400]\ttraining's multi_logloss: 1.26119\tvalid_1's multi_logloss: 1.47017\n", 174 | "[2500]\ttraining's multi_logloss: 1.25404\tvalid_1's multi_logloss: 1.47011\n", 175 | "[2600]\ttraining's multi_logloss: 1.247\tvalid_1's multi_logloss: 1.47003\n", 176 | "[2700]\ttraining's multi_logloss: 1.24004\tvalid_1's multi_logloss: 1.46998\n", 177 | "[2800]\ttraining's multi_logloss: 1.23314\tvalid_1's multi_logloss: 1.46996\n", 178 | "[2900]\ttraining's multi_logloss: 1.22632\tvalid_1's multi_logloss: 1.46997\n", 179 | "[3000]\ttraining's multi_logloss: 1.21957\tvalid_1's multi_logloss: 1.46994\n", 180 | "Early stopping, best iteration is:\n", 181 | "[2849]\ttraining's multi_logloss: 1.22982\tvalid_1's multi_logloss: 1.46992\n", 182 | "415.922 Seconds to train lgb\n" 183 | ] 184 | } 185 | ], 186 | "source": [ 187 | "d_train = lgb.Dataset(train_X, train_Y)\n", 188 | "d_valid = lgb.Dataset(test_X, test_Y)\n", 189 | "watchlist = [d_train, d_valid]\n", 190 | "t=time.time()\n", 191 | "clf = lgb.train(params_lgb, d_train, 100000, watchlist, early_stopping_rounds=200, verbose_eval=100)\n", 192 | "print(round(time.time()-t, 3), 'Seconds to train lgb')" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 12, 198 | "metadata": {}, 199 | "outputs": [ 200 | { 201 | "name": "stdout", 202 | "output_type": "stream", 203 | "text": [ 204 | " precision recall f1-score support\n", 205 | "\n", 206 | " anger 0.38 0.05 0.09 11460\n", 207 | " fear 0.32 0.06 0.10 9545\n", 208 | " joy 0.44 0.73 0.55 28052\n", 209 | " love 0.17 0.01 0.02 7015\n", 210 | " sadness 0.39 0.54 0.45 24291\n", 211 | " surprise 0.09 0.01 0.01 2999\n", 212 | "\n", 213 | "avg / total 0.37 0.42 0.34 83362\n", 214 | "\n" 215 | ] 216 | } 217 | ], 218 | "source": [ 219 | "from sklearn import metrics\n", 220 | 
"print(metrics.classification_report(test_Y, np.argmax(clf.predict(test_X), axis = 1), target_names = trainset_data.target_names))" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 13, 226 | "metadata": { 227 | "collapsed": true 228 | }, 229 | "outputs": [], 230 | "source": [ 231 | "clf.save_model('lgb-tfidf-svd50.model')" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": { 238 | "collapsed": true 239 | }, 240 | "outputs": [], 241 | "source": [] 242 | } 243 | ], 244 | "metadata": { 245 | "kernelspec": { 246 | "display_name": "Python 3", 247 | "language": "python", 248 | "name": "python3" 249 | }, 250 | "language_info": { 251 | "codemirror_mode": { 252 | "name": "ipython", 253 | "version": 3 254 | }, 255 | "file_extension": ".py", 256 | "mimetype": "text/x-python", 257 | "name": "python", 258 | "nbconvert_exporter": "python", 259 | "pygments_lexer": "ipython3", 260 | "version": "3.5.2" 261 | } 262 | }, 263 | "nbformat": 4, 264 | "nbformat_minor": 2 265 | } 266 | -------------------------------------------------------------------------------- /classification-comparison/preparation/dictionary_emotion.p: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mesolitica/NLP-Models-Tensorflow/0741216aa8235e1228b3de7903cc36d73f8f2b45/classification-comparison/preparation/dictionary_emotion.p -------------------------------------------------------------------------------- /classification-comparison/preparation/prepare-dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 6, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import os\n", 11 | "import re\n", 12 | "import pickle" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 7, 18 | "metadata": {}, 19 | 
"outputs": [], 20 | "source": [ 21 | "def clearstring(string):\n", 22 | " string = re.sub('[^\\'\\\"A-Za-z0-9 ]+', '', string)\n", 23 | " string = string.split(' ')\n", 24 | " string = filter(None, string)\n", 25 | " string = [y.strip() for y in string]\n", 26 | " string = [y for y in string if len(y) > 3 and y.find('nbsp') < 0]\n", 27 | " return ' '.join(string)\n", 28 | "\n", 29 | "def read_data(location):\n", 30 | " list_folder = os.listdir(location)\n", 31 | " label = list_folder\n", 32 | " label.sort()\n", 33 | " outer_string, outer_label = [], []\n", 34 | " for i in range(len(list_folder)):\n", 35 | " list_file = os.listdir('data/' + list_folder[i])\n", 36 | " strings = []\n", 37 | " for x in range(len(list_file)):\n", 38 | " with open('data/' + list_folder[i] + '/' + list_file[x], 'r') as fopen:\n", 39 | " strings += fopen.read().split('\\n')\n", 40 | " strings = list(filter(None, strings))\n", 41 | " for k in range(len(strings)):\n", 42 | " strings[k] = clearstring(strings[k])\n", 43 | " labels = [i] * len(strings)\n", 44 | " outer_string += strings\n", 45 | " outer_label += labels\n", 46 | " \n", 47 | " dataset = np.array([outer_string, outer_label])\n", 48 | " dataset = dataset.T\n", 49 | " np.random.shuffle(dataset)\n", 50 | " \n", 51 | " return dataset" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 8, 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "data": { 61 | "text/plain": [ 62 | "array([[ 'woked feelin pain mood been tarified caused cant stnd hater could this life being hated throught whole career dont dont care thing thoe finna break that feel about babi carroline caused heart soul feelin real',\n", 63 | " '4'],\n", 64 | " [ 'couldnt stop feeling threatened cards grandmother kept sending with cash inside them',\n", 65 | " '1'],\n", 66 | " ['feel chronically defeated satisfied', '4'],\n", 67 | " [ 'apologize anyone feels offended these remarks certainly welcome your opinions',\n", 68 | " '0'],\n", 69 | " [ 'feel like 
every romantic movie there dorky best friend that desperately love with beautiful leading girl',\n", 70 | " '3']],\n", 71 | " dtype=' 3 and y.find('nbsp') < 0]\n", 28 | " return ' '.join(string)\n", 29 | "\n", 30 | "def read_data(location):\n", 31 | " list_folder = os.listdir(location)\n", 32 | " label = list_folder\n", 33 | " label.sort()\n", 34 | " outer_string, outer_label = [], []\n", 35 | " for i in range(len(list_folder)):\n", 36 | " list_file = os.listdir('data/' + list_folder[i])\n", 37 | " strings = []\n", 38 | " for x in range(len(list_file)):\n", 39 | " with open('data/' + list_folder[i] + '/' + list_file[x], 'r') as fopen:\n", 40 | " strings += fopen.read().split('\\n')\n", 41 | " strings = list(filter(None, strings))\n", 42 | " for k in range(len(strings)):\n", 43 | " strings[k] = clearstring(strings[k])\n", 44 | " labels = [i] * len(strings)\n", 45 | " outer_string += strings\n", 46 | " outer_label += labels\n", 47 | " \n", 48 | " dataset = np.array([outer_string, outer_label])\n", 49 | " dataset = dataset.T\n", 50 | " np.random.shuffle(dataset)\n", 51 | " \n", 52 | " string = []\n", 53 | " for i in range(dataset.shape[0]):\n", 54 | " string += dataset[i][0].split()\n", 55 | " \n", 56 | " return string\n", 57 | "\n", 58 | "def build_vocab(words, n_words):\n", 59 | " count = [['UNK', -1]]\n", 60 | " count.extend(collections.Counter(words).most_common(n_words - 1))\n", 61 | " dictionary = dict()\n", 62 | " for word, _ in count:\n", 63 | " dictionary[word] = len(dictionary)\n", 64 | " data = list()\n", 65 | " unk_count = 0\n", 66 | " for word in words:\n", 67 | " index = dictionary.get(word, 0)\n", 68 | " if index == 0: # dictionary['UNK']\n", 69 | " unk_count += 1\n", 70 | " data.append(index)\n", 71 | " count[0][1] = unk_count\n", 72 | " reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))\n", 73 | " return data, count, dictionary, reversed_dictionary" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 9, 79 | 
"metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "strings = read_data('data')" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 10, 88 | "metadata": {}, 89 | "outputs": [ 90 | { 91 | "data": { 92 | "text/plain": [ 93 | "['left', 'feeling', 'very', 'jealous', 'feel']" 94 | ] 95 | }, 96 | "execution_count": 10, 97 | "metadata": {}, 98 | "output_type": "execute_result" 99 | } 100 | ], 101 | "source": [ 102 | "strings[:5]" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 12, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "n_words = len(set(strings))\n", 112 | "_,_,dictionary,reversed_dictionary = build_vocab(strings,n_words)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "with open('dataset-dictionary.p', 'wb') as fopen:\n", 122 | " pickle.dump(reversed_dictionary, fopen)\n", 123 | "with open('dataset-dictionary-reverse.p', 'wb') as fopen:\n", 124 | " pickle.dump(dictionary, fopen)" 125 | ] 126 | } 127 | ], 128 | "metadata": { 129 | "kernelspec": { 130 | "display_name": "Python 3", 131 | "language": "python", 132 | "name": "python3" 133 | }, 134 | "language_info": { 135 | "codemirror_mode": { 136 | "name": "ipython", 137 | "version": 3 138 | }, 139 | "file_extension": ".py", 140 | "mimetype": "text/x-python", 141 | "name": "python", 142 | "nbconvert_exporter": "python", 143 | "pygments_lexer": "ipython3", 144 | "version": "3.5.2" 145 | } 146 | }, 147 | "nbformat": 4, 148 | "nbformat_minor": 2 149 | } 150 | -------------------------------------------------------------------------------- /dependency-parser/README.md: -------------------------------------------------------------------------------- 1 | ## How-to 2 | 3 | 1. Run any notebook using Jupyter Notebook. 
4 | -------------------------------------------------------------------------------- /entity-tagging/README.md: -------------------------------------------------------------------------------- 1 | ## How-to 2 | 3 | 1. Run any notebook using Jupyter Notebook. 4 | -------------------------------------------------------------------------------- /extractive-summarization/download-data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# !pip3 install googledrivedownloader" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "from google_drive_downloader import GoogleDriveDownloader as gdd\n", 19 | "\n", 20 | "id = '0BwmD_VLjROrfTHk4NFg2SndKcjQ'\n", 21 | "gdd.download_file_from_google_drive(file_id=id, dest_path='./cnn.tgz')" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "!tar -zxf cnn.tgz" 31 | ] 32 | } 33 | ], 34 | "metadata": { 35 | "kernelspec": { 36 | "display_name": "Python 3", 37 | "language": "python", 38 | "name": "python3" 39 | }, 40 | "language_info": { 41 | "codemirror_mode": { 42 | "name": "ipython", 43 | "version": 3 44 | }, 45 | "file_extension": ".py", 46 | "mimetype": "text/x-python", 47 | "name": "python", 48 | "nbconvert_exporter": "python", 49 | "pygments_lexer": "ipython3", 50 | "version": "3.6.8" 51 | } 52 | }, 53 | "nbformat": 4, 54 | "nbformat_minor": 2 55 | } 56 | -------------------------------------------------------------------------------- /generator/README.md: -------------------------------------------------------------------------------- 1 | ## How-to 2 | 3 | 1. Run any notebook using Jupyter Notebook. 
4 | -------------------------------------------------------------------------------- /language-detection/README.md: -------------------------------------------------------------------------------- 1 | ## How-to 2 | 3 | 1. You need to download and process dataset first, 4 | ```bash 5 | wget http://downloads.tatoeba.org/exports/sentences.tar.bz2 6 | bunzip2 sentences.tar.bz2 7 | tar xvf sentences.tar 8 | ``` 9 | 10 | 2. Change to csv, 11 | ```bash 12 | awk -F"\t" '{print"__label__"$2" "$3}' < sentences.csv | shuf > all.txt 13 | ``` 14 | 15 | 3. Run any notebook using Jupyter Notebook. 16 | -------------------------------------------------------------------------------- /neural-machine-translation/README.md: -------------------------------------------------------------------------------- 1 | ## how-to 2 | 3 | 1. run [prepare-dataset.ipynb](prepare-dataset.ipynb). 4 | 2. run [prepare-bpe.ipynb](prepare-bpe.ipynb). 5 | 3. run [prepare-t2t.ipynb](prepare-t2t.ipynb). 6 | 7 | ## Notes 8 | 9 | 1. First 200k Trainset to train, validation and test set to test. 10 | 2. Based on 20 epochs. 11 | 3. Accuracy based on BLEU. 12 | 4. RNN and Transformer parameters are not consistent. 13 | 14 | For RNN, 15 | 16 | ```python 17 | size_layer = 512 18 | num_layers = 2 19 | ``` 20 | 21 | For Transformer, we use BASE parameter from Tensor2Tensor. 22 | 23 | Here we never tested what happened to RNN based models if we increase number of layers and size of layers same as Transformer BASE parameter. 24 | 25 | 5. Batch size not consistent, most of the models used 128 batch size. 
26 | 27 | ## Accuracy, not sorted 28 | 29 | | notebook | BLEU | 30 | |--------------------------------------------------------------|---------------| 31 | | 1.basic-seq2seq.ipynb | 6.319555e-05 | 32 | | 2.lstm-seq2seq.ipynb | 0.016924812 | 33 | | 3.gru-seq2seq.ipynb | 0.0094467895 | 34 | | 4.basic-seq2seq-contrib-greedy.ipynb | 0.005418866 | 35 | | 5.lstm-seq2seq-contrib-greedy.ipynb | | 36 | | 6.gru-seq2seq-contrib-greedy.ipynb | 0.051461186 | 37 | | 7.basic-birnn-seq2seq.ipynb | 6.319555e-05 | 38 | | 8.lstm-birnn-seq2seq.ipynb | 0.012854616 | 39 | | 9.gru-birnn-seq2seq.ipynb | 0.0095551545 | 40 | | 10.basic-birnn-seq2seq-contrib-greedy.ipynb | 0.019748569 | 41 | | 11.lstm-birnn-seq2seq-contrib-greedy.ipynb | 0.052993 | 42 | | 12.gru-birnn-seq2seq-contrib-greedy.ipynb | 0.047413725 | 43 | | 13.basic-seq2seq-luong.ipynb | 8.97118e-05 | 44 | | 14.lstm-seq2seq-luong.ipynb | 0.053475615 | 45 | | 15.gru-seq2seq-luong.ipynb | 0.01888038 | 46 | | 16.basic-seq2seq-bahdanau.ipynb | 0.00020161743 | 47 | | 17.lstm-seq2seq-bahdanau.ipynb | 0.048261568 | 48 | | 18.gru-seq2seq-bahdanau.ipynb | 0.025584696 | 49 | | 19.basic-birnn-seq2seq-bahdanau.ipynb | 0.00020161743 | 50 | | 20.lstm-birnn-seq2seq-bahdanau.ipynb | 0.054097746 | 51 | | 21.gru-birnn-seq2seq-bahdanau.ipynb | 0.00020161743 | 52 | | 22.basic-birnn-seq2seq-luong.ipynb | | 53 | | 23.lstm-birnn-seq2seq-luong.ipynb | 0.05320787 | 54 | | 24.gru-birnn-seq2seq-luong.ipynb | 0.027758315 | 55 | | 25.lstm-seq2seq-contrib-greedy-luong.ipynb | 0.15195806 | 56 | | 26.gru-seq2seq-contrib-greedy-luong.ipynb | 0.101576895 | 57 | | 27.lstm-seq2seq-contrib-greedy-bahdanau.ipynb | 0.15275387 | 58 | | 28.gru-seq2seq-contrib-greedy-bahdanau.ipynb | 0.13868862 | 59 | | 29.lstm-seq2seq-contrib-beam-luong.ipynb | 0.17535137 | 60 | | 30.gru-seq2seq-contrib-beam-luong.ipynb | 0.003980886 | 61 | | 31.lstm-seq2seq-contrib-beam-bahdanau.ipynb | 0.17929372 | 62 | | 32.gru-seq2seq-contrib-beam-bahdanau.ipynb | 0.1767827 | 63 | | 
33.lstm-birnn-seq2seq-contrib-beam-bahdanau.ipynb | 0.19480321 | 64 | | 34.lstm-birnn-seq2seq-contrib-beam-luong.ipynb | 0.20042004 | 65 | | 35.gru-birnn-seq2seq-contrib-beam-bahdanau.ipynb | 0.1784567 | 66 | | 36.gru-birnn-seq2seq-contrib-beam-luong.ipynb | 0.0557322 | 67 | | 37.lstm-birnn-seq2seq-contrib-beam-luongmonotonic.ipynb | 0.06368613 | 68 | | 38.gru-birnn-seq2seq-contrib-beam-luongmonotic.ipynb | 0.06407658 | 69 | | 39.lstm-birnn-seq2seq-contrib-beam-bahdanaumonotonic.ipynb | 0.17586066 | 70 | | 40.gru-birnn-seq2seq-contrib-beam-bahdanaumonotic.ipynb | 0.065290846 | 71 | | 41.residual-lstm-seq2seq-greedy-luong.ipynb | 0.1475228 | 72 | | 42.residual-gru-seq2seq-greedy-luong.ipynb | 5.0574585e-05 | 73 | | 43.residual-lstm-seq2seq-greedy-bahdanau.ipynb | 0.15493448 | 74 | | 44.residual-gru-seq2seq-greedy-bahdanau.ipynb | | 75 | | 45.memory-network-lstm-decoder-greedy.ipynb | | 76 | | 46.google-nmt.ipynb | 0.055380445 | 77 | | 47.transformer-encoder-transformer-decoder.ipynb | 0.17100729 | 78 | | 48.transformer-encoder-lstm-decoder-greedy.ipynb | 0.049064703 | 79 | | 49.bertmultilanguage-encoder-bertmultilanguage-decoder.ipynb | 0.37003958 | 80 | | 50.bertmultilanguage-encoder-lstm-decoder.ipynb | 0.11384286 | 81 | | 51.bertmultilanguage-encoder-transformer-decoder.ipynb | 0.3941662 | 82 | | 52.bertenglish-encoder-transformer-decoder.ipynb | 0.23225775 | 83 | | 53.transformer-t2t-2gpu.ipynb | 0.36773485 | -------------------------------------------------------------------------------- /neural-machine-translation/electra/model/optimization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Google Research Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """Functions and classes related to optimization (weight updates). 17 | Modified from the original BERT code to allow for having separate learning 18 | rates for different layers of the network. 19 | """ 20 | 21 | from __future__ import absolute_import 22 | from __future__ import division 23 | from __future__ import print_function 24 | 25 | import collections 26 | import re 27 | import tensorflow.compat.v1 as tf 28 | 29 | 30 | def create_optimizer( 31 | loss, 32 | learning_rate, 33 | num_train_steps, 34 | weight_decay_rate = 0.0, 35 | use_tpu = False, 36 | warmup_steps = 0, 37 | warmup_proportion = 0, 38 | lr_decay_power = 1.0, 39 | layerwise_lr_decay_power = -1, 40 | n_transformer_layers = None, 41 | decoder_layers = None, 42 | ): 43 | """Creates an optimizer and training op.""" 44 | global_step = tf.train.get_or_create_global_step() 45 | learning_rate = tf.train.polynomial_decay( 46 | learning_rate, 47 | global_step, 48 | num_train_steps, 49 | end_learning_rate = 0.0, 50 | power = lr_decay_power, 51 | cycle = False, 52 | ) 53 | warmup_steps = max(num_train_steps * warmup_proportion, warmup_steps) 54 | learning_rate *= tf.minimum( 55 | 1.0, 56 | tf.cast(global_step, tf.float32) / tf.cast(warmup_steps, tf.float32), 57 | ) 58 | cp_learning_rate = learning_rate 59 | 60 | if layerwise_lr_decay_power > 0: 61 | learning_rate = _get_layer_lrs( 62 | learning_rate, 63 | layerwise_lr_decay_power, 64 | n_transformer_layers, 65 | decoder_layers, 66 | ) 67 | learning_rate['embedding_shared_weights/'] = cp_learning_rate 68 | 
learning_rate['decoder_stack/layer_normalization/'] = cp_learning_rate 69 | print(learning_rate) 70 | optimizer = AdamWeightDecayOptimizer( 71 | learning_rate = learning_rate, 72 | weight_decay_rate = weight_decay_rate, 73 | beta_1 = 0.9, 74 | beta_2 = 0.999, 75 | epsilon = 1e-6, 76 | exclude_from_weight_decay = ['LayerNorm', 'layer_norm', 'bias'], 77 | ) 78 | if use_tpu: 79 | optimizer = tf.tpu.CrossShardOptimizer(optimizer) 80 | 81 | tvars = tf.trainable_variables() 82 | grads = tf.gradients(loss, tvars) 83 | (grads, _) = tf.clip_by_global_norm(grads, clip_norm = 1.0) 84 | train_op = optimizer.apply_gradients( 85 | zip(grads, tvars), global_step = global_step 86 | ) 87 | new_global_step = global_step + 1 88 | train_op = tf.group(train_op, [global_step.assign(new_global_step)]) 89 | return train_op 90 | 91 | 92 | class AdamWeightDecayOptimizer(tf.train.Optimizer): 93 | """A basic Adam optimizer that includes "correct" L2 weight decay.""" 94 | 95 | def __init__( 96 | self, 97 | learning_rate, 98 | weight_decay_rate = 0.0, 99 | beta_1 = 0.9, 100 | beta_2 = 0.999, 101 | epsilon = 1e-6, 102 | exclude_from_weight_decay = None, 103 | name = 'AdamWeightDecayOptimizer', 104 | ): 105 | """Constructs a AdamWeightDecayOptimizer.""" 106 | super(AdamWeightDecayOptimizer, self).__init__(False, name) 107 | 108 | self.learning_rate = learning_rate 109 | self.weight_decay_rate = weight_decay_rate 110 | self.beta_1 = beta_1 111 | self.beta_2 = beta_2 112 | self.epsilon = epsilon 113 | self.exclude_from_weight_decay = exclude_from_weight_decay 114 | 115 | def _apply_gradients(self, grads_and_vars, learning_rate): 116 | """See base class.""" 117 | assignments = [] 118 | for (grad, param) in grads_and_vars: 119 | if grad is None or param is None: 120 | continue 121 | 122 | param_name = self._get_variable_name(param.name) 123 | 124 | m = tf.get_variable( 125 | name = param_name + '/adam_m', 126 | shape = param.shape.as_list(), 127 | dtype = tf.float32, 128 | trainable = False, 129 | 
initializer = tf.zeros_initializer(), 130 | ) 131 | v = tf.get_variable( 132 | name = param_name + '/adam_v', 133 | shape = param.shape.as_list(), 134 | dtype = tf.float32, 135 | trainable = False, 136 | initializer = tf.zeros_initializer(), 137 | ) 138 | 139 | # Standard Adam update. 140 | next_m = tf.multiply(self.beta_1, m) + tf.multiply( 141 | 1.0 - self.beta_1, grad 142 | ) 143 | next_v = tf.multiply(self.beta_2, v) + tf.multiply( 144 | 1.0 - self.beta_2, tf.square(grad) 145 | ) 146 | update = next_m / (tf.sqrt(next_v) + self.epsilon) 147 | 148 | # Just adding the square of the weights to the loss function is *not* 149 | # the correct way of using L2 regularization/weight decay with Adam, 150 | # since that will interact with the m and v parameters in strange ways. 151 | # 152 | # Instead we want ot decay the weights in a manner that doesn't interact 153 | # with the m/v parameters. This is equivalent to adding the square 154 | # of the weights to the loss with plain (non-momentum) SGD. 
155 | if self.weight_decay_rate > 0: 156 | if self._do_use_weight_decay(param_name): 157 | update += self.weight_decay_rate * param 158 | 159 | update_with_lr = learning_rate * update 160 | next_param = param - update_with_lr 161 | 162 | assignments.extend( 163 | [param.assign(next_param), m.assign(next_m), v.assign(next_v)] 164 | ) 165 | 166 | return assignments 167 | 168 | def apply_gradients(self, grads_and_vars, global_step = None, name = None): 169 | if isinstance(self.learning_rate, dict): 170 | key_to_grads_and_vars = {} 171 | for grad, var in grads_and_vars: 172 | update_for_var = False 173 | for key in self.learning_rate: 174 | if key in var.name: 175 | update_for_var = True 176 | if key not in key_to_grads_and_vars: 177 | key_to_grads_and_vars[key] = [] 178 | key_to_grads_and_vars[key].append((grad, var)) 179 | if not update_for_var: 180 | raise ValueError( 181 | 'No learning rate specified for variable', var 182 | ) 183 | assignments = [] 184 | for key, key_grads_and_vars in key_to_grads_and_vars.items(): 185 | assignments += self._apply_gradients( 186 | key_grads_and_vars, self.learning_rate[key] 187 | ) 188 | else: 189 | assignments = self._apply_gradients( 190 | grads_and_vars, self.learning_rate 191 | ) 192 | return tf.group(*assignments, name = name) 193 | 194 | def _do_use_weight_decay(self, param_name): 195 | """Whether to use L2 weight decay for `param_name`.""" 196 | if not self.weight_decay_rate: 197 | return False 198 | if self.exclude_from_weight_decay: 199 | for r in self.exclude_from_weight_decay: 200 | if re.search(r, param_name) is not None: 201 | return False 202 | return True 203 | 204 | def _get_variable_name(self, param_name): 205 | """Get the variable name from the tensor name.""" 206 | m = re.match('^(.*):\\d+$', param_name) 207 | if m is not None: 208 | param_name = m.group(1) 209 | return param_name 210 | 211 | 212 | def _get_layer_lrs(learning_rate, layer_decay, n_layers, decoder_layers): 213 | """Have lower learning rates for 
layers closer to the input.""" 214 | key_to_depths = collections.OrderedDict( 215 | { 216 | '/embeddings/': 0, 217 | '/embeddings_project/': 0, 218 | 'task_specific/': n_layers + 2, 219 | } 220 | ) 221 | for layer in range(n_layers): 222 | key_to_depths['encoder/layer_' + str(layer) + '/'] = layer + 1 223 | for layer in range(decoder_layers): 224 | key_to_depths['decoder_stack/layer_' + str(layer) + '/'] = layer + 1 225 | return { 226 | key: learning_rate * (layer_decay ** (n_layers + 2 - depth)) 227 | for key, depth in key_to_depths.items() 228 | } 229 | -------------------------------------------------------------------------------- /neural-machine-translation/prepare-bpe.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import json\n", 10 | "\n", 11 | "with open('dataset.json') as fopen:\n", 12 | " dataset = json.load(fopen)\n", 13 | " \n", 14 | "X = dataset['train_X'] + dataset['test_X']\n", 15 | "Y = dataset['train_Y'] + dataset['test_Y']" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 2, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import youtokentome as yttm" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 5, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "with open('text.txt', 'w') as fopen:\n", 34 | " fopen.write('\\n'.join(X + Y))\n", 35 | " \n", 36 | "bpe = yttm.BPE.train(data='text.txt', vocab_size=32000, model='bpe.model',\n", 37 | " pad_id=0, unk_id=2, bos_id=3, eos_id=1)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 8, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "train_X = bpe.encode(dataset['train_X'], output_type=yttm.OutputType.ID)\n", 47 | "train_Y = bpe.encode(dataset['train_Y'], output_type=yttm.OutputType.ID)\n", 48 | "test_X = 
bpe.encode(dataset['test_X'], output_type=yttm.OutputType.ID)\n", 49 | "test_Y = bpe.encode(dataset['test_Y'], output_type=yttm.OutputType.ID)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 10, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "with open('dataset-bpe.json', 'w') as fopen:\n", 59 | " json.dump({'train_X': train_X, 'train_Y': train_Y, 'test_X': test_X, 'test_Y': test_Y}, fopen)" 60 | ] 61 | } 62 | ], 63 | "metadata": { 64 | "kernelspec": { 65 | "display_name": "Python 3", 66 | "language": "python", 67 | "name": "python3" 68 | }, 69 | "language_info": { 70 | "codemirror_mode": { 71 | "name": "ipython", 72 | "version": 3 73 | }, 74 | "file_extension": ".py", 75 | "mimetype": "text/x-python", 76 | "name": "python", 77 | "nbconvert_exporter": "python", 78 | "pygments_lexer": "ipython3", 79 | "version": "3.6.8" 80 | } 81 | }, 82 | "nbformat": 4, 83 | "nbformat_minor": 2 84 | } 85 | -------------------------------------------------------------------------------- /neural-machine-translation/prepare-dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# !wget https://s3.amazonaws.com/opennmt-trainingdata/baseline-1M-enfr.tgz\n", 10 | "# !tar -zxf baseline-1M-enfr.tgz" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [ 18 | { 19 | "data": { 20 | "text/plain": [ 21 | "['baseline-1M-enfr/baseline-1M_test.en',\n", 22 | " 'baseline-1M-enfr/baseline-1M_valid.en',\n", 23 | " 'baseline-1M-enfr/baseline-1M_train.fr',\n", 24 | " 'baseline-1M-enfr/baseline-1M_valid.fr',\n", 25 | " 'baseline-1M-enfr/baseline-1M_train.en',\n", 26 | " 'baseline-1M-enfr/baseline-1M_test.fr']" 27 | ] 28 | }, 29 | "execution_count": 2, 30 | "metadata": {}, 31 | "output_type": "execute_result" 32 | } 33 | ], 34 | 
"source": [ 35 | "from glob import glob\n", 36 | "\n", 37 | "files = glob('baseline-1M-enfr/*')\n", 38 | "files" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 3, 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "data": { 48 | "text/plain": [ 49 | "(1009163, 1009163)" 50 | ] 51 | }, 52 | "execution_count": 3, 53 | "metadata": {}, 54 | "output_type": "execute_result" 55 | } 56 | ], 57 | "source": [ 58 | "with open('baseline-1M-enfr/baseline-1M_train.en') as fopen:\n", 59 | " train_en = fopen.read().split('\\n')[:-1]\n", 60 | " \n", 61 | "with open('baseline-1M-enfr/baseline-1M_train.fr') as fopen:\n", 62 | " train_fr = fopen.read().split('\\n')[:-1]\n", 63 | " \n", 64 | "len(train_en), len(train_fr)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 4, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "data": { 74 | "text/plain": [ 75 | "(1000, 1000)" 76 | ] 77 | }, 78 | "execution_count": 4, 79 | "metadata": {}, 80 | "output_type": "execute_result" 81 | } 82 | ], 83 | "source": [ 84 | "with open('baseline-1M-enfr/baseline-1M_test.en') as fopen:\n", 85 | " test_en = fopen.read().split('\\n')[:-1]\n", 86 | " \n", 87 | "with open('baseline-1M-enfr/baseline-1M_test.fr') as fopen:\n", 88 | " test_fr = fopen.read().split('\\n')[:-1]\n", 89 | " \n", 90 | "len(test_en), len(test_fr)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 5, 96 | "metadata": {}, 97 | "outputs": [ 98 | { 99 | "data": { 100 | "text/plain": [ 101 | "(2000, 2000)" 102 | ] 103 | }, 104 | "execution_count": 5, 105 | "metadata": {}, 106 | "output_type": "execute_result" 107 | } 108 | ], 109 | "source": [ 110 | "with open('baseline-1M-enfr/baseline-1M_valid.en') as fopen:\n", 111 | " test_en.extend(fopen.read().split('\\n')[:-1])\n", 112 | " \n", 113 | "with open('baseline-1M-enfr/baseline-1M_valid.fr') as fopen:\n", 114 | " test_fr.extend(fopen.read().split('\\n')[:-1])\n", 115 | " \n", 116 | "len(test_en), len(test_fr)" 117 | ] 
118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 8, 122 | "metadata": {}, 123 | "outputs": [ 124 | { 125 | "name": "stderr", 126 | "output_type": "stream", 127 | "text": [ 128 | "100%|██████████| 1009163/1009163 [00:03<00:00, 301686.48it/s]\n" 129 | ] 130 | }, 131 | { 132 | "data": { 133 | "text/plain": [ 134 | "(1009088, 1009088)" 135 | ] 136 | }, 137 | "execution_count": 8, 138 | "metadata": {}, 139 | "output_type": "execute_result" 140 | } 141 | ], 142 | "source": [ 143 | "from tqdm import tqdm\n", 144 | "\n", 145 | "train_X, train_Y = [], []\n", 146 | "\n", 147 | "for i in tqdm(range(len(train_en))):\n", 148 | " if len(train_en[i].split()) > 100 or len(train_fr[i].split()) > 100:\n", 149 | " continue\n", 150 | " train_X.append(train_en[i])\n", 151 | " train_Y.append(train_fr[i])\n", 152 | " \n", 153 | "len(train_X), len(train_Y)" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 9, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "test_X, test_Y = train_X[-5000:], train_Y[-5000:]\n", 163 | "train_X, train_Y = train_X[:200000], train_Y[:200000]" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 10, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "import json\n", 173 | "\n", 174 | "with open('dataset.json', 'w') as fopen:\n", 175 | " json.dump({'train_X': train_X, 'train_Y': train_Y, 'test_X': test_X, 'test_Y': test_Y}, fopen)" 176 | ] 177 | } 178 | ], 179 | "metadata": { 180 | "kernelspec": { 181 | "display_name": "Python 3", 182 | "language": "python", 183 | "name": "python3" 184 | }, 185 | "language_info": { 186 | "codemirror_mode": { 187 | "name": "ipython", 188 | "version": 3 189 | }, 190 | "file_extension": ".py", 191 | "mimetype": "text/x-python", 192 | "name": "python", 193 | "nbconvert_exporter": "python", 194 | "pygments_lexer": "ipython3", 195 | "version": "3.6.8" 196 | } 197 | }, 198 | "nbformat": 4, 199 | "nbformat_minor": 2 200 | } 201 | 
-------------------------------------------------------------------------------- /neural-machine-translation/prepare-t2t.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "!mkdir train\n", 10 | "!mkdir test" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "import json\n", 20 | "\n", 21 | "with open('dataset.json') as fopen:\n", 22 | " data = json.load(fopen)\n", 23 | " \n", 24 | "train_X = data['train_X']\n", 25 | "train_Y = data['train_Y']\n", 26 | "test_X = data['test_X']\n", 27 | "test_Y = data['test_Y']" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 3, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "with open('train/before.txt', 'w') as fopen:\n", 37 | " fopen.write('\\n'.join(train_X))\n", 38 | " \n", 39 | "with open('train/after.txt', 'w') as fopen:\n", 40 | " fopen.write('\\n'.join(train_Y))\n", 41 | " \n", 42 | "with open('test/before.txt', 'w') as fopen:\n", 43 | " fopen.write('\\n'.join(test_X))\n", 44 | " \n", 45 | "with open('test/after.txt', 'w') as fopen:\n", 46 | " fopen.write('\\n'.join(test_Y))" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 4, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "name": "stdout", 56 | "output_type": "stream", 57 | "text": [ 58 | "train/\n", 59 | "train/after.txt\n", 60 | "train/before.txt\n", 61 | "test/\n", 62 | "test/after.txt\n", 63 | "test/before.txt\n" 64 | ] 65 | } 66 | ], 67 | "source": [ 68 | "!tar -czvf train-translation.tar.gz train\n", 69 | "!tar -czvf test-translation.tar.gz test" 70 | ] 71 | } 72 | ], 73 | "metadata": { 74 | "kernelspec": { 75 | "display_name": "Python 3", 76 | "language": "python", 77 | "name": "python3" 78 | }, 79 | "language_info": { 80 | "codemirror_mode": 
{ 81 | "name": "ipython", 82 | "version": 3 83 | }, 84 | "file_extension": ".py", 85 | "mimetype": "text/x-python", 86 | "name": "python", 87 | "nbconvert_exporter": "python", 88 | "pygments_lexer": "ipython3", 89 | "version": "3.6.8" 90 | } 91 | }, 92 | "nbformat": 4, 93 | "nbformat_minor": 2 94 | } 95 | -------------------------------------------------------------------------------- /neural-machine-translation/t/tokenizer.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Tensor2Tensor Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """A simple invertible tokenizer. 17 | 18 | Converts from a unicode string to a list of tokens 19 | (represented as Unicode strings). 20 | 21 | This tokenizer has the following desirable properties: 22 | - It is invertible. 23 | - Alphanumeric characters are broken away from non-alphanumeric characters. 24 | - A single space between words does not produce an extra token. 25 | - The full Unicode punctuation and separator set is recognized. 26 | 27 | The tokenization algorithm is as follows: 28 | 29 | 1. Split the text into a list of tokens, splitting at every boundary of an 30 | alphanumeric character and a non-alphanumeric character. 
This produces 31 | a list which alternates between "alphanumeric tokens" 32 | (strings of alphanumeric characters) and "non-alphanumeric tokens" 33 | (strings of non-alphanumeric characters). 34 | 35 | 2. Remove every token consisting of a single space, unless it is 36 | the very first or very last token in the list. These tokens are now 37 | implied by the fact that there are two adjacent alphanumeric tokens. 38 | 39 | e.g. u"Dude - that's so cool." 40 | -> [u"Dude", u" - ", u"that", u"'", u"s", u"so", u"cool", u"."] 41 | """ 42 | 43 | from __future__ import absolute_import 44 | from __future__ import division 45 | from __future__ import print_function 46 | 47 | import collections 48 | import sys 49 | import unicodedata 50 | import six 51 | from six.moves import range # pylint: disable=redefined-builtin 52 | import tensorflow.compat.v1 as tf 53 | 54 | # Conversion between Unicode and UTF-8, if required (on Python2) 55 | _native_to_unicode = (lambda s: s.decode('utf-8')) if six.PY2 else (lambda s: s) 56 | 57 | 58 | # This set contains all letter and number characters. 59 | _ALPHANUMERIC_CHAR_SET = set( 60 | six.unichr(i) 61 | for i in range(sys.maxunicode) 62 | if ( 63 | unicodedata.category(six.unichr(i)).startswith('L') 64 | or unicodedata.category(six.unichr(i)).startswith('N') 65 | ) 66 | ) 67 | 68 | 69 | def encode(text): 70 | """Encode a unicode string as a list of tokens. 
71 | 72 | Args: 73 | text: a unicode string 74 | Returns: 75 | a list of tokens as Unicode strings 76 | """ 77 | if not text: 78 | return [] 79 | ret = [] 80 | token_start = 0 81 | # Classify each character in the input string 82 | is_alnum = [c in _ALPHANUMERIC_CHAR_SET for c in text] 83 | for pos in range(1, len(text)): 84 | if is_alnum[pos] != is_alnum[pos - 1]: 85 | token = text[token_start:pos] 86 | if token != u' ' or token_start == 0: 87 | ret.append(token) 88 | token_start = pos 89 | final_token = text[token_start:] 90 | ret.append(final_token) 91 | return ret 92 | 93 | 94 | def decode(tokens): 95 | """Decode a list of tokens to a unicode string. 96 | 97 | Args: 98 | tokens: a list of Unicode strings 99 | Returns: 100 | a unicode string 101 | """ 102 | token_is_alnum = [t[0] in _ALPHANUMERIC_CHAR_SET for t in tokens] 103 | ret = [] 104 | for i, token in enumerate(tokens): 105 | if i > 0 and token_is_alnum[i - 1] and token_is_alnum[i]: 106 | ret.append(u' ') 107 | ret.append(token) 108 | return ''.join(ret) 109 | 110 | 111 | def _read_filepattern(filepattern, max_lines = None, split_on_newlines = True): 112 | """Reads files matching a wildcard pattern, yielding the contents. 113 | 114 | Args: 115 | filepattern: A wildcard pattern matching one or more files. 116 | max_lines: If set, stop reading after reading this many lines. 117 | split_on_newlines: A boolean. If true, then split files by lines and strip 118 | leading and trailing whitespace from each line. Otherwise, treat each 119 | file as a single string. 120 | 121 | Yields: 122 | The contents of the files as lines, if split_on_newlines is True, or 123 | the entire contents of each file if False. 
124 | """ 125 | filenames = sorted(tf.gfile.Glob(filepattern)) 126 | lines_read = 0 127 | for filename in filenames: 128 | with tf.gfile.Open(filename) as f: 129 | if split_on_newlines: 130 | for line in f: 131 | yield line.strip() 132 | lines_read += 1 133 | if max_lines and lines_read >= max_lines: 134 | return 135 | 136 | else: 137 | if max_lines: 138 | doc = [] 139 | for line in f: 140 | doc.append(line) 141 | lines_read += 1 142 | if max_lines and lines_read >= max_lines: 143 | yield ''.join(doc) 144 | return 145 | yield ''.join(doc) 146 | 147 | else: 148 | yield f.read() 149 | 150 | 151 | def vocab_token_counts(text_filepattern, max_lines): 152 | """Read a vocab file and return a dictionary of token counts. 153 | 154 | Reads a two-column CSV file of tokens and their frequency in a dataset. The 155 | tokens are presumed to be generated by encode() or the equivalent. 156 | 157 | Args: 158 | text_filepattern: A pattern matching one or more files. 159 | max_lines: An integer; maximum total lines to read. 160 | 161 | Returns: 162 | a dictionary mapping token to count. 163 | """ 164 | ret = {} 165 | for i, line in enumerate( 166 | _read_filepattern(text_filepattern, max_lines = max_lines) 167 | ): 168 | if ',' not in line: 169 | tf.logging.warning("Malformed vocab line #%d '%s'", i, line) 170 | continue 171 | 172 | token, count = line.rsplit(',', 1) 173 | ret[_native_to_unicode(token)] = int(count) 174 | 175 | return ret 176 | -------------------------------------------------------------------------------- /neural-machine-translation/transformer/attention_layer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Implementation of multiheaded attention and self-attention layers.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import tensorflow as tf 22 | 23 | 24 | class Attention(tf.layers.Layer): 25 | """Multi-headed attention layer.""" 26 | 27 | def __init__(self, hidden_size, num_heads, attention_dropout, train): 28 | if hidden_size % num_heads != 0: 29 | raise ValueError( 30 | 'Hidden size must be evenly divisible by the number of ' 31 | 'heads.' 32 | ) 33 | 34 | super(Attention, self).__init__() 35 | self.hidden_size = hidden_size 36 | self.num_heads = num_heads 37 | self.attention_dropout = attention_dropout 38 | self.train = train 39 | 40 | # Layers for linearly projecting the queries, keys, and values. 41 | self.q_dense_layer = tf.layers.Dense( 42 | hidden_size, use_bias = False, name = 'q' 43 | ) 44 | self.k_dense_layer = tf.layers.Dense( 45 | hidden_size, use_bias = False, name = 'k' 46 | ) 47 | self.v_dense_layer = tf.layers.Dense( 48 | hidden_size, use_bias = False, name = 'v' 49 | ) 50 | 51 | self.output_dense_layer = tf.layers.Dense( 52 | hidden_size, use_bias = False, name = 'output_transform' 53 | ) 54 | 55 | def split_heads(self, x): 56 | """Split x into different heads, and transpose the resulting value. 57 | 58 | The tensor is transposed to insure the inner dimensions hold the correct 59 | values during the matrix multiplication. 
60 | 61 | Args: 62 | x: A tensor with shape [batch_size, length, hidden_size] 63 | 64 | Returns: 65 | A tensor with shape [batch_size, num_heads, length, hidden_size/num_heads] 66 | """ 67 | with tf.name_scope('split_heads'): 68 | batch_size = tf.shape(x)[0] 69 | length = tf.shape(x)[1] 70 | 71 | # Calculate depth of last dimension after it has been split. 72 | depth = self.hidden_size // self.num_heads 73 | 74 | # Split the last dimension 75 | x = tf.reshape(x, [batch_size, length, self.num_heads, depth]) 76 | 77 | # Transpose the result 78 | return tf.transpose(x, [0, 2, 1, 3]) 79 | 80 | def combine_heads(self, x): 81 | """Combine tensor that has been split. 82 | 83 | Args: 84 | x: A tensor [batch_size, num_heads, length, hidden_size/num_heads] 85 | 86 | Returns: 87 | A tensor with shape [batch_size, length, hidden_size] 88 | """ 89 | with tf.name_scope('combine_heads'): 90 | batch_size = tf.shape(x)[0] 91 | length = tf.shape(x)[2] 92 | x = tf.transpose( 93 | x, [0, 2, 1, 3] 94 | ) # --> [batch, length, num_heads, depth] 95 | return tf.reshape(x, [batch_size, length, self.hidden_size]) 96 | 97 | def call(self, x, y, bias, cache = None): 98 | """Apply attention mechanism to x and y. 99 | 100 | Args: 101 | x: a tensor with shape [batch_size, length_x, hidden_size] 102 | y: a tensor with shape [batch_size, length_y, hidden_size] 103 | bias: attention bias that will be added to the result of the dot product. 104 | cache: (Used during prediction) dictionary with tensors containing results 105 | of previous attentions. The dictionary must have the items: 106 | {"k": tensor with shape [batch_size, i, key_channels], 107 | "v": tensor with shape [batch_size, i, value_channels]} 108 | where i is the current decoded length. 109 | 110 | Returns: 111 | Attention layer output with shape [batch_size, length_x, hidden_size] 112 | """ 113 | # Linearly project the query (q), key (k) and value (v) using different 114 | # learned projections. 
This is in preparation of splitting them into 115 | # multiple heads. Multi-head attention uses multiple queries, keys, and 116 | # values rather than regular attention (which uses a single q, k, v). 117 | q = self.q_dense_layer(x) 118 | k = self.k_dense_layer(y) 119 | v = self.v_dense_layer(y) 120 | 121 | if cache is not None: 122 | # Combine cached keys and values with new keys and values. 123 | k = tf.concat([cache['k'], k], axis = 1) 124 | v = tf.concat([cache['v'], v], axis = 1) 125 | 126 | # Update cache 127 | cache['k'] = k 128 | cache['v'] = v 129 | 130 | # Split q, k, v into heads. 131 | q = self.split_heads(q) 132 | k = self.split_heads(k) 133 | v = self.split_heads(v) 134 | 135 | # Scale q to prevent the dot product between q and k from growing too large. 136 | depth = self.hidden_size // self.num_heads 137 | q *= depth ** -0.5 138 | 139 | # Calculate dot product attention 140 | logits = tf.matmul(q, k, transpose_b = True) 141 | logits += bias 142 | weights = tf.nn.softmax(logits, name = 'attention_weights') 143 | if self.train: 144 | weights = tf.nn.dropout(weights, 1.0 - self.attention_dropout) 145 | attention_output = tf.matmul(weights, v) 146 | 147 | # Recombine heads --> [batch_size, length, hidden_size] 148 | attention_output = self.combine_heads(attention_output) 149 | 150 | # Run the combined outputs through another linear projection layer. 151 | attention_output = self.output_dense_layer(attention_output) 152 | return attention_output 153 | 154 | 155 | class SelfAttention(Attention): 156 | """Multiheaded self-attention layer.""" 157 | 158 | def call(self, x, bias, cache = None): 159 | return super(SelfAttention, self).call(x, x, bias, cache) 160 | -------------------------------------------------------------------------------- /neural-machine-translation/transformer/embedding_layer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Implementation of embedding layer with shared weights.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import tensorflow as tf # pylint: disable=g-bad-import-order 22 | 23 | 24 | class EmbeddingSharedWeights(tf.layers.Layer): 25 | """Calculates input embeddings and pre-softmax linear with shared weights.""" 26 | 27 | def __init__(self, vocab_size, hidden_size, method = 'gather'): 28 | """Specify characteristic parameters of embedding layer. 29 | 30 | Args: 31 | vocab_size: Number of tokens in the embedding. (Typically ~32,000) 32 | hidden_size: Dimensionality of the embedding. (Typically 512 or 1024) 33 | method: Strategy for performing embedding lookup. "gather" uses tf.gather 34 | which performs well on CPUs and GPUs, but very poorly on TPUs. "matmul" 35 | one-hot encodes the indices and formulates the embedding as a sparse 36 | matrix multiplication. The matmul formulation is wasteful as it does 37 | extra work, however matrix multiplication is very fast on TPUs which 38 | makes "matmul" considerably faster than "gather" on TPUs.
39 | """ 40 | super(EmbeddingSharedWeights, self).__init__() 41 | self.vocab_size = vocab_size 42 | self.hidden_size = hidden_size 43 | if method not in ('gather', 'matmul'): 44 | raise ValueError( 45 | "method {} must be 'gather' or 'matmul'".format(method) 46 | ) 47 | self.method = method 48 | 49 | def build(self, _): 50 | with tf.variable_scope('embedding_and_softmax', reuse = tf.AUTO_REUSE): 51 | # Create and initialize weights. The random normal initializer was chosen 52 | # randomly, and works well. 53 | self.shared_weights = tf.get_variable( 54 | 'weights', 55 | [self.vocab_size, self.hidden_size], 56 | initializer = tf.random_normal_initializer( 57 | 0.0, self.hidden_size ** -0.5 58 | ), 59 | ) 60 | 61 | self.built = True 62 | 63 | def call(self, x): 64 | """Get token embeddings of x. 65 | 66 | Args: 67 | x: An int64 tensor with shape [batch_size, length] 68 | Returns: 69 | embeddings: float32 tensor with shape [batch_size, length, embedding_size] 70 | padding: float32 tensor with shape [batch_size, length] indicating the 71 | locations of the padding tokens in x. 72 | """ 73 | with tf.name_scope('embedding'): 74 | # Create binary mask of size [batch_size, length] 75 | mask = tf.to_float(tf.not_equal(x, 0)) 76 | 77 | if self.method == 'gather': 78 | embeddings = tf.gather(self.shared_weights, x) 79 | embeddings *= tf.expand_dims(mask, -1) 80 | else: # matmul 81 | embeddings = tpu_utils.embedding_matmul( 82 | embedding_table = self.shared_weights, 83 | values = tf.cast(x, dtype = tf.int32), 84 | mask = mask, 85 | ) 86 | # embedding_matmul already zeros out masked positions, so 87 | # `embeddings *= tf.expand_dims(mask, -1)` is unnecessary. 88 | 89 | # Scale embedding by the sqrt of the hidden size 90 | embeddings *= self.hidden_size ** 0.5 91 | 92 | return embeddings 93 | 94 | def linear(self, x): 95 | """Computes logits by running x through a linear layer. 
96 | 97 | Args: 98 | x: A float32 tensor with shape [batch_size, length, hidden_size] 99 | Returns: 100 | float32 tensor with shape [batch_size, length, vocab_size]. 101 | """ 102 | with tf.name_scope('presoftmax_linear'): 103 | batch_size = tf.shape(x)[0] 104 | length = tf.shape(x)[1] 105 | 106 | x = tf.reshape(x, [-1, self.hidden_size]) 107 | logits = tf.matmul(x, self.shared_weights, transpose_b = True) 108 | 109 | return tf.reshape(logits, [batch_size, length, self.vocab_size]) 110 | -------------------------------------------------------------------------------- /neural-machine-translation/transformer/ffn_layer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | """Implementation of fully connected network.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import tensorflow as tf 22 | 23 | 24 | class FeedFowardNetwork(tf.layers.Layer): 25 | """Fully connected feedforward network.""" 26 | 27 | def __init__( 28 | self, hidden_size, filter_size, relu_dropout, train, allow_pad 29 | ): 30 | super(FeedFowardNetwork, self).__init__() 31 | self.hidden_size = hidden_size 32 | self.filter_size = filter_size 33 | self.relu_dropout = relu_dropout 34 | self.train = train 35 | self.allow_pad = allow_pad 36 | 37 | self.filter_dense_layer = tf.layers.Dense( 38 | filter_size, 39 | use_bias = True, 40 | activation = tf.nn.relu, 41 | name = 'filter_layer', 42 | ) 43 | self.output_dense_layer = tf.layers.Dense( 44 | hidden_size, use_bias = True, name = 'output_layer' 45 | ) 46 | 47 | def call(self, x, padding = None): 48 | """Return outputs of the feedforward network. 49 | 50 | Args: 51 | x: tensor with shape [batch_size, length, hidden_size] 52 | padding: (optional) If set, the padding values are temporarily removed 53 | from x (provided self.allow_pad is set). The padding values are placed 54 | back in the output tensor in the same locations. 55 | shape [batch_size, length] 56 | 57 | Returns: 58 | Output of the feedforward network. 
59 | tensor with shape [batch_size, length, hidden_size] 60 | """ 61 | padding = None if not self.allow_pad else padding 62 | 63 | # Retrieve dynamically known shapes 64 | batch_size = tf.shape(x)[0] 65 | length = tf.shape(x)[1] 66 | 67 | if padding is not None: 68 | with tf.name_scope('remove_padding'): 69 | # Flatten padding to [batch_size*length] 70 | pad_mask = tf.reshape(padding, [-1]) 71 | 72 | nonpad_ids = tf.to_int32(tf.where(pad_mask < 1e-9)) 73 | 74 | # Reshape x to [batch_size*length, hidden_size] to remove padding 75 | x = tf.reshape(x, [-1, self.hidden_size]) 76 | x = tf.gather_nd(x, indices = nonpad_ids) 77 | 78 | # Reshape x from 2 dimensions to 3 dimensions. 79 | x.set_shape([None, self.hidden_size]) 80 | x = tf.expand_dims(x, axis = 0) 81 | 82 | output = self.filter_dense_layer(x) 83 | if self.train: 84 | output = tf.nn.dropout(output, 1.0 - self.relu_dropout) 85 | output = self.output_dense_layer(output) 86 | 87 | if padding is not None: 88 | with tf.name_scope('re_add_padding'): 89 | output = tf.squeeze(output, axis = 0) 90 | output = tf.scatter_nd( 91 | indices = nonpad_ids, 92 | updates = output, 93 | shape = [batch_size * length, self.hidden_size], 94 | ) 95 | output = tf.reshape( 96 | output, [batch_size, length, self.hidden_size] 97 | ) 98 | return output 99 | -------------------------------------------------------------------------------- /neural-machine-translation/transformer/model_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Transformer model helper methods."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math

import tensorflow as tf

# Large negative value added to masked attention logits before softmax.
_NEG_INF = -1e9


def get_position_encoding(
    length, hidden_size, min_timescale = 1.0, max_timescale = 1.0e4
):
    """Return positional encoding.

    Calculates the position encoding as a mix of sine and cosine functions
    with geometrically increasing wavelengths, as defined and formulized in
    "Attention is All You Need", section 3.5.

    Args:
      length: Sequence length.
      hidden_size: Size of the position encoding (model hidden dimension).
      min_timescale: Minimum scale that will be applied at each position.
      max_timescale: Maximum scale that will be applied at each position.

    Returns:
      float tensor with shape [length, hidden_size]
    """
    positions = tf.to_float(tf.range(length))
    # Half the channels carry sine waves, the other half cosine waves.
    half_dim = hidden_size // 2
    # Timescales form a geometric progression from min to max.
    log_increment = math.log(
        float(max_timescale) / float(min_timescale)
    ) / (tf.to_float(half_dim) - 1)
    inv_timescales = min_timescale * tf.exp(
        tf.to_float(tf.range(half_dim)) * -log_increment
    )
    # Outer product: [length, 1] * [1, half_dim] -> [length, half_dim].
    angles = tf.expand_dims(positions, 1) * tf.expand_dims(inv_timescales, 0)
    return tf.concat([tf.sin(angles), tf.cos(angles)], axis = 1)


def get_decoder_self_attention_bias(length):
    """Calculate bias for decoder that maintains model's autoregressive property.

    Creates a tensor that masks out locations that correspond to illegal
    connections, so prediction at position i cannot draw information from
    future positions.

    Args:
      length: int length of sequences in batch.

    Returns:
      float tensor of shape [1, 1, length, length]
    """
    with tf.name_scope('decoder_self_attention_bias'):
        # Lower-triangular matrix of ones marks the allowed connections.
        allowed = tf.matrix_band_part(tf.ones([length, length]), -1, 0)
        allowed = tf.reshape(allowed, [1, 1, length, length])
        return _NEG_INF * (1.0 - allowed)


def get_padding(x, padding_value = 0):
    """Return float tensor representing the padding values in x.

    Args:
      x: int tensor with any shape
      padding_value: int value that marks padding positions in `x`.

    Returns:
      float tensor with same shape as x containing values 0 or 1.
        0 -> non-padding, 1 -> padding
    """
    with tf.name_scope('padding'):
        is_padding = tf.equal(x, padding_value)
        return tf.to_float(is_padding)


def get_padding_bias(x):
    """Calculate bias tensor from padding values in tensor.

    Bias tensor that is added to the pre-softmax multi-headed attention
    logits, which has shape [batch_size, num_heads, length, length]. The
    tensor is zero at non-padding locations, and -1e9 (negative infinity)
    at padding locations.

    Args:
      x: int tensor with shape [batch_size, length]

    Returns:
      Attention bias tensor of shape [batch_size, 1, 1, length].
    """
    with tf.name_scope('attention_bias'):
        bias = get_padding(x) * _NEG_INF
        return tf.expand_dims(tf.expand_dims(bias, axis = 1), axis = 1)
4 | -------------------------------------------------------------------------------- /question-answer/README.md: -------------------------------------------------------------------------------- 1 | ## How-to 2 | 3 | 1. Run any notebook using Jupyter Notebook. 4 | -------------------------------------------------------------------------------- /question-answer/attention_gru.py: -------------------------------------------------------------------------------- 1 | # Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Module implementing RNN Cells. 16 | 17 | This module provides a number of basic commonly used RNN cells, such as LSTM 18 | (Long Short Term Memory) or GRU (Gated Recurrent Unit), and a number of 19 | operators that allow adding dropouts, projections, or embeddings for inputs. 20 | Constructing multi-layer cells is supported by the class `MultiRNNCell`, or by 21 | calling the `rnn` ops several times. 
22 | """ 23 | from __future__ import absolute_import 24 | from __future__ import division 25 | from __future__ import print_function 26 | 27 | import collections 28 | import hashlib 29 | import numbers 30 | 31 | from tensorflow.python.eager import context 32 | from tensorflow.python.framework import constant_op 33 | from tensorflow.python.framework import dtypes 34 | from tensorflow.python.framework import ops 35 | from tensorflow.python.framework import tensor_shape 36 | from tensorflow.python.framework import tensor_util 37 | from tensorflow.python.layers import base as base_layer 38 | from tensorflow.python.ops import array_ops 39 | from tensorflow.python.ops import clip_ops 40 | from tensorflow.python.ops import init_ops 41 | from tensorflow.python.ops import math_ops 42 | from tensorflow.python.ops import nn_ops 43 | from tensorflow.python.ops import partitioned_variables 44 | from tensorflow.python.ops import random_ops 45 | from tensorflow.python.ops import tensor_array_ops 46 | from tensorflow.python.ops import variable_scope as vs 47 | from tensorflow.python.ops import variables as tf_variables 48 | from tensorflow.python.platform import tf_logging as logging 49 | from tensorflow.python.util import nest 50 | from tensorflow.python.ops.rnn_cell_impl import RNNCell 51 | 52 | 53 | _BIAS_VARIABLE_NAME = "bias" 54 | _WEIGHTS_VARIABLE_NAME = "kernel" 55 | 56 | 57 | class AttentionGRUCell(RNNCell): 58 | """Gated Recurrent Unit cell (cf. http://arxiv.org/abs/1406.1078). 59 | 60 | Args: 61 | num_units: int, The number of units in the GRU cell. 62 | activation: Nonlinearity to use. Default: `tanh`. 63 | reuse: (optional) Python boolean describing whether to reuse variables 64 | in an existing scope. If not `True`, and the existing scope already has 65 | the given variables, an error is raised. 66 | kernel_initializer: (optional) The initializer to use for the weight and 67 | projection matrices. 68 | bias_initializer: (optional) The initializer to use for the bias. 
69 | """ 70 | 71 | def __init__(self, 72 | num_units, 73 | activation=None, 74 | reuse=None, 75 | name=None, 76 | kernel_initializer=None, 77 | bias_initializer=None): 78 | super(AttentionGRUCell, self).__init__(_reuse=reuse, name=name) 79 | self._num_units = num_units 80 | self._activation = activation or math_ops.tanh 81 | self._kernel_initializer = kernel_initializer 82 | self._bias_initializer = bias_initializer 83 | self._gate_linear = None 84 | self._candidate_linear = None 85 | 86 | @property 87 | def state_size(self): 88 | return self._num_units 89 | 90 | @property 91 | def output_size(self): 92 | return self._num_units 93 | 94 | def call(self, inputs, state): 95 | # extract input vector and attention (gate) 96 | if inputs.get_shape()[-1] != self._num_units + 1: 97 | raise ValueError("Input should be passed as word input concatenated with 1D attention on end axis") 98 | inputs, g = array_ops.split(inputs, 99 | num_or_size_splits=[self._num_units,1], 100 | axis=1) 101 | 102 | """Gated recurrent unit (GRU) with nunits cells.""" 103 | if self._gate_linear is None: 104 | bias_ones = self._bias_initializer 105 | if self._bias_initializer is None: 106 | bias_ones = init_ops.constant_initializer(1.0, dtype=inputs.dtype) 107 | with vs.variable_scope("gates"): # Reset gate and update gate. 
108 | self._gate_linear = _Linear( 109 | [inputs, state], 110 | self._num_units, 111 | True, 112 | bias_initializer=bias_ones, 113 | kernel_initializer=self._kernel_initializer) 114 | 115 | r = math_ops.sigmoid(self._gate_linear([inputs, state])) 116 | 117 | r_state = r * state 118 | if self._candidate_linear is None: 119 | with vs.variable_scope("candidate"): 120 | self._candidate_linear = _Linear( 121 | [inputs, r_state], 122 | self._num_units, 123 | True, 124 | bias_initializer=self._bias_initializer, 125 | kernel_initializer=self._kernel_initializer) 126 | c = self._activation(self._candidate_linear([inputs, r_state])) 127 | new_h = (1 - g) * state + g * c 128 | return new_h, new_h 129 | 130 | 131 | class _Linear(object): 132 | """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable. 133 | 134 | Args: 135 | args: a 2D Tensor or a list of 2D, batch x n, Tensors. 136 | output_size: int, second dimension of weight variable. 137 | dtype: data type for variables. 138 | build_bias: boolean, whether to build a bias variable. 139 | bias_initializer: starting value to initialize the bias 140 | (default is all zeros). 141 | kernel_initializer: starting value to initialize the weight. 142 | 143 | Raises: 144 | ValueError: if inputs_shape is wrong. 145 | """ 146 | 147 | def __init__(self, 148 | args, 149 | output_size, 150 | build_bias, 151 | bias_initializer=None, 152 | kernel_initializer=None): 153 | self._build_bias = build_bias 154 | 155 | if args is None or (nest.is_sequence(args) and not args): 156 | raise ValueError("`args` must be specified") 157 | if not nest.is_sequence(args): 158 | args = [args] 159 | self._is_sequence = False 160 | else: 161 | self._is_sequence = True 162 | 163 | # Calculate the total size of arguments on dimension 1. 
164 | total_arg_size = 0 165 | shapes = [a.get_shape() for a in args] 166 | for shape in shapes: 167 | if shape.ndims != 2: 168 | raise ValueError("linear is expecting 2D arguments: %s" % shapes) 169 | if shape[1].value is None: 170 | raise ValueError("linear expects shape[1] to be provided for shape %s, " 171 | "but saw %s" % (shape, shape[1])) 172 | else: 173 | total_arg_size += shape[1].value 174 | 175 | dtype = [a.dtype for a in args][0] 176 | 177 | scope = vs.get_variable_scope() 178 | with vs.variable_scope(scope) as outer_scope: 179 | self._weights = vs.get_variable( 180 | _WEIGHTS_VARIABLE_NAME, [total_arg_size, output_size], 181 | dtype=dtype, 182 | initializer=kernel_initializer) 183 | if build_bias: 184 | with vs.variable_scope(outer_scope) as inner_scope: 185 | inner_scope.set_partitioner(None) 186 | if bias_initializer is None: 187 | bias_initializer = init_ops.constant_initializer(0.0, dtype=dtype) 188 | self._biases = vs.get_variable( 189 | _BIAS_VARIABLE_NAME, [output_size], 190 | dtype=dtype, 191 | initializer=bias_initializer) 192 | 193 | def __call__(self, args): 194 | if not self._is_sequence: 195 | args = [args] 196 | 197 | if len(args) == 1: 198 | res = math_ops.matmul(args[0], self._weights) 199 | else: 200 | res = math_ops.matmul(array_ops.concat(args, 1), self._weights) 201 | if self._build_bias: 202 | res = nn_ops.bias_add(res, self._biases) 203 | return res -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow 2 | numpy 3 | scipy 4 | sklearn 5 | scikit-learn 6 | matplotlib 7 | seaborn 8 | pandas 9 | -------------------------------------------------------------------------------- /sentence-pair/Archive.zip: -------------------------------------------------------------------------------- 
import numpy as np
import librosa
import os
# Importing the submodule explicitly is required: a bare `import scipy`
# does not guarantee `scipy.io.wavfile` is loaded before use below.
import scipy.io.wavfile
import json


def change_pitch_speech(samples):
    """Randomly speed up the signal via linear resampling.

    The speed factor is 1/u with u ~ Uniform[0.8, 1). The result keeps the
    original buffer length and dtype; the tail left empty by the shortened
    signal stays zero.
    """
    y_pitch_speed = samples.copy()
    length_change = np.random.uniform(low = 0.8, high = 1)
    speed_fac = 1.0 / length_change
    tmp = np.interp(
        np.arange(0, len(y_pitch_speed), speed_fac),
        np.arange(0, len(y_pitch_speed)),
        y_pitch_speed,
    )
    minlen = min(y_pitch_speed.shape[0], tmp.shape[0])
    y_pitch_speed *= 0  # zero in place, preserving dtype and length
    y_pitch_speed[0:minlen] = tmp[0:minlen]
    return y_pitch_speed


def change_amplitude(samples):
    """Scale the signal by a random gain drawn from Uniform[1.5, 3)."""
    y_aug = samples.copy()
    dyn_change = np.random.uniform(low = 1.5, high = 3)
    return y_aug * dyn_change


def add_noise(samples):
    """Add Gaussian noise scaled to ~1% of the signal's peak amplitude."""
    y_noise = samples.copy()
    noise_amp = 0.01 * np.random.uniform() * np.amax(y_noise)
    return y_noise.astype('float64') + noise_amp * np.random.normal(
        size = y_noise.shape[0]
    )


def add_hpss(samples):
    """Return the percussive component of a harmonic/percussive separation."""
    y_hpss = librosa.effects.hpss(samples.astype('float64'))
    return y_hpss[1]


def strech(samples):
    """Time-stretch the signal by a random factor drawn from Uniform[0.5, 1.3)."""
    streching = samples.copy()
    random_strech = np.random.uniform(low = 0.5, high = 1.3)
    print('random_strech = ', random_strech)
    streching = librosa.effects.time_stretch(
        streching.astype('float'), random_strech
    )
    return streching


def random_augmentation(samples):
    """Apply a random subset of {resample, gain, noise, time-shift} effects."""
    cp = samples.copy()
    if np.random.randint(0, 2):
        # Random speed-up, as in change_pitch_speech.
        length_change = np.random.uniform(low = 0.8, high = 1)
        speed_fac = 1.0 / length_change
        print('resample length_change = ', length_change)
        tmp = np.interp(
            np.arange(0, len(cp), speed_fac), np.arange(0, len(cp)), cp
        )
        minlen = min(cp.shape[0], tmp.shape[0])
        cp *= 0
        cp[0:minlen] = tmp[0:minlen]

    if np.random.randint(0, 2):
        # Random gain.
        dyn_change = np.random.uniform(low = 1.5, high = 3)
        print('dyn_change = ', dyn_change)
        cp = cp * dyn_change

    if np.random.randint(0, 2):
        # Gaussian noise at ~0.5% of peak amplitude.
        noise_amp = 0.005 * np.random.uniform() * np.amax(cp)
        cp = cp.astype('float64') + noise_amp * np.random.normal(
            size = cp.shape[0]
        )

    if np.random.randint(0, 2):
        # Random time shift of up to +/-20% of the length, zero padded.
        timeshift_fac = 0.2 * 2 * (np.random.uniform() - 0.5)
        print('timeshift_fac = ', timeshift_fac)
        start = int(cp.shape[0] * timeshift_fac)
        if start > 0:
            cp = np.pad(cp, (start, 0), mode = 'constant')[0 : cp.shape[0]]
        else:
            cp = np.pad(cp, (0, -start), mode = 'constant')[0 : cp.shape[0]]
    return cp


def _write_augmented(root, ext, suffix, aug, sample_rate):
    """Write one augmented variant as `<root>-<suffix><ext>`."""
    librosa.output.write_wav(
        '%s-%d%s' % (root, suffix, ext),
        aug.astype('float32'),
        sample_rate,
        norm = True,
    )


# In order: the function producing the variant saved with suffix 1..6.
_AUGMENTATIONS = [
    change_pitch_speech,
    change_amplitude,
    add_noise,
    add_hpss,
    strech,
    random_augmentation,
]

with open('train-test.json') as fopen:
    wavs = json.load(fopen)['train']

if not os.path.exists('augment'):
    os.makedirs('augment')

for no, wav in enumerate(wavs):
    try:
        root, ext = os.path.splitext(wav)
        if (no + 1) % 100 == 0:
            print(no + 1, root, ext)
        # Flatten the source path into a single file name under augment/.
        root = root.replace('/', '<>')
        root = '%s/%s' % ('augment', root)
        sample_rate, samples = scipy.io.wavfile.read(wav)
        for suffix, augment in enumerate(_AUGMENTATIONS, start = 1):
            _write_augmented(root, ext, suffix, augment(samples), sample_rate)
    except Exception as e:
        # Best effort: skip files that fail to read or augment, but report.
        print(e)
| "100%|███████████████████████████████| 200/200 [05:38<00:00, 1.69s/it]\n", 27 | "100%|███████████████████████████████| 200/200 [06:16<00:00, 1.82s/it]\n", 28 | "100%|███████████████████████████████| 200/200 [06:00<00:00, 1.76s/it]\n", 29 | "100%|███████████████████████████████| 200/200 [06:46<00:00, 2.47s/it]\n", 30 | "100%|███████████████████████████████| 200/200 [09:04<00:00, 2.60s/it]\n", 31 | "100%|███████████████████████████████| 200/200 [10:12<00:00, 2.87s/it]\n", 32 | "100%|███████████████████████████████| 200/200 [09:01<00:00, 2.63s/it]\n", 33 | "100%|███████████████████████████████| 200/200 [09:39<00:00, 3.47s/it]\n", 34 | "100%|███████████████████████████████| 200/200 [10:56<00:00, 3.04s/it]\n", 35 | "100%|███████████████████████████████| 200/200 [11:12<00:00, 3.06s/it]\n", 36 | "100%|███████████████████████████████| 200/200 [07:46<00:00, 2.32s/it]\n", 37 | "100%|███████████████████████████████| 200/200 [09:30<00:00, 2.83s/it]\n", 38 | "100%|███████████████████████████████| 200/200 [10:05<00:00, 3.83s/it]\n" 39 | ] 40 | } 41 | ], 42 | "source": [ 43 | "prefix = 'https://tspace.library.utoronto.ca'\n", 44 | "save_dir = './data/'\n", 45 | "if not os.path.exists(save_dir):\n", 46 | " os.makedirs(save_dir)\n", 47 | "\n", 48 | "base_url = 'https://tspace.library.utoronto.ca/handle/1807/24'\n", 49 | "urls = [base_url+str(i) for i in range(488, 502)]\n", 50 | "for url in urls:\n", 51 | " soup = BeautifulSoup(urlopen(url).read(), 'html5lib')\n", 52 | " targets = soup.findAll('a', href=re.compile(r'/bitstream/.*.wav'))\n", 53 | " \n", 54 | " for a in tqdm(targets, total=len(targets), ncols=70):\n", 55 | " link = a['href']\n", 56 | "\n", 57 | " audio_save_loc = save_dir + link.split('/')[-1]\n", 58 | " if os.path.isfile(audio_save_loc):\n", 59 | " print(\"File Already Exists\")\n", 60 | " urlretrieve(prefix+a['href'], audio_save_loc)\n", 61 | "\n", 62 | " with open(audio_save_loc.replace('.wav', '.txt'), 'w') as f:\n", 63 | " f.write('say the word ' + 
link.split('_')[-2])" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [] 72 | } 73 | ], 74 | "metadata": { 75 | "kernelspec": { 76 | "display_name": "Python 3", 77 | "language": "python", 78 | "name": "python3" 79 | }, 80 | "language_info": { 81 | "codemirror_mode": { 82 | "name": "ipython", 83 | "version": 3 84 | }, 85 | "file_extension": ".py", 86 | "mimetype": "text/x-python", 87 | "name": "python", 88 | "nbconvert_exporter": "python", 89 | "pygments_lexer": "ipython3", 90 | "version": "3.5.2" 91 | } 92 | }, 93 | "nbformat": 4, 94 | "nbformat_minor": 2 95 | } 96 | -------------------------------------------------------------------------------- /speech-to-text/wav2vec-preprocessing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import librosa\n", 10 | "import glob" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 12, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "train = glob.glob('spectrogram-train/*.npy')\n", 20 | "x = []\n", 21 | "for fpath in train:\n", 22 | " fpath = fpath.split('/')[1]\n", 23 | " splitted = fpath.split('-')\n", 24 | " if len(splitted) == 2:\n", 25 | " splitted[1] = splitted[1].split('.')[1]\n", 26 | " fpath = splitted[0] + '.' 
+ splitted[1]\n", 27 | " fpath = fpath.replace('.npy','.wav')\n", 28 | " x.append('data/' + fpath)" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 13, 34 | "metadata": {}, 35 | "outputs": [ 36 | { 37 | "data": { 38 | "text/plain": [ 39 | "16341" 40 | ] 41 | }, 42 | "execution_count": 13, 43 | "metadata": {}, 44 | "output_type": "execute_result" 45 | } 46 | ], 47 | "source": [ 48 | "augment = glob.glob('augment/*.wav')\n", 49 | "x.extend(augment)\n", 50 | "x = list(set(x))\n", 51 | "len(x)" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 15, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "test_ = glob.glob('spectrogram-test/*.npy')\n", 61 | "test = []\n", 62 | "for t in test_:\n", 63 | " f = t.split('/')[1].replace('.npy', '.wav')\n", 64 | " test.append('data/'+f)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 16, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "name": "stderr", 74 | "output_type": "stream", 75 | "text": [ 76 | "100%|██████████| 16341/16341 [15:07<00:00, 18.01it/s]\n", 77 | "100%|██████████| 560/560 [00:30<00:00, 18.51it/s]\n" 78 | ] 79 | } 80 | ], 81 | "source": [ 82 | "from tqdm import tqdm\n", 83 | "\n", 84 | "X = []\n", 85 | "for i in tqdm(range(len(x))):\n", 86 | " y, sr = librosa.load(x[i], sr = 16000)\n", 87 | " X.append(y)\n", 88 | " \n", 89 | "Y = []\n", 90 | "for i in tqdm(range(len(test))):\n", 91 | " y, sr = librosa.load(test[i], sr = 16000)\n", 92 | " Y.append(y)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 20, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "import pickle\n", 102 | "\n", 103 | "with open('train-wav.pkl', 'wb') as fopen:\n", 104 | " pickle.dump({'X': X, 'x': x}, fopen)\n", 105 | " \n", 106 | "with open('test-wav.pkl', 'wb') as fopen:\n", 107 | " pickle.dump({'Y': Y, 'y': test}, fopen)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | 
"metadata": {}, 114 | "outputs": [], 115 | "source": [] 116 | } 117 | ], 118 | "metadata": { 119 | "kernelspec": { 120 | "display_name": "Python 3", 121 | "language": "python", 122 | "name": "python3" 123 | }, 124 | "language_info": { 125 | "codemirror_mode": { 126 | "name": "ipython", 127 | "version": 3 128 | }, 129 | "file_extension": ".py", 130 | "mimetype": "text/x-python", 131 | "name": "python", 132 | "nbconvert_exporter": "python", 133 | "pygments_lexer": "ipython3", 134 | "version": "3.6.8" 135 | } 136 | }, 137 | "nbformat": 4, 138 | "nbformat_minor": 2 139 | } 140 | -------------------------------------------------------------------------------- /stemming/README.md: -------------------------------------------------------------------------------- 1 | ## How-to 2 | 3 | 1. Run any notebook using Jupyter Notebook. 4 | -------------------------------------------------------------------------------- /stemming/dnc.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """DNC Cores. 16 | 17 | These modules create a DNC core. They take input, pass parameters to the memory 18 | access module, and integrate the output of memory to form an output. 
19 | """ 20 | 21 | from __future__ import absolute_import 22 | from __future__ import division 23 | from __future__ import print_function 24 | 25 | import collections 26 | import numpy as np 27 | import sonnet as snt 28 | import tensorflow as tf 29 | 30 | import access 31 | 32 | DNCState = collections.namedtuple('DNCState', ('access_output', 'access_state', 33 | 'controller_state')) 34 | 35 | 36 | class DNC(snt.RNNCore): 37 | """DNC core module. 38 | 39 | Contains controller and memory access module. 40 | """ 41 | 42 | def __init__(self, 43 | access_config, 44 | controller_config, 45 | output_size, 46 | clip_value=None, 47 | name='dnc'): 48 | """Initializes the DNC core. 49 | 50 | Args: 51 | access_config: dictionary of access module configurations. 52 | controller_config: dictionary of controller (LSTM) module configurations. 53 | output_size: output dimension size of core. 54 | clip_value: clips controller and core output values to between 55 | `[-clip_value, clip_value]` if specified. 56 | name: module name (default 'dnc'). 57 | 58 | Raises: 59 | TypeError: if direct_input_size is not None for any access module other 60 | than KeyValueMemory. 
61 | """ 62 | super(DNC, self).__init__(name=name) 63 | 64 | with self._enter_variable_scope(): 65 | self._controller = snt.LSTM(**controller_config) 66 | self._access = access.MemoryAccess(**access_config) 67 | 68 | self._access_output_size = np.prod(self._access.output_size.as_list()) 69 | self._output_size = output_size 70 | self._clip_value = clip_value or 0 71 | 72 | self._output_size = tf.TensorShape([output_size]) 73 | self._state_size = DNCState( 74 | access_output=self._access_output_size, 75 | access_state=self._access.state_size, 76 | controller_state=self._controller.state_size) 77 | 78 | def _clip_if_enabled(self, x): 79 | if self._clip_value > 0: 80 | return tf.clip_by_value(x, -self._clip_value, self._clip_value) 81 | else: 82 | return x 83 | 84 | def _build(self, inputs, prev_state): 85 | """Connects the DNC core into the graph. 86 | 87 | Args: 88 | inputs: Tensor input. 89 | prev_state: A `DNCState` tuple containing the fields `access_output`, 90 | `access_state` and `controller_state`. `access_state` is a 3-D Tensor 91 | of shape `[batch_size, num_reads, word_size]` containing read words. 92 | `access_state` is a tuple of the access module's state, and 93 | `controller_state` is a tuple of controller module's state. 94 | 95 | Returns: 96 | A tuple `(output, next_state)` where `output` is a tensor and `next_state` 97 | is a `DNCState` tuple containing the fields `access_output`, 98 | `access_state`, and `controller_state`. 
99 | """ 100 | 101 | prev_access_output = prev_state.access_output 102 | prev_access_state = prev_state.access_state 103 | prev_controller_state = prev_state.controller_state 104 | 105 | batch_flatten = snt.BatchFlatten() 106 | controller_input = tf.concat( 107 | [batch_flatten(inputs), batch_flatten(prev_access_output)], 1) 108 | 109 | controller_output, controller_state = self._controller( 110 | controller_input, prev_controller_state) 111 | 112 | controller_output = self._clip_if_enabled(controller_output) 113 | controller_state = snt.nest.map(self._clip_if_enabled, controller_state) 114 | 115 | access_output, access_state = self._access(controller_output, 116 | prev_access_state) 117 | 118 | output = tf.concat([controller_output, batch_flatten(access_output)], 1) 119 | output = snt.Linear( 120 | output_size=self._output_size.as_list()[0], 121 | name='output_linear')(output) 122 | output = self._clip_if_enabled(output) 123 | 124 | return output, DNCState( 125 | access_output=access_output, 126 | access_state=access_state, 127 | controller_state=controller_state) 128 | 129 | def initial_state(self, batch_size, dtype=tf.float32): 130 | return DNCState( 131 | controller_state=self._controller.initial_state(batch_size, dtype), 132 | access_state=self._access.initial_state(batch_size, dtype), 133 | access_output=tf.zeros( 134 | [batch_size] + self._access.output_size.as_list(), dtype)) 135 | 136 | @property 137 | def state_size(self): 138 | return self._state_size 139 | 140 | @property 141 | def output_size(self): 142 | return self._output_size 143 | -------------------------------------------------------------------------------- /stemming/util.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
def batch_invert_permutation(permutations):
    """Returns batched `tf.invert_permutation` for every row in `permutations`."""
    with tf.name_scope('batch_invert_permutation', values=[permutations]):
        rows = tf.unstack(permutations)
        return tf.stack([tf.invert_permutation(row) for row in rows])


def batch_gather(values, indices):
    """Returns batched `tf.gather` for every row in the input."""
    with tf.name_scope('batch_gather', values=[values, indices]):
        gathered = [
            tf.gather(value, index)
            for value, index in zip(tf.unstack(values), tf.unstack(indices))
        ]
        return tf.stack(gathered)


def one_hot(length, index):
    """Return an nd array of given `length` filled with 0s and a 1 at `index`."""
    vec = np.zeros(length)
    vec[index] = 1
    return vec
"source": [ 9 | "import numpy as np" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "f = open('glove.6B.200d.txt','r')\n", 19 | "dictionary = {}\n", 20 | "vectors = []\n", 21 | "for no, line in enumerate(f):\n", 22 | " splitLine = line.split()\n", 23 | " word = splitLine[0]\n", 24 | " dictionary[word] = no\n", 25 | " embedding = np.array([float(val) for val in splitLine[1:]])\n", 26 | " vectors.append(embedding)" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 5, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "vectors = np.array(vectors)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 24, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "rev_dictionary = {v:k for k, v in dictionary.items()}" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 7, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "import tensorflow as tf" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 10, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "class Model:\n", 63 | " def __init__(self):\n", 64 | " self._embedding = tf.convert_to_tensor(vectors, dtype = tf.float32)\n", 65 | " self.X = tf.placeholder(\n", 66 | " tf.float32, [None, vectors.shape[1]]\n", 67 | " )\n", 68 | " normed_embedding = tf.nn.l2_normalize(self._embedding, axis = 1)\n", 69 | " normed_array = tf.nn.l2_normalize(self.X, axis = 1)\n", 70 | " self.cosine_similarity = tf.matmul(\n", 71 | " normed_array, tf.transpose(normed_embedding, [1, 0])\n", 72 | " )" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 11, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "model = Model()\n", 82 | "sess = tf.InteractiveSession()" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 14, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | 
"string_positive = 'i really love to eat chicken and meat'\n", 92 | "string_negative = 'i really hate you and i do not want to see you again'" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 28, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "import random" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 52, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "def augmentation(string, threshold = 0.5, count = 5, k = 8):\n", 111 | " string = string.split()\n", 112 | " selected = []\n", 113 | " while not len(selected):\n", 114 | " selected = [(no, w) for no, w in enumerate(string) if random.random() > threshold]\n", 115 | " indices, words = [i[0] for i in selected], [i[1] for i in selected]\n", 116 | " \n", 117 | " batches = vectors[[dictionary[w] for w in words]]\n", 118 | " top_k = tf.nn.top_k(model.cosine_similarity, k = k)\n", 119 | " results = sess.run(top_k, feed_dict = {model.X: batches})\n", 120 | " words = []\n", 121 | " for result in results.indices:\n", 122 | " words.append([rev_dictionary[i] for i in result])\n", 123 | " augmented = []\n", 124 | " for i in range(count):\n", 125 | " string_ = string[:]\n", 126 | " for no in range(len(words)):\n", 127 | " index = random.randint(0, len(words[no]) - 1)\n", 128 | " string_[indices[no]] = words[no][index]\n", 129 | " augmented.append(' '.join(string_))\n", 130 | " return augmented " 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 53, 136 | "metadata": {}, 137 | "outputs": [ 138 | { 139 | "name": "stdout", 140 | "output_type": "stream", 141 | "text": [ 142 | "CPU times: user 4.07 s, sys: 1.59 s, total: 5.66 s\n", 143 | "Wall time: 5.68 s\n" 144 | ] 145 | }, 146 | { 147 | "data": { 148 | "text/plain": [ 149 | "['i thing love to eat pork also chicken',\n", 150 | " 'i really love to eat fried well cooked',\n", 151 | " 'i things love to eat meat with beef',\n", 152 | " 'i thing love to eat roasted well 
chicken',\n", 153 | " 'i something love to eat cooked , beef']" 154 | ] 155 | }, 156 | "execution_count": 53, 157 | "metadata": {}, 158 | "output_type": "execute_result" 159 | } 160 | ], 161 | "source": [ 162 | "%%time\n", 163 | "augmentation(string_positive)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 54, 169 | "metadata": {}, 170 | "outputs": [ 171 | { 172 | "name": "stdout", 173 | "output_type": "stream", 174 | "text": [ 175 | "CPU times: user 3.96 s, sys: 1.78 s, total: 5.73 s\n", 176 | "Wall time: 5.75 s\n" 177 | ] 178 | }, 179 | { 180 | "data": { 181 | "text/plain": [ 182 | "['i really hate you also i know not want make see i again',\n", 183 | " \"i really hate you while i n't not want help see 'll again\",\n", 184 | " \"i really hate you . i not not want take see 'll again\",\n", 185 | " 'i really hate you well i know not want able see you again',\n", 186 | " \"i really hate you , i does not want could see 'll again\"]" 187 | ] 188 | }, 189 | "execution_count": 54, 190 | "metadata": {}, 191 | "output_type": "execute_result" 192 | } 193 | ], 194 | "source": [ 195 | "%%time\n", 196 | "augmentation(string_negative)" 197 | ] 198 | } 199 | ], 200 | "metadata": { 201 | "kernelspec": { 202 | "display_name": "Python 3", 203 | "language": "python", 204 | "name": "python3" 205 | }, 206 | "language_info": { 207 | "codemirror_mode": { 208 | "name": "ipython", 209 | "version": 3 210 | }, 211 | "file_extension": ".py", 212 | "mimetype": "text/x-python", 213 | "name": "python", 214 | "nbconvert_exporter": "python", 215 | "pygments_lexer": "ipython3", 216 | "version": "3.6.8" 217 | } 218 | }, 219 | "nbformat": 4, 220 | "nbformat_minor": 2 221 | } 222 | -------------------------------------------------------------------------------- /text-augmentation/6.vae-varitional-bahdanau/basic_decoder.py: -------------------------------------------------------------------------------- 1 | import collections 2 | 3 | import tensorflow as tf 4 | 5 | 
import decoder
from tensorflow.contrib.seq2seq.python.ops import helper as helper_py
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import tensor_shape
from tensorflow.python.layers import base as layers_base
from tensorflow.python.ops import rnn_cell_impl
from tensorflow.python.util import nest

__all__ = [
    "BasicDecoderOutput",
    "BasicDecoder",
]


class BasicDecoderOutput(collections.namedtuple("BasicDecoderOutput", ("rnn_output", "sample_id"))):
    """Per-step decoder output: the raw RNN output and the sampled token ids."""
    pass


class BasicDecoder(decoder.Decoder):
    """Basic sampling decoder that concatenates a latent vector to every input.

    Differences from the stock seq2seq ``BasicDecoder``:
      * a fixed latent vector is concatenated to the decoder input at every
        timestep (both at initialization and after each step), and
      * the wrapped cell is expected to return a third value — a per-batch
        context KL loss — which is accumulated across timesteps and returned
        from ``step``.
    """

    def __init__(self, cell, helper, initial_state, latent_vector, output_layer=None):
        """Initialize BasicDecoder.

        Args:
          cell: An `RNNCell` instance whose `__call__` returns
            `(outputs, state, context_kl_loss)`.
          helper: A `Helper` instance.
          initial_state: A (possibly nested tuple of...) tensors and TensorArrays.
            The initial state of the RNNCell.
          latent_vector: A `[batch_size, latent_dim]` tensor appended to the
            decoder input at every step.
          output_layer: (Optional) An instance of `tf.layers.Layer`, i.e.,
            `tf.layers.Dense`. Optional layer to apply to the RNN output prior
            to storing the result or sampling.

        Raises:
          TypeError: if `helper` or `output_layer` have an incorrect type.
            (Note: `cell` is deliberately not type-checked because the custom
            cell here is not a standard `RNNCell`.)
        """
        if not isinstance(helper, helper_py.Helper):
            raise TypeError("helper must be a Helper, received: %s" % type(helper))
        if (output_layer is not None and not isinstance(output_layer, layers_base.Layer)):
            raise TypeError("output_layer must be a Layer, received: %s" % type(output_layer))
        self._cell = cell
        self._helper = helper
        self._initial_state = initial_state
        self._output_layer = output_layer
        self._latent_vector = latent_vector
        # Running sum, shape (batch_size,), of the context KL losses emitted
        # by the cell at each decoding timestep.
        self._intermediate_context_kl_loss = tf.zeros(shape=(helper.batch_size,))

    @property
    def batch_size(self):
        return self._helper.batch_size

    def _rnn_output_size(self):
        """Output size of the cell after `output_layer` (if any) is applied."""
        size = self._cell.output_size
        if self._output_layer is None:
            return size
        # To use the layer's compute_output_shape, convert the RNNCell's
        # output_size entries into shapes with an unknown batch size, pass
        # them through compute_output_shape, and strip the batch dimension
        # again to get the per-timestep output size.
        output_shape_with_unknown_batch = nest.map_structure(
            lambda s: tensor_shape.TensorShape([None]).concatenate(s),
            size)
        layer_output_shape = self._output_layer.compute_output_shape(
            output_shape_with_unknown_batch)
        return nest.map_structure(lambda s: s[1:], layer_output_shape)

    @property
    def output_size(self):
        # Return the cell output and the id.
        return BasicDecoderOutput(
            rnn_output=self._rnn_output_size(),
            sample_id=tensor_shape.TensorShape([]))

    @property
    def output_dtype(self):
        # Assume the dtype of the cell is the output_size structure
        # containing the input_state's first component's dtype.
        # Return that structure and int32 (the id).
        dtype = nest.flatten(self._initial_state)[0].dtype
        return BasicDecoderOutput(
            nest.map_structure(lambda _: dtype, self._rnn_output_size()),
            dtypes.int32)

    def initialize(self, name=None):
        """Initialize the decoder.

        Args:
          name: Name scope for any created operations.
        Returns:
          `(finished, first_inputs, initial_state)`.
        """
        # BUG FIX: the original called `self._helper.initialize()` twice —
        # once for `finished` and once for the first inputs. Helpers create
        # graph ops (and may be stateful), so the helper must be invoked
        # exactly once per initialization.
        finished, first_inputs = self._helper.initialize()
        # Concatenate the latent vector to the first decoder input,
        # i.e. the embedding + latent vector.
        first_inputs = tf.concat([first_inputs, self._latent_vector], axis=-1)
        return (finished, first_inputs, self._initial_state)

    def step(self, time, inputs, state, name=None):
        """Perform a decoding step.

        Args:
          time: scalar `int32` tensor.
          inputs: A (structure of) input tensors.
          state: A (structure of) state tensors and TensorArrays.
          name: Name scope for any created operations.
        Returns:
          `(outputs, next_state, next_inputs, finished, c_kl_loss)` where
          `c_kl_loss` is the context KL loss accumulated up to this step.
        """
        with ops.name_scope(name, "BasicDecoderStep", (time, inputs, state)):
            cell_outputs, cell_state, c_kl_loss = self._cell(inputs, state)
            # Accumulate the context KL loss from the token at the current
            # decoder step; the running total is what gets returned.
            self._intermediate_context_kl_loss += c_kl_loss
            c_kl_loss = self._intermediate_context_kl_loss

            if self._output_layer is not None:
                cell_outputs = self._output_layer(cell_outputs)
            sample_ids = self._helper.sample(
                time=time, outputs=cell_outputs, state=cell_state)
            (finished, next_inputs, next_state) = self._helper.next_inputs(
                time=time,
                outputs=cell_outputs,
                state=cell_state,
                sample_ids=sample_ids)

            # Concatenate the latent vector to the predicted word's embedding.
            next_inputs = tf.concat([next_inputs, self._latent_vector], axis=-1)

            outputs = BasicDecoderOutput(cell_outputs, sample_ids)
            return (outputs, next_state, next_inputs, finished, c_kl_loss)
#!/usr/bin/env python
# coding: utf-8

# Deep Pyramid CNN (DPCNN) text classifier: a region embedding followed by
# stacked conv/batch-norm residual blocks with stride-2 pooling, trained
# with early stopping on a held-out 20% split.

from utils import *
import tensorflow as tf
from sklearn.cross_validation import train_test_split
import time
import random
import os


# Load the dataset from the local `data` folder.
trainset = sklearn.datasets.load_files(container_path='data', encoding='UTF-8')
trainset.data, trainset.target = separate_dataset(trainset, 1.0)
print(trainset.target_names)
print(len(trainset.data))
print(len(trainset.target))


# Build vocabulary mappings over the whole corpus.
concat = ' '.join(trainset.data).split()
vocabulary_size = len(list(set(concat)))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocab from size: %d' % (vocabulary_size))
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])


GO = dictionary['GO']
PAD = dictionary['PAD']
EOS = dictionary['EOS']
UNK = dictionary['UNK']


embedding_size = 128
dimension_output = len(trainset.target_names)
maxlen = 50
batch_size = 32
kernel_size = 3
num_filters = 150


class Model:
    """DPCNN graph: region embedding -> two conv/BN/relu layers with a
    residual connection -> repeated downsampling blocks until the sequence
    dimension collapses -> 1x1 conv + sum over time as logits."""

    def __init__(self,
                 maxlen,
                 dimension_output,
                 vocab_size,
                 embedding_size,
                 kernel_size,
                 num_filters,
                 learning_rate):
        self.X = tf.placeholder(tf.int32, [None, maxlen])
        self.Y = tf.placeholder(tf.int32, [None])
        embeddings = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1, 1))
        embedded = tf.nn.embedding_lookup(embeddings, self.X)

        # Region embedding.
        first_region = tf.layers.conv1d(
            embedded,
            num_filters,
            kernel_size=kernel_size,
            strides=1,
            padding='valid',
        )
        forward = tf.nn.relu(first_region)
        forward = tf.layers.conv1d(
            forward,
            num_filters,
            kernel_size=kernel_size,
            strides=1,
            padding='same',
        )
        forward = tf.layers.batch_normalization(forward)
        # BUG FIX: the original applied `tf.nn.relu(first_region)` here and
        # after the next batch-norm, which discarded both convolutions and
        # made the residual block compute relu(first_region) + first_region.
        # The pre-activation must flow through `forward`.
        forward = tf.nn.relu(forward)
        forward = tf.layers.conv1d(
            forward,
            num_filters,
            kernel_size=kernel_size,
            strides=1,
            padding='same',
        )
        forward = tf.layers.batch_normalization(forward)
        forward = tf.nn.relu(forward)
        # Residual connection with the region embedding.
        forward = forward + first_region

        def _block(x):
            # One pyramid level: pad by one step, stride-2 max-pool (halves
            # the sequence length), then two conv/BN layers with a residual
            # connection back to the pooled input.
            x = tf.pad(x, paddings=[[0, 0], [0, 1], [0, 0]])
            px = tf.layers.max_pooling1d(x, 3, 2)
            x = tf.nn.relu(px)
            x = tf.layers.conv1d(
                x,
                num_filters,
                kernel_size=kernel_size,
                strides=1,
                padding='same',
            )
            x = tf.layers.batch_normalization(x)
            x = tf.nn.relu(x)
            x = tf.layers.conv1d(
                x,
                num_filters,
                kernel_size=kernel_size,
                strides=1,
                padding='same',
            )
            x = tf.layers.batch_normalization(x)
            x = x + px
            return x

        # Keep halving until the time dimension cannot be pooled further.
        while forward.get_shape().as_list()[1] >= 2:
            forward = _block(forward)

        self.logits = tf.reduce_sum(tf.layers.conv1d(
            forward, dimension_output, kernel_size=1, strides=1, padding='SAME'
        ), 1)
        self.cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=self.logits,
            labels=self.Y))
        self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.cost)
        correct_pred = tf.equal(tf.argmax(self.logits, 1, output_type=tf.int32), self.Y)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))


tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(maxlen, dimension_output, len(dictionary), embedding_size,
              kernel_size, num_filters, 1e-3)
sess.run(tf.global_variables_initializer())


vectors = str_idx(trainset.data, dictionary, maxlen)
train_X, test_X, train_Y, test_Y = train_test_split(vectors, trainset.target, test_size=0.2)


from tqdm import tqdm
import time

EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 3, 0, 0, 0

while True:
    lasttime = time.time()
    # Stop once validation accuracy has not improved for EARLY_STOPPING epochs.
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n' % (EPOCH))
        break

    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc='train minibatch loop'
    )
    for i in pbar:
        batch_x = train_X[i: min(i + batch_size, train_X.shape[0])]
        batch_y = train_Y[i: min(i + batch_size, train_X.shape[0])]
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict={
                model.Y: batch_y,
                model.X: batch_x
            },
        )
        assert not np.isnan(cost)
        train_loss += cost
        train_acc += acc
        pbar.set_postfix(cost=cost, accuracy=acc)

    pbar = tqdm(range(0, len(test_X), batch_size), desc='test minibatch loop')
    for i in pbar:
        batch_x = test_X[i: min(i + batch_size, test_X.shape[0])]
        batch_y = test_Y[i: min(i + batch_size, test_X.shape[0])]
        acc, cost = sess.run(
            [model.accuracy, model.cost],
            feed_dict={
                model.Y: batch_y,
                model.X: batch_x
            },
        )
        test_loss += cost
        test_acc += acc
        pbar.set_postfix(cost=cost, accuracy=acc)

    # Average the per-batch metrics over the number of batches.
    train_loss /= len(train_X) / batch_size
    train_acc /= len(train_X) / batch_size
    test_loss /= len(test_X) / batch_size
    test_acc /= len(test_X) / batch_size

    if test_acc > CURRENT_ACC:
        print(
            'epoch: %d, pass acc: %f, current acc: %f'
            % (EPOCH, CURRENT_ACC, test_acc)
        )
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1

    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (EPOCH, train_loss, train_acc, test_loss, test_acc)
    )
    EPOCH += 1


real_Y, predict_Y = [], []

pbar = tqdm(
    range(0, len(test_X), batch_size), desc='validation minibatch loop'
)
for i in pbar:
    batch_x = test_X[i: min(i + batch_size, test_X.shape[0])]
    batch_y = test_Y[i: min(i + batch_size, test_X.shape[0])]
    predict_Y += np.argmax(
        sess.run(
            model.logits, feed_dict={model.X: batch_x, model.Y: batch_y}
        ),
        1,
    ).tolist()
    real_Y += batch_y


print(metrics.classification_report(real_Y, predict_Y, target_names=trainset.target_names))
basic-rnn-bidirectional-huber | 0.63 | 3.5095 | 19 | | 7. lstm-rnn | 0.73 | 2.69683 | 20 | | 8. lstm-rnn-hinge | 0.72 | 8.2088 | 21 | | 9. lstm-rnn-huber | 0.73 | 10.1754 | 22 | | 10. lstm-rnn-bidirectional | 0.71 | 11.0388 | 23 | | 11. lstm-rnn-bidirectional-huber | 0.71 | 5.5258 | 24 | | 12. lstm-rnn-dropout-l2 | 0.74 | 3.2420 | 25 | | 13. gru-rnn | 0.72 | 3.16123 | 26 | | 14. gru-rnn-hinge | 0.72 | 6.71951 | 27 | | 15. gru-rnn-huber | 0.70 | 7.93373 | 28 | | 16. gru-rnn-bidirectional | 0.73 | 2.91590 | 29 | | 17. gru-rnn-bidirectional-hinge | 0.72 | 5.66385 | 30 | | 18. gru-rnn-bidirectional-huber | 0.70 | 18.01133 | 31 | | 19. lstm-cnn-rnn | 0.65 | 4.42849 | 32 | | 20. kmax-cnn | 0.73 | 18.89667 | 33 | | 21. lstm-cnn-rnn-highway | 0.68 | 3.23122 | 34 | | 22. lstm-rnn-attention | 0.75 | 13.97496 | 35 | | 23. dilated-rnn-lstm | 0.25 | 24.54002 | 36 | | 24. lnlstm-rnn | 0.68 | 24.86363 | 37 | | 25. only-attention | 0.74 | 2.63291 | 38 | | 26. multihead-attention | 0.69 | 9.033228 | 39 | | 27. neural-turing-machine | | | 40 | | 28. lstm-seq2seq | 0.72 | 9.63291 | 41 | | 29. lstm-seq2seq-luong | | | 42 | | 30. lstm-seq2seq-bahdanau | | | 43 | | 31. lstm-seq2seq-beam | | | 44 | | 32. lstm-seq2seq-birnn | | | 45 | | 33. pointer-net | | | 46 | | 34. lstm-rnn-bahdanau | 0.71 | 9.81993 | 47 | | 35. lstm-rnn-luong | 0.66 | 27.73932 | 48 | | 36. lstm-rnn-bahdanau-luong | 0.69 | 36.97628 | 49 | | 37. lstm-birnn-bahdanau-luong | 0.70 | 38.86009 | 50 | | 38. bytenet | | | 51 | | 39. fast-slow-lstm | | | 52 | | 40. siamese-network | 0.52 | 7.13535 | 53 | | 41. estimator | | | 54 | | 42. capsule-rnn-lstm | | | 55 | | 43. capsule-seq2seq-lstm | | | 56 | | 44. capsule-birrn-seq2seq-lstm | | | 57 | | 45. nested-lstm | | | 58 | | 46. lstm-seq2seq-highway | | | 59 | | 47. triplet-loss-lstm | 0.50 | | 60 | | 48. dnc | 0.68 | 85.98529 | 61 | | 49. convlstm | 0.69 | 2.66726 | 62 | | 50. temporalconvd | 0.66 | 11.90590 | 63 | | 51. batch-all-triplet-loss-lstm | 0.70 | | 64 | | 52. 
fast-text | 0.76 | 0.49499 | 65 | | 53. gated-convolution-network | 0.67 | 3.37712 | 66 | | 54. simple-recurrent-units | 0.65 | 3.12624 | 67 | | 55. lstm-han | 0.50 | 3.47965 | 68 | | 56. bert | 0.73 | 6.31015 | 69 | | 57. dynamic-memory-network | 0.71 | 3.25820 | 70 | | 58. entity-network | 0.74 | 1.10458 | 71 | | 59. memory-network | 0.58 | 1.157306 | 72 | | 60. char-sparse | 0.76 | 2.350096 | 73 | | 61. residual-network | 0.72 | 9.557085 | 74 | | 62. residual-network-bahdanau | 0.71 | 11.53799 | 75 | | 63. deep-pyramid-cnn | 0.68 | 6.980528 | 76 | | 64. transformer-xl | 0.51 | 38.66338 | 77 | | 65. transfer-learning-gpt2 | 0.79 | 178.0716 | 78 | | 66. quasi-rnn | 0.66 | 166.1456 | 79 | | 67. tacotron | 0.74 | 360.5551 | 80 | | 68. slice-gru | 0.72 | 10.140633 | 81 | | 69. slice-gru-bahdanau | 0.70 | 20.247409 | 82 | | 70. wavenet | 0.59 | 101.293274 | 83 | | 71. transfer-learning-bert | 0.81 | 887.590460 | 84 | | 72. transfer-learning-xlnet-large | 0.846 | 340.7679 | 85 | | 73. lstm-birnn-max-avg | 0.7552 | 9.35624 | 86 | | 74. transfer-learning-bert-base-6 | 0.7655 | 494.169 | 87 | | 75. transfer-learning-bert-large-12 | 0.80 | 1365.30 | 88 | | 76. transfer-learning-xlnet-base | 0.820441 | 240.262 | 89 | | 77. transfer-learning-albert-base | 0.799053 | 61.8179 | 90 | | 78. transfer-learning-electra-base | 0.836336 | 66.0257 | 91 | | 79. 
transfer-learning-electra-large | 0.875248 | 195.37280 | 92 | -------------------------------------------------------------------------------- /text-classification/data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mesolitica/NLP-Models-Tensorflow/0741216aa8235e1228b3de7903cc36d73f8f2b45/text-classification/data.zip -------------------------------------------------------------------------------- /text-classification/gpt_2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | 5 | def shape_list(x): 6 | """Deal with dynamic shape in tensorflow cleanly.""" 7 | static = x.shape.as_list() 8 | dynamic = tf.shape(x) 9 | return [dynamic[i] if s is None else s for i, s in enumerate(static)] 10 | 11 | 12 | def softmax(x, axis = -1): 13 | x = x - tf.reduce_max(x, axis = axis, keepdims = True) 14 | ex = tf.exp(x) 15 | return ex / tf.reduce_sum(ex, axis = axis, keepdims = True) 16 | 17 | 18 | def gelu(x): 19 | return ( 20 | 0.5 21 | * x 22 | * (1 + tf.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3)))) 23 | ) 24 | 25 | 26 | def norm(x, scope, *, axis = -1, epsilon = 1e-5): 27 | """Normalize to mean = 0, std = 1, then do a diagonal affine transform.""" 28 | with tf.variable_scope(scope): 29 | n_state = x.shape[-1].value 30 | g = tf.get_variable( 31 | 'g', [n_state], initializer = tf.constant_initializer(1) 32 | ) 33 | b = tf.get_variable( 34 | 'b', [n_state], initializer = tf.constant_initializer(0) 35 | ) 36 | u = tf.reduce_mean(x, axis = axis, keepdims = True) 37 | s = tf.reduce_mean(tf.square(x - u), axis = axis, keepdims = True) 38 | x = (x - u) * tf.rsqrt(s + epsilon) 39 | x = x * g + b 40 | return x 41 | 42 | 43 | def split_states(x, n): 44 | """Reshape the last dimension of x into [n, x.shape[-1]/n].""" 45 | *start, m = shape_list(x) 46 | return tf.reshape(x, start + [n, m // n]) 47 | 48 | 49 | def 
def merge_states(x):
    """Collapse the last two dimensions of x into a single dimension."""
    *lead, a, b = shape_list(x)
    return tf.reshape(x, lead + [a * b])


def conv1d(x, scope, nf, *, w_init_stdev = 0.02):
    """Pointwise (kernel-size-1) convolution, implemented as one matmul
    over the flattened leading dimensions."""
    with tf.variable_scope(scope):
        *lead, nx = shape_list(x)
        w = tf.get_variable(
            'w',
            [1, nx, nf],
            initializer = tf.random_normal_initializer(stddev = w_init_stdev),
        )
        b = tf.get_variable('b', [nf], initializer = tf.constant_initializer(0))
        flat = tf.matmul(tf.reshape(x, [-1, nx]), tf.reshape(w, [-1, nf])) + b
        return tf.reshape(flat, lead + [nf])


def attention_mask(nd, ns, *, dtype):
    """1's in the lower triangle, counting from the lower right corner.

    Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs.
    """
    rows = tf.range(nd)[:, None]
    cols = tf.range(ns)
    keep = rows >= cols - ns + nd
    return tf.cast(keep, dtype)


def attn(x, scope, n_state, *, past, hparams):
    """Masked multi-head self-attention with optional cached (k, v) `past`."""
    assert x.shape.ndims == 3  # Should be [batch, sequence, features]
    assert n_state % hparams.n_head == 0
    if past is not None:
        # Should be [batch, 2, heads, sequence, features], where 2 is [k, v]
        assert past.shape.ndims == 5

    def split_heads(t):
        # [batch, sequence, features] -> [batch, heads, sequence, features]
        return tf.transpose(split_states(t, hparams.n_head), [0, 2, 1, 3])

    def merge_heads(t):
        # Reverse of split_heads.
        return merge_states(tf.transpose(t, [0, 2, 1, 3]))

    def mask_attn_weights(w):
        # w: [batch, heads, dst_sequence, src_sequence]; info flows src -> dst.
        _, _, nd, ns = shape_list(w)
        causal = tf.reshape(attention_mask(nd, ns, dtype = w.dtype), [1, 1, nd, ns])
        return w * causal - tf.cast(1e10, w.dtype) * (1 - causal)

    def multihead_attn(q, k, v):
        # q, k, v: [batch, heads, sequence, features]
        scores = tf.matmul(q, k, transpose_b = True)
        scores = scores * tf.rsqrt(tf.cast(v.shape[-1].value, scores.dtype))
        weights = softmax(mask_attn_weights(scores))
        return tf.matmul(weights, v)

    with tf.variable_scope(scope):
        qkv = conv1d(x, 'c_attn', n_state * 3)
        q, k, v = map(split_heads, tf.split(qkv, 3, axis = 2))
        present = tf.stack([k, v], axis = 1)
        if past is not None:
            pk, pv = tf.unstack(past, axis = 1)
            k = tf.concat([pk, k], axis = -2)
            v = tf.concat([pv, v], axis = -2)
        a = conv1d(merge_heads(multihead_attn(q, k, v)), 'c_proj', n_state)
        return a, present


def mlp(x, scope, n_state, *, hparams):
    """Position-wise feed-forward: conv1d -> gelu -> conv1d back to nx."""
    with tf.variable_scope(scope):
        nx = x.shape[-1].value
        return conv1d(gelu(conv1d(x, 'c_fc', n_state)), 'c_proj', nx)


def block(x, scope, *, past, hparams):
    """One transformer block: pre-norm attention and MLP, each residual."""
    with tf.variable_scope(scope):
        nx = x.shape[-1].value
        a, present = attn(
            norm(x, 'ln_1'), 'attn', nx, past = past, hparams = hparams
        )
        x = x + a
        x = x + mlp(norm(x, 'ln_2'), 'mlp', nx * 4, hparams = hparams)
        return x, present


def past_shape(*, hparams, batch_size = None, sequence = None):
    """Static shape of the cached attention state."""
    return [
        batch_size,
        hparams.n_layer,
        2,
        hparams.n_head,
        sequence,
        hparams.n_embd // hparams.n_head,
    ]


def expand_tile(value, size):
    """Add a new leading axis of given size."""
    value = tf.convert_to_tensor(value, name = 'value')
    ndims = value.shape.ndims
    return tf.tile(tf.expand_dims(value, axis = 0), [size] + [1] * ndims)
* ndims) 166 | 167 | 168 | def positions_for(tokens, past_length): 169 | batch_size = tf.shape(tokens)[0] 170 | nsteps = tf.shape(tokens)[1] 171 | return expand_tile(past_length + tf.range(nsteps), batch_size) 172 | 173 | 174 | def model(hparams, X, past = None, scope = 'model', reuse = False): 175 | with tf.variable_scope(scope, reuse = reuse): 176 | results = {} 177 | batch, sequence = shape_list(X) 178 | 179 | wpe = tf.get_variable( 180 | 'wpe', 181 | [hparams.n_ctx, hparams.n_embd], 182 | initializer = tf.random_normal_initializer(stddev = 0.01), 183 | ) 184 | wte = tf.get_variable( 185 | 'wte', 186 | [hparams.n_vocab, hparams.n_embd], 187 | initializer = tf.random_normal_initializer(stddev = 0.02), 188 | ) 189 | past_length = 0 if past is None else tf.shape(past)[-2] 190 | h = tf.gather(wte, X) + tf.gather(wpe, positions_for(X, past_length)) 191 | 192 | # Transformer 193 | presents = [] 194 | pasts = ( 195 | tf.unstack(past, axis = 1) 196 | if past is not None 197 | else [None] * hparams.n_layer 198 | ) 199 | assert len(pasts) == hparams.n_layer 200 | for layer, past in enumerate(pasts): 201 | h, present = block(h, 'h%d' % layer, past = past, hparams = hparams) 202 | presents.append(present) 203 | results['present'] = tf.stack(presents, axis = 1) 204 | h = norm(h, 'ln_f') 205 | 206 | # Language model loss. 
Do tokens {l['sentence2']}\"\n", 112 | " train_X.append(s)\n", 113 | " train_Y.append(l['gold_label'])\n", 114 | " except:\n", 115 | " pass" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 8, 121 | "metadata": {}, 122 | "outputs": [ 123 | { 124 | "name": "stderr", 125 | "output_type": "stream", 126 | "text": [ 127 | "100%|██████████| 20002/20002 [00:00<00:00, 93673.10it/s]\n" 128 | ] 129 | } 130 | ], 131 | "source": [ 132 | "test_X, test_Y = [], []\n", 133 | "\n", 134 | "for i in tqdm(range(len(dev))):\n", 135 | " try:\n", 136 | " l = json.loads(dev[i])\n", 137 | " if l['gold_label'] not in labels:\n", 138 | " continue\n", 139 | " if len(l['sentence1']) and len(l['sentence2']):\n", 140 | " s = f\"{l['sentence1']} <> {l['sentence2']}\"\n", 141 | " test_X.append(s)\n", 142 | " test_Y.append(l['gold_label'])\n", 143 | " except:\n", 144 | " pass" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 9, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "import youtokentome as yttm\n", 154 | "\n", 155 | "with open('out.txt', 'w') as fopen:\n", 156 | " fopen.write('\\n'.join(test_X + train_X))\n", 157 | " \n", 158 | "yttm.BPE.train(data='out.txt', vocab_size=30000, model='vocab.model')\n", 159 | "bpe = yttm.BPE(model='vocab.model')" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 10, 165 | "metadata": {}, 166 | "outputs": [ 167 | { 168 | "data": { 169 | "text/plain": [ 170 | "['', '', '', '']" 171 | ] 172 | }, 173 | "execution_count": 10, 174 | "metadata": {}, 175 | "output_type": "execute_result" 176 | } 177 | ], 178 | "source": [ 179 | "bpe.vocab()[:4]" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 11, 185 | "metadata": {}, 186 | "outputs": [ 187 | { 188 | "data": { 189 | "text/plain": [ 190 | "['halo halo']" 191 | ] 192 | }, 193 | "execution_count": 11, 194 | "metadata": {}, 195 | "output_type": "execute_result" 196 | } 197 | ], 198 | 
"source": [ 199 | "bpe.decode(bpe.encode('halo') + [2] + bpe.encode('halo'))" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 15, 205 | "metadata": {}, 206 | "outputs": [ 207 | { 208 | "name": "stderr", 209 | "output_type": "stream", 210 | "text": [ 211 | "100%|██████████| 261802/261802 [00:09<00:00, 26791.84it/s]\n" 212 | ] 213 | } 214 | ], 215 | "source": [ 216 | "left_train, right_train, label_train = [], [], []\n", 217 | "\n", 218 | "for i in tqdm(range(len(train_X))):\n", 219 | " l, r = train_X[i].split(' <> ')\n", 220 | " left_train.append(bpe.encode(l))\n", 221 | " right_train.append(bpe.encode(r))\n", 222 | " label_train.append(labels.index(train_Y[i]))" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 16, 228 | "metadata": {}, 229 | "outputs": [ 230 | { 231 | "name": "stderr", 232 | "output_type": "stream", 233 | "text": [ 234 | "100%|██████████| 13395/13395 [00:00<00:00, 29595.87it/s]\n" 235 | ] 236 | } 237 | ], 238 | "source": [ 239 | "left_test, right_test, label_test = [], [], []\n", 240 | "\n", 241 | "for i in tqdm(range(len(test_X))):\n", 242 | " l, r = test_X[i].split(' <> ')\n", 243 | " try:\n", 244 | " label_test.append(labels.index(test_Y[i]))\n", 245 | " left_test.append(bpe.encode(l))\n", 246 | " right_test.append(bpe.encode(r))\n", 247 | " except:\n", 248 | " pass" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 17, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "with open('contrastive.json', 'w') as fopen:\n", 258 | " json.dump({'left_train': left_train,\n", 259 | " 'right_train': right_train,\n", 260 | " 'label_train': label_train,\n", 261 | " 'left_test': left_test,\n", 262 | " 'right_test': right_test,\n", 263 | " 'label_test': label_test}, fopen)" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 18, 269 | "metadata": {}, 270 | "outputs": [ 271 | { 272 | "name": "stderr", 273 | "output_type": "stream", 274 | 
"text": [ 275 | "100%|██████████| 261802/261802 [00:09<00:00, 26215.21it/s]\n" 276 | ] 277 | } 278 | ], 279 | "source": [ 280 | "left_train, label_train = [], []\n", 281 | "\n", 282 | "for i in tqdm(range(len(train_X))):\n", 283 | " l, r = train_X[i].split(' <> ')\n", 284 | " left_train.append(bpe.encode(l) + [2] + bpe.encode(r))\n", 285 | " label_train.append(labels.index(train_Y[i]))" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 19, 291 | "metadata": {}, 292 | "outputs": [ 293 | { 294 | "name": "stderr", 295 | "output_type": "stream", 296 | "text": [ 297 | "100%|██████████| 13395/13395 [00:00<00:00, 13604.82it/s]\n" 298 | ] 299 | } 300 | ], 301 | "source": [ 302 | "left_test, label_test = [], []\n", 303 | "\n", 304 | "for i in tqdm(range(len(test_X))):\n", 305 | " try:\n", 306 | " l, r = test_X[i].split(' <> ')\n", 307 | " label_test.append(labels.index(test_Y[i]))\n", 308 | " left_test.append(bpe.encode(l) + [2] + bpe.encode(r))\n", 309 | " except:\n", 310 | " pass" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 20, 316 | "metadata": {}, 317 | "outputs": [], 318 | "source": [ 319 | "with open('pair.json', 'w') as fopen:\n", 320 | " json.dump({'left_train': left_train,\n", 321 | " 'label_train': label_train,\n", 322 | " 'left_test': left_test,\n", 323 | " 'label_test': label_test}, fopen)" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 21, 329 | "metadata": {}, 330 | "outputs": [], 331 | "source": [ 332 | "with open('text.json', 'w') as fopen:\n", 333 | " json.dump({'train_X': train_X,\n", 334 | " 'train_Y': train_Y,\n", 335 | " 'test_X': test_X,\n", 336 | " 'test_Y': test_Y}, fopen)" 337 | ] 338 | } 339 | ], 340 | "metadata": { 341 | "kernelspec": { 342 | "display_name": "Python 3", 343 | "language": "python", 344 | "name": "python3" 345 | }, 346 | "language_info": { 347 | "codemirror_mode": { 348 | "name": "ipython", 349 | "version": 3 350 | }, 351 | "file_extension": 
".py", 352 | "mimetype": "text/x-python", 353 | "name": "python", 354 | "nbconvert_exporter": "python", 355 | "pygments_lexer": "ipython3", 356 | "version": "3.6.8" 357 | } 358 | }, 359 | "nbformat": 4, 360 | "nbformat_minor": 2 361 | } 362 | -------------------------------------------------------------------------------- /text-to-speech/1.tacotron/caching.py: -------------------------------------------------------------------------------- 1 | import tqdm 2 | import os 3 | import numpy as np 4 | from utils import load_file, path 5 | 6 | if not os.path.exists('mel'): 7 | os.mkdir('mel') 8 | if not os.path.exists('mag'): 9 | os.mkdir('mag') 10 | wav_files = [f for f in os.listdir(path) if f.endswith('.wav')] 11 | for fpath in tqdm.tqdm(wav_files): 12 | fname, mel, mag = load_file(path + fpath) 13 | np.save('mel/{}'.format(fname.replace('wav', 'npy')), mel) 14 | np.save('mag/{}'.format(fname.replace('wav', 'npy')), mag) 15 | -------------------------------------------------------------------------------- /text-to-speech/1.tacotron/test-tacotron.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mesolitica/NLP-Models-Tensorflow/0741216aa8235e1228b3de7903cc36d73f8f2b45/text-to-speech/1.tacotron/test-tacotron.wav -------------------------------------------------------------------------------- /text-to-speech/1.tacotron/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import librosa 3 | import copy 4 | from scipy import signal 5 | from scipy.io import wavfile 6 | import matplotlib.pyplot as plt 7 | import seaborn as sns 8 | import os 9 | import unicodedata 10 | import re 11 | 12 | # P: Padding 13 | # E: End of Sentence 14 | path = '../data/' 15 | vocab = "PE abcdefghijklmnopqrstuvwxyz'.?" 
16 | max_duration = 10.0 17 | sample_rate = 22050  # Hz 18 | fourier_window_size = 2048  # STFT n_fft 19 | frame_shift = 0.0125  # seconds between successive frames 20 | frame_length = 0.05  # seconds per analysis window 21 | hop_length = int(sample_rate * frame_shift) 22 | win_length = int(sample_rate * frame_length) 23 | n_mels = 80 24 | power = 1.2 25 | iteration_griffin = 50  # Griffin-Lim iterations 26 | preemphasis = 0.97  # pre-emphasis filter coefficient 27 | max_db = 100 28 | ref_db = 20 29 | embed_size = 256 30 | encoder_num_banks = 16 31 | decoder_num_banks = 8 32 | num_highwaynet_blocks = 4 33 | resampled = 5  # reduction factor: mel frames grouped per decoder step (see load_file) 34 | dropout_rate = 0.5 35 | learning_rate = 0.001 36 | batch_size = 32 37 | 38 | 39 | def get_spectrogram(audio_file):  # load a wav and return (mel, mag), both time-major float32, normalised into (0, 1] 40 | y, sr = librosa.load(audio_file, sr = sample_rate) 41 | y, _ = librosa.effects.trim(y)  # strip leading/trailing silence 42 | y = np.append(y[0], y[1:] - preemphasis * y[:-1])  # pre-emphasis filter 43 | linear = librosa.stft( 44 | y = y, 45 | n_fft = fourier_window_size, 46 | hop_length = hop_length, 47 | win_length = win_length, 48 | ) 49 | mag = np.abs(linear) 50 | mel_basis = librosa.filters.mel(sample_rate, fourier_window_size, n_mels) 51 | mel = np.dot(mel_basis, mag) 52 | mel = 20 * np.log10(np.maximum(1e-5, mel))  # amplitude -> dB, floored to avoid log(0) 53 | mag = 20 * np.log10(np.maximum(1e-5, mag)) 54 | mel = np.clip((mel - ref_db + max_db) / max_db, 1e-8, 1)  # normalise dB range into (0, 1] 55 | mag = np.clip((mag - ref_db + max_db) / max_db, 1e-8, 1) 56 | return mel.T.astype(np.float32), mag.T.astype(np.float32) 57 | 58 | 59 | def invert_spectrogram(spectrogram):  # inverse STFT back to a waveform 60 | return librosa.istft( 61 | spectrogram, hop_length, win_length = win_length, window = 'hann' 62 | ) 63 | 64 | 65 | def spectrogram2wav(mag):  # reconstruct audio from a normalised magnitude spectrogram 66 | mag = mag.T 67 | mag = (np.clip(mag, 0, 1) * max_db) - max_db + ref_db  # undo normalisation back to dB 68 | mag = np.power(10.0, mag * 0.05)  # dB -> linear amplitude 69 | wav = griffin_lim(mag) 70 | wav = signal.lfilter([1], [1, -preemphasis], wav)  # de-emphasis (inverse of the pre-emphasis filter) 71 | wav, _ = librosa.effects.trim(wav) 72 | return wav.astype(np.float32) 73 | 74 | 75 | def griffin_lim(spectrogram):  # iterative phase reconstruction from magnitudes 76 | X_best = copy.deepcopy(spectrogram) 77 | for i in range(iteration_griffin): 78 | X_T = invert_spectrogram(X_best) 79 | est = librosa.stft( 80 | X_T, fourier_window_size,
hop_length, win_length = win_length 81 | ) 82 | phase = est / np.maximum(1e-8, np.abs(est)) 83 | X_best = spectrogram * phase 84 | X_T = invert_spectrogram(X_best) 85 | return np.real(X_T) 86 | 87 | 88 | def get_wav(spectrogram): 89 | mag = (np.clip(spectrogram.T, 0, 1) * max_db) - max_db + ref_db 90 | mag = np.power(10.0, mag * 0.05) 91 | wav = griffin_lim(mag) 92 | wav = signal.lfilter([1], [1, -preemphasis], wav) 93 | return librosa.effects.trim(wav).astype(np.float32) 94 | 95 | 96 | def load_file(path): 97 | fname = os.path.basename(path) 98 | mel, mag = get_spectrogram(path) 99 | t = mel.shape[0] 100 | num_paddings = resampled - (t % resampled) if t % resampled != 0 else 0 101 | mel = np.pad(mel, [[0, num_paddings], [0, 0]], mode = 'constant') 102 | mag = np.pad(mag, [[0, num_paddings], [0, 0]], mode = 'constant') 103 | return fname, mel.reshape((-1, n_mels * resampled)), mag 104 | 105 | 106 | def text_normalize(text): 107 | text = ''.join( 108 | char 109 | for char in unicodedata.normalize('NFD', text) 110 | if unicodedata.category(char) != 'Mn' 111 | ) 112 | text = text.lower() 113 | text = re.sub('[^{}]'.format(vocab), ' ', text) 114 | text = re.sub('[ ]+', ' ', text) 115 | return text 116 | 117 | 118 | def get_cached(path): 119 | mel = 'mel/{}.npy'.format(path) 120 | mag = 'mag/{}.npy'.format(path) 121 | return np.load(mel), np.load(mag) 122 | 123 | def plot_alignment(alignment): 124 | fig, ax = plt.subplots() 125 | im = ax.imshow(alignment) 126 | fig.colorbar(im) 127 | plt.show() 128 | 129 | char2idx = {char: idx for idx, char in enumerate(vocab)} 130 | idx2char = {idx: char for idx, char in enumerate(vocab)} 131 | -------------------------------------------------------------------------------- /text-to-speech/README.md: -------------------------------------------------------------------------------- 1 | ## How-to 2 | 3 | 1. First, you need to run [download.ipynb](download.ipynb) 4 | 5 | 2. 
Go to any training folder, and run [caching.py](caching.py) 6 | 7 | 3. Run any notebook using Jupyter Notebook. 8 | -------------------------------------------------------------------------------- /text-to-speech/caching.py: -------------------------------------------------------------------------------- 1 | import tqdm 2 | import os 3 | import numpy as np 4 | from utils import load_file, path 5 | 6 | if not os.path.exists('mel'): 7 | os.mkdir('mel') 8 | if not os.path.exists('mag'): 9 | os.mkdir('mag') 10 | wav_files = [f for f in os.listdir(path) if f.endswith('.wav')] 11 | for fpath in tqdm.tqdm(wav_files): 12 | fname, mel, mag = load_file(path + fpath) 13 | np.save('mel/{}'.format(fname.replace('wav', 'npy')), mel) 14 | np.save('mag/{}'.format(fname.replace('wav', 'npy')), mag) 15 | -------------------------------------------------------------------------------- /text-to-speech/download.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from bs4 import BeautifulSoup\n", 10 | "from urllib.request import urlopen, urlretrieve\n", 11 | "from tqdm import tqdm\n", 12 | "import re\n", 13 | "import os" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stderr", 23 | "output_type": "stream", 24 | "text": [ 25 | "100%|███████████████████████████████| 200/200 [05:56<00:00, 1.84s/it]\n", 26 | "100%|███████████████████████████████| 200/200 [05:38<00:00, 1.69s/it]\n", 27 | "100%|███████████████████████████████| 200/200 [06:16<00:00, 1.82s/it]\n", 28 | "100%|███████████████████████████████| 200/200 [06:00<00:00, 1.76s/it]\n", 29 | "100%|███████████████████████████████| 200/200 [06:46<00:00, 2.47s/it]\n", 30 | "100%|███████████████████████████████| 200/200 [09:04<00:00, 2.60s/it]\n", 31 | 
"100%|███████████████████████████████| 200/200 [10:12<00:00, 2.87s/it]\n", 32 | "100%|███████████████████████████████| 200/200 [09:01<00:00, 2.63s/it]\n", 33 | "100%|███████████████████████████████| 200/200 [09:39<00:00, 3.47s/it]\n", 34 | "100%|███████████████████████████████| 200/200 [10:56<00:00, 3.04s/it]\n", 35 | "100%|███████████████████████████████| 200/200 [11:12<00:00, 3.06s/it]\n", 36 | "100%|███████████████████████████████| 200/200 [07:46<00:00, 2.32s/it]\n", 37 | "100%|███████████████████████████████| 200/200 [09:30<00:00, 2.83s/it]\n", 38 | "100%|███████████████████████████████| 200/200 [10:05<00:00, 3.83s/it]\n" 39 | ] 40 | } 41 | ], 42 | "source": [ 43 | "prefix = 'https://tspace.library.utoronto.ca'\n", 44 | "save_dir = './data/'\n", 45 | "if not os.path.exists(save_dir):\n", 46 | " os.makedirs(save_dir)\n", 47 | "\n", 48 | "base_url = 'https://tspace.library.utoronto.ca/handle/1807/24'\n", 49 | "urls = [base_url+str(i) for i in range(488, 502)]\n", 50 | "for url in urls:\n", 51 | " soup = BeautifulSoup(urlopen(url).read(), 'html5lib')\n", 52 | " targets = soup.findAll('a', href=re.compile(r'/bitstream/.*.wav'))\n", 53 | " \n", 54 | " for a in tqdm(targets, total=len(targets), ncols=70):\n", 55 | " link = a['href']\n", 56 | "\n", 57 | " audio_save_loc = save_dir + link.split('/')[-1]\n", 58 | " if os.path.isfile(audio_save_loc):\n", 59 | " print(\"File Already Exists\")\n", 60 | " urlretrieve(prefix+a['href'], audio_save_loc)\n", 61 | "\n", 62 | " with open(audio_save_loc.replace('.wav', '.txt'), 'w') as f:\n", 63 | " f.write('say the word ' + link.split('_')[-2])" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [] 72 | } 73 | ], 74 | "metadata": { 75 | "kernelspec": { 76 | "display_name": "Python 3", 77 | "language": "python", 78 | "name": "python3" 79 | }, 80 | "language_info": { 81 | "codemirror_mode": { 82 | "name": "ipython", 83 | "version": 3 84 | }, 85 | 
"file_extension": ".py", 86 | "mimetype": "text/x-python", 87 | "name": "python", 88 | "nbconvert_exporter": "python", 89 | "pygments_lexer": "ipython3", 90 | "version": "3.5.2" 91 | } 92 | }, 93 | "nbformat": 4, 94 | "nbformat_minor": 2 95 | } 96 | -------------------------------------------------------------------------------- /text-to-speech/test-bahdanau.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mesolitica/NLP-Models-Tensorflow/0741216aa8235e1228b3de7903cc36d73f8f2b45/text-to-speech/test-bahdanau.wav -------------------------------------------------------------------------------- /text-to-speech/test-dilated-cnn-monothonic-attention.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mesolitica/NLP-Models-Tensorflow/0741216aa8235e1228b3de7903cc36d73f8f2b45/text-to-speech/test-dilated-cnn-monothonic-attention.wav -------------------------------------------------------------------------------- /text-to-speech/test-dilated-cnn-self-attention.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mesolitica/NLP-Models-Tensorflow/0741216aa8235e1228b3de7903cc36d73f8f2b45/text-to-speech/test-dilated-cnn-self-attention.wav -------------------------------------------------------------------------------- /text-to-speech/test-luong.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mesolitica/NLP-Models-Tensorflow/0741216aa8235e1228b3de7903cc36d73f8f2b45/text-to-speech/test-luong.wav -------------------------------------------------------------------------------- /text-to-speech/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import librosa 3 | import copy 4 | from scipy import signal 5 | from scipy.io import wavfile 6 | import 
matplotlib.pyplot as plt 7 | import seaborn as sns 8 | import os 9 | import unicodedata 10 | import re 11 | 12 | # P: Padding 13 | # S: Start of Sentence 14 | # E: End of Sentence 15 | path = '../data/' 16 | vocab = "PSE abcdefghijklmnopqrstuvwxyz'.?" 17 | max_duration = 10.0 18 | sample_rate = 22050  # Hz 19 | fourier_window_size = 2048  # STFT n_fft 20 | frame_shift = 0.0125  # seconds between successive frames 21 | frame_length = 0.05  # seconds per analysis window 22 | hop_length = int(sample_rate * frame_shift) 23 | win_length = int(sample_rate * frame_length) 24 | n_mels = 80 25 | power = 1.2 26 | iteration_griffin = 50  # Griffin-Lim iterations 27 | preemphasis = 0.97  # pre-emphasis filter coefficient 28 | max_db = 100 29 | ref_db = 20 30 | embed_size = 256 31 | encoder_num_banks = 16 32 | decoder_num_banks = 8 33 | num_highwaynet_blocks = 4 34 | resampled = 5  # reduction factor: mel frames grouped per decoder step (see load_file) 35 | dropout_rate = 0.5 36 | learning_rate = 0.001 37 | batch_size = 32 38 | 39 | 40 | def get_spectrogram(audio_file):  # load a wav and return (mel, mag), both time-major float32, normalised into (0, 1] 41 | y, sr = librosa.load(audio_file, sr = sample_rate) 42 | y, _ = librosa.effects.trim(y)  # strip leading/trailing silence 43 | y = np.append(y[0], y[1:] - preemphasis * y[:-1])  # pre-emphasis filter 44 | linear = librosa.stft( 45 | y = y, 46 | n_fft = fourier_window_size, 47 | hop_length = hop_length, 48 | win_length = win_length, 49 | ) 50 | mag = np.abs(linear) 51 | mel_basis = librosa.filters.mel(sample_rate, fourier_window_size, n_mels) 52 | mel = np.dot(mel_basis, mag) 53 | mel = 20 * np.log10(np.maximum(1e-5, mel))  # amplitude -> dB, floored to avoid log(0) 54 | mag = 20 * np.log10(np.maximum(1e-5, mag)) 55 | mel = np.clip((mel - ref_db + max_db) / max_db, 1e-8, 1)  # normalise dB range into (0, 1] 56 | mag = np.clip((mag - ref_db + max_db) / max_db, 1e-8, 1) 57 | return mel.T.astype(np.float32), mag.T.astype(np.float32) 58 | 59 | 60 | def invert_spectrogram(spectrogram):  # inverse STFT back to a waveform 61 | return librosa.istft( 62 | spectrogram, hop_length, win_length = win_length, window = 'hann' 63 | ) 64 | 65 | 66 | def spectrogram2wav(mag):  # reconstruct audio from a normalised magnitude spectrogram 67 | mag = mag.T 68 | mag = (np.clip(mag, 0, 1) * max_db) - max_db + ref_db  # undo normalisation back to dB 69 | mag = np.power(10.0, mag * 0.05)  # dB -> linear amplitude 70 | wav = griffin_lim(mag) 71 | wav = signal.lfilter([1], [1, -preemphasis], wav)  # de-emphasis (inverse of the pre-emphasis filter) 72 | wav, _ = librosa.effects.trim(wav) 73
| return wav.astype(np.float32) 74 | 75 | 76 | def griffin_lim(spectrogram): 77 | X_best = copy.deepcopy(spectrogram) 78 | for i in range(iteration_griffin): 79 | X_T = invert_spectrogram(X_best) 80 | est = librosa.stft( 81 | X_T, fourier_window_size, hop_length, win_length = win_length 82 | ) 83 | phase = est / np.maximum(1e-8, np.abs(est)) 84 | X_best = spectrogram * phase 85 | X_T = invert_spectrogram(X_best) 86 | return np.real(X_T) 87 | 88 | 89 | def get_wav(spectrogram): 90 | mag = (np.clip(spectrogram.T, 0, 1) * max_db) - max_db + ref_db 91 | mag = np.power(10.0, mag * 0.05) 92 | wav = griffin_lim(mag) 93 | wav = signal.lfilter([1], [1, -preemphasis], wav) 94 | return librosa.effects.trim(wav).astype(np.float32) 95 | 96 | 97 | def load_file(path): 98 | fname = os.path.basename(path) 99 | mel, mag = get_spectrogram(path) 100 | t = mel.shape[0] 101 | num_paddings = resampled - (t % resampled) if t % resampled != 0 else 0 102 | mel = np.pad(mel, [[0, num_paddings], [0, 0]], mode = 'constant') 103 | mag = np.pad(mag, [[0, num_paddings], [0, 0]], mode = 'constant') 104 | return fname, mel.reshape((-1, n_mels * resampled)), mag 105 | 106 | 107 | def text_normalize(text): 108 | text = ''.join( 109 | char 110 | for char in unicodedata.normalize('NFD', text) 111 | if unicodedata.category(char) != 'Mn' 112 | ) 113 | text = text.lower() 114 | text = re.sub('[^{}]'.format(vocab), ' ', text) 115 | text = re.sub('[ ]+', ' ', text) 116 | return text 117 | 118 | 119 | def get_cached(path): 120 | mel = 'mel/{}.npy'.format(path) 121 | mag = 'mag/{}.npy'.format(path) 122 | return np.load(mel), np.load(mag) 123 | 124 | def plot_alignment(alignment, e): 125 | fig, ax = plt.subplots() 126 | im = ax.imshow(alignment) 127 | fig.colorbar(im) 128 | plt.title('epoch %d' % (e)) 129 | plt.show() 130 | 131 | char2idx = {char: idx for idx, char in enumerate(vocab)} 132 | idx2char = {idx: char for idx, char in enumerate(vocab)} 133 | 
-------------------------------------------------------------------------------- /topic-generator/README.md: -------------------------------------------------------------------------------- 1 | ## How-to 2 | 3 | 1. Download dataset, 4 | ```bash 5 | wget https://github.com/huseinzol05/Malaya-Dataset/raw/master/news/news.zip 6 | ``` 7 | 8 | 2. Unzip the dataset, 9 | ```bash 10 | unzip news.zip 11 | ``` 12 | 13 | 3. Run any notebook using Jupyter Notebook. 14 | -------------------------------------------------------------------------------- /topic-model/prepro_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | 6 | import unicodedata 7 | import six 8 | from functools import partial 9 | 10 | 11 | SPIECE_UNDERLINE = '▁' 12 | 13 | 14 | def printable_text(text): 15 | """Returns text encoded in a way suitable for print or `tf.logging`.""" 16 | 17 | # These functions want `str` for both Python2 and Python3, but in one case 18 | # it's a Unicode string and in the other it's a byte string. 
19 | if six.PY3: 20 | if isinstance(text, str): 21 | return text 22 | elif isinstance(text, bytes): 23 | return text.decode('utf-8', 'ignore') 24 | else: 25 | raise ValueError('Unsupported string type: %s' % (type(text))) 26 | elif six.PY2: 27 | if isinstance(text, str): 28 | return text 29 | elif isinstance(text, unicode): 30 | return text.encode('utf-8') 31 | else: 32 | raise ValueError('Unsupported string type: %s' % (type(text))) 33 | else: 34 | raise ValueError('Not running on Python2 or Python 3?') 35 | 36 | 37 | def print_(*args): 38 | new_args = [] 39 | for arg in args: 40 | if isinstance(arg, list): 41 | s = [printable_text(i) for i in arg] 42 | s = ' '.join(s) 43 | new_args.append(s) 44 | else: 45 | new_args.append(printable_text(arg)) 46 | print(*new_args) 47 | 48 | 49 | def preprocess_text( 50 | inputs, lower = False, remove_space = True, keep_accents = False 51 | ): 52 | if remove_space: 53 | outputs = ' '.join(inputs.strip().split()) 54 | else: 55 | outputs = inputs 56 | outputs = outputs.replace('``', '"').replace("''", '"') 57 | 58 | if six.PY2 and isinstance(outputs, str): 59 | outputs = outputs.decode('utf-8') 60 | 61 | if not keep_accents: 62 | outputs = unicodedata.normalize('NFKD', outputs) 63 | outputs = ''.join([c for c in outputs if not unicodedata.combining(c)]) 64 | if lower: 65 | outputs = outputs.lower() 66 | 67 | return outputs 68 | 69 | 70 | def encode_pieces(sp_model, text, return_unicode = True, sample = False): 71 | # return_unicode is used only for py2 72 | 73 | # note(zhiliny): in some systems, sentencepiece only accepts str for py2 74 | if six.PY2 and isinstance(text, unicode): 75 | text = text.encode('utf-8') 76 | 77 | if not sample: 78 | pieces = sp_model.EncodeAsPieces(text) 79 | else: 80 | pieces = sp_model.SampleEncodeAsPieces(text, 64, 0.1) 81 | new_pieces = [] 82 | for piece in pieces: 83 | if len(piece) > 1 and piece[-1] == ',' and piece[-2].isdigit(): 84 | cur_pieces = sp_model.EncodeAsPieces( 85 | 
piece[:-1].replace(SPIECE_UNDERLINE, '') 86 | ) 87 | if ( 88 | piece[0] != SPIECE_UNDERLINE 89 | and cur_pieces[0][0] == SPIECE_UNDERLINE 90 | ): 91 | if len(cur_pieces[0]) == 1: 92 | cur_pieces = cur_pieces[1:] 93 | else: 94 | cur_pieces[0] = cur_pieces[0][1:] 95 | cur_pieces.append(piece[-1]) 96 | new_pieces.extend(cur_pieces) 97 | else: 98 | new_pieces.append(piece) 99 | 100 | # note(zhiliny): convert back to unicode for py2 101 | if six.PY2 and return_unicode: 102 | ret_pieces = [] 103 | for piece in new_pieces: 104 | if isinstance(piece, str): 105 | piece = piece.decode('utf-8') 106 | ret_pieces.append(piece) 107 | new_pieces = ret_pieces 108 | 109 | return new_pieces 110 | 111 | 112 | def encode_ids(sp_model, text, sample = False): 113 | pieces = encode_pieces( 114 | sp_model, text, return_unicode = False, sample = sample 115 | ) 116 | ids = [sp_model.PieceToId(piece) for piece in pieces] 117 | return ids 118 | 119 | 120 | if __name__ == '__main__': 121 | import sentencepiece as spm 122 | 123 | sp = spm.SentencePieceProcessor() 124 | sp.load('sp10m.uncased.v3.model') 125 | 126 | print_(u'I was born in 2000, and this is falsé.') 127 | print_( 128 | u'ORIGINAL', 129 | sp.EncodeAsPieces(u'I was born in 2000, and this is falsé.'), 130 | ) 131 | print_( 132 | u'OURS', encode_pieces(sp, u'I was born in 2000, and this is falsé.') 133 | ) 134 | print(encode_ids(sp, u'I was born in 2000, and this is falsé.')) 135 | print_('') 136 | prepro_func = partial(preprocess_text, lower = True) 137 | print_(prepro_func('I was born in 2000, and this is falsé.')) 138 | print_( 139 | 'ORIGINAL', 140 | sp.EncodeAsPieces( 141 | prepro_func('I was born in 2000, and this is falsé.') 142 | ), 143 | ) 144 | print_( 145 | 'OURS', 146 | encode_pieces( 147 | sp, prepro_func('I was born in 2000, and this is falsé.') 148 | ), 149 | ) 150 | print(encode_ids(sp, prepro_func('I was born in 2000, and this is falsé.'))) 151 | print_('') 152 | print_('I was born in 2000, and this is falsé.') 153 | 
print_( 154 | 'ORIGINAL', sp.EncodeAsPieces('I was born in 2000, and this is falsé.') 155 | ) 156 | print_('OURS', encode_pieces(sp, 'I was born in 2000, and this is falsé.')) 157 | print(encode_ids(sp, 'I was born in 2000, and this is falsé.')) 158 | print_('') 159 | print_('I was born in 92000, and this is falsé.') 160 | print_( 161 | 'ORIGINAL', sp.EncodeAsPieces('I was born in 92000, and this is falsé.') 162 | ) 163 | print_('OURS', encode_pieces(sp, 'I was born in 92000, and this is falsé.')) 164 | print(encode_ids(sp, 'I was born in 92000, and this is falsé.')) 165 | -------------------------------------------------------------------------------- /topic-model/utils.py: -------------------------------------------------------------------------------- 1 | import sklearn.datasets 2 | import numpy as np 3 | import re 4 | import collections 5 | import random 6 | from sklearn import metrics 7 | from nltk.corpus import stopwords 8 | 9 | english_stopwords = stopwords.words('english') 10 | 11 | 12 | def clearstring(string): 13 | string = re.sub('[^A-Za-z0-9 ]+', '', string) 14 | string = string.split(' ') 15 | string = filter(None, string) 16 | string = [y.strip() for y in string if y.strip() not in english_stopwords] 17 | string = ' '.join(string) 18 | return string.lower() 19 | 20 | 21 | def separate_dataset(trainset, ratio = 0.5): 22 | datastring = [] 23 | datatarget = [] 24 | for i in range(len(trainset.data)): 25 | data_ = trainset.data[i].split('\n') 26 | data_ = list(filter(None, data_)) 27 | data_ = random.sample(data_, int(len(data_) * ratio)) 28 | for n in range(len(data_)): 29 | data_[n] = clearstring(data_[n]) 30 | datastring += data_ 31 | for n in range(len(data_)): 32 | datatarget.append(trainset.target[i]) 33 | return datastring, datatarget 34 | 35 | 36 | def build_dataset(words, n_words): 37 | count = [['GO', 0], ['PAD', 1], ['EOS', 2], ['UNK', 3]] 38 | count.extend(collections.Counter(words).most_common(n_words - 1)) 39 | dictionary = dict() 40 | for 
word, _ in count: 41 | dictionary[word] = len(dictionary) 42 | data = list() 43 | unk_count = 0 44 | for word in words: 45 | index = dictionary.get(word, 0) 46 | if index == 0: 47 | unk_count += 1 48 | data.append(index) 49 | count[0][1] = unk_count 50 | reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys())) 51 | return data, count, dictionary, reversed_dictionary 52 | 53 | 54 | def str_idx(corpus, dic, maxlen, UNK = 3): 55 | X = np.zeros((len(corpus), maxlen)) 56 | for i in range(len(corpus)): 57 | for no, k in enumerate(corpus[i].split()[:maxlen][::-1]): 58 | X[i, -1 - no] = dic.get(k, UNK) 59 | return X 60 | -------------------------------------------------------------------------------- /unsupervised-extractive-summarization/README.md: -------------------------------------------------------------------------------- 1 | ## How-to 2 | 3 | 1. Run any notebook using Jupyter Notebook. 4 | -------------------------------------------------------------------------------- /vectorizer/README.md: -------------------------------------------------------------------------------- 1 | ## How-to 2 | 3 | 1. Make sure `data` folder in the same directory of the notebooks. 4 | 5 | 2. Run any notebook using Jupyter Notebook. 
6 | -------------------------------------------------------------------------------- /vectorizer/utils.py: -------------------------------------------------------------------------------- 1 | import sklearn.datasets 2 | import numpy as np 3 | import re 4 | import collections 5 | import random 6 | from sklearn import metrics 7 | from nltk.corpus import stopwords 8 | 9 | english_stopwords = stopwords.words('english') 10 | 11 | 12 | def clearstring(string): 13 | string = re.sub('[^A-Za-z0-9 ]+', '', string) 14 | string = string.split(' ') 15 | string = filter(None, string) 16 | string = [y.strip() for y in string if y.strip() not in english_stopwords] 17 | string = ' '.join(string) 18 | return string.lower() 19 | 20 | 21 | def separate_dataset(trainset, ratio = 0.5): 22 | datastring = [] 23 | datatarget = [] 24 | for i in range(len(trainset.data)): 25 | data_ = trainset.data[i].split('\n') 26 | data_ = list(filter(None, data_)) 27 | data_ = random.sample(data_, int(len(data_) * ratio)) 28 | for n in range(len(data_)): 29 | data_[n] = clearstring(data_[n]) 30 | datastring += data_ 31 | for n in range(len(data_)): 32 | datatarget.append(trainset.target[i]) 33 | return datastring, datatarget 34 | 35 | 36 | def build_dataset(words, n_words): 37 | count = [['GO', 0], ['PAD', 1], ['EOS', 2], ['UNK', 3]] 38 | count.extend(collections.Counter(words).most_common(n_words - 1)) 39 | dictionary = dict() 40 | for word, _ in count: 41 | dictionary[word] = len(dictionary) 42 | data = list() 43 | unk_count = 0 44 | for word in words: 45 | index = dictionary.get(word, 0) 46 | if index == 0: 47 | unk_count += 1 48 | data.append(index) 49 | count[0][1] = unk_count 50 | reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys())) 51 | return data, count, dictionary, reversed_dictionary 52 | 53 | 54 | def str_idx(corpus, dic, maxlen, UNK = 3): 55 | X = np.zeros((len(corpus), maxlen)) 56 | for i in range(len(corpus)): 57 | for no, k in 
enumerate(corpus[i].split()[:maxlen][::-1]): 58 | X[i, -1 - no] = dic.get(k, UNK) 59 | return X 60 | -------------------------------------------------------------------------------- /vocoder/README.md: -------------------------------------------------------------------------------- 1 | ## How-to 2 | 3 | 1. First, you need to run [download.ipynb](download.ipynb) 4 | 5 | 2. Go to any training folder, and run [caching-vocoder.ipynb](caching-vocoder.ipynb) 6 | 7 | 3. Run any notebook using Jupyter Notebook. 8 | -------------------------------------------------------------------------------- /vocoder/caching-vocoder.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import librosa\n", 11 | "import os\n", 12 | "from tqdm import tqdm" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "sample_rate = 22050\n", 22 | "fourier_window_size = 2048\n", 23 | "max_db = 100\n", 24 | "preemphasis = 0.97\n", 25 | "frame_shift = 0.0125\n", 26 | "frame_length = 0.05\n", 27 | "hop_length = int(sample_rate * frame_shift)\n", 28 | "win_length = int(sample_rate * frame_length)\n", 29 | "n_mels = 80\n", 30 | "ref_db = 20\n", 31 | "resampled = 5\n", 32 | "\n", 33 | "def get_spectrogram(audio_file):\n", 34 | " y, sr = librosa.load(audio_file, sr = sample_rate)\n", 35 | " y, _ = librosa.effects.trim(y)\n", 36 | " y = np.append(y[0], y[1:] - preemphasis * y[:-1])\n", 37 | " linear = librosa.stft(\n", 38 | " y = y,\n", 39 | " n_fft = fourier_window_size,\n", 40 | " hop_length = hop_length,\n", 41 | " win_length = win_length,\n", 42 | " )\n", 43 | " mag = np.abs(linear)\n", 44 | " mel_basis = librosa.filters.mel(sample_rate, fourier_window_size, n_mels)\n", 45 | " mel = np.dot(mel_basis, mag)\n", 46 | " mel = 
20 * np.log10(np.maximum(1e-5, mel))\n", 47 | " mag = 20 * np.log10(np.maximum(1e-5, mag))\n", 48 | " mel = np.clip((mel - ref_db + max_db) / max_db, 1e-8, 1)\n", 49 | " mag = np.clip((mag - ref_db + max_db) / max_db, 1e-8, 1)\n", 50 | " return mel.T.astype(np.float32), mag.T.astype(np.float32)\n", 51 | "\n", 52 | "def load_file(path):\n", 53 | " fname = os.path.basename(path)\n", 54 | " mel, mag = get_spectrogram(path)\n", 55 | " t = mel.shape[0]\n", 56 | " num_paddings = resampled - (t % resampled) if t % resampled != 0 else 0\n", 57 | " mel = np.pad(mel, [[0, num_paddings], [0, 0]], mode = 'constant')\n", 58 | " mag = np.pad(mag, [[0, num_paddings], [0, 0]], mode = 'constant')\n", 59 | " return fname, mel.reshape((-1, n_mels * resampled)), mag" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 3, 65 | "metadata": {}, 66 | "outputs": [ 67 | { 68 | "name": "stderr", 69 | "output_type": "stream", 70 | "text": [ 71 | "100%|██████████| 200/200 [00:25<00:00, 7.88it/s]\n" 72 | ] 73 | } 74 | ], 75 | "source": [ 76 | "if not os.path.exists('mel_old'):\n", 77 | " os.mkdir('mel_old')\n", 78 | "if not os.path.exists('mag_old'):\n", 79 | " os.mkdir('mag_old')\n", 80 | "\n", 81 | "wav_files = [f for f in os.listdir('old') if f.endswith('.wav')]\n", 82 | "\n", 83 | "for fpath in tqdm(wav_files):\n", 84 | " fname, mel, mag = load_file('old/' + fpath)\n", 85 | " np.save('mel_old/{}'.format(fname.replace('wav', 'npy')), mel)\n", 86 | " np.save('mag_old/{}'.format(fname.replace('wav', 'npy')), mag)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 4, 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "name": "stderr", 96 | "output_type": "stream", 97 | "text": [ 98 | "100%|██████████| 200/200 [00:25<00:00, 7.98it/s]\n" 99 | ] 100 | } 101 | ], 102 | "source": [ 103 | "if not os.path.exists('mel_young'):\n", 104 | " os.mkdir('mel_young')\n", 105 | "if not os.path.exists('mag_young'):\n", 106 | " os.mkdir('mag_young')\n", 107 | "\n", 108 | 
"wav_files = [f for f in os.listdir('young') if f.endswith('.wav')]\n", 109 | "\n", 110 | "for fpath in tqdm(wav_files):\n", 111 | " fname, mel, mag = load_file('young/' + fpath)\n", 112 | " np.save('mel_young/{}'.format(fname.replace('wav', 'npy')), mel)\n", 113 | " np.save('mag_young/{}'.format(fname.replace('wav', 'npy')), mag)" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [] 122 | } 123 | ], 124 | "metadata": { 125 | "kernelspec": { 126 | "display_name": "Python 3", 127 | "language": "python", 128 | "name": "python3" 129 | }, 130 | "language_info": { 131 | "codemirror_mode": { 132 | "name": "ipython", 133 | "version": 3 134 | }, 135 | "file_extension": ".py", 136 | "mimetype": "text/x-python", 137 | "name": "python", 138 | "nbconvert_exporter": "python", 139 | "pygments_lexer": "ipython3", 140 | "version": "3.6.8" 141 | } 142 | }, 143 | "nbformat": 4, 144 | "nbformat_minor": 2 145 | } 146 | -------------------------------------------------------------------------------- /vocoder/download.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from bs4 import BeautifulSoup\n", 10 | "from urllib.request import urlopen, urlretrieve\n", 11 | "from tqdm import tqdm\n", 12 | "import re\n", 13 | "import os" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "young\n" 26 | ] 27 | }, 28 | { 29 | "name": "stderr", 30 | "output_type": "stream", 31 | "text": [ 32 | "100%|███████████████████████████████| 200/200 [04:26<00:00, 1.29s/it]\n" 33 | ] 34 | }, 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | "old\n" 40 | ] 41 | }, 42 | { 43 | 
"name": "stderr", 44 | "output_type": "stream", 45 | "text": [ 46 | "100%|███████████████████████████████| 200/200 [04:30<00:00, 1.30s/it]\n" 47 | ] 48 | } 49 | ], 50 | "source": [ 51 | "prefix = 'https://tspace.library.utoronto.ca'\n", 52 | "base_url = 'https://tspace.library.utoronto.ca/handle/1807/24'\n", 53 | "\n", 54 | "directories = {'young':493, 'old': 501}\n", 55 | "\n", 56 | "for k, v in directories.items():\n", 57 | " print(k)\n", 58 | " save_dir = './%s/'%(k)\n", 59 | " if not os.path.exists(save_dir):\n", 60 | " os.makedirs(save_dir)\n", 61 | "\n", 62 | " url = base_url + str(v)\n", 63 | " soup = BeautifulSoup(urlopen(url).read(), 'html5lib')\n", 64 | " targets = soup.findAll('a', href=re.compile(r'/bitstream/.*.wav'))\n", 65 | " \n", 66 | " for a in tqdm(targets, total=len(targets), ncols=70):\n", 67 | " link = a['href']\n", 68 | "\n", 69 | " audio_save_loc = save_dir + link.split('/')[-1]\n", 70 | " if os.path.isfile(audio_save_loc):\n", 71 | " print(\"File Already Exists\")\n", 72 | " urlretrieve(prefix+a['href'], audio_save_loc)\n", 73 | "\n", 74 | " with open(audio_save_loc.replace('.wav', '.txt'), 'w') as f:\n", 75 | " f.write('say the word ' + link.split('_')[-2])" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [] 84 | } 85 | ], 86 | "metadata": { 87 | "kernelspec": { 88 | "display_name": "Python 3", 89 | "language": "python", 90 | "name": "python3" 91 | }, 92 | "language_info": { 93 | "codemirror_mode": { 94 | "name": "ipython", 95 | "version": 3 96 | }, 97 | "file_extension": ".py", 98 | "mimetype": "text/x-python", 99 | "name": "python", 100 | "nbconvert_exporter": "python", 101 | "pygments_lexer": "ipython3", 102 | "version": "3.6.8" 103 | } 104 | }, 105 | "nbformat": 4, 106 | "nbformat_minor": 2 107 | } 108 | --------------------------------------------------------------------------------