├── .gitignore
├── CODE_OF_CONDUCT.md
├── LICENSE
├── README.md
├── chatbot
│   ├── bahdanau_attention_mechanism_based_seq2seq_model_for_chatbot.ipynb
│   └── chatbot.md
├── huggingface
│   ├── Hugging_Face_Usage_Guide.ipynb
│   ├── IMDB Sentiment Analysis - BERT.ipynb
│   └── public_transformers_in_ktrain.ipynb
├── image captioning
│   └── Image_Captioning_with_Visual_Attention.ipynb
├── image classification-lstm
│   ├── Bidirectional_LSTM_MNIST_Classifier.ipynb
│   └── image classification.md
├── language modelling
│   ├── Language_Modelling_Sequence_to_Sequence_model_for_Poetry_Generation.ipynb
│   ├── Learning_to_read_with_TensorFlow_and_Keras.ipynb
│   └── language modelling.md
├── machine translation
│   ├── Attention_based_Sequence_to_Sequence_Models_for_Neural_Machine_Translation.ipynb
│   ├── Neural_Machine_Translation_with_Attention.ipynb
│   ├── Neural_Machine_Translation_with_Bahdanau_Attention_and_Pretrained_Glove_Embeddings.ipynb
│   ├── Sequence_to_Sequence_Models_for_Neural_Machine_Translation.ipynb
│   └── machine translation.md
├── memory networks
│   └── Memory_Networks.ipynb
├── music generation
│   ├── Generate_Music_with_Transofrmers.ipynb
│   └── music generation.md
├── named-entity-recognition
│   └── Named_Entitiy_Recognition_spacy_simple.ipynb
├── question-answering
│   ├── Question_Answering_System.ipynb
│   └── question answering.md
├── recurrent neural networks
│   ├── Bidirectional_LSTM_Test.ipynb
│   ├── Simple_RNN_Test_(Return_State_vs_Return_Sequences).ipynb
│   ├── Stacked_Long_Short_Term_Memory_Networks.ipynb
│   └── recurrent neural networks.md
├── sentiment analysis
│   ├── IMDB Sentiment Analysis - BERT.ipynb
│   ├── Sentiment_Analysis_on_Hotel_Reviews_.ipynb
│   └── Universal_Sentence_Encoder_Sentiment_Analysis.ipynb
├── sequence classification
│   ├── Long_Short_Term_Memory_Sequence_Prediction.ipynb
│   └── Sequence_Classification_with_Transformers.ipynb
├── spaCy
│   └── guide
│       └── Linguistic_Features.ipynb
├── text classification
│   ├── Text_Classification_with_RNN.ipynb
│   └── text classification.md
├── text generation
│   ├── Generating_Pokemon_names_with_RNNs.ipynb
│   ├── Text_Generation_with_LSTM.ipynb
│   ├── Text_Generation_with_an_RNN.ipynb
│   ├── Train_a_GPT_2_Text_Generating_Model_w_GPU.ipynb
│   └── text generation.md
├── text preprocessing
│   └── Text_Processing.ipynb
├── text summarization
│   ├── Seq2Seq_Varianats_Based_Text_Summarization_in_TensorFlow.ipynb
│   └── summarization.md
├── time-series
│   ├── TIme_Series_Forcecasting.ipynb
│   └── Time_Series.ipynb
├── universal sentence encoder
│   ├── Semantic_Textual_Similarity_using_Universal_Sentence_Encoder.ipynb
│   ├── Universal_Sentence_Encoder.ipynb
│   └── Universal_Sentence_Encoder_Sentiment_Analysis.ipynb
└── word embeddings
    ├── Word_Embeddings.ipynb
    └── word embedding.md
/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at sourcecode369@gmail.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Deep - Natural Language Processing 2 | 3 | Natural language processing (NLP) is a field of computer science concerned with the interactions between computers and human language. 
In the 1950s, Alan Turing published an article that proposed a measure of machine intelligence, now called the Turing test. More recently, deep learning techniques have produced strong results in language modeling, parsing, and many other natural-language tasks. 4 | 5 | This repository aims to cover both traditional and deep learning based NLP tasks, such as speech recognition, trigger word detection, and question answering systems, as well as more recent ones, such as transfer learning techniques and unsupervised NLP, implemented in TensorFlow and PyTorch. 6 | -------------------------------------------------------------------------------- /chatbot/chatbot.md: -------------------------------------------------------------------------------- 1 | # Chatbot 2 | 3 | A deep learning chatbot learns from scratch, rather than from hand-written rules. 4 | It is built with machine learning algorithms and learns everything it knows from its training data. 5 | The most useful training data is human-to-human dialogue. 6 | 7 | ![chatbot](https://hackernoon.com/photos/ShmX0jqVZDeSwN3LueVrsJsj8lv2-lj230f7) 8 | 9 | The chatbot picks up conversational patterns directly from the text it is trained on, and you can teach it how to converse with people. 10 | Alternatively, you can train the chatbot on movie dialogue or play scripts. 11 | However, human-to-human conversation is the preferred source for building the best possible deep learning chatbot. 12 | Remember, the more data you have, the more effective the machine learning will be. 13 | -------------------------------------------------------------------------------- /huggingface/IMDB Sentiment Analysis - BERT.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"metadata":{"trusted":true},"cell_type":"code","source":"import transformers\nimport torch.nn as nn\nfrom tqdm import tqdm\nimport pandas as pd\nfrom sklearn.model_selection import train_test_split\nfrom sklearn import metrics\nimport numpy as np\nimport torch","execution_count":null,"outputs":[]},{"metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"cell_type":"code","source":"MAX_LEN = 512\nTRAIN_BATCH_SIZE = 8\nVALID_BATCH_SIZE = 4\nEPOCHS = 10\nACCUMULATION = 2\nBERT_PATH = '../input/bert_base_cased/'\nMODEL_PATH = \"model.bin\"\nTRAINING_FILE = \"../input/imbd-movie-reviews-for-binary-sentiment-analysis/MovieReviewTrainingDatabase.csv\"\nTOKENIZER = transformers.BertTokenizer.from_pretrained('bert-base-uncased',do_lower_case=True)","execution_count":null,"outputs":[]},{"metadata":{"_uuid":"d629ff2d2480ee46fbb7e2d37f6b5fab8052498a","_cell_guid":"79c7e3d0-c299-4dcb-8224-4455121ee9b0","trusted":true},"cell_type":"code","source":"class BERTBaseUncased(nn.Module):\n def __init__(self):\n super(BERTBaseUncased,self).__init__()\n self.bert = transformers.BertModel.from_pretrained('bert-base-uncased')\n self.bert_drop = nn.Dropout(0.3)\n self.out = nn.Linear(768,1)\n def forward(self, ids, mask, token_type_ids):\n # use the pooled [CLS] output as the sentence representation\n _, o2 = self.bert(ids, \n attention_mask=mask, \n token_type_ids=token_type_ids)\n bo = self.bert_drop(o2)\n output = self.out(bo)\n return output","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"class BERTDataset:\n def __init__(self, review, target):\n self.review = review\n self.target = target\n self.tokenizer = TOKENIZER\n self.max_len = MAX_LEN\n \n def __len__(self):\n return len(self.review)\n \n def 
__getitem__(self, item):\n review = str(self.review[item])\n review = \" \".join(review.split())\n\n inputs = self.tokenizer.encode_plus(\n review,\n None,\n add_special_tokens=True,\n max_length=self.max_len,\n pad_to_max_length=True\n )\n\n ids = inputs[\"input_ids\"]\n mask = inputs[\"attention_mask\"]\n token_type_ids = inputs[\"token_type_ids\"]\n\n return {\n 'ids': torch.tensor(ids, dtype=torch.long),\n 'mask': torch.tensor(mask, dtype=torch.long),\n 'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),\n 'targets': torch.tensor(self.target[item], dtype=torch.float)\n }","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"def loss_fn(outputs, targets):\n return nn.BCEWithLogitsLoss()(outputs, targets.view(-1, 1))\n\n\ndef train_fn(data_loader, model, optimizer, device, scheduler):\n model.train()\n\n for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):\n ids = d[\"ids\"]\n token_type_ids = d[\"token_type_ids\"]\n mask = d[\"mask\"]\n targets = d[\"targets\"]\n\n ids = ids.to(device, dtype=torch.long)\n token_type_ids = token_type_ids.to(device, dtype=torch.long)\n mask = mask.to(device, dtype=torch.long)\n targets = targets.to(device, dtype=torch.float)\n\n optimizer.zero_grad()\n outputs = model(\n ids=ids,\n mask=mask,\n token_type_ids=token_type_ids\n )\n\n loss = loss_fn(outputs, targets)\n loss.backward()\n optimizer.step()\n scheduler.step()\n\n\ndef eval_fn(data_loader, model, device):\n model.eval()\n fin_targets = []\n fin_outputs = []\n with torch.no_grad():\n for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):\n ids = d[\"ids\"]\n token_type_ids = d[\"token_type_ids\"]\n mask = d[\"mask\"]\n targets = d[\"targets\"]\n\n ids = ids.to(device, dtype=torch.long)\n token_type_ids = token_type_ids.to(device, dtype=torch.long)\n mask = mask.to(device, dtype=torch.long)\n targets = targets.to(device, dtype=torch.float)\n\n outputs = model(\n ids=ids,\n mask=mask,\n token_type_ids=token_type_ids\n )\n fin_targets.extend(targets.cpu().detach().numpy().tolist())\n fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())\n return fin_outputs, fin_targets","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"def run():\n df = pd.read_csv(TRAINING_FILE).fillna(\"None\")\n \n df.sentiment = df.sentiment.apply(lambda x: 1 if x==\"positive\" else 0)\n \n df_train, df_valid = train_test_split(df, test_size=0.1, random_state=2020, stratify=df.sentiment.values)\n \n df_train = df_train.reset_index(drop=True)\n df_valid = df_valid.reset_index(drop=True)\n \n train_dataset = BERTDataset(review=df_train.review.values, target=df_train.sentiment.values)\n train_data_loader = torch.utils.data.DataLoader(\n train_dataset, \n batch_size=TRAIN_BATCH_SIZE,\n num_workers=4\n )\n \n valid_dataset = BERTDataset(review=df_valid.review.values, target=df_valid.sentiment.values)\n valid_data_loader = torch.utils.data.DataLoader(\n valid_dataset, \n batch_size=VALID_BATCH_SIZE,\n num_workers=4\n )\n \n device = torch.device(\"cuda\")\n model = BERTBaseUncased()\n model.to(device)\n param_optimizer = list(model.named_parameters())\n no_decay = [\"bias\",\"LayerNorm.bias\",\"LayerNorm.weight\"]\n # apply weight decay to all parameters except biases and LayerNorm weights\n optimizer_parameters = [\n {\n \"params\":[p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay':0.001\n },\n {\n \"params\":[p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay':0.0\n }\n ]\n num_train_steps = 
int(len(df_train)/TRAIN_BATCH_SIZE*EPOCHS)\n optimizer = transformers.AdamW(optimizer_parameters, lr=3e-5)\n scheduler = transformers.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)\n model = nn.DataParallel(model)\n best_accuracy = 0\n for epoch in range(EPOCHS):\n train_fn(train_data_loader, model, optimizer, device, scheduler)\n outputs, targets = eval_fn(valid_data_loader, model, device)\n outputs = np.array(outputs) >= 0.5\n accuracy = metrics.accuracy_score(targets, outputs)\n print(f\"Accuracy score = {accuracy}\")\n if accuracy > best_accuracy:\n torch.save(model.state_dict(),MODEL_PATH)\n best_accuracy = accuracy","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"if __name__ == \"__main__\":\n run()","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"","execution_count":null,"outputs":[]}],"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"pygments_lexer":"ipython3","nbconvert_exporter":"python","version":"3.6.4","file_extension":".py","codemirror_mode":{"name":"ipython","version":3},"name":"python","mimetype":"text/x-python"}},"nbformat":4,"nbformat_minor":4} -------------------------------------------------------------------------------- /huggingface/public_transformers_in_ktrain.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "public-transformers_in_ktrain.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [], 9 | "include_colab_link": true 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | "accelerator": "GPU" 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "markdown", 20 | "metadata": { 21 | "id": "view-in-github", 22 | "colab_type": "text" 23 | }, 24 | "source": [ 25 | "\"Open" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": { 31 | "id": "DgfHNcOPbOk3", 32 | "colab_type": "text" 33 | }, 34 | "source": [ 35 | "### A Simplified Interface to Text Classification With Hugging Face Transformers in TensorFlow Using [ktrain](https://github.com/amaiya/ktrain)\n", 36 | "\n", 37 | "*ktrain* requires TensorFlow 2." 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "metadata": { 43 | "id": "BDBNS4iNXuUL", 44 | "colab_type": "code", 45 | "colab": {} 46 | }, 47 | "source": [ 48 | "!pip3 install -q \"tensorflow_gpu>=2.0\"" 49 | ], 50 | "execution_count": 0, 51 | "outputs": [] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "metadata": { 56 | "id": "MAA46kq4X0C_", 57 | "colab_type": "code", 58 | "outputId": "74586b67-a045-42ca-e25f-2ab1cb49c706", 59 | "colab": { 60 | "base_uri": "https://localhost:8080/", 61 | "height": 35 62 | } 63 | }, 64 | "source": [ 65 | "import tensorflow as tf\n", 66 | "print(tf.__version__)" 67 | ], 68 | "execution_count": 0, 69 | "outputs": [ 70 | { 71 | "output_type": "stream", 72 | "text": [ 73 | "2.1.0\n" 74 | ], 75 | "name": "stdout" 76 | } 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": { 82 | "id": "iitDk1j6bpY3", 83 | "colab_type": "text" 84 | }, 85 | "source": [ 86 | "We then install the *ktrain* library using pip." 
87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "metadata": { 92 | "id": "sAYBZG2SX4nP", 93 | "colab_type": "code", 94 | "outputId": "8a829748-3383-4258-ebf9-c9500407064f", 95 | "colab": { 96 | "base_uri": "https://localhost:8080/", 97 | "height": 35 98 | } 99 | }, 100 | "source": [ 101 | "!pip3 install -q ktrain" 102 | ], 103 | "execution_count": 0, 104 | "outputs": [ 105 | { 106 | "output_type": "stream", 107 | "text": [ 108 | " Building wheel for ktrain (setup.py) ... \u001b[?25l\u001b[?25hdone\n" 109 | ], 110 | "name": "stdout" 111 | } 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": { 117 | "id": "Cdk5lPu3bxze", 118 | "colab_type": "text" 119 | }, 120 | "source": [ 121 | "### Load a Dataset Into Arrays" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "metadata": { 127 | "id": "-rSbSqApYPBe", 128 | "colab_type": "code", 129 | "outputId": "9b75484e-5c8e-4a7d-9c9e-2969b99cc19f", 130 | "colab": { 131 | "base_uri": "https://localhost:8080/", 132 | "height": 69 133 | } 134 | }, 135 | "source": [ 136 | "categories = ['alt.atheism', 'soc.religion.christian',\n", 137 | " 'comp.graphics', 'sci.med']\n", 138 | "from sklearn.datasets import fetch_20newsgroups\n", 139 | "train_b = fetch_20newsgroups(subset='train',\n", 140 | " categories=categories, shuffle=True, random_state=42)\n", 141 | "test_b = fetch_20newsgroups(subset='test',\n", 142 | " categories=categories, shuffle=True, random_state=42)\n", 143 | "\n", 144 | "print('size of training set: %s' % (len(train_b['data'])))\n", 145 | "print('size of validation set: %s' % (len(test_b['data'])))\n", 146 | "print('classes: %s' % (train_b.target_names))\n", 147 | "\n", 148 | "x_train = train_b.data\n", 149 | "y_train = train_b.target\n", 150 | "x_test = test_b.data\n", 151 | "y_test = test_b.target" 152 | ], 153 | "execution_count": 0, 154 | "outputs": [ 155 | { 156 | "output_type": "stream", 157 | "text": [ 158 | "size of training set: 2257\n", 159 | "size of validation set: 1502\n", 160 | "classes: ['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']\n" 161 | ], 162 | "name": "stdout" 163 | } 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": { 169 | "id": "pe5xxVPrb4IO", 170 | "colab_type": "text" 171 | }, 172 | "source": [ 173 | "## STEP 1: Preprocess Data and Create a Transformer Model\n", 174 | "\n", 175 | "We will use [DistilBERT](https://arxiv.org/abs/1910.01108)." 
176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "metadata": { 181 | "id": "5phkVc7ZYnue", 182 | "colab_type": "code", 183 | "outputId": "f9cac7b8-aeda-4cb4-a349-5a14b29e1790", 184 | "colab": { 185 | "base_uri": "https://localhost:8080/", 186 | "height": 104 187 | } 188 | }, 189 | "source": [ 190 | "import ktrain\n", 191 | "from ktrain import text\n", 192 | "MODEL_NAME = 'distilbert-base-uncased'\n", 193 | "t = text.Transformer(MODEL_NAME, maxlen=500, classes=train_b.target_names)\n", 194 | "trn = t.preprocess_train(x_train, y_train)\n", 195 | "val = t.preprocess_test(x_test, y_test)\n", 196 | "model = t.get_classifier()\n", 197 | "learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=6)" 198 | ], 199 | "execution_count": 0, 200 | "outputs": [ 201 | { 202 | "output_type": "stream", 203 | "text": [ 204 | "using Keras version: 2.2.4-tf\n", 205 | "preprocessing train...\n", 206 | "language: en\n" 207 | ], 208 | "name": "stdout" 209 | }, 210 | { 211 | "output_type": "display_data", 212 | "data": { 213 | "text/html": [ 214 | "" 215 | ], 216 | "text/plain": [ 217 | "" 218 | ] 219 | }, 220 | "metadata": { 221 | "tags": [] 222 | } 223 | }, 224 | { 225 | "output_type": "stream", 226 | "text": [ 227 | "preprocessing test...\n", 228 | "language: en\n" 229 | ], 230 | "name": "stdout" 231 | }, 232 | { 233 | "output_type": "display_data", 234 | "data": { 235 | "text/html": [ 236 | "" 237 | ], 238 | "text/plain": [ 239 | "" 240 | ] 241 | }, 242 | "metadata": { 243 | "tags": [] 244 | } 245 | } 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": { 251 | "id": "G4sGPJgOcBTd", 252 | "colab_type": "text" 253 | }, 254 | "source": [ 255 | "## STEP 2: Train the Model" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "metadata": { 261 | "id": "c_nH_F9yYvCd", 262 | "colab_type": "code", 263 | "outputId": "2b7eebcf-711b-4b9c-a5a1-bf7501155378", 264 | "colab": { 265 | "base_uri": "https://localhost:8080/", 266 | "height": 243 267 | } 268 | }, 269 | "source": [ 270 | "learner.fit_onecycle(5e-5, 4)" 271 | ], 272 | "execution_count": 0, 273 | "outputs": [ 274 | { 275 | "output_type": "stream", 276 | "text": [ 277 | "\n", 278 | "\n", 279 | "begin training using onecycle policy with max lr of 5e-05...\n", 280 | "Train for 377 steps, validate for 251 steps\n", 281 | "Epoch 1/4\n", 282 | "377/377 [==============================] - 111s 294ms/step - loss: 0.5522 - accuracy: 0.8241 - val_loss: 0.1924 - val_accuracy: 0.9447\n", 283 | "Epoch 2/4\n", 284 | "377/377 [==============================] - 104s 276ms/step - loss: 0.1219 - accuracy: 0.9641 - val_loss: 0.2383 - val_accuracy: 0.9261\n", 285 | "Epoch 3/4\n", 286 | "377/377 [==============================] - 104s 276ms/step - loss: 0.0763 - accuracy: 0.9774 - val_loss: 0.2119 - val_accuracy: 0.9401\n", 287 | "Epoch 4/4\n", 288 | "377/377 [==============================] - 104s 275ms/step - loss: 0.0180 - accuracy: 0.9960 - val_loss: 0.1762 - val_accuracy: 0.9554\n" 289 | ], 290 | "name": "stdout" 291 | }, 292 | { 293 | "output_type": "execute_result", 294 | "data": { 295 | "text/plain": [ 296 | "" 297 | ] 298 | }, 299 | "metadata": { 300 | "tags": [] 301 | }, 302 | "execution_count": 6 303 | } 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "metadata": { 309 | "id": "ho6eSo9IcI3_", 310 | "colab_type": "text" 311 | }, 312 | "source": [ 313 | "## STEP 3: Evaluate and Inspect the Model" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "metadata": { 319 | "id": "UvcxCvLOcOje", 320 | 
"colab_type": "code", 321 | "outputId": "fb5a7f72-916b-4d82-8a40-29e56059c626", 322 | "colab": { 323 | "base_uri": "https://localhost:8080/", 324 | "height": 277 325 | } 326 | }, 327 | "source": [ 328 | "learner.validate(class_names=t.get_classes())" 329 | ], 330 | "execution_count": 0, 331 | "outputs": [ 332 | { 333 | "output_type": "stream", 334 | "text": [ 335 | " precision recall f1-score support\n", 336 | "\n", 337 | " alt.atheism 0.94 0.90 0.92 319\n", 338 | " comp.graphics 0.96 0.97 0.96 389\n", 339 | " sci.med 0.98 0.96 0.97 396\n", 340 | "soc.religion.christian 0.94 0.98 0.96 398\n", 341 | "\n", 342 | " accuracy 0.96 1502\n", 343 | " macro avg 0.95 0.95 0.95 1502\n", 344 | " weighted avg 0.96 0.96 0.96 1502\n", 345 | "\n" 346 | ], 347 | "name": "stdout" 348 | }, 349 | { 350 | "output_type": "execute_result", 351 | "data": { 352 | "text/plain": [ 353 | "array([[286, 8, 5, 20],\n", 354 | " [ 9, 377, 2, 1],\n", 355 | " [ 4, 7, 381, 4],\n", 356 | " [ 5, 1, 1, 391]])" 357 | ] 358 | }, 359 | "metadata": { 360 | "tags": [] 361 | }, 362 | "execution_count": 7 363 | } 364 | ] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": { 369 | "id": "yhG3fPtPcVKe", 370 | "colab_type": "text" 371 | }, 372 | "source": [ 373 | "Let's examine the validation example about which we were the most wrong." 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "metadata": { 379 | "id": "mCABLebacTWM", 380 | "colab_type": "code", 381 | "outputId": "272dbe35-dbc0-494f-c09b-5e2cbcc03c2f", 382 | "colab": { 383 | "base_uri": "https://localhost:8080/", 384 | "height": 69 385 | } 386 | }, 387 | "source": [ 388 | "learner.view_top_losses(n=1, preproc=t)" 389 | ], 390 | "execution_count": 0, 391 | "outputs": [ 392 | { 393 | "output_type": "stream", 394 | "text": [ 395 | "----------\n", 396 | "id:371 | loss:7.01 | true:alt.atheism | pred:comp.graphics)\n", 397 | "\n" 398 | ], 399 | "name": "stdout" 400 | } 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "metadata": { 406 | "id": "pHYRBdBycfne", 407 | "colab_type": "code", 408 | "outputId": "4d1ef2bb-e634-4026-bac7-0924dae5c825", 409 | "colab": { 410 | "base_uri": "https://localhost:8080/", 411 | "height": 659 412 | } 413 | }, 414 | "source": [ 415 | "print(x_test[371])" 416 | ], 417 | "execution_count": 0, 418 | "outputs": [ 419 | { 420 | "output_type": "stream", 421 | "text": [ 422 | "From: kempmp@phoenix.oulu.fi (Petri Pihko)\n", 423 | "Subject: Re: Consciousness part II - Kev Strikes Back!\n", 424 | "Organization: University of Oulu, Finland\n", 425 | "X-Newsreader: TIN [version 1.1 PL9]\n", 426 | "Lines: 30\n", 427 | "\n", 428 | "Scott D. Sauyet (SSAUYET@eagle.wesleyan.edu) wrote:\n", 429 | "> In <1993Apr21.163848.8099@cs.nott.ac.uk> \n", 430 | "> Kevin Anthony (kax@cs.nott.ac.uk) writes:\n", 431 | "\n", 432 | "> > Firstly, I'm not impressed with the ability of algorithms. They're\n", 433 | "> > great at solving problems once the method has been worked out, but not\n", 434 | "> > at working out the method itself.\n", 435 | "> [ .. crossword example deleted ... ]\n", 436 | "\n", 437 | "> Have you heard of neural networks? I've read a little about them, and\n", 438 | "> they seems to overcome most of your objections.\n", 439 | "\n", 440 | "I'm sure there are many people who work with neural networks and\n", 441 | "read this newsgroup. 
Please tell Kevin what you've achieved, and\n", 442 | "what you expect.\n", 443 | "\n", 444 | "> I am not saying that NNs will solve all such problems, but I think\n", 445 | "> they show that it is not as hard as you think to come up with\n", 446 | "> mechanical models of consciousness.\n", 447 | "\n", 448 | "Indeed. I think dualism is a non-solution, or, as Dennett recently\n", 449 | "put it, a dead horse. \n", 450 | "\n", 451 | "Petri\n", 452 | "\n", 453 | "--\n", 454 | " ___. .'*''.* Petri Pihko kem-pmp@ Mathematics is the Truth.\n", 455 | "!___.'* '.'*' ' . Pihatie 15 C finou.oulu.fi Physics is the Rule of\n", 456 | " ' *' .* '* SF-90650 OULU kempmp@ the Game.\n", 457 | " *' * .* FINLAND phoenix.oulu.fi -> Chemistry is The Game.\n", 458 | "\n" 459 | ], 460 | "name": "stdout" 461 | } 462 | ] 463 | }, 464 | { 465 | "cell_type": "markdown", 466 | "metadata": { 467 | "id": "bUBegwcKcyEG", 468 | "colab_type": "text" 469 | }, 470 | "source": [ 471 | "This post talks more about computing than `alt.atheism` (the true category), so our model placed it into the only computing category available to it: `comp.graphics`" 472 | ] 473 | }, 474 | { 475 | "cell_type": "markdown", 476 | "metadata": { 477 | "id": "CcZQ6HbqdMcF", 478 | "colab_type": "text" 479 | }, 480 | "source": [ 481 | "## STEP 4: Making Predictions on New Data in Deployment" 482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "metadata": { 487 | "id": "hp8tw3Y0cnJa", 488 | "colab_type": "code", 489 | "colab": {} 490 | }, 491 | "source": [ 492 | "predictor = ktrain.get_predictor(learner.model, preproc=t)" 493 | ], 494 | "execution_count": 0, 495 | "outputs": [] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "metadata": { 500 | "id": "LZOeu9cDdguM", 501 | "colab_type": "code", 502 | "outputId": "2626090f-04de-4cef-c1da-d644bfe69542", 503 | "colab": { 504 | "base_uri": "https://localhost:8080/", 505 | "height": 35 506 | } 507 | }, 508 | "source": [ 509 | "predictor.predict('Jesus Christ is the central figure of Christianity.')" 510 | ], 511 | "execution_count": 0, 512 | "outputs": [ 513 | { 514 | "output_type": "display_data", 515 | "data": { 516 | "text/html": [ 517 | "" 518 | ], 519 | "text/plain": [ 520 | "" 521 | ] 522 | }, 523 | "metadata": { 524 | "tags": [] 525 | } 526 | }, 527 | { 528 | "output_type": "execute_result", 529 | "data": { 530 | "text/plain": [ 531 | "'soc.religion.christian'" 532 | ] 533 | }, 534 | "metadata": { 535 | "tags": [] 536 | }, 537 | "execution_count": 11 538 | } 539 | ] 540 | }, 541 | { 542 | "cell_type": "code", 543 | "metadata": { 544 | "id": "JuMmx8f5dr45", 545 | "colab_type": "code", 546 | "outputId": "56530d72-16e5-41e7-d40f-91d7e370cf75", 547 | "colab": { 548 | "base_uri": "https://localhost:8080/", 549 | "height": 52 550 | } 551 | }, 552 | "source": [ 553 | "# predicted probability scores for each category\n", 554 | "predictor.predict_proba('Jesus Christ is the central figure of Christianity.')" 555 | ], 556 | "execution_count": 0, 557 | "outputs": [ 558 | { 559 | "output_type": "display_data", 560 | "data": { 561 | "text/html": [ 562 | "" 563 | ], 564 | "text/plain": [ 565 | "" 566 | ] 567 | }, 568 | "metadata": { 569 | "tags": [] 570 | } 571 | }, 572 | { 573 | "output_type": "execute_result", 574 | "data": { 575 | "text/plain": [ 576 | "array([2.9704000e-03, 5.0002872e-04, 6.5480877e-04, 9.9587470e-01],\n", 577 | " dtype=float32)" 578 | ] 579 | }, 580 | "metadata": { 581 | "tags": [] 582 | }, 583 | "execution_count": 12 584 | } 585 | ] 586 | }, 587 | { 588 | "cell_type": "code", 589 | 
"metadata": { 590 | "id": "ldxX1mtLd3Nq", 591 | "colab_type": "code", 592 | "outputId": "e5b5ed72-f5d7-4562-9cdf-afa2f8bd4ea6", 593 | "colab": { 594 | "base_uri": "https://localhost:8080/", 595 | "height": 35 596 | } 597 | }, 598 | "source": [ 599 | "predictor.get_classes()" 600 | ], 601 | "execution_count": 0, 602 | "outputs": [ 603 | { 604 | "output_type": "execute_result", 605 | "data": { 606 | "text/plain": [ 607 | "['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']" 608 | ] 609 | }, 610 | "metadata": { 611 | "tags": [] 612 | }, 613 | "execution_count": 13 614 | } 615 | ] 616 | }, 617 | { 618 | "cell_type": "markdown", 619 | "metadata": { 620 | "id": "9tHos7V6d8RQ", 621 | "colab_type": "text" 622 | }, 623 | "source": [ 624 | "As expected, `soc.religion.christian` is assigned the highest probability.\n", 625 | "\n", 626 | "Let's invoke the `explain` method to see which words contribute most to the classification.\n", 627 | "\n", 628 | "We will need a forked version of the **eli5** library that supportes TensorFlow Keras, so let's install it first." 629 | ] 630 | }, 631 | { 632 | "cell_type": "code", 633 | "metadata": { 634 | "id": "g6HjLF9dd5iZ", 635 | "colab_type": "code", 636 | "outputId": "1e273516-c9f1-4d3c-a956-933b176fccc9", 637 | "colab": { 638 | "base_uri": "https://localhost:8080/", 639 | "height": 35 640 | } 641 | }, 642 | "source": [ 643 | "!pip3 install -q git+https://github.com/amaiya/eli5@tfkeras_0_10_1\n" 644 | ], 645 | "execution_count": 0, 646 | "outputs": [ 647 | { 648 | "output_type": "stream", 649 | "text": [ 650 | " Building wheel for eli5 (setup.py) ... \u001b[?25l\u001b[?25hdone\n" 651 | ], 652 | "name": "stdout" 653 | } 654 | ] 655 | }, 656 | { 657 | "cell_type": "code", 658 | "metadata": { 659 | "id": "3HgZDLYUeVaM", 660 | "colab_type": "code", 661 | "outputId": "b1d2fc08-6b12-4ef1-c740-d7f38385b8f2", 662 | "colab": { 663 | "base_uri": "https://localhost:8080/", 664 | "height": 169 665 | } 666 | }, 667 | "source": [ 668 | "predictor.explain('Jesus Christ is the central figure in Christianity.')" 669 | ], 670 | "execution_count": 0, 671 | "outputs": [ 672 | { 673 | "output_type": "display_data", 674 | "data": { 675 | "text/html": [ 676 | "" 677 | ], 678 | "text/plain": [ 679 | "" 680 | ] 681 | }, 682 | "metadata": { 683 | "tags": [] 684 | } 685 | }, 686 | { 687 | "output_type": "display_data", 688 | "data": { 689 | "text/html": [ 690 | "" 691 | ], 692 | "text/plain": [ 693 | "" 694 | ] 695 | }, 696 | "metadata": { 697 | "tags": [] 698 | } 699 | }, 700 | { 701 | "output_type": "execute_result", 702 | "data": { 703 | "text/html": [ 704 | "\n", 705 | " \n", 710 | "\n", 711 | "\n", 712 | "\n", 713 | " \n", 714 | "\n", 715 | " \n", 716 | "\n", 717 | " \n", 718 | "\n", 719 | " \n", 720 | "\n", 721 | " \n", 722 | "\n", 723 | " \n", 724 | "\n", 725 | "\n", 726 | " \n", 727 | "\n", 728 | " \n", 729 | "\n", 730 | " \n", 731 | "\n", 732 | " \n", 733 | " \n", 734 | "\n", 735 | " \n", 736 | "\n", 737 | " \n", 738 | "\n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | "

\n", 745 | " \n", 746 | " \n", 747 | " y=soc.religion.christian\n", 748 | " \n", 749 | "\n", 750 | "\n", 751 | " \n", 752 | " (probability 1.000, score 8.865)\n", 753 | "\n", 754 | "top features\n", 755 | "

\n", 756 | " \n", 757 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 776 | " \n", 779 | " \n", 780 | "\n", 781 | " \n", 782 | " \n", 783 | "\n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 790 | " \n", 793 | " \n", 794 | "\n", 795 | " \n", 796 | "\n", 797 | " \n", 798 | "
\n", 763 | " Contribution?\n", 764 | " Feature
\n", 774 | " +8.967\n", 775 | " \n", 777 | " Highlighted in text (sum)\n", 778 | "
\n", 788 | " -0.101\n", 789 | " \n", 791 | " <BIAS>\n", 792 | "
\n", 799 | "\n", 800 | " \n", 801 | "\n", 802 | "\n", 803 | "\n", 804 | "

\n", 805 | " jesus christ is the central figure in christianity.\n", 806 | "

\n", 807 | "\n", 808 | "\n", 809 | " \n", 810 | "\n", 811 | " \n", 812 | "\n", 813 | " \n", 814 | "\n", 815 | " \n", 816 | "\n", 817 | "\n", 818 | " \n", 819 | "\n", 820 | " \n", 821 | "\n", 822 | " \n", 823 | "\n", 824 | " \n", 825 | "\n", 826 | " \n", 827 | "\n", 828 | " \n", 829 | "\n", 830 | "\n", 831 | " \n", 832 | "\n", 833 | " \n", 834 | "\n", 835 | " \n", 836 | "\n", 837 | " \n", 838 | "\n", 839 | " \n", 840 | "\n", 841 | " \n", 842 | "\n", 843 | "\n", 844 | "\n" 845 | ], 846 | "text/plain": [ 847 | "" 848 | ] 849 | }, 850 | "metadata": { 851 | "tags": [] 852 | }, 853 | "execution_count": 15 854 | } 855 | ] 856 | }, 857 | { 858 | "cell_type": "markdown", 859 | "metadata": { 860 | "id": "mcph2bSLe5cW", 861 | "colab_type": "text" 862 | }, 863 | "source": [ 864 | "The words in the darkest shade of green contribute most to the classification and agree with what you would expect for this example.\n", 865 | "\n", 866 | "We can save and reload our predictor for later deployment." 867 | ] 868 | }, 869 | { 870 | "cell_type": "code", 871 | "metadata": { 872 | "id": "Z1nzxI_Jec-5", 873 | "colab_type": "code", 874 | "colab": {} 875 | }, 876 | "source": [ 877 | "predictor.save('/tmp/my_distilbert_predictor')" 878 | ], 879 | "execution_count": 0, 880 | "outputs": [] 881 | }, 882 | { 883 | "cell_type": "code", 884 | "metadata": { 885 | "id": "DDEU2s03fHsw", 886 | "colab_type": "code", 887 | "colab": {} 888 | }, 889 | "source": [ 890 | "reloaded_predictor = ktrain.load_predictor('/tmp/my_distilbert_predictor')" 891 | ], 892 | "execution_count": 0, 893 | "outputs": [] 894 | }, 895 | { 896 | "cell_type": "code", 897 | "metadata": { 898 | "id": "B4R1r12rgNlI", 899 | "colab_type": "code", 900 | "outputId": "1bd96ed0-4d5d-480d-cc01-e9a2993bf204", 901 | "colab": { 902 | "base_uri": "https://localhost:8080/", 903 | "height": 35 904 | } 905 | }, 906 | "source": [ 907 | "reloaded_predictor.predict('My computer monitor is really blurry.')" 908 | ], 909 | "execution_count": 0, 910 | "outputs": [ 911 | { 912 | "output_type": "display_data", 913 | "data": { 914 | "text/html": [ 915 | "" 916 | ], 917 | "text/plain": [ 918 | "" 919 | ] 920 | }, 921 | "metadata": { 922 | "tags": [] 923 | } 924 | }, 925 | { 926 | "output_type": "execute_result", 927 | "data": { 928 | "text/plain": [ 929 | "'comp.graphics'" 930 | ] 931 | }, 932 | "metadata": { 933 | "tags": [] 934 | }, 935 | "execution_count": 24 936 | } 937 | ] 938 | }, 939 | { 940 | "cell_type": "code", 941 | "metadata": { 942 | "id": "FCJgsiUzg1wg", 943 | "colab_type": "code", 944 | "colab": {} 945 | }, 946 | "source": [ 947 | "" 948 | ], 949 | "execution_count": 0, 950 | "outputs": [] 951 | } 952 | ] 953 | } -------------------------------------------------------------------------------- /image classification-lstm/Bidirectional_LSTM_MNIST_Classifier.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Bidirectional LSTM MNIST Classifier.ipynb", 7 | "provenance": [], 8 | "authorship_tag": "ABX9TyNceWTeTnWZUh29R/F4C+Tn", 9 | "include_colab_link": true 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | "accelerator": "GPU" 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "markdown", 20 | "metadata": { 21 | "id": "view-in-github", 22 | "colab_type": "text" 23 | }, 24 | "source": [ 25 | "\"Open" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "metadata": { 31 | "id": "fN2rWQkWwj6N", 32 
| "colab_type": "code", 33 | "colab": {} 34 | }, 35 | "source": [ 36 | "from __future__ import absolute_import, print_function, unicode_literals, division\n", 37 | "from builtins import range, input" 38 | ], 39 | "execution_count": 0, 40 | "outputs": [] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "metadata": { 45 | "id": "UMiRpu5IwxY2", 46 | "colab_type": "code", 47 | "colab": { 48 | "base_uri": "https://localhost:8080/", 49 | "height": 34 50 | }, 51 | "outputId": "6254f501-964e-4e2f-91d4-30a4ec68f087" 52 | }, 53 | "source": [ 54 | "import os\n", 55 | "%tensorflow_version 2.x\n", 56 | "import tensorflow as tf\n", 57 | "from tensorflow.keras.models import Sequential, Model\n", 58 | "from tensorflow.keras.layers import Input, LSTM, GRU, Bidirectional, GlobalMaxPooling1D, Lambda, Concatenate, Dense\n", 59 | "from tensorflow.keras import backend as K\n", 60 | "from tensorflow.keras.datasets.mnist import load_data\n", 61 | "import numpy as np\n", 62 | "import pandas as pd\n", 63 | "import matplotlib.pyplot as plt\n", 64 | "%matplotlib inline " 65 | ], 66 | "execution_count": 11, 67 | "outputs": [ 68 | { 69 | "output_type": "stream", 70 | "text": [ 71 | "TensorFlow is already loaded. Please restart the runtime to change versions.\n" 72 | ], 73 | "name": "stdout" 74 | } 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "metadata": { 80 | "id": "X-hzZzfdxbw6", 81 | "colab_type": "code", 82 | "colab": { 83 | "base_uri": "https://localhost:8080/", 84 | "height": 52 85 | }, 86 | "outputId": "46b42fe6-60e5-42a3-89b3-ea5c71806e63" 87 | }, 88 | "source": [ 89 | "(X_train, y_train), _ = load_data()" 90 | ], 91 | "execution_count": 4, 92 | "outputs": [ 93 | { 94 | "output_type": "stream", 95 | "text": [ 96 | "Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz\n", 97 | "11493376/11490434 [==============================] - 0s 0us/step\n" 98 | ], 99 | "name": "stdout" 100 | } 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "metadata": { 106 | "id": "VQFB6DtHxk4z", 107 | "colab_type": "code", 108 | "colab": {} 109 | }, 110 | "source": [ 111 | "D = 28\n", 112 | "M = 15" 113 | ], 114 | "execution_count": 0, 115 | "outputs": [] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "metadata": { 120 | "id": "F0u4sjetxoc8", 121 | "colab_type": "code", 122 | "colab": { 123 | "base_uri": "https://localhost:8080/", 124 | "height": 250 125 | }, 126 | "outputId": "04461072-70ee-4d67-f4dc-b8c727b2e12f" 127 | }, 128 | "source": [ 129 | "input_ = Input(shape=(D, D))\n", 130 | "rnn1 = Bidirectional(LSTM(M, return_sequences=True))\n", 131 | "x1 = rnn1(input_)\n", 132 | "x1 = GlobalMaxPooling1D()(x1)\n", 133 | "\n", 134 | "rnn2 = Bidirectional(LSTM(M, return_sequences=True))\n", 135 | "permutor = Lambda(lambda t: K.permute_dimensions(t, pattern=(0,2,1)))\n", 136 | "x2 = permutor(input_)\n", 137 | "x2 = rnn2(x2)\n", 138 | "x2 = GlobalMaxPooling1D()(x2)" 139 | ], 140 | "execution_count": 8, 141 | "outputs": [ 142 | { 143 | "output_type": "stream", 144 | "text": [ 145 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/init_ops.py:97: calling GlorotUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", 146 | "Instructions for updating:\n", 147 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n", 148 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/init_ops.py:97: calling 
Orthogonal.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", 149 | "Instructions for updating:\n", 150 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n", 151 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/init_ops.py:97: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", 152 | "Instructions for updating:\n", 153 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n", 154 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/resource_variable_ops.py:1630: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.\n", 155 | "Instructions for updating:\n", 156 | "If using Keras pass *_constraint arguments to layers.\n" 157 | ], 158 | "name": "stdout" 159 | } 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "metadata": { 165 | "id": "LGYBJbQnyQh_", 166 | "colab_type": "code", 167 | "colab": {} 168 | }, 169 | "source": [ 170 | "concatenator = Concatenate(axis=1)\n", 171 | "x = concatenator([x1, x2])\n", 172 | "output = Dense(10, activation='softmax')(x)\n", 173 | "model = Model(input_, output)" 174 | ], 175 | "execution_count": 0, 176 | "outputs": [] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "metadata": { 181 | "id": "ynKnWhmgyYPi", 182 | "colab_type": "code", 183 | "colab": {} 184 | }, 185 | "source": [ 186 | "model.compile(loss='sparse_categorical_crossentropy',metrics=[\"accuracy\"], optimizer=\"adam\")" 187 | ], 188 | "execution_count": 0, 189 | "outputs": [] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "metadata": { 194 | "id": "N8qTO1P6yyHd", 195 | "colab_type": "code", 196 | "colab": { 197 | "base_uri": "https://localhost:8080/", 198 | "height": 301 199 | }, 200 | "outputId": "b8d046b1-d9d3-48ba-882b-db47fed9e84d" 201 | }, 202 | "source": [ 203 | "history = model.fit(X_train, y_train, batch_size=512, epochs=20, validation_split=0.3)" 204 | ], 205 | "execution_count": 0, 206 | "outputs": [ 207 | { 208 | "output_type": "stream", 209 | "text": [ 210 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/math_grad.py:1424: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n", 211 | "Instructions for updating:\n", 212 | "Use tf.where in 2.0, which has the same broadcast rule as np.where\n", 213 | "Train on 42000 samples, validate on 18000 samples\n", 214 | "Epoch 1/20\n", 215 | "42000/42000 [==============================] - 18s 438us/sample - loss: 2.0282 - acc: 0.3389 - val_loss: 1.6820 - val_acc: 0.5607\n", 216 | "Epoch 2/20\n", 217 | "42000/42000 [==============================] - 16s 369us/sample - loss: 1.4425 - acc: 0.6530 - val_loss: 1.1953 - val_acc: 0.7313\n", 218 | "Epoch 3/20\n", 219 | "42000/42000 [==============================] - 15s 358us/sample - loss: 1.0370 - acc: 0.7680 - val_loss: 0.8789 - val_acc: 0.8024\n", 220 | "Epoch 4/20\n", 221 | "42000/42000 [==============================] - 15s 361us/sample - loss: 0.7897 - acc: 0.8183 - val_loss: 0.6905 - val_acc: 0.8393\n", 222 | "Epoch 5/20\n", 223 | "42000/42000 [==============================] - 15s 362us/sample - loss: 0.6329 - acc: 0.8494 - val_loss: 0.5712 - val_acc: 0.8618\n", 224 | "Epoch 6/20\n", 
225 | "24064/42000 [================>.............] - ETA: 5s - loss: 0.5460 - acc: 0.8681" 226 | ], 227 | "name": "stdout" 228 | } 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "metadata": { 234 | "id": "qFwqAVyiy77e", 235 | "colab_type": "code", 236 | "colab": {} 237 | }, 238 | "source": [ 239 | "" 240 | ], 241 | "execution_count": 0, 242 | "outputs": [] 243 | } 244 | ] 245 | } -------------------------------------------------------------------------------- /image classification-lstm/image classification.md: -------------------------------------------------------------------------------- 1 | # Image Classification using LSTM 2 | 3 | The objective of an Image Classification Model is to be accurate. Why would someone use any sort of technology if it had a really large range of error, and was wrong most of the time? It’s like asking your friend who is not very strong in math to do your math test for you — it just makes no sense whatsoever. 4 | 5 | Using my knowledge of RNNs, I coded one that classifies images — which iterates, trains, and tests data for higher accuracy. The output of the code is the loss function and percentage accuracy for each epoch, so you can see how it increases with trial and error as the weights and biases are adjusted. 6 | -------------------------------------------------------------------------------- /language modelling/language modelling.md: -------------------------------------------------------------------------------- 1 | # Language modeling 2 | 3 | Language modeling is the task of predicting the next word or character in a document. 4 | 5 | \* indicates models using dynamic evaluation; where, at test time, models may adapt to seen tokens in order to improve performance on following tokens. ([Mikolov et al., (2010)](https://www.fit.vutbr.cz/research/groups/speech/publi/2010/mikolov_interspeech2010_IS100722.pdf), [Krause et al., (2017)](https://arxiv.org/pdf/1709.07432)) 6 | 7 | ## Word Level Models 8 | 9 | ### Penn Treebank 10 | 11 | A common evaluation dataset for language modeling ist the Penn Treebank, 12 | as pre-processed by [Mikolov et al., (2011)](https://www.isca-speech.org/archive/archive_papers/interspeech_2011/i11_0605.pdf). 13 | The dataset consists of 929k training words, 73k validation words, and 14 | 82k test words. As part of the pre-processing, words were lower-cased, numbers 15 | were replaced with N, newlines were replaced with ``, 16 | and all other punctuation was removed. The vocabulary is 17 | the most frequent 10k words with the rest of the tokens replaced by an `` token. 18 | Models are evaluated based on perplexity, which is the average 19 | per-word log-probability (lower is better). 
20 | 21 | | Model | Validation perplexity | Test perplexity | Number of params | Paper / Source | Code | 22 | | ------------- | :-----:| :-----: | :-----: | -------------- | ---- | 23 | | Mogrifier LSTM + dynamic eval (Melis et al., 2019) | 44.9 | 44.8 | 24M | [Mogrifier LSTM](http://arxiv.org/abs/1909.01792) | | 24 | | AdvSoft + AWD-LSTM-MoS + dynamic eval (Wang et al., 2019) | 46.63 | 46.01 | 22M | [Improving Neural Language Modeling via Adversarial Training](http://proceedings.mlr.press/v97/wang19f/wang19f.pdf) | [Official](https://github.com/ChengyueGongR/advsoft) | 25 | | FRAGE + AWD-LSTM-MoS + dynamic eval (Gong et al., 2018) | 47.38 | 46.54 | 22M | [FRAGE: Frequency-Agnostic Word Representation](https://arxiv.org/abs/1809.06858) | [Official](https://github.com/ChengyueGongR/Frequency-Agnostic) | 26 | | AWD-LSTM-DOC x5 (Takase et al., 2018) | 48.63 | 47.17 | 185M | [Direct Output Connection for a High-Rank Language Model](https://arxiv.org/abs/1808.10143) | [Official](https://github.com/nttcslab-nlp/doc_lm) | 27 | | AWD-LSTM-MoS + dynamic eval (Yang et al., 2018)* | 48.33 | 47.69 | 22M | [Breaking the Softmax Bottleneck: A High-Rank RNN Language Model](https://arxiv.org/abs/1711.03953) | [Official](https://github.com/zihangdai/mos) | 28 | | Mogrifier LSTM (Melis et al., 2019) | 51.4 | 50.1 | 24M | [Mogrifier LSTM](http://arxiv.org/abs/1909.01792) | | 29 | | AWD-LSTM + dynamic eval (Krause et al., 2017)* | 51.6 | 51.1 | 24M | [Dynamic Evaluation of Neural Sequence Models](https://arxiv.org/abs/1709.07432) | [Official](https://github.com/benkrause/dynamic-evaluation) | 30 | | AWD-LSTM-DOC + Partial Shuffle (Press, 2019) ***preprint*** | 53.79 | 52.00 | 23M | [Partially Shuffling the Training Data to Improve Language Models](https://arxiv.org/abs/1903.04167) | [Official](https://github.com/ofirpress/PartialShuffle) | 31 | | AWD-LSTM-DOC (Takase et al., 2018) | 54.12 | 52.38 | 23M | [Direct Output Connection for a High-Rank Language Model](https://arxiv.org/abs/1808.10143) | [Official](https://github.com/nttcslab-nlp/doc_lm) | 32 | | AWD-LSTM + continuous cache pointer (Merity et al., 2017)* | 53.9 | 52.8 | 24M | [Regularizing and Optimizing LSTM Language Models](https://arxiv.org/abs/1708.02182) | [Official](https://github.com/salesforce/awd-lstm-lm) | 33 | | Trellis Network (Bai et al., 2019) | - | 54.19 | 34M | [Trellis Networks for Sequence Modeling](https://openreview.net/pdf?id=HyeVtoRqtQ) | [Official](https://github.com/locuslab/trellisnet) 34 | | AWD-LSTM-MoS + ATOI (Kocher et al., 2019) | 56.44 | 54.33 | 22M | [Alleviating Sequence Information Loss with Data Overlapping and Prime Batch Sizes](https://arxiv.org/abs/1909.08700) | [Official](https://github.com/nkcr/overlap-ml) | 35 | | AWD-LSTM-MoS + finetune (Yang et al., 2018) | 56.54 | 54.44 | 22M | [Breaking the Softmax Bottleneck: A High-Rank RNN Language Model](https://arxiv.org/abs/1711.03953) | [Official](https://github.com/zihangdai/mos) | 36 | | Transformer-XL (Dai et al., 2018) ***under review*** | 56.72 | 54.52 | 24M | [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/pdf/1901.02860.pdf) | [Official](https://github.com/kimiyoung/transformer-xl) | 37 | | AWD-LSTM-MoS (Yang et al., 2018) | 58.08 | 55.97 | 22M | [Breaking the Softmax Bottleneck: A High-Rank RNN Language Model](https://arxiv.org/abs/1711.03953) | [Official](https://github.com/zihangdai/mos) | 38 | | AWD-LSTM 3-layer with Fraternal dropout (Zołna et al., 2018) | 58.9 | 56.8 | 24M | [Fraternal 
dropout](https://arxiv.org/pdf/1711.00066.pdf) | [Official](https://github.com/kondiz/fraternal-dropout) | 39 | | AWD-LSTM (Merity et al., 2017) | 60.0 | 57.3 | 24M | [Regularizing and Optimizing LSTM Language Models](https://arxiv.org/abs/1708.02182) | [Official](https://github.com/salesforce/awd-lstm-lm) | 40 | 41 | ### WikiText-2 42 | 43 | [WikiText-2](https://arxiv.org/abs/1609.07843) has been proposed as a more realistic 44 | benchmark for language modeling than the pre-processed Penn Treebank. WikiText-2 45 | consists of around 2 million words extracted from Wikipedia articles. 46 | 47 | | Model | Validation perplexity | Test perplexity | Number of params | Paper / Source | Code | 48 | | ------------- | :-----:| :-----: | :-----: | -------------- | ---- | 49 | | Mogrifier LSTM + dynamic eval (Melis et al., 2019) | 40.2 | 38.6 | 35M | [Mogrifier LSTM](http://arxiv.org/abs/1909.01792) | | 50 | | AdvSoft + AWD-LSTM-MoS + dynamic eval (Wang et al., 2019) | 40.27 | 38.65 | 35M | [Improving Neural Language Modeling via Adversarial Training](http://proceedings.mlr.press/v97/wang19f/wang19f.pdf) | [Official](https://github.com/ChengyueGongR/advsoft) | 51 | | FRAGE + AWD-LSTM-MoS + dynamic eval (Gong et al., 2018) | 40.85 | 39.14 | 35M | [FRAGE: Frequency-Agnostic Word Representation](https://arxiv.org/abs/1809.06858) | [Official](https://github.com/ChengyueGongR/Frequency-Agnostic) | 52 | | AWD-LSTM-MoS + dynamic eval (Yang et al., 2018)* | 42.41 | 40.68 | 35M | [Breaking the Softmax Bottleneck: A High-Rank RNN Language Model](https://arxiv.org/abs/1711.03953) | [Official](https://github.com/zihangdai/mos) | 53 | | AWD-LSTM + dynamic eval (Krause et al., 2017)* | 46.4 | 44.3 | 33M | [Dynamic Evaluation of Neural Sequence Models](https://arxiv.org/abs/1709.07432) | [Official](https://github.com/benkrause/dynamic-evaluation) | 54 | | AWD-LSTM + continuous cache pointer (Merity et al., 2017)* | 53.8 | 52.0 | 33M | [Regularizing and Optimizing LSTM Language Models](https://arxiv.org/abs/1708.02182) | [Official](https://github.com/salesforce/awd-lstm-lm) | 55 | | AWD-LSTM-DOC x5 (Takase et al., 2018) | 54.19 | 53.09 | 185M | [Direct Output Connection for a High-Rank Language Model](https://arxiv.org/abs/1808.10143) | [Official](https://github.com/nttcslab-nlp/doc_lm) | 56 | | Mogrifier LSTM (Melis et al., 2019) | 57.3 | 55.1 | 35M | [Mogrifier LSTM](http://arxiv.org/abs/1909.01792) | | 57 | | AWD-LSTM-DOC + Partial Shuffle (Press, 2019) ***preprint*** | 60.16 | 57.85 | 37M | [Partially Shuffling the Training Data to Improve Language Models](https://arxiv.org/abs/1903.04167) | [Official](https://github.com/ofirpress/PartialShuffle) | 58 | | AWD-LSTM-DOC (Takase et al., 2018) | 60.29 | 58.03 | 37M | [Direct Output Connection for a High-Rank Language Model](https://arxiv.org/abs/1808.10143) | [Official](https://github.com/nttcslab-nlp/doc_lm) | 59 | | AWD-LSTM-MoS (Yang et al., 2018) | 63.88 | 61.45 | 35M | [Breaking the Softmax Bottleneck: A High-Rank RNN Language Model](https://arxiv.org/abs/1711.03953) | [Official](https://github.com/zihangdai/mos) | 60 | | AWD-LSTM 3-layer with Fraternal dropout (Zołna et al., 2018) | 66.8 | 64.1 | 34M | [Fraternal dropout](https://arxiv.org/pdf/1711.00066.pdf) | [Official](https://github.com/kondiz/fraternal-dropout) | 61 | | AWD-LSTM + ATOI (Kocher et al., 2019) | 67.47 | 64.73 | 33M | [Alleviating Sequence Information Loss with Data Overlapping and Prime Batch Sizes](https://arxiv.org/abs/1909.08700) | [Official](https://github.com/nkcr/overlap-ml) | 62 | | 
AWD-LSTM (Merity et al., 2017) | 68.6 | 65.8 | 33M | [Regularizing and Optimizing LSTM Language Models](https://arxiv.org/abs/1708.02182) | [Official](https://github.com/salesforce/awd-lstm-lm) | 63 | 64 | ### WikiText-103 65 | 66 | The [WikiText-103](https://arxiv.org/abs/1609.07843) corpus contains 267,735 unique words, and each word occurs at least three times in the training set. 67 | 68 | | Model | Validation perplexity | Test perplexity | Number of params | Paper / Source | Code | 69 | | ------------- | :---:| :---:| :---:| -------- | --- | 70 | | Transformer-XL + RMS dynamic eval (Krause et al., 2019)* ***arxiv preprint*** | 15.8 | 16.4 | 257M | [Dynamic Evaluation of Transformer Language Models](https://arxiv.org/pdf/1904.08378.pdf) | [Official](https://github.com/benkrause/dynamiceval-transformer) | 71 | | Compressive Transformer (Rae et al., 2019)* ***arxiv preprint*** | 16.0 | 17.1 (16.1 with basic dynamic evaluation) | ~257M | [Compressive Transformers for Long-Range Sequence Modelling](https://arxiv.org/pdf/1911.05507.pdf) | - | 72 | | Transformer-XL Large (Dai et al., 2018) ***under review*** | 17.7 | 18.3 | 257M | [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/pdf/1901.02860.pdf) | [Official](https://github.com/kimiyoung/transformer-xl) | 73 | | Transformer with tied adaptive embeddings (Baevski and Auli, 2018) | 19.8 | 20.5 | 247M | [Adaptive Input Representations for Neural Language Modeling](https://arxiv.org/pdf/1809.10853.pdf) | [Link](https://github.com/AranKomat/adapinp) | 74 | | Transformer-XL Standard (Dai et al., 2018) ***under review*** | 23.1 | 24.0 | 151M | [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/pdf/1901.02860.pdf) | [Official](https://github.com/kimiyoung/transformer-xl) | 75 | | AdvSoft + 4 layer QRNN + dynamic eval (Wang et al., 2019) | 27.2 | 28.0 | | [Improving Neural Language Modeling via Adversarial Training](http://proceedings.mlr.press/v97/wang19f/wang19f.pdf) | [Official](https://github.com/ChengyueGongR/advsoft) | 76 | | LSTM + Hebbian + Cache + MbPA (Rae et al., 2018) | 29.0 | 29.2 | | [Fast Parametric Learning with Activation Memorization](http://arxiv.org/abs/1803.10049) || 77 | | Trellis Network (Bai et al., 2019) | - | 30.35 | 180M | [Trellis Networks for Sequence Modeling](https://openreview.net/pdf?id=HyeVtoRqtQ) | [Official](https://github.com/locuslab/trellisnet) 78 | | AWD-LSTM-MoS + ATOI (Kocher et al., 2019) | 31.92 | 32.85 | | [Alleviating Sequence Information Loss with Data Overlapping and Prime Batch Sizes](https://arxiv.org/abs/1909.08700) | [Official](https://github.com/nkcr/overlap-ml) | 79 | | LSTM + Hebbian (Rae et al., 2018) | 34.1 | 34.3 | | [Fast Parametric Learning with Activation Memorization](http://arxiv.org/abs/1803.10049) || 80 | | LSTM (Rae et al., 2018) | 36.0 | 36.4 | | [Fast Parametric Learning with Activation Memorization](http://arxiv.org/abs/1803.10049) || 81 | | Gated CNN (Dauphin et al., 2016) | - | 37.2 | | [Language modeling with gated convolutional networks](https://arxiv.org/abs/1612.08083) || 82 | | Neural cache model (size = 2,000) (Grave et al., 2017) | - | 40.8 | | [Improving Neural Language Models with a Continuous Cache](https://arxiv.org/pdf/1612.04426.pdf) | [Link](https://github.com/kaishengtai/torch-ntm) | 83 | | Temporal CNN (Bai et al., 2018) | - | 45.2 | | [Convolutional sequence modeling revisited](https://openreview.net/forum?id=BJEX-H1Pf) || 84 | | LSTM (Grave et al., 2017) | - | 48.7 | 
| [Improving Neural Language Models with a Continuous Cache](https://arxiv.org/pdf/1612.04426.pdf) | [Link](https://github.com/kaishengtai/torch-ntm) | 85 | 86 | ### 1B Words / Google Billion Word benchmark 87 | 88 | [The One-Billion Word benchmark](https://arxiv.org/pdf/1312.3005.pdf) is a large dataset derived from a news-commentary site. 89 | The dataset consists of 829,250,940 tokens over a vocabulary of 793,471 words. 90 | Importantly, sentences in this dataset are shuffled and hence context is limited. 91 | 92 | | Model | Test perplexity | Number of params | Paper / Source | Code | 93 | | ------------- | :-----:| :-----:| --------- | --- | 94 | | Transformer-XL Large (Dai et al., 2018) ***under review*** | 21.8 | 0.8B | [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/pdf/1901.02860.pdf) | [Official](https://github.com/kimiyoung/transformer-xl) | 95 | | Transformer-XL Base (Dai et al., 2018) ***under review*** | 23.5 | 0.46B | [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/pdf/1901.02860.pdf) | [Official](https://github.com/kimiyoung/transformer-xl) | 96 | | Transformer with shared adaptive embeddings - Very large (Baevski and Auli, 2018) | 23.7 | 0.8B | [Adaptive Input Representations for Neural Language Modeling](https://arxiv.org/pdf/1809.10853.pdf) | [Link](https://github.com/AranKomat/adapinp) 97 | | 10 LSTM+CNN inputs + SNM10-SKIP (Jozefowicz et al., 2016) ***ensemble*** | 23.7 | 43B? | [Exploring the Limits of Language Modeling](https://arxiv.org/pdf/1602.02410.pdf) | [Official](https://github.com/rafaljozefowicz/lm) | 98 | | Transformer with shared adaptive embeddings (Baevski and Auli, 2018) | 24.1 | 0.46B | [Adaptive Input Representations for Neural Language Modeling](https://arxiv.org/pdf/1809.10853.pdf) | [Link](https://github.com/AranKomat/adapinp) 99 | | Big LSTM+CNN inputs (Jozefowicz et al., 2016) | 30.0 | 1.04B | [Exploring the Limits of Language Modeling](https://arxiv.org/pdf/1602.02410.pdf) || 100 | | Gated CNN-14Bottleneck (Dauphin et al., 2017) | 31.9 | ? | [Language Modeling with Gated Convolutional Networks](https://arxiv.org/pdf/1612.08083.pdf) || 101 | | BIGLSTM baseline (Kuchaiev and Ginsburg, 2018) | 35.1 | 0.151B | [Factorization tricks for LSTM networks](https://arxiv.org/pdf/1703.10722.pdf) | [Official](https://github.com/okuchaiev/f-lm) | 102 | | BIG F-LSTM F512 (Kuchaiev and Ginsburg, 2018) | 36.3 | 0.052B | [Factorization tricks for LSTM networks](https://arxiv.org/pdf/1703.10722.pdf) | [Official](https://github.com/okuchaiev/f-lm) | 103 | | BIG G-LSTM G-8 (Kuchaiev and Ginsburg, 2018) | 39.4 | 0.035B | [Factorization tricks for LSTM networks](https://arxiv.org/pdf/1703.10722.pdf) | [Official](https://github.com/okuchaiev/f-lm) | 104 | 105 | 106 | ## Character Level Models 107 | 108 | ### Hutter Prize 109 | 110 | [The Hutter Prize](http://prize.hutter1.net) Wikipedia dataset, also known as enwik8, is a byte-level dataset consisting of the 111 | first 100 million bytes of a Wikipedia XML dump. For simplicity we shall refer to it as a character-level dataset. 112 | Within these 100 million bytes are 205 unique tokens. 
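Since the character-level tables that follow report bits per character (BPC) while the word-level tables above report perplexity, it is worth noting that both are monotone transforms of the same average negative log-likelihood. Below is a minimal sketch of the conversion, assuming the loss is available in nats per symbol (the function names are illustrative, not taken from any particular library):

```python
import math

def nll_to_perplexity(nll_nats: float) -> float:
    # Perplexity is the exponentiated average negative log-likelihood.
    return math.exp(nll_nats)

def nll_to_bpc(nll_nats: float) -> float:
    # BPC expresses the same quantity in base-2 logarithms.
    return nll_nats / math.log(2)

# Example: a character-level model with an average loss of 0.7 nats/char.
print(nll_to_perplexity(0.7))  # ~2.01 per-character perplexity
print(nll_to_bpc(0.7))         # ~1.01, in the BPC units used below
```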
113 | 114 | | Model | Bit per Character (BPC) | Number of params | Paper / Source | Code | 115 | | ---------------- | :-----: | :-----: | -------------- | ---- | 116 | | Transformer-XL + RMS dynamic eval (Krause et al., 2019)* ***arxiv preprint*** | 0.94 | 277M | [Dynamic Evaluation of Transformer Language Models](https://arxiv.org/pdf/1904.08378.pdf) | [Official](https://github.com/benkrause/dynamiceval-transformer) | 117 | | Compressive Transformer (Rae et al., 2019) ***arxiv preprint*** | 0.97 | - | [Compressive Transformers for Long-Range Sequence Modelling](https://arxiv.org/pdf/1911.05507.pdf) | - | 118 | | Mogrifier LSTM + dynamic eval (Melis et al., 2019) | 0.988 | 96M | [Mogrifier LSTM](http://arxiv.org/abs/1909.01792) | | 119 | | 24-layer Transformer-XL (Dai et al., 2018) ***under review*** | 0.99 | 277M | [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/pdf/1901.02860.pdf) | [Official](https://github.com/kimiyoung/transformer-xl) | 120 | | 18-layer Transformer-XL (Dai et al., 2018) ***under review*** | 1.03 | 88M | [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/pdf/1901.02860.pdf) | [Official](https://github.com/kimiyoung/transformer-xl) | 121 | | 12-layer Transformer-XL (Dai et al., 2018) ***under review*** | 1.06 | 41M | [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/pdf/1901.02860.pdf) | [Official](https://github.com/kimiyoung/transformer-xl) | 122 | | 64-layer Character Transformer Model (Al-Rfou et al., 2018) | 1.06 | 235M | [Character-Level Language Modeling with Deeper Self-Attention](https://arxiv.org/abs/1808.04444) || 123 | | mLSTM + dynamic eval (Krause et al., 2017)* | 1.08 | 46M | [Dynamic Evaluation of Neural Sequence Models](https://arxiv.org/abs/1709.07432) | [Official](https://github.com/benkrause/dynamic-evaluation) | 124 | | 12-layer Character Transformer Model (Al-Rfou et al., 2018) | 1.11 | 44M | [Character-Level Language Modeling with Deeper Self-Attention](https://arxiv.org/abs/1808.04444) || 125 | | Mogrifier LSTM (Melis et al., 2019) | 1.122 | 96M | [Mogrifier LSTM](http://arxiv.org/abs/1909.01792) | | 126 | | 3-layer AWD-LSTM (Merity et al., 2018) | 1.232 | 47M | [An Analysis of Neural Language Modeling at Multiple Scales](https://arxiv.org/abs/1803.08240) | [Official](https://github.com/salesforce/awd-lstm-lm) | 127 | | Large mLSTM +emb +WN +VD (Krause et al., 2017) | 1.24 | 46M | [Multiplicative LSTM for sequence modelling](https://arxiv.org/abs/1609.07959) | [Official](https://github.com/benkrause/mLSTM) | 128 | | Large FS-LSTM-4 (Mujika et al., 2017) | 1.245 | 47M | [Fast-Slow Recurrent Neural Networks](https://arxiv.org/abs/1705.08639) | [Official](https://github.com/amujika/Fast-Slow-LSTM) | 129 | | Large RHN (Zilly et al., 2016) | 1.27 | 46M | [Recurrent Highway Networks](https://arxiv.org/abs/1607.03474) | [Official](https://github.com/jzilly/RecurrentHighwayNetworks) | 130 | | FS-LSTM-4 (Mujika et al., 2017) | 1.277 | 27M | [Fast-Slow Recurrent Neural Networks](https://arxiv.org/abs/1705.08639) | [Official](https://github.com/amujika/Fast-Slow-LSTM) | 131 | 132 | ### Text8 133 | [The text8 dataset](http://mattmahoney.net/dc/textdata.html) is also derived from Wikipedia text, but has all XML removed, and is lower cased to only have 26 characters of English text plus spaces. 
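As a rough illustration of the preprocessing just described (the canonical pipeline is Matt Mahoney's own script, which additionally spells out digits, so treat this sketch as a simplified stand-in rather than the exact procedure):

```python
import re

def text8_like_normalize(text: str) -> str:
    # Lowercase, then collapse every run of characters outside a-z into a
    # single space, leaving a 27-symbol alphabet (26 letters plus space).
    return re.sub(r"[^a-z]+", " ", text.lower()).strip()

print(text8_like_normalize("Hello, World!"))  # -> "hello world"
```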
134 | 135 | | Model | Bit per Character (BPC) | Number of params | Paper / Source | Code | 136 | | ---------------- | :-----: | :-----: | -------------- | ---- | 137 | | Transformer-XL + RMS dynamic eval (Krause et al., 2019)* ***arxiv preprint*** | 1.038 | 277M | [Dynamic Evaluation of Transformer Language Models](https://arxiv.org/pdf/1904.08378.pdf) | [Official](https://github.com/benkrause/dynamiceval-transformer) | 138 | | Transformer-XL Large (Dai et al., 2018) ***under review*** | 1.08 | 277M | [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/pdf/1901.02860.pdf) | [Official](https://github.com/kimiyoung/transformer-xl) | 139 | | 64-layer Character Transformer Model (Al-Rfou et al., 2018) | 1.13 | 235M | [Character-Level Language Modeling with Deeper Self-Attention](https://arxiv.org/abs/1808.04444) || 140 | | 12-layer Character Transformer Model (Al-Rfou et al., 2018) | 1.18 | 44M | [Character-Level Language Modeling with Deeper Self-Attention](https://arxiv.org/abs/1808.04444) || 141 | | mLSTM + dynamic eval (Krause et al., 2017)* | 1.19 | 45M | [Dynamic Evaluation of Neural Sequence Models](https://arxiv.org/abs/1709.07432) | [Official](https://github.com/benkrause/dynamic-evaluation) | 142 | | Large mLSTM +emb +WN +VD (Krause et al., 2016) | 1.27 | 45M | [Multiplicative LSTM for sequence modelling](https://arxiv.org/abs/1609.07959) | [Official](https://github.com/benkrause/mLSTM) | 143 | | Large RHN (Zilly et al., 2016) | 1.27 | 46M | [Recurrent Highway Networks](https://arxiv.org/abs/1607.03474) | [Official](https://github.com/jzilly/RecurrentHighwayNetworks) | 144 | | LayerNorm HM-LSTM (Chung et al., 2017) | 1.29 | 35M | [Hierarchical Multiscale Recurrent Neural Networks](https://arxiv.org/abs/1609.01704) || 145 | | BN LSTM (Cooijmans et al., 2016) | 1.36 | 16M | [Recurrent Batch Normalization](https://arxiv.org/abs/1603.09025) | [Official](https://github.com/cooijmanstim/recurrent-batch-normalization) | 146 | | Unregularised mLSTM (Krause et al., 2016) | 1.40 | 45M | [Multiplicative LSTM for sequence modelling](https://arxiv.org/abs/1609.07959) | [Official](https://github.com/benkrause/mLSTM) | 147 | 148 | ### Penn Treebank 149 | The vocabulary of the words in the character-level dataset is limited to 10,000, the same vocabulary as used in the word-level dataset. This vastly simplifies the task of character-level language modeling, as character transitions are limited to those found within the limited word-level vocabulary. 
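To make the last point concrete, the sketch below uses a toy stand-in for the 10,000-word vocabulary to show how a closed word list fixes, in advance, the full set of character bigrams a model can ever observe:

```python
# Toy stand-in for the closed 10,000-word vocabulary of word-level PTB.
vocab = ["the", "cat", "sat", "mat"]

bigrams = set()
for word in vocab:
    padded = f" {word} "  # spaces mark word boundaries in the character stream
    bigrams.update(zip(padded, padded[1:]))

# Every character transition the model can encounter is already in this set.
print(sorted("".join(pair) for pair in bigrams))
```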
150 | 151 | | Model | Bit per Character (BPC) | Number of params | Paper / Source | Code | 152 | | ---------------- | :-----: | :-----: | -------------- | ---- | 153 | | Mogrifier LSTM + dynamic eval (Melis et al., 2019)| 1.083 | 24M | [Mogrifier LSTM](http://arxiv.org/abs/1909.01792) | | 154 | | Mogrifier LSTM (Melis et al., 2019) | 1.120 | 24M | [Mogrifier LSTM](http://arxiv.org/abs/1909.01792) | | 155 | | Trellis Network (Bai et al., 2019) | 1.159 | 13.4M | [Trellis Networks for Sequence Modeling](https://openreview.net/pdf?id=HyeVtoRqtQ) | [Official](https://github.com/locuslab/trellisnet) 156 | | 3-layer AWD-LSTM (Merity et al., 2018) | 1.175 | 13.8M | [An Analysis of Neural Language Modeling at Multiple Scales](https://arxiv.org/abs/1803.08240) | [Official](https://github.com/salesforce/awd-lstm-lm) | 157 | | 6-layer QRNN (Merity et al., 2018) | 1.187 | 13.8M | [An Analysis of Neural Language Modeling at Multiple Scales](https://arxiv.org/abs/1803.08240) | [Official](https://github.com/salesforce/awd-lstm-lm) | 158 | | FS-LSTM-4 (Mujika et al., 2017) | 1.190 | 27M | [Fast-Slow Recurrent Neural Networks](https://arxiv.org/abs/1705.08639) | [Official](https://github.com/amujika/Fast-Slow-LSTM) | 159 | | FS-LSTM-2 (Mujika et al., 2017) | 1.193 | 27M | [Fast-Slow Recurrent Neural Networks](https://arxiv.org/abs/1705.08639) | [Official](https://github.com/amujika/Fast-Slow-LSTM) | 160 | | NASCell (Zoph & Le, 2016) | 1.214 | 16.3M | [Neural Architecture Search with Reinforcement Learning](https://arxiv.org/abs/1611.01578) || 161 | | 2-layer Norm HyperLSTM (Ha et al., 2016) | 1.219 | 14.4M | [HyperNetworks](https://arxiv.org/abs/1609.09106) || 162 | 163 | ### Multilingual Wikipedia Corpus 164 | 165 | The character-based [MWC](http://k-kawakami.com/research/mwc) dataset is a collection of Wikipedia pages available in a number of languages. Markup and rare characters were removed, but otherwise no preprocessing was applied. 166 | 167 | #### MWC English in the single text, large setting. 168 | 169 | | Model | Validation BPC | Test BPC | Number of params | Paper / Source | Code | 170 | | ------------- | :-----:| :-----: | :-----: | -------------- | ---- | 171 | | Mogrifier LSTM + dynamic eval (Melis et al., 2019)| 1.200 | 1.187 | 24M | [Mogrifier LSTM](http://arxiv.org/abs/1909.01792) | | 172 | | Mogrifier LSTM (Melis et al., 2019) | 1.312 | 1.298 | 24M | [Mogrifier LSTM](http://arxiv.org/abs/1909.01792) | | 173 | | HCLM with Cache (Kawakami et al. 2017) | 1.591 | 1.538 | 8M | [Learning to Create and Reuse Words in Open-Vocabulary Neural Language Modeling](https://arxiv.org/abs/1704.06986) | | 174 | | LSTM (Kawakami et al. 2017) | 1.793 | 1.736 | 8M | [Learning to Create and Reuse Words in Open-Vocabulary Neural Language Modeling](https://arxiv.org/abs/1704.06986) | | 175 | 176 | #### MWC Finnish in the single text, large setting. 177 | 178 | | Model | Validation BPC | Test BPC | Number of params | Paper / Source | Code | 179 | | ------------- | :-----:| :-----: | :-----: | -------------- | ---- | 180 | | Mogrifier LSTM + dynamic eval (Melis et al., 2019)| 1.202 | 1.191 | 24M | [Mogrifier LSTM](http://arxiv.org/abs/1909.01792) | | 181 | | Mogrifier LSTM (Melis et al., 2019) | 1.327 | 1.313 | 24M | [Mogrifier LSTM](http://arxiv.org/abs/1909.01792) | | 182 | | HCLM with Cache (Kawakami et al. 2017) | 1.754 | 1.711 | 8M | [Learning to Create and Reuse Words in Open-Vocabulary Neural Language Modeling](https://arxiv.org/abs/1704.06986) | | 183 | | LSTM (Kawakami et al. 
2017) | 1.943 | 1.913 | 8M | [Learning to Create and Reuse Words in Open-Vocabulary Neural Language Modeling](https://arxiv.org/abs/1704.06986) | | 184 | 185 | [Go back to the README](../README.md) 186 | -------------------------------------------------------------------------------- /machine translation/Neural_Machine_Translation_with_Attention.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Neural Machine Translation with Attention.ipynb", 7 | "provenance": [], 8 | "authorship_tag": "ABX9TyMw39GVIh7nxoBeTzGdp9Cy", 9 | "include_colab_link": true 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | "accelerator": "GPU" 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "markdown", 20 | "metadata": { 21 | "id": "view-in-github", 22 | "colab_type": "text" 23 | }, 24 | "source": [ 25 | "\"Open" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "metadata": { 31 | "id": "HIjQ1GZNtX2g", 32 | "colab_type": "code", 33 | "outputId": "ade6f82f-a558-4781-d3c0-8bcb0003c5f6", 34 | "colab": { 35 | "base_uri": "https://localhost:8080/", 36 | "height": 35 37 | } 38 | }, 39 | "source": [ 40 | "from __future__ import absolute_import, print_function, division, unicode_literals\n", 41 | "%tensorflow_version 2.x\n", 42 | "import tensorflow as tf\n", 43 | "import matplotlib.pyplot as plt\n", 44 | "%matplotlib inline\n", 45 | "import matplotlib.ticker as ticker\n", 46 | "from sklearn.model_selection import train_test_split\n", 47 | "\n", 48 | "import unicodedata\n", 49 | "import re\n", 50 | "import numpy as np\n", 51 | "import os\n", 52 | "import io\n", 53 | "import time" 54 | ], 55 | "execution_count": 0, 56 | "outputs": [ 57 | { 58 | "output_type": "stream", 59 | "text": [ 60 | "TensorFlow 2.x selected.\n" 61 | ], 62 | "name": "stdout" 63 | } 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "metadata": { 69 | "id": "uq975qFs7gV9", 70 | "colab_type": "code", 71 | "outputId": "0334647f-06d9-46f2-aff1-8feec8ea9cab", 72 | "colab": { 73 | "base_uri": "https://localhost:8080/", 74 | "height": 54 75 | } 76 | }, 77 | "source": [ 78 | "path_to_zip = tf.keras.utils.get_file('spa-eng.zip',origin='http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',extract=True)\n", 79 | "path_to_file = os.path.dirname(path_to_zip) + \"/spa-eng/spa.txt\"" 80 | ], 81 | "execution_count": 0, 82 | "outputs": [ 83 | { 84 | "output_type": "stream", 85 | "text": [ 86 | "Downloading data from http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip\n", 87 | "2646016/2638744 [==============================] - 0s 0us/step\n" 88 | ], 89 | "name": "stdout" 90 | } 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "metadata": { 96 | "id": "YLfzN__18DP7", 97 | "colab_type": "code", 98 | "colab": {} 99 | }, 100 | "source": [ 101 | "def unicode_to_ascii(s):\n", 102 | " return ''.join(c for c in unicodedata.normalize('NFD',s) if unicodedata.category(c) != 'Mn')" 103 | ], 104 | "execution_count": 0, 105 | "outputs": [] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "metadata": { 110 | "id": "w6IQh_mK8TuE", 111 | "colab_type": "code", 112 | "colab": {} 113 | }, 114 | "source": [ 115 | "def preprocess_sentence(w):\n", 116 | " w = unicode_to_ascii(w.lower().strip())\n", 117 | " w = re.sub(r\"([?.!,])\",r\" \\1 \",w)\n", 118 | " w = re.sub(r'[\" \"]+', \" \", w)\n", 119 | "\n", 120 | " w = re.sub(r\"[^a-zA-Z?.!,]+\",\" \", w)\n", 121 | 
"\n", 122 | " w = w.rstrip().strip()\n", 123 | " w = \" \" + w + \" \"\n", 124 | " return w" 125 | ], 126 | "execution_count": 0, 127 | "outputs": [] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "metadata": { 132 | "id": "jErs4FjJ8jIq", 133 | "colab_type": "code", 134 | "outputId": "1706d976-f5c3-4604-c47c-39e395347697", 135 | "colab": { 136 | "base_uri": "https://localhost:8080/", 137 | "height": 54 138 | } 139 | }, 140 | "source": [ 141 | "en_sentence = u\"May I borrow this book?\"\n", 142 | "sp_sentence = u\"Puedo tomar prestado este libro?\"\n", 143 | "print(preprocess_sentence(en_sentence))\n", 144 | "print(preprocess_sentence(sp_sentence))" 145 | ], 146 | "execution_count": 0, 147 | "outputs": [ 148 | { 149 | "output_type": "stream", 150 | "text": [ 151 | " may i borrow this book ? \n", 152 | " puedo tomar prestado este libro ? \n" 153 | ], 154 | "name": "stdout" 155 | } 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "metadata": { 161 | "id": "NikFMxRV9qJn", 162 | "colab_type": "code", 163 | "colab": {} 164 | }, 165 | "source": [ 166 | "" 167 | ], 168 | "execution_count": 0, 169 | "outputs": [] 170 | } 171 | ] 172 | } -------------------------------------------------------------------------------- /machine translation/machine translation.md: -------------------------------------------------------------------------------- 1 | # Machine translation 2 | 3 | Machine translation is the task of translating a sentence in a source language to a different target language. 4 | 5 | Results with a * indicate that the mean test score over the the best window based on average dev-set BLEU score over 6 | 21 consecutive evaluations is reported as in [Chen et al. (2018)](https://arxiv.org/abs/1804.09849). 7 | 8 | ### WMT 2014 EN-DE 9 | 10 | Models are evaluated on the English-German dataset of the Ninth Workshop on Statistical Machine Translation (WMT 2014) based 11 | on BLEU. 
12 | 13 | | Model | BLEU | Paper / Source | 14 | | ------------- | :-----:| --- | 15 | | Transformer Big + BT (Edunov et al., 2018) | 35.0 | [Understanding Back-Translation at Scale](https://arxiv.org/pdf/1808.09381.pdf) | 16 | | DeepL | 33.3 | [DeepL Press release](https://www.deepl.com/press.html) | 17 | | MUSE (Zhao et al., 2019)| 29.9 | [MUSE: Parallel Multi-Scale Attention for Sequence to Sequence Learning](https://arxiv.org/abs/1911.09483) | 18 | | DynamicConv (Wu et al., 2019)| 29.7 | [Pay Less Attention With Lightweight and Dynamic Convolutions](https://arxiv.org/abs/1901.10430) | 19 | | AdvSoft + Transformer Big (Wang et al., 2019)| 29.52 | [Improving Neural Language Modeling via Adversarial Training](http://proceedings.mlr.press/v97/wang19f/wang19f.pdf) | 20 | | Transformer Big (Ott et al., 2018) | 29.3 | [Scaling Neural Machine Translation](https://arxiv.org/abs/1806.00187) | 21 | | RNMT+ (Chen et al., 2018) | 28.5* | [The Best of Both Worlds: Combining Recent Advances in Neural Machine Translation](https://arxiv.org/abs/1804.09849) | 22 | | Transformer Big (Vaswani et al., 2017) | 28.4 | [Attention Is All You Need](https://arxiv.org/abs/1706.03762) | 23 | | Transformer Base (Vaswani et al., 2017) | 27.3 | [Attention Is All You Need](https://arxiv.org/abs/1706.03762) | 24 | | MoE (Shazeer et al., 2017) | 26.03 | [Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer](https://arxiv.org/abs/1701.06538) | 25 | | ConvS2S (Gehring et al., 2017) | 25.16 | [Convolutional Sequence to Sequence Learning](https://arxiv.org/abs/1705.03122) | 26 | 27 | ### WMT 2014 EN-FR 28 | 29 | Similarly, models are evaluated on the English-French dataset of the Ninth Workshop on Statistical Machine Translation (WMT 2014) based 30 | on BLEU. 
31 | 32 | | Model | BLEU | Paper / Source | 33 | | ------------- | :-----:| --- | 34 | | DeepL | 45.9 | [DeepL Press release](https://www.deepl.com/press.html) | 35 | | Transformer Big + BT (Edunov et al., 2018) | 45.6 | [Understanding Back-Translation at Scale](https://arxiv.org/pdf/1808.09381.pdf) | 36 | | MUSE (Zhao et al., 2019)| 43.5 | [MUSE: Parallel Multi-Scale Attention for Sequence to Sequence Learning](https://arxiv.org/abs/1911.09483) | 37 | | DynamicConv (Wu et al., 2019)| 43.2 | [Pay Less Attention With Lightweight and Dynamic Convolutions](https://arxiv.org/abs/1901.10430) | 38 | | Transformer Big (Ott et al., 2018) | 43.2 | [Scaling Neural Machine Translation](https://arxiv.org/abs/1806.00187) | 39 | | RNMT+ (Chen et al., 2018) | 41.0* | [The Best of Both Worlds: Combining Recent Advances in Neural Machine Translation](https://arxiv.org/abs/1804.09849) | 40 | | Transformer Big (Vaswani et al., 2017) | 41.0 | [Attention Is All You Need](https://arxiv.org/abs/1706.03762) | 41 | | MoE (Shazeer et al., 2017) | 40.56 | [Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer](https://arxiv.org/abs/1701.06538) | 42 | | ConvS2S (Gehring et al., 2017) | 40.46 | [Convolutional Sequence to Sequence Learning](https://arxiv.org/abs/1705.03122) | 43 | | Transformer Base (Vaswani et al., 2017) | 38.1 | [Attention Is All You Need](https://arxiv.org/abs/1706.03762) | 44 | 45 | [Go back to the README](../README.md) 46 | -------------------------------------------------------------------------------- /memory networks/Memory_Networks.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Memory Networks.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [], 9 | "authorship_tag": "ABX9TyPtxg35PNtyZ3A7bAMAoWPt", 10 | "include_colab_link": true 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | }, 16 | "accelerator": "GPU" 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "metadata": { 32 | "id": "vAQALE_CCZ7T", 33 | "colab_type": "code", 34 | "colab": {} 35 | }, 36 | "source": [ 37 | "from __future__ import absolute_import, print_function, unicode_literals, division\n", 38 | "from builtins import range, input" 39 | ], 40 | "execution_count": 0, 41 | "outputs": [] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "metadata": { 46 | "id": "LbpKb3PpYSFK", 47 | "colab_type": "code", 48 | "outputId": "18037470-f7d3-4261-fbd1-d0415cb3c344", 49 | "colab": { 50 | "base_uri": "https://localhost:8080/", 51 | "height": 83 52 | } 53 | }, 54 | "source": [ 55 | "import numpy as np\n", 56 | "import pandas as pd\n", 57 | "import matplotlib.pyplot as plt\n", 58 | "%matplotlib inline\n", 59 | "import re \n", 60 | "import os\n", 61 | "import sys\n", 62 | "import gc\n", 63 | "gc.enable()\n", 64 | "import tarfile\n", 65 | "\n", 66 | "from keras.models import Model, Sequential\n", 67 | "from keras.layers import Dense, Embedding, Input, Lambda, Reshape, add, dot, Activation \n", 68 | "from keras.preprocessing.sequence import pad_sequences\n", 69 | "from keras.optimizers import Adam, RMSprop\n", 70 | "from keras.utils import get_file\n", 71 | "import keras.backend as K\n", 72 | "\n", 73 | "import warnings\n", 74 | "warnings.simplefilter(\"ignore\")\n", 75 
| "warnings.filterwarnings(\"ignore\")" 76 | ], 77 | "execution_count": 0, 78 | "outputs": [ 79 | { 80 | "output_type": "stream", 81 | "text": [ 82 | "Using TensorFlow backend.\n" 83 | ], 84 | "name": "stderr" 85 | }, 86 | { 87 | "output_type": "display_data", 88 | "data": { 89 | "text/html": [ 90 | "

\n", 91 | "The default version of TensorFlow in Colab will soon switch to TensorFlow 2.x.
\n", 92 | "We recommend you upgrade now \n", 93 | "or ensure your notebook will continue to use TensorFlow 1.x via the %tensorflow_version 1.x magic:\n", 94 | "more info.

\n" 95 | ], 96 | "text/plain": [ 97 | "" 98 | ] 99 | }, 100 | "metadata": { 101 | "tags": [] 102 | } 103 | } 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": { 109 | "id": "QvZaRoPvqkgz", 110 | "colab_type": "text" 111 | }, 112 | "source": [ 113 | "### Single Supproting Fact" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "metadata": { 119 | "id": "hvQtSXQSZNYH", 120 | "colab_type": "code", 121 | "colab": {} 122 | }, 123 | "source": [ 124 | "path = get_file('babi-tasks-v1-2.tar.gz',\n", 125 | " origin='https://s3.amazonaws.com/text-datasets/babi_tasks_1-20_v1-2.tar.gz')\n", 126 | "\n", 127 | "tar = tarfile.open(path)" 128 | ], 129 | "execution_count": 0, 130 | "outputs": [] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "metadata": { 135 | "id": "XD5ROsULZs7v", 136 | "colab_type": "code", 137 | "colab": {} 138 | }, 139 | "source": [ 140 | "challenges = {\n", 141 | " 'single_supporting_fact_10k':'tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_{}.txt',\n", 142 | " 'two_supporting_fact_10k':'tasks_1-20_v1-2/en-10k/qa2_two-supporting-facts_{}.txt'\n", 143 | "}" 144 | ], 145 | "execution_count": 0, 146 | "outputs": [] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "metadata": { 151 | "id": "xGGfk7MTaT1q", 152 | "colab_type": "code", 153 | "colab": {} 154 | }, 155 | "source": [ 156 | "def tokenize(sent):\n", 157 | " return [x.strip() for x in re.split('(\\W+)?',sent) if x.strip()]" 158 | ], 159 | "execution_count": 0, 160 | "outputs": [] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "metadata": { 165 | "id": "Bta0uyNtamJW", 166 | "colab_type": "code", 167 | "colab": {} 168 | }, 169 | "source": [ 170 | "def get_stories(f):\n", 171 | " data = []\n", 172 | " story = []\n", 173 | " printed = False\n", 174 | " count = 0\n", 175 | " for line in f:\n", 176 | " count+=1\n", 177 | " if count < 5:\n", 178 | " print(line)\n", 179 | " line = line.decode('utf-8').strip()\n", 180 | " nid, line = line.split(' ', 1)\n", 181 | " if int(nid) == 1:\n", 182 | " story = []\n", 183 | " if '\\t' in line:\n", 184 | " q, a, supporting = line.split('\\t')\n", 185 | " q = tokenize(q)\n", 186 | " story_so_far = [[str(i)] + s for i, s in enumerate(story) if s]\n", 187 | " data.append((story_so_far, q, a))\n", 188 | " story.append('')\n", 189 | " else:\n", 190 | " story.append(tokenize(line))\n", 191 | " return data" 192 | ], 193 | "execution_count": 0, 194 | "outputs": [] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "metadata": { 199 | "id": "VbkFgUm8bvmW", 200 | "colab_type": "code", 201 | "colab": {} 202 | }, 203 | "source": [ 204 | "def should_flatten(el):\n", 205 | " return not isinstance(el, (str, bytes))\n", 206 | "\n", 207 | "def flatten(l):\n", 208 | " for el in l:\n", 209 | " if should_flatten(el):\n", 210 | " yield from flatten(el)\n", 211 | " else:\n", 212 | " yield el" 213 | ], 214 | "execution_count": 0, 215 | "outputs": [] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "metadata": { 220 | "id": "G49NVvvWcMH2", 221 | "colab_type": "code", 222 | "colab": {} 223 | }, 224 | "source": [ 225 | "def vectorize_stories(data, word2idx, story_maxlen, query_maxlen):\n", 226 | " inputs, queries, answers = [], [], []\n", 227 | " for story, query, answer in data:\n", 228 | " inputs.append([[word2idx[w] for w in s] for s in story])\n", 229 | " queries.append([word2idx[w] for w in query])\n", 230 | " answers.append([word2idx[answer]])\n", 231 | " return (\n", 232 | " [pad_sequences(x, maxlen=story_maxlen) for x in inputs],\n", 233 | " pad_sequences(queries, 
maxlen=query_maxlen),\n", 234 | " np.array(answers)\n", 235 | " )" 236 | ], 237 | "execution_count": 0, 238 | "outputs": [] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "metadata": { 243 | "id": "6F2JRWqXerXl", 244 | "colab_type": "code", 245 | "colab": {} 246 | }, 247 | "source": [ 248 | "def stack_inputs(inputs, story_maxsents, story_maxlen):\n", 249 | " for i, story in enumerate(inputs):\n", 250 | " inputs[i] = np.concatenate(\n", 251 | " [\n", 252 | " story, \n", 253 | " np.zeros((story_maxsents-story.shape[0], story_maxlen),'int')\n", 254 | " ]\n", 255 | " )\n", 256 | " return np.stack(inputs)" 257 | ], 258 | "execution_count": 0, 259 | "outputs": [] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "metadata": { 264 | "id": "WA0HHRoSfxHW", 265 | "colab_type": "code", 266 | "colab": {} 267 | }, 268 | "source": [ 269 | "def get_data(challenge_type):\n", 270 | " challenge = challenges[challenge_type]\n", 271 | " \n", 272 | " train_stories = get_stories(tar.extractfile(challenge.format('train')))\n", 273 | " test_stories = get_stories(tar.extractfile(challenge.format('test')))\n", 274 | " \n", 275 | " stories = train_stories + test_stories\n", 276 | " \n", 277 | " story_maxlen = max((len(s) for x, _, _ in stories for s in x))\n", 278 | " story_maxsents = max((len(x) for x, _, _ in stories))\n", 279 | " query_maxlen = max(len(x) for _, x, _ in stories)\n", 280 | "\n", 281 | " vocab = sorted(set(flatten(stories)))\n", 282 | " vocab.insert(0, '')\n", 283 | " vocab_size = len(vocab)\n", 284 | "\n", 285 | " word2idx = {c:i for i, c in enumerate(vocab)}\n", 286 | "\n", 287 | " inputs_train, queries_train, answers_train = vectorize_stories(\n", 288 | " train_stories,\n", 289 | " word2idx,\n", 290 | " story_maxlen,\n", 291 | " query_maxlen\n", 292 | " )\n", 293 | " inputs_test, queries_test, answers_test = vectorize_stories(\n", 294 | " test_stories, \n", 295 | " word2idx,\n", 296 | " story_maxlen,\n", 297 | " query_maxlen\n", 298 | " )\n", 299 | " inputs_train = stack_inputs(inputs_train, story_maxsents, story_maxlen)\n", 300 | " inputs_test = stack_inputs(inputs_test, story_maxsents, story_maxlen)\n", 301 | " print(f\"inputs_train.shape {inputs_train.shape}, inputs_test.shape {inputs_test.shape}\")\n", 302 | " return train_stories, test_stories, inputs_train, queries_train, answers_train, \\\n", 303 | " inputs_test, queries_test, answers_test, story_maxsents, story_maxlen, query_maxlen, vocab, vocab_size " 304 | ], 305 | "execution_count": 0, 306 | "outputs": [] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "metadata": { 311 | "id": "WQvw8syFiFzF", 312 | "colab_type": "code", 313 | "outputId": "5459a452-ccc5-45cd-8a7f-7d2de1b2d998", 314 | "colab": { 315 | "base_uri": "https://localhost:8080/", 316 | "height": 181 317 | } 318 | }, 319 | "source": [ 320 | "train_stories, test_stories, inputs_train, queries_train, answers_train, \\\n", 321 | " inputs_test, queries_test, answers_test, story_maxsents, story_maxlen, query_maxlen, vocab, vocab_size = get_data('single_supporting_fact_10k')" 322 | ], 323 | "execution_count": 0, 324 | "outputs": [ 325 | { 326 | "output_type": "stream", 327 | "text": [ 328 | "b'1 Mary moved to the bathroom.\\n'\n", 329 | "b'2 John went to the hallway.\\n'\n", 330 | "b'3 Where is Mary? \\tbathroom\\t1\\n'\n", 331 | "b'4 Daniel went back to the hallway.\\n'\n", 332 | "b'1 John travelled to the hallway.\\n'\n", 333 | "b'2 Mary journeyed to the bathroom.\\n'\n", 334 | "b'3 Where is John? 
\\thallway\\t1\\n'\n", 335 | "b'4 Daniel went back to the bathroom.\\n'\n", 336 | "inputs_train.shape (10000, 10, 8), inputs_test.shape (1000, 10, 8)\n" 337 | ], 338 | "name": "stdout" 339 | } 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "metadata": { 345 | "id": "HyR1p8i3iUtc", 346 | "colab_type": "code", 347 | "outputId": "1f52cefb-3961-4884-9c97-91b11268a399", 348 | "colab": { 349 | "base_uri": "https://localhost:8080/", 350 | "height": 892 351 | } 352 | }, 353 | "source": [ 354 | "embedding_dim = 15\n", 355 | "\n", 356 | "input_story_ = Input((story_maxsents, story_maxlen))\n", 357 | "embedded_story = Embedding(vocab_size, embedding_dim)(input_story_)\n", 358 | "embedded_story = Lambda(lambda x: K.sum(x, axis=2))(embedded_story)\n", 359 | "print('input_story_.shape, embedded_story.shape: ', input_story_.shape, embedded_story.shape)\n", 360 | "\n", 361 | "\n", 362 | "input_question_ = Input((query_maxlen, ))\n", 363 | "embedded_question = Embedding(vocab_size, embedding_dim)(input_question_)\n", 364 | "embedded_question = Lambda(lambda x: K.sum(x, axis=1))(embedded_question)\n", 365 | "\n", 366 | "embedded_question = Reshape((1, embedding_dim))(embedded_question)\n", 367 | "print('inp_q.shape, emb_q.shape', input_question_.shape, embedded_question.shape)\n", 368 | "\n", 369 | "x = dot([embedded_story, embedded_question], 2)\n", 370 | "x = Reshape((story_maxsents, ))(x)\n", 371 | "x = Activation('softmax')(x)\n", 372 | "story_weights = Reshape((story_maxsents, 1))(x)\n", 373 | "print(\"story_weights.shape\", story_weights.shape)\n", 374 | "\n", 375 | "x = dot([story_weights, embedded_story], 1)\n", 376 | "x = Reshape((embedding_dim, ))(x)\n", 377 | "ans = Dense(vocab_size, activation='softmax')(x)\n", 378 | "\n", 379 | "model = Model([input_story_, input_question_], ans)\n", 380 | "\n", 381 | "model.compile(optimizer = RMSprop(lr=1e-2),\n", 382 | " loss='sparse_categorical_crossentropy',\n", 383 | " metrics=['accuracy'])\n", 384 | "\n", 385 | "r = model.fit([inputs_train, queries_train],\n", 386 | " answers_train,\n", 387 | " epochs=10,\n", 388 | " batch_size=32,\n", 389 | " validation_data=([inputs_test, queries_test], answers_test)\n", 390 | " )" 391 | ], 392 | "execution_count": 0, 393 | "outputs": [ 394 | { 395 | "output_type": "stream", 396 | "text": [ 397 | "input_story_.shape, embedded_story.shape: (?, 10, 8) (?, 10, 15)\n", 398 | "inp_q.shape, emb_q.shape (?, 4) (?, 1, 15)\n", 399 | "story_weights.shape (?, 10, 1)\n", 400 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/optimizers.py:793: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.\n", 401 | "\n", 402 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3622: The name tf.log is deprecated. Please use tf.math.log instead.\n", 403 | "\n", 404 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/math_grad.py:1424: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n", 405 | "Instructions for updating:\n", 406 | "Use tf.where in 2.0, which has the same broadcast rule as np.where\n", 407 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:1033: The name tf.assign_add is deprecated. 
Please use tf.compat.v1.assign_add instead.\n", 408 | "\n", 409 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:1020: The name tf.assign is deprecated. Please use tf.compat.v1.assign instead.\n", 410 | "\n", 411 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3005: The name tf.Session is deprecated. Please use tf.compat.v1.Session instead.\n", 412 | "\n", 413 | "Train on 10000 samples, validate on 1000 samples\n", 414 | "Epoch 1/10\n", 415 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:190: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.\n", 416 | "\n", 417 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:197: The name tf.ConfigProto is deprecated. Please use tf.compat.v1.ConfigProto instead.\n", 418 | "\n", 419 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:207: The name tf.global_variables is deprecated. Please use tf.compat.v1.global_variables instead.\n", 420 | "\n", 421 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:216: The name tf.is_variable_initialized is deprecated. Please use tf.compat.v1.is_variable_initialized instead.\n", 422 | "\n", 423 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:223: The name tf.variables_initializer is deprecated. Please use tf.compat.v1.variables_initializer instead.\n", 424 | "\n", 425 | "10000/10000 [==============================] - 9s 879us/step - loss: 0.7462 - acc: 0.7281 - val_loss: 0.0239 - val_acc: 0.9950\n", 426 | "Epoch 2/10\n", 427 | "10000/10000 [==============================] - 1s 147us/step - loss: 0.0122 - acc: 0.9962 - val_loss: 1.9996e-04 - val_acc: 1.0000\n", 428 | "Epoch 3/10\n", 429 | "10000/10000 [==============================] - 1s 146us/step - loss: 0.0049 - acc: 0.9990 - val_loss: 0.0051 - val_acc: 0.9960\n", 430 | "Epoch 4/10\n", 431 | "10000/10000 [==============================] - 1s 145us/step - loss: 0.0034 - acc: 0.9991 - val_loss: 0.0037 - val_acc: 0.9990\n", 432 | "Epoch 5/10\n", 433 | "10000/10000 [==============================] - 2s 151us/step - loss: 0.0040 - acc: 0.9990 - val_loss: 3.2651e-06 - val_acc: 1.0000\n", 434 | "Epoch 6/10\n", 435 | "10000/10000 [==============================] - 1s 150us/step - loss: 0.0029 - acc: 0.9995 - val_loss: 2.9528e-05 - val_acc: 1.0000\n", 436 | "Epoch 7/10\n", 437 | "10000/10000 [==============================] - 1s 148us/step - loss: 0.0032 - acc: 0.9993 - val_loss: 1.3819e-05 - val_acc: 1.0000\n", 438 | "Epoch 8/10\n", 439 | "10000/10000 [==============================] - 1s 148us/step - loss: 0.0043 - acc: 0.9995 - val_loss: 2.3058e-04 - val_acc: 1.0000\n", 440 | "Epoch 9/10\n", 441 | "10000/10000 [==============================] - 1s 147us/step - loss: 0.0011 - acc: 0.9999 - val_loss: 3.1609e-06 - val_acc: 1.0000\n", 442 | "Epoch 10/10\n", 443 | "10000/10000 [==============================] - 2s 151us/step - loss: 3.1586e-06 - acc: 1.0000 - val_loss: 3.1608e-06 - val_acc: 1.0000\n" 444 | ], 445 | "name": "stdout" 446 | } 447 | ] 448 | }, 449 | { 450 | "cell_type": "markdown", 451 | "metadata": { 452 | "id": "UeaIAdGkqgDh", 453 | "colab_type": "text" 454 | }, 455 | "source": [ 456 | "#### Demo" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 
| "metadata": { 462 | "id": "33TfsnFOnky1", 463 | "colab_type": "code", 464 | "outputId": "2f817ccc-fae6-408a-cc94-2a17448cd96f", 465 | "colab": { 466 | "base_uri": "https://localhost:8080/", 467 | "height": 1000 468 | } 469 | }, 470 | "source": [ 471 | "while True:\n", 472 | "\n", 473 | " debug_model = Model([input_story_, input_question_], story_weights)\n", 474 | "\n", 475 | " story_idx = np.random.choice(len(train_stories))\n", 476 | "\n", 477 | " i = inputs_train[story_idx:story_idx+1]\n", 478 | " q = queries_train[story_idx:story_idx+1]\n", 479 | " w = debug_model.predict([i,q]).flatten()\n", 480 | "\n", 481 | " story, question, ans = train_stories[story_idx]\n", 482 | " print(\"story:\\n\")\n", 483 | " for i, line in enumerate(story):\n", 484 | " print(\"{:1.5f}\".format(w[i]), \"\\t\", \" \".join(line))\n", 485 | " print()\n", 486 | " print(\"question: \", \" \".join(question))\n", 487 | " print(\"answer: \", ans)\n", 488 | "\n", 489 | " print()\n", 490 | " if input(\"Another story.? y/n\") == \"n\":\n", 491 | " break" 492 | ], 493 | "execution_count": 0, 494 | "outputs": [ 495 | { 496 | "output_type": "stream", 497 | "text": [ 498 | "story:\n", 499 | "\n", 500 | "0.00000 \t 0 John moved to the bedroom .\n", 501 | "0.00000 \t 1 Daniel journeyed to the bathroom .\n", 502 | "0.00000 \t 3 Daniel moved to the hallway .\n", 503 | "0.00000 \t 4 Sandra journeyed to the garden .\n", 504 | "0.00010 \t 6 Daniel went back to the bedroom .\n", 505 | "0.00000 \t 7 Mary moved to the hallway .\n", 506 | "0.02529 \t 9 Daniel went to the kitchen .\n", 507 | "0.97460 \t 10 Daniel went back to the hallway .\n", 508 | "0.00000 \t 12 Sandra went to the bathroom .\n", 509 | "0.00001 \t 13 Sandra travelled to the bedroom .\n", 510 | "\n", 511 | "question: Where is Daniel ?\n", 512 | "answer: hallway\n", 513 | "\n", 514 | "Another story.? y/ny\n", 515 | "story:\n", 516 | "\n", 517 | "0.00000 \t 0 Daniel went to the bedroom .\n", 518 | "0.00000 \t 1 Daniel travelled to the office .\n", 519 | "0.00000 \t 3 Sandra went to the office .\n", 520 | "0.00000 \t 4 John travelled to the office .\n", 521 | "0.00000 \t 6 John travelled to the kitchen .\n", 522 | "0.00000 \t 7 John journeyed to the office .\n", 523 | "0.04168 \t 9 Daniel moved to the bathroom .\n", 524 | "0.95832 \t 10 Daniel moved to the garden .\n", 525 | "\n", 526 | "question: Where is Daniel ?\n", 527 | "answer: garden\n", 528 | "\n", 529 | "Another story.? y/ny\n", 530 | "story:\n", 531 | "\n", 532 | "0.00000 \t 0 Mary moved to the hallway .\n", 533 | "0.00000 \t 1 Mary travelled to the bathroom .\n", 534 | "0.00000 \t 3 Mary went back to the office .\n", 535 | "0.00000 \t 4 Daniel travelled to the hallway .\n", 536 | "0.00000 \t 6 Sandra moved to the bedroom .\n", 537 | "0.00000 \t 7 Mary travelled to the bedroom .\n", 538 | "0.00000 \t 9 Daniel went back to the bathroom .\n", 539 | "0.00000 \t 10 Daniel went to the kitchen .\n", 540 | "0.00000 \t 12 Daniel journeyed to the bathroom .\n", 541 | "1.00000 \t 13 Mary journeyed to the garden .\n", 542 | "\n", 543 | "question: Where is Mary ?\n", 544 | "answer: garden\n", 545 | "\n", 546 | "Another story.? 
y/ny\n", 547 | "story:\n", 548 | "\n", 549 | "0.00000 \t 0 Sandra went to the bedroom .\n", 550 | "0.00000 \t 1 Mary journeyed to the hallway .\n", 551 | "0.00000 \t 3 Sandra went back to the bathroom .\n", 552 | "0.00000 \t 4 Sandra went to the kitchen .\n", 553 | "0.00124 \t 6 Daniel went back to the kitchen .\n", 554 | "0.02751 \t 7 Daniel travelled to the bathroom .\n", 555 | "0.97125 \t 9 Daniel went back to the hallway .\n", 556 | "0.00000 \t 10 Mary journeyed to the office .\n", 557 | "\n", 558 | "question: Where is Daniel ?\n", 559 | "answer: hallway\n", 560 | "\n", 561 | "Another story.? y/ny\n", 562 | "story:\n", 563 | "\n", 564 | "0.99855 \t 0 Mary moved to the bathroom .\n", 565 | "0.00000 \t 1 John went to the garden .\n", 566 | "\n", 567 | "question: Where is Mary ?\n", 568 | "answer: bathroom\n", 569 | "\n", 570 | "Another story.? y/ny\n", 571 | "story:\n", 572 | "\n", 573 | "0.00953 \t 0 Sandra went back to the garden .\n", 574 | "0.99046 \t 1 Sandra went to the bedroom .\n", 575 | "\n", 576 | "question: Where is Sandra ?\n", 577 | "answer: bedroom\n", 578 | "\n", 579 | "Another story.? y/ny\n", 580 | "story:\n", 581 | "\n", 582 | "0.00000 \t 0 Mary went to the bedroom .\n", 583 | "0.00000 \t 1 Sandra went back to the hallway .\n", 584 | "0.10075 \t 3 Daniel moved to the hallway .\n", 585 | "0.89925 \t 4 Daniel journeyed to the bedroom .\n", 586 | "\n", 587 | "question: Where is Daniel ?\n", 588 | "answer: bedroom\n", 589 | "\n", 590 | "Another story.? y/ny\n", 591 | "story:\n", 592 | "\n", 593 | "0.00000 \t 0 Mary went back to the office .\n", 594 | "0.00000 \t 1 Daniel moved to the bathroom .\n", 595 | "0.00000 \t 3 Daniel journeyed to the hallway .\n", 596 | "0.00000 \t 4 Mary travelled to the hallway .\n", 597 | "0.00000 \t 6 Mary went to the bathroom .\n", 598 | "0.99993 \t 7 John journeyed to the office .\n", 599 | "0.00000 \t 9 Sandra went to the bedroom .\n", 600 | "0.00007 \t 10 Sandra moved to the hallway .\n", 601 | "\n", 602 | "question: Where is John ?\n", 603 | "answer: office\n", 604 | "\n", 605 | "Another story.? y/ny\n", 606 | "story:\n", 607 | "\n", 608 | "0.00000 \t 0 Sandra went back to the garden .\n", 609 | "0.99996 \t 1 Daniel travelled to the bathroom .\n", 610 | "0.00000 \t 3 Sandra journeyed to the bedroom .\n", 611 | "0.00004 \t 4 Mary went to the kitchen .\n", 612 | "\n", 613 | "question: Where is Daniel ?\n", 614 | "answer: bathroom\n", 615 | "\n", 616 | "Another story.? 
y/nn\n" 617 | ], 618 | "name": "stdout" 619 | } 620 | ] 621 | } 622 | ] 623 | } -------------------------------------------------------------------------------- /music generation/music generation.md: -------------------------------------------------------------------------------- 1 | # Music Generation 2 | 3 | > Find more at [Papers with Code](https://paperswithcode.com/task/music-generation) 4 | -------------------------------------------------------------------------------- /named-entity-recognition/Named_Entitiy_Recognition_spacy_simple.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Named Entitiy Recognition - spacy simple.ipynb", 7 | "provenance": [], 8 | "authorship_tag": "ABX9TyMqfurMsLcxU8ktEAjINPwV", 9 | "include_colab_link": true 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "metadata": { 20 | "id": "view-in-github", 21 | "colab_type": "text" 22 | }, 23 | "source": [ 24 | "\"Open" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "metadata": { 30 | "id": "GI2cRsuAM9DV", 31 | "colab_type": "code", 32 | "colab": {} 33 | }, 34 | "source": [ 35 | "import spacy" 36 | ], 37 | "execution_count": 0, 38 | "outputs": [] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "metadata": { 43 | "id": "ogXVeetfNAeN", 44 | "colab_type": "code", 45 | "colab": {} 46 | }, 47 | "source": [ 48 | "nlp = spacy.load(\"en\", tagger=False, parser=False, matcher=False)" 49 | ], 50 | "execution_count": 0, 51 | "outputs": [] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "metadata": { 56 | "id": "WyrdAJYkNHsl", 57 | "colab_type": "code", 58 | "colab": {} 59 | }, 60 | "source": [ 61 | "doc = nlp(\"Hello my name is rohit singh and i live in India.\")" 62 | ], 63 | "execution_count": 0, 64 | "outputs": [] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "metadata": { 69 | "id": "yz310JCXNMQ0", 70 | "colab_type": "code", 71 | "colab": { 72 | "base_uri": "https://localhost:8080/", 73 | "height": 33 74 | }, 75 | "outputId": "5a06a578-43c6-47f2-a698-cc91a8389c57" 76 | }, 77 | "source": [ 78 | "for ent in doc.ents:\n", 79 | " print(ent.label_, ent.text)" 80 | ], 81 | "execution_count": 6, 82 | "outputs": [ 83 | { 84 | "output_type": "stream", 85 | "text": [ 86 | "GPE India\n" 87 | ], 88 | "name": "stdout" 89 | } 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "metadata": { 95 | "id": "7gaxDfFVNmK-", 96 | "colab_type": "code", 97 | "colab": {} 98 | }, 99 | "source": [ 100 | "# other libraries\n", 101 | "###\n", 102 | "# polyglot\n", 103 | "# gensim\n", 104 | "# nltk" 105 | ], 106 | "execution_count": 0, 107 | "outputs": [] 108 | } 109 | ] 110 | } -------------------------------------------------------------------------------- /question-answering/question answering.md: -------------------------------------------------------------------------------- 1 | # Question answering 2 | 3 | Question answering is the task of answering a question. 
4 | 5 | ### Table of contents 6 | 7 | - [ARC](#arc) 8 | - [ShARC](#sharc) 9 | - [Reading comprehension](#reading-comprehension) 10 | - [CliCR](#clicr) 11 | - [CNN / Daily Mail](#cnn--daily-mail) 12 | - [CODAH](#codah) 13 | - [CoQA](#coqa) 14 | - [HotpotQA](#hotpotqa) 15 | - [MS MARCO](#ms-marco) 16 | - [MultiRC](#multirc) 17 | - [NewsQA](#newsqa) 18 | - [QAngaroo](#qangaroo) 19 | - [QuAC](#quac) 20 | - [RACE](#race) 21 | - [SQuAD](#squad) 22 | - [Story Cloze Test](#story-cloze-test) 23 | - [SWAG](#swag) 24 | - [Recipe QA](#recipeqa) 25 | - [NarrativeQA](#narrativeqa) 26 | - [DuoRC](#duorc) 27 | - [DROP](#drop) 28 | - [Cosmos QA](#cosmos-qa) 29 | - [Open-domain Question Answering](#open-domain-question-answering) 30 | - [DuReader](#dureader) 31 | - [Quasar](#quasar) 32 | - [SearchQA](#searchqa) 33 | - [Knowledge Base Question Answering](#knowledge-base-question-answering) 34 | 35 | ### ARC 36 | 37 | The [AI2 Reasoning Challenge (ARC)](http://ai2-website.s3.amazonaws.com/publications/AI2ReasoningChallenge2018.pdf) 38 | dataset is a question answering dataset containing 7,787 genuine grade-school level, multiple-choice science questions. 39 | The dataset is partitioned into a Challenge Set and an Easy Set. The Challenge Set contains only questions 40 | answered incorrectly by both a retrieval-based algorithm and a word co-occurrence algorithm. Models are evaluated 41 | based on accuracy. 42 | 43 | A public leaderboard is available on the [ARC website](http://data.allenai.org/arc/). 44 | 45 | ### ShARC 46 | 47 | [ShARC](https://arxiv.org/abs/1809.01494) is a challenging QA dataset that requires logical reasoning, elements of entailment/NLI and natural language generation. 48 | 49 | Most work in machine reading focuses on question answering problems where the answer is directly expressed in the text to read. However, many real-world question answering problems require the reading of text not because it contains the literal answer, but because it contains a recipe to derive an answer together with the reader's background knowledge. We formalise this task and introduce the challenging ShARC dataset with 32k task instances. 50 | 51 | The goal is to answer questions by possibly asking follow-up questions first. We assume that the question does not provide enough information to be answered directly. However, a model can use the supporting rule text to infer what needs to be asked in order to determine the final answer. Concretely, the model must decide whether to answer with "Yes", "No", "Irrelevant", or to generate a follow-up question given rule text, a user scenario and a conversation history. Performance is measured with Micro and Macro Accuracy for "Yes"/"No"/"Irrelevant"/"More" classifications, and the quality of follow-up questions is measured with BLEU. 52 | 53 | The public data, further task details and public leaderboard are available on the [ShARC Website](https://sharc-data.github.io/). 54 | 55 | ## Reading comprehension 56 | 57 | Most current question answering datasets frame the task as reading comprehension, where the question is about a paragraph 58 | or document and the answer often is a span in the document. The Machine Reading group 59 | at UCL also provides an [overview of reading comprehension tasks](https://uclnlp.github.io/ai4exams/data.html). 60 | 61 | ### CliCR 62 | 63 | The [CliCR dataset](http://aclweb.org/anthology/N18-1140) is a gap-filling reading comprehension dataset consisting of around 100,000 queries and their associated documents. 
The dataset was built from clinical case reports, requiring the reader to answer the query with a medical problem/test/treatment entity. The abilities to perform bridging inferences and track objects have been found to be the most frequently required skills for successful answering. 64 | 65 | The instructions for accessing the dataset, the processing scripts, the baselines and the adaptations of some neural models can be found [here](https://github.com/clips/clicr). 66 | 67 | Example: 68 | 69 | | Document | Question | Answer | 70 | | ------------- | -----:| -----: | 71 | | We report a case of a 72-year-old Caucasian woman with pl-7 positive antisynthetase syndrome. Clinical presentation included interstitial lung disease, myositis, mechanic’s hands and dysphagia. As lung injury was the main concern, treatment consisted of prednisolone and cyclophosphamide. Complete remission with reversal of pulmonary damage was achieved, as reported by CT scan, pulmonary function tests and functional status. [...] | Therefore, in severe cases an aggressive treatment, combining ________ and glucocorticoids as used in systemic vasculitis, is suggested. | cyclophosphamide | 72 | 73 | | Model | F1 | Paper | 74 | | ------------- | :-----:| --- | 75 | | Gated-Attention Reader (Dhingra et al., 2017) | 33.9 | [CliCR: A Dataset of Clinical Case Reports for Machine Reading Comprehension](http://aclweb.org/anthology/N18-1140) | 76 | | Stanford Attentive Reader (Chen et al., 2016) | 27.2 | [CliCR: A Dataset of Clinical Case Reports for Machine Reading Comprehension](http://aclweb.org/anthology/N18-1140) | 77 | 78 | ### CNN / Daily Mail 79 | 80 | The [CNN / Daily Mail dataset](https://arxiv.org/abs/1506.03340) is a Cloze-style reading comprehension dataset 81 | created from CNN and Daily Mail news articles using heuristics. [Cloze-style](https://en.wikipedia.org/wiki/Cloze_test) 82 | means that a missing word has to be inferred. In this case, "questions" were created by replacing entities 83 | from bullet points summarizing one or several aspects of the article. Coreferent entities have been replaced with an 84 | entity marker @entityn where n is a distinct index. 85 | The model is tasked to infer the missing entity 86 | in the bullet point based on the content of the corresponding article, and models are evaluated based on 87 | their accuracy on the test set. 88 | 89 | | | CNN | Daily Mail | 90 | | ------------- | -----:| -----: | 91 | | # Train | 380,298 | 879,450 | 92 | | # Dev | 3,924 | 64,835 | 93 | | # Test | 3,198 | 53,182 | 94 | 95 | Example: 96 | 97 | | Passage | Question | Answer | 98 | | ------------- | -----:| -----: | 99 | | ( @entity4 ) if you feel a ripple in the force today , it may be the news that the official @entity6 is getting its first gay character . according to the sci-fi website @entity9 , the upcoming novel " @entity11 " will feature a capable but flawed @entity13 official named @entity14 who " also happens to be a lesbian . " the character is the first gay figure in the official @entity6 -- the movies , television shows , comics and books approved by @entity6 franchise owner @entity22 -- according to @entity24 , editor of " @entity6 " books at @entity28 imprint @entity26 . 
| characters in " @placeholder " movies have gradually become more diverse | @entity6 | 100 | 101 | | Model | CNN | Daily Mail | Paper / Source | 102 | | ------------- | :-----:| :-----:|--- | 103 | | GA Reader (Dhingra et al., 2017) | 77.9 | 80.9 | [Gated-Attention Readers for Text Comprehension](http://aclweb.org/anthology/P17-1168) | 104 | | BiDAF (Seo et al., 2017) | 76.9 | 79.6 |[Bidirectional Attention Flow for Machine Comprehension](https://arxiv.org/pdf/1611.01603.pdf)| 105 | | AoA Reader (Cui et al., 2017) | 74.4 | - | [Attention-over-Attention Neural Networks for Reading Comprehension](http://aclweb.org/anthology/P17-1055) | 106 | | Neural net (Chen et al., 2016) | 72.4 | 75.8 | [A Thorough Examination of the CNN/Daily Mail Reading Comprehension Task](https://www.aclweb.org/anthology/P16-1223) | 107 | | Classifier (Chen et al., 2016) | 67.9 | 68.3 | [A Thorough Examination of the CNN/Daily Mail Reading Comprehension Task](https://www.aclweb.org/anthology/P16-1223) | 108 | | Impatient Reader (Hermann et al., 2015) | 63.8 | 68.0 | [Teaching Machines to Read and Comprehend](https://arxiv.org/abs/1506.03340) | 109 | 110 | ### CODAH 111 | [CODAH](https://arxiv.org/abs/1904.04365) is an adversarially-constructed evaluation dataset with 2.8k questions for testing common sense. CODAH forms a challenging extension to the SWAG dataset, which tests commonsense knowledge using sentence-completion questions that describe situations observed in video. 112 | 113 | The dataset and more information can be found [here](https://github.com/Websail-NU/CODAH). 114 | 115 | ### CoQA 116 | 117 | [CoQA](https://arxiv.org/abs/1808.07042) is a large-scale dataset for building Conversational Question Answering systems. 118 | CoQA contains 127,000+ questions with answers collected from 8000+ conversations. 119 | Each conversation is collected by pairing two crowdworkers to chat about a passage in the form of questions and answers. 120 | 121 | The data and public leaderboard are available [here](https://stanfordnlp.github.io/coqa/). 122 | 123 | ### HotpotQA 124 | 125 | HotpotQA is a dataset with 113k Wikipedia-based question-answer pairs. Questions require 126 | finding and reasoning over multiple supporting documents and are not constrained to any pre-existing knowledge bases. 127 | Sentence-level supporting facts are available. 128 | 129 | The data and public leaderboard are available from the [HotpotQA website](https://hotpotqa.github.io/). 130 | 131 | ### MS MARCO 132 | [MS MARCO](http://www.msmarco.org/dataset.aspx), a.k.a. the Human Generated MAchine 133 | Reading COmprehension Dataset, is designed and developed by Microsoft AI & Research. [Link to paper](https://arxiv.org/abs/1611.09268) 134 | - The questions are obtained from real anonymized user queries. 135 | - The answers are human generated. The context passages from which the answers are obtained are extracted from real documents using the latest Bing search engine. 136 | - The dataset contains 100,000 queries, a subset of which have multiple answers; the creators aim to release 1M queries in the future. 137 | 138 | The leaderboards for multiple tasks are available on the [MS MARCO leaderboard page](http://www.msmarco.org/leaders.aspx). 139 | 140 | ### MultiRC 141 | MultiRC (Multi-Sentence Reading Comprehension) is a dataset of short paragraphs and multi-sentence questions that can be answered from the content of the paragraph.
142 | The dataset was designed with three key challenges in mind: 143 | - The number of correct answer-options for each question is not pre-specified. This removes the over-reliance of current approaches on answer-options and forces them to decide on the correctness of each candidate answer independently of others. In other words, unlike previous work, the task here is not to simply identify the best answer-option, but to evaluate the correctness of each answer-option individually. 144 | - The correct answer(s) is not required to be a span in the text. 145 | - The paragraphs in the dataset have diverse provenance, being extracted from 7 different domains such as news, fiction and historical text, and hence are expected to be more diverse in their contents compared to single-domain datasets. 146 | 147 | The leaderboard for the dataset is available on the [MultiRC website](http://cogcomp.org/multirc/). 148 | 149 | ### NewsQA 150 | 151 | The [NewsQA dataset](https://arxiv.org/pdf/1611.09830.pdf) is a reading comprehension dataset of over 100,000 152 | human-generated question-answer pairs from over 10,000 news articles from CNN, with answers consisting of spans of text 153 | from the corresponding articles. 154 | Some challenging characteristics of this dataset are: 155 | - Answers are spans of arbitrary length; 156 | - Some questions have no answer in the corresponding article; 157 | - There are no candidate answers from which to choose. 158 | Although very similar to the SQuAD dataset, NewsQA offered a greater challenge to existing models at the time of its 159 | introduction (e.g. the paragraphs are longer than those in SQuAD). Models are evaluated based on F1 and Exact Match. 160 | 161 | Example: 162 | 163 | | Story | Question | Answer | 164 | | ------------- | -----:| -----: | 165 | | MOSCOW, Russia (CNN) -- Russian space officials say the crew of the Soyuz space ship is resting after a rough ride back to Earth. A South Korean bioengineer was one of three people on board the Soyuz capsule. The craft carrying South Korea's first astronaut landed in northern Kazakhstan on Saturday, 260 miles (418 kilometers) off its mark, they said. Mission Control spokesman Valery Lyndin said the condition of the crew -- South Korean bioengineer Yi So-yeon, American astronaut Peggy Whitson and Russian flight engineer Yuri Malenchenko -- was satisfactory, though the three had been subjected to severe G-forces during the re-entry. [...] | Where did the Soyuz capsule land? | northern Kazakhstan | 166 | 167 | The dataset can be downloaded [here](https://github.com/Maluuba/newsqa).
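Several of the reading comprehension results below, starting with the NewsQA table that follows, are reported as span-level Exact Match and F1. As a rough sketch of how these metrics are usually computed, the snippet below mirrors the SQuAD-style evaluation logic in simplified form; the normalization is deliberately minimal and the example strings are illustrative, so treat it as an approximation of, not a substitute for, the official evaluation scripts.

```python
import re
from collections import Counter

def normalize(text):
    """Lowercase, drop articles and punctuation (simplified SQuAD-style normalization)."""
    text = re.sub(r"\b(a|an|the)\b", " ", text.lower())
    text = re.sub(r"[^\w\s]", "", text)
    return " ".join(text.split())

def exact_match(prediction, reference):
    """1.0 if and only if the normalized strings are identical."""
    return float(normalize(prediction) == normalize(reference))

def token_f1(prediction, reference):
    """Harmonic mean of token-level precision and recall over the answer span."""
    pred, ref = normalize(prediction).split(), normalize(reference).split()
    overlap = sum((Counter(pred) & Counter(ref)).values())
    if overlap == 0:
        return 0.0
    precision, recall = overlap / len(pred), overlap / len(ref)
    return 2 * precision * recall / (precision + recall)

print(exact_match("in northern Kazakhstan", "northern Kazakhstan"))         # 0.0
print(round(token_f1("in northern Kazakhstan", "northern Kazakhstan"), 2))  # 0.8
```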
168 | 169 | | Model | F1 | EM | Paper / Source | 170 | | ------------- | :-----: | :-----: | --- | 171 | | DecaProp (Tay et al., 2018) | 66.3 | 53.1 | [Densely Connected Attention Propagation for Reading Comprehension](https://arxiv.org/abs/1811.04210) | 172 | | AMANDA (Kundu et al., 2018) | 63.7 | 48.4 | [A Question-Focused Multi-Factor Attention Network for Question Answering](https://arxiv.org/abs/1801.08290) | 173 | | MINIMAL(Dyn) (Min et al., 2018) | 63.2 | 50.1 | [Efficient and Robust Question Answering from Minimal Context over Documents](https://arxiv.org/abs/1805.08092) | 174 | | FastQAExt (Weissenborn et al., 2017) | 56.1 | 43.7 | [Making Neural QA as Simple as Possible but not Simpler](https://arxiv.org/abs/1703.04816) | 175 | 176 | ### QAngaroo 177 | 178 | [QAngaroo](http://qangaroo.cs.ucl.ac.uk/index.html) is a set of two reading comprehension datasets, 179 | which require multiple steps of inference that combine facts from multiple documents. The first dataset, WikiHop, 180 | is open-domain and focuses on Wikipedia articles. The second dataset, MedHop, is based on paper abstracts from 181 | PubMed. 182 | 183 | The leaderboards for both datasets are available on the [QAngaroo website](http://qangaroo.cs.ucl.ac.uk/leaderboard.html). 184 | 185 | ### QuAC 186 | 187 | Question Answering in Context (QuAC) is a dataset for modeling, understanding, and participating in information-seeking dialog. 188 | Data instances consist of an interactive dialog between two crowd workers: 189 | (1) a student who poses a sequence of freeform questions to learn as much as possible about a hidden Wikipedia text, 190 | and (2) a teacher who answers the questions by providing short excerpts (spans) from the text. 191 | 192 | The leaderboard and data are available on the [QuAC website](http://quac.ai/). 193 | 194 | ### RACE 195 | 196 | The [RACE dataset](https://arxiv.org/abs/1704.04683) is a reading comprehension dataset 197 | collected from English examinations in China, which are designed for middle school and high school students. 198 | The dataset contains more than 28,000 passages and nearly 100,000 questions and can be 199 | downloaded [here](http://www.cs.cmu.edu/~glai1/data/race/). Models are evaluated based on accuracy 200 | on middle school examinations (RACE-m), high school examinations (RACE-h), and on the total dataset (RACE). 201 | 202 | The public leaderboard is available on the [RACE leaderboard](http://www.qizhexie.com//data/RACE_leaderboard).
203 | 204 | | Model | RACE-m | RACE-h | RACE | Paper | Code | 205 | | ------------- | :-----:| :-----:| :-----:| --- | --- | 206 | | XLNet (Yang et al., 2019) | 85.45 | 80.21 | 81.75 | [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/pdf/1906.08237.pdf) | [Official](https://github.com/zihangdai/xlnet/) | 207 | | OCN_large (Ran et al., 2019) | 76.7 | 69.6 | 71.7 | [Option Comparison Network for Multiple-choice Reading Comprehension](https://arxiv.org/pdf/1903.03033.pdf) | | 208 | | DCMN_large (Zhang et al., 2019) | 73.4 | 68.1 | 69.7 | [Dual Co-Matching Network for Multi-choice Reading Comprehension](https://arxiv.org/pdf/1901.09381.pdf) | | 209 | | Finetuned Transformer LM (Radford et al., 2018) | 62.9 | 57.4 | 59.0 | [Improving Language Understanding by Generative Pre-Training](https://s3-us-west-2.amazonaws.com/openai-assets/research-covers/language-unsupervised/language_understanding_paper.pdf) | [Official](https://github.com/openai/finetune-transformer-lm) | 210 | | BiAttention MRU (Tay et al., 2018) | 60.2 | 50.3 | 53.3 | [Multi-range Reasoning for Machine Comprehension](https://arxiv.org/abs/1803.09074) | | 211 | 212 | ### SQuAD 213 | 214 | The [Stanford Question Answering Dataset (SQuAD)](https://arxiv.org/abs/1606.05250) 215 | is a reading comprehension dataset, consisting of questions posed by crowdworkers 216 | on a set of Wikipedia articles. The answer to every question is a segment of text (a span) 217 | from the corresponding reading passage. Recently, [SQuAD 2.0](https://arxiv.org/abs/1806.03822) 218 | has been released, which includes unanswerable questions. 219 | 220 | The public leaderboard is available on the [SQuAD website](https://rajpurkar.github.io/SQuAD-explorer/). 221 | 222 | ### Story Cloze Test 223 | 224 | The [Story Cloze Test](http://aclweb.org/anthology/W17-0906.pdf) is a dataset for 225 | story understanding that provides systems with four-sentence stories and two possible 226 | endings. The systems must then choose the correct ending to the story. 227 | 228 | More details are available on the [Story Cloze Test Challenge](https://competitions.codalab.org/competitions/15333). 229 | 230 | | Model | Accuracy | Paper / Source | Code | 231 | | ------------- | :-----:| --- | --- | 232 | | Reading Strategies Model (Sun et al., 2018) | 88.3 | [Improving Machine Reading Comprehension by General Reading Strategies](https://arxiv.org/pdf/1810.13441v1.pdf) | 233 | | Finetuned Transformer LM (Radford et al., 2018) | 86.5 | [Improving Language Understanding by Generative Pre-Training](https://s3-us-west-2.amazonaws.com/openai-assets/research-covers/language-unsupervised/language_understanding_paper.pdf) | [Official](https://github.com/openai/finetune-transformer-lm) 234 | | Liu et al. (2018) | 78.7 | [Narrative Modeling with Memory Chains and Semantic Supervision](http://aclweb.org/anthology/P18-2045) | [Official](https://github.com/liufly/narrative-modeling) 235 | | Hidden Coherence Model (Chaturvedi et al., 2017) | 77.6 | [Story Comprehension for Predicting What Happens Next](http://aclweb.org/anthology/D17-1168) | 236 | | val-LS-skip (Srinivasan et al., 2018) | 76.5 | [A Simple and Effective Approach to the Story Cloze Test](http://aclweb.org/anthology/N18-2015) | 237 | 238 | ### SWAG 239 | 240 | [SWAG](https://arxiv.org/abs/1808.05326) (Situations With Adversarial Generations) is a large-scale dataset for the task of grounded commonsense inference, unifying natural language inference and physically grounded reasoning. 
The dataset consists of 113k multiple choice questions about grounded situations. Each question is a video caption from LSMDC or ActivityNet Captions, with four answer choices about what might happen next in the scene. The correct answer is the (real) video caption for the next event in the video; the three incorrect answers are adversarially generated and human verified, so as to fool machines but not humans. 241 | 242 | The public leaderboard is available on the [AI2 website](https://leaderboard.allenai.org/swag/submissions/public). 243 | 244 | ### RecipeQA 245 | 246 | [RecipeQA](https://arxiv.org/abs/1809.00812) is a dataset for multimodal comprehension of cooking recipes. It consists of over 36K question-answer pairs automatically generated from approximately 20K unique recipes with step-by-step instructions and images. Each question in RecipeQA involves multiple modalities such as titles, descriptions or images, and working towards an answer requires (i) joint understanding of images and text, (ii) capturing the temporal flow of events, and (iii) making sense of procedural knowledge. 247 | 248 | The public leaderboard is available on the [RecipeQA website](https://hucvl.github.io/recipeqa/). 249 | 250 | 251 | 252 | ### NarrativeQA 253 | [NarrativeQA](https://arxiv.org/abs/1712.07040) is a dataset built to encourage deeper comprehension of language. It involves reasoning over entire books or movie scripts and contains approximately 45K question-answer pairs in free-form text. There are two modes of this dataset: (1) reading comprehension over summaries and (2) reading comprehension over entire books/scripts. 254 | 255 | | Model | BLEU-1 | BLEU-4 | METEOR | Rouge-L | Paper / Source | Code | 256 | | ------------- | :-----: | :-----:|:-----:| :-----:|--- | --- | 257 | |DecaProp (Tay et al., 2018) |44.35 |27.61 | 21.80 | 44.69 |[Densely Connected Attention Propagation for Reading Comprehension](https://arxiv.org/abs/1811.04210) | [official](https://github.com/vanzytay/NIPS2018_DECAPROP) | 258 | |BiAttention + DCU-LSTM (Tay et al., 2018) |36.55 |19.79 | 17.87 | 41.44 |[Multi-Granular Sequence Encoding via Dilated Compositional Units for Reading Comprehension](http://aclweb.org/anthology/D18-1238) | | 259 | |BiDAF (Seo et al., 2017) |33.45 |15.69 | 15.68 | 36.74 |[Bidirectional Attention Flow for Machine Comprehension](https://arxiv.org/abs/1611.01603) | | 260 | 261 | *Note that the above is for the Summary setting. There are no official published results for reading over entire books/stories except for the original paper. 262 | 263 | ### DuoRC 264 | 265 | [DuoRC](https://duorc.github.io) contains 186,089 unique question-answer pairs created from a collection of 7680 pairs of movie plots, where each pair in the collection reflects two versions of the same movie. 266 | 267 | DuoRC pushes the NLP community to address challenges in incorporating knowledge and reasoning in neural architectures for reading comprehension. It poses several interesting challenges, such as: 268 | - By using parallel plots, DuoRC is especially designed to contain a large number of questions with low lexical overlap between questions and their corresponding passages 269 | - It requires models to go beyond the content of the given passage itself and incorporate world knowledge, background knowledge, and common-sense knowledge to arrive at the answer 270 | - It revolves around narrative passages from movie plots describing complex events and therefore naturally requires complex reasoning (e.g.
temporal reasoning, entailment, long-distance anaphora, etc.) across multiple sentences to infer the answer to questions 271 | - Several of the questions in DuoRC, while seeming relevant, cannot actually be answered from the given passage. This requires the model to detect the unanswerability of questions, an ability that is particularly important in industrial settings. 272 | 273 | ### DROP 274 | 275 | [DROP](https://allennlp.org/drop) is a crowdsourced, adversarially-created, 96k-question benchmark, in which a system must resolve references in a question, perhaps to multiple input positions, and perform discrete operations over them (such as addition, counting, or sorting). These operations require a much more comprehensive understanding of the content of paragraphs than what was necessary for prior datasets. 276 | 277 | ### Cosmos QA 278 | 279 | [Cosmos QA](https://wilburone.github.io/cosmos/) is a large-scale dataset of 35.6K problems that require commonsense-based reading comprehension, formulated as multiple-choice questions. It focuses on reading between the lines over a diverse collection of people's everyday narratives, asking questions concerning the likely causes or effects of events that require reasoning beyond the exact text spans in the context. 280 | 281 | ## Open-domain Question Answering 282 | 283 | ### DuReader 284 | [DuReader](https://ai.baidu.com/broad/subordinate?dataset=dureader) is a large-scale, open-domain Chinese machine reading comprehension (MRC) dataset, designed to address real-world MRC. [Link to paper](https://arxiv.org/pdf/1711.05073.pdf) 285 | 286 | DuReader has three advantages over other MRC datasets: 287 | - (1) data sources: questions and documents are based on Baidu Search and Baidu Zhidao; answers are manually generated. 288 | - (2) question types: it provides rich annotations for more question types, especially yes-no and opinion questions, which leaves more opportunity for the research community. 289 | - (3) scale: it contains 300K questions, 660K answers and 1.5M documents; it is the largest Chinese MRC dataset so far. 290 | 291 | To support work on these challenges, both the [dataset](https://ai.baidu.com/broad/download?dataset=dureader) of DuReader and [baseline systems](https://github.com/baidu/DuReader) have been posted online. 292 | 293 | The [leaderboard](https://ai.baidu.com/broad/leaderboard?dataset=dureader) is available on the DuReader page. 294 | 295 | ### Quasar 296 | [Quasar](https://arxiv.org/abs/1707.03904) is a dataset for open-domain question answering. It includes two parts: (1) The Quasar-S dataset consists of 37,000 cloze-style queries constructed from definitions of software entity tags on the popular website Stack Overflow. (2) The Quasar-T dataset consists of 43,000 open-domain trivia questions and their answers obtained from various internet sources. 297 | 298 | | Model | EM (Quasar-T) | F1 (Quasar-T) |Paper / Source | Code | 299 | | ------------- | :-----:| :-----:|--- | --- | 300 | |Denoising QA (Lin et al.,
2018)|42.2 |49.3 |[Denoising Distantly Supervised Open-Domain Question Answering](http://aclweb.org/anthology/P18-1161)|[official](https://github.com/thunlp/OpenQA)| 301 | |DecaProp (Tay et al., 2018) |38.6 |46.9 |[Densely Connected Attention Propagation for Reading Comprehension](https://arxiv.org/abs/1811.04210)|[official](https://github.com/vanzytay/NIPS2018_DECAPROP)| 302 | |R^3 (Wang et al., 2018) |35.3 |41.7 |[R^3: Reinforced Ranker-Reader for Open-Domain Question Answering](https://aaai.org/ocs/index.php/AAAI/AAAI18/paper/view/16712/16165)|[official](https://github.com/shuohangwang/mprc)| 303 | |BiDAF (Seo et al., 2017) |25.9 |28.5 |[Bidirectional Attention Flow for Machine Comprehension](https://arxiv.org/abs/1611.01603) | [official](https://github.com/allenai/bi-att-flow)| 304 | |GA (Dhingra et al., 2017) |26.4 |26.4 |[Gated-Attention Readers for Text Comprehension](https://arxiv.org/pdf/1606.01549) | | 305 | 306 | 307 | 308 | ### SearchQA 309 | [SearchQA](https://arxiv.org/abs/1704.05179) was constructed to reflect a full pipeline of general question-answering. SearchQA consists of more than 140k question-answer pairs, with each pair having 49.6 snippets on average. Each question-answer-context tuple of SearchQA comes with additional metadata such as the snippet's URL. 310 | 311 | | Model | Unigram Acc | N-gram F1 | EM | F1 |Paper / Source | Code | 312 | | ------------- | :-----:| :-----:| :-----:| :-----:|--- | --- | 313 | |DecaProp (Tay et al., 2018) |62.2 |70.8 |56.8 |63.6 |[Densely Connected Attention Propagation for Reading Comprehension](https://arxiv.org/abs/1811.04210) | [official](https://github.com/vanzytay/NIPS2018_DECAPROP) | 314 | |Denoising QA (Lin et al., 2018)| - |- | 58.8| 64.5|[Denoising Distantly Supervised Open-Domain Question Answering](http://aclweb.org/anthology/P18-1161)|[official](https://github.com/thunlp/OpenQA)| 315 | |R^3 (Wang et al., 2018) |- |- | 49.0| 55.3 |[R^3: Reinforced Ranker-Reader for Open-Domain Question Answering](https://aaai.org/ocs/index.php/AAAI/AAAI18/paper/view/16712/16165)|[official](https://github.com/shuohangwang/mprc)| 316 | |Bi-Attention + DCU-LSTM (Tay et al., 2018) |49.4 |59.5 |- |- |[Multi-Granular Sequence Encoding via Dilated Compositional Units for Reading Comprehension](http://aclweb.org/anthology/D18-1238) | | 317 | |AMANDA (Kundu et al., 2018) |46.8 |56.6 |- |- |[A Question-Focused Multi-Factor Attention Network for Question Answering](https://arxiv.org/abs/1801.08290) | [official](https://github.com/nusnlp/amanda)| 318 | |Focused Hierarchical RNN (Ke et al., 2018) |46.8 |53.4 |- |- |[Focused Hierarchical RNNs for Conditional Sequence Processing](http://proceedings.mlr.press/v80/ke18a/ke18a.pdf)|| 319 | |ASR (Kadlec et al., 2016) |41.3 |22.8 |- |- |[Text Understanding with the Attention Sum Reader Network](https://arxiv.org/abs/1603.01547)| 320 | 321 | ## Knowledge Base Question Answering 322 | 323 | Knowledge Base Question Answering is the task of answering natural language questions based on a knowledge base/knowledge graph such as [DBpedia](https://wiki.dbpedia.org/) or [Wikidata](https://www.wikidata.org/). 324 | 325 | ### QALD-9 326 | [QALD-9](http://ceur-ws.org/Vol-2241/paper-06.pdf) is a manually curated superset of the previous eight editions of the [Question Answering over Linked Data (QALD) challenge](http://2018.nliwod.org/challenge), published in 2018. It is constructed by human experts to cover a wide range of natural-language-to-SPARQL conversions based on the DBpedia 2016-10 knowledge base.
Each question-answer-pair has additional meta-data. QALD-9 is best evaluated using the [GERBIL QA platform](http://gerbil-qa.aksw.org/gerbil/config) for repeatability of the evaluation numbers. 327 | 328 | | Annotator | Macro P | Macro R | Macro F1 | Error Count | Average Time/Doc ms | Macro F1 QALD | Paper (including links to webservices/source code)| 329 | |------------------------|:-------:|:-------:|:--------:|:-----------:|:-------------------:|:-------------:|----------------------| 330 | | Elon (WS) | 0.049 | 0.053 | 0.050 | 2 | 219 | 0.100 || 331 | | QASystem (WS) | 0.097 | 0.116 | 0.098 | 0 | 1014 | 0.200 || 332 | | TeBaQA (WS) | 0.129 | 0.134 | 0.130 | 0 | 2668 | 0.222 || 333 | | wdaqua-core1 (DBpedia) | 0.261 | 0.267 | 0.250 | 0 | 661 | 0.289 | Diefenbach, Dennis, Kamal Singh, and Pierre Maret. "Wdaqua-core1: a question answering service for rdf knowledge bases." Companion of the The Web Conference 2018 on The Web Conference 2018. International World Wide Web Conferences Steering Committee, 2018. | 334 | | gAnswer (WS) | 0.293 | 0.327 | 0.298 | 1 | 3076 | 0.430 | Zou, Lei, et al. "Natural language question answering over RDF: a graph data driven approach." Proceedings of the 2014 ACM SIGMOD international conference on Management of data. ACM, 2014.| 335 | 336 | [Go back to the README](../README.md) 337 | -------------------------------------------------------------------------------- /recurrent neural networks/Bidirectional_LSTM_Test.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Bidirectional LSTM Test.ipynb", 7 | "provenance": [], 8 | "authorship_tag": "ABX9TyPCTxeN2/bd2zBIr3yuXUzK", 9 | "include_colab_link": true 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | "accelerator": "GPU" 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "markdown", 20 | "metadata": { 21 | "id": "view-in-github", 22 | "colab_type": "text" 23 | }, 24 | "source": [ 25 | "\"Open" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "metadata": { 31 | "id": "iqr4gSb_riUO", 32 | "colab_type": "code", 33 | "colab": {} 34 | }, 35 | "source": [ 36 | "from __future__ import print_function, division\n", 37 | "from builtins import range, input" 38 | ], 39 | "execution_count": 0, 40 | "outputs": [] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "metadata": { 45 | "id": "XfbcaXd5rtRp", 46 | "colab_type": "code", 47 | "colab": { 48 | "base_uri": "https://localhost:8080/", 49 | "height": 34 50 | }, 51 | "outputId": "8e17b677-8815-4c32-e8e4-95bae761149e" 52 | }, 53 | "source": [ 54 | "%tensorflow_version 2.x\n", 55 | "import tensorflow as tf\n", 56 | "from tensorflow.keras.models import Model \n", 57 | "from tensorflow.keras.layers import Input, LSTM, GRU, Bidirectional\n", 58 | "import numpy as np \n", 59 | "import matplotlib.pyplot as plt \n", 60 | "%matplotlib inline " 61 | ], 62 | "execution_count": 2, 63 | "outputs": [ 64 | { 65 | "output_type": "stream", 66 | "text": [ 67 | "TensorFlow is already loaded. 
Please restart the runtime to change versions.\n" 68 | ], 69 | "name": "stdout" 70 | } 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "metadata": { 76 | "id": "LD1TMgXAsGXP", 77 | "colab_type": "code", 78 | "colab": {} 79 | }, 80 | "source": [ 81 | "T = 8\n", 82 | "D = 2\n", 83 | "M = 3" 84 | ], 85 | "execution_count": 0, 86 | "outputs": [] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "metadata": { 91 | "id": "aEwTA3oJsJoj", 92 | "colab_type": "code", 93 | "colab": {} 94 | }, 95 | "source": [ 96 | "X = np.random.randn(1,T,D)" 97 | ], 98 | "execution_count": 0, 99 | "outputs": [] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "metadata": { 104 | "id": "S7gaNF2rsMgC", 105 | "colab_type": "code", 106 | "colab": { 107 | "base_uri": "https://localhost:8080/", 108 | "height": 250 109 | }, 110 | "outputId": "0b0fed52-9af2-4ba0-c1cf-f7715732d4a4" 111 | }, 112 | "source": [ 113 | "input_ = Input(shape=(T,D))\n", 114 | "rnn = Bidirectional(LSTM(M, return_state=True, return_sequences=True))\n", 115 | "x = rnn(input_)" 116 | ], 117 | "execution_count": 5, 118 | "outputs": [ 119 | { 120 | "output_type": "stream", 121 | "text": [ 122 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/init_ops.py:97: calling GlorotUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", 123 | "Instructions for updating:\n", 124 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n", 125 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/init_ops.py:97: calling Orthogonal.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", 126 | "Instructions for updating:\n", 127 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n", 128 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/init_ops.py:97: calling Zeros.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n", 129 | "Instructions for updating:\n", 130 | "Call initializer instance with the dtype argument instead of passing it to the constructor\n", 131 | "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/resource_variable_ops.py:1630: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.\n", 132 | "Instructions for updating:\n", 133 | "If using Keras pass *_constraint arguments to layers.\n" 134 | ], 135 | "name": "stdout" 136 | } 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "metadata": { 142 | "id": "EWOy4pNssZoU", 143 | "colab_type": "code", 144 | "colab": {} 145 | }, 146 | "source": [ 147 | "model = Model(input_, x)\n", 148 | "o, h1, c1, h2, c2 = model.predict(X)" 149 | ], 150 | "execution_count": 0, 151 | "outputs": [] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "metadata": { 156 | "id": "RJ3RYIsIslKp", 157 | "colab_type": "code", 158 | "colab": { 159 | "base_uri": "https://localhost:8080/", 160 | "height": 390 161 | }, 162 | "outputId": "01ce4f4c-054a-41be-dbc1-d56ab9aea9f8" 163 | }, 164 | "source": [ 165 | "print(\"o:\",o)\n", 166 | "print(\"o.shape:\",o.shape)\n", 167 | "print(\"h1:\",h1)\n", 168 | "print(\"c1:\",c1)\n", 169 | "print(\"h2:\",h2)\n", 170 | "print(\"c2:\",c2)" 171 | ], 172 | 
"execution_count": 11, 173 | "outputs": [ 174 | { 175 | "output_type": "stream", 176 | "text": [ 177 | "o: [[[ 0.11253004 -0.03092608 -0.06630566 -0.39619136 0.24818416\n", 178 | " -0.08490453]\n", 179 | " [-0.07517125 0.01502566 0.01917559 -0.22268571 0.14949295\n", 180 | " -0.03665679]\n", 181 | " [ 0.01815493 0.01556195 -0.15813972 -0.43729374 0.29446658\n", 182 | " -0.13612342]\n", 183 | " [ 0.23609129 -0.0448236 0.01032039 -0.39598283 0.2849627\n", 184 | " -0.285201 ]\n", 185 | " [ 0.27708757 -0.07811401 -0.00345694 -0.17063354 0.20096005\n", 186 | " -0.18531086]\n", 187 | " [ 0.2189869 -0.13215548 0.07653959 0.08197068 0.0867257\n", 188 | " -0.11753946]\n", 189 | " [ 0.08296373 -0.03460078 -0.06036649 0.07887851 0.05881318\n", 190 | " -0.06289996]\n", 191 | " [ 0.1126808 -0.21593112 0.11169305 0.15077989 0.0751698\n", 192 | " -0.24802327]]]\n", 193 | "o.shape: (1, 8, 6)\n", 194 | "h1: [[ 0.1126808 -0.21593112 0.11169305]]\n", 195 | "c1: [[ 0.15140209 -0.45981696 0.49486363]]\n", 196 | "h2: [[-0.39619136 0.24818416 -0.08490453]]\n", 197 | "c2: [[-0.87975156 0.48920622 -0.16847518]]\n" 198 | ], 199 | "name": "stdout" 200 | } 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "metadata": { 206 | "id": "PoBnQMJXtA3Q", 207 | "colab_type": "code", 208 | "colab": { 209 | "base_uri": "https://localhost:8080/", 210 | "height": 123 211 | }, 212 | "outputId": "70e9c389-d6b7-49b1-a1fc-0236e925d8c2" 213 | }, 214 | "source": [ 215 | "input_ = Input(shape=(T,D))\n", 216 | "rnn = Bidirectional(LSTM(M, return_state=True, return_sequences=False))\n", 217 | "x = rnn(input_)\n", 218 | "model = Model(input_, x)\n", 219 | "o, h1, c1, h2, c2 = model.predict(X)\n", 220 | "print(\"o:\",o)\n", 221 | "print(\"o.shape:\",o.shape)\n", 222 | "print(\"h1:\",h1)\n", 223 | "print(\"c1:\",c1)\n", 224 | "print(\"h2:\",h2)\n", 225 | "print(\"c2:\",c2)" 226 | ], 227 | "execution_count": 12, 228 | "outputs": [ 229 | { 230 | "output_type": "stream", 231 | "text": [ 232 | "o: [[ 0.14969218 -0.19363384 0.01567384 0.0152386 0.15255322 -0.04848756]]\n", 233 | "o.shape: (1, 6)\n", 234 | "h1: [[ 0.14969218 -0.19363384 0.01567384]]\n", 235 | "c1: [[ 0.7560293 -0.4260858 0.02213971]]\n", 236 | "h2: [[ 0.0152386 0.15255322 -0.04848756]]\n", 237 | "c2: [[ 0.02924997 0.32409564 -0.10173343]]\n" 238 | ], 239 | "name": "stdout" 240 | } 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "metadata": { 246 | "id": "jpjfkE1vt25w", 247 | "colab_type": "code", 248 | "colab": {} 249 | }, 250 | "source": [ 251 | "" 252 | ], 253 | "execution_count": 0, 254 | "outputs": [] 255 | } 256 | ] 257 | } -------------------------------------------------------------------------------- /recurrent neural networks/Simple_RNN_Test_(Return_State_vs_Return_Sequences).ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Simple RNN Test (Return State vs Return Sequences).ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [], 9 | "authorship_tag": "ABX9TyNpU2OnnlAo9mhAV9CCPiis", 10 | "include_colab_link": true 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | }, 16 | "accelerator": "GPU" 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "metadata": { 32 | "id": "62Fy2Eq75v0L", 33 | "colab_type": 
"code", 34 | "colab": {} 35 | }, 36 | "source": [ 37 | "from __future__ import print_function, division\n", 38 | "from builtins import range, input\n", 39 | "\n", 40 | "from keras.models import Model\n", 41 | "from keras.layers import Input, LSTM, GRU, RNN\n", 42 | "import numpy as np\n", 43 | "import matplotlib.pyplot as plt\n", 44 | "%matplotlib inline\n", 45 | "\n", 46 | "T = 8\n", 47 | "D = 2\n", 48 | "M = 3" 49 | ], 50 | "execution_count": 0, 51 | "outputs": [] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "metadata": { 56 | "id": "y4PXtF8t6E0p", 57 | "colab_type": "code", 58 | "colab": {} 59 | }, 60 | "source": [ 61 | "X = np.random.randn(1,T, D)\n", 62 | "def lstm1():\n", 63 | " input_ = Input(shape=(T, D))\n", 64 | " rnn = LSTM(M, return_state=True)\n", 65 | " x = rnn(input_)\n", 66 | " model = Model(inputs=input_, outputs=x)\n", 67 | " o, h, c = model.predict(X)\n", 68 | " print(\"o:\",o)\n", 69 | " print(\"h:\",h)\n", 70 | " print(\"c:\",c)\n", 71 | "\n", 72 | "def lstm2():\n", 73 | " input_ = Input(shape=(T, D))\n", 74 | " rnn = LSTM(M, return_state=True,return_sequences=True)\n", 75 | " x = rnn(input_)\n", 76 | "\n", 77 | " model = Model(inputs=input_, outputs=x)\n", 78 | " o, h, c = model.predict(X)\n", 79 | " print(\"o:\",o)\n", 80 | " print(\"h:\",h)\n", 81 | " print(\"c:\",c)" 82 | ], 83 | "execution_count": 0, 84 | "outputs": [] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "metadata": { 89 | "id": "snXcGm-k7AnF", 90 | "colab_type": "code", 91 | "colab": {} 92 | }, 93 | "source": [ 94 | "def gru1():\n", 95 | " input_ = Input(shape=(T,D))\n", 96 | " rnn = GRU(M, return_state=True)\n", 97 | " x = rnn(input_)\n", 98 | " model = Model(inputs=input_, outputs=x)\n", 99 | " o, h = model.predict(X)\n", 100 | " print(\"o:\",o)\n", 101 | " print(\"h:\",h)\n", 102 | "\n", 103 | "def gru2():\n", 104 | " input_ = Input(shape=(T,D))\n", 105 | " rnn = GRU(M, return_state=True, return_sequences=True)\n", 106 | " x = rnn(input_)\n", 107 | "\n", 108 | " model = Model(inputs=input_, outputs=x)\n", 109 | " o, h = model.predict(X)\n", 110 | " print(\"o:\",o)\n", 111 | " print(\"h:\",h)" 112 | ], 113 | "execution_count": 0, 114 | "outputs": [] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "metadata": { 119 | "id": "NjnXD1MS7UPQ", 120 | "colab_type": "code", 121 | "outputId": "ce276214-5f99-47ee-ab7c-10b3529129da", 122 | "colab": { 123 | "base_uri": "https://localhost:8080/", 124 | "height": 514 125 | } 126 | }, 127 | "source": [ 128 | "print(\"lstm1:\")\n", 129 | "lstm1()\n", 130 | "\n", 131 | "print(\"lstm2:\")\n", 132 | "lstm2()\n", 133 | "\n", 134 | "print(\"gru1:\")\n", 135 | "gru1()\n", 136 | "\n", 137 | "print(\"gru2:\")\n", 138 | "gru2()" 139 | ], 140 | "execution_count": 0, 141 | "outputs": [ 142 | { 143 | "output_type": "stream", 144 | "text": [ 145 | "lstm1:\n", 146 | "o: [[0.04405436 0.19382982 0.09365804]]\n", 147 | "h: [[0.04405436 0.19382982 0.09365804]]\n", 148 | "c: [[0.07515235 0.74266243 0.2020666 ]]\n", 149 | "lstm2:\n", 150 | "o: [[[-0.13985486 0.10550682 -0.09445278]\n", 151 | " [-0.10432478 -0.27001908 -0.00554497]\n", 152 | " [-0.08823357 -0.29882026 0.02738047]\n", 153 | " [-0.00888742 -0.27147183 0.08124414]\n", 154 | " [-0.0464495 -0.15506767 0.01873142]\n", 155 | " [ 0.0823176 -0.07812153 0.0788352 ]\n", 156 | " [ 0.17786328 0.02429481 0.08474788]\n", 157 | " [ 0.19463679 -0.14741603 0.18657811]]]\n", 158 | "h: [[ 0.19463679 -0.14741603 0.18657811]]\n", 159 | "c: [[ 0.45711023 -0.33413443 0.36481676]]\n", 160 | "gru1:\n", 161 | "o: [[-0.04083507 -0.62333137 
-0.669806 ]]\n", 162 | "h: [[-0.04083507 -0.62333137 -0.669806 ]]\n", 163 | "gru2:\n", 164 | "o: [[[-0.00711141 -0.32353368 -0.2206695 ]\n", 165 | " [-0.21819714 -0.14739999 -0.46059692]\n", 166 | " [-0.22654344 -0.08817856 -0.48051834]\n", 167 | " [-0.18662423 0.01923173 -0.15814897]\n", 168 | " [-0.10439495 -0.14305443 -0.19330898]\n", 169 | " [-0.02846893 0.07500637 0.4005316 ]\n", 170 | " [ 0.11978745 0.22520865 0.6934356 ]\n", 171 | " [ 0.15229619 0.3807733 0.37033397]]]\n", 172 | "h: [[0.15229619 0.3807733 0.37033397]]\n" 173 | ], 174 | "name": "stdout" 175 | } 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": { 181 | "id": "3ItHViet73Ll", 182 | "colab_type": "text" 183 | }, 184 | "source": [ 185 | "## LSTM (Long Short Term Memory)\n", 186 | "\n", 187 | "#### 1. Return State\n", 188 | "\n", 189 | "For the first LSTM, i.e. LSTM 1, the output is the same as the hidden state.\n", 190 | "\n", 191 | " lstm1:\n", 192 | " o: [[0.04405436 0.19382982 0.09365804]]\n", 193 | " h: [[0.04405436 0.19382982 0.09365804]]\n", 194 | " c: [[0.07515235 0.74266243 0.2020666 ]]\n", 195 | "\n", 196 | "#### 2. Return Sequences\n", 197 | "\n", 198 | "For the second LSTM, i.e. LSTM 2, the output is longer, i.e. 8*3, since the length of the sequence is 8. However, the last row of the output sequence is the same as the hidden state.\n", 199 | "\n", 200 | " lstm2:\n", 201 | " o: [[[-0.13985486 0.10550682 -0.09445278]\n", 202 | " [-0.10432478 -0.27001908 -0.00554497]\n", 203 | " [-0.08823357 -0.29882026 0.02738047]\n", 204 | " [-0.00888742 -0.27147183 0.08124414]\n", 205 | " [-0.0464495 -0.15506767 0.01873142]\n", 206 | " [ 0.0823176 -0.07812153 0.0788352 ]\n", 207 | " [ 0.17786328 0.02429481 0.08474788]\n", 208 | " [ 0.19463679 -0.14741603 0.18657811]]]\n", 209 | " h: [[ 0.19463679 -0.14741603 0.18657811]]\n", 210 | " c: [[ 0.45711023 -0.33413443 0.36481676]]\n", 211 | "\n", 212 | "Thus we can conclude that the last output vector and H are the same. In other words, H and C are the hidden state and the cell state of the final time step of the input in an LSTM.\n", 213 | "\n", 214 | "\n" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": { 220 | "id": "JhlEuhDp8zB5", 221 | "colab_type": "text" 222 | }, 223 | "source": [ 224 | "## GRU (Gated Recurrent Unit)\n", 225 | "\n", 226 | "The same holds for the GRU,
except that a GRU has no separate cell state." 227 | ] 228 | } 229 | ] 230 | } -------------------------------------------------------------------------------- /recurrent neural networks/Stacked_Long_Short_Term_Memory_Networks.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Stacked Long Short Term Memory Networks.ipynb", 7 | "provenance": [], 8 | "authorship_tag": "ABX9TyMkqji1TdYDEHZZZ/Y6TRJD", 9 | "include_colab_link": true 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | "accelerator": "GPU" 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "markdown", 20 | "metadata": { 21 | "id": "view-in-github", 22 | "colab_type": "text" 23 | }, 24 | "source": [ 25 | "\"Open" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "metadata": { 31 | "id": "1XJLbqTYqsx6", 32 | "colab_type": "code", 33 | "colab": { 34 | "base_uri": "https://localhost:8080/", 35 | "height": 34 36 | }, 37 | "outputId": "c267d959-4e40-4521-990f-d533fdffa198" 38 | }, 39 | "source": [ 40 | "%tensorflow_version 2.x\n", 41 | "from tensorflow.keras.models import Sequential\n", 42 | "from tensorflow.keras.layers import LSTM, Dense, Activation\n", 43 | "from numpy import array \n", 44 | "\n", 45 | "import numpy as np\n", 46 | "import pandas as pd \n", 47 | "import matplotlib.pyplot as plt\n", 48 | "%matplotlib inline" 49 | ], 50 | "execution_count": 2, 51 | "outputs": [ 52 | { 53 | "output_type": "stream", 54 | "text": [ 55 | "TensorFlow 2.x selected.\n" 56 | ], 57 | "name": "stdout" 58 | } 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "metadata": { 64 | "id": "h5wrIwiDrV6L", 65 | "colab_type": "code", 66 | "colab": { 67 | "base_uri": "https://localhost:8080/", 68 | "height": 52 69 | }, 70 | "outputId": "e9e44447-ff0e-48b5-9c65-0a24816b144a" 71 | }, 72 | "source": [ 73 | "# Each LSTM's memory cell requires a 3D input. When an LSTM processes one input sequence of time \n", 74 | "# steps, each memory cell will output a single value for the whole sequence as a 2D array.\n", 75 | "\n", 76 | "model = Sequential()\n", 77 | "model.add(LSTM(10, input_shape=(3,1)))\n", 78 | "model.compile(optimizer='adam', loss='mse')\n", 79 | "data = np.random.randn(1,3,1)\n", 80 | "print(model.predict(data))" 81 | ], 82 | "execution_count": 3, 83 | "outputs": [ 84 | { 85 | "output_type": "stream", 86 | "text": [ 87 | "[[ 0.05921615 -0.03010203 -0.00745623 -0.10550459 -0.08348058 -0.13135718\n", 88 | " 0.05771159 0.00856336 -0.02335096 0.01949931]]\n" 89 | ], 90 | "name": "stdout" 91 | } 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "metadata": { 97 | "id": "H62shZmNrqrz", 98 | "colab_type": "code", 99 | "colab": { 100 | "base_uri": "https://localhost:8080/", 101 | "height": 70 102 | }, 103 | "outputId": "729bea9c-88cb-4ada-c2a4-a73babbaf0ef" 104 | }, 105 | "source": [ 106 | "# To stack LSTM layers, we need to change the configuration of the prior LSTM layer to output a 3D array as input for the subsequent layer.\n", 107 | "# We can do this by setting the return_sequences argument on the layer to True (defaults to False).
\n", 108 | "# This will return one output for each input time step and provide a 3D array.\n", 109 | "# Below is the same example as above with return_sequences=True.\n", 110 | "model = Sequential()\n", 111 | "model.add(LSTM(1, return_sequences=True, input_shape=(3,1)))\n", 112 | "model.compile(optimizer='adam',loss='mse')\n", 113 | "data = np.random.randn(1,3,1)\n", 114 | "print(model.predict(data))" 115 | ], 116 | "execution_count": 4, 117 | "outputs": [ 118 | { 119 | "output_type": "stream", 120 | "text": [ 121 | "[[[-0.0309435 ]\n", 122 | " [ 0.07525866]\n", 123 | " [-0.10665452]]]\n" 124 | ], 125 | "name": "stdout" 126 | } 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "metadata": { 132 | "id": "DZZT2oKbsx4_", 133 | "colab_type": "code", 134 | "colab": {} 135 | }, 136 | "source": [ 137 | "model = Sequential()\n", 138 | "model.add(LSTM(32, return_sequences=True, input_shape=(1,1)))\n", 139 | "model.add(LSTM(16,return_sequences=True))\n", 140 | "model.add(LSTM(8))\n", 141 | "model.add(Dense(1, activation='sigmoid'))\n", 142 | "model.compile(optimizer=\"adam\", loss='binary_crossentropy',metrics=['accuracy'])" 143 | ], 144 | "execution_count": 0, 145 | "outputs": [] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "metadata": { 150 | "id": "chtUZfyouI1-", 151 | "colab_type": "code", 152 | "colab": {} 153 | }, 154 | "source": [ 155 | "data = np.expand_dims(np.expand_dims(np.array([np.random.randint(0,10000)*0.001*np.random.choice([-1,1]) for i in range(10000)]),axis=1),axis=2)\n", 156 | "y = np.array(data > 0).astype(np.int).ravel()" 157 | ], 158 | "execution_count": 0, 159 | "outputs": [] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "metadata": { 164 | "id": "y1BNbh6ICGYN", 165 | "colab_type": "code", 166 | "colab": { 167 | "base_uri": "https://localhost:8080/", 168 | "height": 407 169 | }, 170 | "outputId": "21597657-5d5c-4142-90b2-4c01afbbc8e3" 171 | }, 172 | "source": [ 173 | "model.fit(data, y, epochs=10, validation_split=0.2, verbose=1)" 174 | ], 175 | "execution_count": 40, 176 | "outputs": [ 177 | { 178 | "output_type": "stream", 179 | "text": [ 180 | "Train on 8000 samples, validate on 2000 samples\n", 181 | "Epoch 1/10\n", 182 | "8000/8000 [==============================] - 4s 550us/sample - loss: 0.3755 - accuracy: 0.9719 - val_loss: 0.0830 - val_accuracy: 0.9980\n", 183 | "Epoch 2/10\n", 184 | "8000/8000 [==============================] - 2s 198us/sample - loss: 0.0433 - accuracy: 0.9984 - val_loss: 0.0221 - val_accuracy: 1.0000\n", 185 | "Epoch 3/10\n", 186 | "8000/8000 [==============================] - 1s 183us/sample - loss: 0.0176 - accuracy: 0.9989 - val_loss: 0.0116 - val_accuracy: 1.0000\n", 187 | "Epoch 4/10\n", 188 | "8000/8000 [==============================] - 1s 180us/sample - loss: 0.0109 - accuracy: 0.9987 - val_loss: 0.0074 - val_accuracy: 1.0000\n", 189 | "Epoch 5/10\n", 190 | "8000/8000 [==============================] - 1s 181us/sample - loss: 0.0075 - accuracy: 0.9995 - val_loss: 0.0058 - val_accuracy: 0.9990\n", 191 | "Epoch 6/10\n", 192 | "8000/8000 [==============================] - 1s 176us/sample - loss: 0.0058 - accuracy: 0.9994 - val_loss: 0.0039 - val_accuracy: 1.0000\n", 193 | "Epoch 7/10\n", 194 | "8000/8000 [==============================] - 1s 182us/sample - loss: 0.0047 - accuracy: 0.9995 - val_loss: 0.0029 - val_accuracy: 1.0000\n", 195 | "Epoch 8/10\n", 196 | "8000/8000 [==============================] - 2s 192us/sample - loss: 0.0037 - accuracy: 0.9995 - val_loss: 0.0023 - val_accuracy: 1.0000\n", 197 | "Epoch 9/10\n", 198 | 
"8000/8000 [==============================] - 1s 182us/sample - loss: 0.0035 - accuracy: 0.9991 - val_loss: 0.0019 - val_accuracy: 1.0000\n", 199 | "Epoch 10/10\n", 200 | "8000/8000 [==============================] - 1s 182us/sample - loss: 0.0033 - accuracy: 0.9991 - val_loss: 0.0015 - val_accuracy: 1.0000\n" 201 | ], 202 | "name": "stdout" 203 | }, 204 | { 205 | "output_type": "execute_result", 206 | "data": { 207 | "text/plain": [ 208 | "" 209 | ] 210 | }, 211 | "metadata": { 212 | "tags": [] 213 | }, 214 | "execution_count": 40 215 | } 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "metadata": { 221 | "id": "nBEjxoybCQrT", 222 | "colab_type": "code", 223 | "colab": { 224 | "base_uri": "https://localhost:8080/", 225 | "height": 194 226 | }, 227 | "outputId": "6c914ff2-20cb-49f0-f5ae-962eff642476" 228 | }, 229 | "source": [ 230 | "model.predict_classes(np.random.randn(10,1,1))" 231 | ], 232 | "execution_count": 44, 233 | "outputs": [ 234 | { 235 | "output_type": "execute_result", 236 | "data": { 237 | "text/plain": [ 238 | "array([[0],\n", 239 | " [0],\n", 240 | " [1],\n", 241 | " [0],\n", 242 | " [0],\n", 243 | " [1],\n", 244 | " [0],\n", 245 | " [0],\n", 246 | " [0],\n", 247 | " [1]], dtype=int32)" 248 | ] 249 | }, 250 | "metadata": { 251 | "tags": [] 252 | }, 253 | "execution_count": 44 254 | } 255 | ] 256 | } 257 | ] 258 | } -------------------------------------------------------------------------------- /recurrent neural networks/recurrent neural networks.md: -------------------------------------------------------------------------------- 1 | # Recurrent Neural Networks 2 | 3 | Recurrent Neural Networks (RNNs) are a form of machine learning algorithm that are ideal for 4 | sequential data such as text, time series, financial data, speech, audio, video among others. 5 | 6 | RNNs are ideal for solving problems where the sequence is more important than the individual items themselves. 7 | 8 | An RNNs is essentially a fully connected neural network that contains a refactoring of some of its layers into a loop. 9 | That loop is typically an iteration over the addition or concatenation of two inputs, a matrix multiplication and a non-linear function. 
10 | -------------------------------------------------------------------------------- /sentiment analysis/IMDB Sentiment Analysis - BERT.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"metadata":{"trusted":true},"cell_type":"code","source":"import transformers\nimport torch.nn as nn\nfrom tqdm import tqdm\nimport pandas as pd\nfrom sklearn.model_selection import train_test_split\nfrom sklearn import metrics\nimport numpy as np\nimport torch","execution_count":null,"outputs":[]},{"metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"cell_type":"code","source":"MAX_LEN = 512\nTRAIN_BATCH_SIZE = 8\nVALID_BATCH_SIZE = 4\nEPOCHS = 10\nACCUMULATION = 2\nBERT_PATH = '../input/bert_base_cased/'\nMODEL_PATH = \"model.bin\"\nTRAINING_FILE = \"../input/imbd-movie-reviews-for-binary-sentiment-analysis/MovieReviewTrainingDatabase.csv\"\nTOKENIZER = transformers.BertTokenizer.from_pretrained('bert-base-uncased',do_lower_case=True)","execution_count":null,"outputs":[]},{"metadata":{"_uuid":"d629ff2d2480ee46fbb7e2d37f6b5fab8052498a","_cell_guid":"79c7e3d0-c299-4dcb-8224-4455121ee9b0","trusted":true},"cell_type":"code","source":"class BERTBaseUncased(nn.Module):\n def __init__(self):\n super(BERTBaseUncased,self).__init__()\n self.bert = transformers.BertModel.from_pretrained('bert-base-uncased')\n self.bert_drop = nn.Dropout(0.3)\n self.out = nn.Linear(768,1)\n def forward(self, ids, mask, token_type_ids):\n _, o2 = self.bert(ids, \n attention_mask=mask, \n token_type_ids=token_type_ids)\n bo = self.bert_drop(o2)\n output = self.out(bo)\n return output","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"class BERTDataset:\n def __init__(self, review, target):\n self.review = review\n self.target = target\n self.tokenizer = TOKENIZER\n self.max_len = MAX_LEN\n \n def __len__(self):\n return len(self.review)\n \n def __getitem__(self, item):\n review = str(self.review[item])\n review = \" \".join(review.split())\n\n inputs = self.tokenizer.encode_plus(\n review,\n None,\n add_special_tokens=True,\n max_length=self.max_len,\n pad_to_max_length=True\n )\n\n ids = inputs[\"input_ids\"]\n mask = inputs[\"attention_mask\"]\n token_type_ids = inputs[\"token_type_ids\"]\n\n return {\n 'ids': torch.tensor(ids, dtype=torch.long),\n 'mask': torch.tensor(mask, dtype=torch.long),\n 'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),\n 'targets': torch.tensor(self.target[item], dtype=torch.float)\n }","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"def loss_fn(outputs, targets):\n return nn.BCEWithLogitsLoss()(outputs, targets.view(-1, 1))\n\n\ndef train_fn(data_loader, model, optimizer, device, scheduler):\n model.train()\n\n for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):\n ids = d[\"ids\"]\n token_type_ids = d[\"token_type_ids\"]\n mask = d[\"mask\"]\n targets = d[\"targets\"]\n\n ids = ids.to(device, dtype=torch.long)\n token_type_ids = token_type_ids.to(device, dtype=torch.long)\n mask = mask.to(device, dtype=torch.long)\n targets = targets.to(device, dtype=torch.float)\n\n optimizer.zero_grad()\n outputs = model(\n ids=ids,\n mask=mask,\n token_type_ids=token_type_ids\n )\n\n loss = loss_fn(outputs, targets)\n loss.backward()\n optimizer.step()\n scheduler.step()\n\n\ndef eval_fn(data_loader, model, device):\n model.eval()\n fin_targets = []\n fin_outputs = 
[]\n with torch.no_grad():\n for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):\n ids = d[\"ids\"]\n token_type_ids = d[\"token_type_ids\"]\n mask = d[\"mask\"]\n targets = d[\"targets\"]\n\n ids = ids.to(device, dtype=torch.long)\n token_type_ids = token_type_ids.to(device, dtype=torch.long)\n mask = mask.to(device, dtype=torch.long)\n targets = targets.to(device, dtype=torch.float)\n\n outputs = model(\n ids=ids,\n mask=mask,\n token_type_ids=token_type_ids\n )\n fin_targets.extend(targets.cpu().detach().numpy().tolist())\n fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())\n return fin_outputs, fin_targets","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"def run():\n df = pd.read_csv(TRAINING_FILE).fillna(\"None\")\n \n df.sentiment = df.sentiment.apply(lambda x: 1 if x==\"positive\" else 0)\n \n df_train, df_valid = train_test_split(df, test_size=0.1, random_state=2020, stratify=df.sentiment.values)\n \n df_train = df_train.reset_index(drop=True)\n df_valid = df_valid.reset_index(drop=True)\n \n train_dataset = BERTDataset(review=df_train.review.values, target=df_train.sentiment.values)\n train_data_loader = torch.utils.data.DataLoader(\n train_dataset, \n batch_size=TRAIN_BATCH_SIZE,\n num_workers=4\n )\n \n valid_dataset = BERTDataset(review=df_valid.review.values, target=df_valid.sentiment.values)\n valid_data_loader = torch.utils.data.DataLoader(\n valid_dataset, \n batch_size=VALID_BATCH_SIZE,\n num_workers=4\n )\n \n device = torch.device(\"cuda\")\n model = BERTBaseUncased()\n model.to(device)\n param_optimizer = list(model.named_parameters())\n no_decay = [\"bias\",\"LayerNorm.bias\",\"LayerNorm.weight\"]\n optimizer_parameters = [\n {\n \"params\":[p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay':0.001\n },\n {\n \"params\":[p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay':0.0\n }\n ]\n num_train_steps = int(len(df_train)/TRAIN_BATCH_SIZE*EPOCHS)\n optimizer = transformers.AdamW(optimizer_parameters, lr=3e-5)\n scheduler = transformers.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)\n model = nn.DataParallel(model)\n best_accuracy = 0\n for epoch in range(EPOCHS):\n train_fn(train_data_loader, model, optimizer, device, scheduler)\n outputs, targets = eval_fn(valid_data_loader, model, device)\n outputs = np.array(outputs) >= 0.5\n accuracy = metrics.accuracy_score(targets, outputs)\n print(f\"Accuracy score = {accuracy}\")\n if accuracy > best_accuracy:\n torch.save(model.state_dict(),MODEL_PATH)\n best_accuracy = accuracy","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"if __name__ == \"__main__\":\n run()","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"","execution_count":null,"outputs":[]}],"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"pygments_lexer":"ipython3","nbconvert_exporter":"python","version":"3.6.4","file_extension":".py","codemirror_mode":{"name":"ipython","version":3},"name":"python","mimetype":"text/x-python"}},"nbformat":4,"nbformat_minor":4} -------------------------------------------------------------------------------- /text classification/text classification.md: 
-------------------------------------------------------------------------------- 1 | # Text classification 2 | 3 | Text classification is the task of assigning a sentence or document an appropriate category. 4 | The categories depend on the chosen dataset and can range from news topics (AG News) to ontology classes (DBpedia) to question types (TREC). 5 | 6 | ### AG News 7 | 8 | The [AG News corpus](https://papers.nips.cc/paper/5782-character-level-convolutional-networks-for-text-classification.pdf) 9 | consists of news articles from the [AG's corpus of news articles on the web](http://www.di.unipi.it/~gulli/AG_corpus_of_news_articles.html) 10 | pertaining to the 4 largest classes. The dataset contains 30,000 training and 1,900 testing examples for each class. 11 | Models are evaluated based on error rate (lower is better). 12 | 13 | | Model | Error | Paper / Source | Code | 14 | | ------------- | :-----:| --- | :-----: | 15 | | XLNet (Yang et al., 2019) | 4.49 | [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/pdf/1906.08237.pdf) | [Official](https://github.com/zihangdai/xlnet/) | 16 | | ULMFiT (Howard and Ruder, 2018) | 5.01 | [Universal Language Model Fine-tuning for Text Classification](https://arxiv.org/abs/1801.06146) | [Official](http://nlp.fast.ai/ulmfit ) | 17 | | CNN (Johnson and Zhang, 2016) * | 6.57 | [Supervised and Semi-Supervised Text Categorization using LSTM for Region Embeddings](https://arxiv.org/abs/1602.02373) | [Official](https://github.com/riejohnson/ConText ) | 18 | | DPCNN (Johnson and Zhang, 2017) | 6.87 | [Deep Pyramid Convolutional Neural Networks for Text Categorization](http://aclweb.org/anthology/P17-1052) | [Official](https://github.com/riejohnson/ConText ) | 19 | | VDCN (Alexis et al., 2016) | 8.67 | [Very Deep Convolutional Networks for Text Classification](https://arxiv.org/abs/1606.01781) | [Non Official](https://github.com/ArdalanM/nlp-benchmarks/tree/master/src/vdcnn) | 20 | | Char-level CNN (Zhang et al., 2015) | 9.51 | [Character-level Convolutional Networks for Text Classification](https://papers.nips.cc/paper/5782-character-level-convolutional-networks-for-text-classification.pdf) | [Non Official](https://github.com/ArdalanM/nlp-benchmarks/tree/master/src/cnn) | 21 | 22 | \* Results reported in Johnson and Zhang, 2017 23 | 24 | ### DBpedia 25 | 26 | The [DBpedia ontology](https://papers.nips.cc/paper/5782-character-level-convolutional-networks-for-text-classification.pdf) 27 | dataset contains 560,000 training samples and 70,000 testing samples for each of 14 non-overlapping classes from DBpedia. 28 | Models are evaluated based on error rate (lower is better).
29 | 30 | | Model | Error | Paper / Source | Code | 31 | | ------------- | :-----:| --- | :-----: | 32 | | XLNet (Yang et al., 2019) | 0.62 | [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/pdf/1906.08237.pdf) | [Official](https://github.com/zihangdai/xlnet/) | 33 | | BERT (Devlin et al., 2018) | 0.64 | [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) | [Official](https://github.com/google-research/bert) | 34 | | ULMFiT (Howard and Ruder, 2018) | 0.80 | [Universal Language Model Fine-tuning for Text Classification](https://arxiv.org/abs/1801.06146) | [Official](http://nlp.fast.ai/ulmfit ) | 35 | | CNN (Johnson and Zhang, 2016) | 0.84 | [Supervised and Semi-Supervised Text Categorization using LSTM for Region Embeddings](https://arxiv.org/abs/1602.02373) | [Official](https://github.com/riejohnson/ConText ) | 36 | | DPCNN (Johnson and Zhang, 2017) | 0.88 | [Deep Pyramid Convolutional Neural Networks for Text Categorization](http://aclweb.org/anthology/P17-1052) | [Official](https://github.com/riejohnson/ConText ) | 37 | | VDCNN (Conneau et al., 2016) | 1.29 | [Very Deep Convolutional Networks for Text Classification](https://arxiv.org/abs/1606.01781) | [Non Official](https://github.com/ArdalanM/nlp-benchmarks/tree/master/src/vdcnn) | 38 | | Char-level CNN (Zhang et al., 2015) | 1.55 | [Character-level Convolutional Networks for Text Classification](https://papers.nips.cc/paper/5782-character-level-convolutional-networks-for-text-classification.pdf) | [Non Official](https://github.com/ArdalanM/nlp-benchmarks/tree/master/src/cnn) | 39 | 40 | ### TREC 41 | 42 | The [TREC dataset](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.11.2766&rep=rep1&type=pdf) is a dataset for 43 | question classification consisting of open-domain, fact-based questions divided into broad semantic categories. 44 | It has both a six-class (TREC-6) and a fifty-class (TREC-50) version. Both have 5,452 training examples and 500 test examples, 45 | but TREC-50 has finer-grained labels. Models are evaluated based on classification error (lower is better).
46 | 47 | TREC-6: 48 | 49 | | Model | Error | Paper / Source | Code | 50 | | ------------- | :-----:| --- | :-----: | 51 | | USE_T+CNN (Cer et al., 2018) | 1.93 | [Universal Sentence Encoder](https://arxiv.org/pdf/1803.11175.pdf) | [Official](https://tfhub.dev/google/universal-sentence-encoder/1) | 52 | | ULMFiT (Howard and Ruder, 2018) | 3.6 | [Universal Language Model Fine-tuning for Text Classification](https://arxiv.org/abs/1801.06146) | [Official](http://nlp.fast.ai/ulmfit ) | 53 | | LSTM-CNN (Zhou et al., 2016) | 3.9 | [Text Classification Improved by Integrating Bidirectional LSTM with Two-dimensional Max Pooling](http://www.aclweb.org/anthology/C16-1329) | | 54 | | CNN+MCFA (Amplayo et al., 2018) | 4 | [Translations as Additional Contexts for Sentence Classification](https://arxiv.org/pdf/1806.05516.pdf) | | 55 | | TBCNN (Mou et al., 2015) | 4 | [Discriminative Neural Sentence Modeling by Tree-Based Convolution](http://aclweb.org/anthology/D15-1279) | | 56 | | CoVe (McCann et al., 2017) | 4.2 | [Learned in Translation: Contextualized Word Vectors](https://arxiv.org/abs/1708.00107) | | 57 | 58 | TREC-50: 59 | 60 | | Model | Error | Paper / Source | Code | 61 | | ------------- | :-----:| --- | :-----: | 62 | | Rules (Madabushi and Lee, 2016) | 2.8 | [High Accuracy Rule-based Question Classification using Question Syntax and Semantics](http://www.aclweb.org/anthology/C16-1116) | | 63 | | SVM (Van-Tu and Anh-Cuong, 2016) | 8.4 | [Improving Question Classification by Feature Extraction and Selection](https://www.researchgate.net/publication/303553351_Improving_Question_Classification_by_Feature_Extraction_and_Selection) | | 64 | 65 | [Go back to the README](../README.md) 66 | -------------------------------------------------------------------------------- /text generation/text generation.md: -------------------------------------------------------------------------------- 1 | # Text Generation 2 | 3 | > Find more at [Papers with Code](https://paperswithcode.com/task/text-generation) 4 | 5 | Text generation is the task of generating text with the goal of appearing indistinguishable from human-written text. 6 | 7 | -------------------------------------------------------------------------------- /text summarization/summarization.md: -------------------------------------------------------------------------------- 1 | # Summarization 2 | 3 | Summarization is the task of producing a shorter version of one or several documents that preserves most of the 4 | input's meaning. 5 | 6 | ### Warning: Evaluation Metrics 7 | 8 | For summarization, automatic metrics such as ROUGE and METEOR have serious limitations: 9 | 1. They only assess content selection and do not account for other quality aspects, such as fluency, grammaticality, coherence, etc. 10 | 2. To assess content selection, they rely mostly on lexical overlap, although an abstractive summary could express the same content as a reference without any lexical overlap. 11 | 3. Given the subjectiveness of summarization and the correspondingly low agreement between annotators, the metrics were designed to be used with multiple reference summaries per input. However, recent datasets such as CNN/DailyMail and Gigaword provide only a single reference. 12 | 13 | Therefore, tracking progress and claiming state-of-the-art based only on these metrics is questionable. Most papers carry out additional manual comparisons of alternative summaries. Unfortunately, such experiments are difficult to compare across papers.
If you have an idea on how to do that, feel free to contribute. 14 | 15 | 16 | ### CNN / Daily Mail 17 | 18 | The [CNN / Daily Mail dataset](https://arxiv.org/abs/1506.03340) as processed by 19 | [Nallapati et al. (2016)](http://www.aclweb.org/anthology/K16-1028) has been used 20 | for evaluating summarization. The dataset contains online news articles (781 tokens 21 | on average) paired with multi-sentence summaries (3.75 sentences or 56 tokens on average). 22 | The processed version contains 287,226 training pairs, 13,368 validation pairs and 11,490 test pairs. 23 | Models are evaluated with full-length F1-scores of ROUGE-1, ROUGE-2, ROUGE-L, and METEOR (optional). 24 | 25 | #### Anonymized version 26 | 27 | The following models have been evaluated on the entity-anonymized version of the dataset introduced by [Nallapati et al. (2016)](http://www.aclweb.org/anthology/K16-1028). 28 | 29 | | Model | ROUGE-1 | ROUGE-2 | ROUGE-L | METEOR | Paper / Source | Code | 30 | | --------------- | :-----: | :-----: | :-----: | :----: | -------------- | ---- | 31 | | RNES w/o coherence (Wu and Hu, 2018) | 41.25 | 18.87 | 37.75 | - | [Learning to Extract Coherent Summary via Deep Reinforcement Learning](https://www.aaai.org/ocs/index.php/AAAI/AAAI18/paper/view/16838/16118) | | 32 | | SWAP-NET (Jadhav and Rajan, 2018) | 41.6 | 18.3 | 37.7 | - | [Extractive Summarization with SWAP-NET: Sentences and Words from Alternating Pointer Networks](http://aclweb.org/anthology/P18-1014) | | 33 | | HSSAS (Al-Sabahi et al., 2018) | 42.3 | 17.8 | 37.6 | - | [A Hierarchical Structured Self-Attentive Model for Extractive Document Summarization (HSSAS)](https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=8344797) | | 34 | | GAN (Liu et al., 2018) | 39.92 | 17.65 | 36.71 | - | [Generative Adversarial Network for Abstractive Text Summarization](https://aaai.org/ocs/index.php/AAAI/AAAI18/paper/view/16238/16492) | | 35 | | KIGN+Prediction-guide (Li et al., 2018) | 38.95 | 17.12 | 35.68 | - | [Guiding Generation for Abstractive Text Summarization based on Key Information Guide Network](http://aclweb.org/anthology/N18-2009) | | 36 | | SummaRuNNer (Nallapati et al., 2017) | 39.6 | 16.2 | 35.3 | - | [SummaRuNNer: A Recurrent Neural Network based Sequence Model for Extractive Summarization of Documents](https://arxiv.org/abs/1611.04230) | | 37 | | rnn-ext + abs + RL + rerank (Chen and Bansal, 2018) | 39.66 | 15.85 | 37.34 | - | [Fast Abstractive Summarization with Reinforce-Selected Sentence Rewriting](http://aclweb.org/anthology/P18-1063) | [Official](https://github.com/ChenRocks/fast_abs_rl) | 38 | | ML+RL, with intra-attention (Paulus et al., 2018) | 39.87 | 15.82 | 36.90 | - | [A Deep Reinforced Model for Abstractive Summarization](https://openreview.net/pdf?id=HkAClQgA-) | | 39 | | Lead-3 baseline (Nallapati et al., 2017) | 39.2 | 15.7 | 35.5 | - | [SummaRuNNer: A Recurrent Neural Network based Sequence Model for Extractive Summarization of Documents](https://arxiv.org/abs/1611.04230) | | 40 | | ML+RL ROUGE+Novel, with LM (Kryscinski et al., 2018) | 40.02 | 15.53 | 37.44 | - | [Improving Abstraction in Text Summarization](http://aclweb.org/anthology/D18-1207) | | 41 | | (Tan et al., 2017) | 38.1 | 13.9 | 34.0 | - | [Abstractive Document Summarization with a Graph-Based Attentional Neural Model](http://aclweb.org/anthology/P17-1108) | | 42 | | words-lvt2k-temp-att (Nallapati et al., 2016) | 35.46 | 13.30 | 32.65 | - | [Abstractive Text Summarization using Sequence-to-sequence RNNs and
Beyond](http://www.aclweb.org/anthology/K16-1028) | | 43 | 44 | 45 | #### Non-Anonymized Version: Extractive Models 46 | 47 | The following models have been evaluated on the non-anonymized version of the dataset introduced by [See et al. (2017)](http://aclweb.org/anthology/P17-1099). 48 | 49 | The first table covers Extractive Models, while the second covers abstractive approaches. 50 | 51 | | Model | ROUGE-1 | ROUGE-2 | ROUGE-L | METEOR | Paper / Source | Code | 52 | | --------------- | :-----: | :-----: | :-----: | :----: | -------------- | ---- | 53 | | BertSumExt (Liu and Lapata 2019) | 43.85 | 20.34 | 39.90 | - | [Text Summarization with Pretrained Encoders](https://arxiv.org/abs/1908.08345) |[Official](https://github.com/nlpyang/PreSumm) | 54 | | BERT-ext + RL (Bae et al., 2019) | 42.76 | 19.87 | 39.11 | - | [Summary Level Training of Sentence Rewriting for Abstractive Summarization](https://arxiv.org/abs/1909.08752) | | 55 | | PNBERT (Zhong et al., 2019) | 42.69 | 19.60 | 38.85 | - | [Searching for Effective Neural Extractive Summarization: What Works and What's Next](https://arxiv.org/abs/1907.03491) | [Official](https://github.com/maszhongming/Effective_Extractive_Summarization) | 56 | | HIBERT (Zhang et al., 2019) | 42.37 | 19.95 | 38.83 | - | [HIBERT: Document Level Pre-training of Hierarchical Bidirectional Transformers for Document Summarization](https://arxiv.org/abs/1905.06566) | | 57 | | NeuSUM (Zhou et al., 2018) | 41.59 | 19.01 | 37.98 | - | [Neural Document Summarization by Jointly Learning to Score and Select Sentences](http://aclweb.org/anthology/P18-1061) | [Official](https://github.com/magic282/NeuSum) | 58 | | Latent (Zhang et al., 2018) | 41.05 | 18.77 | 37.54 | - | [Neural Latent Extractive Document Summarization](http://aclweb.org/anthology/D18-1088) | | 59 | | BanditSum (Dong et al., 2018) | 41.5 | 18.7 | 37.6 | - | [BANDITSUM: Extractive Summarization as a Contextual Bandit](https://aclweb.org/anthology/D18-1409) | [Official](https://github.com/yuedongP/BanditSum)| 60 | | REFRESH (Narayan et al., 2018) | 40.0 | 18.2 | 36.6 | - | [Ranking Sentences for Extractive Summarization with Reinforcement Learning](http://aclweb.org/anthology/N18-1158) | [Official](https://github.com/EdinburghNLP/Refresh) | 61 | | Lead-3 baseline (See et al., 2017) | 40.34 | 17.70 | 36.57 | 22.21 | [Get To The Point: Summarization with Pointer-Generator Networks](http://aclweb.org/anthology/P17-1099) | [Official](https://github.com/abisee/pointer-generator) | 62 | 63 | #### Non-Anonymized: Abstractive Models & Mixed Models 64 | 65 | | Model | ROUGE-1 | ROUGE-2 | ROUGE-L | METEOR | Paper / Source | Code | 66 | | --------------- | :-----: | :-----: | :-----: | :----: | -------------- | ---- | 67 | | PEGASUS (Zhang et al., 2019) | 44.17 | 21.47 | 41.11 | - | [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/pdf/1912.08777.pdf) | - | 68 | | BART (Lewis et al., 2019) | 44.16 | 21.28 | 40.90 | - | [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) | [Official](https://github.com/pytorch/fairseq/tree/master/examples/bart) | 69 | | ProphetNet (Yan, Qi, Gong, Liu et al., 2020) | 43.68 | 20.64 | 40.72 | - | [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/pdf/2001.04063.pdf) | - | 70 | | T5 (Raffel et al., 2019) | 43.52 | 21.55 | 40.69 | - | [Exploring the Limits of Transfer Learning with a Unified 
Text-to-Text Transformer](https://arxiv.org/pdf/1910.10683.pdf) | [Official](https://github.com/google-research/text-to-text-transfer-transformer) | 71 | | UniLM (Dong et al., 2019) | 43.33 | 20.21 | 40.51 | - | [Unified Language Model Pre-training for Natural Language Understanding and Generation](https://arxiv.org/pdf/1905.03197.pdf) | [Official](https://github.com/microsoft/unilm) | 72 | | CNN-2sent-hieco-RBM (Zhang et al., 2019) | 42.04 | 19.77 | 39.42 | - | [Abstract Text Summarization with a Convolutional Seq2Seq Model](https://www.mdpi.com/2076-3417/9/8/1665/pdf) | | 73 | | BertSumExtAbs (Liu and Lapata 2019) | 42.13 | 19.60 | 39.18 | - | [Text Summarization with Pretrained Encoders](https://arxiv.org/abs/1908.08345) | [Official](https://github.com/nlpyang/PreSumm) | 74 | | BERT-ext + abs + RL + rerank (Bae et al., 2019) | 41.90 | 19.08 | 39.64 | - | [Summary Level Training of Sentence Rewriting for Abstractive Summarization](https://arxiv.org/abs/1909.08752) | | 75 | | Two-Stage + RL (Zhang et al., 2019) | 41.71 | 19.49 | 38.79 | - | [Pretraining-Based Natural Language Generation for Text Summarization](https://arxiv.org/abs/1902.09243) | | 76 | | DCA (Celikyilmaz et al., 2018) | 41.69 | 19.47 | 37.92 | - | [Deep Communicating Agents for Abstractive Summarization](http://aclweb.org/anthology/N18-1150) | | 77 | | EditNet (Moroshko et al., 2018) | 41.42 | 19.03 | 38.36 | - | [An Editorial Network for Enhanced Document Summarization](https://arxiv.org/abs/1902.10360) | | 78 | | rnn-ext + RL (Chen and Bansal, 2018) | 41.47 | 18.72 | 37.76 | 22.35 | [Fast Abstractive Summarization with Reinforce-Selected Sentence Rewriting](http://aclweb.org/anthology/P18-1063) | [Official](https://github.com/chenrocks/fast_abs_rl) | 79 | | Bottom-Up Summarization (Gehrmann et al., 2018) | 41.22 | 18.68 | 38.34 | - | [Bottom-Up Abstractive Summarization](https://arxiv.org/abs/1808.10792) | [Official](https://github.com/sebastianGehrmann/bottom-up-summary) | 80 | | (Li et al., 2018a) | 41.54 | 18.18 | 36.47 | - | [Improving Neural Abstractive Document Summarization with Explicit Information Selection Modeling](http://aclweb.org/anthology/D18-1205) | | 81 | | (Li et al., 2018b) | 40.30 | 18.02 | 37.36 | - | [Improving Neural Abstractive Document Summarization with Structural Regularization](http://aclweb.org/anthology/D18-1441) | | 82 | | ROUGESal+Ent RL (Pasunuru and Bansal, 2018) | 40.43 | 18.00 | 37.10 | 20.02 | [Multi-Reward Reinforced Summarization with Saliency and Entailment](http://aclweb.org/anthology/N18-2102) | | 83 | | end2end w/ inconsistency loss (Hsu et al., 2018) | 40.68 | 17.97 | 37.13 | - | [A Unified Model for Extractive and Abstractive Summarization using Inconsistency Loss](http://aclweb.org/anthology/P18-1013) | | 84 | | RL + pg + cbdec (Jiang and Bansal, 2018) | 40.66 | 17.87 | 37.06 | 20.51 | [Closed-Book Training to Improve Summarization Encoder Memory](http://aclweb.org/anthology/D18-1440) | | 85 | | rnn-ext + abs + RL + rerank (Chen and Bansal, 2018) | 40.88 | 17.80 | 38.54 | 20.38 | [Fast Abstractive Summarization with Reinforce-Selected Sentence Rewriting](http://aclweb.org/anthology/P18-1063) | [Official](https://github.com/chenrocks/fast_abs_rl) | 86 | | Pointer + Coverage + EntailmentGen + QuestionGen (Guo et al., 2018) | 39.81 | 17.64 | 36.54 | 18.54 | [Soft Layer-Specific Multi-Task Summarization with Entailment and Question Generation](http://aclweb.org/anthology/P18-1064) | | 87 | | ML+RL ROUGE+Novel, with LM (Kryscinski et al., 2018) | 40.19 | 17.38 | 37.52 | - |
[Improving Abstraction in Text Summarization](http://aclweb.org/anthology/D18-1207) | | 88 | | Pointer-generator + coverage (See et al., 2017) | 39.53 | 17.28 | 36.38 | 18.72 | [Get To The Point: Summarization with Pointer-Generator Networks](http://aclweb.org/anthology/P17-1099) | [Official](https://github.com/abisee/pointer-generator) | 89 | 90 | ### Gigaword 91 | 92 | The Gigaword summarization dataset was first used by [Rush et al., 2015](https://www.aclweb.org/anthology/D/D15/D15-1044.pdf) and represents a sentence summarization / headline generation task with very short input documents (31.4 tokens) and summaries (8.3 tokens). It contains 3.8M training, 189k development and 1,951 test instances. Models are evaluated with ROUGE-1, ROUGE-2 and ROUGE-L using full-length F1-scores. 93 | 94 | Results below are ranked by ROUGE-2 score. 95 | 96 | | Model | ROUGE-1 | ROUGE-2* | ROUGE-L | Paper / Source | Code | 97 | | --------------- | :-----: | :-----: | :-----: | -------------- | ---- | 98 | | ControlCopying (Song et al., 2020) | 39.08 | 20.47 | 36.69 | [Controlling the Amount of Verbatim Copying in Abstractive Summarization](https://arxiv.org/pdf/1911.10390.pdf) | [Official](https://github.com/ucfnlp/control-over-copying) | 99 | | ProphetNet (Yan, Qi, Gong, Liu et al., 2020) | 39.23 | 20.36 | 36.57 | [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/pdf/2001.04063.pdf) | - | 100 | | UniLM (Dong et al., 2019) | 38.90 | 20.05 | 36.00 | [Unified Language Model Pre-training for Natural Language Understanding and Generation](https://arxiv.org/pdf/1905.03197.pdf) | [Official](https://github.com/microsoft/unilm) | 101 | | PEGASUS (Zhang et al., 2019) | 39.12 | 19.86 | 36.24 | [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/pdf/1912.08777.pdf) | - | 102 | | BiSET (Wang et al., 2019) | 39.11 | 19.78 | 36.87 | [BiSET: Bi-directional Selective Encoding with Template for Abstractive Summarization](https://www.aclweb.org/anthology/P19-1207) | [Official](https://github.com/InitialBug/BiSET) | 103 | | MASS (Song et al., 2019) | 38.73 | 19.71 | 35.96 | [MASS: Masked Sequence to Sequence Pre-training for Language Generation](https://arxiv.org/pdf/1905.02450v5.pdf) | [Official](https://github.com/microsoft/MASS) | 104 | | Re^3 Sum (Cao et al., 2018) | 37.04 | 19.03 | 34.46 | [Retrieve, Rerank and Rewrite: Soft Template Based Neural Summarization](http://aclweb.org/anthology/P18-1015) | | 105 | | JointParsing (Song et al., 2020) | 36.61 | 18.85 | 34.33 | [Joint Parsing and Generation for Abstractive Summarization](https://arxiv.org/pdf/1911.10389.pdf) | [Official](https://github.com/KaiQiangSong/joint_parse_summ) | 106 | | CNN-2sent-hieco-RBM (Zhang et al., 2019) | 37.95 | 18.64 | 35.11 | [Abstract Text Summarization with a Convolutional Seq2Seq Model](https://www.mdpi.com/2076-3417/9/8/1665/pdf) | | 107 | | Reinforced-Topic-ConvS2S (Wang et al., 2018) | 36.92 | 18.29 | 34.58 | [A Reinforced Topic-Aware Convolutional Sequence-to-Sequence Model for Abstractive Text Summarization](https://www.ijcai.org/proceedings/2018/0619.pdf) | | 108 | | CGU (Lin et al., 2018) | 36.3 | 18.0 | 33.8 | [Global Encoding for Abstractive Summarization](http://aclweb.org/anthology/P18-2027) | [Official](https://www.github.com/lancopku/Global-Encoding) | 109 | | Pointer + Coverage + EntailmentGen + QuestionGen (Guo et al., 2018) | 35.98 | 17.76 | 33.63 | [Soft Layer-Specific Multi-Task Summarization with Entailment and Question
Generation](http://aclweb.org/anthology/P18-1064) | | 110 | | Struct+2Way+Word (Song et al., 2018) | 35.47 | 17.66 | 33.52 | [Structure-Infused Copy Mechanisms for Abstractive Summarization](http://aclweb.org/anthology/C18-1146) | [Official](https://github.com/KaiQiangSong/struct_infused_summ) | 111 | | FTSum_g (Cao et al., 2018) | 37.27 | 17.65 | 34.24 | [Faithful to the Original: Fact Aware Neural Abstractive Summarization](https://arxiv.org/pdf/1711.04434.pdf) | | 112 | | DRGD (Li et al., 2017) | 36.27 | 17.57 | 33.62 | [Deep Recurrent Generative Decoder for Abstractive Text Summarization](http://aclweb.org/anthology/D17-1222) | | 113 | | SEASS (Zhou et al., 2017) | 36.15 | 17.54 | 33.63 | [Selective Encoding for Abstractive Sentence Summarization](http://aclweb.org/anthology/P17-1101) | [Official](https://github.com/magic282/SEASS) | 114 | | EndDec+WFE (Suzuki and Nagata, 2017) | 36.30 | 17.31 | 33.88 | [Cutting-off Redundant Repeating Generations for Neural Abstractive Summarization](http://aclweb.org/anthology/E17-2047) | | 115 | | Seq2seq + selective + MTL + ERAM (Li et al., 2018) | 35.33 | 17.27 | 33.19 | [Ensure the Correctness of the Summary: Incorporate Entailment Knowledge into Abstractive Sentence Summarization](http://aclweb.org/anthology/C18-1121) | | 116 | | Seq2seq + E2T_cnn (Amplayo et al., 2018) | 37.04 | 16.66 | 34.93 | [Entity Commonsense Representation for Neural Abstractive Summarization](http://aclweb.org/anthology/N18-1064) | | 117 | | RAS-Elman (Chopra et al., 2016) | 33.78 | 15.97 | 31.15 | [Abstractive Sentence Summarization with Attentive Recurrent Neural Networks](http://www.aclweb.org/anthology/N16-1012) | | 118 | | words-lvt5k-1sent (Nallapati et al., 2016) | 32.67 | 15.59 | 30.64 | [Abstractive Text Summarization using Sequence-to-sequence RNNs and Beyond](http://www.aclweb.org/anthology/K16-1028) | | 119 | | ABS+ (Rush et al., 2015) | 29.76 | 11.88 | 26.96 | [A Neural Attention Model for Sentence Summarization](https://www.aclweb.org/anthology/D/D15/D15-1044.pdf) * | | 120 | | ABS (Rush et al., 2015) | 29.55 | 11.32 | 26.42 | [A Neural Attention Model for Sentence Summarization](https://www.aclweb.org/anthology/D/D15/D15-1044.pdf) * | | 121 | 122 | (*) [Rush et al., 2015](https://www.aclweb.org/anthology/D/D15/D15-1044.pdf) report ROUGE recall; the table here contains ROUGE F1-scores for Rush's model as reported by [Chopra et al., 2016](http://www.aclweb.org/anthology/N16-1012). 123 | 124 | ### X-Sum 125 | 126 | X-Sum (standing for _Extreme Summarization_), introduced by [Narayan et al., 2018](https://arxiv.org/pdf/1808.08745.pdf), is a summarization dataset which does not favor extractive strategies and calls for an abstractive modeling approach. 127 | The idea of this dataset is to create a short, one-sentence news summary. 128 | Data is collected by harvesting online articles from the BBC. 129 | The dataset contains **204 045** samples for the training set, **11 332** for the validation set, and **11 334** for the test set. On average, an article is 431 words long (~20 sentences) and a summary is 23 words. It can be downloaded [here](https://github.com/EdinburghNLP/XSum). 130 | Evaluation metrics are ROUGE-1, ROUGE-2 and ROUGE-L.
131 | 132 | | Model | ROUGE-1 | ROUGE-2 | ROUGE-L | Paper / Source | Code | 133 | | --------------- | :-----: | :-----: | :-----: | -------------- | ---- | 134 | | PEGASUS (Zhang et al., 2019) | 47.21 | 24.56 | 39.25 | [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/pdf/1912.08777.pdf) | - | 135 | | BART (Lewis et al., 2019) | 45.14 | 22.27 | 37.25 | [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) | [Official](https://github.com/pytorch/fairseq/tree/master/examples/bart) | 136 | | BertSumExtAbs (Liu et al., 2019) | 38.81 | 16.50 | 31.27 | [Text Summarization with Pretrained Encoders](https://arxiv.org/pdf/1908.08345.pdf) | [Official](https://github.com/nlpyang/PreSumm) | 137 | | T-ConvS2S | 31.89 | 11.54 | 25.75 | [Don’t Give Me the Details, Just the Summary!](https://arxiv.org/pdf/1808.08745.pdf) | [Official](https://github.com/EdinburghNLP/XSum) | 138 | | PtGen | 29.70 | 9.21 | 23.24 | [Don’t Give Me the Details, Just the Summary!](https://arxiv.org/pdf/1808.08745.pdf) | [Official](https://github.com/EdinburghNLP/XSum) | 139 | | Seq2Seq | 28.42 | 8.77 | 22.48 | [Don’t Give Me the Details, Just the Summary!](https://arxiv.org/pdf/1808.08745.pdf) | [Official](https://github.com/EdinburghNLP/XSum) | 140 | | PtGen-Covg | 28.10 | 8.02 | 21.72 | [Don’t Give Me the Details, Just the Summary!](https://arxiv.org/pdf/1808.08745.pdf) | [Official](https://github.com/EdinburghNLP/XSum) | 141 | | Baseline: Extractive Oracle | 29.79 | 8.81 | 22.66 | [Don’t Give Me the Details, Just the Summary!](https://arxiv.org/pdf/1808.08745.pdf) | [Official](https://github.com/EdinburghNLP/XSum) | 142 | | Baseline: Lead-3 | 16.30 | 1.60 | 11.95 | [Don’t Give Me the Details, Just the Summary!](https://arxiv.org/pdf/1808.08745.pdf) | [Official](https://github.com/EdinburghNLP/XSum) | 143 | | Baseline: Random | 15.16 | 1.78 | 11.27 | [Don’t Give Me the Details, Just the Summary!](https://arxiv.org/pdf/1808.08745.pdf) | [Official](https://github.com/EdinburghNLP/XSum) | 144 | 145 | ### DUC 2004 Task 1 146 | 147 | Similar to Gigaword, task 1 of [DUC 2004](https://duc.nist.gov/duc2004/) is a sentence summarization task. The dataset contains 500 documents averaging 35.6 tokens, paired with summaries averaging 10.4 tokens. Due to its size, neural models are typically trained on other datasets and only tested on DUC 2004. Evaluation metrics are ROUGE-1, ROUGE-2 and ROUGE-L recall @ 75 bytes.
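Every benchmark in this file is scored with some ROUGE variant, so a minimal sketch of the underlying computation may be useful. This is a hedged illustration of ROUGE-N from raw n-gram overlap, not the official implementation (which adds options such as stemming and stopword removal; in the DUC setting above, the candidate is additionally truncated to 75 bytes and only recall is reported). The results table follows the sketch.

```python
# Hedged sketch of ROUGE-N against a single reference: clipped n-gram
# overlap, reported as recall, precision and F1. Most tables in this file
# use the F1 variant; DUC-style scores report recall only.
from collections import Counter

def ngrams(tokens, n):
    return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

def rouge_n(candidate, reference, n=1):
    cand = Counter(ngrams(candidate.lower().split(), n))
    ref = Counter(ngrams(reference.lower().split(), n))
    overlap = sum((cand & ref).values())  # clipped matches
    recall = overlap / max(sum(ref.values()), 1)
    precision = overlap / max(sum(cand.values()), 1)
    f1 = 0.0 if recall + precision == 0 else 2 * recall * precision / (recall + precision)
    return recall, precision, f1

# toy example, not taken from any dataset
print(rouge_n("the cat sat on the mat", "a cat was sitting on the mat", n=2))
```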
148 | 149 | | Model | ROUGE-1 | ROUGE-2 | ROUGE-L | Paper / Source | Code | 150 | | --------------- | :-----: | :-----: | :-----: | -------------- | ---- | 151 | | Transformer + LRPE + PE + Re-ranking (Takase and Okazaki, 2019) | 32.29 | 11.49 | 28.03 | [Positional Encoding to Control Output Sequence Length](https://arxiv.org/abs/1904.07418) | [Official](https://github.com/takase/control-length) | 152 | | DRGD (Li et al., 2017) | 31.79 | 10.75 | 27.48 | [Deep Recurrent Generative Decoder for Abstractive Text Summarization](http://aclweb.org/anthology/D17-1222) | | 153 | | EndDec+WFE (Suzuki and Nagata, 2017) | 32.28 | 10.54 | 27.8 | [Cutting-off Redundant Repeating Generations for Neural Abstractive Summarization](http://aclweb.org/anthology/E17-2047) | | 154 | | Reinforced-Topic-ConvS2S (Wang et al., 2018) | 31.15 | 10.85 | 27.68 | [A Reinforced Topic-Aware Convolutional Sequence-to-Sequence Model for Abstractive Text Summarization](https://www.ijcai.org/proceedings/2018/0619.pdf) | | 155 | | CNN-2sent-hieco-RBM (Zhang et al., 2019) | 29.74 | 9.85 | 25.81 | [Abstract Text Summarization with a Convolutional Seq2Seq Model](https://www.mdpi.com/2076-3417/9/8/1665/pdf) | | 156 | | Seq2seq + selective + MTL + ERAM (Li et al., 2018) | 29.33 | 10.24 | 25.24 | [Ensure the Correctness of the Summary: Incorporate Entailment Knowledge into Abstractive Sentence Summarization](http://aclweb.org/anthology/C18-1121) | | 157 | | SEASS (Zhou et al., 2017) | 29.21 | 9.56 | 25.51 | [Selective Encoding for Abstractive Sentence Summarization](http://aclweb.org/anthology/P17-1101) | | 158 | | words-lvt5k-1sent (Nallapati et al., 2016) | 28.61 | 9.42 | 25.24 | [Abstractive Text Summarization using Sequence-to-sequence RNNs and Beyond](http://www.aclweb.org/anthology/K16-1028) | | 159 | | ABS+ (Rush et al., 2015) | 28.18 | 8.49 | 23.81 | [A Neural Attention Model for Sentence Summarization](https://www.aclweb.org/anthology/D/D15/D15-1044.pdf) | | 160 | | RAS-Elman (Chopra et al., 2016) | 28.97 | 8.26 | 24.06 | [Abstractive Sentence Summarization with Attentive Recurrent Neural Networks](http://www.aclweb.org/anthology/N16-1012) | | 161 | | ABS (Rush et al., 2015) | 26.55 | 7.06 | 22.05 | [A Neural Attention Model for Sentence Summarization](https://www.aclweb.org/anthology/D/D15/D15-1044.pdf) | | 162 | 163 | ## Webis-TLDR-17 Corpus 164 | 165 | This [dataset](https://zenodo.org/record/1168855) contains 3 million pairs of content and self-written summaries mined from Reddit. It is one of the first large-scale summarization datasets from the social media domain. For more details, refer to [TL;DR: Mining Reddit to Learn Automatic Summarization](https://aclweb.org/anthology/W17-4508). 166 | 167 | ## Sentence Compression 168 | 169 | Sentence compression produces a shorter sentence by removing redundant information, 170 | preserving the grammaticality and the important content of the original sentence. 171 | 172 | ### Google Dataset 173 | 174 | The [Google Dataset](https://github.com/google-research-datasets/sentence-compression) was built by Filippova et al., 2013 ([Overcoming the Lack of Parallel Data in Sentence Compression](https://www.aclweb.org/anthology/D/D13/D13-1155.pdf)). The first release contained only 10,000 sentence-compression pairs, but an additional 200,000 pairs were made available later.
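The pairs are evaluated with token-level F1 and compression rate, both defined more precisely after the example below. As a quick illustration, a hedged sketch assuming whitespace tokenization and toy strings rather than actual dataset entries:

```python
# Hedged sketch of the two sentence-compression metrics used further below:
# token-level F1 between the golden and the generated compression, and the
# compression rate (CR) in characters relative to the source sentence.
from collections import Counter

def f1_tokens(golden: str, generated: str) -> float:
    gold, gen = Counter(golden.split()), Counter(generated.split())
    overlap = sum((gold & gen).values())  # tokens kept in both compressions
    precision = overlap / max(sum(gen.values()), 1)
    recall = overlap / max(sum(gold.values()), 1)
    return 0.0 if precision + recall == 0 else 2 * precision * recall / (precision + recall)

def compression_rate(sentence: str, generated: str) -> float:
    # length of the compression in characters divided by the sentence length
    return len(generated) / len(sentence)

sentence = "the quick brown fox jumps over the lazy dog"
golden = "the fox jumps over the dog"
generated = "the quick fox jumps over the dog"
print(f1_tokens(golden, generated), compression_rate(sentence, generated))
```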
175 | 176 | Example of a sentence-compression pair: 177 | > Sentence: Floyd Mayweather is open to fighting Amir Khan in the future, despite snubbing the Bolton-born boxer in favour of a May bout with Argentine Marcos Maidana, according to promoters Golden Boy 178 | 179 | > Compression: Floyd Mayweather is open to fighting Amir Khan in the future. 180 | 181 | In short, this is a deletion-based task where the compression is a subsequence of the original sentence. From the 10,000 pairs of the eval portion ([repository](https://github.com/google-research-datasets/sentence-compression/tree/master/data)), the first 1,000 sentences are used for automatic evaluation, and the 200,000 pairs are used for training. 182 | 183 | Models are evaluated using the following metrics: 184 | * F1 - the recall and precision of tokens kept, computed between the golden and the generated compressions. 185 | * Compression rate (CR) - the length of the compression in characters divided by the sentence length. 186 | 187 | | Model | F1 | CR | Paper / Source | Code | 188 | | ------------- | :-----:| --- | --- | --- | 189 | | BiRNN + LM Evaluator (Zhao et al., 2018) | 0.851 | 0.39 | [A Language Model based Evaluator for Sentence Compression](https://aclweb.org/anthology/P18-2028) | https://github.com/code4conference/code4sc | 190 | | LSTM (Filippova et al., 2015) | 0.82 | 0.38 | [Sentence Compression by Deletion with LSTMs](https://research.google.com/pubs/archive/43852.pdf) | | 191 | | BiLSTM (Wang et al., 2017) | 0.8 | 0.43 | [Can Syntax Help? Improving an LSTM-based Sentence Compression Model for New Domains](http://www.aclweb.org/anthology/P17-1127) | | 192 | 193 | [Go back to the README](../README.md) 194 | -------------------------------------------------------------------------------- /universal sentence encoder/Semantic_Textual_Similarity_using_Universal_Sentence_Encoder.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Semantic Textual Similarity using Universal Sentence Encoder.ipynb", 7 | "provenance": [], 8 | "authorship_tag": "ABX9TyOG0xsrn2afmfNaZ8WcAVu7", 9 | "include_colab_link": true 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | "accelerator": "GPU" 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "markdown", 20 | "metadata": { 21 | "id": "view-in-github", 22 | "colab_type": "text" 23 | }, 24 | "source": [ 25 | "\"Open" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "metadata": { 31 | "id": "zm2qu7hoFH_z", 32 | "colab_type": "code", 33 | "colab": { 34 | "base_uri": "https://localhost:8080/", 35 | "height": 50 36 | }, 37 | "outputId": "bc70add6-4ebc-402c-9f91-bf8cea96d121" 38 | }, 39 | "source": [ 40 | "%tensorflow_version 2.x\n", 41 | "import tensorflow as tf\n", 42 | "import tensorflow_hub as hub\n", 43 | "from sklearn import preprocessing\n", 44 | "from tensorflow import keras\n", 45 | "import numpy as np \n", 46 | "import pandas as pd\n", 47 | "\n", 48 | "from absl import logging\n", 49 | "import os\n", 50 | "import re\n", 51 | "import seaborn as sns\n", 52 | "\n", 53 | "module_url = \"https://tfhub.dev/google/universal-sentence-encoder-large/5\" #@param [\"https://tfhub.dev/google/universal-sentence-encoder/4\", \"https://tfhub.dev/google/universal-sentence-encoder-large/5\"]\n", 54 | "emb = hub.load(module_url)\n", 55 | "\n", 56 | "print(f\"{module_url} has been loaded.\")" 57 | ], 58 | "execution_count": 1,
"outputs": [ 60 | { 61 | "output_type": "stream", 62 | "text": [ 63 | "TensorFlow 2.x selected.\n", 64 | "https://tfhub.dev/google/universal-sentence-encoder-large/5 has been loaded.\n" 65 | ], 66 | "name": "stdout" 67 | } 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "metadata": { 73 | "id": "hX_FYI3rFWbJ", 74 | "colab_type": "code", 75 | "colab": {} 76 | }, 77 | "source": [ 78 | "def embed(input):\n", 79 | " return emb(input)" 80 | ], 81 | "execution_count": 0, 82 | "outputs": [] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "metadata": { 87 | "id": "t04wD3iZFYwx", 88 | "colab_type": "code", 89 | "colab": {} 90 | }, 91 | "source": [ 92 | "import scipy\n", 93 | "import math\n", 94 | "import csv\n", 95 | "import pandas " 96 | ], 97 | "execution_count": 0, 98 | "outputs": [] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "metadata": { 103 | "id": "s7Bm2-k_FeQY", 104 | "colab_type": "code", 105 | "colab": { 106 | "base_uri": "https://localhost:8080/", 107 | "height": 50 108 | }, 109 | "outputId": "5ce8d1fc-68f5-4a3a-e9be-7ecb932aaa49" 110 | }, 111 | "source": [ 112 | "sts_dataset = tf.keras.utils.get_file(\n", 113 | " fname=\"Stsbenchmark.tar.gz\",\n", 114 | " origin=\"http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz\",\n", 115 | " extract=True)" 116 | ], 117 | "execution_count": 5, 118 | "outputs": [ 119 | { 120 | "output_type": "stream", 121 | "text": [ 122 | "Downloading data from http://ixa2.si.ehu.es/stswiki/images/4/48/Stsbenchmark.tar.gz\n", 123 | "417792/409630 [==============================] - 2s 5us/step\n" 124 | ], 125 | "name": "stdout" 126 | } 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "metadata": { 132 | "id": "bUTuw8cRFobw", 133 | "colab_type": "code", 134 | "colab": {} 135 | }, 136 | "source": [ 137 | "sts_dev = pandas.read_table(\n", 138 | " os.path.join(os.path.dirname(sts_dataset), \"stsbenchmark\", \"sts-dev.csv\"),\n", 139 | " error_bad_lines=False,\n", 140 | " skip_blank_lines=True,\n", 141 | " usecols=[4, 5, 6],\n", 142 | " names=[\"sim\", \"sent_1\", \"sent_2\"])" 143 | ], 144 | "execution_count": 0, 145 | "outputs": [] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "metadata": { 150 | "id": "6KCMEB5GF3Wu", 151 | "colab_type": "code", 152 | "colab": {} 153 | }, 154 | "source": [ 155 | "sts_test = pandas.read_table(\n", 156 | " os.path.join(\n", 157 | " os.path.dirname(sts_dataset), \"stsbenchmark\", \"sts-test.csv\"),\n", 158 | " error_bad_lines=False,\n", 159 | " quoting=csv.QUOTE_NONE,\n", 160 | " skip_blank_lines=True,\n", 161 | " usecols=[4, 5, 6],\n", 162 | " names=[\"sim\", \"sent_1\", \"sent_2\"])" 163 | ], 164 | "execution_count": 0, 165 | "outputs": [] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "metadata": { 170 | "id": "XNNkWsxWF6hV", 171 | "colab_type": "code", 172 | "colab": {} 173 | }, 174 | "source": [ 175 | "sts_dev = sts_dev[[isinstance(s, str) for s in sts_dev['sent_2']]]" 176 | ], 177 | "execution_count": 0, 178 | "outputs": [] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "metadata": { 183 | "id": "VRxwa4iVF-Qx", 184 | "colab_type": "code", 185 | "colab": {} 186 | }, 187 | "source": [ 188 | "sts_data = sts_dev #@param [\"sts_dev\", \"sts_test\"] {type:\"raw\"}\n", 189 | "\n", 190 | "def run_sts_benchmark(batch):\n", 191 | " sts_encode1 = tf.nn.l2_normalize(embed(tf.constant(batch['sent_1'].tolist())), axis=1)\n", 192 | " sts_encode2 = tf.nn.l2_normalize(embed(tf.constant(batch['sent_2'].tolist())), axis=1)\n", 193 | " cosine_similarities = tf.reduce_sum(tf.multiply(sts_encode1, sts_encode2), 
axis=1)\n", 194 | " clip_cosine_similarities = tf.clip_by_value(cosine_similarities, -1.0, 1.0)\n", 195 | " scores = 1.0 - tf.acos(clip_cosine_similarities)\n", 196 | " # angular similarity scores: 1 - arccos(cosine similarity)\n", 197 | " return scores" 198 | ], 199 | "execution_count": 0, 200 | "outputs": [] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "metadata": { 205 | "id": "CAoj8J_jGFF4", 206 | "colab_type": "code", 207 | "colab": {} 208 | }, 209 | "source": [ 210 | "dev_scores = sts_data['sim'].tolist()\n", 211 | "scores = []\n", 212 | "for batch in np.array_split(sts_data, 10):\n", 213 | " scores.extend(run_sts_benchmark(batch))" 214 | ], 215 | "execution_count": 0, 216 | "outputs": [] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "metadata": { 221 | "id": "8_qp2YjQGfst", 222 | "colab_type": "code", 223 | "colab": { 224 | "base_uri": "https://localhost:8080/", 225 | "height": 50 226 | }, 227 | "outputId": "4111f6b2-2d72-4b27-e8be-1bbf1f7a0ac7" 228 | }, 229 | "source": [ 230 | "pearson_correlation = scipy.stats.pearsonr(scores, dev_scores)\n", 231 | "print('Pearson correlation coefficient = {0}\\np-value = {1}'.format(\n", 232 | " pearson_correlation[0], pearson_correlation[1]))" 233 | ], 234 | "execution_count": 17, 235 | "outputs": [ 236 | { 237 | "output_type": "stream", 238 | "text": [ 239 | "Pearson correlation coefficient = 0.8334395804368266\n", 240 | "p-value = 0.0\n" 241 | ], 242 | "name": "stdout" 243 | } 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "metadata": { 249 | "id": "peOYEVaGGkRY", 250 | "colab_type": "code", 251 | "colab": {} 252 | }, 253 | "source": [ 254 | "" 255 | ], 256 | "execution_count": 0, 257 | "outputs": [] 258 | } 259 | ] 260 | } -------------------------------------------------------------------------------- /word embeddings/word embedding.md: -------------------------------------------------------------------------------- 1 | # Word Embeddings 2 | Subtask of [Representation Learning](https://paperswithcode.com/task/representation-learning) 3 | > Find more at [Papers with Code](https://paperswithcode.com/task/word-embeddings) 4 | 5 | Word embedding is the collective name for a set of language modeling and feature learning techniques in natural language processing (NLP) 6 | where words or phrases from the vocabulary are mapped to vectors of real numbers. 7 | 8 |
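As a minimal illustration of the idea (each vocabulary word mapped to a real-valued vector, with geometric closeness standing in for semantic similarity), here is a hedged sketch. The vectors are random purely for demonstration; real embeddings are learned from corpora, e.g. with word2vec, GloVe or fastText:

```python
# Toy embedding table: words mapped to vectors of real numbers.
# Random vectors for illustration only; learned embeddings would place
# semantically related words close together.
import numpy as np

rng = np.random.default_rng(0)
vocab = ["king", "queen", "apple", "banana"]
embedding_dim = 8
embeddings = {word: rng.normal(size=embedding_dim) for word in vocab}

def cosine_similarity(u: np.ndarray, v: np.ndarray) -> float:
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

print(cosine_similarity(embeddings["king"], embeddings["queen"]))
```

--------------------------------------------------------------------------------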