├── .gitignore ├── LICENSE ├── README.md ├── assignments ├── Chat │ ├── .gitignore │ ├── chat.py │ ├── dialog-ger.md │ ├── german-aixml-2.md │ ├── german-aixml.md │ ├── readme.md │ ├── requirements.txt │ └── tools.py ├── Embeddings │ ├── 1-baseline.py │ ├── 2-preprocessing.py │ ├── assginment_text_classifier.ipynb │ └── data │ │ ├── custom-emo.txt │ │ ├── germeval2018.test.txt │ │ └── germeval2018.training.txt ├── RNNs │ ├── classifying names with rnns.ipynb │ ├── shakespear-lstm.py │ └── vanishing-gradients.ipynb ├── germeval2018.test.txt ├── germeval2018.training.txt ├── goethe.txt ├── sensor-data.csv ├── tiny_goethe.ipynb └── transformer │ ├── nlp_2_transformer_offensive_language_classification.ipynb │ └── nlp_3_neural_search.ipynb ├── hello-python ├── Hello PyTorch.ipynb ├── Short Python Intro.html └── Short Python Intro.ipynb └── slides ├── Deep Learning - Hello Python.pdf ├── Deep NLP 1 Recurrent Neural Networks.pdf ├── Deep NLP 2 Word Vectors and Transfer Learning.pdf └── Deep NLP 3 Transforners and Attention.pdf /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # logs 7 | */logs/* 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # pyenv 79 | .python-version 80 | 81 | # celery beat schedule file 82 | celerybeat-schedule 83 | 84 | # SageMath parsed files 85 | *.sage.py 86 | 87 | # Environments 88 | .env 89 | .venv 90 | env/ 91 | venv/ 92 | ENV/ 93 | env.bak/ 94 | venv.bak/ 95 | 96 | # Spyder project settings 97 | .spyderproject 98 | .spyproject 99 | 100 | # Rope project settings 101 | .ropeproject 102 | 103 | # mkdocs documentation 104 | /site 105 | 106 | # mypy 107 | .mypy_cache/ 108 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Oliver Guhr 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HTW Dresden NLP Lecture 2 | 3 | This repository contains NLP-related material for the I833 Deep Learning course at the University of Applied Sciences Dresden. 4 | 5 | You can find all the CNN examples in this [repository](https://github.com/tneumann/htw_cnn_lecture) 6 | 7 | 8 | ## Hello Python - a brief introduction 9 | 10 | [Slides](https://github.com/oliverguhr/htw-nlp-lecture/blob/master/slides/Deep%20Learning%20-%20Hello%20Python.pdf) 11 | 12 | [Hello Python Notebook](./hello-python/Short%20Python%20Intro.ipynb) 13 | 14 | [Hello PyTorch Notebook](./hello-python/Hello%20PyTorch.ipynb) 15 | 16 | ## Introduction to RNNs 17 | 18 | [Slides](./slides/Deep%20NLP%201%20Recurrent%20Neural%20Networks.pdf) 19 | 20 | [Vanishing Gradients Notebook](./assignments/RNNs/vanishing-gradients.ipynb) 21 | 22 | 23 | [Classifying Names with a Character-Level RNN](./assignments/RNNs/classifying%20names%20with%20rnns.ipynb) 24 | 25 | ## Word Vectors and Transfer Learning 26 | 27 | [Slides](./slides/Deep%20NLP%202%20Word%20Vectors%20and%20Transfer%20Learning.pdf) 28 | 29 | Code for a simple offensive language classifier for German texts.
30 | 31 | [Offensive Language Classification](./assignments/transformer/nlp_2_transformer_offensive_language_classification.ipynb) 32 | 33 | 34 | 35 | ## Transforners and Attention 36 | 37 | [Slides](./slides/Deep%20NLP%203%20Transforners%20and%20Attention.pdf) 38 | 39 | Implement an Neural search using Transformers. 40 | 41 | [Neural Search](./assignments/transformer/nlp_3_neural_search.ipynb) 42 | -------------------------------------------------------------------------------- /assignments/Chat/.gitignore: -------------------------------------------------------------------------------- 1 | chat-final.py 2 | *.pickle 3 | __pychache__ -------------------------------------------------------------------------------- /assignments/Chat/chat.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from termcolor import colored 3 | from typing import List 4 | import random 5 | import pickle 6 | import os 7 | import re, locale 8 | import torch 9 | from transformers import AutoModel, AutoTokenizer 10 | from tqdm import tqdm 11 | import tools 12 | from tools import Pattern 13 | 14 | 15 | tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased") 16 | model = AutoModel.from_pretrained("bert-base-german-cased") 17 | 18 | def encode(text): 19 | tokens = [tokenizer.encode(text, add_special_tokens=False)] 20 | 21 | input_ids = torch.tensor(tokens) 22 | with torch.no_grad(): 23 | all_hidden_states, _ = model(input_ids) 24 | 25 | # todo: implement a pooling strategy to generate a document vector 26 | # tip: take a look at slides from our last lesson 27 | document_vector = None # <- you code :) 28 | 29 | assert(np.shape(document_vector) == (768,)) # <- the output should have this shape 30 | return document_vector 31 | 32 | def load_chat_texts(): 33 | patterns = tools.load_file('dialog-ger.md') 34 | patterns.extend(tools.load_file('german-aixml.md')) 35 | patterns.extend(tools.load_file('german-aixml-2.md')) 36 | return 
def encode_chat_text_to_vectors(patterns, cache_file="tmp.pickle"):
    """Encode every input sentence of ``patterns`` into a vector, with caching.

    The result is cached on disk in ``cache_file``. When that file exists it
    is loaded and returned as-is, so ``encode`` is not called again.

    Args:
        patterns: Iterable of pattern objects exposing an ``input`` list of
            sentences (as produced by ``tools.load_file``).
        cache_file: Path of the pickle cache. Defaults to ``"tmp.pickle"``,
            the name the script has always used.

    Returns:
        A ``(doc, vectors, response_patterns)`` tuple of equal length:
        ``doc[i]`` is an input sentence, ``vectors[i]`` its encoding (stacked
        into a NumPy array) and ``response_patterns[i]`` the pattern the
        sentence belongs to.
    """
    # NOTE: the cache is never invalidated -- if the dialog files change,
    # delete the cache file by hand, otherwise stale vectors are returned.
    # NOTE(security): the cache is read with pickle; only load files that
    # this program wrote itself.
    cached = tools.load_if_exists(cache_file)
    if cached is not None:
        doc, doc_vecs, response_patterns = cached
        return doc, np.array(doc_vecs), response_patterns

    doc = []
    doc_vecs = []
    response_patterns = []
    print("encoding sentences")
    for pattern in tqdm(patterns, total=len(patterns)):
        doc.extend(pattern.input)
        doc_vecs.extend(encode(line) for line in pattern.input)
        # Repeat the pattern once per sentence so all three lists share
        # the same index.
        response_patterns.extend([pattern] * len(pattern.input))

    tools.save(cache_file, [doc, doc_vecs, response_patterns])
    return doc, np.array(doc_vecs), response_patterns
Create a chatbot startup :) 84 | -------------------------------------------------------------------------------- /assignments/Chat/dialog-ger.md: -------------------------------------------------------------------------------- 1 | ## Greeting 2 | * hallo 3 | * hi 4 | * hallo tesaro 5 | * hallo roboter 6 | * hallo robo 7 | * hallo du da 8 | * hallöchen 9 | * Moin 10 | - Hallo 11 | - Guten Tag 12 | 13 | ## trivia 1 14 | * wie gehts dir 15 | - Mir gehts super! 16 | - Heute ist mein Tag. 17 | 18 | ## trivia 2 19 | * erzähle mir einen witz 20 | - Das willst du nicht wirklich, glaub mir. 21 | 22 | ## trivia 23 | * wirklich 24 | * echt jetzt 25 | - Absolut. 26 | 27 | ## gefühl 28 | * mir gehts schlecht 29 | * mir geht es schlecht 30 | * mir geht es nicht gut 31 | * mir ist nicht gut 32 | - Das ist nicht schön. Kann ich etwas für dich tun? 33 | 34 | ## studium 35 | * mein tutor nervt 36 | - Echt? Warum? 37 | 38 | ## Super cool 39 | * das fetzt 40 | * das ist toll 41 | * das ist großartig 42 | * das ist grandios 43 | * awesome 44 | - Ja oder? 45 | 46 | ## elsterglanz 47 | * juhu juri 48 | * huhu juri 49 | - Juri ist ein echter Russe! -------------------------------------------------------------------------------- /assignments/Chat/readme.md: -------------------------------------------------------------------------------- 1 | # Bert Chatter 2 | 3 | 4 | ## Instructions 5 | 6 | 1. start by installing the requirements 7 | 8 | ``` 9 | pip3 install -r requirements.txt 10 | ``` 11 | 12 | 2. start a local Python IDE 13 | 14 | 15 | 3. Open the [code](chat.py) and fill all the comments marked with "todo" 16 | 17 | 18 | ## Tasks 19 | 20 | * Implement a pooling strategy to generate a document vector. 21 | 22 | * Implement a scoring function (dot product, cosine similarity, euclidean distance). 
class Pattern:
    """A named dialog pattern: user inputs plus the replies they may trigger."""

    def __init__(self, name: str):
        # Sentences a user might type (the "*" lines of a dialog file).
        self.input: List[str] = list()
        # Candidate answers (the indented "-" lines of a dialog file).
        self.response: List[str] = list()
        # Heading of the pattern (text after "##").
        self.name = name
        # Not populated by load_file; starts out as None.
        self.input_vectors: List[float] = None


def load_if_exists(filename: str):
    """Return the unpickled content of ``filename``, or None if it is no file.

    NOTE(security): pickle can execute arbitrary code while loading. Only use
    this on cache files written by ``save``.
    """
    if not os.path.isfile(filename):
        return None
    with open(filename, 'rb') as f:
        return pickle.load(f)


def save(filename: str, data):
    """Pickle ``data`` into ``filename``, overwriting an existing file."""
    with open(filename, 'wb') as f:
        pickle.dump(data, f)


def load_file(path):
    """Parse a markdown-style dialog file into a list of ``Pattern`` objects.

    Recognised lines:
      * ``## name``   starts a new pattern,
      * ``* text``    adds an input sentence to the current pattern,
      * an indented ``- text`` adds a response to the current pattern
        (any amount of leading whitespace is accepted).

    Response lines containing ``*`` (wildcards) are skipped, everything else
    is ignored. Entries that appear before the first ``##`` heading have no
    pattern to belong to and are ignored as well.

    Args:
        path: Path of the dialog file; it is read as UTF-8.

    Returns:
        The patterns in file order. An empty list if the file contains no
        ``##`` heading.
    """
    patterns: List[Pattern] = list()
    pattern = None
    # Explicit UTF-8: the dialog files contain umlauts and must not depend
    # on the platform's default encoding.
    with open(path, encoding='utf-8') as fp:
        for line in fp:
            if line.startswith('##'):
                if pattern is not None:
                    patterns.append(pattern)
                pattern = Pattern(line.replace('##', '', 1).strip())
            elif pattern is None:
                # Nothing to attach this line to yet.
                continue
            elif line.startswith('*'):
                pattern.input.append(line.replace('*', '', 1).strip())
            elif line[:1].isspace() and line.lstrip().startswith('-'):
                # Skip wildcard lines, they cannot be handled yet.
                if '*' not in line:
                    pattern.response.append(line.lstrip()[1:].strip())
    # Flush the last pattern; guard against files without any heading.
    if pattern is not None:
        patterns.append(pattern)
    print(f'{len(patterns)} patterns loaded from file {path}')
    return patterns
def load_data(path):
    """Read a tab-separated UTF-8 file and return one list of fields per line.

    The line terminator is not stripped, so the last field of each row still
    ends with its newline.
    """
    # Context manager so the handle is closed even if reading fails.
    with open(path, "r", encoding="utf-8") as file:
        return [line.split("\t") for line in file]


def save_data(path, data):
    """Write the strings in ``data`` to ``path`` (UTF-8), one per line.

    Lines are joined with a newline; no trailing newline is written.
    """
    with open(path, 'w', encoding="utf-8") as f:
        f.write("\n".join(data))


def transform(input_file, output_file):
    """Convert a ``text<TAB>label...`` file into fastText training format.

    Each output line looks like ``__label__<label>\\t<text>``, where text is
    the first column and label the second column of the input row.

    Rows with fewer than two columns (for example blank lines) are skipped
    instead of raising ``IndexError``. The label is stripped so that a
    two-column file does not leak its trailing newline into the label.

    Args:
        input_file: Path of the tab-separated source file.
        output_file: Path the fastText-formatted file is written to.
    """
    # load data
    rows = load_data(input_file)
    # transform it into fasttext format __label__other have a nice day
    data = [
        f"__label__{row[1].strip()}\t{row[0]}"
        for row in rows
        if len(row) >= 2
    ]
    # and save the data
    save_data(output_file, data)
# Single-pass punctuation handling for preprocess(): characters mapped to a
# space separate tokens, characters mapped to "" are dropped.
_PUNCTUATION_TABLE = str.maketrans({
    ".": " ", "@": " ", "(": " ", ")": " ", "-": " ", "/": " ", "=": " ",
    "#": "", ":": "", ",": "", "|": "", "!": "", ";": "", "\"": "",
})


def preprocess(line):
    """Normalise one raw text line before it is written for fastText.

    Steps, in order:
      1. replace smileys via ``replaceSmiley``,
      2. strip surrounding whitespace and lowercase,
      3. drop every whitespace-separated word that starts with ``@``,
      4. replace or remove punctuation (see ``_PUNCTUATION_TABLE``),
      5. spell out digits via ``replaceNumbers``.

    Args:
        line: The raw text.

    Returns:
        The normalised text.
    """
    line = replaceSmiley(line)
    line = line.strip().lower()
    # Compare by value, not identity: `word[0] is not "@"` only worked by
    # accident of string interning and is a SyntaxWarning on Python 3.8+.
    line = " ".join(word for word in line.split() if not word.startswith("@"))
    line = line.translate(_PUNCTUATION_TABLE)
    line = replaceNumbers(line)
    # Disabled experiments kept from the original script:
    #line = line.replace(" u "," und ")
    #line = cleanChars.sub('', line)
    return line
lambda precision, recall: 2 * ((precision * recall) / (precision + recall)) 59 | nexamples, recall, precision = model.test('fasttext.test') 60 | print (f'recall: {recall}' ) 61 | print (f'precision: {precision}') 62 | print (f'f1 score: {f1_score(precision,recall)}') 63 | print (f'Number of examples: {nexamples}') 64 | 65 | def transform(input_file, output_file): 66 | # load data 67 | data = load_data(input_file) 68 | # transform it into fasttext format __label__other have a nice day 69 | data = [f"__label__{line[1]}\t{preprocess(line[0])}" for line in data] 70 | # and save the data 71 | save_data(output_file,data) 72 | 73 | if __name__ == "__main__": 74 | transform("data/germeval2018.training.txt","fasttext.train") 75 | transform("data/germeval2018.test.txt","fasttext.test") 76 | 77 | # train the model 78 | model = train() 79 | test(model) 80 | 81 | -------------------------------------------------------------------------------- /assignments/Embeddings/assginment_text_classifier.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Classifying Text \n", 8 | "\n", 9 | "In this little turorial we are using PyTorch, TorchText and Byte Pair Encoding to quickly build a text classifyer." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "!pip3 install bpemb pandas torchtext torch" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "import time\n", 28 | "from bpemb import BPEmb\n", 29 | "import pandas as pd\n", 30 | "import numpy as np\n", 31 | "\n", 32 | "import torch\n", 33 | "from torchtext import data\n", 34 | "import torch.nn as nn" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "## 1. 
Load the data\n", 42 | "\n", 43 | "\n", 44 | "At first, we need to downlad the data:" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "!wget https://www.htw-dresden.de/~guhr/dist/sample/germeval2018.training.txt\n", 54 | "!wget https://www.htw-dresden.de/~guhr/dist/sample/germeval2018.test.txt" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "Now we can load the data, using pandas:" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "test_df = pd.read_csv(\"germeval2018.test.txt\", sep='\\t', header=0,encoding=\"utf-8\")\n", 71 | "train_df = pd.read_csv(\"germeval2018.training.txt\", sep='\\t', header=0,encoding=\"utf-8\")" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": { 78 | "scrolled": true 79 | }, 80 | "outputs": [], 81 | "source": [ 82 | "train_df.head()" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "# drop unused columns\n", 92 | "test_df.drop(columns=['label2'], inplace=True)\n", 93 | "train_df.drop(columns=['label2'], inplace=True)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "## 2. Data Preprocessing\n", 101 | "\n", 102 | "Now we can preprocess our dataset. 
In this step we remove all special chars and binarize our labels:" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "def clean_text (text):\n", 112 | " text = text.str.lower() # lowercase\n", 113 | " text = text.str.replace(r\"\\#\",\"\") # replaces hashtags\n", 114 | " text = text.str.replace(r\"http\\S+\",\"URL\") # remove URL addresses\n", 115 | " text = text.str.replace(r\"@\",\"\")\n", 116 | " text = text.str.replace(r\"[^A-Za-z0-9öäüÖÄÜß()!?]\", \" \")\n", 117 | " text = text.str.replace(\"\\s{2,}\", \" \")\n", 118 | " return text\n", 119 | "\n", 120 | "def convert_label(label):\n", 121 | " return 1 if label == \"OFFENSE\" else 0" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "train_df[\"text\"]=clean_text(train_df[\"text\"])\n", 131 | "test_df[\"text\"]=clean_text(test_df[\"text\"])\n", 132 | "train_df[\"label\"]=train_df[\"label\"].map(convert_label)\n", 133 | "test_df[\"label\"]=test_df[\"label\"].map(convert_label)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "# this is how our data set looks now. 
No urls no @ :)\n", 143 | "train_df.head()" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "# The following will help make the results reproducible later.\n", 153 | "# This is will make shure that you get the same result every time you train you model\n", 154 | "# Turn this off, for you final train run, to improve performance.\n", 155 | "SEED = 42\n", 156 | "\n", 157 | "torch.manual_seed(SEED)\n", 158 | "torch.backends.cudnn.deterministic = True\n", 159 | "torch.backends.cudnn.benchmark = False" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "### data magic\n", 167 | "\n", 168 | "The following class helps us to convert the pandas dataframe into an pytorch data set. You can skip that. " 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "# source : https://gist.github.com/lextoumbourou/8f90313cbc3598ffbabeeaa1741a11c8\n", 178 | "# to use DataFrame as a Data source\n", 179 | "\n", 180 | "class DataFrameDataset(data.Dataset):\n", 181 | "\n", 182 | " def __init__(self, df, fields, is_test=False, **kwargs):\n", 183 | " print(df)\n", 184 | " examples = []\n", 185 | " for i, row in df.iterrows(): \n", 186 | " label = row.label#row.target if not is_test else None \n", 187 | " text = row.text \n", 188 | " examples.append(data.Example.fromlist([text, label], fields))\n", 189 | "\n", 190 | " super().__init__(examples, fields, **kwargs)\n", 191 | "\n", 192 | " @staticmethod\n", 193 | " def sort_key(ex):\n", 194 | " return len(ex.text)\n", 195 | "\n", 196 | " @classmethod\n", 197 | " def splits(cls, fields, train_df, val_df=None, test_df=None, **kwargs):\n", 198 | " train_data, val_data, test_data = (None, None, None)\n", 199 | " data_field = fields\n", 200 | "\n", 201 | " if train_df is not None:\n", 202 | " train_data = 
cls(train_df, data_field, **kwargs)\n", 203 | " if val_df is not None:\n", 204 | " val_data = cls(val_df, data_field, **kwargs)\n", 205 | " if test_df is not None:\n", 206 | " test_data = cls(test_df, data_field, True, **kwargs)\n", 207 | "\n", 208 | " return tuple(d for d in (train_data, val_data, test_data) if d is not None)" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": {}, 214 | "source": [ 215 | "## 3. Loading the pretrained word vectors\n", 216 | "\n", 217 | "For this tutorial we are using the byte pair encoding. The great [BPEmb](https://pypi.org/project/bpemb/) library helps us the encode the text and provides pretrained models for a lot of languages." 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "from collections import Counter\n", 227 | "from torchtext import vocab\n", 228 | "\n", 229 | "bpemb_de = BPEmb(lang=\"de\", vs=10000)\n", 230 | "bpemb_de_counter = Counter(bpemb_de.words)\n", 231 | "bpemb_de_stoi = {word:i for i, word in enumerate(bpemb_de.words)}\n", 232 | "\n", 233 | "bpemb_vocab = vocab.Vocab(counter = bpemb_de_counter)\n", 234 | "bpemb_vocab.set_vectors(stoi = bpemb_de_stoi, vectors = torch.tensor(bpemb_de.vectors), dim = bpemb_de.dim)\n", 235 | "\n", 236 | "bpemb_vocab.stoi = bpemb_de_stoi # pytorch overwrite our tokens, so we need to reset them\n" 237 | ] 238 | }, 239 | { 240 | "cell_type": "markdown", 241 | "metadata": {}, 242 | "source": [ 243 | "The byte pair encoding turns words into tokens. Every tokens has an id and a coresponding vector that we can feed to our neural network." 
 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "tokens = bpemb_de.encode_with_bos_eos(\"das ist ein test\")\n", 253 | "print(tokens)\n", 254 | "\n", 255 | "token_ids = bpemb_de.encode_ids_with_bos_eos(\"das ist ein test\")\n", 256 | "print(token_ids)\n" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [ 265 | "# and this is how the vector for the \"_das\" token looks like:\n", 266 | "bpemb_de.vectors[99]\n", 267 | "#[bpemb_de.vectors[id] for id in token_ids] # vectors for all tokens" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "## 4. Load Train and Valid Data Sets\n", 275 | "\n", 276 | "First, we define how the TEXT and the LABELs will be encoded. That's what the Field objects do. With these fields and the class we defined above, we can create a data set."
277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "TEXT = data.Field(tokenize= bpemb_de.encode,init_token ='', eos_token='',pad_token=\"\",use_vocab = True, batch_first = True,sequential=True )\n", 286 | "\n", 287 | "TEXT.vocab = bpemb_vocab # -> assign our byte pair endcoing module\n", 288 | "LABEL = data.LabelField(dtype = torch.float, use_vocab = False)\n", 289 | "\n", 290 | "fields = [('text',TEXT), ('label',LABEL)]\n", 291 | "train_ds, val_ds = DataFrameDataset.splits(fields, train_df=train_df, val_df=test_df)" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": null, 297 | "metadata": {}, 298 | "outputs": [], 299 | "source": [ 300 | "# Lets look at a the first example\n", 301 | "print(vars(train_ds[0]))" 302 | ] 303 | }, 304 | { 305 | "cell_type": "markdown", 306 | "metadata": {}, 307 | "source": [ 308 | "### Batch Iterator\n", 309 | "\n", 310 | "With this data set we can now create a iterator that prepares the batches for us." 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": null, 316 | "metadata": {}, 317 | "outputs": [], 318 | "source": [ 319 | "BATCH_SIZE = 64\n", 320 | "\n", 321 | "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", 322 | "\n", 323 | "train_iterator, valid_iterator = data.Iterator.splits(\n", 324 | " (train_ds, val_ds), \n", 325 | " batch_size = BATCH_SIZE,\n", 326 | " shuffle = True, \n", 327 | " device = device)" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "metadata": {}, 334 | "outputs": [], 335 | "source": [ 336 | "# This is how a batch looks like. 
Do you know why our texts a still id's?\n", 337 | "\n", 338 | "batch = next(iter(train_iterator))\n", 339 | "\n", 340 | "print(batch.label)\n", 341 | "print(batch.text)" 342 | ] 343 | }, 344 | { 345 | "cell_type": "markdown", 346 | "metadata": {}, 347 | "source": [ 348 | "## 5. Define the Model\n", 349 | "\n", 350 | "Now its finally time to define our model:" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": null, 356 | "metadata": {}, 357 | "outputs": [], 358 | "source": [ 359 | "class SimpleModel(nn.Module):\n", 360 | " def __init__(self, weights,embedding_length = 100):\n", 361 | " super(SimpleModel, self).__init__()\n", 362 | " \n", 363 | " # these three lines load to pretrained vecotrs into our embedding layer\n", 364 | " vocab_size= len(weights) \n", 365 | " self.word_embeddings = nn.Embedding(vocab_size, embedding_length) \n", 366 | " self.word_embeddings.weight = nn.Parameter(weights, requires_grad=False) \n", 367 | " \n", 368 | " def forward(self, input_sentences):\n", 369 | " input = self.word_embeddings(input_sentences) # <-- here we turn our ids into actual vectors\n", 370 | " \n", 371 | " # since our sentences are do not have a equal length, we can't simply feed them \n", 372 | " # into a feed forward network. How can we solve that?\n", 373 | " \n", 374 | " return input # " 375 | ] 376 | }, 377 | { 378 | "cell_type": "markdown", 379 | "metadata": {}, 380 | "source": [ 381 | "## 6. Train the model\n", 382 | "\n", 383 | "First we define a set of helper funtions, to make our live a bit easier. " 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": null, 389 | "metadata": {}, 390 | "outputs": [], 391 | "source": [ 392 | "def binary_accuracy(preds, y):\n", 393 | " \"\"\"\n", 394 | " Returns accuracy per batch, i.e. 
if you get 8/10 right, this returns 0.8, NOT 8\n", 395 | " \"\"\"\n", 396 | "\n", 397 | " #round predictions to the closest integer\n", 398 | " rounded_preds = torch.round(torch.sigmoid(preds))\n", 399 | " correct = (rounded_preds == y).float() #convert into float for division \n", 400 | " acc = correct.sum() / len(correct)\n", 401 | " return acc" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": null, 407 | "metadata": {}, 408 | "outputs": [], 409 | "source": [ 410 | "# we moved the training of a single batch into a method for convenience\n", 411 | "def train(model, iterator):\n", 412 | " \n", 413 | " epoch_loss = 0\n", 414 | " epoch_acc = 0\n", 415 | " \n", 416 | " model.train()\n", 417 | " \n", 418 | " for batch in iterator:\n", 419 | " text = batch.text\n", 420 | " optimizer.zero_grad()\n", 421 | " predictions = model(text).squeeze(1) \n", 422 | " loss = criterion(predictions, batch.label)\n", 423 | " acc = binary_accuracy(predictions, batch.label)\n", 424 | " \n", 425 | " loss.backward()\n", 426 | " optimizer.step()\n", 427 | " \n", 428 | " epoch_loss += loss.item()\n", 429 | " epoch_acc += acc.item()\n", 430 | " \n", 431 | "\n", 432 | " return epoch_loss / len(iterator), epoch_acc / len(iterator)" 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": null, 438 | "metadata": {}, 439 | "outputs": [], 440 | "source": [ 441 | "# ...same with the eval code\n", 442 | "def evaluate(model, iterator):\n", 443 | " \n", 444 | " epoch_acc = 0\n", 445 | " model.eval()\n", 446 | " \n", 447 | " with torch.no_grad():\n", 448 | " for batch in iterator:\n", 449 | " text = batch.text\n", 450 | " predictions = model(text).squeeze(1)\n", 451 | " acc = binary_accuracy(predictions, batch.label)\n", 452 | " \n", 453 | " epoch_acc += acc.item()\n", 454 | " \n", 455 | " return epoch_acc / len(iterator)" 456 | ] 457 | }, 458 | { 459 | "cell_type": "markdown", 460 | "metadata": {}, 461 | "source": [ 462 | "### Now we can create an 
instance of our model, with the pretrained byte pair vectors." 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": null, 468 | "metadata": {}, 469 | "outputs": [], 470 | "source": [ 471 | "model = SimpleModel(torch.tensor(bpemb_de.vectors))\n", 472 | "model.to(device)\n", 473 | "\n", 474 | "learning_rate = 0.001\n", 475 | "\n", 476 | "criterion = nn.BCEWithLogitsLoss()\n", 477 | "\n", 478 | "optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": null, 484 | "metadata": {}, 485 | "outputs": [], 486 | "source": [ 487 | "num_epochs = 10\n", 488 | "loss=[]\n", 489 | "acc=[]\n", 490 | "val_acc=[]\n", 491 | "\n", 492 | "for epoch in range(num_epochs):\n", 493 | " \n", 494 | " train_loss, train_acc = train(model, train_iterator)\n", 495 | " valid_acc = evaluate(model, valid_iterator)\n", 496 | " \n", 497 | " print(f'{epoch} Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. 
Acc: {valid_acc*100:.2f}%') \n", 498 | " \n", 499 | " loss.append(train_loss)\n", 500 | " acc.append(train_acc)\n", 501 | " val_acc.append(valid_acc)\n", 502 | " \n" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": null, 508 | "metadata": {}, 509 | "outputs": [], 510 | "source": [ 511 | "import matplotlib.pyplot as plt\n", 512 | "\n", 513 | "epochs = range(1,num_epochs+1)\n", 514 | "#plt.plot(epochs, loss, 'g', label='Training loss')\n", 515 | "plt.plot(epochs, acc, 'b', label='Training acc')\n", 516 | "plt.plot(epochs, val_acc, 'r', label='validation acc')\n", 517 | "plt.title('Training and Validation loss')\n", 518 | "plt.xlabel('Epochs')\n", 519 | "plt.ylabel('Loss')\n", 520 | "plt.legend()\n", 521 | "plt.show()" 522 | ] 523 | }, 524 | { 525 | "cell_type": "code", 526 | "execution_count": null, 527 | "metadata": {}, 528 | "outputs": [], 529 | "source": [] 530 | }, 531 | { 532 | "cell_type": "markdown", 533 | "metadata": {}, 534 | "source": [ 535 | "# Tasks\n", 536 | "\n", 537 | "1. Implement a feed forward neural network classifier\n", 538 | "\n", 539 | "2. Try to improve the results. What happens when\n", 540 | " * you use more layers\n", 541 | " * more neurons\n", 542 | " * a bigger vocabulary size\n", 543 | " \n", 544 | "3. Try different models:\n", 545 | " * Use LSTMs \n", 546 | " * Did you know that you can use a CNN to classify text?" 
547 | ] 548 | }, 549 | { 550 | "cell_type": "code", 551 | "execution_count": null, 552 | "metadata": {}, 553 | "outputs": [], 554 | "source": [] 555 | } 556 | ], 557 | "metadata": { 558 | "kernelspec": { 559 | "display_name": "Python 3", 560 | "language": "python", 561 | "name": "python3" 562 | }, 563 | "language_info": { 564 | "codemirror_mode": { 565 | "name": "ipython", 566 | "version": 3 567 | }, 568 | "file_extension": ".py", 569 | "mimetype": "text/x-python", 570 | "name": "python", 571 | "nbconvert_exporter": "python", 572 | "pygments_lexer": "ipython3", 573 | "version": "3.8.2" 574 | } 575 | }, 576 | "nbformat": 4, 577 | "nbformat_minor": 4 578 | } 579 | -------------------------------------------------------------------------------- /assignments/Embeddings/data/custom-emo.txt: -------------------------------------------------------------------------------- 1 | 🤢 Negative 2 | 😡 Negative 3 | 🤮 Negative 4 | 💩 Negative -------------------------------------------------------------------------------- /assignments/RNNs/shakespear-lstm.py: -------------------------------------------------------------------------------- 1 | ''' 2 | #Example script to generate text from Nietzsche's writings. 3 | 4 | At least 20 epochs are required before the generated text 5 | starts sounding coherent. 6 | 7 | It is recommended to run this script on GPU, as recurrent 8 | networks are quite computationally intensive. 9 | 10 | If you try this script on new data, make sure your corpus 11 | has at least ~100k characters. ~1M is better. 
12 | 13 | 14 | You can try some other texts too: 15 | 16 | 17 | What about Tolstoys Anna Karenina: 18 | https://raw.githubusercontent.com/udacity/deep-learning/master/tensorboard/anna.txt 19 | 20 | Or some Nietzsche: 21 | https://s3.amazonaws.com/text-datasets/nietzsche.txt 22 | 23 | Germany Wikipedia Articles: 24 | https://www2.htw-dresden.de/~guhr/dist/wiki.txt 25 | 26 | Shakesspears Sonnets: 27 | https://raw.githubusercontent.com/vivshaw/shakespeare-LSTM/master/sonnets.txt 28 | ''' 29 | 30 | from __future__ import print_function 31 | from keras.callbacks import LambdaCallback, TensorBoard 32 | from keras.models import Sequential 33 | from keras.layers import Dense 34 | from keras.layers import LSTM, CuDNNLSTM, CuDNNGRU, Dropout 35 | from keras.optimizers import RMSprop, SGD, Nadam 36 | from keras.utils.data_utils import get_file 37 | import numpy as np 38 | import random 39 | import sys 40 | import io 41 | from datetime import datetime 42 | import re 43 | 44 | path = get_file( 45 | 'shakespear.txt', 46 | origin='https://cs.stanford.edu/people/karpathy/char-rnn/shakespear.txt') 47 | 48 | 49 | with io.open(path, encoding='utf-8') as f: 50 | text = f.read().lower() 51 | print('corpus length:', len(text)) 52 | 53 | # build lookup table 54 | chars = sorted(list(set(text))) 55 | print('total chars:', len(chars)) 56 | char_indices = dict((c, i) for i, c in enumerate(chars)) 57 | indices_char = dict((i, c) for i, c in enumerate(chars)) 58 | 59 | # cut the text in semi-redundant sequences of maxlen characters 60 | # How does the network react when you change the sequence length or stepsize 61 | maxlen = 40 62 | step = 3 63 | sentences = [] 64 | next_chars = [] 65 | for i in range(0, len(text) - maxlen, step): 66 | sentences.append(text[i: i + maxlen]) 67 | next_chars.append(text[i + maxlen]) 68 | print('nb sequences:', len(sentences)) 69 | 70 | print('Vectorization...') 71 | x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool) 72 | y = 
np.zeros((len(sentences), len(chars)), dtype=np.bool) 73 | for i, sentence in enumerate(sentences): 74 | for t, char in enumerate(sentence): 75 | x[i, t, char_indices[char]] = 1 76 | y[i, char_indices[next_chars[i]]] = 1 77 | 78 | 79 | # build the model: a single LSTM layer 80 | # experiment: 81 | # - add some more neurons 82 | # - add some more layers 83 | # - add dropout 84 | # - try out GRU's 85 | 86 | print('Build model...') 87 | model = Sequential() 88 | model.add(CuDNNLSTM(128,input_shape=(maxlen, len(chars)))) 89 | model.add(Dense(len(chars), activation='softmax')) 90 | 91 | rms = RMSprop(lr=0.01) 92 | # try some other optimizers 93 | #sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True) 94 | #nadam = Nadam(lr=0.002, beta_1=0.9, beta_2=0.999, epsilon=None, schedule_decay=0.004) 95 | model.compile(loss='categorical_crossentropy', optimizer=rms) 96 | 97 | 98 | def sample(preds, temperature=1.0): 99 | # helper function to sample an index from a probability array 100 | # Read more about this softmax with temperature here: 101 | # Distilling the Knowledge in a Neural Network (Geoffrey Hinton, Oriol Vinyals, Jeff Dean) 102 | # https://arxiv.org/abs/1503.02531 103 | preds = np.asarray(preds).astype('float64') 104 | preds = np.log(preds) / temperature 105 | exp_preds = np.exp(preds) 106 | preds = exp_preds / np.sum(exp_preds) 107 | probas = np.random.multinomial(1, preds, 1) 108 | return np.argmax(probas) 109 | 110 | 111 | def on_epoch_end(epoch, _): 112 | # Function invoked at end of each epoch. Prints generated text. 
113 | print() 114 | print('----- Generating text after Epoch: %d' % epoch) 115 | 116 | start_index = random.randint(0, len(text) - maxlen - 1) 117 | for diversity in [0.8, 1.0, 1.2]: 118 | print('----- diversity:', diversity) 119 | 120 | generated = '' 121 | sentence = text[start_index: start_index + maxlen] 122 | generated += sentence 123 | print('----- Generating with seed: "' + sentence + '"') 124 | sys.stdout.write(generated) 125 | sys.stdout.write("\n----- result ------\n") 126 | for i in range(300): 127 | x_pred = np.zeros((1, maxlen, len(chars))) 128 | for t, char in enumerate(sentence): 129 | x_pred[0, t, char_indices[char]] = 1. 130 | 131 | preds = model.predict(x_pred, verbose=0)[0] 132 | next_index = sample(preds, diversity) 133 | next_char = indices_char[next_index] 134 | 135 | sentence = sentence[1:] + next_char 136 | 137 | sys.stdout.write(next_char) 138 | sys.stdout.flush() 139 | print() 140 | 141 | # print some text with the current model 142 | print_callback = LambdaCallback(on_epoch_end=on_epoch_end) 143 | 144 | # train the model 145 | model.fit(x, y, 146 | batch_size=128, 147 | epochs=90, 148 | callbacks=[print_callback]) 149 | 150 | # save the model 151 | model.save("shakespear-rnn") -------------------------------------------------------------------------------- /assignments/RNNs/vanishing-gradients.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 11, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%matplotlib inline\n", 10 | "import matplotlib.pyplot as plt\n", 11 | "import torch\n", 12 | "import torch.nn as nn\n", 13 | "import torch.nn.functional as F\n", 14 | "plt.rcParams[\"figure.figsize\"] = (12, 9)" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Vanishing & Exploding Gradient Problem\n", 22 | "\n", 23 | "By gradient, we mean the gradient of the loss function with 
respect to the weights of the neural network. As you already learned, this gradient is calculated using backpropagation. \n", 24 | "\n", 25 | "* What you should know:\n", 26 | " * [Backpropagation](https://www.youtube.com/watch?v=tIeHLnjs5U8)\n", 27 | " \n", 28 | "* Video for this topic:\n", 29 | " * [Vanishing & Exploding Gradient explained](https://www.youtube.com/watch?v=qO_NLVjD6zE)" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "### What happens during backpropagation?\n", 37 | "\n", 38 | "Let's start with the loss function. If we use an MSE loss (or cost) function, it is calculated this way:\n", 39 | "\n", 40 | "$ Loss = (a_{L} -y)^2 $\n", 41 | "\n", 42 | "Where $ y $ denotes the desired output of the network and $a_{L}$ is the activation of the last neuron.\n", 43 | "\n", 44 | "\n", 45 | "$ a_{L} = \\sigma( w_{L}a_{L-1}+b_{L})$\n", 46 | "\n", 47 | "\n", 48 | "Let's define $ in $ as the input of our network and \n", 49 | "write down a 5-layer, single-neuron network:\n", 50 | "\n", 51 | "\n", 52 | "$ a_{1} = \\sigma( w_{1} in +b_{1}) $ \n", 53 | "\n", 54 | "$ a_{2} = \\sigma( w_{2}a_{1}+b_{2}) $ \n", 55 | "\n", 56 | "$ a_{3} = \\sigma( w_{3}a_{2}+b_{3}) $ \n", 57 | "\n", 58 | "$ a_{4} = \\sigma( w_{4}a_{3}+b_{4}) $ \n", 59 | "\n", 60 | "$ a_{5} = \\sigma( w_{5}a_{4}+b_{5}) $ \n", 61 | "\n", 62 | "$ Loss = (a_{5} -y)^2 $\n", 63 | "\n", 64 | "With the help of the backpropagation algorithm, we can adjust the weights. As you know, backpropagation uses derivatives to calculate the weight changes. What do these derivatives look like?\n" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "### A close look at the derivative of our activation function" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 12, 77 | "metadata": {}, 78 | "outputs": [ 79 | { 80 | "data": { 81 | "image/png": "\n", 82 | "text/plain": [ 83 | "
" 84 | ] 85 | }, 86 | "metadata": { 87 | "needs_background": "light" 88 | }, 89 | "output_type": "display_data" 90 | } 91 | ], 92 | "source": [ 93 | "activationFunction = nn.Sigmoid() # try this for Sigmoid, Tanh, ReLU\n", 94 | "\n", 95 | "for i in range(-50,50):\n", 96 | " data = torch.tensor([i/10],requires_grad=True)\n", 97 | " calc = activationFunction(data)\n", 98 | " calc.backward() \n", 99 | " plt.plot(i/10,data.grad[0], 'ro') \n", 100 | "\n", 101 | "plt.show()" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "The maximum of the derivative of the sigmoid function is 0.25 at $x = 0$ and much smaller for all other values. \n", 109 | "\n", 110 | "\n", 111 | "### Let's implement a small network...\n", 112 | "so we can see what's going on here. We use the 5-layer, single-neuron network that we defined earlier. \n", 113 | "\n", 114 | "Let's say our network should negate a number. If we put in a $1$, it should return a $-1$. For simplicity, we do not use biases in this tiny example." 
115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 3, 120 | "metadata": {}, 121 | "outputs": [ 122 | { 123 | "data": { 124 | "text/plain": [ 125 | "tensor([1.1797, 1.3934, 0.2839, 0.1575, 1.3092], grad_fn=)" 126 | ] 127 | }, 128 | "execution_count": 3, 129 | "metadata": {}, 130 | "output_type": "execute_result" 131 | } 132 | ], 133 | "source": [ 134 | "number_of_layers = 5\n", 135 | "weight_tensor = torch.randn((1, number_of_layers), requires_grad=True)\n", 136 | "weight = weight_tensor[0].abs() \n", 137 | "net_input = 1 \n", 138 | "y = net_output = -1\n", 139 | "weight" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 4, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "layer = activationFunction(weight[0] * net_input) \n", 149 | "\n", 150 | "for i in range(1,number_of_layers):\n", 151 | " layer = activationFunction(weight[i] * layer) \n", 152 | " \n", 153 | "loss= (layer - y)**2\n", 154 | "\n", 155 | "loss.backward()" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 5, 161 | "metadata": {}, 162 | "outputs": [ 163 | { 164 | "data": { 165 | "text/plain": [ 166 | "tensor([[1.2801e-04, 3.9076e-04, 7.0230e-03, 1.3398e-01, 3.8725e-01]])" 167 | ] 168 | }, 169 | "execution_count": 5, 170 | "metadata": {}, 171 | "output_type": "execute_result" 172 | } 173 | ], 174 | "source": [ 175 | "# we can obtian the gradient of the weights with:\n", 176 | "weight_tensor.grad" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": {}, 182 | "source": [ 183 | "From right to left, we can see how the weight delta gets smaller. Since they are a product of numbers smaller than one, they can only get even smaller." 
 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 6, 189 | "metadata": {}, 190 | "outputs": [ 191 | { 192 | "name": "stdout", 193 | "output_type": "stream", 194 | "text": [ 195 | "wieght update:\n", 196 | "\n", 197 | "weight delta \t\t* learning rate \t= update step\n", 198 | "0.0001280104479520 \t* 0.01 \t\t\t= 0.0000012801044795\n", 199 | "\n", 200 | "old weight:1.1797258853912354\n", 201 | "new weight:1.1797246052867558\n" 202 | ] 203 | } 204 | ], 205 | "source": [ 206 | "learning_rate = 0.01\n", 207 | "weight_neuron_one = weight_tensor[0][0].item()\n", 208 | "weight_grad_neuron_one = weight_tensor.grad[0][0].item()\n", 209 | "\n", 210 | "weight_update = weight_grad_neuron_one * learning_rate\n", 211 | "\n", 212 | "print(\"wieght update:\\n\")\n", 213 | "print(\"weight delta \\t\\t* learning rate \\t= update step\")\n", 214 | "print(f\"{weight_grad_neuron_one:.16f} \\t* {learning_rate} \\t\\t\\t= {weight_update:.16f}\")\n", 215 | "print(\"\")\n", 216 | "print(f\"old weight:{weight_neuron_one}\")\n", 217 | "print(f\"new weight:{weight_neuron_one - weight_update}\")" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 13, 223 | "metadata": {}, 224 | "outputs": [ 225 | { 226 | "data": { 227 | "image/png": "\n", 228 | "text/plain": [ 229 | "
" 230 | ] 231 | }, 232 | "metadata": { 233 | "needs_background": "light" 234 | }, 235 | "output_type": "display_data" 236 | } 237 | ], 238 | "source": [ 239 | "fig, ax = plt.subplots()\n", 240 | "ax.plot(range(1,number_of_layers+1),weight_tensor.grad[0].numpy())\n", 241 | "\n", 242 | "ax.set(xlabel='weight update', ylabel='Layer',\n", 243 | " title='Gradient w.r.t. the weights')\n", 244 | "ax.grid()\n", 245 | "\n", 246 | "plt.show()" 247 | ] 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "metadata": {}, 252 | "source": [ 253 | "As you can see, the gradient of our toy network quickly gets close to zero. That's why this problem is called the vanishing gradient problem. Since the updates to the weights are so small, they also don't help to reduce the loss.\n", 254 | "\n", 255 | "The opposite of this is called the exploding gradient problem; it happens when the weights (or the derivatives of the activations) are large, so the product of these factors grows from layer to layer instead of shrinking.\n", 256 | "\n", 257 | "\n", 258 | "## Tasks\n", 259 | "\n", 260 | "1. Try different activation functions.\n", 261 | "2. Increase the number of layers.\n", 262 | "3. Modify the code so that the gradient explodes." 
 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [] 271 | } 272 | ], 273 | "metadata": { 274 | "kernelspec": { 275 | "display_name": "Python 3", 276 | "language": "python", 277 | "name": "python3" 278 | }, 279 | "language_info": { 280 | "codemirror_mode": { 281 | "name": "ipython", 282 | "version": 3 283 | }, 284 | "file_extension": ".py", 285 | "mimetype": "text/x-python", 286 | "name": "python", 287 | "nbconvert_exporter": "python", 288 | "pygments_lexer": "ipython3", 289 | "version": "3.6.7" 290 | } 291 | }, 292 | "nbformat": 4, 293 | "nbformat_minor": 2 294 | } 295 | -------------------------------------------------------------------------------- /assignments/transformer/nlp_2_transformer_offensive_language_classification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "kernelspec": { 6 | "display_name": "Python 3", 7 | "language": "python", 8 | "name": "python3" 9 | }, 10 | "language_info": { 11 | "codemirror_mode": { 12 | "name": "ipython", 13 | "version": 3 14 | }, 15 | "file_extension": ".py", 16 | "mimetype": "text/x-python", 17 | "name": "python", 18 | "nbconvert_exporter": "python", 19 | "pygments_lexer": "ipython3", 20 | "version": "3.6.9" 21 | }, 22 | "colab": { 23 | "name": "nlp-2-transformer-offensive-language-classification.ipynb", 24 | "private_outputs": true, 25 | "provenance": [], 26 | "include_colab_link": true 27 | }, 28 | "accelerator": "GPU" 29 | }, 30 | "cells": [ 31 | { 32 | "cell_type": "markdown", 33 | "metadata": { 34 | "id": "view-in-github", 35 | "colab_type": "text" 36 | }, 37 | "source": [ 38 | "\"Open" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": { 44 | "id": "1x7ywAcjTMyk" 45 | }, 46 | "source": [ 47 | "# Offensive Language Classification\n", 48 | "\n", 49 | "\n", 50 | "## First Steps\n", 51 | "\n", 52 | 
"We need to download the required packages and our training data." 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "metadata": { 58 | "id": "T6cFhLiDTMyk" 59 | }, 60 | "source": [ 61 | "!pip install datasets transformers accelerate" 62 | ], 63 | "execution_count": null, 64 | "outputs": [] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "metadata": { 69 | "id": "lqhVeVEnTMyl" 70 | }, 71 | "source": [ 72 | "!wget -c https://www.htw-dresden.de/~guhr/dist/sample/germeval2018.training.txt\n", 73 | "!wget -c https://www.htw-dresden.de/~guhr/dist/sample/germeval2018.test.txt" 74 | ], 75 | "execution_count": null, 76 | "outputs": [] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "metadata": { 81 | "id": "ONd9nMwMTMyl" 82 | }, 83 | "source": [ 84 | "import time\n", 85 | "import pandas as pd\n", 86 | "import numpy as np" 87 | ], 88 | "execution_count": null, 89 | "outputs": [] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "metadata": { 94 | "id": "4Ca0b7_IpH1M" 95 | }, 96 | "source": [ 97 | "# check if we have a GPU\n", 98 | "!nvidia-smi" 99 | ], 100 | "execution_count": null, 101 | "outputs": [] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": { 106 | "id": "5wCOi_UiTMyl" 107 | }, 108 | "source": [ 109 | "## Preparing the data\n", 110 | "\n", 111 | "In the next step we have to load the data and adjust it a bit. The data is available as tab-delimited CSV. Pandas is a good choice for simple processing, but it could also be done with Python's built-in tools." 
112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "metadata": { 117 | "id": "fpvStxVhTMyl" 118 | }, 119 | "source": [ 120 | "test_df = pd.read_csv(\"germeval2018.test.txt\", sep='\\t', header=0,encoding=\"utf-8\")\n", 121 | "train_df = pd.read_csv(\"germeval2018.training.txt\", sep='\\t', header=0,encoding=\"utf-8\")" 122 | ], 123 | "execution_count": null, 124 | "outputs": [] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "metadata": { 129 | "id": "H9CkOlvaTMyl" 130 | }, 131 | "source": [ 132 | "train_df.head()" 133 | ], 134 | "execution_count": null, 135 | "outputs": [] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "metadata": { 140 | "id": "W4BsXBCsTMyn" 141 | }, 142 | "source": [ 143 | "# Since we do not need the label 2 columns, we can delete them.\n", 144 | "test_df.drop(columns=['label2'], inplace=True)\n", 145 | "train_df.drop(columns=['label2'], inplace=True)" 146 | ], 147 | "execution_count": null, 148 | "outputs": [] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "metadata": { 153 | "id": "q4P20Gf_TMyn" 154 | }, 155 | "source": [ 156 | "def clean_text (text):\n", 157 | " #text = text.str.lower() # lowercase\n", 158 | " #text = text.str.replace(r\"\\#\",\"\") # replaces hashtags\n", 159 | " #text = text.str.replace(r\"http\\S+\",\"URL\") # remove URL addresses\n", 160 | " #text = text.str.replace(r\"@\",\"\")\n", 161 | " #text = text.str.replace(r\"[^A-Za-z0-9öäüÖÄÜß()!?]\", \" \")\n", 162 | " #text = text.str.replace(\"\\s{2,}\", \" \")\n", 163 | " return text\n", 164 | "\n", 165 | "def convert_label(label):\n", 166 | " return 1 if label == \"OFFENSE\" else 0" 167 | ], 168 | "execution_count": null, 169 | "outputs": [] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "metadata": { 174 | "id": "p690qluXTMyn" 175 | }, 176 | "source": [ 177 | "train_df[\"text\"]=clean_text(train_df[\"text\"])\n", 178 | "test_df[\"text\"]=clean_text(test_df[\"text\"])\n", 179 | "train_df[\"label\"]=train_df[\"label\"].map(convert_label)\n", 180 | 
"test_df[\"label\"]=test_df[\"label\"].map(convert_label)" 181 | ], 182 | "execution_count": null, 183 | "outputs": [] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "metadata": { 188 | "id": "9BIixoz-TMyn" 189 | }, 190 | "source": [ 191 | "# this is how our data set looks now\n", 192 | "train_df.head() " 193 | ], 194 | "execution_count": null, 195 | "outputs": [] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "metadata": { 200 | "id": "HeE1qHXhTMyo" 201 | }, 202 | "source": [ 203 | "len(train_df.loc[train_df[\"label\"]==1])" 204 | ], 205 | "execution_count": null, 206 | "outputs": [] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "source": [ 211 | "from sklearn.utils import shuffle\n", 212 | "train_df = shuffle(train_df)" 213 | ], 214 | "metadata": { 215 | "id": "XYIke-q7Oqfz" 216 | }, 217 | "execution_count": null, 218 | "outputs": [] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": { 223 | "id": "T1_tIVzLTMyo" 224 | }, 225 | "source": [ 226 | "How many datasets do we have in our Train/Valid/Test sets?" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "metadata": { 232 | "id": "6rCCzWaJTMyo" 233 | }, 234 | "source": [ 235 | "print(f\"Test exampels \\t {len(test_df) }\")\n", 236 | "print(f\"Train exampels \\t {len(train_df[500:])}\")\n", 237 | "print(f\"Valid exampels \\t {len(train_df[:500])}\")" 238 | ], 239 | "execution_count": null, 240 | "outputs": [] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "metadata": { 245 | "id": "ObJ7KjhYDxCX" 246 | }, 247 | "source": [ 248 | "In the next step we convert the data in a format that our ml lib can use." 
249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "metadata": { 254 | "id": "5zRn0t3oTMyp" 255 | }, 256 | "source": [ 257 | "from datasets import Dataset\n", 258 | "\n", 259 | "train_dataset = Dataset.from_pandas(train_df[500:])\n", 260 | "valid_dataset = Dataset.from_pandas(train_df[:500])\n", 261 | "test_dataset = Dataset.from_pandas(test_df)" 262 | ], 263 | "execution_count": null, 264 | "outputs": [] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "metadata": { 269 | "id": "2qt9p2yeTMyp" 270 | }, 271 | "source": [ 272 | "# What is the shape of our dataset?\n", 273 | "train_dataset" 274 | ], 275 | "execution_count": null, 276 | "outputs": [] 277 | }, 278 | { 279 | "cell_type": "markdown", 280 | "metadata": { 281 | "id": "fUhmAob8TMyp" 282 | }, 283 | "source": [ 284 | "## Encoding of the data \n", 285 | "\n", 286 | "We convert our texts into token that our model can process." 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "metadata": { 292 | "id": "jxDv4WeXTMyp" 293 | }, 294 | "source": [ 295 | "from transformers import AutoTokenizer\n", 296 | "from datasets import load_dataset, load_metric, list_metrics\n", 297 | "\n", 298 | "\n", 299 | "# try out different models :) \n", 300 | "\n", 301 | "model_checkpoint =\"distilbert-base-multilingual-cased\"\n", 302 | "\n", 303 | "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)" 304 | ], 305 | "execution_count": null, 306 | "outputs": [] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "source": [ 311 | "!rm -rf ./test-offsive-language/checkpoint*" 312 | ], 313 | "metadata": { 314 | "id": "ZFuavLu5UlwZ" 315 | }, 316 | "execution_count": null, 317 | "outputs": [] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "metadata": { 322 | "id": "ZkNiW-kITMyp" 323 | }, 324 | "source": [ 325 | "demo_tokens = tokenizer([\"Mehr Daten führen oftmals zu besseren Ergebnissen.\", \"And this is a second sentence\"],add_special_tokens=True, truncation=True)\n", 326 | "demo_tokens" 327 | ], 328 | 
"execution_count": null, 329 | "outputs": [] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "metadata": { 334 | "id": "tjSb3j1RTMyp" 335 | }, 336 | "source": [ 337 | "tokenizer.convert_ids_to_tokens(demo_tokens['input_ids'][0])" 338 | ], 339 | "execution_count": null, 340 | "outputs": [] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "metadata": { 345 | "id": "2CBg2qhVTMyp" 346 | }, 347 | "source": [ 348 | "def example_tokenizer(examples):\n", 349 | " return tokenizer(examples[\"text\"], truncation=True,padding=False)" 350 | ], 351 | "execution_count": null, 352 | "outputs": [] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "metadata": { 357 | "id": "tWghamclTMyp" 358 | }, 359 | "source": [ 360 | "encoded_train_dataset = train_dataset.map(example_tokenizer, batched=True)\n", 361 | "encoded_valid_dataset = valid_dataset.map(example_tokenizer, batched=True)\n", 362 | "encoded_test_dataset = test_dataset.map(example_tokenizer, batched=True)" 363 | ], 364 | "execution_count": null, 365 | "outputs": [] 366 | }, 367 | { 368 | "cell_type": "markdown", 369 | "metadata": { 370 | "id": "xhkxSIVLTMyq" 371 | }, 372 | "source": [ 373 | "## The training \\o/\n", 374 | "\n", 375 | "Now we can train our model. 
To do this, we need to define a number of settings (hyperparameters):" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "metadata": { 381 | "id": "X4DtlapiTMyq" 382 | }, 383 | "source": [ 384 | "from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer\n", 385 | "\n", 386 | "model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)\n", 387 | "\n", 388 | "batch_size = 8\n", 389 | "\n", 390 | "args = TrainingArguments(\n", 391 | " \"test-offsive-language\",\n", 392 | " evaluation_strategy = \"steps\",\n", 393 | " save_strategy= \"steps\",\n", 394 | " learning_rate=3e-5,\n", 395 | " per_device_train_batch_size=batch_size,\n", 396 | " per_device_eval_batch_size=batch_size,\n", 397 | " gradient_accumulation_steps=4,\n", 398 | " num_train_epochs=2,\n", 399 | " eval_steps=50,\n", 400 | " save_steps=50,\n", 401 | " warmup_steps=50,\n", 402 | " logging_steps=10,\n", 403 | " weight_decay=0.001,\n", 404 | " load_best_model_at_end=True,\n", 405 | " overwrite_output_dir=True,\n", 406 | " metric_for_best_model=\"f1\",\n", 407 | " save_total_limit=2, \n", 408 | " fp16=True \n", 409 | ")" 410 | ], 411 | "execution_count": null, 412 | "outputs": [] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "source": [ 417 | "from sklearn.metrics import accuracy_score, f1_score\n", 418 | "\n", 419 | "def compute_metrics(pred):\n", 420 | " labels = pred.label_ids\n", 421 | " preds = pred.predictions.argmax(-1)\n", 422 | " f1 = f1_score(labels, preds, average=\"macro\")\n", 423 | " acc = accuracy_score(labels, preds)\n", 424 | " return {\"accuracy\": acc, \"f1\": f1}" 425 | ], 426 | "metadata": { 427 | "id": "Rh_wjh5TKhY8" 428 | }, 429 | "execution_count": null, 430 | "outputs": [] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "metadata": { 435 | "id": "g0v5GJXmTMyq" 436 | }, 437 | "source": [ 438 | "trainer = Trainer(\n", 439 | " model,\n", 440 | " args,\n", 441 | " train_dataset=encoded_train_dataset,\n", 442 | " 
eval_dataset=encoded_valid_dataset, \n", 443 | " tokenizer=tokenizer,\n", 444 | " compute_metrics=compute_metrics\n", 445 | ")" 446 | ], 447 | "execution_count": null, 448 | "outputs": [] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "metadata": { 453 | "id": "y_lUSvpDTMyq" 454 | }, 455 | "source": [ 456 | "trainer.train()" 457 | ], 458 | "execution_count": null, 459 | "outputs": [] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "metadata": { 464 | "id": "i98JNibNTMyr" 465 | }, 466 | "source": [ 467 | "#trainer.model.to(\"cuda\")\n", 468 | "trainer.evaluate()" 469 | ], 470 | "execution_count": null, 471 | "outputs": [] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "metadata": { 476 | "id": "rfEjiV_p7BRh" 477 | }, 478 | "source": [ 479 | "# How much GPU memory did we use?\n", 480 | "!nvidia-smi\n", 481 | "#import torch\n", 482 | "#torch.cuda.empty_cache()\n", 483 | "#!nvidia-smi" 484 | ], 485 | "execution_count": null, 486 | "outputs": [] 487 | }, 488 | { 489 | "cell_type": "code", 490 | "metadata": { 491 | "id": "-w-nuYI8TMyr" 492 | }, 493 | "source": [ 494 | "#tensorboard --logdir runs\n", 495 | "%load_ext tensorboard\n", 496 | "#%reload_ext tensorboard\n", 497 | "%tensorboard --logdir /content/test-offsive-language/runs" 498 | ], 499 | "execution_count": null, 500 | "outputs": [] 501 | }, 502 | { 503 | "cell_type": "markdown", 504 | "metadata": { 505 | "id": "tv-PghAYTMyr" 506 | }, 507 | "source": [ 508 | "## Testing the model\n", 509 | "\n", 510 | "The next step is to test the model with the provided test data." 
511 | ] 512 | }, 513 | { 514 | "cell_type": "code", 515 | "metadata": { 516 | "id": "jYI9LEbvTMyr" 517 | }, 518 | "source": [ 519 | "result = trainer.predict(encoded_test_dataset)\n", 520 | "result.metrics[\"test_f1\"]" 521 | ], 522 | "execution_count": null, 523 | "outputs": [] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "metadata": { 528 | "id": "YiYw4kS3TMyr" 529 | }, 530 | "source": [ 531 | "import torch\n", 532 | "\n", 533 | "#trainer.prediction_step(trainer.model,tokenizer(\"das ist ein test\"),False)\n", 534 | "trainer.model.cpu()\n", 535 | "#trainer.model.num_parameters()\n", 536 | "encoded_texts = tokenizer([\"du bist so dumm\", \"du bist toll\"],padding=True, return_tensors=\"pt\")\n", 537 | "print(encoded_texts)\n", 538 | "logits = trainer.model(**encoded_texts)\n", 539 | "probabilities = torch.softmax(logits[0],dim=1)\n", 540 | "print(probabilities)\n", 541 | "class_label = torch.argmax(probabilities,dim=1)\n", 542 | "print(class_label)" 543 | ], 544 | "execution_count": null, 545 | "outputs": [] 546 | }, 547 | { 548 | "cell_type": "markdown", 549 | "metadata": { 550 | "id": "ylTzH9P8uu-8" 551 | }, 552 | "source": [ 553 | "How can we predict a sigle test example and how long does it take on a cpu?" 
554 | ] 555 | }, 556 | { 557 | "cell_type": "code", 558 | "metadata": { 559 | "id": "_eyM790HTMyr" 560 | }, 561 | "source": [ 562 | "def predict(text):\n", 563 | " trainer.model.cpu()\n", 564 | " #trainer.model.num_parameters()\n", 565 | " encoded_texts = tokenizer(text, return_tensors=\"pt\")\n", 566 | " #print(encoded_texts)\n", 567 | " logits = trainer.model(**encoded_texts)\n", 568 | " probabilities = torch.softmax(logits[0],dim=1)\n", 569 | " #print(probabilities)\n", 570 | " class_label = torch.argmax(probabilities)\n", 571 | " return class_label\n", 572 | " #print(class_label)\n", 573 | "\n", 574 | "%timeit predict(\"du bist so toll\")\n", 575 | "\n" 576 | ], 577 | "execution_count": null, 578 | "outputs": [] 579 | }, 580 | { 581 | "cell_type": "markdown", 582 | "metadata": { 583 | "id": "lsL5FUdTTMys" 584 | }, 585 | "source": [ 586 | "# Tutorial:\n", 587 | "\n", 588 | "Our results are already quite good - but we can still improve the results. First get familiar with the notebook - change a few parameters like learning rate and number of epochs and see how they change the results. \n", 589 | "\n", 590 | "**Your task is to improve the classification score.**\n", 591 | "\n", 592 | "Here are some ideas how you can improve the score.\n", 593 | "\n", 594 | "* Test different models. The [Model Hub](https://huggingface.co/models) lists a number of German models with which you can improve the results. \n", 595 | "\n", 596 | "* About 5000 sampels in the data set are comparatively few for this problem. You may find more data sets that you can add to the current training data set.\n", 597 | "\n", 598 | "* A number of multilingual models are available in the [Model Hub](https://huggingface.co/models). These models have been trained with different languages. You could also try adding English to the German dataset to train a multilingual model. This may also be better on the German data. 
\n", 599 | "\n", 600 | "Data augmentation is a procedure to create new data sets by modifying existing data sets. It is important that the statement does not change (the class remains the same).\n", 601 | "\n", 602 | "* You can replace words with synonyms and thus generate new data sets. An example:\n", 603 | "\n", 604 | "> \"Can you still believe all this crap?\" -> \"Can you still believe all this nonsense?\"\n", 605 | "\n", 606 | "* Everything is allowed here. Try translating texts from German to English and back to German. If the meaning is preserved, the result can also be used for training. A small example with Google Translate:\n", 607 | "\n", 608 | "> Deutsch: \"Kann man diesen ganzen Scheiß noch glauben?\" \n", 609 | "\n", 610 | "> Englisch: \"Can you still believe all this shit?\"\n", 611 | "\n", 612 | "> Deutsch: \"Kannst du all diese Scheiße noch glauben?\"\n", 613 | "\n", 614 | "\n" 615 | ] 616 | } 617 | ] 618 | } -------------------------------------------------------------------------------- /hello-python/Hello PyTorch.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Hello PyTorch - a tiny intro." 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 4, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "name": "stdout", 17 | "output_type": "stream", 18 | "text": [ 19 | "Collecting package metadata (current_repodata.json): ...working... done\n", 20 | "Solving environment: ...working... 
done\n", 21 | "\n", 22 | "# All requested packages already installed.\n", 23 | "\n", 24 | "\n", 25 | "Note: you may need to restart the kernel to use updated packages.\n" 26 | ] 27 | } 28 | ], 29 | "source": [ 30 | "conda install pytorch torchvision cpuonly -c pytorch" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 5, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "import torch" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "## Create a tensor (aka matrix) with PyTorch" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 6, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "data": { 56 | "text/plain": [ 57 | "tensor([[ 1., -1.],\n", 58 | " [ 1., -1.]])" 59 | ] 60 | }, 61 | "execution_count": 6, 62 | "metadata": {}, 63 | "output_type": "execute_result" 64 | } 65 | ], 66 | "source": [ 67 | "torch.tensor([[1., -1.], [1., -1.]])" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 14, 73 | "metadata": {}, 74 | "outputs": [ 75 | { 76 | "name": "stdout", 77 | "output_type": "stream", 78 | "text": [ 79 | "tensor([[ 0.0381, -1.6110, -0.9273],\n", 80 | " [-0.0901, -1.8310, -1.4307],\n", 81 | " [-0.3276, -0.5256, 0.2382],\n", 82 | " [-0.6835, -0.1138, -0.6881]])\n" 83 | ] 84 | } 85 | ], 86 | "source": [ 87 | "matrix = torch.randn(4, 3)\n", 88 | "print(matrix)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 15, 94 | "metadata": {}, 95 | "outputs": [ 96 | { 97 | "data": { 98 | "text/plain": [ 99 | "torch.Size([4, 3])" 100 | ] 101 | }, 102 | "execution_count": 15, 103 | "metadata": {}, 104 | "output_type": "execute_result" 105 | } 106 | ], 107 | "source": [ 108 | "matrix.shape" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 16, 114 | "metadata": {}, 115 | "outputs": [ 116 | { 117 | "data": { 118 | "text/plain": [ 119 | "tensor([[ 0.0381, -0.0901, -0.3276, -0.6835],\n", 120 | " [-1.6110, -1.8310, 
-0.5256, -0.1138],\n", 121 | " [-0.9273, -1.4307, 0.2382, -0.6881]])" 122 | ] 123 | }, 124 | "execution_count": 16, 125 | "metadata": {}, 126 | "output_type": "execute_result" 127 | } 128 | ], 129 | "source": [ 130 | "matrix.t()" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 11, 136 | "metadata": {}, 137 | "outputs": [ 138 | { 139 | "data": { 140 | "text/plain": [ 141 | "tensor([ 0.3480, 0.8093, -2.0684])" 142 | ] 143 | }, 144 | "execution_count": 11, 145 | "metadata": {}, 146 | "output_type": "execute_result" 147 | } 148 | ], 149 | "source": [ 150 | "matrix[2]" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 12, 156 | "metadata": {}, 157 | "outputs": [ 158 | { 159 | "data": { 160 | "text/plain": [ 161 | "tensor(0.3480)" 162 | ] 163 | }, 164 | "execution_count": 12, 165 | "metadata": {}, 166 | "output_type": "execute_result" 167 | } 168 | ], 169 | "source": [ 170 | "matrix[2,0]" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "Take a look at the PyTorch [documentation](https://pytorch.org/docs/stable/tensors.html#torch.Tensor.view) for details about the ``view()`` function." 
178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 17, 183 | "metadata": {}, 184 | "outputs": [ 185 | { 186 | "data": { 187 | "text/plain": [ 188 | "tensor([ 0.0381, -1.6110, -0.9273, -0.0901, -1.8310, -1.4307, -0.3276, -0.5256,\n", 189 | " 0.2382, -0.6835, -0.1138, -0.6881])" 190 | ] 191 | }, 192 | "execution_count": 17, 193 | "metadata": {}, 194 | "output_type": "execute_result" 195 | } 196 | ], 197 | "source": [ 198 | "matrix.view(-1)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 26, 204 | "metadata": {}, 205 | "outputs": [ 206 | { 207 | "data": { 208 | "text/plain": [ 209 | "tensor([[ 0.0381, -1.6110, -0.9273, -0.0901, -1.8310, -1.4307],\n", 210 | " [-0.3276, -0.5256, 0.2382, -0.6835, -0.1138, -0.6881]])" 211 | ] 212 | }, 213 | "execution_count": 26, 214 | "metadata": {}, 215 | "output_type": "execute_result" 216 | } 217 | ], 218 | "source": [ 219 | "matrix.view(2,-1) # view the matrix with 2 rows and 6 columns" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": {}, 225 | "source": [ 226 | "## Matrix multiplication with PyTorch" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 31, 232 | "metadata": {}, 233 | "outputs": [ 234 | { 235 | "name": "stdout", 236 | "output_type": "stream", 237 | "text": [ 238 | "tensor([[ 0.5291, -0.1487, 0.8204],\n", 239 | " [ 1.4145, -1.9181, -2.5060],\n", 240 | " [-0.2387, 0.0584, 0.7217]])\n", 241 | "tensor([[ 0.8169, 0.2696, -1.3153],\n", 242 | " [ 0.8360, -0.8357, -1.5030],\n", 243 | " [ 0.9936, 0.1287, 1.9048]])\n" 244 | ] 245 | } 246 | ], 247 | "source": [ 248 | "a = torch.randn(3,3)\n", 249 | "b = torch.randn(3,3)\n", 250 | "print(a)\n", 251 | "print(b)" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 32, 257 | "metadata": {}, 258 | "outputs": [ 259 | { 260 | "data": { 261 | "text/plain": [ 262 | "tensor([[ 0.4322, -0.0401, -1.0791],\n", 263 | " [ 1.1825, 1.6031, 3.7667],\n", 264 | " 
[-0.2372, 0.0075, 1.3747]])" 265 | ] 266 | }, 267 | "execution_count": 32, 268 | "metadata": {}, 269 | "output_type": "execute_result" 270 | } 271 | ], 272 | "source": [ 273 | "a * b # this does not do what you might think!" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 33, 279 | "metadata": {}, 280 | "outputs": [ 281 | { 282 | "data": { 283 | "text/plain": [ 284 | "tensor([[ 1.1232, 0.3725, 1.0902],\n", 285 | " [-2.9381, 1.6619, -3.7509],\n", 286 | " [ 0.5709, -0.0203, 1.6010]])" 287 | ] 288 | }, 289 | "execution_count": 33, 290 | "metadata": {}, 291 | "output_type": "execute_result" 292 | } 293 | ], 294 | "source": [ 295 | "a @ b" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 34, 301 | "metadata": {}, 302 | "outputs": [ 303 | { 304 | "data": { 305 | "text/plain": [ 306 | "tensor([[ 1.1232, 0.3725, 1.0902],\n", 307 | " [-2.9381, 1.6619, -3.7509],\n", 308 | " [ 0.5709, -0.0203, 1.6010]])" 309 | ] 310 | }, 311 | "execution_count": 34, 312 | "metadata": {}, 313 | "output_type": "execute_result" 314 | } 315 | ], 316 | "source": [ 317 | "a.mm(b)" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 35, 323 | "metadata": {}, 324 | "outputs": [ 325 | { 326 | "data": { 327 | "text/plain": [ 328 | "tensor([[3., 3.],\n", 329 | " [3., 4.],\n", 330 | " [5., 6.]])" 331 | ] 332 | }, 333 | "execution_count": 35, 334 | "metadata": {}, 335 | "output_type": "execute_result" 336 | } 337 | ], 338 | "source": [ 339 | "c = torch.tensor([[1., 2.], [3., 4.], [5., 6.]]) \n", 340 | "c.clamp(min=3)" 341 | ] 342 | }, 343 | { 344 | "cell_type": "markdown", 345 | "metadata": {}, 346 | "source": [ 347 | "# What can Pytorch do?\n", 348 | "You can find both examples with further [explanations here](https://pytorch.org/tutorials/beginner/pytorch_with_examples.html)." 
349 | ] 350 | }, 351 | { 352 | "cell_type": "markdown", 353 | "metadata": {}, 354 | "source": [ 355 | "## Autograd: AUTOMATIC DIFFERENTIATION \n", 356 | "\n", 357 | "![Hello World]()\n" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": 37, 363 | "metadata": {}, 364 | "outputs": [ 365 | { 366 | "data": { 367 | "text/plain": [ 368 | "tensor([[ 2., -2.],\n", 369 | " [ 2., 2.]])" 370 | ] 371 | }, 372 | "execution_count": 37, 373 | "metadata": {}, 374 | "output_type": "execute_result" 375 | } 376 | ], 377 | "source": [ 378 | "x = torch.tensor([[1., -1.], [1., 1.]], requires_grad=True)\n", 379 | "out = x.pow(2).sum()\n", 380 | "out.backward()\n", 381 | "x.grad" 382 | ] 383 | }, 384 | { 385 | "cell_type": "markdown", 386 | "metadata": {}, 387 | "source": [ 388 | "# Tutorials\n", 389 | "\n", 390 | "\n", 391 | "**1. I am new to Python:**\n", 392 | "\n", 393 | "If you are new to Python, here is a list of online tutorials that you might find useful:\n", 394 | "\n", 395 | "*Learn the Basics*\n", 396 | "\n", 397 | "- [Hello, World!](https://www.learnpython.org/en/Hello%2C_World!)\n", 398 | "- [Variables and Types](https://www.learnpython.org/en/Variables_and_Types)\n", 399 | "- [Lists](https://www.learnpython.org/en/Lists)\n", 400 | "- [Basic Operators](https://www.learnpython.org/en/Basic_Operators)\n", 401 | "- [String Formatting](https://www.learnpython.org/en/String_Formatting)\n", 402 | "- [Basic String Operations](https://www.learnpython.org/en/Basic_String_Operations)\n", 403 | "- [Conditions](https://www.learnpython.org/en/Conditions)\n", 404 | "- [Loops](https://www.learnpython.org/en/Loops)\n", 405 | "- [Functions](https://www.learnpython.org/en/Functions)\n", 406 | "- [Classes and Objects](https://www.learnpython.org/en/Classes_and_Objects)\n", 407 | "- [Dictionaries](https://www.learnpython.org/en/Dictionaries)\n", 408 | "- [Modules and Packages](https://www.learnpython.org/en/Modules_and_Packages)\n", 409 | "\n", 410 | "**2. 
I am new to PyTorch**\n", 411 | "\n", 412 | "To get familiar with PyTorch's concepts, you can take the [A 60 Minute Blitz](https://pytorch.org/tutorials/beginner/blitz/tensor_tutorial.html#sphx-glr-beginner-blitz-tensor-tutorial-py) tutorial. \n", 413 | "You can find both example toy networks with further [explanations here](https://pytorch.org/tutorials/beginner/pytorch_with_examples.html)." 414 | ] 415 | } 416 | ], 417 | "metadata": { 418 | "kernelspec": { 419 | "display_name": "Python 3", 420 | "language": "python", 421 | "name": "python3" 422 | }, 423 | "language_info": { 424 | "codemirror_mode": { 425 | "name": "ipython", 426 | "version": 3 427 | }, 428 | "file_extension": ".py", 429 | "mimetype": "text/x-python", 430 | "name": "python", 431 | "nbconvert_exporter": "python", 432 | "pygments_lexer": "ipython3", 433 | "version": "3.7.6" 434 | } 435 | }, 436 | "nbformat": 4, 437 | "nbformat_minor": 4 438 | } 439 | -------------------------------------------------------------------------------- /slides/Deep Learning - Hello Python.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oliverguhr/htw-nlp-lecture/0b7cf49ff455d61fcbd2261034bbf5ecb570088d/slides/Deep Learning - Hello Python.pdf -------------------------------------------------------------------------------- /slides/Deep NLP 1 Recurrent Neural Networks.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oliverguhr/htw-nlp-lecture/0b7cf49ff455d61fcbd2261034bbf5ecb570088d/slides/Deep NLP 1 Recurrent Neural Networks.pdf -------------------------------------------------------------------------------- /slides/Deep NLP 2 Word Vectors and Transfer Learning.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oliverguhr/htw-nlp-lecture/0b7cf49ff455d61fcbd2261034bbf5ecb570088d/slides/Deep NLP 2 Word Vectors and 
Transfer Learning.pdf -------------------------------------------------------------------------------- /slides/Deep NLP 3 Transforners and Attention.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oliverguhr/htw-nlp-lecture/0b7cf49ff455d61fcbd2261034bbf5ecb570088d/slides/Deep NLP 3 Transforners and Attention.pdf --------------------------------------------------------------------------------