├── .gitignore ├── requirements.txt ├── .gitattributes ├── Dockerfile.cpu ├── Dockerfile.gpu ├── CONTRIBUTORS.md ├── LICENSE ├── download_model.py ├── DEVELOPERS.md ├── README.md └── src ├── sample.py ├── generate_unconditional_samples.py ├── interactive_conditional_samples.py ├── encoder.py └── model.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .mypy_cache/ 3 | models/ 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | fire>=0.1.3 2 | regex==2017.4.5 3 | requests==2.21.0 4 | tqdm==4.31.1 5 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # convert to OS line endings on checkout, back to LF on commit 2 | * text=auto 3 | 4 | # ensure anything copied to the container has unix style line endings 5 | *.sh text eol=lf 6 | requirements.txt text eol=lf -------------------------------------------------------------------------------- /Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:1.12.0-py3 2 | 3 | ENV LANG=C.UTF-8 4 | RUN mkdir /gpt-2 5 | WORKDIR /gpt-2 6 | ADD . /gpt-2 7 | RUN pip3 install -r requirements.txt 8 | RUN python3 download_model.py 117M 9 | RUN python3 download_model.py 345M 10 | -------------------------------------------------------------------------------- /Dockerfile.gpu: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:1.12.0-gpu-py3 2 | 3 | # nvidia-docker 1.0 4 | LABEL com.nvidia.volumes.needed="nvidia_driver" 5 | LABEL com.nvidia.cuda.version="${CUDA_VERSION}" 6 | 7 | # nvidia-container-runtime 8 | ENV NVIDIA_VISIBLE_DEVICES=all \ 9 | NVIDIA_DRIVER_CAPABILITIES=compute,utility \ 10 | NVIDIA_REQUIRE_CUDA="cuda>=8.0" \ 11 | LANG=C.UTF-8 12 | 13 | RUN mkdir /gpt-2 14 | WORKDIR /gpt-2 15 | ADD . /gpt-2 16 | RUN pip3 install -r requirements.txt 17 | RUN python3 download_model.py 117M 18 | RUN python3 download_model.py 345M 19 | -------------------------------------------------------------------------------- /CONTRIBUTORS.md: -------------------------------------------------------------------------------- 1 | # Contributors (alphabetically) 2 | 3 | * **[madisonmay](https://github.com/madisonmay)** 4 | 5 | Added Dockerfiles 6 | 7 | * **[Margaret Mitchell et al](https://arxiv.org/abs/1810.03993)** 8 | 9 | Our [usage](./README.md#usage) writeup was loosely inspired by the paper 10 | [Model Cards for Model Reporting](https://arxiv.org/abs/1810.03993) 11 | and related conversations with some of the authors. 12 | 13 | * **[webproduktion01](https://github.com/webproduktion01)** 14 | 15 | Ported download script to python. 
16 | 17 | **[Full code contributors list](https://github.com/openai/gpt-2/contributors).** 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 OpenAI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /download_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import requests 4 | from tqdm import tqdm 5 | 6 | if len(sys.argv) != 2: 7 | print('You must enter the model name as a parameter, e.g.: download_model.py 117M') 8 | sys.exit(1) 9 | 10 | model = sys.argv[1] 11 | 12 | subdir = os.path.join('models', model) 13 | if not os.path.exists(subdir): 14 | os.makedirs(subdir) 15 | subdir = subdir.replace('\\','/') # needed for Windows 16 | 17 | for filename in ['checkpoint','encoder.json','hparams.json','model.ckpt.data-00000-of-00001', 'model.ckpt.index', 'model.ckpt.meta', 'vocab.bpe']: 18 | 19 | r = requests.get("https://storage.googleapis.com/gpt-2/" + subdir + "/" + filename, stream=True) 20 | 21 | with open(os.path.join(subdir, filename), 'wb') as f: 22 | file_size = int(r.headers["content-length"]) 23 | chunk_size = 1000 24 | with tqdm(ncols=100, desc="Fetching " + filename, total=file_size, unit_scale=True) as pbar: 25 | # 1k for chunk_size, since Ethernet packet size is around 1500 bytes 26 | for chunk in r.iter_content(chunk_size=chunk_size): 27 | f.write(chunk) 28 | pbar.update(chunk_size) 29 | -------------------------------------------------------------------------------- /DEVELOPERS.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | Git clone this repository, and `cd` into directory for remaining commands 4 | ``` 5 | git clone https://github.com/openai/gpt-2.git && cd gpt-2 6 | ``` 7 | 8 | Then, follow instructions for either native or Docker installation. 9 | 10 | ## Native Installation 11 | 12 | All steps can optionally be done in a virtual environment using tools such as `virtualenv` or `conda`. 
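For example, with the built-in `venv` module (shown as one option; `virtualenv` or `conda` work just as well, and the directory name `venv` below is arbitrary):
```
python3 -m venv venv          # create an isolated environment in ./venv
source venv/bin/activate      # activate it before running the commands below
```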
13 | 14 | Install tensorflow 1.12 (with GPU support, if you have a GPU and want everything to run faster) 15 | ``` 16 | pip3 install tensorflow==1.12.0 17 | ``` 18 | or 19 | ``` 20 | pip3 install tensorflow-gpu==1.12.0 21 | ``` 22 | 23 | Install other python packages: 24 | ``` 25 | pip3 install -r requirements.txt 26 | ``` 27 | 28 | Download the model data 29 | ``` 30 | python3 download_model.py 117M 31 | python3 download_model.py 345M 32 | ``` 33 | 34 | ## Docker Installation 35 | 36 | Build the Dockerfile and tag the created image as `gpt-2`: 37 | ``` 38 | docker build --tag gpt-2 -f Dockerfile.gpu . # or Dockerfile.cpu 39 | ``` 40 | 41 | Start an interactive bash session from the `gpt-2` docker image. 42 | 43 | You can opt to use the `--runtime=nvidia` flag if you have access to a NVIDIA GPU 44 | and a valid install of [nvidia-docker 2.0](https://github.com/nvidia/nvidia-docker/wiki/Installation-(version-2.0)). 45 | ``` 46 | docker run --runtime=nvidia -it gpt-2 bash 47 | ``` 48 | 49 | # Running 50 | 51 | | WARNING: Samples are unfiltered and may contain offensive content. | 52 | | --- | 53 | 54 | Some of the examples below may include Unicode text characters. Set the environment variable: 55 | ``` 56 | export PYTHONIOENCODING=UTF-8 57 | ``` 58 | to override the standard stream settings in UTF-8 mode. 59 | 60 | ## Unconditional sample generation 61 | 62 | To generate unconditional samples from the small model: 63 | ``` 64 | python3 src/generate_unconditional_samples.py | tee /tmp/samples 65 | ``` 66 | There are various flags for controlling the samples: 67 | ``` 68 | python3 src/generate_unconditional_samples.py --top_k 40 --temperature 0.7 | tee /tmp/samples 69 | ``` 70 | 71 | To check flag descriptions, use: 72 | ``` 73 | python3 src/generate_unconditional_samples.py -- --help 74 | ``` 75 | 76 | ## Conditional sample generation 77 | 78 | To give the model custom prompts, you can use: 79 | ``` 80 | python3 src/interactive_conditional_samples.py --top_k 40 81 | ``` 82 | 83 | To check flag descriptions, use: 84 | ``` 85 | python3 src/interactive_conditional_samples.py -- --help 86 | ``` 87 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # gpt-2 2 | 3 | Code from the paper ["Language Models are Unsupervised Multitask Learners"](https://d4mucfpksywv.cloudfront.net/better-language-models/language-models.pdf). 4 | 5 | We have currently released small (117M parameter) and medium (345M parameter) versions of GPT-2. While we have not released the larger models, we have [released a dataset](https://github.com/openai/gpt-2-output-dataset) for researchers to study their behaviors. 6 | 7 | See more details in our [blog post](https://blog.openai.com/better-language-models/). 8 | 9 | ## Usage 10 | 11 | This repository is meant to be a starting point for researchers and engineers to experiment with GPT-2. 12 | 13 | ### Some caveats 14 | 15 | - GPT-2 models' robustness and worst case behaviors are not well-understood. As with any machine-learned model, carefully evaluate GPT-2 for your use case, especially if used without fine-tuning or in safety-critical applications where reliability is important. 16 | - The dataset our GPT-2 models were trained on contains many texts with [biases](https://twitter.com/TomerUllman/status/1101485289720242177) and factual inaccuracies, and thus GPT-2 models are likely to be biased and inaccurate as well. 
17 | - To avoid having samples mistaken as human-written, we recommend clearly labeling samples as synthetic before wide dissemination. Our models are often incoherent or inaccurate in subtle ways, which takes more than a quick read for a human to notice. 18 | 19 | ### Work with us 20 | 21 | Please [let us know](mailto:languagequestions@openai.com) if you’re doing interesting research with or working on applications of GPT-2! We’re especially interested in hearing from and potentially working with those who are studying 22 | - Potential malicious use cases and defenses against them (e.g. the detectability of synthetic text) 23 | - The extent of problematic content (e.g. bias) being baked into the models and effective mitigations 24 | 25 | ## Development 26 | 27 | See [DEVELOPERS.md](./DEVELOPERS.md) 28 | 29 | ## Contributors 30 | 31 | See [CONTRIBUTORS.md](./CONTRIBUTORS.md) 32 | 33 | ## Citation 34 | 35 | Please use the following bibtex entry: 36 | ``` 37 | @article{radford2019language, 38 | title={Language Models are Unsupervised Multitask Learners}, 39 | author={Radford, Alec and Wu, Jeff and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya}, 40 | year={2019} 41 | } 42 | ``` 43 | 44 | ## Future work 45 | 46 | We may release code for evaluating the models on various benchmarks. 47 | 48 | We are still considering release of the larger models. 49 | 50 | ## License 51 | 52 | [MIT](./LICENSE) 53 | -------------------------------------------------------------------------------- /src/sample.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | import model 4 | 5 | def top_k_logits(logits, k): 6 | if k == 0: 7 | # no truncation 8 | return logits 9 | 10 | def _top_k(): 11 | values, _ = tf.nn.top_k(logits, k=k) 12 | min_values = values[:, -1, tf.newaxis] 13 | return tf.where( 14 | logits < min_values, 15 | tf.ones_like(logits, dtype=logits.dtype) * -1e10, 16 | logits, 17 | ) 18 | return tf.cond( 19 | tf.equal(k, 0), 20 | lambda: logits, 21 | lambda: _top_k(), 22 | ) 23 | 24 | 25 | def sample_sequence(*, hparams, length, start_token=None, batch_size=None, context=None, temperature=1, top_k=0): 26 | if start_token is None: 27 | assert context is not None, 'Specify exactly one of start_token and context!' 28 | else: 29 | assert context is None, 'Specify exactly one of start_token and context!' 
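        # start_token was given (no context), so seed every sequence in the batch with that single
        # token; the sampling loop below then extends it one sampled token at a time.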
30 | context = tf.fill([batch_size, 1], start_token) 31 | 32 | def step(hparams, tokens, past=None): 33 | lm_output = model.model(hparams=hparams, X=tokens, past=past, reuse=tf.AUTO_REUSE) 34 | 35 | logits = lm_output['logits'][:, :, :hparams.n_vocab] 36 | presents = lm_output['present'] 37 | presents.set_shape(model.past_shape(hparams=hparams, batch_size=batch_size)) 38 | return { 39 | 'logits': logits, 40 | 'presents': presents, 41 | } 42 | 43 | with tf.name_scope('sample_sequence'): 44 | def body(past, prev, output): 45 | next_outputs = step(hparams, prev, past=past) 46 | logits = next_outputs['logits'][:, -1, :] / tf.to_float(temperature) 47 | logits = top_k_logits(logits, k=top_k) 48 | samples = tf.multinomial(logits, num_samples=1, output_dtype=tf.int32) 49 | return [ 50 | next_outputs['presents'] if past is None else tf.concat([past, next_outputs['presents']], axis=-2), 51 | samples, 52 | tf.concat([output, samples], axis=1) 53 | ] 54 | 55 | past, prev, output = body(None, context, context) 56 | 57 | def cond(*args): 58 | return True 59 | 60 | _, _, tokens = tf.while_loop( 61 | cond=cond, body=body, 62 | maximum_iterations=length - 1, 63 | loop_vars=[ 64 | past, 65 | prev, 66 | output 67 | ], 68 | shape_invariants=[ 69 | tf.TensorShape(model.past_shape(hparams=hparams, batch_size=batch_size)), 70 | tf.TensorShape([batch_size, None]), 71 | tf.TensorShape([batch_size, None]), 72 | ], 73 | back_prop=False, 74 | ) 75 | 76 | return tokens 77 | -------------------------------------------------------------------------------- /src/generate_unconditional_samples.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import fire 4 | import json 5 | import os 6 | import numpy as np 7 | import tensorflow as tf 8 | 9 | import model, sample, encoder 10 | 11 | def sample_model( 12 | model_name='117M', 13 | seed=None, 14 | nsamples=0, 15 | batch_size=1, 16 | length=None, 17 | temperature=1, 18 | top_k=0, 19 | models_dir='models', 20 | ): 21 | """ 22 | Run the sample_model 23 | :model_name=117M : String, which model to use 24 | :seed=None : Integer seed for random number generators, fix seed to 25 | reproduce results 26 | :nsamples=0 : Number of samples to return, if 0, continues to 27 | generate samples indefinately. 28 | :batch_size=1 : Number of batches (only affects speed/memory). 29 | :length=None : Number of tokens in generated text, if None (default), is 30 | determined by model hyperparameters 31 | :temperature=1 : Float value controlling randomness in boltzmann 32 | distribution. Lower temperature results in less random completions. As the 33 | temperature approaches zero, the model will become deterministic and 34 | repetitive. Higher temperature results in more random completions. 35 | :top_k=0 : Integer value controlling diversity. 1 means only 1 word is 36 | considered for each step (token), resulting in deterministic completions, 37 | while 40 means 40 words are considered at each step. 0 (default) is a 38 | special setting meaning no restrictions. 40 generally is a good value. 39 | :models_dir : path to parent folder containing model subfolders 40 | (i.e. 
contains the folder) 41 | """ 42 | models_dir = os.path.expanduser(os.path.expandvars(models_dir)) 43 | enc = encoder.get_encoder(model_name, models_dir) 44 | hparams = model.default_hparams() 45 | with open(os.path.join(models_dir, model_name, 'hparams.json')) as f: 46 | hparams.override_from_dict(json.load(f)) 47 | 48 | if length is None: 49 | length = hparams.n_ctx 50 | elif length > hparams.n_ctx: 51 | raise ValueError("Can't get samples longer than window size: %s" % hparams.n_ctx) 52 | 53 | with tf.Session(graph=tf.Graph()) as sess: 54 | np.random.seed(seed) 55 | tf.set_random_seed(seed) 56 | 57 | output = sample.sample_sequence( 58 | hparams=hparams, length=length, 59 | start_token=enc.encoder['<|endoftext|>'], 60 | batch_size=batch_size, 61 | temperature=temperature, top_k=top_k 62 | )[:, 1:] 63 | 64 | saver = tf.train.Saver() 65 | ckpt = tf.train.latest_checkpoint(os.path.join(models_dir, model_name)) 66 | saver.restore(sess, ckpt) 67 | 68 | generated = 0 69 | while nsamples == 0 or generated < nsamples: 70 | out = sess.run(output) 71 | for i in range(batch_size): 72 | generated += batch_size 73 | text = enc.decode(out[i]) 74 | print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40) 75 | print(text) 76 | 77 | if __name__ == '__main__': 78 | fire.Fire(sample_model) 79 | 80 | -------------------------------------------------------------------------------- /src/interactive_conditional_samples.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import fire 4 | import json 5 | import os 6 | import numpy as np 7 | import tensorflow as tf 8 | 9 | import model, sample, encoder 10 | 11 | def interact_model( 12 | model_name='117M', 13 | seed=None, 14 | nsamples=1, 15 | batch_size=1, 16 | length=None, 17 | temperature=1, 18 | top_k=0, 19 | models_dir='models', 20 | ): 21 | """ 22 | Interactively run the model 23 | :model_name=117M : String, which model to use 24 | :seed=None : Integer seed for random number generators, fix seed to reproduce 25 | results 26 | :nsamples=1 : Number of samples to return total 27 | :batch_size=1 : Number of batches (only affects speed/memory). Must divide nsamples. 28 | :length=None : Number of tokens in generated text, if None (default), is 29 | determined by model hyperparameters 30 | :temperature=1 : Float value controlling randomness in boltzmann 31 | distribution. Lower temperature results in less random completions. As the 32 | temperature approaches zero, the model will become deterministic and 33 | repetitive. Higher temperature results in more random completions. 34 | :top_k=0 : Integer value controlling diversity. 1 means only 1 word is 35 | considered for each step (token), resulting in deterministic completions, 36 | while 40 means 40 words are considered at each step. 0 (default) is a 37 | special setting meaning no restrictions. 40 generally is a good value. 38 | :models_dir : path to parent folder containing model subfolders 39 | (i.e. 
contains the folder) 40 | """ 41 | models_dir = os.path.expanduser(os.path.expandvars(models_dir)) 42 | if batch_size is None: 43 | batch_size = 1 44 | assert nsamples % batch_size == 0 45 | 46 | enc = encoder.get_encoder(model_name, models_dir) 47 | hparams = model.default_hparams() 48 | with open(os.path.join(models_dir, model_name, 'hparams.json')) as f: 49 | hparams.override_from_dict(json.load(f)) 50 | 51 | if length is None: 52 | length = hparams.n_ctx // 2 53 | elif length > hparams.n_ctx: 54 | raise ValueError("Can't get samples longer than window size: %s" % hparams.n_ctx) 55 | 56 | with tf.Session(graph=tf.Graph()) as sess: 57 | context = tf.placeholder(tf.int32, [batch_size, None]) 58 | np.random.seed(seed) 59 | tf.set_random_seed(seed) 60 | output = sample.sample_sequence( 61 | hparams=hparams, length=length, 62 | context=context, 63 | batch_size=batch_size, 64 | temperature=temperature, top_k=top_k 65 | ) 66 | 67 | saver = tf.train.Saver() 68 | ckpt = tf.train.latest_checkpoint(os.path.join(models_dir, model_name)) 69 | saver.restore(sess, ckpt) 70 | 71 | while True: 72 | raw_text = input("Model prompt >>> ") 73 | while not raw_text: 74 | print('Prompt should not be empty!') 75 | raw_text = input("Model prompt >>> ") 76 | context_tokens = enc.encode(raw_text) 77 | generated = 0 78 | for _ in range(nsamples // batch_size): 79 | out = sess.run(output, feed_dict={ 80 | context: [context_tokens for _ in range(batch_size)] 81 | })[:, len(context_tokens):] 82 | for i in range(batch_size): 83 | generated += 1 84 | text = enc.decode(out[i]) 85 | print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40) 86 | print(text) 87 | print("=" * 80) 88 | 89 | if __name__ == '__main__': 90 | fire.Fire(interact_model) 91 | 92 | -------------------------------------------------------------------------------- /src/encoder.py: -------------------------------------------------------------------------------- 1 | """Byte pair encoding utilities""" 2 | 3 | import os 4 | import json 5 | import regex as re 6 | from functools import lru_cache 7 | 8 | @lru_cache() 9 | def bytes_to_unicode(): 10 | """ 11 | Returns list of utf-8 byte and a corresponding list of unicode strings. 12 | The reversible bpe codes work on unicode strings. 13 | This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. 14 | When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. 15 | This is a signficant percentage of your normal, say, 32K bpe vocab. 16 | To avoid that, we want lookup tables between utf-8 bytes and unicode strings. 17 | And avoids mapping to whitespace/control characters the bpe code barfs on. 18 | """ 19 | bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) 20 | cs = bs[:] 21 | n = 0 22 | for b in range(2**8): 23 | if b not in bs: 24 | bs.append(b) 25 | cs.append(2**8+n) 26 | n += 1 27 | cs = [chr(n) for n in cs] 28 | return dict(zip(bs, cs)) 29 | 30 | def get_pairs(word): 31 | """Return set of symbol pairs in a word. 32 | 33 | Word is represented as tuple of symbols (symbols being variable-length strings). 
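    For example, get_pairs(('h', 'e', 'l', 'l', 'o')) returns {('h', 'e'), ('e', 'l'), ('l', 'l'), ('l', 'o')}.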
34 | """ 35 | pairs = set() 36 | prev_char = word[0] 37 | for char in word[1:]: 38 | pairs.add((prev_char, char)) 39 | prev_char = char 40 | return pairs 41 | 42 | class Encoder: 43 | def __init__(self, encoder, bpe_merges, errors='replace'): 44 | self.encoder = encoder 45 | self.decoder = {v:k for k,v in self.encoder.items()} 46 | self.errors = errors # how to handle errors in decoding 47 | self.byte_encoder = bytes_to_unicode() 48 | self.byte_decoder = {v:k for k, v in self.byte_encoder.items()} 49 | self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) 50 | self.cache = {} 51 | 52 | # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions 53 | self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") 54 | 55 | def bpe(self, token): 56 | if token in self.cache: 57 | return self.cache[token] 58 | word = tuple(token) 59 | pairs = get_pairs(word) 60 | 61 | if not pairs: 62 | return token 63 | 64 | while True: 65 | bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf'))) 66 | if bigram not in self.bpe_ranks: 67 | break 68 | first, second = bigram 69 | new_word = [] 70 | i = 0 71 | while i < len(word): 72 | try: 73 | j = word.index(first, i) 74 | new_word.extend(word[i:j]) 75 | i = j 76 | except: 77 | new_word.extend(word[i:]) 78 | break 79 | 80 | if word[i] == first and i < len(word)-1 and word[i+1] == second: 81 | new_word.append(first+second) 82 | i += 2 83 | else: 84 | new_word.append(word[i]) 85 | i += 1 86 | new_word = tuple(new_word) 87 | word = new_word 88 | if len(word) == 1: 89 | break 90 | else: 91 | pairs = get_pairs(word) 92 | word = ' '.join(word) 93 | self.cache[token] = word 94 | return word 95 | 96 | def encode(self, text): 97 | bpe_tokens = [] 98 | for token in re.findall(self.pat, text): 99 | token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) 100 | bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' ')) 101 | return bpe_tokens 102 | 103 | def decode(self, tokens): 104 | text = ''.join([self.decoder[token] for token in tokens]) 105 | text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors) 106 | return text 107 | 108 | def get_encoder(model_name, models_dir): 109 | with open(os.path.join(models_dir, model_name, 'encoder.json'), 'r') as f: 110 | encoder = json.load(f) 111 | with open(os.path.join(models_dir, model_name, 'vocab.bpe'), 'r', encoding="utf-8") as f: 112 | bpe_data = f.read() 113 | bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]] 114 | return Encoder( 115 | encoder=encoder, 116 | bpe_merges=bpe_merges, 117 | ) 118 | -------------------------------------------------------------------------------- /src/model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.contrib.training import HParams 4 | 5 | def default_hparams(): 6 | return HParams( 7 | n_vocab=0, 8 | n_ctx=1024, 9 | n_embd=768, 10 | n_head=12, 11 | n_layer=12, 12 | ) 13 | 14 | def shape_list(x): 15 | """Deal with dynamic shape in tensorflow cleanly.""" 16 | static = x.shape.as_list() 17 | dynamic = tf.shape(x) 18 | return [dynamic[i] if s is None else s for i, s in enumerate(static)] 19 | 20 | def softmax(x, axis=-1): 21 | x = x - tf.reduce_max(x, axis=axis, keepdims=True) 22 | ex = tf.exp(x) 23 | return ex / tf.reduce_sum(ex, axis=axis, keepdims=True) 24 | 25 | def 
gelu(x): 26 | return 0.5*x*(1+tf.tanh(np.sqrt(2/np.pi)*(x+0.044715*tf.pow(x, 3)))) 27 | 28 | def norm(x, scope, *, axis=-1, epsilon=1e-5): 29 | """Normalize to mean = 0, std = 1, then do a diagonal affine transform.""" 30 | with tf.variable_scope(scope): 31 | n_state = x.shape[-1].value 32 | g = tf.get_variable('g', [n_state], initializer=tf.constant_initializer(1)) 33 | b = tf.get_variable('b', [n_state], initializer=tf.constant_initializer(0)) 34 | u = tf.reduce_mean(x, axis=axis, keepdims=True) 35 | s = tf.reduce_mean(tf.square(x-u), axis=axis, keepdims=True) 36 | x = (x - u) * tf.rsqrt(s + epsilon) 37 | x = x*g + b 38 | return x 39 | 40 | def split_states(x, n): 41 | """Reshape the last dimension of x into [n, x.shape[-1]/n].""" 42 | *start, m = shape_list(x) 43 | return tf.reshape(x, start + [n, m//n]) 44 | 45 | def merge_states(x): 46 | """Smash the last two dimensions of x into a single dimension.""" 47 | *start, a, b = shape_list(x) 48 | return tf.reshape(x, start + [a*b]) 49 | 50 | def conv1d(x, scope, nf, *, w_init_stdev=0.02): 51 | with tf.variable_scope(scope): 52 | *start, nx = shape_list(x) 53 | w = tf.get_variable('w', [1, nx, nf], initializer=tf.random_normal_initializer(stddev=w_init_stdev)) 54 | b = tf.get_variable('b', [nf], initializer=tf.constant_initializer(0)) 55 | c = tf.reshape(tf.matmul(tf.reshape(x, [-1, nx]), tf.reshape(w, [-1, nf]))+b, start+[nf]) 56 | return c 57 | 58 | def attention_mask(nd, ns, *, dtype): 59 | """1's in the lower triangle, counting from the lower right corner. 60 | 61 | Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs. 62 | """ 63 | i = tf.range(nd)[:,None] 64 | j = tf.range(ns) 65 | m = i >= j - ns + nd 66 | return tf.cast(m, dtype) 67 | 68 | 69 | def attn(x, scope, n_state, *, past, hparams): 70 | assert x.shape.ndims == 3 # Should be [batch, sequence, features] 71 | assert n_state % hparams.n_head == 0 72 | if past is not None: 73 | assert past.shape.ndims == 5 # Should be [batch, 2, heads, sequence, features], where 2 is [k, v] 74 | 75 | def split_heads(x): 76 | # From [batch, sequence, features] to [batch, heads, sequence, features] 77 | return tf.transpose(split_states(x, hparams.n_head), [0, 2, 1, 3]) 78 | 79 | def merge_heads(x): 80 | # Reverse of split_heads 81 | return merge_states(tf.transpose(x, [0, 2, 1, 3])) 82 | 83 | def mask_attn_weights(w): 84 | # w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst. 
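        # Causal mask: each dst position may only attend to src positions at or before it
        # (counted from the lower-right corner, so cached `past` keys stay visible);
        # masked entries are pushed to a large negative value before the softmax.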
85 | _, _, nd, ns = shape_list(w) 86 | b = attention_mask(nd, ns, dtype=w.dtype) 87 | b = tf.reshape(b, [1, 1, nd, ns]) 88 | w = w*b - tf.cast(1e10, w.dtype)*(1-b) 89 | return w 90 | 91 | def multihead_attn(q, k, v): 92 | # q, k, v have shape [batch, heads, sequence, features] 93 | w = tf.matmul(q, k, transpose_b=True) 94 | w = w * tf.rsqrt(tf.cast(v.shape[-1].value, w.dtype)) 95 | 96 | w = mask_attn_weights(w) 97 | w = softmax(w) 98 | a = tf.matmul(w, v) 99 | return a 100 | 101 | with tf.variable_scope(scope): 102 | c = conv1d(x, 'c_attn', n_state*3) 103 | q, k, v = map(split_heads, tf.split(c, 3, axis=2)) 104 | present = tf.stack([k, v], axis=1) 105 | if past is not None: 106 | pk, pv = tf.unstack(past, axis=1) 107 | k = tf.concat([pk, k], axis=-2) 108 | v = tf.concat([pv, v], axis=-2) 109 | a = multihead_attn(q, k, v) 110 | a = merge_heads(a) 111 | a = conv1d(a, 'c_proj', n_state) 112 | return a, present 113 | 114 | 115 | def mlp(x, scope, n_state, *, hparams): 116 | with tf.variable_scope(scope): 117 | nx = x.shape[-1].value 118 | h = gelu(conv1d(x, 'c_fc', n_state)) 119 | h2 = conv1d(h, 'c_proj', nx) 120 | return h2 121 | 122 | 123 | def block(x, scope, *, past, hparams): 124 | with tf.variable_scope(scope): 125 | nx = x.shape[-1].value 126 | a, present = attn(norm(x, 'ln_1'), 'attn', nx, past=past, hparams=hparams) 127 | x = x + a 128 | m = mlp(norm(x, 'ln_2'), 'mlp', nx*4, hparams=hparams) 129 | x = x + m 130 | return x, present 131 | 132 | def past_shape(*, hparams, batch_size=None, sequence=None): 133 | return [batch_size, hparams.n_layer, 2, hparams.n_head, sequence, hparams.n_embd // hparams.n_head] 134 | 135 | def expand_tile(value, size): 136 | """Add a new axis of given size.""" 137 | value = tf.convert_to_tensor(value, name='value') 138 | ndims = value.shape.ndims 139 | return tf.tile(tf.expand_dims(value, axis=0), [size] + [1]*ndims) 140 | 141 | def positions_for(tokens, past_length): 142 | batch_size = tf.shape(tokens)[0] 143 | nsteps = tf.shape(tokens)[1] 144 | return expand_tile(past_length + tf.range(nsteps), batch_size) 145 | 146 | 147 | def model(hparams, X, past=None, scope='model', reuse=False): 148 | with tf.variable_scope(scope, reuse=reuse): 149 | results = {} 150 | batch, sequence = shape_list(X) 151 | 152 | wpe = tf.get_variable('wpe', [hparams.n_ctx, hparams.n_embd], 153 | initializer=tf.random_normal_initializer(stddev=0.01)) 154 | wte = tf.get_variable('wte', [hparams.n_vocab, hparams.n_embd], 155 | initializer=tf.random_normal_initializer(stddev=0.02)) 156 | past_length = 0 if past is None else tf.shape(past)[-2] 157 | h = tf.gather(wte, X) + tf.gather(wpe, positions_for(X, past_length)) 158 | 159 | # Transformer 160 | presents = [] 161 | pasts = tf.unstack(past, axis=1) if past is not None else [None] * hparams.n_layer 162 | assert len(pasts) == hparams.n_layer 163 | for layer, past in enumerate(pasts): 164 | h, present = block(h, 'h%d' % layer, past=past, hparams=hparams) 165 | presents.append(present) 166 | results['present'] = tf.stack(presents, axis=1) 167 | h = norm(h, 'ln_f') 168 | 169 | # Language model loss. Do tokens