├── .gitignore ├── requirements.txt ├── .gitattributes ├── Dockerfile.cpu ├── Dockerfile.gpu ├── CONTRIBUTORS.md ├── LICENSE ├── download_model.py ├── DEVELOPERS.md ├── README.md └── src ├── sample.py ├── generate_unconditional_samples.py ├── interactive_conditional_samples.py ├── encoder.py └── model.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .mypy_cache/ 3 | models/ 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | fire>=0.1.3 2 | regex==2017.4.5 3 | requests==2.21.0 4 | tqdm==4.31.1 5 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # convert to OS line endings on checkout, back to LF on commit 2 | * text=auto 3 | 4 | # ensure anything copied to the container has unix style line endings 5 | *.sh text eol=lf 6 | requirements.txt text eol=lf -------------------------------------------------------------------------------- /Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:1.12.0-py3 2 | 3 | ENV LANG=C.UTF-8 4 | RUN mkdir /gpt-2 5 | WORKDIR /gpt-2 6 | ADD . /gpt-2 7 | RUN pip3 install -r requirements.txt 8 | RUN python3 download_model.py 117M 9 | RUN python3 download_model.py 345M 10 | -------------------------------------------------------------------------------- /Dockerfile.gpu: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:1.12.0-gpu-py3 2 | 3 | # nvidia-docker 1.0 4 | LABEL com.nvidia.volumes.needed="nvidia_driver" 5 | LABEL com.nvidia.cuda.version="${CUDA_VERSION}" 6 | 7 | # nvidia-container-runtime 8 | ENV NVIDIA_VISIBLE_DEVICES=all \ 9 | NVIDIA_DRIVER_CAPABILITIES=compute,utility \ 10 | NVIDIA_REQUIRE_CUDA="cuda>=8.0" \ 11 | LANG=C.UTF-8 12 | 13 | RUN mkdir /gpt-2 14 | WORKDIR /gpt-2 15 | ADD . /gpt-2 16 | RUN pip3 install -r requirements.txt 17 | RUN python3 download_model.py 117M 18 | RUN python3 download_model.py 345M 19 | -------------------------------------------------------------------------------- /CONTRIBUTORS.md: -------------------------------------------------------------------------------- 1 | # Contributors (alphabetically) 2 | 3 | * **[madisonmay](https://github.com/madisonmay)** 4 | 5 | Added Dockerfiles 6 | 7 | * **[Margaret Mitchell et al](https://arxiv.org/abs/1810.03993)** 8 | 9 | Our [usage](./README.md#usage) writeup was loosely inspired by the paper 10 | [Model Cards for Model Reporting](https://arxiv.org/abs/1810.03993) 11 | and related conversations with some of the authors. 12 | 13 | * **[webproduktion01](https://github.com/webproduktion01)** 14 | 15 | Ported download script to python. 
16 | 17 | **[Full code contributors list](https://github.com/openai/gpt-2/contributors).** 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 OpenAI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /download_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import requests 4 | from tqdm import tqdm 5 | 6 | if len(sys.argv) != 2: 7 | print('You must enter the model name as a parameter, e.g.: download_model.py 117M') 8 | sys.exit(1) 9 | 10 | model = sys.argv[1] 11 | 12 | subdir = os.path.join('models', model) 13 | if not os.path.exists(subdir): 14 | os.makedirs(subdir) 15 | subdir = subdir.replace('\\','/') # needed for Windows 16 | 17 | for filename in ['checkpoint','encoder.json','hparams.json','model.ckpt.data-00000-of-00001', 'model.ckpt.index', 'model.ckpt.meta', 'vocab.bpe']: 18 | 19 | r = requests.get("https://storage.googleapis.com/gpt-2/" + subdir + "/" + filename, stream=True) 20 | 21 | with open(os.path.join(subdir, filename), 'wb') as f: 22 | file_size = int(r.headers["content-length"]) 23 | chunk_size = 1000 24 | with tqdm(ncols=100, desc="Fetching " + filename, total=file_size, unit_scale=True) as pbar: 25 | # 1k for chunk_size, since Ethernet packet size is around 1500 bytes 26 | for chunk in r.iter_content(chunk_size=chunk_size): 27 | f.write(chunk) 28 | pbar.update(chunk_size) 29 | -------------------------------------------------------------------------------- /DEVELOPERS.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | Git clone this repository, and `cd` into directory for remaining commands 4 | ``` 5 | git clone https://github.com/openai/gpt-2.git && cd gpt-2 6 | ``` 7 | 8 | Then, follow instructions for either native or Docker installation. 9 | 10 | ## Native Installation 11 | 12 | All steps can optionally be done in a virtual environment using tools such as `virtualenv` or `conda`. 
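For example, with the built-in `venv` module (shown as one option; `virtualenv` or `conda` work just as well, and the directory name `venv` below is arbitrary):
```
python3 -m venv venv          # create an isolated environment in ./venv
source venv/bin/activate      # activate it before running the commands below
```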
13 | 14 | Install tensorflow 1.12 (with GPU support, if you have a GPU and want everything to run faster) 15 | ``` 16 | pip3 install tensorflow==1.12.0 17 | ``` 18 | or 19 | ``` 20 | pip3 install tensorflow-gpu==1.12.0 21 | ``` 22 | 23 | Install other python packages: 24 | ``` 25 | pip3 install -r requirements.txt 26 | ``` 27 | 28 | Download the model data 29 | ``` 30 | python3 download_model.py 117M 31 | python3 download_model.py 345M 32 | ``` 33 | 34 | ## Docker Installation 35 | 36 | Build the Dockerfile and tag the created image as `gpt-2`: 37 | ``` 38 | docker build --tag gpt-2 -f Dockerfile.gpu . # or Dockerfile.cpu 39 | ``` 40 | 41 | Start an interactive bash session from the `gpt-2` docker image. 42 | 43 | You can opt to use the `--runtime=nvidia` flag if you have access to a NVIDIA GPU 44 | and a valid install of [nvidia-docker 2.0](https://github.com/nvidia/nvidia-docker/wiki/Installation-(version-2.0)). 45 | ``` 46 | docker run --runtime=nvidia -it gpt-2 bash 47 | ``` 48 | 49 | # Running 50 | 51 | | WARNING: Samples are unfiltered and may contain offensive content. | 52 | | --- | 53 | 54 | Some of the examples below may include Unicode text characters. Set the environment variable: 55 | ``` 56 | export PYTHONIOENCODING=UTF-8 57 | ``` 58 | to override the standard stream settings in UTF-8 mode. 59 | 60 | ## Unconditional sample generation 61 | 62 | To generate unconditional samples from the small model: 63 | ``` 64 | python3 src/generate_unconditional_samples.py | tee /tmp/samples 65 | ``` 66 | There are various flags for controlling the samples: 67 | ``` 68 | python3 src/generate_unconditional_samples.py --top_k 40 --temperature 0.7 | tee /tmp/samples 69 | ``` 70 | 71 | To check flag descriptions, use: 72 | ``` 73 | python3 src/generate_unconditional_samples.py -- --help 74 | ``` 75 | 76 | ## Conditional sample generation 77 | 78 | To give the model custom prompts, you can use: 79 | ``` 80 | python3 src/interactive_conditional_samples.py --top_k 40 81 | ``` 82 | 83 | To check flag descriptions, use: 84 | ``` 85 | python3 src/interactive_conditional_samples.py -- --help 86 | ``` 87 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # gpt-2 2 | 3 | Code from the paper ["Language Models are Unsupervised Multitask Learners"](https://d4mucfpksywv.cloudfront.net/better-language-models/language-models.pdf). 4 | 5 | We have currently released small (117M parameter) and medium (345M parameter) versions of GPT-2. While we have not released the larger models, we have [released a dataset](https://github.com/openai/gpt-2-output-dataset) for researchers to study their behaviors. 6 | 7 | See more details in our [blog post](https://blog.openai.com/better-language-models/). 8 | 9 | ## Usage 10 | 11 | This repository is meant to be a starting point for researchers and engineers to experiment with GPT-2. 12 | 13 | ### Some caveats 14 | 15 | - GPT-2 models' robustness and worst case behaviors are not well-understood. As with any machine-learned model, carefully evaluate GPT-2 for your use case, especially if used without fine-tuning or in safety-critical applications where reliability is important. 16 | - The dataset our GPT-2 models were trained on contains many texts with [biases](https://twitter.com/TomerUllman/status/1101485289720242177) and factual inaccuracies, and thus GPT-2 models are likely to be biased and inaccurate as well. 
17 | - To avoid having samples mistaken as human-written, we recommend clearly labeling samples as synthetic before wide dissemination. Our models are often incoherent or inaccurate in subtle ways, which takes more than a quick read for a human to notice. 18 | 19 | ### Work with us 20 | 21 | Please [let us know](mailto:languagequestions@openai.com) if you’re doing interesting research with or working on applications of GPT-2! We’re especially interested in hearing from and potentially working with those who are studying 22 | - Potential malicious use cases and defenses against them (e.g. the detectability of synthetic text) 23 | - The extent of problematic content (e.g. bias) being baked into the models and effective mitigations 24 | 25 | ## Development 26 | 27 | See [DEVELOPERS.md](./DEVELOPERS.md) 28 | 29 | ## Contributors 30 | 31 | See [CONTRIBUTORS.md](./CONTRIBUTORS.md) 32 | 33 | ## Citation 34 | 35 | Please use the following bibtex entry: 36 | ``` 37 | @article{radford2019language, 38 | title={Language Models are Unsupervised Multitask Learners}, 39 | author={Radford, Alec and Wu, Jeff and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya}, 40 | year={2019} 41 | } 42 | ``` 43 | 44 | ## Future work 45 | 46 | We may release code for evaluating the models on various benchmarks. 47 | 48 | We are still considering release of the larger models. 49 | 50 | ## License 51 | 52 | [MIT](./LICENSE) 53 | -------------------------------------------------------------------------------- /src/sample.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | import model 4 | 5 | def top_k_logits(logits, k): 6 | if k == 0: 7 | # no truncation 8 | return logits 9 | 10 | def _top_k(): 11 | values, _ = tf.nn.top_k(logits, k=k) 12 | min_values = values[:, -1, tf.newaxis] 13 | return tf.where( 14 | logits < min_values, 15 | tf.ones_like(logits, dtype=logits.dtype) * -1e10, 16 | logits, 17 | ) 18 | return tf.cond( 19 | tf.equal(k, 0), 20 | lambda: logits, 21 | lambda: _top_k(), 22 | ) 23 | 24 | 25 | def sample_sequence(*, hparams, length, start_token=None, batch_size=None, context=None, temperature=1, top_k=0): 26 | if start_token is None: 27 | assert context is not None, 'Specify exactly one of start_token and context!' 28 | else: 29 | assert context is None, 'Specify exactly one of start_token and context!' 
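        # start_token was given (no context), so seed every sequence in the batch with that single
        # token; the sampling loop below then extends it one sampled token at a time.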
30 | context = tf.fill([batch_size, 1], start_token) 31 | 32 | def step(hparams, tokens, past=None): 33 | lm_output = model.model(hparams=hparams, X=tokens, past=past, reuse=tf.AUTO_REUSE) 34 | 35 | logits = lm_output['logits'][:, :, :hparams.n_vocab] 36 | presents = lm_output['present'] 37 | presents.set_shape(model.past_shape(hparams=hparams, batch_size=batch_size)) 38 | return { 39 | 'logits': logits, 40 | 'presents': presents, 41 | } 42 | 43 | with tf.name_scope('sample_sequence'): 44 | def body(past, prev, output): 45 | next_outputs = step(hparams, prev, past=past) 46 | logits = next_outputs['logits'][:, -1, :] / tf.to_float(temperature) 47 | logits = top_k_logits(logits, k=top_k) 48 | samples = tf.multinomial(logits, num_samples=1, output_dtype=tf.int32) 49 | return [ 50 | next_outputs['presents'] if past is None else tf.concat([past, next_outputs['presents']], axis=-2), 51 | samples, 52 | tf.concat([output, samples], axis=1) 53 | ] 54 | 55 | past, prev, output = body(None, context, context) 56 | 57 | def cond(*args): 58 | return True 59 | 60 | _, _, tokens = tf.while_loop( 61 | cond=cond, body=body, 62 | maximum_iterations=length - 1, 63 | loop_vars=[ 64 | past, 65 | prev, 66 | output 67 | ], 68 | shape_invariants=[ 69 | tf.TensorShape(model.past_shape(hparams=hparams, batch_size=batch_size)), 70 | tf.TensorShape([batch_size, None]), 71 | tf.TensorShape([batch_size, None]), 72 | ], 73 | back_prop=False, 74 | ) 75 | 76 | return tokens 77 | -------------------------------------------------------------------------------- /src/generate_unconditional_samples.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import fire 4 | import json 5 | import os 6 | import numpy as np 7 | import tensorflow as tf 8 | 9 | import model, sample, encoder 10 | 11 | def sample_model( 12 | model_name='117M', 13 | seed=None, 14 | nsamples=0, 15 | batch_size=1, 16 | length=None, 17 | temperature=1, 18 | top_k=0, 19 | models_dir='models', 20 | ): 21 | """ 22 | Run the sample_model 23 | :model_name=117M : String, which model to use 24 | :seed=None : Integer seed for random number generators, fix seed to 25 | reproduce results 26 | :nsamples=0 : Number of samples to return, if 0, continues to 27 | generate samples indefinately. 28 | :batch_size=1 : Number of batches (only affects speed/memory). 29 | :length=None : Number of tokens in generated text, if None (default), is 30 | determined by model hyperparameters 31 | :temperature=1 : Float value controlling randomness in boltzmann 32 | distribution. Lower temperature results in less random completions. As the 33 | temperature approaches zero, the model will become deterministic and 34 | repetitive. Higher temperature results in more random completions. 35 | :top_k=0 : Integer value controlling diversity. 1 means only 1 word is 36 | considered for each step (token), resulting in deterministic completions, 37 | while 40 means 40 words are considered at each step. 0 (default) is a 38 | special setting meaning no restrictions. 40 generally is a good value. 39 | :models_dir : path to parent folder containing model subfolders 40 | (i.e. 
contains the folder) 41 | """ 42 | models_dir = os.path.expanduser(os.path.expandvars(models_dir)) 43 | enc = encoder.get_encoder(model_name, models_dir) 44 | hparams = model.default_hparams() 45 | with open(os.path.join(models_dir, model_name, 'hparams.json')) as f: 46 | hparams.override_from_dict(json.load(f)) 47 | 48 | if length is None: 49 | length = hparams.n_ctx 50 | elif length > hparams.n_ctx: 51 | raise ValueError("Can't get samples longer than window size: %s" % hparams.n_ctx) 52 | 53 | with tf.Session(graph=tf.Graph()) as sess: 54 | np.random.seed(seed) 55 | tf.set_random_seed(seed) 56 | 57 | output = sample.sample_sequence( 58 | hparams=hparams, length=length, 59 | start_token=enc.encoder['<|endoftext|>'], 60 | batch_size=batch_size, 61 | temperature=temperature, top_k=top_k 62 | )[:, 1:] 63 | 64 | saver = tf.train.Saver() 65 | ckpt = tf.train.latest_checkpoint(os.path.join(models_dir, model_name)) 66 | saver.restore(sess, ckpt) 67 | 68 | generated = 0 69 | while nsamples == 0 or generated < nsamples: 70 | out = sess.run(output) 71 | for i in range(batch_size): 72 | generated += batch_size 73 | text = enc.decode(out[i]) 74 | print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40) 75 | print(text) 76 | 77 | if __name__ == '__main__': 78 | fire.Fire(sample_model) 79 | 80 | -------------------------------------------------------------------------------- /src/interactive_conditional_samples.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import fire 4 | import json 5 | import os 6 | import numpy as np 7 | import tensorflow as tf 8 | 9 | import model, sample, encoder 10 | 11 | def interact_model( 12 | model_name='117M', 13 | seed=None, 14 | nsamples=1, 15 | batch_size=1, 16 | length=None, 17 | temperature=1, 18 | top_k=0, 19 | models_dir='models', 20 | ): 21 | """ 22 | Interactively run the model 23 | :model_name=117M : String, which model to use 24 | :seed=None : Integer seed for random number generators, fix seed to reproduce 25 | results 26 | :nsamples=1 : Number of samples to return total 27 | :batch_size=1 : Number of batches (only affects speed/memory). Must divide nsamples. 28 | :length=None : Number of tokens in generated text, if None (default), is 29 | determined by model hyperparameters 30 | :temperature=1 : Float value controlling randomness in boltzmann 31 | distribution. Lower temperature results in less random completions. As the 32 | temperature approaches zero, the model will become deterministic and 33 | repetitive. Higher temperature results in more random completions. 34 | :top_k=0 : Integer value controlling diversity. 1 means only 1 word is 35 | considered for each step (token), resulting in deterministic completions, 36 | while 40 means 40 words are considered at each step. 0 (default) is a 37 | special setting meaning no restrictions. 40 generally is a good value. 38 | :models_dir : path to parent folder containing model subfolders 39 | (i.e. 
contains the folder) 40 | """ 41 | models_dir = os.path.expanduser(os.path.expandvars(models_dir)) 42 | if batch_size is None: 43 | batch_size = 1 44 | assert nsamples % batch_size == 0 45 | 46 | enc = encoder.get_encoder(model_name, models_dir) 47 | hparams = model.default_hparams() 48 | with open(os.path.join(models_dir, model_name, 'hparams.json')) as f: 49 | hparams.override_from_dict(json.load(f)) 50 | 51 | if length is None: 52 | length = hparams.n_ctx // 2 53 | elif length > hparams.n_ctx: 54 | raise ValueError("Can't get samples longer than window size: %s" % hparams.n_ctx) 55 | 56 | with tf.Session(graph=tf.Graph()) as sess: 57 | context = tf.placeholder(tf.int32, [batch_size, None]) 58 | np.random.seed(seed) 59 | tf.set_random_seed(seed) 60 | output = sample.sample_sequence( 61 | hparams=hparams, length=length, 62 | context=context, 63 | batch_size=batch_size, 64 | temperature=temperature, top_k=top_k 65 | ) 66 | 67 | saver = tf.train.Saver() 68 | ckpt = tf.train.latest_checkpoint(os.path.join(models_dir, model_name)) 69 | saver.restore(sess, ckpt) 70 | 71 | while True: 72 | raw_text = input("Model prompt >>> ") 73 | while not raw_text: 74 | print('Prompt should not be empty!') 75 | raw_text = input("Model prompt >>> ") 76 | context_tokens = enc.encode(raw_text) 77 | generated = 0 78 | for _ in range(nsamples // batch_size): 79 | out = sess.run(output, feed_dict={ 80 | context: [context_tokens for _ in range(batch_size)] 81 | })[:, len(context_tokens):] 82 | for i in range(batch_size): 83 | generated += 1 84 | text = enc.decode(out[i]) 85 | print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40) 86 | print(text) 87 | print("=" * 80) 88 | 89 | if __name__ == '__main__': 90 | fire.Fire(interact_model) 91 | 92 | -------------------------------------------------------------------------------- /src/encoder.py: -------------------------------------------------------------------------------- 1 | """Byte pair encoding utilities""" 2 | 3 | import os 4 | import json 5 | import regex as re 6 | from functools import lru_cache 7 | 8 | @lru_cache() 9 | def bytes_to_unicode(): 10 | """ 11 | Returns list of utf-8 byte and a corresponding list of unicode strings. 12 | The reversible bpe codes work on unicode strings. 13 | This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. 14 | When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. 15 | This is a signficant percentage of your normal, say, 32K bpe vocab. 16 | To avoid that, we want lookup tables between utf-8 bytes and unicode strings. 17 | And avoids mapping to whitespace/control characters the bpe code barfs on. 18 | """ 19 | bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) 20 | cs = bs[:] 21 | n = 0 22 | for b in range(2**8): 23 | if b not in bs: 24 | bs.append(b) 25 | cs.append(2**8+n) 26 | n += 1 27 | cs = [chr(n) for n in cs] 28 | return dict(zip(bs, cs)) 29 | 30 | def get_pairs(word): 31 | """Return set of symbol pairs in a word. 32 | 33 | Word is represented as tuple of symbols (symbols being variable-length strings). 
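    For example, get_pairs(('h', 'e', 'l', 'l', 'o')) returns {('h', 'e'), ('e', 'l'), ('l', 'l'), ('l', 'o')}.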
34 | """ 35 | pairs = set() 36 | prev_char = word[0] 37 | for char in word[1:]: 38 | pairs.add((prev_char, char)) 39 | prev_char = char 40 | return pairs 41 | 42 | class Encoder: 43 | def __init__(self, encoder, bpe_merges, errors='replace'): 44 | self.encoder = encoder 45 | self.decoder = {v:k for k,v in self.encoder.items()} 46 | self.errors = errors # how to handle errors in decoding 47 | self.byte_encoder = bytes_to_unicode() 48 | self.byte_decoder = {v:k for k, v in self.byte_encoder.items()} 49 | self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) 50 | self.cache = {} 51 | 52 | # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions 53 | self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") 54 | 55 | def bpe(self, token): 56 | if token in self.cache: 57 | return self.cache[token] 58 | word = tuple(token) 59 | pairs = get_pairs(word) 60 | 61 | if not pairs: 62 | return token 63 | 64 | while True: 65 | bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf'))) 66 | if bigram not in self.bpe_ranks: 67 | break 68 | first, second = bigram 69 | new_word = [] 70 | i = 0 71 | while i < len(word): 72 | try: 73 | j = word.index(first, i) 74 | new_word.extend(word[i:j]) 75 | i = j 76 | except: 77 | new_word.extend(word[i:]) 78 | break 79 | 80 | if word[i] == first and i < len(word)-1 and word[i+1] == second: 81 | new_word.append(first+second) 82 | i += 2 83 | else: 84 | new_word.append(word[i]) 85 | i += 1 86 | new_word = tuple(new_word) 87 | word = new_word 88 | if len(word) == 1: 89 | break 90 | else: 91 | pairs = get_pairs(word) 92 | word = ' '.join(word) 93 | self.cache[token] = word 94 | return word 95 | 96 | def encode(self, text): 97 | bpe_tokens = [] 98 | for token in re.findall(self.pat, text): 99 | token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) 100 | bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' ')) 101 | return bpe_tokens 102 | 103 | def decode(self, tokens): 104 | text = ''.join([self.decoder[token] for token in tokens]) 105 | text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors) 106 | return text 107 | 108 | def get_encoder(model_name, models_dir): 109 | with open(os.path.join(models_dir, model_name, 'encoder.json'), 'r') as f: 110 | encoder = json.load(f) 111 | with open(os.path.join(models_dir, model_name, 'vocab.bpe'), 'r', encoding="utf-8") as f: 112 | bpe_data = f.read() 113 | bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]] 114 | return Encoder( 115 | encoder=encoder, 116 | bpe_merges=bpe_merges, 117 | ) 118 | -------------------------------------------------------------------------------- /src/model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.contrib.training import HParams 4 | 5 | def default_hparams(): 6 | return HParams( 7 | n_vocab=0, 8 | n_ctx=1024, 9 | n_embd=768, 10 | n_head=12, 11 | n_layer=12, 12 | ) 13 | 14 | def shape_list(x): 15 | """Deal with dynamic shape in tensorflow cleanly.""" 16 | static = x.shape.as_list() 17 | dynamic = tf.shape(x) 18 | return [dynamic[i] if s is None else s for i, s in enumerate(static)] 19 | 20 | def softmax(x, axis=-1): 21 | x = x - tf.reduce_max(x, axis=axis, keepdims=True) 22 | ex = tf.exp(x) 23 | return ex / tf.reduce_sum(ex, axis=axis, keepdims=True) 24 | 25 | def 
gelu(x): 26 | return 0.5*x*(1+tf.tanh(np.sqrt(2/np.pi)*(x+0.044715*tf.pow(x, 3)))) 27 | 28 | def norm(x, scope, *, axis=-1, epsilon=1e-5): 29 | """Normalize to mean = 0, std = 1, then do a diagonal affine transform.""" 30 | with tf.variable_scope(scope): 31 | n_state = x.shape[-1].value 32 | g = tf.get_variable('g', [n_state], initializer=tf.constant_initializer(1)) 33 | b = tf.get_variable('b', [n_state], initializer=tf.constant_initializer(0)) 34 | u = tf.reduce_mean(x, axis=axis, keepdims=True) 35 | s = tf.reduce_mean(tf.square(x-u), axis=axis, keepdims=True) 36 | x = (x - u) * tf.rsqrt(s + epsilon) 37 | x = x*g + b 38 | return x 39 | 40 | def split_states(x, n): 41 | """Reshape the last dimension of x into [n, x.shape[-1]/n].""" 42 | *start, m = shape_list(x) 43 | return tf.reshape(x, start + [n, m//n]) 44 | 45 | def merge_states(x): 46 | """Smash the last two dimensions of x into a single dimension.""" 47 | *start, a, b = shape_list(x) 48 | return tf.reshape(x, start + [a*b]) 49 | 50 | def conv1d(x, scope, nf, *, w_init_stdev=0.02): 51 | with tf.variable_scope(scope): 52 | *start, nx = shape_list(x) 53 | w = tf.get_variable('w', [1, nx, nf], initializer=tf.random_normal_initializer(stddev=w_init_stdev)) 54 | b = tf.get_variable('b', [nf], initializer=tf.constant_initializer(0)) 55 | c = tf.reshape(tf.matmul(tf.reshape(x, [-1, nx]), tf.reshape(w, [-1, nf]))+b, start+[nf]) 56 | return c 57 | 58 | def attention_mask(nd, ns, *, dtype): 59 | """1's in the lower triangle, counting from the lower right corner. 60 | 61 | Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs. 62 | """ 63 | i = tf.range(nd)[:,None] 64 | j = tf.range(ns) 65 | m = i >= j - ns + nd 66 | return tf.cast(m, dtype) 67 | 68 | 69 | def attn(x, scope, n_state, *, past, hparams): 70 | assert x.shape.ndims == 3 # Should be [batch, sequence, features] 71 | assert n_state % hparams.n_head == 0 72 | if past is not None: 73 | assert past.shape.ndims == 5 # Should be [batch, 2, heads, sequence, features], where 2 is [k, v] 74 | 75 | def split_heads(x): 76 | # From [batch, sequence, features] to [batch, heads, sequence, features] 77 | return tf.transpose(split_states(x, hparams.n_head), [0, 2, 1, 3]) 78 | 79 | def merge_heads(x): 80 | # Reverse of split_heads 81 | return merge_states(tf.transpose(x, [0, 2, 1, 3])) 82 | 83 | def mask_attn_weights(w): 84 | # w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst. 
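        # Causal mask: each dst position may only attend to src positions at or before it
        # (counted from the lower-right corner, so cached `past` keys stay visible);
        # masked entries are pushed to a large negative value before the softmax.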
85 | _, _, nd, ns = shape_list(w) 86 | b = attention_mask(nd, ns, dtype=w.dtype) 87 | b = tf.reshape(b, [1, 1, nd, ns]) 88 | w = w*b - tf.cast(1e10, w.dtype)*(1-b) 89 | return w 90 | 91 | def multihead_attn(q, k, v): 92 | # q, k, v have shape [batch, heads, sequence, features] 93 | w = tf.matmul(q, k, transpose_b=True) 94 | w = w * tf.rsqrt(tf.cast(v.shape[-1].value, w.dtype)) 95 | 96 | w = mask_attn_weights(w) 97 | w = softmax(w) 98 | a = tf.matmul(w, v) 99 | return a 100 | 101 | with tf.variable_scope(scope): 102 | c = conv1d(x, 'c_attn', n_state*3) 103 | q, k, v = map(split_heads, tf.split(c, 3, axis=2)) 104 | present = tf.stack([k, v], axis=1) 105 | if past is not None: 106 | pk, pv = tf.unstack(past, axis=1) 107 | k = tf.concat([pk, k], axis=-2) 108 | v = tf.concat([pv, v], axis=-2) 109 | a = multihead_attn(q, k, v) 110 | a = merge_heads(a) 111 | a = conv1d(a, 'c_proj', n_state) 112 | return a, present 113 | 114 | 115 | def mlp(x, scope, n_state, *, hparams): 116 | with tf.variable_scope(scope): 117 | nx = x.shape[-1].value 118 | h = gelu(conv1d(x, 'c_fc', n_state)) 119 | h2 = conv1d(h, 'c_proj', nx) 120 | return h2 121 | 122 | 123 | def block(x, scope, *, past, hparams): 124 | with tf.variable_scope(scope): 125 | nx = x.shape[-1].value 126 | a, present = attn(norm(x, 'ln_1'), 'attn', nx, past=past, hparams=hparams) 127 | x = x + a 128 | m = mlp(norm(x, 'ln_2'), 'mlp', nx*4, hparams=hparams) 129 | x = x + m 130 | return x, present 131 | 132 | def past_shape(*, hparams, batch_size=None, sequence=None): 133 | return [batch_size, hparams.n_layer, 2, hparams.n_head, sequence, hparams.n_embd // hparams.n_head] 134 | 135 | def expand_tile(value, size): 136 | """Add a new axis of given size.""" 137 | value = tf.convert_to_tensor(value, name='value') 138 | ndims = value.shape.ndims 139 | return tf.tile(tf.expand_dims(value, axis=0), [size] + [1]*ndims) 140 | 141 | def positions_for(tokens, past_length): 142 | batch_size = tf.shape(tokens)[0] 143 | nsteps = tf.shape(tokens)[1] 144 | return expand_tile(past_length + tf.range(nsteps), batch_size) 145 | 146 | 147 | def model(hparams, X, past=None, scope='model', reuse=False): 148 | with tf.variable_scope(scope, reuse=reuse): 149 | results = {} 150 | batch, sequence = shape_list(X) 151 | 152 | wpe = tf.get_variable('wpe', [hparams.n_ctx, hparams.n_embd], 153 | initializer=tf.random_normal_initializer(stddev=0.01)) 154 | wte = tf.get_variable('wte', [hparams.n_vocab, hparams.n_embd], 155 | initializer=tf.random_normal_initializer(stddev=0.02)) 156 | past_length = 0 if past is None else tf.shape(past)[-2] 157 | h = tf.gather(wte, X) + tf.gather(wpe, positions_for(X, past_length)) 158 | 159 | # Transformer 160 | presents = [] 161 | pasts = tf.unstack(past, axis=1) if past is not None else [None] * hparams.n_layer 162 | assert len(pasts) == hparams.n_layer 163 | for layer, past in enumerate(pasts): 164 | h, present = block(h, 'h%d' % layer, past=past, hparams=hparams) 165 | presents.append(present) 166 | results['present'] = tf.stack(presents, axis=1) 167 | h = norm(h, 'ln_f') 168 | 169 | # Language model loss. Do tokens