├── .gitignore ├── 001_test_list.sh ├── 002_test_completion.sh ├── 003_completions.sh ├── LICENSE ├── README.md ├── download_model.py ├── openai_server ├── __init__.py ├── __main__.py └── gpt │ ├── encoder.py │ ├── generate_unconditional_samples.py │ ├── interactive_conditional_samples.py │ ├── model.py │ └── sample.py ├── prod.sh ├── requirements.txt └── start.sh /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | __pycache__ 3 | *.pyc 4 | /models 5 | -------------------------------------------------------------------------------- /001_test_list.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export OPENAI_API_KEY="${OPENAI_API_KEY:-stub}" 3 | set -ex 4 | export PORT="${PORT:-9000}" 5 | export OPENAI_API_BASE="${OPENAI_API_BASE:-http://localhost:${PORT}}" 6 | #export OPENAI_API_BASE="${OPENAI_API_BASE:-https://api.openai.com}" 7 | #export OPENAI_LOG="${OPENAI_LOG:-debug}" 8 | 9 | set +ex 10 | #export OPENAI_LOG="${OPENAI_LOG:-debug}" 11 | set -x 12 | exec openai api engines.list "$@" 13 | -------------------------------------------------------------------------------- /002_test_completion.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export OPENAI_API_KEY="${OPENAI_API_KEY:-stub}" 3 | set -ex 4 | export PORT="${PORT:-9000}" 5 | export OPENAI_API_BASE="${OPENAI_API_BASE:-http://localhost:${PORT}}" 6 | #export OPENAI_API_BASE="${OPENAI_API_BASE:-https://api.openai.com}" 7 | #export OPENAI_LOG="${OPENAI_LOG:-debug}" 8 | 9 | prompt="${1:-Hello, my name is}" 10 | 11 | if [ -e "$prompt" ] 12 | then 13 | prompt="$(cat "$prompt")" 14 | fi 15 | 16 | ENGINE="${ENGINE:-${E:-davinci}}" 17 | TEMPERATURE="${TEMPERATURE:-${T:-0.9}}" 18 | MAX_TOKENS="${MAX_TOKENS:-${M:-12}}" 19 | N="${N:-1}" 20 | set +ex 21 | shift 1 22 | 23 | set -x 24 | exec openai api completions.create -e "${ENGINE}" -t "${TEMPERATURE}" -M "${MAX_TOKENS}" -n "${N}" -p "${prompt}" "$@" 25 | -------------------------------------------------------------------------------- /003_completions.sh: -------------------------------------------------------------------------------- 1 | export OPENAI_API_KEY="${OPENAI_API_KEY:-stub}" 2 | set -x 3 | export PORT="${PORT:-9000}" 4 | export OPENAI_API_BASE="${OPENAI_API_BASE:-http://localhost:${PORT}}" 5 | #export OPENAI_API_BASE="${OPENAI_API_BASE:-https://api.openai.com}" 6 | #export OPENAI_LOG="${OPENAI_LOG:-debug}" 7 | set +x 8 | 9 | prompt="${1:-Hello, my name is}" 10 | while true 11 | do 12 | prompt="$(openai api completions.create -e davinci -t 0.6 -M 32 -n 1 -p "$prompt")" 13 | printf "\033c" 14 | #echo '----------' 15 | echo "$prompt" 16 | done 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | =============================================================================== 2 | openai-server -- An implementation of the OpenAI API 3 | 4 | Copyright (C) 2020 Shawn Presser. All rights reserved. 
5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in 14 | all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 | THE SOFTWARE. 23 | 24 | [ MIT license: http://www.opensource.org/licenses/mit-license.php ] 25 | 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # openai-server 2 | 3 | `openai-server` is an implementation of the [OpenAI API](https://openai.com/blog/openai-api/). 4 | 5 | Specifically, we implement `/v1/engines/list` and `/v1/engines/{model_name}/completions` endpoints. 6 | 7 | Both endpoints are mostly feature-complete, with a few differences. The JSON response is identical; any library that works with the OpenAI API will probably work with this. 8 | 9 | To get started, see the [quickstart](#Quickstart) or the [examples](#Examples) or the [JavaScript API](https://github.com/shawwn/tensorfork-openai-api). 10 | 11 | ## Contact 12 | 13 | - Twitter: [@theshawwn](https://twitter.com/theshawwn) 14 | - HN: [sillysaurusx](https://news.ycombinator.com/item?id=23346972) 15 | - ML discord: [https://discordapp.com/invite/x52Xz3y](https://discordapp.com/invite/x52Xz3y) 16 | - Support me on patreon: [patreon.com/shawwn](https://patreon.com/shawwn) 17 | 18 | ## Quickstart 19 | 20 | ```sh 21 | # grab the code. 22 | git clone https://github.com/shawwn/openai-server 23 | cd openai-server 24 | 25 | # install dependencies. 26 | pip3 install -r requirements.txt 27 | 28 | # grab a gpt-2 model. 29 | python3 download_model.py 117M # or 345M, 774M, 1558M 30 | 31 | # start the server. 32 | MODELS=117M bash prod.sh 33 | 34 | # in a new terminal, ask for a completion. 35 | bash 002_test_completion.sh 'Hello there. My name is' 36 | ``` 37 | 38 | Your server is now serving the OpenAI API at localhost:9000. (You can change the port via `export PORT=8000`) 39 | 40 | ## Examples 41 | 42 | ### Generating completions via the `openai` SDK 43 | 44 | You can grab some completions using the official `openai` command-line tool: 45 | ```sh 46 | $ OPENAI_API_BASE=http://localhost:9000 openai api completions.create -e davinci -p 'Hello, world' -t 0.8 -M 16 -n 4 47 | ===== Completion 0 ===== 48 | Hello, world. It seems like a good idea to make a living. The fact that it 49 | ===== Completion 1 ===== 50 | Hello, world. 
This is not the first time you're seeing the same thing at any given 51 | ===== Completion 2 ===== 52 | Hello, world, please do my best to continue the development of Monad and its conforming 53 | ===== Completion 3 ===== 54 | Hello, world controlled enemy. 55 | 56 | "Be careful. We have come across a near total 57 | ``` 58 | 59 | 60 | ### Continuously dump completions to terminal 61 | ```sh 62 | $ bash 003_completions.sh 'Yo dawg, we implemented OpenAI API' 63 | Yo dawg, we implemented OpenAI API. Now, we have the ability to connect to Signal, a cryptographic data store. 64 | 65 | We can now make this secure by using new kid on the block chain, OpenAI. 66 | 67 | OpenAI is the new block chain protocol for the internet. This is a major milestone. As the internet becomes more open and open for everybody, it is important for us to have a robust, high-quality blockchain. It is also important that we never create an untraceable chain. The blockchain is the only way to guarantee that everyone has the same access to the network. 68 | 69 | We are an open consortium and we believe that the blockchain is the bridge between the internet and the rest of the world. We're committed to this project. We believe that the blockchain is a bridge between the internet and 70 | ^C 71 | ``` 72 | 73 | ### Fetch the JSON endpoint manually 74 | ```sh 75 | $ curl 'http://localhost:9000/v1/engines/117M/completions?prompt=Hello,%20my%20name%20is&max_tokens=32&n=4&temperature=0.9&echo=true' 76 | { 77 | "choices": [ 78 | { 79 | "finish-reason": "length", 80 | "index": 0, 81 | "logprobs": null, 82 | "text": "Hello, my name is Loium Chazz, and I have been far from satisfied with your departure. But I will, at least by some chance, give you permission to decide for" 83 | }, 84 | { 85 | "finish-reason": "length", 86 | "index": 1, 87 | "logprobs": null, 88 | "text": "Hello, my name is Tim and my name is Jodie. Yours, Tom.\n\nTim: Oh hello, my name is Tim.\n\nJB: Where?'" 89 | }, 90 | { 91 | "finish-reason": "length", 92 | "index": 2, 93 | "logprobs": null, 94 | "text": "Hello, my name is Rosen Sylvan. That's right, Buck Paoli, who was a member of the Board of Governors for George W. Bush in the 2009 Democratic primary\u2014" 95 | }, 96 | { 97 | "finish-reason": "length", 98 | "index": 3, 99 | "logprobs": null, 100 | "text": "Hello, my name is Nick Martens, I am an English-speaking Canadian, University of Toronto, Mississauga, Canada. I work in a computer software company located in Canada." 101 | } 102 | ], 103 | "created": 1601701785.777768, 104 | "id": "cmpl-3qN8kwW1Ya7_qxWz4h8wuIzN", 105 | "model": "117M", 106 | "object": "text_completion" 107 | } 108 | ``` 109 | 110 | ### Explore via your browser 111 | 112 | You can [open the JSON endpoint in your browser](http://localhost:9000/v1/engines/117M/completions?prompt=Hello,%20my%20name%20is&max_tokens=32&n=4&temperature=0.9&echo=true) and start playing around with the query params. 113 | 114 | ### A simple bash script for dumping completions 115 | 116 | ```sh 117 | $ T=0.8 M=32 bash 002_test_completion.sh 'Hello, my name is' 118 | Hello, my name is Plato and, like many of you, I am very happy with the pre-release. 119 | 120 | The primary goal of the pre-release was to provide 121 | ``` 122 | 123 | The first argument to `002_test_completion.sh` is the prompt: 124 | ```sh 125 | bash 002_test_completion.sh 'Hello there. 
My name is' 126 | ``` 127 | 128 | You can set the temperature using `T=0.8` and the token count using `M=32`: 129 | ```sh 130 | T=0.8 M=32 bash 002_test_completion.sh 'Hello there. My name is' 131 | ``` 132 | 133 | To read a prompt from a file, simply pass in the filename. If the first argument is a valid filename, the file becomes the prompt: 134 | ```sh 135 | T=0.8 M=32 bash 002_test_completion.sh README.md 136 | ``` 137 | 138 | If the prompt is too long, the last `1023 - M` tokens of the prompt are used. **Note**: This means if you request 500 tokens, only the last `1023 - 500 = 523` tokens of the prompt are used. Therefore, to let GPT see as many prompt tokens as possible, request a small number of tokens (e.g. 16). 139 | 140 | ### Setting up everything from scratch 141 | 142 | A complete example of how to go from zero code to a fully functional OpenAI API server: 143 | 144 | ```sh 145 | # grab the code. 146 | git clone https://github.com/shawwn/openai-server 147 | cd openai-server 148 | 149 | # install dependencies. 150 | pip3 install -r requirements.txt 151 | 152 | # grab all models (requires ~8GB of disk space; if low, just download 117M, which only requires 550MB) 153 | python3 download_model.py 117M 154 | python3 download_model.py 345M 155 | python3 download_model.py 774M 156 | python3 download_model.py 1558M 157 | 158 | # then, do *one* of the following: 159 | 160 | # ...serve one specific model: 161 | MODELS=117M bash prod.sh 162 | 163 | # ...or serve multiple models: 164 | MODELS=1558M,117M bash prod.sh 165 | 166 | # ...or serve all models you've downloaded (the default): 167 | bash prod.sh 168 | ``` 169 | 170 | The server listens on port 9000 by default. You can change it via `PORT`: 171 | ```sh 172 | PORT=8080 bash prod.sh 173 | ``` 174 | 175 | Now that the server is running, you can start making API requests. See [examples](#Examples). 176 | 177 | ## Notes 178 | 179 | ### A warning about frequency_penalty 180 | 181 | For 1558M, the best results seem to come from `temperature=0.6` and `frequency_penalty=0.9`: 182 | ```sh 183 | curl 'http://localhost:9000/v1/engines/1558M/completions?prompt=Hello,%20my%20name%20is&max_tokens=32&n=4&temperature=0.4&frequency_penalty=0.9&echo=true' 184 | ``` 185 | 186 | But beware: you shouldn't use `frequency_penalty` unless your model is the largest (1558M, commonly known as "1.5B"). For some reason, `frequency_penalty` causes the output to be scrambled when the model is smaller than 1558M. 187 | 188 | ### Running in production 189 | 190 | For production usage, consider running the server via the following command: 191 | 192 | ```sh 193 | while true; do MODELS=117M bash prod.sh ; sleep 20 ; done 194 | ``` 195 | 196 | That way, if the server terminates for any reason, it will automatically restart. 197 | 198 | For endpoint monitoring, I recommend [updown.io](https://updown.io/). 199 | 200 | ## Community 201 | 202 | ### Join the ML Discord 203 | 204 | If you're an ML enthusiast, join the [ML Discord](https://discordapp.com/invite/x52Xz3y).
205 | There are ~800 members, with ~120 online at any given time: 206 | 207 | ![image](https://user-images.githubusercontent.com/59632/84269906-bc7d2080-aade-11ea-8b4e-f78412855d43.png) 208 | 209 | There are a variety of interesting channels: 210 | 211 | - `#papers` for pointing out interesting research papers 212 | - `#research` for discussing ML research 213 | - `#show` and `#samples` for showing off your work 214 | - `#hardware` for hardware enthusiasts 215 | - `#ideas` for brainstorming 216 | - `#tensorflow` and `#pytorch` 217 | - `#cats`, `#doggos`, and of course `#memes` 218 | - Quite a few more. 219 | 220 | ## Support me 221 | 222 | *If you found this library helpful, consider [joining my patreon](https://patreon.com/shawwn).* 223 | 224 | -------------------------------------------------------------------------------- /download_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import requests 4 | from tqdm import tqdm 5 | 6 | if len(sys.argv) != 2: 7 | print('You must enter the model name as a parameter, e.g.: download_model.py 124M') 8 | sys.exit(1) 9 | 10 | model = sys.argv[1] 11 | 12 | subdir = os.path.join('models', model) 13 | if not os.path.exists(subdir): 14 | os.makedirs(subdir) 15 | subdir = subdir.replace('\\','/') # needed for Windows 16 | 17 | name = 'model.ckpt' 18 | 19 | for filename in ['checkpoint','hparams.json','encoder.json','vocab.bpe','model.ckpt.index', 'model.ckpt.meta', 'model.ckpt.data-00000-of-00001']: 20 | 21 | filename = filename.replace('model.ckpt', name) 22 | 23 | bucket = os.environ.get('BUCKET', 'gpt-2') 24 | path = os.environ.get('MODEL_DIR', 'gs://{bucket}/{subdir}'.format(bucket=bucket, subdir=subdir)).lstrip('gs:').strip('/') 25 | url = "https://openaipublic.blob.core.windows.net/" + path + "/" + filename 26 | r = requests.get(url, stream=True) 27 | if not r.ok and filename == 'checkpoint': 28 | raise FileNotFoundError(url) 29 | 30 | if not r.ok: 31 | continue 32 | 33 | with open(os.path.join(subdir, filename), 'wb') as f: 34 | file_size = int(r.headers["content-length"]) 35 | chunk_size = 1000 36 | with tqdm(ncols=100, desc="Fetching " + filename, total=file_size, unit_scale=True) as pbar: 37 | # 1k for chunk_size, since Ethernet packet size is around 1500 bytes 38 | for chunk in r.iter_content(chunk_size=chunk_size): 39 | f.write(chunk) 40 | pbar.update(chunk_size) 41 | if filename == 'checkpoint': 42 | with open(os.path.join(subdir, filename)) as f: 43 | for line in f: 44 | if line.startswith('model_checkpoint_path'): 45 | name = line.split(':', 1)[1].strip().strip('"') 46 | 47 | 48 | -------------------------------------------------------------------------------- /openai_server/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import tqdm 4 | import traceback 5 | import time 6 | import base64 7 | import secrets 8 | sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'gpt')) 9 | from json import loads, dumps 10 | from json import load as json_load 11 | 12 | from sanic import Sanic 13 | from sanic.response import json, text 14 | from sanic_cors import CORS, cross_origin 15 | 16 | from pprint import pprint as pp 17 | 18 | from openai_server.gpt import sample, model, encoder 19 | 20 | import tensorflow as tf 21 | import ftfy 22 | 23 | from tokenizers import Tokenizer 24 | from transformers import GPT2TokenizerFast 25 | 26 | 27 | class GPTEngine: 28 | def __init__(self, api, 
model_name, batch_size=1): 29 | self.api = api 30 | self.id = model_name 31 | self.ckpt = tf.train.latest_checkpoint(os.path.join(api.model_path, model_name)) 32 | if self.ckpt is None: 33 | raise ValueError("Couldn't load checkpoint for {model_name} from {path}".format(model_name=model_name, path=os.path.join(api.model_path, model_name))) 34 | self.graph = tf.Graph() 35 | self.config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=True) 36 | self.session = tf.Session(graph=self.graph, config=self.config) 37 | #self.encoder = encoder.get_encoder(model_name, self.api.model_path) 38 | self.encoder = GPT2TokenizerFast.from_pretrained("gpt2") 39 | self.hparams = model.default_hparams() 40 | with open(os.path.join(self.api.model_path, model_name, 'hparams.json')) as f: 41 | params = json_load(f) 42 | self.hparams.override_from_dict(params) 43 | with self.session.as_default() as sess, self.graph.as_default() as graph: 44 | pp(self.session.list_devices()) 45 | if 'CUDA_VISIBLE_DEVICES' in os.environ: 46 | print('Using /gpu:0 on device {}'.format(os.environ['CUDA_VISIBLE_DEVICES'])) 47 | with tf.device('/gpu:0' if 'CUDA_VISIBLE_DEVICES' in os.environ else None): 48 | self.batch_size = batch_size 49 | self.context = tf.placeholder(tf.int32, [self.batch_size, None], name="context") 50 | self.length = tf.placeholder(tf.int32, (), name="length") 51 | self.temperature = tf.placeholder(tf.float32, (), name="temperature") 52 | self.top_k = tf.placeholder(tf.int32, (), name="top_k") 53 | self.top_p = tf.placeholder(tf.float32, (), name="top_p") 54 | self.frequency_penalty = tf.placeholder(tf.float32, (), name="frequency_penalty") 55 | #np.random.seed(seed) 56 | #tf.set_random_seed(seed) 57 | self.output = sample.sample_sequence( 58 | hparams=self.hparams, 59 | length=self.length, 60 | context=self.context, 61 | batch_size=self.batch_size, 62 | temperature=self.temperature, 63 | top_k=self.top_k, 64 | top_p=self.top_p, 65 | frequency_penalty=self.frequency_penalty, 66 | ) 67 | var_list = tf.trainable_variables() 68 | self.saver = tf.train.Saver(var_list=var_list) 69 | for v in var_list: 70 | print(self.ckpt, v) 71 | pp(self.hparams) 72 | print('Restoring from {!r}'.format(self.ckpt)) 73 | self.saver.restore(sess, self.ckpt) 74 | 75 | 76 | def fix(self, text): 77 | fixed = ftfy.fix_text(text) 78 | return fixed 79 | 80 | 81 | # GPT2Tokenizer and Tokenizer has different ways of fetching token ids 82 | def encode(self, text, encoder=None): 83 | if encoder is None: 84 | encoder = self.encoder 85 | result = encoder.encode(text) 86 | if isinstance(result, list): 87 | return result 88 | return result.ids 89 | 90 | 91 | def completion(self, prompt, n=None, max_tokens=None, logprobs=None, stream=False, temperature=None, top_p=None, top_k=None, echo=None, frequency_penalty=None, best_of=None, stop=None, **kws): 92 | if temperature is None: 93 | temperature = 0.9 94 | if top_p is None: 95 | top_p = 1.0 96 | if top_k is None: 97 | top_k = 0 98 | if max_tokens is None: 99 | max_tokens = 16 100 | if max_tokens > int(os.environ.get('MAX_TOKENS', '500')): 101 | max_tokens = int(os.environ.get('MAX_TOKENS', '500')) 102 | if n is None: 103 | n = 1 104 | if n >= int(os.environ.get('MAX_N', '4')): 105 | n = int(os.environ.get('MAX_N', '4')) # cap to 4 choices 106 | if echo is None: 107 | echo = False 108 | if frequency_penalty is None or frequency_penalty <= 0.0: 109 | frequency_penalty = 1.0 110 | if stop is not None: 111 | if isinstance(stop, str): 112 | stop = [stop] 113 | print('Stop: {!r}'.format(stop)) 
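# NOTE: logprobs, stream, and best_of are accepted for API compatibility but are currently ignored (logprobs is always null in the response). The rest of this method does the actual sampling: ftfy-fix and encode the prompt, drop tokens from the front until prompt + max_tokens fits within the model's context window (hparams.n_ctx), run the sampling graph n times, truncate each completion at the first stop sequence (finish-reason 'stop' vs. 'length'), prepend the prompt when echo is set, and yield one OpenAI-style choice dict per completion.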
114 | if len(kws) > 0: 115 | print('Got extra keywords: {!r}'.format(kws)) 116 | prompt = self.fix(prompt) 117 | with self.session.as_default() as sess, self.graph.as_default() as graph: 118 | tokens = self.encode(prompt) 119 | while len(tokens) + max_tokens >= self.hparams.n_ctx: 120 | tokens = tokens[1:] 121 | length = max_tokens 122 | for i in range(n): 123 | params = { 124 | self.temperature: temperature, 125 | self.top_p: top_p, 126 | self.top_k: top_k, 127 | self.frequency_penalty: frequency_penalty, 128 | self.length: length, 129 | } 130 | print(params) 131 | result = self.session.run(self.output, {self.context: [tokens], **params}) 132 | result_tokens = result[0] 133 | completion = result_tokens[len(tokens):] 134 | completion_text = self.encoder.decode(completion) 135 | finish_reason = 'length' 136 | if stop is not None: 137 | for s in stop: 138 | if s in completion_text: 139 | completion_text = completion_text.split(s, 1)[0] 140 | finish_reason = 'stop' 141 | if echo: 142 | text = prompt + completion_text 143 | else: 144 | text = completion_text 145 | print(repr(text)) 146 | yield {'index': i, 'logprobs': None, 'text': text, 'finish-reason': finish_reason} 147 | 148 | class API: 149 | def __init__(self, model_path=None): 150 | if model_path is None: 151 | model_path = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'models') 152 | self.model_path = model_path 153 | self.models = [] 154 | self.engines = {} 155 | if 'MODELS' in os.environ: 156 | models = os.environ['MODELS'].split(',') 157 | else: 158 | models = os.listdir(self.model_path) 159 | for model in tqdm.tqdm(models): 160 | try: 161 | engine = GPTEngine(api=self, model_name=model) 162 | self.engines[model] = engine 163 | self.models.append(model) 164 | except: 165 | traceback.print_exc() 166 | pp(self.engines) 167 | pp(self.models) 168 | 169 | 170 | def engines_list(self): 171 | for model in self.models: 172 | yield { 173 | "id": model, 174 | "object": "engine", 175 | "owner": "openai", 176 | "ready": True, 177 | } 178 | 179 | api = API() 180 | 181 | app = Sanic() 182 | CORS(app) 183 | 184 | 185 | def log_request(request): 186 | #import pdb; pdb.set_trace() 187 | headers = dict(list(request.headers.items())) 188 | if 'authorization' in headers: 189 | del headers['authorization'] 190 | headers['x-openai-client-user-agent'] = loads(headers.get('x-openai-client-user-agent', '{}')) 191 | props = {} 192 | props['url'] = request.url 193 | props['method'] = request.method 194 | props['headers'] = headers 195 | props['request'] = request.json 196 | pp(props) 197 | #print(request.json) 198 | 199 | 200 | @app.route('/v1/engines') 201 | async def v1_engines_list(request): 202 | log_request(request) 203 | res = {"object": "list", "data": []} 204 | for result in api.engines_list(): 205 | res["data"].append(result) 206 | return json(res) 207 | return json({ 208 | "data": [ 209 | { 210 | "id": "ada", 211 | "object": "engine", 212 | "owner": "openai", 213 | "ready": True 214 | }, 215 | { 216 | "id": "ada-beta", 217 | "object": "engine", 218 | "owner": "openai", 219 | "ready": True 220 | }, 221 | { 222 | "id": "babbage", 223 | "object": "engine", 224 | "owner": "openai", 225 | "ready": True 226 | }, 227 | { 228 | "id": "babbage-beta", 229 | "object": "engine", 230 | "owner": "openai", 231 | "ready": True 232 | }, 233 | { 234 | "id": "curie", 235 | "object": "engine", 236 | "owner": "openai", 237 | "ready": True 238 | }, 239 | { 240 | "id": "curie-beta", 241 | "object": "engine", 242 | "owner": "openai", 243 | 
"ready": True 244 | }, 245 | { 246 | "id": "davinci", 247 | "object": "engine", 248 | "owner": "openai", 249 | "ready": True 250 | }, 251 | { 252 | "id": "davinci-beta", 253 | "object": "engine", 254 | "owner": "openai", 255 | "ready": True 256 | } 257 | ], 258 | "object": "list" 259 | }) 260 | 261 | 262 | def random_id(prefix, nbytes=18): 263 | token = secrets.token_bytes(nbytes) 264 | return prefix + '-' + base64.urlsafe_b64encode(token).decode('utf8') 265 | 266 | 267 | def number(x): 268 | if isinstance(x, str): 269 | try: 270 | x = int(x) 271 | except ValueError: 272 | try: 273 | x = float(x) 274 | except ValueError: 275 | pass 276 | return x 277 | 278 | 279 | def json_pretty_dumps(x): 280 | return dumps(x, sort_keys=True, indent=2) 281 | 282 | 283 | from urllib import parse 284 | 285 | @app.route('/v1/engines//completions', methods=['POST', 'GET']) 286 | async def v1_engines_completions(request, engine_name): 287 | log_request(request) 288 | kws = request.json 289 | if kws is None: 290 | url, query = request.url.split('?', 1) if '?' in request.url else (request.url, '') 291 | kws = dict(parse.parse_qsl(query)) 292 | kws = {k: number(v) for k, v in kws.items()} 293 | pp(kws) 294 | engine = None 295 | if engine_name in api.engines: 296 | engine = api.engines[engine_name] 297 | else: 298 | # rather than throw an error when someone attempts to use an 299 | # invalid engine, silently fall back to any valid engine for 300 | # simplicity. E.g. if they try to request 'davinci' but you're 301 | # serving 117M, then automatically fall back to 117M. 302 | for info in api.engines_list(): 303 | print('Warning: attempted to use invalid enngine {!r}; falling back to engine {!r}'.format(engine_name, info['id'])) 304 | engine = api.engines[info['id']] 305 | break 306 | if engine is None: 307 | raise RuntimeError("Not serving any models. Try running `python3 download_model.py 117M` and be sure to `export MODELS=117M` before starting the server.") 308 | 309 | choices = [] 310 | for choice in engine.completion(**kws): 311 | choices.append(choice) 312 | id_ = random_id("cmpl") 313 | return json({"id": id_, "object": "text_completion", "created": time.time(), "model": engine.id, "choices": choices}, dumps=json_pretty_dumps) 314 | #return json({"id": "cmpl-Wt5z1RZglyDHHl0SnSvKWVzA", "object": "text_completion", "created": 1599616871, "model": "davinci:2020-05-03", "choices": [{"text": "Test.SetLayerPropertiesWithNonContainedInvisible (", "index": 0, "logprobs": None, "finish_reason": "length"}]}) 315 | 316 | if __name__ == '__main__': 317 | args = sys.argv[1:] 318 | port = int(args[0] if len(args) > 0 else os.environ.get('PORT', '9000')) 319 | app.run(host='0.0.0.0', port=port) 320 | 321 | -------------------------------------------------------------------------------- /openai_server/__main__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | from . 
import app 4 | 5 | if __name__ == '__main__': 6 | args = sys.argv[1:] 7 | port = int(args[0] if len(args) > 0 else os.environ.get('PORT', '9000')) 8 | app.run(host='0.0.0.0', port=port) 9 | 10 | -------------------------------------------------------------------------------- /openai_server/gpt/encoder.py: -------------------------------------------------------------------------------- 1 | """Byte pair encoding utilities""" 2 | 3 | import os 4 | import json 5 | import regex as re 6 | from functools import lru_cache 7 | 8 | @lru_cache() 9 | def bytes_to_unicode(): 10 | """ 11 | Returns list of utf-8 byte and a corresponding list of unicode strings. 12 | The reversible bpe codes work on unicode strings. 13 | This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. 14 | When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. 15 | This is a signficant percentage of your normal, say, 32K bpe vocab. 16 | To avoid that, we want lookup tables between utf-8 bytes and unicode strings. 17 | And avoids mapping to whitespace/control characters the bpe code barfs on. 18 | """ 19 | bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) 20 | cs = bs[:] 21 | n = 0 22 | for b in range(2**8): 23 | if b not in bs: 24 | bs.append(b) 25 | cs.append(2**8+n) 26 | n += 1 27 | cs = [chr(n) for n in cs] 28 | return dict(zip(bs, cs)) 29 | 30 | def get_pairs(word): 31 | """Return set of symbol pairs in a word. 32 | 33 | Word is represented as tuple of symbols (symbols being variable-length strings). 34 | """ 35 | pairs = set() 36 | prev_char = word[0] 37 | for char in word[1:]: 38 | pairs.add((prev_char, char)) 39 | prev_char = char 40 | return pairs 41 | 42 | class Encoder: 43 | def __init__(self, encoder, bpe_merges, errors='replace'): 44 | self.encoder = encoder 45 | self.decoder = {v:k for k,v in self.encoder.items()} 46 | self.errors = errors # how to handle errors in decoding 47 | self.byte_encoder = bytes_to_unicode() 48 | self.byte_decoder = {v:k for k, v in self.byte_encoder.items()} 49 | self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) 50 | self.cache = {} 51 | 52 | # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions 53 | self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") 54 | 55 | def bpe(self, token): 56 | if token in self.cache: 57 | return self.cache[token] 58 | word = tuple(token) 59 | pairs = get_pairs(word) 60 | 61 | if not pairs: 62 | return token 63 | 64 | while True: 65 | bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf'))) 66 | if bigram not in self.bpe_ranks: 67 | break 68 | first, second = bigram 69 | new_word = [] 70 | i = 0 71 | while i < len(word): 72 | try: 73 | j = word.index(first, i) 74 | new_word.extend(word[i:j]) 75 | i = j 76 | except: 77 | new_word.extend(word[i:]) 78 | break 79 | 80 | if word[i] == first and i < len(word)-1 and word[i+1] == second: 81 | new_word.append(first+second) 82 | i += 2 83 | else: 84 | new_word.append(word[i]) 85 | i += 1 86 | new_word = tuple(new_word) 87 | word = new_word 88 | if len(word) == 1: 89 | break 90 | else: 91 | pairs = get_pairs(word) 92 | word = ' '.join(word) 93 | self.cache[token] = word 94 | return word 95 | 96 | def encode(self, text): 97 | bpe_tokens = [] 98 | for token in re.findall(self.pat, text): 99 | token = ''.join(self.byte_encoder[b] for b in 
token.encode('utf-8')) 100 | bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' ')) 101 | return bpe_tokens 102 | 103 | def decode(self, tokens): 104 | text = ''.join([self.decoder[token] for token in tokens]) 105 | text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors) 106 | return text 107 | 108 | def get_encoder(model_name, models_dir): 109 | with open(os.path.join(models_dir, model_name, 'encoder.json'), 'r') as f: 110 | encoder = json.load(f) 111 | with open(os.path.join(models_dir, model_name, 'vocab.bpe'), 'r', encoding="utf-8") as f: 112 | bpe_data = f.read() 113 | bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]] 114 | return Encoder( 115 | encoder=encoder, 116 | bpe_merges=bpe_merges, 117 | ) 118 | -------------------------------------------------------------------------------- /openai_server/gpt/generate_unconditional_samples.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import fire 4 | import json 5 | import os 6 | import numpy as np 7 | import tensorflow as tf 8 | 9 | import model, sample, encoder 10 | 11 | def sample_model( 12 | model_name='124M', 13 | seed=None, 14 | nsamples=0, 15 | batch_size=1, 16 | length=None, 17 | temperature=1, 18 | top_k=0, 19 | top_p=1, 20 | models_dir='models', 21 | ): 22 | """ 23 | Run the sample_model 24 | :model_name=124M : String, which model to use 25 | :seed=None : Integer seed for random number generators, fix seed to 26 | reproduce results 27 | :nsamples=0 : Number of samples to return, if 0, continues to 28 | generate samples indefinately. 29 | :batch_size=1 : Number of batches (only affects speed/memory). 30 | :length=None : Number of tokens in generated text, if None (default), is 31 | determined by model hyperparameters 32 | :temperature=1 : Float value controlling randomness in boltzmann 33 | distribution. Lower temperature results in less random completions. As the 34 | temperature approaches zero, the model will become deterministic and 35 | repetitive. Higher temperature results in more random completions. 36 | :top_k=0 : Integer value controlling diversity. 1 means only 1 word is 37 | considered for each step (token), resulting in deterministic completions, 38 | while 40 means 40 words are considered at each step. 0 (default) is a 39 | special setting meaning no restrictions. 40 generally is a good value. 40 | :models_dir : path to parent folder containing model subfolders 41 | (i.e. 
contains the folder) 42 | """ 43 | models_dir = os.path.expanduser(os.path.expandvars(models_dir)) 44 | enc = encoder.get_encoder(model_name, models_dir) 45 | hparams = model.default_hparams() 46 | with open(os.path.join(models_dir, model_name, 'hparams.json')) as f: 47 | hparams.override_from_dict(json.load(f)) 48 | 49 | if length is None: 50 | length = hparams.n_ctx 51 | elif length > hparams.n_ctx: 52 | raise ValueError("Can't get samples longer than window size: %s" % hparams.n_ctx) 53 | 54 | with tf.Session(graph=tf.Graph()) as sess: 55 | np.random.seed(seed) 56 | tf.set_random_seed(seed) 57 | 58 | output = sample.sample_sequence( 59 | hparams=hparams, length=length, 60 | start_token=enc.encoder['<|endoftext|>'], 61 | batch_size=batch_size, 62 | temperature=temperature, top_k=top_k, top_p=top_p 63 | )[:, 1:] 64 | 65 | saver = tf.train.Saver() 66 | ckpt = tf.train.latest_checkpoint(os.path.join(models_dir, model_name)) 67 | saver.restore(sess, ckpt) 68 | 69 | generated = 0 70 | while nsamples == 0 or generated < nsamples: 71 | out = sess.run(output) 72 | for i in range(batch_size): 73 | generated += batch_size 74 | text = enc.decode(out[i]) 75 | print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40) 76 | print(text) 77 | 78 | if __name__ == '__main__': 79 | fire.Fire(sample_model) 80 | 81 | -------------------------------------------------------------------------------- /openai_server/gpt/interactive_conditional_samples.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import fire 4 | import json 5 | import os 6 | import numpy as np 7 | import tensorflow as tf 8 | 9 | import model, sample, encoder 10 | 11 | def interact_model( 12 | model_name='124M', 13 | seed=None, 14 | nsamples=1, 15 | batch_size=1, 16 | length=None, 17 | temperature=1, 18 | top_k=0, 19 | top_p=1, 20 | models_dir='models', 21 | ): 22 | """ 23 | Interactively run the model 24 | :model_name=124M : String, which model to use 25 | :seed=None : Integer seed for random number generators, fix seed to reproduce 26 | results 27 | :nsamples=1 : Number of samples to return total 28 | :batch_size=1 : Number of batches (only affects speed/memory). Must divide nsamples. 29 | :length=None : Number of tokens in generated text, if None (default), is 30 | determined by model hyperparameters 31 | :temperature=1 : Float value controlling randomness in boltzmann 32 | distribution. Lower temperature results in less random completions. As the 33 | temperature approaches zero, the model will become deterministic and 34 | repetitive. Higher temperature results in more random completions. 35 | :top_k=0 : Integer value controlling diversity. 1 means only 1 word is 36 | considered for each step (token), resulting in deterministic completions, 37 | while 40 means 40 words are considered at each step. 0 (default) is a 38 | special setting meaning no restrictions. 40 generally is a good value. 39 | :models_dir : path to parent folder containing model subfolders 40 | (i.e. 
contains the folder) 41 | """ 42 | models_dir = os.path.expanduser(os.path.expandvars(models_dir)) 43 | if batch_size is None: 44 | batch_size = 1 45 | assert nsamples % batch_size == 0 46 | 47 | enc = encoder.get_encoder(model_name, models_dir) 48 | hparams = model.default_hparams() 49 | with open(os.path.join(models_dir, model_name, 'hparams.json')) as f: 50 | hparams.override_from_dict(json.load(f)) 51 | 52 | if length is None: 53 | length = hparams.n_ctx // 2 54 | elif length > hparams.n_ctx: 55 | raise ValueError("Can't get samples longer than window size: %s" % hparams.n_ctx) 56 | 57 | with tf.Session(graph=tf.Graph()) as sess: 58 | context = tf.placeholder(tf.int32, [batch_size, None]) 59 | np.random.seed(seed) 60 | tf.set_random_seed(seed) 61 | output = sample.sample_sequence( 62 | hparams=hparams, length=length, 63 | context=context, 64 | batch_size=batch_size, 65 | temperature=temperature, top_k=top_k, top_p=top_p 66 | ) 67 | 68 | saver = tf.train.Saver() 69 | ckpt = tf.train.latest_checkpoint(os.path.join(models_dir, model_name)) 70 | saver.restore(sess, ckpt) 71 | 72 | while True: 73 | raw_text = input("Model prompt >>> ") 74 | while not raw_text: 75 | print('Prompt should not be empty!') 76 | raw_text = input("Model prompt >>> ") 77 | context_tokens = enc.encode(raw_text) 78 | generated = 0 79 | for _ in range(nsamples // batch_size): 80 | out = sess.run(output, feed_dict={ 81 | context: [context_tokens for _ in range(batch_size)] 82 | })[:, len(context_tokens):] 83 | for i in range(batch_size): 84 | generated += 1 85 | text = enc.decode(out[i]) 86 | print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40) 87 | print(text) 88 | print("=" * 80) 89 | 90 | if __name__ == '__main__': 91 | fire.Fire(interact_model) 92 | 93 | -------------------------------------------------------------------------------- /openai_server/gpt/model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.contrib.training import HParams 4 | 5 | def default_hparams(): 6 | return HParams( 7 | n_vocab=0, 8 | n_ctx=1024, 9 | n_embd=768, 10 | n_head=12, 11 | n_layer=12, 12 | ) 13 | 14 | def shape_list(x): 15 | """Deal with dynamic shape in tensorflow cleanly.""" 16 | static = x.shape.as_list() 17 | dynamic = tf.shape(x) 18 | return [dynamic[i] if s is None else s for i, s in enumerate(static)] 19 | 20 | def softmax(x, axis=-1): 21 | x = x - tf.reduce_max(x, axis=axis, keepdims=True) 22 | ex = tf.exp(x) 23 | return ex / tf.reduce_sum(ex, axis=axis, keepdims=True) 24 | 25 | def gelu(x): 26 | return 0.5*x*(1+tf.tanh(np.sqrt(2/np.pi)*(x+0.044715*tf.pow(x, 3)))) 27 | 28 | def norm(x, scope, *, axis=-1, epsilon=1e-5): 29 | """Normalize to mean = 0, std = 1, then do a diagonal affine transform.""" 30 | with tf.variable_scope(scope): 31 | n_state = x.shape[-1].value 32 | g = tf.get_variable('g', [n_state], initializer=tf.constant_initializer(1)) 33 | b = tf.get_variable('b', [n_state], initializer=tf.constant_initializer(0)) 34 | u = tf.reduce_mean(x, axis=axis, keepdims=True) 35 | s = tf.reduce_mean(tf.square(x-u), axis=axis, keepdims=True) 36 | x = (x - u) * tf.rsqrt(s + epsilon) 37 | x = x*g + b 38 | return x 39 | 40 | def split_states(x, n): 41 | """Reshape the last dimension of x into [n, x.shape[-1]/n].""" 42 | *start, m = shape_list(x) 43 | return tf.reshape(x, start + [n, m//n]) 44 | 45 | def merge_states(x): 46 | """Smash the last two dimensions of x into a single dimension.""" 47 | *start, a, b = 
shape_list(x) 48 | return tf.reshape(x, start + [a*b]) 49 | 50 | def conv1d(x, scope, nf, *, w_init_stdev=0.02): 51 | with tf.variable_scope(scope): 52 | *start, nx = shape_list(x) 53 | w = tf.get_variable('w', [1, nx, nf], initializer=tf.random_normal_initializer(stddev=w_init_stdev)) 54 | b = tf.get_variable('b', [nf], initializer=tf.constant_initializer(0)) 55 | c = tf.reshape(tf.matmul(tf.reshape(x, [-1, nx]), tf.reshape(w, [-1, nf]))+b, start+[nf]) 56 | return c 57 | 58 | def attention_mask(nd, ns, *, dtype): 59 | """1's in the lower triangle, counting from the lower right corner. 60 | 61 | Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs. 62 | """ 63 | i = tf.range(nd)[:,None] 64 | j = tf.range(ns) 65 | m = i >= j - ns + nd 66 | return tf.cast(m, dtype) 67 | 68 | 69 | def attn(x, scope, n_state, *, past, hparams): 70 | assert x.shape.ndims == 3 # Should be [batch, sequence, features] 71 | assert n_state % hparams.n_head == 0 72 | if past is not None: 73 | assert past.shape.ndims == 5 # Should be [batch, 2, heads, sequence, features], where 2 is [k, v] 74 | 75 | def split_heads(x): 76 | # From [batch, sequence, features] to [batch, heads, sequence, features] 77 | return tf.transpose(split_states(x, hparams.n_head), [0, 2, 1, 3]) 78 | 79 | def merge_heads(x): 80 | # Reverse of split_heads 81 | return merge_states(tf.transpose(x, [0, 2, 1, 3])) 82 | 83 | def mask_attn_weights(w): 84 | # w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst. 85 | _, _, nd, ns = shape_list(w) 86 | b = attention_mask(nd, ns, dtype=w.dtype) 87 | b = tf.reshape(b, [1, 1, nd, ns]) 88 | w = w*b - tf.cast(1e10, w.dtype)*(1-b) 89 | return w 90 | 91 | def multihead_attn(q, k, v): 92 | # q, k, v have shape [batch, heads, sequence, features] 93 | w = tf.matmul(q, k, transpose_b=True) 94 | w = w * tf.rsqrt(tf.cast(v.shape[-1].value, w.dtype)) 95 | 96 | w = mask_attn_weights(w) 97 | w = softmax(w) 98 | a = tf.matmul(w, v) 99 | return a 100 | 101 | with tf.variable_scope(scope): 102 | c = conv1d(x, 'c_attn', n_state*3) 103 | q, k, v = map(split_heads, tf.split(c, 3, axis=2)) 104 | present = tf.stack([k, v], axis=1) 105 | if past is not None: 106 | pk, pv = tf.unstack(past, axis=1) 107 | k = tf.concat([pk, k], axis=-2) 108 | v = tf.concat([pv, v], axis=-2) 109 | a = multihead_attn(q, k, v) 110 | a = merge_heads(a) 111 | a = conv1d(a, 'c_proj', n_state) 112 | return a, present 113 | 114 | 115 | def mlp(x, scope, n_state, *, hparams): 116 | with tf.variable_scope(scope): 117 | nx = x.shape[-1].value 118 | h = gelu(conv1d(x, 'c_fc', n_state)) 119 | h2 = conv1d(h, 'c_proj', nx) 120 | return h2 121 | 122 | 123 | def block(x, scope, *, past, hparams): 124 | with tf.variable_scope(scope): 125 | nx = x.shape[-1].value 126 | a, present = attn(norm(x, 'ln_1'), 'attn', nx, past=past, hparams=hparams) 127 | x = x + a 128 | m = mlp(norm(x, 'ln_2'), 'mlp', nx*4, hparams=hparams) 129 | x = x + m 130 | return x, present 131 | 132 | 133 | # def block(x, scope, *, past, hparams): 134 | # with tf.variable_scope(scope): 135 | # nx = x.shape[-1].value 136 | # x_norm = norm(x, 'ln_1') 137 | # a, present = attn(x_norm, 'attn', nx, past=past, hparams=hparams) 138 | # x1 = tf.add(x, a) 139 | # x1_norm = norm(x1, 'ln_2') 140 | # m = mlp(x1_norm, 'mlp', nx*4, hparams=hparams) 141 | # x2 = tf.add(x1, m) 142 | # return x2, present 143 | # 144 | # 145 | # from functools import partial 146 | # 147 | # 148 | # def block(x, scope, *, past, hparams): 149 | # with 
tf.variable_scope(scope): 150 | # nx = x.shape[-1].value 151 | # o0 = tf.identity 152 | # o1 = partial(norm, scope='ln_1') # x_norm = norm(x, 'ln_1') 153 | # o2 = partial(attn, scope='attn', n_state=nx, past=past, hparams=hparams) # a, present = attn(x_norm, 'attn', nx, past=past, hparams=hparams) 154 | # o3 = partial(tf.add, x) # x1 = tf.add(x, a) 155 | # o4 = partial(norm, scope='ln_2') # x1_norm = norm(x1, 'ln_2') 156 | # o5 = partial(mlp, scope='mlp', n_state=nx*4, hparams=hparams) # m = mlp(x1_norm, 'mlp', nx*4, hparams=hparams) 157 | # o6 = partial(tf.add, o3) # x2 = tf.add(x1, m) 158 | # return o6, present # return x2, present 159 | 160 | 161 | def past_shape(*, hparams, batch_size=None, sequence=None): 162 | return [batch_size, hparams.n_layer, 2, hparams.n_head, sequence, hparams.n_embd // hparams.n_head] 163 | 164 | def expand_tile(value, size): 165 | """Add a new axis of given size.""" 166 | value = tf.convert_to_tensor(value, name='value') 167 | ndims = value.shape.ndims 168 | return tf.tile(tf.expand_dims(value, axis=0), [size] + [1]*ndims) 169 | 170 | def positions_for(tokens, past_length): 171 | batch_size = tf.shape(tokens)[0] 172 | nsteps = tf.shape(tokens)[1] 173 | return expand_tile(past_length + tf.range(nsteps), batch_size) 174 | 175 | 176 | def model(hparams, X, past=None, scope='model', reuse=False): 177 | with tf.variable_scope(scope, reuse=reuse): 178 | results = {} 179 | batch, sequence = shape_list(X) 180 | 181 | wpe = tf.get_variable('wpe', [hparams.n_ctx, hparams.n_embd], 182 | initializer=tf.random_normal_initializer(stddev=0.01)) 183 | wte = tf.get_variable('wte', [hparams.n_vocab, hparams.n_embd], 184 | initializer=tf.random_normal_initializer(stddev=0.02)) 185 | past_length = 0 if past is None else tf.shape(past)[-2] 186 | h = tf.gather(wte, X) + tf.gather(wpe, positions_for(X, past_length)) 187 | 188 | # Transformer 189 | presents = [] 190 | pasts = tf.unstack(past, axis=1) if past is not None else [None] * hparams.n_layer 191 | assert len(pasts) == hparams.n_layer 192 | for layer, past in enumerate(pasts): 193 | h, present = block(h, 'h%d' % layer, past=past, hparams=hparams) 194 | presents.append(present) 195 | results['present'] = tf.stack(presents, axis=1) 196 | h = norm(h, 'ln_f') 197 | 198 | # Language model loss. Do tokens <n predict token n? 199 | h_flat = tf.reshape(h, [batch*sequence, hparams.n_embd]) 200 | flat_logits = tf.matmul(h_flat, wte, transpose_b=True) 201 | logits = tf.reshape(flat_logits, [batch, sequence, hparams.n_vocab]) 202 | results['logits'] = logits 203 | return results 204 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 7 | tensorflow>=1.15.0 8 | openai 9 | -------------------------------------------------------------------------------- /start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | exec python3 -m openai_server "$@" 4 | 5 | --------------------------------------------------------------------------------