├── requirements.txt
├── transformers_openai_api
│   ├── __init__.py
│   ├── serve.py
│   ├── __main__.py
│   ├── metrics.py
│   ├── app.py
│   └── models.py
├── .gitignore
├── setup.py
├── LICENSE
├── config.example.json
└── README.md

/requirements.txt:
--------------------------------------------------------------------------------
transformers
accelerate
torch
Flask
--------------------------------------------------------------------------------
/transformers_openai_api/__init__.py:
--------------------------------------------------------------------------------
from .app import make_transformers_openai_api
from .serve import run_server
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
__pycache__/
venv/
build/
dist/
*.egg-info
config.json
cache/
.vscode/launch.json
--------------------------------------------------------------------------------
/transformers_openai_api/serve.py:
--------------------------------------------------------------------------------
from flask import Flask


def run_server(app: Flask):
    app.run(
        host=app.config.get('HOST', '127.0.0.1'),
        port=app.config.get('PORT', 5000),
        debug=app.config.get('ENV', 'production') != 'production'
    )
--------------------------------------------------------------------------------
/transformers_openai_api/__main__.py:
--------------------------------------------------------------------------------
import argparse
import os
import sys
from .app import make_transformers_openai_api
from .serve import run_server


def main():
    parser = argparse.ArgumentParser(
        prog='transformers-openai-api',
        description='An OpenAI Completions API compatible server for locally running transformers models')
    parser.add_argument('config', nargs='?', help='Path to config.json',
                        default=os.path.join(os.getcwd(), 'config.json'))
    args = parser.parse_args()

    run_server(make_transformers_openai_api(args.config))


if __name__ == '__main__':
    sys.exit(main())
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup

setup(
    name='transformers-openai-api',
    packages=["transformers_openai_api"],
    version='1.1.0',
    description='An OpenAI Completions API compatible server for NLP transformers models',
    author='Jeffrey Quesnelle',
    author_email='jq@jeffq.com',
    url='https://github.com/jquesnelle/transformers-openai-api/',
    license='MIT',
    install_requires=[
        'transformers',
        'accelerate',
        'torch',
        'Flask'
    ],
    entry_points={
        'console_scripts': [
            'transformers-openai-api = transformers_openai_api.__main__:main'
        ]
    },
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ]
)
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2023 Jeffrey Quesnelle

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/config.example.json:
--------------------------------------------------------------------------------
{
    "MODELS": {
        "text-davinci-003": {
            "NAME": "google/flan-t5-base",
            "TYPE": "Seq2Seq",
            "MODEL_CONFIG": {
                "device_map": "auto"
            },
            "MODEL_DEVICE": null
        },
        "text-curie-001": {
            "ENABLED": false,
            "NAME": "facebook/opt-350m",
            "TYPE": "CausalLM",
            "MODEL_CONFIG": {
                "torch_dtype": "float16"
            },
            "GENERATE_CONFIG": {
                "max_new_tokens": 2048
            }
        },
        "text-babbage-001": {
            "ENABLED": false,
            "NAME": "pszemraj/pegasus-x-large-book-summary",
            "TYPE": "Seq2Seq",
            "GENERATE_CONFIG": {
                "max_length": 256,
                "min_length": 8,
                "no_repeat_ngram_size": 3,
                "early_stopping": true,
                "repetition_penalty": 3.5,
                "length_penalty": 0.2,
                "encoder_no_repeat_ngram_size": 3,
                "num_beams": 4
            }
        }
    }
}
--------------------------------------------------------------------------------
/transformers_openai_api/metrics.py:
--------------------------------------------------------------------------------
from typing import Any, Dict, Mapping


class Metrics:

    # mutable running totals, so Dict rather than the read-only Mapping
    data: Dict[str, Any]

    def __init__(self) -> None:
        self.data = {
            'total_prompt_tokens': 0,
            'total_completion_tokens': 0,
            'total_total_tokens': 0,
            'model_metrics': {}
        }

    def update(self, response: Mapping[str, Any]):
        if 'model' in response:
            model = response['model']
            if model not in self.data['model_metrics']:
                self.data['model_metrics'][model] = {
                    'total_prompt_tokens': 0,
                    'total_completion_tokens': 0,
                    'total_total_tokens': 0,
                    'calls': 0
                }

            model_metrics = self.data['model_metrics'][model]
            model_metrics['calls'] += 1

            if 'usage' in response:
                usage = response['usage']
                prompt_tokens = usage.get('prompt_tokens', 0)
                completion_tokens = usage.get('completion_tokens', 0)
                total_tokens = usage.get('total_tokens', 0)

                self.data['total_prompt_tokens'] += prompt_tokens
                self.data['total_completion_tokens'] += completion_tokens
                self.data['total_total_tokens'] += total_tokens

                model_metrics['total_prompt_tokens'] += prompt_tokens
                model_metrics['total_completion_tokens'] += completion_tokens
                model_metrics['total_total_tokens'] += total_tokens

    def get(self) -> Mapping[str, Any]:
        return self.data
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# transformers-openai-api

`transformers-openai-api` is a server for hosting locally running NLP [transformers](https://github.com/huggingface/transformers/) models via the [OpenAI Completions API](https://beta.openai.com/docs/api-reference/completions). In short, you can run `transformers` models and offer them through an API compatible with existing OpenAI tooling, such as the [OpenAI Python Client](https://github.com/openai/openai-python) itself or any package that uses it (e.g. [LangChain](https://github.com/hwchase17/langchain)).

## Quickstart

### From pip

```sh
pip install transformers-openai-api
wget https://raw.githubusercontent.com/jquesnelle/transformers-openai-api/master/config.example.json
mv config.example.json config.json
transformers-openai-api
```

### From source

```sh
git clone https://github.com/jquesnelle/transformers-openai-api
cd transformers-openai-api
cp config.example.json config.json
pip install -r requirements.txt
python -m transformers_openai_api
```

## Using with OpenAI Python Client

Simply set the environment variable `OPENAI_API_BASE` to `http://HOST:PORT/v1` before importing the `openai` package. For example, to access a local instance of `transformers-openai-api`, set `OPENAI_API_BASE` to `http://127.0.0.1:5000/v1`. Alternatively, you can set the `api_base` property on the `openai` object:

```python
import openai
openai.api_base = 'http://HOST:PORT/v1'
```
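
For example, a complete round trip against the default `text-davinci-003` mapping from [config.example.json](config.example.json) (a minimal sketch, assuming a pre-1.0 `openai` client and a locally running server; the `api_key` value is a placeholder):

```python
import openai

openai.api_base = 'http://127.0.0.1:5000/v1'
openai.api_key = 'sk-placeholder'  # ignored unless BEARER_TOKENS is configured

# 'text-davinci-003' is the mapping name from config.json; the request is
# actually served by the underlying model (here, google/flan-t5-base)
response = openai.Completion.create(
    model='text-davinci-003',
    prompt='Translate to German: Good morning!',
    max_tokens=64
)
print(response['choices'][0]['text'])
```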

## Configuration

All configuration is managed through `config.json`. By default `transformers-openai-api` looks for this file in the current working directory; a different path can be passed as the command-line argument to the program. See [config.example.json](config.example.json).

### Hosting

By default the API server listens on `127.0.0.1:5000`. To change this, add `HOST` and/or `PORT` entries to the configuration file. For example, to serve publicly:

```json
{
    "HOST": "0.0.0.0",
    "PORT": 80
}
```

### Models

The `MODELS` object maps an OpenAI model name to a `transformers` model configuration. The structure of a model configuration is (a full example follows the table):

| Key | Description |
| - | - |
| `NAME` | The Hugging Face model name or path; passed to `from_pretrained` |
| `ENABLED` | Boolean; set to `false` to disable a model (default `true`) |
| `TYPE` | Either `"Seq2Seq"` or `"CausalLM"` |
| `MODEL_CONFIG` | Parameters for model creation; passed to `AutoModelForTYPE.from_pretrained` |
| `MODEL_DEVICE` | Move the created model to this device; passed to `to` called on the model (default `cuda`) |
| `TOKENIZER_CONFIG` | Parameters for tokenizer creation; passed to `AutoTokenizer.from_pretrained` |
| `TOKENIZER_DEVICE` | Move tokenized inputs to this device; passed to `to` called on the tokenized input (default `cuda`) |
| `GENERATE_CONFIG` | Parameters for generation; passed to the model's `generate` function |
| `DECODE_CONFIG` | Parameters for decoding; passed to the tokenizer's `decode` function |
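
Putting these together, a hypothetical mapping that serves `gpt2` on the CPU under the name `text-ada-001` could look like this (a sketch; any OpenAI-style name and any Hugging Face model may be substituted):

```json
{
    "MODELS": {
        "text-ada-001": {
            "NAME": "gpt2",
            "TYPE": "CausalLM",
            "MODEL_DEVICE": "cpu",
            "TOKENIZER_DEVICE": "cpu",
            "GENERATE_CONFIG": {
                "max_new_tokens": 128
            },
            "DECODE_CONFIG": {
                "skip_special_tokens": true
            }
        }
    }
}
```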

#### Using accelerate

To use [accelerate](https://github.com/huggingface/accelerate), set `device_map` in the `MODEL_CONFIG` to `"auto"` and explicitly set `MODEL_DEVICE` to `null`. The default `text-davinci-003` model in [config.example.json](config.example.json) is an example of this.

#### Using CPU

To switch to CPU inference, set `MODEL_DEVICE` and `TOKENIZER_DEVICE` to `"cpu"`.
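
For example, the relevant keys of a CPU-only model entry (a sketch showing only the keys involved):

```json
{
    "NAME": "google/flan-t5-base",
    "TYPE": "Seq2Seq",
    "MODEL_DEVICE": "cpu",
    "TOKENIZER_DEVICE": "cpu"
}
```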

#### Using FP16

To run a model at half precision, set `torch_dtype` in the `MODEL_CONFIG` to `"float16"`. The disabled `text-curie-001` model in [config.example.json](config.example.json) is an example of this.

### Authorization

To limit access to the API (i.e. to enforce an `OPENAI_API_KEY`), set the `BEARER_TOKENS` entry to a list of authorized tokens (e.g. your OpenAI key). If `BEARER_TOKENS` is not set, no authorization is enforced.

```json
{
    "BEARER_TOKENS": ["sk-..."]
}
```
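
With tokens configured, clients must present one as a bearer token on every request. For example, with `curl` against a local instance (the token, prompt, and model name are placeholders):

```sh
curl http://127.0.0.1:5000/v1/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer sk-..." \
  -d '{"model": "text-davinci-003", "prompt": "Hello", "max_tokens": 32}'
```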
--------------------------------------------------------------------------------
/transformers_openai_api/app.py:
--------------------------------------------------------------------------------
import json
import time
import torch
from typing import Any, Callable, Mapping, Optional
from flask import Flask, make_response, request, abort
from flask.json import jsonify
from functools import wraps
from .models import CausalLM, Model, Seq2Seq
from .metrics import Metrics

app = Flask(__name__)
models = {}
next_id = 0
metrics: Optional[Metrics] = None


def check_token(f: Callable):
    @wraps(f)
    def decorator(*args, **kwargs):
        bearer_tokens = app.config.get('BEARER_TOKENS')
        if bearer_tokens is None:
            # no tokens configured: authorization is disabled
            return f(*args, **kwargs)

        authorization = request.headers.get('Authorization', '')
        if authorization.startswith('Bearer '):
            token = authorization[7:]
            if token in bearer_tokens:
                return f(*args, **kwargs)
        return make_response(jsonify({
            'message': 'Invalid token'
        }), 401)
    return decorator


def convert_model_config(val: Optional[Mapping[str, Any]]) -> Mapping[str, Any]:
    config = {}
    if val is not None:
        for key, value in val.items():
            if key == 'torch_dtype':
                # JSON cannot express torch dtypes, so translate the string form
                if value == 'float16':
                    config['torch_dtype'] = torch.float16
                elif value == 'float32':
                    config['torch_dtype'] = torch.float32
                elif value == 'int8':
                    config['torch_dtype'] = torch.int8
                else:
                    raise RuntimeError(f'Unknown torch_dtype {value}')
            else:
                config[key] = value
    return config


def convert_tokenizer_config(val: Optional[Mapping[str, Any]]) -> Mapping[str, Any]:
    return val if val is not None else {}


def convert_generate_config(val: Optional[Mapping[str, Any]]) -> Mapping[str, Any]:
    config = {}
    if val is not None:
        for key, value in val.items():
            if key == 'max_tokens':
                # map the OpenAI parameter onto its transformers equivalent
                config['max_length'] = value
            else:
                config[key] = value
    return config


def convert_decode_config(val: Optional[Mapping[str, Any]]) -> Mapping[str, Any]:
    return val if val is not None else {}


def completion(model_name: str):
    global next_id
    this_id = next_id
    next_id += 1

    model: Model = models[model_name]

    response = model.completions(convert_generate_config(request.json))
    response.update({
        'object': 'text_completion',
        'model': model_name,
        'created': int(time.time()),
        'id': f'cmpl-{this_id}'
    })

    if metrics is not None:
        metrics.update(response)

    return make_response(jsonify(response))


@app.route('/v1/engines')
def v1_engines():
    return make_response(jsonify({
        'data': [{
            'object': 'engine',
            'id': model_id,
            'ready': True,
            'owner': 'openai',
            'permissions': None,
            'created': None
        } for model_id in models.keys()]
    }))


@app.route('/v1/completions', methods=['POST'])
@check_token
def v1_completions():
    return completion(request.json['model'])


@app.route('/v1/engines/<model_name>/completions', methods=['POST'])
@check_token
def engine_completion(model_name: str):
    return completion(model_name)


@app.route('/v1/metrics')
def metrics_():
    if metrics is None:
        abort(404)

    return make_response(jsonify(metrics.get()))


def make_transformers_openai_api(config_path: str) -> Flask:
    app.config.from_file(config_path, load=json.load)

    if app.config.get('METRICS', 1) != 0:
        global metrics
        metrics = Metrics()

    for mapping, config in app.config['MODELS'].items():
        if not config.get('ENABLED', True):
            continue
        model_config = convert_model_config(config.get('MODEL_CONFIG'))
        model_device = config.get('MODEL_DEVICE', 'cuda')
        tokenizer_config = convert_tokenizer_config(
            config.get('TOKENIZER_CONFIG'))
        tokenizer_device = config.get('TOKENIZER_DEVICE', 'cuda')
        generate_config = convert_generate_config(
            config.get('GENERATE_CONFIG'))
        decode_config = convert_decode_config(
            config.get('DECODE_CONFIG'))
        if config['TYPE'] == 'Seq2Seq':
            models[mapping] = Seq2Seq(
                config['NAME'], model_config, model_device, tokenizer_config,
                tokenizer_device, generate_config, decode_config)
        elif config['TYPE'] == 'CausalLM':
            models[mapping] = CausalLM(
                config['NAME'], model_config, model_device, tokenizer_config,
                tokenizer_device, generate_config, decode_config)
        else:
            raise RuntimeError(f'Unknown model type {config["TYPE"]}')

    return app
--------------------------------------------------------------------------------
/transformers_openai_api/models.py:
--------------------------------------------------------------------------------
from abc import ABC, abstractmethod
from typing import Any, List, Mapping, Optional
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForCausalLM


def get_prompts(request: Mapping[str, Any]) -> List[str]:
    prompt = request['prompt']
    if isinstance(prompt, str):
        prompt = [prompt]
    return prompt


def _completions_auto(
        request: Mapping[str, Any],
        tokenizer: Any,
        tokenizer_device: Optional[str],
        model: Any,
        generate_config: Mapping[str, Any],
        decode_config: Mapping[str, Any],
        auto_echo: bool):
    generate_args = {}
    generate_args.update(generate_config)
    generate_args.update(request)

    decode_args = {
        "skip_special_tokens": True
    }
    decode_args.update(decode_config)

    # OpenAI clients send sampling parameters without an explicit do_sample,
    # so enable sampling whenever one of them is present; drop values that
    # are no-op defaults in the OpenAI API
    if ('top_p' in generate_args or 'top_k' in generate_args or 'temperature' in generate_args) and 'do_sample' not in generate_args:
        generate_args['do_sample'] = True
    if generate_args.get('temperature', 1.0) == 0:
        generate_args.pop('temperature', None)
    elif generate_args.get('top_p', 1.0) == 1.0:
        generate_args.pop('top_p', None)
        if 'top_k' not in generate_args:
            generate_args['top_k'] = 0
    prompts = get_prompts(generate_args)
    echo = generate_args.get('echo', False)
    n = generate_args.get('n', 1)

    # fields handled here rather than by generate()
    generate_args.pop('model', None)
    generate_args.pop('prompt', None)
    generate_args.pop('n', None)
    generate_args.pop('echo', None)

    # TODO
    generate_args.pop('best_of', None)
    generate_args.pop('presence_penalty', None)
    generate_args.pop('frequency_penalty', None)
    generate_args.pop('logit_bias', None)

    inputs = []
    prompt_tokens_count = 0
    for prompt in prompts:
        input = tokenizer(prompt, return_tensors="pt").input_ids
        if tokenizer_device is not None:
            input = input.to(tokenizer_device)
        prompt_tokens_count += input.size(dim=1)
        inputs.append(input)

    choices = []
    completion_tokens_count = 0
    for i in range(0, len(inputs)):
        for _ in range(0, n):
            output = model.generate(inputs[i], **generate_args)[0]
            completion_tokens_count += len(output)
            text = tokenizer.decode(output, **decode_args)
            if echo and not auto_echo:
                text = prompts[i] + text
            choices.append({
                'text': text,
                'index': i,
            })

    return {
        'choices': choices,
        'usage': {
            'prompt_tokens': prompt_tokens_count,
            'completion_tokens': completion_tokens_count,
            'total_tokens': prompt_tokens_count + completion_tokens_count
        }
    }


class Model(ABC):

    @abstractmethod
    def completions(self, request: Mapping[str, Any]) -> Mapping[str, Any]:
        pass


class Seq2Seq(Model):
    model: AutoModelForSeq2SeqLM
    tokenizer: AutoTokenizer
    generate_config: Mapping[str, Any]
    decode_config: Mapping[str, Any]
    tokenizer_device: Optional[str]

    def __init__(
            self,
            pretrained_model_name_or_path: str,
            model_config: Mapping[str, Any],
            model_device: Optional[str],
            tokenizer_config: Mapping[str, Any],
            tokenizer_device: Optional[str],
            generate_config: Mapping[str, Any],
            decode_config: Mapping[str, Any]) -> None:
        self.model = AutoModelForSeq2SeqLM.from_pretrained(
            pretrained_model_name_or_path, **model_config)
        if model_device is not None:
            self.model = self.model.to(model_device)
        self.tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path, **tokenizer_config)
        self.generate_config = generate_config
        self.decode_config = decode_config
        self.tokenizer_device = tokenizer_device

    def completions(self, request) -> Mapping[str, Any]:
        return _completions_auto(request, self.tokenizer, self.tokenizer_device, self.model,
                                 self.generate_config, self.decode_config, False)


class CausalLM(Model):
    model: AutoModelForCausalLM
    tokenizer: AutoTokenizer
    generate_config: Mapping[str, Any]
    decode_config: Mapping[str, Any]
    tokenizer_device: Optional[str]

    def __init__(
            self,
            pretrained_model_name_or_path: str,
            model_config: Mapping[str, Any],
            model_device: Optional[str],
            tokenizer_config: Mapping[str, Any],
            tokenizer_device: Optional[str],
            generate_config: Mapping[str, Any],
            decode_config: Mapping[str, Any]) -> None:
        self.model = AutoModelForCausalLM.from_pretrained(
            pretrained_model_name_or_path, **model_config)
        if model_device is not None:
            self.model = self.model.to(model_device)
        self.tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path, **tokenizer_config)
        self.generate_config = generate_config
        self.decode_config = decode_config
        self.tokenizer_device = tokenizer_device

    def completions(self, request) -> Mapping[str, Any]:
        # causal LMs already include the prompt in their decoded output,
        # so echoing is automatic
        return _completions_auto(request, self.tokenizer, self.tokenizer_device, self.model,
                                 self.generate_config, self.decode_config, True)
--------------------------------------------------------------------------------