├── requirements.txt
├── transformers_openai_api
│   ├── __init__.py
│   ├── serve.py
│   ├── __main__.py
│   ├── metrics.py
│   ├── app.py
│   └── models.py
├── .gitignore
├── setup.py
├── LICENSE
├── config.example.json
└── README.md

/requirements.txt:
--------------------------------------------------------------------------------
transformers
accelerate
torch
Flask
--------------------------------------------------------------------------------
/transformers_openai_api/__init__.py:
--------------------------------------------------------------------------------
from .app import make_transformers_openai_api
from .serve import run_server
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
__pycache__/
venv/
build/
dist/
*.egg-info
config.json
cache/
.vscode/launch.json
--------------------------------------------------------------------------------
/transformers_openai_api/serve.py:
--------------------------------------------------------------------------------
from flask import Flask


def run_server(app: Flask):
    app.run(
        host=app.config.get('HOST', '127.0.0.1'),
        port=app.config.get('PORT', 5000),
        debug=app.config.get('ENV', 'production') != 'production'
    )
--------------------------------------------------------------------------------
/transformers_openai_api/__main__.py:
--------------------------------------------------------------------------------
import argparse
import os
import sys
from .app import make_transformers_openai_api
from .serve import run_server


def main():
    parser = argparse.ArgumentParser(
        prog='transformers-openai-api',
        description='An OpenAI Completions API compatible server for locally running transformers models')
    parser.add_argument('config', nargs='?', help='Path to config.json',
                        default=os.path.join(os.getcwd(), 'config.json'))
    args = parser.parse_args()

    run_server(make_transformers_openai_api(args.config))


if __name__ == '__main__':
    sys.exit(main())
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup

setup(
    name='transformers-openai-api',
    packages=["transformers_openai_api"],
    version='1.1.0',
    description='An OpenAI Completions API compatible server for NLP transformers models',
    author='Jeffrey Quesnelle',
    author_email='jq@jeffq.com',
    url='https://github.com/jquesnelle/transformers-openai-api/',
    license='MIT',
    install_requires=[
        'transformers',
        'accelerate',
        'torch',
        'Flask'
    ],
    entry_points={
        'console_scripts': [
            'transformers-openai-api = transformers_openai_api.__main__:main'
        ]
    },
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ]
)
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2023 Jeffrey Quesnelle

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/config.example.json:
--------------------------------------------------------------------------------
{
    "MODELS": {
        "text-davinci-003": {
            "NAME": "google/flan-t5-base",
            "TYPE": "Seq2Seq",
            "MODEL_CONFIG": {
                "device_map": "auto"
            },
            "MODEL_DEVICE": null
        },
        "text-curie-001": {
            "ENABLED": false,
            "NAME": "facebook/opt-350m",
            "TYPE": "CausalLM",
            "MODEL_CONFIG": {
                "torch_dtype": "float16"
            },
            "GENERATE_CONFIG": {
                "max_new_tokens": 2048
            }
        },
        "text-babbage-001": {
            "ENABLED": false,
            "NAME": "pszemraj/pegasus-x-large-book-summary",
            "TYPE": "Seq2Seq",
            "GENERATE_CONFIG": {
                "max_length": 256,
                "min_length": 8,
                "no_repeat_ngram_size": 3,
                "early_stopping": true,
                "repetition_penalty": 3.5,
                "length_penalty": 0.2,
                "encoder_no_repeat_ngram_size": 3,
                "num_beams": 4
            }
        }
    }
}
--------------------------------------------------------------------------------
/transformers_openai_api/metrics.py:
--------------------------------------------------------------------------------
from typing import Any, Dict, Mapping


class Metrics:

    # mutable running totals, so Dict rather than the read-only Mapping
    data: Dict[str, Any]

    def __init__(self) -> None:
        self.data = {
            'total_prompt_tokens': 0,
            'total_completion_tokens': 0,
            'total_total_tokens': 0,
            'model_metrics': {}
        }

    def update(self, response: Mapping[str, Any]):
        if 'model' in response:
            model = response['model']
            if model not in self.data['model_metrics']:
                self.data['model_metrics'][model] = {
                    'total_prompt_tokens': 0,
                    'total_completion_tokens': 0,
                    'total_total_tokens': 0,
                    'calls': 0
                }

            model_metrics = self.data['model_metrics'][model]
            model_metrics['calls'] += 1

            if 'usage' in response:
                usage = response['usage']
                prompt_tokens = usage.get('prompt_tokens', 0)
                completion_tokens = usage.get('completion_tokens', 0)
                total_tokens = usage.get('total_tokens', 0)

                self.data['total_prompt_tokens'] += prompt_tokens
                self.data['total_completion_tokens'] += completion_tokens
                self.data['total_total_tokens'] += total_tokens

                model_metrics['total_prompt_tokens'] += prompt_tokens
                model_metrics['total_completion_tokens'] += completion_tokens
                model_metrics['total_total_tokens'] += total_tokens

    def get(self) -> Mapping[str, Any]:
        return self.data
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# transformers-openai-api

`transformers-openai-api` is a server for hosting locally running NLP [transformers](https://github.com/huggingface/transformers/) models via the [OpenAI Completions API](https://beta.openai.com/docs/api-reference/completions). In short, you can run `transformers` models and offer them through an API compatible with existing OpenAI tooling, such as the [OpenAI Python Client](https://github.com/openai/openai-python) itself or any package that uses it (e.g. [LangChain](https://github.com/hwchase17/langchain)).

## Quickstart

### From pip

```sh
pip install transformers-openai-api
wget https://raw.githubusercontent.com/jquesnelle/transformers-openai-api/master/config.example.json
mv config.example.json config.json
transformers-openai-api
```

### From source

```sh
git clone https://github.com/jquesnelle/transformers-openai-api
cd transformers-openai-api
cp config.example.json config.json
pip install -r requirements.txt
python -m transformers_openai_api
```

## Using with OpenAI Python Client

Simply set the environment variable `OPENAI_API_BASE` to `http://HOST:PORT/v1` before importing the `openai` package. For example, to access a local instance of `transformers-openai-api`, set `OPENAI_API_BASE` to `http://127.0.0.1:5000/v1`. Alternatively, you can set the `api_base` property on the `openai` object:

```python
import openai
openai.api_base = 'http://HOST:PORT/v1'
```
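
For example, a complete round trip against the default `text-davinci-003` mapping from [config.example.json](config.example.json) (a minimal sketch, assuming a pre-1.0 `openai` client and a locally running server; the `api_key` value is a placeholder):

```python
import openai

openai.api_base = 'http://127.0.0.1:5000/v1'
openai.api_key = 'sk-placeholder'  # ignored unless BEARER_TOKENS is configured

# 'text-davinci-003' is the mapping name from config.json; the request is
# actually served by the underlying model (here, google/flan-t5-base)
response = openai.Completion.create(
    model='text-davinci-003',
    prompt='Translate to German: Good morning!',
    max_tokens=64
)
print(response['choices'][0]['text'])
```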

## Configuration

All configuration is managed through `config.json`. By default `transformers-openai-api` looks for this file in the current working directory; a different path can be passed as the command-line argument to the program. See [config.example.json](config.example.json).

### Hosting

By default the API server listens on `127.0.0.1:5000`. To change this, add `HOST` and/or `PORT` entries to the configuration file. For example, to serve publicly:

```json
{
    "HOST": "0.0.0.0",
    "PORT": 80
}
```

### Models

The `MODELS` object maps an OpenAI model name to a `transformers` model configuration. The structure of a model configuration is (a full example follows the table):

| Key | Description |
| - | - |
| `NAME` | The Hugging Face model name or path; passed to `from_pretrained` |
| `ENABLED` | Boolean; set to `false` to disable a model (default `true`) |
| `TYPE` | Either `"Seq2Seq"` or `"CausalLM"` |
| `MODEL_CONFIG` | Parameters for model creation; passed to `AutoModelForTYPE.from_pretrained` |
| `MODEL_DEVICE` | Move the created model to this device; passed to `to` called on the model (default `cuda`) |
| `TOKENIZER_CONFIG` | Parameters for tokenizer creation; passed to `AutoTokenizer.from_pretrained` |
| `TOKENIZER_DEVICE` | Move tokenized inputs to this device; passed to `to` called on the tokenized input (default `cuda`) |
| `GENERATE_CONFIG` | Parameters for generation; passed to the model's `generate` function |
| `DECODE_CONFIG` | Parameters for decoding; passed to the tokenizer's `decode` function |
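
Putting these together, a hypothetical mapping that serves `gpt2` on the CPU under the name `text-ada-001` could look like this (a sketch; any OpenAI-style name and any Hugging Face model may be substituted):

```json
{
    "MODELS": {
        "text-ada-001": {
            "NAME": "gpt2",
            "TYPE": "CausalLM",
            "MODEL_DEVICE": "cpu",
            "TOKENIZER_DEVICE": "cpu",
            "GENERATE_CONFIG": {
                "max_new_tokens": 128
            },
            "DECODE_CONFIG": {
                "skip_special_tokens": true
            }
        }
    }
}
```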

#### Using accelerate

To use [accelerate](https://github.com/huggingface/accelerate), set `device_map` in the `MODEL_CONFIG` to `"auto"` and explicitly set `MODEL_DEVICE` to `null`. The default `text-davinci-003` model in [config.example.json](config.example.json) is an example of this.

#### Using CPU

To switch to CPU inference, set `MODEL_DEVICE` and `TOKENIZER_DEVICE` to `"cpu"`.
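
For example, the relevant keys of a CPU-only model entry (a sketch showing only the keys involved):

```json
{
    "NAME": "google/flan-t5-base",
    "TYPE": "Seq2Seq",
    "MODEL_DEVICE": "cpu",
    "TOKENIZER_DEVICE": "cpu"
}
```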

#### Using FP16

To run a model at half precision, set `torch_dtype` in the `MODEL_CONFIG` to `"float16"`. The disabled `text-curie-001` model in [config.example.json](config.example.json) is an example of this.

### Authorization

To limit access to the API (i.e. to enforce an `OPENAI_API_KEY`), set the `BEARER_TOKENS` entry to a list of authorized tokens (e.g. your OpenAI key). If `BEARER_TOKENS` is not set, no authorization is enforced.

```json
{
    "BEARER_TOKENS": ["sk-..."]
}
```
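
With tokens configured, clients must present one as a bearer token on every request. For example, with `curl` against a local instance (the token, prompt, and model name are placeholders):

```sh
curl http://127.0.0.1:5000/v1/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer sk-..." \
  -d '{"model": "text-davinci-003", "prompt": "Hello", "max_tokens": 32}'
```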
--------------------------------------------------------------------------------
/transformers_openai_api/app.py:
--------------------------------------------------------------------------------
import json
import time
import torch
from typing import Any, Callable, Mapping, Optional
from flask import Flask, make_response, request, abort
from flask.json import jsonify
from functools import wraps
from .models import CausalLM, Model, Seq2Seq
from .metrics import Metrics

app = Flask(__name__)
models = {}
next_id = 0
metrics: Optional[Metrics] = None


def check_token(f: Callable):
    @wraps(f)
    def decorator(*args, **kwargs):
        bearer_tokens = app.config.get('BEARER_TOKENS')
        if bearer_tokens is None:
            # no tokens configured: authorization is disabled
            return f(*args, **kwargs)

        authorization = request.headers.get('Authorization', '')
        if authorization.startswith('Bearer '):
            token = authorization[7:]
            if token in bearer_tokens:
                return f(*args, **kwargs)
        return make_response(jsonify({
            'message': 'Invalid token'
        }), 401)
    return decorator


def convert_model_config(val: Optional[Mapping[str, Any]]) -> Mapping[str, Any]:
    config = {}
    if val is not None:
        for key, value in val.items():
            if key == 'torch_dtype':
                # JSON cannot express torch dtypes, so translate the string form
                if value == 'float16':
                    config['torch_dtype'] = torch.float16
                elif value == 'float32':
                    config['torch_dtype'] = torch.float32
                elif value == 'int8':
                    config['torch_dtype'] = torch.int8
                else:
                    raise RuntimeError(f'Unknown torch_dtype {value}')
            else:
                config[key] = value
    return config


def convert_tokenizer_config(val: Optional[Mapping[str, Any]]) -> Mapping[str, Any]:
    return val if val is not None else {}


def convert_generate_config(val: Optional[Mapping[str, Any]]) -> Mapping[str, Any]:
    config = {}
    if val is not None:
        for key, value in val.items():
            if key == 'max_tokens':
                # map the OpenAI parameter onto its transformers equivalent
                config['max_length'] = value
            else:
                config[key] = value
    return config


def convert_decode_config(val: Optional[Mapping[str, Any]]) -> Mapping[str, Any]:
    return val if val is not None else {}


def completion(model_name: str):
    global next_id
    this_id = next_id
    next_id += 1

    model: Model = models[model_name]

    response = model.completions(convert_generate_config(request.json))
    response.update({
        'object': 'text_completion',
        'model': model_name,
        'created': int(time.time()),
        'id': f'cmpl-{this_id}'
    })

    if metrics is not None:
        metrics.update(response)

    return make_response(jsonify(response))


@app.route('/v1/engines')
def v1_engines():
    return make_response(jsonify({
        'data': [{
            'object': 'engine',
            'id': model_id,
            'ready': True,
            'owner': 'openai',
            'permissions': None,
            'created': None
        } for model_id in models.keys()]
    }))


@app.route('/v1/completions', methods=['POST'])
@check_token
def v1_completions():
    return completion(request.json['model'])


@app.route('/v1/engines/<model_name>/completions', methods=['POST'])
@check_token
def engine_completion(model_name: str):
    return completion(model_name)


@app.route('/v1/metrics')
def metrics_():
    if metrics is None:
        abort(404)

    return make_response(jsonify(metrics.get()))


def make_transformers_openai_api(config_path: str) -> Flask:
    app.config.from_file(config_path, load=json.load)

    if app.config.get('METRICS', 1) != 0:
        global metrics
        metrics = Metrics()

    for mapping, config in app.config['MODELS'].items():
        if not config.get('ENABLED', True):
            continue
        model_config = convert_model_config(config.get('MODEL_CONFIG'))
        model_device = config.get('MODEL_DEVICE', 'cuda')
        tokenizer_config = convert_tokenizer_config(
            config.get('TOKENIZER_CONFIG'))
        tokenizer_device = config.get('TOKENIZER_DEVICE', 'cuda')
        generate_config = convert_generate_config(
            config.get('GENERATE_CONFIG'))
        decode_config = convert_decode_config(
            config.get('DECODE_CONFIG'))
        if config['TYPE'] == 'Seq2Seq':
            models[mapping] = Seq2Seq(
                config['NAME'], model_config, model_device, tokenizer_config,
                tokenizer_device, generate_config, decode_config)
        elif config['TYPE'] == 'CausalLM':
            models[mapping] = CausalLM(
                config['NAME'], model_config, model_device, tokenizer_config,
                tokenizer_device, generate_config, decode_config)
        else:
            raise RuntimeError(f'Unknown model type {config["TYPE"]}')

    return app
--------------------------------------------------------------------------------
/transformers_openai_api/models.py:
--------------------------------------------------------------------------------
from abc import ABC, abstractmethod
from typing import Any, List, Mapping, Optional
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForCausalLM


def get_prompts(request: Mapping[str, Any]) -> List[str]:
    prompt = request['prompt']
    if isinstance(prompt, str):
        prompt = [prompt]
    return prompt


def _completions_auto(
        request: Mapping[str, Any],
        tokenizer: Any,
        tokenizer_device: Optional[str],
        model: Any,
        generate_config: Mapping[str, Any],
        decode_config: Mapping[str, Any],
        auto_echo: bool):
    generate_args = {}
    generate_args.update(generate_config)
    generate_args.update(request)

    decode_args = {
        "skip_special_tokens": True
    }
    decode_args.update(decode_config)

    # OpenAI clients send sampling parameters without an explicit do_sample,
    # so enable sampling whenever one of them is present; drop values that
    # are no-op defaults in the OpenAI API
    if ('top_p' in generate_args or 'top_k' in generate_args or 'temperature' in generate_args) and 'do_sample' not in generate_args:
        generate_args['do_sample'] = True
    if generate_args.get('temperature', 1.0) == 0:
        generate_args.pop('temperature', None)
    elif generate_args.get('top_p', 1.0) == 1.0:
        generate_args.pop('top_p', None)
        if 'top_k' not in generate_args:
            generate_args['top_k'] = 0
    prompts = get_prompts(generate_args)
    echo = generate_args.get('echo', False)
    n = generate_args.get('n', 1)

    # fields handled here rather than by generate()
    generate_args.pop('model', None)
    generate_args.pop('prompt', None)
    generate_args.pop('n', None)
    generate_args.pop('echo', None)

    # TODO
    generate_args.pop('best_of', None)
    generate_args.pop('presence_penalty', None)
    generate_args.pop('frequency_penalty', None)
    generate_args.pop('logit_bias', None)

    inputs = []
    prompt_tokens_count = 0
    for prompt in prompts:
        input = tokenizer(prompt, return_tensors="pt").input_ids
        if tokenizer_device is not None:
            input = input.to(tokenizer_device)
        prompt_tokens_count += input.size(dim=1)
        inputs.append(input)

    choices = []
    completion_tokens_count = 0
    for i in range(0, len(inputs)):
        for _ in range(0, n):
            output = model.generate(inputs[i], **generate_args)[0]
            completion_tokens_count += len(output)
            text = tokenizer.decode(output, **decode_args)
            if echo and not auto_echo:
                text = prompts[i] + text
            choices.append({
                'text': text,
                'index': i,
            })

    return {
        'choices': choices,
        'usage': {
            'prompt_tokens': prompt_tokens_count,
            'completion_tokens': completion_tokens_count,
            'total_tokens': prompt_tokens_count + completion_tokens_count
        }
    }


class Model(ABC):

    @abstractmethod
    def completions(self, request: Mapping[str, Any]) -> Mapping[str, Any]:
        pass


class Seq2Seq(Model):
    model: AutoModelForSeq2SeqLM
    tokenizer: AutoTokenizer
    generate_config: Mapping[str, Any]
    decode_config: Mapping[str, Any]
    tokenizer_device: Optional[str]

    def __init__(
            self,
            pretrained_model_name_or_path: str,
            model_config: Mapping[str, Any],
            model_device: Optional[str],
            tokenizer_config: Mapping[str, Any],
            tokenizer_device: Optional[str],
            generate_config: Mapping[str, Any],
            decode_config: Mapping[str, Any]) -> None:
        self.model = AutoModelForSeq2SeqLM.from_pretrained(
            pretrained_model_name_or_path, **model_config)
        if model_device is not None:
            self.model = self.model.to(model_device)
        self.tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path, **tokenizer_config)
        self.generate_config = generate_config
        self.decode_config = decode_config
        self.tokenizer_device = tokenizer_device

    def completions(self, request) -> Mapping[str, Any]:
        return _completions_auto(request, self.tokenizer, self.tokenizer_device, self.model,
                                 self.generate_config, self.decode_config, False)


class CausalLM(Model):
    model: AutoModelForCausalLM
    tokenizer: AutoTokenizer
    generate_config: Mapping[str, Any]
    decode_config: Mapping[str, Any]
    tokenizer_device: Optional[str]

    def __init__(
            self,
            pretrained_model_name_or_path: str,
            model_config: Mapping[str, Any],
            model_device: Optional[str],
            tokenizer_config: Mapping[str, Any],
            tokenizer_device: Optional[str],
            generate_config: Mapping[str, Any],
            decode_config: Mapping[str, Any]) -> None:
        self.model = AutoModelForCausalLM.from_pretrained(
            pretrained_model_name_or_path, **model_config)
        if model_device is not None:
            self.model = self.model.to(model_device)
        self.tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path, **tokenizer_config)
        self.generate_config = generate_config
        self.decode_config = decode_config
        self.tokenizer_device = tokenizer_device

    def completions(self, request) -> Mapping[str, Any]:
        # causal LMs already include the prompt in their decoded output,
        # so echoing is automatic
        return _completions_auto(request, self.tokenizer, self.tokenizer_device, self.model,
                                 self.generate_config, self.decode_config, True)
--------------------------------------------------------------------------------