├── .gitignore ├── LICENSE ├── README.md ├── convert.py ├── phi2-server.py └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 | 
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 | 
119 | # SageMath parsed files
120 | *.sage.py
121 | 
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 | 
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 | 
135 | # Rope project settings
136 | .ropeproject
137 | 
138 | # mkdocs documentation
139 | /site
140 | 
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 | 
146 | # Pyre type checker
147 | .pyre/
148 | 
149 | # pytype static type analyzer
150 | .pytype/
151 | 
152 | # Cython debug symbols
153 | cython_debug/
154 | 
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2023 Jioh L. Jung
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Mac_mlx_phi-2_server
2 | 
3 | # Disclaimer
4 | - WARNING: this is toy test code, written just for fun!
5 | 
6 | 1. There is no accelerated way to run the phi-2 model on a Mac except MLX. PyTorch with MPS is too slow to play with, but MLX is fast enough.
7 | 2. llama.cpp does not yet support phi-2. =(
8 | 3. So this is some toy code to run it. =)
9 | 
10 | The original inference code comes from https://github.com/ml-explore/mlx-examples/tree/main/phi2
11 | 
12 | # Feature
13 | Test server code for the Phi-2 model that exposes the OpenAI API spec, running on macOS.
14 | 
15 | You can call it with the same API as the OpenAI spec; a minimal Python client sketch follows the quick-start block below.
16 | 
17 | # How to use?
18 | 
19 | ```
20 | # Install requirements
21 | pip install -r requirements.txt
22 | 
23 | # Convert the model to MLX format
24 | python convert.py
25 | 
26 | # Run the server
27 | python phi2-server.py
28 | 
29 | # Query the server
30 | curl http://localhost:5000/v1/chat/completions \
31 |   -H "Content-Type: application/json" \
32 |   -d '{
33 |      "model": "gpt-3.5-turbo",
34 |      "messages": [{"role": "user", "content": "answer why sky is so blue?"}],
35 |      "temperature": 0.5
36 |    }'
37 | ```
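Since the endpoint mimics the OpenAI chat-completions route, any plain HTTP client can call it. Below is a minimal, illustrative Python client sketch (not part of this repository) using `requests`, which is not listed in `requirements.txt`; it assumes the server above is running locally on Flask's default port 5000.

```python
# Hypothetical client sketch - not included in the repo.
import requests

resp = requests.post(
    "http://localhost:5000/v1/chat/completions",
    json={
        "model": "gpt-3.5-turbo",  # the server ignores this field and always serves Phi-2
        "messages": [{"role": "user", "content": "answer why sky is so blue?"}],
        "temperature": 0.5,
    },
)
data = resp.json()
print(data["choices"][0]["message"]["content"])
print(data["usage"])  # prompt/completion/total token counts reported by the server
```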
38 | 
39 | # Issue
40 | - The prompt format needs work to get better completions; this is only a test. Any pull requests are welcome! (A rough prompt-building sketch is appended at the end of this listing.)
41 | 
42 | ----
43 | 
44 | # Phi-2
45 | 
46 | Phi-2 is a 2.7B parameter language model released by Microsoft with
47 | performance that rivals much larger models.[^1] It was trained on a mixture of
48 | GPT-4 outputs and clean web text.
49 | 
50 | Phi-2 efficiently runs on Apple silicon devices with 8GB of memory in 16-bit
51 | precision.
52 | 
53 | ## Setup
54 | 
55 | Download and convert the model:
56 | 
57 | ```sh
58 | python convert.py
59 | ```
60 | 
61 | This will create the `weights.npz` file, which MLX can read.
62 | 
63 | ## Generate
64 | 
65 | To generate text with the default prompt (using the upstream `phi2.py` script from the mlx-examples link above, which is not included in this repository):
66 | 
67 | ```sh
68 | python phi2.py
69 | ```
70 | 
71 | This should give the output:
72 | 
73 | ```
74 | Answer: Mathematics is like a lighthouse that guides us through the darkness of
75 | uncertainty. Just as a lighthouse emits a steady beam of light, mathematics
76 | provides us with a clear path to navigate through complex problems. It
77 | illuminates our understanding and helps us make sense of the world around us.
78 | 
79 | Exercise 2:
80 | Compare and contrast the role of logic in mathematics and the role of a compass
81 | in navigation.
82 | 
83 | Answer: Logic in mathematics is like a compass in navigation. It helps
84 | ```
85 | 
86 | To use your own prompt:
87 | 
88 | ```sh
89 | python phi2.py --prompt <prompt> --max_tokens <max_tokens>
90 | ```
91 | 
92 | To see a list of options run:
93 | 
94 | ```sh
95 | python phi2.py --help
96 | ```
97 | 
98 | [^1]: For more details on the model see the [blog post](
99 | https://www.microsoft.com/en-us/research/blog/phi-2-the-surprising-power-of-small-language-models/)
100 | and the [Hugging Face repo](https://huggingface.co/microsoft/phi-2)
101 | 
--------------------------------------------------------------------------------
/convert.py:
--------------------------------------------------------------------------------
1 | from transformers import AutoModelForCausalLM
2 | import numpy as np
3 | 
4 | def replace_key(key: str) -> str:
5 |     if "wte.weight" in key:
6 |         key = "wte.weight"  # the embedding weight maps to the top-level `wte` module
7 | 
8 |     if ".mlp" in key:
9 |         key = key.replace(".mlp", "")  # ParallelBlock exposes fc1/fc2 directly, so drop ".mlp"
10 |     return key
11 | 
12 | 
13 | def convert():
14 |     model = AutoModelForCausalLM.from_pretrained(
15 |         "microsoft/phi-2", torch_dtype="auto", trust_remote_code=True
16 |     )
17 |     state_dict = model.state_dict()
18 |     weights = {replace_key(k): v.numpy() for k, v in state_dict.items()}
19 |     np.savez("weights.npz", **weights)  # phi2-server.py loads this file with mx.load()
20 | 
21 | 
22 | if __name__ == "__main__":
23 |     convert()
24 | 
--------------------------------------------------------------------------------
/phi2-server.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from typing import Optional
3 | from dataclasses import dataclass
4 | from mlx.utils import tree_unflatten
5 | from transformers import AutoTokenizer
6 | 
7 | import mlx.core as mx
8 | import mlx.nn as nn
9 | import math
10 | 
11 | from flask import Flask, request, jsonify
12 | import time
13 | 
14 | app = Flask(__name__)
15 | default_token_max = 512
16 | 
17 | @dataclass
18 | class ModelArgs:
19 |     max_sequence_length: int = 2048
20 |     num_vocab: int = 51200
21 |     model_dim: int = 2560
22 |     num_heads: int = 32
23 | 
num_layers: int = 32 24 | rotary_dim: int = 32 25 | 26 | 27 | class LayerNorm(nn.LayerNorm): 28 | def __call__(self, x: mx.array) -> mx.array: 29 | return super().__call__(x.astype(mx.float32)).astype(x.dtype) 30 | 31 | 32 | class RoPEAttention(nn.Module): 33 | def __init__(self, dims: int, num_heads: int, rotary_dim: int): 34 | super().__init__() 35 | 36 | self.num_heads = num_heads 37 | 38 | self.rope = nn.RoPE(rotary_dim, traditional=False) 39 | self.Wqkv = nn.Linear(dims, 3 * dims) 40 | self.out_proj = nn.Linear(dims, dims) 41 | 42 | def __call__(self, x, mask=None, cache=None): 43 | qkv = self.Wqkv(x) 44 | queries, keys, values = mx.split(qkv, 3, axis=-1) 45 | 46 | # Extract some shapes 47 | num_heads = self.num_heads 48 | B, L, D = queries.shape 49 | 50 | # Prepare the queries, keys and values for the attention computation 51 | queries = queries.reshape(B, L, num_heads, -1).transpose(0, 2, 1, 3) 52 | keys = keys.reshape(B, L, num_heads, -1).transpose(0, 2, 1, 3) 53 | values = values.reshape(B, L, num_heads, -1).transpose(0, 2, 1, 3) 54 | 55 | # Add RoPE to the queries and keys and combine them with the cache 56 | if cache is not None: 57 | key_cache, value_cache = cache 58 | queries = self.rope(queries, offset=key_cache.shape[2]) 59 | keys = self.rope(keys, offset=key_cache.shape[2]) 60 | keys = mx.concatenate([key_cache, keys], axis=2) 61 | values = mx.concatenate([value_cache, values], axis=2) 62 | else: 63 | queries = self.rope(queries) 64 | keys = self.rope(keys) 65 | 66 | queries = queries.astype(mx.float32) 67 | keys = keys.astype(mx.float32) 68 | 69 | # Finally perform the attention computation 70 | scale = math.sqrt(1 / queries.shape[-1]) 71 | scores = (queries * scale) @ keys.transpose(0, 1, 3, 2) 72 | if mask is not None: 73 | scores = scores + mask 74 | 75 | scores = mx.softmax(scores, axis=-1).astype(values.dtype) 76 | values_hat = (scores @ values).transpose(0, 2, 1, 3).reshape(B, L, -1) 77 | 78 | return self.out_proj(values_hat), (keys, values) 79 | 80 | 81 | class ParallelBlock(nn.Module): 82 | def __init__(self, config: ModelArgs): 83 | super().__init__() 84 | dims = config.model_dim 85 | mlp_dims = dims * 4 86 | self.mixer = RoPEAttention(dims, config.num_heads, config.rotary_dim) 87 | self.ln = LayerNorm(dims) 88 | self.fc1 = nn.Linear(dims, mlp_dims) 89 | self.fc2 = nn.Linear(mlp_dims, dims) 90 | self.act = nn.GELU(approx="precise") 91 | 92 | def __call__(self, x, mask, cache): 93 | h = self.ln(x) 94 | attn_h, cache = self.mixer(h, mask, cache) 95 | ff_h = self.fc2(self.act(self.fc1(h))) 96 | return attn_h + ff_h + x, cache 97 | 98 | 99 | class TransformerDecoder(nn.Module): 100 | def __init__(self, config: ModelArgs): 101 | super().__init__() 102 | self.h = [ParallelBlock(config) for i in range(config.num_layers)] 103 | 104 | def __call__(self, x, mask, cache): 105 | if cache is None: 106 | cache = [None] * len(self.h) 107 | 108 | for e, layer in enumerate(self.h): 109 | x, cache[e] = layer(x, mask, cache[e]) 110 | return x, cache 111 | 112 | 113 | class OutputHead(nn.Module): 114 | def __init__(self, config: ModelArgs) -> None: 115 | self.ln = LayerNorm(config.model_dim) 116 | self.linear = nn.Linear(config.model_dim, config.num_vocab) 117 | 118 | def __call__(self, inputs): 119 | return self.linear(self.ln(inputs)) 120 | 121 | 122 | class Phi2(nn.Module): 123 | def __init__(self, config: ModelArgs): 124 | self.wte = nn.Embedding(config.num_vocab, config.model_dim) 125 | self.transformer = TransformerDecoder(config) 126 | self.lm_head = OutputHead(config) 127 | 
128 | def __call__( 129 | self, 130 | inputs: mx.array, 131 | mask: mx.array = None, 132 | cache: mx.array = None, 133 | ) -> tuple[mx.array, mx.array]: 134 | x = self.wte(inputs) 135 | 136 | mask = None 137 | if x.shape[1] > 1: 138 | mask = nn.MultiHeadAttention.create_additive_causal_mask(x.shape[1]) 139 | mask = mask.astype(x.dtype) 140 | 141 | y, cache = self.transformer(x, mask, cache) 142 | return self.lm_head(y), cache 143 | 144 | 145 | def generate(prompt: mx.array, model: Phi2, temp: Optional[float] = 0.0): 146 | def sample(logits): 147 | if temp == 0: 148 | return mx.argmax(logits, axis=-1) 149 | else: 150 | return mx.random.categorical(logits * (1 / temp)) 151 | 152 | logits, cache = model(prompt) 153 | y = sample(logits[:, -1, :]) 154 | yield y 155 | 156 | while True: 157 | logits, cache = model(y[:, None], cache=cache) 158 | y = sample(logits.squeeze(1)) 159 | yield y 160 | 161 | 162 | def load_model(): 163 | model = Phi2(ModelArgs()) 164 | weights = mx.load("weights.npz") 165 | model.update(tree_unflatten(list(weights.items()))) 166 | tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True) 167 | return model, tokenizer 168 | 169 | def complete_text(str_prompt, temp, max_tokens): 170 | prompt = tokenizer( 171 | str_prompt, 172 | return_tensors="np", 173 | return_attention_mask=False, 174 | )["input_ids"] 175 | tk_prompt = len(prompt[0]) 176 | tk_gen = 0 177 | 178 | prompt = mx.array(prompt) 179 | 180 | #print("[INFO] Generating with Phi-2...", flush=True) 181 | #print(prompt_fmt, end="", flush=True) 182 | ret = '' 183 | eos = False 184 | 185 | tokens = [] 186 | for token, _ in zip(generate(prompt, model, temp), range(max_tokens)): 187 | tokens.append(token) 188 | 189 | if (len(tokens) % 10) == 0: 190 | mx.eval(tokens) 191 | s = tokenizer.decode([t.item() for t in tokens]) 192 | tk_gen +=len(tokens) 193 | sfiltered = s 194 | tokens = [] 195 | if s.find('<|endoftext|>') >=0: 196 | sfiltered = s.split('<|endoftext|>',1)[0] 197 | eos = True 198 | break 199 | #print(sfiltered, end="", flush=True) 200 | ret += sfiltered 201 | 202 | if not eos: 203 | mx.eval(tokens) 204 | s = tokenizer.decode([t.item() for t in tokens]) 205 | sfiltered = s 206 | tk_gen +=len(tokens) 207 | tokens = [] 208 | if s.find('<|endoftext|>') >=0: 209 | sfiltered = s.split('<|endoftext|>',1)[0] 210 | ret += sfiltered 211 | #print(sfiltered, flush=True) 212 | return ret, (tk_prompt, tk_gen) 213 | 214 | @app.route('/v1/chat/completions', methods=['POST']) 215 | def api_completions(): 216 | # Get JSON data sent with the request 217 | data = request.get_json() 218 | print(data) 219 | model_name = 'phi-2' 220 | ai_role = 'Assistance' 221 | prompt_strs = "%s: %s\n%s: " % ( 222 | data['messages'][0]['role'], 223 | data['messages'][0]['content'], 224 | 'Assistance') 225 | temp = 0.7 226 | if 'temperature' in data: 227 | temp = data['temperature'] 228 | txt, tks = complete_text(prompt_strs, temp, default_token_max) 229 | txt = txt.strip() 230 | 231 | resp = { 232 | "id": "gen-resp-1", 233 | "object": "chat.completion", 234 | "created": int(time.time()), 235 | "model": model_name, 236 | "usage": { 237 | "prompt_tokens": tks[0], 238 | "completion_tokens": tks[1], 239 | "total_tokens": tks[0] + tks[1] 240 | }, 241 | "choices": [ 242 | { 243 | "message": { 244 | "role": ai_role, 245 | "content": txt 246 | }, 247 | "finish_reason": "stop", 248 | "index": 0 249 | } 250 | ] 251 | } 252 | 253 | return jsonify(resp) 254 | 255 | if __name__ == '__main__': 256 | parser = 
argparse.ArgumentParser(description="Phi-2 inference script") 257 | parser.add_argument( 258 | "--prompt", 259 | help="The message to be processed by the model", 260 | default="Write a detailed analogy between mathematics and a lighthouse.", 261 | ) 262 | parser.add_argument( 263 | "--max_tokens", 264 | "-m", 265 | type=int, 266 | default=1024, 267 | help="Maximum number of tokens to generate", 268 | ) 269 | parser.add_argument( 270 | "--temp", 271 | help="The sampling temperature.", 272 | type=float, 273 | default=0.0, 274 | ) 275 | parser.add_argument("--seed", type=int, default=0, help="The PRNG seed") 276 | args = parser.parse_args() 277 | default_token_max = args.max_tokens 278 | 279 | mx.random.seed(args.seed) 280 | 281 | model, tokenizer = load_model() 282 | 283 | #for i in range(5): 284 | # gen_text(args.prompt, float(i)/5) 285 | app.run(debug=True) 286 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | einops 2 | mlx 3 | numpy 4 | transformers 5 | torch 6 | flask --------------------------------------------------------------------------------
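As a quick, optional sanity check after running `python convert.py`, the converted archive can be opened directly with MLX to confirm that the keys renamed by `replace_key` line up with the module names used in `phi2-server.py`. This snippet is illustrative only and is not part of the repository.

```python
# Illustrative check - assumes convert.py has already written weights.npz
# into the current directory.
import mlx.core as mx

weights = mx.load("weights.npz")  # returns a dict mapping parameter names to mx.array
print(f"{len(weights)} arrays loaded")
for name in sorted(weights)[:5]:
    # key names should match the MLX module tree, e.g. "transformer.h.0.fc1.weight"
    print(name, weights[name].shape)
```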
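The README's Issue note says the prompt needs improvement, and `api_completions` currently only looks at `data['messages'][0]`. One possible direction, sketched below purely as an illustration (the helper name `build_prompt` is hypothetical and not part of `phi2-server.py`), is to flatten the whole OpenAI-style `messages` list into a single prompt and label the reply turn explicitly.

```python
# Hypothetical helper - one way to build a fuller prompt from the entire
# messages list instead of only messages[0].
def build_prompt(messages):
    lines = [f"{m['role'].capitalize()}: {m['content']}" for m in messages]
    lines.append("Assistant:")  # the turn the model should complete
    return "\n".join(lines)

# Example usage; inside api_completions() this could replace the prompt_strs string:
#   prompt_strs = build_prompt(data["messages"])
print(build_prompt([
    {"role": "system", "content": "You are a concise assistant."},
    {"role": "user", "content": "Why is the sky blue?"},
]))
```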