├── LICENSE
├── README.md
└── llamacpp_mock_api.py
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 Blake Wyatt
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Code Llama for VSCode
2 |
3 | An API which mocks [Llama.cpp](https://github.com/ggerganov/llama.cpp) to enable support for Code Llama with the
4 | [Continue Visual Studio Code extension](https://continue.dev/).
5 |
6 | As of the time of writing and to my knowledge, this is the only way to use Code Llama with VSCode locally without having
7 | to sign up for or get an API key from a service. The one exception is Continue with [Ollama](https://ollama.ai/), but at the
8 | time of writing Ollama doesn't support Windows or Linux. Code Llama for VSCode, on the other hand, is completely
9 | cross-platform and will run wherever Meta's own [codellama](https://github.com/facebookresearch/codellama) code will run.
10 |
11 | Now let's get started!
12 |
13 | ### Setup
14 |
15 | Prerequisites:
16 | - [Download and run one of the Code Llama Instruct models](https://github.com/facebookresearch/codellama)
17 | - [Install the Continue VSCode extension](https://marketplace.visualstudio.com/items?itemName=Continue.continue)
18 |
19 | After you are able to use both independently, we will glue them together with Code Llama for VSCode.
20 |
21 | Steps:
22 | 1. Move `llamacpp_mock_api.py` into your [`codellama`](https://github.com/facebookresearch/codellama) folder and install Flask into your environment with `pip install flask`.
23 | 2. Run `llamacpp_mock_api.py` with your [Code Llama Instruct torchrun command](https://github.com/facebookresearch/codellama#fine-tuned-instruction-models), setting `--nproc_per_node` to the MP value for your model (1 for the 7B model, 2 for 13B, 4 for 34B). For example:
24 | ```
25 | torchrun --nproc_per_node 1 llamacpp_mock_api.py \
26 | --ckpt_dir CodeLlama-7b-Instruct/ \
27 | --tokenizer_path CodeLlama-7b-Instruct/tokenizer.model \
28 | --max_seq_len 512 --max_batch_size 4
29 | ```
30 | 3. Click the settings button at the bottom right of Continue's UI in VSCode and edit `config.json` so it looks like [this](https://docs.continue.dev/reference/Model%20Providers/llamacpp)[\[archive\]](http://web.archive.org/web/20240531162330/https://docs.continue.dev/reference/Model%20Providers/llamacpp), replacing `MODEL_NAME` with `codellama-7b` (see the sample entry below).
31 |
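For reference, a minimal `config.json` entry for this setup looks roughly like the sketch below. The exact schema can differ between Continue versions, so treat the linked reference above as authoritative; `apiBase` assumes the default port of 8080 used by `llamacpp_mock_api.py`.
```
{
  "models": [
    {
      "title": "Code Llama",
      "provider": "llama.cpp",
      "model": "codellama-7b",
      "apiBase": "http://localhost:8080"
    }
  ]
}
```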
32 | Restart VSCode or reload the Continue extension and you should now be able to use Code Llama for VSCode!
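To sanity-check the server without VSCode, you can POST a prompt to the mock API directly (again assuming the default port of 8080); the reply comes back as a single `data:` line in the same format Continue consumes:
```
curl -X POST http://localhost:8080/completion \
  -H "Content-Type: application/json" \
  -d '{"prompt": "[INST] Write a hello world program in Python. [/INST]"}'
```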
33 |
--------------------------------------------------------------------------------
/llamacpp_mock_api.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 |
3 | import fire
4 | from flask import Flask, jsonify, request, Response
5 | import torch.distributed as dist
6 |
7 | from llama import Llama
8 |
9 | def main(
10 | ckpt_dir: str,
11 | tokenizer_path: str,
12 | temperature: float = 0.2,
13 | top_p: float = 0.95,
14 | max_seq_len: int = 512,
15 | max_batch_size: int = 8,
16 | max_gen_len: Optional[int] = None,
17 | port: int = 8080,
18 | ):
19 | print("Loading Code Llama...", end="", flush=True)
20 |
21 | # Create our Code Llama object.
22 | generator = Llama.build(
23 | ckpt_dir=ckpt_dir,
24 | tokenizer_path=tokenizer_path,
25 | max_seq_len=max_seq_len,
26 | max_batch_size=max_batch_size,
27 | )
28 |
29 | print("Done!", flush=True)
30 | print()
31 |
32 | # With torchrun and distributed PyTorch, multiple copies of this code can be
33 | # run at once. We only want one of them (rank 0) to host the Flask API, and we
34 | # will use it to control the rest.
35 | if dist.get_rank() == 0:
36 | app = Flask(__name__)
37 |
38 | def prompt_to_instructions(prompt):
39 | # Remove unnecessary tokens and spacing from Continue's prompt format.
40 | prompt = prompt.replace("\n", "")
41 | prompt = prompt.replace("[INST] ", "[INST]")
42 | prompt = prompt.replace(" [/INST]", "[/INST]")
43 |
44 | # Consume Continue's prompt string and transform it into a list of
45 | # message dicts which contain role information.
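# For example (illustrative only), a normalized prompt such as
#   "[INST]Hello[/INST]Hi there![INST]Write a loop.[/INST]"
# becomes
#   [{"role": "user", "content": "Hello"},
#    {"role": "assistant", "content": "Hi there!"},
#    {"role": "user", "content": "Write a loop."}]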
46 | messages = []
47 | prompt_start = 0
48 | while True:
49 | user_message_start = prompt.find("[INST]", prompt_start) + len("[INST]")
50 | user_message_end = prompt.find("[/INST]", prompt_start)
51 | assistant_message_end = prompt.find("[INST]", user_message_end)
52 |
53 | messages += [{"role": "user", "content": prompt[user_message_start:user_message_end]}]
54 |
55 | if assistant_message_end != -1:
56 | messages += [{"role": "assistant", "content": prompt[user_message_end + 7:assistant_message_end]}]
57 | else:
58 | break
59 |
60 | prompt_start = assistant_message_end
61 |
62 | # Send back the message instructions.
63 | return [messages]
64 |
65 | def run_chat_completion(prompt):
66 | # Transform the prompt format Continue uses into a list of
67 | # message dicts Code Llama supports.
68 | instructions = prompt_to_instructions(prompt)
69 |
70 | # Broadcast what should be processed to the other ranks (rank 0 acts as the controller).
71 | dist.broadcast_object_list([instructions, max_gen_len, temperature, top_p])
72 |
73 | # Start Code Llama inferencing.
74 | results = generator.chat_completion(
75 | instructions,
76 | max_gen_len=max_gen_len,
77 | temperature=temperature,
78 | top_p=top_p,
79 | )
80 |
81 | # Send the response back.
82 | return results[0]["generation"]["content"].strip()
83 |
84 | @app.route("/completion", methods=["POST"])
85 | def completion():
86 | content = request.json
87 |
88 | print("Incoming request: " + str(content))
89 |
90 | # Perform Code Llama chat completion.
91 | response = run_chat_completion(content["prompt"])
92 | response = jsonify({"content": response}).get_data(as_text=True)
93 |
94 | print("Outgoing response: " + str(response))
95 |
96 | # Llama.cpp's HTTP server uses Server-Sent Events (SSE) to stream results to the client,
97 | # so we reimplement it here as a single event sent to Continue which contains the
98 | # entire Code Llama response. SSE events are terminated by a blank line, hence "\n\n".
99 | def generate():
100 | yield "data: " + response + "\n\n"
101 | yield "data: [DONE]\n\n"
102 |
103 | # Send back the response as an event stream.
104 | return Response(generate(), mimetype="text/event-stream")
105 |
106 | # Run the Flask API server on the Llama.cpp port.
107 | app.run(port=port)
108 |
109 | # Ranks other than 0 wait for inference tasks broadcast by rank 0.
110 | else:
111 | while True:
112 | config = [None] * 4
113 | try:
114 | dist.broadcast_object_list(config)
115 | generator.chat_completion(
116 | config[0], max_gen_len=config[1], temperature=config[2], top_p=config[3]
117 | )
118 | except Exception:
119 | pass
120 |
121 | if __name__ == "__main__":
122 | fire.Fire(main)
123 |
--------------------------------------------------------------------------------