├── LICENSE
├── README.md
└── llamacpp_mock_api.py

/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2023 Blake Wyatt

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Code Llama for VSCode

An API that mocks [Llama.cpp](https://github.com/ggerganov/llama.cpp) to enable support for Code Llama with the
[Continue Visual Studio Code extension](https://continue.dev/).

As of the time of writing and to my knowledge, this is the only way to use Code Llama with VSCode locally without
having to sign up or get an API key for a service. The only exception to this is Continue with [Ollama](https://ollama.ai/),
but Ollama doesn't support Windows or Linux. On the other hand, Code Llama for VSCode is completely cross-platform and
will run wherever Meta's own [codellama](https://github.com/facebookresearch/codellama) code will run.

Now let's get started!

### Setup

Prerequisites:
- [Download and run one of the Code Llama Instruct models](https://github.com/facebookresearch/codellama)
- [Install the Continue VSCode extension](https://marketplace.visualstudio.com/items?itemName=Continue.continue)

After you are able to use both independently, we will glue them together with Code Llama for VSCode.

Steps:
1. Move `llamacpp_mock_api.py` to your [`codellama`](https://github.com/facebookresearch/codellama) folder and install Flask into your environment with `pip install flask`.
2. Run `llamacpp_mock_api.py` with your [Code Llama Instruct torchrun command](https://github.com/facebookresearch/codellama#fine-tuned-instruction-models). For example:
   ```
   torchrun --nproc_per_node 1 llamacpp_mock_api.py \
       --ckpt_dir CodeLlama-7b-Instruct/ \
       --tokenizer_path CodeLlama-7b-Instruct/tokenizer.model \
       --max_seq_len 512 --max_batch_size 4
   ```
3. Click the settings button at the bottom right of Continue's UI in VSCode and edit `config.json` so it looks like [this](https://docs.continue.dev/reference/Model%20Providers/llamacpp)[\[archive\]](http://web.archive.org/web/20240531162330/https://docs.continue.dev/reference/Model%20Providers/llamacpp). Replace `MODEL_NAME` with `codellama-7b` (a sketch of the result is shown after this list).
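The linked Continue docs are authoritative; as a rough sketch (exact field names may differ between Continue versions), the relevant entry in `config.json` ends up looking something like this:

```json
{
  "models": [
    {
      "title": "Code Llama",
      "provider": "llama.cpp",
      "model": "codellama-7b",
      "apiBase": "http://localhost:8080"
    }
  ]
}
```

`apiBase` should point at the port `llamacpp_mock_api.py` listens on (8080 by default, adjustable via its `port` argument).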

Restart VSCode or reload the Continue extension and you should now be able to use Code Llama for VSCode!

--------------------------------------------------------------------------------
/llamacpp_mock_api.py:
--------------------------------------------------------------------------------
from typing import Optional

import fire
from flask import Flask, jsonify, request, Response
import torch.distributed as dist

from llama import Llama

def main(
    ckpt_dir: str,
    tokenizer_path: str,
    temperature: float = 0.2,
    top_p: float = 0.95,
    max_seq_len: int = 512,
    max_batch_size: int = 8,
    max_gen_len: Optional[int] = None,
    port: int = 8080,
):
    print("Loading Code Llama...", end="", flush=True)

    # Create our Code Llama object.
    generator = Llama.build(
        ckpt_dir=ckpt_dir,
        tokenizer_path=tokenizer_path,
        max_seq_len=max_seq_len,
        max_batch_size=max_batch_size,
    )

    print("Done!", flush=True)
    print()

    # With torchrun and distributed PyTorch, multiple copies of this code can be
    # run at once. We only want one of them (rank 0) to host the Flask API, and we
    # will use it to control the rest.
    if dist.get_rank() == 0:
        app = Flask(__name__)

        def prompt_to_instructions(prompt):
            # Remove unnecessary tokens and spacing from Continue's prompt format.
            prompt = prompt.replace("\n", "")
            prompt = prompt.replace("[INST] ", "[INST]")
            prompt = prompt.replace(" [/INST]", "[/INST]")

            # Consume Continue's prompt string and transform it into a list of
            # message dicts which contain role information. The offsets 6 and 7
            # below are len("[INST]") and len("[/INST]") respectively.
            messages = []
            prompt_start = 0
            while True:
                user_message_start = prompt.find("[INST]", prompt_start) + 6
                user_message_end = prompt.find("[/INST]", prompt_start)
                assistant_message_end = prompt.find("[INST]", user_message_end)

                messages += [{"role": "user", "content": prompt[user_message_start:user_message_end]}]

                if assistant_message_end != -1:
                    messages += [{"role": "assistant", "content": prompt[user_message_end + 7:assistant_message_end]}]
                else:
                    break

                prompt_start = assistant_message_end

            # Send back the message instructions.
            return [messages]

        def run_chat_completion(prompt):
            # Transform the prompt format Continue uses into a list of
            # message dicts Code Llama supports.
            instructions = prompt_to_instructions(prompt)

            # Broadcast what should be processed to the other ranks
            # (rank 0 acts as a command-and-control node).
            dist.broadcast_object_list([instructions, max_gen_len, temperature, top_p])

            # Start Code Llama inference.
            results = generator.chat_completion(
                instructions,
                max_gen_len=max_gen_len,
                temperature=temperature,
                top_p=top_p,
            )

            # Send the response back.
            return results[0]["generation"]["content"].strip()

        @app.route("/completion", methods=["POST"])
        def completion():
            content = request.json

            print("Incoming request: " + str(content))

            # Perform Code Llama chat completion.
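            # Note: Continue's llama.cpp provider is expected to send the whole chat
            # history as a single "prompt" string using the [INST]/[/INST] markers
            # parsed by prompt_to_instructions() above. Any other fields in the request
            # body are ignored; sampling settings come from this script's command-line
            # arguments instead.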
            response = run_chat_completion(content["prompt"])
            response = jsonify({"content": response}).get_data(as_text=True)

            print("Outgoing response: " + str(response))

            # Llama.cpp's HTTP server uses Server-Sent Events to stream results to the
            # client, so we reimplement that here as a single event sent to Continue
            # which contains the entire Code Llama response.
            def generate():
                yield "data: " + response + "\n"
                yield "data: [DONE]\n"

            # Send back the response.
            return Response(generate())

        # Run the Flask API server on the Llama.cpp port.
        app.run(port=port)

    # Ranks other than rank 0 wait for tasks broadcast by rank 0.
    else:
        while True:
            config = [None] * 4
            try:
                dist.broadcast_object_list(config)
                generator.chat_completion(
                    config[0], max_gen_len=config[1], temperature=config[2], top_p=config[3]
                )
            except Exception:
                # Ignore failures so worker ranks keep waiting for the next broadcast.
                pass

if __name__ == "__main__":
    fire.Fire(main)

--------------------------------------------------------------------------------
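To sanity-check the mock API outside of VSCode, you can send it a request directly. The sketch below is a minimal example, not part of the project: it assumes the server from step 2 of the README is running locally on the default port 8080 and that the `requests` package is installed, and the prompt string is just an illustration of the `[INST]`-style format the script expects.

```python
import json

import requests

# Roughly mimic what Continue's llama.cpp provider sends: the chat history as a
# single prompt string wrapped in [INST] ... [/INST] markers.
payload = {"prompt": "[INST] Write a Python function that reverses a string. [/INST]"}

# The mock API streams its answer back as llama.cpp-style "data: ..." lines,
# ending with "data: [DONE]".
with requests.post("http://localhost:8080/completion", json=payload, stream=True) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        if not line or line == "data: [DONE]":
            continue
        print(json.loads(line[len("data: "):])["content"])
```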